or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

data-analysis.md · format-conversion.md · index.md · system-management.md

docs/format-conversion.md

0

# Format Conversion

1

2

DPData provides comprehensive format support for quantum chemistry (VASP, Gaussian, CP2K), molecular dynamics (LAMMPS, GROMACS), machine learning (DeePMD-kit), and general formats (XYZ, SDF), through both a Python API and command-line tools. Its plugin-based architecture enables seamless interoperability between different computational science software packages.

3

4

## Capabilities

5

6

### Format System

7

8

Plugin-based format conversion system that handles reading from and writing to various atomistic data formats. The system uses a registry pattern to dynamically load format handlers.

9

10

```python { .api }

11

def load_format(fmt: str):

12

"""

13

Load format plugin by name.

14

15

Parameters:

16

- fmt: str, format identifier (e.g., 'vasp/poscar', 'lammps/lmp')

17

18

Returns:

19

Format handler instance

20

21

Raises:

22

NotImplementedError: if format is not supported

23

"""

24

25

class Format:

26

"""Abstract base class for file format plugins."""

27

28

@classmethod

29

def register(cls, key: str):

30

"""

31

Register format plugin decorator.

32

33

Parameters:

34

- key: str, format identifier

35

36

Returns:

37

Decorator function for format classes

38

"""

39

40

@classmethod

41

def register_from(cls, key: str):

42

"""

43

Register custom from method decorator.

44

45

Parameters:

46

- key: str, format identifier

47

48

Returns:

49

Decorator function for from methods

50

"""

51

52

@classmethod

53

def register_to(cls, key: str):

54

"""

55

Register custom to method decorator.

56

57

Parameters:

58

- key: str, format identifier

59

60

Returns:

61

Decorator function for to methods

62

"""

63

64

@classmethod

65

def get_formats(cls) -> dict:

66

"""Get all registered format plugins."""

67

68

@classmethod

69

def get_from_methods(cls) -> dict:

70

"""Get all registered from methods."""

71

72

@classmethod

73

def get_to_methods(cls) -> dict:

74

"""Get all registered to methods."""

75

76

@classmethod

77

def post(cls, func_name: str):

78

"""Register post-processing decorator.

79

80

Parameters:

81

- func_name: str, function name to post-process

82

83

Returns:

84

Decorator for post-processing functions

85

"""

86

87

def from_system(self, file_name: str, **kwargs) -> dict:

88

"""Load system data from file.

89

90

Parameters:

91

- file_name: str, path to input file

92

- kwargs: format-specific options

93

94

Returns:

95

dict: system data

96

"""

97

98

def to_system(self, data: dict, *args, **kwargs):

99

"""Write system data to file.

100

101

Parameters:

102

- data: dict, system data

103

- args: positional arguments for output

104

- kwargs: format-specific options

105

"""

106

107

def from_labeled_system(self, file_name: str, **kwargs) -> dict:

108

"""Load labeled system data from file.

109

110

Parameters:

111

- file_name: str, path to input file

112

- kwargs: format-specific options

113

114

Returns:

115

dict: labeled system data

116

"""

117

118

def to_labeled_system(self, data: dict, *args, **kwargs):

119

"""Write labeled system data to file.

120

121

Parameters:

122

- data: dict, labeled system data

123

- args: positional arguments for output

124

- kwargs: format-specific options

125

"""

126

127

def from_bond_order_system(self, file_name: str, **kwargs):

128

"""Load bond order system from file.

129

130

Parameters:

131

- file_name: str, path to input file

132

- kwargs: format-specific options

133

134

Returns:

135

RDKit molecule object

136

"""

137

138

def to_bond_order_system(self, data: dict, rdkit_mol, *args, **kwargs):

139

"""Write bond order system to file.

140

141

Parameters:

142

- data: dict, system data

143

- rdkit_mol: RDKit molecule object

144

- args: positional arguments for output

145

- kwargs: format-specific options

146

"""

147

148

def from_multi_systems(self, directory: str, **kwargs) -> dict:

149

"""Load multiple systems from directory.

150

151

Parameters:

152

- directory: str, directory path

153

- kwargs: format-specific options

154

155

Returns:

156

dict: multi-systems data

157

"""

158

159

def to_multi_systems(self, formulas: dict, directory: str, **kwargs):

160

"""Write multiple systems to directory.

161

162

Parameters:

163

- formulas: dict, system formulas and data

164

- directory: str, output directory

165

- kwargs: format-specific options

166

"""

167

```

168

169

### System I/O Methods

170

171

Core methods for loading and saving atomistic data in various formats. These methods are available on all System classes.

172

173

```python { .api }

174

class System:

175

@classmethod

176

def from_fmt(cls, file_name: str, fmt: str, **kwargs):

177

"""

178

Load system from file with specified format.

179

180

Parameters:

181

- file_name: str, path to input file

182

- fmt: str, format identifier

183

- kwargs: format-specific options

184

185

Returns:

186

System instance loaded from file

187

"""

188

189

def to(self, fmt: str, *args, **kwargs):

190

"""

191

Export system to specified format.

192

193

Parameters:

194

- fmt: str, output format identifier

195

- args: positional arguments for format

196

- kwargs: format-specific options

197

"""

198

```

199

200

### Command Line Interface

201

202

Command-line tools for format conversion and basic operations, providing quick conversion between formats without writing Python code.

203

204

```python { .api }

205

def dpdata_cli():

206

"""

207

Main CLI entry point for format conversion.

208

209

Usage:

210

dpdata INPUT_FILE -i INPUT_FORMAT -o OUTPUT_FORMAT -O OUTPUT_PATH

211

"""

212

213

def convert(from_file: str, from_format: str, to_file: str, to_format: str, no_labeled: bool = False, multi: bool = False, type_map: list = None):

214

"""

215

Convert between file formats programmatically.

216

217

Parameters:

218

- from_file: str, source file path

219

- from_format: str, source format identifier

220

- to_file: str, target file path

221

- to_format: str, target format identifier

222

- no_labeled: bool, treat as unlabeled data

223

- multi: bool, handle multiple systems

224

- type_map: list, atom type mapping

225

"""

226

```

227

228

### Plugin System

229

230

Generic plugin registration system that enables extensible format support and custom functionality.

231

232

```python { .api }

233

class Plugin:

234

"""Generic plugin registration system."""

235

236

def __init__(self):

237

"""Initialize plugin registry."""

238

239

def register(self, key: str):

240

"""

241

Register plugin decorator.

242

243

Parameters:

244

- key: str, plugin identifier

245

246

Returns:

247

Decorator function for plugin classes

248

"""

249

250

def get_plugin(self, key: str):

251

"""

252

Retrieve plugin by key.

253

254

Parameters:

255

- key: str, plugin identifier

256

257

Returns:

258

Plugin instance

259

260

Raises:

261

RuntimeError: if plugin not found

262

"""

263

264

@property

265

def plugins(self) -> dict:

266

"""Dictionary of all registered plugins."""

267

```

268

269

## Supported Formats

270

271

DPData provides extensive format coverage across the computational science ecosystem:

272

273

### Quantum Chemistry Formats

274

275

**VASP**: Vienna Ab initio Simulation Package

276

- `vasp/poscar` - POSCAR/CONTCAR structure files

277

- `vasp/outcar` - OUTCAR output with energies and forces

278

- `vasp/xml` - vasprun.xml electronic structure data

279

280

**Gaussian**: Quantum chemistry software

281

- `gaussian/gjf` - Gaussian input files

282

- `gaussian/log` - Gaussian output files

283

284

**CP2K**: Quantum molecular dynamics

285

- `cp2k/output` - CP2K output files

286

- `cp2k/cell` - CP2K cell files

287

288

**ABACUS**: Density functional theory package

289

- `abacus/stru` - ABACUS structure files

290

- `abacus/scf` - Self-consistent field results

291

292

**Other QC Packages**:

293

- `qe/pw` - Quantum ESPRESSO pw.x

294

- `fhi_aims/output` - FHI-aims output

295

- `siesta/output` - SIESTA output

296

- `orca/output` - ORCA quantum chemistry

297

- `psi4/output` - PSI4 quantum chemistry

298

- `dftbplus/output` - DFTB+ calculations

299

300

### Molecular Dynamics Formats

301

302

**LAMMPS**: Large-scale Atomic/Molecular Massively Parallel Simulator

303

- `lammps/lmp` - LAMMPS data files

304

- `lammps/dump` - LAMMPS dump files

305

306

**GROMACS**: Molecular dynamics package

307

- `gromacs/gro` - GROMACS structure files

308

309

**AMBER**: Molecular dynamics suite

310

- `amber/nc` - AMBER NetCDF trajectories

311

312

### Machine Learning Formats

313

314

**DeePMD-kit**: Deep potential molecular dynamics

315

- `deepmd/raw` - Raw data format

316

- `deepmd/npy` - NumPy array format

317

- `deepmd/hdf5` - HDF5 data format

318

319

**ASE**: Atomic Simulation Environment

320

- `ase/structure` - ASE Atoms objects

321

322

### General Formats

323

324

**XYZ**: Cartesian coordinates

325

- `xyz` - Standard XYZ format

326

327

**SDF/MOL**: Chemical structure formats

328

- `sdf` - Structure Data Format

329

- `mol` - MOL file format

330

331

**PyMatGen**: Materials analysis

332

- `pymatgen/structure` - PyMatGen Structure objects

333

334

## Usage Examples

335

336

### Basic Format Conversion

337

338

```python

339

import dpdata

340

341

# Load VASP OUTCAR file

342

ls = dpdata.LabeledSystem('OUTCAR', fmt='vasp/outcar')

343

344

# Convert to DeePMD training format

345

ls.to('deepmd/npy', 'training_data')

346

347

# Load LAMMPS dump file

348

sys = dpdata.System('dump.lammpstrj', fmt='lammps/dump', type_map=['H', 'O'])

349

350

# Convert to XYZ format

351

sys.to('xyz', 'trajectory.xyz')

352

```

353

354

### Command Line Usage

355

356

```bash

357

# Convert VASP to DeePMD format

358

dpdata OUTCAR -i vasp/outcar -o deepmd/npy -O deepmd_data

359

360

# Convert LAMMPS to XYZ

361

dpdata dump.lammpstrj -i lammps/dump -o xyz -O trajectory.xyz

362

363

# Check version

364

dpdata --version

365

```

366

367

### Working with Multiple Formats

368

369

```python

370

# Load different formats into MultiSystems

371

ms = dpdata.MultiSystems()

372

373

# Add VASP data

374

vasp_sys = dpdata.System('POSCAR', fmt='vasp/poscar')

375

ms.append(vasp_sys)

376

377

# Add LAMMPS data

378

lammps_sys = dpdata.System('data.lmp', fmt='lammps/lmp')

379

ms.append(lammps_sys)

380

381

# Export all to consistent format

382

ms.to('xyz', 'combined_structures.xyz')

383

```

384

385

### Custom Format Registration

386

387

```python

388

from dpdata.format import Format

389

390

@Format.register('custom/myformat')

391

class MyFormat(Format):

392

def from_system(self, file_name, **kwargs):

393

# Implementation for reading custom format

394

pass

395

396

def to_system(self, system, file_name, **kwargs):

397

# Implementation for writing custom format

398

pass

399

400

# Use custom format

401

sys = dpdata.System('myfile.custom', fmt='custom/myformat')

402

```

403

404

### Format-Specific Options

405

406

```python

407

# VASP with type mapping

408

ls = dpdata.LabeledSystem('OUTCAR', fmt='vasp/outcar', type_map=['C', 'H'])

409

410

# LAMMPS with specific frame range

411

sys = dpdata.System('dump.lammpstrj', fmt='lammps/dump', begin=100, step=10)

412

413

# DeePMD with compression

414

ls.to('deepmd/hdf5', 'data.hdf5', compression='gzip')

415

416

# XYZ with custom formatting

417

sys.to('xyz', 'structure.xyz', format_string='%.6f')

418

```

419

420

### Error Handling and Validation

421

422

```python

423

try:

424

# Attempt to load file

425

sys = dpdata.System('input.xyz', fmt='xyz')

426

except FileNotFoundError:

427

print("Input file not found")

428

except NotImplementedError as e:

429

print(f"Format not supported: {e}")

430

except Exception as e:

431

print(f"Error loading data: {e}")

432

433

# Check available formats

434

formats = dpdata.format.Format.get_formats()

435

print("Available formats:", list(formats.keys()))

436

```