0
# System Management
1
2
Core classes for managing atomistic data including unlabeled structures, energy/force labeled datasets, multi-composition systems, and molecular systems with bond information. These classes provide the fundamental data structures for all dpdata operations.
3
4
## Capabilities
5
6
### System Class
7
8
The fundamental data container for atomic simulation systems. It holds one or more frames that share the same atom ordering and stores coordinates, cell information, atom types, and topology. It carries no energy or force labels; use LabeledSystem for labeled data.
9
10
```python { .api }
11
class System:
12
def __init__(self, file_name=None, fmt=None, type_map=None, begin=0, step=1, data=None, convergence_check=True, **kwargs):
13
"""
14
Initialize a System from file or data.
15
16
Parameters:
17
- file_name: str, path to input file
18
- fmt: str, format identifier ('vasp/poscar', 'lammps/lmp', etc.)
19
- type_map: list of str, element names ordered by type index (type_map[i] is the element name of atom type i)
20
- begin: int, starting frame index
21
- step: int, frame step size
22
- data: dict, raw system data
23
- convergence_check: bool, check VASP convergence
24
"""
25
26
def get_atom_names(self) -> list[str]:
27
"""Get list of element names."""
28
29
def get_atom_types(self) -> np.ndarray:
30
"""Get array of atom type indices."""
31
32
def get_atom_numbs(self) -> list[int]:
33
"""Get number of atoms per type."""
34
35
def get_nframes(self) -> int:
36
"""Get number of frames."""
37
38
def get_natoms(self) -> int:
39
"""Get total number of atoms."""
40
41
def get_ntypes(self) -> int:
42
"""Get number of atom types."""
43
44
def copy(self):
45
"""Create deep copy of system."""
46
47
def sub_system(self, f_idx):
48
"""
49
Extract subsystem by frame indices.
50
51
Parameters:
52
- f_idx: array-like, frame indices to extract
53
54
Returns:
55
System with selected frames
56
"""
57
58
def append(self, system):
59
"""
60
Append another system.
61
62
Parameters:
63
- system: System, system to append
64
"""
65
66
def sort_atom_names(self, type_map=None):
67
"""
68
Sort atoms by element names.
69
70
Parameters:
71
- type_map: list, element name order
72
"""
73
74
def sort_atom_types(self):
75
"""Sort atoms by type indices."""
76
77
def check_data(self):
78
"""Validate system data integrity."""
79
80
def map_atom_types(self, type_map: list[str]):
81
"""Map atom types using custom mapping.
82
83
Parameters:
84
- type_map: list, mapping from indices to element names
85
"""
86
87
def extend(self, systems: list[System]):
88
"""Extend system with multiple other systems.
89
90
Parameters:
91
- systems: list of System instances to append
92
"""
93
94
def affine_map(self, trans: np.ndarray, f_idx: int = 0):
95
"""Apply affine transformation to coordinates.
96
97
Parameters:
98
- trans: array, 3x3 transformation matrix
99
- f_idx: int, frame index to transform
100
"""
101
102
def rot_lower_triangular(self):
103
"""Rotate all frames to have lower triangular cells."""
104
105
def rot_frame_lower_triangular(self, f_idx: int = 0):
106
"""Rotate specific frame to have lower triangular cell.
107
108
Parameters:
109
- f_idx: int, frame index to rotate
110
"""
111
112
def add_atom_names(self, atom_names: list[str]):
113
"""Add new atom types.
114
115
Parameters:
116
- atom_names: list, new element names to add
117
"""
118
119
def replicate(self, ncopy):
120
"""
121
Replicate system in 3D.
122
123
Parameters:
124
- ncopy: array-like [nx, ny, nz], replication counts
125
126
Returns:
127
System with replicated structure
128
"""
129
130
def apply_pbc(self):
131
"""Apply periodic boundary conditions."""
132
133
def remove_pbc(self, protect_layer=9):
134
"""
135
Remove PBC and create large cell.
136
137
Parameters:
138
- protect_layer: float, protection layer thickness
139
"""
140
141
def perturb(self, pert_num, cell_pert_fraction=0.03, atom_pert_distance=0.01, atom_pert_style='normal', atom_pert_prob=1.0):
142
"""
143
Generate perturbed structures.
144
145
Parameters:
146
- pert_num: int, number of perturbed structures
147
- cell_pert_fraction: float, cell deformation fraction
148
- atom_pert_distance: float, atom displacement distance
149
- atom_pert_style: str, perturbation style ('normal', 'uniform', 'const')
150
- atom_pert_prob: float, probability of perturbing each atom
151
152
Returns:
153
System containing pert_num perturbed frames for each frame of the original system
154
"""
155
156
def shuffle(self):
157
"""Randomly shuffle frames."""
158
159
def pick_atom_idx(self, idx, nopbc=False):
160
"""
161
Select atoms by indices.
162
163
Parameters:
164
- idx: array-like, atom indices to select
165
- nopbc: bool, whether system is non-periodic
166
167
Returns:
168
System with selected atoms
169
"""
170
171
def remove_atom_names(self, atom_names):
172
"""
173
Remove specific atom types.
174
175
Parameters:
176
- atom_names: list, element names to remove
177
178
Returns:
179
System without specified atoms
180
"""
181
182
def pick_by_amber_mask(self, param, maskstr, pass_coords=True, nopbc=False):
183
"""
184
Select atoms using Amber mask syntax.
185
186
Parameters:
187
- param: str, path to parameter file
188
- maskstr: str, Amber mask string
189
- pass_coords: bool, whether to pass coordinates
190
- nopbc: bool, whether system is non-periodic
191
192
Returns:
193
System with selected atoms
194
"""
195
196
def replace(self, initial_atom_type, end_atom_type, replace_num=None):
197
"""
198
Replace atoms of one type with another.
199
200
Parameters:
201
- initial_atom_type: str, element to replace
202
- end_atom_type: str, replacement element
203
- replace_num: int, number of atoms to replace
204
205
Returns:
206
System with replaced atoms
207
"""
208
209
def predict(self, *args, driver=None, **kwargs):
210
"""
211
Predict properties using ML models.
212
213
Parameters:
214
- driver: str or Driver, prediction driver
215
- args, kwargs: driver-specific arguments
216
217
Returns:
218
LabeledSystem with predicted properties
219
"""
220
221
def minimize(self, *args, minimizer=None, **kwargs):
222
"""
223
Minimize geometry.
224
225
Parameters:
226
- minimizer: str or Minimizer, optimization method
227
- args, kwargs: minimizer-specific arguments
228
229
Returns:
230
System with minimized geometry
231
"""
232
233
def to(self, fmt, *args, **kwargs):
234
"""
235
Export to various formats.
236
237
Parameters:
238
- fmt: str, output format
239
- args, kwargs: format-specific arguments
240
"""
241
242
@classmethod
243
def from_dict(cls, data: dict):
244
"""Create System from dictionary data."""
245
246
@classmethod
247
def load(cls, filename: str):
248
"""Load System from JSON/YAML file."""
249
250
@property
251
def formula(self) -> str:
252
"""Chemical formula string."""
253
254
@property
255
def uniq_formula(self) -> str:
256
"""Sorted formula for comparison."""
257
258
@property
259
def short_formula(self) -> str:
260
"""Compressed formula without zeros."""
261
262
@property
263
def formula_hash(self) -> str:
264
"""SHA256 hash of formula."""
265
266
@property
267
def short_name(self) -> str:
268
"""Abbreviated system name."""
269
270
@property
271
def nopbc(self) -> bool:
272
"""Whether system is non-periodic."""
273
```
274
275
### LabeledSystem Class
276
277
System with energy, force, and virial labels for machine learning model training. Extends System with additional methods for handling training data.
278
279
```python { .api }
280
class LabeledSystem(System):
281
def has_forces(self) -> bool:
282
"""Check if forces are present."""
283
284
def has_virial(self) -> bool:
285
"""Check if virial data is present."""
286
287
def affine_map_fv(self, trans: np.ndarray, f_idx: int):
288
"""Apply transformation to forces and virial.
289
290
Parameters:
291
- trans: array, 3x3 transformation matrix
292
- f_idx: int, frame index
293
"""
294
295
def rot_frame_lower_triangular(self, f_idx: int = 0):
296
"""Rotate frame to lower triangular and adjust forces/virial.
297
298
Parameters:
299
- f_idx: int, frame index
300
"""
301
302
def correction(self, hl_sys):
303
"""
304
Calculate correction between two labeled systems.
305
306
Parameters:
307
- hl_sys: LabeledSystem, high-level reference system
308
309
Returns:
310
LabeledSystem with correction data
311
"""
312
313
def remove_outlier(self, threshold=8.0):
314
"""
315
Remove outlier frames based on energy distribution.
316
317
Parameters:
318
- threshold: float, standard deviation threshold
319
320
Returns:
321
LabeledSystem with outliers removed
322
"""
323
```
324
325
### MultiSystems Class
326
327
Container for multiple System objects with different compositions but consistent atom naming. Enables handling of datasets with multiple chemical compositions.
328
329
```python { .api }
330
class MultiSystems:
331
def __init__(self, *systems, type_map=None):
332
"""
333
Initialize MultiSystems container.
334
335
Parameters:
336
- systems: System objects to include
337
- type_map: list, consistent atom type mapping
338
"""
339
340
def from_fmt_obj(self, fmtobj, directory, labeled=False, **kwargs):
341
"""
342
Load multiple systems from format object.
343
344
Parameters:
345
- fmtobj: Format, format handler
346
- directory: str, directory path
347
- labeled: bool, whether systems have labels
348
"""
349
350
def to(self, fmt, *args, **kwargs):
351
"""Export all systems to format."""
352
353
def get_nframes(self) -> int:
354
"""Get total frames across all systems."""
355
356
def append(self, *systems):
357
"""
358
Add systems or other MultiSystems.
359
360
Parameters:
361
- systems: System or MultiSystems objects to add
362
"""
363
364
def predict(self, *args, driver=None, **kwargs):
365
"""Predict properties for all systems."""
366
367
def minimize(self, *args, minimizer=None, **kwargs):
368
"""Minimize all systems."""
369
370
def pick_atom_idx(self, idx, nopbc=False):
371
"""Select atoms from all systems."""
372
373
def correction(self, hl_sys):
374
"""Calculate corrections for all systems."""
375
376
def train_test_split(self, test_size=0.2, seed=None):
377
"""
378
Split into training/testing sets.
379
380
Parameters:
381
- test_size: float, fraction for testing
382
- seed: int, random seed
383
384
Returns:
385
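tuple: (train_MultiSystems, test_MultiSystems, test_idx), where test_idx is a dict mapping each system name to a boolean array (False = training frame, True = testing frame)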
386
"""
387
388
@classmethod
389
def from_file(cls, file_name: str, fmt: str = 'auto', **kwargs):
390
"""Load MultiSystems from single file.
391
392
Parameters:
393
- file_name: str, path to input file
394
- fmt: str, format identifier
395
- kwargs: format-specific options
396
397
Returns:
398
MultiSystems instance
399
"""
400
401
@classmethod
402
def from_dir(cls, dir_name: str, file_name: str, fmt: str = 'auto', type_map: list[str] = None):
403
"""Load MultiSystems from directory with multiple files.
404
405
Parameters:
406
- dir_name: str, directory path
407
- file_name: str, file pattern to match
408
- fmt: str, format identifier
409
- type_map: list, atom type mapping
410
411
Returns:
412
MultiSystems instance
413
"""
414
415
def load_systems_from_file(self, file_name: str, fmt: str, **kwargs):
416
"""Load and append systems from file.
417
418
Parameters:
419
- file_name: str, path to input file
420
- fmt: str, format identifier
421
- kwargs: format-specific options
422
"""
423
```
424
425
### BondOrderSystem Class
426
427
System with chemical bond information and formal charges, typically loaded from molecular file formats. Provides access to molecular connectivity and chemical properties.
428
429
```python { .api }
430
class BondOrderSystem(System):
431
def __init__(self, file_name=None, fmt=None, type_map=None, begin=0, step=1, data=None, rdkit_mol=None, sanitize_level='high', raise_errors=True, verbose=True, **kwargs):
432
"""
433
Initialize BondOrderSystem.
434
435
Parameters:
436
- rdkit_mol: RDKit molecule object
437
- sanitize_level: str, RDKit sanitization level
438
- raise_errors: bool, whether to raise errors
439
- verbose: bool, verbose output
440
"""
441
442
def from_rdkit_mol(self, rdkit_mol):
443
"""
444
Initialize from RDKit molecule.
445
446
Parameters:
447
- rdkit_mol: RDKit molecule object
448
"""
449
450
def get_nbonds(self) -> int:
451
"""Get number of bonds."""
452
453
def get_charge(self) -> int:
454
"""Get total formal charge."""
455
456
def get_mol(self):
457
"""Get RDKit molecule object."""
458
459
def get_bond_order(self, begin_atom_idx: int, end_atom_idx: int) -> int:
460
"""
461
Get bond order between atoms.
462
463
Parameters:
464
- begin_atom_idx: int, first atom index
465
- end_atom_idx: int, second atom index
466
467
Returns:
468
int: bond order (1=single, 2=double, 3=triple)
469
"""
470
471
def from_rdkit_mol(self, rdkit_mol):
472
"""Initialize from RDKit molecule object.
473
474
Parameters:
475
- rdkit_mol: RDKit Mol, molecule object
476
"""
477
478
def get_formal_charges(self) -> list[int]:
479
"""Get formal charges on atoms.
480
481
Returns:
482
list: formal charges for each atom
483
"""
484
```
485
486
## Usage Examples
487
488
### Working with Systems
489
490
```python
491
import dpdata
492
493
# Load VASP structure
494
sys = dpdata.System('POSCAR', fmt='vasp/poscar')
495
496
# Basic properties
497
print(f"Formula: {sys.formula}")
498
print(f"Atoms: {sys.get_natoms()}")
499
print(f"Types: {sys.get_atom_names()}")
500
501
# Manipulate structure
502
replicated = sys.replicate([2, 2, 1]) # 2x2x1 supercell
503
perturbed = sys.perturb(10, atom_pert_distance=0.1) # 10 perturbed structures
504
505
# Export
506
sys.to('lammps/lmp', 'structure.lmp')
507
```
508
509
### Working with Labeled Data
510
511
```python
512
# Load VASP trajectory with energies/forces
513
ls = dpdata.LabeledSystem('OUTCAR', fmt='vasp/outcar')
514
515
print(f"Has forces: {ls.has_forces()}")
516
print(f"Has virial: {ls.has_virial()}")
517
518
# Split trajectory
519
train_data = ls.sub_system(range(0, 80))
520
test_data = ls.sub_system(range(80, 100))
521
522
# Export for ML training
523
train_data.to('deepmd/npy', 'train_data')
524
test_data.to('deepmd/npy', 'test_data')
525
```
526
527
### Working with Multiple Systems
528
529
```python
530
# Load multiple compositions
531
ms = dpdata.MultiSystems()
532
ms.append(dpdata.System('water.xyz', fmt='xyz'))
533
ms.append(dpdata.System('methane.xyz', fmt='xyz'))
534
535
# Train/test split across all systems
536
train_ms, test_ms, test_idx = ms.train_test_split(test_size=0.2, seed=42)
537
538
print(f"Total frames: {ms.get_nframes()}")
539
print(f"Train frames: {train_ms.get_nframes()}")
540
print(f"Test frames: {test_ms.get_nframes()}")
541
```