CtrlK
Blog · Docs · Log in · Get started
Tessl Logo

tessl/pypi-dpdata

Manipulating data formats of DeePMD-kit, VASP, QE, PWmat, and LAMMPS, etc.

Pending
Overview
Eval results
Files

docs/system-management.md

System Management

Core classes for managing atomistic data including unlabeled structures, energy/force labeled datasets, multi-composition systems, and molecular systems with bond information. These classes provide the fundamental data structures for all dpdata operations.

Capabilities

System Class

The fundamental data container for atomic simulation systems. Contains frames with consistent atom ordering, storing coordinates, cell information, atom types, and topology without energy/force labels.

class System:
    """Data container for an atomic simulation system without labels.

    Holds frames with consistent atom ordering and stores coordinates, cell
    information, atom types, and topology, but no energy/force labels.

    Annotations that name ``System`` itself or ``np.ndarray`` are written as
    strings: the class name is not bound yet while its own body executes,
    and quoting keeps this API stub importable without any other names in
    scope.
    """

    def __init__(self, file_name=None, fmt=None, type_map=None, begin=0, step=1, data=None, convergence_check=True, **kwargs):
        """
        Initialize a System from file or data.

        Parameters:
        - file_name: str, path to input file
        - fmt: str, format identifier ('vasp/poscar', 'lammps/lmp', etc.)
        - type_map: list, mapping from element names to indices
        - begin: int, starting frame index
        - step: int, frame step size
        - data: dict, raw system data
        - convergence_check: bool, check VASP convergence
        - kwargs: additional keyword arguments
        """

    def get_atom_names(self) -> list[str]:
        """Get list of element names."""

    def get_atom_types(self) -> "np.ndarray":
        """Get array of atom type indices."""

    def get_atom_numbs(self) -> list[int]:
        """Get number of atoms per type."""

    def get_nframes(self) -> int:
        """Get number of frames."""

    def get_natoms(self) -> int:
        """Get total number of atoms."""

    def get_ntypes(self) -> int:
        """Get number of atom types."""

    def copy(self):
        """Create deep copy of system."""

    def sub_system(self, f_idx):
        """
        Extract subsystem by frame indices.

        Parameters:
        - f_idx: array-like, frame indices to extract

        Returns:
        System with selected frames
        """

    def append(self, system):
        """
        Append another system.

        Parameters:
        - system: System, system to append
        """

    def sort_atom_names(self, type_map=None):
        """
        Sort atoms by element names.

        Parameters:
        - type_map: list, element name order
        """

    def sort_atom_types(self):
        """Sort atoms by type indices."""

    def check_data(self):
        """Validate system data integrity."""

    def map_atom_types(self, type_map: list[str]):
        """Map atom types using custom mapping.

        Parameters:
        - type_map: list, mapping from indices to element names
        """

    def extend(self, systems: "list[System]"):
        """Extend system with multiple other systems.

        Parameters:
        - systems: list of System instances to append
        """

    def affine_map(self, trans: "np.ndarray", f_idx: int = 0):
        """Apply affine transformation to coordinates.

        Parameters:
        - trans: array, 3x3 transformation matrix
        - f_idx: int, frame index to transform
        """

    def rot_lower_triangular(self):
        """Rotate all frames to have lower triangular cells."""

    def rot_frame_lower_triangular(self, f_idx: int = 0):
        """Rotate specific frame to have lower triangular cell.

        Parameters:
        - f_idx: int, frame index to rotate
        """

    def add_atom_names(self, atom_names: list[str]):
        """Add new atom types.

        Parameters:
        - atom_names: list, new element names to add
        """

    def replicate(self, ncopy):
        """
        Replicate system in 3D.

        Parameters:
        - ncopy: array-like [nx, ny, nz], replication counts

        Returns:
        System with replicated structure
        """

    def apply_pbc(self):
        """Apply periodic boundary conditions."""

    def remove_pbc(self, protect_layer=0):
        """
        Remove PBC and create large cell.

        Parameters:
        - protect_layer: float, protection layer thickness
        """

    def perturb(self, pert_num, cell_pert_fraction=0.03, atom_pert_distance=0.01, atom_pert_style='normal', atom_pert_prob=1.0):
        """
        Generate perturbed structures.

        Parameters:
        - pert_num: int, number of perturbed structures
        - cell_pert_fraction: float, cell deformation fraction
        - atom_pert_distance: float, atom displacement distance
        - atom_pert_style: str, perturbation style ('normal', 'const')
        - atom_pert_prob: float, probability of perturbing each atom

        Returns:
        System holding the perturbed frames (pert_num per input frame)
        """

    def shuffle(self):
        """Randomly shuffle frames."""

    def pick_atom_idx(self, idx, nopbc=False):
        """
        Select atoms by indices.

        Parameters:
        - idx: array-like, atom indices to select
        - nopbc: bool, whether system is non-periodic

        Returns:
        System with selected atoms
        """

    def remove_atom_names(self, atom_names):
        """
        Remove specific atom types.

        Parameters:
        - atom_names: list, element names to remove

        Returns:
        System without specified atoms
        """

    def pick_by_amber_mask(self, param, maskstr, pass_coords=True, nopbc=False):
        """
        Select atoms using Amber mask syntax.

        Parameters:
        - param: str, path to parameter file
        - maskstr: str, Amber mask string
        - pass_coords: bool, whether to pass coordinates
        - nopbc: bool, whether system is non-periodic

        Returns:
        System with selected atoms
        """

    def replace(self, initial_atom_type, end_atom_type, replace_num=None):
        """
        Replace atoms of one type with another.

        Parameters:
        - initial_atom_type: str, element to replace
        - end_atom_type: str, replacement element
        - replace_num: int, number of atoms to replace

        Returns:
        System with replaced atoms
        """

    def predict(self, *args, driver=None, **kwargs):
        """
        Predict properties using ML models.

        Parameters:
        - driver: str or Driver, prediction driver
        - args, kwargs: driver-specific arguments

        Returns:
        LabeledSystem with predicted properties
        """

    def minimize(self, *args, minimizer=None, **kwargs):
        """
        Minimize geometry.

        Parameters:
        - minimizer: str or Minimizer, optimization method
        - args, kwargs: minimizer-specific arguments

        Returns:
        System with minimized geometry
        """

    def to(self, fmt, *args, **kwargs):
        """
        Export to various formats.

        Parameters:
        - fmt: str, output format
        - args, kwargs: format-specific arguments
        """

    @classmethod
    def from_dict(cls, data: dict):
        """Create System from dictionary data."""

    @classmethod
    def load(cls, filename: str):
        """Load System from JSON/YAML file."""

    @property
    def formula(self) -> str:
        """Chemical formula string."""

    @property
    def uniq_formula(self) -> str:
        """Sorted formula for comparison."""

    @property
    def short_formula(self) -> str:
        """Compressed formula without zeros."""

    @property
    def formula_hash(self) -> str:
        """SHA256 hash of formula."""

    @property
    def short_name(self) -> str:
        """Abbreviated system name."""

    @property
    def nopbc(self) -> bool:
        """Whether system is non-periodic."""

LabeledSystem Class

System with energy, force, and virial labels for machine learning model training. Extends System with additional methods for handling training data.

class LabeledSystem(System):
    """System carrying energy, force, and virial labels for model training."""

    def has_forces(self) -> bool:
        """Report whether force labels are available."""

    def has_virial(self) -> bool:
        """Report whether virial labels are available."""

    def affine_map_fv(self, trans: np.ndarray, f_idx: int):
        """Transform the forces and virial of a single frame.

        Parameters:
        - trans: array, 3x3 transformation matrix
        - f_idx: int, index of the frame to transform
        """

    def rot_frame_lower_triangular(self, f_idx: int = 0):
        """Rotate one frame so its cell is lower triangular, adjusting
        forces and virial accordingly.

        Parameters:
        - f_idx: int, index of the frame to rotate
        """

    def correction(self, hl_sys):
        """
        Compute the correction between this system and a reference one.

        Parameters:
        - hl_sys: LabeledSystem, high-level reference system

        Returns:
        LabeledSystem holding the correction data
        """

    def remove_outlier(self, threshold=3.0):
        """
        Drop frames that are outliers of the energy distribution.

        Parameters:
        - threshold: float, cutoff expressed in standard deviations

        Returns:
        LabeledSystem without the outlier frames
        """

MultiSystems Class

Container for multiple System objects with different compositions but consistent atom naming. Enables handling of datasets with multiple chemical compositions.

class MultiSystems:
    """Container for multiple System objects with different compositions
    but consistent atom naming."""

    def __init__(self, *systems, type_map=None):
        """
        Initialize MultiSystems container.

        Parameters:
        - systems: System objects to include
        - type_map: list, consistent atom type mapping
        """

    def from_fmt_obj(self, fmtobj, directory, labeled=False, **kwargs):
        """
        Load multiple systems from format object.

        Parameters:
        - fmtobj: Format, format handler
        - directory: str, directory path
        - labeled: bool, whether systems have labels
        - kwargs: format-specific options
        """

    def to(self, fmt, *args, **kwargs):
        """Export all systems to format.

        Parameters:
        - fmt: str, output format
        - args, kwargs: format-specific arguments
        """

    def get_nframes(self) -> int:
        """Get total frames across all systems."""

    def append(self, *systems):
        """
        Add systems or other MultiSystems.

        Parameters:
        - systems: System or MultiSystems objects to add
        """

    def predict(self, *args, driver=None, **kwargs):
        """Predict properties for all systems."""

    def minimize(self, *args, minimizer=None, **kwargs):
        """Minimize all systems."""

    def pick_atom_idx(self, idx, nopbc=False):
        """Select atoms from all systems."""

    def correction(self, hl_sys):
        """Calculate corrections for all systems."""

    def train_test_split(self, test_size=0.2, seed=None):
        """
        Split into training/testing sets.

        Parameters:
        - test_size: float, fraction for testing
        - seed: int, random seed

        Returns:
        tuple: (train_MultiSystems, test_MultiSystems, test_idx), where
        test_idx is a dict holding, for each system, a boolean array over
        its frames (False = training frame, True = testing frame).
        Callers must unpack three values.
        """

    @classmethod
    def from_file(cls, file_name: str, fmt: str = 'auto', **kwargs):
        """Load MultiSystems from single file.

        Parameters:
        - file_name: str, path to input file
        - fmt: str, format identifier
        - kwargs: format-specific options

        Returns:
        MultiSystems instance
        """

    @classmethod
    def from_dir(cls, dir_name: str, file_name: str, fmt: str = 'auto', type_map: "list[str] | None" = None):
        """Load MultiSystems from directory with multiple files.

        Parameters:
        - dir_name: str, directory path
        - file_name: str, file pattern to match
        - fmt: str, format identifier
        - type_map: list or None, atom type mapping

        Returns:
        MultiSystems instance
        """

    def load_systems_from_file(self, file_name: str, fmt: str, **kwargs):
        """Load and append systems from file.

        Parameters:
        - file_name: str, path to input file
        - fmt: str, format identifier
        - kwargs: format-specific options
        """

BondOrderSystem Class

System with chemical bond information and formal charges, typically loaded from molecular file formats. Provides access to molecular connectivity and chemical properties.

class BondOrderSystem(System):
    """System with chemical bond information and formal charges."""

    def __init__(self, file_name=None, fmt=None, type_map=None, begin=0, step=1, data=None, rdkit_mol=None, sanitize_level='high', raise_errors=True, verbose=True, **kwargs):
        """
        Initialize BondOrderSystem.

        Parameters:
        - file_name: str, path to input file
        - fmt: str, format identifier
        - type_map: list, mapping from element names to indices
        - begin: int, starting frame index
        - step: int, frame step size
        - data: dict, raw system data
        - rdkit_mol: RDKit molecule object
        - sanitize_level: str, RDKit sanitization level
        - raise_errors: bool, whether to raise errors
        - verbose: bool, verbose output
        - kwargs: additional keyword arguments
        """

    # Defined once: a second definition of the same name in a class body
    # silently replaces the first.
    def from_rdkit_mol(self, rdkit_mol):
        """
        Initialize from RDKit molecule object.

        Parameters:
        - rdkit_mol: RDKit Mol, molecule object
        """

    def get_nbonds(self) -> int:
        """Get number of bonds."""

    def get_charge(self) -> int:
        """Get total formal charge."""

    def get_mol(self):
        """Get RDKit molecule object."""

    def get_bond_order(self, begin_atom_idx: int, end_atom_idx: int) -> int:
        """
        Get bond order between atoms.

        Parameters:
        - begin_atom_idx: int, first atom index
        - end_atom_idx: int, second atom index

        Returns:
        int: bond order (1=single, 2=double, 3=triple)
        """

    def get_formal_charges(self) -> list[int]:
        """Get formal charges on atoms.

        Returns:
        list: formal charges for each atom
        """

Usage Examples

Working with Systems

import dpdata

# Load VASP structure. The variable is called `system` rather than `sys`
# so it does not shadow the standard-library module of that name.
system = dpdata.System('POSCAR', fmt='vasp/poscar')

# Basic properties
print(f"Formula: {system.formula}")
print(f"Atoms: {system.get_natoms()}")
print(f"Types: {system.get_atom_names()}")

# Manipulate structure
replicated = system.replicate([2, 2, 1])  # 2x2x1 supercell
perturbed = system.perturb(10, atom_pert_distance=0.1)  # 10 perturbed structures

# Export
system.to('lammps/lmp', 'structure.lmp')

Working with Labeled Data

import dpdata

# Load VASP trajectory with energies/forces
ls = dpdata.LabeledSystem('OUTCAR', fmt='vasp/outcar')

print(f"Has forces: {ls.has_forces()}")
print(f"Has virial: {ls.has_virial()}")

# Split trajectory 80/20. The boundary is derived from the actual frame
# count so the example also works when the trajectory does not contain
# exactly 100 frames.
n_frames = ls.get_nframes()
n_train = int(0.8 * n_frames)
train_data = ls.sub_system(range(0, n_train))
test_data = ls.sub_system(range(n_train, n_frames))

# Export for ML training
train_data.to('deepmd/npy', 'train_data')
test_data.to('deepmd/npy', 'test_data')

Working with Multiple Systems

import dpdata

# Load multiple compositions
ms = dpdata.MultiSystems()
ms.append(dpdata.System('water.xyz', fmt='xyz'))
ms.append(dpdata.System('methane.xyz', fmt='xyz'))

# Train/test split across all systems.
# train_test_split returns three values: the training set, the testing set,
# and a dict of per-system boolean arrays marking which frames went to the
# testing set. Unpacking into only two names raises ValueError.
train_ms, test_ms, test_idx = ms.train_test_split(test_size=0.2, seed=42)

print(f"Total frames: {ms.get_nframes()}")
print(f"Train frames: {train_ms.get_nframes()}")
print(f"Test frames: {test_ms.get_nframes()}")

Install with Tessl CLI

npx tessl i tessl/pypi-dpdata

docs

data-analysis.md

format-conversion.md

index.md

system-management.md

tile.json