tessl/pypi-dscribe

A Python package for creating feature transformations in applications of machine learning to materials science.

—

Pending

Overview

Eval results

Files

Utilities

Name: tessl/pypi-dscribe
Author: tessl

DScribe provides utility functions for common operations in materials science applications, including species handling, geometry calculations, statistical analysis, and array operations. These utilities support the core descriptor functionality and provide helpful tools for preprocessing and analysis.

Capabilities

Species Utilities

Functions for working with atomic species, converting between symbols and atomic numbers, and handling species lists.

def symbols_to_numbers(symbols):
    """
    Translate chemical element symbols into their atomic numbers.

    Parameters:
    - symbols: The chemical symbols to translate, given as strings, a list, or an array

    Returns:
    numpy.ndarray: The atomic numbers, collected in an array

    Examples:
    symbols_to_numbers("H") -> [1]
    symbols_to_numbers(["H", "He", "Li"]) -> [1, 2, 3]
    """

def get_atomic_numbers(species):
    """
    Reduce a species list to its ordered atomic numbers.

    Parameters:
    - species: The atomic species as a list (symbols or numbers)

    Returns:
    numpy.ndarray: The unique atomic numbers, sorted

    Examples:
    get_atomic_numbers(["H", "O", "H"]) -> [1, 8]
    get_atomic_numbers([1, 8, 1]) -> [1, 8]
    """

Usage Example:

from dscribe.utils.species import symbols_to_numbers, get_atomic_numbers

# Convert symbols to atomic numbers
symbols = ["H", "H", "O"]
numbers = symbols_to_numbers(symbols)  # [1, 1, 8]

# Get unique atomic numbers from species list
species_list = ["C", "H", "O", "N", "H", "C"]
unique_numbers = get_atomic_numbers(species_list)  # [1, 6, 7, 8]

# Use with mixed input types: get_atomic_numbers expects the species to be
# either all chemical symbols or all atomic numbers (a list mixing the two is
# rejected with a ValueError), so convert the symbols to numbers first
mixed_species = [1, "He", 3, "Be"]
as_numbers = [
    int(symbols_to_numbers([s])[0]) if isinstance(s, str) else s
    for s in mixed_species
]
sorted_numbers = get_atomic_numbers(as_numbers)  # [1, 2, 3, 4]

Geometry Utilities

Functions for geometric operations including neighbor finding, adjacency matrices, and system extensions for periodic boundary conditions.

def get_adjacency_matrix(radius, pos1, pos2=None, output_type="coo_matrix"):
    """
    Get sparse adjacency matrix for atoms within cutoff radius.

    Parameters:
    - radius (float): Cutoff radius in angstroms
    - pos1: First set of atomic positions
    - pos2: Second set of positions (optional, defaults to pos1)
    - output_type (str): Output format ("coo_matrix", "dense").
      NOTE(review): upstream DScribe appears to forward this value to
      scipy.spatial.cKDTree.sparse_distance_matrix, whose documented options
      are "dok_matrix", "coo_matrix", "dict" and "ndarray" -- confirm that
      "dense" is actually accepted.

    Returns:
    scipy.sparse matrix or numpy.ndarray: Adjacency matrix indicating connections.
    NOTE(review): the stored values are presumably the pairwise distances
    within the cutoff rather than 0/1 flags -- verify against the DScribe source.
    """

def get_adjacency_list(adjacency_matrix):
    """
    Turn an adjacency matrix into its list representation.

    Parameters:
    - adjacency_matrix: The adjacency matrix to convert, sparse or dense

    Returns:
    list: One list of neighbors per atom
    """

def get_extended_system(system, radial_cutoff, centers=None, return_cell_indices=False):
    """
    Pad a periodic system with neighboring unit cells so that every local
    environment inside the cutoff radius is complete.

    Parameters:
    - system: The system to extend, an ASE Atoms or DScribe System object
    - radial_cutoff (float): Cutoff radius of the local environments
    - centers: Indices of the atoms the extension is centered around (optional)
    - return_cell_indices (bool): If True, also return the cell indices of the extended atoms

    Returns:
    System or tuple: The extended system, optionally together with the cell indices
    """

Usage Example:

from dscribe.utils.geometry import get_adjacency_matrix, get_extended_system
from ase.build import bulk
import numpy as np

# Build a periodic rocksalt NaCl cell
nacl = bulk("NaCl", "rocksalt", a=5.64)

# Adjacency matrix of the atoms lying within 3 Å of each other
neighbor_cutoff = 3.0
positions = nacl.get_positions()
adjacency = get_adjacency_matrix(neighbor_cutoff, positions)
print(f"Adjacency matrix shape: {adjacency.shape}")
print(f"Number of connections: {adjacency.nnz}")

# Pad the cell with neighboring unit cells for descriptor calculations
extended_system = get_extended_system(nacl, radial_cutoff=6.0)
print(f"Original atoms: {len(nacl)}")
print(f"Extended atoms: {len(extended_system)}")

# Request the cell indices as well, to keep track of the extended atoms
extension = get_extended_system(nacl, radial_cutoff=6.0, return_cell_indices=True)
extended_system, cell_indices = extension

Statistics Utilities

Functions for gathering statistics from collections of atomic systems, useful for descriptor setup and data analysis.

def system_stats(system_iterator):
    """
    Collect summary statistics over a collection of atomic systems.

    Parameters:
    - system_iterator: An iterable yielding ASE Atoms or DScribe System objects

    Returns:
    dict: A dictionary of statistics with the keys:
        - n_atoms_max: Largest atom count found in a single system
        - max_atomic_number: Largest atomic number encountered
        - min_atomic_number: Smallest atomic number encountered
        - atomic_numbers: Set of every atomic number encountered
        - element_symbols: Set of every element symbol encountered
        - min_distance: Shortest interatomic distance encountered
    """

Usage Example:

from dscribe.utils.stats import system_stats
from ase.build import molecule, bulk

# Assemble a small dataset: three molecules plus one bulk crystal
systems = [molecule(formula) for formula in ("H2O", "NH3", "CH4")]
systems.append(bulk("NaCl", "rocksalt", a=5.64))

# Compute the statistics in a single pass over the dataset
stats = system_stats(systems)

print(f"Maximum atoms in any system: {stats['n_atoms_max']}")
print(f"Atomic numbers present: {stats['atomic_numbers']}")
print(f"Element symbols: {stats['element_symbols']}")
print(f"Atomic number range: {stats['min_atomic_number']}-{stats['max_atomic_number']}")
print(f"Minimum distance: {stats['min_distance']:.3f} Å")

# Size the descriptor from the gathered statistics
from dscribe.descriptors import CoulombMatrix
largest_system_size = stats['n_atoms_max']
cm = CoulombMatrix(n_atoms_max=largest_system_size)

Dimensionality Utilities

Functions for checking array dimensions and data types, useful for input validation and preprocessing.

def is1d(array, dtype=None):
    """
    Check if array is 1D with optional dtype verification.

    Parameters:
    - array: Array to check
    - dtype: Expected data type (e.g., np.integer, np.floating).
      NOTE(review): the upstream DScribe source appears to declare
      dtype=np.integer as the default rather than None -- confirm, and pass
      dtype explicitly when checking non-integer data.

    Returns:
    bool: True if array is 1D and matches dtype (if specified)
    """

def is2d(array, dtype=None):
    """
    Check if array is 2D with optional dtype verification.

    Parameters:
    - array: Array to check
    - dtype: Expected data type (e.g., np.integer, np.floating).
      NOTE(review): the upstream DScribe source appears to declare
      dtype=np.integer as the default rather than None -- confirm, and pass
      dtype explicitly when checking non-integer data.

    Returns:
    bool: True if array is 2D and matches dtype (if specified)
    """

Usage Example:

from dscribe.utils.dimensionality import is1d, is2d
import numpy as np

# Test arrays
positions = np.random.rand(10, 3)  # 2D float array
distances = np.random.rand(10)     # 1D float array
indices = np.array([1, 2, 3, 4])   # 1D integer array

# Check dimensions. The expected element type is passed explicitly so the
# result for the floating-point arrays does not depend on the default dtype.
print(f"Positions is 2D: {is2d(positions, np.floating)}")
print(f"Distances is 1D: {is1d(distances, np.floating)}")
print(f"Indices is 1D integer: {is1d(indices, np.integer)}")
print(f"Positions is 1D: {is1d(positions, np.floating)}")  # False

# Use for input validation
def validate_positions(pos):
    """Check that pos is a 2D floating-point array with three columns.

    Returns True when the input is valid and raises ValueError otherwise.
    """
    if not is2d(pos, np.floating):
        raise ValueError("Positions must be a 2D floating-point array")
    # np.asarray makes the column check work for nested sequences as well,
    # which have no .shape attribute of their own
    if np.asarray(pos).shape[1] != 3:
        raise ValueError("Positions must have 3 columns (x, y, z)")
    return True

# Validation example
try:
    validate_positions(positions)  # Pass
    validate_positions(distances)  # Fail - not 2D
except ValueError as e:
    print(f"Validation failed: {e}")

Common Utility Patterns

Preprocessing Workflows

from dscribe.utils.stats import system_stats
from dscribe.utils.species import get_atomic_numbers
from ase.build import molecule

# Step 1: Collect statistics over the dataset
systems = [molecule(formula) for formula in ("H2O", "NH3", "CH4")]
stats = system_stats(systems)

# Step 2: Derive the species list from the statistics
species = list(stats['element_symbols'])
atomic_nums = get_atomic_numbers(species)

# Step 3: Set up the descriptors from the gathered values
from dscribe.descriptors import SOAP, CoulombMatrix

soap_settings = {"species": species, "r_cut": 6.0, "n_max": 8, "l_max": 6}
soap = SOAP(**soap_settings)

largest_system_size = stats['n_atoms_max']
cm = CoulombMatrix(n_atoms_max=largest_system_size)

Periodic System Handling

from dscribe.utils.geometry import get_extended_system
from dscribe.descriptors import SOAP
from ase.build import bulk

# Diamond-structure silicon serves as the periodic system
crystal = bulk("Si", "diamond", a=5.43)

# A single cutoff is shared by the extension and the descriptor
r_cut = 6.0
extended = get_extended_system(crystal, radial_cutoff=r_cut)

# Feed the extended system to the descriptor
soap_settings = {"species": ["Si"], "r_cut": r_cut, "n_max": 8, "l_max": 6}
soap = SOAP(**soap_settings)
descriptors = soap.create(extended)

Data Validation

from dscribe.utils.dimensionality import is1d, is2d
import numpy as np

def validate_descriptor_input(systems, centers=None):
    """Validate input for descriptor calculations.

    Parameters:
    - systems: A single atomic system or an iterable of systems
    - centers: Optional 1D collection of non-negative integer indices

    Returns:
    tuple: (systems, centers), where a single system has been wrapped in a list

    Raises:
    ValueError: If centers is not a 1D integer array or contains negative indices
    """
    # Imported locally so the snippet stays self-contained
    from ase import Atoms

    # A single Atoms object is itself iterable (it yields its atoms), so the
    # iter() probe alone cannot tell it apart from a collection of systems
    if isinstance(systems, Atoms):
        systems = [systems]
    else:
        try:
            iter(systems)
        except TypeError:
            systems = [systems]  # Single system

    # Validate centers if provided
    if centers is not None:
        if not is1d(centers, np.integer):
            raise ValueError("Centers must be 1D integer array")

        # Convert first: comparing a plain Python list with 0 raises TypeError
        if np.any(np.asarray(centers) < 0):
            raise ValueError("Centers indices must be non-negative")

    return systems, centers

# Usage in descriptor code
systems, centers = validate_descriptor_input(my_systems, my_centers)

Integration with Descriptors

These utilities are used internally by DScribe descriptors but are also available for user applications:

Species utilities: Used by all descriptors for species validation and processing
Geometry utilities: Used by local descriptors for neighbor finding and system extension
Statistics utilities: Helpful for setting up descriptor parameters across datasets
Dimensionality utilities: Used for input validation throughout the package

The utilities provide building blocks for custom analysis workflows and help ensure consistent data handling across different parts of the DScribe ecosystem.

Install with Tessl CLI