tessl/pypi-mdanalysis

An object-oriented toolkit to analyze molecular dynamics trajectories.

—

Pending

Quality

Pending

Does it follow best practices?

Impact

Pending

No eval scenarios have been run

Overview

Eval results

Files

Selection Language

Name: tessl/pypi-mdanalysis
Author: tessl

MDAnalysis provides a powerful and flexible selection language for identifying atoms, residues, and molecular components. The selection syntax is similar to that used in VMD and PyMOL, making it familiar to molecular visualization users while extending functionality for analysis workflows.

Overview

The selection language allows you to:

Select atoms based on properties (names, types, residues, segments)
Use geometric criteria (distances, shapes, regions)
Combine selections with Boolean logic
Create dynamic selections that update with trajectory changes
Perform complex queries with multiple criteria

Basic Selection Syntax

Core Selection Function

def select_atoms(universe_or_atomgroup, *args, **kwargs):
    """
    Select atoms using the MDAnalysis selection language.
    
    Parameters
    ----------
    universe_or_atomgroup : Universe or AtomGroup
        Source for atom selection.
    *args : str
        Selection string(s) in MDAnalysis selection language.
    updating : bool, optional
        Whether selection should update when topology changes (default False).
    periodic : bool, optional
        Whether to account for periodic boundary conditions in 
        geometric selections (default True).
    rtol : float, optional
        Relative tolerance for floating point comparisons (default 1e-5).
    atol : float, optional
        Absolute tolerance for floating point comparisons (default 1e-8).
        
    Returns
    -------
    AtomGroup
        Selected atoms matching the criteria.
        
    Examples
    --------
    >>> # Basic property selections
    >>> ca_atoms = u.select_atoms("name CA")
    >>> protein = u.select_atoms("protein")
    >>> waters = u.select_atoms("resname SOL TIP3 WAT")
    
    >>> # Geometric selections
    >>> near_protein = u.select_atoms("around 5.0 protein")
    >>> binding_site = u.select_atoms("resid 23-45 and around 8.0 resname LIG")
    
    >>> # Complex Boolean logic
    >>> flexible = u.select_atoms("protein and not backbone and not resname PRO")
    """

Property-Based Selections

Atom Properties

# Atom name selections
u.select_atoms("name CA")              # Alpha carbons
u.select_atoms("name CA CB CG")        # Multiple atom names
u.select_atoms("name C*")              # Wildcard: all carbon atoms
u.select_atoms("name *A")              # Names ending with A

# Atom type selections  
u.select_atoms("type CT NH1 OH")       # Specific atom types
u.select_atoms("type CT*")             # Types starting with CT

# Mass selections
u.select_atoms("mass 12.01")           # Specific mass (carbon)
u.select_atoms("mass > 14.0")          # Heavier than nitrogen
u.select_atoms("mass 12.0:16.0")       # Mass range

# Charge selections
u.select_atoms("charge 0.0")           # Neutral atoms
u.select_atoms("charge < -0.5")        # Highly negative
u.select_atoms("charge -0.1:0.1")      # Nearly neutral range

# Element selections (if available)
u.select_atoms("element C")            # Carbon atoms
u.select_atoms("element N O")          # Nitrogen or oxygen

Residue Properties

# Residue name selections
u.select_atoms("resname ALA")          # Alanine residues
u.select_atoms("resname ALA GLY PRO")  # Multiple residue types
u.select_atoms("resname A*")           # Names starting with A

# Residue ID selections
u.select_atoms("resid 1")              # Residue 1
u.select_atoms("resid 1 5 10")         # Specific residues
u.select_atoms("resid 1-10")           # Range of residues
u.select_atoms("resid 1:10:2")         # Range with step (1,3,5,7,9)

# Residue number selections (different from resid)
u.select_atoms("resnum 1-100")         # Sequential numbering

# Insertion codes (for PDB files)
u.select_atoms("icode A")              # Specific insertion code
u.select_atoms("icode ''")             # Empty insertion code

Segment Properties

# Segment ID selections
u.select_atoms("segid PROA")           # Specific segment
u.select_atoms("segid PROA PROB")      # Multiple segments  
u.select_atoms("segid PRO*")           # Wildcard matching

# Combining segment and residue selections
u.select_atoms("segid PROA and resid 1-100")
u.select_atoms("segid PROB and resname ALA")

Predefined Selections

Common Macros

# Protein components
u.select_atoms("protein")              # All protein atoms
u.select_atoms("backbone")             # Protein backbone (N, CA, C, O)
u.select_atoms("sidechain")            # Protein sidechains  
u.select_atoms("nucleic")              # Nucleic acid atoms
u.select_atoms("nucleicacid")          # Alias for nucleic

# Water and ions
u.select_atoms("water")                # Water molecules
u.select_atoms("ion")                  # Ion atoms

# Secondary structure (if available)
u.select_atoms("alpha")                # Alpha helix
u.select_atoms("beta")                 # Beta sheet
u.select_atoms("coil")                 # Random coil

# Atom categories
u.select_atoms("hydrogen")             # Hydrogen atoms
u.select_atoms("heavy")                # Heavy (non-hydrogen) atoms
u.select_atoms("donor")                # Hydrogen bond donors
u.select_atoms("acceptor")             # Hydrogen bond acceptors

Predefined Residue Groups

# Amino acid categories
u.select_atoms("acidic")               # Acidic residues (ASP, GLU)
u.select_atoms("basic")                # Basic residues (LYS, ARG, HIS)
u.select_atoms("polar")                # Polar residues  
u.select_atoms("nonpolar")             # Nonpolar residues
u.select_atoms("charged")              # Charged residues (acidic + basic)
u.select_atoms("aromatic")             # Aromatic residues (PHE, TYR, TRP)
u.select_atoms("hydrophobic")          # Hydrophobic residues
u.select_atoms("small")                # Small residues (ALA, GLY, SER)
u.select_atoms("medium")               # Medium-sized residues
u.select_atoms("large")                # Large residues

# Specific amino acid types
u.select_atoms("aliphatic")            # Aliphatic residues
u.select_atoms("sulfur")               # Sulfur-containing (CYS, MET)
u.select_atoms("alcoholic")            # Alcohol groups (SER, THR, TYR)
u.select_atoms("amide")                # Amide groups (ASN, GLN)

Geometric Selections

Distance-Based Selections

# Around selections - atoms within distance
u.select_atoms("around 5.0 protein")               # Within 5Å of protein
u.select_atoms("around 3.5 resname LIG")           # Near ligand
u.select_atoms("around 8.0 (resid 23 and name CA)")  # Near specific atom

# Point selections - around coordinates  
u.select_atoms("point 10.0 20.0 30.0 5.0")        # Within 5Å of point
u.select_atoms("point 0 0 0 10.0")                 # Near origin

# Spherical layer selections
u.select_atoms("sphlayer 2.0 5.0 protein")        # Shell around protein
u.select_atoms("spherical_layer 1.5 4.0 resname LIG")  # Full name

# Cylindrical selections
u.select_atoms("cylayer 2.0 5.0 10.0 -5.0 protein")    # Cylindrical shell
u.select_atoms("cylinder 5.0 10.0 -5.0 protein")       # Inside cylinder

# Isolation selections - isolated atoms/residues
u.select_atoms("isolated")                         # Atoms with no neighbors
u.select_atoms("isolated 5.0")                     # No neighbors within 5Å

Shape-Based Selections

# Slab selections - between parallel planes
u.select_atoms("slab 10.0 20.0 z")                # Z-coordinate slab
u.select_atoms("slab -5.0 5.0 x")                 # X-coordinate slab

# Box selections - rectangular regions
u.select_atoms("prop x > 10.0")                   # X coordinate > 10
u.select_atoms("prop y < -5.0")                   # Y coordinate < -5
u.select_atoms("prop z 0.0:10.0")                 # Z range

# Complex geometric combinations
u.select_atoms("around 8.0 protein and prop z > 0")     # Upper half near protein
u.select_atoms("sphlayer 3.0 6.0 protein and not water")  # Protein shell, no water

Boolean Logic and Combinations

Logical Operators

# AND operations
u.select_atoms("protein and name CA")             # Protein alpha carbons
u.select_atoms("resname ALA and name CB")         # Alanine beta carbons
u.select_atoms("resid 1-10 and backbone")         # N-terminal backbone

# OR operations  
u.select_atoms("resname ALA or resname GLY")      # Alanine or glycine
u.select_atoms("name CA or name CB")              # Alpha or beta carbons
u.select_atoms("segid PROA or segid PROB")        # Multiple segments

# NOT operations
u.select_atoms("protein and not backbone")        # Protein sidechains
u.select_atoms("not water and not ion")           # Non-solvent atoms
u.select_atoms("around 5.0 protein and not protein")  # Solvent around protein

# Complex combinations with parentheses
u.select_atoms("(resname ALA or resname GLY) and name CA")
u.select_atoms("protein and not (backbone or resname PRO)")
u.select_atoms("(around 5.0 resname LIG) and (protein or nucleic)")

Selection Algebra

# Using AtomGroup operations for complex logic
protein = u.select_atoms("protein")
ligand = u.select_atoms("resname LIG")  
waters = u.select_atoms("resname SOL")

# Intersection
binding_site_protein = protein & u.select_atoms("around 5.0 resname LIG")

# Union
protein_and_ligand = protein | ligand

# Difference  
sidechain = protein - u.select_atoms("backbone")

# Symmetric difference
mobile_region = (protein | ligand) ^ u.select_atoms("around 3.0 (protein or resname LIG)")

Dynamic and Updating Selections

Updating Selections

# Static selections (default) - fixed atom set
static_near = u.select_atoms("around 5.0 resname LIG")

# Dynamic selections - update each frame
dynamic_near = u.select_atoms("around 5.0 resname LIG", updating=True)

# Example of difference
ligand = u.select_atoms("resname LIG")
for ts in u.trajectory:
    # Static selection - same atoms throughout
    static_atoms = static_near.positions
    
    # Dynamic selection - atoms change as ligand moves
    dynamic_atoms = dynamic_near.positions
    
    print(f"Frame {ts.frame}: Static={len(static_near)}, Dynamic={len(dynamic_near)}")

Periodic Boundary Considerations

# Selections accounting for periodic boundaries
u.select_atoms("around 5.0 protein", periodic=True)   # Default behavior
u.select_atoms("around 5.0 protein", periodic=False)  # Ignore PBC

# Important for systems where molecules cross box boundaries
wrapped_selection = u.select_atoms("sphlayer 2.0 5.0 protein", periodic=True)

Advanced Selection Patterns

Property Ranges and Comparisons

# Numerical comparisons
u.select_atoms("mass > 15.0")                     # Heavy atoms
u.select_atoms("charge < -0.5")                   # Negative atoms
u.select_atoms("charge >= -0.1 and charge <= 0.1")  # Nearly neutral

# Range specifications  
u.select_atoms("resid 1:100")                     # Residue range
u.select_atoms("mass 12.0:16.0")                  # Mass range
u.select_atoms("prop x -10.0:10.0")               # Coordinate range

# Multiple ranges
u.select_atoms("resid 1-50 100-150 200-250")      # Multiple residue ranges
u.select_atoms("name CA CB CG CD CE")              # Multiple names

Pattern Matching

# Wildcard patterns
u.select_atoms("name C*")                         # Names starting with C
u.select_atoms("name *A")                         # Names ending with A  
u.select_atoms("name H*")                         # All hydrogens
u.select_atoms("resname A*")                      # Residues starting with A
u.select_atoms("segid *A")                        # Segments ending with A

# Regular expressions (when supported)
u.select_atoms("name ~/^C[0-9]+$/")               # C followed by numbers
u.select_atoms("resname ~/^(ALA|GLY|PRO)$/")      # Specific residues

Index-Based Selections

# Atom index selections
u.select_atoms("index 0")                         # First atom
u.select_atoms("index 0 10 20 30")                # Specific indices
u.select_atoms("index 0-99")                      # Index range
u.select_atoms("index 0:100:10")                  # Every 10th atom

# Combining indices with other criteria
u.select_atoms("index 0-999 and protein")         # First 1000 protein atoms
u.select_atoms("protein and index 0:1000:2")      # Every other protein atom

Selection Language Extensions

Custom Selection Keywords

# MDAnalysis allows custom selection keyword definitions
from MDAnalysis.core.selection import SelectionParser

def hydrophobic_residues(group):
    """Custom selection for hydrophobic residues."""
    hydrophobic = ['ALA', 'VAL', 'LEU', 'ILE', 'PHE', 'PRO', 'MET', 'TRP']
    return group.residues[np.isin(group.residues.resnames, hydrophobic)].atoms

# Register custom selection
SelectionParser.parse_tokens.append(('hydrophobic_residues', hydrophobic_residues))

# Now can use in selections
u.select_atoms("hydrophobic_residues and name CA")

Precomputed Selections

def create_selection_library(universe):
    """
    Create library of commonly used selections for a system.
    """
    selections = {}
    
    # Basic components
    selections['protein'] = universe.select_atoms("protein")
    selections['backbone'] = universe.select_atoms("backbone")  
    selections['sidechain'] = universe.select_atoms("protein and not backbone")
    selections['water'] = universe.select_atoms("resname SOL WAT TIP*")
    selections['ions'] = universe.select_atoms("ion")
    
    # Secondary structure regions (if available)
    try:
        selections['helix'] = universe.select_atoms("alpha")
        selections['sheet'] = universe.select_atoms("beta")  
        selections['loop'] = universe.select_atoms("coil")
    except:
        pass
    
    # Functional regions
    selections['ca_atoms'] = universe.select_atoms("name CA")
    selections['heavy_atoms'] = universe.select_atoms("not name H*")
    
    # Binding site (example - customize for specific system)
    if 'LIG' in universe.residues.resnames:
        selections['ligand'] = universe.select_atoms("resname LIG")
        selections['binding_site'] = universe.select_atoms("around 5.0 resname LIG and protein")
        selections['first_shell'] = universe.select_atoms("sphlayer 0.0 5.0 resname LIG")
        selections['second_shell'] = universe.select_atoms("sphlayer 5.0 10.0 resname LIG")
    
    return selections

# Create and use selection library
sel_lib = create_selection_library(u)
protein_ca = sel_lib['ca_atoms']
binding_site = sel_lib.get('binding_site', u.select_atoms("name CA"))  # Fallback

Performance Optimization

Selection Caching

class SelectionCache:
    """
    Cache frequently used selections for performance.
    """
    
    def __init__(self, universe):
        self.universe = universe
        self._cache = {}
    
    def get_selection(self, selection_string, **kwargs):
        """
        Get selection with caching.
        
        Parameters
        ----------
        selection_string : str
            Selection string.
        **kwargs
            Additional selection arguments.
            
        Returns
        -------
        AtomGroup
            Cached or newly created selection.
        """
        # Create cache key from selection string and kwargs
        cache_key = (selection_string, tuple(sorted(kwargs.items())))
        
        if cache_key not in self._cache:
            self._cache[cache_key] = self.universe.select_atoms(
                selection_string, **kwargs
            )
        
        return self._cache[cache_key]
    
    def clear_cache(self):
        """Clear selection cache."""
        self._cache.clear()

# Usage
cache = SelectionCache(u)

# These will be cached after first call
protein = cache.get_selection("protein")
ca_atoms = cache.get_selection("name CA")
backbone = cache.get_selection("backbone")

# Fast subsequent access
for ts in u.trajectory:
    # Fast access to cached selections
    protein_com = protein.center_of_mass()
    ca_positions = ca_atoms.positions

Efficient Selection Strategies

def optimize_selection_workflow():
    """
    Demonstrate efficient selection strategies.
    """
    # Strategy 1: Use broad selections first, then narrow
    # Less efficient:
    # all_ca = u.select_atoms("name CA")
    # protein_ca = all_ca.select_atoms("protein")  # Re-selection overhead
    
    # More efficient:
    protein_ca = u.select_atoms("protein and name CA")  # Single selection
    
    # Strategy 2: Reuse selections when possible
    protein = u.select_atoms("protein")
    backbone = protein.select_atoms("backbone")        # Subset of existing selection
    sidechain = protein.select_atoms("not backbone")   # Complement
    
    # Strategy 3: Use static selections for unchanging criteria
    static_protein = u.select_atoms("protein")         # Won't change
    
    # Strategy 4: Use updating selections only when needed
    dynamic_contacts = u.select_atoms("around 5.0 resname LIG", updating=True)
    
    # Strategy 5: Batch similar selections
    selections = {
        'ca': u.select_atoms("name CA"),
        'cb': u.select_atoms("name CB"), 
        'backbone': u.select_atoms("backbone"),
        'sidechain': u.select_atoms("protein and not backbone")
    }
    
    return selections

# Apply optimization strategies
optimized_sels = optimize_selection_workflow()

Usage Examples

Complex Selection Workflows

def analyze_binding_pocket(universe, ligand_resname="LIG"):
    """
    Comprehensive binding pocket analysis using selections.
    """
    # Define key selections
    ligand = universe.select_atoms(f"resname {ligand_resname}")
    if len(ligand) == 0:
        raise ValueError(f"No ligand found with resname {ligand_resname}")
    
    # Binding pocket definition with multiple criteria
    pocket_atoms = universe.select_atoms(
        f"(around 5.0 resname {ligand_resname}) and protein and (not backbone)"
    )
    
    # Different shells around ligand
    first_shell = universe.select_atoms(f"sphlayer 0.0 4.0 resname {ligand_resname}")
    second_shell = universe.select_atoms(f"sphlayer 4.0 8.0 resname {ligand_resname}")
    
    # Specific interaction partners
    hbond_partners = universe.select_atoms(
        f"(around 3.5 resname {ligand_resname}) and (donor or acceptor)"
    )
    
    # Hydrophobic contacts
    hydrophobic_contacts = universe.select_atoms(
        f"(around 4.5 resname {ligand_resname}) and "
        f"(resname ALA VAL LEU ILE PHE PRO MET TRP) and "
        f"(not backbone)"
    )
    
    # Salt bridge partners
    salt_bridge_partners = universe.select_atoms(
        f"(around 6.0 resname {ligand_resname}) and "
        f"(resname LYS ARG ASP GLU) and "
        f"(name NZ NH1 NH2 OD1 OD2 OE1 OE2)"
    )
    
    # Analysis results
    results = {
        'ligand_atoms': len(ligand),
        'pocket_atoms': len(pocket_atoms),
        'first_shell_atoms': len(first_shell),
        'second_shell_atoms': len(second_shell),
        'hbond_partners': len(hbond_partners),
        'hydrophobic_contacts': len(hydrophobic_contacts),
        'salt_bridge_partners': len(salt_bridge_partners)
    }
    
    # Residue-level analysis
    pocket_residues = pocket_atoms.residues
    results['pocket_residues'] = len(pocket_residues)
    results['pocket_resnames'] = list(set(pocket_residues.resnames))
    
    return results, {
        'ligand': ligand,
        'pocket': pocket_atoms, 
        'hbond_partners': hbond_partners,
        'hydrophobic': hydrophobic_contacts,
        'salt_bridges': salt_bridge_partners
    }

# Run binding pocket analysis
results, selections = analyze_binding_pocket(u, "LIG")
print("Binding Pocket Analysis:")
for key, value in results.items():
    print(f"  {key}: {value}")

Dynamic Selection Analysis

def track_selection_changes(universe, selection_string, updating=True):
    """
    Track how selection membership changes over trajectory.
    """
    selection = universe.select_atoms(selection_string, updating=updating)
    
    results = {
        'frame': [],
        'n_atoms': [],
        'atom_indices': [],
        'unique_atoms': set()
    }
    
    for ts in universe.trajectory:
        frame = ts.frame
        n_atoms = len(selection)
        atom_indices = selection.indices.copy()
        
        results['frame'].append(frame)
        results['n_atoms'].append(n_atoms)
        results['atom_indices'].append(atom_indices)
        results['unique_atoms'].update(atom_indices)
    
    # Summary statistics
    results['mean_atoms'] = np.mean(results['n_atoms'])
    results['std_atoms'] = np.std(results['n_atoms'])
    results['min_atoms'] = np.min(results['n_atoms'])
    results['max_atoms'] = np.max(results['n_atoms'])
    results['total_unique'] = len(results['unique_atoms'])
    
    return results

# Example: Track water molecules near protein
water_dynamics = track_selection_changes(
    u, "resname SOL and around 5.0 protein", updating=True
)

print(f"Water dynamics around protein:")
print(f"  Average water molecules: {water_dynamics['mean_atoms']:.1f}")
print(f"  Range: {water_dynamics['min_atoms']}-{water_dynamics['max_atoms']}")
print(f"  Total unique waters: {water_dynamics['total_unique']}")

Selection Validation

def validate_selections(universe, selection_dict):
    """
    Validate multiple selections and check for issues.
    """
    validation_results = {}
    
    for name, selection_string in selection_dict.items():
        try:
            # Test selection
            atoms = universe.select_atoms(selection_string)
            
            # Basic validation
            results = {
                'valid': True,
                'n_atoms': len(atoms),
                'n_residues': len(atoms.residues) if len(atoms) > 0 else 0,
                'n_segments': len(atoms.segments) if len(atoms) > 0 else 0,
                'warnings': []
            }
            
            # Check for empty selection
            if len(atoms) == 0:
                results['warnings'].append("Selection returned no atoms")
            
            # Check for very large selections (might be inefficient)
            if len(atoms) > len(universe.atoms) * 0.8:
                results['warnings'].append("Selection includes >80% of system")
            
            # Check for geometric selections without proper context
            if any(keyword in selection_string.lower() 
                   for keyword in ['around', 'within', 'sphlayer']):
                if 'updating' not in selection_string:
                    results['warnings'].append(
                        "Geometric selection might benefit from updating=True"
                    )
            
            validation_results[name] = results
            
        except Exception as e:
            validation_results[name] = {
                'valid': False,
                'error': str(e),
                'n_atoms': 0
            }
    
    return validation_results

# Example selection validation
test_selections = {
    'protein': "protein",
    'ca_atoms': "name CA",
    'binding_site': "around 5.0 resname LIG and protein",
    'invalid': "name INVALID_ATOM",
    'water_shell': "sphlayer 5.0 10.0 protein and resname SOL"
}

validation = validate_selections(u, test_selections)
for name, result in validation.items():
    if result['valid']:
        print(f"{name}: {result['n_atoms']} atoms")
        for warning in result.get('warnings', []):
            print(f"  Warning: {warning}")
    else:
        print(f"{name}: ERROR - {result['error']}")

Install with Tessl CLI