A Python wrapper for PaDEL-Descriptor software that enables molecular descriptor and fingerprint calculation from SMILES, MDL, and SDF inputs
npx @tessl/cli install tessl/pypi-padelpy@0.1.0
A Python wrapper for PaDEL-Descriptor software that enables molecular descriptor and fingerprint calculation from SMILES strings, MDL MolFiles, and SDF files. PaDELPy provides both high-level convenience functions and low-level command-line wrapper access to the bundled PaDEL-Descriptor tool.
pip install padelpy
from padelpy import from_smiles, from_mdl, from_sdf, padeldescriptor
from padelpy import from_smiles, from_mdl, from_sdf
# Calculate descriptors from SMILES string
descriptors = from_smiles('CCC') # propane
print(f"Number of descriptors: {len(descriptors)}")
print(f"Molecular weight: {descriptors['MW']}")
# Calculate descriptors from multiple SMILES
multi_descriptors = from_smiles(['CCC', 'CCCC']) # propane and butane
print(f"Processed {len(multi_descriptors)} molecules")
# Calculate both descriptors and fingerprints
desc_fp = from_smiles('CCC', fingerprints=True)
# Process MDL file
mdl_descriptors = from_mdl('molecules.mdl')
# Process SDF file
sdf_descriptors = from_sdf('molecules.sdf')
# Save results to CSV
from_smiles('CCC', output_csv='descriptors.csv')
Converts SMILES strings to molecular descriptors and fingerprints with automatic 3D structure generation and comprehensive parameter control.
def from_smiles(smiles, output_csv: str = None, descriptors: bool = True, fingerprints: bool = False, timeout: int = 60, maxruntime: int = -1, threads: int = -1) -> 'OrderedDict | list':
"""
Convert SMILES string(s) to molecular descriptors/fingerprints.
Args:
smiles (str or list): SMILES string or list of SMILES strings
output_csv (str, optional): CSV file path to save descriptors
descriptors (bool): Calculate descriptors if True (default: True)
fingerprints (bool): Calculate fingerprints if True (default: False)
timeout (int): Maximum conversion time in seconds (default: 60)
maxruntime (int): Maximum running time per molecule in seconds (default: -1, unlimited)
threads (int): Number of threads to use (default: -1, max available)
Returns:
OrderedDict or list: Single OrderedDict for one molecule (str input),
list of OrderedDicts for multiple molecules (list input)
Raises:
RuntimeError: For invalid SMILES or processing failures
"""
Usage Examples:
# Single SMILES
descriptors = from_smiles('CCC')
# Multiple SMILES
descriptors = from_smiles(['CCC', 'CCCC'])
# Only fingerprints
fingerprints = from_smiles('CCC', fingerprints=True, descriptors=False)
# Control performance
descriptors = from_smiles(['CCC', 'CCCC'], threads=1, maxruntime=30)
# Save to file
from_smiles('CCC', output_csv='propane_descriptors.csv')
Processes MDL MolFiles containing one or more molecular structures, extracting descriptors and fingerprints for each compound.
def from_mdl(mdl_file: str, output_csv: str = None, descriptors: bool = True, fingerprints: bool = False, timeout: int = 60, maxruntime: int = -1, threads: int = -1) -> list:
"""
Convert MDL file to molecular descriptors/fingerprints.
Args:
mdl_file (str): Path to MDL file (must have .mdl extension)
output_csv (str, optional): CSV file path to save descriptors
descriptors (bool): Calculate descriptors if True (default: True)
fingerprints (bool): Calculate fingerprints if True (default: False)
timeout (int): Maximum conversion time in seconds (default: 60)
maxruntime (int): Maximum running time per molecule in seconds (default: -1, unlimited)
threads (int): Number of threads to use (default: -1, max available)
Returns:
list: List of dicts, each corresponding to a compound in the MDL file
Raises:
ValueError: For invalid file extension (.mdl required)
RuntimeError: For processing failures
"""
Usage Examples:
# Process MDL file
descriptors = from_mdl('molecules.mdl')
# Include fingerprints
desc_fp = from_mdl('molecules.mdl', fingerprints=True)
# Single-threaded processing
descriptors = from_mdl('molecules.mdl', threads=1)
# Save results
from_mdl('molecules.mdl', output_csv='mdl_descriptors.csv')
Processes Structure Data Format (SDF) files containing molecular structures with optional associated data.
def from_sdf(sdf_file: str, output_csv: str = None, descriptors: bool = True, fingerprints: bool = False, timeout: int = 60, maxruntime: int = -1, threads: int = -1) -> list:
"""
Convert SDF file to molecular descriptors/fingerprints.
Args:
sdf_file (str): Path to SDF file (must have .sdf extension)
output_csv (str, optional): CSV file path to save descriptors
descriptors (bool): Calculate descriptors if True (default: True)
fingerprints (bool): Calculate fingerprints if True (default: False)
timeout (int): Maximum conversion time in seconds (default: 60)
maxruntime (int): Maximum running time per molecule in seconds (default: -1, unlimited)
threads (int): Number of threads to use (default: -1, max available)
Returns:
list: List of dicts, each corresponding to a compound in the SDF file
Raises:
ValueError: For invalid file extension (.sdf required)
RuntimeError: For processing failures
"""
Usage Examples:
# Process SDF file
descriptors = from_sdf('molecules.sdf')
# Only fingerprints
fingerprints = from_sdf('molecules.sdf', fingerprints=True, descriptors=False)
# Control processing time
descriptors = from_sdf('molecules.sdf', maxruntime=120, timeout=300)
Direct access to PaDEL-Descriptor's command-line interface with full parameter control for advanced use cases and batch processing.
def padeldescriptor(maxruntime: int = -1, waitingjobs: int = -1, threads: int = -1, d_2d: bool = False, d_3d: bool = False, config: str = None, convert3d: bool = False, descriptortypes: str = None, detectaromaticity: bool = False, mol_dir: str = None, d_file: str = None, fingerprints: bool = False, log: bool = False, maxcpdperfile: int = 0, removesalt: bool = False, retain3d: bool = False, retainorder: bool = True, standardizenitro: bool = False, standardizetautomers: bool = False, tautomerlist: str = None, usefilenameasmolname: bool = False, sp_timeout: int = None, headless: bool = True) -> None:
"""
Complete wrapper for PaDEL-Descriptor command-line interface.
Args:
maxruntime (int): Maximum running time per molecule in milliseconds (default: -1, unlimited)
waitingjobs (int): Maximum jobs in queue for worker threads (default: -1, 50 * max threads)
threads (int): Maximum number of threads to use (default: -1, equal to CPU cores)
d_2d (bool): Calculate 2-D descriptors (default: False)
d_3d (bool): Calculate 3-D descriptors (default: False)
config (str): Path to configuration file (optional)
convert3d (bool): Convert molecule to 3-D (default: False)
descriptortypes (str): Path to descriptor types file (optional)
detectaromaticity (bool): Auto-detect aromaticity before calculation (default: False)
mol_dir (str): Path to directory/file containing structural files
d_file (str): Path to save calculated descriptors
fingerprints (bool): Calculate fingerprints (default: False)
log (bool): Create log file (default: False)
maxcpdperfile (int): Maximum compounds per descriptor file (default: 0, unlimited)
removesalt (bool): Remove salt from molecules (default: False)
retain3d (bool): Retain 3-D coordinates when standardizing (default: False)
retainorder (bool): Retain molecule order in files (default: True)
standardizenitro (bool): Standardize nitro groups to N(:O):O (default: False)
standardizetautomers (bool): Standardize tautomers (default: False)
tautomerlist (str): Path to SMIRKS tautomers file (optional)
usefilenameasmolname (bool): Use filename as molecule name (default: False)
sp_timeout (int): Subprocess timeout in seconds (optional)
headless (bool): Prevent PaDEL splash image from loading (default: True)
Returns:
None
Raises:
ReferenceError: If Java JRE 6+ not found
RuntimeError: For PaDEL-Descriptor processing errors
"""
Usage Examples:
from padelpy import padeldescriptor
# Basic usage with MDL input
padeldescriptor(mol_dir='molecules.mdl', d_file='descriptors.csv')
# SDF input with 2D and 3D descriptors
padeldescriptor(
mol_dir='molecules.sdf',
d_file='descriptors.csv',
d_2d=True,
d_3d=True
)
# Directory of structure files
padeldescriptor(mol_dir='/path/to/molecules/', d_file='descriptors.csv')
# SMILES file input
padeldescriptor(mol_dir='molecules.smi', d_file='descriptors.csv')
# Advanced configuration
padeldescriptor(
mol_dir='molecules.sdf',
d_file='descriptors.csv',
fingerprints=True,
convert3d=True,
removesalt=True,
standardizetautomers=True,
threads=4,
maxruntime=30000, # 30 seconds per molecule
log=True
)
# Configuration file
padeldescriptor(config='/path/to/config.xml')
# Import required for return types
from collections import OrderedDict
All functions may raise exceptions for various error conditions:
Use the threads parameter to control parallel processing.
Use the timeout parameter for overall processing and maxruntime per molecule.