A Python package for creating feature transformations in applications of machine learning to materials science.
—
DScribe's kernel methods provide similarity measures between atomic structures based on local atomic environment comparisons. These kernels are particularly useful for machine learning applications where you need to measure structural similarity or build kernel-based models.
The AverageKernel computes global structural similarity as the average of local environment similarities. It provides a simple and intuitive way to measure how similar two structures are based on their local atomic environments.
class AverageKernel:
def __init__(self, metric, gamma=None, degree=3, coef0=1,
kernel_params=None, normalize_kernel=True):
"""
Initialize Average Kernel.
Parameters:
- metric (str): Distance metric for local similarities:
- "linear": Linear kernel (dot product)
- "polynomial": Polynomial kernel
- "rbf": Radial basis function (Gaussian) kernel
- "laplacian": Laplacian kernel
- "sigmoid": Sigmoid kernel
- gamma (float): Kernel coefficient for rbf, laplacian, polynomial, and sigmoid kernels
- degree (int): Degree for polynomial kernel
- coef0 (float): Independent term for polynomial and sigmoid kernels
- kernel_params (dict): Additional parameters for specific kernels
- normalize_kernel (bool): Whether to normalize the kernel matrix
"""
def create(self, x, y=None):
"""
Create kernel matrix from local descriptors.
Parameters:
- x: Local descriptors for first set of structures (list of arrays)
- y: Local descriptors for second set of structures (optional, defaults to x)
Returns:
numpy.ndarray: Kernel matrix with shape (n_structures_x, n_structures_y)
"""
def get_global_similarity(self, localkernel):
"""
Compute global similarity from local similarity matrix.
Parameters:
- localkernel: Local kernel matrix between environments
Returns:
float: Global similarity value (average of local similarities)
"""

Usage Example:
from dscribe.kernels import AverageKernel
from dscribe.descriptors import SOAP
from ase.build import molecule
# Setup SOAP descriptor for local environments
soap = SOAP(species=["H", "O"], r_cut=5.0, n_max=8, l_max=6)
# Create local descriptors for molecules
molecules = [molecule("H2O"), molecule("H2O2")]
soap_descriptors = [soap.create(mol) for mol in molecules]
# Setup Average Kernel with RBF similarity metric
kernel = AverageKernel(metric="rbf", gamma=1.0)
# Compute kernel matrix
K = kernel.create(soap_descriptors) # Shape: (2, 2)
print(f"Self-similarity: {K[0,0]}")
print(f"Cross-similarity: {K[0,1]}")
# Compare against different molecules
# Note: the SOAP species list above (["H", "O"]) must cover every element in the structures
other_molecules = [molecule("H2"), molecule("O2")]
other_descriptors = [soap.create(mol) for mol in other_molecules]
K_cross = kernel.create(soap_descriptors, other_descriptors) # Shape: (2, 2)

The REMatchKernel (Regularized-Entropy Match Kernel) uses optimal transport theory to find the best matching between the local environments of two structures. This provides a more sophisticated similarity measure that accounts for the optimal assignment of local environments.
class REMatchKernel:
def __init__(self, alpha=0.1, threshold=1e-6, metric="linear", gamma=None,
degree=3, coef0=1, kernel_params=None, normalize_kernel=True):
"""
Initialize REMatch Kernel.
Parameters:
- alpha (float): Entropy regularization parameter (controls transport cost)
- threshold (float): Convergence threshold for Sinkhorn algorithm
- metric (str): Distance metric for local similarities:
- "linear": Linear kernel (dot product)
- "polynomial": Polynomial kernel
- "rbf": Radial basis function (Gaussian) kernel
- "laplacian": Laplacian kernel
- "sigmoid": Sigmoid kernel
- gamma (float): Kernel coefficient for rbf, laplacian, polynomial, and sigmoid kernels
- degree (int): Degree for polynomial kernel
- coef0 (float): Independent term for polynomial and sigmoid kernels
- kernel_params (dict): Additional parameters for specific kernels
- normalize_kernel (bool): Whether to normalize the kernel matrix
"""
def create(self, x, y=None):
"""
Create REMatch kernel matrix from local descriptors.
Parameters:
- x: Local descriptors for first set of structures (list of arrays)
- y: Local descriptors for second set of structures (optional, defaults to x)
Returns:
numpy.ndarray: REMatch kernel matrix with shape (n_structures_x, n_structures_y)
"""
def get_global_similarity(self, localkernel):
"""
Compute global similarity using optimal transport matching.
Parameters:
- localkernel: Local kernel matrix between environments
Returns:
float: Global similarity value from optimal transport solution
"""

Usage Example:
from dscribe.kernels import REMatchKernel
from dscribe.descriptors import SOAP
from ase.build import molecule
# Setup SOAP descriptor
soap = SOAP(species=["H", "O"], r_cut=5.0, n_max=8, l_max=6)
# Create local descriptors
molecules = [molecule("H2O"), molecule("H2O2")]
soap_descriptors = [soap.create(mol) for mol in molecules]
# Setup REMatch Kernel with custom parameters
rematch = REMatchKernel(
metric="rbf",
gamma=1.0,
alpha=0.1, # Lower alpha = less entropy regularization (closer to a best-match assignment)
threshold=1e-8 # Higher precision convergence
)
# Compute REMatch kernel matrix
K_rematch = rematch.create(soap_descriptors) # Shape: (2, 2)
print(f"REMatch similarity: {K_rematch[0,1]}")
# Compare with different alpha values
rematch_low_reg = REMatchKernel(metric="rbf", gamma=1.0, alpha=0.01)
rematch_high_reg = REMatchKernel(metric="rbf", gamma=1.0, alpha=1.0)
K_low = rematch_low_reg.create(soap_descriptors)
K_high = rematch_high_reg.create(soap_descriptors)

Both kernels build on the concept of local atomic environment similarity:
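AverageKernel: the global similarity is the average of the local environment similarities.
REMatchKernel: the global similarity comes from an entropy-regularized optimal-transport matching between the local environments.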
All kernels support various similarity metrics:
# Linear kernel (fastest)
kernel = AverageKernel(metric="linear")
# RBF (Gaussian) kernel - most common
kernel = AverageKernel(metric="rbf", gamma=1.0)
# Polynomial kernel
kernel = AverageKernel(metric="polynomial", degree=3, gamma=1.0, coef0=1.0)
# Laplacian kernel
kernel = AverageKernel(metric="laplacian", gamma=1.0)
# Sigmoid kernel
kernel = AverageKernel(metric="sigmoid", gamma=1.0, coef0=1.0)

The kernel matrix can be passed to scikit-learn as a precomputed kernel:

from sklearn.svm import SVC
from dscribe.kernels import AverageKernel
from dscribe.descriptors import SOAP
# Prepare data
soap = SOAP(species=["C", "H", "O"], r_cut=5.0, n_max=8, l_max=6)
structures = [...] # List of ASE Atoms objects
labels = [...] # Target labels
# Compute local descriptors
local_descriptors = [soap.create(struct) for struct in structures]
# Compute kernel matrix
kernel = AverageKernel(metric="rbf", gamma=1.0)
K_train = kernel.create(local_descriptors)
# Use precomputed kernel in scikit-learn
svm = SVC(kernel="precomputed")
svm.fit(K_train, labels)
# Predict on new data
new_descriptors = [soap.create(new_struct) for new_struct in test_structures]
K_test = kernel.create(new_descriptors, local_descriptors)
predictions = svm.predict(K_test)

# Compute pairwise similarities
similarities = kernel.create(local_descriptors)
# Find most similar structures
import numpy as np
most_similar_pairs = np.unravel_index(
np.argsort(similarities.ravel())[-10:],
similarities.shape
)
# Cluster structures based on kernel similarities
from sklearn.cluster import SpectralClustering
clustering = SpectralClustering(
n_clusters=3,
affinity="precomputed",
random_state=42
)
cluster_labels = clustering.fit_predict(similarities)

Choose AverageKernel for large datasets or when computational efficiency is critical. Use REMatchKernel when maximum accuracy is needed and computational resources are available.
Install with Tessl CLI
npx tessl i tessl/pypi-dscribe