Lightweight Python client for Apache Solr
—
SolrCloud cluster support with ZooKeeper coordination, automatic failover, leader detection, and distributed query handling across multiple Solr nodes for high availability and scalability.
SolrCloud-aware client that extends the standard Solr client with cluster coordination capabilities.
class SolrCloud(Solr):
def __init__(self, zookeeper, collection, decoder=None, encoder=None, timeout=60,
retry_count=5, retry_timeout=0.2, auth=None, verify=True, *args, **kwargs):
"""
Initialize a SolrCloud client with ZooKeeper coordination.
Parameters:
- zookeeper (ZooKeeper): ZooKeeper client instance for cluster coordination
- collection (str): SolrCloud collection name to work with
- decoder (json.JSONDecoder, optional): Custom JSON decoder instance
- encoder (json.JSONEncoder, optional): Custom JSON encoder instance
- timeout (int): Request timeout in seconds (default: 60)
- retry_count (int): Number of retry attempts for failed requests (default: 5)
- retry_timeout (float): Delay between retry attempts in seconds (default: 0.2)
- auth (tuple or requests auth object, optional): HTTP authentication
- verify (bool): Enable SSL certificate verification (default: True)
- *args, **kwargs: Additional arguments passed to parent Solr class
Raises:
SolrError: If ZooKeeper connection or collection access fails
"""Usage:
import pysolr
# Create ZooKeeper client
zk = pysolr.ZooKeeper('localhost:2181')
# Create SolrCloud client
solr_cloud = pysolr.SolrCloud(
zookeeper=zk,
collection='my_collection',
timeout=30,
retry_count=3,
auth=('username', 'password')
)
# Use like regular Solr client with automatic failover
results = solr_cloud.search('*:*')
solr_cloud.add([{'id': 'doc1', 'title': 'Test Document'}])

ZooKeeper client for managing cluster state, node discovery, and leader election in SolrCloud deployments.
class ZooKeeper:
def __init__(self, zkServerAddress, timeout=15, max_retries=-1, kazoo_client=None):
"""
Initialize ZooKeeper client for SolrCloud coordination.
Parameters:
- zkServerAddress (str): ZooKeeper server address (e.g., 'localhost:2181' or 'zk1:2181,zk2:2181,zk3:2181')
- timeout (int): Connection timeout in seconds (default: 15)
- max_retries (int): Maximum retry attempts (-1 for unlimited) (default: -1)
- kazoo_client (KazooClient, optional): Custom Kazoo client instance
Raises:
RuntimeError: If kazoo library is not installed
"""
def getHosts(self, collname, only_leader=False, seen_aliases=None):
"""
Get list of active Solr hosts for a collection.
Parameters:
- collname (str): Collection name or alias
- only_leader (bool): Return only leader nodes (default: False)
- seen_aliases (list, optional): Track aliases to prevent circular references
Returns:
list: List of active Solr base URLs for the collection
Raises:
SolrError: If collection is unknown or no active hosts found
"""
def getRandomURL(self, collname, only_leader=False):
"""
Get a random active Solr URL for load balancing.
Parameters:
- collname (str): Collection name or alias
- only_leader (bool): Return only leader nodes (default: False)
Returns:
str: Complete Solr URL including collection path
Raises:
SolrError: If no active shards are available
"""
def getLeaderURL(self, collname):
"""
Get a leader node URL for update operations.
Parameters:
- collname (str): Collection name or alias
Returns:
str: Complete Solr URL for a leader node
Raises:
SolrError: If no leader nodes are available
"""Usage:
import pysolr
# Initialize ZooKeeper client
zk = pysolr.ZooKeeper('zk1:2181,zk2:2181,zk3:2181')
# Get all active hosts for a collection
hosts = zk.getHosts('my_collection')
print(f"Active hosts: {hosts}")
# Get only leader hosts (for updates)
leaders = zk.getHosts('my_collection', only_leader=True)
print(f"Leader hosts: {leaders}")
# Get random URL for load balancing
random_url = zk.getRandomURL('my_collection')
print(f"Random URL: {random_url}")
# Get leader URL for updates
leader_url = zk.getLeaderURL('my_collection')
print(f"Leader URL: {leader_url}")import pysolr
# Step 1: Initialize ZooKeeper client
print("Connecting to ZooKeeper...")
zk = pysolr.ZooKeeper(
zkServerAddress='zk1:2181,zk2:2181,zk3:2181',
timeout=30,
max_retries=5
)
# Step 2: Create SolrCloud client
print("Creating SolrCloud client...")
solr_cloud = pysolr.SolrCloud(
zookeeper=zk,
collection='my_distributed_collection',
timeout=60,
retry_count=3,
retry_timeout=1.0,
auth=('solr_user', 'solr_password'),
always_commit=True
)
try:
# Step 3: Test connectivity
print("Testing SolrCloud connectivity...")
response = solr_cloud.ping()
print("SolrCloud is healthy")
# Step 4: Index documents (automatically routed to leader)
print("Indexing documents...")
docs = [
{'id': 'doc1', 'title': 'First Document', 'content': 'Content for first document'},
{'id': 'doc2', 'title': 'Second Document', 'content': 'Content for second document'},
{'id': 'doc3', 'title': 'Third Document', 'content': 'Content for third document'}
]
solr_cloud.add(docs)
print(f"Indexed {len(docs)} documents")
# Step 5: Search across cluster (load balanced)
print("Searching documents...")
results = solr_cloud.search('*:*', rows=100)
print(f"Found {results.hits} total documents")
# Step 6: Search with distributed faceting
print("Searching with facets...")
results = solr_cloud.search(
'*:*',
facet=True,
facet_field='title',
facet_mincount=1
)
print(f"Facet results: {results.facets}")
# Step 7: Demonstrate failover by getting multiple URLs
print("Available cluster nodes:")
hosts = zk.getHosts('my_distributed_collection')
for i, host in enumerate(hosts):
print(f" Node {i+1}: {host}")
except pysolr.SolrError as e:
print(f"SolrCloud operation failed: {e}")
print("This may indicate cluster issues or network problems")
finally:
# Cleanup is handled automatically by the clients
print("SolrCloud operations completed")SolrCloud clients automatically handle node failures and retry operations:
import pysolr
zk = pysolr.ZooKeeper('zk1:2181,zk2:2181,zk3:2181')
solr_cloud = pysolr.SolrCloud(
zookeeper=zk,
collection='ha_collection',
retry_count=5,
retry_timeout=2.0
)
try:
# This will automatically retry on different nodes if one fails
results = solr_cloud.search('important_query')
# Updates will automatically find and use leader nodes
solr_cloud.add({'id': 'critical_doc', 'data': 'important_data'})
except pysolr.SolrError as e:
print(f"All nodes failed after retries: {e}")Distribute read queries across available replicas:
import random
import pysolr
zk = pysolr.ZooKeeper('zk1:2181,zk2:2181,zk3:2181')
# Get available hosts for manual load balancing
hosts = zk.getHosts('my_collection')
print(f"Load balancing across {len(hosts)} nodes")
# Create multiple clients for different purposes
read_clients = []
for host in hosts:
client = pysolr.Solr(f"{host}/my_collection", timeout=30)
read_clients.append(client)
# Use SolrCloud client for updates (handles leader detection)
update_client = pysolr.SolrCloud(zk, 'my_collection')
# Distribute read queries
for i in range(10):
client = random.choice(read_clients)
results = client.search(f'query_{i}')
print(f"Query {i} executed on {client.url}")
# All updates go through SolrCloud client
update_client.add({'id': f'doc_{i}', 'content': f'Document {i}'})

import pysolr
import time
zk = pysolr.ZooKeeper('zk1:2181,zk2:2181,zk3:2181')
solr_cloud = pysolr.SolrCloud(zk, 'monitored_collection')
def monitor_cluster_health():
"""Monitor SolrCloud cluster health."""
try:
# Check ZooKeeper connectivity
hosts = zk.getHosts('monitored_collection')
if not hosts:
print("WARNING: No active hosts found")
return False
# Check individual node health
healthy_nodes = 0
for host in hosts:
try:
client = pysolr.Solr(f"{host}/monitored_collection")
client.ping()
healthy_nodes += 1
except pysolr.SolrError:
print(f"WARNING: Node {host} is unhealthy")
print(f"Cluster health: {healthy_nodes}/{len(hosts)} nodes healthy")
return healthy_nodes > 0
except Exception as e:
print(f"Cluster monitoring failed: {e}")
return False
# Monitor cluster periodically
while True:
if monitor_cluster_health():
try:
# Perform operations when cluster is healthy
results = solr_cloud.search('*:*', rows=0) # Count query
print(f"Total documents in cluster: {results.hits}")
except pysolr.SolrError as e:
print(f"Cluster operation failed: {e}")
else:
print("Cluster is unhealthy, skipping operations")
time.sleep(60) # Check every minute

SolrCloud functionality requires the kazoo library:
# Install with SolrCloud support
pip install pysolr[solrcloud]
# Or install kazoo separately
pip install kazoo>=2.5.0

# Check for SolrCloud support
try:
import pysolr
zk = pysolr.ZooKeeper('localhost:2181')
print("SolrCloud support is available")
except RuntimeError:
print("SolrCloud support requires 'kazoo' library")
print("Install with: pip install pysolr[solrcloud]")Install with Tessl CLI
npx tessl i tessl/pypi-pysolr