tessl/pypi-pysolr

Lightweight Python client for Apache Solr

—

Pending

Overview

Eval results

Files

SolrCloud Support

Name: tessl/pypi-pysolr
Author: tessl

SolrCloud cluster support with ZooKeeper coordination, automatic failover, leader detection, and distributed query handling across multiple Solr nodes for high availability and scalability.

Capabilities

SolrCloud Client

SolrCloud-aware client that extends the standard Solr client with cluster coordination capabilities.

class SolrCloud(Solr):
    """SolrCloud-aware client that extends the standard Solr client with cluster coordination."""

    def __init__(self, zookeeper, collection, decoder=None, encoder=None, timeout=60, 
                 retry_count=5, retry_timeout=0.2, auth=None, verify=True, *args, **kwargs):
        """
        Initialize a SolrCloud client with ZooKeeper coordination.

        Parameters:
        - zookeeper (ZooKeeper): ZooKeeper client instance for cluster coordination
        - collection (str): SolrCloud collection name to work with
        - decoder (json.JSONDecoder, optional): Custom JSON decoder instance
        - encoder (json.JSONEncoder, optional): Custom JSON encoder instance
        - timeout (int): Request timeout in seconds (default: 60)
        - retry_count (int): Number of retry attempts for failed requests (default: 5)
        - retry_timeout (float): Delay between retry attempts in seconds (default: 0.2)
        - auth (tuple or requests auth object, optional): HTTP authentication
        - verify (bool): Enable SSL certificate verification (default: True)
        - *args, **kwargs: Additional arguments passed to parent Solr class
          (for example `always_commit=True`)

        Raises:
        - SolrError: If ZooKeeper connection or collection access fails
        """

Usage:

import pysolr

# Create ZooKeeper client
zk = pysolr.ZooKeeper('localhost:2181')

# Create SolrCloud client; timeout and retry_count override the documented
# defaults (60 seconds, 5 attempts), retry_timeout keeps its default of 0.2
solr_cloud = pysolr.SolrCloud(
    zookeeper=zk,
    collection='my_collection',
    timeout=30,
    retry_count=3,
    auth=('username', 'password')
)

# Use like regular Solr client with automatic failover
results = solr_cloud.search('*:*')
# Documents are passed to add() as a list of dicts
solr_cloud.add([{'id': 'doc1', 'title': 'Test Document'}])

ZooKeeper Coordination

ZooKeeper client for managing cluster state, node discovery, and leader election in SolrCloud deployments.

class ZooKeeper:
    """ZooKeeper client for managing cluster state, node discovery, and leader election in SolrCloud deployments."""

    def __init__(self, zkServerAddress, timeout=15, max_retries=-1, kazoo_client=None):
        """
        Initialize ZooKeeper client for SolrCloud coordination.

        Parameters:
        - zkServerAddress (str): ZooKeeper server address, either a single
          'host:port' (e.g., 'localhost:2181') or a comma-separated list
          (e.g., 'zk1:2181,zk2:2181,zk3:2181')
        - timeout (int): Connection timeout in seconds (default: 15)
        - max_retries (int): Maximum retry attempts (-1 for unlimited) (default: -1)
        - kazoo_client (KazooClient, optional): Custom Kazoo client instance

        Raises:
        - RuntimeError: If kazoo library is not installed
        """

    def getHosts(self, collname, only_leader=False, seen_aliases=None):
        """
        Get list of active Solr hosts for a collection.

        Parameters:
        - collname (str): Collection name or alias
        - only_leader (bool): Return only leader nodes (default: False)
        - seen_aliases (list, optional): Track aliases to prevent circular references

        Returns:
        - list: List of active Solr base URLs for the collection

        Raises:
        - SolrError: If collection is unknown or no active hosts found
        """

    def getRandomURL(self, collname, only_leader=False):
        """
        Get a random active Solr URL for load balancing.

        Parameters:
        - collname (str): Collection name or alias
        - only_leader (bool): Return only leader nodes (default: False)

        Returns:
        - str: Complete Solr URL including collection path

        Raises:
        - SolrError: If no active shards are available
        """

    def getLeaderURL(self, collname):
        """
        Get a leader node URL for update operations.

        NOTE(review): presumably the same selection as
        getRandomURL(collname, only_leader=True) - confirm against the
        implementation.

        Parameters:
        - collname (str): Collection name or alias

        Returns:
        - str: Complete Solr URL for a leader node

        Raises:
        - SolrError: If no leader nodes are available
        """

Usage:

import pysolr

# Initialize ZooKeeper client (comma-separated list of ZooKeeper servers)
zk = pysolr.ZooKeeper('zk1:2181,zk2:2181,zk3:2181')

# Get all active hosts for a collection (a list of Solr base URLs)
hosts = zk.getHosts('my_collection')
print(f"Active hosts: {hosts}")

# Get only leader hosts (for updates)
leaders = zk.getHosts('my_collection', only_leader=True)
print(f"Leader hosts: {leaders}")

# Get random URL for load balancing (a complete URL including the collection path)
random_url = zk.getRandomURL('my_collection')
print(f"Random URL: {random_url}")

# Get leader URL for updates (a complete URL for a leader node)
leader_url = zk.getLeaderURL('my_collection')
print(f"Leader URL: {leader_url}")

Complete SolrCloud Setup Example

import pysolr

# Step 1: Initialize ZooKeeper client
print("Connecting to ZooKeeper...")
zk = pysolr.ZooKeeper(
    zkServerAddress='zk1:2181,zk2:2181,zk3:2181',
    timeout=30,
    max_retries=5
)

try:
    # Step 2: Create SolrCloud client.
    # Done inside the try block because the SolrCloud constructor is
    # documented to raise SolrError when the ZooKeeper connection or the
    # collection lookup fails; that failure is then reported like any other
    # cluster problem instead of escaping as an unhandled exception.
    print("Creating SolrCloud client...")
    solr_cloud = pysolr.SolrCloud(
        zookeeper=zk,
        collection='my_distributed_collection',
        timeout=60,
        retry_count=3,
        retry_timeout=1.0,
        auth=('solr_user', 'solr_password'),
        always_commit=True  # forwarded to the parent Solr class via **kwargs
    )

    # Step 3: Test connectivity (the ping response itself is not needed)
    print("Testing SolrCloud connectivity...")
    solr_cloud.ping()
    print("SolrCloud is healthy")

    # Step 4: Index documents (automatically routed to leader)
    print("Indexing documents...")
    docs = [
        {'id': 'doc1', 'title': 'First Document', 'content': 'Content for first document'},
        {'id': 'doc2', 'title': 'Second Document', 'content': 'Content for second document'},
        {'id': 'doc3', 'title': 'Third Document', 'content': 'Content for third document'}
    ]
    solr_cloud.add(docs)
    print(f"Indexed {len(docs)} documents")

    # Step 5: Search across cluster (load balanced)
    print("Searching documents...")
    results = solr_cloud.search('*:*', rows=100)
    print(f"Found {results.hits} total documents")

    # Step 6: Search with distributed faceting
    print("Searching with facets...")
    results = solr_cloud.search(
        '*:*',
        facet=True,
        facet_field='title',
        facet_mincount=1
    )
    print(f"Facet results: {results.facets}")

    # Step 7: Demonstrate failover by getting multiple URLs
    print("Available cluster nodes:")
    hosts = zk.getHosts('my_distributed_collection')
    for i, host in enumerate(hosts):
        print(f"  Node {i+1}: {host}")

except pysolr.SolrError as e:
    print(f"SolrCloud operation failed: {e}")
    print("This may indicate cluster issues or network problems")

finally:
    # Cleanup is handled automatically by the clients
    print("SolrCloud operations completed")

High Availability Patterns

Automatic Failover

SolrCloud clients automatically handle node failures and retry operations:

import pysolr

zk = pysolr.ZooKeeper('zk1:2181,zk2:2181,zk3:2181')

# retry_count stays at its documented default of 5; retry_timeout raises the
# delay between attempts from the default 0.2 seconds to 2 seconds
solr_cloud = pysolr.SolrCloud(
    zookeeper=zk,
    collection='ha_collection',
    retry_count=5,
    retry_timeout=2.0
)

try:
    # This will automatically retry on different nodes if one fails
    results = solr_cloud.search('important_query')

    # Updates will automatically find and use leader nodes.
    # The document is wrapped in a list, matching the other add() examples
    # on this page, so the call does not depend on single-dict handling.
    solr_cloud.add([{'id': 'critical_doc', 'data': 'important_data'}])

except pysolr.SolrError as e:
    print(f"All nodes failed after retries: {e}")

Load Balanced Queries

Distribute read queries across available replicas:

import random
import pysolr

zk = pysolr.ZooKeeper('zk1:2181,zk2:2181,zk3:2181')

# Get available hosts for manual load balancing
hosts = zk.getHosts('my_collection')
print(f"Load balancing across {len(hosts)} nodes")

# Create one plain Solr client per active host; these are used for reads only
read_clients = [
    pysolr.Solr(f"{host}/my_collection", timeout=30)
    for host in hosts
]

# Use SolrCloud client for updates (handles leader detection)
update_client = pysolr.SolrCloud(zk, 'my_collection')

# Distribute read queries
for i in range(10):
    client = random.choice(read_clients)
    results = client.search(f'query_{i}')
    print(f"Query {i} executed on {client.url}")

# All updates go through SolrCloud client.
# The documents are built with their own index instead of reusing the loop
# variable left over from the read loop (which would index only 'doc_9'),
# and are sent as one list in a single add() call.
update_client.add([
    {'id': f'doc_{i}', 'content': f'Document {i}'}
    for i in range(10)
])

Error Handling and Monitoring

import pysolr
import time

# Module-level clients used by the monitoring function and loop that follow;
# SolrCloud is created with its default timeout and retry settings
zk = pysolr.ZooKeeper('zk1:2181,zk2:2181,zk3:2181')
solr_cloud = pysolr.SolrCloud(zk, 'monitored_collection')

def monitor_cluster_health():
    """
    Report whether 'monitored_collection' has at least one responsive node.

    Asks the module-level ZooKeeper client for the collection's hosts, pings
    each host through a plain Solr client, and prints a warning per failing
    node followed by a summary line.

    Returns:
    bool: True when at least one host answered the ping. False when no hosts
    were returned, when no host answered, or when any other exception was
    raised while checking (the exception is printed, not propagated).
    """
    try:
        # Check ZooKeeper connectivity
        hosts = zk.getHosts('monitored_collection')
        if not hosts:
            print("WARNING: No active hosts found")
            return False

        # Check individual node health; only a successful ping is counted
        healthy_nodes = 0
        for host in hosts:
            try:
                pysolr.Solr(f"{host}/monitored_collection").ping()
            except pysolr.SolrError:
                print(f"WARNING: Node {host} is unhealthy")
            else:
                healthy_nodes += 1

        print(f"Cluster health: {healthy_nodes}/{len(hosts)} nodes healthy")
        return healthy_nodes >= 1

    except Exception as e:
        # Any other failure is reported as "unhealthy" rather than raised
        print(f"Cluster monitoring failed: {e}")
        return False

# Monitor cluster periodically.
# This loop has no exit condition: it runs until the process is interrupted.
while True:
    if monitor_cluster_health():
        try:
            # Perform operations when cluster is healthy
            results = solr_cloud.search('*:*', rows=0)  # Count query
            print(f"Total documents in cluster: {results.hits}")
        except pysolr.SolrError as e:
            # A failed query is reported and the loop carries on to the next cycle
            print(f"Cluster operation failed: {e}")
    else:
        print("Cluster is unhealthy, skipping operations")

    time.sleep(60)  # Check every minute

Dependencies

SolrCloud functionality requires the kazoo library:

# Install with SolrCloud support
# (quoted so shells that glob square brackets, such as zsh, pass the extra through)
pip install "pysolr[solrcloud]"

# Or install kazoo separately
# (quoted so the shell does not treat '>' as output redirection to a file named '=2.5.0')
pip install "kazoo>=2.5.0"

# Check for SolrCloud support.
# ZooKeeper() is documented to raise RuntimeError when kazoo is not installed,
# so that is the only exception handled here.
# NOTE(review): constructing the client presumably also tries to reach
# localhost:2181; a connection failure would not be a RuntimeError and would
# propagate from this snippet - confirm before relying on it as a pure
# dependency check.
try:
    import pysolr
    zk = pysolr.ZooKeeper('localhost:2181')
    print("SolrCloud support is available")
except RuntimeError:
    print("SolrCloud support requires 'kazoo' library")
    print("Install with: pip install pysolr[solrcloud]")

Install with Tessl CLI