CtrlK
Blog · Docs · Log in · Get started
Tessl Logo

tessl/pypi-pysolr

Lightweight Python client for Apache Solr

Pending
Overview
Eval results
Files

docs/utilities.md

Utility Functions

Helper functions for data conversion, text processing, URL encoding, and XML sanitization used throughout the pysolr library and available for custom processing needs.

Capabilities

Version Information

Get the current version of the pysolr library.

def get_version():
    """
    Get the current pysolr library version.

    Returns:
    str: Version string (e.g., "3.10.0")
    """

Usage:

import pysolr

version = pysolr.get_version()
print(f"PySOLR version: {version}")

# Use in application logging or debugging
print(f"Using pysolr {version} to connect to Solr")

Python Version Detection

Detect Python version for cross-platform compatibility.

def is_py3():
    """
    Check if running on Python 3.x.

    Returns:
    bool: True if Python 3.x, False if Python 2.x

    Note:
    - Used internally for handling differences between Python 2 and 3
    - Helps with string/bytes handling and compatibility
    """

Usage:

import pysolr

if pysolr.is_py3():
    print("Running on Python 3.x")
    # Python 3-specific logic
else:
    print("Running on Python 2.x") 
    # Python 2-specific logic

String Encoding Utilities

Convert between Unicode strings and byte strings for cross-platform compatibility.

def force_unicode(value):
    """
    Convert bytestrings to Unicode strings with error handling.

    Parameters:
    - value: Input value (bytes, str, or other type)

    Returns:
    str: Unicode string representation

    Note:
    - On Python 3: Decodes bytes to str, converts other types to str
    - On Python 2: Decodes str to unicode, converts other types to unicode
    - Uses UTF-8 encoding with 'replace' error handling
    """

def force_bytes(value):
    """
    Convert Unicode strings to bytestrings for HTTP transmission.

    Parameters:
    - value: Input value (str, unicode, or other type)

    Returns:
    bytes (Python 3) or str (Python 2): Byte string representation

    Note:
    - Uses UTF-8 encoding with appropriate error handling
    - Required for HTTP request bodies and XML processing
    """

Usage:

import pysolr

# Convert various types to Unicode
text_bytes = b"Hello, World! \xe2\x9c\x93"  # UTF-8 bytes with checkmark
unicode_text = pysolr.force_unicode(text_bytes)
print(f"Unicode: {unicode_text}")  # "Hello, World! ✓"

# Convert for HTTP transmission
unicode_string = "Café with special chars: áéíóú"
byte_string = pysolr.force_bytes(unicode_string)
print(f"Bytes: {byte_string}")

# Handle various input types
number_as_unicode = pysolr.force_unicode(12345)
print(f"Number as Unicode: {number_as_unicode}")  # "12345"

# Error handling with malformed data
malformed_bytes = b"\xff\xfe\x00\x41"  # Invalid UTF-8
safe_unicode = pysolr.force_unicode(malformed_bytes)
print(f"Safe conversion: {safe_unicode}")  # Uses replacement characters

HTML/XML Processing

Clean and process HTML/XML content for safe indexing and display.

def unescape_html(text):
    """
    Remove HTML or XML character references and entities from text.

    Parameters:
    - text (str): HTML or XML source text containing entities

    Returns:
    str: Plain text with entities converted to Unicode characters

    Note:
    - Handles both numeric (&#123;, &#x7B;) and named (&amp;, &lt;) entities
    - Useful for processing HTML content before indexing
    """

def clean_xml_string(s):
    """
    Remove invalid XML characters from string.

    Parameters:
    - s (str): String to clean

    Returns:
    str: String with invalid XML characters removed

    Note:
    - Removes control characters that would cause XML parsing errors
    - Applied automatically during document indexing
    """

Usage:

import pysolr

# Clean HTML entities
html_content = "Price: &pound;25.99 &amp; free shipping! Rating: 5&#9733;"
clean_content = pysolr.unescape_html(html_content)
print(f"Cleaned: {clean_content}")  # "Price: £25.99 & free shipping! Rating: 5★"

# Remove invalid XML characters
xml_content = "Valid text\x08\x0bInvalid control chars\x1f\x00More text"
clean_xml = pysolr.clean_xml_string(xml_content)
print(f"Clean XML: {clean_xml}")  # "Valid textInvalid control charsMore text"

# Process scraped web content
scraped_html = """
<div class="article">
    <h1>Article Title</h1>
    <p>Content with &quot;quotes&quot; and &lt;tags&gt;</p>
</div>
"""
readable_text = pysolr.unescape_html(scraped_html)
print(f"Readable: {readable_text}")

URL Encoding

Safe URL encoding for HTTP parameters with UTF-8 support.

def safe_urlencode(params, doseq=0):
    """
    UTF-8-safe version of URL encoding.

    Parameters:
    - params (dict or list of tuples): Parameters to encode
    - doseq (int): Handle sequence values (0=single value, 1=multiple values)

    Returns:
    str: URL-encoded parameter string

    Note:
    - Fixes UTF-8 encoding issues in Python 2.x
    - Used internally for Solr HTTP requests
    - Handles both single and multi-valued parameters
    """

Usage:

import pysolr

# Basic parameter encoding
params = {
    'q': 'title:python AND content:"machine learning"',
    'fq': 'category:programming',
    'rows': 20,
    'start': 0
}
encoded = pysolr.safe_urlencode(params)
print(f"Encoded: {encoded}")

# Multi-valued parameters
multi_params = {
    'fq': ['category:tech', 'status:published', 'date:[2024-01-01T00:00:00Z TO NOW]'],
    'fl': ['id', 'title', 'content', 'score']
}
encoded_multi = pysolr.safe_urlencode(multi_params, doseq=1)
print(f"Multi-valued: {encoded_multi}")

# UTF-8 content (especially important for Python 2.x)
utf8_params = {
    'q': 'title:café OR content:naïve',
    'fq': 'author:"José García"'
}
encoded_utf8 = pysolr.safe_urlencode(utf8_params)
print(f"UTF-8 safe: {encoded_utf8}")

Data Sanitization

Clean data for safe XML processing and indexing.

def sanitize(data):
    """
    Remove control characters from data for safe XML processing.

    Parameters:
    - data (str or bytes): Data to sanitize

    Returns:
    str: Sanitized Unicode string safe for XML processing

    Note:
    - Removes ASCII control characters (0x00-0x1F except tab, newline, carriage return)
    - Applied automatically during document indexing unless disabled
    - Essential for processing binary data or untrusted input
    """

Usage:

import pysolr

# Sanitize text with control characters
dirty_text = "Clean text\x00\x01\x02\x08Bad control chars\x0b\x0c\x0e\x1fMore text"
clean_text = pysolr.sanitize(dirty_text)
print(f"Sanitized: {repr(clean_text)}")  # Control chars removed

# Process file content
with open('potentially_dirty_file.txt', 'rb') as f:
    file_content = f.read()
    safe_content = pysolr.sanitize(file_content)
    
    # Now safe to index
    doc = {
        'id': 'file_doc',
        'content': safe_content,
        'filename': 'potentially_dirty_file.txt'
    }

# Disable automatic sanitization if needed
solr = pysolr.Solr('http://localhost:8983/solr/my_core')
solr.add(
    {'id': 'raw_doc', 'content': 'Raw content'},
    clean_ctrl_chars=False  # Skip automatic sanitization
)

Advanced Usage Patterns

Custom Data Processing Pipeline

Combine utility functions for comprehensive data processing:

import datetime

import pysolr

def process_web_content(html_content, document_id):
    """
    Complete pipeline for processing web content for Solr indexing.
    
    Parameters:
    - html_content (str): Raw HTML content
    - document_id (str): Unique document identifier
    
    Returns:
    dict: Processed document ready for indexing
    """
    
    # Step 1: Convert to Unicode if needed
    unicode_content = pysolr.force_unicode(html_content)
    
    # Step 2: Unescape HTML entities
    unescaped_content = pysolr.unescape_html(unicode_content)
    
    # Step 3: Clean invalid XML characters
    clean_content = pysolr.clean_xml_string(unescaped_content)
    
    # Step 4: Sanitize control characters
    safe_content = pysolr.sanitize(clean_content)
    
    # Step 5: Create document
    document = {
        'id': document_id,
        'content': safe_content,
        'content_length': len(safe_content),
        'processed_timestamp': pysolr.force_unicode(str(datetime.datetime.now()))
    }
    
    return document

# Usage example
raw_html = """
<article>
    <h1>Café Review</h1>
    <p>Great coffee with a rating of 5★</p>
    \x08\x0bSome bad control characters\x1f
</article>
"""

processed_doc = process_web_content(raw_html, 'cafe_review_1')
print(f"Processed document: {processed_doc}")

# Index the processed document
solr = pysolr.Solr('http://localhost:8983/solr/my_core')
solr.add(processed_doc)

Error-Safe Utility Usage

Handle edge cases and errors gracefully:

import pysolr

def safe_process_data(data):
    """
    Safely process data with error handling.
    
    Parameters:
    - data: Input data of unknown type/encoding
    
    Returns:
    str: Safely processed string or empty string on error
    """
    
    try:
        # Try to convert to Unicode
        unicode_data = pysolr.force_unicode(data)
        
        # Clean HTML if it looks like HTML
        if '<' in unicode_data and '>' in unicode_data:
            unicode_data = pysolr.unescape_html(unicode_data)
        
        # Always clean XML and sanitize
        clean_data = pysolr.clean_xml_string(unicode_data)
        safe_data = pysolr.sanitize(clean_data)
        
        return safe_data
        
    except Exception as e:
        print(f"Data processing error: {e}")
        return ""

# Test with various problematic inputs
test_inputs = [
    b'\xff\xfe\x00\x41',  # Invalid UTF-8
    "Valid &amp; clean text",  # HTML entities
    "Text\x00with\x08bad\x1fchars",  # Control characters
    12345,  # Non-string type
    None,   # None value
]

for i, test_input in enumerate(test_inputs):
    result = safe_process_data(test_input)
    print(f"Input {i}: {repr(test_input)} -> {repr(result)}")

Performance Optimization

Use utility functions efficiently for large-scale processing:

import pysolr

def bulk_sanitize_documents(documents):
    """
    Efficiently sanitize a large number of documents.
    
    Parameters:
    - documents (list): List of document dictionaries
    
    Returns:
    list: List of sanitized documents
    """
    
    sanitized_docs = []
    
    for doc in documents:
        sanitized_doc = {'id': doc['id']}  # Preserve ID
        
        for field, value in doc.items():
            if field == 'id':
                continue
                
            if isinstance(value, (str, bytes)):
                # Process string/bytes fields
                unicode_value = pysolr.force_unicode(value)
                clean_value = pysolr.sanitize(unicode_value)
                sanitized_doc[field] = clean_value
                
            elif isinstance(value, list):
                # Process multi-valued fields
                clean_values = []
                for item in value:
                    if isinstance(item, (str, bytes)):
                        unicode_item = pysolr.force_unicode(item)
                        clean_item = pysolr.sanitize(unicode_item)
                        clean_values.append(clean_item)
                    else:
                        clean_values.append(item)
                sanitized_doc[field] = clean_values
                
            else:
                # Preserve non-string fields as-is
                sanitized_doc[field] = value
        
        sanitized_docs.append(sanitized_doc)
    
    return sanitized_docs

# Example usage with large dataset
large_dataset = []
for i in range(1000):
    doc = {
        'id': f'doc_{i}',
        'title': f'Document {i} with "special" chars',
        'content': f'Content\x08with\x1fbad\x00chars for doc {i}',
        'tags': ['tag1', 'tag2\x0b', 'tag3'],
        'score': i * 0.1
    }
    large_dataset.append(doc)

print("Sanitizing large dataset...")
clean_dataset = bulk_sanitize_documents(large_dataset)
print(f"Processed {len(clean_dataset)} documents")

# Index cleaned dataset
solr = pysolr.Solr('http://localhost:8983/solr/my_core')
solr.add(clean_dataset, commit=True)

Install with Tessl CLI

npx tessl i tessl/pypi-pysolr

docs

admin-operations.md

core-client.md

document-processing.md

index.md

search-operations.md

solrcloud-support.md

utilities.md

tile.json