Lightweight Python client for Apache Solr.

Helper functions for data conversion, text processing, URL encoding, and XML sanitization, used throughout the pysolr library and available for custom processing needs.
Get the current version of the pysolr library.
def get_version():
"""
Get the current pysolr library version.
Returns:
str: Version string (e.g., "3.10.0")
"""Usage:
import pysolr
version = pysolr.get_version()
print(f"PySOLR version: {version}")
# Use in application logging or debugging
print(f"Using pysolr {version} to connect to Solr")Detect Python version for cross-platform compatibility.
def is_py3():
"""
Check if running on Python 3.x.
Returns:
bool: True if Python 3.x, False if Python 2.x
Note:
- Used internally for handling differences between Python 2 and 3
- Helps with string/bytes handling and compatibility
"""Usage:
import pysolr
if pysolr.is_py3():
print("Running on Python 3.x")
# Python 3-specific logic
else:
print("Running on Python 2.x")
# Python 2-specific logic

Convert between Unicode strings and byte strings for cross-platform compatibility.
def force_unicode(value):
"""
Convert bytestrings to Unicode strings with error handling.
Parameters:
- value: Input value (bytes, str, or other type)
Returns:
str: Unicode string representation
Note:
- On Python 3: Decodes bytes to str, converts other types to str
- On Python 2: Decodes str to unicode, converts other types to unicode
- Uses UTF-8 encoding with 'replace' error handling
"""
def force_bytes(value):
"""
Convert Unicode strings to bytestrings for HTTP transmission.
Parameters:
- value: Input value (str, unicode, or other type)
Returns:
bytes (Python 3) or str (Python 2): Byte string representation
Note:
- Uses UTF-8 encoding with appropriate error handling
- Required for HTTP request bodies and XML processing
"""Usage:
import pysolr
# Convert various types to Unicode
text_bytes = b"Hello, World! \xe2\x9c\x93" # UTF-8 bytes with checkmark
unicode_text = pysolr.force_unicode(text_bytes)
print(f"Unicode: {unicode_text}") # "Hello, World! ✓"
# Convert for HTTP transmission
unicode_string = "Café with special chars: áéíóú"
byte_string = pysolr.force_bytes(unicode_string)
print(f"Bytes: {byte_string}")
# Handle various input types
number_as_unicode = pysolr.force_unicode(12345)
print(f"Number as Unicode: {number_as_unicode}") # "12345"
# Error handling with malformed data
malformed_bytes = b"\xff\xfe\x00\x41" # Invalid UTF-8
safe_unicode = pysolr.force_unicode(malformed_bytes)
print(f"Safe conversion: {safe_unicode}") # Uses replacement charactersClean and process HTML/XML content for safe indexing and display.
def unescape_html(text):
"""
Remove HTML or XML character references and entities from text.
Parameters:
- text (str): HTML or XML source text containing entities
Returns:
str: Plain text with entities converted to Unicode characters
Note:
- Handles both numeric (&#123;, &#x7b;) and named (&amp;, &lt;) entities
- Useful for processing HTML content before indexing
"""
def clean_xml_string(s):
"""
Remove invalid XML characters from string.
Parameters:
- s (str): String to clean
Returns:
str: String with invalid XML characters removed
Note:
- Removes control characters that would cause XML parsing errors
- Applied automatically during document indexing
"""Usage:
import pysolr
# Clean HTML entities
html_content = "Price: £25.99 & free shipping! Rating: 5★"
clean_content = pysolr.unescape_html(html_content)
print(f"Cleaned: {clean_content}") # "Price: £25.99 & free shipping! Rating: 5★"
# Remove invalid XML characters
xml_content = "Valid text\x08\x0bInvalid control chars\x1f\x00More text"
clean_xml = pysolr.clean_xml_string(xml_content)
print(f"Clean XML: {clean_xml}") # "Valid textInvalid control charsMore text"
# Process scraped web content
scraped_html = """
<div class="article">
<h1>Article Title</h1>
<p>Content with &quot;quotes&quot; and &lt;tags&gt;</p>
</div>
"""
readable_text = pysolr.unescape_html(scraped_html)
print(f"Readable: {readable_text}")Safe URL encoding for HTTP parameters with UTF-8 support.
def safe_urlencode(params, doseq=0):
"""
UTF-8-safe version of URL encoding.
Parameters:
- params (dict or list of tuples): Parameters to encode
- doseq (int): Handle sequence values (0=single value, 1=multiple values)
Returns:
str: URL-encoded parameter string
Note:
- Fixes UTF-8 encoding issues in Python 2.x
- Used internally for Solr HTTP requests
- Handles both single and multi-valued parameters
"""Usage:
import pysolr
# Basic parameter encoding
params = {
'q': 'title:python AND content:"machine learning"',
'fq': 'category:programming',
'rows': 20,
'start': 0
}
encoded = pysolr.safe_urlencode(params)
print(f"Encoded: {encoded}")
# Multi-valued parameters
multi_params = {
'fq': ['category:tech', 'status:published', 'date:[2024-01-01T00:00:00Z TO NOW]'],
'fl': ['id', 'title', 'content', 'score']
}
encoded_multi = pysolr.safe_urlencode(multi_params, doseq=1)
print(f"Multi-valued: {encoded_multi}")
# UTF-8 content (especially important for Python 2.x)
utf8_params = {
'q': 'title:café OR content:naïve',
'fq': 'author:"José García"'
}
encoded_utf8 = pysolr.safe_urlencode(utf8_params)
print(f"UTF-8 safe: {encoded_utf8}")Clean data for safe XML processing and indexing.
def sanitize(data):
"""
Remove control characters from data for safe XML processing.
Parameters:
- data (str or bytes): Data to sanitize
Returns:
str: Sanitized Unicode string safe for XML processing
Note:
- Removes ASCII control characters (0x00-0x1F except tab, newline, carriage return)
- Applied automatically during document indexing unless disabled
- Essential for processing binary data or untrusted input
"""Usage:
import pysolr
# Sanitize text with control characters
dirty_text = "Clean text\x00\x01\x02\x08Bad control chars\x0b\x0c\x0e\x1fMore text"
clean_text = pysolr.sanitize(dirty_text)
print(f"Sanitized: {repr(clean_text)}") # Control chars removed
# Process file content
with open('potentially_dirty_file.txt', 'rb') as f:
file_content = f.read()
safe_content = pysolr.sanitize(file_content)
# Now safe to index
doc = {
'id': 'file_doc',
'content': safe_content,
'filename': 'potentially_dirty_file.txt'
}
# Disable automatic sanitization if needed
solr = pysolr.Solr('http://localhost:8983/solr/my_core')
solr.add(
{'id': 'raw_doc', 'content': 'Raw content'},
clean_ctrl_chars=False # Skip automatic sanitization
)

Combine utility functions for comprehensive data processing:
import pysolr
def process_web_content(html_content, document_id):
    """
    Complete pipeline for processing web content for Solr indexing.

    Parameters:
    - html_content (str or bytes): Raw HTML content
    - document_id (str): Unique document identifier

    Returns:
    dict: Processed document ready for indexing
    """
    # Local import keeps the example self-contained: the original body used
    # datetime.datetime.now() without ever importing datetime (NameError).
    import datetime

    # Step 1: Convert to Unicode if needed (bytes -> str, UTF-8 with 'replace')
    unicode_content = pysolr.force_unicode(html_content)
    # Step 2: Unescape HTML entities (&quot; -> ", &lt; -> <, ...)
    unescaped_content = pysolr.unescape_html(unicode_content)
    # Step 3: Remove characters that are invalid in XML documents
    clean_content = pysolr.clean_xml_string(unescaped_content)
    # Step 4: Strip remaining ASCII control characters
    safe_content = pysolr.sanitize(clean_content)
    # Step 5: Assemble the document dict for indexing
    document = {
        'id': document_id,
        'content': safe_content,
        'content_length': len(safe_content),
        'processed_timestamp': pysolr.force_unicode(str(datetime.datetime.now())),
    }
    return document
# Usage example
raw_html = """
<article>
<h1>Café Review</h1>
<p>Great coffee with a rating of 5★</p>
\x08\x0bSome bad control characters\x1f
</article>
"""
processed_doc = process_web_content(raw_html, 'cafe_review_1')
print(f"Processed document: {processed_doc}")
# Index the processed document
solr = pysolr.Solr('http://localhost:8983/solr/my_core')
solr.add(processed_doc)

Handle edge cases and errors gracefully:
import pysolr
def safe_process_data(data):
    """
    Safely process data with error handling.

    Parameters:
    - data: Input data of unknown type/encoding

    Returns:
    str: Safely processed string, or an empty string when processing fails
    """
    try:
        # Normalize whatever came in to a Unicode string first.
        text = pysolr.force_unicode(data)
        # Only unescape entities when the value looks like markup.
        looks_like_html = '<' in text and '>' in text
        if looks_like_html:
            text = pysolr.unescape_html(text)
        # XML cleaning and control-character sanitization always apply.
        return pysolr.sanitize(pysolr.clean_xml_string(text))
    except Exception as e:
        print(f"Data processing error: {e}")
        return ""
# Test with various problematic inputs
test_inputs = [
b'\xff\xfe\x00\x41', # Invalid UTF-8
"Valid & clean text", # HTML entities
"Text\x00with\x08bad\x1fchars", # Control characters
12345, # Non-string type
None, # None value
]
for i, test_input in enumerate(test_inputs):
result = safe_process_data(test_input)
print(f"Input {i}: {repr(test_input)} -> {repr(result)}")Use utility functions efficiently for large-scale processing:
import pysolr
def bulk_sanitize_documents(documents):
    """
    Efficiently sanitize a large number of documents.

    Parameters:
    - documents (list): List of document dictionaries

    Returns:
    list: List of sanitized documents
    """
    def _clean(value):
        # Sanitize a single scalar; non-text values pass through untouched.
        if isinstance(value, (str, bytes)):
            return pysolr.sanitize(pysolr.force_unicode(value))
        return value

    cleaned = []
    for document in documents:
        out = {'id': document['id']}  # Preserve ID
        for field, value in document.items():
            if field == 'id':
                continue
            if isinstance(value, list):
                # Multi-valued field: clean each entry independently.
                out[field] = [_clean(item) for item in value]
            else:
                out[field] = _clean(value)
        cleaned.append(out)
    return cleaned
# Example usage with large dataset
large_dataset = []
for i in range(1000):
doc = {
'id': f'doc_{i}',
'title': f'Document {i} with "special" chars',
'content': f'Content\x08with\x1fbad\x00chars for doc {i}',
'tags': ['tag1', 'tag2\x0b', 'tag3'],
'score': i * 0.1
}
large_dataset.append(doc)
print("Sanitizing large dataset...")
clean_dataset = bulk_sanitize_documents(large_dataset)
print(f"Processed {len(clean_dataset)} documents")
# Index cleaned dataset
solr = pysolr.Solr('http://localhost:8983/solr/my_core')
solr.add(clean_dataset, commit=True)

Install with Tessl CLI
npx tessl i tessl/pypi-pysolr