Lightweight Python client for Apache Solr
—
Advanced document handling including content extraction with Apache Tika, nested document support, field update operations, and XML/JSON processing utilities for rich document indexing.
Extract text content and metadata from files using Solr's ExtractingRequestHandler (Tika integration).
def extract(self, file_obj, extractOnly=True, handler="update/extract", **kwargs):
"""
Extract content and metadata from files using Apache Tika.
Parameters:
- file_obj: File-like object with a 'name' attribute (e.g., result of open())
- extractOnly (bool): Extract without indexing (default: True)
- handler (str): Extraction handler path (default: "update/extract")
- **kwargs: Additional Tika/extraction parameters:
- literal.id (str): Document ID for extracted content
- fmap.content (str): Map extracted content to field name
- uprefix (str): Prefix for unknown fields
- defaultField (str): Default field for unmapped content
- xpath (str): XPath expression for content selection
- captureAttr (bool): Capture HTML attributes
- lowernames (bool): Convert field names to lowercase
Returns:
dict or None: Dictionary with 'contents' and 'metadata' keys, or None if extraction fails
Raises:
ValueError: If file_obj doesn't have a name attribute
SolrError: If extraction fails or handler is not configured
"""
Usage:
import pysolr
solr = pysolr.Solr('http://localhost:8983/solr/my_core')
# Extract content from a PDF file
with open('document.pdf', 'rb') as pdf_file:
extracted = solr.extract(pdf_file)
if extracted:
print("Extracted content:")
print(extracted['contents'])
print("Metadata:")
for key, value in extracted['metadata'].items():
print(f" {key}: {value}")
else:
print("No content could be extracted")
# Extract and index simultaneously
with open('document.docx', 'rb') as docx_file:
    # This will extract and immediately index the document.
    # The extraction parameters are spelled with dots (literal.id,
    # fmap.content), which are not valid Python keyword names, so they are
    # supplied by unpacking a dict rather than as literal_id=... keywords.
    solr.extract(
        docx_file,
        extractOnly=False,
        **{
            'literal.id': 'doc_123',
            'literal.title': 'Important Document',
            'fmap.content': 'text_content',
        }
    )
# Extract with custom field mapping
with open('presentation.pptx', 'rb') as pptx_file:
extracted = solr.extract(
pptx_file,
uprefix='extracted_',
defaultField='content',
captureAttr=True,
lowernames=True
)
Handle parent-child document relationships for hierarchical data structures.
# Nested document key constant
NESTED_DOC_KEY = "_childDocuments_"
Usage:
import pysolr
solr = pysolr.Solr('http://localhost:8983/solr/my_core')
# Index document with nested children
parent_doc = {
    "id": "blog_post_1",
    # Marks this document as a parent so that a block-join query whose parent
    # filter is which="type:parent" can identify it.
    "type": "parent",
    "title": "Introduction to Machine Learning",
    "author": "Jane Smith",
    "category": "technology",
    pysolr.NESTED_DOC_KEY: [
        {
            "id": "comment_1",
            "type": "comment",
            "author": "John Doe",
            "text": "Great article! Very informative."
        },
        {
            "id": "comment_2",
            "type": "comment",
            "author": "Alice Brown",
            "text": "Thanks for sharing this."
        }
    ]
}
solr.add(parent_doc)
# Alternative syntax using _doc key
parent_doc_alt = {
"id": "article_1",
"title": "Python Best Practices",
"_doc": [
{"id": "section_1", "title": "Code Style", "content": "Follow PEP 8..."},
{"id": "section_2", "title": "Testing", "content": "Write comprehensive tests..."}
]
}
solr.add(parent_doc_alt)
# Search nested documents
results = solr.search('{!parent which="type:parent"}text:"Great article"')
for doc in results:
print(f"Parent document: {doc['title']}")
Perform atomic updates on specific document fields without reindexing entire documents.
Usage:
import pysolr
solr = pysolr.Solr('http://localhost:8983/solr/my_core')
# Atomic field updates - set new value
solr.add(
{"id": "doc_1", "status": "published"},
fieldUpdates={"status": "set"}
)
# Add value to multi-valued field
solr.add(
{"id": "doc_1", "tags": "python"},
fieldUpdates={"tags": "add"}
)
# Increment numeric field
solr.add(
{"id": "doc_1", "view_count": 1},
fieldUpdates={"view_count": "inc"}
)
# Remove specific value from multi-valued field
solr.add(
{"id": "doc_1", "tags": "outdated"},
fieldUpdates={"tags": "remove"}
)
# Multiple field operations
solr.add(
{
"id": "doc_1",
"last_modified": "2024-01-15T10:30:00Z",
"tags": "updated",
"version": 1
},
fieldUpdates={
"last_modified": "set",
"tags": "add",
"version": "inc"
}
)
Apply scoring boosts to documents and fields during indexing to influence search relevance.
Usage:
import pysolr
solr = pysolr.Solr('http://localhost:8983/solr/my_core')
# Document-level boost
doc_with_boost = {
"id": "important_doc",
"title": "Critical Information",
"content": "This document contains vital information",
"boost": 2.0 # Document boost factor
}
solr.add(doc_with_boost)
# Field-level boosts
docs = [
{
"id": "doc_1",
"title": "Python Tutorial",
"content": "Learn Python programming"
},
{
"id": "doc_2",
"title": "Advanced Python",
"content": "Master advanced Python concepts"
}
]
# Boost title field more than content field
field_boosts = {
"title": 3.0,
"content": 1.0
}
solr.add(docs, boost=field_boosts)
Efficiently process large numbers of documents with optimized batch operations.
Usage:
import pysolr
solr = pysolr.Solr('http://localhost:8983/solr/my_core')
# Process large document batch
def process_large_dataset(documents, batch_size=1000):
    """Index documents in fixed-size batches, then commit once at the end.

    Each batch is sent with ``commit=False``. If a batch is rejected, its
    documents are retried one at a time so that a single bad document does
    not discard the rest of the batch; documents that still fail are
    reported and skipped.

    Parameters:
    - documents: Sequence of document dicts (must support ``len()`` and slicing)
    - batch_size (int): Number of documents per request (default: 1000)

    Raises:
        ValueError: If batch_size is not a positive integer
    """
    if batch_size < 1:
        # A zero step makes range() fail and a negative one silently indexes
        # nothing, so reject both up front.
        raise ValueError("batch_size must be a positive integer")

    batch_starts = range(0, len(documents), batch_size)
    for batch_number, start in enumerate(batch_starts, start=1):
        batch = documents[start:start + batch_size]
        try:
            # Add batch without immediate commit
            solr.add(batch, commit=False)
        except pysolr.SolrError as e:
            print(f"Batch {batch_number} failed: {e}")
            # Handle individual documents in failed batch
            for doc in batch:
                try:
                    # Wrapped in a list to match the batch call above.
                    solr.add([doc], commit=False)
                except pysolr.SolrError:
                    print(f"Failed to index document: {doc.get('id', 'unknown')}")
        else:
            print(f"Processed batch {batch_number}: {len(batch)} documents")

    # Commit all changes at once
    solr.commit()
    print("All batches committed")
# Example usage
large_dataset = []
for i in range(10000):
doc = {
"id": f"doc_{i}",
"title": f"Document {i}",
"content": f"Content for document number {i}",
"timestamp": "2024-01-15T10:30:00Z"
}
large_dataset.append(doc)
process_large_dataset(large_dataset)
Handle complex document structures with dynamic fields, copy fields, and custom data types.
import pysolr
import datetime
solr = pysolr.Solr('http://localhost:8983/solr/my_core')
# Complex document with various data types
complex_doc = {
# Basic fields
"id": "complex_doc_1",
"title": "Advanced Document Structure",
"content": "This document demonstrates complex field types",
# Date fields (automatically converted)
"created_date": datetime.datetime.now(),
"published_date": datetime.date.today(),
# Multi-valued fields
"tags": ["python", "solr", "search", "indexing"],
"authors": ["Alice Smith", "Bob Johnson"],
# Numeric fields
"price": 29.99,
"quantity": 100,
"rating": 4.5,
# Boolean fields
"is_featured": True,
"is_available": False,
# Dynamic fields (assuming *_s, *_i, *_dt patterns in schema)
"custom_string_s": "Custom string value",
"custom_int_i": 42,
"custom_date_dt": "2024-01-15T10:30:00Z",
# Location field (if geo-spatial search is configured)
"location": "37.7749,-122.4194", # San Francisco coordinates
# JSON field (if JSON field type is configured)
"metadata": {
"source": "api",
"version": "1.0",
"settings": {
"debug": True,
"timeout": 30
}
}
}
# Index complex document
solr.add(complex_doc)
# Search using various field types
results = solr.search('tags:python AND rating:[4.0 TO *]')
date_results = solr.search('created_date:[2024-01-01T00:00:00Z TO NOW]')
geo_results = solr.search('{!geofilt pt=37.7749,-122.4194 sfield=location d=10}')
PySOLR automatically handles data type conversion between Python and Solr formats:
import pysolr
import datetime
# Python -> Solr conversion examples
conversion_examples = {
# Dates and times
"datetime_field": datetime.datetime(2024, 1, 15, 10, 30, 0), # -> "2024-01-15T10:30:00Z"
"date_field": datetime.date(2024, 1, 15), # -> "2024-01-15T00:00:00Z"
# Boolean values
"is_active": True, # -> "true"
"is_deleted": False, # -> "false"
# Numeric values (preserved)
"count": 42,
"price": 29.99,
# Strings (UTF-8 encoded and XML-safe)
"description": "Text with special chars: <>&\"'",
# Lists and tuples (multi-valued fields)
"categories": ["tech", "programming", "python"],
"coordinates": (37.7749, -122.4194),
# None values (excluded from indexing)
"optional_field": None, # This field will not be included
}
solr = pysolr.Solr('http://localhost:8983/solr/my_core')
doc = {"id": "conversion_example"}
doc.update(conversion_examples)
solr.add(doc)
Install with Tessl CLI
npx tessl i tessl/pypi-pysolr