CtrlK
Blog · Docs · Log in · Get started
Tessl Logo

tessl/pypi-pysolr

Lightweight Python client for Apache Solr

Pending
Overview
Eval results
Files

document-processing.mddocs/

Document Processing

Advanced document handling including content extraction with Apache Tika, nested document support, field update operations, and XML/JSON processing utilities for rich document indexing.

Capabilities

Content Extraction

Extract text content and metadata from files using Solr's ExtractingRequestHandler (Tika integration).

def extract(self, file_obj, extractOnly=True, handler="update/extract", **kwargs):
    """
    Extract content and metadata from a file using Apache Tika via
    Solr's ExtractingRequestHandler.

    Parameters:
    - file_obj: File-like object with a 'name' attribute (e.g., the result of open())
    - extractOnly (bool): If True, extract without indexing (default: True)
    - handler (str): Extraction handler path (default: "update/extract")
    - **kwargs: Additional Tika/extraction parameters. NOTE: parameter
      names containing dots are not valid Python identifiers, so they
      must be supplied via dictionary unpacking, e.g.
      ``solr.extract(f, **{"literal.id": "doc_1"})``:
        - literal.id (str): Document ID for extracted content
        - fmap.content (str): Map extracted content to field name
        - uprefix (str): Prefix for unknown fields
        - defaultField (str): Default field for unmapped content
        - xpath (str): XPath expression for content selection
        - captureAttr (bool): Capture HTML attributes
        - lowernames (bool): Convert field names to lowercase

    Returns:
    dict or None: Dictionary with 'contents' and 'metadata' keys, or None if extraction fails

    Raises:
    ValueError: If file_obj doesn't have a name attribute
    SolrError: If extraction fails or handler is not configured
    """

Usage:

import pysolr

solr = pysolr.Solr('http://localhost:8983/solr/my_core')

# Extract content from a PDF file
with open('document.pdf', 'rb') as pdf_file:
    extracted = solr.extract(pdf_file)
    
    if extracted:
        print("Extracted content:")
        print(extracted['contents'])
        
        print("Metadata:")
        for key, value in extracted['metadata'].items():
            print(f"  {key}: {value}")
    else:
        print("No content could be extracted")

# Extract and index simultaneously
with open('document.docx', 'rb') as docx_file:
    # This will extract and immediately index the document
    # Dotted Solr parameter names (literal.*, fmap.*) are not valid
    # Python keyword arguments, so pass them via dictionary unpacking:
    solr.extract(
        docx_file,
        extractOnly=False,
        **{
            "literal.id": "doc_123",
            "literal.title": "Important Document",
            "fmap.content": "text_content",
        }
    )

# Extract with custom field mapping
with open('presentation.pptx', 'rb') as pptx_file:
    extracted = solr.extract(
        pptx_file,
        uprefix='extracted_',
        defaultField='content',
        captureAttr=True,
        lowernames=True
    )

Nested Document Support

Handle parent-child document relationships for hierarchical data structures.

# Nested document key constant
NESTED_DOC_KEY = "_childDocuments_"

Usage:

import pysolr

solr = pysolr.Solr('http://localhost:8983/solr/my_core')

# Index document with nested children
parent_doc = {
    "id": "blog_post_1",
    "title": "Introduction to Machine Learning",
    "author": "Jane Smith",
    "category": "technology",
    pysolr.NESTED_DOC_KEY: [
        {
            "id": "comment_1",
            "type": "comment",
            "author": "John Doe",
            "text": "Great article! Very informative."
        },
        {
            "id": "comment_2", 
            "type": "comment",
            "author": "Alice Brown",
            "text": "Thanks for sharing this."
        }
    ]
}

solr.add(parent_doc)

# Alternative syntax using _doc key
parent_doc_alt = {
    "id": "article_1",
    "title": "Python Best Practices",
    "_doc": [
        {"id": "section_1", "title": "Code Style", "content": "Follow PEP 8..."},
        {"id": "section_2", "title": "Testing", "content": "Write comprehensive tests..."}
    ]
}

solr.add(parent_doc_alt)

# Search nested documents
results = solr.search('{!parent which="category:technology"}text:"Great article"')
for doc in results:
    print(f"Parent document: {doc['title']}")

Field Update Operations

Perform atomic updates on specific document fields without reindexing entire documents.

Usage:

import pysolr

solr = pysolr.Solr('http://localhost:8983/solr/my_core')

# Atomic field updates - set new value
solr.add(
    {"id": "doc_1", "status": "published"},
    fieldUpdates={"status": "set"}
)

# Add value to multi-valued field
solr.add(
    {"id": "doc_1", "tags": "python"},
    fieldUpdates={"tags": "add"}
)

# Increment numeric field
solr.add(
    {"id": "doc_1", "view_count": 1},
    fieldUpdates={"view_count": "inc"}
)

# Remove specific value from multi-valued field  
solr.add(
    {"id": "doc_1", "tags": "outdated"},
    fieldUpdates={"tags": "remove"}
)

# Multiple field operations
solr.add(
    {
        "id": "doc_1",
        "last_modified": "2024-01-15T10:30:00Z",
        "tags": "updated",
        "version": 1
    },
    fieldUpdates={
        "last_modified": "set",
        "tags": "add", 
        "version": "inc"
    }
)

Document Boost Support

Apply scoring boosts to documents and fields during indexing to influence search relevance. Note: index-time boosts were removed in Solr 7.0, so these options only take effect on older Solr versions — on modern Solr, prefer query-time boosting (e.g. the `qf`/`boost` query parameters).

Usage:

import pysolr

solr = pysolr.Solr('http://localhost:8983/solr/my_core')

# Document-level boost
doc_with_boost = {
    "id": "important_doc",
    "title": "Critical Information",
    "content": "This document contains vital information",
    "boost": 2.0  # Document boost factor
}

solr.add(doc_with_boost)

# Field-level boosts
docs = [
    {
        "id": "doc_1",
        "title": "Python Tutorial",
        "content": "Learn Python programming"
    },
    {
        "id": "doc_2", 
        "title": "Advanced Python",
        "content": "Master advanced Python concepts"
    }
]

# Boost title field more than content field
field_boosts = {
    "title": 3.0,
    "content": 1.0
}

solr.add(docs, boost=field_boosts)

Batch Processing

Efficiently process large numbers of documents with optimized batch operations.

Usage:

import pysolr

solr = pysolr.Solr('http://localhost:8983/solr/my_core')

# Process large document batch
def process_large_dataset(documents, batch_size=1000):
    """Index documents into Solr in fixed-size batches.

    Each batch is added with commit=False and a single commit is issued
    at the end. If a whole batch fails, its documents are retried one at
    a time so a single bad document cannot sink the entire batch.
    """
    total = len(documents)
    i = 0
    while i < total:
        batch = documents[i:i + batch_size]

        try:
            # Defer the commit until every batch has been sent.
            solr.add(batch, commit=False)
            print(f"Processed batch {i//batch_size + 1}: {len(batch)} documents")

        except pysolr.SolrError as e:
            print(f"Batch {i//batch_size + 1} failed: {e}")
            # Fall back to one-by-one indexing for the failed batch.
            for doc in batch:
                try:
                    solr.add(doc, commit=False)
                except pysolr.SolrError:
                    print(f"Failed to index document: {doc.get('id', 'unknown')}")

        i += batch_size

    # One commit covering everything indexed above.
    solr.commit()
    print("All batches committed")

# Example usage
large_dataset = []
for i in range(10000):
    doc = {
        "id": f"doc_{i}",
        "title": f"Document {i}",
        "content": f"Content for document number {i}",
        "timestamp": "2024-01-15T10:30:00Z"
    }
    large_dataset.append(doc)

process_large_dataset(large_dataset)

Advanced Document Structures

Handle complex document structures with dynamic fields, copy fields, and custom data types.

import pysolr
import datetime

solr = pysolr.Solr('http://localhost:8983/solr/my_core')

# Complex document with various data types
complex_doc = {
    # Basic fields
    "id": "complex_doc_1",
    "title": "Advanced Document Structure",
    "content": "This document demonstrates complex field types",
    
    # Date fields (automatically converted)
    "created_date": datetime.datetime.now(),
    "published_date": datetime.date.today(),
    
    # Multi-valued fields
    "tags": ["python", "solr", "search", "indexing"],
    "authors": ["Alice Smith", "Bob Johnson"],
    
    # Numeric fields
    "price": 29.99,
    "quantity": 100,
    "rating": 4.5,
    
    # Boolean fields
    "is_featured": True,
    "is_available": False,
    
    # Dynamic fields (assuming *_s, *_i, *_dt patterns in schema)
    "custom_string_s": "Custom string value",
    "custom_int_i": 42,
    "custom_date_dt": "2024-01-15T10:30:00Z",
    
    # Location field (if geo-spatial search is configured)
    "location": "37.7749,-122.4194",  # San Francisco coordinates
    
    # JSON field (if JSON field type is configured)
    "metadata": {
        "source": "api",
        "version": "1.0",
        "settings": {
            "debug": True,
            "timeout": 30
        }
    }
}

# Index complex document
solr.add(complex_doc)

# Search using various field types
results = solr.search('tags:python AND rating:[4.0 TO *]')
date_results = solr.search('created_date:[2024-01-01T00:00:00Z TO NOW]')
geo_results = solr.search('{!geofilt pt=37.7749,-122.4194 sfield=location d=10}')

Data Type Conversion

pysolr automatically handles data type conversion between Python and Solr formats:

import pysolr
import datetime

# Python -> Solr conversion examples
conversion_examples = {
    # Dates and times
    "datetime_field": datetime.datetime(2024, 1, 15, 10, 30, 0),  # -> "2024-01-15T10:30:00Z"
    "date_field": datetime.date(2024, 1, 15),  # -> "2024-01-15T00:00:00Z"
    
    # Boolean values
    "is_active": True,   # -> "true"
    "is_deleted": False, # -> "false"
    
    # Numeric values (preserved)
    "count": 42,
    "price": 29.99,
    
    # Strings (UTF-8 encoded and XML-safe)
    "description": "Text with special chars: <>&\"'",
    
    # Lists and tuples (multi-valued fields)
    "categories": ["tech", "programming", "python"],
    "coordinates": (37.7749, -122.4194),
    
    # None values (excluded from indexing)
    "optional_field": None,  # This field will not be included
}

solr = pysolr.Solr('http://localhost:8983/solr/my_core')
doc = {"id": "conversion_example"}
doc.update(conversion_examples)
solr.add(doc)

Install with Tessl CLI

npx tessl i tessl/pypi-pysolr

docs

admin-operations.md

core-client.md

document-processing.md

index.md

search-operations.md

solrcloud-support.md

utilities.md

tile.json