# Document Processing

Advanced document handling including content extraction with Apache Tika, nested document support, field update operations, and XML/JSON processing utilities for rich document indexing.

## Capabilities

### Content Extraction

Extract text content and metadata from files using Solr's ExtractingRequestHandler (Tika integration).

```python { .api }
def extract(self, file_obj, extractOnly=True, handler="update/extract", **kwargs):
    """
    Extract content and metadata from files using Apache Tika.

    Parameters:
    - file_obj: File-like object with a 'name' attribute (e.g., result of open())
    - extractOnly (bool): Extract without indexing (default: True)
    - handler (str): Extraction handler path (default: "update/extract")
    - **kwargs: Additional Tika/extraction parameters:
      - literal.id (str): Document ID for extracted content
      - fmap.content (str): Map extracted content to field name
      - uprefix (str): Prefix for unknown fields
      - defaultField (str): Default field for unmapped content
      - xpath (str): XPath expression for content selection
      - captureAttr (bool): Capture HTML attributes
      - lowernames (bool): Convert field names to lowercase

    Returns:
    dict or None: Dictionary with 'contents' and 'metadata' keys, or None if extraction fails

    Raises:
    ValueError: If file_obj doesn't have a name attribute
    SolrError: If extraction fails or handler is not configured
    """
```

Usage:

```python
import pysolr

solr = pysolr.Solr('http://localhost:8983/solr/my_core')

# Extract content from a PDF file
with open('document.pdf', 'rb') as pdf_file:
    extracted = solr.extract(pdf_file)

if extracted:
    print("Extracted content:")
    print(extracted['contents'])

    print("Metadata:")
    for key, value in extracted['metadata'].items():
        print(f"  {key}: {value}")
else:
    print("No content could be extracted")

# Extract and index simultaneously
with open('document.docx', 'rb') as docx_file:
    # This will extract and immediately index the document.
    # Dotted Solr parameters (literal.*, fmap.*) are not valid Python
    # identifiers, so pass them via dict unpacking.
    solr.extract(
        docx_file,
        extractOnly=False,
        **{
            "literal.id": 'doc_123',
            "literal.title": 'Important Document',
            "fmap.content": 'text_content',
        }
    )

# Extract with custom field mapping
with open('presentation.pptx', 'rb') as pptx_file:
    extracted = solr.extract(
        pptx_file,
        uprefix='extracted_',
        defaultField='content',
        captureAttr=True,
        lowernames=True
    )
```
### Nested Document Support

Handle parent-child document relationships for hierarchical data structures.

```python { .api }
# Nested document key constant
NESTED_DOC_KEY = "_childDocuments_"
```

Usage:

```python
import pysolr

solr = pysolr.Solr('http://localhost:8983/solr/my_core')

# Index document with nested children
parent_doc = {
    "id": "blog_post_1",
    "title": "Introduction to Machine Learning",
    "author": "Jane Smith",
    "category": "technology",
    pysolr.NESTED_DOC_KEY: [
        {
            "id": "comment_1",
            "type": "comment",
            "author": "John Doe",
            "text": "Great article! Very informative."
        },
        {
            "id": "comment_2",
            "type": "comment",
            "author": "Alice Brown",
            "text": "Thanks for sharing this."
        }
    ]
}

solr.add(parent_doc)

# Alternative syntax using _doc key
parent_doc_alt = {
    "id": "article_1",
    "title": "Python Best Practices",
    "_doc": [
        {"id": "section_1", "title": "Code Style", "content": "Follow PEP 8..."},
        {"id": "section_2", "title": "Testing", "content": "Write comprehensive tests..."}
    ]
}

solr.add(parent_doc_alt)

# Search nested documents
results = solr.search('{!parent which="type:parent"}text:"Great article"')
for doc in results:
    print(f"Parent document: {doc['title']}")
```
### Field Update Operations

Perform atomic updates on specific document fields without reindexing entire documents.

Usage:

```python
import pysolr

solr = pysolr.Solr('http://localhost:8983/solr/my_core')

# Atomic field updates - set new value
solr.add(
    {"id": "doc_1", "status": "published"},
    fieldUpdates={"status": "set"}
)

# Add value to multi-valued field
solr.add(
    {"id": "doc_1", "tags": "python"},
    fieldUpdates={"tags": "add"}
)

# Increment numeric field
solr.add(
    {"id": "doc_1", "view_count": 1},
    fieldUpdates={"view_count": "inc"}
)

# Remove specific value from multi-valued field
solr.add(
    {"id": "doc_1", "tags": "outdated"},
    fieldUpdates={"tags": "remove"}
)

# Multiple field operations
solr.add(
    {
        "id": "doc_1",
        "last_modified": "2024-01-15T10:30:00Z",
        "tags": "updated",
        "version": 1
    },
    fieldUpdates={
        "last_modified": "set",
        "tags": "add",
        "version": "inc"
    }
)
```
### Document Boost Support

Apply scoring boosts to documents and fields during indexing to influence search relevance.

Usage:

```python
import pysolr

solr = pysolr.Solr('http://localhost:8983/solr/my_core')

# Document-level boost
doc_with_boost = {
    "id": "important_doc",
    "title": "Critical Information",
    "content": "This document contains vital information",
    "boost": 2.0  # Document boost factor
}

solr.add(doc_with_boost)

# Field-level boosts
docs = [
    {
        "id": "doc_1",
        "title": "Python Tutorial",
        "content": "Learn Python programming"
    },
    {
        "id": "doc_2",
        "title": "Advanced Python",
        "content": "Master advanced Python concepts"
    }
]

# Boost title field more than content field
field_boosts = {
    "title": 3.0,
    "content": 1.0
}

solr.add(docs, boost=field_boosts)
```
### Batch Processing

Efficiently process large numbers of documents with optimized batch operations.

Usage:

```python
import pysolr

solr = pysolr.Solr('http://localhost:8983/solr/my_core')

# Process large document batch
def process_large_dataset(documents, batch_size=1000):
    """Process documents in batches for optimal performance."""

    for i in range(0, len(documents), batch_size):
        batch = documents[i:i + batch_size]

        try:
            # Add batch without immediate commit
            solr.add(batch, commit=False)
            print(f"Processed batch {i//batch_size + 1}: {len(batch)} documents")

        except pysolr.SolrError as e:
            print(f"Batch {i//batch_size + 1} failed: {e}")
            # Handle individual documents in failed batch
            for doc in batch:
                try:
                    solr.add(doc, commit=False)
                except pysolr.SolrError:
                    print(f"Failed to index document: {doc.get('id', 'unknown')}")

    # Commit all changes at once
    solr.commit()
    print("All batches committed")

# Example usage
large_dataset = []
for i in range(10000):
    doc = {
        "id": f"doc_{i}",
        "title": f"Document {i}",
        "content": f"Content for document number {i}",
        "timestamp": "2024-01-15T10:30:00Z"
    }
    large_dataset.append(doc)

process_large_dataset(large_dataset)
```
### Advanced Document Structures

Handle complex document structures with dynamic fields, copy fields, and custom data types.

```python
import pysolr
import datetime

solr = pysolr.Solr('http://localhost:8983/solr/my_core')

# Complex document with various data types
complex_doc = {
    # Basic fields
    "id": "complex_doc_1",
    "title": "Advanced Document Structure",
    "content": "This document demonstrates complex field types",

    # Date fields (automatically converted)
    "created_date": datetime.datetime.now(),
    "published_date": datetime.date.today(),

    # Multi-valued fields
    "tags": ["python", "solr", "search", "indexing"],
    "authors": ["Alice Smith", "Bob Johnson"],

    # Numeric fields
    "price": 29.99,
    "quantity": 100,
    "rating": 4.5,

    # Boolean fields
    "is_featured": True,
    "is_available": False,

    # Dynamic fields (assuming *_s, *_i, *_dt patterns in schema)
    "custom_string_s": "Custom string value",
    "custom_int_i": 42,
    "custom_date_dt": "2024-01-15T10:30:00Z",

    # Location field (if geo-spatial search is configured)
    "location": "37.7749,-122.4194",  # San Francisco coordinates

    # JSON field (if JSON field type is configured)
    "metadata": {
        "source": "api",
        "version": "1.0",
        "settings": {
            "debug": True,
            "timeout": 30
        }
    }
}

# Index complex document
solr.add(complex_doc)

# Search using various field types
results = solr.search('tags:python AND rating:[4.0 TO *]')
date_results = solr.search('created_date:[2024-01-01T00:00:00Z TO NOW]')
geo_results = solr.search('{!geofilt pt=37.7749,-122.4194 sfield=location d=10}')
```
## Data Type Conversion

pysolr automatically handles data type conversion between Python and Solr formats:

```python
import pysolr
import datetime

# Python -> Solr conversion examples
conversion_examples = {
    # Dates and times
    "datetime_field": datetime.datetime(2024, 1, 15, 10, 30, 0),  # -> "2024-01-15T10:30:00Z"
    "date_field": datetime.date(2024, 1, 15),  # -> "2024-01-15T00:00:00Z"

    # Boolean values
    "is_active": True,    # -> "true"
    "is_deleted": False,  # -> "false"

    # Numeric values (preserved)
    "count": 42,
    "price": 29.99,

    # Strings (UTF-8 encoded and XML-safe)
    "description": "Text with special chars: <>&\"'",

    # Lists and tuples (multi-valued fields)
    "categories": ["tech", "programming", "python"],
    "coordinates": (37.7749, -122.4194),

    # None values (excluded from indexing)
    "optional_field": None,  # This field will not be included
}

solr = pysolr.Solr('http://localhost:8983/solr/my_core')
doc = {"id": "conversion_example"}
doc.update(conversion_examples)
solr.add(doc)
```