# Document Processing

Advanced document handling including content extraction with Apache Tika, nested document support, field update operations, and XML/JSON processing utilities for rich document indexing.

## Capabilities

### Content Extraction

Extract text content and metadata from files using Solr's ExtractingRequestHandler (Tika integration).

```python { .api }
def extract(self, file_obj, extractOnly=True, handler="update/extract", **kwargs):
    """
    Extract content and metadata from files using Apache Tika.

    Parameters:
    - file_obj: File-like object with a 'name' attribute (e.g., result of open())
    - extractOnly (bool): Extract without indexing (default: True)
    - handler (str): Extraction handler path (default: "update/extract")
    - **kwargs: Additional Tika/extraction parameters:
      - literal.id (str): Document ID for extracted content
      - fmap.content (str): Map extracted content to field name
      - uprefix (str): Prefix for unknown fields
      - defaultField (str): Default field for unmapped content
      - xpath (str): XPath expression for content selection
      - captureAttr (bool): Capture HTML attributes
      - lowernames (bool): Convert field names to lowercase

    Returns:
    dict or None: Dictionary with 'contents' and 'metadata' keys, or None if extraction fails

    Raises:
    ValueError: If file_obj doesn't have a name attribute
    SolrError: If extraction fails or handler is not configured
    """
```

Usage:

```python
import pysolr

solr = pysolr.Solr('http://localhost:8983/solr/my_core')

# Extract content from a PDF file
with open('document.pdf', 'rb') as pdf_file:
    extracted = solr.extract(pdf_file)

if extracted:
    print("Extracted content:")
    print(extracted['contents'])

    print("Metadata:")
    for key, value in extracted['metadata'].items():
        print(f"  {key}: {value}")
else:
    print("No content could be extracted")

# Extract and index simultaneously
with open('document.docx', 'rb') as docx_file:
    # This will extract and immediately index the document.
    # Dotted Solr parameters (literal.*, fmap.*) are not valid Python
    # identifiers, so pass them via dict unpacking.
    solr.extract(
        docx_file,
        extractOnly=False,
        **{
            "literal.id": 'doc_123',
            "literal.title": 'Important Document',
            "fmap.content": 'text_content',
        }
    )

# Extract with custom field mapping
with open('presentation.pptx', 'rb') as pptx_file:
    extracted = solr.extract(
        pptx_file,
        uprefix='extracted_',
        defaultField='content',
        captureAttr=True,
        lowernames=True
    )
```
### Nested Document Support

Handle parent-child document relationships for hierarchical data structures.

```python { .api }
# Nested document key constant
NESTED_DOC_KEY = "_childDocuments_"
```

Usage:

```python
import pysolr

solr = pysolr.Solr('http://localhost:8983/solr/my_core')

# Index document with nested children
parent_doc = {
    "id": "blog_post_1",
    "title": "Introduction to Machine Learning",
    "author": "Jane Smith",
    "category": "technology",
    pysolr.NESTED_DOC_KEY: [
        {
            "id": "comment_1",
            "type": "comment",
            "author": "John Doe",
            "text": "Great article! Very informative."
        },
        {
            "id": "comment_2",
            "type": "comment",
            "author": "Alice Brown",
            "text": "Thanks for sharing this."
        }
    ]
}

solr.add(parent_doc)

# Alternative syntax using _doc key
parent_doc_alt = {
    "id": "article_1",
    "title": "Python Best Practices",
    "_doc": [
        {"id": "section_1", "title": "Code Style", "content": "Follow PEP 8..."},
        {"id": "section_2", "title": "Testing", "content": "Write comprehensive tests..."}
    ]
}

solr.add(parent_doc_alt)

# Search nested documents
results = solr.search('{!parent which="type:parent"}text:"Great article"')
for doc in results:
    print(f"Parent document: {doc['title']}")
```
### Field Update Operations

Perform atomic updates on specific document fields without reindexing entire documents.

Usage:

```python
import pysolr

solr = pysolr.Solr('http://localhost:8983/solr/my_core')

# Atomic field updates - set new value
solr.add(
    {"id": "doc_1", "status": "published"},
    fieldUpdates={"status": "set"}
)

# Add value to multi-valued field
solr.add(
    {"id": "doc_1", "tags": "python"},
    fieldUpdates={"tags": "add"}
)

# Increment numeric field
solr.add(
    {"id": "doc_1", "view_count": 1},
    fieldUpdates={"view_count": "inc"}
)

# Remove specific value from multi-valued field
solr.add(
    {"id": "doc_1", "tags": "outdated"},
    fieldUpdates={"tags": "remove"}
)

# Multiple field operations
solr.add(
    {
        "id": "doc_1",
        "last_modified": "2024-01-15T10:30:00Z",
        "tags": "updated",
        "version": 1
    },
    fieldUpdates={
        "last_modified": "set",
        "tags": "add",
        "version": "inc"
    }
)
```
### Document Boost Support

Apply scoring boosts to documents and fields during indexing to influence search relevance.

Usage:

```python
import pysolr

solr = pysolr.Solr('http://localhost:8983/solr/my_core')

# Document-level boost
doc_with_boost = {
    "id": "important_doc",
    "title": "Critical Information",
    "content": "This document contains vital information",
    "boost": 2.0  # Document boost factor
}

solr.add(doc_with_boost)

# Field-level boosts
docs = [
    {
        "id": "doc_1",
        "title": "Python Tutorial",
        "content": "Learn Python programming"
    },
    {
        "id": "doc_2",
        "title": "Advanced Python",
        "content": "Master advanced Python concepts"
    }
]

# Boost title field more than content field
field_boosts = {
    "title": 3.0,
    "content": 1.0
}

solr.add(docs, boost=field_boosts)
```
### Batch Processing

Efficiently process large numbers of documents with optimized batch operations.

Usage:

```python
import pysolr

solr = pysolr.Solr('http://localhost:8983/solr/my_core')

# Process large document batch
def process_large_dataset(documents, batch_size=1000):
    """Process documents in batches for optimal performance."""

    for i in range(0, len(documents), batch_size):
        batch = documents[i:i + batch_size]

        try:
            # Add batch without immediate commit
            solr.add(batch, commit=False)
            print(f"Processed batch {i//batch_size + 1}: {len(batch)} documents")

        except pysolr.SolrError as e:
            print(f"Batch {i//batch_size + 1} failed: {e}")
            # Handle individual documents in failed batch
            for doc in batch:
                try:
                    solr.add(doc, commit=False)
                except pysolr.SolrError:
                    print(f"Failed to index document: {doc.get('id', 'unknown')}")

    # Commit all changes at once
    solr.commit()
    print("All batches committed")

# Example usage
large_dataset = []
for i in range(10000):
    doc = {
        "id": f"doc_{i}",
        "title": f"Document {i}",
        "content": f"Content for document number {i}",
        "timestamp": "2024-01-15T10:30:00Z"
    }
    large_dataset.append(doc)

process_large_dataset(large_dataset)
```
### Advanced Document Structures

Handle complex document structures with dynamic fields, copy fields, and custom data types.

```python
import pysolr
import datetime

solr = pysolr.Solr('http://localhost:8983/solr/my_core')

# Complex document with various data types
complex_doc = {
    # Basic fields
    "id": "complex_doc_1",
    "title": "Advanced Document Structure",
    "content": "This document demonstrates complex field types",

    # Date fields (automatically converted)
    "created_date": datetime.datetime.now(),
    "published_date": datetime.date.today(),

    # Multi-valued fields
    "tags": ["python", "solr", "search", "indexing"],
    "authors": ["Alice Smith", "Bob Johnson"],

    # Numeric fields
    "price": 29.99,
    "quantity": 100,
    "rating": 4.5,

    # Boolean fields
    "is_featured": True,
    "is_available": False,

    # Dynamic fields (assuming *_s, *_i, *_dt patterns in schema)
    "custom_string_s": "Custom string value",
    "custom_int_i": 42,
    "custom_date_dt": "2024-01-15T10:30:00Z",

    # Location field (if geo-spatial search is configured)
    "location": "37.7749,-122.4194",  # San Francisco coordinates

    # JSON field (if JSON field type is configured)
    "metadata": {
        "source": "api",
        "version": "1.0",
        "settings": {
            "debug": True,
            "timeout": 30
        }
    }
}

# Index complex document
solr.add(complex_doc)

# Search using various field types
results = solr.search('tags:python AND rating:[4.0 TO *]')
date_results = solr.search('created_date:[2024-01-01T00:00:00Z TO NOW]')
geo_results = solr.search('{!geofilt pt=37.7749,-122.4194 sfield=location d=10}')
```
## Data Type Conversion

pysolr automatically handles data type conversion between Python and Solr formats:

```python
import pysolr
import datetime

# Python -> Solr conversion examples
conversion_examples = {
    # Dates and times
    "datetime_field": datetime.datetime(2024, 1, 15, 10, 30, 0),  # -> "2024-01-15T10:30:00Z"
    "date_field": datetime.date(2024, 1, 15),  # -> "2024-01-15T00:00:00Z"

    # Boolean values
    "is_active": True,    # -> "true"
    "is_deleted": False,  # -> "false"

    # Numeric values (preserved)
    "count": 42,
    "price": 29.99,

    # Strings (UTF-8 encoded and XML-safe)
    "description": "Text with special chars: <>&\"'",

    # Lists and tuples (multi-valued fields)
    "categories": ["tech", "programming", "python"],
    "coordinates": (37.7749, -122.4194),

    # None values (excluded from indexing)
    "optional_field": None,  # This field will not be included
}

solr = pysolr.Solr('http://localhost:8983/solr/my_core')
doc = {"id": "conversion_example"}
doc.update(conversion_examples)
solr.add(doc)
```