0
# Utility Functions
1
2
Helper functions for data conversion, text processing, URL encoding, and XML sanitization used throughout the pysolr library and available for custom processing needs.
3
4
## Capabilities
5
6
### Version Information
7
8
Get the current version of the pysolr library.
9
10
```python { .api }
11
def get_version():
12
"""
13
Get the current pysolr library version.
14
15
Returns:
16
str: Version string (e.g., "3.10.0")
17
"""
18
```
19
20
Usage:
21
22
```python
23
import pysolr
24
25
version = pysolr.get_version()
26
print(f"pysolr version: {version}")
27
28
# Use in application logging or debugging
29
print(f"Using pysolr {version} to connect to Solr")
30
```
31
32
### Python Version Detection
33
34
Detect Python version for cross-platform compatibility.
35
36
```python { .api }
37
def is_py3():
38
"""
39
Check if running on Python 3.x.
40
41
Returns:
42
bool: True if Python 3.x, False if Python 2.x
43
44
Note:
45
- Used internally for handling differences between Python 2 and 3
46
- Helps with string/bytes handling and compatibility
47
"""
48
```
49
50
Usage:
51
52
```python
53
import pysolr
54
55
if pysolr.is_py3():
56
print("Running on Python 3.x")
57
# Python 3-specific logic
58
else:
59
print("Running on Python 2.x")
60
# Python 2-specific logic
61
```
62
63
### String Encoding Utilities
64
65
Convert between Unicode strings and byte strings for cross-platform compatibility.
66
67
```python { .api }
68
def force_unicode(value):
69
"""
70
Convert bytestrings to Unicode strings with error handling.
71
72
Parameters:
73
- value: Input value (bytes, str, or other type)
74
75
Returns:
76
str: Unicode string representation
77
78
Note:
79
- On Python 3: Decodes bytes to str, converts other types to str
80
- On Python 2: Decodes str to unicode, converts other types to unicode
81
- Uses UTF-8 encoding with 'replace' error handling
82
"""
83
84
def force_bytes(value):
85
"""
86
Convert Unicode strings to bytestrings for HTTP transmission.
87
88
Parameters:
89
- value: Input value (str, unicode, or other type)
90
91
Returns:
92
bytes (Python 3) or str (Python 2): Byte string representation
93
94
Note:
95
- Uses UTF-8 encoding with appropriate error handling
96
- Required for HTTP request bodies and XML processing
97
"""
98
```
99
100
Usage:
101
102
```python
103
import pysolr
104
105
# Convert various types to Unicode
106
text_bytes = b"Hello, World! \xe2\x9c\x93" # UTF-8 bytes with checkmark
107
unicode_text = pysolr.force_unicode(text_bytes)
108
print(f"Unicode: {unicode_text}") # "Hello, World! ✓"
109
110
# Convert for HTTP transmission
111
unicode_string = "Café with special chars: áéíóú"
112
byte_string = pysolr.force_bytes(unicode_string)
113
print(f"Bytes: {byte_string}")
114
115
# Handle various input types
116
number_as_unicode = pysolr.force_unicode(12345)
117
print(f"Number as Unicode: {number_as_unicode}") # "12345"
118
119
# Error handling with malformed data
120
malformed_bytes = b"\xff\xfe\x00\x41" # Invalid UTF-8
121
safe_unicode = pysolr.force_unicode(malformed_bytes)
122
print(f"Safe conversion: {safe_unicode}") # Uses replacement characters
123
```
124
125
### HTML/XML Processing
126
127
Clean and process HTML/XML content for safe indexing and display.
128
129
```python { .api }
130
def unescape_html(text):
131
"""
132
Remove HTML or XML character references and entities from text.
133
134
Parameters:
135
- text (str): HTML or XML source text containing entities
136
137
Returns:
138
str: Plain text with entities converted to Unicode characters
139
140
Note:
141
- Handles both numeric (`&#233;`, `&#xe9;`) and named (`&amp;`, `&lt;`) entities
142
- Useful for processing HTML content before indexing
143
"""
144
145
def clean_xml_string(s):
146
"""
147
Remove invalid XML characters from string.
148
149
Parameters:
150
- s (str): String to clean
151
152
Returns:
153
str: String with invalid XML characters removed
154
155
Note:
156
- Removes control characters that would cause XML parsing errors
157
- Applied automatically during document indexing
158
"""
159
```
160
161
Usage:
162
163
```python
164
import pysolr
165
166
# Clean HTML entities
167
html_content = "Price: &pound;25.99 &amp; free shipping! Rating: 5&#9733;"
clean_content = pysolr.unescape_html(html_content)
print(f"Cleaned: {clean_content}")  # "Price: £25.99 & free shipping! Rating: 5★"
170
171
# Remove invalid XML characters
172
xml_content = "Valid text\x08\x0bInvalid control chars\x1f\x00More text"
173
clean_xml = pysolr.clean_xml_string(xml_content)
174
print(f"Clean XML: {clean_xml}") # "Valid textInvalid control charsMore text"
175
176
# Process scraped web content
177
scraped_html = """
178
<div class="article">
179
<h1>Article Title</h1>
180
<p>Content with &quot;quotes&quot; and &lt;tags&gt;</p>
181
</div>
182
"""
183
readable_text = pysolr.unescape_html(scraped_html)
184
print(f"Readable: {readable_text}")
185
```
186
187
### URL Encoding
188
189
Safe URL encoding for HTTP parameters with UTF-8 support.
190
191
```python { .api }
192
def safe_urlencode(params, doseq=0):
193
"""
194
UTF-8-safe version of URL encoding.
195
196
Parameters:
197
- params (dict or list of tuples): Parameters to encode
198
- doseq (int): Handle sequence values (0=single value, 1=multiple values)
199
200
Returns:
201
str: URL-encoded parameter string
202
203
Note:
204
- Fixes UTF-8 encoding issues in Python 2.x
205
- Used internally for Solr HTTP requests
206
- Handles both single and multi-valued parameters
207
"""
208
```
209
210
Usage:
211
212
```python
213
import pysolr
214
215
# Basic parameter encoding
216
params = {
217
'q': 'title:python AND content:"machine learning"',
218
'fq': 'category:programming',
219
'rows': 20,
220
'start': 0
221
}
222
encoded = pysolr.safe_urlencode(params)
223
print(f"Encoded: {encoded}")
224
225
# Multi-valued parameters
226
multi_params = {
227
'fq': ['category:tech', 'status:published', 'date:[2024-01-01T00:00:00Z TO NOW]'],
228
'fl': ['id', 'title', 'content', 'score']
229
}
230
encoded_multi = pysolr.safe_urlencode(multi_params, doseq=1)
231
print(f"Multi-valued: {encoded_multi}")
232
233
# UTF-8 content (especially important for Python 2.x)
234
utf8_params = {
235
'q': 'title:café OR content:naïve',
236
'fq': 'author:"José García"'
237
}
238
encoded_utf8 = pysolr.safe_urlencode(utf8_params)
239
print(f"UTF-8 safe: {encoded_utf8}")
240
```
241
242
### Data Sanitization
243
244
Clean data for safe XML processing and indexing.
245
246
```python { .api }
247
def sanitize(data):
248
"""
249
Remove control characters from data for safe XML processing.
250
251
Parameters:
252
- data (str or bytes): Data to sanitize
253
254
Returns:
255
str: Sanitized Unicode string safe for XML processing
256
257
Note:
258
- Removes ASCII control characters (0x00-0x1F except tab, newline, carriage return)
259
- Applied automatically during document indexing unless disabled
260
- Essential for processing binary data or untrusted input
261
"""
262
```
263
264
Usage:
265
266
```python
267
import pysolr
268
269
# Sanitize text with control characters
270
dirty_text = "Clean text\x00\x01\x02\x08Bad control chars\x0b\x0c\x0e\x1fMore text"
271
clean_text = pysolr.sanitize(dirty_text)
272
print(f"Sanitized: {repr(clean_text)}") # Control chars removed
273
274
# Process file content
275
with open('potentially_dirty_file.txt', 'rb') as f:
276
file_content = f.read()
277
safe_content = pysolr.sanitize(file_content)
278
279
# Now safe to index
280
doc = {
281
'id': 'file_doc',
282
'content': safe_content,
283
'filename': 'potentially_dirty_file.txt'
284
}
285
286
# Disable automatic sanitization if needed
287
solr = pysolr.Solr('http://localhost:8983/solr/my_core')
288
solr.add(
289
{'id': 'raw_doc', 'content': 'Raw content'},
290
clean_ctrl_chars=False # Skip automatic sanitization
291
)
292
```
293
294
## Advanced Usage Patterns
295
296
### Custom Data Processing Pipeline
297
298
Combine utility functions for comprehensive data processing:
299
300
```python
301
import datetime

import pysolr
302
303
def process_web_content(html_content, document_id):
304
"""
305
Complete pipeline for processing web content for Solr indexing.
306
307
Parameters:
308
- html_content (str): Raw HTML content
309
- document_id (str): Unique document identifier
310
311
Returns:
312
dict: Processed document ready for indexing
313
"""
314
315
# Step 1: Convert to Unicode if needed
316
unicode_content = pysolr.force_unicode(html_content)
317
318
# Step 2: Unescape HTML entities
319
unescaped_content = pysolr.unescape_html(unicode_content)
320
321
# Step 3: Clean invalid XML characters
322
clean_content = pysolr.clean_xml_string(unescaped_content)
323
324
# Step 4: Sanitize control characters
325
safe_content = pysolr.sanitize(clean_content)
326
327
# Step 5: Create document
328
document = {
329
'id': document_id,
330
'content': safe_content,
331
'content_length': len(safe_content),
332
'processed_timestamp': pysolr.force_unicode(str(datetime.datetime.now()))
333
}
334
335
return document
336
337
# Usage example
338
raw_html = """
339
<article>
340
<h1>Café Review</h1>
341
<p>Great coffee with a rating of 5★</p>
342
\x08\x0bSome bad control characters\x1f
343
</article>
344
"""
345
346
processed_doc = process_web_content(raw_html, 'cafe_review_1')
347
print(f"Processed document: {processed_doc}")
348
349
# Index the processed document
350
solr = pysolr.Solr('http://localhost:8983/solr/my_core')
351
solr.add(processed_doc)
352
```
353
354
### Error-Safe Utility Usage
355
356
Handle edge cases and errors gracefully:
357
358
```python
359
import pysolr
360
361
def safe_process_data(data):
362
"""
363
Safely process data with error handling.
364
365
Parameters:
366
- data: Input data of unknown type/encoding
367
368
Returns:
369
str: Safely processed string or empty string on error
370
"""
371
372
try:
373
# Try to convert to Unicode
374
unicode_data = pysolr.force_unicode(data)
375
376
# Clean HTML if it looks like HTML
377
if '<' in unicode_data and '>' in unicode_data:
378
unicode_data = pysolr.unescape_html(unicode_data)
379
380
# Always clean XML and sanitize
381
clean_data = pysolr.clean_xml_string(unicode_data)
382
safe_data = pysolr.sanitize(clean_data)
383
384
return safe_data
385
386
except Exception as e:
387
print(f"Data processing error: {e}")
388
return ""
389
390
# Test with various problematic inputs
391
test_inputs = [
392
b'\xff\xfe\x00\x41', # Invalid UTF-8
393
"Valid & clean text", # HTML entities
394
"Text\x00with\x08bad\x1fchars", # Control characters
395
12345, # Non-string type
396
None, # None value
397
]
398
399
for i, test_input in enumerate(test_inputs):
400
result = safe_process_data(test_input)
401
print(f"Input {i}: {repr(test_input)} -> {repr(result)}")
402
```
403
404
### Performance Optimization
405
406
Use utility functions efficiently for large-scale processing:
407
408
```python
409
import pysolr
410
411
def bulk_sanitize_documents(documents):
412
"""
413
Efficiently sanitize a large number of documents.
414
415
Parameters:
416
- documents (list): List of document dictionaries
417
418
Returns:
419
list: List of sanitized documents
420
"""
421
422
sanitized_docs = []
423
424
for doc in documents:
425
sanitized_doc = {'id': doc['id']} # Preserve ID
426
427
for field, value in doc.items():
428
if field == 'id':
429
continue
430
431
if isinstance(value, (str, bytes)):
432
# Process string/bytes fields
433
unicode_value = pysolr.force_unicode(value)
434
clean_value = pysolr.sanitize(unicode_value)
435
sanitized_doc[field] = clean_value
436
437
elif isinstance(value, list):
438
# Process multi-valued fields
439
clean_values = []
440
for item in value:
441
if isinstance(item, (str, bytes)):
442
unicode_item = pysolr.force_unicode(item)
443
clean_item = pysolr.sanitize(unicode_item)
444
clean_values.append(clean_item)
445
else:
446
clean_values.append(item)
447
sanitized_doc[field] = clean_values
448
449
else:
450
# Preserve non-string fields as-is
451
sanitized_doc[field] = value
452
453
sanitized_docs.append(sanitized_doc)
454
455
return sanitized_docs
456
457
# Example usage with large dataset
458
large_dataset = []
459
for i in range(1000):
460
doc = {
461
'id': f'doc_{i}',
462
'title': f'Document {i} with "special" chars',
463
'content': f'Content\x08with\x1fbad\x00chars for doc {i}',
464
'tags': ['tag1', 'tag2\x0b', 'tag3'],
465
'score': i * 0.1
466
}
467
large_dataset.append(doc)
468
469
print("Sanitizing large dataset...")
470
clean_dataset = bulk_sanitize_documents(large_dataset)
471
print(f"Processed {len(clean_dataset)} documents")
472
473
# Index cleaned dataset
474
solr = pysolr.Solr('http://localhost:8983/solr/my_core')
475
solr.add(clean_dataset, commit=True)
476
```