0
# Utility Functions
1
2
Helper functions for data conversion, text processing, URL encoding, and XML sanitization used throughout the pysolr library and available for custom processing needs.
3
4
## Capabilities
5
6
### Version Information
7
8
Get the current version of the pysolr library.
9
10
```python { .api }
11
def get_version():
12
"""
13
Get the current pysolr library version.
14
15
Returns:
16
str: Version string (e.g., "3.10.0")
17
"""
18
```
19
20
Usage:
21
22
```python
23
import pysolr
24
25
version = pysolr.get_version()
26
print(f"pysolr version: {version}")
27
28
# Use in application logging or debugging
29
print(f"Using pysolr {version} to connect to Solr")
30
```
31
32
### Python Version Detection
33
34
Detect Python version for cross-platform compatibility.
35
36
```python { .api }
37
def is_py3():
38
"""
39
Check if running on Python 3.x.
40
41
Returns:
42
bool: True if Python 3.x, False if Python 2.x
43
44
Note:
45
- Used internally for handling differences between Python 2 and 3
46
- Helps with string/bytes handling and compatibility
47
"""
48
```
49
50
Usage:
51
52
```python
53
import pysolr
54
55
if pysolr.is_py3():
56
print("Running on Python 3.x")
57
# Python 3-specific logic
58
else:
59
print("Running on Python 2.x")
60
# Python 2-specific logic
61
```
62
63
### String Encoding Utilities
64
65
Convert between Unicode strings and byte strings for cross-platform compatibility.
66
67
```python { .api }
68
def force_unicode(value):
69
"""
70
Convert bytestrings to Unicode strings with error handling.
71
72
Parameters:
73
- value: Input value (bytes, str, or other type)
74
75
Returns:
76
str: Unicode string representation
77
78
Note:
79
- On Python 3: Decodes bytes to str, converts other types to str
80
- On Python 2: Decodes str to unicode, converts other types to unicode
81
- Uses UTF-8 encoding with 'replace' error handling
82
"""
83
84
def force_bytes(value):
85
"""
86
Convert Unicode strings to bytestrings for HTTP transmission.
87
88
Parameters:
89
- value: Input value (str, unicode, or other type)
90
91
Returns:
92
bytes (Python 3) or str (Python 2): Byte string representation
93
94
Note:
95
- Uses UTF-8 encoding with appropriate error handling
96
- Required for HTTP request bodies and XML processing
97
"""
98
```
99
100
Usage:
101
102
```python
103
import pysolr
104
105
# Convert various types to Unicode
106
text_bytes = b"Hello, World! \xe2\x9c\x93" # UTF-8 bytes with checkmark
107
unicode_text = pysolr.force_unicode(text_bytes)
108
print(f"Unicode: {unicode_text}") # "Hello, World! ✓"
109
110
# Convert for HTTP transmission
111
unicode_string = "Café with special chars: áéíóú"
112
byte_string = pysolr.force_bytes(unicode_string)
113
print(f"Bytes: {byte_string}")
114
115
# Handle various input types
116
number_as_unicode = pysolr.force_unicode(12345)
117
print(f"Number as Unicode: {number_as_unicode}") # "12345"
118
119
# Error handling with malformed data
120
malformed_bytes = b"\xff\xfe\x00\x41" # Invalid UTF-8
121
safe_unicode = pysolr.force_unicode(malformed_bytes)
122
print(f"Safe conversion: {safe_unicode}") # Uses replacement characters
123
```
124
125
### HTML/XML Processing
126
127
Clean and process HTML/XML content for safe indexing and display.
128
129
```python { .api }
130
def unescape_html(text):
131
"""
132
Remove HTML or XML character references and entities from text.
133
134
Parameters:
135
- text (str): HTML or XML source text containing entities
136
137
Returns:
138
str: Plain text with entities converted to Unicode characters
139
140
Note:
141
- Handles both numeric (`&#233;`, `&#xe9;`) and named (`&amp;`, `&lt;`) entities
142
- Useful for processing HTML content before indexing
143
"""
144
145
def clean_xml_string(s):
146
"""
147
Remove invalid XML characters from string.
148
149
Parameters:
150
- s (str): String to clean
151
152
Returns:
153
str: String with invalid XML characters removed
154
155
Note:
156
- Removes control characters that would cause XML parsing errors
157
- Applied automatically during document indexing
158
"""
159
```
160
161
Usage:
162
163
```python
164
import pysolr
165
166
# Clean HTML entities
167
html_content = "Price: &pound;25.99 &amp; free shipping! Rating: 5&#9733;"
clean_content = pysolr.unescape_html(html_content)
print(f"Cleaned: {clean_content}")  # "Price: £25.99 & free shipping! Rating: 5★"
170
171
# Remove invalid XML characters
172
xml_content = "Valid text\x08\x0bInvalid control chars\x1f\x00More text"
173
clean_xml = pysolr.clean_xml_string(xml_content)
174
print(f"Clean XML: {clean_xml}") # "Valid textInvalid control charsMore text"
175
176
# Process scraped web content
177
scraped_html = """
178
<div class="article">
179
<h1>Article Title</h1>
180
<p>Content with &quot;quotes&quot; and &lt;tags&gt;</p>
181
</div>
182
"""
183
readable_text = pysolr.unescape_html(scraped_html)
184
print(f"Readable: {readable_text}")
185
```
186
187
### URL Encoding
188
189
Safe URL encoding for HTTP parameters with UTF-8 support.
190
191
```python { .api }
192
def safe_urlencode(params, doseq=0):
193
"""
194
UTF-8-safe version of URL encoding.
195
196
Parameters:
197
- params (dict or list of tuples): Parameters to encode
198
- doseq (int): Handle sequence values (0=single value, 1=multiple values)
199
200
Returns:
201
str: URL-encoded parameter string
202
203
Note:
204
- Fixes UTF-8 encoding issues in Python 2.x
205
- Used internally for Solr HTTP requests
206
- Handles both single and multi-valued parameters
207
"""
208
```
209
210
Usage:
211
212
```python
213
import pysolr
214
215
# Basic parameter encoding
216
params = {
217
'q': 'title:python AND content:"machine learning"',
218
'fq': 'category:programming',
219
'rows': 20,
220
'start': 0
221
}
222
encoded = pysolr.safe_urlencode(params)
223
print(f"Encoded: {encoded}")
224
225
# Multi-valued parameters
226
multi_params = {
227
'fq': ['category:tech', 'status:published', 'date:[2024-01-01T00:00:00Z TO NOW]'],
228
'fl': ['id', 'title', 'content', 'score']
229
}
230
encoded_multi = pysolr.safe_urlencode(multi_params, doseq=1)
231
print(f"Multi-valued: {encoded_multi}")
232
233
# UTF-8 content (especially important for Python 2.x)
234
utf8_params = {
235
'q': 'title:café OR content:naïve',
236
'fq': 'author:"José García"'
237
}
238
encoded_utf8 = pysolr.safe_urlencode(utf8_params)
239
print(f"UTF-8 safe: {encoded_utf8}")
240
```
241
242
### Data Sanitization
243
244
Clean data for safe XML processing and indexing.
245
246
```python { .api }
247
def sanitize(data):
248
"""
249
Remove control characters from data for safe XML processing.
250
251
Parameters:
252
- data (str or bytes): Data to sanitize
253
254
Returns:
255
str: Sanitized Unicode string safe for XML processing
256
257
Note:
258
- Removes ASCII control characters (0x00-0x1F except tab, newline, carriage return)
259
- Applied automatically during document indexing unless disabled
260
- Essential for processing binary data or untrusted input
261
"""
262
```
263
264
Usage:
265
266
```python
267
import pysolr
268
269
# Sanitize text with control characters
270
dirty_text = "Clean text\x00\x01\x02\x08Bad control chars\x0b\x0c\x0e\x1fMore text"
271
clean_text = pysolr.sanitize(dirty_text)
272
print(f"Sanitized: {repr(clean_text)}") # Control chars removed
273
274
# Process file content
275
with open('potentially_dirty_file.txt', 'rb') as f:
276
file_content = f.read()
277
safe_content = pysolr.sanitize(file_content)
278
279
# Now safe to index
280
doc = {
281
'id': 'file_doc',
282
'content': safe_content,
283
'filename': 'potentially_dirty_file.txt'
284
}
285
286
# Disable automatic sanitization if needed
287
solr = pysolr.Solr('http://localhost:8983/solr/my_core')
288
solr.add(
289
{'id': 'raw_doc', 'content': 'Raw content'},
290
clean_ctrl_chars=False # Skip automatic sanitization
291
)
292
```
293
294
## Advanced Usage Patterns
295
296
### Custom Data Processing Pipeline
297
298
Combine utility functions for comprehensive data processing:
299
300
```python
301
import datetime

import pysolr
302
303
def process_web_content(html_content, document_id):
304
"""
305
Complete pipeline for processing web content for Solr indexing.
306
307
Parameters:
308
- html_content (str): Raw HTML content
309
- document_id (str): Unique document identifier
310
311
Returns:
312
dict: Processed document ready for indexing
313
"""
314
315
# Step 1: Convert to Unicode if needed
316
unicode_content = pysolr.force_unicode(html_content)
317
318
# Step 2: Unescape HTML entities
319
unescaped_content = pysolr.unescape_html(unicode_content)
320
321
# Step 3: Clean invalid XML characters
322
clean_content = pysolr.clean_xml_string(unescaped_content)
323
324
# Step 4: Sanitize control characters
325
safe_content = pysolr.sanitize(clean_content)
326
327
# Step 5: Create document
328
document = {
329
'id': document_id,
330
'content': safe_content,
331
'content_length': len(safe_content),
332
'processed_timestamp': pysolr.force_unicode(str(datetime.datetime.now()))
333
}
334
335
return document
336
337
# Usage example
338
raw_html = """
339
<article>
340
<h1>Café Review</h1>
341
<p>Great coffee with a rating of 5★</p>
342
\x08\x0bSome bad control characters\x1f
343
</article>
344
"""
345
346
processed_doc = process_web_content(raw_html, 'cafe_review_1')
347
print(f"Processed document: {processed_doc}")
348
349
# Index the processed document
350
solr = pysolr.Solr('http://localhost:8983/solr/my_core')
351
solr.add(processed_doc)
352
```
353
354
### Error-Safe Utility Usage
355
356
Handle edge cases and errors gracefully:
357
358
```python
359
import pysolr
360
361
def safe_process_data(data):
362
"""
363
Safely process data with error handling.
364
365
Parameters:
366
- data: Input data of unknown type/encoding
367
368
Returns:
369
str: Safely processed string or empty string on error
370
"""
371
372
try:
373
# Try to convert to Unicode
374
unicode_data = pysolr.force_unicode(data)
375
376
# Clean HTML if it looks like HTML
377
if '<' in unicode_data and '>' in unicode_data:
378
unicode_data = pysolr.unescape_html(unicode_data)
379
380
# Always clean XML and sanitize
381
clean_data = pysolr.clean_xml_string(unicode_data)
382
safe_data = pysolr.sanitize(clean_data)
383
384
return safe_data
385
386
except Exception as e:
387
print(f"Data processing error: {e}")
388
return ""
389
390
# Test with various problematic inputs
391
test_inputs = [
392
b'\xff\xfe\x00\x41', # Invalid UTF-8
393
"Valid & clean text", # HTML entities
394
"Text\x00with\x08bad\x1fchars", # Control characters
395
12345, # Non-string type
396
None, # None value
397
]
398
399
for i, test_input in enumerate(test_inputs):
400
result = safe_process_data(test_input)
401
print(f"Input {i}: {repr(test_input)} -> {repr(result)}")
402
```
403
404
### Performance Optimization
405
406
Use utility functions efficiently for large-scale processing:
407
408
```python
409
import pysolr
410
411
def bulk_sanitize_documents(documents):
412
"""
413
Efficiently sanitize a large number of documents.
414
415
Parameters:
416
- documents (list): List of document dictionaries
417
418
Returns:
419
list: List of sanitized documents
420
"""
421
422
sanitized_docs = []
423
424
for doc in documents:
425
sanitized_doc = {'id': doc['id']} # Preserve ID
426
427
for field, value in doc.items():
428
if field == 'id':
429
continue
430
431
if isinstance(value, (str, bytes)):
432
# Process string/bytes fields
433
unicode_value = pysolr.force_unicode(value)
434
clean_value = pysolr.sanitize(unicode_value)
435
sanitized_doc[field] = clean_value
436
437
elif isinstance(value, list):
438
# Process multi-valued fields
439
clean_values = []
440
for item in value:
441
if isinstance(item, (str, bytes)):
442
unicode_item = pysolr.force_unicode(item)
443
clean_item = pysolr.sanitize(unicode_item)
444
clean_values.append(clean_item)
445
else:
446
clean_values.append(item)
447
sanitized_doc[field] = clean_values
448
449
else:
450
# Preserve non-string fields as-is
451
sanitized_doc[field] = value
452
453
sanitized_docs.append(sanitized_doc)
454
455
return sanitized_docs
456
457
# Example usage with large dataset
458
large_dataset = []
459
for i in range(1000):
460
doc = {
461
'id': f'doc_{i}',
462
'title': f'Document {i} with "special" chars',
463
'content': f'Content\x08with\x1fbad\x00chars for doc {i}',
464
'tags': ['tag1', 'tag2\x0b', 'tag3'],
465
'score': i * 0.1
466
}
467
large_dataset.append(doc)
468
469
print("Sanitizing large dataset...")
470
clean_dataset = bulk_sanitize_documents(large_dataset)
471
print(f"Processed {len(clean_dataset)} documents")
472
473
# Index cleaned dataset
474
solr = pysolr.Solr('http://localhost:8983/solr/my_core')
475
solr.add(clean_dataset, commit=True)
476
```