
# Utility Functions

1

2

Helper functions for data conversion, text processing, URL encoding, and XML sanitization used throughout the pysolr library and available for custom processing needs.

3

4

## Capabilities

5

6

### Version Information

7

8

Get the current version of the pysolr library.

9

10

```python { .api }

11

def get_version():

12

"""

13

Get the current pysolr library version.

14

15

Returns:

16

str: Version string (e.g., "3.10.0")

17

"""

18

```

19

20

Usage:

21

22

```python

23

import pysolr

24

25

version = pysolr.get_version()

26

print(f"PySOLR version: {version}")

27

28

# Use in application logging or debugging

29

print(f"Using pysolr {version} to connect to Solr")

30

```

31

32

### Python Version Detection

33

34

Detect Python version for cross-platform compatibility.

35

36

```python { .api }

37

def is_py3():

38

"""

39

Check if running on Python 3.x.

40

41

Returns:

42

bool: True if Python 3.x, False if Python 2.x

43

44

Note:

45

- Used internally for handling differences between Python 2 and 3

46

- Helps with string/bytes handling and compatibility

47

"""

48

```

49

50

Usage:

51

52

```python

53

import pysolr

54

55

if pysolr.is_py3():

56

print("Running on Python 3.x")

57

# Python 3-specific logic

58

else:

59

print("Running on Python 2.x")

60

# Python 2-specific logic

61

```

62

63

### String Encoding Utilities

64

65

Convert between Unicode strings and byte strings for cross-platform compatibility.

66

67

```python { .api }

68

def force_unicode(value):

69

"""

70

Convert bytestrings to Unicode strings with error handling.

71

72

Parameters:

73

- value: Input value (bytes, str, or other type)

74

75

Returns:

76

str: Unicode string representation

77

78

Note:

79

- On Python 3: Decodes bytes to str, converts other types to str

80

- On Python 2: Decodes str to unicode, converts other types to unicode

81

- Uses UTF-8 encoding with 'replace' error handling

82

"""

83

84

def force_bytes(value):

85

"""

86

Convert Unicode strings to bytestrings for HTTP transmission.

87

88

Parameters:

89

- value: Input value (str, unicode, or other type)

90

91

Returns:

92

bytes (Python 3) or str (Python 2): Byte string representation

93

94

Note:

95

- Uses UTF-8 encoding with appropriate error handling

96

- Required for HTTP request bodies and XML processing

97

"""

98

```

99

100

Usage:

101

102

```python

103

import pysolr

104

105

# Convert various types to Unicode

106

text_bytes = b"Hello, World! \xe2\x9c\x93" # UTF-8 bytes with checkmark

107

unicode_text = pysolr.force_unicode(text_bytes)

108

print(f"Unicode: {unicode_text}") # "Hello, World! ✓"

109

110

# Convert for HTTP transmission

111

unicode_string = "Café with special chars: áéíóú"

112

byte_string = pysolr.force_bytes(unicode_string)

113

print(f"Bytes: {byte_string}")

114

115

# Handle various input types

116

number_as_unicode = pysolr.force_unicode(12345)

117

print(f"Number as Unicode: {number_as_unicode}") # "12345"

118

119

# Error handling with malformed data

120

malformed_bytes = b"\xff\xfe\x00\x41" # Invalid UTF-8

121

safe_unicode = pysolr.force_unicode(malformed_bytes)

122

print(f"Safe conversion: {safe_unicode}") # Uses replacement characters

123

```

124

125

### HTML/XML Processing

126

127

Clean and process HTML/XML content for safe indexing and display.

128

129

```python { .api }

130

def unescape_html(text):

131

"""

132

Remove HTML or XML character references and entities from text.

133

134

Parameters:

135

- text (str): HTML or XML source text containing entities

136

137

Returns:

138

str: Plain text with entities converted to Unicode characters

139

140

Note:

141

- Handles both numeric (&#123;, &#x7B;) and named (&amp;, &lt;) entities

142

- Useful for processing HTML content before indexing

143

"""

144

145

def clean_xml_string(s):

146

"""

147

Remove invalid XML characters from string.

148

149

Parameters:

150

- s (str): String to clean

151

152

Returns:

153

str: String with invalid XML characters removed

154

155

Note:

156

- Removes control characters that would cause XML parsing errors

157

- Applied automatically during document indexing

158

"""

159

```

160

161

Usage:

162

163

```python

164

import pysolr

165

166

# Clean HTML entities

167

html_content = "Price: &pound;25.99 &amp; free shipping! Rating: 5&#9733;"

168

clean_content = pysolr.unescape_html(html_content)

169

print(f"Cleaned: {clean_content}") # "Price: £25.99 & free shipping! Rating: 5★"

170

171

# Remove invalid XML characters

172

xml_content = "Valid text\x08\x0bInvalid control chars\x1f\x00More text"

173

clean_xml = pysolr.clean_xml_string(xml_content)

174

print(f"Clean XML: {clean_xml}") # "Valid textInvalid control charsMore text"

175

176

# Process scraped web content

177

scraped_html = """

178

<div class="article">

179

<h1>Article Title</h1>

180

<p>Content with &quot;quotes&quot; and &lt;tags&gt;</p>

181

</div>

182

"""

183

readable_text = pysolr.unescape_html(scraped_html)

184

print(f"Readable: {readable_text}")

185

```

186

187

### URL Encoding

188

189

Safe URL encoding for HTTP parameters with UTF-8 support.

190

191

```python { .api }

192

def safe_urlencode(params, doseq=0):

193

"""

194

UTF-8-safe version of URL encoding.

195

196

Parameters:

197

- params (dict or list of tuples): Parameters to encode

198

- doseq (int): Handle sequence values (0=single value, 1=multiple values)

199

200

Returns:

201

str: URL-encoded parameter string

202

203

Note:

204

- Fixes UTF-8 encoding issues in Python 2.x

205

- Used internally for Solr HTTP requests

206

- Handles both single and multi-valued parameters

207

"""

208

```

209

210

Usage:

211

212

```python

213

import pysolr

214

215

# Basic parameter encoding

216

params = {

217

'q': 'title:python AND content:"machine learning"',

218

'fq': 'category:programming',

219

'rows': 20,

220

'start': 0

221

}

222

encoded = pysolr.safe_urlencode(params)

223

print(f"Encoded: {encoded}")

224

225

# Multi-valued parameters

226

multi_params = {

227

'fq': ['category:tech', 'status:published', 'date:[2024-01-01T00:00:00Z TO NOW]'],

228

'fl': ['id', 'title', 'content', 'score']

229

}

230

encoded_multi = pysolr.safe_urlencode(multi_params, doseq=1)

231

print(f"Multi-valued: {encoded_multi}")

232

233

# UTF-8 content (especially important for Python 2.x)

234

utf8_params = {

235

'q': 'title:café OR content:naïve',

236

'fq': 'author:"José García"'

237

}

238

encoded_utf8 = pysolr.safe_urlencode(utf8_params)

239

print(f"UTF-8 safe: {encoded_utf8}")

240

```

241

242

### Data Sanitization

243

244

Clean data for safe XML processing and indexing.

245

246

```python { .api }

247

def sanitize(data):

248

"""

249

Remove control characters from data for safe XML processing.

250

251

Parameters:

252

- data (str or bytes): Data to sanitize

253

254

Returns:

255

str: Sanitized Unicode string safe for XML processing

256

257

Note:

258

- Removes ASCII control characters (0x00-0x1F except tab, newline, carriage return)

259

- Applied automatically during document indexing unless disabled

260

- Essential for processing binary data or untrusted input

261

"""

262

```

263

264

Usage:

265

266

```python

267

import pysolr

268

269

# Sanitize text with control characters

270

dirty_text = "Clean text\x00\x01\x02\x08Bad control chars\x0b\x0c\x0e\x1fMore text"

271

clean_text = pysolr.sanitize(dirty_text)

272

print(f"Sanitized: {repr(clean_text)}") # Control chars removed

273

274

# Process file content

275

with open('potentially_dirty_file.txt', 'rb') as f:

276

file_content = f.read()

277

safe_content = pysolr.sanitize(file_content)

278

279

# Now safe to index

280

doc = {

281

'id': 'file_doc',

282

'content': safe_content,

283

'filename': 'potentially_dirty_file.txt'

284

}

285

286

# Disable automatic sanitization if needed

287

solr = pysolr.Solr('http://localhost:8983/solr/my_core')

288

solr.add(

289

{'id': 'raw_doc', 'content': 'Raw content'},

290

clean_ctrl_chars=False # Skip automatic sanitization

291

)

292

```

293

294

## Advanced Usage Patterns

295

296

### Custom Data Processing Pipeline

297

298

Combine utility functions for comprehensive data processing:

299

300

```python

301

import pysolr

302

303

def process_web_content(html_content, document_id):

304

"""

305

Complete pipeline for processing web content for Solr indexing.

306

307

Parameters:

308

- html_content (str): Raw HTML content

309

- document_id (str): Unique document identifier

310

311

Returns:

312

dict: Processed document ready for indexing

313

"""

314

315

# Step 1: Convert to Unicode if needed

316

unicode_content = pysolr.force_unicode(html_content)

317

318

# Step 2: Unescape HTML entities

319

unescaped_content = pysolr.unescape_html(unicode_content)

320

321

# Step 3: Clean invalid XML characters

322

clean_content = pysolr.clean_xml_string(unescaped_content)

323

324

# Step 4: Sanitize control characters

325

safe_content = pysolr.sanitize(clean_content)

326

327

# Step 5: Create document

328

document = {

329

'id': document_id,

330

'content': safe_content,

331

'content_length': len(safe_content),

332

'processed_timestamp': pysolr.force_unicode(str(datetime.datetime.now()))

333

}

334

335

return document

336

337

# Usage example

338

raw_html = """

339

<article>

340

<h1>Café Review</h1>

341

<p>Great coffee with a rating of 5★</p>

342

\x08\x0bSome bad control characters\x1f

343

</article>

344

"""

345

346

processed_doc = process_web_content(raw_html, 'cafe_review_1')

347

print(f"Processed document: {processed_doc}")

348

349

# Index the processed document

350

solr = pysolr.Solr('http://localhost:8983/solr/my_core')

351

solr.add(processed_doc)

352

```

353

354

### Error-Safe Utility Usage

355

356

Handle edge cases and errors gracefully:

357

358

```python

359

import pysolr

360

361

def safe_process_data(data):

362

"""

363

Safely process data with error handling.

364

365

Parameters:

366

- data: Input data of unknown type/encoding

367

368

Returns:

369

str: Safely processed string or empty string on error

370

"""

371

372

try:

373

# Try to convert to Unicode

374

unicode_data = pysolr.force_unicode(data)

375

376

# Clean HTML if it looks like HTML

377

if '<' in unicode_data and '>' in unicode_data:

378

unicode_data = pysolr.unescape_html(unicode_data)

379

380

# Always clean XML and sanitize

381

clean_data = pysolr.clean_xml_string(unicode_data)

382

safe_data = pysolr.sanitize(clean_data)

383

384

return safe_data

385

386

except Exception as e:

387

print(f"Data processing error: {e}")

388

return ""

389

390

# Test with various problematic inputs

391

test_inputs = [

392

b'\xff\xfe\x00\x41', # Invalid UTF-8

393

"Valid &amp; clean text", # HTML entities

394

"Text\x00with\x08bad\x1fchars", # Control characters

395

12345, # Non-string type

396

None, # None value

397

]

398

399

for i, test_input in enumerate(test_inputs):

400

result = safe_process_data(test_input)

401

print(f"Input {i}: {repr(test_input)} -> {repr(result)}")

402

```

403

404

### Performance Optimization

405

406

Use utility functions efficiently for large-scale processing:

407

408

```python

409

import pysolr

410

411

def bulk_sanitize_documents(documents):

412

"""

413

Efficiently sanitize a large number of documents.

414

415

Parameters:

416

- documents (list): List of document dictionaries

417

418

Returns:

419

list: List of sanitized documents

420

"""

421

422

sanitized_docs = []

423

424

for doc in documents:

425

sanitized_doc = {'id': doc['id']} # Preserve ID

426

427

for field, value in doc.items():

428

if field == 'id':

429

continue

430

431

if isinstance(value, (str, bytes)):

432

# Process string/bytes fields

433

unicode_value = pysolr.force_unicode(value)

434

clean_value = pysolr.sanitize(unicode_value)

435

sanitized_doc[field] = clean_value

436

437

elif isinstance(value, list):

438

# Process multi-valued fields

439

clean_values = []

440

for item in value:

441

if isinstance(item, (str, bytes)):

442

unicode_item = pysolr.force_unicode(item)

443

clean_item = pysolr.sanitize(unicode_item)

444

clean_values.append(clean_item)

445

else:

446

clean_values.append(item)

447

sanitized_doc[field] = clean_values

448

449

else:

450

# Preserve non-string fields as-is

451

sanitized_doc[field] = value

452

453

sanitized_docs.append(sanitized_doc)

454

455

return sanitized_docs

456

457

# Example usage with large dataset

458

large_dataset = []

459

for i in range(1000):

460

doc = {

461

'id': f'doc_{i}',

462

'title': f'Document {i} with "special" chars',

463

'content': f'Content\x08with\x1fbad\x00chars for doc {i}',

464

'tags': ['tag1', 'tag2\x0b', 'tag3'],

465

'score': i * 0.1

466

}

467

large_dataset.append(doc)

468

469

print("Sanitizing large dataset...")

470

clean_dataset = bulk_sanitize_documents(large_dataset)

471

print(f"Processed {len(clean_dataset)} documents")

472

473

# Index cleaned dataset

474

solr = pysolr.Solr('http://localhost:8983/solr/my_core')

475

solr.add(clean_dataset, commit=True)

476

```