or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

admin-operations.md core-client.md document-processing.md index.md search-operations.md solrcloud-support.md utilities.md

docs/document-processing.md

0

# Document Processing

1

2

Advanced document handling including content extraction with Apache Tika, nested document support, field update operations, and XML/JSON processing utilities for rich document indexing.

3

4

## Capabilities

5

6

### Content Extraction

7

8

Extract text content and metadata from files using Solr's ExtractingRequestHandler (Tika integration).

9

10

```python { .api }

11

def extract(self, file_obj, extractOnly=True, handler="update/extract", **kwargs):

12

"""

13

Extract content and metadata from files using Apache Tika.

14

15

Parameters:

16

- file_obj: File-like object with a 'name' attribute (e.g., result of open())

17

- extractOnly (bool): Extract without indexing (default: True)

18

- handler (str): Extraction handler path (default: "update/extract")

19

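- **kwargs: Additional Tika/extraction parameters, sent to Solr verbatim. Names containing a dot are not valid Python keywords, so pass them via dict unpacking, e.g. **{'literal.id': 'doc_1'}: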

20

- literal.id (str): Document ID for extracted content

21

- fmap.content (str): Map extracted content to field name

22

- uprefix (str): Prefix for unknown fields

23

- defaultField (str): Default field for unmapped content

24

- xpath (str): XPath expression for content selection

25

- captureAttr (bool): Capture HTML attributes

26

- lowernames (bool): Convert field names to lowercase

27

28

Returns:

29

dict or None: Dictionary with 'contents' and 'metadata' keys, or None if no metadata could be extracted from the response

30

31

Raises:

32

ValueError: If file_obj doesn't have a name attribute

33

SolrError: If extraction fails or handler is not configured

34

"""

35

```

36

37

Usage:

38

39

```python

40

import pysolr

41

42

solr = pysolr.Solr('http://localhost:8983/solr/my_core')

43

44

# Extract content from a PDF file

45

with open('document.pdf', 'rb') as pdf_file:

46

extracted = solr.extract(pdf_file)

47

48

if extracted:

49

print("Extracted content:")

50

print(extracted['contents'])

51

52

print("Metadata:")

53

for key, value in extracted['metadata'].items():

54

print(f" {key}: {value}")

55

else:

56

print("No content could be extracted")

57

58

# Extract and index simultaneously

59

with open('document.docx', 'rb') as docx_file:

60

# This will extract and immediately index the document

61

solr.extract(

62

docx_file,

63

extractOnly=False,

64

**{'literal.id': 'doc_123',

65

'literal.title': 'Important Document',

66

'fmap.content': 'text_content'}

67

)

68

69

# Extract with custom field mapping

70

with open('presentation.pptx', 'rb') as pptx_file:

71

extracted = solr.extract(

72

pptx_file,

73

uprefix='extracted_',

74

defaultField='content',

75

captureAttr=True,

76

lowernames=True

77

)

78

```

79

80

### Nested Document Support

81

82

Handle parent-child document relationships for hierarchical data structures.

83

84

```python { .api }

85

# Nested document key constant

86

NESTED_DOC_KEY = "_childDocuments_"

87

```

88

89

Usage:

90

91

```python

92

import pysolr

93

94

solr = pysolr.Solr('http://localhost:8983/solr/my_core')

95

96

# Index document with nested children

97

parent_doc = {

98

"id": "blog_post_1",

99

"title": "Introduction to Machine Learning",

100

"author": "Jane Smith",

101

"category": "technology",

102

pysolr.NESTED_DOC_KEY: [

103

{

104

"id": "comment_1",

105

"type": "comment",

106

"author": "John Doe",

107

"text": "Great article! Very informative."

108

},

109

{

110

"id": "comment_2",

111

"type": "comment",

112

"author": "Alice Brown",

113

"text": "Thanks for sharing this."

114

}

115

]

116

}

117

118

solr.add(parent_doc)

119

120

# Alternative syntax using _doc key

121

parent_doc_alt = {

122

"id": "article_1",

123

"title": "Python Best Practices",

124

"_doc": [

125

{"id": "section_1", "title": "Code Style", "content": "Follow PEP 8..."},

126

{"id": "section_2", "title": "Testing", "content": "Write comprehensive tests..."}

127

]

128

}

129

130

solr.add(parent_doc_alt)

131

132

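# Search nested documents (the block-join "which" filter must match every parent document, so this assumes parents are indexed with a type:parent field)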

133

results = solr.search('{!parent which="type:parent"}text:"Great article"')

134

for doc in results:

135

print(f"Parent document: {doc['title']}")

136

```

137

138

### Field Update Operations

139

140

Perform atomic updates on specific document fields without reindexing entire documents.

141

142

Usage:

143

144

```python

145

import pysolr

146

147

solr = pysolr.Solr('http://localhost:8983/solr/my_core')

148

149

# Atomic field updates - set new value

150

solr.add(

151

{"id": "doc_1", "status": "published"},

152

fieldUpdates={"status": "set"}

153

)

154

155

# Add value to multi-valued field

156

solr.add(

157

{"id": "doc_1", "tags": "python"},

158

fieldUpdates={"tags": "add"}

159

)

160

161

# Increment numeric field

162

solr.add(

163

{"id": "doc_1", "view_count": 1},

164

fieldUpdates={"view_count": "inc"}

165

)

166

167

# Remove specific value from multi-valued field

168

solr.add(

169

{"id": "doc_1", "tags": "outdated"},

170

fieldUpdates={"tags": "remove"}

171

)

172

173

# Multiple field operations

174

solr.add(

175

{

176

"id": "doc_1",

177

"last_modified": "2024-01-15T10:30:00Z",

178

"tags": "updated",

179

"version": 1

180

},

181

fieldUpdates={

182

"last_modified": "set",

183

"tags": "add",

184

"version": "inc"

185

}

186

)

187

```

188

189

### Document Boost Support

190

191

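Apply scoring boosts to documents and fields during indexing to influence search relevance. Note that index-time boosts were removed in Solr 7.0 (boosts supplied at index time are ignored), so this applies only to older Solr servers; use query-time boosting on newer versions.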

192

193

Usage:

194

195

```python

196

import pysolr

197

198

solr = pysolr.Solr('http://localhost:8983/solr/my_core')

199

200

# Document-level boost

201

doc_with_boost = {

202

"id": "important_doc",

203

"title": "Critical Information",

204

"content": "This document contains vital information",

205

"boost": 2.0 # Document boost factor

206

}

207

208

solr.add(doc_with_boost)

209

210

# Field-level boosts

211

docs = [

212

{

213

"id": "doc_1",

214

"title": "Python Tutorial",

215

"content": "Learn Python programming"

216

},

217

{

218

"id": "doc_2",

219

"title": "Advanced Python",

220

"content": "Master advanced Python concepts"

221

}

222

]

223

224

# Boost title field more than content field

225

field_boosts = {

226

"title": 3.0,

227

"content": 1.0

228

}

229

230

solr.add(docs, boost=field_boosts)

231

```

232

233

### Batch Processing

234

235

Efficiently process large numbers of documents with optimized batch operations.

236

237

Usage:

238

239

```python

240

import pysolr

241

242

solr = pysolr.Solr('http://localhost:8983/solr/my_core')

243

244

# Process large document batch

245

def process_large_dataset(documents, batch_size=1000):

246

"""Process documents in batches for optimal performance."""

247

248

for i in range(0, len(documents), batch_size):

249

batch = documents[i:i + batch_size]

250

251

try:

252

# Add batch without immediate commit

253

solr.add(batch, commit=False)

254

print(f"Processed batch {i//batch_size + 1}: {len(batch)} documents")

255

256

except pysolr.SolrError as e:

257

print(f"Batch {i//batch_size + 1} failed: {e}")

258

# Handle individual documents in failed batch

259

for doc in batch:

260

try:

261

solr.add(doc, commit=False)

262

except pysolr.SolrError:

263

print(f"Failed to index document: {doc.get('id', 'unknown')}")

264

265

# Commit all changes at once

266

solr.commit()

267

print("All batches committed")

268

269

# Example usage

270

large_dataset = []

271

for i in range(10000):

272

doc = {

273

"id": f"doc_{i}",

274

"title": f"Document {i}",

275

"content": f"Content for document number {i}",

276

"timestamp": "2024-01-15T10:30:00Z"

277

}

278

large_dataset.append(doc)

279

280

process_large_dataset(large_dataset)

281

```

282

283

### Advanced Document Structures

284

285

Handle complex document structures with dynamic fields, copy fields, and custom data types.

286

287

```python

288

import pysolr

289

import datetime

290

291

solr = pysolr.Solr('http://localhost:8983/solr/my_core')

292

293

# Complex document with various data types

294

complex_doc = {

295

# Basic fields

296

"id": "complex_doc_1",

297

"title": "Advanced Document Structure",

298

"content": "This document demonstrates complex field types",

299

300

# Date fields (automatically converted)

301

"created_date": datetime.datetime.now(),

302

"published_date": datetime.date.today(),

303

304

# Multi-valued fields

305

"tags": ["python", "solr", "search", "indexing"],

306

"authors": ["Alice Smith", "Bob Johnson"],

307

308

# Numeric fields

309

"price": 29.99,

310

"quantity": 100,

311

"rating": 4.5,

312

313

# Boolean fields

314

"is_featured": True,

315

"is_available": False,

316

317

# Dynamic fields (assuming *_s, *_i, *_dt patterns in schema)

318

"custom_string_s": "Custom string value",

319

"custom_int_i": 42,

320

"custom_date_dt": "2024-01-15T10:30:00Z",

321

322

# Location field (if geo-spatial search is configured)

323

"location": "37.7749,-122.4194", # San Francisco coordinates

324

325

# JSON field (if JSON field type is configured)

326

"metadata": {

327

"source": "api",

328

"version": "1.0",

329

"settings": {

330

"debug": True,

331

"timeout": 30

332

}

333

}

334

}

335

336

# Index complex document

337

solr.add(complex_doc)

338

339

# Search using various field types

340

results = solr.search('tags:python AND rating:[4.0 TO *]')

341

date_results = solr.search('created_date:[2024-01-01T00:00:00Z TO NOW]')

342

geo_results = solr.search('{!geofilt pt=37.7749,-122.4194 sfield=location d=10}')

343

```

344

345

## Data Type Conversion

346

347

pysolr automatically handles data type conversion between Python and Solr formats:

348

349

```python

350

import pysolr

351

import datetime

352

353

# Python -> Solr conversion examples

354

conversion_examples = {

355

# Dates and times

356

"datetime_field": datetime.datetime(2024, 1, 15, 10, 30, 0), # -> "2024-01-15T10:30:00Z"

357

"date_field": datetime.date(2024, 1, 15), # -> "2024-01-15T00:00:00Z"

358

359

# Boolean values

360

"is_active": True, # -> "true"

361

"is_deleted": False, # -> "false"

362

363

# Numeric values (preserved)

364

"count": 42,

365

"price": 29.99,

366

367

# Strings (UTF-8 encoded and XML-safe)

368

"description": "Text with special chars: <>&\"'",

369

370

# Lists and tuples (multi-valued fields)

371

"categories": ["tech", "programming", "python"],

372

"coordinates": (37.7749, -122.4194),

373

374

# None values (excluded from indexing)

375

"optional_field": None, # This field will not be included

376

}

377

378

solr = pysolr.Solr('http://localhost:8983/solr/my_core')

379

doc = {"id": "conversion_example"}

380

doc.update(conversion_examples)

381

solr.add(doc)

382

```