# Search Operations
Search operations let you discover items in the Internet Archive by query, with control over which metadata fields are returned, how results are sorted, and whether full-text search is used.
3
4
## Capabilities
5
6
### Basic Search
Search for items using Archive.org's query syntax. Results are returned as an iterable `Search` object.
9
10
```python { .api }
11
def search_items(query, fields=None, sorts=None, params=None, full_text_search=False, dsl_fts=False, archive_session=None, config=None, config_file=None, http_adapter_kwargs=None, request_kwargs=None, max_retries=None):
12
"""
13
Search for items on Archive.org with advanced filtering options.
14
15
Args:
16
query (str): Search query using Archive.org syntax:
17
- Basic: 'collection:nasa'
18
- Field search: 'creator:"Neil Armstrong"'
19
- Boolean: 'collection:nasa AND mediatype:movies'
20
- Date ranges: 'date:[1969-01-01 TO 1969-12-31]'
21
- Wildcards: 'title:apollo*'
22
fields (list, optional): Metadata fields to return in results:
23
- Common: ['identifier', 'title', 'creator', 'date', 'description']
24
- If None, only 'identifier' is returned for each result
25
sorts (list, optional): Sort criteria:
26
- ['downloads desc'] - Most downloaded first
27
- ['date desc'] - Newest first
28
- ['titleSorter asc'] - Alphabetical by title
29
- ['reviewdate desc', 'identifier asc'] - Multiple sorts
30
params (dict, optional): Additional URL parameters:
31
- 'rows': int, results per page (default: 25, max: 10000)
32
- 'page': int, page number (1-based)
33
- 'cursor': str, cursor for pagination
34
- 'save': bool, save search for future use
35
full_text_search (bool): Enable full-text search across item content
36
dsl_fts (bool): Enable DSL-based full-text search for advanced queries
37
archive_session (ArchiveSession, optional): Existing session to use
38
config (dict, optional): Configuration for new session
39
config_file (str, optional): Config file for new session
40
http_adapter_kwargs (dict, optional): HTTP adapter arguments
41
request_kwargs (dict, optional): Additional request arguments
42
max_retries (int, optional): Maximum retry attempts for failed requests
43
44
Returns:
45
Search: Search object for iterating over results
46
47
Raises:
48
ValueError: If query is invalid
49
requests.RequestException: If search request fails
50
"""
51
52
class Search:
53
"""
54
Represents a search query and provides access to results.
55
"""
56
57
def __init__(self, archive_session, query, fields=None, sorts=None, params=None, full_text_search=None, dsl_fts=None, request_kwargs=None, max_retries=None):
58
"""
59
Initialize Search object.
60
61
Args:
62
archive_session (ArchiveSession): Session object
63
query (str): Search query string
64
fields (list, optional): Fields to return
65
sorts (list, optional): Sort criteria
66
params (dict, optional): URL parameters
67
full_text_search (bool, optional): Enable full-text search
68
dsl_fts (bool, optional): Enable DSL full-text search
69
request_kwargs (dict, optional): Request arguments
70
max_retries (int, optional): Maximum retries
71
"""
72
```
73
74
### Search Properties
75
76
Access search configuration and result information.
77
78
```python { .api }
79
class Search:
80
@property
81
def session(self):
82
"""ArchiveSession: Session object used for this search."""
83
84
@property
85
def query(self):
86
"""str: Search query string."""
87
88
@property
89
def fields(self):
90
"""list: Metadata fields being returned."""
91
92
@property
93
def sorts(self):
94
"""list: Sort criteria applied to results."""
95
96
@property
97
def params(self):
98
"""dict: URL parameters for the search."""
99
100
@property
101
def fts(self):
102
"""bool: Whether full-text search is enabled."""
103
104
@property
105
def dsl_fts(self):
106
"""bool: Whether DSL full-text search is enabled."""
107
108
@property
109
def num_found(self):
110
"""int: Total number of results found (not just returned)."""
111
```
112
113
### Result Iteration
114
115
Iterate over search results in different formats.
116
117
```python { .api }
118
class Search:
119
def __iter__(self):
120
"""
121
Iterate over search results as dictionaries.
122
123
Yields:
124
dict: Result dictionaries with requested fields
125
"""
126
127
def iter_as_results(self):
128
"""
129
Explicitly iterate over search results as dictionaries.
130
131
Yields:
132
dict: Result dictionaries with metadata fields
133
"""
134
135
def iter_as_items(self):
136
"""
137
Iterate over search results as Item objects.
138
139
Yields:
140
Item: Item objects for each search result
141
142
Note:
143
Creates Item objects which may trigger additional API calls
144
for metadata. Use iter_as_results() for better performance
145
when you only need the search result fields.
146
"""
147
```
148
149
## Search Query Syntax
150
151
### Basic Query Examples
152
153
```python
154
import internetarchive
155
156
# Search by collection
157
search = internetarchive.search_items('collection:nasa')
158
159
# Search by media type
160
search = internetarchive.search_items('mediatype:movies')
161
162
# Search by creator
163
search = internetarchive.search_items('creator:"Internet Archive"')
164
165
# Search by title with wildcards
166
search = internetarchive.search_items('title:apollo*')
167
```
168
169
### Advanced Query Examples
170
171
```python
172
import internetarchive
173
174
# Boolean queries
175
search = internetarchive.search_items(
176
'collection:nasa AND mediatype:movies AND date:[1969-01-01 TO 1969-12-31]'
177
)
178
179
# Multiple collections
180
search = internetarchive.search_items('collection:(nasa OR loc)')
181
182
# Exclude results
183
search = internetarchive.search_items('collection:nasa NOT mediatype:data')
184
185
# Full-text search
186
search = internetarchive.search_items(
187
'moon landing',
188
full_text_search=True
189
)
190
```
191
192
### Field Selection and Sorting
193
194
```python
195
import internetarchive
196
197
# Select specific fields
198
search = internetarchive.search_items(
199
'collection:nasa',
200
fields=['identifier', 'title', 'creator', 'date', 'downloads']
201
)
202
203
# Sort by popularity
204
search = internetarchive.search_items(
205
'collection:movies',
206
sorts=['downloads desc', 'reviewdate desc']
207
)
208
209
# Sort alphabetically
210
search = internetarchive.search_items(
211
'collection:books',
212
sorts=['titleSorter asc']
213
)
214
```
215
216
### Pagination and Performance
217
218
```python
219
import internetarchive
220
221
# Large result sets
222
search = internetarchive.search_items(
223
'collection:opensource',
224
params={'rows': 1000} # Get up to 1000 results per page
225
)
226
227
# Specific page
228
search = internetarchive.search_items(
229
'collection:nasa',
230
params={'page': 5, 'rows': 50}
231
)
232
233
# Using cursor for efficient pagination
234
search = internetarchive.search_items(
235
'collection:books',
236
params={'cursor': 'next_cursor_value'}
237
)
238
```
239
240
## Usage Examples
241
242
### Basic Search and Iteration
243
244
```python
245
import internetarchive
246
247
# Search for NASA collection items
248
search = internetarchive.search_items('collection:nasa')
249
250
print(f"Found {search.num_found} total results")
251
252
# Iterate over the search results
253
for result in search:
254
print(f"ID: {result['identifier']}")
255
if 'title' in result:
256
print(f"Title: {result['title']}")
257
print(f"Downloads: {result.get('downloads', 'N/A')}")
258
print("---")
259
```
260
261
### Working with Item Objects
262
263
```python
264
import internetarchive
265
266
# Search and get Item objects
267
search = internetarchive.search_items(
268
'collection:nasa AND mediatype:movies',
269
fields=['identifier', 'title', 'creator']
270
)
271
272
# Convert results to Item objects for full functionality
273
for item in search.iter_as_items():
274
print(f"Processing item: {item.identifier}")
275
276
# Access full metadata (triggers API call)
277
print(f"Full title: {item.metadata.get('title')}")
278
print(f"File count: {item.files_count}")
279
280
# Download first PDF file if available
281
for file in item.get_files(formats=['pdf']):
282
file.download()
283
break
284
```
285
286
### Advanced Search with Session
287
288
```python
289
import internetarchive
290
291
# Create session for multiple searches
292
session = internetarchive.get_session()
293
294
# Search with session for better performance
295
search1 = session.search_items(
296
'collection:movies AND year:2020',
297
fields=['identifier', 'title', 'year'],
298
sorts=['downloads desc']
299
)
300
301
search2 = session.search_items(
302
'creator:"Internet Archive" AND mediatype:texts',
303
fields=['identifier', 'title', 'creator', 'date']
304
)
305
306
# Process multiple searches
307
for search in [search1, search2]:
308
print(f"Query: {search.query}")
309
print(f"Results: {search.num_found}")
310
311
# Get top 10 results
312
count = 0
313
for result in search:
314
print(f" {result['identifier']}: {result.get('title', 'No title')}")
315
count += 1
316
if count >= 10:
317
break
318
print()
319
```
320
321
### Full-Text Search
322
323
```python
324
import internetarchive
325
326
# Search within document content
327
search = internetarchive.search_items(
328
'artificial intelligence machine learning',
329
full_text_search=True,
330
fields=['identifier', 'title', 'description']
331
)
332
333
print(f"Full-text search found {search.num_found} documents")
334
335
for result in search:
336
print(f"Document: {result['identifier']}")
337
print(f"Title: {result.get('title', 'No title')}")
338
if 'description' in result:
339
print(f"Description: {result['description'][:200]}...")
340
print("---")
341
```
342
343
### Specialized Collection Searches
344
345
```python
346
import internetarchive
347
348
# Search specific collections with targeted fields
349
collections_queries = {
350
'software': {
351
'query': 'collection:softwarelibrary',
352
'fields': ['identifier', 'title', 'creator', 'emulator']
353
},
354
'books': {
355
'query': 'collection:books AND language:eng',
356
'fields': ['identifier', 'title', 'creator', 'publisher', 'date']
357
},
358
'audio': {
359
'query': 'collection:etree AND year:2023',
360
'fields': ['identifier', 'title', 'creator', 'date', 'venue']
361
}
362
}
363
364
for collection_name, config in collections_queries.items():
365
search = internetarchive.search_items(
366
config['query'],
367
fields=config['fields'],
368
sorts=['downloads desc']
369
)
370
371
print(f"{collection_name.upper()} Collection ({search.num_found} items):")
372
373
count = 0
374
for result in search:
375
print(f" {result['identifier']}: {result.get('title', 'No title')}")
376
count += 1
377
if count >= 5: # Show top 5
378
break
379
print()
380
```