0
# GridFS File Storage
1
2
GridFS support for storing and retrieving large files, including streaming operations and metadata management.
3
4
## Capabilities
5
6
### GridFS Interface
7
8
Legacy GridFS interface for file storage operations.
9
10
```python { .api }
11
class GridFS:
12
def __init__(self, database, collection="fs", disable_md5=False):
13
"""
14
GridFS instance for file operations.
15
16
Parameters:
17
- database: Database instance
18
- collection: GridFS collection prefix (default "fs")
19
- disable_md5: disable MD5 checksum calculation
20
"""
21
22
def new_file(self, **kwargs):
23
"""
24
Create new GridFS file for writing.
25
26
Parameters:
27
- _id: file identifier
28
- filename: file name
29
- contentType: MIME content type
30
- chunkSize: chunk size in bytes
31
- metadata: custom metadata dictionary
32
33
Returns:
34
GridIn: File handle for writing
35
"""
36
37
def put(self, data, **kwargs):
38
"""
39
Store data as GridFS file.
40
41
Parameters:
42
- data: file data (bytes or file-like object)
43
- kwargs: same as new_file()
44
45
Returns:
46
ObjectId: File identifier
47
"""
48
49
def get(self, file_id, session=None):
50
"""
51
Retrieve file by ID.
52
53
Parameters:
54
- file_id: file identifier
55
- session: optional ClientSession
56
57
Returns:
58
GridOut: File handle for reading
59
60
Raises:
61
NoFile: if file not found
62
"""
63
64
def get_version(self, filename=None, version=-1, session=None, **kwargs):
65
"""
66
Retrieve file by filename and version.
67
68
Parameters:
69
- filename: file name
70
- version: version number (-1 for latest)
71
- session: optional ClientSession
72
73
Returns:
74
GridOut: File handle for reading
75
76
Raises:
77
NoFile: if file not found
78
"""
79
80
def get_last_version(self, filename=None, session=None, **kwargs):
81
"""
82
Retrieve latest version of file by filename.
83
84
Parameters:
85
- filename: file name
86
- session: optional ClientSession
87
88
Returns:
89
GridOut: File handle for reading
90
91
Raises:
92
NoFile: if file not found
93
"""
94
95
def delete(self, file_id, session=None):
96
"""
97
Delete file by ID.
98
99
Parameters:
100
- file_id: file identifier
101
- session: optional ClientSession
102
103
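Note:
104
Deleting a non-existent file is treated as success; unlike GridFSBucket.delete, the legacy GridFS.delete does not raise NoFile.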
105
"""
106
107
def list(self, session=None):
108
"""
109
List stored filenames.
110
111
Parameters:
112
- session: optional ClientSession
113
114
Returns:
115
list: List of filenames
116
"""
117
118
def find_one(self, filter=None, session=None, *args, **kwargs):
119
"""
120
Find single file by filter.
121
122
Parameters:
123
- filter: query criteria
124
- session: optional ClientSession
125
126
Returns:
127
GridOut: File handle or None
128
"""
129
130
def find(self, *args, **kwargs):
131
"""
132
Find files matching criteria.
133
134
Parameters:
135
- filter: query criteria
136
- skip: number of files to skip
137
- limit: maximum number of files
138
- sort: sort specification
139
- session: optional ClientSession
140
141
Returns:
142
GridOutCursor: Cursor for files
143
"""
144
145
def exists(self, document_or_id=None, session=None, **kwargs):
146
"""
147
Check if file exists.
148
149
Parameters:
150
- document_or_id: file ID or query document
151
- session: optional ClientSession
152
153
Returns:
154
bool: True if file exists
155
"""
156
```
157
158
### GridFSBucket Interface
159
160
Modern GridFS interface with streaming support (recommended).
161
162
```python { .api }
163
class GridFSBucket:
164
def __init__(
165
self,
166
db,
167
bucket_name="fs",
168
chunk_size_bytes=DEFAULT_CHUNK_SIZE,
169
write_concern=None,
170
read_preference=None,
171
disable_md5=False
172
):
173
"""
174
GridFS bucket for file operations.
175
176
Parameters:
177
- db: Database instance
178
- bucket_name: bucket name (default "fs")
179
- chunk_size_bytes: default chunk size
180
- write_concern: write concern for operations
181
- read_preference: read preference for operations
182
- disable_md5: disable MD5 checksum calculation
183
"""
184
185
def open_upload_stream(
186
self,
187
filename,
188
chunk_size_bytes=None,
189
metadata=None,
190
session=None
191
):
192
"""
193
Open upload stream for writing file.
194
195
Parameters:
196
- filename: file name
197
- chunk_size_bytes: chunk size override
198
- metadata: custom metadata dictionary
199
- session: optional ClientSession
200
201
Returns:
202
GridIn: Upload stream
203
"""
204
205
def open_upload_stream_with_id(
206
self,
207
file_id,
208
filename,
209
chunk_size_bytes=None,
210
metadata=None,
211
session=None
212
):
213
"""
214
Open upload stream with specific file ID.
215
216
Parameters:
217
- file_id: file identifier
218
- filename: file name
219
- chunk_size_bytes: chunk size override
220
- metadata: custom metadata dictionary
221
- session: optional ClientSession
222
223
Returns:
224
GridIn: Upload stream
225
"""
226
227
def upload_from_stream(
228
self,
229
filename,
230
source,
231
chunk_size_bytes=None,
232
metadata=None,
233
session=None
234
):
235
"""
236
Upload file from stream.
237
238
Parameters:
239
- filename: file name
240
- source: readable file-like object
241
- chunk_size_bytes: chunk size override
242
- metadata: custom metadata dictionary
243
- session: optional ClientSession
244
245
Returns:
246
ObjectId: File identifier
247
"""
248
249
def upload_from_stream_with_id(
250
self,
251
file_id,
252
filename,
253
source,
254
chunk_size_bytes=None,
255
metadata=None,
256
session=None
257
):
258
"""
259
Upload file from stream with specific ID.
260
261
Parameters:
262
- file_id: file identifier
263
- filename: file name
264
- source: readable file-like object
265
- chunk_size_bytes: chunk size override
266
- metadata: custom metadata dictionary
267
- session: optional ClientSession
268
"""
269
270
def open_download_stream(self, file_id, session=None):
271
"""
272
Open download stream by file ID.
273
274
Parameters:
275
- file_id: file identifier
276
- session: optional ClientSession
277
278
Returns:
279
GridOut: Download stream
280
281
Raises:
282
NoFile: if file not found
283
"""
284
285
def download_to_stream(self, file_id, destination, session=None):
286
"""
287
Download file to stream by ID.
288
289
Parameters:
290
- file_id: file identifier
291
- destination: writable file-like object
292
- session: optional ClientSession
293
294
Raises:
295
NoFile: if file not found
296
"""
297
298
def delete(self, file_id, session=None):
299
"""
300
Delete file by ID.
301
302
Parameters:
303
- file_id: file identifier
304
- session: optional ClientSession
305
306
Raises:
307
NoFile: if file not found
308
"""
309
310
def find(self, filter=None, session=None, **kwargs):
311
"""
312
Find files matching criteria.
313
314
Parameters:
315
- filter: query criteria for files collection
316
- batch_size: cursor batch size
317
- limit: maximum number of files
318
- skip: number of files to skip
319
- sort: sort specification
320
- session: optional ClientSession
321
322
Returns:
323
GridOutCursor: Cursor for files
324
"""
325
326
def open_download_stream_by_name(
327
self,
328
filename,
329
revision=-1,
330
session=None
331
):
332
"""
333
Open download stream by filename.
334
335
Parameters:
336
- filename: file name
337
- revision: file revision (-1 for latest)
338
- session: optional ClientSession
339
340
Returns:
341
GridOut: Download stream
342
343
Raises:
344
NoFile: if file not found
345
"""
346
347
def download_to_stream_by_name(
348
self,
349
filename,
350
destination,
351
revision=-1,
352
session=None
353
):
354
"""
355
Download file to stream by name.
356
357
Parameters:
358
- filename: file name
359
- destination: writable file-like object
360
- revision: file revision (-1 for latest)
361
- session: optional ClientSession
362
363
Raises:
364
NoFile: if file not found
365
"""
366
367
def rename(self, file_id, new_filename, session=None):
368
"""
369
Rename file.
370
371
Parameters:
372
- file_id: file identifier
373
- new_filename: new file name
374
- session: optional ClientSession
375
376
Raises:
377
NoFile: if file not found
378
"""
379
```
380
381
### GridFS File Objects
382
383
File objects for reading and writing GridFS files.
384
385
```python { .api }
386
class GridIn:
387
def __init__(self, root_collection, session=None, disable_md5=False, **kwargs):
388
"""
389
GridFS file for writing.
390
391
Parameters:
392
- root_collection: GridFS root collection
393
- session: optional ClientSession
394
- disable_md5: disable MD5 calculation
395
- kwargs: file metadata
396
"""
397
398
def write(self, data):
399
"""
400
Write data to file.
401
402
Parameters:
403
- data: bytes to write
404
"""
405
406
def writelines(self, lines):
407
"""
408
Write sequence of bytes.
409
410
Parameters:
411
- lines: sequence of bytes
412
"""
413
414
def close(self):
415
"""Close file and finalize upload."""
416
417
def abort(self):
418
"""Abort upload and delete partial file."""
419
420
@property
421
def closed(self):
422
"""
423
Check if file is closed.
424
425
Returns:
426
bool: True if closed
427
"""
428
429
@property
430
def _id(self):
431
"""
432
File identifier.
433
434
Returns:
435
ObjectId: File ID
436
"""
437
438
@property
439
def filename(self):
440
"""
441
File name.
442
443
Returns:
444
str: File name
445
"""
446
447
@property
448
def length(self):
449
"""
450
File size in bytes.
451
452
Returns:
453
int: File size
454
"""
455
456
@property
457
def chunk_size(self):
458
"""
459
Chunk size in bytes.
460
461
Returns:
462
int: Chunk size
463
"""
464
465
@property
466
def upload_date(self):
467
"""
468
Upload completion timestamp.
469
470
Returns:
471
datetime: Upload date
472
"""
473
474
@property
475
def md5(self):
476
"""
477
MD5 checksum (if enabled).
478
479
Returns:
480
str: MD5 hash or None
481
"""
482
483
@property
484
def metadata(self):
485
"""
486
Custom metadata.
487
488
Returns:
489
dict: Metadata dictionary
490
"""
491
492
class GridOut:
493
def __init__(self, root_collection, file_id=None, file_document=None, session=None):
494
"""
495
GridFS file for reading.
496
497
Parameters:
498
- root_collection: GridFS root collection
499
- file_id: file identifier
500
- file_document: file document
501
- session: optional ClientSession
502
"""
503
504
def read(self, size=-1):
505
"""
506
Read data from file.
507
508
Parameters:
509
- size: bytes to read (-1 for all)
510
511
Returns:
512
bytes: File data
513
"""
514
515
def readline(self, size=-1):
516
"""
517
Read line from file.
518
519
Parameters:
520
- size: maximum bytes to read
521
522
Returns:
523
bytes: Line data
524
"""
525
526
def readlines(self):
527
"""
528
Read all lines from file.
529
530
Returns:
531
list: List of lines as bytes
532
"""
533
534
def seek(self, pos, whence=0):
535
"""
536
Seek to file position.
537
538
Parameters:
539
- pos: position
540
- whence: seek mode (0=absolute, 1=relative, 2=from end)
541
"""
542
543
def tell(self):
544
"""
545
Get current file position.
546
547
Returns:
548
int: Current position
549
"""
550
551
def close(self):
552
"""Close file."""
553
554
def __iter__(self):
555
"""Iterate over file lines."""
556
557
def __enter__(self):
558
"""Context manager entry."""
559
560
def __exit__(self, exc_type, exc_val, exc_tb):
561
"""Context manager exit."""
562
563
# Same properties as GridIn
564
@property
565
def _id(self): ...
566
@property
567
def filename(self): ...
568
@property
569
def length(self): ...
570
@property
571
def chunk_size(self): ...
572
@property
573
def upload_date(self): ...
574
@property
575
def md5(self): ...
576
@property
577
def metadata(self): ...
578
579
class GridOutCursor:
580
def __init__(self, collection, filter=None, session=None, **kwargs):
581
"""
582
Cursor for GridFS files.
583
584
Parameters:
585
- collection: files collection
586
- filter: query criteria
587
- session: optional ClientSession
588
- kwargs: cursor options
589
"""
590
591
def __iter__(self):
592
"""Iterate over files."""
593
594
def __next__(self):
595
"""Get next file."""
596
597
def next(self):
598
"""Get next file (Python 2 compatibility)."""
599
600
def clone(self):
601
"""Clone cursor."""
602
603
def count(self):
604
"""
605
Count matching files.
606
607
Returns:
608
int: File count
609
"""
610
```
611
612
### Constants and Exceptions
613
614
GridFS-related constants and error handling.
615
616
```python { .api }
617
DEFAULT_CHUNK_SIZE: int # Default chunk size (255KB)
618
619
class NoFile(Exception):
620
"""Raised when GridFS file is not found."""
621
```
622
623
## Usage Examples
624
625
### Basic GridFS Operations
626
627
```python
628
import shutil
from io import BytesIO

import gridfs
from pymongo import MongoClient

client = MongoClient()
db = client.mydb
fs = gridfs.GridFS(db)

# Store a file
with open("image.jpg", "rb") as f:
    file_id = fs.put(f, filename="profile.jpg", contentType="image/jpeg")
print(f"Stored file with ID: {file_id}")

# Retrieve a file. Copy it in chunks rather than calling read() with no
# size, which would load the entire file into memory at once.
grid_out = fs.get(file_id)
with open("downloaded.jpg", "wb") as f:
    shutil.copyfileobj(grid_out, f)

print(f"Downloaded {grid_out.filename}, size: {grid_out.length} bytes")

# Store with metadata
file_id = fs.put(
    b"Hello, GridFS!",
    filename="greeting.txt",
    contentType="text/plain",
    metadata={"author": "Alice", "tags": ["greeting", "sample"]}
)

# Find and list files
for grid_file in fs.find({"metadata.author": "Alice"}):
    print(f"File: {grid_file.filename}, Author: {grid_file.metadata['author']}")

# Delete a file
fs.delete(file_id)
662
```
663
664
### GridFSBucket Operations (Recommended)
665
666
```python
667
from io import BytesIO

import gridfs
from pymongo import MongoClient

client = MongoClient()
db = client.mydb
bucket = gridfs.GridFSBucket(db, bucket_name="images")

# Upload from stream
with open("photo.jpg", "rb") as f:
    file_id = bucket.upload_from_stream(
        "user_photo.jpg",
        f,
        metadata={"user_id": 12345, "category": "profile"}
    )

print(f"Uploaded photo with ID: {file_id}")

# Download to stream
with open("downloaded_photo.jpg", "wb") as f:
    bucket.download_to_stream(file_id, f)

# Upload with custom chunk size for large files
with open("video.mp4", "rb") as f:
    file_id = bucket.upload_from_stream(
        "presentation.mp4",
        f,
        chunk_size_bytes=1024*1024,  # 1MB chunks
        metadata={"duration": 1800, "resolution": "1080p"}
    )

# Stream processing
upload_stream = bucket.open_upload_stream(
    "processed_data.csv",
    metadata={"processing_date": "2023-06-01"}
)

# Write data in chunks. process_large_dataset() stands in for your own
# generator of text chunks.
try:
    for chunk in process_large_dataset():
        upload_stream.write(chunk.encode())
except Exception:
    # Discard the partial upload instead of leaving an unfinished file behind.
    upload_stream.abort()
    raise
else:
    # Finalize the file only when every chunk was written successfully.
    upload_stream.close()
    print(f"Processed file ID: {upload_stream._id}")
710
```
711
712
### Advanced GridFS Usage
713
714
```python
715
from datetime import datetime

import gridfs
from bson import ObjectId
from pymongo import MongoClient

client = MongoClient()
db = client.mydb

# Custom GridFS collection
fs = gridfs.GridFS(db, collection="documents")

# Store with specific file ID; keep the returned identifier so the stored
# file can be read back in the streaming example below.
custom_id = ObjectId()
file_id = fs.put(
    b"Important document content",
    _id=custom_id,
    filename="contract.pdf",
    contentType="application/pdf",
    metadata={
        "department": "legal",
        "confidential": True,
        "expires": datetime(2025, 12, 31)
    }
)

# Find files with complex queries
large_images = fs.find({
    "contentType": {"$regex": "^image/"},
    "length": {"$gt": 1024*1024},  # > 1MB
    "uploadDate": {"$gte": datetime(2023, 1, 1)}
}).sort("uploadDate", -1)

for img in large_images:
    print(f"Large image: {img.filename}, {img.length/1024/1024:.1f}MB")

# Version management by filename
versions = list(fs.find({"filename": "document.txt"}).sort("uploadDate", 1))
print(f"Found {len(versions)} versions of document.txt")

# Get latest version
latest = fs.get_last_version("document.txt")
print(f"Latest version uploaded: {latest.upload_date}")

# Stream reading. process_chunk() stands in for your own handler.
grid_out = fs.get(file_id)
try:
    while True:
        chunk = grid_out.read(8192)  # Read 8KB chunks
        if not chunk:
            break
        process_chunk(chunk)
finally:
    # Close the file even if processing a chunk fails.
    grid_out.close()
762
```
763
764
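### GridFS with Transactions

> **Note:** GridFS does not support multi-document transactions. In PyMongo, a GridFS operation that receives a session with an active transaction raises `InvalidOperation`, so run GridFS calls outside the transaction and clean up manually if the transactional step fails.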
765
766
```python
767
from datetime import datetime, timedelta

import gridfs
from pymongo import MongoClient
from pymongo.errors import PyMongoError

client = MongoClient()
db = client.mydb
bucket = gridfs.GridFSBucket(db)

# GridFS does not support multi-document transactions: PyMongo raises
# InvalidOperation when a GridFS call receives a session that has an active
# transaction. Keep GridFS calls outside the transaction and compensate
# manually if the transactional step fails.
with client.start_session() as session:
    # Upload file first, outside any transaction
    with open("data.json", "rb") as f:
        file_id = bucket.upload_from_stream(
            "backup.json",
            f,
            session=session
        )

    try:
        # Update metadata in related collection inside a transaction
        with session.start_transaction():
            db.backups.insert_one({
                "file_id": file_id,
                "created_date": datetime.now(),
                "status": "completed"
            }, session=session)
    except PyMongoError as e:
        print(f"Backup failed: {e}")
        # The transaction was aborted, but the upload was not part of it:
        # remove the now-unreferenced file before re-raising.
        bucket.delete(file_id, session=session)
        raise
    else:
        print("Backup created successfully")


# Cleanup old backups
def cleanup_old_backups(session):
    """Remove backups older than 30 days.

    Deletes the GridFS file and then the metadata document for every entry
    in ``db.backups`` whose ``created_date`` is more than 30 days old.

    Parameters:
    - session: ClientSession passed to every operation; it must not have an
      active transaction because GridFS calls reject one.
    """
    cutoff_date = datetime.now() - timedelta(days=30)

    old_backups = db.backups.find(
        {"created_date": {"$lt": cutoff_date}},
        session=session
    )

    for backup in old_backups:
        # Delete GridFS file
        try:
            bucket.delete(backup["file_id"], session=session)
        except gridfs.NoFile:
            # Already removed (e.g. by an earlier, interrupted cleanup);
            # still drop the stale metadata below.
            pass
        # Delete metadata
        db.backups.delete_one({"_id": backup["_id"]}, session=session)


# Run cleanup with a plain session (no transaction)
with client.start_session() as session:
    cleanup_old_backups(session)
818
```