Utils for streaming large files from S3, HDFS, GCS, SFTP, Azure Blob Storage, and local filesystem with transparent compression support
—
Universal file operations that work across all supported storage systems with transparent compression support. These functions provide the primary interface for smart-open and serve as drop-in replacements for Python's built-in file operations.
The main entry point for accessing files across all supported storage systems.
def open(uri, mode='r', buffering=-1, encoding=None, errors=None, newline=None,
closefd=True, opener=None, compression='infer_from_extension',
transport_params=None):
"""Open URI object, returning a file-like object.
Parameters:
uri: str, pathlib.Path, or file-like object - The resource to open
mode: str - File access mode ('r', 'w', 'rb', 'wb', etc.)
buffering: int - Buffer size (-1 for system default)
encoding: str - Text encoding for text mode
errors: str - Error handling strategy for text mode
newline: str - Newline handling for text mode
closefd: bool - Close file descriptor (ignored, for compatibility)
opener: callable - Custom opener (ignored, for compatibility)
compression: str - Compression type or 'infer_from_extension'
transport_params: dict - Transport-specific parameters
Returns:
File-like object implementing standard Python I/O interface
    """

Parse and analyze URI components for different storage systems.
def parse_uri(uri_as_string):
"""Parse URI string into transport-specific components.
Parameters:
uri_as_string: str - The URI to parse
Returns:
collections.namedtuple - Parsed URI with transport-specific fields
    """

Deprecated functions maintained for backward compatibility.
def smart_open(uri, mode='rb', buffering=-1, encoding=None, errors=None,
newline=None, closefd=True, opener=None, ignore_extension=False,
**kwargs):
"""Legacy function - use open() instead.
Parameters:
uri: str - The resource to open
mode: str - File access mode (default: 'rb')
buffering: int - Buffer size
encoding: str - Text encoding
errors: str - Error handling strategy
newline: str - Newline handling
closefd: bool - Close file descriptor (ignored)
opener: callable - Custom opener (ignored)
ignore_extension: bool - If True, disables compression; if False, infers from extension
**kwargs: Transport parameters (deprecated - raises DeprecationWarning)
Deprecated: This function raises DeprecationWarning and is maintained only for
backward compatibility. Main differences from open():
- Default mode is 'rb' instead of 'r'
- Uses ignore_extension parameter instead of compression parameter
- Transport parameters passed as **kwargs (now deprecated)
Use smart_open.open() for new code.
"""
def s3_iter_bucket(bucket_name, prefix='', accept_key=None, key_limit=None,
workers=16, retries=3, **session_kwargs):
"""Deprecated - use smart_open.s3.iter_bucket instead.
Iterate over S3 bucket contents in parallel.
    """

Basic usage examples:

from smart_open import open
# Read text file
with open('s3://my-bucket/data.txt') as f:
content = f.read()
# Read binary file
with open('gs://my-bucket/image.jpg', 'rb') as f:
binary_data = f.read()
# Write text file
with open('azure://container/output.txt', 'w') as f:
f.write('Hello, world!')
# Write binary file
with open('s3://bucket/data.bin', 'wb') as f:
f.write(b'binary data')

# Automatic compression detection from extension
with open('s3://bucket/data.txt.gz') as f:
uncompressed_text = f.read()
# Explicit compression specification
with open('s3://bucket/data.txt', compression='gzip') as f:
text = f.read()
# Writing compressed files
with open('gs://bucket/output.txt.bz2', 'w') as f:
f.write('This will be compressed automatically')

# Custom encoding and error handling
with open('s3://bucket/text.txt', encoding='utf-8', errors='ignore') as f:
text = f.read()
# Custom buffer size
with open('gs://bucket/large-file.dat', 'rb', buffering=1024*1024) as f:
chunk = f.read(4096)
# Transport-specific parameters
transport_params = {
'client_kwargs': {'region_name': 'us-west-2'},
'buffer_size': 1024*1024
}
with open('s3://bucket/file.txt', transport_params=transport_params) as f:
data = f.read()

# Random access to remote files
with open('s3://bucket/data.bin', 'rb') as f:
f.seek(1000) # Seek to byte 1000
chunk = f.read(100) # Read 100 bytes
f.seek(0, 2) # Seek to end
file_size = f.tell()
f.seek(-100, 2) # Seek to 100 bytes from end
tail = f.read()

from smart_open import parse_uri
# Parse S3 URI
parsed = parse_uri('s3://my-bucket/path/to/file.txt')
print(parsed.scheme) # 's3'
print(parsed.bucket_id) # 'my-bucket'
print(parsed.key_id) # 'path/to/file.txt'
# Parse HTTP URI
parsed = parse_uri('https://example.com/data.json')
print(parsed.scheme) # 'https'
print(parsed.uri_path) # '/data.json'
# Parse local file URI
parsed = parse_uri('file:///home/user/data.txt')
print(parsed.scheme) # 'file'
print(parsed.uri_path)  # '/home/user/data.txt'

Smart-open provides consistent error handling across all transport layers:
from smart_open import open
import boto3
try:
with open('s3://bucket/nonexistent.txt') as f:
content = f.read()
except FileNotFoundError:
print("File not found")
except PermissionError:
print("Access denied")
except Exception as e:
print(f"Other error: {e}")

Performance tips:
- Use binary modes ('rb', 'wb') for better performance with large files
- Tune the buffering parameter for optimal I/O performance
- Iterate line by line (for line in file) for memory efficiency

Install with Tessl CLI
npx tessl i tessl/pypi-smart-open