- Spec files
pypi-openai
Describes: pkg:pypi/openai@1.106.x
- Description
- Official Python library for the OpenAI API providing chat completions, embeddings, audio, images, and more
- Author
- tessl
- Last updated
files.md docs/
1# Files23Upload, manage, and retrieve files for use with various OpenAI services including fine-tuning, assistants, and batch operations.45## Capabilities67### File Upload89Upload files to OpenAI for use with different services and purposes.1011```python { .api }12def create(13self,14*,15file: FileTypes,16purpose: FilePurpose,17expires_after: file_create_params.ExpiresAfter | NotGiven = NOT_GIVEN,18# Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.19# The extra values given here take precedence over values defined on the client or passed to this method.20extra_headers: Headers | None = None,21extra_query: Query | None = None,22extra_body: Body | None = None,23timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,24) -> FileObject: ...25```2627Usage examples:2829```python30from openai import OpenAI3132client = OpenAI()3334# Upload file for fine-tuning35with open("training_data.jsonl", "rb") as f:36file_response = client.files.create(37file=f,38purpose="fine-tune"39)4041print(f"File uploaded: {file_response.id}")42print(f"Filename: {file_response.filename}")43print(f"Size: {file_response.bytes} bytes")4445# Upload file for assistants46with open("knowledge_base.txt", "rb") as f:47assistant_file = client.files.create(48file=f,49purpose="assistants"50)5152print(f"Assistant file ID: {assistant_file.id}")5354# Upload batch processing file55with open("batch_requests.jsonl", "rb") as f:56batch_file = client.files.create(57file=f,58purpose="batch"59)6061print(f"Batch file ID: {batch_file.id}")6263# Upload image for vision64with open("image.png", "rb") as f:65vision_file = client.files.create(66file=f,67purpose="vision"68)6970print(f"Vision file ID: {vision_file.id}")71```7273### File Management7475List, retrieve, and delete files with comprehensive metadata access.7677```python { .api }78def list(79self,80*,81after: str | NotGiven = NOT_GIVEN,82limit: int | NotGiven = NOT_GIVEN,83order: Literal["asc", "desc"] | NotGiven = NOT_GIVEN,84purpose: str | NotGiven = NOT_GIVEN,85# Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.86# The extra values given here take precedence over values defined on the client or passed to this method.87extra_headers: Headers | None = None,88extra_query: Query | None = None,89extra_body: Body | None = None,90timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,91) -> SyncCursorPage[FileObject]: ...9293def retrieve(94self,95file_id: str,96*,97extra_headers: Headers | None = None,98extra_query: Query | None = None,99extra_body: Body | None = None,100timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN101) -> FileObject: ...102103def delete(104self,105file_id: str,106*,107extra_headers: Headers | None = None,108extra_query: Query | None = None,109extra_body: Body | None = None,110timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN111) -> FileDeleted: ...112```113114Usage examples:115116```python117# List all files118all_files = client.files.list()119120print("All files:")121for file in all_files:122print(f" {file.id}: {file.filename} ({file.bytes} bytes) - {file.purpose}")123124# List files by purpose125fine_tune_files = client.files.list(purpose="fine-tune")126127print("\nFine-tuning files:")128for file in fine_tune_files:129print(f" {file.id}: {file.filename}")130131# Retrieve specific file132file_id = "file-abc123"133file_info = client.files.retrieve(file_id)134135print(f"\nFile details:")136print(f" ID: {file_info.id}")137print(f" Filename: {file_info.filename}")138print(f" Size: {file_info.bytes} bytes")139print(f" Created: {file_info.created_at}")140print(f" Purpose: {file_info.purpose}")141print(f" Status: {file_info.status}")142143# Delete file144deletion_result = client.files.delete(file_id)145146if deletion_result.deleted:147print(f"File {deletion_result.id} deleted successfully")148else:149print(f"Failed to delete file {deletion_result.id}")150```151152### File Content Retrieval153154Download and access file content for processing and analysis.155156```python { .api }157def content(158self,159file_id: str,160*,161# Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.162# The extra values given here take precedence over values defined on the client or passed to this method.163extra_headers: Headers | None = None,164extra_query: Query | None = None,165extra_body: Body | None = None,166timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,167) -> HttpxBinaryResponseContent: ...168169def wait_for_processing(170self,171id: str,172*,173poll_interval: float = 5.0,174max_wait_seconds: float = 30 * 60,175) -> FileObject: ...176```177178Usage examples:179180```python181# Download file content182file_id = "file-abc123"183file_content = client.files.content(file_id)184185# Save content to local file186with open("downloaded_file.txt", "wb") as f:187f.write(file_content.content)188189print("File downloaded successfully")190191# Process JSONL file content for fine-tuning192file_content = client.files.content(file_id)193content_str = file_content.content.decode('utf-8')194195# Parse JSONL content196import json197198lines = content_str.strip().split('\n')199training_examples = []200201for line in lines:202try:203example = json.loads(line)204training_examples.append(example)205except json.JSONDecodeError as e:206print(f"Error parsing line: {e}")207208print(f"Loaded {len(training_examples)} training examples")209210# Process and analyze file content211def analyze_file_content(file_id: str):212"""Analyze uploaded file content"""213214# Get file info215file_info = client.files.retrieve(file_id)216print(f"Analyzing file: {file_info.filename}")217218# Get content219file_content = client.files.content(file_id)220content = file_content.content221222# Basic analysis223analysis = {224"filename": file_info.filename,225"size_bytes": len(content),226"purpose": file_info.purpose,227"created_at": file_info.created_at228}229230# Content-specific analysis231if file_info.filename.endswith('.jsonl'):232try:233content_str = content.decode('utf-8')234lines = content_str.strip().split('\n')235analysis["line_count"] = len(lines)236237# Sample first line238if lines:239analysis["sample_line"] = json.loads(lines[0])240241except Exception as e:242analysis["parse_error"] = str(e)243244elif file_info.filename.endswith(('.txt', '.md')):245try:246content_str = content.decode('utf-8')247analysis["character_count"] = len(content_str)248analysis["word_count"] = len(content_str.split())249analysis["line_count"] = len(content_str.split('\n'))250251except Exception as e:252analysis["parse_error"] = str(e)253254return analysis255256# Analyze uploaded file257analysis = analyze_file_content("file-abc123")258print("File analysis:", analysis)259```260261### Batch File Operations262263Handle multiple files efficiently with batch upload and management operations.264265Usage examples:266267```python268import os269from pathlib import Path270from typing import List, Dict271import concurrent.futures272273def upload_files_batch(file_paths: List[str], purpose: str) -> List[Dict]:274"""Upload multiple files concurrently"""275276def upload_single_file(file_path):277try:278with open(file_path, "rb") as f:279file_response = client.files.create(280file=f,281purpose=purpose282)283284return {285"local_path": file_path,286"file_id": file_response.id,287"filename": file_response.filename,288"bytes": file_response.bytes,289"status": "success"290}291except Exception as e:292return {293"local_path": file_path,294"error": str(e),295"status": "failed"296}297298# Use thread pool for concurrent uploads299with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:300results = list(executor.map(upload_single_file, file_paths))301302return results303304# Upload multiple training files305training_files = [306"dataset_1.jsonl",307"dataset_2.jsonl",308"dataset_3.jsonl"309]310311upload_results = upload_files_batch(training_files, "fine-tune")312313successful_uploads = [r for r in upload_results if r["status"] == "success"]314failed_uploads = [r for r in upload_results if r["status"] == "failed"]315316print(f"Successfully uploaded: {len(successful_uploads)} files")317print(f"Failed uploads: {len(failed_uploads)} files")318319for result in successful_uploads:320print(f" {result['filename']}: {result['file_id']}")321322# Clean up old files323def cleanup_old_files(purpose: str = None, older_than_days: int = 30):324"""Delete files older than specified days"""325326import time327328current_time = time.time()329cutoff_time = current_time - (older_than_days * 24 * 60 * 60)330331# List files332if purpose:333files = client.files.list(purpose=purpose)334else:335files = client.files.list()336337deleted_count = 0338339for file in files:340if file.created_at < cutoff_time:341try:342client.files.delete(file.id)343print(f"Deleted old file: {file.filename}")344deleted_count += 1345except Exception as e:346print(f"Failed to delete {file.filename}: {e}")347348print(f"Cleanup completed. Deleted {deleted_count} files.")349350# Clean up files older than 30 days351cleanup_old_files(purpose="fine-tune", older_than_days=30)352353# File synchronization utility354def sync_files_with_local(local_dir: str, purpose: str):355"""Sync local directory with OpenAI files"""356357local_path = Path(local_dir)358359# Get remote files360remote_files = client.files.list(purpose=purpose)361remote_filenames = {f.filename: f for f in remote_files}362363# Get local files364local_files = list(local_path.glob("*.jsonl"))365local_filenames = {f.name for f in local_files}366367# Files to upload (in local but not remote)368to_upload = local_filenames - set(remote_filenames.keys())369370# Files to download (in remote but not local)371to_download = set(remote_filenames.keys()) - local_filenames372373print(f"Files to upload: {len(to_upload)}")374print(f"Files to download: {len(to_download)}")375376# Upload missing files377for filename in to_upload:378file_path = local_path / filename379380with open(file_path, "rb") as f:381file_response = client.files.create(382file=f,383purpose=purpose384)385386print(f"Uploaded: {filename} -> {file_response.id}")387388# Download missing files389for filename in to_download:390remote_file = remote_filenames[filename]391392file_content = client.files.content(remote_file.id)393394local_file_path = local_path / filename395with open(local_file_path, "wb") as f:396f.write(file_content.content)397398print(f"Downloaded: {filename}")399400# Sync local training data401sync_files_with_local("./training_data/", "fine-tune")402```403404### File Validation and Processing405406Validate file formats and content before upload for different purposes.407408Usage examples:409410```python411import json412from typing import Optional, Dict, Any413414def validate_jsonl_file(file_path: str) -> Dict[str, Any]:415"""Validate JSONL file for fine-tuning"""416417validation_result = {418"valid": True,419"errors": [],420"warnings": [],421"line_count": 0,422"sample_lines": []423}424425try:426with open(file_path, 'r', encoding='utf-8') as f:427for line_num, line in enumerate(f, 1):428line = line.strip()429if not line:430continue431432validation_result["line_count"] += 1433434try:435data = json.loads(line)436437# Validate required fields for chat format438if "messages" not in data:439validation_result["errors"].append(440f"Line {line_num}: Missing 'messages' field"441)442validation_result["valid"] = False443444# Store sample445if len(validation_result["sample_lines"]) < 3:446validation_result["sample_lines"].append(data)447448except json.JSONDecodeError as e:449validation_result["errors"].append(450f"Line {line_num}: Invalid JSON - {e}"451)452validation_result["valid"] = False453454except FileNotFoundError:455validation_result["errors"].append("File not found")456validation_result["valid"] = False457except Exception as e:458validation_result["errors"].append(f"Error reading file: {e}")459validation_result["valid"] = False460461# Warnings462if validation_result["line_count"] < 10:463validation_result["warnings"].append(464"File has fewer than 10 examples (recommended minimum)"465)466467return validation_result468469# Validate before upload470file_path = "training_data.jsonl"471validation = validate_jsonl_file(file_path)472473if validation["valid"]:474print(f"✓ File is valid ({validation['line_count']} lines)")475476# Upload validated file477with open(file_path, "rb") as f:478file_response = client.files.create(479file=f,480purpose="fine-tune"481)482483print(f"Uploaded: {file_response.id}")484485else:486print("✗ File validation failed:")487for error in validation["errors"]:488print(f" Error: {error}")489490for warning in validation["warnings"]:491print(f" Warning: {warning}")492493# File format converter494def convert_csv_to_jsonl(csv_path: str, output_path: str,495input_col: str, output_col: str):496"""Convert CSV to JSONL for fine-tuning"""497498import csv499500with open(csv_path, 'r') as csv_file, \501open(output_path, 'w') as jsonl_file:502503reader = csv.DictReader(csv_file)504505for row in reader:506# Create chat format507example = {508"messages": [509{"role": "user", "content": row[input_col]},510{"role": "assistant", "content": row[output_col]}511]512}513514jsonl_file.write(json.dumps(example) + '\n')515516print(f"Converted {csv_path} to {output_path}")517518# Convert and upload519convert_csv_to_jsonl(520"training_data.csv",521"training_data.jsonl",522"question",523"answer"524)525526# Validate and upload converted file527validation = validate_jsonl_file("training_data.jsonl")528if validation["valid"]:529with open("training_data.jsonl", "rb") as f:530file_response = client.files.create(531file=f,532purpose="fine-tune"533)534print(f"Uploaded converted file: {file_response.id}")535```536537## Types538539### Core Response Types540541```python { .api }542class FileObject(BaseModel):543id: str544bytes: int545created_at: int546filename: str547object: Literal["file"]548purpose: FilePurpose549status: Literal["uploaded", "processed", "error"]550status_details: Optional[str]551552class FileDeleted(BaseModel):553id: str554deleted: bool555object: Literal["file"]556557# File content response558HttpxBinaryResponseContent = bytes # Binary content from httpx response559560# File expiration settings561ExpiresAfter = TypedDict('ExpiresAfter', {562'anchor': Literal["uploaded"],563'days': int,564}, total=False)565```566567### Parameter Types568569```python { .api }570# File upload parameters571FileCreateParams = TypedDict('FileCreateParams', {572'file': Required[FileTypes],573'purpose': Required[FilePurpose],574'expires_after': NotRequired[ExpiresAfter],575'extra_headers': NotRequired[Headers],576'extra_query': NotRequired[Query],577'extra_body': NotRequired[Body],578'timeout': NotRequired[float],579}, total=False)580581# File list parameters582FileListParams = TypedDict('FileListParams', {583'after': NotRequired[str],584'limit': NotRequired[int],585'order': NotRequired[Literal["asc", "desc"]],586'purpose': NotRequired[str],587'extra_headers': NotRequired[Headers],588'extra_query': NotRequired[Query],589'extra_body': NotRequired[Body],590'timeout': NotRequired[float],591}, total=False)592593# File purpose enumeration594FilePurpose = Literal[595"assistants",596"batch",597"fine-tune",598"vision",599"user_data",600"evals"601]602603# File types for upload604FileTypes = Union[605bytes, # Raw file bytes606IO[bytes], # File-like object607str, # File path608os.PathLike[str] # Path object609]610```611612### File Status and Metadata613614```python { .api }615# File status enumeration616FileStatus = Literal["uploaded", "processed", "error"]617618# File metadata structure619class FileMetadata(BaseModel):620id: str621filename: str622size_bytes: int623upload_timestamp: int624purpose: FilePurpose625status: FileStatus626error_details: Optional[str]627628# Purpose-specific requirements629class FilePurposeRequirements:630fine_tune = {631"formats": [".jsonl"],632"max_size_mb": 100,633"required_fields": ["messages"],634"min_examples": 10635}636637assistants = {638"formats": [".c", ".cpp", ".csv", ".docx", ".html", ".java",639".json", ".md", ".pdf", ".php", ".pptx", ".py",640".rb", ".tex", ".txt", ".css", ".js", ".sh", ".ts"],641"max_size_mb": 512,642"max_tokens": 2000000, # 2 million tokens643"max_files_per_assistant": 20644}645646batch = {647"formats": [".jsonl"],648"max_size_mb": 200, # Updated to 200MB649"max_requests": 50000650}651652vision = {653"formats": [".png", ".jpg", ".jpeg", ".gif", ".webp"],654"max_size_mb": 20,655"max_resolution": "2048x2048"656}657658user_data = {659"formats": [".*"], # Flexible format support660"max_size_mb": 512,661"description": "Flexible file type for any purpose"662}663664evals = {665"formats": [".jsonl", ".json", ".csv"],666"max_size_mb": 100,667"description": "Used for evaluation data sets"668}669```670671### Configuration and Limits672673```python { .api }674# Global file limits675class FileLimits:676max_file_size: int = 512 * 1024 * 1024 # 512MB per file677max_organization_storage: int = 1024 * 1024 * 1024 * 1024 # 1TB total678679# Purpose-specific limits680fine_tune_max_size: int = 100 * 1024 * 1024 # 100MB681assistant_max_size: int = 512 * 1024 * 1024 # 512MB682batch_max_size: int = 200 * 1024 * 1024 # 200MB (updated)683vision_max_size: int = 20 * 1024 * 1024 # 20MB684user_data_max_size: int = 512 * 1024 * 1024 # 512MB685evals_max_size: int = 100 * 1024 * 1024 # 100MB686687# Default expiration policies688default_expiration = {689"batch": 30, # 30 days for batch files690"other": None # No expiration for other purposes691}692693# Pagination limits694list_limit_max: int = 10000695list_limit_default: int = 10000696697# Processing timeouts698wait_for_processing_default: float = 30 * 60 # 30 minutes699poll_interval_default: float = 5.0 # 5 seconds700```701702## Best Practices703704### File Preparation705706- Validate file format and content before upload707- Use appropriate file extensions for each purpose708- Ensure files are within size limits for their intended use709- Use UTF-8 encoding for text files710- Test with small files before uploading large datasets711712### Fine-tuning Files713714- Use JSONL format with proper message structure715- Include diverse examples covering your use cases716- Aim for at least 50-100 high-quality examples717- Balance your dataset to avoid bias718- Validate JSON structure before upload719720### Assistant Files721722- Organize content logically for better retrieval723- Use clear, descriptive filenames724- Chunk large documents appropriately725- Consider file format compatibility with retrieval726- Update files when source content changes727728### File Management729730- Use the `wait_for_processing()` method for files that need processing731- Implement proper cleanup procedures for old files732- Set appropriate expiration policies using `expires_after` parameter733- Monitor file usage and storage limits (1TB organization total)734- Use descriptive filenames for easy identification735- Keep local backups of important files736- Track file IDs and metadata for your applications737- Use pagination parameters (`after`, `limit`, `order`) for large file lists738739### Security and Privacy740741- Review file content before upload742- Be aware of data retention policies743- Use appropriate file permissions and access controls744- Consider encryption for sensitive local files745- Regularly audit uploaded files and their usage