pypi-openai

Description: Official Python library for the OpenAI API providing chat completions, embeddings, audio, images, and more

Author: tessl

Last updated: 21 days ago

How to use

npx @tessl/cli registry install tessl/pypi-openai@1.106.0

Provide feedback Docs

files.md docs/

1
# Files
2

3
Upload, manage, and retrieve files for use with various OpenAI services including fine-tuning, assistants, and batch operations.
4

5
## Capabilities
6

7
### File Upload
8

9
Upload files to OpenAI for use with different services and purposes.
10

11
```python { .api }
12
def create(
13
    self,
14
    *,
15
    file: FileTypes,
16
    purpose: FilePurpose,
17
    expires_after: file_create_params.ExpiresAfter | NotGiven = NOT_GIVEN,
18
    # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
19
    # The extra values given here take precedence over values defined on the client or passed to this method.
20
    extra_headers: Headers | None = None,
21
    extra_query: Query | None = None,
22
    extra_body: Body | None = None,
23
    timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
24
) -> FileObject: ...
25
```
26

27
Usage examples:
28

29
```python
30
from openai import OpenAI
31

32
client = OpenAI()
33

34
# Upload file for fine-tuning
35
with open("training_data.jsonl", "rb") as f:
36
    file_response = client.files.create(
37
        file=f,
38
        purpose="fine-tune"
39
    )
40

41
print(f"File uploaded: {file_response.id}")
42
print(f"Filename: {file_response.filename}")
43
print(f"Size: {file_response.bytes} bytes")
44

45
# Upload file for assistants
46
with open("knowledge_base.txt", "rb") as f:
47
    assistant_file = client.files.create(
48
        file=f,
49
        purpose="assistants"
50
    )
51

52
print(f"Assistant file ID: {assistant_file.id}")
53

54
# Upload batch processing file
55
with open("batch_requests.jsonl", "rb") as f:
56
    batch_file = client.files.create(
57
        file=f,
58
        purpose="batch"
59
    )
60

61
print(f"Batch file ID: {batch_file.id}")
62

63
# Upload image for vision
64
with open("image.png", "rb") as f:
65
    vision_file = client.files.create(
66
        file=f,
67
        purpose="vision"
68
    )
69

70
print(f"Vision file ID: {vision_file.id}")
71
```
72

73
### File Management
74

75
List, retrieve, and delete files with comprehensive metadata access.
76

77
```python { .api }
78
def list(
79
    self,
80
    *,
81
    after: str | NotGiven = NOT_GIVEN,
82
    limit: int | NotGiven = NOT_GIVEN,
83
    order: Literal["asc", "desc"] | NotGiven = NOT_GIVEN,
84
    purpose: str | NotGiven = NOT_GIVEN,
85
    # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
86
    # The extra values given here take precedence over values defined on the client or passed to this method.
87
    extra_headers: Headers | None = None,
88
    extra_query: Query | None = None,
89
    extra_body: Body | None = None,
90
    timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
91
) -> SyncCursorPage[FileObject]: ...
92

93
def retrieve(
94
    self,
95
    file_id: str,
96
    *,
97
    extra_headers: Headers | None = None,
98
    extra_query: Query | None = None,
99
    extra_body: Body | None = None,
100
    timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN
101
) -> FileObject: ...
102

103
def delete(
104
    self,
105
    file_id: str,
106
    *,
107
    extra_headers: Headers | None = None,
108
    extra_query: Query | None = None,
109
    extra_body: Body | None = None,
110
    timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN
111
) -> FileDeleted: ...
112
```
113

114
Usage examples:
115

116
```python
117
# List all files
118
all_files = client.files.list()
119

120
print("All files:")
121
for file in all_files:
122
    print(f"  {file.id}: {file.filename} ({file.bytes} bytes) - {file.purpose}")
123

124
# List files by purpose
125
fine_tune_files = client.files.list(purpose="fine-tune")
126

127
print("\nFine-tuning files:")
128
for file in fine_tune_files:
129
    print(f"  {file.id}: {file.filename}")
130

131
# Retrieve specific file
132
file_id = "file-abc123"
133
file_info = client.files.retrieve(file_id)
134

135
print(f"\nFile details:")
136
print(f"  ID: {file_info.id}")
137
print(f"  Filename: {file_info.filename}")
138
print(f"  Size: {file_info.bytes} bytes")
139
print(f"  Created: {file_info.created_at}")
140
print(f"  Purpose: {file_info.purpose}")
141
print(f"  Status: {file_info.status}")
142

143
# Delete file
144
deletion_result = client.files.delete(file_id)
145

146
if deletion_result.deleted:
147
    print(f"File {deletion_result.id} deleted successfully")
148
else:
149
    print(f"Failed to delete file {deletion_result.id}")
150
```
151

152
### File Content Retrieval
153

154
Download and access file content for processing and analysis.
155

156
```python { .api }
157
def content(
158
    self,
159
    file_id: str,
160
    *,
161
    # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
162
    # The extra values given here take precedence over values defined on the client or passed to this method.
163
    extra_headers: Headers | None = None,
164
    extra_query: Query | None = None,
165
    extra_body: Body | None = None,
166
    timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
167
) -> HttpxBinaryResponseContent: ...
168

169
def wait_for_processing(
170
    self,
171
    id: str,
172
    *,
173
    poll_interval: float = 5.0,
174
    max_wait_seconds: float = 30 * 60,
175
) -> FileObject: ...
176
```
177

178
Usage examples:
179

180
```python
181
# Download file content
182
file_id = "file-abc123"
183
file_content = client.files.content(file_id)
184

185
# Save content to local file
186
with open("downloaded_file.txt", "wb") as f:
187
    f.write(file_content.content)
188

189
print("File downloaded successfully")
190

191
# Process JSONL file content for fine-tuning
192
file_content = client.files.content(file_id)
193
content_str = file_content.content.decode('utf-8')
194

195
# Parse JSONL content
196
import json
197

198
lines = content_str.strip().split('\n')
199
training_examples = []
200

201
for line in lines:
202
    try:
203
        example = json.loads(line)
204
        training_examples.append(example)
205
    except json.JSONDecodeError as e:
206
        print(f"Error parsing line: {e}")
207

208
print(f"Loaded {len(training_examples)} training examples")
209

210
# Process and analyze file content
211
def analyze_file_content(file_id: str):
212
    """Analyze uploaded file content"""
213
    
214
    # Get file info
215
    file_info = client.files.retrieve(file_id)
216
    print(f"Analyzing file: {file_info.filename}")
217
    
218
    # Get content
219
    file_content = client.files.content(file_id)
220
    content = file_content.content
221
    
222
    # Basic analysis
223
    analysis = {
224
        "filename": file_info.filename,
225
        "size_bytes": len(content),
226
        "purpose": file_info.purpose,
227
        "created_at": file_info.created_at
228
    }
229
    
230
    # Content-specific analysis
231
    if file_info.filename.endswith('.jsonl'):
232
        try:
233
            content_str = content.decode('utf-8')
234
            lines = content_str.strip().split('\n')
235
            analysis["line_count"] = len(lines)
236
            
237
            # Sample first line
238
            if lines:
239
                analysis["sample_line"] = json.loads(lines[0])
240
                
241
        except Exception as e:
242
            analysis["parse_error"] = str(e)
243
    
244
    elif file_info.filename.endswith(('.txt', '.md')):
245
        try:
246
            content_str = content.decode('utf-8')
247
            analysis["character_count"] = len(content_str)
248
            analysis["word_count"] = len(content_str.split())
249
            analysis["line_count"] = len(content_str.split('\n'))
250
            
251
        except Exception as e:
252
            analysis["parse_error"] = str(e)
253
    
254
    return analysis
255

256
# Analyze uploaded file
257
analysis = analyze_file_content("file-abc123")
258
print("File analysis:", analysis)
259
```
260

261
### Batch File Operations
262

263
Handle multiple files efficiently with batch upload and management operations.
264

265
Usage examples:
266

267
```python
268
import os
269
from pathlib import Path
270
from typing import List, Dict
271
import concurrent.futures
272

273
def upload_files_batch(file_paths: List[str], purpose: str) -> List[Dict]:
274
    """Upload multiple files concurrently"""
275
    
276
    def upload_single_file(file_path):
277
        try:
278
            with open(file_path, "rb") as f:
279
                file_response = client.files.create(
280
                    file=f,
281
                    purpose=purpose
282
                )
283
            
284
            return {
285
                "local_path": file_path,
286
                "file_id": file_response.id,
287
                "filename": file_response.filename,
288
                "bytes": file_response.bytes,
289
                "status": "success"
290
            }
291
        except Exception as e:
292
            return {
293
                "local_path": file_path,
294
                "error": str(e),
295
                "status": "failed"
296
            }
297
    
298
    # Use thread pool for concurrent uploads
299
    with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
300
        results = list(executor.map(upload_single_file, file_paths))
301
    
302
    return results
303

304
# Upload multiple training files
305
training_files = [
306
    "dataset_1.jsonl",
307
    "dataset_2.jsonl", 
308
    "dataset_3.jsonl"
309
]
310

311
upload_results = upload_files_batch(training_files, "fine-tune")
312

313
successful_uploads = [r for r in upload_results if r["status"] == "success"]
314
failed_uploads = [r for r in upload_results if r["status"] == "failed"]
315

316
print(f"Successfully uploaded: {len(successful_uploads)} files")
317
print(f"Failed uploads: {len(failed_uploads)} files")
318

319
for result in successful_uploads:
320
    print(f"  {result['filename']}: {result['file_id']}")
321

322
# Clean up old files
323
def cleanup_old_files(purpose: str = None, older_than_days: int = 30):
324
    """Delete files older than specified days"""
325
    
326
    import time
327
    
328
    current_time = time.time()
329
    cutoff_time = current_time - (older_than_days * 24 * 60 * 60)
330
    
331
    # List files
332
    if purpose:
333
        files = client.files.list(purpose=purpose)
334
    else:
335
        files = client.files.list()
336
    
337
    deleted_count = 0
338
    
339
    for file in files:
340
        if file.created_at < cutoff_time:
341
            try:
342
                client.files.delete(file.id)
343
                print(f"Deleted old file: {file.filename}")
344
                deleted_count += 1
345
            except Exception as e:
346
                print(f"Failed to delete {file.filename}: {e}")
347
    
348
    print(f"Cleanup completed. Deleted {deleted_count} files.")
349

350
# Clean up files older than 30 days
351
cleanup_old_files(purpose="fine-tune", older_than_days=30)
352

353
# File synchronization utility
354
def sync_files_with_local(local_dir: str, purpose: str):
355
    """Sync local directory with OpenAI files"""
356
    
357
    local_path = Path(local_dir)
358
    
359
    # Get remote files
360
    remote_files = client.files.list(purpose=purpose)
361
    remote_filenames = {f.filename: f for f in remote_files}
362
    
363
    # Get local files
364
    local_files = list(local_path.glob("*.jsonl"))
365
    local_filenames = {f.name for f in local_files}
366
    
367
    # Files to upload (in local but not remote)
368
    to_upload = local_filenames - set(remote_filenames.keys())
369
    
370
    # Files to download (in remote but not local)
371
    to_download = set(remote_filenames.keys()) - local_filenames
372
    
373
    print(f"Files to upload: {len(to_upload)}")
374
    print(f"Files to download: {len(to_download)}")
375
    
376
    # Upload missing files
377
    for filename in to_upload:
378
        file_path = local_path / filename
379
        
380
        with open(file_path, "rb") as f:
381
            file_response = client.files.create(
382
                file=f,
383
                purpose=purpose
384
            )
385
        
386
        print(f"Uploaded: {filename} -> {file_response.id}")
387
    
388
    # Download missing files
389
    for filename in to_download:
390
        remote_file = remote_filenames[filename]
391
        
392
        file_content = client.files.content(remote_file.id)
393
        
394
        local_file_path = local_path / filename
395
        with open(local_file_path, "wb") as f:
396
            f.write(file_content.content)
397
        
398
        print(f"Downloaded: {filename}")
399

400
# Sync local training data
401
sync_files_with_local("./training_data/", "fine-tune")
402
```
403

404
### File Validation and Processing
405

406
Validate file formats and content before upload for different purposes.
407

408
Usage examples:
409

410
```python
411
import json
412
from typing import Optional, Dict, Any
413

414
def validate_jsonl_file(file_path: str) -> Dict[str, Any]:
415
    """Validate JSONL file for fine-tuning"""
416
    
417
    validation_result = {
418
        "valid": True,
419
        "errors": [],
420
        "warnings": [],
421
        "line_count": 0,
422
        "sample_lines": []
423
    }
424
    
425
    try:
426
        with open(file_path, 'r', encoding='utf-8') as f:
427
            for line_num, line in enumerate(f, 1):
428
                line = line.strip()
429
                if not line:
430
                    continue
431
                
432
                validation_result["line_count"] += 1
433
                
434
                try:
435
                    data = json.loads(line)
436
                    
437
                    # Validate required fields for chat format
438
                    if "messages" not in data:
439
                        validation_result["errors"].append(
440
                            f"Line {line_num}: Missing 'messages' field"
441
                        )
442
                        validation_result["valid"] = False
443
                    
444
                    # Store sample
445
                    if len(validation_result["sample_lines"]) < 3:
446
                        validation_result["sample_lines"].append(data)
447
                    
448
                except json.JSONDecodeError as e:
449
                    validation_result["errors"].append(
450
                        f"Line {line_num}: Invalid JSON - {e}"
451
                    )
452
                    validation_result["valid"] = False
453
    
454
    except FileNotFoundError:
455
        validation_result["errors"].append("File not found")
456
        validation_result["valid"] = False
457
    except Exception as e:
458
        validation_result["errors"].append(f"Error reading file: {e}")
459
        validation_result["valid"] = False
460
    
461
    # Warnings
462
    if validation_result["line_count"] < 10:
463
        validation_result["warnings"].append(
464
            "File has fewer than 10 examples (recommended minimum)"
465
        )
466
    
467
    return validation_result
468

469
# Validate before upload
470
file_path = "training_data.jsonl"
471
validation = validate_jsonl_file(file_path)
472

473
if validation["valid"]:
474
    print(f"✓ File is valid ({validation['line_count']} lines)")
475
    
476
    # Upload validated file
477
    with open(file_path, "rb") as f:
478
        file_response = client.files.create(
479
            file=f,
480
            purpose="fine-tune"
481
        )
482
    
483
    print(f"Uploaded: {file_response.id}")
484
    
485
else:
486
    print("✗ File validation failed:")
487
    for error in validation["errors"]:
488
        print(f"  Error: {error}")
489

490
for warning in validation["warnings"]:
491
    print(f"  Warning: {warning}")
492

493
# File format converter
494
def convert_csv_to_jsonl(csv_path: str, output_path: str, 
495
                        input_col: str, output_col: str):
496
    """Convert CSV to JSONL for fine-tuning"""
497
    
498
    import csv
499
    
500
    with open(csv_path, 'r') as csv_file, \
501
         open(output_path, 'w') as jsonl_file:
502
        
503
        reader = csv.DictReader(csv_file)
504
        
505
        for row in reader:
506
            # Create chat format
507
            example = {
508
                "messages": [
509
                    {"role": "user", "content": row[input_col]},
510
                    {"role": "assistant", "content": row[output_col]}
511
                ]
512
            }
513
            
514
            jsonl_file.write(json.dumps(example) + '\n')
515
    
516
    print(f"Converted {csv_path} to {output_path}")
517

518
# Convert and upload
519
convert_csv_to_jsonl(
520
    "training_data.csv", 
521
    "training_data.jsonl",
522
    "question", 
523
    "answer"
524
)
525

526
# Validate and upload converted file
527
validation = validate_jsonl_file("training_data.jsonl")
528
if validation["valid"]:
529
    with open("training_data.jsonl", "rb") as f:
530
        file_response = client.files.create(
531
            file=f,
532
            purpose="fine-tune"
533
        )
534
    print(f"Uploaded converted file: {file_response.id}")
535
```
536

537
## Types
538

539
### Core Response Types
540

541
```python { .api }
542
class FileObject(BaseModel):
543
    id: str
544
    bytes: int
545
    created_at: int
546
    filename: str
547
    object: Literal["file"]
548
    purpose: FilePurpose
549
    status: Literal["uploaded", "processed", "error"]
550
    status_details: Optional[str]
551

552
class FileDeleted(BaseModel):
553
    id: str
554
    deleted: bool
555
    object: Literal["file"]
556

557
# File content response
558
HttpxBinaryResponseContent = bytes  # Binary content from httpx response
559

560
# File expiration settings
561
ExpiresAfter = TypedDict('ExpiresAfter', {
562
    'anchor': Literal["uploaded"],
563
    'days': int,
564
}, total=False)
565
```
566

567
### Parameter Types
568

569
```python { .api }
570
# File upload parameters
571
FileCreateParams = TypedDict('FileCreateParams', {
572
    'file': Required[FileTypes],
573
    'purpose': Required[FilePurpose],
574
    'expires_after': NotRequired[ExpiresAfter],
575
    'extra_headers': NotRequired[Headers],
576
    'extra_query': NotRequired[Query],
577
    'extra_body': NotRequired[Body],
578
    'timeout': NotRequired[float],
579
}, total=False)
580

581
# File list parameters
582
FileListParams = TypedDict('FileListParams', {
583
    'after': NotRequired[str],
584
    'limit': NotRequired[int],
585
    'order': NotRequired[Literal["asc", "desc"]],
586
    'purpose': NotRequired[str],
587
    'extra_headers': NotRequired[Headers],
588
    'extra_query': NotRequired[Query],
589
    'extra_body': NotRequired[Body],
590
    'timeout': NotRequired[float],
591
}, total=False)
592

593
# File purpose enumeration
594
FilePurpose = Literal[
595
    "assistants",
596
    "batch", 
597
    "fine-tune",
598
    "vision",
599
    "user_data",
600
    "evals"
601
]
602

603
# File types for upload
604
FileTypes = Union[
605
    bytes,              # Raw file bytes
606
    IO[bytes],          # File-like object
607
    str,                # File path
608
    os.PathLike[str]    # Path object
609
]
610
```
611

612
### File Status and Metadata
613

614
```python { .api }
615
# File status enumeration
616
FileStatus = Literal["uploaded", "processed", "error"]
617

618
# File metadata structure
619
class FileMetadata(BaseModel):
620
    id: str
621
    filename: str
622
    size_bytes: int
623
    upload_timestamp: int
624
    purpose: FilePurpose
625
    status: FileStatus
626
    error_details: Optional[str]
627

628
# Purpose-specific requirements
629
class FilePurposeRequirements:
630
    fine_tune = {
631
        "formats": [".jsonl"],
632
        "max_size_mb": 100,
633
        "required_fields": ["messages"],
634
        "min_examples": 10
635
    }
636
    
637
    assistants = {
638
        "formats": [".c", ".cpp", ".csv", ".docx", ".html", ".java", 
639
                   ".json", ".md", ".pdf", ".php", ".pptx", ".py", 
640
                   ".rb", ".tex", ".txt", ".css", ".js", ".sh", ".ts"],
641
        "max_size_mb": 512,
642
        "max_tokens": 2000000,  # 2 million tokens
643
        "max_files_per_assistant": 20
644
    }
645
    
646
    batch = {
647
        "formats": [".jsonl"],
648
        "max_size_mb": 200,  # Updated to 200MB
649
        "max_requests": 50000
650
    }
651
    
652
    vision = {
653
        "formats": [".png", ".jpg", ".jpeg", ".gif", ".webp"],
654
        "max_size_mb": 20,
655
        "max_resolution": "2048x2048"
656
    }
657
    
658
    user_data = {
659
        "formats": [".*"],  # Flexible format support
660
        "max_size_mb": 512,
661
        "description": "Flexible file type for any purpose"
662
    }
663
    
664
    evals = {
665
        "formats": [".jsonl", ".json", ".csv"],
666
        "max_size_mb": 100,
667
        "description": "Used for evaluation data sets"
668
    }
669
```
670

671
### Configuration and Limits
672

673
```python { .api }
674
# Global file limits
675
class FileLimits:
676
    max_file_size: int = 512 * 1024 * 1024  # 512MB per file
677
    max_organization_storage: int = 1024 * 1024 * 1024 * 1024  # 1TB total
678
    
679
    # Purpose-specific limits
680
    fine_tune_max_size: int = 100 * 1024 * 1024  # 100MB
681
    assistant_max_size: int = 512 * 1024 * 1024  # 512MB
682
    batch_max_size: int = 200 * 1024 * 1024     # 200MB (updated)
683
    vision_max_size: int = 20 * 1024 * 1024     # 20MB
684
    user_data_max_size: int = 512 * 1024 * 1024 # 512MB
685
    evals_max_size: int = 100 * 1024 * 1024     # 100MB
686
    
687
    # Default expiration policies
688
    default_expiration = {
689
        "batch": 30,  # 30 days for batch files
690
        "other": None  # No expiration for other purposes
691
    }
692
    
693
    # Pagination limits
694
    list_limit_max: int = 10000
695
    list_limit_default: int = 10000
696
    
697
    # Processing timeouts
698
    wait_for_processing_default: float = 30 * 60  # 30 minutes
699
    poll_interval_default: float = 5.0  # 5 seconds
700
```
701

702
## Best Practices
703

704
### File Preparation
705

706
- Validate file format and content before upload
707
- Use appropriate file extensions for each purpose
708
- Ensure files are within size limits for their intended use
709
- Use UTF-8 encoding for text files
710
- Test with small files before uploading large datasets
711

712
### Fine-tuning Files
713

714
- Use JSONL format with proper message structure
715
- Include diverse examples covering your use cases
716
- Aim for at least 50-100 high-quality examples
717
- Balance your dataset to avoid bias
718
- Validate JSON structure before upload
719

720
### Assistant Files
721

722
- Organize content logically for better retrieval
723
- Use clear, descriptive filenames
724
- Chunk large documents appropriately
725
- Consider file format compatibility with retrieval
726
- Update files when source content changes
727

728
### File Management
729

730
- Use the `wait_for_processing()` method for files that need processing
731
- Implement proper cleanup procedures for old files
732
- Set appropriate expiration policies using `expires_after` parameter
733
- Monitor file usage and storage limits (1TB organization total)
734
- Use descriptive filenames for easy identification
735
- Keep local backups of important files
736
- Track file IDs and metadata for your applications
737
- Use pagination parameters (`after`, `limit`, `order`) for large file lists
738

739
### Security and Privacy
740

741
- Review file content before upload
742
- Be aware of data retention policies
743
- Use appropriate file permissions and access controls
744
- Consider encryption for sensitive local files
745
- Regularly audit uploaded files and their usage