pypi-openai

Description
Official Python library for the OpenAI API, providing chat completions, embeddings, audio, images, and more.
Author
tessl

How to use

npx @tessl/cli registry install tessl/pypi-openai@1.106.0

docs/files.md

# Files

Upload, manage, and retrieve files for use with various OpenAI services including fine-tuning, assistants, and batch operations.

## Capabilities

### File Upload

Upload files to OpenAI for use with different services and purposes.

```python { .api }
def create(
    self,
    *,
    file: FileTypes,
    purpose: FilePurpose,
    expires_after: file_create_params.ExpiresAfter | NotGiven = NOT_GIVEN,
    # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
    # The extra values given here take precedence over values defined on the client or passed to this method.
    extra_headers: Headers | None = None,
    extra_query: Query | None = None,
    extra_body: Body | None = None,
    timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
) -> FileObject: ...
```

Usage examples:

```python
from openai import OpenAI

client = OpenAI()

# Upload file for fine-tuning
with open("training_data.jsonl", "rb") as f:
    file_response = client.files.create(
        file=f,
        purpose="fine-tune"
    )

print(f"File uploaded: {file_response.id}")
print(f"Filename: {file_response.filename}")
print(f"Size: {file_response.bytes} bytes")

# Upload file for assistants
with open("knowledge_base.txt", "rb") as f:
    assistant_file = client.files.create(
        file=f,
        purpose="assistants"
    )

print(f"Assistant file ID: {assistant_file.id}")

# Upload batch processing file
with open("batch_requests.jsonl", "rb") as f:
    batch_file = client.files.create(
        file=f,
        purpose="batch"
    )

print(f"Batch file ID: {batch_file.id}")

# Upload image for vision
with open("image.png", "rb") as f:
    vision_file = client.files.create(
        file=f,
        purpose="vision"
    )

print(f"Vision file ID: {vision_file.id}")
```

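The `expires_after` parameter in the signature above attaches an expiration policy at upload time. A minimal sketch, assuming the API's `created_at` anchor with a lifetime given in seconds (the values here are illustrative):

```python
# Upload a batch input file that the service should expire automatically.
# Anchor/seconds values are illustrative; check the API reference for the
# allowed range before relying on them.
with open("batch_requests.jsonl", "rb") as f:
    expiring_file = client.files.create(
        file=f,
        purpose="batch",
        expires_after={"anchor": "created_at", "seconds": 30 * 24 * 60 * 60},
    )

print(f"Expiring file ID: {expiring_file.id}")
```
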
### File Management

List, retrieve, and delete files with comprehensive metadata access.

```python { .api }
def list(
    self,
    *,
    after: str | NotGiven = NOT_GIVEN,
    limit: int | NotGiven = NOT_GIVEN,
    order: Literal["asc", "desc"] | NotGiven = NOT_GIVEN,
    purpose: str | NotGiven = NOT_GIVEN,
    # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
    # The extra values given here take precedence over values defined on the client or passed to this method.
    extra_headers: Headers | None = None,
    extra_query: Query | None = None,
    extra_body: Body | None = None,
    timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
) -> SyncCursorPage[FileObject]: ...

def retrieve(
    self,
    file_id: str,
    *,
    extra_headers: Headers | None = None,
    extra_query: Query | None = None,
    extra_body: Body | None = None,
    timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
) -> FileObject: ...

def delete(
    self,
    file_id: str,
    *,
    extra_headers: Headers | None = None,
    extra_query: Query | None = None,
    extra_body: Body | None = None,
    timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
) -> FileDeleted: ...
```

Usage examples:

```python
# List all files
all_files = client.files.list()

print("All files:")
for file in all_files:
    print(f"  {file.id}: {file.filename} ({file.bytes} bytes) - {file.purpose}")

# List files by purpose
fine_tune_files = client.files.list(purpose="fine-tune")

print("\nFine-tuning files:")
for file in fine_tune_files:
    print(f"  {file.id}: {file.filename}")

# Retrieve specific file
file_id = "file-abc123"
file_info = client.files.retrieve(file_id)

print("\nFile details:")
print(f"  ID: {file_info.id}")
print(f"  Filename: {file_info.filename}")
print(f"  Size: {file_info.bytes} bytes")
print(f"  Created: {file_info.created_at}")
print(f"  Purpose: {file_info.purpose}")
print(f"  Status: {file_info.status}")

# Delete file
deletion_result = client.files.delete(file_id)

if deletion_result.deleted:
    print(f"File {deletion_result.id} deleted successfully")
else:
    print(f"Failed to delete file {deletion_result.id}")
```

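`list()` returns a `SyncCursorPage`, and iterating the page object follows the cursor through subsequent pages automatically. A short sketch of the pagination parameters from the signature above (`limit`, `order`, `after`; the file ID is a placeholder):

```python
# Fetch newest files first, 100 per page; iterating the page object
# transparently follows the cursor to subsequent pages.
page = client.files.list(limit=100, order="desc")
for file in page:
    print(file.id, file.filename)

# Explicit cursor handling: resume a listing after a known file ID.
next_page = client.files.list(limit=100, after="file-abc123")
```
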
### File Content Retrieval

Download and access file content for processing and analysis.

```python { .api }
def content(
    self,
    file_id: str,
    *,
    # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
    # The extra values given here take precedence over values defined on the client or passed to this method.
    extra_headers: Headers | None = None,
    extra_query: Query | None = None,
    extra_body: Body | None = None,
    timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
) -> HttpxBinaryResponseContent: ...

def wait_for_processing(
    self,
    id: str,
    *,
    poll_interval: float = 5.0,
    max_wait_seconds: float = 30 * 60,
) -> FileObject: ...
```

Usage examples:

```python
# Download file content
file_id = "file-abc123"
file_content = client.files.content(file_id)

# Save content to local file
with open("downloaded_file.txt", "wb") as f:
    f.write(file_content.content)

print("File downloaded successfully")

# Process JSONL file content for fine-tuning
file_content = client.files.content(file_id)
content_str = file_content.content.decode('utf-8')

# Parse JSONL content
import json

lines = content_str.strip().split('\n')
training_examples = []

for line in lines:
    try:
        example = json.loads(line)
        training_examples.append(example)
    except json.JSONDecodeError as e:
        print(f"Error parsing line: {e}")

print(f"Loaded {len(training_examples)} training examples")

# Process and analyze file content
def analyze_file_content(file_id: str):
    """Analyze uploaded file content"""

    # Get file info
    file_info = client.files.retrieve(file_id)
    print(f"Analyzing file: {file_info.filename}")

    # Get content
    file_content = client.files.content(file_id)
    content = file_content.content

    # Basic analysis
    analysis = {
        "filename": file_info.filename,
        "size_bytes": len(content),
        "purpose": file_info.purpose,
        "created_at": file_info.created_at
    }

    # Content-specific analysis
    if file_info.filename.endswith('.jsonl'):
        try:
            content_str = content.decode('utf-8')
            lines = content_str.strip().split('\n')
            analysis["line_count"] = len(lines)

            # Sample first line
            if lines:
                analysis["sample_line"] = json.loads(lines[0])

        except Exception as e:
            analysis["parse_error"] = str(e)

    elif file_info.filename.endswith(('.txt', '.md')):
        try:
            content_str = content.decode('utf-8')
            analysis["character_count"] = len(content_str)
            analysis["word_count"] = len(content_str.split())
            analysis["line_count"] = len(content_str.split('\n'))

        except Exception as e:
            analysis["parse_error"] = str(e)

    return analysis

# Analyze uploaded file
analysis = analyze_file_content("file-abc123")
print("File analysis:", analysis)
```

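`wait_for_processing()` (declared above) polls the file's status every `poll_interval` seconds until it leaves the `uploaded` state or `max_wait_seconds` elapses. A minimal sketch:

```python
# Upload a fine-tuning file and wait until the service has processed it.
with open("training_data.jsonl", "rb") as f:
    uploaded = client.files.create(file=f, purpose="fine-tune")

processed = client.files.wait_for_processing(
    uploaded.id,
    poll_interval=5.0,         # seconds between status checks
    max_wait_seconds=10 * 60,  # give up after 10 minutes
)

if processed.status == "processed":
    print(f"File {processed.id} is ready")
else:
    print(f"File ended in status: {processed.status}")
```
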
### Batch File Operations

Handle multiple files efficiently with batch upload and management operations.

Usage examples:

```python
import concurrent.futures
from pathlib import Path
from typing import Dict, List, Optional

def upload_files_batch(file_paths: List[str], purpose: str) -> List[Dict]:
    """Upload multiple files concurrently"""

    def upload_single_file(file_path):
        try:
            with open(file_path, "rb") as f:
                file_response = client.files.create(
                    file=f,
                    purpose=purpose
                )

            return {
                "local_path": file_path,
                "file_id": file_response.id,
                "filename": file_response.filename,
                "bytes": file_response.bytes,
                "status": "success"
            }
        except Exception as e:
            return {
                "local_path": file_path,
                "error": str(e),
                "status": "failed"
            }

    # Use thread pool for concurrent uploads
    with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
        results = list(executor.map(upload_single_file, file_paths))

    return results

# Upload multiple training files
training_files = [
    "dataset_1.jsonl",
    "dataset_2.jsonl",
    "dataset_3.jsonl"
]

upload_results = upload_files_batch(training_files, "fine-tune")

successful_uploads = [r for r in upload_results if r["status"] == "success"]
failed_uploads = [r for r in upload_results if r["status"] == "failed"]

print(f"Successfully uploaded: {len(successful_uploads)} files")
print(f"Failed uploads: {len(failed_uploads)} files")

for result in successful_uploads:
    print(f"  {result['filename']}: {result['file_id']}")

# Clean up old files
def cleanup_old_files(purpose: Optional[str] = None, older_than_days: int = 30):
    """Delete files older than specified days"""

    import time

    current_time = time.time()
    cutoff_time = current_time - (older_than_days * 24 * 60 * 60)

    # List files
    if purpose:
        files = client.files.list(purpose=purpose)
    else:
        files = client.files.list()

    deleted_count = 0

    for file in files:
        if file.created_at < cutoff_time:
            try:
                client.files.delete(file.id)
                print(f"Deleted old file: {file.filename}")
                deleted_count += 1
            except Exception as e:
                print(f"Failed to delete {file.filename}: {e}")

    print(f"Cleanup completed. Deleted {deleted_count} files.")

# Clean up files older than 30 days
cleanup_old_files(purpose="fine-tune", older_than_days=30)

# File synchronization utility
def sync_files_with_local(local_dir: str, purpose: str):
    """Sync local directory with OpenAI files"""

    local_path = Path(local_dir)

    # Get remote files
    remote_files = client.files.list(purpose=purpose)
    remote_filenames = {f.filename: f for f in remote_files}

    # Get local files
    local_files = list(local_path.glob("*.jsonl"))
    local_filenames = {f.name for f in local_files}

    # Files to upload (in local but not remote)
    to_upload = local_filenames - set(remote_filenames.keys())

    # Files to download (in remote but not local)
    to_download = set(remote_filenames.keys()) - local_filenames

    print(f"Files to upload: {len(to_upload)}")
    print(f"Files to download: {len(to_download)}")

    # Upload missing files
    for filename in to_upload:
        file_path = local_path / filename

        with open(file_path, "rb") as f:
            file_response = client.files.create(
                file=f,
                purpose=purpose
            )

        print(f"Uploaded: {filename} -> {file_response.id}")

    # Download missing files
    for filename in to_download:
        remote_file = remote_filenames[filename]

        file_content = client.files.content(remote_file.id)

        local_file_path = local_path / filename
        with open(local_file_path, "wb") as f:
            f.write(file_content.content)

        print(f"Downloaded: {filename}")

# Sync local training data
sync_files_with_local("./training_data/", "fine-tune")
```

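The library also exposes an `AsyncOpenAI` client with the same `files` methods, so concurrent uploads can be written with `asyncio` instead of a thread pool. A sketch under that assumption, reusing the file names from above:

```python
import asyncio
from typing import List

from openai import AsyncOpenAI

async_client = AsyncOpenAI()

async def upload_files_async(file_paths: List[str], purpose: str):
    """Upload several files concurrently with the async client."""

    async def upload_one(path: str):
        # The async client accepts the same FileTypes values as the sync one.
        with open(path, "rb") as f:
            return await async_client.files.create(file=f, purpose=purpose)

    return await asyncio.gather(*(upload_one(p) for p in file_paths))

results = asyncio.run(upload_files_async(["dataset_1.jsonl", "dataset_2.jsonl"], "fine-tune"))
for file_obj in results:
    print(f"{file_obj.filename}: {file_obj.id}")
```
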
### File Validation and Processing

Validate file formats and content before upload for different purposes.

Usage examples:

```python
import json
from typing import Any, Dict

def validate_jsonl_file(file_path: str) -> Dict[str, Any]:
    """Validate JSONL file for fine-tuning"""

    validation_result = {
        "valid": True,
        "errors": [],
        "warnings": [],
        "line_count": 0,
        "sample_lines": []
    }

    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            for line_num, line in enumerate(f, 1):
                line = line.strip()
                if not line:
                    continue

                validation_result["line_count"] += 1

                try:
                    data = json.loads(line)

                    # Validate required fields for chat format
                    if "messages" not in data:
                        validation_result["errors"].append(
                            f"Line {line_num}: Missing 'messages' field"
                        )
                        validation_result["valid"] = False

                    # Store sample
                    if len(validation_result["sample_lines"]) < 3:
                        validation_result["sample_lines"].append(data)

                except json.JSONDecodeError as e:
                    validation_result["errors"].append(
                        f"Line {line_num}: Invalid JSON - {e}"
                    )
                    validation_result["valid"] = False

    except FileNotFoundError:
        validation_result["errors"].append("File not found")
        validation_result["valid"] = False
    except Exception as e:
        validation_result["errors"].append(f"Error reading file: {e}")
        validation_result["valid"] = False

    # Warnings
    if validation_result["line_count"] < 10:
        validation_result["warnings"].append(
            "File has fewer than 10 examples (recommended minimum)"
        )

    return validation_result

# Validate before upload
file_path = "training_data.jsonl"
validation = validate_jsonl_file(file_path)

if validation["valid"]:
    print(f"✓ File is valid ({validation['line_count']} lines)")

    # Upload validated file
    with open(file_path, "rb") as f:
        file_response = client.files.create(
            file=f,
            purpose="fine-tune"
        )

    print(f"Uploaded: {file_response.id}")

else:
    print("✗ File validation failed:")
    for error in validation["errors"]:
        print(f"  Error: {error}")

for warning in validation["warnings"]:
    print(f"  Warning: {warning}")

# File format converter
def convert_csv_to_jsonl(csv_path: str, output_path: str,
                         input_col: str, output_col: str):
    """Convert CSV to JSONL for fine-tuning"""

    import csv

    with open(csv_path, 'r', encoding='utf-8', newline='') as csv_file, \
         open(output_path, 'w', encoding='utf-8') as jsonl_file:

        reader = csv.DictReader(csv_file)

        for row in reader:
            # Create chat format
            example = {
                "messages": [
                    {"role": "user", "content": row[input_col]},
                    {"role": "assistant", "content": row[output_col]}
                ]
            }

            jsonl_file.write(json.dumps(example) + '\n')

    print(f"Converted {csv_path} to {output_path}")

# Convert and upload
convert_csv_to_jsonl(
    "training_data.csv",
    "training_data.jsonl",
    "question",
    "answer"
)

# Validate and upload converted file
validation = validate_jsonl_file("training_data.jsonl")
if validation["valid"]:
    with open("training_data.jsonl", "rb") as f:
        file_response = client.files.create(
            file=f,
            purpose="fine-tune"
        )
    print(f"Uploaded converted file: {file_response.id}")
```

## Types

### Core Response Types

```python { .api }
class FileObject(BaseModel):
    id: str
    bytes: int
    created_at: int
    filename: str
    object: Literal["file"]
    purpose: FilePurpose
    status: Literal["uploaded", "processed", "error"]
    status_details: Optional[str]

class FileDeleted(BaseModel):
    id: str
    deleted: bool
    object: Literal["file"]

# Binary file-content response; wraps the underlying httpx response
class HttpxBinaryResponseContent:
    content: bytes
    def write_to_file(self, file: str | os.PathLike[str]) -> None: ...

# File expiration settings
ExpiresAfter = TypedDict('ExpiresAfter', {
    'anchor': Literal["created_at"],
    'seconds': int,
}, total=False)
```

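`HttpxBinaryResponseContent` exposes the raw bytes as `.content` and can write itself to disk; a short sketch using its `write_to_file()` helper:

```python
# Download a file and persist it without handling the bytes manually.
response = client.files.content("file-abc123")
response.write_to_file("downloaded_file.jsonl")

# Or work with the bytes directly.
raw: bytes = response.content
print(f"Downloaded {len(raw)} bytes")
```
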
### Parameter Types

```python { .api }
# File upload parameters
FileCreateParams = TypedDict('FileCreateParams', {
    'file': Required[FileTypes],
    'purpose': Required[FilePurpose],
    'expires_after': NotRequired[ExpiresAfter],
    'extra_headers': NotRequired[Headers],
    'extra_query': NotRequired[Query],
    'extra_body': NotRequired[Body],
    'timeout': NotRequired[float],
}, total=False)

# File list parameters
FileListParams = TypedDict('FileListParams', {
    'after': NotRequired[str],
    'limit': NotRequired[int],
    'order': NotRequired[Literal["asc", "desc"]],
    'purpose': NotRequired[str],
    'extra_headers': NotRequired[Headers],
    'extra_query': NotRequired[Query],
    'extra_body': NotRequired[Body],
    'timeout': NotRequired[float],
}, total=False)

# File purpose enumeration
FilePurpose = Literal[
    "assistants",
    "batch",
    "fine-tune",
    "vision",
    "user_data",
    "evals"
]

# File types for upload
FileTypes = Union[
    bytes,               # Raw file bytes
    IO[bytes],           # File-like object opened in binary mode
    os.PathLike[str],    # Path object pointing at a file on disk
    Tuple[str, bytes],   # (filename, contents) pair; further tuple forms are also accepted
]
```

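Because `FileTypes` also accepts a `(filename, contents)` tuple, in-memory data can be uploaded under an explicit filename. A sketch (the byte string is illustrative):

```python
raw_bytes = b'{"messages": [{"role": "user", "content": "hi"}]}\n'

# The tuple form sets the filename the service records for the upload.
in_memory_file = client.files.create(
    file=("generated_data.jsonl", raw_bytes),
    purpose="fine-tune",
)
print(in_memory_file.filename)  # "generated_data.jsonl"
```
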
### File Status and Metadata

```python { .api }
# File status enumeration
FileStatus = Literal["uploaded", "processed", "error"]

# File metadata structure
class FileMetadata(BaseModel):
    id: str
    filename: str
    size_bytes: int
    upload_timestamp: int
    purpose: FilePurpose
    status: FileStatus
    error_details: Optional[str]

# Purpose-specific requirements
class FilePurposeRequirements:
    fine_tune = {
        "formats": [".jsonl"],
        "max_size_mb": 100,
        "required_fields": ["messages"],
        "min_examples": 10
    }

    assistants = {
        "formats": [".c", ".cpp", ".csv", ".docx", ".html", ".java",
                    ".json", ".md", ".pdf", ".php", ".pptx", ".py",
                    ".rb", ".tex", ".txt", ".css", ".js", ".sh", ".ts"],
        "max_size_mb": 512,
        "max_tokens": 2000000,  # 2 million tokens
        "max_files_per_assistant": 20
    }

    batch = {
        "formats": [".jsonl"],
        "max_size_mb": 200,
        "max_requests": 50000
    }

    vision = {
        "formats": [".png", ".jpg", ".jpeg", ".gif", ".webp"],
        "max_size_mb": 20,
        "max_resolution": "2048x2048"
    }

    user_data = {
        "formats": [".*"],  # Flexible format support
        "max_size_mb": 512,
        "description": "Flexible file type for any purpose"
    }

    evals = {
        "formats": [".jsonl", ".json", ".csv"],
        "max_size_mb": 100,
        "description": "Used for evaluation data sets"
    }
```

### Configuration and Limits

```python { .api }
# Global file limits
class FileLimits:
    max_file_size: int = 512 * 1024 * 1024  # 512MB per file
    max_organization_storage: int = 1024 * 1024 * 1024 * 1024  # 1TB total

    # Purpose-specific limits
    fine_tune_max_size: int = 100 * 1024 * 1024   # 100MB
    assistant_max_size: int = 512 * 1024 * 1024   # 512MB
    batch_max_size: int = 200 * 1024 * 1024       # 200MB
    vision_max_size: int = 20 * 1024 * 1024       # 20MB
    user_data_max_size: int = 512 * 1024 * 1024   # 512MB
    evals_max_size: int = 100 * 1024 * 1024       # 100MB

    # Default expiration policies
    default_expiration = {
        "batch": 30,    # 30 days for batch files
        "other": None   # No expiration for other purposes
    }

    # Pagination limits
    list_limit_max: int = 10000
    list_limit_default: int = 10000

    # Processing timeouts
    wait_for_processing_default: float = 30 * 60  # 30 minutes
    poll_interval_default: float = 5.0  # 5 seconds
```

## Best Practices

### File Preparation

- Validate file format and content before upload
- Use appropriate file extensions for each purpose
- Ensure files are within size limits for their intended use
- Use UTF-8 encoding for text files
- Test with small files before uploading large datasets

### Fine-tuning Files

- Use JSONL format with proper message structure (see the sketch after this list)
- Include diverse examples covering your use cases
- Aim for at least 50-100 high-quality examples
- Balance your dataset to avoid bias
- Validate JSON structure before upload

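A minimal sketch of that chat-format structure, writing two illustrative examples to a JSONL file:

```python
import json

# Each JSONL line is one training example in chat format:
# a "messages" list of role/content turns.
examples = [
    {"messages": [
        {"role": "system", "content": "You answer support questions."},
        {"role": "user", "content": "How do I reset my password?"},
        {"role": "assistant", "content": "Open Settings > Security and choose Reset Password."},
    ]},
    {"messages": [
        {"role": "user", "content": "What are your hours?"},
        {"role": "assistant", "content": "We are open 9am-5pm, Monday to Friday."},
    ]},
]

with open("training_data.jsonl", "w", encoding="utf-8") as f:
    for example in examples:
        f.write(json.dumps(example) + "\n")
```
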
### Assistant Files

- Organize content logically for better retrieval
- Use clear, descriptive filenames
- Chunk large documents appropriately
- Consider file format compatibility with retrieval
- Update files when source content changes

### File Management

- Use the `wait_for_processing()` method for files that need processing
- Implement proper cleanup procedures for old files
- Set appropriate expiration policies using the `expires_after` parameter
- Monitor file usage and storage limits (1TB organization total)
- Use descriptive filenames for easy identification
- Keep local backups of important files
- Track file IDs and metadata for your applications
- Use pagination parameters (`after`, `limit`, `order`) for large file lists

### Security and Privacy

- Review file content before upload
- Be aware of data retention policies
- Use appropriate file permissions and access controls
- Consider encryption for sensitive local files
- Regularly audit uploaded files and their usage