0
# PDF Objects and Data Types
1
2
This document describes the PDF object types and data structures that form the foundation of PDF content representation. These classes provide the building blocks for manipulating PDF data at the object level.
3
4
## Capabilities
5
6
### Base Object Class
7
8
`Object` is the fundamental PDF object type from which all other PDF objects inherit; it provides common functionality for object manipulation and ownership.
9
10
```python { .api }
11
class Object:
12
"""
13
Universal PDF object type representing any PDF data structure.
14
15
All PDF objects (arrays, dictionaries, names, etc.) derive from this class.
16
"""
17
18
def is_owned_by(self, possible_owner: Pdf) -> bool:
19
"""
20
Check if this object is owned by a specific PDF.
21
22
Parameters:
23
- possible_owner (Pdf): PDF to check ownership against
24
25
Returns:
26
bool: True if this object belongs to the specified PDF
27
"""
28
29
def same_owner_as(self, other: Object) -> bool:
30
"""
31
Check if this object has the same owner as another object.
32
33
Parameters:
34
- other (Object): Object to compare ownership with
35
36
Returns:
37
bool: True if both objects have the same owner
38
"""
39
40
def with_same_owner_as(self, other: Object) -> Object:
41
"""
42
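Return an object owned by the same PDF as another object; this object itself is returned if it already has that owner, otherwise a copy owned by the other object's PDF is made.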
43
44
Parameters:
45
- other (Object): Object whose owner should be used
46
47
Returns:
48
Object: Copy of this object with the same owner as other
49
50
Raises:
51
ForeignObjectError: If objects cannot be made compatible
52
"""
53
54
@staticmethod
55
def parse(data: str, *, pdf_context: Pdf = None) -> Object:
56
"""
57
Parse a string representation of PDF data into an Object.
58
59
Parameters:
60
- data (str): String containing PDF object data
61
- pdf_context (Pdf, optional): PDF context for parsing
62
63
Returns:
64
Object: Parsed PDF object
65
66
Raises:
67
PdfError: If the data cannot be parsed
68
"""
69
70
def unparse(self, *, resolved: bool = False) -> str:
71
"""
72
Convert the object back to its string representation.
73
74
Parameters:
75
- resolved (bool): Whether to resolve indirect references
76
77
Returns:
78
str: String representation of the object
79
"""
80
81
@property
82
def _type_code(self) -> ObjectType:
83
"""
84
The object's type code.
85
86
Returns:
87
ObjectType: Enumeration value indicating the object type
88
"""
89
90
@property
91
def is_indirect(self) -> bool:
92
"""
93
Whether this is an indirect object.
94
95
Returns:
96
bool: True if this is an indirect object reference
97
"""
98
99
@property
100
def objgen(self) -> tuple[int, int]:
101
"""
102
Object and generation numbers for indirect objects.
103
104
Returns:
105
tuple[int, int]: (object_number, generation_number) or (0, 0) for direct objects
106
"""
107
```
108
109
### Array Objects
110
111
PDF arrays represent ordered collections of PDF objects, similar to Python lists.
112
113
```python { .api }
114
class Array(Object):
115
"""
116
PDF array object representing an ordered list of PDF objects.
117
118
Behaves like a Python list with additional PDF-specific functionality.
119
"""
120
121
def __init__(self, iterable=None) -> None:
122
"""
123
Create a new PDF array.
124
125
Parameters:
126
- iterable (optional): Initial objects to populate the array
127
"""
128
129
def __len__(self) -> int:
130
"""Return the number of elements in the array."""
131
132
def __getitem__(self, index: int) -> Object:
133
"""Get an element by index."""
134
135
def __setitem__(self, index: int, value: Object) -> None:
136
"""Set an element at the given index."""
137
138
def append(self, obj: Object) -> None:
139
"""
140
Add an object to the end of the array.
141
142
Parameters:
143
- obj (Object): Object to append
144
"""
145
146
def extend(self, iterable) -> None:
147
"""
148
Extend the array with objects from an iterable.
149
150
Parameters:
151
- iterable: Objects to add to the array
152
"""
153
154
def insert(self, index: int, obj: Object) -> None:
155
"""
156
Insert an object at the specified index.
157
158
Parameters:
159
- index (int): Position to insert at
160
- obj (Object): Object to insert
161
"""
162
```
163
164
### Dictionary Objects
165
166
PDF dictionaries represent key-value mappings where keys are Name objects and values are any PDF objects.
167
168
```python { .api }
169
class Dictionary(Object):
170
"""
171
PDF dictionary object representing key-value mappings.
172
173
Keys must be Name objects, values can be any PDF objects.
174
Behaves like a Python dictionary with PDF-specific enhancements.
175
"""
176
177
def __init__(self, mapping=None, **kwargs) -> None:
178
"""
179
Create a new PDF dictionary.
180
181
Parameters:
182
- mapping (optional): Initial key-value pairs
183
- **kwargs: Additional key-value pairs (keys converted to Names)
184
"""
185
186
def __getitem__(self, key) -> Object:
187
"""Get a value by key (key can be str or Name)."""
188
189
def __setitem__(self, key, value: Object) -> None:
190
"""Set a key-value pair (key converted to Name if needed)."""
191
192
def __contains__(self, key) -> bool:
193
"""Check if key exists in dictionary."""
194
195
def __len__(self) -> int:
196
"""Return number of key-value pairs."""
197
198
def keys(self):
199
"""Return dictionary keys as Name objects."""
200
201
def values(self):
202
"""Return dictionary values."""
203
204
def items(self):
205
"""Return key-value pairs."""
206
207
def get(self, key, default=None) -> Object:
208
"""
209
Get a value with optional default.
210
211
Parameters:
212
- key: Dictionary key (str or Name)
213
- default: Default value if key not found
214
215
Returns:
216
Object: Value associated with key, or default
217
"""
218
```
219
220
### Name Objects
221
222
PDF names are atomic identifiers used as dictionary keys and as constant values throughout a PDF.
223
224
```python { .api }
225
class Name(Object):
226
"""
227
PDF name object representing an immutable identifier.
228
229
Names are used as dictionary keys and PDF constants.
230
Supports both string construction and attribute-style access.
231
"""
232
233
def __init__(self, name_string: str) -> None:
234
"""
235
Create a PDF name from a string.
236
237
Parameters:
238
- name_string (str): The name as a string, including the leading slash (e.g. '/CustomAttribute')
239
"""
240
241
def __str__(self) -> str:
242
"""Return the name as a string, including the leading slash (e.g. '/Type')."""
243
244
def __repr__(self) -> str:
245
"""Return full representation including leading slash."""
246
247
def __eq__(self, other) -> bool:
248
"""Compare names for equality."""
249
250
def __hash__(self) -> int:
251
"""Return hash for use as dictionary key."""
252
253
# Name constants can be accessed as attributes
254
# Example: Name.Type, Name.Font, Name.Contents
255
```
256
257
### String Objects
258
259
PDF strings can contain text or binary data with proper encoding handling.
260
261
```python { .api }
262
class String(Object):
263
"""
264
PDF string object for text or binary data.
265
266
Handles PDF string encoding including literal strings and hex strings.
267
"""
268
269
def __init__(self, str_or_bytes) -> None:
270
"""
271
Create a PDF string from text or bytes.
272
273
Parameters:
274
- str_or_bytes (str | bytes): String content
275
"""
276
277
def __str__(self) -> str:
278
"""Return string content as text."""
279
280
def __bytes__(self) -> bytes:
281
"""Return string content as bytes."""
282
283
def __len__(self) -> int:
284
"""Return length of string content."""
285
286
@property
287
def for_pdf(self) -> str:
288
"""
289
String representation suitable for PDF output.
290
291
Returns:
292
str: Properly escaped string for PDF files
293
"""
294
```
295
296
### Stream Objects
297
298
PDF streams contain both a dictionary of metadata and binary data content.
299
300
```python { .api }
301
class Stream(Object):
302
"""
303
PDF stream object containing dictionary metadata and binary data.
304
305
Streams are used for page content, images, fonts, and other binary data.
306
"""
307
308
def __init__(self, owner: Pdf, data=None, dict=None, **kwargs) -> None:
309
"""
310
Create a new PDF stream.
311
312
Parameters:
313
- owner (Pdf): PDF that will own this stream
314
- data (bytes, optional): Stream data content
315
- dict (Dictionary, optional): Stream dictionary
316
- **kwargs: Additional dictionary entries
317
"""
318
319
@property
320
def dictionary(self) -> Dictionary:
321
"""
322
The stream's dictionary containing metadata.
323
324
Returns:
325
Dictionary: Stream metadata and parameters
326
"""
327
328
def read_bytes(self) -> bytes:
329
"""
330
Read the stream's data as bytes.
331
332
Returns:
333
bytes: Decoded stream data
334
335
Raises:
336
DataDecodingError: If stream cannot be decoded
337
"""
338
339
def read_raw_bytes(self) -> bytes:
340
"""
341
Read the stream's raw data exactly as stored, without applying any decoding filters.
342
343
Returns:
344
bytes: Raw stream data without decoding filters
345
"""
346
347
def write(self, data: bytes, *, filter=None, decode_parms=None) -> None:
348
"""
349
Write data to the stream.
350
351
Parameters:
352
- data (bytes): Data to write
353
- filter (optional): Filter(s) with which data is already encoded; the data is stored as given and is not compressed by this call
354
- decode_parms (optional): Filter parameters
355
"""
356
```
357
358
### Operator Objects
359
360
PDF operators represent content stream commands. An Operator holds only the command name; its operands are separate objects that precede it in the content stream.
361
362
```python { .api }
363
class Operator(Object):
364
"""
365
PDF content stream operator.
366
367
Represents commands in PDF content streams like 'Tj' (show text) or 'l' (line to).
368
"""
369
370
def __init__(self, name: str) -> None:
371
"""
372
Create a PDF operator.
373
374
Parameters:
375
- name (str): Operator name (e.g., 'Tj', 'cm', 'Do')
376
"""
377
378
def __str__(self) -> str:
379
"""Return operator name."""
380
381
def __repr__(self) -> str:
382
"""Return full representation."""
383
```
384
385
### Object Type Enumeration
386
387
Enumeration of all possible PDF object types for type checking and identification.
388
389
```python { .api }
390
from enum import Enum
391
392
class ObjectType(Enum):
393
"""Enumeration of PDF object types."""
394
uninitialized = ... # Uninitialized object
395
reserved = ... # Reserved type
396
null = ... # Null object
397
boolean = ... # Boolean true/false
398
integer = ... # Integer number
399
real = ... # Real (floating-point) number
400
string = ... # String object
401
name_ = ... # Name object (trailing underscore avoids clashing with the enum member attribute 'name')
402
array = ... # Array object
403
dictionary = ... # Dictionary object
404
stream = ... # Stream object
405
operator = ... # Content stream operator
406
inlineimage = ... # Inline image
407
```
408
409
## Usage Examples
410
411
### Working with Arrays
412
413
```python
414
import pikepdf
415
416
pdf = pikepdf.new()
417
418
# Create an array
419
arr = pikepdf.Array([1, 2, 3])
420
421
# Add elements
422
arr.append(pikepdf.String("hello"))
423
arr.extend([pikepdf.Name.Type, pikepdf.Name.Font])
424
425
# Access elements
426
first = arr[0] # Integer 1
427
last = arr[-1] # Name(/Font)
428
429
# Use in dictionary
430
dict_obj = pikepdf.Dictionary({
431
'/Contents': arr,
432
'/Type': pikepdf.Name.Page
433
})
434
```
435
436
### Working with Dictionaries
437
438
```python
439
import pikepdf
440
441
# Create a dictionary
442
page_dict = pikepdf.Dictionary({
443
'/Type': pikepdf.Name.Page,
444
'/MediaBox': pikepdf.Array([0, 0, 612, 792]),
445
'/Resources': pikepdf.Dictionary()
446
})
447
448
# Access values
449
page_type = page_dict['/Type'] # Name(/Page)
450
media_box = page_dict['/MediaBox'] # Array
451
452
# Add new entries
453
page_dict['/Rotate'] = 90
454
page_dict['/Contents'] = pikepdf.Array()
455
456
# Check for keys
457
if '/Resources' in page_dict:
458
resources = page_dict['/Resources']
459
```
460
461
### Working with Names
462
463
```python
464
import pikepdf
465
466
# Create names
467
type_name = pikepdf.Name.Type
468
page_name = pikepdf.Name.Page
469
custom_name = pikepdf.Name('/CustomAttribute')
470
471
# Names can be compared
472
if type_name == pikepdf.Name.Type:
473
print("Names are equal")
474
475
# Use in dictionaries
476
metadata = {
477
type_name: page_name,
478
pikepdf.Name.MediaBox: pikepdf.Array([0, 0, 612, 792])
479
}
480
```
481
482
### Working with Strings
483
484
```python
485
import pikepdf
486
487
# Create strings
488
title = pikepdf.String("Document Title")
489
binary_data = pikepdf.String(b'\x00\x01\x02\x03')
490
491
# Convert between representations
492
text_content = str(title) # "Document Title"
493
byte_content = bytes(binary_data) # b'\x00\x01\x02\x03'
494
495
# Use in document info
496
pdf = pikepdf.new()
497
pdf.docinfo['/Title'] = title
498
pdf.docinfo['/Author'] = pikepdf.String("Jane Doe")
499
```
500
501
### Working with Streams
502
503
```python
504
import pikepdf
505
506
pdf = pikepdf.new()
507
508
# Create a stream with text content
509
content_data = b"BT /F1 12 Tf 100 700 Td (Hello World) Tj ET"
510
content_stream = pikepdf.Stream(pdf, content_data)
511
512
# Set stream properties
513
content_stream.dictionary['/Length'] = len(content_data)
514
515
# Read stream data
516
data = content_stream.read_bytes()
517
raw_data = content_stream.read_raw_bytes()
518
519
# Use stream in a page
520
page = pdf.add_blank_page()
521
page['/Contents'] = content_stream
522
```
523
524
### Object Copying and Ownership
525
526
```python
527
import pikepdf
528
529
# Open two PDFs
530
pdf1 = pikepdf.open('source.pdf')
531
pdf2 = pikepdf.new()
532
533
# Copy object from one PDF to another
534
source_obj = pdf1.pages[0]['/Resources']
535
copied_obj = pdf2.copy_foreign(source_obj)
536
537
# Check ownership
538
assert copied_obj.is_owned_by(pdf2)
539
assert not copied_obj.is_owned_by(pdf1)
540
541
# Make object indirect
542
indirect_obj = pdf2.make_indirect(copied_obj)
543
obj_id, generation = indirect_obj.objgen
544
```