0
# Document Operations
1
2
Core document handling for opening, saving, and managing PDF and other document formats. PyMuPDF supports a wide range of document types including PDF, XPS, EPUB, MOBI, CBZ, and SVG files.
3
4
## Capabilities
5
6
### Opening Documents
7
8
Open documents from files, bytes, or streams with automatic format detection or explicit format specification.
9
10
```python { .api }
11
# Note: open() is an alias for the Document constructor
12
open = Document
13
```
14
15
### Document Class
16
17
Main document container with comprehensive document management capabilities.
18
19
```python { .api }
20
class Document:
21
def __init__(self, filename: str = None, stream: bytes = None, filetype: str = None,
22
rect: Rect = None, width: int = 0, height: int = 0, fontsize: int = 11):
23
"""
24
Create document object. Use open() as a synonym.
25
26
Parameters:
27
- filename: path to document file, or None for new document
28
- stream: document content as bytes
29
- filetype: explicit file type ('pdf', 'xps', 'epub', etc.)
30
- rect: Rect giving the desired page size (only used for reflowable documents)
31
- width: page width for reflowable documents
32
- height: page height for reflowable documents
33
- fontsize: font size for reflowable documents
34
"""
35
36
def save(self, filename: str, **kwargs) -> None:
37
"""
38
Save document to file.
39
40
Parameters:
41
- filename: output file path
42
- garbage: remove unused objects (0-4, default 0)
43
- clean: clean and sanitize document content
44
- deflate: compress uncompressed streams
45
- deflate_images: compress images
46
- deflate_fonts: compress fonts
47
- incremental: save incrementally (faster for small changes)
48
- ascii: write in ASCII mode
49
- expand: decompress streams
50
- linear: create linearized PDF
51
- permissions: set document permissions
52
- encryption: encryption method, one of the PDF_ENCRYPT_* constants (e.g. pymupdf.PDF_ENCRYPT_AES_256)
53
- owner_pw: owner password
54
- user_pw: user password
55
"""
56
57
def saveIncr(self) -> None:
58
"""Save document incrementally (in-place)."""
59
60
def close(self) -> None:
61
"""Close document and free memory."""
62
63
def load_page(self, page_num: int) -> Page:
64
"""
65
Load a specific page by number.
66
67
Parameters:
68
- page_num: zero-based page number
69
70
Returns:
71
Page object
72
"""
73
74
def new_page(self, pno: int = -1, width: float = 595, height: float = 842) -> Page:
75
"""
76
Create a new page.
77
78
Parameters:
79
- pno: insertion point (-1 for append)
80
- width: page width in points
81
- height: page height in points
82
83
Returns:
84
New Page object
85
"""
86
87
def delete_page(self, pno: int) -> None:
88
"""
89
Delete a page.
90
91
Parameters:
92
- pno: page number to delete
93
"""
94
95
def copy_page(self, pno: int, to: int = -1) -> None:
96
"""
97
Copy a page within the document.
98
99
Parameters:
100
- pno: source page number
101
- to: target position (-1 for append)
102
"""
103
104
def move_page(self, pno: int, to: int) -> None:
105
"""
106
Move a page to different position.
107
108
Parameters:
109
- pno: source page number
110
- to: target position
111
"""
112
113
def insert_pdf(self, docsrc: Document, from_page: int = 0, to_page: int = -1,
114
start_at: int = -1, rotate: int = -1, links: bool = True,
115
annots: bool = True, show_progress: int = 0, final: bool = True) -> int:
116
"""
117
Insert pages from another PDF document.
118
119
Parameters:
120
- docsrc: source Document object
121
- from_page: first source page (0-based)
122
- to_page: last source page (-1 for last)
123
- start_at: insertion point (-1 for append)
124
- rotate: rotation angle applied to the inserted pages (0, 90, 180, 270); -1 keeps the source rotation
125
- links: copy links
126
- annots: copy annotations
127
- show_progress: print a progress message after every this many inserted pages (0 = no messages)
128
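- final: drop the internal list of already-copied objects afterwards; pass False for all but the last of several insertions from the same source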
129
130
Returns:
131
Number of pages inserted
132
"""
133
134
def authenticate(self, password: str) -> int:
135
"""
136
Authenticate encrypted document.
137
138
Parameters:
139
- password: document password
140
141
Returns:
142
Authentication result: 0 = failed; positive = success (1 = no password needed, 2 = user password, 4 = owner password, 6 = both passwords are equal)
143
"""
144
145
@property
146
def page_count(self) -> int:
147
"""Number of pages in document."""
148
149
@property
150
def metadata(self) -> dict:
151
"""Document metadata dictionary."""
152
153
def set_metadata(self, m: dict) -> None:
154
"""
155
Set document metadata.
156
157
Parameters:
158
- m: metadata dictionary with keys like 'title', 'author', 'subject', 'creator', etc.
159
"""
160
161
@property
162
def needs_pass(self) -> bool:
163
"""True if document requires password authentication."""
164
165
@property
166
def is_encrypted(self) -> bool:
167
"""True if document is encrypted."""
168
169
@property
170
def is_pdf(self) -> bool:
171
"""True if document is PDF format."""
172
173
@property
174
def is_form_pdf(self) -> bool:
175
"""True if PDF contains interactive forms."""
176
177
@property
178
def is_reflowable(self) -> bool:
179
"""True if document has reflowable layout (EPUB, etc.)."""
180
181
@property
182
def is_closed(self) -> bool:
183
"""True if document has been closed."""
184
185
@property
186
def name(self) -> str:
187
"""Document filename or '<new document>' for new documents."""
188
189
def can_save_incrementally(self) -> bool:
190
"""True if document can be saved incrementally."""
191
192
def chapter_count(self) -> int:
193
"""Number of chapters (for EPUB documents)."""
194
195
def last_location(self) -> tuple:
196
"""Last location tuple for reflowable documents."""
197
198
def next_location(self, location: tuple) -> tuple:
199
"""
200
Next location after given location.
201
202
Parameters:
203
- location: current location tuple
204
205
Returns:
206
Next location tuple
207
"""
208
209
def previous_location(self, location: tuple) -> tuple:
210
"""
211
Previous location before given location.
212
213
Parameters:
214
- location: current location tuple
215
216
Returns:
217
Previous location tuple
218
"""
219
220
def page_xref(self, pno: int) -> int:
221
"""
222
Get PDF cross-reference number for page.
223
224
Parameters:
225
- pno: page number
226
227
Returns:
228
Cross-reference number
229
"""
230
```
231
232
### Table of Contents Operations
233
234
Manage document bookmarks and navigation structure.
235
236
```python { .api }
237
def get_toc(self, simple: bool = True) -> list:
238
"""
239
Get table of contents.
240
241
Parameters:
242
- simple: return simple format (default) or detailed format
243
244
Returns:
245
List of [level, title, page] entries; with simple=False each entry also carries a destination dict: [level, title, page, dest]
246
"""
247
248
def set_toc(self, toc: list, collapse: int = 1) -> int:
249
"""
250
Set table of contents.
251
252
Parameters:
253
- toc: table of contents list
254
- collapse: outline entries deeper than this level are initially shown collapsed (0 or None expands everything)
255
256
Returns:
257
Number of items processed
258
"""
259
```
260
261
### Embedded Files Operations
262
263
Handle files embedded within documents.
264
265
```python { .api }
266
def embeddedFileNames(self) -> list:
267
"""
268
Get list of embedded file names.
269
270
Returns:
271
List of embedded file names
272
"""
273
274
def embeddedFileGet(self, name: str) -> bytes:
275
"""
276
Extract embedded file content.
277
278
Parameters:
279
- name: embedded file name
280
281
Returns:
282
File content as bytes
283
"""
284
285
def embeddedFileAdd(self, name: str, buffer: typing.Union[str, bytes],
286
filename: str = None, ufilename: str = None,
287
desc: str = None) -> None:
288
"""
289
Add embedded file to document.
290
291
Parameters:
292
- name: reference name for the file
293
- buffer: file content
294
- filename: original filename
295
- ufilename: unicode filename
296
- desc: file description
297
"""
298
299
def embeddedFileDel(self, name: str) -> None:
300
"""
301
Delete embedded file.
302
303
Parameters:
304
- name: embedded file name to delete
305
"""
306
```
307
308
## Usage Examples
309
310
### Basic Document Operations
311
312
```python
313
import pymupdf
314
315
# Open document
316
doc = pymupdf.open("input.pdf")
317
318
# Check if password required
319
if doc.needs_pass:
320
    success = doc.authenticate("password")
321
    if not success:
322
        raise ValueError("Invalid password")
323
324
# Get basic info
325
print(f"Pages: {doc.page_count}")
326
print(f"Metadata: {doc.metadata}")
327
328
# Save with compression
329
doc.save("output.pdf", garbage=4, deflate=True)
330
doc.close()
331
```
332
333
### Document Merging
334
335
```python
336
import pymupdf
337
338
# Open target document
339
target_doc = pymupdf.open("target.pdf")
340
341
# Open source document
342
source_doc = pymupdf.open("source.pdf")
343
344
# Insert all pages from source
345
target_doc.insert_pdf(source_doc)
346
347
# Save merged document
348
target_doc.save("merged.pdf")
349
350
# Clean up
351
target_doc.close()
352
source_doc.close()
353
```
354
355
### Creating New Documents
356
357
```python
358
import pymupdf
359
360
# Create new document
361
doc = pymupdf.open()
362
363
# Add pages
364
page1 = doc.new_page()
365
page2 = doc.new_page(width=792, height=612) # Letter size landscape
366
367
# Set metadata
368
doc.set_metadata({
369
"title": "My Document",
370
"author": "Author Name",
371
"subject": "Document Subject",
372
"creator": "PyMuPDF"
373
})
374
375
# Save new document
376
doc.save("new_document.pdf")
377
doc.close()
378
```