or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

annotations-forms.md, document-creation-modification.md, document-operations.md, document-rendering.md, geometry-transformations.md, index.md, page-content-extraction.md, table-extraction.md

docs/document-operations.md

0

# Document Operations

1

2

Core document handling for opening, saving, and managing PDF and other document formats. PyMuPDF supports a wide range of document types including PDF, XPS, EPUB, MOBI, CBZ, and SVG files.

3

4

## Capabilities

5

6

### Opening Documents

7

8

Open documents from files, bytes, or streams with automatic format detection or explicit format specification.

9

10

```python { .api }

11

# Note: open() is an alias for the Document constructor

12

open = Document

13

```

14

15

### Document Class

16

17

Main document container with comprehensive document management capabilities.

18

19

```python { .api }

20

class Document:

21

def __init__(self, filename: str = None, stream: bytes = None, filetype: str = None,

22

rect: Rect = None, width: int = 0, height: int = 0, fontsize: int = 11):

23

"""

24

Create document object. Use open() as a synonym.

25

26

Parameters:

27

- filename: path to document file, or None for new document

28

- stream: document content as bytes

29

- filetype: explicit file type ('pdf', 'xps', 'epub', etc.)

30

- rect: Rect specifying the desired page size (only meaningful for reflowable documents)

31

- width: page width for reflowable documents

32

- height: page height for reflowable documents

33

- fontsize: font size for reflowable documents

34

"""

35

36

def save(self, filename: str, **kwargs) -> None:

37

"""

38

Save document to file.

39

40

Parameters:

41

- filename: output file path

42

- garbage: remove unused objects (0-4, default 0)

43

- clean: clean and sanitize document content

44

- deflate: compress uncompressed streams

45

- deflate_images: compress images

46

- deflate_fonts: compress fonts

47

- incremental: save incrementally (faster for small changes)

48

- ascii: write in ASCII mode

49

- expand: decompress streams

50

- linear: create linearized PDF

51

- permissions: set document permissions

52

- encryption: encryption method (one of the PDF_ENCRYPT_* constants, e.g. PDF_ENCRYPT_AES_256)

53

- owner_pw: owner password

54

- user_pw: user password

55

"""

56

57

def saveIncr(self) -> None:

58

"""Save document incrementally (in-place)."""

59

60

def close(self) -> None:

61

"""Close document and free memory."""

62

63

def load_page(self, page_num: int) -> Page:

64

"""

65

Load a specific page by number.

66

67

Parameters:

68

- page_num: zero-based page number

69

70

Returns:

71

Page object

72

"""

73

74

def new_page(self, pno: int = -1, width: float = 595, height: float = 842) -> Page:

75

"""

76

Create a new page.

77

78

Parameters:

79

- pno: insertion point (-1 for append)

80

- width: page width in points

81

- height: page height in points

82

83

Returns:

84

New Page object

85

"""

86

87

def delete_page(self, pno: int) -> None:

88

"""

89

Delete a page.

90

91

Parameters:

92

- pno: page number to delete

93

"""

94

95

def copy_page(self, pno: int, to: int = -1) -> None:

96

"""

97

Copy a page within the document.

98

99

Parameters:

100

- pno: source page number

101

- to: target position (-1 for append)

102

"""

103

104

def move_page(self, pno: int, to: int) -> None:

105

"""

106

Move a page to different position.

107

108

Parameters:

109

- pno: source page number

110

- to: target position

111

"""

112

113

def insert_pdf(self, docsrc: Document, from_page: int = 0, to_page: int = -1,

114

start_at: int = -1, rotate: int = -1, links: bool = True,

115

annots: bool = True, show_progress: int = 0, final: bool = True) -> int:

116

"""

117

Insert pages from another PDF document.

118

119

Parameters:

120

- docsrc: source Document object

121

- from_page: first source page (0-based)

122

- to_page: last source page (-1 for last)

123

- start_at: insertion point (-1 for append)

124

- rotate: rotation angle (0, 90, 180, 270)

125

- links: copy links

126

- annots: copy annotations

127

- show_progress: print a progress message to stdout after every this many pages (0 disables messages)

128

- final: finalize operation

129

130

Returns:

131

Number of pages inserted

132

"""

133

134

def authenticate(self, password: str) -> int:

135

"""

136

Authenticate encrypted document.

137

138

Parameters:

139

- password: document password

140

141

Returns:

142

Authentication result: 0 = failed; otherwise a bit field (1 = no password required, 2 = user password, 4 = owner password, 6 = user and owner passwords are equal)

143

"""

144

145

@property

146

def page_count(self) -> int:

147

"""Number of pages in document."""

148

149

@property

150

def metadata(self) -> dict:

151

"""Document metadata dictionary."""

152

153

def set_metadata(self, m: dict) -> None:

154

"""

155

Set document metadata.

156

157

Parameters:

158

- m: metadata dictionary with keys like 'title', 'author', 'subject', 'creator', etc.

159

"""

160

161

@property

162

def needs_pass(self) -> bool:

163

"""True if document requires password authentication."""

164

165

@property

166

def is_encrypted(self) -> bool:

167

"""True if document is encrypted."""

168

169

@property

170

def is_pdf(self) -> bool:

171

"""True if document is PDF format."""

172

173

@property

174

def is_form_pdf(self) -> bool:

175

"""True if PDF contains interactive forms."""

176

177

@property

178

def is_reflowable(self) -> bool:

179

"""True if document has reflowable layout (EPUB, etc.)."""

180

181

@property

182

def is_closed(self) -> bool:

183

"""True if document has been closed."""

184

185

@property

186

def name(self) -> str:

187

"""Document filename or '<new document>' for new documents."""

188

189

def can_save_incrementally(self) -> bool:

190

"""True if document can be saved incrementally."""

191

192

def chapter_count(self) -> int:

193

"""Number of chapters (for EPUB documents)."""

194

195

def last_location(self) -> tuple:

196

"""Last location tuple for reflowable documents."""

197

198

def next_location(self, location: tuple) -> tuple:

199

"""

200

Next location after given location.

201

202

Parameters:

203

- location: current location tuple

204

205

Returns:

206

Next location tuple

207

"""

208

209

def previous_location(self, location: tuple) -> tuple:

210

"""

211

Previous location before given location.

212

213

Parameters:

214

- location: current location tuple

215

216

Returns:

217

Previous location tuple

218

"""

219

220

def page_xref(self, pno: int) -> int:

221

"""

222

Get PDF cross-reference number for page.

223

224

Parameters:

225

- pno: page number

226

227

Returns:

228

Cross-reference number

229

"""

230

```

231

232

### Table of Contents Operations

233

234

Manage document bookmarks and navigation structure.

235

236

```python { .api }

237

def get_toc(self, simple: bool = True) -> list:

238

"""

239

Get table of contents.

240

241

Parameters:

242

- simple: return simple format (default) or detailed format

243

244

Returns:

245

List of [level, title, page] entries; when simple is False each entry has a fourth item, a dict with destination details

246

"""

247

248

def set_toc(self, toc: list, collapse: int = 1) -> int:

249

"""

250

Set table of contents.

251

252

Parameters:

253

- toc: table of contents list

254

- collapse: collapse levels above this number

255

256

Returns:

257

Number of items processed

258

"""

259

```

260

261

### Embedded Files Operations

262

263

Handle files embedded within documents.

264

265

```python { .api }

266

def embeddedFileNames(self) -> list:

267

"""

268

Get list of embedded file names.

269

270

Returns:

271

List of embedded file names

272

"""

273

274

def embeddedFileGet(self, name: str) -> bytes:

275

"""

276

Extract embedded file content.

277

278

Parameters:

279

- name: embedded file name

280

281

Returns:

282

File content as bytes

283

"""

284

285

def embeddedFileAdd(self, name: str, buffer: typing.Union[str, bytes],

286

filename: str = None, ufilename: str = None,

287

desc: str = None) -> None:

288

"""

289

Add embedded file to document.

290

291

Parameters:

292

- name: reference name for the file

293

- buffer: file content

294

- filename: original filename

295

- ufilename: unicode filename

296

- desc: file description

297

"""

298

299

def embeddedFileDel(self, name: str) -> None:

300

"""

301

Delete embedded file.

302

303

Parameters:

304

- name: embedded file name to delete

305

"""

306

```

307

308

## Usage Examples

309

310

### Basic Document Operations

311

312

```python

313

import pymupdf

314

315

# Open document

316

doc = pymupdf.open("input.pdf")

317

318

# Check if password required

319

if doc.needs_pass:

320

success = doc.authenticate("password")

321

if not success:

322

raise ValueError("Invalid password")

323

324

# Get basic info

325

print(f"Pages: {doc.page_count}")

326

print(f"Metadata: {doc.metadata}")

327

328

# Save with compression

329

doc.save("output.pdf", garbage=4, deflate=True)

330

doc.close()

331

```

332

333

### Document Merging

334

335

```python

336

import pymupdf

337

338

# Open target document

339

target_doc = pymupdf.open("target.pdf")

340

341

# Open source document

342

source_doc = pymupdf.open("source.pdf")

343

344

# Insert all pages from source

345

target_doc.insert_pdf(source_doc)

346

347

# Save merged document

348

target_doc.save("merged.pdf")

349

350

# Clean up

351

target_doc.close()

352

source_doc.close()

353

```

354

355

### Creating New Documents

356

357

```python

358

import pymupdf

359

360

# Create new document

361

doc = pymupdf.open()

362

363

# Add pages

364

page1 = doc.new_page()

365

page2 = doc.new_page(width=792, height=612) # Letter size landscape

366

367

# Set metadata

368

doc.set_metadata({

369

"title": "My Document",

370

"author": "Author Name",

371

"subject": "Document Subject",

372

"creator": "PyMuPDF"

373

})

374

375

# Save new document

376

doc.save("new_document.pdf")

377

doc.close()

378

```