or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

attachments.md · cli-tools.md · document-management.md · image-bitmap.md · index.md · page-manipulation.md · page-objects.md · text-processing.md · transformation.md · version-info.md

attachments.mddocs/

0

# File Attachments

1

2

Management of embedded file attachments within PDF documents. The PdfAttachment class provides comprehensive access to file attachment metadata, data extraction, and modification capabilities.

3

4

## Capabilities

5

6

### Attachment Access

7

8

Access and enumerate file attachments within PDF documents.

9

10

```python { .api }

11

# Document-level attachment methods (signatures only; bodies live in the library)
def count_attachments(self) -> int:
    """Get total number of file attachments in document."""

def get_attachment(self, index: int) -> PdfAttachment:
    """
    Get attachment by index.

    Parameters:
    - index: int, attachment index (0-based)

    Returns:
    PdfAttachment: Attachment object
    """

def new_attachment(self, name: str) -> PdfAttachment:
    """
    Create new file attachment.

    Parameters:
    - name: str, attachment filename

    Returns:
    PdfAttachment: New, empty attachment object.
    NOTE(review): pypdfium2 documents this call as adding the attachment
    to the document right away - confirm against the installed version.
    """

def del_attachment(self, index: int) -> None:
    """
    Delete attachment by index.

    Parameters:
    - index: int, attachment index to delete (0-based, as for get_attachment)
    """

44

```

45

46

Basic attachment operations:

47

48

```python

49

import pypdfium2 as pdfium

50

51

pdf = pdfium.PdfDocument("document.pdf")

# Report how many embedded files the document carries
attachment_count = pdf.count_attachments()
print(f"Document has {attachment_count} attachments")

# List each attachment by index and filename; with a count of zero the
# range is empty and nothing further is printed
for i in range(attachment_count):
    print(f"Attachment {i}: {pdf.get_attachment(i).get_name()}")

63

```

64

65

### Attachment Properties

66

67

Access attachment metadata and parent document reference.

68

69

```python { .api }

70

class PdfAttachment:
    """Attachment helper; only the read-only properties are outlined here."""

    @property
    def raw(self) -> FPDF_ATTACHMENT:
        """Underlying PDFium attachment handle, for low-level operations."""

    @property
    def pdf(self) -> PdfDocument:
        """Document that this attachment belongs to."""

78

```

79

80

### File Data Management

81

82

Extract and modify attachment file data.

83

84

```python { .api }

85

def get_name(self) -> str:
    """
    Get attachment filename.

    Returns:
    str: Original filename of the attached file
    """

def get_data(self) -> ctypes.Array:
    """
    Get attachment file data.

    Returns:
    ctypes.Array: Raw file data as ctypes array (pass it to bytes() to
    obtain an ordinary bytes object)
    """

def set_data(self, data) -> None:
    """
    Set attachment file data.

    Parameters:
    - data: bytes or ctypes array containing new file data
    """

108

```

109

110

File data operations:

111

112

```python

113

import os

pdf = pdfium.PdfDocument("document.pdf")

for i in range(pdf.count_attachments()):
    attachment = pdf.get_attachment(i)

    # Get attachment information
    filename = attachment.get_name()
    file_data = attachment.get_data()

    print(f"Attachment: {filename}")
    print(f"Size: {len(file_data)} bytes")

    # The stored name comes from the PDF and is untrusted: keep only its
    # final path component so the write stays in the working directory
    # and does not fail on names that contain separators
    safe_name = os.path.basename(filename) or f"attachment_{i}"

    # Extract attachment to file
    output_path = f"extracted_{safe_name}"
    with open(output_path, "wb") as f:
        f.write(bytes(file_data))

    print(f"Extracted to: {output_path}")

131

```

132

133

### Metadata Management

134

135

Access and modify attachment metadata including custom properties.

136

137

```python { .api }

138

def has_key(self, key: str) -> bool:
    """
    Check if metadata key exists.

    Parameters:
    - key: str, metadata key name

    Returns:
    bool: True if key exists, False otherwise
    """

def get_value_type(self, key: str) -> int:
    """
    Get metadata value type.

    Parameters:
    - key: str, metadata key name

    Returns:
    int: PDFium value type constant
    """

def get_str_value(self, key: str) -> str:
    """
    Get string metadata value.

    Parameters:
    - key: str, metadata key name

    Returns:
    str: Metadata value as string, empty if key doesn't exist
    """

def set_str_value(self, key: str, value: str) -> None:
    """
    Set string metadata value.

    Parameters:
    - key: str, metadata key name
    - value: str, metadata value to set
    """

179

```

180

181

Metadata operations:

182

183

```python

184

pdf = pdfium.PdfDocument("document.pdf")
attachment = pdf.get_attachment(0)

# Common metadata keys
metadata_keys = [
    "Title",         # File title/description
    "Author",        # File author
    "Subject",       # File subject
    "Keywords",      # File keywords
    "Creator",       # Creating application
    "Producer",      # PDF producer
    "CreationDate",  # Creation date
    "ModDate"        # Modification date
]

print(f"Attachment: {attachment.get_name()}")
print("Metadata:")

# Show only the keys this attachment actually defines
for key in metadata_keys:
    if not attachment.has_key(key):
        continue
    value = attachment.get_str_value(key)
    value_type = attachment.get_value_type(key)
    print(f" {key}: {value} (type: {value_type})")

# Set custom metadata
for field, text in (
    ("CustomField", "Custom Value"),
    ("ExtractedBy", "pypdfium2"),
):
    attachment.set_str_value(field, text)

# Verify changes by reading one of the new keys back
if attachment.has_key("CustomField"):
    custom_value = attachment.get_str_value("CustomField")
    print(f"Custom field: {custom_value}")

216

```

217

218

### Creating New Attachments

219

220

Add new file attachments to PDF documents.

221

222

```python

223

def add_file_attachment(pdf, file_path, attachment_name=None, timestamp=None):
    """Add file as attachment to PDF document.

    Parameters:
    - pdf: document object exposing new_attachment(name)
    - file_path: path of the file whose bytes are embedded
    - attachment_name: name to store the attachment under; defaults to
      the base name of file_path
    - timestamp: PDF date string (e.g. "D:20240101120000") written to the
      CreationDate and ModDate keys; defaults to the current local time

    Returns:
    The attachment object returned by pdf.new_attachment()

    Raises:
    OSError: if file_path cannot be opened or read
    """
    import os
    import time

    # Use filename if no attachment name provided
    if attachment_name is None:
        attachment_name = os.path.basename(file_path)

    # Read the file first, so a missing or unreadable path fails before
    # pdf.new_attachment() is ever called
    with open(file_path, "rb") as f:
        file_data = f.read()

    # Stamp the real time of embedding rather than a fixed placeholder date
    if timestamp is None:
        timestamp = time.strftime("D:%Y%m%d%H%M%S")

    # Create new attachment and fill in its data
    attachment = pdf.new_attachment(attachment_name)
    attachment.set_data(file_data)

    # Set metadata
    attachment.set_str_value("Title", attachment_name)
    attachment.set_str_value("CreationDate", timestamp)
    attachment.set_str_value("ModDate", timestamp)

    print(f"Added attachment: {attachment_name} ({len(file_data)} bytes)")

    return attachment

249

250

# Usage
pdf = pdfium.PdfDocument("document.pdf")

# Embed a text file, then an image, each under its own display name
for source_file, display_name in (
    ("readme.txt", "README"),
    ("chart.png", "Chart Image"),
):
    add_file_attachment(pdf, source_file, display_name)

# Save document with new attachments
pdf.save("document_with_attachments.pdf")

261

```

262

263

### Attachment Analysis

264

265

Analyze and report on document attachments.

266

267

```python

268

def analyze_attachments(pdf):
    """Print a report on every attachment in pdf, followed by totals.

    For each attachment the report lists its name, its size, any
    non-empty values among a fixed set of metadata keys, and a file type
    guessed from the leading bytes. Nothing is returned.
    """
    count = pdf.count_attachments()
    if count == 0:
        print("Document contains no attachments")
        return

    print(f"Document contains {count} attachment(s)")

    metadata_keys = ["Title", "Author", "Subject", "CreationDate", "ModDate"]
    # Leading-byte signatures, checked in order; the first match wins
    signatures = (
        (b'\xFF\xD8\xFF', "JPEG image"),
        (b'\x89PNG', "PNG image"),
        (b'%PDF', "PDF document"),
        (b'PK', "ZIP archive or Office document"),
    )

    total_size = 0
    file_types = {}

    for number in range(1, count + 1):
        attachment = pdf.get_attachment(number - 1)

        # Basic information
        name = attachment.get_name()
        data = attachment.get_data()
        size = len(data)
        total_size += size

        # Tally by the lowercased text after the last dot, if there is one
        _, dot, suffix = name.rpartition('.')
        ext = suffix.lower() if dot else 'no_ext'
        file_types.setdefault(ext, 0)
        file_types[ext] += 1

        print(f"\nAttachment {number}: {name}")
        print(f" Size: {size:,} bytes ({size/1024:.1f} KB)")

        # Metadata: the heading is printed lazily, before the first value
        header_shown = False
        for key in metadata_keys:
            if not attachment.has_key(key):
                continue
            value = attachment.get_str_value(key)
            if not value:
                continue
            if not header_shown:
                print(" Metadata:")
                header_shown = True
            print(f" {key}: {value}")
        if not header_shown:
            print(" No metadata found")

        # File type detection (basic)
        head = bytes(data[:16])
        for magic, label in signatures:
            if head.startswith(magic):
                print(f" Detected: {label}")
                break

    # Summary
    print("\nSummary:")
    print(f" Total attachments: {count}")
    print(f" Total size: {total_size:,} bytes ({total_size/1024:.1f} KB)")
    print(f" File types: {file_types}")

330

331

# Usage: print the attachment report for a sample document
source_path = "document.pdf"
pdf = pdfium.PdfDocument(source_path)
analyze_attachments(pdf)

334

```

335

336

### Batch Attachment Processing

337

338

Process multiple attachments efficiently.

339

340

```python

341

def extract_all_attachments(pdf, output_dir):
    """Extract all attachments to specified directory.

    Parameters:
    - pdf: document object exposing count_attachments() and get_attachment(i)
    - output_dir: directory to write into; created if it does not exist

    Returns:
    tuple: (extracted, failed) counts; (0, 0) when the document has no
    attachments

    A failure on one attachment is reported and counted, and extraction
    continues with the next one.
    """
    import os

    os.makedirs(output_dir, exist_ok=True)

    count = pdf.count_attachments()
    if count == 0:
        print("No attachments to extract")
        # Same shape as the normal return, so callers can always unpack
        return 0, 0

    extracted = 0
    failed = 0

    for i in range(count):
        try:
            attachment = pdf.get_attachment(i)
            name = attachment.get_name()
            data = attachment.get_data()

            # Sanitize filename; names made only of dots and spaces
            # (".", "..") would point at a directory, so replace them too
            safe_name = "".join(c for c in name if c.isalnum() or c in "._- ")
            if not safe_name.strip(". "):
                safe_name = f"attachment_{i}"

            # Handle filename conflicts. The extension is split off the
            # file name alone, so a dot in output_dir never alters the
            # directory part of the path
            stem, suffix = os.path.splitext(safe_name)
            output_path = os.path.join(output_dir, safe_name)
            counter = 1
            while os.path.exists(output_path):
                output_path = os.path.join(output_dir, f"{stem}_{counter}{suffix}")
                counter += 1

            # Write file
            with open(output_path, "wb") as f:
                f.write(bytes(data))

            print(f"Extracted: {name} -> {output_path}")
            extracted += 1

        except Exception as e:
            # Deliberate best effort: one bad attachment must not stop the rest
            print(f"Failed to extract attachment {i}: {e}")
            failed += 1

    print(f"\nExtraction complete: {extracted} successful, {failed} failed")
    return extracted, failed

392

393

# Usage: write every attachment of a sample document into one folder
source_path = "document.pdf"
target_dir = "extracted_attachments"
pdf = pdfium.PdfDocument(source_path)
extract_all_attachments(pdf, target_dir)

396

```

397

398

### Attachment Security

399

400

Handle attachment security and validation.

401

402

```python

403

def validate_attachments(pdf, max_size_mb=10, allowed_extensions=None):
    """Check every attachment in pdf against size, extension and header rules.

    Parameters:
    - pdf: document object exposing count_attachments() and get_attachment(i)
    - max_size_mb: largest accepted attachment size, in megabytes
    - allowed_extensions: collection of accepted lowercase extensions with
      the leading dot; a built-in default set is used when None

    Returns:
    bool: True if no issue was found, False otherwise. The issues, or a
    success message, are printed.
    """
    if allowed_extensions is None:
        allowed_extensions = {'.txt', '.pdf', '.jpg', '.png', '.gif', '.doc', '.docx'}

    # Extension -> (required leading bytes, label used in the message)
    expected_headers = {
        '.jpg': (b'\xFF\xD8\xFF', "JPEG"),
        '.jpeg': (b'\xFF\xD8\xFF', "JPEG"),
        '.png': (b'\x89PNG', "PNG"),
        '.pdf': (b'%PDF', "PDF"),
    }
    bytes_per_mb = 1024 * 1024

    count = pdf.count_attachments()
    issues = []

    for i in range(count):
        attachment = pdf.get_attachment(i)
        name = attachment.get_name()
        data = attachment.get_data()
        prefix = f"Attachment {i} '{name}'"

        # Size check
        size_mb = len(data) / bytes_per_mb
        if size_mb > max_size_mb:
            issues.append(f"{prefix}: Size {size_mb:.1f}MB exceeds limit {max_size_mb}MB")

        # Extension check: lowercased text after the last dot, or empty
        _, dot, suffix = name.rpartition('.')
        ext = '.' + suffix.lower() if dot else ''
        if ext not in allowed_extensions:
            issues.append(f"{prefix}: Extension '{ext}' not allowed")

        # Basic content validation for the types with a known header
        head = bytes(data[:16])
        rule = expected_headers.get(ext)
        if rule is not None and not head.startswith(rule[0]):
            issues.append(f"{prefix}: {rule[1]} header mismatch")

    if not issues:
        print(f"All {count} attachments passed validation")
        return True

    print("Attachment validation issues:")
    for issue in issues:
        print(f" - {issue}")
    return False

444

445

# Usage: validate a sample document with a 5 MB per-attachment limit
size_limit_mb = 5
pdf = pdfium.PdfDocument("document.pdf")
is_valid = validate_attachments(pdf, max_size_mb=size_limit_mb)

448

```

449

450

## Common Attachment Operations

451

452

### Attachment Backup

453

454

```python

455

def backup_attachments(pdf, backup_path):
    """Create backup of all attachments as ZIP file.

    Parameters:
    - pdf: document object exposing count_attachments() and get_attachment(i)
    - backup_path: path of the ZIP archive to write

    Returns:
    bool: False if the document has no attachments (no archive is
    written), True once the archive has been written

    Only the file data is stored; attachment metadata is not archived.
    """
    import os
    import zipfile

    count = pdf.count_attachments()
    if count == 0:
        return False

    used_names = set()

    with zipfile.ZipFile(backup_path, 'w', zipfile.ZIP_DEFLATED) as zf:
        for i in range(count):
            attachment = pdf.get_attachment(i)
            name = attachment.get_name()
            data = bytes(attachment.get_data())

            # Give every entry a unique archive name: attachments may share
            # a filename (or have none), and duplicate ZIP entries would
            # overwrite one another when the backup is unpacked
            arcname = name or f"attachment_{i}"
            stem, suffix = os.path.splitext(arcname)
            counter = 1
            while arcname in used_names:
                arcname = f"{stem}_{counter}{suffix}"
                counter += 1
            used_names.add(arcname)

            zf.writestr(arcname, data)

    print(f"Backed up {count} attachments to {backup_path}")
    return True

475

476

# Usage: archive the attachments of a sample document
archive_path = "attachments_backup.zip"
pdf = pdfium.PdfDocument("document.pdf")
backup_attachments(pdf, archive_path)

479

```