or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

advanced.md attachments.md content-streams.md core-operations.md encryption.md forms.md images.md index.md metadata.md objects.md outlines.md pages.md

attachments.mddocs/

0

# File Attachments

1

2

Embedded file management including attachment, extraction, and metadata handling for portfolio PDFs and file attachments. These capabilities enable comprehensive file embedding and management within PDF documents.

3

4

## Capabilities

5

6

### AttachedFileSpec Class

7

8

Individual file attachment specifications with metadata and content management.

9

10

```python { .api }

11

class AttachedFileSpec:
    """
    PDF attached file specification for embedded files.

    Represents a single file embedded within a PDF document,
    including its content, metadata, and relationship to the document.
    """

    # Annotations that name types not defined at this point (Pdf, and the
    # class itself) are written as strings so the stub can be defined
    # without raising NameError.
    @staticmethod
    def from_filepath(pdf: "Pdf", path: str, *, description: str = '',
                      relationship: str = '/Unspecified') -> "AttachedFileSpec":
        """
        Create an attached file specification from a file path.

        Reads the file from disk and creates a complete attachment
        specification with appropriate metadata and content encoding.

        Parameters:
        - pdf (Pdf): PDF document to attach the file to
        - path (str): Path to the file to attach
        - description (str): Human-readable description of the file
        - relationship (str): Relationship to the document
          ('/Source', '/Data', '/Alternative', '/Supplement', '/Unspecified')

        Returns:
        AttachedFileSpec: Attached file specification ready for embedding

        Raises:
        FileNotFoundError: If the specified file doesn't exist
        IOError: If the file cannot be read
        """

    def get_file(self) -> bytes:
        """
        Retrieve the attached file's content as bytes.

        Extracts and decodes the embedded file data from the PDF.

        Returns:
        bytes: Complete file content

        Raises:
        DataDecodingError: If file data cannot be decoded
        """

    def get_all_filenames(self) -> dict[str, str]:
        """
        Get all filename variants for this attachment.

        PDF attachments can have multiple filename variants for
        different platforms and character encodings.

        Returns:
        dict[str, str]: Mapping of filename types to actual filenames
        Keys: 'F', 'UF', 'DOS', 'Mac', 'Unix'
        """

    @property
    def filename(self) -> str:
        """
        Primary filename for the attached file.

        Returns the most appropriate filename, preferring Unicode
        filenames when available.

        Returns:
        str: Filename of the attached file
        """

    @property
    def description(self) -> str:
        """
        Human-readable description of the attached file.

        Returns:
        str: File description or empty string if none provided
        """

    @property
    def relationship(self) -> str:
        """
        Relationship of this file to the PDF document.

        Common values:
        - '/Source': Original source file for the PDF
        - '/Data': Data file related to the PDF content
        - '/Alternative': Alternative representation
        - '/Supplement': Supplementary file
        - '/Unspecified': Relationship not specified

        Returns:
        str: Relationship type as PDF name
        """

    @property
    def size(self) -> int:
        """
        Size of the attached file in bytes.

        Returns:
        int: File size, or -1 if size is unknown
        """

    @property
    def creation_date(self) -> str:
        """
        Creation date of the attached file.

        Returns:
        str: Creation date in PDF date format, or empty if unknown
        """

    @property
    def modification_date(self) -> str:
        """
        Last modification date of the attached file.

        Returns:
        str: Modification date in PDF date format, or empty if unknown
        """

    @property
    def checksum(self) -> str:
        """
        MD5 checksum of the attached file content.

        Used for integrity verification of the embedded file.

        Returns:
        str: Hex-encoded MD5 hash, or empty if not available
        """

142

```

143

144

### Attachments Class

145

146

Collection interface for managing all attachments in a PDF document.

147

148

```python { .api }

149

class Attachments:
    """
    Mapping interface for PDF attachments collection.

    Provides dictionary-like access to all embedded files in a PDF,
    with methods for adding, removing, and iterating attachments.

    Implements MutableMapping[str, AttachedFileSpec] interface.
    """

    # Annotations naming types that are not defined in this stub
    # (Iterator, AttachedFileSpec, the view types) are written as strings
    # so the class can be defined without raising NameError.

    def __len__(self) -> int:
        """
        Number of attached files in the PDF.

        Returns:
        int: Count of embedded files
        """

    def __iter__(self) -> "Iterator[str]":
        """
        Iterate over attachment names.

        Yields:
        str: Filename/key for each attached file
        """

    def __getitem__(self, key: str) -> "AttachedFileSpec":
        """
        Get an attached file by name.

        Parameters:
        - key (str): Attachment filename or key

        Returns:
        AttachedFileSpec: Attached file specification

        Raises:
        KeyError: If attachment with specified key doesn't exist
        """

    def __setitem__(self, key: str, value: "AttachedFileSpec") -> None:
        """
        Add or replace an attached file.

        Parameters:
        - key (str): Attachment name/key
        - value (AttachedFileSpec): File specification to attach
        """

    def __delitem__(self, key: str) -> None:
        """
        Remove an attached file.

        Parameters:
        - key (str): Attachment name/key to remove

        Raises:
        KeyError: If attachment doesn't exist
        """

    def __contains__(self, key: str) -> bool:
        """
        Check if an attachment exists.

        Parameters:
        - key (str): Attachment name/key to check

        Returns:
        bool: True if attachment exists
        """

    def keys(self) -> "KeysView[str]":
        """
        Get all attachment names.

        Returns:
        KeysView: View of all attachment keys
        """

    def values(self) -> "ValuesView[AttachedFileSpec]":
        """
        Get all attachment specifications.

        Returns:
        ValuesView: View of all AttachedFileSpec objects
        """

    def items(self) -> "ItemsView[str, AttachedFileSpec]":
        """
        Get all attachment name-specification pairs.

        Returns:
        ItemsView: View of (key, AttachedFileSpec) pairs
        """

    def clear(self) -> None:
        """Remove all attachments from the PDF."""

246

```

247

248

## Usage Examples

249

250

### Adding File Attachments

251

252

```python

253

import pikepdf

254

from pathlib import Path

255

256

# Open an existing PDF. The context manager closes it even if attaching
# or saving raises, so the file handle is never leaked.
with pikepdf.open('document.pdf') as pdf:
    # Access the attachments collection
    attachments = pdf.attachments

    # Attach a single file from disk
    document_file = Path('source_document.docx')
    if document_file.exists():
        # Create attachment specification
        attachment = pikepdf.AttachedFileSpec.from_filepath(
            pdf,
            str(document_file),
            description="Original Word document source",
            relationship='/Source',
        )

        # Add to PDF, keyed by the file's own name
        attachments[document_file.name] = attachment
        print(f"Attached: {document_file.name}")

    # Attach multiple files: (filename, description, relationship)
    files_to_attach = [
        ('data.csv', 'Supporting data file', '/Data'),
        ('image.png', 'Illustration used in document', '/Supplement'),
        ('readme.txt', 'Instructions and notes', '/Unspecified'),
    ]

    for filename, description, relationship in files_to_attach:
        file_path = Path(filename)
        if not file_path.exists():
            # Files that are not present are skipped, not treated as errors
            continue

        attachment = pikepdf.AttachedFileSpec.from_filepath(
            pdf,
            str(file_path),
            description=description,
            relationship=relationship,
        )
        attachments[filename] = attachment
        print(f"Attached: {filename} ({description})")

    print(f"Total attachments: {len(attachments)}")

    # Save PDF with attachments (must happen while the PDF is still open)
    pdf.save('document_with_attachments.pdf')

301

```

302

303

### Extracting Attached Files

304

305

```python

306

import pikepdf

307

from pathlib import Path

308

309

def extract_all_attachments(pdf_path, output_dir):
    """Extract all attached files from a PDF into ``output_dir``.

    For each attachment the function prints its metadata, writes the
    content to disk under a sanitised file name, and compares the MD5 of
    the written data with the attachment's stored checksum when one exists.
    A failure on one attachment is reported and does not stop the others.

    Parameters:
    - pdf_path: Path of the PDF to read
    - output_dir: Directory to write extracted files to (created if missing,
      including parent directories)

    Returns:
    list[str]: Paths of the files that were written; an empty list when the
    PDF has no attachments
    """
    import hashlib  # local: the snippet header imports only pikepdf and pathlib

    extracted_files = []

    # The context manager closes the PDF on every exit path, including errors
    with pikepdf.open(pdf_path) as pdf:
        attachments = pdf.attachments

        if len(attachments) == 0:
            print("No attachments found in PDF")
            return extracted_files

        # Create output directory
        output_path = Path(output_dir)
        output_path.mkdir(parents=True, exist_ok=True)

        print(f"Found {len(attachments)} attachments:")

        for name, attachment in attachments.items():
            try:
                # Get file info
                filename = attachment.filename or name
                size = attachment.size

                print(f"\n📎 {filename}")
                print(f" Description: {attachment.description}")
                print(f" Size: {size:,} bytes" if size >= 0 else " Size: Unknown")
                print(f" Relationship: {attachment.relationship}")
                print(f" Created: {attachment.creation_date}")
                print(f" Modified: {attachment.modification_date}")

                # Extract file content
                file_data = attachment.get_file()

                # Keep only characters that are safe in a file name. The name
                # comes from inside the PDF, so it is not trusted.
                safe_filename = "".join(
                    c for c in filename if c.isalnum() or c in '.-_'
                )
                if not safe_filename.strip('.'):
                    # Empty, "." or ".." would resolve to a directory
                    safe_filename = 'attachment'

                output_file = _unique_path(output_path, safe_filename)
                output_file.write_bytes(file_data)

                extracted_files.append(str(output_file))
                print(f" ✓ Extracted to: {output_file}")

                # Verify checksum if available
                expected_checksum = attachment.checksum
                if expected_checksum:
                    expected_checksum = expected_checksum.upper()
                    actual_checksum = hashlib.md5(file_data).hexdigest().upper()

                    if actual_checksum == expected_checksum:
                        print(f" ✓ Checksum verified: {actual_checksum}")
                    else:
                        print(f" ⚠️ Checksum mismatch: expected {expected_checksum}, got {actual_checksum}")

            except Exception as e:
                # Best effort: report the failure and continue with the rest
                print(f" ❌ Error extracting {name}: {e}")

    print(f"\nExtracted {len(extracted_files)} files to {output_dir}")
    return extracted_files


def _unique_path(directory, filename):
    """Return a path inside ``directory`` for ``filename`` that does not exist yet."""
    candidate = directory / filename
    # Stem and suffix are taken once from the requested name so repeated
    # collisions produce name_1, name_2, ... instead of name_1_2.
    stem, suffix = candidate.stem, candidate.suffix
    counter = 1
    while candidate.exists():
        candidate = directory / f"{stem}_{counter}{suffix}"
        counter += 1
    return candidate


# Extract attachments
extracted = extract_all_attachments('document_with_attachments.pdf', 'extracted_files')

385

```

386

387

### Managing Attachment Metadata

388

389

```python

390

import pikepdf

391

from datetime import datetime

392

393

def update_attachment_metadata(pdf_path):
    """Print the current metadata of every attachment in a PDF.

    Despite its name, this function only reads and displays metadata; it
    does not modify or save the PDF.

    Parameters:
    - pdf_path: Path of the PDF to inspect
    """
    # The context manager closes the PDF even if reading an attachment fails
    with pikepdf.open(pdf_path) as pdf:
        for name, attachment in pdf.attachments.items():
            print(f"Attachment: {name}")

            # Get all filename variants
            filenames = attachment.get_all_filenames()
            print(f" Filename variants: {filenames}")

            # Display current metadata
            print(f" Current description: '{attachment.description}'")
            print(f" Current relationship: {attachment.relationship}")

            # A negative size is treated as "unknown" rather than printed as-is
            size = attachment.size
            if size >= 0:
                print(f" File size: {size:,} bytes")
            else:
                print(" File size: Unknown")

            print(f" Creation date: {attachment.creation_date}")
            print(f" Modification date: {attachment.modification_date}")
            print(f" Checksum: {attachment.checksum}")

            # Note: Modifying attachment metadata requires recreating the attachment
            # This is a limitation of the PDF format and pikepdf's current API

418

419

def create_portfolio_pdf(file_list, output_path):
    """Create a PDF portfolio with multiple attached files.

    Builds a new PDF with a single cover page listing the attached files,
    attaches every file from ``file_list`` that exists on disk, and saves
    the result. Files that do not exist are reported and left out of both
    the cover page and the attachments.

    Parameters:
    - file_list: Iterable of ``(file_path, description)`` pairs
    - output_path: Where to save the portfolio PDF
    """
    # Local import: the surrounding snippet imports only pikepdf and datetime
    from pathlib import Path

    def pdf_text(text):
        """Escape the characters that are special inside a PDF literal string."""
        return text.replace('\\', '\\\\').replace('(', '\\(').replace(')', '\\)')

    # Relationship assigned to an attachment, chosen by file extension
    relationship_by_suffix = {
        '.docx': '/Source', '.doc': '/Source', '.odt': '/Source',
        '.csv': '/Data', '.xlsx': '/Data', '.json': '/Data',
        '.png': '/Supplement', '.jpg': '/Supplement',
        '.jpeg': '/Supplement', '.gif': '/Supplement',
    }

    # Resolve the files first so the cover page matches what gets attached
    entries = []
    for file_path, description in file_list:
        file_path_obj = Path(file_path)
        if file_path_obj.exists():
            entries.append((file_path_obj, description))
        else:
            print(f"Skipping missing file: {file_path_obj}")

    # The context manager closes the new PDF even if a later step raises
    with pikepdf.new() as pdf:
        # Add a cover page
        page = pdf.add_blank_page()

        # Td moves relative to the start of the current line, so only the
        # first position is absolute; every later line is an offset from it.
        lines = [
            "BT",
            "/F1 24 Tf",
            "100 700 Td",
            "(PDF Portfolio) Tj",
            "/F1 12 Tf",
            "0 -50 Td",
            f"(This PDF contains {len(entries)} attached files:) Tj",
        ]
        for i, (file_path_obj, _description) in enumerate(entries, start=1):
            # 30 units below the heading for the first entry, then 20 per line
            lines.append("0 -30 Td" if i == 1 else "0 -20 Td")
            lines.append(f"({i}. {pdf_text(file_path_obj.name)}) Tj")
        lines.append("ET")
        content = "\n".join(lines)

        # The content stream refers to font /F1, so it has to be declared
        # in the page resources.
        page['/Resources'] = pikepdf.Dictionary(
            Font=pikepdf.Dictionary(
                F1=pikepdf.Dictionary(
                    Type=pikepdf.Name.Font,
                    Subtype=pikepdf.Name.Type1,
                    BaseFont=pikepdf.Name.Helvetica,
                )
            )
        )
        page['/Contents'] = pikepdf.Stream(pdf, content.encode())

        # Add files as attachments
        attachments = pdf.attachments
        for file_path_obj, description in entries:
            relationship = relationship_by_suffix.get(
                file_path_obj.suffix.lower(), '/Unspecified'
            )

            attachment = pikepdf.AttachedFileSpec.from_filepath(
                pdf,
                str(file_path_obj),
                description=description,
                relationship=relationship,
            )

            attachments[file_path_obj.name] = attachment
            print(f"Added to portfolio: {file_path_obj.name}")

        # Save portfolio
        pdf.save(output_path)

    print(f"Created portfolio PDF: {output_path}")


# Create a portfolio with multiple files
portfolio_files = [
    ('project_report.pdf', 'Main project report'),
    ('data_analysis.csv', 'Raw data and analysis'),
    ('chart.png', 'Key findings visualization'),
    ('source_code.py', 'Analysis script'),
    ('readme.txt', 'Project documentation'),
]

# create_portfolio_pdf(portfolio_files, 'project_portfolio.pdf')

499

```

500

501

### Attachment Analysis and Reporting

502

503

```python

504

import pikepdf

505

from pathlib import Path

506

import hashlib

507

508

def analyze_pdf_attachments(pdf_path):
    """Comprehensive analysis of PDF attachments.

    Reads every attachment, tallies file extensions and relationships,
    computes MD5 and SHA-256 digests of the content, and compares the
    reported size and checksum with the actual data. An attachment that
    fails to analyse is reported and skipped.

    Parameters:
    - pdf_path: Path of the PDF to analyse

    Returns:
    dict: Keys 'total_attachments', 'total_size' (sum of actual sizes),
    'file_types' and 'relationships' (value -> count), and 'files'
    (one detail dict per successfully analysed attachment)
    """
    analysis = {
        'total_attachments': 0,
        'total_size': 0,
        'file_types': {},
        'relationships': {},
        'files': [],
    }

    # The context manager closes the PDF on every exit path, including errors
    with pikepdf.open(pdf_path) as pdf:
        attachments = pdf.attachments
        analysis['total_attachments'] = len(attachments)

        if analysis['total_attachments'] == 0:
            print(f"No attachments found in {pdf_path}")
            return analysis

        file_types = analysis['file_types']
        relationships = analysis['relationships']

        for name, attachment in attachments.items():
            try:
                # Basic file info; a negative reported size is recorded as 0
                filename = attachment.filename or name
                reported_size = attachment.size
                size = reported_size if reported_size >= 0 else 0

                # Extract file for analysis
                file_data = attachment.get_file()
                actual_size = len(file_data)

                # File type analysis
                file_extension = Path(filename).suffix.lower()
                type_key = file_extension or '(no extension)'
                file_types[type_key] = file_types.get(type_key, 0) + 1

                # Relationship analysis
                relationship = attachment.relationship
                relationships[relationship] = relationships.get(relationship, 0) + 1

                # Calculate checksums
                md5_hash = hashlib.md5(file_data).hexdigest().upper()
                sha256_hash = hashlib.sha256(file_data).hexdigest().upper()

                # None means "no stored checksum to verify against"
                reported_checksum = attachment.checksum
                if reported_checksum:
                    checksum_verified = reported_checksum.upper() == md5_hash
                else:
                    checksum_verified = None

                # File details
                analysis['files'].append({
                    'name': filename,
                    'attachment_key': name,
                    'description': attachment.description,
                    'size_reported': size,
                    'size_actual': actual_size,
                    'size_match': size == actual_size,
                    'relationship': relationship,
                    'creation_date': attachment.creation_date,
                    'modification_date': attachment.modification_date,
                    'checksum_reported': reported_checksum,
                    'checksum_md5': md5_hash,
                    'checksum_sha256': sha256_hash,
                    'checksum_verified': checksum_verified,
                    'file_extension': file_extension,
                    'filenames_variants': attachment.get_all_filenames(),
                })
                analysis['total_size'] += actual_size

            except Exception as e:
                # Best effort: report the failure and continue with the rest
                print(f"Error analyzing attachment '{name}': {e}")

    return analysis

579

580

def print_attachment_report(analysis):
    """Print formatted attachment analysis report.

    Parameters:
    - analysis (dict): Must provide 'total_attachments', 'total_size',
      'file_types', 'relationships' and 'files'. Each entry of 'files' is a
      dict with the keys read below ('name', 'attachment_key',
      'description', 'size_actual', 'size_match', 'size_reported',
      'file_extension', 'relationship', 'creation_date',
      'modification_date', 'checksum_reported', 'checksum_verified',
      'checksum_md5', 'filenames_variants').
    """
    total_size = analysis['total_size']

    print("PDF Attachment Analysis Report")
    print("=" * 50)

    print(f"Total Attachments: {analysis['total_attachments']}")
    print(f"Total Size: {total_size:,} bytes ({total_size / 1024 / 1024:.2f} MB)")

    if analysis['file_types']:
        print("\nFile Types:")
        for ext, count in sorted(analysis['file_types'].items()):
            print(f" {ext}: {count} files")

    if analysis['relationships']:
        print("\nFile Relationships:")
        for rel, count in sorted(analysis['relationships'].items()):
            print(f" {rel}: {count} files")

    print("\nDetailed File Information:")
    print("-" * 50)

    for file_info in analysis['files']:
        print(f"\n📎 {file_info['name']}")
        print(f" Key: {file_info['attachment_key']}")
        print(f" Description: {file_info['description']}")

        # The size line is completed below, so the newline is held back
        print(f" Size: {file_info['size_actual']:,} bytes", end="")
        if not file_info['size_match']:
            print(f" (reported: {file_info['size_reported']:,})", end="")
        print()

        print(f" Type: {file_info['file_extension']}")
        print(f" Relationship: {file_info['relationship']}")
        print(f" Created: {file_info['creation_date']}")
        print(f" Modified: {file_info['modification_date']}")

        # Checksum verification: show the verdict when the PDF stored a
        # checksum, otherwise show the MD5 that was computed
        if file_info['checksum_reported']:
            status = "✓ Verified" if file_info['checksum_verified'] else "❌ Failed"
            print(f" Checksum: {status} ({file_info['checksum_reported']})")
        else:
            print(f" MD5: {file_info['checksum_md5']}")

        # Filename variants are only worth listing when there is more than one
        variants = file_info['filenames_variants']
        if len(variants) > 1:
            print(f" Filename variants: {variants}")

629

630

# Build and print the attachment report, but only when the sample PDF
# produced by the earlier example is actually on disk.
pdf_path = 'document_with_attachments.pdf'
if Path(pdf_path).exists():
    analysis = analyze_pdf_attachments(pdf_path)
    print_attachment_report(analysis)

635

```

636

637

### Bulk Attachment Operations

638

639

```python

640

import pikepdf

641

from pathlib import Path

642

643

def add_attachments_to_directory(directory_path, attachment_dir):
    """Add the same set of attachments to all PDFs in a directory.

    Every regular file in ``attachment_dir`` is attached to each ``*.pdf``
    in ``directory_path`` that has no attachments yet, and the PDF is saved
    in place. PDFs that already carry attachments are skipped.

    Parameters:
    - directory_path: Directory containing the PDFs to update
    - attachment_dir: Directory containing the files to attach

    Returns:
    dict: 'success' is a list of (pdf name, attachments added) pairs and
    'failed' is a list of (pdf name, error message) pairs
    """
    results = {'success': [], 'failed': []}

    # Get list of files to attach
    attachment_files = [f for f in Path(attachment_dir).glob('*') if f.is_file()]
    if not attachment_files:
        print(f"No files found in {attachment_dir}")
        return results

    # Materialise the PDF list up front: the directory is written to while
    # it is being processed.
    pdf_files = sorted(Path(directory_path).glob('*.pdf'))

    for pdf_file in pdf_files:
        try:
            # allow_overwriting_input is required for save() without a
            # filename; the context manager closes the PDF on any exit path.
            with pikepdf.open(pdf_file, allow_overwriting_input=True) as pdf:
                attachments = pdf.attachments

                # Skip if already has attachments
                if len(attachments) > 0:
                    print(f"Skipping {pdf_file.name} - already has attachments")
                    continue

                # Add each attachment file; one failure does not stop the rest
                attachments_added = 0
                for attach_file in attachment_files:
                    try:
                        attachment = pikepdf.AttachedFileSpec.from_filepath(
                            pdf,
                            str(attach_file),
                            description=f"Standard attachment: {attach_file.name}",
                            relationship='/Supplement',
                        )
                        attachments[attach_file.name] = attachment
                        attachments_added += 1
                    except Exception as e:
                        print(f"Failed to attach {attach_file.name} to {pdf_file.name}: {e}")

                # Save if any attachments were added
                if attachments_added > 0:
                    pdf.save()
                    results['success'].append((pdf_file.name, attachments_added))
                    print(f"Added {attachments_added} attachments to {pdf_file.name}")

        except Exception as e:
            results['failed'].append((pdf_file.name, str(e)))
            print(f"Failed to process {pdf_file.name}: {e}")

    print("\nBulk attachment complete:")
    print(f" Success: {len(results['success'])} PDFs")
    print(f" Failed: {len(results['failed'])} PDFs")

    return results

704

705

def remove_all_attachments(directory_path):
    """Remove all attachments from PDFs in a directory.

    Each ``*.pdf`` in ``directory_path`` is opened; if it has attachments
    they are cleared and the PDF is saved in place.

    Parameters:
    - directory_path: Directory containing the PDFs to clean

    Returns:
    dict: 'processed' (PDFs opened successfully), 'attachments_removed'
    (total attachments cleared) and 'failed' (list of
    (pdf name, error message) pairs)
    """
    results = {'processed': 0, 'attachments_removed': 0, 'failed': []}

    # Materialise the PDF list up front: the directory is written to while
    # it is being processed.
    pdf_files = sorted(Path(directory_path).glob('*.pdf'))

    for pdf_file in pdf_files:
        try:
            # allow_overwriting_input is required for save() without a
            # filename; the context manager closes the PDF on any exit path.
            with pikepdf.open(pdf_file, allow_overwriting_input=True) as pdf:
                attachments = pdf.attachments
                attachment_count = len(attachments)

                if attachment_count > 0:
                    # Clear all attachments
                    attachments.clear()
                    pdf.save()

                    results['attachments_removed'] += attachment_count
                    print(f"Removed {attachment_count} attachments from {pdf_file.name}")

                results['processed'] += 1

        except Exception as e:
            results['failed'].append((pdf_file.name, str(e)))
            print(f"Failed to process {pdf_file.name}: {e}")

    print("\nAttachment removal complete:")
    print(f" PDFs processed: {results['processed']}")
    print(f" Attachments removed: {results['attachments_removed']}")
    print(f" Failed: {len(results['failed'])} PDFs")

    return results

739

740

# Example usage (commented out to avoid file operations)

741

# add_attachments_to_directory('./pdfs', './standard_attachments')

742

# remove_all_attachments('./pdfs')

743

```