or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

agents.md chains.md document-loaders.md embeddings.md experimental.md index.md memory.md output-parsers.md retrievers.md tools.md utilities.md
tile.json

document-loaders.mddocs/

0

# Document Loading and Processing

1

2

Comprehensive document loaders for various file formats and sources, plus text splitting and transformation utilities. Document loaders provide the foundation for ingesting data into LangChain applications.

3

4

## Capabilities

5

6

### Base Document Loader

7

8

Foundation class for all document loaders with standardized interfaces.

9

10

```typescript { .api }

11

/**

12

* Base document loader class - all loaders inherit from this

13

*/

14

abstract class BaseDocumentLoader {

15

/** Load documents from the source */

16

abstract load(): Promise<DocumentInterface[]>;

17

18

/** Load and split documents using a text splitter */

19

loadAndSplit(textSplitter?: TextSplitter): Promise<DocumentInterface[]>;

20

}

21

22

/**

23

* Document interface representing loaded content

24

*/

25

interface DocumentInterface {

26

/** Main content of the document */

27

pageContent: string;

28

29

/** Metadata about the document */

30

metadata: Record<string, any>;

31

}

32

```

33

34

### File System Loaders

35

36

Loaders for various file formats commonly found on local file systems.

37

38

```typescript { .api }

39

/**

40

* Text file loader for .txt files

41

*/

42

class TextLoader extends BaseDocumentLoader {

43

constructor(filePathOrBlob: string | Blob);

44

45

/** File path or blob to load */

46

filePath: string;

47

48

load(): Promise<DocumentInterface[]>;

49

}

50

51

/**

52

* JSON file loader with JSONPointer support

53

*/

54

class JSONLoader extends BaseDocumentLoader {

55

constructor(

56

filePathOrBlob: string | Blob,

57

pointers?: string | string[]

58

);

59

60

/** File path or blob */

61

filePathOrBlob: string | Blob;

62

63

/** JSONPointer paths to extract */

64

pointers: string[];

65

66

load(): Promise<DocumentInterface[]>;

67

}

68

69

/**

70

* Directory loader that processes multiple files

71

*/

72

class DirectoryLoader extends BaseDocumentLoader {

73

constructor(

74

directoryPath: string,

75

loaders: Record<string, (filePath: string) => BaseDocumentLoader>,

76

recursive?: boolean,

77

unknown?: UnknownHandling

78

);

79

80

/** Directory path to scan */

81

directoryPath: string;

82

83

/** Map of file extensions to loader factories */

84

loaders: Record<string, (filePath: string) => BaseDocumentLoader>;

85

86

/** Whether to scan recursively */

87

recursive: boolean;

88

89

/** How to handle unknown file types */

90

unknown: UnknownHandling;

91

92

load(): Promise<DocumentInterface[]>;

93

}

94

95

/**

96

* Multi-file loader for processing multiple specific files

97

*/

98

class MultiFileLoader extends BaseDocumentLoader {

99

constructor(filePaths: string[], loaders: Record<string, typeof BaseDocumentLoader>);

100

101

/** Array of file paths to load */

102

filePaths: string[];

103

104

/** Map of extensions to loader classes */

105

loaders: Record<string, typeof BaseDocumentLoader>;

106

107

load(): Promise<DocumentInterface[]>;

108

}

109

110

/**

111

* Buffer loader for in-memory content

112

*/

113

class BufferLoader extends BaseDocumentLoader {

114

constructor(

115

buffer: Buffer,

116

metadata?: Record<string, any>

117

);

118

119

/** Buffer containing file data */

120

buffer: Buffer;

121

122

/** Additional metadata */

123

metadata: Record<string, any>;

124

125

load(): Promise<DocumentInterface[]>;

126

}

127

```

128

129

**Usage Examples:**

130

131

```typescript

132

import {

133

TextLoader,

134

JSONLoader,

135

DirectoryLoader,

136

MultiFileLoader

137

} from "langchain/document_loaders";

138

139

// Load single text file

140

const textLoader = new TextLoader("./documents/readme.txt");

141

const textDocs = await textLoader.load();

142

143

// Load JSON with specific fields

144

const jsonLoader = new JSONLoader(

145

"./data/users.json",

146

["/users/0/name", "/users/0/email"] // JSONPointer paths

147

);

148

const jsonDocs = await jsonLoader.load();

149

150

// Load entire directory

151

const directoryLoader = new DirectoryLoader(

152

"./documents",

153

{

154

".txt": (path) => new TextLoader(path),

155

".json": (path) => new JSONLoader(path),

156

".md": (path) => new TextLoader(path),

157

},

158

true // recursive

159

);

160

const allDocs = await directoryLoader.load();

161

162

// Load specific files

163

const multiLoader = new MultiFileLoader(

164

["./doc1.txt", "./doc2.json", "./doc3.md"],

165

{

166

".txt": TextLoader,

167

".json": JSONLoader,

168

".md": TextLoader,

169

}

170

);

171

const specificDocs = await multiLoader.load();

172

```

173

174

### Text Splitters

175

176

Utilities for splitting large documents into smaller, manageable chunks.

177

178

```typescript { .api }

179

/**

180

* Base text splitter interface

181

*/

182

abstract class TextSplitter {

183

constructor(fields?: TextSplitterParams);

184

185

/** Maximum chunk size */

186

chunkSize: number;

187

188

/** Overlap between chunks */

189

chunkOverlap: number;

190

191

/** Function to calculate text length */

192

lengthFunction: (text: string) => number;

193

194

/** Keep separator in chunks */

195

keepSeparator: boolean;

196

197

/** Split text into chunks */

198

abstract splitText(text: string): Promise<string[]>;

199

200

/** Create documents from text */

201

createDocuments(

202

texts: string[],

203

metadatas?: Record<string, any>[]

204

): Promise<DocumentInterface[]>;

205

206

/** Split existing documents */

207

splitDocuments(documents: DocumentInterface[]): Promise<DocumentInterface[]>;

208

}

209

210

/**

211

* Character-based text splitter

212

*/

213

class CharacterTextSplitter extends TextSplitter {

214

constructor(fields?: CharacterTextSplitterParams);

215

216

/** Separator character/string */

217

separator: string;

218

219

splitText(text: string): Promise<string[]>;

220

221

static fromTikTokenEncoder(

222

encoding: TikTokenEncoding,

223

fields?: Partial<CharacterTextSplitterParams>

224

): CharacterTextSplitter;

225

}

226

227

/**

228

* Recursive character text splitter with multiple separators

229

*/

230

class RecursiveCharacterTextSplitter extends TextSplitter {

231

constructor(fields?: RecursiveCharacterTextSplitterParams);

232

233

/** Array of separators to try in order */

234

separators: string[];

235

236

splitText(text: string): Promise<string[]>;

237

238

static fromLanguage(

239

language: "cpp" | "go" | "java" | "js" | "php" | "proto" | "python" | "rst" | "ruby" | "rust" | "scala" | "swift" | "markdown" | "latex" | "html" | "sol",

240

options?: Partial<RecursiveCharacterTextSplitterParams>

241

): RecursiveCharacterTextSplitter;

242

}

243

244

/**

245

* Token-based text splitter

246

*/

247

class TokenTextSplitter extends TextSplitter {

248

constructor(fields?: TokenTextSplitterParams);

249

250

/** Encoding name for tokenization */

251

encodingName: TikTokenEncoding;

252

253

/** Allowed special tokens */

254

allowedSpecial: Set<string> | "all";

255

256

/** Disallowed special tokens */

257

disallowedSpecial: Set<string> | "all";

258

259

splitText(text: string): Promise<string[]>;

260

}

261

```

262

263

**Usage Examples:**

264

265

```typescript

266

import {

267

CharacterTextSplitter,

268

RecursiveCharacterTextSplitter,

269

TokenTextSplitter

270

} from "langchain/text_splitter";

271

272

// Character-based splitting

273

const charSplitter = new CharacterTextSplitter({

274

separator: "\n\n",

275

chunkSize: 1000,

276

chunkOverlap: 200,

277

});

278

279

const chunks1 = await charSplitter.splitText(longText);

280

281

// Recursive splitting with multiple separators

282

const recursiveSplitter = new RecursiveCharacterTextSplitter({

283

chunkSize: 1000,

284

chunkOverlap: 200,

285

});

286

287

// Language-specific splitting

288

const jsSplitter = RecursiveCharacterTextSplitter.fromLanguage("js", {

289

chunkSize: 2000,

290

chunkOverlap: 200,

291

});

292

293

const jsChunks = await jsSplitter.splitText(javascriptCode);

294

295

// Token-based splitting

296

const tokenSplitter = new TokenTextSplitter({

297

encodingName: "gpt2",

298

chunkSize: 1000,

299

chunkOverlap: 0,

300

});

301

302

const tokenChunks = await tokenSplitter.splitText(text);

303

304

// Split documents with loader integration

305

const loader = new TextLoader("large_document.txt");

306

const docs = await loader.loadAndSplit(recursiveSplitter);

307

```

308

309

### Web Loaders

310

311

Loaders for web-based content and APIs.

312

313

```typescript { .api }

314

/**

315

* Web-based document loader

316

*/

317

class WebBaseLoader extends BaseDocumentLoader {

318

constructor(webPath: string | string[], options?: WebBaseLoaderParams);

319

320

/** URL(s) to load */

321

webPath: string | string[];

322

323

/** Request options */

324

requestOptions?: RequestInit;

325

326

/** Text decoder for the response */

327

textDecoder?: TextDecoder;

328

329

load(): Promise<DocumentInterface[]>;

330

}

331

332

/**

333

* Cheerio web scraper loader

334

*/

335

class CheerioWebBaseLoader extends BaseDocumentLoader {

336

constructor(webPath: string, options?: CheerioWebBaseLoaderParams);

337

338

/** URL to scrape */

339

webPath: string;

340

341

/** Cheerio selector */

342

selector?: string;

343

344

/** Text extraction options */

345

textTransformer?: (text: string) => string;

346

347

load(): Promise<DocumentInterface[]>;

348

}

349

350

/**

351

* Playwright web scraper loader

352

*/

353

class PlaywrightWebBaseLoader extends BaseDocumentLoader {

354

constructor(webPath: string, options?: PlaywrightWebBaseLoaderParams);

355

356

/** URL to scrape */

357

webPath: string;

358

359

/** Playwright launch options */

360

launchOptions?: LaunchOptions;

361

362

/** Page evaluation options */

363

evaluateOptions?: EvaluateOptions;

364

365

load(): Promise<DocumentInterface[]>;

366

}

367

```

368

369

### Database Loaders

370

371

Loaders for various database systems and data sources.

372

373

```typescript { .api }

374

/**

375

* SQL database loader

376

*/

377

class SQLDatabaseLoader extends BaseDocumentLoader {

378

constructor(query: string, database: SqlDatabase, options?: SQLDatabaseLoaderParams);

379

380

/** SQL query to execute */

381

query: string;

382

383

/** Database connection */

384

database: SqlDatabase;

385

386

/** Additional options */

387

options: SQLDatabaseLoaderParams;

388

389

load(): Promise<DocumentInterface[]>;

390

}

391

392

/**

393

* CSV file loader

394

*/

395

class CSVLoader extends BaseDocumentLoader {

396

constructor(filePath: string, options?: CSVLoaderParams);

397

398

/** CSV file path */

399

filePath: string;

400

401

/** Column to use as content */

402

column?: string;

403

404

/** CSV parsing options */

405

csvOptions?: CSVParseOptions;

406

407

load(): Promise<DocumentInterface[]>;

408

}

409

```

410

411

### API and Service Loaders

412

413

Loaders for external APIs and cloud services.

414

415

```typescript { .api }

416

/**

417

* Notion API loader

418

*/

419

class NotionAPILoader extends BaseDocumentLoader {

420

constructor(options: NotionAPILoaderParams);

421

422

/** Notion integration token */

423

integrationToken: string;

424

425

/** Notion page or database ID */

426

notionID: string;

427

428

/** Type of Notion resource */

429

type: "page" | "database";

430

431

load(): Promise<DocumentInterface[]>;

432

}

433

434

/**

435

* GitHub repository loader

436

*/

437

class GithubRepoLoader extends BaseDocumentLoader {

438

constructor(

439

githubUrl: string,

440

options?: GithubRepoLoaderParams

441

);

442

443

/** GitHub repository URL */

444

githubUrl: string;

445

446

/** Access token for private repos */

447

accessToken?: string;

448

449

/** Branch to load from */

450

branch?: string;

451

452

/** File patterns to include */

453

include?: string[];

454

455

/** File patterns to exclude */

456

exclude?: string[];

457

458

load(): Promise<DocumentInterface[]>;

459

}

460

461

/**

462

* S3 file loader

463

*/

464

class S3Loader extends BaseDocumentLoader {

465

constructor(bucket: string, key: string, options?: S3LoaderParams);

466

467

/** S3 bucket name */

468

bucket: string;

469

470

/** S3 object key */

471

key: string;

472

473

/** AWS configuration */

474

s3Config?: S3Config;

475

476

load(): Promise<DocumentInterface[]>;

477

}

478

```

479

480

### Specialized Format Loaders

481

482

Loaders for specific document formats and content types.

483

484

```typescript { .api }

485

/**

486

* PDF document loader

487

*/

488

class PDFLoader extends BaseDocumentLoader {

489

constructor(filePathOrBlob: string | Blob, options?: PDFLoaderParams);

490

491

/** PDF file path or blob */

492

filePathOrBlob: string | Blob;

493

494

/** Split pages into separate documents */

495

splitPages?: boolean;

496

497

/** PDF parsing options */

498

pdfParseOptions?: PDFParseOptions;

499

500

load(): Promise<DocumentInterface[]>;

501

}

502

503

/**

504

* Microsoft Word document loader

505

*/

506

class DocxLoader extends BaseDocumentLoader {

507

constructor(filePathOrBlob: string | Blob);

508

509

/** Word document path or blob */

510

filePathOrBlob: string | Blob;

511

512

load(): Promise<DocumentInterface[]>;

513

}

514

515

/**

516

* PowerPoint presentation loader

517

*/

518

class PPTXLoader extends BaseDocumentLoader {

519

constructor(filePathOrBlob: string | Blob);

520

521

/** PowerPoint file path or blob */

522

filePathOrBlob: string | Blob;

523

524

load(): Promise<DocumentInterface[]>;

525

}

526

527

/**

528

* Email message loader (EML format)

529

*/

530

class UnstructuredEmailLoader extends BaseDocumentLoader {

531

constructor(filePath: string);

532

533

/** Email file path */

534

filePath: string;

535

536

load(): Promise<DocumentInterface[]>;

537

}

538

```

539

540

### Document Transformers

541

542

Components for transforming and processing loaded documents.

543

544

```typescript { .api }

545

/**

546

* OpenAI functions document transformer

547

*/

548

class OpenAIFunctionsDocumentTransformer {

549

constructor(options?: OpenAIFunctionsTransformerOptions);

550

551

/** Transform documents using OpenAI functions */

552

transformDocuments(

553

documents: DocumentInterface[],

554

options?: TransformOptions

555

): Promise<DocumentInterface[]>;

556

}

557

558

/**

559

* HTML transformer for web content

560

*/

561

class HtmlToTextTransformer {

562

constructor(options?: HtmlToTextOptions);

563

564

/** Convert HTML to plain text */

565

transformDocuments(documents: DocumentInterface[]): Promise<DocumentInterface[]>;

566

}

567

```

568

569

## Types

570

571

### Base Loader Types

572

573

```typescript { .api }

574

interface BaseDocumentLoaderParams {

575

/** Additional metadata to add to all documents */

576

metadata?: Record<string, any>;

577

}

578

579

type UnknownHandling = "ignore" | "warn" | "error";

580

```

581

582

### File System Loader Types

583

584

```typescript { .api }

585

interface DirectoryLoaderOptions {

586

/** Whether to scan directories recursively */

587

recursive?: boolean;

588

/** How to handle unknown file types */

589

unknown?: UnknownHandling;

590

/** File patterns to include */

591

include?: string[];

592

/** File patterns to exclude */

593

exclude?: string[];

594

}

595

596

interface MultiFileLoaderOptions {

597

/** Map of file extensions to loader classes */

598

loaders: Record<string, typeof BaseDocumentLoader>;

599

}

600

```

601

602

### Text Splitter Types

603

604

```typescript { .api }

605

interface TextSplitterParams {

606

/** Maximum size of each chunk */

607

chunkSize?: number;

608

/** Number of characters to overlap between chunks */

609

chunkOverlap?: number;

610

/** Function to calculate text length */

611

lengthFunction?: (text: string) => number;

612

/** Whether to keep separator in results */

613

keepSeparator?: boolean;

614

}

615

616

interface CharacterTextSplitterParams extends TextSplitterParams {

617

/** String to split on */

618

separator?: string;

619

}

620

621

interface RecursiveCharacterTextSplitterParams extends TextSplitterParams {

622

/** List of separators to try in order */

623

separators?: string[];

624

}

625

626

interface TokenTextSplitterParams extends TextSplitterParams {

627

/** Name of tiktoken encoding */

628

encodingName?: TikTokenEncoding;

629

/** Allowed special tokens */

630

allowedSpecial?: Set<string> | "all";

631

/** Disallowed special tokens */

632

disallowedSpecial?: Set<string> | "all";

633

}

634

635

type TikTokenEncoding = "gpt2" | "r50k_base" | "p50k_base" | "cl100k_base" | "o200k_base";

636

```

637

638

### Web Loader Types

639

640

```typescript { .api }

641

interface WebBaseLoaderParams {

642

/** Request configuration */

643

requestOptions?: RequestInit;

644

/** Text decoder for response */

645

textDecoder?: TextDecoder;

646

/** Additional metadata */

647

metadata?: Record<string, any>;

648

}

649

650

interface CheerioWebBaseLoaderParams extends WebBaseLoaderParams {

651

/** CSS selector for content extraction */

652

selector?: string;

653

/** Function to transform extracted text */

654

textTransformer?: (text: string) => string;

655

}

656

657

interface PlaywrightWebBaseLoaderParams extends WebBaseLoaderParams {

658

/** Playwright browser launch options */

659

launchOptions?: LaunchOptions;

660

/** Page evaluation options */

661

evaluateOptions?: EvaluateOptions;

662

}

663

```

664

665

### Database Loader Types

666

667

```typescript { .api }

668

interface SQLDatabaseLoaderParams {

669

/** Column names to include in metadata */

670

metadataColumns?: string[];

671

/** Columns to use as page content */

672

contentColumns?: string[];

673

}

674

675

interface CSVLoaderParams {

676

/** Column to use as document content */

677

column?: string;

678

/** Columns to include in metadata */

679

metadataColumns?: string[];

680

/** CSV parsing options */

681

csvOptions?: CSVParseOptions;

682

}

683

684

interface CSVParseOptions {

685

/** Field delimiter */

686

delimiter?: string;

687

/** Quote character */

688

quote?: string;

689

/** Escape character */

690

escape?: string;

691

/** Whether first row contains headers */

692

headers?: boolean;

693

}

694

```

695

696

### API and Service Loader Types

697

698

```typescript { .api }

699

interface NotionAPILoaderParams {

700

/** Notion integration token */

701

integrationToken: string;

702

/** Notion page or database ID */

703

notionID: string;

704

/** Type of Notion resource */

705

type: "page" | "database";

706

/** Properties to include */

707

propertiesAsMetadata?: boolean;

708

}

709

710

interface GithubRepoLoaderParams {

711

/** GitHub access token */

712

accessToken?: string;

713

/** Branch to clone from */

714

branch?: string;

715

/** File patterns to include */

716

include?: string[];

717

/** File patterns to exclude */

718

exclude?: string[];

719

/** Maximum file size to process */

720

maxFileSize?: number;

721

}

722

723

interface S3LoaderParams {

724

/** AWS S3 configuration */

725

s3Config?: S3Config;

726

/** Additional metadata */

727

metadata?: Record<string, any>;

728

}

729

730

interface S3Config {

731

/** AWS region */

732

region?: string;

733

/** AWS access key ID */

734

accessKeyId?: string;

735

/** AWS secret access key */

736

secretAccessKey?: string;

737

/** AWS session token */

738

sessionToken?: string;

739

}

740

```

741

742

### Specialized Format Types

743

744

```typescript { .api }

745

interface PDFLoaderParams {

746

/** Whether to split each page into separate document */

747

splitPages?: boolean;

748

/** PDF.js parsing options */

749

pdfParseOptions?: PDFParseOptions;

750

}

751

752

interface PDFParseOptions {

753

/** Maximum pages to process */

754

maxPages?: number;

755

/** Whether to use system fonts */

756

useSystemFonts?: boolean;

757

/** Custom font loading */

758

fontExtraProperties?: boolean;

759

}

760

761

interface OpenAIFunctionsTransformerOptions {

762

/** OpenAI function definitions */

763

functions?: OpenAIFunctionDefinition[];

764

/** Whether to include raw function results */

765

includeRaw?: boolean;

766

}

767

768

interface HtmlToTextOptions {

769

/** Selectors to ignore */

770

ignoreSelectors?: string[];

771

/** Whether to preserve links */

772

preserveLinks?: boolean;

773

/** Word wrap width */

774

wordwrap?: number | false;

775

}

776

```

777

778

## Document Processing Patterns

779

780

### Batch Document Loading

781

782

```typescript

783

import { DirectoryLoader, TextLoader, JSONLoader } from "langchain/document_loaders";
import { RecursiveCharacterTextSplitter } from "langchain/text_splitter";

784

785

async function loadAndProcessDocuments(directory: string) {

786

// Load all documents from directory

787

const loader = new DirectoryLoader(directory, {

788

".txt": (path) => new TextLoader(path),

789

".md": (path) => new TextLoader(path),

790

".json": (path) => new JSONLoader(path),

791

});

792

793

// Split into chunks

794

const splitter = new RecursiveCharacterTextSplitter({

795

chunkSize: 1000,

796

chunkOverlap: 200,

797

});

798

799

const docs = await loader.loadAndSplit(splitter);

800

801

// Add processing metadata

802

return docs.map(doc => ({

803

...doc,

804

metadata: {

805

...doc.metadata,

806

processed_at: new Date().toISOString(),

807

chunk_size: doc.pageContent.length,

808

}

809

}));

810

}

811

```

812

813

### Custom Document Loader

814

815

```typescript

816

class CustomAPILoader extends BaseDocumentLoader {

817

constructor(private apiEndpoint: string, private apiKey: string) {

818

super();

819

}

820

821

async load(): Promise<DocumentInterface[]> {

822

const response = await fetch(this.apiEndpoint, {

823

headers: { 'Authorization': `Bearer ${this.apiKey}` }

824

});

825

826

const data = await response.json();

827

828

return data.results.map((item: any) => ({

829

pageContent: item.content,

830

metadata: {

831

source: this.apiEndpoint,

832

id: item.id,

833

created_at: item.created_at,

834

}

835

}));

836

}

837

}

838

```