0
# Document Loading and Processing
1
2
Comprehensive document loaders for various file formats and sources, plus text splitting and transformation utilities. Document loaders provide the foundation for ingesting data into LangChain applications.
3
4
## Capabilities
5
6
### Base Document Loader
7
8
Foundation class for all document loaders with standardized interfaces.
9
10
```typescript { .api }
11
/**
12
* Base document loader class - all loaders inherit from this
13
*/
14
abstract class BaseDocumentLoader {
15
/** Load documents from the source */
16
abstract load(): Promise<DocumentInterface[]>;
17
18
/** Load and split documents using a text splitter */
19
loadAndSplit(textSplitter?: TextSplitter): Promise<DocumentInterface[]>;
20
}
21
22
/**
23
* Document interface representing loaded content
24
*/
25
interface DocumentInterface {
26
/** Main content of the document */
27
pageContent: string;
28
29
/** Metadata about the document */
30
metadata: Record<string, any>;
31
}
32
```
33
34
### File System Loaders
35
36
Loaders for various file formats commonly found on local file systems.
37
38
```typescript { .api }
39
/**
40
* Text file loader for .txt files
41
*/
42
class TextLoader extends BaseDocumentLoader {
  constructor(filePathOrBlob: string | Blob);

  /** File path or blob to load, as passed to the constructor */
  filePathOrBlob: string | Blob;

  /** Load documents from the file path or blob */
  load(): Promise<DocumentInterface[]>;
}
50
51
/**
52
* JSON file loader with JSONPointer support
53
*/
54
class JSONLoader extends BaseDocumentLoader {
55
constructor(
56
filePathOrBlob: string | Blob,
57
pointers?: string | string[]
58
);
59
60
/** File path or blob */
61
filePathOrBlob: string | Blob;
62
63
/** JSONPointer paths to extract */
64
pointers: string[];
65
66
load(): Promise<DocumentInterface[]>;
67
}
68
69
/**
70
* Directory loader that processes multiple files
71
*/
72
class DirectoryLoader extends BaseDocumentLoader {
73
constructor(
74
directoryPath: string,
75
loaders: Record<string, (filePath: string) => BaseDocumentLoader>,
76
recursive?: boolean,
77
unknown?: UnknownHandling
78
);
79
80
/** Directory path to scan */
81
directoryPath: string;
82
83
/** Map of file extensions to loader factories */
84
loaders: Record<string, (filePath: string) => BaseDocumentLoader>;
85
86
/** Whether to scan recursively */
87
recursive: boolean;
88
89
/** How to handle unknown file types */
90
unknown: UnknownHandling;
91
92
load(): Promise<DocumentInterface[]>;
93
}
94
95
/**
96
* Multi-file loader for processing multiple specific files
97
*/
98
class MultiFileLoader extends BaseDocumentLoader {
99
constructor(filePaths: string[], loaders: Record<string, typeof BaseDocumentLoader>);
100
101
/** Array of file paths to load */
102
filePaths: string[];
103
104
/** Map of extensions to loader classes */
105
loaders: Record<string, typeof BaseDocumentLoader>;
106
107
load(): Promise<DocumentInterface[]>;
108
}
109
110
/**
111
* Buffer loader for in-memory content
112
*/
113
class BufferLoader extends BaseDocumentLoader {
114
constructor(
115
buffer: Buffer,
116
metadata?: Record<string, any>
117
);
118
119
/** Buffer containing file data */
120
buffer: Buffer;
121
122
/** Additional metadata */
123
metadata: Record<string, any>;
124
125
load(): Promise<DocumentInterface[]>;
126
}
127
```
128
129
**Usage Examples:**
130
131
```typescript
132
import {
133
TextLoader,
134
JSONLoader,
135
DirectoryLoader,
136
MultiFileLoader
137
} from "langchain/document_loaders";
138
139
// Load single text file
140
const textLoader = new TextLoader("./documents/readme.txt");
141
const textDocs = await textLoader.load();
142
143
// Load JSON with specific fields
144
const jsonLoader = new JSONLoader(
145
"./data/users.json",
146
["/users/0/name", "/users/0/email"] // JSONPointer paths
147
);
148
const jsonDocs = await jsonLoader.load();
149
150
// Load entire directory
151
const directoryLoader = new DirectoryLoader(
152
"./documents",
153
{
154
".txt": (path) => new TextLoader(path),
155
".json": (path) => new JSONLoader(path),
156
".md": (path) => new TextLoader(path),
157
},
158
true // recursive
159
);
160
const allDocs = await directoryLoader.load();
161
162
// Load specific files
163
const multiLoader = new MultiFileLoader(
164
["./doc1.txt", "./doc2.json", "./doc3.md"],
165
{
166
".txt": TextLoader,
167
".json": JSONLoader,
168
".md": TextLoader,
169
}
170
);
171
const specificDocs = await multiLoader.load();
172
```
173
174
### Text Splitters
175
176
Utilities for splitting large documents into smaller, manageable chunks.
177
178
```typescript { .api }
179
/**
180
* Abstract base class for text splitters
181
*/
182
abstract class TextSplitter {
183
constructor(fields?: TextSplitterParams);
184
185
/** Maximum chunk size */
186
chunkSize: number;
187
188
/** Overlap between chunks */
189
chunkOverlap: number;
190
191
/** Function to calculate text length */
192
lengthFunction: (text: string) => number;
193
194
/** Keep separator in chunks */
195
keepSeparator: boolean;
196
197
/** Split text into chunks */
198
abstract splitText(text: string): Promise<string[]>;
199
200
/** Create documents from text */
201
createDocuments(
202
texts: string[],
203
metadatas?: Record<string, any>[]
204
): Promise<DocumentInterface[]>;
205
206
/** Split existing documents */
207
splitDocuments(documents: DocumentInterface[]): Promise<DocumentInterface[]>;
208
}
209
210
/**
211
* Character-based text splitter
212
*/
213
class CharacterTextSplitter extends TextSplitter {
214
constructor(fields?: CharacterTextSplitterParams);
215
216
/** Separator character/string */
217
separator: string;
218
219
splitText(text: string): Promise<string[]>;
220
221
static fromTikTokenEncoder(
222
encoding: TikTokenEncoding,
223
fields?: Partial<CharacterTextSplitterParams>
224
): CharacterTextSplitter;
225
}
226
227
/**
228
* Recursive character text splitter with multiple separators
229
*/
230
class RecursiveCharacterTextSplitter extends TextSplitter {
231
constructor(fields?: RecursiveCharacterTextSplitterParams);
232
233
/** Array of separators to try in order */
234
separators: string[];
235
236
splitText(text: string): Promise<string[]>;
237
238
static fromLanguage(
239
language: "cpp" | "go" | "java" | "js" | "php" | "proto" | "python" | "rst" | "ruby" | "rust" | "scala" | "swift" | "markdown" | "latex" | "html" | "sol",
240
options?: Partial<RecursiveCharacterTextSplitterParams>
241
): RecursiveCharacterTextSplitter;
242
}
243
244
/**
245
* Token-based text splitter
246
*/
247
class TokenTextSplitter extends TextSplitter {
248
constructor(fields?: TokenTextSplitterParams);
249
250
/** Encoding name for tokenization */
251
encodingName: TikTokenEncoding;
252
253
/** Allowed special tokens */
254
allowedSpecial: Set<string> | "all";
255
256
/** Disallowed special tokens */
257
disallowedSpecial: Set<string> | "all";
258
259
splitText(text: string): Promise<string[]>;
260
}
261
```
262
263
**Usage Examples:**
264
265
```typescript
266
import {
  CharacterTextSplitter,
  RecursiveCharacterTextSplitter,
  TokenTextSplitter
} from "langchain/text_splitter";
// TextLoader is used by the loader-integration example at the end.
import { TextLoader } from "langchain/document_loaders";

// NOTE: `longText`, `javascriptCode`, and `text` below are placeholder
// strings that the caller is expected to supply.

// Character-based splitting
const charSplitter = new CharacterTextSplitter({
  separator: "\n\n",
  chunkSize: 1000,
  chunkOverlap: 200,
});

const chunks1 = await charSplitter.splitText(longText);

// Recursive splitting with multiple separators
const recursiveSplitter = new RecursiveCharacterTextSplitter({
  chunkSize: 1000,
  chunkOverlap: 200,
});

// Language-specific splitting
const jsSplitter = RecursiveCharacterTextSplitter.fromLanguage("js", {
  chunkSize: 2000,
  chunkOverlap: 200,
});

const jsChunks = await jsSplitter.splitText(javascriptCode);

// Token-based splitting
const tokenSplitter = new TokenTextSplitter({
  encodingName: "gpt2",
  chunkSize: 1000,
  chunkOverlap: 0,
});

const tokenChunks = await tokenSplitter.splitText(text);

// Split documents with loader integration
const loader = new TextLoader("large_document.txt");
const docs = await loader.loadAndSplit(recursiveSplitter);
307
```
308
309
### Web Loaders
310
311
Loaders for web-based content and APIs.
312
313
```typescript { .api }
314
/**
315
* Web-based document loader
316
*/
317
class WebBaseLoader extends BaseDocumentLoader {
318
constructor(webPath: string | string[], options?: WebBaseLoaderParams);
319
320
/** URL(s) to load */
321
webPath: string | string[];
322
323
/** Request options */
324
requestOptions?: RequestInit;
325
326
/** Text decoder for the response */
327
textDecoder?: TextDecoder;
328
329
load(): Promise<DocumentInterface[]>;
330
}
331
332
/**
333
* Cheerio web scraper loader
334
*/
335
class CheerioWebBaseLoader extends BaseDocumentLoader {
336
constructor(webPath: string, options?: CheerioWebBaseLoaderParams);
337
338
/** URL to scrape */
339
webPath: string;
340
341
/** Cheerio selector */
342
selector?: string;
343
344
/** Text extraction options */
345
textTransformer?: (text: string) => string;
346
347
load(): Promise<DocumentInterface[]>;
348
}
349
350
/**
351
* Playwright web scraper loader
352
*/
353
class PlaywrightWebBaseLoader extends BaseDocumentLoader {
354
constructor(webPath: string, options?: PlaywrightWebBaseLoaderParams);
355
356
/** URL to scrape */
357
webPath: string;
358
359
/** Playwright launch options */
360
launchOptions?: LaunchOptions;
361
362
/** Page evaluation options */
363
evaluateOptions?: EvaluateOptions;
364
365
load(): Promise<DocumentInterface[]>;
366
}
367
```
368
369
### Database Loaders
370
371
Loaders for various database systems and data sources.
372
373
```typescript { .api }
374
/**
375
* SQL database loader
376
*/
377
class SQLDatabaseLoader extends BaseDocumentLoader {
378
constructor(query: string, database: SqlDatabase, options?: SQLDatabaseLoaderParams);
379
380
/** SQL query to execute */
381
query: string;
382
383
/** Database connection */
384
database: SqlDatabase;
385
386
/** Additional options */
387
options: SQLDatabaseLoaderParams;
388
389
load(): Promise<DocumentInterface[]>;
390
}
391
392
/**
393
* CSV file loader
394
*/
395
class CSVLoader extends BaseDocumentLoader {
396
constructor(filePath: string, options?: CSVLoaderParams);
397
398
/** CSV file path */
399
filePath: string;
400
401
/** Column to use as content */
402
column?: string;
403
404
/** CSV parsing options */
405
csvOptions?: CSVParseOptions;
406
407
load(): Promise<DocumentInterface[]>;
408
}
409
```
410
411
### API and Service Loaders
412
413
Loaders for external APIs and cloud services.
414
415
```typescript { .api }
416
/**
417
* Notion API loader
418
*/
419
class NotionAPILoader extends BaseDocumentLoader {
420
constructor(options: NotionAPILoaderParams);
421
422
/** Notion integration token */
423
integrationToken: string;
424
425
/** Notion page or database ID */
426
notionID: string;
427
428
/** Type of Notion resource */
429
type: "page" | "database";
430
431
load(): Promise<DocumentInterface[]>;
432
}
433
434
/**
435
* GitHub repository loader
436
*/
437
class GithubRepoLoader extends BaseDocumentLoader {
438
constructor(
439
githubUrl: string,
440
options?: GithubRepoLoaderParams
441
);
442
443
/** GitHub repository URL */
444
githubUrl: string;
445
446
/** Access token for private repos */
447
accessToken?: string;
448
449
/** Branch to load from */
450
branch?: string;
451
452
/** File patterns to include */
453
include?: string[];
454
455
/** File patterns to exclude */
456
exclude?: string[];
457
458
load(): Promise<DocumentInterface[]>;
459
}
460
461
/**
462
* S3 file loader
463
*/
464
class S3Loader extends BaseDocumentLoader {
465
constructor(bucket: string, key: string, options?: S3LoaderParams);
466
467
/** S3 bucket name */
468
bucket: string;
469
470
/** S3 object key */
471
key: string;
472
473
/** AWS configuration */
474
s3Config?: S3Config;
475
476
load(): Promise<DocumentInterface[]>;
477
}
478
```
479
480
### Specialized Format Loaders
481
482
Loaders for specific document formats and content types.
483
484
```typescript { .api }
485
/**
486
* PDF document loader
487
*/
488
class PDFLoader extends BaseDocumentLoader {
489
constructor(filePathOrBlob: string | Blob, options?: PDFLoaderParams);
490
491
/** PDF file path or blob */
492
filePathOrBlob: string | Blob;
493
494
/** Split pages into separate documents */
495
splitPages?: boolean;
496
497
/** PDF parsing options */
498
pdfParseOptions?: PDFParseOptions;
499
500
load(): Promise<DocumentInterface[]>;
501
}
502
503
/**
504
* Microsoft Word document loader
505
*/
506
class DocxLoader extends BaseDocumentLoader {
507
constructor(filePathOrBlob: string | Blob);
508
509
/** Word document path or blob */
510
filePathOrBlob: string | Blob;
511
512
load(): Promise<DocumentInterface[]>;
513
}
514
515
/**
516
* PowerPoint presentation loader
517
*/
518
class PPTXLoader extends BaseDocumentLoader {
519
constructor(filePathOrBlob: string | Blob);
520
521
/** PowerPoint file path or blob */
522
filePathOrBlob: string | Blob;
523
524
load(): Promise<DocumentInterface[]>;
525
}
526
527
/**
528
* Email message loader (EML format)
529
*/
530
class UnstructuredEmailLoader extends BaseDocumentLoader {
531
constructor(filePath: string);
532
533
/** Email file path */
534
filePath: string;
535
536
load(): Promise<DocumentInterface[]>;
537
}
538
```
539
540
### Document Transformers
541
542
Components for transforming and processing loaded documents.
543
544
```typescript { .api }
545
/**
546
* OpenAI functions document transformer
547
*/
548
class OpenAIFunctionsDocumentTransformer {
549
constructor(options?: OpenAIFunctionsTransformerOptions);
550
551
/** Transform documents using OpenAI functions */
552
transformDocuments(
553
documents: DocumentInterface[],
554
options?: TransformOptions
555
): Promise<DocumentInterface[]>;
556
}
557
558
/**
559
* HTML transformer for web content
560
*/
561
class HtmlToTextTransformer {
562
constructor(options?: HtmlToTextOptions);
563
564
/** Convert HTML to plain text */
565
transformDocuments(documents: DocumentInterface[]): Promise<DocumentInterface[]>;
566
}
567
```
568
569
## Types
570
571
### Base Loader Types
572
573
```typescript { .api }
574
interface BaseDocumentLoaderParams {
575
/** Additional metadata to add to all documents */
576
metadata?: Record<string, any>;
577
}
578
579
type UnknownHandling = "ignore" | "warn" | "error";
580
```
581
582
### File System Loader Types
583
584
```typescript { .api }
585
interface DirectoryLoaderOptions {
586
/** Whether to scan directories recursively */
587
recursive?: boolean;
588
/** How to handle unknown file types */
589
unknown?: UnknownHandling;
590
/** File patterns to include */
591
include?: string[];
592
/** File patterns to exclude */
593
exclude?: string[];
594
}
595
596
interface MultiFileLoaderOptions {
597
/** Map of file extensions to loader classes */
598
loaders: Record<string, typeof BaseDocumentLoader>;
599
}
600
```
601
602
### Text Splitter Types
603
604
```typescript { .api }
605
interface TextSplitterParams {
606
/** Maximum size of each chunk */
607
chunkSize?: number;
608
/** Amount of overlap between consecutive chunks (same units as chunkSize) */
609
chunkOverlap?: number;
610
/** Function to calculate text length */
611
lengthFunction?: (text: string) => number;
612
/** Whether to keep separator in results */
613
keepSeparator?: boolean;
614
}
615
616
interface CharacterTextSplitterParams extends TextSplitterParams {
617
/** String to split on */
618
separator?: string;
619
}
620
621
interface RecursiveCharacterTextSplitterParams extends TextSplitterParams {
622
/** List of separators to try in order */
623
separators?: string[];
624
}
625
626
interface TokenTextSplitterParams extends TextSplitterParams {
627
/** Name of tiktoken encoding */
628
encodingName?: TikTokenEncoding;
629
/** Allowed special tokens */
630
allowedSpecial?: Set<string> | "all";
631
/** Disallowed special tokens */
632
disallowedSpecial?: Set<string> | "all";
633
}
634
635
type TikTokenEncoding = "gpt2" | "r50k_base" | "p50k_base" | "cl100k_base" | "o200k_base";
636
```
637
638
### Web Loader Types
639
640
```typescript { .api }
641
interface WebBaseLoaderParams {
642
/** Request configuration */
643
requestOptions?: RequestInit;
644
/** Text decoder for response */
645
textDecoder?: TextDecoder;
646
/** Additional metadata */
647
metadata?: Record<string, any>;
648
}
649
650
interface CheerioWebBaseLoaderParams extends WebBaseLoaderParams {
651
/** CSS selector for content extraction */
652
selector?: string;
653
/** Function to transform extracted text */
654
textTransformer?: (text: string) => string;
655
}
656
657
interface PlaywrightWebBaseLoaderParams extends WebBaseLoaderParams {
658
/** Playwright browser launch options */
659
launchOptions?: LaunchOptions;
660
/** Page evaluation options */
661
evaluateOptions?: EvaluateOptions;
662
}
663
```
664
665
### Database Loader Types
666
667
```typescript { .api }
668
interface SQLDatabaseLoaderParams {
669
/** Column names to include in metadata */
670
metadataColumns?: string[];
671
/** Columns to use as page content */
672
contentColumns?: string[];
673
}
674
675
interface CSVLoaderParams {
676
/** Column to use as document content */
677
column?: string;
678
/** Columns to include in metadata */
679
metadataColumns?: string[];
680
/** CSV parsing options */
681
csvOptions?: CSVParseOptions;
682
}
683
684
interface CSVParseOptions {
685
/** Field delimiter */
686
delimiter?: string;
687
/** Quote character */
688
quote?: string;
689
/** Escape character */
690
escape?: string;
691
/** Whether first row contains headers */
692
headers?: boolean;
693
}
694
```
695
696
### API Service Loader Types
697
698
```typescript { .api }
699
interface NotionAPILoaderParams {
700
/** Notion integration token */
701
integrationToken: string;
702
/** Notion page or database ID */
703
notionID: string;
704
/** Type of Notion resource */
705
type: "page" | "database";
706
/** Whether to include page properties as document metadata */
707
propertiesAsMetadata?: boolean;
708
}
709
710
interface GithubRepoLoaderParams {
711
/** GitHub access token */
712
accessToken?: string;
713
/** Branch to load from */
714
branch?: string;
715
/** File patterns to include */
716
include?: string[];
717
/** File patterns to exclude */
718
exclude?: string[];
719
/** Maximum file size to process */
720
maxFileSize?: number;
721
}
722
723
interface S3LoaderParams {
724
/** AWS S3 configuration */
725
s3Config?: S3Config;
726
/** Additional metadata */
727
metadata?: Record<string, any>;
728
}
729
730
interface S3Config {
731
/** AWS region */
732
region?: string;
733
/** AWS access key ID */
734
accessKeyId?: string;
735
/** AWS secret access key */
736
secretAccessKey?: string;
737
/** AWS session token */
738
sessionToken?: string;
739
}
740
```
741
742
### Specialized Format Types
743
744
```typescript { .api }
745
interface PDFLoaderParams {
746
/** Whether to split each page into separate document */
747
splitPages?: boolean;
748
/** PDF.js parsing options */
749
pdfParseOptions?: PDFParseOptions;
750
}
751
752
interface PDFParseOptions {
753
/** Maximum pages to process */
754
maxPages?: number;
755
/** Whether fonts not embedded in the PDF fall back to system fonts */
756
useSystemFonts?: boolean;
757
/** Whether to include extra font properties in the parsed font data */
758
fontExtraProperties?: boolean;
759
}
760
761
interface OpenAIFunctionsTransformerOptions {
762
/** OpenAI function definitions */
763
functions?: OpenAIFunctionDefinition[];
764
/** Whether to include raw function results */
765
includeRaw?: boolean;
766
}
767
768
interface HtmlToTextOptions {
769
/** Selectors to ignore */
770
ignoreSelectors?: string[];
771
/** Whether to preserve links */
772
preserveLinks?: boolean;
773
/** Word wrap width */
774
wordwrap?: number | false;
775
}
776
```
777
778
## Document Processing Patterns
779
780
### Batch Document Loading
781
782
```typescript
783
import {
  DirectoryLoader,
  JSONLoader,
  TextLoader
} from "langchain/document_loaders";
import { RecursiveCharacterTextSplitter } from "langchain/text_splitter";

/**
 * Load every supported file in a directory, split the documents into
 * overlapping chunks, and tag each chunk with processing metadata.
 *
 * @param directory - Path of the directory to scan
 * @returns The chunked documents, each with `processed_at` and `chunk_size`
 *          added to its metadata
 */
async function loadAndProcessDocuments(directory: string) {
  // Load all documents from directory
  const loader = new DirectoryLoader(directory, {
    ".txt": (path) => new TextLoader(path),
    ".md": (path) => new TextLoader(path),
    ".json": (path) => new JSONLoader(path),
  });

  // Split into chunks
  const splitter = new RecursiveCharacterTextSplitter({
    chunkSize: 1000,
    chunkOverlap: 200,
  });

  const docs = await loader.loadAndSplit(splitter);

  // Add processing metadata without discarding what the loader/splitter set
  return docs.map(doc => ({
    ...doc,
    metadata: {
      ...doc.metadata,
      processed_at: new Date().toISOString(),
      chunk_size: doc.pageContent.length,
    }
  }));
}
811
```
812
813
### Custom Document Loader
814
815
```typescript
816
/**
 * Example custom loader: fetches JSON from an HTTP endpoint and turns each
 * entry of the response's `results` array into a document.
 */
class CustomAPILoader extends BaseDocumentLoader {
  constructor(private apiEndpoint: string, private apiKey: string) {
    super();
  }

  /**
   * Fetch the endpoint with a bearer token and map the results to documents.
   *
   * @returns One document per item in `results`; `content` becomes the page
   *          content, and `id` / `created_at` are copied into the metadata
   * @throws Error if the HTTP status is not successful or the payload has no
   *         `results` array
   */
  async load(): Promise<DocumentInterface[]> {
    const response = await fetch(this.apiEndpoint, {
      headers: { 'Authorization': `Bearer ${this.apiKey}` }
    });

    // fetch() only rejects on network failure; HTTP error statuses still
    // resolve, so fail explicitly instead of parsing an error body as data.
    if (!response.ok) {
      throw new Error(
        `Request to ${this.apiEndpoint} failed: ${response.status} ${response.statusText}`
      );
    }

    const data = await response.json();

    // Guard the shape before mapping so a malformed payload produces a clear
    // error rather than "cannot read properties of undefined".
    if (!Array.isArray(data?.results)) {
      throw new Error(
        `Unexpected response from ${this.apiEndpoint}: "results" is not an array`
      );
    }

    return data.results.map((item: any) => ({
      pageContent: item.content,
      metadata: {
        source: this.apiEndpoint,
        id: item.id,
        created_at: item.created_at,
      }
    }));
  }
}
838
```