# Web Scraping

Single URL scraping with multiple output formats, browser automation, and structured data extraction.

## Core Scraping Method

```typescript { .api }
/**
 * Scrape a single URL with optional format and processing options.
 *
 * Two overloads are declared. The generic overload requires `options` and
 * carries its concrete type through to the `json` field of the result; the
 * plain overload accepts any `ScrapeOptions` (or none) and resolves to a
 * `Document`.
 *
 * @param url - Target URL to scrape
 * @param options - Scraping configuration options
 * @returns Promise resolving to scraped document
 */
scrape<Opts extends ScrapeOptions>(
  url: string,
  options: Opts
): Promise<Omit<Document, "json"> & { json?: InferredJsonFromOptions<Opts> }>;

scrape(url: string, options?: ScrapeOptions): Promise<Document>;
```

## Scrape Options

```typescript { .api }
/**
 * Configuration accepted by `scrape`. Every field is optional.
 *
 * NOTE(review): units for the numeric fields (`timeout`, `waitFor`, `maxAge`)
 * are not stated in this reference; presumably milliseconds — confirm against
 * the SDK documentation.
 */
interface ScrapeOptions {
  // Output formats to include in response
  formats?: FormatOption[];

  // HTTP configuration
  headers?: Record<string, string>;
  timeout?: number;
  skipTlsVerification?: boolean;
  // One of the named proxy modes, or any other string
  proxy?: "basic" | "stealth" | "auto" | string;

  // Content filtering
  includeTags?: string[];
  excludeTags?: string[];
  onlyMainContent?: boolean;
  removeBase64Images?: boolean;

  // Browser behavior
  mobile?: boolean;
  waitFor?: number;
  fastMode?: boolean;
  blockAds?: boolean;

  // Browser automation
  actions?: ActionOption[];

  // Document parsing: a parser name, or a PDF parser config with an optional page cap
  parsers?: Array<string | { type: "pdf"; maxPages?: number }>;

  // Location simulation
  location?: LocationConfig;

  // Caching
  maxAge?: number;
  storeInCache?: boolean;

  // Testing
  useMock?: string;

  // Integration tracking
  integration?: string;
}
```

## Format Options

```typescript { .api }
// Available format strings: the shorthand names that can be passed directly
// in `formats` without a configuration object.
type FormatString =
  | "markdown"
  | "html"
  | "rawHtml"
  | "links"
  | "images"
  | "screenshot"
  | "summary"
  | "changeTracking"
  | "json"
  | "attributes";

// Format configurations: either a bare format string or one of the
// object forms that carry per-format settings.
type FormatOption =
  | FormatString
  | JsonFormat
  | ScreenshotFormat
  | ChangeTrackingFormat
  | AttributesFormat;

// JSON extraction with schema. Both `prompt` and `schema` are optional;
// `schema` accepts either a plain object or a Zod type.
interface JsonFormat {
  type: "json";
  prompt?: string;
  schema?: Record<string, unknown> | ZodTypeAny;
}

// Screenshot configuration. Only `type` is required.
interface ScreenshotFormat {
  type: "screenshot";
  fullPage?: boolean;
  quality?: number;
  viewport?: Viewport | { width: number; height: number };
}

// Change tracking. `modes` is required; the remaining fields are optional.
interface ChangeTrackingFormat {
  type: "changeTracking";
  modes: ("git-diff" | "json")[];
  schema?: Record<string, unknown>;
  prompt?: string;
  tag?: string;
}

// Attribute extraction: each entry pairs a selector with the name of the
// attribute to read from matching elements.
interface AttributesFormat {
  type: "attributes";
  selectors: Array<{
    selector: string;
    attribute: string;
  }>;
}
```

## Browser Actions

```typescript { .api }
// Available action types: a union discriminated by each member's `type` field.
type ActionOption =
  | WaitAction
  | ScreenshotAction
  | ClickAction
  | WriteAction
  | PressAction
  | ScrollAction
  | ScrapeAction
  | ExecuteJavascriptAction
  | PDFAction;

// Wait for element or time. Both `milliseconds` and `selector` are optional
// in the type.
interface WaitAction {
  type: "wait";
  milliseconds?: number;
  selector?: string;
}

// Click elements: `selector` identifies the element to click.
interface ClickAction {
  type: "click";
  selector: string;
}

// Type text. The action carries only the text and no selector of its own.
interface WriteAction {
  type: "write";
  text: string;
}

// Press keys: `key` names the key to press.
interface PressAction {
  type: "press";
  key: string;
}

// Scroll page. `direction` is required; `selector` is optional.
interface ScrollAction {
  type: "scroll";
  direction: "up" | "down";
  selector?: string;
}

// Take screenshot. Only `type` is required.
interface ScreenshotAction {
  type: "screenshot";
  fullPage?: boolean;
  quality?: number;
  viewport?: Viewport | { width: number; height: number };
}

// Scrape current state. The action takes no parameters beyond its `type`.
interface ScrapeAction {
  type: "scrape";
}

// Execute JavaScript: `script` holds the source text to run.
interface ExecuteJavascriptAction {
  type: "executeJavascript";
  script: string;
}

// Generate PDF. Only `type` is required; `format` is limited to the listed
// paper-size names.
interface PDFAction {
  type: "pdf";
  format?: "A0" | "A1" | "A2" | "A3" | "A4" | "A5" | "A6" | "Letter" | "Legal" | "Tabloid" | "Ledger";
  landscape?: boolean;
  scale?: number;
}
```

## Location Configuration

```typescript { .api }
// Location simulation settings used by `ScrapeOptions.location`.
// Both fields are optional.
interface LocationConfig {
  country?: string;
  languages?: string[];
}

// Viewport dimensions referenced by the screenshot format and screenshot action.
interface Viewport {
  width: number;
  height: number;
}
```

## Usage Examples

### Basic Scraping

```typescript
// Simple markdown extraction
const result = await app.scrape('https://example.com', {
  formats: ['markdown']
});
console.log(result.markdown);

// Multiple formats (bound to a distinct name so the whole snippet can be
// pasted as one unit without redeclaring `result`)
const multiFormatResult = await app.scrape('https://example.com', {
  formats: ['markdown', 'html', 'links', 'images']
});
```

### JSON Extraction with Schema

```typescript
import { z } from 'zod';

// Using Zod schema
const ProductSchema = z.object({
  name: z.string(),
  price: z.number(),
  description: z.string(),
  inStock: z.boolean()
});

const result = await app.scrape('https://shop.example.com/product/123', {
  formats: [{
    type: 'json',
    schema: ProductSchema
  }]
});
// result.json is optional, and its type is inferred from ProductSchema

// Using JSON schema object
const result2 = await app.scrape('https://shop.example.com/product/123', {
  formats: [{
    type: 'json',
    schema: {
      type: 'object',
      properties: {
        name: { type: 'string' },
        price: { type: 'number' }
      },
      required: ['name', 'price']
    }
  }]
});
```

### Browser Automation

```typescript
// Login and scrape protected content.
// Actions are listed in order: wait for the form, fill both fields,
// submit, pause, then capture the page.
const result = await app.scrape('https://app.example.com/login', {
  formats: ['markdown'],
  actions: [
    { type: 'wait', selector: '#username' },
    // `write` has no selector of its own, so each field is clicked first
    { type: 'click', selector: '#username' },
    { type: 'write', text: 'myuser@example.com' },
    { type: 'click', selector: '#password' },
    { type: 'write', text: 'mypassword' },
    { type: 'click', selector: '#login-button' },
    { type: 'wait', milliseconds: 3000 },
    { type: 'scrape' }
  ]
});
```

### Screenshot with Custom Viewport

```typescript
// Request a full-page screenshot using the object form of the format,
// with an explicit quality and viewport size.
const result = await app.scrape('https://example.com', {
  formats: [{
    type: 'screenshot',
    fullPage: true,
    quality: 90,
    viewport: { width: 1920, height: 1080 }
  }]
});
console.log(result.screenshot); // Base64 image data
```

### Attribute Extraction

```typescript
// Request named attributes from the elements matched by each selector.
const result = await app.scrape('https://example.com', {
  formats: [{
    type: 'attributes',
    selectors: [
      { selector: 'a', attribute: 'href' },
      { selector: 'img', attribute: 'src' },
      { selector: 'meta[name="description"]', attribute: 'content' }
    ]
  }]
});
console.log(result.attributes);
```

### Advanced Configuration

```typescript
// Combine HTTP, content-filtering, browser, location and proxy options
// in a single call.
const result = await app.scrape('https://example.com', {
  formats: ['markdown', 'screenshot'],
  headers: {
    'User-Agent': 'MyBot/1.0',
    'Accept-Language': 'en-US,en;q=0.9'
  },
  mobile: true,
  waitFor: 2000,
  includeTags: ['article', 'main'],
  excludeTags: ['nav', 'footer', 'aside'],
  onlyMainContent: true,
  blockAds: true,
  location: {
    country: 'US',
    languages: ['en']
  },
  proxy: 'stealth'
});
```