or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

batch.md · crawling.md · extraction.md · index.md · mapping.md · monitoring.md · scraping.md · search.md · usage.md · v1-api.md

docs/scraping.md

0

# Web Scraping

1

2

Scrape a single URL with a choice of output formats, optional browser automation, and structured data extraction.

3

4

## Core Scraping Method

5

6

```typescript { .api }

7

/**

8

* Scrape a single URL with optional format and processing options.

9

* @param url - Target URL to scrape

10

* @param options - Scraping configuration options

11

* @returns Promise resolving to scraped document

12

*/

13

scrape<Opts extends ScrapeOptions>(

14

url: string,

15

options: Opts

16

): Promise<Omit<Document, "json"> & { json?: InferredJsonFromOptions<Opts> }>;

17

18

scrape(url: string, options?: ScrapeOptions): Promise<Document>;

19

```

20

21

## Scrape Options

22

23

```typescript { .api }

24

interface ScrapeOptions {

25

// Output formats to include in response

26

formats?: FormatOption[];

27

28

// HTTP configuration

29

headers?: Record<string, string>;

30

timeout?: number;

31

skipTlsVerification?: boolean;

32

proxy?: "basic" | "stealth" | "auto" | string;

33

34

// Content filtering

35

includeTags?: string[];

36

excludeTags?: string[];

37

onlyMainContent?: boolean;

38

removeBase64Images?: boolean;

39

40

// Browser behavior

41

mobile?: boolean;

42

waitFor?: number;

43

fastMode?: boolean;

44

blockAds?: boolean;

45

46

// Browser automation

47

actions?: ActionOption[];

48

49

// Document parsing

50

parsers?: Array<string | { type: "pdf"; maxPages?: number }>;

51

52

// Location simulation

53

location?: LocationConfig;

54

55

// Caching

56

maxAge?: number;

57

storeInCache?: boolean;

58

59

// Testing

60

useMock?: string;

61

62

// Integration tracking

63

integration?: string;

64

}

65

```

66

67

## Format Options

68

69

```typescript { .api }

70

// Available format strings

71

type FormatString =

72

| "markdown"

73

| "html"

74

| "rawHtml"

75

| "links"

76

| "images"

77

| "screenshot"

78

| "summary"

79

| "changeTracking"

80

| "json"

81

| "attributes";

82

83

// Format configurations

84

type FormatOption =

85

| FormatString

86

| JsonFormat

87

| ScreenshotFormat

88

| ChangeTrackingFormat

89

| AttributesFormat;

90

91

// JSON extraction with schema

92

interface JsonFormat {

93

type: "json";

94

prompt?: string;

95

schema?: Record<string, unknown> | ZodTypeAny;

96

}

97

98

// Screenshot configuration

99

interface ScreenshotFormat {

100

type: "screenshot";

101

fullPage?: boolean;

102

quality?: number;

103

viewport?: Viewport | { width: number; height: number };

104

}

105

106

// Change tracking

107

interface ChangeTrackingFormat {

108

type: "changeTracking";

109

modes: ("git-diff" | "json")[];

110

schema?: Record<string, unknown>;

111

prompt?: string;

112

tag?: string;

113

}

114

115

// Attribute extraction

116

interface AttributesFormat {

117

type: "attributes";

118

selectors: Array<{

119

selector: string;

120

attribute: string;

121

}>;

122

}

123

```

124

125

## Browser Actions

126

127

```typescript { .api }

128

// Available action types

129

type ActionOption =

130

| WaitAction

131

| ScreenshotAction

132

| ClickAction

133

| WriteAction

134

| PressAction

135

| ScrollAction

136

| ScrapeAction

137

| ExecuteJavascriptAction

138

| PDFAction;

139

140

// Wait for element or time

141

interface WaitAction {

142

type: "wait";

143

milliseconds?: number;

144

selector?: string;

145

}

146

147

// Click elements

148

interface ClickAction {

149

type: "click";

150

selector: string;

151

}

152

153

// Type text

154

interface WriteAction {

155

type: "write";

156

text: string;

157

}

158

159

// Press keys

160

interface PressAction {

161

type: "press";

162

key: string;

163

}

164

165

// Scroll page

166

interface ScrollAction {

167

type: "scroll";

168

direction: "up" | "down";

169

selector?: string;

170

}

171

172

// Take screenshot

173

interface ScreenshotAction {

174

type: "screenshot";

175

fullPage?: boolean;

176

quality?: number;

177

viewport?: Viewport | { width: number; height: number };

178

}

179

180

// Scrape current state

181

interface ScrapeAction {

182

type: "scrape";

183

}

184

185

// Execute JavaScript

186

interface ExecuteJavascriptAction {

187

type: "executeJavascript";

188

script: string;

189

}

190

191

// Generate PDF

192

interface PDFAction {

193

type: "pdf";

194

format?: "A0" | "A1" | "A2" | "A3" | "A4" | "A5" | "A6" | "Letter" | "Legal" | "Tabloid" | "Ledger";

195

landscape?: boolean;

196

scale?: number;

197

}

198

```

199

200

## Location Configuration

201

202

```typescript { .api }

203

interface LocationConfig {

204

country?: string;

205

languages?: string[];

206

}

207

208

interface Viewport {

209

width: number;

210

height: number;

211

}

212

```

213

214

## Usage Examples

215

216

### Basic Scraping

217

218

```typescript

219

// Simple markdown extraction

220

const result = await app.scrape('https://example.com', {

221

formats: ['markdown']

222

});

223

console.log(result.markdown);

224

225

// Multiple formats

226

const multiFormatResult = await app.scrape('https://example.com', {

227

formats: ['markdown', 'html', 'links', 'images']

228

});

229

```

230

231

### JSON Extraction with Schema

232

233

```typescript

234

import { z } from 'zod';

235

236

// Using Zod schema

237

const ProductSchema = z.object({

238

name: z.string(),

239

price: z.number(),

240

description: z.string(),

241

inStock: z.boolean()

242

});

243

244

const result = await app.scrape('https://shop.example.com/product/123', {

245

formats: [{

246

type: 'json',

247

schema: ProductSchema

248

}]

249

});

250

// result.json is now typed from the shape described by ProductSchema (it is optional, so check for undefined)

251

252

// Using JSON schema object

253

const result2 = await app.scrape('https://shop.example.com/product/123', {

254

formats: [{

255

type: 'json',

256

schema: {

257

type: 'object',

258

properties: {

259

name: { type: 'string' },

260

price: { type: 'number' }

261

},

262

required: ['name', 'price']

263

}

264

}]

265

});

266

```

267

268

### Browser Automation

269

270

```typescript

271

// Log in and scrape protected content

272

const result = await app.scrape('https://app.example.com/login', {

273

formats: ['markdown'],

274

actions: [

275

{ type: 'wait', selector: '#username' },

276

{ type: 'click', selector: '#username' },

277

{ type: 'write', text: 'myuser@example.com' },

278

{ type: 'click', selector: '#password' },

279

{ type: 'write', text: 'mypassword' },

280

{ type: 'click', selector: '#login-button' },

281

{ type: 'wait', milliseconds: 3000 },

282

{ type: 'scrape' }

283

]

284

});

285

```

286

287

### Screenshot with Custom Viewport

288

289

```typescript

290

const result = await app.scrape('https://example.com', {

291

formats: [{

292

type: 'screenshot',

293

fullPage: true,

294

quality: 90,

295

viewport: { width: 1920, height: 1080 }

296

}]

297

});

298

console.log(result.screenshot); // Base64 image data

299

```

300

301

### Attribute Extraction

302

303

```typescript

304

const result = await app.scrape('https://example.com', {

305

formats: [{

306

type: 'attributes',

307

selectors: [

308

{ selector: 'a', attribute: 'href' },

309

{ selector: 'img', attribute: 'src' },

310

{ selector: 'meta[name="description"]', attribute: 'content' }

311

]

312

}]

313

});

314

console.log(result.attributes);

315

```

316

317

### Advanced Configuration

318

319

```typescript

320

const result = await app.scrape('https://example.com', {

321

formats: ['markdown', 'screenshot'],

322

headers: {

323

'User-Agent': 'MyBot/1.0',

324

'Accept-Language': 'en-US,en;q=0.9'

325

},

326

mobile: true,

327

waitFor: 2000,

328

includeTags: ['article', 'main'],

329

excludeTags: ['nav', 'footer', 'aside'],

330

onlyMainContent: true,

331

blockAds: true,

332

location: {

333

country: 'US',

334

languages: ['en']

335

},

336

proxy: 'stealth'

337

});

338

```