or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

batch.md · crawling.md · extraction.md · index.md · mapping.md · monitoring.md · scraping.md · search.md · usage.md · v1-api.md

docs/extraction.md

0

# Data Extraction

1

2

LLM-powered structured data extraction using natural language prompts, schemas, or AI agents for intelligent content processing.

3

4

## Core Extraction Methods

5

6

```typescript { .api }

7

/**

8

* Start an extract job (async)

9

* @param args - Extraction request configuration

10

* @returns Promise resolving to job ID or processing state

11

*/

12

startExtract(args: ExtractRequest): Promise<ExtractResponse>;

13

14

/**

15

* Get extract job status/data

16

* @param jobId - Extract job identifier

17

* @returns Promise resolving to extraction results

18

*/

19

getExtractStatus(jobId: string): Promise<ExtractResponse>;

20

21

/**

22

* Convenience waiter: start extract and poll until completion

23

* @param args - Extraction request plus waiter controls

24

* @returns Promise resolving to final extract response

25

*/

26

extract(args: ExtractRequest & { pollInterval?: number; timeout?: number }): Promise<ExtractResponse>;

27

```

28

29

## Extraction Configuration

30

31

```typescript { .api }

32

// Note: The exact ExtractRequest interface is inferred from method signatures

33

// Based on the v1 API, here are the typical extraction parameters:

34

interface ExtractRequest {

35

// URLs to extract data from

36

urls?: string[];

37

38

// Natural language extraction prompt

39

prompt?: string;

40

41

// Structured schema for extraction

42

schema?: Record<string, unknown> | ZodTypeAny;

43

44

// System prompt for AI context

45

systemPrompt?: string;

46

47

// Allow external link following

48

allowExternalLinks?: boolean;

49

50

// Enable web search for additional context

51

enableWebSearch?: boolean;

52

53

// Include subdomains in extraction

54

includeSubdomains?: boolean;

55

56

// Source origin tracking

57

origin?: string;

58

59

// Show source URLs in results

60

showSources?: boolean;

61

62

// Scraping options for URL processing

63

scrapeOptions?: ScrapeOptions;

64

65

// AI agent configuration

66

agent?: {

67

model?: string;

68

sessionId?: string;

69

};

70

}

71

```

72

73

## Extraction Response

74

75

```typescript { .api }

76

interface ExtractResponse {

77

success?: boolean;

78

id?: string;

79

status?: "processing" | "completed" | "failed" | "cancelled";

80

data?: unknown;

81

error?: string;

82

warning?: string;

83

sources?: Record<string, unknown>;

84

expiresAt?: string;

85

}

86

```

87

88

## Usage Examples

89

90

### Basic Data Extraction

91

92

```typescript

93

// Extract structured data using natural language

94

const extractResult = await app.extract({

95

urls: ['https://company.example.com/about'],

96

prompt: 'Extract the company name, founding year, number of employees, and main business areas'

97

});

98

99

console.log('Extracted data:', extractResult.data);

100

// Returns structured data based on the prompt

101

```

102

103

### Schema-Based Extraction

104

105

```typescript

106

import { z } from 'zod';

107

108

// Define extraction schema

109

const CompanySchema = z.object({

110

name: z.string(),

111

foundingYear: z.number(),

112

employees: z.number().optional(),

113

industry: z.string(),

114

headquarters: z.string(),

115

revenue: z.string().optional(),

116

products: z.array(z.string()),

117

keyExecutives: z.array(z.object({

118

name: z.string(),

119

title: z.string()

120

}))

121

});

122

123

const extractResult = await app.extract({

124

urls: [

125

'https://company.example.com/about',

126

'https://company.example.com/leadership',

127

'https://company.example.com/products'

128

],

129

schema: CompanySchema,

130

prompt: 'Extract comprehensive company information including leadership and product details'

131

});

132

133

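// extractResult.data follows the shape of CompanySchema, but ExtractResponse.data is typed `unknown` — validate it (e.g. CompanySchema.parse(extractResult.data)) before use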

134

console.log('Company data:', extractResult.data);

135

```

136

137

### Multi-URL Product Extraction

138

139

```typescript

140

const productUrls = [

141

'https://shop.example.com/products/laptop-pro',

142

'https://shop.example.com/products/tablet-air',

143

'https://shop.example.com/products/phone-max'

144

];

145

146

const ProductSchema = z.object({

147

name: z.string(),

148

price: z.number(),

149

currency: z.string(),

150

description: z.string(),

151

specifications: z.record(z.string()),

152

availability: z.enum(['in-stock', 'out-of-stock', 'pre-order']),

153

rating: z.number().optional(),

154

reviews: z.number().optional(),

155

images: z.array(z.string()),

156

category: z.string(),

157

brand: z.string()

158

});

159

160

const extractResult = await app.extract({

161

urls: productUrls,

162

schema: ProductSchema,

163

prompt: 'Extract comprehensive product information including pricing, specifications, and availability',

164

showSources: true

165

});

166

167

console.log('Products extracted:', extractResult.data);

168

console.log('Source URLs:', extractResult.sources);

169

```

170

171

### News Article Analysis

172

173

```typescript

174

const NewsArticleSchema = z.object({

175

headline: z.string(),

176

summary: z.string(),

177

mainPoints: z.array(z.string()),

178

author: z.string().optional(),

179

publishDate: z.string(),

180

source: z.string(),

181

sentiment: z.enum(['positive', 'negative', 'neutral']),

182

topics: z.array(z.string()),

183

keyQuotes: z.array(z.string()),

184

relatedCompanies: z.array(z.string()),

185

impact: z.string().optional()

186

});

187

188

const extractResult = await app.extract({

189

urls: [

190

'https://news.example.com/tech-breakthrough',

191

'https://news.example.com/market-analysis',

192

'https://news.example.com/industry-trends'

193

],

194

schema: NewsArticleSchema,

195

prompt: 'Analyze news articles for key information, sentiment, and business impact',

196

systemPrompt: 'You are a business analyst extracting key insights from news articles. Focus on factual information and business implications.',

197

enableWebSearch: true,

198

showSources: true

199

});

200

```

201

202

### Research Paper Extraction

203

204

```typescript

205

const ResearchPaperSchema = z.object({

206

title: z.string(),

207

authors: z.array(z.string()),

208

abstract: z.string(),

209

methodology: z.string(),

210

keyFindings: z.array(z.string()),

211

conclusions: z.string(),

212

futureWork: z.string().optional(),

213

citations: z.array(z.string()),

214

keywords: z.array(z.string()),

215

publishedDate: z.string().optional(),

216

journal: z.string().optional(),

217

doi: z.string().optional()

218

});

219

220

const extractResult = await app.extract({

221

urls: [

222

'https://research.example.com/papers/ai-ethics',

223

'https://research.example.com/papers/machine-learning-bias'

224

],

225

schema: ResearchPaperSchema,

226

prompt: 'Extract comprehensive research paper information including methodology, findings, and citations',

227

systemPrompt: 'You are an academic researcher extracting structured information from research papers. Focus on scientific accuracy and completeness.',

228

allowExternalLinks: true,

229

scrapeOptions: {

230

formats: ['markdown'],

231

onlyMainContent: true

232

}

233

});

234

```

235

236

### Async Extraction with Monitoring

237

238

```typescript

239

// Start extraction job

240

const extractResponse = await app.startExtract({

241

urls: Array.from({ length: 50 }, (_, i) =>

242

`https://reviews.example.com/product/${i + 1}`

243

),

244

schema: {

245

type: 'object',

246

properties: {

247

productName: { type: 'string' },

248

rating: { type: 'number' },

249

reviewText: { type: 'string' },

250

reviewer: { type: 'string' },

251

reviewDate: { type: 'string' },

252

pros: { type: 'array', items: { type: 'string' } },

253

cons: { type: 'array', items: { type: 'string' } },

254

recommended: { type: 'boolean' }

255

}

256

},

257

prompt: 'Extract detailed product review information including pros, cons, and recommendations'

258

});

259

260

console.log(`Started extraction job: ${extractResponse.id}`);

261

262

// Monitor progress

263

let result: ExtractResponse;

264

do {

265

await new Promise(resolve => setTimeout(resolve, 5000)); // Wait 5 seconds

266

result = await app.getExtractStatus(extractResponse.id!);

267

console.log(`Extraction status: ${result.status}`);

268

} while (result.status === 'processing');

269

270

if (result.status === 'completed') {

271

console.log('Extraction completed:', result.data);

272

} else {

273

console.error('Extraction failed:', result.error);

274

}

275

```

276

277

### Web Search Enhanced Extraction

278

279

```typescript

280

const extractResult = await app.extract({

281

prompt: 'Find information about recent AI safety research developments, key researchers, and policy recommendations',

282

enableWebSearch: true,

283

schema: {

284

type: 'object',

285

properties: {

286

recentDevelopments: {

287

type: 'array',

288

items: {

289

type: 'object',

290

properties: {

291

title: { type: 'string' },

292

description: { type: 'string' },

293

researchers: { type: 'array', items: { type: 'string' } },

294

institution: { type: 'string' },

295

date: { type: 'string' },

296

significance: { type: 'string' }

297

}

298

}

299

},

300

keyResearchers: {

301

type: 'array',

302

items: {

303

type: 'object',

304

properties: {

305

name: { type: 'string' },

306

affiliation: { type: 'string' },

307

expertise: { type: 'array', items: { type: 'string' } },

308

recentWork: { type: 'string' }

309

}

310

}

311

},

312

policyRecommendations: {

313

type: 'array',

314

items: {

315

type: 'object',

316

properties: {

317

recommendation: { type: 'string' },

318

rationale: { type: 'string' },

319

source: { type: 'string' }

320

}

321

}

322

}

323

}

324

},

325

showSources: true,

325

// Waiter controls (pollInterval, timeout) go in the same argument object as the request

326

timeout: 300 // 5 minutes

327

});

329

330

console.log('AI safety research analysis:', extractResult.data);

331

```

332

333

### Financial Data Extraction

334

335

```typescript

336

const FinancialDataSchema = z.object({

337

companyName: z.string(),

338

ticker: z.string().optional(),

339

currentPrice: z.number().optional(),

340

marketCap: z.string().optional(),

341

revenue: z.string(),

342

netIncome: z.string(),

343

eps: z.number().optional(),

344

peRatio: z.number().optional(),

345

dividendYield: z.number().optional(),

346

quarterlyGrowth: z.string().optional(),

347

keyMetrics: z.record(z.string()),

348

riskFactors: z.array(z.string()),

349

businessSegments: z.array(z.object({

350

segment: z.string(),

351

revenue: z.string(),

352

percentage: z.number().optional()

353

}))

354

});

355

356

const extractResult = await app.extract({

357

urls: [

358

'https://investor.example.com/financials',

359

'https://finance.yahoo.com/quote/EXAMPLE',

360

'https://www.sec.gov/example-10k'

361

],

362

schema: FinancialDataSchema,

363

prompt: 'Extract comprehensive financial data including revenue, profitability, key metrics, and risk factors',

364

systemPrompt: 'You are a financial analyst extracting key financial metrics and business information. Focus on numerical accuracy and current data.',

365

allowExternalLinks: true,

366

scrapeOptions: {

367

formats: ['markdown'],

368

timeout: 30000

369

}

370

});

371

```

372

373

### Error Handling and Validation

374

375

```typescript

376

try {

377

const extractResult = await app.extract({

378

urls: ['https://complex-site.example.com'],

379

schema: ComplexSchema,

380

prompt: 'Extract detailed information',

381

timeout: 180 // 3 minutes

382

});

383

384

if (extractResult.success && extractResult.data) {

385

// Validate extracted data

386

if (typeof extractResult.data === 'object' && extractResult.data !== null) {

387

console.log('Extraction successful:', extractResult.data);

388

389

if (extractResult.warning) {

390

console.log('Warning:', extractResult.warning);

391

}

392

393

if (extractResult.sources) {

394

console.log('Sources used:', extractResult.sources);

395

}

396

} else {

397

console.log('Extraction returned unexpected data format');

398

}

399

} else {

400

console.error('Extraction failed:', extractResult.error);

401

}

402

403

} catch (error) {

404

console.error('Extraction error:', error);

405

406

// Fallback to simpler extraction

407

try {

408

const fallbackResult = await app.extract({

409

urls: ['https://simple-fallback.example.com'],

410

prompt: 'Extract basic information',

411

timeout: 60

412

});

413

console.log('Fallback extraction:', fallbackResult.data);

414

} catch (fallbackError) {

415

console.error('Fallback extraction also failed:', fallbackError);

416

}

417

}

418

```

419

420

### Custom Agent Configuration

421

422

```typescript

423

const extractResult = await app.extract({

424

urls: ['https://technical-docs.example.com'],

425

prompt: 'Extract API documentation including endpoints, parameters, and examples',

426

schema: {

427

type: 'object',

428

properties: {

429

endpoints: {

430

type: 'array',

431

items: {

432

type: 'object',

433

properties: {

434

method: { type: 'string' },

435

path: { type: 'string' },

436

description: { type: 'string' },

437

parameters: { type: 'array', items: { type: 'object' } },

438

responses: { type: 'object' },

439

examples: { type: 'array', items: { type: 'string' } }

440

}

441

}

442

}

443

}

444

},

445

agent: {

446

model: 'gpt-4',

447

sessionId: 'api-docs-extraction-session'

448

},

449

scrapeOptions: {

450

formats: ['markdown'],

451

onlyMainContent: true

452

}

453

});

454

```