# Scrape-It

Scrape-It is a human-friendly Node.js web scraping library that extracts structured data from HTML pages using CSS selectors. Built on top of Cheerio and cheerio-req, it provides both Promise-based and HTML-only scraping interfaces with support for complex data extraction patterns, nested lists, and automatic value conversion.

## Package Information

- **Package Name**: scrape-it
- **Package Type**: npm
- **Language**: JavaScript with TypeScript definitions
- **Installation**: `npm install scrape-it`

## Core Imports

```typescript
import scrapeIt = require("scrape-it");
```

For CommonJS:

```javascript
const scrapeIt = require("scrape-it");
```

## Basic Usage

```typescript
import scrapeIt = require("scrape-it");

// Simple data extraction
const { data, status } = await scrapeIt("https://example.com", {
  title: "h1",
  description: ".description",
  price: {
    selector: ".price",
    convert: (value) => parseFloat(value.replace("$", ""))
  }
});

console.log(data); // { title: "...", description: "...", price: 19.99 }
```
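
The `convert` callback in the snippet above is plain JavaScript, so it can be developed and tested independently of any scraping. A minimal sketch (the function name `toPrice` is ours, not part of the API):

```javascript
// The price converter from the example above, extracted as a plain function.
// scrape-it invokes convert with the (trimmed) text of the matched element.
const toPrice = (value) => parseFloat(value.replace("$", ""));

console.log(toPrice("$19.99")); // 19.99
```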

## Architecture

Scrape-It is built around a few core components:

- **Main scrapeIt Function**: Handles HTTP requests and HTML parsing, returning results with the scraped data, a Cheerio instance, and response metadata
- **scrapeHTML Method**: Processes pre-loaded HTML content using Cheerio, ideal for local files or custom HTTP handling
- **Options System**: Flexible configuration supporting simple selectors, complex nested data structures, attribute extraction, and value conversion
- **Type Safety**: Full TypeScript support with generic types preserving the scraped data structure

## Capabilities

### Web Page Scraping

Fetch and scrape data directly from web URLs with automatic HTTP handling and response metadata.

```typescript { .api }
/**
 * Main scraping function that fetches and parses web pages
 * @param url - The page URL or a request options object
 * @param opts - Scraping configuration options
 * @returns Promise resolving to scrape results with data and metadata
 */
function scrapeIt<T>(
  url: string | object,
  opts: ScrapeOptions
): Promise<ScrapeResult<T>>;

interface ScrapeResult<T> {
  /** The scraped data matching the provided options structure */
  data: T;
  /** HTTP status code from the response */
  status: number;
  /** HTTP status text from the response */
  statusText: string;
  /** Cheerio instance for additional DOM manipulation */
  $: Cheerio;
  /** Raw HTML body as a string */
  body: string;
}
```

**Usage Examples:**

```typescript
// Basic scraping
const result = await scrapeIt("https://news.ycombinator.com", {
  stories: {
    listItem: ".storylink",
    data: {
      title: "a",
      url: {
        selector: "a",
        attr: "href"
      }
    }
  }
});

// With request options
const apiResult = await scrapeIt({
  url: "https://api.example.com/data",
  headers: {
    "User-Agent": "My Scraper 1.0"
  }
}, {
  items: ".item"
});
```
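
Because the first argument may be a plain request-options object rather than a URL string, shared defaults can be factored out. A small sketch (`makeRequest` and `DEFAULT_HEADERS` are our own names, not part of scrape-it):

```javascript
// Hypothetical helper for assembling scrapeIt's first argument when request
// options are needed: merges per-call headers over a set of defaults.
const DEFAULT_HEADERS = { "User-Agent": "My Scraper 1.0" };

function makeRequest(url, headers = {}) {
  return { url, headers: { ...DEFAULT_HEADERS, ...headers } };
}

const req = makeRequest("https://api.example.com/data", { Accept: "text/html" });
// req.headers now carries both the default User-Agent and the per-call Accept
```

The resulting object would then be passed as the first argument, e.g. `scrapeIt(req, { items: ".item" })`.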

### HTML Content Scraping

Process pre-loaded HTML content using Cheerio, perfect for local files or custom HTTP handling.

```typescript { .api }
/**
 * Scrapes data from a provided Cheerio instance or HTML string
 * @param $ - Cheerio instance or HTML string to parse
 * @param opts - Scraping configuration options
 * @returns Scraped data object
 */
function scrapeHTML<T>(
  $: Cheerio | string,
  opts: ScrapeOptions
): T;
```

**Usage Examples:**

```typescript
import { readFileSync } from "fs";
import * as cheerio from "cheerio";

// From a file
const html = readFileSync("page.html", "utf8");
const data = scrapeIt.scrapeHTML(html, {
  title: "h1",
  links: {
    listItem: "a",
    data: {
      text: "",
      href: { attr: "href" }
    }
  }
});

// From an existing Cheerio instance
const $ = cheerio.load(html);
const mainContent = scrapeIt.scrapeHTML($, {
  content: ".main-content"
});

// Advanced text node selection for mixed content
const textData = scrapeIt.scrapeHTML(html, {
  line0: {
    selector: ".mixed-content",
    texteq: 0 // First direct text node
  },
  line1: {
    selector: ".mixed-content",
    texteq: 1 // Second direct text node
  }
  // Note: texteq only selects direct text children, not nested text
});

// List conversion example
const convertedData = scrapeIt.scrapeHTML(html, {
  featureIds: {
    listItem: ".features > li",
    convert: (value) => parseInt(value, 10) // Convert strings to numbers
  }
});
```
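
The `texteq` behaviour noted above (direct text children only) can be illustrated with a toy model of an element's child-node list. This is an illustrative sketch of ours, not scrape-it's implementation:

```javascript
// Toy model: texteq: n selects the nth *direct* text child of the matched
// element; element children (and any text nested inside them) are skipped.
function texteq(childNodes, n) {
  const texts = childNodes.filter((node) => node.type === "text");
  return texts[n] ? texts[n].data.trim() : "";
}

// <div>line one<br>line two</div> modelled as a flat child-node list
const children = [
  { type: "text", data: "line one" },
  { type: "tag", name: "br" },
  { type: "text", data: "line two" },
];

console.log(texteq(children, 0)); // "line one"
console.log(texteq(children, 1)); // "line two"
```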

### Data Structure Configuration

Flexible options system supporting simple selectors, nested objects, lists, and advanced element selection.

```typescript { .api }
interface ScrapeOptions {
  [key: string]: string | ScrapeOptionElement | ScrapeOptionList;
}

interface ScrapeOptionElement {
  /** CSS selector for the target element */
  selector?: string;
  /** Function to convert the extracted value */
  convert?: (value: any) => any;
  /** How to access the element value ("text", "html", or a custom function) */
  how?: string | ((element: Cheerio) => any);
  /** Attribute name to extract instead of text content */
  attr?: string;
  /** Whether to trim extracted values (default: true) */
  trim?: boolean;
  /** CSS selector for the closest ancestor element */
  closest?: string;
  /** Select the nth matching element (0-indexed) */
  eq?: number;
  /** Select the nth direct text child (0-indexed) */
  texteq?: number;
}

interface ScrapeOptionList {
  /** CSS selector for each list item */
  listItem: string;
  /** Data extraction configuration for each list item */
  data?: ScrapeOptions;
  /** Function to convert each list item value */
  convert?: (value: any) => any;
}
```

**Usage Examples:**

```typescript
// Simple field extraction
const simple = await scrapeIt("https://example.com", {
  title: "h1", // Simple selector
  description: ".description" // Simple selector
});

// Advanced field configuration
const advanced = await scrapeIt("https://example.com", {
  price: {
    selector: ".price",
    convert: (value) => parseFloat(value.replace(/[^0-9.]/g, ""))
  },
  image: {
    selector: "img.product",
    attr: "src" // Extract the src attribute
  },
  content: {
    selector: ".content",
    how: "html" // Get HTML instead of text
  }
});

// List scraping with nested data
const lists = await scrapeIt("https://example.com", {
  articles: {
    listItem: ".article",
    data: {
      title: "h2",
      date: {
        selector: ".date",
        convert: (value) => new Date(value)
      },
      tags: {
        listItem: ".tag" // Nested list
      },
      // Complex nested object structures
      metadata: {
        selector: ".meta",
        data: {
          author: {
            data: {
              name: ".author-name",
              bio: {
                selector: ".author-bio span",
                eq: 1 // Select the 2nd span element
              }
            }
          },
          category: ".category",
          readTime: ".read-time"
        }
      }
    }
  }
});

// Advanced element selection
const selection = await scrapeIt("https://example.com", {
  secondParagraph: {
    selector: "p",
    eq: 1 // Select the 2nd paragraph
  },
  firstTextNode: {
    selector: ".content",
    texteq: 0 // Select the 1st direct text child
  },
  secondTextLine: {
    selector: ".multi-line",
    texteq: 1 // Select the 2nd direct text child
  },
  nearestTable: {
    selector: ".data-cell",
    closest: "table" // Find the closest table ancestor
  },
  // closest + convert pattern for context-aware extraction
  addresses: {
    listItem: "table tbody tr",
    data: {
      address: ".address",
      city: {
        closest: "table", // Navigate to the parent table
        convert: (html, $node) => $node.find("thead .city").text()
      }
    }
  }
});
```

### Error Handling

Common error scenarios and handling patterns:

```typescript
try {
  const result = await scrapeIt("https://example.com", options);
  console.log(result.data);
} catch (error) {
  // Network errors, invalid URLs, or HTML parsing failures
  console.error("Scraping failed:", error.message);
}

// Check the HTTP status
const result = await scrapeIt("https://example.com", options);
if (result.status !== 200) {
  console.warn(`Non-200 status: ${result.status} ${result.statusText}`);
}
```
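
For transient network failures it can also help to wrap calls in a small retry helper. This is a generic sketch of ours (`withRetry` is not part of scrape-it):

```javascript
// Generic retry helper for any async function, e.g. a scrapeIt call.
// Re-invokes fn up to `attempts` times, rethrowing the last error on failure.
async function withRetry(fn, attempts = 3, delayMs = 0) {
  let lastError;
  for (let i = 0; i < attempts; i++) {
    try {
      return await fn();
    } catch (error) {
      lastError = error;
      if (delayMs > 0) {
        await new Promise((resolve) => setTimeout(resolve, delayMs));
      }
    }
  }
  throw lastError;
}
```

Usage: `const result = await withRetry(() => scrapeIt("https://example.com", options));`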

## Types

```typescript { .api }
declare namespace scrapeIt {
  interface ScrapeOptions {
    [key: string]: string | ScrapeOptionList | ScrapeOptionElement;
  }

  interface ScrapeOptionElement {
    /** CSS selector for the target element */
    selector?: string;
    /** Function to convert the extracted value */
    convert?: (value: any) => any;
    /** How to access the element value ("text", "html", or a custom function) */
    how?: string | ((element: Cheerio) => any);
    /** Attribute name to extract instead of text content */
    attr?: string;
    /** Whether to trim extracted values (default: true) */
    trim?: boolean;
    /** CSS selector for the closest ancestor element */
    closest?: string;
    /** Select the nth matching element (0-indexed) */
    eq?: number;
    /** Select the nth direct text child (0-indexed) */
    texteq?: number;
  }

  interface ScrapeOptionList {
    /** CSS selector for each list item */
    listItem: string;
    /** Data extraction configuration for each list item */
    data?: ScrapeOptions;
    /** Function to convert each list item value */
    convert?: (value: any) => any;
  }

  interface ScrapeResult<T> {
    /** The scraped data matching the provided options structure */
    data: T;
    /** HTTP status code from the response */
    status: number;
    /** HTTP status text from the response */
    statusText: string;
    /** Cheerio instance for additional DOM manipulation */
    $: Cheerio;
    /** Raw HTML body as a string */
    body: string;
  }

  function scrapeHTML<T>(body: Cheerio | string, options: ScrapeOptions): T;
}

declare function scrapeIt<T>(
  url: string | object,
  opts: scrapeIt.ScrapeOptions
): Promise<scrapeIt.ScrapeResult<T>>;

export = scrapeIt;
```