A Node.js scraper for humans that extracts structured data from web pages using CSS selectors
npx @tessl/cli install tessl/npm-scrape-it@6.1.0

Scrape-It is a human-friendly Node.js web scraping library that extracts structured data from HTML pages using CSS selectors. Built on top of Cheerio and Cheerio-req, it provides both Promise-based and HTML-only scraping interfaces with support for complex data extraction patterns, nested lists, and automatic value conversion.
npm install scrape-it

For TypeScript:

import scrapeIt = require("scrape-it");

For CommonJS:

const scrapeIt = require("scrape-it");
// Simple data extraction
const { data, status } = await scrapeIt("https://example.com", {
title: "h1",
description: ".description",
price: {
selector: ".price",
convert: (value) => parseFloat(value.replace("$", ""))
}
});
console.log(data); // { title: "...", description: "...", price: 19.99 }

Scrape-It is built around two core components:
Fetch and scrape data directly from web URLs with automatic HTTP handling and response metadata.
/**
 * Main scraping function that fetches and parses web pages.
 *
 * Performs the HTTP request, loads the response body into Cheerio, and
 * applies the extraction rules from `opts` to produce structured data.
 *
 * @param url - The page URL as a string, or a request options object
 *              (e.g. { url, headers }) for custom HTTP behavior
 * @param opts - Scraping configuration: output field name → selector or rule
 * @returns Promise resolving to scrape results with data and response metadata
 */
function scrapeIt<T>(
url: string | object,
opts: ScrapeOptions
): Promise<ScrapeResult<T>>;
interface ScrapeResult<T> {
/** The scraped data matching the provided options structure */
data: T;
/** HTTP status code from the response */
status: number;
/** HTTP status text from the response */
statusText: string;
/** Cheerio instance for additional DOM manipulation */
$: Cheerio;
/** Raw HTML body as a string */
body: string;
}

Usage Examples:
// Basic scraping
const result = await scrapeIt("https://news.ycombinator.com", {
stories: {
listItem: ".storylink",
data: {
title: "a",
url: {
selector: "a",
attr: "href"
}
}
}
});
// With request options
const result = await scrapeIt({
url: "https://api.example.com/data",
headers: {
"User-Agent": "My Scraper 1.0"
}
}, {
items: ".item"
});

Process pre-loaded HTML content using Cheerio, perfect for local files or custom HTTP handling.
/**
* Scrapes data from provided Cheerio element or HTML string
* @param $ - Cheerio instance or HTML string to parse
* @param opts - Scraping configuration options
* @returns Scraped data object
*/
function scrapeHTML<T>(
$: Cheerio | string,
opts: ScrapeOptions
): T;

Usage Examples:
import { readFileSync } from "fs";
import * as cheerio from "cheerio";
// From file
const html = readFileSync("page.html", "utf8");
const data = scrapeIt.scrapeHTML(html, {
title: "h1",
links: {
listItem: "a",
data: {
text: "",
href: { attr: "href" }
}
}
});
// From existing Cheerio instance
const $ = cheerio.load(html);
const data = scrapeIt.scrapeHTML($, {
content: ".main-content"
});
// Advanced text node selection for mixed content
const textData = scrapeIt.scrapeHTML(html, {
line0: {
selector: ".mixed-content",
texteq: 0 // First direct text node
},
line1: {
selector: ".mixed-content",
texteq: 1 // Second direct text node
}
// Note: texteq only selects direct text children, not nested text
});
// List conversion examples
const convertedData = scrapeIt.scrapeHTML(html, {
featureIds: {
listItem: ".features > li",
convert: (value) => parseInt(value, 10) // Convert strings to numbers
}
});

Flexible options system supporting simple selectors, nested objects, lists, and advanced element selection.
/**
 * Scraping configuration: maps each output field name to either a plain CSS
 * selector string, a per-element rule, or a list-extraction rule.
 */
interface ScrapeOptions {
  [key: string]: string | ScrapeOptionElement | ScrapeOptionList;
}
/**
 * Extraction rule for a single field: how to locate the element and how to
 * read a value out of it.
 */
interface ScrapeOptionElement {
  /** CSS selector for target element */
  selector?: string;
  /** Function to convert extracted value */
  convert?: (value: any) => any;
  /** Method to access element value (text, html, or custom function) */
  how?: string | ((element: Cheerio) => any);
  /** Attribute name to extract instead of text content */
  attr?: string;
  /** Whether to trim extracted values (default: true) */
  trim?: boolean;
  /** CSS selector for closest ancestor element */
  closest?: string;
  /** Select the nth element (0-indexed) */
  eq?: number;
  /** Select the nth direct text child (0-indexed) */
  texteq?: number;
}
interface ScrapeOptionList {
/** CSS selector for each list item */
listItem: string;
/** Data extraction configuration for each list item */
data?: ScrapeOptions;
/** Function to convert each list item value */
convert?: (value: any) => any;
}

Usage Examples:
// Simple field extraction
const data = await scrapeIt("https://example.com", {
title: "h1", // Simple selector
description: ".description" // Simple selector
});
// Advanced field configuration
const data = await scrapeIt("https://example.com", {
price: {
selector: ".price",
convert: (value) => parseFloat(value.replace(/[^0-9.]/g, ""))
},
image: {
selector: "img.product",
attr: "src" // Extract src attribute
},
content: {
selector: ".content",
how: "html" // Get HTML instead of text
}
});
// List scraping with nested data
const data = await scrapeIt("https://example.com", {
articles: {
listItem: ".article",
data: {
title: "h2",
date: {
selector: ".date",
convert: (value) => new Date(value)
},
tags: {
listItem: ".tag" // Nested list
},
// Complex nested object structures
metadata: {
selector: ".meta",
data: {
author: {
data: {
name: ".author-name",
bio: {
selector: ".author-bio span",
eq: 1 // Select 2nd span element
}
}
},
category: ".category",
readTime: ".read-time"
}
}
}
}
});
// Advanced element selection
const data = await scrapeIt("https://example.com", {
secondParagraph: {
selector: "p",
eq: 1 // Select 2nd paragraph
},
firstTextNode: {
selector: ".content",
texteq: 0 // Select 1st direct text child
},
secondTextLine: {
selector: ".multi-line",
texteq: 1 // Select 2nd direct text child
},
nearestTable: {
selector: ".data-cell",
closest: "table" // Find closest table ancestor
},
// Advanced closest + convert pattern for context-aware extraction
addresses: {
listItem: "table tbody tr",
data: {
address: ".address",
city: {
closest: "table", // Navigate to parent table
convert: (html, $node) => {
return $node.find("thead .city").text();
}
}
}
}
});

Common error scenarios and handling patterns:
try {
const result = await scrapeIt("https://example.com", options);
console.log(result.data);
} catch (error) {
// Network errors, invalid URLs, or HTML parsing failures
console.error("Scraping failed:", error.message);
}
// Check HTTP status
const result = await scrapeIt("https://example.com", options);
if (result.status !== 200) {
console.warn(`Non-200 status: ${result.status} ${result.statusText}`);
}

TypeScript type declarations:

declare namespace scrapeIt {
/**
 * Scraping configuration: maps each output field name to either a plain CSS
 * selector string, a list-extraction rule, or a per-element rule.
 */
interface ScrapeOptions {
  [key: string]: string | ScrapeOptionList | ScrapeOptionElement;
}
/**
 * Extraction rule for a single field: how to locate the element and how to
 * read a value out of it.
 */
// NOTE(review): 'Cheerio' is referenced below but neither imported nor
// declared in this file — presumably the type from the 'cheerio' package;
// confirm an import or ambient declaration exists elsewhere.
interface ScrapeOptionElement {
  /** CSS selector for target element */
  selector?: string;
  /** Function to convert extracted value */
  convert?: (value: any) => any;
  /** Method to access element value (text, html, or custom function) */
  how?: string | ((element: Cheerio) => any);
  /** Attribute name to extract instead of text content */
  attr?: string;
  /** Whether to trim extracted values (default: true) */
  trim?: boolean;
  /** CSS selector for closest ancestor element */
  closest?: string;
  /** Select the nth element (0-indexed) */
  eq?: number;
  /** Select the nth direct text child (0-indexed) */
  texteq?: number;
}
/**
 * Extraction rule that produces an array: one entry per element matched by
 * `listItem`, each shaped by the optional `data` sub-configuration.
 */
interface ScrapeOptionList {
  /** CSS selector for each list item */
  listItem: string;
  /** Data extraction configuration for each list item */
  data?: ScrapeOptions;
  /** Function to convert each list item value */
  convert?: (value: any) => any;
}
/**
 * Result of a URL-based scrape: the extracted data plus HTTP response
 * metadata and the parsed document for further manipulation.
 */
interface ScrapeResult<T> {
  /** The scraped data matching the provided options structure */
  data: T;
  /** HTTP status code from the response */
  status: number;
  /** HTTP status text from the response */
  statusText: string;
  /** Cheerio instance for additional DOM manipulation */
  $: Cheerio;
  /** Raw HTML body as a string */
  body: string;
}
/**
 * Scrapes data from an already-loaded Cheerio instance or a raw HTML
 * string; no HTTP request is made.
 * @param body - Cheerio instance or HTML string to parse
 * @param options - Scraping configuration options
 * @returns The scraped data object
 */
function scrapeHTML<T>(body: Cheerio | string, options: ScrapeOptions): T;
}
/**
 * Fetches the page at `url` and extracts data according to `opts`.
 * @param url - Page URL string, or a request options object
 * @param opts - Scraping configuration: field name → selector or rule
 * @returns Promise resolving to the scraped data plus response metadata
 */
declare function scrapeIt<T>(
  url: string | object,
  opts: scrapeIt.ScrapeOptions
): Promise<scrapeIt.ScrapeResult<T>>;
// CommonJS-style export (callable function with attached namespace members).
export = scrapeIt;