Fast & forgiving HTML/XML parser with callback-based interface and DOM generation capabilities
—
Pending
Does it follow best practices?
Impact
Pending
No eval scenarios have been run
Pending
The risk profile of this skill
Specialized functionality for parsing RSS, RDF, and Atom feeds with automatic feed detection and structured data extraction, built on top of the feed utilities in domutils.
High-level function for parsing RSS, Atom, and RDF feeds into structured data objects.
/**
* Parse a feed (RSS, Atom, or RDF) into a structured Feed object
* @param feed - The feed XML string to parse
* @param options - Parser options; defaults to `{ xmlMode: true }` when omitted (set `xmlMode: true` yourself when passing custom options)
* @returns Parsed Feed object or null if not a valid feed
*/
function parseFeed(feed: string, options?: Options): Feed | null;

Usage Examples:
import { parseFeed } from "htmlparser2";
// Parse RSS feed
const rssXml = `
<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0">
<channel>
<title>Example Blog</title>
<link>https://example.com</link>
<description>A blog about web development</description>
<item>
<title>First Post</title>
<link>https://example.com/first-post</link>
<description>Content of the first post</description>
<pubDate>Wed, 01 Jan 2025 10:00:00 GMT</pubDate>
</item>
</channel>
</rss>`;
const feed = parseFeed(rssXml);
if (feed) {
console.log("Feed title:", feed.title);
console.log("Feed type:", feed.type); // "rss"
console.log("Items:", feed.items.length);
feed.items.forEach(item => {
console.log("Item:", item.title, "->", item.link);
});
}
// Parse Atom feed
const atomXml = `
<?xml version="1.0" encoding="utf-8"?>
<feed xmlns="http://www.w3.org/2005/Atom">
<title>Example Atom Feed</title>
<link href="https://example.com/"/>
<id>https://example.com/</id>
<entry>
<title>Sample Entry</title>
<link href="https://example.com/entry/1"/>
<id>https://example.com/entry/1</id>
<summary>This is a sample entry</summary>
</entry>
</feed>`;
const atomFeed = parseFeed(atomXml);
console.log("Atom feed type:", atomFeed?.type); // "atom"

Lower-level function that extracts feed data from an already-parsed DOM tree.
/**
* Extract feed information from a parsed DOM tree
* @param dom - Array of DOM nodes (typically from parseDOM)
* @returns Feed object or null if DOM doesn't contain feed data
*/
function getFeed(dom: ChildNode[]): Feed | null;

Usage Examples:
import { parseDOM, getFeed } from "htmlparser2";
// Parse XML then extract feed data
const dom = parseDOM(feedXmlString, { xmlMode: true });
const feedData = getFeed(dom);
if (feedData) {
console.log("Feed extracted from DOM:", feedData.title);
}

The Feed object structure supports RSS, Atom, and RDF feeds:
interface Feed {
/** Feed format type: "rss", "atom", or "rdf" */
type: string;
/** Feed title */
title?: string;
/** Feed URL or website link */
link?: string;
/** Feed description */
description?: string;
/** Language of the feed content */
language?: string;
/** Feed update timestamp */
updated?: Date;
/** Feed author information */
author?: string;
/** Feed image/logo information */
image?: FeedImage;
/** Array of feed items/entries */
items: FeedItem[];
}
interface FeedItem {
/** Item title */
title?: string;
/** Item URL link */
link?: string;
/** Item description or content */
description?: string;
/** Item publication date */
pubDate?: Date;
/** Item author */
author?: string;
/** Item unique identifier */
id?: string;
/** Item categories/tags */
categories?: string[];
/** Item media attachments */
media?: FeedMedia[];
}
interface FeedImage {
/** Image URL */
url?: string;
/** Image title */
title?: string;
/** Image link */
link?: string;
}
interface FeedMedia {
/** Media URL */
url?: string;
/** Media type (MIME type) */
type?: string;
/** Media length in bytes */
length?: number;
}

<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0">
<channel>
<title>RSS Feed Title</title>
<link>https://example.com</link>
<description>Feed description</description>
<language>en-us</language>
<pubDate>Wed, 01 Jan 2025 10:00:00 GMT</pubDate>
<item>
<title>Item Title</title>
<link>https://example.com/item1</link>
<description>Item description</description>
<pubDate>Wed, 01 Jan 2025 09:00:00 GMT</pubDate>
<category>Technology</category>
</item>
</channel>
</rss>

<?xml version="1.0" encoding="utf-8"?>
<feed xmlns="http://www.w3.org/2005/Atom">
<title>Atom Feed Title</title>
<link href="https://example.com/"/>
<updated>2025-01-01T10:00:00Z</updated>
<id>https://example.com/</id>
<entry>
<title>Entry Title</title>
<link href="https://example.com/entry1"/>
<id>https://example.com/entry1</id>
<updated>2025-01-01T09:00:00Z</updated>
<summary>Entry summary</summary>
<category term="Technology"/>
</entry>
</feed>

<?xml version="1.0" encoding="UTF-8"?>
<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
xmlns="http://purl.org/rss/1.0/">
<channel rdf:about="https://example.com">
<title>RDF Feed Title</title>
<link>https://example.com</link>
<description>Feed description</description>
</channel>
<item rdf:about="https://example.com/item1">
<title>Item Title</title>
<link>https://example.com/item1</link>
<description>Item description</description>
</item>
</rdf:RDF>

import { parseFeed } from "htmlparser2";
/**
 * Parses `feedXml` with `parseFeed` and reports problems on the console.
 *
 * A feed without a title or without items is only warned about; it is
 * still returned to the caller.
 *
 * @param feedXml - Feed document text handed to `parseFeed`
 * @returns The parsed feed, or `null` when `parseFeed` yields nothing or throws
 */
function processFeed(feedXml: string) {
  try {
    const parsed = parseFeed(feedXml);

    if (parsed) {
      // Soft validation: missing data is logged, never treated as a failure.
      if (!parsed.title) console.warn("Feed missing title");

      const hasItems = Boolean(parsed.items) && parsed.items.length !== 0;
      if (!hasItems) console.warn("Feed has no items");

      return parsed;
    }

    console.log("Not a valid feed format");
    return null;
  } catch (failure) {
    // Anything thrown while parsing or validating ends up here.
    console.error("Feed parsing failed:", failure);
    return null;
  }
}

import { parseFeed } from "htmlparser2";
const feed = parseFeed(feedXml);
if (feed) {
// Process items by date
const sortedItems = feed.items
.filter(item => item.pubDate)
.sort((a, b) => (b.pubDate?.getTime() || 0) - (a.pubDate?.getTime() || 0));
console.log("Latest items:");
sortedItems.slice(0, 5).forEach(item => {
console.log(`${item.pubDate?.toDateString()}: ${item.title}`);
});
// Extract categories
const allCategories = new Set();
feed.items.forEach(item => {
item.categories?.forEach(cat => allCategories.add(cat));
});
console.log("Feed categories:", Array.from(allCategories));
}

import { parseFeed } from "htmlparser2";
// Parse feed with custom options
const feed = parseFeed(feedXml, {
xmlMode: true, // Always true for feeds
decodeEntities: true, // Decode HTML entities in content
normalizeWhitespace: true // Clean up whitespace
});
if (feed) {
// Process media elements (podcasts, etc.)
feed.items.forEach(item => {
if (item.media && item.media.length > 0) {
console.log(`Media item: ${item.title}`);
item.media.forEach(media => {
console.log(` - ${media.type}: ${media.url} (${media.length} bytes)`);
});
}
});
}

import { parseFeed } from "htmlparser2";
import https from "https";
/**
 * Minimal feed reader: downloads a feed over HTTPS and parses it with
 * `parseFeed`.
 */
class SimpleFeedReader {
  /**
   * Download `feedUrl` and parse the response body as a feed.
   *
   * @param feedUrl - URL passed to `https.get`
   * @returns Whatever `parseFeed` returns for the downloaded body
   *   (`null` when the body is not a feed). The promise rejects on request
   *   errors, response stream errors, or when `parseFeed` throws.
   */
  async fetchAndParse(feedUrl: string): Promise<Feed | null> {
    return new Promise((resolve, reject) => {
      https.get(feedUrl, (response) => {
        // Decode inside the stream so a multi-byte UTF-8 character that is
        // split across two chunks is not corrupted by per-chunk conversion.
        response.setEncoding('utf8');

        let data = '';
        response.on('data', (chunk: string) => {
          data += chunk;
        });
        // Route failures that happen after the headers arrived to the promise.
        response.on('error', reject);
        response.on('end', () => {
          try {
            resolve(parseFeed(data));
          } catch (error) {
            reject(error);
          }
        });
      }).on('error', reject);
    });
  }

  /**
   * Fetch a feed and return its most recent items, newest first.
   *
   * @param feedUrl - URL of the feed to fetch
   * @param count - Maximum number of items to return (default 5)
   * @returns Up to `count` items ordered by descending `pubDate`, or an empty
   *   array when the URL did not yield a feed. The feed's own `items` array
   *   is left in its original order.
   */
  async getLatestItems(feedUrl: string, count: number = 5) {
    const feed = await this.fetchAndParse(feedUrl);
    if (!feed) return [];

    // Array.prototype.sort works in place; sort a copy so the parsed feed
    // handed out by fetchAndParse is not reordered as a side effect.
    return [...feed.items]
      // `|| 0` (not `??`) on purpose: it also maps the NaN produced by an
      // invalid Date to 0, keeping the comparator well-defined.
      .sort((a, b) => (b.pubDate?.getTime() || 0) - (a.pubDate?.getTime() || 0))
      .slice(0, count);
  }
}
// Usage
const reader = new SimpleFeedReader();
const latestItems = await reader.getLatestItems('https://example.com/feed.xml');