Specialized functionality for parsing RSS, RDF, and Atom feeds, with automatic feed-type detection and structured data extraction. Built on top of htmlparser2's integration with domutils.
High-level function for parsing RSS, Atom, and RDF feeds into structured data objects.
/**
* Parse a feed (RSS, Atom, or RDF) into a structured Feed object
* @param feed - The feed XML string to parse
* @param options - Parser options; when omitted, `{ xmlMode: true }` is used. If you pass your own options, set `xmlMode: true` explicitly.
* @returns Parsed Feed object or null if not a valid feed
*/
function parseFeed(feed: string, options?: Options): Feed | null;
Usage Examples:
import { parseFeed } from "htmlparser2";
// Parse RSS feed
const rssXml = `
<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0">
<channel>
<title>Example Blog</title>
<link>https://example.com</link>
<description>A blog about web development</description>
<item>
<title>First Post</title>
<link>https://example.com/first-post</link>
<description>Content of the first post</description>
<pubDate>Wed, 01 Jan 2025 10:00:00 GMT</pubDate>
</item>
</channel>
</rss>`;
const feed = parseFeed(rssXml);
if (feed) {
  // Feed-level metadata
  console.log("Feed title:", feed.title);
  console.log("Feed type:", feed.type); // "rss"
  console.log("Items:", feed.items.length);
  // One line per item: title -> link
  for (const { title, link } of feed.items) {
    console.log("Item:", title, "->", link);
  }
}
// Parse Atom feed
const atomXml = `
<?xml version="1.0" encoding="utf-8"?>
<feed xmlns="http://www.w3.org/2005/Atom">
<title>Example Atom Feed</title>
<link href="https://example.com/"/>
<id>https://example.com/</id>
<entry>
<title>Sample Entry</title>
<link href="https://example.com/entry/1"/>
<id>https://example.com/entry/1</id>
<summary>This is a sample entry</summary>
</entry>
</feed>`;
const atomFeed = parseFeed(atomXml);
console.log("Atom feed type:", atomFeed?.type); // "atom"
Lower-level function that extracts feed data from an already-parsed DOM tree.
/**
* Extract feed information from a parsed DOM tree
* @param dom - Array of DOM nodes (typically from parseDOM)
* @returns Feed object or null if DOM doesn't contain feed data
*/
function getFeed(dom: ChildNode[]): Feed | null;
Usage Examples:
import { parseDOM, getFeed } from "htmlparser2";
// Parse XML then extract feed data
const dom = parseDOM(feedXmlString, { xmlMode: true });
const feedData = getFeed(dom);
if (feedData) {
console.log("Feed extracted from DOM:", feedData.title);
}
The Feed object structure supports RSS, Atom, and RDF feeds:
/**
 * Structured result produced by `parseFeed` and `getFeed`.
 *
 * Only `type` and `items` are required; every other field is optional and
 * may be absent from the parsed result.
 *
 * NOTE(review): confirm this field list against the `Feed` type shipped with
 * the htmlparser2/domutils version you target — TODO verify.
 */
interface Feed {
/** Detected feed format: "rss", "atom", or "rdf" */
type: string;
/** Title of the feed */
title?: string;
/** Link to the feed or to the website it belongs to */
link?: string;
/** Description of the feed */
description?: string;
/** Language of the feed content */
language?: string;
/** When the feed was last updated, as a `Date` */
updated?: Date;
/** Author of the feed */
author?: string;
/** Image/logo associated with the feed */
image?: FeedImage;
/** Items/entries of the feed; always present */
items: FeedItem[];
}
/**
 * A single item/entry listed in `Feed.items`.
 *
 * Every field is optional, so check for presence before use.
 */
interface FeedItem {
/** Title of the item */
title?: string;
/** URL the item links to */
link?: string;
/** Description or content of the item */
description?: string;
/** Publication date of the item, as a `Date` */
pubDate?: Date;
/** Author of the item */
author?: string;
/** Unique identifier of the item */
id?: string;
/** Categories/tags assigned to the item */
categories?: string[];
/** Media attachments of the item */
media?: FeedMedia[];
}
/**
 * Image/logo information referenced by `Feed.image`.
 *
 * Every field is optional.
 */
interface FeedImage {
/** URL of the image */
url?: string;
/** Title of the image */
title?: string;
/** Link associated with the image */
link?: string;
}
/**
 * A media attachment listed in `FeedItem.media`.
 *
 * Every field is optional.
 */
interface FeedMedia {
/** URL of the media resource */
url?: string;
/** MIME type of the media resource */
type?: string;
/** Size of the media resource in bytes */
length?: number;
}
<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0">
<channel>
<title>RSS Feed Title</title>
<link>https://example.com</link>
<description>Feed description</description>
<language>en-us</language>
<pubDate>Wed, 01 Jan 2025 10:00:00 GMT</pubDate>
<item>
<title>Item Title</title>
<link>https://example.com/item1</link>
<description>Item description</description>
<pubDate>Wed, 01 Jan 2025 09:00:00 GMT</pubDate>
<category>Technology</category>
</item>
</channel>
</rss>
<?xml version="1.0" encoding="utf-8"?>
<feed xmlns="http://www.w3.org/2005/Atom">
<title>Atom Feed Title</title>
<link href="https://example.com/"/>
<updated>2025-01-01T10:00:00Z</updated>
<id>https://example.com/</id>
<entry>
<title>Entry Title</title>
<link href="https://example.com/entry1"/>
<id>https://example.com/entry1</id>
<updated>2025-01-01T09:00:00Z</updated>
<summary>Entry summary</summary>
<category term="Technology"/>
</entry>
</feed>
<?xml version="1.0" encoding="UTF-8"?>
<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
xmlns="http://purl.org/rss/1.0/">
<channel rdf:about="https://example.com">
<title>RDF Feed Title</title>
<link>https://example.com</link>
<description>Feed description</description>
</channel>
<item rdf:about="https://example.com/item1">
<title>Item Title</title>
<link>https://example.com/item1</link>
<description>Item description</description>
</item>
</rdf:RDF>
import { parseFeed } from "htmlparser2";
/**
 * Parse feed XML defensively and report obvious problems.
 *
 * Logs when the input is not a recognizable feed, and warns when the parsed
 * feed has no title or no items. Any exception raised while parsing is logged
 * and converted into a `null` result.
 *
 * @param feedXml - The feed XML string to parse
 * @returns The parsed feed, or `null` when the input is not a feed or parsing threw
 */
function processFeed(feedXml: string) {
  try {
    const parsed = parseFeed(feedXml);
    if (parsed) {
      // Soft validation: warn, but still hand the feed back to the caller
      if (!parsed.title) {
        console.warn("Feed missing title");
      }
      const itemCount = parsed.items ? parsed.items.length : 0;
      if (itemCount === 0) {
        console.warn("Feed has no items");
      }
      return parsed;
    }
    console.log("Not a valid feed format");
    return null;
  } catch (failure) {
    console.error("Feed parsing failed:", failure);
    return null;
  }
}
import { parseFeed } from "htmlparser2";
const feed = parseFeed(feedXml);
if (feed) {
  // Process items by date, newest first.
  // `filter` returns a fresh array, so sorting it leaves `feed.items` untouched.
  // `|| 0` (rather than `??`) also maps the NaN of an invalid Date to 0, which
  // keeps the comparator well-defined.
  const sortedItems = feed.items
    .filter(item => item.pubDate)
    .sort((a, b) => (b.pubDate?.getTime() || 0) - (a.pubDate?.getTime() || 0));
  console.log("Latest items:");
  for (const item of sortedItems.slice(0, 5)) {
    console.log(`${item.pubDate?.toDateString()}: ${item.title}`);
  }
  // Extract the distinct categories. The explicit type argument keeps the
  // result a string[]; a bare `new Set()` would be inferred as Set<unknown>.
  const allCategories = new Set<string>(
    feed.items.flatMap(item => item.categories ?? [])
  );
  console.log("Feed categories:", Array.from(allCategories));
}
import { parseFeed } from "htmlparser2";
// Parse feed with custom options
const feed = parseFeed(feedXml, {
  xmlMode: true, // Always true for feeds
  decodeEntities: true // Decode HTML entities in content
  // `normalizeWhitespace` is deliberately not passed: the option was deprecated
  // and is no longer part of the parser options in recent releases, where an
  // unknown key in this object literal is a type error.
  // NOTE(review): confirm against the htmlparser2 version you target.
});
if (feed) {
  // Process media elements (podcasts, etc.)
  for (const item of feed.items) {
    // Skip items without any media attachment
    if (!item.media || item.media.length === 0) continue;
    console.log(`Media item: ${item.title}`);
    for (const media of item.media) {
      console.log(` - ${media.type}: ${media.url} (${media.length} bytes)`);
    }
  }
}
import { parseFeed } from "htmlparser2";
import https from "https";
/**
 * Minimal feed reader: downloads a feed over HTTPS and parses it with `parseFeed`.
 */
class SimpleFeedReader {
  /**
   * Download `feedUrl` and parse the response body as a feed.
   *
   * The body is decoded as UTF-8. The HTTP status code is not inspected, so an
   * error page is simply parsed like any other body (and yields `null`).
   *
   * @param feedUrl - HTTPS URL of the feed document
   * @returns Promise resolving to the parsed feed, or `null` when the body is not a recognizable feed
   * @throws Rejects when the request or the response stream fails, or when parsing throws
   */
  fetchAndParse(feedUrl: string): Promise<Feed | null> {
    return new Promise((resolve, reject) => {
      https
        .get(feedUrl, (response) => {
          // Decode at the stream level: appending raw Buffer chunks to a string
          // decodes each chunk on its own and corrupts multi-byte characters
          // that straddle a chunk boundary.
          response.setEncoding("utf8");

          let body = "";
          response.on("data", (chunk: string) => {
            body += chunk;
          });
          // A failure while the body is streaming (e.g. a dropped connection)
          // must reject the promise instead of surfacing as an unhandled
          // 'error' event.
          response.on("error", reject);
          response.on("end", () => {
            try {
              resolve(parseFeed(body));
            } catch (error) {
              reject(error);
            }
          });
        })
        .on("error", reject);
    });
  }

  /**
   * Fetch a feed and return its most recent items, newest first.
   *
   * Items without a `pubDate` are treated as timestamp 0 for sorting.
   *
   * @param feedUrl - HTTPS URL of the feed document
   * @param count - Maximum number of items to return (default 5)
   * @returns Up to `count` items, or an empty array when the URL does not yield a feed
   */
  async getLatestItems(feedUrl: string, count = 5) {
    const feed = await this.fetchAndParse(feedUrl);
    if (!feed) return [];
    // Sort a copy: Array.prototype.sort reorders in place and would otherwise
    // mutate the parsed feed's own `items` array.
    return [...feed.items]
      .sort((a, b) => (b.pubDate?.getTime() || 0) - (a.pubDate?.getTime() || 0))
      .slice(0, count);
  }
}
// Usage
const reader = new SimpleFeedReader();
const latestItems = await reader.getLatestItems('https://example.com/feed.xml');