tessl/npm-htmlparser2

Fast & forgiving HTML/XML parser with callback-based interface and DOM generation capabilities

—

Pending

Quality

Pending

Does it follow best practices?

Impact

Pending

No eval scenarios have been run

Security

Pending

The risk profile of this skill

Overview

Eval results

Files

Feed Parsing

Name: tessl/npm-htmlparser2
Author: tessl

Specialized functionality for parsing RSS, RDF, and Atom feeds with automatic feed detection and structured data extraction. Built on top of the feed helpers provided by domutils.

Capabilities

parseFeed Function

High-level function for parsing RSS, Atom, and RDF feeds into structured data objects.

/**
 * Parse a feed (RSS, Atom, or RDF) into a structured Feed object.
 *
 * @param feed - The feed XML string to parse
 * @param options - Parser options; when omitted, the feed is parsed in XML mode.
 *   NOTE(review): when passing custom options, set `xmlMode: true` explicitly —
 *   upstream appears to apply XML mode only as the default value of this
 *   parameter rather than forcing it; confirm against the installed version.
 * @returns Parsed Feed object, or null if the input is not a valid feed
 */
function parseFeed(feed: string, options?: Options): Feed | null;

Usage Examples:

import { parseFeed } from "htmlparser2";

// --- RSS ---------------------------------------------------------------
// A minimal RSS 2.0 document: one channel containing a single item.
const rssXml = `
<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0">
  <channel>
    <title>Example Blog</title>
    <link>https://example.com</link>
    <description>A blog about web development</description>
    <item>
      <title>First Post</title>
      <link>https://example.com/first-post</link>
      <description>Content of the first post</description>
      <pubDate>Wed, 01 Jan 2025 10:00:00 GMT</pubDate>
    </item>
  </channel>
</rss>`;

// parseFeed returns null when the input is not recognised as a feed,
// so the result is checked before any field is read.
const feed = parseFeed(rssXml);
if (feed) {
  console.log("Feed title:", feed.title);
  console.log("Feed type:", feed.type); // "rss"
  console.log("Items:", feed.items.length);

  for (const item of feed.items) {
    console.log("Item:", item.title, "->", item.link);
  }
}

// --- Atom --------------------------------------------------------------
// The same call handles Atom; the detected format is reported in `type`.
const atomXml = `
<?xml version="1.0" encoding="utf-8"?>
<feed xmlns="http://www.w3.org/2005/Atom">
  <title>Example Atom Feed</title>
  <link href="https://example.com/"/>
  <id>https://example.com/</id>
  <entry>
    <title>Sample Entry</title>
    <link href="https://example.com/entry/1"/>
    <id>https://example.com/entry/1</id>
    <summary>This is a sample entry</summary>
  </entry>
</feed>`;

const atomFeed = parseFeed(atomXml);
const atomType = atomFeed?.type; // "atom"
console.log("Atom feed type:", atomType);

getFeed Function

Lower-level function that extracts feed data from an already-parsed DOM tree.

/**
 * Extract feed information from an already-parsed DOM tree.
 *
 * Use this instead of `parseFeed` when the document has already been parsed
 * and the DOM nodes are at hand.
 *
 * @param dom - Array of DOM nodes, typically the result of `parseDOM` called with `xmlMode: true`
 * @returns Feed object, or null if the DOM doesn't contain feed data
 */
function getFeed(dom: ChildNode[]): Feed | null;

Usage Examples:

import { parseDOM, getFeed } from "htmlparser2";

// Two steps: build the DOM in XML mode, then hand it to getFeed.
// `feedXmlString` stands for the raw feed document supplied by the caller.
const xmlParseOptions = { xmlMode: true };
const feedData = getFeed(parseDOM(feedXmlString, xmlParseOptions));

// getFeed yields null when the DOM holds no feed, so guard before reading.
if (feedData) console.log("Feed extracted from DOM:", feedData.title);

Feed Types

The Feed object structure supports RSS, Atom, and RDF feeds:

/**
 * Result of parsing a feed document.
 *
 * Only `type` and `items` are always present; every other field is optional
 * and should be checked before use.
 */
interface Feed {
  /** Feed format type: "rss", "atom", or "rdf" */
  type: string;
  
  /** Feed title */
  title?: string;
  
  /** Feed URL or website link */
  link?: string;
  
  /** Feed description */
  description?: string;
  
  /**
   * Language of the feed content.
   * NOTE(review): verify against the `Feed` type exported by domutils — this
   * field may not be populated by the parser.
   */
  language?: string;
  
  /** Feed update timestamp */
  updated?: Date;
  
  /** Feed author information */
  author?: string;
  
  /**
   * Feed image/logo information.
   * NOTE(review): verify against the `Feed` type exported by domutils — this
   * field may not be populated by the parser.
   */
  image?: FeedImage;
  
  /** Array of feed items/entries (RSS/RDF items or Atom entries) */
  items: FeedItem[];
}

/**
 * A single item/entry of a parsed feed. All fields are optional.
 */
interface FeedItem {
  /** Item title */
  title?: string;
  
  /** Item URL link */
  link?: string;
  
  /** Item description or content */
  description?: string;
  
  /** Item publication date */
  pubDate?: Date;
  
  /**
   * Item author.
   * NOTE(review): verify against the `FeedItem` type exported by domutils —
   * this field may not be populated by the parser.
   */
  author?: string;
  
  /** Item unique identifier */
  id?: string;
  
  /**
   * Item categories/tags.
   * NOTE(review): verify against the `FeedItem` type exported by domutils —
   * this field may not be populated by the parser.
   */
  categories?: string[];
  
  /** Item media attachments */
  media?: FeedMedia[];
}

/**
 * Image/logo attached to a feed (see `Feed.image`).
 * NOTE(review): confirm that the parser actually produces this structure;
 * it could not be matched to a type exported by domutils.
 */
interface FeedImage {
  /** Image URL */
  url?: string;
  
  /** Image title */
  title?: string;
  
  /** Image link */
  link?: string;
}

/**
 * Media attachment of a feed item (see `FeedItem.media`).
 */
interface FeedMedia {
  /** Media URL */
  url?: string;
  
  /** Media type (MIME type) */
  type?: string;
  
  /**
   * Media length in bytes.
   * NOTE(review): domutils appears to expose the size as `fileSize`; confirm
   * the property name against the installed version.
   */
  length?: number;
}

Supported Feed Formats

RSS 2.0

<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0">
  <channel>
    <title>RSS Feed Title</title>
    <link>https://example.com</link>
    <description>Feed description</description>
    <language>en-us</language>
    <pubDate>Wed, 01 Jan 2025 10:00:00 GMT</pubDate>
    <item>
      <title>Item Title</title>
      <link>https://example.com/item1</link>
      <description>Item description</description>
      <pubDate>Wed, 01 Jan 2025 09:00:00 GMT</pubDate>
      <category>Technology</category>
    </item>
  </channel>
</rss>

Atom 1.0

<?xml version="1.0" encoding="utf-8"?>
<feed xmlns="http://www.w3.org/2005/Atom">
  <title>Atom Feed Title</title>
  <link href="https://example.com/"/>
  <updated>2025-01-01T10:00:00Z</updated>
  <id>https://example.com/</id>
  <entry>
    <title>Entry Title</title>
    <link href="https://example.com/entry1"/>
    <id>https://example.com/entry1</id>
    <updated>2025-01-01T09:00:00Z</updated>
    <summary>Entry summary</summary>
    <category term="Technology"/>
  </entry>
</feed>

RDF (RSS 1.0)

<?xml version="1.0" encoding="UTF-8"?>
<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
         xmlns="http://purl.org/rss/1.0/">
  <channel rdf:about="https://example.com">
    <title>RDF Feed Title</title>
    <link>https://example.com</link>
    <description>Feed description</description>
  </channel>
  <item rdf:about="https://example.com/item1">
    <title>Item Title</title>
    <link>https://example.com/item1</link>
    <description>Item description</description>
  </item>
</rdf:RDF>

Advanced Feed Processing

Feed Validation and Error Handling

import { parseFeed } from "htmlparser2";

/**
 * Parse a feed string and report commonly missing data on the console.
 *
 * The checks are advisory: a feed without a title or without items is still
 * returned, only a warning is logged. Anything thrown while parsing or
 * checking is logged and converted into a `null` result.
 *
 * @param feedXml - Raw feed XML to parse
 * @returns The parsed feed, or `null` when the input is not a feed or an error was caught
 */
function processFeed(feedXml: string) {
  try {
    const parsed = parseFeed(feedXml);

    if (parsed) {
      // Soft validation: warn about gaps but keep the feed usable.
      if (!parsed.title) console.warn("Feed missing title");

      const itemCount = parsed.items ? parsed.items.length : 0;
      if (itemCount === 0) console.warn("Feed has no items");

      return parsed;
    }

    // parseFeed gave nothing back: the document was not recognised as a feed.
    console.log("Not a valid feed format");
    return null;
  } catch (error) {
    console.error("Feed parsing failed:", error);
    return null;
  }
}

Feed Item Processing

import { parseFeed } from "htmlparser2";

const feed = parseFeed(feedXml);

if (feed) {
  // Newest first. Items without a publication date are dropped up front;
  // filter() returns a fresh array, so sorting it leaves feed.items untouched.
  const datedItems = feed.items.filter(item => item.pubDate);
  datedItems.sort((a, b) => (b.pubDate?.getTime() || 0) - (a.pubDate?.getTime() || 0));

  console.log("Latest items:");
  for (const item of datedItems.slice(0, 5)) {
    console.log(`${item.pubDate?.toDateString()}: ${item.title}`);
  }

  // Collect every category once, in order of first appearance.
  const allCategories = new Set();
  for (const item of feed.items) {
    for (const cat of item.categories ?? []) {
      allCategories.add(cat);
    }
  }

  console.log("Feed categories:", [...allCategories]);
}

Custom Feed Processing with Options

import { parseFeed } from "htmlparser2";

// Parser options are spelled out here instead of relying on the defaults.
// NOTE(review): `normalizeWhitespace` looks deprecated/removed in recent
// domhandler releases — confirm the installed version still accepts it.
const feedOptions = {
  xmlMode: true,            // feeds are XML documents
  decodeEntities: true,     // decode entities in text content
  normalizeWhitespace: true // collapse whitespace
};
const feed = parseFeed(feedXml, feedOptions);

if (feed) {
  // Walk the media attachments (podcast enclosures and the like).
  for (const item of feed.items) {
    // Skip items that carry no media at all.
    if (!item.media || !(item.media.length > 0)) continue;

    console.log(`Media item: ${item.title}`);
    for (const media of item.media) {
      console.log(`  - ${media.type}: ${media.url} (${media.length} bytes)`);
    }
  }
}

Integration with Feed Readers

import { parseFeed } from "htmlparser2";
import https from "https";

/**
 * Minimal feed reader: downloads a feed over HTTPS and parses it with
 * `parseFeed`.
 */
class SimpleFeedReader {
  /**
   * Download `feedUrl` and parse the response body as a feed.
   *
   * The returned promise rejects when the request fails, when the response
   * stream errors, or when parsing throws.
   *
   * @param feedUrl - HTTPS URL of the feed document
   * @returns The parsed feed, or `null` when the body is not recognised as a feed
   */
  fetchAndParse(feedUrl: string): Promise<Feed | null> {
    // The Promise constructor is used only to adapt the callback-based
    // https.get API; a synchronous throw inside it also becomes a rejection.
    return new Promise((resolve, reject) => {
      https
        .get(feedUrl, (response) => {
          // Decode at the stream level so a multi-byte UTF-8 character that
          // is split across two chunks is not corrupted by per-chunk
          // Buffer-to-string conversion.
          response.setEncoding('utf8');

          let data = '';

          response.on('data', (chunk: string) => {
            data += chunk;
          });

          // A failure while reading the body must settle the promise too.
          response.on('error', reject);

          response.on('end', () => {
            try {
              resolve(parseFeed(data));
            } catch (error) {
              reject(error);
            }
          });
        })
        .on('error', reject);
    });
  }

  /**
   * Fetch a feed and return its most recent items, newest first.
   *
   * Items without a publication date are ordered as if dated at the epoch,
   * so they end up last. The parsed feed's own `items` array is not modified.
   *
   * @param feedUrl - HTTPS URL of the feed document
   * @param count - Maximum number of items to return (default 5)
   * @returns Up to `count` items, or an empty array when the URL did not yield a feed
   */
  async getLatestItems(feedUrl: string, count: number = 5) {
    const feed = await this.fetchAndParse(feedUrl);

    if (!feed) return [];

    // Array.prototype.sort works in place, so sort a copy to avoid
    // reordering the items of the feed object itself.
    // `||` (not `??`) is deliberate: it also maps the NaN timestamp of an
    // invalid Date to 0, which keeps the comparator consistent.
    return [...feed.items]
      .sort((a, b) => (b.pubDate?.getTime() || 0) - (a.pubDate?.getTime() || 0))
      .slice(0, count);
  }
}

// Usage
// Top-level `await` only works in an ES module; elsewhere, wrap these lines
// in an async function. The call omits `count`, so the default of 5 applies.
const reader = new SimpleFeedReader();
const latestItems = await reader.getLatestItems('https://example.com/feed.xml');