or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

docs

callback-parsing.md dom-parsing.md feed-parsing.md index.md stream-processing.md tokenization.md
tile.json

docs/feed-parsing.md

Feed Parsing

Specialized functionality for parsing RSS, RDF, and Atom feeds, with automatic feed detection and structured data extraction. Built on top of the domutils integration.

Capabilities

parseFeed Function

High-level function for parsing RSS, Atom, and RDF feeds into structured data objects.

/**
 * Parse a feed (RSS, Atom, or RDF) from an XML string into a structured Feed object.
 *
 * NOTE(review): upstream htmlparser2 appears to apply `{ xmlMode: true }` only as
 * the default value of `options`. When you pass your own options object, set
 * `xmlMode: true` explicitly rather than relying on it being switched on for
 * you. Confirm against the installed htmlparser2 version.
 *
 * @param feed - The feed XML string to parse
 * @param options - Parser options; feeds should be parsed with xmlMode enabled (see note above)
 * @returns Parsed Feed object, or null if the input is not a valid feed
 */
function parseFeed(feed: string, options?: Options): Feed | null;

Usage Examples:

import { parseFeed } from "htmlparser2";

// Example: an RSS 2.0 document containing a single <item>.
const rssXml = `
<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0">
  <channel>
    <title>Example Blog</title>
    <link>https://example.com</link>
    <description>A blog about web development</description>
    <item>
      <title>First Post</title>
      <link>https://example.com/first-post</link>
      <description>Content of the first post</description>
      <pubDate>Wed, 01 Jan 2025 10:00:00 GMT</pubDate>
    </item>
  </channel>
</rss>`;

// parseFeed is declared as returning `Feed | null`, so guard before reading fields.
const feed = parseFeed(rssXml);
if (feed) {
  console.log("Feed title:", feed.title);
  console.log("Feed type:", feed.type); // "rss"
  console.log("Items:", feed.items.length);

  // One line per item: title and the URL it points at.
  for (const item of feed.items) {
    console.log("Item:", item.title, "->", item.link);
  }
}

// Example: an Atom 1.0 document containing a single <entry>.
const atomXml = `
<?xml version="1.0" encoding="utf-8"?>
<feed xmlns="http://www.w3.org/2005/Atom">
  <title>Example Atom Feed</title>
  <link href="https://example.com/"/>
  <id>https://example.com/</id>
  <entry>
    <title>Sample Entry</title>
    <link href="https://example.com/entry/1"/>
    <id>https://example.com/entry/1</id>
    <summary>This is a sample entry</summary>
  </entry>
</feed>`;

const atomFeed = parseFeed(atomXml);
// A null/undefined result logs `undefined` instead of throwing on property access.
console.log("Atom feed type:", atomFeed == null ? undefined : atomFeed.type); // "atom"

getFeed Function

Lower-level function that extracts feed data from an already-parsed DOM tree.

/**
 * Extract feed information from an already-parsed DOM tree.
 *
 * Lower-level counterpart of parseFeed: use it when the feed XML has already
 * been turned into DOM nodes.
 *
 * @param dom - Array of DOM nodes (typically the result of parseDOM called with `xmlMode: true`)
 * @returns Feed object, or null if the DOM doesn't contain feed data
 */
function getFeed(dom: ChildNode[]): Feed | null;

Usage Examples:

import { parseDOM, getFeed } from "htmlparser2";

// Two-step variant: build the DOM in XML mode, then hand the nodes to getFeed.
const dom = parseDOM(feedXmlString, { xmlMode: true });
const feedData = getFeed(dom);

// getFeed is declared as returning `Feed | null`; only log when a feed was found.
if (feedData !== null) {
  console.log("Feed extracted from DOM:", feedData.title);
}

Feed Types

The Feed object structure supports RSS, Atom, and RDF feeds:

/**
 * Normalized representation of a parsed RSS, Atom, or RDF feed.
 *
 * Only `type` and `items` are required; every other field is optional.
 *
 * NOTE(review): htmlparser2 takes its feed types from domutils. Confirm this
 * field list (in particular `language` and `image`) against the `Feed` type
 * shipped with the installed version before relying on it.
 */
interface Feed {
  /** Feed format type: "rss", "atom", or "rdf" */
  type: string;
  
  /** Feed title */
  title?: string;
  
  /** Feed URL or website link */
  link?: string;
  
  /** Feed description */
  description?: string;
  
  /** Language of the feed content */
  language?: string;
  
  /** Feed update timestamp */
  updated?: Date;
  
  /** Feed author information */
  author?: string;
  
  /** Feed image/logo information */
  image?: FeedImage;
  
  /** Array of feed items/entries */
  items: FeedItem[];
}

/**
 * A single entry of a feed (an RSS/RDF item or an Atom entry).
 *
 * Every field is optional as declared here.
 *
 * NOTE(review): confirm `author`, `categories`, and the optionality of `media`
 * against the `FeedItem` type shipped with the installed domutils version.
 */
interface FeedItem {
  /** Item title */
  title?: string;
  
  /** Item URL link */
  link?: string;
  
  /** Item description or content */
  description?: string;
  
  /** Item publication date */
  pubDate?: Date;
  
  /** Item author */
  author?: string;
  
  /** Item unique identifier */
  id?: string;
  
  /** Item categories/tags */
  categories?: string[];
  
  /** Item media attachments */
  media?: FeedMedia[];
}

/**
 * Image or logo attached to a feed (referenced by `Feed.image`).
 *
 * NOTE(review): confirm that the installed htmlparser2/domutils version
 * actually exposes feed image data in this shape.
 */
interface FeedImage {
  /** Image URL */
  url?: string;
  
  /** Image title */
  title?: string;
  
  /** Image link */
  link?: string;
}

/**
 * Media attachment of a feed item (referenced by `FeedItem.media`).
 *
 * NOTE(review): upstream domutils appears to call this type `FeedItemMedia`
 * and to name the size field `fileSize` rather than `length` — confirm
 * against the installed version.
 */
interface FeedMedia {
  /** Media URL */
  url?: string;
  
  /** Media type (MIME type) */
  type?: string;
  
  /** Media length in bytes */
  length?: number;
}

Supported Feed Formats

RSS 2.0

<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0">
  <channel>
    <title>RSS Feed Title</title>
    <link>https://example.com</link>
    <description>Feed description</description>
    <language>en-us</language>
    <pubDate>Wed, 01 Jan 2025 10:00:00 GMT</pubDate>
    <item>
      <title>Item Title</title>
      <link>https://example.com/item1</link>
      <description>Item description</description>
      <pubDate>Wed, 01 Jan 2025 09:00:00 GMT</pubDate>
      <category>Technology</category>
    </item>
  </channel>
</rss>

Atom 1.0

<?xml version="1.0" encoding="utf-8"?>
<feed xmlns="http://www.w3.org/2005/Atom">
  <title>Atom Feed Title</title>
  <link href="https://example.com/"/>
  <updated>2025-01-01T10:00:00Z</updated>
  <id>https://example.com/</id>
  <entry>
    <title>Entry Title</title>
    <link href="https://example.com/entry1"/>
    <id>https://example.com/entry1</id>
    <updated>2025-01-01T09:00:00Z</updated>
    <summary>Entry summary</summary>
    <category term="Technology"/>
  </entry>
</feed>

RDF 1.0

<?xml version="1.0" encoding="UTF-8"?>
<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
         xmlns="http://purl.org/rss/1.0/">
  <channel rdf:about="https://example.com">
    <title>RDF Feed Title</title>
    <link>https://example.com</link>
    <description>Feed description</description>
  </channel>
  <item rdf:about="https://example.com/item1">
    <title>Item Title</title>
    <link>https://example.com/item1</link>
    <description>Item description</description>
  </item>
</rdf:RDF>

Advanced Feed Processing

Feed Validation and Error Handling

import { parseFeed } from "htmlparser2";

/**
 * Parse `feedXml` and report common problems on the console.
 *
 * A missing title or an empty item list is only warned about; the feed is
 * still returned.
 *
 * @param feedXml - Raw feed XML to parse
 * @returns The parsed feed, or null when the input is not a feed or an error is thrown
 */
function processFeed(feedXml: string) {
  try {
    const parsed = parseFeed(feedXml);

    if (parsed) {
      // Missing fields are reported but do not invalidate the feed.
      if (!parsed.title) {
        console.warn("Feed missing title");
      }

      const hasItems = parsed.items && parsed.items.length !== 0;
      if (!hasItems) {
        console.warn("Feed has no items");
      }

      return parsed;
    }

    console.log("Not a valid feed format");
    return null;
  } catch (failure) {
    console.error("Feed parsing failed:", failure);
    return null;
  }
}

Feed Item Processing

import { parseFeed } from "htmlparser2";

const feed = parseFeed(feedXml);

if (feed) {
  // Keep only dated items. filter() returns a new array, so the sort below
  // does not reorder feed.items itself.
  const datedItems = feed.items.filter(item => item.pubDate);
  const sortedItems = datedItems.sort(
    (a, b) => (b.pubDate?.getTime() || 0) - (a.pubDate?.getTime() || 0)
  );

  // Print the five most recent entries.
  console.log("Latest items:");
  for (const item of sortedItems.slice(0, 5)) {
    console.log(`${item.pubDate?.toDateString()}: ${item.title}`);
  }

  // Distinct categories across all items, in first-seen order.
  const allCategories = new Set(
    feed.items.flatMap(item => item.categories ?? [])
  );

  console.log("Feed categories:", Array.from(allCategories));
}

Custom Feed Processing with Options

import { parseFeed } from "htmlparser2";

// Parse feed with custom options.
// When an options object is supplied, set `xmlMode` explicitly: feeds are XML
// and should not be parsed with the HTML rules.
const feed = parseFeed(feedXml, {
  xmlMode: true,        // Always true for feeds
  decodeEntities: true  // Decode entities in content
  // `normalizeWhitespace` is intentionally not passed: it is not part of the
  // options accepted by current htmlparser2/domhandler releases and is
  // rejected by TypeScript's excess-property check. Trim or collapse
  // whitespace on the extracted strings instead if you need it.
});

if (feed) {
  // Process media elements (podcasts, etc.)
  for (const item of feed.items) {
    // Items without attachments are skipped.
    if (item.media && item.media.length > 0) {
      console.log(`Media item: ${item.title}`);
      for (const media of item.media) {
        console.log(`  - ${media.type}: ${media.url} (${media.length} bytes)`);
      }
    }
  }
}

Integration with Feed Readers

import { parseFeed } from "htmlparser2";
import https from "https";

/**
 * Minimal feed reader: downloads a feed over HTTPS and parses it with parseFeed.
 */
class SimpleFeedReader {
  /**
   * Download `feedUrl` and parse the response body as a feed.
   *
   * The body is decoded as UTF-8 (the same assumption the implicit
   * Buffer-to-string conversion made before).
   *
   * @param feedUrl - HTTPS URL of the feed document
   * @returns Promise resolving to the parsed feed, or null when the body is not a feed;
   *          rejects when the request, the response stream, or parsing fails
   */
  async fetchAndParse(feedUrl: string): Promise<Feed | null> {
    return new Promise((resolve, reject) => {
      https.get(feedUrl, (response) => {
        // Decode at the stream level: appending raw Buffer chunks to a string
        // converts each chunk on its own and corrupts any multi-byte character
        // that happens to be split across two chunks.
        response.setEncoding('utf8');

        let data = '';

        response.on('data', (chunk: string) => {
          data += chunk;
        });

        // A failure while the body is streaming must reject the promise
        // instead of surfacing as an unhandled 'error' event.
        response.on('error', reject);

        response.on('end', () => {
          try {
            resolve(parseFeed(data));
          } catch (error) {
            reject(error);
          }
        });
      }).on('error', reject);
    });
  }

  /**
   * Fetch a feed and return its most recent items, newest first.
   *
   * Items without a publication date are treated as timestamp 0.
   *
   * @param feedUrl - HTTPS URL of the feed document
   * @param count - Maximum number of items to return (default 5)
   * @returns Up to `count` items, or an empty array when the URL is not a feed
   */
  async getLatestItems(feedUrl: string, count: number = 5) {
    const feed = await this.fetchAndParse(feedUrl);

    if (!feed) return [];

    // Sort a copy: Array.prototype.sort works in place and would otherwise
    // reorder feed.items as a side effect. `||` (not `??`) is deliberate so
    // that NaN from an invalid Date also falls back to 0.
    return [...feed.items]
      .sort((a, b) => (b.pubDate?.getTime() || 0) - (a.pubDate?.getTime() || 0))
      .slice(0, count);
  }
}

// Usage
// NOTE: top-level `await` only works in an ES module; in other contexts wrap
// these lines in an async function.
const reader = new SimpleFeedReader();
// No count is passed, so getLatestItems uses its default of 5 items.
const latestItems = await reader.getLatestItems('https://example.com/feed.xml');