tessl/npm-nuxtjs--sitemap

Powerfully flexible XML Sitemaps that integrate seamlessly, for Nuxt.

—

Pending

Quality

Pending

Does it follow best practices?

Impact

Pending

No eval scenarios have been run

Overview

Eval results

Files

XML and HTML Utilities

Name: tessl/npm-nuxtjs--sitemap
Author: tessl

Utility functions for parsing existing XML sitemaps and extracting sitemap metadata from HTML documents for analysis and integration purposes.

import { parseSitemapXml, parseHtmlExtractSitemapMeta } from '@nuxtjs/sitemap/utils';
import type { SitemapParseResult, SitemapWarning } from '@nuxtjs/sitemap/utils';

Capabilities

XML Sitemap Parsing

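Parse existing XML sitemap content into structured data, reporting validation warnings alongside the parsed URLs. The function is asynchronous: it returns a Promise, so callers must `await` it before reading the result.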

/**
 * Parse XML sitemap content into structured data
 * Handles both regular sitemaps and sitemap index files
 *
 * The result is delivered asynchronously: callers must `await` the returned
 * promise (or chain `.then`) before reading `urls` or `warnings`.
 *
 * @param xml - Raw XML sitemap content as string
 * @returns Promise resolving to parsed sitemap data with URLs and validation warnings
 */
function parseSitemapXml(xml: string): Promise<SitemapParseResult>;

/**
 * Value the promise returned by `parseSitemapXml` resolves to: the parsed URL
 * entries together with any validation warnings collected during parsing.
 */
interface SitemapParseResult {
  /**
   * Array of parsed sitemap URLs.
   * NOTE(review): `SitemapUrlInput` is not declared in the visible part of this
   * document — confirm how it relates to `SitemapUrl` before relying on
   * object-only fields such as `loc`.
   */
  urls: SitemapUrlInput[];
  /** Array of validation warnings encountered during parsing */
  warnings: SitemapWarning[];
}

/** A warning reported while parsing a sitemap; returned in `SitemapParseResult.warnings`. */
interface SitemapWarning {
  /** Type of warning encountered; `'validation'` is the only value declared here */
  type: 'validation';
  /** Human-readable warning message */
  message: string;
  /** Context information about where the warning occurred (an object, not a bare string) */
  context?: {
    /** presumably the URL entry the warning relates to — TODO confirm */
    url?: string;
    /** presumably the name of the field that failed validation — TODO confirm */
    field?: string;
    /** presumably the raw value that triggered the warning — TODO confirm */
    value?: unknown;
  };
}

HTML Metadata Extraction

Extract sitemap-relevant metadata from HTML documents for automatic discovery and analysis.

/**
 * Extract sitemap metadata from HTML document content
 * Discovers images, videos, and other sitemap-relevant information
 *
 * Unlike `parseSitemapXml`, this declaration returns its result directly
 * (no Promise), so no `await` is needed.
 *
 * NOTE(review): the behavior when an option is omitted (its default) is not
 * shown here — confirm against the package documentation.
 *
 * @param html - Raw HTML content as string
 * @param options - Optional configuration for metadata extraction
 * @returns Array of sitemap URLs with discovered metadata
 */
function parseHtmlExtractSitemapMeta(
  html: string, 
  options?: {
    /** Whether to discover images in the HTML content */
    images?: boolean;
    /** Whether to discover videos in the HTML content */
    videos?: boolean;
    /** Whether to extract lastmod information */
    lastmod?: boolean;
    /** Whether to extract alternative language links */
    alternatives?: boolean;
    /** Function to resolve relative URLs to absolute URLs */
    resolveUrl?: (url: string) => string;
  }
): SitemapUrl[];

Parsed Data Types

URL Entry Structure

/**
 * A single URL entry of a sitemap. Only `loc` is required; every other field
 * is optional metadata attached to that URL.
 */
interface SitemapUrl {
  /** URL location (required) */
  loc: string;
  /** Last modification date */
  lastmod?: string | Date;
  /** Change frequency indicator */
  changefreq?: Changefreq;
  /** Priority value between 0.0 and 1.0, restricted by the type to steps of 0.1 */
  priority?: 0 | 0.1 | 0.2 | 0.3 | 0.4 | 0.5 | 0.6 | 0.7 | 0.8 | 0.9 | 1;
  /** Alternative language versions */
  alternatives?: AlternativeEntry[];
  /** Google News metadata */
  news?: GoogleNewsEntry;
  /** Associated images */
  images?: ImageEntry[];
  /** Associated videos */
  videos?: VideoEntry[];
}

/** Values accepted for the `changefreq` field of a `SitemapUrl`. */
type Changefreq = 'always' | 'hourly' | 'daily' | 'weekly' | 'monthly' | 'yearly' | 'never';

Image Metadata Structure

/**
 * Metadata for one image associated with a sitemap URL (see `SitemapUrl.images`).
 * Only `loc` is required.
 */
interface ImageEntry {
  /** Image URL location */
  loc: string | URL;
  /** Image caption text */
  caption?: string;
  /** Geographic location information */
  geoLocation?: string;
  /** Image title */
  title?: string;
  /** License URL */
  license?: string | URL;
}

Video Metadata Structure

/**
 * Metadata for one video associated with a sitemap URL (see `SitemapUrl.videos`).
 * `title`, `thumbnail_loc` and `description` are required; everything else is optional.
 * The yes/no style flags accept either the string literals `'yes'` / `'no'` or a boolean.
 */
interface VideoEntry {
  /** Video title (required) */
  title: string;
  /** Video thumbnail URL (required) */
  thumbnail_loc: string | URL;
  /** Video description (required) */
  description: string;
  /** Direct video content URL */
  content_loc?: string | URL;
  /** Video player page URL */
  player_loc?: string | URL;
  /** Video duration in seconds */
  duration?: number;
  /** Video expiration date */
  expiration_date?: Date | string;
  /** Video rating (0.0 to 5.0) */
  rating?: number;
  /** View count */
  view_count?: number;
  /** Publication date */
  publication_date?: Date | string;
  /** Family-friendly flag */
  family_friendly?: 'yes' | 'no' | boolean;
  /** Geographic restrictions */
  restriction?: Restriction;
  /** Platform restrictions */
  platform?: Platform;
  /** Pricing information */
  price?: PriceEntry[];
  /** Subscription requirement */
  requires_subscription?: 'yes' | 'no' | boolean;
  /** Uploader information */
  uploader?: {
    /** presumably the uploader's display name — TODO confirm */
    uploader: string;
    /** presumably a URL with more information about the uploader — TODO confirm */
    info?: string | URL;
  };
  /** Live content indicator */
  live?: 'yes' | 'no' | boolean;
  /** Content tags; a single tag or a list of tags */
  tag?: string | string[];
}

/** Geographic restriction attached to a video (see `VideoEntry.restriction`). */
interface Restriction {
  /** Whether the listed value is allowed or denied */
  relationship: 'allow' | 'deny';
  /** presumably the list of country codes the relationship applies to — TODO confirm format */
  restriction: string;
}

/** Platform restriction attached to a video (see `VideoEntry.platform`). */
interface Platform {
  /** Whether the listed value is allowed or denied */
  relationship: 'allow' | 'deny';
  /** presumably the platform name(s) the relationship applies to — TODO confirm format */
  platform: string;
}

/** One pricing option for a video (see `VideoEntry.price`); all fields are optional. */
interface PriceEntry {
  /** Price amount, as a number or a pre-formatted string */
  price?: number | string;
  /** presumably a currency code for `price` — TODO confirm expected format */
  currency?: string;
  /** Kind of purchase this price applies to */
  type?: 'rent' | 'purchase' | 'package' | 'subscription';
}

Alternative URL Structure

/** One alternative language version of a sitemap URL (see `SitemapUrl.alternatives`). */
interface AlternativeEntry {
  /** Language/locale code (hreflang attribute) */
  hreflang: string;
  /** Alternative URL */
  href: string | URL;
}

Google News Structure

/** Google News metadata for a sitemap URL (see `SitemapUrl.news`); all fields are required. */
interface GoogleNewsEntry {
  /** News article title */
  title: string;
  /** Article publication date in W3C format */
  publication_date: Date | string;
  /** Publication information */
  publication: {
    /** Publication name as it appears on news.google.com */
    name: string;
    /** Publication language (ISO 639 code) */
    language: string;
  };
}

Usage Examples:

// Parse an existing XML sitemap
import { parseSitemapXml } from '@nuxtjs/sitemap/utils';

const xmlContent = `<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <url>
    <loc>https://example.com/</loc>
    <lastmod>2023-12-01</lastmod>
    <changefreq>daily</changefreq>
    <priority>1.0</priority>
  </url>
  <url>
    <loc>https://example.com/about</loc>
    <lastmod>invalid-date</lastmod>
    <priority>0.8</priority>
  </url>
</urlset>`;

// parseSitemapXml returns a Promise, so the result must be awaited before
// `urls` and `warnings` can be read.
const result = await parseSitemapXml(xmlContent);

console.log(result.urls);
// [
//   {
//     loc: 'https://example.com/',
//     lastmod: '2023-12-01',
//     changefreq: 'daily',
//     priority: 1.0
//   },
//   {
//     loc: 'https://example.com/about',
//     priority: 0.8
//   }
// ]

console.log(result.warnings);
// Illustrative output. Each entry follows the SitemapWarning shape:
// `type` is 'validation' and `context` is an object, not a string.
// [
//   {
//     type: 'validation',
//     message: 'Invalid lastmod date: invalid-date',
//     context: {
//       url: 'https://example.com/about',
//       field: 'lastmod',
//       value: 'invalid-date'
//     }
//   }
// ]

// Extract metadata from HTML content
import { parseHtmlExtractSitemapMeta } from '@nuxtjs/sitemap/utils';

const htmlContent = `
<!DOCTYPE html>
<html>
<head>
  <title>My Blog Post</title>
  <meta property="og:image" content="https://example.com/hero.jpg">
  <meta property="article:published_time" content="2023-12-01T10:00:00Z">
</head>
<body>
  <h1>My Blog Post</h1>
  <img src="/images/diagram.png" alt="Technical diagram">
  <video src="/videos/demo.mp4" poster="/videos/demo-thumb.jpg">
    <source src="/videos/demo.mp4" type="video/mp4">
  </video>
</body>
</html>
`;

// Called without options and without `await`: the declared return type is a
// plain array, not a Promise.
const metadata = parseHtmlExtractSitemapMeta(htmlContent);

console.log(metadata);
// Illustrative output. No `resolveUrl` option is passed, so relative URLs
// are shown unresolved here — NOTE(review): confirm against actual output.
// [
//   {
//     images: [
//       {
//         loc: 'https://example.com/hero.jpg',
//         title: 'My Blog Post'
//       },
//       {
//         loc: '/images/diagram.png',
//         caption: 'Technical diagram'
//       }
//     ],
//     videos: [
//       {
//         title: 'My Blog Post',
//         content_loc: '/videos/demo.mp4',
//         thumbnail_loc: '/videos/demo-thumb.jpg'
//       }
//     ],
//     lastmod: '2023-12-01T10:00:00Z'
//   }
// ]

// Handle parsing errors gracefully
// `invalidXml` stands for any sitemap XML string that may be malformed.
try {
  // Await inside the try block so that a rejected promise reaches the catch
  // handler. Without `await`, `result` would be a pending Promise and
  // `result.urls` would be undefined.
  const result = await parseSitemapXml(invalidXml);

  // Process results
  result.urls.forEach(url => {
    console.log(`Processing URL: ${url.loc}`);
  });

  // Handle warnings
  if (result.warnings.length > 0) {
    console.warn('Parsing warnings:');
    result.warnings.forEach(warning => {
      console.warn(`- ${warning.type}: ${warning.message}`);
    });
  }
} catch (error) {
  console.error('Failed to parse sitemap XML:', error);
}

// Integration with existing sitemap generation
import { parseSitemapXml, parseHtmlExtractSitemapMeta } from '@nuxtjs/sitemap/utils';

// Parse competitor's sitemap for analysis
const competitorSitemap = await $fetch('https://competitor.com/sitemap.xml');
// parseSitemapXml is asynchronous; await it before reading `urls`.
const parsed = await parseSitemapXml(competitorSitemap);

// Use parsed data to inform your sitemap structure
const competitorUrls = parsed.urls.map(url => ({
  loc: url.loc.replace('competitor.com', 'mysite.com'),
  // Slightly lower priority. `??` keeps an explicit priority of 0 (which `||`
  // would replace with 0.5), and rounding to one decimal avoids floating-point
  // artifacts such as 0.8 - 0.1 === 0.7000000000000001.
  priority: Math.max(0.1, Math.round(((url.priority ?? 0.5) - 0.1) * 10) / 10)
}));

// Extract metadata from rendered pages for automatic discovery
const pageHtml = await $fetch('https://mysite.com/blog/post-1');
const extractedMeta = parseHtmlExtractSitemapMeta(pageHtml);

// Combine with existing sitemap data
const enrichedUrl = {
  ...extractedMeta[0], // Use discovered metadata
  // Explicit fields come after the spread so they always win, even if the
  // discovered entry carries its own `loc` or `priority`.
  loc: '/blog/post-1',
  priority: 0.8
};