Get author property from HTML markup using metascraper plugin rules
npx @tessl/cli install tessl/npm-metascraper-author@5.49.0
Metascraper Author is a specialized plugin for the metascraper ecosystem that extracts author information from HTML markup. It implements a comprehensive set of extraction rules to identify authors from various HTML structures, including JSON-LD structured data, meta tags, microdata, and semantic HTML elements.
npm install metascraper-author
const metascraperAuthor = require('metascraper-author');
Note: This package uses CommonJS exports only. ES6 import syntax is not supported.
// Build a metascraper instance whose only rule bundle is the author plugin.
// The plugin export is a factory, so it is called: require('metascraper-author')().
const metascraper = require('metascraper')([
require('metascraper-author')()
]);
// Sample document: the author is declared through a <meta name="author"> tag.
const html = `
<html>
<head>
<meta name="author" content="John Doe">
</head>
<body>
<article>
<h1>Sample Article</h1>
<p>Content here...</p>
</article>
</body>
</html>
`;
// URL of the page the markup belongs to; it is passed to metascraper together with the HTML.
const url = 'https://example.com/article';
(async () => {
const metadata = await metascraper({ html, url });
console.log(metadata.author); // "John Doe"
})();
Metascraper Author follows the metascraper plugin architecture pattern:
Creates and returns extraction rules for identifying author information from HTML markup.
/**
* Factory function that returns metascraper rules for author extraction.
*
* Takes no arguments. This listing is an API signature sketch rather than
* runnable code: `RulesOptions[]` below is type notation standing in for the
* array of rule functions, not a JavaScript expression.
*
* @returns {Rules} Rules object containing author extraction strategies
*/
function metascraperAuthor() {
return {
/** Array of extraction rules for author identification (shown as a type, not a value) */
author: RulesOptions[],
/** Package identifier for metascraper */
pkgName: 'metascraper-author'
};
}
/**
* Rule extraction function type
* @typedef {Function} RulesOptions
* @param {RulesTestOptions} options - Rule execution context
* @returns {string|null|undefined} Extracted value or null/undefined if not found
*/
/**
* Rule execution context
* @typedef {Object} RulesTestOptions
* @property {import('cheerio').CheerioAPI} htmlDom - Cheerio DOM instance
* @property {string} url - Page URL for context
*/
/**
* Metascraper rules object
* @typedef {Object} Rules
* @property {RulesOptions[]} [author] - Array of author extraction rules
* @property {string} [pkgName] - Package identifier
* @property {Function} [test] - Optional test function for conditional rule execution
*/
The plugin implements 13 different extraction strategies in priority order:
- author.name property in JSON-LD
- brand.name property as fallback
- <meta name="author" content="...">
- <meta property="article:author" content="...">
- itemprop*="author" containing itemprop="name"
- itemprop*="author"
- rel="author"
The plugin uses internal validation mechanisms to ensure quality author extraction:
/**
* Internal strict validation function
* Enforces stricter matching criteria for author extraction rules
* @param {Function} rule - Base extraction rule to enhance
* @returns {Function} Enhanced rule with strict validation
*/
const strict = rule => $ => {
const value = rule($);
return /^\S+\s+\S+/.test(value) && value; // Must contain at least two words
};
Validation Features:
- Strict matching: the value must begin with at least two whitespace-separated words (/^\S+\s+\S+/)
This package depends on @metascraper/helpers, which provides the following key utility functions:
/**
* Extract JSON-LD structured data values
* @param {string} path - JSONPath expression (e.g., 'author.name')
* @returns {Function} Rule function for extracting JSON-LD values
*/
const $jsonld = require('@metascraper/helpers').$jsonld;
/**
* Filter and extract text content from DOM elements
* @param {CheerioAPI} $ - Cheerio instance
* @param {CheerioElement} elements - Selected elements
* @param {Function} [filterFn] - Optional element filter function
* @returns {string|null} Extracted and cleaned text content
*/
const $filter = require('@metascraper/helpers').$filter;
/**
* Convert a mapping function into a metascraper rule
* @param {Function} mapper - Function to process extracted values
* @returns {Function} Metascraper-compatible rule function
*/
const toRule = require('@metascraper/helpers').toRule;
/**
* Validate and parse date values
* @param {string} value - Potential date string
* @returns {boolean} True if value is a valid date
*/
const date = require('@metascraper/helpers').date;
/**
* Clean and validate author strings
* @param {string} value - Raw author value
* @returns {string|null} Cleaned author string or null if invalid
*/
const author = require('@metascraper/helpers').author;
The plugin includes comprehensive validation mechanisms:
- Strict validation pattern (REGEX_STRICT)
The plugin gracefully handles various edge cases:
- Returns false when target elements are not found
- Returns false for content that doesn't meet validation criteria
This plugin is designed to be used within the metascraper ecosystem:
- metascraper and metascraper-author
- author property in extraction results
The metascraper-author package uses the following type definitions:
/**
* Main export - factory function for creating author extraction rules.
*
* Assigned directly to `module.exports` (CommonJS) and takes no arguments.
* This listing is a signature sketch: `RulesOptions[]` is type notation for
* the array of rule functions, not a JavaScript expression.
*
* @returns {Rules} Metascraper rules object
*/
module.exports = function metascraperAuthor() {
return {
// Author extraction rules (shown as a type, not a value)
author: RulesOptions[],
// Package identifier
pkgName: 'metascraper-author'
};
};
/**
* Individual extraction rule function
* @typedef {Function} RulesOptions
* @param {RulesTestOptions} options - Extraction context
* @returns {string|null|undefined} Extracted author or null if not found
*/
/**
* Context provided to each rule during extraction
* @typedef {Object} RulesTestOptions
* @property {import('cheerio').CheerioAPI} htmlDom - Cheerio DOM API for HTML parsing
* @property {string} url - Source URL for context and relative link resolution
*/
/**
* Complete rules object returned by the factory function
* @typedef {Object} Rules
* @property {RulesOptions[]} author - Prioritized array of author extraction rules
* @property {string} pkgName - Package identifier for debugging
* @property {Function} [test] - Optional conditional test function
*/