HTML Content / Article Extractor, web scraping for Python3
—
Comprehensive configuration system for customizing Goose3 extraction behavior, including parser selection, language targeting, content identification patterns, network settings, and image handling options.
The main configuration class that controls all aspects of the extraction process.
class Configuration:
def __init__(self):
"""Initialize configuration with default values."""
# Parser and processing options
parser_class: str # 'lxml' or 'soup'
available_parsers: list # Available parser names
# Language and localization
target_language: str # Language code (e.g., 'en', 'es', 'zh')
use_meta_language: bool # Use meta tags for language detection
# Network and fetching
browser_user_agent: str # User agent string for requests
http_timeout: float # HTTP request timeout in seconds
http_auth: tuple # HTTP authentication tuple (username, password)
http_proxies: dict # HTTP proxy configuration
http_headers: dict # Additional HTTP headers
strict: bool # Strict error handling for network issues
# Image processing
enable_image_fetching: bool # Enable image downloading and processing
local_storage_path: str # Directory for storing downloaded images
images_min_bytes: int # Minimum image size in bytes
imagemagick_convert_path: str # Path to ImageMagick convert binary (unused)
imagemagick_identify_path: str # Path to ImageMagick identify binary (unused)
# Content processing options
parse_lists: bool # Parse and format list elements
pretty_lists: bool # Pretty formatting for lists
parse_headers: bool # Parse header elements
keep_footnotes: bool # Keep footnote content
# Content extraction patterns (properties with getters/setters)
known_context_patterns: list # Patterns for identifying article content
known_publish_date_tags: list # Patterns for extracting publication dates
known_author_patterns: list # Patterns for extracting author information
# Advanced options
stopwords_class: type # Class for stopwords processing
log_level: str # Logging level
# Methods
def get_parser(self):
"""Retrieve the current parser class based on parser_class setting"""
...
Classes for defining custom content extraction patterns.
class ArticleContextPattern:
def __init__(self, *, attr=None, value=None, tag=None, domain=None):
"""
Pattern for identifying article content areas.
Parameters:
- attr: HTML attribute name (e.g., 'class', 'id')
- value: Attribute value to match
- tag: HTML tag name to match
- domain: Domain to which this pattern applies (optional)
Note: Must provide either (attr and value) or tag
Raises:
- Exception: If neither (attr and value) nor tag is provided
"""
attr: str
value: str
tag: str
domain: str
class PublishDatePattern:
def __init__(self, *, attr=None, value=None, content=None, subcontent=None, tag=None, domain=None):
"""
Pattern for extracting publication dates.
Parameters:
- attr: HTML attribute name
- value: Attribute value to match
- content: Name of attribute containing the date value
- subcontent: JSON object key for nested data (optional)
- tag: HTML tag name to match
- domain: Domain to which this pattern applies (optional)
Note: Must provide either (attr and value) or tag
Raises:
- Exception: If neither (attr and value) nor tag is provided
"""
attr: str
value: str
content: str
subcontent: str
tag: str
domain: str
class AuthorPattern:
def __init__(self, *, attr=None, value=None, tag=None, domain=None):
"""
Pattern for extracting author information.
Parameters:
- attr: HTML attribute name
- value: Attribute value to match
- tag: HTML tag name to match
- domain: Domain to which this pattern applies (optional)
Note: Must provide either (attr and value) or tag
Raises:
- Exception: If neither (attr and value) nor tag is provided
"""
attr: str
value: str
tag: str
domain: str
Basic configuration setup:
from goose3 import Configuration
config = Configuration()
config.parser_class = 'soup'
config.target_language = 'es'
config.browser_user_agent = 'Mozilla/5.0 Custom Agent'
Image extraction configuration:
config = Configuration()
config.enable_image_fetching = True
config.local_storage_path = '/tmp/goose_images'
Custom content patterns:
from goose3 import Configuration, ArticleContextPattern
config = Configuration()
# Add custom article content pattern
custom_pattern = ArticleContextPattern(
attr='class',
value='article-body',
domain='example.com'
)
config.known_context_patterns.append(custom_pattern)
# Add tag-based pattern
tag_pattern = ArticleContextPattern(tag='main')
config.known_context_patterns.append(tag_pattern)
Language-specific configuration:
# Chinese language support
from goose3.text import StopWordsChinese
config = Configuration()
config.target_language = 'zh'
config.use_meta_language = False
config.stopwords_class = StopWordsChinese
# Arabic language support
from goose3.text import StopWordsArabic
config = Configuration()
config.target_language = 'ar'
config.use_meta_language = False
config.stopwords_class = StopWordsArabic
# Korean language support
from goose3.text import StopWordsKorean
config = Configuration()
config.target_language = 'ko'
config.use_meta_language = False
config.stopwords_class = StopWordsKorean
# Automatic language detection
config = Configuration()
config.use_meta_language = True
Network and error handling:
# Lenient error handling
config = Configuration()
config.strict = False # Don't raise network exceptions
# Custom user agent
config = Configuration()
config.browser_user_agent = 'MyBot/1.0 (Custom Web Crawler)'
Goose3 includes built-in content extraction patterns:
# Default article content patterns
KNOWN_ARTICLE_CONTENT_PATTERNS = [
ArticleContextPattern(attr="class", value="short-story"),
ArticleContextPattern(attr="itemprop", value="articleBody"),
ArticleContextPattern(attr="class", value="post-content"),
ArticleContextPattern(attr="class", value="g-content"),
ArticleContextPattern(attr="class", value="post-outer"),
ArticleContextPattern(tag="article"),
]
# Available parsers
AVAILABLE_PARSERS = {
"lxml": Parser, # Default HTML parser
"soup": ParserSoup, # BeautifulSoup parser
}
Combining multiple configuration options:
from goose3 import Goose, Configuration, PublishDatePattern
config = Configuration()
config.parser_class = 'lxml'
config.target_language = 'en'
config.enable_image_fetching = True
config.local_storage_path = '/tmp/article_images'
config.strict = True
config.browser_user_agent = 'ArticleBot/1.0'
# Add custom publish date pattern
date_pattern = PublishDatePattern(
attr='property',
value='article:published_time',
content='content'
)
config.known_publish_date_tags.append(date_pattern)
g = Goose(config)
article = g.extract(url='https://example.com/article')
Install with Tessl CLI
npx tessl i tessl/pypi-goose3