tessl/pypi-goose3

HTML Content / Article Extractor, web scraping for Python3

—

Pending

Overview

Eval results

Files

Configuration

Name: tessl/pypi-goose3
Author: tessl

Comprehensive configuration system for customizing Goose3 extraction behavior, including parser selection, language targeting, content identification patterns, network settings, and image handling options.

Capabilities

Configuration Class

The main configuration class that controls all aspects of the extraction process.

class Configuration:
    """Settings object that controls the extraction process.

    This is an interface summary: attributes are listed with their types,
    grouped by concern (parser selection, language, network fetching, image
    handling, content processing, extraction patterns, advanced options).
    """

    def __init__(self):
        """Initialize configuration with default values."""
        
    # Parser and processing options
    parser_class: str  # 'lxml' or 'soup'
    available_parsers: list  # Available parser names
    
    # Language and localization
    target_language: str  # Language code (e.g., 'en', 'es', 'zh')
    use_meta_language: bool  # Use meta tags for language detection
    
    # Network and fetching
    browser_user_agent: str  # User agent string for requests
    http_timeout: float  # HTTP request timeout in seconds
    http_auth: tuple  # HTTP authentication tuple (username, password)
    http_proxies: dict  # HTTP proxy configuration
    http_headers: dict  # Additional HTTP headers
    strict: bool  # Strict error handling for network issues
    
    # Image processing
    enable_image_fetching: bool  # Enable image downloading and processing
    local_storage_path: str  # Directory for storing downloaded images
    images_min_bytes: int  # Minimum image size in bytes
    imagemagick_convert_path: str  # Path to ImageMagick convert binary (unused)
    imagemagick_identify_path: str  # Path to ImageMagick identify binary (unused)
    
    # Content processing options
    parse_lists: bool  # Parse and format list elements
    pretty_lists: bool  # Pretty formatting for lists
    parse_headers: bool  # Parse header elements
    keep_footnotes: bool  # Keep footnote content
    
    # Content extraction patterns (properties with getters/setters)
    known_context_patterns: list  # Patterns for identifying article content
    known_publish_date_tags: list  # Patterns for extracting publication dates
    known_author_patterns: list  # Patterns for extracting author information
    
    # Advanced options
    stopwords_class: type  # Class for stopwords processing
    log_level: str  # Logging level
    
    # Methods
    def get_parser(self): 
        """Retrieve the current parser class based on parser_class setting"""
        ...

Pattern Helper Classes

Classes for defining custom content extraction patterns.

class ArticleContextPattern:
    """Describes where article content lives: an attr/value pair or a tag name."""

    def __init__(self, *, attr=None, value=None, tag=None, domain=None):
        """
        Pattern for identifying article content areas.
        
        All arguments are keyword-only.
        
        Parameters:
        - attr: HTML attribute name (e.g., 'class', 'id')
        - value: Attribute value to match
        - tag: HTML tag name to match
        - domain: Domain to which this pattern applies (optional)
        
        Note: Must provide either (attr and value) or tag
        
        Raises:
        - Exception: If neither (attr and value) nor tag is provided
        """
        
    attr: str  # HTML attribute name (e.g., 'class', 'id')
    value: str  # Attribute value to match
    tag: str  # HTML tag name to match
    domain: str  # Domain to which this pattern applies (optional)

class PublishDatePattern:
    """Describes where a publication date can be found in a page."""

    def __init__(self, *, attr=None, value=None, content=None, subcontent=None, tag=None, domain=None):
        """
        Pattern for extracting publication dates.
        
        All arguments are keyword-only.
        
        Parameters:
        - attr: HTML attribute name
        - value: Attribute value to match
        - content: Name of attribute containing the date value
        - subcontent: JSON object key for nested data (optional)
        - tag: HTML tag name to match
        - domain: Domain to which this pattern applies (optional)
        
        Note: Must provide either (attr and value) or tag
        
        Raises:
        - Exception: If neither (attr and value) nor tag is provided
        """
        
    attr: str  # HTML attribute name
    value: str  # Attribute value to match
    content: str  # Name of attribute containing the date value
    subcontent: str  # JSON object key for nested data (optional)
    tag: str  # HTML tag name to match
    domain: str  # Domain to which this pattern applies (optional)

class AuthorPattern:
    """Describes where author information can be found in a page."""

    def __init__(self, *, attr=None, value=None, tag=None, domain=None):
        """
        Pattern for extracting author information.
        
        All arguments are keyword-only.
        
        Parameters:
        - attr: HTML attribute name
        - value: Attribute value to match
        - tag: HTML tag name to match
        - domain: Domain to which this pattern applies (optional)
        
        Note: Must provide either (attr and value) or tag
        
        Raises:
        - Exception: If neither (attr and value) nor tag is provided
        """
        
    attr: str  # HTML attribute name
    value: str  # Attribute value to match
    tag: str  # HTML tag name to match
    domain: str  # Domain to which this pattern applies (optional)

Configuration Examples

Basic configuration setup:

from goose3 import Configuration

# Start from the default values, then override individual settings
config = Configuration()
config.parser_class = 'soup'  # one of 'lxml' or 'soup'
config.target_language = 'es'  # language code
config.browser_user_agent = 'Mozilla/5.0 Custom Agent'  # user agent string for requests

Image extraction configuration:

config = Configuration()
config.enable_image_fetching = True  # enable image downloading and processing
config.local_storage_path = '/tmp/goose_images'  # directory for storing downloaded images

Custom content patterns:

from goose3 import Configuration, ArticleContextPattern

config = Configuration()

# Add custom article content pattern: match on an attribute/value pair,
# applied only to the given domain
custom_pattern = ArticleContextPattern(
    attr='class', 
    value='article-body',
    domain='example.com'
)
# The article content pattern list is exposed as `known_context_patterns`
config.known_context_patterns.append(custom_pattern)

# Add tag-based pattern (a tag name alone is sufficient; no attr/value needed)
tag_pattern = ArticleContextPattern(tag='main')
config.known_context_patterns.append(tag_pattern)

Language-specific configuration:

# Each example sets an explicit target language, turns off meta-tag language
# detection, and supplies the stopwords class for that language.

# Chinese language support
from goose3.text import StopWordsChinese

config = Configuration()
config.target_language = 'zh'
config.use_meta_language = False  # do not use meta tags for language detection
config.stopwords_class = StopWordsChinese

# Arabic language support
from goose3.text import StopWordsArabic

config = Configuration()
config.target_language = 'ar'
config.use_meta_language = False  # do not use meta tags for language detection
config.stopwords_class = StopWordsArabic

# Korean language support
from goose3.text import StopWordsKorean

config = Configuration()
config.target_language = 'ko'
config.use_meta_language = False  # do not use meta tags for language detection
config.stopwords_class = StopWordsKorean

# Automatic language detection (use meta tags for language detection)
config = Configuration()
config.use_meta_language = True

Network and error handling:

# Lenient error handling: disable strict handling of network issues
config = Configuration()
config.strict = False  # Don't raise network exceptions

# Custom user agent string sent with requests
config = Configuration()
config.browser_user_agent = 'MyBot/1.0 (Custom Web Crawler)'

Default Patterns

Goose3 includes built-in content extraction patterns and a registry of available parsers:

# Default article content patterns
# All entries match on an attribute/value pair except the last, which matches
# on the tag name alone.
KNOWN_ARTICLE_CONTENT_PATTERNS = [
    ArticleContextPattern(attr="class", value="short-story"),
    ArticleContextPattern(attr="itemprop", value="articleBody"),
    ArticleContextPattern(attr="class", value="post-content"),
    ArticleContextPattern(attr="class", value="g-content"),
    ArticleContextPattern(attr="class", value="post-outer"),
    ArticleContextPattern(tag="article"),
]

# Available parsers
# Maps parser names ('lxml', 'soup') to parser classes.
AVAILABLE_PARSERS = {
    "lxml": Parser,      # Default HTML parser
    "soup": ParserSoup,  # BeautifulSoup parser
}

Advanced Configuration Usage

Combining multiple configuration options:

from goose3 import Goose, Configuration, PublishDatePattern

# Parser, language, image and network settings combined on one configuration
config = Configuration()
config.parser_class = 'lxml'
config.target_language = 'en'
config.enable_image_fetching = True
config.local_storage_path = '/tmp/article_images'  # directory for downloaded images
config.strict = True  # strict error handling for network issues
config.browser_user_agent = 'ArticleBot/1.0'

# Add custom publish date pattern
# `attr`/`value` select the element; `content` names the attribute that
# holds the date value.
date_pattern = PublishDatePattern(
    attr='property',
    value='article:published_time',
    content='content'
)
config.known_publish_date_tags.append(date_pattern)

# Pass the configuration to Goose and extract an article by URL
g = Goose(config)
article = g.extract(url='https://example.com/article')

Install with Tessl CLI