HTML Content / Article Extractor, web scraping for Python 3
—
The main extraction functionality that processes URLs or HTML documents to extract clean text content, metadata, and media elements. This module provides the primary Goose class interface and extraction pipeline.
The Goose class serves as the main entry point for all extraction operations, managing network connections, parser selection, and the complete extraction pipeline.
class Goose:
def __init__(self, config: Union[Configuration, dict, None] = None):
"""
Initialize Goose extractor with optional configuration.
Parameters:
- config: Configuration object, dict of config options, or None for defaults
Raises:
- Exception: If local_storage_path is invalid when image fetching is enabled
"""
def extract(self, url: Union[str, None] = None, raw_html: Union[str, None] = None) -> Article:
"""
Extract article content from URL or raw HTML.
Parameters:
- url: URL to fetch and extract from
- raw_html: Raw HTML string to extract from
Returns:
- Article: Extracted content and metadata
Raises:
- ValueError: If neither url nor raw_html is provided
- NetworkError: Network-related errors during fetching
- UnicodeDecodeError: Character encoding issues
"""
def close(self):
"""
Close network connection and perform cleanup.
Automatically called when using as context manager or during garbage collection.
"""
def shutdown_network(self):
"""
Close the network connection specifically.
Called automatically by close() method.
"""
def __enter__(self):
"""Context manager entry."""
def __exit__(self, exc_type, exc_val, exc_tb):
"""Context manager exit with automatic cleanup."""

Goose supports the context manager protocol for automatic resource cleanup:
with Goose() as g:
article = g.extract(url="https://example.com/article")
print(article.title)
# Network connection automatically closed

Pass configuration as a dict or a Configuration object:
# Dict configuration
g = Goose({
'parser_class': 'soup',
'target_language': 'es',
'enable_image_fetching': True,
'strict': False
})
# Configuration object
from goose3 import Configuration
config = Configuration()
config.parser_class = 'soup'
config.target_language = 'es'
g = Goose(config)

Extract from a URL:
g = Goose()
article = g.extract(url="https://example.com/news-article")

Extract from raw HTML:
html_content = """
<html>
<body>
<h1>Article Title</h1>
<p>Article content goes here...</p>
</body>
</html>
"""
g = Goose()
article = g.extract(raw_html=html_content)

Handle network, input, and encoding errors explicitly:
from goose3 import Goose, NetworkError
g = Goose({'strict': True}) # Raise all network errors
try:
article = g.extract(url="https://example.com/article")
except NetworkError as e:
print(f"Network error: {e}")
except ValueError as e:
print(f"Input error: {e}")
except UnicodeDecodeError as e:
print(f"Encoding error: {e}")

Configure language targeting for better extraction:
# Automatic language detection from meta tags
g = Goose({'use_meta_language': True})
# Force specific language
g = Goose({
'use_meta_language': False,
'target_language': 'es' # Spanish
})
# Chinese language support
g = Goose({'target_language': 'zh'})
# Arabic language support
g = Goose({'target_language': 'ar'})

Choose between available HTML parsers:
# Default lxml parser (faster, more robust)
g = Goose({'parser_class': 'lxml'})
# BeautifulSoup parser (more lenient)
g = Goose({'parser_class': 'soup'})

Install with Tessl CLI:
npx tessl i tessl/pypi-goose3