HTML Content / Article Extractor, web scraping for Python3
npx @tessl/cli install tessl/pypi-goose3@3.1.0

A comprehensive Python library for extracting article content, metadata, and media from web pages and HTML documents. Goose3 intelligently identifies main article content while filtering out navigation, advertisements, and other non-content elements using advanced text analysis algorithms.
pip install goose3
pip install goose3[chinese] - Chinese language support
pip install goose3[arabic] - Arabic language support
pip install goose3[all] - All language extensions
from goose3 import Goose
For configuration and data types:
from goose3 import Goose, Configuration, Article, Image, Video
from goose3 import ArticleContextPattern, PublishDatePattern, AuthorPattern
For language-specific text processing:
from goose3.text import StopWords, StopWordsChinese, StopWordsArabic, StopWordsKorean
from goose3 import Goose
# Basic extraction from URL
g = Goose()
article = g.extract(url='https://example.com/article')
print(article.title)
print(article.cleaned_text)
print(article.meta_description)
if article.top_image:
    print(article.top_image.src)
# Extract from raw HTML
html_content = "<html>...</html>"
article = g.extract(raw_html=html_content)
# Using as context manager (recommended)
with Goose() as g:
    article = g.extract(url='https://example.com/article')
    print(article.title)

Goose3 uses a multi-stage extraction pipeline:
Main article extraction functionality that processes URLs or HTML to extract clean text content, metadata, and media elements.
class Goose:
def __init__(self, config=None): ...
def extract(self, url=None, raw_html=None) -> Article: ...
def close(self): ...
def shutdown_network(self): ...

Comprehensive configuration options for customizing extraction behavior, including parser selection, language targeting, content patterns, and network settings.
class Configuration:
def __init__(self): ...
# Key properties
parser_class: str
target_language: str
browser_user_agent: str
enable_image_fetching: bool
strict: bool
local_storage_path: str

Rich data structure containing extracted content, metadata, and media with comprehensive property access for all extracted information.
class Article:
@property
def title(self) -> str: ...
@property
def cleaned_text(self) -> str: ...
@property
def top_image(self) -> Image: ...
@property
def movies(self) -> list[Video]: ...
# ... additional properties

Image and video extraction capabilities with support for metadata, dimensions, and embedded content from various platforms.
class Image:
src: str
width: int
height: int
class Video:
src: str
embed_code: str
embed_type: str
width: int
height: int

from typing import Union, Optional, List, Dict, Any
# Main extraction interface
ExtractInput = Union[str, None] # URL or raw HTML
ConfigInput = Union[Configuration, dict, None]
# Pattern matching for content extraction
class ArticleContextPattern:
def __init__(self, *, attr=None, value=None, tag=None, domain=None): ...
attr: str
value: str
tag: str
domain: str
class PublishDatePattern:
def __init__(self, *, attr=None, value=None, content=None, subcontent=None, tag=None, domain=None): ...
attr: str
value: str
content: str
subcontent: str
tag: str
domain: str
class AuthorPattern:
def __init__(self, *, attr=None, value=None, tag=None, domain=None): ...
attr: str
value: str
tag: str
domain: str
# Exception types
class NetworkError(RuntimeError):
"""Network-related errors during content fetching"""
def __init__(self, status_code, reason): ...
status_code: int # HTTP status code
reason: str # HTTP reason phrase
message: str # Formatted error message
# Language-specific text processing classes
class StopWords:
"""Base stopwords class for English text processing"""
def __init__(self, language: str = 'en'): ...
class StopWordsChinese(StopWords):
"""Chinese language stopwords for improved text analysis"""
def __init__(self): ...
class StopWordsArabic(StopWords):
"""Arabic language stopwords for improved text analysis"""
def __init__(self): ...
class StopWordsKorean(StopWords):
"""Korean language stopwords for improved text analysis"""
def __init__(self): ...