HTML Content / Article Extractor, web scraping for Python3
—
The Article class is a rich data structure containing all extracted content, metadata, and media from web pages. It provides comprehensive property access to extracted information including text content, metadata, images, videos, and structural data.
Article is the main container for all extracted article information; its read-only properties provide access to content and metadata.
class Article:
def __init__(self):
"""Initialize empty article container."""
# Content properties
@property
def title(self) -> str:
"""Article title extracted from page."""
@property
def cleaned_text(self) -> str:
"""Main article text content, cleaned and formatted."""
@property
def meta_description(self) -> str:
"""Meta description from page metadata."""
@property
def meta_lang(self) -> str:
"""Language metadata from page."""
@property
def meta_favicon(self) -> str:
"""Favicon URL extracted from page."""
@property
def meta_keywords(self) -> str:
"""Meta keywords from page metadata."""
@property
def meta_encoding(self) -> list:
"""Character encoding information."""
# URL and domain properties
@property
def canonical_link(self) -> str:
"""Canonical URL from page metadata."""
@property
def domain(self) -> str:
"""Domain name of the article source."""
@property
def final_url(self) -> str:
"""Final resolved URL after redirects."""
@property
def link_hash(self) -> str:
"""Hash of the article URL."""
# Content structure properties
@property
def top_node(self):
"""Main content DOM node (parser-specific object)."""
@property
def top_node_raw_html(self) -> str:
"""Raw HTML of the main content area."""
@property
def raw_html(self) -> str:
"""Original HTML of the entire page."""
@property
def doc(self):
"""Parsed document object (parser-specific)."""
@property
def raw_doc(self):
"""Raw document object before processing."""
# Media properties
@property
def top_image(self) -> Image:
"""Main article image object."""
@property
def movies(self) -> list:
"""List of Video objects for embedded videos."""
# Structured data properties
@property
def tags(self) -> list:
"""List of article tags extracted from page."""
@property
def opengraph(self) -> dict:
"""OpenGraph metadata as dictionary."""
@property
def tweets(self) -> list:
"""List of embedded tweets."""
@property
def links(self) -> list:
"""List of links found in article content."""
@property
def authors(self) -> list:
"""List of article authors."""
@property
def schema(self):
"""Schema.org structured data from page."""
# Date properties
@property
def publish_date(self) -> str:
"""Publication date as string."""
@property
def publish_datetime_utc(self):
"""Publication datetime in UTC (datetime object)."""
# Additional data
@property
def additional_data(self) -> dict:
"""Additional extracted data as dictionary."""
@property
def infos(self) -> dict:
"""Extraction information and statistics."""

Accessing article content:
from goose3 import Goose
g = Goose()
article = g.extract(url='https://example.com/article')
# Basic content
print(f"Title: {article.title}")
print(f"Text length: {len(article.cleaned_text)} characters")
print(f"Description: {article.meta_description}")
# Metadata
print(f"Language: {article.meta_lang}")
print(f"Domain: {article.domain}")
print(f"Final URL: {article.final_url}")
print(f"Keywords: {article.meta_keywords}")

Working with images:
article = g.extract(url='https://example.com/article')
if article.top_image:
print(f"Main image: {article.top_image.src}")
print(f"Image dimensions: {article.top_image.width}x{article.top_image.height}")
else:
print("No main image found")

Accessing embedded media:
article = g.extract(url='https://example.com/article')
# Videos
if article.movies:
for video in article.movies:
print(f"Video source: {video.src}")
print(f"Embed code: {video.embed_code}")
print(f"Video type: {video.embed_type}")
# Tweets
if article.tweets:
print(f"Found {len(article.tweets)} embedded tweets")

Working with structured data:
article = g.extract(url='https://example.com/article')
# OpenGraph data
if article.opengraph:
og_title = article.opengraph.get('title')
og_image = article.opengraph.get('image')
print(f"OG Title: {og_title}")
print(f"OG Image: {og_image}")
# Tags and categories
if article.tags:
print(f"Tags: {', '.join(article.tags)}")
# Author information
if article.authors:
print(f"Authors: {', '.join(article.authors)}")
# Schema.org data
if article.schema:
print(f"Schema data available: {type(article.schema)}")

Date and time information:
article = g.extract(url='https://example.com/article')
if article.publish_date:
print(f"Published: {article.publish_date}")
if article.publish_datetime_utc:
print(f"Published (UTC): {article.publish_datetime_utc}")

Raw content access:
article = g.extract(url='https://example.com/article')
# Raw HTML content
print(f"Page HTML length: {len(article.raw_html)} characters")
print(f"Main content HTML: {article.top_node_raw_html[:200]}...")
# Extraction statistics
if article.infos:
print(f"Extraction info: {article.infos}")
# Additional extracted data
if article.additional_data:
print(f"Additional data keys: {list(article.additional_data.keys())}")

Not all properties will have values for every article. Always check for empty values:
article = g.extract(url='https://example.com/article')
# Safe access patterns
title = article.title or "No title found"
description = article.meta_description or "No description available"
# Check for media
has_image = article.top_image is not None
has_videos = bool(article.movies)
has_authors = bool(article.authors)
print(f"Content available - Image: {has_image}, Videos: {has_videos}, Authors: {has_authors}")

Install with Tessl CLI
npx tessl i tessl/pypi-goose3