tessl/pypi-goose3

HTML Content / Article Extractor, web scraping for Python3

—

Pending

Overview

Eval results

Files

Article Data

Name: tessl/pypi-goose3
Author: tessl

Rich data structure containing all extracted content, metadata, and media from web pages. The Article class provides comprehensive property access to extracted information including text content, metadata, images, videos, and structural data.

Capabilities

Article Class

Main container for all extracted article information with read-only properties providing access to content and metadata.

class Article:
    """Read-only container for everything extracted from one web page.

    This listing is an interface summary: every property body holds only
    its docstring, so the stub itself yields ``None`` for each property.
    Not every property has a value for every page, so callers should check
    for empty values before using them.
    """

    def __init__(self):
        """Initialize empty article container."""

    # Content properties
    @property
    def title(self) -> str:
        """Article title extracted from page."""

    @property
    def cleaned_text(self) -> str:
        """Main article text content, cleaned and formatted."""

    @property
    def meta_description(self) -> str:
        """Meta description from page metadata."""

    @property
    def meta_lang(self) -> str:
        """Language metadata from page."""

    @property
    def meta_favicon(self) -> str:
        """Favicon URL extracted from page."""

    @property
    def meta_keywords(self) -> str:
        """Meta keywords from page metadata."""

    @property
    def meta_encoding(self) -> list:
        """Character encoding information."""

    # URL and domain properties
    @property
    def canonical_link(self) -> str:
        """Canonical URL from page metadata."""

    @property
    def domain(self) -> str:
        """Domain name of the article source."""

    @property
    def final_url(self) -> str:
        """Final resolved URL after redirects."""

    @property
    def link_hash(self) -> str:
        """Hash of the article URL."""

    # Content structure properties
    @property
    def top_node(self):
        """Main content DOM node (parser-specific object)."""

    @property
    def top_node_raw_html(self) -> str:
        """Raw HTML of the main content area."""

    @property
    def raw_html(self) -> str:
        """Original HTML of the entire page."""

    @property
    def doc(self):
        """Parsed document object (parser-specific)."""

    @property
    def raw_doc(self):
        """Raw document object before processing."""

    # Media properties
    # The annotation is quoted because ``Image`` is not defined in this
    # listing; a bare name would raise NameError when the class body runs.
    @property
    def top_image(self) -> "Image":
        """Main article image object; ``None`` when no main image was found."""

    @property
    def movies(self) -> list:
        """List of Video objects for embedded videos."""

    # Structured data properties
    @property
    def tags(self) -> list:
        """List of article tags extracted from page."""

    @property
    def opengraph(self) -> dict:
        """OpenGraph metadata as dictionary."""

    @property
    def tweets(self) -> list:
        """List of embedded tweets."""

    @property
    def links(self) -> list:
        """List of links found in article content."""

    @property
    def authors(self) -> list:
        """List of article authors."""

    @property
    def schema(self):
        """Schema.org structured data from page."""

    # Date properties
    @property
    def publish_date(self) -> str:
        """Publication date as string."""

    @property
    def publish_datetime_utc(self):
        """Publication datetime in UTC (datetime object)."""

    # Additional data
    @property
    def additional_data(self) -> dict:
        """Additional extracted data as dictionary."""

    @property
    def infos(self) -> dict:
        """Extraction information and statistics."""

Property Usage Examples

Accessing article content:

from goose3 import Goose

# Create one extractor and run it against the page. The instance `g` is
# reused by the later examples in this document.
g = Goose()
article = g.extract(url='https://example.com/article')

# Basic content
print(f"Title: {article.title}")
print(f"Text length: {len(article.cleaned_text)} characters")
print(f"Description: {article.meta_description}")

# Metadata
print(f"Language: {article.meta_lang}")
print(f"Domain: {article.domain}")
print(f"Final URL: {article.final_url}")
print(f"Keywords: {article.meta_keywords}")

Working with images:

# Relies on the Goose instance `g` created in the first usage example.
article = g.extract(url='https://example.com/article')

# top_image is falsy when no main image was found, so test it before
# reading its attributes.
if article.top_image:
    print(f"Main image: {article.top_image.src}")
    print(f"Image dimensions: {article.top_image.width}x{article.top_image.height}")
else:
    print("No main image found")

Accessing embedded media:

# Relies on the Goose instance `g` created in the first usage example.
article = g.extract(url='https://example.com/article')

# Videos
# Skip the loop entirely when the page has no embedded videos.
if article.movies:
    for video in article.movies:
        print(f"Video source: {video.src}")
        print(f"Embed code: {video.embed_code}")
        print(f"Video type: {video.embed_type}")

# Tweets
# Only the number of embedded tweets is reported here.
if article.tweets:
    print(f"Found {len(article.tweets)} embedded tweets")

Working with structured data:

# Relies on the Goose instance `g` created in the first usage example.
article = g.extract(url='https://example.com/article')

# OpenGraph data
# .get() yields None for a key the page did not provide instead of raising.
if article.opengraph:
    og_title = article.opengraph.get('title')
    og_image = article.opengraph.get('image')
    print(f"OG Title: {og_title}")
    print(f"OG Image: {og_image}")

# Tags and categories
# NOTE(review): str.join assumes every tag is a string — confirm.
if article.tags:
    print(f"Tags: {', '.join(article.tags)}")

# Author information
# NOTE(review): str.join assumes every author entry is a string — confirm.
if article.authors:
    print(f"Authors: {', '.join(article.authors)}")

# Schema.org data
# Only the type of the schema object is printed, not its contents.
if article.schema:
    print(f"Schema data available: {type(article.schema)}")

Date and time information:

# Relies on the Goose instance `g` created in the first usage example.
article = g.extract(url='https://example.com/article')

# Each date field is printed only when it has a value.
if article.publish_date:
    print(f"Published: {article.publish_date}")

if article.publish_datetime_utc:
    print(f"Published (UTC): {article.publish_datetime_utc}")

Raw content access:

# Relies on the Goose instance `g` created in the first usage example.
article = g.extract(url='https://example.com/article')

# Raw HTML content
# NOTE(review): len() and slicing assume both values are strings — confirm
# they cannot be None for pages where no main content area is found.
print(f"Page HTML length: {len(article.raw_html)} characters")
# Only the first 200 characters of the main-content HTML are shown.
print(f"Main content HTML: {article.top_node_raw_html[:200]}...")

# Extraction statistics
if article.infos:
    print(f"Extraction info: {article.infos}")

# Additional extracted data
if article.additional_data:
    print(f"Additional data keys: {list(article.additional_data.keys())}")

Data Availability

Not all properties will have values for every article. Always check for empty values:

# Relies on the Goose instance `g` created in the first usage example.
article = g.extract(url='https://example.com/article')

# Safe access patterns
# `or` substitutes the fallback text whenever the property is empty/falsy.
title = article.title or "No title found"
description = article.meta_description or "No description available"

# Check for media
# top_image is compared against None; the list properties are reduced to
# booleans, so an empty list counts as "not available".
has_image = article.top_image is not None
has_videos = bool(article.movies)
has_authors = bool(article.authors)

print(f"Content available - Image: {has_image}, Videos: {has_videos}, Authors: {has_authors}")

Install with Tessl CLI