tessl/pypi-goose3

Html Content / Article Extractor, web scraping for Python3

—

Pending

Overview

Eval results

Files

Media Extraction

Name: tessl/pypi-goose3
Author: tessl

Image and video extraction capabilities with support for metadata, dimensions, and embedded content from various platforms. Goose3 automatically identifies and extracts media elements from article content, including main images and embedded videos.

Capabilities

Image Classes

Classes representing extracted images with metadata and storage capabilities.

class Image:
    """Extracted image together with its metadata."""

    def __init__(self):
        """Create an empty image container."""

    # --- Where the image came from -------------------------------------

    @property
    def top_image_node(self):
        """DOM node of the image element."""

    @property
    def src(self) -> str:
        """URL of the image source."""

    # --- Physical characteristics --------------------------------------

    @property
    def width(self) -> int:
        """Width of the image, in pixels."""

    @property
    def height(self) -> int:
        """Height of the image, in pixels."""

    @property
    def bytes(self) -> int:
        """Size of the image, in bytes."""

    # --- How the extraction went ---------------------------------------

    @property
    def extraction_type(self) -> str:
        """Type of extraction used (e.g., 'bestGuess', 'linkTag', 'openGraph')."""

    @property
    def confidence_score(self) -> float:
        """Confidence score for the image extraction (0.0-1.0)."""

class ImageDetails:
    """Container for detailed image information and metadata.

    Contains extended image metadata and analysis results.
    """

    def __init__(self):
        """Create an empty detailed-image-information container."""

class LocallyStoredImage:
    """Container for an image that is stored/cached locally.

    Manages local storage paths and cached image data.
    """

    def __init__(self):
        """Create an empty local image storage container."""

Video Classes

Classes representing extracted video content from embedded players and media platforms.

class Video:
    """Extracted video together with its embed details."""

    def __init__(self):
        """Create an empty video container."""

    # --- Embed information ---------------------------------------------

    @property
    def embed_type(self) -> str:
        """Type of embed (e.g., 'iframe', 'object')."""

    @property
    def embed_code(self) -> str:
        """HTML embed code for the video."""

    # --- Source ----------------------------------------------------------

    @property
    def src(self) -> str:
        """URL of the video source."""

    @property
    def provider(self) -> str:
        """Name of the video provider/platform."""

    # --- Dimensions ------------------------------------------------------

    @property
    def width(self) -> int:
        """Width of the video, in pixels."""

    @property
    def height(self) -> int:
        """Height of the video, in pixels."""

Image Extraction Usage

Basic image access:

from goose3 import Goose

g = Goose()
article = g.extract(url='https://example.com/article')

# Read the main article image once; it is falsy when none was found
image = article.top_image
if not image:
    print("No main image found in article")
else:
    print(f"Image URL: {image.src}")
    print(f"Dimensions: {image.width}x{image.height}")

    # The ratio is only computed when both dimensions are present and
    # non-zero, which also rules out a division by zero
    if not (image.width and image.height):
        print("Dimensions not available")
    else:
        aspect_ratio = image.width / image.height
        print(f"Aspect ratio: {aspect_ratio:.2f}")

Image fetching configuration:

from goose3 import Goose

# Turn on image fetching and set the local storage path
config = dict(
    enable_image_fetching=True,
    local_storage_path='/tmp/goose_images',
)

g = Goose(config)
article = g.extract(url='https://example.com/article')

# Only report when a main image was actually found
top_image = article.top_image
if top_image:
    print(f"Image fetched and stored: {top_image.src}")

Video Extraction Usage

Basic video access:

from goose3 import Goose

g = Goose()
article = g.extract(url='https://example.com/article')

# Embedded videos found in the article
videos = article.movies
if not videos:
    print("No videos found in article")
else:
    print(f"Found {len(videos)} videos")

    # Number the videos from 1 for display
    for number, video in enumerate(videos, start=1):
        print(f"\nVideo {number}:")
        print(f"  Source: {video.src}")
        print(f"  Type: {video.embed_type}")
        print(f"  Dimensions: {video.width}x{video.height}")
        # Only the first 100 characters of the embed HTML are shown
        print(f"  Embed code: {video.embed_code[:100]}...")

Working with video embeds:

# Assumes `g` is an existing Goose instance (e.g. g = Goose())
article = g.extract(url='https://example.com/article')

for video in article.movies:
    if video.embed_type == 'iframe':
        # Handle iframe embeds
        print(f"Iframe video: {video.src}")
        print(f"Embed HTML: {video.embed_code}")
    elif video.embed_type == 'object':
        # Handle object embeds
        print(f"Object video: {video.src}")

    # Check video platform.
    # A missing src is normalised to '' so the substring checks below
    # simply do not match instead of raising TypeError on None.
    src = video.src or ''
    if 'youtube.com' in src or 'youtu.be' in src:
        print("YouTube video detected")
    elif 'vimeo.com' in src:
        print("Vimeo video detected")
    elif 'kewego.com' in src:
        print("Kewego video detected")

Platform-Specific Video Support

Goose3 supports extraction from various video platforms:

# Assumes `g` is an existing Goose instance (e.g. g = Goose())

# YouTube video extraction
article = g.extract(url='https://example.com/article-with-youtube')
for video in article.movies:
    # `or ''` keeps a missing src from raising AttributeError on .lower()
    if 'youtube' in (video.src or '').lower():
        print(f"YouTube video ID can be extracted from: {video.src}")

# Vimeo video extraction
article = g.extract(url='https://example.com/article-with-vimeo')
for video in article.movies:
    # Same guard as above: a video without a src is simply skipped
    if 'vimeo' in (video.src or '').lower():
        print(f"Vimeo video: {video.src}")

# Generic iframe embeds (iterates the most recently extracted article)
for video in article.movies:
    if video.embed_type == 'iframe':
        print(f"Generic iframe embed: {video.embed_code}")

Media Validation

Checking media availability and quality:

from goose3 import Goose

g = Goose({'enable_image_fetching': True})
article = g.extract(url='https://example.com/article')

# Validate main image (skipped entirely when the article has none)
image = article.top_image
if image:
    # A usable URL must be present and use an http(s) scheme
    if not (image.src and image.src.startswith(('http://', 'https://'))):
        print("Invalid or missing image URL")
    else:
        print(f"Valid image URL: {image.src}")

        # Dimensions are judged only when both are present and non-zero
        if not (image.width and image.height):
            print("Image dimensions not available")
        elif image.width >= 300 and image.height >= 200:
            print("Image meets minimum size requirements")
        else:
            print("Image is quite small")

# Validate videos: keep those that have both a source and embed code and
# whose source mentions one of the known platforms
known_platforms = ['youtube', 'vimeo', 'dailymotion']
valid_videos = [
    video
    for video in article.movies
    if video.src
    and video.embed_code
    and any(name in video.src.lower() for name in known_platforms)
]

print(f"Found {len(valid_videos)} valid videos from known platforms")

Error Handling for Media

from goose3 import Goose

# Bind the name before the try block so the handler can tell whether
# extraction produced an article. Unlike checking locals(), this cannot be
# fooled by a stale `article` left over from an earlier extraction.
article = None

try:
    g = Goose({
        'enable_image_fetching': True,
        'local_storage_path': '/tmp/goose_images'
    })
    article = g.extract(url='https://example.com/article')

    # Safe media access
    image_available = bool(article.top_image and article.top_image.src)
    videos_available = bool(article.movies)

    print(f"Media extraction results - Images: {image_available}, Videos: {videos_available}")

except Exception as e:
    # Deliberately broad: this is the top-level boundary of the example and
    # the failure is reported rather than swallowed.
    print(f"Media extraction error: {e}")
    # Continue with article text even if media extraction fails
    print(f"Article title: {article.title if article is not None else 'N/A'}")

Install with Tessl CLI