HTML Content / Article Extractor, web scraping for Python 3
—
Image and video extraction capabilities with support for metadata, dimensions, and embedded content from various platforms. Goose3 automatically identifies and extracts media elements from article content, including main images and embedded videos.
Classes representing extracted images with metadata and storage capabilities.
class Image:
    """Extracted image together with its metadata (API signature stub)."""

    # The method bodies below are intentionally docstring-only: this block
    # documents the public attributes, not their implementation.

    def __init__(self):
        """Initialize image container."""

    @property
    def src(self) -> str:
        """Image source URL."""

    @property
    def width(self) -> int:
        """Image width in pixels."""

    @property
    def height(self) -> int:
        """Image height in pixels."""

    @property
    def top_image_node(self):
        """DOM node of the image element."""

    @property
    def confidence_score(self) -> float:
        """Confidence score for image extraction (0.0-1.0)."""

    @property
    def extraction_type(self) -> str:
        """Type of extraction used (e.g., 'bestGuess', 'linkTag', 'openGraph')."""

    @property
    def bytes(self) -> int:
        """Size of the image in bytes."""
class ImageDetails:
    """Detailed image information and metadata container."""

    def __init__(self):
        """Initialize detailed image information."""

    # Contains extended image metadata and analysis results
class LocallyStoredImage:
    """Container for locally stored/cached images."""

    def __init__(self):
        """Initialize local image storage container."""

    # Manages local storage paths and cached image data


# Classes representing extracted video content from embedded players and media platforms.
class Video:
    """Extracted embedded video together with its metadata (API signature stub)."""

    # The method bodies below are intentionally docstring-only: this block
    # documents the public attributes, not their implementation.

    def __init__(self):
        """Initialize video container."""

    @property
    def src(self) -> str:
        """Video source URL."""

    @property
    def embed_code(self) -> str:
        """HTML embed code for the video."""

    @property
    def embed_type(self) -> str:
        """Type of embed (e.g., 'iframe', 'object')."""

    @property
    def width(self) -> int:
        """Video width in pixels."""

    @property
    def height(self) -> int:
        """Video height in pixels."""

    @property
    def provider(self) -> str:
        """Video provider/platform name."""


# Basic image access:
from goose3 import Goose

g = Goose()
article = g.extract(url='https://example.com/article')

# Access main article image
if article.top_image:
    image = article.top_image
    print(f"Image URL: {image.src}")
    print(f"Dimensions: {image.width}x{image.height}")

    # Check if valid dimensions were extracted; the truthiness test also
    # protects the division below from a zero height.
    if image.width and image.height:
        aspect_ratio = image.width / image.height
        print(f"Aspect ratio: {aspect_ratio:.2f}")
    else:
        print("Dimensions not available")
else:
    print("No main image found in article")

# Image fetching configuration:
from goose3 import Goose

# Enable image fetching and local storage
config = {
    'enable_image_fetching': True,
    'local_storage_path': '/tmp/goose_images',
}

g = Goose(config)
article = g.extract(url='https://example.com/article')

if article.top_image:
    print(f"Image fetched and stored: {article.top_image.src}")

# Basic video access:
from goose3 import Goose

g = Goose()
article = g.extract(url='https://example.com/article')

# Access embedded videos
if article.movies:
    print(f"Found {len(article.movies)} videos")
    # Number the videos from 1 for display.
    for number, video in enumerate(article.movies, start=1):
        print(f"\nVideo {number}:")
        print(f"  Source: {video.src}")
        print(f"  Type: {video.embed_type}")
        print(f"  Dimensions: {video.width}x{video.height}")
        # Only the first 100 characters of the embed markup are shown.
        print(f"  Embed code: {video.embed_code[:100]}...")
else:
    print("No videos found in article")

# Working with video embeds:
from goose3 import Goose

# Create the extractor here so the snippet runs on its own.
g = Goose()
article = g.extract(url='https://example.com/article')

for video in article.movies:
    if video.embed_type == 'iframe':
        # Handle iframe embeds
        print(f"Iframe video: {video.src}")
        print(f"Embed HTML: {video.embed_code}")
    elif video.embed_type == 'object':
        # Handle object embeds
        print(f"Object video: {video.src}")

    # Check video platform
    if 'youtube.com' in video.src or 'youtu.be' in video.src:
        print("YouTube video detected")
    elif 'vimeo.com' in video.src:
        print("Vimeo video detected")
    elif 'kewego.com' in video.src:
        print("Kewego video detected")

# Goose3 supports extraction from various video platforms:
from goose3 import Goose

# Create the extractor here so the snippet runs on its own.
g = Goose()

# YouTube video extraction
article = g.extract(url='https://example.com/article-with-youtube')
for video in article.movies:
    if 'youtube' in video.src.lower():
        print(f"YouTube video ID can be extracted from: {video.src}")

# Vimeo video extraction
article = g.extract(url='https://example.com/article-with-vimeo')
for video in article.movies:
    if 'vimeo' in video.src.lower():
        print(f"Vimeo video: {video.src}")

# Generic iframe embeds (iterates the most recently extracted article)
for video in article.movies:
    if video.embed_type == 'iframe':
        print(f"Generic iframe embed: {video.embed_code}")

# Checking media availability and quality:
from goose3 import Goose

g = Goose({'enable_image_fetching': True})
article = g.extract(url='https://example.com/article')

# Validate main image
if article.top_image:
    image = article.top_image

    # Check if image has valid URL
    if image.src and image.src.startswith(('http://', 'https://')):
        print(f"Valid image URL: {image.src}")

        # Check dimensions
        if image.width and image.height:
            if image.width >= 300 and image.height >= 200:
                print("Image meets minimum size requirements")
            else:
                print("Image is quite small")
        else:
            print("Image dimensions not available")
    else:
        print("Invalid or missing image URL")

# Validate videos: keep those that have both a source and embed code and
# whose source mentions one of the known platforms.
known_platforms = ('youtube', 'vimeo', 'dailymotion')
valid_videos = [
    video
    for video in article.movies
    if video.src
    and video.embed_code
    and any(platform in video.src.lower() for platform in known_platforms)
]

print(f"Found {len(valid_videos)} valid videos from known platforms")

from goose3 import Goose
# Bind the name up front so the error path can tell whether extraction got
# far enough to produce an article, without inspecting locals().
article = None

try:
    g = Goose({
        'enable_image_fetching': True,
        'local_storage_path': '/tmp/goose_images',
    })
    article = g.extract(url='https://example.com/article')

    # Safe media access
    image_available = bool(article.top_image and article.top_image.src)
    videos_available = bool(article.movies)

    print(f"Media extraction results - Images: {image_available}, Videos: {videos_available}")
except Exception as e:
    print(f"Media extraction error: {e}")
    # Continue with article text even if media extraction fails
    title = article.title if article is not None else 'N/A'
    print(f"Article title: {title}")

# Install with Tessl CLI
npx tessl i tessl/pypi-goose3