Simplified Python article discovery & extraction.

Functionality for working with news websites and domains as collections of articles. The Source class provides comprehensive capabilities for discovering, organizing, and processing articles from news sources, including article discovery, category extraction, RSS feed processing, and batch operations.

## Create and initialize Source objects for news websites with automatic article discovery.
class Source:
    """A news website treated as a collection of discoverable articles."""

    def __init__(self, url: str, config=None, **kwargs):
        """
        Initialize a news source object.

        Parameters:
        - url: Homepage URL of the news source
        - config: Configuration object for processing options
        - **kwargs: Additional configuration parameters

        Raises:
        Exception: If URL is invalid or malformed
        """

    def build(self):
        """
        Complete source processing pipeline: download homepage, parse structure,
        discover categories and feeds, generate article objects.
        """
def build(url: str = '', dry: bool = False, config=None, **kwargs) -> "Source":
    """
    Factory function to create and optionally build a Source object.

    Parameters:
    - url: Source homepage URL
    - dry: If True, create source without building (no downloads)
    - config: Configuration object
    - **kwargs: Additional configuration parameters

    Returns:
    Source object, built if dry=False
    """

# Download and parse source homepage and category pages.
def download(self):
    """Download homepage HTML content."""

def parse(self):
    """Parse homepage HTML to extract source structure and metadata."""

def download_categories(self):
    """Download all category page HTML content using multi-threading."""

def download_feeds(self):
    """Download RSS/Atom feed content for all discovered feeds."""

# Discover and organize source content including categories, feeds, and articles.
def set_categories(self):
    """Discover and set category URLs from homepage."""

def set_feeds(self):
    """
    Discover and set RSS/Atom feed URLs.
    Checks common feed locations and category pages for feed links.
    """

def generate_articles(self):
    """
    Generate Article objects from discovered URLs.
    Creates articles from category pages and feed content.
    """

def set_description(self):
    """Extract and set source description from homepage metadata."""

# Process multiple articles from the source efficiently.
def download_articles(self, thread_count_per_source: int = 1):
    """
    Download all source articles using multi-threading.

    Parameters:
    - thread_count_per_source: Number of threads to use for downloading
    """

# Filter and validate articles based on quality criteria.
def purge_articles(self, reason: str, articles: list) -> list:
    """
    Filter articles based on validation criteria.

    Parameters:
    - reason: Filter type - 'url' for URL validation, 'body' for content validation
    - articles: List of articles to filter

    Returns:
    Filtered list of valid articles
    """

# Access source information and discovered content.
# Source Information
url: str # Homepage URL
domain: str # Domain name
scheme: str # URL scheme (http/https)
brand: str # Brand name extracted from domain
description: str # Source description from metadata
# Content Collections
categories: list # List of Category objects
feeds: list # List of Feed objects
articles: list # List of Article objects
# Content Data
html: str # Homepage HTML content
doc: object # lxml DOM object of homepage
logo_url: str # Source logo URL
favicon: str # Favicon URL
# Processing State
is_parsed: bool # Whether source has been parsed
is_downloaded: bool # Whether source has been downloaded

# Supporting classes for organizing source content.
class Category:
    """A news category/section page discovered on a source homepage."""

    def __init__(self, url: str):
        """
        Represents a news category/section.

        Parameters:
        - url: Category page URL
        """

    url: str     # Category URL
    html: str    # Category page HTML
    doc: object  # lxml DOM object
class Feed:
    """An RSS/Atom feed discovered for a source."""

    def __init__(self, url: str):
        """
        Represents an RSS/Atom feed.

        Parameters:
        - url: Feed URL
        """

    url: str  # Feed URL
    rss: str  # Feed content

from newspaper import build
# Build source and discover articles
cnn_source = build('http://cnn.com')
print(f"Source: {cnn_source.brand}")
print(f"Articles found: {len(cnn_source.articles)}")
print(f"Categories: {len(cnn_source.categories)}")
print(f"Feeds: {len(cnn_source.feeds)}")
# Access discovered articles
for article in cnn_source.articles[:5]:
    print(f"Article URL: {article.url}")

from newspaper import Source
# Create source without automatic building
source = Source('http://example.com')
# Manual step-by-step processing
source.download()
source.parse()
source.set_categories()
source.download_categories()
source.set_feeds()
source.download_feeds()
source.generate_articles()
print(f"Generated {len(source.articles)} articles")

from newspaper import build
# Build source and filter articles
source = build('http://news-site.com')
# Filter by URL validity
valid_url_articles = source.purge_articles('url', source.articles)
print(f"Valid URL articles: {len(valid_url_articles)}")
# Download and filter by content quality
for article in valid_url_articles[:10]:
    article.download()
    article.parse()
valid_body_articles = source.purge_articles('body', valid_url_articles[:10])
print(f"Valid content articles: {len(valid_body_articles)}")

from newspaper import build
# Build source
source = build('http://news-site.com')
# Download all articles with multiple threads
source.download_articles(thread_count_per_source=5)
# Process downloaded articles
for article in source.articles:
    if hasattr(article, 'html') and article.html:
        article.parse()
        if article.is_valid_body():
            article.nlp()
            print(f"Processed: {article.title}")

from newspaper import build
source = build('http://news-site.com')
# Examine categories
print("Categories:")
for category in source.categories:
    print(f" {category.url}")
# Examine feeds
print("Feeds:")
for feed in source.feeds:
    print(f" {feed.url}")
# Source metadata
print(f"Description: {source.description}")
print(f"Logo: {source.logo_url}")
print(f"Favicon: {source.favicon}")

from newspaper import build, Configuration
# Build a configuration object with project-specific settings.
cfg = Configuration()
cfg.number_threads = 20
cfg.request_timeout = 10
cfg.language = 'fr'
# Pass the configuration through when building the source.
source = build('http://french-news-site.com', config=cfg)
print(f"Articles discovered: {len(source.articles)}")

## Install with Tessl CLI

npx tessl i tessl/pypi-newspaper3k