Simplified Python article discovery & extraction.

Functionality for working with news websites and domains as collections of articles. The Source class provides comprehensive capabilities for discovering, organizing, and processing articles from news sources, including article discovery, category extraction, RSS feed processing, and batch operations.

## Create and initialize Source objects for news websites with automatic article discovery.
class Source:
    """A news website treated as a collection of discoverable articles."""

    def __init__(self, url: str, config=None, **kwargs):
        """
        Initialize a news source object.

        Parameters:
        - url: Homepage URL of the news source
        - config: Configuration object for processing options
        - **kwargs: Additional configuration parameters

        Raises:
        Exception: If URL is invalid or malformed
        """

    def build(self):
        """
        Complete source processing pipeline: download homepage, parse structure,
        discover categories and feeds, generate article objects.
        """
def build(url: str = '', dry: bool = False, config=None, **kwargs) -> "Source":
    """
    Factory function to create and optionally build a Source object.

    Parameters:
    - url: Source homepage URL
    - dry: If True, create source without building (no downloads)
    - config: Configuration object
    - **kwargs: Additional configuration parameters

    Returns:
    Source object, built if dry=False
    """

# Download and parse source homepage and category pages.
def download(self):
    """Download homepage HTML content."""

def parse(self):
    """Parse homepage HTML to extract source structure and metadata."""

def download_categories(self):
    """Download all category page HTML content using multi-threading."""

def download_feeds(self):
    """Download RSS/Atom feed content for all discovered feeds."""

# Discover and organize source content including categories, feeds, and articles.
def set_categories(self):
    """Discover and set category URLs from homepage."""

def set_feeds(self):
    """
    Discover and set RSS/Atom feed URLs.
    Checks common feed locations and category pages for feed links.
    """

def generate_articles(self):
    """
    Generate Article objects from discovered URLs.
    Creates articles from category pages and feed content.
    """

def set_description(self):
    """Extract and set source description from homepage metadata."""

# Process multiple articles from the source efficiently.
def download_articles(self, thread_count_per_source: int = 1):
    """
    Download all source articles using multi-threading.

    Parameters:
    - thread_count_per_source: Number of threads to use for downloading
    """

# Filter and validate articles based on quality criteria.
def purge_articles(self, reason: str, articles: list) -> list:
    """
    Filter articles based on validation criteria.

    Parameters:
    - reason: Filter type - 'url' for URL validation, 'body' for content validation
    - articles: List of articles to filter

    Returns:
    Filtered list of valid articles
    """

# Access source information and discovered content.
# Source Information
url: str # Homepage URL
domain: str # Domain name
scheme: str # URL scheme (http/https)
brand: str # Brand name extracted from domain
description: str # Source description from metadata
# Content Collections
categories: list # List of Category objects
feeds: list # List of Feed objects
articles: list # List of Article objects
# Content Data
html: str # Homepage HTML content
doc: object # lxml DOM object of homepage
logo_url: str # Source logo URL
favicon: str # Favicon URL
# Processing State
is_parsed: bool # Whether source has been parsed
is_downloaded: bool # Whether source has been downloaded

# Supporting classes for organizing source content.
class Category:
    """A news category/section page discovered on a source homepage."""

    def __init__(self, url: str):
        """
        Represents a news category/section.

        Parameters:
        - url: Category page URL
        """

    url: str     # Category URL
    html: str    # Category page HTML
    doc: object  # lxml DOM object
class Feed:
    """An RSS/Atom feed discovered for a source."""

    def __init__(self, url: str):
        """
        Represents an RSS/Atom feed.

        Parameters:
        - url: Feed URL
        """

    url: str  # Feed URL
    rss: str  # Feed content

from newspaper import build
# Build source and discover articles
cnn_source = build('http://cnn.com')
print(f"Source: {cnn_source.brand}")
print(f"Articles found: {len(cnn_source.articles)}")
print(f"Categories: {len(cnn_source.categories)}")
print(f"Feeds: {len(cnn_source.feeds)}")
# Access discovered articles
for article in cnn_source.articles[:5]:
    print(f"Article URL: {article.url}")

from newspaper import Source
# Create source without automatic building
source = Source('http://example.com')
# Manual step-by-step processing
source.download()
source.parse()
source.set_categories()
source.download_categories()
source.set_feeds()
source.download_feeds()
source.generate_articles()
print(f"Generated {len(source.articles)} articles")

from newspaper import build
# Build source and filter articles
source = build('http://news-site.com')
# Filter by URL validity
valid_url_articles = source.purge_articles('url', source.articles)
print(f"Valid URL articles: {len(valid_url_articles)}")
# Download and filter by content quality
for article in valid_url_articles[:10]:
    article.download()
    article.parse()
valid_body_articles = source.purge_articles('body', valid_url_articles[:10])
print(f"Valid content articles: {len(valid_body_articles)}")

from newspaper import build
# Build source
source = build('http://news-site.com')
# Download all articles with multiple threads
source.download_articles(thread_count_per_source=5)
# Process downloaded articles
for article in source.articles:
    if hasattr(article, 'html') and article.html:
        article.parse()
        if article.is_valid_body():
            article.nlp()
            print(f"Processed: {article.title}")

from newspaper import build
source = build('http://news-site.com')
# Examine categories
print("Categories:")
for category in source.categories:
    print(f" {category.url}")
# Examine feeds
print("Feeds:")
for feed in source.feeds:
    print(f" {feed.url}")
# Source metadata
print(f"Description: {source.description}")
print(f"Logo: {source.logo_url}")
print(f"Favicon: {source.favicon}")

from newspaper import build, Configuration
# Build a configuration object with project-specific settings.
cfg = Configuration()
cfg.number_threads = 20
cfg.request_timeout = 10
cfg.language = 'fr'
# Pass the configuration through when building the source.
source = build('http://french-news-site.com', config=cfg)
print(f"Articles discovered: {len(source.articles)}")

## Install with Tessl CLI

npx tessl i tessl/pypi-newspaper3k