CtrlK
Blog · Docs · Log in · Get started
Tessl Logo

tessl/pypi-newspaper3k

Simplified Python article discovery & extraction.

Pending
Overview
Eval results
Files

docs/multithreading.md

Multi-threading & Batch Processing

Thread pool management for processing multiple articles and sources concurrently. The NewsPool class enables efficient large-scale content extraction and processing by managing thread allocation and coordination for batch operations on articles and news sources.

Capabilities

Thread Pool Management

Create and manage thread pools for concurrent article and source processing.

class NewsPool:
    """Thread-pool coordinator for batch downloading of articles and sources.

    Typical lifecycle: call set() to register the work list and thread
    sizing, then join() to run the pool and block until every task is done.
    """

    def __init__(self, config=None):
        """
        Initialize a news processing thread pool.
        
        Parameters:
        - config: Configuration object for threading settings (presumably a
          newspaper Configuration with a default used when omitted — confirm
          against the implementation)
        """

    def set(self, news_list: list, threads_per_source: int = 1, override_threads: "int | None" = None):
        """
        Set the list of articles or sources to process with threading configuration.
        
        Parameters:
        - news_list: List of Article objects, Source objects, or mixed
        - threads_per_source: Number of threads per source (when processing sources)
        - override_threads: Override automatic thread calculation with specific count
        
        Threading Logic (first matching rule wins):
        - If override_threads specified: use that count
        - If all items are Source objects: threads_per_source * number_of_sources
        - Otherwise: use 1 thread
        """

    def join(self):
        """
        Execute multi-threaded processing and wait for all threads to complete.
        Processes articles by downloading, sources by downloading articles.
        
        Raises:
        ConcurrencyException: If set() was not called before join()
        """

Pre-instantiated Pool

Convenient global NewsPool instance for immediate use.

news_pool: NewsPool  # Module-level, pre-instantiated NewsPool shared for convenience; reusable across batches

Threading Utilities

Supporting classes for thread pool implementation.

class ThreadPool:
    """Fixed-size pool of worker threads consuming tasks from a shared queue."""

    def __init__(self, num_threads: int, timeout_seconds: int):
        """Initialize thread pool with specified thread count and per-task timeout."""
    
    def add_task(self, func, *args, **kwargs):
        """Add a task function (with its arguments) to the thread pool queue."""
    
    def wait_completion(self):
        """Block until all queued tasks have been executed."""

class Worker:
    """Worker thread that repeatedly pulls tasks from a shared queue and executes them."""
    
    def __init__(self, tasks, timeout_seconds: int):
        """Initialize worker thread with the shared task queue and per-task timeout."""

class ConcurrencyException(Exception):
    """Exception raised for thread pool operation errors (e.g. join() before set())."""

Usage Examples

Basic Multi-threaded Article Processing

from newspaper import Article, news_pool

# Build the batch of articles to fetch concurrently.
article_urls = [
    'http://cnn.com/article1',
    'http://cnn.com/article2',
    'http://cnn.com/article3',
    'http://bbc.com/article1',
    'http://bbc.com/article2',
]
articles = [Article(url) for url in article_urls]

# Hand the batch to the shared pool and block until every download finishes.
news_pool.set(articles)
news_pool.join()

# Each article now carries its raw HTML (empty when the download failed).
for article in articles:
    if article.html:
        article.parse()
        print(f"Downloaded and parsed: {article.url}")

Multi-threaded Source Processing

from newspaper import build, news_pool

# Sources are created lazily (dry=True) so the pool performs the downloads.
site_urls = ('http://cnn.com', 'http://bbc.com', 'http://techcrunch.com')
sources = [build(url, dry=True) for url in site_urls]

# One thread per source keeps the per-site request rate modest.
news_pool.set(sources, threads_per_source=1)
news_pool.join()

# Inspect what each source pulled down.
for source in sources:
    print(f"Source {source.brand}: {len(source.articles)} articles")
    # Parse only the first three articles per source for this demo.
    for article in source.articles[:3]:
        if article.html:
            article.parse()
            print(f"  - {article.title}")

Custom Thread Configuration

from newspaper import NewsPool, Article, Configuration

# Tune threading behaviour through a dedicated Configuration object.
config = Configuration()
config.thread_timeout_seconds = 5
config.number_threads = 8

# A private pool using the custom settings instead of the global news_pool.
custom_pool = NewsPool(config=config)

# Twenty articles, with an explicit thread count overriding automatic sizing.
articles = [Article(f'http://example.com/article{i}') for i in range(20)]
custom_pool.set(articles, override_threads=10)
custom_pool.join()

print(f"Processed {len([a for a in articles if a.html])} articles")

Mixed Article and Source Processing

from newspaper import Article, build, news_pool

# Articles and dry-built sources can share a single batch.
news_items = [
    Article('http://standalone-article.com/news'),
    build('http://cnn.com', dry=True),
    Article('http://another-article.com/story'),
    build('http://bbc.com', dry=True)
]

# A mixed batch falls back to single-threaded processing.
news_pool.set(news_items)
news_pool.join()

# Dispatch on type when reading results back: only Sources have .articles.
for item in news_items:
    if hasattr(item, 'articles'):
        print(f"Source: {item.brand} - {len(item.articles)} articles")
    elif item.html:
        item.parse()
        print(f"Article: {item.title}")

Error Handling with Threading

# NOTE: download failures inside the pool are recorded per article (via
# download_state / download_exception_msg), not raised — so ArticleException
# is not needed here and its unused import has been removed.
from newspaper import Article, news_pool

# Candidate URLs; some may be unreachable.
urls = [
    'http://valid-site.com/article1',
    'http://invalid-url-that-will-fail.com/article',
    'http://valid-site.com/article2'
]

articles = [Article(url) for url in urls]

try:
    news_pool.set(articles)
    news_pool.join()
    
    # Partition results by per-article download state:
    # download_state == 2 corresponds to ArticleDownloadState.SUCCESS.
    successful = []
    failed = []
    
    for article in articles:
        if article.download_state == 2:  # SUCCESS
            article.parse()
            successful.append(article)
        else:
            failed.append(article)
    
    print(f"Successful downloads: {len(successful)}")
    print(f"Failed downloads: {len(failed)}")
    
    for article in failed:
        print(f"Failed: {article.url} - {article.download_exception_msg}")
        
except Exception as e:
    # Pool-level problems (e.g. ConcurrencyException from calling join()
    # without set()) surface here.
    print(f"Threading error: {e}")

Performance Optimization

from newspaper import build, NewsPool, Configuration

# Aggressive settings: many worker threads, short network and thread timeouts.
config = Configuration()
config.number_threads = 15
config.request_timeout = 5
config.thread_timeout_seconds = 2

# Dry-built sources all share the tuned configuration.
sources = [
    build(url, dry=True, config=config)
    for url in ('http://site1.com', 'http://site2.com', 'http://site3.com')
]

# A private pool carrying the same optimized settings.
pool = NewsPool(config=config)

# Three threads per source: 3 sources x 3 threads = 9 workers in total.
pool.set(sources, threads_per_source=3)
pool.join()

# Tally how much was fetched across all sources.
total_articles = sum(len(source.articles) for source in sources)
print(f"Downloaded articles from {len(sources)} sources: {total_articles} total")

Thread Pool Lifecycle Management

from newspaper import NewsPool, Article

pool = NewsPool()


def run_batch(batch):
    """Download one batch through the pool, then parse whatever succeeded."""
    pool.set(batch)
    pool.join()
    for article in batch:
        if article.html:
            article.parse()


# The same pool instance handles consecutive batches: set()/join() resets it.
batch1 = [Article(f'http://site1.com/article{i}') for i in range(5)]
run_batch(batch1)

batch2 = [Article(f'http://site2.com/article{i}') for i in range(5)]
run_batch(batch2)

print("Completed two separate batches")

Install with Tessl CLI

npx tessl i tessl/pypi-newspaper3k

docs

article-processing.md

configuration.md

index.md

multithreading.md

source-management.md

tile.json