CtrlK
Blog · Docs · Log in · Get started
Tessl Logo

tessl/pypi-newspaper3k

Simplified python article discovery & extraction.

Pending
Overview
Eval results
Files

configuration.mddocs/

Configuration & Utilities

Configuration management, language support, and utility functions for customizing extraction behavior and accessing supplementary features. The Configuration class provides extensive customization options for article processing, while utility functions offer additional capabilities like fulltext extraction and trending topic discovery.

Capabilities

Configuration Management

Comprehensive configuration options for customizing newspaper3k behavior.

class Configuration:
    def __init__(self):
        """Initialize configuration with default settings."""

    def get_language(self) -> str:
        """Get the current language setting."""

    def set_language(self, language: str):
        """
        Set the target language for processing.
        
        Parameters:
        - language: Two-character language code (e.g., 'en', 'es', 'fr')
        
        Raises:
        Exception: If language code is invalid or not 2 characters
        """

    @staticmethod
    def get_stopwords_class(language: str):
        """
        Get the appropriate stopwords class for a language.
        
        Parameters:
        - language: Two-character language code
        
        Returns:
        Stopwords class for the specified language
        """

    @staticmethod  
    def get_parser():
        """Get the HTML parser class (lxml-based Parser)."""

Configuration Properties

Extensive configuration options for fine-tuning extraction behavior.

# Content Validation Thresholds
MIN_WORD_COUNT: int = 300        # Minimum words for valid article
MIN_SENT_COUNT: int = 7          # Minimum sentences for valid article
MAX_TITLE: int = 200             # Maximum title length in characters
MAX_TEXT: int = 100000           # Maximum article text length
MAX_KEYWORDS: int = 35           # Maximum keywords to extract
MAX_AUTHORS: int = 10            # Maximum authors to extract
MAX_SUMMARY: int = 5000          # Maximum summary length
MAX_SUMMARY_SENT: int = 5        # Maximum summary sentences

# Caching and Storage
MAX_FILE_MEMO: int = 20000       # Max URLs cached per news source
memoize_articles: bool = True    # Cache articles between runs

# Media Processing
fetch_images: bool = True        # Download and process images
image_dimension_ration: float = 16/9.0  # Preferred image aspect ratio (note: the "ration" spelling is the actual attribute name in the library — verify before "correcting" it)

# Network and Processing
follow_meta_refresh: bool = False    # Follow meta refresh redirects
use_meta_language: bool = True       # Use language from HTML meta tags
keep_article_html: bool = False      # Retain cleaned article HTML
http_success_only: bool = True       # Fail on HTTP error responses
request_timeout: int = 7             # HTTP request timeout in seconds
number_threads: int = 10             # Default thread count
thread_timeout_seconds: int = 1      # Thread timeout in seconds

# Language and Localization
language: str = 'en'                 # Target language code
stopwords_class: type = StopWords    # Stopwords class for language

# HTTP Configuration
browser_user_agent: str             # HTTP User-Agent header
headers: dict = {}                  # Additional HTTP headers
proxies: dict = {}                  # Proxy configuration

# Debugging
verbose: bool = False               # Enable debug logging

Utility Functions

Standalone functions for specialized processing and information retrieval.

def fulltext(html: str, language: str = 'en') -> str:
    """
    Extract clean text content from raw HTML.
    
    Parameters:
    - html: Raw HTML string
    - language: Language code for processing (default: 'en')
    
    Returns:
    Extracted plain text content
    """

def hot() -> list | None:
    """
    Get trending topics from Google Trends.
    
    Returns:
    List of trending search terms, or None if failed
    """

def languages():
    """Print list of supported languages to console."""

def popular_urls() -> list:
    """
    Get list of popular news source URLs.
    
    Returns:
    List of pre-extracted popular news website URLs
    """

Language Support Classes

Specialized stopwords classes for different languages.

class StopWords:
    """Default English stopwords class."""

class StopWordsChinese(StopWords):
    """Chinese language stopwords."""

class StopWordsArabic(StopWords):
    """Arabic and Persian language stopwords."""

class StopWordsKorean(StopWords):
    """Korean language stopwords."""

class StopWordsHindi(StopWords):
    """Hindi language stopwords."""

class StopWordsJapanese(StopWords):
    """Japanese language stopwords."""

Helper Functions

Additional utility functions for configuration and language support.

def get_available_languages() -> list:
    """
    Get list of supported language codes.
    
    Returns:
    List of two-character language codes
    """

def print_available_languages():
    """Print supported languages to console."""

def extend_config(config: Configuration, config_items: dict) -> Configuration:
    """
    Merge configuration object with additional settings.
    
    Parameters:
    - config: Base Configuration object
    - config_items: Dictionary of additional configuration values
    
    Returns:
    Updated Configuration object
    """

Usage Examples

Basic Configuration

from newspaper import Configuration, Article

# Create custom configuration
config = Configuration()
config.language = 'es'
config.MIN_WORD_COUNT = 500
config.fetch_images = False
config.request_timeout = 10

# Use with article
article = Article('http://spanish-news-site.com/article', config=config)
article.build()

Multi-language Processing

from newspaper import Configuration, Article

# Process articles in different languages
languages = ['en', 'es', 'fr', 'de']
articles = {}

for lang in languages:
    config = Configuration()
    config.set_language(lang)
    
    # Language-specific URL (example)
    url = f'http://news-site.com/{lang}/article'
    article = Article(url, config=config)
    article.build()
    
    articles[lang] = article
    print(f"{lang}: {article.title}")

Performance Optimization

from newspaper import Configuration, build

# High-performance configuration
config = Configuration()
config.number_threads = 20
config.thread_timeout_seconds = 2
config.request_timeout = 5
config.memoize_articles = True
config.fetch_images = False  # Skip images for speed

# Build source with optimized settings
source = build('http://news-site.com', config=config)
print(f"Fast processing: {len(source.articles)} articles discovered")

Content Quality Configuration

from newspaper import Configuration, Article

# Strict content validation
config = Configuration()
config.MIN_WORD_COUNT = 800      # Require longer articles
config.MIN_SENT_COUNT = 15       # Require more sentences
config.MAX_KEYWORDS = 50         # Extract more keywords
config.MAX_SUMMARY_SENT = 10     # Longer summaries

# Use strict configuration
article = Article('http://long-form-article.com', config=config)
article.build()

if article.is_valid_body():
    print(f"High-quality article: {len(article.text.split())} words")
    print(f"Keywords: {len(article.keywords)}")
    print(f"Summary sentences: {len(article.summary.split('.'))}")

Network Configuration

from newspaper import Configuration, Article

# Custom network settings
config = Configuration()
config.browser_user_agent = 'MyBot/1.0'
config.headers = {
    'Accept-Language': 'en-US,en;q=0.9',
    'Accept-Encoding': 'gzip, deflate'
}
config.proxies = {
    'http': 'http://proxy.example.com:8080',
    'https': 'https://proxy.example.com:8080'
}
config.request_timeout = 15

# Use custom network settings
article = Article('http://example.com/article', config=config)
article.download()

Language Detection and Processing

from newspaper import Article, Configuration, get_available_languages

# Show supported languages
print("Supported languages:")
print(get_available_languages())

# Auto-detect and process
def process_with_language_detection(url):
    # First pass - detect language
    article = Article(url)
    article.download()
    article.parse()  # This extracts meta_lang
    
    detected_lang = article.meta_lang
    if detected_lang in get_available_languages():
        # Second pass with detected language
        config = Configuration()
        config.set_language(detected_lang)
        
        article_lang = Article(url, config=config)
        article_lang.build()
        return article_lang
    
    return article

# Process with language detection
result = process_with_language_detection('http://multilingual-site.com/article')
print(f"Language: {result.meta_lang}")
print(f"Title: {result.title}")

Utility Functions Usage

from newspaper import fulltext, hot, popular_urls

# Extract text from raw HTML
html_content = """
<html><body>
<h1>News Title</h1>
<p>This is the main article content with <a href="#">links</a> and formatting.</p>
</body></html>
"""

clean_text = fulltext(html_content, language='en')
print(f"Extracted text: {clean_text}")

# Get trending topics
try:
    trending = hot()
    if trending:
        print("Trending topics:", trending[:5])
except Exception as e:
    print(f"Could not fetch trending topics: {e}")

# Get popular news sources
popular_sources = popular_urls()
print(f"Popular sources: {len(popular_sources)} URLs")
for source in popular_sources[:5]:
    print(f"  {source}")

Debug Configuration

from newspaper import Configuration, Article
import logging

# Enable debug logging
config = Configuration()
config.verbose = True

# Set up logging to see debug output
logging.basicConfig(level=logging.DEBUG)

# Process with verbose output
article = Article('http://example.com/article', config=config)
article.build()  # Will show detailed debug information

Install with Tessl CLI

npx tessl i tessl/pypi-newspaper3k

docs

article-processing.md

configuration.md

index.md

multithreading.md

source-management.md

tile.json