tessl/pypi-newspaper3k

Simplified python article discovery & extraction.

—

Pending

Overview

Eval results

Files

Article Processing

Name: tessl/pypi-newspaper3k
Author: tessl

Core functionality for downloading, parsing, and extracting content from individual news articles. The Article class provides comprehensive capabilities for processing web articles including text extraction, metadata parsing, image discovery, video extraction, and natural language processing.

Capabilities

Article Creation and Building

Create and initialize Article objects, with full processing pipeline support.

class Article:
    """
    A single news article identified by its URL.

    Provides the download -> parse -> NLP processing pipeline for one
    article, either step by step or in one call via build().
    """

    def __init__(self, url: str, title: str = '', source_url: str = '', config=None, **kwargs) -> None:
        """
        Initialize an article object.

        Parameters:
        - url: Article URL to process (required)
        - title: Optional article title; defaults to ''
        - source_url: Optional URL of the source website; defaults to ''
        - config: Configuration object for processing options; defaults to None
        - **kwargs: Additional configuration parameters
        """

    def build(self) -> None:
        """
        Run the complete article processing pipeline: download, parse, and NLP.

        Equivalent to calling download(), parse(), and nlp() in sequence.
        """

def build_article(url: str = '', config=None, **kwargs) -> Article:
    """
    Factory function to create an Article object.

    The usage example on this page calls build() on the result, so the
    returned article has not been downloaded or parsed yet.

    Parameters:
    - url: Article URL; defaults to ''
    - config: Configuration object; defaults to None
    - **kwargs: Additional configuration parameters

    Returns:
    Article object ready for processing
    """

Content Download

Download HTML content from article URLs with error handling and redirect support.

def download(self, input_html: str = None, title: str = None, recursion_counter: int = 0) -> None:
    """
    Download article HTML content.

    Parameters:
    - input_html: Optional pre-downloaded HTML content
    - title: Optional title override
    - recursion_counter: Internal parameter for handling redirects;
      callers normally leave it at its default of 0

    Failure reporting:
    The error-handling example on this page inspects download_state
    (ArticleDownloadState.FAILED_RESPONSE) and download_exception_msg after
    calling download(), rather than relying on an exception.

    NOTE(review): upstream newspaper3k appears to record network/HTTP
    failures in those attributes instead of raising ArticleException from
    download(); the exception then surfaces from parse()/nlp(). Confirm
    against the library source before relying on either behavior.
    """

Content Parsing

Parse downloaded HTML to extract article components including text, metadata, images, and structure.

def parse(self) -> None:
    """
    Parse downloaded HTML content to extract article data.

    Extracts title, authors, text content, images, metadata, and
    publication date. Must be called after download().

    Raises:
    ArticleException: If article has not been downloaded first
    """

Natural Language Processing

Extract keywords and generate summaries from article text content.

def nlp(self) -> None:
    """
    Perform natural language processing on parsed article text.

    Extracts keywords from title and body text and generates an article
    summary; the usage examples read the results from the `keywords` and
    `summary` attributes. Must be called after download() and parse().

    Raises:
    ArticleException: If article has not been downloaded and parsed first
    """

Content Validation

Validate article URLs and content quality according to configurable criteria.

def is_valid_url(self) -> bool:
    """
    Return whether the article URL is valid for processing.

    Returns:
    bool: True if URL is valid, False otherwise
    """

def is_valid_body(self) -> bool:
    """
    Return whether the article content meets quality requirements.

    Validates word count, sentence count, title quality, and HTML content.
    NOTE(review): the thresholds presumably come from the article's
    configuration (e.g. MIN_WORD_COUNT, as set in the custom-configuration
    example) - confirm against the library source.

    Returns:
    bool: True if article body is valid, False otherwise

    Raises:
    ArticleException: If article has not been parsed first
    """

def is_media_news(self) -> bool:
    """
    Return whether the article is media-heavy (gallery, video, slideshow, etc.).

    Returns:
    bool: True if article is media-focused, False otherwise
    """

Article Properties

Access extracted article data and metadata.

# Content Properties
url: str                    # Article URL
title: str                  # Article title
text: str                   # Main article body text
html: str                   # Raw HTML content
article_html: str           # Cleaned article HTML content
summary: str                # Auto-generated summary (produced by nlp())

# Author and Date Information
authors: list               # List of article authors
publish_date: datetime           # Publication date as a datetime.datetime (per the upstream README);
                            # NOTE(review): may be None/empty when no date is found - confirm

# Media Content
top_img: str               # Primary article image URL (alias: top_image)
imgs: list                 # List of all image URLs (alias: images)
movies: list               # List of video URLs

# Metadata from HTML
meta_img: str              # Image URL from metadata
meta_keywords: list        # Keywords from HTML meta tags
meta_description: str      # Description from HTML meta
meta_lang: str             # Language from HTML meta
meta_favicon: str          # Favicon URL from meta
meta_data: dict            # Dictionary of all metadata
canonical_link: str        # Canonical URL from meta
tags: set                  # Set of article tags

# Processing State
is_parsed: bool            # Whether article has been parsed
download_state: int        # Download status (ArticleDownloadState values)
download_exception_msg: str # Error message if download failed

# Source Information
source_url: str            # URL of the parent news source

# Advanced Properties
top_node: object           # Main DOM node of article content
clean_top_node: object     # Clean copy of main DOM node
doc: object                # Full lxml DOM object
clean_doc: object          # Clean copy of DOM object
additional_data: dict      # Custom user data storage

# Extracted Content
keywords: list             # Keywords from NLP processing (produced by nlp())

Download State Constants

class ArticleDownloadState:
    """Integer status codes reported in an article's download_state."""

    # Download not yet attempted
    NOT_STARTED: int = 0

    # Download failed due to network/HTTP error
    FAILED_RESPONSE: int = 1

    # Download completed successfully
    SUCCESS: int = 2

Usage Examples

Basic Article Processing

from newspaper import Article

# Fetch the page and extract its content (this example skips the NLP step)
article_url = 'https://example.com/news/article'
article = Article(article_url)
article.download()
article.parse()

# Report the fields that parse() extracted
print(f"Title: {article.title}")
print(f"Authors: {article.authors}")
print(f"Text length: {len(article.text)} characters")
print(f"Publication date: {article.publish_date}")
print(f"Top image: {article.top_img}")

Full Processing with NLP

from newspaper import build_article

# The factory only creates the Article; build() then runs
# download + parse + nlp in a single call
article_url = 'https://example.com/news/article'
article = build_article(article_url)
article.build()

# keywords and summary are the outputs of the NLP step
print(f"Keywords: {article.keywords}")
print(f"Summary: {article.summary}")

Error Handling

from newspaper import Article, ArticleException
# ArticleDownloadState is not re-exported by the top-level package,
# so it has to be imported from the article module.
from newspaper.article import ArticleDownloadState

try:
    article = Article('https://example.com/news/article')
    article.download()

    # A failed download is reported through download_state, so check it
    # before attempting to parse.
    if article.download_state == ArticleDownloadState.FAILED_RESPONSE:
        print(f"Download failed: {article.download_exception_msg}")
    else:
        article.parse()

        # Only run NLP on articles that pass the body-quality checks
        if article.is_valid_body():
            article.nlp()
            print(f"Article processed successfully: {article.title}")
        else:
            print("Article content does not meet quality requirements")

except ArticleException as e:
    # Raised when a pipeline step runs before its prerequisite step
    print(f"Article processing error: {e}")

Custom Configuration

from newspaper import Article
# The top-level package exports this class only under the alias `Config`;
# the full name `Configuration` lives in newspaper.configuration.
from newspaper.configuration import Configuration

# Create custom configuration
config = Configuration()
config.language = 'es'
config.MIN_WORD_COUNT = 500
config.fetch_images = False

# Process article with custom settings
article = Article('https://example.com/news/article', config=config)
article.build()

Install with Tessl CLI