Simplified python article discovery & extraction.
—
Core functionality for downloading, parsing, and extracting content from individual news articles. The Article class provides comprehensive capabilities for processing web articles including text extraction, metadata parsing, image discovery, video extraction, and natural language processing.
Create and initialize Article objects, with full processing pipeline support.
class Article:
def __init__(self, url: str, title: str = '', source_url: str = '', config=None, **kwargs):
"""
Initialize an article object.
Parameters:
- url: Article URL to process
- title: Optional article title
- source_url: Optional source website URL
- config: Configuration object for processing options
- **kwargs: Additional configuration parameters
"""
def build(self):
"""
Complete article processing pipeline: download, parse, and NLP.
Equivalent to calling download(), parse(), and nlp() in sequence.
"""
def build_article(url: str = '', config=None, **kwargs) -> Article:
"""
Factory function to create an Article object.
Parameters:
- url: Article URL
- config: Configuration object
- **kwargs: Additional configuration parameters
Returns:
Article object ready for processing
"""

Download HTML content from article URLs with error handling and redirect support.
def download(self, input_html: str = None, title: str = None, recursion_counter: int = 0):
"""
Download article HTML content.
Parameters:
- input_html: Optional pre-downloaded HTML content
- title: Optional title override
- recursion_counter: Internal parameter for handling redirects
Raises:
ArticleException: If download fails due to network or HTTP errors
"""

Parse downloaded HTML to extract article components including text, metadata, images, and structure.
def parse(self):
"""
Parse downloaded HTML content to extract article data.
Extracts title, authors, text content, images, metadata, and publication date.
Raises:
ArticleException: If article has not been downloaded first
"""

Extract keywords and generate summaries from article text content.
def nlp(self):
"""
Perform natural language processing on parsed article text.
Extracts keywords from title and body text, generates article summary.
Raises:
ArticleException: If article has not been downloaded and parsed first
"""

Validate article URLs and content quality according to configurable criteria.
def is_valid_url(self) -> bool:
"""
Check if the article URL is valid for processing.
Returns:
bool: True if URL is valid, False otherwise
"""
def is_valid_body(self) -> bool:
"""
Check if article content meets quality requirements.
Validates word count, sentence count, title quality, and HTML content.
Returns:
bool: True if article body is valid, False otherwise
Raises:
ArticleException: If article has not been parsed first
"""
def is_media_news(self) -> bool:
"""
Check if article is media-heavy (gallery, video, slideshow, etc.).
Returns:
bool: True if article is media-focused, False otherwise
"""

Access extracted article data and metadata.
# Content Properties
url: str # Article URL
title: str # Article title
text: str # Main article body text
html: str # Raw HTML content
article_html: str # Cleaned article HTML content
summary: str # Auto-generated summary
# Author and Date Information
authors: list # List of article authors
publish_date: str # Publication date
# Media Content
top_img: str # Primary article image URL (alias: top_image)
imgs: list # List of all image URLs (alias: images)
movies: list # List of video URLs
# Metadata from HTML
meta_img: str # Image URL from metadata
meta_keywords: list # Keywords from HTML meta tags
meta_description: str # Description from HTML meta
meta_lang: str # Language from HTML meta
meta_favicon: str # Favicon URL from meta
meta_data: dict # Dictionary of all metadata
canonical_link: str # Canonical URL from meta
tags: set # Set of article tags
# Processing State
is_parsed: bool # Whether article has been parsed
download_state: int # Download status (ArticleDownloadState values)
download_exception_msg: str # Error message if download failed
# Source Information
source_url: str # URL of the parent news source
# Advanced Properties
top_node: object # Main DOM node of article content
clean_top_node: object # Clean copy of main DOM node
doc: object # Full lxml DOM object
clean_doc: object # Clean copy of DOM object
additional_data: dict # Custom user data storage
# Extracted Content
keywords: list # Keywords from NLP processing

class ArticleDownloadState:
NOT_STARTED: int = 0 # Download not yet attempted
FAILED_RESPONSE: int = 1 # Download failed due to network/HTTP error
SUCCESS: int = 2 # Download completed successfully

from newspaper import Article
# Create and process article
article = Article('https://example.com/news/article')
article.download()
article.parse()
# Access extracted content
print(f"Title: {article.title}")
print(f"Authors: {article.authors}")
print(f"Text length: {len(article.text)} characters")
print(f"Publication date: {article.publish_date}")
print(f"Top image: {article.top_img}")

from newspaper import build_article
# Build article with full processing
article = build_article('https://example.com/news/article')
article.build() # download + parse + nlp
# Access NLP results
print(f"Keywords: {article.keywords}")
print(f"Summary: {article.summary}")

from newspaper import Article, ArticleException
from newspaper.article import ArticleDownloadState
try:
    article = Article('https://example.com/news/article')
    article.download()
    if article.download_state == ArticleDownloadState.FAILED_RESPONSE:
        print(f"Download failed: {article.download_exception_msg}")
    else:
        article.parse()
        if article.is_valid_body():
            article.nlp()
            print(f"Article processed successfully: {article.title}")
        else:
            print("Article content does not meet quality requirements")
except ArticleException as e:
    print(f"Article processing error: {e}")

from newspaper import Article
from newspaper.configuration import Configuration
# Create custom configuration
config = Configuration()
config.language = 'es'
config.MIN_WORD_COUNT = 500
config.fetch_images = False
# Process article with custom settings
article = Article('https://example.com/news/article', config=config)
article.build()

Install with Tessl CLI
npx tessl i tessl/pypi-newspaper3k