Simplified python article discovery & extraction.
npx @tessl/cli install tessl/pypi-newspaper3k@0.2.0

A comprehensive Python library for extracting and curating articles from web sources. Newspaper3k provides multi-threaded article downloading, intelligent text extraction from HTML, image and video extraction, keyword and summary generation using natural language processing, author and publication date detection, and multi-language support for over 10 languages including English, Chinese, German, and Arabic.
pip install newspaper3k

import newspaper

Common imports for working with articles and sources:
from newspaper import Article, Source, build, build_article, fulltext, __version__
from newspaper import Configuration, Config, NewsPool, news_pool
from newspaper import ArticleException, hot, languages, popular_urls

import newspaper
from newspaper import Article
# Basic article extraction
url = 'http://cnn.com/2013/11/27/justice/tucson-arizona-captive-girls/'
article = Article(url)
# Download and parse article
article.download()
article.parse()
# Access extracted content
print(article.title)
print(article.authors)
print(article.publish_date)
print(article.text)
print(article.top_image)
# Extract keywords and summary using NLP
article.nlp()
print(article.keywords)
print(article.summary)
# Build and process news sources
cnn_paper = newspaper.build('http://cnn.com')
for article in cnn_paper.articles:
    print(article.url)
# Multi-threaded processing
from newspaper import news_pool
articles = [article1, article2, article3]
news_pool.set(articles)
news_pool.join()

The library is built around several core concepts:
This design enables both single-article processing and large-scale news aggregation workflows, with configurable extraction parameters, caching mechanisms, and multi-language support that make it suitable for research applications, content curation systems, and automated journalism workflows.
Core functionality for downloading, parsing, and extracting content from individual news articles. Supports text extraction, metadata parsing, image discovery, video extraction, and natural language processing.
class Article:
    def __init__(self, url: str, title: str = '', source_url: str = '', config=None, **kwargs): ...
    def download(self, input_html=None, title=None, recursion_counter: int = 0): ...
    def parse(self): ...
    def nlp(self): ...
    def build(self): ...

def build_article(url: str = '', config=None, **kwargs) -> Article: ...

Functionality for working with news websites and domains as collections of articles. Provides article discovery, category extraction, RSS feed processing, and batch operations.
class Source:
    def __init__(self, url: str, config=None, **kwargs): ...
    def build(self): ...
    def download(self): ...
    def parse(self): ...

def build(url: str = '', dry: bool = False, config=None, **kwargs) -> Source: ...

Thread pool management for processing multiple articles and sources concurrently. Enables efficient large-scale content extraction and processing.
class NewsPool:
    def __init__(self, config=None): ...
    def set(self, news_list: list, threads_per_source: int = 1, override_threads=None): ...
    def join(self): ...

# Pre-instantiated pool
news_pool: NewsPool

Configuration management, language support, and utility functions for customizing extraction behavior and accessing supplementary features.
class Configuration:
    def __init__(self): ...
    def set_language(self, language: str): ...
    def get_language(self) -> str: ...
# Configuration is also aliased as Config for convenience
Config = Configuration
def fulltext(html: str, language: str = 'en') -> str: ...
def hot() -> list: ...
def languages(): ...
def popular_urls() -> list: ...
# Version information
__version__: str # Package version (currently "0.2.8")

class ArticleException(Exception):
    """Exception raised for article-related errors during download or parsing."""

class ConcurrencyException(Exception):
    """Exception raised for thread pool operation errors."""

Common error scenarios: