# Multi-threading & Batch Processing

Thread pool management for processing multiple articles and sources concurrently. The NewsPool class enables efficient large-scale content extraction and processing by managing thread allocation and coordination for batch operations on articles and news sources.

## Capabilities

### Thread Pool Management

Create and manage thread pools for concurrent article and source processing.

```python { .api }
class NewsPool:
    def __init__(self, config=None):
        """
        Initialize a news processing thread pool.

        Parameters:
        - config: Configuration object for threading settings
        """

    def set(self, news_list: list, threads_per_source: int = 1, override_threads: int = None):
        """
        Set the list of articles or sources to process with threading configuration.

        Parameters:
        - news_list: List of Article objects, Source objects, or mixed
        - threads_per_source: Number of threads per source (when processing sources)
        - override_threads: Override automatic thread calculation with a specific count

        Threading Logic:
        - If override_threads is specified: use that count
        - If all items are Source objects: threads_per_source * number_of_sources
        - Otherwise: use 1 thread
        """

    def join(self):
        """
        Execute multi-threaded processing and wait for all threads to complete.
        Processes Article items by downloading them, and Source items by
        downloading their articles.

        Raises:
        ConcurrencyException: If set() was not called before join()
        """
```

### Pre-instantiated Pool

Convenient global NewsPool instance for immediate use.

```python { .api }
news_pool: NewsPool  # Pre-instantiated NewsPool object for convenience
```

### Threading Utilities

Supporting classes for the thread pool implementation.

```python { .api }
class ThreadPool:
    def __init__(self, num_threads: int, timeout_seconds: int):
        """Initialize thread pool with specified thread count and timeout."""

    def add_task(self, func, *args, **kwargs):
        """Add a task function to the thread pool queue."""

    def wait_completion(self):
        """Wait for all queued tasks to complete."""

class Worker:
    """Worker thread that executes tasks from a queue."""

    def __init__(self, tasks, timeout_seconds: int):
        """Initialize worker thread with task queue and timeout."""

class ConcurrencyException(Exception):
    """Exception raised for thread pool operation errors."""
```

## Usage Examples

### Basic Multi-threaded Article Processing

```python
from newspaper import Article, news_pool

# Create multiple articles
articles = [
    Article('http://cnn.com/article1'),
    Article('http://cnn.com/article2'),
    Article('http://cnn.com/article3'),
    Article('http://bbc.com/article1'),
    Article('http://bbc.com/article2')
]

# Process all articles concurrently
news_pool.set(articles)
news_pool.join()

# All articles now have downloaded HTML
for article in articles:
    if article.html:
        article.parse()
        print(f"Downloaded and parsed: {article.url}")
```

### Multi-threaded Source Processing

```python
from newspaper import build, news_pool

# Create multiple sources (don't build them yet)
sources = [
    build('http://cnn.com', dry=True),
    build('http://bbc.com', dry=True),
    build('http://techcrunch.com', dry=True)
]

# Download articles from all sources concurrently.
# Uses one thread per source to avoid rate limiting.
news_pool.set(sources, threads_per_source=1)
news_pool.join()

# Process results
for source in sources:
    print(f"Source {source.brand}: {len(source.articles)} articles")
    for article in source.articles[:3]:  # Process first 3 articles
        if article.html:
            article.parse()
            print(f"  - {article.title}")
```

### Custom Thread Configuration

```python
from newspaper import NewsPool, Article, Configuration

# Create a custom configuration
config = Configuration()
config.thread_timeout_seconds = 5
config.number_threads = 8

# Create a custom news pool
custom_pool = NewsPool(config=config)

# Create articles
articles = [Article(f'http://example.com/article{i}') for i in range(20)]

# Process with a specific thread count
custom_pool.set(articles, override_threads=10)
custom_pool.join()

print(f"Processed {len([a for a in articles if a.html])} articles")
```

### Mixed Article and Source Processing

```python
from newspaper import Article, build, news_pool

# Mix of articles and sources
news_items = [
    Article('http://standalone-article.com/news'),
    build('http://cnn.com', dry=True),
    Article('http://another-article.com/story'),
    build('http://bbc.com', dry=True)
]

# Process the mixed list (uses 1 thread since not all items are sources)
news_pool.set(news_items)
news_pool.join()

# Handle results based on type
for item in news_items:
    if hasattr(item, 'articles'):  # It's a Source
        print(f"Source: {item.brand} - {len(item.articles)} articles")
    else:  # It's an Article
        if item.html:
            item.parse()
            print(f"Article: {item.title}")
```

### Error Handling with Threading

```python
from newspaper import Article, news_pool, ArticleException

# Create articles (some may have invalid URLs)
urls = [
    'http://valid-site.com/article1',
    'http://invalid-url-that-will-fail.com/article',
    'http://valid-site.com/article2'
]

articles = [Article(url) for url in urls]

try:
    news_pool.set(articles)
    news_pool.join()

    # Check results and handle failures
    successful = []
    failed = []

    for article in articles:
        if article.download_state == 2:  # SUCCESS
            article.parse()
            successful.append(article)
        else:
            failed.append(article)

    print(f"Successful downloads: {len(successful)}")
    print(f"Failed downloads: {len(failed)}")

    for article in failed:
        print(f"Failed: {article.url} - {article.download_exception_msg}")

except Exception as e:
    print(f"Threading error: {e}")
```

### Performance Optimization

```python
from newspaper import build, NewsPool, Configuration

# Create a high-performance configuration
config = Configuration()
config.number_threads = 15
config.request_timeout = 5
config.thread_timeout_seconds = 2

# Create sources
sources = [
    build('http://site1.com', dry=True, config=config),
    build('http://site2.com', dry=True, config=config),
    build('http://site3.com', dry=True, config=config)
]

# Use a custom pool with optimized settings
pool = NewsPool(config=config)

# Process with multiple threads per source for faster downloading
pool.set(sources, threads_per_source=3)  # 9 total threads (3 sources × 3 threads)
pool.join()

# Measure results
total_articles = sum(len(source.articles) for source in sources)
print(f"Downloaded articles from {len(sources)} sources: {total_articles} total")
```

### Thread Pool Lifecycle Management

```python
from newspaper import NewsPool, Article

# Create a pool
pool = NewsPool()

# First batch
batch1 = [Article(f'http://site1.com/article{i}') for i in range(5)]
pool.set(batch1)
pool.join()

# Process results
for article in batch1:
    if article.html:
        article.parse()

# Second batch (the pool can be reused)
batch2 = [Article(f'http://site2.com/article{i}') for i in range(5)]
pool.set(batch2)
pool.join()

# Process second batch results
for article in batch2:
    if article.html:
        article.parse()

print("Completed two separate batches")
```