# Multi-threading & Batch Processing

Thread pool management for processing multiple articles and sources concurrently. The NewsPool class enables efficient large-scale content extraction and processing by managing thread allocation and coordination for batch operations on articles and news sources.

## Capabilities

### Thread Pool Management

Create and manage thread pools for concurrent article and source processing.

```python { .api }
class NewsPool:
    def __init__(self, config=None):
        """
        Initialize a news processing thread pool.

        Parameters:
        - config: Configuration object for threading settings
        """

    def set(self, news_list: list, threads_per_source: int = 1, override_threads: int = None):
        """
        Set the list of articles or sources to process with threading configuration.

        Parameters:
        - news_list: List of Article objects, Source objects, or mixed
        - threads_per_source: Number of threads per source (when processing sources)
        - override_threads: Override automatic thread calculation with a specific count

        Threading Logic:
        - If override_threads is specified: use that count
        - If all items are Source objects: threads_per_source * number_of_sources
        - Otherwise: use 1 thread
        """

    def join(self):
        """
        Execute multi-threaded processing and wait for all threads to complete.
        Processes Article items by downloading them, and Source items by
        downloading their articles.

        Raises:
        ConcurrencyException: If set() was not called before join()
        """
```

### Pre-instantiated Pool

Convenient global NewsPool instance for immediate use.

```python { .api }
news_pool: NewsPool  # Pre-instantiated NewsPool object for convenience
```

### Threading Utilities

Supporting classes for the thread pool implementation.

```python { .api }
class ThreadPool:
    def __init__(self, num_threads: int, timeout_seconds: int):
        """Initialize thread pool with specified thread count and timeout."""

    def add_task(self, func, *args, **kwargs):
        """Add a task function to the thread pool queue."""

    def wait_completion(self):
        """Wait for all queued tasks to complete."""

class Worker:
    """Worker thread that executes tasks from a queue."""

    def __init__(self, tasks, timeout_seconds: int):
        """Initialize worker thread with task queue and timeout."""

class ConcurrencyException(Exception):
    """Exception raised for thread pool operation errors."""
```

## Usage Examples

### Basic Multi-threaded Article Processing

```python
from newspaper import Article, news_pool

# Create multiple articles
articles = [
    Article('http://cnn.com/article1'),
    Article('http://cnn.com/article2'),
    Article('http://cnn.com/article3'),
    Article('http://bbc.com/article1'),
    Article('http://bbc.com/article2')
]

# Process all articles concurrently
news_pool.set(articles)
news_pool.join()

# All articles now have downloaded HTML
for article in articles:
    if article.html:
        article.parse()
        print(f"Downloaded and parsed: {article.url}")
```

### Multi-threaded Source Processing

```python
from newspaper import build, news_pool

# Create multiple sources (don't build them yet)
sources = [
    build('http://cnn.com', dry=True),
    build('http://bbc.com', dry=True),
    build('http://techcrunch.com', dry=True)
]

# Download articles from all sources concurrently.
# Uses one thread per source to avoid rate limiting.
news_pool.set(sources, threads_per_source=1)
news_pool.join()

# Process results
for source in sources:
    print(f"Source {source.brand}: {len(source.articles)} articles")
    for article in source.articles[:3]:  # Process first 3 articles
        if article.html:
            article.parse()
            print(f"  - {article.title}")
```

### Custom Thread Configuration

```python
from newspaper import NewsPool, Article, Configuration

# Create a custom configuration
config = Configuration()
config.thread_timeout_seconds = 5
config.number_threads = 8

# Create a custom news pool
custom_pool = NewsPool(config=config)

# Create articles
articles = [Article(f'http://example.com/article{i}') for i in range(20)]

# Process with a specific thread count
custom_pool.set(articles, override_threads=10)
custom_pool.join()

print(f"Processed {len([a for a in articles if a.html])} articles")
```

### Mixed Article and Source Processing

```python
from newspaper import Article, build, news_pool

# Mix of articles and sources
news_items = [
    Article('http://standalone-article.com/news'),
    build('http://cnn.com', dry=True),
    Article('http://another-article.com/story'),
    build('http://bbc.com', dry=True)
]

# Process the mixed list (uses 1 thread since not all items are sources)
news_pool.set(news_items)
news_pool.join()

# Handle results based on type
for item in news_items:
    if hasattr(item, 'articles'):  # It's a Source
        print(f"Source: {item.brand} - {len(item.articles)} articles")
    else:  # It's an Article
        if item.html:
            item.parse()
            print(f"Article: {item.title}")
```

### Error Handling with Threading

```python
from newspaper import Article, news_pool, ArticleException

# Create articles (some may have invalid URLs)
urls = [
    'http://valid-site.com/article1',
    'http://invalid-url-that-will-fail.com/article',
    'http://valid-site.com/article2'
]

articles = [Article(url) for url in urls]

try:
    news_pool.set(articles)
    news_pool.join()

    # Check results and handle failures
    successful = []
    failed = []

    for article in articles:
        if article.download_state == 2:  # SUCCESS
            article.parse()
            successful.append(article)
        else:
            failed.append(article)

    print(f"Successful downloads: {len(successful)}")
    print(f"Failed downloads: {len(failed)}")

    for article in failed:
        print(f"Failed: {article.url} - {article.download_exception_msg}")

except Exception as e:
    print(f"Threading error: {e}")
```

### Performance Optimization

```python
from newspaper import build, NewsPool, Configuration

# Create a high-performance configuration
config = Configuration()
config.number_threads = 15
config.request_timeout = 5
config.thread_timeout_seconds = 2

# Create sources
sources = [
    build('http://site1.com', dry=True, config=config),
    build('http://site2.com', dry=True, config=config),
    build('http://site3.com', dry=True, config=config)
]

# Use a custom pool with optimized settings
pool = NewsPool(config=config)

# Process with multiple threads per source for faster downloading
pool.set(sources, threads_per_source=3)  # 9 total threads (3 sources × 3 threads)
pool.join()

# Measure results
total_articles = sum(len(source.articles) for source in sources)
print(f"Downloaded articles from {len(sources)} sources: {total_articles} total")
```

### Thread Pool Lifecycle Management

```python
from newspaper import NewsPool, Article

# Create a pool
pool = NewsPool()

# First batch
batch1 = [Article(f'http://site1.com/article{i}') for i in range(5)]
pool.set(batch1)
pool.join()

# Process results
for article in batch1:
    if article.html:
        article.parse()

# Second batch (the pool can be reused)
batch2 = [Article(f'http://site2.com/article{i}') for i in range(5)]
pool.set(batch2)
pool.join()

# Process second batch results
for article in batch2:
    if article.html:
        article.parse()

print("Completed two separate batches")
```