or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

article-processing.md · configuration.md · index.md · multithreading.md · source-management.md

docs/multithreading.md

# Multi-threading & Batch Processing

Thread pool management for processing multiple articles and sources concurrently. The NewsPool class enables efficient large-scale content extraction and processing by managing thread allocation and coordination for batch operations on articles and news sources.

## Capabilities

### Thread Pool Management

Create and manage thread pools for concurrent article and source processing.

```python { .api }
class NewsPool:
    def __init__(self, config=None):
        """
        Initialize a news processing thread pool.

        Parameters:
        - config: Configuration object for threading settings
        """

    def set(self, news_list: list, threads_per_source: int = 1, override_threads: int = None):
        """
        Set the list of articles or sources to process with threading configuration.

        Parameters:
        - news_list: List of Article objects, Source objects, or mixed
        - threads_per_source: Number of threads per source (when processing sources)
        - override_threads: Override automatic thread calculation with specific count

        Threading Logic:
        - If override_threads specified: use that count
        - If all items are Source objects: threads_per_source * number_of_sources
        - Otherwise: use 1 thread
        """

    def join(self):
        """
        Execute multi-threaded processing and wait for all threads to complete.
        Processes articles by downloading, sources by downloading articles.

        Raises:
        ConcurrencyException: If set() was not called before join()
        """
```

### Pre-instantiated Pool

Convenient global NewsPool instance for immediate use.

```python { .api }
news_pool: NewsPool  # Pre-instantiated NewsPool object for convenience
```

### Threading Utilities

Supporting classes for thread pool implementation.

```python { .api }
class ThreadPool:
    def __init__(self, num_threads: int, timeout_seconds: int):
        """Initialize thread pool with specified thread count and timeout."""

    def add_task(self, func, *args, **kwargs):
        """Add a task function to the thread pool queue."""

    def wait_completion(self):
        """Wait for all queued tasks to complete."""

class Worker:
    """Worker thread that executes tasks from a queue."""

    def __init__(self, tasks, timeout_seconds: int):
        """Initialize worker thread with task queue and timeout."""

class ConcurrencyException(Exception):
    """Exception raised for thread pool operation errors."""
```

## Usage Examples

### Basic Multi-threaded Article Processing

```python
from newspaper import Article, news_pool

# Create multiple articles
articles = [
    Article('http://cnn.com/article1'),
    Article('http://cnn.com/article2'),
    Article('http://cnn.com/article3'),
    Article('http://bbc.com/article1'),
    Article('http://bbc.com/article2')
]

# Process all articles concurrently
news_pool.set(articles)
news_pool.join()

# All articles now have downloaded HTML
for article in articles:
    if article.html:
        article.parse()
        print(f"Downloaded and parsed: {article.url}")
```

### Multi-threaded Source Processing

```python
from newspaper import build, news_pool

# Create multiple sources (don't build them yet)
sources = [
    build('http://cnn.com', dry=True),
    build('http://bbc.com', dry=True),
    build('http://techcrunch.com', dry=True)
]

# Download articles from all sources concurrently
# Uses one thread per source to avoid rate limiting
news_pool.set(sources, threads_per_source=1)
news_pool.join()

# Process results
for source in sources:
    print(f"Source {source.brand}: {len(source.articles)} articles")
    for article in source.articles[:3]:  # Process first 3 articles
        if article.html:
            article.parse()
            print(f"  - {article.title}")
```

### Custom Thread Configuration

```python
from newspaper import NewsPool, Article, Configuration

# Create custom configuration
config = Configuration()
config.thread_timeout_seconds = 5
config.number_threads = 8

# Create custom news pool
custom_pool = NewsPool(config=config)

# Create articles
articles = [Article(f'http://example.com/article{i}') for i in range(20)]

# Process with specific thread count
custom_pool.set(articles, override_threads=10)
custom_pool.join()

print(f"Processed {len([a for a in articles if a.html])} articles")
```

### Mixed Article and Source Processing

```python
from newspaper import Article, build, news_pool

# Mix of articles and sources
news_items = [
    Article('http://standalone-article.com/news'),
    build('http://cnn.com', dry=True),
    Article('http://another-article.com/story'),
    build('http://bbc.com', dry=True)
]

# Process mixed list (will use 1 thread since not all sources)
news_pool.set(news_items)
news_pool.join()

# Handle results based on type
for item in news_items:
    if hasattr(item, 'articles'):  # It's a Source
        print(f"Source: {item.brand} - {len(item.articles)} articles")
    else:  # It's an Article
        if item.html:
            item.parse()
            print(f"Article: {item.title}")
```

### Error Handling with Threading

```python
from newspaper import Article, news_pool, ArticleException

# Create articles (some may have invalid URLs)
urls = [
    'http://valid-site.com/article1',
    'http://invalid-url-that-will-fail.com/article',
    'http://valid-site.com/article2'
]

articles = [Article(url) for url in urls]

try:
    news_pool.set(articles)
    news_pool.join()

    # Check results and handle failures
    successful = []
    failed = []

    for article in articles:
        if article.download_state == 2:  # SUCCESS
            article.parse()
            successful.append(article)
        else:
            failed.append(article)

    print(f"Successful downloads: {len(successful)}")
    print(f"Failed downloads: {len(failed)}")

    for article in failed:
        print(f"Failed: {article.url} - {article.download_exception_msg}")

except Exception as e:
    print(f"Threading error: {e}")
```

219

220

### Performance Optimization

221

222

```python

223

from newspaper import build, NewsPool, Configuration

224

225

# Create high-performance configuration

226

config = Configuration()

227

config.number_threads = 15

228

config.request_timeout = 5

229

config.thread_timeout_seconds = 2

230

231

# Create sources

232

sources = [

233

build('http://site1.com', dry=True, config=config),

234

build('http://site2.com', dry=True, config=config),

235

build('http://site3.com', dry=True, config=config)

236

]

237

238

# Use custom pool with optimized settings

239

pool = NewsPool(config=config)

240

241

# Process with multiple threads per source for faster downloading

242

pool.set(sources, threads_per_source=3) # 9 total threads (3 sources × 3 threads)

243

pool.join()

244

245

# Measure results

246

total_articles = sum(len(source.articles) for source in sources)

247

print(f"Downloaded articles from {len(sources)} sources: {total_articles} total")

248

```

### Thread Pool Lifecycle Management

```python
from newspaper import NewsPool, Article

# Create pool
pool = NewsPool()

# First batch
batch1 = [Article(f'http://site1.com/article{i}') for i in range(5)]
pool.set(batch1)
pool.join()

# Process results
for article in batch1:
    if article.html:
        article.parse()

# Second batch (pool can be reused)
batch2 = [Article(f'http://site2.com/article{i}') for i in range(5)]
pool.set(batch2)
pool.join()

# Process second batch results
for article in batch2:
    if article.html:
        article.parse()

print("Completed two separate batches")
```