# Error Handling

Feedparser provides comprehensive error handling with graceful degradation, allowing feed parsing to continue even when encountering malformed content, encoding issues, or network problems.
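
A minimal sketch of this behavior: even when parsing flags a problem, the partial result is still returned (the URL below is illustrative):

```python
import feedparser

# Illustrative URL pointing at a malformed feed
result = feedparser.parse('https://example.com/broken-feed.xml')

# Problems are reported on the result instead of raised...
print(result.bozo, result.bozo_exception)

# ...while whatever data could be recovered remains available
print(len(result.entries))
```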

## Capabilities

### Exception Classes

Feedparser defines several exception classes for different types of parsing issues:

```python { .api }
class ThingsNobodyCaresAboutButMe(Exception):
    """
    Base exception for minor parsing issues that don't prevent feed processing.
    These exceptions are captured in bozo_exception but don't stop parsing.
    """

class CharacterEncodingOverride(ThingsNobodyCaresAboutButMe):
    """
    Raised when character encoding is overridden during parsing.
    Indicates encoding was detected/specified differently than declared.
    """

class CharacterEncodingUnknown(ThingsNobodyCaresAboutButMe):
    """
    Raised when character encoding cannot be determined.
    Parser will fall back to default encoding handling.
    """

class NonXMLContentType(ThingsNobodyCaresAboutButMe):
    """
    Raised when content type is not XML but parsing continues anyway.
    Common when servers misconfigure content-type headers.
    """

class UndeclaredNamespace(Exception):
    """
    Raised when XML contains undeclared namespaces.
    More serious than other exceptions but parsing may still continue.
    """
```
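
Because the first three classes share the `ThingsNobodyCaresAboutButMe` base, a handler can treat them as one group of minor issues and single out the more serious `UndeclaredNamespace`. A short sketch of that pattern:

```python
import feedparser

result = feedparser.parse(url)

if result.bozo:
    exc = result.bozo_exception
    if isinstance(exc, feedparser.ThingsNobodyCaresAboutButMe):
        # Covers encoding overrides, unknown encodings, and non-XML content types
        print(f"Minor issue, continuing: {exc!r}")
    elif isinstance(exc, feedparser.UndeclaredNamespace):
        # Namespace problems can mean some elements were not recognized
        print(f"Namespace problem, data may be incomplete: {exc!r}")
```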

## Bozo Detection System

Feedparser uses a "bozo" flag system to indicate when feeds have issues while still attempting to parse them:

### Bozo Flag

```python
import feedparser

result = feedparser.parse(url)

# Check if feed had issues
if result.bozo:
    print("Feed had parsing issues")
    print(f"Exception: {result.bozo_exception}")
    print(f"Exception type: {type(result.bozo_exception).__name__}")
else:
    print("Feed parsed cleanly")

# Feed data may still be available even with bozo=True
print(f"Found {len(result.entries)} entries")
```

### Common Bozo Scenarios

```python
import urllib.error
import xml.sax

import feedparser

result = feedparser.parse(problematic_url)

if result.bozo:
    exception = result.bozo_exception

    # Network/HTTP errors
    if isinstance(exception, urllib.error.HTTPError):
        print(f"HTTP {exception.code}: {exception.reason}")
    elif isinstance(exception, urllib.error.URLError):
        print(f"Network error: {exception.reason}")

    # XML parsing errors
    elif isinstance(exception, xml.sax.SAXException):
        print(f"XML parsing error: {exception}")

    # Feedparser-specific issues
    elif isinstance(exception, feedparser.CharacterEncodingUnknown):
        print("Could not determine character encoding")
    elif isinstance(exception, feedparser.NonXMLContentType):
        print("Content type is not XML")
    elif isinstance(exception, feedparser.UndeclaredNamespace):
        print("Feed contains undeclared XML namespaces")

    # Generic issues
    else:
        print(f"Other parsing issue: {exception}")
```

## Error Recovery Strategies

### Graceful Degradation

Feedparser attempts to extract as much data as possible even from problematic feeds:

```python
result = feedparser.parse(malformed_feed_url)

# Always check what data was successfully extracted
print(f"Bozo: {result.bozo}")
print(f"Entries found: {len(result.entries)}")
print(f"Feed title: {result.feed.get('title', 'No title')}")

# Process available data despite errors
if result.entries:
    print("Processing entries despite parsing issues:")
    for entry in result.entries:
        title = entry.get('title', 'No title')
        link = entry.get('link', 'No link')
        print(f"  - {title}: {link}")
```

### Parser Fallback System

Feedparser automatically falls back from strict XML parsing to loose, HTML-style parsing when a feed is not well-formed:

```python
result = feedparser.parse(questionable_feed)

# Check which parser was used
if result.bozo:
    # Likely used the loose parser due to malformed XML
    print("Used tolerant parser for malformed content")
else:
    # Used the strict XML parser
    print("Used strict XML parser")

# Both parsers can produce valid results
if result.version:
    print(f"Detected format: {result.version}")
```
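
One way to confirm the fallback actually happened: when the strict parser rejects a document, recent feedparser versions surface the failure as an `xml.sax.SAXParseException` in `bozo_exception`, which also carries the position of the offending markup. A sketch assuming that behavior:

```python
import xml.sax

import feedparser

result = feedparser.parse(questionable_feed)

if result.bozo and isinstance(result.bozo_exception, xml.sax.SAXParseException):
    # Strict XML parsing failed here; the loose parser produced the result
    exc = result.bozo_exception
    print(f"Strict parse failed at line {exc.getLineNumber()}, column {exc.getColumnNumber()}")
```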

## Error Handling Patterns

### Safe Data Access

Always use safe access patterns when dealing with potentially problematic feeds:

```python
import time

result = feedparser.parse(url)

# Safe feed-level access
feed_title = result.feed.get('title', 'Untitled Feed')
feed_link = result.feed.get('link', '')
feed_description = result.feed.get('description', 'No description')

# Safe entry-level access
for entry in result.entries:
    title = entry.get('title', 'Untitled')
    link = entry.get('link', '#')
    summary = entry.get('summary', 'No summary')

    # Check for date parsing success
    if entry.get('published_parsed'):
        pub_date = time.strftime('%Y-%m-%d', entry.published_parsed)
    else:
        pub_date = entry.get('published', 'Unknown date')

    print(f"{title} ({pub_date}): {summary[:100]}...")
```
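
Note that parse results are `FeedParserDict` objects, which allow both dictionary-style and attribute-style access; only `.get()` is safe when a field may be absent:

```python
import feedparser

result = feedparser.parse(url)
entry = result.entries[0]

try:
    print(entry.title)       # attribute access raises AttributeError if missing
    print(entry['title'])    # key access raises KeyError if missing
except (AttributeError, KeyError):
    print(entry.get('title', 'Untitled'))  # .get() never raises
```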

### Exception Handling Wrapper

```python
import logging
import time

import feedparser

def safe_parse_feed(url, max_retries=3):
    """
    Safely parse a feed with error handling and retries.
    """
    for attempt in range(max_retries):
        try:
            result = feedparser.parse(url)

            # Log parsing issues but continue
            if result.bozo:
                logging.warning(f"Feed parsing issues for {url}: {result.bozo_exception}")

            # Validate minimum data requirements
            if not result.feed and not result.entries:
                logging.error(f"No usable data found in feed: {url}")
                if attempt < max_retries - 1:
                    continue
                return None

            return result

        except Exception as e:
            logging.error(f"Unexpected error parsing {url} (attempt {attempt + 1}): {e}")
            if attempt < max_retries - 1:
                time.sleep(2 ** attempt)  # Exponential backoff

    return None

# Usage
result = safe_parse_feed('https://example.com/problematic-feed.xml')
if result:
    print(f"Successfully parsed feed with {len(result.entries)} entries")
else:
    print("Failed to parse feed after all retries")
```

### Content Validation

```python
def validate_feed_content(result):
    """
    Validate and report on feed content quality.
    """
    issues = []

    # Check parsing status
    if result.bozo:
        issues.append(f"Parsing issues: {result.bozo_exception}")

    # Check feed-level data
    if not result.feed.get('title'):
        issues.append("Feed has no title")

    if not result.feed.get('link'):
        issues.append("Feed has no link")

    # Check entry quality
    if not result.entries:
        issues.append("Feed has no entries")
    else:
        entries_without_titles = sum(1 for e in result.entries if not e.get('title'))
        if entries_without_titles:
            issues.append(f"{entries_without_titles} entries missing titles")

        entries_without_links = sum(1 for e in result.entries if not e.get('link'))
        if entries_without_links:
            issues.append(f"{entries_without_links} entries missing links")

    return issues

# Usage
result = feedparser.parse(url)
issues = validate_feed_content(result)

if issues:
    print("Feed quality issues:")
    for issue in issues:
        print(f"  - {issue}")
else:
    print("Feed appears to be high quality")
```

## Character Encoding Issues

### Encoding Detection and Override

```python
result = feedparser.parse(url)

# Check encoding information
print(f"Detected encoding: {result.get('encoding', 'Unknown')}")

# Handle encoding-related exceptions
if result.bozo:
    if isinstance(result.bozo_exception, feedparser.CharacterEncodingOverride):
        print("Encoding was overridden during parsing")
    elif isinstance(result.bozo_exception, feedparser.CharacterEncodingUnknown):
        print("Could not determine character encoding")

# Content should still be usable with UTF-8 conversion
for entry in result.entries:
    # Text content is normalized to Unicode
    title = entry.get('title', '')
    if title:
        print(f"Entry title: {title}")
```
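
When a server declares the wrong charset, one workaround is to fetch the bytes yourself and hand feedparser a corrected header: `parse()` in feedparser 6.x accepts a `response_headers` mapping that is consulted during encoding detection. A sketch, with an illustrative URL and assuming the true encoding is known to be UTF-8:

```python
import urllib.request

import feedparser

# Fetch the raw bytes ourselves (illustrative URL)
raw = urllib.request.urlopen('https://example.com/feed.xml').read()

# Supply a corrected Content-Type so the right charset is used
result = feedparser.parse(
    raw,
    response_headers={'content-type': 'application/xml; charset=utf-8'},
)
print(result.get('encoding'))
```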

### Handling Malformed Unicode

```python
import unicodedata

def clean_text_content(text):
    """
    Clean text content that may have encoding issues.
    """
    if not text:
        return text

    # Normalize Unicode, replacing problematic characters
    text = unicodedata.normalize('NFKC', text)

    # Remove control characters except whitespace
    text = ''.join(char for char in text if unicodedata.category(char)[0] != 'C' or char.isspace())

    return text.strip()

# Apply to feed content
result = feedparser.parse(url)
for entry in result.entries:
    title = clean_text_content(entry.get('title', ''))
    summary = clean_text_content(entry.get('summary', ''))
    # Use cleaned content...
```

## Network Error Handling

### HTTP Error Recovery

```python
import feedparser

def parse_with_fallbacks(urls):
    """
    Try multiple feed URLs with fallback handling.
    """
    for url in urls:
        try:
            result = feedparser.parse(url)

            # Check HTTP status (note: feedparser normally follows redirects
            # itself; result.href holds the final URL after redirection)
            if hasattr(result, 'status'):
                if result.status == 200:
                    return result
                elif result.status in (301, 302):
                    # Follow redirect manually if needed
                    redirect_url = result.headers.get('location')
                    if redirect_url:
                        return feedparser.parse(redirect_url)
                elif result.status == 304:
                    # Not modified - could return cached version
                    pass
                elif result.status >= 400:
                    print(f"HTTP {result.status} for {url}")
                    continue

            # Return result even with minor issues
            if result.entries or result.feed:
                return result

        except Exception as e:
            print(f"Failed to parse {url}: {e}")
            continue

    return None

# Usage with multiple possible feed URLs
feed_urls = [
    'https://example.com/feed.xml',
    'https://example.com/rss.xml',
    'https://example.com/atom.xml',
    'https://example.com/feeds/all.xml',
]

result = parse_with_fallbacks(feed_urls)
```
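
The 304 branch above pairs naturally with feedparser's conditional-request support: `parse()` accepts `etag` and `modified` arguments, and results expose `.etag` and `.modified` when the server supplies them. A sketch of a simple cache built on that (the cache structure here is my own, hypothetical design):

```python
import feedparser

# Hypothetical in-memory cache: url -> (etag, modified, last_result)
_cache = {}

def parse_conditionally(url):
    etag, modified, cached = _cache.get(url, (None, None, None))

    # feedparser sends If-None-Match / If-Modified-Since for us
    result = feedparser.parse(url, etag=etag, modified=modified)

    if getattr(result, 'status', None) == 304 and cached is not None:
        return cached  # Nothing changed; reuse the previous parse

    _cache[url] = (result.get('etag'), result.get('modified'), result)
    return result
```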

### Timeout and Connection Issues

```python
import socket
import urllib.error

import feedparser

# Set a reasonable global timeout (feedparser has no timeout parameter)
socket.setdefaulttimeout(30)

def parse_with_timeout_handling(url):
    """
    Parse feed with proper timeout and connection error handling.

    Note: feedparser usually captures network errors in bozo_exception
    rather than raising, so these handlers are defensive.
    """
    try:
        result = feedparser.parse(url)
        return result

    except socket.timeout:
        print(f"Timeout accessing {url}")
        return None

    except socket.gaierror as e:
        print(f"DNS resolution failed for {url}: {e}")
        return None

    except ConnectionResetError:
        print(f"Connection reset by peer: {url}")
        return None

    except urllib.error.URLError as e:
        if isinstance(e.reason, socket.timeout):
            print(f"Timeout: {url}")
        else:
            print(f"URL error for {url}: {e.reason}")
        return None
```

## Logging and Monitoring

### Comprehensive Error Logging

```python
import logging

import feedparser

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

def parse_and_log(url):
    """
    Parse feed with comprehensive logging.
    """
    logger.info(f"Parsing feed: {url}")

    result = feedparser.parse(url)

    # Log parsing results
    if result.bozo:
        logger.warning(f"Bozo feed {url}: {result.bozo_exception}")
    else:
        logger.info(f"Clean parse: {url}")

    # Log HTTP information
    if hasattr(result, 'status'):
        logger.info(f"HTTP {result.status}: {url}")

    # Log content statistics
    logger.info(f"Feed: {len(result.entries)} entries, version: {result.get('version', 'unknown')}")

    # Log encoding information
    if 'encoding' in result:
        logger.info(f"Encoding: {result.encoding}")

    return result
```
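
For the monitoring half, the same signals can be rolled up into counters across a batch of feeds; a minimal sketch (the counter scheme is my own, hypothetical):

```python
from collections import Counter

import feedparser

# Hypothetical health counters aggregated across many feeds
health = Counter()

def record_feed_health(url):
    result = feedparser.parse(url)
    health['total'] += 1
    if result.bozo:
        health['bozo'] += 1
        health[f"bozo:{type(result.bozo_exception).__name__}"] += 1
    if not result.entries:
        health['empty'] += 1
    return result

# After a batch run:
# print(f"{health['bozo']}/{health['total']} feeds reported issues")
```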