Expert guidance for HTML/XML parsing using BeautifulSoup in Python with best practices for DOM navigation, data extraction, and efficient scraping workflows.
Install with Tessl CLI
npx tessl i github:mindrally/skills --skill beautifulsoup-parsing60
Does it follow best practices?
If you maintain this skill, you can automatically optimize it using the tessl CLI to improve its score:
npx tessl skill review --optimize ./path/to/skill

Validation for skill structure
You are an expert in BeautifulSoup, Python HTML/XML parsing, DOM navigation, and building efficient data extraction pipelines for web scraping.
# Install dependencies (shell command, not Python):
#   pip install beautifulsoup4 requests lxml
from bs4 import BeautifulSoup
import requests

# From string
html = '<html><body><h1>Hello</h1></body></html>'
soup = BeautifulSoup(html, 'lxml')

# From file
with open('page.html', 'r', encoding='utf-8') as f:
    soup = BeautifulSoup(f, 'lxml')

# From URL
response = requests.get('https://example.com')
soup = BeautifulSoup(response.content, 'lxml')

# Choosing a parser:
# lxml - fast, lenient (recommended)
soup = BeautifulSoup(html, 'lxml')
# html.parser - built-in, no dependencies
soup = BeautifulSoup(html, 'html.parser')
# html5lib - most lenient, slowest
soup = BeautifulSoup(html, 'html5lib')
# lxml-xml - for XML documents (xml is your XML source string)
soup = BeautifulSoup(xml, 'lxml-xml')

# Finding elements: first matching element
# First matching element
soup.find('h1')

# All matching elements
soup.find_all('p')

# Shorthand attribute access
soup.h1  # same as soup.find('h1')

# By class
soup.find('div', class_='article')
soup.find_all('div', class_='article')

# By ID
soup.find(id='main-content')

# By any attribute
soup.find('a', href='https://example.com')
soup.find_all('input', attrs={'type': 'text', 'name': 'email'})

# By data attributes
soup.find('div', attrs={'data-id': '123'})

# CSS selectors: single element
soup.select_one('div.article > h2')

# Multiple elements
soup.select('div.article h2')

# Complex selectors
soup.select('a[href^="https://"]')  # starts with
soup.select('a[href$=".pdf"]')      # ends with
soup.select('a[href*="example"]')   # contains
soup.select('li:nth-child(2)')
soup.select('h1, h2, h3')           # multiple tags at once
import re

# By regex
soup.find_all('a', href=re.compile(r'^https://'))

# By predicate function: receives each Tag, returns True to match
def has_data_attr(tag):
    """Match tags that carry a data-id attribute."""
    return tag.has_attr('data-id')

soup.find_all(has_data_attr)

# String (text node) matching
soup.find_all(string='exact text')
soup.find_all(string=re.compile('pattern'))

# Extracting text
# Get text
element.text
element.get_text()

# Get text with separator between nested strings
element.get_text(separator=' ')

# Get stripped text
element.get_text(strip=True)

# Get strings (generator, whitespace-stripped)
for string in element.stripped_strings:
    print(string)

# Get attribute
element['href']                 # raises KeyError if missing
element.get('href')             # returns None if missing
element.get('href', 'default')  # with default

# Get all attributes
element.attrs  # returns dict

# Check attribute exists
element.has_attr('class')

# Inner HTML
str(element)

# Just the tag name
element.name

# Prettified HTML
element.prettify()

# Navigating up: direct parent
element.parent
element.parents  # generator of all ancestors

# Find a specific ancestor
for parent in element.parents:
    if parent.name == 'div' and 'article' in parent.get('class', []):
        break

# Navigating down
element.children     # direct children (generator)
list(element.children)
element.contents     # direct children (list)
element.descendants  # all descendants (generator)

# Find in children
element.find('span')  # searches all descendants

# Navigating sideways
element.next_sibling
element.previous_sibling
element.next_siblings      # generator
element.previous_siblings  # generator

# Next element in document parse order (skips whitespace-only nodes)
element.next_element
# element.previous_element — previous element in document parse order

def safe_text(element, selector, default=''):
    """Safely extract stripped text from the first match of *selector*.

    Returns *default* when nothing matches, instead of raising
    AttributeError on None.
    """
    found = element.select_one(selector)
    return found.get_text(strip=True) if found else default


def safe_attr(element, selector, attr, default=None):
    """Safely extract attribute *attr* from the first match of *selector*.

    Returns *default* when nothing matches or the attribute is absent.
    """
    found = element.select_one(selector)
    return found.get(attr, default) if found else default
def extract_table(table):
    """Extract table data as a list of dicts keyed by header text.

    Rows with no <td> cells (e.g. header-only rows) are skipped.
    Falls back to scanning every <tr> when the table has no explicit
    <tbody>, so minimal HTML tables still yield data.
    """
    headers = [th.get_text(strip=True) for th in table.select('th')]
    rows = []
    body_rows = table.select('tbody tr') or table.select('tr')
    for tr in body_rows:
        cells = [td.get_text(strip=True) for td in tr.select('td')]
        if cells:
            rows.append(dict(zip(headers, cells)))
    return rows
def extract_items(soup, selector, extractor):
    """Extract multiple items using a custom per-item extractor function.

    *extractor* receives each element matched by *selector* and returns
    whatever representation you need (typically a dict).
    """
    return [extractor(item) for item in soup.select(selector)]

# Usage
def extract_product(item):
    """Map one product card element to a plain dict."""
    return {
        'name': safe_text(item, '.name'),
        'price': safe_text(item, '.price'),
        'url': safe_attr(item, 'a', 'href')
    }

products = extract_items(soup, '.product', extract_product)
def resolve_url(base_url, relative_url):
    """Convert a possibly-relative URL to absolute against *base_url*.

    Returns None for empty/None input instead of raising, so it is safe
    on links whose href attribute is missing.
    """
    if not relative_url:
        return None
    return urljoin(base_url, relative_url)
# Usage
base_url = 'https://example.com/products/'
for link in soup.select('a'):
    href = link.get('href')
    absolute_url = resolve_url(base_url, href)
    print(absolute_url)

# Handling malformed HTML:
# the lxml parser is lenient with malformed HTML
soup = BeautifulSoup(malformed_html, 'lxml')
# for very broken HTML, use html5lib
soup = BeautifulSoup(very_broken_html, 'html5lib')

# Handle encoding issues by trusting the detected encoding
response = requests.get(url)
response.encoding = response.apparent_encoding
soup = BeautifulSoup(response.text, 'lxml')

# Complete scraper example
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import time
class ProductScraper:
    """Scrape product listings from a site using one persistent session."""

    def __init__(self, base_url):
        self.base_url = base_url
        # Reuse one session for connection pooling and shared headers.
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (compatible; MyScraper/1.0)'
        })

    def fetch_page(self, url):
        """Fetch *url* and return a parsed BeautifulSoup tree.

        Raises requests.HTTPError on non-2xx responses.
        """
        response = self.session.get(url, timeout=30)
        response.raise_for_status()
        return BeautifulSoup(response.content, 'lxml')

    def extract_product(self, item):
        """Extract product data from a single product-card element."""
        return {
            'name': self._safe_text(item, '.product-title'),
            'price': self._parse_price(item.select_one('.price')),
            'rating': self._safe_attr(item, '.rating', 'data-rating'),
            'image': self._resolve(self._safe_attr(item, 'img', 'src')),
            'url': self._resolve(self._safe_attr(item, 'a', 'href')),
            # In stock when no .out-of-stock marker element is present.
            'in_stock': not item.select_one('.out-of-stock')
        }

    def scrape_products(self, url):
        """Scrape all product cards from a listing page."""
        soup = self.fetch_page(url)
        items = soup.select('.product-card')
        return [self.extract_product(item) for item in items]

    def _safe_text(self, element, selector, default=''):
        # Stripped text of the first match, or default when absent.
        found = element.select_one(selector)
        return found.get_text(strip=True) if found else default

    def _safe_attr(self, element, selector, attr, default=None):
        # Attribute of the first match, or default when absent.
        found = element.select_one(selector)
        return found.get(attr, default) if found else default

    def _parse_price(self, element):
        # '$1,234.56' -> 1234.56; None when missing or unparseable.
        if not element:
            return None
        text = element.get_text(strip=True)
        try:
            return float(text.replace('$', '').replace(',', ''))
        except ValueError:
            return None

    def _resolve(self, url):
        # Absolutize relative URLs against the scraper's base URL.
        return urljoin(self.base_url, url) if url else None
# Usage
scraper = ProductScraper('https://example.com')
products = scraper.scrape_products('https://example.com/products')
for product in products:
    print(product)
# Use SoupStrainer to parse only the elements you need
from bs4 import SoupStrainer

only_articles = SoupStrainer('article')
soup = BeautifulSoup(html, 'lxml', parse_only=only_articles)

# Use the lxml parser for speed
soup = BeautifulSoup(html, 'lxml')  # fastest

# Decompose unneeded elements to shrink the tree
for script in soup.find_all('script'):
    script.decompose()

# Use generators for memory efficiency.
# NOTE: `yield` is only valid inside a function, so wrap the loop
# in a generator function rather than writing it at module level.
def iter_items(soup):
    for item in soup.select('.item'):
        yield extract_data(item)

# Key APIs: select() / select_one() for CSS selectors;
# get_text(strip=True) for clean text extraction.
If you maintain this skill, you can claim it as your own. Once claimed, you can manage eval scenarios, bundle related skills, attach documentation or rules, and ensure cross-agent compatibility.