Python library for pulling data out of HTML and XML files with pluggable parser architecture and intuitive navigation API
—
Quality
Pending
Does it follow best practices?
Impact
Pending
No eval scenarios have been run
Extract text content, attribute values, and formatted output from parse tree elements with flexible filtering and formatting options. Beautiful Soup provides multiple ways to access and extract different types of content from HTML/XML documents.
Extract text content from elements with various filtering and formatting options.
def get_text(self, separator="", strip=False, types=(NavigableString,)):
"""
Extract all text content from this element and its descendants.
Parameters:
- separator: str - string to join text pieces (default: "")
- strip: bool - strip whitespace from each piece (default: False)
- types: tuple - NavigableString types to include (default: (NavigableString,))
Returns:
str - concatenated text content
"""
@property
def text(self):
"""
All text content concatenated without separators.
Equivalent to get_text()
Returns:
str
"""
@property
def string(self):
"""
The single NavigableString child, or None if multiple children.
Returns string content only if element has exactly one string child.
Returns:
NavigableString or None
"""
@property
def strings(self):
"""
Generator yielding all NavigableString descendants.
Yields:
NavigableString instances in document order
"""
@property
def stripped_strings(self):
"""
Generator yielding all non-empty NavigableString descendants with whitespace stripped.
Yields:
str - stripped string content (empty strings excluded)
"""Usage Examples:
from bs4 import BeautifulSoup, Comment
html = '''
<div class="article">
<h1>Article Title</h1>
<!-- This is a comment -->
<p>First paragraph with <em>emphasis</em> and <strong>bold</strong> text.</p>
<p> Second paragraph with extra whitespace. </p>
<script>console.log('script content');</script>
</div>
'''
soup = BeautifulSoup(html, 'html.parser')
article = soup.find('div', class_='article')
# Basic text extraction
all_text = article.get_text()
print(all_text) # All text concatenated
# Text with separators
spaced_text = article.get_text(' ')
line_separated = article.get_text('\n')
print(spaced_text) # Words separated by spaces
print(line_separated) # Elements separated by newlines
# Stripped text (removes extra whitespace)
clean_text = article.get_text(' ', strip=True)
print(clean_text) # Clean, properly spaced text
# Include different string types
from bs4 import NavigableString, Comment, CData
# Default - only NavigableString (excludes comments, scripts, etc.)
text_only = article.get_text(types=(NavigableString,))
# Include comments
with_comments = article.get_text(types=(NavigableString, Comment))
# Direct property access
print(article.text) # Same as get_text()
# Single string access
title = soup.find('h1')
print(title.string) # "Article Title" (single string child)
paragraph = soup.find('p')
print(paragraph.string) # None (has multiple children including tags)
# Iterate over all strings
for string in article.strings:
print(repr(string)) # Shows all text nodes including whitespace
# Iterate over stripped strings (non-empty only)
for string in article.stripped_strings:
print(repr(string)) # Clean text content only
Access and manipulate element attributes with dictionary-like interface.
def get(self, key, default=None):
"""
Get attribute value with optional default.
Parameters:
- key: str - attribute name
- default: value to return if attribute doesn't exist
Returns:
Attribute value (str or list for class), or default
"""
def has_attr(self, key):
"""
Check if element has the specified attribute.
Parameters:
- key: str - attribute name
Returns:
bool
"""
def __getitem__(self, key):
"""
Get attribute value using dictionary syntax.
Parameters:
- key: str - attribute name
Returns:
Attribute value
Raises:
KeyError if attribute doesn't exist
"""
@property
def attrs(self):
"""
Dictionary of all element attributes.
Returns:
dict - attribute name/value pairs
"""Usage Examples:
html = '''
<div id="main" class="container highlight" data-value="123" title="Main container">
<a href="https://example.com" target="_blank" rel="noopener">Link</a>
<img src="image.jpg" alt="Description" width="100" height="200">
</div>
'''
soup = BeautifulSoup(html, 'html.parser')
div = soup.find('div')
link = soup.find('a')
img = soup.find('img')
# Get attributes with default
print(div.get('id')) # 'main'
print(div.get('data-value')) # '123'
print(div.get('nonexistent', 'default')) # 'default'
# Dictionary-style access
print(div['id']) # 'main'
print(link['href']) # 'https://example.com'
# Check attribute existence
if div.has_attr('class'):
print('Div has class attribute')
if not img.has_attr('alt'):
print('Image missing alt text')
# Access all attributes
print(div.attrs)
# {'id': 'main', 'class': ['container', 'highlight'],
# 'data-value': '123', 'title': 'Main container'}
# Special handling for class attribute (always a list)
print(div['class']) # ['container', 'highlight']
print(type(div['class'])) # <class 'list'>
# Iterate over attributes
for attr_name, attr_value in div.attrs.items():
print(f'{attr_name}: {attr_value}')
Identify and work with different types of content within elements.
# Content type checking
def isinstance(obj, class_or_tuple):
"""Check if object is instance of NavigableString subclass"""
# NavigableString types
class NavigableString(str):
"""Regular text content"""
class Comment(NavigableString):
"""HTML/XML comments"""
class CData(NavigableString):
"""CDATA sections"""
class ProcessingInstruction(NavigableString):
"""XML processing instructions"""
class Doctype(NavigableString):
"""DOCTYPE declarations"""Usage Examples:
from bs4 import BeautifulSoup, NavigableString, Comment, CData
html = '''
<div>
Regular text
<!-- This is a comment -->
<![CDATA[This is CDATA]]>
<?xml version="1.0"?>
<p>Paragraph text</p>
</div>
'''
soup = BeautifulSoup(html, 'lxml') # lxml better for mixed content
div = soup.find('div')
# Iterate and identify content types
for content in div.contents:
if isinstance(content, Comment):
print(f"Comment: {content}")
elif isinstance(content, CData):
print(f"CDATA: {content}")
elif isinstance(content, NavigableString):
if content.strip(): # Skip empty whitespace
print(f"Text: {content.strip()}")
elif hasattr(content, 'name'): # It's a Tag
print(f"Tag: {content.name}")
# Filter by content type
comments = [c for c in div.contents if isinstance(c, Comment)]
text_nodes = [c for c in div.strings if isinstance(c, NavigableString)]
Common patterns for extracting structured data from HTML documents.
# Common extraction patterns
def extract_links(soup):
"""Extract all links with href and text"""
def extract_images(soup):
"""Extract image sources and alt text"""
def extract_tables(soup):
"""Extract table data as list of dictionaries"""
def extract_forms(soup):
"""Extract form fields and actions"""Usage Examples:
html = '''
<div class="content">
<h2>Product List</h2>
<ul class="products">
<li data-id="1" data-price="29.99">
<a href="/product/1">Widget A</a>
<span class="price">$29.99</span>
</li>
<li data-id="2" data-price="39.99">
<a href="/product/2">Widget B</a>
<span class="price">$39.99</span>
</li>
</ul>
</div>
'''
soup = BeautifulSoup(html, 'html.parser')
# Extract structured product data
products = []
for item in soup.find_all('li', {'data-id': True}):
product = {
'id': item.get('data-id'),
'price': item.get('data-price'),
'name': item.find('a').get_text().strip(),
'url': item.find('a').get('href'),
'price_text': item.find('span', class_='price').get_text()
}
products.append(product)
print(products)
# [{'id': '1', 'price': '29.99', 'name': 'Widget A',
# 'url': '/product/1', 'price_text': '$29.99'}, ...]
# Extract all links
links = []
for link in soup.find_all('a', href=True):
links.append({
'url': link['href'],
'text': link.get_text().strip(),
'title': link.get('title', '')
})
# Extract metadata
metadata = {}
for meta in soup.find_all('meta'):
name = meta.get('name') or meta.get('property') or meta.get('http-equiv')
content = meta.get('content')
if name and content:
metadata[name] = content
Helper functions for cleaning and processing extracted text content.
import re
def clean_text(text):
    """Normalize whitespace in *text*.

    Collapses every internal run of whitespace (spaces, tabs, newlines)
    into a single space and trims leading/trailing whitespace.

    Parameters:
    - text: str - raw extracted text

    Returns:
    str - whitespace-normalized text
    """
    collapsed = re.sub(r'\s+', ' ', text)
    return collapsed.strip()
def extract_numbers(text):
    """Return every numeric token found in *text*, in order of appearance.

    Matches integers and decimals (e.g. '42', '29.99'); results are
    returned as strings, not converted to numeric types.

    Parameters:
    - text: str - text to scan

    Returns:
    list[str] - matched numeric substrings
    """
    number_pattern = re.compile(r'\d+\.?\d*')
    return number_pattern.findall(text)
def extract_emails(text):
    """Extract email-like substrings from *text*.

    Bug fix: the TLD character class was ``[A-Z|a-z]`` — inside a character
    class ``|`` is a literal pipe, so top-level domains containing '|' were
    wrongly accepted. ``[A-Za-z]`` restricts the TLD to letters only.

    Parameters:
    - text: str - text to scan

    Returns:
    list[str] - email addresses in order of appearance
    """
    return re.findall(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b', text)
def extract_urls(text):
"""Extract URLs from text"""
return re.findall(r'https?://[^\s<>"]+', text)
Usage Examples:
html = '''
<div class="contact">
Contact us at support@example.com or visit
https://example.com/contact for more info.
Phone: 555-123-4567
</div>
'''
soup = BeautifulSoup(html, 'html.parser')
contact_div = soup.find('div', class_='contact')
# Extract and clean text
raw_text = contact_div.get_text()
clean_text = re.sub(r'\s+', ' ', raw_text.strip())
print(clean_text)
# Extract specific data patterns from the cleaned text.
# NOTE: the TLD class is [A-Za-z], not [A-Z|a-z] — inside a character
# class '|' is a literal pipe and would wrongly allow '|' in the TLD.
emails = re.findall(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b', clean_text)
urls = re.findall(r'https?://[^\s<>"]+', clean_text)
phones = re.findall(r'\d{3}-\d{3}-\d{4}', clean_text)
print(f"Emails: {emails}")  # ['support@example.com']
print(f"URLs: {urls}")      # ['https://example.com/contact']
print(f"Phones: {phones}") # ['555-123-4567']Install with Tessl CLI
npx tessl i tessl/pypi-beautifulsoup4