tessl/pypi-parsel

Parsel is a library to extract data from HTML and XML using XPath and CSS selectors

—

Pending

Overview

Eval results

Files

Element Modification

Name: tessl/pypi-parsel
Author: tessl

Methods for removing and modifying DOM elements within the parsed document structure. These operations modify the underlying document tree and affect subsequent queries.

Capabilities

Element Removal

Remove selected elements from their parent containers in the document tree.

def drop(self) -> None:
    """
    Drop matched nodes from the parent element.

    Removes the selected element from its parent in the DOM tree. The
    document is modified in place, so later queries no longer see the
    element; nothing is returned.

    The removal method depends on the document type:
    - HTML: Uses lxml's drop_tree() method
    - XML: Uses parent.remove() method

    Raises:
    - CannotRemoveElementWithoutRoot: Element has no root document
    - CannotDropElementWithoutParent: Element has no parent to remove from
      (a subclass of CannotRemoveElementWithoutParent)
    """

def remove(self) -> None:
    """
    Remove matched nodes from the parent element.

    Deprecated: Use drop() method instead.

    Warns:
    - DeprecationWarning: Issued when the method is called because it is
      deprecated. This is a warning, not a raised exception, unless
      warnings are configured to be treated as errors.

    Raises:
    - CannotRemoveElementWithoutRoot: Element has no root document
    - CannotRemoveElementWithoutParent: Element has no parent to remove from
    """

Usage Example:

from parsel import Selector

# Sample article that mixes real content with ads and a sidebar
html = """
<article>
    <h1>Article Title</h1>
    <div class="ads">Advertisement content</div>
    <p>First paragraph of content.</p>
    <div class="ads">Another advertisement</div>
    <p>Second paragraph of content.</p>
    <div class="sidebar">Sidebar content</div>
</article>
"""

selector = Selector(text=html)

# Drop every advertisement block straight from the query result
selector.css('.ads').drop()

# The serialized article should no longer mention the ads
remaining_content = selector.css('article').get()
print("Ads removed:", "ads" not in remaining_content)

# Drop the sidebar the same way
selector.css('.sidebar').drop()

# Collect the tag names of the article's remaining direct children
final_structure = selector.css('article > *')
elements = [node.root.tag for node in final_structure]
# Returns: ['h1', 'p', 'p']

Batch Element Removal

Remove multiple elements using SelectorList operations.

Usage Example:

html_with_cleanup = """
<div class="content">
    <h2>Important Heading</h2>
    <script>tracking_code();</script>
    <p>Valuable content paragraph.</p>
    <div class="popup">Popup modal</div>
    <p>Another valuable paragraph.</p>
    <noscript>No JavaScript message</noscript>
    <footer>Footer content</footer>
</div>
"""

selector = Selector(text=html_with_cleanup)

# Remove multiple unwanted element types at once
unwanted = selector.css('script, .popup, noscript')
unwanted.drop()

# Verify cleanup by querying for the dropped elements: an empty result list
# is falsy. A substring test such as `"script" not in html` is unreliable
# here because "script" is also part of "noscript" and may appear in
# ordinary text, so the individual checks would not be independent.
print("Scripts removed:", not selector.css('script'))
print("Popups removed:", not selector.css('.popup'))
print("Noscript removed:", not selector.css('noscript'))

# Serialized markup of the cleaned container
cleaned_content = selector.css('.content').get()

# Extract clean content
clean_paragraphs = selector.css('p::text').getall()
# Returns: ['Valuable content paragraph.', 'Another valuable paragraph.']

Conditional Element Removal

Remove elements based on content or attribute conditions.

Usage Example:

html_with_conditions = """
<div class="comments">
    <div class="comment" data-score="5">Great article!</div>
    <div class="comment" data-score="1">Spam content here</div>
    <div class="comment" data-score="4">Very helpful, thanks.</div>
    <div class="comment" data-score="2">Not very useful</div>
    <div class="comment" data-score="5">Excellent explanation!</div>
</div>
"""

selector = Selector(text=html_with_conditions)

# Remove low-quality comments (score <= 2).
# XPath converts the attribute value to a number for the comparison.
low_quality = selector.xpath('//div[@class="comment"][@data-score <= 2]')
low_quality.drop()

# Verify only high-quality comments remain
remaining_scores = selector.css('.comment').xpath('./@data-score').getall()
# Returns: ['5', '4', '5'] - only scores > 2

# Remove comments containing specific text.
# XPath's contains() is case-sensitive, so contains(text(), "spam") would not
# match "Spam content here"; the EXSLT re:test() function with the "i" flag
# matches regardless of case.
# In this sample the only spam comment had score 1 and was already dropped
# above, so the query matches nothing and drop() on the empty result is a
# no-op.
spam_comments = selector.xpath('//div[@class="comment"][re:test(., "spam", "i")]')
spam_comments.drop()

Targeted Content Removal

Remove specific content while preserving structure.

Usage Example:

# Review markup containing a tracking span and an affiliate block to strip
html_with_mixed_content = """
<article>
    <h1>Product Review</h1>
    <div class="meta">
        <span class="author">John Doe</span>
        <span class="date">2024-01-15</span>
        <span class="tracking" data-track="view">TRACK123</span>
    </div>
    <div class="content">
        <p>This product is amazing!</p>
        <div class="affiliate-link">
            <a href="/affiliate?id=123">Buy Now - Special Offer!</a>
        </div>
        <p>I highly recommend it to everyone.</p>
    </div>
</article>
"""

selector = Selector(text=html_with_mixed_content)

# One grouped CSS query covers both kinds of unwanted element
selector.css('[data-track], .affiliate-link').drop()

# The review paragraphs are still there
article_text = selector.css('.content p::text').getall()
# Returns: ['This product is amazing!', 'I highly recommend it to everyone.']

# Author and date spans are preserved in the meta block
meta_info = selector.css('.meta span:not(.tracking)::text').getall()
# Returns: ['John Doe', '2024-01-15']

Exception Handling

Element modification operations can raise specific exceptions that should be handled appropriately.

Exception Types

class CannotRemoveElementWithoutRoot(Exception):
    """
    Raised when attempting to remove an element that has no root document.

    Common causes:
    - Trying to remove text nodes or pseudo-elements, e.g. the results of
      a '::text' or '//text()' query instead of the elements themselves
    - Working with detached elements
    """

class CannotRemoveElementWithoutParent(Exception):
    """
    Raised when attempting to remove an element that has no parent.

    Base class of CannotDropElementWithoutParent, so a handler for this
    exception also catches the drop()-specific variant.

    Common causes:
    - Trying to remove the root element
    - Working with already-removed elements
    """

class CannotDropElementWithoutParent(CannotRemoveElementWithoutParent):
    """
    Specific exception for drop() operations.

    Inherits from CannotRemoveElementWithoutParent, so code that already
    catches CannotRemoveElementWithoutParent also handles this exception.
    """

Exception Handling Example:

from parsel import Selector
from parsel.selector import (
    CannotRemoveElementWithoutRoot,
    CannotDropElementWithoutParent
)

html = """
<div>
    <p>Paragraph with <em>emphasis</em> text.</p>
    <ul>
        <li>Item 1</li>
        <li>Item 2</li>
    </ul>
</div>
"""

selector = Selector(text=html)

# Safe element removal with exception handling
def safe_remove_elements(selector, css_query):
    """
    Drop every element matching css_query from selector's document.

    Returns True when the drop completed, or False (after printing a
    message) when one of the matched nodes could not be removed.
    """
    try:
        elements = selector.css(css_query)
        elements.drop()
        return True
    except CannotRemoveElementWithoutRoot:
        print(f"Cannot remove {css_query}: elements have no root")
        return False
    except CannotDropElementWithoutParent:
        print(f"Cannot remove {css_query}: elements have no parent")
        return False

# Remove list items safely
success = safe_remove_elements(selector, 'li')
print(f"List items removed: {success}")

# Try to remove text nodes (will fail gracefully)
text_nodes = selector.xpath('//text()')
try:
    text_nodes.drop()
except CannotRemoveElementWithoutRoot as e:
    print(f"Expected error: {e}")

# Try to remove the document root (will fail).
# Selector(text=...) parses the input as HTML by default, which nests the
# fragment's <div> under <html><body>. Dropping that <div> would therefore
# succeed and silently delete the content; the element without a parent is
# <html> itself.
try:
    root_element = selector.css('html')
    if root_element:
        root_element[0].drop()  # Try to remove root
except CannotDropElementWithoutParent as e:
    print(f"Cannot remove root: {e}")

Document State After Modification

Element removal permanently modifies the document structure:

- Subsequent queries reflect the modified document state
- Removed elements are no longer accessible via selectors
- Parent-child relationships are updated automatically
- Document serialization excludes removed elements

State Tracking Example:

# Navigation markup with one highlighted entry
html = """
<nav>
    <ul>
        <li><a href="/home">Home</a></li>
        <li class="active"><a href="/products">Products</a></li>
        <li><a href="/contact">Contact</a></li>
    </ul>
</nav>
"""

selector = Selector(text=html)

# How many list items exist before the tree is modified
initial_count = len(selector.css('li'))
print(f"Initial list items: {initial_count}")  # 3

# Drop the highlighted entry straight from the query result
selector.css('li.active').drop()

# A fresh query reflects the modified tree
final_count = len(selector.css('li'))
print(f"Remaining list items: {final_count}")  # 2

# The dropped item can no longer be selected
active_check = selector.css('li.active')
print(f"Active items found: {len(active_check)}")  # 0

# The serialized markup no longer mentions the class either
final_html = selector.css('nav').get()
print("Active class removed:", "active" not in final_html)