tessl/pypi-mechanicalsoup

A Python library for automating interaction with websites, providing web scraping and form submission capabilities.

—

Pending

Quality

Pending

Does it follow best practices?

Impact

Pending

No eval scenarios have been run

Overview

Eval results

Files

Stateful Web Navigation

Name: tessl/pypi-mechanicalsoup
Author: tessl

High-level browser that maintains page state and provides convenient methods for navigation, link following, and multi-step web interactions. StatefulBrowser inherits from Browser and is recommended for most web automation applications.

Capabilities

Browser Creation

Create a StatefulBrowser instance with all Browser configuration options.

class StatefulBrowser(Browser):
    """Browser that remembers the current page, its URL and the selected form."""

    def __init__(self, *args, **kwargs):
        """
        Create a StatefulBrowser instance.

        Parameters:
        - *args, **kwargs: Forwarded unchanged to Browser.__init__()
          (for example raise_on_404 or user_agent)
        """

Usage Example:

import mechanicalsoup

# Basic stateful browser with the default Browser settings
browser = mechanicalsoup.StatefulBrowser()

# With custom configuration (keyword arguments are forwarded to Browser.__init__)
browser = mechanicalsoup.StatefulBrowser(
    raise_on_404=True,  # raise LinkNotFoundError when a page answers 404
    user_agent="MyScript/1.0"  # value sent in the User-Agent header
)

Page State Properties

Access current page state including content, URL, and selected form.

@property
def page(self):
    """
    Current page as a BeautifulSoup object (read-only).

    None until a page has been opened, and None when the last response
    was not HTML.
    """
    
@property 
def url(self):
    """
    URL of the current page as a string (read-only).

    None until a page has been opened.
    """
    
@property
def form(self):
    """
    Currently selected Form object (read-only).

    Raises:
    AttributeError if no form has been selected on the current page
    (call select_form() first). The property never returns None, so it
    cannot be used in a plain truthiness test.
    """

Usage Example:

browser = mechanicalsoup.StatefulBrowser()
browser.open("https://httpbin.org/html")

# Access current page content (page.title is None when the page has no <title>)
title = browser.page.title
print(title.string if title is not None else "(no title)")

# Access current URL
print(f"Current URL: {browser.url}")

# Access selected form. The property raises AttributeError until
# select_form() has been called on the current page, so it cannot be
# tested with a plain `if browser.form:`.
try:
    form = browser.form
except AttributeError:
    print("No form selected")
else:
    print(f"Current form action: {form.form.get('action')}")

Page Navigation

Navigate to URLs and manage page state.

def open(self, url, *args, **kwargs):
    """
    Open URL and update browser state.

    The current page and URL are replaced by the response, and any
    previously selected form is forgotten.

    Parameters:
    - url: URL to open
    - *args, **kwargs: Forwarded to Browser.get()

    Returns:
    requests.Response with soup attribute (soup is None when the
    response is not HTML)
    """

def open_fake_page(self, page_text, url=None, soup_config=None):
    """
    Mock page loading for testing purposes.

    Behaves as if a page with the given text had been opened, without
    performing any network access.

    Parameters:
    - page_text: HTML content as string
    - url: Optional URL to associate with fake page
    - soup_config: Optional BeautifulSoup config override; the browser's
      own soup_config is used when None
    """

def open_relative(self, url, *args, **kwargs):
    """
    Open a URL that may be relative to the current page.

    Parameters:
    - url: Relative (or absolute) URL, resolved against the current page URL
    - *args, **kwargs: Forwarded to open()

    Returns:
    requests.Response with soup attribute, as returned by open()
    """

def refresh(self):
    """
    Reload the current page by re-sending the request that produced it.

    Any change made to the page since it was loaded (form selection,
    filled-in values) is discarded.

    Returns:
    requests.Response with soup attribute

    Raises:
    ValueError if there is no request to replay, e.g. no page is open or
    the page was loaded with open_fake_page()
    """

def absolute_url(self, url):
    """
    Convert relative URL to absolute based on current page.

    No request is made; the URL is only resolved against the current
    page URL.

    Parameters:
    - url: Relative or absolute URL

    Returns:
    Absolute URL string
    """

Usage Example:

browser = mechanicalsoup.StatefulBrowser()

# Open initial page
browser.open("https://httpbin.org/")

# Navigate to a URL relative to the current page
browser.open_relative("/forms/post")

# Refresh current page (re-sends the request that loaded it)
browser.refresh()

# Convert relative to absolute URL; the current page is now
# https://httpbin.org/forms/post, so "../" climbs out of /forms/
abs_url = browser.absolute_url("../status/200")
print(abs_url)  # https://httpbin.org/status/200

Link Discovery and Following

Find and follow links on the current page.

def links(self, url_regex=None, link_text=None, *args, **kwargs):
    """
    Get links from current page matching criteria.

    Only <a> tags that carry an href attribute are considered.

    Parameters:
    - url_regex: Regular expression (string or compiled pattern) searched
      for anywhere in the link's href
    - link_text: Exact text of the link; a link matches only when its
      whole text equals this value (no substring matching)
    - *args, **kwargs: Additional parameters forwarded to BeautifulSoup's
      find_all()

    Returns:
    List of BeautifulSoup Tag objects (empty when nothing matches)
    """

def list_links(self, *args, **kwargs):
    """
    Print all links in current page for debugging.

    Parameters:
    - *args, **kwargs: Forwarded to links() to restrict which links are printed
    """

def find_link(self, *args, **kwargs):
    """
    Find single link matching criteria.

    Parameters:
    - *args, **kwargs: Same search arguments as links()

    Returns:
    BeautifulSoup Tag object for the first matching link

    Raises:
    LinkNotFoundError if no link matches (this method never returns None)
    """

def follow_link(self, link=None, *bs4_args, bs4_kwargs={}, requests_kwargs={}, **kwargs):
    """
    Follow a link and update browser state.

    Parameters:
    - link: Link Tag object (e.g. from links() or find_link()). If it is
      None or has no href attribute, it is treated as a url_regex and the
      link is looked up on the current page with find_link()
    - bs4_args, bs4_kwargs: Search parameters forwarded to find_link()
      when the link has to be looked up
    - requests_kwargs: Parameters for the HTTP request, forwarded to
      open_relative()
    - **kwargs: Additional search parameters, also forwarded to
      find_link() (kept for backward compatibility)

    Returns:
    requests.Response with soup attribute

    Raises:
    LinkNotFoundError if no matching link exists on the current page
    """

Usage Example:

import re

browser = mechanicalsoup.StatefulBrowser()
browser.open("https://httpbin.org/")

# Get all links
all_links = browser.links()
print(f"Found {len(all_links)} links")

# Find links with specific text (link_text must equal the link's full text)
status_links = browser.links(link_text="Status codes")

# Find link by URL pattern
json_links = browser.links(url_regex=re.compile(r"/json"))

# Follow first link
if all_links:
    response = browser.follow_link(all_links[0])
    print(f"Followed to: {browser.url}")

# Follow link by search criteria. follow_link() searches the *current* page,
# which changed when the first link was followed, so go back to the start
# page first. It raises LinkNotFoundError when nothing matches.
browser.open("https://httpbin.org/")
try:
    browser.follow_link(link_text="Forms")
except mechanicalsoup.LinkNotFoundError:
    print("No link with text 'Forms' on this page")

Link Download

Download the target of a link to a file. Unlike following a link, downloading does not change the browser's current page.

def download_link(self, link=None, file=None, *bs4_args, bs4_kwargs={}, 
                  requests_kwargs={}, **kwargs):
    """
    Download link content to file.

    Works like follow_link(), except that the browser state (current
    page, URL, selected form) is left unchanged.

    Parameters:
    - link: Link Tag object, or search criteria if None
    - file: Filesystem path to write the response body to; an existing
      file is overwritten. Nothing is written when None. This must be a
      path, not an open file-like object
    - bs4_args, bs4_kwargs: BeautifulSoup search parameters if link is None
    - requests_kwargs: Parameters for the HTTP request
    - **kwargs: Additional search parameters

    Returns:
    requests.Response object

    Raises:
    LinkNotFoundError if no matching link exists on the current page
    """

Usage Example:

browser = mechanicalsoup.StatefulBrowser()
browser.open("https://httpbin.org/")

# Download first link to file (file is a filesystem path)
links = browser.links()
if links:
    browser.download_link(links[0], file="downloaded.html")

# Download by search criteria; the browser is still on the page opened
# above because download_link() does not navigate
browser.download_link(link_text="JSON", file="api_doc.html")

Form Selection and Interaction

Select and interact with forms on the current page.

def select_form(self, selector="form", nr=0):
    """
    Select a form on the current page.

    Parameters:
    - selector: CSS selector string, or a BeautifulSoup Tag that is itself
      the <form> element. The default "form" matches every form on the page
    - nr: Form index if multiple matches (0-based)

    Returns:
    Form object (also available afterwards through the form property)

    Raises:
    LinkNotFoundError if no form matches the selector at index nr
    """

def submit_selected(self, btnName=None, update_state=True, **kwargs):
    """
    Submit the currently selected form.

    Parameters:
    - btnName: Submit button to use. None picks the first valid submit
      element of the form, if there is one
    - update_state: Whether to update browser state with response. Pass
      False to leave the current page untouched, e.g. when the submission
      triggers a file download
    - **kwargs: Additional request parameters

    Returns:
    requests.Response with soup attribute

    Raises:
    AttributeError if no form has been selected
    """

def new_control(self, type, name, value, **kwargs):
    """
    Add new control to selected form.

    Delegates to new_control() of the currently selected Form, so a form
    must have been selected first.

    Parameters:
    - type: Input type (text, hidden, etc.)
    - name: Control name
    - value: Control value
    - **kwargs: Additional attributes
    """

def __setitem__(self, name, value):
    """
    Set form field value using bracket notation.

    browser[name] = value assigns to the field called name on the
    currently selected form, so a form must have been selected first.
    """

Usage Example:

browser = mechanicalsoup.StatefulBrowser()
browser.open("https://httpbin.org/forms/post")

# Select form by CSS selector (raises LinkNotFoundError if nothing matches)
browser.select_form('form[action="/post"]')

# Set form fields on the selected form
browser["custname"] = "John Doe"
browser["custtel"] = "555-1234"

# Add new hidden field
browser.new_control("hidden", "session_id", "abc123")

# Submit form; the response body is read as JSON here
response = browser.submit_selected()
print(response.json())

Debug and Development Tools

Tools for debugging web automation workflows.

def set_debug(self, debug):
    """
    Enable/disable debug mode (off by default).

    While debug mode is on, some failing actions (for example a link or
    form that cannot be found) open the current page in a browser so it
    can be inspected.

    Parameters:
    - debug: Boolean debug flag
    """

def get_debug(self):
    """
    Get current debug mode status.

    Returns:
    The flag last passed to set_debug() (False by default)
    """

def set_verbose(self, verbose):
    """
    Set verbosity level.

    Parameters:
    - verbose: Integer verbosity level. 0 prints nothing, 1 prints one
      dot per visited page, 2 or more prints each visited URL
    """

def get_verbose(self):
    """
    Get current verbosity level.

    Returns:
    The integer last passed to set_verbose() (0 by default)
    """

def launch_browser(self, soup=None):
    """
    Launch external browser with current or specified page.

    Intended for debugging: the page content is handed to the system web
    browser for visual inspection.

    Parameters:
    - soup: Optional BeautifulSoup object, uses current page if None
    """

Usage Example:

browser = mechanicalsoup.StatefulBrowser()

# Enable debug mode (failed lookups open the page in a browser)
browser.set_debug(True)

# Set high verbosity (level 2 prints every visited URL)
browser.set_verbose(2)

# Launch browser for visual debugging
browser.open("https://httpbin.org/forms/post")
browser.launch_browser()  # Opens current page in system browser

Legacy Compatibility Methods

Deprecated methods maintained for backward compatibility.

def get_current_page(self):
    """Deprecated: Use .page property instead (returns the same object)"""

def get_current_form(self):
    """
    Deprecated: Use .form property instead.

    Unlike the .form property, this returns None rather than raising
    AttributeError when no form has been selected.
    """

def get_url(self):
    """Deprecated: Use .url property instead (returns the same string)"""

Complete Navigation Workflow Example

import re

import mechanicalsoup

# Create browser and enable progress output (verbosity 1 prints one dot
# per visited page; this is not the same as debug mode)
browser = mechanicalsoup.StatefulBrowser(user_agent="MyBot/1.0")
browser.set_verbose(1)

try:
    # Navigate to a form page
    browser.open("https://httpbin.org/forms/post")

    # Examine current page (page.title is None when the page has no <title>)
    title = browser.page.title
    print(f"Page title: {title.string if title is not None else None}")
    print(f"Current URL: {browser.url}")

    # Find and select form (default selector picks the first form)
    browser.select_form()

    # Fill form fields
    browser["custname"] = "Jane Smith"
    browser["custtel"] = "555-9876"
    browser.form.set_radio({"size": "large"})

    # Submit and follow response
    response = browser.submit_selected()
    print(f"Form submitted to: {browser.url}")

    # Navigate using links
    browser.open("https://httpbin.org/")
    json_links = browser.links(url_regex=re.compile(r"/json"))
    if json_links:
        # A non-HTML response is not parsed, so browser.page is None after
        # following a JSON link; read the body from the response instead.
        response = browser.follow_link(json_links[0])
        print(f"JSON endpoint content: {response.text}")
finally:
    # Clean up, even if one of the steps above raised
    browser.close()

Install with Tessl CLI