tessl/pypi-html2text

Turn HTML into equivalent Markdown-structured text.

—

Pending

Overview

Eval results

Files

Utility Functions

Name: tessl/pypi-html2text
Author: tessl

Helper functions for text processing, CSS parsing, character escaping, and table formatting. These functions are used internally by html2text and are also available for advanced use cases requiring custom text processing.

Capabilities

Text Escaping and Processing

Functions for escaping markdown characters and processing text sections safely.

def escape_md(text: str) -> str:
    """
    Escape markdown-sensitive characters within markdown constructs.

    Prefixes characters that have special meaning in Markdown (such as
    brackets, parentheses, and backslashes) with a backslash so that they
    are treated as literal text rather than as formatting.

    Args:
        text: Text string to escape.

    Returns:
        Text with markdown characters escaped with backslashes.

    Example:
        The result below is displayed as a ``repr``, so each escaping
        backslash appears doubled; the string itself holds a single
        backslash before every escaped character.

        >>> from html2text.utils import escape_md
        >>> escape_md("Some [text] with (special) chars")
        'Some \\[text\\] with \\(special\\) chars'
    """

def escape_md_section(text: str, snob: bool = False) -> str:
    """
    Escape markdown-sensitive characters across a document section.

    More comprehensive than escaping inside a single construct: meant for
    whole sections of text, where content such as a leading "1." could
    otherwise be mistaken for Markdown formatting.

    Args:
        text: Text string to escape.
        snob: If True, escape additional characters for maximum safety.

    Returns:
        Text with markdown characters properly escaped.

    Example:
        >>> from html2text.utils import escape_md_section
        >>> escape_md_section("1. Item\\n2. Another", snob=True)
        '1\\. Item\\n2\\. Another'

    NOTE(review): the exact set of extra characters escaped when
    ``snob=True`` is not listed here, and the list-marker escaping in the
    example is presumably applied regardless of ``snob`` -- confirm both
    against the html2text implementation.
    """

Table Formatting

Functions for formatting and aligning table content in text output.

def pad_tables_in_text(text: str, right_margin: int = 1) -> str:
    """
    Add padding to tables in text for consistent column alignment.

    Processes text containing markdown tables and pads their cells so
    that every column has a consistent width, which improves readability.

    Args:
        text: Text containing markdown tables to format.
        right_margin: Additional padding spaces for right margin (default: 1).

    Returns:
        Text with properly padded and aligned tables.

    Example:
        >>> from html2text.utils import pad_tables_in_text
        >>> table_text = "| Name | Age |\\n| Alice | 30 |\\n| Bob | 25 |"
        >>> padded = pad_tables_in_text(table_text)
        >>> print(padded)
        | Name  | Age |
        | Alice | 30  |
        | Bob   | 25  |

    NOTE(review): html2text normally runs this over its own converter
    output. Verify whether a bare pipe table like the one above is padded
    as shown, or whether tables must first be delimited by the
    converter's internal table-padding marker.
    """

def reformat_table(lines: List[str], right_margin: int) -> List[str]:
    """
    Reformat table lines with consistent column widths.

    Takes the raw rows of a single table and returns them padded so that
    the columns line up.

    Args:
        lines: List of table row strings.
        right_margin: Right margin padding in spaces. This parameter has
            no default and must always be supplied.

    Returns:
        List of reformatted table lines with consistent alignment.
    """

CSS and Style Processing

Functions for parsing CSS styles and processing element styling, particularly useful for Google Docs HTML.

def dumb_property_dict(style: str) -> Dict[str, str]:
    """
    Parse a CSS style string into a property dictionary.

    Takes a CSS declaration string (such as the value of a ``style``
    attribute) and converts it into a dictionary of property-value pairs.

    Args:
        style: CSS style string with semicolon-separated property
            declarations.

    Returns:
        Dictionary mapping CSS property names to values (both lowercased).

    Example:
        >>> from html2text.utils import dumb_property_dict
        >>> style = "color: red; font-size: 14px; font-weight: bold"
        >>> props = dumb_property_dict(style)
        >>> print(props)
        {'color': 'red', 'font-size': '14px', 'font-weight': 'bold'}
    """

def dumb_css_parser(data: str) -> Dict[str, Dict[str, str]]:
    """
    Parse CSS style definitions into a structured format.

    Simple CSS parser that extracts style rules and their properties, for
    use when processing HTML with inline styles or embedded CSS.

    Args:
        data: CSS string to parse.

    Returns:
        Dictionary mapping each selector to a dictionary of its
        properties.

    Example:
        >>> from html2text.utils import dumb_css_parser
        >>> css = "p { color: red; font-size: 14px; }"
        >>> parsed = dumb_css_parser(css)
        >>> print(parsed)
        {'p': {'color': 'red', 'font-size': '14px'}}
    """

def element_style(
    attrs: Dict[str, Optional[str]], 
    style_def: Dict[str, Dict[str, str]], 
    parent_style: Dict[str, str]
) -> Dict[str, str]:
    """
    Compute final style attributes for an HTML element.

    Combines parent styles, CSS class styles, and inline styles to
    determine the effective styling for an element.

    Args:
        attrs: HTML element attributes dictionary.
        style_def: CSS style definitions from a stylesheet, keyed by
            selector.
        parent_style: Inherited styles from parent elements.

    Returns:
        Dictionary of final computed styles for the element.

    NOTE(review): the precedence between the three sources is not stated
    here; presumably inline styles override class styles, which override
    inherited ones -- confirm against the html2text implementation.
    """

def google_text_emphasis(style: Dict[str, str]) -> List[str]:
    """
    Extract text emphasis styles from Google Docs CSS.

    Inspects the CSS properties of an element to determine which text
    emphasis (bold, italic, underline, etc.) should be applied.

    Args:
        style: Dictionary of CSS style properties.

    Returns:
        List of emphasis style names found in the styles.

    NOTE(review): presumably driven by properties such as
    ``font-weight``, ``font-style`` and ``text-decoration`` -- confirm
    which properties are read against the html2text implementation.
    """

def google_fixed_width_font(style: Dict[str, str]) -> bool:
    """
    Check if CSS styles specify a fixed-width (monospace) font.

    Args:
        style: Dictionary of CSS style properties.

    Returns:
        True if the styles specify a monospace font family.

    NOTE(review): which font-family values count as monospace is not
    listed here -- confirm against the html2text implementation.
    """

def google_has_height(style: Dict[str, str]) -> bool:
    """
    Check if CSS styles have an explicit height defined.

    Args:
        style: Dictionary of CSS style properties.

    Returns:
        True if the height property is explicitly set, False otherwise.
    """

def google_list_style(style: Dict[str, str]) -> str:
    """
    Determine the list type from Google Docs CSS styles.

    Args:
        style: Dictionary of CSS style properties.

    Returns:
        'ul' for unordered lists, 'ol' for ordered lists.

    NOTE(review): presumably decided from the ``list-style-type``
    property; which values map to 'ul' and what is returned when the
    property is absent should be confirmed against the implementation.
    """

HTML Processing Utilities

Helper functions for processing HTML elements and attributes, and for deciding whether a paragraph should be wrapped.

def hn(tag: str) -> int:
    """
    Extract the header level from an HTML header tag name.

    Args:
        tag: HTML tag name (e.g., 'h1', 'h2', 'div').

    Returns:
        Header level (1-6) for header tags, 0 for non-header tags.

    Example:
        >>> from html2text.utils import hn
        >>> hn('h1')
        1
        >>> hn('h3')
        3
        >>> hn('div')
        0

    NOTE(review): the result for an empty tag name, or for names such as
    'h7'-'h9', is not specified here -- confirm against the html2text
    implementation.
    """

def list_numbering_start(attrs: Dict[str, Optional[str]]) -> int:
    """
    Extract the starting number from ordered list attributes.

    Args:
        attrs: HTML element attributes dictionary.

    Returns:
        Starting number for the ordered list, adjusted for 0-based
        indexing (that is, ``start - 1``).

    Example:
        The function returns ``start - 1`` because the value is used for
        internal counting.

        >>> from html2text.utils import list_numbering_start
        >>> attrs = {'start': '5'}
        >>> list_numbering_start(attrs)
        4

    NOTE(review): the result when ``start`` is missing or not numeric is
    not specified here (presumably 0) -- confirm against the html2text
    implementation.
    """

def skipwrap(
    para: str, 
    wrap_links: bool, 
    wrap_list_items: bool, 
    wrap_tables: bool
) -> bool:
    """
    Determine if a paragraph should skip text wrapping.

    Analyzes the paragraph content and the wrapping configuration to
    decide whether the paragraph must be left unwrapped.

    Args:
        para: Paragraph text to analyze.
        wrap_links: Whether to allow wrapping of links.
        wrap_list_items: Whether to allow wrapping of list items.
        wrap_tables: Whether to allow wrapping of tables.

    Returns:
        True if the paragraph should skip wrapping, False otherwise.
    """

Character and Entity Processing

Constants used for handling HTML entities and character replacements.

# Character mapping constants.
# Only the type of each lookup table is shown here, not its contents.
# Keys are integer code points in both tables.
unifiable_n: Dict[int, str]
"""Mapping of Unicode code points to ASCII replacements."""

control_character_replacements: Dict[int, int]
"""Mapping of control characters to their Unicode replacements."""

Usage Examples

Text Escaping

from html2text.utils import escape_md, escape_md_section

# Escape the brackets and parentheses of a short string.
sample = "Some [bracketed] text with (parentheses)"
print(escape_md(sample))  # Some \[bracketed\] text with \(parentheses\)

# Escape a whole multi-line section, opting into the stricter "snob" mode.
section = """
1. First item
2. Second item
*Some emphasized text*
`Code with backticks`
"""
print(escape_md_section(section, snob=True))

Table Processing

from html2text.utils import pad_tables_in_text

# A pipe table whose cells have not been aligned yet.
raw_table = """
| Name | Age | City |
| Alice | 30 | New York |
| Bob | 25 | London |
| Charlie | 35 | Paris |
"""

# Pad the cells and show the result.
print(pad_tables_in_text(raw_table))
# Expected: every column padded to a common width.
# NOTE(review): confirm that a bare table (without the converter's
# table-padding markers) is actually re-aligned by this call.

CSS Processing

from html2text.utils import dumb_css_parser, dumb_property_dict, element_style

# 1) The value of a style="" attribute -> flat property dictionary.
declarations = dumb_property_dict("color: red; font-size: 14px; font-weight: bold")
print(declarations)
# Output: {'color': 'red', 'font-size': '14px', 'font-weight': 'bold'}

# 2) A stylesheet -> {selector: {property: value}}.
stylesheet = """
.bold { font-weight: bold; color: black; }
.italic { font-style: italic; }
p { margin: 10px; font-size: 14px; }
"""
rules = dumb_css_parser(stylesheet)
print(rules)

# 3) Effective style of one element, given what it inherits, the parsed
#    stylesheet rules, and its own class/style attributes.
inherited = {'margin': '5px'}
tag_attrs = {'class': 'bold italic', 'style': 'color: red; font-size: 16px;'}
print(element_style(tag_attrs, rules, inherited))
# The result merges the parent, class, and inline styles.

HTML Tag Processing

from html2text.utils import hn, list_numbering_start

# Header level for a few tag names; prints 1, 3 and 0 (one per line).
for tag_name in ('h1', 'h3', 'div'):
    print(hn(tag_name))

# Attributes of an <ol start="5" type="1"> element.
first_index = list_numbering_start({'start': '5', 'type': '1'})
print(first_index)   # 4 (start - 1, adjusted for 0-based counting)

Wrapping Analysis

from html2text.utils import skipwrap

# Sample paragraphs of different kinds (plain text, code, list, link, table).
paragraphs = [
    "Regular paragraph text that can be wrapped normally.",
    "    This is a code block with leading spaces",
    "* This is a list item that might not wrap",
    "Here's a paragraph with [a link](http://example.com) in it.",
    "| Name | Age | - this looks like a table"
]

# One wrapping configuration shared by every call.
wrap_options = {"wrap_links": True, "wrap_list_items": False, "wrap_tables": False}

for para in paragraphs:
    should_skip = skipwrap(para, **wrap_options)
    print(f"Skip wrapping: {should_skip} - {para[:30]}...")

Google Docs Style Processing

from html2text.utils import (
    google_text_emphasis, 
    google_fixed_width_font,
    google_list_style
)

# CSS properties as they might appear on a Google Docs text run.
run_css = {
    'font-weight': 'bold',
    'font-style': 'italic',
    'text-decoration': 'underline',
    'font-family': 'courier new'
}

# Which emphasis styles does the run carry?
emphasis = google_text_emphasis(run_css)
print(f"Emphasis styles: {emphasis}")

# Is the run set in a fixed-width font?
is_monospace = google_fixed_width_font(run_css)
print(f"Monospace font: {is_monospace}")

# Classify a list from its CSS.
list_type = google_list_style({'list-style-type': 'disc'})
print(f"List type: {list_type}")

Install with Tessl CLI