Turn HTML into equivalent Markdown-structured text.
—
Helper functions for text processing, CSS parsing, character escaping, and table formatting. These functions are used internally by html2text and are also available for advanced use cases requiring custom text processing.
Functions for escaping markdown characters and processing text sections safely.
def escape_md(text: str) -> str:
"""
Escape markdown-sensitive characters within markdown constructs.
Escapes characters that have special meaning in Markdown (like brackets,
parentheses, backslashes) to prevent them from being interpreted as
formatting when they should be literal text.
Args:
text: Text string to escape
Returns:
Text with markdown characters escaped with backslashes
Example:
>>> from html2text.utils import escape_md
>>> escape_md("Some [text] with (special) chars")
'Some \\[text\\] with \\(special\\) chars'
"""
def escape_md_section(text: str, snob: bool = False) -> str:
"""
Escape markdown-sensitive characters across document sections.
More comprehensive escaping for full document sections, handling
various markdown constructs that could interfere with formatting.
Args:
text: Text string to escape
snob: If True, escape additional characters for maximum safety
Returns:
Text with markdown characters properly escaped
Example:
>>> from html2text.utils import escape_md_section
>>> escape_md_section("1. Item\\n2. Another", snob=True)
'1\\. Item\\n2\\. Another'
"""

Functions for formatting and aligning table content in text output.
def pad_tables_in_text(text: str, right_margin: int = 1) -> str:
"""
Add padding to tables in text for consistent column alignment.
Processes text containing markdown tables and adds appropriate padding
to ensure all columns have consistent width for improved readability.
Args:
text: Text containing markdown tables to format
right_margin: Additional padding spaces for right margin (default: 1)
Returns:
Text with properly padded and aligned tables
Example:
>>> table_text = "| Name | Age |\\n| Alice | 30 |\\n| Bob | 25 |"
>>> padded = pad_tables_in_text(table_text)
>>> print(padded)
| Name | Age |
| Alice | 30 |
| Bob | 25 |
"""
def reformat_table(lines: List[str], right_margin: int) -> List[str]:
"""
Reformat table lines with consistent column widths.
Takes raw table lines and reformats them with proper padding
to create aligned columns.
Args:
lines: List of table row strings
right_margin: Right margin padding in spaces
Returns:
List of reformatted table lines with consistent alignment
"""

Functions for parsing CSS styles and processing element styling, particularly useful for Google Docs HTML.
def dumb_property_dict(style: str) -> Dict[str, str]:
"""
Parse CSS style string into property dictionary.
Takes a CSS style string (like from a style attribute) and converts
it into a dictionary of property-value pairs.
Args:
style: CSS style string with semicolon-separated property declarations
Returns:
Dictionary mapping CSS property names to values (both lowercased)
Example:
>>> from html2text.utils import dumb_property_dict
>>> style = "color: red; font-size: 14px; font-weight: bold"
>>> props = dumb_property_dict(style)
>>> print(props)
{'color': 'red', 'font-size': '14px', 'font-weight': 'bold'}
"""
def dumb_css_parser(data: str) -> Dict[str, Dict[str, str]]:
"""
Parse CSS style definitions into a structured format.
Simple CSS parser that extracts style rules and properties for
processing HTML with inline styles or embedded CSS.
Args:
data: CSS string to parse
Returns:
Dictionary mapping selectors to property dictionaries
Example:
>>> css = "p { color: red; font-size: 14px; }"
>>> parsed = dumb_css_parser(css)
>>> print(parsed)
{'p': {'color': 'red', 'font-size': '14px'}}
"""
def element_style(
attrs: Dict[str, Optional[str]],
style_def: Dict[str, Dict[str, str]],
parent_style: Dict[str, str]
) -> Dict[str, str]:
"""
Compute final style attributes for an HTML element.
Combines parent styles, CSS class styles, and inline styles to
determine the effective styling for an element.
Args:
attrs: HTML element attributes dictionary
style_def: CSS style definitions from stylesheet
parent_style: Inherited styles from parent elements
Returns:
Dictionary of final computed styles for the element
"""
def google_text_emphasis(style: Dict[str, str]) -> List[str]:
"""
Extract text emphasis styles from Google Docs CSS.
Analyzes CSS style properties to determine what text emphasis
(bold, italic, underline, etc.) should be applied.
Args:
style: Dictionary of CSS style properties
Returns:
List of emphasis style names found in the styles
"""
def google_fixed_width_font(style: Dict[str, str]) -> bool:
"""
Check if CSS styles specify a fixed-width (monospace) font.
Args:
style: Dictionary of CSS style properties
Returns:
True if styles specify a monospace font family
"""
def google_has_height(style: Dict[str, str]) -> bool:
"""
Check if CSS styles have explicit height defined.
Args:
style: Dictionary of CSS style properties
Returns:
True if height property is explicitly set
"""
def google_list_style(style: Dict[str, str]) -> str:
"""
Determine list type from Google Docs CSS styles.
Args:
style: Dictionary of CSS style properties
Returns:
'ul' for unordered lists, 'ol' for ordered lists
"""

Helper functions for processing HTML elements and attributes.
def hn(tag: str) -> int:
"""
Extract header level from HTML header tag name.
Args:
tag: HTML tag name (e.g., 'h1', 'h2', 'div')
Returns:
Header level (1-6) for header tags, 0 for non-header tags
Example:
>>> hn('h1')
1
>>> hn('h3')
3
>>> hn('div')
0
"""
def list_numbering_start(attrs: Dict[str, Optional[str]]) -> int:
"""
Extract starting number from ordered list attributes.
Args:
attrs: HTML element attributes dictionary
Returns:
Starting number for ordered list (adjusted for 0-based indexing)
Example:
>>> attrs = {'start': '5'}
>>> list_numbering_start(attrs)
4 # Returns start - 1 for internal counting
"""
def skipwrap(
para: str,
wrap_links: bool,
wrap_list_items: bool,
wrap_tables: bool
) -> bool:
"""
Determine if a paragraph should skip text wrapping.
Analyzes paragraph content to decide whether it should be wrapped
based on content type and wrapping configuration.
Args:
para: Paragraph text to analyze
wrap_links: Whether to allow wrapping of links
wrap_list_items: Whether to allow wrapping of list items
wrap_tables: Whether to allow wrapping of tables
Returns:
True if paragraph should skip wrapping, False otherwise
"""

Functions for handling HTML entities and character replacements.
# Character mapping constants
unifiable_n: Dict[int, str]
"""Mapping of Unicode code points to ASCII replacements."""
control_character_replacements: Dict[int, int]
"""Mapping of control characters to their Unicode replacements."""

from html2text.utils import escape_md, escape_md_section
# Basic markdown escaping
text = "Some [bracketed] text with (parentheses)"
escaped = escape_md(text)
print(escaped) # Prints: Some \[bracketed\] text with \(parentheses\)
# Section-level escaping with additional safety
content = """
1. First item
2. Second item
*Some emphasized text*
`Code with backticks`
"""
safe_content = escape_md_section(content, snob=True)
print(safe_content)

from html2text.utils import pad_tables_in_text
# Raw table text with inconsistent spacing
table_text = """
| Name | Age | City |
| Alice | 30 | New York |
| Bob | 25 | London |
| Charlie | 35 | Paris |
"""
# Add padding for consistent alignment
padded_table = pad_tables_in_text(table_text)
print(padded_table)
# Output will have consistent column widths

from html2text.utils import dumb_css_parser, dumb_property_dict, element_style
# Parse inline CSS styles
inline_style = "color: red; font-size: 14px; font-weight: bold"
props = dumb_property_dict(inline_style)
print(props)
# Output: {'color': 'red', 'font-size': '14px', 'font-weight': 'bold'}
# Parse CSS styles
css_content = """
.bold { font-weight: bold; color: black; }
.italic { font-style: italic; }
p { margin: 10px; font-size: 14px; }
"""
styles = dumb_css_parser(css_content)
print(styles)
# Compute element styles
element_attrs = {
'class': 'bold italic',
'style': 'color: red; font-size: 16px;'
}
parent_styles = {'margin': '5px'}
final_styles = element_style(element_attrs, styles, parent_styles)
print(final_styles)
# Will combine class styles, inline styles, and parent styles

from html2text.utils import hn, list_numbering_start
# Extract header levels
print(hn('h1')) # 1
print(hn('h3')) # 3
print(hn('div')) # 0
# Process list attributes
ol_attrs = {'start': '5', 'type': '1'}
start_num = list_numbering_start(ol_attrs)
print(start_num) # 4 (adjusted for 0-based counting)

from html2text.utils import skipwrap
# Test different paragraph types
paragraphs = [
"Regular paragraph text that can be wrapped normally.",
" This is a code block with leading spaces",
"* This is a list item that might not wrap",
"Here's a paragraph with [a link](http://example.com) in it.",
"| Name | Age | - this looks like a table"
]
for para in paragraphs:
should_skip = skipwrap(para, wrap_links=True, wrap_list_items=False, wrap_tables=False)
print(f"Skip wrapping: {should_skip} - {para[:30]}...")

from html2text.utils import (
google_text_emphasis,
google_fixed_width_font,
google_list_style
)
# Analyze Google Docs styles
gdoc_style = {
'font-weight': 'bold',
'font-style': 'italic',
'text-decoration': 'underline',
'font-family': 'courier new'
}
emphasis = google_text_emphasis(gdoc_style)
print(f"Emphasis styles: {emphasis}")
is_monospace = google_fixed_width_font(gdoc_style)
print(f"Monospace font: {is_monospace}")
list_style = {
'list-style-type': 'disc'
}
list_type = google_list_style(list_style)
print(f"List type: {list_type}")

Install with Tessl CLI
npx tessl i tessl/pypi-html2text