Turn HTML into equivalent Markdown-structured text.
npx @tessl/cli install tessl/pypi-html2text@2025.4.0

A comprehensive Python library that converts HTML into clean, readable plain ASCII text and valid Markdown format. It provides both a programmatic API and a command-line interface with extensive configuration options for handling links, code blocks, tables, and formatting elements while maintaining semantic structure.
pip install html2text
import html2text
For basic usage:
from html2text import html2text
For advanced usage with configuration:
from html2text import HTML2Text
import html2text
# Simple conversion using convenience function
html = "<p><strong>Bold text</strong> and <em>italic text</em></p>"
markdown = html2text.html2text(html)
print(markdown)
# Output: **Bold text** and _italic text_
# Advanced usage with configuration
h = html2text.HTML2Text()
h.ignore_links = True
h.body_width = 0 # No line wrapping
markdown = h.handle("<p>Hello <a href='http://example.com'>world</a>!</p>")
print(markdown)
# Output: Hello world!

html2text uses an HTML parser-based architecture:
html.parser.HTMLParser with extensive configuration options

Primary conversion functionality for transforming HTML into Markdown or plain text with configurable formatting options.
def html2text(html: str, baseurl: str = "", bodywidth: Optional[int] = None) -> str:
"""
Convert HTML string to Markdown/text.
Args:
html: HTML string to convert
baseurl: Base URL for resolving relative links
bodywidth: Text wrapping width (None uses default)
Returns:
Converted Markdown/text string
"""
class HTML2Text(html.parser.HTMLParser):
"""
Advanced HTML to text converter with extensive configuration options.
Args:
out: Optional custom output callback function
baseurl: Base URL for resolving relative links (default: "")
bodywidth: Maximum line width for text wrapping (default: 78)
"""
def handle(self, data: str) -> str:
"""
Convert HTML string to Markdown/text.
Args:
data: HTML string to convert
Returns:
Converted Markdown/text string
"""

Comprehensive formatting and behavior configuration for customizing HTML to text conversion, including link handling, text formatting, table processing, and output styling.
# Link and Image Configuration
ignore_links: bool = False # Skip all link formatting
ignore_mailto_links: bool = False # Skip mailto links
inline_links: bool = True # Use inline vs reference links
protect_links: bool = False # Wrap links with angle brackets
ignore_images: bool = False # Skip image formatting
images_to_alt: bool = False # Replace images with alt text only
# Text Formatting Configuration
body_width: int = 78 # Text wrapping width (0 for no wrap)
unicode_snob: bool = False # Use Unicode vs ASCII replacements
escape_snob: bool = False # Escape all special characters
ignore_emphasis: bool = False # Skip bold/italic formatting
single_line_break: bool = False # Use single vs double line breaks
# Table Configuration
bypass_tables: bool = False # Format tables as HTML vs Markdown
ignore_tables: bool = False # Skip table formatting entirely
pad_tables: bool = False # Pad table cells to equal width

Helper functions for text processing, CSS parsing, character escaping, and table formatting used internally and available for advanced use cases.
def escape_md(text: str) -> str:
"""Escape markdown-sensitive characters within markdown constructs."""
def escape_md_section(text: str, snob: bool = False) -> str:
"""Escape markdown-sensitive characters across document sections."""
def pad_tables_in_text(text: str, right_margin: int = 1) -> str:
"""Add padding to tables in text for consistent column alignment."""

html2text handles malformed HTML gracefully through its HTMLParser base class. Character encoding issues should be resolved before passing HTML to the converter:
# Handle encoding explicitly if needed
with open('file.html', 'rb') as f:
html_bytes = f.read()
html_text = html_bytes.decode('utf-8', errors='ignore')
markdown = html2text.html2text(html_text)

The package includes a command-line tool, html2text, with comprehensive configuration options:
# Basic usage
html2text input.html
# From stdin
echo "<p>Hello world</p>" | html2text
# With custom encoding
html2text input.html utf-8
# Common options
html2text --body-width=0 --ignore-links input.html
html2text --reference-links --pad-tables input.html
html2text --google-doc --hide-strikethrough gdoc.html

Text Formatting:
--body-width=N - Line width (0 for no wrapping, default: 78)
--single-line-break - Use single line breaks instead of double
--escape-all - Escape all special characters for safer output

Link Handling:
--ignore-links - Don't include any link formatting
--ignore-mailto-links - Don't include mailto: links
--reference-links - Use reference-style links instead of inline
--protect-links - Wrap links with angle brackets
--no-wrap-links - Don't wrap long links

Image Handling:
--ignore-images - Don't include any image formatting
--images-as-html - Keep images as raw HTML tags
--images-to-alt - Replace images with alt text only
--images-with-size - Include width/height in HTML image tags
--default-image-alt=TEXT - Default alt text for images

Table Formatting:
--pad-tables - Pad cells to equal column width
--bypass-tables - Format tables as HTML instead of Markdown
--ignore-tables - Skip table formatting entirely
--wrap-tables - Allow table content wrapping

List and Emphasis:
--ignore-emphasis - Don't include formatting for bold/italic
--dash-unordered-list - Use dashes instead of asterisks for lists
--asterisk-emphasis - Use asterisks instead of underscores for emphasis
--wrap-list-items - Allow list item wrapping

Google Docs Support:
--google-doc - Enable Google Docs-specific processing
--google-list-indent=N - Pixels Google uses for list indentation (default: 36)
--hide-strikethrough - Hide strikethrough text (use with --google-doc)

from typing import Dict, List, Optional, Protocol
class OutCallback(Protocol):
"""Protocol for custom output callback functions."""
def __call__(self, s: str) -> None: ...
class AnchorElement:
"""Represents link elements during processing."""
attrs: Dict[str, Optional[str]]
count: int
outcount: int
class ListElement:
"""Represents list elements during processing."""
name: str # 'ul' or 'ol'
num: int # Current list item number