Turn HTML into equivalent Markdown-structured text.
—
Primary conversion functionality for transforming HTML into Markdown or plain text. Provides both simple one-shot conversion and advanced configurable conversion with extensive formatting options.
Convenience function for straightforward HTML to Markdown conversion with minimal configuration.
def html2text(html: str, baseurl: str = "", bodywidth: Optional[int] = None) -> str:
"""
Convert HTML string to Markdown/text using default settings.
Args:
html: HTML string to convert
baseurl: Base URL for resolving relative links (default: "")
bodywidth: Text wrapping width, None uses config.BODY_WIDTH (default: None)
Returns:
Converted Markdown/text string
Example:
>>> import html2text
>>> html = "<p><strong>Bold</strong> and <em>italic</em></p>"
>>> print(html2text.html2text(html))
**Bold** and _italic_
"""

Full-featured HTML to text converter with extensive configuration options for fine-grained control over output formatting.
class HTML2Text(html.parser.HTMLParser):
"""
Advanced HTML to text converter with comprehensive configuration options.
Inherits from html.parser.HTMLParser to handle HTML parsing and provides
extensive customization for output formatting, link handling, table processing,
and text styling.
"""
def __init__(
self,
out: Optional[OutCallback] = None,
baseurl: str = "",
bodywidth: int = 78
) -> None:
"""
Initialize HTML2Text converter.
Args:
out: Optional custom output callback function for handling text output
baseurl: Base URL for resolving relative links (default: "")
bodywidth: Maximum line width for text wrapping (default: 78)
"""
def handle(self, data: str) -> str:
"""
Convert HTML string to Markdown/text with current configuration.
This is the main conversion method that processes the HTML through
the parser and returns the formatted output.
Args:
data: HTML string to convert
Returns:
Converted Markdown/text string
Example:
>>> h = html2text.HTML2Text()
>>> h.ignore_links = True
>>> html = "<p>Hello <a href='http://example.com'>world</a>!</p>"
>>> print(h.handle(html))
Hello world!
"""
def feed(self, data: str) -> None:
"""
Feed HTML data to the parser for processing.
Args:
data: HTML string to feed to parser
"""
def finish(self) -> str:
"""
Complete parsing and return formatted text output.
Returns:
Final formatted text string
"""
def outtextf(self, s: str) -> None:
"""
Default output callback function that appends text to internal buffer.
This is the default implementation of the output callback that collects
all text output into an internal list for final processing.
Args:
s: Text string to append to output buffer
"""
def close(self) -> None:
"""
Close the HTML parser and perform final cleanup.
Inherited from HTMLParser, ensures proper parser cleanup.
"""
def previousIndex(self, attrs: Dict[str, Optional[str]]) -> Optional[int]:
"""
Find index of link with matching attributes in anchor list.
Used internally for reference-style link processing to avoid
duplicate link definitions.
Args:
attrs: Dictionary of HTML element attributes
Returns:
Index of matching anchor element or None if not found
"""

html2text supports comprehensive HTML element conversion:

- <strong>, <b> → **text**
- <em>, <i> → _text_
- <code>, <tt>, <kbd> → `text`
- <del>, <strike>, <s> → ~~text~~
- <q> → "text"
- <sup>, <sub> (configurable)
- <h1> through <h6> → # Header
- <p> → paragraph breaks
- <br> → line breaks
- <hr> → * * *
- <blockquote> → > text
- <pre> → indented code blocks
- <ul>, <li> → * item
- <ol>, <li> → 1. item
- <dl>, <dt>, <dd>
- <a> → [text](url) or reference-style
- <img> → ![alt](src) or configurable formats
- <table>, <tr>, <td>, <th> → Markdown tables

import html2text
# Simple paragraph with formatting
html = """
<div>
<h1>Main Title</h1>
<p>This is a <strong>bold statement</strong> with some <em>emphasis</em>.</p>
<p>Here's a <a href="https://example.com">link</a> and some <code>inline code</code>.</p>
</div>
"""
converter = html2text.HTML2Text()
markdown = converter.handle(html)
print(markdown)

html = """
<ul>
<li>First item</li>
<li>Second item with <strong>bold text</strong></li>
<li>Third item
<ol>
<li>Nested ordered item</li>
<li>Another nested item</li>
</ol>
</li>
</ul>
"""
converter = html2text.HTML2Text()
result = converter.handle(html)
print(result)

html = """
<table>
<tr>
<th>Name</th>
<th>Age</th>
<th>City</th>
</tr>
<tr>
<td>Alice</td>
<td>30</td>
<td>New York</td>
</tr>
<tr>
<td>Bob</td>
<td>25</td>
<td>London</td>
</tr>
</table>
"""
converter = html2text.HTML2Text()
converter.pad_tables = True # Enable table padding
result = converter.handle(html)
print(result)

def custom_output(text):
    """Custom output handler that uppercases text."""
    print(text.upper(), end='')
html = "<p>Hello world!</p>"
converter = html2text.HTML2Text(out=custom_output)
converter.handle(html)  # Will print "HELLO WORLD!" in uppercase

Install with Tessl CLI
npx tessl i tessl/pypi-html2text