CtrlK
Blog | Docs | Log in | Get started
Tessl Logo

tessl/pypi-html2text

Turn HTML into equivalent Markdown-structured text.

Pending
Overview
Eval results
Files

docs/configuration.md

Configuration Options

Comprehensive formatting and behavior configuration for customizing HTML to text conversion. All options can be set on HTML2Text instances to control output formatting, link handling, table processing, and text styling.

Capabilities

Link and Image Configuration

Control how links and images are processed and formatted in the output.

# Link handling options
ignore_links: bool = False
"""Skip all link formatting, treating links as plain text."""

ignore_mailto_links: bool = False  
"""Skip mailto: links while processing other links normally."""

inline_links: bool = True
"""Use inline [text](url) format vs reference-style [text][1] links."""

protect_links: bool = False
"""Wrap links with angle brackets <url> to prevent line breaks."""

skip_internal_links: bool = True
"""Skip internal anchor links (href="#section")."""

links_each_paragraph: bool = False
"""Place reference links after each paragraph instead of at the end of the document."""

use_automatic_links: bool = True
"""Convert URLs that match link text to automatic <url> format."""

wrap_links: bool = True
"""Allow wrapping of long links across multiple lines."""

# Image handling options  
ignore_images: bool = False
"""Skip all image formatting, removing images from output."""

images_as_html: bool = False
"""Output images as raw HTML tags preserving attributes."""

images_to_alt: bool = False
"""Replace images with alt text only, discarding image references."""

images_with_size: bool = False
"""Include width/height attributes when outputting images as HTML."""

default_image_alt: str = ""
"""Default alt text for images missing alt attributes."""

Text Formatting Configuration

Control text wrapping, character handling, and emphasis formatting.

# Text wrapping and layout
body_width: int = 78
"""Maximum line width for text wrapping. Set to 0 for no wrapping."""

single_line_break: bool = False
"""Use single line breaks after block elements instead of double."""

wrap_list_items: bool = False
"""Allow wrapping of list items across multiple lines."""

# Character and emphasis handling
unicode_snob: bool = False
"""Use Unicode characters instead of ASCII replacements (e.g., → vs ->)."""

escape_snob: bool = False
"""Escape all special characters for safer but less readable output."""

ignore_emphasis: bool = False
"""Skip all emphasis formatting (bold, italic, etc.)."""

# List and emphasis markers
ul_item_mark: str = "*"
"""Character used for unordered list items. Common: "*", "-", "+"."""

emphasis_mark: str = "_"
"""Character used for italic emphasis. Common: "_", "*"."""

strong_mark: str = "**"
"""Character sequence used for bold emphasis."""

# Quote handling
open_quote: str = '"'
"""Character used to open quotes from <q> tags."""

close_quote: str = '"'
"""Character used to close quotes from <q> tags."""

Table Configuration

Control table processing and formatting options.

bypass_tables: bool = False
"""Format tables as raw HTML instead of Markdown table syntax."""

ignore_tables: bool = False
"""Skip table formatting entirely, treating as plain text."""

pad_tables: bool = False
"""Pad table cells to equal column width for aligned appearance."""

wrap_tables: bool = False
"""Allow wrapping of table content across multiple lines."""

Code and Preformatted Text

Control handling of code blocks and preformatted content, as well as strikethrough text.

mark_code: bool = False
"""Mark code blocks with [code]...[/code] tags instead of indentation."""

backquote_code_style: bool = False  
"""Use triple-backtick ```code``` blocks instead of indentation."""

hide_strikethrough: bool = False
"""Hide strikethrough text instead of showing it in ~~text~~ format. Only relevant when google_doc is enabled."""

Google Docs Specific Options

Special handling for HTML exported from Google Docs.

google_doc: bool = False
"""Enable Google Docs-specific formatting and style handling."""

google_list_indent: int = 36
"""Number of pixels Google uses for nested list indentation."""

Advanced Options

Additional options for specialized use cases.

include_sup_sub: bool = False
"""Include superscript <sup> and subscript <sub> tags in output."""

tag_callback: Optional[Callable] = None
"""Custom callback function for handling specific HTML tags."""

Configuration Examples

Basic Configuration

import html2text

# Create converter with custom settings
h = html2text.HTML2Text()

# Configure for clean, readable output
h.ignore_links = True           # Remove all links
h.ignore_images = True          # Remove all images  
h.body_width = 0               # No line wrapping
h.ignore_emphasis = False       # Keep bold/italic formatting

html = """
<div>
    <h1>Title</h1>
    <p>Some <strong>bold</strong> text with a <a href="http://example.com">link</a>.</p>
    <img src="image.jpg" alt="An image">
</div>
"""

result = h.handle(html)
print(result)

Link Processing Options

import html2text

html = """
<p>Check out <a href="https://example.com">our website</a> and 
<a href="mailto:contact@example.com">email us</a> or see 
<a href="#section1">this section</a>.</p>
"""

# Inline links (default)
h1 = html2text.HTML2Text()
h1.inline_links = True
print("Inline links:")
print(h1.handle(html))

# Reference-style links  
h2 = html2text.HTML2Text()
h2.inline_links = False
print("\nReference links:")
print(h2.handle(html))

# Ignore specific link types
h3 = html2text.HTML2Text()  
h3.ignore_mailto_links = True
h3.skip_internal_links = True
print("\nFiltered links:")
print(h3.handle(html))

Table Formatting Options

import html2text

html = """
<table>
    <tr><th>Name</th><th>Age</th><th>City</th></tr>
    <tr><td>Alice</td><td>30</td><td>New York</td></tr>
    <tr><td>Bob</td><td>25</td><td>London</td></tr>
</table>
"""

# Default markdown table
h1 = html2text.HTML2Text()
print("Markdown table:")
print(h1.handle(html))

# Padded table for alignment
h2 = html2text.HTML2Text()
h2.pad_tables = True
print("\nPadded table:")
print(h2.handle(html))

# Raw HTML table
h3 = html2text.HTML2Text()
h3.bypass_tables = True
print("\nHTML table:")
print(h3.handle(html))

# No table formatting
h4 = html2text.HTML2Text()
h4.ignore_tables = True
print("\nIgnored table:")
print(h4.handle(html))

Code Block Formatting

import html2text

html = """
<div>
    <p>Here's some code:</p>
    <pre><code>def hello():
    print("Hello, world!")
    return True</code></pre>
    <p>And inline <code>code</code> too.</p>
</div>
"""

# Default indented code blocks
h1 = html2text.HTML2Text()
print("Indented code blocks:")
print(h1.handle(html))

# Triple-backtick code blocks
h2 = html2text.HTML2Text()
h2.backquote_code_style = True
print("\nBacktick code blocks:")
print(h2.handle(html))

# Marked code blocks
h3 = html2text.HTML2Text()
h3.mark_code = True
print("\nMarked code blocks:")
print(h3.handle(html))

Text Wrapping and Formatting

import html2text

html = "<p>This is a very long paragraph that will demonstrate text wrapping behavior in the html2text converter when processing HTML content.</p>"

# Default wrapping at 78 characters
h1 = html2text.HTML2Text()
print(f"Default wrapping (width={h1.body_width}):")
print(h1.handle(html))

# Custom width
h2 = html2text.HTML2Text()
h2.body_width = 40
print(f"\nNarrow wrapping (width={h2.body_width}):")
print(h2.handle(html))

# No wrapping
h3 = html2text.HTML2Text()
h3.body_width = 0
print(f"\nNo wrapping (width={h3.body_width}):")
print(h3.handle(html))

Google Docs Processing

import html2text

# HTML exported from Google Docs with inline styles
google_html = """
<p style="margin-left:36px"><span style="font-weight:bold">Bold item</span></p>
<p style="margin-left:72px">Nested item with <span style="font-style:italic">emphasis</span></p>
"""

h = html2text.HTML2Text()
h.google_doc = True
h.google_list_indent = 36  # Google's default indent

result = h.handle(google_html)
print(result)

Install with Tessl CLI

npx tessl i tessl/pypi-html2text

docs

configuration.md

core-conversion.md

index.md

utilities.md

tile.json