Convert HTML to markdown with extensive customization options for tag filtering, heading styles, and output formatting.
npx @tessl/cli install tessl/pypi-markdownify@1.2.0
A comprehensive Python library for converting HTML to Markdown. Markdownify provides extensive customization options including tag filtering (strip/convert specific tags), heading style control (ATX, ATX_CLOSED, underlined/SETEXT), list formatting, code block handling, and table conversion with advanced features like colspan support and header inference.
pip install markdownify
from markdownify import markdownify
Or import the converter class directly:
from markdownify import MarkdownConverter
You can also import constants for configuration:
from markdownify import (
markdownify, MarkdownConverter,
ATX, ATX_CLOSED, UNDERLINED, SETEXT,
SPACES, BACKSLASH, ASTERISK, UNDERSCORE,
STRIP, LSTRIP, RSTRIP, STRIP_ONE
)
from markdownify import markdownify as md
# Simple HTML to Markdown conversion
html = '<b>Bold text</b> and <a href="http://example.com">a link</a>'
markdown = md(html)
print(markdown) # **Bold text** and [a link](http://example.com)
# Convert with options
html = '<h1>Title</h1><p>Paragraph with <em>emphasis</em></p>'
markdown = md(html, heading_style='atx', strip=['em'])
print(markdown) # # Title\n\nParagraph with emphasis
# Using the MarkdownConverter class for repeated conversions
converter = MarkdownConverter(
heading_style='atx_closed',
bullets='*+-',
escape_misc=True
)
markdown1 = converter.convert('<h2>Section</h2><ul><li>Item 1</li></ul>')
markdown2 = converter.convert('<blockquote>Quote text</blockquote>')
# Convert HTML file to Markdown
markdownify input.html
# Convert from stdin
echo '<b>Bold</b>' | markdownify
# Basic formatting options
markdownify --heading-style=atx --bullets='*-+' input.html
markdownify --strong-em-symbol='_' --newline-style=backslash input.html
# Tag filtering
markdownify --strip a script style input.html
markdownify --convert h1 h2 p b i strong em input.html
# Advanced options
markdownify --wrap --wrap-width=100 --table-infer-header input.html
markdownify --keep-inline-images-in h1 h2 --code-language=python input.html
markdownify --no-escape-asterisks --no-escape-underscores input.html
markdownify --sub-symbol='~' --sup-symbol='^' --bs4-options=lxml input.html
The main function for converting HTML to Markdown with comprehensive options.
def markdownify(
html: str,
autolinks: bool = True,
bs4_options: str | dict = 'html.parser',
bullets: str = '*+-',
code_language: str = '',
code_language_callback: callable = None,
convert: list = None,
default_title: bool = False,
escape_asterisks: bool = True,
escape_underscores: bool = True,
escape_misc: bool = False,
heading_style: str = 'underlined',
keep_inline_images_in: list = [],
newline_style: str = 'spaces',
strip: list = None,
strip_document: str = 'strip',
strip_pre: str = 'strip',
strong_em_symbol: str = '*',
sub_symbol: str = '',
sup_symbol: str = '',
table_infer_header: bool = False,
wrap: bool = False,
wrap_width: int = 80
) -> str:
"""
Convert HTML to Markdown with extensive customization options.
Parameters:
- html: HTML string to convert
- autolinks: Use automatic link style when link text matches href
- bs4_options: BeautifulSoup parser options (string for parser name, or dict with 'features' key and other options)
- bullets: String of bullet characters for nested lists (e.g., '*+-')
- code_language: Default language for code blocks
- code_language_callback: Function to determine code block language
- convert: List of tags to convert (excludes all others if specified)
- default_title: Use href as title when no title provided
- escape_asterisks: Escape asterisk characters in text
- escape_underscores: Escape underscore characters in text
- escape_misc: Escape miscellaneous Markdown special characters
- heading_style: Style for headings ('atx', 'atx_closed', 'underlined')
- keep_inline_images_in: Parent tags that should keep inline images
- newline_style: Style for line breaks ('spaces', 'backslash')
- strip: List of tags to strip (excludes from conversion)
- strip_document: Document-level whitespace stripping ('strip', 'lstrip', 'rstrip', None)
- strip_pre: Pre-block whitespace stripping ('strip', 'strip_one', None)
- strong_em_symbol: Symbol for strong/emphasis ('*', '_')
- sub_symbol: Characters to surround subscript text
- sup_symbol: Characters to surround superscript text
- table_infer_header: Infer table headers when not explicitly marked
- wrap: Wrap text paragraphs at specified width
- wrap_width: Width for text wrapping
Returns:
Markdown string
"""
The main converter class providing configurable HTML to Markdown conversion with caching and extensibility.
class MarkdownConverter:
"""
Configurable HTML to Markdown converter with extensive customization options.
Supports custom conversion methods for specific tags and provides caching for performance.
"""
def __init__(
self,
autolinks: bool = True,
bs4_options: str | dict = 'html.parser',
bullets: str = '*+-',
code_language: str = '',
code_language_callback: callable = None,
convert: list = None,
default_title: bool = False,
escape_asterisks: bool = True,
escape_underscores: bool = True,
escape_misc: bool = False,
heading_style: str = 'underlined',
keep_inline_images_in: list = [],
newline_style: str = 'spaces',
strip: list = None,
strip_document: str = 'strip',
strip_pre: str = 'strip',
strong_em_symbol: str = '*',
sub_symbol: str = '',
sup_symbol: str = '',
table_infer_header: bool = False,
wrap: bool = False,
wrap_width: int = 80
):
"""
Initialize MarkdownConverter with configuration options.
Parameters: Same as markdownify() function
"""
def convert(self, html: str) -> str:
"""
Convert HTML string to Markdown.
Parameters:
- html: HTML string to convert
Returns:
Markdown string
"""
def convert_soup(self, soup) -> str:
"""
Convert BeautifulSoup object to Markdown.
Parameters:
- soup: BeautifulSoup parsed HTML object
Returns:
Markdown string
"""
Entry point for command-line HTML to Markdown conversion.
def main(argv: list = None):
"""
Command-line interface for markdownify.
Parameters:
- argv: Command line arguments (defaults to sys.argv[1:])
Supports all conversion options as command-line flags:
--strip, --convert, --autolinks, --heading-style, --bullets,
--strong-em-symbol, --sub-symbol, --sup-symbol, --newline-style,
--code-language, --no-escape-asterisks, --no-escape-underscores,
--keep-inline-images-in, --table-infer-header, --wrap, --wrap-width,
--bs4-options
"""
Helper functions for text processing and whitespace handling.
def strip_pre(text: str) -> str:
"""
Strip all leading and trailing newlines from preformatted text.
Parameters:
- text: Text to strip
Returns:
Stripped text
"""
def strip1_pre(text: str) -> str:
"""
Strip one leading and trailing newline from preformatted text.
Parameters:
- text: Text to strip
Returns:
Stripped text with at most one leading/trailing newline removed
"""
def chomp(text: str) -> tuple:
"""
Extract leading/trailing spaces from inline text to prevent malformed Markdown.
Parameters:
- text: Text to process
Returns:
Tuple of (prefix_space, suffix_space, stripped_text)
"""
def abstract_inline_conversion(markup_fn: callable) -> callable:
"""
Factory function for creating inline tag conversion functions.
Parameters:
- markup_fn: Function that returns markup string for the tag
Returns:
Conversion function for inline tags
"""
def should_remove_whitespace_inside(el) -> bool:
"""
Determine if whitespace should be removed inside a block-level element.
Parameters:
- el: HTML element to check
Returns:
True if whitespace should be removed inside the element
"""
def should_remove_whitespace_outside(el) -> bool:
"""
Determine if whitespace should be removed outside a block-level element.
Parameters:
- el: HTML element to check
Returns:
True if whitespace should be removed outside the element
"""
Style constants for configuring conversion behavior.
# Heading styles
ATX = 'atx' # # Heading
ATX_CLOSED = 'atx_closed' # # Heading #
UNDERLINED = 'underlined' # Heading\n=======
SETEXT = UNDERLINED # Alias for UNDERLINED
# Newline styles for <br> tags
SPACES = 'spaces' # Two spaces at end of line
BACKSLASH = 'backslash' # Backslash at end of line
# Strong/emphasis symbols
ASTERISK = '*' # **bold** and *italic*
UNDERSCORE = '_' # __bold__ and _italic_
# Document/pre stripping options
STRIP = 'strip' # Remove leading and trailing whitespace
LSTRIP = 'lstrip' # Remove leading whitespace only
RSTRIP = 'rstrip' # Remove trailing whitespace only
STRIP_ONE = 'strip_one' # Remove one leading/trailing newline
You can extend MarkdownConverter to create custom conversion behavior for specific tags:
from markdownify import MarkdownConverter
class CustomConverter(MarkdownConverter):
def convert_custom_tag(self, el, text, parent_tags):
"""Custom conversion for <custom-tag> elements."""
return f"[CUSTOM: {text}]"
def convert_img(self, el, text, parent_tags):
"""Override image conversion to add custom behavior."""
result = super().convert_img(el, text, parent_tags)
return result + "\n\n" # Add extra newlines after images
# Usage
converter = CustomConverter()
html = '<custom-tag>content</custom-tag><img src="test.jpg" alt="Test">'
markdown = converter.convert(html)
The library handles malformed HTML gracefully through BeautifulSoup's parsing capabilities. Invalid configuration options raise ValueError exceptions:
- Specifying both the strip and convert options
- An invalid value for heading_style, newline_style, strip_document, or strip_pre
- An invalid code_language_callback