Convert Word documents from docx to simple and clean HTML and Markdown
npx @tessl/cli install tessl/pypi-mammoth@1.10.0
A robust Python library that converts Microsoft Word .docx documents into clean, semantic HTML and Markdown formats. Mammoth focuses on preserving the semantic structure of documents by converting styled elements (like headings, lists, tables) to appropriate HTML tags rather than attempting to replicate exact visual formatting.
pip install mammoth
import mammoth
Access to main conversion functions:
from mammoth import convert_to_html, convert_to_markdown, extract_raw_text
Access to styling and transformation utilities:
from mammoth import images, transforms, underline
Access to writers and HTML generation:
from mammoth.writers import writer, formats, HtmlWriter, MarkdownWriter
from mammoth.html import text, element, tag, collapsible_element, strip_empty, collapse, write
import mammoth
# Convert DOCX to HTML
with open("document.docx", "rb") as docx_file:
    result = mammoth.convert_to_html(docx_file)
    html = result.value # The generated HTML
    messages = result.messages # Any conversion warnings
    print(html)
# Convert DOCX to Markdown
with open("document.docx", "rb") as docx_file:
    result = mammoth.convert_to_markdown(docx_file)
    markdown = result.value # The generated Markdown
# Extract plain text only
with open("document.docx", "rb") as docx_file:
    result = mammoth.extract_raw_text(docx_file)
    text = result.value # Plain text content
Mammoth processes DOCX documents through a well-defined pipeline:
The library supports extensive customization through style maps, image handlers, and document transformers, making it highly adaptable for different use cases while maintaining clean, semantic output.
Core conversion functions for transforming DOCX files to HTML and Markdown formats with comprehensive options for customization and style mapping.
def convert_to_html(fileobj, **kwargs):
"""Convert DOCX file to HTML format."""
def convert_to_markdown(fileobj, **kwargs):
"""Convert DOCX file to Markdown format."""
def convert(fileobj, transform_document=None, id_prefix=None,
include_embedded_style_map=True, **kwargs):
"""Core conversion function with full parameter control."""
def extract_raw_text(fileobj):
"""Extract plain text from DOCX file."""
Writer system for generating HTML and Markdown output with flexible interfaces for custom rendering and output format creation.
def writer(output_format=None):
"""Create writer instance for specified output format."""
def formats():
"""Get available output format keys."""
class HtmlWriter:
"""HTML writer for generating HTML output."""
class MarkdownWriter:
"""Markdown writer for generating Markdown output."""
Functions for processing and converting images embedded in DOCX documents, including data URI conversion and custom image handling.
def img_element(func):
"""Decorator for creating image conversion functions."""
def data_uri(image):
"""Convert images to base64 data URIs."""
Utilities for transforming document elements before conversion, allowing for custom processing of paragraphs, runs, and other document components.
def paragraph(transform_paragraph):
"""Create transform for paragraph elements."""
def run(transform_run):
"""Create transform for run elements."""
def element_of_type(element_type, transform):
"""Create transform for specific element types."""
Comprehensive style mapping system for converting Word document styles to HTML elements, including parsers and matchers for complex styling rules.
def embed_style_map(fileobj, style_map):
"""Embed style map into DOCX file."""
def read_embedded_style_map(fileobj):
"""Read embedded style map from DOCX file."""
Core functions for creating and manipulating HTML elements, nodes, and structures during document conversion.
def text(value):
"""Create a text node with specified value."""
def element(tag_names, attributes=None, children=None, collapsible=None, separator=None):
"""Create HTML element with tag, attributes, and children."""
def tag(tag_names, attributes=None, collapsible=None, separator=None):
"""Create HTML tag definition."""
def collapsible_element(tag_names, attributes=None, children=None):
"""Create collapsible HTML element."""
def strip_empty(nodes):
"""Remove empty nodes from node list."""
def collapse(nodes):
"""Collapse adjacent similar nodes."""
def write(writer, nodes):
"""Write nodes using specified writer."""
Functions for converting underline formatting to custom HTML elements.
def element(name):
"""Create underline converter that wraps content in specified HTML element."""
Command-line tool for converting DOCX files with support for various output formats and options.
def main():
"""Command-line interface entry point."""
class ImageWriter:
"""Handles writing images to separate files in output directory."""
def __init__(self, output_dir):
"""Initialize with output directory path."""
def __call__(self, element):
"""Write image element to file and return attributes."""
Console command: mammoth <docx-path> [output-path] [options]
Arguments:
docx-path: Path to the .docx file to convert
output-path: Optional output path for generated document (writes to stdout if not specified)
Options:
--output-dir: Output directory for generated HTML and images (mutually exclusive with output-path)
--output-format: Output format (choices: html, markdown)
--style-map: File containing a style map
class Result:
"""Container for operation results with messages."""
value: any # The result value
messages: list # List of warning/error messages
def map(self, func):
"""Transform the value."""
def bind(self, func):
"""Chain operations that return Results."""
class Message:
"""Warning/error message structure."""
type: str # Message type
message: str # Message content
def warning(message):
"""Create a warning message."""
def success(value):
"""Create a successful Result with no messages."""
def combine(results):
"""Combine multiple Results into one."""
# HTML Node Types
class Node:
"""Base class for all HTML nodes."""
class TextNode(Node):
"""Text content node."""
value: str # Text content
class Tag:
"""HTML tag definition."""
tag_names: list # List of tag names
attributes: dict # HTML attributes
collapsible: bool # Whether tag can be collapsed
separator: str # Separator for multiple tags
@property
def tag_name(self):
"""Get primary tag name."""
class Element(Node):
"""HTML element node with tag and children."""
tag: Tag # Tag definition
children: list # Child nodes
@property
def tag_name(self):
"""Get primary tag name."""
@property
def tag_names(self):
"""Get all tag names."""
@property
def attributes(self):
"""Get HTML attributes."""
@property
def collapsible(self):
"""Check if element is collapsible."""
def is_void(self):
"""Check if element is void (self-closing)."""
class ForceWrite(Node):
"""Special node that forces writing even if empty."""
class NodeVisitor:
"""Base class for visiting HTML nodes."""