Convert Word documents from docx to simple and clean HTML and Markdown
—
Comprehensive style mapping system for converting Word document styles to HTML elements. Mammoth's style system includes parsers, matchers, and embedded style map support for complex styling rules and customization.
Functions for embedding and reading style maps directly in DOCX files.
def embed_style_map(fileobj, style_map):
"""
Embed a style map directly into DOCX file.
Parameters:
- fileobj: DOCX file object (must be writable)
- style_map: str, style mapping rules as text
Note: Modifies the DOCX file to include the style map
"""
def read_embedded_style_map(fileobj):
"""
Read embedded style map from DOCX file.
Parameters:
- fileobj: DOCX file object
Returns:
str, style map text or None if no embedded map exists
"""

Parse style mapping strings into internal representations.
def read_style_mapping(string):
"""
Parse style mapping strings.
Parameters:
- string: str, style mapping text line
Returns:
Result object with parsed style mapping or warning
Raises:
LineParseError: When style mapping syntax is invalid
"""
class LineParseError(Exception):
"""Raised for style mapping parse errors."""
def style(document_matcher, html_path):
"""
Create style mapping from document matcher to HTML path.
Parameters:
- document_matcher: DocumentMatcher, matcher for document elements
- html_path: HtmlPath, target HTML structure
Returns:
Style named tuple
"""

Matchers for identifying specific document elements and formatting.
def paragraph(style_id=None, style_name=None, numbering=None):
"""
Create paragraph matcher.
Parameters:
- style_id: str, Word style ID to match
- style_name: str, Word style name to match
- numbering: object, numbering level to match
Returns:
ParagraphMatcher instance
"""
def run(style_id=None, style_name=None):
"""
Create run matcher.
Parameters:
- style_id: str, Word style ID to match
- style_name: str, Word style name to match
Returns:
RunMatcher instance
"""
def table(style_id=None, style_name=None):
"""
Create table matcher.
Parameters:
- style_id: str, Word style ID to match
- style_name: str, Word style name to match
Returns:
TableMatcher instance
"""
def highlight(color=None):
"""
Create highlight matcher.
Parameters:
- color: str, highlight color to match (optional)
Returns:
HighlightMatcher instance
"""

Pre-defined matchers for common text formatting.
# Formatting matcher constants
bold = BoldMatcher() # Matches bold formatting
italic = ItalicMatcher() # Matches italic formatting
underline = UnderlineMatcher() # Matches underline formatting
strikethrough = StrikethroughMatcher() # Matches strikethrough formatting
all_caps = AllCapsMatcher() # Matches all-caps formatting
small_caps = SmallCapsMatcher() # Matches small-caps formatting
comment_reference = CommentReferenceMatcher() # Matches comment references

Matchers for different types of document breaks.
# Break matcher constants
line_break = LineBreakMatcher() # Matches line breaks
page_break = PageBreakMatcher() # Matches page breaks
column_break = ColumnBreakMatcher() # Matches column breaks

Matchers for string comparison in style names and IDs.
def equal_to(value):
"""
Create case-insensitive string equality matcher.
Parameters:
- value: str, string to match exactly (case-insensitive)
Returns:
StringMatcher instance
"""
def starts_with(value):
"""
Create case-insensitive string prefix matcher.
Parameters:
- value: str, prefix to match (case-insensitive)
Returns:
StringMatcher instance
"""

System for defining HTML output structures in style mappings.
def path(elements):
"""
Create HTML path from elements.
Parameters:
- elements: list, HTML path elements
Returns:
HtmlPath instance
"""
def element(names, attributes=None, class_names=None,
fresh=None, separator=None):
"""
Create HTML path elements for style mapping.
Parameters:
- names: str or list, HTML element name(s)
- attributes: dict, HTML attributes
- class_names: list, CSS class names
- fresh: bool, whether element should be fresh (force new element)
- separator: str, separator for multiple elements
Returns:
HtmlPathElement instance
"""
# Special path constants
empty = EmptyPath() # Empty HTML path (no output)
ignore = IgnorePath() # Path that ignores/removes content

Mammoth uses a simple text-based syntax for style mappings:
<document_matcher> => <html_path>

# Style mapping examples
style_map = """
# Headings
p.Heading1 => h1:fresh
p.Heading2 => h2:fresh
p[style-name='Custom Heading'] => h3.custom:fresh
# Text formatting
r.Strong => strong
r[style-name='Emphasis'] => em
# Lists
p:unordered-list(1) => ul > li:fresh
p:ordered-list(1) => ol > li:fresh
# Tables
table.CustomTable => table.custom-table
# Ignore unwanted content
r[style-name='Hidden'] =>
p.Footer =>
# Comments (lines starting with #)
# This is a comment and will be ignored
"""
# Use style map in conversion
with open("document.docx", "rb") as docx_file:
result = mammoth.convert_to_html(
docx_file,
style_map=style_map
)

p - Paragraph elements
r - Run elements
table - Table elements
.StyleName - Match by style name
[style-name='Style Name'] - Match by style name with spaces
[style-id='styleId'] - Match by style ID
:unordered-list(level) - Match unordered list at level
:ordered-list(level) - Match ordered list at level
h1 - Create h1 element
h1.class-name - Create h1 with CSS class
div.container > p - Nested elements
:fresh - Force new element creation
ul|ol - Alternative elements
=> alone - Ignore content

Mammoth includes extensive built-in style mappings:
# Built-in mappings include:
"""
# Standard headings
p.Heading1 => h1:fresh
p.Heading2 => h2:fresh
p.Heading3 => h3:fresh
p.Heading4 => h4:fresh
p.Heading5 => h5:fresh
p.Heading6 => h6:fresh
# Alternative heading formats
p[style-name='Heading 1'] => h1:fresh
p[style-name='heading 1'] => h1:fresh
# Apple Pages
p.Heading => h1:fresh
p[style-name='Heading'] => h1:fresh
# Lists with nesting
p:unordered-list(1) => ul > li:fresh
p:unordered-list(2) => ul|ol > li > ul > li:fresh
p:ordered-list(1) => ol > li:fresh
p:ordered-list(2) => ul|ol > li > ol > li:fresh
# Text formatting
r[style-name='Strong'] => strong
r[style-name='Hyperlink'] =>
# Notes
p[style-name='footnote text'] => p:fresh
r[style-name='footnote reference'] =>
p[style-name='endnote text'] => p:fresh
r[style-name='endnote reference'] =>
# Normal paragraphs
p[style-name='Normal'] => p:fresh
p.Body => p:fresh
"""

import mammoth
# Embed style map in DOCX file
style_map = "p.CustomStyle => div.special"
with open("document.docx", "r+b") as docx_file:
mammoth.embed_style_map(docx_file, style_map)
# Later, read embedded style map
with open("document.docx", "rb") as docx_file:
embedded_map = mammoth.read_embedded_style_map(docx_file)
print(embedded_map) # "p.CustomStyle => div.special"

import mammoth
def process_options(options):
    """Run conversion options through mammoth and report any warnings.

    Parameters:
    - options: dict, conversion options passed to mammoth.options.read_options

    Returns:
    The result object returned by mammoth.options.read_options, unchanged.
    Each message attached to it is printed as "Style warning: <text>".
    """
    outcome = mammoth.options.read_options(options)
    # An empty or missing message collection means there is nothing to report.
    for warning in outcome.messages or ():
        print(f"Style warning: {warning.message}")
    return outcome
# Use custom options processing
options = {
"style_map": "p.Custom => div.processed",
"include_default_style_map": True
}
processed_options = process_options(options)

import mammoth
def validate_style_map(style_map_text):
    """Check every rule in a style map and collect the problems found.

    Parameters:
    - style_map_text: str, style map with one mapping rule per line

    Returns:
    list of str, one "Line N: <problem>" entry per reported message or
    parse error; empty when nothing was reported.

    Blank lines and lines starting with '#' are skipped. Line numbers are
    1-based and counted after surrounding whitespace has been stripped from
    the whole text, so leading blank lines do not shift the numbering.
    """
    problems = []
    for number, raw_line in enumerate(style_map_text.strip().split('\n'), 1):
        rule = raw_line.strip()
        # Nothing to parse on blank lines and comments.
        if not rule or rule.startswith('#'):
            continue
        try:
            parsed = mammoth.styles.parser.read_style_mapping(rule)
            for note in parsed.messages or ():
                problems.append(f"Line {number}: {note.message}")
        except mammoth.styles.parser.LineParseError as exc:
            problems.append(f"Line {number}: {exc}")
    return problems
# Validate before using
style_map = """
p.Heading1 => h1:fresh
invalid syntax here
p.Heading2 => h2:fresh
"""
errors = validate_style_map(style_map)
if errors:
for error in errors:
print(error)

Functions for processing and validating conversion options.
def read_options(options):
"""
Process and validate conversion options.
Parameters:
- options: dict, conversion options dictionary including:
- style_map: str, custom style mapping rules
- embedded_style_map: str, style map from DOCX file
- include_default_style_map: bool, use built-in styles (default: True)
- ignore_empty_paragraphs: bool, skip empty paragraphs (default: True)
- convert_image: function, custom image conversion function
- output_format: str, "html" or "markdown"
- id_prefix: str, prefix for HTML element IDs
Returns:
Result object with processed options dictionary
"""

Usage example:
import mammoth
# Process options with validation
options = {
"style_map": "p.CustomHeading => h1.special",
"ignore_empty_paragraphs": False,
"include_default_style_map": True
}
result = mammoth.options.read_options(options)
if result.messages:
for message in result.messages:
print(f"Option warning: {message.message}")
processed_options = result.value

Install with Tessl CLI
npx tessl i tessl/pypi-mammoth