Convert Word documents from docx to simple and clean HTML and Markdown
—
Utilities for transforming document elements before conversion. Mammoth's transformation system allows for custom processing of paragraphs, runs, and other document components, enabling advanced document manipulation workflows.
Create transformations that target specific document element types.
def paragraph(transform_paragraph):
    """
    Create a transform that applies to paragraph elements.

    Parameters:
    - transform_paragraph: function, called with a paragraph element and
      returning the paragraph to use in its place

    Returns:
    Transform function that processes the entire document
    """
def run(transform_run):
    """
    Create a transform that applies to run elements.

    Parameters:
    - transform_run: function, called with a run element and returning the
      run to use in its place

    Returns:
    Transform function that processes the entire document
    """
def element_of_type(element_type, transform):
    """
    Create a transform for elements of a specific type.

    Parameters:
    - element_type: class/type to match
    - transform: function to apply to matching elements

    Returns:
    Transform function that processes the entire document
    """


Functions for finding and extracting specific elements from the document tree.
def get_descendants_of_type(element, element_type):
    """
    Get all descendant elements of the specified type.

    Parameters:
    - element: Root element to search from
    - element_type: Type/class to filter for

    Returns:
    List of matching descendant elements
    """
def get_descendants(element):
    """
    Get all descendant elements.

    Parameters:
    - element: Root element to search from

    Returns:
    List of all descendant elements
    """


When creating transforms, you'll work with these document element types:
class Document:
    """Root document container."""

    children: list  # Child elements
    notes: list  # Footnotes and endnotes
    comments: list  # Document comments
class Paragraph:
    """Paragraph element with styling information."""

    children: list  # Child elements (runs, hyperlinks, etc.)
    style_id: str  # Word style ID
    style_name: str  # Word style name
    numbering: object  # List numbering information
    alignment: str  # Text alignment
    indent: object  # Indentation settings
class Run:
    """Text run with formatting."""

    children: list  # Child elements (text, breaks, etc.)
    style_id: str  # Word style ID
    style_name: str  # Word style name
    is_bold: bool  # Bold formatting
    is_italic: bool  # Italic formatting
    is_underline: bool  # Underline formatting
    is_strikethrough: bool  # Strikethrough formatting
    is_all_caps: bool  # All caps formatting
    is_small_caps: bool  # Small caps formatting
    vertical_alignment: str  # Superscript/subscript
    font: str  # Font name
    font_size: int  # Font size in half-points
    highlight: str  # Highlight color
class Text:
    """Plain text node."""

    value: str  # Text content
class Hyperlink:
    """Hyperlink element."""

    children: list  # Child elements
    href: str  # Link URL
    anchor: str  # Internal anchor
    target_frame: str  # Target frame
class Image:
    """Image element."""

    alt_text: str  # Alternative text
    content_type: str  # MIME type

    def open(self):
        """Open image data for reading."""
class Table:
    """Table element."""

    children: list  # TableRow elements
    style_id: str  # Word style ID
    style_name: str  # Word style name
class TableRow:
    """Table row element."""

    children: list  # TableCell elements
class TableCell:
    """Table cell element."""

    children: list  # Cell content elements
    colspan: int  # Column span
    rowspan: int  # Row span
class Break:
    """Line, page, or column break."""

    break_type: str  # "line", "page", "column"


import mammoth
def _is_empty_paragraph(element):
    """Return True if element is a paragraph with no non-whitespace text."""
    if not isinstance(element, mammoth.documents.Paragraph):
        return False
    return not any(
        isinstance(descendant, mammoth.documents.Text)
        and descendant.value.strip()
        for descendant in mammoth.transforms.get_descendants(element)
    )


def remove_empty_paragraphs(element):
    """
    Return a copy of element with empty paragraphs removed at every depth.

    A transform built with mammoth.transforms.paragraph substitutes each
    paragraph with the function's return value, so returning None from it
    would place None among the parent's children rather than delete the
    paragraph. Removal is therefore done here in the parent, by filtering
    its children.

    Parameters:
    - element: any document element; pass this function directly as
      transform_document so that it receives the document root

    Returns:
    The element itself when it has no children, otherwise a copy whose
    children exclude paragraphs without non-whitespace text
    """
    children = getattr(element, "children", None)
    if children is None:
        return element  # Leaf element, nothing to filter
    kept = [
        remove_empty_paragraphs(child)
        for child in children
        if not _is_empty_paragraph(child)
    ]
    return element.copy(children=kept)


# The function walks the tree itself, so it is used as the document transform
transform = remove_empty_paragraphs

# Apply during conversion
with open("document.docx", "rb") as docx_file:
    result = mammoth.convert_to_html(
        docx_file,
        transform_document=transform
    )

import mammoth
def convert_custom_headings(paragraph):
    """
    Map custom heading style names onto the standard heading style names.

    Parameters:
    - paragraph: paragraph element to inspect

    Returns:
    A copy of the paragraph with the standard style name when its style name
    is one of the custom headings, otherwise the paragraph itself
    """
    renames = {
        "CustomHeading1": "Heading 1",
        "CustomHeading2": "Heading 2",
    }
    standard_name = renames.get(paragraph.style_name)
    if standard_name is None:
        return paragraph  # Not a custom heading: leave untouched
    return paragraph.copy(style_name=standard_name)
# Wrap the per-paragraph function into a whole-document transform
transform = mammoth.transforms.paragraph(convert_custom_headings)

# Run the conversion with the transform applied
with open("document.docx", "rb") as docx_stream:
    result = mammoth.convert_to_html(docx_stream, transform_document=transform)

import mammoth
def uppercase_bold_text(run):
    """
    Upper-case the text directly inside bold runs.

    Parameters:
    - run: run element to inspect

    Returns:
    The run itself when it is not bold; otherwise a copy whose Text children
    are replaced by upper-cased text elements (other children are kept as-is)
    """
    if not run.is_bold:
        return run

    def shout(node):
        # Only Text nodes carry a value to upper-case
        if isinstance(node, mammoth.documents.Text):
            return mammoth.documents.text(node.value.upper())
        return node

    return run.copy(children=[shout(node) for node in run.children])
# Wrap the per-run function into a whole-document transform
transform = mammoth.transforms.run(uppercase_bold_text)

# Run the conversion with the transform applied
with open("document.docx", "rb") as docx_stream:
    result = mammoth.convert_to_html(docx_stream, transform_document=transform)

import mammoth
def analyze_and_transform(document):
    """
    Report how many headings and images the document contains.

    Parameters:
    - document: document element to inspect

    Returns:
    The same document, unchanged
    """
    # A heading is any paragraph whose style name contains "Heading"
    all_paragraphs = mammoth.transforms.get_descendants_of_type(
        document, mammoth.documents.Paragraph
    )
    headings = [
        candidate
        for candidate in all_paragraphs
        if candidate.style_name and "Heading" in candidate.style_name
    ]
    print(f"Found {len(headings)} headings")

    images = mammoth.transforms.get_descendants_of_type(
        document, mammoth.documents.Image
    )
    print(f"Found {len(images)} images")

    return document
# The analysis function takes the whole document, so it is passed directly
with open("document.docx", "rb") as docx_stream:
    result = mammoth.convert_to_html(
        docx_stream, transform_document=analyze_and_transform
    )

import mammoth
def remove_comments(paragraph):
    """
    Drop comment references from a paragraph's direct children.

    Parameters:
    - paragraph: paragraph element to filter

    Returns:
    A copy of the paragraph without CommentReference children
    """
    # NOTE(review): only direct children are checked; comment references
    # nested deeper (e.g. inside runs) would be kept - confirm where they
    # appear in the document tree.
    kept = [
        node
        for node in paragraph.children
        if not isinstance(node, mammoth.documents.CommentReference)
    ]
    return paragraph.copy(children=kept)
def normalize_whitespace(run):
    """
    Collapse whitespace sequences in a run's Text children.

    Every sequence of whitespace becomes a single space. Whitespace at the
    start or end of a text node is collapsed rather than stripped: the space
    separating two words can sit at the edge of a node ("Hello " followed by
    "world" in the next run), so stripping it would join the words.

    Parameters:
    - run: run element to normalize

    Returns:
    A copy of the run whose Text children have collapsed whitespace (other
    children are kept as-is)
    """
    import re  # Local import: only this function needs it

    def collapse(node):
        if isinstance(node, mammoth.documents.Text):
            return mammoth.documents.text(re.sub(r"\s+", " ", node.value))
        return node

    return run.copy(children=[collapse(node) for node in run.children])
def combined_transform(document):
    """
    Apply comment removal and then whitespace normalization to a document.

    Parameters:
    - document: document element to transform

    Returns:
    The document after both transforms have been applied in order
    """
    pipeline = (
        mammoth.transforms.paragraph(remove_comments),
        mammoth.transforms.run(normalize_whitespace),
    )
    # Each step receives the output of the previous one
    for step in pipeline:
        document = step(document)
    return document
with open("document.docx", "rb") as docx_file:
result = mammoth.convert_to_html(
docx_file,
transform_document=combined_transform
)Mammoth provides factory functions for creating document elements:
def document(children, notes=None, comments=None):
    """
    Create Document instance.

    Parameters:
    - children: child elements
    - notes: footnotes and endnotes
    - comments: document comments
    """
def paragraph(children, style_id=None, style_name=None,
              numbering=None, alignment=None, indent=None):
    """
    Create Paragraph instance.

    Parameters:
    - children: child elements (runs, hyperlinks, etc.)
    - style_id: Word style ID
    - style_name: Word style name
    - numbering: list numbering information
    - alignment: text alignment
    - indent: indentation settings
    """
def run(children, style_id=None, style_name=None,
        is_bold=None, is_italic=None, **kwargs):
    """
    Create Run instance with normalized boolean fields.

    Parameters:
    - children: child elements (text, breaks, etc.)
    - style_id: Word style ID
    - style_name: Word style name
    - is_bold: bold formatting
    - is_italic: italic formatting
    - **kwargs: presumably the remaining Run fields - confirm against the
      library
    """
def text(value):
    """
    Create Text instance.

    Parameters:
    - value: text content
    """
def hyperlink(children, href=None, anchor=None, target_frame=None):
    """
    Create Hyperlink instance.

    Parameters:
    - children: child elements
    - href: link URL
    - anchor: internal anchor
    - target_frame: target frame
    """
def table(children, style_id=None, style_name=None):
    """
    Create Table instance.

    Parameters:
    - children: TableRow elements
    - style_id: Word style ID
    - style_name: Word style name
    """


These factory functions can be used when creating new document elements in transforms.
Install with Tessl CLI
npx tessl i tessl/pypi-mammoth