tessl/pypi-mammoth

Convert Word documents from docx to simple and clean HTML and Markdown

—

Pending

Overview

Eval results

Files

Document Transformation

Name: tessl/pypi-mammoth
Author: tessl

Utilities for transforming document elements before conversion. Mammoth's transformation system allows for custom processing of paragraphs, runs, and other document components, enabling advanced document manipulation workflows.

Capabilities

Element Type Transforms

Create transforms that target specific document element types. These helpers are accessed through `mammoth.transforms`, as in the examples below.

def paragraph(transform_paragraph):
    """
    Build a transform that targets paragraph elements.

    Parameters:
    - transform_paragraph: function used to transform paragraph elements

    Returns:
    A transform function that processes the entire document
    """

def run(transform_run):
    """
    Build a transform that targets run elements.

    Parameters:
    - transform_run: function used to transform run elements

    Returns:
    A transform function that processes the entire document
    """

def element_of_type(element_type, transform):
    """
    Build a transform that targets elements of one specific type.

    Parameters:
    - element_type: the class/type an element must match
    - transform: function applied to the elements that match

    Returns:
    A transform function that processes the entire document
    """

Document Traversal

Functions for finding and extracting specific elements from the document tree.

def get_descendants_of_type(element, element_type):
    """
    Collect every descendant of `element` that has the given type.

    Parameters:
    - element: root element the search starts from
    - element_type: type/class used as the filter

    Returns:
    A list of the matching descendant elements
    """

def get_descendants(element):
    """
    Collect every descendant of `element`.

    Parameters:
    - element: root element the search starts from

    Returns:
    A list of all descendant elements
    """

Document Element Types

When creating transforms, you'll work with these document element types:

class Document:
    """Container at the root of the document tree."""
    children: list  # Elements nested directly under the document
    notes: list     # Footnotes and endnotes
    comments: list  # Comments belonging to the document

class Paragraph:
    """A paragraph together with its styling information."""
    children: list      # Nested elements such as runs and hyperlinks
    style_id: str       # ID of the Word style
    style_name: str     # Name of the Word style
    numbering: object   # List numbering details
    alignment: str      # How the text is aligned
    indent: object      # Indentation settings

class Run:
    """A run of text carrying character-level formatting."""
    children: list           # Nested elements such as text and breaks
    style_id: str           # ID of the Word style
    style_name: str         # Name of the Word style
    is_bold: bool           # True when the run is bold
    is_italic: bool         # True when the run is italic
    is_underline: bool      # True when the run is underlined
    is_strikethrough: bool  # True when the run is struck through
    is_all_caps: bool       # True when the run is rendered in all caps
    is_small_caps: bool     # True when the run is rendered in small caps
    vertical_alignment: str # Superscript/subscript setting
    font: str               # Name of the font
    # NOTE(review): the unit below is as stated on this page; confirm against
    # mammoth's reader, which may already convert half-points to points.
    font_size: int          # Font size in half-points
    highlight: str          # Highlight color

class Text:
    """A node holding plain text."""
    value: str  # The text itself

class Hyperlink:
    """A hyperlink wrapping its child elements."""
    children: list      # Nested elements
    href: str           # URL the link points to
    anchor: str         # Anchor inside the document
    target_frame: str   # Frame the link targets

class Image:
    """An image embedded in the document."""
    alt_text: str      # Alternative text for the image
    content_type: str  # MIME type of the image data

    def open(self):
        """Open the image data so it can be read."""

class Table:
    """A table made up of rows."""
    children: list    # The table's TableRow elements
    style_id: str     # ID of the Word style
    style_name: str   # Name of the Word style

class TableRow:
    """A single row of a table."""
    children: list  # The row's TableCell elements

class TableCell:
    """A single cell of a table row."""
    children: list  # Elements making up the cell content
    colspan: int    # Number of columns spanned
    rowspan: int    # Number of rows spanned

class Break:
    """A break in the text flow: line, page, or column."""
    break_type: str  # One of "line", "page", "column"

Transform Examples

Remove Empty Paragraphs

import mammoth

def _has_text(element):
    """Return True if any Text node below `element` holds non-whitespace text."""
    return any(
        isinstance(node, mammoth.documents.Text) and node.value.strip()
        for node in mammoth.transforms.get_descendants(element)
    )


def remove_empty_paragraphs(element):
    """
    Return a copy of `element` with text-free paragraphs removed from its subtree.

    This is written as a document-level transform rather than a per-paragraph
    one: a transform built with mammoth.transforms.paragraph() substitutes the
    returned value for the paragraph, so returning None would leave None in the
    parent's children instead of removing the paragraph. Removal therefore has
    to happen where the parent's child list is rebuilt.

    Parameters:
    - element: the document, or any element within it

    Returns:
    The element itself when it has no children; otherwise a copy whose
    children exclude paragraphs that contain no non-whitespace text
    """
    children = getattr(element, "children", None)
    if children is None:
        # Leaf element: nothing to filter
        return element

    kept = [
        remove_empty_paragraphs(child)
        for child in children
        if not (
            isinstance(child, mammoth.documents.Paragraph)
            and not _has_text(child)
        )
    ]
    return element.copy(children=kept)

# The function walks the whole tree itself, so it is used directly as the
# document transform (no mammoth.transforms.paragraph() wrapper)
transform = remove_empty_paragraphs

# Apply during conversion
with open("document.docx", "rb") as docx_file:
    result = mammoth.convert_to_html(
        docx_file,
        transform_document=transform
    )

Convert Custom Styles

import mammoth

def convert_custom_headings(paragraph):
    """
    Map custom heading style names onto the standard heading style names.

    Parameters:
    - paragraph: paragraph element whose style_name is inspected

    Returns:
    A copy of the paragraph with the standard style name when a custom
    heading style matches; otherwise the paragraph itself
    """
    renames = (
        ("CustomHeading1", "Heading 1"),
        ("CustomHeading2", "Heading 2"),
    )
    for custom_name, standard_name in renames:
        if paragraph.style_name == custom_name:
            return paragraph.copy(style_name=standard_name)

    # No custom heading style matched: hand the paragraph back untouched
    return paragraph

# Wrap the per-paragraph function in a transform over the document
transform = mammoth.transforms.paragraph(convert_custom_headings)

# Hand the transform to the converter
with open("document.docx", mode="rb") as docx_file:
    result = mammoth.convert_to_html(docx_file, transform_document=transform)

Modify Text Content

import mammoth

def uppercase_bold_text(run):
    """
    Upper-case the text directly inside bold runs.

    Parameters:
    - run: run element to inspect

    Returns:
    The run itself when it is not bold; otherwise a copy whose Text children
    carry upper-cased values and whose other children are kept as they are
    """
    # Guard clause: runs that are not bold pass through untouched
    if not run.is_bold:
        return run

    def upper_if_text(node):
        # Only Text nodes are rebuilt; every other child is reused
        if isinstance(node, mammoth.documents.Text):
            return mammoth.documents.text(node.value.upper())
        return node

    return run.copy(children=[upper_if_text(node) for node in run.children])

# Wrap the per-run function in a transform over the document
transform = mammoth.transforms.run(uppercase_bold_text)

# Hand the transform to the converter
with open("document.docx", mode="rb") as docx_file:
    result = mammoth.convert_to_html(docx_file, transform_document=transform)

Complex Document Analysis

import mammoth

def analyze_and_transform(document):
    """
    Report how many headings and images the document contains.

    Parameters:
    - document: document element to analyze

    Returns:
    The same document, unchanged
    """
    find_all = mammoth.transforms.get_descendants_of_type

    # A paragraph counts as a heading when its style name contains "Heading"
    headings = [
        candidate
        for candidate in find_all(document, mammoth.documents.Paragraph)
        if candidate.style_name and "Heading" in candidate.style_name
    ]
    print(f"Found {len(headings)} headings")

    images = find_all(document, mammoth.documents.Image)
    print(f"Found {len(images)} images")

    # Analysis only: the document is passed through as-is
    return document

# The analysis function takes the whole document, so it is passed directly
with open("document.docx", mode="rb") as docx_file:
    result = mammoth.convert_to_html(
        docx_file, transform_document=analyze_and_transform
    )

Combining Transforms

import mammoth

def remove_comments(paragraph):
    """
    Drop comment references from a paragraph's direct children.

    Parameters:
    - paragraph: paragraph element to filter

    Returns:
    A copy of the paragraph whose children exclude CommentReference elements
    """
    kept = [
        node
        for node in paragraph.children
        if not isinstance(node, mammoth.documents.CommentReference)
    ]
    return paragraph.copy(children=kept)

def normalize_whitespace(run):
    """
    Normalize the whitespace of the text directly inside a run.

    Parameters:
    - run: run element to process

    Returns:
    A copy of the run in which each Text child has its value split on
    whitespace and re-joined with single spaces; other children are kept
    """
    def squeeze(node):
        if isinstance(node, mammoth.documents.Text):
            # split() with no arguments also discards leading/trailing whitespace
            return mammoth.documents.text(" ".join(node.value.split()))
        return node

    return run.copy(children=[squeeze(node) for node in run.children])

def combined_transform(document):
    """
    Run the comment-removal and whitespace transforms one after the other.

    Parameters:
    - document: document element to transform

    Returns:
    The document after both transforms have been applied in order
    """
    # Both transforms are built up front, then applied in sequence
    stages = (
        mammoth.transforms.paragraph(remove_comments),
        mammoth.transforms.run(normalize_whitespace),
    )
    for stage in stages:
        document = stage(document)

    return document

# The combined function takes the whole document, so it is passed directly
with open("document.docx", mode="rb") as docx_file:
    result = mammoth.convert_to_html(
        docx_file, transform_document=combined_transform
    )

Factory Functions

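Mammoth provides factory functions for creating document elements. They are accessed through `mammoth.documents` (for example, `mammoth.documents.text(...)` in the examples above):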

def document(children, notes=None, comments=None):
    """
    Build a Document instance.

    Parameters:
    - children: child elements of the document
    - notes: optional, defaults to None
    - comments: optional, defaults to None
    """

def paragraph(children, style_id=None, style_name=None,
              numbering=None, alignment=None, indent=None):
    """
    Build a Paragraph instance.

    Parameters:
    - children: child elements of the paragraph
    - style_id, style_name, numbering, alignment, indent: optional,
      each defaults to None
    """

def run(children, style_id=None, style_name=None,
        is_bold=None, is_italic=None, **kwargs):
    """
    Build a Run instance with normalized boolean fields.

    Parameters:
    - children: child elements of the run
    - style_id, style_name, is_bold, is_italic: optional, each defaults to None
    - **kwargs: additional keyword arguments (presumably the remaining Run
      fields; TODO confirm)
    """

def text(value):
    """
    Build a Text instance.

    Parameters:
    - value: the text content
    """

def hyperlink(children, href=None, anchor=None, target_frame=None):
    """
    Build a Hyperlink instance.

    Parameters:
    - children: child elements of the hyperlink
    - href, anchor, target_frame: optional, each defaults to None
    """

def table(children, style_id=None, style_name=None):
    """
    Build a Table instance.

    Parameters:
    - children: child elements of the table
    - style_id, style_name: optional, each defaults to None
    """

These factory functions can be used when creating new document elements in transforms.

Install with Tessl CLI