tessl/pypi-mammoth

Convert Word documents from docx to simple and clean HTML and Markdown

—

Pending

Overview

Eval results

Files

Image Handling

Name: tessl/pypi-mammoth
Author: tessl

Functions for processing and converting images embedded in DOCX documents. Mammoth provides flexible image handling capabilities, including data URI conversion and support for custom image processing functions.

Capabilities

Image Element Decorator

Creates image conversion functions that produce HTML img elements with proper attributes and alt text handling.

def img_element(func):
    """
    Decorator that turns an attribute-producing function into an image
    conversion function that emits HTML img elements.

    Parameters:
    - func: function, takes an image object and returns a dict of attributes
      for the img element (for example {"src": "..."})

    Returns:
    Image conversion function that returns a list of HTML img elements.
    When the source document supplies alt text for the image, it is added
    to the generated img element automatically.
    """

Usage example:

import mammoth

@mammoth.images.img_element
def custom_image_handler(image):
    """
    Build img attributes that point at an application-served image URL.

    Parameters:
    - image: Image object supplied by Mammoth (alt_text, content_type, open())

    Returns:
    Dict of attributes ("src" and "class") for the generated img element.
    """
    # Function-scope imports keep this snippet self-contained.
    import hashlib
    import mimetypes

    # Image objects expose alt_text, content_type and open() but no file
    # name, so derive a stable name from the image content instead.
    with image.open() as image_bytes:
        digest = hashlib.sha256(image_bytes.read()).hexdigest()

    # Fall back to a generic extension when the MIME type is missing or
    # unknown to the mimetypes registry.
    extension = mimetypes.guess_extension(image.content_type or "") or ".bin"

    # NOTE: only the URL is produced here; storing the image bytes under
    # this name is left to the application.
    return {
        "src": f"/images/{digest}{extension}",
        "class": "document-image",
    }

# Convert the document, routing every embedded image through the handler
with open("document.docx", "rb") as docx_file:
    result = mammoth.convert_to_html(docx_file, convert_image=custom_image_handler)

Data URI Conversion

Converts images to base64 data URIs, embedding image data directly in the HTML output.

def data_uri(image):
    """
    Convert an image to an HTML img element whose src is a base64 data URI,
    so the image data is embedded directly in the HTML output.

    Parameters:
    - image: Image object with .open() method and .content_type property

    Returns:
    List containing HTML img element with data URI src
    """

Usage example:

import mammoth

# Embed every image in the output as a data URI instead of linking to a file
with open("document.docx", "rb") as docx_file:
    result = mammoth.convert_to_html(docx_file, convert_image=mammoth.images.data_uri)
    # The resulting HTML carries the image data inline as data URIs

Inline Image Handler

Backwards-compatible alias for img_element, retained so that code written against version 0.3.x keeps working.

# Backwards-compatibility alias: `inline` is bound to the same object as
# img_element, kept for code written against version 0.3.x.
inline = img_element

Image Object Properties

Custom image handlers receive an image object that exposes the following attributes and method:

class Image:
    """
    Image object passed to conversion functions.

    Exposes the image's alt text and MIME type as attributes, and its
    binary data through open().
    """
    # Alternative text for the image.
    # NOTE(review): annotated as str, but this page describes alt text as
    # something that is only sometimes available -- it may be None; confirm.
    alt_text: str
    # MIME type (e.g., "image/png", "image/jpeg")
    content_type: str

    def open(self):
        """
        Open image data for reading.

        Returns:
        File-like object with image binary data. The examples on this page
        use it as a context manager (``with image.open() as image_bytes``).
        """

Custom Image Handling Examples

Save Images to Files

import mammoth
import os
from uuid import uuid4

@mammoth.images.img_element
def save_image_to_file(image):
    """
    Write the image to ./images under a unique name and link to that file.

    Parameters:
    - image: Image object supplied by Mammoth

    Returns:
    Dict with the img "src" attribute set to the saved file's path.
    """
    # Map the MIME type to a file extension; unknown types get ".bin"
    known_extensions = {
        "image/png": ".png",
        "image/jpeg": ".jpg",
        "image/gif": ".gif"
    }
    extension = known_extensions.get(image.content_type, ".bin")

    # uuid4 keeps names unique across images and across runs
    filename = f"image_{uuid4()}{extension}"
    filepath = f"./images/{filename}"

    # Create the target directory on first use
    os.makedirs("./images", exist_ok=True)

    # Copy the image bytes out of the document into the new file
    with image.open() as source, open(filepath, "wb") as target:
        target.write(source.read())

    return {"src": filepath}

# Convert the document; each embedded image is written to ./images
with open("document.docx", "rb") as docx_file:
    result = mammoth.convert_to_html(docx_file, convert_image=save_image_to_file)

Remote Image Upload

import mammoth
import requests

@mammoth.images.img_element
def upload_to_server(image):
    """
    Upload the image to a remote server and link to the returned URL.

    If the upload fails (network error, timeout, or a non-200 response),
    the image is embedded as a data URI instead.

    Parameters:
    - image: Image object supplied by Mammoth

    Returns:
    Dict of attributes for the generated img element.
    """
    # Function-scope import keeps this snippet self-contained.
    import mimetypes

    # The upload needs a file name; derive its extension from the MIME type
    # and fall back to ".bin" when the type is missing or unknown.
    extension = mimetypes.guess_extension(image.content_type or "") or ".bin"

    # Upload image to remote server
    try:
        with image.open() as image_bytes:
            files = {"image": (f"image{extension}", image_bytes, image.content_type)}
            response = requests.post(
                "https://api.example.com/upload",
                files=files,
                # Without a timeout a stalled server would hang the conversion.
                timeout=30,
            )
    except requests.RequestException:
        # Treat connection problems like any other failed upload.
        response = None

    if response is not None and response.status_code == 200:
        image_url = response.json()["url"]
        return {"src": image_url}

    # Fallback to data URI: data_uri returns a list of img elements, so
    # reuse the attributes of the single element it produced.
    return mammoth.images.data_uri(image)[0].attributes

# Convert the document; each embedded image is sent through the uploader
with open("document.docx", "rb") as docx_file:
    result = mammoth.convert_to_html(docx_file, convert_image=upload_to_server)

Image Processing

import mammoth
from PIL import Image as PILImage
import base64
import io

@mammoth.images.img_element
def resize_and_convert(image):
    """
    Downscale wide images and embed them as JPEG data URIs.

    Images wider than 800 pixels are resized to 800 pixels wide with the
    aspect ratio preserved; every image is then re-encoded as JPEG.

    Parameters:
    - image: Image object supplied by Mammoth

    Returns:
    Dict with the img "src" attribute set to a base64 JPEG data URI.
    """
    max_width = 800

    with image.open() as image_bytes:
        # Open with PIL
        pil_image = PILImage.open(image_bytes)

        # Resize if too large
        if pil_image.width > max_width:
            ratio = max_width / pil_image.width
            # Keep at least one pixel of height for extremely wide images.
            new_height = max(1, int(pil_image.height * ratio))
            pil_image = pil_image.resize((max_width, new_height))

        # JPEG cannot store alpha or palette modes (e.g. RGBA/P PNGs), and
        # saving them raises an error, so normalise to RGB first.
        # Transparency is discarded by this conversion.
        if pil_image.mode not in ("RGB", "L"):
            pil_image = pil_image.convert("RGB")

        # Convert to JPEG while the source stream is still open, because
        # PIL may load the pixel data lazily.
        output = io.BytesIO()
        pil_image.save(output, format="JPEG", quality=85)

    # Encode as data URI
    encoded = base64.b64encode(output.getvalue()).decode("ascii")
    return {"src": f"data:image/jpeg;base64,{encoded}"}

# Convert the document; each embedded image is resized and re-encoded
with open("document.docx", "rb") as docx_file:
    result = mammoth.convert_to_html(docx_file, convert_image=resize_and_convert)

Image Alt Text

Mammoth automatically preserves alt text from Word documents when available:

import mammoth
from uuid import uuid4

@mammoth.images.img_element
def preserve_alt_text(image):
    """
    Return only the "src" attribute and leave alt text to the decorator.

    Parameters:
    - image: Image object supplied by Mammoth

    Returns:
    Dict of attributes for the generated img element (just "src" here).
    """
    attributes = {"src": f"/images/{uuid4()}.jpg"}

    # No "alt" entry is needed here: the img_element decorator adds the alt
    # text itself when the source document provides it for this image.

    return attributes

The img_element decorator automatically adds alt text to the generated img element if image.alt_text is available from the source document.

Install with Tessl CLI