Convert Word documents from docx to simple and clean HTML and Markdown
—
Functions for processing and converting images embedded in DOCX documents. Mammoth provides flexible image handling capabilities, including data URI conversion and support for custom image processing functions.
Creates image conversion functions that produce HTML img elements with proper attributes and alt text handling.
def img_element(func):
"""
Decorator that converts image conversion functions to HTML img elements.
Parameters:
- func: function, takes an image object and returns attributes dict
Returns:
Image conversion function that returns list of HTML img elements
"""

Usage example:
import mammoth
@mammoth.images.img_element
def custom_image_handler(image):
    """Return img attributes pointing at a generated path under /images/.

    The image object passed to handlers exposes alt_text, content_type and
    open(); it carries no filename, so a unique name is generated here and
    the extension is derived from the MIME type.

    Parameters:
    - image: Image object with .content_type property

    Returns:
    Attributes dict with "src" and "class" keys for the img element
    """
    # Local import keeps this snippet self-contained.
    from uuid import uuid4

    # Unknown MIME types fall back to a generic binary suffix.
    extension = {
        "image/png": ".png",
        "image/jpeg": ".jpg",
        "image/gif": ".gif",
    }.get(image.content_type, ".bin")
    return {
        "src": f"/images/{uuid4()}{extension}",
        "class": "document-image",
    }
# Use with conversion
with open("document.docx", "rb") as docx_file:
result = mammoth.convert_to_html(
docx_file,
convert_image=custom_image_handler
)

Converts images to base64 data URIs, embedding image data directly in the HTML output.
def data_uri(image):
"""
Convert images to base64 data URIs.
Parameters:
- image: Image object with .open() method and .content_type property
Returns:
List containing HTML img element with data URI src
"""

Usage example:
import mammoth
# Use data URI conversion for embedded images
with open("document.docx", "rb") as docx_file:
result = mammoth.convert_to_html(
docx_file,
convert_image=mammoth.images.data_uri
)
# Images will be embedded as data URIs in the HTML

Backwards compatibility alias for img_element. Retained for compatibility with version 0.3.x.
inline = img_element # Alias for backwards compatibility

When working with custom image handlers, image objects have these properties:
class Image:
"""Image object passed to conversion functions."""
alt_text: str # Alternative text for the image
content_type: str # MIME type (e.g., "image/png", "image/jpeg")
def open(self):
"""
Open image data for reading.
Returns:
File-like object with image binary data
"""

import mammoth
import os
from uuid import uuid4
@mammoth.images.img_element
def save_image_to_file(image):
    """Write the embedded image under ./images and return its path as src.

    Parameters:
    - image: Image object with .open() method and .content_type property

    Returns:
    Attributes dict whose "src" is the path of the saved file
    """
    suffix_by_mime = {
        "image/png": ".png",
        "image/jpeg": ".jpg",
        "image/gif": ".gif",
    }
    # Unknown MIME types fall back to a generic binary suffix.
    suffix = suffix_by_mime.get(image.content_type, ".bin")
    target_name = f"image_{uuid4()}{suffix}"
    target_path = f"./images/{target_name}"

    # The output directory may not exist yet when the first image arrives.
    os.makedirs("./images", exist_ok=True)

    # Copy the embedded bytes out of the document into the new file.
    with image.open() as source, open(target_path, "wb") as sink:
        sink.write(source.read())

    return {"src": target_path}
# Use the custom handler
with open("document.docx", "rb") as docx_file:
result = mammoth.convert_to_html(
docx_file,
convert_image=save_image_to_file
)

import mammoth
import requests
@mammoth.images.img_element
def upload_to_server(image):
    """Upload the image to a remote server and return its URL as src.

    Falls back to an inline data URI when the server does not answer with
    HTTP 200.

    Parameters:
    - image: Image object with .open() method and .content_type property

    Returns:
    Attributes dict for the img element
    """
    # Derive the upload filename's extension from the MIME type; unknown
    # types get a generic binary suffix.
    extension = {
        "image/png": ".png",
        "image/jpeg": ".jpg",
        "image/gif": ".gif",
    }.get(image.content_type, ".bin")

    # Upload image to remote server
    with image.open() as image_bytes:
        files = {"image": (f"image{extension}", image_bytes, image.content_type)}
        # A timeout keeps an unresponsive server from hanging the conversion.
        response = requests.post(
            "https://api.example.com/upload",
            files=files,
            timeout=30,
        )

    if response.status_code == 200:
        return {"src": response.json()["url"]}

    # Fallback to data URI
    return mammoth.images.data_uri(image)[0].attributes
# Use the upload handler
with open("document.docx", "rb") as docx_file:
result = mammoth.convert_to_html(
docx_file,
convert_image=upload_to_server
)

import mammoth
from PIL import Image as PILImage
import base64
import io
@mammoth.images.img_element
def resize_and_convert(image):
    """Downscale wide images and embed them as JPEG data URIs.

    Images wider than 800 pixels are resized to 800 pixels wide with the
    aspect ratio preserved. The result is always re-encoded as JPEG, so any
    transparency in the source image is discarded.

    Parameters:
    - image: Image object with .open() method

    Returns:
    Attributes dict whose "src" is a base64 JPEG data URI
    """
    max_width = 800
    output = io.BytesIO()

    # Pillow decodes pixel data lazily, so all processing happens while the
    # underlying image stream is still open.
    with image.open() as image_bytes:
        # Open with PIL
        pil_image = PILImage.open(image_bytes)

        # Resize if too large
        if pil_image.width > max_width:
            ratio = max_width / pil_image.width
            # Clamp to 1px so extremely wide, short images keep a valid size.
            new_height = max(1, int(pil_image.height * ratio))
            pil_image = pil_image.resize((max_width, new_height))

        # JPEG cannot store alpha or palette data; saving an RGBA, LA or P
        # image (typical for PNG/GIF sources) would raise, so convert first.
        if pil_image.mode not in ("L", "RGB", "CMYK"):
            pil_image = pil_image.convert("RGB")

        # Convert to JPEG and encode as data URI
        pil_image.save(output, format="JPEG", quality=85)

    encoded = base64.b64encode(output.getvalue()).decode("ascii")
    return {"src": f"data:image/jpeg;base64,{encoded}"}
# Use the processing handler
with open("document.docx", "rb") as docx_file:
result = mammoth.convert_to_html(
docx_file,
convert_image=resize_and_convert
)

Mammoth automatically preserves alt text from Word documents when available:
@mammoth.images.img_element
def preserve_alt_text(image):
attributes = {"src": f"/images/{uuid4()}.jpg"}
# Alt text is automatically added by img_element decorator
# if image.alt_text is not None
return attributes

The img_element decorator automatically adds alt text to the generated img element if image.alt_text is available from the source document.
Install with Tessl CLI
npx tessl i tessl/pypi-mammoth