Convert Word documents from docx to simple and clean HTML and Markdown
—
Writer system for generating HTML and Markdown output from internal document representations. The writer system provides a flexible interface for creating different output formats and custom rendering logic.
Create writer instances for different output formats.
def writer(output_format=None):
"""
Create writer instance for specified output format.
Parameters:
- output_format: str, "html" or "markdown" (default: "html")
Returns:
HtmlWriter or MarkdownWriter instance
"""
def formats():
"""
Get available output format keys.
Returns:
List of supported format strings
"""

Usage example:
from mammoth.writers import writer, formats
# Get available formats
print(formats()) # ['html', 'markdown']
# Create HTML writer
html_writer = writer("html")
# Create Markdown writer
md_writer = writer("markdown")

Writer class for generating HTML output with full control over element creation and attributes.
class HtmlWriter:
"""HTML writer for generating HTML output."""
def text(self, text):
"""
Write text content with proper escaping.
Parameters:
- text: str, text content to write
"""
def start(self, name, attributes=None):
"""
Write opening HTML tag.
Parameters:
- name: str, HTML element name
- attributes: dict, HTML attributes (optional)
"""
def end(self, name):
"""
Write closing HTML tag.
Parameters:
- name: str, HTML element name
"""
def self_closing(self, name, attributes=None):
"""
Write self-closing HTML tag.
Parameters:
- name: str, HTML element name
- attributes: dict, HTML attributes (optional)
"""
def append(self, html):
"""
Append raw HTML content.
Parameters:
- html: str, HTML content to append
"""
def as_string(self):
"""
Get final HTML output.
Returns:
str, complete HTML content
"""

Usage example:
from mammoth.writers import writer
html_writer = writer("html")
# Create HTML structure
html_writer.start("div", {"class": "container"})
html_writer.start("h1")
html_writer.text("Hello World")
html_writer.end("h1")
html_writer.start("p")
html_writer.text("This is a paragraph.")
html_writer.end("p")
html_writer.self_closing("img", {"src": "image.jpg", "alt": "Image"})
html_writer.end("div")
# Get final HTML
output = html_writer.as_string()
print(output) # <div class="container"><h1>Hello World</h1><p>This is a paragraph.</p><img src="image.jpg" alt="Image" /></div>

Writer class for generating Markdown output with the same interface as HtmlWriter.
class MarkdownWriter:
"""Markdown writer for generating Markdown output."""
def text(self, text):
"""Write text content with Markdown escaping."""
def start(self, name, attributes=None):
"""Convert HTML opening tag to Markdown equivalent."""
def end(self, name):
"""Convert HTML closing tag to Markdown equivalent."""
def self_closing(self, name, attributes=None):
"""Convert self-closing HTML tag to Markdown equivalent."""
def append(self, html):
"""Append content, converting HTML to Markdown."""
def as_string(self):
"""Get final Markdown output."""

Usage example:
from mammoth.writers import writer
md_writer = writer("markdown")
# Create Markdown structure
md_writer.start("h1")
md_writer.text("Hello World")
md_writer.end("h1")
md_writer.start("p")
md_writer.text("This is a paragraph.")
md_writer.end("p")
# Get final Markdown
output = md_writer.as_string()
print(output) # # Hello World\n\nThis is a paragraph.

Abstract base class defining the writer interface.
class Writer:
"""Abstract base class for all writers."""
def text(self, text):
"""Write text content."""
raise NotImplementedError()
def start(self, name, attributes=None):
"""Write opening tag."""
raise NotImplementedError()
def end(self, name):
"""Write closing tag."""
raise NotImplementedError()
def self_closing(self, name, attributes=None):
"""Write self-closing tag."""
raise NotImplementedError()
def append(self, html):
"""Append raw content."""
raise NotImplementedError()
def as_string(self):
"""Get final output."""
raise NotImplementedError()

Low-level HTML generation utilities for creating HTML node trees.
def text(value):
"""
Create text node.
Parameters:
- value: str, text content
Returns:
TextNode instance
"""
def element(tag_names, attributes=None, children=None,
collapsible=None, separator=None):
"""
Create HTML element nodes.
Parameters:
- tag_names: str or list, HTML element name(s)
- attributes: dict, HTML attributes
- children: list, child nodes
- collapsible: bool, whether element can be collapsed
- separator: str, separator for multiple elements
Returns:
Element instance
"""
def collapsible_element(tag_names, attributes=None, children=None):
"""
Create collapsible HTML elements.
Parameters:
- tag_names: str or list, HTML element name(s)
- attributes: dict, HTML attributes
- children: list, child nodes
Returns:
Element instance that can be collapsed if empty
"""
def strip_empty(nodes):
"""
Remove empty elements from node tree.
Parameters:
- nodes: list, HTML nodes
Returns:
List of nodes with empty elements removed
"""
def collapse(nodes):
"""
Collapse adjacent matching elements.
Parameters:
- nodes: list, HTML nodes
Returns:
List of nodes with adjacent elements collapsed
"""
def write(writer, nodes):
"""
Write HTML nodes using a writer.
Parameters:
- writer: Writer instance
- nodes: list, HTML nodes to write
"""

Special constant for forcing elements to be written even if empty.
force_write = ForceWrite() # Forces element to be written even if empty

Usage example:
from mammoth.html import element, text, force_write, write
from mammoth.writers import writer
# Create HTML node tree
nodes = [
element("div", {"class": "container"}, [
element("h1", None, [text("Title")]),
element("p", None, [text("Content")]),
element("div", {"class": "empty"}, [], force_write) # Force empty div
])
]
# Write to HTML
html_writer = writer("html")
write(html_writer, nodes)
output = html_writer.as_string()

from mammoth.writers.abc import Writer
class XmlWriter(Writer):
    """Example custom writer that emits XML markup.

    Output is accumulated as a list of string fragments and joined once
    when ``as_string()`` is called.
    """

    def __init__(self):
        # Fragments in write order; joined lazily in as_string().
        self._content = []

    def text(self, text):
        """Write text content with XML special characters escaped.

        Parameters:
        - text: str, text content to write
        """
        self._content.append(self._escape(text))

    def start(self, name, attributes=None):
        """Write an opening tag.

        Parameters:
        - name: str, element name
        - attributes: dict, element attributes (optional)
        """
        self._content.append(self._open_tag(name, attributes) + ">")

    def end(self, name):
        """Write a closing tag.

        Parameters:
        - name: str, element name
        """
        self._content.append(f"</{name}>")

    def self_closing(self, name, attributes=None):
        """Write a self-closing tag.

        Parameters:
        - name: str, element name
        - attributes: dict, element attributes (optional)
        """
        self._content.append(self._open_tag(name, attributes) + "/>")

    def append(self, content):
        """Append raw content verbatim; no escaping is applied.

        Parameters:
        - content: str, already-serialised markup to append
        """
        self._content.append(content)

    def as_string(self):
        """Get final output.

        Returns:
        str, all fragments written so far, concatenated in order
        """
        return "".join(self._content)

    @staticmethod
    def _escape(value):
        """Escape ``&``, ``<`` and ``>`` for use in XML character data."""
        # "&" must be replaced first, otherwise the ampersands introduced
        # by the "<" and ">" replacements would be escaped a second time.
        return (
            value.replace("&", "&amp;")
            .replace("<", "&lt;")
            .replace(">", "&gt;")
        )

    @classmethod
    def _open_tag(cls, name, attributes):
        """Build ``<name key="value" ...`` without the closing bracket."""
        parts = [f"<{name}"]
        for key, value in (attributes or {}).items():
            # Values are wrapped in double quotes, so '"' has to be escaped
            # in addition to the character-data specials.
            escaped = cls._escape(str(value)).replace('"', "&quot;")
            parts.append(f' {key}="{escaped}"')
        return "".join(parts)
# Use custom writer
xml_writer = XmlWriter()
xml_writer.start("document")
xml_writer.start("title")
xml_writer.text("My Document")
xml_writer.end("title")
xml_writer.end("document")
print(xml_writer.as_string()) # <document><title>My Document</title></document>

Install with Tessl CLI
npx tessl i tessl/pypi-mammoth