Convert Word documents from docx to simple HTML and Markdown
npx @tessl/cli install tessl/npm-mammoth@1.10.0

Mammoth is designed to convert .docx documents, such as those created by Microsoft Word, Google Docs and LibreOffice, to HTML and Markdown formats. It focuses on semantic markup preservation rather than visual formatting, converting document styles (like Heading 1) to appropriate HTML elements (like h1 tags) while ignoring font styling details.
npm install mammoth

const mammoth = require("mammoth");

TypeScript:
import mammoth = require("mammoth");
// or
const mammoth = require("mammoth");

Browser (standalone):
// Include mammoth.browser.js or mammoth.browser.min.js
const mammoth = window.mammoth;

const mammoth = require("mammoth");
// Convert DOCX to HTML
mammoth.convertToHtml({path: "document.docx"})
.then(function(result){
const html = result.value; // The generated HTML
const messages = result.messages; // Any messages, such as warnings
})
.catch(function(error) {
// Report a failed conversion instead of leaving the rejection unhandled.
console.error(error);
});
// Extract raw text
mammoth.extractRawText({path: "document.docx"})
.then(function(result){
const text = result.value; // The raw text
const messages = result.messages;
});

Mammoth also provides a command-line interface:
# Convert DOCX to HTML
mammoth document.docx output.html
# Convert with style map
mammoth document.docx output.html --style-map=custom-style-map
# Convert to Markdown (deprecated)
mammoth document.docx --output-format=markdown
# Extract images to directory
mammoth document.docx --output-dir=output-dir

Mammoth is built around several key components:
Core functionality for converting DOCX documents to HTML and Markdown formats, with support for custom style mappings and conversion options.
/** Converts the DOCX document identified by `input` to HTML; the promise resolves to a Result whose `value` is the generated HTML. */
function convertToHtml(input: Input, options?: Options): Promise<Result>;
/**
 * Converts the DOCX document identified by `input` to Markdown.
 * NOTE(review): the command-line section labels Markdown output as deprecated — confirm whether that applies to this function too.
 */
function convertToMarkdown(input: Input, options?: Options): Promise<Result>;
function extractRawText(input: Input): Promise<Result>;

Image conversion utilities for customizing how images in DOCX documents are processed and included in the output.
const images: {
dataUri: ImageConverter;
imgElement: (func: (image: Image) => Promise<ImageAttributes>) => ImageConverter;
};

Document transformation utilities for modifying document elements before conversion, enabling custom preprocessing of document structure.
const transforms: {
paragraph: (transform: (element: any) => any) => (element: any) => any;
run: (transform: (element: any) => any) => (element: any) => any;
getDescendants: (element: any) => any[];
getDescendantsOfType: (element: any, type: string) => any[];
};

Utilities for handling underline and other styling elements in document conversion.
const underline: {
element: (name: string) => (html: any) => any;
};

Functions for embedding and reading custom style maps in DOCX documents.
function embedStyleMap(input: Input, styleMap: string): Promise<{
toArrayBuffer: () => ArrayBuffer;
toBuffer: () => Buffer;
}>;
function readEmbeddedStyleMap(input: Input): Promise<string>;

type Input = PathInput | BufferInput | ArrayBufferInput;
/** Input variant that identifies the document by path, as in `{path: "document.docx"}`. */
interface PathInput {
// Path to the .docx document to read.
path: string;
}
/** Input variant that supplies the document contents as a `Buffer`. */
interface BufferInput {
// Document bytes. NOTE(review): presumably the Node.js Buffer type — confirm.
buffer: Buffer;
}
/** Input variant that supplies the document contents as an `ArrayBuffer`. */
interface ArrayBufferInput {
// Document bytes. NOTE(review): presumably the form intended for browser use — confirm.
arrayBuffer: ArrayBuffer;
}
/**
 * Optional settings accepted by convertToHtml and convertToMarkdown.
 * Every field is optional; default values are not stated in this document.
 */
interface Options {
// Custom style map, given either as one string or as an array of strings.
styleMap?: string | string[];
// NOTE(review): presumably controls whether a style map embedded in the document (see embedStyleMap) is used — confirm.
includeEmbeddedStyleMap?: boolean;
// NOTE(review): presumably controls whether the built-in default style map is used alongside `styleMap` — confirm.
includeDefaultStyleMap?: boolean;
// Image converter, i.e. images.dataUri or a value returned by images.imgElement(...).
convertImage?: ImageConverter;
// NOTE(review): presumably controls whether empty paragraphs are omitted from the output — confirm.
ignoreEmptyParagraphs?: boolean;
// NOTE(review): presumably a prefix applied to ids generated in the output — confirm.
idPrefix?: string;
// Callback that takes a document element and returns an element; transforms.paragraph and transforms.run return functions of this shape.
transformDocument?: (element: any) => any;
}
/** Value the promises returned by convertToHtml, convertToMarkdown and extractRawText resolve to. */
interface Result {
// The generated output, e.g. the HTML from convertToHtml or the raw text from extractRawText.
value: string;
// Any messages, such as warnings, produced while processing the document.
messages: Message[];
}
/** A message reported in Result.messages; check the `type` field ("warning" or "error") to tell the two variants apart. */
type Message = Warning | Error;
/** Message variant tagged `type: "warning"`. */
interface Warning {
// Literal tag that distinguishes this variant from Error within Message.
type: "warning";
// Human-readable text of the warning.
message: string;
}
/**
 * Message variant tagged `type: "error"`.
 * Note that this interface shares its name with JavaScript's built-in `Error` type.
 */
interface Error {
// Literal tag that distinguishes this variant from Warning within Message.
type: "error";
// Human-readable text of the error.
message: string;
// The underlying error value; typed `unknown`, so it must be narrowed before use.
error: unknown;
}
/** An image from the document, as passed to the callback given to images.imgElement. */
interface Image {
// Content type of the image. NOTE(review): presumably a MIME type such as "image/png" — confirm.
contentType: string;
// Reads the image data, resolving to an ArrayBuffer.
readAsArrayBuffer(): Promise<ArrayBuffer>;
// Reads the image data, resolving to a base64-encoded string.
readAsBase64String(): Promise<string>;
// Reads the image data, resolving to a Buffer.
readAsBuffer(): Promise<Buffer>;
// Overloaded read: with no argument it resolves to a Buffer; with an encoding it resolves to a string.
read(): Promise<Buffer>;
read(encoding: string): Promise<string>;
}
/**
 * Type of the value accepted by Options.convertImage.
 * Its only declared member is a literal brand; values of this type are provided by images.dataUri and images.imgElement(...).
 */
interface ImageConverter {
// Brand marker. NOTE(review): presumably exists only to make the type nominal, not to be read at runtime — confirm.
__mammothBrand: "ImageConverter";
}
/**
 * Attributes that the callback passed to images.imgElement resolves to.
 * NOTE(review): presumably these become the attributes of the generated <img> element — confirm.
 */
interface ImageAttributes {
// Required image source.
src: string;
// Any additional attributes, keyed by name; all values must be strings.
[key: string]: string;
}