0
# Document Processing
1
2
Low-level utilities for processing HTML documents and fragments, and for converting between HTML strings and Slate document structures. These functions provide the foundation for both serialization and deserialization operations.
3
4
## Capabilities
5
6
### Document Fragment Processing
7
8
Functions for handling complete HTML documents and converting them to Slate-compatible formats.
9
10
```typescript { .api }
11
/**
12
* Deserialize an HTML element or HTML string into a valid Slate document fragment
13
* @param editor - Plate editor instance
14
* @param options - Document fragment processing options
15
* @returns Array of Slate descendants representing the document
16
*/
17
function deserializeHTMLToDocumentFragment<T = {}>(
18
editor: PlateEditor<T>,
19
options: DocumentFragmentOptions<T>
20
): TDescendant[];
21
22
interface DocumentFragmentOptions<T> {
23
/** Array of Plate plugins for deserialization rules */
24
plugins: PlatePlugin<T>[];
25
/** HTML element or string to deserialize */
26
element: HTMLElement | string;
27
/** Whether to strip whitespace from the HTML (default: true) */
28
stripWhitespace?: boolean;
29
}
30
```
31
32
**Usage Examples:**
33
34
```typescript
35
import { deserializeHTMLToDocumentFragment } from "@udecode/plate-html-serializer";
36
37
// Process HTML string to document fragment
38
const htmlString = `
39
<div>
40
<h1>Title</h1>
41
<p>First paragraph with <strong>bold</strong> text.</p>
42
<ul>
43
<li>List item 1</li>
44
<li>List item 2</li>
45
</ul>
46
</div>
47
`;
48
49
const documentFragment = deserializeHTMLToDocumentFragment(editor, {
50
plugins: myPlugins,
51
element: htmlString,
52
stripWhitespace: true
53
});
54
55
// Process HTML element to document fragment
56
const htmlElement = document.createElement('article');
57
htmlElement.innerHTML = '<h2>Article Title</h2><p>Article content...</p>';
58
59
const articleFragment = deserializeHTMLToDocumentFragment(editor, {
60
plugins: myPlugins,
61
element: htmlElement,
62
stripWhitespace: false
63
});
64
65
// Complex document with mixed content
66
const complexHtml = `
67
<div>
68
<blockquote>
69
<p>This is a quote with <em>emphasis</em>.</p>
70
</blockquote>
71
<pre><code>console.log('code block');</code></pre>
72
<p>Regular paragraph after code.</p>
73
</div>
74
`;
75
76
const complexFragment = deserializeHTMLToDocumentFragment(editor, {
77
plugins: [
78
// Include plugins for blockquote, code, emphasis, etc.
79
blockquotePlugin,
80
codePlugin,
81
emphasisPlugin
82
],
83
element: complexHtml
84
});
85
```
86
87
### HTML String to DOM Conversion
88
89
Utility for converting HTML strings into DOM elements for further processing.
90
91
```typescript { .api }
92
/**
93
* Convert HTML string into HTML element
94
* @param rawHtml - HTML string to convert
95
* @param stripWhitespace - Whether to strip whitespace characters (default: true)
96
* @returns The `<body>` element containing the parsed content
97
*/
98
function htmlStringToDOMNode(
99
rawHtml: string,
100
stripWhitespace?: boolean
101
): HTMLElement;
102
```
103
104
**Usage Examples:**
105
106
```typescript
107
import { deserializeHTMLElement, htmlStringToDOMNode } from "@udecode/plate-html-serializer";
108
109
// Basic HTML string conversion
110
const htmlString = '<p>Hello <strong>world</strong>!</p>';
111
const domNode = htmlStringToDOMNode(htmlString);
112
113
// Access the parsed content
114
console.log(domNode.tagName); // "BODY"
115
console.log(domNode.innerHTML); // "<p>Hello <strong>world</strong>!</p>"
116
117
// Compare preserving vs. stripping whitespace formatting
118
const formattedHtml = `
119
<div>
120
<p>Line 1</p>
121
<p>Line 2</p>
122
</div>
123
`;
124
125
const preserveWhitespace = htmlStringToDOMNode(formattedHtml, false);
126
const stripWhitespace = htmlStringToDOMNode(formattedHtml, true);
127
128
// Use in deserialization pipeline
129
const htmlContent = '<div><h1>Title</h1><p>Content</p></div>';
130
const bodyElement = htmlStringToDOMNode(htmlContent);
131
132
const slateNodes = deserializeHTMLElement(editor, {
133
plugins: myPlugins,
134
element: bodyElement
135
});
136
```
137
138
## Document Processing Workflow
139
140
The document processing system follows this workflow:
141
142
1. **HTML Input**: Accepts either HTML strings or HTMLElement objects
143
2. **DOM Parsing**: Converts strings to DOM using `htmlStringToDOMNode`
144
3. **Whitespace Processing**: Optionally strips whitespace, controlled by the `stripWhitespace` option (default: `true`)
145
4. **Element Deserialization**: Processes DOM elements through `deserializeHTMLElement`
146
5. **Fragment Normalization**: Ensures valid Slate document structure using `normalizeDescendantsToDocumentFragment`
147
6. **Result**: Returns a properly structured `TDescendant` array
148
149
## Integration Examples
150
151
### Processing Pasted Content
152
153
```typescript
154
import {
155
deserializeHTMLToDocumentFragment,
156
htmlStringToDOMNode
157
} from "@udecode/plate-html-serializer";
158
159
// Handle paste events
160
const handlePaste = (event: ClipboardEvent) => {
161
const html = event.clipboardData?.getData('text/html');
162
163
if (html) {
164
// Convert HTML to Slate nodes
165
const slateFragment = deserializeHTMLToDocumentFragment(editor, {
166
plugins: editorPlugins,
167
element: html,
168
stripWhitespace: true
169
});
170
171
// Insert into editor
172
editor.insertFragment(slateFragment);
173
event.preventDefault();
174
}
175
};
176
```
177
178
### Processing Server Response
179
180
```typescript
181
// Process HTML from API response
182
const processServerHtml = async (htmlContent: string) => {
183
// Clean and convert to DOM
184
const domNode = htmlStringToDOMNode(htmlContent, true);
185
186
// Apply any preprocessing to the DOM if needed
187
preprocessDomNode(domNode);
188
189
// Convert to Slate format
190
const slateContent = deserializeHTMLToDocumentFragment(editor, {
191
plugins: serverContentPlugins,
192
element: domNode
193
});
194
195
return slateContent;
196
};
197
```
198
199
### Handling Complex Documents
200
201
```typescript
202
// Process complete HTML documents
203
const processFullDocument = (htmlDocument: string) => {
204
// Extract body content if it's a full HTML document
205
const doc = new DOMParser().parseFromString(htmlDocument, 'text/html');
206
const bodyContent = doc.body;
207
208
// Process with document fragment handler
209
const slateDocument = deserializeHTMLToDocumentFragment(editor, {
210
plugins: documentPlugins,
211
element: bodyContent,
212
stripWhitespace: true
213
});
214
215
return slateDocument;
216
};
217
```
218
219
## Error Handling
220
221
Document processing functions handle various edge cases:
222
223
- **Empty Content**: Returns an empty array for empty or whitespace-only content
224
- **Invalid HTML**: Gracefully handles malformed HTML using browser parsing
225
- **Unsupported Elements**: Elements without matching plugins are wrapped in div tags
226
- **Text Nodes**: Plain text content is converted into Slate text nodes
227
- **Nested Structures**: Complex nested HTML is recursively processed
228
229
```typescript
230
// Robust document processing with error handling
231
const safeProcessDocument = (html: string) => {
232
try {
233
const fragment = deserializeHTMLToDocumentFragment(editor, {
234
plugins: myPlugins,
235
element: html || '<p></p>', // Fallback for empty content
236
stripWhitespace: true
237
});
238
239
// Ensure we have at least one node
240
return fragment.length > 0 ? fragment : [{ type: 'p', children: [{ text: '' }] }];
241
} catch (error) {
242
console.error('Document processing failed:', error);
243
return [{ type: 'p', children: [{ text: '' }] }]; // Safe fallback
244
}
245
};
246
```