or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

deserialization.md · document-processing.md · index.md · serialization.md

docs/document-processing.md

0

# Document Processing

1

2

Low-level utilities for processing HTML documents and fragments, and for converting between HTML strings and Slate document structures. These functions provide the foundation for both serialization and deserialization operations.

3

4

## Capabilities

5

6

### Document Fragment Processing

7

8

Functions for handling complete HTML documents and converting them to Slate-compatible formats.

9

10

```typescript { .api }

11

/**

12

* Deserialize an HTML element or HTML string into a valid document fragment

13

* @param editor - Plate editor instance

14

* @param options - Document fragment processing options

15

* @returns Array of Slate descendants representing the document

16

*/

17

function deserializeHTMLToDocumentFragment<T = {}>(

18

editor: PlateEditor<T>,

19

options: DocumentFragmentOptions<T>

20

): TDescendant[];

21

22

interface DocumentFragmentOptions<T> {

23

/** Array of Plate plugins for deserialization rules */

24

plugins: PlatePlugin<T>[];

25

/** HTML element or string to deserialize */

26

element: HTMLElement | string;

27

/** Whether to strip whitespace from the HTML (default: true) */

28

stripWhitespace?: boolean;

29

}

30

```

31

32

**Usage Examples:**

33

34

```typescript

35

import { deserializeHTMLToDocumentFragment } from "@udecode/plate-html-serializer";

36

37

// Process HTML string to document fragment

38

const htmlString = `

39

<div>

40

<h1>Title</h1>

41

<p>First paragraph with <strong>bold</strong> text.</p>

42

<ul>

43

<li>List item 1</li>

44

<li>List item 2</li>

45

</ul>

46

</div>

47

`;

48

49

const documentFragment = deserializeHTMLToDocumentFragment(editor, {

50

plugins: myPlugins,

51

element: htmlString,

52

stripWhitespace: true

53

});

54

55

// Process HTML element to document fragment

56

const htmlElement = document.createElement('article');

57

htmlElement.innerHTML = '<h2>Article Title</h2><p>Article content...</p>';

58

59

const articleFragment = deserializeHTMLToDocumentFragment(editor, {

60

plugins: myPlugins,

61

element: htmlElement,

62

stripWhitespace: false

63

});

64

65

// Complex document with mixed content

66

const complexHtml = `

67

<div>

68

<blockquote>

69

<p>This is a quote with <em>emphasis</em>.</p>

70

</blockquote>

71

<pre><code>console.log('code block');</code></pre>

72

<p>Regular paragraph after code.</p>

73

</div>

74

`;

75

76

const complexFragment = deserializeHTMLToDocumentFragment(editor, {

77

plugins: [

78

// Include plugins for blockquote, code, emphasis, etc.

79

blockquotePlugin,

80

codePlugin,

81

emphasisPlugin

82

],

83

element: complexHtml

84

});

85

```

86

87

### HTML String to DOM Conversion

88

89

Utility for converting HTML strings into DOM elements for further processing.

90

91

```typescript { .api }

92

/**

93

* Convert HTML string into HTML element

94

* @param rawHtml - HTML string to convert

95

* @param stripWhitespace - Whether to strip whitespace characters (default: true)

96

* @returns The `body` HTMLElement containing the parsed content

97

*/

98

function htmlStringToDOMNode(

99

rawHtml: string,

100

stripWhitespace?: boolean

101

): HTMLElement;

102

```

103

104

**Usage Examples:**

105

106

```typescript

107

import { htmlStringToDOMNode } from "@udecode/plate-html-serializer";

108

109

// Basic HTML string conversion

110

const htmlString = '<p>Hello <strong>world</strong>!</p>';

111

const domNode = htmlStringToDOMNode(htmlString);

112

113

// Access the parsed content

114

console.log(domNode.tagName); // "BODY"

115

console.log(domNode.innerHTML); // "<p>Hello <strong>world</strong>!</p>"

116

117

// Preserve whitespace formatting

118

const formattedHtml = `

119

<div>

120

<p>Line 1</p>

121

<p>Line 2</p>

122

</div>

123

`;

124

125

const preserveWhitespace = htmlStringToDOMNode(formattedHtml, false);

126

const stripWhitespace = htmlStringToDOMNode(formattedHtml, true);

127

128

// Use in deserialization pipeline

129

const htmlContent = '<div><h1>Title</h1><p>Content</p></div>';

130

const bodyElement = htmlStringToDOMNode(htmlContent);

131

132

const slateNodes = deserializeHTMLElement(editor, {

133

plugins: myPlugins,

134

element: bodyElement

135

});

136

```

137

138

## Document Processing Workflow

139

140

The document processing system follows this workflow:

141

142

1. **HTML Input**: Accepts either HTML strings or HTMLElement objects

143

2. **DOM Parsing**: Converts strings to DOM using `htmlStringToDOMNode`

144

3. **Whitespace Processing**: Optionally strips whitespace, controlled by the `stripWhitespace` option (default: `true`)

145

4. **Element Deserialization**: Processes DOM elements through `deserializeHTMLElement`

146

5. **Fragment Normalization**: Ensures valid Slate document structure using `normalizeDescendantsToDocumentFragment`

147

6. **Result**: Returns a properly structured `TDescendant` array

148

149

## Integration Examples

150

151

### Processing Pasted Content

152

153

```typescript

154

import {

155

deserializeHTMLToDocumentFragment,

156

htmlStringToDOMNode

157

} from "@udecode/plate-html-serializer";

158

159

// Handle paste events

160

const handlePaste = (event: ClipboardEvent) => {

161

const html = event.clipboardData?.getData('text/html');

162

163

if (html) {

164

// Convert HTML to Slate nodes

165

const slateFragment = deserializeHTMLToDocumentFragment(editor, {

166

plugins: editorPlugins,

167

element: html,

168

stripWhitespace: true

169

});

170

171

// Insert into editor

172

editor.insertFragment(slateFragment);

173

event.preventDefault();

174

}

175

};

176

```

177

178

### Processing Server Response

179

180

```typescript

181

// Process HTML from API response

182

const processServerHtml = async (htmlContent: string) => {

183

// Clean and convert to DOM

184

const domNode = htmlStringToDOMNode(htmlContent, true);

185

186

// Apply any preprocessing to the DOM if needed

187

preprocessDomNode(domNode);

188

189

// Convert to Slate format

190

const slateContent = deserializeHTMLToDocumentFragment(editor, {

191

plugins: serverContentPlugins,

192

element: domNode

193

});

194

195

return slateContent;

196

};

197

```

198

199

### Handling Complex Documents

200

201

```typescript

202

// Process complete HTML documents

203

const processFullDocument = (htmlDocument: string) => {

204

// Extract body content if it's a full HTML document

205

const doc = new DOMParser().parseFromString(htmlDocument, 'text/html');

206

const bodyContent = doc.body;

207

208

// Process with document fragment handler

209

const slateDocument = deserializeHTMLToDocumentFragment(editor, {

210

plugins: documentPlugins,

211

element: bodyContent,

212

stripWhitespace: true

213

});

214

215

return slateDocument;

216

};

217

```

218

219

## Error Handling

220

221

Document processing functions handle various edge cases:

222

223

- **Empty Content**: Returns an empty array for empty or whitespace-only content

224

- **Invalid HTML**: Gracefully handles malformed HTML using browser parsing

225

- **Unsupported Elements**: Elements without matching plugins are wrapped in div tags

226

- **Text Nodes**: Pure text content is properly wrapped in text nodes

227

- **Nested Structures**: Complex nested HTML is recursively processed

228

229

```typescript

230

// Robust document processing with error handling

231

const safeProcessDocument = (html: string) => {

232

try {

233

const fragment = deserializeHTMLToDocumentFragment(editor, {

234

plugins: myPlugins,

235

element: html || '<p></p>', // Fallback for empty content

236

stripWhitespace: true

237

});

238

239

// Ensure we have at least one node

240

return fragment.length > 0 ? fragment : [{ type: 'p', children: [{ text: '' }] }];

241

} catch (error) {

242

console.error('Document processing failed:', error);

243

return [{ type: 'p', children: [{ text: '' }] }]; // Safe fallback

244

}

245

};

246

```