or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

error-handling.md, html-utilities.md, index.md, parsing.md, serialization.md, tokenization.md, tree-adapters.md

docs/index.md

0

# Parse5

1

2

Parse5 is an HTML parser and serializer that provides fast, standards-compliant HTML parsing for Node.js applications. It implements the WHATWG HTML Living Standard and handles malformed HTML gracefully, making it suitable for server-side HTML processing, DOM manipulation, and web scraping applications.

3

4

## Package Information

5

6

- **Package Name**: parse5

7

- **Package Type**: npm

8

- **Language**: TypeScript

9

- **Installation**: `npm install parse5`

10

11

## Core Imports

12

13

```typescript

14

// Main parsing and serialization functions

15

import { parse, parseFragment, serialize, serializeOuter } from "parse5";

16

17

// Tree adapters and types

18

import { defaultTreeAdapter, DefaultTreeAdapterTypes } from "parse5";

19

import type { TreeAdapter, TreeAdapterTypeMap, DefaultTreeAdapterMap } from "parse5";

20

21

// Error handling

22

import { ErrorCodes, type ParserError, type ParserErrorHandler } from "parse5";

23

24

// Options interfaces

25

import type { ParserOptions, SerializerOptions } from "parse5";

26

27

// HTML constants and utilities

28

import { html, Token } from "parse5";

29

30

// Advanced tokenization (internal APIs)

31

import { Tokenizer, TokenizerMode, type TokenizerOptions, type TokenHandler } from "parse5";

32

```

33

34

For CommonJS:

35

36

```javascript

37

const {

38

parse, parseFragment, serialize, serializeOuter,

39

defaultTreeAdapter, DefaultTreeAdapterTypes,

40

ErrorCodes, html, Token,

41

Tokenizer, TokenizerMode

42

} = require("parse5");

43

```

44

45

## Basic Usage

46

47

```typescript

48

import { parse, parseFragment, serialize } from "parse5";

49

50

// Parse a complete HTML document

51

const document = parse('<!DOCTYPE html><html><head></head><body>Hello World!</body></html>');

52

53

// Parse HTML fragment

54

const fragment = parseFragment('<div><span>Content</span></div>');

55

56

// Serialize back to HTML string

57

const htmlString = serialize(document);

58

console.log(htmlString);

59

```

60

61

## Architecture

62

63

Parse5 is built around several core components:

64

65

- **Parser**: Core HTML parsing engine that converts HTML strings to AST nodes

66

- **Serializer**: Converts AST nodes back to HTML strings

67

- **Tree Adapters**: Pluggable adapters that define the structure of AST nodes

68

- **Tokenizer**: Low-level HTML tokenization engine

69

- **Location Tracking**: Optional source code location information for debugging

70

- **Error Handling**: Comprehensive parsing error detection and reporting

71

72

## Capabilities

73

74

### HTML Parsing

75

76

Core HTML parsing functionality that converts HTML strings into abstract syntax trees. Supports both complete documents and fragments with optional context.

77

78

```typescript { .api }

79

function parse<T extends TreeAdapterTypeMap = DefaultTreeAdapterMap>(

80

html: string,

81

options?: ParserOptions<T>

82

): T['document'];

83

84

function parseFragment<T extends TreeAdapterTypeMap = DefaultTreeAdapterMap>(

85

fragmentContext: T['parentNode'] | null,

86

html: string,

87

options: ParserOptions<T>

88

): T['documentFragment'];

89

90

function parseFragment<T extends TreeAdapterTypeMap = DefaultTreeAdapterMap>(

91

html: string,

92

options?: ParserOptions<T>

93

): T['documentFragment'];

94

```

95

96

[HTML Parsing](./parsing.md)

97

98

### HTML Serialization

99

100

Serialization functionality for converting parsed AST nodes back to HTML strings. Supports both inner content and complete element serialization.

101

102

```typescript { .api }

103

function serialize<T extends TreeAdapterTypeMap = DefaultTreeAdapterMap>(

104

node: T['parentNode'],

105

options?: SerializerOptions<T>

106

): string;

107

108

function serializeOuter<T extends TreeAdapterTypeMap = DefaultTreeAdapterMap>(

109

node: T['node'],

110

options?: SerializerOptions<T>

111

): string;

112

```

113

114

[HTML Serialization](./serialization.md)

115

116

### Tree Adapters

117

118

Pluggable tree adapter system that defines the structure and manipulation of AST nodes. Allows customization of how parsed HTML is represented in memory.

119

120

```typescript { .api }

121

interface TreeAdapter<T extends TreeAdapterTypeMap = TreeAdapterTypeMap> {

122

// Node creation methods

123

createDocument(): T['document'];

124

createDocumentFragment(): T['documentFragment'];

125

createElement(tagName: string, namespaceURI: string, attrs: Attribute[]): T['element'];

126

createCommentNode(data: string): T['commentNode'];

127

createTextNode(value: string): T['textNode'];

128

129

// Node manipulation methods

130

appendChild(parentNode: T['parentNode'], newNode: T['childNode']): void;

131

insertBefore(parentNode: T['parentNode'], newNode: T['childNode'], referenceNode: T['childNode']): void;

132

detachNode(node: T['childNode']): void;

133

134

// Node inspection and type guards

135

isElementNode(node: T['node']): node is T['element'];

136

isTextNode(node: T['node']): node is T['textNode'];

137

isCommentNode(node: T['node']): node is T['commentNode'];

138

isDocumentTypeNode(node: T['node']): node is T['documentType'];

139

}

140

141

const defaultTreeAdapter: TreeAdapter<DefaultTreeAdapterMap>;

142

```

143

144

[Tree Adapters](./tree-adapters.md)

145

146

### Error Handling

147

148

Comprehensive error handling system that provides detailed parsing error information with source code locations and error codes.

149

150

```typescript { .api }

151

interface ParserError {

152

code: string;

153

startLine: number;

154

startCol: number;

155

startOffset: number;

156

endLine: number;

157

endCol: number;

158

endOffset: number;

159

}

160

161

type ParserErrorHandler = (error: ParserError) => void;

162

163

enum ErrorCodes {

164

controlCharacterInInputStream = 'control-character-in-input-stream',

165

noncharacterInInputStream = 'noncharacter-in-input-stream',

166

unexpectedNullCharacter = 'unexpected-null-character',

167

unexpectedQuestionMarkInsteadOfTagName = 'unexpected-question-mark-instead-of-tag-name',

168

// ... many more error codes

169

}

170

```

171

172

[Error Handling](./error-handling.md)

173

174

### HTML Tokenization

175

176

Low-level HTML tokenization functionality for advanced use cases that require direct access to the tokenization process. Provides tokenizer class, token types, and token handlers.

177

178

```typescript { .api }

179

class Tokenizer {

180

constructor(options: TokenizerOptions, handler: TokenHandler);

181

write(chunk: string, isLastChunk: boolean): void;

182

insertHtmlAtCurrentPos(chunk: string): void;

183

}

184

185

interface TokenizerOptions {

186

sourceCodeLocationInfo?: boolean;

187

}

188

189

interface TokenHandler {

190

onComment(token: CommentToken): void;

191

onDoctype(token: DoctypeToken): void;

192

onStartTag(token: TagToken): void;

193

onEndTag(token: TagToken): void;

194

onEof(token: EOFToken): void;

195

onCharacter(token: CharacterToken): void;

196

onNullCharacter(token: CharacterToken): void;

197

onWhitespaceCharacter(token: CharacterToken): void;

198

}

199

200

const TokenizerMode: {

201

readonly DATA: State.DATA;

202

readonly RCDATA: State.RCDATA;

203

readonly RAWTEXT: State.RAWTEXT;

204

readonly SCRIPT_DATA: State.SCRIPT_DATA;

205

readonly PLAINTEXT: State.PLAINTEXT;

206

readonly CDATA_SECTION: State.CDATA_SECTION;

207

};

208

```

209

210

[HTML Tokenization](./tokenization.md)

211

212

### HTML Constants and Utilities

213

214

HTML specification constants, enumerations, and utility functions providing access to standardized HTML element names, namespace URIs, document modes, and other HTML5 specification details.

215

216

```typescript { .api }

217

namespace html {

218

enum NS {

219

HTML = 'http://www.w3.org/1999/xhtml',

220

MATHML = 'http://www.w3.org/1998/Math/MathML',

221

SVG = 'http://www.w3.org/2000/svg',

222

XLINK = 'http://www.w3.org/1999/xlink',

223

XML = 'http://www.w3.org/XML/1998/namespace',

224

XMLNS = 'http://www.w3.org/2000/xmlns/'

225

}

226

227

enum TAG_NAMES {

228

A = 'a',

229

DIV = 'div',

230

SPAN = 'span',

231

P = 'p',

232

// ... 100+ tag names

233

}

234

235

enum TAG_ID {

236

UNKNOWN = 0,

237

A = 1,

238

DIV = 27,

239

SPAN = 100,

240

// ... corresponding IDs

241

}

242

243

enum DOCUMENT_MODE {

244

NO_QUIRKS = 'no-quirks',

245

QUIRKS = 'quirks',

246

LIMITED_QUIRKS = 'limited-quirks'

247

}

248

249

enum ATTRS {

250

CLASS = 'class',

251

ID = 'id',

252

SRC = 'src',

253

HREF = 'href',

254

// ... common attributes

255

}

256

257

function getTagID(tagName: string): TAG_ID;

258

function hasUnescapedText(tagName: string, scriptingEnabled: boolean): boolean;

259

}

260

```

261

262

[HTML Constants and Utilities](./html-utilities.md)

263

264

## Core Types

265

266

```typescript { .api }

267

interface DefaultTreeAdapterMap extends TreeAdapterTypeMap<

268

Node,

269

ParentNode,

270

ChildNode,

271

Document,

272

DocumentFragment,

273

Element,

274

CommentNode,

275

TextNode,

276

Template,

277

DocumentType

278

> {}

279

280

interface ParserOptions<T extends TreeAdapterTypeMap> {

281

/** The scripting flag. When true, the content of `<noscript>` elements is parsed as text. Defaults to true */

282

scriptingEnabled?: boolean;

283

/** Enables source code location tracking. Defaults to false */

284

sourceCodeLocationInfo?: boolean;

285

/** Custom tree adapter for AST node structure */

286

treeAdapter?: TreeAdapter<T>;

287

/** Error handling callback */

288

onParseError?: ParserErrorHandler;

289

}

290

291

interface SerializerOptions<T extends TreeAdapterTypeMap> {

292

/** Custom tree adapter for AST node structure */

293

treeAdapter?: TreeAdapter<T>;

294

/** The scripting flag. When true, the content of `<noscript>` elements is not escaped on serialization. Defaults to true */

295

scriptingEnabled?: boolean;

296

}

297

```