or run

npx @tessl/cli init

HTML Parsing and Tokenization

Package html implements an HTML5-compliant tokenizer and parser.

Import

import "golang.org/x/net/html"

Overview

This package provides both a tokenizer and a parser:

Tokenizer: Implements the tokenization stage of the WHATWG HTML parsing specification
Parser: Implements both tokenization and tree construction stages

If your use case requires semantically well-formed HTML, as defined by the WHATWG specification, use the parser rather than the tokenizer. In security contexts, if trust decisions are being made using tokenized or parsed content, the input must be re-serialized (for instance with Render or Token.String) for those trust decisions to hold, because tokenizing or parsing may alter the content.

Variables

var ErrBufferExceeded = errors.New("max buffer exceeded")

Parsing

// Parse returns the parse tree for the HTML from the given Reader
func Parse(r io.Reader) (*Node, error)

// ParseWithOptions is like Parse, with options
func ParseWithOptions(r io.Reader, opts ...ParseOption) (*Node, error)

// ParseFragment parses a fragment of HTML and returns the nodes that were found
func ParseFragment(r io.Reader, context *Node) ([]*Node, error)

// ParseFragmentWithOptions is like ParseFragment, with options
func ParseFragmentWithOptions(r io.Reader, context *Node, opts ...ParseOption) ([]*Node, error)

// ParseOption configures a parser
type ParseOption func(p *parser)

// ParseOptionEnableScripting configures the scripting flag (default: true)
func ParseOptionEnableScripting(enable bool) ParseOption

Parse implements the HTML5 parsing algorithm. The resultant tree can contain implicitly created nodes, and nodes' parents can differ from the nesting implied by a naive processing of start and end tags. Parse will reject HTML that is nested deeper than 512 elements. The input is assumed to be UTF-8 encoded.

Rendering

// Render renders the parse tree n to the given writer
func Render(w io.Writer, n *Node) error

// EscapeString escapes special characters like "<" to become "&lt;"
func EscapeString(s string) string

// UnescapeString unescapes entities like "&lt;" to become "<"
func UnescapeString(s string) string

Rendering is done on a 'best effort' basis. EscapeString escapes only five characters: <, >, &, ' and ". UnescapeString unescapes a larger range of entities, so UnescapeString(EscapeString(s)) == s always holds, but the converse does not.

Node

// Node consists of a NodeType and some Data. The pointer fields link the
// node into a tree, and Attr carries the node's attributes.
type Node struct {
    // Links to the neighbouring nodes in the tree. The Ancestors iterator
    // starts with Parent.
    Parent      *Node
    FirstChild  *Node
    LastChild   *Node
    PrevSibling *Node
    NextSibling *Node

    // Type identifies the kind of node (see NodeType).
    Type      NodeType
    // NOTE(review): presumably the atom.Atom counterpart of Data — confirm
    // against the package documentation.
    DataAtom  atom.Atom
    // Data is the node's payload. The usage examples compare it with a tag
    // name ("a", "title") for an ElementNode and assign the text content
    // through it for a TextNode.
    Data      string
    Namespace string
    // Attr holds the node's attributes; the usage examples scan it for
    // the "href" key on anchor elements.
    Attr      []Attribute
}

// NodeType is the type of a Node
type NodeType uint32

// The NodeType values are numbered consecutively by iota, so ErrorNode is the
// zero value of NodeType.
const (
    ErrorNode NodeType = iota
    TextNode
    DocumentNode
    ElementNode
    CommentNode
    DoctypeNode
    RawNode // For inserting raw HTML without escaping
)

Node Methods

// AppendChild adds a node c as a child of n
func (n *Node) AppendChild(c *Node)

// InsertBefore inserts newChild as a child of n, immediately before oldChild
func (n *Node) InsertBefore(newChild, oldChild *Node)

// RemoveChild removes a node c that is a child of n
func (n *Node) RemoveChild(c *Node)

// Ancestors returns an iterator over the ancestors of n, starting with n.Parent
func (n *Node) Ancestors() iter.Seq[*Node]

// ChildNodes returns an iterator over the immediate children of n
func (n *Node) ChildNodes() iter.Seq[*Node]

// Descendants returns an iterator over all nodes recursively beneath n
func (n *Node) Descendants() iter.Seq[*Node]

Attribute

// Attribute is an attribute namespace-key-value triple
type Attribute struct {
    // Namespace is only used by the parser, not the tokenizer.
    Namespace string
    // Key is the attribute name, for example "href".
    Key       string
    // Val is the attribute value in unescaped form.
    Val       string
}

Val is unescaped (it looks like "a<b" rather than "a&lt;b"). Namespace is only used by the parser, not the tokenizer.

Tokenizer

// Tokenizer returns a stream of HTML Tokens. Create one with NewTokenizer or
// NewTokenizerFragment, then call Next repeatedly to scan the input token by
// token.
type Tokenizer struct {
    // Has unexported fields
}

// NewTokenizer returns a new HTML Tokenizer for the given Reader
func NewTokenizer(r io.Reader) *Tokenizer

// NewTokenizerFragment returns a new HTML Tokenizer for tokenizing an existing element's InnerHTML fragment
func NewTokenizerFragment(r io.Reader, contextTag string) *Tokenizer

Tokenizer Methods

// Next scans the next token and returns its type
func (z *Tokenizer) Next() TokenType

// Token returns the current Token
func (z *Tokenizer) Token() Token

// Text returns the unescaped text of a text, comment or doctype token
func (z *Tokenizer) Text() []byte

// TagName returns the lower-cased name of a tag token and whether the tag has attributes
func (z *Tokenizer) TagName() (name []byte, hasAttr bool)

// TagAttr returns the lower-cased key and unescaped value of the next unparsed attribute
func (z *Tokenizer) TagAttr() (key, val []byte, moreAttr bool)

// Raw returns the unmodified text of the current token
func (z *Tokenizer) Raw() []byte

// Err returns the error associated with the most recent ErrorToken token
func (z *Tokenizer) Err() error

// Buffered returns a slice containing data buffered but not yet tokenized
func (z *Tokenizer) Buffered() []byte

// SetMaxBuf sets a limit on the amount of data buffered during tokenization
func (z *Tokenizer) SetMaxBuf(n int)

// AllowCDATA sets whether or not the tokenizer recognizes <![CDATA[foo]]>
func (z *Tokenizer) AllowCDATA(allowCDATA bool)

// NextIsNotRawText instructs the tokenizer that the next token should not be considered as 'raw text'
func (z *Tokenizer) NextIsNotRawText()

Token

// Token consists of a TokenType and some Data. Tokenizer.Token returns the
// current Token.
type Token struct {
    // Type identifies the kind of token (see TokenType).
    Type     TokenType
    // NOTE(review): presumably the atom.Atom counterpart of Data — confirm
    // against the package documentation.
    DataAtom atom.Atom
    Data     string
    // Attr holds the token's attributes.
    // NOTE(review): presumably populated only for tag tokens — confirm.
    Attr     []Attribute
}

func (t Token) String() string

// TokenType is the type of a Token
type TokenType uint32

// The TokenType values are numbered consecutively by iota, so ErrorToken is
// the zero value. After Tokenizer.Next returns ErrorToken, Tokenizer.Err
// reports the associated error; io.EOF means the input ended normally.
const (
    ErrorToken          TokenType = iota // Error occurred during tokenization
    TextToken                            // Text node
    StartTagToken                        // <a>
    EndTagToken                          // </a>
    SelfClosingTagToken                  // <br/>
    CommentToken                         // <!--x-->
    DoctypeToken                         // <!DOCTYPE x>
)

func (t TokenType) String() string

Usage Examples

Parsing HTML

import (
    "fmt"
    "golang.org/x/net/html"
    "strings"
)

// parseHTML parses htmlContent into a node tree and prints the href value of
// every anchor element found in it. It returns the error reported by
// html.Parse, or nil when parsing succeeds.
func parseHTML(htmlContent string) error {
    root, err := html.Parse(strings.NewReader(htmlContent))
    if err != nil {
        return err
    }

    // Visit every node beneath the root in depth-first order, skipping
    // anything that is not an <a> element.
    for node := range root.Descendants() {
        if node.Type != html.ElementNode || node.Data != "a" {
            continue
        }
        for i := range node.Attr {
            if a := node.Attr[i]; a.Key == "href" {
                fmt.Println("Link:", a.Val)
            }
        }
    }

    return nil
}

Tokenizing HTML (High-Level API)

// tokenizeHighLevel prints every token of htmlContent using the Token-based
// API. It always returns the tokenizer's terminal error; io.EOF means the
// whole input was tokenized successfully.
func tokenizeHighLevel(htmlContent string) error {
    tokenizer := html.NewTokenizer(strings.NewReader(htmlContent))

    // Next yields ErrorToken exactly when scanning stops, either at the end
    // of the input or on a real failure.
    for tokenizer.Next() != html.ErrorToken {
        fmt.Printf("Token: %v\n", tokenizer.Token())
    }

    // Returning io.EOF indicates success
    return tokenizer.Err()
}

Tokenizing HTML (Low-Level API)

// extractAnchorText prints the text found inside <a> elements of htmlContent
// using the low-level tokenizer API. It returns the tokenizer's terminal
// error; io.EOF means the whole input was tokenized successfully.
func extractAnchorText(htmlContent string) error {
    z := html.NewTokenizer(strings.NewReader(htmlContent))
    depth := 0 // number of <a> elements currently open

    for {
        tt := z.Next()
        switch tt {
        case html.ErrorToken:
            return z.Err()

        case html.TextToken:
            if depth > 0 {
                fmt.Printf("Anchor text: %s\n", z.Text())
            }

        case html.StartTagToken, html.EndTagToken:
            tn, _ := z.TagName()
            if len(tn) != 1 || tn[0] != 'a' {
                continue
            }
            if tt == html.StartTagToken {
                depth++
            } else if depth > 0 {
                // Ignore an unmatched </a>: letting depth go negative would
                // hide the text of every well-formed anchor that follows.
                depth--
            }
        }
    }
}

Rendering HTML

// modifyAndRender replaces the text of every <title> element beneath doc
// with "New Title" and writes the resulting tree to standard output. It
// returns the error reported by html.Render.
func modifyAndRender(doc *html.Node) error {
    for node := range doc.Descendants() {
        if node.Type != html.ElementNode || node.Data != "title" {
            continue
        }
        // Only a title whose first child is a text node is rewritten.
        if text := node.FirstChild; text != nil && text.Type == html.TextNode {
            text.Data = "New Title"
        }
    }

    // Serialize the modified tree.
    return html.Render(os.Stdout, doc)
}

Working with Attributes

// extractLinks collects the href value of every anchor element beneath doc,
// in the order the nodes are visited. The result is nil when no href
// attribute is found.
func extractLinks(doc *html.Node) []string {
    var hrefs []string

    for node := range doc.Descendants() {
        if node.Type != html.ElementNode || node.Data != "a" {
            continue
        }
        for _, a := range node.Attr {
            if a.Key != "href" {
                continue
            }
            hrefs = append(hrefs, a.Val)
        }
    }

    return hrefs
}

Version

Tile

Files

html.mddocs/

HTML Parsing and Tokenization

Import

Overview

Variables

Parsing

Rendering

Node

Node Methods

Attribute

Tokenizer

Tokenizer Methods

Token

Usage Examples

Parsing HTML

Tokenizing HTML (High-Level API)

Tokenizing HTML (Low-Level API)

Rendering HTML

Working with Attributes

Version

Tile

Files

html.mddocs/

HTML Parsing and Tokenization

Import

Overview

Variables

Parsing

Rendering

Node

Node Methods

Attribute

Tokenizer

Tokenizer Methods

Token

Usage Examples

Parsing HTML

Tokenizing HTML (High-Level API)

Tokenizing HTML (Low-Level API)

Rendering HTML

Working with Attributes

html.mddocs/