Package html implements an HTML5-compliant tokenizer and parser.
import "golang.org/x/net/html"

This package provides both a tokenizer and a parser:
For semantically well-formed HTML documents, use the parser rather than the tokenizer. In security contexts, if trust decisions are being made using tokenized or parsed content, the input must be re-serialized (using Render or Token.String) for those trust decisions to hold.
var ErrBufferExceeded = errors.New("max buffer exceeded")

// Parse returns the parse tree for the HTML from the given Reader
func Parse(r io.Reader) (*Node, error)
// ParseWithOptions is like Parse, with options
func ParseWithOptions(r io.Reader, opts ...ParseOption) (*Node, error)
// ParseFragment parses a fragment of HTML and returns the nodes that were found
func ParseFragment(r io.Reader, context *Node) ([]*Node, error)
// ParseFragmentWithOptions is like ParseFragment, with options
func ParseFragmentWithOptions(r io.Reader, context *Node, opts ...ParseOption) ([]*Node, error)
// ParseOption configures a parser
type ParseOption func(p *parser)
// ParseOptionEnableScripting configures the scripting flag (default: true)
func ParseOptionEnableScripting(enable bool) ParseOption

Parse implements the HTML5 parsing algorithm. The resultant tree can contain implicitly created nodes, and nodes' parents can differ from the nesting implied by a naive processing of start and end tags. Parse will reject HTML that is nested deeper than 512 elements. The input is assumed to be UTF-8 encoded.
// Render renders the parse tree n to the given writer
func Render(w io.Writer, n *Node) error
// EscapeString escapes special characters like "<" to become "&lt;"
func EscapeString(s string) string
// UnescapeString unescapes entities like "&lt;" to become "<"
func UnescapeString(s string) string

Rendering is done on a 'best effort' basis. EscapeString escapes only five characters: <, >, &, ' and ". UnescapeString unescapes a larger range of entities.
// Node consists of a NodeType and some Data
type Node struct {
Parent *Node
FirstChild *Node
LastChild *Node
PrevSibling *Node
NextSibling *Node
Type NodeType
DataAtom atom.Atom
Data string
Namespace string
Attr []Attribute
}
// NodeType is the type of a Node
type NodeType uint32
const (
ErrorNode NodeType = iota
TextNode
DocumentNode
ElementNode
CommentNode
DoctypeNode
RawNode // For inserting raw HTML without escaping
)

// AppendChild adds a node c as a child of n
func (n *Node) AppendChild(c *Node)
// InsertBefore inserts newChild as a child of n, immediately before oldChild
func (n *Node) InsertBefore(newChild, oldChild *Node)
// RemoveChild removes a node c that is a child of n
func (n *Node) RemoveChild(c *Node)
// Ancestors returns an iterator over the ancestors of n, starting with n.Parent
func (n *Node) Ancestors() iter.Seq[*Node]
// ChildNodes returns an iterator over the immediate children of n
func (n *Node) ChildNodes() iter.Seq[*Node]
// Descendants returns an iterator over all nodes recursively beneath n
func (n *Node) Descendants() iter.Seq[*Node]// Attribute is an attribute namespace-key-value triple
type Attribute struct {
Namespace string
Key string
Val string
}

Val is unescaped (it looks like "a<b" rather than "a&lt;b"). Namespace is only used by the parser, not the tokenizer.
// Tokenizer returns a stream of HTML Tokens
type Tokenizer struct {
// Has unexported fields
}
// NewTokenizer returns a new HTML Tokenizer for the given Reader
func NewTokenizer(r io.Reader) *Tokenizer
// NewTokenizerFragment returns a new HTML Tokenizer for tokenizing an existing element's InnerHTML fragment
func NewTokenizerFragment(r io.Reader, contextTag string) *Tokenizer

// Next scans the next token and returns its type
func (z *Tokenizer) Next() TokenType
// Token returns the current Token
func (z *Tokenizer) Token() Token
// Text returns the unescaped text of a text, comment or doctype token
func (z *Tokenizer) Text() []byte
// TagName returns the lower-cased name of a tag token and whether the tag has attributes
func (z *Tokenizer) TagName() (name []byte, hasAttr bool)
// TagAttr returns the lower-cased key and unescaped value of the next unparsed attribute
func (z *Tokenizer) TagAttr() (key, val []byte, moreAttr bool)
// Raw returns the unmodified text of the current token
func (z *Tokenizer) Raw() []byte
// Err returns the error associated with the most recent ErrorToken token
func (z *Tokenizer) Err() error
// Buffered returns a slice containing data buffered but not yet tokenized
func (z *Tokenizer) Buffered() []byte
// SetMaxBuf sets a limit on the amount of data buffered during tokenization
func (z *Tokenizer) SetMaxBuf(n int)
// AllowCDATA sets whether or not the tokenizer recognizes <![CDATA[foo]]>
func (z *Tokenizer) AllowCDATA(allowCDATA bool)
// NextIsNotRawText instructs the tokenizer that the next token should not be considered as 'raw text'
func (z *Tokenizer) NextIsNotRawText()

// Token consists of a TokenType and some Data
type Token struct {
Type TokenType
DataAtom atom.Atom
Data string
Attr []Attribute
}
func (t Token) String() string
// TokenType is the type of a Token
type TokenType uint32
const (
ErrorToken TokenType = iota // Error occurred during tokenization
TextToken // Text node
StartTagToken // <a>
EndTagToken // </a>
SelfClosingTagToken // <br/>
CommentToken // <!--x-->
DoctypeToken // <!DOCTYPE x>
)
func (t TokenType) String() string

import (
	"errors"
	"fmt"
	"io"
	"os"
	"strings"

	"golang.org/x/net/html"
)
func parseHTML(htmlContent string) error {
doc, err := html.Parse(strings.NewReader(htmlContent))
if err != nil {
return err
}
// Process each anchor node in depth-first order
for n := range doc.Descendants() {
if n.Type == html.ElementNode && n.Data == "a" {
// Process anchor node
for _, attr := range n.Attr {
if attr.Key == "href" {
fmt.Println("Link:", attr.Val)
}
}
}
}
return nil
}func tokenizeHighLevel(htmlContent string) error {
z := html.NewTokenizer(strings.NewReader(htmlContent))
for {
if z.Next() == html.ErrorToken {
// Returning io.EOF indicates success
return z.Err()
}
token := z.Token()
fmt.Printf("Token: %v\n", token)
}
}func extractAnchorText(htmlContent string) error {
z := html.NewTokenizer(strings.NewReader(htmlContent))
depth := 0
for {
tt := z.Next()
switch tt {
case html.ErrorToken:
return z.Err()
case html.TextToken:
if depth > 0 {
fmt.Printf("Anchor text: %s\n", z.Text())
}
case html.StartTagToken, html.EndTagToken:
tn, _ := z.TagName()
if len(tn) == 1 && tn[0] == 'a' {
if tt == html.StartTagToken {
depth++
} else {
depth--
}
}
}
}
}func modifyAndRender(doc *html.Node) error {
// Find and modify nodes
for n := range doc.Descendants() {
if n.Type == html.ElementNode && n.Data == "title" {
if n.FirstChild != nil && n.FirstChild.Type == html.TextNode {
n.FirstChild.Data = "New Title"
}
}
}
// Render modified tree
return html.Render(os.Stdout, doc)
}func extractLinks(doc *html.Node) []string {
var links []string
for n := range doc.Descendants() {
if n.Type == html.ElementNode && n.Data == "a" {
for _, attr := range n.Attr {
if attr.Key == "href" {
links = append(links, attr.Val)
}
}
}
}
return links
}