or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

docs

collation.md encoding.md formatting.md index.md language.md localization.md search-and-security.md text-transformation.md unicode.md
tile.json

docs/encoding.md

Character Encoding

This document covers all character encoding packages in golang.org/x/text that provide conversion between UTF-8 and various legacy character encodings.

Package Overview

  • encoding: Core encoding interfaces and utilities
  • encoding/charmap: Simple 8-bit character encodings (IBM, ISO-8859, Windows)
  • encoding/htmlindex: W3C HTML5 encoding names
  • encoding/ianaindex: IANA-registered encoding names
  • encoding/japanese: Japanese encodings (Shift JIS, EUC-JP, ISO-2022-JP)
  • encoding/korean: Korean encodings (EUC-KR)
  • encoding/simplifiedchinese: Simplified Chinese encodings (GBK, GB18030, HZ-GB2312)
  • encoding/traditionalchinese: Traditional Chinese encodings (Big5)
  • encoding/unicode: Unicode encodings (UTF-8, UTF-16)

Core Encoding Package

Import path: golang.org/x/text/encoding

Core Types

// Encoding is a character set encoding that can be transformed to and from UTF-8
type Encoding interface {
    NewDecoder() *Decoder
    NewEncoder() *Encoder
}

// Decoder converts bytes to UTF-8
type Decoder struct {
    transform.Transformer
}

func (d *Decoder) Bytes(b []byte) ([]byte, error)
func (d *Decoder) String(s string) (string, error)
func (d *Decoder) Reader(r io.Reader) io.Reader

// Encoder converts bytes from UTF-8
type Encoder struct {
    transform.Transformer
}

func (e *Encoder) Bytes(b []byte) ([]byte, error)
func (e *Encoder) String(s string) (string, error)
func (e *Encoder) Writer(w io.Writer) io.Writer

Predefined Encodings

// Nop encoding - no transformation
var Nop Encoding

// Replacement encoding - yields U+FFFD replacement character
var Replacement Encoding

Encoding Options

// HTMLEscapeUnsupported wraps encoders to replace unsupported runes with HTML escape sequences
func HTMLEscapeUnsupported(e *Encoder) *Encoder

// ReplaceUnsupported wraps encoders to replace unsupported runes with encoding-specific replacement
func ReplaceUnsupported(e *Encoder) *Encoder

Constants and Variables

// ASCIISub is the ASCII substitute character
const ASCIISub = '\x1a'

// ErrInvalidUTF8 means that a transformer encountered invalid UTF-8
var ErrInvalidUTF8 error

// UTF8Validator returns ErrInvalidUTF8 on the first invalid UTF-8 byte
var UTF8Validator transform.Transformer

Usage Examples

import (
    "golang.org/x/text/encoding"
    "golang.org/x/text/encoding/charmap"
)

// Decode from legacy encoding to UTF-8
decoder := charmap.Windows1252.NewDecoder()
utf8Text, err := decoder.String(windows1252Text)

// Encode from UTF-8 to legacy encoding
encoder := charmap.Windows1252.NewEncoder()
encodedText, err := encoder.String(utf8Text)

// Use with io.Reader
utf8Reader := decoder.Reader(legacyReader)

// Use with io.Writer with HTML escaping for unsupported characters
encoder = encoding.HTMLEscapeUnsupported(encoder)
legacyWriter := encoder.Writer(outputWriter)

Charmap Package

Import path: golang.org/x/text/encoding/charmap

Provides simple 8-bit character encodings.

Charmap Type

// Charmap is an 8-bit character set encoding
type Charmap struct {}

func (c *Charmap) NewDecoder() *encoding.Decoder
func (c *Charmap) NewEncoder() *encoding.Encoder
func (c *Charmap) DecodeByte(b byte) rune
func (c *Charmap) EncodeRune(r rune) (b byte, ok bool)
func (c *Charmap) String() string
func (c *Charmap) ID() (mib identifier.MIB, other string)

IBM Code Pages

var CodePage037 *Charmap  // IBM Code Page 037
var CodePage437 *Charmap  // IBM Code Page 437
var CodePage850 *Charmap  // IBM Code Page 850
var CodePage852 *Charmap  // IBM Code Page 852
var CodePage855 *Charmap  // IBM Code Page 855
var CodePage858 *Charmap  // IBM Code Page 858
var CodePage860 *Charmap  // IBM Code Page 860
var CodePage862 *Charmap  // IBM Code Page 862
var CodePage863 *Charmap  // IBM Code Page 863
var CodePage865 *Charmap  // IBM Code Page 865
var CodePage866 *Charmap  // IBM Code Page 866
var CodePage1047 *Charmap // IBM Code Page 1047
var CodePage1140 *Charmap // IBM Code Page 1140

ISO 8859 Encodings

var ISO8859_1 *Charmap  // ISO 8859-1 (Latin-1)
var ISO8859_2 *Charmap  // ISO 8859-2 (Latin-2)
var ISO8859_3 *Charmap  // ISO 8859-3 (Latin-3)
var ISO8859_4 *Charmap  // ISO 8859-4 (Latin-4)
var ISO8859_5 *Charmap  // ISO 8859-5 (Cyrillic)
var ISO8859_6 *Charmap  // ISO 8859-6 (Arabic)
var ISO8859_7 *Charmap  // ISO 8859-7 (Greek)
var ISO8859_8 *Charmap  // ISO 8859-8 (Hebrew)
var ISO8859_9 *Charmap  // ISO 8859-9 (Latin-5)
var ISO8859_10 *Charmap // ISO 8859-10 (Latin-6)
var ISO8859_13 *Charmap // ISO 8859-13 (Latin-7)
var ISO8859_14 *Charmap // ISO 8859-14 (Latin-8)
var ISO8859_15 *Charmap // ISO 8859-15 (Latin-9)
var ISO8859_16 *Charmap // ISO 8859-16 (Latin-10)

// ISO 8859-6 variants
var ISO8859_6E encoding.Encoding // ISO 8859-6E (explicit)
var ISO8859_6I encoding.Encoding // ISO 8859-6I (implicit)

// ISO 8859-8 variants
var ISO8859_8E encoding.Encoding // ISO 8859-8E (explicit)
var ISO8859_8I encoding.Encoding // ISO 8859-8I (implicit)

Windows Code Pages

var Windows874 *Charmap  // Windows 874 (Thai)
var Windows1250 *Charmap // Windows 1250 (Central European)
var Windows1251 *Charmap // Windows 1251 (Cyrillic)
var Windows1252 *Charmap // Windows 1252 (Western European)
var Windows1253 *Charmap // Windows 1253 (Greek)
var Windows1254 *Charmap // Windows 1254 (Turkish)
var Windows1255 *Charmap // Windows 1255 (Hebrew)
var Windows1256 *Charmap // Windows 1256 (Arabic)
var Windows1257 *Charmap // Windows 1257 (Baltic)
var Windows1258 *Charmap // Windows 1258 (Vietnamese)

Other 8-bit Encodings

var KOI8R *Charmap            // KOI8-R (Russian)
var KOI8U *Charmap            // KOI8-U (Ukrainian)
var Macintosh *Charmap        // Macintosh (Mac OS Roman)
var MacintoshCyrillic *Charmap // Macintosh Cyrillic
var XUserDefined *Charmap     // X-User-Defined

All Encodings List

// All contains all defined encodings in this package
var All []encoding.Encoding

Usage Examples

import "golang.org/x/text/encoding/charmap"

// Decode Windows-1252 to UTF-8
utf8Text, err := charmap.Windows1252.NewDecoder().String(win1252Text)

// Encode UTF-8 to ISO-8859-1
latin1Text, err := charmap.ISO8859_1.NewEncoder().String(utf8Text)

// Check if a rune can be encoded
b, ok := charmap.Windows1252.EncodeRune('é')

// Decode a single byte
r := charmap.Windows1252.DecodeByte(0xE9) // é

HTML Index Package

Import path: golang.org/x/text/encoding/htmlindex

Maps character set encoding names to Encodings as recommended by the W3C for HTML 5.

Functions

// Get returns an Encoding for HTML5 encoding names (case-insensitive)
func Get(name string) (encoding.Encoding, error)

// Name reports the canonical name of the given Encoding
func Name(e encoding.Encoding) (string, error)

// LanguageDefault returns the canonical name of the default encoding for a given language
func LanguageDefault(tag language.Tag) string

Usage Examples

import "golang.org/x/text/encoding/htmlindex"

// Get encoding by HTML5 name
enc, err := htmlindex.Get("windows-1252")
enc, err = htmlindex.Get("iso-8859-1")
enc, err = htmlindex.Get("utf-8")

// Get canonical name
name, err := htmlindex.Name(charmap.Windows1252)

// Get default encoding for language
defaultEnc := htmlindex.LanguageDefault(language.Japanese)

IANA Index Package

Import path: golang.org/x/text/encoding/ianaindex

Maps names to Encodings as specified by the IANA registry.

Index Type

// Index maps IANA-registered names to Encodings
type Index struct {}

func (i *Index) Encoding(name string) (encoding.Encoding, error)
func (i *Index) Name(e encoding.Encoding) (string, error)

Predefined Indexes

// MIME index for MIME names
var MIME *Index

// IANA index supporting all names and aliases using IANA names as canonical
var IANA *Index

// MIB index associating MIB display names with Encodings
var MIB *Index

Usage Examples

import "golang.org/x/text/encoding/ianaindex"

// Get encoding by IANA name
enc, err := ianaindex.IANA.Encoding("windows-1252")

// Get encoding by MIME name
enc, err = ianaindex.MIME.Encoding("iso-8859-1")

// Get canonical name
name, err := ianaindex.IANA.Name(charmap.Windows1252)

Japanese Package

Import path: golang.org/x/text/encoding/japanese

Provides Japanese encodings.

Predefined Encodings

// EUC-JP encoding
var EUCJP encoding.Encoding

// ISO-2022-JP encoding
var ISO2022JP encoding.Encoding

// Shift JIS encoding (also known as Code Page 932 and Windows-31J)
var ShiftJIS encoding.Encoding

// All defined encodings in this package
var All []encoding.Encoding

Usage Examples

import "golang.org/x/text/encoding/japanese"

// Decode Shift JIS to UTF-8
utf8Text, err := japanese.ShiftJIS.NewDecoder().String(shiftJISText)

// Encode UTF-8 to EUC-JP
eucjpText, err := japanese.EUCJP.NewEncoder().String(utf8Text)

// ISO-2022-JP (used in email)
iso2022jpText, err := japanese.ISO2022JP.NewEncoder().String(utf8Text)

Korean Package

Import path: golang.org/x/text/encoding/korean

Provides Korean encodings.

Predefined Encodings

// EUC-KR encoding (also known as Code Page 949)
var EUCKR encoding.Encoding

// All defined encodings in this package
var All []encoding.Encoding

Usage Examples

import "golang.org/x/text/encoding/korean"

// Decode EUC-KR to UTF-8
utf8Text, err := korean.EUCKR.NewDecoder().String(euckrText)

// Encode UTF-8 to EUC-KR.
// The result gets its own name: euckrText is already declared as the
// decoder input above, so `euckrText, err :=` would not compile
// ("no new variables on left side of :=").
encodedText, err := korean.EUCKR.NewEncoder().String(utf8Text)

Simplified Chinese Package

Import path: golang.org/x/text/encoding/simplifiedchinese

Provides Simplified Chinese encodings.

Predefined Encodings

// GB18030 encoding
var GB18030 encoding.Encoding

// GBK encoding (also known as Code Page 936)
// Encodes an extension of the GB2312 character set
var GBK encoding.Encoding

// HZ-GB2312 encoding
var HZGB2312 encoding.Encoding

// All defined encodings in this package
var All []encoding.Encoding

Usage Examples

import "golang.org/x/text/encoding/simplifiedchinese"

// Decode GBK to UTF-8
utf8Text, err := simplifiedchinese.GBK.NewDecoder().String(gbkText)

// Encode UTF-8 to GB18030
gb18030Text, err := simplifiedchinese.GB18030.NewEncoder().String(utf8Text)

// HZ-GB2312 (7-bit encoding)
hzText, err := simplifiedchinese.HZGB2312.NewEncoder().String(utf8Text)

Traditional Chinese Package

Import path: golang.org/x/text/encoding/traditionalchinese

Provides Traditional Chinese encodings.

Predefined Encodings

// Big5 encoding (also known as Code Page 950)
var Big5 encoding.Encoding

// All defined encodings in this package
var All []encoding.Encoding

Usage Examples

import "golang.org/x/text/encoding/traditionalchinese"

// Decode Big5 to UTF-8
utf8Text, err := traditionalchinese.Big5.NewDecoder().String(big5Text)

// Encode UTF-8 to Big5.
// The result gets its own name: big5Text is already declared as the
// decoder input above, so `big5Text, err :=` would not compile
// ("no new variables on left side of :=").
encodedText, err := traditionalchinese.Big5.NewEncoder().String(utf8Text)

Unicode Package

Import path: golang.org/x/text/encoding/unicode

Provides Unicode encodings such as UTF-16.

Types

// Endianness is a UTF-16 encoding's default endianness
type Endianness bool

const (
    BigEndian    Endianness = false // UTF-16BE
    LittleEndian Endianness = true  // UTF-16LE
)

// BOMPolicy is a UTF-16 encoding's byte order mark policy.
//
// The exported values are bit sets built from three internal flags
// (0x01 = write a BOM when encoding, 0x02 = accept a BOM when decoding,
// 0x04 = require a BOM when decoding), so they are not consecutive integers.
type BOMPolicy uint8

const (
    // IgnoreBOM means to ignore any byte order marks.
    IgnoreBOM BOMPolicy = 0
    // UseBOM means that the UTF-16 form may start with a BOM, which then
    // overrides the default endianness (write | accept = 3).
    UseBOM BOMPolicy = 0x01 | 0x02
    // ExpectBOM means that the UTF-16 form must start with a BOM, which then
    // overrides the default endianness (write | accept | require = 7).
    ExpectBOM BOMPolicy = 0x01 | 0x02 | 0x04
)

Predefined Encodings

// UTF-8 encoding without BOM handling
var UTF8 encoding.Encoding

// UTF-8 encoding where decoder strips leading BOM and encoder adds one
var UTF8BOM encoding.Encoding

// All lists configurations for each IANA-defined UTF-16 variant
var All []encoding.Encoding

Functions

// UTF16 returns a UTF-16 Encoding for the given endianness and BOM policy
func UTF16(e Endianness, b BOMPolicy) encoding.Encoding

// BOMOverride returns a decoder that switches based on BOM presence
func BOMOverride(fallback transform.Transformer) transform.Transformer

Errors

// ErrMissingBOM means UTF-16 input with ExpectBOM did not find a BOM
var ErrMissingBOM error

Usage Examples

import "golang.org/x/text/encoding/unicode"

// UTF-16LE with BOM
enc := unicode.UTF16(unicode.LittleEndian, unicode.UseBOM)
utf16Text, err := enc.NewEncoder().String(utf8Text)

// UTF-16BE without BOM.
// The decoded result gets its own name: utf8Text and err are both already
// declared above, so `utf8Text, err :=` would not compile.
enc = unicode.UTF16(unicode.BigEndian, unicode.IgnoreBOM)
decodedText, err := enc.NewDecoder().String(utf16beText)

// UTF-8 with BOM
utf8WithBOM, err := unicode.UTF8BOM.NewEncoder().String(utf8Text)

// BOM-based decoder selection: decode as UTF-8 unless the input starts
// with a byte order mark that selects another Unicode encoding
decoder := unicode.BOMOverride(unicode.UTF8.NewDecoder().Transformer)

Common Patterns

Converting Between Encodings

import (
    "golang.org/x/text/encoding/charmap"
    "golang.org/x/text/transform"
)

// Convert from Windows-1252 to ISO-8859-1
func convertEncoding(input []byte) ([]byte, error) {
    // Decode to UTF-8
    decoder := charmap.Windows1252.NewDecoder()
    utf8, err := decoder.Bytes(input)
    if err != nil {
        return nil, err
    }

    // Encode to target
    encoder := charmap.ISO8859_1.NewEncoder()
    return encoder.Bytes(utf8)
}

Streaming Conversion

import (
    "io"
    "golang.org/x/text/encoding/japanese"
)

// convertStream reads Shift JIS data from r and writes it to w as UTF-8.
// It returns the first error reported by the copy, or nil on success.
func convertStream(r io.Reader, w io.Writer) error {
    // Wrap the source so that every read yields decoded UTF-8 bytes.
    shiftJISDecoder := japanese.ShiftJIS.NewDecoder()
    decoded := shiftJISDecoder.Reader(r)

    // Stream the decoded bytes to the destination.
    if _, copyErr := io.Copy(w, decoded); copyErr != nil {
        return copyErr
    }
    return nil
}

Handling Unsupported Characters

import (
    "golang.org/x/text/encoding"
    "golang.org/x/text/encoding/charmap"
)

// HTML escape unsupported characters: runes that ISO-8859-1 cannot
// represent are written as decimal numeric character references
encoder := encoding.HTMLEscapeUnsupported(
    charmap.ISO8859_1.NewEncoder(),
)
result, err := encoder.String("Café ☕") // "Caf\xe9 &#9749;" (ISO-8859-1 bytes)

// Replace unsupported characters with encoding-specific replacement;
// for ISO-8859-1 that is the ASCII substitute byte (encoding.ASCIISub), not '?'
encoder = encoding.ReplaceUnsupported(
    charmap.ISO8859_1.NewEncoder(),
)
result, err = encoder.String("Café ☕") // "Caf\xe9 \x1a" (ISO-8859-1 bytes)

Detecting Encoding by Name

import (
    "strings"
    "golang.org/x/text/encoding/htmlindex"
)

func getEncodingByName(name string) (encoding.Encoding, error) {
    // Try HTML5 names first (most common)
    name = strings.ToLower(name)
    return htmlindex.Get(name)
}

Version Information

The encoding packages are based on:

  • Unicode 15.0.0
  • IANA Character Sets registry
  • W3C HTML5 encoding specification
  • WHATWG Encoding Standard