The golang.org/x/exp/utf8string package provides an efficient way to index strings by rune (Unicode code point) rather than by byte. This is particularly useful when working with UTF-8 encoded strings, because indexing a standard Go string operates on bytes, so multi-byte characters cannot be addressed directly by their position.
golang.org/x/exp/utf8string
import "golang.org/x/exp/utf8string"
package main
import (
"fmt"
"golang.org/x/exp/utf8string"
)
func main() {
// Create a new UTF-8 string
s := utf8string.NewString("Hello, 世界!")
// Get the number of runes (not bytes)
fmt.Println("Rune count:", s.RuneCount()) // Output: 10 (H, e, l, l, o, ,, space, 世, 界, !)
// Index by rune position (not byte position)
r := s.At(7)
fmt.Printf("Rune at position 7: %c\n", r) // Output: 世
// Get a slice by rune positions
slice := s.Slice(0, 5)
fmt.Println("Slice [0:5]:", slice) // Output: Hello
// Get the full string
fmt.Println("Full string:", s.String()) // Output: Hello, 世界!
// Check if the string is ASCII only
fmt.Println("Is ASCII:", s.IsASCII()) // Output: false
}
The String type wraps a regular Go string and provides efficient rune-based indexing by maintaining internal state. Key characteristics:
String type maintains internal state and is not thread-safe
Create a new UTF-8 string with rune-based indexing capabilities.
func NewString(contents string) *String
Creates a new String instance that wraps the provided UTF-8 string, enabling efficient rune-based indexing and operations.
Parameters:
contents string - The UTF-8 encoded string to wrap
Returns:
*String - A pointer to a newly created String instance
Initialize an existing String structure with new contents.
func (s *String) Init(contents string) *String
Initializes an existing String to hold the provided contents. Useful for reusing a String instance with different content.
Parameters:
contents string - The UTF-8 encoded string to store
Returns:
*String - A pointer to the initialized String
Retrieve a single rune at a specific index position.
func (s *String) At(i int) rune
Returns the rune at the specified index. The sequence of runes is the same as iterating over the contents with a for range clause.
Parameters:
i int - The zero-based rune index
Returns:
rune - The rune (Unicode code point) at position i
Get the total number of runes in the string.
func (s *String) RuneCount() int
Returns the number of runes (Unicode code points) in the String. For the string "Hello, 世界", this returns 9, not the byte length of 13.
Returns:
int - The number of runes in the string
Extract a substring using rune positions.
func (s *String) Slice(i, j int) string
Returns the string sliced at rune positions [i:j], similar to string slicing in Go but using rune indices instead of byte indices.
Parameters:
i int - The starting rune index (inclusive)
j int - The ending rune index (exclusive)
Returns:
string - The sliced substring
Check if the string contains only ASCII characters.
func (s *String) IsASCII() bool
Returns a boolean indicating whether the String contains only ASCII bytes. This is useful for optimization decisions since ASCII strings have O(1) random access.
Returns:
bool - true if the string contains only ASCII characters, false otherwise
Get the underlying string value.
func (s *String) String() string
Returns the full contents of the String. This method makes the String type directly printable by fmt.Print and other formatting functions.
Returns:
string - The complete string contents
type String struct {
// Has unexported fields
}
String wraps a regular string with internal state that provides efficient indexing by code point (rune) index, as opposed to byte index.
Performance Characteristics:
String has internal mutable state and is not thread-safe
package main
import (
"fmt"
"golang.org/x/exp/utf8string"
)
func main() {
// Using a standard Go string with multi-byte characters
standardStr := "こんにちは" // Japanese text (5 characters, 15 bytes)
fmt.Printf("Standard string byte length: %d\n", len(standardStr)) // Output: 15
// Using UTF-8 String for efficient rune indexing
utf8Str := utf8string.NewString("こんにちは")
fmt.Printf("UTF-8 String rune count: %d\n", utf8Str.RuneCount()) // Output: 5
// Access runes by position
for i := 0; i < utf8Str.RuneCount(); i++ {
fmt.Printf("Position %d: %c\n", i, utf8Str.At(i))
}
}
package main
import (
"fmt"
"golang.org/x/exp/utf8string"
)
func main() {
// ASCII string
asciiStr := utf8string.NewString("Hello")
fmt.Printf("'%s' is ASCII: %v\n", asciiStr.String(), asciiStr.IsASCII()) // true
// Non-ASCII string
mixedStr := utf8string.NewString("Hello, 世界")
fmt.Printf("'%s' is ASCII: %v\n", mixedStr.String(), mixedStr.IsASCII()) // false
}
package main
import (
"fmt"
"golang.org/x/exp/utf8string"
)
func main() {
original := utf8string.NewString("The quick brown fox")
// Slice by rune positions
words := original.Slice(4, 9) // "quick"
fmt.Println(words) // Output: quick
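// Compare with standard string slicing: this text is pure ASCII, so byte and rune indices coincide
standardStr := "The quick brown fox"
fmt.Println(standardStr[4:9]) // Output: quick (same result; byte slicing only diverges from rune slicing when multi-byte characters are present)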
}
package main
import (
"fmt"
"golang.org/x/exp/utf8string"
)
func main() {
// Create a String instance
s := utf8string.NewString("First string")
fmt.Printf("Rune count: %d\n", s.RuneCount()) // Output: 12
// Reuse the instance with new content
s.Init("Second string")
fmt.Printf("Rune count: %d\n", s.RuneCount()) // Output: 13
fmt.Println(s.String()) // Output: Second string
}
package main
import (
"fmt"
"golang.org/x/exp/utf8string"
"unicode/utf8"
)
func main() {
text := "Café" // 4 runes: 3 bytes for "Caf" + 2 bytes for "é" = 5 bytes total
// Standard Go string indexing (by bytes)
fmt.Printf("Standard string length: %d bytes\n", len(text)) // 5
// Using utf8 package to decode runes
runeCount := utf8.RuneCountInString(text)
fmt.Printf("Rune count (manual): %d runes\n", runeCount) // 4
// Using UTF-8 String for easy rune access
utf8Str := utf8string.NewString(text)
fmt.Printf("UTF-8 String rune count: %d runes\n", utf8Str.RuneCount()) // 4
// Easy access to specific runes
for i := 0; i < utf8Str.RuneCount(); i++ {
fmt.Printf("Rune %d: %c\n", i, utf8Str.At(i))
}
}