tessl install tessl/golang-cloud-google-com--go--bigquery@1.72.0

Google Cloud BigQuery client library providing comprehensive Go APIs for querying, loading data, managing datasets and tables, streaming inserts, and accessing BigQuery's ecosystem of services including Storage, Analytics Hub, Data Transfer, and Migration APIs.
This document covers using the BigQuery Storage Read API for high-performance parallel reads of large datasets.
The BigQuery Storage Read API provides fast, parallel access to BigQuery table data using Apache Arrow or Avro formats. It's optimized for reading large amounts of data and can be significantly faster than the traditional query API.
func (c *Client) EnableStorageReadClient(ctx context.Context, opts ...option.ClientOption) error

Enable the Storage Read API for the client:
client, err := bigquery.NewClient(ctx, projectID)
if err != nil {
return err
}
err = client.EnableStorageReadClient(ctx)
if err != nil {
return err
}

Important limitations when the Storage Read API is enabled:

- PageInfo().Token pagination is not supported
- RowIterator.StartIndex is not supported
- Calling EnableStorageReadClient() twice returns an error

Once enabled, table reads and query results automatically use the Storage Read API:
// Table read uses Storage API
it := table.Read(ctx)
for {
var row []bigquery.Value
err := it.Next(&row)
if err == iterator.Done {
break
}
if err != nil {
return err
}
// Process row
}
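// Note: with the Storage Read API enabled, it.PageInfo().Token and
// it.StartIndex are not supported for this iterator (see the limitations
// listed above); iterate with Next until iterator.Done instead.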
// Query results use Storage API
q := client.Query("SELECT * FROM dataset.large_table")
it, err := q.Read(ctx)

Results can also be consumed as raw Arrow record batches through the ArrowIterator interface:

type ArrowIterator interface {
Next() (*ArrowRecordBatch, error)
Schema() Schema
SerializedArrowSchema() []byte
}

type ArrowRecordBatch struct {
Data []byte
Schema []byte
PartitionID string
}

Get Arrow format data:
it, err := q.Read(ctx)
if err != nil {
return err
}
arrowIt, err := it.ArrowIterator()
if err != nil {
return err
}
for {
batch, err := arrowIt.Next()
if err == iterator.Done {
break
}
if err != nil {
return err
}
// Process Arrow batch
fmt.Printf("Partition: %s, Size: %d bytes\n",
batch.PartitionID, len(batch.Data))
}

func NewArrowIteratorReader(it ArrowIterator) io.Reader

Convert to io.Reader for use with Arrow libraries:
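// NewArrowIteratorReader adapts the iterator to an io.Reader intended for
// Arrow IPC readers; a fuller end-to-end sketch appears at the end of this
// document.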
arrowIt, err := it.ArrowIterator()
if err != nil {
return err
}
reader := bigquery.NewArrowIteratorReader(arrowIt)
// Use with Arrow libraries
// import "github.com/apache/arrow/go/v10/arrow/ipc"
// arrowReader, err := ipc.NewReader(reader)

The cloud.google.com/go/bigquery/storage/apiv1 package provides direct access to the Storage Read API.
import "cloud.google.com/go/bigquery/storage/apiv1"
import "cloud.google.com/go/bigquery/storage/apiv1/storagepb"
client, err := storage.NewBigQueryReadClient(ctx)
if err != nil {
return err
}
defer client.Close()

req := &storagepb.CreateReadSessionRequest{
Parent: "projects/my-project",
ReadSession: &storagepb.ReadSession{
Table: "projects/my-project/datasets/my_dataset/tables/my_table",
DataFormat: storagepb.DataFormat_ARROW,
ReadOptions: &storagepb.ReadSession_TableReadOptions{
SelectedFields: []string{"field1", "field2", "field3"},
RowRestriction: "field1 > 100",
},
},
MaxStreamCount: 10,
}
session, err := client.CreateReadSession(ctx, req)
if err != nil {
return err
}

var wg sync.WaitGroup
for _, stream := range session.Streams {
wg.Add(1)
go func(streamName string) {
defer wg.Done()
req := &storagepb.ReadRowsRequest{
ReadStream: streamName,
}
streamReader, err := client.ReadRows(ctx, req)
if err != nil {
return
}
for {
resp, err := streamReader.Recv()
if err == io.EOF {
break
}
if err != nil {
return
}
// Each ReadRowsResponse carries one serialized Arrow record batch; decode it
// together with the serialized schema from session.GetArrowSchema().
arrowRecordBatch := resp.GetArrowRecordBatch()
_ = arrowRecordBatch // ... process or hand off to a decoder here
}
}(stream.Name)
}
wg.Wait()

The Storage Read API is best for:

- Reading large tables or large query results where throughput matters
- Consuming results in parallel across multiple streams
- Working with results as Arrow record batches

Use the traditional query API for:

- Small result sets, where a read session adds unnecessary overhead
- Workloads that need PageInfo().Token pagination or RowIterator.StartIndex

A complete example using the high-level client:
package main
import (
"context"
"fmt"
"log"
"cloud.google.com/go/bigquery"
"google.golang.org/api/iterator"
)
func main() {
ctx := context.Background()
// Create BigQuery client
client, err := bigquery.NewClient(ctx, "my-project")
if err != nil {
log.Fatal(err)
}
defer client.Close()
// Enable Storage Read API
err = client.EnableStorageReadClient(ctx)
if err != nil {
log.Fatal(err)
}
// Read large table using Storage API
dataset := client.Dataset("my_dataset")
table := dataset.Table("large_table")
it := table.Read(ctx)
// Option 1: Read as regular rows
count := 0
for {
var row []bigquery.Value
err := it.Next(&row)
if err == iterator.Done {
break
}
if err != nil {
log.Fatal(err)
}
count++
if count%1000000 == 0 {
fmt.Printf("Processed %d rows\n", count)
}
}
fmt.Printf("Total rows: %d\n", count)
// Option 2: Read as Arrow batches
it2 := table.Read(ctx)
arrowIt, err := it2.ArrowIterator()
if err != nil {
log.Fatal(err)
}
batchCount := 0
for {
batch, err := arrowIt.Next()
if err == iterator.Done {
break
}
if err != nil {
log.Fatal(err)
}
batchCount++
fmt.Printf("Arrow batch %d: %d bytes from partition %s\n",
batchCount, len(batch.Data), batch.PartitionID)
}
}
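For reference, the following sketch expands the commented-out ipc.NewReader lines above into a runnable helper using the Apache Arrow Go library. It is illustrative rather than part of the BigQuery client's documentation: the function name decodeArrow and the query text are hypothetical, the import path github.com/apache/arrow/go/v10/arrow/ipc mirrors the earlier comment and should match whichever Arrow release your module depends on, and it assumes EnableStorageReadClient has already been called on the client.

// import "github.com/apache/arrow/go/v10/arrow/ipc"

// decodeArrow is a hypothetical helper: it runs a query and decodes the
// resulting Arrow record batches with the Arrow IPC reader.
func decodeArrow(ctx context.Context, client *bigquery.Client, sql string) error {
	it, err := client.Query(sql).Read(ctx)
	if err != nil {
		return err
	}
	arrowIt, err := it.ArrowIterator()
	if err != nil {
		return err
	}
	// NewArrowIteratorReader exposes the Arrow stream as an io.Reader that
	// ipc.NewReader can parse.
	rdr, err := ipc.NewReader(bigquery.NewArrowIteratorReader(arrowIt))
	if err != nil {
		return err
	}
	defer rdr.Release()
	for rdr.Next() {
		rec := rdr.Record()
		fmt.Printf("batch: %d rows, %d columns\n", rec.NumRows(), rec.NumCols())
	}
	return rdr.Err()
}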