Google BigQuery connector for pandas DataFrames
npx @tessl/cli install tessl/pypi-pandas-gbq@0.29.0

Google BigQuery connector for pandas DataFrames providing seamless two-way data integration. The library offers high-level functions for executing SQL queries and returning results as DataFrames and for uploading DataFrames to BigQuery tables, with comprehensive support for authentication, schema inference, data type mapping, and performance optimizations.
pip install pandas-gbq
pip install pandas-gbq[bqstorage]
pip install pandas-gbq[tqdm]
pip install pandas-gbq[geopandas]

import pandas_gbq

Standard import pattern:

from pandas_gbq import read_gbq, to_gbq

For context management:

from pandas_gbq import context, Context

import pandas as pd
from pandas_gbq import read_gbq, to_gbq
# Reading data from BigQuery
df = read_gbq(
    "SELECT name, age FROM `my_project.my_dataset.users` WHERE age > 21",
    project_id="my-gcp-project"
)

# Writing DataFrame to BigQuery
sample_data = pd.DataFrame({
    'name': ['Alice', 'Bob', 'Charlie'],
    'age': [25, 30, 35],
    'city': ['New York', 'London', 'Tokyo']
})

to_gbq(
    dataframe=sample_data,
    destination_table="my_dataset.new_users",
    project_id="my-gcp-project",
    if_exists="append"
)
# Using context for session-wide configuration
import pandas_gbq
pandas_gbq.context.project = "my-gcp-project"
pandas_gbq.context.dialect = "standard"
# Now queries can omit project_id
df = read_gbq("SELECT * FROM `my_dataset.my_table`")

pandas-gbq serves as a high-level connector between pandas DataFrames and Google BigQuery, built on top of the Google Cloud BigQuery client library. It provides a seamless interface that handles the complexities of BigQuery integration: authentication, schema inference, data type mapping, and performance optimization.
The design prioritizes ease of use while exposing advanced BigQuery features through comprehensive parameter options, making it suitable for both simple data analysis tasks and complex ETL pipelines.
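For example, advanced job options flow through the `configuration` parameter of read_gbq, which accepts a dictionary in the BigQuery REST API job-configuration format. A minimal sketch, assuming placeholder project and table names, that disables the query cache and pins the job location:

from pandas_gbq import read_gbq

# Job configuration in the BigQuery REST API format; here it only
# disables the query cache for this query (placeholder names).
config = {"query": {"useQueryCache": False}}

df = read_gbq(
    "SELECT * FROM `my_dataset.my_table`",
    project_id="my-gcp-project",
    location="US",
    configuration=config,
)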
Execute SQL queries or read tables directly from Google BigQuery, returning results as pandas DataFrames with full data type support and optional BigQuery Storage API acceleration.
def read_gbq(
    query_or_table: str,
    project_id: str = None,
    index_col: str = None,
    columns: list[str] = None,
    reauth: bool = False,
    auth_local_webserver: bool = True,
    dialect: str = None,
    location: str = None,
    configuration: dict = None,
    credentials = None,
    use_bqstorage_api: bool = False,
    max_results: int = None,
    verbose: bool = None,
    private_key: str = None,
    progress_bar_type: str = "tqdm",
    dtypes: dict = None,
    auth_redirect_uri: str = None,
    client_id: str = None,
    client_secret: str = None,
    *,
    col_order: list[str] = None,
    bigquery_client = None
) -> pd.DataFrame:
    """
    Read data from Google BigQuery to a pandas DataFrame.

    Parameters:
    - query_or_table: SQL query string or table ID (dataset.table format)
    - project_id: Google Cloud Platform project ID (optional if set in environment/context)
    - index_col: Column name to use as DataFrame index
    - columns: List of column names in desired order for results
    - reauth: Force re-authentication (useful for multiple accounts)
    - auth_local_webserver: Use local webserver flow instead of console flow
    - dialect: SQL dialect ('standard' or 'legacy', default: 'standard')
    - location: Geographic location for query job execution
    - configuration: Advanced query configuration parameters
    - credentials: Google auth credentials object
    - use_bqstorage_api: Enable BigQuery Storage API for faster reads
    - max_results: Maximum number of rows to return
    - verbose: Enable verbose logging (deprecated)
    - private_key: Service account private key (deprecated)
    - progress_bar_type: Progress bar type ('tqdm' or None)
    - dtypes: Explicit data type mapping for columns
    - auth_redirect_uri: Custom OAuth redirect URI
    - client_id: OAuth client ID
    - client_secret: OAuth client secret
    - col_order: Column order for results (keyword-only)
    - bigquery_client: Pre-configured BigQuery client instance (keyword-only)

    Returns:
    pandas.DataFrame: Query results as DataFrame with appropriate data types
    """

Usage examples:
# Basic query
df = read_gbq("SELECT * FROM `dataset.table` LIMIT 1000", project_id="my-project")
# Direct table read
df = read_gbq("my_dataset.my_table", project_id="my-project")
# With BigQuery Storage API for large results
df = read_gbq(
    "SELECT * FROM `big_dataset.huge_table`",
    project_id="my-project",
    use_bqstorage_api=True
)
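# A sketch combining index_col, max_results, and progress_bar_type
# (table and column names here are placeholders): use a column as the
# DataFrame index, cap the returned rows, and disable the progress bar.
df = read_gbq(
    "SELECT user_id, name, age FROM `my_dataset.users`",
    project_id="my-project",
    index_col="user_id",
    max_results=500,
    progress_bar_type=None,
)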
# With custom column ordering and types
df = read_gbq(
    "SELECT name, age, created_at FROM `users.profile`",
    project_id="my-project",
    columns=["name", "age", "created_at"],
    dtypes={"age": "Int64"}  # Use nullable integer type
)

Upload pandas DataFrames to Google BigQuery tables with flexible schema handling, multiple upload methods, and comprehensive data type support.
def to_gbq(
    dataframe: pd.DataFrame,
    destination_table: str,
    project_id: str = None,
    chunksize: int = None,
    reauth: bool = False,
    if_exists: str = "fail",
    auth_local_webserver: bool = True,
    table_schema: list[dict] = None,
    location: str = None,
    progress_bar: bool = True,
    credentials = None,
    api_method: str = "default",
    verbose: bool = None,
    private_key: str = None,
    auth_redirect_uri: str = None,
    client_id: str = None,
    client_secret: str = None,
    user_agent: str = None,
    rfc9110_delimiter: bool = False,
    bigquery_client = None
) -> None:
    """
    Write a DataFrame to a Google BigQuery table.

    Parameters:
    - dataframe: pandas DataFrame to upload
    - destination_table: Target table in format 'dataset.table' or 'project.dataset.table'
    - project_id: Google Cloud Platform project ID (optional if set in environment/context)
    - chunksize: Number of rows per upload chunk (None = upload all at once)
    - reauth: Force re-authentication
    - if_exists: Behavior when table exists ('fail', 'replace', 'append')
    - auth_local_webserver: Use local webserver flow instead of console flow
    - table_schema: Custom BigQuery schema specification as list of field dicts
    - location: Geographic location for table and load job
    - progress_bar: Show upload progress bar
    - credentials: Google auth credentials object
    - api_method: Upload method ('default', 'load_parquet', or 'load_csv')
    - verbose: Enable verbose logging (deprecated)
    - private_key: Service account private key (deprecated)
    - auth_redirect_uri: Custom OAuth redirect URI
    - client_id: OAuth client ID
    - client_secret: OAuth client secret
    - user_agent: Custom user agent string
    - rfc9110_delimiter: Use an RFC 9110 compliant delimiter in the user agent string
    - bigquery_client: Pre-configured BigQuery client instance

    Returns:
    None
    """

Usage examples:
import pandas as pd
df = pd.DataFrame({
    'id': [1, 2, 3],
    'name': ['Alice', 'Bob', 'Charlie'],
    'score': [95.5, 87.2, 92.1],
    'active': [True, False, True]
})
# Basic upload
to_gbq(df, "my_dataset.scores", project_id="my-project")
# Append to existing table
to_gbq(df, "my_dataset.scores", project_id="my-project", if_exists="append")
# Replace existing table
to_gbq(df, "my_dataset.scores", project_id="my-project", if_exists="replace")
# Custom schema specification
schema = [
    {"name": "id", "type": "INTEGER", "mode": "REQUIRED"},
    {"name": "name", "type": "STRING", "mode": "REQUIRED"},
    {"name": "score", "type": "FLOAT", "mode": "NULLABLE"},
    {"name": "active", "type": "BOOLEAN", "mode": "NULLABLE"}
]

to_gbq(
    df,
    "my_dataset.scores",
    project_id="my-project",
    table_schema=schema,
    if_exists="replace"
)
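# A sketch of selecting the upload path explicitly via api_method;
# per the parameter docs above, "load_parquet" and "load_csv" choose the
# load-job format ("load_parquet" assumes pyarrow is available).
to_gbq(
    df,
    "my_dataset.scores",
    project_id="my-project",
    api_method="load_csv",
    if_exists="append",
)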
# Chunked upload for large DataFrames
to_gbq(
    large_df,
    "my_dataset.big_table",
    project_id="my-project",
    chunksize=10000,
    if_exists="append"
)

Manage session-wide configuration for credentials, default project, and SQL dialect preferences to reduce parameter repetition across function calls.
class Context:
    """Storage for objects to be used throughout a session."""

    @property
    def credentials(self) -> google.auth.credentials.Credentials:
        """
        Credentials to use for Google APIs.

        Returns:
        google.auth.credentials.Credentials object or None
        """

    @credentials.setter
    def credentials(self, value: google.auth.credentials.Credentials) -> None:
        """Set credentials for Google APIs."""

    @property
    def project(self) -> str:
        """
        Default project to use for calls to Google APIs.

        Returns:
        str: Project ID or None
        """

    @project.setter
    def project(self, value: str) -> None:
        """Set default project ID."""

    @property
    def dialect(self) -> str:
        """
        Default SQL dialect to use in read_gbq.

        Returns:
        str: 'standard', 'legacy', or None
        """

    @dialect.setter
    def dialect(self, value: str) -> None:
        """Set default SQL dialect ('standard' or 'legacy')."""

# Global context instance
context: Context

Usage examples:
import pandas_gbq
from pandas_gbq import Context, read_gbq, to_gbq
from google.oauth2 import service_account

# Set credentials from service account file
credentials = service_account.Credentials.from_service_account_file(
    '/path/to/service-account-key.json'
)
pandas_gbq.context.credentials = credentials
# Set default project
pandas_gbq.context.project = "my-gcp-project"
# Set SQL dialect preference
pandas_gbq.context.dialect = "standard"
# Now function calls can omit these parameters
df = read_gbq("SELECT * FROM my_dataset.my_table")
to_gbq(df, "my_dataset.output_table", if_exists="replace")
# Create custom context instances
custom_context = Context()
custom_context.project = "different-project"

pandas-gbq defines custom exceptions for different error conditions:
# Core exceptions
class DatasetCreationError(ValueError):
    """Raised when dataset creation fails."""

class TableCreationError(ValueError):
    """Raised when table creation fails."""

    @property
    def message(self) -> str:
        """Error message."""

class InvalidSchema(ValueError):
    """Raised when DataFrame doesn't match BigQuery table schema."""

    @property
    def message(self) -> str:
        """Error message."""

class NotFoundException(ValueError):
    """Raised when project, table, or dataset not found."""

class GenericGBQException(ValueError):
    """Raised for unrecognized Google API errors."""

class AccessDenied(ValueError):
    """Raised for authentication/authorization failures."""

class ConversionError(GenericGBQException):
    """Raised for DataFrame conversion problems."""

# Query and data handling exceptions
class QueryTimeout(ValueError):
    """Raised when query exceeds timeout."""

class InvalidColumnOrder(ValueError):
    """Raised when column order doesn't match schema."""

class InvalidIndexColumn(ValueError):
    """Raised when index column doesn't match schema."""

class InvalidPageToken(ValueError):
    """Raised for BigQuery page token failures."""

class InvalidPrivateKeyFormat(ValueError):
    """Raised for invalid private key format."""

# Warning types
class LargeResultsWarning(UserWarning):
    """Warning for results beyond recommended DataFrame size."""

class PerformanceWarning(RuntimeWarning):
    """Warning for unsupported performance features."""

Common error handling patterns:
from pandas_gbq import read_gbq, to_gbq
from pandas_gbq.exceptions import (
    NotFoundException,
    AccessDenied,
    QueryTimeout,
    InvalidSchema,
    TableCreationError
)

try:
    df = read_gbq("SELECT * FROM nonexistent.table", project_id="my-project")
except NotFoundException as e:
    print(f"Table not found: {e}")
except AccessDenied as e:
    print(f"Access denied: {e}")
except QueryTimeout as e:
    print(f"Query timed out: {e}")

try:
    to_gbq(df, "dataset.table", project_id="my-project")
except InvalidSchema as e:
    print(f"Schema mismatch: {e.message}")
except TableCreationError as e:
    print(f"Failed to create table: {e.message}")

Key data types and interfaces used throughout the pandas-gbq API:
# Authentication types (from google-auth)
from google.auth.credentials import Credentials
from google.cloud.bigquery import Client as BigQueryClient
# pandas-gbq specific type hints
from typing import Optional, Union, List, Dict, Any
# Common type aliases used in function signatures
ProjectId = Optional[str]
TableId = str # Format: "dataset.table" or "project.dataset.table"
QueryString = str
ColumnList = Optional[List[str]]
SchemaList = Optional[List[Dict[str, Any]]]
ConfigDict = Optional[Dict[str, Any]]
DtypeDict = Optional[Dict[str, Any]]
# BigQuery schema field format
SchemaField = Dict[str, Any] # Example: {"name": "col1", "type": "STRING", "mode": "NULLABLE"}
# API method options
ApiMethod = str # "default", "load_parquet", or "load_csv"
IfExistsOption = str # "fail", "replace", or "append"
DialectOption = str # "standard" or "legacy"
ProgressBarType = str # "tqdm" or None

__version__: str # Current package version (0.29.2)

pandas-gbq supports multiple authentication methods:
from pandas_gbq import read_gbq

# Application Default Credentials (ADC, recommended for production)
df = read_gbq("SELECT 1", project_id="my-project") # Uses ADC automatically
# Service account file
from google.oauth2 import service_account
credentials = service_account.Credentials.from_service_account_file('key.json')
df = read_gbq("SELECT 1", project_id="my-project", credentials=credentials)
# User OAuth (interactive)
df = read_gbq("SELECT 1", project_id="my-project", reauth=True)
# Pre-configured client
from google.cloud import bigquery
client = bigquery.Client(project="my-project")
df = read_gbq("SELECT 1", bigquery_client=client)