Scalable Python data science, API-compatible and lightning fast.
Xorbits is an open-source computing framework that enables seamless scaling of data science and machine learning workloads from single machines to distributed clusters. It provides a familiar Python API compatible with popular libraries such as pandas, NumPy, PyTorch, and XGBoost, allowing users to scale their existing workflows with minimal code changes.
Install from PyPI:

```bash
pip install xorbits
```

Import the top-level package:

```python
import xorbits
```

Common imports for specific functionality:

```python
import xorbits.pandas as pd
import xorbits.numpy as np
import xorbits.sklearn as sk
```

A quick-start example:

```python
import xorbits
import xorbits.pandas as pd
import xorbits.numpy as np
# Initialize Xorbits runtime
xorbits.init()
# Create distributed DataFrame (same API as pandas)
df = pd.DataFrame({
    'A': np.random.randn(10000),
    'B': np.random.randn(10000),
    'C': np.random.randn(10000)
})
# Perform operations (lazy evaluation)
result = df.groupby('A').agg({'B': 'mean', 'C': 'sum'})
# Execute computation
computed_result = xorbits.run(result)
print(computed_result)
# Shutdown when done
xorbits.shutdown()
```

Xorbits leverages a distributed computing architecture built on top of Mars:
- `DataRef` instances that contain references to underlying Mars entities
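A minimal sketch of how this lazy-evaluation flow surfaces in user code, assuming only the `DataRef` behavior described above and the `init`/`run`/`shutdown` functions shown elsewhere in this document:

```python
import xorbits
import xorbits.numpy as np

xorbits.init()

a = np.ones((1000, 1000))
total = (a + 1).sum()        # builds a deferred graph; returns a DataRef, not a number

print(xorbits.run(total))    # run() triggers execution and materializes the scalar

xorbits.shutdown()
```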
Core functions for initializing, managing, and shutting down Xorbits runtime environments, including local and distributed cluster configurations:

```python
from typing import Dict, List, Optional, Union
from .._mars.utils import no_default
def init(
    address: Optional[str] = None,
    init_local: bool = no_default,
    session_id: Optional[str] = None,
    timeout: Optional[float] = None,
    n_worker: int = 1,
    n_cpu: Union[int, str] = "auto",
    mem_bytes: Union[int, str] = "auto",
    cuda_devices: Union[List[int], List[List[int]], str] = "auto",
    web: Union[bool, str] = "auto",
    new: bool = True,
    storage_config: Optional[Dict] = None,
    **kwargs
) -> None: ...
def shutdown(**kw) -> None: ...
def run(obj, **kwargs): ...
```
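A hedged usage sketch of the runtime API above; the resource values are illustrative, and the cluster address is a placeholder:

```python
import xorbits
import xorbits.pandas as pd

# Local runtime: one worker, four CPUs; other resources stay "auto"
xorbits.init(n_worker=1, n_cpu=4)

df = pd.DataFrame({"x": list(range(10))})
total = df["x"].sum()
print(xorbits.run(total))   # run() materializes the deferred result

xorbits.shutdown()

# To attach to an existing cluster instead, pass its endpoint
# (placeholder address):
# xorbits.init(address="http://<supervisor_host>:<port>")
```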
Configuration is managed through an options system, providing control over execution behavior and runtime settings:

```python
# Configuration objects and functions
options: object
def option_context(*args, **kwargs): ...
```
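A sketch of how this options system plausibly works, following the pandas-style `option_context(*args)` signature above; the option name `chunk_store_limit` is an assumption carried over from Mars, not a confirmed Xorbits option:

```python
from xorbits import options, option_context  # exported per the stubs above

# Global setting (option name is an assumption, not confirmed):
# options.chunk_store_limit = 512 * 1024 ** 2

# Scoped override, pandas-style (name, value) pairs:
with option_context("chunk_store_limit", 256 * 1024 ** 2):  # assumed option name
    pass  # operations here would see the temporary setting
```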
`xorbits.pandas` is a drop-in replacement for pandas with distributed computing capabilities, supporting DataFrames, Series, and the full pandas API:

```python
class DataFrame: ...
class Series: ...
class Index: ...
# Data types and constants
class Timedelta: ...
class DateOffset: ...
class Interval: ...
class Timestamp: ...
NaT: object
NA: object
```
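A drop-in usage sketch; `sales.csv` and its column names are placeholders:

```python
import xorbits
import xorbits.pandas as pd

xorbits.init()

# Identical calls to plain pandas; only the import line changed
df = pd.read_csv("sales.csv")                      # placeholder path
by_region = df.groupby("region")["amount"].sum()   # placeholder columns
print(xorbits.run(by_region))

xorbits.shutdown()
```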
`xorbits.numpy` provides distributed array computing with a NumPy-compatible API, supporting NumPy operations on large distributed arrays:

```python
class ndarray: ...
# NumPy constants and types
bool_: type
int8: type
int16: type
int32: type
int64: type
float16: type
float32: type
float64: type
complex64: type
complex128: type
dtype: type
pi: float
e: float
inf: float
nan: float
```
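A short sketch of the NumPy-compatible array API, using only common operations that mirror NumPy:

```python
import xorbits
import xorbits.numpy as np

xorbits.init()

x = np.random.rand(100000, 16)               # distributed ndarray
z = (x - x.mean(axis=0)) / x.std(axis=0)     # familiar broadcasting and reductions
print(xorbits.run(z.sum()))

xorbits.shutdown()
```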
Distributed machine learning is available through scikit-learn, XGBoost, and LightGBM integrations, enabling scalable model training and prediction:

```python
# Sklearn submodules
from xorbits.sklearn import cluster, datasets, decomposition, ensemble
from xorbits.sklearn import linear_model, metrics, model_selection, neighbors
from xorbits.sklearn import preprocessing, semi_supervised
# XGBoost and LightGBM classes dynamically exposed
```
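A hedged sketch of scalable training through the `cluster` submodule listed above; that it exposes a `KMeans` mirroring scikit-learn is an assumption:

```python
import xorbits
import xorbits.numpy as np
from xorbits.sklearn.cluster import KMeans  # assumed to mirror sklearn.cluster.KMeans

xorbits.init()

X = np.random.rand(100000, 8)    # distributed feature matrix
model = KMeans(n_clusters=5)
model.fit(X)                     # training runs on the distributed array
labels = model.predict(X)
print(xorbits.run(labels))

xorbits.shutdown()
```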
Large-scale dataset handling supports Hugging Face datasets and efficient data-loading patterns:

```python
class Dataset: ...
def from_huggingface(dataset_name: str, **kwargs): ...
```
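A sketch of `from_huggingface`; "imdb" is a well-known public dataset name used for illustration, and the `split` keyword is assumed to pass through `**kwargs`:

```python
import xorbits
from xorbits.datasets import from_huggingface  # assumed import path for the stub above

xorbits.init()

ds = from_huggingface("imdb", split="train")   # split= assumed to flow via **kwargs
print(ds)

xorbits.shutdown()
```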
Remote function execution capabilities for distributed computing workloads:

```python
def spawn(func, **kwargs): ...
```
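A sketch of remote execution; the `xorbits.remote` module path and the `args=` keyword are assumptions based on the stub's `**kwargs` pass-through:

```python
import xorbits
import xorbits.remote as xr   # assumed module path exposing spawn

xorbits.init()

def add(a, b):
    return a + b

ref = xr.spawn(add, args=(1, 2))   # returns a deferred reference, not 3
print(xorbits.run(ref))            # materializes the remote result

xorbits.shutdown()
```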
"""Base data container class."""
class DataRef:
    """Reference to distributed data object."""

class DataRefMeta:
    """Metaclass for DataRef."""
from enum import Enum

class DataType(Enum):
    """Enumeration of data types."""
    object_ = 1
    scalar = 2
    tensor = 3
    dataframe = 4
    series = 5
    index = 6
    categorical = 7
    dataframe_groupby = 8
    series_groupby = 9
    dataset = 10
```