CtrlK
Blog | Docs | Log in | Get started
Tessl Logo

tessl/pypi-tensorflow

An end-to-end open source platform for machine learning

Pending
Overview
Eval results
Files

docs/data.md

Data Processing

Dataset creation, transformation, and preprocessing pipeline operations for efficient data handling and training workflows. The tf.data API provides powerful tools for building scalable input pipelines.

Capabilities

Dataset Creation

Create datasets from various data sources.

class Dataset:
    """A potentially large set of elements.

    Each static method below is an alternate constructor that builds a
    Dataset from a different kind of source: in-memory tensors, a Python
    generator, a numeric range, or other datasets.
    """
    
    @staticmethod
    def from_tensor_slices(tensors, name=None):
        """
        Creates a Dataset whose elements are slices of the given tensors.
        
        Parameters:
        - tensors: A dataset element, whose components have the same first dimension
        - name: Optional name for the tf.data operation
        
        Returns:
        A Dataset
        """
    
    @staticmethod
    def from_tensors(tensors, name=None):
        """
        Creates a Dataset with a single element, comprising the given tensors.
        
        Parameters:
        - tensors: A dataset element
        - name: Optional name for the tf.data operation
        
        Returns:
        A Dataset
        """
    
    @staticmethod
    def from_generator(generator, output_signature, args=None):
        """
        Creates a Dataset whose elements are generated by generator.
        
        Parameters:
        - generator: A callable object that returns an object that supports the iter() protocol
        - output_signature: A nested structure of tf.TypeSpec objects corresponding to each component of an element yielded by generator
        - args: A tf.Tensor object or a tuple of tf.Tensor objects to pass as arguments to generator
        
        Returns:
        A Dataset
        """
    
    @staticmethod
    def range(*args, **kwargs):
        """
        Creates a Dataset of a step-separated range of values.
        
        Parameters:
        - *args: follows the same semantics as python's range (start, stop, step)
        - **kwargs: optional keyword arguments
        
        Returns:
        A RangeDataset
        """
    
    @staticmethod
    def zip(datasets):
        """
        Creates a Dataset by zipping together the given datasets.
        
        Parameters:
        - datasets: A nested structure of datasets
        
        Returns:
        A Dataset
        """

Dataset Transformation

Transform and manipulate dataset elements.

def map(self, map_func, num_parallel_calls=None, deterministic=None, name=None):
    """
    Maps map_func across the elements of this dataset.
    
    Parameters:
    - map_func: A function mapping a dataset element to another dataset element
    - num_parallel_calls: A tf.int32 scalar tf.Tensor, representing the number of elements to process asynchronously in parallel
    - deterministic: A boolean controlling whether the map is allowed to return elements out of order
    - name: Optional name for the tf.data operation
    
    Returns:
    A Dataset
    """

def filter(self, predicate, name=None):
    """
    Filters this dataset according to predicate.

    Only the elements for which `predicate` evaluates to True are kept.

    Parameters:
    - predicate: A function that maps an element of this dataset to a boolean
    - name: An optional name for the tf.data operation

    Returns:
    A Dataset containing the elements that satisfy `predicate`
    """

def flat_map(self, map_func, name=None):
    """
    Maps map_func across this dataset and flattens the result.

    `map_func` must produce a dataset for each input element; the resulting
    per-element datasets are flattened into a single Dataset.

    Parameters:
    - map_func: A function that maps an element of this dataset to a dataset
    - name: An optional name for the tf.data operation

    Returns:
    A Dataset
    """

def interleave(self, map_func, cycle_length=None, block_length=None,
               num_parallel_calls=None, deterministic=None, name=None):
    """
    Maps map_func across this dataset, and interleaves the results.

    Parameters:
    - map_func: A function that maps an element of this dataset to a dataset
    - cycle_length: How many input elements are processed concurrently
    - block_length: How many consecutive elements to produce from each input
      element before cycling on to the next input element
    - num_parallel_calls: The number of parallel calls for map_func
    - deterministic: A boolean controlling whether the interleave may yield
      elements out of order
    - name: An optional name for the tf.data operation

    Returns:
    A Dataset
    """

Dataset Batching and Sampling

Operations for batching and sampling data.

def batch(self, batch_size, drop_remainder=False, num_parallel_calls=None, 
          deterministic=None, name=None):
    """
    Combines consecutive elements of this dataset into batches.
    
    Parameters:
    - batch_size: A tf.int64 scalar tf.Tensor, representing the number of consecutive elements of this dataset to combine in a single batch
    - drop_remainder: A tf.bool scalar tf.Tensor, representing whether the last batch should be dropped in the case it has fewer than batch_size elements
    - num_parallel_calls: A tf.int32 scalar tf.Tensor, representing the number of elements to process in parallel
    - deterministic: A boolean controlling whether the batch is allowed to return elements out of order
    - name: Optional name for the tf.data operation
    
    Returns:
    A Dataset
    """

def padded_batch(self, batch_size, padded_shapes=None, padding_values=None,
                 drop_remainder=False, name=None):
    """
    Combines consecutive elements of this dataset into padded batches.

    Parameters:
    - batch_size: A tf.int64 scalar tf.Tensor giving how many consecutive
      elements of this dataset to combine into a single batch
    - padded_shapes: A nested structure of tf.TensorShape or tf.int64 vector
      tensor-like objects — the shape each component of an input element is
      padded to before batching
    - padding_values: A nested structure of scalar-shaped tf.Tensor holding
      the padding value used for each respective component
    - drop_remainder: A tf.bool scalar tf.Tensor — whether to drop the final
      batch when it has fewer than batch_size elements
    - name: An optional name for the tf.data operation

    Returns:
    A Dataset
    """

def unbatch(self, name=None):
    """
    Splits elements of a dataset into multiple elements on the batch dimension.

    Parameters:
    - name: An optional name for the tf.data operation

    Returns:
    A Dataset
    """

def shuffle(self, buffer_size, seed=None, reshuffle_each_iteration=None, name=None):
    """
    Randomly shuffles the elements of this dataset.

    Parameters:
    - buffer_size: A tf.int64 scalar tf.Tensor giving how many elements of
      this dataset the new dataset samples from
    - seed: An optional tf.int64 scalar tf.Tensor used as the random seed
      when creating the shuffling distribution
    - reshuffle_each_iteration: If true, the dataset is reshuffled on every
      iteration over it
    - name: An optional name for the tf.data operation

    Returns:
    A Dataset
    """

def repeat(self, count=None, name=None):
    """
    Repeats this dataset so each original value is seen count times.

    Parameters:
    - count: A tf.int64 scalar tf.Tensor giving how many times the dataset
      should be repeated
    - name: An optional name for the tf.data operation

    Returns:
    A Dataset
    """

def take(self, count, name=None):
    """
    Creates a Dataset with at most count elements from this dataset.

    Parameters:
    - count: A tf.int64 scalar tf.Tensor — the number of elements of this
      dataset taken to form the new dataset
    - name: An optional name for the tf.data operation

    Returns:
    A Dataset
    """

def skip(self, count, name=None):
    """
    Creates a Dataset that skips count elements from this dataset.

    Parameters:
    - count: A tf.int64 scalar tf.Tensor — the number of elements of this
      dataset skipped over to form the new dataset
    - name: An optional name for the tf.data operation

    Returns:
    A Dataset
    """

Performance Optimization

Operations for optimizing dataset performance.

def cache(self, filename="", name=None):
    """
    Caches the elements in this dataset.

    Parameters:
    - filename: A tf.string scalar tf.Tensor naming a directory on the
      filesystem used to cache elements of this Dataset
    - name: An optional name for the tf.data operation

    Returns:
    A Dataset
    """

def prefetch(self, buffer_size, name=None):
    """
    Creates a Dataset that prefetches elements from this dataset.

    Parameters:
    - buffer_size: A tf.int64 scalar tf.Tensor — the maximum number of
      elements buffered while prefetching
    - name: An optional name for the tf.data operation

    Returns:
    A Dataset
    """

def parallel_interleave(map_func, cycle_length, block_length=1,
                        sloppy=False, buffer_output_elements=None,
                        prefetch_input_elements=None):
    """
    A parallel version of the Dataset.interleave() transformation.

    Parameters:
    - map_func: A function that maps a nested structure of tensors to a Dataset
    - cycle_length: How many input elements are processed concurrently
    - block_length: How many consecutive elements to produce from each input
      element before cycling to another input element
    - sloppy: If false, the relative order of records produced by this
      transformation is deterministic
    - buffer_output_elements: How many elements each interleaved iterator
      should buffer
    - prefetch_input_elements: How many input elements to transform into
      iterators in parallel and keep buffered

    Returns:
    A Dataset transformation function
    """

Dataset Properties and Utilities

Utility methods for inspecting and manipulating datasets.

@property
def element_spec(self):
    """
    The type specification of an element of this dataset.
    
    Read-only property; useful for inspecting the component structure of
    dataset elements.

    Returns:
    A nested structure of tf.TypeSpec objects matching the structure of an element of this dataset
    """

def cardinality(self):
    """
    Returns the cardinality of the dataset, if known.

    Returns:
    A scalar tf.int64 Tensor holding the dataset's cardinality
    """

def enumerate(self, start=0, name=None):
    """
    Enumerates the elements of this dataset.

    Parameters:
    - start: A tf.int64 scalar tf.Tensor — the first value of the
      enumeration counter
    - name: An optional name for the tf.data operation

    Returns:
    A Dataset
    """

def concatenate(self, dataset):
    """
    Creates a Dataset by concatenating the given dataset with this dataset.

    Parameters:
    - dataset: The Dataset to append after this one

    Returns:
    A Dataset
    """

def reduce(self, initial_state, reduce_func, name=None):
    """
    Reduces the input dataset to a single element.

    Parameters:
    - initial_state: An element representing the initial state of the
      reduction
    - reduce_func: A function mapping (old_state, input_element) to
      new_state
    - name: An optional name for the tf.data operation

    Returns:
    A dataset element (the final reduction state)
    """

def apply(self, transformation_func):
    """
    Applies a transformation function to this dataset.

    Parameters:
    - transformation_func: A function taking one Dataset argument and
      returning a Dataset

    Returns:
    The Dataset produced by applying transformation_func to this dataset
    """

Usage Examples

import tensorflow as tf
import numpy as np

# --- Dataset creation from different sources ---
# From tensor slices: one element per index along the first dimension
data = np.array([1, 2, 3, 4, 5])
dataset = tf.data.Dataset.from_tensor_slices(data)

# From tensors: the whole list becomes one single element
single_element = tf.data.Dataset.from_tensors([1, 2, 3, 4, 5])

# From a Python generator; output_signature declares a scalar int32 element
def gen():
    """Yield the integers 0..99 one at a time."""
    for i in range(100):
        yield i

dataset_gen = tf.data.Dataset.from_generator(
    gen, 
    output_signature=tf.TensorSpec(shape=(), dtype=tf.int32)
)

# Range dataset: elements 0..9
range_dataset = tf.data.Dataset.range(10)

# --- Dataset transformations ---
# Element-wise map: square every value
squared_dataset = dataset.map(lambda x: x ** 2)

# Keep only the even values
even_dataset = range_dataset.filter(lambda x: x % 2 == 0)

# Group consecutive elements into batches of 3 (last batch may be smaller)
batched_dataset = range_dataset.batch(3)

# Shuffle with a 10-element buffer, then iterate the data twice
shuffled_dataset = range_dataset.shuffle(buffer_size=10).repeat(2)

# --- Complex pipeline: synthetic image-classification input pipeline ---
(train_images, train_labels) = np.random.random((1000, 28, 28, 1)), np.random.randint(0, 10, 1000)

train_dataset = tf.data.Dataset.from_tensor_slices((train_images, train_labels))
train_dataset = (train_dataset
                 .map(lambda x, y: (tf.cast(x, tf.float32) / 255.0, y))  # Normalize pixels to [0, 1]
                 .shuffle(buffer_size=100)
                 .batch(32)
                 .prefetch(tf.data.AUTOTUNE))

# --- Performance optimizations ---
# Cache dataset (in memory, since no filename is given)
cached_dataset = train_dataset.cache()

# Prefetch with a runtime-tuned buffer size
prefetched_dataset = train_dataset.prefetch(tf.data.AUTOTUNE)

# Parallel map: AUTOTUNE lets the runtime pick the parallelism level
parallel_mapped = range_dataset.map(
    lambda x: x * 2, 
    num_parallel_calls=tf.data.AUTOTUNE
)

# --- Text processing example ---
text_data = ["hello world", "tensorflow data", "machine learning"]
text_dataset = tf.data.Dataset.from_tensor_slices(text_data)

# flat_map flattens the per-sentence word datasets into one stream of words
word_dataset = text_dataset.flat_map(
    lambda x: tf.data.Dataset.from_tensor_slices(tf.strings.split(x))
)

# Eagerly iterate over the first five elements
for element in range_dataset.take(5):
    print(element.numpy())

# Materialize a small dataset as a Python list of NumPy values
dataset_list = list(range_dataset.take(5).as_numpy_iterator())

Install with Tessl CLI

npx tessl i tessl/pypi-tensorflow

docs

core.md

data.md

distribute.md

image.md

index.md

keras.md

math.md

nn.md

saved-model.md

tile.json