CtrlK
Blog · Docs · Log in · Get started
Tessl Logo

tessl/pypi-gdown

Google Drive Public File/Folder Downloader that bypasses security notices and provides recursive folder downloads

Pending

Quality

Pending

Does it follow best practices?

Impact

Pending

No eval scenarios have been run

Overview
Eval results
Files

docs/archive-utilities.md

Archive Utilities

Extract compressed archives with support for multiple formats including ZIP, TAR, and compressed TAR variants.

Capabilities

Archive Extraction Function

Extracts various archive formats to specified directories with automatic format detection.

def extractall(path: str, to: "str | None" = None) -> List[str]:
    """
    Extract an archive file with automatic format detection.

    Parameters:
    - path (str): Path to the archive file to be extracted.
    - to (str, optional): Directory to extract files to.
                         If None, extracts to the parent directory of the archive file.

    Returns:
    List[str]: List of extracted file paths.

    Raises:
    ValueError: When the archive format is not supported or the file doesn't exist.
    """

Usage Examples

Basic Archive Extraction

import gdown

# Extract to same directory as archive
archive_path = "./data.zip"
extracted_files = gdown.extractall(archive_path)

print(f"Extracted {len(extracted_files)} files:")
for file_path in extracted_files:
    print(f"  {file_path}")

Extract to Specific Directory

# Extract to specific target directory
archive_path = "./dataset.tar.gz"
target_dir = "./extracted_data/"

extracted_files = gdown.extractall(archive_path, to=target_dir)
print(f"Extracted to {target_dir}: {len(extracted_files)} files")

Combined Download and Extract

# Download archive and extract in one workflow
import gdown

# Download compressed dataset
url = "https://drive.google.com/uc?id=ARCHIVE_FILE_ID"
archive_path = gdown.download(url, "dataset.zip")

# Extract the downloaded archive
extracted_files = gdown.extractall(archive_path, to="./dataset/")

print(f"Downloaded and extracted {len(extracted_files)} files")

Post-processing Integration

# Use with cached_download for automated workflows
def download_and_extract_dataset(url, expected_hash):
    """Fetch a dataset archive, verify its hash, and unpack it.

    Parameters:
    - url (str): Google Drive download URL for the archive.
    - expected_hash (str): Hash string (e.g. "sha256:...") that
      cached_download uses to verify the downloaded file.

    Returns:
    List[str]: Paths of every extracted file.
    """
    # cached_download skips the network when the cached copy already
    # matches expected_hash.
    archive_path = gdown.cached_download(
        url,
        hash=expected_hash,
        path="./cache/dataset.tar.gz",
    )

    # Unpack the verified archive.
    extracted_files = gdown.extractall(archive_path, to="./data/")

    # Report how many CSV data files came out of the archive.
    data_files = [p for p in extracted_files if p.endswith('.csv')]
    print(f"Found {len(data_files)} data files")

    return extracted_files

# Usage
files = download_and_extract_dataset(
    "https://drive.google.com/uc?id=FILE_ID",
    "sha256:expected_hash_value"
)

Integration with Post-processing Callback

# Automatic extraction using cached_download postprocess
def auto_extract(filepath):
    """Postprocess hook: unpack *filepath* right after it is downloaded.

    Returns:
    List[str]: Extracted file paths from gdown.extractall.
    """
    print(f"Auto-extracting {filepath}")
    extracted = gdown.extractall(filepath, to="./extracted/")
    return extracted

# Download and auto-extract
gdown.cached_download(
    url="https://example.com/data.tar.gz",
    hash="sha256:abc123...",
    postprocess=auto_extract
)

Supported Archive Formats

ZIP Archives

  • Extension: .zip
  • Description: Standard ZIP compression format
  • Usage: Most common for Windows and cross-platform archives
# ZIP file extraction
extracted = gdown.extractall("data.zip", to="./zip_contents/")

TAR Archives

  • Extension: .tar
  • Description: Uncompressed TAR (tape archive) format
  • Usage: Common on Unix/Linux systems for packaging files
# TAR file extraction
extracted = gdown.extractall("archive.tar", to="./tar_contents/")

Compressed TAR Archives

GZIP Compressed TAR

  • Extensions: .tar.gz, .tgz
  • Description: TAR archive compressed with GZIP
  • Usage: Very common for source code and Linux packages
# GZIP compressed TAR extraction
extracted = gdown.extractall("package.tar.gz", to="./source/")
extracted = gdown.extractall("backup.tgz", to="./backup/")

BZIP2 Compressed TAR

  • Extensions: .tar.bz2, .tbz
  • Description: TAR archive compressed with BZIP2 (better compression than GZIP)
  • Usage: Higher compression ratio, slower processing
# BZIP2 compressed TAR extraction
extracted = gdown.extractall("dataset.tar.bz2", to="./dataset/")
extracted = gdown.extractall("archive.tbz", to="./archive/")

Directory Structure Handling

Extraction Behavior

Archives are extracted preserving their internal directory structure:

Archive Contents:
data.zip
├── dataset/
│   ├── train/
│   │   ├── file1.txt
│   │   └── file2.txt
│   └── test/
│       └── file3.txt
└── README.md

After extraction to "./extracted/":
./extracted/
├── dataset/
│   ├── train/
│   │   ├── file1.txt
│   │   └── file2.txt
│   └── test/
│       └── file3.txt
└── README.md

Path Resolution

# Default: extract to archive's parent directory
archive_path = "/home/user/downloads/data.zip"
files = gdown.extractall(archive_path)  # Extracts to /home/user/downloads/

# Custom: extract to specific directory
files = gdown.extractall(archive_path, to="/home/user/projects/data/")

Error Handling

import gdown

def safe_extract(archive_path, target_dir=None):
    """Extract *archive_path*, reporting errors instead of raising them.

    Parameters:
    - archive_path (str): Archive file to extract.
    - target_dir (str, optional): Destination directory; None means the
      archive's own parent directory.

    Returns:
    List[str] | None: Extracted file paths, or None on any failure.
    """
    try:
        extracted_files = gdown.extractall(archive_path, to=target_dir)
    except FileNotFoundError:
        print(f"❌ Archive file not found: {archive_path}")
    except PermissionError:
        print(f"❌ Permission denied accessing: {archive_path}")
    except ValueError as e:
        # gdown raises ValueError for formats it has no extractor for;
        # distinguish that from other value errors by message.
        if "no appropriate extractor" in str(e):
            print(f"❌ Unsupported archive format: {archive_path}")
            print("Supported formats: .zip, .tar, .tar.gz, .tgz, .tar.bz2, .tbz")
        else:
            print(f"❌ Extraction error: {e}")
    except Exception as e:
        print(f"❌ Unexpected error during extraction: {e}")
    else:
        print(f"✅ Successfully extracted {len(extracted_files)} files")
        return extracted_files
    return None

# Usage
files = safe_extract("./dataset.tar.gz", "./data/")
if files:
    print("Extraction completed successfully")

Advanced Usage Patterns

Batch Archive Processing

import os
import gdown

def process_archive_directory(archive_dir, extract_base="./extracted/"):
    """Extract every supported archive found directly inside *archive_dir*.

    Parameters:
    - archive_dir (str): Directory scanned (non-recursively) for archives.
    - extract_base (str): Base directory; each archive is extracted into a
      subdirectory named after the archive file (extension stripped).
    """
    supported_extensions = ('.zip', '.tar', '.tar.gz', '.tgz', '.tar.bz2', '.tbz')
    processed = 0

    for filename in os.listdir(archive_dir):
        if not filename.endswith(supported_extensions):
            continue
        archive_path = os.path.join(archive_dir, filename)

        # Name the extraction directory after the archive, dropping the
        # extension (twice for two-part suffixes like .tar.gz / .tar.bz2).
        extract_name = os.path.splitext(filename)[0]
        if extract_name.endswith('.tar'):
            extract_name = os.path.splitext(extract_name)[0]

        extract_dir = os.path.join(extract_base, extract_name)

        try:
            files = gdown.extractall(archive_path, to=extract_dir)
            # Fix: report the archive's name (the original printed a
            # literal "(unknown)" placeholder instead of the filename).
            print(f"✅ {filename}: {len(files)} files extracted")
            processed += 1
        except Exception as e:
            print(f"❌ {filename}: {e}")

    print(f"Processed {processed} archives")

# Usage
process_archive_directory("./downloads/", "./data/")

Archive Validation

def validate_extraction(archive_path, expected_files=None):
    """Extract *archive_path* to ./temp_extract/ and sanity-check the result.

    Parameters:
    - archive_path (str): Archive to extract.
    - expected_files (list[str], optional): Basenames that must be present
      among the extracted files.

    Returns:
    List[str] | None: Extracted paths, or None if anything failed.
    """
    try:
        extracted_files = gdown.extractall(archive_path, to="./temp_extract/")

        print(f"Extraction completed: {len(extracted_files)} files")

        if expected_files:
            # Compare by basename only — extraction preserves directories.
            extracted_names = {os.path.basename(p) for p in extracted_files}
            missing = set(expected_files) - extracted_names
            if missing:
                print(f"⚠️  Missing expected files: {missing}")
            else:
                print("✅ All expected files found")

        # Tally extensions for a quick overview of the archive contents.
        extensions = {}
        for file_path in extracted_files:
            ext = os.path.splitext(file_path)[1].lower()
            extensions[ext] = extensions.get(ext, 0) + 1

        print("File types found:")
        for ext, count in sorted(extensions.items()):
            print(f"  {ext or '(no extension)'}: {count} files")

        return extracted_files

    except Exception as e:
        print(f"Extraction failed: {e}")
        return None

# Usage
validate_extraction(
    "dataset.zip",
    expected_files=["README.txt", "data.csv", "config.json"]
)

Cleanup and Management

import shutil
import tempfile

def extract_temporarily(archive_path, process_func):
    """Unpack *archive_path* into a throwaway directory and process it.

    Parameters:
    - archive_path (str): Archive to extract.
    - process_func (callable): Invoked as process_func(extracted_files, temp_dir).

    Returns:
    Whatever *process_func* returns, or None if extraction or processing
    failed. The temporary directory is removed on exit either way.
    """
    with tempfile.TemporaryDirectory() as temp_dir:
        try:
            extracted_files = gdown.extractall(archive_path, to=temp_dir)
            print(f"Extracted {len(extracted_files)} files to temporary directory")
            return process_func(extracted_files, temp_dir)
        except Exception as e:
            print(f"Processing failed: {e}")
            return None
    # TemporaryDirectory context removes temp_dir automatically

def process_extracted_files(file_list, base_dir):
    """Example processing hook: report and "process" every CSV in *file_list*.

    Parameters:
    - file_list (list[str]): Paths of extracted files.
    - base_dir (str): Extraction root (unused here; kept for the hook contract).

    Returns:
    list[str]: One "Processed <basename>" entry per CSV file, in input order.
    """
    csv_files = [path for path in file_list if path.endswith('.csv')]
    print(f"Found {len(csv_files)} CSV files for processing")
    return [f"Processed {os.path.basename(csv_file)}" for csv_file in csv_files]

# Usage
results = extract_temporarily("data.tar.gz", process_extracted_files)
print("Processing results:", results)

Best Practices

Memory Efficient Processing

def stream_process_archive(archive_path):
    """Extract an archive and handle its CSV files one at a time.

    Visiting files individually keeps peak memory low for large archives.

    Returns:
    int: Total number of files extracted (CSV or otherwise).
    """
    extracted_files = gdown.extractall(archive_path, to="./processing/")

    for file_path in extracted_files:
        if not file_path.endswith('.csv'):
            continue
        # Process individual CSV file
        print(f"Processing {file_path}")
        # ... process file ...

        # Optionally remove processed file to save disk space
        # os.remove(file_path)

    return len(extracted_files)

Integration with Download Workflows

def complete_dataset_workflow(drive_url, expected_hash):
    """End-to-end pipeline: download, verify, extract, and categorize a dataset.

    Parameters:
    - drive_url (str): Google Drive URL of the dataset archive.
    - expected_hash (str): Hash used to verify the cached download.

    Returns:
    dict: Extracted paths grouped under 'images', 'data', and 'docs'.
    """
    # Step 1: download with integrity verification (cached across runs).
    archive_path = gdown.cached_download(
        drive_url,
        hash=expected_hash,
        path="./cache/dataset.tar.gz",
    )

    # Step 2: unpack everything under ./data/.
    extracted_files = gdown.extractall(archive_path, to="./data/")

    # Step 3: bucket files by extension via a category -> suffixes table.
    buckets = {
        'images': ('.jpg', '.png'),
        'data': ('.csv',),
        'docs': ('.txt', '.md'),
    }
    organized = {
        name: [f for f in extracted_files if f.endswith(suffixes)]
        for name, suffixes in buckets.items()
    }

    print("Dataset organized:")
    for category, files in organized.items():
        print(f"  {category}: {len(files)} files")

    return organized

# Usage
dataset = complete_dataset_workflow(
    "https://drive.google.com/uc?id=DATASET_ID",
    "sha256:expected_dataset_hash"
)

Install with Tessl CLI

npx tessl i tessl/pypi-gdown

docs

archive-utilities.md

caching-integrity.md

file-downloads.md

folder-operations.md

index.md

tile.json