Google Drive Public File/Folder Downloader that bypasses security notices and provides recursive folder downloads
—
Quality: Pending — best-practices review has not yet been performed.
Impact: Pending — no eval scenarios have been run.
Extract compressed archives with support for multiple formats including ZIP, TAR, and compressed TAR variants.
Extracts various archive formats to specified directories with automatic format detection.
def extractall(path, to=None) -> List[str]:
    """
    Extract archive file with automatic format detection.

    Parameters:
    - path (str): Path to archive file to be extracted.
    - to (str, optional): Directory to extract files to.
      If None, extracts to parent directory of archive file.

    Returns:
        List[str]: List of extracted file paths.

    Raises:
        ValueError: When archive format is not supported or file doesn't exist.
    """
    # NOTE(review): `List` requires `from typing import List` — confirm that
    # import exists in the full file (or use `list[str]` on Python 3.9+).


import gdown
# Extract to same directory as archive
archive_path = "./data.zip"
extracted_files = gdown.extractall(archive_path)
print(f"Extracted {len(extracted_files)} files:")
for file_path in extracted_files:
    print(f" {file_path}")

# Extract to specific target directory
archive_path = "./dataset.tar.gz"
target_dir = "./extracted_data/"
extracted_files = gdown.extractall(archive_path, to=target_dir)
print(f"Extracted to {target_dir}: {len(extracted_files)} files")
# Download archive and extract in one workflow
import gdown

# Download compressed dataset
url = "https://drive.google.com/uc?id=ARCHIVE_FILE_ID"
archive_path = gdown.download(url, "dataset.zip")

# Extract the downloaded archive
extracted_files = gdown.extractall(archive_path, to="./dataset/")
print(f"Downloaded and extracted {len(extracted_files)} files")
# Use with cached_download for automated workflows
def download_and_extract_dataset(url, expected_hash):
    """Download, verify, and extract dataset archive.

    Parameters:
    - url (str): Google Drive (or direct) URL of the archive.
    - expected_hash (str): Expected content hash, e.g. "sha256:<hex>".

    Returns:
        List of extracted file paths.
    """
    # Download with integrity verification
    archive_path = gdown.cached_download(
        url,
        hash=expected_hash,
        path="./cache/dataset.tar.gz"
    )
    # Extract archive
    extracted_files = gdown.extractall(archive_path, to="./data/")
    # Process extracted files
    data_files = [f for f in extracted_files if f.endswith('.csv')]
    print(f"Found {len(data_files)} data files")
    return extracted_files
# Usage
files = download_and_extract_dataset(
    "https://drive.google.com/uc?id=FILE_ID",
    "sha256:expected_hash_value"
)
# Automatic extraction using cached_download postprocess
def auto_extract(filepath):
    """Automatically extract archive after download.

    Intended as a `postprocess` callback for gdown.cached_download;
    returns the list of extracted file paths.
    """
    print(f"Auto-extracting {filepath}")
    return gdown.extractall(filepath, to="./extracted/")
# Download and auto-extract
gdown.cached_download(
    url="https://example.com/data.tar.gz",
    hash="sha256:abc123...",
    postprocess=auto_extract
)
# Supported formats:
# .zip — ZIP file extraction
extracted = gdown.extractall("data.zip", to="./zip_contents/")
# .tar — TAR file extraction
extracted = gdown.extractall("archive.tar", to="./tar_contents/")
# .tar.gz, .tgz — GZIP-compressed TAR extraction
extracted = gdown.extractall("package.tar.gz", to="./source/")
extracted = gdown.extractall("backup.tgz", to="./backup/")
# .tar.bz2, .tbz — BZIP2-compressed TAR extraction
extracted = gdown.extractall("dataset.tar.bz2", to="./dataset/")
extracted = gdown.extractall("archive.tbz", to="./archive/")
# Archives are extracted preserving their internal directory structure.
Archive Contents:
data.zip
├── dataset/
│ ├── train/
│ │ ├── file1.txt
│ │ └── file2.txt
│ └── test/
│ └── file3.txt
└── README.md
After extraction to "./extracted/":
./extracted/
├── dataset/
│ ├── train/
│ │ ├── file1.txt
│ │ └── file2.txt
│ └── test/
│ └── file3.txt
└── README.md

# Default: extract to archive's parent directory
archive_path = "/home/user/downloads/data.zip"
files = gdown.extractall(archive_path)  # Extracts to /home/user/downloads/

# Custom: extract to specific directory
files = gdown.extractall(archive_path, to="/home/user/projects/data/")

import gdown
def safe_extract(archive_path, target_dir=None):
    """Safely extract archive with comprehensive error handling.

    Parameters:
    - archive_path (str): Path to the archive file.
    - target_dir (str, optional): Extraction directory; defaults to the
      archive's parent directory.

    Returns:
        List of extracted file paths on success, or None on any failure.
    """
    try:
        extracted_files = gdown.extractall(archive_path, to=target_dir)
        print(f"✅ Successfully extracted {len(extracted_files)} files")
        return extracted_files
    except ValueError as e:
        # gdown raises ValueError for unsupported formats among other
        # argument problems; distinguish by message text.
        if "no appropriate extractor" in str(e):
            print(f"❌ Unsupported archive format: {archive_path}")
            print("Supported formats: .zip, .tar, .tar.gz, .tgz, .tar.bz2, .tbz")
        else:
            print(f"❌ Extraction error: {e}")
        return None
    except FileNotFoundError:
        print(f"❌ Archive file not found: {archive_path}")
        return None
    except PermissionError:
        print(f"❌ Permission denied accessing: {archive_path}")
        return None
    except Exception as e:
        # Deliberate catch-all: this helper is a best-effort wrapper that
        # reports the problem and returns None instead of propagating.
        print(f"❌ Unexpected error during extraction: {e}")
        return None
# Usage
files = safe_extract("./dataset.tar.gz", "./data/")
if files:
    print("Extraction completed successfully")

import os
import gdown
def process_archive_directory(archive_dir, extract_base="./extracted/"):
    """Extract every supported archive found in a directory.

    Parameters:
    - archive_dir (str): Directory to scan for archive files.
    - extract_base (str): Base directory; each archive is extracted into a
      subdirectory named after the archive (extension stripped).
    """
    supported_extensions = ('.zip', '.tar', '.tar.gz', '.tgz', '.tar.bz2', '.tbz')
    processed = 0
    for filename in os.listdir(archive_dir):
        if filename.endswith(supported_extensions):
            archive_path = os.path.join(archive_dir, filename)
            # Create extraction directory based on filename
            extract_name = os.path.splitext(filename)[0]
            if extract_name.endswith('.tar'):  # Handle .tar.gz, .tar.bz2
                extract_name = os.path.splitext(extract_name)[0]
            extract_dir = os.path.join(extract_base, extract_name)
            try:
                files = gdown.extractall(archive_path, to=extract_dir)
                # Fixed: report the archive name (source had a garbled
                # "(unknown)" placeholder where {filename} belonged)
                print(f"✅ {filename}: {len(files)} files extracted")
                processed += 1
            except Exception as e:
                print(f"❌ {filename}: {e}")
    print(f"Processed {processed} archives")
# Usage
process_archive_directory("./downloads/", "./data/")


def validate_extraction(archive_path, expected_files=None):
    """Validate archive extraction results.

    Extracts to ./temp_extract/, optionally checks that all expected file
    names are present, and prints a summary of extracted file types.

    Parameters:
    - archive_path (str): Archive to extract.
    - expected_files (list[str], optional): Base names that must appear
      among the extracted files.

    Returns:
        List of extracted file paths, or None if extraction failed.
    """
    try:
        extracted_files = gdown.extractall(archive_path, to="./temp_extract/")
        print(f"Extraction completed: {len(extracted_files)} files")
        if expected_files:
            # Check if all expected files were extracted
            extracted_names = [os.path.basename(f) for f in extracted_files]
            missing = set(expected_files) - set(extracted_names)
            if missing:
                print(f"⚠️ Missing expected files: {missing}")
            else:
                print("✅ All expected files found")
        # Show file types (extension -> count)
        extensions = {}
        for file_path in extracted_files:
            ext = os.path.splitext(file_path)[1].lower()
            extensions[ext] = extensions.get(ext, 0) + 1
        print("File types found:")
        for ext, count in sorted(extensions.items()):
            print(f" {ext or '(no extension)'}: {count} files")
        return extracted_files
    except Exception as e:
        print(f"Extraction failed: {e}")
        return None
# Usage
validate_extraction(
    "dataset.zip",
    expected_files=["README.txt", "data.csv", "config.json"]
)

import shutil  # NOTE(review): shutil appears unused in this excerpt
import tempfile
def extract_temporarily(archive_path, process_func):
    """Extract archive to temporary directory and clean up after processing.

    Parameters:
    - archive_path (str): Archive to extract.
    - process_func (callable): Called as process_func(extracted_files, temp_dir).

    Returns:
        Whatever process_func returns, or None if extraction or processing
        failed.
    """
    with tempfile.TemporaryDirectory() as temp_dir:
        try:
            # Extract to temporary directory
            extracted_files = gdown.extractall(archive_path, to=temp_dir)
            print(f"Extracted {len(extracted_files)} files to temporary directory")
            # Process files
            result = process_func(extracted_files, temp_dir)
            return result
        except Exception as e:
            print(f"Processing failed: {e}")
            return None
        # Temporary directory automatically cleaned up on exiting the `with`
def process_extracted_files(file_list, base_dir):
    """Example processing function: report each CSV file in file_list.

    Parameters:
    - file_list (list[str]): Paths of extracted files.
    - base_dir (str): Extraction root (unused here; part of the callback API).

    Returns:
        list[str]: One "Processed <name>" message per CSV file.
    """
    csv_files = [f for f in file_list if f.endswith('.csv')]
    print(f"Found {len(csv_files)} CSV files for processing")
    # Process CSV files here (comprehension replaces the manual append loop)
    results = [f"Processed {os.path.basename(csv_file)}" for csv_file in csv_files]
    return results
# Usage: extract data.tar.gz into a temp dir and collect per-CSV messages
results = extract_temporarily("data.tar.gz", process_extracted_files)
print("Processing results:", results)


def stream_process_archive(archive_path):
    """Process large archives without keeping all files in memory.

    Returns:
        int: Number of files extracted.
    """
    # Extract files
    extracted_files = gdown.extractall(archive_path, to="./processing/")
    # Process files one at a time to manage memory
    for file_path in extracted_files:
        if file_path.endswith('.csv'):
            # Process individual CSV file
            print(f"Processing {file_path}")
            # ... process file ...
            # Optionally remove processed file to save disk space
            # os.remove(file_path)
    return len(extracted_files)


def complete_dataset_workflow(drive_url, expected_hash):
    """Complete workflow: download, verify, extract, and process.

    Parameters:
    - drive_url (str): Google Drive URL of the dataset archive.
    - expected_hash (str): Expected content hash, e.g. "sha256:<hex>".

    Returns:
        dict: Extracted file paths grouped under 'images', 'data', 'docs'.
    """
    # Step 1: Download with verification
    archive_path = gdown.cached_download(
        drive_url,
        hash=expected_hash,
        path="./cache/dataset.tar.gz"
    )
    # Step 2: Extract archive
    extracted_files = gdown.extractall(archive_path, to="./data/")
    # Step 3: Organize extracted files
    organized = {
        'images': [f for f in extracted_files if f.endswith(('.jpg', '.png'))],
        'data': [f for f in extracted_files if f.endswith('.csv')],
        'docs': [f for f in extracted_files if f.endswith(('.txt', '.md'))]
    }
    print("Dataset organized:")
    for category, files in organized.items():
        print(f" {category}: {len(files)} files")
    return organized


# Usage
dataset = complete_dataset_workflow(
    "https://drive.google.com/uc?id=DATASET_ID",
    "sha256:expected_dataset_hash"
)
# Install with Tessl CLI
npx tessl i tessl/pypi-gdown