Data processing expert - ETL, transformation, visualization
55
Quality
39%
Does it follow best practices?
Impact
Pending
No eval scenarios have been run
Passed
No known issues
Optimize this skill with Tessl
npx tessl skill review --optimize ./skills/data-wizard/SKILL.md

You are Data Wizard, the data processing and transformation specialist.
import pandas as pd
from datetime import datetime
class DataPipeline:
    """Simple ETL pipeline: extract from a source, transform, load to a destination."""

    def extract(self, source: str) -> pd.DataFrame:
        """Extract data from *source*.

        Supports ``.csv`` files, ``.json`` files, and ``postgres://`` URLs.
        Raises ValueError for anything else — the original silently returned
        None, which only crashed later inside transform().
        """
        if source.endswith('.csv'):
            return pd.read_csv(source)
        if source.endswith('.json'):
            return pd.read_json(source)
        if source.startswith('postgres://'):
            # NOTE(review): table name is hard-coded, and passing a URL string
            # as the connection requires SQLAlchemy — confirm against deployment.
            return pd.read_sql_query("SELECT * FROM table", source)
        raise ValueError(f"Unsupported source: {source!r}")

    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """Transform and clean data; returns a new frame, never mutating the input.

        Steps: de-duplicate, median-impute missing ages, parse timestamps,
        bucket ages into groups, then validate the age range.
        Raises ValueError when any age falls outside [0, 120].
        """
        # drop_duplicates returns a new frame; .copy() also detaches us from
        # the caller's data so the assignments below never touch their frame.
        df = df.drop_duplicates().copy()
        # fillna(..., inplace=True) on a column selection is deprecated
        # chained assignment (may silently operate on a copy); assign back.
        df['age'] = df['age'].fillna(df['age'].median())
        df['created_at'] = pd.to_datetime(df['created_at'])
        df['age_group'] = pd.cut(df['age'],
                                 bins=[0, 18, 35, 50, 100],
                                 labels=['teen', 'young', 'middle', 'senior'])
        # Explicit raise instead of assert: asserts are stripped under -O.
        if not df['age'].between(0, 120).all():
            raise ValueError("Invalid age values")
        return df

    def load(self, df: pd.DataFrame, destination: str) -> None:
        """Load *df* to *destination* (.csv, .parquet, or postgres:// URL).

        Raises ValueError for unrecognized destinations — the original
        silently did nothing, losing the data.
        """
        if destination.endswith('.csv'):
            df.to_csv(destination, index=False)
        elif destination.endswith('.parquet'):
            df.to_parquet(destination, compression='gzip')
        elif destination.startswith('postgres://'):
            df.to_sql('table_name', destination, if_exists='replace')
        else:
            raise ValueError(f"Unsupported destination: {destination!r}")
# Usage
pipeline = DataPipeline()
df = pipeline.extract('raw_data.csv')
df = pipeline.transform(df)
pipeline.load(df, 'processed_data.parquet')


def validate_data(df: pd.DataFrame) -> None:
    """Comprehensive data quality checks.

    Prints a warning line per failed check (completeness, uniqueness,
    validity, consistency); returns nothing.
    """
    if df.empty:
        # Guard: len(df) == 0 would make the completeness ratio below a
        # divide-by-zero (NaN under numpy semantics, masking real issues).
        print("Warning: empty DataFrame")
        return
    # Completeness: flag any column with more than 10% missing values.
    missing_pct = df.isnull().sum() / len(df) * 100
    if missing_pct.max() > 10:
        print(f"Warning: {missing_pct.idxmax()} has {missing_pct.max():.1f}% missing")
    # Uniqueness
    duplicate_count = df.duplicated().sum()
    if duplicate_count > 0:
        print(f"Warning: {duplicate_count} duplicate rows")
    # Validity: na=False keeps the mask boolean when an email is NaN —
    # ~NaN is not a valid boolean and breaks .any() on modern pandas.
    # A missing email therefore counts as invalid.
    if 'email' in df.columns:
        invalid_emails = ~df['email'].str.contains('@', na=False)
        if invalid_emails.any():
            print(f"Warning: {invalid_emails.sum()} invalid emails")
    # Consistency: between() is False for NaN, so missing ages count invalid.
    if 'age' in df.columns:
        invalid_ages = ~df['age'].between(0, 120)
        if invalid_ages.any():
            print(f"Warning: {invalid_ages.sum()} invalid ages")


import matplotlib.pyplot as plt
import seaborn as sns
def create_dashboard(df: pd.DataFrame) -> None:
    """Generate a 2x2 data dashboard and save it to .oma/data/dashboard.png.

    Expects columns: 'age' (numeric), 'date', 'value' (numeric), 'category'.
    """
    import os  # local import: keeps the file's top-level imports untouched

    fig, axes = plt.subplots(2, 2, figsize=(15, 10))
    # Distribution
    df['age'].hist(bins=30, ax=axes[0, 0])
    axes[0, 0].set_title('Age Distribution')
    # Correlation heatmap — numeric_only avoids the pandas>=2 TypeError when
    # the frame contains non-numeric columns (this function expects 'category').
    sns.heatmap(df.corr(numeric_only=True), annot=True, ax=axes[0, 1])
    axes[0, 1].set_title('Feature Correlations')
    # Time series
    df.groupby('date')['value'].sum().plot(ax=axes[1, 0])
    axes[1, 0].set_title('Value Over Time')
    # Category counts
    df['category'].value_counts().plot(kind='bar', ax=axes[1, 1])
    axes[1, 1].set_title('Categories')
    plt.tight_layout()
    # Create the output directory first (savefig does not), then release the
    # figure so repeated calls don't accumulate open matplotlib figures.
    os.makedirs('.oma/data', exist_ok=True)
    plt.savefig('.oma/data/dashboard.png', dpi=300)
    plt.close(fig)


"Data is the new oil, but only if refined."
fab464f
If you maintain this skill, you can claim it as your own. Once claimed, you can manage eval scenarios, bundle related skills, attach documentation or rules, and ensure cross-agent compatibility.