agent-benchmark-suite

Agent skill for benchmark-suite - invoke with $agent-benchmark-suite

2.17x

Quality

Does it follow best practices?

Impact

89%

2.17x

Average score across 3 eval scenarios

Security

Passed

No known issues

Optimize this skill with Tessl

npx tessl skill review --optimize ./.agents/skills/agent-benchmark-suite/SKILL.md

name: Benchmark Suite
type: agent
category: optimization
description: Comprehensive performance benchmarking, regression detection and performance validation

Benchmark Suite Agent

Agent Profile

Name: Benchmark Suite
Type: Performance Optimization Agent
Specialization: Comprehensive performance benchmarking and testing
Performance Focus: Automated benchmarking, regression detection, and performance validation

Core Capabilities

1. Comprehensive Benchmarking Framework

/**
 * Runs a fixed registry of named benchmarks and turns the raw results into a
 * report.
 *
 * One run consists of: warm-up, benchmark execution (sequential or parallel),
 * summary generation, optional baseline comparison, recommendation
 * generation, and cool-down.
 *
 * NOTE(review): `warmup`, `cooldown`, `sleep`, `executeBenchmark`,
 * `generateSummary`, `compareWithBaseline` and `generateRecommendations` are
 * called on `this` but are not defined in this class body — presumably they
 * are supplied elsewhere; confirm before using this class as-is.
 */
class ComprehensiveBenchmarkSuite {
  constructor() {
    // Registry of benchmark name -> benchmark instance. Every entry is run by
    // runBenchmarksParallel / runBenchmarksSequential, in insertion order.
    this.benchmarks = {
      // Core performance benchmarks
      throughput: new ThroughputBenchmark(),
      latency: new LatencyBenchmark(),
      scalability: new ScalabilityBenchmark(),
      resource_usage: new ResourceUsageBenchmark(),
      
      // Swarm-specific benchmarks
      coordination: new CoordinationBenchmark(),
      load_balancing: new LoadBalancingBenchmark(),
      topology: new TopologyBenchmark(),
      fault_tolerance: new FaultToleranceBenchmark(),
      
      // Custom benchmarks
      custom: new CustomBenchmarkManager()
    };
    
    this.reporter = new BenchmarkReporter();
    this.comparator = new PerformanceComparator();
    this.analyzer = new BenchmarkAnalyzer();
  }
  
  /**
   * Execute the whole benchmark suite.
   *
   * @param {object} [config] - Optional overrides.
   * @param {number} [config.duration=300000] - Passed through to each benchmark (5 minutes by default).
   * @param {number} [config.iterations=10] - Passed through to each benchmark.
   * @param {number} [config.warmupTime=30000] - Value handed to `warmup`; `0` is honoured.
   * @param {number} [config.cooldownTime=10000] - Value handed to `cooldown`; `0` is honoured.
   * @param {boolean} [config.parallel=false] - Run all benchmarks concurrently instead of one by one.
   * @param {*} [config.baseline=null] - When truthy, results are compared against it.
   * @returns {Promise<{summary: *, detailed: Map<string, *>, baseline_comparison: *, recommendations: *}>}
   */
  async runBenchmarkSuite(config = {}) {
    const suiteConfig = {
      // A zero duration or iteration count is not a meaningful request, so any
      // falsy value falls back to the default.
      duration: config.duration || 300000, // 5 minutes default
      iterations: config.iterations || 10,
      // 0 is a legitimate way to skip the phase, so only null/undefined fall
      // back to the default here (`||` used to turn 0 into the default).
      warmupTime: config.warmupTime ?? 30000, // 30 seconds
      cooldownTime: config.cooldownTime ?? 10000, // 10 seconds
      parallel: config.parallel || false,
      baseline: config.baseline || null
    };
    
    const results = {
      summary: {},
      detailed: new Map(),
      baseline_comparison: null,
      recommendations: []
    };
    
    // Warmup phase
    await this.warmup(suiteConfig.warmupTime);
    
    // Execute benchmarks
    if (suiteConfig.parallel) {
      results.detailed = await this.runBenchmarksParallel(suiteConfig);
    } else {
      results.detailed = await this.runBenchmarksSequential(suiteConfig);
    }
    
    // Generate summary
    results.summary = this.generateSummary(results.detailed);
    
    // Compare with baseline if provided
    if (suiteConfig.baseline) {
      results.baseline_comparison = await this.compareWithBaseline(
        results.detailed, 
        suiteConfig.baseline
      );
    }
    
    // Recommendations see the full result object, including the comparison.
    results.recommendations = await this.generateRecommendations(results);
    
    // Cooldown phase
    await this.cooldown(suiteConfig.cooldownTime);
    
    return results;
  }
  
  /**
   * Run every registered benchmark concurrently.
   *
   * Uses `Promise.all`, so the first rejected benchmark rejects the whole run.
   *
   * @param {object} config - Suite configuration forwarded to `executeBenchmark`.
   * @returns {Promise<Map<string, *>>} Benchmark name -> result, in registry order.
   */
  async runBenchmarksParallel(config) {
    const benchmarkPromises = Object.entries(this.benchmarks).map(
      async ([name, benchmark]) => {
        const result = await this.executeBenchmark(benchmark, name, config);
        return [name, result];
      }
    );
    
    const results = await Promise.all(benchmarkPromises);
    return new Map(results);
  }
  
  /**
   * Run every registered benchmark one after another, pausing between
   * consecutive benchmarks.
   *
   * @param {object} config - Suite configuration forwarded to `executeBenchmark`.
   * @returns {Promise<Map<string, *>>} Benchmark name -> result, in registry order.
   */
  async runBenchmarksSequential(config) {
    const results = new Map();
    let isFirst = true;
    
    for (const [name, benchmark] of Object.entries(this.benchmarks)) {
      // Brief pause between benchmarks only; no pause before the first or
      // after the last one.
      if (!isFirst) {
        await this.sleep(1000);
      }
      isFirst = false;
      
      const result = await this.executeBenchmark(benchmark, name, config);
      results.set(name, result);
    }
    
    return results;
  }
}

2. Performance Regression Detection

/**
 * Combines several regression-detection strategies into a single report.
 *
 * NOTE(review): `calculateAggregateConfidence`, `calculateSeverity`,
 * `calculateCUSUM`, `detectChangePoints`, `trainAnomalyModel` and
 * `calculateDynamicThreshold` are invoked on `this` but not defined in this
 * class body — presumably provided elsewhere; confirm.
 */
class RegressionDetector {
  constructor() {
    // Strategy name -> detector; each detector exposes `detect(...)`.
    this.detectors = {
      statistical: new StatisticalRegressionDetector(),
      machine_learning: new MLRegressionDetector(),
      threshold: new ThresholdRegressionDetector(),
      trend: new TrendRegressionDetector()
    };

    this.analyzer = new RegressionAnalyzer();
    this.alerting = new RegressionAlerting();
  }

  /**
   * Run every registered detector concurrently and merge the positive findings.
   *
   * @param {*} currentResults - Forwarded unchanged to each detector.
   * @param {*} historicalData - Forwarded unchanged to each detector.
   * @param {object} [config] - Forwarded unchanged to each detector.
   * @returns {Promise<{detected: object[], severity: *, confidence: *, analysis: *}>}
   *   `detected` holds one entry per detector that reported a regression, tagged
   *   with the detector's `method` name. With no findings the defaults
   *   (`'none'`, `0`, `{}`) are returned.
   */
  async detectRegressions(currentResults, historicalData, config = {}) {
    const runDetector = async ([method, detector]) => {
      const detection = await detector.detect(currentResults, historicalData, config);
      return [method, detection];
    };
    const outcomes = await Promise.all(
      Object.entries(this.detectors).map((entry) => runDetector(entry))
    );

    // Keep only positive detections, labelled with the strategy that found them.
    const detected = outcomes.flatMap(([method, detection]) =>
      detection.regression_detected ? [{ method, ...detection }] : []
    );

    const report = {
      detected,
      severity: 'none',
      confidence: 0,
      analysis: {}
    };

    if (detected.length === 0) {
      return report;
    }

    // Aggregate figures are only computed when something was found.
    report.confidence = this.calculateAggregateConfidence(detected);
    report.severity = this.calculateSeverity(detected);
    report.analysis = await this.analyzer.analyze(detected);
    return report;
  }

  /**
   * Change-point based detection: builds a CUSUM structure via `calculateCUSUM`
   * and asks `detectChangePoints` for the change points at the given sensitivity.
   *
   * @param {*} metric - Forwarded to `calculateCUSUM`.
   * @param {*} historicalData - Forwarded to `calculateCUSUM`.
   * @param {number} [sensitivity=0.95] - Forwarded to `detectChangePoints` and echoed in the result.
   * @returns {Promise<{regression_detected: boolean, change_points: object[], cusum_statistics: *, sensitivity: number}>}
   */
  async detectStatisticalRegression(metric, historicalData, sensitivity = 0.95) {
    const cusum = this.calculateCUSUM(metric, historicalData);
    const changePoints = this.detectChangePoints(cusum, sensitivity);

    // Project each change point onto the five reported fields.
    const summaries = changePoints.map(
      ({ timestamp, magnitude, direction, significance, confidence }) => ({
        timestamp,
        magnitude,
        direction,
        significance,
        confidence
      })
    );

    return {
      regression_detected: changePoints.length > 0,
      change_points: summaries,
      cusum_statistics: cusum.statistics,
      sensitivity
    };
  }

  /**
   * Model based detection: trains a model on the history, scores the current
   * metrics, and flags every score whose `anomaly` exceeds a threshold derived
   * from the scores themselves.
   *
   * @param {*} metrics - Passed to the trained model's `predict`.
   * @param {*} historicalData - Passed to `trainAnomalyModel`.
   * @returns {Promise<{regression_detected: boolean, anomaly_scores: *, threshold: *, regressions: *, model_confidence: *}>}
   */
  async detectMLRegression(metrics, historicalData) {
    const model = await this.trainAnomalyModel(historicalData);
    const anomalyScores = await model.predict(metrics);
    const threshold = this.calculateDynamicThreshold(anomalyScores);
    const flagged = anomalyScores.filter((score) => score.anomaly > threshold);

    return {
      regression_detected: flagged.length > 0,
      anomaly_scores: anomalyScores,
      threshold,
      regressions: flagged,
      model_confidence: model.confidence
    };
  }
}

3. Automated Performance Testing

/**
 * Drives automated performance test campaigns plus individual load and stress
 * tests.
 *
 * NOTE(review): `generateCampaignId`, `executeScheduledTest`,
 * `generateCampaignSummary`, `executeRampUp`, `executeSustainedLoad`,
 * `executeRampDown`, `analyzeLoadTestResults`, `applyLoad`,
 * `isSystemBreaking`, `identifyBreakingReason` and `analyzeStressTestResults`
 * are called on `this` but not defined in this class body — presumably
 * provided elsewhere; confirm.
 */
class AutomatedPerformanceTester {
  constructor() {
    this.testSuites = {
      load: new LoadTestSuite(),
      stress: new StressTestSuite(),
      volume: new VolumeTestSuite(),
      endurance: new EnduranceTestSuite(),
      spike: new SpikeTestSuite(),
      configuration: new ConfigurationTestSuite()
    };
    
    this.scheduler = new TestScheduler();
    this.orchestrator = new TestOrchestrator();
    this.validator = new ResultValidator();
  }
  
  /**
   * Execute a campaign: schedule the configured tests, run them one at a time
   * and validate each result as soon as it is available.
   *
   * The campaign stops at the first result that fails validation; the summary
   * then records `status: 'failed'`, the validator's reason and the name of
   * the failing test. Otherwise the summary comes from
   * `generateCampaignSummary`.
   *
   * @param {object} config - Campaign configuration.
   * @param {*} config.tests - Passed to the scheduler.
   * @param {*} config.constraints - Passed to the scheduler.
   * @returns {Promise<object>} The campaign record, including `tests`,
   *   `results` (Map keyed by scheduled test id), `summary`, `startTime`,
   *   `endTime` and `duration` (endTime - startTime).
   */
  async runTestCampaign(config) {
    const campaign = {
      id: this.generateCampaignId(),
      config,
      startTime: Date.now(),
      tests: [],
      results: new Map(),
      summary: null
    };
    
    // Schedule test execution
    const schedule = await this.scheduler.schedule(config.tests, config.constraints);
    
    // Execute tests according to schedule
    for (const scheduledTest of schedule) {
      const testResult = await this.executeScheduledTest(scheduledTest);
      campaign.tests.push(scheduledTest);
      campaign.results.set(scheduledTest.id, testResult);
      
      // Validate immediately so a bad result aborts the remaining tests.
      const validation = await this.validator.validate(testResult);
      if (!validation.valid) {
        campaign.summary = {
          status: 'failed',
          reason: validation.reason,
          failedAt: scheduledTest.name
        };
        break;
      }
    }
    
    // Only build the regular summary when no failure summary was recorded.
    if (!campaign.summary) {
      campaign.summary = await this.generateCampaignSummary(campaign);
    }
    
    campaign.endTime = Date.now();
    campaign.duration = campaign.endTime - campaign.startTime;
    
    return campaign;
  }
  
  /**
   * Run a three-phase load test: ramp-up, sustained load, ramp-down.
   *
   * @param {object} config - Load test configuration.
   * @param {*} config.rampUp - Passed to `executeRampUp`.
   * @param {*} config.sustained - Passed to `executeSustainedLoad`.
   * @param {*} config.rampDown - Passed to `executeRampDown`.
   * @returns {Promise<object>} The load test record; `phases` lists the three
   *   phase results in execution order and `results` holds the analysis.
   */
  async executeLoadTest(config) {
    const loadTest = {
      type: 'load',
      config,
      phases: [],
      metrics: new Map(),
      results: {}
    };
    
    // Ramp-up phase
    const rampUpResult = await this.executeRampUp(config.rampUp);
    loadTest.phases.push({ phase: 'ramp-up', result: rampUpResult });
    
    // Sustained load phase
    const sustainedResult = await this.executeSustainedLoad(config.sustained);
    loadTest.phases.push({ phase: 'sustained', result: sustainedResult });
    
    // Ramp-down phase
    const rampDownResult = await this.executeRampDown(config.rampDown);
    loadTest.phases.push({ phase: 'ramp-down', result: rampDownResult });
    
    // Analyze results
    loadTest.results = await this.analyzeLoadTestResults(loadTest.phases);
    
    return loadTest;
  }
  
  /**
   * Increase the load step by step until the system breaks or `maxLoad` is
   * exceeded, recording one degradation-curve point per step.
   *
   * @param {object} config - Stress test configuration.
   * @param {number} config.startLoad - Load applied in the first step.
   * @param {number} config.maxLoad - Inclusive upper bound for the applied load.
   * @param {number} config.loadIncrement - Added to the load after every step; must be greater than 0.
   * @param {*} config.duration - Passed to `applyLoad` for every step.
   * @param {*} config.breakingCriteria - Passed to `isSystemBreaking`.
   * @returns {Promise<object>} The stress test record; `breakingPoint` stays
   *   `null` when the system never broke.
   * @throws {RangeError} When `config.loadIncrement` is not greater than 0,
   *   because the load would never advance and the loop could not terminate.
   */
  async executeStressTest(config) {
    // A zero or negative increment never moves the load past maxLoad, so the
    // loop below would spin forever on a system that does not break.
    if (config.loadIncrement <= 0) {
      throw new RangeError(
        `config.loadIncrement must be greater than 0 (received ${config.loadIncrement})`
      );
    }
    
    const stressTest = {
      type: 'stress',
      config,
      breakingPoint: null,
      degradationCurve: [],
      results: {}
    };
    
    let currentLoad = config.startLoad;
    let systemBroken = false;
    
    while (!systemBroken && currentLoad <= config.maxLoad) {
      const testResult = await this.applyLoad(currentLoad, config.duration);
      
      stressTest.degradationCurve.push({
        load: currentLoad,
        performance: testResult.performance,
        stability: testResult.stability,
        errors: testResult.errors
      });
      
      // The breaking step is still part of the curve recorded above.
      if (this.isSystemBreaking(testResult, config.breakingCriteria)) {
        stressTest.breakingPoint = {
          load: currentLoad,
          performance: testResult.performance,
          reason: this.identifyBreakingReason(testResult)
        };
        systemBroken = true;
      }
      
      currentLoad += config.loadIncrement;
    }
    
    stressTest.results = await this.analyzeStressTestResults(stressTest);
    
    return stressTest;
  }
}

4. Performance Validation Framework

/**
 * Validates benchmark results against SLA, regression, scalability,
 * reliability and efficiency criteria and folds the outcome into one score.
 *
 * NOTE(review): `generateValidationRecommendations`, `extractMetricValue`,
 * `validateThreshold`, `getSeverityMultiplier`, `analyzeLinearScalability`
 * and `analyzeEfficiencyRetention` are called on `this` but not defined in
 * this class body — presumably provided elsewhere; confirm.
 */
class PerformanceValidator {
  constructor() {
    // Validator type -> validator; each exposes `validate(results, criteria)`.
    this.validators = {
      sla: new SLAValidator(),
      regression: new RegressionValidator(),
      scalability: new ScalabilityValidator(),
      reliability: new ReliabilityValidator(),
      efficiency: new EfficiencyValidator()
    };
    
    this.thresholds = new ThresholdManager();
    this.rules = new ValidationRuleEngine();
  }
  
  /**
   * Run every registered validator concurrently and aggregate the outcomes.
   *
   * The overall score is the weighted average of the validators' scores. A
   * validator's weight is `criteria[type].weight`; a missing or falsy weight
   * counts as 1. The same weights are used for the weighted sum and for the
   * normalization, so the average stays within the range of the individual
   * scores even when `criteria` does not have an entry for every validator.
   *
   * @param {*} results - Forwarded unchanged to each validator.
   * @param {object} [criteria] - Per-validator criteria keyed by validator type.
   * @returns {Promise<{overall: {passed: boolean, score: number, violations: Array}, detailed: Map<string, *>, recommendations: *}>}
   */
  async validatePerformance(results, criteria = {}) {
    const validation = {
      overall: {
        passed: true,
        score: 0,
        violations: []
      },
      detailed: new Map(),
      recommendations: []
    };
    
    // Run all validators
    const validationPromises = Object.entries(this.validators).map(
      async ([type, validator]) => {
        const result = await validator.validate(results, criteria[type]);
        return [type, result];
      }
    );
    
    const validationResults = await Promise.all(validationPromises);
    
    // Aggregate validation results
    let totalWeight = 0;
    for (const [type, result] of validationResults) {
      validation.detailed.set(type, result);
      
      if (!result.passed) {
        validation.overall.passed = false;
        // Tolerate a failing validator that reports no violation list.
        validation.overall.violations.push(...(result.violations ?? []));
      }
      
      const weight = criteria[type]?.weight || 1;
      validation.overall.score += result.score * weight;
      // Accumulate the denominator from exactly the weights applied above;
      // summing over the keys of `criteria` disagreed with the numerator
      // whenever criteria and validators did not line up.
      totalWeight += weight;
    }
    
    // Normalize overall score (left at 0 when no validator ran).
    if (totalWeight > 0) {
      validation.overall.score /= totalWeight;
    }
    
    // Generate recommendations
    validation.recommendations = await this.generateValidationRecommendations(validation);
    
    return validation;
  }
  
  /**
   * Check every SLA threshold against the measured value.
   *
   * The score starts at 1.0; each violated threshold subtracts
   * `deviation * severityMultiplier`, and the result is clamped at 0.
   *
   * @param {*} results - Source for `extractMetricValue`.
   * @param {object} slaConfig - SLA configuration.
   * @param {Object<string, {value: *, operator: *, severity?: string}>} slaConfig.thresholds
   *   Metric name -> threshold. A missing severity is treated as `'medium'`.
   * @returns {Promise<{passed: boolean, violations: object[], score: number, metrics: object}>}
   */
  async validateSLA(results, slaConfig) {
    const slaValidation = {
      passed: true,
      violations: [],
      score: 1.0,
      metrics: {}
    };
    
    // Validate each SLA metric
    for (const [metric, threshold] of Object.entries(slaConfig.thresholds)) {
      const actualValue = this.extractMetricValue(results, metric);
      const validation = this.validateThreshold(actualValue, threshold);
      
      slaValidation.metrics[metric] = {
        actual: actualValue,
        threshold: threshold.value,
        operator: threshold.operator,
        passed: validation.passed,
        deviation: validation.deviation
      };
      
      if (!validation.passed) {
        // Resolve the severity once so the reported severity and the one
        // used for the score penalty cannot disagree.
        const severity = threshold.severity || 'medium';
        
        slaValidation.passed = false;
        slaValidation.violations.push({
          metric,
          actual: actualValue,
          expected: threshold.value,
          severity
        });
        
        // Reduce score based on violation severity
        const severityMultiplier = this.getSeverityMultiplier(severity);
        slaValidation.score -= (validation.deviation * severityMultiplier);
      }
    }
    
    // Several violations can push the score below zero; clamp it.
    slaValidation.score = Math.max(0, slaValidation.score);
    
    return slaValidation;
  }
  
  /**
   * Check linear scalability and efficiency retention against configured
   * minimums. Each check runs only when its config section is present.
   *
   * Note that `score` is not reduced by violations here; it is returned as
   * 1.0 regardless of the outcome.
   *
   * @param {*} results - Source for the scalability analyses.
   * @param {object} scalabilityConfig - Scalability configuration.
   * @param {{minCoefficient: number}} [scalabilityConfig.linear] - Enables the linearity check.
   * @param {{minRetention: number}} [scalabilityConfig.efficiency] - Enables the efficiency check.
   * @returns {Promise<{passed: boolean, violations: object[], score: number, analysis: object}>}
   */
  async validateScalability(results, scalabilityConfig) {
    const scalabilityValidation = {
      passed: true,
      violations: [],
      score: 1.0,
      analysis: {}
    };
    
    // Linear scalability analysis
    if (scalabilityConfig.linear) {
      const linearityAnalysis = this.analyzeLinearScalability(results);
      scalabilityValidation.analysis.linearity = linearityAnalysis;
      
      if (linearityAnalysis.coefficient < scalabilityConfig.linear.minCoefficient) {
        scalabilityValidation.passed = false;
        scalabilityValidation.violations.push({
          type: 'linearity',
          actual: linearityAnalysis.coefficient,
          expected: scalabilityConfig.linear.minCoefficient
        });
      }
    }
    
    // Efficiency retention analysis
    if (scalabilityConfig.efficiency) {
      const efficiencyAnalysis = this.analyzeEfficiencyRetention(results);
      scalabilityValidation.analysis.efficiency = efficiencyAnalysis;
      
      if (efficiencyAnalysis.retention < scalabilityConfig.efficiency.minRetention) {
        scalabilityValidation.passed = false;
        scalabilityValidation.violations.push({
          type: 'efficiency_retention',
          actual: efficiencyAnalysis.retention,
          expected: scalabilityConfig.efficiency.minRetention
        });
      }
    }
    
    return scalabilityValidation;
  }
}

MCP Integration Hooks

Benchmark Execution Integration

/**
 * Thin wrappers around the `mcp` tool interface for benchmarking, quality
 * assessment and error analysis.
 *
 * `mcp` is not defined in this object; it must be in scope wherever this
 * code runs.
 */
const benchmarkIntegration = {
  /**
   * Run a benchmark suite, then gather metrics, a 24h performance trend and a
   * 24h cost analysis. The four calls are issued strictly one after another.
   *
   * @param {object} [config]
   * @param {string} [config.suite] - Suite name; any falsy value selects 'comprehensive'.
   * @returns {Promise<{benchmark: *, metrics: *, trends: *, costAnalysis: *, timestamp: number}>}
   */
  async runBenchmarks(config = {}) {
    const benchmark = await mcp.benchmark_run({ suite: config.suite || 'comprehensive' });

    const metrics = await mcp.metrics_collect({
      components: ['system', 'agents', 'coordination', 'memory']
    });

    const trends = await mcp.trend_analysis({ metric: 'performance', period: '24h' });

    const costAnalysis = await mcp.cost_analysis({ timeframe: '24h' });

    // The timestamp marks completion of all four calls.
    return { benchmark, metrics, trends, costAnalysis, timestamp: Date.now() };
  },

  /**
   * Assess swarm performance quality.
   *
   * @param {string[]} [criteria] - Criteria to assess; any falsy value selects
   *   the five defaults (throughput, latency, reliability, scalability, efficiency).
   * @returns {Promise<*>} Whatever `mcp.quality_assess` resolves to.
   */
  async assessQuality(criteria) {
    const request = {
      target: 'swarm-performance',
      criteria: criteria || ['throughput', 'latency', 'reliability', 'scalability', 'efficiency']
    };

    return mcp.quality_assess(request);
  },

  /**
   * Collect system logs and submit them for error-pattern analysis.
   *
   * NOTE(review): `collectSystemLogs` is not defined on this object in the
   * visible source — it has to be attached before this method is called; confirm.
   *
   * @returns {Promise<*>} Whatever `mcp.error_analysis` resolves to.
   */
  async analyzeErrorPatterns() {
    const logs = await this.collectSystemLogs();
    return mcp.error_analysis({ logs });
  }
};

Operational Commands

Benchmarking Commands

# Run the "comprehensive" benchmark suite.
# NOTE(review): --duration 300 is presumably in seconds (the JavaScript
# examples in this document use 300000 for "5 minutes") — confirm against the CLI.
npx claude-flow benchmark-run --suite comprehensive --duration 300

# Run only the throughput suite, with 10 iterations.
npx claude-flow benchmark-run --suite throughput --iterations 10

# Compare a result set with a baseline.
# <results> and <baseline> are placeholders to be replaced before running.
npx claude-flow benchmark-compare --current <results> --baseline <baseline>

# Assess swarm-performance quality on the listed criteria (comma-separated).
npx claude-flow quality-assess --target swarm-performance --criteria throughput,latency

# Validate results against criteria; both <file> arguments are placeholders.
npx claude-flow validate-performance --results <file> --criteria <file>

Regression Detection Commands

# Detect performance regressions in current results relative to historical data.
# <results> and <data> are placeholders to be replaced before running.
npx claude-flow detect-regression --current <results> --historical <data>

# Enable automated regression monitoring with a sensitivity of 0.95.
npx claude-flow regression-monitor --enable --sensitivity 0.95

# Analyze error patterns in the given logs; <log-files> is a placeholder.
npx claude-flow error-analysis --logs <log-files>

Integration Points

With Other Optimization Agents

Performance Monitor: Provides continuous monitoring data for benchmarking
Load Balancer: Validates load balancing effectiveness through benchmarks
Topology Optimizer: Tests topology configurations for optimal performance

With CI/CD Pipeline

Automated Testing: Integrates with CI/CD for continuous performance validation
Quality Gates: Provides pass/fail criteria for deployment decisions
Regression Prevention: Catches performance regressions before production

Performance Benchmarks

Standard Benchmark Suite

/**
 * Declarative definitions of the standard benchmarks: which metrics each one
 * reports and the target each metric is held to.
 *
 * Targets use `min` (with an optional `optimal`) for metrics that should be
 * at least that value and `max` for metrics that must not exceed it.
 */
const standardBenchmarks = {
  // --- Throughput: operations completed per second ---
  throughput: {
    name: 'Throughput Benchmark',
    metrics: [
      'requests_per_second',
      'tasks_per_second',
      'messages_per_second'
    ],
    duration: 5 * 60 * 1000, // 5 minutes
    warmup: 30 * 1000, // 30 seconds
    targets: {
      requests_per_second: { min: 1000, optimal: 5000 },
      tasks_per_second: { min: 100, optimal: 500 },
      messages_per_second: { min: 10000, optimal: 50000 }
    }
  },

  // --- Latency: percentile ceilings ---
  latency: {
    name: 'Latency Benchmark',
    metrics: ['p50', 'p90', 'p95', 'p99', 'max'],
    duration: 5 * 60 * 1000,
    targets: {
      p50: { max: 100 }, // 100ms
      p90: { max: 200 }, // 200ms
      p95: { max: 500 }, // 500ms
      p99: { max: 1000 }, // 1s
      max: { max: 5 * 1000 } // 5s
    }
  },

  // --- Scalability: measured at doubling load points ---
  scalability: {
    name: 'Scalability Benchmark',
    metrics: ['linear_coefficient', 'efficiency_retention'],
    load_points: [1, 2, 4, 8, 16, 32, 64],
    targets: {
      linear_coefficient: { min: 0.8 },
      efficiency_retention: { min: 0.7 }
    }
  }
};

This Benchmark Suite agent provides comprehensive automated performance testing, regression detection, and validation capabilities to ensure optimal swarm performance and prevent performance degradation.

Repository: ruvnet/claude-flow
Commit: d29d87f

Last updated: 4 days ago
Created: 4 days ago

Is this your skill?

If you maintain this skill, you can claim it as your own. Once claimed, you can manage eval scenarios, bundle related skills, attach documentation or rules, and ensure cross-agent compatibility.