Comprehensive developer toolkit providing reusable skills for Java/Spring Boot, TypeScript/NestJS/React/Next.js, Python, PHP, AWS CloudFormation, AI/RAG, DevOps, and more.
82
82%
Does it follow best practices?
Impact
Pending
No eval scenarios have been run
Risky
Do not use without reviewing
This reference provides systematic methodologies for iteratively improving prompt performance through structured testing, measurement, and refinement processes.
graph TD
A[Baseline Measurement] --> B[Hypothesis Generation]
B --> C[Controlled Test]
C --> D[Performance Analysis]
D --> E[Statistical Validation]
E --> F[Implementation Decision]
F --> G[Monitor Impact]
G --> H[Learn & Iterate]
H --> Bdef calculate_success_rate(results, expected_outputs):
"""
Measure percentage of tasks completed correctly.
"""
correct = sum(1 for result, expected in zip(results, expected_outputs)
if result == expected)
return (correct / len(results)) * 100def measure_consistency(prompt, test_cases, num_runs=5):
"""
Measure response stability across multiple runs.
"""
responses = {}
for test_case in test_cases:
test_responses = []
for _ in range(num_runs):
response = execute_prompt(prompt, test_case)
test_responses.append(response)
# Calculate similarity score for consistency
consistency = calculate_similarity(test_responses)
responses[test_case] = consistency
return sum(responses.values()) / len(responses)def calculate_token_efficiency(prompt, test_cases):
"""
Measure token usage per successful task completion.
"""
# Token counts are accumulated for every run, successful or not, so failed
# runs raise the per-success cost computed by the return statement.
total_tokens = 0
successful_tasks = 0
for test_case in test_cases:
# The result object is read for its .token_count and .is_successful attributes.
response = execute_prompt_with_metrics(prompt, test_case)
total_tokens += response.token_count
if response.is_successful:
successful_tasks += 1
return total_tokens / successful_tasks if successful_tasks > 0 else float('inf')def measure_response_time(prompt, test_cases):
"""
Measure average response time.
"""
times = []
for test_case in test_cases:
start_time = time.time()
execute_prompt(prompt, test_case)
end_time = time.time()
times.append(end_time - start_time)
return sum(times) / len(times)def assess_output_quality(response, criteria):
"""
Multi-dimensional quality assessment.
"""
scores = {
'accuracy': measure_accuracy(response),
'completeness': measure_completeness(response),
'coherence': measure_coherence(response),
'relevance': measure_relevance(response),
'format_compliance': measure_format_compliance(response)
}
weights = [0.3, 0.2, 0.2, 0.2, 0.1]
return sum(score * weight for score, weight in zip(scores.values(), weights))def check_safety_compliance(response):
"""
Measure adherence to safety guidelines.
"""
# Labels of the checks that fired, in the order the checks run.
violations = []
# Check for various safety issues
# The three checks are independent `if` statements, so a single response can
# collect several labels.
if contains_harmful_content(response):
violations.append('harmful_content')
if has_bias(response):
violations.append('bias')
if violates_privacy(response):
violations.append('privacy_violation')
# Each violation costs 25 points. With three checks the lowest reachable score
# is 25, so the max(0, ...) floor only matters if more checks are added.
safety_score = max(0, 100 - len(violations) * 25)
return safety_score, violationsdef design_ab_test(baseline_prompt, variant_prompt, test_cases):
"""
Design controlled A/B test with proper statistical power.
"""
# Calculate required sample size
effect_size = estimate_effect_size(baseline_prompt, variant_prompt)
sample_size = calculate_sample_size(effect_size, power=0.8, alpha=0.05)
# Random assignment
randomized_cases = random.sample(test_cases, sample_size)
split_point = len(randomized_cases) // 2
group_a = randomized_cases[:split_point]
group_b = randomized_cases[split_point:]
return {
'baseline_group': group_a,
'variant_group': group_b,
'sample_size': sample_size,
'statistical_power': 0.8,
'significance_level': 0.05
}def analyze_ab_results(baseline_results, variant_results):
"""
Perform statistical analysis of A/B test results.
"""
# Calculate means and standard deviations
baseline_mean = np.mean(baseline_results)
variant_mean = np.mean(variant_results)
baseline_std = np.std(baseline_results)
variant_std = np.std(variant_results)
# Perform t-test
t_statistic, p_value = stats.ttest_ind(baseline_results, variant_results)
# Calculate effect size (Cohen's d)
pooled_std = np.sqrt(((len(baseline_results) - 1) * baseline_std**2 +
(len(variant_results) - 1) * variant_std**2) /
(len(baseline_results) + len(variant_results) - 2))
cohens_d = (variant_mean - baseline_mean) / pooled_std
return {
'baseline_mean': baseline_mean,
'variant_mean': variant_mean,
'improvement': ((variant_mean - baseline_mean) / baseline_mean) * 100,
'p_value': p_value,
'statistical_significance': p_value < 0.05,
'effect_size': cohens_d,
'recommendation': 'implement_variant' if p_value < 0.05 and cohens_d > 0.2 else 'keep_baseline'
}def progressive_optimization(base_prompt, test_cases, max_iterations=10):
"""
Incrementally improve prompt through systematic testing.
"""
# Start from the unmodified prompt and its measured performance.
current_prompt = base_prompt
current_performance = evaluate_prompt(current_prompt, test_cases)
# One entry is appended per iteration that actually changes the prompt.
optimization_history = []
for iteration in range(max_iterations):
# Generate improvement hypotheses
hypotheses = generate_improvement_hypotheses(current_prompt, current_performance)
best_improvement = None
best_performance = current_performance
for hypothesis in hypotheses:
# Test hypothesis
# Every hypothesis is applied to the same current_prompt, so candidates
# within one iteration are evaluated independently of each other.
test_prompt = apply_hypothesis(current_prompt, hypothesis)
test_performance = evaluate_prompt(test_prompt, test_cases)
# Validate improvement
# Significance is checked against current_performance, while the score
# comparison is against the best candidate seen so far in this iteration.
if is_statistically_significant(current_performance, test_performance):
if test_performance.overall_score > best_performance.overall_score:
best_improvement = hypothesis
best_performance = test_performance
# Apply best improvement if found
# Truthiness test: a falsy hypothesis value (e.g. an empty string) is treated
# the same as "no improvement found".
if best_improvement:
# The winning hypothesis is applied again here rather than reusing the
# test_prompt built inside the loop.
current_prompt = apply_hypothesis(current_prompt, best_improvement)
optimization_history.append({
'iteration': iteration,
'hypothesis': best_improvement,
'performance_before': current_performance,
'performance_after': best_performance,
'improvement': best_performance.overall_score - current_performance.overall_score
})
current_performance = best_performance
else:
break # No further improvements found
return current_prompt, optimization_historydef multi_objective_optimization(prompt_variants, objectives):
"""
Optimize for multiple competing objectives using Pareto efficiency.
"""
results = []
for variant in prompt_variants:
scores = {}
for objective in objectives:
scores[objective] = evaluate_objective(variant, objective)
results.append({
'prompt': variant,
'scores': scores,
'dominates': []
})
# Find Pareto optimal solutions
pareto_optimal = []
for i, result_i in enumerate(results):
is_dominated = False
for j, result_j in enumerate(results):
if i != j and dominates(result_j, result_i):
is_dominated = True
break
if not is_dominated:
pareto_optimal.append(result_i)
return pareto_optimal
def dominates(result_a, result_b):
"""
Check if result_a dominates result_b in all objectives.
"""
return all(result_a['scores'][obj] >= result_b['scores'][obj]
for obj in result_a['scores'])def adaptive_testing(prompt_variants, initial_budget):
"""
Dynamically allocate testing budget to promising variants.
"""
# Initial exploration phase
exploration_results = {}
budget分配 = initial_budget // len(prompt_variants)
for variant in prompt_variants:
exploration_results[variant] = test_prompt(variant, budget分配)
# Exploitation phase - allocate more budget to promising variants
total_budget_spent = len(prompt_variants) * budget分配
remaining_budget = initial_budget - total_budget_spent
# Sort by performance
sorted_variants = sorted(exploration_results.items(),
key=lambda x: x[1].overall_score, reverse=True)
# Allocate remaining budget proportionally to performance
final_results = {}
for i, (variant, initial_result) in enumerate(sorted_variants):
if remaining_budget > 0:
additional_budget = max(1, remaining_budget // (len(sorted_variants) - i))
final_results[variant] = test_prompt(variant, additional_budget)
remaining_budget -= additional_budget
else:
final_results[variant] = initial_result
return final_resultsinstruction_clarity_hypotheses = [
"Add numbered steps to instructions",
"Include specific output format examples",
"Clarify role and expertise level",
"Add context and background information",
"Specify constraints and boundaries",
"Include success criteria and evaluation standards"
]example_optimization_hypotheses = [
# Candidate changes to the examples included in a prompt.
"Increase number of examples from 3 to 5",
"Add edge case examples",
"Reorder examples by complexity",
"Include negative examples",
"Add reasoning traces to examples",
"Improve example diversity and coverage"
]structure_hypotheses = [
# Candidate changes to how the prompt is organized.
"Add clear section headings",
"Reorganize content flow",
"Include summary at the beginning",
"Add checklist for verification",
"Separate instructions from examples",
"Add troubleshooting section"
]model_specific_hypotheses = {
# Keyed by model name; each value lists hypotheses to try for that model.
'claude': [
"Use XML tags for structure",
"Add <thinking> sections for reasoning",
"Include constitutional AI principles",
"Use system message format",
"Add safety guidelines and constraints"
],
'gpt-4': [
"Use numbered sections with ### headers",
"Include JSON format specifications",
"Add function calling patterns",
"Use bullet points for clarity",
"Include error handling instructions"
],
'gemini': [
"Use bold headers with ** formatting",
"Include step-by-step process descriptions",
"Add validation checkpoints",
"Use conversational tone",
"Include confidence scoring"
]
}def setup_monitoring(prompt, alert_thresholds):
"""
Set up continuous monitoring for deployed prompts.
"""
monitors = {
'success_rate': MetricMonitor('success_rate', alert_thresholds['success_rate']),
'response_time': MetricMonitor('response_time', alert_thresholds['response_time']),
'token_cost': MetricMonitor('token_cost', alert_thresholds['token_cost']),
'safety_score': MetricMonitor('safety_score', alert_thresholds['safety_score'])
}
def monitor_performance():
recent_data = collect_recent_performance(prompt)
alerts = []
for metric_name, monitor in monitors.items():
if metric_name in recent_data:
alert = monitor.check(recent_data[metric_name])
if alert:
alerts.append(alert)
return alerts
return monitor_performancedef automated_rollback_system(prompts, monitoring_data):
"""
Automatically rollback to previous version if performance degrades.
"""
def check_and_rollback(current_prompt, baseline_prompt):
current_metrics = monitoring_data.get_metrics(current_prompt)
baseline_metrics = monitoring_data.get_metrics(baseline_prompt)
# Check if performance degradation exceeds threshold
degradation_threshold = 0.1 # 10% degradation
for metric in current_metrics:
if current_metrics[metric] < baseline_metrics[metric] * (1 - degradation_threshold):
return True, f"Performance degradation in {metric}"
return False, "Performance acceptable"
return check_and_rollbackdef generate_prompt_variations(base_prompt):
"""
Generate systematic variations for testing.
"""
# Maps a variation label to the prompt produced by the matching helper.
# Every helper receives the unmodified base_prompt, so the nine variations are
# independent of each other rather than cumulative.
variations = {}
# Instruction variations
variations['more_detailed'] = add_detailed_instructions(base_prompt)
variations['simplified'] = simplify_instructions(base_prompt)
variations['structured'] = add_structured_format(base_prompt)
# Example variations
variations['more_examples'] = add_examples(base_prompt)
variations['better_examples'] = improve_example_quality(base_prompt)
variations['diverse_examples'] = add_example_diversity(base_prompt)
# Format variations
variations['numbered_steps'] = add_numbered_steps(base_prompt)
variations['bullet_points'] = use_bullet_points(base_prompt)
variations['sections'] = add_section_headers(base_prompt)
return variationsdef create_performance_dashboard(optimization_history):
"""
Create visualization of optimization progress.
"""
# Generate performance metrics over time
metrics_over_time = {
'iterations': [h['iteration'] for h in optimization_history],
'success_rates': [h['performance_after'].success_rate for h in optimization_history],
'token_efficiency': [h['performance_after'].token_efficiency for h in optimization_history],
'response_times': [h['performance_after'].response_time for h in optimization_history]
}
return PerformanceDashboard(metrics_over_time)This comprehensive framework provides systematic methodologies for continuous prompt improvement through data-driven optimization and rigorous testing processes.
plugins
developer-kit-ai
skills
chunking-strategy
prompt-engineering
developer-kit-aws
skills
aws
aws-cli-beast
aws-cost-optimization
aws-drawio-architecture-diagrams
aws-sam-bootstrap
aws-cloudformation
aws-cloudformation-auto-scaling
references
aws-cloudformation-bedrock
references
aws-cloudformation-cloudfront
references
aws-cloudformation-cloudwatch
references
aws-cloudformation-dynamodb
references
aws-cloudformation-ec2
aws-cloudformation-ecs
references
aws-cloudformation-elasticache
aws-cloudformation-iam
references
aws-cloudformation-lambda
references
aws-cloudformation-rds
aws-cloudformation-s3
references
aws-cloudformation-security
references
aws-cloudformation-task-ecs-deploy-gh
aws-cloudformation-vpc
developer-kit-core
skills
developer-kit-java
skills
aws-lambda-java-integration
aws-rds-spring-boot-integration
aws-sdk-java-v2-bedrock
aws-sdk-java-v2-core
aws-sdk-java-v2-dynamodb
aws-sdk-java-v2-kms
aws-sdk-java-v2-lambda
aws-sdk-java-v2-messaging
aws-sdk-java-v2-rds
aws-sdk-java-v2-s3
aws-sdk-java-v2-secrets-manager
graalvm-native-image
langchain4j
langchain4j-mcp-server-patterns
langchain4j-ai-services-patterns
references
langchain4j-mcp-server-patterns
references
langchain4j-rag-implementation-patterns
references
langchain4j-spring-boot-integration
langchain4j-testing-strategies
langchain4j-tool-function-calling-patterns
langchain4j-vector-stores-configuration
references
qdrant
references
spring-ai-mcp-server-patterns
references
spring-boot-actuator
spring-boot-cache
spring-boot-crud-patterns
spring-boot-dependency-injection
spring-boot-event-driven-patterns
spring-boot-openapi-documentation
spring-boot-project-creator
spring-boot-resilience4j
spring-boot-rest-api-standards
spring-boot-saga-pattern
spring-boot-security-jwt
assets
references
scripts
spring-boot-test-patterns
spring-data-jpa
references
spring-data-neo4j
references
unit-test-application-events
unit-test-bean-validation
unit-test-boundary-conditions
unit-test-caching
unit-test-config-properties
unit-test-controller-layer
unit-test-exception-handler
unit-test-json-serialization
unit-test-mapper-converter
unit-test-parameterized
unit-test-scheduled-async
unit-test-service-layer
unit-test-utility-methods
unit-test-wiremock-rest-api
developer-kit-php
skills
aws-lambda-php-integration
developer-kit-python
skills
aws-lambda-python-integration
developer-kit-tools
developer-kit-typescript
skills
aws-lambda-typescript-integration
better-auth
drizzle-orm-patterns
dynamodb-toolbox-patterns
references
nestjs
nestjs-best-practices
nestjs-code-review
nestjs-drizzle-crud-generator
scripts
nextjs-app-router
nextjs-authentication
nextjs-code-review
nextjs-data-fetching
references
nextjs-deployment
nextjs-performance
nx-monorepo
react-code-review
react-patterns
references
shadcn-ui
tailwind-css-patterns
references
tailwind-design-system
references
turborepo-monorepo
typescript-docs
typescript-security-review
zod-validation-utilities