tessl install tessl/pypi-vllm@0.10.0

A high-throughput and memory-efficient inference and serving engine for LLMs.
Agent Success: 69% (agent success rate when using this tile)
Improvement: 1.33x (improvement over the baseline agent success rate)
Baseline: 52% (agent success rate without this tile)
A Python utility that benchmarks LLM inference performance across different attention implementations to help users select the optimal backend for their hardware and workload.
@generates
The implementation should expose the following interface:
```python
from typing import Any, Dict, List, Optional
import time


class AttentionBackendBenchmark:
    """
    Benchmarks LLM inference performance across different attention backends.
    """

    def __init__(self, model: str):
        """
        Initialize the benchmark tool with a model.

        Args:
            model: Model name or path to use for benchmarking
        """
        pass

    def run_with_backend(
        self,
        prompt: str,
        attention_backend: Optional[str] = None,
        max_tokens: int = 100,
        temperature: float = 0.0,
    ) -> Dict[str, Any]:
        """
        Run inference with a specific attention backend and return results with timing.

        Args:
            prompt: Input prompt text
            attention_backend: Attention backend name (e.g., "FLASH_ATTN", "FLASHINFER", "XFORMERS")
            max_tokens: Maximum tokens to generate
            temperature: Sampling temperature

        Returns:
            Dictionary with keys: 'output' (generated text), 'time' (generation time in seconds),
            'backend' (backend used)
        """
        pass

    def compare_backends(
        self,
        prompt: str,
        backends: List[str],
        max_tokens: int = 100,
    ) -> List[Dict[str, Any]]:
        """
        Compare multiple backends on the same prompt.

        Args:
            prompt: Input prompt text
            backends: List of backend names to compare
            max_tokens: Maximum tokens to generate

        Returns:
            List of result dictionaries, one per backend
        """
        pass
```

Provides high-throughput LLM inference with custom attention mechanisms.
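A minimal sketch of how this spec could be implemented on top of vLLM. It assumes the `VLLM_ATTENTION_BACKEND` environment variable (read by vLLM when the engine starts) is the switch between attention implementations, and uses the library's `LLM` and `SamplingParams` classes; treat it as an illustration of the approach rather than the generated implementation.

```python
import os
import time
from typing import Any, Dict, List, Optional

from vllm import LLM, SamplingParams


class AttentionBackendBenchmark:
    def __init__(self, model: str):
        self.model = model

    def run_with_backend(
        self,
        prompt: str,
        attention_backend: Optional[str] = None,
        max_tokens: int = 100,
        temperature: float = 0.0,
    ) -> Dict[str, Any]:
        # The attention backend is fixed when the engine is constructed, so a
        # fresh LLM instance is built per call (slow, but needed for a fair A/B).
        if attention_backend is not None:
            os.environ["VLLM_ATTENTION_BACKEND"] = attention_backend
        else:
            os.environ.pop("VLLM_ATTENTION_BACKEND", None)

        llm = LLM(model=self.model)
        params = SamplingParams(temperature=temperature, max_tokens=max_tokens)

        # Time only the generation step, not engine start-up.
        start = time.perf_counter()
        outputs = llm.generate([prompt], params)
        elapsed = time.perf_counter() - start

        return {
            "output": outputs[0].outputs[0].text,
            "time": elapsed,
            "backend": attention_backend or "default",
        }

    def compare_backends(
        self,
        prompt: str,
        backends: List[str],
        max_tokens: int = 100,
    ) -> List[Dict[str, Any]]:
        # Run the same prompt once per backend and collect the per-backend results.
        return [
            self.run_with_backend(prompt, backend, max_tokens=max_tokens)
            for backend in backends
        ]
```

Because each measurement constructs a new `LLM` instance, comparing several backends in one process can exhaust GPU memory; in practice it may be preferable to launch each backend run in its own subprocess so memory is fully released between runs.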