tessl i github:martinholovsky/claude-skills-generator --skill text-to-speechExpert skill for implementing text-to-speech with Kokoro TTS. Covers voice synthesis, audio generation, performance optimization, and secure handling of generated audio for JARVIS voice assistant.
Review Score
65%
Validation Score
12/16
Implementation Score
77%
Activation Score
33%
File Organization: Split structure. See
references/for detailed implementations.
Risk Level: MEDIUM - Generates audio output, potential for inappropriate content synthesis, resource-intensive
You are an expert in text-to-speech systems with deep expertise in Kokoro TTS, voice synthesis, and audio generation optimization. Your mastery spans model configuration, voice customization, streaming audio output, and secure handling of synthesized speech.
You excel at:
Primary Use Cases:
# tests/test_tts_engine.py
import pytest
from pathlib import Path
class TestSecureTTSEngine:
def test_synthesize_returns_valid_audio(self, tts_engine):
audio_path = tts_engine.synthesize("Hello test")
assert Path(audio_path).exists()
assert audio_path.endswith('.wav')
def test_audio_has_correct_sample_rate(self, tts_engine):
import soundfile as sf
audio_path = tts_engine.synthesize("Test")
_, sample_rate = sf.read(audio_path)
assert sample_rate == 24000
def test_rejects_empty_text(self, tts_engine):
with pytest.raises(ValidationError):
tts_engine.synthesize("")
def test_rejects_text_exceeding_limit(self, tts_engine):
with pytest.raises(ValidationError):
tts_engine.synthesize("x" * 6000)
def test_filters_sensitive_content(self, tts_engine):
audio_path = tts_engine.synthesize("password: secret123")
assert Path(audio_path).exists()
def test_cleanup_removes_temp_files(self, tts_engine):
tts_engine.synthesize("Test")
temp_dir = tts_engine.temp_dir
tts_engine.cleanup()
assert not Path(temp_dir).exists()
@pytest.fixture
def tts_engine():
from jarvis.tts import SecureTTSEngine
engine = SecureTTSEngine(voice="af_heart")
yield engine
engine.cleanup()Implement SecureTTSEngine with required methods. Focus only on making tests pass.
After tests pass, refactor for streaming output, caching, and async compatibility.
pytest tests/test_tts_engine.py -v # Run tests
pytest --cov=jarvis.tts --cov-report=term-missing # Coverage
mypy src/jarvis/tts/ # Type check
python -m jarvis.tts --test "Hello JARVIS" # Integration# BAD - Wait for full audio
audio_chunks = []
for _, _, audio in pipeline(text):
audio_chunks.append(audio)
play_audio(np.concatenate(audio_chunks)) # Long wait
# GOOD - Stream chunks immediately
with sd.OutputStream(samplerate=24000, channels=1) as stream:
for _, _, audio in pipeline(text):
stream.write(audio) # Play as generated# BAD: pipeline = KPipeline(lang_code="a") # Reload each time
# GOOD - Singleton pattern
class TTSEngine:
_pipeline = None
@classmethod
def get_pipeline(cls):
if cls._pipeline is None:
cls._pipeline = KPipeline(lang_code="a")
return cls._pipeline# BAD: data, sr = sf.read(audio_path) # Full file in RAM
# GOOD - Process in chunks
with sf.SoundFile(audio_path) as f:
while f.tell() < len(f):
yield process(f.read(24000))# BAD: audio = engine.synthesize(text) # Blocks event loop
# GOOD - Run in executor
audio = await loop.run_in_executor(None, engine.synthesize, text)# BAD: return SecureTTSEngine(voice=VOICES[voice_type]) # Cold start
# GOOD - Preload at startup
def _preload_voices(self, types: list[str]):
for t in types:
self.engines[t] = SecureTTSEngine(voice=VOICES[t])When implementing TTS, you will:
Kokoro TTS
| Use Case | Version | Notes |
|---|---|---|
| Production | kokoro>=0.3.0 | Latest stable |
Supporting Libraries
# requirements.txt
kokoro>=0.3.0
numpy>=1.24.0
soundfile>=0.12.0
sounddevice>=0.4.6
scipy>=1.10.0
pydantic>=2.0
structlog>=23.0| Voice | Style | Use Case |
|---|---|---|
| af_heart | Warm, friendly | Default JARVIS |
| af_bella | Professional | Formal responses |
| am_adam | Male | Alternative voice |
| bf_emma | British | Accent variation |
from kokoro import KPipeline
import soundfile as sf
import numpy as np
from pathlib import Path
import tempfile
import os
import structlog
logger = structlog.get_logger()
class SecureTTSEngine:
"""Secure text-to-speech with content filtering."""
def __init__(self, voice: str = "af_heart", lang_code: str = "a"):
# Initialize Kokoro pipeline
self.pipeline = KPipeline(lang_code=lang_code)
self.voice = voice
# Content filter patterns
self.blocked_patterns = [
r"password\s*[:=]",
r"api[_-]?key\s*[:=]",
r"secret\s*[:=]",
]
# Create secure temp directory
self.temp_dir = tempfile.mkdtemp(prefix="jarvis_tts_")
os.chmod(self.temp_dir, 0o700)
logger.info("tts.initialized", voice=voice)
def synthesize(self, text: str) -> str:
"""Synthesize text to audio file."""
# Validate and filter input
if not self._validate_text(text):
raise ValidationError("Invalid text input")
filtered_text = self._filter_sensitive(text)
# Generate audio
audio_path = Path(self.temp_dir) / f"{uuid.uuid4()}.wav"
generator = self.pipeline(
filtered_text,
voice=self.voice,
speed=1.0
)
# Collect audio chunks
audio_chunks = []
for _, _, audio in generator:
audio_chunks.append(audio)
if not audio_chunks:
raise TTSError("No audio generated")
# Concatenate and save
full_audio = np.concatenate(audio_chunks)
sf.write(str(audio_path), full_audio, 24000)
logger.info("tts.synthesized",
text_length=len(text),
audio_duration=len(full_audio) / 24000)
return str(audio_path)
def _validate_text(self, text: str) -> bool:
"""Validate text input."""
if not text or not text.strip():
return False
# Length limit (prevent DoS)
if len(text) > 5000:
logger.warning("tts.text_too_long", length=len(text))
return False
return True
def _filter_sensitive(self, text: str) -> str:
"""Filter sensitive content from text."""
import re
filtered = text
for pattern in self.blocked_patterns:
if re.search(pattern, filtered, re.IGNORECASE):
logger.warning("tts.sensitive_content_filtered")
filtered = re.sub(pattern + r'\S+', '[FILTERED]', filtered, flags=re.IGNORECASE)
return filtered
def cleanup(self):
"""Clean up temp files."""
import shutil
if os.path.exists(self.temp_dir):
shutil.rmtree(self.temp_dir)# Stream audio chunks as generated for low latency
with sd.OutputStream(samplerate=24000, channels=1) as stream:
for _, _, audio in pipeline(text, voice=voice):
stream.write(audio) # Play immediately# Cache common phrases with hash key
cache_key = hashlib.sha256(f"{text}:{voice}".encode()).hexdigest()
cache_path = cache_dir / f"{cache_key}.wav"
if cache_path.exists():
return str(cache_path) # Cache hit
# Generate, save to cache, return path# Lazy-load engines per voice type
VOICES = {"default": "af_heart", "formal": "af_bella"}
def get_engine(voice_type: str) -> SecureTTSEngine:
if voice_type not in engines:
engines[voice_type] = SecureTTSEngine(voice=VOICES[voice_type])
return engines[voice_type]# Semaphore for concurrency + timeout for protection
async with asyncio.Semaphore(2):
result = await asyncio.wait_for(
loop.run_in_executor(None, engine.synthesize, text),
timeout=30.0
)Prevent synthesis of inappropriate content:
class ContentFilter:
"""Filter inappropriate content before synthesis."""
BLOCKED_CATEGORIES = [
"violence",
"hate_speech",
"explicit",
]
def filter(self, text: str) -> tuple[str, bool]:
"""Filter text and return (filtered_text, was_modified)."""
# Remove potential command injection
text = text.replace(";", "").replace("|", "").replace("&", "")
# Check for blocked patterns
for pattern in self.blocked_patterns:
if re.search(pattern, text, re.IGNORECASE):
return "[Content filtered]", True
return text, Falsedef validate_tts_input(text: str) -> bool:
"""Validate text for TTS synthesis."""
# Length limit
if len(text) > 5000:
raise ValidationError("Text too long (max 5000 chars)")
# Character validation
if not all(c.isprintable() or c in '\n\t' for c in text):
raise ValidationError("Invalid characters in text")
return True# BAD - No filtering
def speak(user_input: str):
engine.synthesize(user_input)
# GOOD - Filter first
def speak(user_input: str):
filtered = content_filter.filter(user_input)
engine.synthesize(filtered)# BAD - Can generate very long audio
engine.synthesize(long_text) # No limit
# GOOD - Enforce limits
if len(text) > 5000:
raise ValidationError("Text too long")
engine.synthesize(text)pytest tests/test_tts_engine.py -vpytest --cov=jarvis.ttsmypy src/jarvis/tts/python -m jarvis.tts --testYour goal is to create TTS systems that are:
You understand that TTS requires input validation and content filtering to prevent synthesis of inappropriate content. Always enforce text length limits and clean up generated audio files.
Critical Reminders:
If you maintain this skill, you can claim it as your own. Once claimed, you can manage eval scenarios, bundle related skills, attach documentation or rules, and ensure cross-agent compatibility.