Python bindings for the llama.cpp library, providing high-performance LLM inference with OpenAI-compatible APIs.

Constrained text generation using formal grammars (GBNF), JSON Schema validation, and built-in templates for structured outputs such as JSON, code, and domain-specific formats.

Control model output using formal grammar rules in GBNF (GGML BNF) format.
class LlamaGrammar:
    """Grammar object that constrains model output to rules written in GBNF (GGML BNF)."""

    def __init__(self, grammar_str: str, verbose: bool = True):
        """
        Initialize grammar from GBNF string.

        Args:
            grammar_str: Grammar rules in GBNF format
            verbose: Enable verbose logging
        """

    @classmethod
    def from_string(cls, grammar_str: str, verbose: bool = True) -> "LlamaGrammar":
        """
        Create grammar from GBNF string.

        Args:
            grammar_str: Grammar rules in GBNF format
            verbose: Enable verbose logging

        Returns:
            LlamaGrammar instance
        """

    @classmethod
    def from_file(cls, file_path: str, verbose: bool = True) -> "LlamaGrammar":
        """
        Create grammar from GBNF file.

        Args:
            file_path: Path to GBNF grammar file
            verbose: Enable verbose logging

        Returns:
            LlamaGrammar instance
        """

    @classmethod
    def from_json_schema(cls, json_schema: str, verbose: bool = True) -> "LlamaGrammar":
        """
        Create grammar from JSON Schema string.

        Args:
            json_schema: JSON Schema as string (use json.dumps() to convert dict)
            verbose: Enable verbose logging

        Returns:
            LlamaGrammar instance with schema constraints
        """

# Convert JSON Schema specifications to GBNF grammar format for structured JSON generation.
def json_schema_to_gbnf(
    schema: dict,
    prop_order: Optional[List[str]] = None,
    allow_fetch: bool = True,
    dotall: bool = False,
    raw_pattern: bool = False,
) -> str:
    """
    Translate a JSON Schema dictionary into an equivalent GBNF grammar string.

    Args:
        schema: JSON Schema dictionary
        prop_order: Property ordering used when serializing object members
        allow_fetch: Permit resolution of external schema references
        dotall: Enable dotall mode for regex patterns
        raw_pattern: Use raw pattern matching

    Returns:
        GBNF grammar string
    """
class SchemaConverter:
    """Converts JSON Schema documents into GBNF grammar text."""

    def __init__(
        self,
        prop_order: Optional[List[str]] = None,
        allow_fetch: bool = True,
        dotall: bool = False,
        raw_pattern: bool = False,
    ):
        """
        Initialize schema converter.

        Args:
            prop_order: Default property ordering
            allow_fetch: Allow external references
            dotall: Dotall mode for patterns
            raw_pattern: Raw pattern mode
        """

    def convert(self, schema: dict) -> str:
        """
        Convert schema to GBNF format.

        Args:
            schema: JSON Schema dictionary

        Returns:
            GBNF grammar string
        """

# Pre-defined grammar building blocks for common patterns.
class BuiltinRule:
    """A named, reusable GBNF grammar fragment."""

    def __init__(self, rule_name: str, content: str):
        """
        Built-in grammar rule definition.

        Args:
            rule_name: Name of the rule
            content: Rule content in GBNF format
        """
        # Keep the identifier and its GBNF body verbatim for later lookup.
        self.rule_name = rule_name
        self.content = content

# Arithmetic expressions
ARITHMETIC_GBNF: str
# C-like programming language syntax
C_GBNF: str
# Chess notation (algebraic notation)
CHESS_GBNF: str
# Japanese text patterns
JAPANESE_GBNF: str
# JSON array format
JSON_ARR_GBNF: str
# JSON object format
JSON_GBNF: str
# List structures
LIST_GBNF: str
# Default grammar root rule
LLAMA_GRAMMAR_DEFAULT_ROOT: str

from llama_cpp import Llama, LlamaGrammar
# Initialize model
llm = Llama(model_path="./models/llama-2-7b.gguf")

# Create simple grammar for yes/no responses
yes_no_grammar = """
root ::= response
response ::= "yes" | "no" | "Yes" | "No" | "YES" | "NO"
"""
grammar = LlamaGrammar.from_string(yes_no_grammar)

# Generate constrained response
response = llm.create_completion(
    prompt="Do you like pizza? Answer with yes or no:",
    max_tokens=10,
    grammar=grammar,
)
print(response['choices'][0]['text'])  # Will be "yes", "no", etc.

from llama_cpp import Llama, LlamaGrammar
import json

llm = Llama(model_path="./models/llama-2-7b.gguf")

# Define JSON schema for person data
person_schema = {
    "type": "object",
    "properties": {
        "name": {"type": "string"},
        "age": {"type": "integer", "minimum": 0, "maximum": 150},
        "email": {"type": "string", "format": "email"},
        "hobbies": {
            "type": "array",
            "items": {"type": "string"},
            "maxItems": 5
        }
    },
    "required": ["name", "age"],
    "additionalProperties": False
}

# Create grammar from schema (convert dict to string first).
# NOTE: dump the schema exactly once — a double json.dumps() would hand
# from_json_schema a quoted string literal instead of the schema object.
grammar = LlamaGrammar.from_json_schema(json.dumps(person_schema))

# Generate valid JSON
response = llm.create_completion(
    prompt="Generate a person's information in JSON format:",
    max_tokens=200,
    grammar=grammar,
    temperature=0.7,
)
generated_json = json.loads(response['choices'][0]['text'])
print(json.dumps(generated_json, indent=2))

from llama_cpp import Llama, LlamaGrammar
from llama_cpp.llama_grammar import JSON_GBNF, ARITHMETIC_GBNF

llm = Llama(model_path="./models/model.gguf")

# Constrain output with the ready-made JSON grammar
json_grammar = LlamaGrammar.from_string(JSON_GBNF)
response = llm.create_completion(
    prompt="Create a JSON object describing a book:",
    max_tokens=150,
    grammar=json_grammar,
)
print("JSON output:", response['choices'][0]['text'])

# Constrain output with the ready-made arithmetic grammar
math_grammar = LlamaGrammar.from_string(ARITHMETIC_GBNF)
math_response = llm.create_completion(
    prompt="Write a mathematical expression:",
    max_tokens=50,
    grammar=math_grammar,
)
print("Math expression:", math_response['choices'][0]['text'])

# Create grammar for structured code generation
# GBNF rules describing a tiny language of assignments, calls, and comments
code_grammar = """
root ::= program
program ::= statement+
statement ::= assignment | function_call | comment
assignment ::= variable " = " expression "\\n"
function_call ::= function_name "(" arguments? ")" "\\n"
comment ::= "# " [^\\n]* "\\n"
variable ::= [a-zA-Z_][a-zA-Z0-9_]*
function_name ::= [a-zA-Z_][a-zA-Z0-9_]*
expression ::= number | string | variable
arguments ::= expression ("," " " expression)*
number ::= [0-9]+
string ::= "\\"" [^"]* "\\""
"""

grammar = LlamaGrammar.from_string(code_grammar)
response = llm.create_completion(
    prompt="Write a Python program that processes data:",
    max_tokens=200,
    grammar=grammar,
)
print("Generated code:")
print(response['choices'][0]['text'])

# Generate structured conversation data
# Schema for a multi-party conversation transcript
conversation_schema = {
    "type": "object",
    "properties": {
        "conversation_id": {"type": "string"},
        "participants": {
            "type": "array",
            "items": {"type": "string"},
            "minItems": 2,
            "maxItems": 4
        },
        "messages": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    "speaker": {"type": "string"},
                    "message": {"type": "string"},
                    "timestamp": {"type": "string", "format": "time"}
                },
                "required": ["speaker", "message"]
            },
            "minItems": 1
        },
        "topic": {"type": "string"},
        "sentiment": {"enum": ["positive", "negative", "neutral"]}
    },
    "required": ["conversation_id", "participants", "messages", "topic"]
}

conversation_grammar = LlamaGrammar.from_json_schema(json.dumps(conversation_schema))

# Generate structured conversation
response = llm.create_completion(
    prompt="Create a conversation between friends about technology:",
    max_tokens=300,
    grammar=conversation_grammar,
    temperature=0.8,
)
conversation_data = json.loads(response['choices'][0]['text'])
print("Generated conversation:")
print(json.dumps(conversation_data, indent=2))

# Save grammar to file
weather_grammar = """
root ::= weather_report
weather_report ::= location " weather: " condition " " temperature " " humidity
location ::= [A-Z][a-z]+ ("," " " [A-Z][a-z]+)?
condition ::= "sunny" | "cloudy" | "rainy" | "snowy" | "overcast"
temperature ::= [0-9]+ "°" ("C" | "F")
humidity ::= [0-9]+ "%"
"""

# Persist the grammar to disk, then reload it via from_file
with open("weather_grammar.gbnf", "w") as f:
    f.write(weather_grammar)

grammar = LlamaGrammar.from_file("weather_grammar.gbnf")
response = llm.create_completion(
    prompt="Give me a weather report for New York:",
    max_tokens=50,
    grammar=grammar,
)
print("Weather report:", response['choices'][0]['text'])

# Complex nested schema with arrays and objects
# oneOf lets the data payload be either a user listing or null (error case)
api_response_schema = {
    "type": "object",
    "properties": {
        "status": {"enum": ["success", "error"]},
        "data": {
            "oneOf": [
                {
                    "type": "object",
                    "properties": {
                        "users": {
                            "type": "array",
                            "items": {
                                "type": "object",
                                "properties": {
                                    "id": {"type": "integer"},
                                    "username": {"type": "string"},
                                    "active": {"type": "boolean"}
                                },
                                "required": ["id", "username"]
                            }
                        },
                        "total_count": {"type": "integer"}
                    }
                },
                {
                    "type": "null"
                }
            ]
        },
        "error": {
            "type": "object",
            "properties": {
                "code": {"type": "integer"},
                "message": {"type": "string"}
            }
        }
    },
    "required": ["status"]
}

api_grammar = LlamaGrammar.from_json_schema(json.dumps(api_response_schema))

# Generate API response
response = llm.create_completion(
    prompt="Generate a successful API response with user data:",
    max_tokens=250,
    grammar=api_grammar,
)
api_data = json.loads(response['choices'][0]['text'])
print("API Response:")
print(json.dumps(api_data, indent=2))

# Enable verbose mode for grammar debugging
debug_grammar = """
root ::= greeting
greeting ::= salutation " " name punctuation
salutation ::= "Hello" | "Hi" | "Hey"
name ::= [A-Z][a-z]+
punctuation ::= "!" | "."
"""
grammar = LlamaGrammar.from_string(debug_grammar, verbose=True)

# This will show grammar parsing information
response = llm.create_completion(
    prompt="Say hello to someone:",
    max_tokens=20,
    grammar=grammar,
)
print("Greeting:", response['choices'][0]['text'])

import jsonschema
# Create schema and corresponding grammar
product_schema = {
    "type": "object",
    "properties": {
        "product_name": {"type": "string"},
        "price": {"type": "number", "minimum": 0},
        "in_stock": {"type": "boolean"},
        "categories": {
            "type": "array",
            "items": {"type": "string"},
            "uniqueItems": True
        }
    },
    "required": ["product_name", "price", "in_stock"]
}
grammar = LlamaGrammar.from_json_schema(json.dumps(product_schema))

# Generate product data
response = llm.create_completion(
    prompt="Create product information for an electronic device:",
    max_tokens=150,
    grammar=grammar,
)

# Validate the generated text against the original schema
try:
    product_data = json.loads(response['choices'][0]['text'])
    jsonschema.validate(product_data, product_schema)
    print("Generated valid product data:")
    print(json.dumps(product_data, indent=2))
except jsonschema.ValidationError as e:
    print(f"Validation error: {e}")
except json.JSONDecodeError as e:
    print(f"JSON parsing error: {e}")

# Install with Tessl CLI
npx tessl i tessl/pypi-llama-cpp-python