CtrlK
BlogDocsLog inGet started
Tessl Logo

tessl/pypi-llama-cpp-python

Python bindings for the llama.cpp library providing high-performance LLM inference with OpenAI-compatible APIs.

Pending
Overview
Eval results
Files

grammar.mddocs/

Grammar and Structured Generation

Constrained text generation using formal grammars (GBNF), JSON Schema validation, and built-in templates for structured outputs like JSON, code, and domain-specific formats.

Capabilities

Grammar-Based Generation

Control model output using formal grammar rules in GBNF (GGML BNF) format.

class LlamaGrammar:
    """A compiled GBNF grammar used to constrain model output during sampling.

    Pass an instance as the ``grammar=`` argument of the completion APIs.
    Prefer the classmethod constructors (``from_string``, ``from_file``,
    ``from_json_schema``) over calling ``__init__`` directly.
    """

    def __init__(self, grammar_str: str, verbose: bool = True):
        """
        Initialize grammar from GBNF string.

        Args:
            grammar_str: Grammar rules in GBNF format
            verbose: Enable verbose logging
        """

    @classmethod
    def from_string(
        cls, 
        grammar_str: str, 
        verbose: bool = True
    ) -> "LlamaGrammar":
        """
        Create grammar from GBNF string.

        Args:
            grammar_str: Grammar rules in GBNF format
            verbose: Enable verbose logging

        Returns:
            LlamaGrammar instance
        """

    @classmethod
    def from_file(
        cls, 
        file_path: str, 
        verbose: bool = True
    ) -> "LlamaGrammar":
        """
        Create grammar from GBNF file.

        Args:
            file_path: Path to GBNF grammar file
            verbose: Enable verbose logging

        Returns:
            LlamaGrammar instance
        """

    @classmethod
    def from_json_schema(
        cls, 
        json_schema: str, 
        verbose: bool = True
    ) -> "LlamaGrammar":
        """
        Create grammar from JSON Schema string.

        Note: the schema is passed as a *string*, not a dict — serialize a
        dict with ``json.dumps(schema)`` exactly once before calling this.

        Args:
            json_schema: JSON Schema as string (use json.dumps() to convert dict)
            verbose: Enable verbose logging

        Returns:
            LlamaGrammar instance with schema constraints
        """

JSON Schema Conversion

Convert JSON Schema specifications to GBNF grammar format for structured JSON generation.

def json_schema_to_gbnf(
    schema: dict,
    prop_order: Optional[List[str]] = None,
    allow_fetch: bool = True,
    dotall: bool = False,
    raw_pattern: bool = False
) -> str:
    """
    Convert JSON Schema to GBNF grammar format.

    Note: unlike ``LlamaGrammar.from_json_schema`` (which takes a serialized
    JSON string), this function accepts the schema as a dict.

    Args:
        schema: JSON Schema dictionary
        prop_order: Property ordering for object serialization
        allow_fetch: Allow fetching external schema references
        dotall: Enable dotall mode for regex patterns
        raw_pattern: Use raw pattern matching

    Returns:
        GBNF grammar string
    """

class SchemaConverter:
    """Stateful JSON Schema → GBNF converter.

    Holds conversion options so that multiple schemas can be converted with
    the same settings via repeated calls to ``convert``.
    """

    def __init__(
        self,
        prop_order: Optional[List[str]] = None,
        allow_fetch: bool = True,
        dotall: bool = False,
        raw_pattern: bool = False
    ):
        """
        Initialize schema converter.

        Args:
            prop_order: Default property ordering
            allow_fetch: Allow external references
            dotall: Dotall mode for patterns
            raw_pattern: Raw pattern mode
        """

    def convert(self, schema: dict) -> str:
        """
        Convert schema to GBNF format.

        Args:
            schema: JSON Schema dictionary

        Returns:
            GBNF grammar string
        """

Built-in Grammar Rules

Pre-defined grammar building blocks for common patterns.

class BuiltinRule:
    """A named, reusable GBNF fragment for composing larger grammars.

    Attributes:
        rule_name: Identifier the rule is referenced by within a grammar.
        content: The rule body, expressed in GBNF.
    """

    def __init__(self, rule_name: str, content: str):
        # The two fields are independent; simply record both.
        self.content = content
        self.rule_name = rule_name

Pre-defined Grammar Templates

# Module-level GBNF grammar strings, usable directly with LlamaGrammar.from_string().

# Arithmetic expressions
ARITHMETIC_GBNF: str

# C-like programming language syntax
C_GBNF: str

# Chess notation (algebraic notation)
CHESS_GBNF: str

# Japanese text patterns
JAPANESE_GBNF: str

# JSON array format (top-level array)
JSON_ARR_GBNF: str

# JSON object format
JSON_GBNF: str

# List structures
LIST_GBNF: str

# Default grammar root rule
LLAMA_GRAMMAR_DEFAULT_ROOT: str

Usage Examples

Basic Grammar Usage

from llama_cpp import Llama, LlamaGrammar

# Initialize model
llm = Llama(model_path="./models/llama-2-7b.gguf")

# Create simple grammar for yes/no responses.
# "root" is the entry rule; generation must match one of the listed literals.
yes_no_grammar = """
root ::= response
response ::= "yes" | "no" | "Yes" | "No" | "YES" | "NO"
"""

grammar = LlamaGrammar.from_string(yes_no_grammar)

# Generate constrained response: sampling is restricted to tokens
# that keep the output consistent with the grammar.
response = llm.create_completion(
    prompt="Do you like pizza? Answer with yes or no:",
    max_tokens=10,
    grammar=grammar,
)

print(response['choices'][0]['text'])  # Will be "yes", "no", etc.

JSON Schema Generation

import json

from llama_cpp import Llama, LlamaGrammar

llm = Llama(model_path="./models/llama-2-7b.gguf")

# Define JSON schema for person data
person_schema = {
    "type": "object",
    "properties": {
        "name": {"type": "string"},
        "age": {"type": "integer", "minimum": 0, "maximum": 150},
        "email": {"type": "string", "format": "email"},
        "hobbies": {
            "type": "array",
            "items": {"type": "string"},
            "maxItems": 5
        }
    },
    "required": ["name", "age"],
    "additionalProperties": False
}

# from_json_schema expects a JSON *string*, so serialize the dict exactly once.
# (Calling json.dumps twice would constrain output to a quoted string, not the object.)
grammar = LlamaGrammar.from_json_schema(json.dumps(person_schema))

# Generate valid JSON
response = llm.create_completion(
    prompt="Generate a person's information in JSON format:",
    max_tokens=200,
    grammar=grammar,
    temperature=0.7,
)

generated_json = json.loads(response['choices'][0]['text'])
print(json.dumps(generated_json, indent=2))

Pre-defined Grammar Templates

from llama_cpp import Llama, LlamaGrammar
from llama_cpp.llama_grammar import JSON_GBNF, ARITHMETIC_GBNF

llm = Llama(model_path="./models/model.gguf")

# Use built-in JSON grammar: JSON_GBNF is a ready-made GBNF string constant.
json_grammar = LlamaGrammar.from_string(JSON_GBNF)

response = llm.create_completion(
    prompt="Create a JSON object describing a book:",
    max_tokens=150,
    grammar=json_grammar,
)

print("JSON output:", response['choices'][0]['text'])

# Use arithmetic grammar (constrains output to arithmetic expressions)
math_grammar = LlamaGrammar.from_string(ARITHMETIC_GBNF)

math_response = llm.create_completion(
    prompt="Write a mathematical expression:",
    max_tokens=50,
    grammar=math_grammar,
)

print("Math expression:", math_response['choices'][0]['text'])

Complex Grammar Rules

# Create grammar for structured code generation
code_grammar = """
root ::= program
program ::= statement+
statement ::= assignment | function_call | comment
assignment ::= variable " = " expression "\\n"
function_call ::= function_name "(" arguments? ")" "\\n"
comment ::= "# " [^\\n]* "\\n"
variable ::= [a-zA-Z_][a-zA-Z0-9_]*
function_name ::= [a-zA-Z_][a-zA-Z0-9_]*
expression ::= number | string | variable
arguments ::= expression ("," " " expression)*
number ::= [0-9]+
string ::= "\\"" [^"]* "\\""
"""

grammar = LlamaGrammar.from_string(code_grammar)

response = llm.create_completion(
    prompt="Write a Python program that processes data:",
    max_tokens=200,
    grammar=grammar,
)

print("Generated code:")
print(response['choices'][0]['text'])

Multi-step Structured Generation

# Generate structured conversation data.
# NOTE: reuses `llm`, `LlamaGrammar`, and `json` from the earlier examples.
conversation_schema = {
    "type": "object",
    "properties": {
        "conversation_id": {"type": "string"},
        "participants": {
            "type": "array",
            "items": {"type": "string"},
            "minItems": 2,
            "maxItems": 4
        },
        "messages": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    "speaker": {"type": "string"},
                    "message": {"type": "string"},
                    "timestamp": {"type": "string", "format": "time"}
                },
                "required": ["speaker", "message"]
            },
            "minItems": 1
        },
        "topic": {"type": "string"},
        "sentiment": {"enum": ["positive", "negative", "neutral"]}
    },
    "required": ["conversation_id", "participants", "messages", "topic"]
}

# Serialize the dict once: from_json_schema takes a JSON string.
conversation_grammar = LlamaGrammar.from_json_schema(json.dumps(conversation_schema))

# Generate structured conversation
response = llm.create_completion(
    prompt="Create a conversation between friends about technology:",
    max_tokens=300,
    grammar=conversation_grammar,
    temperature=0.8,
)

conversation_data = json.loads(response['choices'][0]['text'])
print("Generated conversation:")
print(json.dumps(conversation_data, indent=2))

Grammar File Usage

# Save grammar to file, then load it back with LlamaGrammar.from_file.
weather_grammar = """
root ::= weather_report
weather_report ::= location " weather: " condition " " temperature " " humidity
location ::= [A-Z][a-z]+ ("," " " [A-Z][a-z]+)?
condition ::= "sunny" | "cloudy" | "rainy" | "snowy" | "overcast"
temperature ::= [0-9]+ "°" ("C" | "F")
humidity ::= [0-9]+ "%"
"""

# Write to file
with open("weather_grammar.gbnf", "w") as f:
    f.write(weather_grammar)

# Load from file
grammar = LlamaGrammar.from_file("weather_grammar.gbnf")

response = llm.create_completion(
    prompt="Give me a weather report for New York:",
    max_tokens=50,
    grammar=grammar,
)

print("Weather report:", response['choices'][0]['text'])

Advanced Schema Patterns

# Complex nested schema with arrays and objects.
# `oneOf` lets "data" be either the users payload or null (e.g. on error).
api_response_schema = {
    "type": "object",
    "properties": {
        "status": {"enum": ["success", "error"]},
        "data": {
            "oneOf": [
                {
                    "type": "object",
                    "properties": {
                        "users": {
                            "type": "array",
                            "items": {
                                "type": "object",
                                "properties": {
                                    "id": {"type": "integer"},
                                    "username": {"type": "string"},
                                    "active": {"type": "boolean"}
                                },
                                "required": ["id", "username"]
                            }
                        },
                        "total_count": {"type": "integer"}
                    }
                },
                {
                    "type": "null"
                }
            ]
        },
        "error": {
            "type": "object",
            "properties": {
                "code": {"type": "integer"},
                "message": {"type": "string"}
            }
        }
    },
    "required": ["status"]
}

api_grammar = LlamaGrammar.from_json_schema(json.dumps(api_response_schema))

# Generate API response
response = llm.create_completion(
    prompt="Generate a successful API response with user data:",
    max_tokens=250,
    grammar=api_grammar,
)

api_data = json.loads(response['choices'][0]['text'])
print("API Response:")
print(json.dumps(api_data, indent=2))

Grammar Debugging

# Enable verbose mode for grammar debugging
debug_grammar = """
root ::= greeting
greeting ::= salutation " " name punctuation
salutation ::= "Hello" | "Hi" | "Hey"
name ::= [A-Z][a-z]+
punctuation ::= "!" | "."
"""

# verbose=True emits grammar parsing/diagnostic output while compiling the rules.
grammar = LlamaGrammar.from_string(debug_grammar, verbose=True)

# This will show grammar parsing information
response = llm.create_completion(
    prompt="Say hello to someone:",
    max_tokens=20,
    grammar=grammar,
)

print("Greeting:", response['choices'][0]['text'])

Schema Validation Integration

import jsonschema

# Create schema and corresponding grammar
product_schema = {
    "type": "object",
    "properties": {
        "product_name": {"type": "string"},
        "price": {"type": "number", "minimum": 0},
        "in_stock": {"type": "boolean"},
        "categories": {
            "type": "array",
            "items": {"type": "string"},
            "uniqueItems": True
        }
    },
    "required": ["product_name", "price", "in_stock"]
}

grammar = LlamaGrammar.from_json_schema(json.dumps(product_schema))

# Generate product data
response = llm.create_completion(
    prompt="Create product information for an electronic device:",
    max_tokens=150,
    grammar=grammar,
)

# Validate the generated text against the original schema.
try:
    product_data = json.loads(response['choices'][0]['text'])
    # Fixed: removed stray closing parenthesis that made this line a SyntaxError.
    jsonschema.validate(product_data, product_schema)
    print("Generated valid product data:")
    print(json.dumps(product_data, indent=2))
except jsonschema.ValidationError as e:
    print(f"Validation error: {e}")
except json.JSONDecodeError as e:
    print(f"JSON parsing error: {e}")

Install with Tessl CLI

npx tessl i tessl/pypi-llama-cpp-python

docs

caching.md

chat-completion.md

grammar.md

index.md

llama-model.md

low-level.md

server.md

tokenization.md

vision.md

tile.json