Python bindings for llama.cpp enabling efficient local language model inference without external API dependencies
—
LangChain-compatible wrapper class enabling seamless integration with LangChain workflows and chains. The PyllamacppLLM class provides the same interface as other LangChain LLM implementations, allowing drop-in replacement in existing LangChain applications.
A full-featured LangChain LLM implementation that wraps pyllamacpp's Model class, providing compatibility with LangChain's ecosystem of tools, chains, and agents.
class PyllamacppLLM(LLM):
    """
    LangChain-compatible wrapper for Pyllamacpp models.
    Inherits from langchain.llms.base.LLM and provides
    full compatibility with LangChain workflows.
    """
    # Required model path
    model: str
    """Path to the GGML model file."""
    # Context and model parameters (applied when the model is loaded)
    n_ctx: int = 512
    """Token context window size."""
    seed: int = 0
    """Random seed for generation. If -1, uses random seed."""
    f16_kv: bool = False
    """Use half-precision for key/value cache."""
    logits_all: bool = False
    """Return logits for all tokens, not just the last token."""
    vocab_only: bool = False
    """Only load the vocabulary, no weights."""
    use_mlock: bool = False
    """Force system to keep model in RAM."""
    embedding: bool = False
    """Use embedding mode only."""
    # Generation parameters (applied on each call)
    n_threads: int = 4
    """Number of CPU threads to use."""
    n_predict: int = 50
    """Maximum number of tokens to generate."""
    temp: float = 0.8
    """Temperature for sampling (higher = more random)."""
    top_p: float = 0.95
    """Top-p nucleus sampling parameter."""
    top_k: int = 40
    """Top-k sampling parameter."""
    echo: bool = False
    """Whether to echo the input prompt in output."""
    # NOTE(review): a mutable default is safe here only if LLM is a pydantic
    # BaseModel (pydantic copies field defaults per instance); otherwise this
    # list would be shared across instances — confirm the base class.
    stop: List[str] = []
    """List of strings to stop generation when encountered."""
    repeat_last_n: int = 64
    """Last n tokens to consider for repetition penalty."""
    repeat_penalty: float = 1.3
    """Penalty factor for repeated tokens."""
    n_batch: int = 1
    """Batch size for prompt processing."""
    streaming: bool = False
"""Whether to stream results (not yet implemented)."""from pyllamacpp.langchain_llm import PyllamacppLLM
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain

# Initialize the local LLM with moderately creative sampling settings.
llm = PyllamacppLLM(
    model="/path/to/model.ggml",
    temp=0.7,
    n_predict=100,
    top_p=0.9,
    top_k=40,
)

# Step-by-step answer template, wrapped in a LangChain PromptTemplate.
template = """
Question: {question}
Answer: Let me think about this step by step.
"""
qa_prompt = PromptTemplate(template=template, input_variables=["question"])
llm_chain = LLMChain(prompt=qa_prompt, llm=llm)

# Run the chain on a single question.
question = "What are the benefits of renewable energy?"
answer = llm_chain.run(question)
print(answer)

from pyllamacpp.langchain_llm import PyllamacppLLM
from langchain.chains import ConversationChain
from langchain.memory import ConversationBufferMemory
from langchain.prompts import PromptTemplate

# Configure the LLM with a larger context window and stop sequences
# that end generation at the next speaker turn.
llm = PyllamacppLLM(
    model="/path/to/model.ggml",
    n_ctx=2048,
    temp=0.8,
    top_p=0.95,
    top_k=40,
    repeat_penalty=1.1,
    n_predict=200,
    stop=["Human:", "AI:"],
)

# A buffer memory hands the chain the running transcript on every turn.
chat_memory = ConversationBufferMemory()
conversation = ConversationChain(llm=llm, memory=chat_memory, verbose=True)

# Two turns of the same conversation; the second relies on the memory.
response1 = conversation.predict(input="Tell me about machine learning")
print(response1)
response2 = conversation.predict(input="What are some practical applications?")
print(response2)

from pyllamacpp.langchain_llm import PyllamacppLLM
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain

# Slightly lower temperature for instruction-following output.
llm = PyllamacppLLM(model="/path/to/model.ggml", temp=0.75, n_predict=150)

# Instruction-following template.
instruction_template = """
Below is an instruction that describes a task. Write a response that appropriately completes the request.
### Instruction:
{instruction}
### Response:
"""

prompt = PromptTemplate(
    template=instruction_template,
    input_variables=["instruction"],
)
instruction_chain = LLMChain(prompt=prompt, llm=llm)

# Run the chain on a single instruction.
result = instruction_chain.run(instruction="Explain how neural networks work")
print(result)

from pyllamacpp.langchain_llm import PyllamacppLLM
from langchain.chains import RetrievalQA
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.document_loaders import TextLoader

# Load the source document and split it into fixed-size chunks.
documents = TextLoader("document.txt").load()
splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
chunks = splitter.split_documents(documents)

# Index the chunks in a FAISS vector store for similarity retrieval.
vector_store = FAISS.from_documents(chunks, HuggingFaceEmbeddings())

# Low temperature keeps answers close to the retrieved context.
llm = PyllamacppLLM(
    model="/path/to/model.ggml",
    temp=0.3,
    n_predict=200,
    top_p=0.9,
)

# Build the retrieval-QA (RAG) chain over the vector store.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vector_store.as_retriever(),
)

# Ask a question grounded in the indexed document.
question = "What are the main topics discussed in the document?"
answer = qa_chain.run(question)
print(answer)

All LangChain-specific parameters can be configured during initialization:
# Model initialization parameters (passed through to the Model class)
llm = PyllamacppLLM(
    model="/path/to/model.ggml",
    # Context parameters (applied at model load time)
    n_ctx=2048,
    seed=42,
    f16_kv=True,
    logits_all=False,
    vocab_only=False,
    use_mlock=True,
    embedding=False,
    # Generation parameters (applied on each call)
    n_threads=8,
    n_predict=200,
    temp=0.8,
    top_p=0.95,
    top_k=40,
    repeat_last_n=64,
    repeat_penalty=1.1,
    n_batch=8,
    # LangChain-specific parameters
    echo=False,
    stop=["Human:", "Assistant:", "\n\n"],
    streaming=False
)

from pyllamacpp.langchain_llm import PyllamacppLLM
# Guard model construction so a bad path yields a handled error, not a crash.
llm = None
try:
    llm = PyllamacppLLM(model="/path/to/nonexistent/model.ggml")
except ValueError as e:
    print(f"Model loading failed: {e}")
    # Fallback to different model or error handling

# Only attempt generation if the model actually loaded; in the original,
# `llm` was unbound here after a failed load, so the second try raised an
# unrelated NameError instead of demonstrating generation error handling.
if llm is not None:
    try:
        response = llm("Generate a very long response...")
    except Exception as e:
        print(f"Generation failed: {e}")
        # Handle generation errors

The PyllamacppLLM class integrates seamlessly with the broader LangChain ecosystem:
Example with agents:
from langchain.agents import initialize_agent, Tool
from langchain.agents import AgentType
# Define tools
def calculator(expression):
    """Evaluate a basic arithmetic expression and return the result as a string.

    Parses the expression into an AST and reduces it over a whitelist of
    arithmetic operators instead of calling ``eval``: the expression comes
    from the agent/LLM (effectively untrusted input), and ``eval`` would
    allow arbitrary code execution.

    Raises:
        ValueError: if the expression contains anything other than numbers
            and basic arithmetic operators.
    """
    import ast
    import operator

    ops = {
        ast.Add: operator.add,
        ast.Sub: operator.sub,
        ast.Mult: operator.mul,
        ast.Div: operator.truediv,
        ast.Mod: operator.mod,
        ast.Pow: operator.pow,
        ast.USub: operator.neg,
        ast.UAdd: operator.pos,
    }

    def evaluate(node):
        # Recursively reduce the AST, rejecting any non-arithmetic node.
        if isinstance(node, ast.Expression):
            return evaluate(node.body)
        if isinstance(node, ast.Constant) and isinstance(node.value, (int, float)):
            return node.value
        if isinstance(node, ast.BinOp) and type(node.op) in ops:
            return ops[type(node.op)](evaluate(node.left), evaluate(node.right))
        if isinstance(node, ast.UnaryOp) and type(node.op) in ops:
            return ops[type(node.op)](evaluate(node.operand))
        raise ValueError(f"Unsupported expression: {expression!r}")

    return str(evaluate(ast.parse(expression, mode="eval")))
# Expose the calculator to the agent as a named tool.
calculator_tool = Tool(
    name="Calculator",
    func=calculator,
    description="Useful for mathematical calculations",
)
tools = [calculator_tool]

# Wire the tool list and the local LLM into a zero-shot ReAct agent.
agent = initialize_agent(
    tools=tools,
    llm=llm,
    agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
    verbose=True,
)

# Ask a question that should route through the Calculator tool.
result = agent.run("What is 15 * 23 + 45?")
print(result)

Install with Tessl CLI
npx tessl i tessl/pypi-pyllamacpp