CtrlK
Blog · Docs · Log in · Get started
Tessl Logo

tessl/pypi-pyllamacpp

Python bindings for llama.cpp enabling efficient local language model inference without external API dependencies

Pending
Overview
Eval results
Files

docs/langchain-integration.md

LangChain Integration

LangChain-compatible wrapper class enabling seamless integration with LangChain workflows and chains. The PyllamacppLLM class provides the same interface as other LangChain LLM implementations, allowing drop-in replacement in existing LangChain applications.

Capabilities

LangChain LLM Wrapper

A full-featured LangChain LLM implementation that wraps pyllamacpp's Model class, providing compatibility with LangChain's ecosystem of tools, chains, and agents.

class PyllamacppLLM(LLM):
    """
    LangChain-compatible wrapper for Pyllamacpp models.

    Inherits from langchain.llms.base.LLM and provides
    full compatibility with LangChain workflows.

    The fields below are declarative configuration (pydantic-style, as
    used by LangChain LLM subclasses): the GGML model path plus the
    llama.cpp context-loading and sampling parameters forwarded to the
    underlying Model class.
    """

    # Required model path
    model: str
    """Path to the GGML model file."""

    # Context and model parameters
    n_ctx: int = 512
    """Token context window size."""

    seed: int = 0
    """Random seed for generation. If -1, uses random seed."""

    f16_kv: bool = False
    """Use half-precision for key/value cache."""

    logits_all: bool = False
    """Return logits for all tokens, not just the last token."""

    vocab_only: bool = False
    """Only load the vocabulary, no weights."""

    use_mlock: bool = False
    """Force system to keep model in RAM."""

    embedding: bool = False
    """Use embedding mode only."""

    # Generation parameters
    n_threads: int = 4
    """Number of CPU threads to use."""

    n_predict: int = 50
    """Maximum number of tokens to generate."""

    temp: float = 0.8
    """Temperature for sampling (higher = more random)."""

    top_p: float = 0.95
    """Top-p nucleus sampling parameter."""

    top_k: int = 40
    """Top-k sampling parameter."""

    echo: bool = False
    """Whether to echo the input prompt in output."""

    # NOTE(review): a mutable [] default is only safe if LLM is a pydantic
    # model (pydantic deep-copies field defaults per instance) — confirm
    # the base class before relying on this.
    stop: List[str] = []
    """List of strings to stop generation when encountered."""

    repeat_last_n: int = 64
    """Last n tokens to consider for repetition penalty."""

    repeat_penalty: float = 1.3
    """Penalty factor for repeated tokens."""

    n_batch: int = 1
    """Batch size for prompt processing."""

    streaming: bool = False
    """Whether to stream results (not yet implemented)."""

Basic Usage

from pyllamacpp.langchain_llm import PyllamacppLLM
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain

# Local model wrapper with moderate sampling settings.
local_llm = PyllamacppLLM(
    model="/path/to/model.ggml",
    temp=0.7,
    n_predict=100,
    top_p=0.9,
    top_k=40,
)

# The chain substitutes {question} into this scaffold at run time.
qa_template = """
Question: {question}

Answer: Let me think about this step by step.
"""
qa_prompt = PromptTemplate(template=qa_template, input_variables=["question"])

# Wire the prompt and the model together into a runnable chain.
chain = LLMChain(prompt=qa_prompt, llm=local_llm)

# Generate and print a response for a single question.
question = "What are the benefits of renewable energy?"
print(chain.run(question))

Advanced LangChain Integration

from pyllamacpp.langchain_llm import PyllamacppLLM
from langchain.chains import ConversationChain
from langchain.memory import ConversationBufferMemory
from langchain.prompts import PromptTemplate

# Larger context window plus stop sequences suited to chat-style turns.
llm = PyllamacppLLM(
    model="/path/to/model.ggml",
    n_ctx=2048,
    temp=0.8,
    top_p=0.95,
    top_k=40,
    repeat_penalty=1.1,
    n_predict=200,
    stop=["Human:", "AI:"],
)

# Buffer memory carries the full transcript between turns.
conversation = ConversationChain(
    llm=llm,
    memory=ConversationBufferMemory(),
    verbose=True,
)

# Multi-turn conversation: each predict() sees the prior exchange.
print(conversation.predict(input="Tell me about machine learning"))
print(conversation.predict(input="What are some practical applications?"))

Custom Prompt Templates

from pyllamacpp.langchain_llm import PyllamacppLLM
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain

# Slightly lower temperature for instruction-following output.
llm = PyllamacppLLM(
    model="/path/to/model.ggml",
    temp=0.75,
    n_predict=150,
)

# Instruction-following template (Alpaca-style layout).
instruction_template = """
Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
{instruction}

### Response:
"""

# Single-variable prompt: {instruction} is filled in per call.
instruction_chain = LLMChain(
    prompt=PromptTemplate(
        template=instruction_template,
        input_variables=["instruction"],
    ),
    llm=llm,
)

# Run the chain and show the model's response.
print(instruction_chain.run(instruction="Explain how neural networks work"))

RAG (Retrieval-Augmented Generation) Example

from pyllamacpp.langchain_llm import PyllamacppLLM
from langchain.chains import RetrievalQA
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.document_loaders import TextLoader

# Split the source document into retrieval-sized chunks.
raw_documents = TextLoader("document.txt").load()
splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
chunks = splitter.split_documents(raw_documents)

# Index the chunks in an in-memory FAISS vector store.
db = FAISS.from_documents(chunks, HuggingFaceEmbeddings())

# Low temperature keeps answers grounded in the retrieved text.
llm = PyllamacppLLM(
    model="/path/to/model.ggml",
    temp=0.3,
    n_predict=200,
    top_p=0.9,
)

# "stuff" chain type: all retrieved chunks go into a single prompt.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=db.as_retriever(),
)

# Ask questions about the documents.
question = "What are the main topics discussed in the document?"
print(qa_chain.run(question))

Parameter Configuration

All LangChain-specific parameters can be configured during initialization:

# Model initialization parameters (passed to Model class), gathered in a
# dict so the whole configuration can be inspected or reused in one place.
settings = {
    # Context parameters
    "n_ctx": 2048,
    "seed": 42,
    "f16_kv": True,
    "logits_all": False,
    "vocab_only": False,
    "use_mlock": True,
    "embedding": False,
    # Generation parameters
    "n_threads": 8,
    "n_predict": 200,
    "temp": 0.8,
    "top_p": 0.95,
    "top_k": 40,
    "repeat_last_n": 64,
    "repeat_penalty": 1.1,
    "n_batch": 8,
    # LangChain-specific parameters
    "echo": False,
    "stop": ["Human:", "Assistant:", "\n\n"],
    "streaming": False,
}

llm = PyllamacppLLM(model="/path/to/model.ggml", **settings)

Error Handling

from pyllamacpp.langchain_llm import PyllamacppLLM

# A bad model path surfaces as ValueError at construction time.
try:
    llm = PyllamacppLLM(model="/path/to/nonexistent/model.ggml")
except ValueError as e:
    # Fallback to different model or error handling
    print(f"Model loading failed: {e}")

# Generation can fail for other reasons; this is a top-level boundary
# in the example, so a broad catch with a message is acceptable here.
try:
    completion = llm("Generate a very long response...")
except Exception as e:
    # Handle generation errors
    print(f"Generation failed: {e}")

Integration with LangChain Ecosystem

The PyllamacppLLM class integrates seamlessly with the broader LangChain ecosystem:

  • Chains: Use with SequentialChain, TransformChain, etc.
  • Agents: Compatible with LangChain agents and tools
  • Memory: Works with all LangChain memory implementations
  • Callbacks: Supports LangChain callback system
  • Async: Future support for async operations

Example with agents:

from langchain.agents import initialize_agent, Tool
from langchain.agents import AgentType

# Define tools
def calculator(expression):
    """Safely evaluate a basic arithmetic expression, returning the result as a string.

    Replaces the original ``eval`` call: ``eval`` on agent-supplied text can
    execute arbitrary Python, so only numeric literals and arithmetic
    operators (+ - * / // % ** and unary +/-) are accepted here.

    Args:
        expression: Arithmetic expression, e.g. ``"15 * 23 + 45"``.

    Returns:
        str: The computed value, stringified (matching the old behavior,
        where ``str(eval(...))`` was returned).

    Raises:
        ValueError: If the expression contains anything non-arithmetic.
        SyntaxError: If the expression does not parse as Python at all.
    """
    import ast
    import operator

    # Whitelist of AST operator node types -> their arithmetic functions.
    ops = {
        ast.Add: operator.add,
        ast.Sub: operator.sub,
        ast.Mult: operator.mul,
        ast.Div: operator.truediv,
        ast.FloorDiv: operator.floordiv,
        ast.Mod: operator.mod,
        ast.Pow: operator.pow,
        ast.USub: operator.neg,
        ast.UAdd: operator.pos,
    }

    def _eval(node):
        # Leaves: plain int/float literals only.
        if isinstance(node, ast.Constant) and isinstance(node.value, (int, float)):
            return node.value
        # Binary and unary arithmetic using whitelisted operators.
        if isinstance(node, ast.BinOp) and type(node.op) in ops:
            return ops[type(node.op)](_eval(node.left), _eval(node.right))
        if isinstance(node, ast.UnaryOp) and type(node.op) in ops:
            return ops[type(node.op)](_eval(node.operand))
        # Anything else (names, calls, attributes, ...) is rejected.
        raise ValueError(f"Unsupported expression: {expression!r}")

    return str(_eval(ast.parse(expression, mode="eval").body))

# Expose the calculator to the agent as a named tool.
calc_tool = Tool(
    name="Calculator",
    func=calculator,
    description="Useful for mathematical calculations"
)
tools = [calc_tool]

# Initialize agent with PyllamacppLLM; the zero-shot ReAct agent decides
# from the tool descriptions when to call the calculator.
agent = initialize_agent(
    tools=tools,
    llm=llm,
    agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
    verbose=True,
)

# Use the agent and print its final answer.
print(agent.run("What is 15 * 23 + 45?"))

Install with Tessl CLI

npx tessl i tessl/pypi-pyllamacpp

docs

cli.md

embeddings.md

index.md

langchain-integration.md

model-operations.md

utilities.md

web-ui.md

tile.json