Decoder-Only Embedders

Embedders designed for decoder-only transformer models (LLM-style architectures). These models repurpose large language models for embedding generation, typically taking the final hidden state of the last token as the sequence representation and supporting instruction-based query formatting.
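
For intuition, last-token pooling reads the model's final hidden state at the last non-padding position and uses it as the sequence embedding. A minimal sketch of the idea (illustrative only, not the library's internals; assumes right-padded batches):

import torch

def last_token_pool(hidden_states: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
    # hidden_states: (batch, seq_len, hidden); attention_mask: (batch, seq_len)
    last_idx = attention_mask.sum(dim=1) - 1            # last non-padding position
    batch_idx = torch.arange(hidden_states.size(0))
    emb = hidden_states[batch_idx, last_idx]            # (batch, hidden)
    return torch.nn.functional.normalize(emb, dim=-1)   # unit-length embeddings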

Capabilities

FlagLLMModel (Base LLM Embedder)

Standard embedder for decoder-only models using last-token pooling: the hidden state of the final token is taken as the sequence embedding. Designed for instruction-tuned large language models repurposed for retrieval.

from typing import Optional, List, Union

class FlagLLMModel(AbsEmbedder):
    def __init__(
        self,
        model_name_or_path: str,
        pooling_method: str = "last_token",
        normalize_embeddings: bool = True,
        use_fp16: bool = True,
        query_instruction_for_retrieval: Optional[str] = None,
        query_instruction_format: str = "Instruct: {}\nQuery: {}",
        devices: Optional[Union[str, List[str]]] = None,
        batch_size: int = 256,
        query_max_length: int = 512,
        passage_max_length: int = 512,
        convert_to_numpy: bool = True,
        **kwargs
    ):
        """
        Initialize decoder-only LLM embedder.
        
        Args:
            model_name_or_path: Path to model or HuggingFace model name
            pooling_method: Pooling strategy ("last_token")
            normalize_embeddings: Whether to normalize output embeddings
            use_fp16: Use half precision for inference
            query_instruction_for_retrieval: Instruction for retrieval queries
            query_instruction_format: Format string for instructions
            devices: List of devices for multi-GPU inference
            batch_size: Default batch size for encoding
            query_max_length: Maximum query token length
            passage_max_length: Maximum passage token length
            convert_to_numpy: Convert outputs to numpy arrays
            **kwargs: Additional model parameters
        """

FlagICLModel (In-Context Learning Embedder)

Specialized embedder for in-context learning with large language models: few-shot examples supplied in the prompt steer the embedding toward the target task without fine-tuning.

class FlagICLModel(AbsEmbedder):
    def __init__(
        self,
        model_name_or_path: str,
        pooling_method: str = "last_token",
        normalize_embeddings: bool = True,
        use_fp16: bool = True,
        query_instruction_for_retrieval: Optional[str] = None,
        query_instruction_format: str = "{}{}",
        devices: Optional[Union[str, List[str]]] = None,
        batch_size: int = 256,
        query_max_length: int = 512,
        passage_max_length: int = 512,
        convert_to_numpy: bool = True,
        **kwargs
    ):
        """
        Initialize in-context learning embedder.
        
        Args:
            model_name_or_path: Path to ICL-capable model
            pooling_method: Pooling strategy ("last_token")
            normalize_embeddings: Whether to normalize output embeddings
            use_fp16: Use half precision for inference
            query_instruction_for_retrieval: Instruction for retrieval queries
            query_instruction_format: Format string for instructions
            devices: List of devices for multi-GPU inference
            batch_size: Default batch size for encoding
            query_max_length: Maximum query token length
            passage_max_length: Maximum passage token length
            convert_to_numpy: Convert outputs to numpy arrays
            **kwargs: Additional model parameters
        """

Usage Examples

Basic LLM Embedder

from FlagEmbedding import FlagLLMModel

# Initialize LLM embedder with last token pooling
embedder = FlagLLMModel(
    'intfloat/e5-mistral-7b-instruct',
    pooling_method="last_token",
    use_fp16=True
)

# Encode queries and passages
queries = ["What are the applications of machine learning?"]
passages = ["Machine learning is applied in healthcare, finance, and autonomous systems"]

query_embeddings = embedder.encode_queries(queries)
passage_embeddings = embedder.encode_corpus(passages)

print(f"Query embedding shape: {query_embeddings.shape}")
print(f"Passage embedding shape: {passage_embeddings.shape}")

Custom Instruction Formatting

from FlagEmbedding import FlagLLMModel

# Use custom instruction format for queries
embedder = FlagLLMModel(
    'intfloat/e5-mistral-7b-instruct',
    query_instruction_for_retrieval="Given a question, retrieve relevant documents that answer the question",
    query_instruction_format="Instruct: {}\nQuery: {}",
    use_fp16=True
)

# Queries will be formatted with custom instructions
queries = ["How do neural networks learn?"]
embeddings = embedder.encode_queries(queries)

In-Context Learning Embedder

from FlagEmbedding import FlagICLModel

# Initialize ICL embedder for few-shot learning
embedder = FlagICLModel(
    'BAAI/bge-en-icl',
    use_fp16=True,
    batch_size=64  # Smaller batch for memory efficiency
)

# A quick way to provide context is to inline a demonstration in the query text
queries = [
    "Example: 'What is AI?' -> AI concepts. Query: 'What is machine learning?'"
]

embeddings = embedder.encode_queries(queries)
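
Rather than hand-crafting example strings, FlagICLModel can also take few-shot demonstrations via the examples_for_task argument, as in the published bge-en-icl usage; the dict keys ('instruct', 'query', 'response') follow that model card and are worth verifying for other checkpoints:

from FlagEmbedding import FlagICLModel

# Few-shot demonstrations prepended to each query at encoding time
examples = [
    {
        'instruct': 'Given a question, retrieve passages that answer it.',
        'query': 'What is AI?',
        'response': 'AI is the field of building systems that perform tasks requiring intelligence.'
    }
]

embedder = FlagICLModel(
    'BAAI/bge-en-icl',
    query_instruction_for_retrieval='Given a question, retrieve passages that answer it.',
    examples_for_task=examples,
    use_fp16=True
)

embeddings = embedder.encode_queries(["What is machine learning?"])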

Multi-GPU LLM Processing

from FlagEmbedding import FlagLLMModel

# Use multiple GPUs for large LLM models
embedder = FlagLLMModel(
    'intfloat/e5-mistral-7b-instruct',
    devices=['cuda:0', 'cuda:1'],
    batch_size=32,  # Smaller batch size for large models
    use_fp16=True
)

# Process documents efficiently across GPUs
documents = [f"Document {i} content" for i in range(1000)]
embeddings = embedder.encode_corpus(documents)

Custom Max Length Settings

from FlagEmbedding import FlagLLMModel

# Configure different max lengths for queries vs passages
embedder = FlagLLMModel(
    'intfloat/e5-mistral-7b-instruct',
    query_max_length=256,     # Shorter for queries
    passage_max_length=1024,  # Longer for passages
    use_fp16=True
)

# Long passage encoding (tokens beyond passage_max_length are truncated)
long_passage = "Very long document content..." * 100
passage_embedding = embedder.encode_corpus([long_passage])

Retrieval-Specific Instructions

from FlagEmbedding import FlagLLMModel

# Specialized instructions for different retrieval tasks
qa_embedder = FlagLLMModel(
    'intfloat/e5-mistral-7b-instruct',
    query_instruction_for_retrieval="Represent this question for retrieving relevant answers",
    query_instruction_format="Task: {}\nInput: {}"
)

semantic_embedder = FlagLLMModel(
    'intfloat/e5-mistral-7b-instruct',
    query_instruction_for_retrieval="Encode this text for semantic similarity search",
    query_instruction_format="{}: {}"
)

# Different use cases
qa_queries = ["What causes climate change?"]
semantic_queries = ["renewable energy technologies"]

qa_embeddings = qa_embedder.encode_queries(qa_queries)
semantic_embeddings = semantic_embedder.encode_queries(semantic_queries)

Comparing Decoder vs Encoder Models

from FlagEmbedding import FlagLLMModel, FlagModel

# Decoder-only model
llm_embedder = FlagLLMModel('intfloat/e5-mistral-7b-instruct')

# Encoder-only model
encoder_embedder = FlagModel('BAAI/bge-large-en-v1.5')

text = ["Machine learning algorithms"]

# Both produce embeddings, but at different dimensionalities and costs:
# e5-mistral-7b-instruct emits 4096-dim vectors, bge-large-en-v1.5 emits 1024-dim
llm_emb = llm_embedder.encode(text)
encoder_emb = encoder_embedder.encode(text)

print(f"LLM embedding shape: {llm_emb.shape}")
print(f"Encoder embedding shape: {encoder_emb.shape}")

Memory-Efficient Processing

from FlagEmbedding import FlagLLMModel

# Configure for memory-constrained environments
embedder = FlagLLMModel(
    'intfloat/e5-mistral-7b-instruct',
    use_fp16=True,
    batch_size=8,          # Very small batch
    devices=['cuda:0'],    # Single GPU
    convert_to_numpy=True  # Move results to CPU as numpy arrays
)

# Process in smaller chunks; encode_corpus already batches internally, so
# chunking here mainly bounds peak host memory for the collected results
large_corpus = [f"Document {i}" for i in range(10000)]
chunk_size = 100

all_embeddings = []
for i in range(0, len(large_corpus), chunk_size):
    chunk = large_corpus[i:i+chunk_size]
    chunk_embeddings = embedder.encode_corpus(chunk)
    all_embeddings.append(chunk_embeddings)
    
# Combine results
import numpy as np
final_embeddings = np.vstack(all_embeddings)
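
If even the stacked result is too large for host memory, one option is to stream chunk embeddings into a memory-mapped .npy file. A sketch, assuming float32 storage and a fixed embedding dimensionality inferred from the first chunk:

import numpy as np

# Write embeddings chunk by chunk so the full matrix never sits in RAM
first = embedder.encode_corpus(large_corpus[:chunk_size])
out = np.lib.format.open_memmap(
    'embeddings.npy', mode='w+',
    dtype=np.float32, shape=(len(large_corpus), first.shape[1])
)
out[:chunk_size] = first
for i in range(chunk_size, len(large_corpus), chunk_size):
    out[i:i+chunk_size] = embedder.encode_corpus(large_corpus[i:i+chunk_size])
out.flush()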

Supported Models

E5 LLM Models

  • e5-mistral-7b-instruct (instruction-tuned Mistral)

BGE LLM Models

  • bge-en-icl (in-context learning model)
  • bge-multilingual-gemma2

GTE LLM Models

  • gte-Qwen2-7B-instruct
  • gte-Qwen2-1.5B-instruct
  • gte-Qwen1.5-7B-instruct

Model Selection Guidelines

When to Use FlagLLMModel

  • Working with instruction-tuned language models
  • Need natural language understanding in embeddings
  • Have computational resources for larger models
  • Want to leverage instruction following capabilities

When to Use FlagICLModel

  • Need few-shot learning capabilities
  • Working with domain-specific tasks
  • Want to provide examples in context
  • Need adaptability without fine-tuning

Types

from typing import Optional, List, Union, Literal
import torch
import numpy as np

# Decoder-specific pooling (only last_token supported)
DecoderPoolingMethod = Literal["last_token"]

# Instruction format templates
InstructionTemplate = str  # Format string with {} placeholders
