FlagEmbedding - BGE: One-Stop Retrieval Toolkit For Search and RAG
Embedders designed for decoder-only transformer models (LLM-like architectures). These models leverage large language model capabilities for embedding generation, often using the last token for representation and supporting instruction-based formatting.
Standard embedder for decoder-only models using last token pooling. Designed for large language models that generate embeddings through their natural language understanding capabilities.
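To make the "last token" idea concrete, here is a minimal sketch of last-token pooling written directly against Hugging Face transformers; it is illustrative, not FlagEmbedding's internal code, and assumes right-padded batches:

import torch
from transformers import AutoModel, AutoTokenizer

def last_token_pool(hidden_states, attention_mask):
    # Index of the last non-padding token in each sequence (right padding assumed)
    last_positions = attention_mask.sum(dim=1) - 1
    batch_indices = torch.arange(hidden_states.size(0))
    return hidden_states[batch_indices, last_positions]

tokenizer = AutoTokenizer.from_pretrained('intfloat/e5-mistral-7b-instruct')
model = AutoModel.from_pretrained('intfloat/e5-mistral-7b-instruct')
inputs = tokenizer(["Instruct: Retrieve relevant documents.\nQuery: what is ML?"],
                   return_tensors="pt", padding=True)
with torch.no_grad():
    hidden = model(**inputs).last_hidden_state
embedding = last_token_pool(hidden, inputs["attention_mask"])
embedding = torch.nn.functional.normalize(embedding, dim=-1)  # unit norm, matching normalize_embeddings=True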
from typing import Optional, List, Union

from FlagEmbedding.abc.inference import AbsEmbedder

class FlagLLMModel(AbsEmbedder):
    def __init__(
        self,
        model_name_or_path: str,
        pooling_method: str = "last_token",
        normalize_embeddings: bool = True,
        use_fp16: bool = True,
        query_instruction_for_retrieval: Optional[str] = None,
        query_instruction_format: str = "Instruct: {}\nQuery: {}",
        devices: Optional[Union[str, List[str]]] = None,
        batch_size: int = 256,
        query_max_length: int = 512,
        passage_max_length: int = 512,
        convert_to_numpy: bool = True,
        **kwargs
    ):
        """
        Initialize decoder-only LLM embedder.

        Args:
            model_name_or_path: Path to model or HuggingFace model name
            pooling_method: Pooling strategy ("last_token")
            normalize_embeddings: Whether to normalize output embeddings
            use_fp16: Use half precision for inference
            query_instruction_for_retrieval: Instruction for retrieval queries
            query_instruction_format: Format string for instructions
            devices: List of devices for multi-GPU inference
            batch_size: Default batch size for encoding
            query_max_length: Maximum query token length
            passage_max_length: Maximum passage token length
            convert_to_numpy: Convert outputs to numpy arrays
            **kwargs: Additional model parameters
        """

Specialized embedder for in-context learning approaches with large language models. Leverages few-shot examples and context to generate high-quality embeddings.
class FlagICLModel(AbsEmbedder):
    def __init__(
        self,
        model_name_or_path: str,
        pooling_method: str = "last_token",
        normalize_embeddings: bool = True,
        use_fp16: bool = True,
        query_instruction_for_retrieval: Optional[str] = None,
        query_instruction_format: str = "{}{}",
        devices: Optional[Union[str, List[str]]] = None,
        batch_size: int = 256,
        query_max_length: int = 512,
        passage_max_length: int = 512,
        convert_to_numpy: bool = True,
        **kwargs
    ):
        """
        Initialize in-context learning embedder.

        Args:
            model_name_or_path: Path to ICL-capable model
            pooling_method: Pooling strategy ("last_token")
            normalize_embeddings: Whether to normalize output embeddings
            use_fp16: Use half precision for inference
            query_instruction_for_retrieval: Instruction for retrieval queries
            query_instruction_format: Format string for instructions
            devices: List of devices for multi-GPU inference
            batch_size: Default batch size for encoding
            query_max_length: Maximum query token length
            passage_max_length: Maximum passage token length
            convert_to_numpy: Convert outputs to numpy arrays
            **kwargs: Additional model parameters
        """

from FlagEmbedding import FlagLLMModel
# Initialize LLM embedder with last token pooling
embedder = FlagLLMModel(
    'intfloat/e5-mistral-7b-instruct',
    pooling_method="last_token",
    use_fp16=True
)
# Encode queries and passages
queries = ["What are the applications of machine learning?"]
passages = ["Machine learning is applied in healthcare, finance, and autonomous systems"]
query_embeddings = embedder.encode_queries(queries)
passage_embeddings = embedder.encode_corpus(passages)
print(f"Query embedding shape: {query_embeddings.shape}")
print(f"Passage embedding shape: {passage_embeddings.shape}")
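Because embeddings are L2-normalized by default, relevance scores between queries and passages reduce to dot products; a short follow-up sketch using the arrays from above:

# Dot product equals cosine similarity for unit-norm embeddings
scores = query_embeddings @ passage_embeddings.T
print(f"Similarity scores: {scores}")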
from FlagEmbedding import FlagLLMModel

# Use custom instruction format for queries
embedder = FlagLLMModel(
    'intfloat/e5-mistral-7b-instruct',
    query_instruction_for_retrieval="Given a question, retrieve relevant documents that answer the question",
    query_instruction_format="Instruct: {}\nQuery: {}",
    use_fp16=True
)
# Queries will be formatted with custom instructions
queries = ["How do neural networks learn?"]
embeddings = embedder.encode_queries(queries)
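The template is applied as an ordinary Python format string, instruction first and query second; a sketch of the resulting model input (assuming this is how the template is filled):

template = "Instruct: {}\nQuery: {}"
formatted = template.format(
    "Given a question, retrieve relevant documents that answer the question",
    "How do neural networks learn?"
)
print(formatted)
# Instruct: Given a question, retrieve relevant documents that answer the question
# Query: How do neural networks learn?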
from FlagEmbedding import FlagICLModel

# Initialize ICL embedder for few-shot learning
embedder = FlagICLModel(
    'BAAI/bge-en-icl',
    use_fp16=True,
    batch_size=64  # Smaller batch for memory efficiency
)
# ICL works well with few-shot examples embedded in the query context
queries = [
    "Example: 'What is AI?' -> AI concepts. Query: 'What is machine learning?'"
]
embeddings = embedder.encode_queries(queries)
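Few-shot examples can also be passed in structured form through the examples_for_task argument, as shown on the bge-en-icl model card; the example content below is illustrative:

from FlagEmbedding import FlagICLModel

examples = [
    {
        "instruct": "Given a web search query, retrieve relevant passages that answer the query.",
        "query": "what is machine learning",
        "response": "Machine learning is a field of AI that learns patterns from data."
    }
]
embedder = FlagICLModel(
    'BAAI/bge-en-icl',
    query_instruction_for_retrieval="Given a web search query, retrieve relevant passages that answer the query.",
    examples_for_task=examples,
    use_fp16=True
)
embeddings = embedder.encode_queries(["what is deep learning"])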
from FlagEmbedding import FlagLLMModel

# Use multiple GPUs for large LLM models
embedder = FlagLLMModel(
    'intfloat/e5-mistral-7b-instruct',
    devices=['cuda:0', 'cuda:1'],
    batch_size=32,  # Smaller batch size for large models
    use_fp16=True
)
# Process documents efficiently across GPUs
documents = [f"Document {i} content" for i in range(1000)]
embeddings = embedder.encode_corpus(documents)

from FlagEmbedding import FlagLLMModel
# Configure different max lengths for queries vs passages
embedder = FlagLLMModel(
    'intfloat/e5-mistral-7b-instruct',
    query_max_length=256,  # Shorter for queries
    passage_max_length=1024,  # Longer for passages
    use_fp16=True
)
# Long passage encoding
long_passage = "Very long document content..." * 100
passage_embedding = embedder.encode_corpus([long_passage])
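Inputs longer than passage_max_length are truncated, so it is worth measuring real token counts before settling on a limit; a sketch using the model's tokenizer via transformers (an assumption about tooling, not a FlagEmbedding API):

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('intfloat/e5-mistral-7b-instruct')
n_tokens = len(tokenizer("Very long document content..." * 100)["input_ids"])
print(f"Token count: {n_tokens}")  # anything beyond passage_max_length is cut off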
from FlagEmbedding import FlagLLMModel

# Specialized instructions for different retrieval tasks
qa_embedder = FlagLLMModel(
    'intfloat/e5-mistral-7b-instruct',
    query_instruction_for_retrieval="Represent this question for retrieving relevant answers",
    query_instruction_format="Task: {}\nInput: {}"
)
semantic_embedder = FlagLLMModel(
    'intfloat/e5-mistral-7b-instruct',
    query_instruction_for_retrieval="Encode this text for semantic similarity search",
    query_instruction_format="{}: {}"
)
# Different use cases
qa_queries = ["What causes climate change?"]
semantic_queries = ["renewable energy technologies"]
qa_embeddings = qa_embedder.encode_queries(qa_queries)
semantic_embeddings = semantic_embedder.encode_queries(semantic_queries)

from FlagEmbedding import FlagLLMModel, FlagModel
# Decoder-only model
llm_embedder = FlagLLMModel('intfloat/e5-mistral-7b-instruct')
# Encoder-only model
encoder_embedder = FlagModel('BAAI/bge-large-en-v1.5')
text = ["Machine learning algorithms"]
# Both produce embeddings but with different characteristics
llm_emb = llm_embedder.encode(text)
encoder_emb = encoder_embedder.encode(text)
print(f"LLM embedding shape: {llm_emb.shape}")
print(f"Encoder embedding shape: {encoder_emb.shape}")
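The shapes differ because each architecture has its own hidden size: the Mistral-7B-based model is expected to produce 4096-dimensional vectors, while bge-large-en-v1.5 produces 1024-dimensional ones (dimension values are assumptions about these checkpoints). The two embedding spaces are therefore not interchangeable:

# Vectors from different models live in different spaces and cannot be mixed
assert llm_emb.shape[-1] != encoder_emb.shape[-1]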
from FlagEmbedding import FlagLLMModel

# Configure for memory-constrained environments
embedder = FlagLLMModel(
    'intfloat/e5-mistral-7b-instruct',
    use_fp16=True,
    batch_size=8,  # Very small batch
    devices=['cuda:0'],  # Single GPU
    convert_to_numpy=True  # Free GPU memory faster
)
# Process in smaller chunks
large_corpus = [f"Document {i}" for i in range(10000)]
chunk_size = 100
all_embeddings = []
for i in range(0, len(large_corpus), chunk_size):
    chunk = large_corpus[i:i+chunk_size]
    chunk_embeddings = embedder.encode_corpus(chunk)
    all_embeddings.append(chunk_embeddings)
# Combine results
import numpy as np
final_embeddings = np.vstack(all_embeddings)
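If even the accumulated list of chunks is too large for RAM, each chunk can be written straight into a memory-mapped array on disk; a sketch where the embedding dimension is an assumption about the checkpoint:

import numpy as np

dim = 4096  # assumed embedding dimension for the Mistral-7B-based model
mmap = np.lib.format.open_memmap("corpus_embeddings.npy", mode="w+",
                                 dtype=np.float32, shape=(len(large_corpus), dim))
for i in range(0, len(large_corpus), chunk_size):
    chunk = large_corpus[i:i+chunk_size]
    mmap[i:i+len(chunk)] = embedder.encode_corpus(chunk)
mmap.flush()  # embeddings now live on disk, not in process memory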
from typing import Optional, List, Union, Literal
import torch
import numpy as np
# Decoder-specific pooling (only last_token supported)
DecoderPoolingMethod = Literal["last_token"]
# Instruction format templates
InstructionTemplate = str  # Format string with {} placeholders

Install with Tessl CLI
npx tessl i tessl/pypi-flagembedding