Comprehensive developer toolkit providing reusable skills for Java/Spring Boot, TypeScript/NestJS/React/Next.js, Python, PHP, AWS CloudFormation, AI/RAG, DevOps, and more.
This document provides comprehensive implementation details for all chunking strategies mentioned in the main skill.
```python
from langchain.text_splitter import RecursiveCharacterTextSplitter
class FixedSizeChunker:
def __init__(self, chunk_size=512, chunk_overlap=50):
self.chunk_size = chunk_size
self.chunk_overlap = chunk_overlap
self.splitter = RecursiveCharacterTextSplitter(
chunk_size=chunk_size,
chunk_overlap=chunk_overlap,
length_function=len,
separators=["\n\n", "\n", " ", ""]
)
def chunk(self, documents):
        return self.splitter.split_documents(documents)
```

| Use Case | Chunk Size (chars) | Overlap (chars) | Rationale |
|---|---|---|---|
| Factoid Queries | 256 | 25 | Small chunks for precise answers |
| General Q&A | 512 | 50 | Balanced approach for most cases |
| Analytical Queries | 1024 | 100 | Larger context for complex analysis |
| Code Documentation | 300 | 30 | Preserve code context while maintaining focus |
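As a quick usage sketch (a minimal example, assuming LangChain is installed; the sample text is invented):

```python
from langchain.schema import Document

# Wrap raw text in a LangChain Document before splitting.
docs = [Document(page_content=(
    "RAG pipelines retrieve chunks at query time.\n\n"
    "Each chunk is embedded and indexed separately."
))]

chunker = FixedSizeChunker(chunk_size=512, chunk_overlap=50)  # the "General Q&A" preset above
chunks = chunker.chunk(docs)
print(len(chunks), chunks[0].page_content[:60])
```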
```python
from langchain.text_splitter import RecursiveCharacterTextSplitter
class RecursiveChunker:
def __init__(self, chunk_size=512, separators=None):
self.chunk_size = chunk_size
self.separators = separators or ["\n\n", "\n", " ", ""]
self.splitter = RecursiveCharacterTextSplitter(
chunk_size=chunk_size,
chunk_overlap=0,
length_function=len,
separators=self.separators
)
def chunk(self, text):
return self.splitter.create_documents([text])
# Document-specific configurations
def get_chunker_for_document_type(doc_type):
configurations = {
"markdown": ["\n## ", "\n### ", "\n\n", "\n", " ", ""],
"html": ["</div>", "</p>", "\n\n", "\n", " ", ""],
"code": ["\n\n", "\n", " ", ""],
"plain": ["\n\n", "\n", " ", ""]
}
    return RecursiveChunker(separators=configurations.get(doc_type, ["\n\n", "\n", " ", ""]))
```
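For example (a minimal sketch; note the markdown separators above assume `##`/`###` headings):

```python
md_chunker = get_chunker_for_document_type("markdown")
docs = md_chunker.chunk("## Setup\nInstall the package.\n\n## Usage\nCall chunk() on your text.")
for doc in docs:
    print(repr(doc.page_content))
```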
```python
import markdown
from bs4 import BeautifulSoup
class MarkdownChunker:
def __init__(self, max_chunk_size=512):
self.max_chunk_size = max_chunk_size
def chunk(self, markdown_text):
html = markdown.markdown(markdown_text)
soup = BeautifulSoup(html, 'html.parser')
chunks = []
current_chunk = ""
current_heading = "Introduction"
for element in soup.find_all(['h1', 'h2', 'h3', 'p', 'pre', 'table']):
if element.name.startswith('h'):
if current_chunk.strip():
chunks.append({
"content": current_chunk.strip(),
"heading": current_heading
})
current_heading = element.get_text().strip()
current_chunk = f"{element}\n"
elif element.name in ['pre', 'table']:
# Preserve code blocks and tables intact
if len(current_chunk) + len(str(element)) > self.max_chunk_size:
if current_chunk.strip():
chunks.append({
"content": current_chunk.strip(),
"heading": current_heading
})
current_chunk = f"{element}\n"
else:
current_chunk += f"{element}\n"
            else:
                # Flush the current chunk if this element would exceed the size budget,
                # so max_chunk_size applies to paragraphs as well as code blocks/tables.
                if current_chunk.strip() and len(current_chunk) + len(str(element)) > self.max_chunk_size:
                    chunks.append({
                        "content": current_chunk.strip(),
                        "heading": current_heading
                    })
                    current_chunk = ""
                current_chunk += str(element)
if current_chunk.strip():
chunks.append({
"content": current_chunk.strip(),
"heading": current_heading
})
        return chunks
```
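A quick smoke test (hypothetical markdown input):

```python
chunker = MarkdownChunker(max_chunk_size=512)
for chunk in chunker.chunk("# Intro\nSome text.\n\n## Details\nMore text."):
    print(chunk["heading"], "->", chunk["content"][:60])
```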
```python
import ast
import re
class CodeChunker:
def __init__(self, language='python'):
self.language = language
def chunk_python(self, code):
tree = ast.parse(code)
chunks = []
for node in ast.walk(tree):
if isinstance(node, (ast.FunctionDef, ast.ClassDef)):
start_line = node.lineno - 1
end_line = node.end_lineno if hasattr(node, 'end_lineno') else start_line + 10
lines = code.split('\n')
chunk_lines = lines[start_line:end_line]
chunks.append('\n'.join(chunk_lines))
return chunks
def chunk_javascript(self, code):
        # Regex fallback for languages without a stdlib AST parser.
        # Note: [^}]* stops at the first '}', so bodies containing nested
        # braces are only partially captured.
function_pattern = r'(function\s+\w+\s*\([^)]*\)\s*\{[^}]*\})'
class_pattern = r'(class\s+\w+\s*\{[^}]*\})'
patterns = [function_pattern, class_pattern]
chunks = []
for pattern in patterns:
matches = re.finditer(pattern, code, re.MULTILINE | re.DOTALL)
for match in matches:
chunks.append(match.group(1))
return chunks
def chunk(self, code):
if self.language == 'python':
return self.chunk_python(code)
elif self.language == 'javascript':
return self.chunk_javascript(code)
else:
# Fallback to line-based chunking
return self.chunk_by_lines(code)
def chunk_by_lines(self, code, max_lines=50):
lines = code.split('\n')
chunks = []
for i in range(0, len(lines), max_lines):
chunk = '\n'.join(lines[i:i+max_lines])
chunks.append(chunk)
        return chunks
```
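For instance, on a small Python snippet (made-up source):

```python
src = "def greet(name):\n    return f'Hello, {name}'\n\nclass Greeter:\n    pass\n"
for chunk in CodeChunker(language='python').chunk(src):
    print('---')
    print(chunk)
```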
```python
from io import StringIO

import pandas as pd

class TableChunker:
def __init__(self, max_rows=100, summary_rows=5):
self.max_rows = max_rows
self.summary_rows = summary_rows
def chunk(self, table_data):
if isinstance(table_data, str):
df = pd.read_csv(StringIO(table_data))
else:
df = table_data
chunks = []
if len(df) <= self.max_rows:
# Small table - keep intact
chunks.append({
"type": "full_table",
"content": df.to_string(),
"metadata": {
"rows": len(df),
"columns": len(df.columns)
}
})
else:
# Large table - create summary + chunks
summary = df.head(self.summary_rows)
chunks.append({
"type": "table_summary",
"content": f"Table Summary ({len(df)} rows, {len(df.columns)} columns):\n{summary.to_string()}",
"metadata": {
"total_rows": len(df),
"summary_rows": self.summary_rows,
"columns": list(df.columns)
}
})
# Chunk the remaining data
for i in range(self.summary_rows, len(df), self.max_rows):
chunk_df = df.iloc[i:i+self.max_rows]
chunks.append({
"type": "table_chunk",
"content": f"Rows {i+1}-{min(i+self.max_rows, len(df))}:\n{chunk_df.to_string()}",
"metadata": {
"start_row": i + 1,
"end_row": min(i + self.max_rows, len(df)),
"columns": list(df.columns)
}
})
        return chunks
```
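A minimal sketch with an in-memory CSV (synthetic data):

```python
csv_text = "id,score\n" + "\n".join(f"{i},{i * 2}" for i in range(250))
for part in TableChunker(max_rows=100).chunk(csv_text):
    print(part["type"], part["metadata"])
```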
```python
import re

import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
class SemanticChunker:
def __init__(self, model_name="all-MiniLM-L6-v2", similarity_threshold=0.8, buffer_size=3):
self.model = SentenceTransformer(model_name)
self.similarity_threshold = similarity_threshold
self.buffer_size = buffer_size
def split_into_sentences(self, text):
# Simple sentence splitting - can be enhanced with nltk/spacy
sentences = re.split(r'[.!?]+', text)
return [s.strip() for s in sentences if s.strip()]
def chunk(self, text):
sentences = self.split_into_sentences(text)
if len(sentences) <= self.buffer_size:
return [text]
# Create embeddings
embeddings = self.model.encode(sentences)
chunks = []
current_chunk_sentences = []
for i in range(len(sentences)):
current_chunk_sentences.append(sentences[i])
# Check if we should create a boundary
if i < len(sentences) - 1:
similarity = cosine_similarity(
[embeddings[i]],
[embeddings[i + 1]]
)[0][0]
if similarity < self.similarity_threshold and len(current_chunk_sentences) >= 2:
chunks.append(' '.join(current_chunk_sentences))
current_chunk_sentences = []
# Add remaining sentences
if current_chunk_sentences:
chunks.append(' '.join(current_chunk_sentences))
        return chunks
```

| Parameter | Range | Effect |
|---|---|---|
| similarity_threshold | 0.5-0.9 | Higher values create more chunks |
| buffer_size | 1-10 | Larger buffers provide more context |
| model_name | Various | Different models for different domains |
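A short end-to-end sketch (the model downloads on first use; the example sentences are invented):

```python
text = ("Solar panels convert sunlight into electricity. Inverters then shape the output. "
        "Meanwhile, the recipe calls for two eggs. Whisk them thoroughly before frying.")
for chunk in SemanticChunker(similarity_threshold=0.6).chunk(text):
    print('---', chunk)
```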
```python
import torch
from transformers import AutoTokenizer, AutoModel
class LateChunker:
def __init__(self, model_name="microsoft/DialoGPT-medium"):
self.tokenizer = AutoTokenizer.from_pretrained(model_name)
self.model = AutoModel.from_pretrained(model_name)
def chunk(self, text, chunk_size=512):
        # Tokenize the entire document at once; very long inputs may exceed
        # the model's maximum context window.
tokens = self.tokenizer(text, return_tensors="pt", truncation=False)
# Get token-level embeddings
with torch.no_grad():
outputs = self.model(**tokens, output_hidden_states=True)
token_embeddings = outputs.last_hidden_state[0]
# Create chunk embeddings from token embeddings
chunks = []
for i in range(0, len(token_embeddings), chunk_size):
chunk_tokens = token_embeddings[i:i+chunk_size]
chunk_embedding = torch.mean(chunk_tokens, dim=0)
chunks.append({
"content": self.tokenizer.decode(tokens["input_ids"][0][i:i+chunk_size]),
"embedding": chunk_embedding.numpy()
})
        return chunks
```
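And a usage sketch (it uses the model hard-coded above; an embedding-oriented encoder may be a better fit in practice):

```python
late = LateChunker()
for c in late.chunk("Late chunking embeds the whole document first, then pools token vectors per chunk."):
    print(c["content"][:60], c["embedding"].shape)
```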
```python
import openai

class ContextualChunker:
def __init__(self, api_key):
self.client = openai.OpenAI(api_key=api_key)
def generate_context(self, chunk, full_document):
prompt = f"""
Given the following document and a chunk from it, provide a brief context
that helps understand the chunk's meaning within the full document.
Document:
{full_document[:2000]}...
Chunk:
{chunk}
Context (max 50 words):
"""
response = self.client.chat.completions.create(
model="gpt-3.5-turbo",
messages=[{"role": "user", "content": prompt}],
max_tokens=100,
temperature=0
)
return response.choices[0].message.content.strip()
def chunk_with_context(self, text, base_chunker):
# First create base chunks
base_chunks = base_chunker.chunk(text)
# Then add context to each chunk
contextualized_chunks = []
for chunk in base_chunks:
context = self.generate_context(chunk.page_content, text)
contextualized_content = f"Context: {context}\n\nContent: {chunk.page_content}"
contextualized_chunks.append({
"content": contextualized_content,
"original_content": chunk.page_content,
"context": context
})
        return contextualized_chunks
```
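A minimal sketch combining it with the recursive chunker defined earlier (the API key and document text are placeholders):

```python
document = "Large document text goes here..."  # placeholder
base = RecursiveChunker(chunk_size=512)
ctx = ContextualChunker(api_key="sk-...")  # hypothetical key
enriched = ctx.chunk_with_context(document, base)
print(enriched[0]["context"])
```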
| Strategy | Time Complexity | Space Complexity | Relative Cost |
|---|---|---|---|
| Fixed-Size | O(n) | O(n) | Low |
| Recursive | O(n) | O(n) | Low |
| Structure-Aware | O(n log n) | O(n) | Medium |
| Semantic | O(n²) | O(n²) | High |
| Late Chunking | O(n) | O(n) | Very High |
| Contextual | O(n²) | O(n²) | Very High |
Repository layout:

```
plugins/
  developer-kit-ai/
    skills/
      chunking-strategy/
      prompt-engineering/
  developer-kit-aws/
    skills/
      aws/
      aws-cli-beast/
      aws-cost-optimization/
      aws-drawio-architecture-diagrams/
      aws-sam-bootstrap/
      aws-cloudformation/
      aws-cloudformation-auto-scaling/
        references/
      aws-cloudformation-bedrock/
        references/
      aws-cloudformation-cloudfront/
        references/
      aws-cloudformation-cloudwatch/
        references/
      aws-cloudformation-dynamodb/
        references/
      aws-cloudformation-ec2/
      aws-cloudformation-ecs/
        references/
      aws-cloudformation-elasticache/
      aws-cloudformation-iam/
        references/
      aws-cloudformation-lambda/
        references/
      aws-cloudformation-rds/
      aws-cloudformation-s3/
        references/
      aws-cloudformation-security/
        references/
      aws-cloudformation-task-ecs-deploy-gh/
      aws-cloudformation-vpc/
  developer-kit-core/
    skills/
  developer-kit-java/
    skills/
      aws-lambda-java-integration/
      aws-rds-spring-boot-integration/
      aws-sdk-java-v2-bedrock/
      aws-sdk-java-v2-core/
      aws-sdk-java-v2-dynamodb/
      aws-sdk-java-v2-kms/
      aws-sdk-java-v2-lambda/
      aws-sdk-java-v2-messaging/
      aws-sdk-java-v2-rds/
      aws-sdk-java-v2-s3/
      aws-sdk-java-v2-secrets-manager/
      graalvm-native-image/
      langchain4j/
      langchain4j-ai-services-patterns/
        references/
      langchain4j-mcp-server-patterns/
        references/
      langchain4j-rag-implementation-patterns/
        references/
      langchain4j-spring-boot-integration/
      langchain4j-testing-strategies/
      langchain4j-tool-function-calling-patterns/
      langchain4j-vector-stores-configuration/
        references/
      qdrant/
        references/
      spring-ai-mcp-server-patterns/
        references/
      spring-boot-actuator/
      spring-boot-cache/
      spring-boot-crud-patterns/
      spring-boot-dependency-injection/
      spring-boot-event-driven-patterns/
      spring-boot-openapi-documentation/
      spring-boot-project-creator/
      spring-boot-resilience4j/
      spring-boot-rest-api-standards/
      spring-boot-saga-pattern/
      spring-boot-security-jwt/
        assets/
        references/
        scripts/
      spring-boot-test-patterns/
      spring-data-jpa/
        references/
      spring-data-neo4j/
        references/
      unit-test-application-events/
      unit-test-bean-validation/
      unit-test-boundary-conditions/
      unit-test-caching/
      unit-test-config-properties/
      unit-test-controller-layer/
      unit-test-exception-handler/
      unit-test-json-serialization/
      unit-test-mapper-converter/
      unit-test-parameterized/
      unit-test-scheduled-async/
      unit-test-service-layer/
      unit-test-utility-methods/
      unit-test-wiremock-rest-api/
  developer-kit-php/
    skills/
      aws-lambda-php-integration/
  developer-kit-python/
    skills/
      aws-lambda-python-integration/
  developer-kit-tools/
  developer-kit-typescript/
    skills/
      aws-lambda-typescript-integration/
      better-auth/
      drizzle-orm-patterns/
      dynamodb-toolbox-patterns/
        references/
      nestjs/
      nestjs-best-practices/
      nestjs-code-review/
      nestjs-drizzle-crud-generator/
        scripts/
      nextjs-app-router/
      nextjs-authentication/
      nextjs-code-review/
      nextjs-data-fetching/
        references/
      nextjs-deployment/
      nextjs-performance/
      nx-monorepo/
      react-code-review/
      react-patterns/
        references/
      shadcn-ui/
      tailwind-css-patterns/
        references/
      tailwind-design-system/
        references/
      turborepo-monorepo/
      typescript-docs/
      typescript-security-review/
      zod-validation-utilities/
```