End-to-end Optical Music Recognition (OMR) system for transcribing musical notation from images into structured MusicXML format.
## Inference Module

Model inference capabilities using U-Net architectures for semantic segmentation of musical elements. Oemer uses two specialized neural networks for different aspects of musical notation recognition.

### inference

Run neural network inference using a sliding-window approach for large images.
def inference(model_path: str, img_path: str, step_size: int = 128, batch_size: int = 16, manual_th: Optional[Any] = None, use_tf: bool = False) -> Tuple[ndarray, ndarray]:
"""
Run neural network inference on image patches using sliding window approach.
Parameters:
- model_path (str): Path to model checkpoint directory containing model files
- img_path (str): Path to input image file
- step_size (int): Sliding window step size in pixels (default: 128)
- batch_size (int): Number of patches to process in each batch (default: 16)
- manual_th (Optional[Any]): Manual threshold for prediction binarization
- use_tf (bool): Use TensorFlow instead of ONNX runtime (default: False)
Returns:
Tuple containing:
- predictions (ndarray): Segmentation predictions with class labels
- metadata (ndarray): Additional prediction metadata
Raises:
FileNotFoundError: If model files or input image are not found
RuntimeError: If inference fails due to model or memory issues
"""

### resize_image

Prepare images for optimal neural network processing.
def resize_image(image: Image.Image) -> Image.Image:
"""
Resize image to optimal dimensions for neural network processing.
Maintains aspect ratio while ensuring dimensions are compatible
with the model's expected input size requirements.
Parameters:
- image (PIL.Image.Image): Input image to resize
Returns:
PIL.Image.Image: Resized image optimized for model inference
"""

### predict

Use trained sklearn models for fine-grained symbol classification.
def predict(region: ndarray, model_name: str) -> str:
"""
Predict symbol type using trained sklearn classification models.
Used for distinguishing between similar symbols that neural networks
cannot reliably differentiate (e.g., different clef types, accidentals).
Parameters:
- region (ndarray): Image region containing the symbol to classify
- model_name (str): Name of the sklearn model to use for prediction
Returns:
str: Predicted symbol class label
Raises:
ValueError: If model_name is not recognized
FileNotFoundError: If sklearn model file is not found
"""

## Model Checkpoints

Oemer uses two specialized U-Net models for different aspects of music recognition:

- checkpoints/unet_big/
- checkpoints/seg_net/

Each model directory contains:

- model.onnx - ONNX format model (default runtime)
- weights.h5 - TensorFlow/Keras weights (when using --use-tf)
- metadata.pkl - Model metadata and configuration
- arch.json - Model architecture description

## Usage Examples

### Basic Inference

from oemer.inference import inference
import numpy as np

# Run inference on a music sheet image
model_path = "oemer/checkpoints/unet_big"
img_path = "sheet_music.jpg"
# Generate predictions
predictions, metadata = inference(
    model_path=model_path,
    img_path=img_path,
    step_size=128,
    batch_size=16,
    use_tf=False  # Use ONNX runtime
)
# Extract staff and symbol predictions from the class-label map
# (stage-1 labels: 1 = staff, 2 = symbols — see the two-stage example below)
staff_mask = np.where(predictions == 1, 1, 0)
symbol_mask = np.where(predictions == 2, 1, 0)
print(f"Predictions shape: {predictions.shape}")
print(f"Staff pixels: {np.sum(staff_mask)}")
print(f"Symbol pixels: {np.sum(symbol_mask)}")

### Complete Two-Stage Pipeline

from oemer.inference import inference, resize_image
from PIL import Image
import os
def run_complete_inference(img_path: str, use_tf: bool = False):
    """Run both inference models on an image.

    Parameters:
    - img_path (str): Path to the sheet-music image to process.
    - use_tf (bool): Use TensorFlow instead of the ONNX runtime
      (default: False).

    Returns:
    dict: Binary masks (ndarray, same spatial shape as the predictions)
    keyed by element type: 'staff', 'symbols', 'stems_rests',
    'noteheads', 'clefs_keys'.
    """
    # Bug fix: the surrounding example only imports PIL.Image and os,
    # but numpy is required for the mask post-processing below.
    import numpy as np

    # Resize image for optimal processing
    image = Image.open(img_path)
    resized_image = resize_image(image)
    temp_path = "temp_resized.jpg"
    resized_image.save(temp_path)
    try:
        # Stage 1: Staff vs. symbols segmentation
        print("Running stage 1 inference (staff vs symbols)...")
        staff_symbols, _ = inference(
            model_path="oemer/checkpoints/unet_big",
            img_path=temp_path,
            step_size=128,
            batch_size=16,
            use_tf=use_tf
        )
        # Stage 2: Detailed symbol classification
        print("Running stage 2 inference (symbol details)...")
        symbol_details, _ = inference(
            model_path="oemer/checkpoints/seg_net",
            img_path=temp_path,
            step_size=128,
            batch_size=16,
            use_tf=use_tf
        )
        # Convert class-label maps into binary masks.
        # Stage-1 labels: 1 = staff lines, 2 = other symbols.
        staff = np.where(staff_symbols == 1, 1, 0)
        symbols = np.where(staff_symbols == 2, 1, 0)
        # Stage-2 labels: 1 = stems/rests, 2 = noteheads, 3 = clefs/keys.
        stems_rests = np.where(symbol_details == 1, 1, 0)
        noteheads = np.where(symbol_details == 2, 1, 0)
        clefs_keys = np.where(symbol_details == 3, 1, 0)
        return {
            'staff': staff,
            'symbols': symbols,
            'stems_rests': stems_rests,
            'noteheads': noteheads,
            'clefs_keys': clefs_keys
        }
    finally:
        # Clean up temporary file even if inference raised
        if os.path.exists(temp_path):
            os.remove(temp_path)
# Run complete inference
results = run_complete_inference("my_sheet_music.jpg")
for key, mask in results.items():
print(f"{key}: {mask.shape}, pixels: {np.sum(mask)}")

### Tuning Speed vs. Precision

from oemer.inference import inference
# High-precision inference with smaller steps
high_precision_predictions, _ = inference(
    model_path="oemer/checkpoints/unet_big",
    img_path="complex_score.jpg",
    step_size=64,  # Smaller steps for more overlap
    batch_size=8,  # Smaller batches to reduce memory usage
    use_tf=True  # Use TensorFlow for potentially better precision
)
# Fast inference with larger steps
fast_predictions, _ = inference(
model_path="oemer/checkpoints/unet_big",
img_path="simple_score.jpg",
step_size=256, # Larger steps for faster processing
batch_size=32, # Larger batches if memory allows
use_tf=False # ONNX is typically faster
)

### Symbol Classification

from oemer.inference import predict
import cv2
import numpy as np
# Extract a symbol region from the image
image = cv2.imread("sheet_music.jpg", cv2.IMREAD_GRAYSCALE)
symbol_region = image[100:150, 200:250]  # Extract 50x50 region
# Classify the symbol using trained sklearn models
try:
    # Predict clef type
    clef_type = predict(symbol_region, "clef_classifier")
    print(f"Detected clef: {clef_type}")
    # Predict accidental type
    accidental_type = predict(symbol_region, "accidental_classifier")
    print(f"Detected accidental: {accidental_type}")
    # Predict rest type
    rest_type = predict(symbol_region, "rest_classifier")
    print(f"Detected rest: {rest_type}")
# predict raises ValueError for an unrecognized model_name (see API above)
except ValueError as e:
    print(f"Classification error: {e}")

## Performance Tips

- Reduce batch_size if encountering out-of-memory errors.
- Increasing step_size uses less memory but may reduce accuracy.
- Install onnxruntime-gpu for GPU acceleration on Linux.

### Recommended Settings

# Quality-focused settings (slower)
# Maximize accuracy: more window overlap, small batches, TF runtime
quality_settings = {
    'step_size': 64,
    'batch_size': 8,
    'use_tf': True
}
# Speed-focused settings (faster)
# Maximize throughput: fewer windows, large batches, ONNX runtime
speed_settings = {
    'step_size': 256,
    'batch_size': 32,
    'use_tf': False
}
# Balanced settings (recommended)
balanced_settings = {
'step_size': 128,
'batch_size': 16,
'use_tf': False
}

The inference system is designed to handle various image sizes and qualities, automatically adapting the processing pipeline for optimal results while maintaining reasonable performance.
Install with Tessl CLI
npx tessl i tessl/pypi-oemer