End-to-end Optical Music Recognition (OMR) system for transcribing musical notation from images into structured MusicXML format.

Install the package spec: npx @tessl/cli install tessl/pypi-oemer@0.1.0

End-to-end Optical Music Recognition (OMR) system that transcribes musical notation from images into structured MusicXML format. Built on deep learning models and computer vision techniques, oemer can process skewed and phone-taken photos of Western music notation sheets, providing a complete pipeline from image preprocessing to digital score generation.

Installation: pip install oemer (or pip install oemer[tf] for TensorFlow support)
Command-line usage: oemer <path_to_image>

import oemer
from oemer.ete import extract, main

For programmatic usage:
from oemer.ete import extract
from argparse import Namespace

# Basic usage - outputs MusicXML file and analysis image
oemer image.jpg
# Specify output directory
oemer image.jpg -o ./output/
# Use TensorFlow instead of ONNX runtime
oemer image.jpg --use-tf
# Save model predictions for reuse
oemer image.jpg --save-cache
# Disable image deskewing for aligned images
oemer image.jpg --without-deskew

from oemer.ete import extract
from argparse import Namespace

# Configure extraction parameters.
# extract() reads exactly these five attributes from the Namespace.
args = Namespace(
    img_path='path/to/music_sheet.jpg',  # path to the input photo/scan
    output_path='./',                    # directory for the generated output
    use_tf=False,                        # True -> TensorFlow backend instead of ONNX
    save_cache=False,                    # True -> persist model predictions for reuse
    without_deskew=False                 # True -> skip the deskewing step
)

# Extract musical notation and generate MusicXML
musicxml_path = extract(args)
print(f"Generated MusicXML: {musicxml_path}")

Oemer follows a layered pipeline architecture using a global state management system:
The Layer Management System (oemer.layers) provides global state management, allowing each processing stage to register intermediate results for use by subsequent stages.
Complete end-to-end optical music recognition pipeline that handles the full workflow from image input to MusicXML output.
def extract(args: Namespace) -> str:
    """
    Main extraction pipeline function.

    Runs the complete end-to-end OMR workflow on a single image and
    writes the resulting MusicXML file to the output directory.

    Parameters:
    - args.img_path (str): Path to input image
    - args.output_path (str): Output directory path
    - args.use_tf (bool): Use TensorFlow instead of ONNX
    - args.save_cache (bool): Save predictions for reuse
    - args.without_deskew (bool): Skip deskewing step

    Returns:
        str: Path to generated MusicXML file
    """
def main() -> None:
    """CLI entry point for oemer command.

    NOTE(review): presumably parses command-line flags into a Namespace
    and delegates to extract() -- confirm against oemer.ete.
    """
def generate_pred(img_path: str, use_tf: bool = False) -> Tuple[ndarray, ndarray, ndarray, ndarray, ndarray]:
    """
    Generate neural network predictions.

    Parameters:
    - img_path (str): Path to the input image
    - use_tf (bool): Use TensorFlow instead of ONNX runtime

    Returns:
        Tuple containing staff, symbols, stems_rests, notehead, and clefs_keys predictions
    """

Model inference capabilities using U-Net architectures for semantic segmentation of musical elements.
def inference(model_path: str, img_path: str, step_size: int = 128, batch_size: int = 16, manual_th: Optional[Any] = None, use_tf: bool = False) -> Tuple[ndarray, ndarray]:
    """
    Run neural network inference on image patches.

    The image is processed as a sliding window of patches (step_size),
    batched for the model (batch_size).

    Parameters:
    - model_path (str): Path to model checkpoint directory
    - model_path (str): Path to model checkpoint directory
    - img_path (str): Path to input image
    - step_size (int): Sliding window step size
    - batch_size (int): Inference batch size
    - manual_th: Manual threshold for predictions (None -> default thresholding)
    - use_tf (bool): Use TensorFlow instead of ONNX

    Returns:
        Tuple of prediction arrays and metadata
    """

Detection and analysis of musical staff lines, which form the foundation for all subsequent processing steps.
def extract(splits: int = 8, line_threshold: float = 0.8, horizontal_diff_th: float = 0.1, unit_size_diff_th: float = 0.1, barline_min_degree: int = 75) -> Tuple[ndarray, ndarray]:
    """Extract staff lines and group information.

    Parameter semantics inferred from names -- TODO confirm against source:
    - splits: number of vertical slices used when scanning for lines
    - line_threshold: acceptance threshold for a detected staff line
    - horizontal_diff_th: tolerance for horizontal position differences
    - unit_size_diff_th: tolerance for staff-line spacing differences
    - barline_min_degree: minimum angle (degrees) for barline candidates
    """
class Staff:
    """Complete staff (5 lines) representation."""
    lines: List[Line]   # Constituent staff lines
    track: int          # Track index -- TODO confirm semantics vs. group
    group: int          # Staff-group index on the page
    is_interp: bool     # True when the staff was interpolated rather than detected
    def add_line(self, line: Line) -> None: ...
    def duplicate(self, x_offset: int = 0, y_offset: int = 0): ...  # shifted copy -- presumably returns Staff
    @property
    def unit_size(self) -> float: ...  # spacing between adjacent lines -- TODO confirm
    @property
    def y_center(self) -> float: ...   # vertical center of the staff
    @property
    def slope(self) -> float: ...      # skew/slope of the staff lines

Recognition and classification of musical symbols including noteheads, clefs, accidentals, rests, and barlines.
def extract() -> List[NoteHead]:
    """Extract noteheads from neural network predictions.

    Takes no parameters; inputs are presumably read from the global
    layer registry (oemer.layers) -- confirm.
    """
def extract(min_barline_h_unit_ratio: float = 3.75) -> Tuple[List[Barline], List[Clef], List[Sfn], List[Rest]]:
    """Extract musical symbols (barlines, clefs, accidentals, rests).

    Parameters:
    - min_barline_h_unit_ratio: minimum barline height expressed in staff
      unit sizes -- TODO confirm exact semantics

    Returns:
        Tuple of (barlines, clefs, sfns, rests) lists.
    """
class NoteHead:
    """Note head representation with rhythm and pitch information."""
    points: List[Tuple[int, int]]  # (x, y) points belonging to this notehead (see add_point)
    pitch: Optional[int]           # Pitch number; None until resolved
    has_dot: bool                  # Whether a duration dot is attached
    bbox: BBox                     # Bounding box (x1, y1, x2, y2)
    stem_up: Optional[bool]        # Stem direction; None when unknown/absent
    stem_right: Optional[bool]     # Whether the stem attaches on the right side
    track: Optional[int]           # Owning staff track
    group: Optional[int]           # Owning staff group
    staff_line_pos: int            # Position relative to the staff lines
    invalid: bool                  # Marked invalid during validation
    id: Optional[int]              # Unique id assigned during processing
    note_group_id: Optional[int]   # Id of the NoteGroup this note belongs to
    sfn: Optional[Any]             # Sharp/flat/natural association
    label: NoteType                # Rhythm classification
    def add_point(self, x: int, y: int) -> None: ...
    def force_set_label(self, label: NoteType) -> None: ...  # overrides label unconditionally -- TODO confirm
class Clef:
    """Musical clef representation."""
    bbox: BBox            # Bounding box (x1, y1, x2, y2)
    track: Optional[int]  # Owning staff track
    group: Optional[int]  # Owning staff group
    label: ClefType       # G_CLEF or F_CLEF
    @property
    def x_center(self) -> float: ...  # horizontal center of bbox
class Sfn:
    """Sharp/Flat/Natural (accidental) representation."""
    bbox: BBox              # Bounding box (x1, y1, x2, y2)
    note_id: Optional[int]  # Id of the associated NoteHead, if any
    is_key: Optional[bool]  # Whether is key signature or accidental
    track: Optional[int]    # Owning staff track
    group: Optional[int]    # Owning staff group
    label: SfnType          # FLAT, SHARP, or NATURAL
    @property
    def x_center(self) -> float: ...  # horizontal center of bbox
class Rest:
    """Musical rest representation."""
    bbox: BBox            # Bounding box (x1, y1, x2, y2)
    track: Optional[int]  # Owning staff track
    group: Optional[int]  # Owning staff group
    label: RestType       # Rest duration classification
    @property
    def x_center(self) -> float: ...  # horizontal center of bbox
class Barline:
    """Musical barline representation."""
    bbox: BBox            # Bounding box (x1, y1, x2, y2)
    track: Optional[int]  # Owning staff track
    group: Optional[int]  # Owning staff group
    @property
    def x_center(self) -> float: ...  # horizontal center of bbox

For complete notehead extraction details, see the notehead extraction documentation.
Advanced grouping of individual notes into chords and rhythm pattern recognition through beam and flag analysis.
def extract() -> Tuple[List[NoteGroup], ndarray]:
    """Group notes by stems and beams into chord groups.

    Returns:
        Tuple of (note groups, ndarray -- presumably a group-id map; confirm).
    """
def extract(min_area_ratio: float = 0.08, max_area_ratio: float = 0.2, beam_th: float = 0.5) -> None:
    """Extract rhythm information from beams, flags, and dots.

    Returns None: operates via side effects -- results are presumably
    written back onto the registered note/group state; confirm.

    Parameters (semantics inferred from names -- TODO confirm):
    - min_area_ratio / max_area_ratio: area bounds for beam/flag candidates
    - beam_th: classification threshold for beams
    """
class NoteGroup:
    """Group of notes connected by stems/beams."""
    id: Optional[int]              # Unique id assigned during grouping
    bbox: BBox                     # Bounding box (x1, y1, x2, y2)
    note_ids: List[int]            # Ids of all member NoteHeads
    top_note_ids: List[int]        # For multi-melody cases
    bottom_note_ids: List[int]     # For multi-melody cases
    stem_up: Optional[bool]        # Shared stem direction; None when unknown
    has_stem: Optional[bool]       # Whether the group has a stem at all
    all_same_type: Optional[bool]  # All notes are solid or hollow
    group: Optional[int]           # Owning staff group
    track: Optional[int]           # Owning staff track
    @property
    def x_center(self) -> float: ...  # horizontal center of bbox

For complete note grouping and rhythm analysis details, see:
Note Grouping and Rhythm Analysis
Generation of structured MusicXML documents from extracted musical elements with proper musical semantics and formatting.
class MusicXMLBuilder:
    """Main MusicXML document builder."""
    def __init__(self, title: str = "Unknown"): ...  # title becomes the score title -- TODO confirm
    def build(self) -> None:
        """Build the MusicXML structure from extracted elements."""
    def to_musicxml(self) -> bytes:
        """Export to MusicXML format (serialized XML as bytes)."""
# Key signature enumeration
class Key(enum.Enum):
    """Key signatures; value is the signed count of sharps (+) / flats (-)."""
    C_MAJOR = 0        # Same as A-minor
    G_MAJOR = 1        # Same as E-minor
    D_MAJOR = 2        # Same as B-minor
    A_MAJOR = 3        # Same as F#-minor
    E_MAJOR = 4        # Same as C#-minor
    B_MAJOR = 5        # Same as G#-minor
    F_SHARP_MAJOR = 6  # Same as D#-minor
    F_MAJOR = -1       # Same as D-minor
    B_FLAT_MAJOR = -2  # Same as G-minor
    E_FLAT_MAJOR = -3  # Same as C-minor
    A_FLAT_MAJOR = -4  # Same as F-minor
    D_FLAT_MAJOR = -5  # Same as Bb-minor
    G_FLAT_MAJOR = -6  # Same as Eb-minor
class Voice:
    """Voice representation for MusicXML generation."""
    id: Optional[int]           # Unique voice id
    note_ids: List[int]         # Ids of member notes
    stem_up: Optional[bool]     # Stem direction of the underlying group
    group_id: Optional[int]     # Id of the source NoteGroup
    x_center: Optional[float]   # Horizontal position on the page
    label: NoteType             # Rhythm classification
    has_dot: Optional[bool]     # Whether a duration dot applies
    group: Optional[int]        # Owning staff group
    track: Optional[int]        # Owning staff track
    duration: int               # Duration value -- units (MusicXML divisions?) TODO confirm
    rhythm_name: Optional[str]  # Human-readable rhythm name -- TODO confirm
    def init(self) -> None: ...  # NOTE(review): named 'init', not '__init__' -- confirm intended

MusicXML generation is handled by the MusicXMLBuilder class - see the main processing pipeline documentation above for complete details.
Global state management system for intermediate processing results, enabling modular pipeline architecture.
def register_layer(name: str, layer: ndarray) -> None:
    """Register a processing layer for global access."""

def get_layer(name: str) -> ndarray:
    """Retrieve a registered processing layer by name."""

def delete_layer(name: str) -> None:
    """Delete a registered layer."""

def list_layers() -> List[str]:
    """List all registered layer names."""

Comprehensive image processing utilities including dewarping, morphological operations, and bounding box management.
def estimate_coords(staff_pred: ndarray) -> Tuple[ndarray, ndarray]:
    """Estimate dewarping coordinates from staff predictions.

    Returns:
        Tuple of (coords_x, coords_y) arrays consumed by dewarp().
    """

def dewarp(img: ndarray, coords_x: ndarray, coords_y: ndarray) -> ndarray:
    """Apply dewarping transformation to correct image skew."""

def get_bbox(data: ndarray) -> List[BBox]:
    """Extract bounding boxes from binary image data using OpenCV contours."""

def get_center(bbox: Union[BBox, ndarray]) -> Tuple[int, int]:
    """Get center coordinates of a bounding box."""

def merge_nearby_bbox(bboxes: List[BBox], distance: float, x_factor: int = 1, y_factor: int = 1) -> List[BBox]:
    """Merge nearby bounding boxes using agglomerative clustering.

    x_factor / y_factor presumably scale the distance per axis -- TODO confirm.
    """

def rm_merge_overlap_bbox(bboxes: List[BBox], overlap_ratio: float = 0.8) -> List[BBox]:
    """Remove and merge overlapping bounding boxes."""
class Grid:
    """Grid structure for dewarping coordinate estimation."""
    id: Optional[int]  # Unique id assigned during grid construction
    bbox: BBox         # Bounding box (x1, y1, x2, y2)
    y_shift: int       # Vertical shift used during dewarping -- TODO confirm semantics
    @property
    def y_center(self) -> float: ...  # vertical center of bbox
    @property
    def height(self) -> int: ...      # height of bbox
class GridGroup:
    """Group of grids for dewarping processing."""
    id: Optional[int]      # Unique group id
    reg_id: Optional[int]  # Registration id -- TODO confirm semantics
    bbox: BBox             # Bounding box (x1, y1, x2, y2)
    gids: List[int]        # Ids of member Grid objects
    split_unit: int        # Split unit used when the grids were built (see build_grid)
    @property
    def y_center(self) -> int: ...  # vertical center of bbox
def build_grid(st_pred: ndarray, split_unit: int = 11) -> Tuple[ndarray, List[Grid]]:
    """Build grid structure from staff predictions for dewarping.

    Returns:
        Tuple of (grid map array, list of Grid objects).
    """

def build_grid_group(grid_map: ndarray, grids: List[Grid]) -> Tuple[ndarray, List[GridGroup]]:
    """Group grids into connected components for dewarping."""

Image processing utilities are used throughout the pipeline - key dewarping and bounding box functions are documented in the main pipeline and neural network inference sections above.
from typing import Tuple, List, Optional, Union
from numpy import ndarray
from argparse import Namespace

# Core type aliases
BBox = Tuple[int, int, int, int]  # Bounding box (x1, y1, x2, y2)
# Enumerations
class NoteType(enum.Enum):
    """Rhythm classification of a notehead."""
    WHOLE = 0
    HALF = 1
    QUARTER = 2
    EIGHTH = 3
    SIXTEENTH = 4
    THIRTY_SECOND = 5
    SIXTY_FOURTH = 6
    TRIPLET = 7
    OTHERS = 8
    HALF_OR_WHOLE = 9  # Intermediate parsing state
class ClefType(enum.Enum):
    """Clef classification (treble vs. bass)."""
    G_CLEF = 1
    F_CLEF = 2
class SfnType(enum.Enum):
    """Accidental (sharp/flat/natural) classification."""
    FLAT = 1
    SHARP = 2
    NATURAL = 3
class RestType(enum.Enum):
    """Rest duration classification.

    WHOLE_HALF is presumably an intermediate/ambiguous state distinct from
    the later WHOLE and HALF members -- TODO confirm.
    """
    WHOLE_HALF = 1
    QUARTER = 2
    EIGHTH = 3
    SIXTEENTH = 4
    THIRTY_SECOND = 5
    SIXTY_FOURTH = 6
    WHOLE = 7
    HALF = 8

Oemer defines custom exceptions for specific processing errors:
# --- Accidental (sharp/flat/natural) processing errors ---
class SfnException(Exception):
    """Base exception for Sharp/Flat/Natural processing errors."""

class SfnNoteTrackMismatch(SfnException):
    """Track mismatch error in accidental processing."""

class SfnNoteGroupMismatch(SfnException):
    """Group mismatch error in accidental processing."""

# --- Staffline processing errors ---
class StafflineException(Exception):
    """Base exception for staffline processing errors."""

class StafflineCountInconsistent(StafflineException):
    """Inconsistent staffline count detected."""

class StafflineNotAligned(StafflineException):
    """Stafflines are not properly aligned."""

class StafflineUnitSizeInconsistent(StafflineException):
    """Inconsistent unit sizes across stafflines."""

Common error handling pattern:
# Catch the most specific exceptions first; the generic handler comes last.
try:
    musicxml_path = extract(args)
except FileNotFoundError:
    print("Input image file not found")
except StafflineException as e:
    # Covers all staffline subclasses (count/alignment/unit-size errors)
    print(f"Staffline processing error: {e}")
except Exception as e:
    print(f"Processing failed: {e}")