A comprehensive BibTeX parser library for Python 3 that enables parsing and writing of bibliographic data files
—
Quality
Pending
Does it follow best practices?
Impact
Pending
No eval scenarios have been run
Collection of functions for customizing and processing bibliographic entries including name parsing, field normalization, LaTeX encoding conversion, and specialized field handling. These functions are designed to be used as customization callbacks during parsing or for post-processing entries.
Functions for processing and formatting author and editor names with support for various name formats and structured output.
def author(record: dict) -> dict:
"""
Split author field into a list of formatted names.
Processes the 'author' field by splitting on ' and ' delimiter and
formatting each name as "Last, First" format.
Parameters:
- record (dict): Entry dictionary to process
Returns:
dict: Modified record with author field as list of formatted names
Note: Removes empty author fields. Handles newlines in author strings.
"""
def editor(record: dict) -> dict:
"""
Process editor field into structured objects with names and IDs.
Similar to author processing but creates objects with 'name' and 'ID'
fields for each editor, where ID is a sanitized version of the name.
Parameters:
- record (dict): Entry dictionary to process
Returns:
dict: Modified record with editor field as list of editor objects
"""
Advanced name parsing functions that handle complex name formats according to BibTeX conventions.
def splitname(name: str, strict_mode: bool = True) -> dict:
"""
Break a name into its constituent parts: First, von, Last, and Jr.
Parses names according to BibTeX conventions supporting three formats:
- First von Last
- von Last, First
- von Last, Jr, First
Parameters:
- name (str): Single name string to parse
- strict_mode (bool): If True, raise exceptions on invalid names
Returns:
dict: Dictionary with keys 'first', 'last', 'von', 'jr' (each a list of words)
Raises:
InvalidName: If name is invalid and strict_mode=True
"""
def getnames(names: list) -> list:
"""
Convert list of name strings to "surname, firstnames" format.
Parameters:
- names (list): List of name strings
Returns:
list: List of formatted names in "Last, First" format
Note: Simplified implementation, may not handle all complex cases
"""
def find_matching(
text: str,
opening: str,
closing: str,
ignore_escaped: bool = True
) -> dict:
"""
Find matching bracket pairs in text.
Parameters:
- text (str): Text to search
- opening (str): Opening bracket character
- closing (str): Closing bracket character
- ignore_escaped (bool): Ignore escaped brackets
Returns:
dict: Mapping of opening positions to closing positions
Raises:
IndexError: If brackets are unmatched
"""
Functions for processing and normalizing specific bibliographic fields.
def journal(record: dict) -> dict:
"""
Convert journal field into structured object with name and ID.
Parameters:
- record (dict): Entry dictionary to process
Returns:
dict: Modified record with journal as object containing 'name' and 'ID'
"""
def keyword(record: dict, sep: str = ',|;') -> dict:
"""
Split keyword field into a list using specified separators.
Parameters:
- record (dict): Entry dictionary to process
- sep (str): Regular expression pattern for separators
Returns:
dict: Modified record with keyword field as list of keywords
"""
def link(record: dict) -> dict:
"""
Process link field into structured objects.
Parses link field lines into objects with 'url', 'anchor', and 'format' fields.
Parameters:
- record (dict): Entry dictionary to process
Returns:
dict: Modified record with link field as list of link objects
"""
def page_double_hyphen(record: dict) -> dict:
"""
Normalize page ranges to use double hyphens.
Converts various hyphen types in page ranges to standard double hyphen (--).
Parameters:
- record (dict): Entry dictionary to process
Returns:
dict: Modified record with normalized page field
"""
def type(record: dict) -> dict:
"""
Convert type field to lowercase.
Parameters:
- record (dict): Entry dictionary to process
Returns:
dict: Modified record with lowercase type field
"""
def doi(record: dict) -> dict:
"""
Process DOI field and add to links.
Converts DOI to URL format and adds to link field if not already present.
Parameters:
- record (dict): Entry dictionary to process
Returns:
dict: Modified record with DOI added to links
"""
Functions for converting between LaTeX encoding and Unicode in bibliographic data.
def convert_to_unicode(record: dict) -> dict:
"""
Convert LaTeX accents and encoding to Unicode throughout record.
Processes all string fields, lists, and dictionary values in the record
to convert LaTeX-encoded special characters to Unicode equivalents.
Parameters:
- record (dict): Entry dictionary to process
Returns:
dict: Modified record with Unicode characters
"""
def homogenize_latex_encoding(record: dict) -> dict:
"""
Homogenize LaTeX encoding style for BibTeX output.
First converts to Unicode, then converts back to consistent LaTeX encoding.
Protects uppercase letters in title field.
Parameters:
- record (dict): Entry dictionary to process
Returns:
dict: Modified record with homogenized LaTeX encoding
Note: Experimental function, may have limitations
"""
def add_plaintext_fields(record: dict) -> dict:
"""
Add plaintext versions of all fields with 'plain_' prefix.
Creates additional fields with braces and special characters removed
for easier text processing and searching.
Parameters:
- record (dict): Entry dictionary to process
Returns:
dict: Modified record with additional plain_* fields
"""
Exception classes for handling errors in name processing.
class InvalidName(ValueError):
"""
Exception raised by splitname() when an invalid name is encountered.
Used when strict_mode=True and name cannot be parsed according to
BibTeX naming conventions.
"""
pass
import bibtexparser
from bibtexparser import customization
def my_customization(record):
    """Run the built-in customizations on one entry while it is parsed."""
    pipeline = (
        customization.author,              # process author names
        customization.journal,             # convert journal to structured format
        customization.keyword,             # split keywords
        customization.convert_to_unicode,  # convert LaTeX to Unicode
    )
    # Each step receives the record returned by the previous one.
    for step in pipeline:
        record = step(record)
    return record
# Use with parser
parser = bibtexparser.bparser.BibTexParser(customization=my_customization)
with open('bibliography.bib') as f:
db = parser.parse_file(f)
from bibtexparser import customization
# Load database normally
with open('bibliography.bib') as f:
db = bibtexparser.load(f)
# Apply customizations to all entries
for entry in db.entries:
entry = customization.author(entry)
entry = customization.page_double_hyphen(entry)
entry = customization.doi(entry)
from bibtexparser.customization import splitname, getnames
# Parse individual names
name_parts = splitname("von Neumann, Jr., Jean-Baptiste")
print(name_parts)
# {'first': ['Jean-Baptiste'], 'von': ['von'], 'last': ['Neumann'], 'jr': ['Jr.']}
# Format multiple names
authors = ["Einstein, Albert", "Newton, Isaac", "Curie, Marie"]
formatted = getnames(authors)
print(formatted)
# ['Einstein, Albert', 'Newton, Isaac', 'Curie, Marie']
from bibtexparser.customization import convert_to_unicode, homogenize_latex_encoding
# Sample entry with LaTeX encoding
entry = {
'title': 'Schr{\\"o}dinger\'s Cat',
'author': 'Erwin Schr{\\"o}dinger'
}
# Convert to Unicode
unicode_entry = convert_to_unicode(entry.copy())
print(unicode_entry['title']) # Schrödinger's Cat
# Homogenize LaTeX encoding
latex_entry = homogenize_latex_encoding(entry.copy())
print(latex_entry['title'])  # Consistent LaTeX format
from bibtexparser.customization import keyword, link, journal
# Process keywords
entry = {'keyword': 'physics; quantum mechanics, uncertainty'}
entry = keyword(entry, sep=';|,')
print(entry['keyword']) # ['physics', 'quantum mechanics', 'uncertainty']
# Process journal
entry = {'journal': 'Nature Physics'}
entry = journal(entry)
print(entry['journal']) # {'name': 'Nature Physics', 'ID': 'NaturePhysics'}
# Process links
entry = {'link': 'https://example.com PDF article\nhttps://doi.org/10.1000/123 DOI'}
entry = link(entry)
print(entry['link'])
# [{'url': 'https://example.com', 'anchor': 'PDF', 'format': 'article'},
# {'url': 'https://doi.org/10.1000/123', 'anchor': 'DOI'}]


def custom_year_processor(record):
    """
    Derive integer year and century fields from the 'year' field.

    Parameters:
    - record (dict): Entry dictionary to process

    Returns:
    dict: The same record. When it has a 'year' key, 'year_int' is set to
    the parsed integer (None when the value cannot be parsed), and
    'century' is added when 'year_int' is a non-zero integer.
    """
    if 'year' not in record:
        return record

    # int() raises ValueError for non-numeric text and TypeError for
    # non-string values such as None or a list; both mean "no usable year".
    try:
        year_int = int(record['year'])
    except (TypeError, ValueError):
        year_int = None
    record['year_int'] = year_int

    # Years 1-100 map to century 1, 101-200 to century 2, and so on.
    # A falsy year_int (None or 0) gets no century field.
    if year_int:
        record['century'] = (year_int - 1) // 100 + 1
    return record
def comprehensive_customization(record):
    """Apply the full processing pipeline to one entry and return it."""
    steps = (
        # Built-in customizations
        customization.author,
        customization.editor,
        customization.journal,
        customization.keyword,
        customization.doi,
        customization.page_double_hyphen,
        customization.convert_to_unicode,
        # Custom processing
        custom_year_processor,
        # Plaintext fields for searching
        customization.add_plaintext_fields,
    )
    # Each step receives the record returned by the previous one.
    for step in steps:
        record = step(record)
    return record


# Install with Tessl CLI
npx tessl i tessl/pypi-bibtexparser@1.4.2