A syntax highlighting package that supports over 500 programming languages and text formats with extensive output format options
—
Quality
Pending
Does it follow best practices?
Impact
Pending
No eval scenarios have been run
Base classes and utilities for creating custom lexers, formatters, styles, and filters to extend Pygments functionality.
Base classes for implementing language-specific lexers.
class Lexer:
"""
Base lexer class.
Attributes:
- name: Human-readable lexer name
- aliases: List of short identifiers
- filenames: List of filename patterns
- mimetypes: List of MIME types
- priority: Priority for lexer selection (higher = preferred)
"""
def get_tokens(self, text: str): ...
def get_tokens_unprocessed(self, text: str): ...
def analyse_text(text: str) -> float: ...

class RegexLexer(Lexer):
"""
Lexer based on regular expressions and states.
Attributes:
- tokens: Dictionary mapping state names to token rules
- flags: Regex flags (re.MULTILINE | re.IGNORECASE, etc.)
"""

class ExtendedRegexLexer(RegexLexer):
"""
Enhanced regex lexer with additional features.
"""

class DelegatingLexer(Lexer):
"""
Lexer that delegates to other lexers based on content.
"""

Usage example:
from pygments.lexer import RegexLexer
from pygments.token import *
class MyLanguageLexer(RegexLexer):
name = 'MyLanguage'
aliases = ['mylang', 'ml']
filenames = ['*.ml', '*.mylang']
mimetypes = ['text/x-mylang']
tokens = {
'root': [
(r'\s+', Whitespace),
(r'#.*$', Comment.Single),
(r'\b(if|else|while|for)\b', Keyword),
(r'\b[A-Z][a-zA-Z0-9_]*\b', Name.Class),
(r'\b[a-z][a-zA-Z0-9_]*\b', Name),
(r'"[^"]*"', String.Double),
(r'\d+', Number.Integer),
(r'[+\-*/=<>!]', Operator),
(r'[(){}[\],;]', Punctuation),
]
}

Base class for creating output formatters.
class Formatter:
"""
Base formatter class.
Attributes:
- name: Human-readable formatter name
- aliases: List of short identifiers
- filenames: List of filename patterns
- unicodeoutput: Whether formatter outputs Unicode
"""
def format(self, tokensource, outfile): ...
def get_style_defs(self, arg='') -> str: ...

Usage example:
from pygments.formatter import Formatter
from pygments.token import *
class JsonFormatter(Formatter):
name = 'JSON'
aliases = ['json']
filenames = ['*.json']
def format(self, tokensource, outfile):
import json
tokens = []
for ttype, value in tokensource:
tokens.append({
'type': str(ttype),
'value': value
})
json.dump(tokens, outfile, indent=2)

Base class for creating color schemes.
class Style:
"""
Base style class.
Attributes:
- name: Style name
- styles: Dictionary mapping token types to style definitions
"""

Usage example:
from pygments.style import Style
from pygments.token import *
class MyDarkStyle(Style):
name = 'mydark'
styles = {
Comment: 'italic #75715e',
Keyword: 'bold #66d9ef',
Name: '#f8f8f2',
Name.Attribute: '#a6e22e',
Name.Class: 'bold #a6e22e',
Name.Function: '#a6e22e',
Number: '#ae81ff',
Operator: '#f92672',
String: '#e6db74',
String.Doc: 'italic #e6db74',
Generic.Deleted: '#f92672',
Generic.Inserted: '#a6e22e',
Generic.Heading: 'bold #f8f8f2',
Error: '#f8f8f2 bg:#f92672',
}

Base class for creating token stream filters.
class Filter:
"""
Base filter class.
Methods:
- filter(lexer, stream): Process token stream
"""
def filter(self, lexer, stream): ...

Usage example:
from pygments.filter import Filter
from pygments.token import *
class UppercaseFilter(Filter):
"""Convert all text to uppercase."""
def filter(self, lexer, stream):
for ttype, value in stream:
yield ttype, value.upper()
class RedactSecretsFilter(Filter):
"""Replace sensitive information with asterisks."""
def __init__(self, **options):
Filter.__init__(self, **options)
self.keywords = options.get('keywords', ['password', 'secret', 'key'])
def filter(self, lexer, stream):
for ttype, value in stream:
if ttype is String:
for keyword in self.keywords:
if keyword.lower() in value.lower():
value = '***REDACTED***'
break
yield ttype, value

def include(state: str): ...
def inherit(): ...
def bygroups(*args): ...
def using(cls, **kwargs): ...
def this(): ...
def default(state: str): ...
def words(words: list, prefix: str = '', suffix: str = ''): ...

Usage in lexer tokens:
tokens = {
'root': [
(r'\s+', Whitespace),
include('comments'),
(r'\b(class|def)\b', Keyword, 'classdef'),
(words(['int', 'str', 'bool'], suffix=r'\b'), Name.Builtin.Type),
default('expr'),
],
'comments': [
(r'#.*$', Comment.Single),
(r'/\*', Comment.Multiline, 'multiline-comment'),
],
'multiline-comment': [
(r'[^*/]+', Comment.Multiline),
(r'/\*', Comment.Multiline, '#push'),
(r'\*/', Comment.Multiline, '#pop'),
(r'[*/]', Comment.Multiline),
],
'classdef': [
(r'\s+', Whitespace),
(r'[A-Z][a-zA-Z0-9_]*', Name.Class, '#pop'),
],
'expr': [
(r'"', String.Double, 'string'),
(r'\d+', Number.Integer),
(r'[a-zA-Z_][a-zA-Z0-9_]*', Name),
(r'[+\-*/]', Operator),
],
'string': [
(r'[^"\\]+', String.Double),
(r'\\.', String.Escape),
(r'"', String.Double, '#pop'),
],
}

def analyse_text(text: str) -> float:
"""
Analyze text and return confidence score (0.0-1.0).
Used for lexer guessing. Higher scores indicate better match.
"""

Example implementation:
@staticmethod
def analyse_text(text):
score = 0.0
# Check for specific keywords
if re.search(r'\b(function|var|const|let)\b', text):
score += 0.3
# Check for syntax patterns
if re.search(r'function\s+\w+\s*\(', text):
score += 0.2
# Check file structure
if re.search(r'export\s+(default\s+)?', text):
score += 0.1
return min(score, 1.0)

class LexerContext:
"""Context for lexer state management."""

def string_to_tokentype(s: str) -> _TokenType:
"""Convert string to token type (e.g., 'Name.Function' -> Token.Name.Function)."""
def is_token_subtype(ttype: _TokenType, other: _TokenType) -> bool:
"""Check if ttype is a subtype of other."""

Register custom components using setuptools entry points:
# setup.py
setup(
name='my-pygments-extensions',
entry_points={
'pygments.lexers': [
'mylang = mypackage.lexers:MyLanguageLexer',
],
'pygments.formatters': [
'json = mypackage.formatters:JsonFormatter',
],
'pygments.styles': [
'mydark = mypackage.styles:MyDarkStyle',
],
'pygments.filters': [
'redact = mypackage.filters:RedactSecretsFilter',
],
}
)

from pygments.lexers import load_lexer_from_file
from pygments.formatters import load_formatter_from_file
# Load from files
custom_lexer = load_lexer_from_file('mylexer.py', 'MyLexer')
custom_formatter = load_formatter_from_file('myformatter.py', 'MyFormatter')

# Test lexer
lexer = MyLanguageLexer()
tokens = list(lexer.get_tokens('test code here'))
assert len(tokens) > 0
# Test formatter
formatter = JsonFormatter()
result = formatter.format(tokens, sys.stdout)
# Test style
style = MyDarkStyle()
html_formatter = HtmlFormatter(style=style)
# Test filter
filter_instance = RedactSecretsFilter(keywords=['secret', 'password'])
lexer.add_filter(filter_instance)

Install with Tessl CLI
npx tessl i tessl/pypi-pygments