tessl/pypi-reuse

A tool for compliance with the REUSE recommendations for software licensing and copyright management.

—

Pending

Overview

Eval results

Files

REUSE Information Processing

Name: tessl/pypi-reuse
Author: tessl

The REUSE information processing system provides data structures and functions for handling licensing and copyright information. The core ReuseInfo class and supporting utilities enable extraction, manipulation, and analysis of REUSE compliance data.

Capabilities

Core Data Structures

The foundational data classes for REUSE information handling.

@dataclass(frozen=True)
class ReuseInfo:
    """
    Simple dataclass holding licensing and copyright information.

    The dataclass is declared frozen, so attributes cannot be reassigned
    after construction (the set objects themselves remain ordinary sets).
    Every attribute has a default, so ``ReuseInfo()`` yields an instance
    with three empty sets and ``None`` for the remaining attributes.

    Attributes:
        spdx_expressions: SPDX license expressions.
        copyright_lines: Copyright statements.
        contributor_lines: Contributor information.
        path: File path this info applies to.
        source_path: Source file where info was found.
        source_type: Type of source containing the info.
    """
    # default_factory=set gives each instance its own fresh empty set.
    spdx_expressions: set[Expression] = field(default_factory=set)
    copyright_lines: set[str] = field(default_factory=set)
    contributor_lines: set[str] = field(default_factory=set)
    # Location/provenance attributes; None when not supplied.
    path: Optional[str] = None
    source_path: Optional[str] = None
    source_type: Optional[SourceType] = None

class SourceType(Enum):
    """
    Enumeration representing types of sources for license information.

    Each member's value is a lowercase, hyphenated string identifier.
    """
    DOT_LICENSE = "dot-license"  # A .license file containing license information
    FILE_HEADER = "file-header"  # A file header containing license information
    DEP5 = "dep5"  # A .reuse/dep5 file containing license information
    REUSE_TOML = "reuse-toml"  # A REUSE.toml file containing license information

ReuseInfo Manipulation

Methods for creating, copying, and combining ReuseInfo instances.

def copy(self, **kwargs: Any) -> ReuseInfo:
    """
    Return a copy of this ReuseInfo with selected attributes replaced.

    Each keyword argument names an attribute; in the copy, that attribute
    takes the value given in kwargs.

    Args:
        **kwargs: Attribute names mapped to their replacement values.

    Returns:
        New ReuseInfo instance with the updated attributes.

    Raises:
        KeyError: If kwargs names an attribute that does not exist.
    """

def union(self, value: ReuseInfo) -> ReuseInfo:
    """
    Return a new ReuseInfo combining the set attributes of self and value.

    Every set attribute of the result equals the union of the corresponding
    set in self and the set in value.

    All non-set attributes are set to their values in self; the values
    held by ``value`` for those attributes do not carry over.

    Args:
        value: ReuseInfo instance to union with.

    Returns:
        New ReuseInfo instance with the combined data.
    """

def __or__(self, value: ReuseInfo) -> ReuseInfo:
    """
    Support the ``|`` operator as shorthand for ``union``.

    Args:
        value: ReuseInfo instance on the right-hand side of the operator.

    Returns:
        The result of ``self.union(value)``.
    """
    return self.union(value)

Usage Examples:

from reuse import ReuseInfo, SourceType

# Create a basic ReuseInfo that also records where its data came from
info1 = ReuseInfo(
    copyright_lines={"2023 Jane Doe"},
    source_path="example.py",
    source_type=SourceType.FILE_HEADER
)

# Create another with different data and no source details
# NOTE(review): spdx_expressions is annotated as set[Expression]; plain
# strings are passed in these examples — confirm that is acceptable.
info2 = ReuseInfo(
    copyright_lines={"2023 John Smith"},
    spdx_expressions={"MIT"}
)

# Copy with modifications: the named attributes are replaced in the copy
modified_info = info1.copy(
    copyright_lines={"2024 Jane Doe"},
    spdx_expressions={"Apache-2.0"}
)

# Union two ReuseInfo instances: set attributes are merged
combined = info1.union(info2)
print(f"Combined copyrights: {combined.copyright_lines}")
# Output (set ordering may vary):
# Combined copyrights: {'2023 Jane Doe', '2023 John Smith'}

# Using union operator
combined_alt = info1 | info2  # Same as info1.union(info2)

Content Analysis Methods

Methods for analyzing ReuseInfo content and compliance status.

def contains_copyright_or_licensing(self) -> bool:
    """
    Check if either spdx_expressions or copyright_lines is non-empty.

    Returns:
        True if the instance contains copyright or licensing information
        (one of the two is enough), False if both are empty.
    """

def contains_copyright_xor_licensing(self) -> bool:
    """
    Check if exactly one of spdx_expressions or copyright_lines is non-empty.

    Returns:
        True if the instance contains exactly one type of information
        (copyright XOR licensing); False if both or neither are present.
    """

def contains_info(self) -> bool:
    """
    Check if any field except path, source_path and source_type is non-empty.

    The three excluded fields describe where the information applies or
    was found, so they do not count as REUSE information themselves.

    Returns:
        True if the instance contains any substantive REUSE information.
    """

def __bool__(self) -> bool:
    """
    Check if any attributes have values.

    This is the hook used by ``bool(info)`` and by ``if info:``.

    Returns:
        True if any attribute is truthy, False otherwise.
    """

Usage Examples:

# Create ReuseInfo instances for testing
# One instance per case: nothing set, copyright only, license only, both.
# NOTE(review): spdx_expressions is annotated as set[Expression]; plain
# strings are passed here — confirm that is acceptable.
empty_info = ReuseInfo()
copyright_only = ReuseInfo(copyright_lines={"2023 Jane Doe"})
license_only = ReuseInfo(spdx_expressions={"MIT"})
complete_info = ReuseInfo(
    copyright_lines={"2023 Jane Doe"},
    spdx_expressions={"MIT"}
)

# Test content analysis methods (expected value shown after each call)
print(f"Empty has info: {empty_info.contains_info()}")  # False: no field is set
print(f"Copyright only has copyright or license: {copyright_only.contains_copyright_or_licensing()}")  # True: copyright present
print(f"License only has copyright XOR license: {license_only.contains_copyright_xor_licensing()}")  # True: exactly one kind present
print(f"Complete info has copyright or license: {complete_info.contains_copyright_or_licensing()}")  # True: both present
print(f"Complete info has copyright XOR license: {complete_info.contains_copyright_xor_licensing()}")  # False: both present, not exactly one

# Boolean evaluation goes through __bool__
print(f"Empty info is truthy: {bool(empty_info)}")  # False
print(f"Complete info is truthy: {bool(complete_info)}")  # True

Content Extraction Functions

Functions for extracting REUSE information from text content and files.

def extract_reuse_info(text: str) -> ReuseInfo:
    """
    Extract REUSE info from text content.

    Args:
        text: Text content to analyze for REUSE information.

    Returns:
        ReuseInfo instance containing the extracted information.

    Note:
        Searches for SPDX license identifiers, copyright statements,
        and contributor information using pattern matching.
    """

def reuse_info_of_file(path: Path) -> ReuseInfo:
    """
    Get REUSE info for a specific file.

    Args:
        path: Path of the file to analyze.

    Returns:
        ReuseInfo instance for the file.

    Raises:
        FileNotFoundError: If the file doesn't exist.
        UnicodeDecodeError: If the file can't be decoded as text.
    """

def contains_reuse_info(text: str) -> bool:
    """
    Check if text contains REUSE information.

    Args:
        text: Text content to check.

    Returns:
        True if text contains REUSE licensing or copyright information,
        False otherwise.
    """

Usage Examples:

# NOTE(review): reuse_info_of_file is assumed to live in reuse.extract next
# to the other extraction functions shown in this section — confirm.
from reuse.extract import (
    contains_reuse_info,
    extract_reuse_info,
    reuse_info_of_file,
)
from pathlib import Path

# Extract from text content
file_content = '''
# SPDX-FileCopyrightText: 2023 Jane Doe <jane@example.com>
# SPDX-License-Identifier: MIT

def hello_world():
    print("Hello, World!")
'''

info = extract_reuse_info(file_content)
print(f"Extracted licenses: {info.spdx_expressions}")
print(f"Extracted copyrights: {info.copyright_lines}")

# Check if content has REUSE info
has_info = contains_reuse_info(file_content)
print(f"Contains REUSE info: {has_info}")

# Extract from file (only attempted when the example file is present)
if Path("example.py").exists():
    file_info = reuse_info_of_file(Path("example.py"))
    print(f"File REUSE info: {file_info}")

Text Processing Utilities

Utility functions for processing and manipulating text content.

def find_spdx_tag(text: str, pattern: re.Pattern) -> Iterator[str]:
    """
    Find SPDX tags in text using a regex pattern.

    Args:
        text: Text to search.
        pattern: Compiled regex pattern for SPDX tags.

    Yields:
        str: SPDX tag values found in text.
    """

def filter_ignore_block(text: str) -> str:
    """
    Filter ignored blocks from text.

    Args:
        text: Input text potentially containing ignore blocks.

    Returns:
        Text with ignore blocks removed.

    Note:
        Removes sections marked with REUSE-IgnoreStart/REUSE-IgnoreEnd comments.
    """

def detect_line_endings(text: str) -> str:
    """
    Detect line ending style in text.

    Args:
        text: Text content to analyze.

    Returns:
        Line ending character(s) detected ('\\n', '\\r\\n', or '\\r').
    """

Usage Examples:

import re
from reuse.extract import find_spdx_tag, filter_ignore_block, detect_line_endings

# Find SPDX license identifiers; the capture group holds the rest of the line
license_pattern = re.compile(r'SPDX-License-Identifier:\s*([^\n\r]*)')
text_with_licenses = "SPDX-License-Identifier: MIT\nSPDX-License-Identifier: GPL-3.0"

for license_id in find_spdx_tag(text_with_licenses, license_pattern):
    print(f"Found license: {license_id}")

# Filter ignore blocks delimited by REUSE-IgnoreStart / REUSE-IgnoreEnd
text_with_ignore = '''
Some content
# REUSE-IgnoreStart
This content should be ignored
# REUSE-IgnoreEnd  
More content
'''

filtered = filter_ignore_block(text_with_ignore)
print(f"Filtered text: {filtered}")

# Detect line endings; repr() makes the control characters visible
unix_text = "Line 1\nLine 2\n"
windows_text = "Line 1\r\nLine 2\r\n"

print(f"Unix endings: {repr(detect_line_endings(unix_text))}")  # '\n'
print(f"Windows endings: {repr(detect_line_endings(windows_text))}")  # '\r\n'

Binary File Handling

Functions for handling binary files and extracting text content.

def decoded_text_from_binary(binary_data: bytes) -> str:
    """
    Extract text from binary file data.

    Args:
        binary_data: Raw binary data from a file.

    Returns:
        Decoded text content.

    Raises:
        UnicodeDecodeError: If the binary data cannot be decoded as text.

    Note:
        Attempts multiple encoding strategies (UTF-8, Latin-1, etc.)
        and handles byte order marks (BOM).
    """

Usage Examples:

# extract_reuse_info is imported as well so this snippet runs on its own.
from reuse.extract import decoded_text_from_binary, extract_reuse_info

# Read binary file and decode
with open("example.py", "rb") as f:
    binary_data = f.read()

try:
    text_content = decoded_text_from_binary(binary_data)
    # Now extract REUSE info from text
    info = extract_reuse_info(text_content)
except UnicodeDecodeError:
    print("File is not text or uses unsupported encoding")

Complete REUSE Information Processing Example

from reuse import ReuseInfo, SourceType
from reuse.extract import extract_reuse_info, contains_reuse_info
from pathlib import Path

def process_file_reuse_info(file_path: Path) -> dict:
    """Collect the REUSE information of one file and classify its compliance.

    Args:
        file_path: Path of the file to read; it is opened as UTF-8 text.

    Returns:
        A dict with these keys:
            "file": ``str(file_path)``.
            "has_reuse_info": True only when extraction yielded information.
            "licenses", "copyrights", "contributors": sorted lists of
                strings (empty when nothing was extracted).
            "compliance_status": "compliant" when both licenses and
                copyrights were found, "partial" when only one of the two
                was found, "missing_info" when neither was found, "error"
                when the file could not be read, and "unknown" when the
                text was reported to contain REUSE info but extraction
                produced none.
            "error": the exception message; present only on failure.
    """
    result = {
        "file": str(file_path),
        "has_reuse_info": False,
        "licenses": [],
        "copyrights": [],
        "contributors": [],
        "compliance_status": "unknown"
    }

    try:
        # Read file content
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()

        # Cheap pre-check: bail out early when there is nothing to extract
        if not contains_reuse_info(content):
            result["compliance_status"] = "missing_info"
            return result

        # Extract REUSE information
        info = extract_reuse_info(content)

        if info.contains_info():
            result["has_reuse_info"] = True
            # The sources are sets; sort so the output order is deterministic.
            result["licenses"] = sorted(str(expr) for expr in info.spdx_expressions)
            result["copyrights"] = sorted(info.copyright_lines)
            result["contributors"] = sorted(info.contributor_lines)

            # Determine compliance status
            if info.spdx_expressions and info.copyright_lines:
                result["compliance_status"] = "compliant"
            elif info.contains_copyright_or_licensing():
                # Not both, but at least one: exactly one kind is present.
                result["compliance_status"] = "partial"
            else:
                # Only contributor information was found.
                result["compliance_status"] = "missing_info"

    except (FileNotFoundError, UnicodeDecodeError) as e:
        result["error"] = str(e)
        result["compliance_status"] = "error"

    return result

# Usage: analyze one file and print selected fields of the result dict
file_analysis = process_file_reuse_info(Path("src/example.py"))
print(f"File: {file_analysis['file']}")
print(f"Compliance: {file_analysis['compliance_status']}")
print(f"Licenses: {file_analysis['licenses']}")
print(f"Copyrights: {file_analysis['copyrights']}")

Install with Tessl CLI