A comprehensive Python API for the Terrier information retrieval platform, enabling declarative experimentation with transformer pipelines for indexing, retrieval, and evaluation tasks.
—
PyTerrier's Java integration layer provides seamless access to the underlying Terrier information retrieval platform through comprehensive Java VM management, configuration, and interoperability features.
Core functions for initializing and managing the Java Virtual Machine that runs the Terrier platform.
def init(version: str = None, **kwargs) -> None:
"""
Initialize the Java VM and Terrier platform.
Parameters:
- version: Specific Terrier version to use
- **kwargs: Additional configuration options
"""
def legacy_init(*args, **kwargs) -> None:
"""
Legacy initialization function (deprecated).
Issues deprecation warning and delegates to init().
"""
def started() -> bool:
"""
Check if the Java VM has been started.
Returns:
- True if Java VM is running, False otherwise
"""
def configure(**kwargs) -> None:
"""
Configure Java environment before initialization.
Parameters:
- **kwargs: Configuration options (memory, classpath, etc.)
"""
Usage Examples:
# Basic initialization
if not pt.java.started():
pt.java.init()
# Initialize with specific version
pt.java.init(version='5.7')
# Check if already started
if pt.java.started():
print("Java VM is running")
# Configure before initialization
pt.java.configure(memory='4G', redirect_io=True)
pt.java.init()
Functions for configuring JVM memory limits and performance settings.
def set_memory_limit(memory: str) -> None:
"""
Set JVM memory limit (must be called before init).
Parameters:
- memory: Memory limit (e.g., '4G', '2048M', '1024m')
"""
def add_option(option: str) -> None:
"""
Add JVM command line option (must be called before init).
Parameters:
- option: JVM option (e.g., '-Xmx4G', '-XX:+UseG1GC')
"""
def set_java_home(java_home: str) -> None:
"""
Set JAVA_HOME path for JVM discovery.
Parameters:
- java_home: Path to Java installation directory
"""
Usage Examples:
# Set memory limit before initialization
pt.java.set_memory_limit('8G')
pt.java.init()
# Add custom JVM options
pt.java.add_option('-XX:+UseG1GC')
pt.java.add_option('-Xmx4G')
pt.java.init()
# Set custom Java installation
pt.java.set_java_home('/usr/lib/jvm/java-11-openjdk')
Functions for managing Java classpath and adding external JAR files or Maven packages.
def extend_classpath(paths: List[str]) -> None:
"""
Extend Java classpath with additional JAR files or directories.
Parameters:
- paths: List of paths to JAR files or directories
"""
def add_jar(jar_path: str) -> None:
"""
Add single JAR file to classpath.
Parameters:
- jar_path: Path to JAR file
"""
def add_package(package_spec: str) -> None:
"""
Add Maven package to classpath.
Parameters:
- package_spec: Maven coordinates (e.g., 'org.apache.lucene:lucene-core:8.11.1')
"""
Usage Examples:
# Add external JAR files
pt.java.add_jar('/path/to/custom.jar')
pt.java.extend_classpath(['/path/to/lib1.jar', '/path/to/lib2.jar'])
# Add Maven packages
pt.java.add_package('org.apache.commons:commons-lang3:3.12.0')
pt.java.add_package('com.fasterxml.jackson.core:jackson-core:2.13.0')
pt.java.init() # Initialize after adding dependencies
Functions for accessing and interacting with Java classes and objects from Python.
def autoclass(class_name: str) -> type:
"""
Automatically load Java class for use in Python.
Parameters:
- class_name: Fully qualified Java class name
Returns:
- Python wrapper for Java class
"""
def cast(java_object: Any, target_class: str) -> Any:
"""
Cast Java object to specific type.
Parameters:
- java_object: Java object to cast
- target_class: Target class name for casting
Returns:
- Cast Java object
"""
J: Any # Direct access to Java classes (J.java.lang.String, etc.)
class JavaClasses:
"""Registry for commonly used Java classes."""
Usage Examples:
# Load Java classes
ArrayList = pt.java.autoclass('java.util.ArrayList')
HashMap = pt.java.autoclass('java.util.HashMap')
# Create Java objects
java_list = ArrayList()
java_map = HashMap()
# Direct class access
string_class = pt.java.J.java.lang.String
integer_class = pt.java.J.java.lang.Integer
# Type casting
casted_object = pt.java.cast(some_object, 'org.terrier.structures.Index')
Functions for managing Java I/O redirection and logging levels.
def redirect_stdouterr() -> None:
"""
Redirect Java stdout/stderr to Python stdout/stderr.
"""
def set_redirect_io(redirect: bool) -> None:
"""
Configure I/O redirection (must be called before init).
Parameters:
- redirect: Whether to redirect Java I/O to Python
"""
def set_log_level(level: str) -> None:
"""
Set Java logging level.
Parameters:
- level: Log level ('ERROR', 'WARN', 'INFO', 'DEBUG')
"""
Usage Examples:
# Configure I/O redirection
pt.java.set_redirect_io(True)
pt.java.init()
# Set logging level
pt.java.set_log_level('WARN') # Reduce log verbosity
# Manually redirect output
pt.java.redirect_stdouterr()
Helper functions for Java integration and data conversion.
def bytebuffer_to_array(bytebuffer: Any) -> bytes:
"""
Convert Java ByteBuffer to Python bytes array.
Parameters:
- bytebuffer: Java ByteBuffer object
Returns:
- Python bytes object
"""
def required() -> bool:
"""
Check if Java is required for current operations.
Returns:
- True if Java is required, False otherwise
"""
def required_raise() -> None:
"""
Raise exception if Java is required but not available.
Raises:
- RuntimeError: If Java is required but not started
"""
Functions for parallel Java VM initialization and configuration.
def parallel_init(*args, **kwargs) -> None:
"""
Initialize Java VM for parallel processing contexts.
Parameters:
- *args, **kwargs: Initialization parameters
"""
def parallel_init_args() -> Tuple[Any, ...]:
"""
Get arguments for parallel Java initialization.
Returns:
- Tuple of initialization arguments
"""
Functions for registering callbacks that run before Java VM initialization.
def before_init(callback: Callable[[], None]) -> None:
"""
Register callback to run before Java VM initialization.
Parameters:
- callback: Function to call before init
"""
Usage Example:
# Register pre-initialization callback
def setup_custom_properties():
pt.terrier.set_property('custom.property', 'value')
pt.java.before_init(setup_custom_properties)
pt.java.init()
# Load custom Java classes
CustomRetriever = pt.java.autoclass('com.example.CustomRetriever')
custom_retriever = CustomRetriever()
# Use in PyTerrier pipeline
class CustomTransformer(pt.Transformer):
def __init__(self):
self.java_retriever = CustomRetriever()
def transform(self, topics):
# Use Java object in transformation
return self.java_retriever.retrieve(topics)
# Add multiple dependencies
dependencies = [
'org.apache.lucene:lucene-core:8.11.1',
'org.apache.lucene:lucene-analyzers-common:8.11.1',
'com.fasterxml.jackson.core:jackson-databind:2.13.0'
]
for dep in dependencies:
pt.java.add_package(dep)
pt.java.init()
# Configure for large-scale processing
pt.java.set_memory_limit('16G')
pt.java.add_option('-XX:+UseG1GC')
pt.java.add_option('-XX:MaxGCPauseMillis=200')
pt.java.add_option('-XX:+DisableExplicitGC')
pt.java.set_redirect_io(False) # Reduce I/O overhead
pt.java.init()
from typing import List, Any, Callable, Tuple, Optional
# Java integration types
JavaClass = type # Python wrapper for Java class
JavaObject = Any # Java object instance
ClassPath = List[str] # List of JAR files or directories
MavenCoordinate = str # Maven artifact coordinates
LogLevel = str # Java logging level
MemorySpec = str # Memory specification (e.g., '4G', '2048M')
InitCallback = Callable[[], None] # Pre-initialization callback
Install with Tessl CLI
npx tessl i tessl/pypi-python-terrier