Quarkus extension for integrating IBM watsonx.ai foundation models with LangChain4j. Provides chat models, generation models, streaming models, embedding models, and scoring models for IBM watsonx.ai. Includes comprehensive configuration options, support for tool/function calling, text extraction from documents in Cloud Object Storage, and experimental built-in services for Google search, weather, and web crawling. Designed for enterprise Java applications using the Quarkus framework with built-in dependency injection and native compilation support.
Extract text content from documents (PDF, images) stored in IBM Cloud Object Storage (COS). Supports multiple output formats including JSON, Markdown, HTML, and plain text. Useful for preprocessing documents before RAG ingestion or analysis.
Service for extracting text from documents in IBM Cloud Object Storage.
public class TextExtraction {
public TextExtraction(
Reference documentReference,
Reference resultReference,
String projectId,
String spaceId,
String version,
COSRestApi cosClient,
WatsonxRestApi watsonxClient
);
// Synchronous methods (start extraction and wait for result)
public String extractAndFetch(String absolutePath);
public String extractAndFetch(String absolutePath, Parameters parameters);
public String uploadExtractAndFetch(File file);
public String uploadExtractAndFetch(File file, Parameters parameters);
public String uploadExtractAndFetch(InputStream is, String fileName);
public String uploadExtractAndFetch(InputStream is, String fileName, Parameters parameters);
// Asynchronous methods (start extraction only)
public String startExtraction(String absolutePath);
public String startExtraction(String absolutePath, Parameters parameters);
public String uploadAndStartExtraction(File file);
public String uploadAndStartExtraction(File file, Parameters parameters);
public String uploadAndStartExtraction(InputStream is, String fileName);
public String uploadAndStartExtraction(InputStream is, String fileName, Parameters parameters);
// Status and cleanup
public TextExtractionResponse checkExtractionStatus(String id);
public void deleteFile(String bucketName, String path);
public void deleteFile(String bucketName, String path, Duration timeout);
}
Supported Input Formats: PDF (.pdf), GIF (.gif), JPEG (.jpg, .jpeg), PNG (.png), TIFF (.tif, .tiff)
Supported Output Formats: JSON, Markdown, HTML, plain text, page images
Synchronous Extraction (Upload and Extract):
import io.quarkiverse.langchain4j.watsonx.runtime.TextExtraction;
import jakarta.inject.Inject;
import java.io.File;
@ApplicationScoped
public class DocumentProcessor {
@Inject
TextExtraction textExtraction;
public String extractPdf(File pdfFile) {
// Upload file, extract text, and return result
String extractedText = textExtraction.uploadExtractAndFetch(pdfFile);
return extractedText;
}
}
Asynchronous Extraction:
import io.quarkiverse.langchain4j.watsonx.bean.TextExtractionResponse;
public class AsyncDocumentProcessor {
@Inject
TextExtraction textExtraction;
public String startExtraction(File pdfFile) {
// Start extraction job and return job ID
String jobId = textExtraction.uploadAndStartExtraction(pdfFile);
return jobId;
}
public String checkStatus(String jobId) {
TextExtractionResponse status = textExtraction.checkExtractionStatus(jobId);
switch (status.metadata().status()) {
case COMPLETED:
// Fetch result from COS
String result = fetchResultFromCOS(status.entity().resultsReference());
return result;
case FAILED:
throw new RuntimeException("Extraction failed: " + status.entity().serviceError());
case RUNNING:
case QUEUED:
return "Still processing...";
default:
return "Unknown status";
}
}
}
Reference to Cloud Object Storage bucket for input documents or output results.
public record Reference(String connection, String bucket) {
public Reference(String connection); // bucket = null
public Reference(String connection, String bucket);
}
Example:
// Reference with both connection and bucket
TextExtraction.Reference documentRef = new TextExtraction.Reference(
"cos-connection-id",
"input-documents"
);
// Reference with connection only (bucket specified elsewhere)
TextExtraction.Reference resultsRef = new TextExtraction.Reference(
"cos-connection-id"
);
Configuration parameters for text extraction operations.
public class Parameters {
public static Builder builder();
public Duration getTimeout();
public String getOutputFileName();
public List<Type> getTypes();
public Optional<Reference> getDocumentReference();
public Optional<Reference> getResultsReference();
public Optional<Boolean> getRemoveUploadedFile();
public Optional<Boolean> getRemoveOutputFile();
public Mode getMode();
public OCR getOcr();
public Boolean getAutoRotationCorrection();
public EmbeddedImages getEmbeddedImages();
public Integer getDpi();
public Boolean getOutputTokensAndBbox();
public static class Builder {
public Builder timeout(Duration timeout);
public Builder outputFileName(String outputFileName);
public Builder types(List<Type> types);
public Builder types(Type type);
public Builder types(Type... types);
public Builder documentReference(Reference documentReference);
public Builder resultsReference(Reference resultsReference);
public Builder removeUploadedFile(Boolean removeUploadedFile);
public Builder removeOutputFile(Boolean removeOutputFile);
public Builder mode(Mode mode);
public Builder ocr(OCR ocr);
public Builder autoRotationCorrection(Boolean autoRotationCorrection);
public Builder embeddedImages(EmbeddedImages embeddedImages);
public Builder dpi(Integer dpi);
public Builder outputTokensAndBbox(Boolean outputTokensAndBbox);
public Parameters build();
}
}
Parameter Details:
timeout (Duration): Maximum time to wait for extraction completion
outputFileName (String): Name of output file in results bucket
types (List<Type>): Output format types
documentReference (Reference): Override default document bucket reference
resultsReference (Reference): Override default results bucket reference
removeUploadedFile (Boolean): Delete uploaded file after extraction
removeOutputFile (Boolean): Delete output file after retrieval
mode (Mode): Processing mode for extraction
ocr (OCR): OCR configuration for image-based text extraction
autoRotationCorrection (Boolean): Automatically correct page rotation
embeddedImages (EmbeddedImages): How to handle embedded images
dpi (Integer): Target DPI for image processing
outputTokensAndBbox (Boolean): Include token positions and bounding boxes
Examples:
// Simple extraction to markdown
TextExtraction.Parameters params = TextExtraction.Parameters.builder()
.types(Type.MARKDOWN)
.timeout(Duration.ofMinutes(2))
.build();
// Multiple output formats
TextExtraction.Parameters multiFormat = TextExtraction.Parameters.builder()
.types(Type.MARKDOWN, Type.JSON, Type.PLAIN_TEXT)
.outputFileName("extracted-content")
.build();
// Advanced OCR with auto-rotation
TextExtraction.Parameters ocrParams = TextExtraction.Parameters.builder()
.types(Type.MARKDOWN)
.autoRotationCorrection(true)
.dpi(300)
.outputTokensAndBbox(true)
.build();
// Clean up files after extraction
TextExtraction.Parameters cleanupParams = TextExtraction.Parameters.builder()
.types(Type.PLAIN_TEXT)
.removeUploadedFile(true)
.removeOutputFile(true)
.build();
Extract text from files already in Cloud Object Storage.
// Extract from existing file in COS
String result = textExtraction.extractAndFetch("/path/to/document.pdf");
// With custom parameters
TextExtraction.Parameters params = TextExtraction.Parameters.builder()
.types(Type.MARKDOWN)
.timeout(Duration.ofMinutes(3))
.build();
String result = textExtraction.extractAndFetch("/path/to/document.pdf", params);
Upload local file and extract in one operation.
import java.io.File;
import java.io.FileInputStream;
// Upload file and extract
File pdfFile = new File("/local/path/document.pdf");
String result = textExtraction.uploadExtractAndFetch(pdfFile);
// Upload from input stream
try (FileInputStream fis = new FileInputStream(pdfFile)) {
String result = textExtraction.uploadExtractAndFetch(
fis,
"document.pdf"
);
}
// With custom parameters
TextExtraction.Parameters params = TextExtraction.Parameters.builder()
.types(Type.JSON, Type.MARKDOWN)
.removeUploadedFile(true)
.removeOutputFile(true)
.build();
String result = textExtraction.uploadExtractAndFetch(pdfFile, params);
Start extraction job and check status separately.
// Start extraction job
String jobId = textExtraction.startExtraction("/path/to/document.pdf");
// Check status later
TextExtractionResponse status = textExtraction.checkExtractionStatus(jobId);
if (status.metadata().status() == TextExtractionResponse.Status.COMPLETED) {
// Retrieve result from COS
String resultPath = status.entity().resultsReference().location();
String result = fetchFromCOS(resultPath);
}
Upload and start extraction job.
import java.io.File;
// Upload and start extraction
File pdfFile = new File("/local/path/document.pdf");
String jobId = textExtraction.uploadAndStartExtraction(pdfFile);
// Poll for completion
TextExtractionResponse status;
do {
Thread.sleep(5000); // Wait 5 seconds
status = textExtraction.checkExtractionStatus(jobId);
} while (status.metadata().status() == TextExtractionResponse.Status.RUNNING ||
status.metadata().status() == TextExtractionResponse.Status.QUEUED);
if (status.metadata().status() == TextExtractionResponse.Status.COMPLETED) {
System.out.println("Extraction completed!");
}
Delete files from Cloud Object Storage.
// Delete file from bucket
textExtraction.deleteFile("bucket-name", "/path/to/file.pdf");
// Delete with custom timeout
textExtraction.deleteFile(
"bucket-name",
"/path/to/file.pdf",
Duration.ofSeconds(30)
);
// Clean up after extraction
String jobId = textExtraction.uploadAndStartExtraction(file);
TextExtractionResponse status = textExtraction.checkExtractionStatus(jobId);
if (status.metadata().status() == TextExtractionResponse.Status.COMPLETED) {
// Delete input and output files
textExtraction.deleteFile(inputBucket, inputPath);
textExtraction.deleteFile(outputBucket, outputPath);
}
Configure and inject TextExtraction service.
Configuration:
quarkus.langchain4j.watsonx.base-url=https://us-south.ml.cloud.ibm.com
quarkus.langchain4j.watsonx.api-key=your-api-key
quarkus.langchain4j.watsonx.project-id=your-project-id
# Text extraction configuration
quarkus.langchain4j.watsonx.text-extraction.base-url=https://s3.us-south.cloud-object-storage.appdomain.cloud
quarkus.langchain4j.watsonx.text-extraction.document-reference.connection=input-connection-id
quarkus.langchain4j.watsonx.text-extraction.document-reference.bucket-name=input-documents
quarkus.langchain4j.watsonx.text-extraction.results-reference.connection=output-connection-id
quarkus.langchain4j.watsonx.text-extraction.results-reference.bucket-name=output-results
quarkus.langchain4j.watsonx.text-extraction.log-requests=false
quarkus.langchain4j.watsonx.text-extraction.log-responses=false
Injection:
import io.quarkiverse.langchain4j.watsonx.runtime.TextExtraction;
import jakarta.inject.Inject;
@ApplicationScoped
public class DocumentService {
@Inject
TextExtraction textExtraction;
public String extractDocument(File file) {
return textExtraction.uploadExtractAndFetch(file);
}
}
public record TextExtractionResponse(
Metadata metadata,
Entity entity
) {
public record Metadata(
String id,
String href,
Status status,
Instant createdAt,
Instant completedAt
) {}
public record Entity(
ResultsReference resultsReference,
ServiceError serviceError
) {}
public record ResultsReference(
String location,
String type,
String connection,
String bucket
) {}
public record ServiceError(
String code,
String message
) {}
public enum Status {
QUEUED, // Extraction job queued
RUNNING, // Extraction in progress
COMPLETED, // Extraction completed successfully
FAILED // Extraction failed
}
}
Usage Example:
String jobId = textExtraction.startExtraction("/path/to/doc.pdf");
TextExtractionResponse response = textExtraction.checkExtractionStatus(jobId);
System.out.println("Job ID: " + response.metadata().id());
System.out.println("Status: " + response.metadata().status());
System.out.println("Created at: " + response.metadata().createdAt());
if (response.metadata().status() == TextExtractionResponse.Status.COMPLETED) {
System.out.println("Result location: " + response.entity().resultsReference().location());
} else if (response.metadata().status() == TextExtractionResponse.Status.FAILED) {
System.out.println("Error: " + response.entity().serviceError().message());
}
Internal request bean:
public class TextExtractionRequest {
public static Builder builder();
public static class Builder {
public Builder documentReference(DocumentReference documentReference);
public Builder resultsReference(ResultsReference resultsReference);
public Builder parameters(ExtractionParameters parameters);
public Builder projectId(String projectId);
public Builder spaceId(String spaceId);
public TextExtractionRequest build();
}
}
public enum Type {
JSON, // Structured JSON output
MD, // Markdown output (same as MARKDOWN)
MARKDOWN, // Markdown output
HTML, // HTML output
PLAIN_TEXT, // Plain text output
PAGE_IMAGES // Page images output
}
// Processing mode
public enum Mode {
// Mode values depend on document type and capabilities
}
// OCR configuration
public enum OCR {
// OCR values depend on OCR capabilities
}
// Embedded images handling
public enum EmbeddedImages {
// Values depend on output format
}
import dev.langchain4j.data.document.Document;
import dev.langchain4j.data.document.DocumentSplitter;
import dev.langchain4j.data.document.splitter.DocumentSplitters;
import dev.langchain4j.store.embedding.EmbeddingStore;
import dev.langchain4j.store.embedding.EmbeddingStoreIngestor;
import java.io.File;
@ApplicationScoped
public class DocumentIngestor {
@Inject
TextExtraction textExtraction;
@Inject
EmbeddingModel embeddingModel;
@Inject
EmbeddingStore<TextSegment> embeddingStore;
public void ingestPdf(File pdfFile) {
// Extract text from PDF
String extractedText = textExtraction.uploadExtractAndFetch(pdfFile);
// Create document
Document document = Document.from(
extractedText,
Metadata.from("source", pdfFile.getName())
);
// Split into chunks
DocumentSplitter splitter = DocumentSplitters.recursive(500, 50);
List<TextSegment> segments = splitter.split(document);
// Embed and store
EmbeddingStoreIngestor ingestor = EmbeddingStoreIngestor.builder()
.embeddingModel(embeddingModel)
.embeddingStore(embeddingStore)
.build();
ingestor.ingest(segments);
}
}
import java.util.List;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
@ApplicationScoped
public class BatchDocumentProcessor {
@Inject
TextExtraction textExtraction;
private final ExecutorService executor = Executors.newFixedThreadPool(10);
public List<String> processDocumentsBatch(List<File> files) {
// Start all extraction jobs asynchronously
List<CompletableFuture<String>> futures = files.stream()
.map(file -> CompletableFuture.supplyAsync(() -> {
// Upload and start extraction
String jobId = textExtraction.uploadAndStartExtraction(file);
// Poll for completion
TextExtractionResponse status;
do {
Thread.sleep(5000);
status = textExtraction.checkExtractionStatus(jobId);
} while (status.metadata().status() == TextExtractionResponse.Status.RUNNING ||
status.metadata().status() == TextExtractionResponse.Status.QUEUED);
if (status.metadata().status() == TextExtractionResponse.Status.COMPLETED) {
// Fetch result
return fetchResultFromCOS(status.entity().resultsReference());
} else {
throw new RuntimeException("Extraction failed: " +
status.entity().serviceError().message());
}
}, executor))
.toList();
// Wait for all to complete
return futures.stream()
.map(CompletableFuture::join)
.toList();
}
}
@ApplicationScoped
public class MetadataPreservingExtractor {
@Inject
TextExtraction textExtraction;
public Document extractWithMetadata(File file, Map<String, Object> additionalMetadata) {
// Extract text
TextExtraction.Parameters params = TextExtraction.Parameters.builder()
.types(Type.JSON) // JSON includes structural metadata
.outputTokensAndBbox(true) // Include token positions
.build();
String jsonResult = textExtraction.uploadExtractAndFetch(file, params);
// Parse JSON to extract metadata
JsonObject json = parseJson(jsonResult);
String text = extractTextFromJson(json);
Map<String, Object> extractedMetadata = extractMetadataFromJson(json);
// Combine with additional metadata
Metadata metadata = Metadata.from(extractedMetadata);
additionalMetadata.forEach(metadata::put);
return Document.from(text, metadata);
}
}
// Short timeout for small documents
TextExtraction.Parameters quickParams = TextExtraction.Parameters.builder()
.timeout(Duration.ofSeconds(30))
.build();
// Longer timeout for large documents
TextExtraction.Parameters largeParams = TextExtraction.Parameters.builder()
.timeout(Duration.ofMinutes(5))
.build();
// Always clean up temporary files
TextExtraction.Parameters cleanupParams = TextExtraction.Parameters.builder()
.removeUploadedFile(true)
.removeOutputFile(true)
.build();
String result = textExtraction.uploadExtractAndFetch(file, cleanupParams);
// Input and output files are automatically deleted
import io.quarkiverse.langchain4j.watsonx.exception.TextExtractionException;
import io.quarkiverse.langchain4j.watsonx.exception.COSException;
public String extractSafely(File file) {
try {
return textExtraction.uploadExtractAndFetch(file);
} catch (TextExtractionException e) {
System.err.println("Extraction failed: " + e.getMessage());
// Handle extraction-specific errors
return null;
} catch (COSException e) {
System.err.println("COS error: " + e.getMessage() +
" (Status: " + e.statusCode() + ")");
// Handle COS-specific errors
return null;
} catch (Exception e) {
System.err.println("Unexpected error: " + e.getMessage());
return null;
}
}
// For large document batches, use asynchronous methods
public void processLargeBatch(List<File> files) {
// Start all extractions
Map<String, File> jobIdToFile = new HashMap<>();
for (File file : files) {
String jobId = textExtraction.uploadAndStartExtraction(file);
jobIdToFile.put(jobId, file);
}
// Poll periodically
ScheduledExecutorService scheduler = Executors.newScheduledThreadPool(1);
scheduler.scheduleAtFixedRate(() -> {
for (Map.Entry<String, File> entry : jobIdToFile.entrySet()) {
String jobId = entry.getKey();
TextExtractionResponse status = textExtraction.checkExtractionStatus(jobId);
if (status.metadata().status() == TextExtractionResponse.Status.COMPLETED) {
// Process completed extraction
String result = fetchResultFromCOS(status.entity().resultsReference());
processResult(result, entry.getValue());
jobIdToFile.remove(jobId);
} else if (status.metadata().status() == TextExtractionResponse.Status.FAILED) {
// Handle failed extraction
handleFailure(entry.getValue(), status.entity().serviceError());
jobIdToFile.remove(jobId);
}
}
// Stop scheduler when all done
if (jobIdToFile.isEmpty()) {
scheduler.shutdown();
}
}, 0, 10, TimeUnit.SECONDS);
}
// Plain text for simple RAG ingestion
TextExtraction.Parameters plainParams = TextExtraction.Parameters.builder()
.types(Type.PLAIN_TEXT)
.build();
// Markdown for structured content
TextExtraction.Parameters mdParams = TextExtraction.Parameters.builder()
.types(Type.MARKDOWN)
.build();
// JSON for detailed analysis
TextExtraction.Parameters jsonParams = TextExtraction.Parameters.builder()
.types(Type.JSON)
.outputTokensAndBbox(true)
.build();
// Multiple formats for different use cases
TextExtraction.Parameters multiParams = TextExtraction.Parameters.builder()
.types(Type.MARKDOWN, Type.JSON, Type.PLAIN_TEXT)
.build();
Bean classes for text extraction API requests and responses.
public record TextExtractionRequest(
String spaceId,
String projectId,
TextExtractionDataReference documentReference,
TextExtractionDataReference resultsReference,
TextExtractionParameters parameters
) {
public static Builder builder();
public static record CosDataLocation(String fileName, String bucket) {}
public static record CosDataConnection(String id) {}
public static record TextExtractionDataReference(
String type,
CosDataConnection connection,
CosDataLocation location
) {
public static TextExtractionDataReference of(String connectionId, String fileName);
public static TextExtractionDataReference of(String connectionId, String fileName, String bucket);
}
public static record TextExtractionParameters(
Mode mode,
OCR ocr,
List<Type> output,
String outputFileName,
Boolean autoRotationCorrection,
EmbeddedImages embeddedImages,
Integer dpi,
Boolean outputTokensAndBbox
) {}
public static class Builder {
public Builder spaceId(String spaceId);
public Builder projectId(String projectId);
public Builder documentReference(TextExtractionDataReference documentReference);
public Builder resultsReference(TextExtractionDataReference resultsReference);
public Builder parameters(TextExtractionParameters parameters);
public TextExtractionRequest build();
}
public static enum Type {
JSON,
HTML,
MD,
PLAIN_TEXT,
PAGE_IMAGES
}
public static enum Mode {
STANDARD,
HIGH_QUALITY
}
public static enum OCR {
DISABLED,
ENABLED,
FORCED
}
public static enum EmbeddedImages {
DISABLED,
ENABLED_PLACEHOLDER,
ENABLED_TEXT,
ENABLED_VERBALIZATION,
ENABLED_VERBALIZATION_ALL
}
}
Enum Descriptions:
Type (Output Formats): JSON (structured JSON), HTML, MD (Markdown), PLAIN_TEXT (plain text), PAGE_IMAGES (page images)
Mode: STANDARD or HIGH_QUALITY processing
OCR: DISABLED, ENABLED, or FORCED optical character recognition
EmbeddedImages: DISABLED, ENABLED_PLACEHOLDER, ENABLED_TEXT, ENABLED_VERBALIZATION, or ENABLED_VERBALIZATION_ALL
Install with Tessl CLI
npx tessl i tessl/maven-io-quarkiverse-langchain4j--quarkus-langchain4j-watsonx