PMD Core - The foundational library module providing essential infrastructure for PMD static code analysis including AST handling, rule execution, configuration management, and reporting mechanisms.
—
The Copy-Paste Detection (CPD) module provides specialized capabilities for identifying code duplications across files using token-based analysis. It includes duplicate detection algorithms, match representation, configuration options, and reporting for code clone analysis.
The `CPD` class is the main entry point for executing copy-paste detection analysis, with configurable parameters and comprehensive duplicate identification.
/**
 * Main class for Copy-Paste Detection functionality.
 *
 * <p>Analyzes source files to identify duplicate code segments using
 * token-based comparison. Typical use: construct with a
 * {@link CPDConfiguration}, call {@link #go()}, then read the results via
 * {@link #getMatches()}.
 */
public class CPD {

    /**
     * Creates a CPD instance for the given configuration.
     *
     * @param configuration CPDConfiguration with analysis settings
     */
    CPD(CPDConfiguration configuration);

    /**
     * Executes the CPD analysis on the configured source files.
     * Processes all files and identifies duplicate code segments.
     */
    void go();

    /**
     * Returns the detected code duplication matches.
     *
     * @return Iterator over Match instances representing duplicate code
     */
    Iterator<Match> getMatches();

    /**
     * Returns the token count for a specific file.
     *
     * @param file File path to query
     * @return Number of tokens found in the file
     */
    int getNumberOfTokens(String file);

    /**
     * Returns the token counts for all analyzed files.
     *
     * @return Map of file paths to token counts
     */
    Map<String, Integer> getTokenCounts();
}
Usage Examples:
import java.nio.file.Paths;
import java.util.Iterator;
import java.util.Map;
import net.sourceforge.pmd.cpd.*;
// Basic CPD analysis
public class CPDAnalysisExample {
/**
 * Runs a CPD analysis over {@code src/main/java} and prints every detected
 * duplication to standard output.
 *
 * <p>For each match the token count, line count and all locations are
 * printed, followed by a short preview of the duplicated code.
 */
public void runCPDAnalysis() {
    // Upper bound on the number of source lines echoed per duplicate
    final int maxPreviewLines = 5;

    // Create CPD configuration
    CPDConfiguration config = new CPDConfiguration();
    config.setMinimumTileSize(50); // Minimum tokens for duplication
    config.addInputPath(Paths.get("src/main/java"));
    config.setLanguage(LanguageRegistry.CPD.getLanguageByFullName("Java"));

    // Create and execute CPD
    CPD cpd = new CPD(config);
    cpd.go(); // Execute analysis

    // Process results
    Iterator<Match> matches = cpd.getMatches();
    int duplicateCount = 0;
    while (matches.hasNext()) {
        Match match = matches.next();
        duplicateCount++;
        System.out.printf("Duplicate #%d:%n", duplicateCount);
        System.out.printf(" Tokens: %d%n", match.getTokenCount());
        System.out.printf(" Lines: %d%n", match.getLineCount());
        System.out.printf(" Locations: %d%n", match.getMarkSet().size());

        // Show all locations of this duplicate
        for (Mark mark : match.getMarkSet()) {
            System.out.printf(" %s:%d-%d%n",
                mark.getFilename(),
                mark.getBeginLine(),
                mark.getEndLine());
        }

        // Show the duplicated code. Split on LF or CRLF so that sources with
        // Windows line endings do not leave a trailing '\r' on each line.
        System.out.println(" Code:");
        String[] lines = match.getSourceCodeSlice().split("\\r?\\n");
        for (int i = 0; i < Math.min(lines.length, maxPreviewLines); i++) {
            System.out.printf(" %s%n", lines[i]);
        }
        if (lines.length > maxPreviewLines) {
            System.out.println(" ...");
        }
        System.out.println();
    }
    System.out.printf("Found %d code duplications%n", duplicateCount);
}
/**
 * Prints token statistics for an analysis: the per-file token counts in
 * descending order, the overall total, and the file with the most tokens.
 *
 * @param cpd CPD instance whose token counts are reported
 */
public void analyzeTokenCounts(CPD cpd) {
    Map<String, Integer> countsByFile = cpd.getTokenCounts();

    // Per-file listing, highest count first
    System.out.println("Token counts by file:");
    countsByFile.entrySet().stream()
        .sorted(Map.Entry.<String, Integer>comparingByValue().reversed())
        .forEach(fileAndCount -> System.out.printf(" %s: %d tokens%n",
            fileAndCount.getKey(), fileAndCount.getValue()));

    // Overall total
    int tokenSum = 0;
    for (int count : countsByFile.values()) {
        tokenSum += count;
    }
    System.out.printf("Total tokens analyzed: %d%n", tokenSum);

    // File with the highest count; the first one encountered wins ties
    Map.Entry<String, Integer> biggest = null;
    for (Map.Entry<String, Integer> candidate : countsByFile.entrySet()) {
        if (biggest == null || candidate.getValue().compareTo(biggest.getValue()) > 0) {
            biggest = candidate;
        }
    }
    String largestFile = "none";
    if (biggest != null && biggest.getKey() != null) {
        largestFile = biggest.getKey();
    }
    System.out.printf("Largest file: %s (%d tokens)%n",
        largestFile,
        countsByFile.getOrDefault(largestFile, 0));
}
}
Representation of detected code duplication matches with location tracking and source code access.
/**
 * Represents a detected code duplication match.
 * Contains information about duplicate locations and the duplicated source code.
 *
 * <p>Declared {@link Comparable} so that matches can be ordered with the
 * standard sorting utilities, e.g.
 * {@code Collections.sort(matches, Collections.reverseOrder())}.
 */
public final class Match implements Comparable<Match> {

    /**
     * Returns the number of duplicate tokens.
     *
     * @return Token count for the duplicated code segment
     */
    int getTokenCount();

    /**
     * Returns the number of duplicate lines.
     *
     * @return Line count for the duplicated code segment
     */
    int getLineCount();

    /**
     * Returns all locations where this duplication appears.
     *
     * @return List of Mark instances representing duplicate locations
     */
    List<Mark> getMarkSet();

    /**
     * Returns the duplicated source code content.
     *
     * @return Source code text that is duplicated across locations
     */
    String getSourceCodeSlice();

    /**
     * Compares matches for sorting (by token count, then line count).
     *
     * @param other Match to compare against
     * @return Comparison result for ordering matches
     */
    int compareTo(Match other);
}
Usage Examples:
import net.sourceforge.pmd.cpd.*;
import java.util.List;
import java.util.ArrayList;
import java.util.Collections;
// Processing duplicate matches
public class MatchProcessingExample {
/**
 * Drains the iterator into a list, orders the matches by the reverse of
 * their natural ordering, and prints each one with a one-based index.
 *
 * @param matches iterator over the matches to process; it is fully consumed
 */
public void processMatches(Iterator<Match> matches) {
    List<Match> ordered = new ArrayList<>();
    while (matches.hasNext()) {
        ordered.add(matches.next());
    }

    // Reverse natural ordering puts the most significant matches first
    ordered.sort(Collections.reverseOrder());

    System.out.printf("Found %d duplicate code blocks:%n%n", ordered.size());
    int position = 1;
    for (Match match : ordered) {
        processSingleMatch(match, position);
        position++;
    }
}
/**
 * Prints a report for one duplication: its size, every location where it
 * occurs, and a numbered preview of at most ten lines of the duplicated code.
 *
 * @param match the duplication to describe
 * @param index one-based ordinal shown in the report header
 */
public void processSingleMatch(Match match, int index) {
    final int previewLimit = 10;

    System.out.printf("=== Duplicate #%d ===%n", index);
    System.out.printf("Size: %d tokens (%d lines)%n",
        match.getTokenCount(), match.getLineCount());

    // Numbered list of every place the duplicate occurs
    List<Mark> marks = match.getMarkSet();
    System.out.printf("Appears in %d locations:%n", marks.size());
    int ordinal = 0;
    for (Mark mark : marks) {
        ordinal++;
        System.out.printf(" %d. %s (lines %d-%d)%n",
            ordinal, mark.getFilename(), mark.getBeginLine(), mark.getEndLine());
    }

    // Preview of the duplicated code, tolerant of LF and CRLF line endings
    String slice = match.getSourceCodeSlice();
    System.out.println("Duplicated code:");
    String[] codeLines = slice.split("\\r?\\n");
    int shown = Math.min(codeLines.length, previewLimit);
    int lineNo = 0;
    while (lineNo < shown) {
        System.out.printf(" %2d: %s%n", lineNo + 1, codeLines[lineNo]);
        lineNo++;
    }
    int hidden = codeLines.length - previewLimit;
    if (hidden > 0) {
        System.out.printf(" ... (%d more lines)%n", hidden);
    }
    System.out.println();
}
/**
 * Prints a summary report for a set of duplications: overall totals, the ten
 * files that appear in the most duplicate locations, and how the duplicates
 * are distributed across size categories.
 *
 * @param matches the duplications to summarize
 */
public void generateDuplicationReport(List<Match> matches) {
    // Size category labels; the boundaries match the classification below
    final String small = "Small (< 100 tokens)";
    final String medium = "Medium (100-499 tokens)";
    final String large = "Large (500+ tokens)";

    // Calculate duplication statistics: every location beyond the first
    // one is counted as redundant
    int totalDuplicateTokens = matches.stream()
        .mapToInt(match -> match.getTokenCount() * (match.getMarkSet().size() - 1))
        .sum();
    int totalDuplicateLines = matches.stream()
        .mapToInt(match -> match.getLineCount() * (match.getMarkSet().size() - 1))
        .sum();

    // Count how many duplicate locations fall into each file
    Map<String, Integer> fileOccurrences = new HashMap<>();
    matches.forEach(match ->
        match.getMarkSet().forEach(mark ->
            fileOccurrences.merge(mark.getFilename(), 1, Integer::sum)));

    System.out.println("=== Duplication Summary ===");
    System.out.printf("Total duplicate blocks: %d%n", matches.size());
    System.out.printf("Total duplicate tokens: %d%n", totalDuplicateTokens);
    System.out.printf("Total duplicate lines: %d%n", totalDuplicateLines);

    // printf (not println) so that %n is expanded to a line separator
    System.out.printf("%nFiles with most duplications:%n");
    fileOccurrences.entrySet().stream()
        .sorted(Map.Entry.<String, Integer>comparingByValue().reversed())
        .limit(10)
        .forEach(entry ->
            System.out.printf(" %s: %d occurrences%n",
                entry.getKey(), entry.getValue()));

    // Analyze duplication sizes
    System.out.printf("%nDuplication size distribution:%n");
    Map<String, Long> sizeDistribution = matches.stream()
        .collect(Collectors.groupingBy(
            match -> {
                int tokens = match.getTokenCount();
                if (tokens < 100) {
                    return small;
                } else if (tokens < 500) {
                    return medium;
                } else {
                    return large;
                }
            },
            Collectors.counting()));

    // Print in a fixed small-to-large order instead of hash-map order;
    // categories without any match are skipped
    for (String size : new String[] {small, medium, large}) {
        Long count = sizeDistribution.get(size);
        if (count != null) {
            System.out.printf(" %s: %d duplicates%n", size, count);
        }
    }
}
}
Configuration class for customizing copy-paste detection analysis parameters and behavior.
/**
 * Configuration for Copy-Paste Detection analysis.
 * Extends AbstractConfiguration with CPD-specific settings: the minimum
 * duplicate size, which token differences to ignore, file matching, and the
 * renderer used for output.
 */
public class CPDConfiguration extends AbstractConfiguration {

    /**
     * Default constructor with CPD language registry.
     */
    CPDConfiguration();

    /**
     * Constructor with custom language registry.
     *
     * @param languageRegistry Registry of CPD-capable languages
     */
    CPDConfiguration(LanguageRegistry languageRegistry);

    /**
     * Returns the minimum tile size (minimum tokens for duplication).
     *
     * @return Minimum number of tokens required for duplicate detection
     */
    int getMinimumTileSize();

    /**
     * Sets the minimum tile size for duplication detection.
     *
     * @param minimumTileSize Minimum tokens (must be positive)
     */
    void setMinimumTileSize(int minimumTileSize);

    /**
     * Checks whether differences in literals are ignored.
     *
     * @return true if literal values are ignored during comparison
     */
    boolean isIgnoreLiterals();

    /**
     * Sets whether to ignore literal differences.
     *
     * @param ignoreLiterals true to ignore string/numeric literal values
     */
    void setIgnoreLiterals(boolean ignoreLiterals);

    /**
     * Checks whether differences in identifiers are ignored.
     *
     * @return true if identifier names are ignored during comparison
     */
    boolean isIgnoreIdentifiers();

    /**
     * Sets whether to ignore identifier differences.
     *
     * @param ignoreIdentifiers true to ignore variable/method names
     */
    void setIgnoreIdentifiers(boolean ignoreIdentifiers);

    /**
     * Checks whether annotation differences are ignored.
     *
     * @return true if annotations are ignored during comparison
     */
    boolean isIgnoreAnnotations();

    /**
     * Sets whether to ignore annotation differences.
     *
     * @param ignoreAnnotations true to ignore annotation presence/content
     */
    void setIgnoreAnnotations(boolean ignoreAnnotations);

    /**
     * Checks whether only files with the same name are compared.
     *
     * @return true if cross-file comparison is limited to same filenames
     */
    boolean isMatchOnlyFilesWithSameName();

    /**
     * Sets whether to compare only files with the same name.
     *
     * @param matchOnlyFilesWithSameName true to limit to same-name files
     */
    void setMatchOnlyFilesWithSameName(boolean matchOnlyFilesWithSameName);

    /**
     * Returns the renderer for CPD output formatting.
     *
     * @return CPDRenderer for generating reports
     */
    CPDRenderer getRenderer();

    /**
     * Sets the renderer for CPD output.
     *
     * @param renderer CPDRenderer for formatting results
     */
    void setRenderer(CPDRenderer renderer);
}
Usage Examples:
import net.sourceforge.pmd.cpd.*;
import java.nio.file.Paths;
// Configuring CPD analysis
public class CPDConfigurationExample {
/**
 * Builds a minimal CPD configuration for Java sources under
 * {@code src/main/java} with a plain-text report written to
 * {@code cpd-report.txt}, then prints a confirmation message.
 */
public void createBasicConfiguration() {
    final int minimumTokens = 50; // duplicates shorter than this are not reported
    final String sourceDirectory = "src/main/java";
    final String reportPath = "cpd-report.txt";

    CPDConfiguration basic = new CPDConfiguration();

    // What to analyze
    basic.setMinimumTileSize(minimumTokens);
    basic.addInputPath(Paths.get(sourceDirectory));
    basic.setLanguage(LanguageRegistry.CPD.getLanguageById("java"));

    // Where and how to report
    basic.setReportFormat("text");
    basic.setReportFile(Paths.get(reportPath));

    System.out.println("Basic CPD configuration created");
}
/**
 * Builds a more sensitive CPD configuration: a lower token threshold,
 * literal/identifier/annotation differences ignored, main and test sources
 * as input, two exclusion entries, and UTF-8 source encoding. Prints a
 * confirmation message when done.
 */
public void createAdvancedConfiguration() {
    CPDConfiguration advanced = new CPDConfiguration();

    // Detection sensitivity: lower threshold, tolerant token comparison
    advanced.setMinimumTileSize(25);
    advanced.setIgnoreLiterals(true);     // string/number differences
    advanced.setIgnoreIdentifiers(true);  // variable name differences
    advanced.setIgnoreAnnotations(true);  // annotation differences

    // Do not restrict comparison to files that share a name
    advanced.setMatchOnlyFilesWithSameName(false);

    // Inputs: production and test sources
    for (String sourceDirectory : new String[] {"src/main/java", "src/test/java"}) {
        advanced.addInputPath(Paths.get(sourceDirectory));
    }
    advanced.setLanguage(LanguageRegistry.CPD.getLanguageById("java"));

    // Exclusions.
    // NOTE(review): these strings look like glob patterns, but Paths.get
    // rejects '*' on some platforms (e.g. Windows) - confirm that the
    // excludes are meant to be given this way.
    advanced.setExcludes(Arrays.asList(
        Paths.get("**/generated/**"),
        Paths.get("**/target/**")));

    // Encoding used to read the sources
    advanced.setSourceEncoding(StandardCharsets.UTF_8);

    System.out.println("Advanced CPD configuration created");
}
/**
 * Shows one configuration per language (Java, JavaScript, Python), each with
 * its own token threshold and input directory, then prints a confirmation
 * message. The configurations are only created here, not executed.
 */
public void configureCPDForDifferentLanguages() {
    // Java: 50-token threshold
    CPDConfiguration forJava = new CPDConfiguration();
    forJava.setLanguage(LanguageRegistry.CPD.getLanguageById("java"));
    forJava.setMinimumTileSize(50);
    forJava.addInputPath(Paths.get("src/main/java"));

    // JavaScript: smaller threshold, literal values ignored
    CPDConfiguration forJavaScript = new CPDConfiguration();
    forJavaScript.setLanguage(LanguageRegistry.CPD.getLanguageById("javascript"));
    forJavaScript.setMinimumTileSize(30);
    forJavaScript.setIgnoreLiterals(true);
    forJavaScript.addInputPath(Paths.get("src/main/webapp/js"));

    // Python: 40-token threshold
    CPDConfiguration forPython = new CPDConfiguration();
    forPython.setLanguage(LanguageRegistry.CPD.getLanguageById("python"));
    forPython.setMinimumTileSize(40);
    forPython.addInputPath(Paths.get("src/main/python"));

    System.out.println("Language-specific configurations created");
}
/**
 * Builds a configuration aimed at structural duplicates: literals,
 * identifiers and annotations are all ignored, with a 30-token threshold
 * over the {@code src} directory. Prints a confirmation message when done.
 */
public void configureIgnoreOptions() {
    CPDConfiguration flexible = new CPDConfiguration();

    // With these switches on, code is still reported as duplicated when it
    // differs only in:
    //   - string literals ("hello" matches "world")
    //   - variable names
    //   - method names
    //   - presence or absence of annotations (@Override vs none)
    flexible.setIgnoreLiterals(true);
    flexible.setIgnoreIdentifiers(true);
    flexible.setIgnoreAnnotations(true);

    // Lower threshold since more differences are being ignored
    final int minimumTokens = 30;
    flexible.setMinimumTileSize(minimumTokens);
    flexible.addInputPath(Paths.get("src"));

    System.out.println("Flexible matching configuration created");
}
/**
 * Runs two analyses over {@code src} and prints how many matches each one
 * found: a strict pass (100-token threshold, literals and identifiers
 * compared) followed by a flexible pass (50-token threshold, literals and
 * identifiers ignored).
 */
public void runMultipleAnalyses() {
    // Strict pass: exact matches only
    CPDConfiguration strictConfig = new CPDConfiguration();
    strictConfig.setMinimumTileSize(100);
    strictConfig.setIgnoreLiterals(false);
    strictConfig.setIgnoreIdentifiers(false);
    strictConfig.addInputPath(Paths.get("src"));

    CPD strictRun = new CPD(strictConfig);
    strictRun.go();
    int exactDuplicates = countMatches(strictRun.getMatches());
    System.out.printf("Strict analysis found %d exact duplicates%n", exactDuplicates);

    // Flexible pass: structural matches
    CPDConfiguration flexibleConfig = new CPDConfiguration();
    flexibleConfig.setMinimumTileSize(50);
    flexibleConfig.setIgnoreLiterals(true);
    flexibleConfig.setIgnoreIdentifiers(true);
    flexibleConfig.addInputPath(Paths.get("src"));

    CPD flexibleRun = new CPD(flexibleConfig);
    flexibleRun.go();
    int structuralDuplicates = countMatches(flexibleRun.getMatches());
    System.out.printf("Flexible analysis found %d structural duplicates%n", structuralDuplicates);
}
/**
 * Counts the elements remaining in the iterator, consuming it in the process.
 *
 * @param matches iterator to drain
 * @return number of elements that were left in the iterator
 */
private int countMatches(Iterator<Match> matches) {
    int total = 0;
    for (; matches.hasNext(); total++) {
        matches.next();
    }
    return total;
}
}
/**
* Mark representing a specific location of duplicated code
*/
final class Mark {
/**
* Get filename containing the duplicate
* @return File path where duplicate code appears
*/
String getFilename();
/**
* Get starting line number of duplicate
* @return One-based line number where duplicate begins
*/
int getBeginLine();
/**
* Get ending line number of duplicate
* @return One-based line number where duplicate ends
*/
int getEndLine();
/**
* Get starting column number of duplicate
* @return One-based column number where duplicate begins
*/
int getBeginColumn();
/**
* Get ending column number of duplicate
* @return One-based column number where duplicate ends
*/
int getEndColumn();
/**
* Get token count for this mark
* @return Number of tokens in the duplicate
*/
int getTokenCount();
/**
* Compare marks for sorting
* @param other Mark to compare against
* @return Comparison result for ordering
*/
int compareTo(Mark other);
}
/**
* Renderer interface for CPD output formatting.
* Implementations write the detected duplications to the writer supplied via
* {@link #setWriter(Writer)}.
* NOTE(review): the method names suggest a start / renderDuplication / end
* call sequence - confirm the required order against the implementation.
*/
interface CPDRenderer {
/**
* Start rendering CPD results.
*/
void start();
/**
* Render a single duplication match.
* @param match Match to render
*/
void renderDuplication(Match match);
/**
* Finish rendering and clean up.
*/
void end();
/**
* Set the output writer used for rendering.
* @param writer Writer for output
*/
void setWriter(Writer writer);
}
/**
* Built-in CPD renderers for different output formats.
* Each static factory method returns a {@link CPDRenderer}; the output format
* of each one is inferred from the method name only.
*/
class CPDRenderers {
/** @return built-in renderer for the text format */
static CPDRenderer text();
/** @return built-in renderer for the XML format */
static CPDRenderer xml();
/** @return built-in renderer for the CSV format */
static CPDRenderer csv();
/** @return built-in renderer for the JSON format */
static CPDRenderer json();
}
/**
* Token for CPD analysis representing an atomic code element.
* Exposes the token's text, its type identifier, and its begin/end position
* (line and column) in the source.
*/
interface Token {
/**
* Get the token image (text representation).
* @return String representation of the token
*/
String getImage();
/**
* Get the token type identifier.
* @return Integer representing the token type
*/
int getKind();
/**
* Get the line number where the token begins.
* @return One-based line number
*/
int getBeginLine();
/**
* Get the column number where the token begins.
* @return One-based column number
*/
int getBeginColumn();
/**
* Get the ending line number of the token.
* @return One-based ending line number
*/
int getEndLine();
/**
* Get the ending column number of the token.
* @return One-based ending column number
*/
int getEndColumn();
}
/**
* CPD visitor for language-specific tokenization.
* Declares one method to tokenize a source file and one to add an individual
* token to the analysis.
*/
interface CpdVisitor {
/**
* Visit a source file and generate tokens.
* @param sourceCode Source code to tokenize
* @param filename File name for context
*/
void visitFile(String sourceCode, String filename);
/**
* Add a token to the CPD analysis.
* Note the parameter order: both line numbers come before both column numbers.
* @param image Token text
* @param beginLine Starting line
* @param endLine Ending line
* @param beginColumn Starting column
* @param endColumn Ending column
*/
void add(String image, int beginLine, int endLine, int beginColumn, int endColumn);
}
/**
* Exception thrown during CPD processing.
* Extends {@link Exception}, so it is a checked exception.
*/
class CPDException extends Exception {
/**
* Create an exception with a detail message.
* @param message Description of the failure
*/
CPDException(String message);
/**
* Create an exception with a detail message and an underlying cause.
* @param message Description of the failure
* @param cause Throwable that led to this exception
*/
CPDException(String message, Throwable cause);
}
/**
 * CPD report statistics: aggregate figures for a completed analysis.
 */
interface CPDReportStats {

    /**
     * Returns the total number of duplicate blocks found.
     *
     * @return Count of duplicate code blocks
     */
    int getNumberOfDuplicates();

    /**
     * Returns the total number of duplicate tokens.
     *
     * @return Sum of all duplicate token counts
     */
    int getTotalDuplicateTokens();

    /**
     * Returns the total number of duplicate lines.
     *
     * @return Sum of all duplicate line counts
     */
    int getTotalDuplicateLines();

    /**
     * Returns the number of files analyzed.
     *
     * @return Number of source files processed
     */
    int getFilesAnalyzed();

    /**
     * Returns the duplication percentage.
     *
     * @return Percentage of code that is duplicated
     */
    double getDuplicationPercentage();
}
Install with Tessl CLI
npx tessl i tessl/maven-net-sourceforge-pmd--pmd-core