tessl/maven-net-sourceforge-pmd--pmd-core

PMD Core - The foundational library module providing essential infrastructure for PMD static code analysis including AST handling, rule execution, configuration management, and reporting mechanisms.

—

Pending

Overview

Eval results

Files

Copy-Paste Detection

Name: tessl/maven-net-sourceforge-pmd--pmd-core
Author: tessl

The Copy-Paste Detection (CPD) module provides specialized capabilities for identifying code duplications across files using token-based analysis. It includes duplicate detection algorithms, match representation, configuration options, and reporting for code clone analysis.

Capabilities

CPD Analysis Engine

Main class for executing copy-paste detection analysis with configurable parameters and comprehensive duplicate identification.

/**
 * Main class for Copy-Paste Detection functionality.
 * Analyzes source files to identify duplicate code segments using token-based comparison.
 *
 * <p>Call order used by the usage examples on this page: construct with a
 * {@link CPDConfiguration}, call {@link #go()}, then read the results through
 * {@link #getMatches()} and {@link #getTokenCounts()}.
 *
 * <p>NOTE(review): members are listed as bare signatures (no bodies, no access
 * modifiers). The examples call them from unrelated classes, so they are presumably
 * public - confirm against the library Javadoc.
 */
public class CPD {
    
    /**
     * Constructor with CPD configuration
     * @param configuration CPDConfiguration with analysis settings
     */
    CPD(CPDConfiguration configuration);
    
    /**
     * Execute CPD analysis on configured source files
     * Processes all files and identifies duplicate code segments.
     * The usage examples always invoke this before querying matches or token counts.
     */
    void go();
    
    /**
     * Get detected code duplication matches
     * The result is a plain {@link Iterator}, i.e. single-pass; copy it into a list
     * first if the matches need to be sorted or traversed more than once.
     * @return Iterator over Match instances representing duplicate code
     */
    Iterator<Match> getMatches();
    
    /**
     * Get token count for specific file
     * NOTE(review): the result for a file that was not part of the analysis is not
     * specified in this listing - confirm before relying on it.
     * @param file File path to query
     * @return Number of tokens found in the file
     */
    int getNumberOfTokens(String file);
    
    /**
     * Get token counts for all analyzed files
     * @return Map of file paths to token counts
     */
    Map<String, Integer> getTokenCounts();
}

Usage Examples:

import net.sourceforge.pmd.cpd.*;
import java.nio.file.Paths;
import java.util.Iterator;
import java.util.Map;

// Basic CPD analysis
// NOTE(review): LanguageRegistry is not obviously part of net.sourceforge.pmd.cpd; it
// presumably lives in net.sourceforge.pmd.lang and needs its own import - confirm
// against the PMD version in use.
/**
 * End-to-end example: configure CPD, run it, print every duplicate that was found,
 * and summarize per-file token counts.
 */
public class CPDAnalysisExample {

    /** Number of source lines echoed per duplicate before the output is cut with "...". */
    private static final int PREVIEW_LINES = 5;

    /**
     * Configures CPD for the Java sources under {@code src/main/java}, runs the
     * analysis and prints each detected duplicate with its locations and a short
     * preview of the duplicated code.
     */
    public void runCPDAnalysis() {
        // Create CPD configuration
        CPDConfiguration config = new CPDConfiguration();
        config.setMinimumTileSize(50);  // Minimum tokens for duplication
        config.addInputPath(Paths.get("src/main/java"));
        // Look the language up by id, the same way every other example on this page does.
        config.setLanguage(LanguageRegistry.CPD.getLanguageById("java"));

        // Create and execute CPD
        CPD cpd = new CPD(config);
        cpd.go();  // Execute analysis

        // Process results
        Iterator<Match> matches = cpd.getMatches();
        int duplicateCount = 0;

        while (matches.hasNext()) {
            Match match = matches.next();
            duplicateCount++;

            System.out.printf("Duplicate #%d:%n", duplicateCount);
            System.out.printf("  Tokens: %d%n", match.getTokenCount());
            System.out.printf("  Lines: %d%n", match.getLineCount());
            System.out.printf("  Locations: %d%n", match.getMarkSet().size());

            // Show all locations of this duplicate
            for (Mark mark : match.getMarkSet()) {
                System.out.printf("    %s:%d-%d%n",
                    mark.getFilename(),
                    mark.getBeginLine(),
                    mark.getEndLine());
            }

            // Show the duplicated code. Split on \r?\n so sources with Windows line
            // endings do not leave a stray carriage return at the end of each line.
            System.out.println("  Code:");
            String[] lines = match.getSourceCodeSlice().split("\\r?\\n");
            for (int i = 0; i < Math.min(lines.length, PREVIEW_LINES); i++) {
                System.out.printf("    %s%n", lines[i]);
            }
            if (lines.length > PREVIEW_LINES) {
                System.out.println("    ...");
            }
            System.out.println();
        }

        System.out.printf("Found %d code duplications%n", duplicateCount);
    }

    /**
     * Prints the token count of every analyzed file (largest first), the total
     * number of tokens, and the file with the most tokens.
     *
     * @param cpd a CPD instance whose token counts are read via {@code getTokenCounts()}
     */
    public void analyzeTokenCounts(CPD cpd) {
        // Get token statistics
        Map<String, Integer> tokenCounts = cpd.getTokenCounts();

        System.out.println("Token counts by file:");
        tokenCounts.entrySet().stream()
            .sorted(Map.Entry.<String, Integer>comparingByValue().reversed())
            .forEach(entry ->
                System.out.printf("  %s: %d tokens%n",
                    entry.getKey(), entry.getValue()));

        // Calculate total tokens
        int totalTokens = tokenCounts.values().stream()
            .mapToInt(Integer::intValue)
            .sum();

        System.out.printf("Total tokens analyzed: %d%n", totalTokens);

        // Find largest file; "none" (reported with 0 tokens) when the map is empty
        String largestFile = tokenCounts.entrySet().stream()
            .max(Map.Entry.comparingByValue())
            .map(Map.Entry::getKey)
            .orElse("none");

        System.out.printf("Largest file: %s (%d tokens)%n",
            largestFile,
            tokenCounts.getOrDefault(largestFile, 0));
    }
}

Match Representation

Representation of detected code duplication matches with location tracking and source code access.

/**
 * Represents a detected code duplication match.
 * Contains information about duplicate locations and the duplicated source code.
 *
 * <p>Declared as {@code Comparable<Match>} so that lists of matches can be ordered
 * with the standard library ({@code Collections.sort}, {@code Collections.reverseOrder()}),
 * which the match-processing usage example relies on.
 */
public final class Match implements Comparable<Match> {
    
    /**
     * Get number of duplicate tokens
     * @return Token count for the duplicated code segment
     */
    int getTokenCount();
    
    /**
     * Get number of duplicate lines
     * @return Line count for the duplicated code segment
     */
    int getLineCount();
    
    /**
     * Get all locations where this duplication appears
     * @return List of Mark instances representing duplicate locations
     */
    List<Mark> getMarkSet();
    
    /**
     * Get duplicated source code content
     * @return Source code text that is duplicated across locations
     */
    String getSourceCodeSlice();
    
    /**
     * Compare matches for sorting (by token count, then line count).
     * Declared public because it implements {@link Comparable#compareTo(Object)},
     * whose visibility cannot be narrowed.
     * NOTE(review): whether larger matches order first or last is not stated in this
     * listing - use an explicit Comparator when the direction matters.
     * @param other Match to compare against
     * @return Comparison result for ordering matches
     */
    public int compareTo(Match other);
}

Usage Examples:

import net.sourceforge.pmd.cpd.*;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;

// Processing duplicate matches
/**
 * Shows how to order, print and summarize the matches produced by a CPD run.
 */
public class MatchProcessingExample {

    /** Number of source lines echoed per duplicate before the listing is truncated. */
    private static final int PREVIEW_LINES = 10;

    /**
     * Largest duplicates first: token count descending, ties broken by line count
     * descending. Spelled out explicitly so the result does not depend on the
     * direction of Match's own natural ordering.
     */
    private static final Comparator<Match> MOST_SIGNIFICANT_FIRST =
        Comparator.<Match>comparingInt(Match::getTokenCount)
            .thenComparingInt(Match::getLineCount)
            .reversed();

    /**
     * Drains the iterator, sorts the matches by significance and prints each one.
     *
     * @param matches matches as returned by {@code CPD.getMatches()}; consumed by this call
     */
    public void processMatches(Iterator<Match> matches) {
        List<Match> matchList = new ArrayList<>();
        matches.forEachRemaining(matchList::add);

        // Sort matches by significance (token count descending)
        Collections.sort(matchList, MOST_SIGNIFICANT_FIRST);

        System.out.printf("Found %d duplicate code blocks:%n%n", matchList.size());

        for (int i = 0; i < matchList.size(); i++) {
            Match match = matchList.get(i);
            processSingleMatch(match, i + 1);
        }
    }

    /**
     * Prints one match: its size, every location it occurs at, and the first
     * lines of the duplicated code.
     *
     * @param match the duplicate to print
     * @param index one-based position used in the heading
     */
    public void processSingleMatch(Match match, int index) {
        System.out.printf("=== Duplicate #%d ===%n", index);
        System.out.printf("Size: %d tokens (%d lines)%n",
            match.getTokenCount(),
            match.getLineCount());

        List<Mark> locations = match.getMarkSet();
        System.out.printf("Appears in %d locations:%n", locations.size());

        // Show all locations
        for (int i = 0; i < locations.size(); i++) {
            Mark mark = locations.get(i);
            System.out.printf("  %d. %s (lines %d-%d)%n",
                i + 1,
                mark.getFilename(),
                mark.getBeginLine(),
                mark.getEndLine());
        }

        // Show the duplicated code
        String sourceCode = match.getSourceCodeSlice();
        System.out.println("Duplicated code:");
        String[] lines = sourceCode.split("\\r?\\n");

        for (int i = 0; i < Math.min(lines.length, PREVIEW_LINES); i++) {
            System.out.printf("  %2d: %s%n", i + 1, lines[i]);
        }

        if (lines.length > PREVIEW_LINES) {
            System.out.printf("  ... (%d more lines)%n", lines.length - PREVIEW_LINES);
        }

        System.out.println();
    }

    /**
     * Prints summary statistics: total duplicated tokens and lines, the files that
     * appear in the most duplicates, and how many duplicates fall into each size bucket.
     *
     * @param matches all matches of one CPD run
     */
    public void generateDuplicationReport(List<Match> matches) {
        // Calculate duplication statistics. One occurrence of every match counts as
        // the "original", so only the remaining (size - 1) copies are redundant.
        int totalDuplicateTokens = matches.stream()
            .mapToInt(match -> match.getTokenCount() * (match.getMarkSet().size() - 1))
            .sum();

        int totalDuplicateLines = matches.stream()
            .mapToInt(match -> match.getLineCount() * (match.getMarkSet().size() - 1))
            .sum();

        // Find files with most duplications
        Map<String, Integer> fileOccurrences = new HashMap<>();
        matches.forEach(match ->
            match.getMarkSet().forEach(mark ->
                fileOccurrences.merge(mark.getFilename(), 1, Integer::sum)));

        System.out.println("=== Duplication Summary ===");
        System.out.printf("Total duplicate blocks: %d%n", matches.size());
        System.out.printf("Total duplicate tokens: %d%n", totalDuplicateTokens);
        System.out.printf("Total duplicate lines: %d%n", totalDuplicateLines);

        // printf, not println: println would print the "%n" characters literally.
        System.out.printf("%nFiles with most duplications:%n");
        fileOccurrences.entrySet().stream()
            .sorted(Map.Entry.<String, Integer>comparingByValue().reversed())
            .limit(10)
            .forEach(entry ->
                System.out.printf("  %s: %d occurrences%n",
                    entry.getKey(), entry.getValue()));

        // Analyze duplication sizes
        System.out.printf("%nDuplication size distribution:%n");
        Map<String, Long> sizeDistribution = matches.stream()
            .collect(Collectors.groupingBy(
                match -> sizeBucket(match.getTokenCount()),
                Collectors.counting()));

        sizeDistribution.forEach((size, count) ->
            System.out.printf("  %s: %d duplicates%n", size, count));
    }

    /** Maps a token count to the label of its size bucket; bucket ranges do not overlap. */
    private static String sizeBucket(int tokens) {
        if (tokens < 100) {
            return "Small (< 100 tokens)";
        }
        if (tokens < 500) {
            return "Medium (100-499 tokens)";
        }
        return "Large (500+ tokens)";
    }
}

CPD Configuration

Configuration class for customizing copy-paste detection analysis parameters and behavior.

/**
 * Configuration for Copy-Paste Detection analysis.
 * Extends AbstractConfiguration with CPD-specific settings.
 *
 * <p>NOTE(review): the usage examples on this page also call {@code addInputPath},
 * {@code setLanguage}, {@code setReportFormat}, {@code setReportFile},
 * {@code setExcludes} and {@code setSourceEncoding} on this type. None of them is
 * declared in this listing; presumably they are inherited from AbstractConfiguration -
 * confirm against the library Javadoc.
 */
public class CPDConfiguration extends AbstractConfiguration {
    
    /**
     * Default constructor with CPD language registry
     */
    CPDConfiguration();
    
    /**
     * Constructor with custom language registry
     * @param languageRegistry Registry of CPD-capable languages
     */
    CPDConfiguration(LanguageRegistry languageRegistry);
    
    /**
     * Get minimum tile size (minimum tokens for duplication)
     * @return Minimum number of tokens required for duplicate detection
     */
    int getMinimumTileSize();
    
    /**
     * Set minimum tile size for duplication detection.
     * The usage examples use values between 25 (more sensitive) and 100 (stricter).
     * @param minimumTileSize Minimum tokens (must be positive)
     */
    void setMinimumTileSize(int minimumTileSize);
    
    /**
     * Check if differences in literals are ignored
     * @return true if literal values are ignored during comparison
     */
    boolean isIgnoreLiterals();
    
    /**
     * Set whether to ignore literal differences
     * @param ignoreLiterals true to ignore string/numeric literal values
     */
    void setIgnoreLiterals(boolean ignoreLiterals);
    
    /**
     * Check if differences in identifiers are ignored
     * @return true if identifier names are ignored during comparison
     */
    boolean isIgnoreIdentifiers();
    
    /**
     * Set whether to ignore identifier differences
     * @param ignoreIdentifiers true to ignore variable/method names
     */
    void setIgnoreIdentifiers(boolean ignoreIdentifiers);
    
    /**
     * Check if annotation differences are ignored
     * @return true if annotations are ignored during comparison
     */
    boolean isIgnoreAnnotations();
    
    /**
     * Set whether to ignore annotation differences
     * @param ignoreAnnotations true to ignore annotation presence/content
     */
    void setIgnoreAnnotations(boolean ignoreAnnotations);
    
    /**
     * Check if only files with same name are compared
     * @return true if cross-file comparison is limited to same filenames
     */
    boolean isMatchOnlyFilesWithSameName();
    
    /**
     * Set whether to compare only files with same name
     * @param matchOnlyFilesWithSameName true to limit to same-name files
     */
    void setMatchOnlyFilesWithSameName(boolean matchOnlyFilesWithSameName);
    
    /**
     * Get renderer for CPD output formatting
     * NOTE(review): the value returned when no renderer has been set is not
     * specified in this listing.
     * @return CPDRenderer for generating reports
     */
    CPDRenderer getRenderer();
    
    /**
     * Set renderer for CPD output
     * @param renderer CPDRenderer for formatting results
     */
    void setRenderer(CPDRenderer renderer);
}

Usage Examples:

import net.sourceforge.pmd.cpd.*;
import java.nio.charset.StandardCharsets;
import java.nio.file.Paths;
import java.util.Arrays;
import java.util.Iterator;

// Configuring CPD analysis
// NOTE(review): LanguageRegistry is not obviously part of net.sourceforge.pmd.cpd; it
// presumably lives in net.sourceforge.pmd.lang and needs its own import - confirm
// against the PMD version in use.
/**
 * Collection of CPDConfiguration recipes: a minimal setup, an advanced setup,
 * per-language setups, relaxed ("ignore") matching, and running two analyses
 * with different strictness side by side.
 */
public class CPDConfigurationExample {

    /** Minimal configuration: tile size, one input path, language and a text report file. */
    public void createBasicConfiguration() {
        CPDConfiguration config = new CPDConfiguration();

        // Set basic parameters
        config.setMinimumTileSize(50);  // Minimum 50 tokens for duplication
        config.addInputPath(Paths.get("src/main/java"));
        config.setLanguage(LanguageRegistry.CPD.getLanguageById("java"));

        // Configure output
        config.setReportFormat("text");
        config.setReportFile(Paths.get("cpd-report.txt"));

        System.out.println("Basic CPD configuration created");
    }

    /** Configuration that relaxes matching, scans two source roots and sets excludes/encoding. */
    public void createAdvancedConfiguration() {
        CPDConfiguration config = new CPDConfiguration();

        // Advanced duplication detection settings
        config.setMinimumTileSize(25);      // Lower threshold for more sensitive detection
        config.setIgnoreLiterals(true);     // Ignore string/number differences
        config.setIgnoreIdentifiers(true);  // Ignore variable name differences
        config.setIgnoreAnnotations(true);  // Ignore annotation differences

        // File matching configuration
        config.setMatchOnlyFilesWithSameName(false);  // Allow cross-file comparison

        // Input configuration
        config.addInputPath(Paths.get("src/main/java"));
        config.addInputPath(Paths.get("src/test/java"));
        config.setLanguage(LanguageRegistry.CPD.getLanguageById("java"));

        // Exclude certain patterns
        // NOTE(review): these are glob patterns wrapped in Path objects. Paths.get
        // rejects '*' on Windows file systems (InvalidPathException), and whether
        // setExcludes interprets a Path as a glob is not shown on this page - confirm
        // the intended way to pass exclude patterns.
        config.setExcludes(Arrays.asList(
            Paths.get("**/generated/**"),
            Paths.get("**/target/**")
        ));

        // Configure encoding
        config.setSourceEncoding(StandardCharsets.UTF_8);

        System.out.println("Advanced CPD configuration created");
    }

    /** One configuration per language, each with its own tile size and input path. */
    public void configureCPDForDifferentLanguages() {
        // Java configuration
        CPDConfiguration javaConfig = new CPDConfiguration();
        javaConfig.setLanguage(LanguageRegistry.CPD.getLanguageById("java"));
        javaConfig.setMinimumTileSize(50);
        javaConfig.addInputPath(Paths.get("src/main/java"));

        // JavaScript configuration
        CPDConfiguration jsConfig = new CPDConfiguration();
        jsConfig.setLanguage(LanguageRegistry.CPD.getLanguageById("javascript"));
        jsConfig.setMinimumTileSize(30);  // Smaller threshold for JS
        jsConfig.setIgnoreLiterals(true);
        jsConfig.addInputPath(Paths.get("src/main/webapp/js"));

        // Python configuration
        CPDConfiguration pythonConfig = new CPDConfiguration();
        pythonConfig.setLanguage(LanguageRegistry.CPD.getLanguageById("python"));
        pythonConfig.setMinimumTileSize(40);
        pythonConfig.addInputPath(Paths.get("src/main/python"));

        System.out.println("Language-specific configurations created");
    }

    /** Configuration that ignores literals, identifiers and annotations to find structural clones. */
    public void configureIgnoreOptions() {
        CPDConfiguration config = new CPDConfiguration();

        // Configure what to ignore for more flexible matching
        config.setIgnoreLiterals(true);     // "hello" matches "world"
        config.setIgnoreIdentifiers(true);  // variable names don't matter
        config.setIgnoreAnnotations(true);  // @Override vs no annotation

        // This configuration will find structural duplicates even when:
        // - String literals are different
        // - Variable names are different
        // - Method names are different
        // - Annotations are present/absent

        config.setMinimumTileSize(30);  // Lower threshold since we're ignoring more
        // NOTE(review): no language is set here, unlike the examples above - confirm
        // whether CPD needs one before the configuration is used for a run.
        config.addInputPath(Paths.get("src"));

        System.out.println("Flexible matching configuration created");
    }

    /**
     * Runs CPD twice over {@code src}: once strictly (exact matches, large tile size)
     * and once flexibly (literals and identifiers ignored), printing the number of
     * duplicates each run reports.
     */
    public void runMultipleAnalyses() {
        // Run strict analysis (exact matches)
        // NOTE(review): neither configuration below sets a language - confirm whether
        // CPD requires one before go() is called.
        CPDConfiguration strictConfig = new CPDConfiguration();
        strictConfig.setMinimumTileSize(100);
        strictConfig.setIgnoreLiterals(false);
        strictConfig.setIgnoreIdentifiers(false);
        strictConfig.addInputPath(Paths.get("src"));

        CPD strictCpd = new CPD(strictConfig);
        strictCpd.go();
        System.out.printf("Strict analysis found %d exact duplicates%n",
            countMatches(strictCpd.getMatches()));

        // Run flexible analysis (structural matches)
        CPDConfiguration flexibleConfig = new CPDConfiguration();
        flexibleConfig.setMinimumTileSize(50);
        flexibleConfig.setIgnoreLiterals(true);
        flexibleConfig.setIgnoreIdentifiers(true);
        flexibleConfig.addInputPath(Paths.get("src"));

        CPD flexibleCpd = new CPD(flexibleConfig);
        flexibleCpd.go();
        System.out.printf("Flexible analysis found %d structural duplicates%n",
            countMatches(flexibleCpd.getMatches()));
    }

    /**
     * Counts the remaining elements of the iterator; the iterator is exhausted afterwards.
     *
     * @param matches iterator to drain
     * @return number of matches that were left in the iterator
     */
    private int countMatches(Iterator<Match> matches) {
        int count = 0;
        while (matches.hasNext()) {
            matches.next();
            count++;
        }
        return count;
    }
}

Types

/**
 * Mark representing a specific location of duplicated code.
 * A {@code Match} exposes one Mark per place the duplicated code occurs
 * (see {@code Match.getMarkSet()}).
 *
 * <p>NOTE(review): this listing declares {@code compareTo(Mark)} without an
 * {@code implements Comparable<Mark>} clause - confirm against the library Javadoc
 * whether marks have a natural ordering usable by {@code Collections.sort}.
 */
final class Mark {
    
    /**
     * Get filename containing the duplicate
     * @return File path where duplicate code appears
     */
    String getFilename();
    
    /**
     * Get starting line number of duplicate
     * @return One-based line number where duplicate begins
     */
    int getBeginLine();
    
    /**
     * Get ending line number of duplicate
     * NOTE(review): whether the end line is inclusive is not stated here; the usage
     * examples print it as the upper bound of a "begin-end" range.
     * @return One-based line number where duplicate ends
     */
    int getEndLine();
    
    /**
     * Get starting column number of duplicate
     * @return One-based column number where duplicate begins
     */
    int getBeginColumn();
    
    /**
     * Get ending column number of duplicate
     * @return One-based column number where duplicate ends
     */
    int getEndColumn();
    
    /**
     * Get token count for this mark
     * @return Number of tokens in the duplicate
     */
    int getTokenCount();
    
    /**
     * Compare marks for sorting
     * NOTE(review): the sort key (file name, position, ...) is not specified in this listing.
     * @param other Mark to compare against
     * @return Comparison result for ordering
     */
    int compareTo(Mark other);
}

/**
 * Renderer interface for CPD output formatting.
 *
 * <p>Presumably driven as {@code start()}, then {@code renderDuplication(match)} once
 * per match, then {@code end()} - the call order is implied by the method descriptions
 * but not stated explicitly; confirm against the library Javadoc.
 */
interface CPDRenderer {
    
    /**
     * Start rendering CPD results
     */
    void start();
    
    /**
     * Render a single duplication match
     * @param match Match to render
     */
    void renderDuplication(Match match);
    
    /**
     * Finish rendering and cleanup
     */
    void end();
    
    /**
     * Set output writer for rendering
     * NOTE(review): presumably a java.io.Writer that has to be set before
     * {@code start()} is called; who closes the writer is not specified here.
     * @param writer Writer for output
     */
    void setWriter(Writer writer);
}

/**
 * Built-in CPD renderers for different output formats.
 * Static factory methods, one per format; each returns a {@code CPDRenderer}.
 * NOTE(review): whether each call creates a new renderer or returns a shared
 * instance is not specified in this listing.
 */
class CPDRenderers {
    /** Renderer for the format named by the method: plain text. */
    static CPDRenderer text();
    /** Renderer for the format named by the method: XML. */
    static CPDRenderer xml();
    /** Renderer for the format named by the method: CSV. */
    static CPDRenderer csv();
    /** Renderer for the format named by the method: JSON. */
    static CPDRenderer json();
}

/**
 * Token for CPD analysis representing atomic code elements.
 * Carries the token text, a numeric kind and its begin/end position;
 * all line and column numbers are documented as one-based.
 */
interface Token {
    
    /**
     * Get token image (text representation)
     * @return String representation of token
     */
    String getImage();
    
    /**
     * Get token type identifier
     * NOTE(review): the meaning of the individual integer values is not defined in
     * this listing; presumably it is specific to each language's tokenizer.
     * @return Integer representing token type
     */
    int getKind();
    
    /**
     * Get line number where token appears
     * @return One-based line number
     */
    int getBeginLine();
    
    /**
     * Get column number where token appears
     * @return One-based column number
     */
    int getBeginColumn();
    
    /**
     * Get ending line number of token
     * @return One-based ending line number
     */
    int getEndLine();
    
    /**
     * Get ending column number of token
     * NOTE(review): whether the end column is inclusive or exclusive is not stated here.
     * @return One-based ending column number
     */
    int getEndColumn();
}

/**
 * CPD visitor for language-specific tokenization.
 * {@code visitFile} receives the source text of one file; {@code add} records a
 * single token with its position.
 */
interface CpdVisitor {
    
    /**
     * Visit source file and generate tokens
     * @param sourceCode Source code to tokenize
     * @param filename File name for context
     */
    void visitFile(String sourceCode, String filename);
    
    /**
     * Add token to CPD analysis.
     * Mind the parameter order: both line numbers come first, then both column
     * numbers (begin line, end line, begin column, end column) - not begin
     * line/column followed by end line/column.
     * @param image Token text
     * @param beginLine Starting line
     * @param endLine Ending line
     * @param beginColumn Starting column
     * @param endColumn Ending column
     */
    void add(String image, int beginLine, int endLine, int beginColumn, int endColumn);
}

/**
 * Exception thrown during CPD processing.
 * Extends {@link Exception} directly, i.e. it is a checked exception.
 * NOTE(review): none of the method signatures listed on this page declares
 * {@code throws CPDException}; which operations raise it is not shown here.
 */
class CPDException extends Exception {
    /** Creates the exception with a description of the failure. */
    CPDException(String message);
    /** Creates the exception with a description and the underlying cause. */
    CPDException(String message, Throwable cause);
}

/**
 * CPD report statistics: aggregate counters describing one analysis.
 * NOTE(review): how an instance is obtained is not shown on this page.
 */
interface CPDReportStats {
    
    /**
     * Get total number of duplicate blocks found
     * @return Count of duplicate code blocks
     */
    int getNumberOfDuplicates();
    
    /**
     * Get total number of duplicate tokens
     * NOTE(review): whether every occurrence of a duplicate is counted, or only the
     * redundant copies, is not specified here.
     * @return Sum of all duplicate token counts
     */
    int getTotalDuplicateTokens();
    
    /**
     * Get total number of duplicate lines
     * @return Sum of all duplicate line counts
     */
    int getTotalDuplicateLines();
    
    /**
     * Get files analyzed count
     * @return Number of source files processed
     */
    int getFilesAnalyzed();
    
    /**
     * Get duplication percentage
     * NOTE(review): the scale (0-100 versus 0.0-1.0) and the basis (tokens or lines)
     * are not specified in this listing - confirm before displaying the value.
     * @return Percentage of code that is duplicated
     */
    double getDuplicationPercentage();
}

Install with Tessl CLI

npx tessl i tessl/maven-net-sourceforge-pmd--pmd-core

docs

ast-processing.md

copy-paste-detection.md

core-analysis.md

index.md

language-framework.md

tessl/maven-net-sourceforge-pmd--pmd-core