Tessl Tile for maven/net.sourceforge.pmd/pmd-core@7.13.0

or run

npx @tessl/cli init

Version

Tile

Overview

Evals

Files

docs

ast-processing.md copy-paste-detection.md core-analysis.md index.md language-framework.md properties-system.md rendering-system.md reporting-system.md rule-system.md utilities.md

docs/copy-paste-detection.md

0
# Copy-Paste Detection
1

2
The Copy-Paste Detection (CPD) module provides specialized capabilities for identifying code duplications across files using token-based analysis. It includes duplicate detection algorithms, match representation, configuration options, and reporting for code clone analysis.
3

4
## Capabilities
5

6
### CPD Analysis Engine
7

8
Main class for executing copy-paste detection analysis with configurable parameters and comprehensive duplicate identification.
9

10
```java { .api }
11
/**
12
 * Main class for Copy-Paste Detection functionality.
13
 * Analyzes source files to identify duplicate code segments using token-based comparison.
14
 */
15
public class CPD {
16
    
17
    /**
18
     * Constructor with CPD configuration
19
     * @param configuration CPDConfiguration with analysis settings
20
     */
21
    CPD(CPDConfiguration configuration);
22
    
23
    /**
24
     * Execute CPD analysis on configured source files
25
     * Processes all files and identifies duplicate code segments
26
     */
27
    void go();
28
    
29
    /**
30
     * Get detected code duplication matches
31
     * @return Iterator over Match instances representing duplicate code
32
     */
33
    Iterator<Match> getMatches();
34
    
35
    /**
36
     * Get token count for specific file
37
     * @param file File path to query
38
     * @return Number of tokens found in the file
39
     */
40
    int getNumberOfTokens(String file);
41
    
42
    /**
43
     * Get token counts for all analyzed files
44
     * @return Map of file paths to token counts
45
     */
46
    Map<String, Integer> getTokenCounts();
47
}
48
```
49

50
**Usage Examples:**
51

52
```java
53
import net.sourceforge.pmd.cpd.*;
54
import java.nio.file.Paths;
55
import java.util.Iterator;
56

57
// Basic CPD analysis
58
public class CPDAnalysisExample {
59
    
60
    public void runCPDAnalysis() {
61
        // Create CPD configuration
62
        CPDConfiguration config = new CPDConfiguration();
63
        config.setMinimumTileSize(50);  // Minimum tokens for duplication
64
        config.addInputPath(Paths.get("src/main/java"));
65
        config.setLanguage(LanguageRegistry.CPD.getLanguageByFullName("Java"));
66
        
67
        // Create and execute CPD
68
        CPD cpd = new CPD(config);
69
        cpd.go();  // Execute analysis
70
        
71
        // Process results
72
        Iterator<Match> matches = cpd.getMatches();
73
        int duplicateCount = 0;
74
        
75
        while (matches.hasNext()) {
76
            Match match = matches.next();
77
            duplicateCount++;
78
            
79
            System.out.printf("Duplicate #%d:%n", duplicateCount);
80
            System.out.printf("  Tokens: %d%n", match.getTokenCount());
81
            System.out.printf("  Lines: %d%n", match.getLineCount());
82
            System.out.printf("  Locations: %d%n", match.getMarkSet().size());
83
            
84
            // Show all locations of this duplicate
85
            for (Mark mark : match.getMarkSet()) {
86
                System.out.printf("    %s:%d-%d%n", 
87
                    mark.getFilename(), 
88
                    mark.getBeginLine(), 
89
                    mark.getEndLine());
90
            }
91
            
92
            // Show the duplicated code
93
            System.out.println("  Code:");
94
            String[] lines = match.getSourceCodeSlice().split("\\n");
95
            for (int i = 0; i < Math.min(lines.length, 5); i++) {
96
                System.out.printf("    %s%n", lines[i]);
97
            }
98
            if (lines.length > 5) {
99
                System.out.println("    ...");
100
            }
101
            System.out.println();
102
        }
103
        
104
        System.out.printf("Found %d code duplications%n", duplicateCount);
105
    }
106
    
107
    public void analyzeTokenCounts(CPD cpd) {
108
        // Get token statistics
109
        Map<String, Integer> tokenCounts = cpd.getTokenCounts();
110
        
111
        System.out.println("Token counts by file:");
112
        tokenCounts.entrySet().stream()
113
            .sorted(Map.Entry.<String, Integer>comparingByValue().reversed())
114
            .forEach(entry -> 
115
                System.out.printf("  %s: %d tokens%n", 
116
                    entry.getKey(), entry.getValue()));
117
        
118
        // Calculate total tokens
119
        int totalTokens = tokenCounts.values().stream()
120
            .mapToInt(Integer::intValue)
121
            .sum();
122
        
123
        System.out.printf("Total tokens analyzed: %d%n", totalTokens);
124
        
125
        // Find largest files
126
        String largestFile = tokenCounts.entrySet().stream()
127
            .max(Map.Entry.comparingByValue())
128
            .map(Map.Entry::getKey)
129
            .orElse("none");
130
        
131
        System.out.printf("Largest file: %s (%d tokens)%n", 
132
            largestFile, 
133
            tokenCounts.getOrDefault(largestFile, 0));
134
    }
135
}
136
```
137

138
### Match Representation
139

140
Representation of detected code duplication matches with location tracking and source code access.
141

142
```java { .api }
143
/**
144
 * Represents a detected code duplication match.
145
 * Contains information about duplicate locations and the duplicated source code.
146
 */
147
public final class Match {
148
    
149
    /**
150
     * Get number of duplicate tokens
151
     * @return Token count for the duplicated code segment
152
     */
153
    int getTokenCount();
154
    
155
    /**
156
     * Get number of duplicate lines
157
     * @return Line count for the duplicated code segment
158
     */
159
    int getLineCount();
160
    
161
    /**
162
     * Get all locations where this duplication appears
163
     * @return List of Mark instances representing duplicate locations
164
     */
165
    List<Mark> getMarkSet();
166
    
167
    /**
168
     * Get duplicated source code content
169
     * @return Source code text that is duplicated across locations
170
     */
171
    String getSourceCodeSlice();
172
    
173
    /**
174
     * Compare matches for sorting (by token count, then line count)
175
     * @param other Match to compare against
176
     * @return Comparison result for ordering matches
177
     */
178
    int compareTo(Match other);
179
}
180
```
181

182
**Usage Examples:**
183

184
```java
185
import net.sourceforge.pmd.cpd.*;
186
import java.util.List;
187
import java.util.ArrayList;
188
import java.util.Collections;
189

190
// Processing duplicate matches
191
public class MatchProcessingExample {
192
    
193
    public void processMatches(Iterator<Match> matches) {
194
        List<Match> matchList = new ArrayList<>();
195
        matches.forEachRemaining(matchList::add);
196
        
197
        // Sort matches by significance (token count descending)
198
        Collections.sort(matchList, Collections.reverseOrder());
199
        
200
        System.out.printf("Found %d duplicate code blocks:%n%n", matchList.size());
201
        
202
        for (int i = 0; i < matchList.size(); i++) {
203
            Match match = matchList.get(i);
204
            processSingleMatch(match, i + 1);
205
        }
206
    }
207
    
208
    public void processSingleMatch(Match match, int index) {
209
        System.out.printf("=== Duplicate #%d ===%n", index);
210
        System.out.printf("Size: %d tokens (%d lines)%n", 
211
            match.getTokenCount(), 
212
            match.getLineCount());
213
        
214
        List<Mark> locations = match.getMarkSet();
215
        System.out.printf("Appears in %d locations:%n", locations.size());
216
        
217
        // Show all locations
218
        for (int i = 0; i < locations.size(); i++) {
219
            Mark mark = locations.get(i);
220
            System.out.printf("  %d. %s (lines %d-%d)%n",
221
                i + 1,
222
                mark.getFilename(),
223
                mark.getBeginLine(),
224
                mark.getEndLine());
225
        }
226
        
227
        // Show the duplicated code
228
        String sourceCode = match.getSourceCodeSlice();
229
        System.out.println("Duplicated code:");
230
        String[] lines = sourceCode.split("\\r?\\n");
231
        
232
        for (int i = 0; i < Math.min(lines.length, 10); i++) {
233
            System.out.printf("  %2d: %s%n", i + 1, lines[i]);
234
        }
235
        
236
        if (lines.length > 10) {
237
            System.out.printf("  ... (%d more lines)%n", lines.length - 10);
238
        }
239
        
240
        System.out.println();
241
    }
242
    
243
    public void generateDuplicationReport(List<Match> matches) {
244
        // Calculate duplication statistics
245
        int totalDuplicateTokens = matches.stream()
246
            .mapToInt(match -> match.getTokenCount() * (match.getMarkSet().size() - 1))
247
            .sum();
248
        
249
        int totalDuplicateLines = matches.stream()
250
            .mapToInt(match -> match.getLineCount() * (match.getMarkSet().size() - 1))
251
            .sum();
252
        
253
        // Find files with most duplications
254
        Map<String, Integer> fileOccurrences = new HashMap<>();
255
        matches.forEach(match -> 
256
            match.getMarkSet().forEach(mark -> 
257
                fileOccurrences.merge(mark.getFilename(), 1, Integer::sum)));
258
        
259
        System.out.println("=== Duplication Summary ===");
260
        System.out.printf("Total duplicate blocks: %d%n", matches.size());
261
        System.out.printf("Total duplicate tokens: %d%n", totalDuplicateTokens);
262
        System.out.printf("Total duplicate lines: %d%n", totalDuplicateLines);
263
        
264
        System.out.println("%nFiles with most duplications:");
265
        fileOccurrences.entrySet().stream()
266
            .sorted(Map.Entry.<String, Integer>comparingByValue().reversed())
267
            .limit(10)
268
            .forEach(entry -> 
269
                System.out.printf("  %s: %d occurrences%n", 
270
                    entry.getKey(), entry.getValue()));
271
        
272
        // Analyze duplication sizes
273
        System.out.println("%nDuplication size distribution:");
274
        Map<String, Long> sizeDistribution = matches.stream()
275
            .collect(Collectors.groupingBy(
276
                match -> {
277
                    int tokens = match.getTokenCount();
278
                    if (tokens < 100) return "Small (< 100 tokens)";
279
                    else if (tokens < 500) return "Medium (100-500 tokens)";
280
                    else return "Large (500+ tokens)";
281
                },
282
                Collectors.counting()));
283
        
284
        sizeDistribution.forEach((size, count) -> 
285
            System.out.printf("  %s: %d duplicates%n", size, count));
286
    }
287
}
288
```
289

290
### CPD Configuration
291

292
Configuration class for customizing copy-paste detection analysis parameters and behavior.
293

294
```java { .api }
295
/**
296
 * Configuration for Copy-Paste Detection analysis.
297
 * Extends AbstractConfiguration with CPD-specific settings.
298
 */
299
public class CPDConfiguration extends AbstractConfiguration {
300
    
301
    /**
302
     * Default constructor with CPD language registry
303
     */
304
    CPDConfiguration();
305
    
306
    /**
307
     * Constructor with custom language registry
308
     * @param languageRegistry Registry of CPD-capable languages
309
     */
310
    CPDConfiguration(LanguageRegistry languageRegistry);
311
    
312
    /**
313
     * Get minimum tile size (minimum tokens for duplication)
314
     * @return Minimum number of tokens required for duplicate detection
315
     */
316
    int getMinimumTileSize();
317
    
318
    /**
319
     * Set minimum tile size for duplication detection
320
     * @param minimumTileSize Minimum tokens (must be positive)
321
     */
322
    void setMinimumTileSize(int minimumTileSize);
323
    
324
    /**
325
     * Check if differences in literals are ignored
326
     * @return true if literal values are ignored during comparison
327
     */
328
    boolean isIgnoreLiterals();
329
    
330
    /**
331
     * Set whether to ignore literal differences
332
     * @param ignoreLiterals true to ignore string/numeric literal values
333
     */
334
    void setIgnoreLiterals(boolean ignoreLiterals);
335
    
336
    /**
337
     * Check if differences in identifiers are ignored
338
     * @return true if identifier names are ignored during comparison
339
     */
340
    boolean isIgnoreIdentifiers();
341
    
342
    /**
343
     * Set whether to ignore identifier differences
344
     * @param ignoreIdentifiers true to ignore variable/method names
345
     */
346
    void setIgnoreIdentifiers(boolean ignoreIdentifiers);
347
    
348
    /**
349
     * Check if annotation differences are ignored
350
     * @return true if annotations are ignored during comparison
351
     */
352
    boolean isIgnoreAnnotations();
353
    
354
    /**
355
     * Set whether to ignore annotation differences
356
     * @param ignoreAnnotations true to ignore annotation presence/content
357
     */
358
    void setIgnoreAnnotations(boolean ignoreAnnotations);
359
    
360
    /**
361
     * Check if only files with same name are compared
362
     * @return true if cross-file comparison is limited to same filenames
363
     */
364
    boolean isMatchOnlyFilesWithSameName();
365
    
366
    /**
367
     * Set whether to compare only files with same name
368
     * @param matchOnlyFilesWithSameName true to limit to same-name files
369
     */
370
    void setMatchOnlyFilesWithSameName(boolean matchOnlyFilesWithSameName);
371
    
372
    /**
373
     * Get renderer for CPD output formatting
374
     * @return CPDRenderer for generating reports
375
     */
376
    CPDRenderer getRenderer();
377
    
378
    /**
379
     * Set renderer for CPD output
380
     * @param renderer CPDRenderer for formatting results
381
     */
382
    void setRenderer(CPDRenderer renderer);
383
}
384
```
385

386
**Usage Examples:**
387

388
```java
389
import net.sourceforge.pmd.cpd.*;
390
import java.nio.file.Paths;
391

392
// Configuring CPD analysis
393
public class CPDConfigurationExample {
394
    
395
    public void createBasicConfiguration() {
396
        CPDConfiguration config = new CPDConfiguration();
397
        
398
        // Set basic parameters
399
        config.setMinimumTileSize(50);  // Minimum 50 tokens for duplication
400
        config.addInputPath(Paths.get("src/main/java"));
401
        config.setLanguage(LanguageRegistry.CPD.getLanguageById("java"));
402
        
403
        // Configure output
404
        config.setReportFormat("text");
405
        config.setReportFile(Paths.get("cpd-report.txt"));
406
        
407
        System.out.println("Basic CPD configuration created");
408
    }
409
    
410
    public void createAdvancedConfiguration() {
411
        CPDConfiguration config = new CPDConfiguration();
412
        
413
        // Advanced duplication detection settings
414
        config.setMinimumTileSize(25);      // Lower threshold for more sensitive detection
415
        config.setIgnoreLiterals(true);     // Ignore string/number differences
416
        config.setIgnoreIdentifiers(true);  // Ignore variable name differences
417
        config.setIgnoreAnnotations(true);  // Ignore annotation differences
418
        
419
        // File matching configuration
420
        config.setMatchOnlyFilesWithSameName(false);  // Allow cross-file comparison
421
        
422
        // Input configuration
423
        config.addInputPath(Paths.get("src/main/java"));
424
        config.addInputPath(Paths.get("src/test/java"));
425
        config.setLanguage(LanguageRegistry.CPD.getLanguageById("java"));
426
        
427
        // Exclude certain patterns
428
        config.setExcludes(Arrays.asList(
429
            Paths.get("**/generated/**"),
430
            Paths.get("**/target/**")
431
        ));
432
        
433
        // Configure encoding
434
        config.setSourceEncoding(StandardCharsets.UTF_8);
435
        
436
        System.out.println("Advanced CPD configuration created");
437
    }
438
    
439
    public void configureCPDForDifferentLanguages() {
440
        // Java configuration
441
        CPDConfiguration javaConfig = new CPDConfiguration();
442
        javaConfig.setLanguage(LanguageRegistry.CPD.getLanguageById("java"));
443
        javaConfig.setMinimumTileSize(50);
444
        javaConfig.addInputPath(Paths.get("src/main/java"));
445
        
446
        // JavaScript configuration  
447
        CPDConfiguration jsConfig = new CPDConfiguration();
448
        jsConfig.setLanguage(LanguageRegistry.CPD.getLanguageById("javascript"));
449
        jsConfig.setMinimumTileSize(30);  // Smaller threshold for JS
450
        jsConfig.setIgnoreLiterals(true);
451
        jsConfig.addInputPath(Paths.get("src/main/webapp/js"));
452
        
453
        // Python configuration
454
        CPDConfiguration pythonConfig = new CPDConfiguration();
455
        pythonConfig.setLanguage(LanguageRegistry.CPD.getLanguageById("python"));
456
        pythonConfig.setMinimumTileSize(40);
457
        pythonConfig.addInputPath(Paths.get("src/main/python"));
458
        
459
        System.out.println("Language-specific configurations created");
460
    }
461
    
462
    public void configureIgnoreOptions() {
463
        CPDConfiguration config = new CPDConfiguration();
464
        
465
        // Configure what to ignore for more flexible matching
466
        config.setIgnoreLiterals(true);     // "hello" matches "world"
467
        config.setIgnoreIdentifiers(true);  // variable names don't matter
468
        config.setIgnoreAnnotations(true);  // @Override vs no annotation
469
        
470
        // This configuration will find structural duplicates even when:
471
        // - String literals are different
472
        // - Variable names are different  
473
        // - Method names are different
474
        // - Annotations are present/absent
475
        
476
        config.setMinimumTileSize(30);  // Lower threshold since we're ignoring more
477
        config.addInputPath(Paths.get("src"));
478
        
479
        System.out.println("Flexible matching configuration created");
480
    }
481
    
482
    public void runMultipleAnalyses() {
483
        // Run strict analysis (exact matches)
484
        CPDConfiguration strictConfig = new CPDConfiguration();
485
        strictConfig.setMinimumTileSize(100);
486
        strictConfig.setIgnoreLiterals(false);
487
        strictConfig.setIgnoreIdentifiers(false);
488
        strictConfig.addInputPath(Paths.get("src"));
489
        
490
        CPD strictCpd = new CPD(strictConfig);
491
        strictCpd.go();
492
        System.out.printf("Strict analysis found %d exact duplicates%n",
493
            countMatches(strictCpd.getMatches()));
494
        
495
        // Run flexible analysis (structural matches)
496
        CPDConfiguration flexibleConfig = new CPDConfiguration();
497
        flexibleConfig.setMinimumTileSize(50);
498
        flexibleConfig.setIgnoreLiterals(true);
499
        flexibleConfig.setIgnoreIdentifiers(true);
500
        flexibleConfig.addInputPath(Paths.get("src"));
501
        
502
        CPD flexibleCpd = new CPD(flexibleConfig);
503
        flexibleCpd.go();
504
        System.out.printf("Flexible analysis found %d structural duplicates%n",
505
            countMatches(flexibleCpd.getMatches()));
506
    }
507
    
508
    private int countMatches(Iterator<Match> matches) {
509
        int count = 0;
510
        while (matches.hasNext()) {
511
            matches.next();
512
            count++;
513
        }
514
        return count;
515
    }
516
}
517
```
518

519
## Types
520

521
```java { .api }
522
/**
523
 * Mark representing a specific location of duplicated code
524
 */
525
final class Mark {
526
    
527
    /**
528
     * Get filename containing the duplicate
529
     * @return File path where duplicate code appears
530
     */
531
    String getFilename();
532
    
533
    /**
534
     * Get starting line number of duplicate
535
     * @return One-based line number where duplicate begins
536
     */
537
    int getBeginLine();
538
    
539
    /**
540
     * Get ending line number of duplicate  
541
     * @return One-based line number where duplicate ends
542
     */
543
    int getEndLine();
544
    
545
    /**
546
     * Get starting column number of duplicate
547
     * @return One-based column number where duplicate begins  
548
     */
549
    int getBeginColumn();
550
    
551
    /**
552
     * Get ending column number of duplicate
553
     * @return One-based column number where duplicate ends
554
     */
555
    int getEndColumn();
556
    
557
    /**
558
     * Get token count for this mark
559
     * @return Number of tokens in the duplicate
560
     */
561
    int getTokenCount();
562
    
563
    /**
564
     * Compare marks for sorting
565
     * @param other Mark to compare against
566
     * @return Comparison result for ordering
567
     */
568
    int compareTo(Mark other);
569
}
570

571
/**
572
 * Renderer interface for CPD output formatting
573
 */
574
interface CPDRenderer {
575
    
576
    /**
577
     * Start rendering CPD results
578
     */
579
    void start();
580
    
581
    /**
582
     * Render a single duplication match
583
     * @param match Match to render
584
     */
585
    void renderDuplication(Match match);
586
    
587
    /**
588
     * Finish rendering and cleanup
589
     */
590
    void end();
591
    
592
    /**
593
     * Set output writer for rendering
594
     * @param writer Writer for output
595
     */
596
    void setWriter(Writer writer);
597
}
598

599
/**
600
 * Built-in CPD renderers for different output formats
601
 */
602
class CPDRenderers {
603
    static CPDRenderer text();
604
    static CPDRenderer xml();
605
    static CPDRenderer csv();
606
    static CPDRenderer json();
607
}
608

609
/**
610
 * Token for CPD analysis representing atomic code elements
611
 */
612
interface Token {
613
    
614
    /**
615
     * Get token image (text representation)
616
     * @return String representation of token
617
     */
618
    String getImage();
619
    
620
    /**
621
     * Get token type identifier
622
     * @return Integer representing token type
623
     */
624
    int getKind();
625
    
626
    /**
627
     * Get line number where token appears
628
     * @return One-based line number
629
     */
630
    int getBeginLine();
631
    
632
    /**
633
     * Get column number where token appears
634
     * @return One-based column number
635
     */
636
    int getBeginColumn();
637
    
638
    /**
639
     * Get ending line number of token
640
     * @return One-based ending line number
641
     */
642
    int getEndLine();
643
    
644
    /**
645
     * Get ending column number of token
646
     * @return One-based ending column number
647
     */
648
    int getEndColumn();
649
}
650

651
/**
652
 * CPD visitor for language-specific tokenization
653
 */
654
interface CpdVisitor {
655
    
656
    /**
657
     * Visit source file and generate tokens
658
     * @param sourceCode Source code to tokenize
659
     * @param filename File name for context
660
     */
661
    void visitFile(String sourceCode, String filename);
662
    
663
    /**
664
     * Add token to CPD analysis
665
     * @param image Token text
666
     * @param beginLine Starting line
667
     * @param endLine Ending line  
668
     * @param beginColumn Starting column
669
     * @param endColumn Ending column
670
     */
671
    void add(String image, int beginLine, int endLine, int beginColumn, int endColumn);
672
}
673

674
/**
675
 * Exception thrown during CPD processing
676
 */
677
class CPDException extends Exception {
678
    CPDException(String message);
679
    CPDException(String message, Throwable cause);
680
}
681

682
/**
683
 * CPD report statistics
684
 */
685
interface CPDReportStats {
686
    
687
    /**
688
     * Get total number of duplicate blocks found
689
     * @return Count of duplicate code blocks
690
     */
691
    int getNumberOfDuplicates();
692
    
693
    /**
694
     * Get total number of duplicate tokens
695
     * @return Sum of all duplicate token counts
696
     */
697
    int getTotalDuplicateTokens();
698
    
699
    /**
700
     * Get total number of duplicate lines
701
     * @return Sum of all duplicate line counts
702
     */
703
    int getTotalDuplicateLines();
704
    
705
    /**
706
     * Get files analyzed count
707
     * @return Number of source files processed
708
     */
709
    int getFilesAnalyzed();
710
    
711
    /**
712
     * Get duplication percentage
713
     * @return Percentage of code that is duplicated
714
     */
715
    double getDuplicationPercentage();
716
}
717
```

Version

Tile

Files

docs/copy-paste-detection.md

docs/copy-paste-detection.md