0
# Copy-Paste Detection
1
2
The Copy-Paste Detection (CPD) module identifies duplicated code across files using token-based analysis. It covers the duplicate-detection engine, match representation, configuration options, and reporting for code-clone analysis.
3
4
## Capabilities
5
6
### CPD Analysis Engine
7
8
Main class for executing copy-paste detection analysis with configurable parameters and comprehensive duplicate identification.
9
10
```java { .api }
11
/**
12
* Main class for Copy-Paste Detection functionality.
13
* Analyzes source files to identify duplicate code segments using token-based comparison.
14
*/
15
public class CPD {
16
17
/**
18
* Constructor with CPD configuration
19
* @param configuration CPDConfiguration with analysis settings
20
*/
21
CPD(CPDConfiguration configuration);
22
23
/**
24
* Execute CPD analysis on configured source files
25
* Processes all files and identifies duplicate code segments
26
*/
27
void go();
28
29
/**
30
* Get detected code duplication matches
31
* @return Iterator over Match instances representing duplicate code
32
*/
33
Iterator<Match> getMatches();
34
35
/**
36
* Get token count for specific file
37
* @param file File path to query
38
* @return Number of tokens found in the file
39
*/
40
int getNumberOfTokens(String file);
41
42
/**
43
* Get token counts for all analyzed files
44
* @return Map of file paths to token counts
45
*/
46
Map<String, Integer> getTokenCounts();
47
}
48
```
49
50
**Usage Examples:**
51
52
```java
53
import net.sourceforge.pmd.cpd.*;
54
import java.nio.file.Paths;
55
import java.util.Iterator;
56
57
// Basic CPD analysis
58
public class CPDAnalysisExample {
59
60
public void runCPDAnalysis() {
61
// Create CPD configuration
62
CPDConfiguration config = new CPDConfiguration();
63
config.setMinimumTileSize(50); // Minimum tokens for duplication
64
config.addInputPath(Paths.get("src/main/java"));
65
config.setLanguage(LanguageRegistry.CPD.getLanguageByFullName("Java"));
66
67
// Create and execute CPD
68
CPD cpd = new CPD(config);
69
cpd.go(); // Execute analysis
70
71
// Process results
72
Iterator<Match> matches = cpd.getMatches();
73
int duplicateCount = 0;
74
75
while (matches.hasNext()) {
76
Match match = matches.next();
77
duplicateCount++;
78
79
System.out.printf("Duplicate #%d:%n", duplicateCount);
80
System.out.printf(" Tokens: %d%n", match.getTokenCount());
81
System.out.printf(" Lines: %d%n", match.getLineCount());
82
System.out.printf(" Locations: %d%n", match.getMarkSet().size());
83
84
// Show all locations of this duplicate
85
for (Mark mark : match.getMarkSet()) {
86
System.out.printf(" %s:%d-%d%n",
87
mark.getFilename(),
88
mark.getBeginLine(),
89
mark.getEndLine());
90
}
91
92
// Show the duplicated code
93
System.out.println(" Code:");
94
String[] lines = match.getSourceCodeSlice().split("\\n");
95
for (int i = 0; i < Math.min(lines.length, 5); i++) {
96
System.out.printf(" %s%n", lines[i]);
97
}
98
if (lines.length > 5) {
99
System.out.println(" ...");
100
}
101
System.out.println();
102
}
103
104
System.out.printf("Found %d code duplications%n", duplicateCount);
105
}
106
107
public void analyzeTokenCounts(CPD cpd) {
108
// Get token statistics
109
Map<String, Integer> tokenCounts = cpd.getTokenCounts();
110
111
System.out.println("Token counts by file:");
112
tokenCounts.entrySet().stream()
113
.sorted(Map.Entry.<String, Integer>comparingByValue().reversed())
114
.forEach(entry ->
115
System.out.printf(" %s: %d tokens%n",
116
entry.getKey(), entry.getValue()));
117
118
// Calculate total tokens
119
int totalTokens = tokenCounts.values().stream()
120
.mapToInt(Integer::intValue)
121
.sum();
122
123
System.out.printf("Total tokens analyzed: %d%n", totalTokens);
124
125
// Find largest files
126
String largestFile = tokenCounts.entrySet().stream()
127
.max(Map.Entry.comparingByValue())
128
.map(Map.Entry::getKey)
129
.orElse("none");
130
131
System.out.printf("Largest file: %s (%d tokens)%n",
132
largestFile,
133
tokenCounts.getOrDefault(largestFile, 0));
134
}
135
}
136
```
137
138
### Match Representation
139
140
Representation of detected code duplication matches with location tracking and source code access.
141
142
```java { .api }
143
/**
144
* Represents a detected code duplication match.
145
* Contains information about duplicate locations and the duplicated source code.
146
*/
147
public final class Match {
148
149
/**
150
* Get number of duplicate tokens
151
* @return Token count for the duplicated code segment
152
*/
153
int getTokenCount();
154
155
/**
156
* Get number of duplicate lines
157
* @return Line count for the duplicated code segment
158
*/
159
int getLineCount();
160
161
/**
162
* Get all locations where this duplication appears
163
* @return List of Mark instances representing duplicate locations
164
*/
165
List<Mark> getMarkSet();
166
167
/**
168
* Get duplicated source code content
169
* @return Source code text that is duplicated across locations
170
*/
171
String getSourceCodeSlice();
172
173
/**
174
* Compare matches for sorting (by token count, then line count)
175
* @param other Match to compare against
176
* @return Comparison result for ordering matches
177
*/
178
int compareTo(Match other);
179
}
180
```
181
182
**Usage Examples:**
183
184
```java
185
import net.sourceforge.pmd.cpd.*;
186
import java.util.List;
187
import java.util.ArrayList;
188
import java.util.Collections;
189
190
// Processing duplicate matches
191
public class MatchProcessingExample {
192
193
public void processMatches(Iterator<Match> matches) {
194
List<Match> matchList = new ArrayList<>();
195
matches.forEachRemaining(matchList::add);
196
197
// Sort matches by significance (token count descending)
198
Collections.sort(matchList, Collections.reverseOrder());
199
200
System.out.printf("Found %d duplicate code blocks:%n%n", matchList.size());
201
202
for (int i = 0; i < matchList.size(); i++) {
203
Match match = matchList.get(i);
204
processSingleMatch(match, i + 1);
205
}
206
}
207
208
public void processSingleMatch(Match match, int index) {
209
System.out.printf("=== Duplicate #%d ===%n", index);
210
System.out.printf("Size: %d tokens (%d lines)%n",
211
match.getTokenCount(),
212
match.getLineCount());
213
214
List<Mark> locations = match.getMarkSet();
215
System.out.printf("Appears in %d locations:%n", locations.size());
216
217
// Show all locations
218
for (int i = 0; i < locations.size(); i++) {
219
Mark mark = locations.get(i);
220
System.out.printf(" %d. %s (lines %d-%d)%n",
221
i + 1,
222
mark.getFilename(),
223
mark.getBeginLine(),
224
mark.getEndLine());
225
}
226
227
// Show the duplicated code
228
String sourceCode = match.getSourceCodeSlice();
229
System.out.println("Duplicated code:");
230
String[] lines = sourceCode.split("\\r?\\n");
231
232
for (int i = 0; i < Math.min(lines.length, 10); i++) {
233
System.out.printf(" %2d: %s%n", i + 1, lines[i]);
234
}
235
236
if (lines.length > 10) {
237
System.out.printf(" ... (%d more lines)%n", lines.length - 10);
238
}
239
240
System.out.println();
241
}
242
243
public void generateDuplicationReport(List<Match> matches) {
244
// Calculate duplication statistics
245
int totalDuplicateTokens = matches.stream()
246
.mapToInt(match -> match.getTokenCount() * (match.getMarkSet().size() - 1))
247
.sum();
248
249
int totalDuplicateLines = matches.stream()
250
.mapToInt(match -> match.getLineCount() * (match.getMarkSet().size() - 1))
251
.sum();
252
253
// Find files with most duplications
254
Map<String, Integer> fileOccurrences = new HashMap<>();
255
matches.forEach(match ->
256
match.getMarkSet().forEach(mark ->
257
fileOccurrences.merge(mark.getFilename(), 1, Integer::sum)));
258
259
System.out.println("=== Duplication Summary ===");
260
System.out.printf("Total duplicate blocks: %d%n", matches.size());
261
System.out.printf("Total duplicate tokens: %d%n", totalDuplicateTokens);
262
System.out.printf("Total duplicate lines: %d%n", totalDuplicateLines);
263
264
System.out.println("%nFiles with most duplications:");
265
fileOccurrences.entrySet().stream()
266
.sorted(Map.Entry.<String, Integer>comparingByValue().reversed())
267
.limit(10)
268
.forEach(entry ->
269
System.out.printf(" %s: %d occurrences%n",
270
entry.getKey(), entry.getValue()));
271
272
// Analyze duplication sizes
273
System.out.println("%nDuplication size distribution:");
274
Map<String, Long> sizeDistribution = matches.stream()
275
.collect(Collectors.groupingBy(
276
match -> {
277
int tokens = match.getTokenCount();
278
if (tokens < 100) return "Small (< 100 tokens)";
279
else if (tokens < 500) return "Medium (100-500 tokens)";
280
else return "Large (500+ tokens)";
281
},
282
Collectors.counting()));
283
284
sizeDistribution.forEach((size, count) ->
285
System.out.printf(" %s: %d duplicates%n", size, count));
286
}
287
}
288
```
289
290
### CPD Configuration
291
292
Configuration class for customizing copy-paste detection analysis parameters and behavior.
293
294
```java { .api }
295
/**
296
* Configuration for Copy-Paste Detection analysis.
297
* Extends AbstractConfiguration with CPD-specific settings.
298
*/
299
public class CPDConfiguration extends AbstractConfiguration {
300
301
/**
302
* Default constructor with CPD language registry
303
*/
304
CPDConfiguration();
305
306
/**
307
* Constructor with custom language registry
308
* @param languageRegistry Registry of CPD-capable languages
309
*/
310
CPDConfiguration(LanguageRegistry languageRegistry);
311
312
/**
313
* Get minimum tile size (minimum tokens for duplication)
314
* @return Minimum number of tokens required for duplicate detection
315
*/
316
int getMinimumTileSize();
317
318
/**
319
* Set minimum tile size for duplication detection
320
* @param minimumTileSize Minimum tokens (must be positive)
321
*/
322
void setMinimumTileSize(int minimumTileSize);
323
324
/**
325
* Check if differences in literals are ignored
326
* @return true if literal values are ignored during comparison
327
*/
328
boolean isIgnoreLiterals();
329
330
/**
331
* Set whether to ignore literal differences
332
* @param ignoreLiterals true to ignore string/numeric literal values
333
*/
334
void setIgnoreLiterals(boolean ignoreLiterals);
335
336
/**
337
* Check if differences in identifiers are ignored
338
* @return true if identifier names are ignored during comparison
339
*/
340
boolean isIgnoreIdentifiers();
341
342
/**
343
* Set whether to ignore identifier differences
344
* @param ignoreIdentifiers true to ignore variable/method names
345
*/
346
void setIgnoreIdentifiers(boolean ignoreIdentifiers);
347
348
/**
349
* Check if annotation differences are ignored
350
* @return true if annotations are ignored during comparison
351
*/
352
boolean isIgnoreAnnotations();
353
354
/**
355
* Set whether to ignore annotation differences
356
* @param ignoreAnnotations true to ignore annotation presence/content
357
*/
358
void setIgnoreAnnotations(boolean ignoreAnnotations);
359
360
/**
361
* Check if only files with same name are compared
362
* @return true if cross-file comparison is limited to same filenames
363
*/
364
boolean isMatchOnlyFilesWithSameName();
365
366
/**
367
* Set whether to compare only files with same name
368
* @param matchOnlyFilesWithSameName true to limit to same-name files
369
*/
370
void setMatchOnlyFilesWithSameName(boolean matchOnlyFilesWithSameName);
371
372
/**
373
* Get renderer for CPD output formatting
374
* @return CPDRenderer for generating reports
375
*/
376
CPDRenderer getRenderer();
377
378
/**
379
* Set renderer for CPD output
380
* @param renderer CPDRenderer for formatting results
381
*/
382
void setRenderer(CPDRenderer renderer);
383
}
384
```
385
386
**Usage Examples:**
387
388
```java
389
import net.sourceforge.pmd.cpd.*;
390
import java.nio.file.Paths;
391
392
// Configuring CPD analysis
393
public class CPDConfigurationExample {
394
395
public void createBasicConfiguration() {
396
CPDConfiguration config = new CPDConfiguration();
397
398
// Set basic parameters
399
config.setMinimumTileSize(50); // Minimum 50 tokens for duplication
400
config.addInputPath(Paths.get("src/main/java"));
401
config.setLanguage(LanguageRegistry.CPD.getLanguageById("java"));
402
403
// Configure output
404
config.setReportFormat("text");
405
config.setReportFile(Paths.get("cpd-report.txt"));
406
407
System.out.println("Basic CPD configuration created");
408
}
409
410
public void createAdvancedConfiguration() {
411
CPDConfiguration config = new CPDConfiguration();
412
413
// Advanced duplication detection settings
414
config.setMinimumTileSize(25); // Lower threshold for more sensitive detection
415
config.setIgnoreLiterals(true); // Ignore string/number differences
416
config.setIgnoreIdentifiers(true); // Ignore variable name differences
417
config.setIgnoreAnnotations(true); // Ignore annotation differences
418
419
// File matching configuration
420
config.setMatchOnlyFilesWithSameName(false); // Allow cross-file comparison
421
422
// Input configuration
423
config.addInputPath(Paths.get("src/main/java"));
424
config.addInputPath(Paths.get("src/test/java"));
425
config.setLanguage(LanguageRegistry.CPD.getLanguageById("java"));
426
427
// Exclude certain patterns
428
config.setExcludes(Arrays.asList(
429
Paths.get("**/generated/**"),
430
Paths.get("**/target/**")
431
));
432
433
// Configure encoding
434
config.setSourceEncoding(StandardCharsets.UTF_8);
435
436
System.out.println("Advanced CPD configuration created");
437
}
438
439
public void configureCPDForDifferentLanguages() {
440
// Java configuration
441
CPDConfiguration javaConfig = new CPDConfiguration();
442
javaConfig.setLanguage(LanguageRegistry.CPD.getLanguageById("java"));
443
javaConfig.setMinimumTileSize(50);
444
javaConfig.addInputPath(Paths.get("src/main/java"));
445
446
// JavaScript configuration
447
CPDConfiguration jsConfig = new CPDConfiguration();
448
jsConfig.setLanguage(LanguageRegistry.CPD.getLanguageById("javascript"));
449
jsConfig.setMinimumTileSize(30); // Smaller threshold for JS
450
jsConfig.setIgnoreLiterals(true);
451
jsConfig.addInputPath(Paths.get("src/main/webapp/js"));
452
453
// Python configuration
454
CPDConfiguration pythonConfig = new CPDConfiguration();
455
pythonConfig.setLanguage(LanguageRegistry.CPD.getLanguageById("python"));
456
pythonConfig.setMinimumTileSize(40);
457
pythonConfig.addInputPath(Paths.get("src/main/python"));
458
459
System.out.println("Language-specific configurations created");
460
}
461
462
public void configureIgnoreOptions() {
463
CPDConfiguration config = new CPDConfiguration();
464
465
// Configure what to ignore for more flexible matching
466
config.setIgnoreLiterals(true); // "hello" matches "world"
467
config.setIgnoreIdentifiers(true); // variable names don't matter
468
config.setIgnoreAnnotations(true); // @Override vs no annotation
469
470
// This configuration will find structural duplicates even when:
471
// - String literals are different
472
// - Variable names are different
473
// - Method names are different
474
// - Annotations are present/absent
475
476
config.setMinimumTileSize(30); // Lower threshold since we're ignoring more
477
config.addInputPath(Paths.get("src"));
478
479
System.out.println("Flexible matching configuration created");
480
}
481
482
public void runMultipleAnalyses() {
483
// Run strict analysis (exact matches)
484
CPDConfiguration strictConfig = new CPDConfiguration();
485
strictConfig.setMinimumTileSize(100);
486
strictConfig.setIgnoreLiterals(false);
487
strictConfig.setIgnoreIdentifiers(false);
488
strictConfig.addInputPath(Paths.get("src"));
489
490
CPD strictCpd = new CPD(strictConfig);
491
strictCpd.go();
492
System.out.printf("Strict analysis found %d exact duplicates%n",
493
countMatches(strictCpd.getMatches()));
494
495
// Run flexible analysis (structural matches)
496
CPDConfiguration flexibleConfig = new CPDConfiguration();
497
flexibleConfig.setMinimumTileSize(50);
498
flexibleConfig.setIgnoreLiterals(true);
499
flexibleConfig.setIgnoreIdentifiers(true);
500
flexibleConfig.addInputPath(Paths.get("src"));
501
502
CPD flexibleCpd = new CPD(flexibleConfig);
503
flexibleCpd.go();
504
System.out.printf("Flexible analysis found %d structural duplicates%n",
505
countMatches(flexibleCpd.getMatches()));
506
}
507
508
private int countMatches(Iterator<Match> matches) {
509
int count = 0;
510
while (matches.hasNext()) {
511
matches.next();
512
count++;
513
}
514
return count;
515
}
516
}
517
```
518
519
## Types
520
521
```java { .api }
522
/**
523
* Mark representing a specific location of duplicated code
524
*/
525
final class Mark {
526
527
/**
528
* Get filename containing the duplicate
529
* @return File path where duplicate code appears
530
*/
531
String getFilename();
532
533
/**
534
* Get starting line number of duplicate
535
* @return One-based line number where duplicate begins
536
*/
537
int getBeginLine();
538
539
/**
540
* Get ending line number of duplicate
541
* @return One-based line number where duplicate ends
542
*/
543
int getEndLine();
544
545
/**
546
* Get starting column number of duplicate
547
* @return One-based column number where duplicate begins
548
*/
549
int getBeginColumn();
550
551
/**
552
* Get ending column number of duplicate
553
* @return One-based column number where duplicate ends
554
*/
555
int getEndColumn();
556
557
/**
558
* Get token count for this mark
559
* @return Number of tokens in the duplicate
560
*/
561
int getTokenCount();
562
563
/**
564
* Compare marks for sorting
565
* @param other Mark to compare against
566
* @return Comparison result for ordering
567
*/
568
int compareTo(Mark other);
569
}
570
571
/**
572
* Renderer interface for CPD output formatting
573
*/
574
interface CPDRenderer {
575
576
/**
577
* Start rendering CPD results
578
*/
579
void start();
580
581
/**
582
* Render a single duplication match
583
* @param match Match to render
584
*/
585
void renderDuplication(Match match);
586
587
/**
588
* Finish rendering and cleanup
589
*/
590
void end();
591
592
/**
593
* Set output writer for rendering
594
* @param writer Writer for output
595
*/
596
void setWriter(Writer writer);
597
}
598
599
/**
600
* Built-in CPD renderers for different output formats
601
*/
602
class CPDRenderers {
603
static CPDRenderer text();
604
static CPDRenderer xml();
605
static CPDRenderer csv();
606
static CPDRenderer json();
607
}
608
609
/**
 * An atomic code element as seen by the CPD analysis.
 */
interface Token {

    /**
     * Returns the token image, i.e. its text representation.
     *
     * @return String representation of token
     */
    String getImage();

    /**
     * Returns the identifier of the token type.
     *
     * @return Integer representing token type
     */
    int getKind();

    /**
     * Returns the line on which the token appears.
     *
     * @return One-based line number
     */
    int getBeginLine();

    /**
     * Returns the column at which the token appears.
     *
     * @return One-based column number
     */
    int getBeginColumn();

    /**
     * Returns the line on which the token ends.
     *
     * @return One-based ending line number
     */
    int getEndLine();

    /**
     * Returns the column at which the token ends.
     *
     * @return One-based ending column number
     */
    int getEndColumn();
}
650
651
/**
 * Visitor used for language-specific tokenization in CPD.
 */
interface CpdVisitor {

    /**
     * Visits a source file and generates its tokens.
     *
     * @param sourceCode Source code to tokenize
     * @param filename File name for context
     */
    void visitFile(String sourceCode, String filename);

    /**
     * Adds a token to the CPD analysis.
     * Note the parameter order: both line arguments come before both column arguments.
     *
     * @param image Token text
     * @param beginLine Starting line
     * @param endLine Ending line
     * @param beginColumn Starting column
     * @param endColumn Ending column
     */
    void add(String image, int beginLine, int endLine, int beginColumn, int endColumn);
}
673
674
/**
675
* Exception thrown during CPD processing
676
*/
677
class CPDException extends Exception {
678
CPDException(String message);
679
CPDException(String message, Throwable cause);
680
}
681
682
/**
 * Summary statistics of a CPD report.
 */
interface CPDReportStats {

    /**
     * Returns how many duplicate blocks were found in total.
     *
     * @return Count of duplicate code blocks
     */
    int getNumberOfDuplicates();

    /**
     * Returns the total number of duplicate tokens.
     *
     * @return Sum of all duplicate token counts
     */
    int getTotalDuplicateTokens();

    /**
     * Returns the total number of duplicate lines.
     *
     * @return Sum of all duplicate line counts
     */
    int getTotalDuplicateLines();

    /**
     * Returns how many files were analyzed.
     *
     * @return Number of source files processed
     */
    int getFilesAnalyzed();

    /**
     * Returns the duplication percentage.
     *
     * @return Percentage of code that is duplicated
     */
    double getDuplicationPercentage();
}
717
```