or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

ast-processing.md · copy-paste-detection.md · core-analysis.md · index.md · language-framework.md · properties-system.md · rendering-system.md · reporting-system.md · rule-system.md · utilities.md

docs/copy-paste-detection.md

0

# Copy-Paste Detection

1

2

The Copy-Paste Detection (CPD) module provides specialized capabilities for identifying code duplications across files using token-based analysis. It includes duplicate detection algorithms, match representation, configuration options, and reporting for code clone analysis.

3

4

## Capabilities

5

6

### CPD Analysis Engine

7

8

Main class for executing copy-paste detection analysis with configurable parameters and comprehensive duplicate identification.

9

10

```java { .api }

11

/**

12

* Main class for Copy-Paste Detection functionality.

13

* Analyzes source files to identify duplicate code segments using token-based comparison.

14

*/

15

public class CPD {

16

17

/**

18

* Constructor with CPD configuration

19

* @param configuration CPDConfiguration with analysis settings

20

*/

21

CPD(CPDConfiguration configuration);

22

23

/**

24

* Execute CPD analysis on configured source files

25

* Processes all files and identifies duplicate code segments

26

*/

27

void go();

28

29

/**

30

* Get detected code duplication matches

31

* @return Iterator over Match instances representing duplicate code

32

*/

33

Iterator<Match> getMatches();

34

35

/**

36

* Get token count for specific file

37

* @param file File path to query

38

* @return Number of tokens found in the file

39

*/

40

int getNumberOfTokens(String file);

41

42

/**

43

* Get token counts for all analyzed files

44

* @return Map of file paths to token counts

45

*/

46

Map<String, Integer> getTokenCounts();

47

}

48

```

49

50

**Usage Examples:**

51

52

```java

53

import net.sourceforge.pmd.cpd.*;

54

import java.nio.file.Paths;

55

import java.util.Iterator;

56

57

// Basic CPD analysis

58

public class CPDAnalysisExample {

59

60

public void runCPDAnalysis() {

61

// Create CPD configuration

62

CPDConfiguration config = new CPDConfiguration();

63

config.setMinimumTileSize(50); // Minimum tokens for duplication

64

config.addInputPath(Paths.get("src/main/java"));

65

config.setLanguage(LanguageRegistry.CPD.getLanguageByFullName("Java"));

66

67

// Create and execute CPD

68

CPD cpd = new CPD(config);

69

cpd.go(); // Execute analysis

70

71

// Process results

72

Iterator<Match> matches = cpd.getMatches();

73

int duplicateCount = 0;

74

75

while (matches.hasNext()) {

76

Match match = matches.next();

77

duplicateCount++;

78

79

System.out.printf("Duplicate #%d:%n", duplicateCount);

80

System.out.printf(" Tokens: %d%n", match.getTokenCount());

81

System.out.printf(" Lines: %d%n", match.getLineCount());

82

System.out.printf(" Locations: %d%n", match.getMarkSet().size());

83

84

// Show all locations of this duplicate

85

for (Mark mark : match.getMarkSet()) {

86

System.out.printf(" %s:%d-%d%n",

87

mark.getFilename(),

88

mark.getBeginLine(),

89

mark.getEndLine());

90

}

91

92

// Show the duplicated code

93

System.out.println(" Code:");

94

String[] lines = match.getSourceCodeSlice().split("\\n");

95

for (int i = 0; i < Math.min(lines.length, 5); i++) {

96

System.out.printf(" %s%n", lines[i]);

97

}

98

if (lines.length > 5) {

99

System.out.println(" ...");

100

}

101

System.out.println();

102

}

103

104

System.out.printf("Found %d code duplications%n", duplicateCount);

105

}

106

107

public void analyzeTokenCounts(CPD cpd) {

108

// Get token statistics

109

Map<String, Integer> tokenCounts = cpd.getTokenCounts();

110

111

System.out.println("Token counts by file:");

112

tokenCounts.entrySet().stream()

113

.sorted(Map.Entry.<String, Integer>comparingByValue().reversed())

114

.forEach(entry ->

115

System.out.printf(" %s: %d tokens%n",

116

entry.getKey(), entry.getValue()));

117

118

// Calculate total tokens

119

int totalTokens = tokenCounts.values().stream()

120

.mapToInt(Integer::intValue)

121

.sum();

122

123

System.out.printf("Total tokens analyzed: %d%n", totalTokens);

124

125

// Find largest files

126

String largestFile = tokenCounts.entrySet().stream()

127

.max(Map.Entry.comparingByValue())

128

.map(Map.Entry::getKey)

129

.orElse("none");

130

131

System.out.printf("Largest file: %s (%d tokens)%n",

132

largestFile,

133

tokenCounts.getOrDefault(largestFile, 0));

134

}

135

}

136

```

137

138

### Match Representation

139

140

Representation of detected code duplication matches with location tracking and source code access.

141

142

```java { .api }

143

/**

144

* Represents a detected code duplication match.

145

* Contains information about duplicate locations and the duplicated source code.

146

*/

147

public final class Match {

148

149

/**

150

* Get number of duplicate tokens

151

* @return Token count for the duplicated code segment

152

*/

153

int getTokenCount();

154

155

/**

156

* Get number of duplicate lines

157

* @return Line count for the duplicated code segment

158

*/

159

int getLineCount();

160

161

/**

162

* Get all locations where this duplication appears

163

* @return List of Mark instances representing duplicate locations

164

*/

165

List<Mark> getMarkSet();

166

167

/**

168

* Get duplicated source code content

169

* @return Source code text that is duplicated across locations

170

*/

171

String getSourceCodeSlice();

172

173

/**

174

* Compare matches for sorting (by token count, then line count)

175

* @param other Match to compare against

176

* @return Comparison result for ordering matches

177

*/

178

int compareTo(Match other);

179

}

180

```

181

182

**Usage Examples:**

183

184

```java

185

import net.sourceforge.pmd.cpd.*;

186

import java.util.List;

187

import java.util.ArrayList;

188

import java.util.Collections;

189

190

// Processing duplicate matches

191

public class MatchProcessingExample {

192

193

public void processMatches(Iterator<Match> matches) {

194

List<Match> matchList = new ArrayList<>();

195

matches.forEachRemaining(matchList::add);

196

197

// Sort matches by significance (token count descending)

198

Collections.sort(matchList, Collections.reverseOrder());

199

200

System.out.printf("Found %d duplicate code blocks:%n%n", matchList.size());

201

202

for (int i = 0; i < matchList.size(); i++) {

203

Match match = matchList.get(i);

204

processSingleMatch(match, i + 1);

205

}

206

}

207

208

public void processSingleMatch(Match match, int index) {

209

System.out.printf("=== Duplicate #%d ===%n", index);

210

System.out.printf("Size: %d tokens (%d lines)%n",

211

match.getTokenCount(),

212

match.getLineCount());

213

214

List<Mark> locations = match.getMarkSet();

215

System.out.printf("Appears in %d locations:%n", locations.size());

216

217

// Show all locations

218

for (int i = 0; i < locations.size(); i++) {

219

Mark mark = locations.get(i);

220

System.out.printf(" %d. %s (lines %d-%d)%n",

221

i + 1,

222

mark.getFilename(),

223

mark.getBeginLine(),

224

mark.getEndLine());

225

}

226

227

// Show the duplicated code

228

String sourceCode = match.getSourceCodeSlice();

229

System.out.println("Duplicated code:");

230

String[] lines = sourceCode.split("\\r?\\n");

231

232

for (int i = 0; i < Math.min(lines.length, 10); i++) {

233

System.out.printf(" %2d: %s%n", i + 1, lines[i]);

234

}

235

236

if (lines.length > 10) {

237

System.out.printf(" ... (%d more lines)%n", lines.length - 10);

238

}

239

240

System.out.println();

241

}

242

243

public void generateDuplicationReport(List<Match> matches) {

244

// Calculate duplication statistics

245

int totalDuplicateTokens = matches.stream()

246

.mapToInt(match -> match.getTokenCount() * (match.getMarkSet().size() - 1))

247

.sum();

248

249

int totalDuplicateLines = matches.stream()

250

.mapToInt(match -> match.getLineCount() * (match.getMarkSet().size() - 1))

251

.sum();

252

253

// Find files with most duplications

254

Map<String, Integer> fileOccurrences = new HashMap<>();

255

matches.forEach(match ->

256

match.getMarkSet().forEach(mark ->

257

fileOccurrences.merge(mark.getFilename(), 1, Integer::sum)));

258

259

System.out.println("=== Duplication Summary ===");

260

System.out.printf("Total duplicate blocks: %d%n", matches.size());

261

System.out.printf("Total duplicate tokens: %d%n", totalDuplicateTokens);

262

System.out.printf("Total duplicate lines: %d%n", totalDuplicateLines);

263

264

System.out.println("%nFiles with most duplications:");

265

fileOccurrences.entrySet().stream()

266

.sorted(Map.Entry.<String, Integer>comparingByValue().reversed())

267

.limit(10)

268

.forEach(entry ->

269

System.out.printf(" %s: %d occurrences%n",

270

entry.getKey(), entry.getValue()));

271

272

// Analyze duplication sizes

273

System.out.println("%nDuplication size distribution:");

274

Map<String, Long> sizeDistribution = matches.stream()

275

.collect(Collectors.groupingBy(

276

match -> {

277

int tokens = match.getTokenCount();

278

if (tokens < 100) return "Small (< 100 tokens)";

279

else if (tokens < 500) return "Medium (100-500 tokens)";

280

else return "Large (500+ tokens)";

281

},

282

Collectors.counting()));

283

284

sizeDistribution.forEach((size, count) ->

285

System.out.printf(" %s: %d duplicates%n", size, count));

286

}

287

}

288

```

289

290

### CPD Configuration

291

292

Configuration class for customizing copy-paste detection analysis parameters and behavior.

293

294

```java { .api }

295

/**

296

* Configuration for Copy-Paste Detection analysis.

297

* Extends AbstractConfiguration with CPD-specific settings.

298

*/

299

public class CPDConfiguration extends AbstractConfiguration {

300

301

/**

302

* Default constructor with CPD language registry

303

*/

304

CPDConfiguration();

305

306

/**

307

* Constructor with custom language registry

308

* @param languageRegistry Registry of CPD-capable languages

309

*/

310

CPDConfiguration(LanguageRegistry languageRegistry);

311

312

/**

313

* Get minimum tile size (minimum tokens for duplication)

314

* @return Minimum number of tokens required for duplicate detection

315

*/

316

int getMinimumTileSize();

317

318

/**

319

* Set minimum tile size for duplication detection

320

* @param minimumTileSize Minimum tokens (must be positive)

321

*/

322

void setMinimumTileSize(int minimumTileSize);

323

324

/**

325

* Check if differences in literals are ignored

326

* @return true if literal values are ignored during comparison

327

*/

328

boolean isIgnoreLiterals();

329

330

/**

331

* Set whether to ignore literal differences

332

* @param ignoreLiterals true to ignore string/numeric literal values

333

*/

334

void setIgnoreLiterals(boolean ignoreLiterals);

335

336

/**

337

* Check if differences in identifiers are ignored

338

* @return true if identifier names are ignored during comparison

339

*/

340

boolean isIgnoreIdentifiers();

341

342

/**

343

* Set whether to ignore identifier differences

344

* @param ignoreIdentifiers true to ignore variable/method names

345

*/

346

void setIgnoreIdentifiers(boolean ignoreIdentifiers);

347

348

/**

349

* Check if annotation differences are ignored

350

* @return true if annotations are ignored during comparison

351

*/

352

boolean isIgnoreAnnotations();

353

354

/**

355

* Set whether to ignore annotation differences

356

* @param ignoreAnnotations true to ignore annotation presence/content

357

*/

358

void setIgnoreAnnotations(boolean ignoreAnnotations);

359

360

/**

361

* Check if only files with same name are compared

362

* @return true if cross-file comparison is limited to same filenames

363

*/

364

boolean isMatchOnlyFilesWithSameName();

365

366

/**

367

* Set whether to compare only files with same name

368

* @param matchOnlyFilesWithSameName true to limit to same-name files

369

*/

370

void setMatchOnlyFilesWithSameName(boolean matchOnlyFilesWithSameName);

371

372

/**

373

* Get renderer for CPD output formatting

374

* @return CPDRenderer for generating reports

375

*/

376

CPDRenderer getRenderer();

377

378

/**

379

* Set renderer for CPD output

380

* @param renderer CPDRenderer for formatting results

381

*/

382

void setRenderer(CPDRenderer renderer);

383

}

384

```

385

386

**Usage Examples:**

387

388

```java

389

import net.sourceforge.pmd.cpd.*;

390

import java.nio.file.Paths;

391

392

// Configuring CPD analysis

393

public class CPDConfigurationExample {

394

395

public void createBasicConfiguration() {

396

CPDConfiguration config = new CPDConfiguration();

397

398

// Set basic parameters

399

config.setMinimumTileSize(50); // Minimum 50 tokens for duplication

400

config.addInputPath(Paths.get("src/main/java"));

401

config.setLanguage(LanguageRegistry.CPD.getLanguageById("java"));

402

403

// Configure output

404

config.setReportFormat("text");

405

config.setReportFile(Paths.get("cpd-report.txt"));

406

407

System.out.println("Basic CPD configuration created");

408

}

409

410

public void createAdvancedConfiguration() {

411

CPDConfiguration config = new CPDConfiguration();

412

413

// Advanced duplication detection settings

414

config.setMinimumTileSize(25); // Lower threshold for more sensitive detection

415

config.setIgnoreLiterals(true); // Ignore string/number differences

416

config.setIgnoreIdentifiers(true); // Ignore variable name differences

417

config.setIgnoreAnnotations(true); // Ignore annotation differences

418

419

// File matching configuration

420

config.setMatchOnlyFilesWithSameName(false); // Allow cross-file comparison

421

422

// Input configuration

423

config.addInputPath(Paths.get("src/main/java"));

424

config.addInputPath(Paths.get("src/test/java"));

425

config.setLanguage(LanguageRegistry.CPD.getLanguageById("java"));

426

427

// Exclude certain patterns

428

config.setExcludes(Arrays.asList(

429

Paths.get("**/generated/**"),

430

Paths.get("**/target/**")

431

));

432

433

// Configure encoding

434

config.setSourceEncoding(StandardCharsets.UTF_8);

435

436

System.out.println("Advanced CPD configuration created");

437

}

438

439

public void configureCPDForDifferentLanguages() {

440

// Java configuration

441

CPDConfiguration javaConfig = new CPDConfiguration();

442

javaConfig.setLanguage(LanguageRegistry.CPD.getLanguageById("java"));

443

javaConfig.setMinimumTileSize(50);

444

javaConfig.addInputPath(Paths.get("src/main/java"));

445

446

// JavaScript configuration

447

CPDConfiguration jsConfig = new CPDConfiguration();

448

jsConfig.setLanguage(LanguageRegistry.CPD.getLanguageById("javascript"));

449

jsConfig.setMinimumTileSize(30); // Smaller threshold for JS

450

jsConfig.setIgnoreLiterals(true);

451

jsConfig.addInputPath(Paths.get("src/main/webapp/js"));

452

453

// Python configuration

454

CPDConfiguration pythonConfig = new CPDConfiguration();

455

pythonConfig.setLanguage(LanguageRegistry.CPD.getLanguageById("python"));

456

pythonConfig.setMinimumTileSize(40);

457

pythonConfig.addInputPath(Paths.get("src/main/python"));

458

459

System.out.println("Language-specific configurations created");

460

}

461

462

public void configureIgnoreOptions() {

463

CPDConfiguration config = new CPDConfiguration();

464

465

// Configure what to ignore for more flexible matching

466

config.setIgnoreLiterals(true); // "hello" matches "world"

467

config.setIgnoreIdentifiers(true); // variable names don't matter

468

config.setIgnoreAnnotations(true); // @Override vs no annotation

469

470

// This configuration will find structural duplicates even when:

471

// - String literals are different

472

// - Variable names are different

473

// - Method names are different

474

// - Annotations are present/absent

475

476

config.setMinimumTileSize(30); // Lower threshold since we're ignoring more

477

config.addInputPath(Paths.get("src"));

478

479

System.out.println("Flexible matching configuration created");

480

}

481

482

public void runMultipleAnalyses() {

483

// Run strict analysis (exact matches)

484

CPDConfiguration strictConfig = new CPDConfiguration();

485

strictConfig.setMinimumTileSize(100);

486

strictConfig.setIgnoreLiterals(false);

487

strictConfig.setIgnoreIdentifiers(false);

488

strictConfig.addInputPath(Paths.get("src"));

489

490

CPD strictCpd = new CPD(strictConfig);

491

strictCpd.go();

492

System.out.printf("Strict analysis found %d exact duplicates%n",

493

countMatches(strictCpd.getMatches()));

494

495

// Run flexible analysis (structural matches)

496

CPDConfiguration flexibleConfig = new CPDConfiguration();

497

flexibleConfig.setMinimumTileSize(50);

498

flexibleConfig.setIgnoreLiterals(true);

499

flexibleConfig.setIgnoreIdentifiers(true);

500

flexibleConfig.addInputPath(Paths.get("src"));

501

502

CPD flexibleCpd = new CPD(flexibleConfig);

503

flexibleCpd.go();

504

System.out.printf("Flexible analysis found %d structural duplicates%n",

505

countMatches(flexibleCpd.getMatches()));

506

}

507

508

private int countMatches(Iterator<Match> matches) {

509

int count = 0;

510

while (matches.hasNext()) {

511

matches.next();

512

count++;

513

}

514

return count;

515

}

516

}

517

```

518

519

## Types

520

521

```java { .api }

522

/**

523

* Mark representing a specific location of duplicated code

524

*/

525

final class Mark {

526

527

/**

528

* Get filename containing the duplicate

529

* @return File path where duplicate code appears

530

*/

531

String getFilename();

532

533

/**

534

* Get starting line number of duplicate

535

* @return One-based line number where duplicate begins

536

*/

537

int getBeginLine();

538

539

/**

540

* Get ending line number of duplicate

541

* @return One-based line number where duplicate ends

542

*/

543

int getEndLine();

544

545

/**

546

* Get starting column number of duplicate

547

* @return One-based column number where duplicate begins

548

*/

549

int getBeginColumn();

550

551

/**

552

* Get ending column number of duplicate

553

* @return One-based column number where duplicate ends

554

*/

555

int getEndColumn();

556

557

/**

558

* Get token count for this mark

559

* @return Number of tokens in the duplicate

560

*/

561

int getTokenCount();

562

563

/**

564

* Compare marks for sorting

565

* @param other Mark to compare against

566

* @return Comparison result for ordering

567

*/

568

int compareTo(Mark other);

569

}

570

571

/**

572

* Renderer interface for CPD output formatting

573

*/

574

interface CPDRenderer {

575

576

/**

577

* Start rendering CPD results

578

*/

579

void start();

580

581

/**

582

* Render a single duplication match

583

* @param match Match to render

584

*/

585

void renderDuplication(Match match);

586

587

/**

588

* Finish rendering and cleanup

589

*/

590

void end();

591

592

/**

593

* Set output writer for rendering

594

* @param writer Writer for output

595

*/

596

void setWriter(Writer writer);

597

}

598

599

/**

600

* Built-in CPD renderers for different output formats

601

*/

602

class CPDRenderers {

603

static CPDRenderer text();

604

static CPDRenderer xml();

605

static CPDRenderer csv();

606

static CPDRenderer json();

607

}

608

609

/**

610

* Token for CPD analysis representing atomic code elements

611

*/

612

interface Token {

613

614

/**

615

* Get token image (text representation)

616

* @return String representation of token

617

*/

618

String getImage();

619

620

/**

621

* Get token type identifier

622

* @return Integer representing token type

623

*/

624

int getKind();

625

626

/**

627

* Get line number where token appears

628

* @return One-based line number

629

*/

630

int getBeginLine();

631

632

/**

633

* Get column number where token appears

634

* @return One-based column number

635

*/

636

int getBeginColumn();

637

638

/**

639

* Get ending line number of token

640

* @return One-based ending line number

641

*/

642

int getEndLine();

643

644

/**

645

* Get ending column number of token

646

* @return One-based ending column number

647

*/

648

int getEndColumn();

649

}

650

651

/**

652

* CPD visitor for language-specific tokenization

653

*/

654

interface CpdVisitor {

655

656

/**

657

* Visit source file and generate tokens

658

* @param sourceCode Source code to tokenize

659

* @param filename File name for context

660

*/

661

void visitFile(String sourceCode, String filename);

662

663

/**

664

* Add token to CPD analysis

665

* @param image Token text

666

* @param beginLine Starting line

667

* @param endLine Ending line

668

* @param beginColumn Starting column

669

* @param endColumn Ending column

670

*/

671

void add(String image, int beginLine, int endLine, int beginColumn, int endColumn);

672

}

673

674

/**

675

* Exception thrown during CPD processing

676

*/

677

class CPDException extends Exception {

678

CPDException(String message);

679

CPDException(String message, Throwable cause);

680

}

681

682

/**

683

* CPD report statistics

684

*/

685

interface CPDReportStats {

686

687

/**

688

* Get total number of duplicate blocks found

689

* @return Count of duplicate code blocks

690

*/

691

int getNumberOfDuplicates();

692

693

/**

694

* Get total number of duplicate tokens

695

* @return Sum of all duplicate token counts

696

*/

697

int getTotalDuplicateTokens();

698

699

/**

700

* Get total number of duplicate lines

701

* @return Sum of all duplicate line counts

702

*/

703

int getTotalDuplicateLines();

704

705

/**

706

* Get files analyzed count

707

* @return Number of source files processed

708

*/

709

int getFilesAnalyzed();

710

711

/**

712

* Get duplication percentage

713

* @return Percentage of code that is duplicated

714

*/

715

double getDuplicationPercentage();

716

}

717

```