Copy-paste detection (CPD) capabilities provide Scalameta-based tokenization for identifying code duplication in Scala projects. The CPD system integrates with PMD's duplicate code detection framework to analyze Scala source files for similar code patterns.
Primary tokenizer for copy-paste detection that converts Scala source code into tokens for duplication analysis.
/**
 * Primary tokenizer for copy-paste detection: converts Scala source code into
 * tokens for duplication analysis.
 */
public class ScalaCpdLexer implements CpdLexer {
    /**
     * Creates a lexer configured by the given language property bundle.
     *
     * @param bundle language properties for this lexer
     */
    public ScalaCpdLexer(LanguagePropertyBundle bundle);

    /**
     * Tokenizes the document and records the resulting tokens.
     *
     * @param document     the source document to tokenize
     * @param tokenEntries receives the tokens produced for the document
     * @throws IOException if tokenization fails
     */
    public void tokenize(TextDocument document, TokenFactory tokenEntries) throws IOException;
}

Usage Example:
// Create CPD lexer with language properties.
// The bundle is obtained from the Scala language module, which owns the
// property definitions for the language.
LanguagePropertyBundle bundle = ScalaLanguageModule.getInstance().newPropertyBundle();
ScalaCpdLexer lexer = new ScalaCpdLexer(bundle);

// Tokenize a Scala source file
// NOTE(review): the document and token-factory construction below is schematic;
// confirm the exact factory methods against the PMD version in use.
TextDocument document = TextDocument.readOnlyString("Example.scala", sourceCode);
List<CpdToken> tokens = new ArrayList<>();
TokenFactory tokenFactory = new TokenFactory(tokens);

try {
    lexer.tokenize(document, tokenFactory);
    System.out.println("Generated " + tokens.size() + " tokens for CPD analysis");
} catch (IOException e) {
    // tokenize reports every tokenization failure as an IOException
    System.err.println("Tokenization failed: " + e.getMessage());
}

Adapter class that bridges Scalameta tokens to PMD's CPD token interface.
/**
 * Adapter that bridges Scalameta tokens to PMD's CPD token interface.
 */
public class ScalaTokenAdapter {
    // Internal adapter implementation
    // Converts scala.meta.Token to PMD CpdToken format
}

Internal Usage:
// Used internally by ScalaCpdLexer
scala.meta.Token scalametaToken = // ... from Scalameta parsing
// Wrap the Scalameta token for CPD, then hand it to the token factory
CpdToken pmdToken = ScalaTokenAdapter.adapt(scalametaToken, document);
tokenFactory.recordToken(pmdToken);

The CPD lexer processes Scala source code through the following steps:
// Internal tokenization process
/**
 * Tokenizes the document text with Scalameta and records every token that
 * passes {@code shouldIncludeToken}.
 *
 * @param document     the document whose text is tokenized
 * @param tokenEntries receives each adapted token
 * @throws IOException if any step of tokenization fails; the original failure
 *                     is attached as the cause
 */
public void tokenize(TextDocument document, TokenFactory tokenEntries) throws IOException {
    try {
        // Parse with Scalameta
        Input input = Input.String(document.getText().toString());
        Tokens tokens = input.tokenize().get();

        // Filter and adapt tokens
        for (Token token : tokens) {
            if (shouldIncludeToken(token)) {
                CpdToken cpdToken = adaptToken(token, document);
                tokenEntries.recordToken(cpdToken);
            }
        }
    } catch (Exception e) {
        // An IOException already has the declared type: propagate it unchanged
        // instead of burying it one level deeper in a second IOException.
        if (e instanceof IOException) {
            throw (IOException) e;
        }
        throw new IOException("Scala tokenization failed", e);
    }
}

The tokenizer applies filtering rules to focus on semantically meaningful tokens:
/**
 * Decides whether a Scalameta token takes part in duplication analysis.
 *
 * <p>Declared {@code protected} so that subclasses can override the filter and
 * still delegate to this default via {@code super.shouldIncludeToken(token)}.
 *
 * @param token the token to test
 * @return {@code false} for comment, space, tab, line-feed, CR-LF and form-feed
 *         tokens; {@code true} for every other token
 */
protected boolean shouldIncludeToken(Token token) {
    // Exclude comments, whitespace, formatting tokens
    boolean isTrivia = token instanceof Token.Comment
            || token instanceof Token.Space
            || token instanceof Token.Tab
            || token instanceof Token.LF
            || token instanceof Token.CRLF
            || token instanceof Token.FF;

    // Include identifiers, keywords, literals, operators
    return !isTrivia;
}

CPD integration is handled through the language module:
/**
 * Language module for Scala; the entry point through which CPD obtains its lexer.
 */
public class ScalaLanguageModule extends SimpleLanguageModuleBase {
    /**
     * Creates the CPD lexer for Scala sources.
     *
     * @param bundle language properties passed on to the lexer
     * @return a new {@link ScalaCpdLexer} configured with {@code bundle}
     */
    @Override
    public CpdLexer createCpdLexer(LanguagePropertyBundle bundle) {
        return new ScalaCpdLexer(bundle);
    }
}

Usage Example:
// Get CPD lexer from language module
ScalaLanguageModule module = ScalaLanguageModule.getInstance();
// Start from the module's own property bundle, then configure properties as needed
LanguagePropertyBundle bundle = module.newPropertyBundle();
CpdLexer lexer = module.createCpdLexer(bundle);

// Use lexer for CPD analysis
lexer.tokenize(document, tokenEntries);

Configure CPD analysis parameters for Scala projects:
// CPD configuration for Scala
// CPDConfiguration is a plain mutable configuration object: create it, then call setters.
CPDConfiguration config = new CPDConfiguration();
config.setMinimumTileSize(50);                                      // Minimum tokens for duplication
config.setOnlyRecognizeLanguage(ScalaLanguageModule.getInstance()); // Use Scala CPD lexer
config.setIgnoreAnnotations(true);                                  // Ignore annotation differences
config.setIgnoreIdentifiers(false);                                 // Consider identifier names
config.setIgnoreLiterals(true);                                     // Ignore literal value differences

CPD can detect duplicated code patterns at various levels:
// Example: Similar class structures
// The two services below have the same shape: a findById that runs a SELECT and
// maps the result through a parser, and a save that runs an INSERT and compares
// the update count with 0. They differ only in entity names, SQL text and the
// fields passed to executeUpdate.
class UserService {
def findById(id: Long): Option[User] = {
val query = "SELECT * FROM users WHERE id = ?"
executeQuery(query, id).map(parseUser)
}
def save(user: User): Boolean = {
val query = "INSERT INTO users (name, email) VALUES (?, ?)"
executeUpdate(query, user.name, user.email) > 0
}
}
class ProductService {
def findById(id: Long): Option[Product] = {
val query = "SELECT * FROM products WHERE id = ?"
executeQuery(query, id).map(parseProduct) // Similar pattern
}
def save(product: Product): Boolean = {
val query = "INSERT INTO products (name, price) VALUES (?, ?)"
executeUpdate(query, product.name, product.price) > 0 // Similar pattern
}
}

// Example: Similar method implementations
// Both methods map every element through the same three-step pipeline
// (validate, normalize, enrich) and wrap the result; only the element type and
// the helper names differ.
def processUsers(users: List[User]): List[ProcessedUser] = {
users.map { user =>
val validated = validateUser(user)
val normalized = normalizeUser(validated)
val enriched = enrichUser(normalized)
ProcessedUser(enriched)
}
}
def processProducts(products: List[Product]): List[ProcessedProduct] = {
products.map { product =>
val validated = validateProduct(product) // Similar structure
val normalized = normalizeProduct(validated) // Similar structure
val enriched = enrichProduct(normalized) // Similar structure
ProcessedProduct(enriched)
}
}

// Example: Similar expression patterns
// Both expressions fetch, parse and validate inside a Try, fall back to a
// default for network and parse failures, and then unwrap with .get.
// Note: .get rethrows any failure that the recover block does not handle.
val userResult = Try {
  val data = fetchUserData(id)
  val parsed = parseUserData(data)
  val validated = validateUserData(parsed)
  validated
}.recover {
  case _: NetworkException => DefaultUser
  case _: ParseException => DefaultUser
}.get

val productResult = Try {
  val data = fetchProductData(id) // Similar pattern
  val parsed = parseProductData(data) // Similar pattern
  val validated = validateProductData(parsed) // Similar pattern
  validated
}.recover {
  case _: NetworkException => DefaultProduct // Similar pattern
  case _: ParseException => DefaultProduct // Similar pattern
}.get

CPD generates reports identifying duplicated code blocks:
<!-- Example CPD report for Scala -->
<!-- Each <duplication> lists the occurrences of one duplicated block (<file> entries)
     followed by the duplicated text; "lines" matches the length of the fragment shown. -->
<pmd-cpd>
  <duplication lines="4" tokens="45">
    <file line="15" path="src/main/scala/UserService.scala"/>
    <file line="28" path="src/main/scala/ProductService.scala"/>
    <codefragment><![CDATA[
def findById(id: Long): Option[T] = {
  val query = "SELECT * FROM table WHERE id = ?"
  executeQuery(query, id).map(parseEntity)
}
]]></codefragment>
  </duplication>
</pmd-cpd>

// Analyze CPD results programmatically
/**
 * Prints a summary of each duplication to standard output: token count, line
 * count, the file and begin line of every occurrence, and the duplicated code.
 *
 * NOTE(review): accessor names on Match/Mark differ between PMD major versions;
 * confirm that getFilename(), getBeginLine() and getSourceCodeSlice() exist in
 * the version in use.
 *
 * @param duplications the matches reported by CPD
 */
public void analyzeCpdResults(List<Match> duplications) {
    for (Match duplication : duplications) {
        System.out.println("Found duplication:");
        System.out.println("  Tokens: " + duplication.getTokenCount());
        System.out.println("  Lines: " + duplication.getLineCount());

        // One mark per location where the duplicated block occurs
        for (Mark mark : duplication.getMarkSet()) {
            System.out.println("  File: " + mark.getFilename() +
                    " (line " + mark.getBeginLine() + ")");
        }

        System.out.println("  Code fragment:");
        System.out.println("  " + duplication.getSourceCodeSlice());
    }
}

CPD can normalize tokens to detect semantic duplications that differ in naming:
// Configuration for token normalization
// NOTE(review): these flags only have an effect when the lexer of the analysed
// language implements the corresponding normalization — confirm for Scala.
CPDConfiguration config = new CPDConfiguration();
config.setIgnoreIdentifiers(true);  // user/product → identifier
config.setIgnoreLiterals(true);     // "users"/"products" → string_literal
config.setIgnoreAnnotations(true);  // @Entity/@Component → annotation

This allows detection of functionally identical code with different names:
// These would be detected as duplicates with normalization
// Both methods forward their single argument to repository.save; only the
// method name, parameter name and parameter type differ.
def saveUser(user: User) = repository.save(user)
def saveProduct(product: Product) = repository.save(product)
// Normalized tokens: save(identifier) = identifier.save(identifier)public class CustomScalaCpdLexer extends ScalaCpdLexer {
public CustomScalaCpdLexer(LanguagePropertyBundle bundle) {
super(bundle);
}
@Override
protected boolean shouldIncludeToken(Token token) {
// Custom filtering logic
if (token instanceof Token.KwPrivate || token instanceof Token.KwProtected) {
return false; // Ignore visibility modifiers
}
if (token instanceof Token.Ident && isTestMethodName(token)) {
return false; // Ignore test method names
}
return super.shouldIncludeToken(token);
}
private boolean isTestMethodName(Token.Ident token) {
String name = token.value();
return name.startsWith("test") || name.contains("should");
}
}<plugin>
<!-- CPD is run by the Maven PMD plugin (cpd / cpd-check goals). -->
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-pmd-plugin</artifactId>
<configuration>
  <!-- Report format written by the cpd goal -->
  <format>xml</format>
  <!-- Language analysed by CPD.
       NOTE(review): languages not bundled with the plugin need the matching PMD
       language module added as a plugin dependency; confirm for Scala. -->
  <language>scala</language>
  <!-- Minimum number of tokens for a block to count as a duplicate -->
  <minimumTokens>50</minimumTokens>
</configuration>
</plugin>

// build.sbt
libraryDependencies += "net.sourceforge.pmd" % "pmd-scala_2.12" % "7.13.0"

// Custom CPD task
lazy val cpd = taskKey[Unit]("Run copy-paste detection")

cpd := {
  // Resolve the Scala source root and the compile classpath, then pass both
  // to the CPD runner.
  val scalaSources = (Compile / scalaSource).value
  val compileClasspath = (Compile / dependencyClasspath).value
  // Run CPD analysis on Scala sources
  runCpdAnalysis(scalaSources, compileClasspath)
}

// Optimize tokenization for large codebases
public class OptimizedScalaCpdLexer extends ScalaCpdLexer {
private final Cache<String, Tokens> tokenCache =
CacheBuilder.newBuilder()
.maximumSize(1000)
.expireAfterWrite(10, TimeUnit.MINUTES)
.build();
@Override
public void tokenize(TextDocument document, TokenFactory tokenEntries) throws IOException {
String content = document.getText().toString();
try {
Tokens tokens = tokenCache.get(content, () -> {
Input input = Input.String(content);
return input.tokenize().get();
});
processTokens(tokens, document, tokenEntries);
} catch (ExecutionException e) {
throw new IOException("Tokenization failed", e.getCause());
}
}
}// Stream-based processing for large files
/**
 * Tokenizes a document one line at a time via {@code tokenizeLine}.
 *
 * NOTE(review): each line is tokenized in isolation, so tokens that span
 * several lines (block comments, multi-line strings) presumably cannot be
 * recognised this way — confirm before using on real sources.
 *
 * @param document     the document whose lines are tokenized
 * @param tokenEntries receives the tokens of every line
 * @throws IOException if tokenizing any line fails
 */
public void tokenizeLargeFile(TextDocument document, TokenFactory tokenEntries) throws IOException {
    try (Stream<String> lines = document.getText().lines()) {
        lines.forEach(line -> {
            try {
                tokenizeLine(line, tokenEntries);
            } catch (IOException e) {
                // A lambda cannot throw checked exceptions: tunnel the failure
                // out of forEach in the dedicated unchecked wrapper.
                throw new java.io.UncheckedIOException(e);
            }
        });
    } catch (java.io.UncheckedIOException e) {
        // Unwrap so callers see the IOException this method declares rather
        // than a bare RuntimeException.
        throw e.getCause();
    }
}

The copy-paste detection system provides comprehensive duplication analysis capabilities for Scala codebases, enabling teams to identify and eliminate code duplication effectively.