From c47f5a60624409792fb1afd6328b50fc0db10afd Mon Sep 17 00:00:00 2001 From: Jakub Dupak Date: Fri, 24 May 2024 11:39:08 +0200 Subject: [PATCH 1/3] [cpd] Add ignore literals and identifiers capability to C++ --- docs/pages/pmd/userdocs/cpd/cpd.md | 4 +- .../pmd/lang/cpp/CppLanguageModule.java | 2 + .../pmd/lang/cpp/cpd/CppCpdLexer.java | 26 +++- .../pmd/lang/cpp/cpd/CppCpdLexerTest.java | 33 ++++- .../lang/cpp/cpd/testdata/ignoreIdents.cpp | 6 + .../lang/cpp/cpd/testdata/ignoreIdents.txt | 35 +++++ .../lang/cpp/cpd/testdata/ignoreLiterals.cpp | 43 ++++++ .../lang/cpp/cpd/testdata/ignoreLiterals.txt | 135 ++++++++++++++++++ 8 files changed, 274 insertions(+), 10 deletions(-) create mode 100644 pmd-cpp/src/test/resources/net/sourceforge/pmd/lang/cpp/cpd/testdata/ignoreIdents.cpp create mode 100644 pmd-cpp/src/test/resources/net/sourceforge/pmd/lang/cpp/cpd/testdata/ignoreIdents.txt create mode 100644 pmd-cpp/src/test/resources/net/sourceforge/pmd/lang/cpp/cpd/testdata/ignoreLiterals.cpp create mode 100644 pmd-cpp/src/test/resources/net/sourceforge/pmd/lang/cpp/cpd/testdata/ignoreLiterals.txt diff --git a/docs/pages/pmd/userdocs/cpd/cpd.md b/docs/pages/pmd/userdocs/cpd/cpd.md index 1bf2954c18..0580732bb4 100644 --- a/docs/pages/pmd/userdocs/cpd/cpd.md +++ b/docs/pages/pmd/userdocs/cpd/cpd.md @@ -163,7 +163,7 @@ exactly identical. {% include custom/cli_option_row.html options="--ignore-literals" description="Ignore literal values such as numbers and strings when comparing text. By default, literals are not ignored." - languages="Java" + languages="Java, C++" %} {% include custom/cli_option_row.html options="--ignore-literal-sequences" description="Ignore sequences of literals such as list initializers. @@ -173,7 +173,7 @@ exactly identical. {% include custom/cli_option_row.html options="--ignore-identifiers" description="Ignore names of classes, methods, variables, constants, etc. when comparing text. By default, identifier names are not ignored." 
- languages="Java" + languages="Java, C++" %} {% include custom/cli_option_row.html options="--ignore-annotations" description="Ignore language annotations (Java) or attributes (C#) when comparing text. diff --git a/pmd-cpp/src/main/java/net/sourceforge/pmd/lang/cpp/CppLanguageModule.java b/pmd-cpp/src/main/java/net/sourceforge/pmd/lang/cpp/CppLanguageModule.java index 520fc7327c..5312f8abaf 100644 --- a/pmd-cpp/src/main/java/net/sourceforge/pmd/lang/cpp/CppLanguageModule.java +++ b/pmd-cpp/src/main/java/net/sourceforge/pmd/lang/cpp/CppLanguageModule.java @@ -47,6 +47,8 @@ public class CppLanguageModule extends CpdOnlyLanguageModuleBase { LanguagePropertyBundle bundle = super.newPropertyBundle(); bundle.definePropertyDescriptor(CpdLanguageProperties.CPD_IGNORE_LITERAL_SEQUENCES); bundle.definePropertyDescriptor(CpdLanguageProperties.CPD_IGNORE_LITERAL_AND_IDENTIFIER_SEQUENCES); + bundle.definePropertyDescriptor(CpdLanguageProperties.CPD_ANONYMIZE_IDENTIFIERS); + bundle.definePropertyDescriptor(CpdLanguageProperties.CPD_ANONYMIZE_LITERALS); bundle.definePropertyDescriptor(CPD_SKIP_BLOCKS); return bundle; } diff --git a/pmd-cpp/src/main/java/net/sourceforge/pmd/lang/cpp/cpd/CppCpdLexer.java b/pmd-cpp/src/main/java/net/sourceforge/pmd/lang/cpp/cpd/CppCpdLexer.java index 5d0e423a51..69287747ba 100644 --- a/pmd-cpp/src/main/java/net/sourceforge/pmd/lang/cpp/cpd/CppCpdLexer.java +++ b/pmd-cpp/src/main/java/net/sourceforge/pmd/lang/cpp/cpd/CppCpdLexer.java @@ -9,8 +9,9 @@ import java.util.regex.Pattern; import org.apache.commons.lang3.StringUtils; import net.sourceforge.pmd.cpd.CpdLanguageProperties; -import net.sourceforge.pmd.cpd.impl.CpdLexerBase; +import net.sourceforge.pmd.cpd.TokenFactory; import net.sourceforge.pmd.cpd.impl.JavaCCTokenFilter; +import net.sourceforge.pmd.cpd.impl.JavaccCpdLexer; import net.sourceforge.pmd.lang.LanguagePropertyBundle; import net.sourceforge.pmd.lang.TokenManager; import net.sourceforge.pmd.lang.ast.impl.javacc.CharStream; @@ -26,17 
+27,21 @@ import net.sourceforge.pmd.lang.document.TextDocument; * *

Note: This class has been called CPPTokenizer in PMD 6

. */ -public class CppCpdLexer extends CpdLexerBase { +public class CppCpdLexer extends JavaccCpdLexer { private boolean skipBlocks; private Pattern skipBlocksStart; private Pattern skipBlocksEnd; private final boolean ignoreIdentifierAndLiteralSeqences; private final boolean ignoreLiteralSequences; + private final boolean ignoreLiterals; + private final boolean ignoreIdentifiers; public CppCpdLexer(LanguagePropertyBundle cppProperties) { ignoreLiteralSequences = cppProperties.getProperty(CpdLanguageProperties.CPD_IGNORE_LITERAL_SEQUENCES); ignoreIdentifierAndLiteralSeqences = cppProperties.getProperty(CpdLanguageProperties.CPD_IGNORE_LITERAL_AND_IDENTIFIER_SEQUENCES); + ignoreLiterals = cppProperties.getProperty(CpdLanguageProperties.CPD_ANONYMIZE_LITERALS); + ignoreIdentifiers = cppProperties.getProperty(CpdLanguageProperties.CPD_ANONYMIZE_IDENTIFIERS); String skipBlocksPattern = cppProperties.getProperty(CppLanguageModule.CPD_SKIP_BLOCKS); if (StringUtils.isNotBlank(skipBlocksPattern)) { skipBlocks = true; @@ -73,6 +78,23 @@ public class CppCpdLexer extends CpdLexerBase { return new CppTokenFilter(tokenManager, ignoreLiteralSequences, ignoreIdentifierAndLiteralSeqences); } + @Override + protected void processToken(TokenFactory tokenEntries, JavaccToken currentToken) { + int kind = currentToken.getKind(); + String image = currentToken.getImage(); + + boolean isLiteral = kind == CppTokenKinds.STRING || kind == CppTokenKinds.RSTRING || kind == CppTokenKinds.CHARACTER || kind == CppTokenKinds.DECIMAL_INT_LITERAL || kind == CppTokenKinds.HEXADECIMAL_INT_LITERAL || kind == CppTokenKinds.OCTAL_INT_LITERAL || kind == CppTokenKinds.FLOAT_LITERAL || kind == CppTokenKinds.BINARY_INT_LITERAL || kind == CppTokenKinds.ZERO; + if (ignoreLiterals && isLiteral) { + image = CppTokenKinds.describe(kind); + } + + if (ignoreIdentifiers && (kind == CppTokenKinds.ID)) { + image = CppTokenKinds.describe(kind); + } + + tokenEntries.recordToken(image, currentToken.getReportLocation()); + 
} + private static class CppTokenFilter extends JavaCCTokenFilter { private final boolean ignoreLiteralSequences; diff --git a/pmd-cpp/src/test/java/net/sourceforge/pmd/lang/cpp/cpd/CppCpdLexerTest.java b/pmd-cpp/src/test/java/net/sourceforge/pmd/lang/cpp/cpd/CppCpdLexerTest.java index 805bb9a86d..90bfe90a32 100644 --- a/pmd-cpp/src/test/java/net/sourceforge/pmd/lang/cpp/cpd/CppCpdLexerTest.java +++ b/pmd-cpp/src/test/java/net/sourceforge/pmd/lang/cpp/cpd/CppCpdLexerTest.java @@ -59,6 +59,16 @@ class CppCpdLexerTest extends CpdTextComparisonTest { doTest("specialComments"); } + @Test + void testIgnoreLiterals() { + doTest("ignoreLiterals", "", ignoreLiterals()); + } + + @Test + void testIgnoreIdents() { + doTest("ignoreIdents", "", ignoreIdents()); + } + @Test void testMultiLineMacros() { doTest("multilineMacros"); @@ -142,7 +152,7 @@ class CppCpdLexerTest extends CpdTextComparisonTest { } private static LanguagePropertyConfig skipBlocks(String skipPattern) { - return properties(true, skipPattern, false, false); + return properties(true, skipPattern, false, false, false, false); } private static LanguagePropertyConfig skipBlocks() { @@ -150,22 +160,31 @@ class CppCpdLexerTest extends CpdTextComparisonTest { } private static LanguagePropertyConfig dontSkipBlocks() { - return properties(false, null, false, false); + return properties(false, null, false, false, false, false); } private static LanguagePropertyConfig skipLiteralSequences() { - return properties(false, null, true, false); + return properties(false, null, true, false, false, false); } private static LanguagePropertyConfig skipIdentifierAndLiteralsSequences() { - return properties(false, null, true, true); + return properties(false, null, true, true, false, false); } private static LanguagePropertyConfig skipIdentifierSequences() { - return properties(false, null, false, true); + return properties(false, null, false, true, false, false); } - private static LanguagePropertyConfig properties(boolean 
skipBlocks, String skipPattern, boolean skipLiteralSequences, boolean skipSequences) { + private static LanguagePropertyConfig ignoreIdents() { + return properties(false, null, false, false, false, true); + } + + private static LanguagePropertyConfig ignoreLiterals() { + return properties(false, null, false, false, true, false); + } + + + private static LanguagePropertyConfig properties(boolean skipBlocks, String skipPattern, boolean skipLiteralSequences, boolean skipSequences, boolean ignoreLiterals, boolean ignoreIdents) { return properties -> { if (!skipBlocks) { properties.setProperty(CppLanguageModule.CPD_SKIP_BLOCKS, ""); @@ -174,6 +193,8 @@ class CppCpdLexerTest extends CpdTextComparisonTest { } properties.setProperty(CpdLanguageProperties.CPD_IGNORE_LITERAL_SEQUENCES, skipLiteralSequences); properties.setProperty(CpdLanguageProperties.CPD_IGNORE_LITERAL_AND_IDENTIFIER_SEQUENCES, skipSequences); + properties.setProperty(CpdLanguageProperties.CPD_ANONYMIZE_LITERALS, ignoreLiterals); + properties.setProperty(CpdLanguageProperties.CPD_ANONYMIZE_IDENTIFIERS, ignoreIdents); }; } } diff --git a/pmd-cpp/src/test/resources/net/sourceforge/pmd/lang/cpp/cpd/testdata/ignoreIdents.cpp b/pmd-cpp/src/test/resources/net/sourceforge/pmd/lang/cpp/cpd/testdata/ignoreIdents.cpp new file mode 100644 index 0000000000..91473e3e67 --- /dev/null +++ b/pmd-cpp/src/test/resources/net/sourceforge/pmd/lang/cpp/cpd/testdata/ignoreIdents.cpp @@ -0,0 +1,6 @@ +class Test { + void f(int a, float b) { + auto c = a + b; + int d = 6; + } +} \ No newline at end of file diff --git a/pmd-cpp/src/test/resources/net/sourceforge/pmd/lang/cpp/cpd/testdata/ignoreIdents.txt b/pmd-cpp/src/test/resources/net/sourceforge/pmd/lang/cpp/cpd/testdata/ignoreIdents.txt new file mode 100644 index 0000000000..b564fde247 --- /dev/null +++ b/pmd-cpp/src/test/resources/net/sourceforge/pmd/lang/cpp/cpd/testdata/ignoreIdents.txt @@ -0,0 +1,35 @@ + [Image] or [Truncated image[ Bcol Ecol +L1 + [class] 1 6 + [] 7 11 + 
[{] 12 13 +L2 + [void] 2 6 + [] 7 8 + [(] 8 9 + [int] 9 12 + [] 13 14 + [,] 14 15 + [float] 16 21 + [] 22 23 + [)] 23 24 + [{] 25 26 +L3 + [auto] 3 7 + [] 8 9 + [=] 10 11 + [] 12 13 + [+] 14 15 + [] 16 17 + [;] 17 18 +L4 + [int] 3 6 + [] 7 8 + [=] 9 10 + [6] 11 12 + [;] 12 13 +L5 + [}] 2 3 +L6 + [}] 1 2 +EOF diff --git a/pmd-cpp/src/test/resources/net/sourceforge/pmd/lang/cpp/cpd/testdata/ignoreLiterals.cpp b/pmd-cpp/src/test/resources/net/sourceforge/pmd/lang/cpp/cpd/testdata/ignoreLiterals.cpp new file mode 100644 index 0000000000..cbae7336ba --- /dev/null +++ b/pmd-cpp/src/test/resources/net/sourceforge/pmd/lang/cpp/cpd/testdata/ignoreLiterals.cpp @@ -0,0 +1,43 @@ + void main() { + char x = L'a'; // wide chars + x = '\0x05'; // hex + // x = L''; // empty character is an error + + print("\ oMedia"); // whitespace escape + + + // char prefixes + char16_t c = u'\u00F6'; + wchar_t b = L'\xFFEF'; + char a = '\x30'; + char32_t d = U'\U0010FFFF'; + + // string prefixes + char A[] = "Hello\x0A"; + wchar_t B[] = L"Hell\xF6\x0A"; + char16_t C[] = u"Hell\u00F6"; + char32_t D[] = U"Hell\U000000F6\U0010FFFF"; + auto E[] = u8"\u00F6\U0010FFFF"; + + + + char* rawString = R"( + [Sinks.1] + Destination=Console + AutoFlush=true + Format="[%TimeStamp%] %ThreadId% %QueryIdHigh% %QueryIdLow% %LoggerFile%:%Line% (%Severity%) - %Message%" + Filter="%Severity% >= WRN" + )"; + + + + // digit separators + auto integer_literal = 1'000''000; + auto floating_point_literal = 0.000'015'3; + auto hex_literal = 0x0F00'abcd'6f3d; + auto silly_example = 1'0'0'000'00; + + // boolean literals + int b1 = 0B001101; // C++ 14 binary literal + int b2 = 0b000001; // C++ 14 binary literal +} \ No newline at end of file diff --git a/pmd-cpp/src/test/resources/net/sourceforge/pmd/lang/cpp/cpd/testdata/ignoreLiterals.txt b/pmd-cpp/src/test/resources/net/sourceforge/pmd/lang/cpp/cpd/testdata/ignoreLiterals.txt new file mode 100644 index 0000000000..ef31240e44 --- /dev/null +++ 
b/pmd-cpp/src/test/resources/net/sourceforge/pmd/lang/cpp/cpd/testdata/ignoreLiterals.txt @@ -0,0 +1,135 @@ + [Image] or [Truncated image[ Bcol Ecol +L1 + [void] 2 6 + [main] 7 11 + [(] 11 12 + [)] 12 13 + [{] 14 15 +L2 + [char] 5 9 + [x] 10 11 + [=] 12 13 + [] 14 18 + [;] 18 19 +L3 + [x] 5 6 + [=] 7 8 + [] 9 16 + [;] 16 17 +L6 + [print] 5 10 + [(] 10 11 + [] 11 24 + [)] 24 25 + [;] 25 26 +L10 + [char16_t] 5 13 + [c] 14 15 + [=] 16 17 + [] 18 27 + [;] 27 28 +L11 + [wchar_t] 5 12 + [b] 13 14 + [=] 15 16 + [] 17 26 + [;] 26 27 +L12 + [char] 5 9 + [a] 10 11 + [=] 12 13 + [] 15 21 + [;] 21 22 +L13 + [char32_t] 5 13 + [d] 14 15 + [=] 16 17 + [] 18 31 + [;] 31 32 +L16 + [char] 5 9 + [A] 10 11 + [\[] 11 12 + [\]] 12 13 + [=] 14 15 + [] 16 27 + [;] 27 28 +L17 + [wchar_t] 5 12 + [B] 13 14 + [\[] 14 15 + [\]] 15 16 + [=] 17 18 + [] 19 34 + [;] 34 35 +L18 + [char16_t] 5 13 + [C] 14 15 + [\[] 15 16 + [\]] 16 17 + [=] 18 19 + [] 20 33 + [;] 33 34 +L19 + [char32_t] 5 13 + [D] 14 15 + [\[] 15 16 + [\]] 16 17 + [=] 18 19 + [] 20 47 + [;] 47 48 +L20 + [auto] 5 9 + [E] 10 11 + [\[] 11 12 + [\]] 12 13 + [=] 14 15 + [] 16 36 + [;] 36 37 +L24 + [char] 5 9 + [*] 9 10 + [rawString] 11 20 + [=] 21 22 + [] 23 7 +L30 + [;] 7 8 +L35 + [auto] 5 9 + [integer_literal] 10 25 + [=] 26 27 + [] 28 38 + [;] 38 39 +L36 + [auto] 5 9 + [floating_point_literal] 10 32 + [=] 33 34 + [] 35 46 + [;] 46 47 +L37 + [auto] 5 9 + [hex_literal] 10 21 + [=] 22 23 + [] 24 40 + [;] 40 41 +L38 + [auto] 5 9 + [silly_example] 10 23 + [=] 24 25 + [] 26 38 + [;] 38 39 +L41 + [int] 5 8 + [b1] 9 11 + [=] 12 13 + [] 14 22 + [;] 22 23 +L42 + [int] 5 8 + [b2] 9 11 + [=] 12 13 + [] 14 22 + [;] 22 23 +L43 + [}] 1 2 +EOF From 7ad4e0f0fc7a9897cf043012e3a1e8392d9a8a70 Mon Sep 17 00:00:00 2001 From: Jakub Dupak Date: Fri, 24 May 2024 11:48:03 +0200 Subject: [PATCH 2/3] Add @jdupak as a contributor --- .all-contributorsrc | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/.all-contributorsrc b/.all-contributorsrc index 
8d11a3745a..6d612e8df8 100644 --- a/.all-contributorsrc +++ b/.all-contributorsrc @@ -7846,6 +7846,15 @@ "contributions": [ "code" ] + }, + { + "login": "jdupak", + "name": "Jakub Dupak", + "avatar_url": "https://avatars.githubusercontent.com/u/22683640?v=4", + "profile": "https://github.com/jdupak", + "contributions": [ + "code" + ] } ], "contributorsPerLine": 7, From a1802580336f7b9d1bce5d2156e9b7d248045c32 Mon Sep 17 00:00:00 2001 From: Andreas Dangel Date: Thu, 24 Oct 2024 10:17:51 +0200 Subject: [PATCH 3/3] [doc] Update release notes (#5040) --- docs/pages/release_notes.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/docs/pages/release_notes.md b/docs/pages/release_notes.md index 8b0a13de9b..f595d1a885 100644 --- a/docs/pages/release_notes.md +++ b/docs/pages/release_notes.md @@ -14,6 +14,13 @@ This is a {{ site.pmd.release_type }} release. ### 🚀 New and noteworthy +#### CPD can now ignore literals and identifiers in C++ code + +When searching for duplicated code in C++, differences in literals or identifiers can be +ignored now (like in Java). This can be enabled via the command line options `--ignore-literals` +and `--ignore-identifiers`. +See [PR #5040](https://github.com/pmd/pmd/pull/5040) for details. + ### 🌟 Rule Changes #### Renamed Rules @@ -49,6 +56,7 @@ The old rule names still work but are deprecated. 
### ✨ Merged pull requests * [#4965](https://github.com/pmd/pmd/pull/4965): Fix #4532: \[java] Rename JUnit rules with overly restrictive names - [Juan Martín Sotuyo Dodero](https://github.com/jsotuyod) (@jsotuyod) +* [#5040](https://github.com/pmd/pmd/pull/5040): \[cpp] Ignore literals and ignore identifiers capability to C++ CPD - [Jakub Dupak](https://github.com/jdupak) (@jdupak) * [#5225](https://github.com/pmd/pmd/pull/5225): Fix #5067: \[java] CloseResource: False positive for FileSystems.getDefault() - [Lukas Gräf](https://github.com/lukasgraef) (@lukasgraef) * [#5241](https://github.com/pmd/pmd/pull/5241): Ignore javacc code in coverage report - [Juan Martín Sotuyo Dodero](https://github.com/jsotuyod) (@jsotuyod) * [#5258](https://github.com/pmd/pmd/pull/5258): Ignore generated antlr classes in coverage reports - [Juan Martín Sotuyo Dodero](https://github.com/jsotuyod) (@jsotuyod)