From c47f5a60624409792fb1afd6328b50fc0db10afd Mon Sep 17 00:00:00 2001
From: Jakub Dupak <dev@jakubdupak.com>
Date: Fri, 24 May 2024 11:39:08 +0200
Subject: [PATCH] [cpd] Add ignore literals and identifiers capability to C++

---
 docs/pages/pmd/userdocs/cpd/cpd.md            |   4 +-
 .../pmd/lang/cpp/CppLanguageModule.java       |   2 +
 .../pmd/lang/cpp/cpd/CppCpdLexer.java         |  26 +++-
 .../pmd/lang/cpp/cpd/CppCpdLexerTest.java     |  33 ++++-
 .../lang/cpp/cpd/testdata/ignoreIdents.cpp    |   6 +
 .../lang/cpp/cpd/testdata/ignoreIdents.txt    |  35 +++++
 .../lang/cpp/cpd/testdata/ignoreLiterals.cpp  |  43 ++++++
 .../lang/cpp/cpd/testdata/ignoreLiterals.txt  | 135 ++++++++++++++++++
 8 files changed, 274 insertions(+), 10 deletions(-)
 create mode 100644 pmd-cpp/src/test/resources/net/sourceforge/pmd/lang/cpp/cpd/testdata/ignoreIdents.cpp
 create mode 100644 pmd-cpp/src/test/resources/net/sourceforge/pmd/lang/cpp/cpd/testdata/ignoreIdents.txt
 create mode 100644 pmd-cpp/src/test/resources/net/sourceforge/pmd/lang/cpp/cpd/testdata/ignoreLiterals.cpp
 create mode 100644 pmd-cpp/src/test/resources/net/sourceforge/pmd/lang/cpp/cpd/testdata/ignoreLiterals.txt

diff --git a/docs/pages/pmd/userdocs/cpd/cpd.md b/docs/pages/pmd/userdocs/cpd/cpd.md
index 1bf2954c18..0580732bb4 100644
--- a/docs/pages/pmd/userdocs/cpd/cpd.md
+++ b/docs/pages/pmd/userdocs/cpd/cpd.md
@@ -163,7 +163,7 @@ exactly identical.
     {% include custom/cli_option_row.html options="--ignore-literals"
                description="Ignore literal values such as numbers and strings when comparing text.
                             By default, literals are not ignored."
-               languages="Java"
+               languages="Java, C++"
     %}
     {% include custom/cli_option_row.html options="--ignore-literal-sequences"
                description="Ignore sequences of literals such as list initializers.
@@ -173,7 +173,7 @@ exactly identical.
     {% include custom/cli_option_row.html options="--ignore-identifiers"
                description="Ignore names of classes, methods, variables, constants, etc. when comparing text.
                             By default, identifier names are not ignored."
-               languages="Java"
+               languages="Java, C++"
     %}
     {% include custom/cli_option_row.html options="--ignore-annotations"
                description="Ignore language annotations (Java) or attributes (C#) when comparing text.
diff --git a/pmd-cpp/src/main/java/net/sourceforge/pmd/lang/cpp/CppLanguageModule.java b/pmd-cpp/src/main/java/net/sourceforge/pmd/lang/cpp/CppLanguageModule.java
index 520fc7327c..5312f8abaf 100644
--- a/pmd-cpp/src/main/java/net/sourceforge/pmd/lang/cpp/CppLanguageModule.java
+++ b/pmd-cpp/src/main/java/net/sourceforge/pmd/lang/cpp/CppLanguageModule.java
@@ -47,6 +47,8 @@ public class CppLanguageModule extends CpdOnlyLanguageModuleBase {
         LanguagePropertyBundle bundle = super.newPropertyBundle();
         bundle.definePropertyDescriptor(CpdLanguageProperties.CPD_IGNORE_LITERAL_SEQUENCES);
         bundle.definePropertyDescriptor(CpdLanguageProperties.CPD_IGNORE_LITERAL_AND_IDENTIFIER_SEQUENCES);
+        bundle.definePropertyDescriptor(CpdLanguageProperties.CPD_ANONYMIZE_IDENTIFIERS);
+        bundle.definePropertyDescriptor(CpdLanguageProperties.CPD_ANONYMIZE_LITERALS);
         bundle.definePropertyDescriptor(CPD_SKIP_BLOCKS);
         return bundle;
     }
diff --git a/pmd-cpp/src/main/java/net/sourceforge/pmd/lang/cpp/cpd/CppCpdLexer.java b/pmd-cpp/src/main/java/net/sourceforge/pmd/lang/cpp/cpd/CppCpdLexer.java
index 5d0e423a51..69287747ba 100644
--- a/pmd-cpp/src/main/java/net/sourceforge/pmd/lang/cpp/cpd/CppCpdLexer.java
+++ b/pmd-cpp/src/main/java/net/sourceforge/pmd/lang/cpp/cpd/CppCpdLexer.java
@@ -9,8 +9,9 @@ import java.util.regex.Pattern;
 import org.apache.commons.lang3.StringUtils;
 
 import net.sourceforge.pmd.cpd.CpdLanguageProperties;
-import net.sourceforge.pmd.cpd.impl.CpdLexerBase;
+import net.sourceforge.pmd.cpd.TokenFactory;
 import net.sourceforge.pmd.cpd.impl.JavaCCTokenFilter;
+import net.sourceforge.pmd.cpd.impl.JavaccCpdLexer;
 import net.sourceforge.pmd.lang.LanguagePropertyBundle;
 import net.sourceforge.pmd.lang.TokenManager;
 import net.sourceforge.pmd.lang.ast.impl.javacc.CharStream;
@@ -26,17 +27,21 @@ import net.sourceforge.pmd.lang.document.TextDocument;
  *
  * <p>Note: This class has been called CPPTokenizer in PMD 6</p>.
  */
-public class CppCpdLexer extends CpdLexerBase<JavaccToken> {
+public class CppCpdLexer extends JavaccCpdLexer {
 
     private boolean skipBlocks;
     private Pattern skipBlocksStart;
     private Pattern skipBlocksEnd;
     private final boolean ignoreIdentifierAndLiteralSeqences;
     private final boolean ignoreLiteralSequences;
+    private final boolean ignoreLiterals;
+    private final boolean ignoreIdentifiers;
 
     public CppCpdLexer(LanguagePropertyBundle cppProperties) {
         ignoreLiteralSequences = cppProperties.getProperty(CpdLanguageProperties.CPD_IGNORE_LITERAL_SEQUENCES);
         ignoreIdentifierAndLiteralSeqences = cppProperties.getProperty(CpdLanguageProperties.CPD_IGNORE_LITERAL_AND_IDENTIFIER_SEQUENCES);
+        ignoreLiterals = cppProperties.getProperty(CpdLanguageProperties.CPD_ANONYMIZE_LITERALS);
+        ignoreIdentifiers = cppProperties.getProperty(CpdLanguageProperties.CPD_ANONYMIZE_IDENTIFIERS);
         String skipBlocksPattern = cppProperties.getProperty(CppLanguageModule.CPD_SKIP_BLOCKS);
         if (StringUtils.isNotBlank(skipBlocksPattern)) {
             skipBlocks = true;
@@ -73,6 +78,23 @@ public class CppCpdLexer extends CpdLexerBase<JavaccToken> {
         return new CppTokenFilter(tokenManager, ignoreLiteralSequences, ignoreIdentifierAndLiteralSeqences);
     }
 
+    /** Records {@code currentToken}, replacing its image with the token-kind description when anonymization applies. */
+    @Override
+    protected void processToken(TokenFactory tokenEntries, JavaccToken currentToken) {
+        final int kind = currentToken.getKind();
+        final String originalImage = currentToken.getImage();
+        // Token kinds treated as literals: strings, characters and the numeric literal forms.
+        final boolean literal = kind == CppTokenKinds.STRING || kind == CppTokenKinds.RSTRING || kind == CppTokenKinds.CHARACTER
+                || kind == CppTokenKinds.DECIMAL_INT_LITERAL || kind == CppTokenKinds.HEXADECIMAL_INT_LITERAL
+                || kind == CppTokenKinds.OCTAL_INT_LITERAL || kind == CppTokenKinds.BINARY_INT_LITERAL
+                || kind == CppTokenKinds.FLOAT_LITERAL || kind == CppTokenKinds.ZERO;
+        final boolean identifier = kind == CppTokenKinds.ID;
+        // Anonymized tokens are recorded under their kind description (e.g. <STRING>, <ID>) instead of their text.
+        final boolean anonymize = (ignoreLiterals && literal) || (ignoreIdentifiers && identifier);
+        final String image = anonymize ? CppTokenKinds.describe(kind) : originalImage;
+        tokenEntries.recordToken(image, currentToken.getReportLocation());
+    }
+
     private static class CppTokenFilter extends JavaCCTokenFilter {
 
         private final boolean ignoreLiteralSequences;
diff --git a/pmd-cpp/src/test/java/net/sourceforge/pmd/lang/cpp/cpd/CppCpdLexerTest.java b/pmd-cpp/src/test/java/net/sourceforge/pmd/lang/cpp/cpd/CppCpdLexerTest.java
index 805bb9a86d..90bfe90a32 100644
--- a/pmd-cpp/src/test/java/net/sourceforge/pmd/lang/cpp/cpd/CppCpdLexerTest.java
+++ b/pmd-cpp/src/test/java/net/sourceforge/pmd/lang/cpp/cpd/CppCpdLexerTest.java
@@ -59,6 +59,16 @@ class CppCpdLexerTest extends CpdTextComparisonTest {
         doTest("specialComments");
     }
 
+    @Test
+    void testIgnoreLiterals() {
+        doTest("ignoreLiterals", "", ignoreLiterals()); // ignoreLiterals() enables CPD_ANONYMIZE_LITERALS; presumably checked against testdata/ignoreLiterals.{cpp,txt}
+    }
+
+    @Test
+    void testIgnoreIdents() {
+        doTest("ignoreIdents", "", ignoreIdents()); // ignoreIdents() enables CPD_ANONYMIZE_IDENTIFIERS; presumably checked against testdata/ignoreIdents.{cpp,txt}
+    }
+
     @Test
     void testMultiLineMacros() {
         doTest("multilineMacros");
@@ -142,7 +152,7 @@ class CppCpdLexerTest extends CpdTextComparisonTest {
     }
 
     private static LanguagePropertyConfig skipBlocks(String skipPattern) {
-        return properties(true, skipPattern, false, false);
+        return properties(true, skipPattern, false, false, false, false);
     }
 
     private static LanguagePropertyConfig skipBlocks() {
@@ -150,22 +160,31 @@ class CppCpdLexerTest extends CpdTextComparisonTest {
     }
 
     private static LanguagePropertyConfig dontSkipBlocks() {
-        return properties(false, null, false, false);
+        return properties(false, null, false, false, false, false);
     }
 
     private static LanguagePropertyConfig skipLiteralSequences() {
-        return properties(false, null, true, false);
+        return properties(false, null, true, false, false, false);
     }
 
     private static LanguagePropertyConfig skipIdentifierAndLiteralsSequences() {
-        return properties(false, null, true, true);
+        return properties(false, null, true, true, false, false);
     }
 
     private static LanguagePropertyConfig skipIdentifierSequences() {
-        return properties(false, null, false, true);
+        return properties(false, null, false, true, false, false);
     }
 
-    private static LanguagePropertyConfig properties(boolean skipBlocks, String skipPattern, boolean skipLiteralSequences, boolean skipSequences) {
+    private static LanguagePropertyConfig ignoreIdents() {
+        return properties(false, null, false, false, false, true); // only the last flag (ignoreIdents) is enabled
+    }
+
+    private static LanguagePropertyConfig ignoreLiterals() {
+        return properties(false, null, false, false, true, false); // only the fifth flag (ignoreLiterals) is enabled
+    }
+
+
+    private static LanguagePropertyConfig properties(boolean skipBlocks, String skipPattern, boolean skipLiteralSequences, boolean skipSequences, boolean ignoreLiterals, boolean ignoreIdents) {
         return properties -> {
             if (!skipBlocks) {
                 properties.setProperty(CppLanguageModule.CPD_SKIP_BLOCKS, "");
@@ -174,6 +193,8 @@ class CppCpdLexerTest extends CpdTextComparisonTest {
             }
             properties.setProperty(CpdLanguageProperties.CPD_IGNORE_LITERAL_SEQUENCES, skipLiteralSequences);
             properties.setProperty(CpdLanguageProperties.CPD_IGNORE_LITERAL_AND_IDENTIFIER_SEQUENCES, skipSequences);
+            properties.setProperty(CpdLanguageProperties.CPD_ANONYMIZE_LITERALS, ignoreLiterals);
+            properties.setProperty(CpdLanguageProperties.CPD_ANONYMIZE_IDENTIFIERS, ignoreIdents);
         };
     }
 }
diff --git a/pmd-cpp/src/test/resources/net/sourceforge/pmd/lang/cpp/cpd/testdata/ignoreIdents.cpp b/pmd-cpp/src/test/resources/net/sourceforge/pmd/lang/cpp/cpd/testdata/ignoreIdents.cpp
new file mode 100644
index 0000000000..91473e3e67
--- /dev/null
+++ b/pmd-cpp/src/test/resources/net/sourceforge/pmd/lang/cpp/cpd/testdata/ignoreIdents.cpp
@@ -0,0 +1,6 @@
+class Test {
+	void f(int a, float b) {
+		auto c = a + b;
+		int d = 6;
+	}
+}
\ No newline at end of file
diff --git a/pmd-cpp/src/test/resources/net/sourceforge/pmd/lang/cpp/cpd/testdata/ignoreIdents.txt b/pmd-cpp/src/test/resources/net/sourceforge/pmd/lang/cpp/cpd/testdata/ignoreIdents.txt
new file mode 100644
index 0000000000..b564fde247
--- /dev/null
+++ b/pmd-cpp/src/test/resources/net/sourceforge/pmd/lang/cpp/cpd/testdata/ignoreIdents.txt
@@ -0,0 +1,35 @@
+    [Image] or [Truncated image[            Bcol      Ecol
+L1
+    [class]                                 1         6
+    [<ID>]                                  7         11
+    [{]                                     12        13
+L2
+    [void]                                  2         6
+    [<ID>]                                  7         8
+    [(]                                     8         9
+    [int]                                   9         12
+    [<ID>]                                  13        14
+    [,]                                     14        15
+    [float]                                 16        21
+    [<ID>]                                  22        23
+    [)]                                     23        24
+    [{]                                     25        26
+L3
+    [auto]                                  3         7
+    [<ID>]                                  8         9
+    [=]                                     10        11
+    [<ID>]                                  12        13
+    [+]                                     14        15
+    [<ID>]                                  16        17
+    [;]                                     17        18
+L4
+    [int]                                   3         6
+    [<ID>]                                  7         8
+    [=]                                     9         10
+    [6]                                     11        12
+    [;]                                     12        13
+L5
+    [}]                                     2         3
+L6
+    [}]                                     1         2
+EOF
diff --git a/pmd-cpp/src/test/resources/net/sourceforge/pmd/lang/cpp/cpd/testdata/ignoreLiterals.cpp b/pmd-cpp/src/test/resources/net/sourceforge/pmd/lang/cpp/cpd/testdata/ignoreLiterals.cpp
new file mode 100644
index 0000000000..cbae7336ba
--- /dev/null
+++ b/pmd-cpp/src/test/resources/net/sourceforge/pmd/lang/cpp/cpd/testdata/ignoreLiterals.cpp
@@ -0,0 +1,43 @@
+ void main() {
+    char x = L'a'; // wide chars
+    x = '\0x05';   // hex
+    // x = L'';    // empty character is an error
+
+    print("\    oMedia"); // whitespace escape
+
+
+    // char prefixes
+    char16_t c = u'\u00F6';
+    wchar_t b = L'\xFFEF';
+    char a =  '\x30';
+    char32_t d = U'\U0010FFFF';
+
+    // string prefixes
+    char A[] = "Hello\x0A";
+    wchar_t B[] = L"Hell\xF6\x0A";
+    char16_t C[] = u"Hell\u00F6";
+    char32_t D[] = U"Hell\U000000F6\U0010FFFF";
+    auto E[] = u8"\u00F6\U0010FFFF";
+
+
+
+    char* rawString = R"(
+        [Sinks.1]
+        Destination=Console
+        AutoFlush=true
+        Format="[%TimeStamp%] %ThreadId% %QueryIdHigh% %QueryIdLow% %LoggerFile%:%Line% (%Severity%) - %Message%"
+        Filter="%Severity% >= WRN"
+    )";
+
+
+
+    // digit separators
+    auto integer_literal = 1'000''000;
+    auto floating_point_literal = 0.000'015'3;
+    auto hex_literal = 0x0F00'abcd'6f3d;
+    auto silly_example = 1'0'0'000'00;
+
+    // binary literals
+    int b1 = 0B001101; // C++ 14 binary literal
+    int b2 = 0b000001; // C++ 14 binary literal
+}
\ No newline at end of file
diff --git a/pmd-cpp/src/test/resources/net/sourceforge/pmd/lang/cpp/cpd/testdata/ignoreLiterals.txt b/pmd-cpp/src/test/resources/net/sourceforge/pmd/lang/cpp/cpd/testdata/ignoreLiterals.txt
new file mode 100644
index 0000000000..ef31240e44
--- /dev/null
+++ b/pmd-cpp/src/test/resources/net/sourceforge/pmd/lang/cpp/cpd/testdata/ignoreLiterals.txt
@@ -0,0 +1,135 @@
+    [Image] or [Truncated image[            Bcol      Ecol
+L1
+    [void]                                  2         6
+    [main]                                  7         11
+    [(]                                     11        12
+    [)]                                     12        13
+    [{]                                     14        15
+L2
+    [char]                                  5         9
+    [x]                                     10        11
+    [=]                                     12        13
+    [<CHARACTER>]                           14        18
+    [;]                                     18        19
+L3
+    [x]                                     5         6
+    [=]                                     7         8
+    [<CHARACTER>]                           9         16
+    [;]                                     16        17
+L6
+    [print]                                 5         10
+    [(]                                     10        11
+    [<STRING>]                              11        24
+    [)]                                     24        25
+    [;]                                     25        26
+L10
+    [char16_t]                              5         13
+    [c]                                     14        15
+    [=]                                     16        17
+    [<CHARACTER>]                           18        27
+    [;]                                     27        28
+L11
+    [wchar_t]                               5         12
+    [b]                                     13        14
+    [=]                                     15        16
+    [<CHARACTER>]                           17        26
+    [;]                                     26        27
+L12
+    [char]                                  5         9
+    [a]                                     10        11
+    [=]                                     12        13
+    [<CHARACTER>]                           15        21
+    [;]                                     21        22
+L13
+    [char32_t]                              5         13
+    [d]                                     14        15
+    [=]                                     16        17
+    [<CHARACTER>]                           18        31
+    [;]                                     31        32
+L16
+    [char]                                  5         9
+    [A]                                     10        11
+    [\[]                                    11        12
+    [\]]                                    12        13
+    [=]                                     14        15
+    [<STRING>]                              16        27
+    [;]                                     27        28
+L17
+    [wchar_t]                               5         12
+    [B]                                     13        14
+    [\[]                                    14        15
+    [\]]                                    15        16
+    [=]                                     17        18
+    [<STRING>]                              19        34
+    [;]                                     34        35
+L18
+    [char16_t]                              5         13
+    [C]                                     14        15
+    [\[]                                    15        16
+    [\]]                                    16        17
+    [=]                                     18        19
+    [<STRING>]                              20        33
+    [;]                                     33        34
+L19
+    [char32_t]                              5         13
+    [D]                                     14        15
+    [\[]                                    15        16
+    [\]]                                    16        17
+    [=]                                     18        19
+    [<STRING>]                              20        47
+    [;]                                     47        48
+L20
+    [auto]                                  5         9
+    [E]                                     10        11
+    [\[]                                    11        12
+    [\]]                                    12        13
+    [=]                                     14        15
+    [<STRING>]                              16        36
+    [;]                                     36        37
+L24
+    [char]                                  5         9
+    [*]                                     9         10
+    [rawString]                             11        20
+    [=]                                     21        22
+    [<RSTRING>]                             23        7
+L30
+    [;]                                     7         8
+L35
+    [auto]                                  5         9
+    [integer_literal]                       10        25
+    [=]                                     26        27
+    [<DECIMAL_INT_LITERAL>]                 28        38
+    [;]                                     38        39
+L36
+    [auto]                                  5         9
+    [floating_point_literal]                10        32
+    [=]                                     33        34
+    [<FLOAT_LITERAL>]                       35        46
+    [;]                                     46        47
+L37
+    [auto]                                  5         9
+    [hex_literal]                           10        21
+    [=]                                     22        23
+    [<HEXADECIMAL_INT_LITERAL>]             24        40
+    [;]                                     40        41
+L38
+    [auto]                                  5         9
+    [silly_example]                         10        23
+    [=]                                     24        25
+    [<DECIMAL_INT_LITERAL>]                 26        38
+    [;]                                     38        39
+L41
+    [int]                                   5         8
+    [b1]                                    9         11
+    [=]                                     12        13
+    [<BINARY_INT_LITERAL>]                  14        22
+    [;]                                     22        23
+L42
+    [int]                                   5         8
+    [b2]                                    9         11
+    [=]                                     12        13
+    [<BINARY_INT_LITERAL>]                  14        22
+    [;]                                     22        23
+L43
+    [}]                                     1         2
+EOF