[cpp] Ignore literals and ignore identifiers capability to C++ CPD (#5040)

Merge pull request #5040 from jdupak:master
This commit is contained in:
Andreas Dangel 2024-10-24 10:18:39 +02:00
commit 7358289bbe
No known key found for this signature in database
GPG Key ID: 93450DF2DF9A3FA3
10 changed files with 291 additions and 10 deletions

View File

@ -7846,6 +7846,15 @@
"contributions": [
"code"
]
},
{
"login": "jdupak",
"name": "Jakub Dupak",
"avatar_url": "https://avatars.githubusercontent.com/u/22683640?v=4",
"profile": "https://github.com/jdupak",
"contributions": [
"code"
]
}
],
"contributorsPerLine": 7,

View File

@ -163,7 +163,7 @@ exactly identical.
{% include custom/cli_option_row.html options="--ignore-literals"
description="Ignore literal values such as numbers and strings when comparing text.
By default, literals are not ignored."
languages="Java"
languages="Java, C++"
%}
{% include custom/cli_option_row.html options="--ignore-literal-sequences"
description="Ignore sequences of literals such as list initializers.
@ -173,7 +173,7 @@ exactly identical.
{% include custom/cli_option_row.html options="--ignore-identifiers"
description="Ignore names of classes, methods, variables, constants, etc. when comparing text.
By default, identifier names are not ignored."
languages="Java"
languages="Java, C++"
%}
{% include custom/cli_option_row.html options="--ignore-annotations"
description="Ignore language annotations (Java) or attributes (C#) when comparing text.

View File

@ -14,6 +14,13 @@ This is a {{ site.pmd.release_type }} release.
### 🚀 New and noteworthy
#### CPD can now ignore literals and identifiers in C++ code
When searching for duplicated code in C++, differences in literals or identifiers can now be
ignored (like in Java). This can be enabled via the command line options `--ignore-literals`
and `--ignore-identifiers`.
See [PR #5040](https://github.com/pmd/pmd/pull/5040) for details.
### 🌟 Rule Changes
#### Renamed Rules
@ -49,6 +56,7 @@ The old rule names still work but are deprecated.
### ✨ Merged pull requests
* [#4965](https://github.com/pmd/pmd/pull/4965): Fix #4532: \[java] Rename JUnit rules with overly restrictive names - [Juan Martín Sotuyo Dodero](https://github.com/jsotuyod) (@jsotuyod)
* [#5040](https://github.com/pmd/pmd/pull/5040): \[cpp] Ignore literals and ignore identifiers capability to C++ CPD - [Jakub Dupak](https://github.com/jdupak) (@jdupak)
* [#5225](https://github.com/pmd/pmd/pull/5225): Fix #5067: \[java] CloseResource: False positive for FileSystems.getDefault() - [Lukas Gräf](https://github.com/lukasgraef) (@lukasgraef)
* [#5241](https://github.com/pmd/pmd/pull/5241): Ignore javacc code in coverage report - [Juan Martín Sotuyo Dodero](https://github.com/jsotuyod) (@jsotuyod)
* [#5258](https://github.com/pmd/pmd/pull/5258): Ignore generated antlr classes in coverage reports - [Juan Martín Sotuyo Dodero](https://github.com/jsotuyod) (@jsotuyod)

View File

@ -47,6 +47,8 @@ public class CppLanguageModule extends CpdOnlyLanguageModuleBase {
LanguagePropertyBundle bundle = super.newPropertyBundle();
bundle.definePropertyDescriptor(CpdLanguageProperties.CPD_IGNORE_LITERAL_SEQUENCES);
bundle.definePropertyDescriptor(CpdLanguageProperties.CPD_IGNORE_LITERAL_AND_IDENTIFIER_SEQUENCES);
bundle.definePropertyDescriptor(CpdLanguageProperties.CPD_ANONYMIZE_IDENTIFIERS);
bundle.definePropertyDescriptor(CpdLanguageProperties.CPD_ANONYMIZE_LITERALS);
bundle.definePropertyDescriptor(CPD_SKIP_BLOCKS);
return bundle;
}

View File

@ -9,8 +9,9 @@ import java.util.regex.Pattern;
import org.apache.commons.lang3.StringUtils;
import net.sourceforge.pmd.cpd.CpdLanguageProperties;
import net.sourceforge.pmd.cpd.impl.CpdLexerBase;
import net.sourceforge.pmd.cpd.TokenFactory;
import net.sourceforge.pmd.cpd.impl.JavaCCTokenFilter;
import net.sourceforge.pmd.cpd.impl.JavaccCpdLexer;
import net.sourceforge.pmd.lang.LanguagePropertyBundle;
import net.sourceforge.pmd.lang.TokenManager;
import net.sourceforge.pmd.lang.ast.impl.javacc.CharStream;
@ -26,17 +27,21 @@ import net.sourceforge.pmd.lang.document.TextDocument;
*
* <p>Note: This class was called CPPTokenizer in PMD 6.</p>
*/
public class CppCpdLexer extends CpdLexerBase<JavaccToken> {
public class CppCpdLexer extends JavaccCpdLexer {
private boolean skipBlocks;
private Pattern skipBlocksStart;
private Pattern skipBlocksEnd;
private final boolean ignoreIdentifierAndLiteralSeqences;
private final boolean ignoreLiteralSequences;
private final boolean ignoreLiterals;
private final boolean ignoreIdentifiers;
public CppCpdLexer(LanguagePropertyBundle cppProperties) {
ignoreLiteralSequences = cppProperties.getProperty(CpdLanguageProperties.CPD_IGNORE_LITERAL_SEQUENCES);
ignoreIdentifierAndLiteralSeqences = cppProperties.getProperty(CpdLanguageProperties.CPD_IGNORE_LITERAL_AND_IDENTIFIER_SEQUENCES);
ignoreLiterals = cppProperties.getProperty(CpdLanguageProperties.CPD_ANONYMIZE_LITERALS);
ignoreIdentifiers = cppProperties.getProperty(CpdLanguageProperties.CPD_ANONYMIZE_IDENTIFIERS);
String skipBlocksPattern = cppProperties.getProperty(CppLanguageModule.CPD_SKIP_BLOCKS);
if (StringUtils.isNotBlank(skipBlocksPattern)) {
skipBlocks = true;
@ -73,6 +78,23 @@ public class CppCpdLexer extends CpdLexerBase<JavaccToken> {
return new CppTokenFilter(tokenManager, ignoreLiteralSequences, ignoreIdentifierAndLiteralSeqences);
}
/**
 * Records one C++ token, optionally replacing its image with a generic
 * description of its kind so that tokens differing only in their literal
 * value or identifier name compare as equal.
 *
 * <p>The image is replaced by {@code CppTokenKinds.describe(kind)} when
 * literal anonymization is enabled and the token is a string, raw string,
 * character or numeric literal, or when identifier anonymization is enabled
 * and the token is an {@code ID}. Otherwise the token's own image is recorded.
 *
 * @param tokenEntries factory that receives the recorded token
 * @param currentToken token to record; its report location is passed through unchanged
 */
@Override
protected void processToken(TokenFactory tokenEntries, JavaccToken currentToken) {
    final int tokenKind = currentToken.getKind();
    final String originalImage = currentToken.getImage();

    // Textual literals: ordinary strings, raw strings and character constants.
    final boolean textLiteral = tokenKind == CppTokenKinds.STRING
            || tokenKind == CppTokenKinds.RSTRING
            || tokenKind == CppTokenKinds.CHARACTER;

    // Numeric literals in every notation, including the dedicated ZERO kind.
    final boolean numericLiteral = tokenKind == CppTokenKinds.DECIMAL_INT_LITERAL
            || tokenKind == CppTokenKinds.HEXADECIMAL_INT_LITERAL
            || tokenKind == CppTokenKinds.OCTAL_INT_LITERAL
            || tokenKind == CppTokenKinds.FLOAT_LITERAL
            || tokenKind == CppTokenKinds.BINARY_INT_LITERAL
            || tokenKind == CppTokenKinds.ZERO;

    final boolean anonymizeLiteral = ignoreLiterals && (textLiteral || numericLiteral);
    final boolean anonymizeIdentifier = ignoreIdentifiers && tokenKind == CppTokenKinds.ID;

    final String recordedImage = anonymizeLiteral || anonymizeIdentifier
            ? CppTokenKinds.describe(tokenKind)
            : originalImage;

    tokenEntries.recordToken(recordedImage, currentToken.getReportLocation());
}
private static class CppTokenFilter extends JavaCCTokenFilter {
private final boolean ignoreLiteralSequences;

View File

@ -59,6 +59,16 @@ class CppCpdLexerTest extends CpdTextComparisonTest {
doTest("specialComments");
}
/**
 * Runs the text-comparison test for the "ignoreLiterals" fixture using the
 * configuration from {@code ignoreLiterals()}, i.e. with literal
 * anonymization switched on.
 */
@Test
void testIgnoreLiterals() {
doTest("ignoreLiterals", "", ignoreLiterals());
}
/**
 * Runs the text-comparison test for the "ignoreIdents" fixture using the
 * configuration from {@code ignoreIdents()}, i.e. with identifier
 * anonymization switched on.
 */
@Test
void testIgnoreIdents() {
doTest("ignoreIdents", "", ignoreIdents());
}
@Test
void testMultiLineMacros() {
doTest("multilineMacros");
@ -142,7 +152,7 @@ class CppCpdLexerTest extends CpdTextComparisonTest {
}
private static LanguagePropertyConfig skipBlocks(String skipPattern) {
return properties(true, skipPattern, false, false);
return properties(true, skipPattern, false, false, false, false);
}
private static LanguagePropertyConfig skipBlocks() {
@ -150,22 +160,31 @@ class CppCpdLexerTest extends CpdTextComparisonTest {
}
private static LanguagePropertyConfig dontSkipBlocks() {
return properties(false, null, false, false);
return properties(false, null, false, false, false, false);
}
private static LanguagePropertyConfig skipLiteralSequences() {
return properties(false, null, true, false);
return properties(false, null, true, false, false, false);
}
private static LanguagePropertyConfig skipIdentifierAndLiteralsSequences() {
return properties(false, null, true, true);
return properties(false, null, true, true, false, false);
}
private static LanguagePropertyConfig skipIdentifierSequences() {
return properties(false, null, false, true);
return properties(false, null, false, true, false, false);
}
private static LanguagePropertyConfig properties(boolean skipBlocks, String skipPattern, boolean skipLiteralSequences, boolean skipSequences) {
/**
 * Builds a property configuration in which only identifier anonymization is
 * enabled: block skipping, both sequence-ignoring options and literal
 * anonymization are all turned off.
 */
private static LanguagePropertyConfig ignoreIdents() {
return properties(false, null, false, false, false, true);
}
/**
 * Builds a property configuration in which only literal anonymization is
 * enabled: block skipping, both sequence-ignoring options and identifier
 * anonymization are all turned off.
 */
private static LanguagePropertyConfig ignoreLiterals() {
return properties(false, null, false, false, true, false);
}
private static LanguagePropertyConfig properties(boolean skipBlocks, String skipPattern, boolean skipLiteralSequences, boolean skipSequences, boolean ignoreLiterals, boolean ignoreIdents) {
return properties -> {
if (!skipBlocks) {
properties.setProperty(CppLanguageModule.CPD_SKIP_BLOCKS, "");
@ -174,6 +193,8 @@ class CppCpdLexerTest extends CpdTextComparisonTest {
}
properties.setProperty(CpdLanguageProperties.CPD_IGNORE_LITERAL_SEQUENCES, skipLiteralSequences);
properties.setProperty(CpdLanguageProperties.CPD_IGNORE_LITERAL_AND_IDENTIFIER_SEQUENCES, skipSequences);
properties.setProperty(CpdLanguageProperties.CPD_ANONYMIZE_LITERALS, ignoreLiterals);
properties.setProperty(CpdLanguageProperties.CPD_ANONYMIZE_IDENTIFIERS, ignoreIdents);
};
}
}

View File

@ -0,0 +1,6 @@
class Test {
void f(int a, float b) {
auto c = a + b;
int d = 6;
}
}

View File

@ -0,0 +1,35 @@
[Image] or [Truncated image[ Bcol Ecol
L1
[class] 1 6
[<ID>] 7 11
[{] 12 13
L2
[void] 2 6
[<ID>] 7 8
[(] 8 9
[int] 9 12
[<ID>] 13 14
[,] 14 15
[float] 16 21
[<ID>] 22 23
[)] 23 24
[{] 25 26
L3
[auto] 3 7
[<ID>] 8 9
[=] 10 11
[<ID>] 12 13
[+] 14 15
[<ID>] 16 17
[;] 17 18
L4
[int] 3 6
[<ID>] 7 8
[=] 9 10
[6] 11 12
[;] 12 13
L5
[}] 2 3
L6
[}] 1 2
EOF

View File

@ -0,0 +1,43 @@
void main() {
char x = L'a'; // wide chars
x = '\0x05'; // hex
// x = L''; // empty character is an error
print("\ oMedia"); // whitespace escape
// char prefixes
char16_t c = u'\u00F6';
wchar_t b = L'\xFFEF';
char a = '\x30';
char32_t d = U'\U0010FFFF';
// string prefixes
char A[] = "Hello\x0A";
wchar_t B[] = L"Hell\xF6\x0A";
char16_t C[] = u"Hell\u00F6";
char32_t D[] = U"Hell\U000000F6\U0010FFFF";
auto E[] = u8"\u00F6\U0010FFFF";
char* rawString = R"(
[Sinks.1]
Destination=Console
AutoFlush=true
Format="[%TimeStamp%] %ThreadId% %QueryIdHigh% %QueryIdLow% %LoggerFile%:%Line% (%Severity%) - %Message%"
Filter="%Severity% >= WRN"
)";
// digit separators
auto integer_literal = 1'000''000;
auto floating_point_literal = 0.000'015'3;
auto hex_literal = 0x0F00'abcd'6f3d;
auto silly_example = 1'0'0'000'00;
// boolean literals
int b1 = 0B001101; // C++ 14 binary literal
int b2 = 0b000001; // C++ 14 binary literal
}

View File

@ -0,0 +1,135 @@
[Image] or [Truncated image[ Bcol Ecol
L1
[void] 2 6
[main] 7 11
[(] 11 12
[)] 12 13
[{] 14 15
L2
[char] 5 9
[x] 10 11
[=] 12 13
[<CHARACTER>] 14 18
[;] 18 19
L3
[x] 5 6
[=] 7 8
[<CHARACTER>] 9 16
[;] 16 17
L6
[print] 5 10
[(] 10 11
[<STRING>] 11 24
[)] 24 25
[;] 25 26
L10
[char16_t] 5 13
[c] 14 15
[=] 16 17
[<CHARACTER>] 18 27
[;] 27 28
L11
[wchar_t] 5 12
[b] 13 14
[=] 15 16
[<CHARACTER>] 17 26
[;] 26 27
L12
[char] 5 9
[a] 10 11
[=] 12 13
[<CHARACTER>] 15 21
[;] 21 22
L13
[char32_t] 5 13
[d] 14 15
[=] 16 17
[<CHARACTER>] 18 31
[;] 31 32
L16
[char] 5 9
[A] 10 11
[\[] 11 12
[\]] 12 13
[=] 14 15
[<STRING>] 16 27
[;] 27 28
L17
[wchar_t] 5 12
[B] 13 14
[\[] 14 15
[\]] 15 16
[=] 17 18
[<STRING>] 19 34
[;] 34 35
L18
[char16_t] 5 13
[C] 14 15
[\[] 15 16
[\]] 16 17
[=] 18 19
[<STRING>] 20 33
[;] 33 34
L19
[char32_t] 5 13
[D] 14 15
[\[] 15 16
[\]] 16 17
[=] 18 19
[<STRING>] 20 47
[;] 47 48
L20
[auto] 5 9
[E] 10 11
[\[] 11 12
[\]] 12 13
[=] 14 15
[<STRING>] 16 36
[;] 36 37
L24
[char] 5 9
[*] 9 10
[rawString] 11 20
[=] 21 22
[<RSTRING>] 23 7
L30
[;] 7 8
L35
[auto] 5 9
[integer_literal] 10 25
[=] 26 27
[<DECIMAL_INT_LITERAL>] 28 38
[;] 38 39
L36
[auto] 5 9
[floating_point_literal] 10 32
[=] 33 34
[<FLOAT_LITERAL>] 35 46
[;] 46 47
L37
[auto] 5 9
[hex_literal] 10 21
[=] 22 23
[<HEXADECIMAL_INT_LITERAL>] 24 40
[;] 40 41
L38
[auto] 5 9
[silly_example] 10 23
[=] 24 25
[<DECIMAL_INT_LITERAL>] 26 38
[;] 38 39
L41
[int] 5 8
[b1] 9 11
[=] 12 13
[<BINARY_INT_LITERAL>] 14 22
[;] 22 23
L42
[int] 5 8
[b2] 9 11
[=] 12 13
[<BINARY_INT_LITERAL>] 14 22
[;] 22 23
L43
[}] 1 2
EOF