Added strings as literal and ignore identifiers in sequences

This commit is contained in:
wener
2023-04-13 17:29:12 +02:00
parent 0313ffdedd
commit 85a0d7f59f
7 changed files with 174 additions and 25 deletions

View File

@ -69,6 +69,9 @@ public class CpdCommand extends AbstractAnalysisPmdSubcommand {
@Option(names = "--ignore-literal-sequences", description = "Ignore sequences of literals such as list initializers.")
private boolean ignoreLiteralSequences;
@Option(names = "--ignore-sequences", description = "Ignore sequences of identifiers and literals")
private boolean ignoreIdentifierAndLiteralSequences;
@Option(names = "--skip-lexical-errors",
description = "Skip files which can't be tokenized due to invalid characters, instead of aborting with an error.")
private boolean skipLexicalErrors;

View File

@ -65,6 +65,8 @@ public class CPDConfiguration extends AbstractConfiguration {
private boolean ignoreLiteralSequences = false;
private boolean ignoreIdentifierAndLiteralSequences = false;
private boolean skipLexicalErrors = false;
private boolean noSkipBlocks = false;
@ -190,6 +192,11 @@ public class CPDConfiguration extends AbstractConfiguration {
} else {
properties.remove(Tokenizer.OPTION_IGNORE_LITERAL_SEQUENCES);
}
if (configuration.isIgnoreIdentifierAndLiteralSequences()) {
properties.setProperty(Tokenizer.OPTION_IGNORE_IDENTIFIER_AND_LITERAL_SEQUENCES, "true");
} else {
properties.remove(Tokenizer.OPTION_IGNORE_IDENTIFIER_AND_LITERAL_SEQUENCES);
}
properties.setProperty(Tokenizer.OPTION_SKIP_BLOCKS, Boolean.toString(!configuration.isNoSkipBlocks()));
properties.setProperty(Tokenizer.OPTION_SKIP_BLOCKS_PATTERN, configuration.getSkipBlocksPattern());
configuration.getLanguage().setProperties(properties);
@ -316,6 +323,14 @@ public class CPDConfiguration extends AbstractConfiguration {
this.ignoreLiteralSequences = ignoreLiteralSequences;
}
/**
 * Returns whether sequences of identifiers and literals should be ignored.
 *
 * @return the current value of the {@code ignoreIdentifierAndLiteralSequences} flag
 */
public boolean isIgnoreIdentifierAndLiteralSequences() {
return ignoreIdentifierAndLiteralSequences;
}
/**
 * Sets whether sequences of identifiers and literals should be ignored.
 *
 * @param ignoreIdentifierAndLiteralSequences the new value of the flag
 */
public void setIgnoreIdentifierAndLiteralSequences(boolean ignoreIdentifierAndLiteralSequences) {
this.ignoreIdentifierAndLiteralSequences = ignoreIdentifierAndLiteralSequences;
}
/**
 * Returns the current value of the {@code skipLexicalErrors} flag.
 *
 * @return {@code true} if the flag is set, {@code false} otherwise
 */
public boolean isSkipLexicalErrors() {
return skipLexicalErrors;
}

View File

@ -15,6 +15,12 @@ public interface Tokenizer {
* Ignore sequences of literals (e.g, <code>0,0,0,0...</code>).
*/
String OPTION_IGNORE_LITERAL_SEQUENCES = "net.sourceforge.pmd.cpd.Tokenizer.skipLiteralSequences";
/**
* Ignore comma-separated sequences of identifiers and literals (e.g., <code>a,b,0,0...</code>).
*/
String OPTION_IGNORE_IDENTIFIER_AND_LITERAL_SEQUENCES = "net.sourceforge.pmd.cpd.Tokenizer.skipSequences";
/**
* Ignore using directives in C#. The default value is <code>false</code>.
*/

View File

@ -26,6 +26,7 @@ public class CPPTokenizer extends JavaCCTokenizer {
private boolean skipBlocks;
private Pattern skipBlocksStart;
private Pattern skipBlocksEnd;
private boolean ignoreIdentifierAndLiteralSeqences = false;
private boolean ignoreLiteralSequences = false;
public CPPTokenizer() {
@ -52,8 +53,9 @@ public class CPPTokenizer extends JavaCCTokenizer {
skipBlocksEnd = CppBlockSkipper.compileSkipMarker(split[1]);
}
}
ignoreLiteralSequences = Boolean.parseBoolean(properties.getProperty(OPTION_IGNORE_LITERAL_SEQUENCES,
Boolean.FALSE.toString()));
ignoreLiteralSequences = Boolean.parseBoolean(properties.getProperty(OPTION_IGNORE_LITERAL_SEQUENCES, Boolean.FALSE.toString()));
ignoreIdentifierAndLiteralSeqences =
Boolean.parseBoolean(properties.getProperty(OPTION_IGNORE_IDENTIFIER_AND_LITERAL_SEQUENCES, Boolean.FALSE.toString()));
}
@ -78,42 +80,44 @@ public class CPPTokenizer extends JavaCCTokenizer {
@Override
protected TokenFilter<JavaccToken> getTokenFilter(final TokenManager<JavaccToken> tokenManager) {
return new CppTokenFilter(tokenManager, ignoreLiteralSequences);
return new CppTokenFilter(tokenManager, ignoreLiteralSequences, ignoreIdentifierAndLiteralSeqences);
}
private static class CppTokenFilter extends JavaCCTokenFilter {
private final boolean ignoreLiteralSequences;
private JavaccToken discardingLiteralsUntil = null;
private final boolean ignoreIdentifierAndLiteralSeqences;
private JavaccToken discardingTokensUntil = null;
private boolean discardCurrent = false;
CppTokenFilter(final TokenManager<JavaccToken> tokenManager, final boolean ignoreLiteralSequences) {
CppTokenFilter(final TokenManager<JavaccToken> tokenManager, final boolean ignoreLiteralSequences, final boolean ignoreIdentifierAndLiteralSeqences) {
super(tokenManager);
this.ignoreIdentifierAndLiteralSeqences = ignoreIdentifierAndLiteralSeqences;
this.ignoreLiteralSequences = ignoreLiteralSequences;
}
@Override
protected void analyzeTokens(final JavaccToken currentToken, final Iterable<JavaccToken> remainingTokens) {
discardCurrent = false;
skipLiteralSequences(currentToken, remainingTokens);
skipSequences(currentToken, remainingTokens);
}
private void skipLiteralSequences(final JavaccToken currentToken, final Iterable<JavaccToken> remainingTokens) {
if (ignoreLiteralSequences) {
private void skipSequences(final JavaccToken currentToken, final Iterable<JavaccToken> remainingTokens) {
if (ignoreLiteralSequences || ignoreIdentifierAndLiteralSeqences) {
final int kind = currentToken.getKind();
if (isDiscardingLiterals()) {
if (currentToken == discardingLiteralsUntil) { // NOPMD - intentional check for reference equality
discardingLiteralsUntil = null;
if (isDiscardingToken()) {
if (currentToken == discardingTokensUntil) { // NOPMD - intentional check for reference equality
discardingTokensUntil = null;
discardCurrent = true;
}
} else if (kind == CppTokenKinds.LCURLYBRACE) {
final JavaccToken finalToken = findEndOfSequenceOfLiterals(remainingTokens);
discardingLiteralsUntil = finalToken;
final JavaccToken finalToken = findEndOfSequenceToDiscard(remainingTokens, ignoreIdentifierAndLiteralSeqences);
discardingTokensUntil = finalToken;
}
}
}
private static JavaccToken findEndOfSequenceOfLiterals(final Iterable<JavaccToken> remainingTokens) {
boolean seenLiteral = false;
private static JavaccToken findEndOfSequenceToDiscard(final Iterable<JavaccToken> remainingTokens, boolean ignoreIdentifierAndLiteralSeqences) {
boolean seenAllowedToken = false;
int braceCount = 0;
for (final JavaccToken token : remainingTokens) {
switch (token.getKind()) {
@ -123,8 +127,18 @@ public class CPPTokenizer extends JavaCCTokenizer {
case CppTokenKinds.HEXADECIMAL_INT_LITERAL:
case CppTokenKinds.OCTAL_INT_LITERAL:
case CppTokenKinds.ZERO:
seenLiteral = true;
case CppTokenKinds.STRING:
seenAllowedToken = true;
break; // can be skipped; continue to the next token
case CppTokenKinds.ID:
// Ignore identifiers if instructed
if (ignoreIdentifierAndLiteralSeqences) {
seenAllowedToken = true;
break; // can be skipped; continue to the next token
} else {
// token not expected, other than identifier
return null;
}
case CppTokenKinds.COMMA:
break; // can be skipped; continue to the next token
case CppTokenKinds.LCURLYBRACE:
@ -134,7 +148,7 @@ public class CPPTokenizer extends JavaCCTokenizer {
braceCount--;
if (braceCount < 0) {
// end of the list; skip all contents
return seenLiteral ? token : null;
return seenAllowedToken ? token : null;
} else {
// curly braces are not yet balanced; continue to the next token
break;
@ -147,13 +161,13 @@ public class CPPTokenizer extends JavaCCTokenizer {
return null;
}
private boolean isDiscardingLiterals() {
return discardingLiteralsUntil != null;
private boolean isDiscardingToken() {
return discardingTokensUntil != null;
}
@Override
protected boolean isLanguageSpecificDiscarding() {
return isDiscardingLiterals() || discardCurrent;
return isDiscardingToken() || discardCurrent;
}
}
}

View File

@ -139,8 +139,18 @@ class CPPTokenizerTest extends CpdTextComparisonTest {
doTest("listOfNumbers", "_ignored", skipLiteralSequences());
}
/**
 * Runs the "listOfNumbers" fixture with both the literal-sequence option and
 * the identifier-and-literal-sequence option enabled.
 * NOTE(review): "_ignored_identifiers" presumably selects the expected-output
 * variant to compare against; confirm against doTest in the base class.
 */
@Test
void testLongListsOfNumbersAndIdentifiersAreIgnored() {
doTest("listOfNumbers", "_ignored_identifiers", skipIdentifierAndLiteralsSequences());
}
/**
 * Runs the "listOfNumbers" fixture with only the identifier-and-literal-sequence
 * option enabled (the literal-sequence option stays off). It uses the same
 * "_ignored_identifiers" variant as the test that enables both options.
 * NOTE(review): the variant name presumably selects the expected-output file;
 * confirm against doTest in the base class.
 */
@Test
void testLongListsOfIdentifiersAreIgnored() {
doTest("listOfNumbers", "_ignored_identifiers", skipIdentifierSequences());
}
private static Properties skipBlocks(String skipPattern) {
return properties(true, skipPattern, false);
return properties(true, skipPattern, false, false);
}
private static Properties skipBlocks() {
@ -148,20 +158,29 @@ class CPPTokenizerTest extends CpdTextComparisonTest {
}
private static Properties dontSkipBlocks() {
return properties(false, null, false);
return properties(false, null, false, false);
}
private static Properties skipLiteralSequences() {
return properties(false, null, true);
return properties(false, null, true, false);
}
private static Properties properties(boolean skipBlocks, String skipPattern, boolean skipLiteralSequences) {
/**
 * Builds test properties with block skipping off, no skip pattern, and both
 * the literal-sequence and identifier-and-literal-sequence options set to true.
 */
private static Properties skipIdentifierAndLiteralsSequences() {
return properties(false, null, true, true);
}
/**
 * Builds test properties with block skipping off, no skip pattern, the
 * literal-sequence option set to false, and the identifier-and-literal-sequence
 * option set to true.
 */
private static Properties skipIdentifierSequences() {
return properties(false, null, false, true);
}
/**
 * Builds the tokenizer {@link Properties} used by the helper methods of this test class.
 *
 * @param skipBlocks value stored under {@code Tokenizer.OPTION_SKIP_BLOCKS}
 * @param skipPattern value stored under {@code Tokenizer.OPTION_SKIP_BLOCKS_PATTERN};
 *                    the property is left unset when this is {@code null}
 * @param skipLiteralSequences value stored under {@code Tokenizer.OPTION_IGNORE_LITERAL_SEQUENCES}
 * @param skipSequences value stored under
 *                      {@code Tokenizer.OPTION_IGNORE_IDENTIFIER_AND_LITERAL_SEQUENCES}
 * @return a new {@code Properties} instance holding the options above
 */
private static Properties properties(boolean skipBlocks, String skipPattern, boolean skipLiteralSequences, boolean skipSequences) {
Properties properties = new Properties();
// Boolean options are stored as their string form ("true"/"false").
properties.setProperty(Tokenizer.OPTION_SKIP_BLOCKS, Boolean.toString(skipBlocks));
if (skipPattern != null) {
properties.setProperty(Tokenizer.OPTION_SKIP_BLOCKS_PATTERN, skipPattern);
}
properties.setProperty(Tokenizer.OPTION_IGNORE_LITERAL_SEQUENCES, Boolean.toString(skipLiteralSequences));
properties.setProperty(Tokenizer.OPTION_IGNORE_IDENTIFIER_AND_LITERAL_SEQUENCES, Boolean.toString(skipSequences));
return properties;
}
}

View File

@ -18,7 +18,7 @@ int main() {
0b000001, // C++ 14 binary literal
};
int c[3][4] = {{0,1,2,3},{4,5,6,7},{8,9,10,11}}; // multi-dimensional array
int d[3] = {a, a, a}; // identifiers should not be filtered out
int d[3] = {a, a, a}; // identifiers should be filtered out if identifiers are allowed in sequences
int e[1][3] = {{a, a, a}}; // identifiers in multi-dimensional array
int f[1] = {main()}; // method invocations should not be filtered out
int g[1][1] = {{main()}}; // method invocation in multi-dimensional array

View File

@ -0,0 +1,92 @@
[Image] or [Truncated image[ Bcol Ecol
L2
[int] 1 4
[main] 5 9
[(] 9 10
[)] 10 11
[{] 12 13
L3
[int] 3 6
[a] 7 8
[\[] 8 9
[50] 9 11
[\]] 11 12
[=] 13 14
[;] 116 117
L4
[double] 3 9
[b] 10 11
[\[] 11 12
[14] 12 14
[\]] 14 15
[=] 16 17
L19
[;] 4 5
L20
[int] 3 6
[c] 7 8
[\[] 8 9
[3] 9 10
[\]] 10 11
[\[] 11 12
[4] 12 13
[\]] 13 14
[=] 15 16
[;] 50 51
L21
[int] 3 6
[d] 7 8
[\[] 8 9
[3] 9 10
[\]] 10 11
[=] 12 13
[;] 23 24
L22
[int] 3 6
[e] 7 8
[\[] 8 9
[1] 9 10
[\]] 10 11
[\[] 11 12
[3] 12 13
[\]] 13 14
[=] 15 16
[;] 28 29
L23
[int] 3 6
[f] 7 8
[\[] 8 9
[1] 9 10
[\]] 10 11
[=] 12 13
[{] 14 15
[main] 15 19
[(] 19 20
[)] 20 21
[}] 21 22
[;] 22 23
L24
[int] 3 6
[g] 7 8
[\[] 8 9
[1] 9 10
[\]] 10 11
[\[] 11 12
[1] 12 13
[\]] 13 14
[=] 15 16
[{] 17 18
[{] 18 19
[main] 19 23
[(] 23 24
[)] 24 25
[}] 25 26
[}] 26 27
[;] 27 28
L25
[return] 3 9
[0] 10 11
[;] 11 12
L26
[}] 1 2
EOF