Add option to ignore sequences of literals

In some cases, code may include sequences of literals that represent lists or tables of constants, such as lookup tables. Large sequences of these (particularly parts with many zeroes) will be identified by CPD as duplicates, but in practice, these are not the types of duplicates that are considered interesting.

This introduces a new CPD option (--ignore-literal-sequences) that ignores these sequences of literals, much like using directives in C# can already be skipped. For now, this functionality is restricted to C#, but it could be added for other languages as well.
This commit is contained in:
Maikel Steneker
2020-11-25 12:35:02 +01:00
parent f787e3a8e5
commit aaef0fde4e
8 changed files with 498 additions and 6 deletions

View File

@ -91,6 +91,9 @@ public class CPDConfiguration extends AbstractConfiguration {
@Parameter(names = "--ignore-usings", description = "Ignore using directives in C#", required = false)
private boolean ignoreUsings;
@Parameter(names = "--ignore-literal-sequences", description = "Ignore sequences of literals", required = false)
private boolean ignoreLiteralSequences = false;
@Parameter(names = "--skip-lexical-errors",
description = "Skip files which can't be tokenized due to invalid characters instead of aborting CPD",
required = false)
@ -273,6 +276,11 @@ public class CPDConfiguration extends AbstractConfiguration {
} else {
properties.remove(Tokenizer.IGNORE_USINGS);
}
if (configuration.isIgnoreLiteralSequences()) {
properties.setProperty(Tokenizer.OPTION_IGNORE_LITERAL_SEQUENCES, "true");
} else {
properties.remove(Tokenizer.OPTION_IGNORE_LITERAL_SEQUENCES);
}
properties.setProperty(Tokenizer.OPTION_SKIP_BLOCKS, Boolean.toString(!configuration.isNoSkipBlocks()));
properties.setProperty(Tokenizer.OPTION_SKIP_BLOCKS_PATTERN, configuration.getSkipBlocksPattern());
configuration.getLanguage().setProperties(properties);
@ -411,6 +419,14 @@ public class CPDConfiguration extends AbstractConfiguration {
this.ignoreUsings = ignoreUsings;
}
/**
 * Tells whether sequences of literals are to be ignored.
 *
 * @return the current value of the ignore-literal-sequences setting
 */
public boolean isIgnoreLiteralSequences() {
    return this.ignoreLiteralSequences;
}
/**
 * Enables or disables ignoring sequences of literals.
 *
 * @param ignoreLiteralSequences the new value of the setting
 */
public void setIgnoreLiteralSequences(boolean ignoreLiteralSequences) {
this.ignoreLiteralSequences = ignoreLiteralSequences;
}
/**
 * Returns the current value of the skip-lexical-errors setting.
 *
 * @return the value held in the skipLexicalErrors field
 */
public boolean isSkipLexicalErrors() {
    return this.skipLexicalErrors;
}

View File

@ -115,6 +115,10 @@ public class GUI implements CPDListener {
return false;
}
/**
 * Whether this language configuration offers the "ignore literal sequences"
 * option. This base implementation always answers {@code false}.
 *
 * @return {@code false}
 */
public boolean canIgnoreLiteralSequences() {
return false;
}
public abstract String[] extensions();
}
@ -160,6 +164,11 @@ public class GUI implements CPDListener {
public boolean canIgnoreUsings() {
return "cs".equals(terseName);
}
/**
 * Ignoring literal sequences is offered only when the terse name of the
 * language is "cs"; the comparison is written so that a null terse name
 * simply yields {@code false}.
 */
@Override
public boolean canIgnoreLiteralSequences() {
return "cs".equals(terseName);
}
};
}
LANGUAGE_SETS[index][0] = "by extension...";
@ -333,6 +342,7 @@ public class GUI implements CPDListener {
private JCheckBox ignoreLiteralsCheckbox = new JCheckBox("", false);
private JCheckBox ignoreAnnotationsCheckbox = new JCheckBox("", false);
private JCheckBox ignoreUsingsCheckbox = new JCheckBox("", false);
private JCheckBox ignoreLiteralSequencesCheckbox = new JCheckBox("", false);
private JComboBox<String> languageBox = new JComboBox<>();
private JTextField extensionField = new JTextField();
private JLabel extensionLabel = new JLabel("Extension:", SwingConstants.RIGHT);
@ -420,6 +430,7 @@ public class GUI implements CPDListener {
ignoreLiteralsCheckbox.setEnabled(current.canIgnoreLiterals());
ignoreAnnotationsCheckbox.setEnabled(current.canIgnoreAnnotations());
ignoreUsingsCheckbox.setEnabled(current.canIgnoreUsings());
ignoreLiteralSequencesCheckbox.setEnabled(current.canIgnoreLiteralSequences());
extensionField.setText(current.extensions()[0]);
boolean enableExtension = current.extensions()[0].isEmpty();
extensionField.setEnabled(enableExtension);
@ -478,6 +489,13 @@ public class GUI implements CPDListener {
helper.nextRow();
helper.addLabel("Ignore usings?");
helper.add(ignoreUsingsCheckbox);
helper.addLabel("");
helper.addLabel("");
helper.nextRow();
helper.nextRow();
helper.addLabel("Ignore literal sequences?");
helper.add(ignoreLiteralSequencesCheckbox);
helper.add(goButton);
helper.add(cxButton);
helper.nextRow();
@ -663,6 +681,7 @@ public class GUI implements CPDListener {
config.setIgnoreLiterals(ignoreLiteralsCheckbox.isSelected());
config.setIgnoreAnnotations(ignoreAnnotationsCheckbox.isSelected());
config.setIgnoreUsings(ignoreUsingsCheckbox.isSelected());
config.setIgnoreLiteralSequences(ignoreLiteralSequencesCheckbox.isSelected());
p.setProperty(LanguageFactory.EXTENSION, extensionField.getText());
LanguageConfig conf = languageConfigFor((String) languageBox.getSelectedItem());

View File

@ -11,6 +11,10 @@ public interface Tokenizer {
String IGNORE_IDENTIFIERS = "ignore_identifiers";
String IGNORE_ANNOTATIONS = "ignore_annotations";
/**
* Ignore sequences of literals (e.g., <code>0,0,0,0...</code>). The default value is <code>false</code>.
*/
String OPTION_IGNORE_LITERAL_SEQUENCES = "net.sourceforge.pmd.cpd.Tokenizer.skipLiteralSequences";
/**
* Ignore using directives in C#. The default value is <code>false</code>.
*/

View File

@ -19,15 +19,21 @@ import net.sourceforge.pmd.lang.cs.antlr4.CSharpLexer;
public class CsTokenizer extends AntlrTokenizer {
private boolean ignoreUsings = false;
private boolean ignoreLiteralSequences = false;
/**
 * Configures this tokenizer from the given properties.
 *
 * <p>Both the ignore-usings and the ignore-literal-sequences options are
 * read; each falls back to {@code false} when its key is absent.
 *
 * @param properties the properties to read the options from
 */
public void setProperties(Properties properties) {
    final String usingsValue = properties.getProperty(IGNORE_USINGS, "false");
    ignoreUsings = Boolean.parseBoolean(usingsValue);

    final String literalSequencesValue = properties.getProperty(OPTION_IGNORE_LITERAL_SEQUENCES, "false");
    ignoreLiteralSequences = Boolean.parseBoolean(literalSequencesValue);
}
/**
 * Enables or disables ignoring of using directives for this tokenizer.
 *
 * @param ignoreUsings the new value of the setting
 */
public void setIgnoreUsings(boolean ignoreUsings) {
this.ignoreUsings = ignoreUsings;
}
/**
 * Enables or disables ignoring of literal sequences for this tokenizer.
 *
 * @param ignoreLiteralSequences the new value of the setting
 */
public void setIgnoreLiteralSequences(boolean ignoreLiteralSequences) {
this.ignoreLiteralSequences = ignoreLiteralSequences;
}
@Override
protected AntlrTokenManager getLexerForSource(final SourceCode sourceCode) {
final CharStream charStream = AntlrTokenizer.getCharStreamFromSourceCode(sourceCode);
@ -36,7 +42,7 @@ public class CsTokenizer extends AntlrTokenizer {
@Override
protected AntlrTokenFilter getTokenFilter(final AntlrTokenManager tokenManager) {
return new CsTokenFilter(tokenManager, ignoreUsings);
return new CsTokenFilter(tokenManager, ignoreUsings, ignoreLiteralSequences);
}
/**
@ -54,13 +60,16 @@ public class CsTokenizer extends AntlrTokenizer {
}
private final boolean ignoreUsings;
private final boolean ignoreLiteralSequences;
private boolean discardingUsings = false;
private boolean discardingNL = false;
private boolean discardingLiterals = false;
private boolean discardCurrent = false;
CsTokenFilter(final AntlrTokenManager tokenManager, boolean ignoreUsings) {
CsTokenFilter(final AntlrTokenManager tokenManager, boolean ignoreUsings, boolean ignoreLiteralSequences) {
super(tokenManager);
this.ignoreUsings = ignoreUsings;
this.ignoreLiteralSequences = ignoreLiteralSequences;
}
@Override
@ -72,6 +81,7 @@ public class CsTokenizer extends AntlrTokenizer {
protected void analyzeTokens(final AntlrToken currentToken, final Iterable<AntlrToken> remainingTokens) {
// discardCurrent only ever applies to a single token, so it is cleared
// before the skip helpers get a chance to set it again for this token.
discardCurrent = false;
skipUsingDirectives(currentToken, remainingTokens);
// Runs after the using-directive handling; may set discardingLiterals or discardCurrent.
skipLiteralSequences(currentToken, remainingTokens);
}
private void skipUsingDirectives(final AntlrToken currentToken, final Iterable<AntlrToken> remainingTokens) {
@ -148,9 +158,44 @@ public class CsTokenizer extends AntlrTokenizer {
discardingNL = currentToken.getKind() == CSharpLexer.NL;
}
/**
 * Updates the discarding state for brace-delimited sequences of literals.
 *
 * <p>Does nothing unless ignoring literal sequences is enabled. When the
 * current token is an opening brace and the look-ahead confirms a sequence
 * of literals, discarding is switched on (which also covers the opening
 * brace itself). When the matching closing brace is reached, discarding is
 * switched off again and the closing brace is discarded on its own.
 *
 * @param currentToken    the token being analyzed
 * @param remainingTokens the tokens that follow the current one
 */
private void skipLiteralSequences(final AntlrToken currentToken, final Iterable<AntlrToken> remainingTokens) {
    if (!ignoreLiteralSequences) {
        return;
    }

    final int kind = currentToken.getKind();
    if (kind == CSharpLexer.OPEN_BRACE) {
        // only look ahead when we are actually at the start of a brace-delimited list
        if (isSequenceOfLiterals(remainingTokens)) {
            discardingLiterals = true;
        }
    } else if (discardingLiterals && kind == CSharpLexer.CLOSE_BRACE) {
        // end of the skipped list: stop discarding, but drop the brace itself too
        discardingLiterals = false;
        discardCurrent = true;
    }
}
/**
 * Looks ahead to decide whether the upcoming tokens form a sequence of
 * literals terminated by a closing brace.
 *
 * <p>Only character, hexadecimal integer, integer and real literals, plus
 * commas, are accepted before the closing brace; any other token ends the
 * look-ahead with a negative answer.
 *
 * @param remainingTokens the tokens following the opening brace
 * @return {@code true} if a closing brace is reached after at least one
 *         literal and nothing but literals and commas; {@code false}
 *         otherwise, including when the tokens run out first
 */
private boolean isSequenceOfLiterals(final Iterable<AntlrToken> remainingTokens) {
    boolean foundLiteral = false;
    for (final AntlrToken upcoming : remainingTokens) {
        final int kind = upcoming.getKind();

        if (kind == CSharpLexer.CLOSE_BRACE) {
            // end of the list; it only qualifies if it held at least one literal
            return foundLiteral;
        }
        if (kind == CSharpLexer.COMMA) {
            // separators carry no information; move on to the next token
            continue;
        }

        final boolean isLiteral = kind == CSharpLexer.CHARACTER_LITERAL
                || kind == CSharpLexer.HEX_INTEGER_LITERAL
                || kind == CSharpLexer.INTEGER_LITERAL
                || kind == CSharpLexer.REAL_LITERAL;
        if (!isLiteral) {
            // anything else means this is not a plain sequence of literals
            return false;
        }
        foundLiteral = true;
    }
    // no closing brace was found among the remaining tokens
    return false;
}
@Override
protected boolean isLanguageSpecificDiscarding() {
return discardingUsings || discardingNL || discardCurrent;
return discardingUsings || discardingNL || discardingLiterals || discardCurrent;
}
}
}

View File

@ -90,13 +90,28 @@ public class CsTokenizerTest extends CpdTextComparisonTest {
doTest("tabWidth");
}
private Properties ignoreUsings() {
return properties(true);
/**
 * Runs the "listOfNumbers" fixture without passing any extra properties,
 * i.e. without explicitly enabling the ignore-literal-sequences option.
 */
@Test
public void testLongListsOfNumbersAreNotIgnored() {
doTest("listOfNumbers");
}
private Properties properties(boolean ignoreUsings) {
/**
 * Runs the "listOfNumbers" fixture with the ignore-literal-sequences option
 * enabled.
 */
@Test
public void testLongListsOfNumbersAreIgnored() {
// NOTE(review): "_ignored" is presumably the suffix selecting the expected-output file - confirm against doTest.
doTest("listOfNumbers", "_ignored", skipLiteralSequences());
}
/**
 * Builds properties with ignore-usings enabled and ignore-literal-sequences disabled.
 */
private Properties ignoreUsings() {
return properties(true, false);
}
/**
 * Builds properties with ignore-literal-sequences enabled and ignore-usings disabled.
 */
private Properties skipLiteralSequences() {
return properties(false, true);
}
/**
 * Builds the tokenizer properties for a test run.
 *
 * @param ignoreUsings           value stored under {@code Tokenizer.IGNORE_USINGS}
 * @param ignoreLiteralSequences value stored under {@code Tokenizer.OPTION_IGNORE_LITERAL_SEQUENCES}
 * @return a fresh properties object holding both options as "true"/"false" strings
 */
private Properties properties(boolean ignoreUsings, boolean ignoreLiteralSequences) {
    final Properties options = new Properties();
    options.setProperty(Tokenizer.IGNORE_USINGS, String.valueOf(ignoreUsings));
    options.setProperty(Tokenizer.OPTION_IGNORE_LITERAL_SEQUENCES, String.valueOf(ignoreLiteralSequences));
    return options;
}
}

View File

@ -0,0 +1,8 @@
using System;
using System.Collections;
using System.Collections.Generic;
public class LongLists {
List<byte> l = new List<byte> {
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
};
}

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,41 @@
[Image] or [Truncated image[ Bcol Ecol
L1
[using] 1 5
[System] 7 12
[;] 13 13
L2
[using] 1 5
[System] 7 12
[.] 13 13
[Collections] 14 24
[;] 25 25
L3
[using] 1 5
[System] 7 12
[.] 13 13
[Collections] 14 24
[.] 25 25
[Generic] 26 32
[;] 33 33
L4
[public] 1 6
[class] 8 12
[LongLists] 14 22
[{] 24 24
L5
[List] 5 8
[<] 9 9
[byte] 10 13
[>] 14 14
[l] 16 16
[=] 18 18
[new] 20 22
[List] 24 27
[<] 28 28
[byte] 29 32
[>] 33 33
L7
[;] 6 6
L8
[}] 1 1
EOF