From aaef0fde4e0fdd75212b8b93db18a91f32f2862d Mon Sep 17 00:00:00 2001 From: Maikel Steneker Date: Wed, 25 Nov 2020 12:35:02 +0100 Subject: [PATCH] Add option to ignore sequences of literals In some cases, code may include sequences of literals that represent lists or tables of constants, such as lookup tables. Large sequences of these (particularly parts with many zeroes) will be identified by CPD as duplicates, but in practice, these are not the types of duplicates that are considered interesting. This introduces a new option for CPD (--ignore-literal-sequences) that ignores these sequences of literals, in a very similar way to how using directives for C# can already be skipped as well. For now, this functionality is restricted to C#, but it could be added for other languages as well. --- .../sourceforge/pmd/cpd/CPDConfiguration.java | 16 + .../java/net/sourceforge/pmd/cpd/GUI.java | 19 + .../net/sourceforge/pmd/cpd/Tokenizer.java | 4 + .../net/sourceforge/pmd/cpd/CsTokenizer.java | 51 ++- .../sourceforge/pmd/cpd/CsTokenizerTest.java | 21 +- .../pmd/lang/cs/cpd/testdata/listOfNumbers.cs | 8 + .../lang/cs/cpd/testdata/listOfNumbers.txt | 344 ++++++++++++++++++ .../cs/cpd/testdata/listOfNumbers_ignored.txt | 41 +++ 8 files changed, 498 insertions(+), 6 deletions(-) create mode 100644 pmd-cs/src/test/resources/net/sourceforge/pmd/lang/cs/cpd/testdata/listOfNumbers.cs create mode 100644 pmd-cs/src/test/resources/net/sourceforge/pmd/lang/cs/cpd/testdata/listOfNumbers.txt create mode 100644 pmd-cs/src/test/resources/net/sourceforge/pmd/lang/cs/cpd/testdata/listOfNumbers_ignored.txt diff --git a/pmd-core/src/main/java/net/sourceforge/pmd/cpd/CPDConfiguration.java b/pmd-core/src/main/java/net/sourceforge/pmd/cpd/CPDConfiguration.java index da009e56cc..262b0e5096 100644 --- a/pmd-core/src/main/java/net/sourceforge/pmd/cpd/CPDConfiguration.java +++ b/pmd-core/src/main/java/net/sourceforge/pmd/cpd/CPDConfiguration.java @@ -91,6 +91,9 @@ public class CPDConfiguration extends 
AbstractConfiguration { @Parameter(names = "--ignore-usings", description = "Ignore using directives in C#", required = false) private boolean ignoreUsings; + @Parameter(names = "--ignore-literal-sequences", description = "Ignore sequences of literals", required = false) + private boolean ignoreLiteralSequences = false; + @Parameter(names = "--skip-lexical-errors", description = "Skip files which can't be tokenized due to invalid characters instead of aborting CPD", required = false) @@ -273,6 +276,11 @@ public class CPDConfiguration extends AbstractConfiguration { } else { properties.remove(Tokenizer.IGNORE_USINGS); } + if (configuration.isIgnoreLiteralSequences()) { + properties.setProperty(Tokenizer.OPTION_IGNORE_LITERAL_SEQUENCES, "true"); + } else { + properties.remove(Tokenizer.OPTION_IGNORE_LITERAL_SEQUENCES); + } properties.setProperty(Tokenizer.OPTION_SKIP_BLOCKS, Boolean.toString(!configuration.isNoSkipBlocks())); properties.setProperty(Tokenizer.OPTION_SKIP_BLOCKS_PATTERN, configuration.getSkipBlocksPattern()); configuration.getLanguage().setProperties(properties); @@ -411,6 +419,14 @@ public class CPDConfiguration extends AbstractConfiguration { this.ignoreUsings = ignoreUsings; } + public boolean isIgnoreLiteralSequences() { + return ignoreLiteralSequences; + } + + public void setIgnoreLiteralSequences(boolean ignoreLiteralSequences) { + this.ignoreLiteralSequences = ignoreLiteralSequences; + } + public boolean isSkipLexicalErrors() { return skipLexicalErrors; } diff --git a/pmd-core/src/main/java/net/sourceforge/pmd/cpd/GUI.java b/pmd-core/src/main/java/net/sourceforge/pmd/cpd/GUI.java index fd7e436af2..3f44d76b34 100644 --- a/pmd-core/src/main/java/net/sourceforge/pmd/cpd/GUI.java +++ b/pmd-core/src/main/java/net/sourceforge/pmd/cpd/GUI.java @@ -115,6 +115,10 @@ public class GUI implements CPDListener { return false; } + public boolean canIgnoreLiteralSequences() { + return false; + } + public abstract String[] extensions(); } @@ -160,6 +164,11 @@ 
public class GUI implements CPDListener { public boolean canIgnoreUsings() { return "cs".equals(terseName); } + + @Override + public boolean canIgnoreLiteralSequences() { + return "cs".equals(terseName); + } }; } LANGUAGE_SETS[index][0] = "by extension..."; @@ -333,6 +342,7 @@ public class GUI implements CPDListener { private JCheckBox ignoreLiteralsCheckbox = new JCheckBox("", false); private JCheckBox ignoreAnnotationsCheckbox = new JCheckBox("", false); private JCheckBox ignoreUsingsCheckbox = new JCheckBox("", false); + private JCheckBox ignoreLiteralSequencesCheckbox = new JCheckBox("", false); private JComboBox languageBox = new JComboBox<>(); private JTextField extensionField = new JTextField(); private JLabel extensionLabel = new JLabel("Extension:", SwingConstants.RIGHT); @@ -420,6 +430,7 @@ public class GUI implements CPDListener { ignoreLiteralsCheckbox.setEnabled(current.canIgnoreLiterals()); ignoreAnnotationsCheckbox.setEnabled(current.canIgnoreAnnotations()); ignoreUsingsCheckbox.setEnabled(current.canIgnoreUsings()); + ignoreLiteralSequencesCheckbox.setEnabled(current.canIgnoreLiteralSequences()); extensionField.setText(current.extensions()[0]); boolean enableExtension = current.extensions()[0].isEmpty(); extensionField.setEnabled(enableExtension); @@ -478,6 +489,13 @@ public class GUI implements CPDListener { helper.nextRow(); helper.addLabel("Ignore usings?"); helper.add(ignoreUsingsCheckbox); + helper.addLabel(""); + helper.addLabel(""); + helper.nextRow(); + + helper.nextRow(); + helper.addLabel("Ignore literal sequences?"); + helper.add(ignoreLiteralSequencesCheckbox); helper.add(goButton); helper.add(cxButton); helper.nextRow(); @@ -663,6 +681,7 @@ public class GUI implements CPDListener { config.setIgnoreLiterals(ignoreLiteralsCheckbox.isSelected()); config.setIgnoreAnnotations(ignoreAnnotationsCheckbox.isSelected()); config.setIgnoreUsings(ignoreUsingsCheckbox.isSelected()); + 
config.setIgnoreLiteralSequences(ignoreLiteralSequencesCheckbox.isSelected()); p.setProperty(LanguageFactory.EXTENSION, extensionField.getText()); LanguageConfig conf = languageConfigFor((String) languageBox.getSelectedItem()); diff --git a/pmd-core/src/main/java/net/sourceforge/pmd/cpd/Tokenizer.java b/pmd-core/src/main/java/net/sourceforge/pmd/cpd/Tokenizer.java index 77e2de54d9..e6876fb960 100644 --- a/pmd-core/src/main/java/net/sourceforge/pmd/cpd/Tokenizer.java +++ b/pmd-core/src/main/java/net/sourceforge/pmd/cpd/Tokenizer.java @@ -11,6 +11,10 @@ public interface Tokenizer { String IGNORE_IDENTIFIERS = "ignore_identifiers"; String IGNORE_ANNOTATIONS = "ignore_annotations"; + /** + * Ignore sequences of literals (e.g., 0,0,0,0...). + */ + String OPTION_IGNORE_LITERAL_SEQUENCES = "net.sourceforge.pmd.cpd.Tokenizer.skipLiteralSequences"; /** * Ignore using directives in C#. The default value is false. */ diff --git a/pmd-cs/src/main/java/net/sourceforge/pmd/cpd/CsTokenizer.java b/pmd-cs/src/main/java/net/sourceforge/pmd/cpd/CsTokenizer.java index 10fb5aed89..5ae96cbfa1 100644 --- a/pmd-cs/src/main/java/net/sourceforge/pmd/cpd/CsTokenizer.java +++ b/pmd-cs/src/main/java/net/sourceforge/pmd/cpd/CsTokenizer.java @@ -19,15 +19,21 @@ import net.sourceforge.pmd.lang.cs.antlr4.CSharpLexer; public class CsTokenizer extends AntlrTokenizer { private boolean ignoreUsings = false; + private boolean ignoreLiteralSequences = false; public void setProperties(Properties properties) { ignoreUsings = Boolean.parseBoolean(properties.getProperty(IGNORE_USINGS, "false")); + ignoreLiteralSequences = Boolean.parseBoolean(properties.getProperty(OPTION_IGNORE_LITERAL_SEQUENCES, "false")); } public void setIgnoreUsings(boolean ignoreUsings) { this.ignoreUsings = ignoreUsings; } + public void setIgnoreLiteralSequences(boolean ignoreLiteralSequences) { + this.ignoreLiteralSequences = ignoreLiteralSequences; + } + @Override protected AntlrTokenManager getLexerForSource(final SourceCode 
sourceCode) { final CharStream charStream = AntlrTokenizer.getCharStreamFromSourceCode(sourceCode); @@ -36,7 +42,7 @@ public class CsTokenizer extends AntlrTokenizer { @Override protected AntlrTokenFilter getTokenFilter(final AntlrTokenManager tokenManager) { - return new CsTokenFilter(tokenManager, ignoreUsings); + return new CsTokenFilter(tokenManager, ignoreUsings, ignoreLiteralSequences); } /** @@ -54,13 +60,16 @@ public class CsTokenizer extends AntlrTokenizer { } private final boolean ignoreUsings; + private final boolean ignoreLiteralSequences; private boolean discardingUsings = false; private boolean discardingNL = false; + private boolean discardingLiterals = false; private boolean discardCurrent = false; - CsTokenFilter(final AntlrTokenManager tokenManager, boolean ignoreUsings) { + CsTokenFilter(final AntlrTokenManager tokenManager, boolean ignoreUsings, boolean ignoreLiteralSequences) { super(tokenManager); this.ignoreUsings = ignoreUsings; + this.ignoreLiteralSequences = ignoreLiteralSequences; } @Override @@ -72,6 +81,7 @@ public class CsTokenizer extends AntlrTokenizer { protected void analyzeTokens(final AntlrToken currentToken, final Iterable remainingTokens) { discardCurrent = false; skipUsingDirectives(currentToken, remainingTokens); + skipLiteralSequences(currentToken, remainingTokens); } private void skipUsingDirectives(final AntlrToken currentToken, final Iterable remainingTokens) { @@ -148,9 +158,44 @@ public class CsTokenizer extends AntlrTokenizer { discardingNL = currentToken.getKind() == CSharpLexer.NL; } + private void skipLiteralSequences(final AntlrToken currentToken, final Iterable remainingTokens) { + if (ignoreLiteralSequences) { + final int type = currentToken.getKind(); + if (type == CSharpLexer.OPEN_BRACE && isSequenceOfLiterals(remainingTokens)) { + discardingLiterals = true; + } else if (type == CSharpLexer.CLOSE_BRACE && discardingLiterals) { + discardingLiterals = false; + discardCurrent = true; + } + } + } + + private 
boolean isSequenceOfLiterals(final Iterable remainingTokens) { + boolean seenLiteral = false; + for (final AntlrToken token : remainingTokens) { + switch (token.getKind()) { + case CSharpLexer.CHARACTER_LITERAL: + case CSharpLexer.HEX_INTEGER_LITERAL: + case CSharpLexer.INTEGER_LITERAL: + case CSharpLexer.REAL_LITERAL: + seenLiteral = true; + break; // can be skipped; continue to the next token + case CSharpLexer.COMMA: + break; // can be skipped; continue to the next token + case CSharpLexer.CLOSE_BRACE: + // end of the list; skip all contents + return seenLiteral; + default: + // some other token than the expected ones; this is not a sequence of literals + return false; + } + } + return false; + } + @Override protected boolean isLanguageSpecificDiscarding() { - return discardingUsings || discardingNL || discardCurrent; + return discardingUsings || discardingNL || discardingLiterals || discardCurrent; } } } diff --git a/pmd-cs/src/test/java/net/sourceforge/pmd/cpd/CsTokenizerTest.java b/pmd-cs/src/test/java/net/sourceforge/pmd/cpd/CsTokenizerTest.java index 6bc536a4c9..6b61c658bb 100644 --- a/pmd-cs/src/test/java/net/sourceforge/pmd/cpd/CsTokenizerTest.java +++ b/pmd-cs/src/test/java/net/sourceforge/pmd/cpd/CsTokenizerTest.java @@ -90,13 +90,28 @@ public class CsTokenizerTest extends CpdTextComparisonTest { doTest("tabWidth"); } - private Properties ignoreUsings() { - return properties(true); + @Test + public void testLongListsOfNumbersAreNotIgnored() { + doTest("listOfNumbers"); } - private Properties properties(boolean ignoreUsings) { + @Test + public void testLongListsOfNumbersAreIgnored() { + doTest("listOfNumbers", "_ignored", skipLiteralSequences()); + } + + private Properties ignoreUsings() { + return properties(true, false); + } + + private Properties skipLiteralSequences() { + return properties(false, true); + } + + private Properties properties(boolean ignoreUsings, boolean ignoreLiteralSequences) { Properties properties = new Properties(); 
properties.setProperty(Tokenizer.IGNORE_USINGS, Boolean.toString(ignoreUsings)); + properties.setProperty(Tokenizer.OPTION_IGNORE_LITERAL_SEQUENCES, Boolean.toString(ignoreLiteralSequences)); return properties; } } diff --git a/pmd-cs/src/test/resources/net/sourceforge/pmd/lang/cs/cpd/testdata/listOfNumbers.cs b/pmd-cs/src/test/resources/net/sourceforge/pmd/lang/cs/cpd/testdata/listOfNumbers.cs new file mode 100644 index 0000000000..a3ceee1dca --- /dev/null +++ b/pmd-cs/src/test/resources/net/sourceforge/pmd/lang/cs/cpd/testdata/listOfNumbers.cs @@ -0,0 +1,8 @@ +using System; +using System.Collections; +using System.Collections.Generic; +public class LongLists { + List l = new List { + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + }; +} diff --git a/pmd-cs/src/test/resources/net/sourceforge/pmd/lang/cs/cpd/testdata/listOfNumbers.txt b/pmd-cs/src/test/resources/net/sourceforge/pmd/lang/cs/cpd/testdata/listOfNumbers.txt new file mode 100644 index 0000000000..17d20f914d --- /dev/null +++ b/pmd-cs/src/test/resources/net/sourceforge/pmd/lang/cs/cpd/testdata/listOfNumbers.txt @@ -0,0 +1,344 @@ + [Image] or [Truncated image[ Bcol Ecol +L1 + [using] 1 5 + [System] 7 12 + [;] 13 13 +L2 + [using] 1 5 + [System] 7 12 + [.] 13 13 + [Collections] 14 24 + [;] 25 25 +L3 + [using] 1 5 + [System] 7 12 + [.] 13 13 + [Collections] 14 24 + [.] 
25 25 + [Generic] 26 32 + [;] 33 33 +L4 + [public] 1 6 + [class] 8 12 + [LongLists] 14 22 + [{] 24 24 +L5 + [List] 5 8 + [<] 9 9 + [byte] 10 13 + [>] 14 14 + [l] 16 16 + [=] 18 18 + [new] 20 22 + [List] 24 27 + [<] 28 28 + [byte] 29 32 + [>] 33 33 + [{] 35 35 +L6 + [0] 6 6 + [,] 7 7 + [0] 8 8 + [,] 9 9 + [0] 10 10 + [,] 11 11 + [0] 12 12 + [,] 13 13 + [0] 14 14 + [,] 15 15 + [0] 16 16 + [,] 17 17 + [0] 18 18 + [,] 19 19 + [0] 20 20 + [,] 21 21 + [0] 22 22 + [,] 23 23 + [0] 24 24 + [,] 25 25 + [0] 26 26 + [,] 27 27 + [0] 28 28 + [,] 29 29 + [0] 30 30 + [,] 31 31 + [0] 32 32 + [,] 33 33 + [0] 34 34 + [,] 35 35 + [0] 36 36 + [,] 37 37 + [0] 38 38 + [,] 39 39 + [0] 40 40 + [,] 41 41 + [0] 42 42 + [,] 43 43 + [0] 44 44 + [,] 45 45 + [0] 46 46 + [,] 47 47 + [0] 48 48 + [,] 49 49 + [0] 50 50 + [,] 51 51 + [0] 52 52 + [,] 53 53 + [0] 54 54 + [,] 55 55 + [0] 56 56 + [,] 57 57 + [0] 58 58 + [,] 59 59 + [0] 60 60 + [,] 61 61 + [0] 62 62 + [,] 63 63 + [0] 64 64 + [,] 65 65 + [0] 66 66 + [,] 67 67 + [0] 68 68 + [,] 69 69 + [0] 70 70 + [,] 71 71 + [0] 72 72 + [,] 73 73 + [0] 74 74 + [,] 75 75 + [0] 76 76 + [,] 77 77 + [0] 78 78 + [,] 79 79 + [0] 80 80 + [,] 81 81 + [0] 82 82 + [,] 83 83 + [0] 84 84 + [,] 85 85 + [0] 86 86 + [,] 87 87 + [0] 88 88 + [,] 89 89 + [0] 90 90 + [,] 91 91 + [0] 92 92 + [,] 93 93 + [0] 94 94 + [,] 95 95 + [0] 96 96 + [,] 97 97 + [0] 98 98 + [,] 99 99 + [0] 100 100 + [,] 101 101 + [0] 102 102 + [,] 103 103 + [0] 104 104 + [,] 105 105 + [0] 106 106 + [,] 107 107 + [0] 108 108 + [,] 109 109 + [0] 110 110 + [,] 111 111 + [0] 112 112 + [,] 113 113 + [0] 114 114 + [,] 115 115 + [0] 116 116 + [,] 117 117 + [0] 118 118 + [,] 119 119 + [0] 120 120 + [,] 121 121 + [0] 122 122 + [,] 123 123 + [0] 124 124 + [,] 125 125 + [0] 126 126 + [,] 127 127 + [0] 128 128 + [,] 129 129 + [0] 130 130 + [,] 131 131 + [0] 132 132 + [,] 133 133 + [0] 134 134 + [,] 135 135 + [0] 136 136 + [,] 137 137 + [0] 138 138 + [,] 139 139 + [0] 140 140 + [,] 141 141 + [0] 142 142 + [,] 143 143 
+ [0] 144 144 + [,] 145 145 + [0] 146 146 + [,] 147 147 + [0] 148 148 + [,] 149 149 + [0] 150 150 + [,] 151 151 + [0] 152 152 + [,] 153 153 + [0] 154 154 + [,] 155 155 + [0] 156 156 + [,] 157 157 + [0] 158 158 + [,] 159 159 + [0] 160 160 + [,] 161 161 + [0] 162 162 + [,] 163 163 + [0] 164 164 + [,] 165 165 + [0] 166 166 + [,] 167 167 + [0] 168 168 + [,] 169 169 + [0] 170 170 + [,] 171 171 + [0] 172 172 + [,] 173 173 + [0] 174 174 + [,] 175 175 + [0] 176 176 + [,] 177 177 + [0] 178 178 + [,] 179 179 + [0] 180 180 + [,] 181 181 + [0] 182 182 + [,] 183 183 + [0] 184 184 + [,] 185 185 + [0] 186 186 + [,] 187 187 + [0] 188 188 + [,] 189 189 + [0] 190 190 + [,] 191 191 + [0] 192 192 + [,] 193 193 + [0] 194 194 + [,] 195 195 + [0] 196 196 + [,] 197 197 + [0] 198 198 + [,] 199 199 + [0] 200 200 + [,] 201 201 + [0] 202 202 + [,] 203 203 + [0] 204 204 + [,] 205 205 + [0] 206 206 + [,] 207 207 + [0] 208 208 + [,] 209 209 + [0] 210 210 + [,] 211 211 + [0] 212 212 + [,] 213 213 + [0] 214 214 + [,] 215 215 + [0] 216 216 + [,] 217 217 + [0] 218 218 + [,] 219 219 + [0] 220 220 + [,] 221 221 + [0] 222 222 + [,] 223 223 + [0] 224 224 + [,] 225 225 + [0] 226 226 + [,] 227 227 + [0] 228 228 + [,] 229 229 + [0] 230 230 + [,] 231 231 + [0] 232 232 + [,] 233 233 + [0] 234 234 + [,] 235 235 + [0] 236 236 + [,] 237 237 + [0] 238 238 + [,] 239 239 + [0] 240 240 + [,] 241 241 + [0] 242 242 + [,] 243 243 + [0] 244 244 + [,] 245 245 + [0] 246 246 + [,] 247 247 + [0] 248 248 + [,] 249 249 + [0] 250 250 + [,] 251 251 + [0] 252 252 + [,] 253 253 + [0] 254 254 + [,] 255 255 + [0] 256 256 + [,] 257 257 + [0] 258 258 + [,] 259 259 + [0] 260 260 + [,] 261 261 + [0] 262 262 + [,] 263 263 + [0] 264 264 + [,] 265 265 + [0] 266 266 + [,] 267 267 + [0] 268 268 + [,] 269 269 + [0] 270 270 + [,] 271 271 + [0] 272 272 + [,] 273 273 + [0] 274 274 + [,] 275 275 + [0] 276 276 + [,] 277 277 + [0] 278 278 + [,] 279 279 + [0] 280 280 + [,] 281 281 + [0] 282 282 + [,] 283 283 + [0] 284 284 + [,] 285 285 + [0] 286 
286 + [,] 287 287 + [0] 288 288 + [,] 289 289 + [0] 290 290 + [,] 291 291 + [0] 292 292 + [,] 293 293 + [0] 294 294 + [,] 295 295 + [0] 296 296 + [,] 297 297 + [0] 298 298 + [,] 299 299 + [0] 300 300 + [,] 301 301 + [0] 302 302 + [,] 303 303 + [0] 304 304 + [,] 305 305 +L7 + [}] 5 5 + [;] 6 6 +L8 + [}] 1 1 +EOF diff --git a/pmd-cs/src/test/resources/net/sourceforge/pmd/lang/cs/cpd/testdata/listOfNumbers_ignored.txt b/pmd-cs/src/test/resources/net/sourceforge/pmd/lang/cs/cpd/testdata/listOfNumbers_ignored.txt new file mode 100644 index 0000000000..03f835fdca --- /dev/null +++ b/pmd-cs/src/test/resources/net/sourceforge/pmd/lang/cs/cpd/testdata/listOfNumbers_ignored.txt @@ -0,0 +1,41 @@ + [Image] or [Truncated image[ Bcol Ecol +L1 + [using] 1 5 + [System] 7 12 + [;] 13 13 +L2 + [using] 1 5 + [System] 7 12 + [.] 13 13 + [Collections] 14 24 + [;] 25 25 +L3 + [using] 1 5 + [System] 7 12 + [.] 13 13 + [Collections] 14 24 + [.] 25 25 + [Generic] 26 32 + [;] 33 33 +L4 + [public] 1 6 + [class] 8 12 + [LongLists] 14 22 + [{] 24 24 +L5 + [List] 5 8 + [<] 9 9 + [byte] 10 13 + [>] 14 14 + [l] 16 16 + [=] 18 18 + [new] 20 22 + [List] 24 27 + [<] 28 28 + [byte] 29 32 + [>] 33 33 +L7 + [;] 6 6 +L8 + [}] 1 1 +EOF