From 08b7dd49dda043f3ac0078c8e071dc7e86e27ada Mon Sep 17 00:00:00 2001 From: Andreas Dangel Date: Sat, 26 Jan 2019 11:24:32 +0100 Subject: [PATCH] [core] Add a generic JavaCCTokenizer for CPD --- .../sourceforge/pmd/cpd/AntlrTokenizer.java | 1 + .../sourceforge/pmd/cpd/JavaCCTokenizer.java | 42 +++++++++ .../net/sourceforge/pmd/cpd/CPPTokenizer.java | 38 +++----- .../pmd/cpd/CPPTokenizerContinuationTest.java | 3 +- .../sourceforge/pmd/cpd/CPPTokenizerTest.java | 11 ++- .../sourceforge/pmd/cpd/JavaTokenizer.java | 66 +++++++------- .../pmd/cpd/JavaTokensTokenizerTest.java | 2 +- .../pmd/cpd/MatchAlgorithmTest.java | 5 +- .../pmd/cpd/EcmascriptTokenizer.java | 49 ++++------- .../sourceforge/pmd/cpd/MatlabTokenizer.java | 25 +----- .../pmd/cpd/ObjectiveCTokenizer.java | 26 ++---- .../sourceforge/pmd/cpd/PLSQLTokenizer.java | 87 +++++-------------- .../sourceforge/pmd/cpd/PythonTokenizer.java | 26 +----- 13 files changed, 154 insertions(+), 227 deletions(-) create mode 100644 pmd-core/src/main/java/net/sourceforge/pmd/cpd/JavaCCTokenizer.java diff --git a/pmd-core/src/main/java/net/sourceforge/pmd/cpd/AntlrTokenizer.java b/pmd-core/src/main/java/net/sourceforge/pmd/cpd/AntlrTokenizer.java index aad0d2379a..d1655da3a1 100644 --- a/pmd-core/src/main/java/net/sourceforge/pmd/cpd/AntlrTokenizer.java +++ b/pmd-core/src/main/java/net/sourceforge/pmd/cpd/AntlrTokenizer.java @@ -23,6 +23,7 @@ public abstract class AntlrTokenizer implements Tokenizer { public void tokenize(final SourceCode sourceCode, final Tokens tokenEntries) { AntlrTokenManager tokenManager = getLexerForSource(sourceCode); + tokenManager.setFileName(sourceCode.getFileName()); try { AntlrToken token = (AntlrToken) tokenManager.getNextToken(); diff --git a/pmd-core/src/main/java/net/sourceforge/pmd/cpd/JavaCCTokenizer.java b/pmd-core/src/main/java/net/sourceforge/pmd/cpd/JavaCCTokenizer.java new file mode 100644 index 0000000000..e83f48cae4 --- /dev/null +++ b/pmd-core/src/main/java/net/sourceforge/pmd/cpd/JavaCCTokenizer.java @@ -0,0 +1,42 @@ +/** + * BSD-style license; for more info see http://pmd.sourceforge.net/license.html + */ + +package net.sourceforge.pmd.cpd; + +import java.io.IOException; + +import net.sourceforge.pmd.cpd.token.JavaCCTokenFilter; +import net.sourceforge.pmd.cpd.token.TokenFilter; +import net.sourceforge.pmd.lang.TokenManager; +import net.sourceforge.pmd.lang.ast.GenericToken; + +public abstract class JavaCCTokenizer implements Tokenizer { + + protected abstract TokenManager getLexerForSource(SourceCode sourceCode); + + protected TokenFilter getTokenFilter(TokenManager tokenManager) { + return new JavaCCTokenFilter(tokenManager); + } + + protected TokenEntry processToken(Tokens tokenEntries, GenericToken currentToken, String filename) { + return new TokenEntry(currentToken.getImage(), filename, currentToken.getBeginLine()); + } + + @Override + public void tokenize(SourceCode sourceCode, Tokens tokenEntries) throws IOException { + TokenManager tokenManager = getLexerForSource(sourceCode); + tokenManager.setFileName(sourceCode.getFileName()); + try { + final TokenFilter tokenFilter = getTokenFilter(tokenManager); + + GenericToken currentToken = tokenFilter.getNextToken(); + while (currentToken != null) { + tokenEntries.add(processToken(tokenEntries, currentToken, sourceCode.getFileName())); + currentToken = tokenFilter.getNextToken(); + } + } finally { + tokenEntries.add(TokenEntry.getEOF()); + } + } +} diff --git a/pmd-cpp/src/main/java/net/sourceforge/pmd/cpd/CPPTokenizer.java b/pmd-cpp/src/main/java/net/sourceforge/pmd/cpd/CPPTokenizer.java index 26bdd74c40..18b2c93813 100644 --- a/pmd-cpp/src/main/java/net/sourceforge/pmd/cpd/CPPTokenizer.java +++ b/pmd-cpp/src/main/java/net/sourceforge/pmd/cpd/CPPTokenizer.java @@ -6,21 +6,18 @@ package net.sourceforge.pmd.cpd; import java.io.BufferedReader; import java.io.IOException; -import java.io.Reader; import java.io.StringReader; import java.util.Properties; import net.sourceforge.pmd.PMD; -import net.sourceforge.pmd.cpd.token.JavaCCTokenFilter; -import net.sourceforge.pmd.cpd.token.TokenFilter; -import net.sourceforge.pmd.lang.ast.GenericToken; +import net.sourceforge.pmd.lang.TokenManager; import net.sourceforge.pmd.lang.cpp.CppTokenManager; import net.sourceforge.pmd.util.IOUtil; /** * The C++ tokenizer. */ -public class CPPTokenizer implements Tokenizer { +public class CPPTokenizer extends JavaCCTokenizer { private boolean skipBlocks = true; private String skipBlocksStart; @@ -48,27 +45,6 @@ public class CPPTokenizer implements Tokenizer { } } - @Override - public void tokenize(SourceCode sourceCode, Tokens tokenEntries) { - StringBuilder buffer = sourceCode.getCodeBuffer(); - try (Reader reader = IOUtil.skipBOM(new StringReader(maybeSkipBlocks(buffer.toString())))) { - CppTokenManager tokenManager = new CppTokenManager(reader); - tokenManager.setFileName(sourceCode.getFileName()); - final TokenFilter tokenFilter = new JavaCCTokenFilter(tokenManager); - - GenericToken currentToken = tokenFilter.getNextToken(); - while (currentToken != null) { - tokenEntries.add(new TokenEntry(currentToken.getImage(), sourceCode.getFileName(), currentToken.getBeginLine())); - currentToken = tokenFilter.getNextToken(); - } - } catch (IOException e) { - e.printStackTrace(); - System.err.println("Error parsing " + sourceCode.getFileName()); - } finally { - tokenEntries.add(TokenEntry.getEOF()); - } - } - private String maybeSkipBlocks(String test) throws IOException { if (!skipBlocks) { return test; @@ -92,4 +68,14 @@ public class CPPTokenizer implements Tokenizer { } return filtered.toString(); } + + @Override + protected TokenManager getLexerForSource(SourceCode sourceCode) { + try { + StringBuilder buffer = sourceCode.getCodeBuffer(); + return new CppTokenManager(IOUtil.skipBOM(new StringReader(maybeSkipBlocks(buffer.toString())))); + } catch (IOException e) { + throw new RuntimeException(e); + } + } } diff --git a/pmd-cpp/src/test/java/net/sourceforge/pmd/cpd/CPPTokenizerContinuationTest.java b/pmd-cpp/src/test/java/net/sourceforge/pmd/cpd/CPPTokenizerContinuationTest.java index f5cd31bd1b..188609febd 100644 --- a/pmd-cpp/src/test/java/net/sourceforge/pmd/cpd/CPPTokenizerContinuationTest.java +++ b/pmd-cpp/src/test/java/net/sourceforge/pmd/cpd/CPPTokenizerContinuationTest.java @@ -7,6 +7,7 @@ package net.sourceforge.pmd.cpd; import static org.junit.Assert.assertEquals; import static org.junit.Assert.fail; +import java.io.IOException; import java.io.StringReader; import java.nio.charset.StandardCharsets; import java.util.ArrayList; @@ -117,7 +118,7 @@ public class CPPTokenizerContinuationTest { .getResourceAsStream("cpp/" + name), StandardCharsets.UTF_8); } - private Tokens parse(String code) { + private Tokens parse(String code) throws IOException { CPPTokenizer tokenizer = new CPPTokenizer(); tokenizer.setProperties(new Properties()); Tokens tokens = new Tokens(); diff --git a/pmd-cpp/src/test/java/net/sourceforge/pmd/cpd/CPPTokenizerTest.java b/pmd-cpp/src/test/java/net/sourceforge/pmd/cpd/CPPTokenizerTest.java index 75f45768c6..4bffb5208d 100644 --- a/pmd-cpp/src/test/java/net/sourceforge/pmd/cpd/CPPTokenizerTest.java +++ b/pmd-cpp/src/test/java/net/sourceforge/pmd/cpd/CPPTokenizerTest.java @@ -8,6 +8,7 @@ import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertNotSame; import static org.junit.Assert.assertTrue; +import java.io.IOException; import java.nio.charset.StandardCharsets; import java.util.Properties; @@ -159,14 +160,18 @@ public class CPPTokenizerTest { } private Tokens parse(String snippet) { - return parse(snippet, false, new Tokens()); + try { + return parse(snippet, false, new Tokens()); + } catch (IOException e) { + throw new RuntimeException(e); + } } - private Tokens parse(String snippet, boolean skipBlocks, Tokens tokens) { + private Tokens parse(String snippet, boolean skipBlocks, Tokens tokens) throws IOException { return parse(snippet, skipBlocks, null, tokens); } - private Tokens parse(String snippet, boolean skipBlocks, String skipPattern, Tokens tokens) { + private Tokens parse(String snippet, boolean skipBlocks, String skipPattern, Tokens tokens) throws IOException { Properties properties = new Properties(); properties.setProperty(Tokenizer.OPTION_SKIP_BLOCKS, Boolean.toString(skipBlocks)); if (skipPattern != null) { diff --git a/pmd-java/src/main/java/net/sourceforge/pmd/cpd/JavaTokenizer.java b/pmd-java/src/main/java/net/sourceforge/pmd/cpd/JavaTokenizer.java index d758704ba3..9f3ea6d792 100644 --- a/pmd-java/src/main/java/net/sourceforge/pmd/cpd/JavaTokenizer.java +++ b/pmd-java/src/main/java/net/sourceforge/pmd/cpd/JavaTokenizer.java @@ -4,21 +4,21 @@ package net.sourceforge.pmd.cpd; +import java.io.IOException; import java.io.StringReader; import java.util.Deque; import java.util.LinkedList; import java.util.Properties; import net.sourceforge.pmd.cpd.token.JavaCCTokenFilter; -import net.sourceforge.pmd.lang.LanguageRegistry; -import net.sourceforge.pmd.lang.LanguageVersionHandler; +import net.sourceforge.pmd.cpd.token.TokenFilter; import net.sourceforge.pmd.lang.TokenManager; import net.sourceforge.pmd.lang.ast.GenericToken; -import net.sourceforge.pmd.lang.java.JavaLanguageModule; +import net.sourceforge.pmd.lang.java.JavaTokenManager; import net.sourceforge.pmd.lang.java.ast.JavaParserConstants; import net.sourceforge.pmd.lang.java.ast.Token; -public class JavaTokenizer implements Tokenizer { +public class JavaTokenizer extends JavaCCTokenizer { public static final String CPD_START = "\"CPD-START\""; public static final String CPD_END = "\"CPD-END\""; @@ -27,6 +27,8 @@ public class JavaTokenizer implements Tokenizer { private boolean ignoreLiterals; private boolean ignoreIdentifiers; + private ConstructorDetector constructorDetector; + public void setProperties(Properties properties) { ignoreAnnotations = Boolean.parseBoolean(properties.getProperty(IGNORE_ANNOTATIONS, "false")); ignoreLiterals = Boolean.parseBoolean(properties.getProperty(IGNORE_LITERALS, "false")); @@ -34,48 +36,42 @@ public class JavaTokenizer implements Tokenizer { } @Override - public void tokenize(SourceCode sourceCode, Tokens tokenEntries) { - final String fileName = sourceCode.getFileName(); - final JavaTokenFilter tokenFilter = createTokenFilter(sourceCode); - final ConstructorDetector constructorDetector = new ConstructorDetector(ignoreIdentifiers); - - Token currentToken = (Token) tokenFilter.getNextToken(); - while (currentToken != null) { - processToken(tokenEntries, fileName, currentToken, constructorDetector); - currentToken = (Token) tokenFilter.getNextToken(); - } - tokenEntries.add(TokenEntry.getEOF()); + public void tokenize(SourceCode sourceCode, Tokens tokenEntries) throws IOException { + constructorDetector = new ConstructorDetector(ignoreIdentifiers); + super.tokenize(sourceCode, tokenEntries); } - private JavaTokenFilter createTokenFilter(final SourceCode sourceCode) { + @Override + protected TokenManager getLexerForSource(SourceCode sourceCode) { final StringBuilder stringBuilder = sourceCode.getCodeBuffer(); - // Note that Java version is irrelevant for tokenizing - final LanguageVersionHandler languageVersionHandler = LanguageRegistry.getLanguage(JavaLanguageModule.NAME) - .getVersion("1.4").getLanguageVersionHandler(); - final TokenManager tokenMgr = languageVersionHandler.getParser(languageVersionHandler.getDefaultParserOptions()) - .getTokenManager(sourceCode.getFileName(), new StringReader(stringBuilder.toString())); - return new JavaTokenFilter(tokenMgr, ignoreAnnotations); + return new JavaTokenManager(new StringReader(stringBuilder.toString())); } - private void processToken(Tokens tokenEntries, String fileName, Token currentToken, - ConstructorDetector constructorDetector) { - String image = currentToken.image; + @Override + protected TokenFilter getTokenFilter(TokenManager tokenManager) { + return new JavaTokenFilter(tokenManager, ignoreAnnotations); + } - constructorDetector.restoreConstructorToken(tokenEntries, currentToken); + @Override + protected TokenEntry processToken(Tokens tokenEntries, GenericToken currentToken, String fileName) { + String image = currentToken.getImage(); + Token javaToken = (Token) currentToken; - if (ignoreLiterals && (currentToken.kind == JavaParserConstants.STRING_LITERAL - || currentToken.kind == JavaParserConstants.CHARACTER_LITERAL - || currentToken.kind == JavaParserConstants.DECIMAL_LITERAL - || currentToken.kind == JavaParserConstants.FLOATING_POINT_LITERAL)) { - image = String.valueOf(currentToken.kind); + constructorDetector.restoreConstructorToken(tokenEntries, javaToken); + + if (ignoreLiterals && (javaToken.kind == JavaParserConstants.STRING_LITERAL + || javaToken.kind == JavaParserConstants.CHARACTER_LITERAL + || javaToken.kind == JavaParserConstants.DECIMAL_LITERAL + || javaToken.kind == JavaParserConstants.FLOATING_POINT_LITERAL)) { + image = String.valueOf(javaToken.kind); } - if (ignoreIdentifiers && currentToken.kind == JavaParserConstants.IDENTIFIER) { - image = String.valueOf(currentToken.kind); + if (ignoreIdentifiers && javaToken.kind == JavaParserConstants.IDENTIFIER) { + image = String.valueOf(javaToken.kind); } - constructorDetector.processToken(currentToken); + constructorDetector.processToken(javaToken); - tokenEntries.add(new TokenEntry(image, fileName, currentToken.beginLine)); + return new TokenEntry(image, fileName, currentToken.getBeginLine()); } public void setIgnoreLiterals(boolean ignore) { diff --git a/pmd-java/src/test/java/net/sourceforge/pmd/cpd/JavaTokensTokenizerTest.java b/pmd-java/src/test/java/net/sourceforge/pmd/cpd/JavaTokensTokenizerTest.java index 3291c53ad6..74cb844c65 100644 --- a/pmd-java/src/test/java/net/sourceforge/pmd/cpd/JavaTokensTokenizerTest.java +++ b/pmd-java/src/test/java/net/sourceforge/pmd/cpd/JavaTokensTokenizerTest.java @@ -88,7 +88,7 @@ public class JavaTokensTokenizerTest { } @Test - public void testIgnoreComments() { + public void testIgnoreComments() throws IOException { JavaTokenizer t = new JavaTokenizer(); t.setIgnoreAnnotations(false); SourceCode sourceCode = new SourceCode(new SourceCode.StringCodeLoader("package foo.bar.baz;" + PMD.EOL diff --git a/pmd-java/src/test/java/net/sourceforge/pmd/cpd/MatchAlgorithmTest.java b/pmd-java/src/test/java/net/sourceforge/pmd/cpd/MatchAlgorithmTest.java index 972b90d1f7..6c2e4937bb 100644 --- a/pmd-java/src/test/java/net/sourceforge/pmd/cpd/MatchAlgorithmTest.java +++ b/pmd-java/src/test/java/net/sourceforge/pmd/cpd/MatchAlgorithmTest.java @@ -7,6 +7,7 @@ package net.sourceforge.pmd.cpd; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; +import java.io.IOException; import java.util.HashMap; import java.util.Iterator; import java.util.Map; @@ -32,7 +33,7 @@ public class MatchAlgorithmTest { } @Test - public void testSimple() { + public void testSimple() throws IOException { JavaTokenizer tokenizer = new JavaTokenizer(); SourceCode sourceCode = new SourceCode(new SourceCode.StringCodeLoader(getSampleCode(), "Foo.java")); Tokens tokens = new Tokens(); @@ -63,7 +64,7 @@ public class MatchAlgorithmTest { } @Test - public void testIgnore() { + public void testIgnore() throws IOException { JavaTokenizer tokenizer = new JavaTokenizer(); tokenizer.setIgnoreLiterals(true); tokenizer.setIgnoreIdentifiers(true); diff --git a/pmd-javascript/src/main/java/net/sourceforge/pmd/cpd/EcmascriptTokenizer.java b/pmd-javascript/src/main/java/net/sourceforge/pmd/cpd/EcmascriptTokenizer.java index bafbde5a9f..fed848801b 100644 --- a/pmd-javascript/src/main/java/net/sourceforge/pmd/cpd/EcmascriptTokenizer.java +++ b/pmd-javascript/src/main/java/net/sourceforge/pmd/cpd/EcmascriptTokenizer.java @@ -4,51 +4,38 @@ package net.sourceforge.pmd.cpd; -import java.io.IOException; -import java.io.Reader; import java.io.StringReader; -import net.sourceforge.pmd.cpd.token.JavaCCTokenFilter; -import net.sourceforge.pmd.cpd.token.TokenFilter; -import net.sourceforge.pmd.lang.LanguageRegistry; -import net.sourceforge.pmd.lang.LanguageVersionHandler; -import net.sourceforge.pmd.lang.ecmascript.EcmascriptLanguageModule; +import net.sourceforge.pmd.lang.TokenManager; +import net.sourceforge.pmd.lang.ast.GenericToken; +import net.sourceforge.pmd.lang.ecmascript5.Ecmascript5TokenManager; import net.sourceforge.pmd.lang.ecmascript5.ast.Ecmascript5ParserConstants; import net.sourceforge.pmd.lang.ecmascript5.ast.Token; +import net.sourceforge.pmd.util.IOUtil; /** * The Ecmascript Tokenizer */ -public class EcmascriptTokenizer implements Tokenizer { +public class EcmascriptTokenizer extends JavaCCTokenizer { @Override - public void tokenize(SourceCode sourceCode, Tokens tokenEntries) { + protected TokenManager getLexerForSource(SourceCode sourceCode) { StringBuilder buffer = sourceCode.getCodeBuffer(); - try (Reader reader = new StringReader(buffer.toString())) { - LanguageVersionHandler languageVersionHandler = LanguageRegistry.getLanguage(EcmascriptLanguageModule.NAME) - .getDefaultVersion().getLanguageVersionHandler(); - TokenFilter tokenFilter = new JavaCCTokenFilter(languageVersionHandler - .getParser(languageVersionHandler.getDefaultParserOptions()) - .getTokenManager(sourceCode.getFileName(), reader)); - Token currentToken = (Token) tokenFilter.getNextToken(); - while (currentToken != null) { - tokenEntries.add( - new TokenEntry(getTokenImage(currentToken), sourceCode.getFileName(), currentToken.beginLine)); - currentToken = (Token) tokenFilter.getNextToken(); - } - } catch (IOException e) { - e.printStackTrace(); - } finally { - tokenEntries.add(TokenEntry.getEOF()); - } + return new Ecmascript5TokenManager(IOUtil.skipBOM(new StringReader(buffer.toString()))); } - private String getTokenImage(Token token) { + @Override + protected TokenEntry processToken(Tokens tokenEntries, GenericToken currentToken, String filename) { + return new TokenEntry(getTokenImage(currentToken), filename, currentToken.getBeginLine()); + } + + private String getTokenImage(GenericToken token) { + Token jsToken = (Token) token; // Remove line continuation characters from string literals - if (token.kind == Ecmascript5ParserConstants.STRING_LITERAL - || token.kind == Ecmascript5ParserConstants.UNTERMINATED_STRING_LITERAL) { - return token.image.replaceAll("(?