From 579c134061211392008a1bdf8b50c745dc3df729 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Fournier?= Date: Sun, 16 Feb 2020 21:17:10 +0100 Subject: [PATCH] Port python module --- .../pmd/cpd/internal/JavaCCTokenizer.java | 7 +- .../etc/grammar/{python.jj => Python.jj} | 139 +++++------------- pmd-python/pom.xml | 6 +- pmd-python/src/main/ant/alljavacc.xml | 112 -------------- .../sourceforge/pmd/cpd/PythonTokenizer.java | 25 +++- .../pmd/lang/python/PythonTokenManager.java | 38 ----- .../lang/python/ast/PythonTokenManager.java | 53 +++++++ .../pmd/cpd/PythonTokenizerTest.java | 21 ++- 8 files changed, 136 insertions(+), 265 deletions(-) rename pmd-python/etc/grammar/{python.jj => Python.jj} (56%) delete mode 100644 pmd-python/src/main/ant/alljavacc.xml delete mode 100644 pmd-python/src/main/java/net/sourceforge/pmd/lang/python/PythonTokenManager.java create mode 100644 pmd-python/src/main/java/net/sourceforge/pmd/lang/python/ast/PythonTokenManager.java diff --git a/pmd-core/src/main/java/net/sourceforge/pmd/cpd/internal/JavaCCTokenizer.java b/pmd-core/src/main/java/net/sourceforge/pmd/cpd/internal/JavaCCTokenizer.java index 2721f5cbc6..f12861067a 100644 --- a/pmd-core/src/main/java/net/sourceforge/pmd/cpd/internal/JavaCCTokenizer.java +++ b/pmd-core/src/main/java/net/sourceforge/pmd/cpd/internal/JavaCCTokenizer.java @@ -14,6 +14,7 @@ import net.sourceforge.pmd.cpd.token.JavaCCTokenFilter; import net.sourceforge.pmd.cpd.token.TokenFilter; import net.sourceforge.pmd.lang.TokenManager; import net.sourceforge.pmd.lang.ast.GenericToken; +import net.sourceforge.pmd.lang.ast.impl.javacc.JavaccToken; public abstract class JavaCCTokenizer implements Tokenizer { @@ -24,7 +25,11 @@ public abstract class JavaCCTokenizer implements Tokenizer { } protected TokenEntry processToken(Tokens tokenEntries, GenericToken currentToken, String filename) { - return new TokenEntry(currentToken.getImage(), filename, currentToken.getBeginLine(), currentToken.getBeginColumn(), currentToken.getEndColumn()); + return new TokenEntry(getImage((JavaccToken) currentToken), filename, currentToken.getBeginLine(), currentToken.getBeginColumn(), currentToken.getEndColumn()); + } + + protected String getImage(JavaccToken token) { + return token.getImage(); } @Override diff --git a/pmd-python/etc/grammar/python.jj b/pmd-python/etc/grammar/Python.jj similarity index 56% rename from pmd-python/etc/grammar/python.jj rename to pmd-python/etc/grammar/Python.jj index 1f97f9d948..b9c2313b62 100644 --- a/pmd-python/etc/grammar/python.jj +++ b/pmd-python/etc/grammar/Python.jj @@ -1,30 +1,30 @@ /** - * This Python 2.7 grammar was copied from the PyDev Project. (http://www.pydev.org/) - * - * Original source file: + * This Python 2.7 grammar was copied from the PyDev Project. (http://www.pydev.org/) + * + * Original source file: * https://github.com/aptana/Pydev/blob/development/plugins/org.python.pydev.parser/src/org/python/pydev/parser/grammar27/python.jjt (commit 32950d534139f286e03d34795aec99edab09c04c) */ - + options { BUILD_PARSER=false; CACHE_TOKENS=true; STATIC=false; UNICODE_INPUT = true; - USER_CHAR_STREAM=true; + USER_CHAR_STREAM=true; } -PARSER_BEGIN(PythonParser) +PARSER_BEGIN(PythonParserImpl) package net.sourceforge.pmd.lang.python.ast; import net.sourceforge.pmd.lang.ast.CharStream; import net.sourceforge.pmd.lang.ast.TokenMgrError; -public class PythonParser { +public class PythonParserImpl { } -PARSER_END(PythonParser) +PARSER_END(PythonParserImpl) SKIP : { @@ -175,110 +175,49 @@ MORE : /* Strings */ | < (["r", "R"])? "\"\"\"" > : IN_STRING23 } - TOKEN : { { - matchedToken.image = image.toString(); } : DEFAULT} - TOKEN : { { - matchedToken.image = image.toString(); } : DEFAULT} - TOKEN : { { - matchedToken.image = image.toString(); } : DEFAULT} - TOKEN : { { - matchedToken.image = image.toString(); } : DEFAULT} + TOKEN : { : DEFAULT} + TOKEN : { : DEFAULT} + TOKEN : { : DEFAULT} + TOKEN : { : DEFAULT} - TOKEN : { { - matchedToken.image = image.toString(); } : DEFAULT} - TOKEN : { { - matchedToken.image = image.toString(); } : DEFAULT} - TOKEN : { { - matchedToken.image = image.toString(); } : DEFAULT} - TOKEN : { { - matchedToken.image = image.toString(); } : DEFAULT} + TOKEN : { : DEFAULT} + TOKEN : { : DEFAULT} + TOKEN : { : DEFAULT} + TOKEN : { : DEFAULT} - TOKEN : { { - matchedToken.image = image.toString(); } : DEFAULT} - TOKEN : { { - matchedToken.image = image.toString(); } : DEFAULT} - TOKEN : { { - matchedToken.image = image.toString(); } : DEFAULT} - TOKEN : { { - matchedToken.image = image.toString(); } : DEFAULT} + TOKEN : { : DEFAULT} + TOKEN : { : DEFAULT} + TOKEN : { : DEFAULT} + TOKEN : { : DEFAULT} - MORE: -{ - <"\\\r\n"> { image.setLength(image.length()-3); } : IN_STRING1NLC -| <("\\" ("\n"|"\r"))> { image.setLength(image.length()-2); } : IN_STRING1NLC + + MORE: { + "\\'" +} + MORE: { + "\\\"" } - MORE: + MORE: { - <"\\\r\n"> { image.setLength(image.length()-3); } : IN_STRING2NLC -| <("\\" ("\n"|"\r"))> { image.setLength(image.length()-2); } : IN_STRING2NLC + // escaping a newline ignores it, this is handled by the token document + "\\\r\n" | "\\\n" | "\\\r" + | "\\\\" + | < ~["\n","\r"] > } - MORE: -{ - <"\\\r\n"> { image.setLength(image.length()-3); } : IN_USTRING1NLC -| <("\\" ("\n"|"\r"))> { image.setLength(image.length()-2); } : IN_USTRING1NLC -} - - MORE: -{ - <"\\\r\n"> { image.setLength(image.length()-3); } : IN_USTRING2NLC -| <("\\" ("\n"|"\r"))> { image.setLength(image.length()-2); } : IN_USTRING2NLC -} - - MORE: -{ - <"\\\r\n"> { image.setLength(image.length()-3); } : IN_BSTRING1NLC -| <("\\" ("\n"|"\r"))> { image.setLength(image.length()-2); } : IN_BSTRING1NLC -} - - MORE: -{ - <"\\\r\n"> { image.setLength(image.length()-3); } : IN_BSTRING2NLC -| <("\\" ("\n"|"\r"))> { image.setLength(image.length()-2); } : IN_BSTRING2NLC -} - - MORE: -{ - <""> : IN_STRING11 -} - - MORE: -{ - <""> : IN_STRING21 -} - - MORE: -{ - <""> : IN_USTRING11 -} - - MORE: -{ - <""> : IN_USTRING21 -} - - MORE: -{ - <""> : IN_BSTRING11 -} - - MORE: -{ - <""> : IN_BSTRING21 -} - - MORE: { <("\\" ("\\"|"'")) | ~["\n","\r"]> } - MORE: { <("\\" ("\\"|"\"")) | ~["\n","\r"]> } +// NLs are normalized in triple-quoted strings MORE: { - <"\r\n"> { + "\r\n" { int l = image.length(); image.setLength(l-1); image.setCharAt(l-2, '\n'); } -| <"\n"> -| <"\r"> { image.setCharAt(image.length()-1, '\n'); } -| <~["\n","\r"]> -| <"\\" ~["\n","\r"]> +| "\n" +| "\r" { image.setCharAt(image.length()-1, '\n'); } +| < ~["\n","\r"] > +| < "\\" ~["\n","\r"] > } diff --git a/pmd-python/pom.xml b/pmd-python/pom.xml index 92b9798e44..ba6f9dbf75 100644 --- a/pmd-python/pom.xml +++ b/pmd-python/pom.xml @@ -32,8 +32,10 @@ generate-sources - - + + + + diff --git a/pmd-python/src/main/ant/alljavacc.xml b/pmd-python/src/main/ant/alljavacc.xml deleted file mode 100644 index 1996f86bf1..0000000000 --- a/pmd-python/src/main/ant/alljavacc.xml +++ /dev/null @@ -1,112 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - public class Token implements java.io.Serializable - - - - - - public Token specialToken; - - - - - - - - - - - - - - - - - - - - - diff --git a/pmd-python/src/main/java/net/sourceforge/pmd/cpd/PythonTokenizer.java b/pmd-python/src/main/java/net/sourceforge/pmd/cpd/PythonTokenizer.java index e88b1476cf..e65c0d80b6 100644 --- a/pmd-python/src/main/java/net/sourceforge/pmd/cpd/PythonTokenizer.java +++ b/pmd-python/src/main/java/net/sourceforge/pmd/cpd/PythonTokenizer.java @@ -5,10 +5,13 @@ package net.sourceforge.pmd.cpd; import java.io.StringReader; +import java.util.regex.Pattern; import net.sourceforge.pmd.cpd.internal.JavaCCTokenizer; import net.sourceforge.pmd.lang.TokenManager; -import net.sourceforge.pmd.lang.python.PythonTokenManager; +import net.sourceforge.pmd.lang.ast.impl.javacc.JavaccToken; +import net.sourceforge.pmd.lang.python.ast.PythonTokenKinds; +import net.sourceforge.pmd.lang.python.ast.PythonTokenManager; import net.sourceforge.pmd.util.IOUtil; /** @@ -16,9 +19,29 @@ import net.sourceforge.pmd.util.IOUtil; */ public class PythonTokenizer extends JavaCCTokenizer { + private static final Pattern STRING_NL_ESCAPE = Pattern.compile("\\\\\\r?\\n"); + @Override protected TokenManager getLexerForSource(SourceCode sourceCode) { StringBuilder buffer = sourceCode.getCodeBuffer(); return new PythonTokenManager(IOUtil.skipBOM(new StringReader(buffer.toString()))); } + + @Override + protected String getImage(JavaccToken token) { + switch (token.kind) { + case PythonTokenKinds.SINGLE_STRING: + case PythonTokenKinds.SINGLE_STRING2: + case PythonTokenKinds.SINGLE_BSTRING: + case PythonTokenKinds.SINGLE_BSTRING2: + case PythonTokenKinds.SINGLE_USTRING: + case PythonTokenKinds.SINGLE_USTRING2: + // linebreak escapes, only for single-quoted strings + // todo other escapes? + return STRING_NL_ESCAPE.matcher(token.getImage()).replaceAll(""); + default: + return token.getImage(); + } + } + } diff --git a/pmd-python/src/main/java/net/sourceforge/pmd/lang/python/PythonTokenManager.java b/pmd-python/src/main/java/net/sourceforge/pmd/lang/python/PythonTokenManager.java deleted file mode 100644 index 881c190b57..0000000000 --- a/pmd-python/src/main/java/net/sourceforge/pmd/lang/python/PythonTokenManager.java +++ /dev/null @@ -1,38 +0,0 @@ -/** - * BSD-style license; for more info see http://pmd.sourceforge.net/license.html - */ - -package net.sourceforge.pmd.lang.python; - -import java.io.Reader; - -import net.sourceforge.pmd.lang.TokenManager; -import net.sourceforge.pmd.lang.ast.impl.javacc.CharStreamFactory; -import net.sourceforge.pmd.lang.python.ast.PythonParserTokenManager; - -/** - * Python Token Manager implementation. - */ -public class PythonTokenManager implements TokenManager { - private final PythonParserTokenManager tokenManager; - - /** - * Creates a new Python Token Manager from the given source code. - * - * @param source - * the source code - */ - public PythonTokenManager(Reader source) { - tokenManager = new PythonParserTokenManager(CharStreamFactory.simpleCharStream(source)); - } - - @Override - public Object getNextToken() { - return tokenManager.getNextToken(); - } - - @Override - public void setFileName(String fileName) { - PythonParserTokenManager.setFileName(fileName); - } -} diff --git a/pmd-python/src/main/java/net/sourceforge/pmd/lang/python/ast/PythonTokenManager.java b/pmd-python/src/main/java/net/sourceforge/pmd/lang/python/ast/PythonTokenManager.java new file mode 100644 index 0000000000..72b8095e16 --- /dev/null +++ b/pmd-python/src/main/java/net/sourceforge/pmd/lang/python/ast/PythonTokenManager.java @@ -0,0 +1,53 @@ +/* + * BSD-style license; for more info see http://pmd.sourceforge.net/license.html + */ + +package net.sourceforge.pmd.lang.python.ast; + +import java.io.Reader; + +import org.checkerframework.checker.nullness.qual.Nullable; + +import net.sourceforge.pmd.lang.TokenManager; +import net.sourceforge.pmd.lang.ast.impl.javacc.CharStreamFactory; +import net.sourceforge.pmd.lang.ast.impl.javacc.JavaccTokenDocument; + +/** + * Python Token Manager implementation. + */ +public class PythonTokenManager implements TokenManager { + private final PythonParserImplTokenManager tokenManager; + + /** + * Creates a new Python Token Manager from the given source code. + * + * @param source + * the source code + */ + public PythonTokenManager(Reader source) { + tokenManager = new PythonParserImplTokenManager(CharStreamFactory.simpleCharStream(source, PythonTokenDocument::new)); + } + + @Override + public Object getNextToken() { + return tokenManager.getNextToken(); + } + + @Override + public void setFileName(String fileName) { + PythonParserImplTokenManager.setFileName(fileName); + } + + private static class PythonTokenDocument extends JavaccTokenDocument { + + PythonTokenDocument(String fullText) { + super(fullText); + } + + @Override + protected @Nullable String describeKindImpl(int kind) { + return PythonTokenKinds.describe(kind); + } + + } +} diff --git a/pmd-python/src/test/java/net/sourceforge/pmd/cpd/PythonTokenizerTest.java b/pmd-python/src/test/java/net/sourceforge/pmd/cpd/PythonTokenizerTest.java index 6896a14d62..7a15fea69c 100644 --- a/pmd-python/src/test/java/net/sourceforge/pmd/cpd/PythonTokenizerTest.java +++ b/pmd-python/src/test/java/net/sourceforge/pmd/cpd/PythonTokenizerTest.java @@ -13,7 +13,6 @@ import org.apache.commons.io.IOUtils; import org.junit.Before; import org.junit.Test; -import net.sourceforge.pmd.PMD; import net.sourceforge.pmd.testframework.AbstractTokenizerTest; public class PythonTokenizerTest extends AbstractTokenizerTest { @@ -40,13 +39,13 @@ public class PythonTokenizerTest extends AbstractTokenizerTest { @Test public void testIgnoreBetweenSpecialComments() throws IOException { - SourceCode sourceCode = new SourceCode(new SourceCode.StringCodeLoader("import logging" + PMD.EOL - + "# CPD-OFF" + PMD.EOL - + "logger = logging.getLogger('django.request')" + PMD.EOL - + "class BaseHandler(object):" + PMD.EOL - + " def __init__(self):" + PMD.EOL - + " self._request_middleware = None" + PMD.EOL - + " # CPD-ON" + PMD.EOL + SourceCode sourceCode = new SourceCode(new SourceCode.StringCodeLoader("import logging\n" + + "# CPD-OFF\n" + + "logger = logging.getLogger('django.request')\n" + + "class BaseHandler(object):\n" + + " def __init__(self):\n" + + " self._request_middleware = None\n" + + " # CPD-ON\n" )); Tokens tokens = new Tokens(); tokenizer.tokenize(sourceCode, tokens); @@ -56,9 +55,9 @@ public class PythonTokenizerTest extends AbstractTokenizerTest { @Test public void testBackticks() throws IOException { - SourceCode sourceCode = new SourceCode(new SourceCode.StringCodeLoader("test = 'hello'" + PMD.EOL - + "quoted = `test`" + PMD.EOL - + "print quoted" + PMD.EOL + SourceCode sourceCode = new SourceCode(new SourceCode.StringCodeLoader("test = 'hello'\n" + + "quoted = `test`\n" + + "print quoted\n" )); Tokens tokens = new Tokens(); tokenizer.tokenize(sourceCode, tokens); // should not result in parse error