Port python module

This commit is contained in:
Clément Fournier 2020-02-16 21:17:10 +01:00
parent 8ab362c0ad
commit 579c134061
8 changed files with 136 additions and 265 deletions

View File

@ -14,6 +14,7 @@ import net.sourceforge.pmd.cpd.token.JavaCCTokenFilter;
import net.sourceforge.pmd.cpd.token.TokenFilter;
import net.sourceforge.pmd.lang.TokenManager;
import net.sourceforge.pmd.lang.ast.GenericToken;
import net.sourceforge.pmd.lang.ast.impl.javacc.JavaccToken;
public abstract class JavaCCTokenizer implements Tokenizer {
@ -24,7 +25,11 @@ public abstract class JavaCCTokenizer implements Tokenizer {
}
protected TokenEntry processToken(Tokens tokenEntries, GenericToken currentToken, String filename) {
return new TokenEntry(currentToken.getImage(), filename, currentToken.getBeginLine(), currentToken.getBeginColumn(), currentToken.getEndColumn());
return new TokenEntry(getImage((JavaccToken) currentToken), filename, currentToken.getBeginLine(), currentToken.getBeginColumn(), currentToken.getEndColumn());
}
/**
 * Returns the image that {@code processToken} stores in the
 * {@link TokenEntry} created for the given token.
 *
 * <p>This default implementation returns the token's own image unchanged.
 * The method is a hook: language-specific tokenizers can override it to
 * adjust the image before it is recorded.
 *
 * @param token the token whose image is requested
 * @return the image to record for {@code token}
 */
protected String getImage(JavaccToken token) {
return token.getImage();
}
@Override

View File

@ -1,30 +1,30 @@
/**
* This Python 2.7 grammar was copied from the PyDev Project. (http://www.pydev.org/)
*
* Original source file:
* This Python 2.7 grammar was copied from the PyDev Project. (http://www.pydev.org/)
*
* Original source file:
* https://github.com/aptana/Pydev/blob/development/plugins/org.python.pydev.parser/src/org/python/pydev/parser/grammar27/python.jjt (commit 32950d534139f286e03d34795aec99edab09c04c)
*/
options {
BUILD_PARSER=false;
CACHE_TOKENS=true;
STATIC=false;
UNICODE_INPUT = true;
USER_CHAR_STREAM=true;
USER_CHAR_STREAM=true;
}
PARSER_BEGIN(PythonParser)
PARSER_BEGIN(PythonParserImpl)
package net.sourceforge.pmd.lang.python.ast;
import net.sourceforge.pmd.lang.ast.CharStream;
import net.sourceforge.pmd.lang.ast.TokenMgrError;
public class PythonParser {
public class PythonParserImpl {
}
PARSER_END(PythonParser)
PARSER_END(PythonParserImpl)
SKIP :
{
@ -175,110 +175,49 @@ MORE : /* Strings */
| < (["r", "R"])? "\"\"\"" > : IN_STRING23
}
<IN_STRING11> TOKEN : { <SINGLE_STRING: "'"> {
matchedToken.image = image.toString(); } : DEFAULT}
<IN_STRING21> TOKEN : { <SINGLE_STRING2: "\""> {
matchedToken.image = image.toString(); } : DEFAULT}
<IN_STRING13> TOKEN : { <TRIPLE_STRING: "'''"> {
matchedToken.image = image.toString(); } : DEFAULT}
<IN_STRING23> TOKEN : { <TRIPLE_STRING2: "\"\"\""> {
matchedToken.image = image.toString(); } : DEFAULT}
<IN_STRING11> TOKEN : { <SINGLE_STRING: "'"> : DEFAULT}
<IN_STRING21> TOKEN : { <SINGLE_STRING2: "\""> : DEFAULT}
<IN_STRING13> TOKEN : { <TRIPLE_STRING: "'''"> : DEFAULT}
<IN_STRING23> TOKEN : { <TRIPLE_STRING2: "\"\"\""> : DEFAULT}
<IN_BSTRING11> TOKEN : { <SINGLE_BSTRING: "'"> {
matchedToken.image = image.toString(); } : DEFAULT}
<IN_BSTRING21> TOKEN : { <SINGLE_BSTRING2: "\""> {
matchedToken.image = image.toString(); } : DEFAULT}
<IN_BSTRING13> TOKEN : { <TRIPLE_BSTRING: "'''"> {
matchedToken.image = image.toString(); } : DEFAULT}
<IN_BSTRING23> TOKEN : { <TRIPLE_BSTRING2: "\"\"\""> {
matchedToken.image = image.toString(); } : DEFAULT}
<IN_BSTRING11> TOKEN : { <SINGLE_BSTRING: "'"> : DEFAULT}
<IN_BSTRING21> TOKEN : { <SINGLE_BSTRING2: "\""> : DEFAULT}
<IN_BSTRING13> TOKEN : { <TRIPLE_BSTRING: "'''"> : DEFAULT}
<IN_BSTRING23> TOKEN : { <TRIPLE_BSTRING2: "\"\"\"">: DEFAULT}
<IN_USTRING11> TOKEN : { <SINGLE_USTRING: "'"> {
matchedToken.image = image.toString(); } : DEFAULT}
<IN_USTRING21> TOKEN : { <SINGLE_USTRING2: "\""> {
matchedToken.image = image.toString(); } : DEFAULT}
<IN_USTRING13> TOKEN : { <TRIPLE_USTRING: "'''"> {
matchedToken.image = image.toString(); } : DEFAULT}
<IN_USTRING23> TOKEN : { <TRIPLE_USTRING2: "\"\"\""> {
matchedToken.image = image.toString(); } : DEFAULT}
<IN_USTRING11> TOKEN : { <SINGLE_USTRING: "'"> : DEFAULT}
<IN_USTRING21> TOKEN : { <SINGLE_USTRING2: "\""> : DEFAULT}
<IN_USTRING13> TOKEN : { <TRIPLE_USTRING: "'''"> : DEFAULT}
<IN_USTRING23> TOKEN : { <TRIPLE_USTRING2: "\"\"\"">: DEFAULT}
<IN_STRING11> MORE:
{
<"\\\r\n"> { image.setLength(image.length()-3); } : IN_STRING1NLC
| <("\\" ("\n"|"\r"))> { image.setLength(image.length()-2); } : IN_STRING1NLC
<IN_STRING11, IN_USTRING11, IN_BSTRING11> MORE: {
"\\'"
}
<IN_STRING21, IN_USTRING21, IN_BSTRING21> MORE: {
"\\\""
}
<IN_STRING21> MORE:
<IN_STRING11, IN_STRING21,
IN_USTRING11, IN_USTRING21,
IN_BSTRING11, IN_BSTRING21> MORE:
{
<"\\\r\n"> { image.setLength(image.length()-3); } : IN_STRING2NLC
| <("\\" ("\n"|"\r"))> { image.setLength(image.length()-2); } : IN_STRING2NLC
// escaping a newline ignores it, this is handled by the token document
"\\\r\n" | "\\\n" | "\\\r"
| "\\\\"
| < ~["\n","\r"] >
}
<IN_USTRING11> MORE:
{
<"\\\r\n"> { image.setLength(image.length()-3); } : IN_USTRING1NLC
| <("\\" ("\n"|"\r"))> { image.setLength(image.length()-2); } : IN_USTRING1NLC
}
<IN_USTRING21> MORE:
{
<"\\\r\n"> { image.setLength(image.length()-3); } : IN_USTRING2NLC
| <("\\" ("\n"|"\r"))> { image.setLength(image.length()-2); } : IN_USTRING2NLC
}
<IN_BSTRING11> MORE:
{
<"\\\r\n"> { image.setLength(image.length()-3); } : IN_BSTRING1NLC
| <("\\" ("\n"|"\r"))> { image.setLength(image.length()-2); } : IN_BSTRING1NLC
}
<IN_BSTRING21> MORE:
{
<"\\\r\n"> { image.setLength(image.length()-3); } : IN_BSTRING2NLC
| <("\\" ("\n"|"\r"))> { image.setLength(image.length()-2); } : IN_BSTRING2NLC
}
<IN_STRING1NLC> MORE:
{
<""> : IN_STRING11
}
<IN_STRING2NLC> MORE:
{
<""> : IN_STRING21
}
<IN_USTRING1NLC> MORE:
{
<""> : IN_USTRING11
}
<IN_USTRING2NLC> MORE:
{
<""> : IN_USTRING21
}
<IN_BSTRING1NLC> MORE:
{
<""> : IN_BSTRING11
}
<IN_BSTRING2NLC> MORE:
{
<""> : IN_BSTRING21
}
<IN_STRING11, IN_USTRING11, IN_BSTRING11> MORE: { <("\\" ("\\"|"'")) | ~["\n","\r"]> }
<IN_STRING21, IN_USTRING21, IN_BSTRING21> MORE: { <("\\" ("\\"|"\"")) | ~["\n","\r"]> }
// NLs are normalized in triple-quoted strings
<IN_STRING13, IN_STRING23, IN_USTRING13, IN_USTRING23, IN_BSTRING13, IN_BSTRING23> MORE:
{
<"\r\n"> {
"\r\n" {
int l = image.length();
image.setLength(l-1);
image.setCharAt(l-2, '\n');
}
| <"\n">
| <"\r"> { image.setCharAt(image.length()-1, '\n'); }
| <~["\n","\r"]>
| <"\\" ~["\n","\r"]>
| "\n"
| "\r" { image.setCharAt(image.length()-1, '\n'); }
| < ~["\n","\r"] >
| < "\\" ~["\n","\r"] >
}

View File

@ -32,8 +32,10 @@
<phase>generate-sources</phase>
<configuration>
<target>
<ant antfile="src/main/ant/alljavacc.xml">
<property name="target" value="${project.build.directory}/generated-sources/javacc" />
<ant antfile="${javacc.ant.wrapper}" target="alljavacc-visitor+">
<property name="no-jjtree" value="true"/> <!-- This is a CPD module -->
<property name="lang-name" value="Python" />
<property name="lang-terse-name" value="python" />
<property name="javacc.jar" value="${javacc.jar}" />
</ant>
</target>

View File

@ -1,112 +0,0 @@
<!--
  Ant build that generates the Python lexer sources from etc/grammar/python.jj
  and patches the generated files so they plug into PMD's token abstractions.
  The `target` and `javacc.jar` properties are not defined here; they are
  presumably supplied by the invoking build (TODO confirm against the caller).
-->
<project name="pmd" default="alljavacc" basedir="../../">
<!-- Scratch directory the JavaCC jar is copied into for the duration of the build. -->
<property name="javacc-home.path" value="target/lib" />
<!-- Entry point: up-to-date check, setup, Python grammar generation, then cleanup. -->
<target name="alljavacc"
description="Generates all JavaCC aspects within PMD"
depends="checkUpToDate,init,pythonjavacc,cleanup" />
<!-- Sets javaccBuildNotRequired when the timestamp file is newer than every grammar file. -->
<target name="checkUpToDate">
<uptodate property="javaccBuildNotRequired" targetfile="${target}/last-generated-timestamp">
<srcfiles dir="etc/grammar" includes="*.jj*"/>
</uptodate>
<echo message="up to date check: javaccBuildNotRequired=${javaccBuildNotRequired}"/>
</target>
<!-- Skipped when nothing changed; otherwise stages the JavaCC jar and refreshes the timestamp file. -->
<target name="init" unless="javaccBuildNotRequired">
<mkdir dir="${javacc-home.path}" />
<copy file="${javacc.jar}" tofile="${javacc-home.path}/javacc.jar" />
<mkdir dir="${target}"/>
<touch file="${target}/last-generated-timestamp"/>
</target>
<!-- Removes the scratch JavaCC home; has no `unless`, so it is not gated on the up-to-date check. -->
<target name="cleanup">
<delete dir="${javacc-home.path}" />
</target>
<!-- Regenerates the Python lexer from scratch and post-processes the generated sources. -->
<target name="pythonjavacc" description="Generates the Python grammar" unless="javaccBuildNotRequired">
<!-- Start from an empty output directory so stale generated files cannot survive. -->
<delete dir="${target}/net/sourceforge/pmd/lang/python/ast" />
<mkdir dir="${target}/net/sourceforge/pmd/lang/python/ast" />
<!-- Ensure generated using CharStream interface -->
<javacc static="false"
usercharstream="true"
target="etc/grammar/python.jj"
outputdirectory="${target}/net/sourceforge/pmd/lang/python/ast"
javacchome="${javacc-home.path}" />
<!-- Make the generated token manager extend PMD's AbstractTokenManager. -->
<replace file="${target}/net/sourceforge/pmd/lang/python/ast/PythonParserTokenManager.java"
token="class PythonParserTokenManager"
value="class PythonParserTokenManager extends net.sourceforge.pmd.lang.ast.AbstractTokenManager" />
<!-- Drop the generated copies of these types; the grammar imports CharStream and
     TokenMgrError from net.sourceforge.pmd.lang.ast instead. -->
<delete file="${target}/net/sourceforge/pmd/lang/python/ast/CharStream.java" />
<delete file="${target}/net/sourceforge/pmd/lang/python/ast/ParseException.java" />
<delete file="${target}/net/sourceforge/pmd/lang/python/ast/TokenMgrError.java" />
<!-- Make the generated Token class implement GenericToken. -->
<replace file="${target}/net/sourceforge/pmd/lang/python/ast/Token.java">
<replacetoken>public class Token implements java.io.Serializable</replacetoken>
<replacevalue><![CDATA[import net.sourceforge.pmd.lang.ast.GenericToken;
public class Token implements GenericToken, java.io.Serializable]]></replacevalue>
</replace>
<!--Add implementation methods of GenericToken-->
<!-- The accessors are injected right after the specialToken field and expose the
     generated Token's public fields. -->
<replace file="${target}/net/sourceforge/pmd/lang/python/ast/Token.java">
<replacetoken>public Token specialToken;</replacetoken>
<replacevalue><![CDATA[public Token specialToken;
@Override
public GenericToken getNext() {
return next;
}
@Override
public GenericToken getPreviousComment() {
return specialToken;
}
@Override
public String getImage() {
return image;
}
@Override
public int getBeginLine() {
return beginLine;
}
@Override
public int getEndLine() {
return endLine;
}
@Override
public int getBeginColumn() {
return beginColumn;
}
@Override
public int getEndColumn() {
return endColumn;
}
]]></replacevalue>
</replace>
<!-- Prefix class/interface declarations in the generated sources (everything in the
     output directory except AST*.java) with @Deprecated and @InternalApi. -->
<replaceregexp>
<regexp pattern="class|interface" />
<substitution expression="@Deprecated @net.sourceforge.pmd.annotation.InternalApi \0" />
<fileset dir="${target}/net/sourceforge/pmd/lang/python/ast">
<exclude name="AST*.java" />
</fileset>
</replaceregexp>
<!-- NOTE(review): ParseException.java is deleted earlier in this target, so this
     replacement appears to have no file left to act on; confirm. -->
<replaceregexp>
<regexp pattern="public class ParseException " />
<substitution expression=" /** @deprecated Use superclass {@link net.sourceforge.pmd.lang.ast.ParseException} */
@Deprecated @net.sourceforge.pmd.annotation.InternalApi \0" />
<fileset file="${target}/net/sourceforge/pmd/lang/python/ast/ParseException.java"/>
</replaceregexp>
</target>
</project>

View File

@ -5,10 +5,13 @@
package net.sourceforge.pmd.cpd;
import java.io.StringReader;
import java.util.regex.Pattern;
import net.sourceforge.pmd.cpd.internal.JavaCCTokenizer;
import net.sourceforge.pmd.lang.TokenManager;
import net.sourceforge.pmd.lang.python.PythonTokenManager;
import net.sourceforge.pmd.lang.ast.impl.javacc.JavaccToken;
import net.sourceforge.pmd.lang.python.ast.PythonTokenKinds;
import net.sourceforge.pmd.lang.python.ast.PythonTokenManager;
import net.sourceforge.pmd.util.IOUtil;
/**
@ -16,9 +19,29 @@ import net.sourceforge.pmd.util.IOUtil;
*/
public class PythonTokenizer extends JavaCCTokenizer {

    /**
     * Matches an escaped line break inside a string image: a backslash
     * immediately followed by CRLF, a lone CR, or a lone LF.
     *
     * <p>The lone-CR form is included because the Python grammar accepts
     * backslash + CR as an escaped newline in single-quoted strings, just
     * like backslash + LF and backslash + CRLF.
     */
    private static final Pattern STRING_NL_ESCAPE = Pattern.compile("\\\\(?:\\r\\n?|\\n)");

    /**
     * Creates the lexer for the given source.
     *
     * @param sourceCode the source whose code buffer is tokenized
     * @return a Python token manager reading the buffer's text, after it
     *         has been passed through {@code IOUtil.skipBOM}
     */
    @Override
    protected TokenManager getLexerForSource(SourceCode sourceCode) {
        StringBuilder buffer = sourceCode.getCodeBuffer();
        return new PythonTokenManager(IOUtil.skipBOM(new StringReader(buffer.toString())));
    }

    /**
     * Returns the image recorded for a token. For the single-quoted string
     * kinds (plain, byte and unicode variants), escaped line breaks are
     * removed from the image; every other token keeps its image unchanged.
     *
     * @param token the token whose image is requested
     * @return the image, with escaped line breaks stripped for
     *         single-quoted string tokens
     */
    @Override
    protected String getImage(JavaccToken token) {
        switch (token.kind) {
        case PythonTokenKinds.SINGLE_STRING:
        case PythonTokenKinds.SINGLE_STRING2:
        case PythonTokenKinds.SINGLE_BSTRING:
        case PythonTokenKinds.SINGLE_BSTRING2:
        case PythonTokenKinds.SINGLE_USTRING:
        case PythonTokenKinds.SINGLE_USTRING2:
            // Line-break escapes are only stripped for single-quoted strings;
            // triple-quoted string images are returned as-is below.
            // todo other escapes?
            return STRING_NL_ESCAPE.matcher(token.getImage()).replaceAll("");
        default:
            return token.getImage();
        }
    }
}

View File

@ -1,38 +0,0 @@
/**
* BSD-style license; for more info see http://pmd.sourceforge.net/license.html
*/
package net.sourceforge.pmd.lang.python;
import java.io.Reader;
import net.sourceforge.pmd.lang.TokenManager;
import net.sourceforge.pmd.lang.ast.impl.javacc.CharStreamFactory;
import net.sourceforge.pmd.lang.python.ast.PythonParserTokenManager;
/**
 * {@link TokenManager} for Python sources. Every token request is delegated
 * to a {@code PythonParserTokenManager}.
 */
public class PythonTokenManager implements TokenManager {

    /** The underlying lexer that produces the tokens. */
    private final PythonParserTokenManager lexer;

    /**
     * Builds a token manager reading Python source text from the given reader.
     *
     * @param source
     *            reader supplying the source code to tokenize
     */
    public PythonTokenManager(Reader source) {
        this.lexer = new PythonParserTokenManager(CharStreamFactory.simpleCharStream(source));
    }

    /** Returns the next token produced by the underlying lexer. */
    @Override
    public Object getNextToken() {
        return this.lexer.getNextToken();
    }

    /** Forwards the file name to the static setter of the underlying lexer class. */
    @Override
    public void setFileName(String fileName) {
        PythonParserTokenManager.setFileName(fileName);
    }
}

View File

@ -0,0 +1,53 @@
/*
* BSD-style license; for more info see http://pmd.sourceforge.net/license.html
*/
package net.sourceforge.pmd.lang.python.ast;
import java.io.Reader;
import org.checkerframework.checker.nullness.qual.Nullable;
import net.sourceforge.pmd.lang.TokenManager;
import net.sourceforge.pmd.lang.ast.impl.javacc.CharStreamFactory;
import net.sourceforge.pmd.lang.ast.impl.javacc.JavaccTokenDocument;
/**
 * {@link TokenManager} for Python sources. Every token request is delegated
 * to a {@code PythonParserImplTokenManager} whose character stream is backed
 * by a Python-specific token document.
 */
public class PythonTokenManager implements TokenManager {

    /** The underlying lexer that produces the tokens. */
    private final PythonParserImplTokenManager lexer;

    /**
     * Builds a token manager reading Python source text from the given reader.
     *
     * @param source
     *            reader supplying the source code to tokenize
     */
    public PythonTokenManager(Reader source) {
        this.lexer = new PythonParserImplTokenManager(
            CharStreamFactory.simpleCharStream(source, PythonTokenDocument::new));
    }

    /** Returns the next token produced by the underlying lexer. */
    @Override
    public Object getNextToken() {
        return this.lexer.getNextToken();
    }

    /** Forwards the file name to the static setter of the underlying lexer class. */
    @Override
    public void setFileName(String fileName) {
        PythonParserImplTokenManager.setFileName(fileName);
    }

    /** Token document that describes token kinds via {@link PythonTokenKinds}. */
    private static class PythonTokenDocument extends JavaccTokenDocument {

        PythonTokenDocument(String fullText) {
            super(fullText);
        }

        @Override
        protected @Nullable String describeKindImpl(int kind) {
            return PythonTokenKinds.describe(kind);
        }
    }
}

View File

@ -13,7 +13,6 @@ import org.apache.commons.io.IOUtils;
import org.junit.Before;
import org.junit.Test;
import net.sourceforge.pmd.PMD;
import net.sourceforge.pmd.testframework.AbstractTokenizerTest;
public class PythonTokenizerTest extends AbstractTokenizerTest {
@ -40,13 +39,13 @@ public class PythonTokenizerTest extends AbstractTokenizerTest {
@Test
public void testIgnoreBetweenSpecialComments() throws IOException {
SourceCode sourceCode = new SourceCode(new SourceCode.StringCodeLoader("import logging" + PMD.EOL
+ "# CPD-OFF" + PMD.EOL
+ "logger = logging.getLogger('django.request')" + PMD.EOL
+ "class BaseHandler(object):" + PMD.EOL
+ " def __init__(self):" + PMD.EOL
+ " self._request_middleware = None" + PMD.EOL
+ " # CPD-ON" + PMD.EOL
SourceCode sourceCode = new SourceCode(new SourceCode.StringCodeLoader("import logging\n"
+ "# CPD-OFF\n"
+ "logger = logging.getLogger('django.request')\n"
+ "class BaseHandler(object):\n"
+ " def __init__(self):\n"
+ " self._request_middleware = None\n"
+ " # CPD-ON\n"
));
Tokens tokens = new Tokens();
tokenizer.tokenize(sourceCode, tokens);
@ -56,9 +55,9 @@ public class PythonTokenizerTest extends AbstractTokenizerTest {
@Test
public void testBackticks() throws IOException {
SourceCode sourceCode = new SourceCode(new SourceCode.StringCodeLoader("test = 'hello'" + PMD.EOL
+ "quoted = `test`" + PMD.EOL
+ "print quoted" + PMD.EOL
SourceCode sourceCode = new SourceCode(new SourceCode.StringCodeLoader("test = 'hello'\n"
+ "quoted = `test`\n"
+ "print quoted\n"
));
Tokens tokens = new Tokens();
tokenizer.tokenize(sourceCode, tokens); // should not result in parse error