Javascript tokenizer now ignores comment tokens.

This commit is contained in:
Jan van Nunen
2016-06-06 09:31:04 +02:00
parent 836af8d493
commit 63293f8b31
7 changed files with 993 additions and 30 deletions

File diff suppressed because it is too large Load Diff

View File

@ -32,6 +32,47 @@
</configuration>
</plugin>
<!-- Runs the Ant build file src/main/ant/alljavacc.xml during the generate-sources phase. -->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-antrun-plugin</artifactId>
<inherited>true</inherited>
<executions>
<execution>
<id>generate-sources</id>
<phase>generate-sources</phase>
<configuration>
<target>
<ant antfile="src/main/ant/alljavacc.xml">
<!-- Handed to the Ant build as property "target": the directory under the build directory that receives the generated sources. -->
<property name="target" value="${project.build.directory}/generated-sources/javacc" />
<!-- Handed to the Ant build as property "javacc.jar": the javacc jar, addressed by its path inside the local Maven repository. -->
<!-- NOTE(review): this presumably requires the javacc artifact to be present in the local repository before this execution runs; confirm it is resolved as a dependency. -->
<property name="javacc.jar" value="${settings.localRepository}/net/java/dev/javacc/javacc/${javacc.version}/javacc-${javacc.version}.jar" />
</ant>
</target>
</configuration>
<goals>
<goal>run</goal>
</goals>
</execution>
</executions>
</plugin>
<!-- Adds the directory holding the JavaCC generated sources as an extra source directory of the build. -->
<plugin>
<groupId>org.codehaus.mojo</groupId>
<artifactId>build-helper-maven-plugin</artifactId>
<executions>
<execution>
<id>add-javacc-generated-sources</id>
<goals>
<goal>add-source</goal>
</goals>
<configuration>
<sources>
<!-- Same directory that is passed as "target" to the Ant build in the maven-antrun-plugin configuration. -->
<source>${project.build.directory}/generated-sources/javacc</source>
</sources>
</configuration>
</execution>
</executions>
</plugin>
<plugin>
<groupId>net.sourceforge.pmd</groupId>
<artifactId>pmd-build</artifactId>

View File

@ -0,0 +1,44 @@
<project name="pmd" default="alljavacc" basedir="../../">
    <!--
        Generates the JavaCC based sources.

        Properties expected from the caller:
          target      directory that receives the generated sources
          javacc.jar  location of the javacc jar to run
    -->

    <!-- Temporary directory the javacc jar is copied to; removed again by "cleanup". -->
    <property name="javacc-home.path" value="target/lib" />

    <target name="alljavacc"
            description="Generates all JavaCC aspects within PMD"
            depends="checkUpToDate,init,es5javacc,markGenerated,cleanup" />

    <!-- Sets javaccBuildNotRequired when the timestamp file is newer than every grammar file. -->
    <target name="checkUpToDate">
        <uptodate property="javaccBuildNotRequired" targetfile="${target}/last-generated-timestamp">
            <srcfiles dir="etc/grammar" includes="*.jj*"/>
        </uptodate>
        <echo message="up to date check: javaccBuildNotRequired=${javaccBuildNotRequired}"/>
    </target>

    <!-- Prepares the javacc home directory and the output directory. -->
    <target name="init" unless="javaccBuildNotRequired">
        <mkdir dir="${javacc-home.path}" />
        <copy file="${javacc.jar}" tofile="${javacc-home.path}/javacc.jar" />
        <mkdir dir="${target}"/>
    </target>

    <!--
        Records a successful generation. This runs after the generation targets
        so that a failed run does not leave a fresh timestamp behind; otherwise the
        next build would be considered up to date although the output directory
        has already been deleted by the generation target.
    -->
    <target name="markGenerated" unless="javaccBuildNotRequired">
        <touch file="${target}/last-generated-timestamp"/>
    </target>

    <target name="cleanup">
        <delete dir="${javacc-home.path}" />
    </target>

    <target name="es5javacc" description="Generates the Ecmascript 5 grammar" unless="javaccBuildNotRequired">
        <!-- Start from an empty output directory. -->
        <delete dir="${target}/net/sourceforge/pmd/lang/ecmascript5/ast" />
        <mkdir dir="${target}/net/sourceforge/pmd/lang/ecmascript5/ast" />
        <!-- Ensure generated using CharStream interface -->
        <javacc static="false"
                usercharstream="true"
                target="etc/grammar/es5.jj"
                outputdirectory="${target}/net/sourceforge/pmd/lang/ecmascript5/ast"
                javacchome="${javacc-home.path}" />
        <!-- Make the generated token manager extend AbstractTokenManager. -->
        <replace file="${target}/net/sourceforge/pmd/lang/ecmascript5/ast/Ecmascript5ParserTokenManager.java"
                 token="class Ecmascript5ParserTokenManager"
                 value="class Ecmascript5ParserTokenManager extends net.sourceforge.pmd.lang.ast.AbstractTokenManager" />
        <!-- Remove the generated copies of these three classes from the output directory. -->
        <delete file="${target}/net/sourceforge/pmd/lang/ecmascript5/ast/CharStream.java" />
        <delete file="${target}/net/sourceforge/pmd/lang/ecmascript5/ast/ParseException.java" />
        <delete file="${target}/net/sourceforge/pmd/lang/ecmascript5/ast/TokenMgrError.java" />
    </target>
</project>

View File

@ -3,25 +3,56 @@
*/
package net.sourceforge.pmd.cpd;
import java.util.ArrayList;
import java.io.Reader;
import java.io.StringReader;
public class EcmascriptTokenizer extends AbstractTokenizer {
public EcmascriptTokenizer() {
// setting markers for "string" in javascript
this.stringToken = new ArrayList<String>();
this.stringToken.add( "\'" );
this.stringToken.add( "\"" );
import net.sourceforge.pmd.lang.LanguageRegistry;
import net.sourceforge.pmd.lang.LanguageVersionHandler;
import net.sourceforge.pmd.lang.TokenManager;
import net.sourceforge.pmd.lang.ast.TokenMgrError;
import net.sourceforge.pmd.lang.ecmascript.EcmascriptLanguageModule;
import net.sourceforge.pmd.lang.ecmascript5.ast.Ecmascript5ParserConstants;
import net.sourceforge.pmd.lang.ecmascript5.ast.Token;
// setting markers for 'ignorable character' in javascript
this.ignorableCharacter = new ArrayList<String>();
this.ignorableCharacter.add( ";" );
import org.apache.commons.io.IOUtils;
// setting markers for 'ignorable string' in javascript
this.ignorableStmt = new ArrayList<String>();
/**
* The Ecmascript Tokenizer
*/
public class EcmascriptTokenizer implements Tokenizer {
// strings do indeed span multiple lines in javascript
this.spanMultipleLinesString = true;
// the lines do to end with backslashes
this.spanMultipleLinesLineContinuationCharacter = '\\';
/**
 * Tokenizes the given source code with the Ecmascript token manager and
 * appends one entry per token to {@code tokenEntries}, followed by an EOF
 * entry.
 *
 * <p>When the token manager raises a {@link TokenMgrError}, the stack trace
 * and a "Skipping" message are written to standard error and the EOF entry
 * is still appended.</p>
 *
 * @param sourceCode the source code to tokenize
 * @param tokenEntries receives the token entries
 */
@Override
public void tokenize(SourceCode sourceCode, Tokens tokenEntries) {
    StringBuilder code = sourceCode.getCodeBuffer();
    Reader source = null;
    try {
        LanguageVersionHandler handler = LanguageRegistry.getLanguage(EcmascriptLanguageModule.NAME)
                .getDefaultVersion()
                .getLanguageVersionHandler();
        source = new StringReader(code.toString());
        TokenManager tokens = handler.getParser(handler.getDefaultParserOptions())
                .getTokenManager(sourceCode.getFileName(), source);

        // A token with an empty image ends the loop.
        for (Token token = (Token) tokens.getNextToken();
                token.image.length() > 0;
                token = (Token) tokens.getNextToken()) {
            tokenEntries.add(new TokenEntry(getTokenImage(token), sourceCode.getFileName(), token.beginLine));
        }

        tokenEntries.add(TokenEntry.getEOF());
        System.err.println("Added " + sourceCode.getFileName());
    } catch (TokenMgrError err) {
        err.printStackTrace();
        System.err.println("Skipping " + sourceCode.getFileName() + " due to parse error");
        tokenEntries.add(TokenEntry.getEOF());
    } finally {
        IOUtils.closeQuietly(source);
    }
}
/**
 * Matches a line continuation inside a string literal: a backslash that is
 * not itself preceded by a backslash, followed by a line terminator.
 * Compiled once instead of on every call.
 */
private static final java.util.regex.Pattern LINE_CONTINUATION =
        java.util.regex.Pattern.compile("(?<!\\\\)\\\\(\\r\\n|\\r|\\n)");

/**
 * Returns the image to record for the given token.
 *
 * <p>For string literals (terminated or not) the line continuations are
 * removed from the image; every other token image is returned unchanged.</p>
 *
 * @param token the token to get the image for
 * @return the token image, without line continuations for string literals
 */
private String getTokenImage(Token token) {
    final boolean stringLiteral = token.kind == Ecmascript5ParserConstants.STRING_LITERAL
            || token.kind == Ecmascript5ParserConstants.UNTERMINATED_STRING_LITERAL;
    if (stringLiteral) {
        return LINE_CONTINUATION.matcher(token.image).replaceAll("");
    }
    return token.image;
}
}

View File

@ -11,6 +11,7 @@ import net.sourceforge.pmd.lang.ParserOptions;
import net.sourceforge.pmd.lang.TokenManager;
import net.sourceforge.pmd.lang.ast.Node;
import net.sourceforge.pmd.lang.ast.ParseException;
import net.sourceforge.pmd.lang.ecmascript5.Ecmascript5TokenManager;
/**
* Adapter for the EcmascriptParser.
@ -25,7 +26,7 @@ public class Ecmascript3Parser extends AbstractParser {
@Override
public TokenManager createTokenManager(Reader source) {
return null;
return new Ecmascript5TokenManager(source);
}
public boolean canParse() {

View File

@ -0,0 +1,34 @@
/**
* BSD-style license; for more info see http://pmd.sourceforge.net/license.html
*/
package net.sourceforge.pmd.lang.ecmascript5;
import java.io.Reader;
import net.sourceforge.pmd.lang.TokenManager;
import net.sourceforge.pmd.lang.ast.SimpleCharStream;
import net.sourceforge.pmd.lang.ecmascript5.ast.Ecmascript5ParserTokenManager;
/**
 * {@link TokenManager} for Ecmascript 5 sources; every call is forwarded to
 * the JavaCC generated {@link Ecmascript5ParserTokenManager}.
 */
public class Ecmascript5TokenManager implements TokenManager {

    /** The generated token manager all calls are forwarded to. */
    private final Ecmascript5ParserTokenManager delegate;

    /**
     * Builds a token manager that reads from the given source.
     *
     * @param source reader supplying the source code to tokenize
     */
    public Ecmascript5TokenManager(Reader source) {
        final SimpleCharStream charStream = new SimpleCharStream(source);
        this.delegate = new Ecmascript5ParserTokenManager(charStream);
    }

    /**
     * Fetches the next token from the generated token manager.
     *
     * @return the next token
     */
    public Object getNextToken() {
        return this.delegate.getNextToken();
    }

    /**
     * Passes the file name on to the static
     * {@code Ecmascript5ParserTokenManager.setFileName}.
     *
     * @param fileName the file name to set
     */
    @Override
    public void setFileName(String fileName) {
        Ecmascript5ParserTokenManager.setFileName(fileName);
    }
}

View File

@ -21,7 +21,7 @@ public class EcmascriptTokenizerTest {
SourceCode sourceCode = new SourceCode( new SourceCode.StringCodeLoader( getCode1() ) );
Tokens tokens = new Tokens();
tokenizer.tokenize( sourceCode, tokens );
assertEquals( 22, tokens.size() );
assertEquals( 40, tokens.size() );
}
@Test
@ -30,7 +30,7 @@ public class EcmascriptTokenizerTest {
SourceCode sourceCode = new SourceCode( new SourceCode.StringCodeLoader( getCode2() ) );
Tokens tokens = new Tokens();
t.tokenize( sourceCode, tokens );
assertEquals( 22, tokens.size() );
assertEquals( 45, tokens.size() );
}
/**
@ -47,14 +47,50 @@ public class EcmascriptTokenizerTest {
+ "continues2\";\n") );
Tokens tokens = new Tokens();
t.tokenize(sourceCode, tokens);
assertEquals(11, tokens.size());
List<TokenEntry> list = tokens.getTokens();
assertEquals("var", list.get(0).getIdentifier(), list.get(5).getIdentifier());
assertEquals("s", list.get(1).getIdentifier(), list.get(6).getIdentifier());
assertEquals("=", list.get(2).getIdentifier(), list.get(7).getIdentifier());
assertEquals("\"a string continues\"", list.get(3).toString());
assertEquals("\"a string continues2\"", list.get(8).toString());
assertFalse(list.get(3).getIdentifier() == list.get(8).getIdentifier());
}
@Test
public void testIgnoreSingleLineComments() throws IOException {
Tokenizer t = new EcmascriptTokenizer();
SourceCode sourceCode = new SourceCode( new SourceCode.StringCodeLoader(
"//This is a single line comment\n"
+ "var i = 0;\n\n"
+ "//This is another comment\n"
+ "i++;") );
Tokens tokens = new Tokens();
t.tokenize(sourceCode, tokens);
assertEquals(9, tokens.size());
List<TokenEntry> list = tokens.getTokens();
assertEquals("var", list.get(0).getIdentifier(), list.get(4).getIdentifier());
assertEquals("s", list.get(1).getIdentifier(), list.get(5).getIdentifier());
assertEquals("=", list.get(2).getIdentifier(), list.get(6).getIdentifier());
assertEquals("\"a string continues\"", list.get(3).toString());
assertEquals("\"a string continues2\"", list.get(7).toString());
assertFalse(list.get(3).getIdentifier() == list.get(7).getIdentifier());
assertEquals("var", list.get(0).toString());
assertEquals("++", list.get(6).toString());
}
/**
 * Tokenizes two statements that are each preceded by a multi line comment
 * and checks the number of resulting entries together with the entries at
 * index 0 and index 6.
 */
@Test
public void testIgnoreMultiLineComments() throws IOException {
    final String code =
            "/* This is a multi line comment\n"
            + " * \n"
            + " */ \n"
            + "var i = 0;\n\n"
            + "/* This is another multi line comment\n"
            + " * second line \n"
            + " * third line */\n"
            + "i++;";
    Tokenizer tokenizer = new EcmascriptTokenizer();
    SourceCode source = new SourceCode(new SourceCode.StringCodeLoader(code));
    Tokens result = new Tokens();

    tokenizer.tokenize(source, result);

    assertEquals(9, result.size());
    List<TokenEntry> entries = result.getTokens();
    assertEquals("var", entries.get(0).toString());
    assertEquals("++", entries.get(6).toString());
}
// no semi-colons