Don't forget EOF token

This commit is contained in:
Clément Fournier 2023-02-12 15:28:34 +01:00
parent 1828faeadc
commit add597026c
No known key found for this signature in database
GPG Key ID: 4D8D42402E4F47E2
21 changed files with 118 additions and 84 deletions

View File

@ -62,7 +62,7 @@ public class AnyTokenizer implements Tokenizer {
}
@Override
public void tokenize(TextDocument sourceCode, TokenFactory tokenEntries) {
public void tokenize(TextDocument sourceCode, TokenFactory tokens) {
Chars text = sourceCode.getText();
Matcher matcher = pattern.matcher(text);
int lineNo = 1;
@ -87,7 +87,7 @@ public class AnyTokenizer implements Tokenizer {
lineNo += StringUtil.lineNumberAt(image, image.length()) - 1;
lastLineStart = matcher.start() + image.length() - ecol + 1;
}
tokenEntries.recordToken(image, bline, bcol, lineNo, ecol);
tokens.recordToken(image, bline, bcol, lineNo, ecol);
}
}

View File

@ -112,20 +112,10 @@ public final class CpdAnalysis implements AutoCloseable {
this.listener = cpdListener;
}
private int doTokenize(TextDocument document, Tokenizer tokenizer, Tokens tokens) {
private int doTokenize(TextDocument document, Tokenizer tokenizer, Tokens tokens) throws IOException, TokenMgrError {
LOGGER.trace("Tokenizing {}", document.getPathId());
int lastTokenSize = tokens.size();
try {
tokenizer.tokenize(document, TokenFactory.forFile(document, tokens));
} catch (IOException ioe) {
reporter.errorEx("Error while lexing.", ioe);
} catch (TokenMgrError e) {
e.setFileName(document.getDisplayName());
reporter.errorEx("Error while lexing.", e);
throw e;
} finally {
tokens.addEof();
}
Tokenizer.tokenize(tokenizer, document, tokens);
return tokens.size() - lastTokenSize - 1; /* EOF */
}
@ -151,7 +141,11 @@ public final class CpdAnalysis implements AutoCloseable {
int newTokens = doTokenize(textDocument, tokenizers.get(textFile.getLanguageVersion().getLanguage()), tokens);
numberOfTokensPerFile.put(textDocument.getPathId(), newTokens);
listener.addedFile(1);
} catch (TokenMgrError e) {
} catch (TokenMgrError | IOException e) {
if (e instanceof TokenMgrError) {
((TokenMgrError) e).setFileName(textFile.getDisplayName());
}
reporter.errorEx("Error while lexing.", e);
// error reported above; roll tokens back to the saved state
savedState.restore(tokens);
}

View File

@ -19,7 +19,7 @@ public class Mark implements Comparable<Mark> {
}
public String getFilename() {
return this.token.getTokenSrcID();
return this.token.getFileName();
}
public int getBeginLine() {

View File

@ -8,7 +8,7 @@ public class TokenEntry implements Comparable<TokenEntry> {
public static final TokenEntry EOF = new TokenEntry();
private final String tokenSrcID;
private final String fileName;
private final int beginLine;
private final int beginColumn;
private final int endColumn;
@ -18,15 +18,15 @@ public class TokenEntry implements Comparable<TokenEntry> {
private TokenEntry() {
this.identifier = 0;
this.tokenSrcID = "EOFMarker";
this.fileName = "EOFMarker";
this.beginLine = -1;
this.beginColumn = -1;
this.endColumn = -1;
}
TokenEntry(int imageId, String tokenSrcID, int beginLine, int beginColumn, int endLine, int endColumn, int index) {
TokenEntry(int imageId, String fileName, int beginLine, int beginColumn, int endLine, int endColumn, int index) {
assert isOk(beginLine) && isOk(beginColumn) && isOk(endLine) && isOk(endColumn) : "Coordinates are 1-based";
this.tokenSrcID = tokenSrcID;
this.fileName = fileName;
this.beginLine = beginLine;
this.beginColumn = beginColumn;
this.endColumn = endColumn;
@ -40,8 +40,8 @@ public class TokenEntry implements Comparable<TokenEntry> {
}
String getTokenSrcID() {
return tokenSrcID;
String getFileName() {
return fileName;
}
public int getBeginLine() {
@ -51,6 +51,7 @@ public class TokenEntry implements Comparable<TokenEntry> {
/**
* The column number where this token begins.
* returns -1 if not available
*
* @return the begin column number
*/
public int getBeginColumn() {
@ -60,17 +61,18 @@ public class TokenEntry implements Comparable<TokenEntry> {
/**
* The column number where this token ends.
* returns -1 if not available
*
* @return the end column number
*/
public int getEndColumn() {
return endColumn; // TODO Java 1.8 make optional
}
int getIdentifier() {
int getIdentifier() {
return this.identifier;
}
int getIndex() {
int getIndex() {
return this.index;
}
@ -79,7 +81,7 @@ public class TokenEntry implements Comparable<TokenEntry> {
return hashCode;
}
void setHashCode(int hashCode) {
void setHashCode(int hashCode) {
this.hashCode = hashCode;
}
@ -105,6 +107,18 @@ public class TokenEntry implements Comparable<TokenEntry> {
return getIndex() - other.getIndex();
}
/**
* Overwrites the image identifier stored in this token entry.
*
* @param identifier the new identifier value
*/
final void setImageIdentifier(int identifier) {
this.identifier = identifier;
}
/**
* Returns a printable image for this token, resolved through the given
* {@link Tokens} instance.
*
* @param tokens the token collection used to map this entry's identifier
*               back to an image (via {@code imageFromId})
* @return {@code "EOF"} if this entry equals the {@link #EOF} marker;
*         otherwise the image found for this entry's identifier, or
*         {@code "--unknown--"} if the lookup returns {@code null}
*/
public String getImage(Tokens tokens) {
// The EOF marker has no image of its own, so it gets a fixed label.
if (EOF.equals(this)) {
return "EOF";
}
String image = tokens.imageFromId(this.identifier);
return image == null ? "--unknown--" : image;
}
@Override
public String toString() {
if (EOF.equals(this)) {
@ -113,7 +127,4 @@ public class TokenEntry implements Comparable<TokenEntry> {
return Integer.toString(identifier);
}
final void setImageIdentifier(int identifier) {
this.identifier = identifier;
}
}

View File

@ -7,7 +7,7 @@ package net.sourceforge.pmd.cpd;
import net.sourceforge.pmd.lang.document.FileLocation;
import net.sourceforge.pmd.lang.document.TextDocument;
public interface TokenFactory {
public interface TokenFactory extends AutoCloseable {
void recordToken(String image, int startLine, int startCol, int endLine, int endCol);
@ -19,6 +19,9 @@ public interface TokenFactory {
TokenEntry peekLastToken();
/**
* Closes this factory.
*
* <p>Redeclared here without a {@code throws} clause, so callers using a
* {@code TokenFactory} in a try-with-resources statement do not have to
* handle the checked exception declared by {@link AutoCloseable#close()}.
*/
@Override
void close();
static TokenFactory forFile(TextDocument file, Tokens sink) {
return new TokenFactory() {
final String name = file.getPathId();
@ -37,6 +40,11 @@ public interface TokenFactory {
public TokenEntry peekLastToken() {
return sink.peekLastToken();
}
// Closing the factory calls addEof() on the sink that received the tokens.
@Override
public void close() {
sink.addEof();
}
};
}
}

View File

@ -82,5 +82,11 @@ public interface Tokenizer {
String DEFAULT_SKIP_BLOCKS_PATTERN = "#if 0|#endif";
void tokenize(TextDocument sourceCode, TokenFactory tokenEntries) throws IOException;
void tokenize(TextDocument sourceCode, TokenFactory tokens) throws IOException;
/**
* Runs the given tokenizer on the given document, recording the produced
* tokens into {@code tokens}.
*
* <p>The {@link TokenFactory} obtained from
* {@link TokenFactory#forFile(TextDocument, Tokens)} is managed by a
* try-with-resources statement, so it is closed whether the tokenizer
* returns normally or throws. Closing that factory calls {@code addEof()}
* on the token sink.
*
* @param tokenizer    the tokenizer to run
* @param textDocument the document to tokenize
* @param tokens       the sink that receives the tokens
* @throws IOException if thrown by {@link #tokenize(TextDocument, TokenFactory)}
*/
static void tokenize(Tokenizer tokenizer, TextDocument textDocument, Tokens tokens) throws IOException {
try (TokenFactory tf = TokenFactory.forFile(textDocument, tokens)) {
tokenizer.tokenize(textDocument, tf);
}
}
}

View File

@ -9,6 +9,7 @@ import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
public class Tokens {
@ -32,6 +33,10 @@ public class Tokens {
return images.computeIfAbsent(newImage, k -> images.size() + 1);
}
/**
* Reverse lookup in the {@code images} map, which maps an image to its
* integer id.
*
* <p>This scans the map entries linearly and stops at the first match.
*
* @param i the id to look for
* @return the key of the first entry whose value equals {@code i}, or
*         {@code null} if no entry has that value
*/
String imageFromId(int i) {
return images.entrySet().stream().filter(it -> it.getValue() == i).findFirst().map(Entry::getKey).orElse(null);
}
public TokenEntry peekLastToken() {
return get(size() - 1);
}

View File

@ -30,11 +30,11 @@ public abstract class TokenizerBase<T extends GenericToken<T>> implements Tokeni
}
@Override
public void tokenize(TextDocument document, TokenFactory tokenEntries) throws IOException {
public void tokenize(TextDocument document, TokenFactory tokens) throws IOException {
TokenManager<T> tokenManager = filterTokenStream(makeLexerImpl(document));
T currentToken = tokenManager.getNextToken();
while (currentToken != null) {
processToken(tokenEntries, currentToken);
processToken(tokens, currentToken);
currentToken = tokenManager.getNextToken();
}
}

View File

@ -7,6 +7,7 @@ package net.sourceforge.pmd.cpd;
import static net.sourceforge.pmd.util.CollectionUtil.listOf;
import static org.junit.jupiter.api.Assertions.assertEquals;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
@ -18,19 +19,19 @@ import net.sourceforge.pmd.lang.document.TextDocument;
class AnyTokenizerTest {
@Test
void testMultiLineMacros() {
void testMultiLineMacros() throws IOException {
AnyTokenizer tokenizer = new AnyTokenizer("//");
compareResult(tokenizer, TEST1, EXPECTED);
}
@Test
void testStringEscape() {
void testStringEscape() throws IOException {
AnyTokenizer tokenizer = new AnyTokenizer("//");
compareResult(tokenizer, "a = \"oo\\n\"", listOf("a", "=", "\"oo\\n\"", "EOF"));
}
@Test
void testMultilineString() {
void testMultilineString() throws IOException {
AnyTokenizer tokenizer = new AnyTokenizer("//");
Tokens tokens = compareResult(tokenizer, "a = \"oo\n\";", listOf("a", "=", "\"oo\n\"", ";", "EOF"));
TokenEntry string = tokens.getTokens().get(2);
@ -50,11 +51,11 @@ class AnyTokenizerTest {
* Tests that [core][cpd] AnyTokenizer doesn't count columns correctly #2760 is actually fixed.
*/
@Test
void testTokenPosition() {
void testTokenPosition() throws IOException {
AnyTokenizer tokenizer = new AnyTokenizer();
TextDocument code = TextDocument.readOnlyString("a;\nbbbb\n;", "Foo.dummy", DummyLanguageModule.getInstance().getDefaultVersion());
Tokens tokens = new Tokens();
tokenizer.tokenize(code, TokenFactory.forFile(code, tokens));
Tokenizer.tokenize(tokenizer, code, tokens);
TokenEntry bbbbToken = tokens.getTokens().get(2);
assertEquals(2, bbbbToken.getBeginLine());
assertEquals(1, bbbbToken.getBeginColumn());
@ -62,10 +63,10 @@ class AnyTokenizerTest {
}
private Tokens compareResult(AnyTokenizer tokenizer, String source, List<String> expectedImages) {
private Tokens compareResult(AnyTokenizer tokenizer, String source, List<String> expectedImages) throws IOException {
TextDocument code = TextDocument.readOnlyString(source, "Foo.dummy", DummyLanguageModule.getInstance().getDefaultVersion());
Tokens tokens = new Tokens();
tokenizer.tokenize(code, TokenFactory.forFile(code, tokens));
Tokenizer.tokenize(tokenizer, code, tokens);
List<String> tokenStrings = new ArrayList<>();
for (TokenEntry token : tokens.getTokens()) {

View File

@ -16,7 +16,7 @@ class TokenEntryTest {
tokens.addToken("public", "/var/Foo.java", 1, 2, 3, 4);
TokenEntry mark = tokens.peekLastToken();
assertEquals(1, mark.getBeginLine());
assertEquals("/var/Foo.java", mark.getTokenSrcID());
assertEquals("/var/Foo.java", mark.getFileName());
assertEquals(0, mark.getIndex());
assertEquals(2, mark.getBeginColumn());
assertEquals(4, mark.getEndColumn());

View File

@ -32,7 +32,7 @@ class CPPTokenizerTest extends CpdTextComparisonTest {
@Test
void testUTFwithBOM() {
Tokenizer tokenizer = newTokenizer(dontSkipBlocks());
Tokens tokens = tokenize(tokenizer, "\ufeffint start()\n{ int ret = 1;\nreturn ret;\n}\n");
Tokens tokens = tokenize(tokenizer, sourceCodeOf("\ufeffint start()\n{ int ret = 1;\nreturn ret;\n}\n"));
assertEquals(15, tokens.size());
}

View File

@ -20,7 +20,7 @@ import groovyjarjarantlr.TokenStreamException;
public class GroovyTokenizer implements Tokenizer {
@Override
public void tokenize(TextDocument sourceCode, TokenFactory tokenEntries) {
public void tokenize(TextDocument sourceCode, TokenFactory tokens) {
GroovyLexer lexer = new GroovyLexer(sourceCode.newReader());
TokenStream tokenStream = lexer.plumb();
@ -42,7 +42,7 @@ public class GroovyTokenizer implements Tokenizer {
lastLine = token.getLine(); // todo inaccurate
}
tokenEntries.recordToken(tokenText, token.getLine(), token.getColumn(), lastLine, lastCol);
tokens.recordToken(tokenText, token.getLine(), token.getColumn(), lastLine, lastCol);
token = tokenStream.nextToken();
}
} catch (TokenStreamException err) {

View File

@ -19,7 +19,7 @@ import net.sourceforge.pmd.lang.html.HtmlLanguageModule;
public class HtmlTokenizer implements Tokenizer {
@Override
public void tokenize(TextDocument sourceCode, TokenFactory tokenEntries) {
public void tokenize(TextDocument sourceCode, TokenFactory tokens) {
HtmlLanguageModule html = HtmlLanguageModule.getInstance();
try (LanguageProcessor processor = html.createProcessor(html.newPropertyBundle())) {
@ -33,7 +33,7 @@ public class HtmlTokenizer implements Tokenizer {
HtmlParser parser = new HtmlParser();
ASTHtmlDocument root = parser.parse(task);
traverse(root, tokenEntries);
traverse(root, tokens);
} catch (IOException e) {
throw new UncheckedIOException(e);
} catch (Exception e) {

View File

@ -45,7 +45,7 @@ class MatchAlgorithmTest {
SourceManager sourceManager = new SourceManager(listOf(textFile));
Tokens tokens = new Tokens();
TextDocument sourceCode = sourceManager.get(textFile);
tokenizer.tokenize(sourceCode, TokenFactory.forFile(sourceCode, tokens));
Tokenizer.tokenize(tokenizer, sourceCode, tokens);
assertEquals(41, tokens.size());
MatchAlgorithm matchAlgorithm = new MatchAlgorithm(tokens, 5);
@ -77,7 +77,7 @@ class MatchAlgorithmTest {
Tokenizer tokenizer = java.createCpdTokenizer(bundle);
TextDocument sourceCode = TextDocument.readOnlyString(getSampleCode(), "Foo.java", java.getDefaultVersion());
Tokens tokens = new Tokens();
tokenizer.tokenize(sourceCode, TokenFactory.forFile(sourceCode, tokens));
Tokenizer.tokenize(tokenizer, sourceCode, tokens);
MatchAlgorithm matchAlgorithm = new MatchAlgorithm(tokens, 5);
matchAlgorithm.findMatches();

View File

@ -106,7 +106,7 @@ abstract class CpdTextComparisonTest(
append('L').append(curLine).appendLine()
}
formatLine(token).appendLine()
formatLine(token, tokens).appendLine()
}
}
@ -119,9 +119,9 @@ abstract class CpdTextComparisonTest(
)
private fun StringBuilder.formatLine(token: TokenEntry) =
private fun StringBuilder.formatLine(token: TokenEntry, tokens: Tokens) =
formatLine(
escapedImage = escapeImage(token.toString()),
escapedImage = escapeImage(token.getImage(tokens)),
bcol = token.beginColumn,
ecol = token.endColumn
)
@ -167,11 +167,14 @@ abstract class CpdTextComparisonTest(
private fun sourceCodeOf(fileData: FileData): TextDocument =
TextDocument.readOnlyString(fileData.fileText, fileData.fileName, language.defaultVersion)
@JvmOverloads
fun sourceCodeOf(text: String, fileName: String = TextFile.UNKNOWN_FILENAME): FileData =
FileData(fileName = fileName, fileText = text)
fun tokenize(tokenizer: Tokenizer, fileData: FileData): Tokens =
Tokens().also {
val tokens = Tokens()
Tokens().also { tokens ->
val source = sourceCodeOf(fileData)
tokenizer.tokenize(source, TokenFactory.forFile(source, tokens))
Tokenizer.tokenize(tokenizer, source, tokens)
}
private companion object {

View File

@ -1,19 +0,0 @@
/**
* BSD-style license; for more info see http://pmd.sourceforge.net/license.html
*/
package net.sourceforge.pmd.cpd;
/**
* Defines the Language module for Matlab
*/
public class MatlabLanguage extends AbstractLanguage {
/**
* Creates a new instance of {@link MatlabLanguage} with the default
* extensions for matlab files.
*/
public MatlabLanguage() {
super("Matlab", "matlab", new MatlabTokenizer(), ".m");
}
}

View File

@ -0,0 +1,29 @@
/*
* BSD-style license; for more info see http://pmd.sourceforge.net/license.html
*/
package net.sourceforge.pmd.lang.matlab;
import net.sourceforge.pmd.cpd.Tokenizer;
import net.sourceforge.pmd.lang.CpdOnlyLanguageModuleBase;
import net.sourceforge.pmd.lang.LanguagePropertyBundle;
import net.sourceforge.pmd.lang.matlab.cpd.MatlabTokenizer;
/**
* Defines the Language module for Matlab.
*
* <p>The module extends {@link CpdOnlyLanguageModuleBase} and supplies a
* {@link MatlabTokenizer} as its CPD tokenizer.
*/
public class MatlabLanguageModule extends CpdOnlyLanguageModuleBase {
/**
* Creates a new instance of {@link MatlabLanguageModule} with the default
* extensions for matlab files.
*
* <p>The metadata passed to the superclass uses the id {@code "matlab"},
* the name {@code "Matlab"} and the extension {@code "m"}.
*/
public MatlabLanguageModule() {
super(LanguageMetadata.withId("matlab").name("Matlab").extensions("m"));
}
/**
* Creates the tokenizer used for Matlab sources.
*
* @param bundle the language properties; not read by this implementation
* @return a new {@link MatlabTokenizer}
*/
@Override
public Tokenizer createCpdTokenizer(LanguagePropertyBundle bundle) {
return new MatlabTokenizer();
}
}

View File

@ -1,11 +1,12 @@
/**
/*
* BSD-style license; for more info see http://pmd.sourceforge.net/license.html
*/
package net.sourceforge.pmd.cpd;
package net.sourceforge.pmd.lang.matlab.cpd;
import net.sourceforge.pmd.cpd.internal.JavaCCTokenizer;
import net.sourceforge.pmd.lang.TokenManager;
import net.sourceforge.pmd.lang.ast.impl.javacc.CharStream;
import net.sourceforge.pmd.lang.ast.impl.javacc.JavaccToken;
import net.sourceforge.pmd.lang.document.TextDocument;
import net.sourceforge.pmd.lang.matlab.ast.MatlabTokenKinds;
@ -17,6 +18,6 @@ public class MatlabTokenizer extends JavaCCTokenizer {
@Override
protected TokenManager<JavaccToken> makeLexerImpl(TextDocument doc) {
return MatlabTokenKinds.newTokenManager(doc);
return MatlabTokenKinds.newTokenManager(CharStream.create(doc));
}
}

View File

@ -1 +0,0 @@
net.sourceforge.pmd.cpd.MatlabLanguage

View File

@ -0,0 +1 @@
net.sourceforge.pmd.lang.matlab.MatlabLanguageModule

View File

@ -1,8 +1,8 @@
/**
/*
* BSD-style license; for more info see http://pmd.sourceforge.net/license.html
*/
package net.sourceforge.pmd.cpd;
package net.sourceforge.pmd.lang.matlab.cpd;
import org.junit.jupiter.api.Test;
@ -11,12 +11,7 @@ import net.sourceforge.pmd.cpd.test.CpdTextComparisonTest;
class MatlabTokenizerTest extends CpdTextComparisonTest {
MatlabTokenizerTest() {
super(".m");
}
@Override
protected String getResourcePrefix() {
return "../lang/matlab/cpd/testdata";
super("matlab", ".m");
}
@Test