Improve AnyTokenizer

This commit is contained in:
Clément Fournier
2020-09-01 17:05:21 +02:00
parent 3ce68f4977
commit b27ab97684
2 changed files with 108 additions and 38 deletions

View File

@ -4,44 +4,88 @@
package net.sourceforge.pmd.cpd;
import java.io.BufferedReader;
import java.io.CharArrayReader;
import java.io.IOException;
import java.util.StringTokenizer;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.lang3.StringUtils;
import net.sourceforge.pmd.util.StringUtil;
/**
* This class does a best-guess try-anything tokenization.
* Simple tokenization into words and separators. Can ignore end-of-line
* comments and recognize double/single quoted string literals. It is
* not a goal to be very customizable, or have very high quality.
* Higher-quality lexers should be implemented with a lexer generator.
*
* @author jheintz
* <p>In PMD 7, this replaces AbstractTokenizer, which provided nearly
* no more functionality and whose API was hard to update.
*/
public class AnyTokenizer implements Tokenizer {
public static final String TOKENS = " \t!#$%^&*(){}-=+<>/\\`~;:";
private static final Pattern DEFAULT_PATTERN = makePattern("");
/**
 * Builds the pattern whose successive matches are the candidate tokens.
 * A match is one of: a run of word characters, an end-of-line comment
 * (only when a comment marker is supplied), a single character that is
 * neither a quote nor whitespace, a quoted string literal, or a newline.
 *
 * @param singleLineCommentStart marker that starts an end-of-line comment;
 *                               handed to {@code eolCommentFragment}, which
 *                               contributes nothing when the marker is blank
 * @return the compiled token pattern
 */
private static Pattern makePattern(String singleLineCommentStart) {
    return Pattern.compile(
        "\\w++" // either a word
            + eolCommentFragment(singleLineCommentStart) // a comment
            + "|[^\"'\\s]" // a single separator char
            // Inside a literal, a backslash escapes whatever character follows it.
            // Accepting only \" (resp. \') made literals containing \\ or \n fail to
            // match as a string, so their contents were split into separate tokens.
            + "|\"(?:[^\"\\\\]++|\\\\.)*+\"" // a double-quoted string
            + "|'(?:[^'\\\\]++|\\\\.)*+'" // a single-quoted string
            + "|\n" // or a newline (to count lines), note that sourcecode normalizes line endings
    );
}
/** Pattern used by this tokenizer to find tokens. */
private final Pattern pattern;
/** End-of-line comment marker as given at construction (empty for the no-arg constructor). */
private final String commentStart;

/**
 * Creates a tokenizer that uses the default pattern and an empty comment
 * marker, i.e. no token is treated as a comment.
 */
public AnyTokenizer() {
    this(DEFAULT_PATTERN, "");
}

/**
 * Creates a tokenizer whose pattern is built for the given comment marker.
 *
 * @param eolCommentStart marker that starts an end-of-line comment
 */
public AnyTokenizer(String eolCommentStart) {
    this(makePattern(eolCommentStart), eolCommentStart);
}

/** Stores the pattern and the comment marker; both public constructors delegate here. */
private AnyTokenizer(Pattern tokenPattern, String eolMarker) {
    pattern = tokenPattern;
    commentStart = eolMarker;
}
/**
 * Returns the regex alternative (including its leading {@code |}) that
 * matches an end-of-line comment, or the empty string when the marker is
 * blank so that the token pattern gets no comment alternative at all.
 *
 * @param start comment marker; may be null or blank
 * @return fragment to splice into the token pattern
 */
private static String eolCommentFragment(String start) {
    if (StringUtils.isBlank(start)) {
        return "";
    }
    // "*+" rather than "++": a marker directly followed by the end of the line
    // (an empty comment) is still a comment. Requiring at least one character
    // made it fall through to the separator alternative, one token per char.
    return "|(?:" + Pattern.quote(start) + "[^\n]*+)"; // note: sourcecode normalizes line endings
}
/**
 * Splits the text of {@code sourceCode} into tokens and appends one entry
 * per token to {@code tokenEntries}; an EOF entry is appended in the
 * {@code finally} clause.
 *
 * NOTE(review): this listing appears to interleave a reader/StringTokenizer
 * implementation with a Matcher-based one (duplicate locals, unbalanced
 * braces) -- confirm which lines belong to the current version before
 * changing anything here.
 */
@Override
public void tokenize(SourceCode sourceCode, Tokens tokenEntries) {
StringBuilder sb = sourceCode.getCodeBuffer();
try (BufferedReader reader = new BufferedReader(new CharArrayReader(sb.toString().toCharArray()))) {
int lineNumber = 1;
int colNumber = 1;
String line = reader.readLine();
while (line != null) {
StringTokenizer tokenizer = new StringTokenizer(line, TOKENS, true);
while (tokenizer.hasMoreTokens()) {
String token = tokenizer.nextToken();
int endCol = colNumber + token.length() - 1; // -1 because inclusive
if (!" ".equals(token) && !"\t".equals(token)) {
tokenEntries.add(new TokenEntry(token, sourceCode.getFileName(), lineNumber, colNumber, endCol));
}
colNumber = endCol + 1;
StringBuilder text = sourceCode.getCodeBuffer();
// scan the whole code buffer with this tokenizer's pattern
Matcher matcher = pattern.matcher(text);
int lineNo = 1;
// offset in text just past the last newline match; bcol is measured from it
int lastLineStart = 0;
try {
while (matcher.find()) {
String image = matcher.group();
if (isComment(image)) {
// matches recognized as comments produce no token entry
continue;
} else if (StringUtils.isWhitespace(image)) {
// whitespace-only match: count a new line and remember where it starts
lineNo++;
lastLineStart = matcher.end();
continue;
}
// advance iteration variables
line = reader.readLine();
lineNumber++;
// 1-based start column, relative to lastLineStart
int bcol = 1 + matcher.start() - lastLineStart;
// NOTE(review): ecol is derived from image alone, not offset by bcol or
// lastLineStart -- confirm StringUtil.columnNumberAt yields the intended end column
int ecol = 1 + StringUtil.columnNumberAt(image, image.length());
tokenEntries.add(new TokenEntry(image, sourceCode.getFileName(), lineNo, bcol, ecol));
}
} catch (IOException ignored) {
ignored.printStackTrace();
} finally {
tokenEntries.add(TokenEntry.getEOF());
tokenEntries.add(TokenEntry.EOF);
}
}
/**
 * Tells whether a matched token is an end-of-line comment.
 *
 * @param tok text of the match
 * @return true if a comment marker is set and {@code tok} begins with it
 */
private boolean isComment(String tok) {
    // The constructor stores the marker unchecked and the pattern builder
    // tolerates a null marker, so a null must mean "no comments" here
    // instead of throwing a NullPointerException.
    if (commentStart == null || commentStart.isEmpty()) {
        return false;
    }
    return tok.startsWith(commentStart);
}
}

View File

@ -4,29 +4,55 @@
package net.sourceforge.pmd.cpd;
import static net.sourceforge.pmd.util.CollectionUtil.listOf;
import static org.junit.Assert.assertEquals;
import org.junit.Test;
import java.util.List;
import java.util.stream.Collectors;
import net.sourceforge.pmd.PMD;
import org.checkerframework.checker.nullness.qual.NonNull;
import org.junit.Test;
public class AnyTokenizerTest {
@Test
public void testMultiLineMacros() {
AnyTokenizer tokenizer = new AnyTokenizer();
AnyTokenizer tokenizer = new AnyTokenizer("//");
SourceCode code = new SourceCode(new SourceCode.StringCodeLoader(TEST1));
Tokens tokens = new Tokens();
tokenizer.tokenize(code, tokens);
assertEquals(30, tokens.size());
assertEquals(31, tokens.size());
List<String> tokenStrings = tokens.getTokens().stream()
.map(this::getTokenImage)
.collect(Collectors.toList());
assertEquals(EXPECTED, tokenStrings);
}
private static final String TEST1 = "using System;" + PMD.EOL + "namespace HelloNameSpace {" + PMD.EOL + ""
+ PMD.EOL + " public class HelloWorld {" + PMD.EOL + " static void Main(string[] args) {"
+ PMD.EOL + " Console.WriteLine(\"Hello World!\");" + PMD.EOL + " }" + PMD.EOL + " }"
+ PMD.EOL + "}" + PMD.EOL;
public static junit.framework.Test suite() {
return new junit.framework.JUnit4TestAdapter(AnyTokenizerTest.class);
private @NonNull String getTokenImage(TokenEntry t) {
return t.toString();
}
// Token strings the test compares against the tokenizer output for TEST1,
// in order; the last element stands for the end-of-file entry.
private static final List<String> EXPECTED = listOf(
"using", "System", ";",
"namespace", "HelloNameSpace", "{",
"public", "class", "HelloWorld", "{", // note: comment is excluded
"static", "void", "Main", "(", "string", "[", "]", "args", ")", "{",
"Console", ".", "WriteLine", "(", "\"Hello World!\"", ")", ";",
"}", "}", "}", "EOF"
);
// Input for the tokenizer test: a short program (looks like C#) using "\n"
// line endings, with blank lines, one trailing "//" comment and one
// double-quoted string literal.
private static final String TEST1 =
"using System;\n"
+ "namespace HelloNameSpace {\n"
+ "\n"
+ " public class HelloWorld { // A comment\n"
+ " static void Main(string[] args) {\n"
+ "\n"
+ " Console.WriteLine(\"Hello World!\");\n"
+ " }\n"
+ " }\n"
+ "\n"
+ "}\n";
}