Improve AnyTokenizer
This commit is contained in:
@ -4,44 +4,88 @@
|
||||
|
||||
package net.sourceforge.pmd.cpd;
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.CharArrayReader;
|
||||
import java.io.IOException;
|
||||
import java.util.StringTokenizer;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import net.sourceforge.pmd.util.StringUtil;
|
||||
|
||||
/**
|
||||
* This class does a best-guess try-anything tokenization.
|
||||
* Simple tokenization into words and separators. Can ignore end-of-line
|
||||
* comments and recognize double/single quoted string literals. It is
|
||||
* not a goal to be very customizable, or have very high quality.
|
||||
* Higher-quality lexers should be implemented with a lexer generator.
|
||||
*
|
||||
* @author jheintz
|
||||
* <p>In PMD 7, this replaces AbstractTokenizer, which offered hardly
|
||||
* any additional functionality and whose API was hard to update.
|
||||
*/
|
||||
public class AnyTokenizer implements Tokenizer {
|
||||
public static final String TOKENS = " \t!#$%^&*(){}-=+<>/\\`~;:";
|
||||
|
||||
private static final Pattern DEFAULT_PATTERN = makePattern("");
|
||||
|
||||
private static Pattern makePattern(String singleLineCommentStart) {
|
||||
return Pattern.compile(
|
||||
"\\w++" // either a word
|
||||
+ eolCommentFragment(singleLineCommentStart) // a comment
|
||||
+ "|[^\"'\\s]" // a single separator char
|
||||
+ "|\"(?:[^\"\\\\]++|\\\\\")*+\"" // a double-quoted string
|
||||
+ "|'(?:[^'\\\\]++|\\\\')*+'" // a single-quoted string
|
||||
+ "|\n" // or a newline (to count lines), note that sourcecode normalizes line endings
|
||||
);
|
||||
}
|
||||
|
||||
private final Pattern pattern;
|
||||
private final String commentStart;
|
||||
|
||||
/**
 * Creates a tokenizer with the default pattern and an empty comment prefix,
 * i.e. one for which isComment never reports a token as a comment.
 */
public AnyTokenizer() {
    this(DEFAULT_PATTERN, "");
}
|
||||
|
||||
/**
 * Creates a tokenizer that leaves end-of-line comments out of the token stream.
 *
 * @param eolCommentStart prefix that starts a comment running to the end of
 *                        the line (for example {@code "//"}); {@code null}
 *                        or blank means that no comment is recognized
 */
public AnyTokenizer(String eolCommentStart) {
    // makePattern adds no comment alternative for a null/blank prefix, so the stored
    // prefix must be "" in that case too. Otherwise isComment throws on null, and a
    // blank prefix made of a newline would make it swallow every newline token.
    this(makePattern(eolCommentStart), StringUtils.isBlank(eolCommentStart) ? "" : eolCommentStart);
}
|
||||
|
||||
/**
 * Common initializer behind the public constructors.
 *
 * @param pattern      regex used by tokenize to find the tokens
 * @param commentStart prefix checked by isComment; an empty string disables that check
 */
private AnyTokenizer(Pattern pattern, String commentStart) {
    this.commentStart = commentStart;
    this.pattern = pattern;
}
|
||||
|
||||
private static String eolCommentFragment(String start) {
|
||||
if (StringUtils.isBlank(start)) {
|
||||
return "";
|
||||
} else {
|
||||
return "|(?:" + Pattern.quote(start) + "[^\n]++)"; // note: sourcecode normalizes line endings
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Tokenizes the text of {@code sourceCode} and appends one entry per token to
 * {@code tokenEntries}; an EOF entry is appended in the {@code finally} clause.
 *
 * NOTE(review): this span is a rendered diff hunk. Statements of the earlier
 * implementation (BufferedReader / StringTokenizer line loop) and of the newer
 * one (regex Matcher loop) are interleaved, so it does not compile as shown.
 */
@Override
|
||||
public void tokenize(SourceCode sourceCode, Tokens tokenEntries) {
|
||||
StringBuilder sb = sourceCode.getCodeBuffer();
|
||||
try (BufferedReader reader = new BufferedReader(new CharArrayReader(sb.toString().toCharArray()))) {
|
||||
int lineNumber = 1;
|
||||
int colNumber = 1;
|
||||
String line = reader.readLine();
|
||||
while (line != null) {
|
||||
StringTokenizer tokenizer = new StringTokenizer(line, TOKENS, true);
|
||||
while (tokenizer.hasMoreTokens()) {
|
||||
String token = tokenizer.nextToken();
|
||||
int endCol = colNumber + token.length() - 1; // -1 because inclusive
|
||||
if (!" ".equals(token) && !"\t".equals(token)) {
|
||||
tokenEntries.add(new TokenEntry(token, sourceCode.getFileName(), lineNumber, colNumber, endCol));
|
||||
}
|
||||
colNumber = endCol + 1;
|
||||
StringBuilder text = sourceCode.getCodeBuffer();
|
||||
Matcher matcher = pattern.matcher(text);
|
||||
int lineNo = 1;
|
||||
int lastLineStart = 0;
|
||||
try {
|
||||
while (matcher.find()) {
|
||||
String image = matcher.group();
|
||||
if (isComment(image)) {
|
||||
continue;
|
||||
} else if (StringUtils.isWhitespace(image)) {
|
||||
lineNo++;
|
||||
lastLineStart = matcher.end();
|
||||
continue;
|
||||
}
|
||||
// advance iteration variables
|
||||
line = reader.readLine();
|
||||
|
||||
lineNumber++;
|
||||
|
||||
|
||||
int bcol = 1 + matcher.start() - lastLineStart;
|
||||
// NOTE(review): ecol is computed from the token image alone and never adds bcol,
// whereas endCol in the StringTokenizer loop is offset by the start column.
// This looks like an image-relative end column; confirm against
// StringUtil.columnNumberAt and the column contract of TokenEntry.
int ecol = 1 + StringUtil.columnNumberAt(image, image.length());
|
||||
tokenEntries.add(new TokenEntry(image, sourceCode.getFileName(), lineNo, bcol, ecol));
|
||||
}
|
||||
} catch (IOException ignored) {
|
||||
ignored.printStackTrace();
|
||||
} finally {
|
||||
tokenEntries.add(TokenEntry.getEOF());
|
||||
tokenEntries.add(TokenEntry.EOF);
|
||||
}
|
||||
}
|
||||
|
||||
private boolean isComment(String tok) {
|
||||
return !commentStart.isEmpty() && tok.startsWith(commentStart);
|
||||
}
|
||||
}
|
||||
|
@ -4,29 +4,55 @@
|
||||
|
||||
package net.sourceforge.pmd.cpd;
|
||||
|
||||
import static net.sourceforge.pmd.util.CollectionUtil.listOf;
|
||||
import static org.junit.Assert.assertEquals;
|
||||
|
||||
import org.junit.Test;
|
||||
import java.util.List;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import net.sourceforge.pmd.PMD;
|
||||
import org.checkerframework.checker.nullness.qual.NonNull;
|
||||
import org.junit.Test;
|
||||
|
||||
public class AnyTokenizerTest {
|
||||
|
||||
@Test
|
||||
public void testMultiLineMacros() {
|
||||
AnyTokenizer tokenizer = new AnyTokenizer();
|
||||
AnyTokenizer tokenizer = new AnyTokenizer("//");
|
||||
SourceCode code = new SourceCode(new SourceCode.StringCodeLoader(TEST1));
|
||||
Tokens tokens = new Tokens();
|
||||
tokenizer.tokenize(code, tokens);
|
||||
assertEquals(30, tokens.size());
|
||||
assertEquals(31, tokens.size());
|
||||
List<String> tokenStrings = tokens.getTokens().stream()
|
||||
.map(this::getTokenImage)
|
||||
.collect(Collectors.toList());
|
||||
|
||||
assertEquals(EXPECTED, tokenStrings);
|
||||
}
|
||||
|
||||
private static final String TEST1 = "using System;" + PMD.EOL + "namespace HelloNameSpace {" + PMD.EOL + ""
|
||||
+ PMD.EOL + " public class HelloWorld {" + PMD.EOL + " static void Main(string[] args) {"
|
||||
+ PMD.EOL + " Console.WriteLine(\"Hello World!\");" + PMD.EOL + " }" + PMD.EOL + " }"
|
||||
+ PMD.EOL + "}" + PMD.EOL;
|
||||
|
||||
public static junit.framework.Test suite() {
|
||||
return new junit.framework.JUnit4TestAdapter(AnyTokenizerTest.class);
|
||||
private @NonNull String getTokenImage(TokenEntry t) {
|
||||
return t.toString();
|
||||
}
|
||||
|
||||
private static final List<String> EXPECTED = listOf(
|
||||
"using", "System", ";",
|
||||
"namespace", "HelloNameSpace", "{",
|
||||
"public", "class", "HelloWorld", "{", // note: comment is excluded
|
||||
"static", "void", "Main", "(", "string", "[", "]", "args", ")", "{",
|
||||
"Console", ".", "WriteLine", "(", "\"Hello World!\"", ")", ";",
|
||||
"}", "}", "}", "EOF"
|
||||
);
|
||||
|
||||
private static final String TEST1 =
|
||||
"using System;\n"
|
||||
+ "namespace HelloNameSpace {\n"
|
||||
+ "\n"
|
||||
+ " public class HelloWorld { // A comment\n"
|
||||
+ " static void Main(string[] args) {\n"
|
||||
+ "\n"
|
||||
+ " Console.WriteLine(\"Hello World!\");\n"
|
||||
+ " }\n"
|
||||
+ " }\n"
|
||||
+ "\n"
|
||||
+ "}\n";
|
||||
|
||||
}
|
||||
|
Reference in New Issue
Block a user