diff --git a/pmd-core/src/main/java/net/sourceforge/pmd/cpd/AnyTokenizer.java b/pmd-core/src/main/java/net/sourceforge/pmd/cpd/AnyTokenizer.java index d45a4af7b8..207a9f05b1 100644 --- a/pmd-core/src/main/java/net/sourceforge/pmd/cpd/AnyTokenizer.java +++ b/pmd-core/src/main/java/net/sourceforge/pmd/cpd/AnyTokenizer.java @@ -4,44 +4,88 @@ package net.sourceforge.pmd.cpd; -import java.io.BufferedReader; -import java.io.CharArrayReader; -import java.io.IOException; -import java.util.StringTokenizer; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import org.apache.commons.lang3.StringUtils; + +import net.sourceforge.pmd.util.StringUtil; /** - * This class does a best-guess try-anything tokenization. + * Simple tokenization into words and separators. Can ignore end-of-line + * comments and recognize double/single quoted string literals. It is + * not a goal to be very customizable, or have very high quality. + * Higher-quality lexers should be implemented with a lexer generator. * - * @author jheintz + *
In PMD 7, this replaces AbstractTokenizer, which provided hardly any
+ * additional functionality and whose API was hard to update.
*/
public class AnyTokenizer implements Tokenizer {
- public static final String TOKENS = " \t!#$%^&*(){}-=+<>/\\`~;:";
+
+ private static final Pattern DEFAULT_PATTERN = makePattern(""); // shared pattern used by the no-arg constructor; built without an end-of-line comment alternative
+
+ private static Pattern makePattern(String singleLineCommentStart) { // builds the token regex; alternatives are tried left to right
+ return Pattern.compile(
+ "\\w++" // either a word
+ + eolCommentFragment(singleLineCommentStart) // a comment
+ + "|[^\"'\\s]" // a single separator char
+ + "|\"(?:[^\"\\\\]++|\\\\.)*+\"" // a double-quoted string; a backslash escapes any non-newline char, so "a\\" ends where it should
+ + "|'(?:[^'\\\\]++|\\\\.)*+'" // a single-quoted string, same escape handling
+ + "|\n" // or a newline (to count lines), note that sourcecode normalizes line endings
+ );
+ }
+
+ private final Pattern pattern; // token regex: DEFAULT_PATTERN or the result of makePattern
+ private final String commentStart; // end-of-line comment marker as given to the constructor ("" for the no-arg constructor)
+
+ public AnyTokenizer() { // tokenizer without any end-of-line comment syntax
+ this(DEFAULT_PATTERN, "");
+ }
+
+ public AnyTokenizer(String eolCommentStart) { // tokenizer whose pattern is built from the given end-of-line comment marker
+ this(makePattern(eolCommentStart), eolCommentStart);
+ }
+
+ private AnyTokenizer(Pattern pattern, String commentStart) {
+ this.pattern = pattern;
+ this.commentStart = commentStart;
+ }
+
+ private static String eolCommentFragment(String start) { // regex alternative matching an end-of-line comment, or "" when start is blank
+ if (StringUtils.isBlank(start)) {
+ return "";
+ } else {
+ return "|(?:" + Pattern.quote(start) + "[^\n]*+)"; // *+ (not ++) so an empty comment, i.e. the marker directly before a newline or EOF, is still matched as a comment; note: sourcecode normalizes line endings
+ }
+ }
@Override
public void tokenize(SourceCode sourceCode, Tokens tokenEntries) {
- StringBuilder sb = sourceCode.getCodeBuffer();
- try (BufferedReader reader = new BufferedReader(new CharArrayReader(sb.toString().toCharArray()))) {
- int lineNumber = 1;
- int colNumber = 1;
- String line = reader.readLine();
- while (line != null) {
- StringTokenizer tokenizer = new StringTokenizer(line, TOKENS, true);
- while (tokenizer.hasMoreTokens()) {
- String token = tokenizer.nextToken();
- int endCol = colNumber + token.length() - 1; // -1 because inclusive
- if (!" ".equals(token) && !"\t".equals(token)) {
- tokenEntries.add(new TokenEntry(token, sourceCode.getFileName(), lineNumber, colNumber, endCol));
- }
- colNumber = endCol + 1;
+ StringBuilder text = sourceCode.getCodeBuffer();
+ Matcher matcher = pattern.matcher(text);
+ int lineNo = 1;
+ int lastLineStart = 0; // offset in text just past the most recent newline token
+ try {
+ while (matcher.find()) {
+ String image = matcher.group();
+ if (isComment(image)) {
+ continue; // comments produce no token
+ } else if (StringUtils.isWhitespace(image)) {
+ lineNo++; // the only all-whitespace match the pattern can produce is a newline
+ lastLineStart = matcher.end();
+ continue;
}
- // advance iteration variables
- line = reader.readLine();
- lineNumber++;
+ // A single-line token ends (exclusive) at bcol + length. NOTE(review): multi-line string tokens keep the image-relative formula and do not advance lineNo -- confirm against StringUtil.columnNumberAt.
+ int bcol = 1 + matcher.start() - lastLineStart; // 1-based column of the token's first char
+ int ecol = image.indexOf('\n') < 0 ? bcol + image.length() : 1 + StringUtil.columnNumberAt(image, image.length());
+ tokenEntries.add(new TokenEntry(image, sourceCode.getFileName(), lineNo, bcol, ecol));
}
- } catch (IOException ignored) {
- ignored.printStackTrace();
} finally {
- tokenEntries.add(TokenEntry.getEOF());
+ tokenEntries.add(TokenEntry.EOF);
}
}
+
+ private boolean isComment(String tok) { // true when tok starts with the configured end-of-line comment marker
+ return StringUtils.isNotBlank(commentStart) && tok.startsWith(commentStart); // same blank test as eolCommentFragment: a blank marker such as "\n" must not turn newline tokens into comments, and a null marker must not throw
+ }
}
diff --git a/pmd-core/src/test/java/net/sourceforge/pmd/cpd/AnyTokenizerTest.java b/pmd-core/src/test/java/net/sourceforge/pmd/cpd/AnyTokenizerTest.java
index 559143f126..c116274606 100644
--- a/pmd-core/src/test/java/net/sourceforge/pmd/cpd/AnyTokenizerTest.java
+++ b/pmd-core/src/test/java/net/sourceforge/pmd/cpd/AnyTokenizerTest.java
@@ -4,29 +4,55 @@
package net.sourceforge.pmd.cpd;
+import static net.sourceforge.pmd.util.CollectionUtil.listOf;
import static org.junit.Assert.assertEquals;
-import org.junit.Test;
+import java.util.List;
+import java.util.stream.Collectors;
-import net.sourceforge.pmd.PMD;
+import org.checkerframework.checker.nullness.qual.NonNull;
+import org.junit.Test;
public class AnyTokenizerTest {
@Test
public void testMultiLineMacros() {
- AnyTokenizer tokenizer = new AnyTokenizer();
+ AnyTokenizer tokenizer = new AnyTokenizer("//");
SourceCode code = new SourceCode(new SourceCode.StringCodeLoader(TEST1));
Tokens tokens = new Tokens();
tokenizer.tokenize(code, tokens);
- assertEquals(30, tokens.size());
+ assertEquals(31, tokens.size());
+ List