Improve AnyTokenizer

2020-09-01 17:05:21 +02:00
parent 3ce68f4977
commit b27ab97684
2 changed files with 108 additions and 38 deletions
--- a/pmd-core/src/main/java/net/sourceforge/pmd/cpd/AnyTokenizer.java
+++ b/pmd-core/src/main/java/net/sourceforge/pmd/cpd/AnyTokenizer.java
@ -4,44 +4,88 @@

 package net.sourceforge.pmd.cpd;

-import java.io.BufferedReader;
-import java.io.CharArrayReader;
-import java.io.IOException;
-import java.util.StringTokenizer;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.commons.lang3.StringUtils;
+
+import net.sourceforge.pmd.util.StringUtil;

 /**
- * This class does a best-guess try-anything tokenization.
+ * Simple tokenization into words and separators. Can ignore end-of-line
+ * comments and recognize double/single quoted string literals. It is
+ * not a goal to be very customizable, or have very high quality.
+ * Higher-quality lexers should be implemented with a lexer generator.
 *
- * @author jheintz
+ * <p>In PMD 7, this replaces AbstractTokenizer, which offered hardly any
+ * additional functionality and whose API was hard to update.
 */
 public class AnyTokenizer implements Tokenizer {
-    public static final String TOKENS = " \t!#$%^&*(){}-=+<>/\\`~;:";
+
+    /** Pattern used when no end-of-line comment prefix is configured. */
+    private static final Pattern DEFAULT_PATTERN = makePattern("");
+
+    /**
+     * Builds the token pattern. The alternatives are tried in the order written,
+     * so the word alternative wins over the comment alternative when a comment
+     * prefix starts with a word character.
+     *
+     * @param singleLineCommentStart prefix of end-of-line comments; a blank value
+     *                               adds no comment alternative
+     * @return the compiled pattern
+     */
+    private static Pattern makePattern(String singleLineCommentStart) {
+        return Pattern.compile(
+            "\\w++" // either a word
+                + eolCommentFragment(singleLineCommentStart) // a comment
+                + "|[^\"'\\s]" // a single separator char
+                // a backslash escapes whatever char follows it, so \n or \\ inside a
+                // literal no longer breaks the match, and a literal ending in \\ closes
+                // at its own quote instead of running on to the next one
+                + "|\"(?:[^\"\\\\]++|\\\\.)*+\"" // a double-quoted string
+                + "|'(?:[^'\\\\]++|\\\\.)*+'" // a single-quoted string
+                + "|\n" // or a newline (to count lines), note that sourcecode normalizes line endings
+        );
+    }
+
+    /** Compiled token pattern applied by {@code tokenize}. */
+    private final Pattern pattern;
+    /** Prefix of end-of-line comments; empty when comments are not recognized. */
+    private final String commentStart;
+
+    /** Creates a tokenizer that does not recognize comments. */
+    public AnyTokenizer() {
+        this(DEFAULT_PATTERN, "");
+    }
+
+    /**
+     * Creates a tokenizer that skips end-of-line comments.
+     *
+     * @param eolCommentStart prefix that starts a comment running to the end of
+     *                        the line; a null or blank value disables comment handling
+     */
+    public AnyTokenizer(String eolCommentStart) {
+        this(makePattern(eolCommentStart), eolCommentStart);
+    }
+
+    private AnyTokenizer(Pattern pattern, String commentStart) {
+        this.pattern = pattern;
+        // The pattern gets no comment alternative for a null/blank prefix; store ""
+        // in that case so isComment agrees with the pattern and never dereferences null.
+        this.commentStart = StringUtils.isBlank(commentStart) ? "" : commentStart;
+    }
+
+    /**
+     * Builds the regex alternative that matches an end-of-line comment.
+     *
+     * @param start prefix that starts a comment; may be null or blank
+     * @return an alternative beginning with {@code |} that matches the prefix and
+     *         the rest of the line, or the empty string when {@code start} is blank
+     */
+    private static String eolCommentFragment(String start) {
+        if (StringUtils.isBlank(start)) {
+            return "";
+        } else {
+            // "*+" rather than "++": a prefix directly followed by the line break is
+            // still a comment, otherwise its characters leak out as separator tokens
+            return "|(?:" + Pattern.quote(start) + "[^\n]*+)"; // note: sourcecode normalizes line endings
+        }
+    }

    @Override
    public void tokenize(SourceCode sourceCode, Tokens tokenEntries) {
-        StringBuilder sb = sourceCode.getCodeBuffer();
-        try (BufferedReader reader = new BufferedReader(new CharArrayReader(sb.toString().toCharArray()))) {
-            int lineNumber = 1;
-            int colNumber = 1;
-            String line = reader.readLine();
-            while (line != null) {
-                StringTokenizer tokenizer = new StringTokenizer(line, TOKENS, true);
-                while (tokenizer.hasMoreTokens()) {
-                    String token = tokenizer.nextToken();
-                    int endCol = colNumber + token.length() - 1; // -1 because inclusive
-                    if (!" ".equals(token) && !"\t".equals(token)) {
-                        tokenEntries.add(new TokenEntry(token, sourceCode.getFileName(), lineNumber, colNumber, endCol));
-                    }
-                    colNumber = endCol + 1;
+        StringBuilder text = sourceCode.getCodeBuffer();
+        Matcher matcher = pattern.matcher(text);
+        int lineNo = 1;
+        int lastLineStart = 0;
+        try {
+            while (matcher.find()) {
+                String image = matcher.group();
+                if (isComment(image)) {
+                    continue;
+                } else if (StringUtils.isWhitespace(image)) {
+                    lineNo++;
+                    lastLineStart = matcher.end();
+                    continue;
                }
-                // advance iteration variables
-                line = reader.readLine();
-                lineNumber++;
+
+                int bcol = 1 + matcher.start() - lastLineStart;
+                int ecol = 1 + StringUtil.columnNumberAt(image, image.length());
+                tokenEntries.add(new TokenEntry(image, sourceCode.getFileName(), lineNo, bcol, ecol));
            }
-        } catch (IOException ignored) {
-            ignored.printStackTrace();
        } finally {
-            tokenEntries.add(TokenEntry.getEOF());
+            tokenEntries.add(TokenEntry.EOF);
        }
    }
+
+    /** Tells whether the matched text is an end-of-line comment to be skipped. */
+    private boolean isComment(String tok) {
+        if (commentStart.isEmpty()) {
+            // no comment prefix configured: nothing is ever a comment
+            return false;
+        }
+        return tok.startsWith(commentStart);
+    }
 }
--- a/pmd-core/src/test/java/net/sourceforge/pmd/cpd/AnyTokenizerTest.java
+++ b/pmd-core/src/test/java/net/sourceforge/pmd/cpd/AnyTokenizerTest.java
@ -4,29 +4,55 @@

 package net.sourceforge.pmd.cpd;

+import static net.sourceforge.pmd.util.CollectionUtil.listOf;
 import static org.junit.Assert.assertEquals;

-import org.junit.Test;
+import java.util.List;
+import java.util.stream.Collectors;

-import net.sourceforge.pmd.PMD;
+import org.checkerframework.checker.nullness.qual.NonNull;
+import org.junit.Test;

 public class AnyTokenizerTest {

+    /**
+     * Tokenizes {@code TEST1} with "//" as the end-of-line comment prefix and
+     * checks both the token count and the exact sequence of token strings.
+     */
    @Test
    public void testMultiLineMacros() {
-        AnyTokenizer tokenizer = new AnyTokenizer();
+        AnyTokenizer tokenizer = new AnyTokenizer("//");
        SourceCode code = new SourceCode(new SourceCode.StringCodeLoader(TEST1));
        Tokens tokens = new Tokens();
        tokenizer.tokenize(code, tokens);
-        assertEquals(30, tokens.size());
+        assertEquals(31, tokens.size());
+        List<String> tokenStrings = tokens.getTokens().stream()
+                                          .map(this::getTokenImage)
+                                          .collect(Collectors.toList());
+
+        assertEquals(EXPECTED, tokenStrings);
    }

-    private static final String TEST1 = "using System;" + PMD.EOL + "namespace HelloNameSpace {" + PMD.EOL + ""
-            + PMD.EOL + "    public class HelloWorld {" + PMD.EOL + "        static void Main(string[] args) {"
-            + PMD.EOL + "            Console.WriteLine(\"Hello World!\");" + PMD.EOL + "        }" + PMD.EOL + "    }"
-            + PMD.EOL + "}" + PMD.EOL;
-
-    public static junit.framework.Test suite() {
-        return new junit.framework.JUnit4TestAdapter(AnyTokenizerTest.class);
+    // Maps a token entry to the string that is compared against EXPECTED.
+    private @NonNull String getTokenImage(TokenEntry t) {
+        return t.toString();
    }
+
+    // Token strings expected for TEST1, one source line per row; the "//" comment
+    // contributes nothing and the final entry is the EOF token.
+    private static final List<String> EXPECTED = listOf(
+        "using", "System", ";",
+        "namespace", "HelloNameSpace", "{",
+        "public", "class", "HelloWorld", "{", // note: comment is excluded
+        "static", "void", "Main", "(", "string", "[", "]", "args", ")", "{",
+        "Console", ".", "WriteLine", "(", "\"Hello World!\"", ")", ";",
+        "}", "}", "}", "EOF"
+    );
+
+    // Sample input: contains blank lines, one "//" end-of-line comment and one
+    // double-quoted string literal.
+    private static final String TEST1 =
+        "using System;\n"
+            + "namespace HelloNameSpace {\n"
+            + "\n"
+            + "    public class HelloWorld { // A comment\n"
+            + "        static void Main(string[] args) {\n"
+            + "\n"
+            + "            Console.WriteLine(\"Hello World!\");\n"
+            + "        }\n"
+            + "    }\n"
+            + "\n"
+            + "}\n";
+
 }