From b27ab97684b782062bc7a2c0b7de4c24fb0a717b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9ment=20Fournier?=
Date: Tue, 1 Sep 2020 17:05:21 +0200
Subject: [PATCH] Improve AnyTokenizer

---
Reviewer note: line structure and indentation were reconstructed from a
whitespace-collapsed copy; whitespace runs inside string literals could not
be recovered. The string-literal and end-of-line-comment regex fragments
were corrected (any backslash escape is accepted; an empty comment is
recognized), so the blob ids on the index lines no longer match.

 .../net/sourceforge/pmd/cpd/AnyTokenizer.java | 98 ++++++++++++++-----
 .../sourceforge/pmd/cpd/AnyTokenizerTest.java | 48 ++++++---
 2 files changed, 108 insertions(+), 38 deletions(-)

diff --git a/pmd-core/src/main/java/net/sourceforge/pmd/cpd/AnyTokenizer.java b/pmd-core/src/main/java/net/sourceforge/pmd/cpd/AnyTokenizer.java
index d45a4af7b8..207a9f05b1 100644
--- a/pmd-core/src/main/java/net/sourceforge/pmd/cpd/AnyTokenizer.java
+++ b/pmd-core/src/main/java/net/sourceforge/pmd/cpd/AnyTokenizer.java
@@ -4,44 +4,88 @@
 
 package net.sourceforge.pmd.cpd;
 
-import java.io.BufferedReader;
-import java.io.CharArrayReader;
-import java.io.IOException;
-import java.util.StringTokenizer;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.commons.lang3.StringUtils;
+
+import net.sourceforge.pmd.util.StringUtil;
 
 /**
- * This class does a best-guess try-anything tokenization.
+ * Simple tokenization into words and separators. Can ignore end-of-line
+ * comments and recognize double/single quoted string literals. It is
+ * not a goal to be very customizable, or have very high quality.
+ * Higher-quality lexers should be implemented with a lexer generator.
  *
- * @author jheintz
+ * <p>In PMD 7, this replaces AbstractTokenizer, which provided nearly
+ * no more functionality and whose API was hard to update.
  */
 public class AnyTokenizer implements Tokenizer {
-    public static final String TOKENS = " \t!#$%^&*(){}-=+<>/\\`~;:";
+
+    private static final Pattern DEFAULT_PATTERN = makePattern("");
+
+    private static Pattern makePattern(String singleLineCommentStart) {
+        return Pattern.compile(
+            "\\w++" // either a word
+                + eolCommentFragment(singleLineCommentStart) // a comment
+                + "|[^\"'\\s]" // a single separator char
+                + "|\"(?:[^\"\\\\]++|\\\\.)*+\"" // a double-quoted string, any backslash escape allowed
+                + "|'(?:[^'\\\\]++|\\\\.)*+'" // a single-quoted string, any backslash escape allowed
+                + "|\n" // or a newline (to count lines), note that sourcecode normalizes line endings
+        );
+    }
+
+    private final Pattern pattern;
+    private final String commentStart;
+
+    public AnyTokenizer() {
+        this(DEFAULT_PATTERN, "");
+    }
+
+    public AnyTokenizer(String eolCommentStart) {
+        this(makePattern(eolCommentStart), eolCommentStart);
+    }
+
+    private AnyTokenizer(Pattern pattern, String commentStart) {
+        this.pattern = pattern;
+        this.commentStart = commentStart;
+    }
+
+    private static String eolCommentFragment(String start) {
+        if (StringUtils.isBlank(start)) {
+            return "";
+        } else {
+            return "|(?:" + Pattern.quote(start) + "[^\n]*+)"; // may be empty; note: sourcecode normalizes line endings
+        }
+    }
 
     @Override
     public void tokenize(SourceCode sourceCode, Tokens tokenEntries) {
-        StringBuilder sb = sourceCode.getCodeBuffer();
-        try (BufferedReader reader = new BufferedReader(new CharArrayReader(sb.toString().toCharArray()))) {
-            int lineNumber = 1;
-            int colNumber = 1;
-            String line = reader.readLine();
-            while (line != null) {
-                StringTokenizer tokenizer = new StringTokenizer(line, TOKENS, true);
-                while (tokenizer.hasMoreTokens()) {
-                    String token = tokenizer.nextToken();
-                    int endCol = colNumber + token.length() - 1; // -1 because inclusive
-                    if (!" ".equals(token) && !"\t".equals(token)) {
-                        tokenEntries.add(new TokenEntry(token, sourceCode.getFileName(), lineNumber, colNumber, endCol));
-                    }
-                    colNumber = endCol + 1;
+        StringBuilder text = sourceCode.getCodeBuffer();
+        Matcher matcher = pattern.matcher(text);
+        int lineNo = 1;
+        int lastLineStart = 0;
+        try {
+            while (matcher.find()) {
+                String image = matcher.group();
+                if (isComment(image)) {
+                    continue;
+                } else if (StringUtils.isWhitespace(image)) {
+                    lineNo++;
+                    lastLineStart = matcher.end();
+                    continue;
                 }
-                // advance iteration variables
-                line = reader.readLine();
-                lineNumber++;
+
+                int bcol = 1 + matcher.start() - lastLineStart;
+                int ecol = 1 + StringUtil.columnNumberAt(image, image.length()); // NOTE(review): derived from the image alone, unlike bcol - confirm it is line-relative
+                tokenEntries.add(new TokenEntry(image, sourceCode.getFileName(), lineNo, bcol, ecol));
             }
-        } catch (IOException ignored) {
-            ignored.printStackTrace();
         } finally {
-            tokenEntries.add(TokenEntry.getEOF());
+            tokenEntries.add(TokenEntry.EOF);
         }
     }
+
+    private boolean isComment(String tok) {
+        return !commentStart.isEmpty() && tok.startsWith(commentStart);
+    }
 }
diff --git a/pmd-core/src/test/java/net/sourceforge/pmd/cpd/AnyTokenizerTest.java b/pmd-core/src/test/java/net/sourceforge/pmd/cpd/AnyTokenizerTest.java
index 559143f126..c116274606 100644
--- a/pmd-core/src/test/java/net/sourceforge/pmd/cpd/AnyTokenizerTest.java
+++ b/pmd-core/src/test/java/net/sourceforge/pmd/cpd/AnyTokenizerTest.java
@@ -4,29 +4,55 @@
 
 package net.sourceforge.pmd.cpd;
 
+import static net.sourceforge.pmd.util.CollectionUtil.listOf;
 import static org.junit.Assert.assertEquals;
 
-import org.junit.Test;
+import java.util.List;
+import java.util.stream.Collectors;
 
-import net.sourceforge.pmd.PMD;
+import org.checkerframework.checker.nullness.qual.NonNull;
+import org.junit.Test;
 
 public class AnyTokenizerTest {
 
     @Test
     public void testMultiLineMacros() {
-        AnyTokenizer tokenizer = new AnyTokenizer();
+        AnyTokenizer tokenizer = new AnyTokenizer("//");
         SourceCode code = new SourceCode(new SourceCode.StringCodeLoader(TEST1));
         Tokens tokens = new Tokens();
         tokenizer.tokenize(code, tokens);
-        assertEquals(30, tokens.size());
+        assertEquals(31, tokens.size());
+        List<String> tokenStrings = tokens.getTokens().stream()
+                                          .map(this::getTokenImage)
+                                          .collect(Collectors.toList());
+
+        assertEquals(EXPECTED, tokenStrings);
     }
 
-    private static final String TEST1 = "using System;" + PMD.EOL + "namespace HelloNameSpace {" + PMD.EOL + ""
-            + PMD.EOL + " public class HelloWorld {" + PMD.EOL + " static void Main(string[] args) {"
-            + PMD.EOL + " Console.WriteLine(\"Hello World!\");" + PMD.EOL + " }" + PMD.EOL + " }"
-            + PMD.EOL + "}" + PMD.EOL;
-
-    public static junit.framework.Test suite() {
-        return new junit.framework.JUnit4TestAdapter(AnyTokenizerTest.class);
+    private @NonNull String getTokenImage(TokenEntry t) {
+        return t.toString();
     }
+
+    private static final List<String> EXPECTED = listOf(
+        "using", "System", ";",
+        "namespace", "HelloNameSpace", "{",
+        "public", "class", "HelloWorld", "{", // note: comment is excluded
+        "static", "void", "Main", "(", "string", "[", "]", "args", ")", "{",
+        "Console", ".", "WriteLine", "(", "\"Hello World!\"", ")", ";",
+        "}", "}", "}", "EOF"
+    );
+
+    private static final String TEST1 =
+        "using System;\n"
+            + "namespace HelloNameSpace {\n"
+            + "\n"
+            + " public class HelloWorld { // A comment\n"
+            + " static void Main(string[] args) {\n"
+            + "\n"
+            + " Console.WriteLine(\"Hello World!\");\n"
+            + " }\n"
+            + " }\n"
+            + "\n"
+            + "}\n";
+
 }