diff --git a/pmd-core/src/main/java/net/sourceforge/pmd/cpd/AnyTokenizer.java b/pmd-core/src/main/java/net/sourceforge/pmd/cpd/AnyTokenizer.java index da143594e8..a2d1825af3 100644 --- a/pmd-core/src/main/java/net/sourceforge/pmd/cpd/AnyTokenizer.java +++ b/pmd-core/src/main/java/net/sourceforge/pmd/cpd/AnyTokenizer.java @@ -29,8 +29,8 @@ public class AnyTokenizer implements Tokenizer { "\\w++" // either a word + eolCommentFragment(singleLineCommentStart) // a comment + "|[^\"'\\s]" // a single separator char - + "|\"(?:[^\"\\\\]++|\\\\\")*+\"" // a double-quoted string - + "|'(?:[^'\\\\]++|\\\\')*+'" // a single-quoted string + + "|\"(?:[^\"\\\\]++|\\\\.)*+\"" // a double-quoted string + + "|'(?:[^'\\\\]++|\\\\.)*+'" // a single-quoted string + "|\n" // or a newline (to count lines), note that sourcecode normalizes line endings ); } @@ -55,7 +55,7 @@ public class AnyTokenizer implements Tokenizer { if (StringUtils.isBlank(start)) { return ""; } else { - return "|(?:" + Pattern.quote(start) + "[^\n]++)"; // note: sourcecode normalizes line endings + return "|(?:" + Pattern.quote(start) + "[^\n]*+)"; // note: sourcecode normalizes line endings } } @@ -78,6 +78,9 @@ public class AnyTokenizer implements Tokenizer { int bcol = 1 + matcher.start() - lastLineStart; // + 1 because columns are 1 based int ecol = StringUtil.columnNumberAt(image, image.length()); // this already outputs a 1-based column + if (ecol == image.length() + 1) { + ecol = bcol + image.length(); // single-line token + } tokenEntries.add(new TokenEntry(image, sourceCode.getFileName(), lineNo, bcol, ecol)); } } finally { diff --git a/pmd-core/src/test/java/net/sourceforge/pmd/cpd/AnyTokenizerTest.java b/pmd-core/src/test/java/net/sourceforge/pmd/cpd/AnyTokenizerTest.java index c116274606..bbba98a210 100644 --- a/pmd-core/src/test/java/net/sourceforge/pmd/cpd/AnyTokenizerTest.java +++ b/pmd-core/src/test/java/net/sourceforge/pmd/cpd/AnyTokenizerTest.java @@ -18,15 +18,24 @@ public class AnyTokenizerTest { @Test public void testMultiLineMacros() { AnyTokenizer tokenizer = new AnyTokenizer("//"); - SourceCode code = new SourceCode(new SourceCode.StringCodeLoader(TEST1)); + compareResult(tokenizer, TEST1, EXPECTED); + } + + @Test + public void testStringEscape() { + AnyTokenizer tokenizer = new AnyTokenizer("//"); + compareResult(tokenizer, "a = \"oo\\n\"", listOf("a", "=", "\"oo\\n\"", "EOF")); + } + + private void compareResult(AnyTokenizer tokenizer, String source, List expectedImages) { + SourceCode code = new SourceCode(new SourceCode.StringCodeLoader(source)); Tokens tokens = new Tokens(); tokenizer.tokenize(code, tokens); - assertEquals(31, tokens.size()); List tokenStrings = tokens.getTokens().stream() .map(this::getTokenImage) .collect(Collectors.toList()); - assertEquals(EXPECTED, tokenStrings); + assertEquals(expectedImages, tokenStrings); } private @NonNull String getTokenImage(TokenEntry t) { diff --git a/pmd-perl/src/main/java/net/sourceforge/pmd/cpd/PerlLanguage.java b/pmd-perl/src/main/java/net/sourceforge/pmd/cpd/PerlLanguage.java index 2df0534704..c66d201c56 100644 --- a/pmd-perl/src/main/java/net/sourceforge/pmd/cpd/PerlLanguage.java +++ b/pmd-perl/src/main/java/net/sourceforge/pmd/cpd/PerlLanguage.java @@ -6,6 +6,6 @@ package net.sourceforge.pmd.cpd; public class PerlLanguage extends AbstractLanguage { public PerlLanguage() { - super("Perl", "perl", new PerlTokenizer(), ".pm", ".pl", ".t"); + super("Perl", "perl", new AnyTokenizer("#"), ".pm", ".pl", ".t"); } } diff --git a/pmd-perl/src/test/java/net/sourceforge/pmd/lang/perl/cpd/PerlTokenizerTest.java b/pmd-perl/src/test/java/net/sourceforge/pmd/lang/perl/cpd/PerlTokenizerTest.java new file mode 100644 index 0000000000..ae90a538db --- /dev/null +++ b/pmd-perl/src/test/java/net/sourceforge/pmd/lang/perl/cpd/PerlTokenizerTest.java @@ -0,0 +1,34 @@ +/* + * BSD-style license; for more info see http://pmd.sourceforge.net/license.html + */ + +package net.sourceforge.pmd.lang.perl.cpd; + +import java.util.Properties; + +import org.checkerframework.checker.nullness.qual.NonNull; +import org.junit.Test; + +import net.sourceforge.pmd.cpd.PerlLanguage; +import net.sourceforge.pmd.cpd.Tokenizer; +import net.sourceforge.pmd.cpd.test.CpdTextComparisonTest; + +/** + * + */ +public class PerlTokenizerTest extends CpdTextComparisonTest { + + public PerlTokenizerTest() { + super(".pl"); + } + + @Override + public Tokenizer newTokenizer(@NonNull Properties properties) { + return new PerlLanguage().getTokenizer(); + } + + @Test + public void testSample() { + doTest("sample"); + } +} diff --git a/pmd-perl/src/test/resources/net/sourceforge/pmd/lang/perl/cpd/testdata/sample.pl b/pmd-perl/src/test/resources/net/sourceforge/pmd/lang/perl/cpd/testdata/sample.pl new file mode 100644 index 0000000000..b7abc18a25 --- /dev/null +++ b/pmd-perl/src/test/resources/net/sourceforge/pmd/lang/perl/cpd/testdata/sample.pl @@ -0,0 +1,28 @@ +#!/usr/bin/perl -w +# courtesy of https://github.com/briandfoy/Learning-Perl-Sample-Files +# (no license) + +use strict; + +# This next line of code is used when you get to Chapter 9. +my $what = 'fred|barney'; + +while (<>) { + chomp; + # If you want to try matching strings which may contain + # newlines, here's the trick to use: Uncomment this next + # line, then use a pound sign ("#") wherever you mean to + # have a newline within your data string. + # s/#/\n/g; + + if (/YOUR_PATTERN_GOES_HERE/) { + print "Matched: |$`<$&>$'|\n"; + # If you need these for testing patterns with + # memories, uncomment them as well + # print " And memory one got <$1>\n"; + # print " And memory two got <$2>\n"; + } else { + print "No match.\n"; + } + +} diff --git a/pmd-perl/src/test/resources/net/sourceforge/pmd/lang/perl/cpd/testdata/sample.txt b/pmd-perl/src/test/resources/net/sourceforge/pmd/lang/perl/cpd/testdata/sample.txt new file mode 100644 index 0000000000..e4e548f58f --- /dev/null +++ b/pmd-perl/src/test/resources/net/sourceforge/pmd/lang/perl/cpd/testdata/sample.txt @@ -0,0 +1,47 @@ + [Image] or [Truncated image[ Bcol Ecol +L5 + [use] 1 4 + [strict] 5 11 + [;] 11 12 +L8 + [my] 1 3 + [$] 4 5 + [what] 5 9 + [=] 10 11 + ['fred|barney'] 12 25 + [;] 25 26 +L10 + [while] 1 6 + [(] 7 8 + [<] 8 9 + [>] 9 10 + [)] 10 11 + [{] 12 13 +L11 + [chomp] 5 10 + [;] 10 11 +L18 + [if] 5 7 + [(] 8 9 + [/] 9 10 + [YOUR_PATTERN_GOES_HERE] 10 32 + [/] 32 33 + [)] 33 34 + [{] 35 36 +L19 + [print] 2 7 + ["Matched: |$`<$&>$'|\\n"] 8 31 + [;] 31 32 +L24 + [}] 5 6 + [else] 7 11 + [{] 12 13 +L25 + [print] 2 7 + ["No match.\\n"] 8 21 + [;] 21 22 +L26 + [}] 5 6 +L28 + [}] 1 2 +EOF