From 3ceb80d7ac9ca8595971eacca65eb53b03105590 Mon Sep 17 00:00:00 2001 From: TIOBE Software Date: Wed, 22 Aug 2012 14:18:41 +0200 Subject: [PATCH 1/2] Added missing files for patch 2996539 : Add support for C# to CPD (simple parser) --- .../net/sourceforge/pmd/cpd/CsLanguage.java | 10 + .../net/sourceforge/pmd/cpd/CsTokenizer.java | 225 ++++++++++++++++++ 2 files changed, 235 insertions(+) create mode 100644 pmd/src/main/java/net/sourceforge/pmd/cpd/CsLanguage.java create mode 100644 pmd/src/main/java/net/sourceforge/pmd/cpd/CsTokenizer.java diff --git a/pmd/src/main/java/net/sourceforge/pmd/cpd/CsLanguage.java b/pmd/src/main/java/net/sourceforge/pmd/cpd/CsLanguage.java new file mode 100644 index 0000000000..eebca47820 --- /dev/null +++ b/pmd/src/main/java/net/sourceforge/pmd/cpd/CsLanguage.java @@ -0,0 +1,10 @@ +/** + * BSD-style license; for more info see http://pmd.sourceforge.net/license.html + */ +package net.sourceforge.pmd.cpd; + +public class CsLanguage extends AbstractLanguage { + public CsLanguage() { + super(new CsTokenizer(), ".cs"); + } +} diff --git a/pmd/src/main/java/net/sourceforge/pmd/cpd/CsTokenizer.java b/pmd/src/main/java/net/sourceforge/pmd/cpd/CsTokenizer.java new file mode 100644 index 0000000000..3e50431c74 --- /dev/null +++ b/pmd/src/main/java/net/sourceforge/pmd/cpd/CsTokenizer.java @@ -0,0 +1,225 @@ +/** + * BSD-style license; for more info see http://pmd.sourceforge.net/license.html + */ +package net.sourceforge.pmd.cpd; + +import java.io.BufferedReader; +import java.io.CharArrayReader; +import java.io.IOException; +import java.util.NoSuchElementException; +import java.util.StringTokenizer; + +/** + * This class does a best-guess try-anything tokenization. 
+ * + * @author jheintz + */ +public class CsTokenizer implements Tokenizer { + + public void tokenize(SourceCode sourceCode, Tokens tokenEntries) { + BufferedReader reader = new BufferedReader(new CharArrayReader(sourceCode.getCodeBuffer().toString().toCharArray())); + try { + int ic = reader.read(), line=1; + char c; + StringBuilder b; + while(ic!=-1) + { + c = (char)ic; + switch(c) + { + // new line + case '\n': + line++; + ic = reader.read(); + break; + + // white space + case ' ': + case '\t': + case '\r': + ic = reader.read(); + break; + + // < << <= <<= > >> >= >>= + case '<': + case '>': + ic = reader.read(); + if(ic == '=') + { + tokenEntries.add(new TokenEntry(String.valueOf(c)+"=", sourceCode.getFileName(), line)); + ic = reader.read(); + } + else if(ic == c) + { + ic = reader.read(); + if(ic == '=') + { + tokenEntries.add(new TokenEntry(String.valueOf(c)+String.valueOf(c)+"=", sourceCode.getFileName(), line)); + ic = reader.read(); + } + else + { + tokenEntries.add(new TokenEntry(String.valueOf(c)+String.valueOf(c), sourceCode.getFileName(), line)); + } + } + else + { + tokenEntries.add(new TokenEntry(String.valueOf(c), sourceCode.getFileName(), line)); + } + break; + + // = == & &= && | |= || + += ++ - -= -- + case '=': + case '&': + case '|': + case '+': + case '-': + ic = reader.read(); + if(ic == '=' || ic == c) + { + tokenEntries.add(new TokenEntry(String.valueOf(c)+String.valueOf((char)ic), sourceCode.getFileName(), line)); + ic = reader.read(); + } + else + { + tokenEntries.add(new TokenEntry(String.valueOf(c), sourceCode.getFileName(), line)); + } + break; + + // ! 
!= * *= % %= ^ ^= ~ ~= + case '!': + case '*': + case '%': + case '^': + case '~': + ic = reader.read(); + if(ic == '=') + { + tokenEntries.add(new TokenEntry(String.valueOf(c)+"=", sourceCode.getFileName(), line)); + ic = reader.read(); + } + else + { + tokenEntries.add(new TokenEntry(String.valueOf(c), sourceCode.getFileName(), line)); + } + break; + + // strings & chars + case '"': + case '\'': + b = new StringBuilder(); + b.append(c); + while((ic = reader.read()) != c) + { + if(ic == -1) + break; + b.append((char)ic); + if(ic == '\\') + b.append((char)reader.read()); + } + b.append((char)ic); + tokenEntries.add(new TokenEntry(b.toString(), sourceCode.getFileName(), line)); + ic = reader.read(); + + // / /= /*...*/ //... + case '/': + switch(c = (char)(ic = reader.read())) + { + case '*': + int state = 1; + b = new StringBuilder(); + b.append(c); + + while((ic = reader.read()) != -1) + { + c = (char)ic; + b.append(c); + + if(state==1) + { + if(c == '*') + state = 2; + } + else + { + if(c == '/') + break; + else if(c != '*') + state = 1; + } + } + tokenEntries.add(new TokenEntry(b.toString(), sourceCode.getFileName(), line)); + break; + + case '/': + b = new StringBuilder(); + b.append(c); + while((ic = reader.read()) != '\n') + { + if(ic==-1) + break; + b.append((char)ic); + } + tokenEntries.add(new TokenEntry(b.toString(), sourceCode.getFileName(), line)); + break; + + case '=': + tokenEntries.add(new TokenEntry("/=", sourceCode.getFileName(), line)); + ic = reader.read(); + break; + + default: + tokenEntries.add(new TokenEntry("/", sourceCode.getFileName(), line)); + break; + } + break; + + + + default: + // [a-zA-Z_][a-zA-Z_0-9]* + if(Character.isJavaIdentifierStart(c)) + { + b = new StringBuilder(); + do + { + b.append(c); + c = (char)(ic = reader.read()); + } while(Character.isJavaIdentifierPart(c)); + tokenEntries.add(new TokenEntry(b.toString(), sourceCode.getFileName(), line)); + } + // numbers + else if(Character.isDigit(c) || c == '.') + { + b = new 
StringBuilder(); + do + { + b.append(c); + if(c == 'e' || c == 'E') + { + c = (char)(ic = reader.read()); + if("1234567890-".indexOf(c)==-1) + break; + b.append(c); + } + c = (char)(ic = reader.read()); + } while("1234567890.iIlLfFdDsSuUeExX".indexOf(c)!=-1); + + tokenEntries.add(new TokenEntry(b.toString(), sourceCode.getFileName(), line)); + } + // anything else + else + { + tokenEntries.add(new TokenEntry(String.valueOf(c), sourceCode.getFileName(), line)); + ic = reader.read(); + break; + } + } + } + } catch (IOException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + tokenEntries.add(TokenEntry.getEOF()); + } +} From 62ec54b1528fbc2f989aaab7ba3db0d47c5a8810 Mon Sep 17 00:00:00 2001 From: Andreas Dangel Date: Thu, 22 Nov 2012 20:18:07 +0100 Subject: [PATCH 2/2] pmd: added unit test for CsTokenizer * behaves similar like Java, e.g. ignoring semicolons and comments --- .../net/sourceforge/pmd/cpd/CsTokenizer.java | 39 ++++-- pmd/src/site/xdocs/cpd-usage.xml | 4 +- .../sourceforge/pmd/cpd/CsTokenizerTest.java | 111 ++++++++++++++++++ .../pmd/cpd/JavaTokensTokenizerTest.java | 13 +- 4 files changed, 148 insertions(+), 19 deletions(-) create mode 100644 pmd/src/test/java/net/sourceforge/pmd/cpd/CsTokenizerTest.java diff --git a/pmd/src/main/java/net/sourceforge/pmd/cpd/CsTokenizer.java b/pmd/src/main/java/net/sourceforge/pmd/cpd/CsTokenizer.java index 3e50431c74..bdc637c53e 100644 --- a/pmd/src/main/java/net/sourceforge/pmd/cpd/CsTokenizer.java +++ b/pmd/src/main/java/net/sourceforge/pmd/cpd/CsTokenizer.java @@ -6,8 +6,8 @@ package net.sourceforge.pmd.cpd; import java.io.BufferedReader; import java.io.CharArrayReader; import java.io.IOException; -import java.util.NoSuchElementException; -import java.util.StringTokenizer; + +import net.sourceforge.pmd.util.IOUtil; /** * This class does a best-guess try-anything tokenization. 
@@ -40,6 +40,11 @@ public class CsTokenizer implements Tokenizer { ic = reader.read(); break; + // ignore semicolons + case ';': + ic = reader.read(); + break; + // < << <= <<= > >> >= >>= case '<': case '>': @@ -114,12 +119,15 @@ public class CsTokenizer implements Tokenizer { if(ic == -1) break; b.append((char)ic); - if(ic == '\\') - b.append((char)reader.read()); + if(ic == '\\') { + int next = reader.read(); + if (next != -1) b.append((char)next); + } } - b.append((char)ic); + if (ic != -1) b.append((char)ic); tokenEntries.add(new TokenEntry(b.toString(), sourceCode.getFileName(), line)); ic = reader.read(); + break; // / /= /*...*/ //... case '/': @@ -128,7 +136,7 @@ public class CsTokenizer implements Tokenizer { case '*': int state = 1; b = new StringBuilder(); - b.append(c); + b.append("/*"); while((ic = reader.read()) != -1) { @@ -142,25 +150,29 @@ public class CsTokenizer implements Tokenizer { } else { - if(c == '/') + if(c == '/') { + ic = reader.read(); break; - else if(c != '*') + } else if(c != '*') { state = 1; + } } } - tokenEntries.add(new TokenEntry(b.toString(), sourceCode.getFileName(), line)); + // ignore the /* comment + //tokenEntries.add(new TokenEntry(b.toString(), sourceCode.getFileName(), line)); break; case '/': b = new StringBuilder(); - b.append(c); + b.append("//"); while((ic = reader.read()) != '\n') { if(ic==-1) break; b.append((char)ic); } - tokenEntries.add(new TokenEntry(b.toString(), sourceCode.getFileName(), line)); + // ignore the // comment + //tokenEntries.add(new TokenEntry(b.toString(), sourceCode.getFileName(), line)); break; case '=': @@ -217,9 +229,10 @@ public class CsTokenizer implements Tokenizer { } } } catch (IOException e) { - // TODO Auto-generated catch block e.printStackTrace(); + } finally { + IOUtil.closeQuietly(reader); + tokenEntries.add(TokenEntry.getEOF()); } - tokenEntries.add(TokenEntry.getEOF()); } } diff --git a/pmd/src/site/xdocs/cpd-usage.xml b/pmd/src/site/xdocs/cpd-usage.xml index 
32509815f6..2a61067ce3 100644 --- a/pmd/src/site/xdocs/cpd-usage.xml +++ b/pmd/src/site/xdocs/cpd-usage.xml @@ -22,7 +22,7 @@

Each rewrite made it much faster, and now it can process the JDK 1.4 java.* packages in about 4 seconds (on my workstation, at least).

Here's a screenshot of CPD after running on the JDK java.lang package.

-

Note that CPD works with Java, JSP, C, C++, Fortran and PHP code. Your own language is missing ? See how to add it here

+

Note that CPD works with Java, JSP, C, C++, C#, Fortran and PHP code. Is your own language missing? See how to add it here

CPD is included with PMD, which you can download here. Or, if you have Java Web Start, you can run CPD by clicking here.

@@ -76,7 +76,7 @@ language - Flag to select the appropriate language (e.g. cpp, cs java, php, ruby, and ecmascript); defaults to java. + Flag to select the appropriate language (e.g. cpp, cs, java, php, ruby, and ecmascript); defaults to java. No diff --git a/pmd/src/test/java/net/sourceforge/pmd/cpd/CsTokenizerTest.java b/pmd/src/test/java/net/sourceforge/pmd/cpd/CsTokenizerTest.java new file mode 100644 index 0000000000..88e4de1883 --- /dev/null +++ b/pmd/src/test/java/net/sourceforge/pmd/cpd/CsTokenizerTest.java @@ -0,0 +1,111 @@ +/** + * BSD-style license; for more info see http://pmd.sourceforge.net/license.html + */ + +package net.sourceforge.pmd.cpd; + +import static org.junit.Assert.assertEquals; + +import java.util.List; + +import org.junit.Before; +import org.junit.Test; + +public class CsTokenizerTest { + + private CsTokenizer tokenizer = new CsTokenizer(); + + private Tokens tokens; + + @Before + public void init() { + tokens = new Tokens(); + TokenEntry.clearImages(); + } + + @Test + public void testSimpleClass() { + tokenizer.tokenize(toSourceCode("class Foo {}"), tokens); + assertEquals(5, tokens.size()); + } + + @Test + public void testSimpleClassDuplicatedTokens() { + tokenizer.tokenize(toSourceCode("class Foo { class Foo { } }"), tokens); + assertEquals(9, tokens.size()); + List tokenList = tokens.getTokens(); + assertEquals(tokenList.get(0).getIdentifier(), tokenList.get(3).getIdentifier()); + assertEquals(tokenList.get(1).getIdentifier(), tokenList.get(4).getIdentifier()); + assertEquals(tokenList.get(2).getIdentifier(), tokenList.get(5).getIdentifier()); + assertEquals(tokenList.get(6).getIdentifier(), tokenList.get(7).getIdentifier()); + } + + @Test + public void testSimpleClassMethodMultipleLines() { + tokenizer.tokenize(toSourceCode( + "class Foo {\n" + + " public String foo(int a) {\n" + + " int i = a;\n" + + " return \"x\" + a;\n" + + " }\n" + + "}"), tokens); + assertEquals(22, tokens.size()); + List tokenList = tokens.getTokens(); + 
assertEquals(1, tokenList.get(0).getBeginLine()); + assertEquals(2, tokenList.get(4).getBeginLine()); + assertEquals(3, tokenList.get(11).getBeginLine()); + } + + @Test + public void testStrings() { + tokenizer.tokenize(toSourceCode("String s =\"aaa \\\"b\\n\";"), tokens); + assertEquals(5, tokens.size()); + } + + @Test + public void testOpenString() { + tokenizer.tokenize(toSourceCode("String s =\"aaa \\\"b\\"), tokens); + assertEquals(5, tokens.size()); + } + + + @Test + public void testCommentsIgnored1() { + tokenizer.tokenize(toSourceCode("class Foo { /* class * ** X */ }"), tokens); + assertEquals(5, tokens.size()); + } + + @Test + public void testCommentsIgnored2() { + tokenizer.tokenize(toSourceCode("class Foo { // class X /* aaa */ \n }"), tokens); + assertEquals(5, tokens.size()); + } + + @Test + public void testCommentsIgnored3() { + tokenizer.tokenize(toSourceCode("class Foo { /// class X /* aaa */ \n }"), tokens); + assertEquals(5, tokens.size()); + } + + @Test + public void testMoreTokens() { + tokenizer.tokenize(toSourceCode( + "class Foo {\n" + + " void bar() {\n" + + " int a = 1 >> 2; \n" + + " a += 1; \n" + + " a++; \n" + + " a /= 3e2; \n" + + " float f = -3.1; \n" + + " f *= 2; \n" + + " bool b = ! 
(f == 2.0 || f >= 1.0 && f <= 2.0) \n" + + " }\n" + + "}" + ), tokens); + assertEquals(50, tokens.size()); + } + + private SourceCode toSourceCode(String source) { + return new SourceCode(new SourceCode.StringCodeLoader(source)); + } +} diff --git a/pmd/src/test/java/net/sourceforge/pmd/cpd/JavaTokensTokenizerTest.java b/pmd/src/test/java/net/sourceforge/pmd/cpd/JavaTokensTokenizerTest.java index ab36b2cd6f..75d214134e 100644 --- a/pmd/src/test/java/net/sourceforge/pmd/cpd/JavaTokensTokenizerTest.java +++ b/pmd/src/test/java/net/sourceforge/pmd/cpd/JavaTokensTokenizerTest.java @@ -5,10 +5,6 @@ package net.sourceforge.pmd.cpd; import static org.junit.Assert.assertEquals; import net.sourceforge.pmd.PMD; -import net.sourceforge.pmd.cpd.JavaTokenizer; -import net.sourceforge.pmd.cpd.SourceCode; -import net.sourceforge.pmd.cpd.Tokenizer; -import net.sourceforge.pmd.cpd.Tokens; import org.junit.Test; @@ -24,6 +20,15 @@ public class JavaTokensTokenizerTest { assertEquals("public class Foo {}", sourceCode.getSlice(1, 1)); } + @Test + public void testCommentsIgnored() throws Throwable { + Tokenizer tokenizer = new JavaTokenizer(); + SourceCode sourceCode = new SourceCode(new SourceCode.StringCodeLoader("public class Foo { // class Bar */ \n }")); + Tokens tokens = new Tokens(); + tokenizer.tokenize(sourceCode, tokens); + assertEquals(6, tokens.size()); + } + @Test public void test2() throws Throwable { Tokenizer t = new JavaTokenizer();