Merge remote-tracking branch 'adangel/cstokenizer'

This commit is contained in:
Andreas Dangel
2012-11-28 20:07:10 +01:00
5 changed files with 370 additions and 6 deletions

View File

@ -0,0 +1,10 @@
/**
* BSD-style license; for more info see http://pmd.sourceforge.net/license.html
*/
package net.sourceforge.pmd.cpd;
/**
 * CPD language definition for C#.
 *
 * <p>Hands a {@link CsTokenizer} and the {@code ".cs"} extension to
 * {@link AbstractLanguage}.</p>
 */
public class CsLanguage extends AbstractLanguage {

    /** Creates the language with a fresh {@link CsTokenizer} and the {@code ".cs"} extension. */
    public CsLanguage() {
        super(
            new CsTokenizer(),
            ".cs"
        );
    }
}

View File

@ -0,0 +1,238 @@
/**
* BSD-style license; for more info see http://pmd.sourceforge.net/license.html
*/
package net.sourceforge.pmd.cpd;
import java.io.BufferedReader;
import java.io.CharArrayReader;
import java.io.IOException;
import net.sourceforge.pmd.util.IOUtil;
/**
 * Hand-written, best-effort tokenizer for C# source code.
 *
 * <p>The source is scanned one character at a time. Operators, string and
 * character literals, identifiers, numbers and any other single character each
 * become one {@link TokenEntry}. Whitespace, semicolons and comments produce
 * no tokens. An EOF entry is always appended last.</p>
 *
 * @author jheintz
 */
public class CsTokenizer implements Tokenizer {

    /** Characters that keep a numeric literal going once it has started. */
    private static final String NUMBER_CHARS = "1234567890.iIlLfFdDsSuUeExX";

    /** Characters accepted immediately after an {@code e}/{@code E} inside a number. */
    private static final String EXPONENT_CHARS = "1234567890-";

    /**
     * Tokenizes {@code sourceCode} and appends the resulting entries to
     * {@code tokenEntries}, followed by the EOF entry.
     *
     * <p>Every token is tagged with the line on which it starts. Line breaks
     * inside block comments and inside string/char literals are counted, so
     * tokens that follow them carry the correct line number.</p>
     *
     * @param sourceCode   the source to scan; its code buffer and file name are read
     * @param tokenEntries the collector that receives the tokens
     */
    public void tokenize(SourceCode sourceCode, Tokens tokenEntries) {
        BufferedReader reader = new BufferedReader(
                new CharArrayReader(sourceCode.getCodeBuffer().toString().toCharArray()));
        try {
            int ic = reader.read();
            int line = 1;
            int tokenLine;
            char c;
            StringBuilder b;
            while (ic != -1) {
                c = (char) ic;
                switch (c) {
                // new line
                case '\n':
                    line++;
                    ic = reader.read();
                    break;

                // white space and semicolons are ignored
                case ' ':
                case '\t':
                case '\r':
                case ';':
                    ic = reader.read();
                    break;

                // < << <= <<= > >> >= >>=
                case '<':
                case '>':
                    ic = reader.read();
                    if (ic == '=') {
                        addToken(tokenEntries, sourceCode, String.valueOf(c) + "=", line);
                        ic = reader.read();
                    } else if (ic == c) {
                        ic = reader.read();
                        if (ic == '=') {
                            addToken(tokenEntries, sourceCode,
                                    String.valueOf(c) + String.valueOf(c) + "=", line);
                            ic = reader.read();
                        } else {
                            addToken(tokenEntries, sourceCode,
                                    String.valueOf(c) + String.valueOf(c), line);
                        }
                    } else {
                        addToken(tokenEntries, sourceCode, String.valueOf(c), line);
                    }
                    break;

                // = == & &= && | |= || + += ++ - -= --
                case '=':
                case '&':
                case '|':
                case '+':
                case '-':
                    ic = reader.read();
                    if (ic == '=' || ic == c) {
                        addToken(tokenEntries, sourceCode,
                                String.valueOf(c) + String.valueOf((char) ic), line);
                        ic = reader.read();
                    } else {
                        addToken(tokenEntries, sourceCode, String.valueOf(c), line);
                    }
                    break;

                // ! != * *= % %= ^ ^= ~ ~=
                case '!':
                case '*':
                case '%':
                case '^':
                case '~':
                    ic = reader.read();
                    if (ic == '=') {
                        addToken(tokenEntries, sourceCode, String.valueOf(c) + "=", line);
                        ic = reader.read();
                    } else {
                        addToken(tokenEntries, sourceCode, String.valueOf(c), line);
                    }
                    break;

                // strings & chars
                case '"':
                case '\'':
                    // The token belongs to the line where the literal opens,
                    // even if the literal spans several lines.
                    tokenLine = line;
                    b = new StringBuilder();
                    b.append(c);
                    while ((ic = reader.read()) != c) {
                        if (ic == -1) {
                            break;
                        }
                        b.append((char) ic);
                        if (ic == '\n') {
                            line++;
                        } else if (ic == '\\') {
                            // Take the escaped character verbatim so that an
                            // escaped quote does not end the literal.
                            int next = reader.read();
                            if (next != -1) {
                                b.append((char) next);
                                if (next == '\n') {
                                    line++;
                                }
                            }
                        }
                    }
                    if (ic != -1) {
                        b.append((char) ic);
                    }
                    addToken(tokenEntries, sourceCode, b.toString(), tokenLine);
                    ic = reader.read();
                    break;

                // / /= /*...*/ //...
                case '/':
                    switch (c = (char) (ic = reader.read())) {
                    case '*': {
                        // Block comment: skipped entirely, but line breaks are
                        // still counted so later tokens keep correct lines.
                        boolean starSeen = false;
                        while ((ic = reader.read()) != -1) {
                            c = (char) ic;
                            if (c == '\n') {
                                line++;
                            }
                            if (starSeen && c == '/') {
                                ic = reader.read();
                                break;
                            }
                            starSeen = c == '*';
                        }
                        break;
                    }
                    case '/':
                        // Line comment: skip up to, but not including, the
                        // line break; the outer loop counts that line break.
                        do {
                            ic = reader.read();
                        } while (ic != -1 && ic != '\n');
                        break;
                    case '=':
                        addToken(tokenEntries, sourceCode, "/=", line);
                        ic = reader.read();
                        break;
                    default:
                        addToken(tokenEntries, sourceCode, "/", line);
                        break;
                    }
                    break;

                default:
                    // identifiers and keywords
                    if (Character.isJavaIdentifierStart(c)) {
                        b = new StringBuilder();
                        do {
                            b.append(c);
                            c = (char) (ic = reader.read());
                        } while (ic != -1 && Character.isJavaIdentifierPart(c));
                        addToken(tokenEntries, sourceCode, b.toString(), line);
                    } else if (Character.isDigit(c) || c == '.') {
                        // numbers
                        b = new StringBuilder();
                        if (c == '.') {
                            // A dot starts a number only when a digit follows
                            // (".5"); otherwise it is a member-access dot and
                            // must not swallow the first letter of the member.
                            b.append(c);
                            c = (char) (ic = reader.read());
                            if (ic == -1 || !Character.isDigit(c)) {
                                addToken(tokenEntries, sourceCode, ".", line);
                                break;
                            }
                        }
                        do {
                            b.append(c);
                            if (c == 'e' || c == 'E') {
                                c = (char) (ic = reader.read());
                                if (EXPONENT_CHARS.indexOf(c) == -1) {
                                    break;
                                }
                                b.append(c);
                            }
                            c = (char) (ic = reader.read());
                        } while (ic != -1 && NUMBER_CHARS.indexOf(c) != -1);
                        addToken(tokenEntries, sourceCode, b.toString(), line);
                    } else {
                        // anything else is a single-character token
                        addToken(tokenEntries, sourceCode, String.valueOf(c), line);
                        ic = reader.read();
                    }
                    break;
                }
            }
        } catch (IOException e) {
            // Best effort: the reader is backed by an in-memory buffer, so this
            // is not expected. Tokens collected so far are kept and the EOF
            // entry is still appended below.
            e.printStackTrace();
        } finally {
            IOUtil.closeQuietly(reader);
            tokenEntries.add(TokenEntry.getEOF());
        }
    }

    /** Appends one token with the given image and line, tagged with the source's file name. */
    private static void addToken(Tokens tokenEntries, SourceCode sourceCode, String image, int line) {
        tokenEntries.add(new TokenEntry(image, sourceCode.getFileName(), line));
    }
}

View File

@ -22,7 +22,7 @@
</ul>
<p>Each rewrite made it much faster, and now it can process the JDK 1.4 java.* packages in about 4 seconds (on my workstation, at least).</p>
<p>Here's a <a href="images/screenshot_cpd.png">screenshot</a> of CPD after running on the JDK java.lang package.</p>
<p>Note that CPD works with Java, JSP, C, C++, Fortran and PHP code. Your own language is missing ? See how to add it <a href="cpd-parser-howto.html">here</a></p>
<p>Note that CPD works with Java, JSP, C, C++, C#, Fortran and PHP code. Is your language missing? See how to add it <a href="cpd-parser-howto.html">here</a>.</p>
<p>CPD is included with PMD, which you can download <a href="http://sourceforge.net/projects/pmd/files/pmd/">here</a>.
Or, if you have <a href="http://java.sun.com/products/javawebstart/">Java Web Start</a>, you can <a href="http://pmd.sourceforge.net/cpd.jnlp">run CPD by clicking here</a>.
</p>
@ -76,7 +76,7 @@
</tr>
<tr>
<td valign="top">language</td>
<td valign="top">Flag to select the appropriate language (e.g. <code>cpp</code>, <code>cs</code> <code>java</code>, <code>php</code>, <code>ruby</code>, and <code>ecmascript</code>); defaults to <code>java</code>.</td>
<td valign="top">Flag to select the appropriate language (e.g. <code>cpp</code>, <code>cs</code>, <code>java</code>, <code>php</code>, <code>ruby</code>, and <code>ecmascript</code>); defaults to <code>java</code>.</td>
<td valign="top" align="center">No</td>
</tr>
<tr>

View File

@ -0,0 +1,111 @@
/**
* BSD-style license; for more info see http://pmd.sourceforge.net/license.html
*/
package net.sourceforge.pmd.cpd;
import static org.junit.Assert.assertEquals;
import java.util.List;
import org.junit.Before;
import org.junit.Test;
/**
 * Unit tests for {@link CsTokenizer}. The expected token counts include the
 * trailing EOF entry.
 */
public class CsTokenizerTest {

    private CsTokenizer tokenizer = new CsTokenizer();

    private Tokens tokens;

    /** Starts every test with an empty token collector and a cleared image table. */
    @Before
    public void init() {
        tokens = new Tokens();
        TokenEntry.clearImages();
    }

    @Test
    public void testSimpleClass() {
        assertTokenCount(5, "class Foo {}");
    }

    @Test
    public void testSimpleClassDuplicatedTokens() {
        assertTokenCount(9, "class Foo { class Foo { } }");

        List<TokenEntry> entries = tokens.getTokens();
        // "class Foo {" appears twice, so tokens 0..2 must match tokens 3..5
        for (int i = 0; i < 3; i++) {
            assertEquals(entries.get(i).getIdentifier(), entries.get(i + 3).getIdentifier());
        }
        // both closing braces share one identifier
        assertEquals(entries.get(6).getIdentifier(), entries.get(7).getIdentifier());
    }

    @Test
    public void testSimpleClassMethodMultipleLines() {
        String source =
              "class Foo {\n"
            + " public String foo(int a) {\n"
            + " int i = a;\n"
            + " return \"x\" + a;\n"
            + " }\n"
            + "}";
        assertTokenCount(22, source);

        List<TokenEntry> entries = tokens.getTokens();
        assertEquals(1, entries.get(0).getBeginLine());
        assertEquals(2, entries.get(4).getBeginLine());
        assertEquals(3, entries.get(11).getBeginLine());
    }

    @Test
    public void testStrings() {
        assertTokenCount(5, "String s =\"aaa \\\"b\\n\";");
    }

    @Test
    public void testOpenString() {
        assertTokenCount(5, "String s =\"aaa \\\"b\\");
    }

    @Test
    public void testCommentsIgnored1() {
        assertTokenCount(5, "class Foo { /* class * ** X */ }");
    }

    @Test
    public void testCommentsIgnored2() {
        assertTokenCount(5, "class Foo { // class X /* aaa */ \n }");
    }

    @Test
    public void testCommentsIgnored3() {
        assertTokenCount(5, "class Foo { /// class X /* aaa */ \n }");
    }

    @Test
    public void testMoreTokens() {
        String source =
              "class Foo {\n"
            + " void bar() {\n"
            + " int a = 1 >> 2; \n"
            + " a += 1; \n"
            + " a++; \n"
            + " a /= 3e2; \n"
            + " float f = -3.1; \n"
            + " f *= 2; \n"
            + " bool b = ! (f == 2.0 || f >= 1.0 && f <= 2.0) \n"
            + " }\n"
            + "}";
        assertTokenCount(50, source);
    }

    /** Tokenizes {@code source} into {@link #tokens} and checks the number of collected entries. */
    private void assertTokenCount(int expected, String source) {
        tokenizer.tokenize(toSourceCode(source), tokens);
        assertEquals(expected, tokens.size());
    }

    /** Wraps a literal string as a {@link SourceCode} backed by a string loader. */
    private SourceCode toSourceCode(String source) {
        SourceCode.StringCodeLoader loader = new SourceCode.StringCodeLoader(source);
        return new SourceCode(loader);
    }
}

View File

@ -5,10 +5,6 @@ package net.sourceforge.pmd.cpd;
import static org.junit.Assert.assertEquals;
import net.sourceforge.pmd.PMD;
import net.sourceforge.pmd.cpd.JavaTokenizer;
import net.sourceforge.pmd.cpd.SourceCode;
import net.sourceforge.pmd.cpd.Tokenizer;
import net.sourceforge.pmd.cpd.Tokens;
import org.junit.Test;
@ -24,6 +20,15 @@ public class JavaTokensTokenizerTest {
assertEquals("public class Foo {}", sourceCode.getSlice(1, 1));
}
/**
 * A line comment must not produce tokens, even when it contains a stray
 * block-comment terminator; six entries are expected in total.
 */
@Test
public void testCommentsIgnored() throws Throwable {
    final Tokenizer javaTokenizer = new JavaTokenizer();
    final SourceCode.StringCodeLoader loader =
            new SourceCode.StringCodeLoader("public class Foo { // class Bar */ \n }");
    final SourceCode code = new SourceCode(loader);
    final Tokens collected = new Tokens();

    javaTokenizer.tokenize(code, collected);

    assertEquals(6, collected.size());
}
@Test
public void test2() throws Throwable {
Tokenizer t = new JavaTokenizer();