Merge remote-tracking branch 'adangel/cstokenizer'
This commit is contained in:
10
pmd/src/main/java/net/sourceforge/pmd/cpd/CsLanguage.java
Normal file
10
pmd/src/main/java/net/sourceforge/pmd/cpd/CsLanguage.java
Normal file
@ -0,0 +1,10 @@
|
||||
/**
|
||||
* BSD-style license; for more info see http://pmd.sourceforge.net/license.html
|
||||
*/
|
||||
package net.sourceforge.pmd.cpd;
|
||||
|
||||
public class CsLanguage extends AbstractLanguage {
|
||||
public CsLanguage() {
|
||||
super(new CsTokenizer(), ".cs");
|
||||
}
|
||||
}
|
238
pmd/src/main/java/net/sourceforge/pmd/cpd/CsTokenizer.java
Normal file
238
pmd/src/main/java/net/sourceforge/pmd/cpd/CsTokenizer.java
Normal file
@ -0,0 +1,238 @@
|
||||
/**
|
||||
* BSD-style license; for more info see http://pmd.sourceforge.net/license.html
|
||||
*/
|
||||
package net.sourceforge.pmd.cpd;
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.CharArrayReader;
|
||||
import java.io.IOException;
|
||||
|
||||
import net.sourceforge.pmd.util.IOUtil;
|
||||
|
||||
/**
|
||||
* This class does a best-guess try-anything tokenization.
|
||||
*
|
||||
* @author jheintz
|
||||
*/
|
||||
public class CsTokenizer implements Tokenizer {
|
||||
|
||||
public void tokenize(SourceCode sourceCode, Tokens tokenEntries) {
|
||||
BufferedReader reader = new BufferedReader(new CharArrayReader(sourceCode.getCodeBuffer().toString().toCharArray()));
|
||||
try {
|
||||
int ic = reader.read(), line=1;
|
||||
char c;
|
||||
StringBuilder b;
|
||||
while(ic!=-1)
|
||||
{
|
||||
c = (char)ic;
|
||||
switch(c)
|
||||
{
|
||||
// new line
|
||||
case '\n':
|
||||
line++;
|
||||
ic = reader.read();
|
||||
break;
|
||||
|
||||
// white space
|
||||
case ' ':
|
||||
case '\t':
|
||||
case '\r':
|
||||
ic = reader.read();
|
||||
break;
|
||||
|
||||
// ignore semicolons
|
||||
case ';':
|
||||
ic = reader.read();
|
||||
break;
|
||||
|
||||
// < << <= <<= > >> >= >>=
|
||||
case '<':
|
||||
case '>':
|
||||
ic = reader.read();
|
||||
if(ic == '=')
|
||||
{
|
||||
tokenEntries.add(new TokenEntry(String.valueOf(c)+"=", sourceCode.getFileName(), line));
|
||||
ic = reader.read();
|
||||
}
|
||||
else if(ic == c)
|
||||
{
|
||||
ic = reader.read();
|
||||
if(ic == '=')
|
||||
{
|
||||
tokenEntries.add(new TokenEntry(String.valueOf(c)+String.valueOf(c)+"=", sourceCode.getFileName(), line));
|
||||
ic = reader.read();
|
||||
}
|
||||
else
|
||||
{
|
||||
tokenEntries.add(new TokenEntry(String.valueOf(c)+String.valueOf(c), sourceCode.getFileName(), line));
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
tokenEntries.add(new TokenEntry(String.valueOf(c), sourceCode.getFileName(), line));
|
||||
}
|
||||
break;
|
||||
|
||||
// = == & &= && | |= || + += ++ - -= --
|
||||
case '=':
|
||||
case '&':
|
||||
case '|':
|
||||
case '+':
|
||||
case '-':
|
||||
ic = reader.read();
|
||||
if(ic == '=' || ic == c)
|
||||
{
|
||||
tokenEntries.add(new TokenEntry(String.valueOf(c)+String.valueOf((char)ic), sourceCode.getFileName(), line));
|
||||
ic = reader.read();
|
||||
}
|
||||
else
|
||||
{
|
||||
tokenEntries.add(new TokenEntry(String.valueOf(c), sourceCode.getFileName(), line));
|
||||
}
|
||||
break;
|
||||
|
||||
// ! != * *= % %= ^ ^= ~ ~=
|
||||
case '!':
|
||||
case '*':
|
||||
case '%':
|
||||
case '^':
|
||||
case '~':
|
||||
ic = reader.read();
|
||||
if(ic == '=')
|
||||
{
|
||||
tokenEntries.add(new TokenEntry(String.valueOf(c)+"=", sourceCode.getFileName(), line));
|
||||
ic = reader.read();
|
||||
}
|
||||
else
|
||||
{
|
||||
tokenEntries.add(new TokenEntry(String.valueOf(c), sourceCode.getFileName(), line));
|
||||
}
|
||||
break;
|
||||
|
||||
// strings & chars
|
||||
case '"':
|
||||
case '\'':
|
||||
b = new StringBuilder();
|
||||
b.append(c);
|
||||
while((ic = reader.read()) != c)
|
||||
{
|
||||
if(ic == -1)
|
||||
break;
|
||||
b.append((char)ic);
|
||||
if(ic == '\\') {
|
||||
int next = reader.read();
|
||||
if (next != -1) b.append((char)next);
|
||||
}
|
||||
}
|
||||
if (ic != -1) b.append((char)ic);
|
||||
tokenEntries.add(new TokenEntry(b.toString(), sourceCode.getFileName(), line));
|
||||
ic = reader.read();
|
||||
break;
|
||||
|
||||
// / /= /*...*/ //...
|
||||
case '/':
|
||||
switch(c = (char)(ic = reader.read()))
|
||||
{
|
||||
case '*':
|
||||
int state = 1;
|
||||
b = new StringBuilder();
|
||||
b.append("/*");
|
||||
|
||||
while((ic = reader.read()) != -1)
|
||||
{
|
||||
c = (char)ic;
|
||||
b.append(c);
|
||||
|
||||
if(state==1)
|
||||
{
|
||||
if(c == '*')
|
||||
state = 2;
|
||||
}
|
||||
else
|
||||
{
|
||||
if(c == '/') {
|
||||
ic = reader.read();
|
||||
break;
|
||||
} else if(c != '*') {
|
||||
state = 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
// ignore the /* comment
|
||||
//tokenEntries.add(new TokenEntry(b.toString(), sourceCode.getFileName(), line));
|
||||
break;
|
||||
|
||||
case '/':
|
||||
b = new StringBuilder();
|
||||
b.append("//");
|
||||
while((ic = reader.read()) != '\n')
|
||||
{
|
||||
if(ic==-1)
|
||||
break;
|
||||
b.append((char)ic);
|
||||
}
|
||||
// ignore the // comment
|
||||
//tokenEntries.add(new TokenEntry(b.toString(), sourceCode.getFileName(), line));
|
||||
break;
|
||||
|
||||
case '=':
|
||||
tokenEntries.add(new TokenEntry("/=", sourceCode.getFileName(), line));
|
||||
ic = reader.read();
|
||||
break;
|
||||
|
||||
default:
|
||||
tokenEntries.add(new TokenEntry("/", sourceCode.getFileName(), line));
|
||||
break;
|
||||
}
|
||||
break;
|
||||
|
||||
|
||||
|
||||
default:
|
||||
// [a-zA-Z_][a-zA-Z_0-9]*
|
||||
if(Character.isJavaIdentifierStart(c))
|
||||
{
|
||||
b = new StringBuilder();
|
||||
do
|
||||
{
|
||||
b.append(c);
|
||||
c = (char)(ic = reader.read());
|
||||
} while(Character.isJavaIdentifierPart(c));
|
||||
tokenEntries.add(new TokenEntry(b.toString(), sourceCode.getFileName(), line));
|
||||
}
|
||||
// numbers
|
||||
else if(Character.isDigit(c) || c == '.')
|
||||
{
|
||||
b = new StringBuilder();
|
||||
do
|
||||
{
|
||||
b.append(c);
|
||||
if(c == 'e' || c == 'E')
|
||||
{
|
||||
c = (char)(ic = reader.read());
|
||||
if("1234567890-".indexOf(c)==-1)
|
||||
break;
|
||||
b.append(c);
|
||||
}
|
||||
c = (char)(ic = reader.read());
|
||||
} while("1234567890.iIlLfFdDsSuUeExX".indexOf(c)!=-1);
|
||||
|
||||
tokenEntries.add(new TokenEntry(b.toString(), sourceCode.getFileName(), line));
|
||||
}
|
||||
// anything else
|
||||
else
|
||||
{
|
||||
tokenEntries.add(new TokenEntry(String.valueOf(c), sourceCode.getFileName(), line));
|
||||
ic = reader.read();
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
} catch (IOException e) {
|
||||
e.printStackTrace();
|
||||
} finally {
|
||||
IOUtil.closeQuietly(reader);
|
||||
tokenEntries.add(TokenEntry.getEOF());
|
||||
}
|
||||
}
|
||||
}
|
@ -22,7 +22,7 @@
|
||||
</ul>
|
||||
<p>Each rewrite made it much faster, and now it can process the JDK 1.4 java.* packages in about 4 seconds (on my workstation, at least).</p>
|
||||
<p>Here's a <a href="images/screenshot_cpd.png">screenshot</a> of CPD after running on the JDK java.lang package.</p>
|
||||
<p>Note that CPD works with Java, JSP, C, C++, Fortran and PHP code. Your own language is missing ? See how to add it <a href="cpd-parser-howto.html">here</a></p>
|
||||
<p>Note that CPD works with Java, JSP, C, C++, C#, Fortran and PHP code. Is your language missing? See how to add it <a href="cpd-parser-howto.html">here</a>.</p>
|
||||
<p>CPD is included with PMD, which you can download <a href="http://sourceforge.net/projects/pmd/files/pmd/">here</a>.
|
||||
Or, if you have <a href="http://java.sun.com/products/javawebstart/">Java Web Start</a>, you can <a href="http://pmd.sourceforge.net/cpd.jnlp">run CPD by clicking here</a>.
|
||||
</p>
|
||||
@ -76,7 +76,7 @@
|
||||
</tr>
|
||||
<tr>
|
||||
<td valign="top">language</td>
|
||||
<td valign="top">Flag to select the appropriate language (e.g. <code>cpp</code>, <code>cs</code> <code>java</code>, <code>php</code>, <code>ruby</code>, and <code>ecmascript</code>); defaults to <code>java</code>.</td>
|
||||
<td valign="top">Flag to select the appropriate language (e.g. <code>cpp</code>, <code>cs</code>, <code>java</code>, <code>php</code>, <code>ruby</code>, and <code>ecmascript</code>); defaults to <code>java</code>.</td>
|
||||
<td valign="top" align="center">No</td>
|
||||
</tr>
|
||||
<tr>
|
||||
|
111
pmd/src/test/java/net/sourceforge/pmd/cpd/CsTokenizerTest.java
Normal file
111
pmd/src/test/java/net/sourceforge/pmd/cpd/CsTokenizerTest.java
Normal file
@ -0,0 +1,111 @@
|
||||
/**
|
||||
* BSD-style license; for more info see http://pmd.sourceforge.net/license.html
|
||||
*/
|
||||
|
||||
package net.sourceforge.pmd.cpd;
|
||||
|
||||
import static org.junit.Assert.assertEquals;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
import org.junit.Before;
|
||||
import org.junit.Test;
|
||||
|
||||
public class CsTokenizerTest {
|
||||
|
||||
private CsTokenizer tokenizer = new CsTokenizer();
|
||||
|
||||
private Tokens tokens;
|
||||
|
||||
@Before
|
||||
public void init() {
|
||||
tokens = new Tokens();
|
||||
TokenEntry.clearImages();
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testSimpleClass() {
|
||||
tokenizer.tokenize(toSourceCode("class Foo {}"), tokens);
|
||||
assertEquals(5, tokens.size());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testSimpleClassDuplicatedTokens() {
|
||||
tokenizer.tokenize(toSourceCode("class Foo { class Foo { } }"), tokens);
|
||||
assertEquals(9, tokens.size());
|
||||
List<TokenEntry> tokenList = tokens.getTokens();
|
||||
assertEquals(tokenList.get(0).getIdentifier(), tokenList.get(3).getIdentifier());
|
||||
assertEquals(tokenList.get(1).getIdentifier(), tokenList.get(4).getIdentifier());
|
||||
assertEquals(tokenList.get(2).getIdentifier(), tokenList.get(5).getIdentifier());
|
||||
assertEquals(tokenList.get(6).getIdentifier(), tokenList.get(7).getIdentifier());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testSimpleClassMethodMultipleLines() {
|
||||
tokenizer.tokenize(toSourceCode(
|
||||
"class Foo {\n"
|
||||
+ " public String foo(int a) {\n"
|
||||
+ " int i = a;\n"
|
||||
+ " return \"x\" + a;\n"
|
||||
+ " }\n"
|
||||
+ "}"), tokens);
|
||||
assertEquals(22, tokens.size());
|
||||
List<TokenEntry> tokenList = tokens.getTokens();
|
||||
assertEquals(1, tokenList.get(0).getBeginLine());
|
||||
assertEquals(2, tokenList.get(4).getBeginLine());
|
||||
assertEquals(3, tokenList.get(11).getBeginLine());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testStrings() {
|
||||
tokenizer.tokenize(toSourceCode("String s =\"aaa \\\"b\\n\";"), tokens);
|
||||
assertEquals(5, tokens.size());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testOpenString() {
|
||||
tokenizer.tokenize(toSourceCode("String s =\"aaa \\\"b\\"), tokens);
|
||||
assertEquals(5, tokens.size());
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void testCommentsIgnored1() {
|
||||
tokenizer.tokenize(toSourceCode("class Foo { /* class * ** X */ }"), tokens);
|
||||
assertEquals(5, tokens.size());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testCommentsIgnored2() {
|
||||
tokenizer.tokenize(toSourceCode("class Foo { // class X /* aaa */ \n }"), tokens);
|
||||
assertEquals(5, tokens.size());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testCommentsIgnored3() {
|
||||
tokenizer.tokenize(toSourceCode("class Foo { /// class X /* aaa */ \n }"), tokens);
|
||||
assertEquals(5, tokens.size());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testMoreTokens() {
|
||||
tokenizer.tokenize(toSourceCode(
|
||||
"class Foo {\n"
|
||||
+ " void bar() {\n"
|
||||
+ " int a = 1 >> 2; \n"
|
||||
+ " a += 1; \n"
|
||||
+ " a++; \n"
|
||||
+ " a /= 3e2; \n"
|
||||
+ " float f = -3.1; \n"
|
||||
+ " f *= 2; \n"
|
||||
+ " bool b = ! (f == 2.0 || f >= 1.0 && f <= 2.0) \n"
|
||||
+ " }\n"
|
||||
+ "}"
|
||||
), tokens);
|
||||
assertEquals(50, tokens.size());
|
||||
}
|
||||
|
||||
private SourceCode toSourceCode(String source) {
|
||||
return new SourceCode(new SourceCode.StringCodeLoader(source));
|
||||
}
|
||||
}
|
@ -5,10 +5,6 @@ package net.sourceforge.pmd.cpd;
|
||||
|
||||
import static org.junit.Assert.assertEquals;
|
||||
import net.sourceforge.pmd.PMD;
|
||||
import net.sourceforge.pmd.cpd.JavaTokenizer;
|
||||
import net.sourceforge.pmd.cpd.SourceCode;
|
||||
import net.sourceforge.pmd.cpd.Tokenizer;
|
||||
import net.sourceforge.pmd.cpd.Tokens;
|
||||
|
||||
import org.junit.Test;
|
||||
|
||||
@ -24,6 +20,15 @@ public class JavaTokensTokenizerTest {
|
||||
assertEquals("public class Foo {}", sourceCode.getSlice(1, 1));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testCommentsIgnored() throws Throwable {
|
||||
Tokenizer tokenizer = new JavaTokenizer();
|
||||
SourceCode sourceCode = new SourceCode(new SourceCode.StringCodeLoader("public class Foo { // class Bar */ \n }"));
|
||||
Tokens tokens = new Tokens();
|
||||
tokenizer.tokenize(sourceCode, tokens);
|
||||
assertEquals(6, tokens.size());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void test2() throws Throwable {
|
||||
Tokenizer t = new JavaTokenizer();
|
||||
|
Reference in New Issue
Block a user