#1090 cpp parser exception with inline asm

2014-11-28 21:32:32 +01:00
parent 7b58836ebb
commit c8887de5ff
9 changed files with 221 additions and 1 deletions
--- a/pmd-core/src/main/java/net/sourceforge/pmd/cpd/CPDConfiguration.java
+++ b/pmd-core/src/main/java/net/sourceforge/pmd/cpd/CPDConfiguration.java
@ -62,6 +62,13 @@ public class CPDConfiguration extends AbstractConfiguration {
 	@Parameter(names = "--skip-lexical-errors", description = "Skip files which can't be tokenized due to invalid characters instead of aborting CPD", required = false)
 	private boolean skipLexicalErrors = false;

+	@Parameter(names = "--no-skip-blocks", description = "Do not skip code blocks marked with --skip-blocks-pattern (e.g. #if 0 until #endif)", required = false)
+	private boolean noSkipBlocks = false;
+
+	@Parameter(names = "--skip-blocks-pattern", description = "Pattern to find the blocks to skip. Start and End pattern separated by |. "
+	        + "Default is \"" + Tokenizer.DEFAULT_SKIP_BLOCKS_PATTERN + "\".", required = false)
+	private String skipBlocksPattern = Tokenizer.DEFAULT_SKIP_BLOCKS_PATTERN;
+
 	@Parameter(names = "--files", variableArity = true, description = "List of files and directories to process", required = false)
 	private List<String> files;

@ -180,6 +187,8 @@ public class CPDConfiguration extends AbstractConfiguration {
        } else {
            properties.remove(Tokenizer.IGNORE_ANNOTATIONS);
 		}
+	    properties.setProperty(Tokenizer.OPTION_SKIP_BLOCKS, Boolean.toString(!configuration.isNoSkipBlocks()));
+	    properties.setProperty(Tokenizer.OPTION_SKIP_BLOCKS_PATTERN, configuration.getSkipBlocksPattern());
 		configuration.getLanguage().setProperties(properties);
 	}

@ -341,4 +350,20 @@ public class CPDConfiguration extends AbstractConfiguration {
 	public String getEncoding() {
 		return encoding;
 	}
+
+    /**
+     * Tells whether the skipping of blocks has been switched off
+     * (command line parameter {@code --no-skip-blocks}).
+     *
+     * @return <code>true</code> if blocks must not be skipped
+     */
+    public boolean isNoSkipBlocks() {
+        return noSkipBlocks;
+    }
+
+    /**
+     * Switches the skipping of blocks off or on again.
+     *
+     * @param noSkipBlocks <code>true</code> to disable the skipping of blocks
+     */
+    public void setNoSkipBlocks(boolean noSkipBlocks) {
+        this.noSkipBlocks = noSkipBlocks;
+    }
+
+    /**
+     * Returns the pattern used to find the blocks to skip
+     * (command line parameter {@code --skip-blocks-pattern}).
+     *
+     * @return the start and end pattern, separated by {@code |}
+     */
+    public String getSkipBlocksPattern() {
+        return skipBlocksPattern;
+    }
+
+    /**
+     * Sets the pattern used to find the blocks to skip.
+     * The value is later handed to {@link java.util.Properties#setProperty(String, String)},
+     * which does not accept <code>null</code>.
+     *
+     * @param skipBlocksPattern the start and end pattern, separated by {@code |}
+     */
+    public void setSkipBlocksPattern(String skipBlocksPattern) {
+        this.skipBlocksPattern = skipBlocksPattern;
+    }
 }
--- a/pmd-core/src/main/java/net/sourceforge/pmd/cpd/CPDTask.java
+++ b/pmd-core/src/main/java/net/sourceforge/pmd/cpd/CPDTask.java
@ -49,6 +49,8 @@ public class CPDTask extends Task {
    private boolean ignoreAnnotations;
    private boolean skipLexicalErrors;
    private boolean skipDuplicateFiles;
+    private boolean skipBlocks = true;
+    private String skipBlocksPattern = Tokenizer.DEFAULT_SKIP_BLOCKS_PATTERN;
    private File outputFile;
    private String encoding = System.getProperty("file.encoding");
    private List<FileSet> filesets = new ArrayList<FileSet>();
@ -102,6 +104,8 @@ public class CPDTask extends Task {
        if (ignoreAnnotations) {
            p.setProperty(Tokenizer.IGNORE_ANNOTATIONS, "true");
        }
+        p.setProperty(Tokenizer.OPTION_SKIP_BLOCKS, Boolean.toString(skipBlocks));
+        p.setProperty(Tokenizer.OPTION_SKIP_BLOCKS_PATTERN, skipBlocksPattern);
        return LanguageFactory.createLanguage(language, p);
    }

@ -208,6 +212,14 @@ public class CPDTask extends Task {
        this.encoding = encoding;
    }

+    /**
+     * Enables or disables the skipping of blocks.
+     * The value is passed on to the language as {@link Tokenizer#OPTION_SKIP_BLOCKS}.
+     *
+     * @param skipBlocks <code>true</code> to skip the blocks matched by the skip blocks pattern
+     */
+    public void setSkipBlocks(boolean skipBlocks) {
+        this.skipBlocks = skipBlocks;
+    }
+
+    /**
+     * Sets the pattern used to find the blocks to skip.
+     * The value is passed on to the language as {@link Tokenizer#OPTION_SKIP_BLOCKS_PATTERN}.
+     *
+     * @param skipBlocksPattern the start and end pattern, separated by {@code |}
+     */
+    public void setSkipBlocksPattern(String skipBlocksPattern) {
+        this.skipBlocksPattern = skipBlocksPattern;
+    }
+
    public static class FormatAttribute extends EnumeratedAttribute {
        private static final String[] FORMATS = new String[]{XML_FORMAT, TEXT_FORMAT, CSV_FORMAT};
        public String[] getValues() {
--- a/pmd-core/src/main/java/net/sourceforge/pmd/cpd/Tokenizer.java
+++ b/pmd-core/src/main/java/net/sourceforge/pmd/cpd/Tokenizer.java
@ -9,6 +9,23 @@ public interface Tokenizer {
    String IGNORE_LITERALS = "ignore_literals";
    String IGNORE_IDENTIFIERS = "ignore_identifiers";
    String IGNORE_ANNOTATIONS = "ignore_annotations";
+    /**
+     * Enables or disables skipping of blocks like a pre-processor.
+     * It is a boolean property.
+     * The default value is <code>true</code>.
+     * @see #OPTION_SKIP_BLOCKS_PATTERN
+     */
+    String OPTION_SKIP_BLOCKS = "net.sourceforge.pmd.cpd.Tokenizer.skipBlocks";
+    /**
+     * Configures the pattern that is used to find the blocks to skip.
+     * It is a string property and consists of two parts, separated by {@code |}.
+     * The first part is the start pattern, the second part is the end pattern.
+     * The default value is "{@code #if 0|#endif}".
+     * @see #DEFAULT_SKIP_BLOCKS_PATTERN
+     */
+    String OPTION_SKIP_BLOCKS_PATTERN = "net.sourceforge.pmd.cpd.Tokenizer.skipBlocksPattern";
+
+    /**
+     * Default value of {@link #OPTION_SKIP_BLOCKS_PATTERN}:
+     * start pattern "{@code #if 0}", end pattern "{@code #endif}".
+     */
+    String DEFAULT_SKIP_BLOCKS_PATTERN = "#if 0|#endif";

    void tokenize(SourceCode sourceCode, Tokens tokenEntries) throws IOException;
 }
--- a/pmd-cpp/src/main/java/net/sourceforge/pmd/cpd/CPPLanguage.java
+++ b/pmd-cpp/src/main/java/net/sourceforge/pmd/cpd/CPPLanguage.java
@ -3,6 +3,8 @@
 */
 package net.sourceforge.pmd.cpd;

+import java.util.Properties;
+
 /**
 * Defines the Language module for C/C++
 */
@ -14,4 +16,13 @@ public class CPPLanguage extends AbstractLanguage {
    public CPPLanguage() {
        super("C++", "cpp", new CPPTokenizer(), ".h", ".hpp", ".hxx", ".c", ".cpp", ".cxx", ".cc", ".C");
    }
+    
+    /**
+     * Delegates to the superclass and additionally hands the same properties
+     * to the tokenizer of this language, so that it can read its own options.
+     *
+     * @param properties the properties to apply
+     * @see net.sourceforge.pmd.cpd.AbstractLanguage#setProperties(java.util.Properties)
+     */
+    @Override
+    public void setProperties(Properties properties) {
+        super.setProperties(properties);
+        // NOTE(review): the cast presumably relies on getTokenizer() returning the
+        // CPPTokenizer that the constructor passes to the superclass - confirm.
+        ((CPPTokenizer)getTokenizer()).setProperties(properties);
+    }
 }
--- a/pmd-cpp/src/main/java/net/sourceforge/pmd/cpd/CPPTokenizer.java
+++ b/pmd-cpp/src/main/java/net/sourceforge/pmd/cpd/CPPTokenizer.java
@ -3,8 +3,12 @@
 */
 package net.sourceforge.pmd.cpd;

+import java.io.BufferedReader;
+import java.io.IOException;
 import java.io.StringReader;
+import java.util.Properties;

+import net.sourceforge.pmd.PMD;
 import net.sourceforge.pmd.lang.LanguageRegistry;
 import net.sourceforge.pmd.lang.LanguageVersionHandler;
 import net.sourceforge.pmd.lang.TokenManager;
@ -19,6 +23,30 @@ import org.apache.commons.io.IOUtils;
 */
 public class CPPTokenizer implements Tokenizer {

+    private boolean skipBlocks = true;
+    private String skipBlocksStart;
+    private String skipBlocksEnd;
+
+    /**
+     * Creates a tokenizer with the default options: skipping of blocks is
+     * enabled and the {@link #DEFAULT_SKIP_BLOCKS_PATTERN} is used.
+     * Without this, a tokenizer whose {@link #setProperties(Properties)} was
+     * never called would have skipping enabled but no start/end marker set.
+     */
+    public CPPTokenizer() {
+        applySkipBlocksPattern(DEFAULT_SKIP_BLOCKS_PATTERN);
+    }
+
+    /**
+     * Sets the possible options for the C++ tokenizer.
+     * The skip blocks pattern is only evaluated if skipping of blocks is enabled;
+     * otherwise the previously configured markers are kept.
+     * @param properties the properties
+     * @see #OPTION_SKIP_BLOCKS
+     * @see #OPTION_SKIP_BLOCKS_PATTERN
+     */
+    public void setProperties(Properties properties) {
+        skipBlocks = Boolean.parseBoolean(properties.getProperty(OPTION_SKIP_BLOCKS, Boolean.TRUE.toString()));
+        if (skipBlocks) {
+            applySkipBlocksPattern(properties.getProperty(OPTION_SKIP_BLOCKS_PATTERN, DEFAULT_SKIP_BLOCKS_PATTERN));
+        }
+    }
+
+    /**
+     * Splits the pattern at the first {@code |} into the start and the end marker.
+     * If the pattern contains no {@code |}, the same marker is used for both.
+     * @param skipBlocksPattern the pattern, must not be <code>null</code>
+     */
+    private void applySkipBlocksPattern(String skipBlocksPattern) {
+        // limit 2: any further '|' belongs to the end marker
+        String[] split = skipBlocksPattern.split("\\|", 2);
+        skipBlocksStart = split[0];
+        skipBlocksEnd = split.length == 1 ? split[0] : split[1];
+    }
+
    @Override
    public void tokenize(SourceCode sourceCode, Tokens tokenEntries) {
        StringBuilder buffer = sourceCode.getCodeBuffer();
@ -26,7 +54,7 @@ public class CPPTokenizer implements Tokenizer {
        try {
            LanguageVersionHandler languageVersionHandler = LanguageRegistry.getLanguage(CppLanguageModule.NAME)
                    .getDefaultVersion().getLanguageVersionHandler();
-            reader = new StringReader(buffer.toString());
+            reader = new StringReader(maybeSkipBlocks(buffer.toString()));
            TokenManager tokenManager = languageVersionHandler.getParser(
                    languageVersionHandler.getDefaultParserOptions()).getTokenManager(sourceCode.getFileName(), reader);
            Token currentToken = (Token) tokenManager.getNextToken();
@ -40,8 +68,35 @@ public class CPPTokenizer implements Tokenizer {
            err.printStackTrace();
            System.err.println("Skipping " + sourceCode.getFileName() + " due to parse error");
            tokenEntries.add(TokenEntry.getEOF());
+        } catch (IOException e) {
+            e.printStackTrace();
+            System.err.println("Skipping " + sourceCode.getFileName() + " due to parse error");
+            tokenEntries.add(TokenEntry.getEOF());
        } finally {
            IOUtils.closeQuietly(reader);
        }
    }
+
+    /**
+     * Blanks out all blocks that are enclosed by the configured start and end
+     * marker, if skipping of blocks is enabled.
+     * <p>
+     * The markers are compared case-insensitively against the trimmed line.
+     * The line with the start marker and all following lines up to, but not
+     * including, the line with the end marker are replaced by empty lines, so
+     * that the line numbering of the remaining code is kept. Nested blocks are
+     * not tracked: the first end marker ends the skipping.
+     *
+     * @param test the complete source code
+     * @return the source code with the skipped blocks blanked out, or the
+     *         unchanged source code if nothing is to be skipped
+     * @throws IOException if the source code can't be read line by line
+     */
+    private String maybeSkipBlocks(String test) throws IOException {
+        // no markers configured (setProperties never called): nothing to match against
+        if (!skipBlocks || skipBlocksStart == null || skipBlocksEnd == null) {
+            return test;
+        }
+
+        BufferedReader reader = new BufferedReader(new StringReader(test));
+        try {
+            StringBuilder filtered = new StringBuilder(test.length());
+            String line;
+            boolean skip = false;
+            while ((line = reader.readLine()) != null) {
+                String trimmed = line.trim();
+                // Only look for the start marker outside of a skipped block. Otherwise a
+                // pattern whose start and end marker are equal could never end the block.
+                if (!skip && skipBlocksStart.equalsIgnoreCase(trimmed)) {
+                    skip = true;
+                } else if (skip && skipBlocksEnd.equalsIgnoreCase(trimmed)) {
+                    skip = false;
+                }
+                if (!skip) {
+                    filtered.append(line);
+                }
+                filtered.append(PMD.EOL); // always add a new line to keep the line-numbering
+            }
+            return filtered.toString();
+        } finally {
+            IOUtils.closeQuietly(reader);
+        }
+    }
 }
--- a/pmd-cpp/src/test/java/net/sourceforge/pmd/cpd/CPPTokenizerTest.java
+++ b/pmd-cpp/src/test/java/net/sourceforge/pmd/cpd/CPPTokenizerTest.java
@ -5,8 +5,12 @@ package net.sourceforge.pmd.cpd;

 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertTrue;
+
+import java.util.Properties;
+
 import net.sourceforge.pmd.PMD;

+import org.apache.commons.io.IOUtils;
 import org.junit.Test;

 public class CPPTokenizerTest {
@ -51,8 +55,43 @@ public class CPPTokenizerTest {
    	assertEquals(17, tokens.size());
    }

+    /**
+     * Skipping of blocks is enabled and no pattern is given, so the default
+     * pattern applies and the <code>#if 0</code> block of the test file is left out.
+     */
+    @Test
+    public void testTokenizerWithSkipBlocks() throws Exception {
+        String test = IOUtils.toString(CPPTokenizerTest.class.getResourceAsStream("cpp/cpp_with_asm.cpp"));
+        Tokens tokens = parse(test, true);
+        assertEquals(19, tokens.size());
+    }
+
+    /**
+     * Skipping of blocks is enabled with a custom pattern. The markers are
+     * matched case-insensitively, so <code>#if debug</code> also matches the
+     * <code>#if DEBUG</code> line of the test file.
+     */
+    @Test
+    public void testTokenizerWithSkipBlocksPattern() throws Exception {
+        String test = IOUtils.toString(CPPTokenizerTest.class.getResourceAsStream("cpp/cpp_with_asm.cpp"));
+        Tokens tokens = parse(test, true, "#if debug|#endif");
+        assertEquals(31, tokens.size());
+    }
+
+    /**
+     * Skipping of blocks is disabled, so the test file is handed to the
+     * tokenizer unfiltered.
+     */
+    @Test
+    public void testTokenizerWithoutSkipBlocks() throws Exception {
+        String test = IOUtils.toString(CPPTokenizerTest.class.getResourceAsStream("cpp/cpp_with_asm.cpp"));
+        Tokens tokens = parse(test, false);
+        assertEquals(37, tokens.size());
+    }
+
+    // Tokenizes the snippet with skipping of blocks disabled.
     private Tokens parse(String snippet) {
+        return parse(snippet, false);
+    }
+    // Tokenizes the snippet without setting a skip blocks pattern, so the tokenizer falls back to its default pattern.
+    private Tokens parse(String snippet, boolean skipBlocks) {
+        return parse(snippet, skipBlocks, null);
+    }
+    private Tokens parse(String snippet, boolean skipBlocks, String skipPattern) {
+        Properties properties = new Properties();
+        properties.setProperty(Tokenizer.OPTION_SKIP_BLOCKS, Boolean.toString(skipBlocks));
+        if (skipPattern != null) {
+            properties.setProperty(Tokenizer.OPTION_SKIP_BLOCKS_PATTERN, skipPattern);
+        }
+
        CPPTokenizer tokenizer = new CPPTokenizer();
+        tokenizer.setProperties(properties);
+
        SourceCode code = new SourceCode(new SourceCode.StringCodeLoader(snippet));
        Tokens tokens = new Tokens();
        tokenizer.tokenize(code, tokens);
--- a/pmd-cpp/src/test/resources/net/sourceforge/pmd/cpd/cpp/cpp_with_asm.cpp
+++ b/pmd-cpp/src/test/resources/net/sourceforge/pmd/cpd/cpp/cpp_with_asm.cpp
@ -0,0 +1,28 @@
+int main() {
+}
+
+#if DEBUG
+int foobar() {
+}
+#endif
+
+#if 0
+static void my_memset(void *dest,int fill_value,int count)
+{
+    __asm __volatile__(
+         "cld\n"
+         "mov %ecx, %ebx\n"
+         "shr 2,%ecx\n"
+         "rep "
+         "stosl\n"
+         "mov %ebx,%ecx\n"
+         "  // line 157 mentioned above
+         : 
+         : "c" (count), "a" (fill_value), "D" (dest)
+         : "cc","%ebx" );
+}
+#endif
+
+
+int otherMethod() {
+}
--- a/src/site/markdown/overview/changelog.md
+++ b/src/site/markdown/overview/changelog.md
@ -8,8 +8,16 @@

 **New/Modified Rules:**

+**New Parameters for CPD:**
+
+For the language cpp, the following new parameters are supported:
+
+* `--no-skip-blocks`: Disables the pre-processor-like skipping of code blocks. Skipping is enabled by default.
+* `--skip-blocks-pattern`: Pattern to find the blocks to skip. Start and End pattern separated by "`|`". Default value is "`#if 0|#endif`".
+
 **Bugfixes:**

+* [#1090](https://sourceforge.net/p/pmd/bugs/1090/): cpp parser exception with inline asm
 * [#1128](https://sourceforge.net/p/pmd/bugs/1128/): CompareObjectsWithEquals False Positive comparing boolean (primitive) values
 * [#1254](https://sourceforge.net/p/pmd/bugs/1254/): CPD run that worked in 5.1.2 fails in 5.1.3 with OOM
 * [#1276](https://sourceforge.net/p/pmd/bugs/1276/): False positive in UnusedPrivateMethod with inner enum
--- a/src/site/xdoc/usage/cpd-usage.xml
+++ b/src/site/xdoc/usage/cpd-usage.xml
@ -49,16 +49,19 @@
  <tr>
    <td valign="top"><b>Attribute</b></td>
    <td valign="top"><b>Description</b></td>
+    <td valign="top"><b>Applies for language</b></td>
    <td align="center" valign="top"><b>Required</b></td>
  </tr>
  <tr>
    <td valign="top">encoding</td>
    <td valign="top">The character set encoding (e.g., UTF-8) to use when reading the source code files, but also when producing the report. A piece of warning, even if you set properly the encoding value, let's say to UTF-8, but you are running CPD encoded with CP1252, you may end up with not UTF-8 file. Indeed, CPD copy piece of source code in its report directly, therefore, the source files keep their encoding.<br>If not specified, CPD uses the system default encoding.</br></td>
+    <td valign="top"></td>
    <td valign="top" align="center">No</td>
  </tr>
  <tr>
    <td valign="top">format</td>
    <td valign="top">The format of the report (e.g. <code>csv</code>, <code>text</code>, <code>xml</code>); defaults to <code>text</code>.</td>
+    <td valign="top"></td>
    <td valign="top" align="center">No</td>
  </tr>
  <tr>
@ -67,28 +70,47 @@
 value differences when evaluating a duplicate block.  This means that <code>foo=42;</code> and <code>foo=43;</code>
 will be seen as equivalent.  You may want to run PMD with this option off to start with and
           then switch it on to see what it turns up; defaults to <code>false</code>.</td>
+    <td valign="top">java</td>
    <td valign="top" align="center">No</td>
  </tr>
  <tr>
    <td valign="top">ignoreIdentifiers</td>
    <td valign="top">Similar to <code>ignoreLiterals</code> but for identifiers; i.e., variable names, methods names, and so forth; defaults to <code>false</code>.</td>
+    <td valign="top">java</td>
    <td valign="top" align="center">No</td>
  </tr>
  <tr>
    <td valign="top">ignoreAnnotations</td>
    <td valign="top">Ignore annotations. More and more modern frameworks use annotations on classes and methods, which can be very redundant and trigger CPD matches. With J2EE (CDI, Transaction Handling, etc) and Spring (everything) annotations become very redundant. Often classes or methods have the same 5-6 lines of annotations. This causes false positives; defaults to <code>false</code>.</td>
+    <td valign="top">java</td>
    <td valign="top" align="center">No</td>
  </tr>
  <tr>
    <td valign="top">skipDuplicateFiles</td>
    <td valign="top">Ignore multiple copies of files of the same name and length in comparison; defaults to <code>false</code>.</td>
+    <td valign="top"></td>
    <td valign="top" align="center">No</td>
  </tr>
  <tr>
    <td valign="top">skipLexicalErrors</td>
    <td valign="top">Skip files which can't be tokenized due to invalid characters instead of aborting CPD; defaults to <code>false</code>.</td>
+    <td valign="top"></td>
    <td valign="top" align="center">No</td>
  </tr>
+  <tr>
+    <td valign="top">skipBlocks</td>
+    <td valign="top">Enables or disables skipping of blocks like a pre-processor; defaults to <code>true</code>. See also option skipBlocksPattern.</td>
+    <td valign="top">cpp</td>
+    <td valign="top" align="center">No</td>
+  </tr>
+  <tr>
+    <td valign="top">skipBlocksPattern</td>
+    <td valign="top">Configures the pattern that is used to find the blocks to skip. It is a string property and consists of two parts, separated by <code>|</code>.
+        The first part is the start pattern, the second part is the end pattern.
+        The default value is <code>#if 0|#endif</code>.</td>
+    <td valign="top">cpp</td>
+    <td valign="top" align="center">No</td>
+  </tr>
  <tr>
    <td valign="top">language</td>
    <td valign="top">
@ -97,16 +119,19 @@
        <code>ecmascript</code>, and <code>plsql</code>);
        defaults to <code>java</code>.
    </td>
+    <td valign="top"></td>
    <td valign="top" align="center">No</td>
  </tr>
  <tr>
    <td valign="top">minimumtokencount</td>
    <td valign="top">A positive integer indicating the minimum duplicate size.</td>
+    <td valign="top"></td>
    <td valign="top" align="center">Yes</td>
  </tr>
  <tr>
    <td valign="top">outputfile</td>
    <td valign="top">The destination file for the report. If not specified the console will be used instead.</td>
+    <td valign="top"></td>
    <td valign="top" align="center">No</td>
  </tr>
 </table>