From c92dc9706c376aa390c02a658793dba86a140d92 Mon Sep 17 00:00:00 2001 From: Jan van Nunen Date: Tue, 13 Jan 2015 11:14:42 +0100 Subject: [PATCH] Relaxed C++ tokenizer so a '\' (backslash) can be used to escape any character inside a string or character literal. The following 2 code snippets could not be tokenized: 1: if (*pbuf == '\0x05'), the problem is the '\0' in the character literal '\0x05'. 2: szPath = m_sdcacheDir + _T("\ oMedia");, the problem is the '\ ' in the string literal "\ oMedia". I relaxed the lexical grammar so a '\' (backslash) can escape any character inside a string or character literal. We can relax the grammar because CPD only needs the tokens, so it is no problem to accept string / character literals that are 'invalid' according to the ANSI C standard. Failing too fast because the tokenizer is too strict is annoying because then we can't check the files for duplicated code. Both snippets were taken from existing projects and can be successfully compiled, so for some C / C++ compilers it is valid code. --- pmd-cpp/etc/grammar/cpp.jj | 30 ++----------------- .../sourceforge/pmd/cpd/CPPTokenizerTest.java | 12 ++++++++ 2 files changed, 14 insertions(+), 28 deletions(-) diff --git a/pmd-cpp/etc/grammar/cpp.jj b/pmd-cpp/etc/grammar/cpp.jj index a47a0b7ce3..b535533533 100644 --- a/pmd-cpp/etc/grammar/cpp.jj +++ b/pmd-cpp/etc/grammar/cpp.jj @@ -318,35 +318,9 @@ TOKEN [IGNORE_CASE] : TOKEN : { - < CHARACTER : ("L")? "'" - ( (~["'","\\","\n","\r"])* - | ("\\" ( - ["n","t","v","b","r","f","a","\\","?","'","\""] - | - "0" (["0"-"7"])* - | - ["1"-"9"] (["0"-"9"])* - | - ("x" | "X") (["0"-"9","a"-"f","A"-"F"])+ - ) - ) - ) - "'" > + < CHARACTER : ("L")? "'" ( ( ~["'","\\","\r","\n"] ) | ( "\\" ( ~["\n","\r"] ) ) )* "'" > -| < STRING : ("L")? 
"\"" - ( ( ~["\"","\\","\n","\r"]) - | ("\\" ( - ["n","t","v","b","r","f","a","\\","?","'","\"","\n"] - | - "0" (["0"-"7"])* - | - ["1"-"9"] (["0"-"9"])* - | - ("x" | "X") (["0"-"9","a"-"f","A"-"F"])+ - ) - ) - )* - "\"" > +| < STRING : ("L")? "\"" ( ( ~["\"","\\","\r","\n"] ) | ( "\\" ( ~["\n","\r"] | "\n" | "\r\n" ) ) )* "\"" > } void translation_unit() : diff --git a/pmd-cpp/src/test/java/net/sourceforge/pmd/cpd/CPPTokenizerTest.java b/pmd-cpp/src/test/java/net/sourceforge/pmd/cpd/CPPTokenizerTest.java index aac224ce3b..4b78507736 100644 --- a/pmd-cpp/src/test/java/net/sourceforge/pmd/cpd/CPPTokenizerTest.java +++ b/pmd-cpp/src/test/java/net/sourceforge/pmd/cpd/CPPTokenizerTest.java @@ -113,6 +113,18 @@ public class CPPTokenizerTest { assertEquals(15, tokens.size()); } + @Test + public void testHexCharacter() { + Tokens tokens = parse("if (*pbuf == '\\0x05')" + PMD.EOL); + assertEquals(8, tokens.size()); + } + + @Test + public void testWhiteSpaceEscape() { + Tokens tokens = parse("szPath = m_sdcacheDir + _T(\"\\ oMedia\");" + PMD.EOL); + assertEquals(10, tokens.size()); + } + private Tokens parse(String snippet) { return parse(snippet, false); }