From c92dc9706c376aa390c02a658793dba86a140d92 Mon Sep 17 00:00:00 2001
From: Jan van Nunen <jan.van.nunen@tiobe.com>
Date: Tue, 13 Jan 2015 11:14:42 +0100
Subject: [PATCH] Relaxed C++ tokenizer so a '\'(backslash) can be used to
 escape any character inside a string or character literal.

The following 2 code snippets could not be tokenized:

1:  if (*pbuf == '\0x05'), the problem is the '\0' in the character
literal '\0x05'.
2:  szPath = m_sdcacheDir + _T("\    oMedia");, the problem is the '\ '
in the string literal "\    oMedia".

I relaxed the lexical grammar so a '\' (backslash) can escape any
character inside a string or character literal. We can relax the grammar
because CPD only needs the tokens, so it is no problem to accept string /
character literals that are 'invalid' according to the ANSI C standard.
Failing too fast because the tokenizer is too strict is annoying because
then we can't check the files for duplicated code.

Both snippets were taken from existing projects and can be successfully
compiled, so for some C / C++ compilers it is valid code.
---
 pmd-cpp/etc/grammar/cpp.jj                    | 30 ++-----------------
 .../sourceforge/pmd/cpd/CPPTokenizerTest.java | 12 ++++++++
 2 files changed, 14 insertions(+), 28 deletions(-)

diff --git a/pmd-cpp/etc/grammar/cpp.jj b/pmd-cpp/etc/grammar/cpp.jj
index a47a0b7ce3..b535533533 100644
--- a/pmd-cpp/etc/grammar/cpp.jj
+++ b/pmd-cpp/etc/grammar/cpp.jj
@@ -318,35 +318,9 @@ TOKEN [IGNORE_CASE] :
 TOKEN :
 {
 
-  <  CHARACTER : ("L")? "'"
-   (   (~["'","\\","\n","\r"])*
-   | ("\\" (
-             ["n","t","v","b","r","f","a","\\","?","'","\""]
-            |
-             "0" (["0"-"7"])*
-            |
-             ["1"-"9"] (["0"-"9"])*
-            |
-             ("x" | "X") (["0"-"9","a"-"f","A"-"F"])+
-           )
-     )
-   )
-   "'" >
+  <  CHARACTER : ("L")? "'" ( ( ~["'","\\","\r","\n"] ) | ( "\\" ( ~["\n","\r"] ) ) )* "'" >
 
-| <  STRING : ("L")? "\""
-   ( ( ~["\"","\\","\n","\r"])
-   | ("\\" (
-             ["n","t","v","b","r","f","a","\\","?","'","\"","\n"]
-            |
-             "0" (["0"-"7"])*
-            |
-             ["1"-"9"] (["0"-"9"])*
-            |
-             ("x" | "X") (["0"-"9","a"-"f","A"-"F"])+
-           )
-     )
-   )*
-   "\"" >
+| <  STRING : ("L")? "\"" ( ( ~["\"","\\","\r","\n"] ) | ( "\\" ( ~["\n","\r"] | "\n" | "\r\n" ) ) )* "\"" >
 }
 
 void translation_unit() :
diff --git a/pmd-cpp/src/test/java/net/sourceforge/pmd/cpd/CPPTokenizerTest.java b/pmd-cpp/src/test/java/net/sourceforge/pmd/cpd/CPPTokenizerTest.java
index aac224ce3b..4b78507736 100644
--- a/pmd-cpp/src/test/java/net/sourceforge/pmd/cpd/CPPTokenizerTest.java
+++ b/pmd-cpp/src/test/java/net/sourceforge/pmd/cpd/CPPTokenizerTest.java
@@ -113,6 +113,18 @@ public class CPPTokenizerTest {
         assertEquals(15, tokens.size());
     }
 
+    @Test
+    public void testHexCharacter() {
+        Tokens tokens = parse("if (*pbuf == '\\0x05')" + PMD.EOL);
+        assertEquals(8, tokens.size());
+    }
+
+    @Test
+    public void testWhiteSpaceEscape() {
+        Tokens tokens = parse("szPath = m_sdcacheDir + _T(\"\\    oMedia\");" + PMD.EOL);
+        assertEquals(10, tokens.size());
+    }
+
     private Tokens parse(String snippet) {
         return parse(snippet, false);
     }