Relaxed C++ tokenizer so a '\' (backslash) can be used to escape any character inside a string or character literal.

The following 2 code snippets could not be tokenized:

1:  if (*pbuf == '\0x05'), the problem is the '\0' in the character
literal '\0x05'.
2:  szPath = m_sdcacheDir + _T("\    oMedia");, the problem is the '\ '
in the string literal "\    oMedia".

I relaxed the lexical grammar so a '\' (backslash) can escape any
character inside a string or character literal. We can relax the grammar
because CPD only needs the tokens, so it is no problem to accept string /
character literals that are 'invalid' according to the ANSI C standard.
Failing too fast because the tokenizer is too strict is annoying because
then we can't check the files for duplicated code.

Both snippets were taken from existing projects and compile successfully,
so for some C / C++ compilers they are valid code.
This commit is contained in:
Jan van Nunen
2015-01-13 11:14:42 +01:00
parent d830974842
commit c92dc9706c
2 changed files with 14 additions and 28 deletions

View File

@ -318,35 +318,9 @@ TOKEN [IGNORE_CASE] :
TOKEN :
{
< CHARACTER : ("L")? "'"
( (~["'","\\","\n","\r"])*
| ("\\" (
["n","t","v","b","r","f","a","\\","?","'","\""]
|
"0" (["0"-"7"])*
|
["1"-"9"] (["0"-"9"])*
|
("x" | "X") (["0"-"9","a"-"f","A"-"F"])+
)
)
)
"'" >
< CHARACTER : ("L")? "'" ( ( ~["'","\\","\r","\n"] ) | ( "\\" ( ~["\n","\r"] ) ) )* "'" >
| < STRING : ("L")? "\""
( ( ~["\"","\\","\n","\r"])
| ("\\" (
["n","t","v","b","r","f","a","\\","?","'","\"","\n"]
|
"0" (["0"-"7"])*
|
["1"-"9"] (["0"-"9"])*
|
("x" | "X") (["0"-"9","a"-"f","A"-"F"])+
)
)
)*
"\"" >
| < STRING : ("L")? "\"" ( ( ~["\"","\\","\r","\n"] ) | ( "\\" ( ~["\n","\r"] | "\n" | "\r\n" ) ) )* "\"" >
}
void translation_unit() :

View File

@ -113,6 +113,18 @@ public class CPPTokenizerTest {
assertEquals(15, tokens.size());
}
/**
 * Tokenizes an {@code if} condition containing the character literal
 * {@code '\0x05'} and expects the tokenizer to produce 8 tokens.
 */
@Test
public void testHexCharacter() {
    final String code = "if (*pbuf == '\\0x05')" + PMD.EOL;
    final Tokens result = parse(code);
    assertEquals(8, result.size());
}
/**
 * Tokenizes an assignment whose string literal contains a backslash
 * followed by a space and expects the tokenizer to produce 10 tokens.
 */
@Test
public void testWhiteSpaceEscape() {
    final String code = "szPath = m_sdcacheDir + _T(\"\\ oMedia\");" + PMD.EOL;
    final Tokens result = parse(code);
    assertEquals(10, result.size());
}
/**
 * Convenience overload: tokenizes {@code snippet} by delegating to
 * {@code parse(String, boolean)} with {@code false} as the second argument.
 *
 * @param snippet the source text to tokenize
 * @return the tokens produced for the snippet
 */
private Tokens parse(String snippet) {
    final Tokens result = parse(snippet, false);
    return result;
}