From c429a4e0c477080ab4a15148b0a5de83ec525a76 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Fournier?= Date: Sun, 10 May 2020 14:20:07 +0200 Subject: [PATCH] Fix wrong grammar for digit separators --- pmd-cpp/etc/grammar/cpp.jj | 27 +++++++++---------- .../sourceforge/pmd/cpd/CPPTokenizerTest.java | 6 +++++ 2 files changed, 18 insertions(+), 15 deletions(-) diff --git a/pmd-cpp/etc/grammar/cpp.jj b/pmd-cpp/etc/grammar/cpp.jj index 26426aaa13..2c48645a2f 100644 --- a/pmd-cpp/etc/grammar/cpp.jj +++ b/pmd-cpp/etc/grammar/cpp.jj @@ -284,20 +284,21 @@ TOKEN : TOKEN [IGNORE_CASE] : { - < OCTALINT : "0" (["'", "0"-"7"])* > + < #DECIMALDIGIT: ["0"-"9"] > +| < #OCTALDIGIT: ["0"-"7"] > +| < #HEXDIGIT: ["a"-"f", "A"-"F", "0"-"9"] > + +| < OCTALINT : "0" ("'" | <OCTALDIGIT>)* > | < OCTALLONG : <OCTALINT> "l" > | < UNSIGNED_OCTALINT : <OCTALINT> "u" > | < UNSIGNED_OCTALLONG : <OCTALINT> ("ul" | "lu") > -| < #DECIMALDIGIT : ["'", "0"-"9"] > - -| < DECIMALINT : ["1"-"9"] (<DECIMALDIGIT>)* > +| < DECIMALINT : ["1"-"9"] ("'" | <DECIMALDIGIT>)* > | < DECIMALLONG : <DECIMALINT> ["u","l"] > | < UNSIGNED_DECIMALINT : <DECIMALINT> "u" > | < UNSIGNED_DECIMALLONG : <DECIMALINT> ("ul" | "lu") > - -| < HEXADECIMALINT : "0x" (<HEXDIGIT>)+ > +| < HEXADECIMALINT : "0x" ("'" | <HEXDIGIT>)+ > | < HEXADECIMALLONG : <HEXADECIMALINT> (["u","l"])? > | < UNSIGNED_HEXADECIMALINT : <HEXADECIMALINT> "u" > | < UNSIGNED_HEXADECIMALLONG : <HEXADECIMALINT> ("ul" | "lu") > @@ -384,20 +385,16 @@ TOKEN : { // the standard says ids are only CHAR_NON_DIGIT CHAR* // where CHAR is at least [a-zA-Z_0-9], -// but can also be \ uhhhh or \ Uhhhhhh (unicode escapes) +// but can also be \ uhhhh or \ Uhhhhhhhh (unicode escapes) // *and* other characters may be allowed by implementations, eg a raw unicode char (not escaped), or a dollar -// this grammar doesn't allow unicode escapes, maybe it should? 
- // ref: https://en.cppreference.com/w/cpp/language/identifiers#Unicode_characters_in_identifiers < ID : <ID_START_CHAR> (<ID_CHAR>)* > -| < #UNICODE_ESCAPE: "\\" ( "u" - | "U" - ) > -| < #HEXDIGIT: ["a"-"f", "A"-"F", "0"-"9"] > -| < #ID_CHAR: [ +| < #UNICODE_ESCAPE: "\\" ( "u" <HEXDIGIT_4> | "U" <HEXDIGIT_4> <HEXDIGIT_4> ) > +| < #HEXDIGIT_4: <HEXDIGIT> <HEXDIGIT> <HEXDIGIT> <HEXDIGIT> > +| < #ID_CHAR: <UNICODE_ESCAPE> | [ "a"-"z", "A"-"Z", "0"-"9", "_", "$", "\u00a8", "\u00aa", "\u00ad", "\u00af", "\u00b2"-"\u00b5", "\u00b7"-"\u00ba", @@ -418,7 +415,7 @@ TOKEN : ] > // this production is the same as the above, // with some ranges subtracted -| < #ID_START_CHAR: [ +| < #ID_START_CHAR: <UNICODE_ESCAPE> | [ "a"-"z", "A"-"Z", "_", "$", "\u00a8", "\u00aa", "\u00ad", "\u00af", "\u00b2"-"\u00b5", "\u00b7"-"\u00ba", diff --git a/pmd-cpp/src/test/java/net/sourceforge/pmd/cpd/CPPTokenizerTest.java b/pmd-cpp/src/test/java/net/sourceforge/pmd/cpd/CPPTokenizerTest.java index 0c36ed6500..5add8c9ffd 100644 --- a/pmd-cpp/src/test/java/net/sourceforge/pmd/cpd/CPPTokenizerTest.java +++ b/pmd-cpp/src/test/java/net/sourceforge/pmd/cpd/CPPTokenizerTest.java @@ -73,6 +73,12 @@ public class CPPTokenizerTest extends CpdTextComparisonTest { doTest("specialComments"); } + @Test + public void testUnicodeEscapeInIdentifier() { + Tokens tokens = parse(" void main() { int a\\u0048; }"); + assertEquals(10, tokens.size()); + } + @Test public void testMultiLineMacros() { doTest("multilineMacros");