Fix wrong grammar for digit separators

This commit is contained in:
Clément Fournier
2020-05-10 14:20:07 +02:00
parent 673fe3934d
commit c429a4e0c4
2 changed files with 18 additions and 15 deletions

View File

@ -284,20 +284,21 @@ TOKEN :
TOKEN [IGNORE_CASE] :
{
< OCTALINT : "0" (["'", "0"-"7"])* >
< #DECIMALDIGIT: ["0"-"9"] >
| < #OCTALDIGIT: ["0"-"7"] >
| < #HEXDIGIT: ["a"-"f", "A"-"F", "0"-"9"] >
| < OCTALINT : "0" ("'" <OCTALDIGIT> | <OCTALDIGIT>)* >
| < OCTALLONG : <OCTALINT> "l" >
| < UNSIGNED_OCTALINT : <OCTALINT> "u" >
| < UNSIGNED_OCTALLONG : <OCTALINT> ("ul" | "lu") >
| < #DECIMALDIGIT : ["'", "0"-"9"] >
| < DECIMALINT : ["1"-"9"] (<DECIMALDIGIT>)* >
| < DECIMALINT : ["1"-"9"] ("'" <DECIMALDIGIT> | <DECIMALDIGIT>)* >
| < DECIMALLONG : <DECIMALINT> ["u","l"] >
| < UNSIGNED_DECIMALINT : <DECIMALINT> "u" >
| < UNSIGNED_DECIMALLONG : <DECIMALINT> ("ul" | "lu") >
| < HEXADECIMALINT : "0x" (<HEXDIGIT>)+ >
// "0x", one mandatory hex digit, then any number of further hex digits,
// each optionally preceded by a single "'" digit separator.
// The repetition must be '*' (not '+'): the first digit is already required
// outside the group, so '+' would reject single-digit literals such as 0x1.
| < HEXADECIMALINT : "0x" <HEXDIGIT> ("'" <HEXDIGIT> | <HEXDIGIT>)* >
| < HEXADECIMALLONG : <HEXADECIMALINT> (["u","l"])? >
| < UNSIGNED_HEXADECIMALINT : <HEXADECIMALINT> "u" >
| < UNSIGNED_HEXADECIMALLONG : <HEXADECIMALINT> ("ul" | "lu") >
@ -384,20 +385,16 @@ TOKEN :
{
// the standard says ids are only CHAR_NON_DIGIT CHAR*
// where CHAR is at least [a-zA-Z_0-9],
// but can also be \ uhhhh or \ Uhhhhhh (unicode escapes)
// but can also be \ uhhhh or \ Uhhhhhhhh (unicode escapes)
// *and* other characters may be allowed by implementations, eg a raw unicode char (not escaped), or a dollar
// this grammar accepts unicode escapes in identifiers through the UNICODE_ESCAPE production
// ref: https://en.cppreference.com/w/cpp/language/identifiers#Unicode_characters_in_identifiers
< ID : <ID_START_CHAR> (<ID_CHAR>)* >
| < #UNICODE_ESCAPE: "\\" ( "u" <HEXDIGIT> <HEXDIGIT> <HEXDIGIT> <HEXDIGIT>
| "U" <HEXDIGIT> <HEXDIGIT> <HEXDIGIT> <HEXDIGIT> <HEXDIGIT> <HEXDIGIT>
) >
| < #HEXDIGIT: ["a"-"f", "A"-"F", "0"-"9"] >
| < #ID_CHAR: [
| < #UNICODE_ESCAPE: "\\" ( "u" <HEXDIGIT_4> | "U" <HEXDIGIT_4> <HEXDIGIT_4> ) >
| < #HEXDIGIT_4: <HEXDIGIT> <HEXDIGIT> <HEXDIGIT> <HEXDIGIT> >
| < #ID_CHAR: <UNICODE_ESCAPE> | [
"a"-"z", "A"-"Z", "0"-"9", "_", "$",
"\u00a8", "\u00aa", "\u00ad", "\u00af",
"\u00b2"-"\u00b5", "\u00b7"-"\u00ba",
@ -418,7 +415,7 @@ TOKEN :
] >
// this production is the same as the above,
// with some ranges subtracted
| < #ID_START_CHAR: [
| < #ID_START_CHAR: <UNICODE_ESCAPE> | [
"a"-"z", "A"-"Z", "_", "$",
"\u00a8", "\u00aa", "\u00ad", "\u00af",
"\u00b2"-"\u00b5", "\u00b7"-"\u00ba",

View File

@ -73,6 +73,12 @@ public class CPPTokenizerTest extends CpdTextComparisonTest {
doTest("specialComments");
}
/**
 * Tokenizes a small snippet whose last identifier ends with a
 * backslash-u unicode escape and checks the resulting token count.
 *
 * The expected count of 10 presumably relies on the escaped identifier
 * being lexed as a single token and on the end-of-input marker being
 * counted; confirm against parse() if the count ever changes.
 */
@Test
public void testUnicodeEscapeInIdentifier() {
    final String snippet = " void main() { int a\\u0048; }";
    final Tokens lexed = parse(snippet);
    assertEquals(10, lexed.size());
}
@Test
public void testMultiLineMacros() {
doTest("multilineMacros");