c++ ident unicode support
This commit is contained in:
@@ -382,7 +382,63 @@ rstringbody:
|
||||
|
||||
TOKEN :
|
||||
{
|
||||
< ID : ["$", "a"-"z","A"-"Z", "_"] (["a"-"z","A"-"Z","0"-"9","_","$"])* >
|
||||
// the standard says ids are only CHAR_NON_DIGIT CHAR*
|
||||
// where CHAR is at least [a-zA-Z_0-9],
|
||||
// but can also be \ uhhhh or \ Uhhhhhh (unicode escapes)
|
||||
// *and* other characters may be allowed by implementations, eg a raw unicode char (not escaped), or a dollar
|
||||
|
||||
// this grammar doesn't allow unicode escapes
|
||||
|
||||
// ref: https://en.cppreference.com/w/cpp/language/identifiers#Unicode_characters_in_identifiers
|
||||
|
||||
< ID : <ID_START_CHAR> (<ID_CHAR>)* >
|
||||
|
||||
| < #ID_CHAR: [
      // Characters accepted at any position of an identifier: ASCII letters,
      // digits and underscore, the dollar sign (an implementation extension),
      // and the raw (unescaped) unicode ranges listed below.
      "a"-"z", "A"-"Z", "0"-"9", "_", "$",
      "\u00a8", "\u00aa", "\u00ad", "\u00af",
      "\u00b2"-"\u00b5", "\u00b7"-"\u00ba",
      "\u00bc"-"\u00be",
      "\u00c0"-"\u00d6", "\u00d8"-"\u00f6",
      "\u00f8"-"\u167f", "\u1681"-"\u180d",
      "\u180f"-"\u1fff", "\u200b"-"\u200d",
      "\u202a"-"\u202e", "\u203f"-"\u2040",
      "\u2054", "\u2060"-"\u218f",
      "\u2460"-"\u24ff", "\u2776"-"\u2793",
      "\u2c00"-"\u2dff", "\u2e80"-"\u2fff",
      "\u3004"-"\u3007", "\u3021"-"\u302f",
      "\u3031"-"\ud7ff", "\uf900"-"\ufd3d",
      "\ufd40"-"\ufdcf", "\ufdf0"-"\ufe44",
      "\ufe47"-"\ufffd"
      // the standard also allows code points in planes 1 through e,
      // but javacc doesn't support supplementary characters
  ] >
|
||||
// this production is the same as ID_CHAR,
// with the digits and some ranges subtracted (each subtraction is noted inline)
| < #ID_START_CHAR: [
      "a"-"z", "A"-"Z", "_", "$",
      "\u00a8", "\u00aa", "\u00ad", "\u00af",
      "\u00b2"-"\u00b5", "\u00b7"-"\u00ba",
      "\u00bc"-"\u00be",
      "\u00c0"-"\u00d6", "\u00d8"-"\u00f6",
      // subtracted u+0300-u+036f from u+00f8-u+167f
      "\u00f8"-"\u02ff", "\u0370"-"\u167f",
      "\u1681"-"\u180d",
      // subtracted u+1dc0-u+1dff from u+180f-u+1fff
      "\u180f"-"\u1dbf", "\u1e00"-"\u1fff",
      "\u200b"-"\u200d",
      "\u202a"-"\u202e", "\u203f"-"\u2040",
      "\u2054",
      // subtracted u+20d0-u+20ff from u+2060-u+218f
      "\u2060"-"\u20cf", "\u2100"-"\u218f",
      "\u2460"-"\u24ff", "\u2776"-"\u2793",
      "\u2c00"-"\u2dff", "\u2e80"-"\u2fff",
      "\u3004"-"\u3007", "\u3021"-"\u302f",
      "\u3031"-"\ud7ff", "\uf900"-"\ufd3d",
      "\ufd40"-"\ufdcf",
      // subtracted u+fe20-u+fe2f from u+fdf0-u+fe44
      "\ufdf0"-"\ufe1f", "\ufe30"-"\ufe44",
      "\ufe47"-"\ufffd"
  ] >
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -64,7 +64,7 @@ public class CPPTokenizerTest extends CpdTextComparisonTest {
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testUnicodeSupport() {
|
||||
public void testUnicodeStringSupport() {
|
||||
doTest("unicodeStrings");
|
||||
}
|
||||
|
||||
@@ -79,10 +79,15 @@ public class CPPTokenizerTest extends CpdTextComparisonTest {
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testDollarSignInIdentifier() {
|
||||
public void testIdentifierValidChars() {
|
||||
doTest("identifierChars");
|
||||
}
|
||||
|
||||
/**
 * Checks that a raw unicode character outside the identifier ranges of the
 * grammar is rejected. The fleur-de-lis used here is U+269C, which falls
 * between the accepted ranges u+2460-u+24ff and u+2776-u+2793, so the
 * tokenizer is expected to raise a TokenMgrError instead of producing an
 * identifier token.
 */
@Test
|
||||
public void testWrongUnicodeInIdentifier() {
|
||||
expectTokenMgrError(" void main() { int ⚜ = __; }");
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void testTokenizerWithSkipBlocks() {
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
void main() {
|
||||
int x$y = 42;
|
||||
int $yx = 42;
|
||||
|
||||
int 県 = µweiß42;
|
||||
int ❶ = __;
|
||||
}
|
||||
@@ -17,6 +17,18 @@ L3
|
||||
[=] 13 13
|
||||
[42] 15 16
|
||||
[;] 17 17
|
||||
L4
|
||||
[int] 5 7
|
||||
[県] 9 9
|
||||
[=] 11 11
|
||||
[µweiß42] 13 19
|
||||
[;] 20 20
|
||||
L5
|
||||
[int] 5 7
|
||||
[❶] 9 9
|
||||
[=] 11 11
|
||||
[__] 13 14
|
||||
[;] 15 15
|
||||
L6
|
||||
[}] 2 2
|
||||
EOF
|
||||
|
||||
@@ -4,10 +4,12 @@
|
||||
|
||||
package net.sourceforge.pmd.cpd.test
|
||||
|
||||
import io.kotlintest.shouldThrow
|
||||
import net.sourceforge.pmd.cpd.SourceCode
|
||||
import net.sourceforge.pmd.cpd.TokenEntry
|
||||
import net.sourceforge.pmd.cpd.Tokenizer
|
||||
import net.sourceforge.pmd.cpd.Tokens
|
||||
import net.sourceforge.pmd.lang.ast.TokenMgrError
|
||||
import net.sourceforge.pmd.test.BaseTextComparisonTest
|
||||
import org.apache.commons.lang3.StringUtils
|
||||
import java.util.*
|
||||
@@ -55,6 +57,12 @@ abstract class CpdTextComparisonTest(
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Tokenizes [source] with a tokenizer built from [properties] and expects
 * the attempt to fail with a [TokenMgrError].
 *
 * The tokenizer construction, the source wrapping and the tokenization all
 * run inside the `shouldThrow` block, so an error raised by any of those
 * steps is the one captured.
 *
 * @param source     text handed to the tokenizer
 * @param properties tokenizer configuration; Java callers may omit it
 *                   thanks to [JvmOverloads]
 * @return the [TokenMgrError] that was thrown
 */
@JvmOverloads
fun expectTokenMgrError(source: String, properties: Properties = defaultProperties()): TokenMgrError {
    return shouldThrow<TokenMgrError> {
        val tokenizer = newTokenizer(properties)
        val code = sourceCodeOf(source)
        tokenizer.tokenize(code, Tokens())
    }
}
|
||||
|
||||
|
||||
private fun StringBuilder.format(tokens: Tokens) {
|
||||
appendHeader().appendln()
|
||||
|
||||
Reference in New Issue
Block a user