c++ ident unicode support

2020-05-07 16:11:18 +02:00
parent 534bfe3c55
commit 185c172b3c
5 changed files with 86 additions and 4 deletions
--- a/pmd-cpp/etc/grammar/cpp.jj
+++ b/pmd-cpp/etc/grammar/cpp.jj
@@ -382,7 +382,63 @@ rstringbody:

 TOKEN :
 {
-  < ID : ["$", "a"-"z","A"-"Z", "_"] (["a"-"z","A"-"Z","0"-"9","_","$"])* >
+// the standard says identifiers are only CHAR_NON_DIGIT CHAR*
+// where CHAR is at least [a-zA-Z_0-9],
+// but can also be \ uhhhh or \ Uhhhhhhhh (universal character names, 4 or 8 hex digits)
+// *and* other characters may be allowed by implementations, e.g. a raw unicode char (not escaped), or a dollar
+
+// this grammar doesn't allow unicode escapes
+
+// ref: https://en.cppreference.com/w/cpp/language/identifiers#Unicode_characters_in_identifiers
+
+  < ID : <ID_START_CHAR> (<ID_CHAR>)* >
+
+// characters accepted at any position of an identifier
+| < #ID_CHAR: [
+    "a"-"z", "A"-"Z", "0"-"9", "_", "$",
+    "\u00a8", "\u00aa", "\u00ad", "\u00af",
+    "\u00b2"-"\u00b5", "\u00b7"-"\u00ba", "\u00bc"-"\u00be",
+    "\u00c0"-"\u00d6", "\u00d8"-"\u00f6",
+    "\u00f8"-"\u167f", "\u1681"-"\u180d",
+    "\u180f"-"\u1fff", "\u200b"-"\u200d",
+    "\u202a"-"\u202e", "\u203f"-"\u2040",
+    "\u2054", "\u2060"-"\u218f",
+    "\u2460"-"\u24ff", "\u2776"-"\u2793",
+    "\u2c00"-"\u2dff", "\u2e80"-"\u2fff",
+    "\u3004"-"\u3007", "\u3021"-"\u302f",
+    "\u3031"-"\ud7ff", "\uf900"-"\ufd3d",
+    "\ufd40"-"\ufdcf", "\ufdf0"-"\ufe44",
+    "\ufe47"-"\ufffd"
+    // the standard also allows code points in planes 1 through e,
+    // but javacc doesn't support supplementary characters
+  ] >
+// same set as ID_CHAR, minus the digits and minus the ranges
+// named in the "subtracted" comments below
+| < #ID_START_CHAR: [
+     "a"-"z", "A"-"Z", "_", "$",
+     "\u00a8", "\u00aa", "\u00ad", "\u00af",
+     "\u00b2"-"\u00b5", "\u00b7"-"\u00ba",
+     "\u00bc"-"\u00be",
+     "\u00c0"-"\u00d6", "\u00d8"-"\u00f6",
+     // subtracted u+0300-u+036f from u+00f8-u+167f
+     "\u00f8"-"\u02ff", "\u0370"-"\u167f",
+     "\u1681"-"\u180d",
+     // subtracted u+1dc0-u+1dff from u+180f-u+1fff
+     "\u180f"-"\u1dbf", "\u1e00"-"\u1fff",
+     "\u200b"-"\u200d",
+     "\u202a"-"\u202e", "\u203f"-"\u2040",
+     "\u2054",
+     // subtracted u+20d0-u+20ff from u+2060-u+218f
+     "\u2060"-"\u20cf", "\u2100"-"\u218f",
+     "\u2460"-"\u24ff", "\u2776"-"\u2793",
+     "\u2c00"-"\u2dff", "\u2e80"-"\u2fff",
+     "\u3004"-"\u3007", "\u3021"-"\u302f",
+     "\u3031"-"\ud7ff", "\uf900"-"\ufd3d",
+     "\ufd40"-"\ufdcf",
+     // subtracted u+fe20-u+fe2f from u+fdf0-u+fe44
+     "\ufdf0"-"\ufe1f", "\ufe30"-"\ufe44",
+     "\ufe47"-"\ufffd"
+   ] >
 }


--- a/pmd-cpp/src/test/java/net/sourceforge/pmd/cpd/CPPTokenizerTest.java
+++ b/pmd-cpp/src/test/java/net/sourceforge/pmd/cpd/CPPTokenizerTest.java
@@ -64,7 +64,7 @@ public class CPPTokenizerTest extends CpdTextComparisonTest {
    }

    @Test
-    public void testUnicodeSupport() {
+    public void testUnicodeStringSupport() {
        doTest("unicodeStrings");
    }

@@ -79,10 +79,15 @@ public class CPPTokenizerTest extends CpdTextComparisonTest {
    }

    @Test
-    public void testDollarSignInIdentifier() {
+    public void testIdentifierValidChars() {
        doTest("identifierChars");
    }

+    @Test
+    public void testWrongUnicodeInIdentifier() {
+        // U+269C is outside every range accepted by the grammar's ID token, so a TokenMgrError is expected
+        expectTokenMgrError(" void main() { int ⚜ = __; }");
+    }

    @Test
    public void testTokenizerWithSkipBlocks() {
--- a/pmd-cpp/src/test/resources/net/sourceforge/pmd/lang/cpp/cpd/testdata/identifierChars.cpp
+++ b/pmd-cpp/src/test/resources/net/sourceforge/pmd/lang/cpp/cpd/testdata/identifierChars.cpp
@@ -1,5 +1,6 @@
 void main() {
    int x$y = 42;
    int $yx = 42;
-
+    int 県 = µweiß42;
+    int ❶ = __;
 }
--- a/pmd-cpp/src/test/resources/net/sourceforge/pmd/lang/cpp/cpd/testdata/identifierChars.txt
+++ b/pmd-cpp/src/test/resources/net/sourceforge/pmd/lang/cpp/cpd/testdata/identifierChars.txt
@@ -17,6 +17,18 @@ L3
    [=]                                     13        13
    [42]                                    15        16
    [;]                                     17        17
+L4
+    [int]                                   5         7
+    [県]                                     9         9
+    [=]                                     11        11
+    [µweiß42]                               13        19
+    [;]                                     20        20
 L5
+    [int]                                   5         7
+    [❶]                                     9         9
+    [=]                                     11        11
+    [__]                                    13        14
+    [;]                                     15        15
+L6
    [}]                                     2         2
 EOF
--- a/pmd-lang-test/src/main/kotlin/net/sourceforge/pmd/cpd/test/CpdTextComparisonTest.kt
+++ b/pmd-lang-test/src/main/kotlin/net/sourceforge/pmd/cpd/test/CpdTextComparisonTest.kt
@@ -4,10 +4,12 @@

 package net.sourceforge.pmd.cpd.test

+import io.kotlintest.shouldThrow
 import net.sourceforge.pmd.cpd.SourceCode
 import net.sourceforge.pmd.cpd.TokenEntry
 import net.sourceforge.pmd.cpd.Tokenizer
 import net.sourceforge.pmd.cpd.Tokens
+import net.sourceforge.pmd.lang.ast.TokenMgrError
 import net.sourceforge.pmd.test.BaseTextComparisonTest
 import org.apache.commons.lang3.StringUtils
 import java.util.*
@@ -55,6 +57,12 @@ abstract class CpdTextComparisonTest(
        }
    }

+    /**
+     * Tokenizes [source] with a tokenizer built from [properties] and returns the [TokenMgrError] it is expected to throw.
+     */
+    @JvmOverloads
+    fun expectTokenMgrError(source: String, properties: Properties = defaultProperties()): TokenMgrError =
+            shouldThrow<TokenMgrError> { newTokenizer(properties).tokenize(sourceCodeOf(source), Tokens()) }

    private fun StringBuilder.format(tokens: Tokens) {
        appendHeader().appendln()