[apex] Use case-insensitive input stream to avoid choking on Unicode escape sequences (#5284)

Merge pull request #5284 from wahajenius:main
2024-11-14 18:23:26 +01:00 · 2024-11-14 18:23:26 +01:00 · 206ed8bbd3
commit 206ed8bbd3
parent 2df68ed168 e1d4f27e19
6 changed files with 188 additions and 117 deletions
--- a/.all-contributorsrc
+++ b/.all-contributorsrc
@ -7865,6 +7865,15 @@
      "contributions": [
        "code"
      ]
+    },
+    {
+      "login": "wahajenius",
+      "name": "Willem A. Hajenius",
+      "avatar_url": "https://avatars.githubusercontent.com/u/7836322?v=4",
+      "profile": "https://github.com/wahajenius",
+      "contributions": [
+        "code"
+      ]
    }
  ],
  "contributorsPerLine": 7,
--- a/docs/pages/pmd/projectdocs/credits.md
+++ b/docs/pages/pmd/projectdocs/credits.md
--- a/docs/pages/release_notes.md
+++ b/docs/pages/release_notes.md
@ -17,6 +17,8 @@ This is a {{ site.pmd.release_type }} release.
 ### 🐛 Fixed Issues
 * ant
  * [#1860](https://github.com/pmd/pmd/issues/1860): \[ant] Reflective access warnings on java > 9 and java < 17
+* apex
+  * [#5333](https://github.com/pmd/pmd/issues/5333): \[apex] Token recognition errors for string containing unicode escape sequence
 * java
  * [#5293](https://github.com/pmd/pmd/issues/5293): \[java] Deadlock when executing PMD in multiple threads
  * [#5324](https://github.com/pmd/pmd/issues/5324): \[java] Issue with type inference of nested lambdas
@ -29,6 +31,7 @@ This is a {{ site.pmd.release_type }} release.
    instead (note different package `ast` instead of `antlr4`).

 ### ✨ External Contributions
+* [#5284](https://github.com/pmd/pmd/pull/5284): \[apex] Use case-insensitive input stream to avoid choking on Unicode escape sequences - [Willem A. Hajenius](https://github.com/wahajenius) (@wahajenius)

 {% endtocmaker %}

--- a/pmd-apex/src/main/java/net/sourceforge/pmd/lang/apex/ast/ApexCommentBuilder.java
+++ b/pmd-apex/src/main/java/net/sourceforge/pmd/lang/apex/ast/ApexCommentBuilder.java
@ -14,14 +14,19 @@ import java.util.List;
 import java.util.Map;
 import java.util.RandomAccess;

+import org.antlr.v4.runtime.BaseErrorListener;
 import org.antlr.v4.runtime.CharStreams;
+import org.antlr.v4.runtime.RecognitionException;
+import org.antlr.v4.runtime.Recognizer;
 import org.antlr.v4.runtime.Token;

 import net.sourceforge.pmd.annotation.InternalApi;
+import net.sourceforge.pmd.lang.ast.LexException;
 import net.sourceforge.pmd.lang.document.TextDocument;
 import net.sourceforge.pmd.lang.document.TextRegion;

 import io.github.apexdevtools.apexparser.ApexLexer;
+import io.github.apexdevtools.apexparser.CaseInsensitiveInputStream;

@InternalApi
 final class ApexCommentBuilder {
@ -103,7 +108,15 @@ final class ApexCommentBuilder {
    }

    private static CommentInformation extractInformationFromComments(TextDocument sourceCode, String suppressMarker) {
-        ApexLexer lexer = new ApexLexer(CharStreams.fromString(sourceCode.getText().toString()));
+        String source = sourceCode.getText().toString();
+        ApexLexer lexer = new ApexLexer(new CaseInsensitiveInputStream(CharStreams.fromString(source)));
+        lexer.removeErrorListeners();
+        lexer.addErrorListener(new BaseErrorListener() {
+            @Override
+            public void syntaxError(Recognizer<?, ?> recognizer, Object offendingSymbol, int line, int charPositionInLine, String msg, RecognitionException e) {
+                throw new LexException(line, charPositionInLine, sourceCode.getFileId(), msg, e);
+            }
+        });

        List<Token> allCommentTokens = new ArrayList<>();
        Map<Integer, String> suppressMap = new HashMap<>();
--- a/pmd-apex/src/test/java/net/sourceforge/pmd/lang/apex/ast/ApexCommentTest.java
+++ b/pmd-apex/src/test/java/net/sourceforge/pmd/lang/apex/ast/ApexCommentTest.java
@ -66,4 +66,12 @@ class ApexCommentTest extends ApexParserTestBase {
        ASTFormalComment comment = file.descendants(ASTUserClass.class).children(ASTFormalComment.class).first();
        assertEquals(FORMAL_COMMENT_CONTENT, comment.getImage());
    }
+
+    @Test
+    void fileWithUnicodeEscapes() {
+        // Regression check: a Unicode escape inside a string literal must not get in the way of the formal comment.
+        String code = FORMAL_COMMENT_CONTENT + "\n" + "class MyClass { String s = 'Fran\\u00E7ois'; }";
+        ASTFormalComment formalComment = apex.parse(code).descendants(ASTUserClass.class).children(ASTFormalComment.class).first();
+        assertEquals(FORMAL_COMMENT_CONTENT, formalComment.getImage());
+    }
 }
--- a/pmd-apex/src/test/java/net/sourceforge/pmd/lang/apex/ast/ApexLexerTest.java
+++ b/pmd-apex/src/test/java/net/sourceforge/pmd/lang/apex/ast/ApexLexerTest.java
@ -8,14 +8,18 @@ package net.sourceforge.pmd.lang.apex.ast;
 import static org.junit.jupiter.api.Assertions.assertEquals;
 import static org.junit.jupiter.api.Assertions.assertNotNull;

+import org.antlr.v4.runtime.BaseErrorListener;
 import org.antlr.v4.runtime.CharStream;
 import org.antlr.v4.runtime.CharStreams;
 import org.antlr.v4.runtime.CommonTokenStream;
+import org.antlr.v4.runtime.RecognitionException;
+import org.antlr.v4.runtime.Recognizer;
 import org.antlr.v4.runtime.Token;
 import org.junit.jupiter.api.Test;

 import io.github.apexdevtools.apexparser.ApexLexer;
 import io.github.apexdevtools.apexparser.ApexParser;
+import io.github.apexdevtools.apexparser.CaseInsensitiveInputStream;

 /**
 * This is an exploration test for {@link ApexLexer}.
@ -49,4 +53,36 @@ class ApexLexerTest {
        ApexParser.CompilationUnitContext compilationUnit = parser.compilationUnit();
        assertNotNull(compilationUnit);
    }
+
+    @Test
+    void testLexerUnicodeEscapes() {
+        String s = "'Fran\\u00E7ois'"; // Apex string literal containing a Unicode escape sequence
+        // note: with apex-parser 4.3.1, no errors are reported anymore (not even for the plain stream)
+        assertEquals(0, getLexingErrors(CharStreams.fromString(s)));
+        assertEquals(0, getLexingErrors(new CaseInsensitiveInputStream(CharStreams.fromString(s))));
+    }
+
+    private int getLexingErrors(CharStream stream) {
+        ApexLexer lexer = new ApexLexer(stream);
+        ErrorListener errorListener = new ErrorListener();
+        lexer.removeErrorListeners(); // Avoid distracting "token recognition error" stderr output
+        lexer.addErrorListener(errorListener);
+        CommonTokenStream tokens = new CommonTokenStream(lexer);
+        tokens.fill();
+        return errorListener.getErrorCount();
+    }
+
+    private static class ErrorListener extends BaseErrorListener {
+        private int errorCount = 0;
+
+        @Override
+        public void syntaxError(Recognizer<?, ?> recognizer, Object offendingSymbol, int line,
+            int charPositionInLine, String msg, RecognitionException e) {
+            ++errorCount;
+        }
+
+        public int getErrorCount() {
+            return errorCount;
+        }
+    }
 }