[apex] Must use case-insensitive input stream to avoid choking on Unicode escape sequences

2024-10-21 23:45:20 +02:00
parent 97fe389dcc
commit 61b1c372cd
2 changed files with 38 additions and 1 deletion
--- a/pmd-apex/src/main/java/net/sourceforge/pmd/lang/apex/ast/ApexCommentBuilder.java
+++ b/pmd-apex/src/main/java/net/sourceforge/pmd/lang/apex/ast/ApexCommentBuilder.java
@@ -22,6 +22,7 @@ import net.sourceforge.pmd.lang.document.TextDocument;
 import net.sourceforge.pmd.lang.document.TextRegion;

 import io.github.apexdevtools.apexparser.ApexLexer;
+import io.github.apexdevtools.apexparser.CaseInsensitiveInputStream;

 @InternalApi
 final class ApexCommentBuilder {
@@ -103,7 +104,8 @@ final class ApexCommentBuilder {
    }

    private static CommentInformation extractInformationFromComments(TextDocument sourceCode, String suppressMarker) {
-        ApexLexer lexer = new ApexLexer(CharStreams.fromString(sourceCode.getText().toString()));
+        String source = sourceCode.getText().toString();
+        ApexLexer lexer = new ApexLexer(new CaseInsensitiveInputStream(CharStreams.fromString(source)));

        List<Token> allCommentTokens = new ArrayList<>();
        Map<Integer, String> suppressMap = new HashMap<>();
--- a/pmd-apex/src/test/java/net/sourceforge/pmd/lang/apex/ast/ApexLexerTest.java
+++ b/pmd-apex/src/test/java/net/sourceforge/pmd/lang/apex/ast/ApexLexerTest.java
@@ -8,14 +8,18 @@ package net.sourceforge.pmd.lang.apex.ast;
 import static org.junit.jupiter.api.Assertions.assertEquals;
 import static org.junit.jupiter.api.Assertions.assertNotNull;

+import org.antlr.v4.runtime.BaseErrorListener;
 import org.antlr.v4.runtime.CharStream;
 import org.antlr.v4.runtime.CharStreams;
 import org.antlr.v4.runtime.CommonTokenStream;
+import org.antlr.v4.runtime.RecognitionException;
+import org.antlr.v4.runtime.Recognizer;
 import org.antlr.v4.runtime.Token;
 import org.junit.jupiter.api.Test;

 import io.github.apexdevtools.apexparser.ApexLexer;
 import io.github.apexdevtools.apexparser.ApexParser;
+import io.github.apexdevtools.apexparser.CaseInsensitiveInputStream;

 /**
 * This is an exploration test for {@link ApexLexer}.
@@ -49,4 +53,35 @@ class ApexLexerTest {
        ApexParser.CompilationUnitContext compilationUnit = parser.compilationUnit();
        assertNotNull(compilationUnit);
    }
+
+    @Test
+    void testLexerUnicodeEscapes() {
+        String s = "'Fran\\u00E7ois'";
+        assertEquals(2, getLexingErrors(CharStreams.fromString(s)));
+        assertEquals(0, getLexingErrors(new CaseInsensitiveInputStream(CharStreams.fromString(s))));
+    }
+
+    private int getLexingErrors(CharStream stream) {
+        ApexLexer lexer = new ApexLexer(stream);
+        ErrorListener errorListener = new ErrorListener();
+        lexer.removeErrorListeners(); // Avoid distracting "token recognition error" stderr output
+        lexer.addErrorListener(errorListener);
+        CommonTokenStream tokens = new CommonTokenStream(lexer);
+        tokens.fill();
+        return errorListener.getErrorCount();
+    }
+
+    static class ErrorListener extends BaseErrorListener {
+        private int errorCount = 0;
+
+        @Override
+        public void syntaxError(Recognizer<?, ?> recognizer, Object offendingSymbol, int line,
+            int charPositionInLine, String msg, RecognitionException e) {
+            ++errorCount;
+        }
+
+        public int getErrorCount() {
+            return errorCount;
+        }
+    }
 }