[apex] Use case-insensitive input stream to avoid choking on Unicode escape sequences (#5284)

Merge pull request #5284 from wahajenius:main
This commit is contained in:
Andreas Dangel 2024-11-14 18:23:26 +01:00
commit 206ed8bbd3
No known key found for this signature in database
GPG Key ID: 93450DF2DF9A3FA3
6 changed files with 188 additions and 117 deletions

View File

@ -7865,6 +7865,15 @@
"contributions": [
"code"
]
},
{
"login": "wahajenius",
"name": "Willem A. Hajenius",
"avatar_url": "https://avatars.githubusercontent.com/u/7836322?v=4",
"profile": "https://github.com/wahajenius",
"contributions": [
"code"
]
}
],
"contributorsPerLine": 7,

File diff suppressed because it is too large Load Diff

View File

@ -17,6 +17,8 @@ This is a {{ site.pmd.release_type }} release.
### 🐛 Fixed Issues
* ant
* [#1860](https://github.com/pmd/pmd/issues/1860): \[ant] Reflective access warnings on java > 9 and java < 17
* apex
* [#5333](https://github.com/pmd/pmd/issues/5333): \[apex] Token recognition errors for string containing unicode escape sequence
* java
* [#5293](https://github.com/pmd/pmd/issues/5293): \[java] Deadlock when executing PMD in multiple threads
* [#5324](https://github.com/pmd/pmd/issues/5324): \[java] Issue with type inference of nested lambdas
@ -29,6 +31,7 @@ This is a {{ site.pmd.release_type }} release.
instead (note different package `ast` instead of `antlr4`).
### ✨ External Contributions
* [#5284](https://github.com/pmd/pmd/pull/5284): \[apex] Use case-insensitive input stream to avoid choking on Unicode escape sequences - [Willem A. Hajenius](https://github.com/wahajenius) (@wahajenius)
{% endtocmaker %}

View File

@ -14,14 +14,19 @@ import java.util.List;
import java.util.Map;
import java.util.RandomAccess;
import org.antlr.v4.runtime.BaseErrorListener;
import org.antlr.v4.runtime.CharStreams;
import org.antlr.v4.runtime.RecognitionException;
import org.antlr.v4.runtime.Recognizer;
import org.antlr.v4.runtime.Token;
import net.sourceforge.pmd.annotation.InternalApi;
import net.sourceforge.pmd.lang.ast.LexException;
import net.sourceforge.pmd.lang.document.TextDocument;
import net.sourceforge.pmd.lang.document.TextRegion;
import io.github.apexdevtools.apexparser.ApexLexer;
import io.github.apexdevtools.apexparser.CaseInsensitiveInputStream;
@InternalApi
final class ApexCommentBuilder {
@ -103,7 +108,15 @@ final class ApexCommentBuilder {
}
private static CommentInformation extractInformationFromComments(TextDocument sourceCode, String suppressMarker) {
ApexLexer lexer = new ApexLexer(CharStreams.fromString(sourceCode.getText().toString()));
String source = sourceCode.getText().toString();
ApexLexer lexer = new ApexLexer(new CaseInsensitiveInputStream(CharStreams.fromString(source)));
lexer.removeErrorListeners();
lexer.addErrorListener(new BaseErrorListener() {
@Override
public void syntaxError(Recognizer<?, ?> recognizer, Object offendingSymbol, int line, int charPositionInLine, String msg, RecognitionException e) {
throw new LexException(line, charPositionInLine, sourceCode.getFileId(), msg, e);
}
});
List<Token> allCommentTokens = new ArrayList<>();
Map<Integer, String> suppressMap = new HashMap<>();

View File

@ -66,4 +66,12 @@ class ApexCommentTest extends ApexParserTestBase {
ASTFormalComment comment = file.descendants(ASTUserClass.class).children(ASTFormalComment.class).first();
assertEquals(FORMAL_COMMENT_CONTENT, comment.getImage());
}
/**
 * Parses a source consisting of the formal comment followed by a class whose
 * string literal contains a Unicode escape sequence, and checks that the formal
 * comment found under the class still has the expected image.
 */
@Test
void fileWithUnicodeEscapes() {
    String classWithEscape = "class MyClass { String s = 'Fran\\u00E7ois'; }";
    ASTApexFile root = apex.parse(FORMAL_COMMENT_CONTENT + "\n" + classWithEscape);

    ASTFormalComment formalComment = root.descendants(ASTUserClass.class)
                                         .children(ASTFormalComment.class)
                                         .first();

    assertEquals(FORMAL_COMMENT_CONTENT, formalComment.getImage());
}
}

View File

@ -8,14 +8,18 @@ package net.sourceforge.pmd.lang.apex.ast;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertNotNull;
import org.antlr.v4.runtime.BaseErrorListener;
import org.antlr.v4.runtime.CharStream;
import org.antlr.v4.runtime.CharStreams;
import org.antlr.v4.runtime.CommonTokenStream;
import org.antlr.v4.runtime.RecognitionException;
import org.antlr.v4.runtime.Recognizer;
import org.antlr.v4.runtime.Token;
import org.junit.jupiter.api.Test;
import io.github.apexdevtools.apexparser.ApexLexer;
import io.github.apexdevtools.apexparser.ApexParser;
import io.github.apexdevtools.apexparser.CaseInsensitiveInputStream;
/**
* This is an exploration test for {@link ApexLexer}.
@ -49,4 +53,36 @@ class ApexLexerTest {
ApexParser.CompilationUnitContext compilationUnit = parser.compilationUnit();
assertNotNull(compilationUnit);
}
/**
 * Lexes a string literal containing a Unicode escape sequence, once from a plain
 * character stream and once from a stream wrapped in
 * {@link CaseInsensitiveInputStream}, and expects no lexing errors in either case.
 */
@Test
void testLexerUnicodeEscapes() {
    String s = "'Fran\\u00E7ois'";
    // note: with apex-parser 4.3.1, no errors are reported anymore,
    // so the plain (case-sensitive) stream is expected to lex cleanly as well
    assertEquals(0, getLexingErrors(CharStreams.fromString(s)));
    assertEquals(0, getLexingErrors(new CaseInsensitiveInputStream(CharStreams.fromString(s))));
}
/**
 * Runs the Apex lexer over the whole stream and reports how many errors it raised.
 *
 * @param stream the character stream to tokenize
 * @return the number of syntax errors reported to the counting listener
 */
private int getLexingErrors(CharStream stream) {
    ErrorListener counter = new ErrorListener();

    ApexLexer apexLexer = new ApexLexer(stream);
    // Drop the default listeners to avoid distracting "token recognition error" stderr output
    apexLexer.removeErrorListeners();
    apexLexer.addErrorListener(counter);

    // Filling the token stream forces the lexer to consume the entire input
    new CommonTokenStream(apexLexer).fill();

    return counter.getErrorCount();
}
/** Error listener that only tallies how many syntax errors were reported to it. */
private static class ErrorListener extends BaseErrorListener {
    private int errorCount;

    /** Counts the reported error; all details of the error are ignored. */
    @Override
    public void syntaxError(Recognizer<?, ?> recognizer, Object offendingSymbol, int line,
                            int charPositionInLine, String msg, RecognitionException e) {
        errorCount += 1;
    }

    /** Returns the number of errors reported so far. */
    public int getErrorCount() {
        return this.errorCount;
    }
}
}