Also add this ability for Antlr lexers, adapt TSQL

2024-04-08 21:31:22 +02:00
parent 0cb2e37ce9
commit ab80b2443d
7 changed files with 2132 additions and 2056 deletions
--- a/pmd-core/src/main/java/net/sourceforge/pmd/cpd/impl/AntlrCpdLexer.java
+++ b/pmd-core/src/main/java/net/sourceforge/pmd/cpd/impl/AntlrCpdLexer.java
@ -12,6 +12,7 @@ import org.antlr.v4.runtime.Lexer;

 import net.sourceforge.pmd.cpd.CpdLexer;
 import net.sourceforge.pmd.lang.TokenManager;
+import net.sourceforge.pmd.lang.ast.impl.antlr4.AntlrLexerBehavior;
 import net.sourceforge.pmd.lang.ast.impl.antlr4.AntlrToken;
 import net.sourceforge.pmd.lang.ast.impl.antlr4.AntlrTokenManager;
 import net.sourceforge.pmd.lang.document.TextDocument;
@ -23,7 +24,15 @@ public abstract class AntlrCpdLexer extends CpdLexerBase<AntlrToken> {
    @Override
    protected final TokenManager<AntlrToken> makeLexerImpl(TextDocument doc) throws IOException {
        CharStream charStream = CharStreams.fromReader(doc.newReader(), doc.getFileId().getAbsolutePath());
-        return new AntlrTokenManager(getLexerForSource(charStream), doc);
+        return new AntlrTokenManager(getLexerForSource(charStream), doc, getLexerBehavior());
+    }
+
+    /**
+     * Returns the behavior object handed to the {@link AntlrTokenManager}
+     * created for a document. Override this method to customize some aspects
+     * of the lexer; the default is a plain {@link AntlrLexerBehavior}.
+     */
+    protected AntlrLexerBehavior getLexerBehavior() {
+        return new AntlrLexerBehavior();
    }

    protected abstract Lexer getLexerForSource(CharStream charStream);
--- a/pmd-core/src/main/java/net/sourceforge/pmd/lang/ast/impl/antlr4/AntlrLexerBehavior.java
+++ b/pmd-core/src/main/java/net/sourceforge/pmd/lang/ast/impl/antlr4/AntlrLexerBehavior.java
@ -0,0 +1,32 @@
+/**
+ * BSD-style license; for more info see http://pmd.sourceforge.net/license.html
+ */
+
+package net.sourceforge.pmd.lang.ast.impl.antlr4;
+
+import org.antlr.v4.runtime.Token;
+
+import net.sourceforge.pmd.cpd.CpdLanguageProperties;
+
+/**
+ * Strategy to customize some aspects of the mapping
+ * from Antlr tokens to PMD/CPD tokens. Subclasses override
+ * the hooks they need; this base class applies no transformation.
+ */
+public class AntlrLexerBehavior {
+
+
+    /**
+     * Return the image that the token should have, possibly applying a transformation.
+     * The default just returns {@link Token#getText()}.
+     * Transformations here are usually normalizations, for instance, mapping
+     * the image of all keywords to uppercase/lowercase to implement case-insensitivity,
+     * or replacing the image of literals by a placeholder to implement
+     * {@link CpdLanguageProperties#CPD_ANONYMIZE_LITERALS}.
+     *
+     * @param token A token from the Antlr Lexer
+     *
+     * @return The image to use for the PMD/CPD token built from {@code token}
+     */
+    protected String getTokenImage(Token token) {
+        return token.getText();
+    }
+}
--- a/pmd-core/src/main/java/net/sourceforge/pmd/lang/ast/impl/antlr4/AntlrToken.java
+++ b/pmd-core/src/main/java/net/sourceforge/pmd/lang/ast/impl/antlr4/AntlrToken.java
@ -17,9 +17,13 @@ import net.sourceforge.pmd.lang.document.TextRegion;
 */
 public class AntlrToken implements GenericToken<AntlrToken> {

-    private final Token token;
    private final AntlrToken previousComment;
    private final TextDocument textDoc;
+    private final String image;
+    private final int endOffset;
+    private final int startOffset;
+    private final int channel;
+    private final int kind;
    AntlrToken next;


@ -30,10 +34,14 @@ public class AntlrToken implements GenericToken<AntlrToken> {
     * @param previousComment The previous comment
     * @param textDoc         The text document
     */
-    public AntlrToken(final Token token, final AntlrToken previousComment, TextDocument textDoc) {
-        this.token = token;
+    AntlrToken(final Token token, final AntlrToken previousComment, TextDocument textDoc, AntlrLexerBehavior behavior) {
        this.previousComment = previousComment;
        this.textDoc = textDoc;
+        // The image may be normalized by the behavior (e.g. case folding), so it is captured once here
+        this.image = behavior.getTokenImage(token);
+        this.startOffset = token.getStartIndex();
+        // Antlr's stop index is inclusive, whereas the region built from these offsets needs an exclusive end
+        this.endOffset = token.getStopIndex() + 1;
+        this.channel = token.getChannel();
+        this.kind = token.getType();
    }

    @Override
@ -48,13 +56,13 @@ public class AntlrToken implements GenericToken<AntlrToken> {

    @Override
    public CharSequence getImageCs() {
-        return token.getText();
+        return image;
    }

    /** Returns a text region with the coordinates of this token. */
    @Override
    public TextRegion getRegion() {
-        return TextRegion.fromBothOffsets(token.getStartIndex(), token.getStopIndex() + 1);
+        return TextRegion.fromBothOffsets(startOffset, endOffset);
    }

    @Override
@ -74,7 +82,7 @@ public class AntlrToken implements GenericToken<AntlrToken> {

    @Override
    public int getKind() {
-        return token.getType();
+        return kind;
    }

    public boolean isHidden() {
@ -82,6 +90,6 @@ public class AntlrToken implements GenericToken<AntlrToken> {
    }

    public boolean isDefault() {
-        return token.getChannel() == Lexer.DEFAULT_TOKEN_CHANNEL;
+        return channel == Lexer.DEFAULT_TOKEN_CHANNEL;
    }
 }
--- a/pmd-core/src/main/java/net/sourceforge/pmd/lang/ast/impl/antlr4/AntlrTokenManager.java
+++ b/pmd-core/src/main/java/net/sourceforge/pmd/lang/ast/impl/antlr4/AntlrTokenManager.java
@ -20,12 +20,20 @@ public class AntlrTokenManager implements TokenManager<AntlrToken> {

    private final Lexer lexer;
    private final TextDocument textDoc;
+    private final AntlrLexerBehavior behavior;
    private AntlrToken previousToken;


    public AntlrTokenManager(final Lexer lexer, final TextDocument textDocument) {
+        this(lexer, textDocument, new AntlrLexerBehavior());
+    }
+
+    public AntlrTokenManager(final Lexer lexer,
+                             final TextDocument textDocument,
+                             final AntlrLexerBehavior behavior) {
        this.lexer = lexer;
        this.textDoc = textDocument;
+        this.behavior = behavior;
        resetListeners();
    }

@ -40,7 +48,7 @@ public class AntlrTokenManager implements TokenManager<AntlrToken> {

    private AntlrToken getNextTokenFromAnyChannel() {
        final AntlrToken previousComment = previousToken != null && previousToken.isHidden() ? previousToken : null;
-        final AntlrToken currentToken = new AntlrToken(lexer.nextToken(), previousComment, textDoc);
+        final AntlrToken currentToken = new AntlrToken(lexer.nextToken(), previousComment, textDoc, this.behavior);
        if (previousToken != null) {
            previousToken.next = currentToken;
        }
--- a/pmd-tsql/src/main/java/net/sourceforge/pmd/lang/tsql/cpd/TSqlCpdLexer.java
+++ b/pmd-tsql/src/main/java/net/sourceforge/pmd/lang/tsql/cpd/TSqlCpdLexer.java
@ -4,10 +4,14 @@

 package net.sourceforge.pmd.lang.tsql.cpd;

+import java.util.Locale;
+
 import org.antlr.v4.runtime.CharStream;
 import org.antlr.v4.runtime.Lexer;
+import org.antlr.v4.runtime.Token;

 import net.sourceforge.pmd.cpd.impl.AntlrCpdLexer;
+import net.sourceforge.pmd.lang.ast.impl.antlr4.AntlrLexerBehavior;
 import net.sourceforge.pmd.lang.tsql.ast.TSqlLexer;

 /**
@ -19,4 +23,19 @@ public class TSqlCpdLexer extends AntlrCpdLexer {
    protected Lexer getLexerForSource(CharStream charStream) {
        return new TSqlLexer(new CaseChangingCharStream(charStream, true));
    }
+
+    @Override
+    protected AntlrLexerBehavior getLexerBehavior() {
+        return new AntlrLexerBehavior() {
+            @Override
+            protected String getTokenImage(Token token) {
+                if (token.getType() == TSqlLexer.STRING) {
+                    // STRING tokens are case-sensitive: keep their image untouched
+                    return super.getTokenImage(token);
+                }
+                // every other token is case-insensitive: normalize its image to upper case
+                return token.getText().toUpperCase(Locale.ROOT);
+            }
+        };
+    }
 }
--- a/pmd-tsql/src/test/resources/net/sourceforge/pmd/lang/tsql/cpd/testdata/MailJobTimeLine.txt
+++ b/pmd-tsql/src/test/resources/net/sourceforge/pmd/lang/tsql/cpd/testdata/MailJobTimeLine.txt
--- a/pmd-tsql/src/test/resources/net/sourceforge/pmd/lang/tsql/cpd/testdata/simple.txt
+++ b/pmd-tsql/src/test/resources/net/sourceforge/pmd/lang/tsql/cpd/testdata/simple.txt
@ -1,28 +1,28 @@
    [Image] or [Truncated image[            Bcol      Ecol
 L1
-    [create]                                1         7
-    [procedure]                             8         17
-    [p]                                     18        19
-    [(]                                     20        21
-    [@v]                                    21        23
-    [int]                                   24        27
-    [)]                                     27        28
-    [as]                                    29        31
-    [begin]                                 32        37
+    [CREATE]                                1         6
+    [PROCEDURE]                             8         16
+    [P]                                     18        18
+    [(]                                     20        20
+    [@V]                                    21        22
+    [INT]                                   24        26
+    [)]                                     27        27
+    [AS]                                    29        30
+    [BEGIN]                                 32        36
 L2
-    [declare]                               2         9
-    [@f]                                    10        12
-    [int]                                   13        16
+    [DECLARE]                               2         8
+    [@F]                                    10        11
+    [INT]                                   13        15
 L3
-    [set]                                   2         5
-    [@f]                                    6         8
-    [=]                                     9         10
-    [@v]                                    11        13
-    [+]                                     14        15
-    [2]                                     16        17
+    [SET]                                   2         4
+    [@F]                                    6         7
+    [=]                                     9         9
+    [@V]                                    11        12
+    [+]                                     14        14
+    [2]                                     16        16
 L4
-    [select]                                2         8
-    [@f]                                    9         11
+    [SELECT]                                2         7
+    [@F]                                    9         10
 L5
-    [end]                                   1         4
+    [END]                                   1         3
 EOF