From b8b4e513ac232711f00ed9e4b699b7a2459e1233 Mon Sep 17 00:00:00 2001
From: Matt Hargett <plaztiksyke@gmail.com>
Date: Fri, 9 Sep 2022 16:42:18 -0700
Subject: [PATCH] Support multi-line 'normal' strings. Support skipping literal
 sequences. More minor adjustments to make the testing corpus tokenize without
 errors.

---
 .../sourceforge/pmd/lang/lua/antlr4/Lua.g4    |  88 +++++++---
 .../net/sourceforge/pmd/cpd/LuaLanguage.java  |  15 +-
 .../net/sourceforge/pmd/cpd/LuaTokenizer.java | 166 +++++++++++++++++-
 .../pmd/lang/lua/cpd/testdata/luauTypes.lua   |  10 +-
 .../pmd/lang/lua/cpd/testdata/luauTypes.txt   |  69 ++++----
 5 files changed, 283 insertions(+), 65 deletions(-)

diff --git a/pmd-lua/src/main/antlr4/net/sourceforge/pmd/lang/lua/antlr4/Lua.g4 b/pmd-lua/src/main/antlr4/net/sourceforge/pmd/lang/lua/antlr4/Lua.g4
index 30e36071ff..4676055f48 100644
--- a/pmd-lua/src/main/antlr4/net/sourceforge/pmd/lang/lua/antlr4/Lua.g4
+++ b/pmd-lua/src/main/antlr4/net/sourceforge/pmd/lang/lua/antlr4/Lua.g4
@@ -77,7 +77,7 @@ block
 
 stat
     : ';'
-    | varlist '=' explist
+    | varlist ASSIGNMENT explist
     | var compoundop exp
     | functioncall
     | label
@@ -87,11 +87,11 @@ stat
     | 'while' exp 'do' block 'end'
     | 'repeat' block 'until' exp
     | 'if' exp 'then' block ('elseif' exp 'then' block)* ('else' block)? 'end'
-    | 'for' binding '=' exp ',' exp (',' exp)? 'do' block 'end'
+    | 'for' binding ASSIGNMENT exp ',' exp (',' exp)? 'do' block 'end'
     | 'for' bindinglist 'in' explist 'do' block 'end'
     | 'function' funcname funcbody
-    | 'local' 'function' NAME funcbody
-    | 'local' bindinglist ('=' explist)?
+    | LOCAL 'function' NAME funcbody
+    | LOCAL bindinglist (ASSIGNMENT explist)?
     | ('export')? 'type' NAME ('<' genericTypeParameterList '>')? '=' type
     ;
 
@@ -116,7 +116,7 @@ funcname
     ;
 
 funcbody
-    : ('<' genericTypeParameterList '>')? '(' parlist? ')' (':' '...'? returnType ) block 'end'
+    : ('<' genericTypeParameterList '>')? OPEN_PARENS parlist? CLOSE_PARENS (':' '...'? returnType ) block 'end'
     ;
 
 parlist
@@ -138,7 +138,7 @@ binding
 bindinglist: binding (',' bindinglist)?;
 
 var
-    : (NAME | '(' exp ')' varSuffix) varSuffix*
+    : (NAME | OPEN_PARENS exp CLOSE_PARENS varSuffix) varSuffix*
     ;
 
 varlist
@@ -172,11 +172,11 @@ simpleexp
     | tableconstructor;
 
 varOrExp
-    : var | '(' exp ')'
+    : var | OPEN_PARENS exp CLOSE_PARENS
     ;
 
 varSuffix
-    : nameAndArgs* ('[' exp ']' | '.' NAME)
+    : nameAndArgs* (OPEN_BRACKET exp CLOSE_BRACKET | '.' NAME)
     ;
 
 nameAndArgs
@@ -184,7 +184,7 @@ nameAndArgs
     ;
 
 args
-    : '(' explist? ')' | tableconstructor | string
+    : OPEN_PARENS explist? CLOSE_PARENS | tableconstructor | string
     ;
 
 functiondef
@@ -192,7 +192,7 @@ functiondef
     ;
 
 tableconstructor
-    : '{' fieldlist? '}'
+    : OPEN_BRACE fieldlist? CLOSE_BRACE
     ;
 
 fieldlist
@@ -200,7 +200,7 @@ fieldlist
     ;
 
 field
-    : '[' exp ']' '=' exp | NAME '=' exp | exp
+    : OPEN_BRACKET exp CLOSE_BRACKET ASSIGNMENT exp | NAME ASSIGNMENT exp | exp
     ;
 
 fieldsep
@@ -220,6 +220,8 @@ operatorAnd
 operatorComparison
 	: '<' | '>' | '<=' | '>=' | '~=' | '==';
 
+ASSIGNMENT:               '=';
+
 operatorStrcat
 	: '..';
 
@@ -243,20 +245,20 @@ number
     ;
 
 string
-    : NORMALSTRING | LONGSTRING
+    : NORMAL_STRING | LONG_STRING | INTERPOLATED_STRING
     ;
 
 simpleType
     : NIL
     | singletonType
     | NAME ('.' NAME)? ('<' typeParams '>')?
-    | 'typeof' '(' exp ')'
+    | 'typeof' OPEN_PARENS exp CLOSE_PARENS
     | tableType
     | functionType
     ;
 
 singletonType
-    : NORMALSTRING | BOOLEAN
+    : NORMAL_STRING | BOOLEAN
     ;
 
 type
@@ -273,7 +275,7 @@ typeList: type (',' typeList)? | variadicTypePack;
 
 typeParams: (type | typePack | variadicTypePack | genericTypePack) (',' typeParams)?;
 
-typePack: '(' (typeList)? ')';
+typePack: OPEN_PARENS (typeList)? CLOSE_PARENS;
 
 genericTypePack: NAME '...';
 
@@ -281,7 +283,7 @@ variadicTypePack: '...' type;
 
 returnType: type | typePack;
 
-tableIndexer: '[' type ']' ':' type;
+tableIndexer: OPEN_BRACKET type CLOSE_BRACKET ':' type;
 
 tableProp: NAME ':' type;
 
@@ -292,17 +294,25 @@ propList
     : tablePropOrIndexer (fieldsep tablePropOrIndexer)* fieldsep?;
 
 tableType
-    : '{' propList '}';
+    : OPEN_BRACE propList CLOSE_BRACE;
 
-functionType: ('<' genericTypeParameterList '>')? '(' (typeList)? ')' '->' returnType;
+functionType: ('<' genericTypeParameterList '>')? OPEN_PARENS (typeList)? CLOSE_PARENS '->' returnType;
 
 require
-    : 'local'? bindinglist '=' 'require' '(' exp ')' ('.' NAME)* ('::' type)? ';'?
+    : 'local'? bindinglist '=' REQUIRE OPEN_PARENS exp CLOSE_PARENS ('.' NAME)* ('::' type)? ';'?
     ;
 
 
 // LEXER
 
+LOCAL
+    : 'local'
+    ;
+
+REQUIRE
+    : 'require'
+    ;
+
 NIL 
     : 'nil' 
     ;
@@ -315,19 +325,23 @@ NAME
     : [a-zA-Z_][a-zA-Z_0-9]*
     ;
 
-NORMALSTRING
-    : '"' ( EscapeSequence | ~('\\'|'"') )* '"'
-    | '\'' ( EscapeSequence | ~('\\'|'\'') )* '\''
+NORMAL_STRING
+    : '"'  (~["\\\r\n\u0085\u2028\u2029] | EscapeSequence | '\\\n')* '"'
+    | '\'' (~['\\\r\n\u0085\u2028\u2029] | EscapeSequence | '\\\n')* '\''
     ;
 
-LONGSTRING
-    : '[' NESTED_STR ']'
+INTERPOLATED_STRING
+    : '`' (~[`\\\r\n\u0085\u2028\u2029] | EscapeSequence | '\\\n')* '`'
+    ;
+
+LONG_STRING
+    : OPEN_BRACKET NESTED_STR CLOSE_BRACKET
     ;
 
 fragment
 NESTED_STR
     : '=' NESTED_STR '='
-    | '[' .*? ']'
+    | OPEN_BRACKET .*? CLOSE_BRACKET
     ;
 
 INT
@@ -350,6 +364,26 @@ HEX_FLOAT
     | '0' [xX] HexDigit+ HexExponentPart
     ;
 
+OPEN_BRACE:               '{';
+CLOSE_BRACE:              '}';
+
+OPEN_BRACKET:             '[';
+CLOSE_BRACKET:            ']';
+
+OPEN_PARENS:              '(';
+CLOSE_PARENS:             ')';
+
+NL
+	: '\r\n' | '\r' | '\n'
+	| '\u0085' // Next Line character (U+0085)
+	| '\u2028' // Line Separator character (U+2028)
+	| '\u2029' // Paragraph Separator character (U+2029)
+	;
+
+COMMA
+    : ','
+    ;
+
 fragment
 ExponentPart
     : [eE] [+-]? Digit+
@@ -362,8 +396,8 @@ HexExponentPart
 
 fragment
 EscapeSequence
-    : '\\' [abfnrtvz"'|$#\\]   // World of Warcraft Lua additionally escapes |$# 
-    | '\\' '\r'? '\n'
+    : '\\' [abfnrtvz"'`|$#\\]   // World of Warcraft Lua additionally escapes |$# 
+    | NL
     | DecimalEscape
     | HexEscape
     | UtfEscape
diff --git a/pmd-lua/src/main/java/net/sourceforge/pmd/cpd/LuaLanguage.java b/pmd-lua/src/main/java/net/sourceforge/pmd/cpd/LuaLanguage.java
index e2a87ec878..2e485e13b8 100644
--- a/pmd-lua/src/main/java/net/sourceforge/pmd/cpd/LuaLanguage.java
+++ b/pmd-lua/src/main/java/net/sourceforge/pmd/cpd/LuaLanguage.java
@@ -4,15 +4,28 @@
 
 package net.sourceforge.pmd.cpd;
 
+import java.util.Properties;
+
 /**
  * Language implementation for Lua
  */
 public class LuaLanguage extends AbstractLanguage {
 
+    public LuaLanguage() {
+        this(System.getProperties());
+    }
+
     /**
      * Creates a new Lua Language instance.
      */
-    public LuaLanguage() {
+    public LuaLanguage(Properties properties) {
         super("Lua", "lua", new LuaTokenizer(), ".lua");
+        setProperties(properties);
+    }
+
+    @Override
+    public final void setProperties(Properties properties) {
+        LuaTokenizer tokenizer = (LuaTokenizer) getTokenizer();
+        tokenizer.setProperties(properties);
     }
 }
diff --git a/pmd-lua/src/main/java/net/sourceforge/pmd/cpd/LuaTokenizer.java b/pmd-lua/src/main/java/net/sourceforge/pmd/cpd/LuaTokenizer.java
index 23c292dbe7..8e4f354b17 100644
--- a/pmd-lua/src/main/java/net/sourceforge/pmd/cpd/LuaTokenizer.java
+++ b/pmd-lua/src/main/java/net/sourceforge/pmd/cpd/LuaTokenizer.java
@@ -4,8 +4,11 @@
 
 package net.sourceforge.pmd.cpd;
 
+import java.util.Properties;
+
 import org.antlr.v4.runtime.CharStream;
 
+import net.sourceforge.pmd.cpd.token.AntlrToken;
 import net.sourceforge.pmd.cpd.token.AntlrTokenFilter;
 import net.sourceforge.pmd.lang.antlr.AntlrTokenManager;
 import net.sourceforge.pmd.lang.lua.antlr4.LuaLexer;
@@ -15,6 +18,22 @@ import net.sourceforge.pmd.lang.lua.antlr4.LuaLexer;
  */
 public class LuaTokenizer extends AntlrTokenizer {
 
+    private boolean ignoreLiteralSequences = false;
+
+    /**
+     * Sets the possible options for the Lua tokenizer.
+     *
+     * @param properties the properties
+     * @see #OPTION_IGNORE_LITERAL_SEQUENCES
+     */
+    public void setProperties(Properties properties) {
+        ignoreLiteralSequences = getBooleanProperty(properties, OPTION_IGNORE_LITERAL_SEQUENCES);
+    }
+
+    private boolean getBooleanProperty(final Properties properties, final String property) {
+        return Boolean.parseBoolean(properties.getProperty(property, Boolean.FALSE.toString()));
+    }
+
     @Override
     protected AntlrTokenManager getLexerForSource(SourceCode sourceCode) {
         CharStream charStream = AntlrTokenizer.getCharStreamFromSourceCode(sourceCode);
@@ -23,6 +42,151 @@ public class LuaTokenizer extends AntlrTokenizer {
 
     @Override
     protected AntlrTokenFilter getTokenFilter(final AntlrTokenManager tokenManager) {
-        return new AntlrTokenFilter(tokenManager);
+        return new LuaTokenFilter(tokenManager, ignoreLiteralSequences);
+    }
+
+    /**
+     * The {@link LuaTokenFilter} extends the {@link AntlrTokenFilter} to discard
+     * Lua-specific tokens.
+     * <p>
+     * Newline tokens and {@code require(...)} calls are always filtered out.
+     * If ignoreLiteralSequences is set, sequences of literals are filtered out too.
+     * </p>
+     */
+    private static class LuaTokenFilter extends AntlrTokenFilter {
+
+        private final boolean ignoreLiteralSequences;
+        private boolean discardingRequires = false;
+        private boolean discardingNL = false;
+        private AntlrToken discardingLiteralsUntil = null;
+        private boolean discardCurrent = false;
+
+
+        LuaTokenFilter(final AntlrTokenManager tokenManager, boolean ignoreLiteralSequences) {
+            super(tokenManager);
+            this.ignoreLiteralSequences = ignoreLiteralSequences;
+        }
+
+        @Override
+        protected void analyzeToken(final AntlrToken currentToken) {
+            skipNewLines(currentToken);
+        }
+
+        @Override
+        protected void analyzeTokens(final AntlrToken currentToken, final Iterable<AntlrToken> remainingTokens) {
+            discardCurrent = false;
+            skipRequires(currentToken, remainingTokens);
+            skipLiteralSequences(currentToken, remainingTokens);
+        }
+
+        private void skipRequires(final AntlrToken currentToken, final Iterable<AntlrToken> remainingTokens) {
+            final int type = currentToken.getKind();
+            if (type == LuaLexer.REQUIRE) {
+                discardingRequires = true;
+            } else if (type == LuaLexer.CLOSE_PARENS && discardingRequires) {
+                discardingRequires = false;
+                discardCurrent = true;
+            }
+        }
+
+        private void skipNewLines(final AntlrToken currentToken) {
+            discardingNL = currentToken.getKind() == LuaLexer.NL;
+        }
+
+        private void skipLiteralSequences(final AntlrToken currentToken, final Iterable<AntlrToken> remainingTokens) {
+            if (ignoreLiteralSequences) {
+                final int type = currentToken.getKind();
+                if (isDiscardingLiterals()) {
+                    if (currentToken == discardingLiteralsUntil) { // NOPMD - intentional check for reference equality
+                        discardingLiteralsUntil = null;
+                        discardCurrent = true;
+                    }
+                } else if (type == LuaLexer.OPEN_BRACE
+                    || type == LuaLexer.OPEN_BRACKET
+                    || type == LuaLexer.OPEN_PARENS) {
+                    final AntlrToken finalToken = findEndOfSequenceOfLiterals(remainingTokens);
+                    discardingLiteralsUntil = finalToken;
+                }
+            }
+        }
+
+        private AntlrToken findEndOfSequenceOfLiterals(final Iterable<AntlrToken> remainingTokens) {
+            boolean seenLiteral = false;
+            int braceCount = 0;
+            int bracketCount = 0;
+            int parenCount = 0;
+            for (final AntlrToken token : remainingTokens) {
+                switch (token.getKind()) {
+                case LuaLexer.INT:
+                case LuaLexer.NORMAL_STRING:
+                case LuaLexer.INTERPOLATED_STRING:
+                case LuaLexer.LONG_STRING:
+                case LuaLexer.HEX_FLOAT:
+                case LuaLexer.HEX:
+                case LuaLexer.FLOAT:
+                case LuaLexer.NIL:
+                case LuaLexer.BOOLEAN:
+                    seenLiteral = true;
+                    break; // can be skipped; continue to the next token
+                case LuaLexer.COMMA:
+                    break; // can be skipped; continue to the next token
+                case LuaLexer.NL:
+                    // this helps skip large multi-line data table sequences in Lua
+                    break; // can be skipped; continue to the next token
+                case LuaLexer.ASSIGNMENT:
+                    // this helps skip large data table sequences in Lua: { ["bob"] = "uncle", ["alice"] = "enby" }
+                    break; // can be skipped; continue to the next token
+                case LuaLexer.OPEN_BRACE:
+                    braceCount++;
+                    break; // curly braces are allowed, as long as they're balanced
+                case LuaLexer.CLOSE_BRACE:
+                    braceCount--;
+                    if (braceCount < 0) {
+                        // end of the list in the braces; skip all contents
+                        return seenLiteral ? token : null;
+                    } else {
+                        // curly braces are not yet balanced; continue to the next token
+                        break;
+                    }
+                case LuaLexer.OPEN_BRACKET:
+                    bracketCount++;
+                    break; // brackets are allowed, as long as they're balanced
+                case LuaLexer.CLOSE_BRACKET:
+                    bracketCount--;
+                    if (bracketCount < 0) {
+                        // end of the list in the brackets; skip all contents
+                        return seenLiteral ? token : null;
+                    } else {
+                        // brackets are not yet balanced; continue to the next token
+                        break;
+                    }
+                case LuaLexer.OPEN_PARENS:
+                    parenCount++;
+                    break; // parens are allowed, as long as they're balanced
+                case LuaLexer.CLOSE_PARENS:
+                    parenCount--;
+                    if (parenCount < 0) {
+                        // end of the list in the parens; skip all contents
+                        return seenLiteral ? token : null;
+                    } else {
+                        // parens are not yet balanced; continue to the next token
+                        break;
+                    }
+                default:
+                    // some other token than the expected ones; this is not a sequence of literals
+                    return null;
+                }
+            }
+            return null;
+        }
+
+        public boolean isDiscardingLiterals() {
+            return discardingLiteralsUntil != null;
+        }
+
+        @Override
+        protected boolean isLanguageSpecificDiscarding() {
+            return discardingRequires || discardingNL || isDiscardingLiterals() || discardCurrent;
+        }
     }
 }
diff --git a/pmd-lua/src/test/resources/net/sourceforge/pmd/lang/lua/cpd/testdata/luauTypes.lua b/pmd-lua/src/test/resources/net/sourceforge/pmd/lang/lua/cpd/testdata/luauTypes.lua
index 9e418a61de..da4e9ddf80 100644
--- a/pmd-lua/src/test/resources/net/sourceforge/pmd/lang/lua/cpd/testdata/luauTypes.lua
+++ b/pmd-lua/src/test/resources/net/sourceforge/pmd/lang/lua/cpd/testdata/luauTypes.lua
@@ -11,18 +11,22 @@ local _PlatformService = nil
 local game = require(script.Parent.game).default :: any
 pcall(function() _PlatformService = game:GetService('PlatformService') end)
 
-
 return function <T>(req, ...: boolean): ({[string|number]: T}, string, Function<...any>)
   local body = string.format("%s %s\n", req.method, req.path)
   local res = {
     code = 200,
     { "Content-Type", "text/plain" },
-    { "Content-Length", #body } :: Array<any>,
+    { 
+      "Content-Length", 
+      #body,
+      ["Auth.Confirm"] = [[至:%s。]],
+
+    } :: Array<any>,
   } :: { [any]: number | Array<string | boolean> }
   if (req :: any).keepAlive then
     local socketType: "Connection" | "Pingback" | "" = "" :: ""
     socketType = "Connection" :: "Connection"
-    res[#res + 1] = { socketType :: string, "Keep-Alive" }
+    res[#res + 1] = { socketType :: string, `\`${req.keepAlive}\`` }
     res[#res - 2] = { ... }
   end
 
diff --git a/pmd-lua/src/test/resources/net/sourceforge/pmd/lang/lua/cpd/testdata/luauTypes.txt b/pmd-lua/src/test/resources/net/sourceforge/pmd/lang/lua/cpd/testdata/luauTypes.txt
index ac7939686c..f1a3934c53 100644
--- a/pmd-lua/src/test/resources/net/sourceforge/pmd/lang/lua/cpd/testdata/luauTypes.txt
+++ b/pmd-lua/src/test/resources/net/sourceforge/pmd/lang/lua/cpd/testdata/luauTypes.txt
@@ -76,14 +76,6 @@ L11
     [local]                                 1         5
     [game]                                  7         10
     [=]                                     12        12
-    [require]                               14        20
-    [(]                                     21        21
-    [script]                                22        27
-    [.]                                     28        28
-    [Parent]                                29        34
-    [.]                                     35        35
-    [game]                                  36        39
-    [)]                                     40        40
     [.]                                     41        41
     [default]                               42        48
     [::]                                    50        51
@@ -104,7 +96,7 @@ L12
     [)]                                     70        70
     [end]                                   72        74
     [)]                                     75        75
-L15
+L14
     [return]                                1         6
     [function]                              8         15
     [<]                                     17        17
@@ -137,7 +129,7 @@ L15
     [any]                                   84        86
     [>]                                     87        87
     [)]                                     88        88
-L16
+L15
     [local]                                 3         7
     [body]                                  9         12
     [=]                                     14        14
@@ -155,37 +147,48 @@ L16
     [.]                                     56        56
     [path]                                  57        60
     [)]                                     61        61
-L17
+L16
     [local]                                 3         7
     [res]                                   9         11
     [=]                                     13        13
     [{]                                     15        15
-L18
+L17
     [code]                                  5         8
     [=]                                     10        10
     [200]                                   12        14
     [,]                                     15        15
-L19
+L18
     [{]                                     5         5
     ["Content-Type"]                        7         20
     [,]                                     21        21
     ["text/plain"]                          23        34
     [}]                                     36        36
     [,]                                     37        37
-L20
+L19
     [{]                                     5         5
+L20
     ["Content-Length"]                      7         22
     [,]                                     23        23
-    [#]                                     25        25
-    [body]                                  26        29
-    [}]                                     31        31
-    [::]                                    33        34
-    [Array]                                 36        40
-    [<]                                     41        41
-    [any]                                   42        44
-    [>]                                     45        45
-    [,]                                     46        46
 L21
+    [#]                                     7         7
+    [body]                                  8         11
+    [,]                                     12        12
+L22
+    [\[]                                    7         7
+    ["Auth.Confirm"]                        8         21
+    [\]]                                    22        22
+    [=]                                     24        24
+    [\[\[至:%s。\]\]]                         26        34
+    [,]                                     35        35
+L24
+    [}]                                     5         5
+    [::]                                    7         8
+    [Array]                                 10        14
+    [<]                                     15        15
+    [any]                                   16        18
+    [>]                                     19        19
+    [,]                                     20        20
+L25
     [}]                                     3         3
     [::]                                    5         6
     [{]                                     8         8
@@ -202,7 +205,7 @@ L21
     [boolean]                               41        47
     [>]                                     48        48
     [}]                                     50        50
-L22
+L26
     [if]                                    3         4
     [(]                                     6         6
     [req]                                   7         9
@@ -212,7 +215,7 @@ L22
     [.]                                     18        18
     [keepAlive]                             19        27
     [then]                                  29        32
-L23
+L27
     [local]                                 5         9
     [socketType]                            11        20
     [:]                                     21        21
@@ -225,13 +228,13 @@ L23
     [""]                                    56        57
     [::]                                    59        60
     [""]                                    62        63
-L24
+L28
     [socketType]                            5         14
     [=]                                     16        16
     ["Connection"]                          18        29
     [::]                                    31        32
     ["Connection"]                          34        45
-L25
+L29
     [res]                                   5         7
     [\[]                                    8         8
     [#]                                     9         9
@@ -245,9 +248,9 @@ L25
     [::]                                    34        35
     [string]                                37        42
     [,]                                     43        43
-    ["Keep-Alive"]                          45        56
-    [}]                                     58        58
-L26
+    [`\\`${req.keepAlive}\\``]              45        66
+    [}]                                     68        68
+L30
     [res]                                   5         7
     [\[]                                    8         8
     [#]                                     9         9
@@ -259,9 +262,9 @@ L26
     [{]                                     21        21
     [...]                                   23        25
     [}]                                     27        27
-L27
+L31
     [end]                                   3         5
-L29
+L33
     [return]                                3         8
     [(]                                     10        10
     [res]                                   11        13
@@ -294,6 +297,6 @@ L29
     [return]                                93        98
     [...]                                   100       102
     [end]                                   104       106
-L30
+L34
     [end]                                   1         3
 EOF