From b8b4e513ac232711f00ed9e4b699b7a2459e1233 Mon Sep 17 00:00:00 2001 From: Matt Hargett <plaztiksyke@gmail.com> Date: Fri, 9 Sep 2022 16:42:18 -0700 Subject: [PATCH] Support multi-line 'normal' strings. Support skipping literal sequences. More minor adjustments to make the testing corpus tokenize without errors. --- .../sourceforge/pmd/lang/lua/antlr4/Lua.g4 | 88 +++++++--- .../net/sourceforge/pmd/cpd/LuaLanguage.java | 15 +- .../net/sourceforge/pmd/cpd/LuaTokenizer.java | 166 +++++++++++++++++- .../pmd/lang/lua/cpd/testdata/luauTypes.lua | 10 +- .../pmd/lang/lua/cpd/testdata/luauTypes.txt | 69 ++++---- 5 files changed, 283 insertions(+), 65 deletions(-) diff --git a/pmd-lua/src/main/antlr4/net/sourceforge/pmd/lang/lua/antlr4/Lua.g4 b/pmd-lua/src/main/antlr4/net/sourceforge/pmd/lang/lua/antlr4/Lua.g4 index 30e36071ff..4676055f48 100644 --- a/pmd-lua/src/main/antlr4/net/sourceforge/pmd/lang/lua/antlr4/Lua.g4 +++ b/pmd-lua/src/main/antlr4/net/sourceforge/pmd/lang/lua/antlr4/Lua.g4 @@ -77,7 +77,7 @@ block stat : ';' - | varlist '=' explist + | varlist ASSIGNMENT explist | var compoundop exp | functioncall | label @@ -87,11 +87,11 @@ stat | 'while' exp 'do' block 'end' | 'repeat' block 'until' exp | 'if' exp 'then' block ('elseif' exp 'then' block)* ('else' block)? 'end' - | 'for' binding '=' exp ',' exp (',' exp)? 'do' block 'end' + | 'for' binding ASSIGNMENT exp ',' exp (',' exp)? 'do' block 'end' | 'for' bindinglist 'in' explist 'do' block 'end' | 'function' funcname funcbody - | 'local' 'function' NAME funcbody - | 'local' bindinglist ('=' explist)? + | LOCAL 'function' NAME funcbody + | LOCAL bindinglist (ASSIGNMENT explist)? | ('export')? 'type' NAME ('<' genericTypeParameterList '>')? '=' type ; @@ -116,7 +116,7 @@ funcname ; funcbody - : ('<' genericTypeParameterList '>')? '(' parlist? ')' (':' '...'? returnType ) block 'end' + : ('<' genericTypeParameterList '>')? OPEN_PARENS parlist? CLOSE_PARENS (':' '...'? 
returnType ) block 'end' ; parlist @@ -138,7 +138,7 @@ binding bindinglist: binding (',' bindinglist)?; var - : (NAME | '(' exp ')' varSuffix) varSuffix* + : (NAME | OPEN_PARENS exp CLOSE_PARENS varSuffix) varSuffix* ; varlist @@ -172,11 +172,11 @@ simpleexp | tableconstructor; varOrExp - : var | '(' exp ')' + : var | OPEN_PARENS exp CLOSE_PARENS ; varSuffix - : nameAndArgs* ('[' exp ']' | '.' NAME) + : nameAndArgs* (OPEN_BRACKET exp CLOSE_BRACKET | '.' NAME) ; nameAndArgs @@ -184,7 +184,7 @@ nameAndArgs ; args - : '(' explist? ')' | tableconstructor | string + : OPEN_PARENS explist? CLOSE_PARENS | tableconstructor | string ; functiondef @@ -192,7 +192,7 @@ functiondef ; tableconstructor - : '{' fieldlist? '}' + : OPEN_BRACE fieldlist? CLOSE_BRACE ; fieldlist @@ -200,7 +200,7 @@ fieldlist ; field - : '[' exp ']' '=' exp | NAME '=' exp | exp + : OPEN_BRACKET exp CLOSE_BRACKET ASSIGNMENT exp | NAME ASSIGNMENT exp | exp ; fieldsep @@ -220,6 +220,8 @@ operatorAnd operatorComparison : '<' | '>' | '<=' | '>=' | '~=' | '=='; +ASSIGNMENT: '='; + operatorStrcat : '..'; @@ -243,20 +245,20 @@ number ; string - : NORMALSTRING | LONGSTRING + : NORMAL_STRING | LONG_STRING | INTERPOLATED_STRING ; simpleType : NIL | singletonType | NAME ('.' NAME)? ('<' typeParams '>')? - | 'typeof' '(' exp ')' + | 'typeof' OPEN_PARENS exp CLOSE_PARENS | tableType | functionType ; singletonType - : NORMALSTRING | BOOLEAN + : NORMAL_STRING | BOOLEAN ; type @@ -273,7 +275,7 @@ typeList: type (',' typeList)? | variadicTypePack; typeParams: (type | typePack | variadicTypePack | genericTypePack) (',' typeParams)?; -typePack: '(' (typeList)? ')'; +typePack: OPEN_PARENS (typeList)? CLOSE_PARENS; genericTypePack: NAME '...'; @@ -281,7 +283,7 @@ variadicTypePack: '...' 
type; returnType: type | typePack; -tableIndexer: '[' type ']' ':' type; +tableIndexer: OPEN_BRACKET type CLOSE_BRACKET ':' type; tableProp: NAME ':' type; @@ -292,17 +294,25 @@ propList : tablePropOrIndexer (fieldsep tablePropOrIndexer)* fieldsep?; tableType - : '{' propList '}'; + : OPEN_BRACE propList CLOSE_BRACE; -functionType: ('<' genericTypeParameterList '>')? '(' (typeList)? ')' '->' returnType; +functionType: ('<' genericTypeParameterList '>')? OPEN_PARENS (typeList)? CLOSE_PARENS '->' returnType; require - : 'local'? bindinglist '=' 'require' '(' exp ')' ('.' NAME)* ('::' type)? ';'? + : 'local'? bindinglist '=' REQUIRE OPEN_PARENS exp CLOSE_PARENS ('.' NAME)* ('::' type)? ';'? ; // LEXER +LOCAL + : 'local' + ; + +REQUIRE + : 'require' + ; + NIL : 'nil' ; @@ -315,19 +325,23 @@ NAME : [a-zA-Z_][a-zA-Z_0-9]* ; -NORMALSTRING - : '"' ( EscapeSequence | ~('\\'|'"') )* '"' - | '\'' ( EscapeSequence | ~('\\'|'\'') )* '\'' +NORMAL_STRING + : '"' (~["\\\r\n\u0085\u2028\u2029] | EscapeSequence | '\\\n')* '"' + | '\'' (~['\\\r\n\u0085\u2028\u2029] | EscapeSequence | '\\\n')* '\'' ; -LONGSTRING - : '[' NESTED_STR ']' +INTERPOLATED_STRING + : '`' (~[`\\\r\n\u0085\u2028\u2029] | EscapeSequence | '\\\n')* '`' + ; + +LONG_STRING + : OPEN_BRACKET NESTED_STR CLOSE_BRACKET ; fragment NESTED_STR : '=' NESTED_STR '=' - | '[' .*? ']' + | OPEN_BRACKET .*? CLOSE_BRACKET ; INT @@ -350,6 +364,26 @@ HEX_FLOAT | '0' [xX] HexDigit+ HexExponentPart ; +OPEN_BRACE: '{'; +CLOSE_BRACE: '}'; + +OPEN_BRACKET: '['; +CLOSE_BRACKET: ']'; + +OPEN_PARENS: '('; +CLOSE_PARENS: ')'; + +NL + : '\r\n' | '\r' | '\n' + | '\u0085' // <Next Line CHARACTER (U+0085)>' + | '\u2028' //'<Line Separator CHARACTER (U+2028)>' + | '\u2029' //'<Paragraph Separator CHARACTER (U+2029)>' + ; + +COMMA + : ',' + ; + fragment ExponentPart : [eE] [+-]? Digit+ @@ -362,8 +396,8 @@ HexExponentPart fragment EscapeSequence - : '\\' [abfnrtvz"'|$#\\] // World of Warcraft Lua additionally escapes |$# - | '\\' '\r'? 
'\n' + : '\\' [abfnrtvz"'`|$#\\] // World of Warcraft Lua additionally escapes |$# + | NL | DecimalEscape | HexEscape | UtfEscape diff --git a/pmd-lua/src/main/java/net/sourceforge/pmd/cpd/LuaLanguage.java b/pmd-lua/src/main/java/net/sourceforge/pmd/cpd/LuaLanguage.java index e2a87ec878..2e485e13b8 100644 --- a/pmd-lua/src/main/java/net/sourceforge/pmd/cpd/LuaLanguage.java +++ b/pmd-lua/src/main/java/net/sourceforge/pmd/cpd/LuaLanguage.java @@ -4,15 +4,28 @@ package net.sourceforge.pmd.cpd; +import java.util.Properties; + /** * Language implementation for Lua */ public class LuaLanguage extends AbstractLanguage { + public LuaLanguage() { + this(System.getProperties()); + } + /** * Creates a new Lua Language instance. */ - public LuaLanguage() { + public LuaLanguage(Properties properties) { super("Lua", "lua", new LuaTokenizer(), ".lua"); + setProperties(properties); + } + + @Override + public final void setProperties(Properties properties) { + LuaTokenizer tokenizer = (LuaTokenizer) getTokenizer(); + tokenizer.setProperties(properties); } } diff --git a/pmd-lua/src/main/java/net/sourceforge/pmd/cpd/LuaTokenizer.java b/pmd-lua/src/main/java/net/sourceforge/pmd/cpd/LuaTokenizer.java index 23c292dbe7..8e4f354b17 100644 --- a/pmd-lua/src/main/java/net/sourceforge/pmd/cpd/LuaTokenizer.java +++ b/pmd-lua/src/main/java/net/sourceforge/pmd/cpd/LuaTokenizer.java @@ -4,8 +4,11 @@ package net.sourceforge.pmd.cpd; +import java.util.Properties; + import org.antlr.v4.runtime.CharStream; +import net.sourceforge.pmd.cpd.token.AntlrToken; import net.sourceforge.pmd.cpd.token.AntlrTokenFilter; import net.sourceforge.pmd.lang.antlr.AntlrTokenManager; import net.sourceforge.pmd.lang.lua.antlr4.LuaLexer; @@ -15,6 +18,22 @@ import net.sourceforge.pmd.lang.lua.antlr4.LuaLexer; */ public class LuaTokenizer extends AntlrTokenizer { + private boolean ignoreLiteralSequences = false; + + /** + * Sets the possible options for the Lua tokenizer. 
+ * + * @param properties the properties + * @see #OPTION_IGNORE_LITERAL_SEQUENCES + */ + public void setProperties(Properties properties) { + ignoreLiteralSequences = getBooleanProperty(properties, OPTION_IGNORE_LITERAL_SEQUENCES); + } + + private boolean getBooleanProperty(final Properties properties, final String property) { + return Boolean.parseBoolean(properties.getProperty(property, Boolean.FALSE.toString())); + } + @Override protected AntlrTokenManager getLexerForSource(SourceCode sourceCode) { CharStream charStream = AntlrTokenizer.getCharStreamFromSourceCode(sourceCode); @@ -23,6 +42,151 @@ public class LuaTokenizer extends AntlrTokenizer { @Override protected AntlrTokenFilter getTokenFilter(final AntlrTokenManager tokenManager) { - return new AntlrTokenFilter(tokenManager); + return new LuaTokenFilter(tokenManager, ignoreLiteralSequences); + } + + /** + * The {@link LuaTokenFilter} extends the {@link AntlrTokenFilter} to discard + * Lua-specific tokens. + * <p> + * By default, it enables annotation-based CPD suppression. + * Calls to require() are always filtered out; if the {@code OPTION_IGNORE_LITERAL_SEQUENCES} property is set, sequences of literals are filtered out as well. 
+ * </p> + */ + private static class LuaTokenFilter extends AntlrTokenFilter { + + private final boolean ignoreLiteralSequences; + private boolean discardingRequires = false; + private boolean discardingNL = false; + private AntlrToken discardingLiteralsUntil = null; + private boolean discardCurrent = false; + + + LuaTokenFilter(final AntlrTokenManager tokenManager, boolean ignoreLiteralSequences) { + super(tokenManager); + this.ignoreLiteralSequences = ignoreLiteralSequences; + } + + @Override + protected void analyzeToken(final AntlrToken currentToken) { + skipNewLines(currentToken); + } + + @Override + protected void analyzeTokens(final AntlrToken currentToken, final Iterable<AntlrToken> remainingTokens) { + discardCurrent = false; + skipRequires(currentToken, remainingTokens); + skipLiteralSequences(currentToken, remainingTokens); + } + + private void skipRequires(final AntlrToken currentToken, final Iterable<AntlrToken> remainingTokens) { + final int type = currentToken.getKind(); + if (type == LuaLexer.REQUIRE) { + discardingRequires = true; + } else if (type == LuaLexer.CLOSE_PARENS && discardingRequires) { + discardingRequires = false; + discardCurrent = true; + } + } + + private void skipNewLines(final AntlrToken currentToken) { + discardingNL = currentToken.getKind() == LuaLexer.NL; + } + + private void skipLiteralSequences(final AntlrToken currentToken, final Iterable<AntlrToken> remainingTokens) { + if (ignoreLiteralSequences) { + final int type = currentToken.getKind(); + if (isDiscardingLiterals()) { + if (currentToken == discardingLiteralsUntil) { // NOPMD - intentional check for reference equality + discardingLiteralsUntil = null; + discardCurrent = true; + } + } else if (type == LuaLexer.OPEN_BRACE + || type == LuaLexer.OPEN_BRACKET + || type == LuaLexer.OPEN_PARENS) { + final AntlrToken finalToken = findEndOfSequenceOfLiterals(remainingTokens); + discardingLiteralsUntil = finalToken; + } + } + } + + private AntlrToken 
findEndOfSequenceOfLiterals(final Iterable<AntlrToken> remainingTokens) { + boolean seenLiteral = false; + int braceCount = 0; + int bracketCount = 0; + int parenCount = 0; + for (final AntlrToken token : remainingTokens) { + switch (token.getKind()) { + case LuaLexer.INT: + case LuaLexer.NORMAL_STRING: + case LuaLexer.INTERPOLATED_STRING: + case LuaLexer.LONG_STRING: + case LuaLexer.HEX_FLOAT: + case LuaLexer.HEX: + case LuaLexer.FLOAT: + case LuaLexer.NIL: + case LuaLexer.BOOLEAN: + seenLiteral = true; + break; // can be skipped; continue to the next token + case LuaLexer.COMMA: + break; // can be skipped; continue to the next token + case LuaLexer.NL: + // this helps skip large multi-line data table sequences in Lua + break; // can be skipped; continue to the next token + case LuaLexer.ASSIGNMENT: + // this helps skip large data table sequences in Lua: { ["bob"] = "uncle", ["alice"] = "enby" } + break; // can be skipped; continue to the next token + case LuaLexer.OPEN_BRACE: + braceCount++; + break; // curly braces are allowed, as long as they're balanced + case LuaLexer.CLOSE_BRACE: + braceCount--; + if (braceCount < 0) { + // end of the list in the braces; skip all contents + return seenLiteral ? token : null; + } else { + // curly braces are not yet balanced; continue to the next token + break; + } + case LuaLexer.OPEN_BRACKET: + bracketCount++; + break; // brackets are allowed, as long as they're balanced + case LuaLexer.CLOSE_BRACKET: + bracketCount--; + if (bracketCount < 0) { + // end of the list in the brackets; skip all contents + return seenLiteral ? token : null; + } else { + // brackets are not yet balanced; continue to the next token + break; + } + case LuaLexer.OPEN_PARENS: + parenCount++; + break; // parens are allowed, as long as they're balanced + case LuaLexer.CLOSE_PARENS: + parenCount--; + if (parenCount < 0) { + // end of the list in the parens; skip all contents + return seenLiteral ? 
token : null; + } else { + // parens are not yet balanced; continue to the next token + break; + } + default: + // some other token than the expected ones; this is not a sequence of literals + return null; + } + } + return null; + } + + public boolean isDiscardingLiterals() { + return discardingLiteralsUntil != null; + } + + @Override + protected boolean isLanguageSpecificDiscarding() { + return discardingRequires || discardingNL || isDiscardingLiterals() || discardCurrent; + } } } diff --git a/pmd-lua/src/test/resources/net/sourceforge/pmd/lang/lua/cpd/testdata/luauTypes.lua b/pmd-lua/src/test/resources/net/sourceforge/pmd/lang/lua/cpd/testdata/luauTypes.lua index 9e418a61de..da4e9ddf80 100644 --- a/pmd-lua/src/test/resources/net/sourceforge/pmd/lang/lua/cpd/testdata/luauTypes.lua +++ b/pmd-lua/src/test/resources/net/sourceforge/pmd/lang/lua/cpd/testdata/luauTypes.lua @@ -11,18 +11,22 @@ local _PlatformService = nil local game = require(script.Parent.game).default :: any pcall(function() _PlatformService = game:GetService('PlatformService') end) - return function <T>(req, ...: boolean): ({[string|number]: T}, string, Function<...any>) local body = string.format("%s %s\n", req.method, req.path) local res = { code = 200, { "Content-Type", "text/plain" }, - { "Content-Length", #body } :: Array<any>, + { + "Content-Length", + #body, + ["Auth.Confirm"] = [[至:%s。]], + + } :: Array<any>, } :: { [any]: number | Array<string | boolean> } if (req :: any).keepAlive then local socketType: "Connection" | "Pingback" | "" = "" :: "" socketType = "Connection" :: "Connection" - res[#res + 1] = { socketType :: string, "Keep-Alive" } + res[#res + 1] = { socketType :: string, `\`${req.keepAlive}\`` } res[#res - 2] = { ... 
} end diff --git a/pmd-lua/src/test/resources/net/sourceforge/pmd/lang/lua/cpd/testdata/luauTypes.txt b/pmd-lua/src/test/resources/net/sourceforge/pmd/lang/lua/cpd/testdata/luauTypes.txt index ac7939686c..f1a3934c53 100644 --- a/pmd-lua/src/test/resources/net/sourceforge/pmd/lang/lua/cpd/testdata/luauTypes.txt +++ b/pmd-lua/src/test/resources/net/sourceforge/pmd/lang/lua/cpd/testdata/luauTypes.txt @@ -76,14 +76,6 @@ L11 [local] 1 5 [game] 7 10 [=] 12 12 - [require] 14 20 - [(] 21 21 - [script] 22 27 - [.] 28 28 - [Parent] 29 34 - [.] 35 35 - [game] 36 39 - [)] 40 40 [.] 41 41 [default] 42 48 [::] 50 51 @@ -104,7 +96,7 @@ L12 [)] 70 70 [end] 72 74 [)] 75 75 -L15 +L14 [return] 1 6 [function] 8 15 [<] 17 17 @@ -137,7 +129,7 @@ L15 [any] 84 86 [>] 87 87 [)] 88 88 -L16 +L15 [local] 3 7 [body] 9 12 [=] 14 14 @@ -155,37 +147,48 @@ L16 [.] 56 56 [path] 57 60 [)] 61 61 -L17 +L16 [local] 3 7 [res] 9 11 [=] 13 13 [{] 15 15 -L18 +L17 [code] 5 8 [=] 10 10 [200] 12 14 [,] 15 15 -L19 +L18 [{] 5 5 ["Content-Type"] 7 20 [,] 21 21 ["text/plain"] 23 34 [}] 36 36 [,] 37 37 -L20 +L19 [{] 5 5 +L20 ["Content-Length"] 7 22 [,] 23 23 - [#] 25 25 - [body] 26 29 - [}] 31 31 - [::] 33 34 - [Array] 36 40 - [<] 41 41 - [any] 42 44 - [>] 45 45 - [,] 46 46 L21 + [#] 7 7 + [body] 8 11 + [,] 12 12 +L22 + [\[] 7 7 + ["Auth.Confirm"] 8 21 + [\]] 22 22 + [=] 24 24 + [\[\[至:%s。\]\]] 26 34 + [,] 35 35 +L24 + [}] 5 5 + [::] 7 8 + [Array] 10 14 + [<] 15 15 + [any] 16 18 + [>] 19 19 + [,] 20 20 +L25 [}] 3 3 [::] 5 6 [{] 8 8 @@ -202,7 +205,7 @@ L21 [boolean] 41 47 [>] 48 48 [}] 50 50 -L22 +L26 [if] 3 4 [(] 6 6 [req] 7 9 @@ -212,7 +215,7 @@ L22 [.] 
18 18 [keepAlive] 19 27 [then] 29 32 -L23 +L27 [local] 5 9 [socketType] 11 20 [:] 21 21 @@ -225,13 +228,13 @@ L23 [""] 56 57 [::] 59 60 [""] 62 63 -L24 +L28 [socketType] 5 14 [=] 16 16 ["Connection"] 18 29 [::] 31 32 ["Connection"] 34 45 -L25 +L29 [res] 5 7 [\[] 8 8 [#] 9 9 @@ -245,9 +248,9 @@ L25 [::] 34 35 [string] 37 42 [,] 43 43 - ["Keep-Alive"] 45 56 - [}] 58 58 -L26 + [`\\`${req.keepAlive}\\``] 45 66 + [}] 68 68 +L30 [res] 5 7 [\[] 8 8 [#] 9 9 @@ -259,9 +262,9 @@ L26 [{] 21 21 [...] 23 25 [}] 27 27 -L27 +L31 [end] 3 5 -L29 +L33 [return] 3 8 [(] 10 10 [res] 11 13 @@ -294,6 +297,6 @@ L29 [return] 93 98 [...] 100 102 [end] 104 106 -L30 +L34 [end] 1 3 EOF