Support multi-line 'normal' strings. Support skipping literal sequences. More minor adjustments to make the testing corpus tokenize without errors.

This commit is contained in:
Matt Hargett 2022-09-09 16:42:18 -07:00
parent 7747b75091
commit b8b4e513ac
5 changed files with 283 additions and 65 deletions

View File

@ -77,7 +77,7 @@ block
stat
: ';'
| varlist '=' explist
| varlist ASSIGNMENT explist
| var compoundop exp
| functioncall
| label
@ -87,11 +87,11 @@ stat
| 'while' exp 'do' block 'end'
| 'repeat' block 'until' exp
| 'if' exp 'then' block ('elseif' exp 'then' block)* ('else' block)? 'end'
| 'for' binding '=' exp ',' exp (',' exp)? 'do' block 'end'
| 'for' binding ASSIGNMENT exp ',' exp (',' exp)? 'do' block 'end'
| 'for' bindinglist 'in' explist 'do' block 'end'
| 'function' funcname funcbody
| 'local' 'function' NAME funcbody
| 'local' bindinglist ('=' explist)?
| LOCAL 'function' NAME funcbody
| LOCAL bindinglist (ASSIGNMENT explist)?
| ('export')? 'type' NAME ('<' genericTypeParameterList '>')? '=' type
;
@ -116,7 +116,7 @@ funcname
;
funcbody
: ('<' genericTypeParameterList '>')? '(' parlist? ')' (':' '...'? returnType ) block 'end'
: ('<' genericTypeParameterList '>')? OPEN_PARENS parlist? CLOSE_PARENS (':' '...'? returnType ) block 'end'
;
parlist
@ -138,7 +138,7 @@ binding
bindinglist: binding (',' bindinglist)?;
var
: (NAME | '(' exp ')' varSuffix) varSuffix*
: (NAME | OPEN_PARENS exp CLOSE_PARENS varSuffix) varSuffix*
;
varlist
@ -172,11 +172,11 @@ simpleexp
| tableconstructor;
varOrExp
: var | '(' exp ')'
: var | OPEN_PARENS exp CLOSE_PARENS
;
varSuffix
: nameAndArgs* ('[' exp ']' | '.' NAME)
: nameAndArgs* (OPEN_BRACKET exp CLOSE_BRACKET | '.' NAME)
;
nameAndArgs
@ -184,7 +184,7 @@ nameAndArgs
;
args
: '(' explist? ')' | tableconstructor | string
: OPEN_PARENS explist? CLOSE_PARENS | tableconstructor | string
;
functiondef
@ -192,7 +192,7 @@ functiondef
;
tableconstructor
: '{' fieldlist? '}'
: OPEN_BRACE fieldlist? CLOSE_BRACE
;
fieldlist
@ -200,7 +200,7 @@ fieldlist
;
field
: '[' exp ']' '=' exp | NAME '=' exp | exp
: OPEN_BRACKET exp CLOSE_BRACKET ASSIGNMENT exp | NAME ASSIGNMENT exp | exp
;
fieldsep
@ -220,6 +220,8 @@ operatorAnd
operatorComparison
: '<' | '>' | '<=' | '>=' | '~=' | '==';
ASSIGNMENT: '=';
operatorStrcat
: '..';
@ -243,20 +245,20 @@ number
;
string
: NORMALSTRING | LONGSTRING
: NORMAL_STRING | LONG_STRING | INTERPOLATED_STRING
;
simpleType
: NIL
| singletonType
| NAME ('.' NAME)? ('<' typeParams '>')?
| 'typeof' '(' exp ')'
| 'typeof' OPEN_PARENS exp CLOSE_PARENS
| tableType
| functionType
;
singletonType
: NORMALSTRING | BOOLEAN
: NORMAL_STRING | BOOLEAN
;
type
@ -273,7 +275,7 @@ typeList: type (',' typeList)? | variadicTypePack;
typeParams: (type | typePack | variadicTypePack | genericTypePack) (',' typeParams)?;
typePack: '(' (typeList)? ')';
typePack: OPEN_PARENS (typeList)? CLOSE_PARENS;
genericTypePack: NAME '...';
@ -281,7 +283,7 @@ variadicTypePack: '...' type;
returnType: type | typePack;
tableIndexer: '[' type ']' ':' type;
tableIndexer: OPEN_BRACKET type CLOSE_BRACKET ':' type;
tableProp: NAME ':' type;
@ -292,17 +294,25 @@ propList
: tablePropOrIndexer (fieldsep tablePropOrIndexer)* fieldsep?;
tableType
: '{' propList '}';
: OPEN_BRACE propList CLOSE_BRACE;
functionType: ('<' genericTypeParameterList '>')? '(' (typeList)? ')' '->' returnType;
functionType: ('<' genericTypeParameterList '>')? OPEN_PARENS (typeList)? CLOSE_PARENS '->' returnType;
require
: 'local'? bindinglist '=' 'require' '(' exp ')' ('.' NAME)* ('::' type)? ';'?
: 'local'? bindinglist '=' REQUIRE OPEN_PARENS exp CLOSE_PARENS ('.' NAME)* ('::' type)? ';'?
;
// LEXER
LOCAL
: 'local'
;
REQUIRE
: 'require'
;
NIL
: 'nil'
;
@ -315,19 +325,23 @@ NAME
: [a-zA-Z_][a-zA-Z_0-9]*
;
NORMALSTRING
: '"' ( EscapeSequence | ~('\\'|'"') )* '"'
| '\'' ( EscapeSequence | ~('\\'|'\'') )* '\''
NORMAL_STRING
: '"' (~["\\\r\n\u0085\u2028\u2029] | EscapeSequence | '\\\n')* '"'
| '\'' (~['\\\r\n\u0085\u2028\u2029] | EscapeSequence | '\\\n')* '\''
;
LONGSTRING
: '[' NESTED_STR ']'
INTERPOLATED_STRING
: '`' (~[`\\\r\n\u0085\u2028\u2029] | EscapeSequence | '\\\n')* '`'
;
LONG_STRING
: OPEN_BRACKET NESTED_STR CLOSE_BRACKET
;
fragment
NESTED_STR
: '=' NESTED_STR '='
| '[' .*? ']'
| OPEN_BRACKET .*? CLOSE_BRACKET
;
INT
@ -350,6 +364,26 @@ HEX_FLOAT
| '0' [xX] HexDigit+ HexExponentPart
;
OPEN_BRACE: '{';
CLOSE_BRACE: '}';
OPEN_BRACKET: '[';
CLOSE_BRACKET: ']';
OPEN_PARENS: '(';
CLOSE_PARENS: ')';
// Line terminator token: CRLF, CR, LF, or one of the Unicode line breaks listed below.
NL
: '\r\n' | '\r' | '\n'
| '\u0085' // Next Line character (U+0085)
| '\u2028' // Line Separator character (U+2028)
| '\u2029' // Paragraph Separator character (U+2029)
;
COMMA
: ','
;
fragment
ExponentPart
: [eE] [+-]? Digit+
@ -362,8 +396,8 @@ HexExponentPart
fragment
EscapeSequence
: '\\' [abfnrtvz"'|$#\\] // World of Warcraft Lua additionally escapes |$#
| '\\' '\r'? '\n'
: '\\' [abfnrtvz"'`|$#\\] // World of Warcraft Lua additionally escapes |$#
| NL
| DecimalEscape
| HexEscape
| UtfEscape

View File

@ -4,15 +4,28 @@
package net.sourceforge.pmd.cpd;
import java.util.Properties;
/**
* Language implementation for Lua
*/
public class LuaLanguage extends AbstractLanguage {
public LuaLanguage() {
this(System.getProperties());
}
/**
* Creates a new Lua Language instance.
*/
public LuaLanguage() {
public LuaLanguage(Properties properties) {
super("Lua", "lua", new LuaTokenizer(), ".lua");
setProperties(properties);
}
@Override
public final void setProperties(Properties properties) {
LuaTokenizer tokenizer = (LuaTokenizer) getTokenizer();
tokenizer.setProperties(properties);
}
}

View File

@ -4,8 +4,11 @@
package net.sourceforge.pmd.cpd;
import java.util.Properties;
import org.antlr.v4.runtime.CharStream;
import net.sourceforge.pmd.cpd.token.AntlrToken;
import net.sourceforge.pmd.cpd.token.AntlrTokenFilter;
import net.sourceforge.pmd.lang.antlr.AntlrTokenManager;
import net.sourceforge.pmd.lang.lua.antlr4.LuaLexer;
@ -15,6 +18,22 @@ import net.sourceforge.pmd.lang.lua.antlr4.LuaLexer;
*/
public class LuaTokenizer extends AntlrTokenizer {
private boolean ignoreLiteralSequences = false;
/**
 * Configures this Lua tokenizer from the supplied properties.
 *
 * @param properties the properties to read; the value stored under
 *                   {@link #OPTION_IGNORE_LITERAL_SEQUENCES} is parsed as a boolean
 * @see #OPTION_IGNORE_LITERAL_SEQUENCES
 */
public void setProperties(Properties properties) {
    final boolean skipLiteralSequences = getBooleanProperty(properties, OPTION_IGNORE_LITERAL_SEQUENCES);
    this.ignoreLiteralSequences = skipLiteralSequences;
}
/**
 * Reads a boolean-valued option from a set of properties.
 *
 * @param properties the properties to look the option up in
 * @param property the key of the option
 * @return the result of {@link Boolean#parseBoolean(String)} on the stored value;
 *         {@code false} when the key is missing
 */
private boolean getBooleanProperty(final Properties properties, final String property) {
    final String rawValue = properties.getProperty(property, Boolean.FALSE.toString());
    return Boolean.parseBoolean(rawValue);
}
@Override
protected AntlrTokenManager getLexerForSource(SourceCode sourceCode) {
CharStream charStream = AntlrTokenizer.getCharStreamFromSourceCode(sourceCode);
@ -23,6 +42,151 @@ public class LuaTokenizer extends AntlrTokenizer {
@Override
protected AntlrTokenFilter getTokenFilter(final AntlrTokenManager tokenManager) {
return new AntlrTokenFilter(tokenManager);
return new LuaTokenFilter(tokenManager, ignoreLiteralSequences);
}
/**
 * The {@link LuaTokenFilter} extends the {@link AntlrTokenFilter} to discard
 * Lua-specific tokens.
 * <p>
 * Newline tokens and {@code require} calls (both {@code require(...)} and the
 * string-argument form {@code require "module"}) are always discarded.
 * When {@code ignoreLiteralSequences} is enabled, bracketed sequences that
 * consist only of literals are discarded as well.
 * </p>
 */
private static class LuaTokenFilter extends AntlrTokenFilter {

    /** Sentinel returned by {@link #peekNextKind(Iterable)} when no further token exists. */
    private static final int NO_TOKEN = Integer.MIN_VALUE;

    private final boolean ignoreLiteralSequences;
    /** True while the tokens of a {@code require} call are being dropped. */
    private boolean discardingRequires = false;
    /** Number of currently open parentheses inside the {@code require} call being dropped. */
    private int requireParenDepth = 0;
    /** True when the token under analysis is a newline token. */
    private boolean discardingNL = false;
    /** Closing token of the literal sequence being dropped, or {@code null} if none. */
    private AntlrToken discardingLiteralsUntil = null;
    /** Set when the current token ends a discarded region and must itself be dropped. */
    private boolean discardCurrent = false;

    LuaTokenFilter(final AntlrTokenManager tokenManager, boolean ignoreLiteralSequences) {
        super(tokenManager);
        this.ignoreLiteralSequences = ignoreLiteralSequences;
    }

    @Override
    protected void analyzeToken(final AntlrToken currentToken) {
        skipNewLines(currentToken);
    }

    @Override
    protected void analyzeTokens(final AntlrToken currentToken, final Iterable<AntlrToken> remainingTokens) {
        // discardCurrent only ever applies to the token that set it
        discardCurrent = false;
        skipRequires(currentToken, remainingTokens);
        skipLiteralSequences(currentToken, remainingTokens);
    }

    /**
     * Tracks whether the current token belongs to a {@code require} call.
     * <p>
     * Discarding starts at a {@code require} token only when it is actually
     * followed by a call argument: an opening parenthesis or a string literal.
     * For the parenthesised form the matching closing parenthesis ends the
     * region (nested parentheses are counted); for the string form the string
     * literal itself ends it.
     * </p>
     *
     * @param currentToken the token under analysis
     * @param remainingTokens lookahead over the tokens following {@code currentToken}
     */
    private void skipRequires(final AntlrToken currentToken, final Iterable<AntlrToken> remainingTokens) {
        final int type = currentToken.getKind();
        if (discardingRequires) {
            if (type == LuaLexer.OPEN_PARENS) {
                requireParenDepth++;
            } else if (type == LuaLexer.CLOSE_PARENS) {
                requireParenDepth--;
                if (requireParenDepth <= 0) {
                    endRequire();
                }
            } else if (requireParenDepth == 0 && isStringLiteral(type)) {
                // require "module" -- the string is the whole argument list
                endRequire();
            }
        } else if (type == LuaLexer.REQUIRE) {
            final int next = peekNextKind(remainingTokens);
            // a bare `require` (e.g. used as a value) is not a call; leave it alone
            discardingRequires = next == LuaLexer.OPEN_PARENS || isStringLiteral(next);
        }
    }

    /** Ends the current {@code require} region; the current token is still dropped. */
    private void endRequire() {
        requireParenDepth = 0;
        discardingRequires = false;
        discardCurrent = true;
    }

    /** Returns the kind of the first upcoming token that is not a newline, or {@link #NO_TOKEN}. */
    private static int peekNextKind(final Iterable<AntlrToken> remainingTokens) {
        for (final AntlrToken token : remainingTokens) {
            final int kind = token.getKind();
            if (kind != LuaLexer.NL) {
                return kind;
            }
        }
        return NO_TOKEN;
    }

    /** Tells whether the given token kind is one of the string token kinds of the grammar. */
    private static boolean isStringLiteral(final int kind) {
        return kind == LuaLexer.NORMAL_STRING
            || kind == LuaLexer.LONG_STRING
            || kind == LuaLexer.INTERPOLATED_STRING;
    }

    private void skipNewLines(final AntlrToken currentToken) {
        discardingNL = currentToken.getKind() == LuaLexer.NL;
    }

    /**
     * When literal-sequence skipping is enabled, starts discarding at an opening
     * brace, bracket or parenthesis that introduces a sequence of literals, and
     * stops at the closing token found by {@link #findEndOfSequenceOfLiterals(Iterable)}.
     */
    private void skipLiteralSequences(final AntlrToken currentToken, final Iterable<AntlrToken> remainingTokens) {
        if (ignoreLiteralSequences) {
            final int type = currentToken.getKind();
            if (isDiscardingLiterals()) {
                if (currentToken == discardingLiteralsUntil) { // NOPMD - intentional check for reference equality
                    discardingLiteralsUntil = null;
                    discardCurrent = true;
                }
            } else if (type == LuaLexer.OPEN_BRACE
                || type == LuaLexer.OPEN_BRACKET
                || type == LuaLexer.OPEN_PARENS) {
                final AntlrToken finalToken = findEndOfSequenceOfLiterals(remainingTokens);
                discardingLiteralsUntil = finalToken;
            }
        }
    }

    /**
     * Scans ahead for the token that closes a sequence consisting only of
     * literals, commas, newlines, assignments and balanced nested
     * braces/brackets/parentheses.
     *
     * @param remainingTokens the tokens following the opening token
     * @return the closing token, or {@code null} if the upcoming tokens are not
     *         such a sequence or contain no literal at all
     */
    private AntlrToken findEndOfSequenceOfLiterals(final Iterable<AntlrToken> remainingTokens) {
        boolean seenLiteral = false;
        int braceCount = 0;
        int bracketCount = 0;
        int parenCount = 0;
        for (final AntlrToken token : remainingTokens) {
            switch (token.getKind()) {
            case LuaLexer.INT:
            case LuaLexer.NORMAL_STRING:
            case LuaLexer.INTERPOLATED_STRING:
            case LuaLexer.LONG_STRING:
            case LuaLexer.HEX_FLOAT:
            case LuaLexer.HEX:
            case LuaLexer.FLOAT:
            case LuaLexer.NIL:
            case LuaLexer.BOOLEAN:
                seenLiteral = true;
                break; // can be skipped; continue to the next token
            case LuaLexer.COMMA:
                break; // can be skipped; continue to the next token
            case LuaLexer.NL:
                // this helps skip large multi-line data table sequences in Lua
                break; // can be skipped; continue to the next token
            case LuaLexer.ASSIGNMENT:
                // this helps skip large data table sequences in Lua: { ["bob"] = "uncle", ["alice"] = "enby" }
                break; // can be skipped; continue to the next token
            case LuaLexer.OPEN_BRACE:
                braceCount++;
                break; // curly braces are allowed, as long as they're balanced
            case LuaLexer.CLOSE_BRACE:
                braceCount--;
                if (braceCount < 0) {
                    // end of the list in the braces; skip all contents
                    return seenLiteral ? token : null;
                } else {
                    // curly braces are not yet balanced; continue to the next token
                    break;
                }
            case LuaLexer.OPEN_BRACKET:
                bracketCount++;
                break; // brackets are allowed, as long as they're balanced
            case LuaLexer.CLOSE_BRACKET:
                bracketCount--;
                if (bracketCount < 0) {
                    // end of the list in the brackets; skip all contents
                    return seenLiteral ? token : null;
                } else {
                    // brackets are not yet balanced; continue to the next token
                    break;
                }
            case LuaLexer.OPEN_PARENS:
                parenCount++;
                break; // parens are allowed, as long as they're balanced
            case LuaLexer.CLOSE_PARENS:
                parenCount--;
                if (parenCount < 0) {
                    // end of the list in the parens; skip all contents
                    return seenLiteral ? token : null;
                } else {
                    // parens are not yet balanced; continue to the next token
                    break;
                }
            default:
                // some other token than the expected ones; this is not a sequence of literals
                return null;
            }
        }
        return null;
    }

    /** Tells whether a literal sequence is currently being discarded. */
    public boolean isDiscardingLiterals() {
        return discardingLiteralsUntil != null;
    }

    @Override
    protected boolean isLanguageSpecificDiscarding() {
        return discardingRequires || discardingNL || isDiscardingLiterals() || discardCurrent;
    }
}
}

View File

@ -11,18 +11,22 @@ local _PlatformService = nil
local game = require(script.Parent.game).default :: any
pcall(function() _PlatformService = game:GetService('PlatformService') end)
return function <T>(req, ...: boolean): ({[string|number]: T}, string, Function<...any>)
local body = string.format("%s %s\n", req.method, req.path)
local res = {
code = 200,
{ "Content-Type", "text/plain" },
{ "Content-Length", #body } :: Array<any>,
{
"Content-Length",
#body,
["Auth.Confirm"] = [[至:%s。]],
} :: Array<any>,
} :: { [any]: number | Array<string | boolean> }
if (req :: any).keepAlive then
local socketType: "Connection" | "Pingback" | "" = "" :: ""
socketType = "Connection" :: "Connection"
res[#res + 1] = { socketType :: string, "Keep-Alive" }
res[#res + 1] = { socketType :: string, `\`${req.keepAlive}\`` }
res[#res - 2] = { ... }
end

View File

@ -76,14 +76,6 @@ L11
[local] 1 5
[game] 7 10
[=] 12 12
[require] 14 20
[(] 21 21
[script] 22 27
[.] 28 28
[Parent] 29 34
[.] 35 35
[game] 36 39
[)] 40 40
[.] 41 41
[default] 42 48
[::] 50 51
@ -104,7 +96,7 @@ L12
[)] 70 70
[end] 72 74
[)] 75 75
L15
L14
[return] 1 6
[function] 8 15
[<] 17 17
@ -137,7 +129,7 @@ L15
[any] 84 86
[>] 87 87
[)] 88 88
L16
L15
[local] 3 7
[body] 9 12
[=] 14 14
@ -155,37 +147,48 @@ L16
[.] 56 56
[path] 57 60
[)] 61 61
L17
L16
[local] 3 7
[res] 9 11
[=] 13 13
[{] 15 15
L18
L17
[code] 5 8
[=] 10 10
[200] 12 14
[,] 15 15
L19
L18
[{] 5 5
["Content-Type"] 7 20
[,] 21 21
["text/plain"] 23 34
[}] 36 36
[,] 37 37
L20
L19
[{] 5 5
L20
["Content-Length"] 7 22
[,] 23 23
[#] 25 25
[body] 26 29
[}] 31 31
[::] 33 34
[Array] 36 40
[<] 41 41
[any] 42 44
[>] 45 45
[,] 46 46
L21
[#] 7 7
[body] 8 11
[,] 12 12
L22
[\[] 7 7
["Auth.Confirm"] 8 21
[\]] 22 22
[=] 24 24
[\[\[至:%s。\]\]] 26 34
[,] 35 35
L24
[}] 5 5
[::] 7 8
[Array] 10 14
[<] 15 15
[any] 16 18
[>] 19 19
[,] 20 20
L25
[}] 3 3
[::] 5 6
[{] 8 8
@ -202,7 +205,7 @@ L21
[boolean] 41 47
[>] 48 48
[}] 50 50
L22
L26
[if] 3 4
[(] 6 6
[req] 7 9
@ -212,7 +215,7 @@ L22
[.] 18 18
[keepAlive] 19 27
[then] 29 32
L23
L27
[local] 5 9
[socketType] 11 20
[:] 21 21
@ -225,13 +228,13 @@ L23
[""] 56 57
[::] 59 60
[""] 62 63
L24
L28
[socketType] 5 14
[=] 16 16
["Connection"] 18 29
[::] 31 32
["Connection"] 34 45
L25
L29
[res] 5 7
[\[] 8 8
[#] 9 9
@ -245,9 +248,9 @@ L25
[::] 34 35
[string] 37 42
[,] 43 43
["Keep-Alive"] 45 56
[}] 58 58
L26
[`\\`${req.keepAlive}\\``] 45 66
[}] 68 68
L30
[res] 5 7
[\[] 8 8
[#] 9 9
@ -259,9 +262,9 @@ L26
[{] 21 21
[...] 23 25
[}] 27 27
L27
L31
[end] 3 5
L29
L33
[return] 3 8
[(] 10 10
[res] 11 13
@ -294,6 +297,6 @@ L29
[return] 93 98
[...] 100 102
[end] 104 106
L30
L34
[end] 1 3
EOF