From 806b130e4a3c834280d695806274740bab7afb6c Mon Sep 17 00:00:00 2001
From: Maikel Steneker <maikel.steneker@tiobe.com>
Date: Tue, 21 May 2019 14:33:43 +0200
Subject: [PATCH] Improved Dart tokenizer to handle strings correctly.

This is based on the grammar at https://github.com/chalin/dart-spec-and-grammar/blob/master/doc/grammar-AUTOGENERATED-DO-NOT-EDIT.txt.

The string handling is now much closer to this grammar, resulting in more files being tokenized correctly.
---
 .../sourceforge/pmd/lang/dart/antlr4/Dart2.g4 | 77 ++++++++-----------
 1 file changed, 34 insertions(+), 43 deletions(-)

diff --git a/pmd-dart/src/main/antlr4/net/sourceforge/pmd/lang/dart/antlr4/Dart2.g4 b/pmd-dart/src/main/antlr4/net/sourceforge/pmd/lang/dart/antlr4/Dart2.g4
index fcc42ae013..7d8793edab 100644
--- a/pmd-dart/src/main/antlr4/net/sourceforge/pmd/lang/dart/antlr4/Dart2.g4
+++ b/pmd-dart/src/main/antlr4/net/sourceforge/pmd/lang/dart/antlr4/Dart2.g4
@@ -340,58 +340,49 @@ booleanLiteral
   | 'false'
   ;
 
-//stringLiteral: (MultilineString | SingleLineString)+;
-stringLiteral: SingleLineString;
-//stringLiteral: SingleLineString;
+stringLiteral: (MultiLineString | SingleLineString)+; // one or more adjacent string tokens, single-line and multi-line forms may be mixed
+
 SingleLineString
-  : '"' (~[\\"] | '\\\\' | ESCAPE_SEQUENCE | '\\"')* '"'
-  | '\'' (~[\\'] | '\\\\' | ESCAPE_SEQUENCE | '\\\'')* '\''
-//  | 'r\'' (~('\'' | NEWLINE))* '\'' // TODO
-//  | 'r"' (~('\'' | NEWLINE))* '"'
+  : '"' StringContentDQ* '"'                // double-quoted string; content rules live in StringContentDQ
+  | '\'' StringContentSQ* '\''              // single-quoted string; content rules live in StringContentSQ
+  | 'r\'' (~('\'' | '\n' | '\r'))* '\''     // raw single-quoted: everything up to the next ' on the same line, backslash is not special
+  | 'r"' (~('"' | '\n' | '\r'))* '"'        // raw double-quoted: everything up to the next " on the same line, backslash is not special
   ;
 
-//MultilineString
-//  : '"""' StringContentTDQ* '"""'
-//  | '\'\'\'' StringContentTDQ* '\'\'\''
-//  | 'r"""' (~'"""')* '"""' // TODO
-//  | 'r\'\'\'' (~'\'\'\'')* '\'\'\''
-//  ;
-//StringContentSQ: .;// TODO
-//StringContentTDQ: .;// TODO
 fragment
-ESCAPE_SEQUENCE
-  : '\\n'
-  | '\\r'
-  | '\\f'
-  | '\\b'
-  | '\\t'
-  | '\\v'
-  | '\\x' HEX_DIGIT HEX_DIGIT
-  | '\\u' HEX_DIGIT HEX_DIGIT HEX_DIGIT HEX_DIGIT
-  | '\\u{' HEX_DIGIT_SEQUENCE '}'
-  | '\\$'
+StringContentDQ // one unit of content inside a "..." string
+  : ~('\\' | '"' /*| '$'*/ | '\n' | '\r') // ordinary char: not a backslash, double quote or line break ('$' exclusion is commented out, so '$' matches here)
+  | '\\' ~('\n' | '\r') // backslash followed by any char except a line break (covers \" and \\)
+  //| stringInterpolation
   ;
+
 fragment
-HEX_DIGIT_SEQUENCE
-  : HEX_DIGIT HEX_DIGIT? HEX_DIGIT?
-    HEX_DIGIT? HEX_DIGIT? HEX_DIGIT?
+StringContentSQ // one unit of content inside a '...' string
+  : ~('\\' | '\'' /*| '$'*/ | '\n' | '\r') // ordinary char: not a backslash, single quote or line break ('$' exclusion is commented out, so '$' matches here)
+  | '\\' ~('\n' | '\r') // backslash followed by any char except a line break (covers \' and \\)
+  //| stringInterpolation
   ;
-/*TODO
-<stringContentDQ> ::= \~{}( `\\' | `"' | `$' | <NEWLINE> )
-  \alt `\\' \~{}( <NEWLINE> )
-  \alt <stringInterpolation>
 
-<stringContentSQ> ::= \~{}( `\\' | `\'' | `$' | <NEWLINE> )
-  \alt `\\' \~{}( <NEWLINE> )
-  \alt <stringInterpolation>
-<stringContentTDQ> ::= \~{}( `\\' | `"""' | `$')
-  \alt `\\' \~{}( <NEWLINE> )
-  \alt <stringInterpolation>
+MultiLineString // triple-quoted strings; content may span line breaks
+  : '"""' StringContentTDQ* '"""' // triple double-quoted; content rules live in StringContentTDQ
+  | '\'\'\'' StringContentTSQ* '\'\'\'' // triple single-quoted; content rules live in StringContentTSQ
+  | 'r"""' (~'"' | '"' ~'"' | '""' ~'"')* '"""' // raw: any chars; a run of one or two " must be followed by a non-quote, backslash is not special
+  | 'r\'\'\'' (~'\'' | '\'' ~'\'' | '\'\'' ~'\'')* '\'\'\'' // raw: any chars; a run of one or two ' must be followed by a non-quote, backslash is not special
+  ;
+
+fragment
+StringContentTDQ // one unit of content inside a """...""" string
+  : ~('\\' | '"' /*| '$'*/) | '\\' ~('\n' | '\r') // ordinary char (line breaks allowed), or a backslash followed by any char except a line break
+  | '"' ~'"' | '""' ~'"' // one or two double quotes followed by a non-quote, i.e. not the closing """
+  //| stringInterpolation
+  ;
+
+fragment StringContentTSQ // one unit of content inside a '''...''' string
+  : ~('\\' | '\'' /*| '$'*/) | '\\' ~('\n' | '\r') // ordinary char (line breaks allowed), or a backslash followed by any char except a line break
+  | '\'' ~'\'' | '\'\'' ~'\'' // one or two single quotes followed by a non-quote, i.e. not the closing '''
+  //| stringInterpolation
+  ;
 
-<stringContentTSQ> ::= \~{}( `\\' | `\'\'\'' | `$')
-  \alt `\\' \~{}( <NEWLINE> )
-  \alt <stringInterpolation>
-*/
 NEWLINE
   : '\n'
   | '\r'