From d4be567652fd5f73a5bcab8f18c295e22db1f9af Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9ment=20Fournier?= <clement.fournier76@gmail.com>
Date: Sat, 18 Apr 2020 09:52:28 +0200
Subject: [PATCH] Fix zero-length escapes

---
 .../lang/ast/impl/io/EscapeAwareReader.java   |  37 +++---
 .../pmd/lang/ast/impl/io/EscapeTracker.java   | 105 ++++++++++++------
 .../pmd/lang/ast/impl/io/JavaInputReader.java |  13 +--
 .../lang/ast/impl/io/JavaInputReaderTest.java |   4 +-
 .../pmd/lang/cpp/ast/CppEscapeReader.java     |  19 ++--
 .../pmd/cpd/CppCharStreamTest.java            |   2 +-
 6 files changed, 109 insertions(+), 71 deletions(-)
diff --git a/pmd-core/src/main/java/net/sourceforge/pmd/lang/ast/impl/io/EscapeAwareReader.java b/pmd-core/src/main/java/net/sourceforge/pmd/lang/ast/impl/io/EscapeAwareReader.java
index 51e97a70f1..fa8412e920 100644
--- a/pmd-core/src/main/java/net/sourceforge/pmd/lang/ast/impl/io/EscapeAwareReader.java
+++ b/pmd-core/src/main/java/net/sourceforge/pmd/lang/ast/impl/io/EscapeAwareReader.java
@@ -13,11 +13,11 @@ import net.sourceforge.pmd.util.StringUtil;
 import net.sourceforge.pmd.util.document.Chars;
 
 /**
- * A reader that optionally escapes its input text. It records where
+ * A reader that can interpret escapes in its input text. It records where
  * escapes occurred, and can translate an offset in the translated
  * input document to a line+column position in the original input.
  *
- * <p>The default implementation does not perform any escaping.
+ * <p>The default implementation does not perform any escape translation.
  */
 @SuppressWarnings("PMD.AssignmentInOperand")
 public class EscapeAwareReader extends Reader {
@@ -27,12 +27,6 @@ public class EscapeAwareReader extends Reader {
      * first backslash is replaced with the translated value of the
      * escape. The bufpos is updated so that we read the next char
      * after the escape.
-     *
-     * <p>This makes it so that 1. we don't need an additional buffer for
-     * translated chars, and 2. the full escape is preserved, just use
-     * the {@link EscapeTracker} to get the position of the escape and
-     * replace the first char with a backslash. We can report unnecessary
-     * escapes that way.
      */
     protected Chars input;
     /** Position of the next char to read in the input. */
@@ -50,21 +44,30 @@ public class EscapeAwareReader extends Reader {
      * Translate all the characters in the buffer.
      */
     public int translate() throws IOException {
-        return read(null, 0, Integer.MAX_VALUE);
+        return readUnchecked(null, 0, Integer.MAX_VALUE);
     }
 
 
     @Override
-    public int read(final char[] cbuf, final int off, final int len) throws IOException {
+    public int read(final char[] cbuf, final int off, int len) throws IOException {
+        if (off < 0 || len < 0 || len + off > cbuf.length) {
+            throw new IndexOutOfBoundsException("cbuf len=" + cbuf.length + " off=" + off + " len=" + len);
+        }
+        return readUnchecked(cbuf, off, len);
+    }
+
+    private int readUnchecked(char[] cbuf, int off, int len) throws IOException {
         ensureOpen();
         if (this.bufpos == input.length()) {
             return -1;
         }
 
+        len = min(len, input.length()); // remove Integer.MAX_VALUE
+
         int readChars = 0;
         while (readChars < len && this.bufpos < input.length()) {
             int bpos = this.bufpos;
-            int nextJump = gobbleMaxWithoutEscape(bpos, len - readChars);
+            int nextJump = gobbleMaxWithoutEscape(min(input.length(), bpos + len - readChars));
             int newlyReadChars = nextJump - bpos;
 
             assert newlyReadChars >= 0 && (readChars + newlyReadChars) <= len;
@@ -82,18 +85,20 @@ public class EscapeAwareReader extends Reader {
         return readChars;
     }
 
-
     /**
      * Returns the max offset, EXclusive, with which we can cut the input
      * array from the bufpos to dump it into the output array. This sets
      * the bufpos to where we should start the next jump.
      */
-    protected int gobbleMaxWithoutEscape(final int bufpos, final int maxReadahead) throws IOException {
-        return this.bufpos = min(bufpos + maxReadahead, input.length());
+    protected int gobbleMaxWithoutEscape(int maxOff) throws IOException {
+        return this.bufpos = maxOff;
     }
 
-    protected void recordEscape(final int startOffsetInclusive, int length) {
-        this.escapes.recordEscape(startOffsetInclusive, length);
+    protected int recordEscape(final int startOffsetInclusive, int lengthInSource, int translatedLength) {
+        assert lengthInSource > 0 && startOffsetInclusive >= 0;
+        this.escapes.recordEscape(startOffsetInclusive, lengthInSource, translatedLength);
+        this.bufpos = startOffsetInclusive + lengthInSource;
+        return startOffsetInclusive + translatedLength;
     }
 
     @Override
diff --git a/pmd-core/src/main/java/net/sourceforge/pmd/lang/ast/impl/io/EscapeTracker.java b/pmd-core/src/main/java/net/sourceforge/pmd/lang/ast/impl/io/EscapeTracker.java
index 8ed93855da..7b3a23798c 100644
--- a/pmd-core/src/main/java/net/sourceforge/pmd/lang/ast/impl/io/EscapeTracker.java
+++ b/pmd-core/src/main/java/net/sourceforge/pmd/lang/ast/impl/io/EscapeTracker.java
@@ -16,43 +16,77 @@ import net.sourceforge.pmd.util.document.Chars;
 class EscapeTracker {
 
     private static final int[] EMPTY = new int[0];
+    private static final int RECORD_SIZE = 3;
 
-    /**
+    /*
      * Offsets in the input buffer where a unicode escape occurred.
-     * Represented as pairs [off, len] where
+     * Represented as tuples (off, len, invalid) where
      * - off is the offset in the source file where the escape occurred
-     * - len is the length in characters of the escape (which is translated to a single char).
+     * - len is the length of the escape in the input file, eg for \ u 00a0 will be 6
+     * - invalid is the last offset in the buffer which contains the translated chars (exclusive)
+     *
+     * Eg for "a\u00a0b" (translates as "a b"), the buffer looks like
+     * [a u00a0b]
+     *   ^       this char has been replaced with the translated value of the escape
+     *    ^^^^^  these characters are only present in the input, we jump over them when reading
+     *   ^       off
+     *    ^      invalid
+     *         ^ off + len
+     *
+     * The escape record is (1,6,2)
+     *
+     * When reading the buffer we'll copy two blocks
+     * * "a "
+     * * then jump over "u00a0" and copy "b"
+     *
+     * In general to read until an escape means reading until its 'invalid'
+     * field, and once that is reached, jump to off + len.
+     *
      */
     private int[] escapeRecords = EMPTY;
     /** Index of the next write in the {@link #escapeRecords}. */
     private int nextFreeIdx = 0;
 
+
     /**
      * Calls to this method must occur in source order (ie param
      * offsetInInput increases monotonically).
      */
-    void recordEscape(int offsetInInput, int len) {
+    void recordEscape(int offsetInInput, int lengthInInput, int lengthInOutput) {
         if (nextFreeIdx + 1 >= escapeRecords.length) {
-            // double capacity, add 1 to not stay stuck at zero
-            int[] newOffsets = new int[(escapeRecords.length + 1) * 2];
+            // add 1 to not stay stuck at zero
+            int[] newOffsets = new int[(escapeRecords.length + 1) * RECORD_SIZE];
             System.arraycopy(escapeRecords, 0, newOffsets, 0, escapeRecords.length);
             this.escapeRecords = newOffsets;
         }
 
         escapeRecords[nextFreeIdx++] = offsetInInput;
-        escapeRecords[nextFreeIdx++] = len - 1; // -1 because the translated escape has length 1
+        escapeRecords[nextFreeIdx++] = lengthInInput;
+        escapeRecords[nextFreeIdx++] = offsetInInput + lengthInOutput;
+    }
+
+    private int inOff(int idx) {
+        return escapeRecords[idx];
+    }
+
+    private int inLen(int idx) {
+        return escapeRecords[idx + 1];
+    }
+
+    private int invalidIdx(int idx) {
+        return escapeRecords[idx + 2];
     }
 
     /**
      * Convert an offset in the translated file into an offset in
      * the untranslated input.
      */
-    public int inputOffsetAt(int translatedOffset) {
+    int inputOffsetAt(int translatedOffset) {
         // basically accumulate the lengths of all escapes occurring before the given translatedOffset
         int sum = translatedOffset;
-        for (int i = 0; i < nextFreeIdx; i += 2) {
-            if (escapeRecords[i] < sum) {
-                sum += escapeRecords[i + 1];
+        for (int i = 0; i < maxEscape(); i += RECORD_SIZE) {
+            if (inOff(i) < sum) {
+                sum += inLen(i);
             } else {
                 break;
             }
@@ -60,16 +94,24 @@ class EscapeTracker {
         return sum;
     }
 
+    int maxEscape() {
+        return nextFreeIdx;
+    }
+
     @Override
     public String toString() {
         StringBuilder res = new StringBuilder("Escape set {");
-        for (int i = 0; i < nextFreeIdx; i += 2) {
-            res.append("(at=").append(escapeRecords[i]).append(", len=").append(escapeRecords[i + 1]).append("), ");
+        for (int i = 0; i < maxEscape(); i += RECORD_SIZE) {
+            res.append("(at=").append(inOff(i))
+               .append(", inlen=").append(inLen(i))
+               .append(", invalidAt=").append(invalidIdx(i))
+               .append("), ");
         }
 
         return res.append('}').toString();
     }
 
+    /** Backend for a CharStream. */
     class Cursor {
 
 
@@ -112,13 +154,14 @@ class EscapeTracker {
             if (pos == buf.length()) {
                 throw new EOFException();
             }
+            char c;
 
-            char c = buf.charAt(pos);
-
-            if (nextEscape < escapeRecords.length && pos == escapeRecords[nextEscape]) {
-                pos += escapeRecords[nextEscape + 1]; // add escape length
-                this.nextEscape += 2;
+            if (nextEscape < maxEscape() && pos == invalidIdx(nextEscape)) {
+                pos += inLen(nextEscape); // add escape length
+                c = buf.charAt(pos);
+                this.nextEscape += RECORD_SIZE;
             } else {
+                c = buf.charAt(pos);
                 pos++;
             }
             outOffset++;
@@ -137,20 +180,20 @@ class EscapeTracker {
             if (nextEscape <= 0) {
                 pos -= numChars; // then there were no escapes before the 'pos'
             } else {
-                int off = pos;
-                for (int i = nextEscape - 2; i >= 0 && numChars > 0; i -= 2) {
-                    int esc = escapeRecords[i];
-                    if (esc == off) {
-                        off -= escapeRecords[i + 1];
-                    } else if (esc > off) {
+                int inoff = pos;
+                for (int i = nextEscape - RECORD_SIZE; i >= 0 && numChars > 0; i -= RECORD_SIZE) {
+                    int esc = inOff(i);
+                    if (esc == inoff) {
+                        inoff -= inLen(i);
+                    } else if (esc > inoff) {
                         // then the current escape was before what we're looking at
                         break;
                     } else {
-                        off--;
+                        inoff--;
                     }
                     numChars--;
                 }
-                pos = off - numChars;
+                pos = inoff - numChars;
             }
         }
 
@@ -174,15 +217,13 @@ class EscapeTracker {
                 int cur = mark;
                 int esc = markEscape;
                 while (cur < pos && esc < nextEscape) {
-                    int escapeOff = escapeRecords[esc];
-                    assert escapeOff < pos;
-                    sb.append(buf, cur, escapeOff + 1);
-                    cur = escapeOff + escapeRecords[esc + 1];
-                    esc += 2;
+                    sb.append(buf, cur, invalidIdx(esc));
+                    cur = inOff(esc) + inLen(esc);
+                    esc += RECORD_SIZE;
                 }
                 // no more escape in the range, append everything until the pos
                 sb.append(buf, cur, pos + 1);
-                assert sb.length() - prevLength == markLength();
+                assert sb.length() - prevLength == markLength() : sb + " should have length " + markLength();
             }
         }
 
diff --git a/pmd-core/src/main/java/net/sourceforge/pmd/lang/ast/impl/io/JavaInputReader.java b/pmd-core/src/main/java/net/sourceforge/pmd/lang/ast/impl/io/JavaInputReader.java
index b41211c0ed..53da0c2315 100644
--- a/pmd-core/src/main/java/net/sourceforge/pmd/lang/ast/impl/io/JavaInputReader.java
+++ b/pmd-core/src/main/java/net/sourceforge/pmd/lang/ast/impl/io/JavaInputReader.java
@@ -32,12 +32,11 @@ public final class JavaInputReader extends EscapeAwareReader {
     }
 
     @Override
-    protected int gobbleMaxWithoutEscape(final int bufpos, final int maxReadahead) throws IOException {
-        int off = bufpos;
-        int max = min(bufpos + maxReadahead, input.length());
+    protected int gobbleMaxWithoutEscape(final int maxOff) throws IOException {
+        int off = this.bufpos;
         boolean noBackSlash = false;
         int notEscapeEnd = this.savedNotEscapeSpecialEnd;
-        while (off < max && (noBackSlash = input.charAt(off) != '\\' || notEscapeEnd < off)) {
+        while (off < maxOff && (noBackSlash = input.charAt(off) != '\\' || notEscapeEnd < off)) {
             off++;
         }
 
@@ -59,12 +58,10 @@ public final class JavaInputReader extends EscapeAwareReader {
 
             replaceFirstBackslashWithEscape(firstBslashOff, off);
             this.savedNotEscapeSpecialEnd = Integer.MAX_VALUE;
-            this.bufpos = off + 5;
-            this.recordEscape(firstBslashOff, off + 5 - firstBslashOff);
-            return firstBslashOff + 1;
+            return recordEscape(firstBslashOff, off + 5 - firstBslashOff, 1);
         } else {
             // not an escape sequence
-            int min = min(bufpos + maxReadahead, off);
+            int min = min(maxOff, off);
             // save the number of backslashes that are part of the escape,
             // might have been cut in half by the maxReadahead
             this.savedNotEscapeSpecialEnd = min < off ? off : Integer.MAX_VALUE;
diff --git a/pmd-core/src/test/java/net/sourceforge/pmd/lang/ast/impl/io/JavaInputReaderTest.java b/pmd-core/src/test/java/net/sourceforge/pmd/lang/ast/impl/io/JavaInputReaderTest.java
index 0f03a7bed9..59ef30a854 100644
--- a/pmd-core/src/test/java/net/sourceforge/pmd/lang/ast/impl/io/JavaInputReaderTest.java
+++ b/pmd-core/src/test/java/net/sourceforge/pmd/lang/ast/impl/io/JavaInputReaderTest.java
@@ -85,7 +85,7 @@ public class JavaInputReaderTest {
             Assert.assertEquals(1, read);
             assertBufferIsJust("abc\\\\", chars, 0);
 
-            read = r.read(chars, 5, 8);
+            read = r.read(chars, 5, chars.length - 5);
 
             Assert.assertEquals(5, read);
             assertBufferIsJust("abc\\\\\\dede", chars, 0);
@@ -125,7 +125,7 @@ public class JavaInputReaderTest {
             Assert.assertEquals(9, read);
             assertBufferIsJust("abc\u00a0dede\u00a0", chars, 0);
 
-            read = r.read(chars, 9, 10);
+            read = r.read(chars, 9, chars.length - 9);
 
             Assert.assertEquals(-1, read);
             assertBufferIsJust("abc\u00a0dede\u00a0", chars, 0);
diff --git a/pmd-cpp/src/main/java/net/sourceforge/pmd/lang/cpp/ast/CppEscapeReader.java b/pmd-cpp/src/main/java/net/sourceforge/pmd/lang/cpp/ast/CppEscapeReader.java
index 378dfb1011..4b3f3417b7 100644
--- a/pmd-cpp/src/main/java/net/sourceforge/pmd/lang/cpp/ast/CppEscapeReader.java
+++ b/pmd-cpp/src/main/java/net/sourceforge/pmd/lang/cpp/ast/CppEscapeReader.java
@@ -23,35 +23,30 @@ public class CppEscapeReader extends EscapeAwareReader {
     }
 
     @Override
-    protected int gobbleMaxWithoutEscape(int bufpos, int maxReadahead) throws IOException {
-        int off = bufpos;
-        int max = min(bufpos + maxReadahead, input.length());
+    protected int gobbleMaxWithoutEscape(final int maxOff) throws IOException {
+        int off = this.bufpos;
         boolean noBackSlash = false;
         int notEscapeEnd = this.savedNotEscapeSpecialEnd;
-        while (off < max && (noBackSlash = input.charAt(off) != '\\' || notEscapeEnd < off)) {
+        while (off < maxOff && (noBackSlash = input.charAt(off) != '\\' || notEscapeEnd < off)) {
             off++;
         }
 
-        if (noBackSlash) {
+        if (noBackSlash || off == maxOff) {
             this.bufpos = off;
             return off;
         }
 
         final int backSlackOff = off++;
         if (input.charAt(off) == NEWLINE) {
-            recordEscape(backSlackOff, 2);
-            this.bufpos = off + 2;
-            return backSlackOff;
+            return recordEscape(backSlackOff, 2, 0);
         } else if (input.charAt(off) == CARRIAGE_RETURN) {
             if (input.charAt(++off) == NEWLINE) {
-                recordEscape(backSlackOff, 3);
-                this.bufpos = off + 3;
-                return backSlackOff;
+                return recordEscape(backSlackOff, 3, 0);
             }
         }
 
         // not an escape sequence
-        int min = min(bufpos + maxReadahead, off);
+        int min = min(maxOff, off);
         // save the number of backslashes that are part of the escape,
         // might have been cut in half by the maxReadahead
         this.savedNotEscapeSpecialEnd = min < off ? off : Integer.MAX_VALUE;
diff --git a/pmd-cpp/src/test/java/net/sourceforge/pmd/cpd/CppCharStreamTest.java b/pmd-cpp/src/test/java/net/sourceforge/pmd/cpd/CppCharStreamTest.java
index 9d6df7dd49..a8823f011f 100644
--- a/pmd-cpp/src/test/java/net/sourceforge/pmd/cpd/CppCharStreamTest.java
+++ b/pmd-cpp/src/test/java/net/sourceforge/pmd/cpd/CppCharStreamTest.java
@@ -18,7 +18,7 @@ import net.sourceforge.pmd.util.document.TextDocument;
 public class CppCharStreamTest {
 
     @NonNull
-    public CharStream charStreamFor(String source) {
+    public CharStream charStreamFor(String source) throws IOException {
         return NewCharStream.open(new CPPTokenizer().newTokenDoc(TextDocument.readOnlyString(source)));
     }