Fix zero-length escapes

2020-04-18 09:52:28 +02:00
parent 2e725038ac
commit d4be567652
6 changed files with 109 additions and 71 deletions
--- a/pmd-core/src/main/java/net/sourceforge/pmd/lang/ast/impl/io/EscapeAwareReader.java
+++ b/pmd-core/src/main/java/net/sourceforge/pmd/lang/ast/impl/io/EscapeAwareReader.java
@ -13,11 +13,11 @@ import net.sourceforge.pmd.util.StringUtil;
 import net.sourceforge.pmd.util.document.Chars;

 /**
- * A reader that optionally escapes its input text. It records where
+ * A reader that can interpret escapes in its input text. It records where
 * escapes occurred, and can translate an offset in the translated
 * input document to a line+column position in the original input.
 *
- * <p>The default implementation does not perform any escaping.
+ * <p>The default implementation does not perform any escape translation.
 */
@SuppressWarnings("PMD.AssignmentInOperand")
 public class EscapeAwareReader extends Reader {
@ -27,12 +27,6 @@ public class EscapeAwareReader extends Reader {
     * first backslash is replaced with the translated value of the
     * escape. The bufpos is updated so that we read the next char
     * after the escape.
-     *
-     * <p>This makes it so that 1. we don't need an additional buffer for
-     * translated chars, and 2. the full escape is preserved, just use
-     * the {@link EscapeTracker} to get the position of the escape and
-     * replace the first char with a backslash. We can report unnecessary
-     * escapes that way.
     */
    protected Chars input;
    /** Position of the next char to read in the input. */
@ -50,21 +44,30 @@ public class EscapeAwareReader extends Reader {
     * Translate all the characters in the buffer.
     */
    public int translate() throws IOException {
-        return read(null, 0, Integer.MAX_VALUE);
+        return readUnchecked(null, 0, Integer.MAX_VALUE);
    }


    @Override
-    public int read(final char[] cbuf, final int off, final int len) throws IOException {
+    public int read(final char[] cbuf, final int off, int len) throws IOException {
+        if (off < 0 || len < 0 || len > cbuf.length - off) { // subtraction form: off + len could overflow int and slip past the check
+            throw new IndexOutOfBoundsException("cbuf len=" + cbuf.length + " off=" + off + " len=" + len);
+        }
+        return readUnchecked(cbuf, off, len);
+    }
+
+    private int readUnchecked(char[] cbuf, int off, int len) throws IOException {
        ensureOpen();
        if (this.bufpos == input.length()) {
            return -1;
        }

+        len = min(len, input.length()); // remove Integer.MAX_VALUE
+
        int readChars = 0;
        while (readChars < len && this.bufpos < input.length()) {
            int bpos = this.bufpos;
-            int nextJump = gobbleMaxWithoutEscape(bpos, len - readChars);
+            int nextJump = gobbleMaxWithoutEscape(min(input.length(), bpos + len - readChars));
            int newlyReadChars = nextJump - bpos;

            assert newlyReadChars >= 0 && (readChars + newlyReadChars) <= len;
@ -82,18 +85,20 @@ public class EscapeAwareReader extends Reader {
        return readChars;
    }

-
    /**
     * Returns the max offset, EXclusive, with which we can cut the input
     * array from the bufpos to dump it into the output array. This sets
     * the bufpos to where we should start the next jump.
     */
-    protected int gobbleMaxWithoutEscape(final int bufpos, final int maxReadahead) throws IOException {
-        return this.bufpos = min(bufpos + maxReadahead, input.length());
+    protected int gobbleMaxWithoutEscape(int maxOff) throws IOException {
+        return this.bufpos = maxOff;
    }

-    protected void recordEscape(final int startOffsetInclusive, int length) {
-        this.escapes.recordEscape(startOffsetInclusive, length);
+    protected int recordEscape(final int startOffsetInclusive, int lengthInSource, int translatedLength) {
+        assert lengthInSource > 0 && startOffsetInclusive >= 0;
+        this.escapes.recordEscape(startOffsetInclusive, lengthInSource, translatedLength);
+        this.bufpos = startOffsetInclusive + lengthInSource;
+        return startOffsetInclusive + translatedLength;
    }

    @Override
--- a/pmd-core/src/main/java/net/sourceforge/pmd/lang/ast/impl/io/EscapeTracker.java
+++ b/pmd-core/src/main/java/net/sourceforge/pmd/lang/ast/impl/io/EscapeTracker.java
@ -16,43 +16,77 @@ import net.sourceforge.pmd.util.document.Chars;
 class EscapeTracker {

    private static final int[] EMPTY = new int[0];
+    private static final int RECORD_SIZE = 3;

-    /**
+    /*
     * Offsets in the input buffer where a unicode escape occurred.
-     * Represented as pairs [off, len] where
+     * Represented as tuples (off, len, invalid) where
     * - off is the offset in the source file where the escape occurred
-     * - len is the length in characters of the escape (which is translated to a single char).
+     * - len is the length of the escape in the input file, eg for \ u 00a0 will be 6
+     * - invalid is the last offset in the buffer which contains the translated chars (exclusive)
+     *
+     * Eg for "a\u00a0b" (translates as "a b"), the buffer looks like
+     * [a u00a0b]
+     *   ^       this char has been replaced with the translated value of the escape
+     *    ^^^^^  these characters are only present in the input, we jump over them when reading
+     *   ^       off
+     *    ^      invalid
+     *         ^ off + len
+     *
+     * The escape record is (1,6,2)
+     *
+     * When reading the buffer we'll copy two blocks
+     * * "a "
+     * * then jump over "u00a0" and copy "b"
+     *
+     * In general to read until an escape means reading until its 'invalid'
+     * field, and once that is reached, jump to off + len.
+     *
     */
    private int[] escapeRecords = EMPTY;
    /** Index of the next write in the {@link #escapeRecords}. */
    private int nextFreeIdx = 0;

+
    /**
     * Calls to this method must occur in source order (ie param
     * offsetInInput increases monotonically).
     */
-    void recordEscape(int offsetInInput, int len) {
+    void recordEscape(int offsetInInput, int lengthInInput, int lengthInOutput) {
        if (nextFreeIdx + 1 >= escapeRecords.length) {
-            // double capacity, add 1 to not stay stuck at zero
-            int[] newOffsets = new int[(escapeRecords.length + 1) * 2];
+            // add 1 to not stay stuck at zero
+            int[] newOffsets = new int[(escapeRecords.length + 1) * RECORD_SIZE];
            System.arraycopy(escapeRecords, 0, newOffsets, 0, escapeRecords.length);
            this.escapeRecords = newOffsets;
        }

        escapeRecords[nextFreeIdx++] = offsetInInput;
-        escapeRecords[nextFreeIdx++] = len - 1; // -1 because the translated escape has length 1
+        escapeRecords[nextFreeIdx++] = lengthInInput;
+        escapeRecords[nextFreeIdx++] = offsetInInput + lengthInOutput;
+    }
+
+    private int inOff(int idx) {
+        return escapeRecords[idx];
+    }
+
+    private int inLen(int idx) {
+        return escapeRecords[idx + 1];
+    }
+
+    private int invalidIdx(int idx) {
+        return escapeRecords[idx + 2];
    }

    /**
     * Convert an offset in the translated file into an offset in
     * the untranslated input.
     */
-    public int inputOffsetAt(int translatedOffset) {
+    int inputOffsetAt(int translatedOffset) {
        // basically accumulate the lengths of all escapes occurring before the given translatedOffset
        int sum = translatedOffset;
-        for (int i = 0; i < nextFreeIdx; i += 2) {
-            if (escapeRecords[i] < sum) {
-                sum += escapeRecords[i + 1];
+        for (int i = 0; i < maxEscape(); i += RECORD_SIZE) {
+            if (invalidIdx(i) <= sum) { // we are past the translated chars of this escape (also true for zero-length escapes)
+                sum += inOff(i) + inLen(i) - invalidIdx(i); // add only the source chars that are absent from the output
            } else {
                break;
            }
@ -60,16 +94,24 @@ class EscapeTracker {
        return sum;
    }

+    int maxEscape() {
+        return nextFreeIdx;
+    }
+
    @Override
    public String toString() {
        StringBuilder res = new StringBuilder("Escape set {");
-        for (int i = 0; i < nextFreeIdx; i += 2) {
-            res.append("(at=").append(escapeRecords[i]).append(", len=").append(escapeRecords[i + 1]).append("), ");
+        for (int i = 0; i < maxEscape(); i += RECORD_SIZE) {
+            res.append("(at=").append(inOff(i))
+               .append(", inlen=").append(inLen(i))
+               .append(", invalidAt=").append(invalidIdx(i))
+               .append("), ");
        }

        return res.append('}').toString();
    }

+    /** Backend for a CharStream. */
    class Cursor {


@ -112,13 +154,14 @@ class EscapeTracker {
            if (pos == buf.length()) {
                throw new EOFException();
            }
+            char c;

-            char c = buf.charAt(pos);
-
-            if (nextEscape < escapeRecords.length && pos == escapeRecords[nextEscape]) {
-                pos += escapeRecords[nextEscape + 1]; // add escape length
-                this.nextEscape += 2;
+            if (nextEscape < maxEscape() && pos == invalidIdx(nextEscape)) {
+                pos += inLen(nextEscape); // add escape length
+                c = buf.charAt(pos);
+                this.nextEscape += RECORD_SIZE;
            } else {
+                c = buf.charAt(pos);
                pos++;
            }
            outOffset++;
@ -137,20 +180,20 @@ class EscapeTracker {
            if (nextEscape <= 0) {
                pos -= numChars; // then there were no escapes before the 'pos'
            } else {
-                int off = pos;
-                for (int i = nextEscape - 2; i >= 0 && numChars > 0; i -= 2) {
-                    int esc = escapeRecords[i];
-                    if (esc == off) {
-                        off -= escapeRecords[i + 1];
-                    } else if (esc > off) {
+                int inoff = pos;
+                for (int i = nextEscape - RECORD_SIZE; i >= 0 && numChars > 0; i -= RECORD_SIZE) {
+                    int esc = inOff(i);
+                    if (esc == inoff) {
+                        inoff -= inLen(i);
+                    } else if (esc > inoff) {
                        // then the current escape was before what we're looking at
                        break;
                    } else {
-                        off--;
+                        inoff--;
                    }
                    numChars--;
                }
-                pos = off - numChars;
+                pos = inoff - numChars;
            }
        }

@ -174,15 +217,13 @@ class EscapeTracker {
                int cur = mark;
                int esc = markEscape;
                while (cur < pos && esc < nextEscape) {
-                    int escapeOff = escapeRecords[esc];
-                    assert escapeOff < pos;
-                    sb.append(buf, cur, escapeOff + 1);
-                    cur = escapeOff + escapeRecords[esc + 1];
-                    esc += 2;
+                    sb.append(buf, cur, invalidIdx(esc));
+                    cur = inOff(esc) + inLen(esc);
+                    esc += RECORD_SIZE;
                }
                // no more escape in the range, append everything until the pos
                sb.append(buf, cur, pos + 1);
-                assert sb.length() - prevLength == markLength();
+                assert sb.length() - prevLength == markLength() : sb + " should have length " + markLength();
            }
        }

--- a/pmd-core/src/main/java/net/sourceforge/pmd/lang/ast/impl/io/JavaInputReader.java
+++ b/pmd-core/src/main/java/net/sourceforge/pmd/lang/ast/impl/io/JavaInputReader.java
@ -32,12 +32,11 @@ public final class JavaInputReader extends EscapeAwareReader {
    }

    @Override
-    protected int gobbleMaxWithoutEscape(final int bufpos, final int maxReadahead) throws IOException {
-        int off = bufpos;
-        int max = min(bufpos + maxReadahead, input.length());
+    protected int gobbleMaxWithoutEscape(final int maxOff) throws IOException {
+        int off = this.bufpos;
        boolean noBackSlash = false;
        int notEscapeEnd = this.savedNotEscapeSpecialEnd;
-        while (off < max && (noBackSlash = input.charAt(off) != '\\' || notEscapeEnd < off)) {
+        while (off < maxOff && (noBackSlash = input.charAt(off) != '\\' || notEscapeEnd < off)) {
            off++;
        }

@ -59,12 +58,10 @@ public final class JavaInputReader extends EscapeAwareReader {

            replaceFirstBackslashWithEscape(firstBslashOff, off);
            this.savedNotEscapeSpecialEnd = Integer.MAX_VALUE;
-            this.bufpos = off + 5;
-            this.recordEscape(firstBslashOff, off + 5 - firstBslashOff);
-            return firstBslashOff + 1;
+            return recordEscape(firstBslashOff, off + 5 - firstBslashOff, 1);
        } else {
            // not an escape sequence
-            int min = min(bufpos + maxReadahead, off);
+            int min = min(maxOff, off);
            // save the number of backslashes that are part of the escape,
            // might have been cut in half by the maxReadahead
            this.savedNotEscapeSpecialEnd = min < off ? off : Integer.MAX_VALUE;
--- a/pmd-core/src/test/java/net/sourceforge/pmd/lang/ast/impl/io/JavaInputReaderTest.java
+++ b/pmd-core/src/test/java/net/sourceforge/pmd/lang/ast/impl/io/JavaInputReaderTest.java
@ -85,7 +85,7 @@ public class JavaInputReaderTest {
            Assert.assertEquals(1, read);
            assertBufferIsJust("abc\\\\", chars, 0);

-            read = r.read(chars, 5, 8);
+            read = r.read(chars, 5, chars.length - 5);

            Assert.assertEquals(5, read);
            assertBufferIsJust("abc\\\\\\dede", chars, 0);
@ -125,7 +125,7 @@ public class JavaInputReaderTest {
            Assert.assertEquals(9, read);
            assertBufferIsJust("abc\u00a0dede\u00a0", chars, 0);

-            read = r.read(chars, 9, 10);
+            read = r.read(chars, 9, chars.length - 9);

            Assert.assertEquals(-1, read);
            assertBufferIsJust("abc\u00a0dede\u00a0", chars, 0);
--- a/pmd-cpp/src/main/java/net/sourceforge/pmd/lang/cpp/ast/CppEscapeReader.java
+++ b/pmd-cpp/src/main/java/net/sourceforge/pmd/lang/cpp/ast/CppEscapeReader.java
@ -23,35 +23,30 @@ public class CppEscapeReader extends EscapeAwareReader {
    }

    @Override
-    protected int gobbleMaxWithoutEscape(int bufpos, int maxReadahead) throws IOException {
-        int off = bufpos;
-        int max = min(bufpos + maxReadahead, input.length());
+    protected int gobbleMaxWithoutEscape(final int maxOff) throws IOException {
+        int off = this.bufpos;
        boolean noBackSlash = false;
        int notEscapeEnd = this.savedNotEscapeSpecialEnd;
-        while (off < max && (noBackSlash = input.charAt(off) != '\\' || notEscapeEnd < off)) {
+        while (off < maxOff && (noBackSlash = input.charAt(off) != '\\' || notEscapeEnd < off)) {
            off++;
        }

-        if (noBackSlash) {
+        if (noBackSlash || off == maxOff) {
            this.bufpos = off;
            return off;
        }

        final int backSlackOff = off++;
        if (input.charAt(off) == NEWLINE) {
-            recordEscape(backSlackOff, 2);
-            this.bufpos = off + 2;
-            return backSlackOff;
+            return recordEscape(backSlackOff, 2, 0);
        } else if (input.charAt(off) == CARRIAGE_RETURN) {
            if (input.charAt(++off) == NEWLINE) {
-                recordEscape(backSlackOff, 3);
-                this.bufpos = off + 3;
-                return backSlackOff;
+                return recordEscape(backSlackOff, 3, 0);
            }
        }

        // not an escape sequence
-        int min = min(bufpos + maxReadahead, off);
+        int min = min(maxOff, off);
        // save the number of backslashes that are part of the escape,
        // might have been cut in half by the maxReadahead
        this.savedNotEscapeSpecialEnd = min < off ? off : Integer.MAX_VALUE;
--- a/pmd-cpp/src/test/java/net/sourceforge/pmd/cpd/CppCharStreamTest.java
+++ b/pmd-cpp/src/test/java/net/sourceforge/pmd/cpd/CppCharStreamTest.java
@ -18,7 +18,7 @@ import net.sourceforge.pmd.util.document.TextDocument;
 public class CppCharStreamTest {

    @NonNull
-    public CharStream charStreamFor(String source) {
+    public CharStream charStreamFor(String source) throws IOException {
        return NewCharStream.open(new CPPTokenizer().newTokenDoc(TextDocument.readOnlyString(source)));
    }