From d4be567652fd5f73a5bcab8f18c295e22db1f9af Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Fournier?= Date: Sat, 18 Apr 2020 09:52:28 +0200 Subject: [PATCH] Fix zero-length escapes --- .../lang/ast/impl/io/EscapeAwareReader.java | 37 +++--- .../pmd/lang/ast/impl/io/EscapeTracker.java | 105 ++++++++++++------ .../pmd/lang/ast/impl/io/JavaInputReader.java | 13 +-- .../lang/ast/impl/io/JavaInputReaderTest.java | 4 +- .../pmd/lang/cpp/ast/CppEscapeReader.java | 19 ++-- .../pmd/cpd/CppCharStreamTest.java | 2 +- 6 files changed, 109 insertions(+), 71 deletions(-) diff --git a/pmd-core/src/main/java/net/sourceforge/pmd/lang/ast/impl/io/EscapeAwareReader.java b/pmd-core/src/main/java/net/sourceforge/pmd/lang/ast/impl/io/EscapeAwareReader.java index 51e97a70f1..fa8412e920 100644 --- a/pmd-core/src/main/java/net/sourceforge/pmd/lang/ast/impl/io/EscapeAwareReader.java +++ b/pmd-core/src/main/java/net/sourceforge/pmd/lang/ast/impl/io/EscapeAwareReader.java @@ -13,11 +13,11 @@ import net.sourceforge.pmd.util.StringUtil; import net.sourceforge.pmd.util.document.Chars; /** - * A reader that optionally escapes its input text. It records where + * A reader that can interpret escapes in its input text. It records where * escapes occurred, and can translate an offset in the translated * input document to a line+column position in the original input. * - *

The default implementation does not perform any escaping. + *

The default implementation does not perform any escape translation. */ @SuppressWarnings("PMD.AssignmentInOperand") public class EscapeAwareReader extends Reader { @@ -27,12 +27,6 @@ public class EscapeAwareReader extends Reader { * first backslash is replaced with the translated value of the * escape. The bufpos is updated so that we read the next char * after the escape. - * - *

This makes it so that 1. we don't need an additional buffer for - * translated chars, and 2. the full escape is preserved, just use - * the {@link EscapeTracker} to get the position of the escape and - * replace the first char with a backslash. We can report unnecessary - * escapes that way. */ protected Chars input; /** Position of the next char to read in the input. */ @@ -50,21 +44,30 @@ public class EscapeAwareReader extends Reader { * Translate all the characters in the buffer. */ public int translate() throws IOException { - return read(null, 0, Integer.MAX_VALUE); + return readUnchecked(null, 0, Integer.MAX_VALUE); } @Override - public int read(final char[] cbuf, final int off, final int len) throws IOException { + public int read(final char[] cbuf, final int off, int len) throws IOException { + if (off < 0 || len < 0 || len + off > cbuf.length) { + throw new IndexOutOfBoundsException("cbuf len=" + cbuf.length + " off=" + off + " len=" + len); + } + return readUnchecked(cbuf, off, len); + } + + private int readUnchecked(char[] cbuf, int off, int len) throws IOException { ensureOpen(); if (this.bufpos == input.length()) { return -1; } + len = min(len, input.length()); // remove Integer.MAX_VALUE + int readChars = 0; while (readChars < len && this.bufpos < input.length()) { int bpos = this.bufpos; - int nextJump = gobbleMaxWithoutEscape(bpos, len - readChars); + int nextJump = gobbleMaxWithoutEscape(min(input.length(), bpos + len - readChars)); int newlyReadChars = nextJump - bpos; assert newlyReadChars >= 0 && (readChars + newlyReadChars) <= len; @@ -82,18 +85,20 @@ public class EscapeAwareReader extends Reader { return readChars; } - /** * Returns the max offset, EXclusive, with which we can cut the input * array from the bufpos to dump it into the output array. This sets * the bufpos to where we should start the next jump. */ - protected int gobbleMaxWithoutEscape(final int bufpos, final int maxReadahead) throws IOException { - return this.bufpos = min(bufpos + maxReadahead, input.length()); + protected int gobbleMaxWithoutEscape(int maxOff) throws IOException { + return this.bufpos = maxOff; } - protected void recordEscape(final int startOffsetInclusive, int length) { - this.escapes.recordEscape(startOffsetInclusive, length); + protected int recordEscape(final int startOffsetInclusive, int lengthInSource, int translatedLength) { + assert lengthInSource > 0 && startOffsetInclusive >= 0; + this.escapes.recordEscape(startOffsetInclusive, lengthInSource, translatedLength); + this.bufpos = startOffsetInclusive + lengthInSource; + return startOffsetInclusive + translatedLength; } @Override diff --git a/pmd-core/src/main/java/net/sourceforge/pmd/lang/ast/impl/io/EscapeTracker.java b/pmd-core/src/main/java/net/sourceforge/pmd/lang/ast/impl/io/EscapeTracker.java index 8ed93855da..7b3a23798c 100644 --- a/pmd-core/src/main/java/net/sourceforge/pmd/lang/ast/impl/io/EscapeTracker.java +++ b/pmd-core/src/main/java/net/sourceforge/pmd/lang/ast/impl/io/EscapeTracker.java @@ -16,43 +16,77 @@ import net.sourceforge.pmd.util.document.Chars; class EscapeTracker { private static final int[] EMPTY = new int[0]; + private static final int RECORD_SIZE = 3; - /** + /* * Offsets in the input buffer where a unicode escape occurred. - * Represented as pairs [off, len] where + * Represented as tuples (off, len, invalid) where * - off is the offset in the source file where the escape occurred - * - len is the length in characters of the escape (which is translated to a single char). + * - len is the length of the escape in the input file, eg for \ u 00a0 will be 6 + * - invalid is the last offset in the buffer which contains the translated chars (exclusive) + * + * Eg for "a\u00a0b" (translates as "a b"), the buffer looks like + * [a u00a0b] + * ^ this char has been replaced with the translated value of the escape + * ^^^^^ these characters are only present in the input, we jump over them when reading + * ^ off + * ^ invalid + * ^ off + len + * + * The escape record is (1,6,2) + * + * When reading the buffer we'll copy two blocks + * * "a " + * * then jump over "u00a0" and copy "b" + * + * In general to read until an escape means reading until its 'invalid' + * field, and once that is reached, jump to off + len. + * */ private int[] escapeRecords = EMPTY; /** Index of the next write in the {@link #escapeRecords}. */ private int nextFreeIdx = 0; + /** * Calls to this method must occur in source order (ie param * offsetInInput increases monotonically). */ - void recordEscape(int offsetInInput, int len) { + void recordEscape(int offsetInInput, int lengthInInput, int lengthInOutput) { if (nextFreeIdx + 1 >= escapeRecords.length) { - // double capacity, add 1 to not stay stuck at zero - int[] newOffsets = new int[(escapeRecords.length + 1) * 2]; + // add 1 to not stay stuck at zero + int[] newOffsets = new int[(escapeRecords.length + 1) * RECORD_SIZE]; System.arraycopy(escapeRecords, 0, newOffsets, 0, escapeRecords.length); this.escapeRecords = newOffsets; } escapeRecords[nextFreeIdx++] = offsetInInput; - escapeRecords[nextFreeIdx++] = len - 1; // -1 because the translated escape has length 1 + escapeRecords[nextFreeIdx++] = lengthInInput; + escapeRecords[nextFreeIdx++] = offsetInInput + lengthInOutput; + } + + private int inOff(int idx) { + return escapeRecords[idx]; + } + + private int inLen(int idx) { + return escapeRecords[idx + 1]; + } + + private int invalidIdx(int idx) { + return escapeRecords[idx + 2]; } /** * Convert an offset in the translated file into an offset in * the untranslated input. */ - public int inputOffsetAt(int translatedOffset) { + int inputOffsetAt(int translatedOffset) { // basically accumulate the lengths of all escapes occurring before the given translatedOffset int sum = translatedOffset; - for (int i = 0; i < nextFreeIdx; i += 2) { - if (escapeRecords[i] < sum) { - sum += escapeRecords[i + 1]; + for (int i = 0; i < maxEscape(); i += RECORD_SIZE) { + if (inOff(i) < sum) { + sum += inLen(i); } else { break; } @@ -60,16 +94,24 @@ class EscapeTracker { return sum; } + int maxEscape() { + return nextFreeIdx; + } + @Override public String toString() { StringBuilder res = new StringBuilder("Escape set {"); - for (int i = 0; i < nextFreeIdx; i += 2) { - res.append("(at=").append(escapeRecords[i]).append(", len=").append(escapeRecords[i + 1]).append("), "); + for (int i = 0; i < maxEscape(); i += RECORD_SIZE) { + res.append("(at=").append(inOff(i)) + .append(", inlen=").append(inLen(i)) + .append(", invalidAt=").append(invalidIdx(i)) + .append("), "); } return res.append('}').toString(); } + /** Backend for a CharStream. */ class Cursor { @@ -112,13 +154,14 @@ class EscapeTracker { if (pos == buf.length()) { throw new EOFException(); } + char c; - char c = buf.charAt(pos); - - if (nextEscape < escapeRecords.length && pos == escapeRecords[nextEscape]) { - pos += escapeRecords[nextEscape + 1]; // add escape length - this.nextEscape += 2; + if (nextEscape < maxEscape() && pos == invalidIdx(nextEscape)) { + pos += inLen(nextEscape); // add escape length + c = buf.charAt(pos); + this.nextEscape += RECORD_SIZE; } else { + c = buf.charAt(pos); pos++; } outOffset++; @@ -137,20 +180,20 @@ class EscapeTracker { if (nextEscape <= 0) { pos -= numChars; // then there were no escapes before the 'pos' } else { - int off = pos; - for (int i = nextEscape - 2; i >= 0 && numChars > 0; i -= 2) { - int esc = escapeRecords[i]; - if (esc == off) { - off -= escapeRecords[i + 1]; - } else if (esc > off) { + int inoff = pos; + for (int i = nextEscape - RECORD_SIZE; i >= 0 && numChars > 0; i -= RECORD_SIZE) { + int esc = inOff(i); + if (esc == inoff) { + inoff -= inLen(i); + } else if (esc > inoff) { // then the current escape was before what we're looking at break; } else { - off--; + inoff--; } numChars--; } - pos = off - numChars; + pos = inoff - numChars; } } @@ -174,15 +217,13 @@ class EscapeTracker { int cur = mark; int esc = markEscape; while (cur < pos && esc < nextEscape) { - int escapeOff = escapeRecords[esc]; - assert escapeOff < pos; - sb.append(buf, cur, escapeOff + 1); - cur = escapeOff + escapeRecords[esc + 1]; - esc += 2; + sb.append(buf, cur, invalidIdx(esc)); + cur = inOff(esc) + inLen(esc); + esc += RECORD_SIZE; } // no more escape in the range, append everything until the pos sb.append(buf, cur, pos + 1); - assert sb.length() - prevLength == markLength(); + assert sb.length() - prevLength == markLength() : sb + " should have length " + markLength(); } } diff --git a/pmd-core/src/main/java/net/sourceforge/pmd/lang/ast/impl/io/JavaInputReader.java b/pmd-core/src/main/java/net/sourceforge/pmd/lang/ast/impl/io/JavaInputReader.java index b41211c0ed..53da0c2315 100644 --- a/pmd-core/src/main/java/net/sourceforge/pmd/lang/ast/impl/io/JavaInputReader.java +++ b/pmd-core/src/main/java/net/sourceforge/pmd/lang/ast/impl/io/JavaInputReader.java @@ -32,12 +32,11 @@ public final class JavaInputReader extends EscapeAwareReader { } @Override - protected int gobbleMaxWithoutEscape(final int bufpos, final int maxReadahead) throws IOException { - int off = bufpos; - int max = min(bufpos + maxReadahead, input.length()); + protected int gobbleMaxWithoutEscape(final int maxOff) throws IOException { + int off = this.bufpos; boolean noBackSlash = false; int notEscapeEnd = this.savedNotEscapeSpecialEnd; - while (off < max && (noBackSlash = input.charAt(off) != '\\' || notEscapeEnd < off)) { + while (off < maxOff && (noBackSlash = input.charAt(off) != '\\' || notEscapeEnd < off)) { off++; } @@ -59,12 +58,10 @@ public final class JavaInputReader extends EscapeAwareReader { replaceFirstBackslashWithEscape(firstBslashOff, off); this.savedNotEscapeSpecialEnd = Integer.MAX_VALUE; - this.bufpos = off + 5; - this.recordEscape(firstBslashOff, off + 5 - firstBslashOff); - return firstBslashOff + 1; + return recordEscape(firstBslashOff, off + 5 - firstBslashOff, 1); } else { // not an escape sequence - int min = min(bufpos + maxReadahead, off); + int min = min(maxOff, off); // save the number of backslashes that are part of the escape, // might have been cut in half by the maxReadahead this.savedNotEscapeSpecialEnd = min < off ? off : Integer.MAX_VALUE; diff --git a/pmd-core/src/test/java/net/sourceforge/pmd/lang/ast/impl/io/JavaInputReaderTest.java b/pmd-core/src/test/java/net/sourceforge/pmd/lang/ast/impl/io/JavaInputReaderTest.java index 0f03a7bed9..59ef30a854 100644 --- a/pmd-core/src/test/java/net/sourceforge/pmd/lang/ast/impl/io/JavaInputReaderTest.java +++ b/pmd-core/src/test/java/net/sourceforge/pmd/lang/ast/impl/io/JavaInputReaderTest.java @@ -85,7 +85,7 @@ public class JavaInputReaderTest { Assert.assertEquals(1, read); assertBufferIsJust("abc\\\\", chars, 0); - read = r.read(chars, 5, 8); + read = r.read(chars, 5, chars.length - 5); Assert.assertEquals(5, read); assertBufferIsJust("abc\\\\\\dede", chars, 0); @@ -125,7 +125,7 @@ public class JavaInputReaderTest { Assert.assertEquals(9, read); assertBufferIsJust("abc\u00a0dede\u00a0", chars, 0); - read = r.read(chars, 9, 10); + read = r.read(chars, 9, chars.length - 9); Assert.assertEquals(-1, read); assertBufferIsJust("abc\u00a0dede\u00a0", chars, 0); diff --git a/pmd-cpp/src/main/java/net/sourceforge/pmd/lang/cpp/ast/CppEscapeReader.java b/pmd-cpp/src/main/java/net/sourceforge/pmd/lang/cpp/ast/CppEscapeReader.java index 378dfb1011..4b3f3417b7 100644 --- a/pmd-cpp/src/main/java/net/sourceforge/pmd/lang/cpp/ast/CppEscapeReader.java +++ b/pmd-cpp/src/main/java/net/sourceforge/pmd/lang/cpp/ast/CppEscapeReader.java @@ -23,35 +23,30 @@ public class CppEscapeReader extends EscapeAwareReader { } @Override - protected int gobbleMaxWithoutEscape(int bufpos, int maxReadahead) throws IOException { - int off = bufpos; - int max = min(bufpos + maxReadahead, input.length()); + protected int gobbleMaxWithoutEscape(final int maxOff) throws IOException { + int off = this.bufpos; boolean noBackSlash = false; int notEscapeEnd = this.savedNotEscapeSpecialEnd; - while (off < max && (noBackSlash = input.charAt(off) != '\\' || notEscapeEnd < off)) { + while (off < maxOff && (noBackSlash = input.charAt(off) != '\\' || notEscapeEnd < off)) { off++; } - if (noBackSlash) { + if (noBackSlash || off == maxOff) { this.bufpos = off; return off; } final int backSlackOff = off++; if (input.charAt(off) == NEWLINE) { - recordEscape(backSlackOff, 2); - this.bufpos = off + 2; - return backSlackOff; + return recordEscape(backSlackOff, 2, 0); } else if (input.charAt(off) == CARRIAGE_RETURN) { if (input.charAt(++off) == NEWLINE) { - recordEscape(backSlackOff, 3); - this.bufpos = off + 3; - return backSlackOff; + return recordEscape(backSlackOff, 3, 0); } } // not an escape sequence - int min = min(bufpos + maxReadahead, off); + int min = min(maxOff, off); // save the number of backslashes that are part of the escape, // might have been cut in half by the maxReadahead this.savedNotEscapeSpecialEnd = min < off ? off : Integer.MAX_VALUE; diff --git a/pmd-cpp/src/test/java/net/sourceforge/pmd/cpd/CppCharStreamTest.java b/pmd-cpp/src/test/java/net/sourceforge/pmd/cpd/CppCharStreamTest.java index 9d6df7dd49..a8823f011f 100644 --- a/pmd-cpp/src/test/java/net/sourceforge/pmd/cpd/CppCharStreamTest.java +++ b/pmd-cpp/src/test/java/net/sourceforge/pmd/cpd/CppCharStreamTest.java @@ -18,7 +18,7 @@ import net.sourceforge.pmd.util.document.TextDocument; public class CppCharStreamTest { @NonNull - public CharStream charStreamFor(String source) { + public CharStream charStreamFor(String source) throws IOException { return NewCharStream.open(new CPPTokenizer().newTokenDoc(TextDocument.readOnlyString(source))); }