Fix zero-length escapes

This commit is contained in:
Clément Fournier
2020-04-18 09:52:28 +02:00
parent 2e725038ac
commit d4be567652
6 changed files with 109 additions and 71 deletions

View File

@ -13,11 +13,11 @@ import net.sourceforge.pmd.util.StringUtil;
import net.sourceforge.pmd.util.document.Chars;
/**
* A reader that optionally escapes its input text. It records where
* A reader that can interpret escapes in its input text. It records where
* escapes occurred, and can translate an offset in the translated
* input document to a line+column position in the original input.
*
* <p>The default implementation does not perform any escaping.
* <p>The default implementation does not perform any escape translation.
*/
@SuppressWarnings("PMD.AssignmentInOperand")
public class EscapeAwareReader extends Reader {
@ -27,12 +27,6 @@ public class EscapeAwareReader extends Reader {
* first backslash is replaced with the translated value of the
* escape. The bufpos is updated so that we read the next char
* after the escape.
*
* <p>This makes it so that 1. we don't need an additional buffer for
* translated chars, and 2. the full escape is preserved, just use
* the {@link EscapeTracker} to get the position of the escape and
* replace the first char with a backslash. We can report unnecessary
* escapes that way.
*/
protected Chars input;
/** Position of the next char to read in the input. */
@ -50,21 +44,30 @@ public class EscapeAwareReader extends Reader {
* Translate all the characters in the buffer.
*/
public int translate() throws IOException {
return read(null, 0, Integer.MAX_VALUE);
return readUnchecked(null, 0, Integer.MAX_VALUE);
}
@Override
public int read(final char[] cbuf, final int off, final int len) throws IOException {
public int read(final char[] cbuf, final int off, int len) throws IOException {
if (off < 0 || len < 0 || len + off > cbuf.length) {
throw new IndexOutOfBoundsException("cbuf len=" + cbuf.length + " off=" + off + " len=" + len);
}
return readUnchecked(cbuf, off, len);
}
private int readUnchecked(char[] cbuf, int off, int len) throws IOException {
ensureOpen();
if (this.bufpos == input.length()) {
return -1;
}
len = min(len, input.length()); // remove Integer.MAX_VALUE
int readChars = 0;
while (readChars < len && this.bufpos < input.length()) {
int bpos = this.bufpos;
int nextJump = gobbleMaxWithoutEscape(bpos, len - readChars);
int nextJump = gobbleMaxWithoutEscape(min(input.length(), bpos + len - readChars));
int newlyReadChars = nextJump - bpos;
assert newlyReadChars >= 0 && (readChars + newlyReadChars) <= len;
@ -82,18 +85,20 @@ public class EscapeAwareReader extends Reader {
return readChars;
}
/**
* Returns the max offset, EXclusive, with which we can cut the input
* array from the bufpos to dump it into the output array. This sets
* the bufpos to where we should start the next jump.
*/
protected int gobbleMaxWithoutEscape(final int bufpos, final int maxReadahead) throws IOException {
return this.bufpos = min(bufpos + maxReadahead, input.length());
protected int gobbleMaxWithoutEscape(int maxOff) throws IOException {
return this.bufpos = maxOff;
}
protected void recordEscape(final int startOffsetInclusive, int length) {
this.escapes.recordEscape(startOffsetInclusive, length);
protected int recordEscape(final int startOffsetInclusive, int lengthInSource, int translatedLength) {
assert lengthInSource > 0 && startOffsetInclusive >= 0;
this.escapes.recordEscape(startOffsetInclusive, lengthInSource, translatedLength);
this.bufpos = startOffsetInclusive + lengthInSource;
return startOffsetInclusive + translatedLength;
}
@Override

View File

@ -16,43 +16,77 @@ import net.sourceforge.pmd.util.document.Chars;
class EscapeTracker {
private static final int[] EMPTY = new int[0];
private static final int RECORD_SIZE = 3;
/**
/*
* Offsets in the input buffer where a unicode escape occurred.
* Represented as pairs [off, len] where
* Represented as tuples (off, len, invalid) where
* - off is the offset in the source file where the escape occurred
* - len is the length in characters of the escape (which is translated to a single char).
* - len is the length of the escape in the input file, eg for \ u 00a0 will be 6
* - invalid is the last offset in the buffer which contains the translated chars (exclusive)
*
* Eg for "a\u00a0b" (translates as "a b"), the buffer looks like
* [a u00a0b]
* ^ this char has been replaced with the translated value of the escape
* ^^^^^ these characters are only present in the input, we jump over them when reading
* ^ off
* ^ invalid
* ^ off + len
*
* The escape record is (1,6,2)
*
* When reading the buffer we'll copy two blocks
* * "a "
* * then jump over "u00a0" and copy "b"
*
* In general to read until an escape means reading until its 'invalid'
* field, and once that is reached, jump to off + len.
*
*/
private int[] escapeRecords = EMPTY;
/** Index of the next write in the {@link #escapeRecords}. */
private int nextFreeIdx = 0;
/**
* Calls to this method must occur in source order (ie param
* offsetInInput increases monotonically).
*/
void recordEscape(int offsetInInput, int len) {
void recordEscape(int offsetInInput, int lengthInInput, int lengthInOutput) {
if (nextFreeIdx + 1 >= escapeRecords.length) {
// double capacity, add 1 to not stay stuck at zero
int[] newOffsets = new int[(escapeRecords.length + 1) * 2];
// add 1 to not stay stuck at zero
int[] newOffsets = new int[(escapeRecords.length + 1) * RECORD_SIZE];
System.arraycopy(escapeRecords, 0, newOffsets, 0, escapeRecords.length);
this.escapeRecords = newOffsets;
}
escapeRecords[nextFreeIdx++] = offsetInInput;
escapeRecords[nextFreeIdx++] = len - 1; // -1 because the translated escape has length 1
escapeRecords[nextFreeIdx++] = lengthInInput;
escapeRecords[nextFreeIdx++] = offsetInInput + lengthInOutput;
}
private int inOff(int idx) {
return escapeRecords[idx];
}
private int inLen(int idx) {
return escapeRecords[idx + 1];
}
private int invalidIdx(int idx) {
return escapeRecords[idx + 2];
}
/**
* Convert an offset in the translated file into an offset in
* the untranslated input.
*/
public int inputOffsetAt(int translatedOffset) {
int inputOffsetAt(int translatedOffset) {
// basically accumulate the lengths of all escapes occurring before the given translatedOffset
int sum = translatedOffset;
for (int i = 0; i < nextFreeIdx; i += 2) {
if (escapeRecords[i] < sum) {
sum += escapeRecords[i + 1];
for (int i = 0; i < maxEscape(); i += RECORD_SIZE) {
if (inOff(i) < sum) {
sum += inLen(i);
} else {
break;
}
@ -60,16 +94,24 @@ class EscapeTracker {
return sum;
}
int maxEscape() {
return nextFreeIdx;
}
@Override
public String toString() {
StringBuilder res = new StringBuilder("Escape set {");
for (int i = 0; i < nextFreeIdx; i += 2) {
res.append("(at=").append(escapeRecords[i]).append(", len=").append(escapeRecords[i + 1]).append("), ");
for (int i = 0; i < maxEscape(); i += RECORD_SIZE) {
res.append("(at=").append(inOff(i))
.append(", inlen=").append(inLen(i))
.append(", invalidAt=").append(invalidIdx(i))
.append("), ");
}
return res.append('}').toString();
}
/** Backend for a CharStream. */
class Cursor {
@ -112,13 +154,14 @@ class EscapeTracker {
if (pos == buf.length()) {
throw new EOFException();
}
char c;
char c = buf.charAt(pos);
if (nextEscape < escapeRecords.length && pos == escapeRecords[nextEscape]) {
pos += escapeRecords[nextEscape + 1]; // add escape length
this.nextEscape += 2;
if (nextEscape < maxEscape() && pos == invalidIdx(nextEscape)) {
pos += inLen(nextEscape); // add escape length
c = buf.charAt(pos);
this.nextEscape += RECORD_SIZE;
} else {
c = buf.charAt(pos);
pos++;
}
outOffset++;
@ -137,20 +180,20 @@ class EscapeTracker {
if (nextEscape <= 0) {
pos -= numChars; // then there were no escapes before the 'pos'
} else {
int off = pos;
for (int i = nextEscape - 2; i >= 0 && numChars > 0; i -= 2) {
int esc = escapeRecords[i];
if (esc == off) {
off -= escapeRecords[i + 1];
} else if (esc > off) {
int inoff = pos;
for (int i = nextEscape - RECORD_SIZE; i >= 0 && numChars > 0; i -= RECORD_SIZE) {
int esc = inOff(i);
if (esc == inoff) {
inoff -= inLen(i);
} else if (esc > inoff) {
// then the current escape was before what we're looking at
break;
} else {
off--;
inoff--;
}
numChars--;
}
pos = off - numChars;
pos = inoff - numChars;
}
}
@ -174,15 +217,13 @@ class EscapeTracker {
int cur = mark;
int esc = markEscape;
while (cur < pos && esc < nextEscape) {
int escapeOff = escapeRecords[esc];
assert escapeOff < pos;
sb.append(buf, cur, escapeOff + 1);
cur = escapeOff + escapeRecords[esc + 1];
esc += 2;
sb.append(buf, cur, invalidIdx(esc));
cur = inOff(esc) + inLen(esc);
esc += RECORD_SIZE;
}
// no more escape in the range, append everything until the pos
sb.append(buf, cur, pos + 1);
assert sb.length() - prevLength == markLength();
assert sb.length() - prevLength == markLength() : sb + " should have length " + markLength();
}
}

View File

@ -32,12 +32,11 @@ public final class JavaInputReader extends EscapeAwareReader {
}
@Override
protected int gobbleMaxWithoutEscape(final int bufpos, final int maxReadahead) throws IOException {
int off = bufpos;
int max = min(bufpos + maxReadahead, input.length());
protected int gobbleMaxWithoutEscape(final int maxOff) throws IOException {
int off = this.bufpos;
boolean noBackSlash = false;
int notEscapeEnd = this.savedNotEscapeSpecialEnd;
while (off < max && (noBackSlash = input.charAt(off) != '\\' || notEscapeEnd < off)) {
while (off < maxOff && (noBackSlash = input.charAt(off) != '\\' || notEscapeEnd < off)) {
off++;
}
@ -59,12 +58,10 @@ public final class JavaInputReader extends EscapeAwareReader {
replaceFirstBackslashWithEscape(firstBslashOff, off);
this.savedNotEscapeSpecialEnd = Integer.MAX_VALUE;
this.bufpos = off + 5;
this.recordEscape(firstBslashOff, off + 5 - firstBslashOff);
return firstBslashOff + 1;
return recordEscape(firstBslashOff, off + 5 - firstBslashOff, 1);
} else {
// not an escape sequence
int min = min(bufpos + maxReadahead, off);
int min = min(maxOff, off);
// save the number of backslashes that are part of the escape,
// might have been cut in half by the maxReadahead
this.savedNotEscapeSpecialEnd = min < off ? off : Integer.MAX_VALUE;

View File

@ -85,7 +85,7 @@ public class JavaInputReaderTest {
Assert.assertEquals(1, read);
assertBufferIsJust("abc\\\\", chars, 0);
read = r.read(chars, 5, 8);
read = r.read(chars, 5, chars.length - 5);
Assert.assertEquals(5, read);
assertBufferIsJust("abc\\\\\\dede", chars, 0);
@ -125,7 +125,7 @@ public class JavaInputReaderTest {
Assert.assertEquals(9, read);
assertBufferIsJust("abc\u00a0dede\u00a0", chars, 0);
read = r.read(chars, 9, 10);
read = r.read(chars, 9, chars.length - 9);
Assert.assertEquals(-1, read);
assertBufferIsJust("abc\u00a0dede\u00a0", chars, 0);

View File

@ -23,35 +23,30 @@ public class CppEscapeReader extends EscapeAwareReader {
}
@Override
protected int gobbleMaxWithoutEscape(int bufpos, int maxReadahead) throws IOException {
int off = bufpos;
int max = min(bufpos + maxReadahead, input.length());
protected int gobbleMaxWithoutEscape(final int maxOff) throws IOException {
int off = this.bufpos;
boolean noBackSlash = false;
int notEscapeEnd = this.savedNotEscapeSpecialEnd;
while (off < max && (noBackSlash = input.charAt(off) != '\\' || notEscapeEnd < off)) {
while (off < maxOff && (noBackSlash = input.charAt(off) != '\\' || notEscapeEnd < off)) {
off++;
}
if (noBackSlash) {
if (noBackSlash || off == maxOff) {
this.bufpos = off;
return off;
}
final int backSlackOff = off++;
if (input.charAt(off) == NEWLINE) {
recordEscape(backSlackOff, 2);
this.bufpos = off + 2;
return backSlackOff;
return recordEscape(backSlackOff, 2, 0);
} else if (input.charAt(off) == CARRIAGE_RETURN) {
if (input.charAt(++off) == NEWLINE) {
recordEscape(backSlackOff, 3);
this.bufpos = off + 3;
return backSlackOff;
return recordEscape(backSlackOff, 3, 0);
}
}
// not an escape sequence
int min = min(bufpos + maxReadahead, off);
int min = min(maxOff, off);
// save the number of backslashes that are part of the escape,
// might have been cut in half by the maxReadahead
this.savedNotEscapeSpecialEnd = min < off ? off : Integer.MAX_VALUE;

View File

@ -18,7 +18,7 @@ import net.sourceforge.pmd.util.document.TextDocument;
public class CppCharStreamTest {
@NonNull
public CharStream charStreamFor(String source) {
public CharStream charStreamFor(String source) throws IOException {
return NewCharStream.open(new CPPTokenizer().newTokenDoc(TextDocument.readOnlyString(source)));
}