Abstract backslash escape readers

This commit is contained in:
Clément Fournier
2020-04-18 16:27:53 +02:00
parent fd375e4bcd
commit 41d747ead2
5 changed files with 117 additions and 77 deletions

View File

@ -0,0 +1,68 @@
/*
* BSD-style license; for more info see http://pmd.sourceforge.net/license.html
*/
package net.sourceforge.pmd.lang.ast.impl.javacc;
import static java.lang.Integer.min;
import java.io.IOException;
import net.sourceforge.pmd.util.document.Chars;
/**
* A base class for readers that handle escapes starting with a backslash.
*/
public abstract class BackslashEscapeReader extends EscapeAwareReader {

    /** The character that introduces every escape handled by subclasses. */
    private static final char BACKSLASH = '\\';

    /**
     * Marker set by {@link #abortEscape(int, int)} when a run of characters
     * that was judged not to be an escape extends past the read limit. It
     * holds the offset at which that run ends, or {@link Integer#MAX_VALUE}
     * when nothing is pending. The intent is to avoid re-interpreting the
     * tail of such a run on the next read (cutting an even number of
     * backslashes in the middle can leave an odd number behind).
     *
     * <p>As implemented, {@link #gobbleMaxWithoutEscape(int)} hands a
     * backslash at offset {@code off} to {@link #handleBackslash(int, int)}
     * only while {@code off <= savedNotEscapeSpecialEnd}.
     * NOTE(review): that direction looks opposite to the stated intent
     * ("not special until the offset is crossed") -- confirm before relying
     * on it.
     */
    private int savedNotEscapeSpecialEnd = Integer.MAX_VALUE;

    public BackslashEscapeReader(Chars input) {
        super(input);
    }

    /**
     * Scans forward from {@link #bufpos} up to {@code maxOff} (exclusive),
     * looking for a backslash that has to be examined as a possible escape.
     * If none is found before {@code maxOff}, {@link #bufpos} is moved to
     * the stop offset and that offset is returned. Otherwise the decision
     * is delegated to {@link #handleBackslash(int, int)}.
     *
     * @param maxOff exclusive upper bound of the region to scan
     *
     * @return the stop offset, or whatever {@code handleBackslash} returns
     */
    @Override
    protected int gobbleMaxWithoutEscape(final int maxOff) throws IOException {
        final int specialLimit = this.savedNotEscapeSpecialEnd;
        int cursor = this.bufpos;
        // stays false if the loop body never runs, exactly like a scan
        // that stopped on a backslash
        boolean plainChar = false;

        while (cursor < maxOff) {
            plainChar = input.charAt(cursor) != BACKSLASH || specialLimit < cursor;
            if (!plainChar) {
                break;
            }
            cursor++;
        }

        if (!plainChar && cursor != maxOff) {
            return handleBackslash(maxOff, cursor);
        }

        this.bufpos = cursor;
        return cursor;
    }

    /**
     * Decides what to do with the backslash found at {@code firstBackslashOff}.
     * The returned value is returned as-is by {@link #gobbleMaxWithoutEscape(int)}.
     * Implementations are expected to finish through either
     * {@link #recordEscape(int, int, int)} or {@link #abortEscape(int, int)}.
     *
     * @param maxOff            exclusive upper bound passed to the current scan
     * @param firstBackslashOff offset of the backslash that stopped the scan
     */
    protected abstract int handleBackslash(int maxOff, int firstBackslashOff) throws IOException;

    /**
     * Clears the pending "not an escape" marker, then records the escape
     * through the superclass implementation.
     */
    @Override
    protected int recordEscape(int startOffsetInclusive, int lengthInSource, int translatedLength) {
        this.savedNotEscapeSpecialEnd = Integer.MAX_VALUE;
        return super.recordEscape(startOffsetInclusive, lengthInSource, translatedLength);
    }

    /**
     * To be called by {@link #handleBackslash(int, int)} when the characters
     * it looked at are not an escape sequence. Moves {@link #bufpos} to the
     * smaller of {@code off} and {@code maxOff} and returns that offset.
     *
     * @param off    offset at which the examined run of characters ends
     * @param maxOff exclusive upper bound of the current scan
     *
     * @return the new value of {@link #bufpos}
     */
    protected int abortEscape(int off, int maxOff) {
        final int stop = min(maxOff, off);
        if (stop < off) {
            // the run was cut short by the read limit: remember where it
            // really ends for the next scan
            this.savedNotEscapeSpecialEnd = off;
        } else {
            this.savedNotEscapeSpecialEnd = Integer.MAX_VALUE;
        }
        this.bufpos = stop;
        return stop;
    }
}

View File

@ -10,16 +10,27 @@ package net.sourceforge.pmd.lang.ast.impl.javacc;
import static java.lang.Integer.min;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.Reader;
import net.sourceforge.pmd.internal.util.AssertionUtil;
import net.sourceforge.pmd.util.StringUtil;
import net.sourceforge.pmd.util.document.Chars;
/**
* A reader that can interpret escapes in its input text. It records where
* escapes occurred, and can translate an offset in the translated
* input document to a line+column position in the original input.
* A reader that may interpret escapes in its input text. It records
* where escapes occurred, and can translate an offset in the translated
* document (the "output") to line/column/offset coordinates in the
* original input. It uses a single char buffer to store both input and
* translated output, and is overall very optimised for the case where
* there are very few escapes. {@link CharStream} is the API to navigate
* on a translated document (with arbitrary backtrack abilities).
*
* <p>This is useful to back a {@link CharStream} for JavaCC implementation,
* but can also be used as a plain {@link Reader} if using other parser/lexer
* implementations. The reader behaviour is optimised for block IO and has
* poor char-by-char performance. Use a {@link BufferedReader} if you need it.
*
* <p>The default implementation does not perform any escape translation.
*/
@ -39,7 +50,7 @@ public class EscapeAwareReader extends Reader {
final EscapeTracker escapes = new EscapeTracker();
public EscapeAwareReader(Chars input) {
assert input != null;
AssertionUtil.requireParamNotNull("input", input);
this.input = input.mutableCopy();
bufpos = 0;
}
@ -91,15 +102,17 @@ public class EscapeAwareReader extends Reader {
/**
* Returns the max offset, EXclusive, with which we can cut the input
* array from the bufpos to dump it into the output array. This sets
* the bufpos to where we should start the next jump.
* array from the bufpos to dump it into the output array. This must
* set the {@link #bufpos} to where we should start reading next (INclusive).
* If applicable, it must also replace in the buffer the start of
* the escape with its translation.
*/
protected int gobbleMaxWithoutEscape(int maxOff) throws IOException {
return this.bufpos = maxOff;
}
protected int recordEscape(final int startOffsetInclusive, int lengthInSource, int translatedLength) {
assert lengthInSource > 0 && startOffsetInclusive >= 0;
assert lengthInSource > 0 && lengthInSource >= translatedLength && startOffsetInclusive >= 0;
this.escapes.recordEscape(startOffsetInclusive, lengthInSource, translatedLength);
this.bufpos = startOffsetInclusive + lengthInSource;
return startOffsetInclusive + translatedLength;
@ -142,10 +155,20 @@ public class EscapeAwareReader extends Reader {
return escapes.inputOffsetAt(outputOffset);
}
/**
* The parameter is an *input* offset, if you got this offset from
* somewhere else than the input buffer you must first translate it
* back with {@link #inputOffset(int)}. This implementation is very
* inefficient but currently is only used for error messages (which
* obviously are exceptional).
*/
public int getLine(int idxInInput) {
return StringUtil.lineNumberAt(input, idxInInput);
}
/**
* @see #getLine(int)
*/
public int getColumn(int idxInInput) {
return StringUtil.columnNumberAt(input, idxInInput);
}

View File

@ -23,6 +23,9 @@ import net.sourceforge.pmd.util.document.Chars;
* than the escape.
* - C++ translates newline escapes (1 or 2 chars) to zero chars (an important corner case)
* - Java translates arbitrary-length unicode escapes (>= 6 chars) to 1 char
*
* <p>This class is tightly coupled to what {@link EscapeAwareReader}
* does with its buffer.
*/
class EscapeTracker {

View File

@ -8,53 +8,29 @@
package net.sourceforge.pmd.lang.ast.impl.javacc;
import static java.lang.Integer.min;
import java.io.BufferedReader;
import java.io.IOException;
import net.sourceforge.pmd.util.document.Chars;
/**
* An implementation of java.io.Reader that translates Java unicode escapes.
* This implementation has efficient block IO but poor char-by-char performance.
* If this is required, wrap it into a {@link BufferedReader}.
* An implementation of {@link EscapeAwareReader} that translates Java
* unicode escapes.
*/
@SuppressWarnings("PMD.AssignmentInOperand")
public final class JavaInputReader extends EscapeAwareReader {
/**
* An offset until which we read backslashes and decided they were not
* an escape. The read procedure may cut off in the middle of the escape,
* and turn an even num of backslashes into an odd one, so until we crossed
* this offset, backslashes are not treated specially.
*/
private int savedNotEscapeSpecialEnd = Integer.MAX_VALUE;
public final class JavaInputReader extends BackslashEscapeReader {
public JavaInputReader(Chars input) {
super(input);
}
@Override
protected int gobbleMaxWithoutEscape(final int maxOff) throws IOException {
int off = this.bufpos;
boolean noBackSlash = false;
int notEscapeEnd = this.savedNotEscapeSpecialEnd;
while (off < maxOff && (noBackSlash = input.charAt(off) != '\\' || notEscapeEnd < off)) {
off++;
}
if (noBackSlash) {
this.bufpos = off;
return off;
}
final int firstBslashOff = off;
protected int handleBackslash(final int maxOff, final int firstBackslashOff) throws IOException {
int off = firstBackslashOff;
while (off < input.length() && input.charAt(off) == '\\') {
off++;
}
int bslashCount = off - firstBslashOff;
int bslashCount = off - firstBackslashOff;
// is there an escape at offset firstBackslashOff?
if ((bslashCount & 1) == 1 // odd number of backslashes
&& off < input.length() && input.charAt(off) == 'u') { // at least one 'u'
@ -63,17 +39,10 @@ public final class JavaInputReader extends EscapeAwareReader {
// consume all the 'u's
off++;
}
int end = replaceFirstBackslashWithEscape(firstBslashOff, off - 1);
this.savedNotEscapeSpecialEnd = Integer.MAX_VALUE;
return recordEscape(firstBslashOff, end - firstBslashOff, 1);
int end = replaceFirstBackslashWithEscape(firstBackslashOff, off - 1);
return recordEscape(firstBackslashOff, end - firstBackslashOff, 1);
} else {
// not an escape sequence
int min = min(maxOff, off);
// save the number of backslashes that are part of the escape,
// might have been cut in half by the maxReadahead
this.savedNotEscapeSpecialEnd = min < off ? off : Integer.MAX_VALUE;
this.bufpos = min;
return min;
return abortEscape(off, maxOff);
}
}

View File

@ -4,53 +4,30 @@
package net.sourceforge.pmd.lang.cpp.ast;
import static java.lang.Integer.min;
import java.io.IOException;
import net.sourceforge.pmd.lang.ast.impl.javacc.EscapeAwareReader;
import net.sourceforge.pmd.lang.ast.impl.javacc.BackslashEscapeReader;
import net.sourceforge.pmd.util.document.Chars;
public class CppEscapeReader extends EscapeAwareReader {
public class CppEscapeReader extends BackslashEscapeReader {
private static final char NEWLINE = '\n';
private static final char CARRIAGE_RETURN = '\r';
private int savedNotEscapeSpecialEnd = Integer.MAX_VALUE;
public CppEscapeReader(Chars input) {
super(input);
}
@Override
protected int gobbleMaxWithoutEscape(final int maxOff) throws IOException {
int off = this.bufpos;
boolean noBackSlash = false;
int notEscapeEnd = this.savedNotEscapeSpecialEnd;
while (off < maxOff && (noBackSlash = input.charAt(off) != '\\' || notEscapeEnd < off)) {
off++;
}
protected int handleBackslash(int maxOff, final int backSlashOff) {
int off = backSlashOff;
if (noBackSlash || off == maxOff) {
this.bufpos = off;
return off;
}
final int backSlackOff = off++;
if (input.charAt(off) == NEWLINE) {
return recordEscape(backSlackOff, 2, 0);
return recordEscape(backSlashOff, 2, 0);
} else if (input.charAt(off) == CARRIAGE_RETURN) {
if (input.charAt(++off) == NEWLINE) {
return recordEscape(backSlackOff, 3, 0);
return recordEscape(backSlashOff, 3, 0);
}
}
// not an escape sequence
int min = min(maxOff, off);
// save the number of backslashes that are part of the escape,
// might have been cut in half by the maxReadahead
this.savedNotEscapeSpecialEnd = min < off ? off : Integer.MAX_VALUE;
this.bufpos = min;
return min;
return abortEscape(off, maxOff);
}
}