Implement DOMLineNumbers to determine the line numbers after the parsing

This commit is contained in:
Andreas Dangel
2014-12-07 10:51:28 +01:00
parent 0da50b9e89
commit 6ca24d1f8a
4 changed files with 197 additions and 26 deletions

View File

@ -0,0 +1,155 @@
/**
* BSD-style license; for more info see http://pmd.sourceforge.net/license.html
*/
package net.sourceforge.pmd.lang.xml.ast;
import java.util.Map;
import java.util.TreeMap;
import java.util.regex.Matcher;
import org.apache.commons.lang3.StringUtils;
import org.w3c.dom.Document;
import org.w3c.dom.DocumentType;
import org.w3c.dom.EntityReference;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.w3c.dom.ProcessingInstruction;
/**
 * Assigns begin/end line and column numbers to the nodes of an already parsed
 * DOM document.
 *
 * <p>A DOM parser does not report source positions, so the positions are
 * reconstructed afterwards: the document is walked in document order and each
 * node's markup is searched for in the original XML text, starting at the
 * offset where the previous node ended. The results are stored on every node
 * as user data under the keys {@code XmlNode.BEGIN_LINE},
 * {@code XmlNode.BEGIN_COLUMN}, {@code XmlNode.END_LINE} and
 * {@code XmlNode.END_COLUMN}. Lines and columns are 1-based.</p>
 *
 * <p>The search is a heuristic. A node whose markup cannot be found in the
 * text (for example content that only exists after entity expansion) is
 * reported at the current search offset and does not advance it.</p>
 */
class DOMLineNumbers {
    private static final String DOCTYPE_START = "<!DOCTYPE";
    private static final String COMMENT_START = "<!--";
    private static final String COMMENT_END = "-->";
    private static final String CDATA_START = "<![CDATA[";
    private static final String CDATA_END = "]]>";
    private static final String PI_START = "<?";
    private static final String PI_END = "?>";

    private final Document document;
    private final String xmlString;

    /** Maps the offset of the first character of each line to its 1-based line number. */
    private TreeMap<Integer, Integer> lines;

    /**
     * @param document the parsed document whose nodes should get locations
     * @param xmlString the exact text the document was parsed from
     */
    public DOMLineNumbers(Document document, String xmlString) {
        this.document = document;
        this.xmlString = xmlString;
    }

    /**
     * Computes the locations of all nodes reachable from the document and
     * stores them as user data on the nodes.
     */
    public void determine() {
        calculateLinesMap();
        determineLocation(document, 0);
    }

    /**
     * Locates a node and, recursively, its children in the XML text.
     *
     * @param n the node to locate
     * @param index the offset at which the search for the node starts
     * @return the offset directly behind the node; the search for the next
     *         node in document order starts there
     */
    private int determineLocation(Node n, int index) {
        final int from = Math.max(index, 0);
        final short type = n.getNodeType();
        final String name = n.getNodeName();

        int found = -1;
        int textLength = 0;
        switch (type) {
        case Node.DOCUMENT_TYPE_NODE:
            found = xmlString.indexOf(DOCTYPE_START, from);
            break;
        case Node.COMMENT_NODE:
            found = xmlString.indexOf(COMMENT_START, from);
            break;
        case Node.ELEMENT_NODE:
            found = indexOfName("<" + name, from);
            break;
        case Node.CDATA_SECTION_NODE:
            found = xmlString.indexOf(CDATA_START, from);
            break;
        case Node.PROCESSING_INSTRUCTION_NODE:
            found = indexOfName(PI_START + ((ProcessingInstruction) n).getTarget(), from);
            break;
        case Node.TEXT_NODE: {
            // The source may spell the text with entity references or with
            // the literal characters; whichever spelling occurs first wins.
            String raw = n.getNodeValue();
            String escaped = unexpandEntities(n, raw);
            int escapedAt = xmlString.indexOf(escaped, from);
            int rawAt = escaped.equals(raw) ? escapedAt : xmlString.indexOf(raw, from);
            if (escapedAt >= 0 && (rawAt < 0 || escapedAt <= rawAt)) {
                found = escapedAt;
                textLength = escaped.length();
            } else if (rawAt >= 0) {
                found = rawAt;
                textLength = raw.length();
            }
            break;
        }
        case Node.ENTITY_REFERENCE_NODE:
            found = xmlString.indexOf("&" + name + ";", from);
            break;
        default:
            // e.g. the document node itself: it starts where the search starts
            break;
        }

        // A node that cannot be found keeps the current offset instead of
        // propagating -1, which has no line/column.
        final boolean located = found >= 0;
        final int begin = located ? found : from;
        setBeginLocation(n, begin);

        int nextIndex = begin;
        boolean emptyElementTag = false;
        if (located && type == Node.ELEMENT_NODE) {
            // Skip the start tag, so that children are not matched against
            // the tag of their own parent or against attribute values.
            int afterName = begin + 1 + name.length();
            int tagEnd = findMarkupEnd(afterName);
            if (tagEnd >= 0) {
                emptyElementTag = xmlString.charAt(tagEnd - 1) == '/';
                nextIndex = tagEnd + 1;
            } else {
                nextIndex = afterName;
            }
        }

        if (n.hasChildNodes()) {
            NodeList children = n.getChildNodes();
            for (int i = 0; i < children.getLength(); i++) {
                nextIndex = determineLocation(children.item(i), nextIndex);
            }
        }

        if (located) {
            switch (type) {
            case Node.ELEMENT_NODE:
                if (!emptyElementTag) {
                    int closeTag = indexOfName("</" + name, nextIndex);
                    int closeTagEnd = closeTag < 0 ? -1 : xmlString.indexOf('>', closeTag);
                    if (closeTagEnd >= 0) {
                        nextIndex = closeTagEnd + 1;
                    }
                }
                break;
            case Node.DOCUMENT_TYPE_NODE: {
                int doctypeEnd = findMarkupEnd(begin + DOCTYPE_START.length());
                if (doctypeEnd >= 0) {
                    nextIndex = doctypeEnd + 1;
                }
                break;
            }
            case Node.COMMENT_NODE: {
                int contentStart = begin + COMMENT_START.length();
                nextIndex = skipPast(COMMENT_END, contentStart,
                        contentStart + n.getNodeValue().length() + COMMENT_END.length());
                break;
            }
            case Node.CDATA_SECTION_NODE: {
                int contentStart = begin + CDATA_START.length();
                nextIndex = skipPast(CDATA_END, contentStart,
                        contentStart + n.getNodeValue().length() + CDATA_END.length());
                break;
            }
            case Node.PROCESSING_INSTRUCTION_NODE: {
                ProcessingInstruction pi = (ProcessingInstruction) n;
                int afterTarget = begin + PI_START.length() + pi.getTarget().length();
                nextIndex = skipPast(PI_END, afterTarget,
                        afterTarget + pi.getData().length() + PI_END.length());
                break;
            }
            case Node.TEXT_NODE:
                nextIndex = begin + textLength;
                break;
            case Node.ENTITY_REFERENCE_NODE:
                // The children are the expansion of the reference; in the
                // source only "&name;" is present.
                nextIndex = begin + name.length() + 2;
                break;
            default:
                break;
            }
        }

        // never report an end before the begin
        setEndLocation(n, Math.max(nextIndex - 1, begin));
        return nextIndex;
    }

    /**
     * Searches for {@code prefix} followed by a character that ends an XML
     * name, so that looking for {@code <foo} does not match {@code <foobar}.
     *
     * @return the offset of the match or -1
     */
    private int indexOfName(String prefix, int from) {
        int pos = xmlString.indexOf(prefix, from);
        while (pos >= 0) {
            int after = pos + prefix.length();
            if (after >= xmlString.length() || isNameTerminator(xmlString.charAt(after))) {
                return pos;
            }
            pos = xmlString.indexOf(prefix, pos + 1);
        }
        return -1;
    }

    private static boolean isNameTerminator(char c) {
        return c == '>' || c == '/' || c == '?' || Character.isWhitespace(c);
    }

    /**
     * Finds the {@code >} that closes a start tag or a doctype declaration.
     * Quoted literals, an internal subset in brackets and comments inside it
     * are skipped, since they may contain {@code >} themselves.
     *
     * @return the offset of the closing {@code >} or -1
     */
    private int findMarkupEnd(int from) {
        final int length = xmlString.length();
        boolean inInternalSubset = false;
        char quote = 0;
        int i = Math.max(from, 0);
        while (i < length) {
            char c = xmlString.charAt(i);
            if (quote != 0) {
                if (c == quote) {
                    quote = 0;
                }
            } else if (c == '"' || c == '\'') {
                quote = c;
            } else if (c == '<' && xmlString.startsWith(COMMENT_START, i)) {
                int close = xmlString.indexOf(COMMENT_END, i + COMMENT_START.length());
                if (close < 0) {
                    return -1;
                }
                i = close + COMMENT_END.length() - 1;
            } else if (c == '[') {
                inInternalSubset = true;
            } else if (c == ']') {
                inInternalSubset = false;
            } else if (c == '>' && !inInternalSubset) {
                return i;
            }
            i++;
        }
        return -1;
    }

    /**
     * @return the offset directly behind the next occurrence of
     *         {@code terminator} at or after {@code from}, or {@code fallback}
     *         if there is none
     */
    private int skipPast(String terminator, int from, int fallback) {
        int pos = xmlString.indexOf(terminator, from);
        return pos < 0 ? fallback : pos + terminator.length();
    }

    /**
     * Converts the text of a node back into the form it might have in the
     * source: the five predefined entities are escaped and the replacement
     * text of every entity declared in the doctype is replaced by a reference
     * to that entity. All replacements are literal, not regular expressions.
     *
     * @param n the node the text belongs to, used to look up the doctype
     * @param te the text as reported by the DOM
     * @return the text with entity references
     */
    private String unexpandEntities(Node n, String te) {
        String result = te;
        // implicit entities; "&" must be first, otherwise the ampersands
        // introduced by the other replacements would be escaped again
        result = result.replace("&", "&amp;");
        result = result.replace("<", "&lt;");
        result = result.replace(">", "&gt;");
        result = result.replace("\"", "&quot;");
        result = result.replace("'", "&apos;");

        Document owner = n.getOwnerDocument();
        DocumentType doctype = owner == null ? null : owner.getDoctype();
        if (doctype != null) {
            NamedNodeMap entities = doctype.getEntities();
            int count = entities == null ? 0 : entities.getLength();
            for (int i = 0; i < count; i++) {
                Node item = entities.item(i);
                Node content = item.getFirstChild();
                String value = content == null ? null : content.getNodeValue();
                // an entity without (known) replacement text cannot be mapped back
                if (value != null && value.length() > 0) {
                    result = result.replace(value, "&" + item.getNodeName() + ";");
                }
            }
        }
        return result;
    }

    private void setBeginLocation(Node n, int index) {
        if (n != null) {
            n.setUserData(XmlNode.BEGIN_LINE, toLine(index), null);
            n.setUserData(XmlNode.BEGIN_COLUMN, toColumn(index), null);
        }
    }

    private void setEndLocation(Node n, int index) {
        if (n != null) {
            n.setUserData(XmlNode.END_LINE, toLine(index), null);
            n.setUserData(XmlNode.END_COLUMN, toColumn(index), null);
        }
    }

    /** Records the start offset of every line; a line ends with {@code \n}. */
    private void calculateLinesMap() {
        lines = new TreeMap<Integer, Integer>();
        int line = 1;
        lines.put(0, line);
        int newline = xmlString.indexOf('\n');
        while (newline >= 0) {
            line++;
            lines.put(newline + 1, line);
            newline = xmlString.indexOf('\n', newline + 1);
        }
    }

    /**
     * @param index an offset into the XML text; negative values are treated as 0
     * @return the 1-based number of the line that contains the offset
     */
    private int toLine(int index) {
        return lineStart(index).getValue();
    }

    /**
     * @param index an offset into the XML text; negative values are treated as 0
     * @return the 1-based column of the offset within its line
     */
    private int toColumn(int index) {
        int offset = Math.max(index, 0);
        return offset - lineStart(offset).getKey() + 1;
    }

    /** Looks up the line containing the offset; offset 0 is always a line start. */
    private Map.Entry<Integer, Integer> lineStart(int index) {
        return lines.floorEntry(Math.max(index, 0));
    }
}

View File

@ -5,6 +5,7 @@ package net.sourceforge.pmd.lang.xml.ast;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.lang.reflect.Proxy;
import java.util.Arrays;
import java.util.HashMap;
@ -12,19 +13,19 @@ import java.util.LinkedHashSet;
import java.util.Map;
import java.util.Set;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import net.sourceforge.pmd.lang.ast.ParseException;
import net.sourceforge.pmd.lang.ast.RootNode;
import net.sourceforge.pmd.lang.xml.XmlParserOptions;
import org.apache.commons.io.IOUtils;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;
public class XmlParser {
protected final XmlParserOptions parserOptions;
@ -37,23 +38,24 @@ public class XmlParser {
protected Document parseDocument(Reader reader) throws ParseException {
nodeCache.clear();
try {
SAXParserFactory saxParserFactory = SAXParserFactory.newInstance();
saxParserFactory.setFeature("http://xml.org/sax/features/external-general-entities", false);
saxParserFactory.setFeature("http://xml.org/sax/features/external-parameter-entities", false);
saxParserFactory.setNamespaceAware(parserOptions.isNamespaceAware());
saxParserFactory.setValidating(parserOptions.isValidating());
saxParserFactory.setXIncludeAware(parserOptions.isXincludeAware());
SAXParser saxParser = saxParserFactory.newSAXParser();
String xmlData = IOUtils.toString(reader);
LineNumberAwareSaxHandler handler = new LineNumberAwareSaxHandler(parserOptions);
XMLReader xmlReader = saxParser.getXMLReader();
xmlReader.setContentHandler(handler);
xmlReader.setProperty("http://xml.org/sax/properties/lexical-handler", handler);
xmlReader.setProperty("http://xml.org/sax/properties/declaration-handler", handler);
xmlReader.setEntityResolver(parserOptions.getEntityResolver());
xmlReader.parse(new InputSource(reader));
return handler.getDocument();
DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
dbf.setNamespaceAware(parserOptions.isNamespaceAware());
dbf.setValidating(parserOptions.isValidating());
dbf.setIgnoringComments(parserOptions.isIgnoringComments());
dbf.setIgnoringElementContentWhitespace(parserOptions.isIgnoringElementContentWhitespace());
dbf.setExpandEntityReferences(parserOptions.isExpandEntityReferences());
dbf.setCoalescing(parserOptions.isCoalescing());
dbf.setXIncludeAware(parserOptions.isXincludeAware());
dbf.setFeature("http://xml.org/sax/features/external-general-entities", false);
dbf.setFeature("http://xml.org/sax/features/external-parameter-entities", false);
DocumentBuilder documentBuilder = dbf.newDocumentBuilder();
documentBuilder.setEntityResolver(parserOptions.getEntityResolver());
Document document = documentBuilder.parse(new InputSource(new StringReader(xmlData)));
DOMLineNumbers lineNumbers = new DOMLineNumbers(document, xmlData);
lineNumbers.determine();
return document;
} catch (ParserConfigurationException e) {
throw new ParseException(e);
} catch (SAXException e) {

View File

@ -12,6 +12,7 @@ import java.util.Iterator;
import net.sourceforge.pmd.lang.LanguageRegistry;
import net.sourceforge.pmd.lang.LanguageVersionHandler;
import net.sourceforge.pmd.lang.Parser;
import net.sourceforge.pmd.lang.ParserOptions;
import net.sourceforge.pmd.lang.ast.Node;
import net.sourceforge.pmd.lang.ast.xpath.Attribute;
import net.sourceforge.pmd.lang.xml.ast.XmlNode;
@ -274,7 +275,7 @@ public class XmlParserTest {
assertNode(document, "document", 1);
Node rootElement = document.jjtGetChild(0);
assertNode(rootElement, "pmd:rootElement", 7);
assertNode(rootElement, "pmd:rootElement", 7, "xmlns:pmd", "http://pmd.sf.net");
Assert.assertEquals("http://pmd.sf.net", ((XmlNode)rootElement).getNode().getNamespaceURI());
Assert.assertEquals("pmd", ((XmlNode)rootElement).getNode().getPrefix());
Assert.assertEquals("rootElement", ((XmlNode)rootElement).getNode().getLocalName());
@ -353,6 +354,19 @@ public class XmlParserTest {
}
}
/**
 * Parses a one-line document that starts with an XML declaration followed
 * by the processing instruction {@code <?mypi?>}, with entity reference
 * expansion switched off, and checks the first child of the parsed document
 * against the name "mypi" and the location values 1, 22, 1, 29
 * (presumably begin line/column and end line/column).
 */
@Test
public void testWithProcessingInstructions() {
    final String source = "<?xml version=\"1.0\"?><?mypi?><!DOCTYPE testDoc [<!ENTITY myentity \"e\">]><!--Comment--><foo abc=\"abc\"><bar>TEXT</bar><![CDATA[cdata!]]>&gt;&myentity;&lt;</foo>";

    LanguageVersionHandler handler = LanguageRegistry.getLanguage(XmlLanguageModule.NAME)
            .getDefaultVersion()
            .getLanguageVersionHandler();

    // keep entity references as nodes instead of expanding them
    XmlParserOptions parserOptions = (XmlParserOptions) handler.getDefaultParserOptions();
    parserOptions.setExpandEntityReferences(false);

    Parser xmlParser = handler.getParser(parserOptions);
    Node root = xmlParser.parse(null, new StringReader(source));

    Assert.assertNotNull(root);
    assertNode(root.jjtGetChild(0), "mypi", 0);
    assertLineNumbers(root.jjtGetChild(0), 1, 22, 1, 29);
}
/**
* Asserts a single node inclusive attributes.
* @param node the node

View File

@ -77,10 +77,8 @@ public class AbstractDomXmlRuleTest {
// assertEquals(0, visited.size());
visited = rule.visitedNodes.get("EntityReference");
assertEquals(3, visited.size());
assertEquals("gt", ((EntityReference) visited.get(0)).getNodeName());
assertEquals("entity", ((EntityReference) visited.get(1)).getNodeName());
assertEquals("lt", ((EntityReference) visited.get(2)).getNodeName());
assertEquals(1, visited.size());
assertEquals("entity", ((EntityReference) visited.get(0)).getNodeName());
// TODO Figure out how to trigger this.
// visited = rule.visitedNodes.get("Notation");
@ -92,9 +90,11 @@ public class AbstractDomXmlRuleTest {
((ProcessingInstruction) visited.get(0)).getTarget());
visited = rule.visitedNodes.get("Text");
assertEquals(2, visited.size());
assertEquals(4, visited.size());
assertEquals("TEXT", ((Text) visited.get(0)).getData());
assertEquals("e", ((Text) visited.get(1)).getData());
assertEquals(">", ((Text) visited.get(1)).getData());
assertEquals("e", ((Text) visited.get(2)).getData());
assertEquals("<", ((Text) visited.get(3)).getData());
}
@Test