// Copyright 2012 Georg-August-Universität Göttingen, Germany // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package de.ugoe.cs.autoquest.plugin.html; import java.io.File; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.IOException; import java.util.HashMap; import java.util.LinkedList; import java.util.List; import java.util.Map; import java.util.Properties; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.xml.sax.SAXException; import de.ugoe.cs.autoquest.eventcore.Event; import de.ugoe.cs.autoquest.eventcore.IEventType; import de.ugoe.cs.autoquest.eventcore.guimodel.GUIModel; import de.ugoe.cs.autoquest.eventcore.guimodel.GUIModelException; import de.ugoe.cs.autoquest.eventcore.guimodel.IGUIElement; import de.ugoe.cs.autoquest.plugin.html.eventcore.HTMLEventTypeFactory; import de.ugoe.cs.autoquest.plugin.html.guimodel.HTMLDocument; import de.ugoe.cs.autoquest.plugin.html.guimodel.HTMLDocumentSpec; import de.ugoe.cs.autoquest.plugin.html.guimodel.HTMLGUIElement; import de.ugoe.cs.autoquest.plugin.html.guimodel.HTMLGUIElementSpec; import de.ugoe.cs.autoquest.plugin.html.guimodel.HTMLPageElement; import de.ugoe.cs.autoquest.plugin.html.guimodel.HTMLPageElementSpec; import de.ugoe.cs.autoquest.plugin.html.guimodel.HTMLServerSpec; /** *

* This class provides the functionality to parse XML log files generated by the HTMLMonitor of * AutoQUEST. The result of parsing a file is a collection of event sequences and a GUI model. *

*

* The parser can be configured with parsing parameters to adapt, e.g., ids or ignore indexes of
 * parsed GUI elements. Details can be found in the manual pages of the respective parsing commands.

*
 * @author Fabian Glaser, Patrick Harms
 * @version 1.0
 *
 */
public class HTMLLogParser extends AbstractDefaultLogParser {

    /**
     * the pattern used for parsing HTML GUI element paths. A path element has the form
     * tagName[index] or tagName(htmlId=id): group 1 is the tag name, group 3 the index and
     * group 4 the HTML id. The pattern is immutable and therefore compiled only once.
     */
    private static final Pattern htmlElementPattern =
        Pattern.compile("(\\w+)(\\[(\\d+)\\]|\\(htmlId=([\\w-]+)\\))");

    /**
     * file containing parameters to influence parsing; null if no parsing parameters are to be
     * considered
     */
    private String parseParamFile;

    /**
     * a map containing replacement specifications for ids of GUI elements. The key is the name
     * of the last tag of the specification path, the value are all specifications ending with
     * this tag. The map is null as long as the parsing parameters have not been read.
     */
    private Map<String, List<ReplacementSpecification>> replacementSpecifications;

    /**
     *

* initializes the parser with the file containing parsing parameters to be considered *

* * @param parseParamFile the parsing parameters to be considered */ public HTMLLogParser(String parseParamFile) { this.parseParamFile = parseParamFile; } /* (non-Javadoc) * @see de.ugoe.cs.autoquest.plugin.html.AbstractDefaultLogParser#handleGUIElement(String, Map) */ @Override protected boolean handleGUIElement(String id, Map parameters) throws SAXException { ensureParsingParameters(); HTMLGUIElementSpec specification = null; String parentId = parameters.get("parent"); HTMLGUIElement parent = (HTMLGUIElement) super.getGUIElementTree().find(parentId); if (parameters.containsKey("host")) { // this is a server specification int port = 80; String portStr = parameters.get("port"); if (portStr != null) { port = Integer.parseInt(portStr); } specification = new HTMLServerSpec(parameters.get("host"), port); } else if (parameters.containsKey("path")) { // this is a document specification if (parent != null) { if (!(parent.getSpecification() instanceof HTMLServerSpec)) { throw new SAXException ("invalid log: parent GUI element of a document is not of type server"); } specification = new HTMLDocumentSpec ((HTMLServerSpec) parent.getSpecification(), parameters.get("path"), parameters.get("query"), parameters.get("title")); } else if (parentId == null) { throw new SAXException("invalid log: a document has no parent id"); } } else if (parameters.containsKey("tagname")) { String tagName = parameters.get("tagname"); if (!tagNameMustBeConsidered(tagName)) { return true; } if (parent != null) { if (!childrenMustBeConsidered(parent)) { return true; } IGUIElement document = parent; while ((document != null) && (!(document.getSpecification() instanceof HTMLDocumentSpec))) { document = document.getParent(); } if (document == null) { throw new SAXException ("invalid log: parent hierarchy of a page element does not contain a " + "document"); } int index = -1; String indexStr = parameters.get("index"); if ((indexStr != null) && (!"".equals(indexStr))) { index = Integer.parseInt(indexStr); 
} String htmlId = parameters.get("htmlid"); String replacement = getReplacementMapping(tagName, index, htmlId, parent); if (replacement != null) { if (replacement.startsWith("CLEAR_INDEX,")) { index = -1; replacement = replacement.substring("CLEAR_INDEX,".length()); } else if ("CLEAR_INDEX".equals(replacement)) { index = -1; replacement = htmlId; } if ("".equals(replacement)) { htmlId = null; } else { htmlId = replacement; } } if ((htmlId == null) && (index == -1)) { // set at least a default index, if all is to be ignored. index = 0; } specification = new HTMLPageElementSpec ((HTMLDocumentSpec) document.getSpecification(), tagName.intern(), htmlId == null ? null : htmlId.intern(), index); } else if (parentId == null) { throw new SAXException("invalid log: a page element has no parent id"); } } else { throw new SAXException("invalid log: unknown GUI element"); } if (specification != null) { try { super.getGUIElementTree().add(id, parentId, specification); } catch (GUIModelException e) { throw new SAXException("could not handle GUI element with id " + id + ": " + e.getMessage(), e); } return true; } else { return false; } } /** *

* returns the replacement mapping for the tag specified by the parameters, if a mapping exists. *

* * @param tagName the tag of the considered GUI element * @param index the index of the GUI element * @param id the id of the GUI element * @param parent the parent GUI element of the considered GUI element * * @return the replacement mapping, if any is configured; null else */ private String getReplacementMapping(String tagName, int index, String htmlId, HTMLGUIElement parent) { List mappingCandidates = replacementSpecifications.get(tagName); if (mappingCandidates != null) { for (ReplacementSpecification replacementSpec : mappingCandidates) { if (replacementSpec.matches(tagName, index, htmlId, parent)) { return replacementSpec.getReplacement(); } } } return null; } /* (non-Javadoc) * @see de.ugoe.cs.autoquest.plugin.html.AbstractDefaultLogParser#handleEvent(String, Map) */ @Override protected boolean handleEvent(String type, Map parameters) throws SAXException { String targetId = parameters.get("target"); if (targetId == null) { if (replacementSpecifications.size() != 0) { throw new SAXException ("old log file versions can not be parsed with parse parameters"); } String targetDocument = parameters.get("targetDocument"); String targetDOMPath = parameters.get("targetDOMPath"); if ((targetDocument == null) || (targetDOMPath == null)) { throw new SAXException("event has no target defined"); } targetId = determineTargetId(targetDocument, targetDOMPath); if (targetId == null) { // the target id can not be determined yet return false; } } IGUIElement target = super.getGUIElementTree().find(targetId); if (target == null) { // event not processible yet return false; } IEventType eventType = HTMLEventTypeFactory.getInstance().getEventType(type, parameters, target); if (eventType != null) { Event event = new Event(eventType, target); String timestampStr = parameters.get("timestamp"); if (timestampStr != null) { event.setTimestamp(Long.parseLong(timestampStr)); } ((HTMLGUIElement) event.getTarget()).markUsed(); super.addToSequence(event); } // else ignore unknown event type 
return true; } /** *

* reads parsing parameters from the config file and makes them available for the parsing * process *

*/ private void ensureParsingParameters() throws SAXException { if (replacementSpecifications == null) { replacementSpecifications = new HashMap>(); if (parseParamFile != null) { Properties props = new Properties(); FileInputStream stream = null; try { stream = new FileInputStream(new File(parseParamFile)); props.load(stream); } catch (FileNotFoundException e) { throw new SAXException("could not find file " + parseParamFile, e); } catch (IOException e) { throw new SAXException("error reading file " + parseParamFile, e); } finally { if (stream != null) { try { stream.close(); } catch (IOException e) { // ignore } } } for (Map.Entry entry : props.entrySet()) { ReplacementSpecification replSpec = new ReplacementSpecification ((String) entry.getKey(), (String) entry.getValue()); List similarReplSpecs = replacementSpecifications.get(replSpec.getLastTagName()); if (similarReplSpecs == null) { similarReplSpecs = new LinkedList(); replacementSpecifications.put(replSpec.getLastTagName(), similarReplSpecs); } similarReplSpecs.add(replSpec); } } } } /** *

* used to determine the id of a target denoted by an event. This is only required for older * document formats. The new formats use concrete ids. *

*/ private String determineTargetId(String targetDocument, String targetDOMPath) throws SAXException { IGUIElement document = super.getGUIElementTree().find(targetDocument); if (document == null) { return null; } if (!(document.getSpecification() instanceof HTMLDocumentSpec)) { throw new SAXException("an id that should refer to an HTML document refers to" + "something else"); } GUIModel model = super.getGUIElementTree().getGUIModel(); IGUIElement child = document; String[] pathElements = targetDOMPath.split("/"); int pathIndex = 0; HTMLPageElementSpec compareSpec; String tagName; int index; String htmlId; while ((pathIndex < pathElements.length) && (child != null)) { if ((pathElements[pathIndex] != null) && (!"".equals(pathElements[pathIndex]))) { Matcher matcher = htmlElementPattern.matcher(pathElements[pathIndex]); if (!matcher.matches()) { throw new SAXException ("could not parse target DOM path element " + pathElements[pathIndex]); } tagName = matcher.group(1); String indexStr = matcher.group(3); htmlId = matcher.group(4); index = -1; if ((indexStr != null) && (!"".equals(indexStr))) { index = Integer.parseInt(indexStr); } compareSpec = new HTMLPageElementSpec ((HTMLDocumentSpec) document.getSpecification(), tagName, htmlId, index); List children = model.getChildren(child); child = null; for (IGUIElement candidate : children) { if (compareSpec.getSimilarity(candidate.getSpecification())) { child = candidate; break; } } } pathIndex++; } if (child != null) { return super.getGUIElementTree().find(child); } else { return null; } } /** *

* checks if tags with the provided name must be handled in the GUI model. As an example, * it is not necessary to handle "head" tags and anything included in them. *

* * @param tagName the tag name to check * * @return true, if the tag must be considered, false else */ private boolean tagNameMustBeConsidered(String tagName) { if (!tagName.startsWith("input_")) { for (int i = 0; i < tagName.length(); i++) { // all known HTML tags are either letters or digits, but nothing else. Any GUI model // containing something different is proprietary and, therefore, ignored. if (!Character.isLetterOrDigit(tagName.charAt(i))) { return false; } } } return !"head".equals(tagName) && !"title".equals(tagName) && !"script".equals(tagName) && !"style".equals(tagName) && !"link".equals(tagName) && !"meta".equals(tagName) && !"iframe".equals(tagName) && !"input_hidden".equals(tagName) && !"option".equals(tagName) && !"tt".equals(tagName) && !"br".equals(tagName) && !"colgroup".equals(tagName) && !"col".equals(tagName) && !"hr".equals(tagName) && !"param".equals(tagName) && !"sfmsg".equals(tagName) && !"wappalyzerdata".equals(tagName); } /** *

* checks if the children of a specified parent must be added to the GUI model or not. *

* * @param parent the parent tag to check * * @return true, if the child of the tag must be considered, false else */ private boolean childrenMustBeConsidered(HTMLGUIElement parent) { if (parent instanceof HTMLPageElement) { return !"svg".equals(((HTMLPageElement) parent).getTagName()); } else { return true; } } /** *

* specification for a replacement consisting of a path of tag or document specifications
     * and the appropriate replacement.
     */
    private static class ReplacementSpecification {

        /**
         * the pattern used for parsing the elements of a parsing parameter key. An element is
         * either a document specification (group 1, with the path in group 2) or a tag
         * specification (tag name in group 4, optional index in group 6, optional id in
         * group 7).
         */
        private static final Pattern htmlElementSpecPattern = Pattern.compile
            ("(document\\(path=([\\w/-]+)\\))|((\\w+)(\\[(\\d+)\\]|\\(htmlId=([\\w-_#]+)\\))?)");

        /**
         * the path of specifications (tags and document) specifying the tag for which this
         * replacement is specified
         */
        private final List<Spec> specs = new LinkedList<Spec>();

        /**
         * the name of the last tag in the specification path (used for indexing purposes)
         */
        private final String lastTagName;

        /**
         * the configured replacement
         */
        private final String replacement;

        /**
         * initializes the specification with the key/value strings from the config file. Parses
         * the key to get the specification path consisting of, optionally, a document
         * specification and one or more tag specification.
         *
         * @param tagSpec     the key from the config file, i.e., the specification path
         * @param replacement the value from the config file, i.e., the replacement
         *
         * @throws IllegalArgumentException if an element of the path can not be parsed or if
         *                                  the path does not end with a tag specification
         */
        public ReplacementSpecification(String tagSpec, String replacement) {
            for (String pathElement : split(tagSpec)) {
                Matcher matcher = htmlElementSpecPattern.matcher(pathElement);

                if (!matcher.matches()) {
                    throw new IllegalArgumentException
                        ("illegal tag specification " + pathElement);
                }

                if (matcher.group(1) != null) {
                    this.specs.add(new DocumentSpec(matcher.group(2)));
                }
                else if (matcher.group(4) != null) {
                    String indexConditionStr = matcher.group(6);
                    Integer indexCondition = null;

                    if (indexConditionStr != null) {
                        try {
                            indexCondition = Integer.parseInt(indexConditionStr);
                        }
                        catch (NumberFormatException e) {
                            throw new IllegalArgumentException
                                ("illegal tag index specification " + indexConditionStr, e);
                        }
                    }

                    this.specs.add
                        (new TagSpec(matcher.group(4), indexCondition, matcher.group(7)));
                }
            }

            // the replacement applies to the last element of the path, which must be a tag.
            // Reject empty paths and paths ending with a document explicitly instead of failing
            // with an IndexOutOfBoundsException or ClassCastException.
            if (this.specs.isEmpty() ||
                !(this.specs.get(this.specs.size() - 1) instanceof TagSpec))
            {
                throw new IllegalArgumentException
                    ("illegal tag specification " + tagSpec +
                     ": the last element must be a tag");
            }

            this.lastTagName = ((TagSpec) this.specs.get(this.specs.size() - 1)).getTagName();

            this.replacement = replacement;
        }

        /**
         * convenience method to split the key of a key/value pair from the config file into its
         * parts. The key is split at slashes that are not enclosed in parentheses, so that
         * slashes in the path of a document specification are preserved. Empty parts are
         * dropped.
         *
         * @param tagSpec the key to split
         *
         * @return the parts of the key in their original order
         */
        private List<String> split(String tagSpec) {
            List<String> specs = new LinkedList<String>();

            StringBuilder currentSpec = new StringBuilder();
            int openBraces = 0;

            for (int i = 0; i < tagSpec.length(); i++) {
                char curChar = tagSpec.charAt(i);

                if ((openBraces == 0) && ('/' == curChar) && (currentSpec.length() > 0)) {
                    // separator outside of parentheses --> the current part is complete
                    specs.add(currentSpec.toString());
                    currentSpec.setLength(0);
                }
                else {
                    if ('(' == curChar) {
                        openBraces++;
                    }
                    else if (')' == curChar) {
                        openBraces--;
                    }
                    currentSpec.append(curChar);
                }
            }

            if (currentSpec.length() > 0) {
                specs.add(currentSpec.toString());
            }

            return specs;
        }

        /**
         * checks, if the tag identified by the parameters matches this specification. The
         * specification path is compared from its last element to its first one: the last
         * element is compared with the provided tag, the preceding ones with the parent, the
         * parent of the parent, and so on.
         *
         * @param tagName the name of the tag to check
         * @param index   the index of the tag to check
         * @param htmlId  the id of the tag to check
         * @param parent  the parent of the tag to check
         *
         * @return true, if all elements of the specification path match; false else
         *
         * @throws IllegalArgumentException if the specification path is longer than the
         *                                  hierarchy of page elements and documents above the
         *                                  tag
         */
        private boolean matches(String         tagName,
                                int            index,
                                String         htmlId,
                                HTMLGUIElement parent)
        {
            String currentTagName = tagName;
            int currentIndex = index;
            String currentHtmlId = htmlId;
            String currentPath = null;
            HTMLGUIElement currentParent = parent;

            for (int i = specs.size() - 1; i >= 0; i--) {
                Spec spec = specs.get(i);

                if ((spec instanceof TagSpec) &&
                    (!((TagSpec) spec).matches(currentTagName, currentIndex, currentHtmlId)))
                {
                    return false;
                }
                else if ((spec instanceof DocumentSpec) &&
                         (!((DocumentSpec) spec).matches(currentPath)))
                {
                    return false;
                }

                if (i > 0) {
                    // there is a further specification, so move one level up in the hierarchy
                    if (currentParent instanceof HTMLPageElement) {
                        currentTagName = ((HTMLPageElement) currentParent).getTagName();
                        currentIndex = ((HTMLPageElement) currentParent).getIndex();
                        currentHtmlId = ((HTMLPageElement) currentParent).getHtmlId();
                        currentPath = null;
                        currentParent = (HTMLGUIElement) currentParent.getParent();
                    }
                    else if (currentParent instanceof HTMLDocument) {
                        currentTagName = null;
                        currentIndex = Integer.MIN_VALUE;
                        currentHtmlId = null;
                        currentPath = ((HTMLDocument) currentParent).getPath();
                        currentParent = (HTMLGUIElement) currentParent.getParent();
                    }
                    else {
                        throw new IllegalArgumentException
                            ("specification matches documents or servers. This is not " +
                             "supported yet.");
                    }
                }
            }

            return true;
        }

        /**
         * returns the specified replacement
         *
         * @return the replacement as read from the config file
         */
        private String getReplacement() {
            return replacement;
        }

        /**
         * returns the name of the last tag specified in the specification path
         *
         * @return the name of the last tag
         */
        private String getLastTagName() {
            return lastTagName;
        }

        /**
         * returns the specification in the form path=replacement, where the elements of the
         * path are separated by slashes
         *
         * @see java.lang.Object#toString()
         */
        @Override
        public String toString() {
            StringBuilder result = new StringBuilder();
            for (Spec spec : specs) {
                if (result.length() > 0) {
                    result.append("/");
                }
                result.append(spec);
            }

            result.append('=');
            result.append(replacement);

            return result.toString();
        }

    }

    /**
     *

* parent type for document and tag specifications. It declares no methods and only serves
     * as common type for the elements of a specification path.
     */
    private interface Spec {
    }

    /**
     *

* specification of a document
     */
    private static class DocumentSpec implements Spec {

        /**
         * the part of the path the document path must have to match this specification
         */
        private final String pathPart;

        /**
         * initializes the document specification with the path part
         *
         * @param pathPart the part that must be contained in the path of a matching document
         */
        private DocumentSpec(String pathPart) {
            this.pathPart = pathPart;
        }

        /**
         * returns true if the provided path contains the path part provided to the parameter
         *
         * @param path the path of the document to check; null if the element compared with
         *             this specification is not a document
         *
         * @return true, if the path is not null and contains the path part; false else
         */
        private boolean matches(String path) {
            // a null path means that the compared element is no document, which is a mismatch
            // and must not cause a NullPointerException
            return (path != null) && path.contains(pathPart);
        }

        /**
         * returns the specification in the form used in the config file
         *
         * @see java.lang.Object#toString()
         */
        @Override
        public String toString() {
            return "document(path=" + pathPart + ")";
        }
    }

    /**
     *

* specification for a tag containing a tag name and either an index or id condition.
     */
    private static class TagSpec implements Spec {

        /**
         * the name of the tag to match
         */
        private String tagName;

        /**
         * the index of the tag to match; null if the index is not checked
         */
        private Integer indexCondition;

        /**
         * the id of the tag to match; null if the id is not checked
         */
        private String idCondition;

        /**
         * initializes the specification with all required parameters
         *
         * @param tagName        the name of the tag to match
         * @param indexCondition the index of the tag to match or null
         * @param idCondition    the id of the tag to match or null
         */
        private TagSpec(String tagName, Integer indexCondition, String idCondition) {
            this.tagName = tagName;
            this.indexCondition = indexCondition;
            this.idCondition = idCondition;
        }

        /**
         * returns true if the provided tag information matches this specification. The tag
         * name is compared first, then the id and finally the index. If the id condition has a
         * # at some position, the respective character of the provided id is ignored.
         *
         * @param tagName the name of the tag to check
         * @param index   the index of the tag to check
         * @param htmlId  the id of the tag to check
         *
         * @return true, if the tag matches; false else
         */
        private boolean matches(String tagName, int index, String htmlId) {
            if (!this.tagName.equals(tagName)) {
                return false;
            }

            if ((idCondition != null) && (!idConditionMatches(htmlId))) {
                return false;
            }

            return (indexCondition == null) || (index == indexCondition.intValue());
        }

        /**
         * compares the provided id with the id condition, which must not be null. Each # in
         * the condition stands for exactly one arbitrary character.
         *
         * @param htmlId the id to check
         *
         * @return true, if the id fulfills the id condition; false else
         */
        private boolean idConditionMatches(String htmlId) {
            if (idCondition.equals(htmlId)) {
                return true;
            }

            boolean hasWildcard = idCondition.indexOf('#') > -1;

            if ((htmlId == null) || (!hasWildcard)) {
                // no condition ignoring specific characters
                return false;
            }

            // first of all, the length must match
            if (idCondition.length() != htmlId.length()) {
                return false;
            }

            for (int pos = 0; pos < idCondition.length(); pos++) {
                char expected = idCondition.charAt(pos);

                if ((expected != '#') && (expected != htmlId.charAt(pos))) {
                    // a character that is neither ignored nor equal means "no match"
                    return false;
                }
            }

            return true;
        }

        /**
         * returns the name of the tags matched by this specification
         *
         * @return the tag name
         */
        private String getTagName() {
            return tagName;
        }

        /**
         * returns the specification in the form used in the config file. The id condition
         * takes precedence over the index condition.
         *
         * @see java.lang.Object#toString()
         */
        @Override
        public String toString() {
            StringBuilder text = new StringBuilder(tagName);

            if (idCondition != null) {
                text.append("(htmlId=").append(idCondition).append(')');
            }
            else if (indexCondition != null) {
                text.append('[').append(indexCondition).append(']');
            }

            return text.toString();
        }

    }
}