// Copyright 2012 Georg-August-Universität Göttingen, Germany // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package de.ugoe.cs.autoquest.usageprofiles; import java.io.Serializable; import java.util.Collection; import java.util.HashSet; import java.util.LinkedHashSet; import java.util.LinkedList; import java.util.List; import de.ugoe.cs.util.StringTools; import edu.uci.ics.jung.graph.DelegateTree; import edu.uci.ics.jung.graph.Graph; import edu.uci.ics.jung.graph.Tree; /** *

* This class implements a trie, i.e., a tree of sequences that the occurence of * subsequences up to a predefined length. This length is the trie order. *

* * @author Steffen Herbold, Patrick Harms * * @param * Type of the symbols that are stored in the trie. * * @see TrieNode */ public class Trie implements IDotCompatible, Serializable { /** *

* Id for object serialization. *

*/ private static final long serialVersionUID = 1L; /** *

* Collection of all symbols occuring in the trie. *

*/ private Collection knownSymbols; /** *

* Reference to the root of the trie. *

*/ private final TrieNode rootNode; /** *

* Comparator to be used for comparing the symbols with each other *

*/ private transient SymbolComparator comparator; /** *

* Contructor. Creates a new Trie with a {@link DefaultSymbolComparator}. *

*/ public Trie() { this(new DefaultSymbolComparator()); } /** *

* Contructor. Creates a new Trie with that uses a specific {@link SymbolComparator}. *

*/ public Trie(SymbolComparator comparator) { this.comparator = comparator; rootNode = new TrieNode(comparator); knownSymbols = new LinkedHashSet(); } /** *

* Copy-Constructor. Creates a new Trie as the copy of other. The other trie must not be null. *

* * @param other * Trie that is copied */ public Trie(Trie other) { if (other == null) { throw new IllegalArgumentException("other trie must not be null"); } rootNode = new TrieNode(other.rootNode); knownSymbols = new LinkedHashSet(other.knownSymbols); comparator = other.comparator; } /** *

* Returns a collection of all symbols occuring in the trie. *

* * @return symbols occuring in the trie */ public Collection getKnownSymbols() { return new LinkedHashSet(knownSymbols); } /** *

* Trains the current trie using the given sequence and adds all subsequences of length * {@code maxOrder}. *

* * @param sequence * sequence whose subsequences are added to the trie * @param maxOrder * maximum length of the subsequences added to the trie */ public void train(List sequence, int maxOrder) { if (maxOrder < 1) { return; } IncompleteMemory latestActions = new IncompleteMemory(maxOrder); int i = 0; for (T currentEvent : sequence) { latestActions.add(currentEvent); addToKnownSymbols(currentEvent); i++; if (i >= maxOrder) { add(latestActions.getLast(maxOrder)); } } int sequenceLength = sequence.size(); int startIndex = Math.max(0, sequenceLength - maxOrder + 1); for (int j = startIndex; j < sequenceLength; j++) { add(sequence.subList(j, sequenceLength)); } } /** *

* Adds a given subsequence to the trie and increases the counters accordingly. *

* * @param subsequence * subsequence whose counters are increased * @see TrieNode#add(List) */ protected void add(List subsequence) { if (subsequence != null && !subsequence.isEmpty()) { addToKnownSymbols(subsequence); subsequence = new LinkedList(subsequence); // defensive copy! T firstSymbol = subsequence.get(0); TrieNode node = getChildCreate(firstSymbol); node.add(subsequence); } } /** *

* Returns the child of the root node associated with the given symbol or creates it if it does * not exist yet. *

* * @param symbol * symbol whose node is required * @return node associated with the symbol * @see TrieNode#getChildCreate(Object) */ protected TrieNode getChildCreate(T symbol) { return rootNode.getChildCreate(symbol); } /** *

* Returns the child of the root node associated with the given symbol or null if it does not * exist. *

* * @param symbol * symbol whose node is required * @return node associated with the symbol; null if no such node exists * @see TrieNode#getChild(Object) */ protected TrieNode getChild(T symbol) { return rootNode.getChild(symbol); } /** *

* Returns the number of occurences of the given sequence. *

* * @param sequence * sequence whose number of occurences is required * @return number of occurences of the sequence */ public int getCount(List sequence) { int count = 0; TrieNode node = find(sequence); if (node != null) { count = node.getCount(); } return count; } /** *

* Returns the number of occurences of the given prefix and a symbol that follows it.
* Convenience function to simplify usage of {@link #getCount(List)}. *

* * @param sequence * prefix of the sequence * @param follower * suffix of the sequence * @return number of occurences of the sequence * @see #getCount(List) */ public int getCount(List sequence, T follower) { List tmpSequence = new LinkedList(sequence); tmpSequence.add(follower); return getCount(tmpSequence); } /** *

* Searches the trie for a given sequence and returns the node associated with the sequence or * null if no such node is found. *

* * @param sequence * sequence that is searched for * @return node associated with the sequence * @see TrieNode#find(List) */ public TrieNode find(List sequence) { if (sequence == null || sequence.isEmpty()) { return rootNode; } List sequenceCopy = new LinkedList(sequence); TrieNode result = null; TrieNode node = getChild(sequenceCopy.get(0)); if (node != null) { sequenceCopy.remove(0); result = node.find(sequenceCopy); } return result; } /** *

* Returns a collection of all symbols that follow a given sequence in the trie. In case the * sequence is not found or no symbols follow the sequence the result will be empty. *

* * @param sequence * sequence whose followers are returned * @return symbols following the given sequence * @see TrieNode#getFollowingSymbols() */ public Collection getFollowingSymbols(List sequence) { Collection result = new LinkedList(); TrieNode node = find(sequence); if (node != null) { result = node.getFollowingSymbols(); } return result; } /** *

* Returns the longest suffix of the given context that is contained in the tree and whose * children are leaves. *

* * @param context * context whose suffix is searched for * @return longest suffix of the context */ public List getContextSuffix(List context) { List contextSuffix; if (context != null) { contextSuffix = new LinkedList(context); // defensive copy } else { contextSuffix = new LinkedList(); } boolean suffixFound = false; while (!suffixFound) { if (contextSuffix.isEmpty()) { suffixFound = true; // suffix is the empty word } else { TrieNode node = find(contextSuffix); if (node != null) { if (!node.getFollowingSymbols().isEmpty()) { suffixFound = true; } } if (!suffixFound) { contextSuffix.remove(0); } } } return contextSuffix; } /** *

* used to recursively process the trie. The provided processor will be called for any path * through the tree. The processor may abort the processing through returns values of its * {@link TrieProcessor#process(List, int)} method. *

* * @param processor the processor to process the tree */ public void process(TrieProcessor processor) { LinkedList context = new LinkedList(); for (TrieNode child : rootNode.getChildren()) { if (!process(context, child, processor)) { break; } } } /** *

* processes a specific path by calling the provided processor. Furthermore, the method * calls itself recursively for further subpaths. *

* * @param context the context of the currently processed trie node, i.e. the preceeding * symbols * @param child the processed trie node * @param processor the processor used for processing the trie * * @return true, if processing shall continue, false else */ private boolean process(LinkedList context, TrieNode node, TrieProcessor processor) { context.add(node.getSymbol()); TrieProcessor.Result result = processor.process(context, node.getCount()); if (result == TrieProcessor.Result.CONTINUE) { for (TrieNode child : node.getChildren()) { if (!process(context, child, processor)) { break; } } } context.removeLast(); return result != TrieProcessor.Result.BREAK; } /** *

* returns a list of symbol sequences which have a minimal length and that occurred as often * as defined by the given occurrence count. If the given occurrence count is smaller 1 then * those sequences are returned, that occur most often. The resulting list is empty, if there * is no symbol sequence with the minimal length or the provided number of occurrences. *

* * @param minimalLength the minimal length of the returned sequences * @param occurrenceCount the number of occurrences of the returned sequences * * @return as described */ public Collection> getSequencesWithOccurrenceCount(int minimalLength, int occurrenceCount) { LinkedList> context = new LinkedList>(); Collection>> paths = new LinkedList>>(); context.push(rootNode); // traverse the trie and determine all sequences, which have the provided number of // occurrences and a minimal length. // minimalLength + 1 because we denote the depth including the root node determineLongPathsWithMostOccurrences(minimalLength + 1, occurrenceCount, paths, context); Collection> resultingPaths = new LinkedList>(); List resultingPath; if (paths.size() > 0) { for (List> path : paths) { resultingPath = new LinkedList(); for (TrieNode node : path) { if (node.getSymbol() != null) { resultingPath.add(node.getSymbol()); } } resultingPaths.add(resultingPath); } } return resultingPaths; } /** *

* Traverses the trie to collect all sequences with a defined number of occurrences and with * a minimal length. If the given occurrence count is smaller 1 then those sequences are * searched that occur most often. The length of the sequences is encoded in the provided * recursion depth. *

* * @param minimalDepth the minimal recursion depth to be done * @param occurrenceCount the number of occurrences of the returned sequences * @param paths the paths through the trie that all occurred with the same amount * (if occurrence count is smaller 1, the paths which occurred most * often) and that have the so far found matching number of occurrences * (is updated each time a further path with the same number of * occurrences is found; if occurrence count is smaller 1 * it is replaced if a path with more occurrences is found) * @param context the path through the trie, that is analyzed by the recursive call */ private void determineLongPathsWithMostOccurrences(int minimalDepth, int occurrenceCount, Collection>> paths, LinkedList> context) { int envisagedCount = occurrenceCount; // only if we already reached the depth to be achieved, we check if the paths have the // required number of occurrences if (context.size() >= minimalDepth) { if (envisagedCount < 1) { // try to determine the maximum number of occurrences so far, if any if (paths.size() > 0) { List> path = paths.iterator().next(); envisagedCount = path.get(path.size() - 1).getCount(); } // if the current path has a higher number of occurrences than all so far, clear // the paths collected so far and set the new number of occurrences as new maximum if (context.getLast().getCount() > envisagedCount) { paths.clear(); envisagedCount = context.getLast().getCount(); } } // if the path matches the current maximal number of occurrences, add it to the list // of collected paths with these number of occurrences if (context.getLast().getCount() == envisagedCount) { paths.add(new LinkedList>(context)); } } // perform the trie traversal for (TrieNode child : context.getLast().getChildren()) { if (child.getCount() >= envisagedCount) { context.add(child); determineLongPathsWithMostOccurrences (minimalDepth, occurrenceCount, paths, context); context.removeLast(); } } } /** *

* adds a new symbol to the collection of known symbols if this symbol is not already * contained. The symbols are compared using the comparator. *

* * @param symbol the symbol to be added to the known symbols */ private void addToKnownSymbols(T symbol) { for (T knownSymbol : knownSymbols) { if (comparator.equals(knownSymbol, symbol)) { return; } } knownSymbols.add(symbol); } /** *

* adds a list of new symbols to the collection of known symbols. Uses the * {@link #addToKnownSymbols(Object)} method for each element of the provided list. *

* * @param symbols the list of symbols to be added to the known symbols */ private void addToKnownSymbols(List symbols) { for (T symbol : symbols) { addToKnownSymbols(symbol); } } /** *

* Helper class for graph visualization of a trie. *

* * @author Steffen Herbold * @version 1.0 */ static public class Edge {} /** *

* Helper class for graph visualization of a trie. *

* * @author Steffen Herbold * @version 1.0 */ static public class TrieVertex { /** *

* Id of the vertex. *

*/ private String id; /** *

* Contructor. Creates a new TrieVertex. *

* * @param id * id of the vertex */ protected TrieVertex(String id) { this.id = id; } /** *

* Returns the id of the vertex. *

* * @see java.lang.Object#toString() */ @Override public String toString() { return id; } } /** *

* Returns a {@link Graph} representation of the trie. *

* * @return {@link Graph} representation of the trie */ protected Tree getGraph() { DelegateTree graph = new DelegateTree(); rootNode.getGraph(null, graph); return graph; } /* * (non-Javadoc) * * @see de.ugoe.cs.autoquest.usageprofiles.IDotCompatible#getDotRepresentation() */ public String getDotRepresentation() { StringBuilder stringBuilder = new StringBuilder(); stringBuilder.append("digraph model {" + StringTools.ENDLINE); rootNode.appendDotRepresentation(stringBuilder); stringBuilder.append('}' + StringTools.ENDLINE); return stringBuilder.toString(); } /** *

* Returns the string representation of the root node. *

* * @see TrieNode#toString() * @see java.lang.Object#toString() */ @Override public String toString() { return rootNode.toString(); } /** *

* Returns the number of symbols contained in the trie. *

* * @return number of symbols contained in the trie */ public int getNumSymbols() { return knownSymbols.size(); } /** *

* Returns the number of trie nodes that are ancestors of a leaf. This is the equivalent to the * number of states a first-order markov model would have. *

* * @return number of trie nodes that are ancestors of leafs. */ public int getNumLeafAncestors() { List> ancestors = new LinkedList>(); rootNode.getLeafAncestors(ancestors); return ancestors.size(); } /** *

* Returns the number of trie nodes that are leafs. *

* * @return number of leafs in the trie */ public int getNumLeafs() { return rootNode.getNumLeafs(); } /** *

* Updates the list of known symbols by replacing it with all symbols that are found in the * child nodes of the root node. This should be the same as all symbols that are contained in * the trie. *

*/ public void updateKnownSymbols() { knownSymbols = new HashSet(); for (TrieNode node : rootNode.getChildren()) { addToKnownSymbols(node.getSymbol()); } } /** *

* Two Tries are defined as equal, if their {@link #rootNode} are equal. *

* * @see java.lang.Object#equals(java.lang.Object) */ @SuppressWarnings("rawtypes") @Override public boolean equals(Object other) { if (other == this) { return true; } if (other instanceof Trie) { return rootNode.equals(((Trie) other).rootNode); } return false; } /* * (non-Javadoc) * * @see java.lang.Object#hashCode() */ @Override public int hashCode() { int multiplier = 17; int hash = 42; if (rootNode != null) { hash = multiplier * hash + rootNode.hashCode(); } return hash; } }