// Copyright 2012 Georg-August-Universität Göttingen, Germany // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package de.ugoe.cs.autoquest.usageprofiles; import java.io.Serializable; import java.util.Collection; import java.util.LinkedHashSet; import java.util.LinkedList; import java.util.List; import de.ugoe.cs.util.StringTools; import edu.uci.ics.jung.graph.DelegateTree; import edu.uci.ics.jung.graph.Graph; import edu.uci.ics.jung.graph.Tree; /** *
* This class implements a trie, i.e., a tree of sequences that represents the occurrence * of subsequences up to a predefined length. This length is the trie order. *
* * @author Steffen Herbold, Patrick Harms * * @param* Id for object serialization. *
*/ private static final long serialVersionUID = 1L; /** ** Collection of all symbols occurring in the trie. *
*/ private SymbolMap* Reference to the root of the trie. *
*/ private final TrieNode* Strategy for handling symbols, i.e., for comparing and storing them *
*/ private SymbolStrategy* Constructor. Creates a new trie with a {@link DefaultSymbolStrategy}. *
*/ public Trie() { this(new DefaultSymbolStrategy* Constructor. Creates a new trie that uses a specific {@link SymbolStrategy}. *
* * @param strategy * strategy to be used for managing symbols * * @throws IllegalArgumentException * if the strategy is null */ public Trie(SymbolStrategy* Copy-Constructor. Creates a new trie as the copy of other. The other trie must not be null. *
* * @param other * trie that is copied * * @throws IllegalArgumentException * if the other trie is null */ public Trie(Trie* Returns a collection of all symbols occurring in the trie. *
* * @return symbols occurring in the trie */ public Collection* Trains the current trie using the given sequence and adds all subsequences of length * {@code maxOrder}. *
* * @param sequence * sequence whose subsequences are added to the trie * @param maxOrder * maximum length of the subsequences added to the trie */ public void train(List* Adds a given subsequence to the trie and increases the counters accordingly. NOTE: This * method does not add the symbols to the list of known symbols. This is only ensured using * the method {@link #train(List, int)}. *
* * @param subsequence * subsequence whose counters are increased * @see TrieNode#add(List) */ protected void add(List* Returns the child of the root node associated with the given symbol or creates it if it does * not exist yet. *
* * @param symbol * symbol whose node is required * @return node associated with the symbol * @see TrieNode#getChildCreate(Object) */ protected TrieNode* Returns the child of the root node associated with the given symbol or null if it does not * exist. *
* * @param symbol * symbol whose node is required * @return node associated with the symbol; null if no such node exists * @see TrieNode#getChild(Object) */ protected TrieNode* Returns the number of occurrences of the given sequence. *
* * @param sequence * sequence whose number of occurrences is required * @return number of occurrences of the sequence */ public int getCount(List
* Returns the number of occurrences of the given prefix and a symbol that follows it.
* Convenience function to simplify usage of {@link #getCount(List)}.
*
* Searches the trie for a given sequence and returns the node associated with the sequence or * null if no such node is found. *
* * @param sequence * sequence that is searched for * @return node associated with the sequence * @see TrieNode#find(List) */ public TrieNode* Returns a collection of all symbols that follow a given sequence in the trie. In case the * sequence is not found or no symbols follow the sequence the result will be empty. *
* * @param sequence * sequence whose followers are returned * @return symbols following the given sequence * @see TrieNode#getFollowingSymbols() */ public Collection* Returns the longest suffix of the given context that is contained in the tree and whose * children are leaves. *
* * @param context * context whose suffix is searched for * @return longest suffix of the context */ public List* used to recursively process the trie. The provided processor will be called for any path * through the trie. The processor may abort the processing through return values of its * {@link TrieProcessor#process(List, int)} method. *
* * @param processor the processor to process the tree */ public void process(TrieProcessor* processes a specific path by calling the provided processor. Furthermore, the method * calls itself recursively for further subpaths. *
* * @param context the context of the currently processed trie node, i.e. the preceding * symbols * @param child the processed trie node * @param processor the processor used for processing the trie * * @return true, if processing shall continue, false else */ private boolean process(LinkedList* returns a list of symbol sequences which have a minimal length and that occurred as often * as defined by the given occurrence count. If the given occurrence count is smaller 1 then * those sequences are returned, that occur most often. The resulting list is empty, if there * is no symbol sequence with the minimal length or the provided number of occurrences. *
* * @param minimalLength the minimal length of the returned sequences * @param occurrenceCount the number of occurrences of the returned sequences * * @return as described */ public Collection* Traverses the trie to collect all sequences with a defined number of occurrences and with * a minimal length. If the given occurrence count is smaller 1 then those sequences are * searched that occur most often. The length of the sequences is encoded in the provided * recursion depth. *
* * @param minimalDepth the minimal recursion depth to be done * @param occurrenceCount the number of occurrences of the returned sequences * @param paths the paths through the trie that all occurred with the same amount * (if occurrence count is smaller 1, the paths which occurred most * often) and that have the so far found matching number of occurrences * (is updated each time a further path with the same number of * occurrences is found; if occurrence count is smaller 1 * it is replaced if a path with more occurrences is found) * @param context the path through the trie, that is analyzed by the recursive call */ private void determineLongPathsWithMostOccurrences(int minimalDepth, int occurrenceCount, Collection* adds a new symbol to the collection of known symbols if this symbol is not already * contained. *
* * @param symbol the symbol to be added to the known symbols */ private void addToKnownSymbols(T symbol) { if (!knownSymbols.containsSymbol(symbol)) { knownSymbols.addSymbol(symbol, null); } } /** ** Helper class for graph visualization of a trie. *
* * @author Steffen Herbold * @version 1.0 */ static public class Edge {} /** ** Helper class for graph visualization of a trie. *
* * @author Steffen Herbold * @version 1.0 */ static public class TrieVertex { /** ** Id of the vertex. *
*/ private String id; /** ** Constructor. Creates a new TrieVertex. *
* * @param id * id of the vertex */ protected TrieVertex(String id) { this.id = id; } /** ** Returns the id of the vertex. *
* * @see java.lang.Object#toString() */ @Override public String toString() { return id; } } /** ** Returns a {@link Graph} representation of the trie. *
* * @return {@link Graph} representation of the trie */ protected Tree* Returns the string representation of the root node. *
* * @see TrieNode#toString() * @see java.lang.Object#toString() */ @Override public String toString() { return rootNode.toString(); } /** ** Returns the number of symbols contained in the trie. *
* * @return number of symbols contained in the trie */ public int getNumSymbols() { return knownSymbols.size(); } /** ** Returns the number of trie nodes that are ancestors of a leaf. This is the equivalent to the * number of states a first-order markov model would have. *
*
* @return number of trie nodes that are ancestors of leafs.
*/
public int getNumLeafAncestors() {
List
* Returns the number of trie nodes that are leafs.
*
* Updates the list of known symbols by replacing it with all symbols that are found in the
* child nodes of the root node. This should be the same as all symbols that are contained in
* the trie.
*
* Two tries are defined as equal, if their {@link #rootNode}s are equal.
*