source: trunk/quest-core-usageprofiles/src/main/java/de/ugoe/cs/quest/usageprofiles/Trie.java @ 766

Last change on this file since 766 was 766, checked in by sherbold, 12 years ago
  • Property svn:mime-type set to text/plain
File size: 12.7 KB
Line 
1package de.ugoe.cs.quest.usageprofiles;
2
3import java.io.Serializable;
4import java.util.Collection;
5import java.util.HashSet;
6import java.util.LinkedHashSet;
7import java.util.LinkedList;
8import java.util.List;
9
10import de.ugoe.cs.util.StringTools;
11
12import edu.uci.ics.jung.graph.DelegateTree;
13import edu.uci.ics.jung.graph.Graph;
14import edu.uci.ics.jung.graph.Tree;
15
16/**
17 * <p>
18 * This class implements a <it>trie</it>, i.e., a tree of sequences that the occurence of
19 * subsequences up to a predefined length. This length is the trie order.
20 * </p>
21 *
22 * @author Steffen Herbold
23 *
24 * @param <T>
25 *            Type of the symbols that are stored in the trie.
26 *
27 * @see TrieNode
28 */
29public class Trie<T> implements IDotCompatible, Serializable {
30
31    /**
32     * <p>
33     * Id for object serialization.
34     * </p>
35     */
36    private static final long serialVersionUID = 1L;
37
38    /**
39     * <p>
40     * Collection of all symbols occuring in the trie.
41     * </p>
42     */
43    private Collection<T> knownSymbols;
44
45    /**
46     * <p>
47     * Reference to the root of the trie.
48     * </p>
49     */
50    private final TrieNode<T> rootNode;
51
52    /**
53     * <p>
54     * Contructor. Creates a new Trie.
55     * </p>
56     */
57    public Trie() {
58        rootNode = new TrieNode<T>();
59        knownSymbols = new LinkedHashSet<T>();
60    }
61
62    /**
63     * <p>
64     * Copy-Constructor. Creates a new Trie as the copy of other. The other trie must noch be null.
65     * </p>
66     *
67     * @param other
68     *            Trie that is copied
69     */
70    public Trie(Trie<T> other) {
71        if (other == null) {
72            throw new IllegalArgumentException("other trie must not be null");
73        }
74        rootNode = new TrieNode<T>(other.rootNode);
75        knownSymbols = new LinkedHashSet<T>(other.knownSymbols);
76    }
77
78    /**
79     * <p>
80     * Returns a collection of all symbols occuring in the trie.
81     * </p>
82     *
83     * @return symbols occuring in the trie
84     */
85    public Collection<T> getKnownSymbols() {
86        return new LinkedHashSet<T>(knownSymbols);
87    }
88
89    /**
90     * <p>
91     * Trains the current trie using the given sequence and adds all subsequence of length
92     * {@code maxOrder}.
93     * </p>
94     *
95     * @param sequence
96     *            sequence whose subsequences are added to the trie
97     * @param maxOrder
98     *            maximum length of the subsequences added to the trie
99     */
100    public void train(List<T> sequence, int maxOrder) {
101        if (maxOrder < 1) {
102            return;
103        }
104        IncompleteMemory<T> latestActions = new IncompleteMemory<T>(maxOrder);
105        int i = 0;
106        for (T currentEvent : sequence) {
107            latestActions.add(currentEvent);
108            knownSymbols.add(currentEvent);
109            i++;
110            if (i >= maxOrder) {
111                add(latestActions.getLast(maxOrder));
112            }
113        }
114        int sequenceLength = sequence.size();
115        for (int j = maxOrder - 1; j > 0; j--) {
116            add(sequence.subList(sequenceLength - j, sequenceLength));
117        }
118    }
119
120    /**
121     * <p>
122     * Adds a given subsequence to the trie and increases the counters accordingly.
123     * </p>
124     *
125     * @param subsequence
126     *            subsequence whose counters are increased
127     * @see TrieNode#add(List)
128     */
129    protected void add(List<T> subsequence) {
130        if (subsequence != null && !subsequence.isEmpty()) {
131            knownSymbols.addAll(subsequence);
132            subsequence = new LinkedList<T>(subsequence); // defensive copy!
133            T firstSymbol = subsequence.get(0);
134            TrieNode<T> node = getChildCreate(firstSymbol);
135            node.add(subsequence);
136        }
137    }
138
139    /**
140     * <p>
141     * Returns the child of the root node associated with the given symbol or creates it if it does
142     * not exist yet.
143     * </p>
144     *
145     * @param symbol
146     *            symbol whose node is required
147     * @return node associated with the symbol
148     * @see TrieNode#getChildCreate(Object)
149     */
150    protected TrieNode<T> getChildCreate(T symbol) {
151        return rootNode.getChildCreate(symbol);
152    }
153
154    /**
155     * <p>
156     * Returns the child of the root node associated with the given symbol or null if it does not
157     * exist.
158     * </p>
159     *
160     * @param symbol
161     *            symbol whose node is required
162     * @return node associated with the symbol; null if no such node exists
163     * @see TrieNode#getChild(Object)
164     */
165    protected TrieNode<T> getChild(T symbol) {
166        return rootNode.getChild(symbol);
167    }
168
169    /**
170     * <p>
171     * Returns the number of occurences of the given sequence.
172     * </p>
173     *
174     * @param sequence
175     *            sequence whose number of occurences is required
176     * @return number of occurences of the sequence
177     */
178    public int getCount(List<T> sequence) {
179        int count = 0;
180        TrieNode<T> node = find(sequence);
181        if (node != null) {
182            count = node.getCount();
183        }
184        return count;
185    }
186
187    /**
188     * <p>
189     * Returns the number of occurences of the given prefix and a symbol that follows it.<br>
190     * Convenience function to simplify usage of {@link #getCount(List)}.
191     * </p>
192     *
193     * @param sequence
194     *            prefix of the sequence
195     * @param follower
196     *            suffix of the sequence
197     * @return number of occurences of the sequence
198     * @see #getCount(List)
199     */
200    public int getCount(List<T> sequence, T follower) {
201        List<T> tmpSequence = new LinkedList<T>(sequence);
202        tmpSequence.add(follower);
203        return getCount(tmpSequence);
204
205    }
206
207    /**
208     * <p>
209     * Searches the trie for a given sequence and returns the node associated with the sequence or
210     * null if no such node is found.
211     * </p>
212     *
213     * @param sequence
214     *            sequence that is searched for
215     * @return node associated with the sequence
216     * @see TrieNode#find(List)
217     */
218    public TrieNode<T> find(List<T> sequence) {
219        if (sequence == null || sequence.isEmpty()) {
220            return rootNode;
221        }
222        List<T> sequenceCopy = new LinkedList<T>(sequence);
223        TrieNode<T> result = null;
224        TrieNode<T> node = getChild(sequenceCopy.get(0));
225        if (node != null) {
226            sequenceCopy.remove(0);
227            result = node.find(sequenceCopy);
228        }
229        return result;
230    }
231
232    /**
233     * <p>
234     * Returns a collection of all symbols that follow a given sequence in the trie. In case the
235     * sequence is not found or no symbols follow the sequence the result will be empty.
236     * </p>
237     *
238     * @param sequence
239     *            sequence whose followers are returned
240     * @return symbols following the given sequence
241     * @see TrieNode#getFollowingSymbols()
242     */
243    public Collection<T> getFollowingSymbols(List<T> sequence) {
244        Collection<T> result = new LinkedList<T>();
245        TrieNode<T> node = find(sequence);
246        if (node != null) {
247            result = node.getFollowingSymbols();
248        }
249        return result;
250    }
251
252    /**
253     * <p>
254     * Returns the longest suffix of the given context that is contained in the tree and whose
255     * children are leaves.
256     * </p>
257     *
258     * @param context
259     *            context whose suffix is searched for
260     * @return longest suffix of the context
261     */
262    public List<T> getContextSuffix(List<T> context) {
263        List<T> contextSuffix;
264        if (context != null) {
265            contextSuffix = new LinkedList<T>(context); // defensive copy
266        }
267        else {
268            contextSuffix = new LinkedList<T>();
269        }
270        boolean suffixFound = false;
271
272        while (!suffixFound) {
273            if (contextSuffix.isEmpty()) {
274                suffixFound = true; // suffix is the empty word
275            }
276            else {
277                TrieNode<T> node = find(contextSuffix);
278                if (node != null) {
279                    if (!node.getFollowingSymbols().isEmpty()) {
280                        suffixFound = true;
281                    }
282                }
283                if (!suffixFound) {
284                    contextSuffix.remove(0);
285                }
286            }
287        }
288
289        return contextSuffix;
290    }
291
292    /**
293     * <p>
294     * Helper class for graph visualization of a trie.
295     * </p>
296     *
297     * @author Steffen Herbold
298     * @version 1.0
299     */
300    static public class Edge {}
301
302    /**
303     * <p>
304     * Helper class for graph visualization of a trie.
305     * </p>
306     *
307     * @author Steffen Herbold
308     * @version 1.0
309     */
310    static public class TrieVertex {
311
312        /**
313         * <p>
314         * Id of the vertex.
315         * </p>
316         */
317        private String id;
318
319        /**
320         * <p>
321         * Contructor. Creates a new TrieVertex.
322         * </p>
323         *
324         * @param id
325         *            id of the vertex
326         */
327        protected TrieVertex(String id) {
328            this.id = id;
329        }
330
331        /**
332         * <p>
333         * Returns the id of the vertex.
334         * </p>
335         *
336         * @see java.lang.Object#toString()
337         */
338        @Override
339        public String toString() {
340            return id;
341        }
342    }
343
344    /**
345     * <p>
346     * Returns a {@link Graph} representation of the trie.
347     * </p>
348     *
349     * @return {@link Graph} representation of the trie
350     */
351    protected Tree<TrieVertex, Edge> getGraph() {
352        DelegateTree<TrieVertex, Edge> graph = new DelegateTree<TrieVertex, Edge>();
353        rootNode.getGraph(null, graph);
354        return graph;
355    }
356
357    /*
358     * (non-Javadoc)
359     *
360     * @see de.ugoe.cs.quest.usageprofiles.IDotCompatible#getDotRepresentation()
361     */
362    public String getDotRepresentation() {
363        StringBuilder stringBuilder = new StringBuilder();
364        stringBuilder.append("digraph model {" + StringTools.ENDLINE);
365        rootNode.appendDotRepresentation(stringBuilder);
366        stringBuilder.append('}' + StringTools.ENDLINE);
367        return stringBuilder.toString();
368    }
369
370    /**
371     * <p>
372     * Returns the string representation of the root node.
373     * </p>
374     *
375     * @see TrieNode#toString()
376     * @see java.lang.Object#toString()
377     */
378    @Override
379    public String toString() {
380        return rootNode.toString();
381    }
382
383    /**
384     * <p>
385     * Returns the number of symbols contained in the trie.
386     * </p>
387     *
388     * @return number of symbols contained in the trie
389     */
390    public int getNumSymbols() {
391        return knownSymbols.size();
392    }
393
394    /**
395     * <p>
396     * Returns the number of trie nodes that are ancestors of a leaf. This is the equivalent to the
397     * number of states a first-order markov model would have.
398     * <p>
399     *
400     * @return number of trie nodes that are ancestors of leafs.
401     */
402    public int getNumLeafAncestors() {
403        List<TrieNode<T>> ancestors = new LinkedList<TrieNode<T>>();
404        rootNode.getLeafAncestors(ancestors);
405        return ancestors.size();
406    }
407
408    /**
409     * <p>
410     * Returns the number of trie nodes that are leafs.
411     * </p>
412     *
413     * @return number of leafs in the trie
414     */
415    public int getNumLeafs() {
416        return rootNode.getNumLeafs();
417    }
418
419    /**
420     * <p>
421     * Updates the list of known symbols by replacing it with all symbols that are found in the
422     * child nodes of the root node. This should be the same as all symbols that are contained in
423     * the trie.
424     * </p>
425     */
426    public void updateKnownSymbols() {
427        knownSymbols = new HashSet<T>();
428        for (TrieNode<T> node : rootNode.getChildren()) {
429            knownSymbols.add(node.getSymbol());
430        }
431    }
432
433    /**
434     * <p>
435     * Two Tries are defined as equal, if their {@link #rootNode} are equal.
436     * </p>
437     *
438     * @see java.lang.Object#equals(java.lang.Object)
439     */
440    @SuppressWarnings("rawtypes")
441    @Override
442    public boolean equals(Object other) {
443        if (other == this) {
444            return true;
445        }
446        if (other instanceof Trie) {
447            return rootNode.equals(((Trie) other).rootNode);
448        }
449        return false;
450    }
451
452    /*
453     * (non-Javadoc)
454     *
455     * @see java.lang.Object#hashCode()
456     */
457    @Override
458    public int hashCode() {
459        int multiplier = 17;
460        int hash = 42;
461        if (rootNode != null) {
462            hash = multiplier * hash + rootNode.hashCode();
463        }
464        return hash;
465    }
466}
Note: See TracBrowser for help on using the repository browser.