source: trunk/quest-core-usageprofiles/src/main/java/de/ugoe/cs/quest/usageprofiles/Trie.java @ 559

Last change on this file since 559 was 559, checked in by sherbold, 12 years ago
  • adapted to quest coding style
  • Property svn:mime-type set to text/plain
File size: 12.8 KB
Line 
1
2package de.ugoe.cs.quest.usageprofiles;
3
4import java.io.Serializable;
5import java.security.InvalidParameterException;
6import java.util.Collection;
7import java.util.HashSet;
8import java.util.LinkedHashSet;
9import java.util.LinkedList;
10import java.util.List;
11
12import de.ugoe.cs.util.StringTools;
13
14import edu.uci.ics.jung.graph.DelegateTree;
15import edu.uci.ics.jung.graph.Graph;
16import edu.uci.ics.jung.graph.Tree;
17
18/**
19 * <p>
20 * This class implements a <it>trie</it>, i.e., a tree of sequences that the occurence of
21 * subsequences up to a predefined length. This length is the trie order.
22 * </p>
23 *
24 * @author Steffen Herbold
25 *
26 * @param <T>
27 *            Type of the symbols that are stored in the trie.
28 *
29 * @see TrieNode
30 */
31public class Trie<T> implements IDotCompatible, Serializable {
32
33    /**
34     * <p>
35     * Id for object serialization.
36     * </p>
37     */
38    private static final long serialVersionUID = 1L;
39
40    /**
41     * <p>
42     * Collection of all symbols occuring in the trie.
43     * </p>
44     */
45    private Collection<T> knownSymbols;
46
47    /**
48     * <p>
49     * Reference to the root of the trie.
50     * </p>
51     */
52    private final TrieNode<T> rootNode;
53
54    /**
55     * <p>
56     * Contructor. Creates a new Trie.
57     * </p>
58     */
59    public Trie() {
60        rootNode = new TrieNode<T>();
61        knownSymbols = new LinkedHashSet<T>();
62    }
63
64    /**
65     * <p>
66     * Copy-Constructor. Creates a new Trie as the copy of other. The other trie must noch be null.
67     * </p>
68     *
69     * @param other
70     *            Trie that is copied
71     */
72    public Trie(Trie<T> other) {
73        if (other == null) {
74            throw new InvalidParameterException("other trie must not be null");
75        }
76        rootNode = new TrieNode<T>(other.rootNode);
77        knownSymbols = new LinkedHashSet<T>(other.knownSymbols);
78    }
79
80    /**
81     * <p>
82     * Returns a collection of all symbols occuring in the trie.
83     * </p>
84     *
85     * @return symbols occuring in the trie
86     */
87    public Collection<T> getKnownSymbols() {
88        return new LinkedHashSet<T>(knownSymbols);
89    }
90
91    /**
92     * <p>
93     * Trains the current trie using the given sequence and adds all subsequence of length
94     * {@code maxOrder}.
95     * </p>
96     *
97     * @param sequence
98     *            sequence whose subsequences are added to the trie
99     * @param maxOrder
100     *            maximum length of the subsequences added to the trie
101     */
102    public void train(List<T> sequence, int maxOrder) {
103        if (maxOrder < 1) {
104            return;
105        }
106        IncompleteMemory<T> latestActions = new IncompleteMemory<T>(maxOrder);
107        int i = 0;
108        for (T currentEvent : sequence) {
109            latestActions.add(currentEvent);
110            knownSymbols.add(currentEvent);
111            i++;
112            if (i >= maxOrder) {
113                add(latestActions.getLast(maxOrder));
114            }
115        }
116        int sequenceLength = sequence.size();
117        for (int j = maxOrder - 1; j > 0; j--) {
118            add(sequence.subList(sequenceLength - j, sequenceLength));
119        }
120    }
121
122    /**
123     * <p>
124     * Adds a given subsequence to the trie and increases the counters accordingly.
125     * </p>
126     *
127     * @param subsequence
128     *            subsequence whose counters are increased
129     * @see TrieNode#add(List)
130     */
131    protected void add(List<T> subsequence) {
132        if (subsequence != null && !subsequence.isEmpty()) {
133            knownSymbols.addAll(subsequence);
134            subsequence = new LinkedList<T>(subsequence); // defensive copy!
135            T firstSymbol = subsequence.get(0);
136            TrieNode<T> node = getChildCreate(firstSymbol);
137            node.add(subsequence);
138        }
139    }
140
141    /**
142     * <p>
143     * Returns the child of the root node associated with the given symbol or creates it if it does
144     * not exist yet.
145     * </p>
146     *
147     * @param symbol
148     *            symbol whose node is required
149     * @return node associated with the symbol
150     * @see TrieNode#getChildCreate(Object)
151     */
152    protected TrieNode<T> getChildCreate(T symbol) {
153        return rootNode.getChildCreate(symbol);
154    }
155
156    /**
157     * <p>
158     * Returns the child of the root node associated with the given symbol or null if it does not
159     * exist.
160     * </p>
161     *
162     * @param symbol
163     *            symbol whose node is required
164     * @return node associated with the symbol; null if no such node exists
165     * @see TrieNode#getChild(Object)
166     */
167    protected TrieNode<T> getChild(T symbol) {
168        return rootNode.getChild(symbol);
169    }
170
171    /**
172     * <p>
173     * Returns the number of occurences of the given sequence.
174     * </p>
175     *
176     * @param sequence
177     *            sequence whose number of occurences is required
178     * @return number of occurences of the sequence
179     */
180    public int getCount(List<T> sequence) {
181        int count = 0;
182        TrieNode<T> node = find(sequence);
183        if (node != null) {
184            count = node.getCount();
185        }
186        return count;
187    }
188
189    /**
190     * <p>
191     * Returns the number of occurences of the given prefix and a symbol that follows it.<br>
192     * Convenience function to simplify usage of {@link #getCount(List)}.
193     * </p>
194     *
195     * @param sequence
196     *            prefix of the sequence
197     * @param follower
198     *            suffix of the sequence
199     * @return number of occurences of the sequence
200     * @see #getCount(List)
201     */
202    public int getCount(List<T> sequence, T follower) {
203        List<T> tmpSequence = new LinkedList<T>(sequence);
204        tmpSequence.add(follower);
205        return getCount(tmpSequence);
206
207    }
208
209    /**
210     * <p>
211     * Searches the trie for a given sequence and returns the node associated with the sequence or
212     * null if no such node is found.
213     * </p>
214     *
215     * @param sequence
216     *            sequence that is searched for
217     * @return node associated with the sequence
218     * @see TrieNode#find(List)
219     */
220    public TrieNode<T> find(List<T> sequence) {
221        if (sequence == null || sequence.isEmpty()) {
222            return rootNode;
223        }
224        List<T> sequenceCopy = new LinkedList<T>(sequence);
225        TrieNode<T> result = null;
226        TrieNode<T> node = getChild(sequenceCopy.get(0));
227        if (node != null) {
228            sequenceCopy.remove(0);
229            result = node.find(sequenceCopy);
230        }
231        return result;
232    }
233
234    /**
235     * <p>
236     * Returns a collection of all symbols that follow a given sequence in the trie. In case the
237     * sequence is not found or no symbols follow the sequence the result will be empty.
238     * </p>
239     *
240     * @param sequence
241     *            sequence whose followers are returned
242     * @return symbols following the given sequence
243     * @see TrieNode#getFollowingSymbols()
244     */
245    public Collection<T> getFollowingSymbols(List<T> sequence) {
246        Collection<T> result = new LinkedList<T>();
247        TrieNode<T> node = find(sequence);
248        if (node != null) {
249            result = node.getFollowingSymbols();
250        }
251        return result;
252    }
253
254    /**
255     * <p>
256     * Returns the longest suffix of the given context that is contained in the tree and whose
257     * children are leaves.
258     * </p>
259     *
260     * @param context
261     *            context whose suffix is searched for
262     * @return longest suffix of the context
263     */
264    public List<T> getContextSuffix(List<T> context) {
265        List<T> contextSuffix;
266        if (context != null) {
267            contextSuffix = new LinkedList<T>(context); // defensive copy
268        }
269        else {
270            contextSuffix = new LinkedList<T>();
271        }
272        boolean suffixFound = false;
273
274        while (!suffixFound) {
275            if (contextSuffix.isEmpty()) {
276                suffixFound = true; // suffix is the empty word
277            }
278            else {
279                TrieNode<T> node = find(contextSuffix);
280                if (node != null) {
281                    if (!node.getFollowingSymbols().isEmpty()) {
282                        suffixFound = true;
283                    }
284                }
285                if (!suffixFound) {
286                    contextSuffix.remove(0);
287                }
288            }
289        }
290
291        return contextSuffix;
292    }
293
294    /**
295     * <p>
296     * Helper class for graph visualization of a trie.
297     * </p>
298     *
299     * @author Steffen Herbold
300     * @version 1.0
301     */
302    static public class Edge {}
303
304    /**
305     * <p>
306     * Helper class for graph visualization of a trie.
307     * </p>
308     *
309     * @author Steffen Herbold
310     * @version 1.0
311     */
312    static public class TrieVertex {
313
314        /**
315         * <p>
316         * Id of the vertex.
317         * </p>
318         */
319        private String id;
320
321        /**
322         * <p>
323         * Contructor. Creates a new TrieVertex.
324         * </p>
325         *
326         * @param id
327         *            id of the vertex
328         */
329        protected TrieVertex(String id) {
330            this.id = id;
331        }
332
333        /**
334         * <p>
335         * Returns the id of the vertex.
336         * </p>
337         *
338         * @see java.lang.Object#toString()
339         */
340        @Override
341        public String toString() {
342            return id;
343        }
344    }
345
346    /**
347     * <p>
348     * Returns a {@link Graph} representation of the trie.
349     * </p>
350     *
351     * @return {@link Graph} representation of the trie
352     */
353    protected Tree<TrieVertex, Edge> getGraph() {
354        DelegateTree<TrieVertex, Edge> graph = new DelegateTree<TrieVertex, Edge>();
355        rootNode.getGraph(null, graph);
356        return graph;
357    }
358
359    /*
360     * (non-Javadoc)
361     *
362     * @see de.ugoe.cs.quest.usageprofiles.IDotCompatible#getDotRepresentation()
363     */
364    public String getDotRepresentation() {
365        StringBuilder stringBuilder = new StringBuilder();
366        stringBuilder.append("digraph model {" + StringTools.ENDLINE);
367        rootNode.appendDotRepresentation(stringBuilder);
368        stringBuilder.append('}' + StringTools.ENDLINE);
369        return stringBuilder.toString();
370    }
371
372    /**
373     * <p>
374     * Returns the string representation of the root node.
375     * </p>
376     *
377     * @see TrieNode#toString()
378     * @see java.lang.Object#toString()
379     */
380    @Override
381    public String toString() {
382        return rootNode.toString();
383    }
384
385    /**
386     * <p>
387     * Returns the number of symbols contained in the trie.
388     * </p>
389     *
390     * @return number of symbols contained in the trie
391     */
392    public int getNumSymbols() {
393        return knownSymbols.size();
394    }
395
396    /**
397     * <p>
398     * Returns the number of trie nodes that are ancestors of a leaf. This is the equivalent to the
399     * number of states a first-order markov model would have.
400     * <p>
401     *
402     * @return number of trie nodes that are ancestors of leafs.
403     */
404    public int getNumLeafAncestors() {
405        List<TrieNode<T>> ancestors = new LinkedList<TrieNode<T>>();
406        rootNode.getLeafAncestors(ancestors);
407        return ancestors.size();
408    }
409
410    /**
411     * <p>
412     * Returns the number of trie nodes that are leafs.
413     * </p>
414     *
415     * @return number of leafs in the trie
416     */
417    public int getNumLeafs() {
418        return rootNode.getNumLeafs();
419    }
420
421    /**
422     * <p>
423     * Updates the list of known symbols by replacing it with all symbols that are found in the
424     * child nodes of the root node. This should be the same as all symbols that are contained in
425     * the trie.
426     * </p>
427     */
428    public void updateKnownSymbols() {
429        knownSymbols = new HashSet<T>();
430        for (TrieNode<T> node : rootNode.getChildren()) {
431            knownSymbols.add(node.getSymbol());
432        }
433    }
434
435    /**
436     * <p>
437     * Two Tries are defined as equal, if their {@link #rootNode} are equal.
438     * </p>
439     *
440     * @see java.lang.Object#equals(java.lang.Object)
441     */
442    @SuppressWarnings("rawtypes")
443    @Override
444    public boolean equals(Object other) {
445        if (other == this) {
446            return true;
447        }
448        if (other instanceof Trie) {
449            return rootNode.equals(((Trie) other).rootNode);
450        }
451        return false;
452    }
453
454    /*
455     * (non-Javadoc)
456     *
457     * @see java.lang.Object#hashCode()
458     */
459    @Override
460    public int hashCode() {
461        int multiplier = 17;
462        int hash = 42;
463        if (rootNode != null) {
464            hash = multiplier * hash + rootNode.hashCode();
465        }
466        return hash;
467    }
468}
Note: See TracBrowser for help on using the repository browser.