source: trunk/quest-core-usageprofiles/src/main/java/de/ugoe/cs/quest/usageprofiles/Trie.java @ 655

Last change on this file since 655 was 655, checked in by pharms, 12 years ago
  • removed old copyright file header
  • Property svn:mime-type set to text/plain
File size: 12.8 KB
Line 
1package de.ugoe.cs.quest.usageprofiles;
2
3import java.io.Serializable;
4import java.security.InvalidParameterException;
5import java.util.Collection;
6import java.util.HashSet;
7import java.util.LinkedHashSet;
8import java.util.LinkedList;
9import java.util.List;
10
11import de.ugoe.cs.util.StringTools;
12
13import edu.uci.ics.jung.graph.DelegateTree;
14import edu.uci.ics.jung.graph.Graph;
15import edu.uci.ics.jung.graph.Tree;
16
17/**
18 * <p>
19 * This class implements a <it>trie</it>, i.e., a tree of sequences that the occurence of
20 * subsequences up to a predefined length. This length is the trie order.
21 * </p>
22 *
23 * @author Steffen Herbold
24 *
25 * @param <T>
26 *            Type of the symbols that are stored in the trie.
27 *
28 * @see TrieNode
29 */
30public class Trie<T> implements IDotCompatible, Serializable {
31
32    /**
33     * <p>
34     * Id for object serialization.
35     * </p>
36     */
37    private static final long serialVersionUID = 1L;
38
39    /**
40     * <p>
41     * Collection of all symbols occuring in the trie.
42     * </p>
43     */
44    private Collection<T> knownSymbols;
45
46    /**
47     * <p>
48     * Reference to the root of the trie.
49     * </p>
50     */
51    private final TrieNode<T> rootNode;
52
53    /**
54     * <p>
55     * Contructor. Creates a new Trie.
56     * </p>
57     */
58    public Trie() {
59        rootNode = new TrieNode<T>();
60        knownSymbols = new LinkedHashSet<T>();
61    }
62
63    /**
64     * <p>
65     * Copy-Constructor. Creates a new Trie as the copy of other. The other trie must noch be null.
66     * </p>
67     *
68     * @param other
69     *            Trie that is copied
70     */
71    public Trie(Trie<T> other) {
72        if (other == null) {
73            throw new InvalidParameterException("other trie must not be null");
74        }
75        rootNode = new TrieNode<T>(other.rootNode);
76        knownSymbols = new LinkedHashSet<T>(other.knownSymbols);
77    }
78
79    /**
80     * <p>
81     * Returns a collection of all symbols occuring in the trie.
82     * </p>
83     *
84     * @return symbols occuring in the trie
85     */
86    public Collection<T> getKnownSymbols() {
87        return new LinkedHashSet<T>(knownSymbols);
88    }
89
90    /**
91     * <p>
92     * Trains the current trie using the given sequence and adds all subsequence of length
93     * {@code maxOrder}.
94     * </p>
95     *
96     * @param sequence
97     *            sequence whose subsequences are added to the trie
98     * @param maxOrder
99     *            maximum length of the subsequences added to the trie
100     */
101    public void train(List<T> sequence, int maxOrder) {
102        if (maxOrder < 1) {
103            return;
104        }
105        IncompleteMemory<T> latestActions = new IncompleteMemory<T>(maxOrder);
106        int i = 0;
107        for (T currentEvent : sequence) {
108            latestActions.add(currentEvent);
109            knownSymbols.add(currentEvent);
110            i++;
111            if (i >= maxOrder) {
112                add(latestActions.getLast(maxOrder));
113            }
114        }
115        int sequenceLength = sequence.size();
116        for (int j = maxOrder - 1; j > 0; j--) {
117            add(sequence.subList(sequenceLength - j, sequenceLength));
118        }
119    }
120
121    /**
122     * <p>
123     * Adds a given subsequence to the trie and increases the counters accordingly.
124     * </p>
125     *
126     * @param subsequence
127     *            subsequence whose counters are increased
128     * @see TrieNode#add(List)
129     */
130    protected void add(List<T> subsequence) {
131        if (subsequence != null && !subsequence.isEmpty()) {
132            knownSymbols.addAll(subsequence);
133            subsequence = new LinkedList<T>(subsequence); // defensive copy!
134            T firstSymbol = subsequence.get(0);
135            TrieNode<T> node = getChildCreate(firstSymbol);
136            node.add(subsequence);
137        }
138    }
139
140    /**
141     * <p>
142     * Returns the child of the root node associated with the given symbol or creates it if it does
143     * not exist yet.
144     * </p>
145     *
146     * @param symbol
147     *            symbol whose node is required
148     * @return node associated with the symbol
149     * @see TrieNode#getChildCreate(Object)
150     */
151    protected TrieNode<T> getChildCreate(T symbol) {
152        return rootNode.getChildCreate(symbol);
153    }
154
155    /**
156     * <p>
157     * Returns the child of the root node associated with the given symbol or null if it does not
158     * exist.
159     * </p>
160     *
161     * @param symbol
162     *            symbol whose node is required
163     * @return node associated with the symbol; null if no such node exists
164     * @see TrieNode#getChild(Object)
165     */
166    protected TrieNode<T> getChild(T symbol) {
167        return rootNode.getChild(symbol);
168    }
169
170    /**
171     * <p>
172     * Returns the number of occurences of the given sequence.
173     * </p>
174     *
175     * @param sequence
176     *            sequence whose number of occurences is required
177     * @return number of occurences of the sequence
178     */
179    public int getCount(List<T> sequence) {
180        int count = 0;
181        TrieNode<T> node = find(sequence);
182        if (node != null) {
183            count = node.getCount();
184        }
185        return count;
186    }
187
188    /**
189     * <p>
190     * Returns the number of occurences of the given prefix and a symbol that follows it.<br>
191     * Convenience function to simplify usage of {@link #getCount(List)}.
192     * </p>
193     *
194     * @param sequence
195     *            prefix of the sequence
196     * @param follower
197     *            suffix of the sequence
198     * @return number of occurences of the sequence
199     * @see #getCount(List)
200     */
201    public int getCount(List<T> sequence, T follower) {
202        List<T> tmpSequence = new LinkedList<T>(sequence);
203        tmpSequence.add(follower);
204        return getCount(tmpSequence);
205
206    }
207
208    /**
209     * <p>
210     * Searches the trie for a given sequence and returns the node associated with the sequence or
211     * null if no such node is found.
212     * </p>
213     *
214     * @param sequence
215     *            sequence that is searched for
216     * @return node associated with the sequence
217     * @see TrieNode#find(List)
218     */
219    public TrieNode<T> find(List<T> sequence) {
220        if (sequence == null || sequence.isEmpty()) {
221            return rootNode;
222        }
223        List<T> sequenceCopy = new LinkedList<T>(sequence);
224        TrieNode<T> result = null;
225        TrieNode<T> node = getChild(sequenceCopy.get(0));
226        if (node != null) {
227            sequenceCopy.remove(0);
228            result = node.find(sequenceCopy);
229        }
230        return result;
231    }
232
233    /**
234     * <p>
235     * Returns a collection of all symbols that follow a given sequence in the trie. In case the
236     * sequence is not found or no symbols follow the sequence the result will be empty.
237     * </p>
238     *
239     * @param sequence
240     *            sequence whose followers are returned
241     * @return symbols following the given sequence
242     * @see TrieNode#getFollowingSymbols()
243     */
244    public Collection<T> getFollowingSymbols(List<T> sequence) {
245        Collection<T> result = new LinkedList<T>();
246        TrieNode<T> node = find(sequence);
247        if (node != null) {
248            result = node.getFollowingSymbols();
249        }
250        return result;
251    }
252
253    /**
254     * <p>
255     * Returns the longest suffix of the given context that is contained in the tree and whose
256     * children are leaves.
257     * </p>
258     *
259     * @param context
260     *            context whose suffix is searched for
261     * @return longest suffix of the context
262     */
263    public List<T> getContextSuffix(List<T> context) {
264        List<T> contextSuffix;
265        if (context != null) {
266            contextSuffix = new LinkedList<T>(context); // defensive copy
267        }
268        else {
269            contextSuffix = new LinkedList<T>();
270        }
271        boolean suffixFound = false;
272
273        while (!suffixFound) {
274            if (contextSuffix.isEmpty()) {
275                suffixFound = true; // suffix is the empty word
276            }
277            else {
278                TrieNode<T> node = find(contextSuffix);
279                if (node != null) {
280                    if (!node.getFollowingSymbols().isEmpty()) {
281                        suffixFound = true;
282                    }
283                }
284                if (!suffixFound) {
285                    contextSuffix.remove(0);
286                }
287            }
288        }
289
290        return contextSuffix;
291    }
292
293    /**
294     * <p>
295     * Helper class for graph visualization of a trie.
296     * </p>
297     *
298     * @author Steffen Herbold
299     * @version 1.0
300     */
301    static public class Edge {}
302
303    /**
304     * <p>
305     * Helper class for graph visualization of a trie.
306     * </p>
307     *
308     * @author Steffen Herbold
309     * @version 1.0
310     */
311    static public class TrieVertex {
312
313        /**
314         * <p>
315         * Id of the vertex.
316         * </p>
317         */
318        private String id;
319
320        /**
321         * <p>
322         * Contructor. Creates a new TrieVertex.
323         * </p>
324         *
325         * @param id
326         *            id of the vertex
327         */
328        protected TrieVertex(String id) {
329            this.id = id;
330        }
331
332        /**
333         * <p>
334         * Returns the id of the vertex.
335         * </p>
336         *
337         * @see java.lang.Object#toString()
338         */
339        @Override
340        public String toString() {
341            return id;
342        }
343    }
344
345    /**
346     * <p>
347     * Returns a {@link Graph} representation of the trie.
348     * </p>
349     *
350     * @return {@link Graph} representation of the trie
351     */
352    protected Tree<TrieVertex, Edge> getGraph() {
353        DelegateTree<TrieVertex, Edge> graph = new DelegateTree<TrieVertex, Edge>();
354        rootNode.getGraph(null, graph);
355        return graph;
356    }
357
358    /*
359     * (non-Javadoc)
360     *
361     * @see de.ugoe.cs.quest.usageprofiles.IDotCompatible#getDotRepresentation()
362     */
363    public String getDotRepresentation() {
364        StringBuilder stringBuilder = new StringBuilder();
365        stringBuilder.append("digraph model {" + StringTools.ENDLINE);
366        rootNode.appendDotRepresentation(stringBuilder);
367        stringBuilder.append('}' + StringTools.ENDLINE);
368        return stringBuilder.toString();
369    }
370
371    /**
372     * <p>
373     * Returns the string representation of the root node.
374     * </p>
375     *
376     * @see TrieNode#toString()
377     * @see java.lang.Object#toString()
378     */
379    @Override
380    public String toString() {
381        return rootNode.toString();
382    }
383
384    /**
385     * <p>
386     * Returns the number of symbols contained in the trie.
387     * </p>
388     *
389     * @return number of symbols contained in the trie
390     */
391    public int getNumSymbols() {
392        return knownSymbols.size();
393    }
394
395    /**
396     * <p>
397     * Returns the number of trie nodes that are ancestors of a leaf. This is the equivalent to the
398     * number of states a first-order markov model would have.
399     * <p>
400     *
401     * @return number of trie nodes that are ancestors of leafs.
402     */
403    public int getNumLeafAncestors() {
404        List<TrieNode<T>> ancestors = new LinkedList<TrieNode<T>>();
405        rootNode.getLeafAncestors(ancestors);
406        return ancestors.size();
407    }
408
409    /**
410     * <p>
411     * Returns the number of trie nodes that are leafs.
412     * </p>
413     *
414     * @return number of leafs in the trie
415     */
416    public int getNumLeafs() {
417        return rootNode.getNumLeafs();
418    }
419
420    /**
421     * <p>
422     * Updates the list of known symbols by replacing it with all symbols that are found in the
423     * child nodes of the root node. This should be the same as all symbols that are contained in
424     * the trie.
425     * </p>
426     */
427    public void updateKnownSymbols() {
428        knownSymbols = new HashSet<T>();
429        for (TrieNode<T> node : rootNode.getChildren()) {
430            knownSymbols.add(node.getSymbol());
431        }
432    }
433
434    /**
435     * <p>
436     * Two Tries are defined as equal, if their {@link #rootNode} are equal.
437     * </p>
438     *
439     * @see java.lang.Object#equals(java.lang.Object)
440     */
441    @SuppressWarnings("rawtypes")
442    @Override
443    public boolean equals(Object other) {
444        if (other == this) {
445            return true;
446        }
447        if (other instanceof Trie) {
448            return rootNode.equals(((Trie) other).rootNode);
449        }
450        return false;
451    }
452
453    /*
454     * (non-Javadoc)
455     *
456     * @see java.lang.Object#hashCode()
457     */
458    @Override
459    public int hashCode() {
460        int multiplier = 17;
461        int hash = 42;
462        if (rootNode != null) {
463            hash = multiplier * hash + rootNode.hashCode();
464        }
465        return hash;
466    }
467}
Note: See TracBrowser for help on using the repository browser.