source: trunk/autoquest-core-usageprofiles/src/main/java/de/ugoe/cs/autoquest/usageprofiles/Trie.java @ 927

Last change on this file since 927 was 927, checked in by sherbold, 12 years ago
  • added copyright under the Apache License, Version 2.0
  • Property svn:mime-type set to text/plain
File size: 13.4 KB
Line 
1//   Copyright 2012 Georg-August-Universität Göttingen, Germany
2//
3//   Licensed under the Apache License, Version 2.0 (the "License");
4//   you may not use this file except in compliance with the License.
5//   You may obtain a copy of the License at
6//
7//       http://www.apache.org/licenses/LICENSE-2.0
8//
9//   Unless required by applicable law or agreed to in writing, software
10//   distributed under the License is distributed on an "AS IS" BASIS,
11//   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12//   See the License for the specific language governing permissions and
13//   limitations under the License.
14
15package de.ugoe.cs.autoquest.usageprofiles;
16
17import java.io.Serializable;
18import java.util.Collection;
19import java.util.HashSet;
20import java.util.LinkedHashSet;
21import java.util.LinkedList;
22import java.util.List;
23
24import de.ugoe.cs.util.StringTools;
25
26import edu.uci.ics.jung.graph.DelegateTree;
27import edu.uci.ics.jung.graph.Graph;
28import edu.uci.ics.jung.graph.Tree;
29
30/**
31 * <p>
32 * This class implements a <it>trie</it>, i.e., a tree of sequences that the occurence of
33 * subsequences up to a predefined length. This length is the trie order.
34 * </p>
35 *
36 * @author Steffen Herbold
37 *
38 * @param <T>
39 *            Type of the symbols that are stored in the trie.
40 *
41 * @see TrieNode
42 */
43public class Trie<T> implements IDotCompatible, Serializable {
44
45    /**
46     * <p>
47     * Id for object serialization.
48     * </p>
49     */
50    private static final long serialVersionUID = 1L;
51
52    /**
53     * <p>
54     * Collection of all symbols occuring in the trie.
55     * </p>
56     */
57    private Collection<T> knownSymbols;
58
59    /**
60     * <p>
61     * Reference to the root of the trie.
62     * </p>
63     */
64    private final TrieNode<T> rootNode;
65
66    /**
67     * <p>
68     * Contructor. Creates a new Trie.
69     * </p>
70     */
71    public Trie() {
72        rootNode = new TrieNode<T>();
73        knownSymbols = new LinkedHashSet<T>();
74    }
75
76    /**
77     * <p>
78     * Copy-Constructor. Creates a new Trie as the copy of other. The other trie must noch be null.
79     * </p>
80     *
81     * @param other
82     *            Trie that is copied
83     */
84    public Trie(Trie<T> other) {
85        if (other == null) {
86            throw new IllegalArgumentException("other trie must not be null");
87        }
88        rootNode = new TrieNode<T>(other.rootNode);
89        knownSymbols = new LinkedHashSet<T>(other.knownSymbols);
90    }
91
92    /**
93     * <p>
94     * Returns a collection of all symbols occuring in the trie.
95     * </p>
96     *
97     * @return symbols occuring in the trie
98     */
99    public Collection<T> getKnownSymbols() {
100        return new LinkedHashSet<T>(knownSymbols);
101    }
102
103    /**
104     * <p>
105     * Trains the current trie using the given sequence and adds all subsequence of length
106     * {@code maxOrder}.
107     * </p>
108     *
109     * @param sequence
110     *            sequence whose subsequences are added to the trie
111     * @param maxOrder
112     *            maximum length of the subsequences added to the trie
113     */
114    public void train(List<T> sequence, int maxOrder) {
115        if (maxOrder < 1) {
116            return;
117        }
118        IncompleteMemory<T> latestActions = new IncompleteMemory<T>(maxOrder);
119        int i = 0;
120        for (T currentEvent : sequence) {
121            latestActions.add(currentEvent);
122            knownSymbols.add(currentEvent);
123            i++;
124            if (i >= maxOrder) {
125                add(latestActions.getLast(maxOrder));
126            }
127        }
128        int sequenceLength = sequence.size();
129        for (int j = maxOrder - 1; j > 0; j--) {
130            add(sequence.subList(sequenceLength - j, sequenceLength));
131        }
132    }
133
134    /**
135     * <p>
136     * Adds a given subsequence to the trie and increases the counters accordingly.
137     * </p>
138     *
139     * @param subsequence
140     *            subsequence whose counters are increased
141     * @see TrieNode#add(List)
142     */
143    protected void add(List<T> subsequence) {
144        if (subsequence != null && !subsequence.isEmpty()) {
145            knownSymbols.addAll(subsequence);
146            subsequence = new LinkedList<T>(subsequence); // defensive copy!
147            T firstSymbol = subsequence.get(0);
148            TrieNode<T> node = getChildCreate(firstSymbol);
149            node.add(subsequence);
150        }
151    }
152
153    /**
154     * <p>
155     * Returns the child of the root node associated with the given symbol or creates it if it does
156     * not exist yet.
157     * </p>
158     *
159     * @param symbol
160     *            symbol whose node is required
161     * @return node associated with the symbol
162     * @see TrieNode#getChildCreate(Object)
163     */
164    protected TrieNode<T> getChildCreate(T symbol) {
165        return rootNode.getChildCreate(symbol);
166    }
167
168    /**
169     * <p>
170     * Returns the child of the root node associated with the given symbol or null if it does not
171     * exist.
172     * </p>
173     *
174     * @param symbol
175     *            symbol whose node is required
176     * @return node associated with the symbol; null if no such node exists
177     * @see TrieNode#getChild(Object)
178     */
179    protected TrieNode<T> getChild(T symbol) {
180        return rootNode.getChild(symbol);
181    }
182
183    /**
184     * <p>
185     * Returns the number of occurences of the given sequence.
186     * </p>
187     *
188     * @param sequence
189     *            sequence whose number of occurences is required
190     * @return number of occurences of the sequence
191     */
192    public int getCount(List<T> sequence) {
193        int count = 0;
194        TrieNode<T> node = find(sequence);
195        if (node != null) {
196            count = node.getCount();
197        }
198        return count;
199    }
200
201    /**
202     * <p>
203     * Returns the number of occurences of the given prefix and a symbol that follows it.<br>
204     * Convenience function to simplify usage of {@link #getCount(List)}.
205     * </p>
206     *
207     * @param sequence
208     *            prefix of the sequence
209     * @param follower
210     *            suffix of the sequence
211     * @return number of occurences of the sequence
212     * @see #getCount(List)
213     */
214    public int getCount(List<T> sequence, T follower) {
215        List<T> tmpSequence = new LinkedList<T>(sequence);
216        tmpSequence.add(follower);
217        return getCount(tmpSequence);
218
219    }
220
221    /**
222     * <p>
223     * Searches the trie for a given sequence and returns the node associated with the sequence or
224     * null if no such node is found.
225     * </p>
226     *
227     * @param sequence
228     *            sequence that is searched for
229     * @return node associated with the sequence
230     * @see TrieNode#find(List)
231     */
232    public TrieNode<T> find(List<T> sequence) {
233        if (sequence == null || sequence.isEmpty()) {
234            return rootNode;
235        }
236        List<T> sequenceCopy = new LinkedList<T>(sequence);
237        TrieNode<T> result = null;
238        TrieNode<T> node = getChild(sequenceCopy.get(0));
239        if (node != null) {
240            sequenceCopy.remove(0);
241            result = node.find(sequenceCopy);
242        }
243        return result;
244    }
245
246    /**
247     * <p>
248     * Returns a collection of all symbols that follow a given sequence in the trie. In case the
249     * sequence is not found or no symbols follow the sequence the result will be empty.
250     * </p>
251     *
252     * @param sequence
253     *            sequence whose followers are returned
254     * @return symbols following the given sequence
255     * @see TrieNode#getFollowingSymbols()
256     */
257    public Collection<T> getFollowingSymbols(List<T> sequence) {
258        Collection<T> result = new LinkedList<T>();
259        TrieNode<T> node = find(sequence);
260        if (node != null) {
261            result = node.getFollowingSymbols();
262        }
263        return result;
264    }
265
266    /**
267     * <p>
268     * Returns the longest suffix of the given context that is contained in the tree and whose
269     * children are leaves.
270     * </p>
271     *
272     * @param context
273     *            context whose suffix is searched for
274     * @return longest suffix of the context
275     */
276    public List<T> getContextSuffix(List<T> context) {
277        List<T> contextSuffix;
278        if (context != null) {
279            contextSuffix = new LinkedList<T>(context); // defensive copy
280        }
281        else {
282            contextSuffix = new LinkedList<T>();
283        }
284        boolean suffixFound = false;
285
286        while (!suffixFound) {
287            if (contextSuffix.isEmpty()) {
288                suffixFound = true; // suffix is the empty word
289            }
290            else {
291                TrieNode<T> node = find(contextSuffix);
292                if (node != null) {
293                    if (!node.getFollowingSymbols().isEmpty()) {
294                        suffixFound = true;
295                    }
296                }
297                if (!suffixFound) {
298                    contextSuffix.remove(0);
299                }
300            }
301        }
302
303        return contextSuffix;
304    }
305
306    /**
307     * <p>
308     * Helper class for graph visualization of a trie.
309     * </p>
310     *
311     * @author Steffen Herbold
312     * @version 1.0
313     */
314    static public class Edge {}
315
316    /**
317     * <p>
318     * Helper class for graph visualization of a trie.
319     * </p>
320     *
321     * @author Steffen Herbold
322     * @version 1.0
323     */
324    static public class TrieVertex {
325
326        /**
327         * <p>
328         * Id of the vertex.
329         * </p>
330         */
331        private String id;
332
333        /**
334         * <p>
335         * Contructor. Creates a new TrieVertex.
336         * </p>
337         *
338         * @param id
339         *            id of the vertex
340         */
341        protected TrieVertex(String id) {
342            this.id = id;
343        }
344
345        /**
346         * <p>
347         * Returns the id of the vertex.
348         * </p>
349         *
350         * @see java.lang.Object#toString()
351         */
352        @Override
353        public String toString() {
354            return id;
355        }
356    }
357
358    /**
359     * <p>
360     * Returns a {@link Graph} representation of the trie.
361     * </p>
362     *
363     * @return {@link Graph} representation of the trie
364     */
365    protected Tree<TrieVertex, Edge> getGraph() {
366        DelegateTree<TrieVertex, Edge> graph = new DelegateTree<TrieVertex, Edge>();
367        rootNode.getGraph(null, graph);
368        return graph;
369    }
370
371    /*
372     * (non-Javadoc)
373     *
374     * @see de.ugoe.cs.autoquest.usageprofiles.IDotCompatible#getDotRepresentation()
375     */
376    public String getDotRepresentation() {
377        StringBuilder stringBuilder = new StringBuilder();
378        stringBuilder.append("digraph model {" + StringTools.ENDLINE);
379        rootNode.appendDotRepresentation(stringBuilder);
380        stringBuilder.append('}' + StringTools.ENDLINE);
381        return stringBuilder.toString();
382    }
383
384    /**
385     * <p>
386     * Returns the string representation of the root node.
387     * </p>
388     *
389     * @see TrieNode#toString()
390     * @see java.lang.Object#toString()
391     */
392    @Override
393    public String toString() {
394        return rootNode.toString();
395    }
396
397    /**
398     * <p>
399     * Returns the number of symbols contained in the trie.
400     * </p>
401     *
402     * @return number of symbols contained in the trie
403     */
404    public int getNumSymbols() {
405        return knownSymbols.size();
406    }
407
408    /**
409     * <p>
410     * Returns the number of trie nodes that are ancestors of a leaf. This is the equivalent to the
411     * number of states a first-order markov model would have.
412     * <p>
413     *
414     * @return number of trie nodes that are ancestors of leafs.
415     */
416    public int getNumLeafAncestors() {
417        List<TrieNode<T>> ancestors = new LinkedList<TrieNode<T>>();
418        rootNode.getLeafAncestors(ancestors);
419        return ancestors.size();
420    }
421
422    /**
423     * <p>
424     * Returns the number of trie nodes that are leafs.
425     * </p>
426     *
427     * @return number of leafs in the trie
428     */
429    public int getNumLeafs() {
430        return rootNode.getNumLeafs();
431    }
432
433    /**
434     * <p>
435     * Updates the list of known symbols by replacing it with all symbols that are found in the
436     * child nodes of the root node. This should be the same as all symbols that are contained in
437     * the trie.
438     * </p>
439     */
440    public void updateKnownSymbols() {
441        knownSymbols = new HashSet<T>();
442        for (TrieNode<T> node : rootNode.getChildren()) {
443            knownSymbols.add(node.getSymbol());
444        }
445    }
446
447    /**
448     * <p>
449     * Two Tries are defined as equal, if their {@link #rootNode} are equal.
450     * </p>
451     *
452     * @see java.lang.Object#equals(java.lang.Object)
453     */
454    @SuppressWarnings("rawtypes")
455    @Override
456    public boolean equals(Object other) {
457        if (other == this) {
458            return true;
459        }
460        if (other instanceof Trie) {
461            return rootNode.equals(((Trie) other).rootNode);
462        }
463        return false;
464    }
465
466    /*
467     * (non-Javadoc)
468     *
469     * @see java.lang.Object#hashCode()
470     */
471    @Override
472    public int hashCode() {
473        int multiplier = 17;
474        int hash = 42;
475        if (rootNode != null) {
476            hash = multiplier * hash + rootNode.hashCode();
477        }
478        return hash;
479    }
480}
Note: See TracBrowser for help on using the repository browser.