source: trunk/autoquest-core-usageprofiles/src/main/java/de/ugoe/cs/autoquest/usageprofiles/Trie.java

Last change on this file was 2218, checked in by pharms, 7 years ago
  • java doc issues removal
  • Property svn:mime-type set to text/plain
File size: 21.8 KB
Line 
1//   Copyright 2012 Georg-August-Universität Göttingen, Germany
2//
3//   Licensed under the Apache License, Version 2.0 (the "License");
4//   you may not use this file except in compliance with the License.
5//   You may obtain a copy of the License at
6//
7//       http://www.apache.org/licenses/LICENSE-2.0
8//
9//   Unless required by applicable law or agreed to in writing, software
10//   distributed under the License is distributed on an "AS IS" BASIS,
11//   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12//   See the License for the specific language governing permissions and
13//   limitations under the License.
14
15package de.ugoe.cs.autoquest.usageprofiles;
16
17import java.io.Serializable;
18import java.util.Collection;
19import java.util.LinkedHashSet;
20import java.util.LinkedList;
21import java.util.List;
22
23import de.ugoe.cs.util.StringTools;
24
25import edu.uci.ics.jung.graph.DelegateTree;
26import edu.uci.ics.jung.graph.Graph;
27import edu.uci.ics.jung.graph.Tree;
28
29/**
30 * <p>
31 * This class implements a trie, i.e., a tree of sequences that represents the occurrence
32 * of subsequences up to a predefined length. This length is the trie order.
33 * </p>
34 *
35 * @author Steffen Herbold, Patrick Harms
36 *
37 * @param <T>
38 *            Type of the symbols that are stored in the trie.
39 *
40 * @see TrieNode
41 */
42public class Trie<T> implements IDotCompatible, Serializable {
43
44    /**
45     * <p>
46     * Id for object serialization.
47     * </p>
48     */
49    private static final long serialVersionUID = 1L;
50
51    /**
52     * <p>
53     * Collection of all symbols occurring in the trie.
54     * </p>
55     */
56    private SymbolMap<T, T> knownSymbols;
57
58    /**
59     * <p>
60     * Reference to the root of the trie.
61     * </p>
62     */
63    private final TrieNode<T> rootNode;
64
65    /**
66     * <p>
67     * Strategy for handling symbols, i.e., for comparing and storing them
68     * </p>
69     */
70    private SymbolStrategy<T> strategy;
71
72    /**
73     * <p>
74     * Constructor. Creates a new trie with a {@link DefaultSymbolStrategy}.
75     * </p>
76     */
77    public Trie() {
78        this(new DefaultSymbolStrategy<T>());
79    }
80
81    /**
82     * <p>
83     * Constructor. Creates a new trie that uses a specific {@link SymbolStrategy}.
84     * </p>
85     *
86     * @param strategy
87     *            strategy to be used for managing symbols
88     *
89     * @throws IllegalArgumentException
90     *            if the strategy is null
91     */
92    public Trie(SymbolStrategy<T> strategy) {
93        if (strategy == null) {
94            throw new IllegalArgumentException("strategy must not be null");
95        }
96        this.strategy = strategy;
97        rootNode = new TrieNode<T>(strategy);
98        knownSymbols = strategy.createSymbolMap();
99    }
100
101    /**
102     * <p>
103     * Copy-Constructor. Creates a new trie as the copy of other. The other trie must not be null.
104     * </p>
105     *
106     * @param other
107     *            trie that is copied
108     *
109     * @throws IllegalArgumentException
110     *            if the other trie is null
111     */
112    public Trie(Trie<T> other) {
113        if (other == null) {
114            throw new IllegalArgumentException("other trie must not be null");
115        }
116        rootNode = new TrieNode<T>(other.rootNode);
117        strategy = other.strategy;
118        knownSymbols = strategy.copySymbolMap(other.knownSymbols);
119    }
120
121    /**
122     * <p>
123     * Returns a collection of all symbols occurring in the trie.
124     * </p>
125     *
126     * @return symbols occurring in the trie
127     */
128    public Collection<T> getKnownSymbols() {
129        return new LinkedHashSet<T>(knownSymbols.getSymbols());
130    }
131
132    /**
133     * <p>
134     * Trains the current trie using the given sequence and adds all subsequences of length
135     * {@code maxOrder}.
136     * </p>
137     *
138     * @param sequence
139     *            sequence whose subsequences are added to the trie
140     * @param maxOrder
141     *            maximum length of the subsequences added to the trie
142     */
143    public void train(List<T> sequence, int maxOrder) {
144        if (maxOrder < 1) {
145            return;
146        }
147        IncompleteMemory<T> latestActions = new IncompleteMemory<T>(maxOrder);
148        int i = 0;
149        for (T currentEvent : sequence) {
150            latestActions.add(currentEvent);
151            addToKnownSymbols(currentEvent);
152            i++;
153            if (i >= maxOrder) {
154                add(latestActions.getLast(maxOrder));
155            }
156        }
157        int sequenceLength = sequence.size();
158        int startIndex = Math.max(0, sequenceLength - maxOrder + 1);
159        for (int j = startIndex; j < sequenceLength; j++) {
160            add(sequence.subList(j, sequenceLength));
161        }
162    }
163
164    /**
165     * <p>
166     * Adds a given subsequence to the trie and increases the counters accordingly. NOTE: This
167     * method does not add the symbols to the list of known symbols. This is only ensured using
168     * the method {@link #train(List, int)}.
169     * </p>
170     *
171     * @param subsequence
172     *            subsequence whose counters are increased
173     * @see TrieNode#add(List)
174     */
175    protected void add(List<T> subsequence) {
176        if (subsequence != null && !subsequence.isEmpty()) {
177            subsequence = new LinkedList<T>(subsequence); // defensive copy!
178            T firstSymbol = subsequence.get(0);
179            TrieNode<T> node = getChildCreate(firstSymbol);
180            node.add(subsequence);
181        }
182    }
183
184    /**
185     * <p>
186     * Returns the child of the root node associated with the given symbol or creates it if it does
187     * not exist yet.
188     * </p>
189     *
190     * @param symbol
191     *            symbol whose node is required
192     * @return node associated with the symbol
193     * @see TrieNode#getChildCreate(Object)
194     */
195    protected TrieNode<T> getChildCreate(T symbol) {
196        return rootNode.getChildCreate(symbol);
197    }
198
199    /**
200     * <p>
201     * Returns the child of the root node associated with the given symbol or null if it does not
202     * exist.
203     * </p>
204     *
205     * @param symbol
206     *            symbol whose node is required
207     * @return node associated with the symbol; null if no such node exists
208     * @see TrieNode#getChild(Object)
209     */
210    protected TrieNode<T> getChild(T symbol) {
211        return rootNode.getChild(symbol);
212    }
213
214    /**
215     * <p>
216     * Returns the number of occurrences of the given sequence.
217     * </p>
218     *
219     * @param sequence
220     *            sequence whose number of occurrences is required
221     * @return number of occurrences of the sequence
222     */
223    public int getCount(List<T> sequence) {
224        int count = 0;
225        TrieNode<T> node = find(sequence);
226        if (node != null) {
227            count = node.getCount();
228        }
229        return count;
230    }
231
232    /**
233     * <p>
234     * Returns the number of occurrences of the given prefix and a symbol that follows it.<br>
235     * Convenience function to simplify usage of {@link #getCount(List)}.
236     * </p>
237     *
238     * @param sequence
239     *            prefix of the sequence
240     * @param follower
241     *            suffix of the sequence
242     * @return number of occurrences of the sequence
243     * @see #getCount(List)
244     */
245    public int getCount(List<T> sequence, T follower) {
246        List<T> tmpSequence = new LinkedList<T>(sequence);
247        tmpSequence.add(follower);
248        return getCount(tmpSequence);
249
250    }
251
252    /**
253     * <p>
254     * Searches the trie for a given sequence and returns the node associated with the sequence or
255     * null if no such node is found.
256     * </p>
257     *
258     * @param sequence
259     *            sequence that is searched for
260     * @return node associated with the sequence
261     * @see TrieNode#find(List)
262     */
263    public TrieNode<T> find(List<T> sequence) {
264        if (sequence == null || sequence.isEmpty()) {
265            return rootNode;
266        }
267        List<T> sequenceCopy = new LinkedList<T>(sequence);
268        TrieNode<T> result = null;
269        TrieNode<T> node = getChild(sequenceCopy.get(0));
270        if (node != null) {
271            sequenceCopy.remove(0);
272            result = node.find(sequenceCopy);
273        }
274        return result;
275    }
276
277    /**
278     * <p>
279     * Returns a collection of all symbols that follow a given sequence in the trie. In case the
280     * sequence is not found or no symbols follow the sequence the result will be empty.
281     * </p>
282     *
283     * @param sequence
284     *            sequence whose followers are returned
285     * @return symbols following the given sequence
286     * @see TrieNode#getFollowingSymbols()
287     */
288    public Collection<T> getFollowingSymbols(List<T> sequence) {
289        Collection<T> result = new LinkedList<T>();
290        TrieNode<T> node = find(sequence);
291        if (node != null) {
292            result = node.getFollowingSymbols();
293        }
294        return result;
295    }
296
297    /**
298     * <p>
299     * Returns the longest suffix of the given context that is contained in the tree and whose
300     * children are leaves.
301     * </p>
302     *
303     * @param context
304     *            context whose suffix is searched for
305     * @return longest suffix of the context
306     */
307    public List<T> getContextSuffix(List<T> context) {
308        List<T> contextSuffix;
309        if (context != null) {
310            contextSuffix = new LinkedList<T>(context); // defensive copy
311        }
312        else {
313            contextSuffix = new LinkedList<T>();
314        }
315        boolean suffixFound = false;
316
317        while (!suffixFound) {
318            if (contextSuffix.isEmpty()) {
319                suffixFound = true; // suffix is the empty word
320            }
321            else {
322                TrieNode<T> node = find(contextSuffix);
323                if (node != null) {
324                    if (!node.getFollowingSymbols().isEmpty()) {
325                        suffixFound = true;
326                    }
327                }
328                if (!suffixFound) {
329                    contextSuffix.remove(0);
330                }
331            }
332        }
333
334        return contextSuffix;
335    }
336   
337    /**
338     * <p>
339     * used to recursively process the trie. The provided processor will be called for any path
340     * through the trie. The processor may abort the processing through return values of its
341     * {@link TrieProcessor#process(List, int)} method.
342     * </p>
343     *
344     * @param processor the processor to process the tree
345     */
346    public void process(TrieProcessor<T> processor) {
347        LinkedList<T> context = new LinkedList<T>();
348       
349        for (TrieNode<T> child : rootNode.getChildren()) {
350            if (!process(context, child, processor)) {
351                break;
352            }
353        }
354    }
355
356    /**
357     * <p>
358     * processes a specific path by calling the provided processor. Furthermore, the method
359     * calls itself recursively for further subpaths.
360     * </p>
361     *
362     * @param context   the context of the currently processed trie node, i.e. the preceding
363     *                  symbols
364     * @param child     the processed trie node
365     * @param processor the processor used for processing the trie
366     *
367     * @return true, if processing shall continue, false else
368     */
369    private boolean process(LinkedList<T>    context,
370                            TrieNode<T>      node,
371                            TrieProcessor<T> processor)
372    {
373        context.add(node.getSymbol());
374       
375        TrieProcessor.Result result = processor.process(context, node.getCount());
376       
377        if (result == TrieProcessor.Result.CONTINUE) {
378            for (TrieNode<T> child : node.getChildren()) {
379                if (!process(context, child, processor)) {
380                    break;
381                }
382            }
383        }
384       
385        context.removeLast();
386       
387        return result != TrieProcessor.Result.BREAK;
388    }
389
390    /**
391     * <p>
392     * returns a list of symbol sequences which have a minimal length and that occurred as often
393     * as defined by the given occurrence count. If the given occurrence count is smaller 1 then
394     * those sequences are returned, that occur most often. The resulting list is empty, if there
395     * is no symbol sequence with the minimal length or the provided number of occurrences.
396     * </p>
397     *
398     * @param minimalLength   the minimal length of the returned sequences
399     * @param occurrenceCount the number of occurrences of the returned sequences
400     *
401     * @return as described
402     */
403    public Collection<List<T>> getSequencesWithOccurrenceCount(int minimalLength,
404                                                               int occurrenceCount)
405    {
406        LinkedList<TrieNode<T>> context = new LinkedList<TrieNode<T>>();
407        Collection<List<TrieNode<T>>> paths = new LinkedList<List<TrieNode<T>>>();
408       
409        context.push(rootNode);
410       
411        // traverse the trie and determine all sequences, which have the provided number of
412        // occurrences and a minimal length.
413       
414        // minimalLength + 1 because we denote the depth including the root node
415        determineLongPathsWithMostOccurrences(minimalLength + 1, occurrenceCount, paths, context);
416       
417        Collection<List<T>> resultingPaths = new LinkedList<List<T>>();
418        List<T> resultingPath;
419       
420        if (paths.size() > 0) {
421           
422            for (List<TrieNode<T>> path : paths) {
423                resultingPath = new LinkedList<T>();
424               
425                for (TrieNode<T> node : path) {
426                    if (node.getSymbol() != null) {
427                        resultingPath.add(node.getSymbol());
428                    }
429                }
430               
431                resultingPaths.add(resultingPath);
432            }
433        }
434       
435        return resultingPaths;
436    }
437
438    /**
439     * <p>
440     * Traverses the trie to collect all sequences with a defined number of occurrences and with
441     * a minimal length. If the given occurrence count is smaller 1 then those sequences are
442     * searched that occur most often. The length of the sequences is encoded in the provided
443     * recursion depth.
444     * </p>
445     *
446     * @param minimalDepth    the minimal recursion depth to be done
447     * @param occurrenceCount the number of occurrences of the returned sequences
448     * @param paths           the paths through the trie that all occurred with the same amount
449     *                        (if occurrence count is smaller 1, the paths which occurred most
450     *                        often) and that have the so far found matching number of occurrences
451     *                        (is updated each time a further path with the same number of
452     *                        occurrences is found; if occurrence count is smaller 1
453     *                        it is replaced if a path with more occurrences is found)
454     * @param context         the path through the trie, that is analyzed by the recursive call
455     */
456    private void determineLongPathsWithMostOccurrences(int                           minimalDepth,
457                                                       int                           occurrenceCount,
458                                                       Collection<List<TrieNode<T>>> paths,
459                                                       LinkedList<TrieNode<T>>       context)
460    {
461        int envisagedCount = occurrenceCount;
462
463        // only if we already reached the depth to be achieved, we check if the paths have the
464        // required number of occurrences
465        if (context.size() >= minimalDepth) {
466           
467            if (envisagedCount < 1) {
468                // try to determine the maximum number of occurrences so far, if any
469                if (paths.size() > 0) {
470                    List<TrieNode<T>> path = paths.iterator().next();
471                    envisagedCount = path.get(path.size() - 1).getCount();
472                }
473
474                // if the current path has a higher number of occurrences than all so far, clear
475                // the paths collected so far and set the new number of occurrences as new maximum
476                if (context.getLast().getCount() > envisagedCount) {
477                    paths.clear();
478                    envisagedCount = context.getLast().getCount();
479                }
480            }
481           
482            // if the path matches the current maximal number of occurrences, add it to the list
483            // of collected paths with these number of occurrences
484            if (context.getLast().getCount() == envisagedCount) {
485                paths.add(new LinkedList<TrieNode<T>>(context));
486            }
487        }
488       
489        // perform the trie traversal
490        for (TrieNode<T> child : context.getLast().getChildren()) {
491            if (child.getCount() >= envisagedCount) {
492                context.add(child);
493                determineLongPathsWithMostOccurrences
494                    (minimalDepth, occurrenceCount, paths, context);
495                context.removeLast();
496            }
497        }
498    }
499   
500    /**
501     * <p>
502     * adds a new symbol to the collection of known symbols if this symbol is not already
503     * contained.
504     * </p>
505     *
506     * @param symbol the symbol to be added to the known symbols
507     */
508    private void addToKnownSymbols(T symbol) {
509        if (!knownSymbols.containsSymbol(symbol)) {
510            knownSymbols.addSymbol(symbol, null);
511        }
512    }
513
514    /**
515     * <p>
516     * Helper class for graph visualization of a trie.
517     * </p>
518     *
519     * @author Steffen Herbold
520     * @version 1.0
521     */
522    static public class Edge {}
523
524    /**
525     * <p>
526     * Helper class for graph visualization of a trie.
527     * </p>
528     *
529     * @author Steffen Herbold
530     * @version 1.0
531     */
532    static public class TrieVertex {
533
534        /**
535         * <p>
536         * Id of the vertex.
537         * </p>
538         */
539        private String id;
540
541        /**
542         * <p>
543         * Constructor. Creates a new TrieVertex.
544         * </p>
545         *
546         * @param id
547         *            id of the vertex
548         */
549        protected TrieVertex(String id) {
550            this.id = id;
551        }
552
553        /**
554         * <p>
555         * Returns the id of the vertex.
556         * </p>
557         *
558         * @see java.lang.Object#toString()
559         */
560        @Override
561        public String toString() {
562            return id;
563        }
564    }
565
566    /**
567     * <p>
568     * Returns a {@link Graph} representation of the trie.
569     * </p>
570     *
571     * @return {@link Graph} representation of the trie
572     */
573    protected Tree<TrieVertex, Edge> getGraph() {
574        DelegateTree<TrieVertex, Edge> graph = new DelegateTree<TrieVertex, Edge>();
575        rootNode.getGraph(null, graph);
576        return graph;
577    }
578
579    /*
580     * (non-Javadoc)
581     *
582     * @see de.ugoe.cs.autoquest.usageprofiles.IDotCompatible#getDotRepresentation()
583     */
584    public String getDotRepresentation() {
585        StringBuilder stringBuilder = new StringBuilder();
586        stringBuilder.append("digraph model {" + StringTools.ENDLINE);
587        rootNode.appendDotRepresentation(stringBuilder);
588        stringBuilder.append('}' + StringTools.ENDLINE);
589        return stringBuilder.toString();
590    }
591
592    /**
593     * <p>
594     * Returns the string representation of the root node.
595     * </p>
596     *
597     * @see TrieNode#toString()
598     * @see java.lang.Object#toString()
599     */
600    @Override
601    public String toString() {
602        return rootNode.toString();
603    }
604
605    /**
606     * <p>
607     * Returns the number of symbols contained in the trie.
608     * </p>
609     *
610     * @return number of symbols contained in the trie
611     */
612    public int getNumSymbols() {
613        return knownSymbols.size();
614    }
615
616    /**
617     * <p>
618     * Returns the number of trie nodes that are ancestors of a leaf. This is the equivalent to the
619     * number of states a first-order markov model would have.
620     * <p>
621     *
622     * @return number of trie nodes that are ancestors of leafs.
623     */
624    public int getNumLeafAncestors() {
625        List<TrieNode<T>> ancestors = new LinkedList<TrieNode<T>>();
626        rootNode.getLeafAncestors(ancestors);
627        return ancestors.size();
628    }
629
630    /**
631     * <p>
632     * Returns the number of trie nodes that are leafs.
633     * </p>
634     *
635     * @return number of leafs in the trie
636     */
637    public int getNumLeafs() {
638        return rootNode.getNumLeafs();
639    }
640
641    /**
642     * <p>
643     * Updates the list of known symbols by replacing it with all symbols that are found in the
644     * child nodes of the root node. This should be the same as all symbols that are contained in
645     * the trie.
646     * </p>
647     */
648    public void updateKnownSymbols() {
649        knownSymbols = strategy.createSymbolMap();
650        for (TrieNode<T> node : rootNode.getChildren()) {
651            addToKnownSymbols(node.getSymbol());
652        }
653    }
654
655    /**
656     * <p>
657     * Two tries are defined as equal, if their {@link #rootNode}s are equal.
658     * </p>
659     *
660     * @see java.lang.Object#equals(java.lang.Object)
661     */
662    @SuppressWarnings("rawtypes")
663    @Override
664    public boolean equals(Object other) {
665        if (other == this) {
666            return true;
667        }
668        if (other instanceof Trie) {
669            return rootNode.equals(((Trie) other).rootNode);
670        }
671        return false;
672    }
673
674    /*
675     * (non-Javadoc)
676     *
677     * @see java.lang.Object#hashCode()
678     */
679    @Override
680    public int hashCode() {
681        int multiplier = 17;
682        int hash = 42;
683        if (rootNode != null) {
684            hash = multiplier * hash + rootNode.hashCode();
685        }
686        return hash;
687    }
688
689}
Note: See TracBrowser for help on using the repository browser.