//   Copyright 2012 Georg-August-Universität Göttingen, Germany
//
//   Licensed under the Apache License, Version 2.0 (the "License");
//   you may not use this file except in compliance with the License.
//   You may obtain a copy of the License at
//
//       http://www.apache.org/licenses/LICENSE-2.0
//
//   Unless required by applicable law or agreed to in writing, software
//   distributed under the License is distributed on an "AS IS" BASIS,
//   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//   See the License for the specific language governing permissions and
//   limitations under the License.

package de.ugoe.cs.autoquest.usageprofiles;

import java.io.Serializable;
import java.util.Collection;
import java.util.LinkedHashSet;
import java.util.LinkedList;
import java.util.List;

import de.ugoe.cs.util.StringTools;

import edu.uci.ics.jung.graph.DelegateTree;
import edu.uci.ics.jung.graph.Graph;
import edu.uci.ics.jung.graph.Tree;

/**
 * <p>
 * This class implements a <i>trie</i>, i.e., a tree of sequences that represents the occurrence
 * of subsequences up to a predefined length. This length is the trie order.
 * </p>
 * 
 * @author Steffen Herbold, Patrick Harms
 * 
 * @param <T>
 *            Type of the symbols that are stored in the trie.
 * 
 * @see TrieNode
 */
public class Trie<T> implements IDotCompatible, Serializable {

    /**
     * <p>
     * Id for object serialization.
     * </p>
     */
    private static final long serialVersionUID = 1L;

    /**
     * <p>
     * Map holding all symbols known to the trie. Symbols are registered as keys (with a
     * {@code null} value) while training or when {@link #updateKnownSymbols()} is called. The map
     * itself is created or copied by the {@link #strategy}.
     * </p>
     */
    private SymbolMap<T, T> knownSymbols;

    /**
     * <p>
     * Reference to the root of the trie. The root is created in the constructors and never
     * replaced afterwards.
     * </p>
     */
    private final TrieNode<T> rootNode;

    /**
     * <p>
     * Strategy for handling symbols, i.e., for comparing and storing them. It is handed to the
     * root node and used to create the symbol maps of this trie.
     * </p>
     */
    private SymbolStrategy<T> strategy;

    /**
     * <p>
     * Default constructor. Delegates to {@link #Trie(SymbolStrategy)} with a freshly created
     * {@link DefaultSymbolStrategy}.
     * </p>
     */
    public Trie() {
        this(new DefaultSymbolStrategy<T>());
    }

    /**
     * <p>
     * Constructor. Creates an empty trie whose symbols are managed by the given
     * {@link SymbolStrategy}. The strategy is used for the root node as well as for creating the
     * map of known symbols.
     * </p>
     * 
     * @param strategy
     *            strategy to be used for managing symbols; must not be null
     * 
     * @throws IllegalArgumentException
     *            if the strategy is null
     */
    public Trie(SymbolStrategy<T> strategy) {
        if (null == strategy) {
            throw new IllegalArgumentException("strategy must not be null");
        }

        this.strategy = strategy;
        this.rootNode = new TrieNode<T>(this.strategy);
        this.knownSymbols = this.strategy.createSymbolMap();
    }

    /**
     * <p>
     * Copy constructor. The root node is copied through the copy constructor of {@link TrieNode},
     * the strategy instance is shared with the other trie, and the known symbols are copied using
     * that strategy.
     * </p>
     * 
     * @param other
     *            trie that is copied; must not be null
     * 
     * @throws IllegalArgumentException
     *            if the other trie is null
     */
    public Trie(Trie<T> other) {
        if (null == other) {
            throw new IllegalArgumentException("other trie must not be null");
        }

        this.rootNode = new TrieNode<T>(other.rootNode);
        this.strategy = other.strategy;
        this.knownSymbols = this.strategy.copySymbolMap(other.knownSymbols);
    }

    /**
     * <p>
     * Returns the symbols currently known to the trie. The result is a newly created set, so
     * modifying it does not affect the trie.
     * </p>
     * 
     * @return a new collection containing the known symbols
     */
    public Collection<T> getKnownSymbols() {
        Collection<T> snapshot = new LinkedHashSet<T>();
        snapshot.addAll(knownSymbols.getSymbols());
        return snapshot;
    }

    /**
     * <p>
     * Trains the trie with the given sequence. Every contiguous subsequence of length
     * {@code maxOrder} is added. In addition, the trailing subsequences of the sequence that are
     * shorter than {@code maxOrder} are added, so that the end of the sequence is represented as
     * well. All symbols of the sequence are registered as known symbols. If {@code maxOrder} is
     * smaller than 1, the method does nothing.
     * </p>
     * 
     * @param sequence
     *            sequence whose subsequences are added to the trie
     * @param maxOrder
     *            maximum length of the subsequences added to the trie
     */
    public void train(List<T> sequence, int maxOrder) {
        if (maxOrder < 1) {
            return;
        }

        // sliding window over the most recently seen symbols
        IncompleteMemory<T> window = new IncompleteMemory<T>(maxOrder);
        int seen = 0;
        for (T symbol : sequence) {
            window.add(symbol);
            addToKnownSymbols(symbol);
            seen++;
            if (seen >= maxOrder) {
                // the window is filled, i.e., a full-length subsequence is available
                add(window.getLast(maxOrder));
            }
        }

        // add the suffixes at the end of the sequence that are shorter than maxOrder
        int length = sequence.size();
        for (int start = Math.max(0, length - maxOrder + 1); start < length; start++) {
            add(sequence.subList(start, length));
        }
    }

    /**
     * <p>
     * Adds the given subsequence to the trie by handing a copy of it to the root child that
     * belongs to its first symbol (the child is created if required). Null or empty subsequences
     * are ignored. NOTE: This method does not register the symbols as known symbols. This is only
     * done by {@link #train(List, int)}.
     * </p>
     * 
     * @param subsequence
     *            subsequence whose counters are increased
     * @see TrieNode#add(List)
     */
    protected void add(List<T> subsequence) {
        if (subsequence == null || subsequence.isEmpty()) {
            return;
        }

        // work on a copy so that the caller's list is never touched
        List<T> copy = new LinkedList<T>(subsequence);
        getChildCreate(copy.get(0)).add(copy);
    }

    /**
     * <p>
     * Returns the child of the root node that is associated with the given symbol. The lookup is
     * delegated to {@link TrieNode#getChildCreate(Object)}, which creates the child if it does not
     * exist yet.
     * </p>
     * 
     * @param symbol
     *            symbol whose node is required
     * @return node associated with the symbol
     * @see TrieNode#getChildCreate(Object)
     */
    protected TrieNode<T> getChildCreate(T symbol) {
        TrieNode<T> child = this.rootNode.getChildCreate(symbol);
        return child;
    }

    /**
     * <p>
     * Returns the child of the root node that is associated with the given symbol. The lookup is
     * delegated to {@link TrieNode#getChild(Object)}; no node is created.
     * </p>
     * 
     * @param symbol
     *            symbol whose node is required
     * @return node associated with the symbol; null if no such node exists
     * @see TrieNode#getChild(Object)
     */
    protected TrieNode<T> getChild(T symbol) {
        TrieNode<T> child = this.rootNode.getChild(symbol);
        return child;
    }

    /**
     * <p>
     * Returns the number of occurrences of the given sequence, i.e., the count of the node found
     * for the sequence by {@link #find(List)}. If no such node exists, the result is 0.
     * </p>
     * 
     * @param sequence
     *            sequence whose number of occurrences is required
     * @return number of occurrences of the sequence; 0 if the sequence is not contained
     */
    public int getCount(List<T> sequence) {
        TrieNode<T> match = find(sequence);
        if (match == null) {
            return 0;
        }
        return match.getCount();
    }

    /**
     * <p>
     * Returns the number of occurrences of the given prefix followed by the given symbol.<br>
     * Convenience function to simplify usage of {@link #getCount(List)}. A null prefix is treated
     * like an empty prefix, which is consistent with the handling of null in {@link #find(List)}.
     * The provided list is not modified.
     * </p>
     * 
     * @param sequence
     *            prefix of the sequence; null is treated as the empty prefix
     * @param follower
     *            symbol that follows the prefix
     * @return number of occurrences of the prefix followed by the symbol
     * @see #getCount(List)
     */
    public int getCount(List<T> sequence, T follower) {
        List<T> tmpSequence = new LinkedList<T>();
        if (sequence != null) {
            tmpSequence.addAll(sequence);
        }
        tmpSequence.add(follower);
        return getCount(tmpSequence);
    }

    /**
     * <p>
     * Searches the trie for the given sequence. For a null or empty sequence the root node is
     * returned. Otherwise, the root child belonging to the first symbol is determined and asked
     * to find the remainder of the sequence. The provided list is not modified.
     * </p>
     * 
     * @param sequence
     *            sequence that is searched for
     * @return node associated with the sequence; null if no such node is found
     * @see TrieNode#find(List)
     */
    public TrieNode<T> find(List<T> sequence) {
        if (sequence == null || sequence.isEmpty()) {
            return rootNode;
        }

        // split a private copy into its first symbol and the remaining symbols
        LinkedList<T> remainder = new LinkedList<T>(sequence);
        T head = remainder.removeFirst();

        TrieNode<T> child = getChild(head);
        if (child == null) {
            return null;
        }
        return child.find(remainder);
    }

    /**
     * <p>
     * Returns the symbols that follow the given sequence in the trie, as provided by the node
     * found for the sequence. If the sequence is not contained in the trie, a new empty
     * collection is returned.
     * </p>
     * 
     * @param sequence
     *            sequence whose followers are returned
     * @return symbols following the given sequence
     * @see TrieNode#getFollowingSymbols()
     */
    public Collection<T> getFollowingSymbols(List<T> sequence) {
        TrieNode<T> match = find(sequence);
        if (match == null) {
            return new LinkedList<T>();
        }
        return match.getFollowingSymbols();
    }

    /**
     * <p>
     * Returns the longest suffix of the given context that is contained in the trie and that has
     * at least one following symbol. The suffix is determined by repeatedly dropping the first
     * symbol of a copy of the context until a matching node is found. If no suffix matches, or
     * the context is null or empty, the empty list is returned. The provided list is not
     * modified.
     * </p>
     * 
     * @param context
     *            context whose suffix is searched for
     * @return longest matching suffix of the context; possibly empty, never null
     */
    public List<T> getContextSuffix(List<T> context) {
        // defensive copy, null is handled as the empty context
        List<T> suffix = new LinkedList<T>();
        if (context != null) {
            suffix.addAll(context);
        }

        while (!suffix.isEmpty()) {
            TrieNode<T> match = find(suffix);
            if (match != null && !match.getFollowingSymbols().isEmpty()) {
                // the current suffix is known and has followers
                break;
            }
            // shorten the candidate by dropping its oldest symbol
            suffix.remove(0);
        }

        return suffix;
    }
    
    /**
     * <p>
     * Recursively processes the trie. Starting with an empty context, each child of the root node
     * is handed to the recursive processing, which calls the provided processor for the paths
     * through the trie. The iteration over the children of the root node stops as soon as the
     * processing of a child reports that processing shall not continue. The processor controls
     * this through the return values of its {@link TrieProcessor#process(List, int)} method.
     * </p>
     * 
     * @param processor the processor to process the tree
     */
    public void process(TrieProcessor<T> processor) {
        LinkedList<T> path = new LinkedList<T>();

        for (TrieNode<T> rootChild : rootNode.getChildren()) {
            boolean goOn = process(path, rootChild, processor);
            if (!goOn) {
                return;
            }
        }
    }

    /**
     * <p>
     * Processes a specific path by calling the provided processor with the context extended by
     * the symbol of the given node and with the count of the node. If the processor returns
     * {@link TrieProcessor.Result#CONTINUE}, the method calls itself recursively for the children
     * of the node. Any other result causes the children to be skipped. The context is restored to
     * its original content before the method returns.
     * </p>
     * <p>
     * A request to abort is propagated upwards: the method returns false if the processor
     * returned {@link TrieProcessor.Result#BREAK} for this node or if the processing of any
     * descendant requested the abort. This way, an abort stops the whole traversal regardless of
     * the depth at which it was requested.
     * </p>
     * 
     * @param context   the context of the currently processed trie node, i.e. the preceding
     *                  symbols
     * @param node      the processed trie node
     * @param processor the processor used for processing the trie
     * 
     * @return true, if processing shall continue, false else
     */
    private boolean process(LinkedList<T>    context,
                            TrieNode<T>      node,
                            TrieProcessor<T> processor)
    {
        context.add(node.getSymbol());
        
        TrieProcessor.Result result = processor.process(context, node.getCount());
        
        boolean continueProcessing = result != TrieProcessor.Result.BREAK;
        
        if (result == TrieProcessor.Result.CONTINUE) {
            for (TrieNode<T> child : node.getChildren()) {
                if (!process(context, child, processor)) {
                    // a descendant requested the abort, hand this request on to our caller
                    continueProcessing = false;
                    break;
                }
            }
        }
        
        // restore the context for the caller
        context.removeLast();
        
        return continueProcessing;
    }

    /**
     * <p>
     * Returns the symbol sequences that have at least the given minimal length and that occurred
     * exactly as often as defined by the given occurrence count. If the given occurrence count is
     * smaller than 1, the sequences that occur most often are returned instead. The resulting
     * collection is empty if there is no symbol sequence with the minimal length or the provided
     * number of occurrences.
     * </p>
     *
     * @param minimalLength   the minimal length of the returned sequences
     * @param occurrenceCount the number of occurrences of the returned sequences
     * 
     * @return as described
     */
    public Collection<List<T>> getSequencesWithOccurrenceCount(int minimalLength,
                                                               int occurrenceCount)
    {
        Collection<List<TrieNode<T>>> nodePaths = new LinkedList<List<TrieNode<T>>>();
        LinkedList<TrieNode<T>> traversalPath = new LinkedList<TrieNode<T>>();
        traversalPath.add(rootNode);
        
        // collect the matching node paths; the depth is minimalLength + 1 because the traversal
        // path includes the root node
        determineLongPathsWithMostOccurrences
            (minimalLength + 1, occurrenceCount, nodePaths, traversalPath);
        
        // convert the node paths to symbol sequences, skipping nodes without a symbol
        Collection<List<T>> sequences = new LinkedList<List<T>>();
        
        for (List<TrieNode<T>> nodePath : nodePaths) {
            List<T> symbols = new LinkedList<T>();
            
            for (TrieNode<T> pathNode : nodePath) {
                T symbol = pathNode.getSymbol();
                if (symbol != null) {
                    symbols.add(symbol);
                }
            }
            
            sequences.add(symbols);
        }
        
        return sequences;
    }

    /**
     * <p>
     * Traverses the trie below the last node of the given context and collects all paths that
     * have at least the minimal depth and whose last node has the required number of
     * occurrences. If the given occurrence count is smaller than 1, the paths whose last node has
     * the highest count seen so far are collected instead. In that case, the collected paths are
     * cleared whenever a path with a higher count is found.
     * </p>
     *
     * @param minimalDepth    the minimal number of nodes in a path (including the first node of
     *                        the context) required for the path to be considered
     * @param occurrenceCount the number of occurrences of the returned sequences
     * @param paths           the paths collected so far; all of them end in nodes with the same
     *                        count; updated by this method
     * @param context         the path through the trie that is analyzed by the recursive call;
     *                        must not be empty; restored to its original content on return
     */
    private void determineLongPathsWithMostOccurrences(int                           minimalDepth,
                                                       int                           occurrenceCount,
                                                       Collection<List<TrieNode<T>>> paths,
                                                       LinkedList<TrieNode<T>>       context)
    {
        TrieNode<T> current = context.getLast();
        int requiredCount = occurrenceCount;

        // paths are only candidates once they are deep enough
        if (context.size() >= minimalDepth) {
            int currentCount = current.getCount();
            
            if (requiredCount < 1) {
                // searching for the maximum: all collected paths share the same count, so the
                // first one tells the maximum found so far
                if (!paths.isEmpty()) {
                    List<TrieNode<T>> firstPath = paths.iterator().next();
                    requiredCount = firstPath.get(firstPath.size() - 1).getCount();
                }

                // a new maximum invalidates everything collected before
                if (currentCount > requiredCount) {
                    paths.clear();
                    requiredCount = currentCount;
                }
            }
            
            if (currentCount == requiredCount) {
                paths.add(new LinkedList<TrieNode<T>>(context));
            }
        }
        
        // descend only into children that can still reach the required count
        for (TrieNode<T> child : current.getChildren()) {
            if (child.getCount() < requiredCount) {
                continue;
            }
            
            context.add(child);
            determineLongPathsWithMostOccurrences(minimalDepth, occurrenceCount, paths, context);
            context.removeLast();
        }
    }
    
    /**
     * <p>
     * Registers the given symbol as known symbol, unless the map of known symbols already
     * contains it. The symbol is stored with a {@code null} value.
     * </p>
     *
     * @param symbol the symbol to be added to the known symbols
     */
    private void addToKnownSymbols(T symbol) {
        if (knownSymbols.containsSymbol(symbol)) {
            return;
        }
        knownSymbols.addSymbol(symbol, null);
    }

    /**
     * <p>
     * Edge type used for the graph visualization of a trie. The class carries no data; its
     * instances only serve as edge objects of the graph.
     * </p>
     * 
     * @author Steffen Herbold
     * @version 1.0
     */
    public static class Edge {
    }

    /**
     * <p>
     * Vertex type used for the graph visualization of a trie. A vertex is identified by a string
     * id, which is also its string representation.
     * </p>
     * 
     * @author Steffen Herbold
     * @version 1.0
     */
    public static class TrieVertex {

        /**
         * <p>
         * Id of the vertex; set once on construction.
         * </p>
         */
        private final String id;

        /**
         * <p>
         * Constructor. Creates a new TrieVertex with the given id.
         * </p>
         * 
         * @param id
         *            id of the vertex
         */
        protected TrieVertex(String id) {
            this.id = id;
        }

        /**
         * <p>
         * Returns the id of the vertex as provided on construction.
         * </p>
         * 
         * @see java.lang.Object#toString()
         */
        @Override
        public String toString() {
            return this.id;
        }
    }

    /**
     * <p>
     * Returns a {@link Graph} representation of the trie. A new {@link DelegateTree} is created
     * and filled by the root node, which is called with {@code null} as its first argument.
     * </p>
     * 
     * @return {@link Graph} representation of the trie
     */
    protected Tree<TrieVertex, Edge> getGraph() {
        DelegateTree<TrieVertex, Edge> tree = new DelegateTree<TrieVertex, Edge>();
        this.rootNode.getGraph(null, tree);
        return tree;
    }

    /*
     * (non-Javadoc)
     * 
     * Builds the result from the line "digraph model {", the dot representation appended by the
     * root node, and a closing "}" line. Each of the two fixed lines is terminated with
     * StringTools.ENDLINE.
     * 
     * @see de.ugoe.cs.autoquest.usageprofiles.IDotCompatible#getDotRepresentation()
     */
    public String getDotRepresentation() {
        StringBuilder dot = new StringBuilder();
        dot.append("digraph model {").append(StringTools.ENDLINE);
        rootNode.appendDotRepresentation(dot);
        dot.append('}').append(StringTools.ENDLINE);
        return dot.toString();
    }

    /**
     * <p>
     * Returns the string representation of the trie, which is the string representation of its
     * root node.
     * </p>
     * 
     * @see TrieNode#toString()
     * @see java.lang.Object#toString()
     */
    @Override
    public String toString() {
        String representation = this.rootNode.toString();
        return representation;
    }

    /**
     * <p>
     * Returns the number of known symbols of the trie, i.e., the size of the map of known
     * symbols.
     * </p>
     * 
     * @return number of symbols known to the trie
     */
    public int getNumSymbols() {
        int numSymbols = this.knownSymbols.size();
        return numSymbols;
    }

    /**
     * <p>
     * Returns the number of trie nodes that are ancestors of a leaf. This is the equivalent to the
     * number of states a first-order Markov model would have. The nodes are collected by the root
     * node into a list, whose size is returned.
     * </p>
     * 
     * @return number of trie nodes that are ancestors of leafs
     */
    public int getNumLeafAncestors() {
        List<TrieNode<T>> leafAncestors = new LinkedList<TrieNode<T>>();
        this.rootNode.getLeafAncestors(leafAncestors);
        return leafAncestors.size();
    }

    /**
     * <p>
     * Returns the number of trie nodes that are leafs, as determined by the root node.
     * </p>
     * 
     * @return number of leafs in the trie
     */
    public int getNumLeafs() {
        int numLeafs = this.rootNode.getNumLeafs();
        return numLeafs;
    }

    /**
     * <p>
     * Rebuilds the known symbols. A new, empty symbol map is created by the strategy and filled
     * with the symbols of the direct children of the root node. This should be the same as all
     * symbols that are contained in the trie.
     * </p>
     */
    public void updateKnownSymbols() {
        this.knownSymbols = this.strategy.createSymbolMap();

        for (TrieNode<T> rootChild : this.rootNode.getChildren()) {
            T symbol = rootChild.getSymbol();
            addToKnownSymbols(symbol);
        }
    }

    /**
     * <p>
     * Two tries are defined as equal, if their {@link #rootNode}s are equal. The known symbols
     * and the strategy are not compared.
     * </p>
     * 
     * @see java.lang.Object#equals(java.lang.Object)
     */
    @Override
    public boolean equals(Object other) {
        if (this == other) {
            return true;
        }
        if (!(other instanceof Trie<?>)) {
            return false;
        }

        Trie<?> otherTrie = (Trie<?>) other;
        return this.rootNode.equals(otherTrie.rootNode);
    }

    /*
     * (non-Javadoc)
     * 
     * The hash code only depends on the root node, which is consistent with equals(Object).
     * 
     * @see java.lang.Object#hashCode()
     */
    @Override
    public int hashCode() {
        final int multiplier = 17;
        final int seed = 42;

        if (rootNode == null) {
            return seed;
        }
        return multiplier * seed + rootNode.hashCode();
    }

}
