package de.ugoe.cs.eventbench.web;

import java.io.FileNotFoundException;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;

import de.ugoe.cs.eventbench.web.data.WebEvent;
import de.ugoe.cs.util.FileTools;
import de.ugoe.cs.util.console.Console;

/**
 * <p>
 * Provides functionality to parse log files with web requests.
 * </p>
 * 
 * @author Steffen Herbold
 * @version 1.0
 */
public class WeblogParser {

	/**
	 * <p>
	 * Timeout between two sessions in milliseconds.
	 * </p>
	 */
	private long timeout;

	/**
	 * <p>
	 * Minimal length of a session. All shorter sessions will be pruned.<br>
	 * Default: 2
	 * </p>
	 */
	private int minLength = 2;

	/**
	 * <p>
	 * Maximal length of a session. All longer sessions will be pruned.<br>
	 * Default: 100
	 * </p>
	 */
	private int maxLength = 100;

	/**
	 * <p>
	 * URL of the server that generated the log that is currently parsed; null
	 * if the URL is not available.<br>
	 * Default: null
	 * </p>
	 */
	private String url = null;

	/**
	 * <p>
	 * Collection of generated sequences.
	 * </p>
	 */
	private List<List<WebEvent>> sequences;

	/**
	 * <p>
	 * List that stores the user (identified through the cookie id) of each
	 * sequence. The entry at index i belongs to the sequence at index i of
	 * {@link #sequences}.
	 * </p>
	 */
	private List<String> users;

	/**
	 * <p>
	 * List that stores the frequent users (identified through their cookie
	 * id). The entry at index i belongs to the entry at index i of
	 * {@link #sequencesFrequentUsers}.
	 * </p>
	 */
	private List<String> frequentUsers;

	/**
	 * <p>
	 * Sequences for all frequent users.
	 * </p>
	 */
	private List<Collection<List<WebEvent>>> sequencesFrequentUsers;

	/**
	 * <p>
	 * Threshold that defines how many sessions of a user are required to deem
	 * the user frequent. Note that only sessions whose length is in the range
	 * of {@link #minLength} and {@link #maxLength} are counted.
	 * </p>
	 */
	private int frequentUsersThreshold = -1;

	/**
	 * <p>
	 * Name and path of the robot filter.
	 * </p>
	 */
	private static final String ROBOTFILTERFILE = "misc/robotfilter.txt";

	/**
	 * <p>
	 * Compiled regular expression that matches all robots contained in
	 * {@link #ROBOTFILTERFILE}. It is compiled once per call of
	 * {@link #parseFile(String)} instead of once per log line.
	 * </p>
	 */
	private Pattern robotPattern = null;

	/**
	 * <p>
	 * Constructor. Creates a new WeblogParser with a default timeout of
	 * 3,600,000 milliseconds (1 hour).
	 * </p>
	 */
	public WeblogParser() {
		this(3600000);
	}

	/**
	 * <p>
	 * Constructor. Creates a new WeblogParser.
	 * </p>
	 * 
	 * @param timeout
	 *            session timeout in milliseconds
	 */
	public WeblogParser(long timeout) {
		this.timeout = timeout;
	}

	/**
	 * <p>
	 * Returns the generated event sequences.
	 * </p>
	 * 
	 * @return generated event sequences; null if no file has been parsed yet
	 */
	public Collection<List<WebEvent>> getSequences() {
		return sequences;
	}

	/**
	 * <p>
	 * Sets the session timeout.
	 * </p>
	 * 
	 * @param timeout
	 *            new session timeout in milliseconds
	 */
	public void setTimeout(long timeout) {
		this.timeout = timeout;
	}

	/**
	 * <p>
	 * Sets the minimal length of a session. All sessions that contain less
	 * events will be pruned.
	 * </p>
	 * 
	 * @param minLength
	 *            new minimal length
	 */
	public void setMinLength(int minLength) {
		this.minLength = minLength;
	}

	/**
	 * <p>
	 * Sets the maximal length of a session. All sessions that contain more
	 * events will be pruned.
	 * </p>
	 * 
	 * @param maxLength
	 *            new maximal length
	 */
	public void setMaxLength(int maxLength) {
		this.maxLength = maxLength;
	}

	/**
	 * <p>
	 * Sets the URL of the server from which this log was generated. Often
	 * required for replay generation.
	 * </p>
	 * 
	 * @param url
	 *            URL of the server
	 */
	public void setUrl(String url) {
		this.url = url;
	}

	/**
	 * <p>
	 * Sets the threshold for frequent users.
	 * </p>
	 * 
	 * @param threshold
	 *            threshold value; if the value is &lt;1, the sessions of the
	 *            frequent users will not be determined
	 */
	public void setFrequentUserThreshold(int threshold) {
		this.frequentUsersThreshold = threshold;
	}

	/**
	 * <p>
	 * Returns the IDs of all frequent users.
	 * </p>
	 * 
	 * @return IDs of the frequent users; null if the frequent users have not
	 *         been determined
	 */
	public List<String> getFrequentUsers() {
		return frequentUsers;
	}

	/**
	 * <p>
	 * Returns the sequences of all frequent users.
	 * </p>
	 * 
	 * @return list of the sequences of all frequent users; null if the
	 *         frequent users have not been determined
	 */
	public List<Collection<List<WebEvent>>> getFrequentUserSequences() {
		return sequencesFrequentUsers;
	}

	/**
	 * <p>
	 * Parses a web log file. Every line is expected to consist of fields that
	 * are enclosed in double quotes and separated by a single blank. The
	 * fields are read in the following order: client identifier (the part
	 * after the last '.' is used as cookie), date in the format
	 * <code>yyyy-MM-dd HH:mm:ss</code>, requested URI, referer (unused),
	 * user agent (optional), and a blank-separated list of POST variables
	 * (optional).
	 * </p>
	 * <p>
	 * Requests of robots are dropped. All other requests are grouped into
	 * sessions per cookie; a new session is started if the time between two
	 * consecutive requests of the same cookie exceeds the timeout. Lines that
	 * are empty, have less than three fields, or contain an invalid URI are
	 * ignored and reported through the console trace.
	 * </p>
	 * 
	 * @param filename
	 *            name and path of the log file
	 * @throws IOException
	 *             thrown if there is a problem with reading the log file
	 * @throws FileNotFoundException
	 *             thrown if the log file is not found
	 * @throws ParseException
	 *             thrown the date format is invalid
	 */
	public void parseFile(String filename) throws IOException,
			FileNotFoundException, ParseException {
		String[] lines = FileTools.getLinesFromFile(filename);

		// maps each cookie to the indices (into sequences) of its sessions
		Map<String, List<Integer>> cookieSessionMap = new HashMap<String, List<Integer>>();
		int lastId = -1;

		SimpleDateFormat dateFormat = new SimpleDateFormat(
				"yyyy-MM-dd HH:mm:ss");
		loadRobotRegex();

		sequences = new ArrayList<List<WebEvent>>();
		users = new ArrayList<String>();

		int lineCounter = 0;
		for (String line : lines) {
			lineCounter++;

			String[] values = splitLogLine(line);
			if (values == null) {
				// e.g. a blank line at the end of the file; must not abort
				// the parsing of the remaining log
				Console.traceln("Ignored line " + lineCounter
						+ ": line is empty or has less than 3 fields");
				continue;
			}

			// use cookie as session identifier
			int cookieStart = values[0].lastIndexOf('.');
			String cookie = values[0].substring(cookieStart + 1);
			String dateString = values[1];
			long timestamp = dateFormat.parse(dateString).getTime();
			String uriString = values[2];
			// String ref = values[3]; // referer is not yet used!
			String agent;
			if (values.length > 4) {
				agent = values[4];
			} else {
				agent = "noagent";
			}

			List<String> postedVars = new ArrayList<String>();
			if (values.length == 6) { // post vars found
				for (String postVar : values[5].trim().split(" ")) {
					// TODO manual filtering of bad variables, should be
					// automated
					if (!postVar.contains("and")) {
						postedVars.add(postVar);
					}
				}
			}

			if (isRobot(agent)) {
				continue;
			}

			try {
				URI uri = new URI(uriString);
				String path = uri.getPath();
				List<String> getVars = extractGetVarsFromUri(uri);

				WebEvent event = new WebEvent(url, path, timestamp,
						postedVars, getVars);

				// find session and add event
				List<Integer> sessionIds = cookieSessionMap.get(cookie);
				if (sessionIds == null) {
					sessionIds = new ArrayList<Integer>();
					// start new session
					sessionIds.add(++lastId);
					cookieSessionMap.put(cookie, sessionIds);
					sequences.add(new LinkedList<WebEvent>());
					users.add(cookie);
				}
				Integer lastSessionIndex = sessionIds
						.get(sessionIds.size() - 1);
				List<WebEvent> lastSession = sequences.get(lastSessionIndex);
				// an empty session has just been created above; using the
				// current timestamp guarantees that the event is added to it
				long lastEventTime = timestamp;
				if (!lastSession.isEmpty()) {
					lastEventTime = lastSession.get(lastSession.size() - 1)
							.getTimestamp();
				}
				if (timestamp - lastEventTime > timeout) {
					// timeout exceeded: the event starts a new session
					sessionIds.add(++lastId);
					List<WebEvent> newSession = new LinkedList<WebEvent>();
					newSession.add(event);
					sequences.add(newSession);
					users.add(cookie);
				} else {
					lastSession.add(event);
				}
			} catch (URISyntaxException e) {
				Console.traceln("Ignored line " + lineCounter + ": "
						+ e.getMessage());
			}
		}
		Console.traceln("" + sequences.size() + " user sequences found");
		pruneSequences();
		Console.traceln("" + sequences.size()
				+ " remaining after pruning of sequences shorter than "
				+ minLength);
		Set<String> uniqueUsers = new HashSet<String>(users);
		Console.traceln("" + uniqueUsers.size() + " unique users");
		if (frequentUsersThreshold > 0) {
			generateFrequentUserSequences(uniqueUsers);
		}
	}

	/**
	 * <p>
	 * Splits a log line into its fields. The first and the last character of
	 * the line (the outermost quotes) are removed and the remainder is split
	 * at the separator <code>" "</code>.
	 * </p>
	 * 
	 * @param line
	 *            line of the log file
	 * @return the fields of the line; null if the line is too short to be
	 *         stripped or contains less than the three fields required to
	 *         create an event
	 */
	private String[] splitLogLine(String line) {
		if (line.length() < 2) {
			return null;
		}
		String[] values = line.substring(1, line.length() - 1).split(
				"\" \"");
		if (values.length < 3) {
			return null;
		}
		return values;
	}

	/**
	 * <p>
	 * Generates the frequent user sequences, according to the threshold
	 * {@link #frequentUsersThreshold}. A user is frequent if the number of
	 * its (already pruned) sequences is greater than or equal to the
	 * threshold.
	 * </p>
	 * 
	 * @param uniqueUsers
	 *            set with all user IDs
	 */
	private void generateFrequentUserSequences(Set<String> uniqueUsers) {
		frequentUsers = new ArrayList<String>();
		sequencesFrequentUsers = new ArrayList<Collection<List<WebEvent>>>();

		// group the sequences by user in a single pass instead of scanning
		// the complete user list once per unique user
		Map<String, Collection<List<WebEvent>>> sequencesByUser = new HashMap<String, Collection<List<WebEvent>>>();
		for (int i = 0; i < sequences.size(); i++) {
			String user = users.get(i);
			Collection<List<WebEvent>> sequencesUser = sequencesByUser
					.get(user);
			if (sequencesUser == null) {
				sequencesUser = new ArrayList<List<WebEvent>>();
				sequencesByUser.put(user, sequencesUser);
			}
			sequencesUser.add(sequences.get(i));
		}

		for (String user : uniqueUsers) {
			Collection<List<WebEvent>> sequencesUser = sequencesByUser
					.get(user);
			if (sequencesUser != null
					&& sequencesUser.size() >= frequentUsersThreshold) {
				frequentUsers.add(user);
				sequencesFrequentUsers.add(sequencesUser);
			}
		}
		Console.traceln("" + frequentUsers.size() + " users with more than "
				+ frequentUsersThreshold + " sequences");
	}

	/**
	 * <p>
	 * Prunes sequences shorter than {@link #minLength} and longer than
	 * {@link #maxLength}. The corresponding entries of {@link #users} are
	 * removed as well, such that both lists remain aligned.
	 * </p>
	 */
	private void pruneSequences() {
		int i = 0;
		while (i < sequences.size()) {
			int length = sequences.get(i).size();
			if (length < minLength || length > maxLength) {
				sequences.remove(i);
				// i is an int, i.e., this removes by index, not by value
				users.remove(i);
			} else {
				i++;
			}
		}
	}

	/**
	 * <p>
	 * Reads {@link #ROBOTFILTERFILE} and creates a regular expression that
	 * matches all the robots defined in the file. Each non-blank line of the
	 * file is used as a regular expression fragment that may occur anywhere
	 * in the agent string. The compiled expression is stored in the field
	 * {@link #robotPattern}.
	 * </p>
	 * 
	 * @throws IOException
	 *             thrown if there is a problem reading the robot filter
	 * @throws FileNotFoundException
	 *             thrown if the robot filter is not found
	 */
	private void loadRobotRegex() throws IOException, FileNotFoundException {
		String[] lines = FileTools.getLinesFromFile(ROBOTFILTERFILE);
		StringBuilder regex = new StringBuilder();
		for (String line : lines) {
			if (line.trim().length() == 0) {
				// a blank line would result in "(.*.*)", which matches every
				// agent and thereby filters all requests
				continue;
			}
			if (regex.length() > 0) {
				regex.append('|');
			}
			regex.append("(.*").append(line).append(".*)");
		}
		robotPattern = Pattern.compile(regex.toString());
	}

	/**
	 * <p>
	 * Checks whether an agent is a robot, i.e., whether the complete agent
	 * string matches {@link #robotPattern}.
	 * </p>
	 * 
	 * @param agent
	 *            agent that is checked
	 * @return true, if the agent is a robot; false otherwise
	 */
	private boolean isRobot(String agent) {
		return robotPattern.matcher(agent).matches();
	}

	/**
	 * <p>
	 * Parses the URI and extracts the names of the GET variables that have
	 * been passed. The values of the variables are discarded.
	 * </p>
	 * 
	 * @param uri
	 *            URI that is parsed
	 * @return a list with the names of all GET variables
	 */
	private List<String> extractGetVarsFromUri(URI uri) {
		List<String> getVars = new ArrayList<String>();
		String query = uri.getQuery();
		if (query != null) {
			for (String paramPair : query.split("&")) {
				// indexOf instead of split("="): splitting a pair like "="
				// yields an empty array and would cause an
				// ArrayIndexOutOfBoundsException
				int separator = paramPair.indexOf('=');
				String name;
				if (separator < 0) {
					name = paramPair;
				} else {
					name = paramPair.substring(0, separator);
				}
				// TODO manual filtering of bad variables, should be automated
				if (!name.contains("and")) {
					getVars.add(name);
				}
			}
		}
		return getVars;
	}
}
