package de.ugoe.cs.eventbench.web;

import java.io.FileNotFoundException;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.regex.Pattern;

import de.ugoe.cs.eventbench.web.data.WebEvent;
import de.ugoe.cs.util.FileTools;
import de.ugoe.cs.util.console.Console;

/**
 * <p>
 * Provides functionality to parse log files with web requests. The requests
 * are grouped into user sessions based on a cookie and a session timeout;
 * requests of known robots are discarded.
 * </p>
 * <p>
 * Each log line is expected to be enclosed in double quotes and to consist of
 * fields separated by <code>" "</code>: the client identifier (the part after
 * its last '.' is used as cookie), the date in the format
 * <code>yyyy-MM-dd HH:mm:ss</code>, the requested URI, the referer (currently
 * unused), optionally the user agent, and optionally the space-separated names
 * of the posted variables.
 * </p>
 * 
 * @author Steffen Herbold
 * @version 1.0
 */
public class WeblogParser {

	/**
	 * <p>
	 * Timeout between two sessions in milliseconds.
	 * </p>
	 */
	private long timeout;

	/**
	 * <p>
	 * Minimal length of a session. All shorter sessions will be pruned.
	 * Default: 2
	 * </p>
	 */
	private int minLength = 2;

	/**
	 * <p>
	 * Collection of generated sequences. Remains {@code null} until
	 * {@link #parseFile(String)} has been called.
	 * </p>
	 */
	private List<List<WebEvent>> sequences;

	/**
	 * <p>
	 * Name and path of the robot filter.
	 * </p>
	 */
	private static final String ROBOTFILTERFILE = "misc/robotfilter.txt";

	/**
	 * <p>
	 * Compiled regular expression that matches all robots contained in
	 * {@link #ROBOTFILTERFILE}. It is {@code null} if the filter has not been
	 * loaded yet or does not define any robot.
	 * </p>
	 */
	private Pattern robotPattern = null;

	/**
	 * <p>
	 * Constructor. Creates a new WeblogParser with a default timeout of
	 * 3,600,000 milliseconds (1 hour).
	 * </p>
	 */
	public WeblogParser() {
		this(3600000);
	}

	/**
	 * <p>
	 * Constructor. Creates a new WeblogParser.
	 * </p>
	 * 
	 * @param timeout
	 *            session timeout in milliseconds
	 */
	public WeblogParser(long timeout) {
		this.timeout = timeout;
	}

	/**
	 * <p>
	 * Returns the generated event sequences.
	 * </p>
	 * 
	 * @return generated event sequences; {@code null} if no file has been
	 *         parsed yet
	 */
	public List<List<WebEvent>> getSequences() {
		return sequences;
	}

	/**
	 * <p>
	 * Sets the session timeout.
	 * </p>
	 * 
	 * @param timeout
	 *            new session timeout in milliseconds
	 */
	public void setTimeout(long timeout) {
		this.timeout = timeout;
	}

	/**
	 * <p>
	 * Sets the minimal length of a session. All sessions that contain fewer
	 * events will be pruned.
	 * </p>
	 * 
	 * @param minLength
	 *            new minimal length
	 */
	public void setMinLength(int minLength) {
		this.minLength = minLength;
	}

	/**
	 * <p>
	 * Parses a web log file. The result replaces any previously generated
	 * sequences and can be retrieved with {@link #getSequences()}. Blank lines
	 * are skipped, requests of robots are ignored, and sessions shorter than
	 * the minimal length are pruned.
	 * </p>
	 * 
	 * @param filename
	 *            name and path of the log file
	 * @throws IOException
	 *             thrown if there is a problem with reading the log file
	 * @throws FileNotFoundException
	 *             thrown if the log file is not found
	 * @throws ParseException
	 *             thrown if the date format is invalid or a log line does not
	 *             contain the mandatory fields
	 * @throws URISyntaxException
	 *             thrown if the URI is invalid
	 */
	public void parseFile(String filename) throws IOException,
			FileNotFoundException, ParseException, URISyntaxException {
		String[] lines = FileTools.getLinesFromFile(filename);

		Map<String, List<Integer>> cookieSessionMap = new HashMap<String, List<Integer>>();

		SimpleDateFormat dateFormat = new SimpleDateFormat(
				"yyyy-MM-dd HH:mm:ss");
		loadRobotRegex();

		sequences = new ArrayList<List<WebEvent>>();

		for (String line : lines) {
			if (line.trim().length() == 0) {
				// blank lines carry no request; skip them instead of failing
				continue;
			}
			if (line.length() < 2) {
				throw new ParseException("Malformed log line: " + line, 0);
			}
			// strip the enclosing quotes, then split at the quote-space-quote
			// field separator
			String[] values = line.substring(1, line.length() - 1).split(
					"\" \"");
			if (values.length < 3) {
				// client, date and URI are mandatory
				throw new ParseException("Malformed log line: " + line, 0);
			}

			// use cookie as session identifier
			int cookieStart = values[0].lastIndexOf('.');
			String cookie = values[0].substring(cookieStart + 1);
			long timestamp = dateFormat.parse(values[1]).getTime();
			String uriString = values[2];
			// values[3] is the referer, which is not yet used
			String agent = (values.length > 4) ? values[4] : "noagent";

			List<String> postedVars = new ArrayList<String>();
			if (values.length == 6) { // post vars found
				for (String postVar : values[5].trim().split(" ")) {
					postedVars.add(postVar);
				}
			}

			if (!isRobot(agent)) {
				URI uri = new URI(uriString);
				WebEvent event = new WebEvent(uri.getPath(), timestamp,
						postedVars, extractGetVarsFromUri(uri));
				addToSession(cookie, timestamp, event, cookieSessionMap);
			}
		}
		pruneShortSequences();
	}

	/**
	 * <p>
	 * Appends an event to the latest session of a cookie. A new session is
	 * started if the cookie has no session yet or if more than
	 * {@link #timeout} milliseconds have passed since the last event of its
	 * latest session.
	 * </p>
	 * 
	 * @param cookie
	 *            session identifier of the event
	 * @param timestamp
	 *            time of the event in milliseconds
	 * @param event
	 *            event that is added
	 * @param cookieSessionMap
	 *            maps each cookie to the indices of its sessions in
	 *            {@link #sequences}; updated by this method
	 */
	private void addToSession(String cookie, long timestamp, WebEvent event,
			Map<String, List<Integer>> cookieSessionMap) {
		List<Integer> sessionIds = cookieSessionMap.get(cookie);
		if (sessionIds == null) {
			sessionIds = new ArrayList<Integer>();
			cookieSessionMap.put(cookie, sessionIds);
		}

		List<WebEvent> session = null;
		if (!sessionIds.isEmpty()) {
			List<WebEvent> lastSession = sequences.get(sessionIds
					.get(sessionIds.size() - 1));
			// every session receives an event on creation, so it is never
			// empty here
			long lastEventTime = lastSession.get(lastSession.size() - 1)
					.getTimestamp();
			if (timestamp - lastEventTime <= timeout) {
				session = lastSession;
			}
		}
		if (session == null) {
			// the index of the new session is the current number of sessions
			sessionIds.add(sequences.size());
			session = new LinkedList<WebEvent>();
			sequences.add(session);
		}
		session.add(event);
	}

	/**
	 * <p>
	 * Prunes sequences shorter than {@link #minLength}.
	 * </p>
	 */
	private void pruneShortSequences() {
		Console.traceln("" + sequences.size() + " user sequences found");
		// collect the sufficiently long sequences in a single pass; removing
		// from the middle of an ArrayList one by one would be quadratic
		List<List<WebEvent>> remaining = new ArrayList<List<WebEvent>>(
				sequences.size());
		for (List<WebEvent> sequence : sequences) {
			if (sequence.size() >= minLength) {
				remaining.add(sequence);
			}
		}
		sequences = remaining;
		Console.traceln("" + sequences.size()
				+ " remaining after pruning of sequences shorter than "
				+ minLength);
	}

	/**
	 * <p>
	 * Reads {@link #ROBOTFILTERFILE} and creates a regular expression that
	 * matches all the robots defined in the file. The compiled expression is
	 * stored in the field {@link #robotPattern}. Blank lines of the filter
	 * file are ignored, because they would otherwise yield an expression that
	 * matches every agent.
	 * </p>
	 * 
	 * @throws IOException
	 *             thrown if there is a problem reading the robot filter
	 * @throws FileNotFoundException
	 *             thrown if the robot filter is not found
	 */
	private void loadRobotRegex() throws IOException, FileNotFoundException {
		String[] lines = FileTools.getLinesFromFile(ROBOTFILTERFILE);
		StringBuilder regex = new StringBuilder();
		for (String line : lines) {
			if (line.trim().length() == 0) {
				continue;
			}
			if (regex.length() > 0) {
				regex.append('|');
			}
			regex.append("(.*").append(line).append(".*)");
		}
		// compile once here instead of once per log line
		robotPattern = (regex.length() > 0) ? Pattern.compile(regex
				.toString()) : null;
	}

	/**
	 * <p>
	 * Checks whether an agent is a robot.
	 * </p>
	 * 
	 * @param agent
	 *            agent that is checked
	 * @return true, if the agent is a robot; false otherwise, in particular
	 *         if no robots are defined
	 */
	private boolean isRobot(String agent) {
		return robotPattern != null && robotPattern.matcher(agent).matches();
	}

	/**
	 * <p>
	 * Parses the URI and extracts the names of the GET variables that have
	 * been passed. The values of the variables are discarded.
	 * </p>
	 * 
	 * @param uri
	 *            URI that is parsed
	 * @return a list with the names of all GET variables; empty if the URI has
	 *         no query
	 */
	private List<String> extractGetVarsFromUri(URI uri) {
		List<String> getVars = new ArrayList<String>();
		String query = uri.getQuery();
		if (query != null) {
			for (String paramPair : query.split("&")) {
				// cut at the first '=' instead of using split("="), which
				// returns an empty array for a pair like "="
				int separator = paramPair.indexOf('=');
				if (separator < 0) {
					getVars.add(paramPair);
				} else {
					getVars.add(paramPair.substring(0, separator));
				}
			}
		}
		return getVars;
	}
}
