// Copyright 2015 Georg-August-Universität Göttingen, Germany // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package de.ugoe.cs.autoquest.plugin.genericevents.commands; import java.io.BufferedReader; import java.io.File; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.IOException; import java.io.InputStreamReader; import java.text.DateFormat; import java.text.ParseException; import java.text.SimpleDateFormat; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; import de.ugoe.cs.autoquest.eventcore.Event; import de.ugoe.cs.autoquest.eventcore.StringEventType; import de.ugoe.cs.autoquest.plugin.genericevents.eventCore.GenericEventTarget; import de.ugoe.cs.autoquest.plugin.genericevents.eventCore.GenericEventTargetSpec; import de.ugoe.cs.util.console.Console; /** *

* Parses a tab separated, GB2312 encoded query log file into one list of query events per user. *

*
* @author Patrick Harms
*/
public class SogouQDataFileParser {

    /** Pattern of the optional leading time column of a log line. */
    private static final String TIME_PATTERN = "HH:mm:ss";

    /** Character set used to decode the log file. */
    private static final String FILE_ENCODING = "GB2312";

    /**
     * Reads the given log file line by line and converts every well-formed line into an
     * {@link Event}. The events are grouped by the user id found in the line.
     * <p>
     * Columns are separated by tabs and are interpreted as follows:
     * <ul>
     *   <li>if {@code hasTimestamp} is set, the first column is a time in the format
     *       {@code HH:mm:ss};</li>
     *   <li>the next column is the user id;</li>
     *   <li>all columns between the user id and the second to last column are concatenated
     *       (without separator) and form the query;</li>
     *   <li>the second to last column holds two values separated by a blank, which are stored
     *       as the parameters {@code selectedResultPage} and {@code selectedResultIndex};</li>
     *   <li>the last column is the selected result.</li>
     * </ul>
     * Lines that have fewer columns than user id, result position and selected result, or whose
     * result position column does not contain two blank separated values, are skipped. The
     * number of skipped lines is reported on the console.
     * <p>
     * Event timestamps are strictly increasing while timestamps are read: if a parsed time is
     * not larger than the previously used one, or cannot be parsed, the previous value plus one
     * is used instead. Without timestamps, all events keep the timestamp 0.
     *
     * @param file              the file to be read
     * @param hasTimestamp      whether the lines start with a time column
     * @param ignoreQuery       if true, all events get the type "query"; otherwise the type
     *                          also contains the query string
     * @param compareDomainOnly if true, the event target is created only from the part of the
     *                          selected result between a leading scheme and the first slash;
     *                          the {@code selectedResult} parameter always keeps the full value
     *
     * @return the events of each user id in the order in which they occur in the file, or
     *         {@code null} if the file could not be read
     */
    Map<String, List<Event>> parseFile(File    file,
                                       boolean hasTimestamp,
                                       boolean ignoreQuery,
                                       boolean compareDomainOnly)
    {
        Console.println("reading file " + file);

        Map<String, List<Event>> userSessions = new HashMap<>();

        // SimpleDateFormat is not thread-safe, so every call works on its own instance
        DateFormat timeFormat = new SimpleDateFormat(TIME_PATTERN);

        int timestampIndex = hasTimestamp ? 0 : -1;
        int userIdIndex = hasTimestamp ? 1 : 0;

        // user id, result position and selected result must be three distinct columns
        int minColumns = userIdIndex + 3;

        long lastTimeStamp = 0;
        long skippedLines = 0;

        // try-with-resources closes the reader also if reading or parsing fails
        try (BufferedReader reader = new BufferedReader
                 (new InputStreamReader(new FileInputStream(file), FILE_ENCODING)))
        {
            String line;

            while ((line = reader.readLine()) != null) {
                String[] elements = line.split("\t");

                if (elements.length < minColumns) {
                    skippedLines++;
                    continue;
                }

                String[] resultPosition = elements[elements.length - 2].split(" ");

                if (resultPosition.length < 2) {
                    skippedLines++;
                    continue;
                }

                // values are interned because they repeat very often in large log files
                String userId = elements[userIdIndex].intern();

                if (hasTimestamp) {
                    try {
                        long timestamp = timeFormat.parse(elements[timestampIndex]).getTime();

                        if (timestamp > lastTimeStamp) {
                            lastTimeStamp = timestamp;
                        }
                        else {
                            lastTimeStamp++;
                        }
                    }
                    catch (ParseException e) {
                        // an unparsable time is not fatal, just count on from the last one
                        lastTimeStamp++;
                    }
                }

                StringBuilder query = new StringBuilder();

                for (int i = userIdIndex + 1; i < elements.length - 2; i++) {
                    query.append(elements[i]);
                }

                String queryStr = query.toString().intern();
                String selectedResultPage = resultPosition[0].intern();
                String selectedResultIndex = resultPosition[1].intern();
                String fullSelectedResult = elements[elements.length - 1];

                String selectedResult = fullSelectedResult;

                if (compareDomainOnly) {
                    selectedResult = extractDomain(selectedResult);
                }

                selectedResult = selectedResult.intern();

                String eventTypeName = ignoreQuery ? "query" : "query for " + queryStr;

                GenericEventTargetSpec spec = new GenericEventTargetSpec(selectedResult, null);
                Event event = new Event(new StringEventType(eventTypeName),
                                        new GenericEventTarget(spec, null));

                event.setTimestamp(lastTimeStamp);
                event.setParameter("userId", userId);
                event.setParameter("query", queryStr);
                event.setParameter("selectedResultPage", selectedResultPage);
                event.setParameter("selectedResultIndex", selectedResultIndex);
                event.setParameter("selectedResult", fullSelectedResult);

                List<Event> session = userSessions.get(userId);

                if (session == null) {
                    session = new ArrayList<>();
                    userSessions.put(userId, session);
                }

                session.add(event);
            }

            if (skippedLines > 0) {
                Console.printerrln("skipped " + skippedLines + " malformed lines in " + file);
            }
        }
        catch (FileNotFoundException e) {
            Console.printerrln("could not read " + file);
            Console.logException(e);
            return null;
        }
        catch (IOException e) {
            Console.printerrln("problem while reading a line from " + file);
            Console.logException(e);
            return null;
        }

        return userSessions;
    }

    /**
     * Reduces a selected result to the part between a leading scheme and the first slash.
     *
     * @param selectedResult the selected result as read from the file
     *
     * @return the shortened result, or the unchanged value if neither a scheme nor a slash is
     *         found
     */
    private static String extractDomain(String selectedResult) {
        String result = selectedResult;
        int index = result.indexOf("://");

        // the upper bound ensures that we do not match something in the url query
        if ((index >= 0) && (index < 15)) {
            result = result.substring(index + 3);
        }

        index = result.indexOf("/");

        if (index > 0) {
            result = result.substring(0, index);
        }

        return result;
    }
}