// Copyright 2015 Georg-August-Universität Göttingen, Germany
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package de.ugoe.cs.autoquest.plugin.genericevents.commands;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.text.DateFormat;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import de.ugoe.cs.autoquest.eventcore.Event;
import de.ugoe.cs.autoquest.eventcore.StringEventType;
import de.ugoe.cs.autoquest.plugin.genericevents.eventCore.GenericEventTarget;
import de.ugoe.cs.autoquest.plugin.genericevents.eventCore.GenericEventTargetSpec;
import de.ugoe.cs.util.console.Console;
/**
*
* TODO comment
*
*
* @author Patrick Harms
*/
public class SogouQDataFileParser {
/** */
private static DateFormat TIME_FORMAT = new SimpleDateFormat("HH:mm:ss");
/**
*
*/
Map> parseFile(File file,
boolean hasTimestamp,
boolean ignoreQuery,
boolean compareDomainOnly)
{
Console.println("reading file " + file);
Map> userSessions = new HashMap<>();
try {
BufferedReader reader = new BufferedReader
(new InputStreamReader(new FileInputStream(file), "GB2312"));
String line = null;
long lastTimeStamp = 0;
int timestampIndex = 0;
int userIdIndex = 1;
if (!hasTimestamp) {
timestampIndex = -1;
userIdIndex = 0;
}
do {
line = reader.readLine();
if (line != null) {
String[] elements = line.split("\t");
String userId = elements[userIdIndex].intern();
if (hasTimestamp) {
try {
long timestamp = TIME_FORMAT.parse(elements[timestampIndex]).getTime();
if (timestamp > lastTimeStamp) {
lastTimeStamp = timestamp;
}
else {
lastTimeStamp++;
}
}
catch (ParseException e) {
// just ignore this and count next
lastTimeStamp++;
}
}
StringBuffer query = new StringBuffer();
for (int i = userIdIndex + 1; i < elements.length - 2; i++) {
query.append(elements[i]);
}
String queryStr = query.toString().intern();
String selectedResultPage =
elements[elements.length - 2].split(" ")[0].intern();
String selectedResultIndex =
elements[elements.length - 2].split(" ")[1].intern();
String selectedResult = elements[elements.length - 1];
String fullSelectedResult = selectedResult;
if (compareDomainOnly) {
int index = selectedResult.indexOf("://");
// ensure with the second condition, that we do not match something in the
// url query
if ((index >= 0) && (index < 15)) {
selectedResult = selectedResult.substring(index + 3);
}
index = selectedResult.indexOf("/");
if (index > 0) {
selectedResult = selectedResult.substring(0, index);
}
}
selectedResult = selectedResult.intern();
Event event;
GenericEventTargetSpec spec = new GenericEventTargetSpec(selectedResult, null);
if (!ignoreQuery) {
event = new Event(new StringEventType("query for " + queryStr),
new GenericEventTarget(spec, null));
}
else {
event = new Event(new StringEventType("query"),
new GenericEventTarget(spec, null));
}
event.setTimestamp(lastTimeStamp);
event.setParameter("userId".intern(), userId);
event.setParameter("query".intern(), queryStr);
event.setParameter("selectedResultPage".intern(), selectedResultPage);
event.setParameter("selectedResultIndex".intern(), selectedResultIndex);
event.setParameter("selectedResult".intern(), fullSelectedResult);
List session = userSessions.get(userId);
if (session == null) {
session = new ArrayList<>();
userSessions.put(userId, session);
}
session.add(event);
}
}
while (line != null);
reader.close();
}
catch (FileNotFoundException e) {
Console.printerrln("could not read " + file);
Console.logException(e);
return null;
}
catch (IOException e) {
Console.printerrln("problem while reading a line from " + file);
Console.logException(e);
return null;
}
return userSessions;
}
}