- Timestamp:
- 09/09/11 06:23:36 (13 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/EventBenchConsole/src/de/ugoe/cs/eventbench/web/WeblogParser.java
r111 r171 17 17 import de.ugoe.cs.util.console.Console; 18 18 19 /** 20 * <p> 21 * Provides functionality to parse log files with web request. 22 * </p> 23 * 24 * @author Steffen Herbold 25 * @version 1.0 26 */ 19 27 public class WeblogParser { 20 28 29 /** 30 * <p> 31 * Timeout between two sessions in milliseconds. 32 * </p> 33 */ 21 34 private long timeout; 22 35 36 /** 37 * <p> 38 * Minimal length of a session. All shorter sessions will be pruned. 39 * Default: 2 40 * </p> 41 */ 23 42 private int minLength = 2; 24 43 44 /** 45 * <p> 46 * Collection of generated sequences. 47 * </p> 48 */ 25 49 private List<List<WebEvent>> sequences; 26 50 51 /** 52 * <p> 53 * Name and path of the robot filter. 54 * </p> 55 */ 27 56 private static final String ROBOTFILTERFILE = "misc/robotfilter.txt"; 28 29 private String robotRegex = ".*"; 30 57 58 /** 59 * <p> 60 * Field that contains a regular expression that matches all robots 61 * contained in {@link #ROBOTFILTERFILE}. 62 * </p> 63 */ 64 private String robotRegex = null; 65 66 /** 67 * <p> 68 * Constructor. Creates a new WeblogParser with a default timeout of 69 * 3,600,000 milliseconds (1 hour). 70 * </p> 71 */ 31 72 public WeblogParser() { 32 timeout = 3600000; // 1 hour session-timeout as default 33 } 34 73 this(3600000); 74 } 75 76 /** 77 * <p> 78 * Constructor. Creates a new WeblogParser. 79 * </p> 80 * 81 * @param timeout 82 * session timeout 83 */ 35 84 public WeblogParser(long timeout) { 36 85 this.timeout = timeout; 37 86 } 38 87 88 /** 89 * <p> 90 * Returns the generated event sequences. 91 * </p> 92 * 93 * @return generated event sequences 94 */ 39 95 public List<List<WebEvent>> getSequences() { 40 return sequences; 41 } 42 96 return sequences; 97 } 98 99 /** 100 * <p> 101 * Sets the session timeout. 
102 * </p> 103 * 104 * @param timeout 105 * new session timeout 106 */ 43 107 public void setTimeout(long timeout) { 44 108 this.timeout = timeout; 45 109 } 46 110 111 /** 112 * <p> 113 * Sets the minimal length of a session. All sessions that contain fewer 114 * events will be pruned. 115 * </p> 116 * 117 * @param minLength 118 * new minimal length 119 */ 47 120 public void setMinLength(int minLength) { 48 121 this.minLength = minLength; 49 122 } 50 51 public void parseFile(String filename) throws IOException, FileNotFoundException, ParseException, URISyntaxException { 123 124 /** 125 * <p> 126 * Parses a web log file. 127 * </p> 128 * 129 * @param filename 130 * name and path of the log file 131 * @throws IOException 132 * thrown if there is a problem with reading the log file 133 * @throws FileNotFoundException 134 * thrown if the log file is not found 135 * @throws ParseException 136 * thrown if the date format is invalid 137 * @throws URISyntaxException 138 * thrown if the URI is invalid 139 */ 140 public void parseFile(String filename) throws IOException, 141 FileNotFoundException, ParseException, URISyntaxException { 52 142 String[] lines = FileTools.getLinesFromFile(filename); 53 143 54 144 Map<String, List<Integer>> cookieSessionMap = new HashMap<String, List<Integer>>(); 55 145 int lastId = -1; 56 57 SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); 146 147 SimpleDateFormat dateFormat = new SimpleDateFormat( 148 "yyyy-MM-dd HH:mm:ss"); 58 149 loadRobotRegex(); 59 150 60 151 sequences = new ArrayList<List<WebEvent>>(); 61 62 for( String line : lines ) { 63 String[] values = line.substring(1, line.length()-1).split("\" \""); 64 152 153 for (String line : lines) { 154 String[] values = line.substring(1, line.length() - 1).split( 155 "\" \""); 156 65 157 // use cookie as session identifier 66 158 int cookieStart = values[0].lastIndexOf('.'); 67 String cookie = values[0].substring(cookieStart +1);159 String cookie = 
values[0].substring(cookieStart + 1); 68 160 String dateString = values[1]; 69 161 long timestamp = dateFormat.parse(dateString).getTime(); … … 71 163 // String ref = values[3]; // referer is not yet used! 72 164 String agent; 73 if ( values.length>4) {165 if (values.length > 4) { 74 166 agent = values[4]; 75 167 } else { 76 168 agent = "noagent"; 77 169 } 78 170 79 171 List<String> postedVars = new ArrayList<String>(); 80 if ( values.length==6) { // post vars found81 for ( String postVar : values[5].trim().split(" ")) {172 if (values.length == 6) { // post vars found 173 for (String postVar : values[5].trim().split(" ")) { 82 174 postedVars.add(postVar); 83 175 } 84 176 } 85 if ( !isRobot(agent)) {177 if (!isRobot(agent)) { 86 178 URI uri = new URI(uriString); 87 88 String path = uri.getPath(); 179 180 String path = uri.getPath(); 89 181 List<String> getVars = extractGetVarsFromUri(uri); 90 91 WebEvent event = new WebEvent(path, timestamp, postedVars, getVars); 92 182 183 WebEvent event = new WebEvent(path, timestamp, postedVars, 184 getVars); 185 93 186 // find session and add event 94 187 List<Integer> sessionIds = cookieSessionMap.get(cookie); 95 if ( sessionIds==null) {188 if (sessionIds == null) { 96 189 sessionIds = new ArrayList<Integer>(); 97 190 // start new session … … 99 192 cookieSessionMap.put(cookie, sessionIds); 100 193 sequences.add(new LinkedList<WebEvent>()); 101 } 102 Integer lastSessionIndex = sessionIds.get(sessionIds.size()-1); 194 } 195 Integer lastSessionIndex = sessionIds 196 .get(sessionIds.size() - 1); 103 197 List<WebEvent> lastSession = sequences.get(lastSessionIndex); 104 198 long lastEventTime = timestamp; 105 if( !lastSession.isEmpty() ) { 106 lastEventTime = lastSession.get(lastSession.size()-1).getTimestamp(); 107 } 108 if( timestamp-lastEventTime>timeout ) { 199 if (!lastSession.isEmpty()) { 200 lastEventTime = lastSession.get(lastSession.size() - 1) 201 .getTimestamp(); 202 } 203 if (timestamp - lastEventTime > timeout) { 109 
204 sessionIds.add(++lastId); 110 205 List<WebEvent> newSession = new LinkedList<WebEvent>(); … … 119 214 } 120 215 216 /** 217 * <p> 218 * Prunes sequences shorter than {@link #minLength}. 219 * </p> 220 */ 121 221 private void pruneShortSequences() { 122 Console.traceln("" +sequences.size()+ " user sequences found");222 Console.traceln("" + sequences.size() + " user sequences found"); 123 223 // prune sequences shorter than min-length 124 int i =0;125 while ( i<sequences.size()) {126 if ( sequences.get(i).size()<minLength) {224 int i = 0; 225 while (i < sequences.size()) { 226 if (sequences.get(i).size() < minLength) { 127 227 sequences.remove(i); 128 228 } else { … … 130 230 } 131 231 } 132 Console.traceln(""+sequences.size()+ " remaining after pruning of sequences shorter than " + minLength); 133 } 134 232 Console.traceln("" + sequences.size() 233 + " remaining after pruning of sequences shorter than " 234 + minLength); 235 } 236 237 /** 238 * <p> 239 * Reads {@link #ROBOTFILTERFILE} and creates a regular expression that 240 * matches all the robots defined in the file. The regular expression is 241 * stored in the field {@link #robotRegex}. 242 * </p> 243 * 244 * @throws IOException 245 * thrown if there is a problem reading the robot filter 246 * @throws FileNotFoundException 247 * thrown if the robot filter is not found 248 */ 135 249 private void loadRobotRegex() throws IOException, FileNotFoundException { 136 250 String[] lines = FileTools.getLinesFromFile(ROBOTFILTERFILE); 137 251 StringBuilder regex = new StringBuilder(); 138 for ( int i=0; i<lines.length; i++) {139 regex.append("(.*" +lines[i]+".*)");140 if ( i!=lines.length-1) {252 for (int i = 0; i < lines.length; i++) { 253 regex.append("(.*" + lines[i] + ".*)"); 254 if (i != lines.length - 1) { 141 255 regex.append("|"); 142 256 } … … 144 258 robotRegex = regex.toString(); 145 259 } 146 260 261 /** 262 * <p> 263 * Checks whether an agent is a robot. 
264 * </p> 265 * 266 * @param agent 267 * agent that is checked 268 * @return true, if the agent is a robot; false otherwise 269 */ 147 270 private boolean isRobot(String agent) { 148 271 return agent.matches(robotRegex); 149 272 } 150 273 274 /** 275 * <p> 276 * Parses the URI and extracts the GET variables that have been passed. 277 * </p> 278 * 279 * @param uri 280 * URI that is parsed 281 * @return a list with all GET variables 282 */ 151 283 private List<String> extractGetVarsFromUri(URI uri) { 152 284 List<String> getVars = new ArrayList<String>(); 153 285 String query = uri.getQuery(); 154 if ( query!=null) {286 if (query != null) { 155 287 String[] paramPairs = query.split("&"); 156 for ( String paramPair : paramPairs) {288 for (String paramPair : paramPairs) { 157 289 String[] paramSplit = paramPair.split("="); 158 290 getVars.add(paramSplit[0]);
Note: See TracChangeset
for help on using the changeset viewer.