From a5f6fe7c49ef31128428ae15b3e9a4d2d0fdd10b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andreas=20=C3=85kre=20Solberg?= <andreas.solberg@uninett.no>
Date: Mon, 9 Mar 2009 11:21:28 +0000
Subject: [PATCH] add logcleanerscript

git-svn-id: https://simplesamlphp.googlecode.com/svn/trunk@1390 44740490-163a-0410-bde0-09ae8108e29a
---
 modules/statistics/bin/logcleaner.php |  87 +++++++++++++
 modules/statistics/lib/LogCleaner.php | 170 ++++++++++++++++++++++++++
 2 files changed, 257 insertions(+)
 create mode 100755 modules/statistics/bin/logcleaner.php
 create mode 100644 modules/statistics/lib/LogCleaner.php

diff --git a/modules/statistics/bin/logcleaner.php b/modules/statistics/bin/logcleaner.php
new file mode 100755
index 000000000..9111d92cc
--- /dev/null
+++ b/modules/statistics/bin/logcleaner.php
@@ -0,0 +1,87 @@
+#!/usr/bin/env php
+<?php
+
+
+/* This is the base directory of the simpleSAMLphp installation. */
+$baseDir = dirname(dirname(dirname(dirname(__FILE__))));
+
+/* Add library autoloader. */
+require_once($baseDir . '/lib/_autoload.php');
+
+/* Initialize the configuration. */
+SimpleSAML_Configuration::setConfigDir($baseDir . '/config');
+
+$progName = array_shift($argv);
+$debug = FALSE;
+$dryrun = FALSE;
+$output = '/tmp/simplesamlphp-new.log';
+
+/* Map short options to long options. Built once, outside the argument loop. */
+$shortOptMap = array('-d' => '--debug');
+
+foreach($argv as $a) {
+	if(strlen($a) === 0) continue;
+	/* Split "--name=value" into option name and value; $v stays NULL without "=". */
+	$v = NULL;
+	$p = strpos($a, '=');
+	if($p !== FALSE) {
+		$v = substr($a, $p + 1);
+		$a = substr($a, 0, $p);
+	}
+	if(array_key_exists($a, $shortOptMap))  $a = $shortOptMap[$a];
+
+	switch($a) {
+		case '--help':
+			printHelp();
+			exit(0);
+		case '--debug':
+			$debug = TRUE;
+			break;
+		case '--dry-run':
+			$dryrun = TRUE;
+			break;
+		case '--outfile':
+			/* A missing or empty value would otherwise reach store() as the output path. */
+			if((string)$v === '') {
+				echo('Option --outfile requires a value: --outfile=<file>' . "\n");
+				exit(1);
+			}
+			$output = $v;
+			break;
+		default:
+			echo('Unknown option: ' . $a . "\n");
+			echo('Please run `' . $progName . ' --help` for usage information.' . "\n");
+			exit(1);
+	}
+}
+
+/* clean() selects the trackIDs to drop; store() (skipped on --dry-run) writes the new log. */
+$cleaner = new sspmod_statistics_LogCleaner();
+$cleaner->dumpConfig();
+$todelete = $cleaner->clean($debug);
+echo "Cleaning these trackIDs: " . join(', ', $todelete) . "\n";
+
+if (!$dryrun) {
+	$cleaner->store($todelete, $output);
+}
+
+/**
+ * Print the command line usage text for this script to standard output.
+ * The program name shown is taken from the global $progName.
+ */
+function printHelp() {
+	global $progName;
+	echo('Usage: ' . $progName . ' [options]
+
+This program cleans logs. This script is experimental. Do not run it unless you have talked to Andreas about it.
+The script deletes log lines related to sessions that produce more than 200 lines.
+
+Options:
+	--help				Show this help text and exit.
+	-d, --debug			Used when configuring the log file syntax. See doc.
+	--dry-run			List the trackIDs to clean, but do not write the output file.
+	--outfile=<file>		File to output the results (default: /tmp/simplesamlphp-new.log).
+
+');
+}
+
diff --git a/modules/statistics/lib/LogCleaner.php b/modules/statistics/lib/LogCleaner.php
new file mode 100644
index 000000000..fe9250f9d
--- /dev/null
+++ b/modules/statistics/lib/LogCleaner.php
@@ -0,0 +1,170 @@
+<?php
+/*
+ * @author Andreas Åkre Solberg <andreas.solberg@uninett.no>
+ * @package simpleSAMLphp
+ * @version $Id$
+ */
+/**
+ * Removes overly chatty sessions from the statistics log.
+ * clean() counts the STAT lines per trackID and returns the trackIDs that occur
+ * more than MAX_LINES_PER_SESSION times; store() copies the STAT lines of all
+ * other trackIDs to a new file. Settings are read from module_statistics.php.
+ */
+class sspmod_statistics_LogCleaner {
+	/* A session with more STAT lines than this is selected for deletion. */
+	const MAX_LINES_PER_SESSION = 200;
+
+	private $statconfig;
+	private $statdir;
+	private $inputfile;
+	private $statrules;
+	private $offset;
+
+	/**
+	 * Constructor. Reads the paths and offset from the module configuration.
+	 */
+	public function __construct() {
+		$this->statconfig = SimpleSAML_Configuration::getConfig('module_statistics.php');
+
+		$this->statdir = $this->statconfig->getValue('statdir');
+		$this->inputfile = $this->statconfig->getValue('inputfile');
+		$this->statrules = $this->statconfig->getValue('statrules');
+		$this->offset = $this->statconfig->getValue('offset', 0);
+	}
+
+	/**
+	 * Print the configured statistics directory, input file and offset.
+	 */
+	public function dumpConfig() {
+		echo 'Statistics directory   : ' . $this->statdir . "\n";
+		echo 'Input file             : ' . $this->inputfile . "\n";
+		echo 'Offset                 : ' . $this->offset . "\n";
+	}
+
+	/**
+	 * Validate the configured paths and open the input log for reading.
+	 *
+	 * @return resource  Read handle on the input file; the caller must fclose() it.
+	 * @throws Exception  If the statistics dir or input file is missing, or fopen() fails.
+	 */
+	private function openInput() {
+		if (!is_dir($this->statdir))
+			throw new Exception('Statistics module: output dir do not exists [' . $this->statdir . ']');
+
+		if (!file_exists($this->inputfile))
+			throw new Exception('Statistics module: input file do not exists [' . $this->inputfile . ']');
+
+		$file = fopen($this->inputfile, 'r');
+		if ($file === FALSE)
+			throw new Exception('Statistics module: unable to open input file [' . $this->inputfile . ']');
+
+		return $file;
+	}
+
+	/**
+	 * Create the log line parser from the configured date and offset settings.
+	 */
+	private function createParser() {
+		return new sspmod_statistics_LogParser(
+			$this->statconfig->getValue('datestart', 0),
+			$this->statconfig->getValue('datelength', 15),
+			$this->statconfig->getValue('offsetspan', 44)
+		);
+	}
+
+	/**
+	 * Count the STAT lines per trackID and select the sessions to delete.
+	 *
+	 * @param bool $debug  If TRUE, dump every parsed line and exit after 13 STAT lines.
+	 * @return array  TrackIDs that have more than MAX_LINES_PER_SESSION STAT lines.
+	 * @throws Exception  If the input file cannot be opened.
+	 */
+	public function clean($debug = FALSE) {
+		$file = $this->openInput();
+		$logparser = $this->createParser();
+
+		$sessioncounter = array();
+		$i = 0;
+
+		// Read whole lines; a length cap would split long lines into fragments
+		// that are then counted (or skipped) as if they were separate lines.
+		while (($logline = fgets($file)) !== FALSE) {
+			// Continue if STAT is not found on line.
+			if (strpos($logline, 'STAT') === FALSE) continue;
+			$i++;
+
+			$content = $logparser->parseContent($logline);
+			if (($i % 10000) == 0) {
+				echo("Read line " . $i . "\n");
+			}
+
+			// A line without a trackID field is counted under the empty trackID.
+			$trackid = isset($content[4]) ? (string)$content[4] : '';
+			if(!isset($sessioncounter[$trackid])) $sessioncounter[$trackid] = 0;
+			$sessioncounter[$trackid]++;
+
+			if ($debug) {
+				// The timestamp is only needed for this dump, so it is parsed here.
+				$epoch = $logparser->parseEpoch($logline);
+				echo("----------------------------------------\n");
+				echo('Log line: ' . $logline . "\n");
+				echo('Date parse [' . substr($logline, 0, $this->statconfig->getValue('datelength', 15)) . '] to [' . date(DATE_RFC822, $epoch) . ']' . "\n");
+				print_r($content);
+				if ($i >= 13) exit;
+			}
+		}
+		fclose($file);
+
+		$todelete = array();
+		foreach($sessioncounter AS $trackid => $sc) {
+			if($sc > self::MAX_LINES_PER_SESSION) $todelete[] = $trackid;
+		}
+		return $todelete;
+	}
+
+	/**
+	 * Write a copy of the input log that leaves out the given sessions.
+	 *
+	 * Only lines containing STAT are copied; every other line is left out too.
+	 *
+	 * @param array $todelete  TrackIDs whose lines must not be copied.
+	 * @param string $outputfile  File to write; opened in 'w' mode, so it is overwritten.
+	 * @throws Exception  If the input or the output file cannot be opened.
+	 */
+	public function store($todelete, $outputfile) {
+		echo "Preparing to delete [" .count($todelete) . "] trackids\n";
+
+		$file = $this->openInput();
+		$outfile = fopen($outputfile, 'w');
+		if ($outfile === FALSE) {
+			fclose($file);
+			throw new Exception('Statistics module: unable to open output file [' . $outputfile . ']');
+		}
+		$logparser = $this->createParser();
+
+		// Index the trackIDs by key: constant-time lookups, and exact matching instead
+		// of in_array()'s loose comparison (which treats "1e3" and "1000" as equal).
+		$lookup = array();
+		foreach ($todelete AS $id) $lookup[(string)$id] = TRUE;
+
+		$i = 0;
+		while (($logline = fgets($file)) !== FALSE) {
+			if (strpos($logline, 'STAT') === FALSE) continue;
+			$i++;
+
+			$content = $logparser->parseContent($logline);
+			if (($i % 10000) == 0) {
+				echo("Read line " . $i . "\n");
+			}
+
+			// Lines of a deleted session are skipped; the rest is copied verbatim.
+			$trackid = isset($content[4]) ? (string)$content[4] : '';
+			if (isset($lookup[$trackid])) continue;
+			fputs($outfile, $logline);
+		}
+		fclose($file);
+		fclose($outfile);
+	}
+}
+
+?>
\ No newline at end of file
-- 
GitLab