From a5f6fe7c49ef31128428ae15b3e9a4d2d0fdd10b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andreas=20=C3=85kre=20Solberg?= <andreas.solberg@uninett.no>
Date: Mon, 9 Mar 2009 11:21:28 +0000
Subject: [PATCH] add logcleanerscript

git-svn-id: https://simplesamlphp.googlecode.com/svn/trunk@1390 44740490-163a-0410-bde0-09ae8108e29a
---
 modules/statistics/bin/logcleaner.php |  99 ++++++++++++
 modules/statistics/lib/LogCleaner.php | 209 ++++++++++++++++++++++++++
 2 files changed, 308 insertions(+)
 create mode 100755 modules/statistics/bin/logcleaner.php
 create mode 100644 modules/statistics/lib/LogCleaner.php

diff --git a/modules/statistics/bin/logcleaner.php b/modules/statistics/bin/logcleaner.php
new file mode 100755
index 000000000..9111d92cc
--- /dev/null
+++ b/modules/statistics/bin/logcleaner.php
@@ -0,0 +1,99 @@
+#!/usr/bin/env php
+<?php
+/*
+ * Command line tool that removes noisy sessions from the statistics log.
+ *
+ * It asks sspmod_statistics_LogCleaner for the track IDs to drop, prints
+ * them, and - unless --dry-run is given - writes a filtered copy of the
+ * log to the file selected with --outfile.
+ */
+
+/* This is the base directory of the simpleSAMLphp installation. */
+$baseDir = dirname(dirname(dirname(dirname(__FILE__))));
+
+/* Add library autoloader. */
+require_once($baseDir . '/lib/_autoload.php');
+
+/* Initialize the configuration. */
+SimpleSAML_Configuration::setConfigDir($baseDir . '/config');
+
+
+$progName = array_shift($argv);
+$debug = FALSE;
+$dryrun = FALSE;
+$output = '/tmp/simplesamlphp-new.log';
+
+/* Map short options to long options. */
+$shortOptMap = array(
+    '-d' => '--debug',
+);
+
+foreach($argv as $a) {
+    if(strlen($a) === 0) continue;
+
+    /* Split "--option=value" into the option name and its value. */
+    $p = strpos($a, '=');
+    if($p !== FALSE) {
+        $v = substr($a, $p + 1);
+        $a = substr($a, 0, $p);
+    } else {
+        $v = NULL;
+    }
+
+    if(array_key_exists($a, $shortOptMap)) $a = $shortOptMap[$a];
+
+    switch($a) {
+    case '--help':
+        printHelp();
+        exit(0);
+    case '--debug':
+        $debug = TRUE;
+        break;
+    case '--dry-run':
+        $dryrun = TRUE;
+        break;
+    case '--outfile':
+        /* Without "=<file>" the value is NULL, which cannot be opened for writing. */
+        if($v === NULL || $v === '') {
+            echo('Option --outfile requires a value: --outfile=<file>' . "\n");
+            echo('Please run `' . $progName . ' --help` for usage information.' . "\n");
+            exit(1);
+        }
+        $output = $v;
+        break;
+    default:
+        echo('Unknown option: ' . $a . "\n");
+        echo('Please run `' . $progName . ' --help` for usage information.' . "\n");
+        exit(1);
+    }
+}
+
+$cleaner = new sspmod_statistics_LogCleaner();
+$cleaner->dumpConfig();
+$todelete = $cleaner->clean($debug);
+
+echo "Cleaning these trackIDs: " . join(', ', $todelete) . "\n";
+
+if (!$dryrun) {
+    $cleaner->store($todelete, $output);
+}
+
+/**
+ * This function prints the help output.
+ */
+function printHelp() {
+    global $progName;
+
+    /* '======================================================================' */
+    echo('Usage: ' . $progName . ' [options]
+
+This program cleans logs. This script is experimental. Do not run it unless you have talked to Andreas about it.
+The script deletes log lines related to sessions that produce more than 200 lines.
+
+Options:
+    -d, --debug         Used when configuring the log file syntax. See doc.
+    --dry-run           Aggregate but do not store the results.
+    --outfile=<file>    File to output the results.
+
+');
+}
diff --git a/modules/statistics/lib/LogCleaner.php b/modules/statistics/lib/LogCleaner.php
new file mode 100644
index 000000000..fe9250f9d
--- /dev/null
+++ b/modules/statistics/lib/LogCleaner.php
@@ -0,0 +1,209 @@
+<?php
+/**
+ * Removes the log lines of sessions that flood the statistics log.
+ *
+ * clean() counts the STAT lines per track ID and reports the IDs above a
+ * threshold; store() writes a copy of the log without those IDs.
+ *
+ * @author Andreas Åkre Solberg <andreas.solberg@uninett.no>
+ * @package simpleSAMLphp
+ * @version $Id$
+ */
+class sspmod_statistics_LogCleaner {
+
+    private $statconfig;
+    private $statdir;
+    private $inputfile;
+    private $statrules;
+    private $offset;
+
+    /**
+     * Constructor
+     *
+     * Reads the settings from module_statistics.php.
+     */
+    public function __construct() {
+
+        $this->statconfig = SimpleSAML_Configuration::getConfig('module_statistics.php');
+
+        $this->statdir = $this->statconfig->getValue('statdir');
+        $this->inputfile = $this->statconfig->getValue('inputfile');
+        $this->statrules = $this->statconfig->getValue('statrules');
+        $this->offset = $this->statconfig->getValue('offset', 0);
+    }
+
+    /**
+     * Print the configured statistics directory, input file and offset.
+     */
+    public function dumpConfig() {
+
+        echo 'Statistics directory : ' . $this->statdir . "\n";
+        echo 'Input file : ' . $this->inputfile . "\n";
+        echo 'Offset : ' . $this->offset . "\n";
+
+    }
+
+    /**
+     * Verify that the statistics directory and the input file exist.
+     *
+     * @throws Exception  If either of them is missing.
+     */
+    private function checkPaths() {
+
+        if (!is_dir($this->statdir))
+            throw new Exception('Statistics module: output dir do not exists [' . $this->statdir . ']');
+
+        if (!file_exists($this->inputfile))
+            throw new Exception('Statistics module: input file do not exists [' . $this->inputfile . ']');
+    }
+
+    /**
+     * Open the input log file for reading.
+     *
+     * @return resource  The open file handle.
+     * @throws Exception  If the file cannot be opened.
+     */
+    private function openInput() {
+
+        $file = fopen($this->inputfile, 'r');
+        if ($file === FALSE)
+            throw new Exception('Statistics module: unable to open input file [' . $this->inputfile . ']');
+
+        return $file;
+    }
+
+    /**
+     * Create a log parser from the datestart, datelength and offsetspan settings.
+     *
+     * @return sspmod_statistics_LogParser
+     */
+    private function createLogParser() {
+
+        return new sspmod_statistics_LogParser(
+            $this->statconfig->getValue('datestart', 0),
+            $this->statconfig->getValue('datelength', 15),
+            $this->statconfig->getValue('offsetspan', 44)
+        );
+    }
+
+    /**
+     * Find the track IDs that occur on more than $threshold STAT lines.
+     *
+     * Lines without "STAT" are ignored, and STAT lines on which the parser
+     * yields no track ID (index 4 of the parsed content) are not counted.
+     *
+     * @param bool $debug  Print parser details per STAT line; exits the script at the 13th.
+     * @param int $threshold  A track ID is returned when its line count exceeds this.
+     * @return array  The track IDs to delete.
+     * @throws Exception  If the statistics directory or the input file is missing or unreadable.
+     */
+    public function clean($debug = FALSE, $threshold = 200) {
+
+        $this->checkPaths();
+        $file = $this->openInput();
+        $logparser = $this->createLogParser();
+
+        $sessioncounter = array();
+
+        $i = 0;
+        // Parse through log file, line by line. fgets() returns FALSE at the
+        // end of the file and on a read error, so the loop always terminates.
+        while (($logline = fgets($file)) !== FALSE) {
+
+            // Continue if STAT is not found on line.
+            if (strpos($logline, 'STAT') === FALSE) continue;
+            $i++;
+
+            // Parse log, and extract epoch time and rest of content.
+            $epoch = $logparser->parseEpoch($logline);
+            $content = $logparser->parseContent($logline);
+
+            if (($i % 10000) == 0) {
+                echo("Read line " . $i . "\n");
+            }
+
+            if ($debug) {
+
+                echo("----------------------------------------\n");
+                echo('Log line: ' . $logline . "\n");
+                echo('Date parse [' . substr($logline, 0, $this->statconfig->getValue('datelength', 15)) . '] to [' . date(DATE_RFC822, $epoch) . ']' . "\n");
+                print_r($content);
+                if ($i >= 13) exit;
+            }
+
+            // A line without a track ID cannot be attributed to a session.
+            if (!isset($content[4])) continue;
+            $trackid = $content[4];
+
+            if(!isset($sessioncounter[$trackid])) $sessioncounter[$trackid] = 0;
+            $sessioncounter[$trackid]++;
+        }
+        fclose($file);
+
+        $todelete = array();
+        foreach($sessioncounter AS $trackid => $sc) {
+            if($sc > $threshold) $todelete[] = $trackid;
+        }
+
+        return $todelete;
+    }
+
+    /**
+     * Write a copy of the input log without the lines of the given track IDs.
+     *
+     * Only lines containing "STAT" are copied. A STAT line on which the parser
+     * yields no track ID (index 4 of the parsed content) is copied as it is.
+     *
+     * @param array $todelete  Track IDs whose lines are left out, as returned by clean().
+     * @param string $outputfile  Path of the file to write; existing content is overwritten.
+     * @throws Exception  If a required path is missing, or a file cannot be opened or written.
+     */
+    public function store($todelete, $outputfile) {
+
+        echo "Preparing to delete [" .count($todelete) . "] trackids\n";
+
+        $this->checkPaths();
+        $file = $this->openInput();
+
+        $outfile = fopen($outputfile, 'w');
+        if ($outfile === FALSE) {
+            fclose($file);
+            throw new Exception('Statistics module: unable to open output file [' . $outputfile . ']');
+        }
+
+        $logparser = $this->createLogParser();
+
+        // Use the track IDs as array keys: the lookup per log line is then exact
+        // and does not scan the whole list. in_array() compares loosely, so two
+        // different numeric-looking IDs (e.g. "0e12345678" and "0e87654321")
+        // would be treated as the same ID.
+        $deleteindex = array_flip($todelete);
+
+        $i = 0;
+        // Parse through log file, line by line
+        while (($logline = fgets($file)) !== FALSE) {
+
+            // Continue if STAT is not found on line.
+            if (strpos($logline, 'STAT') === FALSE) continue;
+            $i++;
+
+            $content = $logparser->parseContent($logline);
+
+            if (($i % 10000) == 0) {
+                echo("Read line " . $i . "\n");
+            }
+
+            // Leave out the lines that belong to a track ID to delete.
+            if (isset($content[4]) && isset($deleteindex[$content[4]])) continue;
+
+            if (fputs($outfile, $logline) === FALSE) {
+                fclose($file);
+                fclose($outfile);
+                throw new Exception('Statistics module: unable to write to output file [' . $outputfile . ']');
+            }
+        }
+        fclose($file);
+        fclose($outfile);
+    }
+
+}
-- 
GitLab