From 3a08025dc8ce1120d28db09681bc2e8b3597b9d3 Mon Sep 17 00:00:00 2001
From: Olav Morken <olav.morken@uninett.no>
Date: Thu, 17 Nov 2011 08:08:31 +0000
Subject: [PATCH] Add support for conditional get of metadata files.

This patch adds support for only updating metadata files that have
changed on the server. This reduces bandwidth used, and also allows
us to skip metadata parsing, which speeds things up significantly.

Thanks to Dyonisius Visser for implementing this!

git-svn-id: https://simplesamlphp.googlecode.com/svn/trunk@2980 44740490-163a-0410-bde0-09ae8108e29a
---
 lib/SimpleSAML/Utilities.php                  |  24 +-
 .../config-templates/config-metarefresh.php   |  15 +-
 modules/metarefresh/hooks/hook_cron.php       |  25 +-
 modules/metarefresh/lib/MetaLoader.php        | 235 +++++++++++++-----
 4 files changed, 228 insertions(+), 71 deletions(-)

diff --git a/lib/SimpleSAML/Utilities.php b/lib/SimpleSAML/Utilities.php
index 2a15674e2..f68ab1e5f 100644
--- a/lib/SimpleSAML/Utilities.php
+++ b/lib/SimpleSAML/Utilities.php
@@ -2117,9 +2117,10 @@ class SimpleSAML_Utilities {
 	 *
 	 * @param string $path  The path or URL we should fetch.
 	 * @param array $context  Extra context options. This parameter is optional.
-	 * @return string  The data we fetched.
+	 * @param boolean $getHeaders Whether to also return response headers. Optional.
+	 * @return mixed  array($data, $headers) if $getHeaders is TRUE, otherwise the fetched data as a string.
 	 */
-	public static function fetch($path, $context = array()) {
+	public static function fetch($path, $context = array(), $getHeaders = FALSE) {
 		assert('is_string($path)');
 
 		$config = SimpleSAML_Configuration::getInstance();
@@ -2141,6 +2142,25 @@ class SimpleSAML_Utilities {
 			throw new SimpleSAML_Error_Exception('Error fetching ' . var_export($path, TRUE) . ':' . self::getLastError());
 		}
 
+		// Data and headers.
+		if ($getHeaders) {
+
+			$headers = array();
+
+			foreach($http_response_header as $h) {
+				if(preg_match('@^HTTP/1\.[01]\s+\d{3}\s+@', $h)) {
+					$headers = array(); // reset
+					$headers[0] = $h;
+					continue;
+				}
+				$bits = explode(':', $h, 2);
+				if(count($bits) === 2) {
+					$headers[strtolower($bits[0])] = trim($bits[1]);
+				}
+			}
+			return array($data, $headers);
+		}
+
 		return $data;
 	}
 
diff --git a/modules/metarefresh/config-templates/config-metarefresh.php b/modules/metarefresh/config-templates/config-metarefresh.php
index dc8b2c0b5..18bfd30f6 100644
--- a/modules/metarefresh/config-templates/config-metarefresh.php
+++ b/modules/metarefresh/config-templates/config-metarefresh.php
@@ -9,6 +9,14 @@ $config = array(
 	#	'http://my.own.uni/idp'
 	#),
 	
+	/*
+	 * Conditional GET requests
+	 * Efficient downloading so polling can be done more frequently.
+	 * Works for sources that send 'Last-Modified' or 'ETag' headers.
+	 * Note that the 'datadir' directory ('data/' by default) needs to be writable for this to work.
+	 */
+	#'conditionalGET'	=> TRUE,
+
 	'sets' => array(
 
 		'kalmar' => array(
@@ -16,7 +24,7 @@ $config = array(
 			'sources'	=> array(
 				array(
 					/*
-					 * entityIDs that should be excluded from this set.
+					 * entityIDs that should be excluded from this src.
 					 */
 					#'blacklist' => array(
 					#	'http://some.other.uni/idp',
@@ -30,8 +38,9 @@ $config = array(
 					#	'http://some.other.uni/idp',
 					#),
 
-					'src' => 'https://kalmar.feide.no/simplesaml/module.php/aggregator/?id=kalmarcentral&mimetype=text/plain&exclude=norway',
-					'validateFingerprint' => '591d4b4670463eeda91fcc816dc0af2a092aa801',
+					#'conditionalGET' => TRUE,
+					'src' => 'https://kalmar2.org/simplesaml/module.php/aggregator/?id=kalmarcentral&set=saml2&exclude=norway',
+					'validateFingerprint' => '59:1D:4B:46:70:46:3E:ED:A9:1F:CC:81:6D:C0:AF:2A:09:2A:A8:01',
 					'template' => array(
 						'tags'	=> array('kalmar'),
 						'authproc' => array(
diff --git a/modules/metarefresh/hooks/hook_cron.php b/modules/metarefresh/hooks/hook_cron.php
index 5cf828fee..ce9be0c9d 100644
--- a/modules/metarefresh/hooks/hook_cron.php
+++ b/modules/metarefresh/hooks/hook_cron.php
@@ -16,6 +16,7 @@ function metarefresh_hook_cron(&$croninfo) {
 		$mconfig = SimpleSAML_Configuration::getOptionalConfig('config-metarefresh.php');
 
 		$sets = $mconfig->getConfigList('sets', array());
+		$stateFile = $config->getPathValue('datadir', 'data/') . 'metarefresh-state.php';
 
 		foreach ($sets AS $setkey => $set) {
 			// Only process sets where cron matches the current cron tag.
@@ -31,11 +32,21 @@ function metarefresh_hook_cron(&$croninfo) {
 				$expire = NULL;
 			}
 
-			$metaloader = new sspmod_metarefresh_MetaLoader($expire);
+			$outputDir = $set->getString('outputDir');
+			$outputDir = $config->resolvePath($outputDir);
+			$outputFormat = $set->getValueValidate('outputFormat', array('flatfile', 'serialize'), 'flatfile');
+
+			$oldMetadataSrc = SimpleSAML_Metadata_MetaDataStorageSource::getSource(array(
+				'type' => $outputFormat,
+				'directory' => $outputDir,
+			));
+
+			$metaloader = new sspmod_metarefresh_MetaLoader($expire, $stateFile, $oldMetadataSrc);
 
-			# Get global blacklist
+			# Get global blacklist, whitelist and caching info
 			$blacklist = $mconfig->getArray('blacklist', array());
 			$whitelist = $mconfig->getArray('whitelist', array());
+			$conditionalGET = $mconfig->getBoolean('conditionalGET', FALSE);
 
 			foreach($set->getArray('sources') AS $source) {
 
@@ -53,14 +64,18 @@ function metarefresh_hook_cron(&$croninfo) {
 					$source['whitelist'] = $whitelist;
 				}
 
+				# Let src specific conditionalGET override global one
+				if(!isset($source['conditionalGET'])) {
+					$source['conditionalGET'] = $conditionalGET;
+				}
+
 				SimpleSAML_Logger::debug('cron [metarefresh]: In set [' . $setkey . '] loading source ['  . $source['src'] . ']');
 				$metaloader->loadSource($source);
 			}
 
-			$outputDir = $set->getString('outputDir');
-			$outputDir = $config->resolvePath($outputDir);
+			// Write state information back to disk
+			$metaloader->writeState();
 
-			$outputFormat = $set->getValueValidate('outputFormat', array('flatfile', 'serialize'), 'flatfile');
 			switch ($outputFormat) {
 				case 'flatfile':
 					$metaloader->writeMetadataFiles($outputDir);
diff --git a/modules/metarefresh/lib/MetaLoader.php b/modules/metarefresh/lib/MetaLoader.php
index 3d9cec5ff..c0041b152 100644
--- a/modules/metarefresh/lib/MetaLoader.php
+++ b/modules/metarefresh/lib/MetaLoader.php
@@ -7,8 +7,14 @@
 class sspmod_metarefresh_MetaLoader {
 
 
-	private $metadata;
 	private $expire;
+	private $metadata;
+	private $oldMetadataSrc;
+	private $stateFile;
+	private $changed;
+	private static $types = array('saml20-idp-remote', 'saml20-sp-remote',
+		'shib13-idp-remote', 'shib13-sp-remote', 'attributeauthority-remote');
+
 
 	/**
 	 * Constructor
@@ -16,65 +22,172 @@ class sspmod_metarefresh_MetaLoader {
 	 * @param array $sources 	Sources...
 	 * @param 
 	 */
-	public function __construct($expire = NULL) {
-		$this->expire = $expire;	
+	public function __construct($expire = NULL, $stateFile = NULL, $oldMetadataSrc = NULL) {
+		$this->expire = $expire;
 		$this->metadata = array();
+		$this->oldMetadataSrc = $oldMetadataSrc;
+		$this->stateFile = $stateFile;
+		$this->changed = FALSE;
+
+		// Read file containing $state from disk
+		if(is_readable($stateFile)) {
+			require($stateFile);
+		}
+
+		$this->state = (isset($state)) ? $state : array();
+
 	}
 
 	/**
 	 * This function processes a SAML metadata file.
 	 *
-	 * @param $src  Filename of the metadata file.
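+	 * @param array $source  Source configuration: 'src' (URL or path to fetch) plus optional 'conditionalGET', 'blacklist', 'whitelist', 'validateFingerprint' and 'template'.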
 	 */
 	public function loadSource($source) {
-		
-		$entities = array();
+
+		$context = NULL;
+
+		$config = SimpleSAML_Configuration::getInstance();
+		$name = $config->getString('technicalcontact_name', NULL);
+		$mail = $config->getString('technicalcontact_email', NULL);
+		$rawheader = "User-Agent: SimpleSAMLphp metarefresh, run by $name <$mail>\r\n";
+
+		if (isset($source['conditionalGET']) && $source['conditionalGET']) {
+			if(array_key_exists($source['src'], $this->state)) {
+
+				$sourceState = $this->state[$source['src']];
+
+				if(isset($sourceState['last-modified'])) {
+					$rawheader .= 'If-Modified-Since: ' . $sourceState['last-modified'] . "\r\n";
+				}
+
+				if(isset($sourceState['etag'])) {
+					$rawheader .= 'If-None-Match: ' . $sourceState['etag'] . "\r\n";
+				}
+			}
+		}
+
+		// Build new HTTP context
+		$context = array('http' => array('header' => $rawheader));
+
+
+		// GET!
 		try {
-			$entities = SimpleSAML_Metadata_SAMLParser::parseDescriptorsFile($source['src']);
+			list($data, $responseHeaders) = SimpleSAML_Utilities::fetch($source['src'], $context, TRUE);
 		} catch(Exception $e) {
 			SimpleSAML_Logger::warning('metarefresh: Failed to retrieve metadata. ' . $e->getMessage());
 		}
 
-		foreach($entities as $entity) {
+		//SimpleSAML_Logger::debug('All response headers: ' . var_export($responseHeaders,1));
+		$status = $responseHeaders[0];
 
-			if(isset($source['blacklist'])) {
-				if(!empty($source['blacklist']) && in_array($entity->getEntityID(), $source['blacklist'])) {
-					SimpleSAML_Logger::info('Skipping "' .  $entity->getEntityID() . '" - blacklisted.' . "\n");
-					continue;
+		if(preg_match('@^HTTP/1\.[01]\s304\s@', $status ) && isset($this->oldMetadataSrc)) {
+			// Not-Modified. This could only have happened if 'conditionalGET' was used.
+			SimpleSAML_Logger::debug('Received \'' . $status . '\', re-using cached metadata');
+
+			foreach(self::$types as $type) {
+				foreach($this->oldMetadataSrc->getMetadataSet($type) as $entity) {
+					if(array_key_exists('metarefresh:src', $entity)) {
+						if($entity['metarefresh:src'] == $source['src']) {
+							//SimpleSAML_Logger::debug('Re-using cached metadata for ' . $entity['entityid']);
+							$this->addMetadata($source['src'], $entity, $type);
+						}
+					}
 				}
 			}
+		} else {
 
-			if(isset($source['whitelist'])) {
-				if(!empty($source['whitelist']) && !in_array($entity->getEntityID(), $source['whitelist'])) {
-					SimpleSAML_Logger::info('Skipping "' .  $entity->getEntityID() . '" - not in the whitelist.' . "\n");
-					continue;
+			// Cached metadata is stale or missing, so parse the freshly downloaded copy
+			if (isset($source['conditionalGET']) && $source['conditionalGET']) {
+				SimpleSAML_Logger::debug('Downloaded fresh copy');
+			}
+
+			$entities = array();
+			try{
+				$doc = new DOMDocument();
+				$res = $doc->loadXML($data);
+				if($res !== TRUE) {
+					throw new Exception('Failed to read XML from ' . $source['src']);
 				}
+				if($doc->documentElement ===  NULL) throw new Exception('Opened file is not an XML document: ' . $source['src']);
+				$entities = SimpleSAML_Metadata_SAMLParser::parseDescriptorsElement($doc->documentElement);
+			} catch(Exception $e) {
+				SimpleSAML_Logger::warning('metarefresh: Failed to retrieve metadata. ' . $e->getMessage());
 			}
 
-			if(array_key_exists('validateFingerprint', $source) && $source['validateFingerprint'] !== NULL) {
-				if(!$entity->validateFingerprint($source['validateFingerprint'])) {
-					SimpleSAML_Logger::info('Skipping "' . $entity->getEntityId() . '" - could not verify signature.' . "\n");
-					continue;
+			foreach($entities as $entity) {
+
+				if(isset($source['blacklist'])) {
+					if(!empty($source['blacklist']) && in_array($entity->getEntityID(), $source['blacklist'])) {
+						SimpleSAML_Logger::info('Skipping "' .  $entity->getEntityID() . '" - blacklisted.' . "\n");
+						continue;
+					}
+				}
+
+				if(isset($source['whitelist'])) {
+					if(!empty($source['whitelist']) && !in_array($entity->getEntityID(), $source['whitelist'])) {
+						SimpleSAML_Logger::info('Skipping "' .  $entity->getEntityID() . '" - not in the whitelist.' . "\n");
+						continue;
+					}
+				}
+
+				if(array_key_exists('validateFingerprint', $source) && $source['validateFingerprint'] !== NULL) {
+					if(!$entity->validateFingerprint($source['validateFingerprint'])) {
+						SimpleSAML_Logger::info('Skipping "' . $entity->getEntityId() . '" - could not verify signature.' . "\n");
+						continue;
+					}
+				}
+
+				$template = NULL;
+				if (array_key_exists('template', $source)) $template = $source['template'];
+
+				$this->addMetadata($source['src'], $entity->getMetadata1xSP(), 'shib13-sp-remote', $template);
+				$this->addMetadata($source['src'], $entity->getMetadata1xIdP(), 'shib13-idp-remote', $template);
+				$this->addMetadata($source['src'], $entity->getMetadata20SP(), 'saml20-sp-remote', $template);
+				$this->addMetadata($source['src'], $entity->getMetadata20IdP(), 'saml20-idp-remote', $template);
+				$attributeAuthorities = $entity->getAttributeAuthorities();
+				if (!empty($attributeAuthorities)) {
+					$this->addMetadata($source['src'], $attributeAuthorities[0], 'attributeauthority-remote', $template);
 				}
 			}
-			
-			$template = NULL;
-			if (array_key_exists('template', $source)) $template = $source['template'];
-			
-			$this->addMetadata($source['src'], $entity->getMetadata1xSP(), 'shib13-sp-remote', $template);
-			$this->addMetadata($source['src'], $entity->getMetadata1xIdP(), 'shib13-idp-remote', $template);
-			$this->addMetadata($source['src'], $entity->getMetadata20SP(), 'saml20-sp-remote', $template);
-			$this->addMetadata($source['src'], $entity->getMetadata20IdP(), 'saml20-idp-remote', $template);
-			$attributeAuthorities = $entity->getAttributeAuthorities();
-			if (!empty($attributeAuthorities)) {
-				$this->addMetadata($source['src'], $attributeAuthorities[0], 'attributeauthority-remote', $template);				
+		}
+
+		// Save state for this src
+		if (isset($source['conditionalGET']) && $source['conditionalGET']) {
+
+			// Headers section
+			$candidates = array('last-modified', 'etag');
+
+			foreach($candidates as $candidate) {
+				if(array_key_exists($candidate, $responseHeaders)) {
+					$this->state[$source['src']][$candidate] = $responseHeaders[$candidate];
+				}
 			}
 
+			if(!empty($this->state[$source['src']])) {
+				// Timestamp when this src was requested.
+				$this->state[$source['src']]['requested_at'] = $this->getTime();
+
+				$this->changed = TRUE;
+			}
 		}
 	}
 
+	/**
+	 * This function writes the state array back to disk
+	 */
+	 public function writeState() {
+		if($this->changed) {
+			SimpleSAML_Logger::debug('Writing: ' . $this->stateFile);
+			SimpleSAML_Utilities::writeFile(
+				$this->stateFile,
+				"<?php\n/* This file was generated by the metarefresh module at ".$this->getTime() . ".\n".
+				" Do not update it manually as it will get overwritten. */\n".
+				'$state = ' . var_export($this->state, TRUE) . ";\n?>\n"
+			);
+		}
+	}
 
-	
 	/**
 	 * This function writes the metadata to stdout.
 	 */
@@ -126,6 +239,7 @@ class sspmod_metarefresh_MetaLoader {
 			$metadata = array_merge($metadata, $template);
 		}
 	
+		$metadata['metarefresh:src'] = $filename;
 		if(!array_key_exists($type, $this->metadata)) {
 			$this->metadata[$type] = array();
 		}
@@ -138,11 +252,11 @@ class sspmod_metarefresh_MetaLoader {
 			
 				// Override metadata expire with more restrictive global config-
 				if ($this->expire < $metadata['expire'])
-					$metadata['expire'] = $this->expire;		
+					$metadata['expire'] = $this->expire;
 					
 			// If expire is not already in metadata use global config
 			} else {
-				$metadata['expire'] = $this->expire;			
+				$metadata['expire'] = $this->expire;
 			}
 		}
 		
@@ -201,34 +315,33 @@ class sspmod_metarefresh_MetaLoader {
 			}
 		}
 	
-		foreach($this->metadata as $category => $elements) {
-	
-			$filename = $outputDir . '/' . $category . '.php';
-	
-			SimpleSAML_Logger::debug('Writing: ' . $filename . "\n");
-	
-			$fh = @fopen($filename, 'w');
-			if($fh === FALSE) {
-				throw new Exception('Failed to open file for writing: ' . $filename . "\n");
-				exit(1);
-			}
-	
-			fwrite($fh, '<?php' . "\n");
-	
-			foreach($elements as $m) {
-				$filename = $m['filename'];
-				$entityID = $m['metadata']['entityid'];
-	
-				fwrite($fh, "\n");
-				fwrite($fh, '/* The following metadata was generated from ' . $filename . ' on ' . $this->getTime() . '. */' . "\n");
-				fwrite($fh, '$metadata[\'' . addslashes($entityID) . '\'] = ' . var_export($m['metadata'], TRUE) . ';' . "\n");
+		foreach(self::$types as $type) {
+
+			$filename = $outputDir . '/' . $type . '.php';
+
+			if(array_key_exists($type, $this->metadata)) {
+				$elements = $this->metadata[$type];
+				SimpleSAML_Logger::debug('Writing: ' . $filename);
+
+				$content  = '<?php' . "\n" . '/* This file was generated by the metarefresh module at '. $this->getTime() . "\n";
+				$content .= ' Do not update it manually as it will get overwritten' . "\n" . '*/' . "\n";
+
+				foreach($elements as $m) {
+					$entityID = $m['metadata']['entityid'];
+					$content .= "\n";
+					$content .= '$metadata[\'' . addslashes($entityID) . '\'] = ' . var_export($m['metadata'], TRUE) . ';' . "\n";
+				}
+
+				$content .= "\n" . '?>';
+
+				SimpleSAML_Utilities::writeFile($filename, $content);
+			} elseif(is_file($filename)) {
+				if(unlink($filename)) {
+					SimpleSAML_Logger::debug('Deleting stale metadata file: ' . $filename);
+				} else {
+					SimpleSAML_Logger::warning('Could not delete stale metadata file: ' . $filename);
+				}
 			}
-	
-	
-			fwrite($fh, "\n");
-			fwrite($fh, '?>');
-	
-			fclose($fh);
 		}
 	}
 
-- 
GitLab