<?php
/**
 * This file is part of the transcoding library. It contains the
 * definition of the {@link TranscodingActionPagination} class.
 * 
 * @author Sylvain Lequeux
 * @author Francois Daoust <fd@w3.org>
 * @package TransPythia
 * @version $Revision: 1.32 $
 * @license http://www.w3.org/Consortium/Legal/2002/copyright-software-20021231.html W3C Software Notice and License
 * @copyright Copyright (c) 2009, W3C (MIT, ERCIM, Keio)
 */

/**
 * Include the {@link TranscodingAction} base class definition.
 */
require_once(dirname(__FILE__) . '/transcodingaction.php');


/**
 * Transcoding action that paginates lengthy XHTML content to speed up content
 * delivery on mobile devices with limited bandwidth, and possibly limited
 * memory.
 * 
 * Previous/Next links are added to the page.
 * 
 * The content is divided into blocks. Possible blocks are the elements of the
 * XHTML Modularization 1.1 standard whose minimal content model
 * contains "Flow" or "Block", plus list containers:
 *  http://www.w3.org/TR/xhtml-modularization/abstract_modules.html
 *  
 * Two types of blocks are supported:
 * - containers that may be further subdivided: html, body, div, blockquote
 * - atomic blocks: head, p, h1, h2, h3, h4, h5, h6, pre, address, hr, table,
 * form, fieldset, ul, ol, dl
 * 
 * Atomic blocks are not further sub-divided and appear as they stand in a
 * page.
 * 
 * The object fall-back mechanism is not supported and is likely to produce
 * weird results if the fall-back content contains blocks.
 * 
 * Some atomic blocks are likely to be supported as containers in a future
 * version of this action (e.g. ul, ol, dl). Other block elements that may
 * appear within atomic blocks (e.g. li, dd, th, td) are processed as regular
 * inline content.
 * 
 * The action expects the blocks to be well-formed: closing tags must be set
 * appropriately and properly nested. The action does not require inline
 * content to be well-formed.
 * 
 * Several options may be specified through calls to
 * {@link TranscodingAction::setOption()} prior to using this action:
 * - mobile_device: contains the DDR property reference that the transcoding
 *   uses to tell whether the requesting device is mobile or not. The action
 *   uses the is_wireless_device property of the WURFL namespace when the option
 *   is not set.
 *   
 * - page_index => the index of the page to return (first page has index 1).
 *   The first page is returned when not set.
 *   
 * - max_size => maximum allowed size per page in bytes. Defaults to 10Kb. The
 *   size includes the markup size and the size of the images. The action tries
 *   to accommodate this setting as much as it can, but note it may not always
 *   be possible, for instance when an atomic block is too large.
 * 
 * - max_markup_size => maximum allowed size for the markup of the page in bytes.
 *   Defaults to 10Kb. The limitation mentioned for max_size applies here as well.
 *   
 * - nav_block => The HTML code for the navigation block (previous/next links)
 *   A simple div with previous/next links, the index of current page and the
 *   number of pages is returned when not set.
 *   
 * - prev_nav_block => The HTML code for the previous page link. The block is
 *   displayed when current page is not the first page.
 *   
 * - next_nav_block => The HTML code for the next page link. The block is
 *   displayed when current page is not the last page. 
 * 
 * - base_uri => The URI of the page that is being processed which also acts as
 *   the base URI to resolve relative URIs for images
 * 
 * - uri_mappings => The list of URI mappings to use to convert an absolute HTTP
 *   URI to a local file. Mappings must be separated by a space. Each mapping
 *   consists of a root URI and a root folder separated by a '|'.
 *   Ex: http://example.com/img/|/var/www/img/
 * 
 * @author Sylvain Lequeux
 * @author Francois Daoust <fd@w3.org>
 * @package TransPythia
 * @version $Revision: 1.32 $
 * @license http://www.w3.org/Consortium/Legal/2002/copyright-software-20021231.html W3C Software Notice and License
 * @copyright Copyright (c) 2009, W3C (MIT, ERCIM, Keio)
 */
class TranscodingActionPagination extends TranscodingAction {
	/**
	 * @var int Default maximum size per page for content, in bytes. Used as
	 *   the default of both the max_size and max_markup_size options.
	 */
	static private $DEFAULT_MAXIMUM_PAGESIZE = 10240;
	
	/**
	 * @var int Default image size in bytes when the actual image size
	 * cannot be computed.
	 */
	static private $DEFAULT_IMAGE_SIZE = 10240;
	
	/**
	 * @var string Default "previous page" navigation block. The _PREVURI_
	 *   placeholder is replaced by the URI of the previous page.
	 */
	static private $DEFAULT_PREV_NAV_BLOCK = '<a href="_PREVURI_">Previous page</a> | ';
	
	/**
	 * @var string Default "current page" navigation block. The _PREVBLOCK_,
	 *   _NEXTBLOCK_, _PAGEINDEX_ and _NBPAGES_ placeholders are replaced when
	 *   the navigation block is generated.
	 */
	static private $DEFAULT_NAV_BLOCK = '<div><p>_PREVBLOCK_Page _PAGEINDEX_/_NBPAGES__NEXTBLOCK_</p></div>';
	
	/**
	 * @var string Default "next page" navigation block. The _NEXTURI_
	 *   placeholder is replaced by the URI of the next page.
	 */
	static private $DEFAULT_NEXT_NAV_BLOCK = ' | <a href="_NEXTURI_">Next page</a>';
		
	/**
	 * @var int Maximum size in bytes allowed per page (markup and images).
	 *   Set from the max_size option when pagination starts.
	 */
	private $maxSize = 0;
	/**
	 * @var int Maximum markup size in bytes allowed per page.
	 *   Set from the max_markup_size option when pagination starts.
	 */
	private $maxMarkupSize = 0;
	/**
	 * @var array(string) The list of generated pages.
	 */
	private $pages = array();
	/**
	 * @var string The page that is currently being generated.
	 */
	private $currentPage = "";
	/**
	 * @var int The size in bytes of current page (markup and images)
	 */
	private $currentSize = 0;
	/**
	 * @var int The markup size in bytes of current page
	 */
	private $currentMarkupSize = 0;
	/**
	 * @var array(array(string, string, string)) Stack that keeps track of the
	 *   HTML hierarchy. The hierarchy is reproduced on next page so that some
	 *   CSS selectors can still apply. Each entry contains the starting tag as
	 *   found in the content, the tag name, and the ending tag.
	 */
	private $currentContext = array();
	/**
	 * @var array(array(string, string, string)) Stack of contexts that are
	 *   pending, i.e. contexts that we have to write to current page as soon
	 *   as we plan to write some "real" content to the page. On top of the
	 *   HTML hierarchy, note the array may also contain headings (stored as
	 *   the whole heading block, the tag name, and an empty ending tag).
	 */
	private $currentPendingContext = array();
	/**
	 * @var string The head block if one is found. The head block gets copied
	 *   to subsequent pages.
	 */
	private $head = "";
	/**
	 * @var array(string=>string) List of URI mappings to convert HTTP URIs
	 *   to local files (root URI => root folder).
	 */
	private $uriMappings = array();
	
	
	/**
	 * Paginates the given content when it is too big.
	 * 
	 * Pagination only takes place when the requesting device is identified
	 * as a mobile device; the content is returned untouched otherwise.
	 * 
	 * @param string $content The HTML content to transcode.
	 * @param Evidence $evidence The evidence that identifies the requesting device.
	 * @return string The transcoded content, i.e. the requested page, or the
	 *   concatenation of all pages when the page_index option is -1.
	 * @exception SystemException The evidence is not valid.
	 */
	public function apply($content, $evidence){
		$this->initPropertyValues($evidence);
		
		$this->initproperty('mobile_device',
			TranscodingAction::$WURFL_MOBILE_DEVICE,
			TranscodingAction::$WURFL_VOCABULARY,
			TranscodingAction::$WURFL_DEFAULT_ASPECT);
		
		// Unless otherwise mentioned, we'll consider the transcoding action
		// is run in the context of an HTTP request. Note it is not mandatory:
		// the request URI may not be available (e.g. command line), in which
		// case the base URI defaults to an empty string.
		$baseUri = empty($_SERVER['REQUEST_URI']) ? "" : $_SERVER['REQUEST_URI'];
		
		// See class description for details on the options
		$this->initOption('max_size', 'int', self::$DEFAULT_MAXIMUM_PAGESIZE);
		$this->initOption('max_markup_size', 'int', self::$DEFAULT_MAXIMUM_PAGESIZE);
		$this->initOption('page_index', 'int', 1);
		$this->initOption('prev_nav_block', 'string', self::$DEFAULT_PREV_NAV_BLOCK);
		$this->initOption('nav_block', 'string', self::$DEFAULT_NAV_BLOCK);
		$this->initOption('next_nav_block', 'string', self::$DEFAULT_NEXT_NAV_BLOCK);
		$this->initOption('base_uri', 'string', $baseUri);
		$this->initOption('uri_mappings', 'string', $baseUri . '|' . $baseUri);
		
		// Pagination only takes place when the requesting device is identified
		// as a "mobile" device.
		$property = $this->getOption('mobile_device');
		$is_mobile_device = $this->getPropertyValuePr($property);
		if(!isset($is_mobile_device) || !$is_mobile_device->getBoolean()){
		  	return $content;
		}
		
		// Parse the space-separated list of "root URI|root folder" mappings.
		// Entries that do not contain a '|' separator (including the empty
		// entries produced by consecutive spaces) are ignored.
		$this->uriMappings = array();
		$mappings = explode(' ', $this->getOption('uri_mappings'));
		foreach ($mappings as $mapping) {
			$maparray = explode('|', $mapping);
			if (count($maparray) < 2) {
				continue;
			}
			$this->uriMappings[$maparray[0]] = $maparray[1];
		}
		
		return $this->paginate($content);
	}
	
	
	/**
	 * Paginates the given HTML content based on actions settings.
	 * 
	 * This is main function of the class, called when the content needs to be
	 * paginated because the requesting device was identified as a "mobile"
	 * device.
	 * 
	 * The page to return is selected through the page_index option:
	 * - -1 returns the concatenation of all pages,
	 * - an index greater than the number of pages returns the last page,
	 * - an index lower than 1 returns the first page.
	 *  
	 * @param string $content The HTML content to adapt.
	 * @return string The updated HTML content. The content is returned as is
	 *   when no page could be generated (e.g. empty content).
	 */
	private function paginate($content){
		// Reset internal variables, including the pages and head generated
		// by a previous call so that the action may be applied more than once.
		$this->maxSize = $this->getOption('max_size');
		$this->maxMarkupSize = $this->getOption('max_markup_size');
		$this->pages = array();
		$this->head = "";
		$this->currentPage = "";
		$this->currentSize = 0;
		$this->currentMarkupSize = 0;
		$this->currentContext = array();
		$this->currentPendingContext = array();
		
		// Paginate content as needed
		$this->processBlock($content);
		
		// Close last page
		$this->closeCurrentPage();
		
		// Insert navigation blocks as needed
		$this->insertNavigationBlocks();
		
		$nbPages = count($this->pages);
		if ($nbPages == 0) {
			// Nothing was written to any page, nothing to paginate
			return $content;
		}
		
		// Select the page to return
		$pageIndex = $this->getOption('page_index');
		if ($pageIndex == -1) {
			// Return concatenation of all pages.
			// This might sound strange, but nav_block should then typically be
			// viewed as a page break flag understood by the tool in which the
			// action is used. Keep in mind that the last page also ends up with
			// nav_block, which may have to be removed not to trigger an empty
			// last page.
			return join('', $this->pages);
		}
		if ($pageIndex > $nbPages) {
			// Let's return last page
			return $this->pages[$nbPages - 1];
		}
		if ($pageIndex < 1) {
			// Invalid index, let's return first page
			return $this->pages[0];
		}
		return $this->pages[$pageIndex - 1];
	}


	/**
	 * Recursive method that processes the given block and fills the pages,
	 * keeping an eye on the context.
	 * 
	 * Pages are composed of blocks. Possible blocks are taken from the XHTML
	 * standard. Some blocks are also "containers" of blocks that may be
	 * further divided:
	 * - Containers: html, body, div, blockquote
	 * - Other blocks: head, p, h1, h2, h3, h4, h5, h6, ul, ol, dl, pre,
	 *   address, hr, table, form, fieldset
	 * 
	 * Headings are not written right away: they are kept in the pending
	 * context so that they end up on the same page as the content that
	 * follows them.
	 * 
	 * @param string $block The HTML block to process.
	 */
	private function processBlock($block) {
		$index = 0;
		
		// Loop through the sub-blocks in the block
		while (true) {
			// Note the structure of the block info:
			// 0: The extracted block
	 		// 1: The matched tag name
	 		// 2: The block's starting offset in the string
	 		// 3: The block's starting tag
	 		// 4: The block's ending offset in the string
	 		// 5: The block's ending tag (may be an empty string if empty tag
	 		// or if ending tag was not found)
			$subblockInfo = $this->getNextBlock(
				'html|head|body|div|blockquote|p|h[1-6]|ul|ol|dl|pre|address|hr|table|form|fieldset',
				$block, $index);
			if (!$subblockInfo) {
				break;
			}
			
			// We now have two blocks:
			// - the first one between position $index and the identified block
			// starting at $subblockInfo[2]. This first block is treated as an
			// atomic block.
			// - the second one, which is the identified block ($subblockInfo[0]).
			// This second block may not be atomic.
			$subblock = trim(substr($block, $index, $subblockInfo[2] - $index));
			if ($subblock != "") {
				$this->processAtomicBlock($subblock);
			}
			
			$subblock = $subblockInfo[0];
			if (($subblockInfo[1] == 'html')
			|| ($subblockInfo[1] == 'body')
			|| ($subblockInfo[1] == 'div')
			|| ($subblockInfo[1] == 'blockquote')) {
				// The block is a container that may be divided.
				
				// Add the container to the context
				// (array format: start tag, tag name, end tag)
				array_push($this->currentPendingContext,
					array($subblockInfo[3], $subblockInfo[1], $subblockInfo[5]));
				
				// Recursively parse the sub-block, trimming non-significant spaces
				// at the beginning and at the end of the sub-block. 
				$subblock = trim(substr(
					$subblock,
					strlen($subblockInfo[3]),
					strlen($subblock) - strlen($subblockInfo[3]) - strlen($subblockInfo[5])));
				if ($subblock != '') {
					$this->processBlock($subblock);
				}
				else {
					// The container is empty. Flush the pending context so that
					// the starting tag of the container gets written before its
					// closing tag (the recursive call does it otherwise).
					$this->processAtomicBlock('');
				}
				
				// Process the closing tag on the same page
				// (note closing tag size has already been counted)
				$this->currentPage .= $subblockInfo[5];
				if ($subblockInfo[5]) {
					// Containers are only pushed to the context when they have
					// an ending tag, so only pop the context in that case.
					array_pop($this->currentContext);
				}
			}
			else if (preg_match('/h[1-6]/Usi', $subblockInfo[1])) {
				// The block is a heading. Let's put it aside in case it
				// has to be written to next page.
				array_push($this->currentPendingContext, array($subblock, $subblockInfo[1], ''));
			}
			else {
				// The block cannot be further divided.
				if ($subblockInfo[1] == 'head') {
					// Save head to be able to copy it to other pages
					$this->head = $subblock;
				}				
				$this->processAtomicBlock($subblock);
			}
			
			// Continue with the rest of the block
			$index = $subblockInfo[4];
		}
		
		// No further sub-blocks in this block, let's process the remaining
		// content of this block (note it may be the whole block!) as an
		// atomic block. This also flushes the pending context when the
		// remaining content is empty.
		$subblock = substr($block, $index, strlen($block) - $index);
		$this->processAtomicBlock($subblock);	
	}

	
	/**
	 * Writes an indivisible block to the current page, starting a new page
	 * first when the block does not fit in the current one.
	 * 
	 * A new page is started when adding the block (and the pending contexts
	 * that come with it) would exceed the maximum size or the maximum markup
	 * size, unless the current page is still "rather" empty for the limit
	 * at stake, i.e. filled at less than 10% of that limit.
	 * 
	 * Pending contexts are written to the page right before the block, and
	 * the list of pending contexts is emptied.
	 * 
	 * @param string $block The block to process
	 */
	private function processAtomicBlock($block) {
		// Size of the block, pending contexts included
		$blockSize = $this->computeSize($block);
		$blockMarkupSize = $this->computeMarkupSize($block);
		foreach ($this->currentPendingContext as $pending) {
			$blockSize += $this->computeSize($pending[0]) + $this->computeSize($pending[2]);
			$blockMarkupSize += $this->computeMarkupSize($pending[0]) + $this->computeMarkupSize($pending[2]);
		}
		
		// Would the block overflow one of the limits of a page that is
		// already filled at 10% or more of that limit?
		$overflowsSize = (($this->currentSize + $blockSize) > $this->maxSize)
			&& ($this->currentSize >= ($this->maxSize / 10));
		$overflowsMarkupSize = (($this->currentMarkupSize + $blockMarkupSize) > $this->maxMarkupSize)
			&& ($this->currentMarkupSize >= ($this->maxMarkupSize / 10));
		if ($overflowsSize || $overflowsMarkupSize) {
			$this->closeCurrentPage();
		}
		
		// Write the pending contexts, remembering those that need to be
		// closed later on (i.e. those that have an ending tag)
		foreach ($this->currentPendingContext as $pending) {
			if ($pending[2]) {
				$this->currentContext[] = $pending;
			}
			$this->currentPage .= $pending[0];
		}
		$this->currentPendingContext = array();
		
		// Write the block and update the sizes of the page (the sizes
		// include the closing tags that have not been written yet)
		$this->currentPage .= $block;
		$this->currentSize += $blockSize;
		$this->currentMarkupSize += $blockMarkupSize;
	}
	
	
	/**
	 * Closes current page and starts another page.
	 * 
	 * "Closing" the page means adding the required closing
	 * tags to balance the page. The closed page is added to the list of
	 * pages, unless it is empty.
	 * 
	 * The new page starts with the starting tags of the containers that
	 * were open in the closed page, so that the HTML hierarchy is
	 * reproduced. The head block, if one was found, is copied right after
	 * the starting tag of the html element.
	 */
	private function closeCurrentPage() {
		// Close the containers that are still open, innermost first
		$savedContext = array();
		while ($tag = array_pop($this->currentContext)) {
			array_push($savedContext, $tag);
			$closingTag = "</" . $tag[1] . ">";
			$this->currentPage .= $closingTag;
			$this->currentSize += strlen($closingTag);
			$this->currentMarkupSize += strlen($closingTag);
		}
		
		if ($this->currentPage != "") {
			array_push($this->pages, $this->currentPage);
			$this->currentPage = "";
			$this->currentSize = 0;
			$this->currentMarkupSize = 0;
			
			// Re-open the containers in the new page, outermost first
			while ($tag = array_pop($savedContext)) {
				array_push($this->currentContext, $tag);
				$this->currentPage .= $tag[0];
				$this->currentSize += $this->computeSize($tag[0]);
				$this->currentMarkupSize += $this->computeMarkupSize($tag[0]);
				
				// Output the "head" if one was found. Note $tag is an array
				// (start tag, tag name, end tag): the tag name is at index 1.
				if (($tag[1] == 'html') && ($this->head != '')) {
					$this->currentPage .= $this->head;
					$this->currentSize += $this->computeSize($this->head);
					$this->currentMarkupSize += $this->computeMarkupSize($this->head);
				}
			}
		}
	}
	
	
	/**
	 * Adds the navigation block to each generated page.
	 * 
	 * Nothing is done when there is at most one page. The navigation block
	 * is inserted before the closing body tag when the page has one, and
	 * appended to the end of the page otherwise.
	 */
	private function insertNavigationBlocks() {
		$total = count($this->pages);
		if ($total <= 1) {
			return;
		}
		
		for ($pageNumber = 1; $pageNumber <= $total; $pageNumber++) {
			$slot = $pageNumber - 1;
			$navBlock = $this->getNavigationBlock($pageNumber, $total);
			$page = $this->pages[$slot];
			
			if (stristr($page, "</body>")) {
				$page = str_ireplace('</body>', $navBlock . '</body>', $page);
			}
			else {
				$page .= $navBlock;
			}
			$this->pages[$slot] = $page;
		}
	}
	
	
	/**
	 * Retrieves the navigation block to display for the given page index.
	 * 
	 * The block is built from the nav_block option, in which the following
	 * placeholders are replaced:
	 * - _PREVBLOCK_: the prev_nav_block option, or nothing on the first page
	 * - _NEXTBLOCK_: the next_nav_block option, or nothing on the last page
	 * - _PAGEINDEX_: the index of the page
	 * - _NBPAGES_: the total number of pages
	 * - _PREVURI_ / _NEXTURI_: the HTML-escaped URI of the previous/next page
	 *  
	 * @param int $pageIndex Index of the page being considered (clamped to
	 *   the number of pages).
	 * @param int $nbPages Total number of pages.
	 * @return string The HTML navigation block to add to the end of the page
	 */
	private function getNavigationBlock($pageIndex, $nbPages) {
		$block = $this->getOption('nav_block');
		
		if ($pageIndex > $nbPages) {
			$pageIndex = $nbPages;
		}
		
		$prevBlock = ($pageIndex > 1) ? $this->getOption('prev_nav_block') : '';
		$nextBlock = ($pageIndex < $nbPages) ? $this->getOption('next_nav_block') : '';
		$block = str_replace('_PREVBLOCK_', $prevBlock, $block);
		$block = str_replace('_NEXTBLOCK_', $nextBlock, $block);
		
		$block = str_replace('_PAGEINDEX_', $pageIndex, $block);
		$block = str_replace('_NBPAGES_', $nbPages, $block);
		
		// The base URI typically comes from the HTTP request: escape it
		// (quotes included) before it gets inserted in the markup.
		$prevUri = $this->buildPaginationUri($pageIndex - 1);
		$block = str_replace('_PREVURI_', htmlspecialchars($prevUri, ENT_QUOTES, 'UTF-8'), $block);
		
		$nextUri = $this->buildPaginationUri($pageIndex + 1);
		$block = str_replace('_NEXTURI_', htmlspecialchars($nextUri, ENT_QUOTES, 'UTF-8'), $block);
			
		return $block;
	}
	
	
	/**
	 * Builds the URI of the page at the given index, based on the base_uri
	 * option.
	 * 
	 * The "pagination" query parameter that may already be present in the
	 * base URI is removed, and a new one is appended, using '?' or '&' as
	 * separator depending on whether the URI already has a query string.
	 * 
	 * @param int $pageIndex Index of the targeted page.
	 * @return string The (non escaped) URI of the targeted page.
	 */
	private function buildPaginationUri($pageIndex) {
		$uri = $this->getOption('base_uri');
		
		// Remove the whole existing parameter (all digits), keeping the
		// separator that precedes it so that following parameters stay valid
		$uri = preg_replace('/([?&])pagination=[0-9]+(?:&|$)/i', '$1', $uri);
		
		// Drop the separator that may be left dangling at the end
		$uri = rtrim($uri, '?&');
		
		// Note strpos returns false when the needle is not found, which
		// must not be mistaken for position 0
		$uri .= (strpos($uri, '?') !== false) ? '&' : '?';
		
		return $uri . 'pagination=' . $pageIndex;
	}
	
	
	/**
	 * Computes the size of the HTML block.
	 * 
	 * The size is the markup size of the block plus the size of each
	 * img/object element found in the block with a quoted "src" attribute,
	 * as returned by computeImageSize().
	 * 
	 * TODO: handle object elements in not too stupid a way,
	 * à la mobileOK (with stricter rules). 
	 * 
	 * @param string $block The html code for which the size is to be computed.
	 * @return int The size of the html content in bytes.
	 */
	private function computeSize($block) {
		// TODO: extract width and height when defined for a closer
		// approximation of the image size when it cannot be computed.
		$matches = array();
		preg_match_all("/\<(img|object)([ \t\r\n].*)?[ \t\r\n]src=((?:'[^']*')|(?:\"[^\"]*\")).*\>/Usi", $block, $matches, PREG_SET_ORDER);
		
		// Third group is the quoted value of the src attribute
		$imagesSize = 0;
		foreach ($matches as $match) {
			$imagesSize += $this->computeImageSize(trim($match[3], " \t\n\r\0\x0B'\""));
		}
		
		return $this->computeMarkupSize($block) + $imagesSize;
	}
	
	
	/**
	 * Computes the markup size of the HTML block, i.e. the length in bytes
	 * of the string.
	 * 
	 * @param string $block The html code for which the size is to be computed.
	 * @return int The size of the html content in bytes.
	 */
	private function computeMarkupSize($block) {
		return strlen($block);
	}
	
	
	/**
	 * Computes the size of the image identified by its URI.
	 * 
	 * The URI is mapped to a file name through mapUriToFile(), using the
	 * base_uri option and the URI mappings.
	 *  
	 * @param string $uri The image source URI.
	 * @return int The size in bytes of the file plus 300 bytes (to account
	 *   for HTTP headers) when the URI maps to an existing file, the
	 *   default image size (10Kb) otherwise.
	 */
	private function computeImageSize($uri) {
		$localFile = $this->mapUriToFile(
			$uri,
			$this->getOption('base_uri'),
			$this->uriMappings);
		
		if (!$localFile || !file_exists($localFile)) {
			// No way to compute the size of the image
			// TODO: if we knew the width and height of the image,
			// we could make a more educated guess.
			return self::$DEFAULT_IMAGE_SIZE;
		}
		
		// Add a fixed size to account for HTTP headers
		return filesize($localFile) + 300;
	}
}

?>