From 4d7f53380d31154709faa3f9d6cdc467ff141951 Mon Sep 17 00:00:00 2001 From: Bernhard Posselt Date: Thu, 29 Aug 2013 13:30:38 +0200 Subject: allow more than one article enhancer per url based on the url regex, also allow embedded youtube videos that start with // --- utility/articleenhancer/articleenhancer.php | 46 +++++++++++----------- .../cyanideandhappinessenhancer.php | 6 ++- 2 files changed, 26 insertions(+), 26 deletions(-) (limited to 'utility') diff --git a/utility/articleenhancer/articleenhancer.php b/utility/articleenhancer/articleenhancer.php index 194137e72..7fc67c660 100644 --- a/utility/articleenhancer/articleenhancer.php +++ b/utility/articleenhancer/articleenhancer.php @@ -32,8 +32,6 @@ abstract class ArticleEnhancer { private $feedRegex; - private $articleUrlRegex; - private $articleXPath; private $purifier; private $fileFactory; private $maximumTimeout; @@ -43,38 +41,38 @@ abstract class ArticleEnhancer { * @param $purifier the purifier object to clean the html which will be * matched * @param SimplePieFileFactory a factory for getting a simple pie file instance - * @param string $articleUrlRegex the regex to match which article should be - * handled - * @param string $articleXPath the xpath which tells the fetcher with what - * body the feed should be replaced + * @param array $regexXPathPair an associative array containing regex to + * match the url and the xpath that should be used for it to extract the + * page * @param int $maximumTimeout maximum timeout in seconds */ public function __construct($purifier, SimplePieFileFactory $fileFactory, - $articleUrlRegex, $articleXPath, - $maximumTimeout=10){ + array $regexXPathPair, $maximumTimeout=10){ $this->purifier = $purifier; - $this->articleUrlRegex = $articleUrlRegex; - $this->articleXPath = $articleXPath; + $this->regexXPathPair = $regexXPathPair; $this->fileFactory = $fileFactory; - $this->timeout = $maximumTimeout; + $this->maximumTimeout = $maximumTimeout; } public function enhance($item){ - if(preg_match($this->articleUrlRegex, $item->getUrl())) { - $file = $this->fileFactory->getFile($item->getUrl(), $this->maximumTimeout); - $dom = new \DOMDocument(); - @$dom->loadHTML($file->body); - $xpath = new \DOMXpath($dom); - $xpathResult = $xpath->evaluate($this->articleXPath); - - // in case it wasnt a text query assume its a single - if(!is_string($xpathResult)) { - $xpathResult = $this->domToString($xpathResult); + foreach($this->regexXPathPair as $regex => $search) { + + if(preg_match($regex, $item->getUrl())) { + $file = $this->fileFactory->getFile($item->getUrl(), $this->maximumTimeout); + $dom = new \DOMDocument(); + @$dom->loadHTML($file->body); + $xpath = new \DOMXpath($dom); + $xpathResult = $xpath->evaluate($search); + + // in case it wasnt a text query assume its a single + if(!is_string($xpathResult)) { + $xpathResult = $this->domToString($xpathResult); + } + + $sanitizedResult = $this->purifier->purify($xpathResult); + $item->setBody($sanitizedResult); } - - $sanitizedResult = $this->purifier->purify($xpathResult); - $item->setBody($sanitizedResult); } return $item; diff --git a/utility/articleenhancer/cyanideandhappinessenhancer.php b/utility/articleenhancer/cyanideandhappinessenhancer.php index 1faee6d5c..037a3179e 100644 --- a/utility/articleenhancer/cyanideandhappinessenhancer.php +++ b/utility/articleenhancer/cyanideandhappinessenhancer.php @@ -36,8 +36,10 @@ class CyanideAndHappinessEnhancer extends ArticleEnhancer { parent::__construct( $purifier, $fileFactory, - '/explosm.net\/comics/', // match article url - '//*[@id=\'maincontent\']/div[2]/div', // xpath statement to extract the html from the page + array( + '/explosm.net\/comics/' => '//*[@id=\'maincontent\']/div[2]/div', + '/explosm.net\/show/' => '//*[@id=\'videoPlayer\']/iframe' + ), $timeout ); } -- cgit v1.2.3