diff options
author | Bernhard Posselt <nukeawhale@gmail.com> | 2013-08-29 13:30:38 +0200 |
---|---|---|
committer | Bernhard Posselt <nukeawhale@gmail.com> | 2013-08-29 13:30:38 +0200 |
commit | 4d7f53380d31154709faa3f9d6cdc467ff141951 (patch) | |
tree | fdbe6e2e76b7b3c3483cc50216aaac8ffd34dc07 | |
parent | a73fe145a2856d6d075f8541f28c70b5cf01e1db (diff) |
Allow more than one article enhancer per URL based on the URL regex; also allow embedded YouTube videos whose URLs are protocol-relative (start with //).
-rw-r--r-- | dependencyinjection/dicontainer.php | 4 | ||||
-rw-r--r-- | tests/unit/utility/articleenhancer/ArticleEnhancerTest.php | 45 | ||||
-rw-r--r-- | utility/articleenhancer/articleenhancer.php | 46 | ||||
-rw-r--r-- | utility/articleenhancer/cyanideandhappinessenhancer.php | 6 |
4 files changed, 67 insertions, 34 deletions
diff --git a/dependencyinjection/dicontainer.php b/dependencyinjection/dicontainer.php index 9d9a085b0..de49236c6 100644 --- a/dependencyinjection/dicontainer.php +++ b/dependencyinjection/dicontainer.php @@ -109,7 +109,9 @@ class DIContainer extends BaseContainer { $config->set('Cache.SerializerPath', $directory); $config->set('HTML.SafeIframe', true); $config->set('URI.SafeIframeRegexp', - '%^http://(www.youtube(?:-nocookie)?.com/embed/|player.vimeo.com/video/)%'); //allow YouTube and Vimeo + '%^(?:https?:)?//(' . + 'www.youtube(?:-nocookie)?.com/embed/|' . + 'player.vimeo.com/video/)%'); //allow YouTube and Vimeo return new \HTMLPurifier($config); }); diff --git a/tests/unit/utility/articleenhancer/ArticleEnhancerTest.php b/tests/unit/utility/articleenhancer/ArticleEnhancerTest.php index c808a0e49..5f82a4752 100644 --- a/tests/unit/utility/articleenhancer/ArticleEnhancerTest.php +++ b/tests/unit/utility/articleenhancer/ArticleEnhancerTest.php @@ -31,10 +31,10 @@ require_once(__DIR__ . 
"/../../../classloader.php"); class TestEnhancer extends ArticleEnhancer { - public function __construct($purifier, $fileFactory, $articleRegex, - $articleXPATH, $timeout){ - parent::__construct($purifier, $fileFactory, $articleRegex, - $articleXPATH, $timeout); + public function __construct($purifier, $fileFactory, $regexXPathPair, + $timeout){ + parent::__construct($purifier, $fileFactory, $regexXPathPair, + $timeout); } } @@ -56,8 +56,10 @@ class ArticleEnhancerTest extends \OCA\AppFramework\Utility\TestUtility { $this->testEnhancer = new TestEnhancer( $this->purifier, $this->fileFactory, - '/explosm.net\/comics/', - '//*[@id=\'maincontent\']/div[2]/img', + array( + '/explosm.net\/comics/' => '//*[@id=\'maincontent\']/div[2]/div/img', + '/explosm.net\/shorts/' => '//*[@id=\'maincontent\']/div[2]/div' + ), $this->timeout ); } @@ -76,7 +78,7 @@ class ArticleEnhancerTest extends \OCA\AppFramework\Utility\TestUtility { <body> <div id="maincontent"> <div>nooo</div> - <div><img src="hiho"></div> + <div><div><img src="hiho"></div></div> </div> </body> </html>'; @@ -99,6 +101,35 @@ class ArticleEnhancerTest extends \OCA\AppFramework\Utility\TestUtility { } + public function testDoesModifiyAllArticlesThatMatch() { + $file = new \stdClass; + $file->body = '<html> + <body> + <div id="maincontent"> + <div>nooo</div> + <div><div>rawr</div></div> + </div> + </body> + </html>'; + $item = new Item(); + $item->setUrl('https://www.explosm.net/shorts/312'); + $item->setBody('Hello thar'); + + $this->fileFactory->expects($this->once()) + ->method('getFile') + ->with($this->equalTo($item->getUrl()), + $this->equalTo($this->timeout)) + ->will($this->returnValue($file)); + $this->purifier->expects($this->once()) + ->method('purify') + ->with($this->equalTo('<div>rawr</div>')) + ->will($this->returnValue('<div>rawr</div>')); + + $result = $this->testEnhancer->enhance($item); + $this->assertEquals('<div>rawr</div>', $result->getBody()); + } + + public function 
testModificationHandlesEmptyResults() { $file = new \stdClass; $file->body = '<html> diff --git a/utility/articleenhancer/articleenhancer.php b/utility/articleenhancer/articleenhancer.php index 194137e72..7fc67c660 100644 --- a/utility/articleenhancer/articleenhancer.php +++ b/utility/articleenhancer/articleenhancer.php @@ -32,8 +32,6 @@ abstract class ArticleEnhancer { private $feedRegex; - private $articleUrlRegex; - private $articleXPath; private $purifier; private $fileFactory; private $maximumTimeout; @@ -43,38 +41,38 @@ abstract class ArticleEnhancer { * @param $purifier the purifier object to clean the html which will be * matched * @param SimplePieFileFactory a factory for getting a simple pie file instance - * @param string $articleUrlRegex the regex to match which article should be - * handled - * @param string $articleXPath the xpath which tells the fetcher with what - * body the feed should be replaced + * @param array $regexXPathPair an associative array containing regex to + * match the url and the xpath that should be used for it to extract the + * page * @param int $maximumTimeout maximum timeout in seconds */ public function __construct($purifier, SimplePieFileFactory $fileFactory, - $articleUrlRegex, $articleXPath, - $maximumTimeout=10){ + array $regexXPathPair, $maximumTimeout=10){ $this->purifier = $purifier; - $this->articleUrlRegex = $articleUrlRegex; - $this->articleXPath = $articleXPath; + $this->regexXPathPair = $regexXPathPair; $this->fileFactory = $fileFactory; - $this->timeout = $maximumTimeout; + $this->maximumTimeout = $maximumTimeout; } public function enhance($item){ - if(preg_match($this->articleUrlRegex, $item->getUrl())) { - $file = $this->fileFactory->getFile($item->getUrl(), $this->maximumTimeout); - $dom = new \DOMDocument(); - @$dom->loadHTML($file->body); - $xpath = new \DOMXpath($dom); - $xpathResult = $xpath->evaluate($this->articleXPath); - - // in case it wasnt a text query assume its a single - 
if(!is_string($xpathResult)) { - $xpathResult = $this->domToString($xpathResult); + foreach($this->regexXPathPair as $regex => $search) { + + if(preg_match($regex, $item->getUrl())) { + $file = $this->fileFactory->getFile($item->getUrl(), $this->maximumTimeout); + $dom = new \DOMDocument(); + @$dom->loadHTML($file->body); + $xpath = new \DOMXpath($dom); + $xpathResult = $xpath->evaluate($search); + + // in case it wasnt a text query assume its a single + if(!is_string($xpathResult)) { + $xpathResult = $this->domToString($xpathResult); + } + + $sanitizedResult = $this->purifier->purify($xpathResult); + $item->setBody($sanitizedResult); } - - $sanitizedResult = $this->purifier->purify($xpathResult); - $item->setBody($sanitizedResult); } return $item; diff --git a/utility/articleenhancer/cyanideandhappinessenhancer.php b/utility/articleenhancer/cyanideandhappinessenhancer.php index 1faee6d5c..037a3179e 100644 --- a/utility/articleenhancer/cyanideandhappinessenhancer.php +++ b/utility/articleenhancer/cyanideandhappinessenhancer.php @@ -36,8 +36,10 @@ class CyanideAndHappinessEnhancer extends ArticleEnhancer { parent::__construct( $purifier, $fileFactory, - '/explosm.net\/comics/', // match article url - '//*[@id=\'maincontent\']/div[2]/div', // xpath statement to extract the html from the page + array( + '/explosm.net\/comics/' => '//*[@id=\'maincontent\']/div[2]/div', + '/explosm.net\/show/' => '//*[@id=\'videoPlayer\']/iframe' + ), $timeout ); } |