From a9eb72911b6f022da645dc08cf8c0f4b1702d1e1 Mon Sep 17 00:00:00 2001
From: Bernhard Posselt
Date: Wed, 28 Aug 2013 17:26:38 +0200
Subject: add enhancers for articles, fix #14

---
 utility/articleenhancer/articleenhancer.php | 112 +++++++++++++++++++++
 .../cyanideandhappinessenhancer.php | 46 +++++++++
 utility/articleenhancer/defaultenhancer.php | 49 +++++++++
 utility/articleenhancer/enhancer.php | 52 ++++++++++
 utility/simplepiefilefactory.php | 35 +++++++
 5 files changed, 294 insertions(+)
 create mode 100644 utility/articleenhancer/articleenhancer.php
 create mode 100644 utility/articleenhancer/cyanideandhappinessenhancer.php
 create mode 100644 utility/articleenhancer/defaultenhancer.php
 create mode 100644 utility/articleenhancer/enhancer.php
 create mode 100644 utility/simplepiefilefactory.php

(limited to 'utility')

diff --git a/utility/articleenhancer/articleenhancer.php b/utility/articleenhancer/articleenhancer.php
new file mode 100644
index 000000000..d7701d53b
--- /dev/null
+++ b/utility/articleenhancer/articleenhancer.php
@@ -0,0 +1,112 @@
+.
+*
+*/
+
+namespace OCA\News\Utility\ArticleEnhancer;
+
+use \OCA\News\Utility\SimplePieFileFactory;
+
+
+/**
+ * Base class for enhancers which replace the body of a feed item with
+ * markup taken from the page the item links to. The page is fetched through
+ * the file factory, the wanted part is selected with an xpath expression and
+ * the selection is run through the purifier before it is stored on the item.
+ */
+abstract class ArticleEnhancer {
+
+
+    private $articleUrlRegex;
+    private $articleXPath;
+    private $purifier;
+    private $fileFactory;
+    private $maximumTimeout;
+
+
+    /**
+     * @param $purifier the purifier object to clean the html which will be
+     * matched
+     * @param SimplePieFileFactory $fileFactory a factory for getting a simple
+     * pie file instance
+     * @param string $articleUrlRegex the regex to match which article should be
+     * handled
+     * @param string $articleXPath the xpath which tells the fetcher with what
+     * part of the linked page the body of the item should be replaced
+     * @param int $maximumTimeout maximum timeout in seconds
+     */
+    public function __construct($purifier, SimplePieFileFactory $fileFactory,
+                                $articleUrlRegex, $articleXPath,
+                                $maximumTimeout=10){
+        $this->purifier = $purifier;
+        $this->articleUrlRegex = $articleUrlRegex;
+        $this->articleXPath = $articleXPath;
+        $this->fileFactory = $fileFactory;
+        // has to be the attribute that enhance() reads, otherwise the
+        // configured timeout never reaches the file factory
+        $this->maximumTimeout = $maximumTimeout;
+    }
+
+
+    /**
+     * @param $item the item whose url is checked
+     * @return bool true if the url of the item matches the article regex
+     */
+    public function canHandle($item){
+        // preg_match returns 1 on a match, 0 on no match and false on error
+        return preg_match($this->articleUrlRegex, $item->getUrl()) === 1;
+    }
+
+
+    /**
+     * Fetches the page behind the url of the item and replaces the body of
+     * the item with the purified result of the xpath query
+     * @param $item the item which should be enhanced
+     * @return the passed item; it is returned untouched if the fetched file
+     * has no body
+     */
+    public function enhance($item){
+        $file = $this->fileFactory->getFile($item->getUrl(), $this->maximumTimeout);
+        $body = $file->body;
+
+        // nothing to parse: keep the body that is already set on the item
+        // instead of overwriting it with an empty string
+        if(!is_string($body) || trim($body) === '') {
+            return $item;
+        }
+
+        // real world markup is rarely valid, so collect the parser errors
+        // internally instead of emitting warnings, then restore the setting
+        $previousSetting = libxml_use_internal_errors(true);
+        $dom = new \DOMDocument();
+        $dom->loadHTML($body);
+        libxml_clear_errors();
+        libxml_use_internal_errors($previousSetting);
+
+        $xpath = new \DOMXpath($dom);
+        $xpathResult = $xpath->evaluate($this->articleXPath);
+
+        // in case it wasnt a text query assume its a single element
+        if(!is_string($xpathResult)) {
+            $xpathResult = $this->domToString($xpathResult);
+        }
+
+        $sanitizedResult = $this->purifier->purify($xpathResult);
+        $item->setBody($sanitizedResult);
+
+        return $item;
+    }
+
+
+    /**
+     * Method which turns an xpath result to a string
+     * Assumes that the result matches a single element. If the result
+     * is not a single element, you can customize it by overwriting this
+     * method
+     * @param $xpathResult the result from the xpath query
+     * @return the first matched element as a string or an empty string if
+     * the result is not a node list or the node list is empty
+     */
+    protected function domToString($xpathResult) {
+        // evaluate() does not always return a node list, so check the type
+        // before reading length
+        if($xpathResult instanceof \DOMNodeList && $xpathResult->length > 0) {
+            return $this->toInnerHTML($xpathResult->item(0));
+        }
+
+        return "";
+    }
+
+
+    /**
+     * Serializes a node by importing a deep copy of it into an empty
+     * document and saving that document as html. The markup of the node
+     * itself is part of the output.
+     * @param $node the dom node which should be serialized
+     * @return the trimmed html string
+     */
+    protected function toInnerHTML($node) {
+        $dom = new \DOMDocument();
+        $dom->appendChild($dom->importNode($node, true));
+        return trim($dom->saveHTML());
+    }
+
+
+}
\ No newline at end of file
diff --git a/utility/articleenhancer/cyanideandhappinessenhancer.php b/utility/articleenhancer/cyanideandhappinessenhancer.php
new file mode 100644
index 000000000..1faee6d5c
--- /dev/null
+++ b/utility/articleenhancer/cyanideandhappinessenhancer.php
@@ -0,0 +1,46 @@
+.
+*
+*/
+
+namespace OCA\News\Utility\ArticleEnhancer;
+
+use \OCA\News\Utility\SimplePieFileFactory;
+
+
+/**
+ * Enhancer for items whose url points to explosm.net/comics
+ */
+class CyanideAndHappinessEnhancer extends ArticleEnhancer {
+
+
+    /**
+     * @param SimplePieFileFactory $fileFactory a factory for getting a simple
+     * pie file instance
+     * @param $purifier the purifier object to clean the extracted html
+     * @param int $timeout maximum timeout in seconds
+     */
+    public function __construct(SimplePieFileFactory $fileFactory, $purifier,
+                                $timeout){
+        parent::__construct(
+            $purifier,
+            $fileFactory,
+            // match article url, the dot is escaped so it only matches a
+            // literal dot
+            '/explosm\.net\/comics/',
+            '//*[@id=\'maincontent\']/div[2]/div', // xpath statement to extract the html from the page
+            $timeout
+        );
+    }
+
+
+}
\ No newline at end of file
diff --git a/utility/articleenhancer/defaultenhancer.php b/utility/articleenhancer/defaultenhancer.php
new file mode 100644
index 000000000..eb3045ceb
--- /dev/null
+++ b/utility/articleenhancer/defaultenhancer.php
@@ -0,0 +1,49 @@
+.
+*
+*/
+
+namespace OCA\News\Utility\ArticleEnhancer;
+
+use \OCA\News\Utility\SimplePieFileFactory;
+
+
+/**
+ * Enhancer which accepts every item and hands it back without changing it
+ */
+class DefaultEnhancer extends ArticleEnhancer {
+
+
+    public function __construct(){
+        // canHandle and enhance are both overwritten below, so no purifier,
+        // regex, xpath or timeout is passed to the parent
+        parent::__construct(null, new SimplePieFileFactory(), null, null, null);
+    }
+
+
+    /**
+     * @param $item the item which should be checked
+     * @return bool always true
+     */
+    public function canHandle($item){
+        return true;
+    }
+
+
+    /**
+     * @param $item the item which should be enhanced
+     * @return the passed item without any changes
+     */
+    public function enhance($item){
+        return $item;
+    }
+
+
+}
\ No newline at end of file
diff --git a/utility/articleenhancer/enhancer.php b/utility/articleenhancer/enhancer.php
new file mode 100644
index 000000000..059904f63
--- /dev/null
+++ b/utility/articleenhancer/enhancer.php
@@ -0,0 +1,52 @@
+.
+*
+*/
+
+namespace OCA\News\Utility\ArticleEnhancer;
+
+
+/**
+ * Keeps a list of article enhancers and passes an item to the first
+ * registered enhancer which can handle it
+ */
+class Enhancer {
+
+    private $enhancers;
+
+    public function __construct(){
+        $this->enhancers = array();
+    }
+
+
+    /**
+     * Adds an enhancer to the end of the list. Enhancers are asked in the
+     * order in which they were registered
+     * @param ArticleEnhancer $enhancer the enhancer which should be added
+     */
+    public function registerEnhancer(ArticleEnhancer $enhancer){
+        $this->enhancers[] = $enhancer;
+    }
+
+
+    /**
+     * @param $item the item which should be enhanced
+     * @return the result of the first enhancer which can handle the item or
+     * the unchanged item if no registered enhancer can handle it
+     */
+    public function enhance($item){
+        foreach($this->enhancers as $enhancer){
+            if($enhancer->canHandle($item)){
+                return $enhancer->enhance($item);
+            }
+        }
+
+        // without this return the caller would receive null and lose the
+        // item when no enhancer matches
+        return $item;
+    }
+
+
+}
\ No newline at end of file
diff --git a/utility/simplepiefilefactory.php b/utility/simplepiefilefactory.php
new file mode 100644
index 000000000..13b56dc9e
--- /dev/null
+++ b/utility/simplepiefilefactory.php
@@ -0,0 +1,35 @@
+.
+*
+*/
+
+namespace OCA\News\Utility;
+
+
+/**
+ * Factory which creates SimplePie_File instances
+ */
+class SimplePieFileFactory {
+
+    /**
+     * @param string $url the url which is passed to the file
+     * @param int $timeout the timeout which is passed to the file
+     * @return \SimplePie_File a new file instance
+     */
+    public function getFile($url, $timeout) {
+        return new \SimplePie_File($url, $timeout);
+    }
+
+}
\ No newline at end of file
--
cgit v1.2.3