. * */

namespace OCA\News\Utility\ArticleEnhancer;

use \OCA\News\Utility\SimplePieFileFactory;


/**
 * Base class for enhancers that replace the body of a feed item with content
 * extracted from the web page the item links to.
 *
 * Each enhancer is configured with pairs of "url regex => xpath query". For
 * every regex that matches the item's url the page is downloaded, the xpath
 * query is evaluated against it and the purified result becomes the new body
 * of the item.
 */
abstract class ArticleEnhancer {

    /**
     * Kept protected (not private): this used to be an implicitly created
     * dynamic property, so subclasses may already be reading it.
     *
     * @var array associative array of url regex => xpath query
     */
    protected $regexXPathPair;
    private $purifier;
    private $fileFactory;
    private $maximumTimeout;


    /**
     * @param $purifier the purifier object to clean the html which will be
     * matched
     * @param SimplePieFileFactory $fileFactory a factory for getting a simple
     * pie file instance
     * @param array $regexXPathPair an associative array containing regex to
     * match the url and the xpath that should be used for it to extract the
     * page
     * @param int $maximumTimeout maximum timeout in seconds, defaults to 10 sec
     */
    public function __construct($purifier, SimplePieFileFactory $fileFactory,
                                array $regexXPathPair, $maximumTimeout=10){
        $this->purifier = $purifier;
        $this->regexXPathPair = $regexXPathPair;
        $this->fileFactory = $fileFactory;
        $this->maximumTimeout = $maximumTimeout;
    }


    /**
     * Replaces the body of the item with the content extracted from the page
     * behind the item's url.
     *
     * Every configured regex is tried in order; if several match, each one
     * triggers its own download and the last non-empty result wins. If the
     * page could not be loaded or the xpath query produced nothing, the
     * current body of the item is left untouched.
     *
     * @param $item the item to enhance, has to provide getUrl() and setBody()
     * @return the item that was passed in
     */
    public function enhance($item){
        $url = $item->getUrl();

        foreach($this->regexXPathPair as $regex => $search) {
            if(!preg_match($regex, $url)) {
                continue;
            }

            $file = $this->fileFactory->getFile($url, $this->maximumTimeout);
            $dom = $this->loadDocument(isset($file->body) ? $file->body : null);

            // nothing usable was downloaded, keep the body from the feed
            if($dom === null) {
                continue;
            }

            $xpath = new \DOMXPath($dom);
            $xpathResult = $xpath->evaluate($search);

            // in case it wasn't a text query assume it's a node list
            if(!is_string($xpathResult)) {
                $xpathResult = $this->domToString($xpathResult);
            }

            // do not wipe the existing body if the query matched nothing
            if(is_string($xpathResult) && trim($xpathResult) === '') {
                continue;
            }

            $sanitizedResult = $this->purifier->purify($xpathResult);
            $item->setBody($sanitizedResult);
        }

        return $item;
    }


    /**
     * Parses an html string into a DOM document without emitting warnings
     * for malformed markup.
     *
     * @param $html the html source, may be null or empty
     * @return \DOMDocument|null the parsed document or null if there was
     * nothing to parse or parsing failed
     */
    private function loadDocument($html) {
        // loadHTML rejects an empty source (ValueError on PHP 8)
        if(!is_string($html) || trim($html) === '') {
            return null;
        }

        $dom = new \DOMDocument();

        // real world html is rarely valid: collect the parser errors
        // internally instead of silencing everything with @
        $previousSetting = libxml_use_internal_errors(true);
        $loaded = $dom->loadHTML($html);
        libxml_clear_errors();
        libxml_use_internal_errors($previousSetting);

        return $loaded ? $dom : null;
    }


    /**
     * Method which turns an xpath result to a string
     * Assumes that the result matches a single element. If the result
     * is not a single element, you can customize it by overwriting this
     * method
     * @param $xpathResult the result from the xpath query
     * @return the first matched node as html string or an empty string if
     * the result is not a node list or the list is empty
     */
    protected function domToString($xpathResult) {
        // evaluate() may also hand back false or a number, only node lists
        // have a length and items
        if($xpathResult instanceof \DOMNodeList && $xpathResult->length > 0) {
            return $this->toInnerHTML($xpathResult->item(0));
        }

        return "";
    }


    /**
     * Serializes a node to an html string.
     *
     * The node is deep-copied into a fresh document and that document is
     * saved, so the output contains the node's own tag together with all of
     * its children.
     *
     * @param $node the DOM node that should be serialized
     * @return the trimmed html of the node
     */
    protected function toInnerHTML($node) {
        $dom = new \DOMDocument();
        $dom->appendChild($dom->importNode($node, true));
        return trim($dom->saveHTML());
    }

}