diff options
author | Bernhard Posselt <dev@bernhard-posselt.com> | 2015-08-10 20:20:30 +0200 |
---|---|---|
committer | Bernhard Posselt <dev@bernhard-posselt.com> | 2015-08-12 17:05:18 +0200 |
commit | 53679811da855acf9bd944a389a48399ca5d5a15 (patch) | |
tree | fa75e06a965fb5751017288a5c135bc179574210 /articleenhancer/xpatharticleenhancer.php | |
parent | c77a6705d34c81cb933f3d4b83eb18e2b586035a (diff) |
serverside full text
remove enhancers
add full text client side implementation
fix bugs and tests for full text feed
Diffstat (limited to 'articleenhancer/xpatharticleenhancer.php')
-rw-r--r-- | articleenhancer/xpatharticleenhancer.php | 196 |
1 files changed, 0 insertions, 196 deletions
<?php
/**
 * ownCloud - News
 *
 * This file is licensed under the Affero General Public License version 3 or
 * later. See the COPYING file.
 *
 * @author Alessandro Cosentino <cosenal@gmail.com>
 * @author Bernhard Posselt <dev@bernhard-posselt.com>
 * @copyright Alessandro Cosentino 2012
 * @copyright Bernhard Posselt 2012, 2014
 */

namespace OCA\News\ArticleEnhancer;

use DOMDocument;
use DOMXpath;

use PicoFeed\Encoding\Encoding;

use OCA\News\Utility\PicoFeedClientFactory;

use OCA\News\Db\Item;


/**
 * Article enhancer that replaces an item's body with content extracted from
 * the item's web page: for every configured regex that matches the item URL
 * the page is downloaded and the associated XPath expression is evaluated
 * against it.
 */
class XPathArticleEnhancer implements ArticleEnhancer {

    private $clientFactory;
    private $regexXPathPair;


    /**
     * @param \OCA\News\Utility\PicoFeedClientFactory $clientFactory factory
     * used to build the HTTP client that downloads the article page
     * @param array $regexXPathPair an associative array containing regex to
     * match the url and the xpath that should be used for it to extract the
     * page
     */
    public function __construct(PicoFeedClientFactory $clientFactory,
                                array $regexXPathPair){
        $this->clientFactory = $clientFactory;
        $this->regexXPathPair = $regexXPathPair;
    }


    /**
     * Replaces the body of the item with the part of its web page that is
     * selected by the first-party XPath expressions configured for its URL.
     * The item is left untouched if no regex matches, the page is empty or
     * cannot be parsed, or the XPath expression yields nothing.
     *
     * @param \OCA\News\Db\Item $item
     * @return \OCA\News\Db\Item enhanced item
     */
    public function enhance(Item $item){

        foreach($this->regexXPathPair as $regex => $search) {

            if(!preg_match($regex, $item->getUrl())) {
                continue;
            }

            $body = $this->getFile($item->getUrl());

            // First check if either <meta charset="..."> or
            // <meta http-equiv="Content-Type" ...> is specified and use it
            // If this fails use mb_detect_encoding()
            // (kept in its own variable so the loop key is not clobbered)
            $charsetRegex =
                '/<meta\s+[^>]*(?:charset\s*=\s*[\'"]([^>\'"]*)[\'"]' .
                '|http-equiv\s*=\s*[\'"]content-type[\'"]\s+[^>]*' .
                'content\s*=\s*[\'"][^>]*charset=([^>]*)[\'"])[^>]*>' .
                '/i';
            if(preg_match($charsetRegex, $body, $matches)) {
                // the charset is always in the last captured group,
                // regardless of which alternative matched
                $enc = strtoupper($matches[sizeof($matches) - 1]);
            } else {
                $enc = mb_detect_encoding($body);
            }
            $enc = $enc ? $enc : 'UTF-8';
            $body = mb_convert_encoding($body, 'HTML-ENTITIES', $enc);
            if (trim($body) === '') {
                return $item;
            }

            $dom = new DOMDocument();
            // libxml warnings about malformed real-world HTML are expected,
            // only a hard parse failure matters
            if(!@$dom->loadHTML($body)) {
                // nothing usable was parsed, so do not bother querying it
                continue;
            }

            $xpath = new DOMXpath($dom);
            $xpathResult = $xpath->evaluate($search);

            // evaluate() signals a malformed expression with false
            if($xpathResult === false) {
                continue;
            }

            // in case it wasn't a text query assume it's a dom element and
            // convert it to text
            if(!is_string($xpathResult)) {
                $xpathResult = $this->domToString($xpathResult);
            }

            $xpathResult = trim($xpathResult);

            // convert all relative to absolute URLs
            $xpathResult = $this->substituteRelativeLinks(
                $xpathResult, $item->getUrl()
            );

            if($xpathResult !== false && $xpathResult !== '') {
                $item->setBody($xpathResult);
            }
        }

        return $item;
    }


    /**
     * Downloads the given URL with a client built by the injected factory
     *
     * @param string $url the URL of the page to download
     * @return string the content returned by the client
     */
    private function getFile($url) {
        $client = $this->clientFactory->build();
        // the user agent has to be configured before the request is executed,
        // otherwise it is never sent
        $client->setUserAgent('Mozilla/5.0 AppleWebKit');
        $client->execute($url);
        return $client->getContent();
    }


    /**
     * Method which converts all relative "href" and "src" URLs of
     * a HTML snippet with their absolute equivalent
     * @param string $xmlString a HTML snippet as string with the relative URLs
     * to be replaced
     * @param string $absoluteUrl the appropriate absolute url of the HTML
     * snippet
     * @return string the result HTML snippet as a string, or an empty string
     * if the snippet is empty or could not be parsed
     */
    protected function substituteRelativeLinks($xmlString, $absoluteUrl) {
        if($xmlString === '') {
            return '';
        }

        $dom = new DOMDocument();
        $dom->preserveWhiteSpace = false;

        // wrap the snippet so that it has a single root element
        $xmlString = '<div>' . $xmlString . '</div>';
        $isOk = @$dom->loadHTML($xmlString, LIBXML_HTML_NOIMPLIED |
                                            LIBXML_HTML_NODEFDTD);

        if(!$isOk) {
            return '';
        }

        // one xpath object is enough for both attribute queries
        $xpath = new DOMXpath($dom);

        foreach (['href', 'src'] as $attribute) {
            // select every element whose attribute is neither an absolute
            // URL, a mailto link nor a protocol relative URL
            $xpathResult = $xpath->query(
                "//*[@" . $attribute . " " .
                "and not(contains(@" . $attribute . ", '://')) " .
                "and not(starts-with(@" . $attribute . ", 'mailto:')) " .
                "and not(starts-with(@" . $attribute . ", '//'))]");
            foreach ($xpathResult as $linkNode) {
                $urlElement = $linkNode->attributes->getNamedItem($attribute);
                $abs = $this->relativeToAbsoluteUrl(
                    $urlElement->nodeValue, $absoluteUrl
                );
                $urlElement->nodeValue = htmlspecialchars($abs);
            }
        }

        $xmlString = $dom->saveHTML();

        // domdocument spoils the string with line breaks between the elements
        // strip them
        $xmlString = str_replace("\n", '', $xmlString);

        return $xmlString;
    }


    /**
     * Method which builds a URL by taking a relative URL and its corresponding
     * absolute URL
     * @param string $relativeUrl the relative URL
     * @param string $absoluteUrl the absolute URL with at least scheme and host
     * @return string the resulting absolute URL
     */
    protected function relativeToAbsoluteUrl($relativeUrl, $absoluteUrl) {
        $base = new \Net_URL2($absoluteUrl);
        return $base->resolve($relativeUrl);
    }


    /**
     * Method which turns an xpath result to a string
     * you can customize it by overwriting this method
     * @param mixed $xpathResult the result from the xpath query
     * @return string the result as a string, empty if the result is not a
     * list of nodes (e.g. a number or a boolean)
     */
    protected function domToString($xpathResult) {
        // scalar results of expressions like count(...) cannot be iterated
        if(!is_array($xpathResult) &&
           !($xpathResult instanceof \Traversable)) {
            return '';
        }

        $result = '';
        foreach($xpathResult as $node) {
            $result .= $this->toInnerHTML($node);
        }
        return $result;
    }


    /**
     * Serializes a single DOM node, including its children, to HTML
     *
     * @param \DOMNode $node the node that should be serialized
     * @return string the trimmed HTML of the node
     */
    protected function toInnerHTML($node) {
        $dom = new DOMDocument();
        // the node belongs to another document and has to be imported first
        $dom->appendChild($dom->importNode($node, true));
        return trim($dom->saveHTML($dom->documentElement));
    }


}