summaryrefslogtreecommitdiffstats
path: root/articleenhancer/xpatharticleenhancer.php
diff options
context:
space:
mode:
Diffstat (limited to 'articleenhancer/xpatharticleenhancer.php')
-rw-r--r--articleenhancer/xpatharticleenhancer.php196
1 files changed, 0 insertions, 196 deletions
diff --git a/articleenhancer/xpatharticleenhancer.php b/articleenhancer/xpatharticleenhancer.php
deleted file mode 100644
index 61bf230a0..000000000
--- a/articleenhancer/xpatharticleenhancer.php
+++ /dev/null
@@ -1,196 +0,0 @@
-<?php
-/**
- * ownCloud - News
- *
- * This file is licensed under the Affero General Public License version 3 or
- * later. See the COPYING file.
- *
- * @author Alessandro Cosentino <cosenal@gmail.com>
- * @author Bernhard Posselt <dev@bernhard-posselt.com>
- * @copyright Alessandro Cosentino 2012
- * @copyright Bernhard Posselt 2012, 2014
- */
-
-namespace OCA\News\ArticleEnhancer;
-
-use DOMDocument;
-use DOMXpath;
-
-use PicoFeed\Encoding\Encoding;
-
-use OCA\News\Utility\PicoFeedClientFactory;
-
-use OCA\News\Db\Item;
-
-
-class XPathArticleEnhancer implements ArticleEnhancer {
-
- private $clientFactory;
- private $regexXPathPair;
-
-
- /**
- * @param \Utility\PicoFeedClientFactory $clientFactory
- * @param array $regexXPathPair an associative array containing regex to
- * match the url and the xpath that should be used for it to extract the
- * page
- */
- public function __construct(PicoFeedClientFactory $clientFactory,
- array $regexXPathPair){
- $this->clientFactory = $clientFactory;
- $this->regexXPathPair = $regexXPathPair;
- }
-
- /**
- * @param \OCA\News\Db\Item $item
- * @return \OCA\News\Db\Item enhanced item
- */
- public function enhance(Item $item){
-
- foreach($this->regexXPathPair as $regex => $search) {
-
- if(preg_match($regex, $item->getUrl())) {
- $body = $this->getFile($item->getUrl());
-
- // First check if either <meta charset="..."> or
- // <meta http-equiv="Content-Type" ...> is specified and use it
- // If this fails use mb_detect_encoding()
- $regex = '/<meta\s+[^>]*(?:charset\s*=\s*[\'"]([^>\'"]*)[\'"]' .
- '|http-equiv\s*=\s*[\'"]content-type[\'"]\s+[^>]*' .
- 'content\s*=\s*[\'"][^>]*charset=([^>]*)[\'"])[^>]*>' .
- '/i';
- if(preg_match($regex, $body, $matches)) {
- $enc = strtoupper($matches[sizeof($matches) - 1]);
- } else {
- $enc = mb_detect_encoding($body);
- }
- $enc = $enc ? $enc : 'UTF-8';
- $body = mb_convert_encoding($body, 'HTML-ENTITIES', $enc);
- if (trim($body) === '') {
- return $item;
- }
-
- $dom = new DOMDocument();
- $isOk = @$dom->loadHTML($body);
-
- $xpath = new DOMXpath($dom);
- $xpathResult = $xpath->evaluate($search);
-
- // in case it wasnt a text query assume its a dom element and
- // convert it to text
- if(!is_string($xpathResult)) {
- $xpathResult = $this->domToString($xpathResult);
- }
-
- $xpathResult = trim($xpathResult);
-
- // convert all relative to absolute URLs
- $xpathResult = $this->substituteRelativeLinks(
- $xpathResult, $item->getUrl()
- );
-
- if($isOk && $xpathResult !== false && $xpathResult !== '') {
- $item->setBody($xpathResult);
- }
- }
- }
-
- return $item;
- }
-
-
- private function getFile($url) {
- $client = $this->clientFactory->build();
- $client->execute($url);
- $client->setUserAgent('Mozilla/5.0 AppleWebKit');
- return $client->getContent();
- }
-
-
- /**
- * Method which converts all relative "href" and "src" URLs of
- * a HTML snippet with their absolute equivalent
- * @param string $xmlString a HTML snippet as string with the relative URLs
- * to be replaced
- * @param string $absoluteUrl the approptiate absolute url of the HTML
- * snippet
- * @return string the result HTML snippet as a string
- */
- protected function substituteRelativeLinks($xmlString, $absoluteUrl) {
- $dom = new DOMDocument();
- $dom->preserveWhiteSpace = false;
-
- if($xmlString === '') {
- return '';
- }
-
- $xmlString = '<div>' . $xmlString . '</div>';
- $isOk = @$dom->loadHTML($xmlString, LIBXML_HTML_NOIMPLIED |
- LIBXML_HTML_NODEFDTD);
-
- if(!$isOk) {
- return '';
- }
-
- foreach (['href', 'src'] as $attribute) {
- $xpath = new DOMXpath($dom);
- $xpathResult = $xpath->query(
- "//*[@" . $attribute . " " .
- "and not(contains(@" . $attribute . ", '://')) " .
- "and not(starts-with(@" . $attribute . ", 'mailto:')) " .
- "and not(starts-with(@" . $attribute . ", '//'))]");
- foreach ($xpathResult as $linkNode) {
- $urlElement = $linkNode->attributes->getNamedItem($attribute);
- $abs = $this->relativeToAbsoluteUrl(
- $urlElement->nodeValue, $absoluteUrl
- );
- $urlElement->nodeValue = htmlspecialchars($abs);
- }
- }
-
- $xmlString = $dom->saveHTML();
-
- // domdocument spoils the string with line breaks between the elements
- // strip them
- $xmlString = str_replace("\n", '', $xmlString);
-
- return $xmlString;
- }
-
-
- /**
- * Method which builds a URL by taking a relative URL and its corresponding
- * absolute URL
- * @param string $relativeUrl the relative URL
- * @param string $absoluteUrl the absolute URL with at least scheme and host
- * @return string the resulting absolute URL
- */
- protected function relativeToAbsoluteUrl($relativeUrl, $absoluteUrl) {
- $base = new \Net_URL2($absoluteUrl);
- return $base->resolve($relativeUrl);
- }
-
-
- /**
- * Method which turns an xpath result to a string
- * you can customize it by overwriting this method
- * @param mixed $xpathResult the result from the xpath query
- * @return string the result as a string
- */
- protected function domToString($xpathResult) {
- $result = '';
- foreach($xpathResult as $node) {
- $result .= $this->toInnerHTML($node);
- }
- return $result;
- }
-
-
- protected function toInnerHTML($node) {
- $dom = new DOMDocument();
- $dom->appendChild($dom->importNode($node, true));
- return trim($dom->saveHTML($dom->documentElement));
- }
-
-
-}