From c56d433ca432fba641a0b734287cca9431fd6d12 Mon Sep 17 00:00:00 2001 From: Bernhard Posselt Date: Thu, 26 Sep 2013 20:54:56 +0200 Subject: create a seperate config file for regex enhancers --- utility/articleenhancer/articleenhancer.php | 186 +------------------- utility/articleenhancer/enhancers.json | 40 ----- utility/articleenhancer/regexarticleenhancer.php | 55 ++++++ utility/articleenhancer/regexenhancers.json | 7 + utility/articleenhancer/twogagenhancer.php | 52 ------ utility/articleenhancer/xpatharticleenhancer.php | 214 +++++++++++++++++++++++ utility/articleenhancer/xpathenhancers.json | 40 +++++ 7 files changed, 319 insertions(+), 275 deletions(-) delete mode 100644 utility/articleenhancer/enhancers.json create mode 100644 utility/articleenhancer/regexarticleenhancer.php create mode 100644 utility/articleenhancer/regexenhancers.json delete mode 100644 utility/articleenhancer/twogagenhancer.php create mode 100644 utility/articleenhancer/xpatharticleenhancer.php create mode 100644 utility/articleenhancer/xpathenhancers.json (limited to 'utility') diff --git a/utility/articleenhancer/articleenhancer.php b/utility/articleenhancer/articleenhancer.php index 1790d2457..1842a0c02 100644 --- a/utility/articleenhancer/articleenhancer.php +++ b/utility/articleenhancer/articleenhancer.php @@ -25,189 +25,9 @@ namespace OCA\News\Utility\ArticleEnhancer; -use \OCA\News\Utility\SimplePieFileFactory; - - -class ArticleEnhancer { - - - private $feedRegex; - private $purifier; - private $fileFactory; - private $maximumTimeout; - - - /** - * @param $purifier the purifier object to clean the html which will be - * matched - * @param SimplePieFileFactory a factory for getting a simple pie file instance - * @param array $regexXPathPair an associative array containing regex to - * match the url and the xpath that should be used for it to extract the - * page - * @param int $maximumTimeout maximum timeout in seconds, defaults to 10 sec - */ - public function __construct($purifier, SimplePieFileFactory $fileFactory, - array $regexXPathPair, $maximumTimeout=10){ - $this->purifier = $purifier; - $this->regexXPathPair = $regexXPathPair; - $this->fileFactory = $fileFactory; - $this->maximumTimeout = $maximumTimeout; - } - - - public function enhance($item){ - - foreach($this->regexXPathPair as $regex => $search) { - - if(preg_match($regex, $item->getUrl())) { - $file = $this->fileFactory->getFile($item->getUrl(), $this->maximumTimeout); - - // convert encoding by detecting charset from header - $contentType = $file->headers['content-type']; - if( preg_match( '/(?<=charset=)[^;]*/', $contentType, $matches ) ) { - $body = mb_convert_encoding($file->body, 'HTML-ENTITIES', $matches[0]); - } else { - $body = $file->body; - } - - $dom = new \DOMDocument(); - @$dom->loadHTML($body); - - $xpath = new \DOMXpath($dom); - $xpathResult = $xpath->evaluate($search); - - // in case it wasnt a text query assume its a single - if(!is_string($xpathResult)) { - $xpathResult = $this->domToString($xpathResult); - } - - // convert all relative to absolute URLs - $xpathResult = $this->substituteRelativeLinks($xpathResult, $item->getUrl()); - - $sanitizedResult = $this->purifier->purify($xpathResult); - $item->setBody($sanitizedResult); - } - } - - return $item; - } - - - /** - * Method which converts all relative "href" and "src" URLs of - * a HTML snippet with their absolute equivalent - * @param string $xmlString a HTML snippet as string with the relative URLs to be replaced - * @param string $absoluteUrl the approptiate absolute url of the HTML snippet - * @return string the result HTML snippet as a string - */ - protected function substituteRelativeLinks($xmlString, $absoluteUrl) { - $dom = new \DOMDocument(); - $dom->preserveWhiteSpace = false; - - // return, if xml is empty or loading the HTML fails - if( trim($xmlString) == "" || !$dom->loadHTML($xmlString) ) { - return $xmlString; - } - - // remove removeChild($dom->firstChild); - // remove - $dom->replaceChild($dom->firstChild->firstChild, $dom->firstChild); - - $substitution = array("href", "src"); - - foreach ($substitution as $attribute) { - $xpath = new \DOMXpath($dom); - $xpathResult = $xpath->query("//*[@".$attribute." and not(contains(@".$attribute.", '://')) and not(starts-with(@".$attribute.", 'mailto:'))]"); - foreach ($xpathResult as $linkNode) { - $urlElement = $linkNode->attributes->getNamedItem($attribute); - $urlElement->nodeValue = htmlentities( $this->relativeToAbsoluteUrl( $urlElement->nodeValue, $absoluteUrl ) ); - } - } - - // save dom to string and remove - $xmlString = substr(trim($dom->saveHTML()), 6, -7); - // domdocument spoils the string with line breaks between the elements. strip them. - $xmlString = str_replace("\n", "", $xmlString); - - return $xmlString; - } - - - /** - * Method which builds a URL by taking a relative URL and its corresponding - * absolute URL - * For examle relative URL "../example/path/file.php?a=1#anchor" and - * absolute URL "https://username:password@www.website.com/subfolder/index.html" - * will result in "https://username:password@www.website.com/example/path/file.php?a=1#anchor" - * @param string $relativeUrl the relative URL - * @param string $absoluteUrl the absolute URL with at least scheme and host - * @return string the resulting absolute URL - */ - protected function relativeToAbsoluteUrl($relativeUrl, $absoluteUrl) { - $abs = parse_url($absoluteUrl); - - $newUrl = $abs["scheme"]."://" - .( (isset($abs["user"])) ? $abs["user"] . ( (isset($abs["pass"])) ? ":".$abs["pass"] : "") . "@" : "" ) - .$abs["host"] - .( (isset($abs["port"])) ? ":".$abs["port"] : "" ); - - if(substr(trim($relativeUrl), 0, 1) == "/") { - // we have a relative url like "/a/path/file" - return $newUrl . $relativeUrl; - } else { - // we have a relative url like "a/path/file", handle "."" and ".." directories - - // the starting point is the absolute path, but with out the last part (we don't need the file name) - $newPath = explode("/", substr($abs["path"], 1) ); - array_pop($newPath); - - $relPath = parse_url($relativeUrl, PHP_URL_PATH); - $relPath = explode("/", $relPath); - - // cross the relative and the absolute path - for($i=0; $itoInnerHTML($node); - } - return $result; - } - - - protected function toInnerHTML($node) { - $dom = new \DOMDocument(); - $dom->appendChild($dom->importNode($node, true)); - return trim($dom->saveHTML($dom->documentElement)); - } +use \OCA\News\Db\Item; +interface ArticleEnhancer { + public function enhance(Item $item); } \ No newline at end of file diff --git a/utility/articleenhancer/enhancers.json b/utility/articleenhancer/enhancers.json deleted file mode 100644 index cd30b4880..000000000 --- a/utility/articleenhancer/enhancers.json +++ /dev/null @@ -1,40 +0,0 @@ -{ - "cad-comic.com": { - "%cad-comic.com/cad/\\d+/$%": "//*[@id='content']/img" - }, - "explosm.net": { - "%explosm.net/comics%": "//*[@id='maincontent']/div[2]/div/img", - "%explosm.net/show%": "//*[@id='videoPlayer']/iframe" - }, - "themerepublic.net": { - "%feedproxy.google.com/~r/blogspot/DngUJ%": "//*[@class='post hentry']" - }, - "penny-arcade.com": { - "%feeds.penny-arcade.com/~r/pa-mainsite%": "//*[starts-with(@class, \"post\")]" - }, - "leasticoulddo.com": { - "%feedproxy.google.com/~r/LICD%": "//*[@id='comic-img']/a/img | //*[@id='comic-img']/img" - }, - "escapistmagazine.com/articles/view/comics/critical-miss": { - "%escapistmagazine.com/articles/view/comics/critical-miss%": "//*[@class='body']/span/img" - }, - "escapistmagazine.com/articles/view/comics/namegame": { - "%escapistmagazine.com/articles/view/comics/namegame%": "//*[@class='body']/span/p/img[@height != \"120\"]" - }, - "escapistmagazine.com/articles/view/comics/stolen-pixels": { - "%escapistmagazine.com/articles/view/comics/stolen-pixels%": "//*[@class='body']/span/p[2]/img" - }, - "escapistmagazine.com/articles/view/comics/bumhugparade": { - "%escapistmagazine.com/articles/view/comics/bumhugparade%": "//*[@class='body']/span/p[2]/img" - }, - "escapistmagazine.com/articles/view/comics/escapistradiotheater": { - "%escapistmagazine.com/articles/view/comics/escapistradiotheater%": "//*[@class='body']/span/p[2]/img" - }, - "escapistmagazine.com/articles/view/comics/paused": { - "%escapistmagazine.com/articles/view/comics/paused%": "//*[@class='body']/span/p[2]/img | //*[@class='body']/span/div/img" - }, - "escapistmagazine.com/articles/view/comics/fraughtwithperil": { - "%escapistmagazine.com/articles/view/comics/fraughtwithperil%": "//*[@class='body']" - } -} - diff --git a/utility/articleenhancer/regexarticleenhancer.php b/utility/articleenhancer/regexarticleenhancer.php new file mode 100644 index 000000000..dfd822c85 --- /dev/null +++ b/utility/articleenhancer/regexarticleenhancer.php @@ -0,0 +1,55 @@ +. + * + */ + +namespace OCA\News\Utility\ArticleEnhancer; + +use \OCA\News\Utility\SimplePieFileFactory; +use \OCA\News\Db\Item; + + +class RegexArticleEnhancer implements ArticleEnhancer { + + private $matchArticleUrl; + private $regexPair; + + public function __construct($matchArticleUrl, array $regexPair) { + $this->matchArticleUrl = $matchArticleUrl; + $this->regexPair = $regexPair; + } + + + public function enhance(Item $item) { + if (preg_match($this->matchArticleUrl, $item->getUrl())) { + $body = $item->getBody(); + foreach($this->regexPair as $search => $replaceWith) { + $body = preg_replace($search, $replaceWith, $body); + } + $item->setBody($body); + } + return $item; + } + + +} diff --git a/utility/articleenhancer/regexenhancers.json b/utility/articleenhancer/regexenhancers.json new file mode 100644 index 000000000..edf8ffb35 --- /dev/null +++ b/utility/articleenhancer/regexenhancers.json @@ -0,0 +1,7 @@ +{ + "twogag.com": { + "%(?:www.twogag.com/archives)|(feedproxy.google.com/~r/TwoGuysAndGuy)%": { + "%http://www.twogag.com/comics-rss/([^.]+)\\.jpg%": "http://www.twogag.com/comics/$1.jpg" + } + } +} \ No newline at end of file diff --git a/utility/articleenhancer/twogagenhancer.php b/utility/articleenhancer/twogagenhancer.php deleted file mode 100644 index 7303e2b34..000000000 --- a/utility/articleenhancer/twogagenhancer.php +++ /dev/null @@ -1,52 +0,0 @@ -. - * - */ - -namespace OCA\News\Utility\ArticleEnhancer; - -use \OCA\News\Utility\SimplePieFileFactory; - - -class TwoGAGEnhancer extends ArticleEnhancer { - - - public function __construct(SimplePieFileFactory $fileFactory, $purifier, - $timeout) { - parent::__construct( - $purifier, - $fileFactory, - array(), - $timeout - ); - } - - public function enhance($item) { - if (preg_match('/www.twogag.com\/archives/', $item->getUrl()) || preg_match('/feedproxy.google.com\/\~r\/TwoGuysAndGuy/', $item->getUrl())) { - $body = $item->getBody(); - $body = preg_replace('/http\:\/\/www.twogag.com\/comics-rss\/([^.]+)\.jpg/', 'http://www.twogag.com/comics/$1.jpg', $body); - $item->setBody($body); - } - return $item; - } -} diff --git a/utility/articleenhancer/xpatharticleenhancer.php b/utility/articleenhancer/xpatharticleenhancer.php new file mode 100644 index 000000000..bfc720cf3 --- /dev/null +++ b/utility/articleenhancer/xpatharticleenhancer.php @@ -0,0 +1,214 @@ +. +* +*/ + +namespace OCA\News\Utility\ArticleEnhancer; + +use \OCA\News\Utility\SimplePieFileFactory; +use \OCA\News\Db\Item; + + +class XPathArticleEnhancer implements ArticleEnhancer { + + + private $feedRegex; + private $purifier; + private $fileFactory; + private $maximumTimeout; + + + /** + * @param $purifier the purifier object to clean the html which will be + * matched + * @param SimplePieFileFactory a factory for getting a simple pie file instance + * @param array $regexXPathPair an associative array containing regex to + * match the url and the xpath that should be used for it to extract the + * page + * @param int $maximumTimeout maximum timeout in seconds, defaults to 10 sec + */ + public function __construct($purifier, SimplePieFileFactory $fileFactory, + array $regexXPathPair, $maximumTimeout=10){ + $this->purifier = $purifier; + $this->regexXPathPair = $regexXPathPair; + $this->fileFactory = $fileFactory; + $this->maximumTimeout = $maximumTimeout; + } + + + public function enhance(Item $item){ + + foreach($this->regexXPathPair as $regex => $search) { + + if(preg_match($regex, $item->getUrl())) { + $file = $this->fileFactory->getFile($item->getUrl(), $this->maximumTimeout); + + // convert encoding by detecting charset from header + $contentType = $file->headers['content-type']; + if( preg_match( '/(?<=charset=)[^;]*/', $contentType, $matches ) ) { + $body = mb_convert_encoding($file->body, 'HTML-ENTITIES', $matches[0]); + } else { + $body = $file->body; + } + + $dom = new \DOMDocument(); + @$dom->loadHTML($body); + + $xpath = new \DOMXpath($dom); + $xpathResult = $xpath->evaluate($search); + + // in case it wasnt a text query assume its a single + if(!is_string($xpathResult)) { + $xpathResult = $this->domToString($xpathResult); + } + + // convert all relative to absolute URLs + $xpathResult = $this->substituteRelativeLinks($xpathResult, $item->getUrl()); + + $sanitizedResult = $this->purifier->purify($xpathResult); + $item->setBody($sanitizedResult); + } + } + + return $item; + } + + + /** + * Method which converts all relative "href" and "src" URLs of + * a HTML snippet with their absolute equivalent + * @param string $xmlString a HTML snippet as string with the relative URLs to be replaced + * @param string $absoluteUrl the approptiate absolute url of the HTML snippet + * @return string the result HTML snippet as a string + */ + protected function substituteRelativeLinks($xmlString, $absoluteUrl) { + $dom = new \DOMDocument(); + $dom->preserveWhiteSpace = false; + + // return, if xml is empty or loading the HTML fails + if( trim($xmlString) == "" || !$dom->loadHTML($xmlString) ) { + return $xmlString; + } + + // remove removeChild($dom->firstChild); + // remove + $dom->replaceChild($dom->firstChild->firstChild, $dom->firstChild); + + $substitution = array("href", "src"); + + foreach ($substitution as $attribute) { + $xpath = new \DOMXpath($dom); + $xpathResult = $xpath->query("//*[@".$attribute." and not(contains(@".$attribute.", '://')) and not(starts-with(@".$attribute.", 'mailto:'))]"); + foreach ($xpathResult as $linkNode) { + $urlElement = $linkNode->attributes->getNamedItem($attribute); + $urlElement->nodeValue = $this->relativeToAbsoluteUrl( $urlElement->nodeValue, $absoluteUrl ); + } + } + + // save dom to string and remove + $xmlString = substr(trim($dom->saveHTML()), 6, -7); + // domdocument spoils the string with line breaks between the elements. strip them. + $xmlString = str_replace("\n", "", $xmlString); + + return $xmlString; + } + + + /** + * Method which builds a URL by taking a relative URL and its corresponding + * absolute URL + * For examle relative URL "../example/path/file.php?a=1#anchor" and + * absolute URL "https://username:password@www.website.com/subfolder/index.html" + * will result in "https://username:password@www.website.com/example/path/file.php?a=1#anchor" + * @param string $relativeUrl the relative URL + * @param string $absoluteUrl the absolute URL with at least scheme and host + * @return string the resulting absolute URL + */ + protected function relativeToAbsoluteUrl($relativeUrl, $absoluteUrl) { + $abs = parse_url($absoluteUrl); + + $newUrl = $abs["scheme"]."://" + .( (isset($abs["user"])) ? $abs["user"] . ( (isset($abs["pass"])) ? ":".$abs["pass"] : "") . "@" : "" ) + .$abs["host"] + .( (isset($abs["port"])) ? ":".$abs["port"] : "" ); + + if(substr(trim($relativeUrl), 0, 1) == "/") { + // we have a relative url like "/a/path/file" + return $newUrl . $relativeUrl; + } else { + // we have a relative url like "a/path/file", handle "."" and ".." directories + + // the starting point is the absolute path, but with out the last part (we don't need the file name) + $newPath = explode("/", substr($abs["path"], 1) ); + array_pop($newPath); + + $relPath = parse_url($relativeUrl, PHP_URL_PATH); + $relPath = explode("/", $relPath); + + // cross the relative and the absolute path + for($i=0; $itoInnerHTML($node); + } + return $result; + } + + + protected function toInnerHTML($node) { + $dom = new \DOMDocument(); + $dom->appendChild($dom->importNode($node, true)); + return trim($dom->saveHTML($dom->documentElement)); + } + + +} \ No newline at end of file diff --git a/utility/articleenhancer/xpathenhancers.json b/utility/articleenhancer/xpathenhancers.json new file mode 100644 index 000000000..cd30b4880 --- /dev/null +++ b/utility/articleenhancer/xpathenhancers.json @@ -0,0 +1,40 @@ +{ + "cad-comic.com": { + "%cad-comic.com/cad/\\d+/$%": "//*[@id='content']/img" + }, + "explosm.net": { + "%explosm.net/comics%": "//*[@id='maincontent']/div[2]/div/img", + "%explosm.net/show%": "//*[@id='videoPlayer']/iframe" + }, + "themerepublic.net": { + "%feedproxy.google.com/~r/blogspot/DngUJ%": "//*[@class='post hentry']" + }, + "penny-arcade.com": { + "%feeds.penny-arcade.com/~r/pa-mainsite%": "//*[starts-with(@class, \"post\")]" + }, + "leasticoulddo.com": { + "%feedproxy.google.com/~r/LICD%": "//*[@id='comic-img']/a/img | //*[@id='comic-img']/img" + }, + "escapistmagazine.com/articles/view/comics/critical-miss": { + "%escapistmagazine.com/articles/view/comics/critical-miss%": "//*[@class='body']/span/img" + }, + "escapistmagazine.com/articles/view/comics/namegame": { + "%escapistmagazine.com/articles/view/comics/namegame%": "//*[@class='body']/span/p/img[@height != \"120\"]" + }, + "escapistmagazine.com/articles/view/comics/stolen-pixels": { + "%escapistmagazine.com/articles/view/comics/stolen-pixels%": "//*[@class='body']/span/p[2]/img" + }, + "escapistmagazine.com/articles/view/comics/bumhugparade": { + "%escapistmagazine.com/articles/view/comics/bumhugparade%": "//*[@class='body']/span/p[2]/img" + }, + "escapistmagazine.com/articles/view/comics/escapistradiotheater": { + "%escapistmagazine.com/articles/view/comics/escapistradiotheater%": "//*[@class='body']/span/p[2]/img" + }, + "escapistmagazine.com/articles/view/comics/paused": { + "%escapistmagazine.com/articles/view/comics/paused%": "//*[@class='body']/span/p[2]/img | //*[@class='body']/span/div/img" + }, + "escapistmagazine.com/articles/view/comics/fraughtwithperil": { + "%escapistmagazine.com/articles/view/comics/fraughtwithperil%": "//*[@class='body']" + } +} + -- cgit v1.2.3