diff options
author | Bernhard Posselt <Raydiation@users.noreply.github.com> | 2013-09-20 20:18:13 -0700 |
---|---|---|
committer | Bernhard Posselt <Raydiation@users.noreply.github.com> | 2013-09-20 20:18:13 -0700 |
commit | 7d7511714d370268feccef0681a8d89777b38978 (patch) | |
tree | 5a71e12f19be3279e81f759cd320584735850827 | |
parent | 3a13a680a3ee2d5313bfb6ded2d887dc1e363241 (diff) | |
parent | ea8e552ccb6e7d3818b41df1914dccc62cb4b324 (diff) |
Merge pull request #358 from bastei/relativeurls
ArticleEnhancer: Transform relative to absolute URLs
-rw-r--r-- | tests/unit/utility/articleenhancer/ArticleEnhancerTest.php | 92 | ||||
-rw-r--r-- | utility/articleenhancer/articleenhancer.php | 108 |
2 files changed, 190 insertions, 10 deletions
diff --git a/tests/unit/utility/articleenhancer/ArticleEnhancerTest.php b/tests/unit/utility/articleenhancer/ArticleEnhancerTest.php index a7585d771..ce105db49 100644 --- a/tests/unit/utility/articleenhancer/ArticleEnhancerTest.php +++ b/tests/unit/utility/articleenhancer/ArticleEnhancerTest.php @@ -57,8 +57,9 @@ class ArticleEnhancerTest extends \OCA\AppFramework\Utility\TestUtility { $this->purifier, $this->fileFactory, array( - '/explosm.net\/comics/' => '//*[@id=\'maincontent\']/div[2]/div/img', + '/explosm.net\/comics/' => '//*[@id=\'maincontent\']/div[2]/div/span', '/explosm.net\/shorts/' => '//*[@id=\'maincontent\']/div/div', + '/explosm.net\/all/' => '//body/*', '/themerepublic.net/' => '//*[@class=\'post hentry\']' ), $this->timeout @@ -80,7 +81,7 @@ class ArticleEnhancerTest extends \OCA\AppFramework\Utility\TestUtility { <body> <div id="maincontent"> <div>nooo</div> - <div><div><img src="hiho"></div></div> + <div><div><span>hiho</span></div></div> </div> </body> </html>'; @@ -95,11 +96,11 @@ class ArticleEnhancerTest extends \OCA\AppFramework\Utility\TestUtility { ->will($this->returnValue($file)); $this->purifier->expects($this->once()) ->method('purify') - ->with($this->equalTo('<img src="hiho">')) - ->will($this->returnValue('<img src="hiho">')); + ->with($this->equalTo('<span>hiho</span>')) + ->will($this->returnValue('<span>hiho</span>')); $result = $this->testEnhancer->enhance($item); - $this->assertEquals('<img src="hiho">', $result->getBody()); + $this->assertEquals('<span>hiho</span>', $result->getBody()); } @@ -212,4 +213,85 @@ class ArticleEnhancerTest extends \OCA\AppFramework\Utility\TestUtility { } + public function testTransformRelativeUrls() { + $file = new \stdClass; + $file->headers = array("content-type"=>"text/html; charset=utf-8"); + $file->body = '<html> + <body> + <a href="../a/relative/url.html?a=1#b">link</a> + <a href="b/relative/url.html">link2</a> + <img src="/another/relative/link.jpg"></img> + </body> + </html>'; + $item = new Item(); + $item->setUrl('https://www.explosm.net/all/312'); + $item->setBody('Hello thar'); + + $this->fileFactory->expects($this->once()) + ->method('getFile') + ->with($this->equalTo($item->getUrl()), + $this->equalTo($this->timeout)) + ->will($this->returnValue($file)); + $this->purifier->expects($this->once()) + ->method('purify') + ->with($this->equalTo('<a href="https://www.explosm.net/a/relative/url.html?a=1#b">link</a><a href="https://www.explosm.net/all/b/relative/url.html">link2</a><img src="https://www.explosm.net/another/relative/link.jpg">')) + ->will($this->returnValue('<a href="https://www.explosm.net/a/relative/url.html?a=1#b">link</a><a href="https://www.explosm.net/all/b/relative/url.html">link2</a><img src="https://www.explosm.net/another/relative/link.jpg">')); + + $result = $this->testEnhancer->enhance($item); + $this->assertEquals('<a href="https://www.explosm.net/a/relative/url.html?a=1#b">link</a><a href="https://www.explosm.net/all/b/relative/url.html">link2</a><img src="https://www.explosm.net/another/relative/link.jpg">', $result->getBody()); + } + + public function testTransformRelativeUrlSpecials() { + $file = new \stdClass; + $file->headers = array("content-type"=>"text/html; charset=utf-8"); + $file->body = '<html> + <body> + <img src="relative/url.png"> + </body> + </html>'; + $item = new Item(); + $item->setUrl('https://username:secret@www.explosm.net/all/312'); + $item->setBody('Hello thar'); + + $this->fileFactory->expects($this->once()) + ->method('getFile') + ->with($this->equalTo($item->getUrl()), + $this->equalTo($this->timeout)) + ->will($this->returnValue($file)); + $this->purifier->expects($this->once()) + ->method('purify') + ->with($this->equalTo('<img src="https://username:secret@www.explosm.net/all/relative/url.png">')) + ->will($this->returnValue('<img src="https://username:secret@www.explosm.net/all/relative/url.png">')); + + $result = $this->testEnhancer->enhance($item); + $this->assertEquals('<img src="https://username:secret@www.explosm.net/all/relative/url.png">', $result->getBody()); + } + + public function testDontTransformAbsoluteUrlsAndMails() { + $file = new \stdClass; + $file->headers = array("content-type"=>"text/html; charset=utf-8"); + $file->body = '<html> + <body> + <img src="http://www.url.com/absolute/url.png"> + <a href="mailto:test@testsite.com">mail</a> + </body> + </html>'; + $item = new Item(); + $item->setUrl('https://www.explosm.net/all/312'); + $item->setBody('Hello thar'); + + $this->fileFactory->expects($this->once()) + ->method('getFile') + ->with($this->equalTo($item->getUrl()), + $this->equalTo($this->timeout)) + ->will($this->returnValue($file)); + $this->purifier->expects($this->once()) + ->method('purify') + ->with($this->equalTo('<img src="http://www.url.com/absolute/url.png"><a href="mailto:test@testsite.com">mail</a>')) + ->will($this->returnValue('<img src="http://www.url.com/absolute/url.png"><a href="mailto:test@testsite.com">mail</a>')); + + $result = $this->testEnhancer->enhance($item); + $this->assertEquals('<img src="http://www.url.com/absolute/url.png"><a href="mailto:test@testsite.com">mail</a>', $result->getBody()); + } + }
\ No newline at end of file diff --git a/utility/articleenhancer/articleenhancer.php b/utility/articleenhancer/articleenhancer.php index e0d60d4c4..823ddcc18 100644 --- a/utility/articleenhancer/articleenhancer.php +++ b/utility/articleenhancer/articleenhancer.php @@ -56,6 +56,7 @@ abstract class ArticleEnhancer { public function enhance($item){ + foreach($this->regexXPathPair as $regex => $search) { if(preg_match($regex, $item->getUrl())) { @@ -71,6 +72,7 @@ abstract class ArticleEnhancer { $dom = new \DOMDocument(); @$dom->loadHTML($body); + $xpath = new \DOMXpath($dom); $xpathResult = $xpath->evaluate($search); @@ -78,6 +80,9 @@ abstract class ArticleEnhancer { if(!is_string($xpathResult)) { $xpathResult = $this->domToString($xpathResult); } + + // convert all relative to absolute URLs + $xpathResult = $this->substituteRelativeLinks($xpathResult, $item->getUrl()); $sanitizedResult = $this->purifier->purify($xpathResult); $item->setBody($sanitizedResult); @@ -89,10 +94,103 @@ abstract class ArticleEnhancer { /** + * Method which converts all relative "href" and "src" URLs of + * a HTML snippet with their absolute equivalent + * @param string $xmlString a HTML snippet as string with the relative URLs to be replaced + * @param string $absoluteUrl the approptiate absolute url of the HTML snippet + * @return string the result HTML snippet as a string + */ + protected function substituteRelativeLinks($xmlString, $absoluteUrl) { + $dom = new \DOMDocument(); + $dom->preserveWhiteSpace = false; + + // return, if xml is empty or loading the HTML fails + if( trim($xmlString) == "" || !$dom->loadHTML($xmlString) ) { + return $xmlString; + } + + // remove <!DOCTYPE + $dom->removeChild($dom->firstChild); + // remove <html></html> + $dom->replaceChild($dom->firstChild->firstChild, $dom->firstChild); + + $substitution = array("href", "src"); + + foreach ($substitution as $attribute) { + $xpath = new \DOMXpath($dom); + $xpathResult = $xpath->query("//*[@".$attribute." and not(contains(@".$attribute.", '://')) and not(starts-with(@".$attribute.", 'mailto:'))]"); + foreach ($xpathResult as $linkNode) { + $urlElement = $linkNode->attributes->getNamedItem($attribute); + $urlElement->nodeValue = $this->relativeToAbsoluteUrl( $urlElement->nodeValue, $absoluteUrl ); + } + } + + // save dom to string and remove <body></body> + $xmlString = substr(trim($dom->saveHTML()), 6, -7); + // domdocument spoils the string with line breaks between the elements. strip them. + $xmlString = str_replace("\n", "", $xmlString); + + return $xmlString; + } + + + /** + * Method which builds a URL by taking a relative URL and its corresponding + * absolute URL + * For examle relative URL "../example/path/file.php?a=1#anchor" and + * absolute URL "https://username:password@www.website.com/subfolder/index.html" + * will result in "https://username:password@www.website.com/example/path/file.php?a=1#anchor" + * @param string $relativeUrl the relative URL + * @param string $absoluteUrl the absolute URL with at least scheme and host + * @return string the resulting absolute URL + */ + protected function relativeToAbsoluteUrl($relativeUrl, $absoluteUrl) { + $abs = parse_url($absoluteUrl); + + $newUrl = $abs["scheme"]."://" + .( (isset($abs["user"])) ? $abs["user"] . ( (isset($abs["pass"])) ? ":".$abs["pass"] : "") . "@" : "" ) + .$abs["host"] + .( (isset($abs["port"])) ? ":".$abs["port"] : "" ); + + if(substr(trim($relativeUrl), 0, 1) == "/") { + // we have a relative url like "/a/path/file" + return $newUrl . $relativeUrl; + } else { + // we have a relative url like "a/path/file", handle "."" and ".." directories + + // the starting point is the absolute path, but with out the last part (we don't need the file name) + $newPath = explode("/", substr($abs["path"], 1) ); + array_pop($newPath); + + $relPath = parse_url($relativeUrl, PHP_URL_PATH); + $relPath = explode("/", $relPath); + + // cross the relative and the absolute path + for($i=0; $i<count($relPath)-1; $i++) { + if($relPath[$i] == ".") { + continue; + } elseif($relPath[$i] == "..") { + array_pop($newPath); + } else { + $newPath[] = $relPath[$i]; + } + } + + // add the last part (the file name) of the relative URL + $newPath[] = $relPath[ count($relPath)-1 ]; + $newPath = implode("/", $newPath); + + $rel = parse_url($relativeUrl); + return $newUrl . "/" . $newPath + . ( (isset($rel["query"])) ? "?".$rel["query"] : "") + . ( (isset($rel["fragment"])) ? "#".$rel["fragment"] : ""); + } + } + + + /** * Method which turns an xpath result to a string - * Assumes that the result matches a single element. If the result - * is not a single element, you can customize it by overwriting this - * method + * you can customize it by overwriting this method * @param $xpathResult the result from the xpath query * @return the result as a string */ @@ -106,9 +204,9 @@ abstract class ArticleEnhancer { protected function toInnerHTML($node) { - $dom = new \DOMDocument(); + $dom = new \DOMDocument(); $dom->appendChild($dom->importNode($node, true)); - return trim($dom->saveHTML()); + return trim($dom->saveHTML($dom->documentElement)); } |