summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorbastei <bastei@users.noreply.github.com>2013-09-21 04:27:13 +0200
committerbastei <bastei@users.noreply.github.com>2013-09-21 04:27:13 +0200
commitea8e552ccb6e7d3818b41df1914dccc62cb4b324 (patch)
tree9f14e4480f716aed4046e84a1060b4e58a04164e
parentd9f1aca951fb83d870d8abbcc30244441dd2a442 (diff)
ArticleEnhancer: Transform relative to absolute URLs
-rw-r--r--tests/unit/utility/articleenhancer/ArticleEnhancerTest.php92
-rw-r--r--utility/articleenhancer/articleenhancer.php108
2 files changed, 190 insertions, 10 deletions
diff --git a/tests/unit/utility/articleenhancer/ArticleEnhancerTest.php b/tests/unit/utility/articleenhancer/ArticleEnhancerTest.php
index a7585d771..ce105db49 100644
--- a/tests/unit/utility/articleenhancer/ArticleEnhancerTest.php
+++ b/tests/unit/utility/articleenhancer/ArticleEnhancerTest.php
@@ -57,8 +57,9 @@ class ArticleEnhancerTest extends \OCA\AppFramework\Utility\TestUtility {
$this->purifier,
$this->fileFactory,
array(
- '/explosm.net\/comics/' => '//*[@id=\'maincontent\']/div[2]/div/img',
+ '/explosm.net\/comics/' => '//*[@id=\'maincontent\']/div[2]/div/span',
'/explosm.net\/shorts/' => '//*[@id=\'maincontent\']/div/div',
+ '/explosm.net\/all/' => '//body/*',
'/themerepublic.net/' => '//*[@class=\'post hentry\']'
),
$this->timeout
@@ -80,7 +81,7 @@ class ArticleEnhancerTest extends \OCA\AppFramework\Utility\TestUtility {
<body>
<div id="maincontent">
<div>nooo</div>
- <div><div><img src="hiho"></div></div>
+ <div><div><span>hiho</span></div></div>
</div>
</body>
</html>';
@@ -95,11 +96,11 @@ class ArticleEnhancerTest extends \OCA\AppFramework\Utility\TestUtility {
->will($this->returnValue($file));
$this->purifier->expects($this->once())
->method('purify')
- ->with($this->equalTo('<img src="hiho">'))
- ->will($this->returnValue('<img src="hiho">'));
+ ->with($this->equalTo('<span>hiho</span>'))
+ ->will($this->returnValue('<span>hiho</span>'));
$result = $this->testEnhancer->enhance($item);
- $this->assertEquals('<img src="hiho">', $result->getBody());
+ $this->assertEquals('<span>hiho</span>', $result->getBody());
}
@@ -212,4 +213,85 @@ class ArticleEnhancerTest extends \OCA\AppFramework\Utility\TestUtility {
}
+ public function testTransformRelativeUrls() {
+     // page served by the mocked file factory: two relative links and one root-relative image
+     $response = new \stdClass;
+     $response->headers = array("content-type"=>"text/html; charset=utf-8");
+     $response->body = '<html>
+ <body>
+ <a href="../a/relative/url.html?a=1#b">link</a>
+ <a href="b/relative/url.html">link2</a>
+ <img src="/another/relative/link.jpg"></img>
+ </body>
+ </html>';
+
+     $item = new Item();
+     $item->setUrl('https://www.explosm.net/all/312');
+     $item->setBody('Hello thar');
+
+     // the markup the enhancer has to hand to the purifier and finally store on the item
+     $expected = '<a href="https://www.explosm.net/a/relative/url.html?a=1#b">link</a><a href="https://www.explosm.net/all/b/relative/url.html">link2</a><img src="https://www.explosm.net/another/relative/link.jpg">';
+
+     $this->fileFactory->expects($this->once())
+         ->method('getFile')
+         ->with($this->equalTo($item->getUrl()), $this->equalTo($this->timeout))
+         ->will($this->returnValue($response));
+     $this->purifier->expects($this->once())
+         ->method('purify')
+         ->with($this->equalTo($expected))
+         ->will($this->returnValue($expected));
+
+     $enhanced = $this->testEnhancer->enhance($item);
+     $this->assertEquals($expected, $enhanced->getBody());
+ }
+
+ public function testTransformRelativeUrlSpecials() {
+     // page served by the mocked file factory: a single image with a relative source
+     $response = new \stdClass;
+     $response->headers = array("content-type"=>"text/html; charset=utf-8");
+     $response->body = '<html>
+ <body>
+ <img src="relative/url.png">
+ </body>
+ </html>';
+
+     // the item URL carries user name and password, which must show up in the result
+     $item = new Item();
+     $item->setUrl('https://username:secret@www.explosm.net/all/312');
+     $item->setBody('Hello thar');
+
+     $expected = '<img src="https://username:secret@www.explosm.net/all/relative/url.png">';
+
+     $this->fileFactory->expects($this->once())
+         ->method('getFile')
+         ->with($this->equalTo($item->getUrl()), $this->equalTo($this->timeout))
+         ->will($this->returnValue($response));
+     $this->purifier->expects($this->once())
+         ->method('purify')
+         ->with($this->equalTo($expected))
+         ->will($this->returnValue($expected));
+
+     $enhanced = $this->testEnhancer->enhance($item);
+     $this->assertEquals($expected, $enhanced->getBody());
+ }
+
+ public function testDontTransformAbsoluteUrlsAndMails() {
+     // page served by the mocked file factory: an absolute image URL and a mailto link
+     $response = new \stdClass;
+     $response->headers = array("content-type"=>"text/html; charset=utf-8");
+     $response->body = '<html>
+ <body>
+ <img src="http://www.url.com/absolute/url.png">
+ <a href="mailto:test@testsite.com">mail</a>
+ </body>
+ </html>';
+
+     $item = new Item();
+     $item->setUrl('https://www.explosm.net/all/312');
+     $item->setBody('Hello thar');
+
+     // both attributes are expected to come out exactly as they went in
+     $expected = '<img src="http://www.url.com/absolute/url.png"><a href="mailto:test@testsite.com">mail</a>';
+
+     $this->fileFactory->expects($this->once())
+         ->method('getFile')
+         ->with($this->equalTo($item->getUrl()), $this->equalTo($this->timeout))
+         ->will($this->returnValue($response));
+     $this->purifier->expects($this->once())
+         ->method('purify')
+         ->with($this->equalTo($expected))
+         ->will($this->returnValue($expected));
+
+     $enhanced = $this->testEnhancer->enhance($item);
+     $this->assertEquals($expected, $enhanced->getBody());
+ }
+
} \ No newline at end of file
diff --git a/utility/articleenhancer/articleenhancer.php b/utility/articleenhancer/articleenhancer.php
index e0d60d4c4..823ddcc18 100644
--- a/utility/articleenhancer/articleenhancer.php
+++ b/utility/articleenhancer/articleenhancer.php
@@ -56,6 +56,7 @@ abstract class ArticleEnhancer {
public function enhance($item){
+
foreach($this->regexXPathPair as $regex => $search) {
if(preg_match($regex, $item->getUrl())) {
@@ -71,6 +72,7 @@ abstract class ArticleEnhancer {
$dom = new \DOMDocument();
@$dom->loadHTML($body);
+
$xpath = new \DOMXpath($dom);
$xpathResult = $xpath->evaluate($search);
@@ -78,6 +80,9 @@ abstract class ArticleEnhancer {
if(!is_string($xpathResult)) {
$xpathResult = $this->domToString($xpathResult);
}
+
+ // convert all relative to absolute URLs
+ $xpathResult = $this->substituteRelativeLinks($xpathResult, $item->getUrl());
$sanitizedResult = $this->purifier->purify($xpathResult);
$item->setBody($sanitizedResult);
@@ -89,10 +94,103 @@ abstract class ArticleEnhancer {
/**
+ * Converts all relative "href" and "src" URLs of a HTML snippet into
+ * their absolute equivalents.
+ * Values which already carry a scheme (http://, mailto:, javascript:, data:, ...),
+ * protocol relative values ("//host/path") and pure fragments ("#anchor")
+ * are left untouched.
+ * Line breaks are stripped from the returned snippet.
+ * @param string $xmlString a HTML snippet as string with the relative URLs to be replaced
+ * @param string $absoluteUrl the appropriate absolute URL of the HTML snippet
+ * @return string the resulting HTML snippet as a string; the unmodified
+ * input if it is empty, cannot be parsed or produces no <body>
+ */
+ protected function substituteRelativeLinks($xmlString, $absoluteUrl) {
+     // nothing to rewrite in an empty snippet
+     if (trim($xmlString) == "") {
+         return $xmlString;
+     }
+
+     $dom = new \DOMDocument();
+     $dom->preserveWhiteSpace = false;
+
+     // real world markup is rarely valid: silence the parser warnings the
+     // same way enhance() does and fall back to the input on a hard failure
+     if (!@$dom->loadHTML($xmlString)) {
+         return $xmlString;
+     }
+
+     // loadHTML() wraps the snippet into <!DOCTYPE>, <html> and <body>. Look the
+     // body up explicitly instead of assuming it is the first child of <html>:
+     // a snippet starting with <style>, <script> or <link> gets a <head> first
+     $body = $dom->getElementsByTagName("body")->item(0);
+     if ($body === null) {
+         return $xmlString;
+     }
+
+     $xpath = new \DOMXpath($dom);
+     foreach (array("href", "src") as $attribute) {
+         foreach ($xpath->query("//*[@" . $attribute . "]") as $linkNode) {
+             $url = $linkNode->getAttribute($attribute);
+             $probe = trim($url);
+
+             // only plain relative references can be resolved against the base URL
+             $isRelative = strpos($probe, "://") === false
+                 && substr($probe, 0, 2) !== "//"
+                 && substr($probe, 0, 1) !== "#"
+                 && !preg_match('/^[a-z][a-z0-9+.\-]*:/i', $probe);
+             if (!$isRelative) {
+                 continue;
+             }
+
+             // setAttribute() escapes the value itself; assigning to nodeValue would
+             // cut the URL at the first "&" ("unterminated entity reference")
+             $linkNode->setAttribute($attribute, $this->relativeToAbsoluteUrl($url, $absoluteUrl));
+         }
+     }
+
+     // drop <!DOCTYPE> and <html> by making <body> the only child of the document
+     $body->parentNode->removeChild($body);
+     while ($dom->firstChild !== null) {
+         $dom->removeChild($dom->firstChild);
+     }
+     $dom->appendChild($body);
+
+     // save dom to string and remove the surrounding <body ...></body>
+     $xmlString = trim($dom->saveHTML());
+     $xmlString = preg_replace('/^<body\b[^>]*>/i', "", $xmlString);
+     $xmlString = preg_replace('/<\/body>$/i', "", $xmlString);
+
+     // domdocument spoils the string with line breaks between the elements. strip them.
+     // NOTE(review): this also removes line breaks inside text such as <pre> blocks
+     return str_replace("\n", "", $xmlString);
+ }
+
+
+ /**
+ * Method which builds a URL by taking a relative URL and its corresponding
+ * absolute URL
+ * For example relative URL "../example/path/file.php?a=1#anchor" and
+ * absolute URL "https://username:password@www.website.com/subfolder/index.html"
+ * will result in "https://username:password@www.website.com/example/path/file.php?a=1#anchor"
+ * Protocol relative URLs ("//host/path") only inherit the scheme, URLs which
+ * already have a scheme are returned unchanged and references without a path
+ * ("?a=1", "#anchor", "") keep the path of the absolute URL.
+ * @param string $relativeUrl the relative URL
+ * @param string $absoluteUrl the absolute URL with at least scheme and host
+ * @return string the resulting absolute URL; the trimmed relative URL if the
+ * absolute URL has no scheme or host or the relative URL cannot be parsed
+ */
+ protected function relativeToAbsoluteUrl($relativeUrl, $absoluteUrl) {
+     // trim once so that the checks and the returned value agree
+     $relativeUrl = trim($relativeUrl);
+     $abs = parse_url($absoluteUrl);
+
+     // without scheme and host there is nothing to resolve against
+     if ($abs === false || !isset($abs["scheme"]) || !isset($abs["host"])) {
+         return $relativeUrl;
+     }
+
+     // protocol relative url like "//host/a/path/file": only the scheme is inherited
+     if (substr($relativeUrl, 0, 2) === "//") {
+         return $abs["scheme"] . ":" . $relativeUrl;
+     }
+
+     $newUrl = $abs["scheme"]."://"
+         .( (isset($abs["user"])) ? $abs["user"] . ( (isset($abs["pass"])) ? ":".$abs["pass"] : "") . "@" : "" )
+         .$abs["host"]
+         .( (isset($abs["port"])) ? ":".$abs["port"] : "" );
+
+     // we have a relative url like "/a/path/file"
+     if (substr($relativeUrl, 0, 1) === "/") {
+         return $newUrl . $relativeUrl;
+     }
+
+     $rel = parse_url($relativeUrl);
+
+     // unparsable or already absolute ("http://...", "mailto:..."): leave it alone
+     if ($rel === false || isset($rel["scheme"])) {
+         return $relativeUrl;
+     }
+
+     // an absolute URL like "https://host" has no path component at all
+     $absPath = (isset($abs["path"]) && $abs["path"] !== "") ? $abs["path"] : "/";
+     $relPath = isset($rel["path"]) ? $rel["path"] : "";
+     $query = isset($rel["query"]) ? $rel["query"] : null;
+
+     if ($relPath === "") {
+         // "?a=1", "#anchor" or "": the reference points to the document itself,
+         // so keep its path and, if no new query is given, its query
+         $newPath = $absPath;
+         if ($query === null && isset($abs["query"])) {
+             $query = $abs["query"];
+         }
+     } else {
+         // we have a relative url like "a/path/file", handle "." and ".." directories
+
+         // the starting point is the absolute path, but without the last part (we don't need the file name)
+         $segments = explode("/", substr($absPath, 1));
+         array_pop($segments);
+
+         // cross the relative and the absolute path
+         $relSegments = explode("/", $relPath);
+         $last = count($relSegments) - 1;
+         foreach ($relSegments as $i => $segment) {
+             if ($segment === "." || $segment === "..") {
+                 if ($segment === "..") {
+                     array_pop($segments);
+                 }
+                 // a trailing "." or ".." names a directory: keep the closing slash
+                 if ($i === $last) {
+                     $segments[] = "";
+                 }
+             } else {
+                 $segments[] = $segment;
+             }
+         }
+
+         $newPath = "/" . implode("/", $segments);
+     }
+
+     return $newUrl . $newPath
+         . ( ($query !== null) ? "?".$query : "")
+         . ( (isset($rel["fragment"])) ? "#".$rel["fragment"] : "");
+ }
+
+
+ /**
* Method which turns an xpath result to a string
- * Assumes that the result matches a single element. If the result
- * is not a single element, you can customize it by overwriting this
- * method
+ * you can customize it by overwriting this method
* @param $xpathResult the result from the xpath query
* @return the result as a string
*/
@@ -106,9 +204,9 @@ abstract class ArticleEnhancer {
protected function toInnerHTML($node) {
// Serializes $node to a trimmed HTML string by deep-copying it into a fresh document.
- $dom = new \DOMDocument();
+ $dom = new \DOMDocument();
// importNode(..., true) copies the node including its whole subtree
$dom->appendChild($dom->importNode($node, true));
// the removed line called saveHTML() without an argument; the added line
// passes the document element so that only this node is dumped
- return trim($dom->saveHTML());
+ return trim($dom->saveHTML($dom->documentElement));
}