further cleanup

author: Bernhard Posselt <dev@bernhard-posselt.com> 2014-10-04 18:22:12 +0200
committer: Bernhard Posselt <dev@bernhard-posselt.com> 2014-10-04 18:22:12 +0200
commit: b5f4871da2218a3bd8e0b179720db1008beb5dfb (patch)
tree: 4bcb9e6ce0d5fe10b6209f20aa84a6e706a44b1d
parent: d14ffa79073d4ed6a9dd470ff6fd121ce2dc096b (diff)
2 files changed, 26 insertions, 20 deletions
diff --git a/articleenhancer/xpatharticleenhancer.php b/articleenhancer/xpatharticleenhancer.php
index 38ce64a40..8d0dba7e8 100644
--- a/articleenhancer/xpatharticleenhancer.php
+++ b/articleenhancer/xpatharticleenhancer.php
@@ -39,8 +39,6 @@ class XPathArticleEnhancer implements ArticleEnhancer {
      * match the url and the xpath that should be used for it to extract the
      * page
      * @param \OCA\News\Utility\Config $config
-     * @internal param \OCA\News\ArticleEnhancer\a $SimplePieFileFactory factory for getting a simple pie file instance
-     * @internal param int $maximumTimeout maximum timeout in seconds, defaults to 10 sec
      */
 	public function __construct(SimplePieAPIFactory $fileFactory,
 	                            array $regexXPathPair,
@@ -79,7 +77,8 @@ class XPathArticleEnhancer implements ArticleEnhancer {
 				$xpath = new DOMXpath($dom);
 				$xpathResult = $xpath->evaluate($search);
 
-				// in case it wasnt a text query assume its a dom element
+				// in case it wasnt a text query assume its a dom element and
+				// convert it to text
 				if(!is_string($xpathResult)) {
 					$xpathResult = $this->domToString($xpathResult);
 				}
@@ -118,20 +117,17 @@ class XPathArticleEnhancer implements ArticleEnhancer {
 		$dom = new DOMDocument();
 		$dom->preserveWhiteSpace = false;
 
-		// return, if xml is empty or loading the HTML fails
-		$isLoaded = Security::scan($xmlString, $dom, function ($xml, $dom) {
-			return @$dom->loadHTML($xml, LIBXML_NONET);
+		$noHTMLError = Security::scan($xmlString, $dom, function ($xml, $dom) {
+			// wrap in div to prevent loadHTML from inserting weird elements
+			$xml = '<div>' . $xml . '</div>';
+			return @$dom->loadHTML($xml, LIBXML_NONET | LIBXML_HTML_NODEFDTD
+				| LIBXML_HTML_NOIMPLIED);
 		});
 
-		if($xmlString === '' || !$isLoaded) {
-			return $xmlString;
+		if($xmlString === '' || !$noHTMLError) {
+			return false;
 		}
 
-		// remove <!DOCTYPE
-		$dom->removeChild($dom->firstChild);
-		// remove <html></html>
-		$dom->replaceChild($dom->firstChild->firstChild, $dom->firstChild);
-
 		foreach (['href', 'src'] as $attribute) {
 			$xpath = new DOMXpath($dom);
 			$xpathResult = $xpath->query(
@@ -147,9 +143,10 @@ class XPathArticleEnhancer implements ArticleEnhancer {
 		}
 
 		// save dom to string and remove <body></body>
-		$xmlString = substr(trim($dom->saveHTML()), 6, -7);
+		$xmlString = $dom->saveHTML();
+
 		// domdocument spoils the string with line breaks between the elements. strip them.
-		$xmlString = str_replace('\n', '', $xmlString);
+		$xmlString = str_replace("\n", '', $xmlString);
 
 		return $xmlString;
 	}
diff --git a/tests/unit/articleenhancer/XPathArticleEnhancerTest.php b/tests/unit/articleenhancer/XPathArticleEnhancerTest.php
index 2f6f5688e..e7b3d19dd 100644
--- a/tests/unit/articleenhancer/XPathArticleEnhancerTest.php
+++ b/tests/unit/articleenhancer/XPathArticleEnhancerTest.php
@@ -127,7 +127,7 @@ class XPathArticleEnhancerTest extends \PHPUnit_Framework_TestCase {
 			->will($this->returnValue($file));
 
 		$result = $this->testEnhancer->enhance($item);
-		$this->assertEquals('<span>hiho</span>', $result->getBody());
+		$this->assertEquals('<div><span>hiho</span></div>', $result->getBody());
 	}
 
 
@@ -156,7 +156,8 @@ class XPathArticleEnhancerTest extends \PHPUnit_Framework_TestCase {
 			->will($this->returnValue($file));
 
 		$result = $this->testEnhancer->enhance($item);
-		$this->assertEquals('<div>hiho</div><div>rawr</div>', $result->getBody());
+		$this->assertEquals('<div><div>hiho</div><div>rawr</div></div>',
+			$result->getBody());
 	}
 
 
@@ -260,7 +261,11 @@ class XPathArticleEnhancerTest extends \PHPUnit_Framework_TestCase {
 			->will($this->returnValue($file));
 
 		$result = $this->testEnhancer->enhance($item);
-		$this->assertEquals('<a target="_blank" href="https://www.explosm.net/a/relative/url.html?a=1#b">link</a><a target="_blank" href="https://www.explosm.net/all/b/relative/url.html">link2</a><img src="https://www.explosm.net/another/relative/link.jpg">', $result->getBody());
+		$this->assertEquals('<div>' .
+			'<a target="_blank" href="https://www.explosm.net/a/relative/url.html?a=1#b">link</a>' .
+			'<a target="_blank" href="https://www.explosm.net/all/b/relative/url.html">link2</a>' .
+			'<img src="https://www.explosm.net/another/relative/link.jpg">' .
+			'</div>', $result->getBody());
 	}
 
 	public function testTransformRelativeUrlSpecials() {
@@ -285,7 +290,9 @@ class XPathArticleEnhancerTest extends \PHPUnit_Framework_TestCase {
 			->will($this->returnValue($file));
 
 		$result = $this->testEnhancer->enhance($item);
-		$this->assertEquals('<img src="https://username:secret@www.explosm.net/all/relative/url.png?a=1&amp;b=2">', $result->getBody());
+		$this->assertEquals(
+			'<div><img src="https://username:secret@www.explosm.net/all/relative/url.png?a=1&amp;b=2"></div>',
+			$result->getBody());
 	}
 
 	public function testDontTransformAbsoluteUrlsAndMails() {
@@ -312,8 +319,10 @@ class XPathArticleEnhancerTest extends \PHPUnit_Framework_TestCase {
 
 		$result = $this->testEnhancer->enhance($item);
 		$this->assertEquals(
+			'<div>' .
 			'<img src="http://www.url.com/absolute/url.png">' .
-			'<a target="_blank" href="mailto:test@testsite.com">mail</a>',
+			'<a target="_blank" href="mailto:test@testsite.com">mail</a>' .
+			'</div>',
 			$result->getBody()
 		);
 	}
author	Bernhard Posselt <dev@bernhard-posselt.com>	2014-10-04 18:22:12 +0200
committer	Bernhard Posselt <dev@bernhard-posselt.com>	2014-10-04 18:22:12 +0200
commit	b5f4871da2218a3bd8e0b179720db1008beb5dfb (patch)
tree	4bcb9e6ce0d5fe10b6209f20aa84a6e706a44b1d
parent	d14ffa79073d4ed6a9dd470ff6fd121ce2dc096b (diff)