diff options
author | Bernhard Posselt <dev@bernhard-posselt.com> | 2014-10-04 18:22:12 +0200 |
---|---|---|
committer | Bernhard Posselt <dev@bernhard-posselt.com> | 2014-10-04 18:22:12 +0200 |
commit | b5f4871da2218a3bd8e0b179720db1008beb5dfb (patch) | |
tree | 4bcb9e6ce0d5fe10b6209f20aa84a6e706a44b1d | |
parent | d14ffa79073d4ed6a9dd470ff6fd121ce2dc096b (diff) |
further cleanup
-rw-r--r-- | articleenhancer/xpatharticleenhancer.php | 27 | ||||
-rw-r--r-- | tests/unit/articleenhancer/XPathArticleEnhancerTest.php | 19 |
2 files changed, 26 insertions, 20 deletions
diff --git a/articleenhancer/xpatharticleenhancer.php b/articleenhancer/xpatharticleenhancer.php index 38ce64a40..8d0dba7e8 100644 --- a/articleenhancer/xpatharticleenhancer.php +++ b/articleenhancer/xpatharticleenhancer.php @@ -39,8 +39,6 @@ class XPathArticleEnhancer implements ArticleEnhancer { * match the url and the xpath that should be used for it to extract the * page * @param \OCA\News\Utility\Config $config - * @internal param \OCA\News\ArticleEnhancer\a $SimplePieFileFactory factory for getting a simple pie file instance - * @internal param int $maximumTimeout maximum timeout in seconds, defaults to 10 sec */ public function __construct(SimplePieAPIFactory $fileFactory, array $regexXPathPair, @@ -79,7 +77,8 @@ class XPathArticleEnhancer implements ArticleEnhancer { $xpath = new DOMXpath($dom); $xpathResult = $xpath->evaluate($search); - // in case it wasnt a text query assume its a dom element + // in case it wasnt a text query assume its a dom element and + // convert it to text if(!is_string($xpathResult)) { $xpathResult = $this->domToString($xpathResult); } @@ -118,20 +117,17 @@ class XPathArticleEnhancer implements ArticleEnhancer { $dom = new DOMDocument(); $dom->preserveWhiteSpace = false; - // return, if xml is empty or loading the HTML fails - $isLoaded = Security::scan($xmlString, $dom, function ($xml, $dom) { - return @$dom->loadHTML($xml, LIBXML_NONET); + $noHTMLError = Security::scan($xmlString, $dom, function ($xml, $dom) { + // wrap in div to prevent loadHTML from inserting weird elements + $xml = '<div>' . $xml . '</div>'; + return @$dom->loadHTML($xml, LIBXML_NONET | LIBXML_HTML_NODEFDTD + | LIBXML_HTML_NOIMPLIED); }); - if($xmlString === '' || !$isLoaded) { - return $xmlString; + if($xmlString === '' || !$noHTMLError) { + return false; } - // remove <!DOCTYPE - $dom->removeChild($dom->firstChild); - // remove <html></html> - $dom->replaceChild($dom->firstChild->firstChild, $dom->firstChild); - foreach (['href', 'src'] as $attribute) { $xpath = new DOMXpath($dom); $xpathResult = $xpath->query( @@ -147,9 +143,10 @@ class XPathArticleEnhancer implements ArticleEnhancer { } // save dom to string and remove <body></body> - $xmlString = substr(trim($dom->saveHTML()), 6, -7); + $xmlString = $dom->saveHTML(); + // domdocument spoils the string with line breaks between the elements. strip them. - $xmlString = str_replace('\n', '', $xmlString); + $xmlString = str_replace("\n", '', $xmlString); return $xmlString; } diff --git a/tests/unit/articleenhancer/XPathArticleEnhancerTest.php b/tests/unit/articleenhancer/XPathArticleEnhancerTest.php index 2f6f5688e..e7b3d19dd 100644 --- a/tests/unit/articleenhancer/XPathArticleEnhancerTest.php +++ b/tests/unit/articleenhancer/XPathArticleEnhancerTest.php @@ -127,7 +127,7 @@ class XPathArticleEnhancerTest extends \PHPUnit_Framework_TestCase { ->will($this->returnValue($file)); $result = $this->testEnhancer->enhance($item); - $this->assertEquals('<span>hiho</span>', $result->getBody()); + $this->assertEquals('<div><span>hiho</span></div>', $result->getBody()); } @@ -156,7 +156,8 @@ class XPathArticleEnhancerTest extends \PHPUnit_Framework_TestCase { ->will($this->returnValue($file)); $result = $this->testEnhancer->enhance($item); - $this->assertEquals('<div>hiho</div><div>rawr</div>', $result->getBody()); + $this->assertEquals('<div><div>hiho</div><div>rawr</div></div>', + $result->getBody()); } @@ -260,7 +261,11 @@ class XPathArticleEnhancerTest extends \PHPUnit_Framework_TestCase { ->will($this->returnValue($file)); $result = $this->testEnhancer->enhance($item); - $this->assertEquals('<a target="_blank" href="https://www.explosm.net/a/relative/url.html?a=1#b">link</a><a target="_blank" href="https://www.explosm.net/all/b/relative/url.html">link2</a><img src="https://www.explosm.net/another/relative/link.jpg">', $result->getBody()); + $this->assertEquals('<div>' . + '<a target="_blank" href="https://www.explosm.net/a/relative/url.html?a=1#b">link</a>' . + '<a target="_blank" href="https://www.explosm.net/all/b/relative/url.html">link2</a>' . + '<img src="https://www.explosm.net/another/relative/link.jpg">' . + '</div>', $result->getBody()); } public function testTransformRelativeUrlSpecials() { @@ -285,7 +290,9 @@ class XPathArticleEnhancerTest extends \PHPUnit_Framework_TestCase { ->will($this->returnValue($file)); $result = $this->testEnhancer->enhance($item); - $this->assertEquals('<img src="https://username:secret@www.explosm.net/all/relative/url.png?a=1&b=2">', $result->getBody()); + $this->assertEquals( + '<div><img src="https://username:secret@www.explosm.net/all/relative/url.png?a=1&b=2"></div>', + $result->getBody()); } public function testDontTransformAbsoluteUrlsAndMails() { @@ -312,8 +319,10 @@ class XPathArticleEnhancerTest extends \PHPUnit_Framework_TestCase { $result = $this->testEnhancer->enhance($item); $this->assertEquals( + '<div>' . '<img src="http://www.url.com/absolute/url.png">' . - '<a target="_blank" href="mailto:test@testsite.com">mail</a>', + '<a target="_blank" href="mailto:test@testsite.com">mail</a>' . + '</div>', $result->getBody() ); } |