diff options
author | Bernhard Posselt <Raydiation@users.noreply.github.com> | 2013-09-07 09:22:36 -0700 |
---|---|---|
committer | Bernhard Posselt <Raydiation@users.noreply.github.com> | 2013-09-07 09:22:36 -0700 |
commit | 8bee2e6d5e64eeaede7a1f152f84696b8526e1f6 (patch) | |
tree | 835bb88ecfc3ca5e4fcb96800a72d63071ecf329 | |
parent | e04ffe7de8c230e3c411caa22438f38e4ce142b4 (diff) | |
parent | a73b7da2ec8ceaf6716ed8e6a3041bfee726f71a (diff) |
Merge pull request #337 from bastei/master
Convert encoding of documents in ArticleEnhancer
-rw-r--r-- | tests/unit/utility/articleenhancer/ArticleEnhancerTest.php | 5 | ||||
-rw-r--r-- | utility/articleenhancer/articleenhancer.php | 11 |
2 files changed, 15 insertions, 1 deletion
diff --git a/tests/unit/utility/articleenhancer/ArticleEnhancerTest.php b/tests/unit/utility/articleenhancer/ArticleEnhancerTest.php index a59bf9485..bb3c9e53d 100644 --- a/tests/unit/utility/articleenhancer/ArticleEnhancerTest.php +++ b/tests/unit/utility/articleenhancer/ArticleEnhancerTest.php @@ -74,6 +74,7 @@ class ArticleEnhancerTest extends \OCA\AppFramework\Utility\TestUtility { public function testDoesModifiyArticlesThatMatch() { $file = new \stdClass; + $file->headers = array("content-type"=>"text/html; charset=utf-8"); $file->body = '<html> <body> <div id="maincontent"> @@ -103,6 +104,7 @@ class ArticleEnhancerTest extends \OCA\AppFramework\Utility\TestUtility { public function testDoesModifiyAllArticlesThatMatch() { $file = new \stdClass; + $file->headers = array("content-type"=>"text/html; charset=utf-8"); $file->body = '<html> <body> <div id="maincontent"> @@ -132,6 +134,7 @@ class ArticleEnhancerTest extends \OCA\AppFramework\Utility\TestUtility { public function testModificationHandlesEmptyResults() { $file = new \stdClass; + $file->headers = array("content-type"=>"text/html; charset=utf-8"); $file->body = '<html> <body> <div id="maincontent"> @@ -159,6 +162,7 @@ class ArticleEnhancerTest extends \OCA\AppFramework\Utility\TestUtility { public function testModificationDoesNotBreakOnEmptyDom() { $file = new \stdClass; + $file->headers = array("content-type"=>"text/html; charset=utf-8"); $file->body = ''; $item = new Item(); $item->setUrl('https://www.explosm.net/comics/312'); @@ -181,6 +185,7 @@ class ArticleEnhancerTest extends \OCA\AppFramework\Utility\TestUtility { public function testModificationDoesNotBreakOnBrokenDom() { $file = new \stdClass; + $file->headers = array("content-type"=>"text/html; charset=utf-8"); $file->body = '<html/><p> <body> <div id="maincontent"> diff --git a/utility/articleenhancer/articleenhancer.php b/utility/articleenhancer/articleenhancer.php index 76bb0fa9f..e0d60d4c4 100644 --- 
a/utility/articleenhancer/articleenhancer.php +++ b/utility/articleenhancer/articleenhancer.php @@ -60,8 +60,17 @@ abstract class ArticleEnhancer { if(preg_match($regex, $item->getUrl())) { $file = $this->fileFactory->getFile($item->getUrl(), $this->maximumTimeout); + + // convert encoding by detecting charset from header + $contentType = $file->headers['content-type']; + if( preg_match( '/(?<=charset=)[^;]*/', $contentType, $matches ) ) { + $body = mb_convert_encoding($file->body, 'HTML-ENTITIES', $matches[0]); + } else { + $body = $file->body; + } + $dom = new \DOMDocument(); - @$dom->loadHTML($file->body); + @$dom->loadHTML($body); $xpath = new \DOMXpath($dom); $xpathResult = $xpath->evaluate($search); |