summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Bernhard Posselt <Raydiation@users.noreply.github.com> 2013-09-07 09:22:36 -0700
committer: Bernhard Posselt <Raydiation@users.noreply.github.com> 2013-09-07 09:22:36 -0700
commit: 8bee2e6d5e64eeaede7a1f152f84696b8526e1f6 (patch)
tree: 835bb88ecfc3ca5e4fcb96800a72d63071ecf329
parent: e04ffe7de8c230e3c411caa22438f38e4ce142b4 (diff)
parent: a73b7da2ec8ceaf6716ed8e6a3041bfee726f71a (diff)
Merge pull request #337 from bastei/master
Convert encoding of documents in ArticleEnhancer
-rw-r--r-- tests/unit/utility/articleenhancer/ArticleEnhancerTest.php 5
-rw-r--r-- utility/articleenhancer/articleenhancer.php 11
2 files changed, 15 insertions, 1 deletion
diff --git a/tests/unit/utility/articleenhancer/ArticleEnhancerTest.php b/tests/unit/utility/articleenhancer/ArticleEnhancerTest.php
index a59bf9485..bb3c9e53d 100644
--- a/tests/unit/utility/articleenhancer/ArticleEnhancerTest.php
+++ b/tests/unit/utility/articleenhancer/ArticleEnhancerTest.php
@@ -74,6 +74,7 @@ class ArticleEnhancerTest extends \OCA\AppFramework\Utility\TestUtility {
public function testDoesModifiyArticlesThatMatch() {
$file = new \stdClass;
+ $file->headers = array("content-type"=>"text/html; charset=utf-8");
$file->body = '<html>
<body>
<div id="maincontent">
@@ -103,6 +104,7 @@ class ArticleEnhancerTest extends \OCA\AppFramework\Utility\TestUtility {
public function testDoesModifiyAllArticlesThatMatch() {
$file = new \stdClass;
+ $file->headers = array("content-type"=>"text/html; charset=utf-8");
$file->body = '<html>
<body>
<div id="maincontent">
@@ -132,6 +134,7 @@ class ArticleEnhancerTest extends \OCA\AppFramework\Utility\TestUtility {
public function testModificationHandlesEmptyResults() {
$file = new \stdClass;
+ $file->headers = array("content-type"=>"text/html; charset=utf-8");
$file->body = '<html>
<body>
<div id="maincontent">
@@ -159,6 +162,7 @@ class ArticleEnhancerTest extends \OCA\AppFramework\Utility\TestUtility {
public function testModificationDoesNotBreakOnEmptyDom() {
$file = new \stdClass;
+ $file->headers = array("content-type"=>"text/html; charset=utf-8");
$file->body = '';
$item = new Item();
$item->setUrl('https://www.explosm.net/comics/312');
@@ -181,6 +185,7 @@ class ArticleEnhancerTest extends \OCA\AppFramework\Utility\TestUtility {
public function testModificationDoesNotBreakOnBrokenDom() {
$file = new \stdClass;
+ $file->headers = array("content-type"=>"text/html; charset=utf-8");
$file->body = '<html/><p>
<body>
<div id="maincontent">
diff --git a/utility/articleenhancer/articleenhancer.php b/utility/articleenhancer/articleenhancer.php
index 76bb0fa9f..e0d60d4c4 100644
--- a/utility/articleenhancer/articleenhancer.php
+++ b/utility/articleenhancer/articleenhancer.php
@@ -60,8 +60,17 @@ abstract class ArticleEnhancer {
if(preg_match($regex, $item->getUrl())) {
$file = $this->fileFactory->getFile($item->getUrl(), $this->maximumTimeout);
+
+ // convert encoding by detecting charset from header
+ $contentType = $file->headers['content-type'];
+ if( preg_match( '/(?<=charset=)[^;]*/', $contentType, $matches ) ) {
+ $body = mb_convert_encoding($file->body, 'HTML-ENTITIES', $matches[0]);
+ } else {
+ $body = $file->body;
+ }
+
$dom = new \DOMDocument();
- @$dom->loadHTML($file->body);
+ @$dom->loadHTML($body);
$xpath = new \DOMXpath($dom);
$xpathResult = $xpath->evaluate($search);