diff options
author | bastei <bastei@users.noreply.github.com> | 2013-09-06 17:45:52 +0200 |
---|---|---|
committer | bastei <bastei@users.noreply.github.com> | 2013-09-06 17:45:52 +0200 |
commit | a051d38df9094ccb6cac8e62587d08c477cd09df (patch) | |
tree | 562b1ce47252297a7f735c25ee59bef9acb99137 /utility | |
parent | 772999e5628e62286c95f7e11635585404660a88 (diff) |
Try to detect the encoding of documents in ArticleEnhancer and convert them
Diffstat (limited to 'utility')
-rw-r--r-- | utility/articleenhancer/articleenhancer.php | 10 |
1 files changed, 9 insertions, 1 deletions
```diff
diff --git a/utility/articleenhancer/articleenhancer.php b/utility/articleenhancer/articleenhancer.php
index 76bb0fa9f..c9b61a135 100644
--- a/utility/articleenhancer/articleenhancer.php
+++ b/utility/articleenhancer/articleenhancer.php
@@ -60,8 +60,16 @@ abstract class ArticleEnhancer {
 
             if(preg_match($regex, $item->getUrl())) {
                 $file = $this->fileFactory->getFile($item->getUrl(), $this->maximumTimeout);
+
+                // convert encoding by detecting charset from header
+                $contentType = $file->headers['content-type'];
+                if( preg_match( '/(?<=charset=)[^;]*/', $contentType, $matches ) )
+                    $body = mb_convert_encoding($file->body, 'HTML-ENTITIES', $matches[0]);
+                else
+                    $body = $file->body;
+
                 $dom = new \DOMDocument();
-                @$dom->loadHTML($file->body);
+                @$dom->loadHTML($body);
                 $xpath = new \DOMXpath($dom);
                 $xpathResult = $xpath->evaluate($search);
 
```