summary refs log tree commit diff stats
path: root/articleenhancer/xpatharticleenhancer.php
diff options
context:
space:
mode:
Diffstat (limited to 'articleenhancer/xpatharticleenhancer.php')
-rw-r--r-- articleenhancer/xpatharticleenhancer.php 17
1 files changed, 15 insertions, 2 deletions
diff --git a/articleenhancer/xpatharticleenhancer.php b/articleenhancer/xpatharticleenhancer.php
index b283786b8..dec0fe760 100644
--- a/articleenhancer/xpatharticleenhancer.php
+++ b/articleenhancer/xpatharticleenhancer.php
@@ -52,8 +52,21 @@ class XPathArticleEnhancer implements ArticleEnhancer {
if(preg_match($regex, $item->getUrl())) {
$body = $this->getFile($item->getUrl());
- $body = mb_convert_encoding($body, 'HTML-ENTITIES',
- mb_detect_encoding($body));
+
+ // Determine document encoding.
+ // First check if either <meta charset="..."> or
+ // <meta http-equiv="Content-Type" ...> is specified and use that
+ // If this fails use mb_detect_encoding()
+ // Use UTF-8 if mb_detect_encoding does not return anything (or the HTML page is messed up)
+ $encregex = "/<meta\s+[^>]*(?:charset\s*=\s*['\"]([^>'\"]*)['\"]" .
+ "|http-equiv\s*=\s*['\"]content-type['\"]\s+[^>]*content\s*=\s*['\"][^>]*charset=([^>]*)['\"])[^>]*>/i";
+ if(preg_match($encregex, $body, $matches)) {
+ $enc = strtoupper($matches[sizeof($matches) - 1]);
+ } else {
+ $enc = mb_detect_encoding($body);
+ }
+ $enc = $enc ? $enc : "UTF-8";
+ $body = mb_convert_encoding($body, 'HTML-ENTITIES', $enc);
$dom = new DOMDocument();