From 222dcd48082e355cf593594b83bd6a9bb7a36d09 Mon Sep 17 00:00:00 2001
From: Lars Bensmann <lars@almosthappy.de>
Date: Sat, 22 Nov 2014 18:21:06 +0100
Subject: Only use mb_detect_encoding() if no charset is set in HTML

---
 articleenhancer/xpatharticleenhancer.php | 21 +++++++++++++++++++--
 1 file changed, 19 insertions(+), 2 deletions(-)
diff --git a/articleenhancer/xpatharticleenhancer.php b/articleenhancer/xpatharticleenhancer.php
index b283786b8..7c91cf13d 100644
--- a/articleenhancer/xpatharticleenhancer.php
+++ b/articleenhancer/xpatharticleenhancer.php
@@ -52,8 +52,25 @@ class XPathArticleEnhancer implements ArticleEnhancer {
 
             if(preg_match($regex, $item->getUrl())) {
                 $body = $this->getFile($item->getUrl());
-                $body = mb_convert_encoding($body, 'HTML-ENTITIES',
-                    mb_detect_encoding($body));
+
+                // Determine document encoding.
+                // First check whether <meta charset="..."> is specified and use that.
+                // If that fails, look for a charset in <meta http-equiv="Content-Type" ...>.
+                // As a last resort, use mb_detect_encoding().
+                // Use UTF-8 if mb_detect_encoding() does not return anything (or the HTML page is malformed).
+                $csregex = "/<meta\s+[^>]*charset\s*=\s*['\"]([^>]*)['\"][^>]*>/i";
+                if(preg_match($csregex, $body, $matches)) {
+                    $enc = strtoupper($matches[1]);
+                } else {
+                    $ctregex = "/<meta\s+[^>]*http-equiv\s*=\s*['\"]content-type['\"]\s+[^>]*content\s*=\s*['\"][^>]*charset=([^>]*)['\"][^>]*>/i";
+                    if(preg_match($ctregex, $body, $matches)) {
+                        $enc = strtoupper($matches[1]);
+                    } else {
+                        $enc = mb_detect_encoding($body);
+                    }
+                }
+                $enc = $enc ? $enc : "UTF-8";
+                $body = mb_convert_encoding($body, 'HTML-ENTITIES', $enc);
 
                 $dom = new DOMDocument();
 
-- 
cgit v1.2.3