From 222dcd48082e355cf593594b83bd6a9bb7a36d09 Mon Sep 17 00:00:00 2001 From: Lars Bensmann Date: Sat, 22 Nov 2014 18:21:06 +0100 Subject: Only use mb_detect_encoding() if no charset is set in HTML --- articleenhancer/xpatharticleenhancer.php | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/articleenhancer/xpatharticleenhancer.php b/articleenhancer/xpatharticleenhancer.php index b283786b8..7c91cf13d 100644 --- a/articleenhancer/xpatharticleenhancer.php +++ b/articleenhancer/xpatharticleenhancer.php @@ -52,8 +52,25 @@ class XPathArticleEnhancer implements ArticleEnhancer { if(preg_match($regex, $item->getUrl())) { $body = $this->getFile($item->getUrl()); - $body = mb_convert_encoding($body, 'HTML-ENTITIES', - mb_detect_encoding($body)); + + // Determine document encoding. + // First check if a meta charset tag is specified and use that + // If this fails look for charset in the meta http-equiv Content-Type tag + // As a last resort use mb_detect_encoding() + // Use UTF-8 if mb_detect_encoding does not return anything (or the HTML page is messed up) + $csregex = "/]*charset\s*=\s*['\"]([^>]*)['\"][^>]*>/i"; + if(preg_match($csregex, $body, $matches)) { + $enc = strtoupper($matches[1]); + } else { + $ctregex = "/]*http-equiv\s*=\s*['\"]content-type['\"]\s+[^>]*content\s*=\s*['\"][^>]*charset=([^>]*)['\"][^>]*>/i"; + if(preg_match($ctregex, $body, $matches)) { + $enc = strtoupper($matches[1]); + } else { + $enc = mb_detect_encoding($body); + } + } + $enc = $enc ? $enc : "UTF-8"; + $body = mb_convert_encoding($body, 'HTML-ENTITIES', $enc); $dom = new DOMDocument(); -- cgit v1.2.3