diff options
author | Bernhard Posselt <Raydiation@users.noreply.github.com> | 2014-11-23 17:00:44 +0100 |
---|---|---|
committer | Bernhard Posselt <Raydiation@users.noreply.github.com> | 2014-11-23 17:00:44 +0100 |
commit | bbd765b76ac337fa7cf895b7ac083de926bb7731 (patch) | |
tree | 0580d09e07ca2bdc5d4fbc2a21688d7a6b91c8aa /articleenhancer/xpatharticleenhancer.php | |
parent | 9252fe7bd8420aa6b986e69e00a44fb77eff1831 (diff) | |
parent | c0e881de83b85a6229cde01ad9426660ba6e6618 (diff) |
Merge pull request #664 from chaotix-/androidpolice-spiegel-enhancers
Refined spiegel.de enhancer and added androidpolice.com
Diffstat (limited to 'articleenhancer/xpatharticleenhancer.php')
-rw-r--r-- | articleenhancer/xpatharticleenhancer.php | 17 |
1 file changed, 15 insertions, 2 deletions
```diff
diff --git a/articleenhancer/xpatharticleenhancer.php b/articleenhancer/xpatharticleenhancer.php
index b283786b8..dec0fe760 100644
--- a/articleenhancer/xpatharticleenhancer.php
+++ b/articleenhancer/xpatharticleenhancer.php
@@ -52,8 +52,21 @@ class XPathArticleEnhancer implements ArticleEnhancer {
 
             if(preg_match($regex, $item->getUrl())) {
                 $body = $this->getFile($item->getUrl());
-                $body = mb_convert_encoding($body, 'HTML-ENTITIES',
-                    mb_detect_encoding($body));
+
+                // Determine document encoding.
+                // First check if either <meta charset="..."> or
+                // <meta http-equiv="Content-Type" ...> is specified and use that
+                // If this fails use mb_detect_encoding()
+                // Use UTF-8 if mb_detect_encoding does not return anything (or the HTML page is messed up)
+                $encregex = "/<meta\s+[^>]*(?:charset\s*=\s*['\"]([^>'\"]*)['\"]" .
+                    "|http-equiv\s*=\s*['\"]content-type['\"]\s+[^>]*content\s*=\s*['\"][^>]*charset=([^>]*)['\"])[^>]*>/i";
+                if(preg_match($encregex, $body, $matches)) {
+                    $enc = strtoupper($matches[sizeof($matches) - 1]);
+                } else {
+                    $enc = mb_detect_encoding($body);
+                }
+                $enc = $enc ? $enc : "UTF-8";
+                $body = mb_convert_encoding($body, 'HTML-ENTITIES', $enc);
 
                 $dom = new DOMDocument();
 
```