diff options
author | Lars Bensmann <lars@almosthappy.de> | 2014-11-23 14:55:31 +0100 |
---|---|---|
committer | Lars Bensmann <lars@almosthappy.de> | 2014-11-23 14:55:31 +0100 |
commit | c0e881de83b85a6229cde01ad9426660ba6e6618 (patch) | |
tree | 9da7e146c00f694c62368601992a480577782f6c | |
parent | aacf3c1c12436ccdbeb1b56c1cf7d324e0b293e7 (diff) |
Use one regular expression instead of two for charset detection
-rw-r--r-- | articleenhancer/xpatharticleenhancer.php | 20 |
1 file changed, 8 insertions, 12 deletions
```diff
diff --git a/articleenhancer/xpatharticleenhancer.php b/articleenhancer/xpatharticleenhancer.php
index 7c91cf13d..dec0fe760 100644
--- a/articleenhancer/xpatharticleenhancer.php
+++ b/articleenhancer/xpatharticleenhancer.php
@@ -54,20 +54,16 @@ class XPathArticleEnhancer implements ArticleEnhancer {
                 $body = $this->getFile($item->getUrl());
 
                 // Determine document encoding.
-                // First check if <meta charset="..."> is specified and use that
-                // If this fails look for charset in <meta http-equiv="Content-Type" ...>
-                // As a last resort use mb_detect_encoding()
+                // First check if either <meta charset="..."> or
+                // <meta http-equiv="Content-Type" ...> is specified and use that
+                // If this fails use mb_detect_encoding()
                 // Use UTF-8 if mb_detect_encoding does not return anything (or the HTML page is messed up)
-                $csregex = "/<meta\s+[^>]*charset\s*=\s*['\"]([^>]*)['\"][^>]*>/i";
-                if(preg_match($csregex, $body, $matches)) {
-                    $enc = strtoupper($matches[1]);
+                $encregex = "/<meta\s+[^>]*(?:charset\s*=\s*['\"]([^>'\"]*)['\"]" .
+                    "|http-equiv\s*=\s*['\"]content-type['\"]\s+[^>]*content\s*=\s*['\"][^>]*charset=([^>]*)['\"])[^>]*>/i";
+                if(preg_match($encregex, $body, $matches)) {
+                    $enc = strtoupper($matches[sizeof($matches) - 1]);
                 } else {
-                    $ctregex = "/<meta\s+[^>]*http-equiv\s*=\s*['\"]content-type['\"]\s+[^>]*content\s*=\s*['\"][^>]*charset=([^>]*)['\"][^>]*>/i";
-                    if(preg_match($ctregex, $body, $matches)) {
-                        $enc = strtoupper($matches[1]);
-                    } else {
-                        $enc = mb_detect_encoding($body);
-                    }
+                    $enc = mb_detect_encoding($body);
                 }
                 $enc = $enc ? $enc : "UTF-8";
                 $body = mb_convert_encoding($body, 'HTML-ENTITIES', $enc);
```