summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Lars Bensmann <lars@almosthappy.de> 2014-11-23 14:55:31 +0100
committer: Lars Bensmann <lars@almosthappy.de> 2014-11-23 14:55:31 +0100
commit: c0e881de83b85a6229cde01ad9426660ba6e6618 (patch)
tree: 9da7e146c00f694c62368601992a480577782f6c
parent: aacf3c1c12436ccdbeb1b56c1cf7d324e0b293e7 (diff)
Use one regular expression instead of two for charset detection
-rw-r--r-- articleenhancer/xpatharticleenhancer.php 20
1 file changed, 8 insertions, 12 deletions
diff --git a/articleenhancer/xpatharticleenhancer.php b/articleenhancer/xpatharticleenhancer.php
index 7c91cf13d..dec0fe760 100644
--- a/articleenhancer/xpatharticleenhancer.php
+++ b/articleenhancer/xpatharticleenhancer.php
@@ -54,20 +54,16 @@ class XPathArticleEnhancer implements ArticleEnhancer {
$body = $this->getFile($item->getUrl());
// Determine document encoding.
- // First check if <meta charset="..."> is specified and use that
- // If this fails look for charset in <meta http-equiv="Content-Type" ...>
- // As a last resort use mb_detect_encoding()
+ // First check if either <meta charset="..."> or
+ // <meta http-equiv="Content-Type" ...> is specified and use that
+ // If this fails use mb_detect_encoding()
// Use UTF-8 if mb_detect_encoding does not return anything (or the HTML page is messed up)
- $csregex = "/<meta\s+[^>]*charset\s*=\s*['\"]([^>]*)['\"][^>]*>/i";
- if(preg_match($csregex, $body, $matches)) {
- $enc = strtoupper($matches[1]);
+ $encregex = "/<meta\s+[^>]*(?:charset\s*=\s*['\"]([^>'\"]*)['\"]" .
+ "|http-equiv\s*=\s*['\"]content-type['\"]\s+[^>]*content\s*=\s*['\"][^>]*charset=([^>]*)['\"])[^>]*>/i";
+ if(preg_match($encregex, $body, $matches)) {
+ $enc = strtoupper($matches[sizeof($matches) - 1]);
} else {
- $ctregex = "/<meta\s+[^>]*http-equiv\s*=\s*['\"]content-type['\"]\s+[^>]*content\s*=\s*['\"][^>]*charset=([^>]*)['\"][^>]*>/i";
- if(preg_match($ctregex, $body, $matches)) {
- $enc = strtoupper($matches[1]);
- } else {
- $enc = mb_detect_encoding($body);
- }
+ $enc = mb_detect_encoding($body);
}
$enc = $enc ? $enc : "UTF-8";
$body = mb_convert_encoding($body, 'HTML-ENTITIES', $enc);