From 222dcd48082e355cf593594b83bd6a9bb7a36d09 Mon Sep 17 00:00:00 2001
From: Lars Bensmann <lars@almosthappy.de>
Date: Sat, 22 Nov 2014 18:21:06 +0100
Subject: Only use mb_detect_encoding() if no charset is set in HTML

---
 articleenhancer/xpatharticleenhancer.php | 21 +++++++++++++++++++--
 1 file changed, 19 insertions(+), 2 deletions(-)
diff --git a/articleenhancer/xpatharticleenhancer.php b/articleenhancer/xpatharticleenhancer.php
index b283786b8..7c91cf13d 100644
--- a/articleenhancer/xpatharticleenhancer.php
+++ b/articleenhancer/xpatharticleenhancer.php
@@ -52,8 +52,25 @@ class XPathArticleEnhancer implements ArticleEnhancer {
 
             if(preg_match($regex, $item->getUrl())) {
                 $body = $this->getFile($item->getUrl());
-                $body = mb_convert_encoding($body, 'HTML-ENTITIES',
-                    mb_detect_encoding($body));
+
+                // Determine document encoding.
+                // First check if <meta charset="..."> is specified and use that
+                // If this fails look for charset in <meta http-equiv="Content-Type" ...>
+                // As a last resort use mb_detect_encoding()
+                // Use UTF-8 if mb_detect_encoding does not return anything (or the HTML page is messed up)
+                $csregex = "/<meta\s+[^>]*charset\s*=\s*['\"]([^>]*)['\"][^>]*>/i";
+                if(preg_match($csregex, $body, $matches)) {
+                    $enc = strtoupper($matches[1]);
+                } else {
+                    $ctregex = "/<meta\s+[^>]*http-equiv\s*=\s*['\"]content-type['\"]\s+[^>]*content\s*=\s*['\"][^>]*charset=([^>]*)['\"][^>]*>/i";
+                    if(preg_match($ctregex, $body, $matches)) {
+                        $enc = strtoupper($matches[1]);
+                    } else {
+                        $enc = mb_detect_encoding($body);
+                    }
+                }
+                $enc = $enc ? $enc : "UTF-8";
+                $body = mb_convert_encoding($body, 'HTML-ENTITIES', $enc);
 
                 $dom = new DOMDocument();
 
-- 
cgit v1.2.3


From 677982b81bec9f3284c965bd2d77e831d1737613 Mon Sep 17 00:00:00 2001
From: Lars Bensmann <lars@almosthappy.de>
Date: Sat, 22 Nov 2014 18:47:04 +0100
Subject: Refined spiegel.de xpatharticleenhancer

This xpathenhancer does not result in empty bullet points at the top of
the article. It also adds the author's name/sign at the end of the article.
---
 articleenhancer/xpathenhancers.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/articleenhancer/xpathenhancers.json b/articleenhancer/xpathenhancers.json
index aa6511e27..9f07510da 100644
--- a/articleenhancer/xpathenhancers.json
+++ b/articleenhancer/xpathenhancers.json
@@ -94,7 +94,7 @@
         "%heise.de%": "//*[@class='meldung_wrapper']/*[not(contains(@class, 'dossier'))]"
     },
     "spiegel.de": {
-        "%spiegel.de/(?!.*video).*%": "//p[@class='article-intro'] | //*[@itemprop='description' and not(ancestor::*[contains( normalize-space( @class ), 'article-section' )]) ] | //*[(@class='spPanoImageTeaserPic' or @class='spPanoGalleryTeaserPic' or @class='spPanoPlayerTeaserPic') and not(ancestor::*[contains( normalize-space( @class ), 'article-section' )])] | //*[@class='image-buttons-panel' and not(ancestor::*[contains( normalize-space( @class ), 'article-section' )])]/*[1]/img | //*[@class='article-image-description' and not(ancestor::*[contains( normalize-space( @class ), 'article-section' )]) ]/p | //*[@id='content-main']/*[@id='js-article-column']/div[contains( normalize-space( @class ), 'article-section' )]"
+        "%spiegel.de/(?!.*video).*%": "//p[@class='article-intro'] | //*[@itemprop='description' and not(ancestor::*[contains( normalize-space( @class ), 'article-section' )]) ] | //*[(@class='spPanoImageTeaserPic' or @class='spPanoGalleryTeaserPic' or @class='spPanoPlayerTeaserPic') and not(ancestor::*[contains( normalize-space( @class ), 'article-section' )])] | //*[@class='image-buttons-panel' and not(ancestor::*[contains( normalize-space( @class ), 'article-section' )])]/*[1]/img | //*[@class='article-image-description' and not(ancestor::*[contains( normalize-space( @class ), 'article-section' )]) ]/p | //*[@id='content-main']/*[@id='js-article-column']/div[contains( normalize-space( @class ), 'article-section' )]/*[not( contains( normalize-space( @class ), 'article-function-social-media' ))] | //*[@id='content-main']/*[@id='js-article-column']/p"
     },
     "eqcomics.com": {
         "%feedproxy.google.com/~r/eqcomics%": "//div[@id=\"comic\"]/div/a/img"
-- 
cgit v1.2.3


From aacf3c1c12436ccdbeb1b56c1cf7d324e0b293e7 Mon Sep 17 00:00:00 2001
From: Lars Bensmann <lars@almosthappy.de>
Date: Sat, 22 Nov 2014 18:50:00 +0100
Subject: Added androidpolice.com xpathenhancer

---
 articleenhancer/xpathenhancers.json | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/articleenhancer/xpathenhancers.json b/articleenhancer/xpathenhancers.json
index 9f07510da..cc1a4189e 100644
--- a/articleenhancer/xpathenhancers.json
+++ b/articleenhancer/xpathenhancers.json
@@ -122,5 +122,8 @@
     },
     "satwcomic.com": {
         "%feedproxy.google.com/~r/satwcomic%": "//div[@class=\"comicmid\"]/center/a/img"
+    },
+    "androidpolice.com": {
+        "%rss.feedsportal.com/c/33941/f/615677/p/1/s/%": "//div[@class=\"post_content\"]"
     }
 }
-- 
cgit v1.2.3


From c0e881de83b85a6229cde01ad9426660ba6e6618 Mon Sep 17 00:00:00 2001
From: Lars Bensmann <lars@almosthappy.de>
Date: Sun, 23 Nov 2014 14:55:31 +0100
Subject: Use one regular expression instead of two for charset detection

---
 articleenhancer/xpatharticleenhancer.php | 20 ++++++++------------
 1 file changed, 8 insertions(+), 12 deletions(-)

diff --git a/articleenhancer/xpatharticleenhancer.php b/articleenhancer/xpatharticleenhancer.php
index 7c91cf13d..dec0fe760 100644
--- a/articleenhancer/xpatharticleenhancer.php
+++ b/articleenhancer/xpatharticleenhancer.php
@@ -54,20 +54,16 @@ class XPathArticleEnhancer implements ArticleEnhancer {
                 $body = $this->getFile($item->getUrl());
 
                 // Determine document encoding.
-                // First check if <meta charset="..."> is specified and use that
-                // If this fails look for charset in <meta http-equiv="Content-Type" ...>
-                // As a last resort use mb_detect_encoding()
+                // First check if either <meta charset="..."> or
+                // <meta http-equiv="Content-Type" ...> is specified and use that
+                // If this fails use mb_detect_encoding()
                 // Use UTF-8 if mb_detect_encoding does not return anything (or the HTML page is messed up)
-                $csregex = "/<meta\s+[^>]*charset\s*=\s*['\"]([^>]*)['\"][^>]*>/i";
-                if(preg_match($csregex, $body, $matches)) {
-                    $enc = strtoupper($matches[1]);
+                $encregex = "/<meta\s+[^>]*(?:charset\s*=\s*['\"]([^>'\"]*)['\"]" .
+                            "|http-equiv\s*=\s*['\"]content-type['\"]\s+[^>]*content\s*=\s*['\"][^>]*charset=([^>]*)['\"])[^>]*>/i";
+                if(preg_match($encregex, $body, $matches)) {
+                    $enc = strtoupper($matches[sizeof($matches) - 1]);
                 } else {
-                    $ctregex = "/<meta\s+[^>]*http-equiv\s*=\s*['\"]content-type['\"]\s+[^>]*content\s*=\s*['\"][^>]*charset=([^>]*)['\"][^>]*>/i";
-                    if(preg_match($ctregex, $body, $matches)) {
-                        $enc = strtoupper($matches[1]);
-                    } else {
-                        $enc = mb_detect_encoding($body);
-                    }
+                    $enc = mb_detect_encoding($body);
                 }
                 $enc = $enc ? $enc : "UTF-8";
                 $body = mb_convert_encoding($body, 'HTML-ENTITIES', $enc);
-- 
cgit v1.2.3