summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author Bernhard Posselt <Raydiation@users.noreply.github.com> 2014-11-23 17:00:44 +0100
committer Bernhard Posselt <Raydiation@users.noreply.github.com> 2014-11-23 17:00:44 +0100
commit bbd765b76ac337fa7cf895b7ac083de926bb7731 (patch)
tree 0580d09e07ca2bdc5d4fbc2a21688d7a6b91c8aa
parent 9252fe7bd8420aa6b986e69e00a44fb77eff1831 (diff)
parent c0e881de83b85a6229cde01ad9426660ba6e6618 (diff)
Merge pull request #664 from chaotix-/androidpolice-spiegel-enhancers
Refined spiegel.de enhancer and added androidpolice.com
-rw-r--r-- articleenhancer/xpatharticleenhancer.php 17
-rw-r--r-- articleenhancer/xpathenhancers.json 5
2 files changed, 19 insertions, 3 deletions
diff --git a/articleenhancer/xpatharticleenhancer.php b/articleenhancer/xpatharticleenhancer.php
index b283786b8..dec0fe760 100644
--- a/articleenhancer/xpatharticleenhancer.php
+++ b/articleenhancer/xpatharticleenhancer.php
@@ -52,8 +52,21 @@ class XPathArticleEnhancer implements ArticleEnhancer {
if(preg_match($regex, $item->getUrl())) {
$body = $this->getFile($item->getUrl());
- $body = mb_convert_encoding($body, 'HTML-ENTITIES',
- mb_detect_encoding($body));
+
+ // Determine document encoding.
+ // First check if either <meta charset="..."> or
+ // <meta http-equiv="Content-Type" ...> is specified and use that
+ // If this fails use mb_detect_encoding()
+ // Use UTF-8 if mb_detect_encoding does not return anything (or the HTML page is messed up)
+ $encregex = "/<meta\s+[^>]*(?:charset\s*=\s*['\"]([^>'\"]*)['\"]" .
+ "|http-equiv\s*=\s*['\"]content-type['\"]\s+[^>]*content\s*=\s*['\"][^>]*charset=([^>]*)['\"])[^>]*>/i";
+ if(preg_match($encregex, $body, $matches)) {
+ $enc = strtoupper($matches[sizeof($matches) - 1]);
+ } else {
+ $enc = mb_detect_encoding($body);
+ }
+ $enc = $enc ? $enc : "UTF-8";
+ $body = mb_convert_encoding($body, 'HTML-ENTITIES', $enc);
$dom = new DOMDocument();
diff --git a/articleenhancer/xpathenhancers.json b/articleenhancer/xpathenhancers.json
index aa6511e27..cc1a4189e 100644
--- a/articleenhancer/xpathenhancers.json
+++ b/articleenhancer/xpathenhancers.json
@@ -94,7 +94,7 @@
"%heise.de%": "//*[@class='meldung_wrapper']/*[not(contains(@class, 'dossier'))]"
},
"spiegel.de": {
- "%spiegel.de/(?!.*video).*%": "//p[@class='article-intro'] | //*[@itemprop='description' and not(ancestor::*[contains( normalize-space( @class ), 'article-section' )]) ] | //*[(@class='spPanoImageTeaserPic' or @class='spPanoGalleryTeaserPic' or @class='spPanoPlayerTeaserPic') and not(ancestor::*[contains( normalize-space( @class ), 'article-section' )])] | //*[@class='image-buttons-panel' and not(ancestor::*[contains( normalize-space( @class ), 'article-section' )])]/*[1]/img | //*[@class='article-image-description' and not(ancestor::*[contains( normalize-space( @class ), 'article-section' )]) ]/p | //*[@id='content-main']/*[@id='js-article-column']/div[contains( normalize-space( @class ), 'article-section' )]"
+ "%spiegel.de/(?!.*video).*%": "//p[@class='article-intro'] | //*[@itemprop='description' and not(ancestor::*[contains( normalize-space( @class ), 'article-section' )]) ] | //*[(@class='spPanoImageTeaserPic' or @class='spPanoGalleryTeaserPic' or @class='spPanoPlayerTeaserPic') and not(ancestor::*[contains( normalize-space( @class ), 'article-section' )])] | //*[@class='image-buttons-panel' and not(ancestor::*[contains( normalize-space( @class ), 'article-section' )])]/*[1]/img | //*[@class='article-image-description' and not(ancestor::*[contains( normalize-space( @class ), 'article-section' )]) ]/p | //*[@id='content-main']/*[@id='js-article-column']/div[contains( normalize-space( @class ), 'article-section' )]/*[not( contains( normalize-space( @class ), 'article-function-social-media' ))] | //*[@id='content-main']/*[@id='js-article-column']/p"
},
"eqcomics.com": {
"%feedproxy.google.com/~r/eqcomics%": "//div[@id=\"comic\"]/div/a/img"
@@ -122,5 +122,8 @@
},
"satwcomic.com": {
"%feedproxy.google.com/~r/satwcomic%": "//div[@class=\"comicmid\"]/center/a/img"
+ },
+ "androidpolice.com": {
+ "%rss.feedsportal.com/c/33941/f/615677/p/1/s/%": "//div[@class=\"post_content\"]"
}
}