From 222dcd48082e355cf593594b83bd6a9bb7a36d09 Mon Sep 17 00:00:00 2001 From: Lars Bensmann Date: Sat, 22 Nov 2014 18:21:06 +0100 Subject: Only use mb_detect_encoding() if no charset is set in HTML --- articleenhancer/xpatharticleenhancer.php | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/articleenhancer/xpatharticleenhancer.php b/articleenhancer/xpatharticleenhancer.php index b283786b8..7c91cf13d 100644 --- a/articleenhancer/xpatharticleenhancer.php +++ b/articleenhancer/xpatharticleenhancer.php @@ -52,8 +52,25 @@ class XPathArticleEnhancer implements ArticleEnhancer { if(preg_match($regex, $item->getUrl())) { $body = $this->getFile($item->getUrl()); - $body = mb_convert_encoding($body, 'HTML-ENTITIES', - mb_detect_encoding($body)); + + // Determine document encoding. + // First check if <meta charset="..."> is specified and use that + // If this fails look for charset in <meta http-equiv="content-type" content="...; charset=..."> + // As a last resort use mb_detect_encoding() + // Use UTF-8 if mb_detect_encoding does not return anything (or the HTML page is messed up) + $csregex = "/]*charset\s*=\s*['\"]([^>]*)['\"][^>]*>/i"; + if(preg_match($csregex, $body, $matches)) { + $enc = strtoupper($matches[1]); + } else { + $ctregex = "/]*http-equiv\s*=\s*['\"]content-type['\"]\s+[^>]*content\s*=\s*['\"][^>]*charset=([^>]*)['\"][^>]*>/i"; + if(preg_match($ctregex, $body, $matches)) { + $enc = strtoupper($matches[1]); + } else { + $enc = mb_detect_encoding($body); + } + } + $enc = $enc ? $enc : "UTF-8"; + $body = mb_convert_encoding($body, 'HTML-ENTITIES', $enc); $dom = new DOMDocument(); -- cgit v1.2.3 From 677982b81bec9f3284c965bd2d77e831d1737613 Mon Sep 17 00:00:00 2001 From: Lars Bensmann Date: Sat, 22 Nov 2014 18:47:04 +0100 Subject: Refined spiegel.de xpatharticleenhancer This xpathenhancer does not result in empty bullet points at the top of the article. It also adds the author's name/sign at the end of the article. 
--- articleenhancer/xpathenhancers.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/articleenhancer/xpathenhancers.json b/articleenhancer/xpathenhancers.json index aa6511e27..9f07510da 100644 --- a/articleenhancer/xpathenhancers.json +++ b/articleenhancer/xpathenhancers.json @@ -94,7 +94,7 @@ "%heise.de%": "//*[@class='meldung_wrapper']/*[not(contains(@class, 'dossier'))]" }, "spiegel.de": { - "%spiegel.de/(?!.*video).*%": "//p[@class='article-intro'] | //*[@itemprop='description' and not(ancestor::*[contains( normalize-space( @class ), 'article-section' )]) ] | //*[(@class='spPanoImageTeaserPic' or @class='spPanoGalleryTeaserPic' or @class='spPanoPlayerTeaserPic') and not(ancestor::*[contains( normalize-space( @class ), 'article-section' )])] | //*[@class='image-buttons-panel' and not(ancestor::*[contains( normalize-space( @class ), 'article-section' )])]/*[1]/img | //*[@class='article-image-description' and not(ancestor::*[contains( normalize-space( @class ), 'article-section' )]) ]/p | //*[@id='content-main']/*[@id='js-article-column']/div[contains( normalize-space( @class ), 'article-section' )]" + "%spiegel.de/(?!.*video).*%": "//p[@class='article-intro'] | //*[@itemprop='description' and not(ancestor::*[contains( normalize-space( @class ), 'article-section' )]) ] | //*[(@class='spPanoImageTeaserPic' or @class='spPanoGalleryTeaserPic' or @class='spPanoPlayerTeaserPic') and not(ancestor::*[contains( normalize-space( @class ), 'article-section' )])] | //*[@class='image-buttons-panel' and not(ancestor::*[contains( normalize-space( @class ), 'article-section' )])]/*[1]/img | //*[@class='article-image-description' and not(ancestor::*[contains( normalize-space( @class ), 'article-section' )]) ]/p | //*[@id='content-main']/*[@id='js-article-column']/div[contains( normalize-space( @class ), 'article-section' )]/*[not( contains( normalize-space( @class ), 'article-function-social-media' ))] | //*[@id='content-main']/*[@id='js-article-column']/p" 
}, "eqcomics.com": { "%feedproxy.google.com/~r/eqcomics%": "//div[@id=\"comic\"]/div/a/img" -- cgit v1.2.3 From aacf3c1c12436ccdbeb1b56c1cf7d324e0b293e7 Mon Sep 17 00:00:00 2001 From: Lars Bensmann Date: Sat, 22 Nov 2014 18:50:00 +0100 Subject: Added androidpolice.com xpathenhancer --- articleenhancer/xpathenhancers.json | 3 +++ 1 file changed, 3 insertions(+) diff --git a/articleenhancer/xpathenhancers.json b/articleenhancer/xpathenhancers.json index 9f07510da..cc1a4189e 100644 --- a/articleenhancer/xpathenhancers.json +++ b/articleenhancer/xpathenhancers.json @@ -122,5 +122,8 @@ }, "satwcomic.com": { "%feedproxy.google.com/~r/satwcomic%": "//div[@class=\"comicmid\"]/center/a/img" + }, + "androidpolice.com": { + "%rss.feedsportal.com/c/33941/f/615677/p/1/s/%": "//div[@class=\"post_content\"]" } } -- cgit v1.2.3 From c0e881de83b85a6229cde01ad9426660ba6e6618 Mon Sep 17 00:00:00 2001 From: Lars Bensmann Date: Sun, 23 Nov 2014 14:55:31 +0100 Subject: Use one regular expression instead of two for charset detection --- articleenhancer/xpatharticleenhancer.php | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/articleenhancer/xpatharticleenhancer.php b/articleenhancer/xpatharticleenhancer.php index 7c91cf13d..dec0fe760 100644 --- a/articleenhancer/xpatharticleenhancer.php +++ b/articleenhancer/xpatharticleenhancer.php @@ -54,20 +54,16 @@ class XPathArticleEnhancer implements ArticleEnhancer { $body = $this->getFile($item->getUrl()); // Determine document encoding. 
- // First check if <meta charset="..."> is specified and use that - // If this fails look for charset in <meta http-equiv="content-type" content="...; charset=..."> - // As a last resort use mb_detect_encoding() + // First check if either <meta charset="..."> or + // <meta http-equiv="content-type" content="...; charset=..."> is specified and use that + // If this fails use mb_detect_encoding() // Use UTF-8 if mb_detect_encoding does not return anything (or the HTML page is messed up) - $csregex = "/]*charset\s*=\s*['\"]([^>]*)['\"][^>]*>/i"; - if(preg_match($csregex, $body, $matches)) { - $enc = strtoupper($matches[1]); + $encregex = "/]*(?:charset\s*=\s*['\"]([^>'\"]*)['\"]" . + "|http-equiv\s*=\s*['\"]content-type['\"]\s+[^>]*content\s*=\s*['\"][^>]*charset=([^>]*)['\"])[^>]*>/i"; + if(preg_match($encregex, $body, $matches)) { + $enc = strtoupper($matches[sizeof($matches) - 1]); } else { - $ctregex = "/]*http-equiv\s*=\s*['\"]content-type['\"]\s+[^>]*content\s*=\s*['\"][^>]*charset=([^>]*)['\"][^>]*>/i"; - if(preg_match($ctregex, $body, $matches)) { - $enc = strtoupper($matches[1]); - } else { - $enc = mb_detect_encoding($body); - } + $enc = mb_detect_encoding($body); } $enc = $enc ? $enc : "UTF-8"; $body = mb_convert_encoding($body, 'HTML-ENTITIES', $enc); -- cgit v1.2.3