summaryrefslogtreecommitdiffstats
path: root/lib/Fetcher
diff options
context:
space:
mode:
authorBernhard Posselt <dev@bernhard-posselt.com>2016-07-23 21:24:54 +0200
committerBernhard Posselt <dev@bernhard-posselt.com>2016-07-23 21:24:54 +0200
commit004fcbbcc7609ca83807f2e38967ef54f469bf72 (patch)
tree49eb99b4ea92b2045793fc567f719b31ec7f9042 /lib/Fetcher
parent60abc0ed4438c9b6fda245b0dc33cb483bc2aeaf (diff)
Move to new directory structure
Diffstat (limited to 'lib/Fetcher')
-rw-r--r--lib/Fetcher/FeedFetcher.php302
-rw-r--r--lib/Fetcher/Fetcher.php69
-rw-r--r--lib/Fetcher/FetcherException.php26
-rw-r--r--lib/Fetcher/IFeedFetcher.php48
-rw-r--r--lib/Fetcher/YoutubeFetcher.php82
5 files changed, 527 insertions, 0 deletions
diff --git a/lib/Fetcher/FeedFetcher.php b/lib/Fetcher/FeedFetcher.php
new file mode 100644
index 000000000..beffe9051
--- /dev/null
+++ b/lib/Fetcher/FeedFetcher.php
@@ -0,0 +1,302 @@
+<?php
+/**
+ * ownCloud - News
+ *
+ * This file is licensed under the Affero General Public License version 3 or
+ * later. See the COPYING file.
+ *
+ * @author Alessandro Cosentino <cosenal@gmail.com>
+ * @author Bernhard Posselt <dev@bernhard-posselt.com>
+ * @copyright Alessandro Cosentino 2012
+ * @copyright Bernhard Posselt 2012, 2014
+ */
+
+namespace OCA\News\Fetcher;
+
+use Exception;
+
+use PicoFeed\Parser\MalFormedXmlException;
+use PicoFeed\Reader\Reader;
+use PicoFeed\Parser\Parser;
+use PicoFeed\Reader\SubscriptionNotFoundException;
+use PicoFeed\Reader\UnsupportedFeedFormatException;
+use PicoFeed\Client\InvalidCertificateException;
+use PicoFeed\Client\InvalidUrlException;
+use PicoFeed\Client\MaxRedirectException;
+use PicoFeed\Client\MaxSizeException;
+use PicoFeed\Client\TimeoutException;
+use PicoFeed\Client\ForbiddenException;
+use PicoFeed\Client\UnauthorizedException;
+
+use OCP\IL10N;
+
+use OCA\News\Db\Item;
+use OCA\News\Db\Feed;
+use OCA\News\Utility\PicoFeedFaviconFactory;
+use OCA\News\Utility\PicoFeedReaderFactory;
+use OCA\News\Utility\Time;
+
+class FeedFetcher implements IFeedFetcher {
+
+ private $faviconFactory;
+ private $reader;
+ private $l10n;
+ private $time;
+
+ public function __construct(Reader $reader,
+ PicoFeedFaviconFactory $faviconFactory,
+ IL10N $l10n,
+ Time $time) {
+ $this->faviconFactory = $faviconFactory;
+ $this->reader = $reader;
+ $this->time = $time;
+ $this->l10n = $l10n;
+ }
+
+
+ /**
+ * This fetcher handles all the remaining urls therefore always returns true
+ */
+ public function canHandle($url) {
+ return true;
+ }
+
+
+ /**
+ * Fetch a feed from remote
+ * @param string $url remote url of the feed
+ * @param boolean $getFavicon if the favicon should also be fetched,
+ * defaults to true
+ * @param string $lastModified a last modified value from an http header
+ * defaults to false. If lastModified matches the http header from the feed
+ * no results are fetched
+ * @param string $etag an etag from an http header.
+ * If lastModified matches the http header from the feed
+ * no results are fetched
+ * @param bool fullTextEnabled if true tells the fetcher to enhance the
+ * articles by fetching custom enhanced content
+ * @param string $basicAuthUser if given, basic auth is set for this feed
+ * @param string $basicAuthPassword if given, basic auth is set for this
+ * feed. Ignored if user is null or an empty string
+ * @throws FetcherException if it fails
+ * @return array an array containing the new feed and its items, first
+ * element being the Feed and second element being an array of Items
+ */
+ public function fetch($url, $getFavicon = true, $lastModified = null,
+ $etag = null, $fullTextEnabled = false,
+ $basicAuthUser = null, $basicAuthPassword = null) {
+ try {
+ if ($basicAuthUser !== null && trim($basicAuthUser) !== '') {
+ $resource = $this->reader->discover($url, $lastModified, $etag,
+ $basicAuthUser,
+ $basicAuthPassword);
+ } else {
+ $resource = $this->reader->discover($url, $lastModified, $etag);
+ }
+
+ if (!$resource->isModified()) {
+ return [null, null];
+ }
+
+ $location = $resource->getUrl();
+ $etag = $resource->getEtag();
+ $content = $resource->getContent();
+ $encoding = $resource->getEncoding();
+ $lastModified = $resource->getLastModified();
+
+ $parser = $this->reader->getParser($location, $content, $encoding);
+
+ if ($fullTextEnabled) {
+ $parser->enableContentGrabber();
+ }
+
+ $parsedFeed = $parser->execute();
+
+ $feed = $this->buildFeed(
+ $parsedFeed, $url, $getFavicon, $lastModified, $etag, $location
+ );
+
+ $items = [];
+ foreach ($parsedFeed->getItems() as $item) {
+ $items[] = $this->buildItem($item, $parsedFeed);
+ }
+
+ return [$feed, $items];
+
+ } catch (Exception $ex) {
+ $this->handleError($ex, $url);
+ }
+
+ }
+
+
+ private function handleError(Exception $ex, $url) {
+ $msg = $ex->getMessage();
+
+ if ($ex instanceof MalFormedXmlException) {
+ $msg = $this->l10n->t('Feed contains invalid XML');
+ } else if ($ex instanceof SubscriptionNotFoundException) {
+ $msg = $this->l10n->t('Feed not found: either the website ' .
+ 'does not provide a feed or blocks access. To rule out ' .
+ 'blocking, try to download the feed on your server\'s ' .
+ 'command line using curl: curl ' . $url);
+ } else if ($ex instanceof UnsupportedFeedFormatException) {
+ $msg = $this->l10n->t('Detected feed format is not supported');
+ } else if ($ex instanceof InvalidCertificateException) {
+ $msg = $this->buildCurlSslErrorMessage($ex->getCode());
+ } else if ($ex instanceof InvalidUrlException) {
+ $msg = $this->l10n->t('Website not found');
+ } else if ($ex instanceof MaxRedirectException) {
+ $msg = $this->l10n->t('More redirects than allowed, aborting');
+ } else if ($ex instanceof MaxSizeException) {
+ $msg = $this->l10n->t('Bigger than maximum allowed size');
+ } else if ($ex instanceof TimeoutException) {
+ $msg = $this->l10n->t('Request timed out');
+ } else if ($ex instanceof UnauthorizedException) {
+ $msg = $this->l10n->t('Required credentials for feed were ' .
+ 'either missing or incorrect');
+ } else if ($ex instanceof ForbiddenException) {
+ $msg = $this->l10n->t('Forbidden to access feed');
+ }
+
+ throw new FetcherException($msg);
+ }
+
+ private function buildCurlSslErrorMessage($errorCode) {
+ switch ($errorCode) {
+ case 35: // CURLE_SSL_CONNECT_ERROR
+ return $this->l10n->t(
+ 'Certificate error: A problem occurred ' .
+ 'somewhere in the SSL/TLS handshake. Could be ' .
+ 'certificates (file formats, paths, permissions), ' .
+ 'passwords, and others.'
+ );
+ case 51: // CURLE_PEER_FAILED_VERIFICATION
+ return $this->l10n->t(
+ 'Certificate error: The remote server\'s SSL ' .
+ 'certificate or SSH md5 fingerprint was deemed not OK.'
+ );
+ case 58: // CURLE_SSL_CERTPROBLEM
+ return $this->l10n->t(
+ 'Certificate error: Problem with the local client ' .
+ 'certificate.'
+ );
+ case 59: // CURLE_SSL_CIPHER
+ return $this->l10n->t(
+ 'Certificate error: Couldn\'t use specified cipher.'
+ );
+ case 60: // CURLE_SSL_CACERT
+ return $this->l10n->t(
+ 'Certificate error: Peer certificate cannot be ' .
+ 'authenticated with known CA certificates.'
+ );
+ case 64: // CURLE_USE_SSL_FAILED
+ return $this->l10n->t(
+ 'Certificate error: Requested FTP SSL level failed.'
+ );
+ case 66: // CURLE_SSL_ENGINE_INITFAILED
+ return $this->l10n->t(
+ 'Certificate error: Initiating the SSL Engine failed.'
+ );
+ case 77: // CURLE_SSL_CACERT_BADFILE
+ return $this->l10n->t(
+ 'Certificate error: Problem with reading the SSL CA ' .
+ 'cert (path? access rights?)'
+ );
+ case 83: // CURLE_SSL_ISSUER_ERROR
+ return $this->l10n->t(
+ 'Certificate error: Issuer check failed'
+ );
+ default:
+ return $this->l10n->t('Unknown SSL certificate error!');
+ }
+ }
+
+ private function decodeTwice($string) {
+ return html_entity_decode(
+ html_entity_decode(
+ $string, ENT_QUOTES | ENT_HTML5, 'UTF-8'
+ ),
+ ENT_QUOTES | ENT_HTML5, 'UTF-8'
+ );
+ }
+
+
+ protected function determineRtl($parsedItem, $parsedFeed) {
+ $itemLang = $parsedItem->getLanguage();
+ $feedLang = $parsedFeed->getLanguage();
+
+ if ($itemLang) {
+ return Parser::isLanguageRTL($itemLang);
+ } else {
+ return Parser::isLanguageRTL($feedLang);
+ }
+ }
+
+
+ protected function buildItem($parsedItem, $parsedFeed) {
+ $item = new Item();
+ $item->setUnread();
+ $item->setUrl($parsedItem->getUrl());
+ $item->setGuid($parsedItem->getId());
+ $item->setGuidHash($item->getGuid());
+ $item->setPubDate($parsedItem->getDate()->getTimestamp());
+ $item->setRtl($this->determineRtl($parsedItem, $parsedFeed));
+
+ // unescape content because angularjs helps against XSS
+ $item->setTitle($this->decodeTwice($parsedItem->getTitle()));
+ $item->setAuthor($this->decodeTwice($parsedItem->getAuthor()));
+
+ // purification is done in the service layer
+ $body = $parsedItem->getContent();
+ $body = mb_convert_encoding($body, 'HTML-ENTITIES',
+ mb_detect_encoding($body));
+ $item->setBody($body);
+
+ $enclosureUrl = $parsedItem->getEnclosureUrl();
+ if ($enclosureUrl) {
+ $enclosureType = $parsedItem->getEnclosureType();
+ if (stripos($enclosureType, 'audio/') !== false ||
+ stripos($enclosureType, 'video/') !== false
+ ) {
+ $item->setEnclosureMime($enclosureType);
+ $item->setEnclosureLink($enclosureUrl);
+ }
+ }
+
+ $item->generateSearchIndex();
+
+ return $item;
+ }
+
+
+ protected function buildFeed($parsedFeed, $url, $getFavicon, $modified,
+ $etag, $location) {
+ $feed = new Feed();
+
+ $link = $parsedFeed->getSiteUrl();
+
+ if (!$link) {
+ $link = $location;
+ }
+
+ // unescape content because angularjs helps against XSS
+ $title = strip_tags($this->decodeTwice($parsedFeed->getTitle()));
+ $feed->setTitle($title);
+ $feed->setUrl($url); // the url used to add the feed
+ $feed->setLocation($location); // the url where the feed was found
+ $feed->setLink($link); // <link> attribute in the feed
+ $feed->setHttpLastModified($modified);
+ $feed->setHttpEtag($etag);
+ $feed->setAdded($this->time->getTime());
+
+ if ($getFavicon) {
+ $faviconFetcher = $this->faviconFactory->build();
+ $favicon = $faviconFetcher->find($feed->getLink());
+ $feed->setFaviconLink($favicon);
+ }
+
+ return $feed;
+ }
+
+}
diff --git a/lib/Fetcher/Fetcher.php b/lib/Fetcher/Fetcher.php
new file mode 100644
index 000000000..43c9e7a3f
--- /dev/null
+++ b/lib/Fetcher/Fetcher.php
@@ -0,0 +1,69 @@
+<?php
+/**
+ * ownCloud - News
+ *
+ * This file is licensed under the Affero General Public License version 3 or
+ * later. See the COPYING file.
+ *
+ * @author Alessandro Cosentino <cosenal@gmail.com>
+ * @author Bernhard Posselt <dev@bernhard-posselt.com>
+ * @copyright Alessandro Cosentino 2012
+ * @copyright Bernhard Posselt 2012, 2014
+ */
+
+namespace OCA\News\Fetcher;
+
+
+class Fetcher {
+
+ private $fetchers;
+
+ public function __construct(){
+ $this->fetchers = [];
+ }
+
+
+ /**
+ * Add an additional fetcher
+ * @param IFeedFetcher $fetcher the fetcher
+ */
+ public function registerFetcher(IFeedFetcher $fetcher){
+ $this->fetchers[] = $fetcher;
+ }
+
+ /**
+ * Fetch a feed from remote
+ * @param string $url remote url of the feed
+ * @param boolean $getFavicon if the favicon should also be fetched,
+ * defaults to true
+ * @param string $lastModified a last modified value from an http header
+ * defaults to false. If lastModified matches the http header from the feed
+ * no results are fetched
+ * @param string $etag an etag from an http header.
+ * If lastModified matches the http header from the feed
+ * no results are fetched
+ * @param bool fullTextEnabled if true tells the fetcher to enhance the
+ * articles by fetching custom enhanced content
+ * @param string $basicAuthUser if given, basic auth is set for this feed
+ * @param string $basicAuthPassword if given, basic auth is set for this
+ * feed. Ignored if user is null or an empty string
+ * @throws FetcherException if simple pie fails
+ * @return array an array containing the new feed and its items, first
+ * element being the Feed and second element being an array of Items
+ */
+ public function fetch($url, $getFavicon=true, $lastModified=null,
+ $etag=null, $fullTextEnabled=false,
+ $basicAuthUser=null, $basicAuthPassword=null) {
+ foreach($this->fetchers as $fetcher){
+ if($fetcher->canHandle($url)){
+ return $fetcher->fetch($url, $getFavicon, $lastModified, $etag,
+ $fullTextEnabled, $basicAuthUser,
+ $basicAuthPassword);
+ }
+ }
+
+ return [null, []];
+ }
+
+
+}
diff --git a/lib/Fetcher/FetcherException.php b/lib/Fetcher/FetcherException.php
new file mode 100644
index 000000000..27dd42f39
--- /dev/null
+++ b/lib/Fetcher/FetcherException.php
@@ -0,0 +1,26 @@
+<?php
+/**
+ * ownCloud - News
+ *
+ * This file is licensed under the Affero General Public License version 3 or
+ * later. See the COPYING file.
+ *
+ * @author Alessandro Cosentino <cosenal@gmail.com>
+ * @author Bernhard Posselt <dev@bernhard-posselt.com>
+ * @copyright Alessandro Cosentino 2012
+ * @copyright Bernhard Posselt 2012, 2014
+ */
+
+namespace OCA\News\Fetcher;
+
+class FetcherException extends \Exception {
+
+ /**
+ * Constructor
+ * @param string $msg the error message
+ */
+ public function __construct($msg){
+ parent::__construct($msg);
+ }
+
+} \ No newline at end of file
diff --git a/lib/Fetcher/IFeedFetcher.php b/lib/Fetcher/IFeedFetcher.php
new file mode 100644
index 000000000..e854d83fe
--- /dev/null
+++ b/lib/Fetcher/IFeedFetcher.php
@@ -0,0 +1,48 @@
+<?php
+/**
+ * ownCloud - News
+ *
+ * This file is licensed under the Affero General Public License version 3 or
+ * later. See the COPYING file.
+ *
+ * @author Alessandro Cosentino <cosenal@gmail.com>
+ * @author Bernhard Posselt <dev@bernhard-posselt.com>
+ * @copyright Alessandro Cosentino 2012
+ * @copyright Bernhard Posselt 2012, 2014
+ */
+
+namespace OCA\News\Fetcher;
+
+interface IFeedFetcher {
+
+ /**
+ * @param string $url remote url of the feed
+ * @param boolean $getFavicon if the favicon should also be fetched,
+ * defaults to true
+ * @param string $lastModified a last modified value from an http header
+ * defaults to false. If lastModified matches the http header from the feed
+ * no results are fetched
+ * @param string $etag an etag from an http header.
+ * If lastModified matches the http header from the feed
+ * no results are fetched
+ * @param bool fullTextEnabled if true tells the fetcher to enhance the
+ * articles by fetching custom enhanced content
+ * @param string $basicAuthUser if given, basic auth is set for this feed
+ * @param string $basicAuthPassword if given, basic auth is set for this
+ * feed. Ignored if user is null or an empty string
+ * @throws FetcherException if the fetcher encounters a problem
+ * @return array an array containing the new feed and its items, first
+ * element being the Feed and second element being an array of Items
+ */
+ function fetch($url, $getFavicon=true, $lastModified=null, $etag=null,
+ $fullTextEnabled=false, $basicAuthUser=null,
+ $basicAuthPassword=null);
+
+ /**
+ * @param string $url the url that should be fetched
+ * @return boolean if the fetcher can handle the url. This fetcher will be
+ * used exclusively to fetch the feed and the items of the page
+ */
+ function canHandle($url);
+
+}
diff --git a/lib/Fetcher/YoutubeFetcher.php b/lib/Fetcher/YoutubeFetcher.php
new file mode 100644
index 000000000..3752ba197
--- /dev/null
+++ b/lib/Fetcher/YoutubeFetcher.php
@@ -0,0 +1,82 @@
+<?php
+/**
+ * ownCloud - News
+ *
+ * This file is licensed under the Affero General Public License version 3 or
+ * later. See the COPYING file.
+ *
+ * @author Bernhard Posselt <dev@bernhard-posselt.com>
+ * @copyright Bernhard Posselt 2012, 2014
+ */
+
+namespace OCA\News\Fetcher;
+
+
+class YoutubeFetcher implements IFeedFetcher {
+
+ private $feedFetcher;
+
+ public function __construct(FeedFetcher $feedFetcher){
+ $this->feedFetcher = $feedFetcher;
+ }
+
+
+ private function buildUrl($url) {
+ $baseRegex = '%(?:https?://|//)?(?:www.)?youtube.com';
+ $playRegex = $baseRegex . '.*?list=([^&]*)%';
+
+ if (preg_match($playRegex, $url, $matches)) {
+ $id = $matches[1];
+ return 'http://gdata.youtube.com/feeds/api/playlists/' . $id;
+ } else {
+ return $url;
+ }
+ }
+
+
+ /**
+ * This fetcher handles all the remaining urls therefore always returns true
+ */
+ public function canHandle($url){
+ return $this->buildUrl($url) !== $url;
+ }
+
+
+ /**
+ * Fetch a feed from remote
+ * @param string $url remote url of the feed
+ * @param boolean $getFavicon if the favicon should also be fetched,
+ * defaults to true
+ * @param string $lastModified a last modified value from an http header
+ * defaults to false. If lastModified matches the http header from the feed
+ * no results are fetched
+ * @param string $etag an etag from an http header.
+ * If lastModified matches the http header from the feed
+ * no results are fetched
+ * @param bool fullTextEnabled if true tells the fetcher to enhance the
+ * articles by fetching custom enhanced content
+ * @param string $basicAuthUser if given, basic auth is set for this feed
+ * @param string $basicAuthPassword if given, basic auth is set for this
+ * feed. Ignored if user is null or an empty string
+ * @throws FetcherException if it fails
+ * @return array an array containing the new feed and its items, first
+ * element being the Feed and second element being an array of Items
+ */
+ public function fetch($url, $getFavicon=true, $lastModified=null,
+ $etag=null, $fullTextEnabled=false,
+ $basicAuthUser=null, $basicAuthPassword=null) {
+ $transformedUrl = $this->buildUrl($url);
+
+ $result = $this->feedFetcher->fetch(
+ $transformedUrl, $getFavicon, $lastModified, $etag,
+ $fullTextEnabled, $basicAuthUser, $basicAuthPassword
+ );
+
+ // reset feed url so we know the correct added url for the feed
+ $result[0]->setUrl($url);
+
+ return $result;
+ }
+
+
+}