From 004fcbbcc7609ca83807f2e38967ef54f469bf72 Mon Sep 17 00:00:00 2001 From: Bernhard Posselt Date: Sat, 23 Jul 2016 21:24:54 +0200 Subject: Move to new directory structure --- lib/Fetcher/FeedFetcher.php | 302 +++++++++++++++++++++++++++++++++++++++ lib/Fetcher/Fetcher.php | 69 +++++++++ lib/Fetcher/FetcherException.php | 26 ++++ lib/Fetcher/IFeedFetcher.php | 48 +++++++ lib/Fetcher/YoutubeFetcher.php | 82 +++++++++++ 5 files changed, 527 insertions(+) create mode 100644 lib/Fetcher/FeedFetcher.php create mode 100644 lib/Fetcher/Fetcher.php create mode 100644 lib/Fetcher/FetcherException.php create mode 100644 lib/Fetcher/IFeedFetcher.php create mode 100644 lib/Fetcher/YoutubeFetcher.php (limited to 'lib/Fetcher') diff --git a/lib/Fetcher/FeedFetcher.php b/lib/Fetcher/FeedFetcher.php new file mode 100644 index 000000000..beffe9051 --- /dev/null +++ b/lib/Fetcher/FeedFetcher.php @@ -0,0 +1,302 @@ + + * @author Bernhard Posselt + * @copyright Alessandro Cosentino 2012 + * @copyright Bernhard Posselt 2012, 2014 + */ + +namespace OCA\News\Fetcher; + +use Exception; + +use PicoFeed\Parser\MalFormedXmlException; +use PicoFeed\Reader\Reader; +use PicoFeed\Parser\Parser; +use PicoFeed\Reader\SubscriptionNotFoundException; +use PicoFeed\Reader\UnsupportedFeedFormatException; +use PicoFeed\Client\InvalidCertificateException; +use PicoFeed\Client\InvalidUrlException; +use PicoFeed\Client\MaxRedirectException; +use PicoFeed\Client\MaxSizeException; +use PicoFeed\Client\TimeoutException; +use PicoFeed\Client\ForbiddenException; +use PicoFeed\Client\UnauthorizedException; + +use OCP\IL10N; + +use OCA\News\Db\Item; +use OCA\News\Db\Feed; +use OCA\News\Utility\PicoFeedFaviconFactory; +use OCA\News\Utility\PicoFeedReaderFactory; +use OCA\News\Utility\Time; + +class FeedFetcher implements IFeedFetcher { + + private $faviconFactory; + private $reader; + private $l10n; + private $time; + + public function __construct(Reader $reader, + PicoFeedFaviconFactory $faviconFactory, + IL10N $l10n, + Time $time) { + $this->faviconFactory = $faviconFactory; + $this->reader = $reader; + $this->time = $time; + $this->l10n = $l10n; + } + + + /** + * This fetcher handles all the remaining urls therefore always returns true + */ + public function canHandle($url) { + return true; + } + + + /** + * Fetch a feed from remote + * @param string $url remote url of the feed + * @param boolean $getFavicon if the favicon should also be fetched, + * defaults to true + * @param string $lastModified a last modified value from an http header + * defaults to false. If lastModified matches the http header from the feed + * no results are fetched + * @param string $etag an etag from an http header. + * If lastModified matches the http header from the feed + * no results are fetched + * @param bool fullTextEnabled if true tells the fetcher to enhance the + * articles by fetching custom enhanced content + * @param string $basicAuthUser if given, basic auth is set for this feed + * @param string $basicAuthPassword if given, basic auth is set for this + * feed. Ignored if user is null or an empty string + * @throws FetcherException if it fails + * @return array an array containing the new feed and its items, first + * element being the Feed and second element being an array of Items + */ + public function fetch($url, $getFavicon = true, $lastModified = null, + $etag = null, $fullTextEnabled = false, + $basicAuthUser = null, $basicAuthPassword = null) { + try { + if ($basicAuthUser !== null && trim($basicAuthUser) !== '') { + $resource = $this->reader->discover($url, $lastModified, $etag, + $basicAuthUser, + $basicAuthPassword); + } else { + $resource = $this->reader->discover($url, $lastModified, $etag); + } + + if (!$resource->isModified()) { + return [null, null]; + } + + $location = $resource->getUrl(); + $etag = $resource->getEtag(); + $content = $resource->getContent(); + $encoding = $resource->getEncoding(); + $lastModified = $resource->getLastModified(); + + $parser = $this->reader->getParser($location, $content, $encoding); + + if ($fullTextEnabled) { + $parser->enableContentGrabber(); + } + + $parsedFeed = $parser->execute(); + + $feed = $this->buildFeed( + $parsedFeed, $url, $getFavicon, $lastModified, $etag, $location + ); + + $items = []; + foreach ($parsedFeed->getItems() as $item) { + $items[] = $this->buildItem($item, $parsedFeed); + } + + return [$feed, $items]; + + } catch (Exception $ex) { + $this->handleError($ex, $url); + } + + } + + + private function handleError(Exception $ex, $url) { + $msg = $ex->getMessage(); + + if ($ex instanceof MalFormedXmlException) { + $msg = $this->l10n->t('Feed contains invalid XML'); + } else if ($ex instanceof SubscriptionNotFoundException) { + $msg = $this->l10n->t('Feed not found: either the website ' . + 'does not provide a feed or blocks access. To rule out ' . + 'blocking, try to download the feed on your server\'s ' . + 'command line using curl: curl ' . $url); + } else if ($ex instanceof UnsupportedFeedFormatException) { + $msg = $this->l10n->t('Detected feed format is not supported'); + } else if ($ex instanceof InvalidCertificateException) { + $msg = $this->buildCurlSslErrorMessage($ex->getCode()); + } else if ($ex instanceof InvalidUrlException) { + $msg = $this->l10n->t('Website not found'); + } else if ($ex instanceof MaxRedirectException) { + $msg = $this->l10n->t('More redirects than allowed, aborting'); + } else if ($ex instanceof MaxSizeException) { + $msg = $this->l10n->t('Bigger than maximum allowed size'); + } else if ($ex instanceof TimeoutException) { + $msg = $this->l10n->t('Request timed out'); + } else if ($ex instanceof UnauthorizedException) { + $msg = $this->l10n->t('Required credentials for feed were ' . + 'either missing or incorrect'); + } else if ($ex instanceof ForbiddenException) { + $msg = $this->l10n->t('Forbidden to access feed'); + } + + throw new FetcherException($msg); + } + + private function buildCurlSslErrorMessage($errorCode) { + switch ($errorCode) { + case 35: // CURLE_SSL_CONNECT_ERROR + return $this->l10n->t( + 'Certificate error: A problem occurred ' . + 'somewhere in the SSL/TLS handshake. Could be ' . + 'certificates (file formats, paths, permissions), ' . + 'passwords, and others.' + ); + case 51: // CURLE_PEER_FAILED_VERIFICATION + return $this->l10n->t( + 'Certificate error: The remote server\'s SSL ' . + 'certificate or SSH md5 fingerprint was deemed not OK.' + ); + case 58: // CURLE_SSL_CERTPROBLEM + return $this->l10n->t( + 'Certificate error: Problem with the local client ' . + 'certificate.' + ); + case 59: // CURLE_SSL_CIPHER + return $this->l10n->t( + 'Certificate error: Couldn\'t use specified cipher.' + ); + case 60: // CURLE_SSL_CACERT + return $this->l10n->t( + 'Certificate error: Peer certificate cannot be ' . + 'authenticated with known CA certificates.' + ); + case 64: // CURLE_USE_SSL_FAILED + return $this->l10n->t( + 'Certificate error: Requested FTP SSL level failed.' + ); + case 66: // CURLE_SSL_ENGINE_INITFAILED + return $this->l10n->t( + 'Certificate error: Initiating the SSL Engine failed.' + ); + case 77: // CURLE_SSL_CACERT_BADFILE + return $this->l10n->t( + 'Certificate error: Problem with reading the SSL CA ' . + 'cert (path? access rights?)' + ); + case 83: // CURLE_SSL_ISSUER_ERROR + return $this->l10n->t( + 'Certificate error: Issuer check failed' + ); + default: + return $this->l10n->t('Unknown SSL certificate error!'); + } + } + + private function decodeTwice($string) { + return html_entity_decode( + html_entity_decode( + $string, ENT_QUOTES | ENT_HTML5, 'UTF-8' + ), + ENT_QUOTES | ENT_HTML5, 'UTF-8' + ); + } + + + protected function determineRtl($parsedItem, $parsedFeed) { + $itemLang = $parsedItem->getLanguage(); + $feedLang = $parsedFeed->getLanguage(); + + if ($itemLang) { + return Parser::isLanguageRTL($itemLang); + } else { + return Parser::isLanguageRTL($feedLang); + } + } + + + protected function buildItem($parsedItem, $parsedFeed) { + $item = new Item(); + $item->setUnread(); + $item->setUrl($parsedItem->getUrl()); + $item->setGuid($parsedItem->getId()); + $item->setGuidHash($item->getGuid()); + $item->setPubDate($parsedItem->getDate()->getTimestamp()); + $item->setRtl($this->determineRtl($parsedItem, $parsedFeed)); + + // unescape content because angularjs helps against XSS + $item->setTitle($this->decodeTwice($parsedItem->getTitle())); + $item->setAuthor($this->decodeTwice($parsedItem->getAuthor())); + + // purification is done in the service layer + $body = $parsedItem->getContent(); + $body = mb_convert_encoding($body, 'HTML-ENTITIES', + mb_detect_encoding($body)); + $item->setBody($body); + + $enclosureUrl = $parsedItem->getEnclosureUrl(); + if ($enclosureUrl) { + $enclosureType = $parsedItem->getEnclosureType(); + if (stripos($enclosureType, 'audio/') !== false || + stripos($enclosureType, 'video/') !== false + ) { + $item->setEnclosureMime($enclosureType); + $item->setEnclosureLink($enclosureUrl); + } + } + + $item->generateSearchIndex(); + + return $item; + } + + + protected function buildFeed($parsedFeed, $url, $getFavicon, $modified, + $etag, $location) { + $feed = new Feed(); + + $link = $parsedFeed->getSiteUrl(); + + if (!$link) { + $link = $location; + } + + // unescape content because angularjs helps against XSS + $title = strip_tags($this->decodeTwice($parsedFeed->getTitle())); + $feed->setTitle($title); + $feed->setUrl($url); // the url used to add the feed + $feed->setLocation($location); // the url where the feed was found + $feed->setLink($link); // attribute in the feed + $feed->setHttpLastModified($modified); + $feed->setHttpEtag($etag); + $feed->setAdded($this->time->getTime()); + + if ($getFavicon) { + $faviconFetcher = $this->faviconFactory->build(); + $favicon = $faviconFetcher->find($feed->getLink()); + $feed->setFaviconLink($favicon); + } + + return $feed; + } + +} diff --git a/lib/Fetcher/Fetcher.php b/lib/Fetcher/Fetcher.php new file mode 100644 index 000000000..43c9e7a3f --- /dev/null +++ b/lib/Fetcher/Fetcher.php @@ -0,0 +1,69 @@ + + * @author Bernhard Posselt + * @copyright Alessandro Cosentino 2012 + * @copyright Bernhard Posselt 2012, 2014 + */ + +namespace OCA\News\Fetcher; + + +class Fetcher { + + private $fetchers; + + public function __construct(){ + $this->fetchers = []; + } + + + /** + * Add an additional fetcher + * @param IFeedFetcher $fetcher the fetcher + */ + public function registerFetcher(IFeedFetcher $fetcher){ + $this->fetchers[] = $fetcher; + } + + /** + * Fetch a feed from remote + * @param string $url remote url of the feed + * @param boolean $getFavicon if the favicon should also be fetched, + * defaults to true + * @param string $lastModified a last modified value from an http header + * defaults to false. If lastModified matches the http header from the feed + * no results are fetched + * @param string $etag an etag from an http header. + * If lastModified matches the http header from the feed + * no results are fetched + * @param bool fullTextEnabled if true tells the fetcher to enhance the + * articles by fetching custom enhanced content + * @param string $basicAuthUser if given, basic auth is set for this feed + * @param string $basicAuthPassword if given, basic auth is set for this + * feed. Ignored if user is null or an empty string + * @throws FetcherException if simple pie fails + * @return array an array containing the new feed and its items, first + * element being the Feed and second element being an array of Items + */ + public function fetch($url, $getFavicon=true, $lastModified=null, + $etag=null, $fullTextEnabled=false, + $basicAuthUser=null, $basicAuthPassword=null) { + foreach($this->fetchers as $fetcher){ + if($fetcher->canHandle($url)){ + return $fetcher->fetch($url, $getFavicon, $lastModified, $etag, + $fullTextEnabled, $basicAuthUser, + $basicAuthPassword); + } + } + + return [null, []]; + } + + +} diff --git a/lib/Fetcher/FetcherException.php b/lib/Fetcher/FetcherException.php new file mode 100644 index 000000000..27dd42f39 --- /dev/null +++ b/lib/Fetcher/FetcherException.php @@ -0,0 +1,26 @@ + + * @author Bernhard Posselt + * @copyright Alessandro Cosentino 2012 + * @copyright Bernhard Posselt 2012, 2014 + */ + +namespace OCA\News\Fetcher; + +class FetcherException extends \Exception { + + /** + * Constructor + * @param string $msg the error message + */ + public function __construct($msg){ + parent::__construct($msg); + } + +} \ No newline at end of file diff --git a/lib/Fetcher/IFeedFetcher.php b/lib/Fetcher/IFeedFetcher.php new file mode 100644 index 000000000..e854d83fe --- /dev/null +++ b/lib/Fetcher/IFeedFetcher.php @@ -0,0 +1,48 @@ + + * @author Bernhard Posselt + * @copyright Alessandro Cosentino 2012 + * @copyright Bernhard Posselt 2012, 2014 + */ + +namespace OCA\News\Fetcher; + +interface IFeedFetcher { + + /** + * @param string $url remote url of the feed + * @param boolean $getFavicon if the favicon should also be fetched, + * defaults to true + * @param string $lastModified a last modified value from an http header + * defaults to false. If lastModified matches the http header from the feed + * no results are fetched + * @param string $etag an etag from an http header. + * If lastModified matches the http header from the feed + * no results are fetched + * @param bool fullTextEnabled if true tells the fetcher to enhance the + * articles by fetching custom enhanced content + * @param string $basicAuthUser if given, basic auth is set for this feed + * @param string $basicAuthPassword if given, basic auth is set for this + * feed. Ignored if user is null or an empty string + * @throws FetcherException if the fetcher encounters a problem + * @return array an array containing the new feed and its items, first + * element being the Feed and second element being an array of Items + */ + function fetch($url, $getFavicon=true, $lastModified=null, $etag=null, + $fullTextEnabled=false, $basicAuthUser=null, + $basicAuthPassword=null); + + /** + * @param string $url the url that should be fetched + * @return boolean if the fetcher can handle the url. This fetcher will be + * used exclusively to fetch the feed and the items of the page + */ + function canHandle($url); + +} diff --git a/lib/Fetcher/YoutubeFetcher.php b/lib/Fetcher/YoutubeFetcher.php new file mode 100644 index 000000000..3752ba197 --- /dev/null +++ b/lib/Fetcher/YoutubeFetcher.php @@ -0,0 +1,82 @@ + + * @copyright Bernhard Posselt 2012, 2014 + */ + +namespace OCA\News\Fetcher; + + +class YoutubeFetcher implements IFeedFetcher { + + private $feedFetcher; + + public function __construct(FeedFetcher $feedFetcher){ + $this->feedFetcher = $feedFetcher; + } + + + private function buildUrl($url) { + $baseRegex = '%(?:https?://|//)?(?:www.)?youtube.com'; + $playRegex = $baseRegex . '.*?list=([^&]*)%'; + + if (preg_match($playRegex, $url, $matches)) { + $id = $matches[1]; + return 'http://gdata.youtube.com/feeds/api/playlists/' . $id; + } else { + return $url; + } + } + + + /** + * This fetcher handles all the remaining urls therefore always returns true + */ + public function canHandle($url){ + return $this->buildUrl($url) !== $url; + } + + + /** + * Fetch a feed from remote + * @param string $url remote url of the feed + * @param boolean $getFavicon if the favicon should also be fetched, + * defaults to true + * @param string $lastModified a last modified value from an http header + * defaults to false. If lastModified matches the http header from the feed + * no results are fetched + * @param string $etag an etag from an http header. + * If lastModified matches the http header from the feed + * no results are fetched + * @param bool fullTextEnabled if true tells the fetcher to enhance the + * articles by fetching custom enhanced content + * @param string $basicAuthUser if given, basic auth is set for this feed + * @param string $basicAuthPassword if given, basic auth is set for this + * feed. Ignored if user is null or an empty string + * @throws FetcherException if it fails + * @return array an array containing the new feed and its items, first + * element being the Feed and second element being an array of Items + */ + public function fetch($url, $getFavicon=true, $lastModified=null, + $etag=null, $fullTextEnabled=false, + $basicAuthUser=null, $basicAuthPassword=null) { + $transformedUrl = $this->buildUrl($url); + + $result = $this->feedFetcher->fetch( + $transformedUrl, $getFavicon, $lastModified, $etag, + $fullTextEnabled, $basicAuthUser, $basicAuthPassword + ); + + // reset feed url so we know the correct added url for the feed + $result[0]->setUrl($url); + + return $result; + } + + +} -- cgit v1.2.3