From a3246a927de542e1b3ab403359bfd3c08705b6a7 Mon Sep 17 00:00:00 2001 From: Sean Molenaar Date: Wed, 30 Jan 2019 20:36:40 +0100 Subject: Parser: Switch to feedIO for parsing instead of picoFeed --- lib/Fetcher/FeedFetcher.php | 377 +++++++++++++++-------------------------- lib/Fetcher/Fetcher.php | 45 +++-- lib/Fetcher/IFeedFetcher.php | 17 +- lib/Fetcher/YoutubeFetcher.php | 25 +-- 4 files changed, 169 insertions(+), 295 deletions(-) (limited to 'lib/Fetcher') diff --git a/lib/Fetcher/FeedFetcher.php b/lib/Fetcher/FeedFetcher.php index 65a4b5526..ae338ca09 100644 --- a/lib/Fetcher/FeedFetcher.php +++ b/lib/Fetcher/FeedFetcher.php @@ -13,29 +13,17 @@ namespace OCA\News\Fetcher; -use Exception; - -use OCA\News\PostProcessor\LWNProcessor; +use DateTime; +use Favicon\Favicon; +use FeedIo\Feed\ItemInterface; +use FeedIo\FeedInterface; +use FeedIo\FeedIo; use OCP\Http\Client\IClientService; -use PicoFeed\Parser\MalFormedXmlException; -use PicoFeed\Reader\Reader; -use PicoFeed\Parser\Parser; -use PicoFeed\Reader\SubscriptionNotFoundException; -use PicoFeed\Reader\UnsupportedFeedFormatException; -use PicoFeed\Client\InvalidCertificateException; -use PicoFeed\Client\InvalidUrlException; -use PicoFeed\Client\MaxRedirectException; -use PicoFeed\Client\MaxSizeException; -use PicoFeed\Client\TimeoutException; -use PicoFeed\Client\ForbiddenException; -use PicoFeed\Client\UnauthorizedException; use OCP\IL10N; use OCA\News\Db\Item; use OCA\News\Db\Feed; -use OCA\News\Utility\PicoFeedFaviconFactory; -use OCA\News\Utility\PicoFeedReaderFactory; use OCA\News\Utility\Time; class FeedFetcher implements IFeedFetcher @@ -48,22 +36,26 @@ class FeedFetcher implements IFeedFetcher private $clientService; public function __construct( - Reader $reader, - PicoFeedFaviconFactory $faviconFactory, + FeedIo $fetcher, + Favicon $favicon, IL10N $l10n, Time $time, IClientService $clientService ) { - $this->faviconFactory = $faviconFactory; - $this->reader = $reader; - $this->time = $time; - $this->l10n = $l10n; - $this->clientService = $clientService; + $this->faviconFactory = $favicon; + $this->reader = $fetcher; + $this->time = $time; + $this->l10n = $l10n; + $this->clientService = $clientService; } /** - * This fetcher handles all the remaining urls therefore always returns true + * This fetcher handles all the remaining urls therefore always returns true. + * + * @param string $url The URL to check + * + * @return bool */ public function canHandle($url) { @@ -74,176 +66,55 @@ class FeedFetcher implements IFeedFetcher /** * Fetch a feed from remote * - * @param string $url remote url of the feed - * @param boolean $getFavicon if the favicon should also be fetched, defaults to true - * @param string $lastModified a last modified value from an http header defaults to false. - * If lastModified matches the http header from the feed no results are fetched - * @param string $etag an etag from an http header. - * If lastModified matches the http header from the feed no results are fetched - * @param bool $fullTextEnabled if true tells the fetcher to enhance the articles by fetching more content - * @param string $basicAuthUser if given, basic auth is set for this feed - * @param string $basicAuthPassword if given, basic auth is set for this feed. Ignored if user is empty + * @param string $url Remote url of the feed + * @param boolean $getFavicon If the favicon should also be fetched, + * defaults to true + * @param string $lastModified A last modified value from an http header + * defaults to false. If lastModified matches + * the header from the feed no results are fetched + * @param string $user If given, basic auth is set for this feed + * @param string $password If given, basic auth is set for this feed. + * Ignored if user is null or an empty string. * - * @throws FetcherException if it fails * @return array an array containing the new feed and its items, first * element being the Feed and second element being an array of Items */ - public function fetch( - $url, - $getFavicon = true, - $lastModified = null, - $etag = null, - $fullTextEnabled = false, - $basicAuthUser = null, - $basicAuthPassword = null - ) { - try { - if ($basicAuthUser !== null && trim($basicAuthUser) !== '') { - $resource = $this->reader->discover( - $url, - $lastModified, - $etag, - $basicAuthUser, - $basicAuthPassword - ); - } else { - $resource = $this->reader->discover($url, $lastModified, $etag); - } - - if (!$resource->isModified()) { - return [null, null]; - } - - $location = $resource->getUrl(); - $etag = $resource->getEtag(); - $content = $resource->getContent(); - $encoding = $resource->getEncoding(); - $lastModified = $resource->getLastModified(); - - $parser = $this->reader->getParser($location, $content, $encoding); - - if ($fullTextEnabled) { - $parser->enableContentGrabber(); - $parser->getItemPostProcessor()->register( - new LWNProcessor( - $basicAuthUser, - $basicAuthPassword, - $this->clientService - ) - ); - } - - $parsedFeed = $parser->execute(); - - $feed = $this->buildFeed( - $parsedFeed, - $url, - $getFavicon, - $lastModified, - $etag, - $location - ); - - $items = []; - foreach ($parsedFeed->getItems() as $item) { - $items[] = $this->buildItem($item, $parsedFeed); - } - - return [$feed, $items]; - } catch (Exception $ex) { - $this->handleError($ex, $url); - } - } - - - private function handleError(Exception $ex, $url) + public function fetch($url, $getFavicon = true, $lastModified = null, $user = null, $password = null) { - $msg = $ex->getMessage(); + if ($user !== null && trim($user) !== '') { + $url = explode('://', $url); + $url = $url[0] . '://' . $user . ':' . $password . '@' . $url[1]; + } + $resource = $this->reader->readSince($url, new DateTime($lastModified)); - if ($ex instanceof MalFormedXmlException) { - $msg = $this->l10n->t('Feed contains invalid XML'); - } elseif ($ex instanceof SubscriptionNotFoundException) { - $msg = $this->l10n->t( - 'Feed not found: Either the website ' . - 'does not provide a feed or blocks access. To rule out ' . - 'blocking, try to download the feed on your server\'s ' . - 'command line using curl: curl ' . $url - ); - } elseif ($ex instanceof UnsupportedFeedFormatException) { - $msg = $this->l10n->t('Detected feed format is not supported'); - } elseif ($ex instanceof InvalidCertificateException) { - $msg = $this->buildCurlSslErrorMessage($ex->getCode()); - } elseif ($ex instanceof InvalidUrlException) { - $msg = $this->l10n->t('Website not found'); - } elseif ($ex instanceof MaxRedirectException) { - $msg = $this->l10n->t('More redirects than allowed, aborting'); - } elseif ($ex instanceof MaxSizeException) { - $msg = $this->l10n->t('Bigger than maximum allowed size'); - } elseif ($ex instanceof TimeoutException) { - $msg = $this->l10n->t('Request timed out'); - } elseif ($ex instanceof UnauthorizedException) { - $msg = $this->l10n->t( - 'Required credentials for feed were ' . - 'either missing or incorrect' - ); - } elseif ($ex instanceof ForbiddenException) { - $msg = $this->l10n->t('Forbidden to access feed'); + if (!$resource->getResponse()->isModified()) { + return [null, null]; } - throw new FetcherException($msg); - } + $location = $resource->getUrl(); + $parsedFeed = $resource->getFeed(); + $feed = $this->buildFeed( + $parsedFeed, + $url, + $getFavicon, + $location + ); - private function buildCurlSslErrorMessage($errorCode) - { - switch ($errorCode) { - case 35: // CURLE_SSL_CONNECT_ERROR - return $this->l10n->t( - 'Certificate error: A problem occurred ' . - 'somewhere in the SSL/TLS handshake. Could be ' . - 'certificates (file formats, paths, permissions), ' . - 'passwords, and others.' - ); - case 51: // CURLE_PEER_FAILED_VERIFICATION - return $this->l10n->t( - 'Certificate error: The remote server\'s SSL ' . - 'certificate or SSH md5 fingerprint was deemed not OK.' - ); - case 58: // CURLE_SSL_CERTPROBLEM - return $this->l10n->t( - 'Certificate error: Problem with the local client ' . - 'certificate.' - ); - case 59: // CURLE_SSL_CIPHER - return $this->l10n->t( - 'Certificate error: Couldn\'t use specified cipher.' - ); - case 60: // CURLE_SSL_CACERT - return $this->l10n->t( - 'Certificate error: Peer certificate cannot be ' . - 'authenticated with known CA certificates.' - ); - case 64: // CURLE_USE_SSL_FAILED - return $this->l10n->t( - 'Certificate error: Requested FTP SSL level failed.' - ); - case 66: // CURLE_SSL_ENGINE_INITFAILED - return $this->l10n->t( - 'Certificate error: Initiating the SSL engine failed.' - ); - case 77: // CURLE_SSL_CACERT_BADFILE - return $this->l10n->t( - 'Certificate error: Problem with reading the SSL CA ' . - 'cert (path? access rights?)' - ); - case 83: // CURLE_SSL_ISSUER_ERROR - return $this->l10n->t( - 'Certificate error: Issuer check failed' - ); - default: - return $this->l10n->t('Unknown SSL certificate error!'); + $items = []; + foreach ($parsedFeed as $item) { + $items[] = $this->buildItem($item, $parsedFeed); } + + return [$feed, $items]; } + /** + * Decode the string twice + * + * @param string $string String to decode + * + * @return string + */ private function decodeTwice($string) { return html_entity_decode( @@ -257,37 +128,73 @@ class FeedFetcher implements IFeedFetcher ); } - - protected function determineRtl($parsedItem, $parsedFeed) + /** + * Check if a feed is RTL or not + * + * @param FeedInterface $parsedFeed The feed that was parsed + * + * @return bool + */ + protected function determineRtl($parsedFeed) { - $itemLang = $parsedItem->getLanguage(); - $feedLang = $parsedFeed->getLanguage(); - - if ($itemLang) { - return Parser::isLanguageRTL($itemLang); - } else { - return Parser::isLanguageRTL($feedLang); + $language = $parsedFeed->getLanguage(); + + $language = strtolower($language); + $rtl_languages = array( + 'ar', // Arabic (ar-**) + 'fa', // Farsi (fa-**) + 'ur', // Urdu (ur-**) + 'ps', // Pashtu (ps-**) + 'syr', // Syriac (syr-**) + 'dv', // Divehi (dv-**) + 'he', // Hebrew (he-**) + 'yi', // Yiddish (yi-**) + ); + foreach ($rtl_languages as $prefix) { + if (strpos($language, $prefix) === 0) { + return true; + } } + return false; } - + /** + * Build an item based on a feed. + * + * @param ItemInterface $parsedItem The item to use + * @param FeedInterface $parsedFeed The feed to use + * + * @return Item + */ protected function buildItem($parsedItem, $parsedFeed) { $item = new Item(); $item->setUnread(true); - $item->setUrl($parsedItem->getUrl()); - $item->setGuid($parsedItem->getId()); + $item->setUrl($parsedItem->getLink()); + $item->setGuid($parsedItem->getPublicId()); $item->setGuidHash($item->getGuid()); - $item->setPubDate($parsedItem->getPublishedDate()->getTimestamp()); - $item->setUpdatedDate($parsedItem->getUpdatedDate()->getTimestamp()); - $item->setRtl($this->determineRtl($parsedItem, $parsedFeed)); + + $pubDT = $parsedItem->getLastModified(); + if ($parsedItem->getValue('pubDate') !== null) { + $pubDT = new DateTime($parsedItem->getValue('pubDate')); + } elseif ($parsedItem->getValue('published') !== null) { + $pubDT = new DateTime($parsedItem->getValue('published')); + } + + $item->setPubDate( + $pubDT->getTimestamp() + ); + $item->setLastModified( + $parsedItem->getLastModified()->getTimestamp() + ); + $item->setRtl($this->determineRtl($parsedFeed)); // unescape content because angularjs helps against XSS $item->setTitle($this->decodeTwice($parsedItem->getTitle())); $item->setAuthor($this->decodeTwice($parsedItem->getAuthor())); // purification is done in the service layer - $body = $parsedItem->getContent(); + $body = $parsedItem->getDescription(); $body = mb_convert_encoding( $body, 'HTML-ENTITIES', @@ -295,14 +202,14 @@ class FeedFetcher implements IFeedFetcher ); $item->setBody($body); - $enclosureUrl = $parsedItem->getEnclosureUrl(); - if ($enclosureUrl) { - $enclosureType = $parsedItem->getEnclosureType(); - if (stripos($enclosureType, 'audio/') !== false - || stripos($enclosureType, 'video/') !== false - ) { - $item->setEnclosureMime($enclosureType); - $item->setEnclosureLink($enclosureUrl); + if ($parsedItem->hasMedia()) { + // TODO: Fix multiple media support + foreach ($parsedItem->getMedias() as $media) { + if (!$item->isSupportedMime($media->getType())) { + continue; + } + $item->setEnclosureMime($media->getType()); + $item->setEnclosureLink($media->getUrl()); } } @@ -311,39 +218,35 @@ class FeedFetcher implements IFeedFetcher return $item; } - - protected function buildFeed( - $parsedFeed, - $url, - $getFavicon, - $modified, - $etag, - $location - ) { - $feed = new Feed(); - - $link = $parsedFeed->getSiteUrl(); - - if (!$link) { - $link = $location; - } + /** + * Build a feed based on provided info + * + * @param FeedInterface $feed Feed to build from + * @param string $url URL to use + * @param bool $getFavicon To get the favicon + * @param string $location String base URL + * + * @return Feed + */ + protected function buildFeed($feed, $url, $getFavicon, $location) + { + $newFeed = new Feed(); // unescape content because angularjs helps against XSS - $title = strip_tags($this->decodeTwice($parsedFeed->getTitle())); - $feed->setTitle($title); - $feed->setUrl($url); // the url used to add the feed - $feed->setLocation($location); // the url where the feed was found - $feed->setLink($link); // attribute in the feed - $feed->setHttpLastModified($modified); - $feed->setHttpEtag($etag); - $feed->setAdded($this->time->getTime()); - - if ($getFavicon) { - $faviconFetcher = $this->faviconFactory->build(); - $favicon = $faviconFetcher->find($feed->getLink()); - $feed->setFaviconLink($favicon); + $title = strip_tags($this->decodeTwice($feed->getTitle())); + $newFeed->setTitle($title); + $newFeed->setUrl($url); // the url used to add the feed + $newFeed->setLocation($location); // the url where the feed was found + $newFeed->setLink($feed->getLink()); // attribute in the feed + $newFeed->setLastModified($feed->getLastModified()->getTimestamp()); + $newFeed->setAdded($this->time->getTime()); + + if (!$getFavicon) { + return $newFeed; } + $favicon = $this->faviconFactory->get($url); + $newFeed->setFaviconLink($favicon); - return $feed; + return $newFeed; } } diff --git a/lib/Fetcher/Fetcher.php b/lib/Fetcher/Fetcher.php index e78da0265..23f5b57f7 100644 --- a/lib/Fetcher/Fetcher.php +++ b/lib/Fetcher/Fetcher.php @@ -16,6 +16,10 @@ namespace OCA\News\Fetcher; class Fetcher { + /** + * List of fetchers. + * @var IFeedFetcher[] + */ private $fetchers; public function __construct() @@ -39,39 +43,28 @@ class Fetcher * * @param string $url remote url of the feed * @param boolean $getFavicon if the favicon should also be fetched, defaults to true - * @param string $lastModified a last modified value from an http header defaults to false. + * @param string $lastModified a last modified value from an http header defaults to false. * If lastModified matches the http header from the feed no results are fetched - * @param string $etag an etag from an http header. - * If lastModified matches the http header from the feed no results are fetched - * @param bool $fullTextEnabled if true tells the fetcher to enhance the articles by fetching more content - * @param string $basicAuthUser if given, basic auth is set for this feed - * @param string $basicAuthPassword if given, basic auth is set for this feed. Ignored if user is empty + * @param string $user if given, basic auth is set for this feed + * @param string $password if given, basic auth is set for this feed. Ignored if user is empty * - * @throws FetcherException if simple pie fails + * @throws FetcherException if FeedIO fails * @return array an array containing the new feed and its items, first * element being the Feed and second element being an array of Items */ - public function fetch( - $url, - $getFavicon = true, - $lastModified = null, - $etag = null, - $fullTextEnabled = false, - $basicAuthUser = null, - $basicAuthPassword = null - ) { + public function fetch($url, $getFavicon = true, $lastModified = null, $user = null, $password = null) + { foreach ($this->fetchers as $fetcher) { - if ($fetcher->canHandle($url)) { - return $fetcher->fetch( - $url, - $getFavicon, - $lastModified, - $etag, - $fullTextEnabled, - $basicAuthUser, - $basicAuthPassword - ); + if (!$fetcher->canHandle($url)) { + continue; } + return $fetcher->fetch( + $url, + $getFavicon, + $lastModified, + $user, + $password + ); } return [null, []]; diff --git a/lib/Fetcher/IFeedFetcher.php b/lib/Fetcher/IFeedFetcher.php index c96bd315b..d5994a076 100644 --- a/lib/Fetcher/IFeedFetcher.php +++ b/lib/Fetcher/IFeedFetcher.php @@ -23,25 +23,14 @@ interface IFeedFetcher * @param boolean $getFavicon if the favicon should also be fetched, defaults to true * @param string $lastModified a last modified value from an http header defaults to false. * If lastModified matches the http header from the feed no results are fetched - * @param string $etag an etag from an http header. - * If lastModified matches the http header from the feed no results are fetched - * @param bool $fullTextEnabled if true tells the fetcher to enhance the articles by fetching more content - * @param string $basicAuthUser if given, basic auth is set for this feed - * @param string $basicAuthPassword if given, basic auth is set for this feed. Ignored if user is empty + * @param string $user if given, basic auth is set for this feed + * @param string $password if given, basic auth is set for this feed. Ignored if user is empty * * @throws FetcherException if the fetcher encounters a problem * @return array an array containing the new feed and its items, first * element being the Feed and second element being an array of Items */ - public function fetch( - $url, - $getFavicon = true, - $lastModified = null, - $etag = null, - $fullTextEnabled = false, - $basicAuthUser = null, - $basicAuthPassword = null - ); + public function fetch($url, $getFavicon = true, $lastModified = null, $user = null, $password = null); /** * Can a fetcher handle a feed. diff --git a/lib/Fetcher/YoutubeFetcher.php b/lib/Fetcher/YoutubeFetcher.php index a47b8fdb8..9ccce4463 100644 --- a/lib/Fetcher/YoutubeFetcher.php +++ b/lib/Fetcher/YoutubeFetcher.php @@ -52,35 +52,24 @@ class YoutubeFetcher implements IFeedFetcher * @param boolean $getFavicon if the favicon should also be fetched, defaults to true * @param string $lastModified a last modified value from an http header defaults to false. * If lastModified matches the http header from the feed no results are fetched - * @param string $etag an etag from an http header. - * If lastModified matches the http header from the feed no results are fetched - * @param bool $fullTextEnabled if true tells the fetcher to enhance the articles by fetching more content - * @param string $basicAuthUser if given, basic auth is set for this feed - * @param string $basicAuthPassword if given, basic auth is set for this feed. Ignored if user is empty + * @param string $user if given, basic auth is set for this feed + * @param string $password if given, basic auth is set for this feed. Ignored if user is empty * * @throws FetcherException if it fails * @return array an array containing the new feed and its items, first * element being the Feed and second element being an array of Items */ - public function fetch( - $url, - $getFavicon = true, - $lastModified = null, - $etag = null, - $fullTextEnabled = false, - $basicAuthUser = null, - $basicAuthPassword = null - ) { + public function fetch($url, $getFavicon = true, $lastModified = null, $user = null, $password = null + ) + { $transformedUrl = $this->buildUrl($url); $result = $this->feedFetcher->fetch( $transformedUrl, $getFavicon, $lastModified, - $etag, - $fullTextEnabled, - $basicAuthUser, - $basicAuthPassword + $user, + $password ); // reset feed url so we know the correct added url for the feed -- cgit v1.2.3 From d61a57bd2dbbc6ecbddfaa22c347248210703f02 Mon Sep 17 00:00:00 2001 From: Sean Molenaar Date: Thu, 31 Jan 2019 10:23:56 +0100 Subject: Make feed failing more verbose --- lib/Fetcher/FeedFetcher.php | 34 +++++++++++++--------------------- lib/Fetcher/IFeedFetcher.php | 4 ++-- lib/Fetcher/YoutubeFetcher.php | 5 ++--- 3 files changed, 17 insertions(+), 26 deletions(-) (limited to 'lib/Fetcher') diff --git a/lib/Fetcher/FeedFetcher.php b/lib/Fetcher/FeedFetcher.php index ae338ca09..abfd0095b 100644 --- a/lib/Fetcher/FeedFetcher.php +++ b/lib/Fetcher/FeedFetcher.php @@ -18,7 +18,6 @@ use Favicon\Favicon; use FeedIo\Feed\ItemInterface; use FeedIo\FeedInterface; use FeedIo\FeedIo; -use OCP\Http\Client\IClientService; use OCP\IL10N; @@ -33,20 +32,13 @@ class FeedFetcher implements IFeedFetcher private $reader; private $l10n; private $time; - private $clientService; - - public function __construct( - FeedIo $fetcher, - Favicon $favicon, - IL10N $l10n, - Time $time, - IClientService $clientService - ) { - $this->faviconFactory = $favicon; + + public function __construct(FeedIo $fetcher, Favicon $favicon, IL10N $l10n, Time $time) + { $this->reader = $fetcher; - $this->time = $time; + $this->faviconFactory = $favicon; $this->l10n = $l10n; - $this->clientService = $clientService; + $this->time = $time; } @@ -57,7 +49,7 @@ class FeedFetcher implements IFeedFetcher * * @return bool */ - public function canHandle($url) + public function canHandle($url): bool { return true; } @@ -79,7 +71,7 @@ class FeedFetcher implements IFeedFetcher * @return array an array containing the new feed and its items, first * element being the Feed and second element being an array of Items */ - public function fetch($url, $getFavicon = true, $lastModified = null, $user = null, $password = null) + public function fetch(string $url, $getFavicon = true, $lastModified = null, $user = null, $password = null): array { if ($user !== null && trim($user) !== '') { $url = explode('://', $url); @@ -88,7 +80,7 @@ class FeedFetcher implements IFeedFetcher $resource = $this->reader->readSince($url, new DateTime($lastModified)); if (!$resource->getResponse()->isModified()) { - return [null, null]; + throw new FetcherException('Feed was not modified since last fetch'); } $location = $resource->getUrl(); @@ -115,7 +107,7 @@ class FeedFetcher implements IFeedFetcher * * @return string */ - private function decodeTwice($string) + private function decodeTwice($string): string { return html_entity_decode( html_entity_decode( @@ -135,7 +127,7 @@ class FeedFetcher implements IFeedFetcher * * @return bool */ - protected function determineRtl($parsedFeed) + protected function determineRtl(FeedInterface $parsedFeed): bool { $language = $parsedFeed->getLanguage(); @@ -166,7 +158,7 @@ class FeedFetcher implements IFeedFetcher * * @return Item */ - protected function buildItem($parsedItem, $parsedFeed) + protected function buildItem(ItemInterface $parsedItem, FeedInterface $parsedFeed): Item { $item = new Item(); $item->setUnread(true); @@ -223,12 +215,12 @@ class FeedFetcher implements IFeedFetcher * * @param FeedInterface $feed Feed to build from * @param string $url URL to use - * @param bool $getFavicon To get the favicon + * @param boolean $getFavicon To get the favicon * @param string $location String base URL * * @return Feed */ - protected function buildFeed($feed, $url, $getFavicon, $location) + protected function buildFeed(FeedInterface $feed, string $url, boolean $getFavicon, string $location): Feed { $newFeed = new Feed(); diff --git a/lib/Fetcher/IFeedFetcher.php b/lib/Fetcher/IFeedFetcher.php index d5994a076..70f153d2e 100644 --- a/lib/Fetcher/IFeedFetcher.php +++ b/lib/Fetcher/IFeedFetcher.php @@ -30,7 +30,7 @@ interface IFeedFetcher * @return array an array containing the new feed and its items, first * element being the Feed and second element being an array of Items */ - public function fetch($url, $getFavicon = true, $lastModified = null, $user = null, $password = null); + public function fetch($url, $getFavicon = true, $lastModified = null, $user = null, $password = null): array; /** * Can a fetcher handle a feed. @@ -40,5 +40,5 @@ interface IFeedFetcher * @return boolean if the fetcher can handle the url. This fetcher will be * used exclusively to fetch the feed and the items of the page */ - public function canHandle($url); + public function canHandle($url): bool; } diff --git a/lib/Fetcher/YoutubeFetcher.php b/lib/Fetcher/YoutubeFetcher.php index 9ccce4463..fd4e7d2fb 100644 --- a/lib/Fetcher/YoutubeFetcher.php +++ b/lib/Fetcher/YoutubeFetcher.php @@ -39,7 +39,7 @@ class YoutubeFetcher implements IFeedFetcher /** * This fetcher handles all the remaining urls therefore always returns true */ - public function canHandle($url) + public function canHandle($url): bool { return $this->buildUrl($url) !== $url; } @@ -59,8 +59,7 @@ class YoutubeFetcher implements IFeedFetcher * @return array an array containing the new feed and its items, first * element being the Feed and second element being an array of Items */ - public function fetch($url, $getFavicon = true, $lastModified = null, $user = null, $password = null - ) + public function fetch($url, $getFavicon = true, $lastModified = null, $user = null, $password = null): array { $transformedUrl = $this->buildUrl($url); -- cgit v1.2.3 From ee6bb6ce70b1362bde39aa8183b407555f418204 Mon Sep 17 00:00:00 2001 From: Sean Molenaar Date: Thu, 31 Jan 2019 13:40:52 +0100 Subject: Fix mismatched interface --- lib/Fetcher/FeedFetcher.php | 19 ++++--------------- lib/Fetcher/IFeedFetcher.php | 14 +++++++------- lib/Fetcher/YoutubeFetcher.php | 15 +++------------ 3 files changed, 14 insertions(+), 34 deletions(-) (limited to 'lib/Fetcher') diff --git a/lib/Fetcher/FeedFetcher.php b/lib/Fetcher/FeedFetcher.php index abfd0095b..38fcab823 100644 --- a/lib/Fetcher/FeedFetcher.php +++ b/lib/Fetcher/FeedFetcher.php @@ -58,20 +58,9 @@ class FeedFetcher implements IFeedFetcher /** * Fetch a feed from remote * - * @param string $url Remote url of the feed - * @param boolean $getFavicon If the favicon should also be fetched, - * defaults to true - * @param string $lastModified A last modified value from an http header - * defaults to false. If lastModified matches - * the header from the feed no results are fetched - * @param string $user If given, basic auth is set for this feed - * @param string $password If given, basic auth is set for this feed. - * Ignored if user is null or an empty string. - * - * @return array an array containing the new feed and its items, first - * element being the Feed and second element being an array of Items + * @inheritdoc */ - public function fetch(string $url, $getFavicon = true, $lastModified = null, $user = null, $password = null): array + public function fetch(string $url, bool $favicon, $lastModified, $user, $password): array { if ($user !== null && trim($user) !== '') { $url = explode('://', $url); @@ -88,7 +77,7 @@ class FeedFetcher implements IFeedFetcher $feed = $this->buildFeed( $parsedFeed, $url, - $getFavicon, + $favicon, $location ); @@ -220,7 +209,7 @@ class FeedFetcher implements IFeedFetcher * * @return Feed */ - protected function buildFeed(FeedInterface $feed, string $url, boolean $getFavicon, string $location): Feed + protected function buildFeed(FeedInterface $feed, string $url, bool $getFavicon, string $location): Feed { $newFeed = new Feed(); diff --git a/lib/Fetcher/IFeedFetcher.php b/lib/Fetcher/IFeedFetcher.php index 70f153d2e..ecc0ffc16 100644 --- a/lib/Fetcher/IFeedFetcher.php +++ b/lib/Fetcher/IFeedFetcher.php @@ -19,18 +19,18 @@ interface IFeedFetcher /** * Fetch feed content. * - * @param string $url remote url of the feed - * @param boolean $getFavicon if the favicon should also be fetched, defaults to true - * @param string $lastModified a last modified value from an http header defaults to false. + * @param string $url remote url of the feed + * @param boolean $favicon if the favicon should also be fetched, defaults to true + * @param string|null $lastModified a last modified value from an http header defaults to false. * If lastModified matches the http header from the feed no results are fetched - * @param string $user if given, basic auth is set for this feed - * @param string $password if given, basic auth is set for this feed. Ignored if user is empty + * @param string|null $user if given, basic auth is set for this feed + * @param string|null $password if given, basic auth is set for this feed. Ignored if user is empty * - * @throws FetcherException if the fetcher encounters a problem * @return array an array containing the new feed and its items, first * element being the Feed and second element being an array of Items + * @throws FetcherException if the fetcher encounters a problem */ - public function fetch($url, $getFavicon = true, $lastModified = null, $user = null, $password = null): array; + public function fetch(string $url, bool $favicon, $lastModified, $user, $password): array; /** * Can a fetcher handle a feed. diff --git a/lib/Fetcher/YoutubeFetcher.php b/lib/Fetcher/YoutubeFetcher.php index fd4e7d2fb..41319a36b 100644 --- a/lib/Fetcher/YoutubeFetcher.php +++ b/lib/Fetcher/YoutubeFetcher.php @@ -48,24 +48,15 @@ class YoutubeFetcher implements IFeedFetcher /** * Fetch a feed from remote * - * @param string $url remote url of the feed - * @param boolean $getFavicon if the favicon should also be fetched, defaults to true - * @param string $lastModified a last modified value from an http header defaults to false. - * If lastModified matches the http header from the feed no results are fetched - * @param string $user if given, basic auth is set for this feed - * @param string $password if given, basic auth is set for this feed. Ignored if user is empty - * - * @throws FetcherException if it fails - * @return array an array containing the new feed and its items, first - * element being the Feed and second element being an array of Items + * @inheritdoc */ - public function fetch($url, $getFavicon = true, $lastModified = null, $user = null, $password = null): array + public function fetch(string $url, bool $favicon, $lastModified, $user, $password): array { $transformedUrl = $this->buildUrl($url); $result = $this->feedFetcher->fetch( $transformedUrl, - $getFavicon, + $favicon, $lastModified, $user, $password -- cgit v1.2.3 From 7c17b2c24b1131ace6b464723978841566714f54 Mon Sep 17 00:00:00 2001 From: Sean Molenaar Date: Sat, 23 Feb 2019 16:11:28 +0100 Subject: Allow empty update time --- lib/Fetcher/FeedFetcher.php | 46 +++++++++++++++++++++++++++++++++++---------- 1 file changed, 36 insertions(+), 10 deletions(-) mode change 100644 => 100755 lib/Fetcher/FeedFetcher.php (limited to 'lib/Fetcher') diff --git a/lib/Fetcher/FeedFetcher.php b/lib/Fetcher/FeedFetcher.php old mode 100644 new mode 100755 index 38fcab823..898597460 --- a/lib/Fetcher/FeedFetcher.php +++ b/lib/Fetcher/FeedFetcher.php @@ -19,6 +19,7 @@ use FeedIo\Feed\ItemInterface; use FeedIo\FeedInterface; use FeedIo\FeedIo; +use OCA\News\Utility\PsrLogger; use OCP\IL10N; use OCA\News\Db\Item; @@ -32,13 +33,15 @@ class FeedFetcher implements IFeedFetcher private $reader; private $l10n; private $time; + private $logger; - public function __construct(FeedIo $fetcher, Favicon $favicon, IL10N $l10n, Time $time) + public function __construct(FeedIo $fetcher, Favicon $favicon, IL10N $l10n, Time $time, PsrLogger $logger) { $this->reader = $fetcher; $this->faviconFactory = $favicon; $this->l10n = $l10n; $this->time = $time; + $this->logger = $logger; } @@ -62,14 +65,24 @@ class FeedFetcher implements IFeedFetcher */ public function fetch(string $url, bool $favicon, $lastModified, $user, $password): array { - if ($user !== null && trim($user) !== '') { + if (!empty($user) && !empty(trim($user))) { $url = explode('://', $url); $url = $url[0] . '://' . $user . ':' . $password . '@' . $url[1]; } - $resource = $this->reader->readSince($url, new DateTime($lastModified)); + if (is_null($lastModified) || !is_string($lastModified)) { + $resource = $this->reader->read($url); + } else { + $resource = $this->reader->readSince($url, new DateTime($lastModified)); + } - if (!$resource->getResponse()->isModified()) { - throw new FetcherException('Feed was not modified since last fetch'); + $response = $resource->getResponse(); + if (!$response->isModified()) { + $this->logger->debug('Feed {url} was not modified since last fetch. old: {old}, new: {new}', [ + 'url' => $url, + 'old' => print_r($lastModified, true), + 'new' => print_r($response->getLastModified(), true), + ]); + return [null, []]; } $location = $resource->getUrl(); @@ -82,6 +95,7 @@ class FeedFetcher implements IFeedFetcher ); $items = []; + $this->logger->debug('Feed ' . $url . ' was modified since last fetch. #' . count($parsedFeed) . ' items'); foreach ($parsedFeed as $item) { $items[] = $this->buildItem($item, $parsedFeed); } @@ -153,26 +167,32 @@ class FeedFetcher implements IFeedFetcher $item->setUnread(true); $item->setUrl($parsedItem->getLink()); $item->setGuid($parsedItem->getPublicId()); - $item->setGuidHash($item->getGuid()); + $item->setGuidHash(md5($item->getGuid())); - $pubDT = $parsedItem->getLastModified(); + $lastmodified = $parsedItem->getLastModified() ?? new \DateTime(); if ($parsedItem->getValue('pubDate') !== null) { $pubDT = new DateTime($parsedItem->getValue('pubDate')); } elseif ($parsedItem->getValue('published') !== null) { $pubDT = new DateTime($parsedItem->getValue('published')); + } else { + $pubDT = $lastmodified; } $item->setPubDate( $pubDT->getTimestamp() ); + $item->setLastModified( - $parsedItem->getLastModified()->getTimestamp() + $lastmodified->getTimestamp() ); $item->setRtl($this->determineRtl($parsedFeed)); // unescape content because angularjs helps against XSS $item->setTitle($this->decodeTwice($parsedItem->getTitle())); - $item->setAuthor($this->decodeTwice($parsedItem->getAuthor())); + $author = $parsedItem->getAuthor(); + if (!is_null($author)) { + $item->setAuthor($this->decodeTwice($author->getName())); + } // purification is done in the service layer $body = $parsedItem->getDescription(); @@ -196,6 +216,11 @@ class FeedFetcher implements IFeedFetcher $item->generateSearchIndex(); + $this->logger->debug('Added item {title} for feed {feed} publishdate: {datetime}', [ + 'title' => $item->getTitle(), + 'feed' => $parsedFeed->getTitle(), + 'datetime' => $item->getLastModified(), + ]); return $item; } @@ -219,7 +244,8 @@ class FeedFetcher implements IFeedFetcher $newFeed->setUrl($url); // the url used to add the feed $newFeed->setLocation($location); // the url where the feed was found $newFeed->setLink($feed->getLink()); // attribute in the feed - $newFeed->setLastModified($feed->getLastModified()->getTimestamp()); + $lastmodified = $feed->getLastModified() ?? new DateTime(); + $newFeed->setLastModified($lastmodified->getTimestamp()); $newFeed->setAdded($this->time->getTime()); if (!$getFavicon) { -- cgit v1.2.3