From a3246a927de542e1b3ab403359bfd3c08705b6a7 Mon Sep 17 00:00:00 2001 From: Sean Molenaar Date: Wed, 30 Jan 2019 20:36:40 +0100 Subject: Parser: Switch to feedIO for parsing instead of picoFeed --- lib/Fetcher/FeedFetcher.php | 377 +++++++++++++++-------------------------- lib/Fetcher/Fetcher.php | 45 +++-- lib/Fetcher/IFeedFetcher.php | 17 +- lib/Fetcher/YoutubeFetcher.php | 25 +-- 4 files changed, 169 insertions(+), 295 deletions(-) (limited to 'lib/Fetcher') diff --git a/lib/Fetcher/FeedFetcher.php b/lib/Fetcher/FeedFetcher.php index 65a4b5526..ae338ca09 100644 --- a/lib/Fetcher/FeedFetcher.php +++ b/lib/Fetcher/FeedFetcher.php @@ -13,29 +13,17 @@ namespace OCA\News\Fetcher; -use Exception; - -use OCA\News\PostProcessor\LWNProcessor; +use DateTime; +use Favicon\Favicon; +use FeedIo\Feed\ItemInterface; +use FeedIo\FeedInterface; +use FeedIo\FeedIo; use OCP\Http\Client\IClientService; -use PicoFeed\Parser\MalFormedXmlException; -use PicoFeed\Reader\Reader; -use PicoFeed\Parser\Parser; -use PicoFeed\Reader\SubscriptionNotFoundException; -use PicoFeed\Reader\UnsupportedFeedFormatException; -use PicoFeed\Client\InvalidCertificateException; -use PicoFeed\Client\InvalidUrlException; -use PicoFeed\Client\MaxRedirectException; -use PicoFeed\Client\MaxSizeException; -use PicoFeed\Client\TimeoutException; -use PicoFeed\Client\ForbiddenException; -use PicoFeed\Client\UnauthorizedException; use OCP\IL10N; use OCA\News\Db\Item; use OCA\News\Db\Feed; -use OCA\News\Utility\PicoFeedFaviconFactory; -use OCA\News\Utility\PicoFeedReaderFactory; use OCA\News\Utility\Time; class FeedFetcher implements IFeedFetcher @@ -48,22 +36,26 @@ class FeedFetcher implements IFeedFetcher private $clientService; public function __construct( - Reader $reader, - PicoFeedFaviconFactory $faviconFactory, + FeedIo $fetcher, + Favicon $favicon, IL10N $l10n, Time $time, IClientService $clientService ) { - $this->faviconFactory = $faviconFactory; - $this->reader = $reader; - $this->time = $time; - $this->l10n = $l10n; - $this->clientService = $clientService; + $this->faviconFactory = $favicon; + $this->reader = $fetcher; + $this->time = $time; + $this->l10n = $l10n; + $this->clientService = $clientService; } /** - * This fetcher handles all the remaining urls therefore always returns true + * This fetcher handles all the remaining urls therefore always returns true. + * + * @param string $url The URL to check + * + * @return bool */ public function canHandle($url) { @@ -74,176 +66,55 @@ class FeedFetcher implements IFeedFetcher /** * Fetch a feed from remote * - * @param string $url remote url of the feed - * @param boolean $getFavicon if the favicon should also be fetched, defaults to true - * @param string $lastModified a last modified value from an http header defaults to false. - * If lastModified matches the http header from the feed no results are fetched - * @param string $etag an etag from an http header. - * If lastModified matches the http header from the feed no results are fetched - * @param bool $fullTextEnabled if true tells the fetcher to enhance the articles by fetching more content - * @param string $basicAuthUser if given, basic auth is set for this feed - * @param string $basicAuthPassword if given, basic auth is set for this feed. Ignored if user is empty + * @param string $url Remote url of the feed + * @param boolean $getFavicon If the favicon should also be fetched, + * defaults to true + * @param string $lastModified A last modified value from an http header + * defaults to false. If lastModified matches + * the header from the feed no results are fetched + * @param string $user If given, basic auth is set for this feed + * @param string $password If given, basic auth is set for this feed. + * Ignored if user is null or an empty string. * - * @throws FetcherException if it fails * @return array an array containing the new feed and its items, first * element being the Feed and second element being an array of Items */ - public function fetch( - $url, - $getFavicon = true, - $lastModified = null, - $etag = null, - $fullTextEnabled = false, - $basicAuthUser = null, - $basicAuthPassword = null - ) { - try { - if ($basicAuthUser !== null && trim($basicAuthUser) !== '') { - $resource = $this->reader->discover( - $url, - $lastModified, - $etag, - $basicAuthUser, - $basicAuthPassword - ); - } else { - $resource = $this->reader->discover($url, $lastModified, $etag); - } - - if (!$resource->isModified()) { - return [null, null]; - } - - $location = $resource->getUrl(); - $etag = $resource->getEtag(); - $content = $resource->getContent(); - $encoding = $resource->getEncoding(); - $lastModified = $resource->getLastModified(); - - $parser = $this->reader->getParser($location, $content, $encoding); - - if ($fullTextEnabled) { - $parser->enableContentGrabber(); - $parser->getItemPostProcessor()->register( - new LWNProcessor( - $basicAuthUser, - $basicAuthPassword, - $this->clientService - ) - ); - } - - $parsedFeed = $parser->execute(); - - $feed = $this->buildFeed( - $parsedFeed, - $url, - $getFavicon, - $lastModified, - $etag, - $location - ); - - $items = []; - foreach ($parsedFeed->getItems() as $item) { - $items[] = $this->buildItem($item, $parsedFeed); - } - - return [$feed, $items]; - } catch (Exception $ex) { - $this->handleError($ex, $url); - } - } - - - private function handleError(Exception $ex, $url) + public function fetch($url, $getFavicon = true, $lastModified = null, $user = null, $password = null) { - $msg = $ex->getMessage(); + if ($user !== null && trim($user) !== '') { + $url = explode('://', $url); + $url = $url[0] . '://' . $user . ':' . $password . '@' . $url[1]; + } + $resource = $this->reader->readSince($url, new DateTime($lastModified)); - if ($ex instanceof MalFormedXmlException) { - $msg = $this->l10n->t('Feed contains invalid XML'); - } elseif ($ex instanceof SubscriptionNotFoundException) { - $msg = $this->l10n->t( - 'Feed not found: Either the website ' . - 'does not provide a feed or blocks access. To rule out ' . - 'blocking, try to download the feed on your server\'s ' . - 'command line using curl: curl ' . $url - ); - } elseif ($ex instanceof UnsupportedFeedFormatException) { - $msg = $this->l10n->t('Detected feed format is not supported'); - } elseif ($ex instanceof InvalidCertificateException) { - $msg = $this->buildCurlSslErrorMessage($ex->getCode()); - } elseif ($ex instanceof InvalidUrlException) { - $msg = $this->l10n->t('Website not found'); - } elseif ($ex instanceof MaxRedirectException) { - $msg = $this->l10n->t('More redirects than allowed, aborting'); - } elseif ($ex instanceof MaxSizeException) { - $msg = $this->l10n->t('Bigger than maximum allowed size'); - } elseif ($ex instanceof TimeoutException) { - $msg = $this->l10n->t('Request timed out'); - } elseif ($ex instanceof UnauthorizedException) { - $msg = $this->l10n->t( - 'Required credentials for feed were ' . - 'either missing or incorrect' - ); - } elseif ($ex instanceof ForbiddenException) { - $msg = $this->l10n->t('Forbidden to access feed'); + if (!$resource->getResponse()->isModified()) { + return [null, null]; } - throw new FetcherException($msg); - } + $location = $resource->getUrl(); + $parsedFeed = $resource->getFeed(); + $feed = $this->buildFeed( + $parsedFeed, + $url, + $getFavicon, + $location + ); - private function buildCurlSslErrorMessage($errorCode) - { - switch ($errorCode) { - case 35: // CURLE_SSL_CONNECT_ERROR - return $this->l10n->t( - 'Certificate error: A problem occurred ' . - 'somewhere in the SSL/TLS handshake. Could be ' . - 'certificates (file formats, paths, permissions), ' . - 'passwords, and others.' - ); - case 51: // CURLE_PEER_FAILED_VERIFICATION - return $this->l10n->t( - 'Certificate error: The remote server\'s SSL ' . - 'certificate or SSH md5 fingerprint was deemed not OK.' - ); - case 58: // CURLE_SSL_CERTPROBLEM - return $this->l10n->t( - 'Certificate error: Problem with the local client ' . - 'certificate.' - ); - case 59: // CURLE_SSL_CIPHER - return $this->l10n->t( - 'Certificate error: Couldn\'t use specified cipher.' - ); - case 60: // CURLE_SSL_CACERT - return $this->l10n->t( - 'Certificate error: Peer certificate cannot be ' . - 'authenticated with known CA certificates.' - ); - case 64: // CURLE_USE_SSL_FAILED - return $this->l10n->t( - 'Certificate error: Requested FTP SSL level failed.' - ); - case 66: // CURLE_SSL_ENGINE_INITFAILED - return $this->l10n->t( - 'Certificate error: Initiating the SSL engine failed.' - ); - case 77: // CURLE_SSL_CACERT_BADFILE - return $this->l10n->t( - 'Certificate error: Problem with reading the SSL CA ' . - 'cert (path? access rights?)' - ); - case 83: // CURLE_SSL_ISSUER_ERROR - return $this->l10n->t( - 'Certificate error: Issuer check failed' - ); - default: - return $this->l10n->t('Unknown SSL certificate error!'); + $items = []; + foreach ($parsedFeed as $item) { + $items[] = $this->buildItem($item, $parsedFeed); } + + return [$feed, $items]; } + /** + * Decode the string twice + * + * @param string $string String to decode + * + * @return string + */ private function decodeTwice($string) { return html_entity_decode( @@ -257,37 +128,73 @@ class FeedFetcher implements IFeedFetcher ); } - - protected function determineRtl($parsedItem, $parsedFeed) + /** + * Check if a feed is RTL or not + * + * @param FeedInterface $parsedFeed The feed that was parsed + * + * @return bool + */ + protected function determineRtl($parsedFeed) { - $itemLang = $parsedItem->getLanguage(); - $feedLang = $parsedFeed->getLanguage(); - - if ($itemLang) { - return Parser::isLanguageRTL($itemLang); - } else { - return Parser::isLanguageRTL($feedLang); + $language = $parsedFeed->getLanguage(); + + $language = strtolower($language); + $rtl_languages = array( + 'ar', // Arabic (ar-**) + 'fa', // Farsi (fa-**) + 'ur', // Urdu (ur-**) + 'ps', // Pashtu (ps-**) + 'syr', // Syriac (syr-**) + 'dv', // Divehi (dv-**) + 'he', // Hebrew (he-**) + 'yi', // Yiddish (yi-**) + ); + foreach ($rtl_languages as $prefix) { + if (strpos($language, $prefix) === 0) { + return true; + } } + return false; } - + /** + * Build an item based on a feed. + * + * @param ItemInterface $parsedItem The item to use + * @param FeedInterface $parsedFeed The feed to use + * + * @return Item + */ protected function buildItem($parsedItem, $parsedFeed) { $item = new Item(); $item->setUnread(true); - $item->setUrl($parsedItem->getUrl()); - $item->setGuid($parsedItem->getId()); + $item->setUrl($parsedItem->getLink()); + $item->setGuid($parsedItem->getPublicId()); $item->setGuidHash($item->getGuid()); - $item->setPubDate($parsedItem->getPublishedDate()->getTimestamp()); - $item->setUpdatedDate($parsedItem->getUpdatedDate()->getTimestamp()); - $item->setRtl($this->determineRtl($parsedItem, $parsedFeed)); + + $pubDT = $parsedItem->getLastModified(); + if ($parsedItem->getValue('pubDate') !== null) { + $pubDT = new DateTime($parsedItem->getValue('pubDate')); + } elseif ($parsedItem->getValue('published') !== null) { + $pubDT = new DateTime($parsedItem->getValue('published')); + } + + $item->setPubDate( + $pubDT->getTimestamp() + ); + $item->setLastModified( + $parsedItem->getLastModified()->getTimestamp() + ); + $item->setRtl($this->determineRtl($parsedFeed)); // unescape content because angularjs helps against XSS $item->setTitle($this->decodeTwice($parsedItem->getTitle())); $item->setAuthor($this->decodeTwice($parsedItem->getAuthor())); // purification is done in the service layer - $body = $parsedItem->getContent(); + $body = $parsedItem->getDescription(); $body = mb_convert_encoding( $body, 'HTML-ENTITIES', @@ -295,14 +202,14 @@ class FeedFetcher implements IFeedFetcher ); $item->setBody($body); - $enclosureUrl = $parsedItem->getEnclosureUrl(); - if ($enclosureUrl) { - $enclosureType = $parsedItem->getEnclosureType(); - if (stripos($enclosureType, 'audio/') !== false - || stripos($enclosureType, 'video/') !== false - ) { - $item->setEnclosureMime($enclosureType); - $item->setEnclosureLink($enclosureUrl); + if ($parsedItem->hasMedia()) { + // TODO: Fix multiple media support + foreach ($parsedItem->getMedias() as $media) { + if (!$item->isSupportedMime($media->getType())) { + continue; + } + $item->setEnclosureMime($media->getType()); + $item->setEnclosureLink($media->getUrl()); } } @@ -311,39 +218,35 @@ class FeedFetcher implements IFeedFetcher return $item; } - - protected function buildFeed( - $parsedFeed, - $url, - $getFavicon, - $modified, - $etag, - $location - ) { - $feed = new Feed(); - - $link = $parsedFeed->getSiteUrl(); - - if (!$link) { - $link = $location; - } + /** + * Build a feed based on provided info + * + * @param FeedInterface $feed Feed to build from + * @param string $url URL to use + * @param bool $getFavicon To get the favicon + * @param string $location String base URL + * + * @return Feed + */ + protected function buildFeed($feed, $url, $getFavicon, $location) + { + $newFeed = new Feed(); // unescape content because angularjs helps against XSS - $title = strip_tags($this->decodeTwice($parsedFeed->getTitle())); - $feed->setTitle($title); - $feed->setUrl($url); // the url used to add the feed - $feed->setLocation($location); // the url where the feed was found - $feed->setLink($link); // attribute in the feed - $feed->setHttpLastModified($modified); - $feed->setHttpEtag($etag); - $feed->setAdded($this->time->getTime()); - - if ($getFavicon) { - $faviconFetcher = $this->faviconFactory->build(); - $favicon = $faviconFetcher->find($feed->getLink()); - $feed->setFaviconLink($favicon); + $title = strip_tags($this->decodeTwice($feed->getTitle())); + $newFeed->setTitle($title); + $newFeed->setUrl($url); // the url used to add the feed + $newFeed->setLocation($location); // the url where the feed was found + $newFeed->setLink($feed->getLink()); // attribute in the feed + $newFeed->setLastModified($feed->getLastModified()->getTimestamp()); + $newFeed->setAdded($this->time->getTime()); + + if (!$getFavicon) { + return $newFeed; } + $favicon = $this->faviconFactory->get($url); + $newFeed->setFaviconLink($favicon); - return $feed; + return $newFeed; } } diff --git a/lib/Fetcher/Fetcher.php b/lib/Fetcher/Fetcher.php index e78da0265..23f5b57f7 100644 --- a/lib/Fetcher/Fetcher.php +++ b/lib/Fetcher/Fetcher.php @@ -16,6 +16,10 @@ namespace OCA\News\Fetcher; class Fetcher { + /** + * List of fetchers. + * @var IFeedFetcher[] + */ private $fetchers; public function __construct() @@ -39,39 +43,28 @@ class Fetcher * * @param string $url remote url of the feed * @param boolean $getFavicon if the favicon should also be fetched, defaults to true - * @param string $lastModified a last modified value from an http header defaults to false. + * @param string $lastModified a last modified value from an http header defaults to false. * If lastModified matches the http header from the feed no results are fetched - * @param string $etag an etag from an http header. - * If lastModified matches the http header from the feed no results are fetched - * @param bool $fullTextEnabled if true tells the fetcher to enhance the articles by fetching more content - * @param string $basicAuthUser if given, basic auth is set for this feed - * @param string $basicAuthPassword if given, basic auth is set for this feed. Ignored if user is empty + * @param string $user if given, basic auth is set for this feed + * @param string $password if given, basic auth is set for this feed. Ignored if user is empty * - * @throws FetcherException if simple pie fails + * @throws FetcherException if FeedIO fails * @return array an array containing the new feed and its items, first * element being the Feed and second element being an array of Items */ - public function fetch( - $url, - $getFavicon = true, - $lastModified = null, - $etag = null, - $fullTextEnabled = false, - $basicAuthUser = null, - $basicAuthPassword = null - ) { + public function fetch($url, $getFavicon = true, $lastModified = null, $user = null, $password = null) + { foreach ($this->fetchers as $fetcher) { - if ($fetcher->canHandle($url)) { - return $fetcher->fetch( - $url, - $getFavicon, - $lastModified, - $etag, - $fullTextEnabled, - $basicAuthUser, - $basicAuthPassword - ); + if (!$fetcher->canHandle($url)) { + continue; } + return $fetcher->fetch( + $url, + $getFavicon, + $lastModified, + $user, + $password + ); } return [null, []]; diff --git a/lib/Fetcher/IFeedFetcher.php b/lib/Fetcher/IFeedFetcher.php index c96bd315b..d5994a076 100644 --- a/lib/Fetcher/IFeedFetcher.php +++ b/lib/Fetcher/IFeedFetcher.php @@ -23,25 +23,14 @@ interface IFeedFetcher * @param boolean $getFavicon if the favicon should also be fetched, defaults to true * @param string $lastModified a last modified value from an http header defaults to false. * If lastModified matches the http header from the feed no results are fetched - * @param string $etag an etag from an http header. - * If lastModified matches the http header from the feed no results are fetched - * @param bool $fullTextEnabled if true tells the fetcher to enhance the articles by fetching more content - * @param string $basicAuthUser if given, basic auth is set for this feed - * @param string $basicAuthPassword if given, basic auth is set for this feed. Ignored if user is empty + * @param string $user if given, basic auth is set for this feed + * @param string $password if given, basic auth is set for this feed. Ignored if user is empty * * @throws FetcherException if the fetcher encounters a problem * @return array an array containing the new feed and its items, first * element being the Feed and second element being an array of Items */ - public function fetch( - $url, - $getFavicon = true, - $lastModified = null, - $etag = null, - $fullTextEnabled = false, - $basicAuthUser = null, - $basicAuthPassword = null - ); + public function fetch($url, $getFavicon = true, $lastModified = null, $user = null, $password = null); /** * Can a fetcher handle a feed. diff --git a/lib/Fetcher/YoutubeFetcher.php b/lib/Fetcher/YoutubeFetcher.php index a47b8fdb8..9ccce4463 100644 --- a/lib/Fetcher/YoutubeFetcher.php +++ b/lib/Fetcher/YoutubeFetcher.php @@ -52,35 +52,24 @@ class YoutubeFetcher implements IFeedFetcher * @param boolean $getFavicon if the favicon should also be fetched, defaults to true * @param string $lastModified a last modified value from an http header defaults to false. * If lastModified matches the http header from the feed no results are fetched - * @param string $etag an etag from an http header. - * If lastModified matches the http header from the feed no results are fetched - * @param bool $fullTextEnabled if true tells the fetcher to enhance the articles by fetching more content - * @param string $basicAuthUser if given, basic auth is set for this feed - * @param string $basicAuthPassword if given, basic auth is set for this feed. Ignored if user is empty + * @param string $user if given, basic auth is set for this feed + * @param string $password if given, basic auth is set for this feed. Ignored if user is empty * * @throws FetcherException if it fails * @return array an array containing the new feed and its items, first * element being the Feed and second element being an array of Items */ - public function fetch( - $url, - $getFavicon = true, - $lastModified = null, - $etag = null, - $fullTextEnabled = false, - $basicAuthUser = null, - $basicAuthPassword = null - ) { + public function fetch($url, $getFavicon = true, $lastModified = null, $user = null, $password = null + ) + { $transformedUrl = $this->buildUrl($url); $result = $this->feedFetcher->fetch( $transformedUrl, $getFavicon, $lastModified, - $etag, - $fullTextEnabled, - $basicAuthUser, - $basicAuthPassword + $user, + $password ); // reset feed url so we know the correct added url for the feed -- cgit v1.2.3