diff options
Diffstat (limited to 'lib/Fetcher')
-rwxr-xr-x | lib/Fetcher/FeedFetcher.php | 58 | ||||
-rw-r--r-- | lib/Fetcher/Fetcher.php | 18 | ||||
-rw-r--r-- | lib/Fetcher/IFeedFetcher.php | 3 | ||||
-rw-r--r-- | lib/Fetcher/YoutubeFetcher.php | 3 |
4 files changed, 60 insertions, 22 deletions
diff --git a/lib/Fetcher/FeedFetcher.php b/lib/Fetcher/FeedFetcher.php index 1fbce123f..5a0f0e4e7 100755 --- a/lib/Fetcher/FeedFetcher.php +++ b/lib/Fetcher/FeedFetcher.php @@ -26,6 +26,7 @@ use OCP\IL10N; use OCA\News\Db\Item; use OCA\News\Db\Feed; use OCA\News\Utility\Time; +use OCA\News\Scraper\Scraper; use SimpleXMLElement; class FeedFetcher implements IFeedFetcher @@ -36,14 +37,22 @@ class FeedFetcher implements IFeedFetcher private $l10n; private $time; private $logger; - - public function __construct(FeedIo $fetcher, Favicon $favicon, IL10N $l10n, Time $time, PsrLogger $logger) - { + private $scraper; + + public function __construct( + FeedIo $fetcher, + Favicon $favicon, + IL10N $l10n, + Time $time, + PsrLogger $logger, + Scraper $scraper + ) { $this->reader = $fetcher; $this->faviconFactory = $favicon; $this->l10n = $l10n; $this->time = $time; $this->logger = $logger; + $this->scraper = $scraper; } @@ -65,7 +74,7 @@ class FeedFetcher implements IFeedFetcher * * @inheritdoc */ - public function fetch(string $url, bool $favicon, $lastModified, $user, $password): array + public function fetch(string $url, bool $favicon, $lastModified, bool $fullTextEnabled, $user, $password): array { $url2 = new Net_URL2($url); if (!empty($user) && !empty(trim($user))) { @@ -99,12 +108,32 @@ class FeedFetcher implements IFeedFetcher ); $items = []; + $RTL = $this->determineRtl($parsedFeed); + $feedName = $parsedFeed->getTitle(); $this->logger->debug('Feed {url} was modified since last fetch. #{count} items', [ 'url' => $url, 'count' => count($parsedFeed), ]); + foreach ($parsedFeed as $item) { - $items[] = $this->buildItem($item, $parsedFeed); + $body = null; + $currRTL = $RTL; + + // Scrape content if enabled + if ($fullTextEnabled) { + if ($this->scraper->scrape($item->getLink())) { + $body = $this->scraper->getContent(); + $currRTL = $this->scraper->getRTL($currRTL); + } + } + + $builtItem = $this->buildItem($item, $body, $currRTL); + $this->logger->debug('Added item {title} for feed {feed} publishdate: {datetime}', [ + 'title' => $builtItem->getTitle(), + 'feed' => $feedName, + 'datetime' => $builtItem->getLastModified(), + ]); + $items[] = $builtItem; } return [$feed, $items]; @@ -164,11 +193,12 @@ class FeedFetcher implements IFeedFetcher * Build an item based on a feed. * * @param ItemInterface $parsedItem The item to use - * @param FeedInterface $parsedFeed The feed to use + * @param string $body Text of the item, if not provided use description from $parsedItem + * @param bool $RTL True if the feed is RTL (Right-to-left) * * @return Item */ - protected function buildItem(ItemInterface $parsedItem, FeedInterface $parsedFeed): Item + protected function buildItem(ItemInterface $parsedItem, string $body = null, bool $RTL = false): Item { $item = new Item(); $item->setUnread(true); @@ -188,7 +218,7 @@ class FeedFetcher implements IFeedFetcher $item->setPubDate($pubDT->getTimestamp()); $item->setLastModified($lastmodified->getTimestamp()); - $item->setRtl($this->determineRtl($parsedFeed)); + $item->setRtl($RTL); // unescape content because angularjs helps against XSS $item->setTitle($this->decodeTwice($parsedItem->getTitle())); @@ -197,8 +227,12 @@ class FeedFetcher implements IFeedFetcher $item->setAuthor($this->decodeTwice($author->getName())); } + // Use description from feed if body is not provided (by a scraper) + if ($body === null) { + $body = $parsedItem->getValue("content:encoded") ?? $parsedItem->getDescription(); + } + // purification is done in the service layer - $body = $parsedItem->getValue("content:encoded") ?? $parsedItem->getDescription(); $body = mb_convert_encoding( $body, 'HTML-ENTITIES', @@ -231,12 +265,6 @@ class FeedFetcher implements IFeedFetcher } $item->generateSearchIndex(); - - $this->logger->debug('Added item {title} for feed {feed} publishdate: {datetime}', [ - 'title' => $item->getTitle(), - 'feed' => $parsedFeed->getTitle(), - 'datetime' => $item->getLastModified(), - ]); return $item; } diff --git a/lib/Fetcher/Fetcher.php b/lib/Fetcher/Fetcher.php index 425004680..f52141dda 100644 --- a/lib/Fetcher/Fetcher.php +++ b/lib/Fetcher/Fetcher.php @@ -45,17 +45,24 @@ class Fetcher * * @param string $url remote url of the feed * @param boolean $getFavicon if the favicon should also be fetched, defaults to true - * @param string $lastModified a last modified value from an http header defaults to false. + * @param string $lastModified a last modified value from an http header defaults to false. * If lastModified matches the http header from the feed no results are fetched - * @param string $user if given, basic auth is set for this feed - * @param string $password if given, basic auth is set for this feed. Ignored if user is empty + * @param bool $fullTextEnabled If true use a scraper to download the full article + * @param string $user if given, basic auth is set for this feed + * @param string $password if given, basic auth is set for this feed. Ignored if user is empty * * @throws ReadErrorException if FeedIO fails * @return array an array containing the new feed and its items, first * element being the Feed and second element being an array of Items */ - public function fetch($url, $getFavicon = true, $lastModified = null, $user = null, $password = null) - { + public function fetch( + $url, + $getFavicon = true, + $lastModified = null, + $fullTextEnabled = false, + $user = null, + $password = null + ) { foreach ($this->fetchers as $fetcher) { if (!$fetcher->canHandle($url)) { continue; @@ -64,6 +71,7 @@ class Fetcher $url, $getFavicon, $lastModified, + $fullTextEnabled, $user, $password ); diff --git a/lib/Fetcher/IFeedFetcher.php b/lib/Fetcher/IFeedFetcher.php index 81bf8526f..6500c2690 100644 --- a/lib/Fetcher/IFeedFetcher.php +++ b/lib/Fetcher/IFeedFetcher.php @@ -25,6 +25,7 @@ interface IFeedFetcher * @param boolean $favicon if the favicon should also be fetched, defaults to true * @param string|null $lastModified a last modified value from an http header defaults to false. * If lastModified matches the http header from the feed no results are fetched + * @param bool $fullTextEnabled If true use a scraper to download the full article * @param string|null $user if given, basic auth is set for this feed * @param string|null $password if given, basic auth is set for this feed. Ignored if user is empty * @@ -32,7 +33,7 @@ interface IFeedFetcher * element being the Feed and second element being an array of Items * @throws ReadErrorException if the Feed-IO fetcher encounters a problem */ - public function fetch(string $url, bool $favicon, $lastModified, $user, $password): array; + public function fetch(string $url, bool $favicon, $lastModified, bool $fullTextEnabled, $user, $password): array; /** * Can a fetcher handle a feed. diff --git a/lib/Fetcher/YoutubeFetcher.php b/lib/Fetcher/YoutubeFetcher.php index 41319a36b..56adaae03 100644 --- a/lib/Fetcher/YoutubeFetcher.php +++ b/lib/Fetcher/YoutubeFetcher.php @@ -50,7 +50,7 @@ class YoutubeFetcher implements IFeedFetcher * * @inheritdoc */ - public function fetch(string $url, bool $favicon, $lastModified, $user, $password): array + public function fetch(string $url, bool $favicon, $lastModified, bool $fullTextEnabled, $user, $password): array { $transformedUrl = $this->buildUrl($url); @@ -58,6 +58,7 @@ class YoutubeFetcher implements IFeedFetcher $transformedUrl, $favicon, $lastModified, + $fullTextEnabled, $user, $password ); |