summary | refs | log | tree | commit | diff | stats
path: root/lib/Fetcher/FeedFetcher.php
diff options
context:
space:
mode:
author: DriverXX <DriverXX@users.noreply.github.com> 2019-12-24 09:33:19 +0100
committer: Benjamin Brahmer <info@b-brahmer.de> 2019-12-24 09:33:19 +0100
commit: 6673cbc3d940745a0ecddb93b32805a0fbe79eb1 (patch)
tree: 72d50a36c4635311a684375106652d88d019575e /lib/Fetcher/FeedFetcher.php
parent: c2f617dd400681b67927781a73a735600803d9ae (diff)
Reimplement full-text scraping (#563)
Add readability.php scraper
Fixes #482
Signed-off-by: Gioele Falcetti <thegio.f@gmail.com>
Diffstat (limited to 'lib/Fetcher/FeedFetcher.php')
-rwxr-xr-x lib/Fetcher/FeedFetcher.php 58
1 files changed, 43 insertions, 15 deletions
diff --git a/lib/Fetcher/FeedFetcher.php b/lib/Fetcher/FeedFetcher.php
index 1fbce123f..5a0f0e4e7 100755
--- a/lib/Fetcher/FeedFetcher.php
+++ b/lib/Fetcher/FeedFetcher.php
@@ -26,6 +26,7 @@ use OCP\IL10N;
use OCA\News\Db\Item;
use OCA\News\Db\Feed;
use OCA\News\Utility\Time;
+use OCA\News\Scraper\Scraper;
use SimpleXMLElement;
class FeedFetcher implements IFeedFetcher
@@ -36,14 +37,22 @@ class FeedFetcher implements IFeedFetcher
private $l10n;
private $time;
private $logger;
-
- public function __construct(FeedIo $fetcher, Favicon $favicon, IL10N $l10n, Time $time, PsrLogger $logger)
- {
+ private $scraper;
+
+ public function __construct(
+ FeedIo $fetcher,
+ Favicon $favicon,
+ IL10N $l10n,
+ Time $time,
+ PsrLogger $logger,
+ Scraper $scraper
+ ) {
$this->reader = $fetcher;
$this->faviconFactory = $favicon;
$this->l10n = $l10n;
$this->time = $time;
$this->logger = $logger;
+ $this->scraper = $scraper;
}
@@ -65,7 +74,7 @@ class FeedFetcher implements IFeedFetcher
*
* @inheritdoc
*/
- public function fetch(string $url, bool $favicon, $lastModified, $user, $password): array
+ public function fetch(string $url, bool $favicon, $lastModified, bool $fullTextEnabled, $user, $password): array
{
$url2 = new Net_URL2($url);
if (!empty($user) && !empty(trim($user))) {
@@ -99,12 +108,32 @@ class FeedFetcher implements IFeedFetcher
);
$items = [];
+ $RTL = $this->determineRtl($parsedFeed);
+ $feedName = $parsedFeed->getTitle();
$this->logger->debug('Feed {url} was modified since last fetch. #{count} items', [
'url' => $url,
'count' => count($parsedFeed),
]);
+
foreach ($parsedFeed as $item) {
- $items[] = $this->buildItem($item, $parsedFeed);
+ $body = null;
+ $currRTL = $RTL;
+
+ // Scrape content if enabled
+ if ($fullTextEnabled) {
+ if ($this->scraper->scrape($item->getLink())) {
+ $body = $this->scraper->getContent();
+ $currRTL = $this->scraper->getRTL($currRTL);
+ }
+ }
+
+ $builtItem = $this->buildItem($item, $body, $currRTL);
+ $this->logger->debug('Added item {title} for feed {feed} publishdate: {datetime}', [
+ 'title' => $builtItem->getTitle(),
+ 'feed' => $feedName,
+ 'datetime' => $builtItem->getLastModified(),
+ ]);
+ $items[] = $builtItem;
}
return [$feed, $items];
@@ -164,11 +193,12 @@ class FeedFetcher implements IFeedFetcher
* Build an item based on a feed.
*
* @param ItemInterface $parsedItem The item to use
- * @param FeedInterface $parsedFeed The feed to use
+ * @param string $body Text of the item, if not provided use description from $parsedItem
+ * @param bool $RTL True if the feed is RTL (Right-to-left)
*
* @return Item
*/
- protected function buildItem(ItemInterface $parsedItem, FeedInterface $parsedFeed): Item
+ protected function buildItem(ItemInterface $parsedItem, string $body = null, bool $RTL = false): Item
{
$item = new Item();
$item->setUnread(true);
@@ -188,7 +218,7 @@ class FeedFetcher implements IFeedFetcher
$item->setPubDate($pubDT->getTimestamp());
$item->setLastModified($lastmodified->getTimestamp());
- $item->setRtl($this->determineRtl($parsedFeed));
+ $item->setRtl($RTL);
// unescape content because angularjs helps against XSS
$item->setTitle($this->decodeTwice($parsedItem->getTitle()));
@@ -197,8 +227,12 @@ class FeedFetcher implements IFeedFetcher
$item->setAuthor($this->decodeTwice($author->getName()));
}
+ // Use description from feed if body is not provided (by a scraper)
+ if ($body === null) {
+ $body = $parsedItem->getValue("content:encoded") ?? $parsedItem->getDescription();
+ }
+
// purification is done in the service layer
- $body = $parsedItem->getValue("content:encoded") ?? $parsedItem->getDescription();
$body = mb_convert_encoding(
$body,
'HTML-ENTITIES',
@@ -231,12 +265,6 @@ class FeedFetcher implements IFeedFetcher
}
$item->generateSearchIndex();
-
- $this->logger->debug('Added item {title} for feed {feed} publishdate: {datetime}', [
- 'title' => $item->getTitle(),
- 'feed' => $parsedFeed->getTitle(),
- 'datetime' => $item->getLastModified(),
- ]);
return $item;
}