summaryrefslogtreecommitdiffstats
path: root/lib
diff options
context:
space:
mode:
authorDriverXX <DriverXX@users.noreply.github.com>2019-12-24 09:33:19 +0100
committerBenjamin Brahmer <info@b-brahmer.de>2019-12-24 09:33:19 +0100
commit6673cbc3d940745a0ecddb93b32805a0fbe79eb1 (patch)
tree72d50a36c4635311a684375106652d88d019575e /lib
parentc2f617dd400681b67927781a73a735600803d9ae (diff)
Reimplement full-text scraping (#563)
Add readability.php scraper Fixes #482 Signed-off-by: Gioele Falcetti <thegio.f@gmail.com>
Diffstat (limited to 'lib')
-rw-r--r--lib/AppInfo/Application.php10
-rw-r--r--lib/Command/ShowFeed.php6
-rwxr-xr-xlib/Fetcher/FeedFetcher.php58
-rw-r--r--lib/Fetcher/Fetcher.php18
-rw-r--r--lib/Fetcher/IFeedFetcher.php3
-rw-r--r--lib/Fetcher/YoutubeFetcher.php3
-rw-r--r--lib/Scraper/IScraper.php43
-rw-r--r--lib/Scraper/Scraper.php106
-rw-r--r--lib/Service/FeedService.php3
9 files changed, 225 insertions, 25 deletions
diff --git a/lib/AppInfo/Application.php b/lib/AppInfo/Application.php
index abacdfc3a..faafa471c 100644
--- a/lib/AppInfo/Application.php
+++ b/lib/AppInfo/Application.php
@@ -41,6 +41,7 @@ use OCA\News\Fetcher\FeedFetcher;
use OCA\News\Fetcher\Fetcher;
use OCA\News\Fetcher\YoutubeFetcher;
use OCA\News\Utility\ProxyConfigParser;
+use OCA\News\Scraper\Scraper;
/**
* Class Application
@@ -193,5 +194,14 @@ class Application extends App
$fetcher->registerFetcher($c->query(FeedFetcher::class));
return $fetcher;
});
+
+ /**
+ * Scrapers
+ */
+ $container->registerService(Scraper::class, function (IContainer $c): Scraper {
+ return new Scraper(
+ $c->query(PsrLogger::class)
+ );
+ });
}
}
diff --git a/lib/Command/ShowFeed.php b/lib/Command/ShowFeed.php
index bbe1913fa..572b68e26 100644
--- a/lib/Command/ShowFeed.php
+++ b/lib/Command/ShowFeed.php
@@ -48,7 +48,8 @@ class ShowFeed extends Command
->setDescription('Prints a JSON string which represents the given feed as it would be in the DB.')
->addArgument('feed', InputArgument::REQUIRED, 'Feed to parse')
->addOption('user', 'u', InputOption::VALUE_OPTIONAL, 'Username for the feed')
- ->addOption('password', 'p', InputOption::VALUE_OPTIONAL, 'Password for the feed');
+ ->addOption('password', 'p', InputOption::VALUE_OPTIONAL, 'Password for the feed')
+ ->addOption('full-text', 'f', InputOption::VALUE_NONE, 'Usa a scraper to get full text');
}
protected function execute(InputInterface $input, OutputInterface $output)
@@ -56,9 +57,10 @@ class ShowFeed extends Command
$url = $input->getArgument('feed');
$user = $input->getOption('user');
$password = $input->getOption('password');
+ $fullTextEnabled = (bool) $input->getOption('full-text');
try {
- list($feed, $items) = $this->feedFetcher->fetch($url, true, null, $user, $password);
+ list($feed, $items) = $this->feedFetcher->fetch($url, true, null, $fullTextEnabled, $user, $password);
$output->writeln("Feed: " . json_encode($feed, JSON_PRETTY_PRINT));
$output->writeln("Items: " . json_encode($items, JSON_PRETTY_PRINT));
} catch (\Throwable $ex) {
diff --git a/lib/Fetcher/FeedFetcher.php b/lib/Fetcher/FeedFetcher.php
index 1fbce123f..5a0f0e4e7 100755
--- a/lib/Fetcher/FeedFetcher.php
+++ b/lib/Fetcher/FeedFetcher.php
@@ -26,6 +26,7 @@ use OCP\IL10N;
use OCA\News\Db\Item;
use OCA\News\Db\Feed;
use OCA\News\Utility\Time;
+use OCA\News\Scraper\Scraper;
use SimpleXMLElement;
class FeedFetcher implements IFeedFetcher
@@ -36,14 +37,22 @@ class FeedFetcher implements IFeedFetcher
private $l10n;
private $time;
private $logger;
-
- public function __construct(FeedIo $fetcher, Favicon $favicon, IL10N $l10n, Time $time, PsrLogger $logger)
- {
+ private $scraper;
+
+ public function __construct(
+ FeedIo $fetcher,
+ Favicon $favicon,
+ IL10N $l10n,
+ Time $time,
+ PsrLogger $logger,
+ Scraper $scraper
+ ) {
$this->reader = $fetcher;
$this->faviconFactory = $favicon;
$this->l10n = $l10n;
$this->time = $time;
$this->logger = $logger;
+ $this->scraper = $scraper;
}
@@ -65,7 +74,7 @@ class FeedFetcher implements IFeedFetcher
*
* @inheritdoc
*/
- public function fetch(string $url, bool $favicon, $lastModified, $user, $password): array
+ public function fetch(string $url, bool $favicon, $lastModified, bool $fullTextEnabled, $user, $password): array
{
$url2 = new Net_URL2($url);
if (!empty($user) && !empty(trim($user))) {
@@ -99,12 +108,32 @@ class FeedFetcher implements IFeedFetcher
);
$items = [];
+ $RTL = $this->determineRtl($parsedFeed);
+ $feedName = $parsedFeed->getTitle();
$this->logger->debug('Feed {url} was modified since last fetch. #{count} items', [
'url' => $url,
'count' => count($parsedFeed),
]);
+
foreach ($parsedFeed as $item) {
- $items[] = $this->buildItem($item, $parsedFeed);
+ $body = null;
+ $currRTL = $RTL;
+
+ // Scrape content if enabled
+ if ($fullTextEnabled) {
+ if ($this->scraper->scrape($item->getLink())) {
+ $body = $this->scraper->getContent();
+ $currRTL = $this->scraper->getRTL($currRTL);
+ }
+ }
+
+ $builtItem = $this->buildItem($item, $body, $currRTL);
+ $this->logger->debug('Added item {title} for feed {feed} publishdate: {datetime}', [
+ 'title' => $builtItem->getTitle(),
+ 'feed' => $feedName,
+ 'datetime' => $builtItem->getLastModified(),
+ ]);
+ $items[] = $builtItem;
}
return [$feed, $items];
@@ -164,11 +193,12 @@ class FeedFetcher implements IFeedFetcher
* Build an item based on a feed.
*
* @param ItemInterface $parsedItem The item to use
- * @param FeedInterface $parsedFeed The feed to use
+ * @param string $body Text of the item, if not provided use description from $parsedItem
+ * @param bool $RTL True if the feed is RTL (Right-to-left)
*
* @return Item
*/
- protected function buildItem(ItemInterface $parsedItem, FeedInterface $parsedFeed): Item
+ protected function buildItem(ItemInterface $parsedItem, string $body = null, bool $RTL = false): Item
{
$item = new Item();
$item->setUnread(true);
@@ -188,7 +218,7 @@ class FeedFetcher implements IFeedFetcher
$item->setPubDate($pubDT->getTimestamp());
$item->setLastModified($lastmodified->getTimestamp());
- $item->setRtl($this->determineRtl($parsedFeed));
+ $item->setRtl($RTL);
// unescape content because angularjs helps against XSS
$item->setTitle($this->decodeTwice($parsedItem->getTitle()));
@@ -197,8 +227,12 @@ class FeedFetcher implements IFeedFetcher
$item->setAuthor($this->decodeTwice($author->getName()));
}
+ // Use description from feed if body is not provided (by a scraper)
+ if ($body === null) {
+ $body = $parsedItem->getValue("content:encoded") ?? $parsedItem->getDescription();
+ }
+
// purification is done in the service layer
- $body = $parsedItem->getValue("content:encoded") ?? $parsedItem->getDescription();
$body = mb_convert_encoding(
$body,
'HTML-ENTITIES',
@@ -231,12 +265,6 @@ class FeedFetcher implements IFeedFetcher
}
$item->generateSearchIndex();
-
- $this->logger->debug('Added item {title} for feed {feed} publishdate: {datetime}', [
- 'title' => $item->getTitle(),
- 'feed' => $parsedFeed->getTitle(),
- 'datetime' => $item->getLastModified(),
- ]);
return $item;
}
diff --git a/lib/Fetcher/Fetcher.php b/lib/Fetcher/Fetcher.php
index 425004680..f52141dda 100644
--- a/lib/Fetcher/Fetcher.php
+++ b/lib/Fetcher/Fetcher.php
@@ -45,17 +45,24 @@ class Fetcher
*
* @param string $url remote url of the feed
* @param boolean $getFavicon if the favicon should also be fetched, defaults to true
- * @param string $lastModified a last modified value from an http header defaults to false.
+ * @param string $lastModified a last modified value from an http header defaults to false.
* If lastModified matches the http header from the feed no results are fetched
- * @param string $user if given, basic auth is set for this feed
- * @param string $password if given, basic auth is set for this feed. Ignored if user is empty
+ * @param bool $fullTextEnabled If true use a scraper to download the full article
+ * @param string $user if given, basic auth is set for this feed
+ * @param string $password if given, basic auth is set for this feed. Ignored if user is empty
*
* @throws ReadErrorException if FeedIO fails
* @return array an array containing the new feed and its items, first
* element being the Feed and second element being an array of Items
*/
- public function fetch($url, $getFavicon = true, $lastModified = null, $user = null, $password = null)
- {
+ public function fetch(
+ $url,
+ $getFavicon = true,
+ $lastModified = null,
+ $fullTextEnabled = false,
+ $user = null,
+ $password = null
+ ) {
foreach ($this->fetchers as $fetcher) {
if (!$fetcher->canHandle($url)) {
continue;
@@ -64,6 +71,7 @@ class Fetcher
$url,
$getFavicon,
$lastModified,
+ $fullTextEnabled,
$user,
$password
);
diff --git a/lib/Fetcher/IFeedFetcher.php b/lib/Fetcher/IFeedFetcher.php
index 81bf8526f..6500c2690 100644
--- a/lib/Fetcher/IFeedFetcher.php
+++ b/lib/Fetcher/IFeedFetcher.php
@@ -25,6 +25,7 @@ interface IFeedFetcher
* @param boolean $favicon if the favicon should also be fetched, defaults to true
* @param string|null $lastModified a last modified value from an http header defaults to false.
* If lastModified matches the http header from the feed no results are fetched
+ * @param bool $fullTextEnabled If true use a scraper to download the full article
* @param string|null $user if given, basic auth is set for this feed
* @param string|null $password if given, basic auth is set for this feed. Ignored if user is empty
*
@@ -32,7 +33,7 @@ interface IFeedFetcher
* element being the Feed and second element being an array of Items
* @throws ReadErrorException if the Feed-IO fetcher encounters a problem
*/
- public function fetch(string $url, bool $favicon, $lastModified, $user, $password): array;
+ public function fetch(string $url, bool $favicon, $lastModified, bool $fullTextEnabled, $user, $password): array;
/**
* Can a fetcher handle a feed.
diff --git a/lib/Fetcher/YoutubeFetcher.php b/lib/Fetcher/YoutubeFetcher.php
index 41319a36b..56adaae03 100644
--- a/lib/Fetcher/YoutubeFetcher.php
+++ b/lib/Fetcher/YoutubeFetcher.php
@@ -50,7 +50,7 @@ class YoutubeFetcher implements IFeedFetcher
*
* @inheritdoc
*/
- public function fetch(string $url, bool $favicon, $lastModified, $user, $password): array
+ public function fetch(string $url, bool $favicon, $lastModified, bool $fullTextEnabled, $user, $password): array
{
$transformedUrl = $this->buildUrl($url);
@@ -58,6 +58,7 @@ class YoutubeFetcher implements IFeedFetcher
$transformedUrl,
$favicon,
$lastModified,
+ $fullTextEnabled,
$user,
$password
);
diff --git a/lib/Scraper/IScraper.php b/lib/Scraper/IScraper.php
new file mode 100644
index 000000000..b389b79c9
--- /dev/null
+++ b/lib/Scraper/IScraper.php
@@ -0,0 +1,43 @@
+<?php
+/**
+ * Nextcloud - News
+ *
+ * This file is licensed under the Affero General Public License version 3 or
+ * later. See the COPYING file.
+ *
+ * @author Gioele Falcetti <thegio.f@gmail.com>
+ * @copyright 2019 Gioele Falcetti
+ */
+
+namespace OCA\News\Scraper;
+
+/**
+ * Contract for full-text scrapers: load a page by URL, then expose the
+ * content extracted from it and its text direction.
+ */
+interface IScraper
+{
+ /**
+ * Scrape the page found at the given URL
+ *
+ * @param string $url Address of the page to scrape
+ *
+ * @return bool False if failed
+ *
+ */
+ public function scrape(string $url): bool;
+
+ /**
+ * Get the scraped content
+ *
+ * Meant to be called after scrape() returned true.
+ *
+ * @return string
+ *
+ */
+ public function getContent(): string;
+
+ /**
+ * Get the RTL (right-to-left) information
+ *
+ * @param bool $default Return this value if the scraper is unable to determine it
+ *
+ * @return bool True if the scraped content is right-to-left
+ *
+ */
+ public function getRTL(bool $default = false): bool;
+}
diff --git a/lib/Scraper/Scraper.php b/lib/Scraper/Scraper.php
new file mode 100644
index 000000000..fedb0391c
--- /dev/null
+++ b/lib/Scraper/Scraper.php
@@ -0,0 +1,106 @@
+<?php
+/**
+ * Nextcloud - News
+ *
+ * This file is licensed under the Affero General Public License version 3 or
+ * later. See the COPYING file.
+ *
+ * @author Gioele Falcetti <thegio.f@gmail.com>
+ * @copyright 2019 Gioele Falcetti
+ */
+
+namespace OCA\News\Scraper;
+
+use OCA\News\Utility\PsrLogger;
+
+use andreskrey\Readability\Readability;
+use andreskrey\Readability\Configuration;
+use andreskrey\Readability\ParseException;
+
+/**
+ * Full-text scraper: downloads a page with cURL and extracts the article
+ * from it with readability.php.
+ *
+ * The instance keeps the result of the most recent scrape() call; getContent()
+ * and getRTL() read from that result.
+ */
+class Scraper implements IScraper
+{
+    /** @var PsrLogger Logger used to report download and parse failures */
+    private $logger;
+    /** @var Configuration Readability configuration, reused across scrapes */
+    private $config;
+    /** @var Readability|null Parsed document of the last successful scrape, null otherwise */
+    private $readability;
+    /** @var array cURL options applied to every download */
+    private $curl_opts;
+
+    /**
+     * @param PsrLogger $logger Logger used to report download and parse failures
+     */
+    public function __construct(PsrLogger $logger)
+    {
+        $this->logger = $logger;
+        $this->config = new Configuration([
+            'FixRelativeURLs' => true,
+            'SummonCthulhu' => true, // Remove <script>
+        ]);
+        $this->readability = null;
+
+        $this->curl_opts = array(
+            CURLOPT_RETURNTRANSFER => true, // return web page
+            CURLOPT_HEADER => false, // do not return headers
+            CURLOPT_FOLLOWLOCATION => true, // follow redirects
+            //CURLOPT_USERAGENT => "php-news", // who am i
+            CURLOPT_AUTOREFERER => true, // set referer on redirect
+            CURLOPT_CONNECTTIMEOUT => 120, // timeout on connect
+            CURLOPT_TIMEOUT => 120, // timeout on response
+            CURLOPT_MAXREDIRS => 10, // stop after 10 redirects
+        );
+    }
+
+    /**
+     * Download the page at $url.
+     *
+     * @param string $url Address to download
+     *
+     * @return array Two elements: the body as returned by curl_exec() (false on
+     *               failure) and the URL reported by cURL after redirects
+     */
+    private function getHTTPContent(string $url): array
+    {
+        $handler = curl_init($url);
+        if ($handler === false) {
+            // No handle could be created; report it like a failed transfer
+            return array(false, $url);
+        }
+
+        curl_setopt_array($handler, $this->curl_opts);
+        $content = curl_exec($handler);
+        $header = curl_getinfo($handler);
+        curl_close($handler);
+
+        // 'url' is the address after the redirects have been followed
+        return array($content, $header['url']);
+    }
+
+    /**
+     * Download and parse the page at $url.
+     *
+     * On any failure the previously stored result is discarded, so
+     * getContent() and getRTL() never report data from an older page.
+     *
+     * @param string $url Address of the page to scrape
+     *
+     * @return bool True if the page was downloaded and parsed, false otherwise
+     */
+    public function scrape(string $url): bool
+    {
+        // Forget the previous page first so every failure path leaves a clean state
+        $this->readability = null;
+
+        list($content, $redirected_url) = $this->getHTTPContent($url);
+        if ($content === false) {
+            $this->logger->error('Unable to recive content from {url}', [
+                'url' => $url,
+            ]);
+            return false;
+        }
+
+        // Update URL used to convert relative URLs
+        $this->config->setOriginalURL($redirected_url);
+        $readability = new Readability($this->config);
+
+        try {
+            $readability->parse($content);
+        } catch (ParseException $e) {
+            $this->logger->error('Unable to parse content from {url}', [
+                'url' => $url,
+            ]);
+            // Report the failure so the caller can fall back to the feed's own text
+            return false;
+        }
+
+        $this->readability = $readability;
+        return true;
+    }
+
+    /**
+     * Get the content extracted by the last successful scrape().
+     *
+     * @return string The extracted content, or an empty string when nothing
+     *                is available
+     */
+    public function getContent(): string
+    {
+        if ($this->readability === null) {
+            // The declared return type forbids null; an empty string means "nothing scraped"
+            return '';
+        }
+        // Cast guards against the library handing back null (presumably when no
+        // article was extracted), which would also violate the return type
+        return (string) $this->readability->getContent();
+    }
+
+    /**
+     * Get the RTL (right-to-left) information of the last scraped page.
+     *
+     * @param bool $default Returned when nothing was scraped or the page
+     *                      does not report a direction
+     *
+     * @return bool True if the reported direction is "rtl"
+     */
+    public function getRTL(bool $default = false): bool
+    {
+        if ($this->readability === null) {
+            return $default;
+        }
+
+        $direction = $this->readability->getDirection();
+        if ($direction === null) {
+            return $default;
+        }
+        return $direction === "rtl";
+    }
+}
diff --git a/lib/Service/FeedService.php b/lib/Service/FeedService.php
index 6dee1fd1c..de545b9e3 100644
--- a/lib/Service/FeedService.php
+++ b/lib/Service/FeedService.php
@@ -113,7 +113,7 @@ class FeedService extends Service
* @var Feed $feed
* @var Item[] $items
*/
- list($feed, $items) = $this->feedFetcher->fetch($feedUrl, true, null, $user, $password);
+ list($feed, $items) = $this->feedFetcher->fetch($feedUrl, true, null, false, $user, $password);
// try again if feed exists depending on the reported link
try {
$hash = $feed->getUrlHash();
@@ -224,6 +224,7 @@ class FeedService extends Service
$location,
false,
$existingFeed->getHttpLastModified(),
+ $existingFeed->getFullTextEnabled(),
$existingFeed->getBasicAuthUser(),
$existingFeed->getBasicAuthPassword()
);