summaryrefslogtreecommitdiffstats
path: root/lib/Scraper
diff options
context:
space:
mode:
author: DriverXX <DriverXX@users.noreply.github.com> 2019-12-24 09:33:19 +0100
committer: Benjamin Brahmer <info@b-brahmer.de> 2019-12-24 09:33:19 +0100
commit: 6673cbc3d940745a0ecddb93b32805a0fbe79eb1 (patch)
tree: 72d50a36c4635311a684375106652d88d019575e /lib/Scraper
parent: c2f617dd400681b67927781a73a735600803d9ae (diff)
Reimplement full-text scraping (#563)
Add readability.php scraper. Fixes #482. Signed-off-by: Gioele Falcetti <thegio.f@gmail.com>
Diffstat (limited to 'lib/Scraper')
-rw-r--r--lib/Scraper/IScraper.php43
-rw-r--r--lib/Scraper/Scraper.php106
2 files changed, 149 insertions, 0 deletions
diff --git a/lib/Scraper/IScraper.php b/lib/Scraper/IScraper.php
new file mode 100644
index 000000000..b389b79c9
--- /dev/null
+++ b/lib/Scraper/IScraper.php
@@ -0,0 +1,43 @@
+<?php
+/**
+ * Nextcloud - News
+ *
+ * This file is licensed under the Affero General Public License version 3 or
+ * later. See the COPYING file.
+ *
+ * @author Gioele Falcetti <thegio.f@gmail.com>
+ * @copyright 2019 Gioele Falcetti
+ */
+
+namespace OCA\News\Scraper;
+
+interface IScraper
+{
+ /**
+ * Scrape the page found at the given URL.
+ *
+ * Implementations are expected to keep the outcome available so that it
+ * can be read back afterwards through getContent() and getRTL().
+ *
+ * @param string $url Address of the page to scrape
+ *
+ * @return bool False if failed
+ *
+ */
+ public function scrape(string $url): bool;
+
+ /**
+ * Get the scraped content
+ *
+ * @return string Content obtained by the scraper
+ *
+ */
+ public function getContent(): string;
+
+ /**
+ * Get the RTL (right-to-left) information
+ *
+ * @param bool $default Return this value if the scraper is unable to determine it
+ *
+ * @return bool The RTL flag, or $default when it cannot be determined
+ *
+ */
+ public function getRTL(bool $default = false): bool;
+}
diff --git a/lib/Scraper/Scraper.php b/lib/Scraper/Scraper.php
new file mode 100644
index 000000000..fedb0391c
--- /dev/null
+++ b/lib/Scraper/Scraper.php
@@ -0,0 +1,106 @@
+<?php
+/**
+ * Nextcloud - News
+ *
+ * This file is licensed under the Affero General Public License version 3 or
+ * later. See the COPYING file.
+ *
+ * @author Gioele Falcetti <thegio.f@gmail.com>
+ * @copyright 2019 Gioele Falcetti
+ */
+
+namespace OCA\News\Scraper;
+
+use OCA\News\Utility\PsrLogger;
+
+use andreskrey\Readability\Readability;
+use andreskrey\Readability\Configuration;
+use andreskrey\Readability\ParseException;
+
+/**
+ * Full-text scraper: downloads a page with cURL and hands the HTML to
+ * readability.php for parsing.
+ *
+ * The object is stateful: getContent() and getRTL() report on the most
+ * recent successful scrape() call.
+ */
+class Scraper implements IScraper
+{
+    /** @var PsrLogger Receives download and parse failures */
+    private $logger;
+
+    /** @var Configuration Readability settings reused by every scrape */
+    private $config;
+
+    /** @var Readability|null Parsed document; null until a scrape succeeds */
+    private $readability;
+
+    /** @var array cURL options applied to every request */
+    private $curl_opts;
+
+    /**
+     * @param PsrLogger $logger Logger used to report scraping errors
+     */
+    public function __construct(PsrLogger $logger)
+    {
+        $this->logger = $logger;
+        $this->config = new Configuration([
+            'FixRelativeURLs' => true,
+            'SummonCthulhu' => true, // Remove <script>
+        ]);
+        $this->readability = null;
+
+        $this->curl_opts = [
+            CURLOPT_RETURNTRANSFER => true, // return web page
+            CURLOPT_HEADER => false, // do not return headers
+            CURLOPT_FOLLOWLOCATION => true, // follow redirects
+            //CURLOPT_USERAGENT => "php-news", // who am i
+            CURLOPT_AUTOREFERER => true, // set referer on redirect
+            CURLOPT_CONNECTTIMEOUT => 120, // timeout on connect
+            CURLOPT_TIMEOUT => 120, // timeout on response
+            CURLOPT_MAXREDIRS => 10, // stop after 10 redirects
+        ];
+    }
+
+    /**
+     * Download a page, following redirects.
+     *
+     * @param string $url Address to fetch
+     *
+     * @return array Two elements: the response body (false when the
+     *               transfer failed) and the URL reached after redirects
+     */
+    private function getHTTPContent(string $url): array
+    {
+        $handler = curl_init($url);
+        if ($handler === false) {
+            // No handle could be created; report it like a failed transfer
+            return [false, $url];
+        }
+
+        curl_setopt_array($handler, $this->curl_opts);
+        $content = curl_exec($handler);
+        // URL reached once the redirects have been followed
+        $effective_url = curl_getinfo($handler, CURLINFO_EFFECTIVE_URL);
+        curl_close($handler);
+
+        return [$content, $effective_url];
+    }
+
+    /**
+     * Download and parse the page at the given URL.
+     *
+     * On any failure the previous result is discarded, so getContent() and
+     * getRTL() never report on a stale or half-parsed document.
+     *
+     * @param string $url Address of the page to scrape
+     *
+     * @return bool True when the page was downloaded and parsed, false
+     *              when the download or the parsing failed
+     */
+    public function scrape(string $url): bool
+    {
+        // Drop the result of any earlier call before starting a new one
+        $this->readability = null;
+
+        list($content, $redirected_url) = $this->getHTTPContent($url);
+        if ($content === false) {
+            $this->logger->error('Unable to recive content from {url}', [
+                'url' => $url,
+            ]);
+            return false;
+        }
+
+        // Update URL used to convert relative URLs
+        $this->config->setOriginalURL($redirected_url);
+        $readability = new Readability($this->config);
+
+        try {
+            $readability->parse($content);
+        } catch (ParseException $e) {
+            $this->logger->error('Unable to parse content from {url}', [
+                'url' => $url,
+            ]);
+            // A parse failure is a failed scrape: report it to the caller
+            return false;
+        }
+
+        // Only publish the parser once it holds a successfully parsed page
+        $this->readability = $readability;
+        return true;
+    }
+
+    /**
+     * Get the content extracted by the last successful scrape.
+     *
+     * @return string The extracted content, or an empty string when no
+     *                scrape has succeeded yet or no content is available
+     */
+    public function getContent(): string
+    {
+        if ($this->readability === null) {
+            // Returning null here would violate the declared return type
+            return '';
+        }
+
+        // Guard against a non-string result from the parser
+        $content = $this->readability->getContent();
+        return is_string($content) ? $content : '';
+    }
+
+    /**
+     * Tell whether the last scraped page is written right-to-left.
+     *
+     * @param bool $default Value returned when no page has been scraped or
+     *                      the parser reports no direction
+     *
+     * @return bool True when the reported direction is "rtl"
+     */
+    public function getRTL(bool $default = false): bool
+    {
+        if ($this->readability === null) {
+            return $default;
+        }
+
+        $direction = $this->readability->getDirection();
+        if ($direction === null) {
+            return $default;
+        }
+        return $direction === 'rtl';
+    }
+}