path: root/lib/Scraper/Scraper.php
<?php
/**
 * Nextcloud - News
 *
 * This file is licensed under the Affero General Public License version 3 or
 * later. See the COPYING file.
 *
 * @author Gioele Falcetti <thegio.f@gmail.com>
 * @copyright 2019 Gioele Falcetti
 */

namespace OCA\News\Scraper;

use andreskrey\Readability\Readability;
use andreskrey\Readability\Configuration;
use andreskrey\Readability\ParseException;
use Psr\Log\LoggerInterface;

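/**
 * Fetches web pages and extracts their readable article content
 * using Readability.php.
 */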
class Scraper implements IScraper
{
    private $logger;
    private $config;
    private $readability;
    private $curl_opts;

    public function __construct(LoggerInterface $logger)
    {
        $this->logger = $logger;
        $this->config = new Configuration([
            'FixRelativeURLs' => true,
            'SummonCthulhu' => true, // Remove <script>
        ]);
        $this->readability = null;

        $this->curl_opts = array(
            CURLOPT_RETURNTRANSFER => true,     // return web page
            CURLOPT_HEADER         => false,    // do not return headers
            CURLOPT_FOLLOWLOCATION => true,     // follow redirects
            //CURLOPT_USERAGENT    => "php-news", // who am i
            CURLOPT_AUTOREFERER    => true,     // set referer on redirect
            CURLOPT_CONNECTTIMEOUT => 120,      // timeout on connect
            CURLOPT_TIMEOUT        => 120,      // timeout on response
            CURLOPT_MAXREDIRS      => 10,       // stop after 10 redirects
        );
    }

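    /**
     * Download a page with cURL.
     *
     * @param string $url URL to fetch
     *
     * @return array The response body (or false on failure) and the final
     *               URL after redirects have been followed
     */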
    private function getHTTPContent(string $url): array
    {
        $handler = curl_init($url);
        curl_setopt_array($handler, $this->curl_opts);
        $content = curl_exec($handler);
        $header  = curl_getinfo($handler);
        curl_close($handler);

        // Return the content together with the final URL, after any redirects
        // have been followed, so relative links can be resolved later
        return array($content, $header['url']);
    }

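    /**
     * Download and parse the page at the given URL.
     *
     * @param string $url URL to scrape
     *
     * @return bool True when readable content was extracted, false otherwise
     */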
    public function scrape(string $url): bool
    {
        list($content, $redirected_url) = $this->getHTTPContent($url);
        if ($content === false) {
            $this->logger->error('Unable to receive content from {url}', [
                 'url' => $url,
            ]);
            $this->readability = null;
            return false;
        }

        // Use the final URL (after redirects) as the base for resolving relative URLs
        $this->config->setOriginalURL($redirected_url);
        $this->readability = new Readability($this->config);

        try {
            $this->readability->parse($content);
        } catch (ParseException $e) {
            $this->logger->error('Unable to parse content from {url}', [
                'url' => $url,
            ]);
            // Parsing failed, so no readable content is available
            $this->readability = null;
            return false;
        }
        return true;
    }

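    /**
     * Return the extracted article HTML, or null when nothing has been scraped.
     */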
    public function getContent(): ?string
    {
        if ($this->readability === null) {
            return null;
        }
        return $this->readability->getContent();
    }

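    /**
     * Return whether the scraped page is written right-to-left.
     *
     * @param bool $default Value to return when the direction is unknown
     */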
    public function getRTL(bool $default = false): bool
    {
        if ($this->readability === null) {
            return $default;
        }

        $RTL = $this->readability->getDirection();
        if ($RTL === null) {
            return $default;
        }
        return $RTL === "rtl";
    }
}
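
/*
 * Usage sketch (illustrative only, not part of this class): given a PSR-3
 * logger instance, e.g. obtained through dependency injection, the scraper
 * can be used roughly like this:
 *
 *     $scraper = new Scraper($logger);
 *     if ($scraper->scrape('https://example.com/article')) {
 *         $html = $scraper->getContent();   // extracted article HTML, or null
 *         $rtl  = $scraper->getRTL();       // true when the page reads right-to-left
 *     }
 */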