From bc27596f70cb170203655a55c0f53ee55d8d6316 Mon Sep 17 00:00:00 2001
From: Robin Appelman
Date: Wed, 4 Jan 2017 11:10:19 +0100
Subject: Add postprocessor for getting lwn subscriber articles (#72)

---
Review notes (not part of the original commit): LWNProcessor.php was revised.
canHandle() now anchors its pattern and escapes the dots, getBody() no longer
rewrites scheme-relative links, and execute() keeps the feed content when no
article text is extracted, strips only a trailing "/rss" and returns false
explicitly. The blob id on that file's index line is therefore stale.
Indentation and blank context lines were reconstructed from a
whitespace-collapsed copy of the patch and should be checked against the tree.

 lib/Fetcher/FeedFetcher.php            |   8 ++-
 lib/PostProcessor/LWNProcessor.php     | 140 +++++++++++++++++++++++++++++++++
 tests/Unit/Fetcher/FeedFetcherTest.php |  12 +++-
 3 files changed, 158 insertions(+), 2 deletions(-)
 create mode 100644 lib/PostProcessor/LWNProcessor.php

diff --git a/lib/Fetcher/FeedFetcher.php b/lib/Fetcher/FeedFetcher.php
index 194a927f9..587e64af2 100644
--- a/lib/Fetcher/FeedFetcher.php
+++ b/lib/Fetcher/FeedFetcher.php
@@ -15,6 +15,8 @@ namespace OCA\News\Fetcher;
 
 use Exception;
 
+use OCA\News\PostProcessor\LWNProcessor;
+use OCP\Http\Client\IClientService;
 use PicoFeed\Parser\MalFormedXmlException;
 use PicoFeed\Reader\Reader;
 use PicoFeed\Parser\Parser;
@@ -42,15 +44,18 @@ class FeedFetcher implements IFeedFetcher {
     private $reader;
     private $l10n;
     private $time;
+    private $clientService;
 
     public function __construct(Reader $reader,
                                 PicoFeedFaviconFactory $faviconFactory,
                                 IL10N $l10n,
-                                Time $time) {
+                                Time $time,
+                                IClientService $clientService) {
         $this->faviconFactory = $faviconFactory;
         $this->reader = $reader;
         $this->time = $time;
         $this->l10n = $l10n;
+        $this->clientService = $clientService;
     }
 
 
@@ -108,6 +113,7 @@ class FeedFetcher implements IFeedFetcher {
 
             if ($fullTextEnabled) {
                 $parser->enableContentGrabber();
+                $parser->getItemPostProcessor()->register(new LWNProcessor($basicAuthUser, $basicAuthPassword, $this->clientService));
             }
 
             $parsedFeed = $parser->execute();
diff --git a/lib/PostProcessor/LWNProcessor.php b/lib/PostProcessor/LWNProcessor.php
new file mode 100644
index 000000000..f931bb41c
--- /dev/null
+++ b/lib/PostProcessor/LWNProcessor.php
@@ -0,0 +1,140 @@
+/**
+ * Item post-processor that replaces the content of lwn.net feed items with
+ * the article text, fetched with the credentials given to the constructor.
+ *
+ * NOTE(review): the PHP opening tag and the license/author header that
+ * preceded this point were lost when the patch was extracted; restore
+ * them from the repository before applying.
+ */
+
+namespace OCA\News\PostProcessor;
+
+use GuzzleHttp\Cookie\CookieJar;
+use OCP\Http\Client\IClientService;
+use PicoFeed\Parser\Feed;
+use PicoFeed\Parser\Item;
+use PicoFeed\Processor\ItemProcessorInterface;
+use PicoFeed\Scraper\RuleParser;
+
+class LWNProcessor implements ItemProcessorInterface {
+    /** @var string|null LWN account name */
+    private $user;
+
+    /** @var string|null LWN account password */
+    private $password;
+
+    /** @var IClientService factory for the HTTP clients used for all requests */
+    private $clientService;
+
+    /** @var CookieJar cookies shared between the login and article requests */
+    private $cookieJar;
+
+    /**
+     * @param string|null $user LWN account name
+     * @param string|null $password LWN account password
+     * @param IClientService $clientService HTTP client factory
+     */
+    public function __construct($user, $password, IClientService $clientService) {
+        $this->user = $user;
+        $this->password = $password;
+        $this->clientService = $clientService;
+        $this->cookieJar = new CookieJar();
+    }
+
+    /**
+     * Log in to lwn.net unless the cookie jar already holds cookies.
+     *
+     * @return bool true when cookies are available for the article request
+     */
+    private function login() {
+        if ($this->cookieJar->count() > 0) {
+            return true;
+        }
+        if (!$this->user || !$this->password) {
+            return false;
+        }
+
+        $client = $this->clientService->newClient();
+        $response = $client->post('https://lwn.net/login', [
+            'cookies' => $this->cookieJar,
+            'body' => [
+                'Username' => $this->user,
+                'Password' => $this->password,
+                'target' => '/'
+            ]
+        ]);
+        return ($response->getStatusCode() === 200 && $this->cookieJar->count() > 0);
+    }
+
+    /**
+     * Download an article page and extract the article text from it.
+     *
+     * @param string $url article URL
+     * @return string extracted HTML with site-relative links made absolute
+     */
+    private function getBody($url) {
+        $client = $this->clientService->newClient();
+        $response = $client->get($url, [
+            'cookies' => $this->cookieJar
+        ]);
+        $parser = new RuleParser($response->getBody(), [
+            'body' => [
+                '//div[@class="ArticleText"]',
+            ],
+            'strip' => [
+                '//div[@class="FeatureByline"]'
+            ]
+        ]);
+        $articleBody = $parser->execute();
+        // make site-relative links absolute; the lookahead leaves
+        // scheme-relative links (href="//host/...") untouched
+        return preg_replace('%href="/(?!/)%', 'href="https://lwn.net/', $articleBody);
+    }
+
+    /**
+     * Check whether a URL points at lwn.net.
+     *
+     * The pattern is anchored and its dots are escaped, so look-alike hosts
+     * and URLs that only mention "lwn.net" in their path or query are rejected.
+     *
+     * @param string $url
+     * @return bool
+     */
+    private function canHandle($url) {
+        $regex = '%^(?:https?://|//)?(?:www\.)?lwn\.net(?:[:/?#]|$)%i';
+
+        return (bool)preg_match($regex, (string)$url);
+    }
+
+    /**
+     * Execute Item Processor
+     *
+     * For lwn.net items the trailing "/rss" is dropped from the item URL and,
+     * after a successful login, the content is replaced by the article text.
+     *
+     * @access public
+     * @param Feed $feed
+     * @param Item $item
+     * @return bool always false; NOTE(review): presumably this lets PicoFeed
+     *              run any further processors - confirm against the interface
+     */
+    public function execute(Feed $feed, Item $item) {
+        if (!$this->canHandle($item->getUrl())) {
+            return false;
+        }
+
+        $loggedIn = $this->login();
+
+        // drop only a trailing "/rss" segment, not every occurrence in the URL
+        $item->setUrl(preg_replace('%/rss(?=$|[?#])%', '', $item->getUrl()));
+        if ($loggedIn) {
+            $body = (string)$this->getBody($item->getUrl());
+            // keep the content from the feed when no article text was extracted
+            if ($body !== '') {
+                $item->setContent($body);
+            }
+        }
+
+        return false;
+    }
+}
diff --git a/tests/Unit/Fetcher/FeedFetcherTest.php b/tests/Unit/Fetcher/FeedFetcherTest.php
index 930cf4c99..ce09bb0e3 100644
--- a/tests/Unit/Fetcher/FeedFetcherTest.php
+++ b/tests/Unit/Fetcher/FeedFetcherTest.php
@@ -15,6 +15,8 @@ namespace OCA\News\Fetcher;
 
 use \OCA\News\Db\Item;
 use \OCA\News\Db\Feed;
+use OCP\Http\Client\IClientService;
+use PicoFeed\Processor\ItemPostProcessor;
 
 
 class FeedFetcherTest extends \PHPUnit_Framework_TestCase {
@@ -97,11 +99,19 @@ class FeedFetcherTest extends \PHPUnit_Framework_TestCase {
         $timeFactory->expects($this->any())
             ->method('getTime')
             ->will($this->returnValue($this->time));
+        $postProcessor = $this->getMockBuilder(ItemPostProcessor::class)
+            ->getMock();
+        $this->parser->expects($this->any())
+            ->method('getItemPostProcessor')
+            ->will($this->returnValue($postProcessor));
+        $clientService = $this->getMockBuilder(IClientService::class)
+            ->getMock();
         $this->fetcher = new FeedFetcher(
             $this->reader,
             $this->faviconFactory,
             $this->l10n,
-            $timeFactory);
+            $timeFactory,
+            $clientService);
 
         $this->url = 'http://tests';
         $this->permalink = 'http://permalink';
-- 
cgit v1.2.3