summaryrefslogtreecommitdiffstats
path: root/vendor
diff options
context:
space:
mode:
authorBernhard Posselt <dev@bernhard-posselt.com>2015-04-11 15:21:54 +0200
committerBernhard Posselt <dev@bernhard-posselt.com>2015-04-11 15:21:54 +0200
commite63fbaebcba2ef3330ed34426b76bb3ad4156ea6 (patch)
tree9a5e930af3aff537b8afe03863f5252f982c10fb /vendor
parent896577ae02b38412c6045111d9ecb13d7e81b00f (diff)
update picofeed, fix #763
Diffstat (limited to 'vendor')
-rw-r--r--vendor/composer/installed.json8
-rw-r--r--vendor/fguillot/picofeed/docs/config.markdown10
-rw-r--r--vendor/fguillot/picofeed/docs/installation.markdown13
-rw-r--r--vendor/fguillot/picofeed/lib/PicoFeed/Client/Client.php22
-rw-r--r--vendor/fguillot/picofeed/lib/PicoFeed/Client/Curl.php25
-rw-r--r--vendor/fguillot/picofeed/lib/PicoFeed/Client/Grabber.php91
-rw-r--r--vendor/fguillot/picofeed/lib/PicoFeed/Client/Stream.php2
-rw-r--r--vendor/fguillot/picofeed/lib/PicoFeed/Config/Config.php2
-rw-r--r--vendor/fguillot/picofeed/lib/PicoFeed/Filter/Filter.php1
-rw-r--r--vendor/fguillot/picofeed/lib/PicoFeed/Parser/XmlParser.php11
-rw-r--r--vendor/fguillot/picofeed/lib/PicoFeed/Rules/www.lemonde.fr.php8
-rw-r--r--vendor/fguillot/picofeed/tests/Client/GrabberTest.php87
12 files changed, 225 insertions, 55 deletions
diff --git a/vendor/composer/installed.json b/vendor/composer/installed.json
index 0d41799e2..fa2c89fda 100644
--- a/vendor/composer/installed.json
+++ b/vendor/composer/installed.json
@@ -119,12 +119,12 @@
"source": {
"type": "git",
"url": "https://github.com/fguillot/picoFeed.git",
- "reference": "7c28753d5936ba635435a8e0e941dcabee67b243"
+ "reference": "273c344b35b468b6c8053f635332c3a404f8c7b9"
},
"dist": {
"type": "zip",
- "url": "https://api.github.com/repos/fguillot/picoFeed/zipball/7c28753d5936ba635435a8e0e941dcabee67b243",
- "reference": "7c28753d5936ba635435a8e0e941dcabee67b243",
+ "url": "https://api.github.com/repos/fguillot/picoFeed/zipball/273c344b35b468b6c8053f635332c3a404f8c7b9",
+ "reference": "273c344b35b468b6c8053f635332c3a404f8c7b9",
"shasum": ""
},
"require": {
@@ -138,7 +138,7 @@
"suggest": {
"ext-curl": "PicoFeed will use cURL if present"
},
- "time": "2015-03-30 23:34:59",
+ "time": "2015-04-11 12:46:50",
"bin": [
"picofeed"
],
diff --git a/vendor/fguillot/picofeed/docs/config.markdown b/vendor/fguillot/picofeed/docs/config.markdown
index 8b197f6f9..3360abf73 100644
--- a/vendor/fguillot/picofeed/docs/config.markdown
+++ b/vendor/fguillot/picofeed/docs/config.markdown
@@ -126,6 +126,16 @@ $config->setGrabberTimeout(20); // 20 seconds
$config->setGrabberUserAgent('My content scraper');
```
+### Add a rules folder
+
+- Method name: `setGrabberRulesFolder()`
+- Default value: `null`
+- Argument value: string
+
+```php
+$config->setGrabberRulesFolder('/path/to/my/grabber/rules');
+```
+
Parser
------
diff --git a/vendor/fguillot/picofeed/docs/installation.markdown b/vendor/fguillot/picofeed/docs/installation.markdown
index 9bf1450d1..ecc6b3b4d 100644
--- a/vendor/fguillot/picofeed/docs/installation.markdown
+++ b/vendor/fguillot/picofeed/docs/installation.markdown
@@ -5,14 +5,7 @@ Versions
--------
- Development version: master
-- Available versions:
- - v0.1.2 (stable)
- - v0.1.1
- - v0.1.0
- - v0.0.2
- - v0.0.1
-
-Note: The public API has changed between 0.0.x and 0.1.0
+- Stable version: v0.1.3
Installation with Composer
--------------------------
@@ -22,7 +15,7 @@ Configure your `composer.json`:
```json
{
"require": {
- "fguillot/picofeed": "0.1.2"
+ "fguillot/picofeed": "0.1.3"
}
}
```
@@ -30,7 +23,7 @@ Configure your `composer.json`:
Or simply:
```bash
-composer require fguillot/picofeed:0.1.2
+composer require fguillot/picofeed:0.1.3
```
And download the code:
diff --git a/vendor/fguillot/picofeed/lib/PicoFeed/Client/Client.php b/vendor/fguillot/picofeed/lib/PicoFeed/Client/Client.php
index 84a5cf296..ae93f3e83 100644
--- a/vendor/fguillot/picofeed/lib/PicoFeed/Client/Client.php
+++ b/vendor/fguillot/picofeed/lib/PicoFeed/Client/Client.php
@@ -38,6 +38,14 @@ abstract class Client
private $encoding = '';
/**
+ * HTTP request headers
+ *
+ * @access protected
+ * @var array
+ */
+ protected $request_headers = array();
+
+ /**
* HTTP Etag header
*
* @access protected
@@ -194,6 +202,16 @@ abstract class Client
}
/**
+ * Set the HTTP headers sent with the request (replaces any previously set headers)
+ *
+ * @access public
+ * @param array $headers
+ */
+ public function setHeaders($headers) {
+ $this->request_headers = $headers;
+ }
+
+ /**
* Perform the HTTP request
*
* @access public
@@ -645,8 +663,8 @@ abstract class Client
public function setConfig($config)
{
if ($config !== null) {
- $this->setTimeout($config->getGrabberTimeout());
- $this->setUserAgent($config->getGrabberUserAgent());
+ $this->setTimeout($config->getClientTimeout());
+ $this->setUserAgent($config->getClientUserAgent());
$this->setMaxRedirections($config->getMaxRedirections());
$this->setMaxBodySize($config->getMaxBodySize());
$this->setProxyHostname($config->getProxyHostname());
diff --git a/vendor/fguillot/picofeed/lib/PicoFeed/Client/Curl.php b/vendor/fguillot/picofeed/lib/PicoFeed/Client/Curl.php
index 278eeb422..4ad3f141d 100644
--- a/vendor/fguillot/picofeed/lib/PicoFeed/Client/Curl.php
+++ b/vendor/fguillot/picofeed/lib/PicoFeed/Client/Curl.php
@@ -34,7 +34,7 @@ class Curl extends Client
* @access private
* @var array
*/
- private $headers = array();
+ private $response_headers = array();
/**
* Counter on the number of header received
@@ -42,7 +42,7 @@ class Curl extends Client
* @access private
* @var integer
*/
- private $headers_counter = 0;
+ private $response_headers_count = 0;
/**
* cURL callback to read the HTTP body
@@ -81,15 +81,15 @@ class Curl extends Client
$length = strlen($buffer);
if ($buffer === "\r\n") {
- $this->headers_counter++;
+ $this->response_headers_count++;
}
else {
- if (! isset($this->headers[$this->headers_counter])) {
- $this->headers[$this->headers_counter] = '';
+ if (! isset($this->response_headers[$this->response_headers_count])) {
+ $this->response_headers[$this->response_headers_count] = '';
}
- $this->headers[$this->headers_counter] .= $buffer;
+ $this->response_headers[$this->response_headers_count] .= $buffer;
}
return $length;
@@ -153,6 +153,8 @@ class Curl extends Client
$headers[] = 'If-Modified-Since: '.$this->last_modified;
}
+ $headers = array_merge($headers, $this->request_headers);
+
return $headers;
}
@@ -234,6 +236,7 @@ class Curl extends Client
curl_setopt($ch, CURLOPT_URL, $this->url);
curl_setopt($ch, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_1_1);
+ curl_setopt($ch, CURLOPT_TIMEOUT, $this->timeout);
curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, $this->timeout);
curl_setopt($ch, CURLOPT_USERAGENT, $this->user_agent);
curl_setopt($ch, CURLOPT_HTTPHEADER, $this->prepareHeaders());
@@ -302,7 +305,7 @@ class Curl extends Client
{
$this->executeContext();
- list($status, $headers) = HttpHeaders::parse(explode("\r\n", $this->headers[$this->headers_counter - 1]));
+ list($status, $headers) = HttpHeaders::parse(explode("\r\n", $this->response_headers[$this->response_headers_count - 1]));
// When restricted with open_basedir
if ($this->needToHandleRedirection($follow_location, $status)) {
@@ -343,8 +346,8 @@ class Curl extends Client
$this->url = Url::resolve($location, $this->url);
$this->body = '';
$this->body_length = 0;
- $this->headers = array();
- $this->headers_counter = 0;
+ $this->response_headers = array();
+ $this->response_headers_count = 0;
while (true) {
@@ -360,8 +363,8 @@ class Curl extends Client
$this->url = Url::resolve($result['headers']['Location'], $this->url);
$this->body = '';
$this->body_length = 0;
- $this->headers = array();
- $this->headers_counter = 0;
+ $this->response_headers = array();
+ $this->response_headers_count = 0;
}
else {
break;
diff --git a/vendor/fguillot/picofeed/lib/PicoFeed/Client/Grabber.php b/vendor/fguillot/picofeed/lib/PicoFeed/Client/Grabber.php
index fe4890400..bec8ab07b 100644
--- a/vendor/fguillot/picofeed/lib/PicoFeed/Client/Grabber.php
+++ b/vendor/fguillot/picofeed/lib/PicoFeed/Client/Grabber.php
@@ -284,7 +284,7 @@ class Grabber
Logger::setMessage(get_called_class().': Content length: '.strlen($this->html).' bytes');
$rules = $this->getRules();
- if (is_array($rules)) {
+ if (! empty($rules)) {
Logger::setMessage(get_called_class().': Parse content with rules');
$this->parseContentWithRules($rules);
}
@@ -316,7 +316,13 @@ class Grabber
try {
$client = Client::getInstance();
- $client->setConfig($this->config);
+
+ if ($this->config !== null) {
+ $client->setConfig($this->config);
+ $client->setTimeout($this->config->getGrabberTimeout());
+ $client->setUserAgent($this->config->getGrabberUserAgent());
+ }
+
$client->execute($this->url);
$this->url = $client->getUrl();
@@ -335,31 +341,67 @@ class Grabber
* Try to find a predefined rule
*
* @access public
- * @return mixed
+ * @return array
*/
public function getRules()
{
$hostname = parse_url($this->url, PHP_URL_HOST);
- if ($hostname === false) {
- return false;
- }
+ if ($hostname !== false) {
- $files = array($hostname);
+ $files = $this->getRulesFileList($hostname);
- if (substr($hostname, 0, 4) == 'www.') {
- $files[] = substr($hostname, 4);
+ foreach ($this->getRulesFolders() as $folder) {
+ $rule = $this->loadRuleFile($folder, $files);
+
+ if (! empty($rule)) {
+ return $rule;
+ }
+ }
}
- if (($pos = strpos($hostname, '.')) !== false) {
- $files[] = substr($hostname, $pos);
- $files[] = substr($hostname, $pos + 1);
- $files[] = substr($hostname, 0, $pos);
+ return array();
+ }
+
+ /**
+ * Get the list of possible rules file names for a given hostname
+ *
+ * @access public
+ * @param string $hostname Hostname
+ * @return array
+ */
+ public function getRulesFileList($hostname)
+ {
+ $files = array($hostname); // subdomain.domain.tld
+ $parts = explode('.', $hostname);
+ $len = count($parts);
+
+ if ($len > 2) {
+ $subdomain = array_shift($parts);
+ $files[] = implode('.', $parts); // domain.tld
+ $files[] = '.'.implode('.', $parts); // .domain.tld
+ $files[] = $subdomain; // subdomain
+ }
+ else if ($len === 2) {
+ $files[] = '.'.implode('.', $parts); // .domain.tld
+ $files[] = $parts[0]; // domain
}
- foreach ($files as $file) {
+ return $files;
+ }
- $filename = __DIR__.'/../Rules/'.$file.'.php';
+ /**
+ * Load a rule file from the defined folder
+ *
+ * @access public
+ * @param string $folder Rule directory
+ * @param array $files List of possible file names
+ * @return array
+ */
+ public function loadRuleFile($folder, array $files)
+ {
+ foreach ($files as $file) {
+ $filename = $folder.'/'.$file.'.php';
if (file_exists($filename)) {
Logger::setMessage(get_called_class().' Load rule: '.$file);
@@ -367,7 +409,24 @@ class Grabber
}
}
- return false;
+ return array();
+ }
+
+ /**
+ * Get the list of folders that contain rules
+ *
+ * @access public
+ * @return array
+ */
+ public function getRulesFolders()
+ {
+ $folders = array(__DIR__.'/../Rules');
+
+ if ($this->config !== null && $this->config->getGrabberRulesFolder() !== null) {
+ $folders[] = $this->config->getGrabberRulesFolder();
+ }
+
+ return $folders;
}
/**
diff --git a/vendor/fguillot/picofeed/lib/PicoFeed/Client/Stream.php b/vendor/fguillot/picofeed/lib/PicoFeed/Client/Stream.php
index 1e539b106..72afe92c0 100644
--- a/vendor/fguillot/picofeed/lib/PicoFeed/Client/Stream.php
+++ b/vendor/fguillot/picofeed/lib/PicoFeed/Client/Stream.php
@@ -47,6 +47,8 @@ class Stream extends Client
$headers[] = 'Authorization: Basic '.base64_encode($this->username.':'.$this->password);
}
+ $headers = array_merge($headers, $this->request_headers);
+
return $headers;
}
diff --git a/vendor/fguillot/picofeed/lib/PicoFeed/Config/Config.php b/vendor/fguillot/picofeed/lib/PicoFeed/Config/Config.php
index 181da03b6..1eaaeef9e 100644
--- a/vendor/fguillot/picofeed/lib/PicoFeed/Config/Config.php
+++ b/vendor/fguillot/picofeed/lib/PicoFeed/Config/Config.php
@@ -16,6 +16,7 @@ namespace PicoFeed\Config;
* @method \PicoFeed\Config\Config setProxyPort(integer $value)
* @method \PicoFeed\Config\Config setProxyUsername(string $value)
* @method \PicoFeed\Config\Config setProxyPassword(string $value)
+ * @method \PicoFeed\Config\Config setGrabberRulesFolder(string $value)
* @method \PicoFeed\Config\Config setGrabberTimeout(integer $value)
* @method \PicoFeed\Config\Config setGrabberUserAgent(string $value)
* @method \PicoFeed\Config\Config setParserHashAlgo(string $value)
@@ -42,6 +43,7 @@ namespace PicoFeed\Config;
* @method integer getProxyPort()
* @method string getProxyUsername()
* @method string getProxyPassword()
+ * @method string getGrabberRulesFolder()
* @method integer getGrabberTimeout()
* @method string getGrabberUserAgent()
* @method string getParserHashAlgo()
diff --git a/vendor/fguillot/picofeed/lib/PicoFeed/Filter/Filter.php b/vendor/fguillot/picofeed/lib/PicoFeed/Filter/Filter.php
index 0eb3f88ea..123f9896e 100644
--- a/vendor/fguillot/picofeed/lib/PicoFeed/Filter/Filter.php
+++ b/vendor/fguillot/picofeed/lib/PicoFeed/Filter/Filter.php
@@ -120,6 +120,7 @@ class Filter
"\x10",
"\xc3\x20",
"&#x1F;",
+ "\xe2\x80\x9c\x08",
);
foreach ($invalid_chars as $needle) {
diff --git a/vendor/fguillot/picofeed/lib/PicoFeed/Parser/XmlParser.php b/vendor/fguillot/picofeed/lib/PicoFeed/Parser/XmlParser.php
index feda8c254..d0c2f8ef0 100644
--- a/vendor/fguillot/picofeed/lib/PicoFeed/Parser/XmlParser.php
+++ b/vendor/fguillot/picofeed/lib/PicoFeed/Parser/XmlParser.php
@@ -55,7 +55,9 @@ class XmlParser
*/
private static function scanInput($input, Closure $callback)
{
- if (substr(php_sapi_name(), 0, 3) === 'fpm') {
+ $isRunningFpm = substr(php_sapi_name(), 0, 3) === 'fpm';
+
+ if ($isRunningFpm) {
// If running with PHP-FPM and an entity is detected we refuse to parse the feed
// @see https://bugs.php.net/bug.php?id=64938
@@ -64,8 +66,7 @@ class XmlParser
}
}
else {
-
- libxml_disable_entity_loader(true);
+ $entityLoaderDisabled = libxml_disable_entity_loader(true);
}
libxml_use_internal_errors(true);
@@ -81,6 +82,10 @@ class XmlParser
}
}
+ if ($isRunningFpm === false) {
+ libxml_disable_entity_loader($entityLoaderDisabled);
+ }
+
return $dom;
}
diff --git a/vendor/fguillot/picofeed/lib/PicoFeed/Rules/www.lemonde.fr.php b/vendor/fguillot/picofeed/lib/PicoFeed/Rules/www.lemonde.fr.php
index ce2a95018..125bb6a34 100644
--- a/vendor/fguillot/picofeed/lib/PicoFeed/Rules/www.lemonde.fr.php
+++ b/vendor/fguillot/picofeed/lib/PicoFeed/Rules/www.lemonde.fr.php
@@ -1,9 +1,13 @@
<?php
return array(
- 'test_url' => 'http://www.lemonde.fr/societe/article/2013/08/30/boris-boillon-ancien-ambassadeur-de-sarkozy-arrete-avec-350-000-euros-en-liquide_3469109_3224.html',
+ 'test_url' => array(
+ 'http://www.lemonde.fr/societe/article/2013/08/30/boris-boillon-ancien-ambassadeur-de-sarkozy-arrete-avec-350-000-euros-en-liquide_3469109_3224.html',
+ 'http://www.lemonde.fr/afrique/article/2015/04/06/plonge-dans-la-crise-l-angola-revele-son-vrai-visage_4610364_3212.html',
+ ),
'body' => array(
'//div[@id="articleBody"]',
+ '//div[@itemprop="articleBody"]',
),
'strip' => array(
),
-); \ No newline at end of file
+);
diff --git a/vendor/fguillot/picofeed/tests/Client/GrabberTest.php b/vendor/fguillot/picofeed/tests/Client/GrabberTest.php
index 224dc14f1..9b057fc1f 100644
--- a/vendor/fguillot/picofeed/tests/Client/GrabberTest.php
+++ b/vendor/fguillot/picofeed/tests/Client/GrabberTest.php
@@ -4,9 +4,89 @@ namespace PicoFeed\Client;
use PHPUnit_Framework_TestCase;
use PicoFeed\Reader\Reader;
+use PicoFeed\Config\Config;
class GrabberTest extends PHPUnit_Framework_TestCase
{
+ public function testGetRulesFolders()
+ {
+ // No custom path
+ $grabber = new Grabber('');
+ $dirs = $grabber->getRulesFolders();
+ $this->assertNotEmpty($dirs);
+ $this->assertCount(1, $dirs);
+ $this->assertTrue(strpos($dirs[0], '/../Rules') !== false);
+
+ // Custom path
+ $config = new Config;
+ $config->setGrabberRulesFolder('/foobar/rules');
+
+ $grabber = new Grabber('');
+ $grabber->setConfig($config);
+
+ $dirs = $grabber->getRulesFolders();
+
+ $this->assertNotEmpty($dirs);
+ $this->assertCount(2, $dirs);
+ $this->assertTrue(strpos($dirs[0], '/../Rules') !== false);
+ $this->assertEquals('/foobar/rules', $dirs[1]);
+
+ // No custom path with empty config object
+ $grabber = new Grabber('');
+ $grabber->setConfig(new Config);
+
+ $dirs = $grabber->getRulesFolders();
+
+ $this->assertNotEmpty($dirs);
+ $this->assertCount(1, $dirs);
+ $this->assertTrue(strpos($dirs[0], '/../Rules') !== false);
+ }
+
+ public function testLoadRuleFile()
+ {
+ $grabber = new Grabber('');
+ $dirs = $grabber->getRulesFolders();
+
+ $this->assertEmpty($grabber->loadRuleFile($dirs[0], array('test')));
+ $this->assertNotEmpty($grabber->loadRuleFile($dirs[0], array('test', 'xkcd.com')));
+ }
+
+ public function testGetRulesFileList()
+ {
+ $grabber = new Grabber('');
+ $this->assertEquals(
+ array('www.google.ca', 'google.ca', '.google.ca', 'www'),
+ $grabber->getRulesFileList('www.google.ca')
+ );
+
+ $grabber = new Grabber('');
+ $this->assertEquals(
+ array('google.ca', '.google.ca', 'google'),
+ $grabber->getRulesFileList('google.ca')
+ );
+
+ $grabber = new Grabber('');
+ $this->assertEquals(
+ array('a.b.c.d', 'b.c.d', '.b.c.d', 'a'),
+ $grabber->getRulesFileList('a.b.c.d')
+ );
+
+ $grabber = new Grabber('');
+ $this->assertEquals(
+ array('localhost'),
+ $grabber->getRulesFileList('localhost')
+ );
+ }
+
+ public function testGetRules()
+ {
+ $grabber = new Grabber('http://www.egscomics.com/index.php?id=1690');
+ $this->assertNotEmpty($grabber->getRules());
+
+ $grabber = new Grabber('http://localhost/foobar');
+ $this->assertEmpty($grabber->getRules());
+ }
+
/**
* @group online
*/
@@ -33,13 +113,6 @@ class GrabberTest extends PHPUnit_Framework_TestCase
$this->assertTrue($grabber->parse());
}
- public function testGetRules()
- {
- $grabber = new Grabber('http://www.egscomics.com/index.php?id=1690');
- $this->assertTrue(is_array($grabber->getRules()));
- }
-
- // 01net.com - https://github.com/fguillot/miniflux/issues/267
/**
* @group online
*/