diff options
Diffstat (limited to 'vendor/fguillot/picofeed/lib')
8 files changed, 128 insertions, 34 deletions
diff --git a/vendor/fguillot/picofeed/lib/PicoFeed/Client/Client.php b/vendor/fguillot/picofeed/lib/PicoFeed/Client/Client.php index 84a5cf296..ae93f3e83 100644 --- a/vendor/fguillot/picofeed/lib/PicoFeed/Client/Client.php +++ b/vendor/fguillot/picofeed/lib/PicoFeed/Client/Client.php @@ -38,6 +38,14 @@ abstract class Client private $encoding = ''; /** + * HTTP request headers + * + * @access protected + * @var array + */ + protected $request_headers = array(); + + /** * HTTP Etag header * * @access protected @@ -194,6 +202,16 @@ abstract class Client } /** + * Add HTTP Header to the request + * + * @access public + * @param array $headers + */ + public function setHeaders($headers) { + $this->request_headers = $headers; + } + + /** * Perform the HTTP request * * @access public @@ -645,8 +663,8 @@ abstract class Client public function setConfig($config) { if ($config !== null) { - $this->setTimeout($config->getGrabberTimeout()); - $this->setUserAgent($config->getGrabberUserAgent()); + $this->setTimeout($config->getClientTimeout()); + $this->setUserAgent($config->getClientUserAgent()); $this->setMaxRedirections($config->getMaxRedirections()); $this->setMaxBodySize($config->getMaxBodySize()); $this->setProxyHostname($config->getProxyHostname()); diff --git a/vendor/fguillot/picofeed/lib/PicoFeed/Client/Curl.php b/vendor/fguillot/picofeed/lib/PicoFeed/Client/Curl.php index 278eeb422..4ad3f141d 100644 --- a/vendor/fguillot/picofeed/lib/PicoFeed/Client/Curl.php +++ b/vendor/fguillot/picofeed/lib/PicoFeed/Client/Curl.php @@ -34,7 +34,7 @@ class Curl extends Client * @access private * @var array */ - private $headers = array(); + private $response_headers = array(); /** * Counter on the number of header received @@ -42,7 +42,7 @@ class Curl extends Client * @access private * @var integer */ - private $headers_counter = 0; + private $response_headers_count = 0; /** * cURL callback to read the HTTP body @@ -81,15 +81,15 @@ class Curl extends Client $length = strlen($buffer); if ($buffer === "\r\n") { - $this->headers_counter++; + $this->response_headers_count++; } else { - if (! isset($this->headers[$this->headers_counter])) { - $this->headers[$this->headers_counter] = ''; + if (! isset($this->response_headers[$this->response_headers_count])) { + $this->response_headers[$this->response_headers_count] = ''; } - $this->headers[$this->headers_counter] .= $buffer; + $this->response_headers[$this->response_headers_count] .= $buffer; } return $length; @@ -153,6 +153,8 @@ class Curl extends Client $headers[] = 'If-Modified-Since: '.$this->last_modified; } + $headers = array_merge($headers, $this->request_headers); + return $headers; } @@ -234,6 +236,7 @@ class Curl extends Client curl_setopt($ch, CURLOPT_URL, $this->url); curl_setopt($ch, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_1_1); + curl_setopt($ch, CURLOPT_TIMEOUT, $this->timeout); curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, $this->timeout); curl_setopt($ch, CURLOPT_USERAGENT, $this->user_agent); curl_setopt($ch, CURLOPT_HTTPHEADER, $this->prepareHeaders()); @@ -302,7 +305,7 @@ class Curl extends Client { $this->executeContext(); - list($status, $headers) = HttpHeaders::parse(explode("\r\n", $this->headers[$this->headers_counter - 1])); + list($status, $headers) = HttpHeaders::parse(explode("\r\n", $this->response_headers[$this->response_headers_count - 1])); // When restricted with open_basedir if ($this->needToHandleRedirection($follow_location, $status)) { @@ -343,8 +346,8 @@ class Curl extends Client $this->url = Url::resolve($location, $this->url); $this->body = ''; $this->body_length = 0; - $this->headers = array(); - $this->headers_counter = 0; + $this->response_headers = array(); + $this->response_headers_count = 0; while (true) { @@ -360,8 +363,8 @@ class Curl extends Client $this->url = Url::resolve($result['headers']['Location'], $this->url); $this->body = ''; $this->body_length = 0; - $this->headers = array(); - $this->headers_counter = 0; + $this->response_headers = array(); + $this->response_headers_count = 0; } else { break; diff --git a/vendor/fguillot/picofeed/lib/PicoFeed/Client/Grabber.php b/vendor/fguillot/picofeed/lib/PicoFeed/Client/Grabber.php index fe4890400..bec8ab07b 100644 --- a/vendor/fguillot/picofeed/lib/PicoFeed/Client/Grabber.php +++ b/vendor/fguillot/picofeed/lib/PicoFeed/Client/Grabber.php @@ -284,7 +284,7 @@ class Grabber Logger::setMessage(get_called_class().': Content length: '.strlen($this->html).' bytes'); $rules = $this->getRules(); - if (is_array($rules)) { + if (! empty($rules)) { Logger::setMessage(get_called_class().': Parse content with rules'); $this->parseContentWithRules($rules); } @@ -316,7 +316,13 @@ class Grabber try { $client = Client::getInstance(); - $client->setConfig($this->config); + + if ($this->config !== null) { + $client->setConfig($this->config); + $client->setTimeout($this->config->getGrabberTimeout()); + $client->setUserAgent($this->config->getGrabberUserAgent()); + } + $client->execute($this->url); $this->url = $client->getUrl(); @@ -335,31 +341,67 @@ class Grabber * Try to find a predefined rule * * @access public - * @return mixed + * @return array */ public function getRules() { $hostname = parse_url($this->url, PHP_URL_HOST); - if ($hostname === false) { - return false; - } + if ($hostname !== false) { - $files = array($hostname); + $files = $this->getRulesFileList($hostname); - if (substr($hostname, 0, 4) == 'www.') { - $files[] = substr($hostname, 4); + foreach ($this->getRulesFolders() as $folder) { + $rule = $this->loadRuleFile($folder, $files); + + if (! empty($rule)) { + return $rule; + } + } } - if (($pos = strpos($hostname, '.')) !== false) { - $files[] = substr($hostname, $pos); - $files[] = substr($hostname, $pos + 1); - $files[] = substr($hostname, 0, $pos); + return array(); + } + + /** + * Get the list of possible rules file names for a given hostname + * + * @access public + * @param string $hostname Hostname + * @return array + */ + public function getRulesFileList($hostname) + { + $files = array($hostname); // subdomain.domain.tld + $parts = explode('.', $hostname); + $len = count($parts); + + if ($len > 2) { + $subdomain = array_shift($parts); + $files[] = implode('.', $parts); // domain.tld + $files[] = '.'.implode('.', $parts); // .domain.tld + $files[] = $subdomain; // subdomain + } + else if ($len === 2) { + $files[] = '.'.implode('.', $parts); // .domain.tld + $files[] = $parts[0]; // domain } - foreach ($files as $file) { + return $files; + } - $filename = __DIR__.'/../Rules/'.$file.'.php'; + /** + * Load a rule file from the defined folder + * + * @access public + * @param string $folder Rule directory + * @param array $files List of possible file names + * @return array + */ + public function loadRuleFile($folder, array $files) + { + foreach ($files as $file) { + $filename = $folder.'/'.$file.'.php'; if (file_exists($filename)) { Logger::setMessage(get_called_class().' Load rule: '.$file); @@ -367,7 +409,24 @@ class Grabber } } - return false; + return array(); + } + + /** + * Get the list of folders that contains rules + * + * @access public + * @return array + */ + public function getRulesFolders() + { + $folders = array(__DIR__.'/../Rules'); + + if ($this->config !== null && $this->config->getGrabberRulesFolder() !== null) { + $folders[] = $this->config->getGrabberRulesFolder(); + } + + return $folders; } /** diff --git a/vendor/fguillot/picofeed/lib/PicoFeed/Client/Stream.php b/vendor/fguillot/picofeed/lib/PicoFeed/Client/Stream.php index 1e539b106..72afe92c0 100644 --- a/vendor/fguillot/picofeed/lib/PicoFeed/Client/Stream.php +++ b/vendor/fguillot/picofeed/lib/PicoFeed/Client/Stream.php @@ -47,6 +47,8 @@ class Stream extends Client $headers[] = 'Authorization: Basic '.base64_encode($this->username.':'.$this->password); } + $headers = array_merge($headers, $this->request_headers); + return $headers; } diff --git a/vendor/fguillot/picofeed/lib/PicoFeed/Config/Config.php b/vendor/fguillot/picofeed/lib/PicoFeed/Config/Config.php index 181da03b6..1eaaeef9e 100644 --- a/vendor/fguillot/picofeed/lib/PicoFeed/Config/Config.php +++ b/vendor/fguillot/picofeed/lib/PicoFeed/Config/Config.php @@ -16,6 +16,7 @@ namespace PicoFeed\Config; * @method \PicoFeed\Config\Config setProxyPort(integer $value) * @method \PicoFeed\Config\Config setProxyUsername(string $value) * @method \PicoFeed\Config\Config setProxyPassword(string $value) + * @method \PicoFeed\Config\Config setGrabberRulesFolder(string $value) * @method \PicoFeed\Config\Config setGrabberTimeout(integer $value) * @method \PicoFeed\Config\Config setGrabberUserAgent(string $value) * @method \PicoFeed\Config\Config setParserHashAlgo(string $value) @@ -42,6 +43,7 @@ namespace PicoFeed\Config; * @method integer getProxyPort() * @method string getProxyUsername() * @method string getProxyPassword() + * @method string getGrabberRulesFolder() * @method integer getGrabberTimeout() * @method string getGrabberUserAgent() * @method string getParserHashAlgo() diff --git a/vendor/fguillot/picofeed/lib/PicoFeed/Filter/Filter.php b/vendor/fguillot/picofeed/lib/PicoFeed/Filter/Filter.php index 0eb3f88ea..123f9896e 100644 --- a/vendor/fguillot/picofeed/lib/PicoFeed/Filter/Filter.php +++ b/vendor/fguillot/picofeed/lib/PicoFeed/Filter/Filter.php @@ -120,6 +120,7 @@ class Filter "\x10", "\xc3\x20", "", + "\xe2\x80\x9c\x08", ); foreach ($invalid_chars as $needle) { diff --git a/vendor/fguillot/picofeed/lib/PicoFeed/Parser/XmlParser.php b/vendor/fguillot/picofeed/lib/PicoFeed/Parser/XmlParser.php index feda8c254..d0c2f8ef0 100644 --- a/vendor/fguillot/picofeed/lib/PicoFeed/Parser/XmlParser.php +++ b/vendor/fguillot/picofeed/lib/PicoFeed/Parser/XmlParser.php @@ -55,7 +55,9 @@ class XmlParser */ private static function scanInput($input, Closure $callback) { - if (substr(php_sapi_name(), 0, 3) === 'fpm') { + $isRunningFpm = substr(php_sapi_name(), 0, 3) === 'fpm'; + + if ($isRunningFpm) { // If running with PHP-FPM and an entity is detected we refuse to parse the feed // @see https://bugs.php.net/bug.php?id=64938 @@ -64,8 +66,7 @@ class XmlParser } } else { - - libxml_disable_entity_loader(true); + $entityLoaderDisabled = libxml_disable_entity_loader(true); } libxml_use_internal_errors(true); @@ -81,6 +82,10 @@ class XmlParser } } + if ($isRunningFpm === false) { + libxml_disable_entity_loader($entityLoaderDisabled); + } + return $dom; } diff --git a/vendor/fguillot/picofeed/lib/PicoFeed/Rules/www.lemonde.fr.php b/vendor/fguillot/picofeed/lib/PicoFeed/Rules/www.lemonde.fr.php index ce2a95018..125bb6a34 100644 --- a/vendor/fguillot/picofeed/lib/PicoFeed/Rules/www.lemonde.fr.php +++ b/vendor/fguillot/picofeed/lib/PicoFeed/Rules/www.lemonde.fr.php @@ -1,9 +1,13 @@ <?php return array( - 'test_url' => 'http://www.lemonde.fr/societe/article/2013/08/30/boris-boillon-ancien-ambassadeur-de-sarkozy-arrete-avec-350-000-euros-en-liquide_3469109_3224.html', + 'test_url' => array( + 'http://www.lemonde.fr/societe/article/2013/08/30/boris-boillon-ancien-ambassadeur-de-sarkozy-arrete-avec-350-000-euros-en-liquide_3469109_3224.html', + 'http://www.lemonde.fr/afrique/article/2015/04/06/plonge-dans-la-crise-l-angola-revele-son-vrai-visage_4610364_3212.html', + ), 'body' => array( '//div[@id="articleBody"]', + '//div[@itemprop="articleBody"]', ), 'strip' => array( ), -);
\ No newline at end of file +); |