From e63fbaebcba2ef3330ed34426b76bb3ad4156ea6 Mon Sep 17 00:00:00 2001 From: Bernhard Posselt Date: Sat, 11 Apr 2015 15:21:54 +0200 Subject: update picofeed, fix #763 --- .../picofeed/lib/PicoFeed/Client/Grabber.php | 91 ++++++++++++++++++---- 1 file changed, 75 insertions(+), 16 deletions(-) (limited to 'vendor/fguillot/picofeed/lib/PicoFeed/Client/Grabber.php') diff --git a/vendor/fguillot/picofeed/lib/PicoFeed/Client/Grabber.php b/vendor/fguillot/picofeed/lib/PicoFeed/Client/Grabber.php index fe4890400..bec8ab07b 100644 --- a/vendor/fguillot/picofeed/lib/PicoFeed/Client/Grabber.php +++ b/vendor/fguillot/picofeed/lib/PicoFeed/Client/Grabber.php @@ -284,7 +284,7 @@ class Grabber Logger::setMessage(get_called_class().': Content length: '.strlen($this->html).' bytes'); $rules = $this->getRules(); - if (is_array($rules)) { + if (! empty($rules)) { Logger::setMessage(get_called_class().': Parse content with rules'); $this->parseContentWithRules($rules); } @@ -316,7 +316,13 @@ class Grabber try { $client = Client::getInstance(); - $client->setConfig($this->config); + + if ($this->config !== null) { + $client->setConfig($this->config); + $client->setTimeout($this->config->getGrabberTimeout()); + $client->setUserAgent($this->config->getGrabberUserAgent()); + } + $client->execute($this->url); $this->url = $client->getUrl(); @@ -335,31 +341,67 @@ class Grabber * Try to find a predefined rule * * @access public - * @return mixed + * @return array */ public function getRules() { $hostname = parse_url($this->url, PHP_URL_HOST); - if ($hostname === false) { - return false; - } + if ($hostname !== false) { - $files = array($hostname); + $files = $this->getRulesFileList($hostname); - if (substr($hostname, 0, 4) == 'www.') { - $files[] = substr($hostname, 4); + foreach ($this->getRulesFolders() as $folder) { + $rule = $this->loadRuleFile($folder, $files); + + if (! empty($rule)) { + return $rule; + } + } } - if (($pos = strpos($hostname, '.')) !== false) { - $files[] = substr($hostname, $pos); - $files[] = substr($hostname, $pos + 1); - $files[] = substr($hostname, 0, $pos); + return array(); + } + + /** + * Get the list of possible rules file names for a given hostname + * + * @access public + * @param string $hostname Hostname + * @return array + */ + public function getRulesFileList($hostname) + { + $files = array($hostname); // subdomain.domain.tld + $parts = explode('.', $hostname); + $len = count($parts); + + if ($len > 2) { + $subdomain = array_shift($parts); + $files[] = implode('.', $parts); // domain.tld + $files[] = '.'.implode('.', $parts); // .domain.tld + $files[] = $subdomain; // subdomain + } + else if ($len === 2) { + $files[] = '.'.implode('.', $parts); // .domain.tld + $files[] = $parts[0]; // domain } - foreach ($files as $file) { + return $files; + } - $filename = __DIR__.'/../Rules/'.$file.'.php'; + /** + * Load a rule file from the defined folder + * + * @access public + * @param string $folder Rule directory + * @param array $files List of possible file names + * @return array + */ + public function loadRuleFile($folder, array $files) + { + foreach ($files as $file) { + $filename = $folder.'/'.$file.'.php'; if (file_exists($filename)) { Logger::setMessage(get_called_class().' Load rule: '.$file); @@ -367,7 +409,24 @@ class Grabber } } - return false; + return array(); + } + + /** + * Get the list of folders that contains rules + * + * @access public + * @return array + */ + public function getRulesFolders() + { + $folders = array(__DIR__.'/../Rules'); + + if ($this->config !== null && $this->config->getGrabberRulesFolder() !== null) { + $folders[] = $this->config->getGrabberRulesFolder(); + } + + return $folders; } /** -- cgit v1.2.3