From 73f65c8fbadbdd2098448e77b6d3f0464ad8613e Mon Sep 17 00:00:00 2001 From: Bernhard Posselt Date: Tue, 27 Jan 2015 09:29:09 +0100 Subject: update picofeed --- vendor/fguillot/picofeed | 1 + .../picofeed/lib/PicoFeed/Client/Grabber.php | 414 --------------------- 2 files changed, 1 insertion(+), 414 deletions(-) create mode 160000 vendor/fguillot/picofeed delete mode 100644 vendor/fguillot/picofeed/lib/PicoFeed/Client/Grabber.php (limited to 'vendor/fguillot/picofeed/lib/PicoFeed/Client/Grabber.php') diff --git a/vendor/fguillot/picofeed b/vendor/fguillot/picofeed new file mode 160000 index 000000000..0a1d0d395 --- /dev/null +++ b/vendor/fguillot/picofeed @@ -0,0 +1 @@ +Subproject commit 0a1d0d3950f7f047dc8fb1d80aa6296e15f306d0 diff --git a/vendor/fguillot/picofeed/lib/PicoFeed/Client/Grabber.php b/vendor/fguillot/picofeed/lib/PicoFeed/Client/Grabber.php deleted file mode 100644 index 1bca05664..000000000 --- a/vendor/fguillot/picofeed/lib/PicoFeed/Client/Grabber.php +++ /dev/null @@ -1,414 +0,0 @@ -url = $url; - $this->html = $html; - $this->encoding = $encoding; - } - - /** - * Set config object - * - * @access public - * @param \PicoFeed\Config\Config $config Config instance - * @return Grabber - */ - public function setConfig($config) - { - $this->config = $config; - return $this; - } - - /** - * Get relevant content - * - * @access public - * @return string - */ - public function getContent() - { - return $this->content; - } - - /** - * Get raw content (unfiltered) - * - * @access public - * @return string - */ - public function getRawContent() - { - return $this->html; - } - - /** - * Get filtered relevant content - * - * @access public - * @return string - */ - public function getFilteredContent() - { - $filter = Filter::html($this->content, $this->url); - $filter->setConfig($this->config); - return $filter->execute(); - } - - /** - * Parse the HTML content - * - * @access public - * @return bool - */ - public function parse() - { - if ($this->html) { - - Logger::setMessage(get_called_class().' Fix encoding'); - Logger::setMessage(get_called_class().': HTTP Encoding "'.$this->encoding.'"'); - - $this->html = Encoding::convert($this->html, $this->encoding); - $this->html = Filter::stripHeadTags($this->html); - - Logger::setMessage(get_called_class().' Content length: '.strlen($this->html).' bytes'); - $rules = $this->getRules(); - - if (is_array($rules)) { - Logger::setMessage(get_called_class().' Parse content with rules'); - $this->parseContentWithRules($rules); - } - else { - Logger::setMessage(get_called_class().' Parse content with candidates'); - $this->parseContentWithCandidates(); - } - } - else { - Logger::setMessage(get_called_class().' No content fetched'); - } - - Logger::setMessage(get_called_class().' Content length: '.strlen($this->content).' bytes'); - Logger::setMessage(get_called_class().' Grabber done'); - - return $this->content !== ''; - } - - /** - * Download the HTML content - * - * @access public - * @return HTML content - */ - public function download() - { - $client = Client::getInstance(); - $client->setConfig($this->config); - $client->execute($this->url); - - $this->url = $client->getUrl(); - $this->html = $client->getContent(); - $this->encoding = $client->getEncoding(); - - return $this->html; - } - - /** - * Try to find a predefined rule - * - * @access public - * @return mixed - */ - public function getRules() - { - $hostname = parse_url($this->url, PHP_URL_HOST); - - if ($hostname === false) { - return false; - } - - $files = array($hostname); - - if (substr($hostname, 0, 4) == 'www.') { - $files[] = substr($hostname, 4); - } - - if (($pos = strpos($hostname, '.')) !== false) { - $files[] = substr($hostname, $pos); - $files[] = substr($hostname, $pos + 1); - $files[] = substr($hostname, 0, $pos); - } - - foreach ($files as $file) { - - $filename = __DIR__.'/../Rules/'.$file.'.php'; - - if (file_exists($filename)) { - Logger::setMessage(get_called_class().' Load rule: '.$file); - return include $filename; - } - } - - return false; - } - - /** - * Get the relevant content with predefined rules - * - * @access public - * @param array $rules Rules - */ - public function parseContentWithRules(array $rules) - { - // Logger::setMessage($this->html); - $dom = XmlParser::getHtmlDocument(''.$this->html); - $xpath = new DOMXPath($dom); - - if (isset($rules['strip']) && is_array($rules['strip'])) { - - foreach ($rules['strip'] as $pattern) { - - $nodes = $xpath->query($pattern); - - if ($nodes !== false && $nodes->length > 0) { - foreach ($nodes as $node) { - $node->parentNode->removeChild($node); - } - } - } - } - - if (isset($rules['body']) && is_array($rules['body'])) { - - foreach ($rules['body'] as $pattern) { - - $nodes = $xpath->query($pattern); - - if ($nodes !== false && $nodes->length > 0) { - foreach ($nodes as $node) { - $this->content .= $dom->saveXML($node); - } - } - } - } - } - - /** - * Get the relevant content with the list of potential attributes - * - * @access public - */ - public function parseContentWithCandidates() - { - $dom = XmlParser::getHtmlDocument(''.$this->html); - $xpath = new DOMXPath($dom); - - // Try to lookup in each tag - foreach ($this->candidatesAttributes as $candidate) { - - Logger::setMessage(get_called_class().' Try this candidate: "'.$candidate.'"'); - - $nodes = $xpath->query('//*[(contains(@class, "'.$candidate.'") or @id="'.$candidate.'") and not (contains(@class, "nav") or contains(@class, "page"))]'); - - if ($nodes !== false && $nodes->length > 0) { - $this->content = $dom->saveXML($nodes->item(0)); - Logger::setMessage(get_called_class().' Find candidate "'.$candidate.'" ('.strlen($this->content).' bytes)'); - break; - } - } - - // Try to fetch
- if (! $this->content) { - - $nodes = $xpath->query('//article'); - - if ($nodes !== false && $nodes->length > 0) { - $this->content = $dom->saveXML($nodes->item(0)); - Logger::setMessage(get_called_class().' Find
tag ('.strlen($this->content).' bytes)'); - } - } - - if (strlen($this->content) < 50) { - Logger::setMessage(get_called_class().' No enought content fetched, get the full body'); - $this->content = $dom->saveXML($dom->firstChild); - } - - Logger::setMessage(get_called_class().' Strip garbage'); - $this->stripGarbage(); - } - - /** - * Strip useless tags - * - * @access public - */ - public function stripGarbage() - { - $dom = XmlParser::getDomDocument($this->content); - - if ($dom !== false) { - - $xpath = new DOMXPath($dom); - - foreach ($this->stripTags as $tag) { - - $nodes = $xpath->query('//'.$tag); - - if ($nodes !== false && $nodes->length > 0) { - Logger::setMessage(get_called_class().' Strip tag: "'.$tag.'"'); - foreach ($nodes as $node) { - $node->parentNode->removeChild($node); - } - } - } - - foreach ($this->stripAttributes as $attribute) { - - $nodes = $xpath->query('//*[contains(@class, "'.$attribute.'") or contains(@id, "'.$attribute.'")]'); - - if ($nodes !== false && $nodes->length > 0) { - Logger::setMessage(get_called_class().' Strip attribute: "'.$attribute.'"'); - foreach ($nodes as $node) { - $node->parentNode->removeChild($node); - } - } - } - - $this->content = $dom->saveXML($dom->documentElement); - } - } -} -- cgit v1.2.3