From 8241180c6ce0cb19255d70a3394f891e08182542 Mon Sep 17 00:00:00 2001 From: Bernhard Posselt Date: Tue, 27 Jan 2015 09:31:40 +0100 Subject: dont use picofeed submodule --- vendor/fguillot/picofeed | 1 - .../picofeed/lib/PicoFeed/Client/Grabber.php | 535 +++++++++++++++++++++ 2 files changed, 535 insertions(+), 1 deletion(-) delete mode 160000 vendor/fguillot/picofeed create mode 100644 vendor/fguillot/picofeed/lib/PicoFeed/Client/Grabber.php (limited to 'vendor/fguillot/picofeed/lib/PicoFeed/Client/Grabber.php') diff --git a/vendor/fguillot/picofeed b/vendor/fguillot/picofeed deleted file mode 160000 index 0a1d0d395..000000000 --- a/vendor/fguillot/picofeed +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 0a1d0d3950f7f047dc8fb1d80aa6296e15f306d0 diff --git a/vendor/fguillot/picofeed/lib/PicoFeed/Client/Grabber.php b/vendor/fguillot/picofeed/lib/PicoFeed/Client/Grabber.php new file mode 100644 index 000000000..52f2f0bf1 --- /dev/null +++ b/vendor/fguillot/picofeed/lib/PicoFeed/Client/Grabber.php @@ -0,0 +1,535 @@ +url = $url; + $this->html = $html; + $this->encoding = $encoding; + + $this->handleFiles(); + $this->handleStreamingVideos(); + } + + /** + * Set config object + * + * @access public + * @param \PicoFeed\Config\Config $config Config instance + * @return Grabber + */ + public function setConfig($config) + { + $this->config = $config; + return $this; + } + + /** + * Get URL to download. + * + * @access public + * @return string + */ + public function getUrl() + { + return $this->url; + } + + /** + * Set URL to download and reset object to use for another grab. 
/**
 * Set URL to download and reset object to use for another grab.
 *
 * Clears the fetched HTML, the extracted content, the encoding and the
 * skip flag, then re-evaluates the URL based shortcuts (PDF documents,
 * streaming videos) for the new URL.
 *
 * @access public
 * @param  string  $url  URL
 * @return void
 */
public function setUrl($url)
{
    $this->url = $url;
    $this->html = '';
    $this->content = '';
    $this->encoding = '';

    // A previous PDF/video URL must not force the next grab to be skipped
    $this->skip_processing = false;

    $this->handleFiles();
    $this->handleStreamingVideos();
}

/**
 * Get relevant content
 *
 * @access public
 * @return string  Content extracted by parse() (or set by a URL shortcut)
 */
public function getContent()
{
    return $this->content;
}

/**
 * Get raw content (unfiltered)
 *
 * @access public
 * @return string  HTML as stored by the constructor, download() or parse()
 */
public function getRawContent()
{
    return $this->html;
}

/**
 * Get filtered relevant content
 *
 * Runs the extracted content through Filter::html() using the current
 * config object.
 *
 * @access public
 * @return string
 */
public function getFilteredContent()
{
    $filter = Filter::html($this->content, $this->url);
    $filter->setConfig($this->config);

    return $filter->execute();
}

/**
 * Use the Youtube embed player as content and skip processing
 *
 * When the URL carries an 11 character video id, the content becomes an
 * embed player for that id and parse() is short-circuited.
 *
 * @access public
 * @return void
 */
public function handleStreamingVideos()
{
    // The dot of "youtu.be" is escaped so that it only matches a literal dot
    if (preg_match('#(?<=v=|v\/|vi=|vi\/|youtu\.be\/)[a-zA-Z0-9_-]{11}#', $this->url, $matches)) {
        // $matches[0] only contains [a-zA-Z0-9_-], it is safe inside the attribute
        $this->content = '<iframe width="560" height="315" src="https://www.youtube.com/embed/'.$matches[0].'" frameborder="0"></iframe>';
        $this->skip_processing = true;
    }
}

/**
 * Skip processing for PDF documents
 *
 * A URL is considered a PDF when it ends with "pdf" (case-insensitive) or
 * when its path component has a "pdf" extension, which also covers URLs
 * followed by a query string or a fragment.
 *
 * @access public
 * @return void
 */
public function handleFiles()
{
    $suffix = strtolower((string) substr($this->url, -3));

    $path = parse_url($this->url, PHP_URL_PATH);
    $extension = is_string($path) ? strtolower(pathinfo($path, PATHINFO_EXTENSION)) : '';

    if ($suffix === 'pdf' || $extension === 'pdf') {
        $this->skip_processing = true;
        Logger::setMessage(get_called_class().': PDF document => processing skipped');
    }
}

/**
 * Parse the HTML content
 *
 * Converts the encoding, strips the head tags and extracts the relevant
 * content, with the predefined rules of the website when getRules() finds
 * some, with the generic candidates otherwise.
 *
 * @access public
 * @return bool    True if processing is skipped or if some content was found
 */
public function parse()
{
    if ($this->skip_processing) {
        return true;
    }

    if ($this->html) {

        Logger::setMessage(get_called_class().': Fix encoding');
        Logger::setMessage(get_called_class().': HTTP Encoding "'.$this->encoding.'"');

        $this->html = Encoding::convert($this->html, $this->encoding);
        $this->html = Filter::stripHeadTags($this->html);

        Logger::setMessage(get_called_class().': Content length: '.strlen($this->html).' bytes');
        $rules = $this->getRules();

        if (is_array($rules)) {
            Logger::setMessage(get_called_class().': Parse content with rules');
            $this->parseContentWithRules($rules);
        }
        else {
            Logger::setMessage(get_called_class().': Parse content with candidates');
            $this->parseContentWithCandidates();
        }
    }
    else {
        Logger::setMessage(get_called_class().': No content fetched');
    }

    Logger::setMessage(get_called_class().': Content length: '.strlen($this->content).' bytes');
    Logger::setMessage(get_called_class().': Grabber done');

    return $this->content !== '';
}

/**
 * Download the HTML content
 *
 * Nothing is fetched when processing is skipped or when the URL is empty.
 * On success the URL, the HTML and the encoding are replaced by the values
 * reported by the HTTP client; a ClientException is logged and the current
 * HTML is left as it is.
 *
 * @access public
 * @return string  HTML content
 */
public function download()
{
    if (! $this->skip_processing && $this->url != '') {

        try {

            $client = Client::getInstance();
            $client->setConfig($this->config);
            $client->execute($this->url);

            $this->url = $client->getUrl();
            $this->html = $client->getContent();
            $this->encoding = $client->getEncoding();
        }
        catch (ClientException $e) {
            Logger::setMessage(get_called_class().': '.$e->getMessage());
        }
    }

    return $this->html;
}

/**
 * Try to find a predefined rule
 *
 * Candidate rule files are derived from the host name of the URL: the full
 * host, the host without "www.", then the pieces around the first dot.
 * The first existing file in the Rules directory is included.
 *
 * @access public
 * @return mixed   Value returned by the rule file, false when there is none
 */
public function getRules()
{
    $hostname = parse_url($this->url, PHP_URL_HOST);

    // parse_url() returns null when the host is missing and false when the
    // URL is seriously malformed, neither can match a rule file
    if (! is_string($hostname) || $hostname === '') {
        return false;
    }

    $files = array($hostname);

    if (substr($hostname, 0, 4) == 'www.') {
        $files[] = substr($hostname, 4);
    }

    if (($pos = strpos($hostname, '.')) !== false) {
        $files[] = substr($hostname, $pos);
        $files[] = substr($hostname, $pos + 1);
        $files[] = substr($hostname, 0, $pos);
    }

    foreach ($files as $file) {

        // substr() may produce an empty candidate (host ending with a dot)
        if (! is_string($file) || $file === '') {
            continue;
        }

        // The host comes from an untrusted URL and ends up in an include
        // path: refuse anything that could leave the Rules directory
        if (strpbrk($file, "/\\\0") !== false) {
            continue;
        }

        $filename = __DIR__.'/../Rules/'.$file.'.php';

        if (file_exists($filename)) {
            Logger::setMessage(get_called_class().' Load rule: '.$file);
            return include $filename;
        }
    }

    return false;
}

/**
 * Get the relevant content with predefined rules
 *
 * Nodes matching the XPath expressions of $rules['strip'] are removed
 * first, then the nodes matching $rules['body'] are appended to the content.
 *
 * @access public
 * @param  array   $rules   Rules
 * @return void
 */
public function parseContentWithRules(array $rules)
{
    $dom = XmlParser::getHtmlDocument(''.$this->html);
    $xpath = new DOMXPath($dom);

    if (isset($rules['strip']) && is_array($rules['strip'])) {

        foreach ($rules['strip'] as $pattern) {

            $nodes = $xpath->query($pattern);

            if ($nodes !== false && $nodes->length > 0) {
                foreach ($nodes as $node) {
                    // A node without parent (the document itself) cannot be detached
                    if ($node->parentNode !== null) {
                        $node->parentNode->removeChild($node);
                    }
                }
            }
        }
    }

    if (isset($rules['body']) && is_array($rules['body'])) {

        foreach ($rules['body'] as $pattern) {

            $nodes = $xpath->query($pattern);

            if ($nodes !== false && $nodes->length > 0) {
                foreach ($nodes as $node) {
                    $this->content .= $dom->saveXML($node);
                }
            }
        }
    }
}

/**
 * Get the relevant content with the list of potential attributes
 *
 * Takes the first node matching one of the candidate class/id names, falls
 * back to the first <article> tag when less than 200 bytes were found, then
 * to the whole <body> when less than 50 bytes were found, and finally
 * strips the garbage from the result.
 *
 * @access public
 * @return void
 */
public function parseContentWithCandidates()
{
    $dom = XmlParser::getHtmlDocument(''.$this->html);
    $xpath = new DOMXPath($dom);

    // Try to lookup in each tag
    foreach ($this->candidatesAttributes as $candidate) {

        Logger::setMessage(get_called_class().': Try this candidate: "'.$candidate.'"');

        $nodes = $xpath->query('//*[(contains(@class, "'.$candidate.'") or @id="'.$candidate.'") and not (contains(@class, "nav") or contains(@class, "page"))]');

        if ($nodes !== false && $nodes->length > 0) {
            $this->content = $dom->saveXML($nodes->item(0));
            Logger::setMessage(get_called_class().': Find candidate "'.$candidate.'" ('.strlen($this->content).' bytes)');
            break;
        }
    }

    // Try to fetch <article>
    if (strlen($this->content) < 200) {

        $nodes = $xpath->query('//article');

        if ($nodes !== false && $nodes->length > 0) {
            $this->content = $dom->saveXML($nodes->item(0));
            Logger::setMessage(get_called_class().': Find <article> tag ('.strlen($this->content).' bytes)');
        }
    }

    // Get everything
    if (strlen($this->content) < 50) {

        $nodes = $xpath->query('//body');

        if ($nodes !== false && $nodes->length > 0) {
            Logger::setMessage(get_called_class().' No enought content fetched, get //body');
            $this->content = $dom->saveXML($nodes->item(0));
        }
    }

    Logger::setMessage(get_called_class().': Strip garbage');
    $this->stripGarbage();
}

/**
 * Strip useless tags
 *
 * Removes every tag listed in stripTags, then every node whose class or id
 * contains one of the stripAttributes values unless shouldRemove() vetoes
 * it. The content is left untouched when it cannot be loaded as a document.
 *
 * @access public
 * @return void
 */
public function stripGarbage()
{
    $dom = XmlParser::getDomDocument($this->content);

    if ($dom !== false) {

        $xpath = new DOMXPath($dom);

        foreach ($this->stripTags as $tag) {

            $nodes = $xpath->query('//'.$tag);

            if ($nodes !== false && $nodes->length > 0) {
                Logger::setMessage(get_called_class().': Strip tag: "'.$tag.'"');
                foreach ($nodes as $node) {
                    if ($node->parentNode !== null) {
                        $node->parentNode->removeChild($node);
                    }
                }
            }
        }

        foreach ($this->stripAttributes as $attribute) {

            $nodes = $xpath->query('//*[contains(@class, "'.$attribute.'") or contains(@id, "'.$attribute.'")]');

            if ($nodes !== false && $nodes->length > 0) {
                Logger::setMessage(get_called_class().': Strip attribute: "'.$attribute.'"');
                foreach ($nodes as $node) {
                    if ($node->parentNode !== null && $this->shouldRemove($dom, $node)) {
                        $node->parentNode->removeChild($node);
                    }
                }
            }
        }

        $this->content = $dom->saveXML($dom->documentElement);
    }
}

/**
 * Return false if the node should not be removed
 *
 * A node holding at least 90% of the text of the document is kept, so that
 * a badly named wrapper does not take the whole article away with it.
 *
 * @access public
 * @param  DomDocument  $dom
 * @param  DomNode      $node
 * @return boolean
 */
public function shouldRemove($dom, $node)
{
    $document_length = strlen($dom->textContent);
    $node_length = strlen($node->textContent);

    // Nothing to protect in an empty document (also avoids a division by zero)
    if ($document_length === 0) {
        return true;
    }

    $ratio = $node_length * 100 / $document_length;

    if ($ratio >= 90) {
        Logger::setMessage(get_called_class().': Should not remove this node ('.$node->nodeName.') ratio: '.$ratio.'%');
        return false;
    }

    return true;
}
} // closes the enclosing class, whose header lies outside this excerpt
cgit v1.2.3