diff options
Diffstat (limited to 'vendor/fguillot/picofeed/lib/PicoFeed/Client/Grabber.php')
-rw-r--r-- | vendor/fguillot/picofeed/lib/PicoFeed/Client/Grabber.php | 592 |
1 files changed, 0 insertions, 592 deletions
<?php

namespace PicoFeed\Client;

use DOMXPath;
use PicoFeed\Encoding\Encoding;
use PicoFeed\Logging\Logger;
use PicoFeed\Filter\Filter;
use PicoFeed\Parser\XmlParser;

/**
 * Grabber class
 *
 * Downloads a web page and extracts the "relevant" part of it (the article
 * body), either with site specific XPath rules or with a list of well-known
 * class/id names.
 *
 * @author  Frederic Guillot
 * @package Client
 */
class Grabber
{
    /**
     * URL
     *
     * @access private
     * @var string
     */
    private $url = '';

    /**
     * Relevant content
     *
     * @access private
     * @var string
     */
    private $content = '';

    /**
     * HTML content
     *
     * @access private
     * @var string
     */
    private $html = '';

    /**
     * HTML content encoding
     *
     * @access private
     * @var string
     */
    private $encoding = '';

    /**
     * Flag to skip download and parsing
     *
     * @access private
     * @var boolean
     */
    private $skip_processing = false;

    /**
     * List of attributes to try to get the content, order is important, generic terms at the end
     *
     * @access private
     * @var array
     */
    private $candidatesAttributes = array(
        'articleBody',
        'articlebody',
        'article-body',
        'articleContent',
        'articlecontent',
        'article-content',
        'articlePage',
        'post-content',
        'post_content',
        'entry-content',
        'entry-body',
        'main-content',
        'story_content',
        'storycontent',
        'entryBox',
        'entrytext',
        'comic',
        'post',
        'article',
        'content',
        'main',
    );

    /**
     * List of attributes to strip
     *
     * @access private
     * @var array
     */
    private $stripAttributes = array(
        'comment',
        'share',
        'links',
        'toolbar',
        'fb',
        'footer',
        'credit',
        'bottom',
        'nav',
        'header',
        'social',
        'tag',
        'metadata',
        'entry-utility',
        'related-posts',
        'tweet',
        'categories',
        'post_title',
        'by_line',
        'byline',
        'sponsors',
    );

    /**
     * Tags to remove
     *
     * @access private
     * @var array
     */
    private $stripTags = array(
        'nav',
        'header',
        'footer',
        'aside',
        'form',
    );

    /**
     * Config object
     *
     * @access private
     * @var \PicoFeed\Config\Config
     */
    private $config;

    /**
     * Constructor
     *
     * @access public
     * @param  string   $url       Url
     * @param  string   $html      HTML content
     * @param  string   $encoding  Charset
     */
    public function __construct($url, $html = '', $encoding = 'utf-8')
    {
        $this->url = $url;
        $this->html = $html;
        $this->encoding = $encoding;

        $this->handleFiles();
        $this->handleStreamingVideos();
    }

    /**
     * Set config object
     *
     * @access public
     * @param  \PicoFeed\Config\Config  $config   Config instance
     * @return Grabber
     */
    public function setConfig($config)
    {
        $this->config = $config;
        return $this;
    }

    /**
     * Get URL to download.
     *
     * @access public
     * @return string
     */
    public function getUrl()
    {
        return $this->url;
    }

    /**
     * Set URL to download and reset object to use for another grab.
     *
     * @access public
     * @param  string  $url  URL
     * @return void
     */
    public function setUrl($url)
    {
        $this->url = $url;
        $this->html = "";
        $this->content = "";
        $this->encoding = "";

        // The flag may have been raised by the previous URL (PDF, video);
        // without this reset every following grab would be skipped.
        $this->skip_processing = false;

        $this->handleFiles();
        $this->handleStreamingVideos();
    }

    /**
     * Get relevant content
     *
     * @access public
     * @return string
     */
    public function getContent()
    {
        return $this->content;
    }

    /**
     * Get raw content (unfiltered)
     *
     * @access public
     * @return string
     */
    public function getRawContent()
    {
        return $this->html;
    }

    /**
     * Get filtered relevant content
     *
     * @access public
     * @return string
     */
    public function getFilteredContent()
    {
        $filter = Filter::html($this->content, $this->url);
        $filter->setConfig($this->config);
        return $filter->execute();
    }

    /**
     * Use the Youtube embed player as content and skip processing
     *
     * Only URLs whose host is a Youtube domain are considered, so that an
     * unrelated URL that merely contains something like "rev=<11 chars>" is
     * still downloaded and parsed normally.
     *
     * @access public
     * @return void
     */
    public function handleStreamingVideos()
    {
        if (! $this->isYoutubeUrl($this->url)) {
            return;
        }

        if (preg_match('#(?<=v=|v/|vi=|vi/|youtu\.be/)[a-zA-Z0-9_-]{11}#', $this->url, $matches)) {
            $this->content = '<iframe width="560" height="315" src="//www.youtube.com/embed/'.$matches[0].'" frameborder="0"></iframe>';
            $this->skip_processing = true;
        }
    }

    /**
     * Skip processing for PDF documents
     *
     * The decision is based on the extension of the URL path (case
     * insensitive), query string and fragment are ignored.
     *
     * @access public
     * @return void
     */
    public function handleFiles()
    {
        $path = parse_url($this->url, PHP_URL_PATH);

        if (! is_string($path)) {
            // Unparsable URL: fall back to the raw string
            $path = (string) $this->url;
        }

        if (strtolower(pathinfo($path, PATHINFO_EXTENSION)) === 'pdf') {
            $this->skip_processing = true;
            Logger::setMessage(get_called_class().': PDF document => processing skipped');
        }
    }

    /**
     * Parse the HTML content
     *
     * @access public
     * @return bool   True if some content is available (or if processing is skipped)
     */
    public function parse()
    {
        if ($this->skip_processing) {
            return true;
        }

        if ($this->html) {
            $html_encoding = XmlParser::getEncodingFromMetaTag($this->html);

            // Encode everything in UTF-8
            Logger::setMessage(get_called_class().': HTTP Encoding "'.$this->encoding.'" ; HTML Encoding "'.$html_encoding.'"');
            $this->html = Encoding::convert($this->html, $html_encoding ?: $this->encoding);
            $this->html = Filter::stripHeadTags($this->html);

            Logger::setMessage(get_called_class().': Content length: '.strlen($this->html).' bytes');
            $rules = $this->getRules();

            if (! empty($rules)) {
                Logger::setMessage(get_called_class().': Parse content with rules');
                $this->parseContentWithRules($rules);
            }
            else {
                Logger::setMessage(get_called_class().': Parse content with candidates');
                $this->parseContentWithCandidates();
            }
        }
        else {
            Logger::setMessage(get_called_class().': No content fetched');
        }

        Logger::setMessage(get_called_class().': Content length: '.strlen($this->content).' bytes');
        Logger::setMessage(get_called_class().': Grabber done');

        return $this->content !== '';
    }

    /**
     * Download the HTML content
     *
     * Client errors are logged and not propagated: in that case the
     * previously known HTML (possibly empty) is returned.
     *
     * @access public
     * @return string   HTML content
     */
    public function download()
    {
        if (! $this->skip_processing && $this->url != '') {

            try {

                $client = Client::getInstance();

                if ($this->config !== null) {
                    $client->setConfig($this->config);
                    $client->setTimeout($this->config->getGrabberTimeout());
                    $client->setUserAgent($this->config->getGrabberUserAgent());
                }

                $client->execute($this->url);

                // The client may have followed redirections
                $this->url = $client->getUrl();
                $this->html = $client->getContent();
                $this->encoding = $client->getEncoding();
            }
            catch (ClientException $e) {
                Logger::setMessage(get_called_class().': '.$e->getMessage());
            }
        }

        return $this->html;
    }

    /**
     * Try to find a predefined rule
     *
     * @access public
     * @return array   Rules for the current hostname, empty array if there is none
     */
    public function getRules()
    {
        $hostname = parse_url($this->url, PHP_URL_HOST);

        // parse_url() returns false for malformed URLs and null when the URL has no host
        if (is_string($hostname) && $hostname !== '') {

            $files = $this->getRulesFileList($hostname);

            foreach ($this->getRulesFolders() as $folder) {
                $rule = $this->loadRuleFile($folder, $files);

                if (! empty($rule)) {
                    return $rule;
                }
            }
        }

        return array();
    }

    /**
     * Get the list of possible rules file names for a given hostname
     *
     * @access public
     * @param  string  $hostname  Hostname
     * @return array
     */
    public function getRulesFileList($hostname)
    {
        $files = array($hostname);                 // subdomain.domain.tld
        $parts = explode('.', $hostname);
        $len = count($parts);

        if ($len > 2) {
            $subdomain = array_shift($parts);
            $files[] = implode('.', $parts);       // domain.tld
            $files[] = '.'.implode('.', $parts);   // .domain.tld
            $files[] = $subdomain;                 // subdomain
        }
        else if ($len === 2) {
            $files[] = '.'.implode('.', $parts);   // .domain.tld
            $files[] = $parts[0];                  // domain
        }

        return $files;
    }

    /**
     * Load a rule file from the defined folder
     *
     * File names come from the hostname of a remote URL, so names that
     * contain a path separator or a NUL byte are ignored. A rule file that
     * does not return an array is ignored as well.
     *
     * @access public
     * @param  string  $folder   Rule directory
     * @param  array   $files    List of possible file names
     * @return array
     */
    public function loadRuleFile($folder, array $files)
    {
        foreach ($files as $file) {

            $file = (string) $file;

            if ($file === '' || strpbrk($file, "/\\\0") !== false) {
                continue;
            }

            $filename = $folder.'/'.$file.'.php';

            if (file_exists($filename)) {
                Logger::setMessage(get_called_class().' Load rule: '.$file);
                $rules = include $filename;

                if (is_array($rules)) {
                    return $rules;
                }
            }
        }

        return array();
    }

    /**
     * Get the list of folders that contains rules
     *
     * @access public
     * @return array
     */
    public function getRulesFolders()
    {
        $folders = array(__DIR__.'/../Rules');

        if ($this->config !== null && $this->config->getGrabberRulesFolder() !== null) {
            $folders[] = $this->config->getGrabberRulesFolder();
        }

        return $folders;
    }

    /**
     * Get the relevant content with predefined rules
     *
     * Nodes matched by the "strip" XPath patterns are removed first, then the
     * nodes matched by the "body" patterns are appended to the content.
     *
     * @access public
     * @param  array   $rules   Rules
     */
    public function parseContentWithRules(array $rules)
    {
        // Logger::setMessage($this->html);
        $dom = XmlParser::getHtmlDocument('<?xml version="1.0" encoding="UTF-8">'.$this->html);
        $xpath = new DOMXPath($dom);

        if (isset($rules['strip']) && is_array($rules['strip'])) {

            foreach ($rules['strip'] as $pattern) {

                $nodes = $xpath->query($pattern);

                if ($nodes !== false && $nodes->length > 0) {
                    foreach ($nodes as $node) {
                        // Attribute nodes have no parent node
                        if ($node->parentNode !== null) {
                            $node->parentNode->removeChild($node);
                        }
                    }
                }
            }
        }

        if (isset($rules['body']) && is_array($rules['body'])) {

            foreach ($rules['body'] as $pattern) {

                $nodes = $xpath->query($pattern);

                if ($nodes !== false && $nodes->length > 0) {
                    foreach ($nodes as $node) {
                        $this->content .= $dom->saveXML($node);
                    }
                }
            }
        }
    }

    /**
     * Get the relevant content with the list of potential attributes
     *
     * Fallback order: first matching candidate, then the <article/> tag if
     * less than 200 bytes were found, then the whole <body/> if less than
     * 50 bytes were found.
     *
     * @access public
     */
    public function parseContentWithCandidates()
    {
        $dom = XmlParser::getHtmlDocument('<?xml version="1.0" encoding="UTF-8">'.$this->html);
        $xpath = new DOMXPath($dom);

        // Try to lookup in each tag
        foreach ($this->candidatesAttributes as $candidate) {

            Logger::setMessage(get_called_class().': Try this candidate: "'.$candidate.'"');

            $nodes = $xpath->query('//*[(contains(@class, "'.$candidate.'") or @id="'.$candidate.'") and not (contains(@class, "nav") or contains(@class, "page"))]');

            if ($nodes !== false && $nodes->length > 0) {
                $this->content = $dom->saveXML($nodes->item(0));
                Logger::setMessage(get_called_class().': Find candidate "'.$candidate.'" ('.strlen($this->content).' bytes)');
                break;
            }
        }

        // Try to fetch <article/>
        if (strlen($this->content) < 200) {

            $nodes = $xpath->query('//article');

            if ($nodes !== false && $nodes->length > 0) {
                $this->content = $dom->saveXML($nodes->item(0));
                Logger::setMessage(get_called_class().': Find <article/> tag ('.strlen($this->content).' bytes)');
            }
        }

        // Get everything
        if (strlen($this->content) < 50) {

            $nodes = $xpath->query('//body');

            if ($nodes !== false && $nodes->length > 0) {
                Logger::setMessage(get_called_class().' No enought content fetched, get //body');
                $this->content = $dom->saveXML($nodes->item(0));
            }
        }

        Logger::setMessage(get_called_class().': Strip garbage');
        $this->stripGarbage();
    }

    /**
     * Strip useless tags
     *
     * @access public
     */
    public function stripGarbage()
    {
        $dom = XmlParser::getDomDocument($this->content);

        if ($dom !== false) {

            $xpath = new DOMXPath($dom);

            foreach ($this->stripTags as $tag) {

                $nodes = $xpath->query('//'.$tag);

                if ($nodes !== false && $nodes->length > 0) {
                    Logger::setMessage(get_called_class().': Strip tag: "'.$tag.'"');
                    foreach ($nodes as $node) {
                        if ($node->parentNode !== null) {
                            $node->parentNode->removeChild($node);
                        }
                    }
                }
            }

            foreach ($this->stripAttributes as $attribute) {

                $nodes = $xpath->query('//*[contains(@class, "'.$attribute.'") or contains(@id, "'.$attribute.'")]');

                if ($nodes !== false && $nodes->length > 0) {
                    Logger::setMessage(get_called_class().': Strip attribute: "'.$attribute.'"');
                    foreach ($nodes as $node) {
                        if ($node->parentNode !== null && $this->shouldRemove($dom, $node)) {
                            $node->parentNode->removeChild($node);
                        }
                    }
                }
            }

            $this->content = $dom->saveXML($dom->documentElement);
        }
    }

    /**
     * Return false if the node should not be removed
     *
     * A node that holds 90% or more of the document text is kept, it is
     * most likely the article itself and not garbage.
     *
     * @access public
     * @param  DomDocument  $dom
     * @param  DomNode      $node
     * @return boolean
     */
    public function shouldRemove($dom, $node)
    {
        $document_length = strlen($dom->textContent);
        $node_length = strlen($node->textContent);

        if ($document_length === 0) {
            return true;
        }

        $ratio = $node_length * 100 / $document_length;

        if ($ratio >= 90) {
            Logger::setMessage(get_called_class().': Should not remove this node ('.$node->nodeName.') ratio: '.$ratio.'%');
            return false;
        }

        return true;
    }

    /**
     * Return true if the URL points to a Youtube domain
     *
     * @access private
     * @param  string  $url  URL
     * @return boolean
     */
    private function isYoutubeUrl($url)
    {
        $hostname = parse_url($url, PHP_URL_HOST);

        if (! is_string($hostname)) {
            // Scheme-less URL such as "youtu.be/<id>": retry with a dummy scheme
            $hostname = parse_url('http://'.ltrim((string) $url, '/'), PHP_URL_HOST);
        }

        return is_string($hostname)
            && preg_match('#(^|\.)(youtube\.com|youtube-nocookie\.com|youtu\.be)$#i', $hostname) === 1;
    }
}