url = $url;
        $this->html = $html;
        $this->encoding = $encoding;

        $this->handleFiles();
        $this->handleStreamingVideos();
    }

    /**
     * Set config object
     *
     * @access public
     * @param  \PicoFeed\Config\Config  $config   Config instance
     * @return Grabber
     */
    public function setConfig($config)
    {
        $this->config = $config;
        return $this;
    }

    /**
     * Get URL to download.
     *
     * @access public
     * @return string
     */
    public function getUrl()
    {
        return $this->url;
    }

    /**
     * Set URL to download and reset object to use for another grab.
     *
     * Clears the downloaded HTML, the extracted content, the encoding and the
     * skip flag, then runs the PDF and streaming video detection on the new URL.
     *
     * @access public
     * @param  string  $url    URL
     * @return void
     */
    public function setUrl($url)
    {
        $this->url = $url;
        $this->html = "";
        $this->content = "";
        $this->encoding = "";

        // The two handlers below only ever raise this flag; without clearing it
        // first, a previous PDF/video URL would make every later URL be skipped.
        $this->skip_processing = false;

        $this->handleFiles();
        $this->handleStreamingVideos();
    }

    /**
     * Get relevant content
     *
     * @access public
     * @return string
     */
    public function getContent()
    {
        return $this->content;
    }

    /**
     * Get raw content (unfiltered)
     *
     * @access public
     * @return string
     */
    public function getRawContent()
    {
        return $this->html;
    }

    /**
     * Get filtered relevant content
     *
     * Builds a filter with Filter::html() from the current content and URL,
     * hands it the config and returns the result of its execute() call.
     *
     * @access public
     * @return string
     */
    public function getFilteredContent()
    {
        $filter = Filter::html($this->content, $this->url);
        $filter->setConfig($this->config);
        return $filter->execute();
    }

    /**
     * Use the Youtube embed player as content and skip processing
     *
     * When an 11 character video id is found in the URL, the content becomes an
     * embed iframe for that id and the skip flag is raised.
     *
     * @access public
     * @return void
     */
    public function handleStreamingVideos()
    {
        if (preg_match('#(?<=v=|v\/|vi=|vi\/|youtu\.be\/)[a-zA-Z0-9_-]{11}#', $this->url, $matches)) {

            // $matches[0] is limited to [a-zA-Z0-9_-] by the pattern, so it is safe inside the attribute
            $this->content = '<iframe width="560" height="315" src="https://www.youtube.com/embed/'.$matches[0].'" frameborder="0" allowfullscreen></iframe>';
            $this->skip_processing = true;
        }
    }

    /**
     * Skip processing for PDF documents
     *
     * Raises the skip flag when the last three characters of the URL are "pdf".
     *
     * @access public
     * @return void
     */
    public function handleFiles()
    {
        if (substr($this->url, -3) === 'pdf') {
            $this->skip_processing = true;
            Logger::setMessage(get_called_class().': PDF document => processing skipped');
        }
    }

    /**
     * Parse the HTML content
     *
     * @access public
     * @return bool    True when processing is skipped or when some content was extracted
     */
    public function parse()
    {
        if ($this->skip_processing) {
            return true;
        }

        if ($this->html) {

            $html_encoding = XmlParser::getEncodingFromMetaTag($this->html);

            // Encode everything in UTF-8
            Logger::setMessage(get_called_class().': HTTP Encoding "'.$this->encoding.'" ; HTML Encoding "'.$html_encoding.'"');

            // The encoding found in the meta tag wins over the one reported by the HTTP client
            $this->html = Encoding::convert($this->html, $html_encoding ?: $this->encoding);
            $this->html = Filter::stripHeadTags($this->html);

            Logger::setMessage(get_called_class().': Content length: '.strlen($this->html).' bytes');

            $rules = $this->getRules();

            if (! empty($rules)) {
                Logger::setMessage(get_called_class().': Parse content with rules');
                $this->parseContentWithRules($rules);
            }
            else {
                Logger::setMessage(get_called_class().': Parse content with candidates');
                $this->parseContentWithCandidates();
            }
        }
        else {
            Logger::setMessage(get_called_class().': No content fetched');
        }

        Logger::setMessage(get_called_class().': Content length: '.strlen($this->content).' bytes');
        Logger::setMessage(get_called_class().': Grabber done');

        return $this->content !== '';
    }

    /**
     * Download the HTML content
     *
     * Nothing is fetched when processing is skipped or when the URL is empty.
     * On success the URL, the HTML and the encoding are replaced by the values
     * reported by the client. A ClientException is logged, not rethrown.
     *
     * @access public
     * @return string   HTML content currently held by the object
     */
    public function download()
    {
        if (! $this->skip_processing && $this->url != '') {

            try {

                $client = Client::getInstance();

                if ($this->config !== null) {
                    $client->setConfig($this->config);
                    $client->setTimeout($this->config->getGrabberTimeout());
                    $client->setUserAgent($this->config->getGrabberUserAgent());
                }

                $client->execute($this->url);

                $this->url = $client->getUrl();
                $this->html = $client->getContent();
                $this->encoding = $client->getEncoding();
            }
            catch (ClientException $e) {
                Logger::setMessage(get_called_class().': '.$e->getMessage());
            }
        }

        return $this->html;
    }

    /**
     * Try to find a predefined rule
     *
     * Folders are searched in the order given by getRulesFolders() and the
     * first non-empty rule set wins.
     *
     * @access public
     * @return array   Rules, or an empty array when nothing matches
     */
    public function getRules()
    {
        $hostname = parse_url($this->url, PHP_URL_HOST);

        // parse_url() gives false for a malformed URL and null when the URL has no host
        if (! is_string($hostname) || $hostname === '') {
            return array();
        }

        $files = $this->getRulesFileList($hostname);

        foreach ($this->getRulesFolders() as $folder) {

            $rule = $this->loadRuleFile($folder, $files);

            if (! empty($rule)) {
                return $rule;
            }
        }

        return array();
    }

    /**
     * Get the list of possible rules file names for a given hostname
     *
     * The full hostname always comes first. With more than two labels the
     * parent domain, the dotted parent domain and the first label follow; with
     * exactly two labels the dotted domain and the first label follow.
     *
     * @access public
     * @param  string  $hostname  Hostname
     * @return array
     */
    public function getRulesFileList($hostname)
    {
        $files = array($hostname);                 // subdomain.domain.tld
        $parts = explode('.', $hostname);
        $len = count($parts);

        if ($len > 2) {
            $subdomain = array_shift($parts);
            $files[] = implode('.', $parts);       // domain.tld
            $files[] = '.'.implode('.', $parts);   // .domain.tld
            $files[] = $subdomain;                 // subdomain
        }
        else if ($len === 2) {
            $files[] = '.'.implode('.', $parts);   // .domain.tld
            $files[] = $parts[0];                  // domain
        }

        return $files;
    }

    /**
     * Load a rule file from the defined folder
     *
     * The first existing "<folder>/<name>.php" file is included and its return
     * value handed back.
     *
     * @access public
     * @param  string  $folder  Rule directory
     * @param  array   $files   List of possible file names
     * @return array            Value returned by the rule file, or an empty array when no file exists
     */
    public function loadRuleFile($folder, array $files)
    {
        foreach ($files as $file) {

            // The names are derived from a URL host: never let one of them point
            // outside of the rules folder before it reaches include
            if (strpbrk($file, "/\\\0") !== false) {
                continue;
            }

            $filename = $folder.'/'.$file.'.php';

            if (file_exists($filename)) {
                Logger::setMessage(get_called_class().' Load rule: '.$file);
                return include $filename;
            }
        }

        return array();
    }

    /**
     * Get the list of folders that contains rules
     *
     * The bundled Rules directory comes first, followed by the folder defined
     * in the config when there is one.
     *
     * @access public
     * @return array
     */
    public function getRulesFolders()
    {
        $folders = array(__DIR__.'/../Rules');

        if ($this->config !== null && $this->config->getGrabberRulesFolder() !== null) {
            $folders[] = $this->config->getGrabberRulesFolder();
        }

        return $folders;
    }

    /**
     * Get the relevant content with predefined rules
     *
     * Every node matched by a "strip" XPath pattern is removed, then the XML of
     * every node matched by a "body" pattern is appended to the content.
     *
     * @access public
     * @param  array   $rules   Rules
     */
    public function parseContentWithRules(array $rules)
    {
        // NOTE(review): the empty string prefix only forces a string here - confirm nothing else was meant to be prepended
        $dom = XmlParser::getHtmlDocument(''.$this->html);
        $xpath = new DOMXPath($dom);

        if (isset($rules['strip']) && is_array($rules['strip'])) {

            foreach ($rules['strip'] as $pattern) {

                $nodes = $xpath->query($pattern);

                if ($nodes !== false && $nodes->length > 0) {
                    foreach ($nodes as $node) {

                        // A pattern can match a node without parent (the document itself for example)
                        if ($node->parentNode !== null) {
                            $node->parentNode->removeChild($node);
                        }
                    }
                }
            }
        }

        if (isset($rules['body']) && is_array($rules['body'])) {

            foreach ($rules['body'] as $pattern) {

                $nodes = $xpath->query($pattern);

                if ($nodes !== false && $nodes->length > 0) {
                    foreach ($nodes as $node) {
                        $this->content .= $dom->saveXML($node);
                    }
                }
            }
        }
    }

    /**
     * Get the relevant content with the list of potential attributes
     *
     * The first node matching a candidate is used. Below 200 bytes of content
     * the first article element is tried, below 50 bytes the body element is
     * taken, and the result always goes through stripGarbage().
     *
     * @access public
     */
    public function parseContentWithCandidates()
    {
        // NOTE(review): the empty string prefix only forces a string here - confirm nothing else was meant to be prepended
        $dom = XmlParser::getHtmlDocument(''.$this->html);
        $xpath = new DOMXPath($dom);

        // Try to lookup in each tag
        foreach ($this->candidatesAttributes as $candidate) {

            Logger::setMessage(get_called_class().': Try this candidate: "'.$candidate.'"');

            $nodes = $xpath->query('//*[(contains(@class, "'.$candidate.'") or @id="'.$candidate.'") and not (contains(@class, "nav") or contains(@class, "page"))]');

            if ($nodes !== false && $nodes->length > 0) {
                $this->content = $dom->saveXML($nodes->item(0));
                Logger::setMessage(get_called_class().': Find candidate "'.$candidate.'" ('.strlen($this->content).' bytes)');
                break;
            }
        }

        // Try to fetch <article/>
        if (strlen($this->content) < 200) {

            $nodes = $xpath->query('//article');

            if ($nodes !== false && $nodes->length > 0) {
                $this->content = $dom->saveXML($nodes->item(0));
                Logger::setMessage(get_called_class().': Find <article/> tag ('.strlen($this->content).' bytes)');
            }
        }

        // Get everything
        if (strlen($this->content) < 50) {

            $nodes = $xpath->query('//body');

            if ($nodes !== false && $nodes->length > 0) {
                Logger::setMessage(get_called_class().' No enought content fetched, get //body');
                $this->content = $dom->saveXML($nodes->item(0));
            }
        }

        Logger::setMessage(get_called_class().': Strip garbage');
        $this->stripGarbage();
    }

    /**
     * Strip useless tags
     *
     * Removes every element listed in stripTags, then every element whose class
     * or id contains one of the stripAttributes values unless shouldRemove()
     * refuses. The content is left untouched when it cannot be loaded.
     *
     * @access public
     */
    public function stripGarbage()
    {
        $dom = XmlParser::getDomDocument($this->content);

        if ($dom !== false) {

            $xpath = new DOMXPath($dom);

            foreach ($this->stripTags as $tag) {

                $nodes = $xpath->query('//'.$tag);

                if ($nodes !== false && $nodes->length > 0) {

                    Logger::setMessage(get_called_class().': Strip tag: "'.$tag.'"');

                    foreach ($nodes as $node) {
                        if ($node->parentNode !== null) {
                            $node->parentNode->removeChild($node);
                        }
                    }
                }
            }

            foreach ($this->stripAttributes as $attribute) {

                $nodes = $xpath->query('//*[contains(@class, "'.$attribute.'") or contains(@id, "'.$attribute.'")]');

                if ($nodes !== false && $nodes->length > 0) {

                    Logger::setMessage(get_called_class().': Strip attribute: "'.$attribute.'"');

                    foreach ($nodes as $node) {
                        if ($node->parentNode !== null && $this->shouldRemove($dom, $node)) {
                            $node->parentNode->removeChild($node);
                        }
                    }
                }
            }

            $this->content = $dom->saveXML($dom->documentElement);
        }
    }

    /**
     * Return false if the node should not be removed
     *
     * A node is kept when its text is at least 90% of the text of the whole
     * document (both measured in bytes). A document without any text never
     * protects a node.
     *
     * @access public
     * @param  DomDocument  $dom
     * @param  DomNode      $node
     * @return boolean
     */
    public function shouldRemove($dom, $node)
    {
        $document_length = strlen($dom->textContent);
        $node_length = strlen($node->textContent);

        // Avoid a division by zero below
        if ($document_length === 0) {
            return true;
        }

        $ratio = $node_length * 100 / $document_length;

        if ($ratio >= 90) {
            Logger::setMessage(get_called_class().': Should not remove this node ('.$node->nodeName.') ratio: '.$ratio.'%');
            return false;
        }

        return true;
    }
}