summaryrefslogtreecommitdiffstats
path: root/vendor/fguillot/picofeed/lib/PicoFeed/Client/Grabber.php
diff options
context:
space:
mode:
Diffstat (limited to 'vendor/fguillot/picofeed/lib/PicoFeed/Client/Grabber.php')
-rw-r--r--vendor/fguillot/picofeed/lib/PicoFeed/Client/Grabber.php592
1 files changed, 0 insertions, 592 deletions
diff --git a/vendor/fguillot/picofeed/lib/PicoFeed/Client/Grabber.php b/vendor/fguillot/picofeed/lib/PicoFeed/Client/Grabber.php
deleted file mode 100644
index bec8ab07b..000000000
--- a/vendor/fguillot/picofeed/lib/PicoFeed/Client/Grabber.php
+++ /dev/null
@@ -1,592 +0,0 @@
-<?php
-
-namespace PicoFeed\Client;
-
-use DOMXPath;
-use PicoFeed\Encoding\Encoding;
-use PicoFeed\Logging\Logger;
-use PicoFeed\Filter\Filter;
-use PicoFeed\Parser\XmlParser;
-
-/**
- * Grabber class
- *
- * @author Frederic Guillot
- * @package Client
- */
-class Grabber
-{
- /**
- * URL
- *
- * @access private
- * @var string
- */
- private $url = '';
-
- /**
- * Relevant content
- *
- * @access private
- * @var string
- */
- private $content = '';
-
- /**
- * HTML content
- *
- * @access private
- * @var string
- */
- private $html = '';
-
- /**
- * HTML content encoding
- *
- * @access private
- * @var string
- */
- private $encoding = '';
-
- /**
- * Flag to skip download and parsing
- *
- * @access private
- * @var boolean
- */
- private $skip_processing = false;
-
- /**
- * List of attributes to try to get the content, order is important, generic terms at the end
- *
- * @access private
- * @var array
- */
- private $candidatesAttributes = array(
- 'articleBody',
- 'articlebody',
- 'article-body',
- 'articleContent',
- 'articlecontent',
- 'article-content',
- 'articlePage',
- 'post-content',
- 'post_content',
- 'entry-content',
- 'entry-body',
- 'main-content',
- 'story_content',
- 'storycontent',
- 'entryBox',
- 'entrytext',
- 'comic',
- 'post',
- 'article',
- 'content',
- 'main',
- );
-
- /**
- * List of attributes to strip
- *
- * @access private
- * @var array
- */
- private $stripAttributes = array(
- 'comment',
- 'share',
- 'links',
- 'toolbar',
- 'fb',
- 'footer',
- 'credit',
- 'bottom',
- 'nav',
- 'header',
- 'social',
- 'tag',
- 'metadata',
- 'entry-utility',
- 'related-posts',
- 'tweet',
- 'categories',
- 'post_title',
- 'by_line',
- 'byline',
- 'sponsors',
- );
-
- /**
- * Tags to remove
- *
- * @access private
- * @var array
- */
- private $stripTags = array(
- 'nav',
- 'header',
- 'footer',
- 'aside',
- 'form',
- );
-
- /**
- * Config object
- *
- * @access private
- * @var \PicoFeed\Config\Config
- */
- private $config;
-
- /**
- * Constructor
- *
- * @access public
- * @param string $url Url
- * @param string $html HTML content
- * @param string $encoding Charset
- */
- public function __construct($url, $html = '', $encoding = 'utf-8')
- {
- $this->url = $url;
- $this->html = $html;
- $this->encoding = $encoding;
-
- $this->handleFiles();
- $this->handleStreamingVideos();
- }
-
- /**
- * Set config object
- *
- * @access public
- * @param \PicoFeed\Config\Config $config Config instance
- * @return Grabber
- */
- public function setConfig($config)
- {
- $this->config = $config;
- return $this;
- }
-
- /**
- * Get URL to download.
- *
- * @access public
- * @return string
- */
- public function getUrl()
- {
- return $this->url;
- }
-
- /**
- * Set URL to download and reset object to use for another grab.
- *
- * @access public
- * @param string $url URL
- * @return string
- */
- public function setUrl($url)
- {
- $this->url = $url;
- $this->html = "";
- $this->content = "";
- $this->encoding = "";
-
- $this->handleFiles();
- $this->handleStreamingVideos();
- }
-
- /**
- * Get relevant content
- *
- * @access public
- * @return string
- */
- public function getContent()
- {
- return $this->content;
- }
-
- /**
- * Get raw content (unfiltered)
- *
- * @access public
- * @return string
- */
- public function getRawContent()
- {
- return $this->html;
- }
-
- /**
- * Get filtered relevant content
- *
- * @access public
- * @return string
- */
- public function getFilteredContent()
- {
- $filter = Filter::html($this->content, $this->url);
- $filter->setConfig($this->config);
- return $filter->execute();
- }
-
- /**
- * Return the Youtube embed player and skip processing
- *
- * @access public
- * @return string
- */
- public function handleStreamingVideos()
- {
- if (preg_match("#(?<=v=|v\/|vi=|vi\/|youtu.be\/)[a-zA-Z0-9_-]{11}#", $this->url, $matches)) {
- $this->content = '<iframe width="560" height="315" src="//www.youtube.com/embed/'.$matches[0].'" frameborder="0"></iframe>';
- $this->skip_processing = true;
- }
- }
-
- /**
- * Skip processing for PDF documents
- *
- * @access public
- * @return string
- */
- public function handleFiles()
- {
- if (substr($this->url, -3) === 'pdf') {
- $this->skip_processing = true;
- Logger::setMessage(get_called_class().': PDF document => processing skipped');
- }
- }
-
- /**
- * Parse the HTML content
- *
- * @access public
- * @return bool
- */
- public function parse()
- {
- if ($this->skip_processing) {
- return true;
- }
-
- if ($this->html) {
- $html_encoding = XmlParser::getEncodingFromMetaTag($this->html);
-
- // Encode everything in UTF-8
- Logger::setMessage(get_called_class().': HTTP Encoding "'.$this->encoding.'" ; HTML Encoding "'.$html_encoding.'"');
- $this->html = Encoding::convert($this->html, $html_encoding ?: $this->encoding);
- $this->html = Filter::stripHeadTags($this->html);
-
- Logger::setMessage(get_called_class().': Content length: '.strlen($this->html).' bytes');
- $rules = $this->getRules();
-
- if (! empty($rules)) {
- Logger::setMessage(get_called_class().': Parse content with rules');
- $this->parseContentWithRules($rules);
- }
- else {
- Logger::setMessage(get_called_class().': Parse content with candidates');
- $this->parseContentWithCandidates();
- }
- }
- else {
- Logger::setMessage(get_called_class().': No content fetched');
- }
-
- Logger::setMessage(get_called_class().': Content length: '.strlen($this->content).' bytes');
- Logger::setMessage(get_called_class().': Grabber done');
-
- return $this->content !== '';
- }
-
- /**
- * Download the HTML content
- *
- * @access public
- * @return HTML content
- */
- public function download()
- {
- if (! $this->skip_processing && $this->url != '') {
-
- try {
-
- $client = Client::getInstance();
-
- if ($this->config !== null) {
- $client->setConfig($this->config);
- $client->setTimeout($this->config->getGrabberTimeout());
- $client->setUserAgent($this->config->getGrabberUserAgent());
- }
-
- $client->execute($this->url);
-
- $this->url = $client->getUrl();
- $this->html = $client->getContent();
- $this->encoding = $client->getEncoding();
- }
- catch (ClientException $e) {
- Logger::setMessage(get_called_class().': '.$e->getMessage());
- }
- }
-
- return $this->html;
- }
-
- /**
- * Try to find a predefined rule
- *
- * @access public
- * @return array
- */
- public function getRules()
- {
- $hostname = parse_url($this->url, PHP_URL_HOST);
-
- if ($hostname !== false) {
-
- $files = $this->getRulesFileList($hostname);
-
- foreach ($this->getRulesFolders() as $folder) {
- $rule = $this->loadRuleFile($folder, $files);
-
- if (! empty($rule)) {
- return $rule;
- }
- }
- }
-
- return array();
- }
-
- /**
- * Get the list of possible rules file names for a given hostname
- *
- * @access public
- * @param string $hostname Hostname
- * @return array
- */
- public function getRulesFileList($hostname)
- {
- $files = array($hostname); // subdomain.domain.tld
- $parts = explode('.', $hostname);
- $len = count($parts);
-
- if ($len > 2) {
- $subdomain = array_shift($parts);
- $files[] = implode('.', $parts); // domain.tld
- $files[] = '.'.implode('.', $parts); // .domain.tld
- $files[] = $subdomain; // subdomain
- }
- else if ($len === 2) {
- $files[] = '.'.implode('.', $parts); // .domain.tld
- $files[] = $parts[0]; // domain
- }
-
- return $files;
- }
-
- /**
- * Load a rule file from the defined folder
- *
- * @access public
- * @param string $folder Rule directory
- * @param array $files List of possible file names
- * @return array
- */
- public function loadRuleFile($folder, array $files)
- {
- foreach ($files as $file) {
- $filename = $folder.'/'.$file.'.php';
-
- if (file_exists($filename)) {
- Logger::setMessage(get_called_class().' Load rule: '.$file);
- return include $filename;
- }
- }
-
- return array();
- }
-
- /**
- * Get the list of folders that contains rules
- *
- * @access public
- * @return array
- */
- public function getRulesFolders()
- {
- $folders = array(__DIR__.'/../Rules');
-
- if ($this->config !== null && $this->config->getGrabberRulesFolder() !== null) {
- $folders[] = $this->config->getGrabberRulesFolder();
- }
-
- return $folders;
- }
-
- /**
- * Get the relevant content with predefined rules
- *
- * @access public
- * @param array $rules Rules
- */
- public function parseContentWithRules(array $rules)
- {
- // Logger::setMessage($this->html);
- $dom = XmlParser::getHtmlDocument('<?xml version="1.0" encoding="UTF-8">'.$this->html);
- $xpath = new DOMXPath($dom);
-
- if (isset($rules['strip']) && is_array($rules['strip'])) {
-
- foreach ($rules['strip'] as $pattern) {
-
- $nodes = $xpath->query($pattern);
-
- if ($nodes !== false && $nodes->length > 0) {
- foreach ($nodes as $node) {
- $node->parentNode->removeChild($node);
- }
- }
- }
- }
-
- if (isset($rules['body']) && is_array($rules['body'])) {
-
- foreach ($rules['body'] as $pattern) {
-
- $nodes = $xpath->query($pattern);
-
- if ($nodes !== false && $nodes->length > 0) {
- foreach ($nodes as $node) {
- $this->content .= $dom->saveXML($node);
- }
- }
- }
- }
- }
-
- /**
- * Get the relevant content with the list of potential attributes
- *
- * @access public
- */
- public function parseContentWithCandidates()
- {
- $dom = XmlParser::getHtmlDocument('<?xml version="1.0" encoding="UTF-8">'.$this->html);
- $xpath = new DOMXPath($dom);
-
- // Try to lookup in each tag
- foreach ($this->candidatesAttributes as $candidate) {
-
- Logger::setMessage(get_called_class().': Try this candidate: "'.$candidate.'"');
-
- $nodes = $xpath->query('//*[(contains(@class, "'.$candidate.'") or @id="'.$candidate.'") and not (contains(@class, "nav") or contains(@class, "page"))]');
-
- if ($nodes !== false && $nodes->length > 0) {
- $this->content = $dom->saveXML($nodes->item(0));
- Logger::setMessage(get_called_class().': Find candidate "'.$candidate.'" ('.strlen($this->content).' bytes)');
- break;
- }
- }
-
- // Try to fetch <article/>
- if (strlen($this->content) < 200) {
-
- $nodes = $xpath->query('//article');
-
- if ($nodes !== false && $nodes->length > 0) {
- $this->content = $dom->saveXML($nodes->item(0));
- Logger::setMessage(get_called_class().': Find <article/> tag ('.strlen($this->content).' bytes)');
- }
- }
-
- // Get everything
- if (strlen($this->content) < 50) {
-
- $nodes = $xpath->query('//body');
-
- if ($nodes !== false && $nodes->length > 0) {
- Logger::setMessage(get_called_class().' No enought content fetched, get //body');
- $this->content = $dom->saveXML($nodes->item(0));
- }
- }
-
- Logger::setMessage(get_called_class().': Strip garbage');
- $this->stripGarbage();
- }
-
- /**
- * Strip useless tags
- *
- * @access public
- */
- public function stripGarbage()
- {
- $dom = XmlParser::getDomDocument($this->content);
-
- if ($dom !== false) {
-
- $xpath = new DOMXPath($dom);
-
- foreach ($this->stripTags as $tag) {
-
- $nodes = $xpath->query('//'.$tag);
-
- if ($nodes !== false && $nodes->length > 0) {
- Logger::setMessage(get_called_class().': Strip tag: "'.$tag.'"');
- foreach ($nodes as $node) {
- $node->parentNode->removeChild($node);
- }
- }
- }
-
- foreach ($this->stripAttributes as $attribute) {
-
- $nodes = $xpath->query('//*[contains(@class, "'.$attribute.'") or contains(@id, "'.$attribute.'")]');
-
- if ($nodes !== false && $nodes->length > 0) {
- Logger::setMessage(get_called_class().': Strip attribute: "'.$attribute.'"');
- foreach ($nodes as $node) {
- if ($this->shouldRemove($dom, $node)) {
- $node->parentNode->removeChild($node);
- }
- }
- }
- }
-
- $this->content = $dom->saveXML($dom->documentElement);
- }
- }
-
- /**
- * Return false if the node should not be removed
- *
- * @access public
- * @param DomDocument $dom
- * @param DomNode $node
- * @return boolean
- */
- public function shouldRemove($dom, $node)
- {
- $document_length = strlen($dom->textContent);
- $node_length = strlen($node->textContent);
-
- if ($document_length === 0) {
- return true;
- }
-
- $ratio = $node_length * 100 / $document_length;
-
- if ($ratio >= 90) {
- Logger::setMessage(get_called_class().': Should not remove this node ('.$node->nodeName.') ratio: '.$ratio.'%');
- return false;
- }
-
- return true;
- }
-}