summaryrefslogtreecommitdiffstats
path: root/vendor/fguillot/picofeed/lib/PicoFeed/Client/Grabber.php
diff options
context:
space:
mode:
Diffstat (limited to 'vendor/fguillot/picofeed/lib/PicoFeed/Client/Grabber.php')
m---------vendor/fguillot/picofeed0
-rw-r--r--vendor/fguillot/picofeed/lib/PicoFeed/Client/Grabber.php535
2 files changed, 535 insertions, 0 deletions
diff --git a/vendor/fguillot/picofeed b/vendor/fguillot/picofeed
deleted file mode 160000
-Subproject 0a1d0d3950f7f047dc8fb1d80aa6296e15f306d
diff --git a/vendor/fguillot/picofeed/lib/PicoFeed/Client/Grabber.php b/vendor/fguillot/picofeed/lib/PicoFeed/Client/Grabber.php
new file mode 100644
index 000000000..52f2f0bf1
--- /dev/null
+++ b/vendor/fguillot/picofeed/lib/PicoFeed/Client/Grabber.php
@@ -0,0 +1,535 @@
+<?php
+
+namespace PicoFeed\Client;
+
+use DOMXPath;
+use PicoFeed\Encoding\Encoding;
+use PicoFeed\Logging\Logger;
+use PicoFeed\Filter\Filter;
+use PicoFeed\Parser\XmlParser;
+
+/**
+ * Grabber class
+ *
+ * @author Frederic Guillot
+ * @package Client
+ */
+class Grabber
+{
+ /**
+ * URL of the page to grab
+ *
+ * Set by the constructor or setUrl(); overwritten by download() with the
+ * URL reported by the HTTP client.
+ *
+ * @access private
+ * @var string
+ */
+ private $url = '';
+
+ /**
+ * Relevant content
+ *
+ * Filled by parse() with the extracted markup, or by
+ * handleStreamingVideos() with an embed iframe.
+ *
+ * @access private
+ * @var string
+ */
+ private $content = '';
+
+ /**
+ * HTML content
+ *
+ * Raw page given to the constructor or fetched by download(); parse()
+ * replaces it with the re-encoded, head-stripped version.
+ *
+ * @access private
+ * @var string
+ */
+ private $html = '';
+
+ /**
+ * HTML content encoding
+ *
+ * Charset given to the constructor or reported by the HTTP client in
+ * download(); passed to Encoding::convert() by parse().
+ *
+ * @access private
+ * @var string
+ */
+ private $encoding = '';
+
+ /**
+ * Flag to skip download and parsing
+ *
+ * Raised by handleFiles() and handleStreamingVideos(); when true,
+ * download() does not fetch anything and parse() returns true immediately.
+ *
+ * @access private
+ * @var boolean
+ */
+ private $skip_processing = false;
+
+ /**
+ * List of attributes to try to get the content, order is important, generic terms at the end
+ *
+ * parseContentWithCandidates() tries each entry in turn, matching it as a
+ * substring of the class attribute or as an exact id, and keeps the first
+ * node found.
+ *
+ * @access private
+ * @var array
+ */
+ private $candidatesAttributes = array(
+ 'articleBody',
+ 'articlebody',
+ 'article-body',
+ 'articleContent',
+ 'articlecontent',
+ 'article-content',
+ 'articlePage',
+ 'post-content',
+ 'post_content',
+ 'entry-content',
+ 'entry-body',
+ 'main-content',
+ 'story_content',
+ 'storycontent',
+ 'entryBox',
+ 'entrytext',
+ 'comic',
+ 'post',
+ 'article',
+ 'content',
+ 'main',
+ );
+
+ /**
+ * List of attributes to strip
+ *
+ * stripGarbage() removes every element whose class or id contains one of
+ * these substrings, unless shouldRemove() decides the element holds most
+ * of the text.
+ *
+ * @access private
+ * @var array
+ */
+ private $stripAttributes = array(
+ 'comment',
+ 'share',
+ 'links',
+ 'toolbar',
+ 'fb',
+ 'footer',
+ 'credit',
+ 'bottom',
+ 'nav',
+ 'header',
+ 'social',
+ 'tag',
+ 'metadata',
+ 'entry-utility',
+ 'related-posts',
+ 'tweet',
+ 'categories',
+ 'post_title',
+ 'by_line',
+ 'byline',
+ 'sponsors',
+ );
+
+ /**
+ * Tags to remove
+ *
+ * stripGarbage() deletes every element with one of these tag names from
+ * the extracted content, without any further check.
+ *
+ * @access private
+ * @var array
+ */
+ private $stripTags = array(
+ 'script',
+ 'style',
+ 'nav',
+ 'header',
+ 'footer',
+ 'aside',
+ 'form',
+ );
+
+ /**
+ * Config object
+ *
+ * Injected with setConfig(); handed to the HTTP client in download() and
+ * to the HTML filter in getFilteredContent().
+ *
+ * @access private
+ * @var \PicoFeed\Config\Config
+ */
+ private $config;
+
+ /**
+  * Build a grabber for one page
+  *
+  * The URL is inspected right away by handleFiles() and
+  * handleStreamingVideos(), which may raise the skip flag.
+  *
+  * @access public
+  * @param  string  $url       Url
+  * @param  string  $html      HTML content
+  * @param  string  $encoding  Charset
+  */
+ public function __construct($url, $html = '', $encoding = 'utf-8')
+ {
+     $this->encoding = $encoding;
+     $this->html = $html;
+     $this->url = $url;
+
+     $this->handleFiles();
+     $this->handleStreamingVideos();
+ }
+
+ /**
+  * Inject the configuration used by the HTTP client and the HTML filter
+  *
+  * @access public
+  * @param  \PicoFeed\Config\Config  $config   Config instance
+  * @return Grabber                            Current instance, for chaining
+  */
+ public function setConfig($config)
+ {
+     $this->config = $config;
+
+     return $this;
+ }
+
+ /**
+  * Return the URL currently targeted by the grabber
+  *
+  * After download() this is the URL reported by the HTTP client.
+  *
+  * @access public
+  * @return string
+  */
+ public function getUrl()
+ {
+     return $this->url;
+ }
+
+ /**
+  * Set URL to download and reset object to use for another grab.
+  *
+  * Clears the downloaded HTML, the extracted content, the encoding and the
+  * skip flag, then inspects the new URL with handleFiles() and
+  * handleStreamingVideos().
+  *
+  * @access public
+  * @param  string  $url   URL
+  * @return void
+  */
+ public function setUrl($url)
+ {
+     $this->url = $url;
+     $this->html = '';
+     $this->content = '';
+     $this->encoding = '';
+
+     // The flag may still be raised from the previous URL (PDF or video);
+     // without this reset every later grab would be skipped.
+     $this->skip_processing = false;
+
+     $this->handleFiles();
+     $this->handleStreamingVideos();
+ }
+
+ /**
+  * Return the content extracted so far
+  *
+  * Empty until parse() succeeds, unless the URL was recognised as a
+  * streaming video.
+  *
+  * @access public
+  * @return string
+  */
+ public function getContent()
+ {
+     return $this->content;
+ }
+
+ /**
+  * Return the page HTML held by the grabber (not the extracted content)
+  *
+  * @access public
+  * @return string
+  */
+ public function getRawContent()
+ {
+     return $this->html;
+ }
+
+ /**
+  * Run the extracted content through the HTML filter
+  *
+  * The filter receives the page URL and the grabber's config object.
+  *
+  * @access public
+  * @return string
+  */
+ public function getFilteredContent()
+ {
+     $html_filter = Filter::html($this->content, $this->url);
+     $html_filter->setConfig($this->config);
+
+     return $html_filter->execute();
+ }
+
+ /**
+  * Use the Youtube embed player as content and skip processing
+  *
+  * Only URLs hosted on youtube.com, youtube-nocookie.com or youtu.be (or one
+  * of their subdomains) are considered; the 11-character video id is then
+  * taken from the "v=", "v/", "vi=", "vi/" or "youtu.be/" part of the URL.
+  *
+  * @access public
+  * @return void
+  */
+ public function handleStreamingVideos()
+ {
+     // Without this host check any URL carrying e.g. "?rev=<11 chars>" would
+     // be mistaken for a video and replaced by an iframe.
+     $youtube_host = '~^(?:(?:https?:)?//)?(?:[a-z0-9-]+\.)*(?:youtube(?:-nocookie)?\.com|youtu\.be)(?:[:/?#]|$)~i';
+
+     if (! preg_match($youtube_host, $this->url)) {
+         return;
+     }
+
+     if (preg_match('#(?<=v=|v/|vi=|vi/|youtu\.be/)[a-zA-Z0-9_-]{11}#', $this->url, $matches)) {
+         $this->content = '<iframe width="560" height="315" src="//www.youtube.com/embed/'.$matches[0].'" frameborder="0"></iframe>';
+         $this->skip_processing = true;
+     }
+ }
+
+ /**
+  * Skip processing for PDF documents
+  *
+  * The decision is based on the extension of the URL path, compared
+  * case-insensitively, so query strings and fragments are ignored.
+  *
+  * @access public
+  * @return void
+  */
+ public function handleFiles()
+ {
+     $path = parse_url($this->url, PHP_URL_PATH);
+
+     // parse_url() gives false on malformed URLs and null when there is no
+     // path; fall back to the raw string so the extension is still examined.
+     if (! is_string($path)) {
+         $path = (string) $this->url;
+     }
+
+     if (strtolower(pathinfo($path, PATHINFO_EXTENSION)) === 'pdf') {
+         $this->skip_processing = true;
+         Logger::setMessage(get_called_class().': PDF document => processing skipped');
+     }
+ }
+
+ /**
+  * Extract the relevant content from the HTML held by the grabber
+  *
+  * The HTML is converted with Encoding::convert() and stripped of its head
+  * tags, then parsed with the rule file found by getRules() or, when there
+  * is none, with the candidates list.
+  *
+  * @access public
+  * @return bool   True when processing is skipped or some content was found
+  */
+ public function parse()
+ {
+     if ($this->skip_processing) {
+         return true;
+     }
+
+     $caller = get_called_class();
+
+     if (! $this->html) {
+         Logger::setMessage($caller.': No content fetched');
+     }
+     else {
+         Logger::setMessage($caller.': Fix encoding');
+         Logger::setMessage($caller.': HTTP Encoding "'.$this->encoding.'"');
+
+         $this->html = Filter::stripHeadTags(Encoding::convert($this->html, $this->encoding));
+
+         Logger::setMessage($caller.': Content length: '.strlen($this->html).' bytes');
+         $rules = $this->getRules();
+
+         if (! is_array($rules)) {
+             Logger::setMessage($caller.': Parse content with candidates');
+             $this->parseContentWithCandidates();
+         }
+         else {
+             Logger::setMessage($caller.': Parse content with rules');
+             $this->parseContentWithRules($rules);
+         }
+     }
+
+     Logger::setMessage($caller.': Content length: '.strlen($this->content).' bytes');
+     Logger::setMessage($caller.': Grabber done');
+
+     return $this->content !== '';
+ }
+
+ /**
+  * Fetch the page with the HTTP client
+  *
+  * On success the URL, HTML and encoding are replaced by the values
+  * reported by the client. A ClientException is logged and otherwise
+  * ignored. Nothing is fetched when processing is skipped or the URL is
+  * empty.
+  *
+  * @access public
+  * @return string   HTML content currently held by the grabber
+  */
+ public function download()
+ {
+     if ($this->skip_processing || $this->url == '') {
+         return $this->html;
+     }
+
+     try {
+         $http = Client::getInstance();
+         $http->setConfig($this->config);
+         $http->execute($this->url);
+
+         $this->url = $http->getUrl();
+         $this->html = $http->getContent();
+         $this->encoding = $http->getEncoding();
+     }
+     catch (ClientException $exception) {
+         Logger::setMessage(get_called_class().': '.$exception->getMessage());
+     }
+
+     return $this->html;
+ }
+
+ /**
+  * Try to find a predefined rule
+  *
+  * Rule files live in ../Rules/ and are named after the hostname. The
+  * following names are tried in order, the first existing file wins:
+  * the full hostname, the hostname without "www.", the part from the first
+  * dot (dot included), the part after the first dot, the part before the
+  * first dot.
+  *
+  * @access public
+  * @return mixed   Value returned by the rule file, or false when none matches
+  */
+ public function getRules()
+ {
+     $hostname = parse_url($this->url, PHP_URL_HOST);
+
+     // parse_url() returns false for malformed URLs and null when the URL
+     // has no host component; neither can match a rule file.
+     if (! is_string($hostname) || $hostname === '') {
+         return false;
+     }
+
+     // The hostname ends up in an include path: refuse anything that could
+     // point outside the Rules directory.
+     if (strpbrk($hostname, "/\\\0") !== false) {
+         return false;
+     }
+
+     $files = array($hostname);
+
+     if (substr($hostname, 0, 4) == 'www.') {
+         $files[] = substr($hostname, 4);
+     }
+
+     if (($pos = strpos($hostname, '.')) !== false) {
+         $files[] = substr($hostname, $pos);
+         $files[] = substr($hostname, $pos + 1);
+         $files[] = substr($hostname, 0, $pos);
+     }
+
+     // array_unique() keeps the first occurrence, so the lookup order holds
+     foreach (array_unique($files) as $file) {
+
+         // substr() yields an empty value for hosts starting/ending with a dot
+         if (! is_string($file) || $file === '') {
+             continue;
+         }
+
+         $filename = __DIR__.'/../Rules/'.$file.'.php';
+
+         if (file_exists($filename)) {
+             Logger::setMessage(get_called_class().' Load rule: '.$file);
+             return include $filename;
+         }
+     }
+
+     return false;
+ }
+
+ /**
+  * Get the relevant content with predefined rules
+  *
+  * $rules['strip'] is a list of XPath expressions whose matches are removed
+  * from the document; $rules['body'] is a list of XPath expressions whose
+  * matches are serialized and appended to the content. Both keys are
+  * optional and ignored when they are not arrays.
+  *
+  * @access public
+  * @param  array   $rules   Rules
+  */
+ public function parseContentWithRules(array $rules)
+ {
+     // NOTE(review): the XML declaration prefix presumably forces the parser
+     // to treat the markup as UTF-8 - confirm against XmlParser.
+     $dom = XmlParser::getHtmlDocument('<?xml version="1.0" encoding="UTF-8">'.$this->html);
+     $xpath = new DOMXPath($dom);
+
+     if (isset($rules['strip']) && is_array($rules['strip'])) {
+
+         foreach ($rules['strip'] as $pattern) {
+
+             $nodes = $xpath->query($pattern);
+
+             if ($nodes === false) {
+                 continue;
+             }
+
+             foreach ($nodes as $node) {
+                 // Nodes without a parent (e.g. attribute nodes matched by
+                 // the expression) cannot be detached this way.
+                 if ($node->parentNode !== null) {
+                     $node->parentNode->removeChild($node);
+                 }
+             }
+         }
+     }
+
+     if (isset($rules['body']) && is_array($rules['body'])) {
+
+         foreach ($rules['body'] as $pattern) {
+
+             $nodes = $xpath->query($pattern);
+
+             if ($nodes === false) {
+                 continue;
+             }
+
+             foreach ($nodes as $node) {
+                 $this->content .= $dom->saveXML($node);
+             }
+         }
+     }
+ }
+
+ /**
+  * Get the relevant content with the list of potential attributes
+  *
+  * Three attempts are made, each one only when the previous result is too
+  * short: the first node matching a candidate class/id, then the first
+  * <article/> (below 200 bytes), then <body/> (below 50 bytes). The result
+  * is finally cleaned by stripGarbage().
+  *
+  * @access public
+  */
+ public function parseContentWithCandidates()
+ {
+     $caller = get_called_class();
+     $dom = XmlParser::getHtmlDocument('<?xml version="1.0" encoding="UTF-8">'.$this->html);
+     $xpath = new DOMXPath($dom);
+
+     // Candidates are ordered from specific to generic: stop at the first hit
+     foreach ($this->candidatesAttributes as $candidate) {
+
+         Logger::setMessage($caller.': Try this candidate: "'.$candidate.'"');
+
+         $found = $xpath->query('//*[(contains(@class, "'.$candidate.'") or @id="'.$candidate.'") and not (contains(@class, "nav") or contains(@class, "page"))]');
+
+         if ($found === false || $found->length === 0) {
+             continue;
+         }
+
+         $this->content = $dom->saveXML($found->item(0));
+         Logger::setMessage($caller.': Find candidate "'.$candidate.'" ('.strlen($this->content).' bytes)');
+         break;
+     }
+
+     // Too little text: fall back to the first <article/>
+     if (strlen($this->content) < 200) {
+
+         $articles = $xpath->query('//article');
+
+         if ($articles !== false && $articles->length > 0) {
+             $this->content = $dom->saveXML($articles->item(0));
+             Logger::setMessage($caller.': Find <article/> tag ('.strlen($this->content).' bytes)');
+         }
+     }
+
+     // Still almost nothing: take the whole body
+     if (strlen($this->content) < 50) {
+
+         $bodies = $xpath->query('//body');
+
+         if ($bodies !== false && $bodies->length > 0) {
+             Logger::setMessage($caller.' No enought content fetched, get //body');
+             $this->content = $dom->saveXML($bodies->item(0));
+         }
+     }
+
+     Logger::setMessage($caller.': Strip garbage');
+     $this->stripGarbage();
+ }
+
+ /**
+  * Remove unwanted elements from the extracted content
+  *
+  * Elements listed in $stripTags are always removed; elements whose class
+  * or id contains an entry of $stripAttributes are removed only when
+  * shouldRemove() agrees. The content is left untouched when it cannot be
+  * loaded as a document.
+  *
+  * @access public
+  */
+ public function stripGarbage()
+ {
+     $dom = XmlParser::getDomDocument($this->content);
+
+     if ($dom === false) {
+         return;
+     }
+
+     $caller = get_called_class();
+     $xpath = new DOMXPath($dom);
+
+     foreach ($this->stripTags as $tag) {
+
+         $elements = $xpath->query('//'.$tag);
+
+         if ($elements === false || $elements->length === 0) {
+             continue;
+         }
+
+         Logger::setMessage($caller.': Strip tag: "'.$tag.'"');
+
+         foreach ($elements as $element) {
+             $element->parentNode->removeChild($element);
+         }
+     }
+
+     foreach ($this->stripAttributes as $attribute) {
+
+         $elements = $xpath->query('//*[contains(@class, "'.$attribute.'") or contains(@id, "'.$attribute.'")]');
+
+         if ($elements === false || $elements->length === 0) {
+             continue;
+         }
+
+         Logger::setMessage($caller.': Strip attribute: "'.$attribute.'"');
+
+         foreach ($elements as $element) {
+             if ($this->shouldRemove($dom, $element)) {
+                 $element->parentNode->removeChild($element);
+             }
+         }
+     }
+
+     $this->content = $dom->saveXML($dom->documentElement);
+ }
+
+ /**
+  * Tell whether a node flagged for stripping may really be removed
+  *
+  * A node holding 90% or more of the document text (measured in bytes) is
+  * kept, and the decision is logged. An empty document always allows
+  * removal.
+  *
+  * @access public
+  * @param  DomDocument  $dom
+  * @param  DomNode      $node
+  * @return boolean      False if the node should not be removed
+  */
+ public function shouldRemove($dom, $node)
+ {
+     $total_bytes = strlen($dom->textContent);
+     $node_bytes = strlen($node->textContent);
+
+     if ($total_bytes === 0) {
+         return true;
+     }
+
+     $ratio = $node_bytes * 100 / $total_bytes;
+
+     if ($ratio < 90) {
+         return true;
+     }
+
+     Logger::setMessage(get_called_class().': Should not remove this node ('.$node->nodeName.') ratio: '.$ratio.'%');
+
+     return false;
+ }
+}