diff options
Diffstat (limited to 'vendor/fguillot/picofeed/lib/PicoFeed/Parser/Parser.php')
-rw-r--r-- | vendor/fguillot/picofeed/lib/PicoFeed/Parser/Parser.php | 137 |
1 files changed, 39 insertions, 98 deletions
diff --git a/vendor/fguillot/picofeed/lib/PicoFeed/Parser/Parser.php b/vendor/fguillot/picofeed/lib/PicoFeed/Parser/Parser.php index 5130b68bb..433f21a26 100644 --- a/vendor/fguillot/picofeed/lib/PicoFeed/Parser/Parser.php +++ b/vendor/fguillot/picofeed/lib/PicoFeed/Parser/Parser.php @@ -2,12 +2,15 @@ namespace PicoFeed\Parser; +use PicoFeed\Processor\ContentFilterProcessor; +use PicoFeed\Processor\ContentGeneratorProcessor; +use PicoFeed\Processor\ItemPostProcessor; +use PicoFeed\Processor\ScraperProcessor; use SimpleXMLElement; use PicoFeed\Client\Url; use PicoFeed\Encoding\Encoding; use PicoFeed\Filter\Filter; use PicoFeed\Logging\Logger; -use PicoFeed\Scraper\Scraper; /** * Base parser class. @@ -28,7 +31,7 @@ abstract class Parser * * @var \PicoFeed\Parser\DateParser */ - protected $date; + private $dateParser; /** * Hash algorithm used to generate item id, any value supported by PHP, see hash_algos(). @@ -66,32 +69,12 @@ abstract class Parser protected $used_namespaces = array(); /** - * Enable the content filtering. + * Item Post Processor instance * - * @var bool + * @access private + * @var ItemPostProcessor */ - private $enable_filter = true; - - /** - * Enable the content grabber. - * - * @var bool - */ - private $enable_grabber = false; - - /** - * Enable the content grabber on all pages. - * - * @var bool - */ - private $grabber_needs_rule_file = false; - - /** - * Ignore those urls for the content scraper. - * - * @var array - */ - private $grabber_ignore_urls = array(); + private $itemPostProcessor; /** * Constructor. @@ -102,7 +85,6 @@ abstract class Parser */ public function __construct($content, $http_encoding = '', $fallback_url = '') { - $this->date = new DateParser(); $this->fallback_url = $fallback_url; $xml_encoding = XmlParser::getEncodingFromXmlTag($content); @@ -112,6 +94,10 @@ abstract class Parser // Encode everything in UTF-8 Logger::setMessage(get_called_class().': HTTP Encoding "'.$http_encoding.'" ; XML Encoding "'.$xml_encoding.'"'); $this->content = Encoding::convert($this->content, $xml_encoding ?: $http_encoding); + + $this->itemPostProcessor = new ItemPostProcessor($this->config); + $this->itemPostProcessor->register(new ContentGeneratorProcessor($this->config)); + $this->itemPostProcessor->register(new ContentFilterProcessor($this->config)); } /** @@ -173,15 +159,11 @@ abstract class Parser // Id generation can use the item url/title/content (order is important) $this->findItemId($entry, $item, $feed); - $this->findItemDate($entry, $item, $feed); $this->findItemEnclosure($entry, $item, $feed); $this->findItemLanguage($entry, $item, $feed); - // Order is important (avoid double filtering) - $this->filterItemContent($feed, $item); - $this->scrapWebsite($item); - + $this->itemPostProcessor->execute($feed, $item); $feed->items[] = $item; } @@ -230,43 +212,29 @@ abstract class Parser } /** - * Fetch item content with the content grabber. + * Get Item Post Processor instance * - * @param Item $item Item object + * @access public + * @return ItemPostProcessor */ - public function scrapWebsite(Item $item) + public function getItemPostProcessor() { - if ($this->enable_grabber && !in_array($item->getUrl(), $this->grabber_ignore_urls)) { - $grabber = new Scraper($this->config); - $grabber->setUrl($item->getUrl()); - - if ($this->grabber_needs_rule_file) { - $grabber->disableCandidateParser(); - } - - $grabber->execute(); - - if ($grabber->hasRelevantContent()) { - $item->content = $grabber->getFilteredContent(); - } - } + return $this->itemPostProcessor; } /** - * Filter HTML for entry content. + * Get DateParser instance * - * @param Feed $feed Feed object - * @param Item $item Item object + * @access public + * @return DateParser */ - public function filterItemContent(Feed $feed, Item $item) + public function getDateParser() { - if ($this->isFilteringEnabled()) { - $filter = Filter::html($item->getContent(), $feed->getSiteUrl()); - $filter->setConfig($this->config); - $item->content = $filter->execute(); - } else { - Logger::setMessage(get_called_class().': Content filtering disabled'); + if ($this->dateParser === null) { + return new DateParser($this->config); } + + return $this->dateParser; } /** @@ -316,31 +284,11 @@ abstract class Parser * Set Hash algorithm used for id generation. * * @param string $algo Algorithm name - * * @return \PicoFeed\Parser\Parser */ public function setHashAlgo($algo) { $this->hash_algo = $algo ?: $this->hash_algo; - - return $this; - } - - /** - * Set a different timezone. - * - * @see http://php.net/manual/en/timezones.php - * - * @param string $timezone Timezone - * - * @return \PicoFeed\Parser\Parser - */ - public function setTimezone($timezone) - { - if ($timezone) { - $this->date->timezone = $timezone; - } - return $this; } @@ -354,7 +302,6 @@ abstract class Parser public function setConfig($config) { $this->config = $config; - return $this; } @@ -365,21 +312,8 @@ abstract class Parser */ public function disableContentFiltering() { - $this->enable_filter = false; - } - - /** - * Return true if the content filtering is enabled. - * - * @return bool - */ - public function isFilteringEnabled() - { - if ($this->config === null) { - return $this->enable_filter; - } - - return $this->config->getContentFiltering($this->enable_filter); + $this->itemPostProcessor->unregister('PicoFeed\Processor\ContentFilterProcessor'); + return $this; } /** @@ -392,8 +326,14 @@ abstract class Parser */ public function enableContentGrabber($needs_rule_file = false) { - $this->enable_grabber = true; - $this->grabber_needs_rule_file = $needs_rule_file; + $processor = new ScraperProcessor($this->config); + + if ($needs_rule_file) { + $processor->getScraper()->disableCandidateParser(); + } + + $this->itemPostProcessor->register($processor); + return $this; } /** @@ -405,7 +345,8 @@ abstract class Parser */ public function setGrabberIgnoreUrls(array $urls) { - $this->grabber_ignore_urls = $urls; + $this->itemPostProcessor->getProcessor('PicoFeed\Processor\ScraperProcessor')->ignoreUrls($urls); + return $this; } /** |