summary | refs | log | tree | commit | diff | stats
path: root/vendor/fguillot/picofeed/lib/PicoFeed/Parser/Parser.php
diff options
context:
space:
mode:
Diffstat (limited to 'vendor/fguillot/picofeed/lib/PicoFeed/Parser/Parser.php')
-rw-r--r-- vendor/fguillot/picofeed/lib/PicoFeed/Parser/Parser.php 137
1 files changed, 39 insertions, 98 deletions
diff --git a/vendor/fguillot/picofeed/lib/PicoFeed/Parser/Parser.php b/vendor/fguillot/picofeed/lib/PicoFeed/Parser/Parser.php
index 5130b68bb..433f21a26 100644
--- a/vendor/fguillot/picofeed/lib/PicoFeed/Parser/Parser.php
+++ b/vendor/fguillot/picofeed/lib/PicoFeed/Parser/Parser.php
@@ -2,12 +2,15 @@
namespace PicoFeed\Parser;
+use PicoFeed\Processor\ContentFilterProcessor;
+use PicoFeed\Processor\ContentGeneratorProcessor;
+use PicoFeed\Processor\ItemPostProcessor;
+use PicoFeed\Processor\ScraperProcessor;
use SimpleXMLElement;
use PicoFeed\Client\Url;
use PicoFeed\Encoding\Encoding;
use PicoFeed\Filter\Filter;
use PicoFeed\Logging\Logger;
-use PicoFeed\Scraper\Scraper;
/**
* Base parser class.
@@ -28,7 +31,7 @@ abstract class Parser
*
* @var \PicoFeed\Parser\DateParser
*/
- protected $date;
+ private $dateParser;
/**
* Hash algorithm used to generate item id, any value supported by PHP, see hash_algos().
@@ -66,32 +69,12 @@ abstract class Parser
protected $used_namespaces = array();
/**
- * Enable the content filtering.
+ * Item Post Processor instance
*
- * @var bool
+ * @access private
+ * @var ItemPostProcessor
*/
- private $enable_filter = true;
-
- /**
- * Enable the content grabber.
- *
- * @var bool
- */
- private $enable_grabber = false;
-
- /**
- * Enable the content grabber on all pages.
- *
- * @var bool
- */
- private $grabber_needs_rule_file = false;
-
- /**
- * Ignore those urls for the content scraper.
- *
- * @var array
- */
- private $grabber_ignore_urls = array();
+ private $itemPostProcessor;
/**
* Constructor.
@@ -102,7 +85,6 @@ abstract class Parser
*/
public function __construct($content, $http_encoding = '', $fallback_url = '')
{
- $this->date = new DateParser();
$this->fallback_url = $fallback_url;
$xml_encoding = XmlParser::getEncodingFromXmlTag($content);
@@ -112,6 +94,10 @@ abstract class Parser
// Encode everything in UTF-8
Logger::setMessage(get_called_class().': HTTP Encoding "'.$http_encoding.'" ; XML Encoding "'.$xml_encoding.'"');
$this->content = Encoding::convert($this->content, $xml_encoding ?: $http_encoding);
+
+ $this->itemPostProcessor = new ItemPostProcessor($this->config);
+ $this->itemPostProcessor->register(new ContentGeneratorProcessor($this->config));
+ $this->itemPostProcessor->register(new ContentFilterProcessor($this->config));
}
/**
@@ -173,15 +159,11 @@ abstract class Parser
// Id generation can use the item url/title/content (order is important)
$this->findItemId($entry, $item, $feed);
-
$this->findItemDate($entry, $item, $feed);
$this->findItemEnclosure($entry, $item, $feed);
$this->findItemLanguage($entry, $item, $feed);
- // Order is important (avoid double filtering)
- $this->filterItemContent($feed, $item);
- $this->scrapWebsite($item);
-
+ $this->itemPostProcessor->execute($feed, $item);
$feed->items[] = $item;
}
@@ -230,43 +212,29 @@ abstract class Parser
}
/**
- * Fetch item content with the content grabber.
+ * Get Item Post Processor instance
*
- * @param Item $item Item object
+ * @access public
+ * @return ItemPostProcessor
*/
- public function scrapWebsite(Item $item)
+ public function getItemPostProcessor()
{
- if ($this->enable_grabber && !in_array($item->getUrl(), $this->grabber_ignore_urls)) {
- $grabber = new Scraper($this->config);
- $grabber->setUrl($item->getUrl());
-
- if ($this->grabber_needs_rule_file) {
- $grabber->disableCandidateParser();
- }
-
- $grabber->execute();
-
- if ($grabber->hasRelevantContent()) {
- $item->content = $grabber->getFilteredContent();
- }
- }
+ return $this->itemPostProcessor;
}
/**
- * Filter HTML for entry content.
+ * Get DateParser instance
*
- * @param Feed $feed Feed object
- * @param Item $item Item object
+ * @access public
+ * @return DateParser
*/
- public function filterItemContent(Feed $feed, Item $item)
+ public function getDateParser()
{
- if ($this->isFilteringEnabled()) {
- $filter = Filter::html($item->getContent(), $feed->getSiteUrl());
- $filter->setConfig($this->config);
- $item->content = $filter->execute();
- } else {
- Logger::setMessage(get_called_class().': Content filtering disabled');
+ if ($this->dateParser === null) {
+ return new DateParser($this->config);
}
+
+ return $this->dateParser;
}
/**
@@ -316,31 +284,11 @@ abstract class Parser
* Set Hash algorithm used for id generation.
*
* @param string $algo Algorithm name
- *
* @return \PicoFeed\Parser\Parser
*/
public function setHashAlgo($algo)
{
$this->hash_algo = $algo ?: $this->hash_algo;
-
- return $this;
- }
-
- /**
- * Set a different timezone.
- *
- * @see http://php.net/manual/en/timezones.php
- *
- * @param string $timezone Timezone
- *
- * @return \PicoFeed\Parser\Parser
- */
- public function setTimezone($timezone)
- {
- if ($timezone) {
- $this->date->timezone = $timezone;
- }
-
return $this;
}
@@ -354,7 +302,6 @@ abstract class Parser
public function setConfig($config)
{
$this->config = $config;
-
return $this;
}
@@ -365,21 +312,8 @@ abstract class Parser
*/
public function disableContentFiltering()
{
- $this->enable_filter = false;
- }
-
- /**
- * Return true if the content filtering is enabled.
- *
- * @return bool
- */
- public function isFilteringEnabled()
- {
- if ($this->config === null) {
- return $this->enable_filter;
- }
-
- return $this->config->getContentFiltering($this->enable_filter);
+ $this->itemPostProcessor->unregister('PicoFeed\Processor\ContentFilterProcessor');
+ return $this;
}
/**
@@ -392,8 +326,14 @@ abstract class Parser
*/
public function enableContentGrabber($needs_rule_file = false)
{
- $this->enable_grabber = true;
- $this->grabber_needs_rule_file = $needs_rule_file;
+ $processor = new ScraperProcessor($this->config);
+
+ if ($needs_rule_file) {
+ $processor->getScraper()->disableCandidateParser();
+ }
+
+ $this->itemPostProcessor->register($processor);
+ return $this;
}
/**
@@ -405,7 +345,8 @@ abstract class Parser
*/
public function setGrabberIgnoreUrls(array $urls)
{
- $this->grabber_ignore_urls = $urls;
+ $this->itemPostProcessor->getProcessor('PicoFeed\Processor\ScraperProcessor')->ignoreUrls($urls);
+ return $this;
}
/**