diff options
Diffstat (limited to 'vendor/fguillot/picofeed/lib/PicoFeed/Parser/Parser.php')
m--------- | vendor/fguillot/picofeed | 0 | ||||
-rw-r--r-- | vendor/fguillot/picofeed/lib/PicoFeed/Parser/Parser.php | 567 |
2 files changed, 567 insertions, 0 deletions
diff --git a/vendor/fguillot/picofeed b/vendor/fguillot/picofeed deleted file mode 160000 -Subproject 0a1d0d3950f7f047dc8fb1d80aa6296e15f306d diff --git a/vendor/fguillot/picofeed/lib/PicoFeed/Parser/Parser.php b/vendor/fguillot/picofeed/lib/PicoFeed/Parser/Parser.php new file mode 100644 index 000000000..7ada6d10f --- /dev/null +++ b/vendor/fguillot/picofeed/lib/PicoFeed/Parser/Parser.php @@ -0,0 +1,567 @@ +<?php + +namespace PicoFeed\Parser; + +use SimpleXMLElement; +use PicoFeed\Encoding\Encoding; +use PicoFeed\Filter\Filter; +use PicoFeed\Logging\Logger; +use PicoFeed\Client\Url; +use PicoFeed\Client\Grabber; + +/** + * Base parser class + * + * @author Frederic Guillot + * @package Parser + */ +abstract class Parser +{ + /** + * Config object + * + * @access private + * @var \PicoFeed\Config\Config + */ + private $config; + + /** + * DateParser object + * + * @access protected + * @var \PicoFeed\Parser\DateParser + */ + protected $date; + + /** + * Hash algorithm used to generate item id, any value supported by PHP, see hash_algos() + * + * @access private + * @var string + */ + private $hash_algo = 'sha256'; + + /** + * Feed content (XML data) + * + * @access protected + * @var string + */ + protected $content = ''; + + /** + * Fallback url + * + * @access protected + * @var string + */ + protected $fallback_url = ''; + + /** + * XML namespaces + * + * @access protected + * @var array + */ + protected $namespaces = array(); + + /** + * Enable the content filtering + * + * @access private + * @var bool + */ + private $enable_filter = true; + + /** + * Enable the content grabber + * + * @access private + * @var bool + */ + private $enable_grabber = false; + + /** + * Ignore those urls for the content scraper + * + * @access private + * @var array + */ + private $grabber_ignore_urls = array(); + + /** + * Constructor + * + * @access public + * @param string $content Feed content + * @param string $http_encoding HTTP encoding (headers) + * @param string $fallback_url Fallback url when the feed provide relative or broken url + */ + public function __construct($content, $http_encoding = '', $fallback_url = '') + { + $this->date = new DateParser; + $this->fallback_url = $fallback_url; + $xml_encoding = XmlParser::getEncodingFromXmlTag($content); + + // Strip XML tag to avoid multiple encoding/decoding in the next XML processing + $this->content = Filter::stripXmlTag($content); + + // Encode everything in UTF-8 + Logger::setMessage(get_called_class().': HTTP Encoding "'.$http_encoding.'" ; XML Encoding "'.$xml_encoding.'"'); + $this->content = Encoding::convert($this->content, $xml_encoding ?: $http_encoding); + + // Workarounds + $this->content = Filter::normalizeData($this->content); + } + + /** + * Parse the document + * + * @access public + * @return \PicoFeed\Parser\Feed + */ + public function execute() + { + Logger::setMessage(get_called_class().': begin parsing'); + + $xml = XmlParser::getSimpleXml($this->content); + + if ($xml === false) { + Logger::setMessage(get_called_class().': XML parsing error'); + Logger::setMessage(XmlParser::getErrors()); + throw new MalformedXmlException('XML parsing error'); + } + + $this->namespaces = $xml->getNamespaces(true); + + $feed = new Feed; + + $this->findFeedUrl($xml, $feed); + $this->checkFeedUrl($feed); + + $this->findSiteUrl($xml, $feed); + $this->checkSiteUrl($feed); + + $this->findFeedTitle($xml, $feed); + $this->findFeedDescription($xml, $feed); + $this->findFeedLanguage($xml, $feed); + $this->findFeedId($xml, $feed); + $this->findFeedDate($xml, $feed); + $this->findFeedLogo($xml, $feed); + $this->findFeedIcon($xml, $feed); + + foreach ($this->getItemsTree($xml) as $entry) { + + $item = new Item; + $this->findItemAuthor($xml, $entry, $item); + + $this->findItemUrl($entry, $item); + $this->checkItemUrl($feed, $item); + + $this->findItemTitle($entry, $item); + $this->findItemContent($entry, $item); + + // Id generation can use the item url/title/content (order is important) + $this->findItemId($entry, $item, $feed); + + $this->findItemDate($entry, $item); + $this->findItemEnclosure($entry, $item, $feed); + $this->findItemLanguage($entry, $item, $feed); + + // Order is important (avoid double filtering) + $this->filterItemContent($feed, $item); + $this->scrapWebsite($item); + + $feed->items[] = $item; + } + + Logger::setMessage(get_called_class().PHP_EOL.$feed); + + return $feed; + } + + /** + * Check if the feed url is correct + * + * @access public + * @param Feed $feed Feed object + */ + public function checkFeedUrl(Feed $feed) + { + if ($feed->getFeedUrl() === '') { + $feed->feed_url = $this->fallback_url; + } + else { + $feed->feed_url = Url::resolve($feed->getFeedUrl(), $this->fallback_url); + } + } + + /** + * Check if the site url is correct + * + * @access public + * @param Feed $feed Feed object + */ + public function checkSiteUrl(Feed $feed) + { + if ($feed->getSiteUrl() === '') { + $feed->site_url = Url::base($feed->getFeedUrl()); + } + else { + $feed->site_url = Url::resolve($feed->getSiteUrl(), $this->fallback_url); + } + } + + /** + * Check if the item url is correct + * + * @access public + * @param Feed $feed Feed object + * @param Item $item Item object + */ + public function checkItemUrl(Feed $feed, Item $item) + { + $item->url = Url::resolve($item->getUrl(), $feed->getSiteUrl()); + } + + /** + * Fetch item content with the content grabber + * + * @access public + * @param Item $item Item object + */ + public function scrapWebsite(Item $item) + { + if ($this->enable_grabber && ! in_array($item->getUrl(), $this->grabber_ignore_urls)) { + + $grabber = new Grabber($item->getUrl()); + $grabber->setConfig($this->config); + $grabber->download(); + + if ($grabber->parse()) { + $item->content = $grabber->getFilteredContent(); + } + } + } + + /** + * Filter HTML for entry content + * + * @access public + * @param Feed $feed Feed object + * @param Item $item Item object + */ + public function filterItemContent(Feed $feed, Item $item) + { + if ($this->isFilteringEnabled()) { + $filter = Filter::html($item->getContent(), $feed->getSiteUrl()); + $filter->setConfig($this->config); + $item->content = $filter->execute(); + } + else { + Logger::setMessage(get_called_class().': Content filtering disabled'); + } + } + + /** + * Generate a unique id for an entry (hash all arguments) + * + * @access public + * @param string $args Pieces of data to hash + * @return string + */ + public function generateId() + { + return hash($this->hash_algo, implode(func_get_args())); + } + + /** + * Return true if the given language is "Right to Left" + * + * @static + * @access public + * @param string $language Language: fr-FR, en-US + * @return bool + */ + public static function isLanguageRTL($language) + { + $language = strtolower($language); + + $rtl_languages = array( + 'ar', // Arabic (ar-**) + 'fa', // Farsi (fa-**) + 'ur', // Urdu (ur-**) + 'ps', // Pashtu (ps-**) + 'syr', // Syriac (syr-**) + 'dv', // Divehi (dv-**) + 'he', // Hebrew (he-**) + 'yi', // Yiddish (yi-**) + ); + + foreach ($rtl_languages as $prefix) { + if (strpos($language, $prefix) === 0) { + return true; + } + } + + return false; + } + + /** + * Set Hash algorithm used for id generation + * + * @access public + * @param string $algo Algorithm name + * @return \PicoFeed\Parser\Parser + */ + public function setHashAlgo($algo) + { + $this->hash_algo = $algo ?: $this->hash_algo; + return $this; + } + + /** + * Set a different timezone + * + * @see http://php.net/manual/en/timezones.php + * @access public + * @param string $timezone Timezone + * @return \PicoFeed\Parser\Parser + */ + public function setTimezone($timezone) + { + if ($timezone) { + $this->date->timezone = $timezone; + } + + return $this; + } + + /** + * Set config object + * + * @access public + * @param \PicoFeed\Config\Config $config Config instance + * @return \PicoFeed\Parser\Parser + */ + public function setConfig($config) + { + $this->config = $config; + return $this; + } + + /** + * Enable the content grabber + * + * @access public + * @return \PicoFeed\Parser\Parser + */ + public function disableContentFiltering() + { + $this->enable_filter = false; + } + + /** + * Return true if the content filtering is enabled + * + * @access public + * @return boolean + */ + public function isFilteringEnabled() + { + if ($this->config === null) { + return $this->enable_filter; + } + + return $this->config->getContentFiltering($this->enable_filter); + } + + /** + * Enable the content grabber + * + * @access public + * @return \PicoFeed\Parser\Parser + */ + public function enableContentGrabber() + { + $this->enable_grabber = true; + } + + /** + * Set ignored URLs for the content grabber + * + * @access public + * @param array $urls URLs + * @return \PicoFeed\Parser\Parser + */ + public function setGrabberIgnoreUrls(array $urls) + { + $this->grabber_ignore_urls = $urls; + } + + /** + * Find the feed url + * + * @access public + * @param SimpleXMLElement $xml Feed xml + * @param \PicoFeed\Parser\Feed $feed Feed object + */ + public abstract function findFeedUrl(SimpleXMLElement $xml, Feed $feed); + + /** + * Find the site url + * + * @access public + * @param SimpleXMLElement $xml Feed xml + * @param \PicoFeed\Parser\Feed $feed Feed object + */ + public abstract function findSiteUrl(SimpleXMLElement $xml, Feed $feed); + + /** + * Find the feed title + * + * @access public + * @param SimpleXMLElement $xml Feed xml + * @param \PicoFeed\Parser\Feed $feed Feed object + */ + public abstract function findFeedTitle(SimpleXMLElement $xml, Feed $feed); + + /** + * Find the feed description + * + * @access public + * @param SimpleXMLElement $xml Feed xml + * @param \PicoFeed\Parser\Feed $feed Feed object + */ + public abstract function findFeedDescription(SimpleXMLElement $xml, Feed $feed); + + /** + * Find the feed language + * + * @access public + * @param SimpleXMLElement $xml Feed xml + * @param \PicoFeed\Parser\Feed $feed Feed object + */ + public abstract function findFeedLanguage(SimpleXMLElement $xml, Feed $feed); + + /** + * Find the feed id + * + * @access public + * @param SimpleXMLElement $xml Feed xml + * @param \PicoFeed\Parser\Feed $feed Feed object + */ + public abstract function findFeedId(SimpleXMLElement $xml, Feed $feed); + + /** + * Find the feed date + * + * @access public + * @param SimpleXMLElement $xml Feed xml + * @param \PicoFeed\Parser\Feed $feed Feed object + */ + public abstract function findFeedDate(SimpleXMLElement $xml, Feed $feed); + + /** + * Find the feed logo url + * + * @access public + * @param SimpleXMLElement $xml Feed xml + * @param \PicoFeed\Parser\Feed $feed Feed object + */ + public abstract function findFeedLogo(SimpleXMLElement $xml, Feed $feed); + + /** + * Find the feed icon + * + * @access public + * @param SimpleXMLElement $xml Feed xml + * @param \PicoFeed\Parser\Feed $feed Feed object + */ + public abstract function findFeedIcon(SimpleXMLElement $xml, Feed $feed); + + /** + * Get the path to the items XML tree + * + * @access public + * @param SimpleXMLElement $xml Feed xml + * @return SimpleXMLElement + */ + public abstract function getItemsTree(SimpleXMLElement $xml); + + /** + * Find the item author + * + * @access public + * @param SimpleXMLElement $xml Feed + * @param SimpleXMLElement $entry Feed item + * @param \PicoFeed\Parser\Item $item Item object + */ + public abstract function findItemAuthor(SimpleXMLElement $xml, SimpleXMLElement $entry, Item $item); + + /** + * Find the item URL + * + * @access public + * @param SimpleXMLElement $entry Feed item + * @param \PicoFeed\Parser\Item $item Item object + */ + public abstract function findItemUrl(SimpleXMLElement $entry, Item $item); + + /** + * Find the item title + * + * @access public + * @param SimpleXMLElement $entry Feed item + * @param \PicoFeed\Parser\Item $item Item object + */ + public abstract function findItemTitle(SimpleXMLElement $entry, Item $item); + + /** + * Genereate the item id + * + * @access public + * @param SimpleXMLElement $entry Feed item + * @param \PicoFeed\Parser\Item $item Item object + * @param \PicoFeed\Parser\Feed $feed Feed object + */ + public abstract function findItemId(SimpleXMLElement $entry, Item $item, Feed $feed); + + /** + * Find the item date + * + * @access public + * @param SimpleXMLElement $entry Feed item + * @param \PicoFeed\Parser\Item $item Item object + */ + public abstract function findItemDate(SimpleXMLElement $entry, Item $item); + + /** + * Find the item content + * + * @access public + * @param SimpleXMLElement $entry Feed item + * @param \PicoFeed\Parser\Item $item Item object + */ + public abstract function findItemContent(SimpleXMLElement $entry, Item $item); + + /** + * Find the item enclosure + * + * @access public + * @param SimpleXMLElement $entry Feed item + * @param \PicoFeed\Parser\Item $item Item object + * @param \PicoFeed\Parser\Feed $feed Feed object + */ + public abstract function findItemEnclosure(SimpleXMLElement $entry, Item $item, Feed $feed); + + /** + * Find the item language + * + * @access public + * @param SimpleXMLElement $entry Feed item + * @param \PicoFeed\Parser\Item $item Item object + * @param \PicoFeed\Parser\Feed $feed Feed object + */ + public abstract function findItemLanguage(SimpleXMLElement $entry, Item $item, Feed $feed); +} |