diff options
Diffstat (limited to 'vendor/fguillot/picofeed/lib/PicoFeed/Parser')
13 files changed, 2173 insertions, 0 deletions
// File: vendor/fguillot/picofeed/lib/PicoFeed/Parser/Atom.php (namespace PicoFeed\Parser)
// Added as a regular file; the same diff deletes the former git submodule entry
// vendor/fguillot/picofeed (subproject 0a1d0d3950f7f047dc8fb1d80aa6296e15f306d).

/**
 * Atom parser
 *
 * Maps the elements of an Atom document (<feed>/<entry>) onto Feed and Item
 * objects. Helpers such as $this->date, $this->content and generateId() are
 * not declared in this class; presumably they come from the parent Parser.
 *
 * @author  Frederic Guillot
 * @package Parser
 */
class Atom extends Parser
{
    /**
     * Get the path to the items XML tree
     *
     * @access public
     * @param  \SimpleXMLElement  $xml  Feed xml
     * @return \SimpleXMLElement        The <entry> children of the document
     */
    public function getItemsTree(\SimpleXMLElement $xml)
    {
        return $xml->entry;
    }

    /**
     * Find the feed url (the link whose rel is "self")
     *
     * @access public
     * @param  \SimpleXMLElement  $xml   Feed xml
     * @param  Feed               $feed  Feed object
     */
    public function findFeedUrl(\SimpleXMLElement $xml, Feed $feed)
    {
        $feed->feed_url = $this->getUrl($xml, 'self');
    }

    /**
     * Find the site url (rel="alternate", or a link without rel as fallback)
     *
     * @access public
     * @param  \SimpleXMLElement  $xml   Feed xml
     * @param  Feed               $feed  Feed object
     */
    public function findSiteUrl(\SimpleXMLElement $xml, Feed $feed)
    {
        $feed->site_url = $this->getUrl($xml, 'alternate', true);
    }

    /**
     * Find the feed description (taken from <subtitle>)
     *
     * @access public
     * @param  \SimpleXMLElement  $xml   Feed xml
     * @param  Feed               $feed  Feed object
     */
    public function findFeedDescription(\SimpleXMLElement $xml, Feed $feed)
    {
        $feed->description = (string) $xml->subtitle;
    }

    /**
     * Find the feed logo url (taken from <logo>)
     *
     * @access public
     * @param  \SimpleXMLElement  $xml   Feed xml
     * @param  Feed               $feed  Feed object
     */
    public function findFeedLogo(\SimpleXMLElement $xml, Feed $feed)
    {
        $feed->logo = (string) $xml->logo;
    }

    /**
     * Find the feed icon (taken from <icon>)
     *
     * @access public
     * @param  \SimpleXMLElement  $xml   Feed xml
     * @param  Feed               $feed  Feed object
     */
    public function findFeedIcon(\SimpleXMLElement $xml, Feed $feed)
    {
        $feed->icon = (string) $xml->icon;
    }

    /**
     * Find the feed title, falling back to the site url when the title is empty
     *
     * @access public
     * @param  \SimpleXMLElement  $xml   Feed xml
     * @param  Feed               $feed  Feed object
     */
    public function findFeedTitle(\SimpleXMLElement $xml, Feed $feed)
    {
        $title = \PicoFeed\Filter\Filter::stripWhiteSpace((string) $xml->title);

        $feed->title = $title ?: $feed->getSiteUrl();
    }

    /**
     * Find the feed language
     *
     * The value is read from the raw feed content through
     * XmlParser::getXmlLang(), not from the parsed $xml tree.
     *
     * @access public
     * @param  \SimpleXMLElement  $xml   Feed xml
     * @param  Feed               $feed  Feed object
     */
    public function findFeedLanguage(\SimpleXMLElement $xml, Feed $feed)
    {
        $feed->language = XmlParser::getXmlLang($this->content);
    }

    /**
     * Find the feed id (taken from <id>)
     *
     * @access public
     * @param  \SimpleXMLElement  $xml   Feed xml
     * @param  Feed               $feed  Feed object
     */
    public function findFeedId(\SimpleXMLElement $xml, Feed $feed)
    {
        $feed->id = (string) $xml->id;
    }

    /**
     * Find the feed date (timestamp parsed from <updated>)
     *
     * @access public
     * @param  \SimpleXMLElement  $xml   Feed xml
     * @param  Feed               $feed  Feed object
     */
    public function findFeedDate(\SimpleXMLElement $xml, Feed $feed)
    {
        $feed->date = $this->date->getTimestamp((string) $xml->updated);
    }

    /**
     * Find the item date
     *
     * Uses the most recent of <published> and <updated>; when neither yields
     * a non-zero timestamp the current time is used.
     *
     * @access public
     * @param  \SimpleXMLElement  $entry  Feed item
     * @param  Item               $item   Item object
     */
    public function findItemDate(\SimpleXMLElement $entry, Item $item)
    {
        $published = 0;
        if (isset($entry->published)) {
            $published = $this->date->getTimestamp((string) $entry->published);
        }

        $updated = 0;
        if (isset($entry->updated)) {
            $updated = $this->date->getTimestamp((string) $entry->updated);
        }

        $latest = max($published, $updated);

        $item->date = $latest ?: time();
    }

    /**
     * Find the item title, falling back to the item url when the title is empty
     *
     * @access public
     * @param  \SimpleXMLElement  $entry  Feed item
     * @param  Item               $item   Item object
     */
    public function findItemTitle(\SimpleXMLElement $entry, Item $item)
    {
        $title = \PicoFeed\Filter\Filter::stripWhiteSpace((string) $entry->title);

        $item->title = empty($title) ? $item->url : $title;
    }

    /**
     * Find the item author
     *
     * The entry-level <author><name> wins; otherwise the feed-level author
     * name is used.
     *
     * @access public
     * @param  \SimpleXMLElement  $xml    Feed
     * @param  \SimpleXMLElement  $entry  Feed item
     * @param  Item               $item   Item object
     */
    public function findItemAuthor(\SimpleXMLElement $xml, \SimpleXMLElement $entry, Item $item)
    {
        $source = isset($entry->author->name) ? $entry : $xml;

        $item->author = (string) $source->author->name;
    }

    /**
     * Find the item content
     *
     * @access public
     * @param  \SimpleXMLElement  $entry  Feed item
     * @param  Item               $item   Item object
     */
    public function findItemContent(\SimpleXMLElement $entry, Item $item)
    {
        $item->content = $this->getContent($entry);
    }

    /**
     * Find the item URL (rel="alternate", or a link without rel as fallback)
     *
     * @access public
     * @param  \SimpleXMLElement  $entry  Feed item
     * @param  Item               $item   Item object
     */
    public function findItemUrl(\SimpleXMLElement $entry, Item $item)
    {
        $item->url = $this->getUrl($entry, 'alternate', true);
    }

    /**
     * Generate the item id
     *
     * The id is derived from the entry <id> when it is present and truthy,
     * otherwise from the item title, url and content.
     *
     * @access public
     * @param  \SimpleXMLElement  $entry  Feed item
     * @param  Item               $item   Item object
     * @param  Feed               $feed   Feed object (not used here)
     */
    public function findItemId(\SimpleXMLElement $entry, Item $item, Feed $feed)
    {
        $id = (string) $entry->id;

        if (! $id) {
            $item->id = $this->generateId(
                $item->getTitle(), $item->getUrl(), $item->getContent()
            );

            return;
        }

        $item->id = $this->generateId($id);
    }

    /**
     * Find the item enclosure
     *
     * Nothing is written to the item when the entry has no usable
     * rel="enclosure" link. The href is resolved against the feed site url.
     *
     * @access public
     * @param  \SimpleXMLElement  $entry  Feed item
     * @param  Item               $item   Item object
     * @param  Feed               $feed   Feed object
     */
    public function findItemEnclosure(\SimpleXMLElement $entry, Item $item, Feed $feed)
    {
        $enclosure = $this->findLink($entry, 'enclosure');

        if (! $enclosure) {
            return;
        }

        $item->enclosure_url = \PicoFeed\Client\Url::resolve((string) $enclosure['href'], $feed->getSiteUrl());
        $item->enclosure_type = (string) $enclosure['type'];
    }

    /**
     * Find the item language
     *
     * Reads the xml:lang attribute of the entry and inherits the feed
     * language when that attribute is missing or empty.
     *
     * @access public
     * @param  \SimpleXMLElement  $entry  Feed item
     * @param  Item               $item   Item object
     * @param  Feed               $feed   Feed object
     */
    public function findItemLanguage(\SimpleXMLElement $entry, Item $item, Feed $feed)
    {
        $language = (string) $entry->attributes('xml', true)->{'lang'};

        $item->language = ($language === '') ? $feed->language : $language;
    }

    /**
     * Get the URL from a link tag
     *
     * @access private
     * @param  \SimpleXMLElement  $xml       XML tag
     * @param  string             $rel       Link relationship: alternate, enclosure, related, self, via
     * @param  bool               $fallback  When true, retry with links that have no (or an empty) rel
     * @return string                        The href of the matched link, or '' when none matches
     */
    private function getUrl(\SimpleXMLElement $xml, $rel, $fallback = false)
    {
        $match = $this->findLink($xml, $rel);

        // Second chance: a link with no rel attribute compares equal to ''
        if (! $match && $fallback) {
            $match = $this->findLink($xml, '');
        }

        return $match ? (string) $match['href'] : '';
    }

    /**
     * Get the first link tag that matches a relationship
     *
     * @access private
     * @param  \SimpleXMLElement  $xml  XML tag
     * @param  string             $rel  Link relationship: alternate, enclosure, related, self, via
     * @return \SimpleXMLElement|null
     */
    private function findLink(\SimpleXMLElement $xml, $rel)
    {
        foreach ($xml->link as $candidate) {
            $candidate_rel = (string) $candidate['rel'];

            if ($candidate_rel === $rel) {
                return $candidate;
            }
        }

        return null;
    }

    /**
     * Get the entry content
     *
     * Prefers <content>: serialized as XML when it has child elements, as
     * plain text otherwise. Falls back to <summary>, then to ''.
     *
     * @access private
     * @param  \SimpleXMLElement  $entry  XML Entry
     * @return string
     */
    private function getContent(\SimpleXMLElement $entry)
    {
        $has_content = isset($entry->content) && ! empty($entry->content);

        if ($has_content) {
            $node = $entry->content;

            return count($node->children()) ? (string) $node->asXML() : (string) $node;
        }

        if (isset($entry->summary) && ! empty($entry->summary)) {
            return (string) $entry->summary;
        }

        return '';
    }
}
// File: vendor/fguillot/picofeed/lib/PicoFeed/Parser/DateParser.php (namespace PicoFeed\Parser)

/**
 * Date Parser
 *
 * Turns the date strings found in feeds into Unix timestamps by trying a
 * list of known formats one after another.
 *
 * @author  Frederic Guillot
 * @package Parser
 */
class DateParser
{
    /**
     * Timezone used to parse feed dates
     *
     * @access public
     * @var string
     */
    public $timezone = 'UTC';

    /**
     * Supported formats [ 'format' => length ]
     *
     * A non-null length means the input is cut to that many bytes before it
     * is matched against the format; null means the input is used as is.
     * Several DATE_* constants share the same format string, so the
     * effective list is shorter than the literal below.
     *
     * @access public
     * @var array
     */
    public $formats = array(
        DATE_ATOM => null,
        DATE_RSS => null,
        DATE_COOKIE => null,
        DATE_ISO8601 => null,
        DATE_RFC822 => null,
        DATE_RFC850 => null,
        DATE_RFC1036 => null,
        DATE_RFC1123 => null,
        DATE_RFC2822 => null,
        DATE_RFC3339 => null,
        'D, d M Y H:i:s' => 25,
        'D, d M Y h:i:s' => 25,
        'D M d Y H:i:s' => 24,
        'j M Y H:i:s' => 20,
        'Y-m-d H:i:s' => 19,
        'Y-m-d\TH:i:s' => 19,
        'd/m/Y H:i:s' => 19,
        'D, d M Y' => 16,
        'Y-m-d' => 10,
        'd-m-Y' => 10,
        'm-d-Y' => 10,
        'd.m.Y' => 10,
        'm.d.Y' => 10,
        'd/m/Y' => 10,
        'm/d/Y' => 10,
    );

    /**
     * Try to parse all date format for broken feeds
     *
     * The formats are tried in declaration order and the first one that
     * yields a positive timestamp wins. When nothing matches, the current
     * time is returned.
     *
     * @access public
     * @param  string  $value  Original date format
     * @return integer         Timestamp
     */
    public function getTimestamp($value)
    {
        $value = trim($value);

        foreach ($this->formats as $format => $length) {

            $truncated_value = $value;
            if ($length !== null) {
                $truncated_value = substr($truncated_value, 0, $length);
            }

            $timestamp = $this->getValidDate($format, $truncated_value);
            if ($timestamp > 0) {
                return $timestamp;
            }
        }

        $date = new \DateTime('now', new \DateTimeZone($this->timezone));
        return $date->getTimestamp();
    }

    /**
     * Get a valid date from a given format
     *
     * @access public
     * @param  string  $format  Date format
     * @param  string  $value   Original date value
     * @return integer          Timestamp, or 0 when the value does not match the format cleanly
     */
    public function getValidDate($format, $value)
    {
        $date = \DateTime::createFromFormat($format, $value, new \DateTimeZone($this->timezone));

        if ($date === false) {
            return 0;
        }

        // Since PHP 8.2 getLastErrors() returns false (not an array of zero
        // counters) when the last parse produced no warnings or errors, so
        // false has to be treated as a clean parse.
        $errors = \DateTime::getLastErrors();

        if ($errors !== false && ($errors['error_count'] !== 0 || $errors['warning_count'] !== 0)) {
            return 0;
        }

        return $date->getTimestamp();
    }
}
// File: vendor/fguillot/picofeed/lib/PicoFeed/Parser/Feed.php (namespace PicoFeed\Parser)

/**
 * Feed
 *
 * Plain data holder for the feed-level values filled in by the parsers.
 *
 * @author  Frederic Guillot
 * @package Parser
 */
class Feed
{
    /**
     * Feed items
     *
     * @access public
     * @var array
     */
    public $items = array();

    /**
     * Feed id
     *
     * @access public
     * @var string
     */
    public $id = '';

    /**
     * Feed title
     *
     * @access public
     * @var string
     */
    public $title = '';

    /**
     * Feed description
     *
     * @access public
     * @var string
     */
    public $description = '';

    /**
     * Feed url
     *
     * @access public
     * @var string
     */
    public $feed_url = '';

    /**
     * Site url
     *
     * @access public
     * @var string
     */
    public $site_url = '';

    /**
     * Feed date
     *
     * @access public
     * @var integer
     */
    public $date = 0;

    /**
     * Feed language
     *
     * @access public
     * @var string
     */
    public $language = '';

    /**
     * Feed logo URL
     *
     * @access public
     * @var string
     */
    public $logo = '';

    /**
     * Feed icon URL
     *
     * @access public
     * @var string
     */
    public $icon = '';

    /**
     * Return feed information
     *
     * One "Feed::<name> = <value>" line per listed property, then the RTL
     * flag and the item count, then every item preceded by a "----" line.
     *
     * @access public
     * @return string
     */
    public function __toString()
    {
        $lines = array();

        foreach (array('id', 'title', 'feed_url', 'site_url', 'date', 'language', 'description', 'logo') as $name) {
            $lines[] = 'Feed::'.$name.' = '.$this->$name;
        }

        $lines[] = 'Feed::isRTL() = '.($this->isRTL() ? 'true' : 'false');
        $lines[] = 'Feed::items = '.count($this->items).' items';

        $output = implode(PHP_EOL, $lines).PHP_EOL;

        foreach ($this->items as $item) {
            $output .= '----'.PHP_EOL.$item;
        }

        return $output;
    }

    /**
     * Get title
     *
     * @access public
     * @return string
     */
    public function getTitle()
    {
        return $this->title;
    }

    /**
     * Get description
     *
     * @access public
     * @return string
     */
    public function getDescription()
    {
        return $this->description;
    }

    /**
     * Get the logo url
     *
     * @access public
     * @return string
     */
    public function getLogo()
    {
        return $this->logo;
    }

    /**
     * Get the icon url
     *
     * @access public
     * @return string
     */
    public function getIcon()
    {
        return $this->icon;
    }

    /**
     * Get feed url
     *
     * @access public
     * @return string
     */
    public function getFeedUrl()
    {
        return $this->feed_url;
    }

    /**
     * Get site url
     *
     * @access public
     * @return string
     */
    public function getSiteUrl()
    {
        return $this->site_url;
    }

    /**
     * Get date
     *
     * @access public
     * @return integer
     */
    public function getDate()
    {
        return $this->date;
    }

    /**
     * Get language
     *
     * @access public
     * @return string
     */
    public function getLanguage()
    {
        return $this->language;
    }

    /**
     * Get id
     *
     * @access public
     * @return string
     */
    public function getId()
    {
        return $this->id;
    }

    /**
     * Get feed items
     *
     * @access public
     * @return array
     */
    public function getItems()
    {
        return $this->items;
    }

    /**
     * Return true if the feed is "Right to Left"
     *
     * Delegates to Parser::isLanguageRTL() with the feed language.
     *
     * @access public
     * @return bool
     */
    public function isRTL()
    {
        return Parser::isLanguageRTL($this->language);
    }
}
// File: vendor/fguillot/picofeed/lib/PicoFeed/Parser/Item.php (namespace PicoFeed\Parser)

/**
 * Feed Item
 *
 * Plain data holder for the entry-level values filled in by the parsers.
 *
 * @author  Frederic Guillot
 * @package Parser
 */
class Item
{
    /**
     * List of known RTL languages
     *
     * Not read by any method of this class; isRTL() delegates to
     * Parser::isLanguageRTL() instead.
     *
     * @access public
     * @var array
     */
    public $rtl = array(
        'ar',  // Arabic (ar-**)
        'fa',  // Farsi (fa-**)
        'ur',  // Urdu (ur-**)
        'ps',  // Pashtu (ps-**)
        'syr', // Syriac (syr-**)
        'dv',  // Divehi (dv-**)
        'he',  // Hebrew (he-**)
        'yi',  // Yiddish (yi-**)
    );

    /**
     * Item id
     *
     * @access public
     * @var string
     */
    public $id = '';

    /**
     * Item title
     *
     * @access public
     * @var string
     */
    public $title = '';

    /**
     * Item url
     *
     * @access public
     * @var string
     */
    public $url = '';

    /**
     * Item author
     *
     * @access public
     * @var string
     */
    public $author= '';

    /**
     * Item date
     *
     * @access public
     * @var integer
     */
    public $date = 0;

    /**
     * Item content
     *
     * @access public
     * @var string
     */
    public $content = '';

    /**
     * Item enclosure url
     *
     * @access public
     * @var string
     */
    public $enclosure_url = '';

    /**
     * Item enclosure type
     *
     * @access public
     * @var string
     */
    public $enclosure_type = '';

    /**
     * Item language
     *
     * @access public
     * @var string
     */
    public $language = '';

    /**
     * Return item information
     *
     * One "Item::<name> = <value>" line per listed property, then the RTL
     * flag and the content size in bytes (the content itself is not dumped).
     *
     * @access public
     * @return string
     */
    public function __toString()
    {
        $lines = array();

        foreach (array('id', 'title', 'url', 'date', 'language', 'author', 'enclosure_url', 'enclosure_type') as $name) {
            $lines[] = 'Item::'.$name.' = '.$this->$name;
        }

        $lines[] = 'Item::isRTL() = '.($this->isRTL() ? 'true' : 'false');
        $lines[] = 'Item::content = '.strlen($this->content).' bytes';

        return implode(PHP_EOL, $lines).PHP_EOL;
    }

    /**
     * Get title
     *
     * @access public
     * @return string
     */
    public function getTitle()
    {
        return $this->title;
    }

    /**
     * Get url
     *
     * @access public
     * @return string
     */
    public function getUrl()
    {
        return $this->url;
    }

    /**
     * Get id
     *
     * @access public
     * @return string
     */
    public function getId()
    {
        return $this->id;
    }

    /**
     * Get date
     *
     * @access public
     * @return integer
     */
    public function getDate()
    {
        return $this->date;
    }

    /**
     * Get content
     *
     * @access public
     * @return string
     */
    public function getContent()
    {
        return $this->content;
    }

    /**
     * Get enclosure url
     *
     * @access public
     * @return string
     */
    public function getEnclosureUrl()
    {
        return $this->enclosure_url;
    }

    /**
     * Get enclosure type
     *
     * @access public
     * @return string
     */
    public function getEnclosureType()
    {
        return $this->enclosure_type;
    }

    /**
     * Get language
     *
     * @access public
     * @return string
     */
    public function getLanguage()
    {
        return $this->language;
    }

    /**
     * Get author
     *
     * @access public
     * @return string
     */
    public function getAuthor()
    {
        return $this->author;
    }

    /**
     * Return true if the item is "Right to Left"
     *
     * Delegates to Parser::isLanguageRTL() with the item language.
     *
     * @access public
     * @return bool
     */
    public function isRTL()
    {
        return Parser::isLanguageRTL($this->language);
    }
}

// File: vendor/fguillot/picofeed/lib/PicoFeed/Parser/MalformedXmlException.php (namespace PicoFeed\Parser)

/**
 * MalformedXmlException Exception
 *
 * Thrown by Parser::execute() when the feed content cannot be parsed as XML.
 *
 * @author  Frederic Guillot
 * @package Parser
 */
class MalformedXmlException extends ParserException
{
}
\ No newline at end of file diff --git a/vendor/fguillot/picofeed/lib/PicoFeed/Parser/Parser.php b/vendor/fguillot/picofeed/lib/PicoFeed/Parser/Parser.php new file mode 100644 index 000000000..7ada6d10f --- /dev/null +++ b/vendor/fguillot/picofeed/lib/PicoFeed/Parser/Parser.php @@ -0,0 +1,567 @@ +<?php + +namespace PicoFeed\Parser; + +use SimpleXMLElement; +use PicoFeed\Encoding\Encoding; +use PicoFeed\Filter\Filter; +use PicoFeed\Logging\Logger; +use PicoFeed\Client\Url; +use PicoFeed\Client\Grabber; + +/** + * Base parser class + * + * @author Frederic Guillot + * @package Parser + */ +abstract class Parser +{ + /** + * Config object + * + * @access private + * @var \PicoFeed\Config\Config + */ + private $config; + + /** + * DateParser object + * + * @access protected + * @var \PicoFeed\Parser\DateParser + */ + protected $date; + + /** + * Hash algorithm used to generate item id, any value supported by PHP, see hash_algos() + * + * @access private + * @var string + */ + private $hash_algo = 'sha256'; + + /** + * Feed content (XML data) + * + * @access protected + * @var string + */ + protected $content = ''; + + /** + * Fallback url + * + * @access protected + * @var string + */ + protected $fallback_url = ''; + + /** + * XML namespaces + * + * @access protected + * @var array + */ + protected $namespaces = array(); + + /** + * Enable the content filtering + * + * @access private + * @var bool + */ + private $enable_filter = true; + + /** + * Enable the content grabber + * + * @access private + * @var bool + */ + private $enable_grabber = false; + + /** + * Ignore those urls for the content scraper + * + * @access private + * @var array + */ + private $grabber_ignore_urls = array(); + + /** + * Constructor + * + * @access public + * @param string $content Feed content + * @param string $http_encoding HTTP encoding (headers) + * @param string $fallback_url Fallback url when the feed provide relative or broken url + */ + public function __construct($content, 
$http_encoding = '', $fallback_url = '') + { + $this->date = new DateParser; + $this->fallback_url = $fallback_url; + $xml_encoding = XmlParser::getEncodingFromXmlTag($content); + + // Strip XML tag to avoid multiple encoding/decoding in the next XML processing + $this->content = Filter::stripXmlTag($content); + + // Encode everything in UTF-8 + Logger::setMessage(get_called_class().': HTTP Encoding "'.$http_encoding.'" ; XML Encoding "'.$xml_encoding.'"'); + $this->content = Encoding::convert($this->content, $xml_encoding ?: $http_encoding); + + // Workarounds + $this->content = Filter::normalizeData($this->content); + } + + /** + * Parse the document + * + * @access public + * @return \PicoFeed\Parser\Feed + */ + public function execute() + { + Logger::setMessage(get_called_class().': begin parsing'); + + $xml = XmlParser::getSimpleXml($this->content); + + if ($xml === false) { + Logger::setMessage(get_called_class().': XML parsing error'); + Logger::setMessage(XmlParser::getErrors()); + throw new MalformedXmlException('XML parsing error'); + } + + $this->namespaces = $xml->getNamespaces(true); |