diff options
Diffstat (limited to '3rdparty/fguillot/picofeed/lib/PicoFeed/Parser')
11 files changed, 1936 insertions, 0 deletions
/**
 * Atom feed parser
 *
 * Maps the elements of an Atom document onto Feed and Item objects.
 *
 * @author  Frederic Guillot
 * @package Parser
 */
class Atom extends Parser
{
    /**
     * Return the collection of entries found in the feed
     *
     * @access public
     * @param  SimpleXMLElement   $xml   Feed xml
     * @return SimpleXMLElement
     */
    public function getItemsTree(SimpleXMLElement $xml)
    {
        $entries = $xml->entry;

        return $entries;
    }

    /**
     * Store the feed url, as selected by getLink()
     *
     * @access public
     * @param  SimpleXMLElement          $xml     Feed xml
     * @param  \PicoFeed\Parser\Feed     $feed    Feed object
     */
    public function findFeedUrl(SimpleXMLElement $xml, Feed $feed)
    {
        $link = $this->getLink($xml);
        $feed->url = $link;
    }

    /**
     * Store the feed description (the "subtitle" element)
     *
     * @access public
     * @param  SimpleXMLElement          $xml     Feed xml
     * @param  \PicoFeed\Parser\Feed     $feed    Feed object
     */
    public function findFeedDescription(SimpleXMLElement $xml, Feed $feed)
    {
        $subtitle = (string) $xml->subtitle;
        $feed->description = $subtitle;
    }

    /**
     * Store the feed logo url (the "logo" element)
     *
     * @access public
     * @param  SimpleXMLElement          $xml     Feed xml
     * @param  \PicoFeed\Parser\Feed     $feed    Feed object
     */
    public function findFeedLogo(SimpleXMLElement $xml, Feed $feed)
    {
        $logo = (string) $xml->logo;
        $feed->logo = $logo;
    }

    /**
     * Store the feed title
     *
     * The feed url is used instead when the cleaned title is falsy.
     *
     * @access public
     * @param  SimpleXMLElement          $xml     Feed xml
     * @param  \PicoFeed\Parser\Feed     $feed    Feed object
     */
    public function findFeedTitle(SimpleXMLElement $xml, Feed $feed)
    {
        $title = Filter::stripWhiteSpace((string) $xml->title);

        $feed->title = $title ?: $feed->url;
    }

    /**
     * Store the feed language
     *
     * The language is read from the stored feed content, the $xml argument is not used.
     *
     * @access public
     * @param  SimpleXMLElement          $xml     Feed xml
     * @param  \PicoFeed\Parser\Feed     $feed    Feed object
     */
    public function findFeedLanguage(SimpleXMLElement $xml, Feed $feed)
    {
        $language = XmlParser::getXmlLang($this->content);
        $feed->language = $language;
    }

    /**
     * Store the feed id (the "id" element)
     *
     * @access public
     * @param  SimpleXMLElement          $xml     Feed xml
     * @param  \PicoFeed\Parser\Feed     $feed    Feed object
     */
    public function findFeedId(SimpleXMLElement $xml, Feed $feed)
    {
        $id = (string) $xml->id;
        $feed->id = $id;
    }

    /**
     * Store the feed date, parsed from the "updated" element
     *
     * @access public
     * @param  SimpleXMLElement          $xml     Feed xml
     * @param  \PicoFeed\Parser\Feed     $feed    Feed object
     */
    public function findFeedDate(SimpleXMLElement $xml, Feed $feed)
    {
        $updated = (string) $xml->updated;
        $feed->date = $this->parseDate($updated);
    }

    /**
     * Store the item date, parsed from the entry "updated" element
     *
     * @access public
     * @param  SimpleXMLElement   $entry   Feed item
     * @param  Item               $item    Item object
     */
    public function findItemDate(SimpleXMLElement $entry, Item $item)
    {
        $updated = (string) $entry->updated;
        $item->date = $this->parseDate($updated);
    }

    /**
     * Store the item title
     *
     * The item url is used instead when the cleaned title is empty.
     *
     * @access public
     * @param  SimpleXMLElement   $entry   Feed item
     * @param  Item               $item    Item object
     */
    public function findItemTitle(SimpleXMLElement $entry, Item $item)
    {
        $title = Filter::stripWhiteSpace((string) $entry->title);

        $item->title = empty($title) ? $item->url : $title;
    }

    /**
     * Store the item author
     *
     * The author name of the entry wins, otherwise the author name of the feed is used.
     *
     * @access public
     * @param  SimpleXMLElement          $xml     Feed
     * @param  SimpleXMLElement          $entry   Feed item
     * @param  \PicoFeed\Parser\Item     $item    Item object
     */
    public function findItemAuthor(SimpleXMLElement $xml, SimpleXMLElement $entry, Item $item)
    {
        $source = isset($entry->author->name) ? $entry : $xml;

        $item->author = (string) $source->author->name;
    }

    /**
     * Store the item content, as selected by getContent()
     *
     * @access public
     * @param  SimpleXMLElement          $entry   Feed item
     * @param  \PicoFeed\Parser\Item     $item    Item object
     */
    public function findItemContent(SimpleXMLElement $entry, Item $item)
    {
        $content = $this->getContent($entry);
        $item->content = $content;
    }

    /**
     * Store the item url, as selected by getLink()
     *
     * @access public
     * @param  SimpleXMLElement          $entry   Feed item
     * @param  \PicoFeed\Parser\Item     $item    Item object
     */
    public function findItemUrl(SimpleXMLElement $entry, Item $item)
    {
        $link = $this->getLink($entry);
        $item->url = $link;
    }

    /**
     * Generate the item id
     *
     * The entry "id" element is hashed when it is truthy, otherwise the hash is
     * computed from the item title, url and content.
     *
     * @access public
     * @param  SimpleXMLElement          $entry   Feed item
     * @param  \PicoFeed\Parser\Item     $item    Item object
     * @param  \PicoFeed\Parser\Feed     $feed    Feed object
     */
    public function findItemId(SimpleXMLElement $entry, Item $item, Feed $feed)
    {
        $id = (string) $entry->id;

        if (! $id) {
            $item->id = $this->generateId(
                $item->getTitle(), $item->getUrl(), $item->getContent()
            );

            return;
        }

        $item->id = $this->generateId($id);
    }

    /**
     * Store the item enclosure
     *
     * Only the first link with rel="enclosure" is used, its href is resolved
     * against the feed url.
     *
     * @access public
     * @param  SimpleXMLElement          $entry   Feed item
     * @param  \PicoFeed\Parser\Item     $item    Item object
     * @param  \PicoFeed\Parser\Feed     $feed    Feed object
     */
    public function findItemEnclosure(SimpleXMLElement $entry, Item $item, Feed $feed)
    {
        foreach ($entry->link as $link) {

            if ((string) $link['rel'] !== 'enclosure') {
                continue;
            }

            $item->enclosure_url = Url::resolve((string) $link['href'], $feed->url);
            $item->enclosure_type = (string) $link['type'];

            return;
        }
    }

    /**
     * Store the item language (copied from the feed language)
     *
     * @access public
     * @param  SimpleXMLElement          $entry   Feed item
     * @param  \PicoFeed\Parser\Item     $item    Item object
     * @param  \PicoFeed\Parser\Feed     $feed    Feed object
     */
    public function findItemLanguage(SimpleXMLElement $entry, Item $item, Feed $feed)
    {
        $language = $feed->language;
        $item->language = $language;
    }

    /**
     * Get the URL from a link tag
     *
     * The first link typed as HTML or XHTML is preferred, otherwise the href
     * attribute read from $xml->link is returned.
     *
     * @access public
     * @param  SimpleXMLElement     $xml      XML tag
     * @return string
     */
    public function getLink(SimpleXMLElement $xml)
    {
        $html_types = array('text/html', 'application/xhtml+xml');

        foreach ($xml->link as $link) {

            if (in_array((string) $link['type'], $html_types, true)) {
                return (string) $link['href'];
            }
        }

        return (string) $xml->link['href'];
    }

    /**
     * Get the entry content
     *
     * The "content" element is used when it is set and not empty: the result of
     * asXML() when it has child elements, its text value otherwise. The "summary"
     * element is the second choice, and an empty string is returned when neither
     * is usable.
     *
     * @access public
     * @param  SimpleXMLElement     $entry   XML Entry
     * @return string
     */
    public function getContent(SimpleXMLElement $entry)
    {
        $has_content = isset($entry->content) && ! empty($entry->content);

        if ($has_content) {
            return count($entry->content->children()) > 0
                ? (string) $entry->content->asXML()
                : (string) $entry->content;
        }

        $has_summary = isset($entry->summary) && ! empty($entry->summary);

        if ($has_summary) {
            return (string) $entry->summary;
        }

        return '';
    }
}
/**
 * Feed Item
 *
 * Plain container for the data extracted from one feed entry.
 *
 * @author  Frederic Guillot
 * @package Parser
 */
class Item
{
    /**
     * Item id
     *
     * @access public
     * @var string
     */
    public $id = '';

    /**
     * Item title
     *
     * @access public
     * @var string
     */
    public $title = '';

    /**
     * Item url
     *
     * @access public
     * @var string
     */
    public $url = '';

    /**
     * Item author
     *
     * @access public
     * @var string
     */
    public $author = '';

    /**
     * Item date
     *
     * @access public
     * @var integer
     */
    public $date = 0;

    /**
     * Item content
     *
     * @access public
     * @var string
     */
    public $content = '';

    /**
     * Item enclosure url
     *
     * @access public
     * @var string
     */
    public $enclosure_url = '';

    /**
     * Item enclosure type
     *
     * @access public
     * @var string
     */
    public $enclosure_type = '';

    /**
     * Item language
     *
     * @access public
     * @var string
     */
    public $language = '';

    /**
     * Return item information
     *
     * One "Item::name = value" line per scalar property, followed by the size
     * of the content in bytes. Every line ends with PHP_EOL.
     *
     * @access public
     * @return string
     */
    public function __toString()
    {
        $names = array('id', 'title', 'url', 'date', 'language', 'author', 'enclosure_url', 'enclosure_type');
        $lines = array();

        foreach ($names as $name) {
            $lines[] = 'Item::'.$name.' = '.$this->{$name};
        }

        // The content itself is not dumped, only its length
        $lines[] = 'Item::content = '.strlen($this->content).' bytes';

        return implode(PHP_EOL, $lines).PHP_EOL;
    }

    /**
     * Get id
     *
     * @access public
     * @return string
     */
    public function getId()
    {
        return $this->id;
    }

    /**
     * Get title
     *
     * @access public
     * @return string
     */
    public function getTitle()
    {
        return $this->title;
    }

    /**
     * Get url
     *
     * @access public
     * @return string
     */
    public function getUrl()
    {
        return $this->url;
    }

    /**
     * Get author
     *
     * @access public
     * @return string
     */
    public function getAuthor()
    {
        return $this->author;
    }

    /**
     * Get date
     *
     * @access public
     * @return integer
     */
    public function getDate()
    {
        return $this->date;
    }

    /**
     * Get content
     *
     * @access public
     * @return string
     */
    public function getContent()
    {
        return $this->content;
    }

    /**
     * Get enclosure url
     *
     * @access public
     * @return string
     */
    public function getEnclosureUrl()
    {
        return $this->enclosure_url;
    }

    /**
     * Get enclosure type
     *
     * @access public
     * @return string
     */
    public function getEnclosureType()
    {
        return $this->enclosure_type;
    }

    /**
     * Get language
     *
     * @access public
     * @return string
     */
    public function getLanguage()
    {
        return $this->language;
    }
}
\ No newline at end of file diff --git a/3rdparty/fguillot/picofeed/lib/PicoFeed/Parser/Parser.php b/3rdparty/fguillot/picofeed/lib/PicoFeed/Parser/Parser.php new file mode 100644 index 000000000..6954c2ffc --- /dev/null +++ b/3rdparty/fguillot/picofeed/lib/PicoFeed/Parser/Parser.php @@ -0,0 +1,610 @@ +<?php + +namespace PicoFeed\Parser; + +use SimpleXMLElement; +use DateTime; +use DateTimeZone; + +use PicoFeed\Encoding\Encoding; +use PicoFeed\Filter\Filter; +use PicoFeed\Logging\Logging; +use PicoFeed\Client\Url; +use PicoFeed\Client\Grabber; + +/** + * Base parser class + * + * @author Frederic Guillot + * @package Parser + */ +abstract class Parser +{ + /** + * Config object + * + * @access private + * @var \PicoFeed\Config\Config + */ + private $config; + + /** + * Hash algorithm used to generate item id, any value supported by PHP, see hash_algos() + * + * @access private + * @var string + */ + private $hash_algo = 'sha256'; + + /** + * Timezone used to parse feed dates + * + * @access private + * @var string + */ + private $timezone = 'UTC'; + + /** + * Feed content (XML data) + * + * @access protected + * @var string + */ + protected $content = ''; + + /** + * Fallback url + * + * @access protected + * @var string + */ + protected $fallback_url = ''; + + /** + * XML namespaces + * + * @access protected + * @var array + */ + protected $namespaces = array(); + + /** + * Enable the content filtering + * + * @access private + * @var bool + */ + private $enable_filter = true; + + /** + * Enable the content grabber + * + * @access private + * @var bool + */ + private $enable_grabber = false; + + /** + * Ignore those urls for the content scraper + * + * @access private + * @var array + */ + private $grabber_ignore_urls = array(); + + /** + * Constructor + * + * @access public + * @param string $content Feed content + * @param string $http_encoding HTTP encoding (headers) + * @param string $base_url Fallback url when the feed provide relative or broken url + */ + 
/**
 * Constructor
 *
 * Stores the fallback url and prepares the feed content: the XML tag is
 * stripped, the data is converted with the detected encoding and then
 * normalized.
 *
 * @access public
 * @param  string  $content        Feed content
 * @param  string  $http_encoding  HTTP encoding (headers)
 * @param  string  $fallback_url   Fallback url when the feed provide relative or broken url
 */
public function __construct($content, $http_encoding = '', $fallback_url = '')
{
    $this->fallback_url = $fallback_url;

    // The encoding must be read before the XML tag is removed
    $xml_encoding = XmlParser::getEncodingFromXmlTag($content);

    // Strip XML tag to avoid multiple encoding/decoding in the next XML processing
    $data = Filter::stripXmlTag($content);

    Logging::setMessage(get_called_class().': HTTP Encoding "'.$http_encoding.'" ; XML Encoding "'.$xml_encoding.'"');

    // Encode everything in UTF-8, the encoding of the XML tag wins over the HTTP one
    $source_encoding = $xml_encoding ?: $http_encoding;
    $data = Encoding::convert($data, $source_encoding);

    // Workarounds
    $this->content = Filter::normalizeData($data);
}
/**
 * Fetch item content with the content grabber
 *
 * Nothing happens when the grabber is disabled or when the item url is
 * listed in the ignore list. The item content is replaced only when the
 * grabber parses the page and returns a non-empty content.
 *
 * @access public
 * @param  Item    $item    Item object
 */
public function scrapWebsite(Item $item)
{
    if (! $this->enable_grabber) {
        return;
    }

    $url = $item->getUrl();

    // Strict comparison: with loose comparison a non-string entry in the
    // ignore list (true, 0...) could match every url through type juggling
    if (in_array($url, $this->grabber_ignore_urls, true)) {
        return;
    }

    $grabber = new Grabber($url);
    $grabber->setConfig($this->config);
    $grabber->download();

    if ($grabber->parse()) {
        // Keep the original content when the grabber returns nothing usable
        $item->content = $grabber->getContent() ?: $item->content;
    }
}
/**
 * Try to parse all date format for broken feeds
 *
 * Each known format is tried in order and the first timestamp greater than
 * zero is returned. When no format matches, the current time is returned.
 *
 * @access public
 * @param  string  $value  Original date format
 * @return integer         Timestamp
 */
public function parseDate($value)
{
    // Standard formats, the value is used as it is
    $standard_formats = array_fill_keys(
        array(
            DATE_ATOM,
            DATE_RSS,
            DATE_COOKIE,
            DATE_ISO8601,
            DATE_RFC822,
            DATE_RFC850,
            DATE_RFC1036,
            DATE_RFC1123,
            DATE_RFC2822,
            DATE_RFC3339,
        ),
        null
    );

    // Format => the value is truncated to this length before parsing
    $truncated_formats = array(
        'D, d M Y H:i:s' => 25,
        'D, d M Y h:i:s' => 25,
        'D M d Y H:i:s' => 24,
        'Y-m-d H:i:s' => 19,
        'Y-m-d\TH:i:s' => 19,
        'd/m/Y H:i:s' => 19,
        'D, d M Y' => 16,
        'Y-m-d' => 10,
        'd-m-Y' => 10,
        'm-d-Y' => 10,
        'd.m.Y' => 10,
        'm.d.Y' => 10,
        'd/m/Y' => 10,
        'm/d/Y' => 10,
    );

    // The order is significant: standard formats first, then the truncated ones
    $candidates = $standard_formats + $truncated_formats;
    $input = trim($value);

    foreach ($candidates as $pattern => $max_length) {

        $candidate = ($max_length === null) ? $input : substr($input, 0, $max_length);
        $parsed = $this->getValidDate($pattern, $candidate);

        if ($parsed > 0) {
            return $parsed;
        }
    }

    // Nothing matched: use the current time in the configured timezone
    $now = new DateTime('now', new DateTimeZone($this->timezone));

    return $now->getTimestamp();
}