diff options
Diffstat (limited to '3rdparty/fguillot/picofeed/lib/PicoFeed/Filter/Filter.php')
-rw-r--r-- | 3rdparty/fguillot/picofeed/lib/PicoFeed/Filter/Filter.php | 170 |
1 files changed, 170 insertions, 0 deletions
diff --git a/3rdparty/fguillot/picofeed/lib/PicoFeed/Filter/Filter.php b/3rdparty/fguillot/picofeed/lib/PicoFeed/Filter/Filter.php new file mode 100644 index 000000000..0490e2f49 --- /dev/null +++ b/3rdparty/fguillot/picofeed/lib/PicoFeed/Filter/Filter.php @@ -0,0 +1,170 @@ +<?php + +namespace PicoFeed\Filter; + +/** + * Filter class + * + * @author Frederic Guillot + * @package Filter + */ +class Filter +{ + /** + * Get the Html filter instance + * + * @static + * @access public + * @param string $html HTML content + * @param string $website Site URL (used to build absolute URL) + * @return PicoFeed\Filter\Html + */ + public static function html($html, $website) + { + $filter = new Html($html, $website); + return $filter; + } + + /** + * Escape HTML content + * + * @static + * @access public + * @return string + */ + public static function escape($content) + { + return @htmlspecialchars($content, ENT_QUOTES, 'UTF-8', false); + } + + /** + * Remove HTML tags + * + * @access public + * @param string $data Input data + * @return string + */ + public function removeHTMLTags($data) + { + return preg_replace('~<(?:!DOCTYPE|/?(?:html|head|body))[^>]*>\s*~i', '', $data); + } + + /** + * Remove the XML tag from a document + * + * @static + * @access public + * @param string $data Input data + * @return string + */ + public static function stripXmlTag($data) + { + if (strpos($data, '<?xml') !== false) { + $data = ltrim(substr($data, strpos($data, '?>') + 2)); + } + + do { + + $pos = strpos($data, '<?xml-stylesheet '); + + if ($pos !== false) { + $data = ltrim(substr($data, strpos($data, '?>') + 2)); + } + + } while ($pos !== false && $pos < 200); + + return $data; + } + + /** + * Strip head tag from the HTML content + * + * @static + * @access public + * @param string $data Input data + * @return string + */ + public static function stripHeadTags($data) + { + $start = strpos($data, '<head>'); + $end = strpos($data, '</head>'); + + if ($start !== false && $end !== false) { + $before = substr($data, 0, $start); + $after = substr($data, $end + 7); + $data = $before.$after; + } + + return $data; + } + + /** + * Trim whitespace from the begining, the end and inside a string and don't break utf-8 string + * + * @static + * @access public + * @param string $value Raw data + * @return string Normalized data + */ + public static function stripWhiteSpace($value) + { + $value = str_replace("\r", ' ', $value); + $value = str_replace("\t", ' ', $value); + $value = str_replace("\n", ' ', $value); + // $value = preg_replace('/\s+/', ' ', $value); <= break utf-8 + return trim($value); + } + + /** + * Dirty quickfixes before XML parsing + * + * @static + * @access public + * @param string $data Raw data + * @return string Normalized data + */ + public static function normalizeData($data) + { + $invalid_chars = array( + "\x10", + "\xc3\x20", + "", + ); + + foreach ($invalid_chars as $needle) { + $data = str_replace($needle, '', $data); + } + + return $data; + } + + /** + * Get the first XML tag + * + * @static + * @access public + * @param string $data Feed content + * @return string + */ + public static function getFirstTag($data) + { + // Strip HTML comments (max of 5,000 characters long to prevent crashing) + $data = preg_replace('/<!--(.{0,5000}?)-->/Uis', '', $data); + + /* Strip Doctype: + * Doctype needs to be within the first 100 characters. (Ideally the first!) + * If it's not found by then, we need to stop looking to prevent PREG + * from reaching max backtrack depth and crashing. + */ + $data = preg_replace('/^.{0,100}<!DOCTYPE([^>]*)>/Uis', '', $data); + + // Strip <?xml version.... + $data = self::stripXmlTag($data); + + // Find the first tag + $open_tag = strpos($data, '<'); + $close_tag = strpos($data, '>'); + + return substr($data, $open_tag, $close_tag); + } +} |