summary refs log tree commit diff stats
path: root/vendor/fguillot/picofeed/lib/PicoFeed/Filter/Filter.php
diff options
context:
space:
mode:
Diffstat (limited to 'vendor/fguillot/picofeed/lib/PicoFeed/Filter/Filter.php')
-rw-r--r-- vendor/fguillot/picofeed/lib/PicoFeed/Filter/Filter.php 40
1 files changed, 30 insertions, 10 deletions
diff --git a/vendor/fguillot/picofeed/lib/PicoFeed/Filter/Filter.php b/vendor/fguillot/picofeed/lib/PicoFeed/Filter/Filter.php
index 123f9896e..e3e4ad36b 100644
--- a/vendor/fguillot/picofeed/lib/PicoFeed/Filter/Filter.php
+++ b/vendor/fguillot/picofeed/lib/PicoFeed/Filter/Filter.php
@@ -107,7 +107,7 @@ class Filter
}
/**
- * Dirty quickfixes before XML parsing
+ * Fixes before XML parsing
*
* @static
* @access public
@@ -116,17 +116,37 @@ class Filter
*/
public static function normalizeData($data)
{
- $invalid_chars = array(
- "\x10",
- "\xc3\x20",
- "",
- "\xe2\x80\x9c\x08",
+ $entities = array(
+ '/(&#)(\d+);/m', // decimal encoded
+ '/(&#x)([a-f0-9]+);/mi', // hex encoded
);
- foreach ($invalid_chars as $needle) {
- $data = str_replace($needle, '', $data);
- }
+ // strip invalid XML 1.0 characters which are encoded as entities
+ $data = preg_replace_callback($entities, function($matches) {
+ $code_point = $matches[2];
- return $data;
+ // convert hex entity to decimal
+ if (strtolower($matches[1]) === '&#x') {
+ $code_point = hexdec($code_point);
+ }
+
+ $code_point = (int) $code_point;
+
+ // replace invalid characters
+ if ($code_point < 9
+ || ($code_point > 10 && $code_point < 13)
+ || ($code_point > 13 && $code_point < 32)
+ || ($code_point > 55295 && $code_point < 57344)
+ || ($code_point > 65533 && $code_point < 65536)
+ || $code_point > 1114111
+ ) {
+ return '';
+ };
+
+ return $matches[0];
+ }, $data);
+
+ // strip every utf-8 character that isn't in the range of valid XML 1.0 characters
+ return (string) preg_replace('/[^\x{0009}\x{000A}\x{000D}\x{0020}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}]/u', '', $data);
}
}