/**
 * Best-effort converter of Windows-1252 / ISO 8859-1 (or mixed) text to UTF-8.
 *
 * Vendored copy of the "forceutf8" Encoding helper (see @link).
 *
 * @package Encoding
 * @version 1.2
 * @link    https://github.com/neitanod/forceutf8
 * @example https://github.com/neitanod/forceutf8
 * @license Revised BSD
 */
class Encoding
{
    /**
     * UTF-8 byte sequences for the Windows-1252 bytes in the 0x80-0x9F range,
     * keyed by the byte value.
     *
     * Byte values absent from this table (129, 141, 143, 144, 157) fall back
     * to convertInvalidCharacter() in toUTF8().
     */
    protected static $win1252ToUtf8 = array(
        128 => "\xe2\x82\xac",
        130 => "\xe2\x80\x9a",
        131 => "\xc6\x92",
        132 => "\xe2\x80\x9e",
        133 => "\xe2\x80\xa6",
        134 => "\xe2\x80\xa0",
        135 => "\xe2\x80\xa1",
        136 => "\xcb\x86",
        137 => "\xe2\x80\xb0",
        138 => "\xc5\xa0",
        139 => "\xe2\x80\xb9",
        140 => "\xc5\x92",
        142 => "\xc5\xbd",
        145 => "\xe2\x80\x98",
        146 => "\xe2\x80\x99",
        147 => "\xe2\x80\x9c",
        148 => "\xe2\x80\x9d",
        149 => "\xe2\x80\xa2",
        150 => "\xe2\x80\x93",
        151 => "\xe2\x80\x94",
        152 => "\xcb\x9c",
        153 => "\xe2\x84\xa2",
        154 => "\xc5\xa1",
        155 => "\xe2\x80\xba",
        156 => "\xc5\x93",
        158 => "\xc5\xbe",
        159 => "\xc5\xb8"
    );

    /**
     * Function Encoding::toUTF8
     *
     * Leaves byte sequences that already look like UTF-8 untouched and converts
     * every other non-ASCII byte to UTF-8.
     *
     * It assumes that the encoding of the original string is either
     * Windows-1252 or ISO 8859-1.
     *
     * It may fail to convert characters to UTF-8 if they fall into one of
     * these scenarios:
     *
     * 1) when any of these characters: ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞß
     *    are followed by any of these: ("group B")
     *    ¡¢£¤¥¦§¨©ª«¬­®¯°±²³´µ¶•¸¹º»¼½¾¿
     *    For example: %ABREPRESENT%C9%BB. «REPRESENTÉ»
     *    The "«" (%AB) character will be converted, but the "É" followed by
     *    "»" (%C9%BB) is also a valid unicode character, and will be left
     *    unchanged.
     *
     * 2) when any of these: àáâãäåæçèéêëìíîï are followed by TWO chars from
     *    group B,
     * 3) when any of these: ðñòó are followed by THREE chars from group B.
     *
     * @name toUTF8
     * @param mixed $text A string, or an array whose values are converted
     *                    recursively. Any other value is returned unchanged.
     * @return mixed The same value with its string content UTF-8 encoded
     */
    public static function toUTF8($text)
    {
        if (is_array($text)) {
            foreach ($text as $k => $v) {
                $text[$k] = self::toUTF8($v);
            }

            return $text;
        }

        if (!is_string($text)) {
            return $text;
        }

        $max = strlen($text);
        $buf = '';

        for ($i = 0; $i < $max; $i++) {
            // Square-bracket offsets: the `$text{$i}` form no longer parses on PHP 8.
            $c1 = $text[$i];
            $byte = ord($c1);

            // Plain ASCII never needs conversion.
            if ($byte < 0x80) {
                $buf .= $c1;
                continue;
            }

            // A continuation byte (10xxxxxx) with no lead byte in front of it:
            // treat it as a single Windows-1252 / ISO 8859-1 character.
            if ($byte < 0xc0) {
                $buf .= isset(self::$win1252ToUtf8[$byte])
                    ? self::$win1252ToUtf8[$byte]
                    : self::convertInvalidCharacter($c1);
                continue;
            }

            // Lead byte: work out how long the UTF-8 sequence claims to be.
            if ($byte <= 0xdf) {
                $length = 2;
            } elseif ($byte <= 0xef) {
                $length = 3;
            } elseif ($byte <= 0xf7) {
                $length = 4;
            } else {
                $length = 0; // 0xf8-0xff can never start a UTF-8 sequence
            }

            if ($length > 0 && self::hasContinuationBytes($text, $i + 1, $length - 1, $max)) {
                // Almost certainly UTF-8 already: copy the WHOLE sequence and
                // skip all of its continuation bytes (including the 4th byte
                // of a 4-byte sequence).
                $buf .= substr($text, $i, $length);
                $i += $length - 1;
            } else {
                // Not valid UTF-8: convert the lead byte on its own; the bytes
                // after it are examined by the following iterations.
                $buf .= self::convertInvalidCharacter($c1);
            }
        }

        return $buf;
    }

    /**
     * Tells whether $count UTF-8 continuation bytes (0x80-0xbf) start at $start.
     *
     * @param string $text  Text being scanned
     * @param int    $start Offset of the first expected continuation byte
     * @param int    $count Number of continuation bytes required
     * @param int    $max   Byte length of $text
     * @return bool False when the text ends too early or a byte is not a continuation byte
     */
    protected static function hasContinuationBytes($text, $start, $count, $max)
    {
        if ($start + $count > $max) {
            return false;
        }

        for ($j = $start; $j < $start + $count; $j++) {
            if ((ord($text[$j]) & 0xc0) !== 0x80) {
                return false;
            }
        }

        return true;
    }

    /**
     * Encodes one byte as a two-byte UTF-8 sequence, i.e. treats the byte
     * value as the code point of the character.
     *
     * @param string $c1 The byte to convert (only the first byte is used)
     * @return string Two bytes: 110000xx 10xxxxxx
     */
    public static function convertInvalidCharacter($c1)
    {
        $byte = ord($c1);

        // Integer shift instead of `/ 64`: chr() must not be given a fractional float.
        return chr(0xc0 | ($byte >> 6)) . chr(0x80 | ($byte & 0x3f));
    }

    /**
     * Converts $input to UTF-8 according to the declared source encoding.
     *
     * - 'utf-8': the input is returned untouched;
     * - 'windows-1251' and 'windows-1255': delegated to iconv() with
     *   transliteration enabled;
     * - anything else: handled by toUTF8(), which assumes Windows-1252 or
     *   ISO 8859-1.
     *
     * The encoding name is matched as written by the switch below, so callers
     * are presumably expected to pass it in lower case.
     *
     * @param string $input    Text to convert
     * @param string $encoding Name of the source encoding
     * @return mixed The converted text (whatever iconv() returns for the
     *               iconv-handled encodings)
     */
    public static function convert($input, $encoding)
    {
        switch ($encoding) {
            case 'utf-8':
                return $input;
            case 'windows-1251':
            case 'windows-1255':
                return iconv($encoding, 'UTF-8//TRANSLIT', $input);
            default:
                return self::toUTF8($input);
        }
    }
}