summaryrefslogtreecommitdiffstats
path: root/vendor/fguillot/picofeed/lib/PicoFeed/Encoding/Encoding.php
blob: 7739def5f4acdab2c037291649e163dfb675d2fa (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
<?php

namespace PicoFeed\Encoding;

/**
 * @author   "Sebastián Grignoli" <grignoli@framework2.com.ar>
 * @package  Encoding
 * @version  1.2
 * @link     https://github.com/neitanod/forceutf8
 * @example  https://github.com/neitanod/forceutf8
 * @license  Revised BSD
 */
class Encoding
{
    /**
     * Windows-1252 bytes 0x80-0x9F (keyed by decimal byte value) mapped to the
     * UTF-8 encoding of the printable character they represent in that code page.
     *
     * Bytes that Windows-1252 leaves undefined (129, 141, 143, 144, 157) are
     * intentionally absent; toUTF8() falls back to convertInvalidCharacter() for them.
     */
    protected static $win1252ToUtf8 = array(
        128 => "\xe2\x82\xac",
        130 => "\xe2\x80\x9a",
        131 => "\xc6\x92",
        132 => "\xe2\x80\x9e",
        133 => "\xe2\x80\xa6",
        134 => "\xe2\x80\xa0",
        135 => "\xe2\x80\xa1",
        136 => "\xcb\x86",
        137 => "\xe2\x80\xb0",
        138 => "\xc5\xa0",
        139 => "\xe2\x80\xb9",
        140 => "\xc5\x92",
        142 => "\xc5\xbd",
        145 => "\xe2\x80\x98",
        146 => "\xe2\x80\x99",
        147 => "\xe2\x80\x9c",
        148 => "\xe2\x80\x9d",
        149 => "\xe2\x80\xa2",
        150 => "\xe2\x80\x93",
        151 => "\xe2\x80\x94",
        152 => "\xcb\x9c",
        153 => "\xe2\x84\xa2",
        154 => "\xc5\xa1",
        155 => "\xe2\x80\xba",
        156 => "\xc5\x93",
        158 => "\xc5\xbe",
        159 => "\xc5\xb8"
    );

    /**
     * Function Encoding::toUTF8
     *
     * Leaves byte sequences that already look like UTF-8 alone and converts every
     * other high byte to UTF-8, assuming the original encoding is either
     * Windows-1252 or ISO 8859-1.
     *
     * Arrays are converted element by element (recursively, keys are kept);
     * values that are neither arrays nor strings are returned unchanged.
     *
     * Because the detection is purely structural, a run of ISO 8859-1 / Windows-1252
     * bytes that happens to form a well-formed UTF-8 sequence is left untouched:
     *
     * 1) a byte 0xC0-0xDF followed by ONE byte in 0x80-0xBF.
     *    For example:   %ABREPRESENT%C9%BB
     *    The leading %AB is converted, but %C9%BB is also a valid 2-byte UTF-8
     *    sequence and is therefore left unchanged.
     * 2) a byte 0xE0-0xEF followed by TWO bytes in 0x80-0xBF,
     * 3) a byte 0xF0-0xF7 followed by THREE bytes in 0x80-0xBF.
     *
     * @name toUTF8
     * @param mixed $text  Any string, or an array of values to convert.
     * @return mixed  The same value with strings UTF-8 encoded
     */
    public static function toUTF8($text)
    {
        if (is_array($text)) {
            foreach ($text as $k => $v) {
                $text[$k] = self::toUTF8($v);
            }

            return $text;
        }

        if (! is_string($text)) {
            return $text;
        }

        $max = strlen($text);
        $buf = '';

        for ($i = 0; $i < $max; $i++) {

            $c1 = $text[$i];

            if ($c1 >= "\xc0") { // Should be converted to UTF-8, if it's not UTF-8 already

                // Look ahead up to three bytes; past the end of the string we
                // substitute NUL, which never counts as a continuation byte.
                $c2 = $i + 1 >= $max ? "\x00" : $text[$i + 1];
                $c3 = $i + 2 >= $max ? "\x00" : $text[$i + 2];
                $c4 = $i + 3 >= $max ? "\x00" : $text[$i + 3];

                if ($c1 <= "\xdf") { // 0xC0-0xDF: looks like 2 bytes UTF-8

                    if (self::isContinuationByte($c2)) { // almost sure it's UTF-8 already
                        $buf .= $c1 . $c2;
                        $i++;
                    } else { // not valid UTF-8. Convert it.
                        $buf .= self::convertInvalidCharacter($c1);
                    }
                } elseif ($c1 <= "\xef") { // 0xE0-0xEF: looks like 3 bytes UTF-8

                    if (self::isContinuationByte($c2) && self::isContinuationByte($c3)) {
                        $buf .= $c1 . $c2 . $c3;
                        $i += 2;
                    } else { // not valid UTF-8. Convert it.
                        $buf .= self::convertInvalidCharacter($c1);
                    }
                } elseif ($c1 <= "\xf7") { // 0xF0-0xF7: looks like 4 bytes UTF-8

                    if (self::isContinuationByte($c2)
                        && self::isContinuationByte($c3)
                        && self::isContinuationByte($c4)
                    ) {
                        // Copy all four bytes and skip the three continuation
                        // bytes; otherwise the last one would be re-encoded as
                        // a stray Windows-1252 character on the next iteration.
                        $buf .= $c1 . $c2 . $c3 . $c4;
                        $i += 3;
                    } else { // not valid UTF-8. Convert it.
                        $buf .= self::convertInvalidCharacter($c1);
                    }
                } else { // 0xF8-0xFF: doesn't look like UTF-8, but should be converted
                    $buf .= self::convertInvalidCharacter($c1);
                }
            } elseif (($c1 & "\xc0") === "\x80") { // 0x80-0xBF on its own: needs conversion

                $code = ord($c1);

                if (isset(self::$win1252ToUtf8[$code])) { // found in Windows-1252 special cases
                    $buf .= self::$win1252ToUtf8[$code];
                } else {
                    $buf .= self::convertInvalidCharacter($c1);
                }
            } else { // plain ASCII, it doesn't need conversion
                $buf .= $c1;
            }
        }

        return $buf;
    }

    /**
     * Encode a single byte as the 2-byte UTF-8 sequence of the code point with
     * the same numeric value (i.e. treat the byte as ISO 8859-1).
     *
     * Only the first byte of $c1 is considered.
     *
     * @param string $c1  A one-byte string
     * @return string  Two bytes: 110000xx 10xxxxxx
     */
    public static function convertInvalidCharacter($c1)
    {
        $code = ord($c1);

        // Integer shift instead of "/ 64": avoids handing a fractional float to chr().
        return chr(0xc0 | ($code >> 6)) . chr(0x80 | ($code & 0x3f));
    }

    /**
     * Convert $input to UTF-8 according to a declared source encoding.
     *
     * - 'utf-8': returned untouched.
     * - 'windows-1251' / 'windows-1255': transcoded with iconv() using
     *   transliteration; iconv() returns false on failure and that value is
     *   passed through to the caller.
     * - anything else: handled heuristically by toUTF8().
     *
     * The encoding name is matched case-sensitively.
     *
     * @param mixed  $input     Value to convert (normally a string)
     * @param string $encoding  Lower-case source encoding name
     * @return mixed  The converted value
     */
    public static function convert($input, $encoding)
    {
        switch ($encoding) {
            case 'utf-8':
                return $input;
            case 'windows-1251':
            case 'windows-1255':
                return iconv($encoding, 'UTF-8//TRANSLIT', $input);
            default:
                return self::toUTF8($input);
        }
    }

    /**
     * Tell whether a byte is a UTF-8 continuation byte (0x80-0xBF).
     *
     * @param string $byte  A one-byte string
     * @return bool
     */
    private static function isContinuationByte($byte)
    {
        return $byte >= "\x80" && $byte <= "\xbf";
    }
}