summaryrefslogtreecommitdiffstats
path: root/vendor/fguillot/picofeed/lib/PicoFeed/Filter/Filter.php
blob: e3e4ad36bb03bce10914662f1c4af3eb338dc7d0 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
<?php

namespace PicoFeed\Filter;

/**
 * Filter class
 *
 * Collection of small string helpers used to clean up HTML/XML content.
 *
 * @author  Frederic Guillot
 * @package Filter
 */
class Filter
{
    /**
     * Get the Html filter instance
     *
     * @static
     * @access public
     * @param  string  $html      HTML content
     * @param  string  $website   Site URL (used to build absolute URL)
     * @return Html
     */
    public static function html($html, $website)
    {
        return new Html($html, $website);
    }

    /**
     * Escape HTML content
     *
     * Quotes (single and double) are converted and existing entities are
     * left untouched (double encoding is disabled).
     *
     * @static
     * @access public
     * @param  string  $content   Raw content
     * @return string
     */
    public static function escape($content)
    {
        // Warnings raised by htmlspecialchars() are deliberately silenced here
        return @htmlspecialchars($content, ENT_QUOTES, 'UTF-8', false);
    }

    /**
     * Remove the doctype and the html, head and body tags (opening and closing)
     *
     * Only the tags themselves and the whitespace that follows them are
     * removed, the content between the tags is kept.
     * Unlike the other helpers of this class, this is an instance method.
     *
     * @access public
     * @param  string  $data  Input data
     * @return string
     */
    public function removeHTMLTags($data)
    {
        // \b stops "head" from matching longer tag names such as <header>
        return preg_replace('~<(?:!DOCTYPE|/?(?:html|head|body))\b[^>]*>\s*~i', '', $data);
    }

    /**
     * Remove the XML tag from a document
     *
     * Drops everything up to the end of the first processing instruction
     * when an XML declaration is present, then does the same for
     * xml-stylesheet instructions. Data without a processing instruction
     * terminator is returned untouched.
     *
     * @static
     * @access public
     * @param  string  $data  Input data
     * @return string
     */
    public static function stripXmlTag($data)
    {
        $terminator = '?'.'>';

        if (strpos($data, '<?xml') !== false) {
            $end = strpos($data, $terminator);

            // Without this check, false + 2 would chop the first two bytes of the data
            if ($end !== false) {
                $data = ltrim(substr($data, $end + 2));
            }
        }

        do {

            $pos = strpos($data, '<?xml-stylesheet ');

            if ($pos === false) {
                break;
            }

            $end = strpos($data, $terminator);

            // Unterminated instruction: nothing can be stripped safely, stop here
            if ($end === false) {
                break;
            }

            $data = ltrim(substr($data, $end + 2));

        } while ($pos < 200);

        return $data;
    }

    /**
     * Strip head tag from the HTML content
     *
     * The head element is removed together with everything it contains.
     *
     * @static
     * @access public
     * @param  string  $data  Input data
     * @return string
     */
    public static function stripHeadTags($data)
    {
        // \b prevents the match from starting on a <header> element
        return preg_replace('@<head\b[^>]*?>.*?</head>@siu', '', $data);
    }

    /**
     * Trim whitespace from the beginning and the end of a string and replace
     * carriage returns, tabs and line feeds by a space, without breaking utf-8 strings
     *
     * @static
     * @access public
     * @param  string  $value  Raw data
     * @return string          Normalized data
     */
    public static function stripWhiteSpace($value)
    {
        // A regex like /\s+/ is avoided on purpose: it breaks utf-8 strings
        $value = str_replace(array("\r", "\t", "\n"), ' ', $value);

        return trim($value);
    }

    /**
     * Fixes before XML parsing
     *
     * Removes numeric entities (decimal and hexadecimal) that reference a
     * code point not allowed by XML 1.0, then removes every raw character
     * outside of the valid XML 1.0 ranges.
     *
     * @static
     * @access public
     * @param  string  $data Raw data
     * @return string        Normalized data
     */
    public static function normalizeData($data)
    {
        $entities = array(
            '/(&#)(\d+);/m', // decimal encoded
            '/(&#x)([a-f0-9]+);/mi', // hex encoded
        );

        // strip invalid XML 1.0 characters which are encoded as entities
        $data = preg_replace_callback($entities, function ($matches) {
            // The value is compared as a plain number (int or float) and never
            // cast to int: very long entities overflow the integer range and
            // the result of casting an out-of-range float is undefined
            if (strtolower($matches[1]) === '&#x') {
                $code_point = hexdec($matches[2]);
            } else {
                $code_point = $matches[2] + 0;
            }

            // replace invalid characters
            if ($code_point < 9
                || ($code_point > 10 && $code_point < 13)
                || ($code_point > 13 && $code_point < 32)
                || ($code_point > 55295 && $code_point < 57344)
                || ($code_point > 65533 && $code_point < 65536)
                || $code_point > 1114111
            ) {
                return '';
            }

            return $matches[0];
        }, $data);

        // strip every utf-8 character that isn't in the range of valid XML 1.0 characters
        // (preg_replace returns null on failure, the cast turns that into an empty string)
        return (string) preg_replace('/[^\x{0009}\x{000A}\x{000D}\x{0020}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}]/u', '', $data);
    }
}