1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
|
<?php
namespace PicoFeed\Filter;
/**
* Filter class
*
* @author Frederic Guillot
* @package Filter
*/
class Filter
{
    /**
     * Create an Html filter for a piece of markup.
     *
     * @static
     * @access public
     * @param  string  $html      HTML content
     * @param  string  $website   Site URL (used to build absolute URL)
     * @return Html
     */
    public static function html($html, $website)
    {
        return new Html($html, $website);
    }

    /**
     * Escape HTML special characters.
     *
     * Single and double quotes are both converted (ENT_QUOTES), the input is
     * treated as UTF-8 and entities already present in the input are not
     * encoded a second time. Diagnostics raised by htmlspecialchars() are
     * deliberately silenced with the @ operator.
     *
     * @static
     * @access public
     * @param  string  $content   Raw content
     * @return string
     */
    public static function escape($content)
    {
        return @htmlspecialchars($content, ENT_QUOTES, 'UTF-8', false);
    }

    /**
     * Remove the document wrapper tags.
     *
     * Drops the doctype declaration and the opening/closing html, head and
     * body tags, together with the whitespace that follows each of them.
     * Whatever sits between those tags is kept. This is an instance method,
     * unlike the other helpers of this class.
     *
     * @access public
     * @param  string  $data    Input data
     * @return string
     */
    public function removeHTMLTags($data)
    {
        // The lookahead makes sure the tag name ends right there, so that
        // elements such as <header> are not mistaken for <head>.
        return preg_replace('~<(?:!DOCTYPE|/?(?:html|head|body)(?=[\s/>]))[^>]*>\s*~i', '', $data);
    }

    /**
     * Remove the XML declaration and leading xml-stylesheet instructions.
     *
     * Everything up to and including the first instruction terminator is
     * dropped and the remainder is left-trimmed. When no terminator exists
     * the data is returned untouched instead of being truncated.
     *
     * @static
     * @access public
     * @param  string  $data    Input data
     * @return string
     */
    public static function stripXmlTag($data)
    {
        if (strpos($data, '<?xml') !== false) {
            $end = strpos($data, '?>');

            // strpos() yields false when the declaration is never closed;
            // adding 2 to it would silently eat the first two bytes.
            if ($end !== false) {
                $data = ltrim(substr($data, $end + 2));
            }
        }

        do {
            $pos = strpos($data, '<?xml-stylesheet ');
            $end = $pos === false ? false : strpos($data, '?>');

            // Nothing left to strip, or an unterminated instruction.
            if ($end === false) {
                break;
            }

            $data = ltrim(substr($data, $end + 2));

            // Keep looking only while the instruction was found near the start.
        } while ($pos < 200);

        return $data;
    }

    /**
     * Strip the head element (tags and content) from the HTML content.
     *
     * The match is case-insensitive and spans several lines. If the input is
     * not valid UTF-8 the Unicode-aware pattern fails, so the replacement is
     * retried in byte mode; should that fail as well, the input is returned
     * unchanged rather than null.
     *
     * @static
     * @access public
     * @param  string  $data    Input data
     * @return string
     */
    public static function stripHeadTags($data)
    {
        $stripped = preg_replace('@<head[^>]*?>.*?</head>@siu', '', $data);

        if ($stripped === null) {
            // Same pattern without the "u" modifier: tolerant of broken UTF-8.
            $stripped = preg_replace('@<head[^>]*?>.*?</head>@si', '', $data);
        }

        return $stripped === null ? $data : $stripped;
    }

    /**
     * Normalize whitespace without breaking UTF-8 strings.
     *
     * Every carriage return, tab and line feed is turned into a single space
     * (consecutive spaces are NOT collapsed), then the result is trimmed at
     * both ends. Plain byte replacement is used on purpose: an earlier
     * regex-based approach (replacing \s+ with a space) broke UTF-8 data.
     *
     * @static
     * @access public
     * @param  string  $value    Raw data
     * @return string            Normalized data
     */
    public static function stripWhiteSpace($value)
    {
        return trim(str_replace(array("\r", "\t", "\n"), ' ', $value));
    }

    /**
     * Dirty quickfixes before XML parsing.
     *
     * Removes a fixed list of byte sequences from the data, in the order in
     * which they are listed.
     *
     * @static
     * @access public
     * @param  string  $data    Raw data
     * @return string           Normalized data
     */
    public static function normalizeData($data)
    {
        $invalid_chars = array(
            "\x10",
            "\xc3\x20",
            // NOTE(review): this needle is an empty string as seen here, which
            // str_replace() ignores. Possibly a character lost in transit - confirm.
            "",
            "\xe2\x80\x9c\x08",
        );

        // With an array of needles, replacements are applied one after another.
        return str_replace($invalid_chars, '', $data);
    }
}
|