1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
|
from mlscraper.html import get_root_node
from mlscraper.html import HTMLExactTextMatch
from mlscraper.html import Page
from mlscraper.html import selector_matches_nodes
from mlscraper.matches import AttributeValueExtractor
def test_get_root_nodes():
    """get_root_node returns the <div> that encloses both selected nodes."""
    markup = b'<html><body><div><p id="one"></p><p><span id="two"></span></p></div></body></html>'
    page = Page(markup)
    # "#one" is a direct child of the div, "#two" sits one level deeper.
    targets = [page.select(css)[0] for css in ("#one", "#two")]
    common_root = get_root_node(targets)
    enclosing_div = page.select("div")[0]
    assert common_root == enclosing_div
def test_node_set():
    """The same <p> reached via the page and via its parent has an equal parent."""
    page = Page(b"<html><body><p>test</p></body></html>")
    via_page = page.select("p")[0]
    # Re-select the paragraph starting from its own parent node.
    via_parent = via_page.parent.select("p")[0]
    assert via_page.parent == via_parent.parent
class TestPage:
    """Page tests driven by the ``stackoverflow_samples`` argument.

    ``stackoverflow_samples`` is presumably a pytest fixture defined
    elsewhere; only its first sample's ``page`` is used here.
    """

    def test_select(self, stackoverflow_samples):
        """A CSS selector returns the vote-count nodes with the expected texts."""
        first_sample = stackoverflow_samples[0]
        vote_nodes = first_sample.page.select(".answer .js-vote-count")
        vote_texts = [node.text for node in vote_nodes]
        assert vote_texts == ["20", "16", "0"]

    def test_find_all(self, stackoverflow_samples):
        """Searching for a known user link yields a truthy result."""
        first_sample = stackoverflow_samples[0]
        found = first_sample.page.find_all("/users/624900/jterrace")
        assert found
def test_attribute_extractor():
    """An href extractor yields the attribute value, or None when it is absent."""
    page = Page(
        b'<html><body><a href="https://karllorey.com"></a><a>no link</a></body></html>'
    )
    href_extractor = AttributeValueExtractor("href")
    anchors = page.select("a")
    # First anchor carries an href, the second one does not.
    assert href_extractor.extract(anchors[0]) == "https://karllorey.com"
    assert href_extractor.extract(anchors[1]) is None
def test_extractor_factory():
    """Two extractors for the same attribute collapse to one set element."""
    # Each extractor must exist only once (as far as hashing/equality goes),
    # because extractor selection relies on it.
    first = AttributeValueExtractor("href")
    second = AttributeValueExtractor("href")
    distinct_extractors = {first, second}
    assert len(distinct_extractors) == 1
def test_equality():
    """Pages built from identical HTML compare equal but are distinct objects."""
    markup = b"<html><body><div><p></p></div></body></html>"

    # Value equality: two separately constructed pages are ``==``.
    left = Page(markup)
    right = Page(markup)
    assert left == right

    # Identity: constructing twice never hands back the very same object.
    another = Page(markup)
    yet_another = Page(markup)
    assert another is not yet_another
def test_select():
    """Selecting "p" on a page with two empty paragraphs returns two nodes."""
    page = Page(b"<html><body><p></p><p></p></body></html>")
    paragraphs = page.select("p")
    assert len(paragraphs) == 2
    # Uniqueness via a set (len(set(paragraphs)) == 2) is deliberately not
    # asserted: not used in practice.
def test_tag_name():
    """A selected <p> node reports "p" as its tag name."""
    page = Page(b"<html><body><p>bla</p></body></html>")
    paragraph = page.select("p")[0]
    assert paragraph.tag_name == "p"
def test_classes():
    """A node's classes are exposed as a list in attribute order."""
    page = Page(b'<html><body><p class="box bordered">bla</p></body></html>')
    paragraph = page.select("p")[0]
    expected_classes = ["box", "bordered"]
    assert paragraph.classes == expected_classes
def test_selector_matches_nodes():
    """selector_matches_nodes accepts document order and rejects reversed order."""
    page = Page(b"<html><body><p>1</p><p>2</p></body></html>")
    in_order = page.select("p")

    matches_in_order = selector_matches_nodes(page, "p", in_order)
    assert matches_in_order, "does not match properly ordered tags"

    # Same nodes, opposite order: the selector must no longer match.
    backwards = [*reversed(in_order)]
    matches_backwards = selector_matches_nodes(page, "p", backwards)
    assert not matches_backwards, "matches reversed order"
def test_find_text_with_whitespace():
    """Text padded only by whitespace is found as exact matches."""
    page = Page(b"<html><body><p> whitespace \n\t </p></body></html>")
    found = page.find_all("whitespace")
    # should match p, body, html (and document)
    assert len(found) == 4
    inexact = [m for m in found if not isinstance(m, HTMLExactTextMatch)]
    assert not inexact
def test_find_text_with_noise():
    """Text surrounded by other words must not be reported as an exact match."""
    page = Page(b"<html><body><p>bla karl bla</p></body></html>")
    found = page.find_all("karl")
    # NOTE(review): this passes vacuously when find_all returns nothing.
    assert not any(isinstance(m, HTMLExactTextMatch) for m in found)
|