blob: 9e7acc0e0a7560484704c803a6da57f79cacdee8 (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
|
from mlscraper.html import Page
from mlscraper.selectors import generate_unique_selectors_for_nodes
def get_css_selectors_for_node(node):
    """
    Return the plain CSS rules of the selectors generated for one node.

    The node is wrapped in a single-element list and handed to
    ``generate_unique_selectors_for_nodes`` together with ``None`` and ``100``;
    the ``css_rule`` attribute of every yielded selector is collected, in the
    order the selectors are yielded.
    """
    css_rules = []
    for candidate in generate_unique_selectors_for_nodes([node], None, 100):
        css_rules.append(candidate.css_rule)
    return css_rules
class TestGenerateUniqueSelectorsForNodes:
    """Checks for generate_unique_selectors_for_nodes on small inline HTML pages."""

    def test_basic(self):
        """Two pages sharing a ``p.test`` node: class-based rules are found, bare tags are not."""
        first_page = Page(
            '<html><body><p class="test">test</p><p>bla</p></body></html>'
        )
        second_page = Page(
            '<html><body><div></div><p class="test">hallo</p></body></html>'
        )
        # One sample node per page: the first match for "p.test".
        samples = [pg.select("p.test")[0] for pg in (first_page, second_page)]

        css_rules = []
        for found in generate_unique_selectors_for_nodes(samples, None, 1):
            css_rules.append(found.css_rule)

        assert "p" not in css_rules
        assert "div" not in css_rules
        assert ".test" in css_rules
        assert "p.test" in css_rules

    def test_ids(self):
        """A node carrying an id yields exactly the single rule ``#target``."""
        # Adjacent bytes literals; the resulting value starts with a newline
        # and has no trailing newline.
        markup = (
            b"\n"
            b"<html><body>\n"
            b'<div id="target">test</div>\n'
            b"<div>irrelevant</div>\n"
            b"</body></html>"
        )
        page = Page(markup)
        target = page.select("#target")[0]
        assert get_css_selectors_for_node(target) == ["#target"]

    def test_multi_parents(self):
        """A ``p`` nested in ``#target`` gets the descendant rule ``#target p``."""
        page = Page(b'<html><body><div id="target"><p>test</p></div><div><p></p></div>')
        container = page.select("#target")[0]
        paragraph = container.select("p")[0]
        assert "#target p" in get_css_selectors_for_node(paragraph)
|