summary | refs | log | tree | commit | diff | stats
path: root/tests/test_selectors.py
blob: 9e7acc0e0a7560484704c803a6da57f79cacdee8 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
from mlscraper.html import Page
from mlscraper.selectors import generate_unique_selectors_for_nodes


def get_css_selectors_for_node(node):
    """
    helper returning the plain css rule of every unique selector
    generated for a single node, as a list
    """
    # the generator takes a collection of nodes, so the single node is wrapped
    # NOTE(review): the meaning of the None / 100 arguments is not visible
    # from this file -- confirm against mlscraper.selectors
    css_rules = []
    for selector in generate_unique_selectors_for_nodes([node], None, 100):
        css_rules.append(selector.css_rule)
    return css_rules


class TestGenerateUniqueSelectorsForNodes:
    """
    tests for generate_unique_selectors_for_nodes
    """

    def test_basic(self):
        """
        selectors generated for the p.test nodes of two pages must include
        the class-based rules and exclude the bare tag rules
        """
        page1_html = '<html><body><p class="test">test</p><p>bla</p></body></html>'
        page2_html = '<html><body><div></div><p class="test">hallo</p></body></html>'
        pages = [Page(page1_html), Page(page2_html)]

        # first p.test match on each page is the sample node
        nodes = [page.select("p.test")[0] for page in pages]
        css_rules = [
            selector.css_rule
            for selector in generate_unique_selectors_for_nodes(nodes, None, 1)
        ]

        # bare tag names must not be offered
        for unwanted_rule in ("p", "div"):
            assert unwanted_rule not in css_rules

        # rules using the class must be offered
        for wanted_rule in (".test", "p.test"):
            assert wanted_rule in css_rules

    def test_ids(self):
        """
        a node carrying an id yields exactly the id selector
        """
        markup = b"""
            <html><body>
                <div id="target">test</div>
                <div>irrelevant</div>
            </body></html>"""
        page = Page(markup)
        target_node = page.select("#target")[0]
        assert get_css_selectors_for_node(target_node) == ["#target"]

    def test_multi_parents(self):
        """
        a p nested in an id'd div can be addressed through that parent
        """
        markup = b'<html><body><div id="target"><p>test</p></div><div><p></p></div>'
        page = Page(markup)
        parent_node = page.select("#target")[0]
        paragraph_node = parent_node.select("p")[0]
        css_rules = get_css_selectors_for_node(paragraph_node)
        assert "#target p" in css_rules