diff options
author | Karl Lorey <git@karllorey.com> | 2022-06-20 11:12:00 +0200 |
---|---|---|
committer | Karl Lorey <git@karllorey.com> | 2022-06-20 11:12:00 +0200 |
commit | c6c371223d56f23ad4a588231b5e6f51bee4259c (patch) | |
tree | 45d68736dfce2c8bee9d061e19224161f3916332 /tests/test_selectors.py | |
parent | 3a4c3234653984768a992747ba45da5e34d3af9c (diff) |
Re-implement selector generation with a speedup >10x (branch: develop)
Diffstat (limited to 'tests/test_selectors.py')
-rw-r--r-- | tests/test_selectors.py | 56 |
1 file changed, 42 insertions, 14 deletions
diff --git a/tests/test_selectors.py b/tests/test_selectors.py index 3e4570a..9e7acc0 100644 --- a/tests/test_selectors.py +++ b/tests/test_selectors.py @@ -1,21 +1,49 @@ from mlscraper.html import Page -from mlscraper.selectors import generate_selector_for_nodes +from mlscraper.selectors import generate_unique_selectors_for_nodes -def test_generate_selector_for_nodes(): - page1_html = '<html><body><p class="test">test</p><p>bla</p></body></html>' - page1 = Page(page1_html) +def get_css_selectors_for_node(node): + """ + helper to extract plain css rules + """ + return [ + selector.css_rule + for selector in generate_unique_selectors_for_nodes([node], None, 100) + ] - page2_html = '<html><body><div></div><p class="test">hallo</p></body></html>' - page2 = Page(page2_html) - nodes = list(map(lambda p: p.select("p.test")[0], [page1, page2])) - gen = generate_selector_for_nodes(nodes, None, 1) - selectors_found = [sel.css_rule for sel in gen] - assert {".test", "p.test"} == set(selectors_found) +class TestGenerateUniqueSelectorsForNodes: + def test_basic(self): + page1_html = '<html><body><p class="test">test</p><p>bla</p></body></html>' + page1 = Page(page1_html) + page2_html = '<html><body><div></div><p class="test">hallo</p></body></html>' + page2 = Page(page2_html) -class TestGenerateSelectorForNodes: - def test_generate_selector_for_nodes(self): - # generate_selector_for_nodes() - pass + nodes = list(map(lambda p: p.select("p.test")[0], [page1, page2])) + gen = generate_unique_selectors_for_nodes(nodes, None, 1) + selectors_found = [sel.css_rule for sel in gen] + + assert "p" not in selectors_found + assert "div" not in selectors_found + + assert ".test" in selectors_found + assert "p.test" in selectors_found + + def test_ids(self): + page = Page( + b""" + <html><body> + <div id="target">test</div> + <div>irrelevant</div> + </body></html>""" + ) + node = page.select("#target")[0] + selectors = get_css_selectors_for_node(node) + assert selectors == ["#target"] + + 
def test_multi_parents(self): + page = Page(b'<html><body><div id="target"><p>test</p></div><div><p></p></div>') + node = page.select("#target")[0].select("p")[0] + selectors = get_css_selectors_for_node(node) + assert "#target p" in selectors |