summary | refs | log | tree | commit | diff | stats
path: root/tests/test_selectors.py
diff options
context:
space:
mode:
author: Karl Lorey <git@karllorey.com> 2022-06-20 11:12:00 +0200
committer: Karl Lorey <git@karllorey.com> 2022-06-20 11:12:00 +0200
commit: c6c371223d56f23ad4a588231b5e6f51bee4259c (patch)
tree: 45d68736dfce2c8bee9d061e19224161f3916332 /tests/test_selectors.py
parent: 3a4c3234653984768a992747ba45da5e34d3af9c (diff)
Re-implement selector generation with a speedup >10x [develop]
Diffstat (limited to 'tests/test_selectors.py')
-rw-r--r-- tests/test_selectors.py 56
1 files changed, 42 insertions, 14 deletions
diff --git a/tests/test_selectors.py b/tests/test_selectors.py
index 3e4570a..9e7acc0 100644
--- a/tests/test_selectors.py
+++ b/tests/test_selectors.py
@@ -1,21 +1,49 @@
from mlscraper.html import Page
-from mlscraper.selectors import generate_selector_for_nodes
+from mlscraper.selectors import generate_unique_selectors_for_nodes
-def test_generate_selector_for_nodes():
- page1_html = '<html><body><p class="test">test</p><p>bla</p></body></html>'
- page1 = Page(page1_html)
+def get_css_selectors_for_node(node):
+ """
+ helper to extract plain css rules
+ """
+ return [
+ selector.css_rule
+ for selector in generate_unique_selectors_for_nodes([node], None, 100)
+ ]
- page2_html = '<html><body><div></div><p class="test">hallo</p></body></html>'
- page2 = Page(page2_html)
- nodes = list(map(lambda p: p.select("p.test")[0], [page1, page2]))
- gen = generate_selector_for_nodes(nodes, None, 1)
- selectors_found = [sel.css_rule for sel in gen]
- assert {".test", "p.test"} == set(selectors_found)
+class TestGenerateUniqueSelectorsForNodes:
+ def test_basic(self):
+ page1_html = '<html><body><p class="test">test</p><p>bla</p></body></html>'
+ page1 = Page(page1_html)
+ page2_html = '<html><body><div></div><p class="test">hallo</p></body></html>'
+ page2 = Page(page2_html)
-class TestGenerateSelectorForNodes:
- def test_generate_selector_for_nodes(self):
- # generate_selector_for_nodes()
- pass
+ nodes = list(map(lambda p: p.select("p.test")[0], [page1, page2]))
+ gen = generate_unique_selectors_for_nodes(nodes, None, 1)
+ selectors_found = [sel.css_rule for sel in gen]
+
+ assert "p" not in selectors_found
+ assert "div" not in selectors_found
+
+ assert ".test" in selectors_found
+ assert "p.test" in selectors_found
+
+ def test_ids(self):
+ page = Page(
+ b"""
+ <html><body>
+ <div id="target">test</div>
+ <div>irrelevant</div>
+ </body></html>"""
+ )
+ node = page.select("#target")[0]
+ selectors = get_css_selectors_for_node(node)
+ assert selectors == ["#target"]
+
+ def test_multi_parents(self):
+ page = Page(b'<html><body><div id="target"><p>test</p></div><div><p></p></div>')
+ node = page.select("#target")[0].select("p")[0]
+ selectors = get_css_selectors_for_node(node)
+ assert "#target p" in selectors