diff options
author | Karl Lorey <git@karllorey.com> | 2022-06-24 01:10:29 +0200 |
---|---|---|
committer | Karl Lorey <git@karllorey.com> | 2022-06-24 01:10:29 +0200 |
commit | 42e19d25d5c21e20250d3f327f66a90b7846d57a (patch) | |
tree | d7691f33ba788a50a42a27ab94c56f6e4261767c | |
parent | d567ea3ecf5f1bf7e42e4d93def6052b7adc5fa2 (diff) |
Add nth-child selector generation
-rw-r--r-- | mlscraper/selectors.py | 25 | ||||
-rw-r--r-- | tests/test_selectors.py | 23 |
2 files changed, 45 insertions, 3 deletions
diff --git a/mlscraper/selectors.py b/mlscraper/selectors.py
index 4587274..4e1a7c0 100644
--- a/mlscraper/selectors.py
+++ b/mlscraper/selectors.py
@@ -103,12 +103,31 @@ def generate_selectors_for_nodes(nodes: list[Node], roots, complexity: int):
 
 
 def _generate_direct_css_selectors_for_nodes(nodes: list[Node]):
+    # pseudo classes apply to already generated selectors
+    # and can thus be applied in retrospect
+
+    # see: https://developer.mozilla.org/en-US/docs/Web/CSS/:nth-child
+    for css_selector in _generate_direct_css_selectors_for_nodes_without_pseudo(nodes):
+        yield css_selector
+
+    # pull to the end as far as possible
+    for css_selector in _generate_direct_css_selectors_for_nodes_without_pseudo(nodes):
+        if all(n.tag_name not in ["html", "body"] for n in nodes):
+            child_indexes = [n.parent.select(css_selector).index(n) for n in nodes]
+            if len(set(child_indexes)) == 1:
+                # nth is indexed with 1
+                nth = 1 + child_indexes[0]
+                yield f"{css_selector}:nth-child({nth})"
+
+
+def _generate_direct_css_selectors_for_nodes_without_pseudo(nodes: list[Node]):
     common_classes = set.intersection(*[set(n.classes) for n in nodes])
 
     # check for same tag name
     is_same_tag = len({n.tag_name for n in nodes}) == 1
-    common_tag_name = nodes[0].tag_name
-    yield common_tag_name
+    common_tag_name = nodes[0].tag_name if is_same_tag else None
+    if common_tag_name:
+        yield common_tag_name
 
     # check for common id
     common_ids = {n.id for n in nodes}
@@ -123,7 +142,7 @@ def _generate_direct_css_selectors_for_nodes(nodes: list[Node]):
             yield css_selector
 
             # if same tag name, also yield tag_name + selector
-            if is_same_tag:
+            if common_tag_name:
                 yield common_tag_name + css_selector
         else:
             # empty combination -> ignore
diff --git a/tests/test_selectors.py b/tests/test_selectors.py
index 4038dd9..eace894 100644
--- a/tests/test_selectors.py
+++ b/tests/test_selectors.py
@@ -32,6 +32,19 @@ class TestGenerateUniqueSelectorsForNodes:
         assert "p.test" in selectors_found
         assert "body > p.test" in selectors_found
 
+    def test_nth(self):
+        html = b"""<html><body>
+        <ul><li>target</li><li>noise</li></ul>
+        <ul><li>target</li><li>noise</li></ul>
+        </body></html>"""
+        page = Page(html)
+        first_li_tags = [ul.select("li")[0] for ul in page.select("ul")]
+        unique_selectors = [
+            s.css_rule
+            for s in generate_unique_selectors_for_nodes(first_li_tags, None, 100)
+        ]
+        assert "li:nth-child(1)" in unique_selectors
+
     def test_ids(self):
         page = Page(
             b"""
@@ -64,3 +77,13 @@ class TestGenerateDirectCssSelectorsForNodes:
 
         assert "div[itemprop]" in direct_css_selectors
         assert 'div[itemprop="user"]' in direct_css_selectors
+
+    def test_nth(self):
+        html = b"""<html><body>
+        <ul><li>target</li><li>noise</li></ul>
+        <ul><li>target</li><li>noise</li></ul>
+        </body></html>"""
+        page = Page(html)
+        first_li_tags = [ul.select("li")[0] for ul in page.select("ul")]
+        unique_selectors = list(_generate_direct_css_selectors_for_nodes(first_li_tags))
+        assert "li:nth-child(1)" in unique_selectors