summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Karl Lorey <git@karllorey.com> 2022-06-24 01:10:29 +0200
committer Karl Lorey <git@karllorey.com> 2022-06-24 01:10:29 +0200
commit 42e19d25d5c21e20250d3f327f66a90b7846d57a (patch)
tree d7691f33ba788a50a42a27ab94c56f6e4261767c
parent d567ea3ecf5f1bf7e42e4d93def6052b7adc5fa2 (diff)
Add nth-child selector generation
-rw-r--r-- mlscraper/selectors.py 25
-rw-r--r-- tests/test_selectors.py 23
2 files changed, 45 insertions, 3 deletions
diff --git a/mlscraper/selectors.py b/mlscraper/selectors.py
index 4587274..4e1a7c0 100644
--- a/mlscraper/selectors.py
+++ b/mlscraper/selectors.py
@@ -103,12 +103,31 @@ def generate_selectors_for_nodes(nodes: list[Node], roots, complexity: int):
def _generate_direct_css_selectors_for_nodes(nodes: list[Node]):
+ # pseudo classes apply to already generated selectors
+ # and can thus be applied in retrospect
+
+ # see: https://developer.mozilla.org/en-US/docs/Web/CSS/:nth-child
+ for css_selector in _generate_direct_css_selectors_for_nodes_without_pseudo(nodes):
+ yield css_selector
+
+ # pull to the end as far as possible
+ for css_selector in _generate_direct_css_selectors_for_nodes_without_pseudo(nodes):
+ if all(n.tag_name not in ["html", "body"] for n in nodes):
+ child_indexes = [n.parent.select(css_selector).index(n) for n in nodes]
+ if len(set(child_indexes)) == 1:
+ # nth is indexed with 1
+ nth = 1 + child_indexes[0]
+ yield f"{css_selector}:nth-child({nth})"
+
+
+def _generate_direct_css_selectors_for_nodes_without_pseudo(nodes: list[Node]):
common_classes = set.intersection(*[set(n.classes) for n in nodes])
# check for same tag name
is_same_tag = len({n.tag_name for n in nodes}) == 1
- common_tag_name = nodes[0].tag_name
- yield common_tag_name
+ common_tag_name = nodes[0].tag_name if is_same_tag else None
+ if common_tag_name:
+ yield common_tag_name
# check for common id
common_ids = {n.id for n in nodes}
@@ -123,7 +142,7 @@ def _generate_direct_css_selectors_for_nodes(nodes: list[Node]):
yield css_selector
# if same tag name, also yield tag_name + selector
- if is_same_tag:
+ if common_tag_name:
yield common_tag_name + css_selector
else:
# empty combination -> ignore
diff --git a/tests/test_selectors.py b/tests/test_selectors.py
index 4038dd9..eace894 100644
--- a/tests/test_selectors.py
+++ b/tests/test_selectors.py
@@ -32,6 +32,19 @@ class TestGenerateUniqueSelectorsForNodes:
assert "p.test" in selectors_found
assert "body > p.test" in selectors_found
+ def test_nth(self):
+ html = b"""<html><body>
+ <ul><li>target</li><li>noise</li></ul>
+ <ul><li>target</li><li>noise</li></ul>
+ </body></html>"""
+ page = Page(html)
+ first_li_tags = [ul.select("li")[0] for ul in page.select("ul")]
+ unique_selectors = [
+ s.css_rule
+ for s in generate_unique_selectors_for_nodes(first_li_tags, None, 100)
+ ]
+ assert "li:nth-child(1)" in unique_selectors
+
def test_ids(self):
page = Page(
b"""
@@ -64,3 +77,13 @@ class TestGenerateDirectCssSelectorsForNodes:
assert "div[itemprop]" in direct_css_selectors
assert 'div[itemprop="user"]' in direct_css_selectors
+
+ def test_nth(self):
+ html = b"""<html><body>
+ <ul><li>target</li><li>noise</li></ul>
+ <ul><li>target</li><li>noise</li></ul>
+ </body></html>"""
+ page = Page(html)
+ first_li_tags = [ul.select("li")[0] for ul in page.select("ul")]
+ unique_selectors = list(_generate_direct_css_selectors_for_nodes(first_li_tags))
+ assert "li:nth-child(1)" in unique_selectors