summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Karl Lorey <git@karllorey.com>, 2022-06-23 23:26:28 +0200
committer: Karl Lorey <git@karllorey.com>, 2022-06-23 23:26:28 +0200
commit: c54208e15c2b303cced2698ff331962d243b6329 (patch)
tree: 1734492fa51b6ceca2a1cb37cbf6bc2820e742fd
parent: a2f481c3481f6445e520e6bbdfafae3bbf94f96b (diff)
Add child selectors for CSS generation
-rw-r--r-- mlscraper/selectors.py 27
-rw-r--r-- tests/test_selectors.py 1
2 files changed, 15 insertions, 13 deletions
diff --git a/mlscraper/selectors.py b/mlscraper/selectors.py
index 99b8b7b..4587274 100644
--- a/mlscraper/selectors.py
+++ b/mlscraper/selectors.py
@@ -81,21 +81,28 @@ def generate_selectors_for_nodes(nodes: list[Node], roots, complexity: int):
for direct_css_selector in direct_css_selectors:
yield CssRuleSelector(direct_css_selector)
- parents_of_nodes_below_roots = [
- [p for p in n.parents if p.has_parent(r) and p.tag_name not in ["html", "body"]]
+ ancestors_below_roots = [
+ [p for p in n.parents if p.has_parent(r) and p.tag_name != "html"]
for n, r in zip(nodes, roots)
]
- for parent_nodes in product(*parents_of_nodes_below_roots):
- for parent_selector_raw in _generate_direct_css_selectors_for_nodes(
- parent_nodes
+ for ancestors in product(*ancestors_below_roots):
+ for ancestor_selector_raw in _generate_direct_css_selectors_for_nodes(
+ ancestors
):
+ # generate refinement selectors for parents
+ # e.g. if selectivity of child selector is not enough
for css_selector_raw in direct_css_selectors:
- css_selector_combined = parent_selector_raw + " " + css_selector_raw
+ css_selector_combined = ancestor_selector_raw + " " + css_selector_raw
yield CssRuleSelector(css_selector_combined)
+ # make parent selector
+ if all(node.parent == parent for node, parent in zip(nodes, ancestors)):
+ yield CssRuleSelector(
+ f"{ancestor_selector_raw} > {css_selector_raw}"
+ )
+
def _generate_direct_css_selectors_for_nodes(nodes: list[Node]):
- logging.info(f"generating direct css selector for nodes ({nodes=})")
common_classes = set.intersection(*[set(n.classes) for n in nodes])
# check for same tag name
@@ -112,7 +119,6 @@ def _generate_direct_css_selectors_for_nodes(nodes: list[Node]):
# check for common classes
for class_combination in powerset(common_classes):
if class_combination:
- logging.info(f"- generating selector for ({class_combination=})")
css_selector = "".join(map(lambda cl: "." + cl, class_combination))
yield css_selector
@@ -136,11 +142,6 @@ def _generate_direct_css_selectors_for_nodes(nodes: list[Node]):
yield f"{common_tag_name}[{common_attribute}]"
# check for common attribute values
- logging.info("attribute: %s", common_attribute)
- logging.info(
- "attribute values: %s",
- [n.html_attributes[common_attribute] for n in nodes],
- )
attribute_values = {n.html_attributes[common_attribute] for n in nodes}
if len(attribute_values) == 1:
common_attribute_value = first(attribute_values)
diff --git a/tests/test_selectors.py b/tests/test_selectors.py
index 12c81d5..4038dd9 100644
--- a/tests/test_selectors.py
+++ b/tests/test_selectors.py
@@ -30,6 +30,7 @@ class TestGenerateUniqueSelectorsForNodes:
assert ".test" in selectors_found
assert "p.test" in selectors_found
+ assert "body > p.test" in selectors_found
def test_ids(self):
page = Page(