diff options
author | Karl Lorey <git@karllorey.com> | 2022-06-23 21:50:55 +0200 |
---|---|---|
committer | Karl Lorey <git@karllorey.com> | 2022-06-23 21:50:55 +0200 |
commit | 789e635aabd126e934af7dbf0b2769bef28d9683 (patch) | |
tree | be12f49d5a1f0181265693875dd52a24ba3c0ceb | |
parent | cc9b326102060b396a34279a77e7259484e01bc6 (diff) |
Add attribute-based CSS selectors
-rw-r--r-- | mlscraper/selectors.py | 30 | ||||
-rw-r--r-- | tests/test_selectors.py | 18 |
2 files changed, 46 insertions, 2 deletions
diff --git a/mlscraper/selectors.py b/mlscraper/selectors.py index 2f3f477..99b8b7b 100644 --- a/mlscraper/selectors.py +++ b/mlscraper/selectors.py @@ -36,7 +36,7 @@ class CssRuleSelector(Selector): selection = node.select(self.css_rule) if not selection: raise AssertionError( - f"css rule does not match on node ({self.css_rule=}, {node=})" + f"css rule does not match any node ({self.css_rule=}, {node=})" ) return selection[0] @@ -98,22 +98,50 @@ def _generate_direct_css_selectors_for_nodes(nodes: list[Node]): logging.info(f"generating direct css selector for nodes ({nodes=})") common_classes = set.intersection(*[set(n.classes) for n in nodes]) + # check for same tag name is_same_tag = len({n.tag_name for n in nodes}) == 1 common_tag_name = nodes[0].tag_name yield common_tag_name + # check for common id common_ids = {n.id for n in nodes} is_same_id = len(common_ids) == 1 if is_same_id and None not in common_ids: yield "#" + first(common_ids) + # check for common classes for class_combination in powerset(common_classes): if class_combination: logging.info(f"- generating selector for ({class_combination=})") css_selector = "".join(map(lambda cl: "." + cl, class_combination)) yield css_selector + + # if same tag name, also yield tag_name + selector if is_same_tag: yield common_tag_name + css_selector else: # empty combination -> ignore pass + + # check for common attributes + # see: https://developer.mozilla.org/en-US/docs/Web/CSS/Attribute_selectors + if common_tag_name: + common_attributes = set.intersection( + *[set(n.html_attributes.keys()) for n in nodes] + ) + common_attributes_filtered = [ + ca for ca in common_attributes if ca not in ["id", "class", "rel"] + ] + for common_attribute in common_attributes_filtered: + yield f"{common_tag_name}[{common_attribute}]" + + # check for common attribute values + logging.info("attribute: %s", common_attribute) + logging.info( + "attribute values: %s", + [n.html_attributes[common_attribute] for n in nodes], + ) + attribute_values = {n.html_attributes[common_attribute] for n in nodes} + if len(attribute_values) == 1: + common_attribute_value = first(attribute_values) + yield f'{common_tag_name}[{common_attribute}="{common_attribute_value}"]' diff --git a/tests/test_selectors.py b/tests/test_selectors.py index 9e7acc0..12c81d5 100644 --- a/tests/test_selectors.py +++ b/tests/test_selectors.py @@ -1,4 +1,5 @@ from mlscraper.html import Page +from mlscraper.selectors import _generate_direct_css_selectors_for_nodes from mlscraper.selectors import generate_unique_selectors_for_nodes @@ -40,10 +41,25 @@ class TestGenerateUniqueSelectorsForNodes: ) node = page.select("#target")[0] selectors = get_css_selectors_for_node(node) - assert selectors == ["#target"] + assert "#target" in selectors def test_multi_parents(self): page = Page(b'<html><body><div id="target"><p>test</p></div><div><p></p></div>') node = page.select("#target")[0].select("p")[0] selectors = get_css_selectors_for_node(node) assert "#target p" in selectors + + +class TestGenerateDirectCssSelectorsForNodes: + def test_itemprop_selector(self): + html = b"""<html><body> + <div itemprop="user">lorey</div> + <div itemprop="user">jonashaag</div> + </body></html>""" + page = Page(html) + direct_css_selectors = list( + _generate_direct_css_selectors_for_nodes(page.select("div")) + ) + + assert "div[itemprop]" in direct_css_selectors + assert 'div[itemprop="user"]' in direct_css_selectors |