diff options
author | Karl Lorey <git@karllorey.com> | 2022-06-19 15:58:37 +0200 |
---|---|---|
committer | Karl Lorey <git@karllorey.com> | 2022-06-19 15:58:37 +0200 |
commit | 59d0eeb254184ae2d7962650c8ddc45b212c3f18 (patch) | |
tree | afa8816fbc4b06632d9f4eff2608a136a62b0048 | |
parent | edc327cbc52de5fc9d9cb8eb475d8007ea7337f1 (diff) |
Fix css selector generation by adding tag name and avoiding empty selector
-rw-r--r-- | mlscraper/html.py | 12 | ||||
-rw-r--r-- | tests/test_selectors.py | 18 |
2 files changed, 18 insertions, 12 deletions
diff --git a/mlscraper/html.py b/mlscraper/html.py index 0a6f4de..911638a 100644 --- a/mlscraper/html.py +++ b/mlscraper/html.py @@ -43,12 +43,18 @@ def _generate_css_selectors_for_node(soup: Tag, complexity: int): if tag_id: yield "#" + tag_id + yield soup.name + # use classes css_classes = soup.attrs.get("class", []) for css_class_combo in powerset_max_length(css_classes, complexity): - css_clases_str = "".join([f".{css_class}" for css_class in css_class_combo]) - css_selector = soup.name + css_clases_str - yield css_selector + if css_class_combo: + css_clases_str = "".join([f".{css_class}" for css_class in css_class_combo]) + yield css_clases_str + yield soup.name + css_clases_str + else: + # empty set, no selector + pass # todo: nth applies to whole selectors # -> should thus be a step after actual selector generation diff --git a/tests/test_selectors.py b/tests/test_selectors.py index c2d204a..3e4570a 100644 --- a/tests/test_selectors.py +++ b/tests/test_selectors.py @@ -1,21 +1,21 @@ from mlscraper.html import Page -from mlscraper.samples import Sample from mlscraper.selectors import generate_selector_for_nodes def test_generate_selector_for_nodes(): page1_html = '<html><body><p class="test">test</p><p>bla</p></body></html>' page1 = Page(page1_html) - sample1 = Sample(page1, "test") page2_html = '<html><body><div></div><p class="test">hallo</p></body></html>' page2 = Page(page2_html) - sample2 = Sample(page2, "hallo") - samples = [sample1, sample2] - - nodes = [s.get_matches()[0].root for s in samples] - print(nodes) + nodes = list(map(lambda p: p.select("p.test")[0], [page1, page2])) gen = generate_selector_for_nodes(nodes, None, 1) - # todo .test is also possible - assert ["p.test"] == [sel.css_rule for sel in gen] + selectors_found = [sel.css_rule for sel in gen] + assert {".test", "p.test"} == set(selectors_found) + + +class TestGenerateSelectorForNodes: + def test_generate_selector_for_nodes(self): + # generate_selector_for_nodes() + pass |