summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Karl Lorey <git@karllorey.com> 2022-06-19 15:58:37 +0200
committer Karl Lorey <git@karllorey.com> 2022-06-19 15:58:37 +0200
commit 59d0eeb254184ae2d7962650c8ddc45b212c3f18 (patch)
tree afa8816fbc4b06632d9f4eff2608a136a62b0048
parent edc327cbc52de5fc9d9cb8eb475d8007ea7337f1 (diff)
Fix css selector generation by adding tag name and avoiding empty selector
-rw-r--r-- mlscraper/html.py 12
-rw-r--r-- tests/test_selectors.py 18
2 files changed, 18 insertions, 12 deletions
diff --git a/mlscraper/html.py b/mlscraper/html.py
index 0a6f4de..911638a 100644
--- a/mlscraper/html.py
+++ b/mlscraper/html.py
@@ -43,12 +43,18 @@ def _generate_css_selectors_for_node(soup: Tag, complexity: int):
if tag_id:
yield "#" + tag_id
+ yield soup.name
+
# use classes
css_classes = soup.attrs.get("class", [])
for css_class_combo in powerset_max_length(css_classes, complexity):
- css_clases_str = "".join([f".{css_class}" for css_class in css_class_combo])
- css_selector = soup.name + css_clases_str
- yield css_selector
+ if css_class_combo:
+ css_clases_str = "".join([f".{css_class}" for css_class in css_class_combo])
+ yield css_clases_str
+ yield soup.name + css_clases_str
+ else:
+ # empty set, no selector
+ pass
# todo: nth applies to whole selectors
# -> should thus be a step after actual selector generation
diff --git a/tests/test_selectors.py b/tests/test_selectors.py
index c2d204a..3e4570a 100644
--- a/tests/test_selectors.py
+++ b/tests/test_selectors.py
@@ -1,21 +1,21 @@
from mlscraper.html import Page
-from mlscraper.samples import Sample
from mlscraper.selectors import generate_selector_for_nodes
def test_generate_selector_for_nodes():
page1_html = '<html><body><p class="test">test</p><p>bla</p></body></html>'
page1 = Page(page1_html)
- sample1 = Sample(page1, "test")
page2_html = '<html><body><div></div><p class="test">hallo</p></body></html>'
page2 = Page(page2_html)
- sample2 = Sample(page2, "hallo")
- samples = [sample1, sample2]
-
- nodes = [s.get_matches()[0].root for s in samples]
- print(nodes)
+ nodes = list(map(lambda p: p.select("p.test")[0], [page1, page2]))
gen = generate_selector_for_nodes(nodes, None, 1)
- # todo .test is also possible
- assert ["p.test"] == [sel.css_rule for sel in gen]
+ selectors_found = [sel.css_rule for sel in gen]
+ assert {".test", "p.test"} == set(selectors_found)
+
+
+class TestGenerateSelectorForNodes:
+ def test_generate_selector_for_nodes(self):
+ # generate_selector_for_nodes()
+ pass