Minor fixes and improvements

author: Karl Lorey <git@karllorey.com> 2022-07-05 16:24:43 +0200
committer: Karl Lorey <git@karllorey.com> 2022-07-05 16:24:43 +0200
commit: b8b9513e6ff688511b0ddba772d2d805c361d00c (patch)
tree: ced816b4b29f7a05a2115e7b817cab93e25ceebe
parent: e63203f7ad77d512ca5ffaffb9b8f597430b95c5 (diff)
5 files changed, 48 insertions, 16 deletions
diff --git a/mlscraper/html.py b/mlscraper/html.py
index 1f1c7cf..aac5377 100644
--- a/mlscraper/html.py
+++ b/mlscraper/html.py
@@ -175,15 +175,13 @@ def get_root_node(nodes: list[Node]) -> Node:
 
 
 def get_relative_depth(node: Node, root: Node):
-    node_parents = list(node.soup.parents)
-
-    # depth of root
-    i = node_parents.index(root.soup)
-
-    # depth of element
-    j = len(node_parents)
-
-    return j - i
+    """
+    Return the relative depth of node inside tree starting from root.
+    """
+    hierarchy = list(reversed([node] + node.parents))
+    assert node in hierarchy
+    assert root in hierarchy
+    return hierarchy.index(node) - hierarchy.index(root)
 
 
 def selector_matches_nodes(root: Node, selector: str, expected: list[Node]):
diff --git a/mlscraper/matches.py b/mlscraper/matches.py
index 8ebc4e9..508f81e 100644
--- a/mlscraper/matches.py
+++ b/mlscraper/matches.py
@@ -199,7 +199,11 @@ class ValueMatch(Match):
         if self.node.tag_name != match.node.tag_name:
             return 0
 
-        return 1
+        jaccard_top = len(set(self.node.classes).intersection(match.node.classes))
+        jaccard_bottom = len(set(self.node.classes).union(match.node.classes))
+        if jaccard_top == jaccard_bottom:
+            return 1  # identical class sets; also covers 0/0 (both empty)
+        return jaccard_top / jaccard_bottom
 
 
 def generate_all_value_matches(
diff --git a/mlscraper/selectors.py b/mlscraper/selectors.py
index 904e984..af7fbae 100644
--- a/mlscraper/selectors.py
+++ b/mlscraper/selectors.py
@@ -1,10 +1,11 @@
 import logging
+import typing
 from itertools import product
 
 from mlscraper.html import Node
 from mlscraper.util import no_duplicates_generator_decorator
+from mlscraper.util import powerset_max_length
 from more_itertools import first
-from more_itertools import powerset
 
 
 class Selector:
@@ -43,6 +44,17 @@ class CssRuleSelector(Selector):
     def select_all(self, node: Node):
         return node.select(self.css_rule)
 
+    def uniquely_selects(self, root: Node, nodes: typing.Collection[Node]):
+        # directly using soups:
+        # - avoids creating nodes for all selects
+        # - increases caching effort
+        # return root.soup.select(self.css_rule) == [n.soup for n in nodes]
+
+        # using select
+        # - creates nodes for every soup object
+        # - leverages caching
+        return root.select(self.css_rule) == nodes
+
     def __repr__(self):
         return f"<{self.__class__.__name__} {self.css_rule=}>"
 
@@ -58,7 +70,7 @@ def generate_unique_selectors_for_nodes(nodes: list[Node], roots, complexity: in
     nodes_per_root = {r: [n for n in nodes if n.has_parent(r)] for r in set(roots)}
     for selector in generate_selectors_for_nodes(nodes, roots, complexity):
         if all(
-            selector.select_all(r) == nodes_of_root
+            selector.uniquely_selects(r, nodes_of_root)
             for r, nodes_of_root in nodes_per_root.items()
         ):
             yield selector
@@ -85,7 +97,12 @@ def generate_selectors_for_nodes(nodes: list[Node], roots, complexity: int):
         [p for p in n.parents if p.has_parent(r) and p.tag_name != "html"]
         for n, r in zip(nodes, roots)
     ]
-    for ancestors in product(*ancestors_below_roots):
+    ancestor_combinations = sorted(
+        product(*ancestors_below_roots),
+        key=lambda ancestors: len({c for a in ancestors for c in a.classes}),
+        reverse=True,
+    )
+    for ancestors in ancestor_combinations:
         for ancestor_selector_raw in _generate_direct_css_selectors_for_nodes(
             ancestors
         ):
@@ -110,8 +127,6 @@ def _generate_direct_css_selectors_for_nodes(nodes: list[Node]):
     for css_selector in _generate_direct_css_selectors_for_nodes_without_pseudo(nodes):
         yield css_selector
 
-    # pull to the end as far as possible
-    for css_selector in _generate_direct_css_selectors_for_nodes_without_pseudo(nodes):
         if all(n.tag_name not in ["html", "body"] for n in nodes):
             child_indexes = [n.parent.select(css_selector).index(n) for n in nodes]
             if len(set(child_indexes)) == 1:
@@ -135,7 +150,11 @@ def _generate_direct_css_selectors_for_nodes_without_pseudo(nodes: list[Node]):
 
     # check for common classes
     common_classes = set.intersection(*[set(n.classes) for n in nodes])
-    for class_combination in powerset(common_classes):
+
+    # ignore classes containing colons
+    common_classes = [cc for cc in common_classes if ":" not in cc]
+
+    for class_combination in powerset_max_length(common_classes, 2):
         if class_combination:
             css_selector = "".join(map(lambda cl: "." + cl, class_combination))
             yield css_selector
diff --git a/mlscraper/training.py b/mlscraper/training.py
index d8f89b4..3e3b259 100644
--- a/mlscraper/training.py
+++ b/mlscraper/training.py
@@ -128,6 +128,7 @@ def train_scraper_for_matches(matches, roots, complexity: int):
             None,
         )
         if not selector:
+            logging.info(f"did not find selector for matches ({matches=})")
             raise NoScraperFoundException(f"no selector found {matches=}")
         logging.info(f"found selector for ValueScraper ({selector=})")
         return ValueScraper(selector, extractor)
diff --git a/tests/test_html.py b/tests/test_html.py
index ab7a6b3..42b967f 100644
--- a/tests/test_html.py
+++ b/tests/test_html.py
@@ -1,3 +1,4 @@
+from mlscraper.html import get_relative_depth
 from mlscraper.html import get_root_node
 from mlscraper.html import HTMLExactTextMatch
 from mlscraper.html import Page
@@ -115,3 +116,12 @@ def test_find_text_with_noise():
         not isinstance(html_match, HTMLExactTextMatch)
         for html_match in page.find_all("karl")
     )
+
+
+def test_get_relative_depth():
+    html = b"<html><body><p>bla karl bla</p></body></html>"
+    page = Page(html)
+    p_tag = page.select("p")[0]
+    assert get_relative_depth(p_tag, p_tag) == 0
+    assert get_relative_depth(p_tag, p_tag.parent) == 1
+    assert get_relative_depth(p_tag, p_tag.parent.parent) == 2
author	Karl Lorey <git@karllorey.com>	2022-07-05 16:24:43 +0200
committer	Karl Lorey <git@karllorey.com>	2022-07-05 16:24:43 +0200
commit	b8b9513e6ff688511b0ddba772d2d805c361d00c (patch)
tree	ced816b4b29f7a05a2115e7b817cab93e25ceebe
parent	e63203f7ad77d512ca5ffaffb9b8f597430b95c5 (diff)