diff options
author | Karl Lorey <git@karllorey.com> | 2022-07-05 16:24:43 +0200 |
---|---|---|
committer | Karl Lorey <git@karllorey.com> | 2022-07-05 16:24:43 +0200 |
commit | b8b9513e6ff688511b0ddba772d2d805c361d00c (patch) | |
tree | ced816b4b29f7a05a2115e7b817cab93e25ceebe | |
parent | e63203f7ad77d512ca5ffaffb9b8f597430b95c5 (diff) |
Minor fixes and improvements
-rw-r--r-- | mlscraper/html.py | 16 | ||||
-rw-r--r-- | mlscraper/matches.py | 6 | ||||
-rw-r--r-- | mlscraper/selectors.py | 31 | ||||
-rw-r--r-- | mlscraper/training.py | 1 | ||||
-rw-r--r-- | tests/test_html.py | 10 |
5 files changed, 48 insertions, 16 deletions
diff --git a/mlscraper/html.py b/mlscraper/html.py index 1f1c7cf..aac5377 100644 --- a/mlscraper/html.py +++ b/mlscraper/html.py @@ -175,15 +175,13 @@ def get_root_node(nodes: list[Node]) -> Node: def get_relative_depth(node: Node, root: Node): - node_parents = list(node.soup.parents) - - # depth of root - i = node_parents.index(root.soup) - - # depth of element - j = len(node_parents) - - return j - i + """ + Return the relative depth of node inside tree starting from root. + """ + hierarchy = list(reversed([node] + node.parents)) + assert node in hierarchy + assert root in hierarchy + return hierarchy.index(node) - hierarchy.index(root) def selector_matches_nodes(root: Node, selector: str, expected: list[Node]): diff --git a/mlscraper/matches.py b/mlscraper/matches.py index 8ebc4e9..508f81e 100644 --- a/mlscraper/matches.py +++ b/mlscraper/matches.py @@ -199,7 +199,11 @@ class ValueMatch(Match): if self.node.tag_name != match.node.tag_name: return 0 - return 1 + jaccard_top = len(set(self.node.classes).intersection(match.node.classes)) + jaccard_bottom = len(set(self.node.classes).union(match.node.classes)) + if jaccard_top == jaccard_bottom: + return 1 # also 0/0 + return jaccard_top / jaccard_bottom def generate_all_value_matches( diff --git a/mlscraper/selectors.py b/mlscraper/selectors.py index 904e984..af7fbae 100644 --- a/mlscraper/selectors.py +++ b/mlscraper/selectors.py @@ -1,10 +1,11 @@ import logging +import typing from itertools import product from mlscraper.html import Node from mlscraper.util import no_duplicates_generator_decorator +from mlscraper.util import powerset_max_length from more_itertools import first -from more_itertools import powerset class Selector: @@ -43,6 +44,17 @@ class CssRuleSelector(Selector): def select_all(self, node: Node): return node.select(self.css_rule) + def uniquely_selects(self, root: Node, nodes: typing.Collection[Node]): + # directly using soups: + # - avoids creating nodes for all selects + # - increases caching effort + # return root.soup.select(self.css_rule) == [n.soup for n in nodes] + + # using select + # - creates nodes for every soup object + # - leverages caching + return root.select(self.css_rule) == nodes + def __repr__(self): return f"<{self.__class__.__name__} {self.css_rule=}>" @@ -58,7 +70,7 @@ def generate_unique_selectors_for_nodes(nodes: list[Node], roots, complexity: in nodes_per_root = {r: [n for n in nodes if n.has_parent(r)] for r in set(roots)} for selector in generate_selectors_for_nodes(nodes, roots, complexity): if all( - selector.select_all(r) == nodes_of_root + selector.uniquely_selects(r, nodes_of_root) for r, nodes_of_root in nodes_per_root.items() ): yield selector @@ -85,7 +97,12 @@ def generate_selectors_for_nodes(nodes: list[Node], roots, complexity: int): [p for p in n.parents if p.has_parent(r) and p.tag_name != "html"] for n, r in zip(nodes, roots) ] - for ancestors in product(*ancestors_below_roots): + ancestor_combinations = sorted( + product(*ancestors_below_roots), + key=lambda ancestors: len({c for a in ancestors for c in a.classes}), + reverse=True, + ) + for ancestors in ancestor_combinations: for ancestor_selector_raw in _generate_direct_css_selectors_for_nodes( ancestors ): @@ -110,8 +127,6 @@ def _generate_direct_css_selectors_for_nodes(nodes: list[Node]): for css_selector in _generate_direct_css_selectors_for_nodes_without_pseudo(nodes): yield css_selector - # pull to the end as far as possible - for css_selector in _generate_direct_css_selectors_for_nodes_without_pseudo(nodes): if all(n.tag_name not in ["html", "body"] for n in nodes): child_indexes = [n.parent.select(css_selector).index(n) for n in nodes] if len(set(child_indexes)) == 1: @@ -135,7 +150,11 @@ def _generate_direct_css_selectors_for_nodes_without_pseudo(nodes: list[Node]): # check for common classes common_classes = set.intersection(*[set(n.classes) for n in nodes]) - for class_combination in powerset(common_classes): + + # ignore selectors containing colons + common_classes = [cc for cc in common_classes if ":" not in cc] + + for class_combination in powerset_max_length(common_classes, 2): if class_combination: css_selector = "".join(map(lambda cl: "." + cl, class_combination)) yield css_selector diff --git a/mlscraper/training.py b/mlscraper/training.py index d8f89b4..3e3b259 100644 --- a/mlscraper/training.py +++ b/mlscraper/training.py @@ -128,6 +128,7 @@ def train_scraper_for_matches(matches, roots, complexity: int): None, ) if not selector: + logging.info(f"did not find selector for matches ({matches=})") raise NoScraperFoundException(f"no selector found {matches=}") logging.info(f"found selector for ValueScraper ({selector=})") return ValueScraper(selector, extractor) diff --git a/tests/test_html.py b/tests/test_html.py index ab7a6b3..42b967f 100644 --- a/tests/test_html.py +++ b/tests/test_html.py @@ -1,3 +1,4 @@ +from mlscraper.html import get_relative_depth from mlscraper.html import get_root_node from mlscraper.html import HTMLExactTextMatch from mlscraper.html import Page @@ -115,3 +116,12 @@ def test_find_text_with_noise(): not isinstance(html_match, HTMLExactTextMatch) for html_match in page.find_all("karl") ) + + +def test_get_relative_depth(): + html = b"<html><body><p>bla karl bla</p></body></html>" + page = Page(html) + p_tag = page.select("p")[0] + assert get_relative_depth(p_tag, p_tag) == 0 + assert get_relative_depth(p_tag, p_tag.parent) == 1 + assert get_relative_depth(p_tag, p_tag.parent.parent) == 2 |