summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Karl Lorey <git@karllorey.com> 2022-07-05 16:24:43 +0200
committer Karl Lorey <git@karllorey.com> 2022-07-05 16:24:43 +0200
commit b8b9513e6ff688511b0ddba772d2d805c361d00c (patch)
tree ced816b4b29f7a05a2115e7b817cab93e25ceebe
parent e63203f7ad77d512ca5ffaffb9b8f597430b95c5 (diff)
Minor fixes and improvements
-rw-r--r-- mlscraper/html.py 16
-rw-r--r-- mlscraper/matches.py 6
-rw-r--r-- mlscraper/selectors.py 31
-rw-r--r-- mlscraper/training.py 1
-rw-r--r-- tests/test_html.py 10
5 files changed, 48 insertions, 16 deletions
diff --git a/mlscraper/html.py b/mlscraper/html.py
index 1f1c7cf..aac5377 100644
--- a/mlscraper/html.py
+++ b/mlscraper/html.py
@@ -175,15 +175,13 @@ def get_root_node(nodes: list[Node]) -> Node:
def get_relative_depth(node: Node, root: Node):
- node_parents = list(node.soup.parents)
-
- # depth of root
- i = node_parents.index(root.soup)
-
- # depth of element
- j = len(node_parents)
-
- return j - i
+ """
+ Return the relative depth of node inside tree starting from root.
+ """
+ hierarchy = list(reversed([node] + node.parents))
+ assert node in hierarchy
+ assert root in hierarchy
+ return hierarchy.index(node) - hierarchy.index(root)
def selector_matches_nodes(root: Node, selector: str, expected: list[Node]):
diff --git a/mlscraper/matches.py b/mlscraper/matches.py
index 8ebc4e9..508f81e 100644
--- a/mlscraper/matches.py
+++ b/mlscraper/matches.py
@@ -199,7 +199,11 @@ class ValueMatch(Match):
if self.node.tag_name != match.node.tag_name:
return 0
- return 1
+ jaccard_top = len(set(self.node.classes).intersection(match.node.classes))
+ jaccard_bottom = len(set(self.node.classes).union(match.node.classes))
+ if jaccard_top == jaccard_bottom:
+ return 1 # also 0/0
+ return jaccard_top / jaccard_bottom
def generate_all_value_matches(
diff --git a/mlscraper/selectors.py b/mlscraper/selectors.py
index 904e984..af7fbae 100644
--- a/mlscraper/selectors.py
+++ b/mlscraper/selectors.py
@@ -1,10 +1,11 @@
import logging
+import typing
from itertools import product
from mlscraper.html import Node
from mlscraper.util import no_duplicates_generator_decorator
+from mlscraper.util import powerset_max_length
from more_itertools import first
-from more_itertools import powerset
class Selector:
@@ -43,6 +44,17 @@ class CssRuleSelector(Selector):
def select_all(self, node: Node):
return node.select(self.css_rule)
+ def uniquely_selects(self, root: Node, nodes: typing.Collection[Node]):
+ # directly using soups:
+ # - avoids creating nodes for all selects
+ # - increases caching effort
+ # return root.soup.select(self.css_rule) == [n.soup for n in nodes]
+
+ # using select
+ # - creates nodes for every soup object
+ # - leverages caching
+ return root.select(self.css_rule) == nodes
+
def __repr__(self):
return f"<{self.__class__.__name__} {self.css_rule=}>"
@@ -58,7 +70,7 @@ def generate_unique_selectors_for_nodes(nodes: list[Node], roots, complexity: in
nodes_per_root = {r: [n for n in nodes if n.has_parent(r)] for r in set(roots)}
for selector in generate_selectors_for_nodes(nodes, roots, complexity):
if all(
- selector.select_all(r) == nodes_of_root
+ selector.uniquely_selects(r, nodes_of_root)
for r, nodes_of_root in nodes_per_root.items()
):
yield selector
@@ -85,7 +97,12 @@ def generate_selectors_for_nodes(nodes: list[Node], roots, complexity: int):
[p for p in n.parents if p.has_parent(r) and p.tag_name != "html"]
for n, r in zip(nodes, roots)
]
- for ancestors in product(*ancestors_below_roots):
+ ancestor_combinations = sorted(
+ product(*ancestors_below_roots),
+ key=lambda ancestors: len({c for a in ancestors for c in a.classes}),
+ reverse=True,
+ )
+ for ancestors in ancestor_combinations:
for ancestor_selector_raw in _generate_direct_css_selectors_for_nodes(
ancestors
):
@@ -110,8 +127,6 @@ def _generate_direct_css_selectors_for_nodes(nodes: list[Node]):
for css_selector in _generate_direct_css_selectors_for_nodes_without_pseudo(nodes):
yield css_selector
- # pull to the end as far as possible
- for css_selector in _generate_direct_css_selectors_for_nodes_without_pseudo(nodes):
if all(n.tag_name not in ["html", "body"] for n in nodes):
child_indexes = [n.parent.select(css_selector).index(n) for n in nodes]
if len(set(child_indexes)) == 1:
@@ -135,7 +150,11 @@ def _generate_direct_css_selectors_for_nodes_without_pseudo(nodes: list[Node]):
# check for common classes
common_classes = set.intersection(*[set(n.classes) for n in nodes])
- for class_combination in powerset(common_classes):
+
+ # ignore selectors containing colons
+ common_classes = [cc for cc in common_classes if ":" not in cc]
+
+ for class_combination in powerset_max_length(common_classes, 2):
if class_combination:
css_selector = "".join(map(lambda cl: "." + cl, class_combination))
yield css_selector
diff --git a/mlscraper/training.py b/mlscraper/training.py
index d8f89b4..3e3b259 100644
--- a/mlscraper/training.py
+++ b/mlscraper/training.py
@@ -128,6 +128,7 @@ def train_scraper_for_matches(matches, roots, complexity: int):
None,
)
if not selector:
+ logging.info(f"did not find selector for matches ({matches=})")
raise NoScraperFoundException(f"no selector found {matches=}")
logging.info(f"found selector for ValueScraper ({selector=})")
return ValueScraper(selector, extractor)
diff --git a/tests/test_html.py b/tests/test_html.py
index ab7a6b3..42b967f 100644
--- a/tests/test_html.py
+++ b/tests/test_html.py
@@ -1,3 +1,4 @@
+from mlscraper.html import get_relative_depth
from mlscraper.html import get_root_node
from mlscraper.html import HTMLExactTextMatch
from mlscraper.html import Page
@@ -115,3 +116,12 @@ def test_find_text_with_noise():
not isinstance(html_match, HTMLExactTextMatch)
for html_match in page.find_all("karl")
)
+
+
+def test_get_relative_depth():
+ html = b"<html><body><p>bla karl bla</p></body></html>"
+ page = Page(html)
+ p_tag = page.select("p")[0]
+ assert get_relative_depth(p_tag, p_tag) == 0
+ assert get_relative_depth(p_tag, p_tag.parent) == 1
+ assert get_relative_depth(p_tag, p_tag.parent.parent) == 2