author    Karl Lorey <git@karllorey.com>    2022-07-07 13:33:27 +0200
committer Karl Lorey <git@karllorey.com>    2022-07-07 13:33:27 +0200
commit    6fe400dc1aac65169664893a81a23e61a3aed5fc (patch)
tree      08ca62d0832524e45749afe9156909c92450b601
parent    20c492c62dd7457c485c3bf79e28a81ea135d84d (diff)
Cache uniquely_selects to work around re-running train_scrapers
-rw-r--r--  mlscraper/selectors.py  35
-rw-r--r--  mlscraper/training.py    3
2 files changed, 23 insertions, 15 deletions
diff --git a/mlscraper/selectors.py b/mlscraper/selectors.py
index a30e669..5527b42 100644
--- a/mlscraper/selectors.py
+++ b/mlscraper/selectors.py
@@ -50,26 +50,31 @@ class CssRuleSelector(Selector):
         return node.select(self.css_rule)
 
     def uniquely_selects(self, root: Node, nodes: typing.Collection[Node]):
-        # limit +1
-        # ensures mismatch if selection result starts with nodes
-        # e.g. select returns [1,2,3,...] and nodes is [1,2,3]
-        # but decreases load with many results significantly
-        limit = len(nodes) + 1
-
-        # directly using soups:
-        # - avoids creating nodes for all selects
-        # - increases caching effort
-        return root.soup.select(self.css_rule, limit=limit) == [n.soup for n in nodes]
-
-        # using select
-        # - creates nodes for every soup object
-        # - leverages caching
-        # return root.select(self.css_rule, limit=limit) == nodes
+        return _uniquely_selects(self.css_rule, root, tuple(nodes))
 
     def __repr__(self):
         return f"<{self.__class__.__name__} {self.css_rule=}>"
 
 
+@functools.lru_cache(10000)
+def _uniquely_selects(css_rule, root, nodes):
+    # limit +1
+    # ensures mismatch if selection result starts with nodes
+    # e.g. select returns [1,2,3,...] and nodes is [1,2,3]
+    # but decreases load with many results significantly
+    limit = len(nodes) + 1
+
+    # directly using soups:
+    # - avoids creating nodes for all selects
+    # - increases caching effort
+    return root.soup.select(css_rule, limit=limit) == [n.soup for n in nodes]
+
+    # using select
+    # - creates nodes for every soup object
+    # - leverages caching
+    # return root.select(css_rule, limit=limit) == nodes
+
+
 def generate_unique_selectors_for_nodes(
     nodes: list[Node], roots, complexity: int
 ) -> typing.Generator[Selector, None, None]:
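
Note: the refactor moves the body of uniquely_selects into a module-level helper so that functools.lru_cache can memoize it. lru_cache requires every argument to be hashable, which is why the call site passes tuple(nodes) rather than the original collection, and keeping the helper at module level lets all CssRuleSelector instances share one cache keyed on the rule string itself. A minimal standalone sketch of the same pattern (hypothetical names, not part of the commit):

    import functools

    @functools.lru_cache(maxsize=10000)
    def check(css_rule: str, nodes: tuple) -> bool:
        print("cache miss, computing")  # runs only when the arguments are new
        return len(nodes) > 0

    check("div.item", ("a", "b"))  # computes and caches the result
    check("div.item", ("a", "b"))  # served from the cache, no print
    # check("div.item", ["a", "b"])  # TypeError: unhashable type: 'list'
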
diff --git a/mlscraper/training.py b/mlscraper/training.py
index d9b1b8a..7dabdb2 100644
--- a/mlscraper/training.py
+++ b/mlscraper/training.py
@@ -149,6 +149,9 @@ def train_scraper_for_matches(matches, roots, complexity: int):
     # roots are the original roots(?)
     scraper_per_key = {}
     for k in keys:
+        # todo we get the same match combinations repeatedly
+        # maybe caching uniquely_selects helps
+        # but it is better to store the actual scraper
         matches_per_key = [m.match_by_key[k] for m in matches]
         logging.info(f"training key for DictScraper ({k=})")
         logging.info(f"matches for key: {matches_per_key=}")