author     Karl Lorey <git@karllorey.com>  2022-07-07 13:33:27 +0200
committer  Karl Lorey <git@karllorey.com>  2022-07-07 13:33:27 +0200
commit     6fe400dc1aac65169664893a81a23e61a3aed5fc
tree       08ca62d0832524e45749afe9156909c92450b601
parent     20c492c62dd7457c485c3bf79e28a81ea135d84d
Cache uniquely_selects to work around re-running train_scrapers
 mlscraper/selectors.py | 35
 mlscraper/training.py  |  3
 2 files changed, 23 insertions(+), 15 deletions(-)
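The change replaces the body of CssRuleSelector.uniquely_selects with a call to a module-level helper wrapped in functools.lru_cache, so repeated checks of the same CSS rule against the same root and nodes are answered from the cache instead of re-running the selection. A minimal sketch of that memoization pattern follows (hypothetical names, not the mlscraper code); lru_cache keys on its arguments, which is why the commit converts nodes to a tuple (lists are unhashable) and why root must be hashable as well:

# Sketch of the memoization pattern applied in this commit
# (hypothetical names, not the actual mlscraper code).
import functools

@functools.lru_cache(maxsize=10000)
def _cached_selection_check(css_rule: str, nodes: tuple) -> bool:
    # stand-in for the real check, which would run a CSS select
    # against the document and compare the result to the expected nodes
    print(f"computing check for {css_rule=}")
    return len(nodes) > 0

_cached_selection_check("div.item", (1, 2, 3))  # computed, prints once
_cached_selection_check("div.item", (1, 2, 3))  # identical args: cache hit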
diff --git a/mlscraper/selectors.py b/mlscraper/selectors.py
index a30e669..5527b42 100644
--- a/mlscraper/selectors.py
+++ b/mlscraper/selectors.py
@@ -50,26 +50,31 @@ class CssRuleSelector(Selector):
         return node.select(self.css_rule)
 
     def uniquely_selects(self, root: Node, nodes: typing.Collection[Node]):
-        # limit +1
-        # ensures mismatch if selection result starts with nodes
-        # e.g. select returns [1,2,3,...] and nodes is [1,2,3]
-        # but decreases load with many results significantly
-        limit = len(nodes) + 1
-
-        # directly using soups:
-        # - avoids creating nodes for all selects
-        # - increases caching effort
-        return root.soup.select(self.css_rule, limit=limit) == [n.soup for n in nodes]
-
-        # using select
-        # - creates nodes for every soup object
-        # - leverages caching
-        # return root.select(self.css_rule, limit=limit) == nodes
+        return _uniquely_selects(self.css_rule, root, tuple(nodes))
 
     def __repr__(self):
         return f"<{self.__class__.__name__} {self.css_rule=}>"
 
 
+@functools.lru_cache(10000)
+def _uniquely_selects(css_rule, root, nodes):
+    # limit +1
+    # ensures mismatch if selection result starts with nodes
+    # e.g. select returns [1,2,3,...] and nodes is [1,2,3]
+    # but decreases load with many results significantly
+    limit = len(nodes) + 1
+
+    # directly using soups:
+    # - avoids creating nodes for all selects
+    # - increases caching effort
+    return root.soup.select(css_rule, limit=limit) == [n.soup for n in nodes]
+
+    # using select
+    # - creates nodes for every soup object
+    # - leverages caching
+    # return root.select(self.css_rule, limit=limit) == nodes
+
+
 def generate_unique_selectors_for_nodes(
     nodes: list[Node], roots, complexity: int
 ) -> typing.Generator[Selector, None, None]:
diff --git a/mlscraper/training.py b/mlscraper/training.py
index d9b1b8a..7dabdb2 100644
--- a/mlscraper/training.py
+++ b/mlscraper/training.py
@@ -149,6 +149,9 @@ def train_scraper_for_matches(matches, roots, complexity: int):
     # roots are the original roots(?)
     scraper_per_key = {}
     for k in keys:
+        # todo we get the same match combinations repeatedly
+        # maybe caching uniquely_selects helps
+        # but it is better to store the actual scraper
         matches_per_key = [m.match_by_key[k] for m in matches]
         logging.info(f"training key for DictScraper ({k=})")
         logging.info(f"matches for key: {matches_per_key=}")
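The limit = len(nodes) + 1 trick that moves into _uniquely_selects is worth spelling out: fetching exactly len(nodes) results would make an over-broad rule look correct whenever the expected nodes are a prefix of its matches, while fetching one extra result exposes the mismatch without materializing every match. A small illustration with BeautifulSoup (which mlscraper builds on; the HTML here is made up):

from bs4 import BeautifulSoup

soup = BeautifulSoup("<ul><li>a</li><li>b</li><li>c</li><li>d</li></ul>", "html.parser")
expected = soup.select("li")[:3]  # pretend only the first three <li> are wanted

# with limit == len(expected), the over-broad "li" rule goes undetected
# because the expected nodes are a prefix of its matches
assert soup.select("li", limit=len(expected)) == expected

# fetching one extra result exposes the fourth match
assert soup.select("li", limit=len(expected) + 1) != expected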