author     Karl Lorey <git@karllorey.com>  2022-07-07 13:33:27 +0200
committer  Karl Lorey <git@karllorey.com>  2022-07-07 13:33:27 +0200
commit     6fe400dc1aac65169664893a81a23e61a3aed5fc
tree       08ca62d0832524e45749afe9156909c92450b601
parent     20c492c62dd7457c485c3bf79e28a81ea135d84d
Cache uniquely_selects to work around re-running train_scrapers
 mlscraper/selectors.py | 35
 mlscraper/training.py  |  3
 2 files changed, 23 insertions(+), 15 deletions(-)
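The change replaces the body of CssRuleSelector.uniquely_selects with a call to a module-level helper wrapped in functools.lru_cache, so repeated checks of the same CSS rule against the same root and nodes are answered from the cache instead of re-running the selection. A minimal sketch of that memoization pattern follows (hypothetical names, not the mlscraper code); lru_cache keys on its arguments, which is why the commit converts nodes to a tuple (lists are unhashable) and why root must be hashable as well:

# Sketch of the memoization pattern applied in this commit
# (hypothetical names, not the actual mlscraper code).
import functools

@functools.lru_cache(maxsize=10000)
def _cached_selection_check(css_rule: str, nodes: tuple) -> bool:
    # stand-in for the real check, which would run a CSS select
    # against the document and compare the result to the expected nodes
    print(f"computing check for {css_rule=}")
    return len(nodes) > 0

_cached_selection_check("div.item", (1, 2, 3))  # computed, prints once
_cached_selection_check("div.item", (1, 2, 3))  # identical args: cache hit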
diff --git a/mlscraper/selectors.py b/mlscraper/selectors.py
index a30e669..5527b42 100644
--- a/mlscraper/selectors.py
+++ b/mlscraper/selectors.py
@@ -50,26 +50,31 @@ class CssRuleSelector(Selector):
         return node.select(self.css_rule)
 
     def uniquely_selects(self, root: Node, nodes: typing.Collection[Node]):
-        # limit +1
-        # ensures mismatch if selection result starts with nodes
-        # e.g. select returns [1,2,3,...] and nodes is [1,2,3]
-        # but decreases load with many results significantly
-        limit = len(nodes) + 1
-
-        # directly using soups:
-        # - avoids creating nodes for all selects
-        # - increases caching effort
-        return root.soup.select(self.css_rule, limit=limit) == [n.soup for n in nodes]
-
-        # using select
-        # - creates nodes for every soup object
-        # - leverages caching
-        # return root.select(self.css_rule, limit=limit) == nodes
+        return _uniquely_selects(self.css_rule, root, tuple(nodes))
 
     def __repr__(self):
         return f"<{self.__class__.__name__} {self.css_rule=}>"
 
 
+@functools.lru_cache(10000)
+def _uniquely_selects(css_rule, root, nodes):
+    # limit +1
+    # ensures mismatch if selection result starts with nodes
+    # e.g. select returns [1,2,3,...] and nodes is [1,2,3]
+    # but decreases load with many results significantly
+    limit = len(nodes) + 1
+
+    # directly using soups:
+    # - avoids creating nodes for all selects
+    # - increases caching effort
+    return root.soup.select(css_rule, limit=limit) == [n.soup for n in nodes]
+
+    # using select
+    # - creates nodes for every soup object
+    # - leverages caching
+    # return root.select(self.css_rule, limit=limit) == nodes
+
+
 def generate_unique_selectors_for_nodes(
     nodes: list[Node], roots, complexity: int
 ) -> typing.Generator[Selector, None, None]:
diff --git a/mlscraper/training.py b/mlscraper/training.py
index d9b1b8a..7dabdb2 100644
--- a/mlscraper/training.py
+++ b/mlscraper/training.py
@@ -149,6 +149,9 @@ def train_scraper_for_matches(matches, roots, complexity: int):
     # roots are the original roots(?)
     scraper_per_key = {}
     for k in keys:
+        # todo we get the same match combinations repeatedly
+        # maybe caching uniquely_selects helps
+        # but it is better to store the actual scraper
         matches_per_key = [m.match_by_key[k] for m in matches]
         logging.info(f"training key for DictScraper ({k=})")
         logging.info(f"matches for key: {matches_per_key=}")
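The limit = len(nodes) + 1 trick that moves into _uniquely_selects is worth spelling out: fetching exactly len(nodes) results would make an over-broad rule look correct whenever the expected nodes are a prefix of its matches, while fetching one extra result exposes the mismatch without materializing every match. A small illustration with BeautifulSoup (which mlscraper builds on; the HTML here is made up):

from bs4 import BeautifulSoup

soup = BeautifulSoup("<ul><li>a</li><li>b</li><li>c</li><li>d</li></ul>", "html.parser")
expected = soup.select("li")[:3]  # pretend only the first three <li> are wanted

# with limit == len(expected), the over-broad "li" rule goes undetected
# because the expected nodes are a prefix of its matches
assert soup.select("li", limit=len(expected)) == expected

# fetching one extra result exposes the fourth match
assert soup.select("li", limit=len(expected) + 1) != expected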