diff options
author | Karl Lorey <git@karllorey.com> | 2022-07-07 16:40:15 +0200 |
---|---|---|
committer | Karl Lorey <git@karllorey.com> | 2022-07-07 16:40:15 +0200 |
commit | 35495fa864ec509eca48d4191d0d91175e387857 (patch) | |
tree | 0e0d198f00b4519b8909da7c9cff15bcdfd3c398 | |
parent | 9c836b12e2c6aacd3e28577db8681b45d7d17aaf (diff) |
Fix bug where ListScraper mistakenly exits early due to dissimilar roots. Fixes #25
-rw-r--r-- | mlscraper/training.py | 12 |
1 file changed, 6 insertions, 6 deletions
diff --git a/mlscraper/training.py b/mlscraper/training.py
index 7dabdb2..b71d577 100644
--- a/mlscraper/training.py
+++ b/mlscraper/training.py
@@ -70,7 +70,7 @@ def train_scraper(training_set: TrainingSet, complexity=100):
             scraper = train_scraper_for_matches(match_combination, roots, complexity)
             return scraper
         except NoScraperFoundException:
-            logging.info(
+            logging.exception(
                 "no scraper found "
                 "for complexity and match_combination "
                 f"({complexity=}, {match_combination=})"
@@ -97,9 +97,6 @@ def train_scraper_for_matches(matches, roots, complexity: int):
     assert len(matches) == len(roots), f"got uneven inputs ({matches=}, {roots=})"

-    if len({m.root.soup.name for m in matches}) != 1:
-        raise NoScraperFoundException("different names found")
-
     if any(c1.has_overlap(c2) for c1, c2 in combinations(matches, 2)):
         raise NoScraperFoundException("a pair of matches overlaps, most likely invalid")

@@ -152,10 +149,13 @@ def train_scraper_for_matches(matches, roots, complexity: int):
             # todo we get the same match combinations repeatedly
             #  maybe caching uniquely_selects helps
             #  but it is better to store the actual scraper
-            matches_per_key = [m.match_by_key[k] for m in matches]
             logging.info(f"training key for DictScraper ({k=})")
+            matches_per_key = [m.match_by_key[k] for m in matches]
             logging.info(f"matches for key: {matches_per_key=}")
-            scraper = train_scraper_for_matches(matches_per_key, roots, complexity)
+            try:
+                scraper = train_scraper_for_matches(matches_per_key, roots, complexity)
+            except NoScraperFoundException as e:
+                raise NoScraperFoundException(f'Training DictScraper failed ({k=})') from e
             scraper_per_key[k] = scraper
             logging.info(f"found DictScraper ({scraper_per_key=})")
             return DictScraper(scraper_per_key)