summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Karl Lorey <git@karllorey.com> 2022-07-07 16:40:15 +0200
committer: Karl Lorey <git@karllorey.com> 2022-07-07 16:40:15 +0200
commit: 35495fa864ec509eca48d4191d0d91175e387857 (patch)
tree: 0e0d198f00b4519b8909da7c9cff15bcdfd3c398
parent: 9c836b12e2c6aacd3e28577db8681b45d7d17aaf (diff)
Fix bug where ListScraper mistakenly exits early due to dissimilar roots. Fixes #25
-rw-r--r-- mlscraper/training.py 12
1 files changed, 6 insertions, 6 deletions
diff --git a/mlscraper/training.py b/mlscraper/training.py
index 7dabdb2..b71d577 100644
--- a/mlscraper/training.py
+++ b/mlscraper/training.py
@@ -70,7 +70,7 @@ def train_scraper(training_set: TrainingSet, complexity=100):
scraper = train_scraper_for_matches(match_combination, roots, complexity)
return scraper
except NoScraperFoundException:
- logging.info(
+ logging.exception(
"no scraper found "
"for complexity and match_combination "
f"({complexity=}, {match_combination=})"
@@ -97,9 +97,6 @@ def train_scraper_for_matches(matches, roots, complexity: int):
assert len(matches) == len(roots), f"got uneven inputs ({matches=}, {roots=})"
- if len({m.root.soup.name for m in matches}) != 1:
- raise NoScraperFoundException("different names found")
-
if any(c1.has_overlap(c2) for c1, c2 in combinations(matches, 2)):
raise NoScraperFoundException("a pair of matches overlaps, most likely invalid")
@@ -152,10 +149,13 @@ def train_scraper_for_matches(matches, roots, complexity: int):
# todo we get the same match combinations repeatedly
# maybe caching uniquely_selects helps
# but it is better to store the actual scraper
- matches_per_key = [m.match_by_key[k] for m in matches]
logging.info(f"training key for DictScraper ({k=})")
+ matches_per_key = [m.match_by_key[k] for m in matches]
logging.info(f"matches for key: {matches_per_key=}")
- scraper = train_scraper_for_matches(matches_per_key, roots, complexity)
+ try:
+ scraper = train_scraper_for_matches(matches_per_key, roots, complexity)
+ except NoScraperFoundException as e:
+ raise NoScraperFoundException(f'Training DictScraper failed ({k=})') from e
scraper_per_key[k] = scraper
logging.info(f"found DictScraper ({scraper_per_key=})")
return DictScraper(scraper_per_key)