diff options
author | Karl Lorey <git@karllorey.com> | 2020-09-26 21:52:18 +0200 |
---|---|---|
committer | Karl Lorey <git@karllorey.com> | 2020-09-26 21:52:18 +0200 |
commit | a0606b9e7fd9319a1279fb585b351049bc2b259b (patch) | |
tree | 82528d36c3edcdb38882a97399a6cff58b94b802 | |
parent | efad34d92b0bae12ee1fd7b66752e6fc6b7bc727 (diff) |
Use heuristic to prefer simple rules in single-item scraper
-rw-r--r-- | mlscraper/__init__.py | 22 |
1 file changed, 10 insertions, 12 deletions
diff --git a/mlscraper/__init__.py b/mlscraper/__init__.py index db52b0a..3324195 100644 --- a/mlscraper/__init__.py +++ b/mlscraper/__init__.py @@ -69,22 +69,19 @@ class RuleBasedSingleItemScraper: # get all potential matches matching_nodes = flatten([s.page.find(s.item[attr]) for s in samples]) - selectors = set( - chain( - *( - generate_unique_path_selectors(node._soup_node) - for node in matching_nodes - ) - ) + # since uniqueness requires selection over and over, we don't use generate_unique_path... here + path_selector_generator = ( + generate_path_selectors(node._soup_node) for node in matching_nodes ) + selectors = set(chain(*path_selector_generator)) # check if they are unique on every page # -> for all potential selectors: compute score selector_scoring = {} # selector -> score - for selector in selectors: + for i, selector in enumerate(selectors): if selector not in selector_scoring: - logging.info("testing %s" % selector) - matches_per_page = [s.page.select(selector) for s in samples] + logging.info("testing %s (%d/%d)", selector, i, len(selectors)) + matches_per_page = (s.page.select(selector) for s in samples) matches_per_page_right = [ len(m) == 1 and m[0].get_text() == s.item[attr] for m, s in zip(matches_per_page, samples) @@ -94,12 +91,13 @@ class RuleBasedSingleItemScraper: # find the selector with the best coverage, i.e. the highest accuracy logging.info("Scoring for %s: %s", attr, selector_scoring) + # sort by score (desc) and selector length (asc) selectors_sorted = sorted( - selector_scoring, key=selector_scoring.get, reverse=True + selector_scoring.items(), key=lambda x: (x[1], -len(x[0])), reverse=True ) logging.info("Best scores for %s: %s", attr, selectors_sorted[:3]) try: - selector_best = selectors_sorted[0] + selector_best = selectors_sorted[0][0] if selector_scoring[selector_best] < 1: logging.warning( "Best selector for %s does not work for all samples (score is %f)" |