summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Karl Lorey <git@karllorey.com> 2020-09-26 21:52:18 +0200
committer Karl Lorey <git@karllorey.com> 2020-09-26 21:52:18 +0200
commit a0606b9e7fd9319a1279fb585b351049bc2b259b (patch)
tree 82528d36c3edcdb38882a97399a6cff58b94b802
parent efad34d92b0bae12ee1fd7b66752e6fc6b7bc727 (diff)
Use heuristic to prefer simple rules in single-item scraper
-rw-r--r-- mlscraper/__init__.py 22
1 files changed, 10 insertions, 12 deletions
diff --git a/mlscraper/__init__.py b/mlscraper/__init__.py
index db52b0a..3324195 100644
--- a/mlscraper/__init__.py
+++ b/mlscraper/__init__.py
@@ -69,22 +69,19 @@ class RuleBasedSingleItemScraper:
# get all potential matches
matching_nodes = flatten([s.page.find(s.item[attr]) for s in samples])
- selectors = set(
- chain(
- *(
- generate_unique_path_selectors(node._soup_node)
- for node in matching_nodes
- )
- )
+ # since uniqueness requires selection over and over, we don't use generate_unique_path... here
+ path_selector_generator = (
+ generate_path_selectors(node._soup_node) for node in matching_nodes
)
+ selectors = set(chain(*path_selector_generator))
# check if they are unique on every page
# -> for all potential selectors: compute score
selector_scoring = {} # selector -> score
- for selector in selectors:
+ for i, selector in enumerate(selectors):
if selector not in selector_scoring:
- logging.info("testing %s" % selector)
- matches_per_page = [s.page.select(selector) for s in samples]
+ logging.info("testing %s (%d/%d)", selector, i, len(selectors))
+ matches_per_page = (s.page.select(selector) for s in samples)
matches_per_page_right = [
len(m) == 1 and m[0].get_text() == s.item[attr]
for m, s in zip(matches_per_page, samples)
@@ -94,12 +91,13 @@ class RuleBasedSingleItemScraper:
# find the selector with the best coverage, i.e. the highest accuracy
logging.info("Scoring for %s: %s", attr, selector_scoring)
+ # sort by score (desc) and selector length (asc)
selectors_sorted = sorted(
- selector_scoring, key=selector_scoring.get, reverse=True
+ selector_scoring.items(), key=lambda x: (x[1], -len(x[0])), reverse=True
)
logging.info("Best scores for %s: %s", attr, selectors_sorted[:3])
try:
- selector_best = selectors_sorted[0]
+ selector_best = selectors_sorted[0][0]
if selector_scoring[selector_best] < 1:
logging.warning(
"Best selector for %s does not work for all samples (score is %f)"