diff options
author | Karl Lorey <git@karllorey.com> | 2022-06-24 12:58:48 +0200 |
---|---|---|
committer | Karl Lorey <git@karllorey.com> | 2022-06-24 12:58:48 +0200 |
commit | aa1ac21a0ede6f4f6a4282fcb07f87d706186817 (patch) | |
tree | 011a746160630d89b545c1a36b01051c09d7a727 | |
parent | f0d841c49891355259b510de7d3490b20fc3e61b (diff) |
Avoid matching numbers inside image dimensions
-rw-r--r-- | mlscraper/matches.py | 10 | ||||
-rw-r--r-- | mlscraper/samples.py | 7 | ||||
-rw-r--r-- | tests/test_matches.py | 19 |
3 files changed, 36 insertions, 0 deletions
```diff
diff --git a/mlscraper/matches.py b/mlscraper/matches.py
index 71f6583..8ebc4e9 100644
--- a/mlscraper/matches.py
+++ b/mlscraper/matches.py
@@ -226,3 +226,13 @@ def is_disjoint_match_combination(matches):
     Check if the given matches have no overlap.
     """
     return all(not m1.has_overlap(m2) for m1, m2 in combinations(matches, 2))
+
+
+def is_dimensions_match(m: Match):
+    if not isinstance(m, ValueMatch):
+        return False
+
+    if not isinstance(m.extractor, AttributeValueExtractor):
+        return False
+
+    return m.extractor.attr in ["width", "height"]
diff --git a/mlscraper/samples.py b/mlscraper/samples.py
index a85e11b..228e26a 100644
--- a/mlscraper/samples.py
+++ b/mlscraper/samples.py
@@ -5,6 +5,7 @@ from itertools import product
 from mlscraper.html import Page
 from mlscraper.matches import DictMatch
 from mlscraper.matches import generate_all_value_matches
+from mlscraper.matches import is_dimensions_match
 from mlscraper.matches import is_disjoint_match_combination
 from mlscraper.matches import ListMatch
 
@@ -29,7 +30,13 @@ class Sample:
         # todo: fix creating new sample objects, maybe by using Item class?
 
         if isinstance(self.value, str):
+            # generate all matches
             value_matches = list(generate_all_value_matches(self.page, self.value))
+
+            # filter out dimensions like width/height
+            value_matches = [vm for vm in value_matches if not is_dimensions_match(vm)]
+
+            # raise if not found
             logging.info(
                 f"found {len(value_matches)=} on page ({self.value=}, {self.page=})"
             )
diff --git a/tests/test_matches.py b/tests/test_matches.py
index e69de29..f361d5b 100644
--- a/tests/test_matches.py
+++ b/tests/test_matches.py
@@ -0,0 +1,19 @@
+from mlscraper.html import Page
+from mlscraper.matches import AttributeValueExtractor
+from mlscraper.matches import generate_all_value_matches
+from mlscraper.matches import is_dimensions_match
+from mlscraper.matches import ValueMatch
+
+
+def test_is_dimensions_match_plain():
+    extractor = AttributeValueExtractor("height")
+    value_match = ValueMatch(None, extractor)
+    assert is_dimensions_match(value_match)
+
+
+def test_is_dimensions_match_generation():
+    page = Page(b'<html><body><img height="20" width="20"</body></html>')
+    matches_unfiltered = list(generate_all_value_matches(page, "20"))
+    assert matches_unfiltered
+    matches = [m for m in matches_unfiltered if not is_dimensions_match(m)]
+    assert not matches
```