summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author Karl Lorey <git@karllorey.com> 2022-06-24 12:58:48 +0200
committer Karl Lorey <git@karllorey.com> 2022-06-24 12:58:48 +0200
commit aa1ac21a0ede6f4f6a4282fcb07f87d706186817 (patch)
tree 011a746160630d89b545c1a36b01051c09d7a727
parent f0d841c49891355259b510de7d3490b20fc3e61b (diff)
Avoid matching numbers inside image dimensions
-rw-r--r-- mlscraper/matches.py 10
-rw-r--r-- mlscraper/samples.py 7
-rw-r--r-- tests/test_matches.py 19
3 files changed, 36 insertions, 0 deletions
diff --git a/mlscraper/matches.py b/mlscraper/matches.py
index 71f6583..8ebc4e9 100644
--- a/mlscraper/matches.py
+++ b/mlscraper/matches.py
@@ -226,3 +226,13 @@ def is_disjoint_match_combination(matches):
Check if the given matches have no overlap.
"""
return all(not m1.has_overlap(m2) for m1, m2 in combinations(matches, 2))
+
+
+def is_dimensions_match(m: Match):
+    """
+    Check if the given match reads a ``width`` or ``height`` attribute.
+
+    Only a ValueMatch whose extractor is an AttributeValueExtractor can
+    qualify; any other match yields False.
+    """
+    dimension_attrs = ["width", "height"]
+    return (
+        isinstance(m, ValueMatch)
+        and isinstance(m.extractor, AttributeValueExtractor)
+        and m.extractor.attr in dimension_attrs
+    )
diff --git a/mlscraper/samples.py b/mlscraper/samples.py
index a85e11b..228e26a 100644
--- a/mlscraper/samples.py
+++ b/mlscraper/samples.py
@@ -5,6 +5,7 @@ from itertools import product
from mlscraper.html import Page
from mlscraper.matches import DictMatch
from mlscraper.matches import generate_all_value_matches
+from mlscraper.matches import is_dimensions_match
from mlscraper.matches import is_disjoint_match_combination
from mlscraper.matches import ListMatch
@@ -29,7 +30,13 @@ class Sample:
# todo: fix creating new sample objects, maybe by using Item class?
if isinstance(self.value, str):
+ # generate all matches
value_matches = list(generate_all_value_matches(self.page, self.value))
+
+ # filter out dimensions like width/height
+ value_matches = [vm for vm in value_matches if not is_dimensions_match(vm)]
+
+ # raise if not found
logging.info(
f"found {len(value_matches)=} on page ({self.value=}, {self.page=})"
)
diff --git a/tests/test_matches.py b/tests/test_matches.py
index e69de29..f361d5b 100644
--- a/tests/test_matches.py
+++ b/tests/test_matches.py
@@ -0,0 +1,19 @@
+from mlscraper.html import Page
+from mlscraper.matches import AttributeValueExtractor
+from mlscraper.matches import generate_all_value_matches
+from mlscraper.matches import is_dimensions_match
+from mlscraper.matches import ValueMatch
+
+
+def test_is_dimensions_match_plain():
+    """A ValueMatch using a ``height`` attribute extractor is a dimensions match."""
+    height_match = ValueMatch(None, AttributeValueExtractor("height"))
+    assert is_dimensions_match(height_match)
+
+
+def test_is_dimensions_match_generation():
+    """All matches generated for an image's width/height value get filtered out."""
+    # The img tag is explicitly closed so the test does not depend on how the
+    # HTML parser recovers from malformed markup.
+    page = Page(b'<html><body><img height="20" width="20"></body></html>')
+    matches_unfiltered = list(generate_all_value_matches(page, "20"))
+    # Sanity check: the value must be found before filtering, otherwise the
+    # final assertion would pass vacuously.
+    assert matches_unfiltered
+    matches = [m for m in matches_unfiltered if not is_dimensions_match(m)]
+    assert not matches