diff options
author | Karl Lorey <git@karllorey.com> | 2022-06-23 17:27:13 +0200 |
---|---|---|
committer | Karl Lorey <git@karllorey.com> | 2022-06-23 17:27:13 +0200 |
commit | 13a11d65d477abd347a4b24d21216985631bd446 (patch) | |
tree | ebe80fc10dec93227d32216e8f37c374937e805b | |
parent | d08a69e478582ebd2be5ac3ff7c83591ef86bf8c (diff) |
Fix selection of arbitrary text within nodes (for now)
-rw-r--r-- | mlscraper/html.py | 6 | ||||
-rw-r--r-- | mlscraper/matches.py | 11 | ||||
-rw-r--r-- | tests/test_html.py | 13 |
3 files changed, 20 insertions, 10 deletions
diff --git a/mlscraper/html.py b/mlscraper/html.py index 5ee51be..807a35e 100644 --- a/mlscraper/html.py +++ b/mlscraper/html.py @@ -20,7 +20,7 @@ class HTMLMatch(ABC): @dataclass -class HTMLTextMatch(HTMLMatch): +class HTMLExactTextMatch(HTMLMatch): pass @@ -54,11 +54,11 @@ class Node: # text # - since text matches including whitespace, a regex is used for soup_node in self.soup.find_all( - string=re.compile(r"\s*%s\s*" % html.escape(item)) + string=re.compile(r"^\s*%s\s*$" % html.escape(item)) ): # use parent node as found text is NaviableString and not Tag node = self._page._get_node_for_soup(soup_node.parent) - yield HTMLTextMatch(node) + yield HTMLExactTextMatch(node) # attributes for soup_node in self.soup.find_all(): diff --git a/mlscraper/matches.py b/mlscraper/matches.py index 1301375..c5145c1 100644 --- a/mlscraper/matches.py +++ b/mlscraper/matches.py @@ -9,7 +9,7 @@ from itertools import combinations from mlscraper.html import get_relative_depth from mlscraper.html import get_root_node from mlscraper.html import HTMLAttributeMatch -from mlscraper.html import HTMLTextMatch +from mlscraper.html import HTMLExactTextMatch from mlscraper.html import Node @@ -151,15 +151,16 @@ def generate_all_value_matches( logging.info(f"generating all value matches ({node=}, {item=})") for html_match in node.find_all(item): matched_node = html_match.node - if isinstance(html_match, HTMLTextMatch): + if isinstance(html_match, HTMLExactTextMatch): extractor = TextValueExtractor() + yield ValueMatch(matched_node, extractor) elif isinstance(html_match, HTMLAttributeMatch): extractor = AttributeValueExtractor(html_match.attr) + yield ValueMatch(matched_node, extractor) else: - raise RuntimeError( - f"unknown match type ({html_match=}, {type(html_match)=})" + logging.warning( + f"Cannot deal with HTMLMatch type, ignoring ({html_match=}, {type(html_match)=}))" ) - yield ValueMatch(matched_node, extractor) def is_disjoint_match_combination(matches): diff --git a/tests/test_html.py b/tests/test_html.py index faeaef8..e976ed5 100644 --- a/tests/test_html.py +++ b/tests/test_html.py @@ -1,6 +1,6 @@ from bs4 import BeautifulSoup from mlscraper.html import _get_root_of_nodes -from mlscraper.html import HTMLTextMatch +from mlscraper.html import HTMLExactTextMatch from mlscraper.html import Page from mlscraper.html import selector_matches_nodes from mlscraper.matches import AttributeValueExtractor @@ -98,4 +98,13 @@ def test_find_text_with_whitespace(): page = Page(html) html_matches = page.find_all("whitespace") assert len(html_matches) == 1 - assert isinstance(html_matches[0], HTMLTextMatch) + assert isinstance(html_matches[0], HTMLExactTextMatch) + + +def test_find_text_with_noise(): + html = b"<html><body><p>bla karl bla</p></body></html>" + page = Page(html) + assert all( + not isinstance(html_match, HTMLExactTextMatch) + for html_match in page.find_all("karl") + ) |