diff options
author | Karl Lorey <git@karllorey.com> | 2022-06-24 01:06:37 +0200 |
---|---|---|
committer | Karl Lorey <git@karllorey.com> | 2022-06-24 01:06:37 +0200 |
commit | d567ea3ecf5f1bf7e42e4d93def6052b7adc5fa2 (patch) | |
tree | 22b98378578c1589b07ba725c873fe59c39e455f | |
parent | c54208e15c2b303cced2698ff331962d243b6329 (diff) |
Also match all parents that contain the same text
-rw-r--r-- | mlscraper/html.py | 9 | ||||
-rw-r--r-- | tests/test_html.py | 6 |
2 files changed, 10 insertions, 5 deletions
```diff
diff --git a/mlscraper/html.py b/mlscraper/html.py
index c0eede3..72f95f0 100644
--- a/mlscraper/html.py
+++ b/mlscraper/html.py
@@ -61,13 +61,16 @@ class Node:
 
         # text
         # - since text matches including whitespace, a regex is used
-        for soup_node in self.soup.find_all(
-            string=re.compile(r"^\s*%s\s*$" % html.escape(item))
-        ):
+        target_regex = re.compile(r"^\s*%s\s*$" % html.escape(item))
+        for soup_node in self.soup.find_all(string=target_regex):
             # use parent node as found text is NaviableString and not Tag
             node = self._page._get_node_for_soup(soup_node.parent)
             yield HTMLExactTextMatch(node)
 
+            for p in node.parents:
+                if p.text.strip() == node.text.strip():
+                    yield HTMLExactTextMatch(p)
+
         # attributes
         for soup_node in self.soup.find_all():
             for attr in soup_node.attrs:
diff --git a/tests/test_html.py b/tests/test_html.py
index 7780504..ab7a6b3 100644
--- a/tests/test_html.py
+++ b/tests/test_html.py
@@ -102,8 +102,10 @@ def test_find_text_with_whitespace():
     html = b"<html><body><p> whitespace \n\t </p></body></html>"
     page = Page(html)
     html_matches = page.find_all("whitespace")
-    assert len(html_matches) == 1
-    assert isinstance(html_matches[0], HTMLExactTextMatch)
+
+    # should match p, body, html (and document)
+    assert len(html_matches) == 4
+    assert all(isinstance(hm, HTMLExactTextMatch) for hm in html_matches)
 
 
 def test_find_text_with_noise():
```