Fix selection of arbitrary text within nodes (for now)

author: Karl Lorey <git@karllorey.com> 2022-06-23 17:27:13 +0200
committer: Karl Lorey <git@karllorey.com> 2022-06-23 17:27:13 +0200
commit: 13a11d65d477abd347a4b24d21216985631bd446 (patch)
tree: ebe80fc10dec93227d32216e8f37c374937e805b
parent: d08a69e478582ebd2be5ac3ff7c83591ef86bf8c (diff)
3 files changed, 20 insertions, 10 deletions
diff --git a/mlscraper/html.py b/mlscraper/html.py
index 5ee51be..807a35e 100644
--- a/mlscraper/html.py
+++ b/mlscraper/html.py
@@ -20,7 +20,7 @@ class HTMLMatch(ABC):
 
 
 @dataclass
-class HTMLTextMatch(HTMLMatch):
+class HTMLExactTextMatch(HTMLMatch):
     pass
 
 
@@ -54,11 +54,11 @@ class Node:
         # text
         # - since text matches including whitespace, a regex is used
         for soup_node in self.soup.find_all(
-            string=re.compile(r"\s*%s\s*" % html.escape(item))
+            string=re.compile(r"^\s*%s\s*$" % html.escape(item))
         ):
             # use parent node as found text is NaviableString and not Tag
             node = self._page._get_node_for_soup(soup_node.parent)
-            yield HTMLTextMatch(node)
+            yield HTMLExactTextMatch(node)
 
         # attributes
         for soup_node in self.soup.find_all():
diff --git a/mlscraper/matches.py b/mlscraper/matches.py
index 1301375..c5145c1 100644
--- a/mlscraper/matches.py
+++ b/mlscraper/matches.py
@@ -9,7 +9,7 @@ from itertools import combinations
 from mlscraper.html import get_relative_depth
 from mlscraper.html import get_root_node
 from mlscraper.html import HTMLAttributeMatch
-from mlscraper.html import HTMLTextMatch
+from mlscraper.html import HTMLExactTextMatch
 from mlscraper.html import Node
 
 
@@ -151,15 +151,16 @@ def generate_all_value_matches(
     logging.info(f"generating all value matches ({node=}, {item=})")
     for html_match in node.find_all(item):
         matched_node = html_match.node
-        if isinstance(html_match, HTMLTextMatch):
+        if isinstance(html_match, HTMLExactTextMatch):
             extractor = TextValueExtractor()
+            yield ValueMatch(matched_node, extractor)
         elif isinstance(html_match, HTMLAttributeMatch):
             extractor = AttributeValueExtractor(html_match.attr)
+            yield ValueMatch(matched_node, extractor)
         else:
-            raise RuntimeError(
-                f"unknown match type ({html_match=}, {type(html_match)=})"
+            logging.warning(
+                f"Cannot deal with HTMLMatch type, ignoring ({html_match=}, {type(html_match)=}))"
             )
-        yield ValueMatch(matched_node, extractor)
 
 
 def is_disjoint_match_combination(matches):
diff --git a/tests/test_html.py b/tests/test_html.py
index faeaef8..e976ed5 100644
--- a/tests/test_html.py
+++ b/tests/test_html.py
@@ -1,6 +1,6 @@
 from bs4 import BeautifulSoup
 from mlscraper.html import _get_root_of_nodes
-from mlscraper.html import HTMLTextMatch
+from mlscraper.html import HTMLExactTextMatch
 from mlscraper.html import Page
 from mlscraper.html import selector_matches_nodes
 from mlscraper.matches import AttributeValueExtractor
@@ -98,4 +98,13 @@ def test_find_text_with_whitespace():
     page = Page(html)
     html_matches = page.find_all("whitespace")
     assert len(html_matches) == 1
-    assert isinstance(html_matches[0], HTMLTextMatch)
+    assert isinstance(html_matches[0], HTMLExactTextMatch)
+
+
+def test_find_text_with_noise():
+    html = b"<html><body><p>bla karl bla</p></body></html>"
+    page = Page(html)
+    assert all(
+        not isinstance(html_match, HTMLExactTextMatch)
+        for html_match in page.find_all("karl")
+    )
author	Karl Lorey <git@karllorey.com>	2022-06-23 17:27:13 +0200
committer	Karl Lorey <git@karllorey.com>	2022-06-23 17:27:13 +0200
commit	13a11d65d477abd347a4b24d21216985631bd446 (patch)
tree	ebe80fc10dec93227d32216e8f37c374937e805b
parent	d08a69e478582ebd2be5ac3ff7c83591ef86bf8c (diff)