Ignore whitespace around values when searching for matches in HTML

author: Karl Lorey <git@karllorey.com> 2022-06-21 16:59:05 +0200
committer: Karl Lorey <git@karllorey.com> 2022-06-21 16:59:05 +0200
commit: 9c22147fd8f7c451ed8627af8bcb8c7eab5ba8e0 (patch)
tree: 8b9d2bfe8d0d8013b0a5c5781ddacbfc4ff18288
parent: f58ddd5464e170d2570286f5e4d9b3d6d53efb5d (diff)
3 files changed, 30 insertions, 16 deletions
diff --git a/mlscraper/html.py b/mlscraper/html.py
index 98201a9..5ee51be 100644
--- a/mlscraper/html.py
+++ b/mlscraper/html.py
@@ -2,7 +2,9 @@
 Encapsulation of html-related functionality.
 BeautifulSoup should only get used here.
 """
+import html
 import logging
+import re
 from abc import ABC
 from dataclasses import dataclass
 from functools import cached_property
@@ -13,17 +15,17 @@ from bs4 import Tag
 
 
 @dataclass
-class Match(ABC):
+class HTMLMatch(ABC):
     node: "Node" = None
 
 
 @dataclass
-class TextMatch(Match):
+class HTMLTextMatch(HTMLMatch):
     pass
 
 
 @dataclass
-class AttributeMatch(Match):
+class HTMLAttributeMatch(HTMLMatch):
     attr: str = None
 
 
@@ -43,24 +45,27 @@ class Node:
     def text(self):
         return self.soup.text
 
-    def find_all(self, item) -> list[Match]:
+    def find_all(self, item) -> list[HTMLMatch]:
         return list(self._generate_find_all(item))
 
     def _generate_find_all(self, item):
         assert isinstance(item, str), "can only search for str at the moment"
 
         # text
-        for soup_node in self.soup.find_all(string=item):
+        # - text nodes may contain surrounding whitespace, so a regex is used
+        for soup_node in self.soup.find_all(
+            string=re.compile(r"\s*%s\s*" % html.escape(item))
+        ):
             # use parent node as found text is NaviableString and not Tag
             node = self._page._get_node_for_soup(soup_node.parent)
-            yield TextMatch(node)
+            yield HTMLTextMatch(node)
 
         # attributes
         for soup_node in self.soup.find_all():
             for attr in soup_node.attrs:
                 if soup_node[attr] == item:
                     node = self._page._get_node_for_soup(soup_node)
-                    yield AttributeMatch(node, attr)
+                    yield HTMLAttributeMatch(node, attr)
 
         # todo implement other find methods
 
diff --git a/mlscraper/matches.py b/mlscraper/matches.py
index 36da4b4..1301375 100644
--- a/mlscraper/matches.py
+++ b/mlscraper/matches.py
@@ -6,11 +6,11 @@ import typing
 from functools import cached_property
 from itertools import combinations
 
-from mlscraper.html import AttributeMatch
 from mlscraper.html import get_relative_depth
 from mlscraper.html import get_root_node
+from mlscraper.html import HTMLAttributeMatch
+from mlscraper.html import HTMLTextMatch
 from mlscraper.html import Node
-from mlscraper.html import TextMatch
 
 
 class Match:
@@ -61,7 +61,7 @@ class TextValueExtractor(Extractor):
     """
 
     def extract(self, node: Node):
-        return node.soup.text
+        return node.soup.text.strip()
 
     def __repr__(self):
         return f"<{self.__class__.__name__}>"
@@ -151,9 +151,9 @@ def generate_all_value_matches(
     logging.info(f"generating all value matches ({node=}, {item=})")
     for html_match in node.find_all(item):
         matched_node = html_match.node
-        if isinstance(html_match, TextMatch):
+        if isinstance(html_match, HTMLTextMatch):
             extractor = TextValueExtractor()
-        elif isinstance(html_match, AttributeMatch):
+        elif isinstance(html_match, HTMLAttributeMatch):
             extractor = AttributeValueExtractor(html_match.attr)
         else:
             raise RuntimeError(
diff --git a/tests/test_html.py b/tests/test_html.py
index 678a5f9..faeaef8 100644
--- a/tests/test_html.py
+++ b/tests/test_html.py
@@ -1,5 +1,6 @@
 from bs4 import BeautifulSoup
 from mlscraper.html import _get_root_of_nodes
+from mlscraper.html import HTMLTextMatch
 from mlscraper.html import Page
 from mlscraper.html import selector_matches_nodes
 from mlscraper.matches import AttributeValueExtractor
@@ -7,7 +8,7 @@ from mlscraper.matches import AttributeValueExtractor
 
 def test_get_root_of_nodes():
     soup = BeautifulSoup(
-        '<html><body><div><p id="one"></p><p><span id="two"></span></p></div></body></html>',
+        b'<html><body><div><p id="one"></p><p><span id="two"></span></p></div></body></html>',
         "lxml",
     )
     node_1 = soup.select_one("#one")
@@ -49,13 +50,13 @@ def test_extractor_factory():
 
 def test_equality():
     # we want to make sure that equal html does not result in equality
-    same_html = "<html><body><div><p></p></div></body></html>"
+    same_html = b"<html><body><div><p></p></div></body></html>"
     assert Page(same_html) == Page(same_html)
     assert Page(same_html) is not Page(same_html)
 
 
 def test_select():
-    html = "<html><body><p></p><p></p></body></html>"
+    html = b"<html><body><p></p><p></p></body></html>"
     page = Page(html)
     p_tag_nodes = page.select("p")
     assert len(p_tag_nodes) == 2
@@ -78,7 +79,7 @@ def test_classes():
 
 
 def test_selector_matches_nodes():
-    html = "<html><body><p>1</p><p>2</p></body></html>"
+    html = b"<html><body><p>1</p><p>2</p></body></html>"
     page = Page(html)
 
     p_tags = page.select("p")
@@ -90,3 +91,11 @@ def test_selector_matches_nodes():
     assert not selector_matches_nodes(
         page, "p", p_tags_reversed
     ), "matches reversed order"
+
+
+def test_find_text_with_whitespace():
+    html = b"<html><body><p>    whitespace  \n\t </p></body></html>"
+    page = Page(html)
+    html_matches = page.find_all("whitespace")
+    assert len(html_matches) == 1
+    assert isinstance(html_matches[0], HTMLTextMatch)
author	Karl Lorey <git@karllorey.com>	2022-06-21 16:59:05 +0200
committer	Karl Lorey <git@karllorey.com>	2022-06-21 16:59:05 +0200
commit	9c22147fd8f7c451ed8627af8bcb8c7eab5ba8e0 (patch)
tree	8b9d2bfe8d0d8013b0a5c5781ddacbfc4ff18288
parent	f58ddd5464e170d2570286f5e4d9b3d6d53efb5d (diff)