diff options
author | Karl Lorey <git@karllorey.com> | 2022-06-21 16:59:05 +0200 |
---|---|---|
committer | Karl Lorey <git@karllorey.com> | 2022-06-21 16:59:05 +0200 |
commit | 9c22147fd8f7c451ed8627af8bcb8c7eab5ba8e0 (patch) | |
tree | 8b9d2bfe8d0d8013b0a5c5781ddacbfc4ff18288 | |
parent | f58ddd5464e170d2570286f5e4d9b3d6d53efb5d (diff) |
Ignore whitespace around values when searching for matches in HTML
-rw-r--r-- | mlscraper/html.py | 19 | ||||
-rw-r--r-- | mlscraper/matches.py | 10 | ||||
-rw-r--r-- | tests/test_html.py | 17 |
3 files changed, 30 insertions, 16 deletions
```diff
diff --git a/mlscraper/html.py b/mlscraper/html.py
index 98201a9..5ee51be 100644
--- a/mlscraper/html.py
+++ b/mlscraper/html.py
@@ -2,7 +2,9 @@
 Encapsulation of html-related functionality.
 BeautifulSoup should only get used here.
 """
+import html
 import logging
+import re
 from abc import ABC
 from dataclasses import dataclass
 from functools import cached_property
@@ -13,17 +15,17 @@ from bs4 import Tag
 
 
 @dataclass
-class Match(ABC):
+class HTMLMatch(ABC):
     node: "Node" = None
 
 
 @dataclass
-class TextMatch(Match):
+class HTMLTextMatch(HTMLMatch):
     pass
 
 
 @dataclass
-class AttributeMatch(Match):
+class HTMLAttributeMatch(HTMLMatch):
     attr: str = None
 
 
@@ -43,24 +45,27 @@ class Node:
     def text(self):
         return self.soup.text
 
-    def find_all(self, item) -> list[Match]:
+    def find_all(self, item) -> list[HTMLMatch]:
         return list(self._generate_find_all(item))
 
     def _generate_find_all(self, item):
         assert isinstance(item, str), "can only search for str at the moment"
 
         # text
-        for soup_node in self.soup.find_all(string=item):
+        # - since text matches including whitespace, a regex is used
+        for soup_node in self.soup.find_all(
+            string=re.compile(r"\s*%s\s*" % html.escape(item))
+        ):
             # use parent node as found text is NaviableString and not Tag
             node = self._page._get_node_for_soup(soup_node.parent)
-            yield TextMatch(node)
+            yield HTMLTextMatch(node)
 
         # attributes
         for soup_node in self.soup.find_all():
             for attr in soup_node.attrs:
                 if soup_node[attr] == item:
                     node = self._page._get_node_for_soup(soup_node)
-                    yield AttributeMatch(node, attr)
+                    yield HTMLAttributeMatch(node, attr)
 
         # todo implement other find methods
 
diff --git a/mlscraper/matches.py b/mlscraper/matches.py
index 36da4b4..1301375 100644
--- a/mlscraper/matches.py
+++ b/mlscraper/matches.py
@@ -6,11 +6,11 @@ import typing
 from functools import cached_property
 from itertools import combinations
 
-from mlscraper.html import AttributeMatch
 from mlscraper.html import get_relative_depth
 from mlscraper.html import get_root_node
+from mlscraper.html import HTMLAttributeMatch
+from mlscraper.html import HTMLTextMatch
 from mlscraper.html import Node
-from mlscraper.html import TextMatch
 
 
 class Match:
@@ -61,7 +61,7 @@ class TextValueExtractor(Extractor):
     """
 
     def extract(self, node: Node):
-        return node.soup.text
+        return node.soup.text.strip()
 
     def __repr__(self):
         return f"<{self.__class__.__name__}>"
@@ -151,9 +151,9 @@ def generate_all_value_matches(
     logging.info(f"generating all value matches ({node=}, {item=})")
     for html_match in node.find_all(item):
         matched_node = html_match.node
-        if isinstance(html_match, TextMatch):
+        if isinstance(html_match, HTMLTextMatch):
             extractor = TextValueExtractor()
-        elif isinstance(html_match, AttributeMatch):
+        elif isinstance(html_match, HTMLAttributeMatch):
             extractor = AttributeValueExtractor(html_match.attr)
         else:
             raise RuntimeError(
diff --git a/tests/test_html.py b/tests/test_html.py
index 678a5f9..faeaef8 100644
--- a/tests/test_html.py
+++ b/tests/test_html.py
@@ -1,5 +1,6 @@
 from bs4 import BeautifulSoup
 from mlscraper.html import _get_root_of_nodes
+from mlscraper.html import HTMLTextMatch
 from mlscraper.html import Page
 from mlscraper.html import selector_matches_nodes
 from mlscraper.matches import AttributeValueExtractor
@@ -7,7 +8,7 @@ from mlscraper.matches import AttributeValueExtractor
 
 def test_get_root_of_nodes():
     soup = BeautifulSoup(
-        '<html><body><div><p id="one"></p><p><span id="two"></span></p></div></body></html>',
+        b'<html><body><div><p id="one"></p><p><span id="two"></span></p></div></body></html>',
         "lxml",
     )
     node_1 = soup.select_one("#one")
@@ -49,13 +50,13 @@ def test_extractor_factory():
 
 def test_equality():
     # we want to make sure that equal html does not result in equality
-    same_html = "<html><body><div><p></p></div></body></html>"
+    same_html = b"<html><body><div><p></p></div></body></html>"
     assert Page(same_html) == Page(same_html)
     assert Page(same_html) is not Page(same_html)
 
 
 def test_select():
-    html = "<html><body><p></p><p></p></body></html>"
+    html = b"<html><body><p></p><p></p></body></html>"
     page = Page(html)
     p_tag_nodes = page.select("p")
     assert len(p_tag_nodes) == 2
@@ -78,7 +79,7 @@ def test_classes():
 
 
 def test_selector_matches_nodes():
-    html = "<html><body><p>1</p><p>2</p></body></html>"
+    html = b"<html><body><p>1</p><p>2</p></body></html>"
     page = Page(html)
     p_tags = page.select("p")
 
@@ -90,3 +91,11 @@ def test_selector_matches_nodes():
     assert not selector_matches_nodes(
         page, "p", p_tags_reversed
     ), "matches reversed order"
+
+
+def test_find_text_with_whitespace():
+    html = b"<html><body><p> whitespace \n\t </p></body></html>"
+    page = Page(html)
+    html_matches = page.find_all("whitespace")
+    assert len(html_matches) == 1
+    assert isinstance(html_matches[0], HTMLTextMatch)
```