diff options
author | Karl Lorey <git@karllorey.com> | 2022-06-23 21:53:23 +0200 |
---|---|---|
committer | Karl Lorey <git@karllorey.com> | 2022-06-23 21:53:23 +0200 |
commit | a2f481c3481f6445e520e6bbdfafae3bbf94f96b (patch) | |
tree | ea6b44ed7ac418af1a9f11e6891319288a5a475a | |
parent | 789e635aabd126e934af7dbf0b2769bef28d9683 (diff) |
Improve performance by fixing hashing and root computation
-rw-r--r-- | mlscraper/html.py | 68 | ||||
-rw-r--r-- | mlscraper/matches.py | 8 | ||||
-rw-r--r-- | tests/test_html.py | 27 |
3 files changed, 51 insertions, 52 deletions
diff --git a/mlscraper/html.py b/mlscraper/html.py index 807a35e..c0eede3 100644 --- a/mlscraper/html.py +++ b/mlscraper/html.py @@ -11,7 +11,6 @@ from functools import cached_property from bs4 import BeautifulSoup from bs4 import NavigableString -from bs4 import Tag @dataclass @@ -35,13 +34,22 @@ class Node: def __init__(self, soup, page: "Page"): self.soup = soup + self._hash = soup.__hash__() self._page = page @property def root(self): return self._page - @property + @cached_property + def depth(self): + return self.parent.depth + + @cached_property + def parent(self): + return self._page._get_node_for_soup(self.soup.parent) + + @cached_property def text(self): return self.soup.text @@ -91,6 +99,10 @@ class Node: def tag_name(self): return self.soup.name + @property + def html_attributes(self): + return self.soup.attrs + def select(self, css_selector): return [ self._page._get_node_for_soup(n) for n in self.soup.select(css_selector) @@ -106,7 +118,8 @@ class Node: ) def __hash__(self): - return self.soup.__hash__() + return self._hash + # return self.soup.__hash__() # return super().__hash__() def __eq__(self, other): @@ -129,6 +142,10 @@ class Page(Node): super().__init__(soup, self) + @property + def depth(self): + return 0 + def _get_node_for_soup(self, soup) -> Node: if soup not in self._node_registry: self._node_registry[soup] = Node(soup, self) @@ -138,41 +155,18 @@ class Page(Node): def get_root_node(nodes: list[Node]) -> Node: pages = [n._page for n in nodes] assert len(set(pages)) == 1, "different pages found, cannot get a root" - root = _get_root_of_nodes(n.soup for n in nodes) - return pages[0]._get_node_for_soup(root) - - -def _get_root_of_nodes(soups): - soups = list(soups) - assert all(isinstance(n, Tag) for n in soups) - - # root can be node itself, so it has to be added - parent_paths_of_nodes = [[node] + list(node.parents) for node in soups] - # paths are needed from top to bottom - parent_paths_rev = [list(reversed(pp)) for pp in parent_paths_of_nodes] - try: - ancestor = _get_root_of_paths(parent_paths_rev) - except RuntimeError as e: - raise RuntimeError(f"No common ancestor: {soups}") from e - return ancestor - - -def _get_root_of_paths(paths): - """ - Computes the first common ancestor for list of paths. - :param paths: list of list of nodes from top to bottom - :return: first common index or RuntimeError - """ - assert paths - assert all(p for p in paths) - - # go through paths one by one, starting from bottom - for nodes in reversed(list(zip(*paths))): - if len(set(nodes)) == 1: - return nodes[0] - logging.info("failed to find ancestor for : %s", paths) - raise RuntimeError("No common ancestor") + # generate parent paths from top to bottom + # [elem, parent, ancestor, root] + parent_paths = [reversed([n] + n.parents) for n in nodes] + + # start looping from bottom to top + # zip automatically uses common length + # -> last element is the first one, where len(nodes) roots to compare exist + for layer_nodes in reversed(list(zip(*parent_paths))): + if len(set(layer_nodes)) == 1: + return layer_nodes[0] + raise RuntimeError("no root found") def get_relative_depth(node: Node, root: Node): diff --git a/mlscraper/matches.py b/mlscraper/matches.py index c5145c1..59a2318 100644 --- a/mlscraper/matches.py +++ b/mlscraper/matches.py @@ -6,7 +6,6 @@ import typing from functools import cached_property from itertools import combinations -from mlscraper.html import get_relative_depth from mlscraper.html import get_root_node from mlscraper.html import HTMLAttributeMatch from mlscraper.html import HTMLExactTextMatch @@ -40,10 +39,10 @@ class Match: or other_match.root.has_parent(self.root) ) - @cached_property + @property def depth(self): # depth of root compared to document - return get_relative_depth(self.root, self.root.root) + return self.root.depth class Extractor: @@ -159,7 +158,8 @@ def generate_all_value_matches( yield ValueMatch(matched_node, extractor) else: logging.warning( - f"Cannot deal with HTMLMatch type, ignoring ({html_match=}, {type(html_match)=}))" + "Cannot deal with HTMLMatch type, ignoring " + f"({html_match=}, {type(html_match)=}))" ) diff --git a/tests/test_html.py b/tests/test_html.py index e976ed5..7780504 100644 --- a/tests/test_html.py +++ b/tests/test_html.py @@ -1,20 +1,25 @@ -from bs4 import BeautifulSoup -from mlscraper.html import _get_root_of_nodes +from mlscraper.html import get_root_node from mlscraper.html import HTMLExactTextMatch from mlscraper.html import Page from mlscraper.html import selector_matches_nodes from mlscraper.matches import AttributeValueExtractor -def test_get_root_of_nodes(): - soup = BeautifulSoup( - b'<html><body><div><p id="one"></p><p><span id="two"></span></p></div></body></html>', - "lxml", - ) - node_1 = soup.select_one("#one") - node_2 = soup.select_one("#two") - root = _get_root_of_nodes([node_1, node_2]) - assert root == soup.select_one("div") +def test_get_root_nodes(): + html = b'<html><body><div><p id="one"></p><p><span id="two"></span></p></div></body></html>' + page = Page(html) + node_1 = page.select("#one")[0] + node_2 = page.select("#two")[0] + root = get_root_node([node_1, node_2]) + assert root == page.select("div")[0] + + +def test_node_set(): + html = b"<html><body><p>test</p></body></html>" + page = Page(html) + node_1 = page.select("p")[0] + node_2 = node_1.parent.select("p")[0] + assert node_1.parent == node_2.parent class TestPage: |