Improve performance by fixing hashing and root computation

author: Karl Lorey <git@karllorey.com> 2022-06-23 21:53:23 +0200
committer: Karl Lorey <git@karllorey.com> 2022-06-23 21:53:23 +0200
commit: a2f481c3481f6445e520e6bbdfafae3bbf94f96b (patch)
tree: ea6b44ed7ac418af1a9f11e6891319288a5a475a
parent: 789e635aabd126e934af7dbf0b2769bef28d9683 (diff)
3 files changed, 51 insertions, 52 deletions
diff --git a/mlscraper/html.py b/mlscraper/html.py
index 807a35e..c0eede3 100644
--- a/mlscraper/html.py
+++ b/mlscraper/html.py
@@ -11,7 +11,6 @@ from functools import cached_property
 
 from bs4 import BeautifulSoup
 from bs4 import NavigableString
-from bs4 import Tag
 
 
 @dataclass
@@ -35,13 +34,22 @@ class Node:
 
     def __init__(self, soup, page: "Page"):
         self.soup = soup
+        self._hash = soup.__hash__()
         self._page = page
 
     @property
     def root(self):
         return self._page
 
-    @property
+    @cached_property
+    def depth(self):
+        return self.parent.depth + 1  # one level below the parent; Page.depth is 0
+
+    @cached_property
+    def parent(self):
+        return self._page._get_node_for_soup(self.soup.parent)
+
+    @cached_property
     def text(self):
         return self.soup.text
 
@@ -91,6 +99,10 @@ class Node:
     def tag_name(self):
         return self.soup.name
 
+    @property
+    def html_attributes(self):
+        return self.soup.attrs
+
     def select(self, css_selector):
         return [
             self._page._get_node_for_soup(n) for n in self.soup.select(css_selector)
@@ -106,7 +118,8 @@ class Node:
         )
 
     def __hash__(self):
-        return self.soup.__hash__()
+        return self._hash
+        # return self.soup.__hash__()
         # return super().__hash__()
 
     def __eq__(self, other):
@@ -129,6 +142,10 @@ class Page(Node):
 
         super().__init__(soup, self)
 
+    @property
+    def depth(self):
+        return 0
+
     def _get_node_for_soup(self, soup) -> Node:
         if soup not in self._node_registry:
             self._node_registry[soup] = Node(soup, self)
@@ -138,41 +155,18 @@ class Page(Node):
 def get_root_node(nodes: list[Node]) -> Node:
     pages = [n._page for n in nodes]
     assert len(set(pages)) == 1, "different pages found, cannot get a root"
-    root = _get_root_of_nodes(n.soup for n in nodes)
-    return pages[0]._get_node_for_soup(root)
-
-
-def _get_root_of_nodes(soups):
-    soups = list(soups)
-    assert all(isinstance(n, Tag) for n in soups)
-
-    # root can be node itself, so it has to be added
-    parent_paths_of_nodes = [[node] + list(node.parents) for node in soups]
 
-    # paths are needed from top to bottom
-    parent_paths_rev = [list(reversed(pp)) for pp in parent_paths_of_nodes]
-    try:
-        ancestor = _get_root_of_paths(parent_paths_rev)
-    except RuntimeError as e:
-        raise RuntimeError(f"No common ancestor: {soups}") from e
-    return ancestor
-
-
-def _get_root_of_paths(paths):
-    """
-    Computes the first common ancestor for list of paths.
-    :param paths: list of list of nodes from top to bottom
-    :return: first common index or RuntimeError
-    """
-    assert paths
-    assert all(p for p in paths)
-
-    # go through paths one by one, starting from bottom
-    for nodes in reversed(list(zip(*paths))):
-        if len(set(nodes)) == 1:
-            return nodes[0]
-    logging.info("failed to find ancestor for : %s", paths)
-    raise RuntimeError("No common ancestor")
+    # generate parent paths ordered from top (root) to bottom (elem):
+    # reversed([elem, parent, ancestor, root]) -> [root, ancestor, parent, elem]
+    parent_paths = [reversed([n] + n.parents) for n in nodes]
+
+    # walk the layers from the deepest shared level up to the top
+    # zip stops at the shortest path, so every layer holds one entry per node
+    # -> the first layer whose entries are all equal is the common root
+    for layer_nodes in reversed(list(zip(*parent_paths))):
+        if len(set(layer_nodes)) == 1:
+            return layer_nodes[0]
+    raise RuntimeError("no root found")
 
 
 def get_relative_depth(node: Node, root: Node):
diff --git a/mlscraper/matches.py b/mlscraper/matches.py
index c5145c1..59a2318 100644
--- a/mlscraper/matches.py
+++ b/mlscraper/matches.py
@@ -6,7 +6,6 @@ import typing
 from functools import cached_property
 from itertools import combinations
 
-from mlscraper.html import get_relative_depth
 from mlscraper.html import get_root_node
 from mlscraper.html import HTMLAttributeMatch
 from mlscraper.html import HTMLExactTextMatch
@@ -40,10 +39,10 @@ class Match:
             or other_match.root.has_parent(self.root)
         )
 
-    @cached_property
+    @property
     def depth(self):
         # depth of root compared to document
-        return get_relative_depth(self.root, self.root.root)
+        return self.root.depth
 
 
 class Extractor:
@@ -159,7 +158,8 @@ def generate_all_value_matches(
             yield ValueMatch(matched_node, extractor)
         else:
             logging.warning(
-                f"Cannot deal with HTMLMatch type, ignoring ({html_match=}, {type(html_match)=}))"
+                "Cannot deal with HTMLMatch type, ignoring "
+                f"({html_match=}, {type(html_match)=}))"
             )
 
 
diff --git a/tests/test_html.py b/tests/test_html.py
index e976ed5..7780504 100644
--- a/tests/test_html.py
+++ b/tests/test_html.py
@@ -1,20 +1,25 @@
-from bs4 import BeautifulSoup
-from mlscraper.html import _get_root_of_nodes
+from mlscraper.html import get_root_node
 from mlscraper.html import HTMLExactTextMatch
 from mlscraper.html import Page
 from mlscraper.html import selector_matches_nodes
 from mlscraper.matches import AttributeValueExtractor
 
 
-def test_get_root_of_nodes():
-    soup = BeautifulSoup(
-        b'<html><body><div><p id="one"></p><p><span id="two"></span></p></div></body></html>',
-        "lxml",
-    )
-    node_1 = soup.select_one("#one")
-    node_2 = soup.select_one("#two")
-    root = _get_root_of_nodes([node_1, node_2])
-    assert root == soup.select_one("div")
+def test_get_root_nodes():
+    html = b'<html><body><div><p id="one"></p><p><span id="two"></span></p></div></body></html>'
+    page = Page(html)
+    node_1 = page.select("#one")[0]
+    node_2 = page.select("#two")[0]
+    root = get_root_node([node_1, node_2])
+    assert root == page.select("div")[0]
+
+
+def test_node_set():
+    html = b"<html><body><p>test</p></body></html>"
+    page = Page(html)
+    node_1 = page.select("p")[0]
+    node_2 = node_1.parent.select("p")[0]
+    assert node_1.parent == node_2.parent
 
 
 class TestPage:
author	Karl Lorey <git@karllorey.com>	2022-06-23 21:53:23 +0200
committer	Karl Lorey <git@karllorey.com>	2022-06-23 21:53:23 +0200
commit	a2f481c3481f6445e520e6bbdfafae3bbf94f96b (patch)
tree	ea6b44ed7ac418af1a9f11e6891319288a5a475a
parent	789e635aabd126e934af7dbf0b2769bef28d9683 (diff)