summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Karl Lorey <git@karllorey.com> 2022-06-23 21:53:23 +0200
committer: Karl Lorey <git@karllorey.com> 2022-06-23 21:53:23 +0200
commit: a2f481c3481f6445e520e6bbdfafae3bbf94f96b (patch)
tree: ea6b44ed7ac418af1a9f11e6891319288a5a475a
parent: 789e635aabd126e934af7dbf0b2769bef28d9683 (diff)
Improve performance by fixing hashing and root computation
-rw-r--r-- mlscraper/html.py 68
-rw-r--r-- mlscraper/matches.py 8
-rw-r--r-- tests/test_html.py 27
3 files changed, 51 insertions, 52 deletions
diff --git a/mlscraper/html.py b/mlscraper/html.py
index 807a35e..c0eede3 100644
--- a/mlscraper/html.py
+++ b/mlscraper/html.py
@@ -11,7 +11,6 @@ from functools import cached_property
from bs4 import BeautifulSoup
from bs4 import NavigableString
-from bs4 import Tag
@dataclass
@@ -35,13 +34,22 @@ class Node:
def __init__(self, soup, page: "Page"):
self.soup = soup
+ self._hash = soup.__hash__()
self._page = page
@property
def root(self):
return self._page
- @property
+ @cached_property
+ def depth(self):
+ return self.parent.depth
+
+ @cached_property
+ def parent(self):
+ return self._page._get_node_for_soup(self.soup.parent)
+
+ @cached_property
def text(self):
return self.soup.text
@@ -91,6 +99,10 @@ class Node:
def tag_name(self):
return self.soup.name
+ @property
+ def html_attributes(self):
+ return self.soup.attrs
+
def select(self, css_selector):
return [
self._page._get_node_for_soup(n) for n in self.soup.select(css_selector)
@@ -106,7 +118,8 @@ class Node:
)
def __hash__(self):
- return self.soup.__hash__()
+ return self._hash
+ # return self.soup.__hash__()
# return super().__hash__()
def __eq__(self, other):
@@ -129,6 +142,10 @@ class Page(Node):
super().__init__(soup, self)
+ @property
+ def depth(self):
+ return 0
+
def _get_node_for_soup(self, soup) -> Node:
if soup not in self._node_registry:
self._node_registry[soup] = Node(soup, self)
@@ -138,41 +155,18 @@ class Page(Node):
def get_root_node(nodes: list[Node]) -> Node:
pages = [n._page for n in nodes]
assert len(set(pages)) == 1, "different pages found, cannot get a root"
- root = _get_root_of_nodes(n.soup for n in nodes)
- return pages[0]._get_node_for_soup(root)
-
-
-def _get_root_of_nodes(soups):
- soups = list(soups)
- assert all(isinstance(n, Tag) for n in soups)
-
- # root can be node itself, so it has to be added
- parent_paths_of_nodes = [[node] + list(node.parents) for node in soups]
- # paths are needed from top to bottom
- parent_paths_rev = [list(reversed(pp)) for pp in parent_paths_of_nodes]
- try:
- ancestor = _get_root_of_paths(parent_paths_rev)
- except RuntimeError as e:
- raise RuntimeError(f"No common ancestor: {soups}") from e
- return ancestor
-
-
-def _get_root_of_paths(paths):
- """
- Computes the first common ancestor for list of paths.
- :param paths: list of list of nodes from top to bottom
- :return: first common index or RuntimeError
- """
- assert paths
- assert all(p for p in paths)
-
- # go through paths one by one, starting from bottom
- for nodes in reversed(list(zip(*paths))):
- if len(set(nodes)) == 1:
- return nodes[0]
- logging.info("failed to find ancestor for : %s", paths)
- raise RuntimeError("No common ancestor")
+ # generate parent paths from top to bottom
+ # [elem, parent, ancestor, root]
+ parent_paths = [reversed([n] + n.parents) for n in nodes]
+
+ # start looping from bottom to top
+ # zip automatically uses common length
+ # -> last element is the first one, where len(nodes) roots to compare exist
+ for layer_nodes in reversed(list(zip(*parent_paths))):
+ if len(set(layer_nodes)) == 1:
+ return layer_nodes[0]
+ raise RuntimeError("no root found")
def get_relative_depth(node: Node, root: Node):
diff --git a/mlscraper/matches.py b/mlscraper/matches.py
index c5145c1..59a2318 100644
--- a/mlscraper/matches.py
+++ b/mlscraper/matches.py
@@ -6,7 +6,6 @@ import typing
from functools import cached_property
from itertools import combinations
-from mlscraper.html import get_relative_depth
from mlscraper.html import get_root_node
from mlscraper.html import HTMLAttributeMatch
from mlscraper.html import HTMLExactTextMatch
@@ -40,10 +39,10 @@ class Match:
or other_match.root.has_parent(self.root)
)
- @cached_property
+ @property
def depth(self):
# depth of root compared to document
- return get_relative_depth(self.root, self.root.root)
+ return self.root.depth
class Extractor:
@@ -159,7 +158,8 @@ def generate_all_value_matches(
yield ValueMatch(matched_node, extractor)
else:
logging.warning(
- f"Cannot deal with HTMLMatch type, ignoring ({html_match=}, {type(html_match)=}))"
+ "Cannot deal with HTMLMatch type, ignoring "
+ f"({html_match=}, {type(html_match)=}))"
)
diff --git a/tests/test_html.py b/tests/test_html.py
index e976ed5..7780504 100644
--- a/tests/test_html.py
+++ b/tests/test_html.py
@@ -1,20 +1,25 @@
-from bs4 import BeautifulSoup
-from mlscraper.html import _get_root_of_nodes
+from mlscraper.html import get_root_node
from mlscraper.html import HTMLExactTextMatch
from mlscraper.html import Page
from mlscraper.html import selector_matches_nodes
from mlscraper.matches import AttributeValueExtractor
-def test_get_root_of_nodes():
- soup = BeautifulSoup(
- b'<html><body><div><p id="one"></p><p><span id="two"></span></p></div></body></html>',
- "lxml",
- )
- node_1 = soup.select_one("#one")
- node_2 = soup.select_one("#two")
- root = _get_root_of_nodes([node_1, node_2])
- assert root == soup.select_one("div")
+def test_get_root_nodes():
+ html = b'<html><body><div><p id="one"></p><p><span id="two"></span></p></div></body></html>'
+ page = Page(html)
+ node_1 = page.select("#one")[0]
+ node_2 = page.select("#two")[0]
+ root = get_root_node([node_1, node_2])
+ assert root == page.select("div")[0]
+
+
+def test_node_set():
+ html = b"<html><body><p>test</p></body></html>"
+ page = Page(html)
+ node_1 = page.select("p")[0]
+ node_2 = node_1.parent.select("p")[0]
+ assert node_1.parent == node_2.parent
class TestPage: