summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Karl Lorey <git@karllorey.com>, 2022-07-07 13:31:23 +0200
committer: Karl Lorey <git@karllorey.com>, 2022-07-07 13:31:23 +0200
commit: 2c58adb28b1bc8750d0a434f721676b8cf83734f (patch)
tree: f4598685852dcde52752bca2c3cbc87de56bc6fe
parent: 6ae1e18a666c869986b93da9c448ab73ec5396d1 (diff)
Limit recursive depth of get_similarity
-rw-r--r-- mlscraper/html.py | 12
1 files changed, 10 insertions, 2 deletions
diff --git a/mlscraper/html.py b/mlscraper/html.py
index 7af733c..09d217e 100644
--- a/mlscraper/html.py
+++ b/mlscraper/html.py
@@ -249,15 +249,23 @@ def is_supported_class(cl):
return all(c not in cl for c in CLASS_CHAR_BLACKLIST)
-def get_similarity(node1: Node, node2: Node) -> float:
+def get_similarity(node1: Node, node2: Node, depth=3) -> float:
+ if depth < 1:
+ return 0
+
if node1.tag_name != node2.tag_name:
return 0
+ # compute nodes jaccard similarity
jaccard_top = len(set(node1.classes).intersection(node2.classes))
jaccard_bottom = len(set(node1.classes).union(node2.classes))
if jaccard_top == jaccard_bottom:
return 1 # also 0/0
jaccard = jaccard_top / jaccard_bottom
+
+ # add recursion
if node1.parent and node2.parent:
- jaccard = 0.75 * jaccard + 0.25 * get_similarity(node1.parent, node2.parent)
+ jaccard = 0.8 * jaccard + 0.2 * get_similarity(
+ node1.parent, node2.parent, depth=depth - 1
+ )
return jaccard