diff options
author | Karl Lorey <git@karllorey.com> | 2022-07-07 13:31:23 +0200 |
---|---|---|
committer | Karl Lorey <git@karllorey.com> | 2022-07-07 13:31:23 +0200 |
commit | 2c58adb28b1bc8750d0a434f721676b8cf83734f (patch) | |
tree | f4598685852dcde52752bca2c3cbc87de56bc6fe | |
parent | 6ae1e18a666c869986b93da9c448ab73ec5396d1 (diff) |
Limit recursive depth of get_similarity
-rw-r--r-- | mlscraper/html.py | 12 |
1 files changed, 10 insertions, 2 deletions
diff --git a/mlscraper/html.py b/mlscraper/html.py
index 7af733c..09d217e 100644
--- a/mlscraper/html.py
+++ b/mlscraper/html.py
@@ -249,15 +249,23 @@ def is_supported_class(cl):
     return all(c not in cl for c in CLASS_CHAR_BLACKLIST)
 
 
-def get_similarity(node1: Node, node2: Node) -> float:
+def get_similarity(node1: Node, node2: Node, depth=3) -> float:
+    if depth < 1:
+        return 0
+
     if node1.tag_name != node2.tag_name:
         return 0
 
+    # compute nodes jaccard similarity
     jaccard_top = len(set(node1.classes).intersection(node2.classes))
     jaccard_bottom = len(set(node1.classes).union(node2.classes))
     if jaccard_top == jaccard_bottom:
         return 1  # also 0/0
     jaccard = jaccard_top / jaccard_bottom
+
+    # add recursion
     if node1.parent and node2.parent:
-        jaccard = 0.75 * jaccard + 0.25 * get_similarity(node1.parent, node2.parent)
+        jaccard = 0.8 * jaccard + 0.2 * get_similarity(
+            node1.parent, node2.parent, depth=depth - 1
+        )
     return jaccard