mirror of
https://github.com/hoshikawa2/mdm_project.git
synced 2026-03-03 16:09:37 +00:00
24 lines
720 B
Python
24 lines
720 B
Python
from rapidfuzz import fuzz
|
|
|
|
def _sim(a: str, b: str) -> float:
|
|
if not a or not b: return 0.0
|
|
return fuzz.token_set_ratio(a, b) / 100.0
|
|
|
|
def _pairs(n: int):
|
|
for i in range(n):
|
|
for j in range(i+1, n):
|
|
yield i, j
|
|
|
|
def dedupe_candidates(rows, threshold=0.87):
    """Find pairs of rows that look like duplicates of each other.

    Every pair of rows is scored as the mean of three parts:

    * similarity of the ``"name"`` values,
    * the better of the ``"email"`` and ``"phone"`` similarities,
    * similarity of the ``"address"`` values.

    A key that is absent from a row is compared as ``""``, which ``_sim``
    scores as 0.0; the total is still divided by 3, so missing fields pull
    the score down rather than being skipped.

    Args:
        rows: Indexable collection (``len()`` and ``rows[i]`` are used) of
            mappings that provide ``.get``.
        threshold: Minimum mean score for a pair to be reported.

    Returns:
        A list of ``{"i": ..., "j": ..., "score": ...}`` dicts, one per pair
        at or above the threshold, where ``i < j`` are positions in *rows*
        and ``score`` is rounded to three decimals.
    """

    def field_score(left, right, key):
        # Similarity of a single field between the two rows.
        return _sim(left.get(key, ""), right.get(key, ""))

    matches = []
    for i, j in _pairs(len(rows)):
        left, right = rows[i], rows[j]

        name_score = field_score(left, right, "name")
        # Either contact channel agreeing is enough, so keep the stronger one.
        contact_score = max(
            field_score(left, right, "email"),
            field_score(left, right, "phone"),
        )
        address_score = field_score(left, right, "address")

        score = (name_score + contact_score + address_score) / 3.0
        # The unrounded score is what gets compared against the threshold;
        # rounding only affects the reported value.
        if score >= threshold:
            matches.append({"i": i, "j": j, "score": round(score, 3)})
    return matches
|