Files
mdm_project/services/dedupe_service.py
Oracle Public Cloud User b51b2f5e1e first commit
2025-09-02 23:15:17 +00:00

24 lines
720 B
Python

from itertools import combinations

from rapidfuzz import fuzz
def _sim(a: str, b: str) -> float:
    """Return a similarity score between 0.0 and 1.0 for two strings.

    The score is rapidfuzz's ``token_set_ratio`` (a 0-100 value) scaled
    down to the 0-1 range. Both inputs are stripped and case-folded first
    so that differences in letter case or surrounding whitespace do not
    lower the score.

    Args:
        a: First value to compare; expected to be a string.
        b: Second value to compare; expected to be a string.

    Returns:
        0.0 when either value is missing (``None``, empty, or only
        whitespace); otherwise the scaled token-set ratio.
    """
    if not a or not b:
        return 0.0

    # token_set_ratio compares the strings exactly as given unless a
    # processor is supplied, so normalise here to keep records such as
    # "John@X.com" and "john@x.com" from being scored as different.
    left = a.strip().casefold()
    right = b.strip().casefold()
    if not left or not right:
        return 0.0

    return fuzz.token_set_ratio(left, right) / 100.0
def _pairs(n: int):
    """Yield every index pair ``(i, j)`` with ``0 <= i < j < n``.

    Pairs are produced in lexicographic order: ``(0, 1), (0, 2), ...,
    (1, 2), ...``. Nothing is yielded when ``n`` is less than 2.

    Args:
        n: Number of items being indexed.

    Yields:
        Tuples ``(i, j)`` of distinct indices, each unordered pair once.
    """
    # combinations() emits the same tuples, in the same order, as a nested
    # "for i / for j in range(i + 1, n)" loop would.
    yield from combinations(range(n), 2)
def dedupe_candidates(rows, threshold=0.87):
    """Find pairs of rows that look like duplicates of each other.

    Every index pair produced by ``_pairs(len(rows))`` is scored as the
    mean of three components, each computed with ``_sim``:

    * similarity of the ``"name"`` values,
    * the higher of the ``"email"`` and ``"phone"`` similarities,
    * similarity of the ``"address"`` values.

    A key that is absent from a row is looked up as an empty string.

    Args:
        rows: Indexable, sized collection of records exposing ``.get``
            (for example a list of dicts).
        threshold: Minimum blended score for a pair to be reported.

    Returns:
        A list of dicts ``{"i": ..., "j": ..., "score": ...}`` where
        ``i`` and ``j`` are the positions of the two rows and ``score``
        is the blended score rounded to three decimal places.
    """
    matches = []
    for left_idx, right_idx in _pairs(len(rows)):
        left = rows[left_idx]
        right = rows[right_idx]

        name_score = _sim(left.get("name", ""), right.get("name", ""))
        email_score = _sim(left.get("email", ""), right.get("email", ""))
        phone_score = _sim(left.get("phone", ""), right.get("phone", ""))
        address_score = _sim(left.get("address", ""), right.get("address", ""))

        # Email and phone count as one "contact" signal: whichever of the
        # two agrees better is the one that contributes to the average.
        contact_score = max(email_score, phone_score)
        blended = (name_score + contact_score + address_score) / 3.0

        if blended >= threshold:
            matches.append(
                {"i": left_idx, "j": right_idx, "score": round(blended, 3)}
            )
    return matches