mirror of
https://github.com/hoshikawa2/mdm_project.git
synced 2026-03-06 18:21:04 +00:00
first commit
This commit is contained in:
23
services/dedupe_service.py
Normal file
23
services/dedupe_service.py
Normal file
@@ -0,0 +1,23 @@
|
||||
from rapidfuzz import fuzz
|
||||
|
||||
def _sim(a: str, b: str) -> float:
|
||||
if not a or not b: return 0.0
|
||||
return fuzz.token_set_ratio(a, b) / 100.0
|
||||
|
||||
def _pairs(n: int):
|
||||
for i in range(n):
|
||||
for j in range(i+1, n):
|
||||
yield i, j
|
||||
|
||||
def dedupe_candidates(rows, threshold=0.87):
    """Return likely-duplicate row pairs with their similarity score.

    Every pair of rows is scored as the mean of three components:

    * similarity of the ``"name"`` values,
    * the larger of the ``"email"`` and ``"phone"`` similarities,
    * similarity of the ``"address"`` values.

    Missing keys are read as ``""``. Pairs whose score is at least
    *threshold* are reported.

    Args:
        rows: Indexable collection of mappings supporting ``.get``.
        threshold: Minimum score for a pair to be included.

    Returns:
        A list of ``{"i": ..., "j": ..., "score": ...}`` dicts, where ``i``
        and ``j`` are positions in *rows* (``i < j``) and ``score`` is
        rounded to three decimal places.
    """

    def field_score(left, right, key):
        # Compare one field of two rows, treating an absent key as "".
        return _sim(left.get(key, ""), right.get(key, ""))

    matches = []
    for i, j in _pairs(len(rows)):
        left, right = rows[i], rows[j]

        name_score = field_score(left, right, "name")
        # Either contact channel matching is enough, so keep the better one.
        contact_score = max(
            field_score(left, right, "email"),
            field_score(left, right, "phone"),
        )
        address_score = field_score(left, right, "address")

        score = (name_score + contact_score + address_score) / 3.0
        # The comparison uses the unrounded score; rounding is output-only.
        if score >= threshold:
            matches.append({"i": i, "j": j, "score": round(score, 3)})
    return matches
|
||||
Reference in New Issue
Block a user