Skip to content

Commit 88bcf3f

Browse files
Create DuplicateDetectionService.php
1 parent 188b5fa commit 88bcf3f

File tree

1 file changed

+175
-0
lines changed

1 file changed

+175
-0
lines changed
Lines changed: 175 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,175 @@
1+
<?php
2+
3+
namespace App\Services;
4+
5+
use App\Models\Person;
6+
use App\Models\DuplicateMatch;
7+
use Illuminate\Support\Str;
8+
use Illuminate\Support\Collection;
9+
10+
class DuplicateDetectionService
{
    /**
     * Scan all persons, score likely duplicates and persist them as DuplicateMatch rows.
     *
     * Candidates for each person come from three passes:
     *  - exact e-mail match (case-insensitive, surrounding whitespace ignored), score 0.95;
     *  - exact phone match (digits only), score 0.93;
     *  - a pairwise fuzzy pass using computeScore(), kept only when the score
     *    reaches $threshold.
     *
     * Exact e-mail/phone matches are kept regardless of $threshold; the threshold
     * only filters the fuzzy pass.
     *
     * Each unordered pair is stored once, with the smaller id as primary_person_id.
     * An existing row is only rewritten when the new score is strictly higher.
     * The whole persons table is loaded in memory and compared pairwise (O(n^2)),
     * so this is intended for small/medium datasets.
     *
     * @param float $threshold      minimal confidence for fuzzy candidates (0.0 - 1.0)
     * @param int   $limitPerPerson maximum candidates kept per person; values below 1 yield an empty result
     * @return Collection DuplicateMatch[] one entry per distinct pair that was suggested
     */
    public function scan(float $threshold = 0.7, int $limitPerPerson = 10): Collection
    {
        $matches = collect();

        // A non-positive limit can never keep a candidate (and a negative value would make
        // take() return the *lowest* scores), so bail out before doing any work.
        if ($limitPerPerson < 1) {
            return $matches;
        }

        $persons = Person::select(['id', 'givn', 'surn', 'name', 'email', 'phone', 'birthday'])->get();

        // Index persons by normalised e-mail and phone for cheap exact matches.
        // Values that normalise to '' (e.g. a phone of "N/A") are left out so that
        // unrelated persons are not grouped together under an empty key.
        $emailIndex = $persons
            ->filter(fn ($p) => $this->normalizeEmail($p->email) !== '')
            ->groupBy(fn ($p) => $this->normalizeEmail($p->email));
        $phoneIndex = $persons
            ->filter(fn ($p) => $this->normalizePhone($p->phone) !== '')
            ->groupBy(fn ($p) => $this->normalizePhone($p->phone));

        // Canonical pair keys already handled in this run; prevents processing
        // (A,B) a second time when the outer loop reaches B.
        $seenPairs = [];

        foreach ($persons as $primary) {
            // Candidates keyed by the other person's id: [Person, score, meta].
            // Keying by id guarantees one entry per person and O(1) membership checks.
            $candidates = [];

            // exact email matches
            $email = $this->normalizeEmail($primary->email);
            if ($email !== '') {
                foreach ($emailIndex->get($email, []) as $p) {
                    if ($p->id === $primary->id) {
                        continue;
                    }
                    $candidates[$p->id] = [$p, 0.95, ['reason' => 'email_exact']];
                }
            }

            // exact phone matches (an e-mail match for the same person wins: it scores higher)
            $phone = $this->normalizePhone($primary->phone);
            if ($phone !== '') {
                foreach ($phoneIndex->get($phone, []) as $p) {
                    if ($p->id === $primary->id || isset($candidates[$p->id])) {
                        continue;
                    }
                    $candidates[$p->id] = [$p, 0.93, ['reason' => 'phone_exact']];
                }
            }

            // naive pass comparing birthdays and name similarity
            foreach ($persons as $other) {
                // skip self and anyone already added by an exact match
                if ($other->id === $primary->id || isset($candidates[$other->id])) {
                    continue;
                }

                $score = $this->computeScore($primary, $other);
                if ($score >= $threshold) {
                    $candidates[$other->id] = [$other, $score, ['reason' => 'fuzzy_name']];
                }
            }

            // keep top N per person
            $top = collect($candidates)->sortByDesc(fn ($t) => $t[1])->take($limitPerPerson);

            foreach ($top as [$other, $score, $meta]) {
                // do not create self-pairs
                if ($primary->id === $other->id) {
                    continue;
                }

                // Canonical ordering (smaller id first) so (A,B) and (B,A) map to the same row.
                [$first, $second] = $primary->id > $other->id
                    ? [$other, $primary]
                    : [$primary, $other];

                $pairKey = $first->id . ':' . $second->id;
                if (isset($seenPairs[$pairKey])) {
                    continue;
                }
                $seenPairs[$pairKey] = true;

                // create or update
                $record = DuplicateMatch::firstOrNew([
                    'primary_person_id' => $first->id,
                    'duplicate_person_id' => $second->id,
                ]);

                // If new or confidence improved, store
                $previous = $record->exists ? (float) $record->confidence_score : 0.0;
                if (!$record->exists || $score > $previous) {
                    $record->confidence_score = $score;
                    // NOTE(review): assumes match_data is cast to an array on the model — confirm.
                    $record->match_data = array_merge($record->match_data ?? [], [
                        'last_scanned_at' => now()->toDateTimeString(),
                        'reasons' => $meta,
                        // Snapshots follow the canonical ordering so they always describe
                        // primary_person_id / duplicate_person_id respectively.
                        'primary' => $this->describePerson($first),
                        'candidate' => $this->describePerson($second),
                    ]);
                    $record->status = $record->status ?? 'pending';
                    $record->save();
                }

                $matches->push($record);
            }
        }

        return $matches;
    }

    /**
     * Compute a similarity score between two person records (0..1).
     *
     * An exact e-mail match sets the score to at least 0.95 and an exact phone match to
     * at least 0.93. An equal birthday adds 0.25, name similarity (normalised
     * Levenshtein) adds up to 0.5, and an equal soundex code adds 0.1. The sum is
     * capped at 1.0.
     *
     * @param Person $a first record
     * @param Person $b second record
     * @return float confidence in the range 0.0 - 1.0
     */
    protected function computeScore(Person $a, Person $b): float
    {
        $score = 0.0;

        // email exact (very strong)
        $emailA = $this->normalizeEmail($a->email);
        if ($emailA !== '' && $emailA === $this->normalizeEmail($b->email)) {
            $score = max($score, 0.95);
        }

        // phone exact
        $phoneA = $this->normalizePhone($a->phone);
        if ($phoneA !== '' && $phoneA === $this->normalizePhone($b->phone)) {
            $score = max($score, 0.93);
        }

        // birthday match
        // Loose comparison is deliberate: if birthday is cast to a date object on the
        // model, === would compare identity instead of value. TODO confirm the cast.
        if ($a->birthday && $b->birthday && $a->birthday == $b->birthday) {
            $score += 0.25;
        }

        // name similarity using normalized levenshtein and soundex
        $nameA = $this->normalizeName($this->fullName($a));
        $nameB = $this->normalizeName($this->fullName($b));

        if ($nameA !== '' && $nameB !== '') {
            $distance = levenshtein($nameA, $nameB);
            $longest = max(strlen($nameA), strlen($nameB), 1);
            $similarity = 1 - ($distance / $longest); // 0..1
            $score += $similarity * 0.5; // name contributes up to 0.5

            // soundex boost
            if (soundex($nameA) === soundex($nameB)) {
                $score += 0.1;
            }
        }

        // clamp 0..1
        return min(1.0, (float) $score);
    }

    /**
     * Reduce a name to a comparable form: transliterated to ASCII, lower-cased,
     * punctuation removed and runs of whitespace collapsed to a single space.
     *
     * Transliterating first keeps accented letters ("José" -> "jose") instead of
     * dropping them, and whitespace is preserved until the collapse step so that
     * tab/newline separated words are not glued together.
     *
     * @param string|null $s raw name
     * @return string normalised name, or '' when nothing usable remains
     */
    protected function normalizeName(?string $s): string
    {
        if ($s === null || $s === '') {
            return '';
        }

        $s = Str::lower(Str::ascii($s));
        $s = preg_replace('/[^a-z0-9\s]+/', '', $s) ?? '';
        $s = preg_replace('/\s+/', ' ', trim($s)) ?? '';

        return $s;
    }

    /**
     * Lower-case and trim an e-mail address for comparison; '' when absent.
     */
    private function normalizeEmail($email): string
    {
        $email = trim((string) $email);

        return $email === '' ? '' : Str::lower($email);
    }

    /**
     * Strip everything except digits from a phone number; '' when no usable digits remain.
     *
     * A lone "0" is treated as "no phone", matching the truthiness checks this
     * service has always applied to phone values.
     */
    private function normalizePhone($phone): string
    {
        $digits = preg_replace('/\D+/', '', (string) $phone) ?? '';

        return ($digits === '' || $digits === '0') ? '' : $digits;
    }

    /**
     * Display name for a person: the name attribute, or "givn surn" when name is null or blank.
     */
    private function fullName(Person $person): string
    {
        $name = trim((string) $person->name);
        if ($name !== '') {
            return $name;
        }

        return trim((string) $person->givn . ' ' . (string) $person->surn);
    }

    /**
     * Snapshot of the person fields stored inside DuplicateMatch::match_data.
     */
    private function describePerson(Person $person): array
    {
        return [
            'id' => $person->id,
            'name' => $this->fullName($person),
            'email' => $person->email,
            'phone' => $person->phone,
            'birthday' => $person->birthday,
        ];
    }
}

0 commit comments

Comments
 (0)