|
| 1 | +<?php |
| 2 | + |
| 3 | +namespace App\Services; |
| 4 | + |
| 5 | +use App\Models\Person; |
| 6 | +use App\Models\DuplicateMatch; |
| 7 | +use Illuminate\Support\Str; |
| 8 | +use Illuminate\Support\Collection; |
| 9 | + |
class DuplicateDetectionService
{
    /** Confidence assigned when two persons share the same normalized email. */
    private const EMAIL_EXACT_SCORE = 0.95;

    /** Confidence assigned when two persons share the same normalized phone number. */
    private const PHONE_EXACT_SCORE = 0.93;

    /**
     * Scan all persons, persist suggested duplicate pairs and return them.
     *
     * Every person is compared against every other person (O(n^2)), so this is
     * only suitable for small/medium datasets. Each unordered pair is persisted
     * at most once per scan, with the lower id stored as `primary_person_id`.
     *
     * @param float $threshold      minimal confidence to persist (0.0 - 1.0); applies to
     *                              exact email/phone matches as well as fuzzy matches
     * @param int   $limitPerPerson maximum candidates per person (values below 0 are treated as 0)
     * @return Collection<int, DuplicateMatch> one record per distinct pair found in this scan
     */
    public function scan(float $threshold = 0.7, int $limitPerPerson = 10): Collection
    {
        $created = collect();

        $persons = Person::select(['id', 'givn', 'surn', 'name', 'email', 'phone', 'birthday'])->get();

        // Index persons by normalized email and phone for cheap exact matches.
        // Values that normalize to an empty string (e.g. a phone of "n/a") are
        // skipped, otherwise all such persons would be paired with each other.
        $emailIndex = [];
        $phoneIndex = [];
        foreach ($persons as $person) {
            $email = $this->normalizeEmail($person->email);
            if ($email !== '') {
                $emailIndex[$email][] = $person;
            }

            $phone = $this->normalizePhone($person->phone);
            if ($phone !== '') {
                $phoneIndex[$phone][] = $person;
            }
        }

        // Pairs already persisted during this scan, keyed "lowId:highId".
        $processedPairs = [];
        $limit = max(0, $limitPerPerson);

        foreach ($persons as $primary) {
            $candidates = $this->collectCandidates($primary, $persons, $emailIndex, $phoneIndex, $threshold);

            // keep top N per person
            $top = collect($candidates)
                ->sortByDesc(static fn (array $candidate) => $candidate[1])
                ->take($limit);

            foreach ($top as [$other, $score, $meta]) {
                // Canonical ordering (smaller id first) so (A,B) and (B,A) map to the same row.
                [$first, $second] = $primary->id < $other->id
                    ? [$primary, $other]
                    : [$other, $primary];

                $pairKey = $first->id . ':' . $second->id;
                if (isset($processedPairs[$pairKey])) {
                    continue;
                }
                $processedPairs[$pairKey] = true;

                $created->push($this->persistMatch($first, $second, $score, $meta));
            }
        }

        return $created;
    }

    /**
     * Build the candidate list for one person, keyed by candidate id.
     *
     * Exact email matches are added first, then exact phone matches, then the
     * fuzzy score from computeScore(); a candidate already present is never
     * added again, so each person appears at most once.
     *
     * @param array<int|string, array<int, Person>> $emailIndex persons grouped by normalized email
     * @param array<int|string, array<int, Person>> $phoneIndex persons grouped by normalized phone
     * @return array<int|string, array{0: Person, 1: float, 2: array<string, string>}>
     */
    private function collectCandidates(
        Person $primary,
        Collection $persons,
        array $emailIndex,
        array $phoneIndex,
        float $threshold
    ): array {
        $candidates = [];

        // Empty normalized values are never indexed, so they resolve to no matches here.
        $exactSources = [
            [$emailIndex[$this->normalizeEmail($primary->email)] ?? [], self::EMAIL_EXACT_SCORE, 'email_exact'],
            [$phoneIndex[$this->normalizePhone($primary->phone)] ?? [], self::PHONE_EXACT_SCORE, 'phone_exact'],
        ];

        foreach ($exactSources as [$matches, $score, $reason]) {
            if ($score < $threshold) {
                // Leave these persons to the fuzzy pass, which may still lift them over the threshold.
                continue;
            }

            foreach ($matches as $match) {
                if ($match->id === $primary->id || isset($candidates[$match->id])) {
                    continue;
                }
                $candidates[$match->id] = [$match, $score, ['reason' => $reason]];
            }
        }

        // naive pass comparing birthdays and name similarity
        foreach ($persons as $other) {
            if ($other->id === $primary->id || isset($candidates[$other->id])) {
                continue;
            }

            $score = $this->computeScore($primary, $other);
            if ($score >= $threshold) {
                $candidates[$other->id] = [$other, $score, ['reason' => 'fuzzy_name']];
            }
        }

        return $candidates;
    }

    /**
     * Create the DuplicateMatch row for a canonical pair, or raise its score.
     *
     * An existing row is only rewritten when the new score is strictly higher
     * than the stored one; its status is left untouched.
     *
     * @param Person               $primary   person with the smaller id
     * @param Person               $duplicate person with the larger id
     * @param array<string, mixed> $meta      stored under match_data['reasons']
     */
    private function persistMatch(Person $primary, Person $duplicate, float $score, array $meta): DuplicateMatch
    {
        $record = DuplicateMatch::firstOrNew([
            'primary_person_id' => $primary->id,
            'duplicate_person_id' => $duplicate->id,
        ]);

        if ($record->exists && $score <= (float) $record->confidence_score) {
            return $record;
        }

        $record->confidence_score = $score;
        // NOTE(review): assumes DuplicateMatch casts match_data to an array - confirm on the model.
        $record->match_data = array_merge($record->match_data ?? [], [
            'last_scanned_at' => now()->toDateTimeString(),
            'reasons' => $meta,
            'primary' => $this->snapshot($primary),
            'candidate' => $this->snapshot($duplicate),
        ]);
        $record->status = $record->status ?? 'pending';
        $record->save();

        return $record;
    }

    /**
     * Compute a similarity score between two person records (0..1).
     *
     * An exact email or phone match sets a floor of 0.95 / 0.93. On top of that
     * an equal birthday adds 0.25, name similarity adds up to 0.5 and an equal
     * soundex code adds 0.1; the total is clamped to the 0..1 range.
     */
    protected function computeScore(Person $a, Person $b): float
    {
        $score = 0.0;

        // email exact (very strong)
        $emailA = $this->normalizeEmail($a->email);
        if ($emailA !== '' && $emailA === $this->normalizeEmail($b->email)) {
            $score = max($score, self::EMAIL_EXACT_SCORE);
        }

        // phone exact
        $phoneA = $this->normalizePhone($a->phone);
        if ($phoneA !== '' && $phoneA === $this->normalizePhone($b->phone)) {
            $score = max($score, self::PHONE_EXACT_SCORE);
        }

        // birthday match; loose comparison is kept from the original code.
        // NOTE(review): presumably so equal date objects compare equal if the model casts birthday - confirm.
        if ($a->birthday && $b->birthday && $a->birthday == $b->birthday) {
            $score += 0.25;
        }

        // name similarity using normalized levenshtein and soundex
        $nameA = $this->normalizeName($this->fullName($a));
        $nameB = $this->normalizeName($this->fullName($b));

        if ($nameA !== '' && $nameB !== '') {
            $distance = levenshtein($nameA, $nameB);
            $longest = max(strlen($nameA), strlen($nameB), 1);
            $score += (1 - ($distance / $longest)) * 0.5; // name contributes up to 0.5

            // soundex boost
            if (soundex($nameA) === soundex($nameB)) {
                $score += 0.1;
            }
        }

        // clamp 0..1
        return max(0.0, min(1.0, $score));
    }

    /**
     * Reduce a name to lowercase ASCII letters, digits and single spaces.
     *
     * Accented characters are transliterated ("José" becomes "jose") rather
     * than dropped, and any run of whitespace becomes one space before other
     * characters are stripped, so tab- or newline-separated words stay separate.
     */
    protected function normalizeName(?string $s): string
    {
        if ($s === null || $s === '') {
            return '';
        }

        $s = Str::lower(Str::ascii($s));
        $s = preg_replace('/\s+/', ' ', $s) ?? '';
        $s = preg_replace('/[^a-z0-9 ]+/', '', $s) ?? '';

        // Stripping characters can leave doubled or edge spaces behind, so collapse again.
        return trim(preg_replace('/ {2,}/', ' ', $s) ?? '');
    }

    /**
     * Lowercase and trim an email for comparison; null/blank yields ''.
     */
    private function normalizeEmail(?string $email): string
    {
        return $email === null ? '' : Str::lower(trim($email));
    }

    /**
     * Keep only the digits of a phone number; null or digit-less input yields ''.
     */
    private function normalizePhone(?string $phone): string
    {
        if ($phone === null || $phone === '') {
            return '';
        }

        return preg_replace('/\D+/', '', $phone) ?? '';
    }

    /**
     * Return the person's `name`, falling back to "givn surn" when it is null or blank.
     */
    private function fullName(Person $person): string
    {
        $name = trim((string) $person->name);
        if ($name !== '') {
            return $name;
        }

        return trim($person->givn . ' ' . $person->surn);
    }

    /**
     * Capture the fields stored in match_data for one side of a pair.
     *
     * @return array<string, mixed>
     */
    private function snapshot(Person $person): array
    {
        return [
            'id' => $person->id,
            'name' => $this->fullName($person),
            'email' => $person->email,
            'phone' => $person->phone,
            'birthday' => $person->birthday,
        ];
    }
}
0 commit comments