Skip to content

Commit 6fa8cb2

Browse files
authored
fix: improve member deduplication by parsing GitHub noreply emails (CM-958) (#3929)
Signed-off-by: Yeganathan S <63534555+skwowet@users.noreply.github.com>
1 parent 0db3443 commit 6fa8cb2

File tree

4 files changed

+198
-2
lines changed

4 files changed

+198
-2
lines changed

services/apps/data_sink_worker/src/service/activity.service.ts

Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ import {
1010
escapeNullByte,
1111
generateUUIDv1,
1212
isValidEmail,
13+
parseGitHubNoreplyEmail,
1314
single,
1415
singleOrDefault,
1516
trimUtf8ToMaxByteLength,
@@ -805,6 +806,92 @@ export default class ActivityService extends LoggerBase {
805806
)
806807
}
807808

809+
// Look up members by parsing noreply emails to extract platform usernames
810+
// e.g. "123+john@users.noreply.github.com" -> GitHub username "john"
811+
const noreplyEmailFilterMap = new Map<
812+
string,
813+
{ platform: PlatformType; username: string; segmentId: string }
814+
>()
815+
816+
for (const payload of payloadsNotInDb.filter((p) => !p.dbMember)) {
817+
for (const identity of payload.activity.member.identities.filter(
818+
(i) => i.verified && i.type === MemberIdentityType.EMAIL,
819+
)) {
820+
const ghUsername = parseGitHubNoreplyEmail(identity.value)
821+
if (ghUsername) {
822+
const key = `${PlatformType.GITHUB}:${ghUsername}:${payload.segmentId}`
823+
if (!noreplyEmailFilterMap.has(key)) {
824+
noreplyEmailFilterMap.set(key, {
825+
platform: PlatformType.GITHUB,
826+
username: ghUsername,
827+
segmentId: payload.segmentId,
828+
})
829+
}
830+
}
831+
}
832+
}
833+
834+
for (const payload of payloadsNotInDb.filter(
835+
(p) => !p.dbObjectMember && p.activity.objectMember,
836+
)) {
837+
for (const identity of payload.activity.objectMember.identities.filter(
838+
(i) => i.verified && i.type === MemberIdentityType.EMAIL,
839+
)) {
840+
const ghUsername = parseGitHubNoreplyEmail(identity.value)
841+
if (ghUsername) {
842+
const key = `${PlatformType.GITHUB}:${ghUsername}:${payload.segmentId}`
843+
if (!noreplyEmailFilterMap.has(key)) {
844+
noreplyEmailFilterMap.set(key, {
845+
platform: PlatformType.GITHUB,
846+
username: ghUsername,
847+
segmentId: payload.segmentId,
848+
})
849+
}
850+
}
851+
}
852+
}
853+
854+
if (noreplyEmailFilterMap.size > 0) {
855+
const dbMembersByNoreplyEmail = await logExecutionTimeV2(
856+
async () =>
857+
findMembersByVerifiedUsernames(this.pgQx, Array.from(noreplyEmailFilterMap.values())),
858+
this.log,
859+
'processActivities -> memberRepo.findMembersByVerifiedUsernames (noreply-email)',
860+
)
861+
862+
mapResultsToPayloads(
863+
dbMembersByNoreplyEmail,
864+
(p, value) =>
865+
!p.dbMember &&
866+
p.activity.member.identities.some(
867+
(i) =>
868+
i.verified &&
869+
i.type === MemberIdentityType.EMAIL &&
870+
parseGitHubNoreplyEmail(i.value) === value.toLowerCase(),
871+
),
872+
(p, member) => {
873+
p.dbMember = member
874+
p.dbMemberSource = 'email'
875+
},
876+
)
877+
878+
mapResultsToPayloads(
879+
dbMembersByNoreplyEmail,
880+
(p, value) =>
881+
!p.dbObjectMember &&
882+
p.activity.objectMember?.identities.some(
883+
(i) =>
884+
i.verified &&
885+
i.type === MemberIdentityType.EMAIL &&
886+
parseGitHubNoreplyEmail(i.value) === value.toLowerCase(),
887+
),
888+
(p, member) => {
889+
p.dbObjectMember = member
890+
p.dbObjectMemberSource = 'email'
891+
},
892+
)
893+
}
894+
808895
// Look up members using cross-identity matching (different platforms)
809896
// we will check only on platforms that store email identities as usernames
810897

services/apps/merge_suggestions_worker/src/activities/memberMergeSuggestions.ts

Lines changed: 28 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
/* eslint-disable @typescript-eslint/no-explicit-any */
22
import uniqBy from 'lodash.uniqby'
33

4+
import { parseGitHubNoreplyEmail } from '@crowd/common'
45
import { addMemberNoMerge } from '@crowd/data-access-layer/src/member_merge'
56
import { MemberField, queryMembers } from '@crowd/data-access-layer/src/members'
67
import MemberMergeSuggestionsRepository from '@crowd/data-access-layer/src/old/apps/merge_suggestions_worker/memberMergeSuggestions.repo'
@@ -14,6 +15,7 @@ import {
1415
MemberIdentityType,
1516
MemberMergeSuggestionTable,
1617
OpenSearchIndex,
18+
PlatformType,
1719
} from '@crowd/types'
1820

1921
import { EMAIL_AS_USERNAME_PLATFORMS } from '../enums'
@@ -77,6 +79,10 @@ export async function getMemberMergeSuggestions(
7779
const unverifiedEmailUsernameMatches = []
7880
const unverifiedUsernameEmailMatches = []
7981

82+
// Noreply email -> platform username matches
83+
// e.g. "123+john@users.noreply.github.com" -> GitHub username "john"
84+
const noreplyEmailUsernameMatches = []
85+
8086
// Process up to 75 identities
8187
// This is a safety limit to prevent OpenSearch max clause errors
8288
for (const { verified, value, platform, type } of identities.slice(0, 75)) {
@@ -107,6 +113,14 @@ export async function getMemberMergeSuggestions(
107113
targetLists.usernameEmail.push({ value })
108114
}
109115

116+
// Noreply email -> platform username extraction
117+
if (isEmail && verified) {
118+
const ghUsername = parseGitHubNoreplyEmail(value)
119+
if (ghUsername) {
120+
noreplyEmailUsernameMatches.push({ value: ghUsername, platform: PlatformType.GITHUB })
121+
}
122+
}
123+
110124
// Fuzzy matches (only for verified & non-numeric)
111125
if (verified && !isNumeric(value)) {
112126
targetLists.fuzzy.push({ value: stripProtocol(value) })
@@ -199,7 +213,20 @@ export async function getMemberMergeSuggestions(
199213
}),
200214
},
201215
{
202-
// Query 4: Verified -> Verified fuzzy matches
216+
// Query 8: Verified GitHub noreply email -> GitHub username (the matched username may be verified or unverified)
217+
matches: uniqBy(noreplyEmailUsernameMatches, 'value'),
218+
builder: ({ value, platform }) => ({
219+
bool: {
220+
must: [
221+
{ term: { [`nested_identities.keyword_value`]: value } },
222+
{ match: { [`nested_identities.string_platform`]: platform } },
223+
{ term: { [`nested_identities.keyword_type`]: MemberIdentityType.USERNAME } },
224+
],
225+
},
226+
}),
227+
},
228+
{
229+
// Query 9: Verified -> Verified fuzzy matches
203230
matches: uniqBy(verifiedFuzzyMatches, 'value'),
204231
builder: ({ value }) => ({
205232
match: {

services/apps/merge_suggestions_worker/src/memberSimilarityCalculator.ts

Lines changed: 62 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import { get as getLevenshteinDistance } from 'fast-levenshtein'
22

3+
import { parseGitHubNoreplyEmail } from '@crowd/common'
34
import {
45
IMemberIdentity,
56
IMemberOpensearch,
@@ -105,6 +106,11 @@ class MemberSimilarityCalculator {
105106
}
106107
}
107108

109+
// Check if a noreply email on one member resolves to a username on the other
110+
if (this.hasMatchingUsernameFromNoreplyEmail(primaryMember, similarMember)) {
111+
return 0.95
112+
}
113+
108114
for (const primaryIdentity of primaryMember.identities.filter((i) => i.verified)) {
109115
// similar member has an unverified identity matching one of the primary member's verified identities, return score 95
110116
if (
@@ -187,7 +193,9 @@ class MemberSimilarityCalculator {
187193
similarMember: IMemberOpensearch,
188194
): boolean {
189195
if (member.identities && member.identities.length > 0) {
190-
for (const identity of member.identities) {
196+
for (const identity of member.identities.filter(
197+
(i) => i.type === MemberIdentityType.USERNAME,
198+
)) {
191199
if (
192200
similarMember.nested_identities.some(
193201
(i) =>
@@ -204,6 +212,59 @@ class MemberSimilarityCalculator {
204212
return false
205213
}
206214

215+
/**
 * Tells whether a verified GitHub noreply email held by one member resolves to a
 * GitHub username held by the other member.
 *
 * Both directions are tried, in this order:
 *  1. primary member's emails -> similar member's usernames
 *  2. similar member's emails -> primary member's usernames
 *
 * Usernames are compared lowercased. Only the email side is required to be verified;
 * the username side is matched regardless of its verification flag.
 */
static hasMatchingUsernameFromNoreplyEmail(
  primaryMember: IMemberWithAggregatesForMergeSuggestions,
  similarMember: IMemberOpensearch,
): boolean {
  // Direction 1: GitHub usernames known for the similar member (lowercased)
  const similarGithubUsernames = new Set(
    similarMember.nested_identities
      .filter(
        (candidate) =>
          candidate.string_platform === PlatformType.GITHUB &&
          candidate.keyword_type === MemberIdentityType.USERNAME,
      )
      .map((candidate) => candidate.string_value?.toLowerCase()),
  )

  const primaryEmailResolves = primaryMember.identities.some((identity) => {
    // Only verified email identities can carry a noreply address worth parsing
    if (!(identity.verified && identity.type === MemberIdentityType.EMAIL)) return false

    const parsed = parseGitHubNoreplyEmail(identity.value)
    return !!parsed && similarGithubUsernames.has(parsed)
  })

  if (primaryEmailResolves) return true

  // Direction 2: GitHub usernames known for the primary member (lowercased)
  const primaryGithubUsernames = new Set(
    primaryMember.identities
      .filter(
        (candidate) =>
          candidate.platform === PlatformType.GITHUB &&
          candidate.type === MemberIdentityType.USERNAME,
      )
      .map((candidate) => candidate.value?.toLowerCase()),
  )

  return similarMember.nested_identities.some((identity) => {
    if (!(identity.bool_verified && identity.keyword_type === MemberIdentityType.EMAIL)) {
      return false
    }

    const parsed = parseGitHubNoreplyEmail(identity.string_value)
    return !!parsed && primaryGithubUsernames.has(parsed)
  })
}
267+
207268
static hasEmailAsUsernameIdentityMatch(
208269
primaryIdentity: IMemberIdentity,
209270
similarMember: IMemberOpensearch,

services/libs/common/src/email.ts

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,3 +3,24 @@ import validator from 'validator'
33
export const isValidEmail = (value: string): boolean => {
44
return validator.isEmail(value)
55
}
6+
7+
const GITHUB_NOREPLY_EMAIL_SUFFIX = '@users.noreply.github.com'

// The account ID that precedes the '+' in the ID+USERNAME form is purely numeric.
const GITHUB_NOREPLY_ID_PATTERN = /^\d+$/

// Characters that can never be part of the extracted username: a second '+',
// an embedded '@', or any whitespace means the local part is malformed.
const GITHUB_NOREPLY_INVALID_USERNAME_CHARS = /[\s+@]/

/**
 * Extracts the username from a GitHub noreply email.
 *
 * Two address shapes are accepted, case-insensitively:
 *  - `ID+USERNAME@users.noreply.github.com` (ID must be numeric)
 *  - `USERNAME@users.noreply.github.com`
 *
 * Malformed local parts (non-numeric ID, extra `+`, embedded `@` or whitespace)
 * are rejected instead of being returned as a bogus username, because callers use
 * the result to match members against existing GitHub username identities.
 *
 * @param email - Address to inspect; leading/trailing whitespace is ignored.
 * @returns The lowercased username, or `null` when the value is empty, is not a
 *   GitHub noreply address, or has a malformed local part.
 * @see https://docs.github.com/en/account-and-profile/reference/email-addresses-reference#your-noreply-email-address
 */
export const parseGitHubNoreplyEmail = (email?: string | null): string | null => {
  if (!email) return null

  const lower = email.trim().toLowerCase()
  if (!lower.endsWith(GITHUB_NOREPLY_EMAIL_SUFFIX)) return null

  const local = lower.slice(0, -GITHUB_NOREPLY_EMAIL_SUFFIX.length)
  if (!local) return null

  const plusIndex = local.indexOf('+')

  // When an ID prefix is present it must be a plain number
  if (plusIndex >= 0 && !GITHUB_NOREPLY_ID_PATTERN.test(local.slice(0, plusIndex))) {
    return null
  }

  const username = plusIndex >= 0 ? local.slice(plusIndex + 1) : local
  if (!username || GITHUB_NOREPLY_INVALID_USERNAME_CHARS.test(username)) return null

  return username
}

0 commit comments

Comments
 (0)