Skip to content

Commit ff856f8

Browse files
ensure all relevant clientSpamTrainingData is uploaded for mailbox
We want to make sure that all relevant clientSpamTrainingData is uploaded correctly for each mailbox. Previously, if clientSpamTrainingData was not empty for a mailbox, we would not upload any more training data. This led to cases where users had only a fraction of the training data compared to the mails available in their mailbox. When training from scratch, we now check whether the number of already existing clientSpamTrainingData entries is smaller than the number of relevant mails for training and, if so, upload training data for the mails that are still missing. Co-authored-by: abp <abp@tutao.de>
1 parent aaa7535 commit ff856f8

File tree

2 files changed

+129
-20
lines changed

2 files changed

+129
-20
lines changed

src/mail-app/workerUtils/spamClassification/SpamClassificationDataDealer.ts

Lines changed: 15 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -55,16 +55,21 @@ export class SpamClassificationDataDealer {
5555
// clientSpamTrainingData is NOT cached
5656
let clientSpamTrainingData = await this.entityClient.loadAll(ClientSpamTrainingDatumTypeRef, mailbox.clientSpamTrainingData)
5757

58-
// if the training data is empty for this mailbox, we are aggregating
59-
// the last INITIAL_SPAM_CLASSIFICATION_INDEX_INTERVAL_DAYS of mails and uploading the training data
60-
if (isEmpty(clientSpamTrainingData)) {
61-
console.log("building and uploading initial training data for mailbox: " + mailbox._id)
62-
const mailsWithMailDetails = await this.fetchMailAndMailDetailsForMailbox(mailbox, mailSets)
63-
console.log(`mailbox has ${mailsWithMailDetails.length} mails suitable for encrypted training vector data upload`)
64-
console.log(`vectorizing, compressing and encrypting those ${mailsWithMailDetails.length} mails...`)
65-
await this.uploadTrainingDataForMails(mailsWithMailDetails, mailbox, mailSets)
58+
// if the clientSpamTrainingData is empty or does not include all relevant clientSpamTrainingData
59+
// for this mailbox, we are aggregating the last INITIAL_SPAM_CLASSIFICATION_INDEX_INTERVAL_DAYS of mails
60+
// and upload the missing clientSpamTrainingDatum entries
61+
const allRelevantMailsInTrainingInterval = await this.fetchMailAndMailDetailsForMailbox(mailbox, mailSets)
62+
console.log(`mailbox ${mailbox._id} has total ${allRelevantMailsInTrainingInterval.length} relevant mails in training interval for spam classification`)
63+
if (clientSpamTrainingData.length < allRelevantMailsInTrainingInterval.length) {
64+
const mailsToUpload = allRelevantMailsInTrainingInterval.filter((mail) => {
65+
return !clientSpamTrainingData.some((datum) => isSameId(getElementId(mail.mail), getElementId(datum)))
66+
})
67+
console.log("building and uploading initial / new training data for mailbox: " + mailbox._id)
68+
console.log(`mailbox ${mailbox._id} has ${mailsToUpload.length} new mails suitable for encrypted training vector data upload`)
69+
console.log(`vectorizing, compressing and encrypting those ${mailsToUpload.length} mails... for mailbox ${mailbox._id}`)
70+
await this.uploadTrainingDataForMails(mailsToUpload, mailbox, mailSets)
6671
clientSpamTrainingData = await this.entityClient.loadAll(ClientSpamTrainingDatumTypeRef, mailbox.clientSpamTrainingData)
67-
console.log(`clientSpamTrainingData list on the mailbox has ${clientSpamTrainingData.length} members.`)
72+
console.log(`clientSpamTrainingData list on the mailbox ${mailbox._id} has ${clientSpamTrainingData.length} members.`)
6873
}
6974

7075
const { subsampledTrainingData, hamCount, spamCount } = this.subsampleHamAndSpamMails(clientSpamTrainingData)
@@ -172,8 +177,8 @@ export class SpamClassificationDataDealer {
172177
async fetchMailsByMailbagAfterDate(mailbag: MailBag, mailSets: MailFolder[], startDate: Date): Promise<Array<MailWithMailDetails>> {
173178
const bulkMailLoader = await this.bulkMailLoader()
174179
const mails = await this.entityClient.loadAll(MailTypeRef, mailbag.mails, timestampToGeneratedId(startDate.getTime()))
180+
const trashFolder = assertNotNull(mailSets.find((set) => getMailSetKind(set) === MailSetKind.TRASH))
175181
const filteredMails = mails.filter((mail) => {
176-
const trashFolder = assertNotNull(mailSets.find((set) => getMailSetKind(set) === MailSetKind.TRASH))
177182
const isMailTrashed = mail.sets.some((setId) => isSameId(setId, trashFolder._id))
178183
return isNotNull(mail.mailDetails) && !hasError(mail) && mail.receivedDate > startDate && !isMailTrashed
179184
})

test/tests/api/worker/utils/spamClassification/SpamClassificationDataDealerTest.ts

Lines changed: 114 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -40,8 +40,13 @@ function createMailByFolderAndReceivedDate(mailId: IdTuple, mailSet: IdTuple, re
4040
})
4141
}
4242

43-
function createSpamTrainingDatumByConfidenceAndDecision(confidence: string, spamDecision: SpamDecision): ClientSpamTrainingDatum {
43+
function createSpamTrainingDatumByConfidenceAndDecision(
44+
confidence: string,
45+
spamDecision: SpamDecision,
46+
id: IdTuple = ["listId", "elementId"],
47+
): ClientSpamTrainingDatum {
4448
return createTestEntity(ClientSpamTrainingDatumTypeRef, {
49+
_id: id,
4550
_ownerGroup: "group",
4651
confidence,
4752
spamDecision,
@@ -153,14 +158,24 @@ o.spec("SpamClassificationDataDealer", () => {
153158
o("uploads training data when clientSpamTrainingData is empty", async () => {
154159
when(entityClientMock.load(MailboxGroupRootTypeRef, "owner")).thenResolve(mailboxGroupRoot)
155160
when(entityClientMock.load(MailBoxTypeRef, "mailbox")).thenResolve(mailBox)
156-
const spamTrainingData = Array.from({ length: 10 }, () =>
157-
createSpamTrainingDatumByConfidenceAndDecision(DEFAULT_IS_SPAM_CONFIDENCE, SpamDecision.WHITELIST),
158-
).concat(Array.from({ length: 10 }, () => createSpamTrainingDatumByConfidenceAndDecision(DEFAULT_IS_SPAM_CONFIDENCE, SpamDecision.BLACKLIST)))
159-
const mails = Array.from({ length: 10 }, () =>
160-
createMailByFolderAndReceivedDate([mailBox.currentMailBag!.mails, "inboxMailId"], inboxFolder._id, new Date(), mailDetails._id),
161+
const mails = Array.from({ length: 10 }, (_, index) =>
162+
createMailByFolderAndReceivedDate([mailBox.currentMailBag!.mails, "inboxMailId" + index], inboxFolder._id, new Date(), mailDetails._id),
163+
).concat(
164+
Array.from({ length: 10 }, (_, index) =>
165+
createMailByFolderAndReceivedDate([mailBox.currentMailBag!.mails, "spamMailId" + index], spamFolder._id, new Date(), mailDetails._id),
166+
),
167+
)
168+
const spamTrainingData = Array.from({ length: 10 }, (_, index) =>
169+
createSpamTrainingDatumByConfidenceAndDecision(DEFAULT_IS_SPAM_CONFIDENCE, SpamDecision.WHITELIST, [
170+
mailBox.clientSpamTrainingData!,
171+
getElementId(mails[index]),
172+
]),
161173
).concat(
162-
Array.from({ length: 10 }, () =>
163-
createMailByFolderAndReceivedDate([mailBox.currentMailBag!.mails, "spamMailId"], spamFolder._id, new Date(), mailDetails._id),
174+
Array.from({ length: 10 }, (_, index) =>
175+
createSpamTrainingDatumByConfidenceAndDecision(DEFAULT_IS_SPAM_CONFIDENCE, SpamDecision.BLACKLIST, [
176+
mailBox.clientSpamTrainingData!,
177+
getElementId(mails[10 + index]),
178+
]),
164179
),
165180
)
166181
const modifiedIndicesSinceStart = spamTrainingData.map((data) =>
@@ -203,18 +218,106 @@ o.spec("SpamClassificationDataDealer", () => {
203218
})
204219
})
205220

221+
o("uploads training data when clientSpamTrainingData does not include all relevant mails", async () => {
222+
when(entityClientMock.load(MailboxGroupRootTypeRef, "owner")).thenResolve(mailboxGroupRoot)
223+
when(entityClientMock.load(MailBoxTypeRef, "mailbox")).thenResolve(mailBox)
224+
225+
const relevantMails = Array.from({ length: 40 }, (_, index) =>
226+
createMailByFolderAndReceivedDate([mailBox.currentMailBag!.mails, "inboxMailId" + index], inboxFolder._id, new Date(), mailDetails._id),
227+
).concat(
228+
Array.from({ length: 40 }, (_, index) =>
229+
createMailByFolderAndReceivedDate([mailBox.currentMailBag!.mails, "spamMailId" + index], spamFolder._id, new Date(), mailDetails._id),
230+
),
231+
)
232+
233+
const existingSpamTrainingData = Array.from({ length: 20 }, (_, index) =>
234+
createSpamTrainingDatumByConfidenceAndDecision(DEFAULT_IS_SPAM_CONFIDENCE, SpamDecision.WHITELIST, [
235+
mailBox.clientSpamTrainingData!,
236+
getElementId(relevantMails[index]),
237+
]),
238+
).concat(
239+
Array.from({ length: 20 }, (_, index) =>
240+
createSpamTrainingDatumByConfidenceAndDecision(DEFAULT_IS_SPAM_CONFIDENCE, SpamDecision.BLACKLIST, [
241+
mailBox.clientSpamTrainingData!,
242+
getElementId(relevantMails[40 + index]),
243+
]),
244+
),
245+
)
246+
247+
const updatedSpamTrainingData = Array.from({ length: 40 }, (_, index) =>
248+
createSpamTrainingDatumByConfidenceAndDecision(DEFAULT_IS_SPAM_CONFIDENCE, SpamDecision.WHITELIST, [
249+
mailBox.clientSpamTrainingData!,
250+
getElementId(relevantMails[index]),
251+
]),
252+
).concat(
253+
Array.from({ length: 40 }, (_, index) =>
254+
createSpamTrainingDatumByConfidenceAndDecision(DEFAULT_IS_SPAM_CONFIDENCE, SpamDecision.BLACKLIST, [
255+
mailBox.clientSpamTrainingData!,
256+
getElementId(relevantMails[40 + index]),
257+
]),
258+
),
259+
)
260+
261+
const modifiedIndicesSinceStart = updatedSpamTrainingData.map((data) =>
262+
createClientSpamTrainingDatumIndexEntryByClientSpamTrainingDatumElementId(getElementId(data)),
263+
)
264+
265+
when(entityClientMock.loadAll(ClientSpamTrainingDatumTypeRef, mailBox.clientSpamTrainingData!)).thenResolve(
266+
existingSpamTrainingData,
267+
updatedSpamTrainingData,
268+
)
269+
when(entityClientMock.loadAll(MailTypeRef, mailBox.currentMailBag!.mails, anything())).thenResolve(relevantMails)
270+
when(entityClientMock.loadAll(MailTypeRef, mailBox.archivedMailBags[0].mails, anything())).thenResolve([])
271+
when(entityClientMock.loadAll(MailFolderTypeRef, mailBox.folders!.folders)).thenResolve([inboxFolder, spamFolder, trashFolder])
272+
when(entityClientMock.loadAll(ClientSpamTrainingDatumIndexEntryTypeRef, mailBox.modifiedClientSpamTrainingDataIndex!)).thenResolve(
273+
modifiedIndicesSinceStart,
274+
)
275+
276+
when(bulkMailLoaderMock.loadMailDetails(relevantMails)).thenResolve(
277+
relevantMails.map((mail) => {
278+
return { mail, mailDetails }
279+
}),
280+
)
281+
282+
const trainingDataset = await spamClassificationDataDealer.fetchAllTrainingData("owner")
283+
284+
// first load: existing (partial) training data, second load: training data after the missing entries were uploaded
285+
verify(entityClientMock.loadAll(ClientSpamTrainingDatumTypeRef, mailBox.clientSpamTrainingData!), { times: 2 })
286+
verify(entityClientMock.loadAll(ClientSpamTrainingDatumIndexEntryTypeRef, mailBox.modifiedClientSpamTrainingDataIndex!), { times: 1 })
287+
288+
const expectedUploadMailsHam = relevantMails.slice(20, 40)
289+
const expectedUploadMailsSpam = relevantMails.slice(60, 80)
290+
291+
const unencryptedPayload = expectedUploadMailsHam.concat(expectedUploadMailsSpam).map((mail) => {
292+
return {
293+
mailId: mail._id,
294+
isSpam: isSameId(mail.sets[0], spamFolder._id),
295+
confidence: DEFAULT_IS_SPAM_CONFIDENCE,
296+
vector: new Uint8Array(1),
297+
} as UnencryptedPopulateClientSpamTrainingDatum
298+
})
299+
verify(mailFacadeMock.populateClientSpamTrainingData("owner", unencryptedPayload), { times: 1 })
300+
301+
o(trainingDataset).deepEquals({
302+
trainingData: updatedSpamTrainingData,
303+
lastTrainingDataIndexId: getElementId(last(modifiedIndicesSinceStart)!),
304+
hamCount: 40,
305+
spamCount: 40,
306+
})
307+
})
308+
206309
o("successfully returns training data with mixed ham/spam data", async () => {
207310
when(entityClientMock.load(MailboxGroupRootTypeRef, "owner")).thenResolve(mailboxGroupRoot)
208311
when(entityClientMock.load(MailBoxTypeRef, "mailbox")).thenResolve(mailBox)
312+
when(entityClientMock.loadAll(MailTypeRef, anything(), anything())).thenResolve([])
313+
209314
const spamTrainingData = Array.from({ length: 10 }, () =>
210315
createSpamTrainingDatumByConfidenceAndDecision(DEFAULT_IS_SPAM_CONFIDENCE, SpamDecision.WHITELIST),
211316
).concat(Array.from({ length: 10 }, () => createSpamTrainingDatumByConfidenceAndDecision(DEFAULT_IS_SPAM_CONFIDENCE, SpamDecision.BLACKLIST)))
212-
213317
const modifiedIndicesSinceStart = spamTrainingData.map((data) =>
214318
createClientSpamTrainingDatumIndexEntryByClientSpamTrainingDatumElementId(getElementId(data)),
215319
)
216320
when(entityClientMock.loadAll(ClientSpamTrainingDatumTypeRef, mailBox.clientSpamTrainingData!)).thenResolve(spamTrainingData)
217-
when(entityClientMock.loadAll(MailTypeRef, mailBox.archivedMailBags[0].mails, anything())).thenResolve([])
218321
when(entityClientMock.loadAll(MailFolderTypeRef, mailBox.folders!.folders)).thenResolve([inboxFolder, spamFolder, trashFolder])
219322
when(entityClientMock.loadAll(ClientSpamTrainingDatumIndexEntryTypeRef, mailBox.modifiedClientSpamTrainingDataIndex!)).thenResolve(
220323
modifiedIndicesSinceStart,
@@ -241,6 +344,7 @@ o.spec("SpamClassificationDataDealer", () => {
241344
const validSpamData = createSpamTrainingDatumByConfidenceAndDecision("4", SpamDecision.BLACKLIST)
242345
when(entityClientMock.load(MailboxGroupRootTypeRef, "owner")).thenResolve(mailboxGroupRoot)
243346
when(entityClientMock.load(MailBoxTypeRef, "mailbox")).thenResolve(mailBox)
347+
when(entityClientMock.loadAll(MailTypeRef, anything(), anything())).thenResolve([])
244348

245349
const spamTrainingData = [noneDecisionData, zeroConfData, validSpamData, validHamData]
246350
const modifiedIndicesSinceStart = spamTrainingData.map((data) =>

0 commit comments

Comments
 (0)