Skip to content

Commit 18dc917

Browse files
authored
Fix mission import performance (#226)
* create branch
* Update .eslintrc.js
1 parent ce5b64d commit 18dc917

6 files changed

Lines changed: 565 additions & 581 deletions

File tree

process/.eslintrc.js

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -21,7 +21,7 @@ module.exports = {
2121
"no-constant-condition": "off", // TODO: fix code and activate
2222
"no-useless-escape": "off", // TODO: fix code and activate
2323
"no-prototype-builtins": "off", // TODO: fix code and activate
24-
24+
"no-control-regex": "off", // TODO: fix code and activate
2525
"import/extensions": "off",
2626
"import/no-extraneous-dependencies": "off",
2727
},

process/src/jobs/import/index.ts

Lines changed: 49 additions & 158 deletions
Original file line number | Diff line number | Diff line change
@@ -1,107 +1,16 @@
1-
import { XMLParser } from "fast-xml-parser";
2-
31
import { captureException } from "../../error";
42
import ImportModel from "../../models/import";
53
import PublisherModel from "../../models/publisher";
64

7-
import { Schema } from "mongoose";
85
import MissionModel from "../../models/mission";
9-
import { Import, Mission, MissionXML, Publisher } from "../../types";
6+
import { Import, Mission, Publisher } from "../../types";
107
import { enrichWithGeoloc } from "./geoloc";
11-
import { buildMission } from "./mission";
8+
import { buildData } from "./mission";
129
import { verifyOrganization } from "./organization";
13-
import { bulkDB } from "./utils/db";
14-
15-
const parseXML = (xmlString: string) => {
16-
const parser = new XMLParser();
17-
18-
const options = {
19-
attributeNamePrefix: "@_",
20-
textNodeName: "#text",
21-
ignoreAttributes: true,
22-
ignoreNameSpace: false,
23-
allowBooleanAttributes: false,
24-
parseNodeValue: true,
25-
parseAttributeValue: false,
26-
trimValues: true,
27-
cdataPositionChar: "\\c",
28-
parseTrueNumberOnly: false,
29-
arrayMode: false, //"strict"
30-
stopNodes: ["parse-me-as-string"],
31-
isArray: (name: string, jpath: string, isLeafNode: boolean, isAttribute: boolean) => {
32-
if (jpath === "source.mission.addresses.address") {
33-
return true;
34-
}
35-
return false;
36-
},
37-
};
38-
39-
const res = parser.parse(xmlString, options);
40-
41-
if (!res.source || !res.source.mission) {
42-
return;
43-
}
44-
if (res.source.mission && !Array.isArray(res.source.mission)) {
45-
res.source.mission = [res.source.mission];
46-
}
47-
48-
// Remove duplicates clientId
49-
const clientId = new Set();
50-
const unique = [] as MissionXML[];
51-
const data = res.source.mission as MissionXML[];
10+
import { bulkDB, cleanDB } from "./utils/db";
11+
import { parseXML } from "./utils/xml";
5212

53-
data.forEach((mission) => {
54-
if (!clientId.has(mission.clientId)) {
55-
const addresses = mission.addresses as any;
56-
if (addresses?.address && Array.isArray(addresses.address)) {
57-
mission.addresses = addresses.address;
58-
} else if (addresses?.address) {
59-
mission.addresses = [addresses.address];
60-
}
61-
clientId.add(mission.clientId);
62-
unique.push(mission);
63-
}
64-
});
65-
66-
return unique;
67-
};
68-
69-
const buildData = async (startTime: Date, publisher: Publisher, missionXML: MissionXML) => {
70-
try {
71-
const missionDB = await MissionModel.findOne({
72-
publisherId: publisher._id,
73-
clientId: missionXML.clientId,
74-
});
75-
76-
const mission = buildMission(publisher, missionXML, missionDB?.toObject());
77-
if (missionDB) {
78-
mission._id = missionDB._id as Schema.Types.ObjectId;
79-
mission.createdAt = missionDB.createdAt;
80-
}
81-
mission.deleted = false;
82-
mission.deletedAt = null;
83-
mission.lastSyncAt = startTime;
84-
mission.publisherId = publisher._id.toString();
85-
mission.publisherName = publisher.name;
86-
mission.publisherLogo = publisher.logo;
87-
mission.publisherUrl = publisher.url;
88-
mission.updatedAt = startTime;
89-
90-
mission.organizationVerificationStatus = missionDB?.organizationVerificationStatus;
91-
if (missionDB && missionDB.statusCommentHistoric && Array.isArray(missionDB.statusCommentHistoric)) {
92-
if (missionDB.statusCode !== mission.statusCode) {
93-
mission.statusCommentHistoric = [...missionDB.statusCommentHistoric, { status: mission.statusCode, comment: mission.statusComment, date: mission.updatedAt }];
94-
}
95-
} else {
96-
mission.statusCommentHistoric = [{ status: mission.statusCode, comment: mission.statusComment, date: mission.updatedAt }];
97-
}
98-
99-
return mission;
100-
} catch (error) {
101-
console.log("ici", error);
102-
captureException(error, `Error while parsing mission ${missionXML.clientId}`);
103-
}
104-
};
13+
const CHUNK_SIZE = 2000;
10514

10615
const importPublisher = async (publisher: Publisher, start: Date) => {
10716
if (!publisher) {
@@ -134,9 +43,7 @@ const importPublisher = async (publisher: Publisher, start: Date) => {
13443
console.log(`[${publisher.name}] Parse xml from ${publisher.feed}`);
13544
const missionsXML = parseXML(xml);
13645
if (!missionsXML || !missionsXML.length) {
137-
console.log(`[${publisher.name}] Empty xml`);
138-
139-
console.log(`[${publisher.name}] Mongo cleaning...`);
46+
console.log(`[${publisher.name}] Empty xml, mongo cleaning...`);
14047
const mongoRes = await MissionModel.updateMany({ publisherId: publisher._id, deletedAt: null, updatedAt: { $lt: start } }, { deleted: true, deletedAt: new Date() });
14148
console.log(`[${publisher.name}] Mongo cleaning deleted ${mongoRes.modifiedCount}`);
14249
obj.endedAt = new Date();
@@ -147,75 +54,59 @@ const importPublisher = async (publisher: Publisher, start: Date) => {
14754
// GET COUNT MISSIONS IN DB
14855
const missionsDB = await MissionModel.countDocuments({
14956
publisherId: publisher._id,
150-
deleted: false,
57+
deletedAt: null,
15158
});
15259
console.log(`[${publisher.name}] Found ${missionsDB} missions in DB`);
15360

154-
// BUILD NEW MISSIONS
155-
const missions = [] as Mission[];
156-
const promises = [] as Promise<Mission | undefined>[];
157-
for (let j = 0; j < missionsXML.length; j++) {
158-
const missionXML = missionsXML[j];
159-
promises.push(buildData(obj.startedAt, publisher, missionXML));
160-
161-
if (j % 50 === 0) {
61+
for (let i = 0; i < missionsXML.length; i += CHUNK_SIZE) {
62+
console.log(`[${publisher.name}] Processing chunk ${i / CHUNK_SIZE + 1} of ${Math.ceil(missionsXML.length / CHUNK_SIZE)}`);
63+
const chunk = missionsXML.slice(i, i + CHUNK_SIZE);
64+
// BUILD NEW MISSIONS
65+
const missions = [] as Mission[];
66+
const promises = [] as Promise<Mission | undefined>[];
67+
for (let j = 0; j < chunk.length; j++) {
68+
const missionXML = chunk[j];
69+
promises.push(buildData(obj.startedAt, publisher, missionXML));
70+
71+
if (j % 50 === 0) {
72+
const res = await Promise.all(promises);
73+
res.filter((e) => e !== undefined).forEach((e: Mission) => missions.push(e));
74+
promises.length = 0;
75+
}
76+
}
77+
if (promises.length > 0) {
16278
const res = await Promise.all(promises);
16379
res.filter((e) => e !== undefined).forEach((e: Mission) => missions.push(e));
164-
promises.length = 0;
16580
}
166-
}
167-
if (promises.length > 0) {
168-
const res = await Promise.all(promises);
169-
res.filter((e) => e !== undefined).forEach((e: Mission) => missions.push(e));
170-
}
17181

172-
// GEOLOC
173-
const resultGeoloc = await enrichWithGeoloc(publisher, missions);
174-
resultGeoloc.forEach((r) => {
175-
const mission = missions.find((m) => m.clientId.toString() === r.clientId.toString());
176-
if (mission && r.addressIndex < mission.addresses.length) {
177-
const address = mission.addresses[r.addressIndex];
178-
address.street = r.street;
179-
address.city = r.city;
180-
address.postalCode = r.postalCode;
181-
address.departmentCode = r.departmentCode;
182-
address.departmentName = r.departmentName;
183-
address.region = r.region;
184-
if (r.location?.lat && r.location?.lon) {
185-
address.location = { lat: r.location.lat, lon: r.location.lon };
186-
address.geoPoint = r.geoPoint;
82+
// GEOLOC
83+
const resultGeoloc = await enrichWithGeoloc(publisher, missions);
84+
resultGeoloc.forEach((r) => {
85+
const mission = missions.find((m) => m.clientId.toString() === r.clientId.toString());
86+
if (mission && r.addressIndex < mission.addresses.length) {
87+
const address = mission.addresses[r.addressIndex];
88+
address.street = r.street;
89+
address.city = r.city;
90+
address.postalCode = r.postalCode;
91+
address.departmentCode = r.departmentCode;
92+
address.departmentName = r.departmentName;
93+
address.region = r.region;
94+
if (r.location?.lat && r.location?.lon) {
95+
address.location = { lat: r.location.lat, lon: r.location.lon };
96+
address.geoPoint = r.geoPoint;
97+
}
98+
address.geolocStatus = r.geolocStatus;
18799
}
188-
address.geolocStatus = r.geolocStatus;
189-
}
190-
});
100+
});
191101

192-
// RNA
193-
console.log(`[Organization] Starting organization verification for ${missions.length} missions`);
194-
const resultRNA = await verifyOrganization(missions);
195-
console.log(`[Organization] Received ${resultRNA.length} verification results`);
196-
197-
resultRNA.forEach((r) => {
198-
const mission = missions.find((m) => m.clientId.toString() === r.clientId.toString());
199-
if (mission) {
200-
mission.organizationId = r.organizationId;
201-
mission.organizationNameVerified = r.organizationNameVerified;
202-
mission.organizationRNAVerified = r.organizationRNAVerified;
203-
mission.organizationSirenVerified = r.organizationSirenVerified;
204-
mission.organizationSiretVerified = r.organizationSiretVerified;
205-
mission.organizationAddressVerified = r.organizationAddressVerified;
206-
mission.organizationCityVerified = r.organizationCityVerified;
207-
mission.organizationPostalCodeVerified = r.organizationPostalCodeVerified;
208-
mission.organizationDepartmentCodeVerified = r.organizationDepartmentCodeVerified;
209-
mission.organizationDepartmentNameVerified = r.organizationDepartmentNameVerified;
210-
mission.organizationRegionVerified = r.organizationRegionVerified;
211-
mission.organizationVerificationStatus = r.organizationVerificationStatus;
212-
} else {
213-
console.log(`[Organization Warning] Could not find mission for clientId: ${r.clientId}`);
214-
}
215-
});
102+
// RNA
103+
await verifyOrganization(missions);
104+
// BULK WRITE
105+
await bulkDB(missions, publisher, obj);
106+
}
216107

217-
// BULK WRITE
218-
await bulkDB(missions, publisher, obj);
108+
// CLEAN DB
109+
await cleanDB(publisher, obj);
219110

220111
// STATS
221112
obj.missionCount = await MissionModel.countDocuments({

process/src/jobs/import/mission.ts

Lines changed: 48 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -1,8 +1,12 @@
11
import he from "he";
22
import { convert } from "html-to-text";
3+
import { Schema } from "mongoose";
34

5+
import { SC_ID } from "../../config";
46
import { COUNTRIES } from "../../constants/countries";
57
import { AUTRE_IMAGE, DOMAINS, DOMAIN_IMAGES } from "../../constants/domains";
8+
import { captureException } from "../../error";
9+
import MissionModel from "../../models/mission";
610
import { Mission, MissionXML, Publisher } from "../../types";
711
import { getAddress, getAddresses } from "./utils/address";
812

@@ -161,7 +165,7 @@ const parseArray = (value: string | { value: string[] | string } | undefined, in
161165
return [value];
162166
};
163167

164-
export const buildMission = (publisher: Publisher, missionXML: MissionXML, missionDB?: Mission) => {
168+
const parseMission = (publisher: Publisher, missionXML: MissionXML, missionDB: Mission | null) => {
165169
const mission = {
166170
title: he.decode(missionXML.title),
167171
description: convert(he.decode(missionXML.description || ""), {
@@ -209,6 +213,13 @@ export const buildMission = (publisher: Publisher, missionXML: MissionXML, missi
209213
organizationReseaux: parseArray(missionXML.organizationReseaux, true) || [],
210214
} as Mission;
211215

216+
// Moderation except Service Civique (already moderated)
217+
mission.statusComment = "";
218+
mission.statusCode = "ACCEPTED";
219+
if (publisher._id.toString() !== SC_ID) {
220+
getModeration(mission);
221+
}
222+
212223
if (mission.domain === "mémoire et citoyenneté") {
213224
mission.domain = "memoire-et-citoyennete";
214225
}
@@ -221,17 +232,19 @@ export const buildMission = (publisher: Publisher, missionXML: MissionXML, missi
221232
getAddress(mission, missionXML);
222233
}
223234

224-
// Moderation except Service Civique (already moderated)
225-
mission.statusComment = "";
226-
mission.statusCode = "ACCEPTED";
227-
if (publisher._id.toString() !== "5f99dbe75eb1ad767733b206") {
228-
getModeration(mission);
229-
}
235+
if (missionDB) {
236+
mission._id = missionDB._id as Schema.Types.ObjectId;
237+
mission.createdAt = missionDB.createdAt;
238+
mission.organizationVerificationStatus = missionDB.organizationVerificationStatus;
230239

231-
// SPECIFIC CASE
232-
if (!mission.publisherLogo) {
233-
mission.publisherLogo = "";
234-
} // Publisher without logo
240+
if (missionDB.statusCommentHistoric && Array.isArray(missionDB.statusCommentHistoric)) {
241+
if (missionDB.statusCode !== mission.statusCode) {
242+
mission.statusCommentHistoric = [...missionDB.statusCommentHistoric, { status: mission.statusCode, comment: mission.statusComment, date: mission.updatedAt }];
243+
}
244+
} else {
245+
mission.statusCommentHistoric = [{ status: mission.statusCode, comment: mission.statusComment, date: mission.updatedAt }];
246+
}
247+
}
235248

236249
// Dirty dirty hack for afev to get Joe happy
237250
if (missionXML.organizationName === "Afev") {
@@ -294,3 +307,27 @@ export const buildMission = (publisher: Publisher, missionXML: MissionXML, missi
294307

295308
return mission;
296309
};
310+
311+
export const buildData = async (startTime: Date, publisher: Publisher, missionXML: MissionXML) => {
312+
try {
313+
const missionDB = await MissionModel.findOne({
314+
publisherId: publisher._id,
315+
clientId: missionXML.clientId,
316+
});
317+
318+
const mission = parseMission(publisher, missionXML, missionDB?.toObject() || null);
319+
320+
mission.deleted = false;
321+
mission.deletedAt = null;
322+
mission.lastSyncAt = startTime;
323+
mission.publisherId = publisher._id.toString();
324+
mission.publisherName = publisher.name;
325+
mission.publisherLogo = publisher.logo;
326+
mission.publisherUrl = publisher.url;
327+
mission.updatedAt = startTime;
328+
329+
return mission;
330+
} catch (error) {
331+
captureException(error, `Error while parsing mission ${missionXML.clientId}`);
332+
}
333+
};

0 commit comments

Comments
 (0)