diff --git a/functions/env.sample b/functions/env.sample new file mode 100644 index 00000000..fedd0c6b --- /dev/null +++ b/functions/env.sample @@ -0,0 +1,26 @@ +NUM_SHARDS_INSTANCE_COUNT= +NUM_SHARDS_VOTE_COUNT= +GRAPH_API_VERSION= +ENVIRONMENT= +SIMILARITY_THRESHOLD= +TYPESENSE_HOST= +EMBEDDER_HOST= +CHECKER1_ID= +CHECKER1_TELEGRAM_ID= +CHECKER1_PHONE_NUMBER= +TYPESENSE_PORT= +TYPESENSE_PROTOCOL= +TELEGRAM_REPORT_CHANNEL_ID= +TEST_IMAGE_URL= +HASHIDS_SALT= +WEBHOOK_PATH_WHATSAPP= +WEBHOOK_PATH_TELEGRAM= +CHECKER_APP_HOST= +WEBHOOK_PATH_TYPEFORM= +CHECKERS_GROUP_LINK= +USERS_WHATSAPP_NUMBER= +CHECKERS_CHAT_ID= +TYPEFORM_URL= + +#EXTERNAL APIS +VIRUS_TOTAL_API_KEY= diff --git a/functions/src/definitions/eventHandlers/userGenericMessageHandlers.ts b/functions/src/definitions/eventHandlers/userGenericMessageHandlers.ts index 86ebf668..ae25e6a1 100644 --- a/functions/src/definitions/eventHandlers/userGenericMessageHandlers.ts +++ b/functions/src/definitions/eventHandlers/userGenericMessageHandlers.ts @@ -1,5 +1,6 @@ import * as admin from "firebase-admin" import * as functions from "firebase-functions" +import { validateURLs } from '../../utils/utils'; import { onMessagePublished } from "firebase-functions/v2/pubsub" import { Timestamp } from "firebase-admin/firestore" import { checkNewlyJoined } from "../../validators/common/checkNewlyJoined" @@ -70,6 +71,7 @@ const userGenericMessageHandlerWhatsapp = async function ( const isNewlyJoined = checkNewlyJoined(userSnap, messageTimestamp) + console.log(`Message is of type "${type}"`) switch (type) { //only two types: text or image case "text": @@ -77,6 +79,7 @@ const userGenericMessageHandlerWhatsapp = async function ( if (!message.text) { break } + console.log(`Text message is "${message.text}"`) const textNormalised = normalizeSpaces(message.text).toLowerCase() //normalise spaces needed cos of potential   when copying message on desktop whatsapp if ( checkTemplate( @@ -87,6 +90,7 @@ const 
userGenericMessageHandlerWhatsapp = async function ( textNormalised, responses?.REFERRAL_PREPOPULATED_PREFIX_1.toLowerCase() ) + ) { step = "text_prepopulated" if (isFirstTimeUser) { @@ -94,9 +98,9 @@ const userGenericMessageHandlerWhatsapp = async function ( } else { await sendMenuMessage(userSnap, "MENU_PREFIX", "whatsapp", null, null) } + console.log(`step ${step}`) break } - step = await newTextInstanceHandler({ userSnap, source: message.source, @@ -169,6 +173,8 @@ async function newTextInstanceHandler({ let hasMatch = false let messageRef: FirebaseFirestore.DocumentReference | null = null let messageUpdateObj: MessageData | null = null + let validatedURLS: any; + const machineCategory = (await classifyText(text)) ?? "error" if (from && isFirstTimeUser && machineCategory.includes("irrelevant")) { await userSnap.ref.update({ @@ -181,6 +187,7 @@ async function newTextInstanceHandler({ let embedding let textHash = hashMessage(text) // 1 - check if the exact same message exists in database + try { ;({ embedding, similarity } = await calculateSimilarity( text, @@ -236,45 +243,58 @@ async function newTextInstanceHandler({ rationalisation = await rationaliseMessage(text, machineCategory) } messageRef = db.collection("messages").doc() - messageUpdateObj = { - machineCategory: machineCategory, //Can be "fake news" or "scam" - isMachineCategorised: isMachineAssessed, - originalText: text, - text: strippedMessage, //text - caption: null, - latestInstance: null, - firstTimestamp: timestamp, //timestamp of first instance (firestore timestamp data type) - lastTimestamp: timestamp, //timestamp of latest instance (firestore timestamp data type) - lastRefreshedTimestamp: timestamp, - isPollStarted: false, //boolean, whether or not polling has started - isAssessed: isMachineAssessed, //boolean, whether or not we have concluded the voting - assessedTimestamp: null, - assessmentExpiry: null, - assessmentExpired: false, - truthScore: null, //float, the mean truth score - 
numberPointScale: 6, - isIrrelevant: - isMachineAssessed && machineCategory.includes("irrelevant") - ? true - : null, //bool, if majority voted irrelevant then update this - isScam: isMachineAssessed && machineCategory === "scam" ? true : null, - isIllicit: - isMachineAssessed && machineCategory === "illicit" ? true : null, - isSpam: isMachineAssessed && machineCategory === "spam" ? true : null, - isLegitimate: null, - isUnsure: null, - isInfo: machineCategory === "info" ? true : null, - isSatire: null, - isHarmful: null, - isHarmless: null, - tags: {}, - primaryCategory: isMachineAssessed - ? machineCategory.split("_")[0] //in case of irrelevant_length, we want to store irrelevant - : null, - customReply: null, //string - instanceCount: 0, - rationalisation: rationalisation, + + try { + validatedURLS = await validateURLs(text) + console.log('Validated URLs:', validatedURLS) + + messageUpdateObj = { + machineCategory: machineCategory, //Can be "fake news" or "scam" + isMachineCategorised: isMachineAssessed, + originalText: text, + text: strippedMessage, //text + caption: null, + latestInstance: null, + firstTimestamp: timestamp, //timestamp of first instance (firestore timestamp data type) + lastTimestamp: timestamp, //timestamp of latest instance (firestore timestamp data type) + lastRefreshedTimestamp: timestamp, + isPollStarted: false, //boolean, whether or not polling has started + isAssessed: isMachineAssessed, //boolean, whether or not we have concluded the voting + assessedTimestamp: null, + assessmentExpiry: null, + assessmentExpired: false, + truthScore: null, //float, the mean truth score + numberPointScale: 6, + isIrrelevant: + isMachineAssessed && machineCategory.includes("irrelevant") + ? true + : null, //bool, if majority voted irrelevant then update this + isScam: isMachineAssessed && machineCategory === "scam" ? true : null, + isIllicit: + isMachineAssessed && machineCategory === "illicit" ? 
true : null, + isSpam: isMachineAssessed && machineCategory === "spam" ? true : null, + isLegitimate: null, + isUnsure: null, + isInfo: machineCategory === "info" ? true : null, + isSatire: null, + isHarmful: null, + isHarmless: null, + tags: {}, + primaryCategory: isMachineAssessed + ? machineCategory.split("_")[0] //in case of irrelevant_length, we want to store irrelevant + : null, + customReply: null, //string + instanceCount: 0, + rationalisation: rationalisation, + virtualTotalResults: validatedURLS + } + console.log('messageUpdateObj:', messageUpdateObj) + } catch (error) { + console.error('Error validating URLs:', error) + // You might want to handle the error, such as setting a default value + validatedURLS = null } + } else { messageRef = matchedParentMessageRef } @@ -369,6 +389,8 @@ async function newImageInstanceHandler({ let matchedInstanceSnap let captionHash = caption ? hashMessage(caption) : null + let validatedURLS: any; + if (!mediaId) { throw new Error(`No mediaId for whatsapp message with id ${id}`) } @@ -513,44 +535,55 @@ async function newImageInstanceHandler({ ) } messageRef = db.collection("messages").doc() - messageUpdateObj = { - machineCategory: machineCategory, - isMachineCategorised: isMachineAssessed, - originalText: extractedMessage ?? null, - text: strippedMessage ?? null, //text - caption: caption ?? null, - latestInstance: null, - firstTimestamp: timestamp, //timestamp of first instance (firestore timestamp data type) - lastTimestamp: timestamp, //timestamp of latest instance (firestore timestamp data type) - lastRefreshedTimestamp: timestamp, - isPollStarted: false, //boolean, whether or not polling has started - isAssessed: isMachineAssessed, //boolean, whether or not we have concluded the voting - assessedTimestamp: null, - assessmentExpiry: null, - assessmentExpired: false, - truthScore: null, //float, the mean truth score - numberPointScale: 6, - isIrrelevant: - isMachineAssessed && machineCategory.includes("irrelevant") - ? 
true - : null, //bool, if majority voted irrelevant then update this - isScam: isMachineAssessed && machineCategory === "scam" ? true : null, - isIllicit: - isMachineAssessed && machineCategory === "illicit" ? true : null, - isSpam: isMachineAssessed && machineCategory === "spam" ? true : null, - isLegitimate: null, - isUnsure: null, - isInfo: !caption && machineCategory === "info" ? true : null, - isSatire: null, - isHarmful: null, - isHarmless: null, - tags: {}, - primaryCategory: isMachineAssessed - ? machineCategory.split("_")[0] //in case of irrelevant_length, we want to store irrelevant - : null, - customReply: null, //string - instanceCount: 0, - rationalisation: rationalisation, + + try { + validatedURLS = await validateURLs(extractedMessage) + console.log('Validated URLs:', validatedURLS) + + messageUpdateObj = { + machineCategory: machineCategory, + isMachineCategorised: isMachineAssessed, + originalText: extractedMessage ?? null, + text: strippedMessage ?? null, //text + caption: caption ?? null, + latestInstance: null, + firstTimestamp: timestamp, //timestamp of first instance (firestore timestamp data type) + lastTimestamp: timestamp, //timestamp of latest instance (firestore timestamp data type) + lastRefreshedTimestamp: timestamp, + isPollStarted: false, //boolean, whether or not polling has started + isAssessed: isMachineAssessed, //boolean, whether or not we have concluded the voting + assessedTimestamp: null, + assessmentExpiry: null, + assessmentExpired: false, + truthScore: null, //float, the mean truth score + numberPointScale: 6, + isIrrelevant: + isMachineAssessed && machineCategory.includes("irrelevant") + ? true + : null, //bool, if majority voted irrelevant then update this + isScam: isMachineAssessed && machineCategory === "scam" ? true : null, + isIllicit: + isMachineAssessed && machineCategory === "illicit" ? true : null, + isSpam: isMachineAssessed && machineCategory === "spam" ? 
true : null, + isLegitimate: null, + isUnsure: null, + isInfo: !caption && machineCategory === "info" ? true : null, + isSatire: null, + isHarmful: null, + isHarmless: null, + tags: {}, + primaryCategory: isMachineAssessed + ? machineCategory.split("_")[0] //in case of irrelevant_length, we want to store irrelevant + : null, + customReply: null, //string + instanceCount: 0, + rationalisation: rationalisation, + virtualTotalResults: validatedURLS + } + } catch (error) { + console.error('Error validating URLs:', error) + // You might want to handle the error, such as setting a default value + validatedURLS = null } } else { if (matchType === "image" && matchedInstanceSnap) { diff --git a/functions/src/types.ts b/functions/src/types.ts index 75617d28..6d998694 100644 --- a/functions/src/types.ts +++ b/functions/src/types.ts @@ -146,6 +146,7 @@ export type MessageData = { customReply: string | null instanceCount: number rationalisation: string | null // Assuming 'rationalisation' is a string; adjust as necessary if it's a different type. 
+  virtualTotalResults: any | null
 }
 
 export type InstanceData = {
diff --git a/functions/src/utils/utils.ts b/functions/src/utils/utils.ts
new file mode 100644
index 00000000..19fa33ec
--- /dev/null
+++ b/functions/src/utils/utils.ts
@@ -0,0 +1,135 @@
+const axios = require('axios');
+const { URLSearchParams } = require('url');
+
+/** Outcome of a single VirusTotal lookup performed by validateURLs. */
+interface URLValidationResult {
+  /** The URL that was looked up. */
+  url: string;
+  /** True when VirusTotal returned a report for the URL. */
+  success: boolean;
+  /** JSON-encoded `total_votes` of the report, or null when unavailable. */
+  data: string | null;
+  /** Description of the failure, or null on success. */
+  error?: string | null;
+}
+
+/** VirusTotal v3 endpoint that serves the analysis report of a URL. */
+const VIRUS_TOTAL_URL_REPORT_ENDPOINT = 'https://www.virustotal.com/api/v3/urls';
+
+/**
+ * Prefixes `https://` to a URL that does not start with a `scheme://` part.
+ *
+ * The scheme is detected with a pattern rather than `new URL()`: the URL
+ * parser accepts `example.com:8080` and reports `example.com:` as its
+ * protocol, so such input would wrongly be treated as already having a scheme.
+ *
+ * @param url - URL candidate, with or without a scheme.
+ * @returns The URL unchanged when it has a scheme, otherwise prefixed with `https://`.
+ */
+function addHttpsIfMissing(url: string): string {
+  const hasScheme = /^[a-z][a-z\d+.-]*:\/\//i.test(url);
+  return hasScheme ? url : `https://${url}`;
+}
+
+/**
+ * Builds the identifier VirusTotal expects in `/urls/{id}`: the URL encoded as
+ * base64url (`-` and `_` instead of `+` and `/`) with the `=` padding removed.
+ * Plain base64 can contain `/`, which would break the request path.
+ */
+function toVirusTotalUrlId(url: string): string {
+  return Buffer.from(url)
+    .toString('base64')
+    .replace(/\+/g, '-')
+    .replace(/\//g, '_')
+    .replace(/=+$/, '');
+}
+
+/**
+ * Converts whatever a failed request threw into a string that can be stored.
+ *
+ * Prefers the response body of an HTTP error; falls back to the error message
+ * for failures that carry no response (DNS failure, timeout, ...).
+ */
+function describeRequestError(error: unknown): string {
+  const body = (error as { response?: { data?: unknown } } | null | undefined)
+    ?.response?.data;
+  if (body !== undefined) {
+    return JSON.stringify(body);
+  }
+  return error instanceof Error ? error.message : String(error);
+}
+
+/**
+ * Fetches the VirusTotal report of one URL.
+ *
+ * @param url - Absolute URL to look up.
+ * @param apiKey - VirusTotal API key sent in the `x-apikey` header.
+ * @returns The lookup outcome; request failures are captured in the result instead of thrown.
+ */
+async function lookUpUrl(url: string, apiKey: string): Promise<URLValidationResult> {
+  const reportUrl = `${VIRUS_TOTAL_URL_REPORT_ENDPOINT}/${toVirusTotalUrlId(url)}`;
+  console.log(`Calling API ${reportUrl} to get scan results of ${url}`);
+  try {
+    const response = await axios.request({
+      method: 'GET',
+      url: reportUrl,
+      headers: {
+        accept: 'application/json',
+        'x-apikey': apiKey,
+      },
+    });
+    console.log(`Success calling ${reportUrl}`);
+    const votes = response?.data?.data?.attributes?.total_votes;
+    return {
+      url,
+      success: true,
+      data: votes === undefined ? null : JSON.stringify(votes),
+      error: null,
+    };
+  } catch (error) {
+    const description = describeRequestError(error);
+    console.error(`Error calling ${reportUrl}: ${description}`);
+    return { url, success: false, data: null, error: description };
+  }
+}
+
+/**
+ * Looks up every http(s) URL found in `text` on VirusTotal.
+ *
+ * The returned promise does not reject on lookup failures: each failure is
+ * reported in the result of the URL it belongs to, so one unreachable lookup
+ * does not discard the others.
+ *
+ * @param text - Message text to scan for URLs.
+ * @returns One result per distinct URL, in order of first appearance; an empty
+ *   array when the text contains no URL.
+ */
+export async function validateURLs(text: string): Promise<URLValidationResult[]> {
+  const urlRegex = /https?:\/\/[^\s/$.?#].[^\s]*/g;
+  // Guard against missing text: the image handler passes extracted text that
+  // may be absent at runtime even though the parameter is typed as string.
+  const matches = text ? text.match(urlRegex) : null;
+  if (!matches) {
+    return [];
+  }
+
+  // Repeated links are looked up only once.
+  const urls = Array.from(new Set(matches)).map((url) => addHttpsIfMissing(url));
+
+  const apiKey = process.env.VIRUS_TOTAL_API_KEY;
+  if (!apiKey) {
+    // Sending the literal string "undefined" as the key would only produce
+    // authentication errors, so report the misconfiguration without calling out.
+    console.error('VIRUS_TOTAL_API_KEY is not set; skipping VirusTotal lookups');
+    return urls.map((url) => ({
+      url,
+      success: false,
+      data: null,
+      error: 'VIRUS_TOTAL_API_KEY is not set',
+    }));
+  }
+
+  const results = await Promise.all(urls.map((url) => lookUpUrl(url, apiKey)));
+  console.log('All validate URLs requests completed. Results:', results);
+  return results;
+}
\ No newline at end of file