// Utils.js — from the GitHub repository Luigina2001/StartupScoutingAI (branch: main).

/**
 * Reads the Google Gemini API key stored under the 'GEMINI_API_KEY' script property.
 *
 * @return {string} The stored API key.
 * @throws {Error} CRITICAL: when the 'GEMINI_API_KEY' property is missing or empty.
 */
function getApiKey() {
    const scriptProperties = PropertiesService.getScriptProperties();
    const storedKey = scriptProperties.getProperty('GEMINI_API_KEY');

    // Happy path first: any non-empty value is handed back untouched.
    if (storedKey) {
        return storedKey;
    }

    throw new Error('CRITICAL: Missing GEMINI_API_KEY. Please set it in Settings > Script Properties.');
}

/**
 * Normalizes a URL to a canonical form for consistent comparison and deduplication.
 *
 * Transformations applied, in order:
 * 1. Converts the input to a string, trims whitespace and lowercases it.
 * 2. Removes a leading protocol (http:// or https://).
 * 3. Removes a leading 'www.' prefix.
 * 4. Removes every trailing slash (not just the last one).
 *
 * Any path, query string or fragment after the host is kept (lowercased).
 *
 * Example: "https://www.Google.com/" -> "google.com"
 * Example: "http://Example.com/Team//" -> "example.com/team"
 *
 * @param {string|Object} URL - The raw URL; non-string values are converted with toString().
 * @return {string} The normalized URL, or an empty string if the input is falsy.
 */
function normalize_URL(URL) {
    if (!URL) {
        return "";
    }

    // The value is already lowercased, so the prefix regex needs no case-insensitive flag.
    const stripped = URL.toString().trim().toLowerCase()
        .replace(/^(?:https?:\/\/)?(?:www\.)?/, "");

    // Walk back over the whole run of trailing slashes so "a.com/" and "a.com//" normalize alike.
    let end = stripped.length;
    while (end > 0 && stripped[end - 1] === "/") {
        end--;
    }

    return stripped.slice(0, end);
}

/**
 * Checks whether a domain answers over the network and returns the URL that worked.
 *
 * Steps:
 * 1. Lowercases/trims the input and strips any leading protocol and 'www.' prefix.
 * 2. Requests "https://<host>" (certificate validation disabled, redirects followed).
 * 3. If that request throws or its status is outside 200-399, requests "http://<host>".
 *
 * A request that throws is logged with console.warn and treated as a failed attempt.
 *
 * @param {string} domain - The website URL or domain string.
 * @return {string|null} "https://<host>" or "http://<host>" for the first attempt whose
 *     status is in the 200-399 range, otherwise null (also null for falsy input).
 */
function resolve_active_url(domain) {
    if (!domain) {
        return null;
    }

    const host = domain
        .toString()
        .trim()
        .toLowerCase()
        .replace(/^(?:https?:\/\/)?(?:www\.)?/i, "");

    // Any 2xx or 3xx status counts as "reachable".
    const isReachable = (status) => status >= 200 && status < 400;

    // First attempt: HTTPS, tolerating invalid or expired certificates.
    const secureUrl = "https://" + host;
    try {
        const secureStatus = UrlFetchApp.fetch(secureUrl, {
            muteHttpExceptions: true,
            validateHttpsCertificates: false,
            followRedirects: true
        }).getResponseCode();

        if (isReachable(secureStatus)) {
            return secureUrl;
        }
    } catch (httpsError) {
        console.warn(`[HTTPS FAILED] ${host}: ${httpsError.message}. Attempt with HTTP...`);
    }

    // Second attempt: plain HTTP as a fallback.
    const plainUrl = "http://" + host;
    try {
        const plainStatus = UrlFetchApp.fetch(plainUrl, {
            muteHttpExceptions: true,
            followRedirects: true
        }).getResponseCode();

        if (isReachable(plainStatus)) {
            console.warn(`[HTTP FALLBACK] Sito trovato solo su HTTP: ${host}`);
            return plainUrl;
        }
    } catch (httpError) {
        console.warn(`[DEAD LINK] ${host} unreachable even via HTTP: ${httpError.message}`);
    }

    // Neither scheme produced a 2xx/3xx answer.
    return null;
}

/**
 * Wrapper for the Google Gemini generateContent API.
 *
 * Uses the 'gemini-2.5-flash' model with a temperature of 0.5. The system and user
 * prompts are merged into a single text part ("<systemPrompt>\n\nTask:\n<userPrompt>").
 * The API key is sent in the 'x-goog-api-key' request header rather than the URL, so
 * it cannot end up in URLs echoed by error messages or logs.
 *
 * The returned text is the concatenation of all text parts of the first candidate,
 * with every "```json" / "```" fence marker removed and surrounding whitespace trimmed.
 *
 * Any error is logged with console.error and re-thrown unchanged.
 *
 * @param {string} systemPrompt - The role definition and high-level context for the AI.
 * @param {string} userPrompt - The specific task or query.
 * @param {boolean} [jsonMode=false] - If true, requests an 'application/json' response;
 *     otherwise 'text/plain'.
 * @return {string} The cleaned text content of the LLM's response.
 * @throws {Error} If the HTTP status is not 200, the body is not valid JSON, no
 *     candidates are returned, or the first candidate carries no text part.
 */
function call_LLM(systemPrompt, userPrompt, jsonMode=false) {
    const apiKey = getApiKey();
    const model = "gemini-2.5-flash";
    // No "?key=" query parameter: the key travels in a header (see options.headers below).
    const endpoint = `https://generativelanguage.googleapis.com/v1beta/models/${model}:generateContent`;

    const combinedPrompt = `${systemPrompt}\n\nTask:\n${userPrompt}`;

    const payload = {
        contents: [{
            parts: [{
                text: combinedPrompt
            }]
        }],
        generationConfig: {
            temperature: 0.5,
            responseMimeType: jsonMode ? "application/json" : "text/plain"
        }
    };

    const options = {
        method: "post",
        contentType: "application/json",
        headers: {
            "x-goog-api-key": apiKey
        },
        payload: JSON.stringify(payload),
        // Non-2xx statuses are returned instead of thrown so the body can be reported below.
        muteHttpExceptions: true
    };

    try {
        const response = UrlFetchApp.fetch(endpoint, options);
        const responseCode = response.getResponseCode();
        const responseBody = response.getContentText();

        if (responseCode !== 200) {
            console.error(`[GEMINI ERROR] Status: ${responseCode} | Body: ${responseBody}`);
            throw new Error(`Gemini call error: ${responseCode} - ${responseBody}`);
        }

        const json = JSON.parse(responseBody);

        if (!json.candidates || json.candidates.length === 0) {
            // When present, the prompt-level block reason explains why nothing was generated.
            const blockReason = json.promptFeedback && json.promptFeedback.blockReason;
            throw new Error(
                "Gemini did not return any candidates." +
                (blockReason ? ` Block reason: ${blockReason}.` : "")
            );
        }

        // A candidate may arrive without content/parts; guard instead of failing with a
        // TypeError, and join every text part instead of reading only the first.
        const candidate = json.candidates[0];
        const parts = candidate && candidate.content && Array.isArray(candidate.content.parts)
            ? candidate.content.parts
            : [];
        const textParts = parts.filter((part) => part && typeof part.text === "string");

        if (textParts.length === 0) {
            const finishReason = (candidate && candidate.finishReason) || "unknown";
            throw new Error(`Gemini returned a candidate without text content (finishReason: ${finishReason}).`);
        }

        const contentText = textParts.map((part) => part.text).join("");
        const cleanText = contentText.replace(/```json/g, "").replace(/```/g, "").trim();

        return cleanText;

    } catch (e) {
        console.error("[GEMINI EXCEPTION] " + e.toString());
        throw e;
    }
}


/**
 * Fetches a page and reduces its HTML to plain text plus a few extracted links.
 *
 * What it does:
 * - Downloads the HTML (redirects followed, certificate validation disabled).
 * - Scans every <a href="..."> and remembers the first href that mentions
 *   portfolio/companies/ventures/startups/investments and does not mention
 *   login/signin/policy. Absolute ("http..."), protocol-relative ("//host/...") and
 *   root-relative ("/path") hrefs are resolved to an absolute URL; other relative
 *   hrefs are skipped.
 * - Collects hrefs that start with "http" and do not contain the page's host, each
 *   formatted as "[index] url"; at most the first 300 are returned.
 * - Strips <script>, <style>, HTML comments and all remaining tags, collapses
 *   whitespace, and truncates the text to 30,000 characters.
 *
 * @param {string} url - The target website URL (expected to include the scheme).
 * @return {Object|null} {text, portfolioUrl, extractedLinks} where portfolioUrl is a
 *     string or null and extractedLinks is a newline-joined string; null when the
 *     status is not 200 or the fetch/processing throws (a warning is logged).
 */
function fetchWebsiteContent(url) {
    try {
        const response = UrlFetchApp.fetch(url, {
            muteHttpExceptions: true,
            followRedirects: true,
            validateHttpsCertificates: false
        });

        if (response.getResponseCode() !== 200) {
            return null;
        }

        let html = response.getContentText();

        // Loop-invariant pieces of the page URL, e.g. "https:", "site.com", "https://site.com".
        const urlParts = url.split('/');
        const pageHost = urlParts[2];
        const baseUrl = urlParts.slice(0, 3).join('/');
        // Scheme used to complete protocol-relative hrefs; default to https when the page URL has none.
        const pageScheme = /^https?:$/i.test(urlParts[0]) ? urlParts[0] : "https:";

        // Matches the href value of every anchor tag; keyword filtering happens in the loop.
        const linkRegex = /<a\s+(?:[^>]*?\s+)?href=(["'])(.*?)\1/gi;
        const portfolioKeywords = /portfolio|companies|ventures|startups|investments/i;
        const excludedKeywords = /login|signin|policy/i;

        let match;
        let portfolioLink = null;

        const foundLinks = [];
        let linkIndex = 0;

        while ((match = linkRegex.exec(html)) !== null) {
            const rawLink = match[2];

            if (!portfolioLink && portfolioKeywords.test(rawLink) && !excludedKeywords.test(rawLink)) {
                if (rawLink.startsWith("http")) {
                    portfolioLink = rawLink;
                } else if (rawLink.startsWith("//")) {
                    // Protocol-relative href: it names its own host, so only the scheme is missing.
                    portfolioLink = pageScheme + rawLink;
                } else if (rawLink.startsWith("/")) {
                    portfolioLink = baseUrl + rawLink;
                }
            }

            // "External" means an absolute link whose text does not contain this page's host.
            if (rawLink.startsWith("http") && !rawLink.includes(pageHost)) {
                foundLinks.push(`[${linkIndex}] ${rawLink}`);
                linkIndex++;
            }
        }

        // HTML cleanup to save tokens: remove scripts, styles and comments
        html = html.replace(/<script[^>]*>([\s\S]*?)<\/script>/gi, "");  // scripts
        html = html.replace(/<style[^>]*>([\s\S]*?)<\/style>/gi, "");  // styles
        html = html.replace(/<!--[\s\S]*?-->/g, "");  // comments

        // Drop the remaining tags, collapse whitespace, then cap the length.
        const text = html
            .replace(/<[^>]+>/g, " ")
            .replace(/\s+/g, " ")
            .trim()
            .slice(0, 30000);

        return {
            text: text,
            portfolioUrl: portfolioLink,
            extractedLinks: foundLinks.slice(0, 300).join("\n")
        };

    } catch (e) {
        console.warn(`[SCRAPE ERROR] Impossibile leggere ${url}: ${e.message}`);
        return null;
    }
}