Skip to content

Commit 3b37d1d

Browse files
authored
Merge pull request #56 from TBosak/suggestion-engine
Updated suggestion engine heuristics; added a loading indicator, deduplication in strict mode, etc.
2 parents 2eeb8fe + 9d08924 commit 3b37d1d

File tree

6 files changed

+1806
-379
lines changed

6 files changed

+1806
-379
lines changed

frontend/src/components/forms/WebScrapingForm.tsx

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import { useState } from "react";
12
import {
23
UseFormRegister,
34
Control,
@@ -30,6 +31,7 @@ import {
3031
import { Info, Sparkles, Link, Wand2 } from "lucide-react";
3132
import { SelectorField } from "./SelectorField";
3233
import { SelectorPlayground } from "./SelectorPlayground";
34+
import { LoadingSpinner } from "@/components/ui/loading-spinner";
3335

3436
interface WebScrapingFormProps {
3537
register: UseFormRegister<FeedFormData>;
@@ -47,13 +49,15 @@ export const WebScrapingForm = ({
4749
feedUrl,
4850
}: WebScrapingFormProps) => {
4951
const dateFormat = watch("dateFormat");
52+
const [isLoadingSelectors, setIsLoadingSelectors] = useState(false);
5053

5154
const handleAutoFillSelectors = async () => {
5255
if (!feedUrl) {
5356
alert("Please enter a target URL first.");
5457
return;
5558
}
5659

60+
setIsLoadingSelectors(true);
5761
try {
5862
const response = await fetch("/utils/suggest-selectors", {
5963
method: "POST",
@@ -89,11 +93,19 @@ export const WebScrapingForm = ({
8993
} catch (error) {
9094
console.error("Error:", error);
9195
alert("An error occurred while auto-filling selectors.");
96+
} finally {
97+
setIsLoadingSelectors(false);
9298
}
9399
};
94100

95101
return (
96102
<>
103+
{isLoadingSelectors && (
104+
<LoadingSpinner
105+
fullscreen
106+
message="Analyzing page structure and suggesting selectors..."
107+
/>
108+
)}
97109
<SelectorPlayground
98110
feedUrl={feedUrl}
99111
setValue={setValue}
@@ -116,9 +128,10 @@ export const WebScrapingForm = ({
116128
variant="outline"
117129
onClick={handleAutoFillSelectors}
118130
className="w-full"
131+
disabled={isLoadingSelectors}
119132
>
120133
<Sparkles className="mr-2 h-4 w-4" />
121-
Suggest Selectors
134+
{isLoadingSelectors ? "Analyzing..." : "Suggest Selectors"}
122135
</Button>
123136
</div>
124137

index.ts

Lines changed: 40 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -1200,40 +1200,34 @@ async function determineIsRelativeAndBaseUrl(
12001200
userBaseUrl: string | undefined,
12011201
feedUrl: string | undefined,
12021202
): Promise<{ isRelative: boolean; baseUrl: string | undefined }> {
1203+
// If user explicitly set both isRelative and baseUrl, use those
12031204
if (typeof userIsRelative === "boolean" && userBaseUrl) {
12041205
return { isRelative: userIsRelative, baseUrl: userBaseUrl };
12051206
}
1207+
1208+
// If user explicitly set isRelative
12061209
if (typeof userIsRelative === "boolean") {
12071210
if (userIsRelative && !userBaseUrl && feedUrl) {
12081211
return { isRelative: true, baseUrl: feedUrl };
12091212
}
12101213
return { isRelative: userIsRelative, baseUrl: userBaseUrl };
12111214
}
1215+
1216+
// If user provided baseUrl but not isRelative, detect from URL format
12121217
if (userBaseUrl) {
1213-
if (isLikelyAbsoluteUrl(url)) {
1214-
return { isRelative: false, baseUrl: userBaseUrl };
1215-
} else {
1216-
return { isRelative: true, baseUrl: userBaseUrl };
1217-
}
1218+
const isAbs = isLikelyAbsoluteUrl(url);
1219+
return { isRelative: !isAbs, baseUrl: userBaseUrl };
12181220
}
1219-
if (isLikelyAbsoluteUrl(url)) {
1220-
try {
1221-
const resp = await axios.head(url, {
1222-
maxRedirects: 2,
1223-
validateStatus: () => true,
1224-
});
1225-
if (resp.status >= 200 && resp.status < 600) {
1226-
return { isRelative: false, baseUrl: undefined };
1227-
}
1228-
} catch {
1229-
if (feedUrl) return { isRelative: true, baseUrl: feedUrl };
1230-
return { isRelative: true, baseUrl: undefined };
1231-
}
1232-
if (feedUrl) return { isRelative: true, baseUrl: feedUrl };
1233-
return { isRelative: true, baseUrl: undefined };
1221+
1222+
// Auto-detect based on URL format
1223+
const isAbsolute = isLikelyAbsoluteUrl(url);
1224+
1225+
if (isAbsolute) {
1226+
return { isRelative: false, baseUrl: undefined };
1227+
} else {
1228+
// Relative URL - use feedUrl as base if available
1229+
return { isRelative: true, baseUrl: feedUrl };
12341230
}
1235-
if (feedUrl) return { isRelative: true, baseUrl: feedUrl };
1236-
return { isRelative: true, baseUrl: undefined };
12371231
}
12381232

12391233
function extractSampleUrlFromHtml(
@@ -1242,12 +1236,27 @@ function extractSampleUrlFromHtml(
12421236
attribute?: string,
12431237
): string {
12441238
const $ = cheerio.load(html);
1245-
const el = $(selector).first();
1246-
if (!el.length) return "";
1247-
if (attribute) {
1248-
return el.attr(attribute) || "";
1239+
const elements = $(selector).slice(0, 5); // Check first 5 elements
1240+
1241+
if (elements.length === 0) return "";
1242+
1243+
// Try to find a non-empty URL from the sample
1244+
for (let i = 0; i < elements.length; i++) {
1245+
const el = elements.eq(i);
1246+
let url = "";
1247+
if (attribute) {
1248+
url = el.attr(attribute) || "";
1249+
} else {
1250+
url = el.attr("href") || el.attr("src") || "";
1251+
}
1252+
1253+
// Return first non-empty URL found
1254+
if (url && url.trim()) {
1255+
return url.trim();
1256+
}
12491257
}
1250-
return el.attr("href") || el.attr("src") || "";
1258+
1259+
return "";
12511260
}
12521261

12531262
async function buildCSSTarget(
@@ -1287,6 +1296,7 @@ async function buildCSSTarget(
12871296
);
12881297
isRelative = result.isRelative;
12891298
baseUrl = result.baseUrl;
1299+
console.log(`[Preview ${prefix}] Sample URL: "${urlSample}" → isRelative: ${isRelative}, baseUrl: ${baseUrl}`);
12901300
}
12911301

12921302
// Extract drill chain data directly if it was pre-processed into an array of objects
@@ -1564,9 +1574,12 @@ async function generatePreview(feedConfig: any) {
15641574
},
15651575
maxContentLength: 2 * 1024 * 1024,
15661576
maxBodyLength: 2 * 1024 * 1024,
1577+
timeout: 30000, // 30-second timeout (value is in milliseconds)
15671578
});
1579+
console.log("[Preview] Page fetched, building RSS...");
15681580
const html = response.data;
15691581
rssXml = await buildRSS(html, feedConfig);
1582+
console.log("[Preview] RSS build complete");
15701583
}
15711584
} else if (feedConfig.feedType === "api") {
15721585
// feedConfig.config contains API call details (baseUrl, route, method, params, apiSpecificHeaders, apiSpecificBody)

package.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
"name": "mkfd",
33
"module": "index.ts",
44
"type": "module",
5-
"version": "2.0.6",
5+
"version": "2.0.7",
66
"scripts": {
77
"build": "cd frontend && bun run build",
88
"start": "bun run index.ts",

0 commit comments

Comments
 (0)