
Commit 430ce99

feat: add simple metadata fetcher as fallback for Puppeteer with improved error handling
1 parent 78018af commit 430ce99

File tree

5 files changed: +469 -31 lines changed

.env.example

Lines changed: 3 additions & 2 deletions

@@ -117,8 +117,9 @@ APP_PORT=3000
 IMGPROXY_ENABLE_WEBP_DETECTION=true
 
 # Add your OpenAI API key to enable SQL Editor Assistant and AI features
-OPENAI_API_KEY=
-
+# OpenAI Configuration
+OPENAI_API_KEY=your-openai-api-key
+OPENAI_MODEL=gpt-4-turbo-preview
 
 ############
 # Functions - Configuration for Functions
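
The new OPENAI_MODEL variable pairs with the existing OPENAI_API_KEY. A minimal sketch of how server-side code might consume these values, assuming they are read via process.env; the helper name and the default fallback are illustrative and not part of this commit:

// openai-config.js (hypothetical helper, not in this commit)
// Reads the OpenAI settings introduced in .env.example.
export function getOpenAIConfig() {
  const apiKey = process.env.OPENAI_API_KEY;
  if (!apiKey) {
    throw new Error('OPENAI_API_KEY is not set');
  }
  return {
    apiKey,
    // Falls back to the model suggested in .env.example when OPENAI_MODEL is unset.
    model: process.env.OPENAI_MODEL || 'gpt-4-turbo-preview'
  };
}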

src/lib/config/supabase.js

Lines changed: 10 additions & 3 deletions

@@ -9,9 +9,16 @@ let SUPABASE_URL, SUPABASE_ANON_KEY;
 
 if (browser) {
   // Client-side: use SvelteKit's public env vars
-  const { PUBLIC_SUPABASE_URL, PUBLIC_SUPABASE_ANON_KEY } = await import('$env/static/public');
-  SUPABASE_URL = PUBLIC_SUPABASE_URL;
-  SUPABASE_ANON_KEY = PUBLIC_SUPABASE_ANON_KEY;
+  try {
+    const { PUBLIC_SUPABASE_URL, PUBLIC_SUPABASE_ANON_KEY } = await import('$env/static/public');
+    SUPABASE_URL = PUBLIC_SUPABASE_URL;
+    SUPABASE_ANON_KEY = PUBLIC_SUPABASE_ANON_KEY;
+  } catch (error) {
+    console.error('Failed to import public environment variables:', error);
+    // Fallback to process.env even on client side
+    SUPABASE_URL = process.env.PUBLIC_SUPABASE_URL;
+    SUPABASE_ANON_KEY = process.env.PUBLIC_SUPABASE_ANON_KEY;
+  }
 } else {
   // Server-side: use process.env directly with Docker environment variables
   SUPABASE_URL = process.env.PUBLIC_SUPABASE_URL || process.env.SUPABASE_URL;
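
The resolved URL and key are presumably handed to the Supabase client further down in this file, which the hunk does not show. A minimal sketch of that consumption, assuming the standard createClient call from @supabase/supabase-js; the guard and log message are illustrative:

// Hypothetical continuation of src/lib/config/supabase.js (not shown in this diff).
import { createClient } from '@supabase/supabase-js';

if (!SUPABASE_URL || !SUPABASE_ANON_KEY) {
  // Surfacing a clear error here avoids opaque failures deeper in the app.
  console.error('Missing Supabase configuration: check PUBLIC_SUPABASE_URL and PUBLIC_SUPABASE_ANON_KEY');
}

export const supabase = createClient(SUPABASE_URL, SUPABASE_ANON_KEY);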
Lines changed: 321 additions & 0 deletions

@@ -0,0 +1,321 @@
/**
 * Simple Metadata Fetcher
 * Lightweight fallback for when Puppeteer fails
 * Uses simple HTTP requests and Cheerio for HTML parsing
 */

import { load } from 'cheerio';
import { URL } from 'url';

export class SimpleMetadataFetcher {
  constructor(options = {}) {
    this.timeout = options.timeout || 10000;
    this.userAgent = options.userAgent || 'ADLP-Bot/1.0 (+https://adlp.dev/bot)';
    this.maxRedirects = options.maxRedirects || 5;
    this.maxContentLength = options.maxContentLength || 5 * 1024 * 1024; // 5MB
  }

  /**
   * Validates a URL for security and format
   * @param {string} url - The URL to validate
   * @throws {Error} If URL is invalid or not allowed
   */
  validateUrl(url) {
    if (!url || typeof url !== 'string') {
      throw new Error('URL is required');
    }

    let parsedUrl;
    try {
      parsedUrl = new URL(url);
    } catch (error) {
      throw new Error('Invalid URL format');
    }

    // Only allow HTTP and HTTPS
    if (!['http:', 'https:'].includes(parsedUrl.protocol)) {
      throw new Error('Only HTTP and HTTPS URLs are supported');
    }

    // Security checks for production
    if (process.env.NODE_ENV === 'production') {
      const hostname = parsedUrl.hostname.toLowerCase();

      // Block localhost
      if (hostname === 'localhost' || hostname === '127.0.0.1' || hostname === '::1') {
        throw new Error('Localhost URLs are not allowed');
      }

      // Block private IP ranges
      if (this.isPrivateIP(hostname)) {
        throw new Error('Private IP addresses are not allowed');
      }
    }
  }

  /**
   * Checks if a hostname is a private IP address
   * @param {string} hostname - The hostname to check
   * @returns {boolean} True if it's a private IP
   */
  isPrivateIP(hostname) {
    const ipv4Regex = /^(\d{1,3})\.(\d{1,3})\.(\d{1,3})\.(\d{1,3})$/;
    const match = hostname.match(ipv4Regex);

    if (!match) return false;

    const [, a, b, c, d] = match.map(Number);

    // Check for private IP ranges
    return (
      (a === 10) ||
      (a === 172 && b >= 16 && b <= 31) ||
      (a === 192 && b === 168) ||
      (a === 169 && b === 254) // Link-local
    );
  }

  /**
   * Fetches HTML content from a URL
   * @param {string} url - The URL to fetch
   * @returns {Promise<string>} The HTML content
   */
  async fetchHtml(url) {
    const controller = new AbortController();
    const timeoutId = setTimeout(() => controller.abort(), this.timeout);

    try {
      const response = await fetch(url, {
        method: 'GET',
        headers: {
          'User-Agent': this.userAgent,
          'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
          'Accept-Language': 'en-US,en;q=0.5',
          'Accept-Encoding': 'gzip, deflate',
          'DNT': '1',
          'Connection': 'keep-alive',
          'Upgrade-Insecure-Requests': '1'
        },
        signal: controller.signal,
        redirect: 'follow'
      });

      if (!response.ok) {
        throw new Error(`HTTP ${response.status}: ${response.statusText}`);
      }

      // Check content type
      const contentType = response.headers.get('content-type') || '';
      if (!contentType.includes('text/html')) {
        throw new Error('Response is not HTML content');
      }

      // Check content length
      const contentLength = response.headers.get('content-length');
      if (contentLength && parseInt(contentLength) > this.maxContentLength) {
        throw new Error('Content too large');
      }

      const html = await response.text();
      return html;
    } catch (error) {
      if (error.name === 'AbortError') {
        throw new Error('Request timeout');
      }
      throw error;
    } finally {
      clearTimeout(timeoutId);
    }
  }

  /**
   * Extracts metadata from HTML using Cheerio
   * @param {string} html - The HTML content
   * @param {string} url - The original URL
   * @returns {Object} The extracted metadata
   */
  extractMetadata(html, url) {
    const $ = load(html);

    // Helper functions
    const getMetaContent = (name) => {
      return $(`meta[name="${name}"]`).attr('content') || null;
    };

    const getMetaProperty = (property) => {
      return $(`meta[property="${property}"]`).attr('content') || null;
    };

    const resolveUrl = (relativeUrl) => {
      if (!relativeUrl) return null;
      try {
        return new URL(relativeUrl, url).href;
      } catch {
        return null;
      }
    };

    // Extract basic metadata
    const title = $('title').text().trim() ||
      getMetaProperty('og:title') ||
      getMetaContent('twitter:title') ||
      'Untitled';

    const description = getMetaContent('description') ||
      getMetaProperty('og:description') ||
      getMetaContent('twitter:description') ||
      '';

    // Extract Open Graph metadata
    const openGraph = {};
    $('meta[property^="og:"]').each((_, element) => {
      const property = $(element).attr('property');
      const content = $(element).attr('content');
      if (property && content) {
        const key = property.replace('og:', '');
        openGraph[key] = content;
      }
    });

    // Extract Twitter Card metadata
    const twitter = {};
    $('meta[name^="twitter:"]').each((_, element) => {
      const name = $(element).attr('name');
      const content = $(element).attr('content');
      if (name && content) {
        const key = name.replace('twitter:', '').replace(':', '_');
        twitter[key] = content;
      }
    });

    // Extract images with priorities
    const images = {
      primary: null,
      sources: []
    };

    const imageSelectors = [
      { selector: 'meta[property="og:image"]', type: 'og:image', priority: 10 },
      { selector: 'meta[name="twitter:image"]', type: 'twitter:image', priority: 9 },
      { selector: 'meta[name="twitter:image:src"]', type: 'twitter:image:src', priority: 8 },
      { selector: 'link[rel="image_src"]', type: 'image_src', priority: 7 }
    ];

    imageSelectors.forEach(({ selector, type, priority }) => {
      $(selector).each((_, element) => {
        const imageUrl = $(element).attr('content') || $(element).attr('href');
        if (imageUrl) {
          const resolvedUrl = resolveUrl(imageUrl);
          if (resolvedUrl) {
            images.sources.push({
              url: resolvedUrl,
              type,
              priority,
              width: parseInt($(element).attr('width')) || null,
              height: parseInt($(element).attr('height')) || null
            });
          }
        }
      });
    });

    // Sort by priority and set primary
    images.sources.sort((a, b) => b.priority - a.priority);
    if (images.sources.length > 0) {
      images.primary = images.sources[0].url;
    }

    // Extract favicons
    const favicons = [];
    const faviconSelectors = [
      'link[rel="icon"]',
      'link[rel="shortcut icon"]',
      'link[rel="apple-touch-icon"]',
      'link[rel="apple-touch-icon-precomposed"]'
    ];

    faviconSelectors.forEach(selector => {
      $(selector).each((_, element) => {
        const href = $(element).attr('href');
        if (href) {
          const resolvedUrl = resolveUrl(href);
          if (resolvedUrl) {
            favicons.push({
              url: resolvedUrl,
              type: $(element).attr('rel'),
              sizes: $(element).attr('sizes'),
              mimeType: $(element).attr('type')
            });
          }
        }
      });
    });

    // Extract JSON-LD structured data
    const jsonLd = [];
    $('script[type="application/ld+json"]').each((_, element) => {
      try {
        const data = JSON.parse($(element).html());
        jsonLd.push(data);
      } catch (error) {
        // Ignore malformed JSON-LD
      }
    });

    // Detect content type
    let contentType = 'website';
    if (openGraph.type) {
      contentType = openGraph.type;
    } else if ($('video').length > 0) {
      contentType = 'video';
    } else if ($('article').length > 0) {
      contentType = 'article';
    }

    return {
      url,
      title,
      description,
      contentType,
      images,
      favicons,
      openGraph,
      twitter,
      structuredData: {
        jsonLd,
        microdata: [] // Could be enhanced to extract microdata
      },
      // Additional metadata
      loadTime: Date.now(),
      hasJavaScript: false, // This fetcher doesn't execute JS
      fetchMethod: 'simple-http'
    };
  }

  /**
   * Fetches metadata from a URL
   * @param {string} url - The URL to fetch metadata from
   * @returns {Promise<Object>} The extracted metadata
   */
  async fetchMetadata(url) {
    this.validateUrl(url);

    try {
      const html = await this.fetchHtml(url);
      const metadata = this.extractMetadata(html, url);
      return metadata;
    } catch (error) {
      throw new Error(`Failed to fetch metadata: ${error.message}`);
    }
  }

  /**
   * Cleanup method (no-op for simple fetcher)
   */
  async cleanup() {
    // No cleanup needed for simple HTTP fetcher
  }
}

// Export a default instance
export const simpleMetadataFetcher = new SimpleMetadataFetcher();
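
A minimal sketch of how this class might be wired in as the Puppeteer fallback the commit message describes, assuming a Puppeteer-based fetcher elsewhere in the codebase exposes a compatible fetchMetadata(url) method; the puppeteerFetcher name and both import paths are hypothetical:

// Hypothetical caller, not part of this commit.
import { simpleMetadataFetcher } from './simple-metadata-fetcher.js';        // assumed path for the new module
import { puppeteerFetcher } from './puppeteer-metadata-fetcher.js';          // hypothetical Puppeteer-based fetcher

export async function fetchMetadataWithFallback(url) {
  try {
    // Prefer the headless browser so JS-rendered pages are covered.
    return await puppeteerFetcher.fetchMetadata(url);
  } catch (error) {
    console.warn(`Puppeteer fetch failed, falling back to simple HTTP: ${error.message}`);
    // The fetchMethod field ('simple-http') in the result records which path produced it.
    return await simpleMetadataFetcher.fetchMetadata(url);
  }
}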
