1+ /**
2+ * Simple Metadata Fetcher
3+ * Lightweight fallback for when Puppeteer fails
4+ * Uses simple HTTP requests and Cheerio for HTML parsing
5+ */
6+
7+ import { load } from 'cheerio' ;
8+ import { URL } from 'url' ;
9+
/** HTTP status codes whose `Location` header points at the next hop. */
const REDIRECT_STATUS_CODES = new Set([301, 302, 303, 307, 308]);

/** Dotted-quad IPv4 literal, one capture group per octet. */
const IPV4_PATTERN = /^(\d{1,3})\.(\d{1,3})\.(\d{1,3})\.(\d{1,3})$/;

/** IPv4-mapped IPv6 literal in the form `URL.hostname` serializes it (e.g. `::ffff:7f00:1`). */
const IPV4_MAPPED_PATTERN = /^::ffff:([0-9a-f]{1,4}):([0-9a-f]{1,4})$/;

/**
 * Brings a hostname into a canonical form for address checks: lower-cased,
 * without the `[...]` that `URL.hostname` puts around IPv6 literals, without a
 * trailing root dot, and with IPv4-mapped IPv6 rewritten as dotted-quad IPv4.
 * @param {string} hostname - Hostname as given by the caller or `URL.hostname`
 * @returns {string} The canonical host string
 */
function normalizeHost(hostname) {
  let host = String(hostname).toLowerCase();
  if (host.startsWith('[') && host.endsWith(']')) {
    host = host.slice(1, -1);
  }
  if (host.endsWith('.')) {
    host = host.slice(0, -1);
  }

  const mapped = host.match(IPV4_MAPPED_PATTERN);
  if (mapped) {
    const high = Number.parseInt(mapped[1], 16);
    const low = Number.parseInt(mapped[2], 16);
    return [high >> 8, high & 0xff, low >> 8, low & 0xff].join('.');
  }
  return host;
}

/**
 * Tells whether a canonical host (see normalizeHost) refers to the local machine.
 * @param {string} host - Canonical host string
 * @returns {boolean} True for localhost names, 127.0.0.0/8, 0.0.0.0/8, `::1` and `::`
 */
function isLoopbackHost(host) {
  if (host === 'localhost' || host.endsWith('.localhost')) return true;
  if (host === '::1' || host === '::') return true;

  const match = host.match(IPV4_PATTERN);
  if (!match) return false;
  const firstOctet = Number(match[1]);
  return firstOctet === 127 || firstOctet === 0;
}

/**
 * Reads a response body as UTF-8 text, aborting as soon as more than
 * `maxBytes` bytes have been received.
 * @param {Response} response - The fetch response to drain
 * @param {number} maxBytes - Upper bound on the body size in bytes
 * @returns {Promise<string>} The decoded body
 * @throws {Error} 'Content too large' when the body exceeds `maxBytes`
 */
async function readBodyWithLimit(response, maxBytes) {
  if (!response.body) {
    return response.text();
  }

  const reader = response.body.getReader();
  const decoder = new TextDecoder('utf-8');
  let received = 0;
  let text = '';

  for (;;) {
    const { done, value } = await reader.read();
    if (done) break;

    received += value.byteLength;
    if (received > maxBytes) {
      await reader.cancel();
      throw new Error('Content too large');
    }
    // `stream: true` keeps multi-byte sequences split across chunks intact.
    text += decoder.decode(value, { stream: true });
  }

  return text + decoder.decode();
}

/**
 * Fetches a page over plain HTTP(S) and extracts its metadata with Cheerio.
 * No JavaScript is executed on the fetched page.
 */
export class SimpleMetadataFetcher {
  /**
   * @param {Object} [options]
   * @param {number} [options.timeout=10000] - Overall request timeout in milliseconds
   * @param {string} [options.userAgent] - Value for the User-Agent header
   * @param {number} [options.maxRedirects=5] - Maximum redirects to follow (0 disables following)
   * @param {number} [options.maxContentLength=5242880] - Maximum body size in bytes
   */
  constructor(options = {}) {
    this.timeout = options.timeout || 10000;
    this.userAgent = options.userAgent || 'ADLP-Bot/1.0 (+https://adlp.dev/bot)';
    // 0 is a meaningful value here, so do not fall back on falsy input.
    this.maxRedirects =
      Number.isInteger(options.maxRedirects) && options.maxRedirects >= 0
        ? options.maxRedirects
        : 5;
    this.maxContentLength = options.maxContentLength || 5 * 1024 * 1024; // 5MB
  }

  /**
   * Validates a URL for security and format.
   * The localhost / private-address checks only run when
   * `process.env.NODE_ENV === 'production'`. They inspect the literal host
   * only; a DNS name that resolves to an internal address is not detected.
   * @param {string} url - The URL to validate
   * @throws {Error} If URL is invalid or not allowed
   */
  validateUrl(url) {
    if (!url || typeof url !== 'string') {
      throw new Error('URL is required');
    }

    let parsedUrl;
    try {
      parsedUrl = new URL(url);
    } catch (error) {
      throw new Error('Invalid URL format');
    }

    // Only allow HTTP and HTTPS
    if (!['http:', 'https:'].includes(parsedUrl.protocol)) {
      throw new Error('Only HTTP and HTTPS URLs are supported');
    }

    // Security checks for production
    if (process.env.NODE_ENV === 'production') {
      // URL.hostname wraps IPv6 literals in brackets; compare the bare address.
      const host = normalizeHost(parsedUrl.hostname);

      if (isLoopbackHost(host)) {
        throw new Error('Localhost URLs are not allowed');
      }

      if (this.isPrivateIP(host)) {
        throw new Error('Private IP addresses are not allowed');
      }
    }
  }

  /**
   * Checks if a hostname is a private or link-local IP address.
   * Covers IPv4 10/8, 172.16/12, 192.168/16, 169.254/16 and IPv6
   * fc00::/7 (unique local) and fe80::/10 (link-local). IPv4-mapped IPv6
   * literals are checked as their embedded IPv4 address.
   * @param {string} hostname - The hostname to check
   * @returns {boolean} True if it's a private IP
   */
  isPrivateIP(hostname) {
    const host = normalizeHost(hostname);

    const match = host.match(IPV4_PATTERN);
    if (match) {
      const [a, b] = match.slice(1).map(Number);
      return (
        a === 10 ||
        (a === 172 && b >= 16 && b <= 31) ||
        (a === 192 && b === 168) ||
        (a === 169 && b === 254) // Link-local
      );
    }

    if (host.includes(':')) {
      return /^f[cd][0-9a-f]{2}:/.test(host) || /^fe[89ab][0-9a-f]:/.test(host);
    }

    return false;
  }

  /**
   * Fetches HTML content from a URL.
   * Redirects are followed manually so that every hop is re-validated with
   * validateUrl and the `maxRedirects` limit is enforced. The body is read
   * incrementally and rejected once it exceeds `maxContentLength`.
   * @param {string} url - The URL to fetch
   * @returns {Promise<string>} The HTML content
   * @throws {Error} On timeout, non-2xx status, too many redirects, a
   *   disallowed redirect target, non-HTML content or an oversized body
   */
  async fetchHtml(url) {
    const controller = new AbortController();
    const timeoutId = setTimeout(() => controller.abort(), this.timeout);

    try {
      let currentUrl = url;
      let redirectCount = 0;
      let response;

      for (;;) {
        response = await fetch(currentUrl, {
          method: 'GET',
          headers: {
            'User-Agent': this.userAgent,
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate',
            'DNT': '1',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1'
          },
          signal: controller.signal,
          redirect: 'manual'
        });

        const location = response.headers.get('location');
        if (!REDIRECT_STATUS_CODES.has(response.status) || !location) {
          break;
        }

        if (redirectCount >= this.maxRedirects) {
          throw new Error('Too many redirects');
        }
        redirectCount += 1;

        // Release the connection held by the redirect response.
        if (response.body) {
          await response.body.cancel();
        }

        let nextUrl;
        try {
          nextUrl = new URL(location, currentUrl).href;
        } catch {
          throw new Error('Invalid redirect location');
        }
        // A public URL must not be able to bounce the request to an internal host.
        this.validateUrl(nextUrl);
        currentUrl = nextUrl;
      }

      if (!response.ok) {
        throw new Error(`HTTP ${response.status}: ${response.statusText}`);
      }

      // Check content type (media types are case-insensitive)
      const contentType = (response.headers.get('content-type') || '').toLowerCase();
      if (!contentType.includes('text/html')) {
        throw new Error('Response is not HTML content');
      }

      // Fast rejection when the server declares an oversized body
      const contentLength = response.headers.get('content-length');
      if (contentLength && Number.parseInt(contentLength, 10) > this.maxContentLength) {
        throw new Error('Content too large');
      }

      // The header may be absent or wrong, so the limit is enforced while reading too.
      return await readBodyWithLimit(response, this.maxContentLength);
    } catch (error) {
      if (error.name === 'AbortError') {
        throw new Error('Request timeout');
      }
      throw error;
    } finally {
      clearTimeout(timeoutId);
    }
  }

  /**
   * Extracts metadata from HTML using Cheerio.
   * @param {string} html - The HTML content
   * @param {string} url - The original URL, used as base for relative links
   * @returns {Object} The extracted metadata: title, description, contentType,
   *   images, favicons, openGraph, twitter and structuredData, plus
   *   `loadTime` (the `Date.now()` timestamp of extraction), `hasJavaScript`
   *   and `fetchMethod`
   */
  extractMetadata(html, url) {
    const $ = load(html);

    // Helper functions
    const getMetaContent = (name) => $(`meta[name="${name}"]`).attr('content') || null;

    const getMetaProperty = (property) =>
      $(`meta[property="${property}"]`).attr('content') || null;

    const resolveUrl = (relativeUrl) => {
      if (!relativeUrl) return null;
      try {
        return new URL(relativeUrl, url).href;
      } catch {
        return null;
      }
    };

    // Extract basic metadata.
    // Only the first <title> is used: inline SVGs may carry their own <title>
    // elements, and .text() on the full match would concatenate all of them.
    const title =
      $('title').first().text().trim() ||
      getMetaProperty('og:title') ||
      getMetaContent('twitter:title') ||
      'Untitled';

    const description =
      getMetaContent('description') ||
      getMetaProperty('og:description') ||
      getMetaContent('twitter:description') ||
      '';

    // Extract Open Graph metadata
    const openGraph = {};
    $('meta[property^="og:"]').each((_, element) => {
      const property = $(element).attr('property');
      const content = $(element).attr('content');
      if (property && content) {
        const key = property.replace('og:', '');
        openGraph[key] = content;
      }
    });

    // Extract Twitter Card metadata
    const twitter = {};
    $('meta[name^="twitter:"]').each((_, element) => {
      const name = $(element).attr('name');
      const content = $(element).attr('content');
      if (name && content) {
        // Replace every remaining colon so multi-segment names
        // (e.g. twitter:app:name:iphone) yield a uniform key.
        const key = name.replace('twitter:', '').replace(/:/g, '_');
        twitter[key] = content;
      }
    });

    // Extract images with priorities
    const images = {
      primary: null,
      sources: []
    };

    const imageSelectors = [
      { selector: 'meta[property="og:image"]', type: 'og:image', priority: 10 },
      { selector: 'meta[name="twitter:image"]', type: 'twitter:image', priority: 9 },
      { selector: 'meta[name="twitter:image:src"]', type: 'twitter:image:src', priority: 8 },
      { selector: 'link[rel="image_src"]', type: 'image_src', priority: 7 }
    ];

    imageSelectors.forEach(({ selector, type, priority }) => {
      $(selector).each((_, element) => {
        const node = $(element);
        const resolvedUrl = resolveUrl(node.attr('content') || node.attr('href'));
        if (resolvedUrl) {
          images.sources.push({
            url: resolvedUrl,
            type,
            priority,
            width: Number.parseInt(node.attr('width'), 10) || null,
            height: Number.parseInt(node.attr('height'), 10) || null
          });
        }
      });
    });

    // Sort by priority and set primary
    images.sources.sort((a, b) => b.priority - a.priority);
    if (images.sources.length > 0) {
      images.primary = images.sources[0].url;
    }

    // Extract favicons
    const favicons = [];
    const faviconSelectors = [
      'link[rel="icon"]',
      'link[rel="shortcut icon"]',
      'link[rel="apple-touch-icon"]',
      'link[rel="apple-touch-icon-precomposed"]'
    ];

    faviconSelectors.forEach((selector) => {
      $(selector).each((_, element) => {
        const node = $(element);
        const resolvedUrl = resolveUrl(node.attr('href'));
        if (resolvedUrl) {
          favicons.push({
            url: resolvedUrl,
            type: node.attr('rel'),
            sizes: node.attr('sizes'),
            mimeType: node.attr('type')
          });
        }
      });
    });

    // Extract JSON-LD structured data
    const jsonLd = [];
    $('script[type="application/ld+json"]').each((_, element) => {
      try {
        jsonLd.push(JSON.parse($(element).html()));
      } catch (error) {
        // Best effort: malformed JSON-LD on a page must not fail the whole extraction.
      }
    });

    // Detect content type
    let contentType = 'website';
    if (openGraph.type) {
      contentType = openGraph.type;
    } else if ($('video').length > 0) {
      contentType = 'video';
    } else if ($('article').length > 0) {
      contentType = 'article';
    }

    return {
      url,
      title,
      description,
      contentType,
      images,
      favicons,
      openGraph,
      twitter,
      structuredData: {
        jsonLd,
        microdata: [] // Could be enhanced to extract microdata
      },
      // Additional metadata
      loadTime: Date.now(),
      hasJavaScript: false, // This fetcher doesn't execute JS
      fetchMethod: 'simple-http'
    };
  }

  /**
   * Fetches metadata from a URL.
   * Validation errors are thrown as-is; failures while fetching or parsing
   * are wrapped, with the original error available as `error.cause`.
   * @param {string} url - The URL to fetch metadata from
   * @returns {Promise<Object>} The extracted metadata
   * @throws {Error} If the URL is rejected or the fetch/extraction fails
   */
  async fetchMetadata(url) {
    this.validateUrl(url);

    try {
      const html = await this.fetchHtml(url);
      return this.extractMetadata(html, url);
    } catch (error) {
      throw new Error(`Failed to fetch metadata: ${error.message}`, { cause: error });
    }
  }

  /**
   * Cleanup method (no-op for simple fetcher)
   */
  async cleanup() {
    // No cleanup needed for simple HTTP fetcher
  }
}
319+
// Shared instance built with the default options, for callers that need no custom configuration
export const simpleMetadataFetcher = new SimpleMetadataFetcher();
0 commit comments