@@ -32,41 +32,39 @@ const BOOLEAN_ATTRIBUTES = [
3232] ;
3333
/**
 * Allowlist entry for embed content validation.
 *
 * An iframe `src` is accepted by an entry when its hostname matches `host`
 * and, if `pathPrefix` is given, its path starts with that prefix.
 */
export interface EmbedAllowlistEntry {
  /**
   * Host pattern, compared case-insensitively.
   * A leading '*.' is a wildcard: '*.youtube.com' covers 'youtube.com' itself
   * and any subdomain such as 'www.youtube.com'.
   */
  host: string;
  /** Optional path prefix that must match (e.g., '/embed/') */
  pathPrefix?: string;
}

/**
 * Default allowlist for embed content (ported from Wikidot's default.php).
 * Only iframes whose src matches one of these host + path entries are rendered.
 *
 * The type also admits `null`, which the embed validator treats as
 * "allow any HTTPS iframe" (Wikidot's 'anyiframe' behavior). HTTPS is still
 * required in that mode, and DOMPurify still strips dangerous attributes.
 *
 * Host strings must not contain whitespace: a URL hostname never does, so an
 * entry with stray spaces could not match anything.
 */
export const DEFAULT_EMBED_ALLOWLIST: EmbedAllowlistEntry[] | null = [
  // YouTube
  { host: "*.youtube.com", pathPrefix: "/embed/" },
  { host: "*.youtube-nocookie.com", pathPrefix: "/embed/" },
  // Vimeo
  { host: "player.vimeo.com", pathPrefix: "/video/" },
  // Google Maps
  { host: "*.google.com", pathPrefix: "/maps/embed" },
  // Google Calendar
  { host: "calendar.google.com", pathPrefix: "/calendar/embed" },
  // Spotify
  { host: "open.spotify.com", pathPrefix: "/embed/" },
  // SoundCloud
  { host: "w.soundcloud.com", pathPrefix: "/player/" },
  // CodePen (any path)
  { host: "codepen.io" },
];
7169
7270// Initialize DOMPurify with jsdom
@@ -97,39 +95,111 @@ const DOMPURIFY_CONFIG: Config = {
9795} ;
9896
/**
 * Check whether a hostname matches an allowlist host pattern.
 *
 * Matching is case-insensitive. A pattern starting with '*.' matches the base
 * domain itself and any subdomain of it: '*.youtube.com' matches 'youtube.com'
 * and 'www.youtube.com', but not 'evil-youtube.com' (a dot boundary is
 * required). Any other pattern must equal the hostname exactly.
 *
 * Whitespace around the pattern is ignored, and a pattern without a domain
 * part ('' or '*.') never matches anything.
 *
 * @param hostname - Hostname to test (e.g. the `hostname` of a parsed URL)
 * @param pattern - Allowlist host pattern, optionally prefixed with '*.'
 * @returns true when the hostname is covered by the pattern
 */
function matchesHostPattern(hostname: string, pattern: string): boolean {
  const host = hostname.toLowerCase();
  // Hostnames never contain whitespace, so padding in a pattern is always a typo
  const normalized = pattern.trim().toLowerCase();

  if (normalized.startsWith("*.")) {
    const base = normalized.slice(2); // Remove '*.'
    // With an empty base, endsWith(".") would accept every trailing-dot hostname
    if (base === "") {
      return false;
    }
    return host === base || host.endsWith("." + base);
  }

  // Exact match; an empty pattern is treated as "matches nothing"
  return normalized !== "" && host === normalized;
}
114+
/**
 * Decide whether a parsed URL is covered by a single allowlist entry.
 *
 * The host must match the entry's host pattern. When the entry carries a
 * path prefix, the URL path (compared case-insensitively) must start with it,
 * and a prefix that does not end in '/' must stop at a boundary so that
 * '/embed' does not accept '/embedX'.
 *
 * @param url - Parsed URL of the iframe src
 * @param entry - Allowlist entry to test against
 * @returns true when both the host and the optional path prefix match
 */
function matchesAllowlistEntry(url: URL, entry: EmbedAllowlistEntry): boolean {
  const hostAllowed = matchesHostPattern(url.hostname, entry.host);
  if (!hostAllowed) {
    return false;
  }

  // No (or empty) prefix: the host match alone is sufficient
  const { pathPrefix } = entry;
  if (!pathPrefix) {
    return true;
  }

  const path = url.pathname.toLowerCase();
  const prefix = pathPrefix.toLowerCase();
  if (!path.startsWith(prefix)) {
    return false;
  }

  // A prefix ending in '/' is already aligned to a segment boundary
  if (prefix.endsWith("/")) {
    return true;
  }

  // Otherwise whatever follows the prefix must be nothing or a delimiter
  const rest = path.slice(prefix.length);
  return rest === "" || /^[/?#]/.test(rest);
}
140+
/**
 * Sanitize embed markup and verify that it is an acceptable iframe embed.
 *
 * Steps, in order:
 * - run the content through DOMPurify with the module's config
 * - require the sanitized markup to contain exactly one iframe element
 * - require that iframe to have a non-empty, parseable src using https:
 * - unless the allowlist is null, require the src to match an allowlist entry
 *
 * @param content - Raw embed markup supplied by the page author
 * @param allowlist - Entries the iframe src must match, or null to accept any HTTPS src
 * @returns the sanitized HTML string, or null if any check fails
 */
function validateAndSanitizeEmbed(
  content: string,
  allowlist: EmbedAllowlistEntry[] | null,
): string | null {
  // Sanitize first so every later check inspects what would actually be rendered
  const sanitized = purify.sanitize(content.trim(), {
    ...DOMPURIFY_CONFIG,
    RETURN_TRUSTED_TYPE: false,
  }) as string;

  // Nothing survived sanitization
  if (!sanitized.trim()) {
    return null;
  }

  // Parse the sanitized markup a single time and inspect its iframes
  const { document } = new JSDOM(sanitized).window;
  const frames = document.querySelectorAll("iframe");
  if (frames.length !== 1) {
    return null;
  }

  const rawSrc = frames.item(0).getAttribute("src");
  const src = rawSrc === null ? "" : rawSrc.trim();
  if (src === "") {
    return null;
  }

  // The src must be an absolute URL; anything unparseable is rejected
  let parsed: URL;
  try {
    parsed = new URL(src);
  } catch {
    return null;
  }

  // HTTPS only
  if (parsed.protocol !== "https:") {
    return null;
  }

  // A null allowlist means any HTTPS iframe is accepted ('anyiframe' behavior)
  if (allowlist !== null && !allowlist.some((entry) => matchesAllowlistEntry(parsed, entry))) {
    return null;
  }

  return sanitized;
}
134204
135205/**
@@ -153,23 +223,17 @@ function normalizeBooleanAttributes(html: string): string {
153223/**
154224 * Render embed-block element (Wikidot style [[embed]]..[[/embed]])
155225 *
156- * Content is validated in two stages:
157- * 1. Pattern-based allowlist check (for Wikidot compatibility)
158- * 2. DOMPurify sanitization (for XSS protection)
159- *
160- * Both stages must pass for content to be rendered.
226+ * Content is validated in a single pass:
227+ * 1. DOMPurify sanitization (removes dangerous attributes)
228+ * 2. Single iframe requirement check
229+ * 3. HTTPS-only and allowlist (host + path) validation
161230 */
162231export function renderEmbedBlock ( ctx : RenderContext , data : EmbedBlockData ) : void {
163- const allowlist = ctx . options . embedAllowlist ?? DEFAULT_EMBED_ALLOWLIST ;
164-
165- // Stage 1: Pattern-based allowlist check
166- if ( ! matchesAllowlist ( data . contents , allowlist ) ) {
167- ctx . push ( '<div class="error-block">Sorry, no match for the embedded content.</div>' ) ;
168- return ;
169- }
232+ // Use explicit undefined check to allow null (anyiframe mode)
233+ const allowlist =
234+ ctx . options . embedAllowlist !== undefined ? ctx . options . embedAllowlist : DEFAULT_EMBED_ALLOWLIST ;
170235
171- // Stage 2: DOMPurify sanitization (defense in depth)
172- const sanitized = sanitizeEmbed ( data . contents ) ;
236+ const sanitized = validateAndSanitizeEmbed ( data . contents , allowlist ) ;
173237 if ( sanitized === null ) {
174238 ctx . push ( '<div class="error-block">Sorry, no match for the embedded content.</div>' ) ;
175239 return ;
0 commit comments