fix(security): prevent ReDoS in extractTagContent (#9)

theagenticguy · Copilot · web-flow · commit 344014dcc956 · 2026-01-12T16:35:25.000-06:00
* fix(security): prevent ReDoS in extractTagContent Escape special regex characters in tagName parameter before constructing RegExp to prevent Regular Expression Denial-of-Service attacks from malicious input. * fix(security): eliminate ReDoS in extractTagContent via string-based parsing (#10) * Initial plan * fix(security): replace RegExp with string-based parsing in extractTagContent - Remove dynamic RegExp construction to prevent ReDoS vulnerability - Replace with safe string-based indexOf parsing - Add input validation: only allow alphanumeric, hyphens, underscores - Add length limit (100 chars) for tag names - Maintain all existing functionality and test compatibility Co-authored-by: theagenticguy <9553966+theagenticguy@users.noreply.github.com> * refactor: improve extractTagContent performance and validation - Replace regex validation with character-by-character check - Implement custom case-insensitive search to avoid lowercasing large texts - More efficient for large inputs (no full text copy) - Still prevents ReDoS and maintains all functionality Co-authored-by: theagenticguy <9553966+theagenticguy@users.noreply.github.com> --------- Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com> Co-authored-by: theagenticguy <9553966+theagenticguy@users.noreply.github.com> * fix: regex type guard --------- Co-authored-by: Copilot <198982749+Copilot@users.noreply.github.com>
diff --git a/src/schemas/session.ts b/src/schemas/session.ts
@@ -158,18 +158,85 @@ function sanitizeText(text: string): string {
 }
 
 /**
- * Extracts content between XML-like tags.
+ * Case-insensitive string search helper.
  * @param text - The text to search in
- * @param tagName - The tag name to look for
+ * @param searchStr - The string to search for
+ * @param startIndex - The index to start searching from
+ * @returns The index where searchStr is found, or -1 if not found
+ */
+function indexOfCaseInsensitive(
+	text: string,
+	searchStr: string,
+	startIndex = 0,
+): number {
+	// Lowercase the needle one UTF-16 unit at a time. Lowercasing the whole string
+	// can change its length (e.g. "\u0130"), which misaligns per-index comparison.
+	const needle = searchStr.split("").map((unit) => unit.toLowerCase());
+	const lastStart = text.length - needle.length;
+
+	// Clamp a negative start to 0 so a negative index can never be returned.
+	for (let i = Math.max(0, startIndex); i <= lastStart; i++) {
+		let j = 0;
+		// Advance while the folded text unit equals the folded needle unit.
+		while (j < needle.length && text[i + j]?.toLowerCase() === needle[j]) {
+			j++;
+		}
+		if (j === needle.length) return i;
+	}
+	return -1;
+}
+
+/**
+ * Extracts content between XML-like tags using string-based parsing.
+ * Avoids RegExp construction with user input to prevent ReDoS attacks.
+ * @param text - The text to search in
+ * @param tagName - The tag name to look for (validated for safety)
  * @returns The content between tags, or null if not found
  */
 export function extractTagContent(
 	text: string,
 	tagName: string,
 ): string | null {
-	const pattern = new RegExp(`<${tagName}>([\\s\\S]*?)</${tagName}>`, "i");
-	const match = text.match(pattern);
-	const content = match?.[1]?.trim() ?? null;
+	// Guard: a usable tag name is between 1 and 100 characters long;
+	// anything outside that range is rejected before any scanning happens.
+	const nameLength = tagName.length;
+	if (nameLength < 1 || nameLength > 100) {
+		return null;
+	}
+
+	// Whitelist check written by hand (no RegExp): each UTF-16 unit must be an
+	// ASCII letter, an ASCII digit, "_" or "-".
+	const isTagNameUnit = (unit: string): boolean => {
+		if (unit === "_" || unit === "-") return true;
+		if (unit >= "0" && unit <= "9") return true;
+		if (unit >= "a" && unit <= "z") return true;
+		return unit >= "A" && unit <= "Z";
+	};
+	if (!tagName.split("").every(isTagNameUnit)) {
+		return null;
+	}
+
+	// Literal delimiters; both are located with a case-insensitive scan.
+	const opening = "<" + tagName + ">";
+	const closing = "</" + tagName + ">";
+
+	// First opening tag anywhere in the text.
+	const openAt = indexOfCaseInsensitive(text, opening);
+	if (openAt === -1) {
+		return null;
+	}
+
+	// First closing tag at or after the end of the opening tag.
+	const bodyFrom = openAt + opening.length;
+	const closeAt = indexOfCaseInsensitive(text, closing, bodyFrom);
+	if (closeAt === -1) {
+		return null;
+	}
+
+	// Slice the untouched input so the payload keeps its original casing,
+	// then drop surrounding whitespace.
+	const content = text.slice(bodyFrom, closeAt).trim();
+
 	// Sanitize to remove any binary/control characters
 	return content ? sanitizeText(content) : null;
 }