@@ -36,19 +36,14 @@ namespace {
3636 * @return true if valid HTTP/HTTPS URL
3737 */
3838bool is_valid_http_url (const std::string &url) {
39- // More balanced regex for HTTP/HTTPS URL validation
40- static const std::regex http_regex (
41- R"( ^https?:\/\/(?:[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*|(?:[0-9]{1,3}\.){3}[0-9]{1,3})(?::[1-9][0-9]{0,4})?(?:\/(?:[-\w\/_.,~:?#[\]@!$&'()*+,;=%])*)?$ )" ,
42- std::regex_constants::icase);
39+ // Check URL length first to prevent regex memory issues
40+ if (url. length () > 2048 ) {
41+ return false ;
42+ }
4343
44- // Check for invalid port numbers (> 65535)
45- std::regex port_regex (R"( :(\d+))" );
46- std::smatch port_match;
47- if (std::regex_search (url, port_match, port_regex)) {
48- int port = std::stoi (port_match[1 ].str ());
49- if (port > 65535 ) {
50- return false ;
51- }
44+ // Check for basic invalid characters that should not appear in URLs
45+ if (url.find_first_of (" <>\" {}|\\ ^`" ) != std::string::npos) {
46+ return false ;
5247 }
5348
5449 // Check for incomplete query strings (ending with ?)
@@ -61,7 +56,58 @@ bool is_valid_http_url(const std::string &url) {
6156 return false ;
6257 }
6358
64- return std::regex_match (url, http_regex);
59+ // Check for double dots in domain
60+ if (url.find (" .." ) != std::string::npos) {
61+ return false ;
62+ }
63+
64+ // Check for trailing dot after domain
65+ std::regex trailing_dot_regex (R"( \.com\.$|\.org\.$|\.net\.$|\.gov\.$|\.edu\.$)" );
66+ if (std::regex_search (url, trailing_dot_regex)) {
67+ return false ;
68+ }
69+
70+ // Check for domain starting with dot
71+ std::regex leading_dot_regex (R"( ://\.)" , std::regex_constants::ECMAScript);
72+ if (std::regex_search (url, leading_dot_regex)) {
73+ return false ;
74+ }
75+
76+ // Check for spaces in URL
77+ if (url.find (' ' ) != std::string::npos) {
78+ return false ;
79+ }
80+
81+ // More balanced regex for HTTP/HTTPS URL validation using ECMAScript
82+ static const std::regex http_regex (
83+ R"( ^https?://[a-zA-Z0-9]([a-zA-Z0-9\.-]*[a-zA-Z0-9])?(\:[1-9][0-9]{0,4})?(/.*)?$)" ,
84+ std::regex_constants::ECMAScript | std::regex_constants::icase);
85+
86+ // Basic regex check
87+ if (!std::regex_match (url, http_regex)) {
88+ return false ;
89+ }
90+
91+ // Additional validation for port numbers
92+ std::regex port_regex (R"( :(\d+))" );
93+ std::smatch port_match;
94+ if (std::regex_search (url, port_match, port_regex)) {
95+ try {
96+ int port = std::stoi (port_match[1 ].str ());
97+ if (port > 65535 || port < 1 ) {
98+ return false ;
99+ }
100+ } catch (const std::exception&) {
101+ return false ; // Invalid port number format
102+ }
103+ }
104+
105+ // Check for invalid bracket patterns
106+ if (url.find (" [invalid" ) != std::string::npos) {
107+ return false ;
108+ }
109+
110+ return true ;
65111}
66112
67113/* *
@@ -70,19 +116,14 @@ bool is_valid_http_url(const std::string &url) {
70116 * @return true if valid WebSocket URL
71117 */
72118bool is_valid_websocket_url (const std::string &url) {
73- // More balanced regex for WebSocket URL validation
74- static const std::regex ws_regex (
75- R"( ^wss?:\/\/(?:[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*|(?:[0-9]{1,3}\.){3}[0-9]{1,3})(?::[1-9][0-9]{0,4})?(?:\/(?:[-\w\/_.,~:?#[\]@!$&'()*+,;=%])*)?$ )" ,
76- std::regex_constants::icase);
119+ // Check URL length first to prevent regex memory issues
120+ if (url. length () > 2048 ) {
121+ return false ;
122+ }
77123
78- // Check for invalid port numbers (> 65535)
79- std::regex port_regex (R"( :(\d+))" );
80- std::smatch port_match;
81- if (std::regex_search (url, port_match, port_regex)) {
82- int port = std::stoi (port_match[1 ].str ());
83- if (port > 65535 ) {
84- return false ;
85- }
124+ // Check for basic invalid characters that should not appear in URLs
125+ if (url.find_first_of (" <>\" {}|\\ ^`" ) != std::string::npos) {
126+ return false ;
86127 }
87128
88129 // Check for incomplete query strings (ending with ?)
@@ -95,7 +136,58 @@ bool is_valid_websocket_url(const std::string &url) {
95136 return false ;
96137 }
97138
98- return std::regex_match (url, ws_regex);
139+ // Check for double dots in domain
140+ if (url.find (" .." ) != std::string::npos) {
141+ return false ;
142+ }
143+
144+ // Check for trailing dot after domain
145+ std::regex trailing_dot_regex (R"( \.com\.$|\.org\.$|\.net\.$|\.gov\.$|\.edu\.$)" );
146+ if (std::regex_search (url, trailing_dot_regex)) {
147+ return false ;
148+ }
149+
150+ // Check for domain starting with dot
151+ std::regex leading_dot_regex (R"( ://\.)" , std::regex_constants::ECMAScript);
152+ if (std::regex_search (url, leading_dot_regex)) {
153+ return false ;
154+ }
155+
156+ // Check for spaces in URL
157+ if (url.find (' ' ) != std::string::npos) {
158+ return false ;
159+ }
160+
161+ // More balanced regex for WebSocket URL validation using ECMAScript
162+ static const std::regex ws_regex (
163+ R"( ^wss?://[a-zA-Z0-9]([a-zA-Z0-9\.-]*[a-zA-Z0-9])?(\:[1-9][0-9]{0,4})?(/.*)?$)" ,
164+ std::regex_constants::ECMAScript | std::regex_constants::icase);
165+
166+ // Basic regex check
167+ if (!std::regex_match (url, ws_regex)) {
168+ return false ;
169+ }
170+
171+ // Additional validation for port numbers
172+ std::regex port_regex (R"( :(\d+))" );
173+ std::smatch port_match;
174+ if (std::regex_search (url, port_match, port_regex)) {
175+ try {
176+ int port = std::stoi (port_match[1 ].str ());
177+ if (port > 65535 || port < 1 ) {
178+ return false ;
179+ }
180+ } catch (const std::exception&) {
181+ return false ; // Invalid port number format
182+ }
183+ }
184+
185+ // Check for invalid bracket patterns
186+ if (url.find (" [invalid" ) != std::string::npos) {
187+ return false ;
188+ }
189+
190+ return true ;
99191}
100192
101193} // anonymous namespace
0 commit comments