@@ -40,7 +40,7 @@ def extract_time(paragraph):
4040
4141 timestamps = []
4242 # Check for The given query happens in m - n (seconds)
43- patterns = [r"(\d+\.*\d*)\s*- \s*(\d+\.*\d*)" ]
43+ patterns = [r"(\d+\.*\d*)\s*[–-] \s*(\d+\.*\d*)" ]
4444
4545 for time_pattern in patterns :
4646 time_matches = re .findall (time_pattern , paragraph )
@@ -84,6 +84,16 @@ def extract_time(paragraph):
8484 times .append (time_in_sec )
8585 times = times [: len (times ) // 2 * 2 ]
8686 timestamps = [(times [i ], times [i + 1 ]) for i in range (0 , len (times ), 2 )]
87+ # Fallback: if no timestamps found, search for any two number patterns with dash
88+ if len (timestamps ) == 0 :
89+ # More comprehensive pattern to match various formats like:
90+ # xx - xx, x.xx s - x.xx s, x.xxs - x.xxs, etc.
91+ # Also handle en dash (–) and regular dash (-)
92+ fallback_pattern = r"(\d+(?:\.\d+)?)\s*s?\s*[–-]\s*(\d+(?:\.\d+)?)\s*s?"
93+ fallback_matches = re .findall (fallback_pattern , paragraph )
94+ if fallback_matches :
95+ timestamps = [[float (start ), float (end )] for start , end in fallback_matches ]
96+
8797 results = []
8898 for start , end in timestamps :
8999 if end > start :
0 commit comments