@@ -103,54 +103,108 @@ export function formatAssemblyTranscript(transcript: AssemblyAIPollingResponse,
103
103
* Converts LRC content (common lyrics file format) to plain text with timestamps.
104
104
* - Strips out lines that contain certain metadata (like [by:whisper.cpp]).
105
105
* - Converts original timestamps [MM:SS.xx] to a simplified [MM:SS] format.
106
- * - Collapses lines with single or few words into lines of up to 15 words, retaining only the first timestamp
107
- * among collapsed lines and removing subsequent timestamps.
106
+ * - Properly extracts all timestamps in each line, then merges them into
107
+ * chunks of up to 15 words, adopting the newest timestamp as soon
108
+ * as it appears.
108
109
*
109
110
* @param lrcContent - The content of the LRC file as a string
110
111
* @returns The converted text content with simple timestamps
111
112
*/
112
113
export function formatWhisperTranscript(lrcContent: string): string {
  // Contract:
  //   - Lines starting with `[by:whisper.cpp]` are dropped.
  //   - Every `[MM:SS.xx]` tag is reduced to `[MM:SS]`.
  //   - Each timestamp tag starts a new output line; words that follow (on the
  //     same line or on later untagged lines) are appended to it, and a line is
  //     flushed as soon as it reaches MAX_WORDS_PER_LINE words. The overflow
  //     continues on a new output line carrying the same timestamp.
  //   - Words seen before any timestamp are emitted under `[00:00]`.
  //   - Returns the output lines joined with '\n' ('' when there are no words).
  const MAX_WORDS_PER_LINE = 15

  // A run of words together with the timestamp tag that introduced it.
  // `timestamp` is undefined when the words continue whatever chunk is open.
  type Segment = {
    timestamp: string | undefined
    words: string[]
  }

  // Splits on any whitespace and discards empty tokens.
  const splitWords = (text: string): string[] => text.split(/\s+/).filter(Boolean)

  // 1) Remove lines like `[by:whisper.cpp]`, convert "[MM:SS.xx]" to "[MM:SS]"
  const rawLines = lrcContent
    .split('\n')
    .filter(line => !line.startsWith('[by:whisper.cpp]'))
    .map(line =>
      line.replace(
        /\[(\d{1,3}):(\d{2})(\.\d+)?\]/g,
        (_, minutes: string, seconds: string) => `[${minutes}:${seconds}]`
      )
    )

  /**
   * Given a line (which may contain multiple [MM:SS] tags),
   * extract those timestamps + the words in between.
   *
   * A tag that is not followed by any text on this line still produces a
   * segment (with an empty `words` array). Without it the timestamp would be
   * lost and words on the following untagged line would be attributed to a
   * stale timestamp.
   */
  function parseLineIntoSegments(line: string): Segment[] {
    const segments: Segment[] = []
    // Created per call: a /g regex is stateful through `lastIndex`.
    const pattern = /\[(\d{1,3}:\d{2})\]/g

    let lastIndex = 0
    let match: RegExpExecArray | null
    let activeTimestamp: string | undefined = undefined

    // Records the text owned by `activeTimestamp`. Skips only the case that
    // carries no information at all: no words and no timestamp.
    const pushSegment = (text: string): void => {
      const words = splitWords(text)
      if (words.length > 0 || activeTimestamp !== undefined) {
        segments.push({ timestamp: activeTimestamp, words })
      }
    }

    while ((match = pattern.exec(line)) !== null) {
      // Text before this tag belongs to the previous tag (or to no tag yet)
      pushSegment(line.slice(lastIndex, match.index))
      // Update timestamp to the newly found one
      activeTimestamp = match[1]
      lastIndex = pattern.lastIndex
    }

    // After the last timestamp, grab any trailing text
    pushSegment(line.slice(lastIndex))

    // If line had no timestamps, the entire line is one segment with `timestamp: undefined`.
    return segments
  }

  // 2) Flatten all lines into an array of typed segments
  const allSegments: Segment[] = rawLines.flatMap(line => parseLineIntoSegments(line))

  // 3) Accumulate words into lines up to MAX_WORDS_PER_LINE words each.
  //    Whenever a segment carries a timestamp, we finalize the previous chunk
  //    and start a new chunk with that timestamp.
  const finalLines: string[] = []
  let currentTimestamp: string | undefined = undefined
  let currentWords: string[] = []

  // Emits the pending words (if any) as one output line and resets the buffer.
  function finalizeChunk(): void {
    if (currentWords.length > 0) {
      // If we have never encountered a timestamp, default to "00:00"
      const tsToUse = currentTimestamp ?? '00:00'
      finalLines.push(`[${tsToUse}] ${currentWords.join(' ')}`)
      currentWords = []
    }
  }

  for (const segment of allSegments) {
    if (segment.timestamp !== undefined) {
      finalizeChunk()
      currentTimestamp = segment.timestamp
    }

    for (const word of segment.words) {
      currentWords.push(word)
      if (currentWords.length === MAX_WORDS_PER_LINE) {
        finalizeChunk()
      }
    }
  }

  // 4) Finalize any leftover words
  finalizeChunk()

  // 5) Return as simple text
  return finalLines.join('\n')
}
0 commit comments