@@ -103,54 +103,108 @@ export function formatAssemblyTranscript(transcript: AssemblyAIPollingResponse,
103103 * Converts LRC content (common lyrics file format) to plain text with timestamps.
104104 * - Strips out lines that contain certain metadata (like [by:whisper.cpp]).
105105 * - Converts original timestamps [MM:SS.xx] to a simplified [MM:SS] format.
106- * - Collapses lines with single or few words into lines of up to 15 words, retaining only the first timestamp
107- * among collapsed lines and removing subsequent timestamps.
106+ * - Properly extracts all timestamps in each line, then merges them into
107+ * chunks of up to 15 words, adopting the newest timestamp as soon
108+ * as it appears.
108109 *
109110 * @param lrcContent - The content of the LRC file as a string
110111 * @returns The converted text content with simple timestamps
111112 */
export function formatWhisperTranscript(lrcContent: string): string {
  // Overview:
  //   - Lines starting with `[by:whisper.cpp]` are dropped.
  //   - Every `[MM:SS]` / `[MM:SS.xx]` tag starts a new output line labelled `[MM:SS]`,
  //     even when no text follows the tag on the same input line (the text may
  //     arrive on the next, untimed line).
  //   - Text without a tag of its own continues the current output line.
  //   - Output lines are capped at `maxWordsPerLine` words; overflow continues on a
  //     new output line that repeats the same timestamp.
  //   - Text seen before any timestamp is labelled `[00:00]`.
  const maxWordsPerLine = 15

  // Matches both the raw "[MM:SS.xx]" form and the simplified "[MM:SS]" form.
  // The fractional part is matched but not captured, so it is discarded.
  const timestampPattern = /\[(\d{1,3}):(\d{2})(?:\.\d+)?\]/g

  const finalLines: string[] = []
  let currentTimestamp: string | undefined
  let currentWords: string[] = []

  // Emit the words gathered so far (if any) as one output line.
  const flushChunk = (): void => {
    if (currentWords.length === 0) {
      return
    }
    finalLines.push(`[${currentTimestamp ?? '00:00'}] ${currentWords.join(' ')}`)
    currentWords = []
  }

  // Close the current output line and adopt `timestamp` for whatever text comes next.
  const startChunk = (timestamp: string): void => {
    flushChunk()
    currentTimestamp = timestamp
  }

  // Add the words of `text` to the current output line, wrapping at the word limit.
  const appendText = (text: string): void => {
    for (const word of text.split(/\s+/).filter(Boolean)) {
      currentWords.push(word)
      if (currentWords.length === maxWordsPerLine) {
        flushChunk()
      }
    }
  }

  for (const line of lrcContent.split('\n')) {
    if (line.startsWith('[by:whisper.cpp]')) {
      continue
    }

    // The pattern is global and shared across lines, so rewind it explicitly.
    timestampPattern.lastIndex = 0
    let cursor = 0
    let match: RegExpExecArray | null

    while ((match = timestampPattern.exec(line)) !== null) {
      // Text located before this tag still belongs to the previous timestamp.
      appendText(line.slice(cursor, match.index))
      startChunk(`${match[1]}:${match[2]}`)
      cursor = timestampPattern.lastIndex
    }

    // Whatever follows the last tag (or the whole line when it has no tag).
    appendText(line.slice(cursor))
  }

  flushChunk()

  return finalLines.join('\n')
}
0 commit comments