Skip to content

Commit a69e806

Browse files
committed
fix 00:00 timestamp bug
1 parent 2895b8d commit a69e806

File tree

1 file changed

+89
-35
lines changed

1 file changed

+89
-35
lines changed

src/transcription/format-transcript.ts

+89-35
Original file line numberDiff line numberDiff line change
@@ -103,54 +103,108 @@ export function formatAssemblyTranscript(transcript: AssemblyAIPollingResponse,
103103
* Converts LRC content (common lyrics file format) to plain text with timestamps.
104104
* - Strips out lines that contain certain metadata (like [by:whisper.cpp]).
105105
* - Converts original timestamps [MM:SS.xx] to a simplified [MM:SS] format.
106-
* - Collapses lines with single or few words into lines of up to 15 words, retaining only the first timestamp
107-
* among collapsed lines and removing subsequent timestamps.
106+
* - Extracts every timestamp in each line and groups the words that follow
107+
* it into output lines of at most 15 words; each timestamp that is
108+
* followed by text starts a new output line.
108109
*
109110
* @param lrcContent - The content of the LRC file as a string
110111
* @returns The converted text content with simple timestamps
111112
*/
112113
export function formatWhisperTranscript(lrcContent: string): string {
  // Overview:
  //   1. Drop the `[by:whisper.cpp]` metadata line and shorten every
  //      "[MM:SS.xx]" tag to "[MM:SS]".
  //   2. Split each line into segments: a timestamp (if any) plus the words
  //      that follow it up to the next timestamp on that line.
  //   3. Re-emit the words as "[MM:SS] word word ..." lines of at most
  //      MAX_WORDS_PER_LINE words. Every timestamp closes the current output
  //      line; words that appear before any timestamp are labelled "00:00".

  // Upper bound on the number of words in one output line.
  const MAX_WORDS_PER_LINE = 15

  // A run of words together with the timestamp that precedes it on the same
  // line. `timestamp` is undefined for text that has no tag in front of it on
  // its line. `words` may be empty when a timestamp is not followed by text.
  type Segment = {
    timestamp: string | undefined
    words: string[]
  }

  const rawLines = lrcContent
    .split('\n')
    .filter(line => !line.startsWith('[by:whisper.cpp]'))
    .map(line =>
      line.replace(
        /\[(\d{1,3}):(\d{2})(\.\d+)?\]/g,
        (_whole: string, minutes: string, seconds: string) => `[${minutes}:${seconds}]`
      )
    )

  /**
   * Splits one (already simplified) line into segments.
   *
   * A timestamp that has no text after it on this line still yields a segment
   * with an empty word list. Without that, a tag standing alone on a line (or
   * at the end of a line) would be lost and the following text would be
   * attributed to an older timestamp, or to the "00:00" fallback.
   */
  function parseLineIntoSegments(line: string): Segment[] {
    const segments: Segment[] = []
    const pattern = /\[(\d{1,3}:\d{2})\]/g

    let lastIndex = 0
    let activeTimestamp: string | undefined

    // Records the text that belongs to `activeTimestamp`. Untagged empty text
    // carries no information and is skipped.
    const pushSegment = (text: string): void => {
      const words = text.trim().split(/\s+/).filter(Boolean)
      if (words.length > 0 || activeTimestamp !== undefined) {
        segments.push({ timestamp: activeTimestamp, words })
      }
    }

    let match: RegExpExecArray | null
    while ((match = pattern.exec(line)) !== null) {
      // Everything between the previous tag (or line start) and this tag.
      pushSegment(line.slice(lastIndex, match.index))
      activeTimestamp = match[1]
      lastIndex = pattern.lastIndex
    }

    // Whatever follows the last tag (the whole line if it had no tags).
    pushSegment(line.slice(lastIndex))

    return segments
  }

  const finalLines: string[] = []
  let currentTimestamp: string | undefined
  let currentWords: string[] = []

  // Emits the pending words as one output line, if there are any.
  const flushChunk = (): void => {
    if (currentWords.length === 0) {
      return
    }
    // Words seen before the first timestamp fall back to "00:00".
    const label = currentTimestamp ?? '00:00'
    finalLines.push(`[${label}] ${currentWords.join(' ')}`)
    currentWords = []
  }

  for (const segment of rawLines.flatMap(line => parseLineIntoSegments(line))) {
    // A timestamp always ends the current output line, even when the segment
    // itself has no words: the words of later, untagged lines then inherit it.
    if (segment.timestamp !== undefined) {
      flushChunk()
      currentTimestamp = segment.timestamp
    }

    for (const word of segment.words) {
      currentWords.push(word)
      // Long runs are split; every piece keeps the same timestamp.
      if (currentWords.length === MAX_WORDS_PER_LINE) {
        flushChunk()
      }
    }
  }

  // Emit whatever is left after the last segment.
  flushChunk()

  return finalLines.join('\n')
}

0 commit comments

Comments
 (0)