Skip to content
Open
738 changes: 738 additions & 0 deletions transcriptions/2022_07_02-chaos-im-radio.txt

Large diffs are not rendered by default.

3,191 changes: 3,191 additions & 0 deletions transcriptions/2022_07_02-chaos-im-radio.vtt

Large diffs are not rendered by default.

486 changes: 486 additions & 0 deletions transcriptions/2022_09_12-chaos-im-radio.txt

Large diffs are not rendered by default.

2,813 changes: 2,813 additions & 0 deletions transcriptions/2022_09_12-chaos-im-radio.vtt

Large diffs are not rendered by default.

653 changes: 653 additions & 0 deletions transcriptions/2022_10_10-chaos-im-radio.txt

Large diffs are not rendered by default.

2,747 changes: 2,747 additions & 0 deletions transcriptions/2022_10_10-chaos-im-radio.vtt

Large diffs are not rendered by default.

546 changes: 546 additions & 0 deletions transcriptions/2022_12_12-chaos-im-radio.txt

Large diffs are not rendered by default.

3,071 changes: 3,071 additions & 0 deletions transcriptions/2022_12_12-chaos-im-radio.vtt

Large diffs are not rendered by default.

798 changes: 798 additions & 0 deletions transcriptions/2023_01_09-chaos-im-radio.txt

Large diffs are not rendered by default.

3,566 changes: 3,566 additions & 0 deletions transcriptions/2023_01_09-chaos-im-radio.vtt

Large diffs are not rendered by default.

328 changes: 328 additions & 0 deletions transcriptions/2023_02_13-chaos-im-radio.txt

Large diffs are not rendered by default.

1,142 changes: 1,142 additions & 0 deletions transcriptions/2023_02_13-chaos-im-radio.vtt

Large diffs are not rendered by default.

570 changes: 570 additions & 0 deletions transcriptions/2023_03_13-chaos-im-radio.txt

Large diffs are not rendered by default.

2,894 changes: 2,894 additions & 0 deletions transcriptions/2023_03_13-chaos-im-radio.vtt

Large diffs are not rendered by default.

771 changes: 771 additions & 0 deletions transcriptions/2023_05_08-chaos-im-radio.txt

Large diffs are not rendered by default.

2,882 changes: 2,882 additions & 0 deletions transcriptions/2023_05_08-chaos-im-radio.vtt

Large diffs are not rendered by default.

4,289 changes: 4,289 additions & 0 deletions transcriptions/2023_06-12-chaos-im-radio.vtt

Large diffs are not rendered by default.

467 changes: 467 additions & 0 deletions transcriptions/2023_06_12-chaos-im-radio.txt

Large diffs are not rendered by default.

2,279 changes: 2,279 additions & 0 deletions transcriptions/2023_06_12-chaos-im-radio.vtt

Large diffs are not rendered by default.

494 changes: 494 additions & 0 deletions transcriptions/2023_07_10-chaos-im-radio.txt

Large diffs are not rendered by default.

2,762 changes: 2,762 additions & 0 deletions transcriptions/2023_07_10-chaos-im-radio.vtt

Large diffs are not rendered by default.

765 changes: 765 additions & 0 deletions transcriptions/2023_08-14-chaos-im-radio.txt

Large diffs are not rendered by default.

3,131 changes: 3,131 additions & 0 deletions transcriptions/2023_08-14-chaos-im-radio.vtt

Large diffs are not rendered by default.

680 changes: 680 additions & 0 deletions transcriptions/2023_08_14-chaos-im-radio.txt

Large diffs are not rendered by default.

4,319 changes: 4,319 additions & 0 deletions transcriptions/2023_08_14-chaos-im-radio.vtt

Large diffs are not rendered by default.

740 changes: 740 additions & 0 deletions transcriptions/2023_09_11-chaos-im-radio.txt

Large diffs are not rendered by default.

2,765 changes: 2,765 additions & 0 deletions transcriptions/2023_09_11-chaos-im-radio.vtt

Large diffs are not rendered by default.

810 changes: 810 additions & 0 deletions transcriptions/2023_10_09-chaos-im-radio.txt

Large diffs are not rendered by default.

3,155 changes: 3,155 additions & 0 deletions transcriptions/2023_10_09-chaos-im-radio.vtt

Large diffs are not rendered by default.

539 changes: 539 additions & 0 deletions transcriptions/2023_12_11-chaos-im-radio.txt

Large diffs are not rendered by default.

2,594 changes: 2,594 additions & 0 deletions transcriptions/2023_12_11-chaos-im-radio.vtt

Large diffs are not rendered by default.

725 changes: 725 additions & 0 deletions transcriptions/2024_03_11-chaos-im-radio.txt

Large diffs are not rendered by default.

3,086 changes: 3,086 additions & 0 deletions transcriptions/2024_03_11-chaos-im-radio.vtt

Large diffs are not rendered by default.

711 changes: 711 additions & 0 deletions transcriptions/2024_04_08-chaos-im-radio.txt

Large diffs are not rendered by default.

3,554 changes: 3,554 additions & 0 deletions transcriptions/2024_04_08-chaos-im-radio.vtt

Large diffs are not rendered by default.

760 changes: 760 additions & 0 deletions transcriptions/2024_05_13-chaos-im-radio.txt

Large diffs are not rendered by default.

2,837 changes: 2,837 additions & 0 deletions transcriptions/2024_05_13-chaos-im-radio.vtt

Large diffs are not rendered by default.

645 changes: 645 additions & 0 deletions transcriptions/2024_06_10-chaos-im-radio.txt

Large diffs are not rendered by default.

2,213 changes: 2,213 additions & 0 deletions transcriptions/2024_06_10-chaos-im-radio.vtt

Large diffs are not rendered by default.

597 changes: 597 additions & 0 deletions transcriptions/2024_07_08-chaos-im-radio.txt

Large diffs are not rendered by default.

2,795 changes: 2,795 additions & 0 deletions transcriptions/2024_07_08-chaos-im-radio.vtt

Large diffs are not rendered by default.

586 changes: 586 additions & 0 deletions transcriptions/2024_09_09-chaos-im-radio.txt

Large diffs are not rendered by default.

2,699 changes: 2,699 additions & 0 deletions transcriptions/2024_09_09-chaos-im-radio.vtt

Large diffs are not rendered by default.

1,017 changes: 1,017 additions & 0 deletions transcriptions/2024_11_11-chaos-im-radio.txt

Large diffs are not rendered by default.

3,464 changes: 3,464 additions & 0 deletions transcriptions/2024_11_11-chaos-im-radio.vtt

Large diffs are not rendered by default.

513 changes: 513 additions & 0 deletions transcriptions/2025-09-08-chaos-im-radio.txt

Large diffs are not rendered by default.

2,156 changes: 2,156 additions & 0 deletions transcriptions/2025-09-08-chaos-im-radio.vtt

Large diffs are not rendered by default.

747 changes: 747 additions & 0 deletions transcriptions/2025_01_13-chaos-im-radio.txt

Large diffs are not rendered by default.

3,029 changes: 3,029 additions & 0 deletions transcriptions/2025_01_13-chaos-im-radio.vtt

Large diffs are not rendered by default.

749 changes: 749 additions & 0 deletions transcriptions/2025_03_10-chaos-im-radio.txt

Large diffs are not rendered by default.

2,999 changes: 2,999 additions & 0 deletions transcriptions/2025_03_10-chaos-im-radio.vtt

Large diffs are not rendered by default.

504 changes: 504 additions & 0 deletions transcriptions/2025_05_12-chaos-im-radio.txt

Large diffs are not rendered by default.

3,629 changes: 3,629 additions & 0 deletions transcriptions/2025_05_12-chaos-im-radio.vtt

Large diffs are not rendered by default.

485 changes: 485 additions & 0 deletions transcriptions/2025_07_14-chaos-im-radio.txt

Large diffs are not rendered by default.

1,745 changes: 1,745 additions & 0 deletions transcriptions/2025_07_14-chaos-im-radio.vtt

Large diffs are not rendered by default.

39 changes: 39 additions & 0 deletions transcriptions/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
# Transkriptionen

Dieses Verzeichnis enthält die automatisierten Transkriptionen aller Podcast-Episoden von "Chaos im Radio" seit Juli 2022.

## Wichtige Hinweise

⚠️ **Diese Transkriptionen wurden automatisiert erstellt und sind nicht korrekturgelesen!**

- Die Transkriptionen wurden mittels automatischer Spracherkennung generiert (siehe `transcribe-folder.sh`)
- Es können Fehler bei der Erkennung von Namen, Fachbegriffen oder undeutlich gesprochenen Wörtern auftreten
- Die Transkriptionen dienen als Hilfsmittel und repräsentieren das gesprochene Wort im Podcast
- Zeitstempel können geringfügig ungenau sein

## Verfügbare Skripte

### `transcribe-folder.sh`
Skript zur automatischen Transkription neuer Audio-Dateien mit whisper.cpp

### `fix-mistakes.sh`
Ein Bash-Skript zur automatischen Korrektur häufiger Spracherkennungsfehler:
- Korrigiert falsch erkannte Namen der Moderator:innen (z.B. "Syrux" → "Cyroxx")
- Behebt typische Erkennungsfehler bei häufig verwendeten Wörtern

## Dateiformat

Die Transkriptionen liegen im WebVTT-Format (`.vtt`) vor, welches sowohl Zeitstempel als auch den transkribierten Text enthält. Dieses Format ist mit den meisten Video- und Audio-Playern kompatibel und ermöglicht die Anzeige von Untertiteln. Zusätzlich liegt der transkribierte Text als einfache Textdatei (`.txt`) vor.

## Mitwirkende

Die Stimmen in den Transkriptionen gehören zu den regelmäßigen Moderator:innen:
- Cyroxx
- Hannes
- Gini
- Knurps
- Ajuvo

---

*Diese README wurde erstellt, um Transparenz über die Entstehung und Qualität der automatisierten Transkriptionen zu schaffen.*
121 changes: 121 additions & 0 deletions transcriptions/fix-mistakes.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
#!/bin/bash
# Script to fix common speech recognition mistakes in transcription files (.vtt, .srt, .txt)
# Usage: ./fix-mistakes.sh [directory]
# If no directory is provided, it will process the current directory

# Associative arrays (declare -A, used for the correction table below) need
# bash 4 or newer; stop early instead of continuing with a broken table.
if [ -z "${BASH_VERSINFO[0]:-}" ] || [ "${BASH_VERSINFO[0]}" -lt 4 ]; then
    echo "Error: this script requires bash 4 or newer." >&2
    exit 1
fi

# Directory to process: first argument, or the current directory if omitted
TARGET_DIR="${1:-.}"

# Check if the directory exists (diagnostics go to stderr)
if [ ! -d "$TARGET_DIR" ]; then
    echo "Error: Directory '$TARGET_DIR' does not exist." >&2
    exit 1
fi

echo "Processing transcription files in: $TARGET_DIR"

# Common misrecognized words and their corrections.
# Format: ["wrong word"]="correct word"
# Each key is inserted verbatim into a sed pattern by process_file, so keep
# keys free of regex metacharacters and of '/'.
declare -A word_corrections=(
    # Names/Nicknames
    ["Syrox"]="Cyroxx"
    ["syrox"]="Cyroxx"
    ["Syrux"]="Cyroxx"
    ["syrux"]="Cyroxx"
    ["cyrox"]="Cyroxx"
    ["Cyrox"]="Cyroxx"
    ["TCyroxx"]="Cyroxx"
    ["Cybrox"]="Cyroxx"
    ["Genie"]="Gini"
    ["Joanie"]="Gini"
    ["Jeanie"]="Gini"
    ["Jeany"]="Gini"
    ["Jeannie"]="Gini"
    ["Jenny"]="Gini"
    ["Dini"]="Gini"
    ["Knurfs"]="Knurps"
    ["Knops"]="Knurps"
    ["Knobbs"]="Knurps"
    ["Knorps"]="Knurps"
    ["Klof"]="Knurps"
    ["Knurbs"]="Knurps"
    ["Urnups"]="Knurps"
    ["Juwo"]="Ajuvo"
    ["Ayubo"]="Ajuvo"
    ["Ajuwo"]="Ajuvo"

    # Add more common misrecognitions here as needed
    ["Creative Comments"]="Creative Commons"
)

# Function to process a single file
#######################################
# Apply every entry of the correction table to a single transcription file.
# Globals:
#   word_corrections (read) - associative array: misrecognized word -> correction
# Arguments:
#   $1 - path of the file to correct in place
# Outputs:
#   Progress messages on stdout, error messages on stderr
# Returns:
#   0 on success, 1 if the working copy cannot be created or written back
#######################################
process_file() {
    local file="$1"
    local temp_file
    local changes_made=false
    local wrong_word correct_word

    echo "Processing: $file"

    # Assign separately from `local` so a mktemp failure is not masked by the
    # exit status of the declaration.
    if ! temp_file=$(mktemp); then
        echo "Error: could not create a temporary file for '$file'" >&2
        return 1
    fi

    # Work on a copy so the original stays untouched until all corrections ran
    if ! cp -- "$file" "$temp_file"; then
        echo "Error: could not copy '$file'" >&2
        rm -f -- "$temp_file"
        return 1
    fi

    # Apply each correction
    for wrong_word in "${!word_corrections[@]}"; do
        correct_word="${word_corrections[$wrong_word]}"

        # Look for the word first, so a replacement is only reported when this
        # particular word really occurs. Comparing the copy against the
        # original would flag every later table entry as "replaced" once any
        # earlier entry has changed the copy.
        if grep -q -- "\b$wrong_word\b" "$temp_file"; then
            # Whole-word match (\b) prevents partial word replacements
            if sed -i "s/\b$wrong_word\b/$correct_word/g" "$temp_file"; then
                changes_made=true
                echo " - Replaced '$wrong_word' with '$correct_word'"
            fi
        fi
    done

    if [ "$changes_made" = true ]; then
        # Write the corrected text back through the existing file rather than
        # moving the temp file over it, so the original keeps its own
        # permissions instead of inheriting those of the mktemp file.
        if ! cat -- "$temp_file" > "$file"; then
            echo "Error: could not write '$file'" >&2
            rm -f -- "$temp_file"
            return 1
        fi
        rm -f -- "$temp_file"
        echo " ✓ File updated"
    else
        rm -f -- "$temp_file"
        echo " - No changes needed"
    fi
}

# Replace all newlines in a text file with spaces, then start a new line
# after every sentence end (". ", "! " or "? ").
# Arguments: $1 - path of the .txt file to rewrite in place
# Returns:   0 on success, 1 if the rewrite failed (the file is then left as is)
reflow_sentences() {
    local txt_file="$1"
    local tmp_file="${txt_file}.tmp"

    if tr '\n' ' ' < "$txt_file" > "$tmp_file" \
        && sed -i 's/\([.!?]\) /\1\n/g' "$tmp_file"; then
        mv -- "$tmp_file" "$txt_file"
    else
        echo "Error: could not re-arrange newlines in '$txt_file'" >&2
        rm -f -- "$tmp_file"
        return 1
    fi
}

# Print the first line that occurs three times in a row, or nothing.
# Arguments: $1 - path of the file to inspect
first_triple_repeat() {
    awk 'NR>2 && $0==prev && $0==prev2 {print $0; exit} {prev2=prev; prev=$0}' "$1"
}

# Counter for processed files
file_count=0

# Process all supported transcription files.
# The loops below read from a process substitution instead of a pipe: a piped
# `while` runs in a subshell, which would discard the file_count updates.
# NUL-delimited names keep paths with unusual characters intact.
while IFS= read -r -d '' file; do
    process_file "$file"
    file_count=$((file_count + 1))
done < <(find "$TARGET_DIR" -type f \( -name "*.vtt" -o -name "*.srt" -o -name "*.txt" \) -print0)

# remove all newlines from txt files and create a newline after each sentence
while IFS= read -r -d '' txt_file; do
    echo "Re-arranging newlines in: $txt_file"
    reflow_sentences "$txt_file"
done < <(find "$TARGET_DIR" -type f -name "*.txt" -print0)

# find txt files where a line is repeated consecutively more than twice.
# Report these files and the duplicated line; stage all other files with git.
while IFS= read -r -d '' txt_file; do
    dup_line=$(first_triple_repeat "$txt_file")
    if [ -n "$dup_line" ]; then
        echo "Warning: File '$txt_file' has a line repeated more than twice consecutively: \"$dup_line\""
    else
        # Stage relative to the file's own directory so this also works when
        # the script is not started from inside the target directory.
        txt_dir=$(dirname -- "$txt_file")
        basename_file=$(basename -- "$txt_file" .txt)
        stage_files=("$basename_file.txt")
        if [ -f "$txt_dir/$basename_file.vtt" ]; then
            stage_files+=("$basename_file.vtt")
        fi
        git -C "$txt_dir" add -- "${stage_files[@]}"
    fi
done < <(find "$TARGET_DIR" -type f -name "*.txt" -print0)

echo "Completed processing transcription files."
echo "Total files processed: $file_count"
10 changes: 8 additions & 2 deletions transcriptions/transcribe-folder.sh
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ mkdir -p "$OUTPUT_DIR"
WHISPER_BIN=~/workspace/whisper.cpp/build/bin/whisper-cli
WHISPER_MODEL=~/workspace/whisper.cpp/models/ggml-small.bin
#WHISPER_MODEL=~/workspace/whisper.cpp/models/ggml-large-v3-turbo-german-q5_1.bin
WHISPER_VAD_MODEL=~/workspace/whisper.cpp/models/ggml-silero-v5.1.2.bin
LANGUAGE=DE

# Global variables for batch operations
Expand Down Expand Up @@ -84,9 +85,14 @@ transcribe_file() {
echo "Transcribing: $(basename "$input_file")"
local transcribe_start=$(date +%s)

# Run transcription
local whisper_vad_option=""
pushd "$working_dir" > /dev/null
${WHISPER_BIN} -m ${WHISPER_MODEL} -f audio.wav -np -pp -otxt -ovtt -oj -ocsv -osrt -l ${LANGUAGE} > /dev/null
if [ -n "$WHISPER_VAD_MODEL" ]; then
if [ -f "$WHISPER_VAD_MODEL" ]; then
whisper_vad_option="--vad -vm $WHISPER_VAD_MODEL"
fi
fi
${WHISPER_BIN} -m ${WHISPER_MODEL} -f audio.wav -np -pp -otxt -ovtt -oj -ocsv -osrt ${whisper_vad_option} -l ${LANGUAGE}
popd > /dev/null

local transcribe_end=$(date +%s)
Expand Down