yaspp/transcriptions/fix-mistakes.sh at 534f660c35f27b761480e0ce44d8d2e4e6945cf3 · Chaostreff-Potsdam/yaspp · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
#!/bin/bash
# Script to fix common speech recognition mistakes in transcription files (.vtt, .srt, .txt)
# Usage: ./fix-mistakes.sh [directory]
# If no directory is provided, it will process the current directory

# Directory to scan. Falls back to the current directory when the script is
# called without an argument (or with an empty one).
TARGET_DIR="${1:-.}"

# Stop before doing any work if the target is not an existing directory.
# The message is a diagnostic, so it is written to stderr; stdout then only
# ever carries the regular progress output.
if [[ ! -d "$TARGET_DIR" ]]; then
    echo "Error: Directory '$TARGET_DIR' does not exist." >&2
    exit 1
fi

echo "Processing transcription files in: $TARGET_DIR"

# Table of misrecognized words/phrases and their corrections.
# This is a Bash associative array: each key is the wrong spelling and its
# value is the replacement, written as ["wrong"]="correct".
# process_file interpolates every key into a sed pattern between \b anchors
# without any flags other than g, so a key is matched as a whole word and
# case-sensitively; each case variant therefore needs its own entry.
# Keys and values are inserted into the sed expression unescaped, so they
# must not contain "/" or other characters that are special to sed.
declare -A word_corrections=(
    # Names/Nicknames
    ["Syrox"]="Cyroxx"
    ["syrox"]="Cyroxx"
    ["Syrux"]="Cyroxx"
    ["syrux"]="Cyroxx"
    ["cyrox"]="Cyroxx"
    ["Cyrox"]="Cyroxx"
    ["TCyroxx"]="Cyroxx"
    ["Cybrox"]="Cyroxx"
    ["Genie"]="Gini"
    ["Joanie"]="Gini"
    ["Jeanie"]="Gini"
    ["Jeany"]="Gini"
    ["Jeannie"]="Gini"
    ["Jenny"]="Gini"
    ["Dini"]="Gini"
    ["Knurfs"]="Knurps"
    ["Knops"]="Knurps"
    ["Knobbs"]="Knurps"
    ["Knorps"]="Knurps"
    ["Klof"]="Knurps"
    ["Knurbs"]="Knurps"
    ["Urnups"]="Knurps"
    ["Juwo"]="Ajuvo"
    ["Ayubo"]="Ajuvo"
    ["Ajuwo"]="Ajuvo"

    # Add more common misrecognitions here as needed
    # (multi-word keys work too: the space is matched literally)
    ["Creative Comments"]="Creative Commons"
)

# Apply every entry of the word_corrections table to a single file.
#
# Globals:   word_corrections (read) - associative array, wrong => correct
# Arguments: $1 - path of the transcription file to correct in place
# Outputs:   progress lines on stdout, failures on stderr
# Returns:   0 when the file was processed (changed or not); 1 when a temp
#            file could not be prepared or the result could not be written
process_file() {
    local file="$1"
    local current_copy next_copy swap
    local wrong_word correct_word
    local changes_made=false
    local status=0

    # Assigned separately from 'local' so a mktemp failure is not masked by
    # the exit status of the declaration itself.
    current_copy=$(mktemp) || return 1
    next_copy=$(mktemp) || { rm -f -- "$current_copy"; return 1; }

    echo "Processing: $file"

    # Work on a copy; the original is only touched once all corrections ran.
    # Bail out if the copy fails, otherwise an empty temp file would end up
    # being treated as the "corrected" content.
    if ! cp -- "$file" "$current_copy"; then
        rm -f -- "$current_copy" "$next_copy"
        return 1
    fi

    # Apply each correction
    for wrong_word in "${!word_corrections[@]}"; do
        correct_word="${word_corrections[$wrong_word]}"

        # \b (GNU sed) limits the match to whole words, which prevents
        # partial word replacements. The result goes to a second temp file so
        # the state before and after this one substitution can be compared.
        if ! sed "s/\b$wrong_word\b/$correct_word/g" "$current_copy" > "$next_copy"; then
            echo "  ! sed failed for '$wrong_word'; correction skipped" >&2
            continue
        fi

        # Report a replacement only when THIS substitution changed something.
        # Comparing against the untouched original instead would flag every
        # correction that runs after the first successful one.
        if ! cmp -s "$current_copy" "$next_copy"; then
            changes_made=true
            echo "  - Replaced '$wrong_word' with '$correct_word'"
            swap=$current_copy
            current_copy=$next_copy
            next_copy=$swap
        fi
    done

    # If changes were made, update the original file
    if [ "$changes_made" = true ]; then
        # Rewrite the content in place instead of moving the temp file over
        # the original: mktemp creates its file with owner-only permissions,
        # and a move would carry those over to the transcription file.
        if cat "$current_copy" > "$file"; then
            echo "  ✓ File updated"
        else
            echo "  ! Could not write '$file'" >&2
            status=1
        fi
    else
        echo "  - No changes needed"
    fi

    rm -f -- "$current_copy" "$next_copy"
    return "$status"
}

# Counter for processed files
file_count=0

# Process all supported transcription files.
# The loop is fed through process substitution rather than a pipe: a piped
# `while` runs in a subshell, so increments of file_count would be lost and
# the summary at the end of the script would always report 0.
# find emits NUL-terminated names and read consumes them verbatim, so paths
# containing spaces, backslashes or newlines are handled correctly.
# The script itself needs no special casing: its name ends in .sh and is
# therefore never matched by the patterns below.
while IFS= read -r -d '' file; do
    process_file "$file"
    file_count=$((file_count + 1))
done < <(find "$TARGET_DIR" -type f \( -name "*.vtt" -o -name "*.srt" -o -name "*.txt" \) -print0)

# Put the whole content of a text file on one line, then start a new line
# after every sentence.
#
# Arguments: $1 - path of the .txt file to rewrite in place
# Returns:   0 on success, 1 if any step failed
reflow_sentences() {
    local txt_file="$1"
    local flat_copy

    flat_copy=$(mktemp) || return 1

    # tr turns every newline into a space; sed then replaces the space after
    # ".", "!" and "?" with a newline (\n in the replacement is GNU sed).
    # The steps are chained so the original is only overwritten when both tr
    # and sed succeeded. Writing the content back with cat keeps the file's
    # existing permissions, and the mktemp name cannot collide with an
    # existing "<name>.txt.tmp".
    if tr '\n' ' ' < "$txt_file" > "$flat_copy" \
        && sed -i 's/\. /\.\n/g; s/! /\!\n/g; s/? /\?\n/g' "$flat_copy" \
        && cat "$flat_copy" > "$txt_file"; then
        rm -f -- "$flat_copy"
        return 0
    fi

    rm -f -- "$flat_copy"
    return 1
}

# remove all newlines from txt files and create a newline after each sentence
# (NUL-delimited find output keeps unusual file names intact)
while IFS= read -r -d '' txt_file; do
    echo "Re-arranging newlines in: $txt_file"
    if ! reflow_sentences "$txt_file"; then
        echo "Warning: could not re-arrange newlines in: $txt_file" >&2
    fi
done < <(find "$TARGET_DIR" -type f -name "*.txt" -print0)

# Print the first line of a file that occurs three times in a row.
#
# Arguments: $1 - file to scan
# Outputs:   the repeated line on stdout (this can be an empty line)
# Returns:   0 if such a line was found, non-zero otherwise
first_tripled_line() {
    # The exit status carries the result, so a run of three empty lines is
    # detected even though the printed text is empty.
    awk '
        NR > 2 && $0 == prev && $0 == prev2 { print; found = 1; exit }
        { prev2 = prev; prev = $0 }
        END { exit !found }
    ' "$1"
}

# find txt files where a line is repeated consecutively more than twice.
# Report these files and the duplicated line; stage all other files together
# with their .vtt counterpart.
while IFS= read -r -d '' txt_file; do
    if dup_line=$(first_tripled_line "$txt_file"); then
        echo "Warning: File '$txt_file' has a line repeated more than twice consecutively: \"$dup_line\""
    else
        # Use the path exactly as find reported it. Reducing it to the base
        # name would drop the directory and point git at the wrong file for
        # anything that is not located in the current working directory.
        git add -- "$txt_file" "${txt_file%.txt}.vtt"
    fi
done < <(find "$TARGET_DIR" -type f -name "*.txt" -print0)

# Closing summary: completion notice, then the value of the file counter.
printf '%s\n' "Completed processing transcription files."
printf 'Total files processed: %s\n' "$file_count"