-
Notifications
You must be signed in to change notification settings - Fork 3
Expand file tree
/
Copy pathfix-mistakes.sh
More file actions
executable file
·121 lines (104 loc) · 3.6 KB
/
fix-mistakes.sh
File metadata and controls
executable file
·121 lines (104 loc) · 3.6 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
#!/bin/bash
#
# fix-mistakes.sh - repair frequent speech recognition mistakes in
# transcription files (.vtt, .srt, .txt).
#
# Usage: ./fix-mistakes.sh [directory]
#   directory  Folder to process; the current directory is used when omitted.

# Folder to work on: first argument, falling back to ".".
TARGET_DIR="${1:-.}"

# Stop right away when the target is not an existing directory.
[[ -d "$TARGET_DIR" ]] || {
  printf '%s\n' "Error: Directory '$TARGET_DIR' does not exist."
  exit 1
}

printf '%s\n' "Processing transcription files in: $TARGET_DIR"
# Lookup table of misrecognized text and the spelling it should become.
# Key   = the text as the recognizer wrote it (may contain spaces)
# Value = the replacement text
declare -A word_corrections=()

# Names/Nicknames, grouped by the spelling they are corrected to.
for misheard in Syrox syrox Syrux syrux cyrox Cyrox TCyroxx Cybrox; do
  word_corrections["$misheard"]="Cyroxx"
done
for misheard in Genie Joanie Jeanie Jeany Jeannie Jenny Dini; do
  word_corrections["$misheard"]="Gini"
done
for misheard in Knurfs Knops Knobbs Knorps Klof Knurbs Urnups; do
  word_corrections["$misheard"]="Knurps"
done
for misheard in Juwo Ayubo Ajuwo; do
  word_corrections["$misheard"]="Ajuvo"
done
# The loop variable is only scratch space; do not leave it behind.
unset misheard

# Add more common misrecognitions here as needed
word_corrections["Creative Comments"]="Creative Commons"
#######################################
# Apply every entry of word_corrections to a single transcription file.
# The edits are made on a temporary copy; the file itself is rewritten only
# when the copy ends up different from it.
# Globals:
#   word_corrections (read) - map of misrecognized text -> replacement
# Arguments:
#   $1 - path of the file to correct
# Outputs:
#   Progress messages on stdout, with one "Replaced" line for each
#   correction whose pattern was actually present.
# Returns:
#   0 on success (whether or not the file changed); 1 when the temporary
#   copy cannot be created or the corrected text cannot be written back.
#######################################
process_file() {
  local file="$1"
  local temp_file wrong_word correct_word

  echo "Processing: $file"

  # Assigned separately from 'local' so a mktemp failure is not masked.
  temp_file=$(mktemp) || return 1

  # Copy original content to temp file
  if ! cp -- "$file" "$temp_file"; then
    rm -f -- "$temp_file"
    return 1
  fi

  # Apply each correction
  for wrong_word in "${!word_corrections[@]}"; do
    correct_word="${word_corrections[$wrong_word]}"

    # Probe for this particular word before replacing it. Comparing the copy
    # with the original instead would report every later word as replaced
    # once any earlier word had changed the copy.
    # \b restricts the match to whole words, preventing partial replacements.
    if grep -q -e "\b$wrong_word\b" -- "$temp_file" &&
      sed -i -e "s/\b$wrong_word\b/$correct_word/g" -- "$temp_file"; then
      echo " - Replaced '$wrong_word' with '$correct_word'"
    fi
  done

  # If changes were made, update the original file
  if ! cmp -s -- "$file" "$temp_file"; then
    # Overwrite the content in place rather than mv-ing the temp file over
    # it: a file made by mktemp is private to its owner, and mv would carry
    # that mode over to the transcript.
    if ! cat -- "$temp_file" > "$file"; then
      rm -f -- "$temp_file"
      return 1
    fi
    rm -f -- "$temp_file"
    echo " ✓ File updated"
  else
    rm -f -- "$temp_file"
    echo " - No changes needed"
  fi
}
# Counter for processed files
file_count=0

# Process all supported transcription files.
# The loop reads from a process substitution instead of sitting at the end of
# a pipe: a piped 'while' runs in a subshell, so the increments to file_count
# would be lost and the final summary would always report 0.
# NUL-delimited paths with 'IFS= read -d ""' keep file names with surrounding
# blanks or embedded newlines intact.
# The -name filters below never match this script (.sh), so no explicit
# self-skip is needed.
while IFS= read -r -d '' file; do
  process_file "$file"
  file_count=$((file_count + 1))
done < <(find "$TARGET_DIR" -type f \( -name "*.vtt" -o -name "*.srt" -o -name "*.txt" \) -print0)
# Re-flow every .txt file: join all lines with spaces, then start a new line
# after each ". ", "! " or "? ".
# Paths are read NUL-delimited so unusual file names are passed on unchanged.
while IFS= read -r -d '' txt_file; do
  echo "Re-arranging newlines in: $txt_file"
  # Each step runs only when the previous one succeeded, so a failing tr or
  # sed can never move a truncated temporary file over the transcript.
  if ! {
    tr '\n' ' ' < "$txt_file" > "${txt_file}.tmp" &&
      sed -i 's/\. /\.\n/g; s/! /\!\n/g; s/? /\?\n/g' "${txt_file}.tmp" &&
      mv -- "${txt_file}.tmp" "$txt_file"
  }; then
    echo "Error: could not re-arrange newlines in '$txt_file'" >&2
    rm -f -- "${txt_file}.tmp"
  fi
done < <(find "$TARGET_DIR" -type f -name "*.txt" -print0)
# Find .txt files in which the same line occurs three or more times in a row.
# Such files are reported together with the repeated line; every other file
# is staged in git, along with its .vtt sibling when that exists.
while IFS= read -r -d '' txt_file; do
  # awk prints the first line that equals both of its predecessors and stops.
  dup_line=$(awk 'NR>2 && $0==prev && $0==prev2 {print $0; exit} {prev2=prev; prev=$0}' "$txt_file")
  if [[ -n "$dup_line" ]]; then
    echo "Warning: File '$txt_file' has a line repeated more than twice consecutively: \"$dup_line\""
  else
    # Keep the directory part of the path. Reducing it to the bare file name
    # made git look for the files in the current directory only.
    stem="${txt_file%.txt}"
    to_stage=("${stem}.txt")
    # git add fails on a path that matches no file, so the .vtt sibling is
    # passed only when it is really there.
    if [[ -f "${stem}.vtt" ]]; then
      to_stage+=("${stem}.vtt")
    fi
    git add -- "${to_stage[@]}"
  fi
done < <(find "$TARGET_DIR" -type f -name "*.txt" -print0)
# Closing summary: one line per argument via a single printf.
printf '%s\n' \
  "Completed processing transcription files." \
  "Total files processed: $file_count"