-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathsort.sh
More file actions
64 lines (55 loc) · 1.53 KB
/
sort.sh
File metadata and controls
64 lines (55 loc) · 1.53 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
#!/bin/bash
#
# sort.sh - validate, check for conflicting entries in, and sort a
# tab-separated pronunciation dictionary in place.
#
# Usage: sort.sh [FILE]
#   FILE  dictionary file to process (default: DictFix.txt in the current
#         directory)
#
# Exit status:
#   2  the file contains format errors
#   1  a word is listed with two different pronunciations
#   otherwise the status of the final sort
#
# Created with help of ChatGPT.

# Force the C locale so that sorting (punctuation included) is byte-wise and
# therefore reproducible regardless of the caller's locale settings.
export LC_ALL=C

# File that is validated and then rewritten in place.
input_file="${1:-DictFix.txt}"
# Format validation
#
# Every non-blank line must consist of exactly two tab-separated fields:
#   field 1: letters, hyphens or apostrophes only
#   field 2: the characters A-Z, a-z, 0-9, "@", "^" and "-" only
# A trailing CR is ignored so that CRLF files validate as well. Every offending
# line is reported on stderr; the script then aborts with status 2 before the
# file is modified.

# Report a missing/unreadable file as such instead of as a format error.
if [[ ! -f "$input_file" || ! -r "$input_file" ]]; then
  printf 'Aborting: cannot read input file "%s".\n' "$input_file" >&2
  exit 2  # same status as the format-error path below
fi

awk -F'\t' '
{
  sub(/\r$/, "", $0)                 # strip CR for CRLF files
  if ($0 ~ /^[[:space:]]*$/) next    # skip blank lines

  if (NF != 2) {
    printf("Format error (line %d): expected exactly 2 tab-separated fields\n >> %s\n\n", NR, $0) > "/dev/stderr"
    bad_fmt = 1
    next    # the per-field checks need exactly two fields
  }
  # The hyphen is written last in each bracket expression so it is literal.
  if ($1 !~ /^[A-Za-z'\''-]+$/) {
    printf("Format error (line %d): first field must be letters, hyphens or apostrophes only\n >> %s\n\n", NR, $0) > "/dev/stderr"
    bad_fmt = 1
  }
  if ($2 !~ /^[A-Za-z0-9@^-]+$/) {
    printf("Format error (line %d): second field must be WiscBet characters only\n >> %s\n\n", NR, $0) > "/dev/stderr"
    bad_fmt = 1
  }
}
END {
  if (bad_fmt) exit 2
}
' "$input_file"
fmt_status=$?

if ((fmt_status == 2)); then
  echo "Aborting: format errors detected. Fix the lines above and rerun." >&2
  exit 2
elif ((fmt_status != 0)); then
  # Any other failure means the validation itself did not complete; do not
  # carry on to the steps that rewrite the file.
  printf 'Aborting: format validation failed (awk exit status %d).\n' "$fmt_status" >&2
  exit "$fmt_status"
fi
# Check for duplicates with different pronunciations
#
# Remembers the first pronunciation seen for each word (field 1) and reports,
# on stderr, every later line that gives the same word a different
# pronunciation (field 2). Exact duplicate lines are not conflicts. The script
# aborts with status 1 if any conflict was found.
awk -F'\t' '
{
  sub(/\r$/, "", $0)                 # strip CR so CRLF and LF lines compare equal
  if ($0 ~ /^[[:space:]]*$/) next    # skip blank lines

  # Membership test rather than truthiness: a pronunciation such as "0"
  # is a valid stored value but evaluates as false.
  if (!($1 in seen)) {
    seen[$1] = $2
  } else if ((seen[$1] "") != ($2 "")) {
    # Appending "" forces a string comparison; otherwise fields that look
    # numeric (e.g. "10" and "010") would be compared as numbers.
    print "Conflict: word \"" $1 "\" has pronunciations \"" seen[$1] "\" and \"" $2 "\"" > "/dev/stderr"
    conflict_found = 1
  }
}
END {
  if (conflict_found) {
    exit 1
  }
}' "$input_file"
dup_status=$?

# Check if conflicts were found
if ((dup_status == 1)); then
  echo "Deduplication failed due to conflicting pronunciations." >&2
  # Pause so the messages stay visible when run from a window that closes on exit.
  read -r -p "Press Enter to continue..."
  exit 1
elif ((dup_status != 0)); then
  # The check itself did not complete; do not go on to rewrite the file.
  printf 'Aborting: duplicate check failed (awk exit status %d).\n' "$dup_status" >&2
  exit "$dup_status"
fi
# Sort the file in place and drop exact duplicate lines. Ordering follows the
# LC_ALL value exported at the top of the script. Options come before the
# operand (and "--" ends option parsing) so the command does not depend on
# option permutation. Line endings are not normalised here, so lines that
# differ only by a trailing CR remain distinct.
sort -u -o "$input_file" -- "$input_file"