Skip to content

Commit cbba938

Browse files
committed
Improve link validation script with canonical path handling and line numbers
1 parent d2e5674 commit cbba938

File tree

1 file changed

+118
-37
lines changed

1 file changed

+118
-37
lines changed

dist/validate-links.sh

Lines changed: 118 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -3,9 +3,21 @@
33
# Directory (relative to the repository root) that holds the markdown content.
CONTENT_DIR="content"
# Overall result of the run; set to 1 as soon as any link problem is reported.
EXIT_CODE=0

# Locate the repository from the script's own location so the script works no
# matter which directory it is invoked from.
#
# `pwd -P` is used deliberately: link targets are later resolved to physical
# paths (symlinks expanded), so the content root they are compared against
# must be physical as well. With a logical `pwd`, a checkout that lives under
# a symlinked directory would make every link look like it escapes the
# content directory.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd -P)"
REPO_ROOT="$(cd "$SCRIPT_DIR/.." && pwd -P)"
CONTENT_ROOT="$(cd "$REPO_ROOT/$CONTENT_DIR" && pwd -P)"

# CONTENT_ROOT is empty when the `cd` above failed, which also fails the -d test.
if [[ ! -d "$CONTENT_ROOT" ]]; then
  echo "Error: content directory not found. Run from repository root."
  exit 1
fi
14+
615
normalize_link() {
716
local link="$1"
817

18+
link="${link//%/\\x}"
19+
link="$(printf '%b' "$link")"
20+
921
link="${link%%#*}"
1022
link="${link%%\?*}"
1123

@@ -16,99 +28,168 @@ normalize_link() {
1628
printf "%s" "$link"
1729
}
1830

31+
#######################################
# Lexically normalizes a slash-separated path without touching the filesystem.
# Empty segments and "." are dropped; ".." removes the previous segment and is
# ignored when there is nothing left to remove (it never climbs above "/").
# Arguments: $1 - path to normalize
# Outputs:   the normalized path on stdout, always starting with "/" and
#            without a trailing newline; "/" when no segments remain
#######################################
canonicalize_path() {
  local path="$1"
  # Both arrays are local so repeated calls never leak state into the caller.
  local -a parts=() result=()
  local part
  local depth=0

  IFS='/' read -r -a parts <<< "$path"

  for part in "${parts[@]}"; do
    case "$part" in
      '' | .)
        # "//" and "/./" contribute nothing.
        ;;
      ..)
        # Pop via an explicit depth counter; negative subscripts in `unset`
        # are only available in newer bash releases.
        if (( depth > 0 )); then
          depth=$(( depth - 1 ))
          unset "result[$depth]"
        fi
        ;;
      *)
        result[depth]="$part"
        depth=$(( depth + 1 ))
        ;;
    esac
  done

  if (( depth == 0 )); then
    printf '/'
  else
    # Local IFS makes "${result[*]}" join with "/" and is restored on return.
    local IFS='/'
    printf '/%s' "${result[*]}"
  fi
}
56+
57+
#######################################
# Resolves a path to its real location, expanding symlinks where possible.
# Uses python3's os.path.realpath when python3 is available, because it also
# accepts paths whose final components do not exist. If python3 is missing,
# exits non-zero, or prints nothing, the lexical result of canonicalize_path
# is used instead so the caller always receives a usable path.
# Arguments: $1 - path to resolve
# Outputs:   the resolved path on stdout
#######################################
resolve_real_path() {
  local path="$1"
  local resolved=""

  if command -v python3 >/dev/null 2>&1; then
    # stdin is redirected so the interpreter can never consume input that
    # belongs to a surrounding `while read` loop.
    resolved="$(
      python3 -c 'import os, sys; print(os.path.realpath(sys.argv[1]))' \
        "$path" </dev/null
    )" || resolved=""
  fi

  if [[ -n "$resolved" ]]; then
    printf '%s\n' "$resolved"
  else
    # python3 unavailable or unusable: fall back to pure-shell normalization.
    canonicalize_path "$path"
  fi
}
74+
1975
#######################################
# Prints a link error report and marks the run as failed.
# Globals:   EXIT_CODE (written, set to 1)
# Arguments: $1 - headline message
#            $2 - file containing the link
#            $3 - line number of the link (may be empty)
#            $4 - link text as written in the file
#            $5 - resolved target path (optional; omitted from the report
#                 when not passed)
# Outputs:   the report on stdout
#######################################
_report_link_error() {
  local message="$1"
  local file="$2"
  local line_no="$3"
  local link="$4"
  local location="$file"

  if [[ -n "$line_no" ]]; then
    location="$file:$line_no"
  fi

  echo "$message"
  echo " File: $location"
  echo " Link: $link"
  if (( $# >= 5 )); then
    echo " Target: $5"
  fi
  EXIT_CODE=1
}

#######################################
# Validates a single markdown link found in a content file.
# External URLs, special schemes, template shortcodes and pure anchors are
# accepted without further checks. Every other link is mapped to a path
# under the content root, normalized, and must (a) stay inside the content
# root and (b) exist as a file, a "<target>.md" file, or a directory holding
# "_index.md" or "README.md".
# Globals:   CONTENT_ROOT (read), EXIT_CODE (written on failure)
# Arguments: $1 - link as written in the file
#            $2 - path of the file containing the link
#            $3 - line number of the link (may be empty)
# Outputs:   an error report on stdout for broken or escaping links
# Returns:   0; failures are signalled through EXIT_CODE
#######################################
check_internal_link() {
  local link="$1"
  local file="$2"
  local line_no="$3"
  local clean_link
  local clean_lower
  local target_path
  local file_dir

  clean_link="$(normalize_link "$link")"

  # Nothing left after stripping fragment/query: same-page anchor.
  [[ -z "$clean_link" || "$clean_link" == "#" ]] && return 0

  # Template shortcodes are expanded by the site generator, not checkable here.
  if [[ "$clean_link" == "{{"* ]]; then
    return 0
  fi

  clean_lower="${clean_link,,}"

  # Absolute/protocol-relative URLs and non-file schemes are out of scope.
  case "$clean_lower" in
    http://* | https://* | "//"* | mailto:* | tel:* | javascript:* | data:*)
      return 0
      ;;
  esac

  case "$clean_link" in
    /cn/docs/*)
      # Already carries its language directory.
      target_path="$CONTENT_ROOT${clean_link}"
      ;;
    /*)
      # Every other site-absolute link (including /docs/*) maps into "en".
      target_path="$CONTENT_ROOT/en${clean_link}"
      ;;
    *)
      # Relative link: resolve against the directory of the containing file.
      file_dir="$(cd "$(dirname "$file")" && pwd)"
      target_path="$file_dir/$clean_link"
      ;;
  esac

  target_path="$(canonicalize_path "$target_path")"
  target_path="$(resolve_real_path "$target_path")"

  # Reject anything that escapes the content tree after normalization.
  case "$target_path" in
    "$CONTENT_ROOT"/*) ;;
    *)
      _report_link_error "Error: Link resolves outside content directory" \
        "$file" "$line_no" "$link"
      return 0
      ;;
  esac

  # Static assets must exist exactly as referenced; no ".md"/index variants.
  case "$clean_lower" in
    *.png | *.jpg | *.jpeg | *.svg | *.gif | *.xml | *.yaml | *.yml | *.json | *.css | *.js | *.pdf | *.zip | *.tar.gz)
      if [[ ! -f "$target_path" ]]; then
        _report_link_error "Error: Broken link" \
          "$file" "$line_no" "$link" "$target_path"
      fi
      return 0
      ;;
  esac

  if [[ -f "$target_path" || -f "$target_path.md" ||
    -f "$target_path/_index.md" || -f "$target_path/README.md" ]]; then
    return 0
  fi

  _report_link_error "Error: Broken link" \
    "$file" "$line_no" "$link" "$target_path"
}
88161

89162
echo "Starting link validation..."
90163

91164
while read -r FILE; do
92165
declare -A CODE_LINES
93-
in_code=false
166+
in_fence=false
94167
line_no=0
95168

96-
# Pass 1: mark fenced code block lines
97169
while IFS= read -r line; do
98170
((line_no++))
171+
99172
if [[ "$line" =~ ^[[:space:]]*(\`\`\`|~~~) ]]; then
100-
if $in_code; then
101-
in_code=false
173+
if $in_fence; then
174+
in_fence=false
102175
else
103-
in_code=true
176+
in_fence=true
104177
fi
105178
CODE_LINES[$line_no]=1
106-
elif $in_code; then
179+
continue
180+
fi
181+
182+
if $in_fence; then
183+
CODE_LINES[$line_no]=1
184+
continue
185+
fi
186+
187+
inline_count=$(grep -o "\`" <<< "$line" | wc -l)
188+
if (( inline_count % 2 == 1 )); then
107189
CODE_LINES[$line_no]=1
108190
fi
109191
done < "$FILE"
110192

111-
# Pass 2: extract links with original line numbers
112193
while read -r MATCH; do
113194
[[ -z "$MATCH" ]] && continue
114195

@@ -124,7 +205,7 @@ while read -r FILE; do
124205
done < <(grep -n -oE '\]\([^)]+\)' "$FILE")
125206

126207
unset CODE_LINES
127-
done < <(find "$CONTENT_DIR" -type f -name "*.md")
208+
done < <(find "$CONTENT_ROOT" -type f -name "*.md")
128209

129210
if [[ $EXIT_CODE -eq 0 ]]; then
130211
echo "Link validation passed!"

0 commit comments

Comments
 (0)