Skip to content

Commit ac43d50

Browse files
committed
Improve link validator robustness and diagnostics
1 parent 60e2878 commit ac43d50

File tree

1 file changed

+60
-24
lines changed

1 file changed

+60
-24
lines changed

dist/validate-links.sh

Lines changed: 60 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -3,9 +3,17 @@
33
CONTENT_DIR="content"
44
EXIT_CODE=0
55

6-
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
7-
REPO_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
8-
CONTENT_ROOT="$(cd "$REPO_ROOT/$CONTENT_DIR" && pwd)"
6+
VERBOSE="${VERBOSE:-0}"
7+
8+
# Print an "Info:"-prefixed diagnostic line to stdout when VERBOSE is exactly "1".
# Globals:   VERBOSE (read; treated as "0" when unset or empty)
# Arguments: message words, expanded via "$*"
# Returns:   always 0, so a quiet (non-verbose) call is never mistaken for a
#            failure by the caller, by `set -e`, or by an ERR trap.
log_verbose() {
  if [[ "${VERBOSE:-0}" == "1" ]]; then
    echo "Info: $*"
  fi
  return 0
}
11+
12+
13+
# Alternation of file extensions that are validated as plain files (static assets).
# The value is interpolated into an extended regex of the form \.(…)$, so any
# literal dot inside an entry must be escaped (hence tar\.gz, not tar.gz).
ASSET_EXTENSIONS_REGEX='png|jpg|jpeg|svg|gif|webp|avif|ico|xml|yaml|yml|json|css|js|pdf|zip|tar\.gz|woff|woff2|ttf|eot|mp4|webm'
14+
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" || exit 1
15+
REPO_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)" || exit 1
16+
CONTENT_ROOT="$(cd "$REPO_ROOT/$CONTENT_DIR" && pwd)" || exit 1
917

1018
if [[ ! -d "$CONTENT_ROOT" ]]; then
1119
echo "Error: content directory not found. Run from repository root."
@@ -15,6 +23,12 @@ fi
1523
normalize_link() {
1624
local link="$1"
1725

26+
# Decode common URL-encoded characters explicitly
27+
link="${link//%20/ }" # space
28+
link="${link//%23/#}" # hash
29+
link="${link//%2F/\/}" # forward slash
30+
31+
# Generic percent-decoding for remaining cases
1832
link="${link//%/\\x}"
1933
link="$(printf '%b' "$link")"
2034

@@ -58,16 +72,17 @@ resolve_real_path() {
5872
local path="$1"
5973

6074
if command -v python3 >/dev/null 2>&1; then
61-
# Use python to compute realpath which is tolerant of non existing final target
75+
# Use Python to compute realpath which resolves symlinks AND normalizes paths
76+
# Python's os.path.realpath is tolerant of non-existent final targets
6277
python3 - <<'PY' "$path"
6378
import os
6479
import sys
6580
p = sys.argv[1]
66-
# os.path.realpath resolves symlinks for existing components and otherwise returns a normalized path
6781
print(os.path.realpath(p))
6882
PY
6983
else
70-
# Fallback to the safe canonicalize_path output if python3 is not available
84+
# Fallback: Normalize without symlink resolution if Python3 unavailable
85+
# Note: This won't resolve symlinks, only normalize .. and . components
7186
canonicalize_path "$path"
7287
fi
7388
}
@@ -85,12 +100,14 @@ check_internal_link() {
85100
[[ -z "$clean_link" || "$clean_link" == "#" ]] && return 0
86101

87102
if [[ "$clean_link" == "{{"* ]]; then
103+
log_verbose "Skipping Hugo shortcode link: $link ($file:$line_no)"
88104
return 0
89105
fi
90106

91107
local clean_lower="${clean_link,,}"
92108

93109
if [[ "$clean_lower" == http://* || "$clean_lower" == https://* || "$clean_lower" == "//"* ]]; then
110+
log_verbose "Skipping external link: $link ($file:$line_no)"
94111
return 0
95112
fi
96113

@@ -105,7 +122,14 @@ check_internal_link() {
105122
elif [[ "$clean_link" == /cn/docs/* ]]; then
106123
target_path="$CONTENT_ROOT${clean_link}"
107124
elif [[ "$clean_link" == /* ]]; then
108-
target_path="$CONTENT_ROOT/en${clean_link}"
125+
# Skip validation for ambiguous absolute paths (Hugo runtime URLs)
126+
location="$file"
127+
[[ -n "$line_no" ]] && location="$file:$line_no"
128+
echo "Warning: Skipping validation for ambiguous absolute path"
129+
echo " File: $location"
130+
echo " Link: $link"
131+
echo " Reason: Hugo runtime URL (not directly mappable to filesystem)"
132+
return 0
109133
else
110134
local file_dir
111135
file_dir="$(cd "$(dirname "$file")" && pwd)"
@@ -128,22 +152,20 @@ check_internal_link() {
128152
;;
129153
esac
130154

131-
case "$clean_lower" in
132-
*.png|*.jpg|*.jpeg|*.svg|*.gif|*.xml|*.yaml|*.yml|*.json|*.css|*.js|*.pdf|*.zip|*.tar.gz)
133-
if [[ -f "$target_path" ]]; then
134-
return 0
135-
else
136-
location="$file"
137-
[[ -n "$line_no" ]] && location="$file:$line_no"
138-
echo "Error: Broken link"
139-
echo " File: $location"
140-
echo " Link: $link"
141-
echo " Target: $target_path"
142-
EXIT_CODE=1
143-
return
144-
fi
145-
;;
146-
esac
155+
if [[ "$clean_lower" =~ \.(${ASSET_EXTENSIONS_REGEX})$ ]]; then
156+
if [[ -f "$target_path" ]]; then
157+
return 0
158+
else
159+
location="$file"
160+
[[ -n "$line_no" ]] && location="$file:$line_no"
161+
echo "Error: Broken link"
162+
echo " File: $location"
163+
echo " Link: $link"
164+
echo " Target: $target_path"
165+
EXIT_CODE=1
166+
return
167+
fi
168+
fi
147169

148170
if [[ -f "$target_path" || -f "$target_path.md" || -f "$target_path/_index.md" || -f "$target_path/README.md" ]]; then
149171
return 0
@@ -170,6 +192,12 @@ while read -r FILE; do
170192
((line_no++))
171193

172194
if [[ "$line" =~ ^[[:space:]]*(\`\`\`|~~~) ]]; then
195+
# NOTE:
196+
# Code fence detection assumes fences are properly paired.
197+
# If a Markdown file contains an unclosed or mismatched fence,
198+
# subsequent content may be treated as code and skipped.
199+
# This script does not attempt full Markdown validation.
200+
173201
if $in_fence; then
174202
in_fence=false
175203
else
@@ -184,10 +212,18 @@ while read -r FILE; do
184212
continue
185213
fi
186214

187-
inline_count=$(grep -o "\`" <<< "$line" | wc -l)
215+
# NOTE:
216+
# Inline code detection is heuristic.
217+
# It assumes backticks are paired on the same line.
218+
# Escaped backticks are ignored, but complex or malformed
219+
# Markdown inline code spans may still be misdetected.
220+
221+
escaped_line="${line//\\\`/}"
222+
inline_count=$(grep -o "\`" <<< "$escaped_line" | wc -l)
188223
if (( inline_count % 2 == 1 )); then
189224
CODE_LINES[$line_no]=1
190225
fi
226+
191227
done < "$FILE"
192228

193229
while read -r MATCH; do

0 commit comments

Comments
 (0)