Skip to content

Commit d2e5674

Browse files
committed
Harden link validation to fail CI on unresolved internal links
1 parent 1012c4a commit d2e5674

File tree

1 file changed

+39
-40
lines changed

1 file changed

+39
-40
lines changed

dist/validate-links.sh

Lines changed: 39 additions & 40 deletions
Original file line number | Diff line number | Diff line change
@@ -1,26 +1,14 @@
11
#!/bin/bash
22

3-
# Configuration
43
CONTENT_DIR="content"
54
EXIT_CODE=0
65

7-
strip_fenced_code_blocks() {
8-
awk '
9-
BEGIN { code = 0 }
10-
/^[[:space:]]*```/ { code = !code; next }
11-
/^[[:space:]]*~~~/ { code = !code; next }
12-
{ if (!code) print }
13-
' "$1"
14-
}
15-
166
normalize_link() {
177
local link="$1"
188

19-
# Remove anchor and query parameters
209
link="${link%%#*}"
2110
link="${link%%\?*}"
2211

23-
# Remove trailing slash
2412
if [[ "$link" != "/" ]]; then
2513
link="${link%/}"
2614
fi
@@ -31,70 +19,56 @@ normalize_link() {
3119
check_internal_link() {
3220
local link="$1"
3321
local file="$2"
22+
local line_no="$3"
3423
local clean_link
3524
local target_path
3625

3726
clean_link=$(normalize_link "$link")
3827

39-
# Skip empty or anchor-only links
40-
if [[ -z "$clean_link" || "$clean_link" == "#" ]]; then
41-
return 0
42-
fi
28+
[[ -z "$clean_link" || "$clean_link" == "#" ]] && return 0
4329

44-
# Skip Hugo shortcodes
4530
if [[ "$clean_link" == "{{<"* || "$clean_link" == "{{%"* || "$clean_link" == "{{"* ]]; then
4631
return 0
4732
fi
4833

49-
# Convert to lowercase for protocol checking (case-insensitive)
5034
local clean_link_lower="${clean_link,,}"
5135

52-
# Skip external links (case-insensitive)
5336
if [[ "$clean_link_lower" == http://* || "$clean_link_lower" == https://* || "$clean_link_lower" == "//"* ]]; then
5437
return 0
5538
fi
5639

57-
# Skip mailto, tel, javascript, data links
5840
case "$clean_link_lower" in
5941
mailto:*|tel:*|javascript:*|data:*)
6042
return 0
6143
;;
6244
esac
6345

64-
# Resolve target path based on link type
6546
if [[ "$clean_link" == /docs/* ]]; then
66-
# Hugo path: /docs/* → content/en/docs/*
6747
target_path="content/en${clean_link}"
6848
elif [[ "$clean_link" == /cn/docs/* ]]; then
69-
# Hugo path: /cn/docs/* → content/cn/docs/*
7049
target_path="content${clean_link}"
7150
elif [[ "$clean_link" == /* ]]; then
72-
# Absolute root path: /* → content/en/*
7351
target_path="content/en${clean_link}"
7452
else
75-
# Relative link: resolve against file directory
76-
local file_dir=$(dirname "$file")
53+
local file_dir
54+
file_dir=$(dirname "$file")
7755
target_path="${file_dir}/${clean_link}"
78-
79-
# Normalize path (remove redundant ./ and resolve ../)
56+
8057
while [[ "$target_path" == *"/./"* ]]; do
8158
target_path="${target_path//\/.\//\/}"
8259
done
83-
84-
# Basic .. resolution
60+
8561
while [[ "$target_path" =~ ([^/]+/\.\./?) ]]; do
8662
target_path="${target_path/${BASH_REMATCH[0]}/}"
8763
done
8864
fi
8965

90-
# Check asset files first (non-markdown extensions)
9166
case "$clean_link_lower" in
9267
*.png|*.jpg|*.jpeg|*.svg|*.gif|*.xml|*.yaml|*.yml|*.json|*.css|*.js|*.pdf|*.zip|*.tar.gz)
9368
[[ -f "$target_path" ]] && return 0
9469
;;
9570
esac
9671

97-
# Check for markdown file existence variations
9872
if [[ -f "${target_path}.md" ]]; then
9973
return 0
10074
elif [[ -f "$target_path" ]]; then
@@ -105,26 +79,51 @@ check_internal_link() {
10579
return 0
10680
fi
10781

108-
echo "Error: Broken link in $file"
82+
echo "Error: Broken link"
83+
echo " File: $file:$line_no"
10984
echo " Link: $link"
11085
echo " Target: $target_path (and variants)"
11186
EXIT_CODE=1
11287
}
11388

11489
echo "Starting link validation..."
11590

116-
# Find all markdown files and verify links
11791
while read -r FILE; do
118-
# Extract inline links [text](url) and check internal doc links
92+
declare -A CODE_LINES
93+
in_code=false
94+
line_no=0
95+
96+
# Pass 1: mark fenced code block lines
97+
while IFS= read -r line; do
98+
((line_no++))
99+
if [[ "$line" =~ ^[[:space:]]*(\`\`\`|~~~) ]]; then
100+
if $in_code; then
101+
in_code=false
102+
else
103+
in_code=true
104+
fi
105+
CODE_LINES[$line_no]=1
106+
elif $in_code; then
107+
CODE_LINES[$line_no]=1
108+
fi
109+
done < "$FILE"
110+
111+
# Pass 2: extract links with original line numbers
119112
while read -r MATCH; do
120-
if [[ -z "$MATCH" ]]; then continue; fi
113+
[[ -z "$MATCH" ]] && continue
114+
115+
LINE_NO="${MATCH%%:*}"
116+
LINK_PART="${MATCH#*:}"
121117

122-
# Extract URL from ](url)
123-
LINK="${MATCH#*](}"
118+
[[ ${CODE_LINES[$LINE_NO]} ]] && continue
119+
120+
LINK="${LINK_PART#*](}"
124121
LINK="${LINK%)}"
125122

126-
check_internal_link "$LINK" "$FILE"
127-
done < <(strip_fenced_code_blocks "$FILE" | grep -oE '\]\([^)]+\)')
123+
check_internal_link "$LINK" "$FILE" "$LINE_NO"
124+
done < <(grep -n -oE '\]\([^)]+\)' "$FILE")
125+
126+
unset CODE_LINES
128127
done < <(find "$CONTENT_DIR" -type f -name "*.md")
129128

130129
if [[ $EXIT_CODE -eq 0 ]]; then

0 commit comments

Comments
 (0)