|
5 | 5 | from each markdown file under ebook/en/content. Writes output to cli/data/commands.json |
6 | 6 | """ |
7 | 7 |
|
8 | | -from pathlib import Path |
9 | | -import re |
10 | 8 | import json |
| 9 | +import re |
11 | 10 | import sys |
| 11 | +from pathlib import Path |
12 | 12 |
|
13 | 13 |
|
14 | | -def extract_from_markdown(path: Path) -> dict: |
15 | | - text = path.read_text(encoding="utf-8") |
16 | | - lines = text.splitlines() |
17 | | - |
18 | | - # title: first heading line starting with '#' |
19 | | - title = None |
| 14 | +def _extract_title(lines: list[str]) -> str | None: |
| 15 | + """Extract the first heading line starting with '#'.""" |
20 | 16 | for ln in lines: |
21 | 17 | if ln.strip().startswith("#"): |
22 | | - title = ln.strip().lstrip("#").strip() |
23 | | - break |
| 18 | + return ln.strip().lstrip("#").strip() |
| 19 | + return None |
| 20 | + |
24 | 21 |
|
25 | | - # try to infer command name from code span in title like `ls` |
26 | | - name = None |
| 22 | +def _extract_command_name(title: str | None, path: Path) -> str: |
| 23 | + """Extract command name from title or fallback to filename pattern.""" |
27 | 24 | if title: |
| 25 | + # Try to infer command name from code span in title like `ls` |
28 | 26 | m = re.search(r"`([a-zA-Z0-9_\-]+)`", title) |
29 | 27 | if m: |
30 | | - name = m.group(1) |
31 | | - else: |
32 | | - # fallback: take last token of title |
33 | | - tokens = re.findall(r"\w+", title) |
34 | | - if tokens: |
35 | | - name = tokens[-1].lower() |
36 | | - |
37 | | - # fallback: filename pattern 001-the-ls-command.md -> ls |
38 | | - if not name: |
39 | | - m = re.search(r"-the-([a-zA-Z0-9_\-]+)-command", path.name) |
40 | | - if m: |
41 | | - name = m.group(1) |
42 | | - else: |
43 | | - name = path.stem |
| 28 | + return m.group(1) |
| 29 | + # Fallback: take last token of title |
| 30 | + tokens = re.findall(r"\w+", title) |
| 31 | + if tokens: |
| 32 | + return tokens[-1].lower() |
| 33 | + |
| 34 | + # Fallback: filename pattern 001-the-ls-command.md -> ls |
| 35 | + m = re.search(r"-the-([a-zA-Z0-9_\-]+)-command", path.name) |
| 36 | + if m: |
| 37 | + return m.group(1) |
| 38 | + |
| 39 | + return path.stem |
| 40 | + |
44 | 41 |
|
45 | | - # description: first paragraph after title |
46 | | - desc = "" |
| 42 | +def _extract_description(lines: list[str]) -> str: |
| 43 | + """Extract first paragraph after title.""" |
47 | 44 | try: |
48 | 45 | idx = 0 |
49 | 46 | while idx < len(lines) and not lines[idx].strip().startswith("#"): |
50 | 47 | idx += 1 |
51 | | - # move to next line after title |
| 48 | + # Move to next line after title |
52 | 49 | idx += 1 |
53 | | - # collect lines until empty line |
| 50 | + # Collect lines until empty line |
54 | 51 | para = [] |
55 | 52 | while idx < len(lines): |
56 | | - if lines[idx].strip() == "": |
| 53 | + line = lines[idx].strip() |
| 54 | + if line == "": |
57 | 55 | if para: |
58 | 56 | break |
59 | 57 | idx += 1 |
60 | 58 | continue |
61 | | - if lines[idx].strip().startswith("###") or lines[idx].strip().startswith( |
62 | | - "##" |
63 | | - ): |
| 59 | + if line.startswith("###") or line.startswith("##"): |
64 | 60 | if para: |
65 | 61 | break |
66 | | - para.append(lines[idx].strip()) |
| 62 | + para.append(line) |
67 | 63 | idx += 1 |
68 | | - desc = " ".join(para).strip() |
| 64 | + return " ".join(para).strip() |
69 | 65 | except Exception: |
70 | | - desc = "" |
| 66 | + return "" |
71 | 67 |
|
72 | | - # usage: look for 'Syntax' or 'Usage' section code block |
73 | | - usage = "" |
| 68 | + |
| 69 | +def _extract_usage(text: str) -> str: |
| 70 | + """Extract usage information from Syntax or Usage sections.""" |
74 | 71 | usage_match = re.search( |
75 | 72 | r"###\s*Syntax\s*:\s*\n(```[\s\S]*?```)", text, flags=re.IGNORECASE |
76 | 73 | ) |
77 | 74 | if usage_match: |
78 | | - usage = usage_match.group(1).strip().strip("`") |
79 | | - else: |
80 | | - m2 = re.search(r"Usage\s*[:|-]\s*(.+)", text, flags=re.IGNORECASE) |
81 | | - if m2: |
82 | | - usage = m2.group(1).strip() |
83 | | - |
84 | | - # example: prefer first fenced code block under Examples or first code block |
85 | | - example = "" |
| 75 | + return usage_match.group(1).strip().strip("`") |
| 76 | + |
| 77 | + m2 = re.search(r"Usage\s*[:|-]\s*(.+)", text, flags=re.IGNORECASE) |
| 78 | + if m2: |
| 79 | + return m2.group(1).strip() |
| 80 | + |
| 81 | + return "" |
| 82 | + |
| 83 | + |
| 84 | +def _extract_example(text: str) -> str: |
| 85 | + """Extract example from Examples section or first code block.""" |
86 | 86 | examples_section = re.search( |
87 | 87 | r"###\s*Examples[:\s]*\n([\s\S]*?)(?:\n###|\n##|$)", text, flags=re.IGNORECASE |
88 | 88 | ) |
89 | | - if examples_section: |
90 | | - # find first fenced code block inside |
91 | | - fb = re.search( |
92 | | - r"```(?:bash|sh|shell|).*?\n([\s\S]*?)```", |
93 | | - examples_section.group(1), |
94 | | - flags=re.IGNORECASE, |
95 | | - ) |
96 | | - if fb: |
97 | | - example = fb.group(1).strip() |
98 | | - else: |
99 | | - # fallback: first inline code occurrence |
100 | | - ic = re.search(r"`([^`]+)`", examples_section.group(1)) |
101 | | - if ic: |
102 | | - example = ic.group(1).strip() |
103 | | - |
104 | | - # notes: capture 'Additional Flags' or remaining list items |
105 | | - notes = "" |
| 89 | + if not examples_section: |
| 90 | + return "" |
| 91 | + |
| 92 | + # Find first fenced code block inside |
| 93 | + fb = re.search( |
| 94 | + r"```(?:bash|sh|shell|).*?\n([\s\S]*?)```", |
| 95 | + examples_section.group(1), |
| 96 | + flags=re.IGNORECASE, |
| 97 | + ) |
| 98 | + if fb: |
| 99 | + return fb.group(1).strip() |
| 100 | + |
| 101 | + # Fallback: first inline code occurrence |
| 102 | + ic = re.search(r"`([^`]+)`", examples_section.group(1)) |
| 103 | + if ic: |
| 104 | + return ic.group(1).strip() |
| 105 | + |
| 106 | + return "" |
| 107 | + |
| 108 | + |
| 109 | +def _extract_notes(text: str) -> str: |
| 110 | + """Extract notes from Additional Flags/Notes sections or bullet points.""" |
106 | 111 | notes_section = re.search( |
107 | 112 | r"###\s*(Additional Flags|Notes|Notes:|Additional).*?\n([\s\S]*?)(?:\n###|\n##|$)", |
108 | 113 | text, |
109 | 114 | flags=re.IGNORECASE, |
110 | 115 | ) |
111 | 116 | if notes_section: |
112 | | - notes = notes_section.group(2).strip() |
113 | | - else: |
114 | | - # collect bullet points |
115 | | - bullets = re.findall(r"^\s*[-\*]\s+(.+)$", text, flags=re.MULTILINE) |
116 | | - if bullets: |
117 | | - notes = "; ".join(bullets[:10]) |
| 117 | + return notes_section.group(2).strip() |
| 118 | + |
| 119 | + # Collect bullet points |
| 120 | + bullets = re.findall(r"^\s*[-\*]\s+(.+)$", text, flags=re.MULTILINE) |
| 121 | + if bullets: |
| 122 | + return "; ".join(bullets[:10]) |
| 123 | + |
| 124 | + return "" |
| 125 | + |
| 126 | + |
| 127 | +def extract_from_markdown(path: Path) -> dict: |
| 128 | + """Extract command information from a markdown file.""" |
| 129 | + text = path.read_text(encoding="utf-8") |
| 130 | + lines = text.splitlines() |
| 131 | + |
| 132 | + title = _extract_title(lines) |
| 133 | + name = _extract_command_name(title, path) |
| 134 | + desc = _extract_description(lines) |
| 135 | + usage = _extract_usage(text) |
| 136 | + example = _extract_example(text) |
| 137 | + notes = _extract_notes(text) |
118 | 138 |
|
119 | 139 | return { |
120 | 140 | "name": name, |
|
0 commit comments