|
4 | 4 |
|
5 | 5 | import pandas as pd |
6 | 6 |
|
| 7 | +MARKDOWN_EXTENSION = [".md", ".mdx"] |
7 | 8 |
|
8 | | -def parse_markdown(content: str) -> list[tuple[str, str]]: |
9 | | - parsed_content = [] |
10 | | - pattern = re.compile(r"^---\n(.*?)\n---", re.DOTALL) |
11 | | - metadata_match = pattern.search(content) |
12 | | - |
13 | | - if metadata_match: |
14 | | - metadata_str = metadata_match.group(1).strip() |
15 | | - content_extract = content.split("---", 2)[-1].strip() |
16 | | - parsed_content.append((metadata_str, content_extract)) |
17 | | - return parsed_content |
18 | 9 |
|
19 | | - parsed_content.append(("", content)) |
20 | | - return parsed_content |
| 10 | +def parse_markdown(content: str) -> list[tuple[str, str]]: |
| 11 | + pattern = re.compile(r"^---\n(.*?)\n---\n(.*)", re.DOTALL) |
| 12 | + match = pattern.match(content) |
| 13 | + return ( |
| 14 | + [(match.group(1).strip(), match.group(2).strip())] |
| 15 | + if match |
| 16 | + else [("", content.strip())] |
| 17 | + ) |
21 | 18 |
|
22 | 19 |
|
23 | | -def process_markdown_files(directory: str, output_csv: str) -> None: |
24 | | - directory_path = Path(directory) |
25 | | - if not directory_path.exists(): |
| 20 | +def process_markdown_files(directory: Path, output_csv: Path) -> None: |
| 21 | + if not directory.exists(): |
26 | 22 | print(f"Directory not found: {directory}") |
27 | 23 | return |
28 | 24 |
|
29 | | - file_content = [] |
| 25 | + file_content: list[list[str]] = [] |
30 | 26 | for root, _, files in os.walk(directory): |
31 | 27 | for file in files: |
32 | 28 | file_path = Path(root) / file |
33 | | - if file_path.suffix.lower() not in [".md", ".mdx"]: |
| 29 | + if file_path.suffix.lower() not in MARKDOWN_EXTENSION: |
34 | 30 | continue |
35 | 31 |
|
36 | 32 | print(f"Processing file: {file_path}") |
37 | 33 | try: |
38 | | - with Path.open(file_path, encoding="utf-8") as md_file: |
39 | | - content = md_file.read() |
40 | | - parsed_content = parse_markdown(content) |
| 34 | + content = file_path.read_text(encoding="utf-8") |
| 35 | + parsed_content = parse_markdown(content) |
41 | 36 |
|
42 | | - for metadata_, content in parsed_content: |
43 | | - file_content.append([file_path.name, metadata_, content]) |
| 37 | + for metadata_, content in parsed_content: |
| 38 | + file_content.append([file_path.name, metadata_, content]) |
44 | 39 |
|
45 | 40 | except OSError as e: |
46 | 41 | print(f"Error processing file {file_path}: {e}") |
47 | | - pd_data = pd.DataFrame(file_content, columns=["Filename", "Metadata", "Contents"]) |
| 42 | + pd_data = pd.DataFrame(file_content, columns=["Filename", "Metadata", "Contents"]) # pyright: ignore [reportArgumentType] |
48 | 43 | pd_data.to_csv(output_csv, index=False) |
49 | 44 |
|
50 | 45 |
|
51 | 46 | if __name__ == "__main__": |
52 | 47 | # Define the directory containing .md/.mdx files and the output CSV file |
53 | | - directory = r"../docs" |
54 | | - output_csv = "docs.csv" |
| 48 | + directory = Path("..", "docs") |
| 49 | + output_csv = Path("docs.csv") |
55 | 50 |
|
56 | 51 | # Process the files |
57 | 52 | process_markdown_files(directory, output_csv) |
|
0 commit comments