Skip to content

Commit b4c2fa8

Browse files
committed
fix(format): improve paths and parser
1 parent ed0ba0f commit b4c2fa8

File tree

2 files changed

+22
-26
lines changed

2 files changed

+22
-26
lines changed

automations/.gitignore

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -264,4 +264,5 @@ fabric.properties
264264

265265
.ruff_cache
266266

267-
issues.md
267+
issues.md
268+
*.csv

automations/docs_to_bigquery.py

Lines changed: 20 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -4,54 +4,49 @@
44

55
import pandas as pd
66

7+
MARKDOWN_EXTENSION = [".md", ".mdx"]
78

8-
def parse_markdown(content: str) -> list[tuple[str, str]]:
9-
parsed_content = []
10-
pattern = re.compile(r"^---\n(.*?)\n---", re.DOTALL)
11-
metadata_match = pattern.search(content)
12-
13-
if metadata_match:
14-
metadata_str = metadata_match.group(1).strip()
15-
content_extract = content.split("---", 2)[-1].strip()
16-
parsed_content.append((metadata_str, content_extract))
17-
return parsed_content
189

19-
parsed_content.append(("", content))
20-
return parsed_content
10+
def parse_markdown(content: str) -> list[tuple[str, str]]:
11+
pattern = re.compile(r"^---\n(.*?)\n---\n(.*)", re.DOTALL)
12+
match = pattern.match(content)
13+
return (
14+
[(match.group(1).strip(), match.group(2).strip())]
15+
if match
16+
else [("", content.strip())]
17+
)
2118

2219

23-
def process_markdown_files(directory: str, output_csv: str) -> None:
24-
directory_path = Path(directory)
25-
if not directory_path.exists():
20+
def process_markdown_files(directory: Path, output_csv: Path) -> None:
21+
if not directory.exists():
2622
print(f"Directory not found: {directory}")
2723
return
2824

29-
file_content = []
25+
file_content: list[list[str]] = []
3026
for root, _, files in os.walk(directory):
3127
for file in files:
3228
file_path = Path(root) / file
33-
if file_path.suffix.lower() not in [".md", ".mdx"]:
29+
if file_path.suffix.lower() not in MARKDOWN_EXTENSION:
3430
continue
3531

3632
print(f"Processing file: {file_path}")
3733
try:
38-
with Path.open(file_path, encoding="utf-8") as md_file:
39-
content = md_file.read()
40-
parsed_content = parse_markdown(content)
34+
content = file_path.read_text(encoding="utf-8")
35+
parsed_content = parse_markdown(content)
4136

42-
for metadata_, content in parsed_content:
43-
file_content.append([file_path.name, metadata_, content])
37+
for metadata_, content in parsed_content:
38+
file_content.append([file_path.name, metadata_, content])
4439

4540
except OSError as e:
4641
print(f"Error processing file {file_path}: {e}")
47-
pd_data = pd.DataFrame(file_content, columns=["Filename", "Metadata", "Contents"])
42+
pd_data = pd.DataFrame(file_content, columns=["Filename", "Metadata", "Contents"]) # pyright: ignore [reportArgumentType]
4843
pd_data.to_csv(output_csv, index=False)
4944

5045

5146
if __name__ == "__main__":
5247
# Define the directory containing .md/.mdx files and the output CSV file
53-
directory = r"../docs"
54-
output_csv = "docs.csv"
48+
directory = Path("..", "docs")
49+
output_csv = Path("docs.csv")
5550

5651
# Process the files
5752
process_markdown_files(directory, output_csv)

0 commit comments

Comments (0)