1+ import os
2+ import re
3+ import pandas as pd
4+ from pathlib import Path
5+ from typing import List , Tuple
6+
# Front matter is a block opened by `---` on the very first line of the
# document and closed by the next line that starts with `---`. The optional
# `\r` lets documents with Windows (CRLF) line endings match as well.
_FRONT_MATTER_RE = re.compile(r"^---\r?\n(.*?)\r?\n---", re.DOTALL)


def parse_markdown(content: str) -> List[Tuple[str, str]]:
    """Split a Markdown document into its front matter and its body.

    Args:
        content: Full text of a Markdown (.md/.mdx) document.

    Returns:
        A one-element list holding a ``(metadata, body)`` tuple. When the
        document starts with a ``---`` delimited front-matter block,
        ``metadata`` is the stripped text between the delimiters and ``body``
        is the stripped text after the closing delimiter. Otherwise
        ``metadata`` is ``""`` and ``body`` is ``content`` unchanged.
    """
    front_matter = _FRONT_MATTER_RE.search(content)
    if front_matter is None:
        return [("", content)]

    metadata_str = front_matter.group(1).strip()
    # Slice at the end of the match instead of splitting on "---": a split
    # cuts in the wrong place whenever the metadata itself contains "---".
    content_extract = content[front_matter.end():].strip()
    return [(metadata_str, content_extract)]
21+
def process_markdown_files(directory: str, output_csv: str) -> None:
    """Collect every Markdown file under a directory into a single CSV file.

    Walks ``directory`` recursively, reads each ``.md``/``.mdx`` file as
    UTF-8, splits it with ``parse_markdown`` and writes one row per parsed
    entry with the columns ``Filename``, ``Metadata`` and ``Contents``.
    Progress and errors are reported with ``print``; a file that cannot be
    read or decoded is reported and skipped.

    Args:
        directory: Root directory to search for Markdown files.
        output_csv: Path of the CSV file to write.

    Returns:
        None. Nothing is written when ``directory`` is not an existing
        directory.
    """
    directory_path = Path(directory)
    # is_dir() rather than exists(): a path that points at a regular file
    # would otherwise walk nothing and silently produce a header-only CSV.
    if not directory_path.is_dir():
        print(f"Directory not found: {directory} ")
        return

    rows = []
    for root, dirnames, filenames in os.walk(directory):
        # os.walk yields entries in filesystem order; sorting the directory
        # list in place (which also steers the traversal) and the file names
        # makes the row order of the CSV reproducible.
        dirnames.sort()
        for filename in sorted(filenames):
            file_path = Path(root) / filename
            if file_path.suffix.lower() not in (".md", ".mdx"):
                continue

            print(f"Processing file: {file_path} ")
            try:
                with open(file_path, mode="r", encoding="utf-8") as md_file:
                    text = md_file.read()
            except (OSError, UnicodeError) as e:
                # Best effort: report the unreadable file and keep going.
                print(f"Error processing file {file_path} : {e} ")
                continue

            for metadata_, body in parse_markdown(text):
                rows.append([file_path.name, metadata_, body])

    pd_data = pd.DataFrame(rows, columns=["Filename", "Metadata", "Contents"])
    pd_data.to_csv(output_csv, index=False)
49+
if __name__ == "__main__":
    # Define the directory containing .md/.mdx files and the output CSV file
    directory = r'../docs'
    output_csv = "docs.csv"

    # Process the files
    process_markdown_files(directory, output_csv)
    # process_markdown_files only prints a notice and writes nothing when the
    # input directory is missing, so report success only if it was there.
    if Path(directory).is_dir():
        print(f"CSV file created at: {output_csv} ")
0 commit comments