11import os
22import re
3- import pandas as pd
43from pathlib import Path
5- from typing import List , Tuple
64
7- def parse_markdown (content : str ) -> List [Tuple [str , str ]]:
8-
5+ import pandas as pd
6+
7+
def parse_markdown(content: str) -> list[tuple[str, str]]:
    """Split a Markdown document into its front matter and its body.

    Front matter is recognised only when ``content`` itself begins with a
    ``---`` line: the pattern is anchored with ``^`` and compiled without
    ``re.MULTILINE``. ``re.DOTALL`` lets the metadata span several lines.

    Args:
        content: Full text of a Markdown document.

    Returns:
        A one-element list containing ``(metadata, body)``. When front
        matter is found, both parts are stripped of surrounding whitespace.
        When it is not, ``metadata`` is ``""`` and ``body`` is ``content``
        unchanged.
    """
    pattern = re.compile(r"^---\n(.*?)\n---", re.DOTALL)
    metadata_match = pattern.search(content)

    if metadata_match is None:
        return [("", content)]

    metadata_str = metadata_match.group(1).strip()
    # Take the body from the end of the matched closing fence. Splitting the
    # text on "---" would cut at the first "---" found anywhere, including one
    # inside a metadata value, and leak metadata into the body.
    content_extract = content[metadata_match.end():].strip()
    return [(metadata_str, content_extract)]
21+
22+
2223def process_markdown_files (directory : str , output_csv : str ) -> None :
23-
2424 directory_path = Path (directory )
2525 if not directory_path .exists ():
2626 print (f"Directory not found: { directory } " )
2727 return
28-
28+
2929 file_content = []
3030 for root , _ , files in os .walk (directory ):
3131 for file in files :
@@ -35,23 +35,24 @@ def process_markdown_files(directory: str, output_csv: str) -> None:
3535
3636 print (f"Processing file: { file_path } " )
3737 try :
38- with open (file_path , mode = "r" , encoding = "utf-8" ) as md_file :
38+ with Path . open (file_path , encoding = "utf-8" ) as md_file :
3939 content = md_file .read ()
4040 parsed_content = parse_markdown (content )
4141
4242 for metadata_ , content in parsed_content :
4343 file_content .append ([file_path .name , metadata_ , content ])
4444
45- except Exception as e :
45+ except OSError as e :
4646 print (f"Error processing file { file_path } : { e } " )
47- pd_data = pd .DataFrame (file_content ,columns = ["Filename" ,"Metadata" ,"Contents" ])
47+ pd_data = pd .DataFrame (file_content , columns = ["Filename" , "Metadata" , "Contents" ])
4848 pd_data .to_csv (output_csv , index = False )
4949
50+
if __name__ == "__main__":
    # Define the directory containing .md/.mdx files and the output CSV file.
    # The path must not start with a space: " ../docs" names a directory
    # literally called " .." and is reported as missing on POSIX systems.
    directory = r"../docs"
    output_csv = "docs.csv"

    # Process the files
    process_markdown_files(directory, output_csv)

    # process_markdown_files returns without writing anything when the
    # directory does not exist, so only announce the CSV in the other case.
    if Path(directory).exists():
        print(f"CSV file created at: {output_csv}")
0 commit comments