Skip to content

Commit 40b1373

Browse files
committed
feat(docs): add bigquery script to get CSV format of the entire docs
1 parent 8a9f113 commit 40b1373

File tree

3 files changed

+85
-0
lines changed

3 files changed

+85
-0
lines changed

queryscript/.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
/venv

queryscript/app.py

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
import os
2+
import re
3+
import pandas as pd
4+
from pathlib import Path
5+
from typing import List, Tuple
6+
7+
def parse_markdown(content: str) -> List[Tuple[str, str]]:
    """Split a Markdown document into its front matter and its body.

    Front matter is recognised only when the document starts with a ``---``
    line and a later line begins with ``---``; the text between the two
    delimiters is the metadata.

    Args:
        content: Full text of a Markdown (.md/.mdx) document.

    Returns:
        A single-element list holding ``(metadata, body)``. Both parts are
        whitespace-stripped when front matter is found. When no front matter
        is found, ``metadata`` is ``""`` and ``body`` is ``content`` unchanged.
    """
    # Without re.MULTILINE, ``^`` anchors at the very start of the text only,
    # so a ``---`` rule further down the document is never mistaken for
    # front matter.
    pattern = re.compile(r"^---\n(.*?)\n---", re.DOTALL)
    metadata_match = pattern.search(content)

    if metadata_match is None:
        return [("", content)]

    metadata_str = metadata_match.group(1).strip()
    # Take the body from where the closing delimiter ends. Splitting the raw
    # text on "---" would cut inside metadata values that contain "---"
    # (e.g. ``title: a---b``) and leak metadata into the body.
    content_extract = content[metadata_match.end():].strip()
    return [(metadata_str, content_extract)]
21+
22+
def process_markdown_files(directory: str, output_csv: str) -> None:
    """Collect every .md/.mdx file under *directory* into one CSV file.

    The directory tree is walked recursively. Each matching file is read,
    split into front matter and body with ``parse_markdown``, and written as
    one row with the columns ``Filename``, ``Metadata`` and ``Contents``.
    ``Filename`` is the bare file name without its directory, so files that
    share a name in different sub-directories are not distinguishable in the
    output.

    Args:
        directory: Root directory to scan.
        output_csv: Path of the CSV file to write.

    Returns:
        None. If *directory* is not an existing directory, a message is
        printed and no CSV is written. Files that cannot be read or decoded
        are reported on stdout and skipped.
    """
    directory_path = Path(directory)
    # is_dir() rather than exists(): a path to a regular file would otherwise
    # pass the check, yield nothing from os.walk and produce an empty CSV.
    if not directory_path.is_dir():
        print(f"Directory not found: {directory}")
        return

    file_content = []
    for root, dirs, files in os.walk(directory):
        # os.walk order depends on the filesystem; sorting in place makes the
        # traversal (and therefore the CSV row order) reproducible.
        dirs.sort()
        for file in sorted(files):
            file_path = Path(root) / file
            if file_path.suffix.lower() not in (".md", ".mdx"):
                continue

            print(f"Processing file: {file_path}")
            try:
                # utf-8-sig reads plain UTF-8 and also drops a leading BOM,
                # which would otherwise hide front matter at position 0.
                with open(file_path, mode="r", encoding="utf-8-sig") as md_file:
                    raw_text = md_file.read()
            except (OSError, UnicodeError) as e:
                print(f"Error processing file {file_path}: {e}")
                continue

            for metadata_, body in parse_markdown(raw_text):
                file_content.append([file_path.name, metadata_, body])

    pd_data = pd.DataFrame(file_content, columns=["Filename", "Metadata", "Contents"])
    pd_data.to_csv(output_csv, index=False)
49+
50+
if __name__ == "__main__":
    # Directory containing the .md/.mdx files and the CSV file to produce.
    # Both are relative to the current working directory.
    directory = r'../docs'
    output_csv = "docs.csv"

    # Process the files
    process_markdown_files(directory, output_csv)

    # process_markdown_files returns without writing anything when the docs
    # directory is missing, so only report success when it could have run.
    if Path(directory).is_dir():
        print(f"CSV file created at: {output_csv}")

queryscript/requirements.txt

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
cachetools==5.5.1
2+
certifi==2025.1.31
3+
charset-normalizer==3.4.1
4+
google-api-core==2.24.1
5+
google-auth==2.38.0
6+
google-cloud-bigquery==3.29.0
7+
google-cloud-core==2.4.1
8+
google-crc32c==1.6.0
9+
google-resumable-media==2.7.2
10+
googleapis-common-protos==1.66.0
11+
grpcio==1.70.0
12+
grpcio-status==1.70.0
13+
idna==3.10
14+
numpy==2.2.2
15+
packaging==24.2
16+
pandas==2.2.3
17+
proto-plus==1.26.0
18+
protobuf==5.29.3
19+
pyasn1==0.6.1
20+
pyasn1_modules==0.4.1
21+
python-dateutil==2.9.0.post0
22+
pytz==2025.1
23+
requests==2.32.3
24+
rsa==4.9
25+
six==1.17.0
26+
tzdata==2025.1
27+
urllib3==2.3.0

0 commit comments

Comments
 (0)