-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathextract_text.py
50 lines (39 loc) · 1.76 KB
/
extract_text.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
"""
PDF to Markdown Converter
This script extracts text from PDF files and converts it to markdown format.
It processes all PDF files in the 'papers/' directory and saves the markdown
versions in the 'papers/md/' directory with the same filename but .md extension.
Requirements:
- pymupdf4llm package for PDF text extraction
- The 'papers/' directory should exist and contain PDF files
- The 'papers/md/' directory should exist for output files
"""
import os
import pymupdf4llm
def extract_text_from_pdf(pdf_path, output_path):
    """
    Extract text from a PDF file and save it as markdown.

    The PDF is converted with ``pymupdf4llm.to_markdown`` and the result is
    written to ``output_path`` as UTF-8 bytes. The parent directory of
    ``output_path`` is created if it does not exist yet, so a missing output
    folder no longer aborts the conversion.

    Args:
        pdf_path (str): Path to the PDF file to be processed
        output_path (str): Path where the extracted markdown will be saved

    Returns:
        None: The function writes the extracted text to the specified output path
    """
    # Imported locally because pathlib is not among the file-level imports.
    import pathlib

    # Convert PDF content to markdown format using pymupdf4llm
    md_text = pymupdf4llm.to_markdown(pdf_path)

    destination = pathlib.Path(output_path)
    # Make sure the target directory exists before writing; an already
    # existing directory is left untouched.
    destination.parent.mkdir(parents=True, exist_ok=True)

    # Write raw UTF-8 bytes (not text mode) so line endings produced by the
    # converter are stored unchanged on every platform.
    destination.write_bytes(md_text.encode("utf-8"))
def markdown_output_path(pdf_path, output_dir):
    """
    Build the markdown output path for a given PDF path.

    Only the final extension of the file name is replaced by ``.md``; any
    other ``.pdf`` occurring earlier in the name is left untouched.

    Args:
        pdf_path (str): Path to the source PDF file
        output_dir (str): Directory in which the markdown file is placed

    Returns:
        str: ``output_dir`` joined with the PDF's base name, ending in ``.md``
    """
    stem, _extension = os.path.splitext(os.path.basename(pdf_path))
    return os.path.join(output_dir, stem + ".md")


if __name__ == "__main__":
    # Define the directory containing PDF files
    pdf_dir = "papers/"
    # Markdown files go into the 'md/' sub-directory of the PDF directory
    output_dir = pdf_dir + "md/"
    # Create the output directory up front so the first write cannot fail
    # because the folder is missing.
    os.makedirs(output_dir, exist_ok=True)

    # Collect all regular files ending in '.pdf'; sorting makes the
    # processing order independent of the filesystem's listing order.
    list_pdfs = sorted(
        os.path.join(pdf_dir, f)
        for f in os.listdir(pdf_dir)
        if f.endswith('.pdf') and os.path.isfile(os.path.join(pdf_dir, f))
    )

    # Process each PDF file
    for pdf_path in list_pdfs:
        # Generate output path from the file's base name instead of
        # substring replacement on the whole path.
        output_path = markdown_output_path(pdf_path, output_dir)
        # Convert the PDF to markdown and save it
        print(f"Converting {pdf_path} to {output_path}")
        extract_text_from_pdf(pdf_path, output_path)