-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtesseract_work.py
More file actions
41 lines (27 loc) · 938 Bytes
/
tesseract_work.py
File metadata and controls
41 lines (27 loc) · 938 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
# Import libraries
from PIL import Image
import pytesseract
import sys
from pdf2image import convert_from_path
import os
# Tell pytesseract where the Tesseract executable lives on this machine.
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract'

# Input PDF and output text file.
# Raw strings keep the Windows backslashes literal: in a normal string,
# sequences such as "\C", "\P" and "\D" are invalid escapes that newer Python
# versions warn about (and that would silently break for names starting
# with e.g. "t" or "n").
PDF_file = r"E:\Codes\PDF\DL_0197_18-19.pdf"
outfile = r"E:\Codes\PDF\DL_0197_18-19.txt"

# Convert every page of the PDF to an image (second argument is the DPI).
pages = convert_from_path(PDF_file, 500)

# Save each page as page_<n>.jpg, numbered from 1, in the current working
# directory, remembering the names so the OCR pass reads exactly these files.
page_filenames = []
for page_number, page in enumerate(pages, start=1):
    filename = "page_" + str(page_number) + ".jpg"
    page.save(filename, 'JPEG')
    page_filenames.append(filename)

# OCR the saved page images in order and append the text to the output file.
# The `with` blocks guarantee the output file and every image handle are
# closed even if OCR fails part-way through.
# NOTE(review): the file is opened in append mode, so re-running the script
# adds to any existing content. UTF-8 is used explicitly so OCR output that
# the Windows default code page cannot represent does not raise
# UnicodeEncodeError; confirm no existing output file uses another encoding.
with open(outfile, "a", encoding="utf-8") as f:
    for filename in page_filenames:
        with Image.open(filename) as page_image:
            text = pytesseract.image_to_string(page_image)
        # Rejoin words that were hyphenated across a line break.
        text = text.replace('-\n', '')
        f.write(text)