First_try/tesseract_work.py at master · Metalrex0926/First_try · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
# Import libraries
from PIL import Image
import pytesseract
import sys
from pdf2image import convert_from_path
import os

# Tell pytesseract where the Tesseract executable is installed.
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract'
# Path of the PDF to run OCR on.
# Written as a raw string: "\C", "\P" and "\D" are not valid escape sequences,
# so the non-raw literal only worked by accident and triggers a warning on
# current Python versions. The resulting value is unchanged.
PDF_file = r"E:\Codes\PDF\DL_0197_18-19.pdf"

# Render every page of the PDF to an in-memory image (second argument: DPI).
pages = convert_from_path(PDF_file, 500)

# Write each page to "page_<n>.jpg" in the current working directory,
# numbering the pages from 1.
for page_number, page in enumerate(pages, start=1):
    filename = f"page_{page_number}.jpg"
    page.save(filename, "JPEG")

# Number of page images written above; the OCR step iterates 1..filelimit.
filelimit = len(pages)
# Kept for compatibility: one past the last page number that was written.
image_counter = filelimit + 1


# Text file that collects the OCR output of all pages.
# Raw string: the backslashes are literal path separators, not escapes.
outfile = r"E:\Codes\PDF\DL_0197_18-19.txt"

# Append mode is kept, so re-running the script adds to any existing output.
# The context manager closes the file even if OCR fails part-way through, and
# an explicit UTF-8 encoding avoids UnicodeEncodeError when the recognised text
# contains characters outside the platform's default code page.
with open(outfile, "a", encoding="utf-8") as f:
    for i in range(1, filelimit + 1):
        filename = f"page_{i}.jpg"
        # Close each image file as soon as its text has been extracted.
        with Image.open(filename) as page_image:
            text = pytesseract.image_to_string(page_image)
        # Re-join words that were hyphenated across a line break.
        text = text.replace('-\n', '')
        f.write(text)