-
Notifications
You must be signed in to change notification settings - Fork 8
Expand file tree
/
Copy pathOCR_APP.py
More file actions
137 lines (118 loc) · 5.27 KB
/
Copy pathOCR_APP.py
File metadata and controls
137 lines (118 loc) · 5.27 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
import easyocr as ocr #OCR
import streamlit as st #Web App
from PIL import Image #Image Processing
import numpy as np #Image Processing
# Use the full browser width so the three-column layout below has room.
st.set_page_config(layout="wide")
import re
import pandas as pd
# Page title
st.title(":orange[UNLOCKING DATA FROM BUSINESS CARDS USING OCR]")
st.write(" ")

# col1 holds the uploader and the image preview, col3 the extracted fields.
# col2 is never written to in this script, so it only leaves a gap between them.
col1, col2, col3 = st.columns([3, 0.5, 4.5])

with col1:
    # Image uploader
    st.write("## UPLOAD IMAGE")
    # An empty label is discouraged by Streamlit for accessibility reasons and
    # triggers a warning; give the widget a real label and hide it instead so
    # the page looks the same (the heading above already describes the widget).
    image = st.file_uploader(
        label="Upload a business card image",
        type=['png', 'jpg', 'jpeg'],
        label_visibility="collapsed",
    )
# ``st.cache`` is deprecated (and absent from recent Streamlit releases).
# ``st.cache_resource`` is its replacement for shared, unserialisable objects
# such as ML models; fall back to ``st.cache`` only on older installations.
_cache_model = st.cache_resource if hasattr(st, "cache_resource") else st.cache


@_cache_model
def load_model():
    """Build the EasyOCR reader for English text.

    The call is wrapped in Streamlit's cache so the reader is created once
    and reused across script reruns instead of being rebuilt on every
    interaction.

    Returns:
        The ``easyocr.Reader`` instance configured with the ``'en'`` language.
    """
    return ocr.Reader(['en'])
# Third-party edit-distance package used for fuzzy place-name matching.
# Imported once here instead of once per OCR line inside the loop.
import Levenshtein

reader = load_model()  # load (cached) OCR model

# Lower-case fragments that suggest a line belongs to the postal address.
# They are compared against the lower-cased OCR text, so every entry must be
# lower-case itself (upper-case entries could never match).
keywords = (
    'road', 'floor', ' st ', 'st,', 'street', ' dt ', 'district',
    'near', 'beside', 'opposite', ' at ', ' in ', 'center', 'main road',
    'state', 'country', 'post', 'zip', 'city', 'zone', 'mandal', 'town', 'rural',
    'circle', 'next to', 'across from', 'area', 'building', 'towers', 'village',
    ' va ', ' va,', ' east ', ' west ', ' north ', ' south ',
)

# Place names (Indian states, a few cities/countries) matched fuzzily against
# each OCR line to catch address lines that contain none of the keywords.
states = (
    'Andhra Pradesh', 'Arunachal Pradesh', 'Assam', 'Bihar', 'Chhattisgarh', 'Goa', 'Gujarat',
    'Haryana', 'Hyderabad', 'Himachal Pradesh', 'Jharkhand', 'Karnataka', 'Kerala', 'Madhya Pradesh',
    'Maharashtra', 'Manipur', 'Meghalaya', 'Mizoram', 'Nagaland', 'Odisha', 'Punjab',
    'Rajasthan', 'Sikkim', 'Tamil Nadu', 'Telangana', 'Tripura', 'Uttar Pradesh', 'Uttarakhand', 'West Bengal',
    "United States", "China", "Japan", "Germany", "United Kingdom", "France", "India",
    "Canada", "Italy", "South Korea", "Russia", "Australia", "Brazil", "Spain", "Mexico", 'USA', 'UK',
)
states_lower = tuple(name.lower() for name in states)

# A PIN/ZIP code is a run of exactly six or seven digits. The look-arounds
# stop the pattern from matching inside longer runs such as phone numbers.
digit_pattern = re.compile(r'(?<!\d)\d{6,7}(?!\d)')
# Optional "ph"/"phone"/"phno" prefix followed by at least seven digit groups.
phone_pattern = re.compile(r'(?:ph|phone|phno)?\s*(?:[+-]?\d\s*[\(\)]*){7,}')
# No '@' anywhere (that would be an e-mail) and either starts with "www" or
# ends with "com"; case-insensitive because cards often print "WWW"/".COM".
website_pattern = re.compile(r"(?!.*@)(www|.*com$)", re.IGNORECASE)


def string_similarity(s1, s2):
    """Return the similarity of two strings as a percentage (0-100).

    The score is ``1 - distance / longest_length`` scaled to percent, where
    ``distance`` is the Levenshtein edit distance. Two empty strings are
    treated as identical (100) instead of dividing by zero.
    """
    longest = max(len(s1), len(s2))
    if longest == 0:
        return 100.0
    return (1 - Levenshtein.distance(s1, s2) / longest) * 100


def _is_name_or_company_line(text):
    """Tell whether an unclassified OCR line should be shown as a name/company.

    A line qualifies when it has at least four characters, contains no comma,
    full stop or digit, and has at least one ASCII letter.
    """
    if len(text) < 4 or ',' in text or '.' in text:
        return False
    if re.search(r'\d', text):
        return False
    return re.search(r'[a-zA-Z0-9]', text) is not None


if image is not None:
    input_image = Image.open(image)  # read image
    with col1:
        st.image(input_image)  # display image

    # Element 1 of every OCR result is the recognised text.
    result = reader.readtext(np.array(input_image))
    result_text = [detection[1] for detection in result]

    # Extracted fields and the indices of the OCR lines they came from.
    # The *ID values stay '' when nothing was found, which never equals a
    # line index and is therefore harmless in the membership test below.
    PH = []
    PHID = []
    ADD = []  # ordered and de-duplicated so the address reads in card order
    AID = []
    EMAIL = ''
    EID = ''
    PIN = ''
    PID = ''
    WEB = ''
    WID = ''

    for i, string in enumerate(result_text):
        lowered = string.lower()

        # E-mail: any line containing '@' (the last such line wins).
        if '@' in lowered:
            EMAIL = lowered
            EID = i

        # PIN code: a standalone run of six or seven digits.
        pin_match = digit_pattern.search(string)
        if pin_match:
            PIN = pin_match.group()
            PID = i

        # Phone number: pattern match plus more than seven digits overall.
        if phone_pattern.search(string) and len(re.findall(r'\d', string)) > 7:
            PH.append(string)
            PHID.append(i)

        # Address: PIN code, an address keyword, or a fuzzy place-name match
        # (similarity above 50 %).
        is_address = (
            pin_match is not None
            or any(keyword in lowered for keyword in keywords)
            or any(string_similarity(state, lowered) > 50 for state in states_lower)
        )
        if is_address:
            AID.append(i)
            if string not in ADD:
                ADD.append(string)

        # Website URL (the last matching line wins).
        if website_pattern.match(string):
            WEB = lowered
            WID = i

    with col3:
        # Display all the elements of the business card.
        st.write("## EXTRACTED TEXT")
        st.write('##### :red[WEBSITE URL: ] ' + str(WEB))
        st.write('##### :red[EMAIL: ] ' + str(EMAIL))
        st.write('##### :red[PIN CODE: ] ' + str(PIN))
        ph_str = ', '.join(PH)
        st.write('##### :red[PHONE NUMBER(S): ] ' + ph_str)
        add_str = ' '.join(ADD)
        st.write('##### :red[ADDRESS: ] ', add_str)

        # Every line not already classified above is a candidate for the
        # card holder / company section.
        IDS = {EID, PID, WID, *AID, *PHID}
        fin = [
            string
            for i, string in enumerate(result_text)
            if i not in IDS and _is_name_or_company_line(string)
        ]

        st.write('##### :red[CARD HOLDER & COMPANY DETAILS: ] ')
        for line in fin:
            st.write('##### ' + line)