Skip to content

Commit a73a33b

Browse files
committed
fix: safe_text from parsed html; coz ERP sucks
1 parent e975ba2 commit a73a33b

File tree

3 files changed

+40
-28
lines changed

3 files changed

+40
-28
lines changed

mftp/company.py

Lines changed: 13 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
import json
33
import logging
44
from env import ROLL_NUMBER
5+
from utils import safe_text
56
from datetime import datetime
67
import xml.etree.ElementTree as ET
78
from bs4 import BeautifulSoup as bs
@@ -18,13 +19,13 @@ def filter(companies, filter):
1819
if filter.upper() == "OPEN":
1920
filter_func = currently_open
2021
elif filter.upper() == "OPEN_N":
21-
filter_func = open_not_applied # important
22+
filter_func = open_not_applied
2223
elif filter.upper() == "APPLIED":
2324
filter_func = applied
2425
elif filter.upper() == "APPLIED_Y":
25-
filter_func = applied_available # important
26+
filter_func = applied_available
2627
elif filter.upper() == "APPLIED_N":
27-
filter_func = applied_not_available # important
28+
filter_func = applied_not_available
2829

2930
filtered = []
3031
for company in companies:
@@ -54,7 +55,7 @@ def fetch(session, headers, ssoToken):
5455

5556
fetched_companies = []
5657
for row in root.findall("row"):
57-
jd_args = row.find("cell[4]").text.split("'")[5].split('"')
58+
jd_args = safe_text(row.find("cell[4]")).split("'")[5].split('"')
5859
jnf_id, com_id, year = jd_args[1], jd_args[3], jd_args[5]
5960

6061
# Links
@@ -67,24 +68,22 @@ def fetch(session, headers, ssoToken):
6768
form_additional_details = f"https://erp.iitkgp.ac.in/TrainingPlacementSSO/AdmFilePDF.htm?type=JNF&year={year}&jnf_id={jnf_id}&com_id={com_id}"
6869

6970
company_info = {
70-
"Name": row.find("cell[1]").text.split(">")[1].split("<")[0].strip(),
71+
"Name": safe_text(row.find("cell[1]")).split(">")[1].split("<")[0].strip(),
7172
"Company_Details": company_details,
7273
"Company_Additional_Details": company_additional_details,
7374
"PPT": ppt,
74-
"Role": row.find("cell[4]").text.split("'")[1].strip(),
75+
"Role": safe_text(row.find("cell[4]")).split("'")[1].strip(),
7576
"Job_Description": jd,
7677
"Apply_Link_CV": apply_link_cv,
7778
"Additional_Job_Description": additional_jd,
7879
"CTC": get_ctc_with_currency(session, headers, additional_jd),
7980
"Form_Additional_Details": form_additional_details,
80-
"Application_Status": row.find("cell[9]").text.strip() if row.find("cell[9]").text.strip() else "N",
81-
"Start_Date": row.find("cell[10]").text.strip(),
82-
"End_Date": row.find("cell[11]").text.strip(),
83-
"Interview_Date": row.find("cell[12]").text.strip() if row.find("cell[12]").text.strip() else None,
81+
"Application_Status": safe_text(row.find("cell[9]"), "N"),
82+
"Start_Date": safe_text(row.find("cell[10]")),
83+
"End_Date": safe_text(row.find("cell[11]")),
84+
"Interview_Date": safe_text(row.find("cell[12]"), None),
8485
}
85-
8686
fetched_companies.append(company_info)
87-
8887
stored_companies = get_list()
8988
new_companies, modified_companies = get_new_and_modified_companies(fetched_companies, stored_companies)
9089

@@ -129,7 +128,7 @@ def get_list():
129128
try:
130129
with open(COMPANIES_FILE, "r") as json_file:
131130
return json.load(json_file)
132-
except json.JSONDecodeError as _:
131+
except json.JSONDecodeError:
133132
store_list([])
134133
return []
135134
except FileNotFoundError:
@@ -138,13 +137,12 @@ def get_list():
138137

139138

140139
# Downloads pdf content in bytes format
141-
## Not used currently
140+
# Not used currently
142141
def parse_link(session, link):
    """
    Download the content at ``link`` and return it as a single bytes object.

    Args:
        session: Object whose ``get(link, stream=True)`` returns a response
            exposing ``iter_content(chunk_size)`` (presumably a
            ``requests.Session`` -- confirm against callers).
        link: URL to fetch.

    Returns:
        bytes: Every chunk yielded by the response, concatenated in order
        (``b''`` when the response yields no chunks).
    """
    stream = session.get(link, stream=True)
    # Join the chunks in one pass instead of growing a bytes object with
    # ``+=``, which re-copies everything accumulated so far on each chunk.
    return b''.join(stream.iter_content(4096))
149147

150148

@@ -196,7 +194,6 @@ def compare_deadline_lt(company, deadline_key):
196194

197195
def parse_date(company, date_key):
198196
date_format = "%d-%m-%Y %H:%M"
199-
200197
date = None
201198
if company.get(date_key):
202199
try:
@@ -206,4 +203,3 @@ def parse_date(company, date_key):
206203
date = None
207204

208205
return date
209-

mftp/notice.py

Lines changed: 10 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,13 @@
11
import logging
2+
from utils import safe_text
23
import xml.etree.ElementTree as ET
34
from bs4 import BeautifulSoup as bs
45
from endpoints import TPSTUDENT_URL, NOTICEBOARD_URL, NOTICES_URL, ATTACHMENT_URL, NOTICE_CONTENT_URL
56

67

78
LAST_NOTICES_CHECK_COUNT = 30
89

9-
10+
1011
def fetch(headers, session, ssoToken, notice_db):
1112
print('[FETCHING NOTICES]', flush=True)
1213
try:
@@ -30,14 +31,15 @@ def fetch(headers, session, ssoToken, notice_db):
3031
if i >= LAST_NOTICES_CHECK_COUNT:
3132
break
3233

33-
id_ = row.find('cell[1]').text.strip()
34-
year = root.findall('row')[0].find('cell[8]').text.split('"')[1].strip()
34+
id_ = safe_text(row.find('cell[1]'))
35+
year = safe_text(root.findall('row')[0].find('cell[8]')).split('"')[1].strip()
36+
3537
notice = {
36-
'UID': f'{id_}_{year}',
37-
'Time': row.find('cell[7]').text.strip(),
38-
'Type': row.find('cell[2]').text.strip(),
39-
'Subject': row.find('cell[3]').text.strip(),
40-
'Company': row.find('cell[4]').text.strip(),
38+
"UID": f"{id_}_{year}",
39+
"Company": safe_text(row.find("cell[4]")),
40+
"Time": safe_text(row.find("cell[7]")),
41+
"Type": safe_text(row.find("cell[2]")),
42+
"Subject": safe_text(row.find("cell[3]"))
4143
}
4244

4345
# Handling Body
@@ -58,7 +60,6 @@ def fetch(headers, session, ssoToken, notice_db):
5860
break
5961

6062
latest_X_notices.append(notice)
61-
6263
# This is done to reduce DB queries
6364
# Get all first X notices from ERP in latest_notices
6465
# Check if these notices exist in the DB using their UIDs in a single query
@@ -91,6 +92,4 @@ def parse_attachment(session, year, id_):
9192
attachment = b''
9293
for chunk in stream.iter_content(4096):
9394
attachment += chunk
94-
9595
return attachment
96-

mftp/utils.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
from typing import Optional
2+
3+
4+
def safe_text(element, default: Optional[str] = ""):
5+
"""
6+
Return stripped text from an element, or a default value if None or empty.
7+
8+
Args:
9+
element: An element object (e.g., from BeautifulSoup or lxml) or None.
10+
default (str): Value to return if element is None or has no text.
11+
12+
Returns:
13+
str: The stripped text content of the element, or the default value.
14+
"""
15+
if element is not None and element.text:
16+
return element.text.strip()
17+
return default

0 commit comments

Comments
 (0)