import argparse
import logging
import os
import re
import sqlite3

from bs4 import BeautifulSoup
from tqdm import tqdm

def extract_job_id_from_html(soup):
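    """Pull the numeric job ID out of a parsed posting page.

    Tries the dashboard header <h1> first (the class string below is specific
    to the job board these pages were saved from), then falls back to any
    <h1> whose text contains "Job ID". Returns the ID as a string, or None.
    """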
    # Try to find the job ID in an <h1> tag with the specific class
    header_tag = soup.find('h1', class_='h3 dashboard-header__profile-information-name mobile--small-font color--font--white margin--b--s')
    if header_tag:
        header_text = header_tag.get_text(strip=True)
        match = re.match(r'^(\d+)', header_text)
        if match:
            return match.group(1)

    # If not found, fall back to an <h1> tag containing the words "Job ID"
    job_id_tag = soup.find('h1', string=re.compile(r'Job ID', re.IGNORECASE))
    if job_id_tag:
        job_id_text = job_id_tag.get_text(strip=True)
        match = re.search(r'Job ID\s*:\s*(\d+)', job_id_text, re.IGNORECASE)
        if match:
            return match.group(1)

    return None

def parse_html_file(filepath, job_posting_date, verbose=False):
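    """Parse one saved posting page into a dict keyed by database column.

    job_posting_date is the name of the directory the file was found in; the
    portion before its first underscore is taken as the posting date.
    """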
    with open(filepath, 'r', encoding='utf-8') as file:
        html_content = file.read()

    soup = BeautifulSoup(html_content, 'lxml')

    # The directory name starts with the posting date; anything after the
    # first underscore is discarded.
    posting_date = job_posting_date.split('_')[0]
    data = {'postingDate': posting_date}
    job_id = extract_job_id_from_html(soup)
    if job_id:
        data['id'] = job_id

    # Map row labels (with the trailing colon removed) to database columns.
    column_mapping = {
        # 'Job ID': 'id',                      # extracted from the <h1> header instead
        # 'Job Posting Date': 'postingDate',   # derived from the directory name instead
        'Job Title': 'title',
        'Organization': 'company',
        'Division': 'companyDivision',
        'Website': 'companyWebsite',
        'Job Location': 'location',
        'Job Location Type': 'locationType',
        'Number of Positions': 'numPositions',
        'Salary': 'salary',
        'Start Date': 'startDate',
        'End Date': 'endDate',
        'Job Function': 'function',
        'Job Description': 'description',
        'Job Requirements': 'requirements',
        'Preferred Disciplines': 'preferredDisciplines',
        'Application Deadline': 'applicationDeadline',
        'Application Method': 'applicationMethod',
        'Application Receipt Procedure': 'applicationReceiptProcedure',
        'If by Website, go to': 'applicationReceiptProcedure',
        'Additional Application Information': 'applicationDetails',
    }

    # Each posting field is rendered as a table row: label cell, value cell.
    for row in soup.find_all('tr'):
        tds = row.find_all('td')
        if len(tds) < 2:
            continue

        label_td, value_td = tds[0], tds[1]
        label_text = '\n'.join(label_td.stripped_strings).replace(':', '')
        value_text = '\n'.join(value_td.stripped_strings)

        # Inline each link's target so URLs survive the text extraction.
        for link in value_td.find_all('a'):
            url = link.get('href')
            link_text = link.get_text()
            value_text = value_text.replace(link_text, f'{link_text} ({url})')

        if label_text in column_mapping:
            db_column = column_mapping[label_text]
            # Two labels map to applicationReceiptProcedure, so append to an
            # existing value rather than overwriting it.
            if db_column in data:
                data[db_column] += f'\n{value_text}'
            else:
                data[db_column] = value_text

    if verbose:
        logging.info(data)

    return data

def store_data_in_db(data, db_cursor):
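    """Insert one posting; rows with a duplicate (id, postingDate) key are skipped."""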
    columns = ', '.join(f'"{key}"' for key in data)
    placeholders = ', '.join('?' for _ in data)
    sql = f'INSERT INTO "JobPosting" ({columns}) VALUES ({placeholders})'
    try:
        db_cursor.execute(sql, tuple(data.values()))
    except sqlite3.IntegrityError:
        # Duplicate (id, postingDate) key: the posting was already stored.
        logging.warning("Integrity error for job id %s: skipping row", data.get('id'))

def create_db_schema(db_cursor):
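    """Create the JobPosting table if it does not already exist."""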
    db_cursor.execute('''
        CREATE TABLE IF NOT EXISTS JobPosting (
            id INTEGER,
            postingDate DATE,
            title TEXT,
            company TEXT,
            companyDivision TEXT,
            companyWebsite TEXT,
            location TEXT,
            locationType TEXT,
            numPositions INTEGER,
            salary TEXT,
            startDate TEXT,
            endDate TEXT,
            function TEXT,
            description TEXT,
            requirements TEXT,
            preferredDisciplines TEXT,
            applicationDeadline TEXT,
            applicationMethod TEXT,
            applicationReceiptProcedure TEXT,
            applicationDetails TEXT,
            -- A posting is identified by its job ID plus the date it was posted.
            PRIMARY KEY(id, postingDate)
        )
    ''')

if __name__ == "__main__":
    logging.basicConfig(filename='run.log', level=logging.INFO, format='%(asctime)s %(message)s')

    parser = argparse.ArgumentParser(description="Parse HTML files in a folder and store the data in a SQLite DB.")
    parser.add_argument("-d", "--directory", default=os.getcwd(), help="Directory containing the HTML files. Defaults to the current directory.")
    parser.add_argument("--db", default=os.path.join(os.getcwd(), "job_postings.db"), help="SQLite database file for the parsed data. Defaults to 'job_postings.db' in the current directory.")
    parser.add_argument("-v", "--verbose", action="store_true", help="Also log the parsed data for each file.")

    args = parser.parse_args()
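
    # Example invocation (script and directory names are illustrative):
    #   python parse_jobs.py -d ./postings --db job_postings.db -v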

    conn = sqlite3.connect(args.db)
    cursor = conn.cursor()
    create_db_schema(cursor)

    # Collect every HTML file up front so tqdm can show an accurate total.
    html_files = [
        os.path.join(dirpath, filename)
        for dirpath, _, filenames in os.walk(args.directory)
        for filename in filenames
        if filename.endswith(('.html', '.htm'))
    ]

    for filepath in tqdm(html_files):
        # The containing directory's name encodes the posting date.
        job_posting_date = os.path.basename(os.path.dirname(filepath))
        logging.info(filepath)
        data = parse_html_file(filepath, job_posting_date, args.verbose)
        store_data_in_db(data, cursor)

    conn.commit()
    conn.close()
    logging.info("Parsing and storing completed.")