-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathfetcher.py
More file actions
73 lines (57 loc) · 2.55 KB
/
fetcher.py
File metadata and controls
73 lines (57 loc) · 2.55 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
#%%## Importations #####
import warnings
warnings.filterwarnings("ignore") # Suppress UserWarnings
import json
import pandas as pd
from tqdm import tqdm
import streamlit as st
import streamlit.components.v1 as stc # Import stc for using HTML components directly
from funcs import Bcolors
from funcs import (sfe_list, reseautee_list, cnrs_list, inrae_list,
inria_list, ird_list, cirad_list, euraxess_list, ifremer_list,
specifics_list)
from funcs import concours_alert
#%%## Parameters #####
# keywords inputs (non-user-friendly)
# Filtering configuration; the script reads the keys "places_to_exclude",
# "wrong_job_types", "good_omen" and "bad_omen" from it below.
# encoding pinned to UTF-8: the keyword lists are matched against accented
# French text, so the platform-default encoding must not be trusted.
with open("keywords.json", "r", encoding="utf-8") as f:
    keywords = json.load(f)
# User input (typing-friendly)
# Cutoff date forwarded to the fetchers that support date filtering.
date_cutoff = input("Cutoff date for fetching offers? (format: YYYY-MM-DD) ")
#%%## Main #####
# Import all job offers from every configured source into one DataFrame.
functions = [sfe_list, cnrs_list, inrae_list, inria_list, ird_list, cirad_list, euraxess_list, ifremer_list, specifics_list]
names = ["SFE", "CNRS", "INRAE", "INRIA", "IRD", "CIRAD", "Euraxess", "IFREMER", "others"]
# These fetchers take no argument (their sources cannot be filtered by date);
# every other fetcher receives the user-supplied cutoff.
no_cutoff_sources = {"INRAE", "INRIA", "IRD", "others"}
# Collect each source's frame and concatenate once at the end instead of
# re-concatenating inside the loop (avoids quadratic copying).
frames = []
# total was hard-coded to 10 for 9 sources, so the progress bar never
# completed; derive it from the list instead.
for function, name in tqdm(zip(functions, names), leave=False, desc="Fetching offers", total=len(functions)):
    tqdm.write(f"Treating {name} offers...")
    if name in no_cutoff_sources:
        frames.append(function())
    else:
        frames.append(function(date_cutoff))
    tqdm.write(f"Table currently has {sum(len(frame) for frame in frames)} rows.\n")
df_offers = pd.concat(frames)
tqdm.write(f"{Bcolors.OKGREEN}Successfully imported all job offers{Bcolors.ENDC}.\n")
concours_alert()
print("\nRemoving offers based on custom keywords...")
# Remove a maximum of useless offers.
# na=False treats missing Location/Type/Title values as "no match", so NaN
# never leaks into a boolean mask (a NaN mask raises in pandas indexing).
# Drop offers located in excluded places
for city in keywords["places_to_exclude"]:
    mask = df_offers['Location'].str.contains(city, case=False, na=False)
    df_offers = df_offers[~mask]
# Drop offers with unwanted contract types
for tte in keywords["wrong_job_types"]:
    mask = df_offers['Type'].str.contains(tte, case=False, na=False)
    df_offers = df_offers[~mask]
# Keep only offers whose title matches at least one "good omen" keyword.
# NOTE(review): the seed pattern "é" also keeps any title containing an
# accented "é" (i.e. most French-language titles) regardless of keywords —
# preserved as-is, but confirm this is intentional.
mask = df_offers['Title'].str.contains("é", case=False, na=False)
for keyword in keywords["good_omen"]:
    # Boolean OR; the original used `+`, which propagates NaN and is
    # non-idiomatic for combining masks.
    mask = mask | df_offers['Title'].str.contains(keyword, case=False, na=False)
df_offers = df_offers[mask]
# Exclude offers whose title contains a "bad omen" keyword
for keyword in keywords["bad_omen"]:
    mask = df_offers['Title'].str.contains(keyword, case=False, na=False)
    df_offers = df_offers[~mask]
print(f"\nFinal dataframe contains {len(df_offers)} offers.")
# Save the remaining offers for downstream consumption
df_offers.to_csv("last_batch_of_job_offers.csv")