1+ from datetime import date
2+ import pandas as pd
3+ from labonneboite .importer import util as import_util
4+ from labonneboite .importer import settings as importer_settings
5+ from labonneboite .importer .jobs .common import logger
6+
def clean_csv_act_dpae_file(DEBUG=False, existing_sql_table=True):
    """Clean the act_dpae.csv file (PE-connect activity / DPAE join) and
    rewrite the `act_dpae_clean` SQL table with the result.

    Steps:
      * keep only 'premiere embauche' rows,
      * deduplicate per (user, siret), keeping the earliest activity,
      * rename columns and derive readable fields (contract type, durations,
        public/private sector, cleaned age bracket),
      * drop contracts whose duration is too long to be legal,
      * drop activities dated on/before 2018-08-31 (partial month),
      * if `existing_sql_table`, back up the current table content to a csv,
        concatenate it with the new rows and deduplicate before rewriting.

    Args:
        DEBUG: kept for backward compatibility; currently unused.
        existing_sql_table: if True, merge the freshly cleaned rows with the
            rows already stored in the `act_dpae_clean` table.
    """
    dpae_folder_path = importer_settings.INPUT_SOURCE_FOLDER + '/'
    csv_path = dpae_folder_path + 'act_dpae.csv'

    df_dpae_act = pd.read_csv(csv_path, sep='|', header=0)

    logger.info("The .csv file generated to clean has {} rows".format(df_dpae_act.shape[0]))

    # Keep only first-hire rows.
    df_dpae_act = df_dpae_act[df_dpae_act.premiere_embauche == 'Embauche']
    logger.info("The .csv file - rows with not 'premiere embauche' has {} rows".format(df_dpae_act.shape[0]))

    # Remove duplicates when there are multiple activities for the same dpae:
    # keep the earliest activity per (user, siret) pair.
    df_dpae_act = df_dpae_act.sort_values('dateheure')
    df_dpae_act = df_dpae_act.drop_duplicates(
        subset=['idutilisateur-peconnect', 'siret'], keep='first')
    logger.info("The .csv file - duplicates has {} rows ".format(df_dpae_act.shape[0]))

    # Rename columns to readable names. 'idutilisateur-peconnect' MUST be
    # renamed to its underscore spelling: the final column selection and the
    # later deduplication both reference 'idutilisateur_peconnect', which
    # would otherwise raise a KeyError.
    df_dpae_act.rename(columns={'idutilisateur-peconnect': 'idutilisateur_peconnect',
                                'dateheure': 'date_activite',
                                'kd_dateembauche': 'date_embauche',
                                'nbrjourtravaille': 'duree_activite_cdd_jours',
                                'kn_trancheage': 'tranche_age',
                                'duree_pec': 'duree_prise_en_charge',
                                'dc_commune_id': 'code_postal'
                                },
                       inplace=True)

    def get_type_contrat(row):
        # dc_typecontrat_id: 1 = CDD, 2 = CDI, anything else = CTT (interim).
        if row['dc_typecontrat_id'] == 1:
            return 'CDD'
        elif row['dc_typecontrat_id'] == 2:
            return 'CDI'
        return 'CTT'
    df_dpae_act['type_contrat'] = df_dpae_act.apply(get_type_contrat, axis=1)

    def get_nb_mois(row):
        # Approximate contract duration in months (30-day months).
        return row['duree_activite_cdd_jours'] // 30
    df_dpae_act['duree_activite_cdd_mois'] = df_dpae_act.apply(get_nb_mois, axis=1)

    def get_nbr_jours_act_emb(row):
        # Days elapsed between the activity and the hiring; both fields start
        # with a 'YYYY-MM-DD' prefix.
        de = row['date_embauche'][:10].split('-')
        da = row['date_activite'][:10].split('-')
        f_date = date(int(da[0]), int(da[1]), int(da[2]))
        l_date = date(int(de[0]), int(de[1]), int(de[2]))
        delta = l_date - f_date
        return delta.days
    df_dpae_act['diff_activite_embauche_jrs'] = df_dpae_act.apply(get_nbr_jours_act_emb, axis=1)

    def get_priv_pub(row):
        # dc_privepublic: 0 = public sector, anything else = private.
        if row['dc_privepublic'] == 0:
            return 'Public'
        return 'Prive'
    df_dpae_act['dc_privepublic'] = df_dpae_act.apply(get_priv_pub, axis=1)

    def good_format(row):
        # Drop the trailing 2 characters of the raw date_embauche timestamp.
        return row['date_embauche'][:-2]
    df_dpae_act['date_embauche'] = df_dpae_act.apply(good_format, axis=1)

    def del_interrogation(row):
        # Fix a mis-encoded label ('?' where an accented character was lost).
        if row['tranche_age'] == 'de 26 ans ? 50 ans':
            return 'entre 26 et 50 ans'
        return row['tranche_age']
    df_dpae_act['tranche_age'] = df_dpae_act.apply(del_interrogation, axis=1)

    def del_cdd_incoherent(row):
        # Flag contracts longer than 1200 days, which is too long to be legal.
        try:
            if int(row['duree_activite_cdd_jours']) > 1200:
                return 1
            return 0
        except (TypeError, ValueError):
            # Missing / non-numeric duration: keep the row.
            return 0
    df_dpae_act['temporaire'] = df_dpae_act.apply(del_cdd_incoherent, axis=1)
    df_dpae_act = df_dpae_act[df_dpae_act.temporaire == 0]
    logger.info("The .csv file - contrats which last too long to be legal has {} rows".format(df_dpae_act.shape[0]))

    # We only have activities in august for 31/08/2018 --> ugly charts, we want to start from the 1st september
    df_dpae_act = df_dpae_act[df_dpae_act.date_activite > "2018-08-31"]
    logger.info("The .csv file - activity with date = 31/08/2018 has {} rows".format(df_dpae_act.shape[0]))

    cols_of_interest = ['idutilisateur_peconnect',
                        'siret',
                        'date_activite',
                        'date_embauche',
                        'type_contrat',
                        'duree_activite_cdd_mois',
                        'duree_activite_cdd_jours',
                        'diff_activite_embauche_jrs',
                        'dc_lblprioritede',
                        'tranche_age',
                        'dc_privepublic',
                        'duree_prise_en_charge',
                        'dn_tailleetablissement',
                        'code_postal']

    df_dpae_act = df_dpae_act[cols_of_interest]

    engine = import_util.create_sqlalchemy_engine()

    if existing_sql_table:

        query = "select * from act_dpae_clean"
        df_dpae_act_existing = pd.read_sql_query(query, engine)

        # In case a problem appears in the script, we save old data under .csv
        # extension; because we rewrite the whole table on each execution, we
        # also have to remove duplicates after the concatenation.
        df_dpae_act_existing.to_csv(dpae_folder_path + 'backup_sql_act_dpae_clean', encoding='utf-8', sep='|')
        logger.info("There were already act/dpae : {} rows".format(df_dpae_act_existing.shape[0]))
        df_dpae_act = pd.concat([df_dpae_act, df_dpae_act_existing])
        logger.info("Concatenation of both has {} rows".format(df_dpae_act.shape[0]))

        df_dpae_act = df_dpae_act.drop_duplicates(
            subset=['idutilisateur_peconnect', 'siret'], keep='first')
        logger.info("Concatenation of both - duplicates has {} rows".format(df_dpae_act.shape[0]))

    df_dpae_act.to_sql(con=engine, name='act_dpae_clean',
                       if_exists='replace', index=False, chunksize=10000)

    # NOTE(review): assumes create_sqlalchemy_engine() returns an object with
    # a close() method (a plain SQLAlchemy Engine exposes dispose()) — confirm
    # against import_util.
    engine.close()
142+
def run_main():
    """Script entry point: clean the act/dpae csv and merge with the
    pre-existing `act_dpae_clean` table content."""
    clean_csv_act_dpae_file(existing_sql_table=True)


if __name__ == '__main__':
    run_main()