Skip to content
This repository was archived by the owner on Nov 15, 2024. It is now read-only.

Commit 8618084

Browse files
committed
Clean and prepare the join & clean activity_logs-dpae jobs for Jenkins
1 parent 8f49414 commit 8618084

6 files changed

Lines changed: 304 additions & 504 deletions

File tree

Makefile

Lines changed: 20 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -139,9 +139,6 @@ rebuild-importer-tests-compressed-files:
139139

140140
rebuild-city-codes:
141141
export LBB_ENV=development && cd $(PACKAGE_DIR) && python importer/scripts/clean_csv_city_codes.py
142-
143-
daily-json-activity-parser:
144-
export LBB_ENV=development && cd $(PACKAGE_DIR) && python scripts/impact_retour_emploi/daily_json_activity_parser.py
145142

146143
# Load testing
147144
# ------------
@@ -224,6 +221,26 @@ alembic-generate-migration:
224221
@echo
225222
@echo " $$ alembic revision -m 'create account table'"
226223

224+
# Impact retour à l'emploi
# ------------------------
# These targets never produce a file of the same name; declare them phony
# so make always runs them even if a matching file appears.
.PHONY: daily-json-activity-parser join_activity_logs_and_dpae clean_activity_logs_and_dpae make_report run_ire_jobs

daily-json-activity-parser:
	export LBB_ENV=development && cd $(PACKAGE_DIR) && python scripts/impact_retour_emploi/daily_json_activity_parser.py

join_activity_logs_and_dpae:
	export LBB_ENV=development && cd $(PACKAGE_DIR) && python scripts/impact_retour_emploi/join_activity_logs_dpae.py

clean_activity_logs_and_dpae:
	export LBB_ENV=development && cd $(PACKAGE_DIR) && python scripts/impact_retour_emploi/clean_activity_logs_dpae.py

make_report:
	export LBB_ENV=development && cd $(PACKAGE_DIR) && python scripts/impact_retour_emploi/make_report.py

# Run the whole impact-retour-emploi pipeline: join -> clean -> report.
# Use $(MAKE) (not bare `make`) for recursive invocations, consistent with
# the rest of this Makefile and required for correct flag/jobserver passing.
run_ire_jobs:
	$(MAKE) join_activity_logs_and_dpae && \
	$(MAKE) clean_activity_logs_and_dpae && \
	$(MAKE) make_report && \
	echo "The new report has been built successfully."
243+
227244
# Importer jobs
228245
# -------------
229246

@@ -316,7 +333,3 @@ clean-car-isochrone-and-durations-cache: clean-car-isochrone-cache
316333
PATTERN='*durations**car*' $(MAKE) redis-count-keys
317334
@echo '###### EXTERMINATE DURATIONS! \######'
318335
$(REDIS_DOCKER_COMPOSE) "$(REDIS_CONNECT) --scan --pattern '*durations**car*' | tr '\n' '\0' | xargs -L1 -0 $(REDIS_CONNECT) del"
319-
320-
delete-unused-redis-containers:
321-
docker ps -f status=restarting -f name=redis --format "{{.ID}}" \
322-
| xargs docker rm -f
Lines changed: 147 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,147 @@
1+
from datetime import date
2+
import pandas as pd
3+
from labonneboite.importer import util as import_util
4+
from labonneboite.importer import settings as importer_settings
5+
from labonneboite.importer.jobs.common import logger
6+
7+
def clean_csv_act_dpae_file(DEBUG=False, existing_sql_table=True):
    """Clean the activity/DPAE join CSV and store the result in SQL.

    Reads ``act_dpae.csv`` from the importer input folder, filters and
    normalizes its rows (first hires only, de-duplication, column renames,
    derived columns), then writes the result to the ``act_dpae_clean``
    table, replacing its previous content.

    Parameters
    ----------
    DEBUG : bool
        Unused in this function; kept for interface compatibility.
    existing_sql_table : bool
        When True, rows already present in ``act_dpae_clean`` are backed up
        to a CSV file, merged with the freshly cleaned rows, and
        de-duplicated on (user, siret) before the table is rewritten.
    """
    dpae_folder_path = importer_settings.INPUT_SOURCE_FOLDER + '/'
    csv_path = dpae_folder_path + 'act_dpae.csv'

    df_dpae_act = pd.read_csv(csv_path,
                              sep='|',
                              header=0)

    logger.info("The .csv file generated to clean has {} rows".format(df_dpae_act.shape[0]))

    # Keep only rows corresponding to a first hire.
    df_dpae_act = df_dpae_act[df_dpae_act.premiere_embauche == 'Embauche']
    logger.info("The .csv file - rows with not 'premiere embauche' has {} rows".format(df_dpae_act.shape[0]))

    # Remove duplicates when there are multiple activities for the same DPAE:
    # keep the earliest activity per (user, siret) pair.
    df_dpae_act = df_dpae_act.sort_values('dateheure')
    df_dpae_act = df_dpae_act.drop_duplicates(
        subset=['idutilisateur-peconnect', 'siret'], keep='first')
    logger.info("The .csv file - duplicates has {} rows ".format(df_dpae_act.shape[0]))

    # Rename raw extract columns to their reporting names.
    df_dpae_act.rename(columns={'dateheure': 'date_activite',
                                'kd_dateembauche': 'date_embauche',
                                'nbrjourtravaille': 'duree_activite_cdd_jours',
                                'kn_trancheage': 'tranche_age',
                                'duree_pec': 'duree_prise_en_charge',
                                'dc_commune_id': 'code_postal'
                                },
                       inplace=True)

    def get_type_contrat(row):
        # Map the numeric contract-type id to a label; anything other than
        # 1 (CDD) or 2 (CDI) is treated as temp work (CTT).
        if row['dc_typecontrat_id'] == 1:
            return 'CDD'
        elif row['dc_typecontrat_id'] == 2:
            return 'CDI'
        return 'CTT'
    df_dpae_act['type_contrat'] = df_dpae_act.apply(get_type_contrat, axis=1)

    def get_nb_mois(row):
        # Approximate contract duration in 30-day months.
        return row['duree_activite_cdd_jours'] // 30
    df_dpae_act['duree_activite_cdd_mois'] = df_dpae_act.apply(get_nb_mois, axis=1)

    def get_nbr_jours_act_emb(row):
        # Days elapsed between the activity and the hire. Both fields are
        # 'YYYY-MM-DD...' strings; only the date part is used.
        de = row['date_embauche'][:10].split('-')
        da = row['date_activite'][:10].split('-')
        f_date = date(int(da[0]), int(da[1]), int(da[2]))
        l_date = date(int(de[0]), int(de[1]), int(de[2]))
        return (l_date - f_date).days
    df_dpae_act['diff_activite_embauche_jrs'] = df_dpae_act.apply(
        get_nbr_jours_act_emb, axis=1)

    def get_priv_pub(row):
        # 0 is public sector, anything else private.
        return 'Public' if row['dc_privepublic'] == 0 else 'Prive'
    df_dpae_act['dc_privepublic'] = df_dpae_act.apply(get_priv_pub, axis=1)

    def good_format(row):
        # Drop the trailing two characters of the hire timestamp.
        return row['date_embauche'][:-2]
    df_dpae_act['date_embauche'] = df_dpae_act.apply(good_format, axis=1)

    def del_interrogation(row):
        # Fix a mis-encoded age-bracket label ('?' instead of 'à').
        if row['tranche_age'] == 'de 26 ans ? 50 ans':
            return 'entre 26 et 50 ans'
        return row['tranche_age']
    df_dpae_act['tranche_age'] = df_dpae_act.apply(del_interrogation, axis=1)

    def del_cdd_incoherent(row):
        # Flag contracts longer than 1200 days (too long to be legal).
        # Missing / non-numeric durations are kept (flag 0). Only catch
        # conversion errors — a bare except would also swallow
        # KeyboardInterrupt and real bugs.
        try:
            return 1 if int(row['duree_activite_cdd_jours']) > 1200 else 0
        except (TypeError, ValueError):
            return 0
    df_dpae_act['temporaire'] = df_dpae_act.apply(del_cdd_incoherent, axis=1)
    df_dpae_act = df_dpae_act[df_dpae_act.temporaire == 0]
    logger.info("The .csv file - contrats which last too long to be legal has {} rows".format(df_dpae_act.shape[0]))

    # We only have activities in august for 31/08/2018 --> ugly charts,
    # we want to start from the 1st september.
    df_dpae_act = df_dpae_act[df_dpae_act.date_activite > "2018-08-31"]
    logger.info("The .csv file - activity with date = 31/08/2018 has {} rows".format(df_dpae_act.shape[0]))

    # NOTE(review): the de-duplication above keys on the hyphenated
    # 'idutilisateur-peconnect' while this list expects the underscored
    # 'idutilisateur_peconnect' — confirm which name the CSV actually
    # carries, as a mismatch here would raise a KeyError.
    cols_of_interest = ['idutilisateur_peconnect',
                        'siret',
                        'date_activite',
                        'date_embauche',
                        'type_contrat',
                        'duree_activite_cdd_mois',
                        'duree_activite_cdd_jours',
                        'diff_activite_embauche_jrs',
                        'dc_lblprioritede',
                        'tranche_age',
                        'dc_privepublic',
                        'duree_prise_en_charge',
                        'dn_tailleetablissement',
                        'code_postal']

    df_dpae_act = df_dpae_act[cols_of_interest]

    engine = import_util.create_sqlalchemy_engine()

    if existing_sql_table:

        query = "select * from act_dpae_clean"
        df_dpae_act_existing = pd.read_sql_query(query, engine)

        # In case a problem appears in the script, save the old data as .csv;
        # because the whole table is rewritten on each execution, duplicates
        # have to be removed after the merge.
        df_dpae_act_existing.to_csv(dpae_folder_path + 'backup_sql_act_dpae_clean', encoding='utf-8', sep='|')
        logger.info("There were already act/dpae : {} rows".format(df_dpae_act_existing.shape[0]))
        df_dpae_act = pd.concat([df_dpae_act, df_dpae_act_existing])
        logger.info("Concatenation of both has {} rows".format(df_dpae_act.shape[0]))

        df_dpae_act = df_dpae_act.drop_duplicates(
            subset=['idutilisateur_peconnect', 'siret'], keep='first')
        logger.info("Concatenation of both - duplicates has {} rows".format(df_dpae_act.shape[0]))

    df_dpae_act.to_sql(con=engine, name='act_dpae_clean',
                       if_exists='replace', index=False, chunksize=10000)

    # NOTE(review): plain SQLAlchemy engines expose dispose(), not close();
    # this works only if import_util returns a connection-like object — confirm.
    engine.close()
142+
143+
def run_main():
    """Entry point: clean the act/dpae join file and refresh the SQL table."""
    clean_csv_act_dpae_file(DEBUG=False, existing_sql_table=True)


if __name__ == '__main__':
    run_main()

0 commit comments

Comments
 (0)