Skip to content

Commit 4e9cf8e

Browse files
committed
improve code readability
1 parent d7cc12a commit 4e9cf8e

3 files changed

Lines changed: 85 additions & 127 deletions

File tree

2023_halathon.ipynb

Lines changed: 65 additions & 110 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@
2727
},
2828
{
2929
"cell_type": "code",
30-
"execution_count": 13,
30+
"execution_count": 2,
3131
"id": "a2532e32",
3232
"metadata": {
3333
"scrolled": true
@@ -37,7 +37,7 @@
3737
"name": "stdout",
3838
"output_type": "stream",
3939
"text": [
40-
"nb de DOI importés depuis fichier \t1000\n"
40+
"nb de DOI importés depuis fichier \t400\n"
4141
]
4242
}
4343
],
@@ -53,21 +53,21 @@
5353
" ## precier le nom de cette colonne (DOI, doiId_s etc. )\n",
5454
" doi_col_name = \"DOI\"\n",
5555
" \n",
56-
" df_mydois = pd.read_csv(\"data/2023-04-halathon-scopus-sante.csv\", usecols=[doi_col_name])\n",
56+
" df_mydois = pd.read_csv(\"data/2023-05-halathon-scopus-sante.csv\", usecols = [doi_col_name])\n",
5757
" \n",
5858
" # dans la suite du code la colonne doit être nommée doi\n",
5959
" df_mydois.rename(columns = {doi_col_name : \"doi\"}, inplace = True)\n",
6060
" \n",
6161
" ## limiter le nombre de ligne/publication\n",
62-
" df_mydois = df_mydois[:1000]\n",
62+
" df_mydois = df_mydois[:400]\n",
6363
" print(f\"nb de DOI importés depuis fichier \\t{len(df_mydois)}\")\n",
6464
"else : \n",
6565
" df_mydois = pd.DataFrame()"
6666
]
6767
},
6868
{
6969
"cell_type": "code",
70-
"execution_count": 14,
70+
"execution_count": 3,
7171
"id": "0f72220f",
7272
"metadata": {
7373
"scrolled": true
@@ -104,7 +104,7 @@
104104
},
105105
{
106106
"cell_type": "code",
107-
"execution_count": 15,
107+
"execution_count": 4,
108108
"id": "a1ceb691",
109109
"metadata": {
110110
"scrolled": true
@@ -114,14 +114,15 @@
114114
"name": "stdout",
115115
"output_type": "stream",
116116
"text": [
117-
" nb de DOI a traiter \t1000\n",
118-
" /!\\ temps estimé ~33 minutes\n"
117+
" nb de DOI a traiter \t400\n",
118+
" /!\\ temps estimé ~13 minutes\n"
119119
]
120120
}
121121
],
122122
"source": [
123123
"# retrait des publications sans DOI\n",
124124
"df.dropna(subset=[\"doi\"], inplace = True)\n",
125+
"\n",
125126
"# feedback nb de DOI et tps de traitement\n",
126127
"print(f\" nb de DOI a traiter \\t{len(df)}\")\n",
127128
"print(f\" /!\\ temps estimé ~{round(len(df) * 40/1200)} minutes\")"
@@ -137,7 +138,7 @@
137138
},
138139
{
139140
"cell_type": "code",
140-
"execution_count": 16,
141+
"execution_count": 5,
141142
"id": "1285c198",
142143
"metadata": {
143144
"scrolled": true
@@ -147,7 +148,7 @@
147148
"name": "stdout",
148149
"output_type": "stream",
149150
"text": [
150-
"nb DOI a verifier dans HAL 1000\n",
151+
"nb DOI a verifier dans HAL 400\n",
151152
"hal 10% \n",
152153
"hal 20% \n",
153154
"hal 30% \n",
@@ -158,21 +159,22 @@
158159
"hal 80% \n",
159160
"hal 90% \n",
160161
"hal 100%\n",
161-
"nb de DOI après retrait de ceux en TI dans HAL 911\n"
162+
"nb de DOI après retrait de ceux en TI dans HAL 352\n"
162163
]
163164
}
164165
],
165166
"source": [
166167
"# 2.1. pour chaque publications déduire la présence dans HAL\n",
167168
"df_hal = enrich_w_hal(df) # renseigner df[:50].copy() pour tester uniquement sur les 50 premiers DOI\n",
169+
"\n",
168170
"#retirer ce qui est déjà déposé avec fichier\n",
169171
"df_no_file = df_hal[ df_hal[\"submitType\"] != \"file\" ].copy()\n",
170172
"print(f\"nb de DOI après retrait de ceux en TI dans HAL {len(df_no_file)}\")"
171173
]
172174
},
173175
{
174176
"cell_type": "code",
175-
"execution_count": 17,
177+
"execution_count": 7,
176178
"id": "a6ef3e24",
177179
"metadata": {
178180
"scrolled": true
@@ -182,7 +184,7 @@
182184
"name": "stdout",
183185
"output_type": "stream",
184186
"text": [
185-
"nb DOI a verifier dans upw \t911\n",
187+
"nb DOI a verifier dans upw \t352\n",
186188
"upw 10% \n",
187189
"upw 20% \n",
188190
"upw 30% \n",
@@ -191,8 +193,8 @@
191193
"upw 60% \n",
192194
"upw 70% \n",
193195
"upw 80% \n",
194-
"upw 90% \n",
195-
"upw 100% \n",
196+
"upw 89% \n",
197+
"upw 99% \n",
196198
"upw 100%\n"
197199
]
198200
}
@@ -204,7 +206,7 @@
204206
},
205207
{
206208
"cell_type": "code",
207-
"execution_count": 18,
209+
"execution_count": 8,
208210
"id": "2b7c568d",
209211
"metadata": {
210212
"scrolled": true
@@ -219,7 +221,7 @@
219221
},
220222
{
221223
"cell_type": "code",
222-
"execution_count": 19,
224+
"execution_count": 9,
223225
"id": "f79cff8b",
224226
"metadata": {
225227
"scrolled": true
@@ -230,91 +232,34 @@
230232
"output_type": "stream",
231233
"text": [
232234
"10.1016/j.omtn.2023.03.012 publishedVersion can be shared 🎉\n",
233-
"10.1084/jem.20221292 publishedVersion can be shared 🎉\n",
234-
"10.1016/j.eurox.2023.100190 publishedVersion can be shared 🎉\n",
235235
"10.1016/j.resplu.2023.100381 publishedVersion can be shared 🎉\n",
236-
"10.1016/j.resmer.2023.100999 acceptedVersion , no embargo\n",
237-
"10.1016/j.scr.2023.103074 publishedVersion can be shared 🎉\n",
238236
"10.1016/j.jgar.2023.02.005 publishedVersion can be shared 🎉\n",
239237
"10.1016/j.resmer.2022.100981 acceptedVersion , no embargo\n",
240-
"10.1530/ERC-22-0405 acceptedVersion , no embargo\n",
241-
"doi problem w permissions 10.1016/j.kint.2023.02.013\n",
242-
"doi problem w permissions 10.1016/j.jaccas.2023.101767\n",
243-
"10.1182/blood.2022017019 publishedVersion can be shared 🎉\n",
244-
"doi problem w permissions 10.1148/radiol.221835\n",
245-
"doi problem w permissions 10.1016/j.jaccao.2023.03.002\n",
246238
"10.1530/REP-22-0416 acceptedVersion , no embargo\n",
247-
"10.1016/j.banm.2023.01.024 acceptedVersion , no embargo\n",
248239
"10.1200/JCO.22.00437 publishedVersion can be shared 🎉\n",
249-
"10.1136/jmg-2022-108435 acceptedVersion , no embargo\n",
250240
"10.1177/19714009221109885 acceptedVersion , no embargo\n",
251241
"10.1093/bjd/ljac134 acceptedVersion , no embargo\n",
252242
"10.1093/gerona/glac234 acceptedVersion , no embargo\n",
253243
"10.1002/14651858.CD015490 acceptedVersion , no embargo\n",
254244
"10.1182/blood.2022016943 publishedVersion can be shared 🎉\n",
245+
"10.15252/emmm.202216320 publishedVersion can be shared 🎉\n",
255246
"doi problem w permissions 10.4244/EIJ-D-22-00723\n",
256-
"10.1128/aac.01130-22 acceptedVersion , no embargo\n",
257247
"10.1002/jmd2.12358 publishedVersion can be shared 🎉\n",
258248
"10.1093/clinchem/hvac201 acceptedVersion , no embargo\n",
259-
"10.1126/scitranslmed.add5275 acceptedVersion , no embargo\n",
260249
"10.1016/j.banm.2022.10.019 acceptedVersion , no embargo\n",
261250
"10.1016/j.idnow.2023.01.001 acceptedVersion , no embargo\n",
262251
"10.1002/adbi.202200224 acceptedVersion , no embargo\n",
263252
"10.1136/thorax-2022-219086 acceptedVersion , no embargo\n",
264253
"10.1002/nop2.1394 publishedVersion can be shared 🎉\n",
265254
"10.1136/archdischild-2022-324321 acceptedVersion , no embargo\n",
266255
"10.1136/sextrans-2021-055364 acceptedVersion , no embargo\n",
267-
"10.1093/ehjqcco/qcac029 acceptedVersion , no embargo\n",
268256
"10.1016/j.isci.2023.106019 publishedVersion can be shared 🎉\n",
269-
"10.1126/sciimmunol.abq5204 acceptedVersion , no embargo\n",
270-
"10.1182/bloodadvances.2022007464 publishedVersion can be shared 🎉\n",
271257
"10.1093/ejendo/lvad012 acceptedVersion , no embargo\n",
272258
"doi problem w permissions 10.1684/vir.2023.0985\n",
273259
"10.1128/jcm.01457-22 acceptedVersion , no embargo\n",
274260
"10.1128/aac.00871-22 acceptedVersion , no embargo\n",
275261
"10.1159/000525552 acceptedVersion , no embargo\n",
276-
"doi problem w permissions 10.1148/radiol.211658\n",
277-
"10.1530/ERC-22-0198 acceptedVersion , no embargo\n",
278-
"10.1177/03913988221143803 acceptedVersion , no embargo\n",
279-
"10.1016/j.gim.2022.10.006 acceptedVersion , no embargo\n",
280-
"10.1016/j.idnow.2022.09.020 acceptedVersion , no embargo\n",
281-
"10.1016/j.idnow.2022.08.005 acceptedVersion , no embargo\n",
282-
"10.1002/nop2.1323 publishedVersion can be shared 🎉\n",
283-
"10.1126/sciimmunol.ade1413 acceptedVersion , no embargo\n",
284-
"10.1200/JCO.22.01780 publishedVersion can be shared 🎉\n",
285-
"10.1182/blood.2022015482 publishedVersion can be shared 🎉\n",
286-
"10.1182/blood.2022017277 publishedVersion can be shared 🎉\n",
287-
"10.1159/000528588 acceptedVersion , no embargo\n",
288-
"doi problem w permissions 10.3166/afmu-2022-0472\n",
289-
"10.1182/blood.2022017273 publishedVersion can be shared 🎉\n",
290-
"10.1016/j.bj.2023.02.001 publishedVersion can be shared 🎉\n",
291-
"10.1177/00220345231154569 acceptedVersion , no embargo\n",
292-
"10.1002/1878-0261.13412 publishedVersion can be shared 🎉\n",
293-
"10.1177/09622802231160554 acceptedVersion , no embargo\n",
294-
"doi problem w permissions 10.1007/978-981-19-7376-5_12\n",
295-
"doi problem w permissions 10.1007/978-1-0716-2938-3_6\n",
296-
"doi problem w permissions 10.1007/978-1-0716-2938-3_20\n",
297-
"doi problem w permissions 10.1007/978-1-0716-2938-3_9\n",
298-
"doi problem w permissions 10.1007/978-3-031-21358-8_14\n",
299-
"doi problem w permissions 10.1684/vir.2023.986\n",
300-
"10.1056/NEJMoa2203769 publishedVersion can be shared 🎉\n",
301-
"doi problem w permissions 10.1007/978-1-0716-2954-3_25\n",
302-
"doi problem w permissions 10.1007/978-1-0716-2954-3_26\n",
303-
"doi problem w permissions 10.1007/978-1-0716-2954-3_11\n",
304-
"10.1177/23969873221150022 acceptedVersion , no embargo\n",
305-
"doi problem w permissions 10.4103/aca.aca_79_22\n",
306-
"doi problem w permissions 10.4244/EIJ-D-22-00725\n",
307-
"10.1177/17588359221146132 publishedVersion can be shared 🎉\n",
308-
"10.1177/02841851221138519 acceptedVersion , no embargo\n",
309-
"10.1177/20406207221145627 publishedVersion can be shared 🎉\n",
310-
"10.1093/eurjpc/zwac203 acceptedVersion , no embargo\n",
311-
"doi problem w permissions 10.3174/ajnr.A7733\n",
312-
"doi problem w permissions 10.1148/rycan.220051\n",
313-
"doi problem w permissions 10.4088/JCP.21M14236\n",
314-
"10.1016/j.ebiom.2022.104414 publishedVersion can be shared 🎉\n",
315-
"doi problem w permissions 10.4088/JCP.21m14277\n",
316-
"doi problem w permissions 10.4088/JCP.22m14448\n",
317-
"doi problem w permissions 10.1016/B978-0-323-85419-1.00010-4\n"
262+
"doi problem w permissions 10.1148/radiol.211658\n"
318263
]
319264
}
320265
],
@@ -325,7 +270,7 @@
325270
},
326271
{
327272
"cell_type": "code",
328-
"execution_count": 20,
273+
"execution_count": 10,
329274
"id": "b09e3df9",
330275
"metadata": {
331276
"scrolled": true
@@ -346,7 +291,7 @@
346291
},
347292
{
348293
"cell_type": "code",
349-
"execution_count": 21,
294+
"execution_count": 11,
350295
"id": "05f25ddb",
351296
"metadata": {
352297
"scrolled": true
@@ -356,13 +301,13 @@
356301
"name": "stdout",
357302
"output_type": "stream",
358303
"text": [
359-
"nb DOI retiré car marqué 'open' dans HAL et repository dans upw 6\n"
304+
"nb DOI retiré car marqué 'open' dans HAL et repository dans upw 4\n"
360305
]
361306
}
362307
],
363308
"source": [
364309
"df_final = df_upw\n",
365-
"df_final.fillna(\"\", inplace = True)# sinon nan sera compris comme une string non vide\n",
310+
"df_final.fillna(\"\", inplace = True) # sinon nan sera compris comme une string non vide\n",
366311
"\n",
367312
"## retirer ce qui est dans HAL qui a un lien extérieur et qui est signalé en repository dans upw\n",
368313
"index2remove = df_final[ (df_final[\"linkExtId\"] != \"\") & (df_final[\"oa_repo_link\"] != \"\") ].index\n",
@@ -372,11 +317,34 @@
372317
},
373318
{
374319
"cell_type": "code",
375-
"execution_count": 22,
320+
"execution_count": 12,
376321
"id": "1b7f56c3",
377-
"metadata": {
378-
"scrolled": true
379-
},
322+
"metadata": {},
323+
"outputs": [],
324+
"source": [
325+
"# deduire la colonne todo\n",
326+
"df_final[\"todo\"] = df_final.apply(lambda row : deduce_todo(row), axis = 1)"
327+
]
328+
},
329+
{
330+
"cell_type": "code",
331+
"execution_count": 13,
332+
"id": "e00e847f",
333+
"metadata": {},
334+
"outputs": [],
335+
"source": [
336+
"## retrait des colonnes non utilisées\n",
337+
"remove_cols = [\"submitType\", \"has_issn\"]\n",
338+
"for col in remove_cols : \n",
339+
" if col in df_final.columns :\n",
340+
" del df_final[col]"
341+
]
342+
},
343+
{
344+
"cell_type": "code",
345+
"execution_count": 15,
346+
"id": "2681445f",
347+
"metadata": {},
380348
"outputs": [
381349
{
382350
"name": "stdout",
@@ -385,59 +353,46 @@
385353
"\n",
386354
"Statistiques\n",
387355
"\n",
388-
"nb de DOI a traiter\t\t\t905\n",
389-
"ecrire a l auteur pour appliquer la LRN 428\n",
390-
"selon la licence ajouter le PDF editeur 308\n",
391-
"creer ou retrouver la notice 135\n",
392-
"recuperer le PDF editeur et ecrire a l auteur pour accord 24\n",
393-
"verifier les identifiants de la notice 5\n",
356+
"nb de DOI a traiter\t\t\t348\n",
357+
"ecrire a l auteur pour appliquer la LRN 144\n",
358+
"selon la licence ajouter le PDF editeur 129\n",
359+
"creer ou retrouver la notice 63\n",
360+
"recuperer le PDF editeur et ecrire a l auteur pour accord 9\n",
361+
"verifier les identifiants de la notice 2\n",
394362
"Name: todo, dtype: int64\n"
395363
]
396364
}
397365
],
398366
"source": [
399-
"# deduire la colonne todo\n",
400-
"df_final[\"todo\"] = df_final.apply(lambda row : deduce_todo(row), axis = 1)\n",
401-
"df_final.to_csv(\"data\\dois_a_traiter.csv\", index = False)\n",
402367
"#imprimer des statistiques brutes\n",
403368
"print(f\"\\nStatistiques\\n\\nnb de DOI a traiter\\t\\t\\t{len(df_final)}\\n{df_final['todo'].value_counts()}\")"
404369
]
405370
},
406371
{
407372
"cell_type": "code",
408-
"execution_count": 23,
373+
"execution_count": 16,
409374
"id": "62c68955",
410375
"metadata": {},
411376
"outputs": [
412377
{
413378
"data": {
414379
"text/plain": [
415-
"Index(['doi', 'halId', 'submitType', 'linkExtId', 'upw_state',\n",
416-
" 'published_date', 'oa_publisher_license', 'oa_publisher_link',\n",
417-
" 'oa_repo_link', 'has_issn', 'deposit_condition', 'todo'],\n",
380+
"Index(['doi', 'halId', 'linkExtId', 'upw_state', 'published_date',\n",
381+
" 'oa_publisher_license', 'oa_publisher_link', 'oa_repo_link',\n",
382+
" 'deposit_condition', 'todo'],\n",
418383
" dtype='object')"
419384
]
420385
},
421-
"execution_count": 23,
386+
"execution_count": 16,
422387
"metadata": {},
423388
"output_type": "execute_result"
424389
}
425390
],
426391
"source": [
392+
"df_final.to_csv(\"data\\dois_a_traiter.csv\", index = False)\n",
427393
"df_final.columns"
428394
]
429395
},
430-
{
431-
"cell_type": "code",
432-
"execution_count": 24,
433-
"id": "b40b4884",
434-
"metadata": {},
435-
"outputs": [],
436-
"source": [
437-
"# retrait des colonnes inutiles \n",
438-
"df_final.drop(columns= [\"submitType\", \"has_issn\"], inplace = True)"
439-
]
440-
},
441396
{
442397
"cell_type": "code",
443398
"execution_count": 25,
@@ -461,7 +416,7 @@
461416
"metadata": {},
462417
"outputs": [],
463418
"source": [
464-
"## exporter le jeux de données final\n",
419+
"## exporter les données sous forme de tableau pour libreOffice\n",
465420
"df_final.to_csv(\"data\\dois_a_traiter_formules.csv\", index = False)"
466421
]
467422
},

0 commit comments

Comments
 (0)