|
27 | 27 | }, |
28 | 28 | { |
29 | 29 | "cell_type": "code", |
30 | | - "execution_count": 13, |
| 30 | + "execution_count": 2, |
31 | 31 | "id": "a2532e32", |
32 | 32 | "metadata": { |
33 | 33 | "scrolled": true |
|
37 | 37 | "name": "stdout", |
38 | 38 | "output_type": "stream", |
39 | 39 | "text": [ |
40 | | - "nb de DOI importés depuis fichier \t1000\n" |
| 40 | + "nb de DOI importés depuis fichier \t400\n" |
41 | 41 | ] |
42 | 42 | } |
43 | 43 | ], |
|
53 | 53 | " ## préciser le nom de cette colonne (DOI, doiId_s, etc.)\n", |
54 | 54 | " doi_col_name = \"DOI\"\n", |
55 | 55 | " \n", |
56 | | - " df_mydois = pd.read_csv(\"data/2023-04-halathon-scopus-sante.csv\", usecols=[doi_col_name])\n", |
| 56 | + " df_mydois = pd.read_csv(\"data/2023-05-halathon-scopus-sante.csv\", usecols = [doi_col_name])\n", |
57 | 57 | " \n", |
58 | 58 | " # dans la suite du code la colonne doit être nommée doi\n", |
59 | 59 | " df_mydois.rename(columns = {doi_col_name : \"doi\"}, inplace = True)\n", |
60 | 60 | " \n", |
61 | 61 | " ## limiter le nombre de lignes/publications\n", |
62 | | - " df_mydois = df_mydois[:1000]\n", |
| 62 | + " df_mydois = df_mydois[:400]\n", |
63 | 63 | " print(f\"nb de DOI importés depuis fichier \\t{len(df_mydois)}\")\n", |
64 | 64 | "else : \n", |
65 | 65 | " df_mydois = pd.DataFrame()" |
66 | 66 | ] |
67 | 67 | }, |
68 | 68 | { |
69 | 69 | "cell_type": "code", |
70 | | - "execution_count": 14, |
| 70 | + "execution_count": 3, |
71 | 71 | "id": "0f72220f", |
72 | 72 | "metadata": { |
73 | 73 | "scrolled": true |
|
104 | 104 | }, |
105 | 105 | { |
106 | 106 | "cell_type": "code", |
107 | | - "execution_count": 15, |
| 107 | + "execution_count": 4, |
108 | 108 | "id": "a1ceb691", |
109 | 109 | "metadata": { |
110 | 110 | "scrolled": true |
|
114 | 114 | "name": "stdout", |
115 | 115 | "output_type": "stream", |
116 | 116 | "text": [ |
117 | | - " nb de DOI a traiter \t1000\n", |
118 | | - " /!\\ temps estimé ~33 minutes\n" |
| 117 | + " nb de DOI a traiter \t400\n", |
| 118 | + " /!\\ temps estimé ~13 minutes\n" |
119 | 119 | ] |
120 | 120 | } |
121 | 121 | ], |
122 | 122 | "source": [ |
123 | 123 | "# retrait des publications sans DOI\n", |
124 | 124 | "df.dropna(subset=[\"doi\"], inplace = True)\n", |
| 125 | + "\n", |
125 | 126 | "# feedback nb de DOI et tps de traitement\n", |
126 | 127 | "print(f\" nb de DOI a traiter \\t{len(df)}\")\n", |
127 | 128 | "print(f\" /!\\ temps estimé ~{round(len(df) * 40/1200)} minutes\")" |
|
137 | 138 | }, |
138 | 139 | { |
139 | 140 | "cell_type": "code", |
140 | | - "execution_count": 16, |
| 141 | + "execution_count": 5, |
141 | 142 | "id": "1285c198", |
142 | 143 | "metadata": { |
143 | 144 | "scrolled": true |
|
147 | 148 | "name": "stdout", |
148 | 149 | "output_type": "stream", |
149 | 150 | "text": [ |
150 | | - "nb DOI a verifier dans HAL 1000\n", |
| 151 | + "nb DOI a verifier dans HAL 400\n", |
151 | 152 | "hal 10% \n", |
152 | 153 | "hal 20% \n", |
153 | 154 | "hal 30% \n", |
|
158 | 159 | "hal 80% \n", |
159 | 160 | "hal 90% \n", |
160 | 161 | "hal 100%\n", |
161 | | - "nb de DOI après retrait de ceux en TI dans HAL 911\n" |
| 162 | + "nb de DOI après retrait de ceux en TI dans HAL 352\n" |
162 | 163 | ] |
163 | 164 | } |
164 | 165 | ], |
165 | 166 | "source": [ |
166 | 167 | "# 2.1. pour chaque publication, déduire la présence dans HAL\n", |
167 | 168 | "df_hal = enrich_w_hal(df) # renseigner df[:50].copy() pour tester uniquement sur les 50 premiers DOI\n", |
| 169 | + "\n", |
168 | 170 | "#retirer ce qui est déjà déposé avec fichier\n", |
169 | 171 | "df_no_file = df_hal[ df_hal[\"submitType\"] != \"file\" ].copy()\n", |
170 | 172 | "print(f\"nb de DOI après retrait de ceux en TI dans HAL {len(df_no_file)}\")" |
171 | 173 | ] |
172 | 174 | }, |
173 | 175 | { |
174 | 176 | "cell_type": "code", |
175 | | - "execution_count": 17, |
| 177 | + "execution_count": 7, |
176 | 178 | "id": "a6ef3e24", |
177 | 179 | "metadata": { |
178 | 180 | "scrolled": true |
|
182 | 184 | "name": "stdout", |
183 | 185 | "output_type": "stream", |
184 | 186 | "text": [ |
185 | | - "nb DOI a verifier dans upw \t911\n", |
| 187 | + "nb DOI a verifier dans upw \t352\n", |
186 | 188 | "upw 10% \n", |
187 | 189 | "upw 20% \n", |
188 | 190 | "upw 30% \n", |
|
191 | 193 | "upw 60% \n", |
192 | 194 | "upw 70% \n", |
193 | 195 | "upw 80% \n", |
194 | | - "upw 90% \n", |
195 | | - "upw 100% \n", |
| 196 | + "upw 89% \n", |
| 197 | + "upw 99% \n", |
196 | 198 | "upw 100%\n" |
197 | 199 | ] |
198 | 200 | } |
|
204 | 206 | }, |
205 | 207 | { |
206 | 208 | "cell_type": "code", |
207 | | - "execution_count": 18, |
| 209 | + "execution_count": 8, |
208 | 210 | "id": "2b7c568d", |
209 | 211 | "metadata": { |
210 | 212 | "scrolled": true |
|
219 | 221 | }, |
220 | 222 | { |
221 | 223 | "cell_type": "code", |
222 | | - "execution_count": 19, |
| 224 | + "execution_count": 9, |
223 | 225 | "id": "f79cff8b", |
224 | 226 | "metadata": { |
225 | 227 | "scrolled": true |
|
230 | 232 | "output_type": "stream", |
231 | 233 | "text": [ |
232 | 234 | "10.1016/j.omtn.2023.03.012 publishedVersion can be shared 🎉\n", |
233 | | - "10.1084/jem.20221292 publishedVersion can be shared 🎉\n", |
234 | | - "10.1016/j.eurox.2023.100190 publishedVersion can be shared 🎉\n", |
235 | 235 | "10.1016/j.resplu.2023.100381 publishedVersion can be shared 🎉\n", |
236 | | - "10.1016/j.resmer.2023.100999 acceptedVersion , no embargo\n", |
237 | | - "10.1016/j.scr.2023.103074 publishedVersion can be shared 🎉\n", |
238 | 236 | "10.1016/j.jgar.2023.02.005 publishedVersion can be shared 🎉\n", |
239 | 237 | "10.1016/j.resmer.2022.100981 acceptedVersion , no embargo\n", |
240 | | - "10.1530/ERC-22-0405 acceptedVersion , no embargo\n", |
241 | | - "doi problem w permissions 10.1016/j.kint.2023.02.013\n", |
242 | | - "doi problem w permissions 10.1016/j.jaccas.2023.101767\n", |
243 | | - "10.1182/blood.2022017019 publishedVersion can be shared 🎉\n", |
244 | | - "doi problem w permissions 10.1148/radiol.221835\n", |
245 | | - "doi problem w permissions 10.1016/j.jaccao.2023.03.002\n", |
246 | 238 | "10.1530/REP-22-0416 acceptedVersion , no embargo\n", |
247 | | - "10.1016/j.banm.2023.01.024 acceptedVersion , no embargo\n", |
248 | 239 | "10.1200/JCO.22.00437 publishedVersion can be shared 🎉\n", |
249 | | - "10.1136/jmg-2022-108435 acceptedVersion , no embargo\n", |
250 | 240 | "10.1177/19714009221109885 acceptedVersion , no embargo\n", |
251 | 241 | "10.1093/bjd/ljac134 acceptedVersion , no embargo\n", |
252 | 242 | "10.1093/gerona/glac234 acceptedVersion , no embargo\n", |
253 | 243 | "10.1002/14651858.CD015490 acceptedVersion , no embargo\n", |
254 | 244 | "10.1182/blood.2022016943 publishedVersion can be shared 🎉\n", |
| 245 | + "10.15252/emmm.202216320 publishedVersion can be shared 🎉\n", |
255 | 246 | "doi problem w permissions 10.4244/EIJ-D-22-00723\n", |
256 | | - "10.1128/aac.01130-22 acceptedVersion , no embargo\n", |
257 | 247 | "10.1002/jmd2.12358 publishedVersion can be shared 🎉\n", |
258 | 248 | "10.1093/clinchem/hvac201 acceptedVersion , no embargo\n", |
259 | | - "10.1126/scitranslmed.add5275 acceptedVersion , no embargo\n", |
260 | 249 | "10.1016/j.banm.2022.10.019 acceptedVersion , no embargo\n", |
261 | 250 | "10.1016/j.idnow.2023.01.001 acceptedVersion , no embargo\n", |
262 | 251 | "10.1002/adbi.202200224 acceptedVersion , no embargo\n", |
263 | 252 | "10.1136/thorax-2022-219086 acceptedVersion , no embargo\n", |
264 | 253 | "10.1002/nop2.1394 publishedVersion can be shared 🎉\n", |
265 | 254 | "10.1136/archdischild-2022-324321 acceptedVersion , no embargo\n", |
266 | 255 | "10.1136/sextrans-2021-055364 acceptedVersion , no embargo\n", |
267 | | - "10.1093/ehjqcco/qcac029 acceptedVersion , no embargo\n", |
268 | 256 | "10.1016/j.isci.2023.106019 publishedVersion can be shared 🎉\n", |
269 | | - "10.1126/sciimmunol.abq5204 acceptedVersion , no embargo\n", |
270 | | - "10.1182/bloodadvances.2022007464 publishedVersion can be shared 🎉\n", |
271 | 257 | "10.1093/ejendo/lvad012 acceptedVersion , no embargo\n", |
272 | 258 | "doi problem w permissions 10.1684/vir.2023.0985\n", |
273 | 259 | "10.1128/jcm.01457-22 acceptedVersion , no embargo\n", |
274 | 260 | "10.1128/aac.00871-22 acceptedVersion , no embargo\n", |
275 | 261 | "10.1159/000525552 acceptedVersion , no embargo\n", |
276 | | - "doi problem w permissions 10.1148/radiol.211658\n", |
277 | | - "10.1530/ERC-22-0198 acceptedVersion , no embargo\n", |
278 | | - "10.1177/03913988221143803 acceptedVersion , no embargo\n", |
279 | | - "10.1016/j.gim.2022.10.006 acceptedVersion , no embargo\n", |
280 | | - "10.1016/j.idnow.2022.09.020 acceptedVersion , no embargo\n", |
281 | | - "10.1016/j.idnow.2022.08.005 acceptedVersion , no embargo\n", |
282 | | - "10.1002/nop2.1323 publishedVersion can be shared 🎉\n", |
283 | | - "10.1126/sciimmunol.ade1413 acceptedVersion , no embargo\n", |
284 | | - "10.1200/JCO.22.01780 publishedVersion can be shared 🎉\n", |
285 | | - "10.1182/blood.2022015482 publishedVersion can be shared 🎉\n", |
286 | | - "10.1182/blood.2022017277 publishedVersion can be shared 🎉\n", |
287 | | - "10.1159/000528588 acceptedVersion , no embargo\n", |
288 | | - "doi problem w permissions 10.3166/afmu-2022-0472\n", |
289 | | - "10.1182/blood.2022017273 publishedVersion can be shared 🎉\n", |
290 | | - "10.1016/j.bj.2023.02.001 publishedVersion can be shared 🎉\n", |
291 | | - "10.1177/00220345231154569 acceptedVersion , no embargo\n", |
292 | | - "10.1002/1878-0261.13412 publishedVersion can be shared 🎉\n", |
293 | | - "10.1177/09622802231160554 acceptedVersion , no embargo\n", |
294 | | - "doi problem w permissions 10.1007/978-981-19-7376-5_12\n", |
295 | | - "doi problem w permissions 10.1007/978-1-0716-2938-3_6\n", |
296 | | - "doi problem w permissions 10.1007/978-1-0716-2938-3_20\n", |
297 | | - "doi problem w permissions 10.1007/978-1-0716-2938-3_9\n", |
298 | | - "doi problem w permissions 10.1007/978-3-031-21358-8_14\n", |
299 | | - "doi problem w permissions 10.1684/vir.2023.986\n", |
300 | | - "10.1056/NEJMoa2203769 publishedVersion can be shared 🎉\n", |
301 | | - "doi problem w permissions 10.1007/978-1-0716-2954-3_25\n", |
302 | | - "doi problem w permissions 10.1007/978-1-0716-2954-3_26\n", |
303 | | - "doi problem w permissions 10.1007/978-1-0716-2954-3_11\n", |
304 | | - "10.1177/23969873221150022 acceptedVersion , no embargo\n", |
305 | | - "doi problem w permissions 10.4103/aca.aca_79_22\n", |
306 | | - "doi problem w permissions 10.4244/EIJ-D-22-00725\n", |
307 | | - "10.1177/17588359221146132 publishedVersion can be shared 🎉\n", |
308 | | - "10.1177/02841851221138519 acceptedVersion , no embargo\n", |
309 | | - "10.1177/20406207221145627 publishedVersion can be shared 🎉\n", |
310 | | - "10.1093/eurjpc/zwac203 acceptedVersion , no embargo\n", |
311 | | - "doi problem w permissions 10.3174/ajnr.A7733\n", |
312 | | - "doi problem w permissions 10.1148/rycan.220051\n", |
313 | | - "doi problem w permissions 10.4088/JCP.21M14236\n", |
314 | | - "10.1016/j.ebiom.2022.104414 publishedVersion can be shared 🎉\n", |
315 | | - "doi problem w permissions 10.4088/JCP.21m14277\n", |
316 | | - "doi problem w permissions 10.4088/JCP.22m14448\n", |
317 | | - "doi problem w permissions 10.1016/B978-0-323-85419-1.00010-4\n" |
| 262 | + "doi problem w permissions 10.1148/radiol.211658\n" |
318 | 263 | ] |
319 | 264 | } |
320 | 265 | ], |
|
325 | 270 | }, |
326 | 271 | { |
327 | 272 | "cell_type": "code", |
328 | | - "execution_count": 20, |
| 273 | + "execution_count": 10, |
329 | 274 | "id": "b09e3df9", |
330 | 275 | "metadata": { |
331 | 276 | "scrolled": true |
|
346 | 291 | }, |
347 | 292 | { |
348 | 293 | "cell_type": "code", |
349 | | - "execution_count": 21, |
| 294 | + "execution_count": 11, |
350 | 295 | "id": "05f25ddb", |
351 | 296 | "metadata": { |
352 | 297 | "scrolled": true |
|
356 | 301 | "name": "stdout", |
357 | 302 | "output_type": "stream", |
358 | 303 | "text": [ |
359 | | - "nb DOI retiré car marqué 'open' dans HAL et repository dans upw 6\n" |
| 304 | + "nb DOI retiré car marqué 'open' dans HAL et repository dans upw 4\n" |
360 | 305 | ] |
361 | 306 | } |
362 | 307 | ], |
363 | 308 | "source": [ |
364 | 309 | "df_final = df_upw\n", |
365 | | - "df_final.fillna(\"\", inplace = True)# sinon nan sera compris comme une string non vide\n", |
| 310 | + "df_final.fillna(\"\", inplace = True) # sinon nan sera compris comme une string non vide\n", |
366 | 311 | "\n", |
367 | 312 | "## retirer ce qui est dans HAL qui a un lien extérieur et qui est signalé en repository dans upw\n", |
368 | 313 | "index2remove = df_final[ (df_final[\"linkExtId\"] != \"\") & (df_final[\"oa_repo_link\"] != \"\") ].index\n", |
|
372 | 317 | }, |
373 | 318 | { |
374 | 319 | "cell_type": "code", |
375 | | - "execution_count": 22, |
| 320 | + "execution_count": 12, |
376 | 321 | "id": "1b7f56c3", |
377 | | - "metadata": { |
378 | | - "scrolled": true |
379 | | - }, |
| 322 | + "metadata": {}, |
| 323 | + "outputs": [], |
| 324 | + "source": [ |
| 325 | + "# déduire la colonne todo\n", |
| 326 | + "df_final[\"todo\"] = df_final.apply(lambda row : deduce_todo(row), axis = 1)" |
| 327 | + ] |
| 328 | + }, |
| 329 | + { |
| 330 | + "cell_type": "code", |
| 331 | + "execution_count": 13, |
| 332 | + "id": "e00e847f", |
| 333 | + "metadata": {}, |
| 334 | + "outputs": [], |
| 335 | + "source": [ |
| 336 | + "## retrait des colonnes non utilisées\n", |
| 337 | + "remove_cols = [\"submitType\", \"has_issn\"]\n", |
| 338 | + "for col in remove_cols : \n", |
| 339 | + " if col in df_final.columns :\n", |
| 340 | + " del df_final[col]" |
| 341 | + ] |
| 342 | + }, |
| 343 | + { |
| 344 | + "cell_type": "code", |
| 345 | + "execution_count": 15, |
| 346 | + "id": "2681445f", |
| 347 | + "metadata": {}, |
380 | 348 | "outputs": [ |
381 | 349 | { |
382 | 350 | "name": "stdout", |
|
385 | 353 | "\n", |
386 | 354 | "Statistiques\n", |
387 | 355 | "\n", |
388 | | - "nb de DOI a traiter\t\t\t905\n", |
389 | | - "ecrire a l auteur pour appliquer la LRN 428\n", |
390 | | - "selon la licence ajouter le PDF editeur 308\n", |
391 | | - "creer ou retrouver la notice 135\n", |
392 | | - "recuperer le PDF editeur et ecrire a l auteur pour accord 24\n", |
393 | | - "verifier les identifiants de la notice 5\n", |
| 356 | + "nb de DOI a traiter\t\t\t348\n", |
| 357 | + "ecrire a l auteur pour appliquer la LRN 144\n", |
| 358 | + "selon la licence ajouter le PDF editeur 129\n", |
| 359 | + "creer ou retrouver la notice 63\n", |
| 360 | + "recuperer le PDF editeur et ecrire a l auteur pour accord 9\n", |
| 361 | + "verifier les identifiants de la notice 2\n", |
394 | 362 | "Name: todo, dtype: int64\n" |
395 | 363 | ] |
396 | 364 | } |
397 | 365 | ], |
398 | 366 | "source": [ |
399 | | - "# deduire la colonne todo\n", |
400 | | - "df_final[\"todo\"] = df_final.apply(lambda row : deduce_todo(row), axis = 1)\n", |
401 | | - "df_final.to_csv(\"data\\dois_a_traiter.csv\", index = False)\n", |
402 | 367 | "#imprimer des statistiques brutes\n", |
403 | 368 | "print(f\"\\nStatistiques\\n\\nnb de DOI a traiter\\t\\t\\t{len(df_final)}\\n{df_final['todo'].value_counts()}\")" |
404 | 369 | ] |
405 | 370 | }, |
406 | 371 | { |
407 | 372 | "cell_type": "code", |
408 | | - "execution_count": 23, |
| 373 | + "execution_count": 16, |
409 | 374 | "id": "62c68955", |
410 | 375 | "metadata": {}, |
411 | 376 | "outputs": [ |
412 | 377 | { |
413 | 378 | "data": { |
414 | 379 | "text/plain": [ |
415 | | - "Index(['doi', 'halId', 'submitType', 'linkExtId', 'upw_state',\n", |
416 | | - " 'published_date', 'oa_publisher_license', 'oa_publisher_link',\n", |
417 | | - " 'oa_repo_link', 'has_issn', 'deposit_condition', 'todo'],\n", |
| 380 | + "Index(['doi', 'halId', 'linkExtId', 'upw_state', 'published_date',\n", |
| 381 | + " 'oa_publisher_license', 'oa_publisher_link', 'oa_repo_link',\n", |
| 382 | + " 'deposit_condition', 'todo'],\n", |
418 | 383 | " dtype='object')" |
419 | 384 | ] |
420 | 385 | }, |
421 | | - "execution_count": 23, |
| 386 | + "execution_count": 16, |
422 | 387 | "metadata": {}, |
423 | 388 | "output_type": "execute_result" |
424 | 389 | } |
425 | 390 | ], |
426 | 391 | "source": [ |
| 392 | + "df_final.to_csv(\"data\\dois_a_traiter.csv\", index = False)\n", |
427 | 393 | "df_final.columns" |
428 | 394 | ] |
429 | 395 | }, |
430 | | - { |
431 | | - "cell_type": "code", |
432 | | - "execution_count": 24, |
433 | | - "id": "b40b4884", |
434 | | - "metadata": {}, |
435 | | - "outputs": [], |
436 | | - "source": [ |
437 | | - "# retrait des colonnes inutiles \n", |
438 | | - "df_final.drop(columns= [\"submitType\", \"has_issn\"], inplace = True)" |
439 | | - ] |
440 | | - }, |
441 | 396 | { |
442 | 397 | "cell_type": "code", |
443 | 398 | "execution_count": 25, |
|
461 | 416 | "metadata": {}, |
462 | 417 | "outputs": [], |
463 | 418 | "source": [ |
464 | | - "## exporter le jeux de données final\n", |
| 419 | + "## exporter les données sous forme de tableau pour LibreOffice\n", |
465 | 420 | "df_final.to_csv(\"data\\dois_a_traiter_formules.csv\", index = False)" |
466 | 421 | ] |
467 | 422 | }, |
|
0 commit comments