|
33 | 33 | "name": "stdout", |
34 | 34 | "output_type": "stream", |
35 | 35 | "text": [ |
36 | | - "nb de DOI importés depuis fichier \t1113\n" |
| 36 | + "nb de DOI importés depuis fichier \t500\n" |
37 | 37 | ] |
38 | 38 | } |
39 | 39 | ], |
40 | 40 | "source": [ |
41 | 41 | "## option A : utiliser un fichier local \n", |
42 | 42 | "\n", |
43 | | - "utiliser_fichier_local = True # True pour oui False pour non\n", |
| 43 | + "utiliser_fichier_local = True\n", |
44 | 44 | "\n", |
45 | 45 | "if utiliser_fichier_local : \n", |
46 | 46 | " ## un fichier local placé dans le dossier *data*\n", |
47 | 47 | " ## doit être un fichier .csv avec encodage utf8 et ',' comme séparateur\n", |
48 | 48 | " ## avec au moins une colonne *doi* \n", |
49 | | - " #(pour scopus remplacer DOI par doi)\n", |
50 | | - " df_mydois = pd.read_csv(\"data/2022-04-sciences-ipgp-scopus.csv\", usecols=[\"doi\"]) \n", |
| 49 | + "    ## préciser le nom de cette colonne (DOI, doiId_s, etc.)\n", |
| 50 | + " doi_col_name = \"DOI\"\n", |
| 51 | + " df_mydois = pd.read_csv(\"data/2022-05-05-scopus-sci-ipgp.csv\", usecols=[doi_col_name])\n", |
| 52 | + " # dans la suite du code la colonne doit être nommée doi\n", |
| 53 | + " df_mydois.rename(columns = {doi_col_name : \"doi\"}, inplace = True)\n", |
| 54 | + "    ## limiter le nombre de lignes/publications\n", |
| 55 | + " df_mydois = df_mydois[:500]\n", |
51 | 56 | " #df_mydois = pd.read_csv(\"data/up/2021_12_02_scopus_fac_sh__doi.csv\", usecols=[\"doi\"]) \n", |
52 | 57 | " print(f\"nb de DOI importés depuis fichier \\t{len(df_mydois)}\")\n", |
53 | 58 | "else : \n", |
|
101 | 106 | "name": "stdout", |
102 | 107 | "output_type": "stream", |
103 | 108 | "text": [ |
104 | | - " nb de DOI a traiter \t1113\n", |
105 | | - " /!\\ temps estimé ~37 minutes\n" |
| 109 | + " nb de DOI a traiter \t500\n", |
| 110 | + " /!\\ temps estimé ~17 minutes\n" |
106 | 111 | ] |
107 | 112 | } |
108 | 113 | ], |
109 | 114 | "source": [ |
| 115 | + "# retrait des publications sans DOI\n", |
| 116 | + "df.dropna(subset=[\"doi\"], inplace = True)\n", |
| 117 | + "# feedback nb de DOI et tps de traitement\n", |
110 | 118 | "print(f\" nb de DOI a traiter \\t{len(df)}\")\n", |
111 | 119 | "print(f\" /!\\ temps estimé ~{round(len(df) * 40/1200)} minutes\")" |
112 | 120 | ] |
|
129 | 137 | "name": "stdout", |
130 | 138 | "output_type": "stream", |
131 | 139 | "text": [ |
132 | | - "nb DOI a verifier dans HAL 1113\n", |
| 140 | + "nb DOI a verifier dans HAL 500\n", |
133 | 141 | "hal 10% \n", |
134 | 142 | "hal 20% \n", |
135 | 143 | "hal 30% \n", |
|
139 | 147 | "hal 70% \n", |
140 | 148 | "hal 80% \n", |
141 | 149 | "hal 90% \n", |
142 | | - "hal 100% \n", |
143 | 150 | "hal 100%\n", |
144 | | - "nb de DOI après retrait de ceux en TI Dans HAL 888\n" |
| 151 | + "nb de DOI après retrait de ceux en TI dans HAL 411\n" |
145 | 152 | ] |
146 | 153 | } |
147 | 154 | ], |
|
150 | 157 | "df_hal = enrich_w_hal(df) # renseigner df[:50].copy() pour tester uniquement sur les 50 premiers DOI\n", |
151 | 158 | "#retirer ce qui est déjà déposé avec fichier\n", |
152 | 159 | "df_no_file = df_hal[ df_hal[\"submitType\"] != \"file\" ].copy()\n", |
153 | | - "print(f\"nb de DOI après retrait de ceux en TI Dans HAL {len(df_no_file)}\")" |
| 160 | + "print(f\"nb de DOI après retrait de ceux en TI dans HAL {len(df_no_file)}\")" |
154 | 161 | ] |
155 | 162 | }, |
156 | 163 | { |
|
163 | 170 | "name": "stdout", |
164 | 171 | "output_type": "stream", |
165 | 172 | "text": [ |
166 | | - "nb DOI a verifier dans upw \t888\n", |
| 173 | + "nb DOI a verifier dans upw \t411\n", |
167 | 174 | "upw 10% \n", |
168 | 175 | "upw 20% \n", |
169 | 176 | "upw 30% \n", |
170 | 177 | "upw 40% \n", |
171 | 178 | "upw 50% \n", |
172 | | - "upw 59% \n", |
173 | | - "upw 69% \n", |
174 | | - "upw 79% \n", |
175 | | - "upw 89% \n", |
176 | | - "upw 99% \n", |
| 179 | + "upw 60% \n", |
| 180 | + "upw 70% \n", |
| 181 | + "upw 80% \n", |
| 182 | + "upw 90% \n", |
| 183 | + "upw 100% \n", |
177 | 184 | "upw 100%\n" |
178 | 185 | ] |
179 | 186 | } |
|
185 | 192 | }, |
186 | 193 | { |
187 | 194 | "cell_type": "code", |
188 | | - "execution_count": null, |
| 195 | + "execution_count": 8, |
189 | 196 | "id": "2b7c568d", |
190 | 197 | "metadata": {}, |
191 | 198 | "outputs": [], |
|
198 | 205 | }, |
199 | 206 | { |
200 | 207 | "cell_type": "code", |
201 | | - "execution_count": null, |
| 208 | + "execution_count": 9, |
202 | 209 | "id": "f79cff8b", |
203 | 210 | "metadata": {}, |
204 | | - "outputs": [], |
| 211 | + "outputs": [ |
| 212 | + { |
| 213 | + "name": "stdout", |
| 214 | + "output_type": "stream", |
| 215 | + "text": [ |
| 216 | + "doi problem w permissions 10.1038/s41598-022-10647-5\n", |
| 217 | + "10.1515/ijmr-2005-0215 publishedVersion accepted ! ouraaaaah \n", |
| 218 | + "10.1083/jcb.202110044 publishedVersion accepted ! ouraaaaah \n", |
| 219 | + "10.1093/gji/ggac026 publishedVersion accepted ! ouraaaaah \n", |
| 220 | + "10.1093/gji/ggac005 publishedVersion accepted ! ouraaaaah \n", |
| 221 | + "doi problem w permissions 10.1016/j.rinp.2022.105443\n", |
| 222 | + "10.1029/2021GL095557 publishedVersion accepted ! ouraaaaah \n", |
| 223 | + "10.1103/PhysRevB.105.165403 acceptedVersion , no embargo\n", |
| 224 | + "doi problem w permissions 10.1063/5.0081481\n", |
| 225 | + "doi problem w permissions 10.1063/5.0082016\n", |
| 226 | + "doi problem w permissions 10.1063/5.0081408\n", |
| 227 | + "10.1029/2021JB023715 publishedVersion accepted ! ouraaaaah \n", |
| 228 | + "10.1029/2022JB024131 publishedVersion accepted ! ouraaaaah \n", |
| 229 | + "doi problem w permissions 10.3847/1538-4365/ac45f7\n", |
| 230 | + "10.1107/S1600576722001406 acceptedVersion , no embargo\n", |
| 231 | + "doi problem w permissions 10.1515/crelle-2021-0088\n", |
| 232 | + "10.1109/TED.2022.3145767 acceptedVersion , no embargo\n", |
| 233 | + "doi problem w permissions 10.1073/pnas.2115258119\n", |
| 234 | + "10.1029/2021GL097156 publishedVersion accepted ! ouraaaaah \n", |
| 235 | + "10.1063/5.0079588 publishedVersion accepted ! ouraaaaah \n", |
| 236 | + "10.1029/2021JD036140 publishedVersion accepted ! ouraaaaah \n", |
| 237 | + "10.1029/2021GL096990 publishedVersion accepted ! ouraaaaah \n", |
| 238 | + "doi problem w permissions 10.1103/PhysRevD.105.064061\n", |
| 239 | + "10.1103/PhysRevB.105.125420 acceptedVersion , no embargo\n", |
| 240 | + "10.1103/PhysRevB.105.125112 acceptedVersion , no embargo\n", |
| 241 | + "10.1242/dev.200159 publishedVersion accepted ! ouraaaaah \n", |
| 242 | + "10.1063/5.0083282 publishedVersion accepted ! ouraaaaah \n", |
| 243 | + "10.1126/science.abo5791 acceptedVersion , no embargo\n", |
| 244 | + "10.1083/jcb.202011085 publishedVersion accepted ! ouraaaaah \n", |
| 245 | + "10.1103/PhysRevLett.128.094503 acceptedVersion , no embargo\n", |
| 246 | + "10.1182/BLOODADVANCES.2021005983 publishedVersion accepted ! ouraaaaah \n", |
| 247 | + "10.1109/TCDS.2020.2986411 acceptedVersion , no embargo\n", |
| 248 | + "10.1103/PhysRevC.105.034334 acceptedVersion , no embargo\n", |
| 249 | + "10.1103/PhysRevE.105.034504 acceptedVersion , no embargo\n", |
| 250 | + "10.1103/PhysRevC.105.034319 acceptedVersion , no embargo\n", |
| 251 | + "doi problem w permissions 10.4230/LIPIcs.STACS.2022.9\n", |
| 252 | + "doi problem w permissions 10.4230/LIPIcs.STACS.2022.35\n", |
| 253 | + "10.1128/aem.02378-21 acceptedVersion , no embargo\n" |
| 254 | + ] |
| 255 | + } |
| 256 | + ], |
205 | 257 | "source": [ |
206 | 258 | "# 2.2. ajouter les possibilités de dépôt via l'API Permissions\n", |
207 | 259 | "df_upw[\"deposit_condition\"] = df_upw.apply(lambda row : add_permissions(row), axis = 1)" |
208 | 260 | ] |
209 | 261 | }, |
210 | 262 | { |
211 | 263 | "cell_type": "code", |
212 | | - "execution_count": null, |
| 264 | + "execution_count": 10, |
213 | 265 | "id": "b09e3df9", |
214 | 266 | "metadata": {}, |
215 | 267 | "outputs": [], |
|
228 | 280 | }, |
229 | 281 | { |
230 | 282 | "cell_type": "code", |
231 | | - "execution_count": null, |
| 283 | + "execution_count": 11, |
232 | 284 | "id": "05f25ddb", |
233 | 285 | "metadata": {}, |
234 | | - "outputs": [], |
| 286 | + "outputs": [ |
| 287 | + { |
| 288 | + "name": "stdout", |
| 289 | + "output_type": "stream", |
| 290 | + "text": [ |
| 291 | + "nb DOI retiré car marqué 'open' dans HAL et repository dans upw 82\n" |
| 292 | + ] |
| 293 | + } |
| 294 | + ], |
235 | 295 | "source": [ |
236 | | - "df_final = df_permissions\n", |
| 296 | + "df_final = df_upw\n", |
237 | 297 | "df_final.fillna(\"\", inplace = True)# sinon nan sera compris comme une string non vide\n", |
238 | 298 | "\n", |
239 | 299 | "## retirer ce qui est dans HAL qui a un lien extérieur et qui est signalé en repository dans upw\n", |
|
244 | 304 | }, |
245 | 305 | { |
246 | 306 | "cell_type": "code", |
247 | | - "execution_count": null, |
| 307 | + "execution_count": 12, |
248 | 308 | "id": "1b7f56c3", |
249 | 309 | "metadata": {}, |
250 | | - "outputs": [], |
| 310 | + "outputs": [ |
| 311 | + { |
| 312 | + "name": "stdout", |
| 313 | + "output_type": "stream", |
| 314 | + "text": [ |
| 315 | + "\n", |
| 316 | + "Statistiques\n", |
| 317 | + "\n", |
| 318 | + "nb de DOI a traiter\t\t\t329\n", |
| 319 | + "creer ou retrouver la notice 123\n", |
| 320 | + "ecrire a l auteur pour appliquer la LRN 119\n", |
| 321 | + "selon la licence ajouter le PDF editeur 66\n", |
| 322 | + "recuperer le PDF editeur et ecrire a l auteur pour accord 15\n", |
| 323 | + "verifier les identifiants de la notice 2\n", |
| 324 | + "Name: todo, dtype: int64\n" |
| 325 | + ] |
| 326 | + } |
| 327 | + ], |
251 | 328 | "source": [ |
252 | 329 | "# deduire la colonne todo\n", |
253 | 330 | "df_final[\"todo\"] = df_final.apply(lambda row : deduce_todo(row), axis = 1)\n", |
|
258 | 335 | }, |
259 | 336 | { |
260 | 337 | "cell_type": "code", |
261 | | - "execution_count": null, |
| 338 | + "execution_count": 13, |
262 | 339 | "id": "b40b4884", |
263 | 340 | "metadata": {}, |
264 | 341 | "outputs": [], |
|
274 | 351 | "\n", |
275 | 352 | "df_final.to_csv(\"data\\dois_a_traiter_formules.csv\", index = False)" |
276 | 353 | ] |
| 354 | + }, |
| 355 | + { |
| 356 | + "cell_type": "code", |
| 357 | + "execution_count": null, |
| 358 | + "id": "c37e1cc4", |
| 359 | + "metadata": {}, |
| 360 | + "outputs": [], |
| 361 | + "source": [] |
277 | 362 | } |
278 | 363 | ], |
279 | 364 | "metadata": { |
|
0 commit comments