-from typing import Dict, List, Union
+import glob
+import logging
+import os
+import sys
+from typing import Dict, List, Literal, Union
 
 import arxiv
 import pandas as pd
+import pkg_resources
 from tqdm import tqdm
 
 from ..utils import dump_papers
-from .utils import get_query_from_keywords
+from ..xrxiv.xrxiv_query import XRXivQuery
+from .utils import get_query_from_keywords, infer_backend
+
+logging.basicConfig(stream=sys.stdout, level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+dump_root = pkg_resources.resource_filename("paperscraper", "server_dumps")
+
+global ARXIV_QUERIER
+ARXIV_QUERIER = None
+
+
+def search_local_arxiv():
+    global ARXIV_QUERIER
+    if ARXIV_QUERIER is not None:
+        return
+    dump_paths = glob.glob(os.path.join(dump_root, "arxiv*"))
+
+    if len(dump_paths) > 0:
+        path = sorted(dump_paths, reverse=True)[0]
+        querier = XRXivQuery(path)
+        if not querier.errored:
+            ARXIV_QUERIER = querier.search_keywords
+            logger.info(f"Loaded arxiv dump with {len(querier.df)} entries")
+
 
 arxiv_field_mapper = {
     "published": "date",
@@ -23,7 +52,36 @@
 }
 
 
-def get_arxiv_papers(
+def get_arxiv_papers_local(
+    keywords: List[Union[str, List[str]]],
+    fields: List[str] = None,
+    output_filepath: str = None,
+) -> pd.DataFrame:
60+ """
61+ Search for papers in the dump using keywords.
62+
63+ Args:
64+ keywords: Items will be AND separated. If items
65+ are lists themselves, they will be OR separated.
66+ fields: fields to be used in the query search.
67+ Defaults to None, a.k.a. search in all fields excluding date.
68+ output_filepath: optional output filepath where to store the hits in JSONL format.
69+ Defaults to None, a.k.a., no export to a file.
70+
71+ Returns:
72+ pd.DataFrame: A dataframe with one paper per row.
73+ """
74+ search_local_arxiv ()
75+ if ARXIV_QUERIER is None :
76+ raise ValueError (
77+ "Could not find local arxiv dump. Use `backend=api` or download dump via `paperscraper.get_dumps.arxiv"
78+ )
79+ return ARXIV_QUERIER (
80+ keywords = keywords , fields = fields , output_filepath = output_filepath
81+ )
82+
83+
+def get_arxiv_papers_api(
     query: str,
     fields: List = ["title", "authors", "date", "abstract", "journal", "doi"],
     max_results: int = 99999,
@@ -36,14 +94,14 @@ def get_arxiv_papers(
     fields as desired.
 
     Args:
-        query (str): Query to arxiv API. Needs to match the arxiv API notation.
-        fields (List[str]): List of strings with fields to keep in output.
-        max_results (int): Maximal number of results, defaults to 99999.
-        client_options (Dict): Optional arguments for `arxiv.Client`. E.g.:
+        query: Query to arxiv API. Needs to match the arxiv API notation.
+        fields: List of strings with fields to keep in output.
+        max_results: Maximal number of results, defaults to 99999.
+        client_options: Optional arguments for `arxiv.Client`. E.g.:
             page_size (int), delay_seconds (int), num_retries (int).
             NOTE: Decreasing 'num_retries' will speed up processing but might
             result in more frequent 'UnexpectedEmptyPageErrors'.
-        search_options (Dict): Optional arguments for `arxiv.Search`. E.g.:
+        search_options: Optional arguments for `arxiv.Search`. E.g.:
             id_list (List), sort_by, or sort_order.
 
     Returns:
@@ -75,27 +133,43 @@ def get_and_dump_arxiv_papers(
     fields: List = ["title", "authors", "date", "abstract", "journal", "doi"],
     start_date: str = "None",
     end_date: str = "None",
+    backend: Literal["api", "local", "infer"] = "api",
     *args,
     **kwargs,
 ):
81140 """
82141 Combines get_arxiv_papers and dump_papers.
83142
84143 Args:
85- keywords (List[str, List[str]]) : List of keywords to request arxiv API .
144+ keywords: List of keywords for arxiv search .
86145 The outer list level will be considered as AND separated keys, the
87146 inner level as OR separated.
88- filepath (str) : Path where the dump will be saved.
89- fields (List, optional) : List of strings with fields to keep in output.
147+ filepath: Path where the dump will be saved.
148+ fields: List of strings with fields to keep in output.
90149 Defaults to ['title', 'authors', 'date', 'abstract',
91150 'journal', 'doi'].
92- start_date (str) : Start date for the search. Needs to be in format:
151+ start_date: Start date for the search. Needs to be in format:
93152 YYYY/MM/DD, e.g. '2020/07/20'. Defaults to 'None', i.e. no specific
94153 dates are used.
95- end_date (str): End date for the search. Same notation as start_date.
154+ end_date: End date for the search. Same notation as start_date.
155+ backend: If `api`, the arXiv API is queried. If `local` the local arXiv dump
156+ is queried (has to be downloaded before). If `infer` the local dump will
157+ be used if exists, otherwise API will be queried. Defaults to `api`
158+ since it is faster.
        *args, **kwargs are additional arguments for `get_arxiv_papers`.
     """
     # Translate keywords into query.
     query = get_query_from_keywords(keywords, start_date=start_date, end_date=end_date)
-    papers = get_arxiv_papers(query, fields, *args, **kwargs)
+
+    if backend not in {"api", "local", "infer"}:
+        raise ValueError(
+            f"Invalid backend: {backend}. Must be one of ['api', 'local', 'infer']"
+        )
+    elif backend == "infer":
+        backend = infer_backend()
+
+    if backend == "api":
+        papers = get_arxiv_papers_api(query, fields, *args, **kwargs)
+    elif backend == "local":
+        papers = get_arxiv_papers_local(keywords, fields, *args, **kwargs)
     dump_papers(papers, output_filepath)
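A minimal usage sketch of the entry point this diff changes (not part of the patch): it assumes the function is importable as `paperscraper.arxiv.get_and_dump_arxiv_papers` as in the released package, and the keyword lists, output path, and dates below are illustrative placeholders.

    from paperscraper.arxiv import get_and_dump_arxiv_papers

    # Illustrative query: outer list entries are AND-combined,
    # inner lists are OR-combined, as described in the docstring above.
    keywords = [["covid-19", "sars-cov-2"], ["deep learning", "machine learning"]]

    # backend="api" queries the arXiv API, backend="local" uses a previously
    # downloaded dump, and backend="infer" picks the local dump when it exists.
    get_and_dump_arxiv_papers(
        keywords,
        output_filepath="covid19_ml.jsonl",
        start_date="2021/01/01",
        end_date="2021/06/30",
        backend="infer",
    )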