1717from urllib .request import Request , urlopen
1818
1919GOOGLE_SCHOLAR_URL = "https://scholar.google.com"
20- HEADERS = {' User-Agent' : ' Mozilla/5.0' }
20+ HEADERS = {" User-Agent" : " Mozilla/5.0" }
2121
2222FORMAT_BIBTEX = 4
2323FORMAT_ENDNOTE = 3
2929
3030
3131def query (
32- searchstr : str ,
33- outformat : int = FORMAT_BIBTEX ,
34- allresults : bool = False
32+ searchstr : str , outformat : int = FORMAT_BIBTEX , allresults : bool = False
3533) -> list [str ]:
3634 """Query google scholar.
3735
@@ -53,17 +51,17 @@ def query(
5351
5452 """
5553 logger .debug (f"Query: { searchstr } " )
56- searchstr = ' /scholar?q=' + quote (searchstr )
54+ searchstr = " /scholar?q=" + quote (searchstr )
5755 url = GOOGLE_SCHOLAR_URL + searchstr
5856 header = HEADERS
59- header [' Cookie' ] = f"GSP=CF={ outformat } "
57+ header [" Cookie" ] = f"GSP=CF={ outformat } "
6058 request = Request (url , headers = header )
6159 response = urlopen (request )
6260 # add set_cookie in header in request header!
63- set_cookie = response .headers [' Set-Cookie' ]
64- header [' Cookie' ] += set_cookie
61+ set_cookie = response .headers [" Set-Cookie" ]
62+ header [" Cookie" ] += set_cookie
6563 html = response .read ()
66- html = html .decode (' utf8' )
64+ html = html .decode (" utf8" )
6765 # grab the links
6866 tmp = get_links (html , outformat )
6967
@@ -72,11 +70,11 @@ def query(
7270 if not allresults :
7371 tmp = tmp [:1 ]
7472 for link in tmp :
75- url = GOOGLE_SCHOLAR_URL + link
73+ url = GOOGLE_SCHOLAR_URL + link
7674 request = Request (url , headers = header )
7775 response = urlopen (request )
7876 bib = response .read ()
79- bib = bib .decode (' utf8' )
77+ bib = bib .decode (" utf8" )
8078 result .append (bib )
8179 return result
8280
@@ -96,24 +94,25 @@ def get_links(html: str, outformat: int) -> list[str]:
9694 the links to the references
9795
9896 """
99- base_url = ' https://scholar.googleusercontent.com'
97+ base_url = " https://scholar.googleusercontent.com"
10098 if outformat == FORMAT_BIBTEX :
101- refre = re .compile (fr '<a href="{ base_url } (/scholar\.bib\?[^"]*)' )
99+ refre = re .compile (rf '<a href="{ base_url } (/scholar\.bib\?[^"]*)' )
102100 elif outformat == FORMAT_ENDNOTE :
103- refre = re .compile (fr '<a href="{ base_url } (/scholar\.enw\?[^"]*)"' )
101+ refre = re .compile (rf '<a href="{ base_url } (/scholar\.enw\?[^"]*)"' )
104102 elif outformat == FORMAT_REFMAN :
105- refre = re .compile (fr '<a href="{ base_url } (/scholar\.ris\?[^"]*)"' )
103+ refre = re .compile (rf '<a href="{ base_url } (/scholar\.ris\?[^"]*)"' )
106104 elif outformat == FORMAT_WENXIANWANG :
107- refre = re .compile (fr '<a href="{ base_url } (/scholar\.ral\?[^"]*)"' )
105+ refre = re .compile (rf '<a href="{ base_url } (/scholar\.ral\?[^"]*)"' )
108106 reflist = refre .findall (html )
109107 # escape html entities
110108 reflist = [
111109 re .sub (
112- ' &({});' .format ('|' .join (name2codepoint )),
110+ " &({});" .format ("|" .join (name2codepoint )),
113111 lambda m : chr (name2codepoint [m .group (1 )]), # type: ignore[index]
114- s
112+ s ,
115113 )
116- for s in reflist ]
114+ for s in reflist
115+ ]
117116 return reflist
118117
119118
@@ -136,20 +135,19 @@ def convert_pdf_to_txt(pdf: str, startpage: int | None = None) -> str:
136135
137136 """
138137 if startpage is not None :
139- startpageargs = ['-f' , str (startpage )]
138+ startpageargs = ["-f" , str (startpage )]
140139 else :
141140 startpageargs = []
142- stdout = subprocess .Popen (["pdftotext" , "-q" ] + startpageargs + [pdf , "-" ],
143- stdout = subprocess .PIPE ).communicate ()[0 ]
141+ stdout = subprocess .Popen (
142+ ["pdftotext" , "-q" ] + startpageargs + [pdf , "-" ],
143+ stdout = subprocess .PIPE ,
144+ ).communicate ()[0 ]
144145
145146 return stdout .decode ()
146147
147148
148149def pdflookup (
149- pdf : str ,
150- allresults : bool ,
151- outformat : int ,
152- startpage : int | None = None
150+ pdf : str , allresults : bool , outformat : int , startpage : int | None = None
153151) -> list [str ]:
154152 """Look a pdf up on google scholar and return bibtex items.
155153
@@ -197,9 +195,9 @@ def _get_bib_element(bibitem: str, element: str) -> str | None:
197195 if i .startswith (element ):
198196 value = i .split ("=" , 1 )[- 1 ]
199197 value = value .strip ()
200- while value .endswith (',' ):
198+ while value .endswith ("," ):
201199 value = value [:- 1 ]
202- while value .startswith ('{' ) or value .startswith ('"' ):
200+ while value .startswith ("{" ) or value .startswith ('"' ):
203201 value = value [1 :- 1 ]
204202 return value
205203 return None
@@ -215,5 +213,5 @@ def rename_file(pdf: str, bibitem: str) -> None:
215213 elem = [i for i in (year , author , title ) if i ]
216214 filename = "-" .join (elem ) + ".pdf"
217215 newfile = pdf .replace (os .path .basename (pdf ), filename )
218- logger .info (f' Renaming { pdf } to { newfile } ' )
216+ logger .info (f" Renaming { pdf } to { newfile } " )
219217 os .rename (pdf , newfile )
0 commit comments