Skip to content

Commit 4d81cea

Browse files
committed
last commit Miguel
1 parent a81e667 commit 4d81cea

23 files changed

Lines changed: 460 additions & 30 deletions

__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
__version__ = "0.0.1"

__main__.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
"""Script entry point: run the bidirectional and unidirectional DOI pipelines."""
# Import only the two entry points used below instead of a wildcard import,
# so the names this script depends on are explicit.
from object_creator.pipeline import dois_txt_to_bidir_json, dois_txt_to_unidir_json

output_directory = "/Users/pingamax2/Documents"  # output path
dois_txt = "./test/dois.txt"  # path to txt of dois

if __name__ == '__main__':
    # Both pipelines read the same DOI list and use the same output directory.
    dois_txt_to_bidir_json(dois_txt=dois_txt, output_dir=output_directory)
    dois_txt_to_unidir_json(dois_txt=dois_txt, output_dir=output_directory)

download_pdf/download_pipeline.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ def _is_arxiv(doi):
2222
else:
2323
return False
2424

25+
2526
def pdf_download_pipeline(doi, output_directory):
2627
"""
2728
Input

metadata_extraction/github_extractor_tika.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,26 @@ def read_pdf(pdf_path):
1515
except Exception as e:
1616
return []
1717

18+
def find_abstract_index(pdf_data):
    """
    Locate the first line that mentions the word "abstract".

    :param pdf_data: sequence of text lines (strings)
    :returns: zero-based index of the first line containing "abstract"
        (case-insensitive), or None when no line matches
    """
    # enumerate replaces the hand-maintained counter; the former
    # ``index < len(pdf_data)`` guard was always true and is dropped.
    for index, line in enumerate(pdf_data):
        if "abstract" in line.lower():
            return index
    return None
25+
def get_possible_abstract(pdf_data):
    """
    Return the text that most likely contains the paper's abstract.

    The abstract is approximated as the 50 lines starting at the first line
    that mentions "abstract", concatenated without any separator.

    :param pdf_data: sequence of text lines (strings)
    :returns: the joined lines, or None when no line mentions "abstract" or
        an error occurs while building the text
    """
    try:
        index = find_abstract_index(pdf_data)
        # Compare against None explicitly: index 0 is a valid match (the
        # word appears on the very first line) and must not be discarded.
        if index is not None:
            return ''.join(pdf_data[index:index + 50])
    except Exception as e:
        # Best-effort extraction: report the problem and fall through to None.
        print(e)
    return None
32+
33+
def find_github_in_abstract(pdf_data):
    """
    Search the probable abstract of a paper for GitHub URLs.

    :param pdf_data: sequence of text lines (strings)
    :returns: the result of look_for_github_urls on the abstract text, or
        None when no abstract text could be obtained
    """
    possible_abstract = get_possible_abstract(pdf_data)
    if not possible_abstract:
        return None
    return look_for_github_urls(possible_abstract)
37+
1838
#regular expression to get all the urls, returned as a list
1939
def get_git_urls(text):
2040
"""

metadata_extraction/paper_obj.py

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,13 +3,14 @@
33
from utils.regex import str_to_doiID, str_to_arxivID
44

55
class PaperObj:
6-
def __init__(self, title, urls, doi, arxiv, abstract, file_name, file_path):
    """
    Store the fields describing one paper.

    :param title: title of the paper
    :param urls: URLs associated with the paper
    :param doi: DOI string; stored after being passed through str_to_doiID
    :param arxiv: arXiv string; stored after being passed through str_to_arxivID
    :param abstract: abstract text of the paper
    :param file_name: name of the paper's file
    :param file_path: path of the paper's file
    """
    self._title = title
    self._urls = urls
    # Identifiers are converted by the regex helpers before being stored.
    self._doi = str_to_doiID(doi)
    self._arxiv = str_to_arxivID(arxiv)
    self._file_name = file_name
    self._file_path = file_path
    self._abstract = abstract
1314

1415
@property
1516
def title(self):
@@ -27,6 +28,14 @@ def urls(self):
2728
def urls(self, value):
2829
self._urls = value
2930

31+
@property
def abstract(self):
    """Abstract text stored for this paper."""
    return self._abstract

# The setter has to be registered on the ``abstract`` property itself.
# Decorating it with ``urls.setter`` rebinds the name ``abstract`` to a copy
# of the ``urls`` property, so reading ``abstract`` would return the URLs.
@abstract.setter
def abstract(self, value):
    self._abstract = value
38+
3039
@property
3140
def doi(self):
3241
return self._doi
@@ -62,9 +71,10 @@ def file_path(self, value):
6271
def to_dict(self):
    """
    Serialise the paper into a plain dictionary.

    :returns: dictionary with the keys 'title', 'urls', 'doi', 'arxiv',
        'abstract', 'file_name' and 'file_path'
    """
    return {
        'title': self._title,
        # 'urls' carries the URL list; the abstract has its own key below.
        'urls': self._urls,
        'doi': self._doi,
        'arxiv': self.arxiv,
        # Read the backing field directly so the value does not depend on
        # how the ``abstract`` property is wired.
        'abstract': self._abstract,
        'file_name': self._file_name,
        'file_path': self._file_path
    }

modelling/unidirectionality.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ def find_substring(substring, larger_string):
3131
return
3232

3333
def _iterate_results(results, string_2_find):
34-
if not results:
34+
if (not results) or (not string_2_find):
3535
return False
3636
for result in results:
3737
value = safe_dic(safe_dic(result,"result"),'value')
@@ -51,6 +51,14 @@ def is_repo_unidir(paperObj, repo_json):
5151
#Repo title is close to the repo full title
5252
results = safe_dic(repo_data,'full_title')
5353
unidir = _iterate_results(results, paperObj.title)
54+
if not unidir:
55+
#Check the repo name against the paper abstract
56+
results = safe_dic(repo_data,'name')
57+
unidir = _iterate_results(results, paperObj.abstract)
58+
if not unidir:
59+
#Check the repo full title against the paper abstract
60+
results = safe_dic(repo_data,'full_title')
61+
unidir = _iterate_results(results, paperObj.abstract)
5462
# See if paper title is within the description
5563
if not unidir:
5664
results = safe_dic(repo_data,'description')

object_creator/create_downloadedObj.py

Lines changed: 51 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
from download_pdf.downloaded_obj import DownloadedObj
33
import os
44
import json
5-
from .doi_to_metadata import metaDict_to_metaObj
5+
from .doi_to_metadata import metaDict_to_metaObj, doi_to_metadataObj
66

77

88
def meta_to_dwnldd(metadataObj, output_dir):
@@ -20,8 +20,9 @@ def meta_to_dwnldd(metadataObj, output_dir):
2020
file_path = pdf_download_pipeline(doi=metadataObj.doi,output_directory=output_dir)
2121
file_name = os.path.basename(file_path)
2222
return DownloadedObj(title=metadataObj.title,doi=metadataObj.doi,arxiv=metadataObj.arxiv,file_name=file_name,file_path=file_path)
23-
except:
23+
except Exception as e:
2424
print("Error while creating the downloaded object")
25+
print(str(e))
2526
return None
2627

2728

@@ -98,7 +99,55 @@ def metaJson_to_downloadedJson(meta_json, output_dir):
9899
ensure_ascii=False)
99100
return output_path
100101

102+
def doi_to_downloadedObj(doi,output_dir):
    """
    Resolve a DOI to its metadata object and turn that into a downloaded object.

    :param doi: DOI to resolve
    :param output_dir: directory handed to meta_to_dwnldd
    :returns: whatever meta_to_dwnldd returns for the resolved metadata
    """
    metadata_obj = doi_to_metadataObj(doi)
    downloaded_obj = meta_to_dwnldd(metadata_obj, output_dir)
    return downloaded_obj
105+
106+
def doi_to_downloadedDic(doi,output_dir):
    """
    Build the downloaded-object dictionary for a single DOI.

    :param doi: DOI to process
    :param output_dir: directory handed to doi_to_downloadedObj
    :returns: result of downloaded_dictionary for the downloaded object
    """
    downloaded_obj = doi_to_downloadedObj(doi, output_dir)
    return downloaded_dictionary(downloaded_obj)
108+
109+
110+
def dois_to_downloadedDics(dois_list, output_dir):
    """
    Merge the downloaded-object dictionaries of several DOIs into one.

    :param dois_list: iterable of DOIs
    :param output_dir: directory handed to doi_to_downloadedDic
    :returns: single dictionary updated with each DOI's dictionary in order
    """
    merged = {}
    for current_doi in dois_list:
        entry = doi_to_downloadedDic(current_doi, output_dir)
        merged.update(entry)
    return merged
115+
def dois_txt_to_downloadedDics(dois_txt,output_dir):
    """
    Read DOIs from a text file (one per line) and build their merged dictionary.

    :param dois_txt: path to the text file of DOIs
    :param output_dir: directory handed to dois_to_downloadedDics
    :returns: result of dois_to_downloadedDics for the DOIs read; when the
        file cannot be read, the result for an empty DOI list
    """
    # Default to an empty list so a failed read no longer leaves ``dois``
    # unbound (previously a NameError right after the error message).
    dois = []
    try:
        with open(dois_txt, 'r') as file:
            dois = file.read().splitlines()
    except (OSError, UnicodeDecodeError) as e:
        print("Error while opening the txt")
        print(str(e))
    return dois_to_downloadedDics(dois,output_dir)
122+
123+
def _write_downloaded_json(downloaded, output_dir):
    """Dump ``downloaded`` to ``<output_dir>/downloaded_metadata.json`` and return that path."""
    output_path = output_dir + "/" + "downloaded_metadata.json"
    # NOTE(review): the file is opened with the platform default encoding while
    # ensure_ascii=False emits raw non-ASCII text; consider an explicit encoding
    # once the code that reads this file back is confirmed to match.
    with open(output_path, 'w+') as out_file:
        json.dump(downloaded, out_file, sort_keys=True, indent=4,
                  ensure_ascii=False)
    return output_path


def doi_to_downloadedJson(doi,output_dir):
    """
    Write the downloaded-object dictionary of one DOI to JSON.

    :param doi: DOI to process
    :param output_dir: directory used for processing and for the JSON file
    :returns: path to the written JSON file
    """
    # Local renamed from ``dict`` so the builtin is no longer shadowed.
    downloaded = doi_to_downloadedDic(doi, output_dir)
    return _write_downloaded_json(downloaded, output_dir)


def dois_to_downloadedJson(dois,output_dir):
    """
    Write the merged downloaded-object dictionary of several DOIs to JSON.

    :param dois: iterable of DOIs
    :param output_dir: directory used for processing and for the JSON file
    :returns: path to the written JSON file
    """
    downloaded = dois_to_downloadedDics(dois, output_dir)
    return _write_downloaded_json(downloaded, output_dir)


def dois_txt_to_downloadedJson(dois_txt,output_dir):
    """
    Write the merged downloaded-object dictionary of the DOIs listed in a text file to JSON.

    :param dois_txt: path to a text file of DOIs
    :param output_dir: directory used for processing and for the JSON file
    :returns: path to the written JSON file
    """
    downloaded = dois_txt_to_downloadedDics(dois_txt, output_dir)
    return _write_downloaded_json(downloaded, output_dir)
101144

145+
def download_from_doi(doi,output_dir):
    """
    Convenience alias for doi_to_downloadedJson.

    :param doi: DOI to process
    :param output_dir: directory forwarded unchanged
    :returns: value returned by doi_to_downloadedJson
    """
    json_path = doi_to_downloadedJson(doi, output_dir)
    return json_path
147+
def download_from_doi_list(dois,output_dir):
    """
    Convenience alias for dois_to_downloadedJson.

    :param dois: iterable of DOIs
    :param output_dir: directory forwarded unchanged
    :returns: value returned by dois_to_downloadedJson
    """
    json_path = dois_to_downloadedJson(dois, output_dir)
    return json_path
149+
def download_from_doi_txt(dois_txt,output_dir):
    """
    Convenience alias for dois_txt_to_downloadedJson.

    :param dois_txt: path to a text file of DOIs
    :param output_dir: directory forwarded unchanged
    :returns: value returned by dois_txt_to_downloadedJson
    """
    # Delegate to the txt-based variant: the list-based function would iterate
    # over the characters of the path string instead of the DOIs in the file.
    return dois_txt_to_downloadedJson(dois_txt, output_dir)
102151
def safe_dic(dic, key):
103152
try:
104153
return dic[key]

object_creator/downloaded_to_paperObj.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
from metadata_extraction.github_extractor_tika import ranked_git_url, read_pdf
1+
from metadata_extraction.github_extractor_tika import ranked_git_url, read_pdf, get_possible_abstract
22
from metadata_extraction.paper_obj import PaperObj
33
from object_creator.create_downloadedObj import downloadedDic_to_downloadedObj
44
import json
@@ -17,12 +17,13 @@ def downloaded_to_paperObj(downloadedObj):
1717
try:
1818
pdf_data = read_pdf(downloadedObj.file_path)
1919
urls = ranked_git_url(pdf_data)
20+
abstract = get_possible_abstract(pdf_data)
2021
title = downloadedObj.title
2122
doi = downloadedObj.doi
2223
arxiv = downloadedObj.arxiv
2324
file_name = downloadedObj.file_name
2425
file_path = downloadedObj.file_path
25-
return PaperObj(title, urls, doi, arxiv, file_name, file_path)
26+
return PaperObj(title, urls, doi, arxiv, abstract, file_name, file_path)
2627
except Exception as e:
2728
print(str(e))
2829
print("Error while trying to read from the pdf")

object_creator/paper_obj_utils.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,8 @@ def paperDict_to_paperObj(paper_dict):
77
file_name = safe_dic(paper_dict,"file_name")
88
file_path = safe_dic(paper_dict,"file_path")
99
urls = safe_dic(paper_dict,"urls")
10-
return PaperObj(title=title, urls=urls, doi=doi, arxiv=arxiv, file_name=file_name, file_path=file_path)
10+
abstract = safe_dic(paper_dict,"abstract")
11+
return PaperObj(title=title, urls=urls, doi=doi, arxiv=arxiv, file_name=file_name, file_path=file_path, abstract=abstract)
1112

1213

1314
def safe_dic(dic, key):

object_creator/pipeline.py

Lines changed: 53 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,16 @@ def pipeline_single_bidir(doi,output_dir):
2828
paper = doi_to_paper(doi,output_dir)
2929
result = check_bidir(paper,output_dir)
3030
return result
31+
def pipeline_single_unidir(doi,output_dir):
    '''
    Run the unidirectional check for a single DOI.

    @Param doi: doi
    @Param output_dir: where the pdf will be downloaded to
    :returns
        dictionary with doi and the urls found that are unidirectional for that doi
    '''
    paper_obj = doi_to_paper(doi, output_dir)
    return check_unidir(paper_obj, output_dir)
3141

3242
def pipeline_multiple_bidir(list_dois, output_dir):
3343
'''
@@ -48,6 +58,25 @@ def pipeline_multiple_bidir(list_dois, output_dir):
4858
except Exception as e:
4959
print(str(e))
5060
return None
61+
def pipeline_multiple_unidir(list_dois, output_dir):
    '''
    Run the unidirectional check for every DOI in a list.

    @Param list_dois: list of dois
    @Param output_dir: where the pdf will be downloaded to
    :returns
        dictionary with dois and the urls found that are unidirectional for that doi,
        or None if an exception is raised while processing
    '''
    collected = {}
    try:
        for current_doi in list_dois:
            paper_obj = doi_to_paper(current_doi, output_dir)
            # DOIs for which no paper object was obtained are skipped.
            if paper_obj:
                found = check_unidir(paper_obj, output_dir)
                if found:
                    collected.update(found)
    except Exception as err:
        # Any failure discards what was collected so far and yields None.
        print(str(err))
        return None
    return collected
5180

5281
def pipeline_txt_dois_bidir(dois_txt, output_dir):
5382
'''
@@ -63,6 +92,20 @@ def pipeline_txt_dois_bidir(dois_txt, output_dir):
6392
print("Error while opening the txt")
6493
return pipeline_multiple_bidir(dois,output_dir)
6594

95+
def pipeline_txt_dois_unidir(dois_txt, output_dir):
    '''
    Run the unidirectional check for the DOIs listed in a text file.

    @Param dois_txt: path to a txt with one doi per line
    @Param output_dir: where the pdf will be downloaded to
    :returns
        dictionary with dois and the urls found that are unidirectional for that doi;
        when the txt cannot be read, the result for an empty list of dois
    '''
    # Default to an empty list so a failed read no longer leaves ``dois``
    # unbound (previously a NameError right after the error message).
    dois = []
    try:
        with open(dois_txt, 'r') as file:
            dois = file.read().splitlines()
    except (OSError, UnicodeDecodeError) as e:
        print("Error while opening the txt")
        print(str(e))
    return pipeline_multiple_unidir(dois,output_dir)
108+
66109
def from_papers_json_to_bidir(papers_json, output_dir):
67110
'''
68111
@Param papers_json: json of papers, Key: DOI, V: paperObj (as a dictionary)
@@ -112,6 +155,15 @@ def dois_txt_to_bidir_json(dois_txt, output_dir):
112155
'''
113156
output_path = os.path.join(output_dir,"bidir.json")
114157
return dict_to_json(pipeline_txt_dois_bidir(dois_txt,output_dir),output_path)
158+
def dois_txt_to_unidir_json(dois_txt, output_dir):
    '''
    Run the unidirectional pipeline for a txt of dois and hand the result to dict_to_json.

    @Param dois_txt: path to a txt with one doi per line
    @Param output_dir: where the pdf will be downloaded to
    :returns
        path to output JSON
    '''
    json_path = os.path.join(output_dir, "unidir_20_07.json")
    unidir_results = pipeline_txt_dois_unidir(dois_txt, output_dir)
    return dict_to_json(unidir_results, json_path)
115167

116168

117169
def from_papers_json_to_unidir(papers_json, output_dir):
@@ -133,7 +185,7 @@ def from_papers_json_to_unidir(papers_json, output_dir):
133185
unidir = check_unidir(paper, output_dir)
134186
if unidir:
135187
result.update(unidir)
136-
return dict_to_json(result,output_path=os.path.join(output_dir,"unidir.json"))
188+
return dict_to_json(result,output_path=os.path.join(output_dir,"unidir_20_07.json"))
137189

138190

139191

0 commit comments

Comments
 (0)