Skip to content

Commit f2ce2d3

Browse files
authored
Merge pull request #1 from Str3am786/main
New pipeline to work from pdf and other fixes
2 parents 4d81cea + 1fad69c commit f2ce2d3

94 files changed

Lines changed: 1891 additions & 249 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

Dockerfile

Lines changed: 0 additions & 19 deletions
This file was deleted.

README.md

Lines changed: 11 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ This tool verifies the link between a scientific paper and a software repository
99
There is also a "unidirectional" metric, which finds a repository URL and checks whether the paper is named within that repository.
1010

1111
## Dependencies
12-
- Python 3.9
12+
- Python 3.10
1313
- Java 8 or above (please see [Tika requirements](https://tika.apache.org))
1414

1515
## Installation
@@ -39,18 +39,23 @@ TODO
3939
### The repository is divided into the following directories:
4040

4141
1. Download_pdf
42-
2. Metadata_extraction
43-
3. Object_creator
44-
4. Modelling
45-
5. Prediction
42+
2. Metadata
43+
3. Extraction
44+
4. Object_creator
45+
5. Modelling
46+
6. Prediction
4647

4748
### Download_pdf
4849
Pertains to all the downloading of pdfs.
4950
Downloaded_obj is a representation of downloaded papers which have not been processed yet.
5051

51-
### Metadata_extraction
52+
### Metadata
53+
TODO
5254
Encompasses requests to OpenAlex for fetching the paper's metadata.
5355
MetadataObj contains the metadata from OpenAlex: doi, arxiv and its title.
56+
57+
### Extraction
58+
TODO
5459
Tika scripts to open a pdf and extract its urls are also found within this module.
5560
PaperObj is created once the downloadedObj's pdf has been processed to locate all its urls. Contains: doi, arxiv, title, file_path, urls.
5661
Finally, the necessary functions for downloading a repository and extracting its metadata with SOMEF

__main__.py

Lines changed: 0 additions & 7 deletions
This file was deleted.

metadata_extraction/api/openAlex_api_queries.py

Lines changed: 0 additions & 56 deletions
This file was deleted.

pyproject.toml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
[build-system]
2+
requires = [
3+
"setuptools>=42",
4+
"wheel"
5+
]
6+
build-backend = "setuptools.build_meta"

requirements.txt

Lines changed: 0 additions & 8 deletions
This file was deleted.

setup.cfg

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
[metadata]
2+
name = SSKG
3+
version = attr: SSKG.__version__
4+
author = Miguel Arroyo Márquez, Daniel Garijo
5+
author_email = daniel.garijo@upm.es
6+
description = TODO
7+
long_description = file: README.md
8+
long_description_content_type = text/markdown
9+
url = https://github.com/SoftwareUnderstanding/SSKG
10+
classifiers =
11+
Programming Language :: Python :: 3
12+
Operating System :: OS Independent
13+
14+
[options]
15+
package_dir =
16+
= src
17+
packages = find:
18+
include_package_data = True
19+
python_requires = >= 3.10.0
20+
install_requires =
21+
somef >= 0.9.4
22+
arxiv
23+
bibtexparser
24+
fuzzywuzzy
25+
jaro_winkler
26+
tika
27+
beautifulsoup4
28+
python-Levenshtein
29+
click
30+
pdftitle
31+
32+
[options.packages.find]
33+
where = src
34+
35+
[options.entry_points]
36+
console_scripts =
37+
sskg = SSKG.__main__:cli
File renamed without changes.

src/SSKG/__main__.py

Lines changed: 149 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,149 @@
1+
#TODO find appropriate names
2+
from . import __version__
3+
import click
4+
import os
5+
import logging
6+
# Shared click context settings: accept both -h and --help as help flags.
CONTEXT_SETTINGS = {"help_option_names": ["-h", "--help"]}
7+
# File extensions treated as list/JSON inputs.
# NOTE(review): not referenced in the visible part of this module -- the
# commands test for ".txt" / ".json" suffixes inline; confirm before removing.
VALID_EXTENSIONS = [".txt", ".json"]
8+
9+
10+
# Root command group. The docstring below doubles as the text click prints
# for the group's help, so it is kept verbatim; the explicit "\n" markers
# are part of that text.
@click.group(context_settings=CONTEXT_SETTINGS)
@click.version_option(__version__)
def cli():
    """
    ███████ ███████ ██ ██ ██████ \n
    ██ ██ ██ ██ ██ \n
    ███████ ███████ █████ ██ ███ \n
    ██ ██ ██ ██ ██ ██ \n
    ███████ ███████ ██ ██ ██████ \n

    Scientific Software Knowledge Graphs (SSKG)\n
    Find and assess Research Software within Research papers.\n

    Usage:\n
    1. (assess) Assess doi for unidirectionality or bidirectionality\n
    2. (download) Download PDF (paper) from a doi or list\n
    3. (process) Process downloaded pdf to find urls and abstract\n

    """
30+
31+
# #TODO
32+
# @cli.command()
33+
# def configure():
34+
# """This creates a ~/.soca/configure.ini file"""
35+
# #TODO defaults check
36+
# url = click.prompt("URL to database",default = "http://localhost:8086")
37+
# bucket = click.prompt("Bucket", default = "my-bucket")
38+
# org = click.prompt("Organisation",default = "org_name")
39+
# token = click.prompt("Token", default = "")
40+
# if len(token) == 0:
41+
# click.echo("No token given, please enter token or press enter")
42+
# token = click.prompt("Token", default = "")
43+
# try:
44+
# from soca.commands import create_config
45+
#
46+
# create_config.create_config(url,bucket,token,org)
47+
# click.secho(f"Success", fg="green")
48+
# except Exception as e:
49+
# click.secho(f"Error: "+str(e),fg="red")
50+
# exit(1)
51+
52+
@cli.command()
@click.option('--input', '-i', required=True, help="DOI or path to .txt list of DOIs", metavar='<name>')
@click.option('--output', '-o', default="output", show_default=True, help="Output csv file", metavar='<path>')
@click.option('--unidir', '-U', is_flag=True, default=False, help="Unidirectionality")
@click.option('--bidir', '-B', is_flag=True, default=False, help="Bidirectionality")
def assess(input, output, unidir, bidir):
    """Assess a DOI, or a list of DOIs, for unidirectionality or bidirectionality."""
    # Developer notes (kept out of the docstring, which click shows as help):
    #   input  -- a single DOI, a path to an existing .txt file of DOIs, or a
    #             path to an existing .json file of papers.
    #   output -- forwarded unchanged as ``output_dir`` to the pipeline.
    #   unidir / bidir -- which metric to run; unidir wins if both are set.
    #
    # Exactly one pipeline runs per invocation. Previously the .txt check was
    # an independent ``if``, so a .txt list was processed and then *also* fed
    # to the single-DOI pipeline as if the file path were a DOI.
    from .object_creator.pipeline import (
        dois_txt_to_bidir_json,
        dois_txt_to_unidir_json,
        papers_json_to_bidir_json,
        papers_json_to_unidir_json,
        single_doi_pipeline_bidir,
        single_doi_pipeline_unidir,
    )

    if not (unidir or bidir):
        print("Please select a directionality to measure")
        print("-U is to assess Uni-directionality")
        print("-B is to assess Bi-directionality")
        return

    # Only treat the argument as a file when it actually exists on disk;
    # anything else is handed to the single-DOI pipeline.
    input_exists = os.path.exists(input)
    is_txt_list = input.endswith(".txt") and input_exists
    is_papers_json = input.endswith(".json") and input_exists

    if unidir:
        if is_txt_list:
            dois_txt_to_unidir_json(dois_txt=input, output_dir=output)
        elif is_papers_json:
            papers_json_to_unidir_json(papers_json=input, output_dir=output)
        else:
            single_doi_pipeline_unidir(doi=input, output_dir=output)
    else:
        if is_txt_list:
            dois_txt_to_bidir_json(dois_txt=input, output_dir=output)
        elif is_papers_json:
            papers_json_to_bidir_json(papers_json=input, output_dir=output)
        else:
            single_doi_pipeline_bidir(doi=input, output_dir=output)
83+
84+
85+
86+
# ``download`` sub-command: fetch the paper(s) for a DOI or for a list of DOIs.
# A plain comment is used instead of a docstring on purpose: click would
# display a docstring as the command's help text.
@cli.command()
@click.option(
    '--input', '-i',
    required=True,
    help="DOI or path to .txt list of DOIs",
    metavar='<name>',
)
@click.option(
    '--output', '-o',
    default="./",
    show_default=True,
    help="Output Directory ",
    metavar='<path>',
)
def download(input, output):
    from .object_creator.create_downloadedObj import doi_to_downloadedJson, dois_txt_to_downloadedJson

    # An existing .txt file is treated as a list of DOIs ...
    is_doi_list = input.endswith(".txt") and os.path.exists(input)
    if is_doi_list:
        dois_txt_to_downloadedJson(dois_txt=input, output_dir=output)
        return

    # ... anything else is treated as a single DOI. Failures are reported on
    # stdout rather than aborting the command.
    try:
        doi_to_downloadedJson(doi=input, output_dir=output)
    except Exception as err:
        print(err)
99+
@cli.command()
@click.option('--input', '-i', required=True, help="DOI or path to .txt list of DOIs", metavar='<name>')
@click.option('--output', '-o', default="./", show_default=True, help="Output Directory ", metavar='<path>')
def process(input, output):
    """Process downloaded PDFs to find their urls and abstract."""
    # Developer notes (kept out of the docstring, which click shows as help):
    #   input  -- a directory of PDFs, a path to an existing .json file, or a
    #             path to an existing .pdf file.
    #   output -- forwarded unchanged to the processing helpers.
    # NOTE(review): the --input help text still describes DOIs, which does not
    # match what this command accepts; left untouched pending confirmation.
    #
    # The branches are mutually exclusive. Previously the .json check was an
    # independent ``if``, so a successfully processed .json input fell through
    # to the final ``else`` and printed "Error".
    from .object_creator.downloaded_to_paperObj import dwnlddJson_to_paperJson, dwnldd_obj_to_paper_json
    from .object_creator.create_downloadedObj import pdf_to_downloaded_obj

    if os.path.isdir(input):
        _aux_pdfs_to_pp_json(input=input, output=output)
    elif input.endswith(".json") and os.path.exists(input):
        dwnlddJson_to_paperJson(input, output)
    elif input.endswith(".pdf") and os.path.exists(input):
        # TODO
        dwnldd = pdf_to_downloaded_obj(pdf=input, output_dir=output)
        dwnldd_obj_to_paper_json(download_obj=dwnldd, output_dir=output)
    else:
        print("Error")
119+
120+
def _aux_pdfs_to_pp_json(input, output):
    """Process every ``.pdf`` file in a directory and dump the merged result as JSON.

    Each PDF is converted with ``pdf_to_downloaded_obj`` and then
    ``dwnldd_obj_to_paper_dic``; the resulting dictionaries are merged with
    ``dict.update`` and written to ``processed_metadata.json`` inside
    ``output``.

    Args:
        input: Directory whose entries are scanned for ``.pdf`` files.
        output: Directory passed to ``pdf_to_downloaded_obj`` as ``output_dir``
            and where ``processed_metadata.json`` is written.

    Returns:
        The path of the written JSON file, or ``None`` if listing the
        directory or writing the file failed (the error is logged and printed).
    """
    from .object_creator.create_downloadedObj import pdf_to_downloaded_obj
    from .object_creator.downloaded_to_paperObj import dwnldd_obj_to_paper_dic
    import json
    try:
        result = {}
        for pdfFile in os.listdir(input):
            print(pdfFile)
            # os.listdir yields bare names; resolve them against the input
            # directory, otherwise the file is only found when the current
            # working directory happens to be that directory.
            pdf_path = os.path.join(input, pdfFile)
            if not (os.path.isfile(pdf_path) and pdfFile.endswith(".pdf")):
                continue
            try:
                dwnldd = pdf_to_downloaded_obj(pdf=pdf_path, output_dir=output)
                pp_dic = dwnldd_obj_to_paper_dic(downloaded_obj=dwnldd)
                try:
                    result.update(pp_dic)
                except Exception as update_error:
                    logging.error("Error updating result with pp_dic: %s", update_error)
                    continue
                print(pp_dic)
                print(pdfFile)
            except Exception as file_error:
                # One bad PDF must not abort the whole batch.
                logging.error("Error processing file: %s", file_error)
                continue
        output_path = os.path.join(output, "processed_metadata.json")
        # ensure_ascii=False emits raw non-ASCII characters, so pin the file
        # encoding instead of relying on the platform default.
        with open(output_path, 'w', encoding="utf-8") as out_file:
            json.dump(result, out_file, sort_keys=True, indent=4,
                      ensure_ascii=False)
        return output_path
    except Exception as e:
        logging.error("an error occurred: %s", e)
        print(str(e))

0 commit comments

Comments
 (0)