Skip to content

Commit 719f75c

Browse files
committed
update (referring to Wikidata Random item)
1 parent 9f90eb4 commit 719f75c

File tree

6 files changed

+57
-33
lines changed

6 files changed

+57
-33
lines changed

config.yaml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
database:
2-
name: 'wikidata_claims_refs_parsed.db'
2+
name: '/hpc/scratch/prj/inf_wqp/wikidata_claims_refs_parsed.db'
3+
result_db_for_API: '/hpc/scratch/prj/inf_wqp/reference_checked.db'
34

45
parsing:
56
reset_database: False # Developer mode: wipes the DB so something can be tested against a clean state

eventHandler.py

Lines changed: 30 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
import datetime
1010
import time
1111
import uuid
12+
import yaml
1213

1314
def save_to_sqlite(result_df, db_path, table_name):
1415
result_df = result_df.astype(str)
@@ -34,7 +35,7 @@ def save_to_sqlite(result_df, db_path, table_name):
3435
finally:
3536
conn.close()
3637

37-
def initialize_database(db_path='reference_checked.db'):
38+
def initialize_database(db_path):
3839
conn = sqlite3.connect(db_path)
3940
cursor = conn.cursor()
4041
cursor.execute("""
@@ -77,6 +78,7 @@ def initialize_database(db_path='reference_checked.db'):
7778
CREATE TABLE IF NOT EXISTS aggregated_results (
7879
id INTEGER PRIMARY KEY AUTOINCREMENT,
7980
triple TEXT,
81+
property_id TEXT,
8082
url TEXT,
8183
Results TEXT,
8284
qid TEXT,
@@ -114,11 +116,21 @@ def initialize_database(db_path='reference_checked.db'):
114116
def get_random_qids(num_qids=5, max_retries=3, delay=5):
115117
sparql = SPARQLWrapper("https://query.wikidata.org/sparql")
116118
sparql.setQuery("""
117-
SELECT ?item
118-
WHERE {
119-
?item wdt:P31 wd:Q5 . # Instance of human
120-
}
121-
LIMIT 1000 # Fetch more items than needed
119+
SELECT ?item {
120+
SERVICE bd:sample {
121+
?item wikibase:sitelinks [].
122+
bd:serviceParam bd:sample.limit "100".
123+
}
124+
MINUS {?item wdt:P31/wdt:P279* wd:Q4167836.}
125+
MINUS {?item wdt:P31/wdt:P279* wd:Q4167410.}
126+
MINUS {?item wdt:P31 wd:Q13406463.}
127+
MINUS {?item wdt:P31/wdt:P279* wd:Q11266439.}
128+
MINUS {?item wdt:P31 wd:Q17633526.}
129+
MINUS {?item wdt:P31 wd:Q13442814.}
130+
MINUS {?item wdt:P3083 [].}
131+
MINUS {?item wdt:P1566 [].}
132+
MINUS {?item wdt:P442 [].}
133+
}
122134
""")
123135
sparql.setReturnFormat(JSON)
124136

@@ -237,8 +249,15 @@ def prove_process(db_path, batch_qids, algo_version):
237249
if 'conn' in locals():
238250
conn.close()
239251

240-
def main(db_path, batch_qids, algo_version, Test_mode):
241-
reset_database = False # Developer mode to test, it initialize db for getting clean db
252+
def load_config(config_path: str):
253+
with open(config_path, 'r') as file:
254+
return yaml.safe_load(file)
255+
256+
257+
def main(batch_qids, algo_version):
258+
reset_database = True # Developer mode for testing: deletes the existing DB file so the run starts from a clean database
259+
config = load_config('config.yaml')
260+
db_path = config['database']['result_db_for_API']
242261
if reset_database and os.path.exists(db_path):
243262
os.remove(db_path)
244263
print(f"Database file {db_path} has been deleted.")
@@ -254,12 +273,8 @@ def main(db_path, batch_qids, algo_version, Test_mode):
254273
time.sleep(30)
255274

256275

257-
258-
259276
if __name__ == "__main__":
260-
db_path = 'reference_checked.db'
261-
batch_qids = 3
262-
algo_version = '1.0.2'
263-
Test_mode = True #using different temp .db to test code.
264-
main(db_path, batch_qids, algo_version, Test_mode)
277+
batch_qids = 5
278+
algo_version = '1.0.3'
279+
main(batch_qids, algo_version)
265280
# nohup python3 eventHandler.py > output.log 2>&1 &

functions.py

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,14 @@
11
import sqlite3
22
import pandas as pd
33
from datetime import datetime
4+
import yaml
5+
46
#Params.
5-
db_path = 'reference_checked.db'
7+
def load_config(config_path: str):
8+
with open(config_path, 'r') as file:
9+
return yaml.safe_load(file)
10+
config = load_config('config.yaml')
11+
db_path = config['database']['result_db_for_API']
612

713
#Table summary
814
def get_all_tables_and_schemas(db_path):
@@ -81,10 +87,10 @@ def comprehensive_results(target_id):
8187
first_item = response[0]
8288
if isinstance(first_item, dict):
8389
if 'error' in first_item:
84-
return {'health_value': 'processing error',
85-
'NOT ENOUGH INFO': 'processing error',
86-
'SUPPORTS': 'processing error',
87-
'REFUTES': 'processing error'
90+
return {'health_value': 'Not processed yet',
91+
'NOT ENOUGH INFO': 'Not processed yet',
92+
'SUPPORTS': 'Not processed yet',
93+
'REFUTES': 'Not processed yet'
8894
}
8995
elif 'status' in first_item and first_item['status'] == 'error':
9096
return {'health_value': 'processing error',

html_fetching.py

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -308,15 +308,17 @@ def fetch_and_update_html(self):
308308
chrome_options.add_argument("--disable-dev-shm-usage")
309309
chrome_options.add_argument("--disable-plugins")
310310
chrome_options.add_argument("--disable-pdf-viewer")
311-
service = Service('/usr/bin/chromedriver')
312-
driver = webdriver.Chrome(service=service, options=chrome_options)
311+
chrome_options.add_argument("--disable-extensions")
313312
chrome_options.add_experimental_option("prefs", {
314-
"download.default_directory": "/CodeArchive/payloads/",
313+
"download.default_directory": "/dev/null",
315314
"download.prompt_for_download": False,
316315
"download.directory_upgrade": True,
317316
"plugins.always_open_pdf_externally": False,
318-
"safebrowsing.enabled": True
317+
"plugins.plugins_list": [{"enabled": False, "name": "Chrome PDF Viewer"}],
318+
"download_restrictions": 3
319319
})
320+
service = Service('/usr/bin/chromedriver')
321+
driver = webdriver.Chrome(service=service, options=chrome_options)
320322
driver.set_page_load_timeout(10)
321323
for i, (url,) in enumerate(urls_to_fetch):
322324
if i > 0 and i % batch_size == 0:
@@ -813,5 +815,5 @@ def main(qids: List[str]):
813815

814816

815817
if __name__ == "__main__":
816-
qids =['Q4616']
818+
qids =['Q42']
817819
main(qids)

reference_checking.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -15,11 +15,11 @@
1515
import torch, gc
1616

1717
class ReferenceChecker:
18-
def __init__(self, db_name: str = 'wikidata_claims_refs_parsed.db', config_path: str = 'config.yaml'):
19-
self.db_name = db_name
18+
def __init__(self, config_path: str = 'config.yaml'):
19+
self.config = self.load_config(config_path)
20+
self.db_name = self.config['database']['name']
2021
self.conn = None
2122
self.cursor = None
22-
self.config = self.load_config(config_path)
2323
self.verb_module = VerbModule()
2424
nltk.download('punkt', quiet=True)
2525

@@ -376,8 +376,9 @@ def TableMaking(self, verbalised_claims_df_final, result):
376376
aResult = pd.concat([aResult, pd.DataFrame(row["evidence_TE_labels_all_TOP_N"], columns=['TextEntailment'])], axis=1)
377377
aResult = pd.concat([aResult, pd.DataFrame(np.max(row["evidence_TE_prob_all_TOP_N"], axis=1), columns=['Entailment_score'])], axis=1)
378378
aResult = aResult.reindex(columns=['sentence', 'TextEntailment', 'Entailment_score','Relevance_score'])
379-
aBox = pd.DataFrame({'triple': [row["triple"]], 'url': row['url'],'Results': [aResult]})
379+
aBox = pd.DataFrame({'triple': [row["triple"]], 'property_id' : row['property_id'], 'url': row['url'],'Results': [aResult]})
380380
all_result = pd.concat([all_result,aBox], axis=0)
381+
381382

382383
def dataframe_to_html(all_result):
383384
html = '<html><head><style>table {border-collapse: collapse; width: 100%;} th, td {border: 1px solid black; padding: 8px; text-align: left;} th {background-color: #f2f2f2;}</style></head><body>'
@@ -439,7 +440,6 @@ def main(qids: List[str]):
439440

440441
if __name__ == "__main__":
441442
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
442-
443-
qids =['Q4934']
443+
qids =['Q42']
444444
original_results, aggregated_results, reformedHTML_results = main(qids)
445445

wikidata_reader.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -360,5 +360,5 @@ def main(qids: List[str], reset: bool = False):
360360

361361
if __name__ == "__main__":
362362
nltk.download('punkt', quiet=True)
363-
qids =['Q4616']
363+
qids =['Q42']
364364
main(qids)

0 commit comments

Comments
 (0)