
Commit 291ee05

Merge pull request #104 from ARGOeu/devel
Version 1.0.4
2 parents d0a98f4 + 726062e commit 291ee05

36 files changed, +1571 −100 lines changed

environment.yml

Lines changed: 29 additions & 29 deletions
@@ -2,53 +2,53 @@ name: rsmetrics
 channels:
   - defaults
 dependencies:
-  - _libgcc_mutex=0.1=main
-  - _openmp_mutex=4.5=1_gnu
-  - ca-certificates=2022.3.18=h06a4308_0
-  - certifi=2021.10.8=py39h06a4308_2
-  - ld_impl_linux-64=2.35.1=h7274673_9
-  - libffi=3.3=he6710b0_2
-  - libgcc-ng=9.3.0=h5101ec6_17
-  - libgomp=9.3.0=h5101ec6_17
-  - libstdcxx-ng=9.3.0=hd4cf53a_17
-  - ncurses=6.3=h7f8727e_2
-  - openssl=1.1.1n=h7f8727e_0
-  - pip=21.2.4=py39h06a4308_0
-  - python=3.9.11=h12debd9_2
-  - readline=8.1.2=h7f8727e_1
-  - setuptools=58.0.4=py39h06a4308_0
-  - sqlite=3.38.0=hc218d9a_0
-  - tk=8.6.11=h1ccaba5_0
-  - tzdata=2021e=hda174b7_0
-  - wheel=0.37.1=pyhd3eb1b0_0
-  - xz=5.2.5=h7b6447c_0
-  - zlib=1.2.11=h7f8727e_4
+  - _libgcc_mutex=0.1
+  - _openmp_mutex=4.5
+  - ca-certificates=2022.3.18
+  - ld_impl_linux-64=2.35.1
+  - libffi=3.3
+  - libgcc-ng=9.3.0
+  - libgomp=9.3.0
+  - libstdcxx-ng=9.3.0
+  - ncurses=6.3
+  - openssl=1.1.1n
+  - pip=21.2.4
+  - python=3.9.11
+  - readline=8.1.2
+  - setuptools=58.0.4
+  - sqlite=3.38.0
+  - tk=8.6.11
+  - tzdata=2021e
+  - wheel=0.37.1
+  - xz=5.2.5
+  - zlib=1.2.11
   - pip:
     - beautifulsoup4==4.10.0
-    - certifi==2021.10.8
+    - certifi==2022.12.7
     - charset-normalizer==2.0.12
     - click==8.1.3
-    - Flask==2.1.2
+    - flask==2.1.2
+    - flask-pymongo==2.3.0
     - idna==3.3
     - importlib-metadata==4.11.4
     - itsdangerous==2.1.2
-    - Jinja2==3.1.2
+    - jinja2==3.1.2
     - joblib==1.2.0
-    - MarkupSafe==2.1.1
+    - markupsafe==2.1.1
     - natsort==8.1.0
     - numpy==1.22.3
     - pandas==1.4.2
+    - pyarrow==10.0.1
     - pymongo==4.1.0
+    - pymongoarrow==0.6.2
     - python-dateutil==2.8.2
     - python-dotenv==0.20.0
     - pytz==2022.1
-    - PyYAML==6.0
+    - pyyaml==6.0
     - requests==2.27.1
     - scipy==1.8.0
     - six==1.16.0
     - soupsieve==2.3.2
     - urllib3==1.26.9
-    - Werkzeug==2.1.2
+    - werkzeug==2.1.2
     - zipp==3.8.0
-    - flask-pymongo==2.3.0
-    - pymongoarrow==0.6.2

metric_descriptions/accuracy.yml

Lines changed: 42 additions & 0 deletions
@@ -0,0 +1,42 @@
+name: Accuracy
+
+summary: >
+  Measures the recommendations' accuracy based on the users' access to the services. A value of 1 indicates that the RS model got every prediction right, and a value of 0 indicates that it did not make a single correct prediction.
+
+description: >
+  The accuracy (\(A\)) of the recommendations is based on the users' access to the services. A value of 1 indicates that the RS model got every prediction right, and a value of 0 indicates that it did not make a single correct prediction. In general, accuracy is defined as:
+  $$A=\frac{Number\;of\;correct\;predictions}{Total\;number\;of\;predictions}$$ In RS Metrics the computation is determined by the following formula:
+  $$Accuracy=\frac{Number\;of\;correctly\;recommended\;services}{Total\;number\;of\;services}$$ where a recommendation counts as correct when the service is both accessed by the user and recommended by the RS.
+
+output:
+  type: float
+  min: 0
+  max: 1
+  comment: A value of 1 indicates that the RS model got every prediction right, and a value of 0 indicates that it did not make a single correct prediction.
+
+prerequisites:
+  - recommendations without anonymous users
+  - all available users (with their accessed services)
+  - all available services
+
+process:
+  - step: Clean up
+    details: >
+      Recommendations clean-up; removal of entries whose users or services are not found in "users" or "services" respectively.
+  - step: Vector creation of the accessed services
+    details: >
+      For each user, create a vector the size of the number of services and assign each service a binary value: 1 if it is found in the user's accessed services, 0 otherwise.
+  - step: Vector creation of the recommended services
+    details: >
+      For each user, create a vector the size of the number of services and assign each service a binary value: 1 if it is recommended to the user, 0 otherwise.
+  - step: Accuracy score computation
+    details: >
+      For each user, compute the average value of the agreement vector; a vector which is True where the accessed and recommended vectors hold the same value for a service and False otherwise.
+  - step: Mean value of the accuracy score
+    details: >
+      Compute the overall value as the mean of each user's accuracy score.
+
+# This is optional for visual stylization of the metric when displayed on the report
+style:
+  icon: pe-7s-arc
+  color: bg-night-sky
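
Note: the process above can be checked by hand on a small example. The following sketch is not part of the commit; it mirrors the binary-vector computation described in the steps, with a made-up service catalogue and made-up user data.

import numpy as np

# hypothetical toy data, for illustration only
services = ["s1", "s2", "s3", "s4"]
accessed = {"u1": ["s1", "s3"], "u2": ["s2"]}
recommended = {"u1": ["s1", "s2"], "u2": ["s2", "s4"]}

def user_accuracy(user):
    # steps 2 and 3: binary vectors over the whole catalogue
    acc = np.array([1 if s in accessed[user] else 0 for s in services])
    rec = np.array([1 if s in recommended[user] else 0 for s in services])
    # step 4: fraction of services on which the two vectors agree
    return np.average(acc == rec)

# step 5: the overall metric is the mean of the per-user scores
print(round(np.mean([user_accuracy(u) for u in accessed]), 4))
# u1 agrees on s1 and s4 (0.5), u2 agrees on s1, s2 and s3 (0.75), so the mean is 0.625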

metrics.py

Lines changed: 114 additions & 12 deletions
@@ -609,25 +609,44 @@ def top5_services_ordered(object, k=5, base='https://marketplace.eosc-portal.eu'

     return topk_services

-@statistic('A dictionary of the number of recommendations per day')
-def recommendations_per_day(object):
+@statistic('A dictionary of the number of recommended items per day')
+def recommended_items_per_day(object):
     """
-    It returns a statistical report in dictionary format. Specifically, the key
-    is set for each particular day found and its value contains the respective
-    number of recommendations committed. The dictionary includes all in-between
-    days (obviously, with the count set to zero). Recommendations are already
-    filtered by those where the user or service does not exist in users' or services' catalogs.
+    It returns a timeseries of recommended item counts per day. Each timeseries item has two fields: date and value
     """
     # count recommendations for each day found in entries
     res=object.recommendations.groupby(by=object.recommendations['Timestamp'].dt.date).count().iloc[:,0]

-    # fill the in between days with zero recommendations
-    res=res.asfreq('D', fill_value=0)
+    # create a Series with the period's start and end times and a value of 0
+    init=pd.Series([0,0],index=[pd.to_datetime(start(object)).date(), pd.to_datetime(end(object)).date()])
+
+    # remove duplicate entries for corner cases where start and end time match
+    init.drop_duplicates(keep='first', inplace=True)
+
+    # append the two indexes and values (i.e. 0) above to the Series;
+    # with axis=1, identical indexes are merged;
+    # since a dataframe is created, get the first column
+    res=pd.concat([res,init],ignore_index=False, axis=1).iloc[:, 0]

+    # convert NaN values created by the concatenation to 0
+    # and change the data type back to int
+    res=res.fillna(0).astype(int)
+
+    # fill the in between days with zero recommendations
+    res=res.asfreq('D', fill_value=0)
+
     # convert datetimeindex to string
     res.index=res.index.format()

-    return res.to_dict()
+    # convert the series to a dataframe with an extra column holding the dates
+    res = res.to_frame().reset_index()
+
+    # rename columns to date, value
+    res.rename(columns={ res.columns[0]: "date", res.columns[1]: "value" }, inplace = True)
+
+    # return a list of objects with date and value fields
+    return res.to_dict(orient='records')
+

 @statistic('A dictionary of the number of user actions per day')
 def user_actions_per_day(object):
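
The zero-padding introduced in recommended_items_per_day above (and repeated in user_actions_per_day below) can be tried in isolation with the following sketch. The dates, counts and period boundaries are made up, and the anchoring uses a plain DatetimeIndex instead of the object's start()/end() helpers.

import pandas as pd

# toy daily counts, standing in for the groupby-by-date result in the real code
res = pd.Series([3, 1], index=pd.to_datetime(["2022-05-03", "2022-05-05"]))

# made-up stand-ins for the reporting period boundaries
period_start, period_end = pd.to_datetime("2022-05-01"), pd.to_datetime("2022-05-07")

# anchor the series at the period's start and end with a count of 0
# (a single anchor is kept if the two dates coincide)
anchors = pd.Series(0, index=pd.DatetimeIndex([period_start, period_end]).drop_duplicates())

# outer-join on the index, keep the original counts (first column),
# and turn the NaNs introduced by the join back into integer zeros
res = pd.concat([res, anchors], axis=1).iloc[:, 0].fillna(0).astype(int).sort_index()

# fill every in-between day with zero and emit [{date, value}, ...] records
res = res.asfreq("D", fill_value=0)
out = res.to_frame().reset_index()
out.columns = ["date", "value"]
out["date"] = out["date"].dt.strftime("%Y-%m-%d")
print(out.to_dict(orient="records"))
# seven records from 2022-05-01 to 2022-05-07: value 3 on the 3rd, 1 on the 5th, 0 elsewhere
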
@@ -644,10 +663,93 @@ def user_actions_per_day(object):
     # count user_actions for each day found in entries
     res=object.user_actions.groupby(by=object.user_actions['Timestamp'].dt.date).count().iloc[:,0]

+    # create a Series with the period's start and end times and a value of 0
+    init=pd.Series([0,0],index=[pd.to_datetime(start(object)).date(), pd.to_datetime(end(object)).date()])
+
+    # remove duplicate entries for corner cases where start and end time match
+    init.drop_duplicates(keep='first', inplace=True)
+
+    # append the two indexes and values (i.e. 0) above to the Series;
+    # with axis=1, identical indexes are merged;
+    # since a dataframe is created, get the first column
+    res=pd.concat([res,init],ignore_index=False, axis=1).iloc[:, 0]
+
+    # convert NaN values created by the concatenation to 0
+    # and change the data type back to int
+    res=res.fillna(0).astype(int)
+
     # fill the in between days with zero user_actions
     res=res.asfreq('D', fill_value=0)
-
+
     # convert datetimeindex to string
     res.index=res.index.format()

-    return res.to_dict()
+    # convert the series to a dataframe with an extra column holding the dates
+    res = res.to_frame().reset_index()
+
+    # rename columns to date, value
+    res.rename(columns={ res.columns[0]: "date", res.columns[1]: "value" }, inplace = True)
+
+    # return a list of objects with date and value fields
+    return res.to_dict(orient='records')
+
+@metric('The mean value of the accuracy score found for each user, defined as the fraction of the number of correct predictions over the total number of predictions')
+def accuracy(object):
+    """
+    Calculate the accuracy score for each user and return the mean value.
+    The score is calculated by dividing the number of correct predictions
+    by the total number of predictions.
+    """
+    # a list of unique services' ids found in the Datastore
+    services_list=object.services['Service'].unique().tolist()
+    # the length of the above list
+    len_services=services(object)
+
+    def score(x):
+        """
+        Inner function called on each row of the final dataframe
+        in order to calculate the accuracy score for each row (=user)
+        """
+        # the 'Services' header indicates the accessed services' list,
+        # while the 'Service' header indicates the recommended services' list;
+        # if the accessed or recommended services' list is empty,
+        # no further computation is performed for it;
+        # otherwise, for each service found in services_list,
+        # put 1 or 0 depending on whether it is found in the accessed or
+        # recommended services respectively
+        if not x['Services']:
+            true_values=np.array([0]*len_services)
+        else:
+            true_values=np.array(list(map(lambda s: 1 if s in x['Services'] else 0,services_list)))
+        if not x['Service']:
+            pred_values=np.array([0]*len_services)
+        else:
+            pred_values=np.array(list(map(lambda s: 1 if s in x['Service'] else 0,services_list)))
+
+        # calculate the accuracy score by computing the average of the comparison array;
+        # the comparison array is a True/False array indicating whether each element of
+        # true_values is equal to the respective element of pred_values
+        x['Services']=np.average(true_values==pred_values)
+        # return the row, where the 'Services' column now holds the accuracy score
+        return x
+
+    # a matrix of User ids and the respective accessed services' ids
+    access_df=object.users[['User','Services']]
+
+    # a matrix of User ids and the respective recommended services' ids
+    rec_df=(object.recommendations[['User','Service']].groupby(['User'])
+                                                      .agg({'Service': lambda x: x.unique().tolist()})
+                                                      .reset_index())
+
+    # perform a left join on User id, which means that nan values
+    # are set for cases where no recommendations were made
+    data=pd.merge(access_df, rec_df, on='User', how='left')
+    # convert nan values to zeros, in order to be handled easily by the inner function
+    data.fillna(0, inplace = True)
+    # apply the score function row-wise
+    data=data.apply(score, axis=1)
+
+    # return the mean value of all users' accuracy score
+    # up to 4 digits precision
+    return round(data['Services'].mean(),4)
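
The join that feeds score() above follows a common pandas pattern: collapse the recommendation log into one unique list of service ids per user, then left-merge onto the users table so that users without recommendations are kept. A minimal sketch with invented column values:

import pandas as pd

# hypothetical inputs, for illustration only
users = pd.DataFrame({"User": [1, 2], "Services": [["s1", "s3"], ["s2"]]})
recs = pd.DataFrame({"User": [1, 1, 1], "Service": ["s1", "s2", "s1"]})

# one unique list of recommended services per user
rec_df = (recs.groupby("User")
              .agg({"Service": lambda s: s.unique().tolist()})
              .reset_index())

# the left join keeps user 2 although no recommendations exist for them;
# the missing list shows up as NaN, which accuracy() replaces with 0 before scoring
data = pd.merge(users, rec_df, on="User", how="left")
print(data)
# user 1 -> ['s1', 's2'] recommended, user 2 -> NaN (no recommendations)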

preprocessor.py

Lines changed: 7 additions & 2 deletions
@@ -212,6 +212,9 @@ def __init__(self, source_page_id, target_page_id, order):
         _query=query.copy()
         _query['date'] = _query.pop('timestamp')
         for rec in recdb["recommendation"].find(_query).sort("user_id"):
+            # if dataset contains null references to user_ids replace them with the value -1
+            if not rec["user_id"]:
+                rec["user_id"] = -1
             recs.append({'user_id':int(rec['user_id']),
                          'resource_ids': list(map(lambda x: x['service_id'],rec['recommendation'])),
                          'resource_scores': list(map(lambda x: x['score'],rec['recommendation'])),
@@ -224,8 +227,10 @@ def __init__(self, source_page_id, target_page_id, order):
         # store data to Mongo DB

         rsmetrics_db["user_actions"].delete_many({"provider":provider['name'], "ingestion":'batch'})
-        rsmetrics_db["user_actions"].insert_many(luas)
+        if len(luas) > 0:
+            rsmetrics_db["user_actions"].insert_many(luas)

         rsmetrics_db["recommendations"].delete_many({"provider":provider['name'], "ingestion":'batch'})
-        rsmetrics_db["recommendations"].insert_many(recs)
+        if len(recs) > 0:
+            rsmetrics_db["recommendations"].insert_many(recs)
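
The new length checks exist because pymongo's insert_many rejects an empty document list, raising an error rather than performing a no-op write. A minimal sketch of the same guarded-write pattern, with placeholder names:

def safe_insert_many(collection, docs):
    # only touch the database when there is something to write;
    # insert_many() raises on an empty list
    if len(docs) > 0:
        collection.insert_many(docs)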

requirements.txt

Lines changed: 1 addition & 1 deletion
@@ -1,5 +1,5 @@
 beautifulsoup4==4.10.0
-certifi==2021.10.8
+certifi==2022.12.7
 charset-normalizer==2.0.12
 click==8.1.3
 Flask==2.1.2

webservice/app.py

Lines changed: 22 additions & 1 deletion
@@ -102,7 +102,7 @@ def html_metrics(provider_name):
         result[stat_name] = get_statistic(provider_name, stat_name).get_json()

     metrics_needed = ['user_coverage', 'catalog_coverage',
-                      'diversity', 'diversity_gini', 'novelty']
+                      'diversity', 'diversity_gini', 'novelty', 'accuracy']

     for metric_name in metrics_needed:
         result[metric_name] = get_metric(provider_name, metric_name).get_json()
@@ -142,6 +142,27 @@ def html_kpis(provider_name):

     return render_template('./kpis.html', data=result)

+@app.route("/ui/reports/<string:provider_name>/graphs", strict_slashes=False)
+def html_graphs(provider_name):
+    '''Serve html page about graphs per provider'''
+    reports = db_get_provider_names()
+    if provider_name not in reports:
+        abort(404)
+
+    result = {}
+
+    stats_needed = ['start', 'end']
+    for stat_name in stats_needed:
+        result[stat_name] = get_statistic(provider_name, stat_name).get_json()
+
+    result['timestamp'] = get_api_index(provider_name).get_json()['timestamp']
+    result['sidebar_info'] = app.sidebar_info
+    result['report'] = provider_name
+    result['reports'] = reports
+    result['metric_active'] = None
+
+    return render_template('./graphs.html', data=result)
+

 @app.route("/ui/descriptions/metrics/<string:metric_name>", strict_slashes=False)
 def html_metric_description(metric_name):
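
The new graphs page can be exercised with Flask's built-in test client; the import path and the provider name below are assumptions for the sake of the example, not something stated in the diff.

# hedged usage sketch; `webservice.app` as an import path and the provider name are assumed
from webservice.app import app

with app.test_client() as client:
    resp = client.get("/ui/reports/example-provider/graphs")
    # 200 when the provider exists in the stored report names, 404 otherwise
    print(resp.status_code)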
