
Commit 291ee05

Merge pull request #104 from ARGOeu/devel
Version 1.0.4
2 parents d0a98f4 + 726062e commit 291ee05

36 files changed, +1571 −100 lines changed

environment.yml

Lines changed: 29 additions & 29 deletions
@@ -2,53 +2,53 @@ name: rsmetrics
 channels:
   - defaults
 dependencies:
-  - _libgcc_mutex=0.1=main
-  - _openmp_mutex=4.5=1_gnu
-  - ca-certificates=2022.3.18=h06a4308_0
-  - certifi=2021.10.8=py39h06a4308_2
-  - ld_impl_linux-64=2.35.1=h7274673_9
-  - libffi=3.3=he6710b0_2
-  - libgcc-ng=9.3.0=h5101ec6_17
-  - libgomp=9.3.0=h5101ec6_17
-  - libstdcxx-ng=9.3.0=hd4cf53a_17
-  - ncurses=6.3=h7f8727e_2
-  - openssl=1.1.1n=h7f8727e_0
-  - pip=21.2.4=py39h06a4308_0
-  - python=3.9.11=h12debd9_2
-  - readline=8.1.2=h7f8727e_1
-  - setuptools=58.0.4=py39h06a4308_0
-  - sqlite=3.38.0=hc218d9a_0
-  - tk=8.6.11=h1ccaba5_0
-  - tzdata=2021e=hda174b7_0
-  - wheel=0.37.1=pyhd3eb1b0_0
-  - xz=5.2.5=h7b6447c_0
-  - zlib=1.2.11=h7f8727e_4
+  - _libgcc_mutex=0.1
+  - _openmp_mutex=4.5
+  - ca-certificates=2022.3.18
+  - ld_impl_linux-64=2.35.1
+  - libffi=3.3
+  - libgcc-ng=9.3.0
+  - libgomp=9.3.0
+  - libstdcxx-ng=9.3.0
+  - ncurses=6.3
+  - openssl=1.1.1n
+  - pip=21.2.4
+  - python=3.9.11
+  - readline=8.1.2
+  - setuptools=58.0.4
+  - sqlite=3.38.0
+  - tk=8.6.11
+  - tzdata=2021e
+  - wheel=0.37.1
+  - xz=5.2.5
+  - zlib=1.2.11
   - pip:
     - beautifulsoup4==4.10.0
-    - certifi==2021.10.8
+    - certifi==2022.12.7
     - charset-normalizer==2.0.12
     - click==8.1.3
-    - Flask==2.1.2
+    - flask==2.1.2
+    - flask-pymongo==2.3.0
     - idna==3.3
     - importlib-metadata==4.11.4
     - itsdangerous==2.1.2
-    - Jinja2==3.1.2
+    - jinja2==3.1.2
     - joblib==1.2.0
-    - MarkupSafe==2.1.1
+    - markupsafe==2.1.1
     - natsort==8.1.0
     - numpy==1.22.3
     - pandas==1.4.2
+    - pyarrow==10.0.1
     - pymongo==4.1.0
+    - pymongoarrow==0.6.2
     - python-dateutil==2.8.2
     - python-dotenv==0.20.0
     - pytz==2022.1
-    - PyYAML==6.0
+    - pyyaml==6.0
     - requests==2.27.1
     - scipy==1.8.0
     - six==1.16.0
     - soupsieve==2.3.2
     - urllib3==1.26.9
-    - Werkzeug==2.1.2
+    - werkzeug==2.1.2
     - zipp==3.8.0
-    - flask-pymongo==2.3.0
-    - pymongoarrow==0.6.2

metric_descriptions/accuracy.yml

Lines changed: 42 additions & 0 deletions
@@ -0,0 +1,42 @@
+name: Accuracy
+
+summary: >
+  Measures the recommendations' accuracy based on the users' access to the services. A value of 1 indicates that the RS model got every prediction right, and a value of 0 indicates that it did not make a single correct prediction.
+
+description: >
+  The accuracy (\(A\)) of the recommendations is based on the users' access to the services. A value of 1 indicates that the RS model got every prediction right, and a value of 0 indicates that it did not make a single correct prediction. In general, accuracy is defined as:
+  $$A=\frac{Number\;of\;correct\;predictions}{Total\;number\;of\;predictions}$$ In RS Metrics the computation is determined by the following formula:
+  $$Accuracy=\frac{Number\;of\;correctly\;recommended\;services}{Total\;number\;of\;services}$$ where a recommendation counts as correct when the service is both accessed by the user and recommended by the RS.
+
+output:
+  type: float
+  min: 0
+  max: 1
+  comment: A value of 1 indicates that the RS model got every prediction right, and a value of 0 indicates that it did not make a single correct prediction.
+
+prerequisites:
+  - recommendations without anonymous users
+  - all available users (with their accessed services)
+  - all available services
+
+process:
+  - step: Clean up
+    details: >
+      Recommendations clean-up; removal of entries whose users or services are not found in "users" or "services" respectively.
+  - step: Vector creation of the accessed services
+    details: >
+      For each user, create a vector the size of the number of services and assign each service a binary value: 1 if it is found in the user's accessed services, 0 otherwise.
+  - step: Vector creation of the recommended services
+    details: >
+      For each user, create a vector the size of the number of services and assign each service a binary value: 1 if it is recommended to the user, 0 otherwise.
+  - step: Accuracy score computation
+    details: >
+      For each user, compute the average value of the agreement vector; a vector which is True where the accessed and recommended vectors hold the same value for a service and False otherwise.
+  - step: Mean value of the accuracy score
+    details: >
+      Compute the overall value as the mean of each user's accuracy score.
+
+# This is optional for visual stylization of the metric when displayed on the report
+style:
+  icon: pe-7s-arc
+  color: bg-night-sky
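
Note: the process above can be checked by hand on a small example. The following sketch is not part of the commit; it mirrors the binary-vector computation described in the steps, with a made-up service catalogue and made-up user data.

import numpy as np

# hypothetical toy data, for illustration only
services = ["s1", "s2", "s3", "s4"]
accessed = {"u1": ["s1", "s3"], "u2": ["s2"]}
recommended = {"u1": ["s1", "s2"], "u2": ["s2", "s4"]}

def user_accuracy(user):
    # steps 2 and 3: binary vectors over the whole catalogue
    acc = np.array([1 if s in accessed[user] else 0 for s in services])
    rec = np.array([1 if s in recommended[user] else 0 for s in services])
    # step 4: fraction of services on which the two vectors agree
    return np.average(acc == rec)

# step 5: the overall metric is the mean of the per-user scores
print(round(np.mean([user_accuracy(u) for u in accessed]), 4))
# u1 agrees on s1 and s4 (0.5), u2 agrees on s1, s2 and s3 (0.75), so the mean is 0.625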

metrics.py

Lines changed: 114 additions & 12 deletions
@@ -609,25 +609,44 @@ def top5_services_ordered(object, k=5, base='https://marketplace.eosc-portal.eu'

     return topk_services

-@statistic('A dictionary of the number of recommendations per day')
-def recommendations_per_day(object):
+@statistic('A dictionary of the number of recommended items per day')
+def recommended_items_per_day(object):
     """
-    It returns a statistical report in dictionary format. Specifically, the key
-    is set for each particular day found and its value contains the respective
-    number of recommendations committed. The dictionary includes all in-between
-    days (obviously, with the count set to zero). Recommendations are already
-    filtered by those where the user or service does not exist in users' or services' catalogs.
+    It returns a timeseries of recommended item counts per day. Each timeseries item has two fields: date and value
     """
     # count recommendations for each day found in entries
     res=object.recommendations.groupby(by=object.recommendations['Timestamp'].dt.date).count().iloc[:,0]

-    # fill the in between days with zero recommendations
-    res=res.asfreq('D', fill_value=0)
+    # create a Series with the period's start and end times and a value of 0
+    init=pd.Series([0,0],index=[pd.to_datetime(start(object)).date(), pd.to_datetime(end(object)).date()])
+
+    # remove duplicate entries for corner cases where start and end time match
+    init.drop_duplicates(keep='first', inplace=True)
+
+    # append the two indexes and values (i.e. 0) above to the Series;
+    # with axis=1, identical indexes are merged;
+    # since a dataframe is created, get the first column
+    res=pd.concat([res,init],ignore_index=False, axis=1).iloc[:, 0]

+    # convert NaN values created by the concatenation to 0
+    # and change the data type back to int
+    res=res.fillna(0).astype(int)
+
+    # fill the in between days with zero recommendations
+    res=res.asfreq('D', fill_value=0)
+
     # convert datetimeindex to string
     res.index=res.index.format()

-    return res.to_dict()
+    # convert the series to a dataframe with an extra column holding the dates
+    res = res.to_frame().reset_index()
+
+    # rename columns to date, value
+    res.rename(columns={ res.columns[0]: "date", res.columns[1]: "value" }, inplace = True)
+
+    # return a list of objects with date and value fields
+    return res.to_dict(orient='records')
+

 @statistic('A dictionary of the number of user actions per day')
 def user_actions_per_day(object):
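
The zero-padding introduced in recommended_items_per_day above (and repeated in user_actions_per_day below) can be tried in isolation with the following sketch. The dates, counts and period boundaries are made up, and the anchoring uses a plain DatetimeIndex instead of the object's start()/end() helpers.

import pandas as pd

# toy daily counts, standing in for the groupby-by-date result in the real code
res = pd.Series([3, 1], index=pd.to_datetime(["2022-05-03", "2022-05-05"]))

# made-up stand-ins for the reporting period boundaries
period_start, period_end = pd.to_datetime("2022-05-01"), pd.to_datetime("2022-05-07")

# anchor the series at the period's start and end with a count of 0
# (a single anchor is kept if the two dates coincide)
anchors = pd.Series(0, index=pd.DatetimeIndex([period_start, period_end]).drop_duplicates())

# outer-join on the index, keep the original counts (first column),
# and turn the NaNs introduced by the join back into integer zeros
res = pd.concat([res, anchors], axis=1).iloc[:, 0].fillna(0).astype(int).sort_index()

# fill every in-between day with zero and emit [{date, value}, ...] records
res = res.asfreq("D", fill_value=0)
out = res.to_frame().reset_index()
out.columns = ["date", "value"]
out["date"] = out["date"].dt.strftime("%Y-%m-%d")
print(out.to_dict(orient="records"))
# seven records from 2022-05-01 to 2022-05-07: value 3 on the 3rd, 1 on the 5th, 0 elsewhere
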
@@ -644,10 +663,93 @@ def user_actions_per_day(object):
     # count user_actions for each day found in entries
     res=object.user_actions.groupby(by=object.user_actions['Timestamp'].dt.date).count().iloc[:,0]

+    # create a Series with the period's start and end times and a value of 0
+    init=pd.Series([0,0],index=[pd.to_datetime(start(object)).date(), pd.to_datetime(end(object)).date()])
+
+    # remove duplicate entries for corner cases where start and end time match
+    init.drop_duplicates(keep='first', inplace=True)
+
+    # append the two indexes and values (i.e. 0) above to the Series;
+    # with axis=1, identical indexes are merged;
+    # since a dataframe is created, get the first column
+    res=pd.concat([res,init],ignore_index=False, axis=1).iloc[:, 0]
+
+    # convert NaN values created by the concatenation to 0
+    # and change the data type back to int
+    res=res.fillna(0).astype(int)
+
     # fill the in between days with zero user_actions
     res=res.asfreq('D', fill_value=0)
-
+
     # convert datetimeindex to string
     res.index=res.index.format()

-    return res.to_dict()
+    # convert the series to a dataframe with an extra column holding the dates
+    res = res.to_frame().reset_index()
+
+    # rename columns to date, value
+    res.rename(columns={ res.columns[0]: "date", res.columns[1]: "value" }, inplace = True)
+
+    # return a list of objects with date and value fields
+    return res.to_dict(orient='records')
+
+@metric('The mean value of the accuracy score found for each user, defined as the fraction of the number of correct predictions over the total number of predictions')
+def accuracy(object):
+    """
+    Calculate the accuracy score for each user and return the mean value.
+    The score is calculated by dividing the number of correct predictions
+    by the total number of predictions.
+    """
+    # a list of unique services' ids found in the Datastore
+    services_list=object.services['Service'].unique().tolist()
+    # the length of the above list
+    len_services=services(object)
+
+    def score(x):
+        """
+        Inner function called on each row of the final dataframe
+        in order to calculate the accuracy score for each row (=user)
+        """
+        # the 'Services' header indicates the accessed services' list,
+        # while the 'Service' header indicates the recommended services' list;
+        # if the accessed or recommended services' list is empty,
+        # no further computation is performed for it;
+        # otherwise, for each service found in services_list,
+        # put 1 or 0 depending on whether it is found in the accessed or
+        # recommended services respectively
+        if not x['Services']:
+            true_values=np.array([0]*len_services)
+        else:
+            true_values=np.array(list(map(lambda s: 1 if s in x['Services'] else 0,services_list)))
+        if not x['Service']:
+            pred_values=np.array([0]*len_services)
+        else:
+            pred_values=np.array(list(map(lambda s: 1 if s in x['Service'] else 0,services_list)))
+
+        # calculate the accuracy score by computing the average of the comparison array;
+        # the comparison array is a True/False array indicating whether each element of
+        # true_values is equal to the respective element of pred_values
+        x['Services']=np.average(true_values==pred_values)
+        # return the row, where the 'Services' column now holds the accuracy score
+        return x
+
+    # a matrix of User ids and the respective accessed services' ids
+    access_df=object.users[['User','Services']]
+
+    # a matrix of User ids and the respective recommended services' ids
+    rec_df=(object.recommendations[['User','Service']].groupby(['User'])
+                                                      .agg({'Service': lambda x: x.unique().tolist()})
+                                                      .reset_index())
+
+    # perform a left join on User id, which means that nan values
+    # are set for cases where no recommendations were made
+    data=pd.merge(access_df, rec_df, on='User', how='left')
+    # convert nan values to zeros, in order to be handled easily by the inner function
+    data.fillna(0, inplace = True)
+    # apply the score function row-wise
+    data=data.apply(score, axis=1)
+
+    # return the mean value of all users' accuracy score
+    # up to 4 digits precision
+    return round(data['Services'].mean(),4)
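
The join that feeds score() above follows a common pandas pattern: collapse the recommendation log into one unique list of service ids per user, then left-merge onto the users table so that users without recommendations are kept. A minimal sketch with invented column values:

import pandas as pd

# hypothetical inputs, for illustration only
users = pd.DataFrame({"User": [1, 2], "Services": [["s1", "s3"], ["s2"]]})
recs = pd.DataFrame({"User": [1, 1, 1], "Service": ["s1", "s2", "s1"]})

# one unique list of recommended services per user
rec_df = (recs.groupby("User")
              .agg({"Service": lambda s: s.unique().tolist()})
              .reset_index())

# the left join keeps user 2 although no recommendations exist for them;
# the missing list shows up as NaN, which accuracy() replaces with 0 before scoring
data = pd.merge(users, rec_df, on="User", how="left")
print(data)
# user 1 -> ['s1', 's2'] recommended, user 2 -> NaN (no recommendations)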

preprocessor.py

Lines changed: 7 additions & 2 deletions
@@ -212,6 +212,9 @@ def __init__(self, source_page_id, target_page_id, order):
         _query=query.copy()
         _query['date'] = _query.pop('timestamp')
         for rec in recdb["recommendation"].find(_query).sort("user_id"):
+            # if dataset contains null references to user_ids replace them with the value -1
+            if not rec["user_id"]:
+                rec["user_id"] = -1
             recs.append({'user_id':int(rec['user_id']),
                          'resource_ids': list(map(lambda x: x['service_id'],rec['recommendation'])),
                          'resource_scores': list(map(lambda x: x['score'],rec['recommendation'])),
@@ -224,8 +227,10 @@ def __init__(self, source_page_id, target_page_id, order):
         # store data to Mongo DB

         rsmetrics_db["user_actions"].delete_many({"provider":provider['name'], "ingestion":'batch'})
-        rsmetrics_db["user_actions"].insert_many(luas)
+        if len(luas) > 0:
+            rsmetrics_db["user_actions"].insert_many(luas)

         rsmetrics_db["recommendations"].delete_many({"provider":provider['name'], "ingestion":'batch'})
-        rsmetrics_db["recommendations"].insert_many(recs)
+        if len(recs) > 0:
+            rsmetrics_db["recommendations"].insert_many(recs)
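
The new length checks exist because pymongo's insert_many rejects an empty document list, raising an error rather than performing a no-op write. A minimal sketch of the same guarded-write pattern, with placeholder names:

def safe_insert_many(collection, docs):
    # only touch the database when there is something to write;
    # insert_many() raises on an empty list
    if len(docs) > 0:
        collection.insert_many(docs)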

requirements.txt

Lines changed: 1 addition & 1 deletion
@@ -1,5 +1,5 @@
 beautifulsoup4==4.10.0
-certifi==2021.10.8
+certifi==2022.12.7
 charset-normalizer==2.0.12
 click==8.1.3
 Flask==2.1.2

webservice/app.py

Lines changed: 22 additions & 1 deletion
@@ -102,7 +102,7 @@ def html_metrics(provider_name):
         result[stat_name] = get_statistic(provider_name, stat_name).get_json()

     metrics_needed = ['user_coverage', 'catalog_coverage',
-                      'diversity', 'diversity_gini', 'novelty']
+                      'diversity', 'diversity_gini', 'novelty', 'accuracy']

     for metric_name in metrics_needed:
         result[metric_name] = get_metric(provider_name, metric_name).get_json()
@@ -142,6 +142,27 @@ def html_kpis(provider_name):

     return render_template('./kpis.html', data=result)

+@app.route("/ui/reports/<string:provider_name>/graphs", strict_slashes=False)
+def html_graphs(provider_name):
+    '''Serve html page about graphs per provider'''
+    reports = db_get_provider_names()
+    if provider_name not in reports:
+        abort(404)
+
+    result = {}
+
+    stats_needed = ['start', 'end']
+    for stat_name in stats_needed:
+        result[stat_name] = get_statistic(provider_name, stat_name).get_json()
+
+    result['timestamp'] = get_api_index(provider_name).get_json()['timestamp']
+    result['sidebar_info'] = app.sidebar_info
+    result['report'] = provider_name
+    result['reports'] = reports
+    result['metric_active'] = None
+
+    return render_template('./graphs.html', data=result)
+

 @app.route("/ui/descriptions/metrics/<string:metric_name>", strict_slashes=False)
 def html_metric_description(metric_name):
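
The new graphs page can be exercised with Flask's built-in test client; the import path and the provider name below are assumptions for the sake of the example, not something stated in the diff.

# hedged usage sketch; `webservice.app` as an import path and the provider name are assumed
from webservice.app import app

with app.test_client() as client:
    resp = client.get("/ui/reports/example-provider/graphs")
    # 200 when the provider exists in the stored report names, 404 otherwise
    print(resp.status_code)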
