Project-Skill-Tree · Alex-Scofield · Sep 29, 2022 · Oct 2, 2022 · Jun 11, 2023 · Jun 13, 2023
diff --git a/README.md b/README.md
@@ -1,5 +1,16 @@
-# Skill-Tree-Data-Analytics
-A python program to compile useful insights from users' skill tree data.
-# Installation
+# Skill-Tree Data-Analytics
+Python library that streamlines the process of data analysis for Project Skill Tree.
+
+## Installation
 To install the necessary dependencies run the command pip install -r requirements.txt
-To access the Database you must use a Database User and a Database Password, stored as enviornment variables as "STDB_USER" and "STDB_PASSWORD" respectively.
+
+To access the database you must supply a database user and a database password, stored in the environment variables "STDB_USER" and "STDB_PASSWORD" respectively.
+
+## Usage
+There are two parts to the repository: 
+1) The utilities package.
+2) Jupyter notebooks on which the Data Analysis takes place.
+
+The utilities package contains methods that process the data. There is a pandas DataFrame for each type of data, which can be manipulated as needed in the notebooks.
+
+Currently the old version of the source code is still in the repository, until refactoring is completed.
diff --git a/documentation.txt b/documentation.txt
diff --git a/requirements.txt b/requirements.txt
@@ -1,4 +1,5 @@
 matplotlib
 pymongo
 pymongo[srv]
-pandas
+pandas
+seaborn
diff --git a/src/__init__.py → src (outdated)/__init__.py b/src/__init__.py → src (outdated)/__init__.py
diff --git a/src (outdated)/example.py b/src (outdated)/example.py
@@ -0,0 +1,34 @@
+'''
+There are two modules that can be used to access the SkillTree data. 
+We will begin by looking at stdata, which includes a series of methods whose
+objective is to perform calculations on the raw data from the database, compiling it into
+more comprehensible formats.
+'''
+import stdata
+
+'''
+There are now various ways of proceeding. The way the stdata module is structured is very
+simple. There is a class for UserData, one for SkillData and one for ChallengeData. Each of
+these is equipped with a wide range of methods, which we must call in order to perform
+our analysis on the data. The easiest way of doing so is by creating an instance in place as follows.
+'''
+
+number_users = stdata.UserData().count_users()
+
+'''
+If you're going to call various methods of the UserData class, it might be better to use the 
+following code.
+'''
+
+userData = stdata.UserData()
+number_users = userData.count_users()
+timezone_info = userData.timezone_counter()
+
+'''
+In order to make queries more interesting it can be very useful to make use of the parameters
+that are available in each of the methods.
+'''
+
+# Completion rate of the skills belonging to the fitness category, where users are in timezone 0.
+# SkillData inherits get_completion_rate(user_parameter, action_parameter) from ActionData,
+# so the skill filter is passed as action_parameter.
+data = stdata.SkillData().get_completion_rate(action_parameter={"category":"fitness"}, user_parameter={"timezone":0})
+print(data)
diff --git a/src (outdated)/gui.py b/src (outdated)/gui.py
@@ -0,0 +1,31 @@
+from stdata import *
+from stgraphs import *
+from PyQt6.QtWidgets import *
+import sys
+
+# Create the PyQt application object; the window classes are defined below.
+app = QApplication(sys.argv)
+
+
+class MainWin(QMainWindow):
+    """Main window whose only widget is a button that opens a SkillWin."""
+
+    def __init__(self):
+        super().__init__()
+        # The button doubles as the central widget of the window.
+        self.button = QPushButton('Top')
+        self.setCentralWidget(self.button)
+        self.button.clicked.connect(self.show_new_window)
+
+    def show_new_window(self, checked):
+        """Slot connected to the button's clicked signal: create and show a SkillWin."""
+        # The new window is stored on the instance as self.w.
+        self.w = SkillWin()
+        self.w.show()
+
+class SkillWin(QWidget):
+    """Window that shows the result of SkillData().get_ease() as plain text."""
+
+    def __init__(self):
+        super().__init__()
+        # Render the get_ease() result with str() inside a single label.
+        ease_text = str(SkillData().get_ease())
+        box = QVBoxLayout()
+        box.addWidget(QLabel(ease_text))
+        self.setLayout(box)
+
+# Create and show the main window, then run the application via app.exec().
+mainWin = MainWin()
+mainWin.show()
+app.exec()
diff --git a/src (outdated)/play.py b/src (outdated)/play.py
@@ -0,0 +1,4 @@
+from stdata import *
+import stgraphs
+
+# Plot the "skills by ease" chart through stgraphs.SkillGraph.
+stgraphs.SkillGraph().graph_skills_by_ease()
diff --git a/src (outdated)/stdata.py b/src (outdated)/stdata.py
@@ -0,0 +1,129 @@
+from pymongo import MongoClient
+from pymongo.server_api import ServerApi
+import os
+from collections import Counter, OrderedDict
+import pandas as pd
+
+################ CREATE GET CSV
+
+
+# Useful function to make sense of the raw data
+def count_and_order(list_to_order) -> OrderedDict:
+    """Count how often each item occurs and return the counts, most frequent first."""
+    tally = Counter(list_to_order)
+    ranked_pairs = tally.most_common()
+    return OrderedDict(ranked_pairs)
+
+# Base class for all the different types of data
+class DataObject():
+    """Base class holding the MongoDB client and collections shared by all data classes.
+
+    The client and the collection handles are class attributes, so they are
+    created once, when this class body is executed, and are shared by every
+    subclass and instance.
+    """
+    db_user = os.getenv("STDB_USER")
+    # The README documents the password variable as STDB_PASSWORD; STDB_PASS is
+    # still honoured as a fallback so existing setups keep working.
+    db_password = os.getenv("STDB_PASSWORD") or os.getenv("STDB_PASS")
+    client = MongoClient(f'mongodb+srv://{db_user}:{db_password}@adonis.n0u0i.mongodb.net/Database?retryWrites=true&w=majority', server_api=ServerApi('1'))
+    db = client.Database
+    users = db.Users
+    challenges = db.Challenges
+    items = db.Items
+    skills = db.Skills
+    tasks = db.Tasks
+
+    # Run after each call to close the connection with the Database
+    def close(self) -> None:
+        """Close the class-level client, which is shared by every DataObject."""
+        DataObject.client.close()
+
+# Includes methods common to skills and challenges
+class ActionData (DataObject):
+    """Queries shared by skills and challenges.
+
+    Subclasses set ``data_type`` (the collection to query) and ``completed`` /
+    ``progress`` (the names of the user fields that list finished and
+    in-progress item ids).
+    """
+
+    def __init__(self):
+        self.data_type = None
+        self.completed = None
+        self.progress = None
+        self.find_description = None
+
+    def id_to_goals(self, dictionary) -> list:
+        """Pair the ``goals`` field of each item id in *dictionary* with its value.
+
+        Returns a list of ``(goals, value)`` tuples in the iteration order of
+        *dictionary*.
+        """
+        descriptions = [self.data_type.find_one({"_id": item})["goals"] for item in dictionary]
+        return list(zip(descriptions, dictionary.values()))
+
+    def order_by_popularity(self, user_parameter={}) -> dict:
+        """Count completions per item id over the matching users, most completed first."""
+        # Flatten the per-user lists of completed ids into a single list.
+        completed_ids = [
+            item
+            for user in DataObject.users.find(user_parameter)
+            for item in user[self.completed]
+        ]
+        return count_and_order(completed_ids)
+
+    def get_completion_rate(self, user_parameter={}, action_parameter={}) -> dict:
+        """Compute per-item completion statistics over the matching users.
+
+        Only items matching *action_parameter* are counted. The result maps each
+        item id to a dict with the keys ``Started`` (in progress + completed),
+        ``Progress``, ``Completed`` and ``Score`` (completed / started), ordered
+        by ascending ``Score``.
+        """
+        # A set keeps the membership tests in the loop below O(1).
+        wanted = {item["_id"] for item in self.data_type.find(action_parameter)}
+        completed_counted = Counter()
+        progress_counted = Counter()
+
+        for user in self.users.find(user_parameter):
+            completed_counted.update(entry for entry in user[self.completed] if entry in wanted)
+            progress_counted.update(entry for entry in user[self.progress] if entry in wanted)
+
+        # NOTE(review): only ids that at least one user has in progress are reported, so an
+        # item that is completed but in nobody's progress list is left out - confirm intended.
+        data_unordered = {}
+        for key, in_progress in progress_counted.items():
+            done = completed_counted[key]
+            started = in_progress + done
+            data_unordered[key] = {
+                'Started': started,
+                'Progress': in_progress,
+                'Completed': done,
+                'Score': float(done) / float(started),
+            }
+
+        return dict(sorted(data_unordered.items(), key=lambda x: x[1]['Score']))
+
+    def get_ease(self, action_parameter={}) -> dict:
+        """Map the first goal of each item to its completion ``Score``.
+
+        Items come from ``get_completion_rate(action_parameter=...)`` and are
+        visited in the order that method returns them.
+        """
+        data = self.get_completion_rate(action_parameter=action_parameter)
+        return {
+            self.data_type.find_one({"_id": item_id})["goals"][0]: stats['Score']
+            for item_id, stats in data.items()
+        }
+
+
+class UserData (DataObject):
+    """Aggregate statistics over the Users collection."""
+
+    def count_users(self, parameter={}) -> int:
+        """Return the number of users matching *parameter*."""
+        # Let the server count instead of downloading every matching document.
+        return self.users.count_documents(parameter)
+
+    def timezone_counter(self, parameter={}) -> OrderedDict:
+        """Count matching users per timezone (converted to str), most common first."""
+        return count_and_order([str(user["timezone"]) for user in self.users.find(parameter)])
+
+    def number_skills_completed_dict(self, parameter={}) -> OrderedDict:
+        """Count matching users per number of completed skills, most common first."""
+        return count_and_order([len(user["skillscompleted"]) for user in self.users.find(parameter)])
+
+    def number_skills_completed_data(self, parameter={}) -> pd.Series:
+        """Return the pandas ``describe()`` summary of completed-skill counts per user."""
+        return pd.Series([len(user["skillscompleted"]) for user in self.users.find(parameter)]).describe()
+
+    def days_tracked_data(self, parameter={}) -> pd.Series:
+        """Return the pandas ``describe()`` summary of ``numDaysTracked`` per user."""
+        return pd.Series([user["numDaysTracked"] for user in self.users.find(parameter)]).describe()
+
+
+class SkillData(ActionData):
+    """ActionData bound to the Skills collection and the users' skill fields."""
+
+    def __init__(self):
+        super().__init__()
+        self.data_type = DataObject.skills
+        self.completed = "skillscompleted"
+        self.progress = "skillsinprogress"
+
+    def id_to_title_and_level(self, dictionary) -> dict:
+        """Re-key *dictionary* from skill ids to ``(title, level)`` tuples."""
+        title_and_level = []
+        for item in dictionary:
+            # One lookup per skill; both fields come from the same document.
+            skill = self.data_type.find_one({"_id": item})
+            title_and_level.append((skill["title"], skill["level"]))
+        return dict(zip(title_and_level, dictionary.values()))
+
+    def get_skills_csv(self) -> None:
+        """Write the completion statistics of every skill to ``skills.csv``.
+
+        Each row holds the (title, level) key, the goals, and the Started,
+        Progress, Completed and Score values, separated by ``;``.
+        """
+        import csv
+        data = self.get_completion_rate()
+        titles = list(self.id_to_title_and_level(data).keys())
+        goals = [item[0] for item in self.id_to_goals(data)]
+        rows = [
+            [title, goal, stats["Started"], stats["Progress"], stats["Completed"], stats["Score"]]
+            for title, goal, stats in zip(titles, goals, data.values())
+        ]
+
+        # The csv module expects files handed to csv.writer to be opened with newline='';
+        # without it, platforms that translate line endings write an extra \r per row.
+        with open('skills.csv', 'w', encoding='UTF8', newline='') as f:
+            csv.writer(f, delimiter=';').writerows(rows)
+
+class ChallengeData(ActionData):
+    """ActionData bound to the Challenges collection and the users' challenge fields."""
+
+    def __init__(self):
+        super().__init__()
+        # Names of the user fields read for completed / in-progress challenges.
+        self.completed = "challengescompleted"
+        self.progress = "challengesinprogress"
+        # Collection the inherited queries run against.
+        self.data_type = DataObject.challenges
+
+# NOTE(review): this call runs whenever the module is imported (gui.py and play.py both
+# import it), so every import queries the database and rewrites skills.csv - confirm intended.
+SkillData().get_skills_csv()
diff --git a/src/stgraphs.py → src (outdated)/stgraphs.py b/src/stgraphs.py → src (outdated)/stgraphs.py
@@ -27,9 +27,16 @@ def pie_timezones(self, user_parameter={}, tight_layout=True) -> None:
     def bar_timezones(self, user_parameter={}, tight_layout=True) -> None:
+        # Bar chart of the UserData().timezone_counter() counts for users matching user_parameter.
         data = UserData().timezone_counter(parameter=user_parameter)
         x = data.keys()
-        y= data.values()
+        y = data.values()
         plt.bar(x, y)
         self.set_plot("Users per timezone", tight_layout)
+
+    def graph_number_skills_completed(self, user_parameter={}, tight_layout=True) -> None:
+        """Bar chart of how many matching users completed each number of skills."""
+        counts = UserData().number_skills_completed_dict(parameter=user_parameter)
+        plt.bar(counts.keys(), counts.values())
+        self.set_plot("Number of skills completed", tight_layout)
 
 class SkillGraph(GraphObject):
     def graph_skills_by_popularity(self, user_parameter={}, amount=10, graph_all=False, tight_layout=True) -> None:
@@ -49,9 +56,9 @@ def graph_skills_by_popularity(self, user_parameter={}, amount=10, graph_all=Fal
 
         self.set_plot("Skill Popularity", tight_layout)
 
-    def graph_skills_by_ease(self, skill_parameter={}, tight_layout=False) -> None:
+    def graph_skills_by_ease(self, skill_parameter={}, tight_layout=False, amount=10) -> None:
+        # Horizontal bar chart of the first `amount` entries returned by list_skills_by_ease().
+        # NOTE(review): the stdata.py added in this change defines get_ease(action_parameter=...)
+        # but no list_skills_by_ease(skill_parameter=...) - confirm this call still resolves.
         data = SkillData().list_skills_by_ease(skill_parameter=skill_parameter)
-        plt.bar(data.keys(), data.values())
+        plt.barh( list(data.keys())[:amount], list(data.values())[:amount])
         plt.xlabel("Completion_rate")
         self.set_plot("Skills by ease", tight_layout=tight_layout)
 
@@ -72,4 +79,3 @@ def graph_challenges_by_popularity(self, user_parameter={}, amount=10, graph_all
             plt.text(v + 1, i, str(v), color='blue', fontweight='bold')
 
         self.set_plot("Challenge Popularity", tight_layout)
-