Skip to content

New Notebook analysing correlations between Skill Completions #4

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 6 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 15 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,16 @@
# Skill-Tree-Data-Analytics
A python program to compile useful insights from users' skill tree data.
# Installation
# Skill-Tree Data-Analytics
Python library that streamlines the process of data analysis for Project Skill Tree.

## Installation
To install the necessary dependencies run the command pip install -r requirements.txt
To access the database you must supply a database user and a database password, stored in the environment variables "STDB_USER" and "STDB_PASSWORD" respectively.

To access the database you must supply a database user and a database password, stored in the environment variables "STDB_USER" and "STDB_PASSWORD" respectively.

## Usage
There are two parts to the repository:
1) The utilities package.
2) Jupyter notebooks on which the Data Analysis takes place.

The utilities package contains methods that process the data. There is a pandas DataFrame for each type of data, which can be manipulated as needed in the notebooks.

Currently the old version of the source code is still in the repository, until refactoring is completed.
37 changes: 0 additions & 37 deletions documentation.txt

This file was deleted.

3 changes: 2 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
matplotlib
pymongo
pymongo[srv]
pandas
pandas
seaborn
File renamed without changes.
34 changes: 34 additions & 0 deletions src (outdated)/example.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
"""Usage examples for the SkillTree data modules.

There are two modules that can be used to access the SkillTree data.
We begin by looking at stdata, which includes a series of methods whose
objective is to perform calculations on the raw data, compiling it into
more comprehensible formats.
"""
import stdata

# There are various ways of proceeding. The stdata module is structured very
# simply: there is a class for UserData, one for SkillData and one for
# ChallengeData. Each of these is equipped with a range of methods, which we
# call in order to perform our analysis on the data. The easiest way of doing
# so is by creating an instance in place, as follows.

number_users = stdata.UserData().count_users()

# If you're going to call various methods of the UserData class, it might be
# better to keep a single instance around, as in the following code.

userData = stdata.UserData()
number_users = userData.count_users()
timezone_info = userData.timezone_counter()

# In order to make queries more interesting it can be very useful to make use
# of the filter parameters that are available in each of the methods.

# Completion rate of the skills belonging to the fitness category, where users are in timezone 0
# (SkillData inherits get_completion_rate; the skill filter is passed as action_parameter.)
data = stdata.SkillData().get_completion_rate(
    user_parameter={"timezone": 0},
    action_parameter={"category": "fitness"},
)
print(data)
31 changes: 31 additions & 0 deletions src (outdated)/gui.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
from stdata import *
from stgraphs import *
from PyQt6.QtWidgets import *
import sys

# Definition of PyQt App, Layout and Window
app = QApplication(sys.argv)


class MainWin(QMainWindow):
    """Main window whose only content is a button that opens a SkillWin."""

    def __init__(self):
        super().__init__()
        top_button = QPushButton('Top')
        self.button = top_button
        # Clicking the button opens the secondary window.
        top_button.clicked.connect(self.show_new_window)
        self.setCentralWidget(top_button)

    def show_new_window(self, checked):
        """Create a SkillWin, keep a reference to it on ``self.w`` and show it.

        ``checked`` is the argument delivered by the button's ``clicked``
        signal; it is not used.
        """
        skill_window = SkillWin()
        self.w = skill_window
        skill_window.show()

class SkillWin(QWidget):
    """Window showing the result of ``SkillData().get_ease()`` as plain text."""

    def __init__(self):
        super().__init__()
        vbox = QVBoxLayout()
        # The ease data is fetched while the window is being constructed.
        ease_text = str(SkillData().get_ease())
        vbox.addWidget(QLabel(ease_text))
        self.setLayout(vbox)

# Build the main window, show it, then run the application created above.
mainWin = MainWin()
mainWin.show()
app.exec()
4 changes: 4 additions & 0 deletions src (outdated)/play.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
from stdata import *
import stgraphs

# Draw the skills-by-ease graph using the method's default arguments.
stgraphs.SkillGraph().graph_skills_by_ease()
129 changes: 129 additions & 0 deletions src (outdated)/stdata.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
from pymongo import MongoClient
from pymongo.server_api import ServerApi
import os
from collections import Counter, OrderedDict
import pandas as pd

################ CREATE GET SCV


# Useful function to make sense of the raw data
def count_and_order(list_to_order) -> OrderedDict:
    """Count how often each element occurs, most frequent first.

    Returns an OrderedDict mapping element -> number of occurrences, in the
    order produced by ``Counter.most_common()``.
    """
    tally = Counter(list_to_order)
    ranked_pairs = tally.most_common()
    return OrderedDict(ranked_pairs)

# Base class for all the different types of data
class DataObject():
    """Shared MongoDB client and collection handles for every data class.

    Everything below is a class attribute, so the client is created once,
    when this class body is executed, and is shared by all subclasses and
    instances.
    """

    db_user = os.getenv("STDB_USER")
    # The README documents STDB_PASSWORD, while this module originally read
    # STDB_PASS. Read the documented name first and keep the old one as a
    # fallback so existing setups keep working.
    db_password = os.getenv("STDB_PASSWORD") or os.getenv("STDB_PASS")
    # NOTE(review): the credentials are interpolated into the URI verbatim;
    # values containing reserved characters presumably need to be
    # percent-escaped already - confirm against the PyMongo URI rules.
    client = MongoClient(f'mongodb+srv://{db_user}:{db_password}@adonis.n0u0i.mongodb.net/Database?retryWrites=true&w=majority', server_api=ServerApi('1'))
    db = client.Database
    users = db.Users
    challenges = db.Challenges
    items = db.Items
    skills = db.Skills
    tasks = db.Tasks

    # Run after each call to close the connection with the Database
    def close(self) -> None:
        """Close the shared client (it is a class attribute, so this affects every DataObject)."""
        DataObject.client.close()

# Includes methods common to skills and challenges
class ActionData(DataObject):
    """Queries shared by skills and challenges.

    Subclasses set ``data_type`` (the collection holding the actions) and
    ``completed`` / ``progress`` (the names of the user-document fields that
    are iterated as lists of action ids).
    """

    def __init__(self):
        self.data_type = None
        self.completed = None
        self.progress = None
        self.find_description = None  # not read by any method of this class

    def id_to_goals(self, dictionary) -> list:
        """Pair the ``goals`` field of each action id in *dictionary* with its value.

        Returns a list of ``(goals, value)`` tuples in the iteration order of
        *dictionary*. One ``find_one`` query is issued per key.
        """
        goals = [self.data_type.find_one({"_id": action_id})["goals"] for action_id in dictionary]
        return list(zip(goals, dictionary.values()))

    def order_by_popularity(self, user_parameter={}) -> OrderedDict:
        """Count how many matching users completed each action, most popular first.

        *user_parameter* is passed as the filter to ``users.find``.
        """
        # First collect the list of completed actions of every user, then flatten it.
        per_user = [user[self.completed] for user in DataObject.users.find(user_parameter)]
        flattened = [action_id for completed in per_user for action_id in completed]
        return count_and_order(flattened)

    def get_completion_rate(self, user_parameter={}, action_parameter={}) -> dict:
        """Compute started / in-progress / completed counts and a score per action.

        *user_parameter* filters the users, *action_parameter* filters the
        actions of ``data_type``. The result maps action id to a dict with the
        keys ``'Started'`` (in progress + completed), ``'Progress'``,
        ``'Completed'`` and ``'Score'`` (completed / started), sorted by
        ascending score.

        Only actions that at least one matching user currently has in progress
        appear in the result, because the result is built from the in-progress
        counts.
        """
        users = self.users.find(user_parameter)
        # A set gives constant-time membership tests inside the per-user loop
        # (assumes the ids are hashable, which the Counters below need anyway).
        wanted_ids = {item["_id"] for item in self.data_type.find(action_parameter)}

        completed_counted = Counter()
        progress_counted = Counter()
        for user in users:
            completed_counted.update(a for a in user[self.completed] if a in wanted_ids)
            progress_counted.update(a for a in user[self.progress] if a in wanted_ids)

        data_unordered = {}
        for key, in_progress in progress_counted.items():
            done = completed_counted[key]  # a Counter yields 0 for a missing key
            started = in_progress + done   # >= 1, so the division below is safe
            data_unordered[key] = {
                'Started': started,
                'Progress': in_progress,
                'Completed': done,
                'Score': done / started,
            }

        return dict(sorted(data_unordered.items(), key=lambda entry: entry[1]['Score']))

    def get_ease(self, action_parameter={}) -> dict:
        """Map the first goal of each action to its completion score.

        Uses ``get_completion_rate`` with the default user filter. If two
        actions share the same first goal, the later one overwrites the
        earlier entry.
        """
        data = self.get_completion_rate(action_parameter=action_parameter)
        return {
            self.data_type.find_one({"_id": action_id})["goals"][0]: stats['Score']
            for action_id, stats in data.items()
        }


class UserData(DataObject):
    """Aggregate statistics over the Users collection.

    Every method takes *parameter*, which is passed as the query filter.
    """

    def count_users(self, parameter={}) -> int:
        """Return the number of users matching *parameter*."""
        # Let the server count instead of downloading every matching document.
        # count_documents needs a real filter document, so map None to {}.
        query = {} if parameter is None else parameter
        return DataObject.users.count_documents(query)

    def timezone_counter(self, parameter={}) -> OrderedDict:
        """Count users per ``timezone`` value (as a string), most common first."""
        return count_and_order([str(user["timezone"]) for user in DataObject.users.find(parameter)])

    def number_skills_completed_dict(self, parameter={}) -> OrderedDict:
        """Count users per number of completed skills, most common first."""
        return count_and_order([len(user["skillscompleted"]) for user in self.users.find(parameter)])

    def number_skills_completed_data(self, parameter={}) -> pd.Series:
        """Return ``Series.describe()`` of the number of skills completed per user."""
        return pd.Series([len(user["skillscompleted"]) for user in self.users.find(parameter)]).describe()

    def days_tracked_data(self, parameter={}) -> pd.Series:
        """Return ``Series.describe()`` of each user's ``numDaysTracked`` value."""
        return pd.Series([user["numDaysTracked"] for user in self.users.find(parameter)]).describe()


class SkillData(ActionData):
    """ActionData bound to the Skills collection and the users' skill fields."""

    def __init__(self):
        super().__init__()
        self.data_type = DataObject.skills
        self.completed = "skillscompleted"
        self.progress = "skillsinprogress"

    def id_to_title_and_level(self, dictionary) -> dict:
        """Re-key *dictionary* from skill id to a ``(title, level)`` tuple.

        If two skills share the same title and level, the later value
        overwrites the earlier one.
        """
        renamed = {}
        for skill_id, value in dictionary.items():
            # Fetch each skill once and read both fields from the same document.
            skill = self.data_type.find_one({"_id": skill_id})
            renamed[(skill["title"], skill["level"])] = value
        return renamed

    def get_skills_csv(self) -> None:
        """Write the completion statistics of every skill to ``skills.csv``.

        Each row holds: the ``(title, level)`` tuple, the goals, and the
        Started, Progress, Completed and Score values from
        ``get_completion_rate``. Fields are separated by ``;``.
        """
        import csv
        data = self.get_completion_rate()

        # Build each row from a single lookup per skill, so every column of a
        # row always belongs to the same skill, even when two skills share a
        # (title, level) pair.
        rows = []
        for skill_id, stats in data.items():
            skill = self.data_type.find_one({"_id": skill_id})
            rows.append([
                (skill["title"], skill["level"]),
                skill["goals"],
                stats["Started"],
                stats["Progress"],
                stats["Completed"],
                stats["Score"],
            ])

        # newline='' lets the csv module control line endings itself, as its
        # documentation requires; otherwise extra blank rows can appear.
        with open('skills.csv', 'w', encoding='UTF8', newline='') as f:
            csv.writer(f, delimiter=';').writerows(rows)

class ChallengeData(ActionData):
    """ActionData bound to the Challenges collection and the users' challenge fields."""

    def __init__(self):
        super().__init__()
        # Names of the user-document fields that hold the challenge ids.
        self.completed, self.progress = "challengescompleted", "challengesinprogress"
        self.data_type = DataObject.challenges

# Export the CSV only when this file is run as a script, so that a plain
# `import stdata` does not query the database and write skills.csv.
if __name__ == "__main__":
    SkillData().get_skills_csv()
14 changes: 10 additions & 4 deletions src/stgraphs.py → src (outdated)/stgraphs.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,9 +27,16 @@ def pie_timezones(self, user_parameter={}, tight_layout=True) -> None:
def bar_timezones(self, user_parameter={}, tight_layout=True) -> None:
data = UserData().timezone_counter(parameter=user_parameter)
x = data.keys()
y= data.values()
y = data.values()
plt.bar(x, y)
self.set_plot("Users per timezone", tight_layout)

def graph_number_skills_completed(self, user_parameter={}, tight_layout=True) -> None:
    """Draw a bar chart from ``UserData().number_skills_completed_dict``.

    The keys of the returned mapping are used as x positions and the values
    as bar heights. *user_parameter* is forwarded as the user filter and
    *tight_layout* is forwarded to ``set_plot``.
    """
    counts = UserData().number_skills_completed_dict(parameter=user_parameter)
    plt.bar(counts.keys(), counts.values())
    self.set_plot("Number of skills completed", tight_layout)

class SkillGraph(GraphObject):
def graph_skills_by_popularity(self, user_parameter={}, amount=10, graph_all=False, tight_layout=True) -> None:
Expand All @@ -49,9 +56,9 @@ def graph_skills_by_popularity(self, user_parameter={}, amount=10, graph_all=Fal

self.set_plot("Skill Popularity", tight_layout)

def graph_skills_by_ease(self, skill_parameter={}, tight_layout=False) -> None:
def graph_skills_by_ease(self, skill_parameter={}, tight_layout=False, amount=10) -> None:
data = SkillData().list_skills_by_ease(skill_parameter=skill_parameter)
plt.bar(data.keys(), data.values())
plt.barh( list(data.keys())[:amount], list(data.values())[:amount])
plt.xlabel("Completion_rate")
self.set_plot("Skills by ease", tight_layout=tight_layout)

Expand All @@ -72,4 +79,3 @@ def graph_challenges_by_popularity(self, user_parameter={}, amount=10, graph_all
plt.text(v + 1, i, str(v), color='blue', fontweight='bold')

self.set_plot("Challenge Popularity", tight_layout)

Loading