diff --git a/worker-stats/README.md b/worker-stats/README.md new file mode 100644 index 00000000..2226b3bd --- /dev/null +++ b/worker-stats/README.md @@ -0,0 +1,60 @@ +# worker-user-stats + +A small worker script to generate OSM user statistics from the HOTOSM Tasking Manager database. + +### Overview + +This script generates timestamp data for users who have used the HOTOSM Tasking Manager. Specifically, it collects edit timestamps in the categories of done, validated, and invalidated, arranged by user and by project. For example: + +``` +"2156(userid)": { + "167(projectid)": { + "done": { + "times": [ + "Sat Feb 13 2016 04:58:03 GMT", + "Sat Feb 20 2016 04:02:20 GMT", + "Sat Jan 30 2016 17:50:38 GMT" + ] + }, + "invalidated": { + "times": [ + "Sun Apr 10 2016 08:24:42 GMT", + "Thu Mar 03 2016 06:17:58 GMT" + ] + }, + "validated": { + "times": [ + "Sun Jan 31 2016 07:16:41 GMT", + "Thu Feb 11 2016 08:09:27 GMT", + "Mon May 09 2016 02:25:27 GMT", + "Thu Apr 14 2016 06:37:59 GMT", + "Wed Mar 09 2016 19:44:51 GMT", + "Wed May 25 2016 04:40:04 GMT" + ] + } + } +} +``` + +The script generates JSON and uploads it to an Amazon S3 bucket to be served as a static JSON endpoint. + +## Usage + +To be run as a worker alongside the Tasking Manager every 10 minutes. + +``` +$ pip install -r requirements.txt +$ python user-stats.py +``` + +## Requirements + +- Read access to the Tasking Manager database +- Amazon S3 bucket with public read access +- Amazon S3 write credentials. See `user-stats.py` for environment variables. 
#!/usr/bin/python

# This script creates statistics for OSM users of the OSM Tasking Manager database.
# Edit timestamps for each category of action arranged by project and user.
# Output is in JSON format and is uploaded directly to an Amazon S3 bucket.
#
# Environment variables:
#
#   S3_ACCESS_KEY=your_access_key
#   S3_SECRET_KEY=your_secret_key
#   BUCKET=your_bucket
#
# Dependencies (pinned in worker-stats/requirements.txt):
#
#   futures==3.0.5
#   psycopg2==2.6.1
#   requests==2.9.1
#   simplejson==3.8.2
#   tinys3==0.1.11
#   wsgiref==0.1.2
#
# Usage
#
#   python user-stats.py

from datetime import datetime
import os
import time

# NOTE: psycopg2, tinys3 and simplejson are imported inside the functions
# that use them, so the pure aggregation logic (buildUserStats) can be
# imported and tested without the database / S3 client libraries installed.

# Maps the numeric task_state.state values selected by getTaskstate() to the
# category names used as keys in the generated JSON.
STATE_NAMES = {1: 'invalidated', 2: 'done', 3: 'validated'}


def connectDB():
    """Open a connection to the local 'osmtm' database and return a cursor.

    The cursor uses psycopg2's NamedTupleCursor factory, so fetched rows
    expose their columns as attributes (row.user_id, row.date, ...).
    Only the cursor is returned; callers that want to release the underlying
    connection can reach it through the cursor's ``connection`` attribute.
    """
    import psycopg2
    import psycopg2.extras

    # login info
    host = 'localhost'
    database = 'osmtm'
    user = 'postgres'
    # define connection string
    conn_string = "host='%s' dbname='%s' user='%s'" % (host, database, user)
    # get a connection
    conn = psycopg2.connect(conn_string)
    # initialize a cursor
    return conn.cursor(cursor_factory=psycopg2.extras.NamedTupleCursor)


def buildUserStats(records):
    """Group task-state records by user, then project, then edit category.

    Args:
        records: iterable of rows exposing ``user_id``, ``project_id``,
            ``state`` (a key of STATE_NAMES) and ``date`` attributes.

    Returns:
        A dict shaped like::

            {user_id: {project_id: {'done': {'times': [...]},
                                    'validated': {'times': [...]},
                                    'invalidated': {'times': [...]}}}}

        where ``user_id`` and ``project_id`` are strings and every entry of
        ``times`` is ``str(date)`` truncated at the first '.' (which drops
        fractional seconds from a timestamp's string form).

    Raises:
        KeyError: if a record's ``state`` is not a key of STATE_NAMES.
    """
    users = {}
    for r in records:
        # Create the per-user / per-project containers only the first time
        # they are seen; re-creating them for every record would throw away
        # everything collected so far for that user.
        projects = users.setdefault(str(r.user_id), {})
        proj_id = str(r.project_id)
        categories = projects.get(proj_id)
        if categories is None:
            categories = {name: {'times': []} for name in STATE_NAMES.values()}
            projects[proj_id] = categories
        categories[STATE_NAMES[r.state]]['times'].append(
            str(r.date).split(".")[0])
    return users


# returns dictionary of users, their projects, the categories of edits
# made by them, and the times they made those edits
def getTaskstate():
    """Query task_state and return the per-user statistics dictionary.

    Selects every task_state row that has a user and whose state is 1, 2 or
    3, then aggregates the rows with buildUserStats(). The cursor and its
    connection are closed once the rows have been fetched.
    """
    # connect to database
    cur = connectDB()
    try:
        # select all relevant task_state columns
        cur.execute('SELECT user_id, project_id, state, date '
                    'FROM task_state '
                    'WHERE user_id IS NOT NULL '
                    'AND state IN (1, 2, 3)')
        records = cur.fetchall()
    finally:
        # connectDB() only hands back the cursor, so release the connection
        # through it; the rows are already in memory at this point.
        conn = cur.connection
        cur.close()
        conn.close()
    return buildUserStats(records)


def upload(file):
    """Upload '<file>.json' from the working directory to the S3 bucket.

    Credentials and bucket name are read from the S3_ACCESS_KEY,
    S3_SECRET_KEY and BUCKET environment variables. The object is stored
    under the key '<file>.json' with content type 'application/json'.
    """
    import tinys3

    conn = tinys3.Connection(os.getenv('S3_ACCESS_KEY'),
                             os.getenv('S3_SECRET_KEY'), tls=True)
    filename = '%s.json' % file
    # the context manager guarantees the handle is closed even if the
    # upload raises
    with open(filename, 'rb') as f:
        conn.upload(filename, f, bucket=os.getenv('BUCKET'),
                    content_type='application/json')


def main():
    """Generate users.json from the database and upload it to S3."""
    import simplejson as json

    users = getTaskstate()

    # dump users dict into minified json
    fout = json.dumps(users, separators=(',', ':'))
    # The file is opened in binary mode, so write bytes explicitly; this
    # works on both Python 2 and Python 3.
    with open('users.json', 'wb') as f:
        f.write(fout.encode('utf-8'))
    # trigger upload to s3
    upload('users')


if __name__ == '__main__':
    main()