|
# frozen_string_literal: true
# rubocop:disable Layout/LineLength

# Resolve the Rails environment relative to this file so the script does not
# depend on the directory it is launched from.
require_relative '../config/environment'
require 'json'

# Reports projects whose daily classification counts changed sharply over the
# last week, then reports the users who classified heavily on those projects'
# high-count days. Output is printed to stdout.

# A project is flagged as potentially receiving spurious classifications when,
# for some pair of days, its classification_rate (absolute difference in
# classification counts / days apart) is at least 5000 classifications per day
# AND the percentage difference between the two counts is at least 50%.
PROJECT_SPURIOUS_CLASSIFICATION_RATE_LOWER_BOUND = 5_000
PERCENTAGE_DIFF_THRESHOLD = 50

# A user is flagged when, on a flagged project/day, classification_count /
# total_session_time is at least this bound AND classification_count is at
# least the count threshold.
# NOTE(review): the unit of total_session_time is not visible here - confirm
# what "3" means (per second? per minute?).
USER_CLASSIFICATION_RATE_LOWER_BOUND = 3
USER_CLASSIFICATION_COUNT_THRESHOLD = 1_000

puts 'Querying diffs to flag potential affected projects...'

# Self-join of the per-project daily counts: record1 is a day within the last
# 7 days, record2 is a later day within the last 2 days (today excluded), and
# both days must have more than 1000 classifications.
# record2.project_id is not selected: it equals record1.project_id by the join
# condition and would only duplicate the column name in the result.
projects_weekly_classifications_history = ActiveRecord::Base.connection.exec_query(<<~SQL)
  SELECT
    record1.day AS day1,
    record2.day AS day_compare,
    record1.project_id,
    record1.classification_count AS day1_count,
    record2.classification_count AS day_compare_count,
    abs(cast(record2.classification_count - record1.classification_count as float) / record1.classification_count) * 100 AS percentage_diff,
    abs(cast(record2.classification_count - record1.classification_count as float) / extract(day from record2.day - record1.day)) AS classification_rate
  FROM
    daily_classification_count_per_project AS record1
  INNER JOIN
    daily_classification_count_per_project AS record2 ON record1.project_id = record2.project_id
  WHERE
    record1.classification_count IS NOT NULL
    AND record2.classification_count IS NOT NULL
    AND record1.day < record2.day
    AND record1.day >= (CURRENT_DATE - INTERVAL '7 days')
    AND record2.day >= CURRENT_DATE - INTERVAL '2 days'
    AND record2.day < CURRENT_DATE
    AND record1.classification_count > 1000
    AND record2.classification_count > 1000
  ORDER BY classification_rate DESC;
SQL

# project_id => list of 'YYYY-MM-DD' strings for the higher-count day of every
# day pair that crossed both project thresholds.
flagged_project_id_to_high_classifying_dates = Hash.new { |h, k| h[k] = [] }
projects_weekly_classifications_history.each do |proj_history|
  next unless proj_history['classification_rate'] >= PROJECT_SPURIOUS_CLASSIFICATION_RATE_LOWER_BOUND && proj_history['percentage_diff'] >= PERCENTAGE_DIFF_THRESHOLD

  day1_count = proj_history['day1_count']
  day_compare_count = proj_history['day_compare_count']
  # Equal counts have no "higher" day; skip before touching the hash so that no
  # empty entry is created for the project.
  next if day1_count == day_compare_count

  high_day = day1_count > day_compare_count ? proj_history['day1'] : proj_history['day_compare']
  high_date = high_day.strftime('%Y-%m-%d')

  # The same day can be the high side of several pairs; record it once.
  dates = flagged_project_id_to_high_classifying_dates[proj_history['project_id']]
  dates << high_date unless dates.include?(high_date)
end

puts 'Potential Affected Project IDs...'
puts flagged_project_id_to_high_classifying_dates.keys

puts 'Finding Potential Spurious Classifiers for each Project...'

# NULLIF turns a zero total_session_time into NULL so the division yields a
# NULL rate instead of raising "division by zero"; NULLS LAST keeps those rows
# from sorting ahead of real rates.
user_rates_sql = <<~SQL.tr("\n", ' ').strip
  SELECT *, cast(classification_count as float) / NULLIF(total_session_time, 0) AS rate
  FROM daily_user_classification_count_and_time_per_project
  WHERE project_id = $1 AND day = ANY($2)
  ORDER BY rate DESC NULLS LAST
SQL

users_to_flag = []

flagged_project_id_to_high_classifying_dates.each do |proj_id, dates|
  # The dates are passed as a PostgreSQL array literal, e.g. "{2024-01-01,2024-01-02}".
  user_rates_for_proj = ActiveRecord::Base.connection.exec_query(user_rates_sql, 'SQL', [proj_id, "{#{dates.join(',')}}"])

  user_rates_for_proj.each do |user_rate|
    rate = user_rate['rate']
    # A NULL rate (zero or missing total_session_time) cannot be compared to the
    # bound, so the row is skipped.
    # NOTE(review): decide whether such rows should be flagged instead.
    next if rate.nil?

    users_to_flag << user_rate['user_id'] if rate >= USER_CLASSIFICATION_RATE_LOWER_BOUND && user_rate['classification_count'] >= USER_CLASSIFICATION_COUNT_THRESHOLD
  end
end

puts 'Flagged Users...'
# A user can match on several days or projects; list each user once.
puts users_to_flag.uniq

# rubocop:enable Layout/LineLength
0 commit comments