Skip to content

Commit 858fc5e

Browse files
committed
Centralize data retention policy
Also prune page data.
1 parent 3caaee1 commit 858fc5e

6 files changed

Lines changed: 79 additions & 4 deletions

File tree

SETUP/dp.cron.template

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -10,6 +10,7 @@
10 10
01 0 * * * JOB=RecordProjectStateCounts; <<PHP_CLI_EXECUTABLE>> <<CODE_DIR>>/crontab/run_background_job.php $JOB
11 11
# See SETUP/ARCHIVING.md to set up archiving before enabling the line below
12 12
#10 0 * * * JOB=ArchiveProjects; <<PHP_CLI_EXECUTABLE>> <<CODE_DIR>>/crontab/run_background_job.php $JOB
13+
30 0 * * * JOB=PrunePageData; <<PHP_CLI_EXECUTABLE>> <<CODE_DIR>>/crontab/run_background_job.php $JOB
13 14
45 0 * * * JOB=CleanDownloadTemp; <<PHP_CLI_EXECUTABLE>> <<CODE_DIR>>/crontab/run_background_job.php $JOB
14 15
50 0 * * * JOB=CleanUploadsTrash; <<PHP_CLI_EXECUTABLE>> <<CODE_DIR>>/crontab/run_background_job.php $JOB
15 16
55 0 * * * JOB=ExtendSiteTallyGoals; <<PHP_CLI_EXECUTABLE>> <<CODE_DIR>>/crontab/run_background_job.php $JOB

crontab/ArchiveProjects.inc

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -4,7 +4,6 @@ include_once($relPath.'archiving.inc');
4 4
// Find projects that were posted to PG a while ago and archive them.
5 5
class ArchiveProjects extends BackgroundJob
6 6
{
7-
private int $days_to_retain = 100;
8 7
public bool $requires_web_context = true;
9 8

10 9
public function work()
@@ -14,11 +13,12 @@ class ArchiveProjects extends BackgroundJob
14 13
SELECT *
15 14
FROM projects
16 15
WHERE
17-
modifieddate <= UNIX_TIMESTAMP() - (24 * 60 * 60) * {$this->days_to_retain}
16+
modifieddate <= UNIX_TIMESTAMP() - (24 * 60 * 60) * %d
18 17
AND archived = '0'
19 18
AND state = '%s'
20 19
ORDER BY modifieddate
21 20
",
21+
SiteConfig::get()->days_to_retain_projects_before_archiving,
22 22
DPDatabase::escape(PROJ_SUBMIT_PG_POSTED)
23 23
);
24 24
$result = DPDatabase::query($sql);

crontab/PruneJobLogs.inc

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -6,6 +6,6 @@ class PruneJobLogs extends BackgroundJob
6 6
{
7 7
public function work()
8 8
{
9-
prune_job_log_entries(365 + 30); // ~13 months
9+
prune_job_log_entries(SiteConfig::get()->days_to_retain_job_logs);
10 10
}
11 11
}

crontab/PruneNonActivatedUsers.inc

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -5,6 +5,6 @@ class PruneNonActivatedUsers extends BackgroundJob
5 5
{
6 6
public function work()
7 7
{
8-
NonactivatedUser::prune(90); // ~3 months
8+
NonactivatedUser::prune(SiteConfig::get()->days_to_retain_pending_accounts);
9 9
}
10 10
}

crontab/PrunePageData.inc

Lines changed: 68 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,68 @@
1+
<?php
2+
3+
// Find projects that were posted to PG a while ago and prune page data.
4+
class PrunePageData extends BackgroundJob
5+
{
6+
public function work()
7+
{
8+
$database = SiteConfig::get()->archive_db_name ? SiteConfig::get()->archive_db_name : SiteConfig::get()->db_name;
9+
10+
// We need to iteratively delete data in batches by projectid because:
11+
// 1. we base the pruning on the project's posted time
12+
// 2. there are indexes for these tables based on projectid
13+
// 3. to possibly limit our runtime
14+
$sql = sprintf(
15+
"
16+
SELECT projectid
17+
FROM projects
18+
WHERE
19+
modifieddate <= UNIX_TIMESTAMP() - (24 * 60 * 60) * %d
20+
AND state = '%s'
21+
ORDER BY modifieddate
22+
",
23+
SiteConfig::get()->days_to_retain_page_data_after_posting,
24+
DPDatabase::escape(PROJ_SUBMIT_PG_POSTED)
25+
);
26+
$result = DPDatabase::query($sql);
27+
$num_projects = mysqli_num_rows($result);
28+
29+
echo "Pruning data for $num_projects projects...\n";
30+
31+
$num_projects_pruned = 0;
32+
while ([$project_id] = mysqli_fetch_row($result)) {
33+
if ($this->watch->read() >= $this->web_context_max_runtime_s) {
34+
break;
35+
}
36+
37+
// first wordcheck_events
38+
$sql = sprintf(
39+
"
40+
DELETE FROM $database.wordcheck_events
41+
WHERE projectid = '%s'
42+
",
43+
DPDatabase::escape($project_id)
44+
);
45+
DPDatabase::query($sql);
46+
47+
// then page_events
48+
$sql = sprintf(
49+
"
50+
DELETE FROM $database.page_events
51+
WHERE projectid = '%s'
52+
",
53+
DPDatabase::escape($project_id)
54+
);
55+
DPDatabase::query($sql);
56+
57+
$num_projects_pruned += 1;
58+
}
59+
60+
$leftover_projects = $num_projects - $num_projects_pruned;
61+
if ($leftover_projects) {
62+
echo "Reached runtime limit, skipping pruning of page data for remaining $leftover_projects projects.\n";
63+
$this->stop_message = "Pruned page data for $num_projects_pruned projects, ran out of time to prune page data for remaining $leftover_projects";
64+
} else {
65+
$this->stop_message = "Pruned page data for $num_projects_pruned projects";
66+
}
67+
}
68+
}

pinc/SiteConfig.inc

Lines changed: 6 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -21,6 +21,12 @@ class SiteConfig
21 21
public ?string $archive_db_name;
22 22
public ?string $archive_projects_dir;
23 23

24+
// data retention configuration
25+
public int $days_to_retain_projects_before_archiving = 100;
26+
public int $days_to_retain_page_data_after_posting = 365;
27+
public int $days_to_retain_pending_accounts = 90;
28+
public int $days_to_retain_job_logs = 365 + 30; // ~13 months
29+
24 30
// directories & URLs
25 31
public string $code_dir;
26 32
public string $code_url;

0 commit comments

Comments
 (0)