Skip to content

Commit 8d6c79f

Browse files
committed
Add multi stage cleanup for orphaned packages
1 parent 5c92198 commit 8d6c79f

File tree

7 files changed

+465
-52
lines changed

7 files changed

+465
-52
lines changed

cavil.conf

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,12 @@
5656
# Number of packages per cleanup process
5757
cleanup_bucket_average => 400,
5858

59+
# Number of days to keep reviewed packages when not part of a product before cleanup
60+
days_to_keep_orphaned_packages => 30,
61+
62+
# Number of days to keep duplicate reviewed packages when not part of a product before cleanup
63+
days_to_keep_orphaned_duplicate_packages => 7,
64+
5965
# Maximum number of files to show in reports per risk level
6066
min_files_short_report => 20,
6167

lib/Cavil/Model/Packages.pm

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -397,6 +397,11 @@ sub mark_matched_for_reindex ($self, $pid, $priority = 0) {
397397
$self->minion->enqueue(reindex_matched_later => [$pid] => {priority => $priority});
398398
}
399399

400+
sub need_cleanup ($self) {
401+
return $self->pg->db->query('SELECT id FROM bot_packages WHERE obsolete IS TRUE AND cleaned IS NULL ORDER BY ID')
402+
->arrays->flatten->to_array;
403+
}
404+
400405
sub name_suggestions ($self, $partial) {
401406
my $like = '%' . $partial . '%';
402407
return $self->pg->db->select(
@@ -426,6 +431,26 @@ sub obs_import ($self, $id, $data, $priority = 5) {
426431
);
427432
}
428433

434+
sub obsolete_duplicate_new ($self) {
435+
my $db = $self->pg->db;
436+
437+
# Mark all duplicate new packages as obsolete (same external_link and name)
438+
$db->query(
439+
q{
440+
UPDATE bot_packages
441+
SET obsolete = true, state = 'obsolete'
442+
WHERE id IN (
443+
SELECT a.id FROM (
444+
SELECT id, ROW_NUMBER() OVER (PARTITION BY external_link, name ORDER BY id DESC) row_no
445+
FROM bot_packages
446+
WHERE state = 'new' AND external_link IS NOT NULL
447+
) AS a
448+
WHERE row_no > 1
449+
);
450+
}
451+
);
452+
}
453+
429454
sub obsolete_if_not_in_product ($self, $id) {
430455
my $db = $self->pg->db;
431456
return undef if $db->query('select 1 from bot_package_products where package = ? limit 1', $id)->array;
@@ -436,6 +461,35 @@ sub obsolete_if_not_in_product ($self, $id) {
436461
return 1;
437462
}
438463

464+
sub obsolete_old_packages ($self, $days_to_keep_orphaned, $days_to_keep_orphaned_duplicates) {
465+
my $db = $self->pg->db;
466+
467+
# Mark duplicate old packages not in products as obsolete
468+
$db->query(
469+
"UPDATE bot_packages SET obsolete = true WHERE id IN (
470+
SELECT id FROM (
471+
SELECT id, ROW_NUMBER() OVER (PARTITION BY name ORDER BY id DESC) row_no
472+
FROM bot_packages LEFT JOIN bot_package_products ON bot_package_products.package = bot_packages.id
473+
WHERE state != 'new' AND checksum IS NOT NULL
474+
AND imported < NOW() - (INTERVAL '1 days' * ?)
475+
AND bot_package_products.product IS NULL
476+
) AS a
477+
WHERE row_no > 1
478+
)", $days_to_keep_orphaned_duplicates
479+
);
480+
481+
# Mark all old packages not in products as obsolete
482+
$db->query(
483+
"UPDATE bot_packages SET obsolete = true WHERE id IN (
484+
SELECT id
485+
FROM bot_packages LEFT JOIN bot_package_products ON bot_package_products.package = bot_packages.id
486+
WHERE state != 'new' AND obsolete != true AND checksum IS NOT NULL
487+
AND imported < NOW() - (INTERVAL '1 days' * ?)
488+
AND bot_package_products.product IS NULL
489+
)", $days_to_keep_orphaned
490+
);
491+
}
492+
439493
sub reindex ($self, $id, @args) {
440494

441495
# Protect from race conditions (even before creating a background job)

lib/Cavil/Task/Cleanup.pm

Lines changed: 8 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -26,27 +26,11 @@ sub register ($self, $app, $config) {
2626
}
2727

2828
sub _cleanup ($job) {
29-
my $app = $job->app;
30-
my $db = $app->pg->db;
29+
my $app = $job->app;
30+
my $pkgs = $app->packages;
3131

32-
# Mark all duplicate new packages as obsolete (same external_link and name)
33-
$db->query(
34-
q{
35-
UPDATE bot_packages
36-
SET obsolete = true, state = 'obsolete'
37-
WHERE id IN (
38-
SELECT a.id FROM (
39-
SELECT id, ROW_NUMBER() OVER (PARTITION BY external_link, name ORDER BY id DESC) row_no
40-
FROM bot_packages
41-
WHERE state = 'new' AND external_link IS NOT NULL
42-
) AS a
43-
WHERE row_no > 1
44-
);
45-
}
46-
);
47-
48-
my $ids = $db->query('SELECT id FROM bot_packages WHERE obsolete IS TRUE AND cleaned IS NULL ORDER BY ID')
49-
->arrays->flatten->to_array;
32+
$pkgs->obsolete_duplicate_new;
33+
my $ids = $pkgs->need_cleanup;
5034
my $buckets = Cavil::Util::buckets($ids, $app->config->{cleanup_bucket_average});
5135

5236
my $minion = $app->minion;
@@ -59,22 +43,10 @@ sub _cleanup_batch ($job, @ids) {
5943
}
6044

6145
sub _obsolete ($job) {
62-
my $app = $job->app;
63-
my $log = $app->log;
64-
my $db = $job->app->pg->db;
65-
66-
my $leave_untagged_imports = 7;
67-
68-
my $list = $db->query(
69-
"update bot_packages set obsolete = true where id in
70-
(select id from bot_packages
71-
left join bot_package_products on bot_package_products.package=bot_packages.id
72-
where state != 'new' and checksum is not null and
73-
imported < now() - Interval '$leave_untagged_imports days' and
74-
bot_package_products.product is null
75-
)"
76-
);
77-
46+
my $app = $job->app;
47+
my $config = $app->config;
48+
$app->packages->obsolete_old_packages($config->{days_to_keep_orphaned_packages},
49+
$config->{days_to_keep_orphaned_duplicate_packages});
7850
$app->minion->enqueue('cleanup' => [] => {parents => [$job->id]});
7951
}
8052

staging/start.pl

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,8 @@
4242
acceptable_risk => 3,
4343
index_bucket_average => 100,
4444
cleanup_bucket_average => 50,
45+
days_to_keep_orphaned_packages => 30,
46+
days_to_keep_orphaned_duplicate_packages => 7,
4547
min_files_short_report => 20,
4648
max_email_url_size => 2048,
4749
max_task_memory => 5_000_000_000,
Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ use Mojo::URL;
3030

3131
plan skip_all => 'set TEST_ONLINE to enable this test' unless $ENV{TEST_ONLINE};
3232

33-
my $cavil_test = Cavil::Test->new(online => $ENV{TEST_ONLINE}, schema => 'cleanup_duplicates_test');
33+
my $cavil_test = Cavil::Test->new(online => $ENV{TEST_ONLINE}, schema => 'cleanup_new_duplicates_test');
3434
my $config = $cavil_test->default_config;
3535
my $t = Test::Mojo->new(Cavil => $config);
3636
$cavil_test->no_fixtures($t->app);
@@ -190,7 +190,7 @@ subtest 'Clean up duplicates' => sub {
190190
ok !$t->app->pg->db->select('file_snippets', [\'count(*)'], {package => $one_id})->array->[0], 'no file snippets';
191191

192192
# Second package (cleaned up)
193-
is $t->app->packages->find($one_id)->{state}, 'obsolete', 'right state';
193+
is $t->app->packages->find($two_id)->{state}, 'obsolete', 'right state';
194194
ok $t->app->packages->find($two_id)->{obsolete}, 'obsolete';
195195
ok $t->app->packages->find($two_id)->{cleaned}, 'cleanup done';
196196
is $t->app->packages->find($two_id)->{result}, undef, 'right result';

0 commit comments

Comments
 (0)