This repository was archived by the owner on Sep 30, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 1.4k
Expand file tree
/
Copy pathcleanup.go
More file actions
1254 lines (1100 loc) · 43.6 KB
/
cleanup.go
File metadata and controls
1254 lines (1100 loc) · 43.6 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
package internal
import (
"bytes"
"context"
_ "embed"
"fmt"
"hash/fnv"
"io"
"io/fs"
"os"
"os/exec"
"path/filepath"
"sort"
"strconv"
"strings"
"time"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promauto"
"github.com/sourcegraph/log"
"github.com/sourcegraph/sourcegraph/cmd/frontend/envvar"
"github.com/sourcegraph/sourcegraph/cmd/gitserver/internal/common"
"github.com/sourcegraph/sourcegraph/cmd/gitserver/internal/executil"
"github.com/sourcegraph/sourcegraph/cmd/gitserver/internal/git"
"github.com/sourcegraph/sourcegraph/cmd/gitserver/internal/gitserverfs"
"github.com/sourcegraph/sourcegraph/internal/actor"
"github.com/sourcegraph/sourcegraph/internal/api"
"github.com/sourcegraph/sourcegraph/internal/conf"
"github.com/sourcegraph/sourcegraph/internal/database"
du "github.com/sourcegraph/sourcegraph/internal/diskusage"
"github.com/sourcegraph/sourcegraph/internal/env"
"github.com/sourcegraph/sourcegraph/internal/errcode"
"github.com/sourcegraph/sourcegraph/internal/gitserver"
"github.com/sourcegraph/sourcegraph/internal/goroutine"
"github.com/sourcegraph/sourcegraph/internal/hostname"
"github.com/sourcegraph/sourcegraph/internal/lazyregexp"
"github.com/sourcegraph/sourcegraph/internal/wrexec"
"github.com/sourcegraph/sourcegraph/lib/errors"
)
// JanitorConfig holds the settings for the gitserver janitor background job.
type JanitorConfig struct {
	// JanitorInterval is how often the janitor runs.
	JanitorInterval time.Duration
	// ShardID identifies the gitserver shard this janitor runs on.
	ShardID string
	// ReposDir is the root directory under which repositories are stored on disk.
	ReposDir string
	// DesiredPercentFree is the target percentage of disk space to keep free
	// (used by the dotcom disk-pressure cleaner).
	DesiredPercentFree int
	// DisableDeleteReposOnWrongShard, if true, prevents the janitor from
	// deleting repos that live on the wrong shard after a resharding event.
	DisableDeleteReposOnWrongShard bool
}
// NewJanitor returns a periodic background routine that performs cleanup and
// maintenance tasks (corruption checks, lock removal, garbage collection,
// re-cloning, disk-pressure handling) across all repositories on this shard.
func NewJanitor(ctx context.Context, cfg JanitorConfig, db database.DB, rcf *wrexec.RecordingCommandFactory, cloneRepo cloneRepoFunc, logger log.Logger) goroutine.BackgroundRoutine {
	return goroutine.NewPeriodicGoroutine(
		actor.WithInternalActor(ctx),
		goroutine.HandlerFunc(func(ctx context.Context) error {
			logger.Info("Starting janitor run")
			// On Sourcegraph.com, we clone repos lazily, meaning whatever github.com
			// repo is visited will be cloned eventually. So over time, we would always
			// accumulate terabytes of repos, of which many are probably not visited
			// often. Thus, we have this special cleanup worker for Sourcegraph.com that
			// will remove repos that have not been changed in a long time (thats the
			// best metric we have here today) once our disks are running full.
			// On customer instances, this worker is useless, because repos are always
			// managed by an external service connection and they will be recloned
			// ASAP.
			if envvar.SourcegraphDotComMode() {
				diskSizer := &StatDiskSizer{}
				logger := logger.Scoped("dotcom-repo-cleaner")
				start := time.Now()
				logger.Info("Starting dotcom repo cleaner")
				toFree, err := howManyBytesToFree(logger, cfg.ReposDir, diskSizer, cfg.DesiredPercentFree)
				if err != nil {
					logger.Error("ensuring free disk space", log.Error(err))
				} else if err = freeUpSpace(ctx, logger, db, cfg.ShardID, cfg.ReposDir, diskSizer, cfg.DesiredPercentFree, toFree); err != nil {
					// Note: plain assignment (=) above, not :=. A shadowed err
					// here would make the "failed" field below report success
					// even when freeUpSpace failed.
					logger.Error("error freeing up space", log.Error(err))
				}
				logger.Info("dotcom repo cleaner finished", log.Int64("toFree", toFree), log.Bool("failed", err != nil), log.String("duration", time.Since(start).String()))
			}
			gitserverAddrs := gitserver.NewGitserverAddresses(conf.Get())
			// TODO: Should this return an error?
			cleanupRepos(ctx, logger, db, rcf, cfg.ShardID, cfg.ReposDir, cloneRepo, gitserverAddrs, cfg.DisableDeleteReposOnWrongShard)
			return nil
		}),
		goroutine.WithName("gitserver.janitor"),
		goroutine.WithDescription("cleans up and maintains repositories regularly"),
		goroutine.WithInterval(cfg.JanitorInterval),
	)
}
// Metrics for tracking repos that live on the wrong gitserver shard.
var (
	// wrongShardReposTotal is set once per janitor run to the number of repos
	// found on disk that belong to a different shard.
	wrongShardReposTotal = promauto.NewGauge(prometheus.GaugeOpts{
		Name: "src_gitserver_repo_wrong_shard",
		Help: "The number of repos that are on disk on the wrong shard",
	})
	// wrongShardReposDeletedCounter counts wrong-shard repos actually deleted.
	wrongShardReposDeletedCounter = promauto.NewCounter(prometheus.CounterOpts{
		Name: "src_gitserver_repo_wrong_shard_deleted",
		Help: "The number of repos on the wrong shard that we deleted",
	})
)
// sgMaintenanceScript is the embedded shell script executed by the
// "sg maintenance" cleanup task (see sg_maintenance.sh).
//
//go:embed sg_maintenance.sh
var sgMaintenanceScript string
const (
	// day is a convenience unit for the TTLs below.
	day = 24 * time.Hour
	// repoTTL is how often we should re-clone a repository.
	repoTTL = 45 * day
	// repoTTLGC is how often we should re-clone a repository once it is
	// reporting git gc issues.
	repoTTLGC = 2 * day
	// gitConfigMaybeCorrupt is a key we add to git config to signal that a repo may be
	// corrupt on disk.
	gitConfigMaybeCorrupt = "sourcegraph.maybeCorruptRepo"
	// sgmLog is the name of the log file placed by sg maintenance in case it
	// encountered an error.
	sgmLog = "sgm.log"
)
// The three mutually exclusive garbage-collection strategies; see gitGCMode
// below for how one is selected.
const (
	// gitGCModeGitAutoGC is when we rely on git running auto gc.
	gitGCModeGitAutoGC int = 1
	// gitGCModeJanitorAutoGC is when during janitor jobs we run git gc --auto.
	gitGCModeJanitorAutoGC = 2
	// gitGCModeMaintenance is when during janitor jobs we run sg maintenance.
	gitGCModeMaintenance = 3
)
// gitGCMode describes which mode we should be running git gc.
// See for a detailed description of the modes: https://docs.sourcegraph.com/dev/background-information/git_gc
//
// Selection: exactly one of `git gc --auto` (janitor-driven) or sg maintenance
// may be active. If both or neither are enabled via env vars, we fall back to
// letting git run its own auto gc.
var gitGCMode = func() int {
	// EnableGCAuto is a temporary flag that allows us to control whether or not
	// `git gc --auto` is invoked during janitorial activities. This flag will
	// likely evolve into some form of site config value in the future.
	gcAuto, _ := strconv.ParseBool(env.Get("SRC_ENABLE_GC_AUTO", "true", "Use git-gc during janitorial cleanup phases"))
	// sg maintenance and git gc must not be enabled at the same time. However, both
	// might be disabled at the same time, hence we need both SRC_ENABLE_GC_AUTO and
	// SRC_ENABLE_SG_MAINTENANCE.
	sgMaint, _ := strconv.ParseBool(env.Get("SRC_ENABLE_SG_MAINTENANCE", "false", "Use sg maintenance during janitorial cleanup phases"))
	switch {
	case gcAuto && !sgMaint:
		return gitGCModeJanitorAutoGC
	case sgMaint && !gcAuto:
		return gitGCModeMaintenance
	default:
		return gitGCModeGitAutoGC
	}
}()
// The limit of 50 mirrors Git's gc_auto_pack_limit.
// Note: the parse error is discarded, so an unparsable value silently yields 0.
var autoPackLimit, _ = strconv.Atoi(env.Get("SRC_GIT_AUTO_PACK_LIMIT", "50", "the maximum number of pack files we tolerate before we trigger a repack"))
// Our original Git gc job used 1 as limit, while git's default is 6700. We
// don't want to be too aggressive to avoid unnecessary IO, hence we choose a
// value somewhere in the middle. https://gitlab.com/gitlab-org/gitaly uses a
// limit of 1024, which corresponds to an average of 4 loose objects per folder.
// We can tune this parameter once we gain more experience.
var looseObjectsLimit, _ = strconv.Atoi(env.Get("SRC_GIT_LOOSE_OBJECTS_LIMIT", "1024", "the maximum number of loose objects we tolerate before we trigger a repack"))
// A failed sg maintenance run will place a log file in the git directory.
// Subsequent sg maintenance runs are skipped unless the log file is old.
//
// Based on how https://github.com/git/git handles the gc.log file.
var sgmLogExpire = env.MustGetDuration("SRC_GIT_LOG_FILE_EXPIRY", 24*time.Hour, "the number of hours after which sg maintenance runs even if a log file is present")
// Each failed sg maintenance run increments a counter in the sgmLog file.
// We reclone the repository if the number of retries exceeds sgmRetries.
// Setting SRC_SGM_RETRIES to -1 disables recloning due to sgm failures.
// Default value is 3 (reclone after 3 failed sgm runs).
//
// We mention this ENV variable in the header message of the sgmLog files. Make
// sure that changes here are reflected in sgmLogHeader, too.
var sgmRetries, _ = strconv.Atoi(env.Get("SRC_SGM_RETRIES", "3", "the maximum number of times we retry sg maintenance before triggering a reclone."))
// Controls if gitserver cleanup tries to remove repos from disk which are not defined in the DB. Defaults to false.
var removeNonExistingRepos, _ = strconv.ParseBool(env.Get("SRC_REMOVE_NON_EXISTING_REPOS", "false", "controls if gitserver cleanup tries to remove repos from disk which are not defined in the DB"))
// Prometheus metrics reported by the janitor's cleanup tasks.
var (
	// reposRemoved counts repos deleted during cleanup, labeled by reason
	// (e.g. the corruption reason).
	reposRemoved = promauto.NewCounterVec(prometheus.CounterOpts{
		Name: "src_gitserver_repos_removed",
		Help: "number of repos removed during cleanup",
	}, []string{"reason"})
	// reposRecloned counts repos removed and re-cloned by maybeReclone.
	reposRecloned = promauto.NewCounter(prometheus.CounterOpts{
		Name: "src_gitserver_repos_recloned",
		Help: "number of repos removed and re-cloned due to age",
	})
	// reposRemovedDiskPressure counts repos removed by freeUpSpace.
	reposRemovedDiskPressure = promauto.NewCounter(prometheus.CounterOpts{
		Name: "src_gitserver_repos_removed_disk_pressure",
		Help: "number of repos removed due to not enough disk space",
	})
	// janitorRunning is 1 while a janitor run is in progress, 0 otherwise.
	janitorRunning = promauto.NewGauge(prometheus.GaugeOpts{
		Name: "src_gitserver_janitor_running",
		Help: "set to 1 when the gitserver janitor background job is running",
	})
	// jobTimer records per-cleanup-task durations, labeled by success and task name.
	jobTimer = promauto.NewHistogramVec(prometheus.HistogramOpts{
		Name: "src_gitserver_janitor_job_duration_seconds",
		Help: "Duration of the individual jobs within the gitserver janitor background job",
	}, []string{"success", "job_name"})
	maintenanceStatus = promauto.NewCounterVec(prometheus.CounterOpts{
		Name: "src_gitserver_maintenance_status",
		Help: "whether the maintenance run was a success (true/false) and the reason why a cleanup was needed",
	}, []string{"success", "reason"})
	pruneStatus = promauto.NewCounterVec(prometheus.CounterOpts{
		Name: "src_gitserver_prune_status",
		Help: "whether git prune was a success (true/false) and whether it was skipped (true/false)",
	}, []string{"success", "skipped"})
	// janitorTimer records the duration of a whole janitor run.
	janitorTimer = promauto.NewHistogram(prometheus.HistogramOpts{
		Name:    "src_gitserver_janitor_duration_seconds",
		Help:    "Duration of gitserver janitor background job",
		Buckets: []float64{0.1, 1, 10, 60, 300, 3600, 7200},
	})
	// nonExistingReposRemoved counts repos removed because they no longer
	// exist in the database (see removeNonExistingRepos).
	nonExistingReposRemoved = promauto.NewCounter(prometheus.CounterOpts{
		Name: "src_gitserver_non_existing_repos_removed",
		Help: "number of non existing repos removed during cleanup",
	})
)
// cloneRepoFunc is the signature of the function the janitor uses to
// (re-)clone a repository; it returns clone progress output and any error.
type cloneRepoFunc func(ctx context.Context, repo api.RepoName, opts CloneOptions) (cloneProgress string, err error)
// cleanupRepos walks the repos directory and performs maintenance tasks:
//
// 1. Compute the amount of space used by the repo
// 2. Remove corrupt repos.
// 3. Remove stale lock files.
// 4. Ensure correct git attributes
// 5. Ensure gc.auto=0 or unset depending on gitGCMode
// 6. Perform garbage collection
// 7. Re-clone repos after a while. (simulate git gc)
// 8. Remove repos based on disk pressure.
// 9. Perform sg-maintenance
// 10. Git prune
// 11. Set sizes of repos
//
// Each task is a func(dir) (done, err): returning done == true means the repo
// was removed (or re-cloned) and the remaining tasks are skipped for it.
func cleanupRepos(
	ctx context.Context,
	logger log.Logger,
	db database.DB,
	rcf *wrexec.RecordingCommandFactory,
	shardID string,
	reposDir string,
	cloneRepo cloneRepoFunc,
	gitServerAddrs gitserver.GitserverAddresses,
	disableDeleteReposOnWrongShard bool,
) {
	logger = logger.Scoped("cleanup")
	start := time.Now()
	// Export run state and total duration via Prometheus.
	janitorRunning.Set(1)
	defer janitorRunning.Set(0)
	janitorStart := time.Now()
	defer func() {
		janitorTimer.Observe(time.Since(janitorStart).Seconds())
	}()
	// Determine whether this shard appears in the configured address list; if
	// it does not, wrong-shard deletion is suppressed below to avoid wiping
	// the disk on a misconfigured instance.
	knownGitServerShard := false
	for _, addr := range gitServerAddrs.Addresses {
		if hostnameMatch(shardID, addr) {
			knownGitServerShard = true
			break
		}
	}
	if !knownGitServerShard {
		logger.Warn("current shard is not included in the list of known gitserver shards, will not delete repos", log.String("current-hostname", shardID), log.Strings("all-shards", gitServerAddrs.Addresses))
	}
	// repoToSize accumulates per-repo disk usage; flushed to the DB at the end.
	repoToSize := make(map[api.RepoName]int64)
	var wrongShardRepoCount int64
	defer func() {
		// We want to set the gauge only at the end when we know the total
		wrongShardReposTotal.Set(float64(wrongShardRepoCount))
	}()
	var wrongShardReposDeleted int64
	defer func() {
		// We want to set the gauge only when wrong shard clean-up is enabled
		if disableDeleteReposOnWrongShard {
			wrongShardReposDeletedCounter.Add(float64(wrongShardReposDeleted))
		}
	}()
	// maybeDeleteWrongShardRepos removes repos that hash to a different shard.
	maybeDeleteWrongShardRepos := func(dir common.GitDir) (done bool, err error) {
		// Record the number of repos that should not belong on this instance and
		// remove up to SRC_WRONG_SHARD_DELETE_LIMIT in a single Janitor run.
		name := gitserverfs.RepoNameFromDir(reposDir, dir)
		addr := addrForRepo(ctx, name, gitServerAddrs)
		if hostnameMatch(shardID, addr) {
			return false, nil
		}
		wrongShardRepoCount++
		// If we're on a shard not currently known, basically every repo would
		// be considered on the wrong shard. This is probably a configuration
		// error and we don't want to completely empty our disk in that case,
		// so skip.
		if !knownGitServerShard {
			return false, nil
		}
		// Check that wrong shard deletion has not been disabled.
		if disableDeleteReposOnWrongShard {
			return false, nil
		}
		logger.Info(
			"removing repo cloned on the wrong shard",
			log.String("dir", string(dir)),
			log.String("target-shard", addr),
			log.String("current-shard", shardID),
		)
		if err := gitserverfs.RemoveRepoDirectory(ctx, logger, db, shardID, reposDir, dir, false); err != nil {
			return true, err
		}
		wrongShardReposDeleted++
		// Note: We just deleted the repo. So we're done with any further janitor tasks!
		return true, nil
	}
	// collectSize records the repo's on-disk size for the final DB update.
	collectSize := func(dir common.GitDir) (done bool, err error) {
		size := gitserverfs.DirSize(dir.Path("."))
		name := gitserverfs.RepoNameFromDir(reposDir, dir)
		repoToSize[name] = size
		return false, nil
	}
	// maybeRemoveCorrupt deletes repos that checkRepoDirCorrupt flags, logging
	// the corruption to the DB (best-effort) before removal.
	maybeRemoveCorrupt := func(dir common.GitDir) (done bool, _ error) {
		corrupt, reason, err := checkRepoDirCorrupt(rcf, reposDir, dir)
		if !corrupt || err != nil {
			return false, err
		}
		repoName := gitserverfs.RepoNameFromDir(reposDir, dir)
		err = db.GitserverRepos().LogCorruption(ctx, repoName, fmt.Sprintf("sourcegraph detected corrupt repo: %s", reason), shardID)
		if err != nil {
			logger.Warn("failed to log repo corruption", log.String("repo", string(repoName)), log.Error(err))
		}
		logger.Info("removing corrupt repo", log.String("repo", string(dir)), log.String("reason", reason))
		if err := gitserverfs.RemoveRepoDirectory(ctx, logger, db, shardID, reposDir, dir, true); err != nil {
			return true, err
		}
		reposRemoved.WithLabelValues(reason).Inc()
		return true, nil
	}
	// maybeRemoveNonExisting deletes repos no longer present in the DB; gated
	// behind SRC_REMOVE_NON_EXISTING_REPOS.
	maybeRemoveNonExisting := func(dir common.GitDir) (bool, error) {
		if !removeNonExistingRepos {
			return false, nil
		}
		_, err := db.GitserverRepos().GetByName(ctx, gitserverfs.RepoNameFromDir(reposDir, dir))
		// Repo still exists, nothing to do.
		if err == nil {
			return false, nil
		}
		// Failed to talk to DB, skip this repo.
		if !errcode.IsNotFound(err) {
			logger.Warn("failed to look up repo", log.Error(err), log.String("repo", string(dir)))
			return false, nil
		}
		// The repo does not exist in the DB (or is soft-deleted), continue deleting it.
		err = gitserverfs.RemoveRepoDirectory(ctx, logger, db, shardID, reposDir, dir, false)
		if err == nil {
			nonExistingReposRemoved.Inc()
		}
		return true, err
	}
	// ensureGitAttributes writes Sourcegraph's standard info/attributes file.
	ensureGitAttributes := func(dir common.GitDir) (done bool, err error) {
		return false, git.SetGitAttributes(dir)
	}
	// ensureAutoGC enables/disables git's background auto-gc per gitGCMode.
	ensureAutoGC := func(dir common.GitDir) (done bool, err error) {
		return false, gitSetAutoGC(rcf, reposDir, dir)
	}
	// maybeReclone re-clones a repo when it is old, flagged as maybe-corrupt,
	// has gc failures, or has exceeded the sg-maintenance retry budget.
	maybeReclone := func(dir common.GitDir) (done bool, err error) {
		repoType, err := git.GetRepositoryType(rcf, reposDir, dir)
		if err != nil {
			return false, err
		}
		recloneTime, err := getRecloneTime(rcf, reposDir, dir)
		if err != nil {
			return false, err
		}
		// Add a jitter to spread out re-cloning of repos cloned at the same time.
		var reason string
		const maybeCorrupt = "maybeCorrupt"
		// NOTE(review): the loop variable below shadows the maybeCorrupt
		// const with the config *value*; reason is set to that value, so the
		// perforce guard further down (reason != maybeCorrupt) only holds if
		// the stored value is literally "maybeCorrupt" — verify intended.
		if maybeCorrupt, _ := git.ConfigGet(rcf, reposDir, dir, gitConfigMaybeCorrupt); maybeCorrupt != "" {
			// Set the reason so that the repo cleaned up
			reason = maybeCorrupt
			// We don't log the corruption here, since the corruption *should* have already been
			// logged when this config setting was set in the repo.
			// When the repo is recloned, the corrupted_at status should be cleared, which means
			// the repo is not considered corrupted anymore.
			//
			// unset flag to stop constantly re-cloning if it fails.
			_ = git.ConfigUnset(rcf, reposDir, dir, gitConfigMaybeCorrupt)
		}
		if time.Since(recloneTime) > repoTTL+jitterDuration(string(dir), repoTTL/4) {
			reason = "old"
		}
		if time.Since(recloneTime) > repoTTLGC+jitterDuration(string(dir), repoTTLGC/4) {
			if gclog, err := os.ReadFile(dir.Path("gc.log")); err == nil && len(gclog) > 0 {
				reason = fmt.Sprintf("git gc %s", string(bytes.TrimSpace(gclog)))
			}
		}
		if (sgmRetries >= 0) && (bestEffortReadFailed(dir) > sgmRetries) {
			if sgmLog, err := os.ReadFile(dir.Path(sgmLog)); err == nil && len(sgmLog) > 0 {
				reason = fmt.Sprintf("sg maintenance, too many retries: %s", string(bytes.TrimSpace(sgmLog)))
			}
		}
		// We believe converting a Perforce depot to a Git repository is generally a
		// very expensive operation, therefore we do not try to re-clone/redo the
		// conversion only because it is old or slow to do "git gc".
		if repoType == "perforce" && reason != maybeCorrupt {
			reason = ""
		}
		if reason == "" {
			return false, nil
		}
		// name is the relative path to ReposDir, but without the .git suffix.
		repo := gitserverfs.RepoNameFromDir(reposDir, dir)
		recloneLogger := logger.With(
			log.String("repo", string(repo)),
			log.Time("cloned", recloneTime),
			log.String("reason", reason),
		)
		recloneLogger.Info("re-cloning expired repo")
		// update the re-clone time so that we don't constantly re-clone if cloning fails.
		// For example if a repo fails to clone due to being large, we will constantly be
		// doing a clone which uses up lots of resources.
		if err := setRecloneTime(rcf, reposDir, dir, recloneTime.Add(time.Since(recloneTime)/2)); err != nil {
			recloneLogger.Warn("setting backed off re-clone time failed", log.Error(err))
		}
		cmdCtx, cancel := context.WithTimeout(ctx, conf.GitLongCommandTimeout())
		defer cancel()
		if _, err := cloneRepo(cmdCtx, repo, CloneOptions{Block: true, Overwrite: true}); err != nil {
			return true, err
		}
		reposRecloned.Inc()
		return true, nil
	}
	// removeStaleLocks deletes lock files that have outlived the operations
	// that created them; git does not clean these up on interruption.
	removeStaleLocks := func(gitDir common.GitDir) (done bool, err error) {
		// if removing a lock fails, we still want to try the other locks.
		var multi error
		// config.lock should be held for a very short amount of time.
		if _, err := removeFileOlderThan(logger, gitDir.Path("config.lock"), time.Minute); err != nil {
			multi = errors.Append(multi, err)
		}
		// packed-refs can be held for quite a while, so we are conservative
		// with the age.
		if _, err := removeFileOlderThan(logger, gitDir.Path("packed-refs.lock"), time.Hour); err != nil {
			multi = errors.Append(multi, err)
		}
		// we use the same conservative age for locks inside of refs
		if err := gitserverfs.BestEffortWalk(gitDir.Path("refs"), func(path string, fi fs.DirEntry) error {
			if fi.IsDir() {
				return nil
			}
			if !strings.HasSuffix(path, ".lock") {
				return nil
			}
			_, err := removeFileOlderThan(logger, path, time.Hour)
			return err
		}); err != nil {
			multi = errors.Append(multi, err)
		}
		// We have seen that, occasionally, commit-graph.locks prevent a git repack from
		// succeeding. Benchmarks on our dogfood cluster have shown that a commit-graph
		// call for a 5GB bare repository takes less than 1 min. The lock is only held
		// during a short period during this time. A 1-hour grace period is very
		// conservative.
		if _, err := removeFileOlderThan(logger, gitDir.Path("objects", "info", "commit-graph.lock"), time.Hour); err != nil {
			multi = errors.Append(multi, err)
		}
		// gc.pid is set by git gc and our sg maintenance script. 24 hours is twice the
		// time git gc uses internally.
		gcPIDMaxAge := 24 * time.Hour
		if foundStale, err := removeFileOlderThan(logger, gitDir.Path(gcLockFile), gcPIDMaxAge); err != nil {
			multi = errors.Append(multi, err)
		} else if foundStale {
			logger.Warn(
				"removeStaleLocks found a stale gc.pid lockfile and removed it. This should not happen and points to a problem with garbage collection. Monitor the repo for possible corruption and verify if this error reoccurs",
				log.String("path", string(gitDir)),
				log.Duration("age", gcPIDMaxAge))
		}
		return false, multi
	}
	// performGC runs git gc --auto (gitGCModeJanitorAutoGC only).
	performGC := func(dir common.GitDir) (done bool, err error) {
		return false, gitGC(rcf, reposDir, dir)
	}
	// performSGMaintenance runs the embedded sg maintenance script
	// (gitGCModeMaintenance only).
	performSGMaintenance := func(dir common.GitDir) (done bool, err error) {
		return false, sgMaintenance(logger, dir)
	}
	// performGitPrune prunes loose objects when they exceed looseObjectsLimit.
	performGitPrune := func(reposDir string, dir common.GitDir) (done bool, err error) {
		return false, pruneIfNeeded(rcf, reposDir, dir, looseObjectsLimit)
	}
	// cleanupFn pairs a task name (used in logs/metrics) with its implementation.
	type cleanupFn struct {
		Name string
		Do   func(common.GitDir) (bool, error)
	}
	cleanups := []cleanupFn{
		// First, check if we should even be having this repo on disk anymore,
		// maybe there's been a resharding event and we can actually remove it
		// and not spend further CPU cycles fixing it.
		{"delete wrong shard repos", maybeDeleteWrongShardRepos},
		// Compute the amount of space used by the repo
		{"compute stats", collectSize},
		// Do some sanity checks on the repository.
		{"maybe remove corrupt", maybeRemoveCorrupt},
		// Remove repo if DB does not contain it anymore
		{"maybe remove non existing", maybeRemoveNonExisting},
		// If git is interrupted it can leave lock files lying around. It does not clean
		// these up, and instead fails commands.
		{"remove stale locks", removeStaleLocks},
		// We always want to have the same git attributes file at info/attributes.
		{"ensure git attributes", ensureGitAttributes},
		// Enable or disable background garbage collection depending on
		// gitGCMode. The purpose is to avoid repository corruption which can
		// happen if several git-gc operations are running at the same time.
		// We only disable if sg is managing gc.
		{"auto gc config", ensureAutoGC},
	}
	if gitGCMode == gitGCModeJanitorAutoGC {
		// Runs a number of housekeeping tasks within the current repository, such as
		// compressing file revisions (to reduce disk space and increase performance),
		// removing unreachable objects which may have been created from prior
		// invocations of git add, packing refs, pruning reflog, rerere metadata or stale
		// working trees. May also update ancillary indexes such as the commit-graph.
		cleanups = append(cleanups, cleanupFn{"garbage collect", performGC})
	}
	if gitGCMode == gitGCModeMaintenance {
		// Run tasks to optimize Git repository data, speeding up other Git commands and
		// reducing storage requirements for the repository. Note: "garbage collect" and
		// "sg maintenance" must not be enabled at the same time.
		cleanups = append(cleanups, cleanupFn{"sg maintenance", performSGMaintenance})
		cleanups = append(cleanups, cleanupFn{"git prune", func(dir common.GitDir) (bool, error) {
			return performGitPrune(reposDir, dir)
		}})
	}
	if !conf.Get().DisableAutoGitUpdates {
		// Old git clones accumulate loose git objects that waste space and slow down git
		// operations. Periodically do a fresh clone to avoid these problems. git gc is
		// slow and resource intensive. It is cheaper and faster to just re-clone the
		// repository. We don't do this if DisableAutoGitUpdates is set as it could
		// potentially kick off a clone operation.
		cleanups = append(cleanups, cleanupFn{
			Name: "maybe re-clone",
			Do:   maybeReclone,
		})
	}
	// Walk every git dir on disk, running each cleanup task in order until one
	// reports done (repo removed/re-cloned) or all tasks have run.
	reposCleaned := 0
	err := iterateGitDirs(reposDir, func(gitDir common.GitDir) (done bool) {
		for _, cfn := range cleanups {
			// Check if context has been canceled, if so skip the rest of the repos.
			select {
			case <-ctx.Done():
				logger.Warn("aborting janitor run", log.Error(ctx.Err()))
				return true
			default:
			}
			start := time.Now()
			done, err := cfn.Do(gitDir)
			if err != nil {
				logger.Error("error running cleanup command",
					log.String("name", cfn.Name),
					log.String("repo", string(gitDir)),
					log.Error(err))
			}
			jobTimer.WithLabelValues(strconv.FormatBool(err == nil), cfn.Name).Observe(time.Since(start).Seconds())
			if done {
				break
			}
		}
		reposCleaned++
		// Every 1000 repos, log a progress message.
		if reposCleaned%1000 == 0 {
			logger.Info("Janitor progress", log.Int("repos_cleaned", reposCleaned))
		}
		return false
	})
	if err != nil {
		logger.Error("error iterating over repositories", log.Error(err))
	}
	// Persist the sizes gathered by collectSize in one batch.
	if len(repoToSize) > 0 {
		_, err := db.GitserverRepos().UpdateRepoSizes(ctx, logger, shardID, repoToSize)
		if err != nil {
			logger.Error("setting repo sizes", log.Error(err))
		}
	}
	logger.Info("Janitor run finished", log.String("duration", time.Since(start).String()))
}
// checkRepoDirCorrupt reports whether the repository at dir looks corrupt,
// along with a short reason string suitable for logging and metrics labels.
func checkRepoDirCorrupt(rcf *wrexec.RecordingCommandFactory, reposDir string, dir common.GitDir) (bool, string, error) {
	// A repository without a HEAD file is treated as corrupt: both our cloning
	// and our fetching ensure that a HEAD file exists.
	_, statErr := os.Stat(dir.Path("HEAD"))
	switch {
	case os.IsNotExist(statErr):
		return true, "missing-head", nil
	case statErr != nil:
		return false, "", statErr
	}
	// We have seen repository corruption fail in such a way that the git
	// config is missing the bare repo option but everything else looks
	// like it works. This leads to failing fetches, so treat non-bare
	// repos as corrupt. Since we often fetch with ensureRevision, this
	// leads to most commands failing against the repository. It is safer
	// to remove now than try a safe reclone.
	if gitIsNonBareBestEffort(rcf, reposDir, dir) {
		return true, "non-bare", nil
	}
	return false, "", nil
}
// DiskSizer gets information about disk size and free space.
type DiskSizer interface {
	// BytesFreeOnDisk returns the number of bytes available on the filesystem
	// containing mountPoint.
	BytesFreeOnDisk(mountPoint string) (uint64, error)
	// DiskSizeBytes returns the total size of the filesystem containing mountPoint.
	DiskSizeBytes(mountPoint string) (uint64, error)
}
// howManyBytesToFree returns the number of bytes that should be freed to make sure
// there is sufficient disk space free to satisfy desiredPercentFree. The result
// is never negative; it is 0 when enough space is already free.
func howManyBytesToFree(logger log.Logger, reposDir string, diskSizer DiskSizer, desiredPercentFree int) (int64, error) {
	actualFreeBytes, err := diskSizer.BytesFreeOnDisk(reposDir)
	if err != nil {
		return 0, errors.Wrap(err, "finding the amount of space free on disk")
	}
	// Free up space if necessary.
	diskSizeBytes, err := diskSizer.DiskSizeBytes(reposDir)
	if err != nil {
		return 0, errors.Wrap(err, "getting disk size")
	}
	desiredFreeBytes := uint64(float64(desiredPercentFree) / 100.0 * float64(diskSizeBytes))
	// Compute the shortfall with an explicit guard instead of the previous
	// int64(desiredFreeBytes-actualFreeBytes), which relied on uint64
	// wraparound converting to a negative int64 that was then clamped — a
	// fragile trick that is wrong for differences >= 2^63.
	var howManyBytesToFree int64
	if desiredFreeBytes > actualFreeBytes {
		howManyBytesToFree = int64(desiredFreeBytes - actualFreeBytes)
	}
	const G = float64(1024 * 1024 * 1024)
	logger.Debug(
		"howManyBytesToFree",
		log.Int("desired percent free", desiredPercentFree),
		log.Float64("actual percent free", float64(actualFreeBytes)/float64(diskSizeBytes)*100.0),
		log.Float64("amount to free in GiB", float64(howManyBytesToFree)/G),
	)
	return howManyBytesToFree, nil
}
// StatDiskSizer implements DiskSizer using filesystem usage statistics.
type StatDiskSizer struct{}
// BytesFreeOnDisk returns the available bytes on the filesystem at mountPoint.
func (s *StatDiskSizer) BytesFreeOnDisk(mountPoint string) (uint64, error) {
	u, err := du.New(mountPoint)
	if err != nil {
		return 0, err
	}
	return u.Available(), nil
}
// DiskSizeBytes returns the total size of the filesystem at mountPoint.
func (s *StatDiskSizer) DiskSizeBytes(mountPoint string) (uint64, error) {
	u, err := du.New(mountPoint)
	if err != nil {
		return 0, err
	}
	return u.Size(), nil
}
// freeUpSpace removes git directories under ReposDir, in order from least
// recently to most recently used, until it has freed howManyBytesToFree.
// Recency is approximated by the mod time of each repo's HEAD file. Returns
// an error if, after trying every repo, not enough space could be freed.
func freeUpSpace(ctx context.Context, logger log.Logger, db database.DB, shardID string, reposDir string, diskSizer DiskSizer, desiredPercentFree int, howManyBytesToFree int64) error {
	if howManyBytesToFree <= 0 {
		return nil
	}
	logger = logger.Scoped("freeUpSpace")
	// Get the git directories and their mod times.
	gitDirs, err := findGitDirs(reposDir)
	if err != nil {
		return errors.Wrap(err, "finding git dirs")
	}
	dirModTimes := make(map[common.GitDir]time.Time, len(gitDirs))
	for _, d := range gitDirs {
		mt, err := gitDirModTime(d)
		if err != nil {
			// If we get an error here, we move it to the end of the queue,
			// since it's the janitor's job to clean/fix this.
			logger.Warn("computing mod time of git dir failed", log.String("dir", string(d)), log.Error(err))
			dirModTimes[d] = time.Now()
			continue
		}
		dirModTimes[d] = mt
	}
	// Sort the repos from least to most recently used.
	sort.Slice(gitDirs, func(i, j int) bool {
		return dirModTimes[gitDirs[i]].Before(dirModTimes[gitDirs[j]])
	})
	// Remove repos until howManyBytesToFree is met or exceeded.
	var spaceFreed int64
	diskSizeBytes, err := diskSizer.DiskSizeBytes(reposDir)
	if err != nil {
		return errors.Wrap(err, "getting disk size")
	}
	for _, d := range gitDirs {
		if spaceFreed >= howManyBytesToFree {
			return nil
		}
		// Fast-exit if the context has been canceled.
		select {
		case <-ctx.Done():
			return ctx.Err()
		default:
		}
		// Measure the repo before deleting it so we can credit the freed bytes.
		delta := gitserverfs.DirSize(d.Path("."))
		if err := gitserverfs.RemoveRepoDirectory(ctx, logger, db, shardID, reposDir, d, true); err != nil {
			// Best-effort: a repo that fails to delete is skipped, not fatal.
			logger.Warn("failed to remove least recently used repo", log.String("dir", string(d)), log.Error(err))
			continue
		}
		spaceFreed += delta
		reposRemovedDiskPressure.Inc()
		// Report the new disk usage situation after removing this repo.
		actualFreeBytes, err := diskSizer.BytesFreeOnDisk(reposDir)
		if err != nil {
			return errors.Wrap(err, "finding the amount of space free on disk")
		}
		G := float64(1024 * 1024 * 1024)
		logger.Warn("removed least recently used repo",
			log.String("repo", string(d)),
			log.Duration("how old", time.Since(dirModTimes[d])),
			log.Float64("free space in GiB", float64(actualFreeBytes)/G),
			log.Float64("actual percent of disk space free", float64(actualFreeBytes)/float64(diskSizeBytes)*100.0),
			log.Float64("desired percent of disk space free", float64(desiredPercentFree)),
			log.Float64("space freed in GiB", float64(spaceFreed)/G),
			log.Float64("how much space to free in GiB", float64(howManyBytesToFree)/G))
	}
	// Check.
	if spaceFreed < howManyBytesToFree {
		return errors.Errorf("only freed %d bytes, wanted to free %d", spaceFreed, howManyBytesToFree)
	}
	return nil
}
// gitDirModTime returns the modification time of the repository at d,
// approximated by the mod time of its HEAD file.
func gitDirModTime(d common.GitDir) (time.Time, error) {
	fi, statErr := os.Stat(d.Path("HEAD"))
	if statErr != nil {
		return time.Time{}, errors.Wrap(statErr, "getting repository modification time")
	}
	return fi.ModTime(), nil
}
// iterateGitDirs walks over the reposDir on disk and calls walkFn for each of the
// git directories found on disk. walkFn returning true stops the walk early.
func iterateGitDirs(reposDir string, walkFn func(common.GitDir) (done bool)) error {
	return gitserverfs.BestEffortWalk(reposDir, func(dir string, fi fs.DirEntry) error {
		// Skip anything gitserver is configured to ignore; if it is a
		// directory, skip its whole subtree.
		if gitserverfs.IgnorePath(reposDir, dir) {
			if fi.IsDir() {
				return filepath.SkipDir
			}
			return nil
		}
		// Look for $GIT_DIR: only directories named .git qualify.
		if !fi.IsDir() || fi.Name() != ".git" {
			return nil
		}
		// This is a GIT_DIR per the check above; hand it to the callback.
		if walkFn(common.GitDir(dir)) {
			return filepath.SkipAll
		}
		// Do not descend into the git directory itself.
		return filepath.SkipDir
	})
}
// findGitDirs collects the GitDirs of all repos under reposDir.
func findGitDirs(reposDir string) ([]common.GitDir, error) {
	var dirs []common.GitDir
	// The callback never requests early termination, so every repo is visited.
	err := iterateGitDirs(reposDir, func(dir common.GitDir) bool {
		dirs = append(dirs, dir)
		return false
	})
	// Run the walk to completion before reading dirs. The previous form,
	// `return dirs, iterateGitDirs(...)`, read dirs and invoked the call that
	// appends to it within one return statement; the Go spec only orders the
	// function calls in such a statement, leaving the relative order of the
	// plain variable read unspecified.
	return dirs, err
}
// setRecloneTime sets the time a repository is cloned.
func setRecloneTime(rcf *wrexec.RecordingCommandFactory, reposDir string, dir common.GitDir, now time.Time) error {
	ts := strconv.FormatInt(now.Unix(), 10)
	if err := git.ConfigSet(rcf, reposDir, dir, "sourcegraph.recloneTimestamp", ts); err != nil {
		// Best-effort HEAD repair on failure — presumably a missing HEAD can
		// make git config fail; verify against git.EnsureHEAD. Any repair
		// error is reported together with the original one.
		if headErr := git.EnsureHEAD(dir); headErr != nil {
			err = errors.Append(err, headErr)
		}
		return errors.Wrap(err, "failed to update recloneTimestamp")
	}
	return nil
}
// getRecloneTime returns an approximate time a repository is cloned. If the
// value is not stored in the repository, the re-clone time for the repository is
// set to now.
func getRecloneTime(rcf *wrexec.RecordingCommandFactory, reposDir string, dir common.GitDir) (time.Time, error) {
	// We store the time we re-cloned the repository. If the value is missing,
	// we store the current time. This decouples this timestamp from the
	// different ways a clone can appear in gitserver.
	resetToNow := func() (time.Time, error) {
		now := time.Now()
		return now, setRecloneTime(rcf, reposDir, dir, now)
	}
	raw, err := git.ConfigGet(rcf, reposDir, dir, "sourcegraph.recloneTimestamp")
	if err != nil {
		return time.Unix(0, 0), errors.Wrap(err, "failed to determine clone timestamp")
	}
	if raw == "" {
		return resetToNow()
	}
	sec, parseErr := strconv.ParseInt(raw, 10, 0)
	if parseErr == nil {
		return time.Unix(sec, 0), nil
	}
	// The stored value is bad: overwrite it with the current time. Note that
	// the parse error is still returned when the repair write succeeds; only
	// a failing write replaces it.
	now, setErr := resetToNow()
	if setErr != nil {
		return now, setErr
	}
	return now, parseErr
}
// gitIsNonBareBestEffort returns true if the repository is not a bare
// repo. If we fail to check or the repository is bare we return false.
//
// Note: it is not always possible to check if a repository is bare since a
// lock file may prevent the check from succeeding. We only want bare
// repositories and want to avoid transient false positives.
func gitIsNonBareBestEffort(rcf *wrexec.RecordingCommandFactory, reposDir string, dir common.GitDir) bool {
	cmd := exec.Command("git", "-C", dir.Path(), "rev-parse", "--is-bare-repository")
	dir.Set(cmd)
	repoName := gitserverfs.RepoNameFromDir(reposDir, dir)
	wrapped := rcf.WrapWithRepoName(context.Background(), log.NoOp(), repoName, cmd)
	// The error is deliberately dropped: on failure out is empty, which does
	// not equal "false", so we report bare/unknown as required.
	out, _ := wrapped.Output()
	return bytes.Equal(bytes.TrimSpace(out), []byte("false"))
}
// gitGC will invoke `git-gc` to clean up any garbage in the repo. It will
// operate synchronously and be aggressive with its internal heuristics when
// deciding to act (meaning it will act now at lower thresholds).
func gitGC(rcf *wrexec.RecordingCommandFactory, reposDir string, dir common.GitDir) error {
	// gc.auto=1 lowers the threshold so gc acts now; autoDetach=false keeps
	// the run synchronous.
	cmd := exec.Command("git", "-c", "gc.auto=1", "-c", "gc.autoDetach=false", "gc", "--auto")
	dir.Set(cmd)
	repoName := gitserverfs.RepoNameFromDir(reposDir, dir)
	wrapped := rcf.WrapWithRepoName(context.Background(), log.NoOp(), repoName, cmd)
	if err := wrapped.Run(); err != nil {
		return errors.Wrapf(executil.WrapCmdError(cmd, err), "failed to git-gc")
	}
	return nil
}
const (
	// sgmLogPrefix precedes the failure counter line in the sg maintenance
	// log file (the line reads "failed=<n>").
	sgmLogPrefix = "failed="
	// sgmLogHeader is written verbatim at the top of every sg maintenance
	// log file.
	sgmLogHeader = `DO NOT EDIT: generated by gitserver.
This file records the number of failed runs of sg maintenance and the
last error message. The number of failed attempts is compared to the
number of allowed retries (see SRC_SGM_RETRIES) to decide whether a
repository should be recloned.`
)
// writeSGMLog writes a log file with the format
//
// <header>
//
// <sgmLogPrefix><int>
//
// <error message>
//
// The counter is the previously recorded failure count (0 if the file is
// missing or unparsable) plus one, so each write increments the number of
// consecutive failed runs.
func writeSGMLog(dir common.GitDir, m []byte) error {
	return os.WriteFile(
		dir.Path(sgmLog),
		[]byte(fmt.Sprintf("%s\n\n%s%d\n\n%s\n", sgmLogHeader, sgmLogPrefix, bestEffortReadFailed(dir)+1, m)),
		0600,
	)
}
// bestEffortReadFailed returns the failure counter recorded in dir's sg
// maintenance log file, or 0 if the file cannot be read.
func bestEffortReadFailed(dir common.GitDir) int {
	content, readErr := os.ReadFile(dir.Path(sgmLog))
	if readErr != nil {
		return 0
	}
	return bestEffortParseFailed(content)
}
// bestEffortParseFailed extracts the integer that follows sgmLogPrefix in b.
// A missing prefix or a malformed number yields 0.
func bestEffortParseFailed(b []byte) int {
	tag := []byte(sgmLogPrefix)
	start := bytes.Index(b, tag)
	if start < 0 {
		return 0
	}
	rest := b[start+len(tag):]
	// Trim at the first newline; an immediate newline (index 0) is left
	// alone and falls through to the Atoi failure below.
	if nl := bytes.IndexByte(rest, '\n'); nl > 0 {
		rest = rest[:nl]
	}
	// Atoi errors are deliberately ignored; n stays 0 on failure.
	n, _ := strconv.Atoi(string(rest))
	return n
}
// sgMaintenance runs a set of git cleanup tasks in dir. This must not be run
// concurrently with git gc. sgMaintenance will check the state of the repository
// to avoid running the cleanup tasks if possible. If a sgmLog file is present in
// dir, sgMaintenance will not run unless the file is old.
func sgMaintenance(logger log.Logger, dir common.GitDir) (err error) {
// Don't run if sgmLog file is younger than sgmLogExpire hours. There is no need
// to report an error, because the error has already been logged in a previous
// run.
if fi, err := os.Stat(dir.Path(sgmLog)); err == nil {
if fi.ModTime().After(time.Now().Add(-sgmLogExpire)) {
return nil
}
}
needed, reason, err := needsMaintenance(dir)
defer func() {
maintenanceStatus.WithLabelValues(strconv.FormatBool(err == nil), reason).Inc()
}()
if err != nil {
return err
}
if !needed {
return nil
}
cmd := exec.Command("zsh")
dir.Set(cmd)
cmd.Stdin = strings.NewReader(sgMaintenanceScript)
err, unlock := lockRepoForGC(dir)
if err != nil {
logger.Debug(
"could not lock repository for sg maintenance",
log.String("dir", string(dir)),
log.Error(err),
)
return nil
}
defer unlock()