Skip to content

Commit 389c18f

Browse files
committed
pack-objects: thread the path-based compression
Adapting the implementation of ll_find_deltas(), create a threaded version of the --path-walk compression step in 'git pack-objects'. This involves adding a 'regions' member to the thread_params struct, allowing each thread to own a section of paths. We can simplify the way jobs are split because there is no value in extending the batch based on name-hash the way sections of the object entry array are attempted to be grouped. We re-use the 'list_size' and 'remaining' items for the purpose of borrowing work in progress from other "victim" threads when a thread has finished its batch of work more quickly. Using the Git repository as a test repo, the p5313 performance test shows that the resulting size of the repo is the same, but the threaded implementation gives gains of varying degrees depending on the number of objects being packed. (This was tested on a 16-core machine.) Test HEAD~1 HEAD ----------------------------------------------------------------- 5313.2: thin pack 0.00 0.00 = 5313.3: thin pack size 589 589 +0.0% 5313.4: thin pack with --path-walk 0.00 0.00 = 5313.5: thin pack size with --path-walk 589 589 +0.0% 5313.6: big pack 2.84 2.80 -1.4% 5313.7: big pack size 14.0M 14.1M +0.3% 5313.8: big pack with --path-walk 5.46 3.77 -31.0% 5313.9: big pack size with --path-walk 13.2M 13.2M -0.0% 5313.10: repack 22.11 21.50 -2.8% 5313.11: repack size 126.4M 126.2M -0.2% 5313.12: repack with --path-walk 66.89 26.41 -60.5% 5313.13: repack size with --path-walk 109.6M 109.6M +0.0% This 60% reduction in 'git repack --path-walk' time is typical across all repos I used for testing. What is interesting is to compare when the overall time improves enough to outperform the standard case. These time improvements correlate with repositories with data shapes that significantly improve their data size as well. 
For example, the microsoft/fluentui repo has a 439M to 122M size reduction, and the repack time is now 36.6 seconds with --path-walk compared to 95+ seconds without it: Test HEAD~1 HEAD ----------------------------------------------------------------- 5313.2: thin pack 0.41 0.42 +2.4% 5313.3: thin pack size 1.2M 1.2M +0.0% 5313.4: thin pack with --path-walk 0.08 0.05 -37.5% 5313.5: thin pack size with --path-walk 18.4K 18.4K +0.0% 5313.6: big pack 4.47 4.53 +1.3% 5313.7: big pack size 19.6M 19.7M +0.3% 5313.8: big pack with --path-walk 6.76 3.51 -48.1% 5313.9: big pack size with --path-walk 16.5M 16.4M -0.2% 5313.10: repack 96.87 99.05 +2.3% 5313.11: repack size 439.5M 439.0M -0.1% 5313.12: repack with --path-walk 95.68 36.55 -61.8% 5313.13: repack size with --path-walk 122.6M 122.6M +0.0% In a more extreme example, an internal repository that has a similar name-hash collision issue to microsoft/fluentui reduces its size from 6.4G to 805M with the --path-walk option. This also reduces the repacking time from 2,138 seconds to 478 seconds. Test HEAD~1 HEAD ------------------------------------------------------------------ 5313.10: repack 2138.22 2138.19 -0.0% 5313.11: repack size 6.4G 6.4G -0.0% 5313.12: repack with --path-walk 1351.46 477.91 -64.6% 5313.13: repack size with --path-walk 804.1M 804.1M -0.0% Finally, the Linux kernel repository is a good test for this repacking time change, even though the space savings is more reasonable: Test HEAD~1 HEAD ---------------------------------------------------------------- 5313.10: repack 734.26 735.11 +0.1% 5313.11: repack size 2.5G 2.5G -0.0% 5313.12: repack with --path-walk 1457.23 598.17 -59.0% 5313.13: repack size with --path-walk 2.2G 2.2G +0.0% Signed-off-by: Derrick Stolee <[email protected]>
1 parent ee81764 commit 389c18f

File tree

1 file changed

+160
-2
lines changed

1 file changed

+160
-2
lines changed

builtin/pack-objects.c

+160-2
Original file line numberDiff line numberDiff line change
@@ -2935,6 +2935,7 @@ static void find_deltas(struct object_entry **list, unsigned *list_size,
29352935
struct thread_params {
29362936
pthread_t thread;
29372937
struct object_entry **list;
2938+
struct packing_region *regions;
29382939
unsigned list_size;
29392940
unsigned remaining;
29402941
int window;
@@ -3248,6 +3249,163 @@ static void find_deltas_by_region(struct object_entry *list,
32483249
stop_progress(&progress_state);
32493250
}
32503251

3252+
/*
 * Worker-thread entry point for path-based delta compression.
 *
 * Each thread owns a contiguous slice of 'me->regions' ('me->remaining'
 * entries long) and compresses one region per iteration. When the slice
 * is exhausted, the thread marks itself idle and sleeps on its private
 * condition variable until the coordinating thread either assigns it
 * stolen work (remaining > 0) or signals it with no work, which makes
 * the outer loop exit and the thread terminate.
 *
 * Locking protocol: 'remaining', 'regions', and 'working' are protected
 * by the global progress mutex (progress_lock/progress_unlock), which is
 * deliberately dropped around the expensive find_deltas_for_region()
 * call. The per-thread mutex/cond pair protects only 'data_ready'.
 */
static void *threaded_find_deltas_by_path(void *arg)
{
	struct thread_params *me = arg;

	progress_lock();
	/* Outer loop: one iteration per batch of work handed to this thread. */
	while (me->remaining) {
		/* Inner loop: process each region of the current batch. */
		while (me->remaining) {
			/* Drop the lock while doing the actual compression. */
			progress_unlock();
			find_deltas_for_region(to_pack.objects,
					       me->regions,
					       me->processed);
			progress_lock();
			/* Consume the region just processed; advance the slice. */
			me->remaining--;
			me->regions++;
		}

		/* Announce idleness so the coordinator can steal work for us. */
		me->working = 0;
		pthread_cond_signal(&progress_cond);
		progress_unlock();

		/*
		 * We must not set ->data_ready before we wait on the
		 * condition because the main thread may have set it to 1
		 * before we get here. In order to be sure that new
		 * work is available if we see 1 in ->data_ready, it
		 * was initialized to 0 before this thread was spawned
		 * and we reset it to 0 right away.
		 */
		pthread_mutex_lock(&me->mutex);
		while (!me->data_ready)
			pthread_cond_wait(&me->cond, &me->mutex);
		me->data_ready = 0;
		pthread_mutex_unlock(&me->mutex);

		/* Re-take the progress lock before re-checking 'remaining'. */
		progress_lock();
	}
	progress_unlock();
	/* leave ->working 1 so that this doesn't get more work assigned */
	return NULL;
}
3292+
3293+
/*
 * Threaded driver for path-based ("--path-walk") delta compression.
 *
 * Splits the 'nr' packing regions evenly across 'delta_search_threads'
 * worker threads (each running threaded_find_deltas_by_path()), then
 * load-balances by stealing half of the largest remaining batch for any
 * thread that finishes early. Falls back to the single-threaded
 * find_deltas_by_region() when only one thread is configured.
 *
 * The structure mirrors ll_find_deltas(); here 'list_size'/'remaining'
 * count regions rather than object entries, and 'start' is only used by
 * the single-threaded fallback.
 */
static void ll_find_deltas_by_region(struct object_entry *list,
				     struct packing_region *regions,
				     uint32_t start, uint32_t nr)
{
	struct thread_params *p;
	int i, ret, active_threads = 0;
	unsigned int processed = 0;
	uint32_t progress_nr;
	init_threaded_search();

	/* Nothing to compress. */
	if (!nr)
		return;

	/*
	 * Total object count for the progress meter: the last region's
	 * start offset plus its length (regions cover objects contiguously).
	 */
	progress_nr = regions[nr - 1].start + regions[nr - 1].nr;
	if (delta_search_threads <= 1) {
		/* Single-threaded fallback. */
		find_deltas_by_region(list, regions, start, nr);
		cleanup_threaded_search();
		return;
	}

	if (progress > pack_to_stdout)
		fprintf_ln(stderr, _("Path-based delta compression using up to %d threads"),
			   delta_search_threads);
	CALLOC_ARRAY(p, delta_search_threads);

	if (progress)
		progress_state = start_progress(_("Compressing objects by path"),
						progress_nr);
	/* Partition the work amongst work threads. */
	for (i = 0; i < delta_search_threads; i++) {
		/*
		 * Divide the remaining regions evenly among the remaining
		 * threads; this distributes any remainder one region at a
		 * time across the later threads.
		 */
		unsigned sub_size = nr / (delta_search_threads - i);

		p[i].window = window;
		p[i].depth = depth;
		p[i].processed = &processed;
		p[i].working = 1;
		/* Must start at 0: see the data_ready comment in the worker. */
		p[i].data_ready = 0;

		/* This thread owns the next 'sub_size' regions. */
		p[i].regions = regions;
		p[i].list_size = sub_size;
		p[i].remaining = sub_size;

		regions += sub_size;
		nr -= sub_size;
	}

	/* Start work threads. */
	for (i = 0; i < delta_search_threads; i++) {
		/* Skip threads that received no regions (nr < thread count). */
		if (!p[i].list_size)
			continue;
		pthread_mutex_init(&p[i].mutex, NULL);
		pthread_cond_init(&p[i].cond, NULL);
		ret = pthread_create(&p[i].thread, NULL,
				     threaded_find_deltas_by_path, &p[i]);
		if (ret)
			die(_("unable to create thread: %s"), strerror(ret));
		active_threads++;
	}

	/*
	 * Now let's wait for work completion. Each time a thread is done
	 * with its work, we steal half of the remaining work from the
	 * thread with the largest number of unprocessed objects and give
	 * it to that newly idle thread. This ensures good load balancing
	 * until the remaining object list segments are simply too short
	 * to be worth splitting anymore.
	 */
	while (active_threads) {
		struct thread_params *target = NULL;
		struct thread_params *victim = NULL;
		unsigned sub_size = 0;

		progress_lock();
		/* Wait until some worker thread has gone idle. */
		for (;;) {
			for (i = 0; !target && i < delta_search_threads; i++)
				if (!p[i].working)
					target = &p[i];
			if (target)
				break;
			pthread_cond_wait(&progress_cond, &progress_mutex);
		}

		/*
		 * Pick the busiest thread as the victim, but only if its
		 * backlog is large enough (> 2*window regions) to be worth
		 * splitting. NOTE(review): the 2*window threshold is carried
		 * over from the object-entry version in ll_find_deltas();
		 * its tuning for region counts is presumably inherited, not
		 * re-derived.
		 */
		for (i = 0; i < delta_search_threads; i++)
			if (p[i].remaining > 2*window &&
			    (!victim || victim->remaining < p[i].remaining))
				victim = &p[i];
		if (victim) {
			/* Hand the back half of the victim's slice to the idle thread. */
			sub_size = victim->remaining / 2;
			target->regions = victim->regions + victim->remaining - sub_size;
			victim->list_size -= sub_size;
			victim->remaining -= sub_size;
		}
		/* sub_size == 0 (no victim) tells the target to terminate. */
		target->list_size = sub_size;
		target->remaining = sub_size;
		target->working = 1;
		progress_unlock();

		/* Wake the target via its private data_ready handshake. */
		pthread_mutex_lock(&target->mutex);
		target->data_ready = 1;
		pthread_cond_signal(&target->cond);
		pthread_mutex_unlock(&target->mutex);

		if (!sub_size) {
			/* No work left to steal: the target is exiting; reap it. */
			pthread_join(target->thread, NULL);
			pthread_cond_destroy(&target->cond);
			pthread_mutex_destroy(&target->mutex);
			active_threads--;
		}
	}
	cleanup_threaded_search();
	free(p);

	display_progress(progress_state, progress_nr);
	stop_progress(&progress_state);
}
3408+
32513409
static void prepare_pack(int window, int depth)
32523410
{
32533411
struct object_entry **delta_list;
@@ -3273,8 +3431,8 @@ static void prepare_pack(int window, int depth)
32733431
return;
32743432

32753433
if (path_walk)
3276-
find_deltas_by_region(to_pack.objects, to_pack.regions,
3277-
0, to_pack.nr_regions);
3434+
ll_find_deltas_by_region(to_pack.objects, to_pack.regions,
3435+
0, to_pack.nr_regions);
32783436

32793437
ALLOC_ARRAY(delta_list, to_pack.nr_objects);
32803438
nr_deltas = n = 0;

0 commit comments

Comments
 (0)