From 00c0d84e6f0cdb91b0f261e12ed058ad3baa49bf Mon Sep 17 00:00:00 2001 From: Taylor Blau Date: Tue, 24 Feb 2026 13:59:34 -0500 Subject: [PATCH 01/24] midx: mark `get_midx_checksum()` arguments as const To make clear that the function `get_midx_checksum()` does not do anything to modify its argument, mark the MIDX pointer as const. The following commit will rename this function altogether to make clear that it returns the raw bytes of the checksum, not a hex-encoded copy of it. Signed-off-by: Taylor Blau Signed-off-by: Junio C Hamano --- midx.c | 2 +- midx.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/midx.c b/midx.c index a75ea99a0d4bb0..2a6b18954c51fb 100644 --- a/midx.c +++ b/midx.c @@ -24,7 +24,7 @@ void clear_incremental_midx_files_ext(struct odb_source *source, const char *ext int cmp_idx_or_pack_name(const char *idx_or_pack_name, const char *idx_name); -const unsigned char *get_midx_checksum(struct multi_pack_index *m) +const unsigned char *get_midx_checksum(const struct multi_pack_index *m) { return m->data + m->data_len - m->source->odb->repo->hash_algo->rawsz; } diff --git a/midx.h b/midx.h index 6e54d73503d560..7c7e0b5912151b 100644 --- a/midx.h +++ b/midx.h @@ -85,7 +85,7 @@ struct multi_pack_index { #define MIDX_EXT_BITMAP "bitmap" #define MIDX_EXT_MIDX "midx" -const unsigned char *get_midx_checksum(struct multi_pack_index *m); +const unsigned char *get_midx_checksum(const struct multi_pack_index *m); void get_midx_filename(struct odb_source *source, struct strbuf *out); void get_midx_filename_ext(struct odb_source *source, struct strbuf *out, const unsigned char *hash, const char *ext); From de811c26bb97ac324b60883ae4f4db84a83be2f1 Mon Sep 17 00:00:00 2001 From: Taylor Blau Date: Tue, 24 Feb 2026 13:59:39 -0500 Subject: [PATCH 02/24] midx: rename `get_midx_checksum()` to `midx_get_checksum_hash()` Since 541204aabea (Documentation: document naming schema for structs and their functions, 2024-07-30), we have adopted a naming convention for functions that would prefer a name like, say, `midx_get_checksum()` over `get_midx_checksum()`. Adopt this convention throughout the midx.h API. Since this function returns a raw (that is, non-hex encoded) hash, let's suffix the function with "_hash()" to make this clear. As a side effect, this prepares us for the subsequent change which will introduce a "_hex()" variant that encodes the checksum itself. Suggested-by: Patrick Steinhardt Signed-off-by: Taylor Blau Signed-off-by: Junio C Hamano --- midx-write.c | 6 +++--- midx.c | 2 +- midx.h | 2 +- pack-bitmap.c | 8 ++++---- pack-revindex.c | 4 ++-- t/helper/test-read-midx.c | 4 ++-- 6 files changed, 13 insertions(+), 13 deletions(-) diff --git a/midx-write.c b/midx-write.c index 6485cb67068553..73d33752ef1119 100644 --- a/midx-write.c +++ b/midx-write.c @@ -946,7 +946,7 @@ static int link_midx_to_chain(struct multi_pack_index *m) } for (i = 0; i < ARRAY_SIZE(midx_exts); i++) { - const unsigned char *hash = get_midx_checksum(m); + const unsigned char *hash = midx_get_checksum_hash(m); get_midx_filename_ext(m->source, &from, hash, midx_exts[i].non_split); @@ -1151,7 +1151,7 @@ static int write_midx_internal(struct odb_source *source, while (m) { if (flags & MIDX_WRITE_BITMAP && load_midx_revindex(m)) { error(_("could not load reverse index for MIDX %s"), - hash_to_hex_algop(get_midx_checksum(m), + hash_to_hex_algop(midx_get_checksum_hash(m), m->source->odb->repo->hash_algo)); goto cleanup; } @@ -1520,7 +1520,7 @@ static int write_midx_internal(struct odb_source *source, for (uint32_t i = 0; i < ctx.num_multi_pack_indexes_before; i++) { uint32_t j = ctx.num_multi_pack_indexes_before - i - 1; - keep_hashes[j] = xstrdup(hash_to_hex_algop(get_midx_checksum(m), + keep_hashes[j] = xstrdup(hash_to_hex_algop(midx_get_checksum_hash(m), r->hash_algo)); m = m->base_midx; } diff --git a/midx.c b/midx.c index 2a6b18954c51fb..1d072bd99316df 100644 --- a/midx.c +++ b/midx.c @@ -24,7 +24,7 @@ void clear_incremental_midx_files_ext(struct odb_source *source, const char *ext int cmp_idx_or_pack_name(const char *idx_or_pack_name, const char *idx_name); -const unsigned char *get_midx_checksum(const struct multi_pack_index *m) +const unsigned char *midx_get_checksum_hash(const struct multi_pack_index *m) { return m->data + m->data_len - m->source->odb->repo->hash_algo->rawsz; } diff --git a/midx.h b/midx.h index 7c7e0b5912151b..62d6105195f0fb 100644 --- a/midx.h +++ b/midx.h @@ -85,7 +85,7 @@ struct multi_pack_index { #define MIDX_EXT_BITMAP "bitmap" #define MIDX_EXT_MIDX "midx" -const unsigned char *get_midx_checksum(const struct multi_pack_index *m); +const unsigned char *midx_get_checksum_hash(const struct multi_pack_index *m); void get_midx_filename(struct odb_source *source, struct strbuf *out); void get_midx_filename_ext(struct odb_source *source, struct strbuf *out, const unsigned char *hash, const char *ext); diff --git a/pack-bitmap.c b/pack-bitmap.c index 972203f12b6d9b..6307bbdf1e148a 100644 --- a/pack-bitmap.c +++ b/pack-bitmap.c @@ -441,11 +441,11 @@ char *midx_bitmap_filename(struct multi_pack_index *midx) struct strbuf buf = STRBUF_INIT; if (midx->has_chain) get_split_midx_filename_ext(midx->source, &buf, - get_midx_checksum(midx), + midx_get_checksum_hash(midx), MIDX_EXT_BITMAP); else get_midx_filename_ext(midx->source, &buf, - get_midx_checksum(midx), + midx_get_checksum_hash(midx), MIDX_EXT_BITMAP); return strbuf_detach(&buf, NULL); @@ -502,7 +502,7 @@ static int open_midx_bitmap_1(struct bitmap_index *bitmap_git, if (load_bitmap_header(bitmap_git) < 0) goto cleanup; - if (!hasheq(get_midx_checksum(bitmap_git->midx), bitmap_git->checksum, + if (!hasheq(midx_get_checksum_hash(bitmap_git->midx), bitmap_git->checksum, bitmap_repo(bitmap_git)->hash_algo)) { error(_("checksum doesn't match in MIDX and bitmap")); goto cleanup; @@ -2819,7 +2819,7 @@ void test_bitmap_walk(struct rev_info *revs) if (bitmap_is_midx(found)) fprintf_ln(stderr, "Located via MIDX '%s'.", - hash_to_hex_algop(get_midx_checksum(found->midx), + hash_to_hex_algop(midx_get_checksum_hash(found->midx), revs->repo->hash_algo)); else fprintf_ln(stderr, "Located via pack '%s'.", diff --git a/pack-revindex.c b/pack-revindex.c index 56cd803a6798d5..294b802d402653 100644 --- a/pack-revindex.c +++ b/pack-revindex.c @@ -390,11 +390,11 @@ int load_midx_revindex(struct multi_pack_index *m) if (m->has_chain) get_split_midx_filename_ext(m->source, &revindex_name, - get_midx_checksum(m), + midx_get_checksum_hash(m), MIDX_EXT_REV); else get_midx_filename_ext(m->source, &revindex_name, - get_midx_checksum(m), + midx_get_checksum_hash(m), MIDX_EXT_REV); ret = load_revindex_from_disk(m->source->odb->repo->hash_algo, diff --git a/t/helper/test-read-midx.c b/t/helper/test-read-midx.c index 6de5d1665afbfc..b8fefb1a1245e2 100644 --- a/t/helper/test-read-midx.c +++ b/t/helper/test-read-midx.c @@ -34,7 +34,7 @@ static int read_midx_file(const char *object_dir, const char *checksum, return 1; if (checksum) { - while (m && strcmp(hash_to_hex(get_midx_checksum(m)), checksum)) + while (m && strcmp(hash_to_hex(midx_get_checksum_hash(m)), checksum)) m = m->base_midx; if (!m) return 1; @@ -94,7 +94,7 @@ static int read_midx_checksum(const char *object_dir) m = setup_midx(object_dir); if (!m) return 1; - printf("%s\n", hash_to_hex(get_midx_checksum(m))); + printf("%s\n", hash_to_hex(midx_get_checksum_hash(m))); close_midx(m); return 0; From 6e86f679248580bec5c105b37217862f3022b504 Mon Sep 17 00:00:00 2001 From: Taylor Blau Date: Tue, 24 Feb 2026 13:59:44 -0500 Subject: [PATCH 03/24] midx: introduce `midx_get_checksum_hex()` When trying to print out, say, the hexadecimal representation of a MIDX's hash, our code will do something like: hash_to_hex_algop(midx_get_checksum_hash(m), m->source->odb->repo->hash_algo); , which is both cumbersome and repetitive. In fact, all but a handful of callers to `midx_get_checksum_hash()` do exactly the above. Reduce the repetitive nature of calling `midx_get_checksum_hash()` by having it return a pointer into a static buffer containing the above result. For the handful of callers that do need to compare the raw bytes and don't want to deal with an encoded copy (e.g., because they are passing it to hasheq() or similar), they may still rely on `midx_get_checksum_hash()` which returns the raw bytes. Signed-off-by: Taylor Blau Signed-off-by: Junio C Hamano --- midx-write.c | 6 ++---- midx.c | 6 ++++++ midx.h | 1 + pack-bitmap.c | 3 +-- t/helper/test-read-midx.c | 4 ++-- 5 files changed, 12 insertions(+), 8 deletions(-) diff --git a/midx-write.c b/midx-write.c index 73d33752ef1119..13171d7e9c4ae0 100644 --- a/midx-write.c +++ b/midx-write.c @@ -1151,8 +1151,7 @@ static int write_midx_internal(struct odb_source *source, while (m) { if (flags & MIDX_WRITE_BITMAP && load_midx_revindex(m)) { error(_("could not load reverse index for MIDX %s"), - hash_to_hex_algop(midx_get_checksum_hash(m), - m->source->odb->repo->hash_algo)); + midx_get_checksum_hex(m)); goto cleanup; } ctx.num_multi_pack_indexes_before++; @@ -1520,8 +1519,7 @@ static int write_midx_internal(struct odb_source *source, for (uint32_t i = 0; i < ctx.num_multi_pack_indexes_before; i++) { uint32_t j = ctx.num_multi_pack_indexes_before - i - 1; - keep_hashes[j] = xstrdup(hash_to_hex_algop(midx_get_checksum_hash(m), - r->hash_algo)); + keep_hashes[j] = xstrdup(midx_get_checksum_hex(m)); m = m->base_midx; } diff --git a/midx.c b/midx.c index 1d072bd99316df..bae458923232d7 100644 --- a/midx.c +++ b/midx.c @@ -24,6 +24,12 @@ void clear_incremental_midx_files_ext(struct odb_source *source, const char *ext int cmp_idx_or_pack_name(const char *idx_or_pack_name, const char *idx_name); +const char *midx_get_checksum_hex(const struct multi_pack_index *m) +{ + return hash_to_hex_algop(midx_get_checksum_hash(m), + m->source->odb->repo->hash_algo); +} + const unsigned char *midx_get_checksum_hash(const struct multi_pack_index *m) { return m->data + m->data_len - m->source->odb->repo->hash_algo->rawsz; diff --git a/midx.h b/midx.h index 62d6105195f0fb..a39bcc9d03fc9c 100644 --- a/midx.h +++ b/midx.h @@ -85,6 +85,7 @@ struct multi_pack_index { #define MIDX_EXT_BITMAP "bitmap" #define MIDX_EXT_MIDX "midx" +const char *midx_get_checksum_hex(const struct multi_pack_index *m) /* static buffer */; const unsigned char *midx_get_checksum_hash(const struct multi_pack_index *m); void get_midx_filename(struct odb_source *source, struct strbuf *out); void get_midx_filename_ext(struct odb_source *source, struct strbuf *out, diff --git a/pack-bitmap.c b/pack-bitmap.c index 6307bbdf1e148a..afc7fba019779c 100644 --- a/pack-bitmap.c +++ b/pack-bitmap.c @@ -2819,8 +2819,7 @@ void test_bitmap_walk(struct rev_info *revs) if (bitmap_is_midx(found)) fprintf_ln(stderr, "Located via MIDX '%s'.", - hash_to_hex_algop(midx_get_checksum_hash(found->midx), - revs->repo->hash_algo)); + midx_get_checksum_hex(found->midx)); else fprintf_ln(stderr, "Located via pack '%s'.", hash_to_hex_algop(found->pack->hash, diff --git a/t/helper/test-read-midx.c b/t/helper/test-read-midx.c index b8fefb1a1245e2..9d42c587564182 100644 --- a/t/helper/test-read-midx.c +++ b/t/helper/test-read-midx.c @@ -34,7 +34,7 @@ static int read_midx_file(const char *object_dir, const char *checksum, return 1; if (checksum) { - while (m && strcmp(hash_to_hex(midx_get_checksum_hash(m)), checksum)) + while (m && strcmp(midx_get_checksum_hex(m), checksum)) m = m->base_midx; if (!m) return 1; @@ -94,7 +94,7 @@ static int read_midx_checksum(const char *object_dir) m = setup_midx(object_dir); if (!m) return 1; - printf("%s\n", hash_to_hex(midx_get_checksum_hash(m))); + printf("%s\n", midx_get_checksum_hex(m)); close_midx(m); return 0; From 6b8fb17490f637f5651834596b08cdb255e9280a Mon Sep 17 00:00:00 2001 From: Taylor Blau Date: Tue, 24 Feb 2026 13:59:48 -0500 Subject: [PATCH 04/24] builtin/multi-pack-index.c: make '--progress' a common option All multi-pack-index sub-commands (write, verify, repack, and expire) support a '--progress' command-line option, despite not listing it as one of the common options in `common_opts`. As a result each sub-command declares its own `OPT_BIT()` for a "--progress" command-line option. Centralize this within the `common_opts` to avoid re-declaring it in each sub-command. Signed-off-by: Taylor Blau Signed-off-by: Junio C Hamano --- Documentation/git-multi-pack-index.adoc | 2 ++ builtin/multi-pack-index.c | 10 ++-------- 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/Documentation/git-multi-pack-index.adoc b/Documentation/git-multi-pack-index.adoc index 2f642697e9e106..a4550e28bedd5b 100644 --- a/Documentation/git-multi-pack-index.adoc +++ b/Documentation/git-multi-pack-index.adoc @@ -18,6 +18,8 @@ Write or verify a multi-pack-index (MIDX) file. OPTIONS ------- +The following command-line options are applicable to all sub-commands: + --object-dir=:: Use given directory for the location of Git objects. We check `/packs/multi-pack-index` for the current MIDX file, and diff --git a/builtin/multi-pack-index.c b/builtin/multi-pack-index.c index 5f364aa816ba25..ca98d4c3ba3c2f 100644 --- a/builtin/multi-pack-index.c +++ b/builtin/multi-pack-index.c @@ -84,6 +84,8 @@ static struct option common_opts[] = { N_("directory"), N_("object directory containing set of packfile and pack-index pairs"), parse_object_dir), + OPT_BIT(0, "progress", &opts.flags, N_("force progress reporting"), + MIDX_PROGRESS), OPT_END(), }; @@ -138,8 +140,6 @@ static int cmd_multi_pack_index_write(int argc, const char **argv, N_("pack for reuse when computing a multi-pack bitmap")), OPT_BIT(0, "bitmap", &opts.flags, N_("write multi-pack bitmap"), MIDX_WRITE_BITMAP | MIDX_WRITE_REV_INDEX), - OPT_BIT(0, "progress", &opts.flags, - N_("force progress reporting"), MIDX_PROGRESS), OPT_BIT(0, "incremental", &opts.flags, N_("write a new incremental MIDX"), MIDX_WRITE_INCREMENTAL), OPT_BOOL(0, "stdin-packs", &opts.stdin_packs, @@ -200,8 +200,6 @@ static int cmd_multi_pack_index_verify(int argc, const char **argv, { struct option *options; static struct option builtin_multi_pack_index_verify_options[] = { - OPT_BIT(0, "progress", &opts.flags, - N_("force progress reporting"), MIDX_PROGRESS), OPT_END(), }; struct odb_source *source; @@ -231,8 +229,6 @@ static int cmd_multi_pack_index_expire(int argc, const char **argv, { struct option *options; static struct option builtin_multi_pack_index_expire_options[] = { - OPT_BIT(0, "progress", &opts.flags, - N_("force progress reporting"), MIDX_PROGRESS), OPT_END(), }; struct odb_source *source; @@ -264,8 +260,6 @@ static int cmd_multi_pack_index_repack(int argc, const char **argv, static struct option builtin_multi_pack_index_repack_options[] = { OPT_UNSIGNED(0, "batch-size", &opts.batch_size, N_("during repack, collect pack-files of smaller size into a batch that is larger than this size")), - OPT_BIT(0, "progress", &opts.flags, - N_("force progress reporting"), MIDX_PROGRESS), OPT_END(), }; struct odb_source *source; From f775d5b1cf7b3d7ec0b8448684b8710c82013684 Mon Sep 17 00:00:00 2001 From: Taylor Blau Date: Tue, 24 Feb 2026 13:59:52 -0500 Subject: [PATCH 05/24] git-multi-pack-index(1): remove non-existent incompatibility Since fcb2205b774 (midx: implement support for writing incremental MIDX chains, 2024-08-06), the command-line options '--incremental' and '--bitmap' were declared to be incompatible with one another when running 'git multi-pack-index write'. However, since 27afc272c49 (midx: implement writing incremental MIDX bitmaps, 2025-03-20), that incompatibility no longer exists, despite the documentation saying so. Correct this by removing the stale reference to their incompatibility. Signed-off-by: Taylor Blau Signed-off-by: Junio C Hamano --- Documentation/git-multi-pack-index.adoc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/git-multi-pack-index.adoc b/Documentation/git-multi-pack-index.adoc index a4550e28bedd5b..a502819fc38858 100644 --- a/Documentation/git-multi-pack-index.adoc +++ b/Documentation/git-multi-pack-index.adoc @@ -75,7 +75,7 @@ marker). Write an incremental MIDX file containing only objects and packs not present in an existing MIDX layer. Migrates non-incremental MIDXs to incremental ones when - necessary. Incompatible with `--bitmap`. + necessary. -- verify:: From d0e91c128b99ca86e31e4c0e9ef80643039709f9 Mon Sep 17 00:00:00 2001 From: Taylor Blau Date: Tue, 24 Feb 2026 13:59:56 -0500 Subject: [PATCH 06/24] git-multi-pack-index(1): align SYNOPSIS with 'git multi-pack-index -h' Since c39fffc1c90 (tests: start asserting that *.txt SYNOPSIS matches -h output, 2022-10-13), the manual page for 'git multi-pack-index' has a SYNOPSIS section which differs from 'git multi-pack-index -h'. Correct this while also documenting additional options accepted by the 'write' sub-command. Signed-off-by: Taylor Blau Signed-off-by: Junio C Hamano --- Documentation/git-multi-pack-index.adoc | 7 ++++++- builtin/multi-pack-index.c | 5 +++-- t/t0450/adoc-help-mismatches | 1 - 3 files changed, 9 insertions(+), 4 deletions(-) diff --git a/Documentation/git-multi-pack-index.adoc b/Documentation/git-multi-pack-index.adoc index a502819fc38858..164cf1f2291b99 100644 --- a/Documentation/git-multi-pack-index.adoc +++ b/Documentation/git-multi-pack-index.adoc @@ -9,7 +9,12 @@ git-multi-pack-index - Write and verify multi-pack-indexes SYNOPSIS -------- [verse] -'git multi-pack-index' [--object-dir=] [--[no-]bitmap] +'git multi-pack-index' [] write [--preferred-pack=] + [--[no-]bitmap] [--[no-]incremental] [--[no-]stdin-packs] + [--refs-snapshot=] +'git multi-pack-index' [] verify +'git multi-pack-index' [] expire +'git multi-pack-index' [] repack [--batch-size=] DESCRIPTION ----------- diff --git a/builtin/multi-pack-index.c b/builtin/multi-pack-index.c index ca98d4c3ba3c2f..c0c6c1760c0f28 100644 --- a/builtin/multi-pack-index.c +++ b/builtin/multi-pack-index.c @@ -13,8 +13,9 @@ #include "repository.h" #define BUILTIN_MIDX_WRITE_USAGE \ - N_("git multi-pack-index [] write [--preferred-pack=]" \ - "[--refs-snapshot=]") + N_("git multi-pack-index [] write [--preferred-pack=]\n" \ + " [--[no-]bitmap] [--[no-]incremental] [--[no-]stdin-packs]\n" \ + " [--refs-snapshot=]") #define BUILTIN_MIDX_VERIFY_USAGE \ N_("git multi-pack-index [] verify") diff --git a/t/t0450/adoc-help-mismatches b/t/t0450/adoc-help-mismatches index 8ee2d3f7c81502..e8d6c13ccd0333 100644 --- a/t/t0450/adoc-help-mismatches +++ b/t/t0450/adoc-help-mismatches @@ -33,7 +33,6 @@ merge merge-file merge-index merge-one-file -multi-pack-index name-rev notes push From 80437821124d7f529dc6c98daa2df92552c84db0 Mon Sep 17 00:00:00 2001 From: Taylor Blau Date: Tue, 24 Feb 2026 14:00:00 -0500 Subject: [PATCH 07/24] t/t5319-multi-pack-index.sh: fix copy-and-paste error in t5319.39 Commit d4bf1d88b90 (multi-pack-index: verify missing pack, 2018-09-13) adds a new test to the MIDX test script to test how we handle missing packs. While the commit itself describes the test as "verify missing pack[s]", the test itself is actually called "verify packnames out of order", despite that not being what it tests. Likely this was a copy-and-paste of the test immediately above it of the same name. Correct this by renaming the test to match the commit message. Signed-off-by: Taylor Blau Signed-off-by: Junio C Hamano --- t/t5319-multi-pack-index.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/t/t5319-multi-pack-index.sh b/t/t5319-multi-pack-index.sh index faae98c7e76a20..efeab4d22b7c79 100755 --- a/t/t5319-multi-pack-index.sh +++ b/t/t5319-multi-pack-index.sh @@ -517,7 +517,7 @@ test_expect_success 'verify packnames out of order' ' "pack names out of order" ' -test_expect_success 'verify packnames out of order' ' +test_expect_success 'verify missing pack' ' corrupt_midx_and_verify $MIDX_BYTE_PACKNAME_ORDER "a" $objdir \ "failed to load pack" ' From ac10f6aa40355bef7ead9e992e76c3a562d2c01a Mon Sep 17 00:00:00 2001 From: Taylor Blau Date: Tue, 24 Feb 2026 14:00:05 -0500 Subject: [PATCH 08/24] midx-write.c: don't use `pack_perm` when assigning `bitmap_pos` In midx_pack_order(), we compute for each bitmapped pack the first bit to correspond to an object in that pack, along with how many bits were assigned to object(s) in that pack. Initially, each bitmap_nr value is set to zero, and each bitmap_pos value is set to the sentinel BITMAP_POS_UNKNOWN. This is done to ensure that there are no packs who have an unknown bit position but a somehow non-zero number of objects (cf. `write_midx_bitmapped_packs()` in midx-write.c). Once the pack order is fully determined, midx_pack_order() sets the bitmap_pos field for any bitmapped packs to zero if they are still listed as BITMAP_POS_UNKNOWN. However, we enumerate the bitmapped packs in order of `ctx->pack_perm`. This is fine for existing cases, since the only time the `ctx->pack_perm` array holds a value outside of the addressable range of `ctx->info` is when there are expired packs, which only occurs via 'git multi-pack-index expire', which does not support writing MIDX bitmaps. As a result, the range of ctx->pack_perm covers all values in [0, `ctx->nr`), so enumerating in this order isn't an issue. A future change necessary for compaction will complicate this further by introducing a wrapper around the `ctx->pack_perm` array, which turns the given `pack_int_id` into one that is relative to the lower end of the compaction range. As a result, indexing into `ctx->pack_perm` through this helper, say, with "0" will produce a crash when the lower end of the compaction range has >0 pack(s) in its base layer, since the subtraction will wrap around the 32-bit unsigned range, resulting in an uninitialized read. But the process is completely unnecessary in the first place: we are enumerating all values of `ctx->info`, and there is no reason to process them in a different order than they appear in memory. Index `ctx->info` directly to reflect that. Signed-off-by: Taylor Blau Signed-off-by: Junio C Hamano --- midx-write.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/midx-write.c b/midx-write.c index 13171d7e9c4ae0..da9c5a7c29577c 100644 --- a/midx-write.c +++ b/midx-write.c @@ -637,7 +637,7 @@ static uint32_t *midx_pack_order(struct write_midx_context *ctx) pack_order[i] = data[i].nr; } for (i = 0; i < ctx->nr; i++) { - struct pack_info *pack = &ctx->info[ctx->pack_perm[i]]; + struct pack_info *pack = &ctx->info[i]; if (pack->bitmap_pos == BITMAP_POS_UNKNOWN) pack->bitmap_pos = 0; } From 82c905ea6bd79cd2045fea91b67f4c858379bff1 Mon Sep 17 00:00:00 2001 From: Taylor Blau Date: Tue, 24 Feb 2026 14:00:09 -0500 Subject: [PATCH 09/24] midx-write.c: introduce `struct write_midx_opts` In the MIDX writing code, there are four functions which perform some sort of MIDX write operation. They are: - write_midx_file() - write_midx_file_only() - expire_midx_packs() - midx_repack() All of these functions are thin wrappers over `write_midx_internal()`, which implements the bulk of these routines. As a result, the `write_midx_internal()` function takes six arguments. Future commits in this series will want to add additional arguments, and in general this function's signature will be the union of parameters among *all* possible ways to write a MIDX. Instead of adding yet more arguments to this function to support MIDX compaction, introduce a `struct write_midx_opts`, which has the same struct members as `write_midx_internal()`'s arguments. Adding additional fields to the `write_midx_opts` struct is preferable to adding additional arguments to `write_midx_internal()`. This is because the callers below all zero-initialize the struct, so each time we add a new piece of information, we do not have to pass the zero value for it in all other call-sites that do not care about it. For now, no functional changes are included in this patch. Signed-off-by: Taylor Blau Signed-off-by: Junio C Hamano --- midx-write.c | 135 ++++++++++++++++++++++++++++++--------------------- 1 file changed, 81 insertions(+), 54 deletions(-) diff --git a/midx-write.c b/midx-write.c index da9c5a7c29577c..8a54644e427992 100644 --- a/midx-write.c +++ b/midx-write.c @@ -1078,14 +1078,20 @@ static bool midx_needs_update(struct multi_pack_index *midx, struct write_midx_c return needed; } -static int write_midx_internal(struct odb_source *source, - struct string_list *packs_to_include, - struct string_list *packs_to_drop, - const char *preferred_pack_name, - const char *refs_snapshot, - unsigned flags) +struct write_midx_opts { + struct odb_source *source; /* non-optional */ + + struct string_list *packs_to_include; + struct string_list *packs_to_drop; + + const char *preferred_pack_name; + const char *refs_snapshot; + unsigned flags; +}; + +static int write_midx_internal(struct write_midx_opts *opts) { - struct repository *r = source->odb->repo; + struct repository *r = opts->source->odb->repo; struct strbuf midx_name = STRBUF_INIT; unsigned char midx_hash[GIT_MAX_RAWSZ]; uint32_t start_pack; @@ -1106,22 +1112,22 @@ static int write_midx_internal(struct odb_source *source, trace2_region_enter("midx", "write_midx_internal", r); ctx.repo = r; - ctx.source = source; + ctx.source = opts->source; - ctx.incremental = !!(flags & MIDX_WRITE_INCREMENTAL); + ctx.incremental = !!(opts->flags & MIDX_WRITE_INCREMENTAL); if (ctx.incremental) strbuf_addf(&midx_name, "%s/pack/multi-pack-index.d/tmp_midx_XXXXXX", - source->path); + opts->source->path); else - get_midx_filename(source, &midx_name); + get_midx_filename(opts->source, &midx_name); if (safe_create_leading_directories(r, midx_name.buf)) die_errno(_("unable to create leading directories of %s"), midx_name.buf); - if (!packs_to_include || ctx.incremental) { - struct multi_pack_index *m = get_multi_pack_index(source); + if (!opts->packs_to_include || ctx.incremental) { + struct multi_pack_index *m = get_multi_pack_index(opts->source); if (m && !midx_checksum_valid(m)) { warning(_("ignoring existing multi-pack-index; checksum mismatch")); m = NULL; @@ -1136,7 +1142,7 @@ static int write_midx_internal(struct odb_source *source, */ if (ctx.incremental) ctx.base_midx = m; - else if (!packs_to_include) + else if (!opts->packs_to_include) ctx.m = m; } } @@ -1149,7 +1155,7 @@ static int write_midx_internal(struct odb_source *source, if (ctx.incremental) { struct multi_pack_index *m = ctx.base_midx; while (m) { - if (flags & MIDX_WRITE_BITMAP && load_midx_revindex(m)) { + if (opts->flags & MIDX_WRITE_BITMAP && load_midx_revindex(m)) { error(_("could not load reverse index for MIDX %s"), midx_get_checksum_hex(m)); goto cleanup; @@ -1164,18 +1170,18 @@ static int write_midx_internal(struct odb_source *source, start_pack = ctx.nr; ctx.pack_paths_checked = 0; - if (flags & MIDX_PROGRESS) + if (opts->flags & MIDX_PROGRESS) ctx.progress = start_delayed_progress(r, _("Adding packfiles to multi-pack-index"), 0); else ctx.progress = NULL; - ctx.to_include = packs_to_include; + ctx.to_include = opts->packs_to_include; - for_each_file_in_pack_dir(source->path, add_pack_to_midx, &ctx); + for_each_file_in_pack_dir(opts->source->path, add_pack_to_midx, &ctx); stop_progress(&ctx.progress); - if (!packs_to_drop) { + if (!opts->packs_to_drop) { /* * If there is no MIDX then either it doesn't exist, or we're * doing a geometric repack. Try to load it from the source to @@ -1188,7 +1194,7 @@ static int write_midx_internal(struct odb_source *source, if (midx && !midx_needs_update(midx, &ctx)) { struct bitmap_index *bitmap_git; int bitmap_exists; - int want_bitmap = flags & MIDX_WRITE_BITMAP; + int want_bitmap = opts->flags & MIDX_WRITE_BITMAP; bitmap_git = prepare_midx_bitmap_git(midx); bitmap_exists = bitmap_git && bitmap_is_midx(bitmap_git); @@ -1200,7 +1206,7 @@ static int write_midx_internal(struct odb_source *source, * corresponding bitmap (or one wasn't requested). */ if (!want_bitmap) - clear_midx_files_ext(source, "bitmap", NULL); + clear_midx_files_ext(ctx.source, "bitmap", NULL); result = 0; goto cleanup; } @@ -1215,11 +1221,11 @@ static int write_midx_internal(struct odb_source *source, goto cleanup; /* nothing to do */ } - if (preferred_pack_name) { + if (opts->preferred_pack_name) { ctx.preferred_pack_idx = NO_PREFERRED_PACK; for (size_t i = 0; i < ctx.nr; i++) { - if (!cmp_idx_or_pack_name(preferred_pack_name, + if (!cmp_idx_or_pack_name(opts->preferred_pack_name, ctx.info[i].pack_name)) { ctx.preferred_pack_idx = i; break; @@ -1228,9 +1234,9 @@ static int write_midx_internal(struct odb_source *source, if (ctx.preferred_pack_idx == NO_PREFERRED_PACK) warning(_("unknown preferred pack: '%s'"), - preferred_pack_name); + opts->preferred_pack_name); } else if (ctx.nr && - (flags & (MIDX_WRITE_REV_INDEX | MIDX_WRITE_BITMAP))) { + (opts->flags & (MIDX_WRITE_REV_INDEX | MIDX_WRITE_BITMAP))) { struct packed_git *oldest = ctx.info[0].p; ctx.preferred_pack_idx = 0; @@ -1241,7 +1247,7 @@ static int write_midx_internal(struct odb_source *source, */ open_pack_index(oldest); - if (packs_to_drop && packs_to_drop->nr) + if (opts->packs_to_drop && opts->packs_to_drop->nr) BUG("cannot write a MIDX bitmap during expiration"); /* @@ -1303,20 +1309,21 @@ static int write_midx_internal(struct odb_source *source, QSORT(ctx.info, ctx.nr, pack_info_compare); - if (packs_to_drop && packs_to_drop->nr) { + if (opts->packs_to_drop && opts->packs_to_drop->nr) { size_t drop_index = 0; int missing_drops = 0; - for (size_t i = 0; i < ctx.nr && drop_index < packs_to_drop->nr; i++) { + for (size_t i = 0; + i < ctx.nr && drop_index < opts->packs_to_drop->nr; i++) { int cmp = strcmp(ctx.info[i].pack_name, - packs_to_drop->items[drop_index].string); + opts->packs_to_drop->items[drop_index].string); if (!cmp) { drop_index++; ctx.info[i].expired = 1; } else if (cmp > 0) { error(_("did not see pack-file %s to drop"), - packs_to_drop->items[drop_index].string); + opts->packs_to_drop->items[drop_index].string); drop_index++; missing_drops++; i--; @@ -1353,8 +1360,8 @@ static int write_midx_internal(struct odb_source *source, } /* Check that the preferred pack wasn't expired (if given). */ - if (preferred_pack_name) { - struct pack_info *preferred = bsearch(preferred_pack_name, + if (opts->preferred_pack_name) { + struct pack_info *preferred = bsearch(opts->preferred_pack_name, ctx.info, ctx.nr, sizeof(*ctx.info), idx_or_pack_name_cmp); @@ -1362,7 +1369,7 @@ static int write_midx_internal(struct odb_source *source, uint32_t perm = ctx.pack_perm[preferred->orig_pack_int_id]; if (perm == PACK_EXPIRED) warning(_("preferred pack '%s' is expired"), - preferred_pack_name); + opts->preferred_pack_name); } } @@ -1376,15 +1383,15 @@ static int write_midx_internal(struct odb_source *source, } if (!ctx.entries_nr) { - if (flags & MIDX_WRITE_BITMAP) + if (opts->flags & MIDX_WRITE_BITMAP) warning(_("refusing to write multi-pack .bitmap without any objects")); - flags &= ~(MIDX_WRITE_REV_INDEX | MIDX_WRITE_BITMAP); + opts->flags &= ~(MIDX_WRITE_REV_INDEX | MIDX_WRITE_BITMAP); } if (ctx.incremental) { struct strbuf lock_name = STRBUF_INIT; - get_midx_chain_filename(source, &lock_name); + get_midx_chain_filename(opts->source, &lock_name); hold_lock_file_for_update(&lk, lock_name.buf, LOCK_DIE_ON_ERROR); strbuf_release(&lock_name); @@ -1427,7 +1434,7 @@ static int write_midx_internal(struct odb_source *source, MIDX_CHUNK_LARGE_OFFSET_WIDTH), write_midx_large_offsets); - if (flags & (MIDX_WRITE_REV_INDEX | MIDX_WRITE_BITMAP)) { + if (opts->flags & (MIDX_WRITE_REV_INDEX | MIDX_WRITE_BITMAP)) { ctx.pack_order = midx_pack_order(&ctx); add_chunk(cf, MIDX_CHUNKID_REVINDEX, st_mult(ctx.entries_nr, sizeof(uint32_t)), @@ -1445,11 +1452,11 @@ static int write_midx_internal(struct odb_source *source, CSUM_FSYNC | CSUM_HASH_IN_STREAM); free_chunkfile(cf); - if (flags & MIDX_WRITE_REV_INDEX && + if (opts->flags & MIDX_WRITE_REV_INDEX && git_env_bool("GIT_TEST_MIDX_WRITE_REV", 0)) write_midx_reverse_index(&ctx, midx_hash); - if (flags & MIDX_WRITE_BITMAP) { + if (opts->flags & MIDX_WRITE_BITMAP) { struct packing_data pdata; struct commit_stack commits = COMMIT_STACK_INIT; @@ -1458,7 +1465,7 @@ static int write_midx_internal(struct odb_source *source, prepare_midx_packing_data(&pdata, &ctx); - find_commits_for_midx_bitmap(&commits, refs_snapshot, &ctx); + find_commits_for_midx_bitmap(&commits, opts->refs_snapshot, &ctx); /* * The previous steps translated the information from @@ -1469,8 +1476,8 @@ static int write_midx_internal(struct odb_source *source, FREE_AND_NULL(ctx.entries); ctx.entries_nr = 0; - if (write_midx_bitmap(&ctx, midx_hash, &pdata, - commits.items, commits.nr, flags) < 0) { + if (write_midx_bitmap(&ctx, midx_hash, &pdata, commits.items, + commits.nr, opts->flags) < 0) { error(_("could not write multi-pack bitmap")); clear_packing_data(&pdata); commit_stack_clear(&commits); @@ -1503,7 +1510,7 @@ static int write_midx_internal(struct odb_source *source, if (link_midx_to_chain(ctx.base_midx) < 0) goto cleanup; - get_split_midx_filename_ext(source, &final_midx_name, + get_split_midx_filename_ext(opts->source, &final_midx_name, midx_hash, MIDX_EXT_MIDX); if (rename_tempfile(&incr, final_midx_name.buf) < 0) { @@ -1536,7 +1543,7 @@ static int write_midx_internal(struct odb_source *source, if (commit_lock_file(&lk) < 0) die_errno(_("could not write multi-pack-index")); - clear_midx_files(source, keep_hashes, + clear_midx_files(opts->source, keep_hashes, ctx.num_multi_pack_indexes_before + 1, ctx.incremental); result = 0; @@ -1571,9 +1578,14 @@ int write_midx_file(struct odb_source *source, const char *preferred_pack_name, const char *refs_snapshot, unsigned flags) { - return write_midx_internal(source, NULL, NULL, - preferred_pack_name, refs_snapshot, - flags); + struct write_midx_opts opts = { + .source = source, + .preferred_pack_name = preferred_pack_name, + .refs_snapshot = refs_snapshot, + .flags = flags, + }; + + return write_midx_internal(&opts); } int write_midx_file_only(struct odb_source *source, @@ -1581,8 +1593,15 @@ int write_midx_file_only(struct odb_source *source, const char *preferred_pack_name, const char *refs_snapshot, unsigned flags) { - return write_midx_internal(source, packs_to_include, NULL, - preferred_pack_name, refs_snapshot, flags); + struct write_midx_opts opts = { + .source = source, + .packs_to_include = packs_to_include, + .preferred_pack_name = preferred_pack_name, + .refs_snapshot = refs_snapshot, + .flags = flags, + }; + + return write_midx_internal(&opts); } int expire_midx_packs(struct odb_source *source, unsigned flags) @@ -1641,9 +1660,14 @@ int expire_midx_packs(struct odb_source *source, unsigned flags) free(count); - if (packs_to_drop.nr) - result = write_midx_internal(source, NULL, - &packs_to_drop, NULL, NULL, flags); + if (packs_to_drop.nr) { + struct write_midx_opts opts = { + .source = source, + .packs_to_drop = &packs_to_drop, + .flags = flags & MIDX_PROGRESS, + }; + result = write_midx_internal(&opts); + } string_list_clear(&packs_to_drop, 0); @@ -1776,6 +1800,10 @@ int midx_repack(struct odb_source *source, size_t batch_size, unsigned flags) struct child_process cmd = CHILD_PROCESS_INIT; FILE *cmd_in; struct multi_pack_index *m = get_multi_pack_index(source); + struct write_midx_opts opts = { + .source = source, + .flags = flags, + }; /* * When updating the default for these configuration @@ -1850,8 +1878,7 @@ int midx_repack(struct odb_source *source, size_t batch_size, unsigned flags) goto cleanup; } - result = write_midx_internal(source, NULL, NULL, NULL, NULL, - flags); + result = write_midx_internal(&opts); cleanup: free(include_pack); From b2ec8e90c201d2395b67f89f3bf38bb472e43460 Mon Sep 17 00:00:00 2001 From: Taylor Blau Date: Tue, 24 Feb 2026 14:00:13 -0500 Subject: [PATCH 10/24] midx: do not require packs to be sorted in lexicographic order The MIDX file format currently requires that pack files be identified by the lexicographic ordering of their names (that is, a pack having a checksum beginning with "abc" would have a numeric pack_int_id which is smaller than the same value for a pack beginning with "bcd"). As a result, it is impossible to combine adjacent MIDX layers together without permuting bits from bitmaps that are in more recent layer(s). To see why, consider the following example: | packs | preferred pack --------+-------------+--------------- MIDX #0 | { X, Y, Z } | Y MIDX #1 | { A, B, C } | B MIDX #2 | { D, E, F } | D , where MIDX #2's base MIDX is MIDX #1, and so on. Suppose that we want to combine MIDX layers #0 and #1, to create a new layer #0' containing the packs from both layers. With the original three MIDX layers, objects are laid out in the bitmap in the order they appear in their source pack, and the packs themselves are arranged according to the pseudo-pack order. In this case, that ordering is Y, X, Z, B, A, C. But recall that the pseudo-pack ordering is defined by the order that packs appear in the MIDX, with the exception of the preferred pack, which sorts ahead of all other packs regardless of its position within the MIDX. In the above example, that means that pack 'Y' could be placed anywhere (so long as it is designated as preferred), however, all other packs must be placed in the location listed above. Because that ordering isn't sorted lexicographically, it is impossible to compact MIDX layers in the above configuration without permuting the object-to-bit-position mapping. Changing this mapping would affect all bitmaps belonging to newer layers, rendering the bitmaps associated with MIDX #2 unreadable. One of the goals of MIDX compaction is that we are able to shrink the length of the MIDX chain *without* invalidating bitmaps that belong to newer layers, and the lexicographic ordering constraint is at odds with this goal. However, packs do not *need* to be lexicographically ordered within the MIDX. As far as I can gather, the only reason they are sorted lexically is to make it possible to perform a binary search over the pack names in a MIDX, necessary to make `midx_contains_pack()`'s performance logarithmic in the number of packs rather than linear. Relax this constraint by allowing MIDX writes to proceed with packs that are not arranged in lexicographic order. `midx_contains_pack()` will lazily instantiate a `pack_names_sorted` array on the MIDX, which will be used to implement the binary search over pack names. This change produces MIDXs which may not be correctly read with external tools or older versions of Git. Though older versions of Git know how to gracefully degrade and ignore any MIDX(s) they consider corrupt, external tools may not be as robust. To avoid unintentionally breaking any such tools, guard this change behind a version bump in the MIDX's on-disk format. Signed-off-by: Taylor Blau Signed-off-by: Junio C Hamano --- Documentation/gitformat-pack.adoc | 8 ++++++-- midx-write.c | 26 ++++++++++++++++++++++---- midx.c | 31 ++++++++++++++++++++++++++++--- midx.h | 4 +++- t/t5319-multi-pack-index.sh | 16 ++++++++++------ 5 files changed, 69 insertions(+), 16 deletions(-) diff --git a/Documentation/gitformat-pack.adoc b/Documentation/gitformat-pack.adoc index 1b4db4aa611e83..3416edceab82e9 100644 --- a/Documentation/gitformat-pack.adoc +++ b/Documentation/gitformat-pack.adoc @@ -374,7 +374,9 @@ HEADER: The signature is: {'M', 'I', 'D', 'X'} 1-byte version number: - Git only writes or recognizes version 1. + Git writes the version specified by the "midx.version" + configuration option, which defaults to 2. It recognizes + both versions 1 and 2. 1-byte Object Id Version We infer the length of object IDs (OIDs) from this value: @@ -413,7 +415,9 @@ CHUNK DATA: strings. There is no extra padding between the filenames, and they are listed in lexicographic order. The chunk itself is padded at the end with between 0 and 3 NUL bytes to make the - chunk size a multiple of 4 bytes. + chunk size a multiple of 4 bytes. Version 1 MIDXs are required to + list their packs in lexicographic order, but version 2 MIDXs may + list their packs in any arbitrary order. Bitmapped Packfiles (ID: {'B', 'T', 'M', 'P'}) Stores a table of two 4-byte unsigned integers in network order. diff --git a/midx-write.c b/midx-write.c index 8a54644e427992..5c8700065a1d16 100644 --- a/midx-write.c +++ b/midx-write.c @@ -36,10 +36,13 @@ extern int cmp_idx_or_pack_name(const char *idx_or_pack_name, static size_t write_midx_header(const struct git_hash_algo *hash_algo, struct hashfile *f, unsigned char num_chunks, - uint32_t num_packs) + uint32_t num_packs, int version) { + if (version != MIDX_VERSION_V1 && version != MIDX_VERSION_V2) + BUG("unexpected MIDX version: %d", version); + hashwrite_be32(f, MIDX_SIGNATURE); - hashwrite_u8(f, MIDX_VERSION); + hashwrite_u8(f, version); hashwrite_u8(f, oid_version(hash_algo)); hashwrite_u8(f, num_chunks); hashwrite_u8(f, 0); /* unused */ @@ -105,6 +108,8 @@ struct write_midx_context { uint32_t preferred_pack_idx; + int version; /* must be MIDX_VERSION_V1 or _V2 */ + int incremental; uint32_t num_multi_pack_indexes_before; @@ -410,7 +415,9 @@ static int write_midx_pack_names(struct hashfile *f, void *data) if (ctx->info[i].expired) continue; - if (i && strcmp(ctx->info[i].pack_name, ctx->info[i - 1].pack_name) <= 0) + if (ctx->version == MIDX_VERSION_V1 && + i && strcmp(ctx->info[i].pack_name, + ctx->info[i - 1].pack_name) <= 0) BUG("incorrect pack-file order: %s before %s", ctx->info[i - 1].pack_name, ctx->info[i].pack_name); @@ -1025,6 +1032,12 @@ static bool midx_needs_update(struct multi_pack_index *midx, struct write_midx_c if (!midx_checksum_valid(midx)) goto out; + /* + * If the version differs, we need to update. + */ + if (midx->version != ctx->version) + goto out; + /* * Ignore incremental updates for now. The assumption is that any * incremental update would be either empty (in which case we will bail @@ -1100,6 +1113,7 @@ static int write_midx_internal(struct write_midx_opts *opts) struct tempfile *incr; struct write_midx_context ctx = { .preferred_pack_idx = NO_PREFERRED_PACK, + .version = MIDX_VERSION_V2, }; struct multi_pack_index *midx_to_free = NULL; int bitmapped_packs_concat_len = 0; @@ -1114,6 +1128,10 @@ static int write_midx_internal(struct write_midx_opts *opts) ctx.repo = r; ctx.source = opts->source; + repo_config_get_int(ctx.repo, "midx.version", &ctx.version); + if (ctx.version != MIDX_VERSION_V1 && ctx.version != MIDX_VERSION_V2) + die(_("unknown MIDX version: %d"), ctx.version); + ctx.incremental = !!(opts->flags & MIDX_WRITE_INCREMENTAL); if (ctx.incremental) @@ -1445,7 +1463,7 @@ static int write_midx_internal(struct write_midx_opts *opts) } write_midx_header(r->hash_algo, f, get_num_chunks(cf), - ctx.nr - dropped_packs); + ctx.nr - dropped_packs, ctx.version); write_chunkfile(cf, &ctx); finalize_hashfile(f, midx_hash, FSYNC_COMPONENT_PACK_METADATA, diff --git a/midx.c b/midx.c index bae458923232d7..c1b9658240d051 100644 --- a/midx.c +++ b/midx.c @@ -149,7 +149,7 @@ static struct multi_pack_index *load_multi_pack_index_one(struct odb_source *sou m->signature, MIDX_SIGNATURE); m->version = m->data[MIDX_BYTE_FILE_VERSION]; - if (m->version != MIDX_VERSION) + if (m->version != MIDX_VERSION_V1 && m->version != MIDX_VERSION_V2) die(_("multi-pack-index version %d not recognized"), m->version); @@ -210,7 +210,8 @@ static struct multi_pack_index *load_multi_pack_index_one(struct odb_source *sou die(_("multi-pack-index pack-name chunk is too short")); cur_pack_name = end + 1; - if (i && strcmp(m->pack_names[i], m->pack_names[i - 1]) <= 0) + if (m->version == MIDX_VERSION_V1 && + i && strcmp(m->pack_names[i], m->pack_names[i - 1]) <= 0) die(_("multi-pack-index pack names out of order: '%s' before '%s'"), m->pack_names[i - 1], m->pack_names[i]); @@ -411,6 +412,7 @@ void close_midx(struct multi_pack_index *m) } FREE_AND_NULL(m->packs); FREE_AND_NULL(m->pack_names); + FREE_AND_NULL(m->pack_names_sorted); free(m); } @@ -655,17 +657,40 @@ int cmp_idx_or_pack_name(const char *idx_or_pack_name, return strcmp(idx_or_pack_name, idx_name); } + +static int midx_pack_names_cmp(const void *a, const void *b, void *m_) +{ + struct multi_pack_index *m = m_; + return strcmp(m->pack_names[*(const size_t *)a], + m->pack_names[*(const size_t *)b]); +} + static int midx_contains_pack_1(struct multi_pack_index *m, const char *idx_or_pack_name) { uint32_t first = 0, last = m->num_packs; + if (m->version == MIDX_VERSION_V2 && !m->pack_names_sorted) { + uint32_t i; + + ALLOC_ARRAY(m->pack_names_sorted, m->num_packs); + + for (i = 0; i < m->num_packs; i++) + m->pack_names_sorted[i] = i; + + QSORT_S(m->pack_names_sorted, m->num_packs, midx_pack_names_cmp, + m); + } + while (first < last) { uint32_t mid = first + (last - first) / 2; const char *current; int cmp; - current = m->pack_names[mid]; + if (m->pack_names_sorted) + current = m->pack_names[m->pack_names_sorted[mid]]; + else + current = m->pack_names[mid]; cmp = cmp_idx_or_pack_name(idx_or_pack_name, current); if (!cmp) return 1; diff --git a/midx.h b/midx.h index a39bcc9d03fc9c..aa99a6cb2152f7 100644 --- a/midx.h +++ b/midx.h @@ -11,7 +11,8 @@ struct git_hash_algo; struct odb_source; #define MIDX_SIGNATURE 0x4d494458 /* "MIDX" */ -#define MIDX_VERSION 1 +#define MIDX_VERSION_V1 1 +#define MIDX_VERSION_V2 2 #define MIDX_BYTE_FILE_VERSION 4 #define MIDX_BYTE_HASH_VERSION 5 #define MIDX_BYTE_NUM_CHUNKS 6 @@ -71,6 +72,7 @@ struct multi_pack_index { uint32_t num_packs_in_base; const char **pack_names; + size_t *pack_names_sorted; struct packed_git **packs; }; diff --git a/t/t5319-multi-pack-index.sh b/t/t5319-multi-pack-index.sh index efeab4d22b7c79..250d21dbd67825 100755 --- a/t/t5319-multi-pack-index.sh +++ b/t/t5319-multi-pack-index.sh @@ -21,7 +21,7 @@ midx_read_expect () { EXTRA_CHUNKS="$5" { cat <<-EOF && - header: 4d494458 1 $HASH_LEN $NUM_CHUNKS $NUM_PACKS + header: 4d494458 2 $HASH_LEN $NUM_CHUNKS $NUM_PACKS chunks: pack-names oid-fanout oid-lookup object-offsets$EXTRA_CHUNKS num_objects: $NUM_OBJECTS packs: @@ -512,11 +512,6 @@ test_expect_success 'verify invalid chunk offset' ' "improper chunk offset(s)" ' -test_expect_success 'verify packnames out of order' ' - corrupt_midx_and_verify $MIDX_BYTE_PACKNAME_ORDER "z" $objdir \ - "pack names out of order" -' - test_expect_success 'verify missing pack' ' corrupt_midx_and_verify $MIDX_BYTE_PACKNAME_ORDER "a" $objdir \ "failed to load pack" @@ -578,6 +573,15 @@ test_expect_success 'verify incorrect checksum' ' $objdir "incorrect checksum" ' +test_expect_success 'setup for v1-specific fsck tests' ' + git -c midx.version=1 multi-pack-index write +' + +test_expect_success 'verify packnames out of order (v1)' ' + corrupt_midx_and_verify $MIDX_BYTE_PACKNAME_ORDER "z" $objdir \ + "pack names out of order" +' + test_expect_success 'repack progress off for redirected stderr' ' GIT_PROGRESS_DELAY=0 git multi-pack-index --object-dir=$objdir repack 2>err && test_line_count = 0 err From 4f8543255efc3cc1d5247fa16a9f7597ad565993 Mon Sep 17 00:00:00 2001 From: Taylor Blau Date: Tue, 24 Feb 2026 14:00:17 -0500 Subject: [PATCH 11/24] midx-write.c: introduce `midx_pack_perm()` helper The `ctx->pack_perm` array can be considered as a permutation between the original `pack_int_id` of some given pack to its position in the `ctx->info` array containing all packs. Today we can always index into this array with any known `pack_int_id`, since there is never a `pack_int_id` which is greater than or equal to the value `ctx->nr`. That is not necessarily the case with MIDX compaction. For example, suppose we have a MIDX chain with three layers, each containing three packs. The base of the MIDX chain will have packs with IDs 0, 1, and 2, the next layer 3, 4, and 5, and so on. If we are compacting the topmost two layers, we'll have input `pack_int_id` values between [3, 8], but `ctx->nr` will only be 6. In that example, if we want to know where the pack whose original `pack_int_id` value was, say, 7, we would compute `ctx->pack_perm[7]`, leading to an uninitialized read, since there are only 6 entries allocated in that array. To address this, there are a couple of options: - We could allocate enough entries in `ctx->pack_perm` to accommodate the largest `orig_pack_int_id` value. - Or, we could internally shift the input values by the number of packs in the base layer of the lower end of the MIDX compaction range. This patch prepare us to take the latter approach, since it does not allocate more memory than strictly necessary. (In our above example, the base of the lower end of the compaction range is the first MIDX layer (having three packs), so we would end up indexing `ctx->pack_perm[7-3]`, which is a valid read.) Note that this patch does not actually implement that approach yet, but merely performs a behavior-preserving refactoring which will make the change easier to carry out in the future. Signed-off-by: Taylor Blau Signed-off-by: Junio C Hamano --- midx-write.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/midx-write.c b/midx-write.c index 5c8700065a1d16..9d345fb4737a04 100644 --- a/midx-write.c +++ b/midx-write.c @@ -119,6 +119,12 @@ struct write_midx_context { struct odb_source *source; }; +static uint32_t midx_pack_perm(struct write_midx_context *ctx, + uint32_t orig_pack_int_id) +{ + return ctx->pack_perm[orig_pack_int_id]; +} + static int should_include_pack(const struct write_midx_context *ctx, const char *file_name) { @@ -521,12 +527,12 @@ static int write_midx_object_offsets(struct hashfile *f, for (i = 0; i < ctx->entries_nr; i++) { struct pack_midx_entry *obj = list++; - if (ctx->pack_perm[obj->pack_int_id] == PACK_EXPIRED) + if (midx_pack_perm(ctx, obj->pack_int_id) == PACK_EXPIRED) BUG("object %s is in an expired pack with int-id %d", oid_to_hex(&obj->oid), obj->pack_int_id); - hashwrite_be32(f, ctx->pack_perm[obj->pack_int_id]); + hashwrite_be32(f, midx_pack_perm(ctx, obj->pack_int_id)); if (ctx->large_offsets_needed && obj->offset >> 31) hashwrite_be32(f, MIDX_LARGE_OFFSET_NEEDED | nr_large_offset++); @@ -627,7 +633,7 @@ static uint32_t *midx_pack_order(struct write_midx_context *ctx) for (i = 0; i < ctx->entries_nr; i++) { struct pack_midx_entry *e = &ctx->entries[i]; data[i].nr = i; - data[i].pack = ctx->pack_perm[e->pack_int_id]; + data[i].pack = midx_pack_perm(ctx, e->pack_int_id); if (!e->preferred) data[i].pack |= (1U << 31); data[i].offset = e->offset; @@ -637,7 +643,7 @@ static uint32_t *midx_pack_order(struct write_midx_context *ctx) for (i = 0; i < ctx->entries_nr; i++) { struct pack_midx_entry *e = &ctx->entries[data[i].nr]; - struct pack_info *pack = &ctx->info[ctx->pack_perm[e->pack_int_id]]; + struct pack_info *pack = &ctx->info[midx_pack_perm(ctx, e->pack_int_id)]; if (pack->bitmap_pos == BITMAP_POS_UNKNOWN) pack->bitmap_pos = i + base_objects; pack->bitmap_nr++; @@ -698,7 +704,7 @@ static void prepare_midx_packing_data(struct packing_data *pdata, struct object_entry *to = packlist_alloc(pdata, &from->oid); oe_set_in_pack(pdata, to, - ctx->info[ctx->pack_perm[from->pack_int_id]].p); + ctx->info[midx_pack_perm(ctx, from->pack_int_id)].p); } trace2_region_leave("midx", "prepare_midx_packing_data", ctx->repo); @@ -1384,7 +1390,7 @@ static int write_midx_internal(struct write_midx_opts *opts) sizeof(*ctx.info), idx_or_pack_name_cmp); if (preferred) { - uint32_t perm = ctx.pack_perm[preferred->orig_pack_int_id]; + uint32_t perm = midx_pack_perm(&ctx, preferred->orig_pack_int_id); if (perm == PACK_EXPIRED) warning(_("preferred pack '%s' is expired"), opts->preferred_pack_name); From 5f3e7f7279da72b20f99a2bcd26d146f7edaef9d Mon Sep 17 00:00:00 2001 From: Taylor Blau Date: Tue, 24 Feb 2026 14:00:21 -0500 Subject: [PATCH 12/24] midx-write.c: extract `fill_pack_from_midx()` When filling packs from an existing MIDX, `fill_packs_from_midx()` handles preparing a MIDX'd pack, and reading out its pack name from the existing MIDX. MIDX compaction will want to perform an identical operation, though the caller will look quite different than `fill_packs_from_midx()`. To reduce any future code duplication, extract `fill_pack_from_midx()` from `fill_packs_from_midx()` to prepare to call our new helper function in a future change. Signed-off-by: Taylor Blau Signed-off-by: Junio C Hamano --- midx-write.c | 27 +++++++++++++++++++++------ 1 file changed, 21 insertions(+), 6 deletions(-) diff --git a/midx-write.c b/midx-write.c index 9d345fb4737a04..c54113cdc8436f 100644 --- a/midx-write.c +++ b/midx-write.c @@ -913,6 +913,21 @@ static int write_midx_bitmap(struct write_midx_context *ctx, return ret; } +static int fill_pack_from_midx(struct pack_info *info, + struct multi_pack_index *m, + uint32_t pack_int_id) +{ + if (prepare_midx_pack(m, pack_int_id)) + return error(_("could not load pack %d"), pack_int_id); + + fill_pack_info(info, + m->packs[pack_int_id - m->num_packs_in_base], + m->pack_names[pack_int_id - m->num_packs_in_base], + pack_int_id); + + return 0; +} + static int fill_packs_from_midx(struct write_midx_context *ctx) { struct multi_pack_index *m; @@ -921,13 +936,13 @@ static int fill_packs_from_midx(struct write_midx_context *ctx) uint32_t i; for (i = 0; i < m->num_packs; i++) { - if (prepare_midx_pack(m, m->num_packs_in_base + i)) - return error(_("could not load pack")); - ALLOC_GROW(ctx->info, ctx->nr + 1, ctx->alloc); - fill_pack_info(&ctx->info[ctx->nr++], m->packs[i], - m->pack_names[i], - m->num_packs_in_base + i); + + if (fill_pack_from_midx(&ctx->info[ctx->nr], m, + m->num_packs_in_base + i) < 0) + return -1; + + ctx->nr++; } } return 0; From 93c67df7511057e7c8fda169671a44322c04c2d0 Mon Sep 17 00:00:00 2001 From: Taylor Blau Date: Tue, 24 Feb 2026 14:00:26 -0500 Subject: [PATCH 13/24] midx-write.c: enumerate `pack_int_id` values directly Our `midx-write.c::fill_packs_from_midx()` function currently enumerates the range [0, m->num_packs), and then shifts its index variable up by `m->num_packs_in_base` to produce a valid `pack_int_id`. Instead, directly enumerate the range: [m->num_packs_in_base, m->num_packs_in_base + m->num_packs) , which are the original pack_int_ids themselves as opposed to the indexes of those packs relative to the MIDX layer they are contained within. Signed-off-by: Taylor Blau Signed-off-by: Junio C Hamano --- midx-write.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/midx-write.c b/midx-write.c index c54113cdc8436f..80334914d3e384 100644 --- a/midx-write.c +++ b/midx-write.c @@ -935,11 +935,11 @@ static int fill_packs_from_midx(struct write_midx_context *ctx) for (m = ctx->m; m; m = m->base_midx) { uint32_t i; - for (i = 0; i < m->num_packs; i++) { + for (i = m->num_packs_in_base; + i < m->num_packs_in_base + m->num_packs; i++) { ALLOC_GROW(ctx->info, ctx->nr + 1, ctx->alloc); - if (fill_pack_from_midx(&ctx->info[ctx->nr], m, - m->num_packs_in_base + i) < 0) + if (fill_pack_from_midx(&ctx->info[ctx->nr], m, i) < 0) return -1; ctx->nr++; From 9aea84c4e78b462fe55d142e67997b90d8ee3055 Mon Sep 17 00:00:00 2001 From: Taylor Blau Date: Tue, 24 Feb 2026 14:00:30 -0500 Subject: [PATCH 14/24] midx-write.c: factor fanout layering from `compute_sorted_entries()` When computing the set of objects to appear in a MIDX, we use compute_sorted_entries(), which handles objects from various existing sources one fanout layer at a time. The process for computing this set is slightly different during MIDX compaction, so factor out the existing functionality into its own routine to prevent `compute_sorted_entries()` from becoming too difficult to read. Signed-off-by: Taylor Blau Signed-off-by: Junio C Hamano --- midx-write.c | 42 +++++++++++++++++++++++++----------------- 1 file changed, 25 insertions(+), 17 deletions(-) diff --git a/midx-write.c b/midx-write.c index 80334914d3e384..ca2469213e699f 100644 --- a/midx-write.c +++ b/midx-write.c @@ -328,6 +328,30 @@ static void midx_fanout_add_pack_fanout(struct midx_fanout *fanout, } } +static void midx_fanout_add(struct midx_fanout *fanout, + struct write_midx_context *ctx, + uint32_t start_pack, + uint32_t cur_fanout) +{ + uint32_t cur_pack; + + if (ctx->m && !ctx->incremental) + midx_fanout_add_midx_fanout(fanout, ctx->m, cur_fanout, + ctx->preferred_pack_idx); + + for (cur_pack = start_pack; cur_pack < ctx->nr; cur_pack++) { + int preferred = cur_pack == ctx->preferred_pack_idx; + midx_fanout_add_pack_fanout(fanout, ctx->info, cur_pack, + preferred, cur_fanout); + } + + if (ctx->preferred_pack_idx != NO_PREFERRED_PACK && + ctx->preferred_pack_idx < start_pack) + midx_fanout_add_pack_fanout(fanout, ctx->info, + ctx->preferred_pack_idx, 1, + cur_fanout); +} + /* * It is possible to artificially get into a state where there are many * duplicate copies of objects. That can create high memory pressure if @@ -364,23 +388,7 @@ static void compute_sorted_entries(struct write_midx_context *ctx, for (cur_fanout = 0; cur_fanout < 256; cur_fanout++) { fanout.nr = 0; - if (ctx->m && !ctx->incremental) - midx_fanout_add_midx_fanout(&fanout, ctx->m, cur_fanout, - ctx->preferred_pack_idx); - - for (cur_pack = start_pack; cur_pack < ctx->nr; cur_pack++) { - int preferred = cur_pack == ctx->preferred_pack_idx; - midx_fanout_add_pack_fanout(&fanout, - ctx->info, cur_pack, - preferred, cur_fanout); - } - - if (ctx->preferred_pack_idx != NO_PREFERRED_PACK && - ctx->preferred_pack_idx < start_pack) - midx_fanout_add_pack_fanout(&fanout, ctx->info, - ctx->preferred_pack_idx, 1, - cur_fanout); - + midx_fanout_add(&fanout, ctx, start_pack, cur_fanout); midx_fanout_sort(&fanout); /* From dedf71f0b1b9a64d5aa7558ef45b26166d8d66fc Mon Sep 17 00:00:00 2001 From: Taylor Blau Date: Tue, 24 Feb 2026 14:00:36 -0500 Subject: [PATCH 15/24] t/helper/test-read-midx.c: plug memory leak when selecting layer Though our 'read-midx' test tool is capable of printing information about a single MIDX layer identified by its checksum, no caller in our test suite exercises this path. Unfortunately, there is a memory leak lurking in this (currently) unused path that would otherwise be exposed by the following commit. This occurs when providing a MIDX layer checksum other than the tip. As we walk over the MIDX chain trying to find the matching layer, we drop our reference to the top-most MIDX layer. Thus, our call to 'close_midx()' later on leaks memory between the top-most MIDX layer and the MIDX layer immediately following the specified one. Plug this leak by holding a reference to the tip of the MIDX chain, and ensure that we call `close_midx()` before terminating the test tool. Signed-off-by: Taylor Blau Signed-off-by: Junio C Hamano --- t/helper/test-read-midx.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/t/helper/test-read-midx.c b/t/helper/test-read-midx.c index 9d42c587564182..388d29e2b53db3 100644 --- a/t/helper/test-read-midx.c +++ b/t/helper/test-read-midx.c @@ -26,9 +26,10 @@ static int read_midx_file(const char *object_dir, const char *checksum, int show_objects) { uint32_t i; - struct multi_pack_index *m; + struct multi_pack_index *m, *tip; + int ret = 0; - m = setup_midx(object_dir); + m = tip = setup_midx(object_dir); if (!m) return 1; @@ -36,8 +37,11 @@ static int read_midx_file(const char *object_dir, const char *checksum, if (checksum) { while (m && strcmp(midx_get_checksum_hex(m), checksum)) m = m->base_midx; - if (!m) - return 1; + if (!m) { + ret = error(_("could not find MIDX with checksum %s"), + checksum); + goto out; + } } printf("header: %08x %d %d %d %d\n", @@ -82,9 +86,10 @@ static int read_midx_file(const char *object_dir, const char *checksum, } } - close_midx(m); +out: + close_midx(tip); - return 0; + return ret; } static int read_midx_checksum(const char *object_dir) From 9df44a97f165bbdd8d591c6e99c8894ae036b5bd Mon Sep 17 00:00:00 2001 From: Taylor Blau Date: Tue, 24 Feb 2026 14:00:41 -0500 Subject: [PATCH 16/24] midx: implement MIDX compaction When managing a MIDX chain with many layers, it is convenient to combine a sequence of adjacent layers into a single layer to prevent the chain from growing too long. While it is conceptually possible to "compact" a sequence of MIDX layers together by running "git multi-pack-index write --stdin-packs", there are a few drawbacks that make this less than desirable: - Preserving the MIDX chain is impossible, since there is no way to write a MIDX layer that contains objects or packs found in an earlier MIDX layer already part of the chain. So callers would have to write an entirely new (non-incremental) MIDX containing only the compacted layers, discarding all other objects/packs from the MIDX. - There is (currently) no way to write a MIDX layer outside of the MIDX chain to work around the above, such that the MIDX chain could be reassembled substituting the compacted layers with the MIDX that was written. - The `--stdin-packs` command-line option does not allow us to specify the order of packs as they appear in the MIDX. Therefore, even if there were workarounds for the previous two challenges, any bitmaps belonging to layers which come after the compacted layer(s) would no longer be valid. This commit introduces a way to compact a sequence of adjacent MIDX layers into a single layer while preserving the MIDX chain, as well as any bitmap(s) in layers which are newer than the compacted ones. Implementing MIDX compaction does not require a significant number of changes to how MIDX layers are written. The main changes are as follows: - Instead of calling `fill_packs_from_midx()`, we call a new function `fill_packs_from_midx_range()`, which walks backwards along the portion of the MIDX chain which we are compacting, and adds packs one layer a time. In order to preserve the pseudo-pack order, the concatenated pack order is preserved, with the exception of preferred packs which are always added first. - After adding entries from the set of packs in the compaction range, `compute_sorted_entries()` must adjust the `pack_int_id`'s for all objects added in each fanout layer to match their original `pack_int_id`'s (as opposed to the index at which each pack appears in `ctx.info`). Note that we cannot reuse `midx_fanout_add_midx_fanout()` directly here, as it unconditionally recurs through the `->base_midx`. Factor out a `_1()` variant that operates on a single layer, reimplement the existing function in terms of it, and use the new variant from `midx_fanout_add_compact()`. Since we are sorting the list of objects ourselves, the order we add them in does not matter. - When writing out the new 'multi-pack-index-chain' file, discard any layers in the compaction range, replacing them with the newly written layer, instead of keeping them and placing the new layer at the end of the chain. This ends up being sufficient to implement MIDX compaction in such a way that preserves bitmaps corresponding to more recent layers in the MIDX chain. The tests for MIDX compaction are so far fairly spartan, since the main interesting behavior here is ensuring that the right packs/objects are selected from each layer, and that the pack order is preserved despite whether or not they are sorted in lexicographic order in the original MIDX chain. Signed-off-by: Taylor Blau Signed-off-by: Junio C Hamano --- Documentation/git-multi-pack-index.adoc | 13 ++ builtin/multi-pack-index.c | 74 +++++++ midx-write.c | 277 +++++++++++++++++++++--- midx.h | 5 + t/meson.build | 1 + t/t5335-compact-multi-pack-index.sh | 175 +++++++++++++++ 6 files changed, 518 insertions(+), 27 deletions(-) create mode 100755 t/t5335-compact-multi-pack-index.sh diff --git a/Documentation/git-multi-pack-index.adoc b/Documentation/git-multi-pack-index.adoc index 164cf1f2291b99..883a0529741863 100644 --- a/Documentation/git-multi-pack-index.adoc +++ b/Documentation/git-multi-pack-index.adoc @@ -12,6 +12,8 @@ SYNOPSIS 'git multi-pack-index' [] write [--preferred-pack=] [--[no-]bitmap] [--[no-]incremental] [--[no-]stdin-packs] [--refs-snapshot=] +'git multi-pack-index' [] compact [--[no-]incremental] + 'git multi-pack-index' [] verify 'git multi-pack-index' [] expire 'git multi-pack-index' [] repack [--batch-size=] @@ -83,6 +85,17 @@ marker). necessary. -- +compact:: + Write a new MIDX layer containing only objects and packs present + in the range `` to ``, where both arguments are + checksums of existing layers in the MIDX chain. ++ +-- + --incremental:: + Write the result to a MIDX chain instead of writing a + stand-alone MIDX. +-- + verify:: Verify the contents of the MIDX file. diff --git a/builtin/multi-pack-index.c b/builtin/multi-pack-index.c index c0c6c1760c0f28..043ee8c478a3d6 100644 --- a/builtin/multi-pack-index.c +++ b/builtin/multi-pack-index.c @@ -17,6 +17,10 @@ " [--[no-]bitmap] [--[no-]incremental] [--[no-]stdin-packs]\n" \ " [--refs-snapshot=]") +#define BUILTIN_MIDX_COMPACT_USAGE \ + N_("git multi-pack-index [] compact [--[no-]incremental]\n" \ + " ") + #define BUILTIN_MIDX_VERIFY_USAGE \ N_("git multi-pack-index [] verify") @@ -30,6 +34,10 @@ static char const * const builtin_multi_pack_index_write_usage[] = { BUILTIN_MIDX_WRITE_USAGE, NULL }; +static char const * const builtin_multi_pack_index_compact_usage[] = { + BUILTIN_MIDX_COMPACT_USAGE, + NULL +}; static char const * const builtin_multi_pack_index_verify_usage[] = { BUILTIN_MIDX_VERIFY_USAGE, NULL @@ -44,6 +52,7 @@ static char const * const builtin_multi_pack_index_repack_usage[] = { }; static char const * const builtin_multi_pack_index_usage[] = { BUILTIN_MIDX_WRITE_USAGE, + BUILTIN_MIDX_COMPACT_USAGE, BUILTIN_MIDX_VERIFY_USAGE, BUILTIN_MIDX_EXPIRE_USAGE, BUILTIN_MIDX_REPACK_USAGE, @@ -195,6 +204,70 @@ static int cmd_multi_pack_index_write(int argc, const char **argv, return ret; } +static int cmd_multi_pack_index_compact(int argc, const char **argv, + const char *prefix, + struct repository *repo) +{ + struct multi_pack_index *m, *cur; + struct multi_pack_index *from_midx = NULL; + struct multi_pack_index *to_midx = NULL; + struct odb_source *source; + int ret; + + struct option *options; + static struct option builtin_multi_pack_index_compact_options[] = { + OPT_BIT(0, "incremental", &opts.flags, + N_("write a new incremental MIDX"), MIDX_WRITE_INCREMENTAL), + OPT_END(), + }; + + repo_config(repo, git_multi_pack_index_write_config, NULL); + + options = add_common_options(builtin_multi_pack_index_compact_options); + + trace2_cmd_mode(argv[0]); + + if (isatty(2)) + opts.flags |= MIDX_PROGRESS; + argc = parse_options(argc, argv, prefix, + options, builtin_multi_pack_index_compact_usage, + 0); + + if (argc != 2) + usage_with_options(builtin_multi_pack_index_compact_usage, + options); + source = handle_object_dir_option(the_repository); + + FREE_AND_NULL(options); + + m = get_multi_pack_index(source); + + for (cur = m; cur && !(from_midx && to_midx); cur = cur->base_midx) { + const char *midx_csum = midx_get_checksum_hex(cur); + + if (!from_midx && !strcmp(midx_csum, argv[0])) + from_midx = cur; + if (!to_midx && !strcmp(midx_csum, argv[1])) + to_midx = cur; + } + + if (!from_midx) + die(_("could not find MIDX: %s"), argv[0]); + if (!to_midx) + die(_("could not find MIDX: %s"), argv[1]); + if (from_midx == to_midx) + die(_("MIDX compaction endpoints must be unique")); + + for (m = from_midx; m; m = m->base_midx) { + if (m == to_midx) + die(_("MIDX %s must be an ancestor of %s"), argv[0], argv[1]); + } + + ret = write_midx_file_compact(source, from_midx, to_midx, opts.flags); + + return ret; +} + static int cmd_multi_pack_index_verify(int argc, const char **argv, const char *prefix, struct repository *repo UNUSED) @@ -295,6 +368,7 @@ int cmd_multi_pack_index(int argc, struct option builtin_multi_pack_index_options[] = { OPT_SUBCOMMAND("repack", &fn, cmd_multi_pack_index_repack), OPT_SUBCOMMAND("write", &fn, cmd_multi_pack_index_write), + OPT_SUBCOMMAND("compact", &fn, cmd_multi_pack_index_compact), OPT_SUBCOMMAND("verify", &fn, cmd_multi_pack_index_verify), OPT_SUBCOMMAND("expire", &fn, cmd_multi_pack_index_expire), OPT_END(), diff --git a/midx-write.c b/midx-write.c index ca2469213e699f..bf53ad1c4b7340 100644 --- a/midx-write.c +++ b/midx-write.c @@ -113,6 +113,10 @@ struct write_midx_context { int incremental; uint32_t num_multi_pack_indexes_before; + struct multi_pack_index *compact_from; + struct multi_pack_index *compact_to; + int compact; + struct string_list *to_include; struct repository *repo; @@ -122,6 +126,8 @@ struct write_midx_context { static uint32_t midx_pack_perm(struct write_midx_context *ctx, uint32_t orig_pack_int_id) { + if (ctx->compact) + orig_pack_int_id -= ctx->compact_from->num_packs_in_base; return ctx->pack_perm[orig_pack_int_id]; } @@ -268,18 +274,14 @@ static void midx_fanout_sort(struct midx_fanout *fanout) QSORT(fanout->entries, fanout->nr, midx_oid_compare); } -static void midx_fanout_add_midx_fanout(struct midx_fanout *fanout, - struct multi_pack_index *m, - uint32_t cur_fanout, - uint32_t preferred_pack) +static void midx_fanout_add_midx_fanout_1(struct midx_fanout *fanout, + struct multi_pack_index *m, + uint32_t cur_fanout, + uint32_t preferred_pack) { uint32_t start = m->num_objects_in_base, end; uint32_t cur_object; - if (m->base_midx) - midx_fanout_add_midx_fanout(fanout, m->base_midx, cur_fanout, - preferred_pack); - if (cur_fanout) start += ntohl(m->chunk_oid_fanout[cur_fanout - 1]); end = m->num_objects_in_base + ntohl(m->chunk_oid_fanout[cur_fanout]); @@ -303,6 +305,17 @@ static void midx_fanout_add_midx_fanout(struct midx_fanout *fanout, } } +static void midx_fanout_add_midx_fanout(struct midx_fanout *fanout, + struct multi_pack_index *m, + uint32_t cur_fanout, + uint32_t preferred_pack) +{ + if (m->base_midx) + midx_fanout_add_midx_fanout(fanout, m->base_midx, cur_fanout, + preferred_pack); + midx_fanout_add_midx_fanout_1(fanout, m, cur_fanout, preferred_pack); +} + static void midx_fanout_add_pack_fanout(struct midx_fanout *fanout, struct pack_info *info, uint32_t cur_pack, @@ -352,6 +365,21 @@ static void midx_fanout_add(struct midx_fanout *fanout, cur_fanout); } +static void midx_fanout_add_compact(struct midx_fanout *fanout, + struct write_midx_context *ctx, + uint32_t cur_fanout) +{ + struct multi_pack_index *m = ctx->compact_to; + + ASSERT(ctx->compact); + + while (m && m != ctx->compact_from->base_midx) { + midx_fanout_add_midx_fanout_1(fanout, m, cur_fanout, + NO_PREFERRED_PACK); + m = m->base_midx; + } +} + /* * It is possible to artificially get into a state where there are many * duplicate copies of objects. That can create high memory pressure if @@ -370,6 +398,9 @@ static void compute_sorted_entries(struct write_midx_context *ctx, size_t alloc_objects, total_objects = 0; struct midx_fanout fanout = { 0 }; + if (ctx->compact) + ASSERT(!start_pack); + for (cur_pack = start_pack; cur_pack < ctx->nr; cur_pack++) total_objects = st_add(total_objects, ctx->info[cur_pack].p->num_objects); @@ -388,7 +419,10 @@ static void compute_sorted_entries(struct write_midx_context *ctx, for (cur_fanout = 0; cur_fanout < 256; cur_fanout++) { fanout.nr = 0; - midx_fanout_add(&fanout, ctx, start_pack, cur_fanout); + if (ctx->compact) + midx_fanout_add_compact(&fanout, ctx, cur_fanout); + else + midx_fanout_add(&fanout, ctx, start_pack, cur_fanout); midx_fanout_sort(&fanout); /* @@ -956,6 +990,75 @@ static int fill_packs_from_midx(struct write_midx_context *ctx) return 0; } +static uint32_t compactible_packs_between(const struct multi_pack_index *from, + const struct multi_pack_index *to) +{ + uint32_t nr; + + ASSERT(from && to); + + if (unsigned_add_overflows(to->num_packs, to->num_packs_in_base)) + die(_("too many packs, unable to compact")); + + nr = to->num_packs + to->num_packs_in_base; + if (nr < from->num_packs_in_base) + BUG("unexpected number of packs in base during compaction: " + "%"PRIu32" < %"PRIu32, nr, from->num_packs_in_base); + + return nr - from->num_packs_in_base; +} + +static int fill_packs_from_midx_range(struct write_midx_context *ctx, + int bitmap_order) +{ + struct multi_pack_index *m = ctx->compact_to; + uint32_t packs_nr; + + ASSERT(ctx->compact && !ctx->nr); + ASSERT(ctx->compact_from); + ASSERT(ctx->compact_to); + + packs_nr = compactible_packs_between(ctx->compact_from, + ctx->compact_to); + + ALLOC_GROW(ctx->info, packs_nr, ctx->alloc); + + while (m != ctx->compact_from->base_midx) { + uint32_t pack_int_id, preferred_pack_id; + uint32_t i; + + if (bitmap_order) { + if (midx_preferred_pack(m, &preferred_pack_id) < 0) + die(_("could not determine preferred pack")); + } else { + preferred_pack_id = m->num_packs_in_base; + } + + pack_int_id = m->num_packs_in_base - ctx->compact_from->num_packs_in_base; + + if (fill_pack_from_midx(&ctx->info[pack_int_id++], m, + preferred_pack_id) < 0) + return -1; + + for (i = m->num_packs_in_base; + i < m->num_packs_in_base + m->num_packs; i++) { + if (preferred_pack_id == i) + continue; + + if (fill_pack_from_midx(&ctx->info[pack_int_id++], m, + i) < 0) + return -1; + } + + ctx->nr += m->num_packs; + m = m->base_midx; + } + + ASSERT(ctx->nr == packs_nr); + + return 0; +} + static struct { const char *non_split; const char *split; @@ -1075,6 +1178,9 @@ static bool midx_needs_update(struct multi_pack_index *midx, struct write_midx_c if (ctx->incremental) goto out; + if (ctx->compact) + goto out; /* Compaction always requires an update. */ + /* * Otherwise, we need to verify that the packs covered by the existing * MIDX match the packs that we already have. The logic to do so is way @@ -1120,12 +1226,23 @@ static bool midx_needs_update(struct multi_pack_index *midx, struct write_midx_c return needed; } +static int midx_hashcmp(const struct multi_pack_index *a, + const struct multi_pack_index *b, + const struct git_hash_algo *algop) +{ + return hashcmp(midx_get_checksum_hash(a), midx_get_checksum_hash(b), + algop); +} + struct write_midx_opts { struct odb_source *source; /* non-optional */ struct string_list *packs_to_include; struct string_list *packs_to_drop; + struct multi_pack_index *compact_from; + struct multi_pack_index *compact_to; + const char *preferred_pack_name; const char *refs_snapshot; unsigned flags; @@ -1150,6 +1267,7 @@ static int write_midx_internal(struct write_midx_opts *opts) int dropped_packs = 0; int result = -1; const char **keep_hashes = NULL; + size_t keep_hashes_nr = 0; struct chunkfile *cf; trace2_region_enter("midx", "write_midx_internal", r); @@ -1162,6 +1280,19 @@ static int write_midx_internal(struct write_midx_opts *opts) die(_("unknown MIDX version: %d"), ctx.version); ctx.incremental = !!(opts->flags & MIDX_WRITE_INCREMENTAL); + ctx.compact = !!(opts->flags & MIDX_WRITE_COMPACT); + + if (ctx.compact) { + if (ctx.version != MIDX_VERSION_V2) + die(_("cannot perform MIDX compaction with v1 format")); + if (!opts->compact_from) + BUG("expected non-NULL 'from' MIDX during compaction"); + if (!opts->compact_to) + BUG("expected non-NULL 'to' MIDX during compaction"); + + ctx.compact_from = opts->compact_from; + ctx.compact_to = opts->compact_to; + } if (ctx.incremental) strbuf_addf(&midx_name, @@ -1189,11 +1320,18 @@ static int write_midx_internal(struct write_midx_opts *opts) */ if (ctx.incremental) ctx.base_midx = m; - else if (!opts->packs_to_include) + if (!opts->packs_to_include) ctx.m = m; } } + /* + * If compacting MIDX layer(s) in the range [from, to], then the + * compacted MIDX will share the same base MIDX as 'from'. + */ + if (ctx.compact) + ctx.base_midx = ctx.compact_from->base_midx; + ctx.nr = 0; ctx.alloc = ctx.m ? ctx.m->num_packs + ctx.m->num_packs_in_base : 16; ctx.info = NULL; @@ -1210,7 +1348,7 @@ static int write_midx_internal(struct write_midx_opts *opts) ctx.num_multi_pack_indexes_before++; m = m->base_midx; } - } else if (ctx.m && fill_packs_from_midx(&ctx)) { + } else if (ctx.m && !ctx.compact && fill_packs_from_midx(&ctx)) { goto cleanup; } @@ -1223,9 +1361,18 @@ static int write_midx_internal(struct write_midx_opts *opts) else ctx.progress = NULL; - ctx.to_include = opts->packs_to_include; + if (ctx.compact) { + int bitmap_order = 0; + if (opts->preferred_pack_name) + bitmap_order |= 1; + else if (opts->flags & (MIDX_WRITE_REV_INDEX | MIDX_WRITE_BITMAP)) + bitmap_order |= 1; - for_each_file_in_pack_dir(opts->source->path, add_pack_to_midx, &ctx); + fill_packs_from_midx_range(&ctx, bitmap_order); + } else { + ctx.to_include = opts->packs_to_include; + for_each_file_in_pack_dir(opts->source->path, add_pack_to_midx, &ctx); + } stop_progress(&ctx.progress); if (!opts->packs_to_drop) { @@ -1354,12 +1501,19 @@ static int write_midx_internal(struct write_midx_opts *opts) ctx.large_offsets_needed = 1; } - QSORT(ctx.info, ctx.nr, pack_info_compare); + if (ctx.compact) { + if (ctx.version != MIDX_VERSION_V2) + BUG("performing MIDX compaction with v1 MIDX"); + } else { + QSORT(ctx.info, ctx.nr, pack_info_compare); + } if (opts->packs_to_drop && opts->packs_to_drop->nr) { size_t drop_index = 0; int missing_drops = 0; + ASSERT(!ctx.compact); + for (size_t i = 0; i < ctx.nr && drop_index < opts->packs_to_drop->nr; i++) { int cmp = strcmp(ctx.info[i].pack_name, @@ -1391,12 +1545,20 @@ static int write_midx_internal(struct write_midx_opts *opts) */ ALLOC_ARRAY(ctx.pack_perm, ctx.nr); for (size_t i = 0; i < ctx.nr; i++) { + uint32_t from = ctx.info[i].orig_pack_int_id; + uint32_t to; + if (ctx.info[i].expired) { + to = PACK_EXPIRED; dropped_packs++; - ctx.pack_perm[ctx.info[i].orig_pack_int_id] = PACK_EXPIRED; } else { - ctx.pack_perm[ctx.info[i].orig_pack_int_id] = i - dropped_packs; + to = i - dropped_packs; } + + if (ctx.compact) + from -= ctx.compact_from->num_packs_in_base; + + ctx.pack_perm[from] = to; } for (size_t i = 0; i < ctx.nr; i++) { @@ -1542,7 +1704,24 @@ static int write_midx_internal(struct write_midx_opts *opts) if (ctx.num_multi_pack_indexes_before == UINT32_MAX) die(_("too many multi-pack-indexes")); - CALLOC_ARRAY(keep_hashes, ctx.num_multi_pack_indexes_before + 1); + if (ctx.compact) { + struct multi_pack_index *m; + + /* + * Keep all MIDX layers excluding those in the range [from, to]. + */ + for (m = ctx.base_midx; m; m = m->base_midx) + keep_hashes_nr++; + for (m = ctx.m; + m && midx_hashcmp(m, ctx.compact_to, r->hash_algo); + m = m->base_midx) + keep_hashes_nr++; + + keep_hashes_nr++; /* include the compacted layer */ + } else { + keep_hashes_nr = ctx.num_multi_pack_indexes_before + 1; + } + CALLOC_ARRAY(keep_hashes, keep_hashes_nr); if (ctx.incremental) { FILE *chainf = fdopen_lock_file(&lk, "w"); @@ -1567,17 +1746,47 @@ static int write_midx_internal(struct write_midx_opts *opts) strbuf_release(&final_midx_name); - keep_hashes[ctx.num_multi_pack_indexes_before] = - xstrdup(hash_to_hex_algop(midx_hash, r->hash_algo)); + if (ctx.compact) { + struct multi_pack_index *m; + uint32_t num_layers_before_from = 0; + uint32_t i; - for (uint32_t i = 0; i < ctx.num_multi_pack_indexes_before; i++) { - uint32_t j = ctx.num_multi_pack_indexes_before - i - 1; + for (m = ctx.base_midx; m; m = m->base_midx) + num_layers_before_from++; - keep_hashes[j] = xstrdup(midx_get_checksum_hex(m)); - m = m->base_midx; + m = ctx.base_midx; + for (i = 0; i < num_layers_before_from; i++) { + uint32_t j = num_layers_before_from - i - 1; + + keep_hashes[j] = xstrdup(midx_get_checksum_hex(m)); + m = m->base_midx; + } + + keep_hashes[i] = xstrdup(hash_to_hex_algop(midx_hash, + r->hash_algo)); + + i = 0; + for (m = ctx.m; + m && midx_hashcmp(m, ctx.compact_to, r->hash_algo); + m = m->base_midx) { + keep_hashes[keep_hashes_nr - i - 1] = + xstrdup(midx_get_checksum_hex(m)); + i++; + } + } else { + keep_hashes[ctx.num_multi_pack_indexes_before] = + xstrdup(hash_to_hex_algop(midx_hash, + r->hash_algo)); + + for (uint32_t i = 0; i < ctx.num_multi_pack_indexes_before; i++) { + uint32_t j = ctx.num_multi_pack_indexes_before - i - 1; + + keep_hashes[j] = xstrdup(midx_get_checksum_hex(m)); + m = m->base_midx; + } } - for (uint32_t i = 0; i <= ctx.num_multi_pack_indexes_before; i++) + for (uint32_t i = 0; i < keep_hashes_nr; i++) fprintf(get_lock_file_fp(&lk), "%s\n", keep_hashes[i]); } else { keep_hashes[ctx.num_multi_pack_indexes_before] = @@ -1590,8 +1799,7 @@ static int write_midx_internal(struct write_midx_opts *opts) if (commit_lock_file(&lk) < 0) die_errno(_("could not write multi-pack-index")); - clear_midx_files(opts->source, keep_hashes, - ctx.num_multi_pack_indexes_before + 1, + clear_midx_files(opts->source, keep_hashes, keep_hashes_nr, ctx.incremental); result = 0; @@ -1609,7 +1817,7 @@ static int write_midx_internal(struct write_midx_opts *opts) free(ctx.pack_perm); free(ctx.pack_order); if (keep_hashes) { - for (uint32_t i = 0; i <= ctx.num_multi_pack_indexes_before; i++) + for (uint32_t i = 0; i < keep_hashes_nr; i++) free((char *)keep_hashes[i]); free(keep_hashes); } @@ -1651,6 +1859,21 @@ int write_midx_file_only(struct odb_source *source, return write_midx_internal(&opts); } +int write_midx_file_compact(struct odb_source *source, + struct multi_pack_index *from, + struct multi_pack_index *to, + unsigned flags) +{ + struct write_midx_opts opts = { + .source = source, + .compact_from = from, + .compact_to = to, + .flags = flags | MIDX_WRITE_COMPACT, + }; + + return write_midx_internal(&opts); +} + int expire_midx_packs(struct odb_source *source, unsigned flags) { uint32_t i, *count, result = 0; diff --git a/midx.h b/midx.h index aa99a6cb2152f7..08f3728e5204b8 100644 --- a/midx.h +++ b/midx.h @@ -82,6 +82,7 @@ struct multi_pack_index { #define MIDX_WRITE_BITMAP_HASH_CACHE (1 << 3) #define MIDX_WRITE_BITMAP_LOOKUP_TABLE (1 << 4) #define MIDX_WRITE_INCREMENTAL (1 << 5) +#define MIDX_WRITE_COMPACT (1 << 6) #define MIDX_EXT_REV "rev" #define MIDX_EXT_BITMAP "bitmap" @@ -131,6 +132,10 @@ int write_midx_file_only(struct odb_source *source, struct string_list *packs_to_include, const char *preferred_pack_name, const char *refs_snapshot, unsigned flags); +int write_midx_file_compact(struct odb_source *source, + struct multi_pack_index *from, + struct multi_pack_index *to, + unsigned flags); void clear_midx_file(struct repository *r); int verify_midx_file(struct odb_source *source, unsigned flags); int expire_midx_packs(struct odb_source *source, unsigned flags); diff --git a/t/meson.build b/t/meson.build index f80e366cff73f3..2421220917aedc 100644 --- a/t/meson.build +++ b/t/meson.build @@ -618,6 +618,7 @@ integration_tests = [ 't5332-multi-pack-reuse.sh', 't5333-pseudo-merge-bitmaps.sh', 't5334-incremental-multi-pack-index.sh', + 't5335-compact-multi-pack-index.sh', 't5351-unpack-large-objects.sh', 't5400-send-pack.sh', 't5401-update-hooks.sh', diff --git a/t/t5335-compact-multi-pack-index.sh b/t/t5335-compact-multi-pack-index.sh new file mode 100755 index 00000000000000..797ae05c3bdfa5 --- /dev/null +++ b/t/t5335-compact-multi-pack-index.sh @@ -0,0 +1,175 @@ +#!/bin/sh + +test_description='multi-pack-index compaction' + +. ./test-lib.sh + +GIT_TEST_MULTI_PACK_INDEX=0 +GIT_TEST_MULTI_PACK_INDEX_WRITE_BITMAP=0 +GIT_TEST_MULTI_PACK_INDEX_WRITE_INCREMENTAL=0 + +objdir=.git/objects +packdir=$objdir/pack +midxdir=$packdir/multi-pack-index.d +midx_chain=$midxdir/multi-pack-index-chain + +nth_line() { + local n="$1" + shift + awk "NR==$n" "$@" +} + +write_packs () { + for c in "$@" + do + test_commit "$c" && + + git pack-objects --all --unpacked $packdir/pack-$c && + git prune-packed && + + git multi-pack-index write --incremental --bitmap || return 1 + done +} + +test_midx_layer_packs () { + local checksum="$1" && + shift && + + test-tool read-midx $objdir "$checksum" >out && + + printf "%s\n" "$@" >expect && + # NOTE: do *not* pipe through sort here, we want to ensure the + # order of packs is preserved during compaction. + grep "^pack-" out | cut -d"-" -f2 >actual && + + test_cmp expect actual +} + +test_midx_layer_object_uniqueness () { + : >objs.all + while read layer + do + test-tool read-midx --show-objects $objdir "$layer" >out && + grep "\.pack$" out | cut -d" " -f1 | sort >objs.layer && + test_stdout_line_count = 0 comm -12 objs.all objs.layer && + cat objs.all objs.layer | sort >objs.tmp && + mv objs.tmp objs.all || return 1 + done <$midx_chain +} + +test_expect_success 'MIDX compaction with lex-ordered pack names' ' + git init midx-compact-lex-order && + ( + cd midx-compact-lex-order && + + git config maintenance.auto false && + + write_packs A B C D E && + test_line_count = 5 $midx_chain && + + git multi-pack-index compact --incremental \ + "$(nth_line 2 "$midx_chain")" \ + "$(nth_line 4 "$midx_chain")" && + test_line_count = 3 $midx_chain && + + test_midx_layer_packs "$(nth_line 1 "$midx_chain")" A && + test_midx_layer_packs "$(nth_line 2 "$midx_chain")" B C D && + test_midx_layer_packs "$(nth_line 3 "$midx_chain")" E && + + test_midx_layer_object_uniqueness + ) +' + +test_expect_success 'MIDX compaction with non-lex-ordered pack names' ' + git init midx-compact-non-lex-order && + ( + cd midx-compact-non-lex-order && + + git config maintenance.auto false && + + write_packs D C A B E && + test_line_count = 5 $midx_chain && + + git multi-pack-index compact --incremental \ + "$(nth_line 2 "$midx_chain")" \ + "$(nth_line 4 "$midx_chain")" && + test_line_count = 3 $midx_chain && + + test_midx_layer_packs "$(nth_line 1 "$midx_chain")" D && + test_midx_layer_packs "$(nth_line 2 "$midx_chain")" C A B && + test_midx_layer_packs "$(nth_line 3 "$midx_chain")" E && + + test_midx_layer_object_uniqueness + ) +' + +test_expect_success 'setup for bogus MIDX compaction scenarios' ' + git init midx-compact-bogus && + ( + cd midx-compact-bogus && + + git config maintenance.auto false && + + write_packs A B C + ) +' + +test_expect_success 'MIDX compaction with missing endpoints' ' + ( + cd midx-compact-bogus && + + test_must_fail git multi-pack-index compact --incremental \ + "" "" 2>err && + test_grep "could not find MIDX: " err && + + test_must_fail git multi-pack-index compact --incremental \ + "" "$(nth_line 2 "$midx_chain")" 2>err && + test_grep "could not find MIDX: " err && + + test_must_fail git multi-pack-index compact --incremental \ + "$(nth_line 2 "$midx_chain")" "" 2>err && + test_grep "could not find MIDX: " err + ) +' + +test_expect_success 'MIDX compaction with reversed endpoints' ' + ( + cd midx-compact-bogus && + + from="$(nth_line 3 "$midx_chain")" && + to="$(nth_line 1 "$midx_chain")" && + + test_must_fail git multi-pack-index compact --incremental \ + "$from" "$to" 2>err && + + test_grep "MIDX $from must be an ancestor of $to" err + ) +' + +test_expect_success 'MIDX compaction with identical endpoints' ' + ( + cd midx-compact-bogus && + + from="$(nth_line 3 "$midx_chain")" && + to="$(nth_line 3 "$midx_chain")" && + + test_must_fail git multi-pack-index compact --incremental \ + "$from" "$to" 2>err && + + test_grep "MIDX compaction endpoints must be unique" err + ) +' + +test_expect_success 'MIDX compaction with midx.version=1' ' + ( + cd midx-compact-bogus && + + test_must_fail git -c midx.version=1 multi-pack-index compact \ + "$(nth_line 1 "$midx_chain")" \ + "$(nth_line 2 "$midx_chain")" 2>err && + + test_grep "fatal: cannot perform MIDX compaction with v1 format" err + ) +' + +test_done From d54da84bd9de09fc339accff553f1fc8a5539154 Mon Sep 17 00:00:00 2001 From: Taylor Blau Date: Tue, 24 Feb 2026 14:00:45 -0500 Subject: [PATCH 17/24] midx: enable reachability bitmaps during MIDX compaction Enable callers to generate reachability bitmaps when performing MIDX layer compaction by combining all existing bitmaps from the compacted layers. Note that because of the object/pack ordering described by the previous commit, the pseudo-pack order for the compacted MIDX is the same as concatenating the individual pseudo-pack orderings for each layer in the compaction range. As a result, the only non-test or documentation change necessary is to treat all objects as non-preferred during compaction so as not to disturb the object ordering. In the future, we may want to adjust which commit(s) receive reachability bitmaps when compacting multiple .bitmap files into one, or even generate new bitmaps (e.g., if the references have moved significantly since the .bitmap was generated). This commit only implements combining all existing bitmaps in range together in order to demonstrate and lay the groundwork for more exotic strategies. Signed-off-by: Taylor Blau Signed-off-by: Junio C Hamano --- Documentation/git-multi-pack-index.adoc | 5 +- builtin/multi-pack-index.c | 4 +- midx-write.c | 2 +- t/t5335-compact-multi-pack-index.sh | 122 +++++++++++++++++++++++- 4 files changed, 128 insertions(+), 5 deletions(-) diff --git a/Documentation/git-multi-pack-index.adoc b/Documentation/git-multi-pack-index.adoc index 883a0529741863..612568301412d6 100644 --- a/Documentation/git-multi-pack-index.adoc +++ b/Documentation/git-multi-pack-index.adoc @@ -13,7 +13,7 @@ SYNOPSIS [--[no-]bitmap] [--[no-]incremental] [--[no-]stdin-packs] [--refs-snapshot=] 'git multi-pack-index' [] compact [--[no-]incremental] - + [--[no-]bitmap] 'git multi-pack-index' [] verify 'git multi-pack-index' [] expire 'git multi-pack-index' [] repack [--batch-size=] @@ -94,6 +94,9 @@ compact:: --incremental:: Write the result to a MIDX chain instead of writing a stand-alone MIDX. + + --[no-]bitmap:: + Control whether or not a multi-pack bitmap is written. -- verify:: diff --git a/builtin/multi-pack-index.c b/builtin/multi-pack-index.c index 043ee8c478a3d6..2f24c113c8f261 100644 --- a/builtin/multi-pack-index.c +++ b/builtin/multi-pack-index.c @@ -19,7 +19,7 @@ #define BUILTIN_MIDX_COMPACT_USAGE \ N_("git multi-pack-index [] compact [--[no-]incremental]\n" \ - " ") + " [--[no-]bitmap] ") #define BUILTIN_MIDX_VERIFY_USAGE \ N_("git multi-pack-index [] verify") @@ -216,6 +216,8 @@ static int cmd_multi_pack_index_compact(int argc, const char **argv, struct option *options; static struct option builtin_multi_pack_index_compact_options[] = { + OPT_BIT(0, "bitmap", &opts.flags, N_("write multi-pack bitmap"), + MIDX_WRITE_BITMAP | MIDX_WRITE_REV_INDEX), OPT_BIT(0, "incremental", &opts.flags, N_("write a new incremental MIDX"), MIDX_WRITE_INCREMENTAL), OPT_END(), diff --git a/midx-write.c b/midx-write.c index bf53ad1c4b7340..0ff2e45aa7abdd 100644 --- a/midx-write.c +++ b/midx-write.c @@ -676,7 +676,7 @@ static uint32_t *midx_pack_order(struct write_midx_context *ctx) struct pack_midx_entry *e = &ctx->entries[i]; data[i].nr = i; data[i].pack = midx_pack_perm(ctx, e->pack_int_id); - if (!e->preferred) + if (!e->preferred || ctx->compact) data[i].pack |= (1U << 31); data[i].offset = e->offset; } diff --git a/t/t5335-compact-multi-pack-index.sh b/t/t5335-compact-multi-pack-index.sh index 797ae05c3bdfa5..40f3844282f04e 100755 --- a/t/t5335-compact-multi-pack-index.sh +++ b/t/t5335-compact-multi-pack-index.sh @@ -67,7 +67,7 @@ test_expect_success 'MIDX compaction with lex-ordered pack names' ' write_packs A B C D E && test_line_count = 5 $midx_chain && - git multi-pack-index compact --incremental \ + git multi-pack-index compact --incremental --bitmap \ "$(nth_line 2 "$midx_chain")" \ "$(nth_line 4 "$midx_chain")" && test_line_count = 3 $midx_chain && @@ -90,7 +90,7 @@ test_expect_success 'MIDX compaction with non-lex-ordered pack names' ' write_packs D C A B E && test_line_count = 5 $midx_chain && - git multi-pack-index compact --incremental \ + git multi-pack-index compact --incremental --bitmap \ "$(nth_line 2 "$midx_chain")" \ "$(nth_line 4 "$midx_chain")" && test_line_count = 3 $midx_chain && @@ -172,4 +172,122 @@ test_expect_success 'MIDX compaction with midx.version=1' ' ) ' +midx_objs_by_pack () { + awk '/\.pack$/ { split($3, a, "-"); print a[2], $1 }' | sort +} + +tag_objs_from_pack () { + objs="$(git rev-list --objects --no-object-names "$2")" && + printf "$1 %s\n" $objs | sort +} + +test_expect_success 'MIDX compaction preserves pack object selection' ' + git init midx-compact-preserve-selection && + ( + cd midx-compact-preserve-selection && + + git config maintenance.auto false && + + test_commit A && + test_commit B && + + # Create two packs, one containing just the objects from + # A, and another containing all objects from the + # repository. + p1="$(echo A | git pack-objects --revs --delta-base-offset \ + $packdir/pack-1)" && + p0="$(echo B | git pack-objects --revs --delta-base-offset \ + $packdir/pack-0)" && + + echo "pack-1-$p1.idx" | git multi-pack-index write \ + --incremental --bitmap --stdin-packs && + echo "pack-0-$p0.idx" | git multi-pack-index write \ + --incremental --bitmap --stdin-packs && + + write_packs C && + + git multi-pack-index compact --incremental --bitmap \ + "$(nth_line 1 "$midx_chain")" \ + "$(nth_line 2 "$midx_chain")" && + + + test-tool read-midx --show-objects $objdir \ + "$(nth_line 1 "$midx_chain")" >AB.info && + test-tool read-midx --show-objects $objdir \ + "$(nth_line 2 "$midx_chain")" >C.info && + + midx_objs_by_pack AB.actual && + midx_objs_by_pack C.actual && + + { + tag_objs_from_pack 1 A && + tag_objs_from_pack 0 A..B + } | sort >AB.expect && + tag_objs_from_pack C B..C >C.expect && + + test_cmp AB.expect AB.actual && + test_cmp C.expect C.actual + ) +' + +test_expect_success 'MIDX compaction with bitmaps' ' + git init midx-compact-with-bitmaps && + ( + cd midx-compact-with-bitmaps && + + git config maintenance.auto false && + + write_packs foo bar baz quux woot && + + test-tool read-midx --bitmap $objdir >bitmap.expect && + git multi-pack-index compact --incremental --bitmap \ + "$(nth_line 2 "$midx_chain")" \ + "$(nth_line 4 "$midx_chain")" && + test-tool read-midx --bitmap $objdir >bitmap.actual && + + test_cmp bitmap.expect bitmap.actual && + + true + ) +' + +test_expect_success 'MIDX compaction with bitmaps (non-trivial)' ' + git init midx-compact-with-bitmaps-non-trivial && + ( + cd midx-compact-with-bitmaps-non-trivial && + + git config maintenance.auto false && + + git branch -m main && + + # D(4) + # / + # A(1) --- B(2) --- C(3) --- G(7) + # \ + # E(5) --- F(6) + write_packs A B C && + git checkout -b side && + write_packs D && + git checkout -b other B && + write_packs E F && + git checkout main && + write_packs G && + + # Compact layers 2-4, leaving us with: + # + # [A, [B, C, D], E, F, G] + git multi-pack-index compact --incremental --bitmap \ + "$(nth_line 2 "$midx_chain")" \ + "$(nth_line 4 "$midx_chain")" && + + # Then compact the top two layers, condensing the above + # such that the new 4th layer contains F and G. + # + # [A, [B, C, D], E, [F, G]] + git multi-pack-index compact --incremental --bitmap \ + "$(nth_line 4 "$midx_chain")" \ + "$(nth_line 5 "$midx_chain")" + ) +' + test_done From 6daeb66baac581ab81148bcb9d5fcc61ae33347e Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Thu, 12 Mar 2026 09:42:56 +0100 Subject: [PATCH 18/24] odb: stop including "odb/source.h" The "odb.h" header currently includes the "odb/source.h" file. This is somewhat roundabout though: most callers shouldn't have to care about the `struct odb_source`, but should rather use the ODB-level functions. Furthermore, it means that a couple of definitions have to live on the source level even though they should be part of the generic interface. Reverse the relation between "odb/source.h" and "odb.h" and move the enums and typedefs that relate to the generic interfaces back into "odb.h". Add the necessary includes to all files that rely on the transitive include. Suggested-by: Justin Tobler Signed-off-by: Patrick Steinhardt Signed-off-by: Junio C Hamano --- builtin/multi-pack-index.c | 1 + builtin/submodule--helper.c | 1 + odb.h | 50 ++++++++++++++++++++++++++++++++++- odb/source.h | 52 +------------------------------------ odb/streaming.c | 1 + repository.c | 1 + submodule-config.c | 1 + tmp-objdir.c | 1 + 8 files changed, 56 insertions(+), 52 deletions(-) diff --git a/builtin/multi-pack-index.c b/builtin/multi-pack-index.c index 5f364aa816ba25..3fcb207f1aa054 100644 --- a/builtin/multi-pack-index.c +++ b/builtin/multi-pack-index.c @@ -9,6 +9,7 @@ #include "strbuf.h" #include "trace2.h" #include "odb.h" +#include "odb/source.h" #include "replace-object.h" #include "repository.h" diff --git a/builtin/submodule--helper.c b/builtin/submodule--helper.c index 143f7cb3cca7d0..4957487536f453 100644 --- a/builtin/submodule--helper.c +++ b/builtin/submodule--helper.c @@ -29,6 +29,7 @@ #include "object-file.h" #include "object-name.h" #include "odb.h" +#include "odb/source.h" #include "advice.h" #include "branch.h" #include "list-objects-filter-options.h" diff --git a/odb.h b/odb.h index 86e0365c243863..7a583e38732b7d 100644 --- a/odb.h +++ b/odb.h @@ -3,7 +3,6 @@ #include "hashmap.h" #include "object.h" -#include "odb/source.h" #include "oidset.h" #include "oidmap.h" #include "string-list.h" @@ -12,6 +11,7 @@ struct oidmap; struct oidtree; struct strbuf; +struct strvec; struct repository; struct multi_pack_index; @@ -339,6 +339,42 @@ struct object_info { */ #define OBJECT_INFO_INIT { 0 } +/* Flags that can be passed to `odb_read_object_info_extended()`. */ +enum object_info_flags { + /* Invoke lookup_replace_object() on the given hash. */ + OBJECT_INFO_LOOKUP_REPLACE = (1 << 0), + + /* Do not reprepare object sources when the first lookup has failed. */ + OBJECT_INFO_QUICK = (1 << 1), + + /* + * Do not attempt to fetch the object if missing (even if fetch_is_missing is + * nonzero). + */ + OBJECT_INFO_SKIP_FETCH_OBJECT = (1 << 2), + + /* Die if object corruption (not just an object being missing) was detected. */ + OBJECT_INFO_DIE_IF_CORRUPT = (1 << 3), + + /* + * We have already tried reading the object, but it couldn't be found + * via any of the attached sources, and are now doing a second read. + * This second read asks the individual sources to also evaluate + * whether any on-disk state may have changed that may have caused the + * object to appear. + * + * This flag is for internal use, only. The second read only occurs + * when `OBJECT_INFO_QUICK` was not passed. + */ + OBJECT_INFO_SECOND_READ = (1 << 4), + + /* + * This is meant for bulk prefetching of missing blobs in a partial + * clone. Implies OBJECT_INFO_SKIP_FETCH_OBJECT and OBJECT_INFO_QUICK. + */ + OBJECT_INFO_FOR_PREFETCH = (OBJECT_INFO_SKIP_FETCH_OBJECT | OBJECT_INFO_QUICK), +}; + /* * Read object info from the object database and populate the `object_info` * structure. Returns 0 on success, a negative error code otherwise. @@ -432,6 +468,18 @@ enum odb_for_each_object_flags { ODB_FOR_EACH_OBJECT_SKIP_ON_DISK_KEPT_PACKS = (1<<4), }; +/* + * A callback function that can be used to iterate through objects. If given, + * the optional `oi` parameter will be populated the same as if you would call + * `odb_read_object_info()`. + * + * Returning a non-zero error code will cause iteration to abort. The error + * code will be propagated. + */ +typedef int (*odb_for_each_object_cb)(const struct object_id *oid, + struct object_info *oi, + void *cb_data); + /* * Iterate through all objects contained in the object database. Note that * objects may be iterated over multiple times in case they are either stored diff --git a/odb/source.h b/odb/source.h index caac5581495ece..a1fd9dd9209137 100644 --- a/odb/source.h +++ b/odb/source.h @@ -2,6 +2,7 @@ #define ODB_SOURCE_H #include "object.h" +#include "odb.h" enum odb_source_type { /* @@ -14,61 +15,10 @@ enum odb_source_type { ODB_SOURCE_FILES, }; -/* Flags that can be passed to `odb_read_object_info_extended()`. */ -enum object_info_flags { - /* Invoke lookup_replace_object() on the given hash. */ - OBJECT_INFO_LOOKUP_REPLACE = (1 << 0), - - /* Do not reprepare object sources when the first lookup has failed. */ - OBJECT_INFO_QUICK = (1 << 1), - - /* - * Do not attempt to fetch the object if missing (even if fetch_is_missing is - * nonzero). - */ - OBJECT_INFO_SKIP_FETCH_OBJECT = (1 << 2), - - /* Die if object corruption (not just an object being missing) was detected. */ - OBJECT_INFO_DIE_IF_CORRUPT = (1 << 3), - - /* - * We have already tried reading the object, but it couldn't be found - * via any of the attached sources, and are now doing a second read. - * This second read asks the individual sources to also evaluate - * whether any on-disk state may have changed that may have caused the - * object to appear. - * - * This flag is for internal use, only. The second read only occurs - * when `OBJECT_INFO_QUICK` was not passed. - */ - OBJECT_INFO_SECOND_READ = (1 << 4), - - /* - * This is meant for bulk prefetching of missing blobs in a partial - * clone. Implies OBJECT_INFO_SKIP_FETCH_OBJECT and OBJECT_INFO_QUICK. - */ - OBJECT_INFO_FOR_PREFETCH = (OBJECT_INFO_SKIP_FETCH_OBJECT | OBJECT_INFO_QUICK), -}; - struct object_id; -struct object_info; struct odb_read_stream; -struct odb_transaction; -struct odb_write_stream; struct strvec; -/* - * A callback function that can be used to iterate through objects. If given, - * the optional `oi` parameter will be populated the same as if you would call - * `odb_read_object_info()`. - * - * Returning a non-zero error code will cause iteration to abort. The error - * code will be propagated. - */ -typedef int (*odb_for_each_object_cb)(const struct object_id *oid, - struct object_info *oi, - void *cb_data); - /* * The source is the part of the object database that stores the actual * objects. It thus encapsulates the logic to read and write the specific diff --git a/odb/streaming.c b/odb/streaming.c index a4355cd2458c38..5927a12954ba59 100644 --- a/odb/streaming.c +++ b/odb/streaming.c @@ -7,6 +7,7 @@ #include "environment.h" #include "repository.h" #include "odb.h" +#include "odb/source.h" #include "odb/streaming.h" #include "replace-object.h" diff --git a/repository.c b/repository.c index e7fa42c14f2254..05c26bdbc3bc08 100644 --- a/repository.c +++ b/repository.c @@ -2,6 +2,7 @@ #include "abspath.h" #include "repository.h" #include "odb.h" +#include "odb/source.h" #include "config.h" #include "object.h" #include "lockfile.h" diff --git a/submodule-config.c b/submodule-config.c index 1f19fe207741bc..72a46b7a54bc3d 100644 --- a/submodule-config.c +++ b/submodule-config.c @@ -14,6 +14,7 @@ #include "strbuf.h" #include "object-name.h" #include "odb.h" +#include "odb/source.h" #include "parse-options.h" #include "thread-utils.h" #include "tree-walk.h" diff --git a/tmp-objdir.c b/tmp-objdir.c index e436eed07e4449..d199d39e7c9d51 100644 --- a/tmp-objdir.c +++ b/tmp-objdir.c @@ -11,6 +11,7 @@ #include "strvec.h" #include "quote.h" #include "odb.h" +#include "odb/source.h" #include "repository.h" struct tmp_objdir { From dd587cd59e1575f2d5698cb45b42644e1df9b835 Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Thu, 12 Mar 2026 09:42:57 +0100 Subject: [PATCH 19/24] packfile: extract logic to count number of objects In a subsequent commit we're about to introduce a new `odb_source_count_objects()` function so that we can make the logic pluggable. Prepare for this change by extracting the logic that we have to count packed objects into a standalone function. Signed-off-by: Patrick Steinhardt Signed-off-by: Junio C Hamano --- packfile.c | 45 +++++++++++++++++++++++++++++++++++---------- packfile.h | 9 +++++++++ 2 files changed, 44 insertions(+), 10 deletions(-) diff --git a/packfile.c b/packfile.c index 215a23e42be0fe..1ee5dd3da35c07 100644 --- a/packfile.c +++ b/packfile.c @@ -1101,6 +1101,36 @@ struct packfile_list_entry *packfile_store_get_packs(struct packfile_store *stor return store->packs.head; } +int packfile_store_count_objects(struct packfile_store *store, + unsigned long *out) +{ + struct packfile_list_entry *e; + struct multi_pack_index *m; + unsigned long count = 0; + int ret; + + m = get_multi_pack_index(store->source); + if (m) + count += m->num_objects + m->num_objects_in_base; + + for (e = packfile_store_get_packs(store); e; e = e->next) { + if (e->pack->multi_pack_index) + continue; + if (open_pack_index(e->pack)) { + ret = -1; + goto out; + } + + count += e->pack->num_objects; + } + + *out = count; + ret = 0; + +out: + return ret; +} + /* * Give a fast, rough count of the number of objects in the repository. This * ignores loose objects completely. If you have a lot of them, then either @@ -1113,21 +1143,16 @@ unsigned long repo_approximate_object_count(struct repository *r) if (!r->objects->approximate_object_count_valid) { struct odb_source *source; unsigned long count = 0; - struct packed_git *p; odb_prepare_alternates(r->objects); - for (source = r->objects->sources; source; source = source->next) { - struct multi_pack_index *m = get_multi_pack_index(source); - if (m) - count += m->num_objects + m->num_objects_in_base; - } + struct odb_source_files *files = odb_source_files_downcast(source); + unsigned long c; - repo_for_each_pack(r, p) { - if (p->multi_pack_index || open_pack_index(p)) - continue; - count += p->num_objects; + if (!packfile_store_count_objects(files->packed, &c)) + count += c; } + r->objects->approximate_object_count = count; r->objects->approximate_object_count_valid = 1; } diff --git a/packfile.h b/packfile.h index 8b04a258a7b9d5..1da8c729cba131 100644 --- a/packfile.h +++ b/packfile.h @@ -268,6 +268,15 @@ enum kept_pack_type { KEPT_PACK_IN_CORE = (1 << 1), }; +/* + * Count the number objects contained in the given packfile store. If + * successful, the number of objects will be written to the `out` pointer. + * + * Return 0 on success, a negative error code otherwise. + */ +int packfile_store_count_objects(struct packfile_store *store, + unsigned long *out); + /* * Retrieve the cache of kept packs from the given packfile store. Accepts a * combination of `kept_pack_type` flags. The cache is computed on demand and From 222fddeaa44b633eea345996735b4f7893eb71ec Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Thu, 12 Mar 2026 09:42:58 +0100 Subject: [PATCH 20/24] object-file: extract logic to approximate object count In "builtin/gc.c" we have some logic that checks whether we need to repack objects. This is done by counting the number of objects that we have and checking whether it exceeds a certain threshold. We don't really need an accurate object count though, which is why we only open a single object directory shard and then extrapolate from there. Extract this logic into a new function that is owned by the loose object database source. This is done to prepare for a subsequent change, where we'll introduce object counting on the object database source level. Signed-off-by: Patrick Steinhardt Signed-off-by: Junio C Hamano --- builtin/gc.c | 37 +++++++++---------------------------- object-file.c | 41 +++++++++++++++++++++++++++++++++++++++++ object-file.h | 13 +++++++++++++ 3 files changed, 63 insertions(+), 28 deletions(-) diff --git a/builtin/gc.c b/builtin/gc.c index fb329c2cffab80..a08c7554cb1374 100644 --- a/builtin/gc.c +++ b/builtin/gc.c @@ -467,37 +467,18 @@ static int rerere_gc_condition(struct gc_config *cfg UNUSED) static int too_many_loose_objects(int limit) { /* - * Quickly check if a "gc" is needed, by estimating how - * many loose objects there are. Because SHA-1 is evenly - * distributed, we can check only one and get a reasonable - * estimate. + * This is weird, but stems from legacy behaviour: the GC auto + * threshold was always essentially interpreted as if it was rounded up + * to the next multiple 256 of, so we retain this behaviour for now. */ - DIR *dir; - struct dirent *ent; - int auto_threshold; - int num_loose = 0; - int needed = 0; - const unsigned hexsz_loose = the_hash_algo->hexsz - 2; - char *path; - - path = repo_git_path(the_repository, "objects/17"); - dir = opendir(path); - free(path); - if (!dir) + int auto_threshold = DIV_ROUND_UP(limit, 256) * 256; + unsigned long loose_count; + + if (odb_source_loose_approximate_object_count(the_repository->objects->sources, + &loose_count) < 0) return 0; - auto_threshold = DIV_ROUND_UP(limit, 256); - while ((ent = readdir(dir)) != NULL) { - if (strspn(ent->d_name, "0123456789abcdef") != hexsz_loose || - ent->d_name[hexsz_loose] != '\0') - continue; - if (++num_loose > auto_threshold) { - needed = 1; - break; - } - } - closedir(dir); - return needed; + return loose_count > auto_threshold; } static struct packed_git *find_base_packs(struct string_list *packs, diff --git a/object-file.c b/object-file.c index a3ff7f586c91f3..da67e3c9ff576f 100644 --- a/object-file.c +++ b/object-file.c @@ -1868,6 +1868,47 @@ int odb_source_loose_for_each_object(struct odb_source *source, NULL, NULL, &data); } +int odb_source_loose_approximate_object_count(struct odb_source *source, + unsigned long *out) +{ + const unsigned hexsz = source->odb->repo->hash_algo->hexsz - 2; + unsigned long count = 0; + struct dirent *ent; + char *path = NULL; + DIR *dir = NULL; + int ret; + + path = xstrfmt("%s/17", source->path); + + dir = opendir(path); + if (!dir) { + if (errno == ENOENT) { + *out = 0; + ret = 0; + goto out; + } + + ret = error_errno("cannot open object shard '%s'", path); + goto out; + } + + while ((ent = readdir(dir)) != NULL) { + if (strspn(ent->d_name, "0123456789abcdef") != hexsz || + ent->d_name[hexsz] != '\0') + continue; + count++; + } + + *out = count * 256; + ret = 0; + +out: + if (dir) + closedir(dir); + free(path); + return ret; +} + static int append_loose_object(const struct object_id *oid, const char *path UNUSED, void *data) diff --git a/object-file.h b/object-file.h index ff6da6529640dc..b870ea9fa8deb2 100644 --- a/object-file.h +++ b/object-file.h @@ -139,6 +139,19 @@ int odb_source_loose_for_each_object(struct odb_source *source, void *cb_data, unsigned flags); +/* + * Count the number of loose objects in this source. + * + * The object count is approximated by opening a single sharding directory for + * loose objects and scanning its contents. The result is then extrapolated by + * 256. This should generally work as a reasonable estimate given that the + * object hash is supposed to be indistinguishable from random. + * + * Returns 0 on success, a negative error code otherwise. + */ +int odb_source_loose_approximate_object_count(struct odb_source *source, + unsigned long *out); + /** * format_object_header() is a thin wrapper around s xsnprintf() that * writes the initial " " part of the loose object From 2b24db1110150138ac1e09bc60d9ae5245909625 Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Thu, 12 Mar 2026 09:42:59 +0100 Subject: [PATCH 21/24] object-file: generalize counting objects Generalize the function introduced in the preceding commit to not only be able to approximate the number of loose objects, but to also provide an accurate count. The behaviour can be toggled via a new flag. Signed-off-by: Patrick Steinhardt Signed-off-by: Junio C Hamano --- builtin/gc.c | 5 +++-- object-file.c | 59 +++++++++++++++++++++++++++++++++------------------ object-file.h | 5 +++-- odb.h | 9 ++++++++ 4 files changed, 53 insertions(+), 25 deletions(-) diff --git a/builtin/gc.c b/builtin/gc.c index a08c7554cb1374..3a64d28da81b61 100644 --- a/builtin/gc.c +++ b/builtin/gc.c @@ -474,8 +474,9 @@ static int too_many_loose_objects(int limit) int auto_threshold = DIV_ROUND_UP(limit, 256) * 256; unsigned long loose_count; - if (odb_source_loose_approximate_object_count(the_repository->objects->sources, - &loose_count) < 0) + if (odb_source_loose_count_objects(the_repository->objects->sources, + ODB_COUNT_OBJECTS_APPROXIMATE, + &loose_count) < 0) return 0; return loose_count > auto_threshold; diff --git a/object-file.c b/object-file.c index da67e3c9ff576f..569ce6eaed95d4 100644 --- a/object-file.c +++ b/object-file.c @@ -1868,40 +1868,57 @@ int odb_source_loose_for_each_object(struct odb_source *source, NULL, NULL, &data); } -int odb_source_loose_approximate_object_count(struct odb_source *source, - unsigned long *out) +static int count_loose_object(const struct object_id *oid UNUSED, + struct object_info *oi UNUSED, + void *payload) +{ + unsigned long *count = payload; + (*count)++; + return 0; +} + +int odb_source_loose_count_objects(struct odb_source *source, + enum odb_count_objects_flags flags, + unsigned long *out) { const unsigned hexsz = source->odb->repo->hash_algo->hexsz - 2; - unsigned long count = 0; - struct dirent *ent; char *path = NULL; DIR *dir = NULL; int ret; - path = xstrfmt("%s/17", source->path); + if (flags & ODB_COUNT_OBJECTS_APPROXIMATE) { + unsigned long count = 0; + struct dirent *ent; - dir = opendir(path); - if (!dir) { - if (errno == ENOENT) { - *out = 0; - ret = 0; + path = xstrfmt("%s/17", source->path); + + dir = opendir(path); + if (!dir) { + if (errno == ENOENT) { + *out = 0; + ret = 0; + goto out; + } + + ret = error_errno("cannot open object shard '%s'", path); goto out; } - ret = error_errno("cannot open object shard '%s'", path); - goto out; - } + while ((ent = readdir(dir)) != NULL) { + if (strspn(ent->d_name, "0123456789abcdef") != hexsz || + ent->d_name[hexsz] != '\0') + continue; + count++; + } - while ((ent = readdir(dir)) != NULL) { - if (strspn(ent->d_name, "0123456789abcdef") != hexsz || - ent->d_name[hexsz] != '\0') - continue; - count++; + *out = count * 256; + ret = 0; + } else { + *out = 0; + ret = odb_source_loose_for_each_object(source, NULL, count_loose_object, + out, 0); } - *out = count * 256; - ret = 0; - out: if (dir) closedir(dir); diff --git a/object-file.h b/object-file.h index b870ea9fa8deb2..f8d8805a18cc8c 100644 --- a/object-file.h +++ b/object-file.h @@ -149,8 +149,9 @@ int odb_source_loose_for_each_object(struct odb_source *source, * * Returns 0 on success, a negative error code otherwise. */ -int odb_source_loose_approximate_object_count(struct odb_source *source, - unsigned long *out); +int odb_source_loose_count_objects(struct odb_source *source, + enum odb_count_objects_flags flags, + unsigned long *out); /** * format_object_header() is a thin wrapper around s xsnprintf() that diff --git a/odb.h b/odb.h index 7a583e38732b7d..e6057477f624cd 100644 --- a/odb.h +++ b/odb.h @@ -500,6 +500,15 @@ int odb_for_each_object(struct object_database *odb, void *cb_data, unsigned flags); +enum odb_count_objects_flags { + /* + * Instead of providing an accurate count, allow the number of objects + * to be approximated. Details of how this approximation works are + * subject to the specific source's implementation. + */ + ODB_COUNT_OBJECTS_APPROXIMATE = (1 << 0), +}; + enum { /* * By default, `odb_write_object()` does not actually write anything From b259f2175b0ccd5574fc6b06b8ec5cbeaa860610 Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Thu, 12 Mar 2026 09:43:00 +0100 Subject: [PATCH 22/24] odb/source: introduce generic object counting Introduce generic object counting on the object database source level with a new backend-specific callback function. Signed-off-by: Patrick Steinhardt Signed-off-by: Junio C Hamano --- odb/source-files.c | 30 ++++++++++++++++++++++++++++++ odb/source.h | 27 +++++++++++++++++++++++++++ packfile.c | 4 ++-- packfile.h | 1 + 4 files changed, 60 insertions(+), 2 deletions(-) diff --git a/odb/source-files.c b/odb/source-files.c index 14cb9adecadd99..c08d8993e378a0 100644 --- a/odb/source-files.c +++ b/odb/source-files.c @@ -93,6 +93,35 @@ static int odb_source_files_for_each_object(struct odb_source *source, return 0; } +static int odb_source_files_count_objects(struct odb_source *source, + enum odb_count_objects_flags flags, + unsigned long *out) +{ + struct odb_source_files *files = odb_source_files_downcast(source); + unsigned long count; + int ret; + + ret = packfile_store_count_objects(files->packed, flags, &count); + if (ret < 0) + goto out; + + if (!(flags & ODB_COUNT_OBJECTS_APPROXIMATE)) { + unsigned long loose_count; + + ret = odb_source_loose_count_objects(source, flags, &loose_count); + if (ret < 0) + goto out; + + count += loose_count; + } + + *out = count; + ret = 0; + +out: + return ret; +} + static int odb_source_files_freshen_object(struct odb_source *source, const struct object_id *oid) { @@ -220,6 +249,7 @@ struct odb_source_files *odb_source_files_new(struct object_database *odb, files->base.read_object_info = odb_source_files_read_object_info; files->base.read_object_stream = odb_source_files_read_object_stream; files->base.for_each_object = odb_source_files_for_each_object; + files->base.count_objects = odb_source_files_count_objects; files->base.freshen_object = odb_source_files_freshen_object; files->base.write_object = odb_source_files_write_object; files->base.write_object_stream = odb_source_files_write_object_stream; diff --git a/odb/source.h b/odb/source.h index a1fd9dd9209137..96c906e7a1b350 100644 --- a/odb/source.h +++ b/odb/source.h @@ -142,6 +142,21 @@ struct odb_source { void *cb_data, unsigned flags); + /* + * This callback is expected to count objects in the given object + * database source. The callback function does not have to guarantee + * that only unique objects are counted. The result shall be assigned + * to the `out` pointer. + * + * Accepts `enum odb_count_objects_flag` flags to alter the behaviour. + * + * The callback is expected to return 0 on success, or a negative error + * code otherwise. + */ + int (*count_objects)(struct odb_source *source, + enum odb_count_objects_flags flags, + unsigned long *out); + /* * This callback is expected to freshen the given object so that its * last access time is set to the current time. This is used to ensure @@ -333,6 +348,18 @@ static inline int odb_source_for_each_object(struct odb_source *source, return source->for_each_object(source, request, cb, cb_data, flags); } +/* + * Count the number of objects in the given object database source. + * + * Returns 0 on success, a negative error code otherwise. + */ +static inline int odb_source_count_objects(struct odb_source *source, + enum odb_count_objects_flags flags, + unsigned long *out) +{ + return source->count_objects(source, flags, out); +} + /* * Freshen an object in the object database by updating its timestamp. * Returns 1 in case the object has been freshened, 0 in case the object does diff --git a/packfile.c b/packfile.c index 1ee5dd3da35c07..8ee462303afa79 100644 --- a/packfile.c +++ b/packfile.c @@ -1102,6 +1102,7 @@ struct packfile_list_entry *packfile_store_get_packs(struct packfile_store *stor } int packfile_store_count_objects(struct packfile_store *store, + enum odb_count_objects_flags flags UNUSED, unsigned long *out) { struct packfile_list_entry *e; @@ -1146,10 +1147,9 @@ unsigned long repo_approximate_object_count(struct repository *r) odb_prepare_alternates(r->objects); for (source = r->objects->sources; source; source = source->next) { - struct odb_source_files *files = odb_source_files_downcast(source); unsigned long c; - if (!packfile_store_count_objects(files->packed, &c)) + if (!odb_source_count_objects(source, ODB_COUNT_OBJECTS_APPROXIMATE, &c)) count += c; } diff --git a/packfile.h b/packfile.h index 1da8c729cba131..74b6bc58c5a6b7 100644 --- a/packfile.h +++ b/packfile.h @@ -275,6 +275,7 @@ enum kept_pack_type { * Return 0 on success, a negative error code otherwise. */ int packfile_store_count_objects(struct packfile_store *store, + enum odb_count_objects_flags flags, unsigned long *out); /* From 6801ffd37df8420917e1feaf03b5f7c175798bff Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Thu, 12 Mar 2026 09:43:01 +0100 Subject: [PATCH 23/24] odb: introduce generic object counting Similar to the preceding commit, introduce counting of objects on the object database level, replacing the logic that we have in `repo_approximate_object_count()`. Note that the function knows to cache the object count. It's unclear whether this cache is really required as we shouldn't have that many cases where we count objects repeatedly. But to be on the safe side the caching mechanism is retained, with the only excepting being that we also have to use the passed flags as caching key. Signed-off-by: Patrick Steinhardt Signed-off-by: Junio C Hamano --- builtin/gc.c | 6 +++++- commit-graph.c | 3 ++- object-name.c | 6 +++++- odb.c | 37 ++++++++++++++++++++++++++++++++++++- odb.h | 19 ++++++++++++++++--- packfile.c | 27 --------------------------- packfile.h | 6 ------ 7 files changed, 64 insertions(+), 40 deletions(-) diff --git a/builtin/gc.c b/builtin/gc.c index 3a64d28da81b61..cb9ca89a974819 100644 --- a/builtin/gc.c +++ b/builtin/gc.c @@ -574,9 +574,13 @@ static uint64_t total_ram(void) static uint64_t estimate_repack_memory(struct gc_config *cfg, struct packed_git *pack) { - unsigned long nr_objects = repo_approximate_object_count(the_repository); + unsigned long nr_objects; size_t os_cache, heap; + if (odb_count_objects(the_repository->objects, + ODB_COUNT_OBJECTS_APPROXIMATE, &nr_objects) < 0) + return 0; + if (!pack || !nr_objects) return 0; diff --git a/commit-graph.c b/commit-graph.c index f8e24145a513b0..c0300033304404 100644 --- a/commit-graph.c +++ b/commit-graph.c @@ -2607,7 +2607,8 @@ int write_commit_graph(struct odb_source *source, replace = ctx.opts->split_flags & COMMIT_GRAPH_SPLIT_REPLACE; } - ctx.approx_nr_objects = repo_approximate_object_count(r); + if (odb_count_objects(r->objects, ODB_COUNT_OBJECTS_APPROXIMATE, &ctx.approx_nr_objects) < 0) + ctx.approx_nr_objects = 0; if (ctx.append && g) { for (i = 0; i < g->num_commits; i++) { diff --git a/object-name.c b/object-name.c index 7b14c3bf9b9b48..e5adec4c9d5084 100644 --- a/object-name.c +++ b/object-name.c @@ -837,7 +837,11 @@ int repo_find_unique_abbrev_r(struct repository *r, char *hex, const unsigned hexsz = algo->hexsz; if (len < 0) { - unsigned long count = repo_approximate_object_count(r); + unsigned long count; + + if (odb_count_objects(r->objects, ODB_COUNT_OBJECTS_APPROXIMATE, &count) < 0) + count = 0; + /* * Add one because the MSB only tells us the highest bit set, * not including the value of all the _other_ bits (so "15" diff --git a/odb.c b/odb.c index 84a31084d3884b..350e23f3c0798d 100644 --- a/odb.c +++ b/odb.c @@ -917,6 +917,41 @@ int odb_for_each_object(struct object_database *odb, return 0; } +int odb_count_objects(struct object_database *odb, + enum odb_count_objects_flags flags, + unsigned long *out) +{ + struct odb_source *source; + unsigned long count = 0; + int ret; + + if (odb->object_count_valid && odb->object_count_flags == flags) { + *out = odb->object_count; + return 0; + } + + odb_prepare_alternates(odb); + for (source = odb->sources; source; source = source->next) { + unsigned long c; + + ret = odb_source_count_objects(source, flags, &c); + if (ret < 0) + goto out; + + count += c; + } + + odb->object_count = count; + odb->object_count_valid = 1; + odb->object_count_flags = flags; + + *out = count; + ret = 0; + +out: + return ret; +} + void odb_assert_oid_type(struct object_database *odb, const struct object_id *oid, enum object_type expect) { @@ -1030,7 +1065,7 @@ void odb_reprepare(struct object_database *o) for (source = o->sources; source; source = source->next) odb_source_reprepare(source); - o->approximate_object_count_valid = 0; + o->object_count_valid = 0; obj_read_unlock(); } diff --git a/odb.h b/odb.h index e6057477f624cd..9aee260105ae54 100644 --- a/odb.h +++ b/odb.h @@ -110,10 +110,11 @@ struct object_database { /* * A fast, rough count of the number of objects in the repository. * These two fields are not meant for direct access. Use - * repo_approximate_object_count() instead. + * odb_count_objects() instead. */ - unsigned long approximate_object_count; - unsigned approximate_object_count_valid : 1; + unsigned long object_count; + unsigned object_count_flags; + unsigned object_count_valid : 1; /* * Submodule source paths that will be added as additional sources to @@ -509,6 +510,18 @@ enum odb_count_objects_flags { ODB_COUNT_OBJECTS_APPROXIMATE = (1 << 0), }; +/* + * Count the number of objects in the given object database. This object count + * may double-count objects that are stored in multiple backends, or which are + * stored multiple times in a single backend. + * + * Returns 0 on success, a negative error code otherwise. The number of objects + * will be assigned to the `out` pointer on success. + */ +int odb_count_objects(struct object_database *odb, + enum odb_count_objects_flags flags, + unsigned long *out); + enum { /* * By default, `odb_write_object()` does not actually write anything diff --git a/packfile.c b/packfile.c index 8ee462303afa79..d4de9f3ffe831e 100644 --- a/packfile.c +++ b/packfile.c @@ -1132,33 +1132,6 @@ int packfile_store_count_objects(struct packfile_store *store, return ret; } -/* - * Give a fast, rough count of the number of objects in the repository. This - * ignores loose objects completely. If you have a lot of them, then either - * you should repack because your performance will be awful, or they are - * all unreachable objects about to be pruned, in which case they're not really - * interesting as a measure of repo size in the first place. - */ -unsigned long repo_approximate_object_count(struct repository *r) -{ - if (!r->objects->approximate_object_count_valid) { - struct odb_source *source; - unsigned long count = 0; - - odb_prepare_alternates(r->objects); - for (source = r->objects->sources; source; source = source->next) { - unsigned long c; - - if (!odb_source_count_objects(source, ODB_COUNT_OBJECTS_APPROXIMATE, &c)) - count += c; - } - - r->objects->approximate_object_count = count; - r->objects->approximate_object_count_valid = 1; - } - return r->objects->approximate_object_count; -} - unsigned long unpack_object_header_buffer(const unsigned char *buf, unsigned long len, enum object_type *type, unsigned long *sizep) { diff --git a/packfile.h b/packfile.h index 74b6bc58c5a6b7..a16ec3950d2507 100644 --- a/packfile.h +++ b/packfile.h @@ -375,12 +375,6 @@ int packfile_store_for_each_object(struct packfile_store *store, #define PACKDIR_FILE_GARBAGE 4 extern void (*report_garbage)(unsigned seen_bits, const char *path); -/* - * Give a rough count of objects in the repository. This sacrifices accuracy - * for speed. - */ -unsigned long repo_approximate_object_count(struct repository *r); - void pack_report(struct repository *repo); /* From 41688c1a2312f62f44435e1a6d03b4b904b5b0ec Mon Sep 17 00:00:00 2001 From: Junio C Hamano Date: Wed, 25 Mar 2026 12:57:55 -0700 Subject: [PATCH 24/24] The 21st batch Signed-off-by: Junio C Hamano --- Documentation/RelNotes/2.54.0.adoc | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Documentation/RelNotes/2.54.0.adoc b/Documentation/RelNotes/2.54.0.adoc index 7de9f7e7ecf6d2..25affd6a981d61 100644 --- a/Documentation/RelNotes/2.54.0.adoc +++ b/Documentation/RelNotes/2.54.0.adoc @@ -227,6 +227,10 @@ Performance, Internal Implementation, Development Support etc. * Add a coccinelle rule to break the build when "struct strbuf" gets passed by value. + * Further work on incremental repacking using MIDX/bitmap + + * The logic to count objects has been cleaned up. + Fixes since v2.53 -----------------