From 8921b9d031ff7316f4a66e0763f86698b2d33610 Mon Sep 17 00:00:00 2001 From: koleoptil Date: Sun, 15 Feb 2026 14:08:28 +0100 Subject: [PATCH 1/8] Add --layout-file option and forward layout_file into ibf/ibf_helper --- include/ibf.hpp | 6 ++++-- src/ibf.cpp | 25 ++++++++++++++++++------- src/needle.cpp | 24 +++++++++--------------- 3 files changed, 31 insertions(+), 24 deletions(-) diff --git a/include/ibf.hpp b/include/ibf.hpp index ae92f23..d2789d5 100644 --- a/include/ibf.hpp +++ b/include/ibf.hpp @@ -32,7 +32,8 @@ std::vector ibf(std::vector const & sequence_fi std::vector & fpr, std::vector & cutoffs, std::filesystem::path const & expression_by_genome_file = "", - size_t num_hash = 1); + size_t num_hash = 1, + std::filesystem::path const & layout_file = {}); /*! \brief Creates IBFs based on the minimiser files * \param minimiser_files A vector of minimiser file paths. @@ -48,4 +49,5 @@ std::vector ibf(std::vector const & minimiser_f estimate_ibf_arguments & ibf_args, std::vector & fpr, std::filesystem::path const & expression_by_genome_file = "", - size_t num_hash = 1); + size_t num_hash = 1, + std::filesystem::path const & layout_file = {}); diff --git a/src/ibf.cpp b/src/ibf.cpp index 87aacef..a9ec598 100644 --- a/src/ibf.cpp +++ b/src/ibf.cpp @@ -201,8 +201,11 @@ void ibf_helper(std::vector const & minimiser_files, std::vector & cutoffs, size_t num_hash = 1, std::filesystem::path const & expression_by_genome_file = "", - minimiser_file_input_arguments const & minimiser_args = {}) + minimiser_file_input_arguments const & minimiser_args = {}, + std::filesystem::path const & layout_file = {}) { + (void)layout_file; // suppress unused-parameter warning; TODO: use layout_file to parse or call chopper + size_t const num_files = [&]() constexpr { if constexpr (minimiser_files_given) @@ -492,8 +495,11 @@ std::vector ibf(std::vector const & sequence_fi std::vector & fpr, std::vector & cutoffs, std::filesystem::path const & expression_by_genome_file, - size_t num_hash) + size_t num_hash, + std::filesystem::path const & layout_file) { + (void)layout_file; // TODO: use layout_file (parse or call chopper) in ibf_helper + // Declarations robin_hood::unordered_node_map hash_table{}; // Storage for minimisers @@ -522,7 +528,8 @@ std::vector ibf(std::vector const & sequence_fi cutoffs, num_hash, expression_by_genome_file, - minimiser_args); + minimiser_args, + layout_file); else ibf_helper(sequence_files, fpr, @@ -530,7 +537,8 @@ std::vector ibf(std::vector const & sequence_fi cutoffs, num_hash, expression_by_genome_file, - minimiser_args); + minimiser_args, + layout_file); store_args(ibf_args, filenames::data(ibf_args.path_out)); @@ -542,8 +550,11 @@ std::vector ibf(std::vector const & minimiser_f estimate_ibf_arguments & ibf_args, std::vector & fpr, std::filesystem::path const & expression_by_genome_file, - size_t num_hash) + size_t num_hash, + std::filesystem::path const & layout_file) { + (void)layout_file; // TODO: use layout_file in ibf_helper + check_expression(ibf_args.expression_thresholds, ibf_args.number_expression_thresholds, expression_by_genome_file); check_fpr(ibf_args.number_expression_thresholds, fpr); @@ -551,9 +562,9 @@ std::vector ibf(std::vector const & minimiser_f std::vector cutoffs{}; if (ibf_args.samplewise) - ibf_helper(minimiser_files, fpr, ibf_args, cutoffs, num_hash, expression_by_genome_file); + ibf_helper(minimiser_files, fpr, ibf_args, cutoffs, num_hash, expression_by_genome_file, minimiser_file_input_arguments{}, layout_file); else - ibf_helper(minimiser_files, fpr, ibf_args, cutoffs, num_hash, expression_by_genome_file); + ibf_helper(minimiser_files, fpr, ibf_args, cutoffs, num_hash, expression_by_genome_file, minimiser_file_input_arguments{}, layout_file); store_args(ibf_args, filenames::data(ibf_args.path_out)); diff --git a/src/needle.cpp b/src/needle.cpp index fa0222f..db3b1a3 100644 --- a/src/needle.cpp +++ b/src/needle.cpp @@ -319,6 +319,7 @@ int run_needle_ibf(sharg::parser & parser) std::filesystem::path expression_by_genome_file = ""; std::vector fpr{}; // The fpr of one IBF, can be different for different expression levels std::vector cutoffs{}; + std::filesystem::path layout_file{}; initialise_minimiser_arguments(parser, ibf_args); initialise_arguments_ibf(parser, ibf_args, num_hash, fpr); @@ -339,24 +340,17 @@ int run_needle_ibf(sharg::parser & parser) parser.add_option(expression_by_genome_file, sharg::config{.short_id = '\0', .long_id = "levels-by-genome", - .description = "Sequence file containing minimizers, only those minimizers will be " - "considered for determining the expression thresholds.", + .description = "Sequence file containing minimizers to determine expression thresholds.", .validator = sharg::input_file_validator{}}); - parser.add_flag(minimiser_args.ram_friendly, - sharg::config{.short_id = '\0', - .long_id = "ram", - .description = "When multithreading, prioritize lower RAM usage over speed."}); - - parsing(parser, ibf_args); - if (sequence_files[0].extension() == ".lst") - { - input_file = sequence_files[0]; - sequence_files = {}; - read_input_file_list(sequence_files, input_file); - } + parser.add_option(layout_file, + sharg::config{.short_id = '\0', + .long_id = "layout-file", + .description = "Path to a precomputed chopper layout file.", + .validator = sharg::input_file_validator{}}); + parsing(parser, ibf_args); - ibf(sequence_files, ibf_args, minimiser_args, fpr, cutoffs, expression_by_genome_file, num_hash); + ibf(sequence_files, ibf_args, minimiser_args, fpr, cutoffs, expression_by_genome_file, num_hash, layout_file); return 0; } From 48a0e6301d92ac9f6c6356f33eb9b3724275ed13 Mon Sep 17 00:00:00 2001 From: koleoptil Date: Mon, 16 Feb 2026 12:16:46 +0100 Subject: [PATCH 2/8] Added layout option to HIBF construction --- src/ibf.cpp | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/src/ibf.cpp b/src/ibf.cpp index a9ec598..03abf40 100644 --- a/src/ibf.cpp +++ b/src/ibf.cpp @@ -11,7 +11,8 @@ #include #include #include - +#include +#include #include "misc/calculate_cutoff.hpp" #include "misc/check_cutoffs_samples.hpp" #include "misc/check_for_fasta_format.hpp" @@ -465,8 +466,23 @@ void ibf_helper(std::vector const & minimiser_files, .threads = ibf_args.threads, .track_occupancy = true}; - // Construct the HIBF - seqan::hibf::hierarchical_interleaved_bloom_filter hibf{hibf_config}; + // Construct the HIBF — prefer the ctor that consumes a precomputed layout if provided. + auto hibf = [&]() + { + if (!layout_file.empty()) + { + seqan::hibf::layout::layout layout_obj; + std::ifstream in{layout_file, std::ios::binary}; + if (!in) + throw std::runtime_error{"Could not open layout file: " + layout_file.string()}; + layout_obj.read_from(in); + return seqan::hibf::hierarchical_interleaved_bloom_filter{hibf_config, layout_obj}; + } + else + { + return seqan::hibf::hierarchical_interleaved_bloom_filter{hibf_config}; + } + }(); // Store the HIBF std::filesystem::path const filename = filenames::ibf(ibf_args.path_out, samplewise, 0, ibf_args); From 3bbef092915c88ebb5687021de0936c27927263d Mon Sep 17 00:00:00 2001 From: Enrico Seiler Date: Mon, 16 Feb 2026 15:00:14 +0100 Subject: [PATCH 3/8] wrap chopper fast layout --- .clang-format | 6 ++- include/ibf.hpp | 4 +- src/CMakeLists.txt | 1 + src/ibf.cpp | 104 ++++++++++++++++++++++++++++++++++++--------- src/needle.cpp | 28 ++++++------ 5 files changed, 105 insertions(+), 38 deletions(-) diff --git a/.clang-format b/.clang-format index 1f92e78..37a6340 100644 --- a/.clang-format +++ b/.clang-format @@ -147,10 +147,14 @@ IncludeCategories: Priority: 7 SortPriority: 0 CaseSensitive: false - - Regex: '.*' + - Regex: ' ibf(std::vector const & sequence_fi std::vector & cutoffs, std::filesystem::path const & expression_by_genome_file = "", size_t num_hash = 1, - std::filesystem::path const & layout_file = {}); + bool const fast_layout = false); /*! \brief Creates IBFs based on the minimiser files * \param minimiser_files A vector of minimiser file paths. @@ -50,4 +50,4 @@ std::vector ibf(std::vector const & minimiser_f std::vector & fpr, std::filesystem::path const & expression_by_genome_file = "", size_t num_hash = 1, - std::filesystem::path const & layout_file = {}); + bool const fast_layout = false); diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 338f33e..eca37a2 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -26,6 +26,7 @@ add_library ("${PROJECT_NAME}_lib" STATIC ${NEEDLE_SOURCE_FILES}) target_link_libraries ("${PROJECT_NAME}_lib" PUBLIC seqan3::seqan3) target_link_libraries ("${PROJECT_NAME}_lib" PUBLIC sharg::sharg) target_link_libraries ("${PROJECT_NAME}_lib" PUBLIC seqan::hibf) +target_link_libraries ("${PROJECT_NAME}_lib" PUBLIC chopper::chopper) target_include_directories ("${PROJECT_NAME}_lib" PUBLIC ../include) add_subdirectory (layout) diff --git a/src/ibf.cpp b/src/ibf.cpp index 03abf40..df61cb8 100644 --- a/src/ibf.cpp +++ b/src/ibf.cpp @@ -11,8 +11,13 @@ #include #include #include -#include #include +#include +#include +#include + +#include + #include "misc/calculate_cutoff.hpp" #include "misc/check_cutoffs_samples.hpp" #include "misc/check_for_fasta_format.hpp" @@ -22,6 +27,17 @@ #include "misc/get_include_set_table.hpp" #include "misc/stream.hpp" +namespace chopper::layout +{ +// Foward declaration for linking. fast_layout is not declared in execute.hpp +void fast_layout(chopper::configuration const & config, + std::vector const & positions, + std::vector const & cardinalities, + std::vector const & sketches, + std::vector const & minHash_sketches, + seqan::hibf::layout::layout & hibf_layout); +} // namespace chopper::layout + // Check number of expression levels, sort expression levels void check_expression(std::vector & expression_thresholds, uint8_t & number_expression_thresholds, @@ -194,6 +210,47 @@ void store_fpr_information(seqan::hibf::hierarchical_interleaved_bloom_filter co outfile << "/\n"; } +// Full Chopper config with defaults: +// .partitioning_approach = chopper::partitioning_scheme::lsh_sim, +// .data_file = {}, +// .debug = false, +// .output_filename = "layout.txt", +// .output_timings = {}, +// .k = ibf_args.k, +// .window_size = static_cast(ibf_args.w_size.get()), +// .precomputed_files = false, +// .sketch_directory = {}, +// .disable_sketch_output = false, +// .determine_best_tmax = false, +// .force_all_binnings = false, +// .output_verbose_statistics = false, +// .hibf_config = hibf_config + +seqan::hibf::layout::layout compute_fast_layout(seqan::hibf::config const & hibf_config, + estimate_ibf_arguments const & ibf_args) +{ + seqan::hibf::layout::layout hibf_layout{}; + + chopper::configuration chopper_config{.data_file = {}, // Not initialised in cfg, so it must be here. + .k = ibf_args.k, + .window_size = static_cast(ibf_args.w_size.get()), + .hibf_config = hibf_config}; + + std::vector sketches{}; + std::vector minHash_sketches{}; + std::vector cardinalities; + + seqan::hibf::sketch::compute_sketches(hibf_config, sketches, minHash_sketches); + seqan::hibf::sketch::estimate_kmer_counts(sketches, cardinalities); + chopper::layout::fast_layout(chopper_config, + seqan::hibf::iota_vector(sketches.size()), + cardinalities, + sketches, + minHash_sketches, + hibf_layout); + return hibf_layout; +} + // Actual ibf construction template void ibf_helper(std::vector const & minimiser_files, @@ -203,10 +260,8 @@ void ibf_helper(std::vector const & minimiser_files, size_t num_hash = 1, std::filesystem::path const & expression_by_genome_file = "", minimiser_file_input_arguments const & minimiser_args = {}, - std::filesystem::path const & layout_file = {}) + bool const fast_layout = false) { - (void)layout_file; // suppress unused-parameter warning; TODO: use layout_file to parse or call chopper - size_t const num_files = [&]() constexpr { if constexpr (minimiser_files_given) @@ -465,18 +520,15 @@ void ibf_helper(std::vector const & minimiser_files, .maximum_fpr = fprs[0], .threads = ibf_args.threads, .track_occupancy = true}; + hibf_config.validate_and_set_defaults(); // Construct the HIBF — prefer the ctor that consumes a precomputed layout if provided. auto hibf = [&]() { - if (!layout_file.empty()) + if (fast_layout) { - seqan::hibf::layout::layout layout_obj; - std::ifstream in{layout_file, std::ios::binary}; - if (!in) - throw std::runtime_error{"Could not open layout file: " + layout_file.string()}; - layout_obj.read_from(in); - return seqan::hibf::hierarchical_interleaved_bloom_filter{hibf_config, layout_obj}; + seqan::hibf::layout::layout hibf_layout = compute_fast_layout(hibf_config, ibf_args); + return seqan::hibf::hierarchical_interleaved_bloom_filter{hibf_config, hibf_layout}; } else { @@ -512,10 +564,8 @@ std::vector ibf(std::vector const & sequence_fi std::vector & cutoffs, std::filesystem::path const & expression_by_genome_file, size_t num_hash, - std::filesystem::path const & layout_file) + bool const fast_layout) { - (void)layout_file; // TODO: use layout_file (parse or call chopper) in ibf_helper - // Declarations robin_hood::unordered_node_map hash_table{}; // Storage for minimisers @@ -545,7 +595,7 @@ std::vector ibf(std::vector const & sequence_fi num_hash, expression_by_genome_file, minimiser_args, - layout_file); + fast_layout); else ibf_helper(sequence_files, fpr, @@ -554,7 +604,7 @@ std::vector ibf(std::vector const & sequence_fi num_hash, expression_by_genome_file, minimiser_args, - layout_file); + fast_layout); store_args(ibf_args, filenames::data(ibf_args.path_out)); @@ -567,10 +617,8 @@ std::vector ibf(std::vector const & minimiser_f std::vector & fpr, std::filesystem::path const & expression_by_genome_file, size_t num_hash, - std::filesystem::path const & layout_file) + bool const fast_layout) { - (void)layout_file; // TODO: use layout_file in ibf_helper - check_expression(ibf_args.expression_thresholds, ibf_args.number_expression_thresholds, expression_by_genome_file); check_fpr(ibf_args.number_expression_thresholds, fpr); @@ -578,9 +626,23 @@ std::vector ibf(std::vector const & minimiser_f std::vector cutoffs{}; if (ibf_args.samplewise) - ibf_helper(minimiser_files, fpr, ibf_args, cutoffs, num_hash, expression_by_genome_file, minimiser_file_input_arguments{}, layout_file); + ibf_helper(minimiser_files, + fpr, + ibf_args, + cutoffs, + num_hash, + expression_by_genome_file, + minimiser_file_input_arguments{}, + fast_layout); else - ibf_helper(minimiser_files, fpr, ibf_args, cutoffs, num_hash, expression_by_genome_file, minimiser_file_input_arguments{}, layout_file); + ibf_helper(minimiser_files, + fpr, + ibf_args, + cutoffs, + num_hash, + expression_by_genome_file, + minimiser_file_input_arguments{}, + fast_layout); store_args(ibf_args, filenames::data(ibf_args.path_out)); diff --git a/src/needle.cpp b/src/needle.cpp index db3b1a3..22c6c11 100644 --- a/src/needle.cpp +++ b/src/needle.cpp @@ -319,7 +319,7 @@ int run_needle_ibf(sharg::parser & parser) std::filesystem::path expression_by_genome_file = ""; std::vector fpr{}; // The fpr of one IBF, can be different for different expression levels std::vector cutoffs{}; - std::filesystem::path layout_file{}; + bool fast_layout{}; initialise_minimiser_arguments(parser, ibf_args); initialise_arguments_ibf(parser, ibf_args, num_hash, fpr); @@ -337,20 +337,20 @@ int run_needle_ibf(sharg::parser & parser) .long_id = "experiment-names", .description = "Use experiment names from the given txt file."}); - parser.add_option(expression_by_genome_file, - sharg::config{.short_id = '\0', - .long_id = "levels-by-genome", - .description = "Sequence file containing minimizers to determine expression thresholds.", - .validator = sharg::input_file_validator{}}); - - parser.add_option(layout_file, - sharg::config{.short_id = '\0', - .long_id = "layout-file", - .description = "Path to a precomputed chopper layout file.", - .validator = sharg::input_file_validator{}}); - parsing(parser, ibf_args); + parser.add_option( + expression_by_genome_file, + sharg::config{.short_id = '\0', + .long_id = "levels-by-genome", + .description = "Sequence file containing minimizers to determine expression thresholds.", + .validator = sharg::input_file_validator{}}); + + parser.add_flag(fast_layout, + sharg::config{.short_id = '\0', // + .long_id = "fast-layout", + .description = "Use fast layout algorithm."}); + parsing(parser, ibf_args); - ibf(sequence_files, ibf_args, minimiser_args, fpr, cutoffs, expression_by_genome_file, num_hash, layout_file); + ibf(sequence_files, ibf_args, minimiser_args, fpr, cutoffs, expression_by_genome_file, num_hash, fast_layout); return 0; } From 3d3643e96e24e4f71d0f9391e057efb58ec94a97 Mon Sep 17 00:00:00 2001 From: koleoptil Date: Thu, 12 Mar 2026 11:04:54 +0100 Subject: [PATCH 4/8] added per-user bin counts as output to needle minimiser --- include/shared.hpp | 2 + src/minimiser.cpp | 131 +++++++++++++++++++++++++++++++++++++++++++-- src/needle.cpp | 29 ++++++++-- 3 files changed, 153 insertions(+), 9 deletions(-) diff --git a/include/shared.hpp b/include/shared.hpp index c4042ed..0a43834 100644 --- a/include/shared.hpp +++ b/include/shared.hpp @@ -65,6 +65,8 @@ struct minimiser_file_input_arguments bool paired = false; // If true, than experiments are seen as paired-end experiments bool experiment_names = false; // Flag, if names of experiment should be stored in a txt file bool ram_friendly = false; + bool write_counts = false; + uint8_t number_expression_thresholds = 0; }; /*! \brief Function, loading arguments diff --git a/src/minimiser.cpp b/src/minimiser.cpp index 4c09971..ac33305 100644 --- a/src/minimiser.cpp +++ b/src/minimiser.cpp @@ -6,22 +6,36 @@ #include #include +#include +#include #include "misc/calculate_cutoff.hpp" #include "misc/check_cutoffs_samples.hpp" #include "misc/fill_hash_table.hpp" #include "misc/get_include_set_table.hpp" +#include "misc/get_expression_thresholds.hpp" #include "misc/stream.hpp" +// Add result struct +struct MinResult +{ + uint64_t count{}; + uint8_t cutoff{}; + std::vector expression_thresholds; + std::vector sizes; +}; + // Actuall minimiser calculation template -void calculate_minimiser(std::vector const & sequence_files, +MinResult calculate_minimiser(std::vector const & sequence_files, robin_hood::unordered_set const & include_set_table, robin_hood::unordered_set const & exclude_set_table, minimiser_arguments const & args, minimiser_file_input_arguments const & minimiser_args, unsigned const i, - std::vector & cutoffs) + std::vector & cutoffs, + uint8_t const number_expression_thresholds, + bool const write_counts) { robin_hood::unordered_node_map hash_table{}; // Storage for minimisers uint8_t cutoff{0}; @@ -88,6 +102,19 @@ void calculate_minimiser(std::vector const & sequence_fil write_stream(outfile, hash.first); write_stream(outfile, hash.second); } + + + uint64_t count = hash_table.size(); + + MinResult res; + res.count = count; + res.cutoff = cutoff; + + // If insert counts requested, compute expression thresholds (sizes are computed). + if (write_counts) + get_expression_thresholds(number_expression_thresholds, hash_table, res.expression_thresholds, res.sizes, robin_hood::unordered_set{}, cutoff, true); + + return res; } void minimiser(std::vector const & sequence_files, @@ -108,19 +135,39 @@ void minimiser(std::vector const & sequence_files, size_t const chunk_size = std::clamp(std::bit_ceil(minimiser_args.samples.size() / args.threads), 1u, 64u); + // Prepare container for per-user-bin counts + std::vector insert_counts(minimiser_args.samples.size(), 0); + std::vector> all_thresholds; + std::vector> all_sizes; + if (minimiser_args.write_counts) + { + all_thresholds.resize(minimiser_args.samples.size()); + all_sizes.resize(minimiser_args.samples.size()); + } + // Add minimisers to ibf if (minimiser_args.ram_friendly) { seqan3::contrib::bgzf_thread_count = args.threads; for (size_t i = 0; i < minimiser_args.samples.size(); i++) { - calculate_minimiser(sequence_files, + MinResult r = calculate_minimiser(sequence_files, include_set_table, exclude_set_table, args, minimiser_args, i, - cutoffs); + cutoffs, + minimiser_args.number_expression_thresholds, + minimiser_args.write_counts); + insert_counts[i] = r.count; + if (cutoffs.size() <= i) cutoffs.resize(minimiser_args.samples.size()); + cutoffs[i] = r.cutoff; + if (minimiser_args.write_counts) + { + all_thresholds[i] = std::move(r.expression_thresholds); + all_sizes[i] = std::move(r.sizes); + } } } else @@ -129,10 +176,84 @@ void minimiser(std::vector const & sequence_files, seqan3::contrib::bgzf_thread_count = 1u; omp_set_num_threads(args.threads); + std::vector results(minimiser_args.samples.size()); + #pragma omp parallel for schedule(dynamic, chunk_size) for (size_t i = 0; i < minimiser_args.samples.size(); i++) { - calculate_minimiser(sequence_files, include_set_table, exclude_set_table, args, minimiser_args, i, cutoffs); + results[i] = calculate_minimiser(sequence_files, include_set_table, exclude_set_table, args, minimiser_args, i, cutoffs, minimiser_args.number_expression_thresholds, minimiser_args.write_counts); + } + + for (size_t i = 0; i < minimiser_args.samples.size(); ++i) + { + insert_counts[i] = results[i].count; + if (cutoffs.size() <= i) cutoffs.resize(minimiser_args.samples.size()); + cutoffs[i] = results[i].cutoff; + if (minimiser_args.write_counts) + { + all_thresholds[i] = std::move(results[i].expression_thresholds); + all_sizes[i] = std::move(results[i].sizes); + } + } + } + + // If --write-counts is set, write thresholds.tsv and per-user-bin count + if (minimiser_args.write_counts) + { + // write thresholds.tsv + std::filesystem::path thr_path = std::filesystem::path{args.path_out} / "thresholds.tsv"; + std::ofstream thr_out{thr_path}; + thr_out << "file"; + for (uint8_t l = 0; l < minimiser_args.number_expression_thresholds; ++l) + thr_out << '\t' << "level_" << static_cast(l); + thr_out << '\n'; + + for (size_t i = 0, file_it = 0; i < minimiser_args.samples.size(); ++i) + { + thr_out << sequence_files[file_it].filename().string(); + for (uint8_t l = 0; l < minimiser_args.number_expression_thresholds; ++l) + { + uint16_t val = 0; + if (l < all_thresholds[i].size()) + val = all_thresholds[i][l]; + thr_out << '\t' << val; + } + thr_out << '\n'; + file_it += minimiser_args.samples[i]; + } + + // Write insert counts + // Compute per level number of inserts by threshold positions from get_expression_thresholds + std::filesystem::path sizes_path = std::filesystem::path{args.path_out} / "thresholds_inserts.tsv"; + std::ofstream sizes_out{sizes_path}; + + for (size_t i = 0, file_it = 0; i < minimiser_args.samples.size(); ++i) + { + sizes_out << sequence_files[file_it].filename().string(); + uint64_t prev_pos = 0; + size_t const L = static_cast(minimiser_args.number_expression_thresholds); + for (size_t l = 0; l < L; ++l) + { + uint64_t pos = 0; + if (l < all_sizes[i].size()) + pos = all_sizes[i][l]; + + uint64_t per_level = 0; + if (l + 1 < L) + { + per_level = (pos > prev_pos) ? (pos - prev_pos) : 0; + prev_pos = pos; + } + else + { + uint64_t total_inserts = (i < insert_counts.size()) ? insert_counts[i] : 0; + per_level = (total_inserts > prev_pos) ? (total_inserts - prev_pos) : 0; + } + + sizes_out << '\t' << per_level; + } + sizes_out << '\n'; + file_it += minimiser_args.samples[i]; } } } diff --git a/src/needle.cpp b/src/needle.cpp index 22c6c11..2b7872b 100644 --- a/src/needle.cpp +++ b/src/needle.cpp @@ -337,18 +337,29 @@ int run_needle_ibf(sharg::parser & parser) .long_id = "experiment-names", .description = "Use experiment names from the given txt file."}); - parser.add_option( - expression_by_genome_file, + parser.add_option(expression_by_genome_file, sharg::config{.short_id = '\0', .long_id = "levels-by-genome", - .description = "Sequence file containing minimizers to determine expression thresholds.", + .description = "Sequence file containing minimizers, only those minimizers will be " + "considered for determining the expression thresholds.", .validator = sharg::input_file_validator{}}); + parser.add_flag(minimiser_args.ram_friendly, + sharg::config{.short_id = '\0', + .long_id = "ram", + .description = "When multithreading, prioritize lower RAM usage over speed."}); + parser.add_flag(fast_layout, sharg::config{.short_id = '\0', // .long_id = "fast-layout", .description = "Use fast layout algorithm."}); parsing(parser, ibf_args); + if (sequence_files[0].extension() == ".lst") + { + input_file = sequence_files[0]; + sequence_files = {}; + read_input_file_list(sequence_files, input_file); + } ibf(sequence_files, ibf_args, minimiser_args, fpr, cutoffs, expression_by_genome_file, num_hash, fast_layout); @@ -454,7 +465,7 @@ int run_needle_ibf_min(sharg::parser & parser) .description = "Sequence file containing minimizers, only those minimizers will be " "considered for determining the expression thresholds.", .validator = sharg::input_file_validator{}}); - + initialise_arguments_ibf(parser, ibf_args, num_hash, fpr); parsing(parser, ibf_args); @@ -591,6 +602,16 @@ int run_needle_minimiser(sharg::parser & parser) .long_id = "ram", .description = "When multithreading, prioritize lower RAM usage over speed."}); + parser.add_flag(minimiser_args.write_counts, + sharg::config{.short_id = '\0', + .long_id = "write-counts", + .description = "Write per-user-bin minimiser counts, thresholds.tsv and preprocess.meta."}); + + parser.add_option(minimiser_args.number_expression_thresholds, + sharg::config{.short_id = 'l', + .long_id = "number_expression_thresholds", + .description = "Number of expression thresholds to produce when --write-counts is set."}); + parsing(parser, args); if (sequence_files[0].extension() == ".lst") { From 02769d579c4228ac764b79985ecae2a523f3944d Mon Sep 17 00:00:00 2001 From: "seqan-actions[bot]" Date: Fri, 27 Mar 2026 13:33:52 +0100 Subject: [PATCH 5/8] chore(lint): formatting --- src/minimiser.cpp | 65 +++++++++++++++++++++++++++++------------------ src/needle.cpp | 34 +++++++++++++------------ 2 files changed, 58 insertions(+), 41 deletions(-) diff --git a/src/minimiser.cpp b/src/minimiser.cpp index ac33305..e6875a0 100644 --- a/src/minimiser.cpp +++ b/src/minimiser.cpp @@ -4,16 +4,16 @@ #include "minimiser.hpp" +#include #include #include #include -#include #include "misc/calculate_cutoff.hpp" #include "misc/check_cutoffs_samples.hpp" #include "misc/fill_hash_table.hpp" -#include "misc/get_include_set_table.hpp" #include "misc/get_expression_thresholds.hpp" +#include "misc/get_include_set_table.hpp" #include "misc/stream.hpp" // Add result struct @@ -28,14 +28,14 @@ struct MinResult // Actuall minimiser calculation template MinResult calculate_minimiser(std::vector const & sequence_files, - robin_hood::unordered_set const & include_set_table, - robin_hood::unordered_set const & exclude_set_table, - minimiser_arguments const & args, - minimiser_file_input_arguments const & minimiser_args, - unsigned const i, - std::vector & cutoffs, - uint8_t const number_expression_thresholds, - bool const write_counts) + robin_hood::unordered_set const & include_set_table, + robin_hood::unordered_set const & exclude_set_table, + minimiser_arguments const & args, + minimiser_file_input_arguments const & minimiser_args, + unsigned const i, + std::vector & cutoffs, + uint8_t const number_expression_thresholds, + bool const write_counts) { robin_hood::unordered_node_map hash_table{}; // Storage for minimisers uint8_t cutoff{0}; @@ -103,7 +103,6 @@ MinResult calculate_minimiser(std::vector const & sequenc write_stream(outfile, hash.second); } - uint64_t count = hash_table.size(); MinResult res; @@ -112,7 +111,13 @@ MinResult calculate_minimiser(std::vector const & sequenc // If insert counts requested, compute expression thresholds (sizes are computed). if (write_counts) - get_expression_thresholds(number_expression_thresholds, hash_table, res.expression_thresholds, res.sizes, robin_hood::unordered_set{}, cutoff, true); + get_expression_thresholds(number_expression_thresholds, + hash_table, + res.expression_thresholds, + res.sizes, + robin_hood::unordered_set{}, + cutoff, + true); return res; } @@ -152,16 +157,17 @@ void minimiser(std::vector const & sequence_files, for (size_t i = 0; i < minimiser_args.samples.size(); i++) { MinResult r = calculate_minimiser(sequence_files, - include_set_table, - exclude_set_table, - args, - minimiser_args, - i, - cutoffs, - minimiser_args.number_expression_thresholds, - minimiser_args.write_counts); + include_set_table, + exclude_set_table, + args, + minimiser_args, + i, + cutoffs, + minimiser_args.number_expression_thresholds, + minimiser_args.write_counts); insert_counts[i] = r.count; - if (cutoffs.size() <= i) cutoffs.resize(minimiser_args.samples.size()); + if (cutoffs.size() <= i) + cutoffs.resize(minimiser_args.samples.size()); cutoffs[i] = r.cutoff; if (minimiser_args.write_counts) { @@ -181,13 +187,22 @@ void minimiser(std::vector const & sequence_files, #pragma omp parallel for schedule(dynamic, chunk_size) for (size_t i = 0; i < minimiser_args.samples.size(); i++) { - results[i] = calculate_minimiser(sequence_files, include_set_table, exclude_set_table, args, minimiser_args, i, cutoffs, minimiser_args.number_expression_thresholds, minimiser_args.write_counts); + results[i] = calculate_minimiser(sequence_files, + include_set_table, + exclude_set_table, + args, + minimiser_args, + i, + cutoffs, + minimiser_args.number_expression_thresholds, + minimiser_args.write_counts); } for (size_t i = 0; i < minimiser_args.samples.size(); ++i) { insert_counts[i] = results[i].count; - if (cutoffs.size() <= i) cutoffs.resize(minimiser_args.samples.size()); + if (cutoffs.size() <= i) + cutoffs.resize(minimiser_args.samples.size()); cutoffs[i] = results[i].cutoff; if (minimiser_args.write_counts) { @@ -252,8 +267,8 @@ void minimiser(std::vector const & sequence_files, sizes_out << '\t' << per_level; } - sizes_out << '\n'; - file_it += minimiser_args.samples[i]; + sizes_out << '\n'; + file_it += minimiser_args.samples[i]; } } } diff --git a/src/needle.cpp b/src/needle.cpp index 2b7872b..9c1960d 100644 --- a/src/needle.cpp +++ b/src/needle.cpp @@ -338,16 +338,16 @@ int run_needle_ibf(sharg::parser & parser) .description = "Use experiment names from the given txt file."}); parser.add_option(expression_by_genome_file, - sharg::config{.short_id = '\0', - .long_id = "levels-by-genome", - .description = "Sequence file containing minimizers, only those minimizers will be " + sharg::config{.short_id = '\0', + .long_id = "levels-by-genome", + .description = "Sequence file containing minimizers, only those minimizers will be " "considered for determining the expression thresholds.", - .validator = sharg::input_file_validator{}}); + .validator = sharg::input_file_validator{}}); parser.add_flag(minimiser_args.ram_friendly, - sharg::config{.short_id = '\0', - .long_id = "ram", - .description = "When multithreading, prioritize lower RAM usage over speed."}); + sharg::config{.short_id = '\0', + .long_id = "ram", + .description = "When multithreading, prioritize lower RAM usage over speed."}); parser.add_flag(fast_layout, sharg::config{.short_id = '\0', // @@ -465,7 +465,7 @@ int run_needle_ibf_min(sharg::parser & parser) .description = "Sequence file containing minimizers, only those minimizers will be " "considered for determining the expression thresholds.", .validator = sharg::input_file_validator{}}); - + initialise_arguments_ibf(parser, ibf_args, num_hash, fpr); parsing(parser, ibf_args); @@ -602,15 +602,17 @@ int run_needle_minimiser(sharg::parser & parser) .long_id = "ram", .description = "When multithreading, prioritize lower RAM usage over speed."}); - parser.add_flag(minimiser_args.write_counts, - sharg::config{.short_id = '\0', - .long_id = "write-counts", - .description = "Write per-user-bin minimiser counts, thresholds.tsv and preprocess.meta."}); + parser.add_flag( + minimiser_args.write_counts, + sharg::config{.short_id = '\0', + .long_id = "write-counts", + .description = "Write per-user-bin minimiser counts, thresholds.tsv and preprocess.meta."}); - parser.add_option(minimiser_args.number_expression_thresholds, - sharg::config{.short_id = 'l', - .long_id = "number_expression_thresholds", - .description = "Number of expression thresholds to produce when --write-counts is set."}); + parser.add_option( + minimiser_args.number_expression_thresholds, + sharg::config{.short_id = 'l', + .long_id = "number_expression_thresholds", + .description = "Number of expression thresholds to produce when --write-counts is set."}); parsing(parser, args); if (sequence_files[0].extension() == ".lst") From 71dce05c348ade11cae7879229e0d54c895f5341 Mon Sep 17 00:00:00 2001 From: koleoptil Date: Sun, 5 Apr 2026 17:37:37 +0200 Subject: [PATCH 6/8] Revert "added per-user bin counts as output to needle minimiser" This reverts commit 3d3643e96e24e4f71d0f9391e057efb58ec94a97. --- include/shared.hpp | 2 - src/minimiser.cpp | 165 +++++---------------------------------------- src/needle.cpp | 36 ++-------- 3 files changed, 22 insertions(+), 181 deletions(-) diff --git a/include/shared.hpp b/include/shared.hpp index 0a43834..c4042ed 100644 --- a/include/shared.hpp +++ b/include/shared.hpp @@ -65,8 +65,6 @@ struct minimiser_file_input_arguments bool paired = false; // If true, than experiments are seen as paired-end experiments bool experiment_names = false; // Flag, if names of experiment should be stored in a txt file bool ram_friendly = false; - bool write_counts = false; - uint8_t number_expression_thresholds = 0; }; /*! \brief Function, loading arguments diff --git a/src/minimiser.cpp b/src/minimiser.cpp index e6875a0..afb85e2 100644 --- a/src/minimiser.cpp +++ b/src/minimiser.cpp @@ -7,35 +7,22 @@ #include #include #include -#include #include "misc/calculate_cutoff.hpp" #include "misc/check_cutoffs_samples.hpp" #include "misc/fill_hash_table.hpp" -#include "misc/get_expression_thresholds.hpp" #include "misc/get_include_set_table.hpp" #include "misc/stream.hpp" -// Add result struct -struct MinResult -{ - uint64_t count{}; - uint8_t cutoff{}; - std::vector expression_thresholds; - std::vector sizes; -}; - // Actuall minimiser calculation template -MinResult calculate_minimiser(std::vector const & sequence_files, - robin_hood::unordered_set const & include_set_table, - robin_hood::unordered_set const & exclude_set_table, - minimiser_arguments const & args, - minimiser_file_input_arguments const & minimiser_args, - unsigned const i, - std::vector & cutoffs, - uint8_t const number_expression_thresholds, - bool const write_counts) +void calculate_minimiser(std::vector const & sequence_files, + robin_hood::unordered_set const & include_set_table, + robin_hood::unordered_set const & exclude_set_table, + minimiser_arguments const & args, + minimiser_file_input_arguments const & minimiser_args, + unsigned const i, + std::vector & cutoffs) { robin_hood::unordered_node_map hash_table{}; // Storage for minimisers uint8_t cutoff{0}; @@ -102,24 +89,6 @@ MinResult calculate_minimiser(std::vector const & sequenc write_stream(outfile, hash.first); write_stream(outfile, hash.second); } - - uint64_t count = hash_table.size(); - - MinResult res; - res.count = count; - res.cutoff = cutoff; - - // If insert counts requested, compute expression thresholds (sizes are computed). - if (write_counts) - get_expression_thresholds(number_expression_thresholds, - hash_table, - res.expression_thresholds, - res.sizes, - robin_hood::unordered_set{}, - cutoff, - true); - - return res; } void minimiser(std::vector const & sequence_files, @@ -140,40 +109,19 @@ void minimiser(std::vector const & sequence_files, size_t const chunk_size = std::clamp(std::bit_ceil(minimiser_args.samples.size() / args.threads), 1u, 64u); - // Prepare container for per-user-bin counts - std::vector insert_counts(minimiser_args.samples.size(), 0); - std::vector> all_thresholds; - std::vector> all_sizes; - if (minimiser_args.write_counts) - { - all_thresholds.resize(minimiser_args.samples.size()); - all_sizes.resize(minimiser_args.samples.size()); - } - // Add minimisers to ibf if (minimiser_args.ram_friendly) { seqan3::contrib::bgzf_thread_count = args.threads; for (size_t i = 0; i < minimiser_args.samples.size(); i++) { - MinResult r = calculate_minimiser(sequence_files, - include_set_table, - exclude_set_table, - args, - minimiser_args, - i, - cutoffs, - minimiser_args.number_expression_thresholds, - minimiser_args.write_counts); - insert_counts[i] = r.count; - if (cutoffs.size() <= i) - cutoffs.resize(minimiser_args.samples.size()); - cutoffs[i] = r.cutoff; - if (minimiser_args.write_counts) - { - all_thresholds[i] = std::move(r.expression_thresholds); - all_sizes[i] = std::move(r.sizes); - } + calculate_minimiser(sequence_files, + include_set_table, + exclude_set_table, + args, + minimiser_args, + i, + cutoffs); } } else @@ -182,93 +130,10 @@ void minimiser(std::vector const & sequence_files, seqan3::contrib::bgzf_thread_count = 1u; omp_set_num_threads(args.threads); - std::vector results(minimiser_args.samples.size()); - #pragma omp parallel for schedule(dynamic, chunk_size) for (size_t i = 0; i < minimiser_args.samples.size(); i++) { - results[i] = calculate_minimiser(sequence_files, - include_set_table, - exclude_set_table, - args, - minimiser_args, - i, - cutoffs, - minimiser_args.number_expression_thresholds, - minimiser_args.write_counts); - } - - for (size_t i = 0; i < minimiser_args.samples.size(); ++i) - { - insert_counts[i] = results[i].count; - if (cutoffs.size() <= i) - cutoffs.resize(minimiser_args.samples.size()); - cutoffs[i] = results[i].cutoff; - if (minimiser_args.write_counts) - { - all_thresholds[i] = std::move(results[i].expression_thresholds); - all_sizes[i] = std::move(results[i].sizes); - } - } - } - - // If --write-counts is set, write thresholds.tsv and per-user-bin count - if (minimiser_args.write_counts) - { - // write thresholds.tsv - std::filesystem::path thr_path = std::filesystem::path{args.path_out} / "thresholds.tsv"; - std::ofstream thr_out{thr_path}; - thr_out << "file"; - for (uint8_t l = 0; l < minimiser_args.number_expression_thresholds; ++l) - thr_out << '\t' << "level_" << static_cast(l); - thr_out << '\n'; - - for (size_t i = 0, file_it = 0; i < minimiser_args.samples.size(); ++i) - { - thr_out << sequence_files[file_it].filename().string(); - for (uint8_t l = 0; l < minimiser_args.number_expression_thresholds; ++l) - { - uint16_t val = 0; - if (l < all_thresholds[i].size()) - val = all_thresholds[i][l]; - thr_out << '\t' << val; - } - thr_out << '\n'; - file_it += minimiser_args.samples[i]; - } - - // Write insert counts - // Compute per level number of inserts by threshold positions from get_expression_thresholds - std::filesystem::path sizes_path = std::filesystem::path{args.path_out} / "thresholds_inserts.tsv"; - std::ofstream sizes_out{sizes_path}; - - for (size_t i = 0, file_it = 0; i < minimiser_args.samples.size(); ++i) - { - sizes_out << sequence_files[file_it].filename().string(); - uint64_t prev_pos = 0; - size_t const L = static_cast(minimiser_args.number_expression_thresholds); - for (size_t l = 0; l < L; ++l) - { - uint64_t pos = 0; - if (l < all_sizes[i].size()) - pos = all_sizes[i][l]; - - uint64_t per_level = 0; - if (l + 1 < L) - { - per_level = (pos > prev_pos) ? (pos - prev_pos) : 0; - prev_pos = pos; - } - else - { - uint64_t total_inserts = (i < insert_counts.size()) ? insert_counts[i] : 0; - per_level = (total_inserts > prev_pos) ? (total_inserts - prev_pos) : 0; - } - - sizes_out << '\t' << per_level; - } - sizes_out << '\n'; - file_it += minimiser_args.samples[i]; + calculate_minimiser(sequence_files, include_set_table, exclude_set_table, args, minimiser_args, i, cutoffs); } } } diff --git a/src/needle.cpp b/src/needle.cpp index 9c1960d..420449d 100644 --- a/src/needle.cpp +++ b/src/needle.cpp @@ -337,29 +337,18 @@ int run_needle_ibf(sharg::parser & parser) .long_id = "experiment-names", .description = "Use experiment names from the given txt file."}); - parser.add_option(expression_by_genome_file, - sharg::config{.short_id = '\0', - .long_id = "levels-by-genome", - .description = "Sequence file containing minimizers, only those minimizers will be " - "considered for determining the expression thresholds.", - .validator = sharg::input_file_validator{}}); - - parser.add_flag(minimiser_args.ram_friendly, - sharg::config{.short_id = '\0', - .long_id = "ram", - .description = "When multithreading, prioritize lower RAM usage over speed."}); + parser.add_option( + expression_by_genome_file, + sharg::config{.short_id = '\0', + .long_id = "levels-by-genome", + .description = "Sequence file containing minimizers to determine expression thresholds.", + .validator = sharg::input_file_validator{}}); parser.add_flag(fast_layout, sharg::config{.short_id = '\0', // .long_id = "fast-layout", .description = "Use fast layout algorithm."}); parsing(parser, ibf_args); - if (sequence_files[0].extension() == ".lst") - { - input_file = sequence_files[0]; - sequence_files = {}; - read_input_file_list(sequence_files, input_file); - } ibf(sequence_files, ibf_args, minimiser_args, fpr, cutoffs, expression_by_genome_file, num_hash, fast_layout); @@ -466,6 +455,7 @@ int run_needle_ibf_min(sharg::parser & parser) "considered for determining the expression thresholds.", .validator = sharg::input_file_validator{}}); + initialise_arguments_ibf(parser, ibf_args, num_hash, fpr); parsing(parser, ibf_args); @@ -602,18 +592,6 @@ int run_needle_minimiser(sharg::parser & parser) .long_id = "ram", .description = "When multithreading, prioritize lower RAM usage over speed."}); - parser.add_flag( - minimiser_args.write_counts, - sharg::config{.short_id = '\0', - .long_id = "write-counts", - .description = "Write per-user-bin minimiser counts, thresholds.tsv and preprocess.meta."}); - - parser.add_option( - minimiser_args.number_expression_thresholds, - sharg::config{.short_id = 'l', - .long_id = "number_expression_thresholds", - .description = "Number of expression thresholds to produce when --write-counts is set."}); - parsing(parser, args); if (sequence_files[0].extension() == ".lst") { From 26f904233636ef1e5c362cfbaef4254fb64ea9a7 Mon Sep 17 00:00:00 2001 From: koleoptil Date: Wed, 15 Apr 2026 15:52:24 +0200 Subject: [PATCH 7/8] minimiser insertions via HIBF input_fn --- include/misc/stream.hpp | 6 + src/ibf.cpp | 308 ++++++++++++++++++++++++++-------------- src/misc/stream.cpp | 32 +++++ 3 files changed, 238 insertions(+), 108 deletions(-) diff --git a/include/misc/stream.hpp b/include/misc/stream.hpp index b98ae8a..d1abbcb 100644 --- a/include/misc/stream.hpp +++ b/include/misc/stream.hpp @@ -6,6 +6,7 @@ #include #include +#include #include #include @@ -42,3 +43,8 @@ void read_binary_start(minimiser_arguments & args, std::filesystem::path const & filename, uint64_t & num_of_minimisers, uint8_t & cutoff); + +/*!\brief Iterate through a minimiser binary file and call callback(hash, count) for each entry. + */ +void iterate_minimiser_file(std::filesystem::path const & filename, + std::function callback); diff --git a/src/ibf.cpp b/src/ibf.cpp index df61cb8..198b0c3 100644 --- a/src/ibf.cpp +++ b/src/ibf.cpp @@ -386,25 +386,115 @@ void ibf_helper(std::vector const & minimiser_files, } } - // Collect minimizer insertions from all threads - std::vector>>> file_insertions(num_files); - -#pragma omp parallel for schedule(dynamic, chunk_size) - for (size_t i = 0; i < num_files; i++) + // --- Streaming approach: precompute per-file thresholds (if samplewise) and use a streaming input_fn --- + if constexpr (minimiser_files_given) { - robin_hood::unordered_node_map hash_table{}; // Storage for minimisers - // Create a smaller cutoff table to save RAM, this cutoff table is only used for constructing the hash table - // and afterwards discarded. - robin_hood::unordered_node_map cutoff_table; - std::vector expression_thresholds; + if constexpr (samplewise) + { + for (size_t i = 0; i < num_files; ++i) + { + // Build temporary hash table for this file + robin_hood::unordered_node_map hash_table{}; + read_binary(minimiser_files[i], hash_table); + + std::vector expression_thresholds; + get_expression_thresholds(ibf_args.number_expression_thresholds, + hash_table, + expression_thresholds, + sizes[i], + genome, + cutoffs[i], + expression_by_genome); + expressions[i] = expression_thresholds; + } + } + // Streaming input function: open minimiser file on demand and emit only hashes for that (file,level). + auto hibf_input = [&](size_t const user_bin_id, seqan::hibf::insert_iterator it) + { + size_t const file_index = user_bin_id % num_files; + size_t const target_level = user_bin_id / num_files; + + auto const & thr = (samplewise ? expressions[file_index] : ibf_args.expression_thresholds); + + bool emitted = false; + iterate_minimiser_file(minimiser_files[file_index], [&](uint64_t h, uint16_t c) { + // Determine level using upper_bound + auto p = std::ranges::upper_bound(thr, c); + if (p == thr.begin()) + return; // below first threshold -> skip + size_t level = static_cast(std::ranges::distance(thr.begin(), p) - 1); + if (level == target_level) + { + it = h; + emitted = true; + } + }); + + if (!emitted) // HIBF requires non-empty bins + it = 0; + }; + + // HIBF config using streaming input_fn + seqan::hibf::config hibf_config{.input_fn = hibf_input, + .number_of_user_bins = num_files * ibf_args.number_expression_thresholds, + .number_of_hash_functions = num_hash, + .maximum_fpr = fprs[0], + .threads = ibf_args.threads, + .track_occupancy = true}; + hibf_config.validate_and_set_defaults(); + + // construct HIBF + auto hibf = [&]() + { + if (fast_layout) + { + seqan::hibf::layout::layout hibf_layout = compute_fast_layout(hibf_config, ibf_args); + return seqan::hibf::hierarchical_interleaved_bloom_filter{hibf_config, hibf_layout}; + } + else + { + return seqan::hibf::hierarchical_interleaved_bloom_filter{hibf_config}; + } + }(); - // Fill hash table with minimisers. - if constexpr (minimiser_files_given) + // Store HIBF + std::filesystem::path const filename = filenames::ibf(ibf_args.path_out, samplewise, 0, ibf_args); + store_ibf(hibf, filename); + + if constexpr (samplewise) { - read_binary(minimiser_files[i], hash_table); + std::ofstream outfile{filenames::levels(ibf_args.path_out)}; + for (unsigned j = 0; j < ibf_args.number_expression_thresholds; j++) + { + for (size_t i = 0; i < num_files; i++) + outfile << expressions[i][j] << " "; + outfile << "\n"; + } + outfile << "/\n"; } - else + + store_fpr_information(hibf, num_files, ibf_args); + + return; + } + + // Collect minimizer insertions from all threads (if !minimiser_files_given) + std::vector> user_bin_minimisers; + + if constexpr (!minimiser_files_given) + { + std::vector>>> file_insertions(num_files); + +#pragma omp parallel for schedule(dynamic, chunk_size) + for (size_t i = 0; i < num_files; i++) { + robin_hood::unordered_node_map hash_table{}; // Storage for minimisers + // Create a smaller cutoff table to save RAM, this cutoff table is only used for constructing the hash table + // and afterwards discarded. + robin_hood::unordered_node_map cutoff_table; + std::vector expression_thresholds; + + // Fill hash table with minimisers size_t const file_iterator = std::reduce(minimiser_args.samples.begin(), minimiser_args.samples.begin() + i); for (size_t f = 0; f < minimiser_args.samples[i]; f++) @@ -430,130 +520,132 @@ void ibf_helper(std::vector const & minimiser_files, cutoffs[i]); } cutoff_table.clear(); - } - // Handle samplewise expression thresholds - if constexpr (samplewise) - { - get_expression_thresholds(ibf_args.number_expression_thresholds, - hash_table, - expression_thresholds, - sizes[i], - genome, - cutoffs[i], - expression_by_genome); - expressions[i] = expression_thresholds; - } + // Handle samplewise expression thresholds + if constexpr (samplewise) + { + get_expression_thresholds(ibf_args.number_expression_thresholds, + hash_table, + expression_thresholds, + sizes[i], + genome, + cutoffs[i], + expression_by_genome); + expressions[i] = expression_thresholds; + } - // Collect insertions for this file - std::vector>> local_insertions; + // Collect insertions for this file + std::vector>> local_insertions; - for (auto && [hash, occurrence] : hash_table) - { - std::vector target_bins; - for (size_t const j : std::views::iota(0u, ibf_args.number_expression_thresholds) | std::views::reverse) + for (auto && [hash, occurrence] : hash_table) { - uint16_t const threshold = [&]() + std::vector target_bins; + for (size_t const j : std::views::iota(0u, ibf_args.number_expression_thresholds) | std::views::reverse) { - if constexpr (samplewise) - return expressions[i][j]; - else - return ibf_args.expression_thresholds[j]; - }(); - - if (occurrence >= threshold) + uint16_t const threshold = [&]() + { + if constexpr (samplewise) + return expressions[i][j]; + else + return ibf_args.expression_thresholds[j]; + }(); + + if (occurrence >= threshold) + { + size_t bin_index = j * num_files + i; + target_bins.push_back(bin_index); + counts_per_level[i][j]++; + break; + } + } + if (!target_bins.empty()) { - size_t bin_index = j * num_files + i; - target_bins.push_back(bin_index); - counts_per_level[i][j]++; - break; + local_insertions.emplace_back(hash, target_bins); } } - if (!target_bins.empty()) - { - local_insertions.emplace_back(hash, target_bins); - } + file_insertions[i] = std::move(local_insertions); } - file_insertions[i] = std::move(local_insertions); - } - - // HIBF construction - std::vector> user_bin_minimisers(num_files * ibf_args.number_expression_thresholds); - // Populate user bins from collected insertions - for (size_t i = 0; i < num_files; ++i) - { - for (auto const & [hash, bins] : file_insertions[i]) + // HIBF construction: populate user_bin_minimisers declared above + user_bin_minimisers.assign(num_files * ibf_args.number_expression_thresholds, {}); + // Populate user bins from collected insertions + for (size_t i = 0; i < num_files; ++i) { - for (size_t bin_idx : bins) + for (auto const & [hash, bins] : file_insertions[i]) { - user_bin_minimisers[bin_idx].push_back(hash); + for (size_t bin_idx : bins) + { + user_bin_minimisers[bin_idx].push_back(hash); + } } } - } - // !Workaround: Bins cannot be empty in HIBF - for (size_t i = 0; i < user_bin_minimisers.size(); ++i) - { - if (user_bin_minimisers[i].empty()) + // !Workaround: Bins cannot be empty in HIBF + for (size_t i = 0; i < user_bin_minimisers.size(); ++i) { - user_bin_minimisers[i].push_back(0); + if (user_bin_minimisers[i].empty()) + { + user_bin_minimisers[i].push_back(0); + } } - } // Maybe Todo: Original needle checks `size == num_files` for each expression level, and throws if true becasue // the thresholds are bad if this happens. // We could probably add a similar check here. // If "sum of all files for an expression level == num_files", and re-enable the test in ibf_test.cpp + } - // HIBF input function - auto hibf_input = [&](size_t const user_bin_id, seqan::hibf::insert_iterator it) - { - for (auto hash : user_bin_minimisers[user_bin_id]) - it = hash; - }; - - // HIBF config - seqan::hibf::config hibf_config{.input_fn = hibf_input, - .number_of_user_bins = user_bin_minimisers.size(), - .number_of_hash_functions = num_hash, - .maximum_fpr = fprs[0], - .threads = ibf_args.threads, - .track_occupancy = true}; - hibf_config.validate_and_set_defaults(); - - // Construct the HIBF — prefer the ctor that consumes a precomputed layout if provided. - auto hibf = [&]() + if constexpr (!minimiser_files_given) { - if (fast_layout) + // HIBF input function + auto hibf_input = [&](size_t const user_bin_id, seqan::hibf::insert_iterator it) { - seqan::hibf::layout::layout hibf_layout = compute_fast_layout(hibf_config, ibf_args); - return seqan::hibf::hierarchical_interleaved_bloom_filter{hibf_config, hibf_layout}; - } - else + for (auto hash : user_bin_minimisers[user_bin_id]) + it = hash; + }; + + // HIBF config + seqan::hibf::config hibf_config{.input_fn = hibf_input, + .number_of_user_bins = user_bin_minimisers.size(), + .number_of_hash_functions = num_hash, + .maximum_fpr = fprs[0], + .threads = ibf_args.threads, + .track_occupancy = true}; + hibf_config.validate_and_set_defaults(); + + // Construct the HIBF — prefer the ctor that consumes a precomputed layout if provided. + auto hibf = [&]() { - return seqan::hibf::hierarchical_interleaved_bloom_filter{hibf_config}; - } - }(); + if (fast_layout) + { + seqan::hibf::layout::layout hibf_layout = compute_fast_layout(hibf_config, ibf_args); + return seqan::hibf::hierarchical_interleaved_bloom_filter{hibf_config, hibf_layout}; + } + else + { + return seqan::hibf::hierarchical_interleaved_bloom_filter{hibf_config}; + } + }(); - // Store the HIBF - std::filesystem::path const filename = filenames::ibf(ibf_args.path_out, samplewise, 0, ibf_args); - store_ibf(hibf, filename); + // Store the HIBF + std::filesystem::path const filename = filenames::ibf(ibf_args.path_out, samplewise, 0, ibf_args); + store_ibf(hibf, filename); - // Store all expression thresholds per level. - if constexpr (samplewise) - { - std::ofstream outfile{filenames::levels(ibf_args.path_out)}; - for (unsigned j = 0; j < ibf_args.number_expression_thresholds; j++) + // Store all expression thresholds per level. + if constexpr (samplewise) { - for (size_t i = 0; i < num_files; i++) - outfile << expressions[i][j] << " "; - outfile << "\n"; + std::ofstream outfile{filenames::levels(ibf_args.path_out)}; + for (unsigned j = 0; j < ibf_args.number_expression_thresholds; j++) + { + for (size_t i = 0; i < num_files; i++) + outfile << expressions[i][j] << " "; + outfile << "\n"; + } + outfile << "/\n"; } - outfile << "/\n"; - } - store_fpr_information(hibf, num_files, ibf_args); + store_fpr_information(hibf, num_files, ibf_args); + } } // Create ibfs @@ -647,4 +739,4 @@ std::vector ibf(std::vector const & minimiser_f store_args(ibf_args, filenames::data(ibf_args.path_out)); return ibf_args.expression_thresholds; -} +} \ No newline at end of file diff --git a/src/misc/stream.cpp b/src/misc/stream.cpp index fbd6d52..3aeeee0 100644 --- a/src/misc/stream.cpp +++ b/src/misc/stream.cpp @@ -3,6 +3,10 @@ // SPDX-License-Identifier: BSD-3-Clause #include "misc/stream.hpp" +#include +#include +#include +#include void read_binary(std::filesystem::path const & filename, robin_hood::unordered_node_map & hash_table) @@ -54,3 +58,31 @@ void read_binary_start(minimiser_arguments & args, else read_stream(fin, args.shape); } + +void iterate_minimiser_file(std::filesystem::path const & filename, + std::function callback) +{ + std::ifstream fin{filename, std::ios::binary}; + if (!fin) + return; + + // Skip the first 22 bytes (num_minimisers, cutoff, k, w_size, s) + fin.ignore(22); + + bool ungapped{}; + if (!read_stream(fin, ungapped)) + return; + if (!ungapped) + { + fin.ignore(8); // args.shape + } + + uint64_t minimiser{}; + uint16_t minimiser_count{}; + while (read_stream(fin, minimiser)) + { + if (!read_stream(fin, minimiser_count)) + break; + callback(minimiser, minimiser_count); + } +} From 2df505e928560b9fa0c7aecee47265867b24accc Mon Sep 17 00:00:00 2001 From: "seqan-actions[bot]" Date: Wed, 15 Apr 2026 16:19:52 +0200 Subject: [PATCH 8/8] chore(lint): formatting --- src/ibf.cpp | 36 +++++++++++++++++++----------------- src/misc/stream.cpp | 10 +++++----- src/needle.cpp | 1 - 3 files changed, 24 insertions(+), 23 deletions(-) diff --git a/src/ibf.cpp b/src/ibf.cpp index 198b0c3..b4bd1d4 100644 --- a/src/ibf.cpp +++ b/src/ibf.cpp @@ -417,18 +417,20 @@ void ibf_helper(std::vector const & minimiser_files, auto const & thr = (samplewise ? expressions[file_index] : ibf_args.expression_thresholds); bool emitted = false; - iterate_minimiser_file(minimiser_files[file_index], [&](uint64_t h, uint16_t c) { - // Determine level using upper_bound - auto p = std::ranges::upper_bound(thr, c); - if (p == thr.begin()) - return; // below first threshold -> skip - size_t level = static_cast(std::ranges::distance(thr.begin(), p) - 1); - if (level == target_level) - { - it = h; - emitted = true; - } - }); + iterate_minimiser_file(minimiser_files[file_index], + [&](uint64_t h, uint16_t c) + { + // Determine level using upper_bound + auto p = std::ranges::upper_bound(thr, c); + if (p == thr.begin()) + return; // below first threshold -> skip + size_t level = static_cast(std::ranges::distance(thr.begin(), p) - 1); + if (level == target_level) + { + it = h; + emitted = true; + } + }); if (!emitted) // HIBF requires non-empty bins it = 0; @@ -589,10 +591,10 @@ void ibf_helper(std::vector const & minimiser_files, } } - // Maybe Todo: Original needle checks `size == num_files` for each expression level, and throws if true becasue - // the thresholds are bad if this happens. - // We could probably add a similar check here. - // If "sum of all files for an expression level == num_files", and re-enable the test in ibf_test.cpp + // Maybe Todo: Original needle checks `size == num_files` for each expression level, and throws if true becasue + // the thresholds are bad if this happens. + // We could probably add a similar check here. + // If "sum of all files for an expression level == num_files", and re-enable the test in ibf_test.cpp } if constexpr (!minimiser_files_given) @@ -739,4 +741,4 @@ std::vector ibf(std::vector const & minimiser_f store_args(ibf_args, filenames::data(ibf_args.path_out)); return ibf_args.expression_thresholds; -} \ No newline at end of file +} diff --git a/src/misc/stream.cpp b/src/misc/stream.cpp index 3aeeee0..0b5bae7 100644 --- a/src/misc/stream.cpp +++ b/src/misc/stream.cpp @@ -3,10 +3,11 @@ // SPDX-License-Identifier: BSD-3-Clause #include "misc/stream.hpp" -#include -#include -#include + #include +#include +#include +#include void read_binary(std::filesystem::path const & filename, robin_hood::unordered_node_map & hash_table) @@ -59,8 +60,7 @@ void read_binary_start(minimiser_arguments & args, read_stream(fin, args.shape); } -void iterate_minimiser_file(std::filesystem::path const & filename, - std::function callback) +void iterate_minimiser_file(std::filesystem::path const & filename, std::function callback) { std::ifstream fin{filename, std::ios::binary}; if (!fin) diff --git a/src/needle.cpp b/src/needle.cpp index 420449d..22c6c11 100644 --- a/src/needle.cpp +++ b/src/needle.cpp @@ -455,7 +455,6 @@ int run_needle_ibf_min(sharg::parser & parser) "considered for determining the expression thresholds.", .validator = sharg::input_file_validator{}}); - initialise_arguments_ibf(parser, ibf_args, num_hash, fpr); parsing(parser, ibf_args);