diff --git a/.clang-format b/.clang-format index 1f92e78..37a6340 100644 --- a/.clang-format +++ b/.clang-format @@ -147,10 +147,14 @@ IncludeCategories: Priority: 7 SortPriority: 0 CaseSensitive: false - - Regex: '.*' + - Regex: ' ibf(std::vector const & sequence_fi std::vector & fpr, std::vector & cutoffs, std::filesystem::path const & expression_by_genome_file = "", - size_t num_hash = 1); + size_t num_hash = 1, + bool const fast_layout = false); /*! \brief Creates IBFs based on the minimiser files * \param minimiser_files A vector of minimiser file paths. @@ -48,4 +49,5 @@ std::vector ibf(std::vector const & minimiser_f estimate_ibf_arguments & ibf_args, std::vector & fpr, std::filesystem::path const & expression_by_genome_file = "", - size_t num_hash = 1); + size_t num_hash = 1, + bool const fast_layout = false); diff --git a/include/misc/stream.hpp b/include/misc/stream.hpp index b98ae8a..d1abbcb 100644 --- a/include/misc/stream.hpp +++ b/include/misc/stream.hpp @@ -6,6 +6,7 @@ #include #include +#include #include #include @@ -42,3 +43,8 @@ void read_binary_start(minimiser_arguments & args, std::filesystem::path const & filename, uint64_t & num_of_minimisers, uint8_t & cutoff); + +/*!\brief Iterate through a minimiser binary file and call callback(hash, count) for each entry. + */ +void iterate_minimiser_file(std::filesystem::path const & filename, + std::function callback); diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 338f33e..eca37a2 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -26,6 +26,7 @@ add_library ("${PROJECT_NAME}_lib" STATIC ${NEEDLE_SOURCE_FILES}) target_link_libraries ("${PROJECT_NAME}_lib" PUBLIC seqan3::seqan3) target_link_libraries ("${PROJECT_NAME}_lib" PUBLIC sharg::sharg) target_link_libraries ("${PROJECT_NAME}_lib" PUBLIC seqan::hibf) +target_link_libraries ("${PROJECT_NAME}_lib" PUBLIC chopper::chopper) target_include_directories ("${PROJECT_NAME}_lib" PUBLIC ../include) add_subdirectory (layout) diff --git a/src/ibf.cpp b/src/ibf.cpp index 87aacef..b4bd1d4 100644 --- a/src/ibf.cpp +++ b/src/ibf.cpp @@ -11,6 +11,12 @@ #include #include #include +#include +#include +#include +#include + +#include #include "misc/calculate_cutoff.hpp" #include "misc/check_cutoffs_samples.hpp" @@ -21,6 +27,17 @@ #include "misc/get_include_set_table.hpp" #include "misc/stream.hpp" +namespace chopper::layout +{ +// Foward declaration for linking. fast_layout is not declared in execute.hpp +void fast_layout(chopper::configuration const & config, + std::vector const & positions, + std::vector const & cardinalities, + std::vector const & sketches, + std::vector const & minHash_sketches, + seqan::hibf::layout::layout & hibf_layout); +} // namespace chopper::layout + // Check number of expression levels, sort expression levels void check_expression(std::vector & expression_thresholds, uint8_t & number_expression_thresholds, @@ -193,6 +210,47 @@ void store_fpr_information(seqan::hibf::hierarchical_interleaved_bloom_filter co outfile << "/\n"; } +// Full Chopper config with defaults: +// .partitioning_approach = chopper::partitioning_scheme::lsh_sim, +// .data_file = {}, +// .debug = false, +// .output_filename = "layout.txt", +// .output_timings = {}, +// .k = ibf_args.k, +// .window_size = static_cast(ibf_args.w_size.get()), +// .precomputed_files = false, +// .sketch_directory = {}, +// .disable_sketch_output = false, +// .determine_best_tmax = false, +// .force_all_binnings = false, +// .output_verbose_statistics = false, +// .hibf_config = hibf_config + +seqan::hibf::layout::layout compute_fast_layout(seqan::hibf::config const & hibf_config, + estimate_ibf_arguments const & ibf_args) +{ + seqan::hibf::layout::layout hibf_layout{}; + + chopper::configuration chopper_config{.data_file = {}, // Not initialised in cfg, so it must be here. + .k = ibf_args.k, + .window_size = static_cast(ibf_args.w_size.get()), + .hibf_config = hibf_config}; + + std::vector sketches{}; + std::vector minHash_sketches{}; + std::vector cardinalities; + + seqan::hibf::sketch::compute_sketches(hibf_config, sketches, minHash_sketches); + seqan::hibf::sketch::estimate_kmer_counts(sketches, cardinalities); + chopper::layout::fast_layout(chopper_config, + seqan::hibf::iota_vector(sketches.size()), + cardinalities, + sketches, + minHash_sketches, + hibf_layout); + return hibf_layout; +} + // Actual ibf construction template void ibf_helper(std::vector const & minimiser_files, @@ -201,7 +259,8 @@ void ibf_helper(std::vector const & minimiser_files, std::vector & cutoffs, size_t num_hash = 1, std::filesystem::path const & expression_by_genome_file = "", - minimiser_file_input_arguments const & minimiser_args = {}) + minimiser_file_input_arguments const & minimiser_args = {}, + bool const fast_layout = false) { size_t const num_files = [&]() constexpr { @@ -327,25 +386,117 @@ void ibf_helper(std::vector const & minimiser_files, } } - // Collect minimizer insertions from all threads - std::vector>>> file_insertions(num_files); - -#pragma omp parallel for schedule(dynamic, chunk_size) - for (size_t i = 0; i < num_files; i++) + // --- Streaming approach: precompute per-file thresholds (if samplewise) and use a streaming input_fn --- + if constexpr (minimiser_files_given) { - robin_hood::unordered_node_map hash_table{}; // Storage for minimisers - // Create a smaller cutoff table to save RAM, this cutoff table is only used for constructing the hash table - // and afterwards discarded. - robin_hood::unordered_node_map cutoff_table; - std::vector expression_thresholds; + if constexpr (samplewise) + { + for (size_t i = 0; i < num_files; ++i) + { + // Build temporary hash table for this file + robin_hood::unordered_node_map hash_table{}; + read_binary(minimiser_files[i], hash_table); + + std::vector expression_thresholds; + get_expression_thresholds(ibf_args.number_expression_thresholds, + hash_table, + expression_thresholds, + sizes[i], + genome, + cutoffs[i], + expression_by_genome); + expressions[i] = expression_thresholds; + } + } + // Streaming input function: open minimiser file on demand and emit only hashes for that (file,level). + auto hibf_input = [&](size_t const user_bin_id, seqan::hibf::insert_iterator it) + { + size_t const file_index = user_bin_id % num_files; + size_t const target_level = user_bin_id / num_files; + + auto const & thr = (samplewise ? expressions[file_index] : ibf_args.expression_thresholds); + + bool emitted = false; + iterate_minimiser_file(minimiser_files[file_index], + [&](uint64_t h, uint16_t c) + { + // Determine level using upper_bound + auto p = std::ranges::upper_bound(thr, c); + if (p == thr.begin()) + return; // below first threshold -> skip + size_t level = static_cast(std::ranges::distance(thr.begin(), p) - 1); + if (level == target_level) + { + it = h; + emitted = true; + } + }); + + if (!emitted) // HIBF requires non-empty bins + it = 0; + }; + + // HIBF config using streaming input_fn + seqan::hibf::config hibf_config{.input_fn = hibf_input, + .number_of_user_bins = num_files * ibf_args.number_expression_thresholds, + .number_of_hash_functions = num_hash, + .maximum_fpr = fprs[0], + .threads = ibf_args.threads, + .track_occupancy = true}; + hibf_config.validate_and_set_defaults(); + + // construct HIBF + auto hibf = [&]() + { + if (fast_layout) + { + seqan::hibf::layout::layout hibf_layout = compute_fast_layout(hibf_config, ibf_args); + return seqan::hibf::hierarchical_interleaved_bloom_filter{hibf_config, hibf_layout}; + } + else + { + return seqan::hibf::hierarchical_interleaved_bloom_filter{hibf_config}; + } + }(); - // Fill hash table with minimisers. - if constexpr (minimiser_files_given) + // Store HIBF + std::filesystem::path const filename = filenames::ibf(ibf_args.path_out, samplewise, 0, ibf_args); + store_ibf(hibf, filename); + + if constexpr (samplewise) { - read_binary(minimiser_files[i], hash_table); + std::ofstream outfile{filenames::levels(ibf_args.path_out)}; + for (unsigned j = 0; j < ibf_args.number_expression_thresholds; j++) + { + for (size_t i = 0; i < num_files; i++) + outfile << expressions[i][j] << " "; + outfile << "\n"; + } + outfile << "/\n"; } - else + + store_fpr_information(hibf, num_files, ibf_args); + + return; + } + + // Collect minimizer insertions from all threads (if !minimiser_files_given) + std::vector> user_bin_minimisers; + + if constexpr (!minimiser_files_given) + { + std::vector>>> file_insertions(num_files); + +#pragma omp parallel for schedule(dynamic, chunk_size) + for (size_t i = 0; i < num_files; i++) { + robin_hood::unordered_node_map hash_table{}; // Storage for minimisers + // Create a smaller cutoff table to save RAM, this cutoff table is only used for constructing the hash table + // and afterwards discarded. + robin_hood::unordered_node_map cutoff_table; + std::vector expression_thresholds; + + // Fill hash table with minimisers size_t const file_iterator = std::reduce(minimiser_args.samples.begin(), minimiser_args.samples.begin() + i); for (size_t f = 0; f < minimiser_args.samples[i]; f++) @@ -371,118 +522,132 @@ void ibf_helper(std::vector const & minimiser_files, cutoffs[i]); } cutoff_table.clear(); - } - // Handle samplewise expression thresholds - if constexpr (samplewise) - { - get_expression_thresholds(ibf_args.number_expression_thresholds, - hash_table, - expression_thresholds, - sizes[i], - genome, - cutoffs[i], - expression_by_genome); - expressions[i] = expression_thresholds; - } + // Handle samplewise expression thresholds + if constexpr (samplewise) + { + get_expression_thresholds(ibf_args.number_expression_thresholds, + hash_table, + expression_thresholds, + sizes[i], + genome, + cutoffs[i], + expression_by_genome); + expressions[i] = expression_thresholds; + } - // Collect insertions for this file - std::vector>> local_insertions; + // Collect insertions for this file + std::vector>> local_insertions; - for (auto && [hash, occurrence] : hash_table) - { - std::vector target_bins; - for (size_t const j : std::views::iota(0u, ibf_args.number_expression_thresholds) | std::views::reverse) + for (auto && [hash, occurrence] : hash_table) { - uint16_t const threshold = [&]() + std::vector target_bins; + for (size_t const j : std::views::iota(0u, ibf_args.number_expression_thresholds) | std::views::reverse) { - if constexpr (samplewise) - return expressions[i][j]; - else - return ibf_args.expression_thresholds[j]; - }(); - - if (occurrence >= threshold) + uint16_t const threshold = [&]() + { + if constexpr (samplewise) + return expressions[i][j]; + else + return ibf_args.expression_thresholds[j]; + }(); + + if (occurrence >= threshold) + { + size_t bin_index = j * num_files + i; + target_bins.push_back(bin_index); + counts_per_level[i][j]++; + break; + } + } + if (!target_bins.empty()) { - size_t bin_index = j * num_files + i; - target_bins.push_back(bin_index); - counts_per_level[i][j]++; - break; + local_insertions.emplace_back(hash, target_bins); } } - if (!target_bins.empty()) - { - local_insertions.emplace_back(hash, target_bins); - } + file_insertions[i] = std::move(local_insertions); } - file_insertions[i] = std::move(local_insertions); - } - // HIBF construction - std::vector> user_bin_minimisers(num_files * ibf_args.number_expression_thresholds); - - // Populate user bins from collected insertions - for (size_t i = 0; i < num_files; ++i) - { - for (auto const & [hash, bins] : file_insertions[i]) + // HIBF construction: populate user_bin_minimisers declared above + user_bin_minimisers.assign(num_files * ibf_args.number_expression_thresholds, {}); + // Populate user bins from collected insertions + for (size_t i = 0; i < num_files; ++i) { - for (size_t bin_idx : bins) + for (auto const & [hash, bins] : file_insertions[i]) { - user_bin_minimisers[bin_idx].push_back(hash); + for (size_t bin_idx : bins) + { + user_bin_minimisers[bin_idx].push_back(hash); + } } } - } - // !Workaround: Bins cannot be empty in HIBF - for (size_t i = 0; i < user_bin_minimisers.size(); ++i) - { - if (user_bin_minimisers[i].empty()) + // !Workaround: Bins cannot be empty in HIBF + for (size_t i = 0; i < user_bin_minimisers.size(); ++i) { - user_bin_minimisers[i].push_back(0); + if (user_bin_minimisers[i].empty()) + { + user_bin_minimisers[i].push_back(0); + } } - } - // Maybe Todo: Original needle checks `size == num_files` for each expression level, and throws if true becasue - // the thresholds are bad if this happens. - // We could probably add a similar check here. - // If "sum of all files for an expression level == num_files", and re-enable the test in ibf_test.cpp + // Maybe Todo: Original needle checks `size == num_files` for each expression level, and throws if true becasue + // the thresholds are bad if this happens. + // We could probably add a similar check here. + // If "sum of all files for an expression level == num_files", and re-enable the test in ibf_test.cpp + } - // HIBF input function - auto hibf_input = [&](size_t const user_bin_id, seqan::hibf::insert_iterator it) + if constexpr (!minimiser_files_given) { - for (auto hash : user_bin_minimisers[user_bin_id]) - it = hash; - }; - - // HIBF config - seqan::hibf::config hibf_config{.input_fn = hibf_input, - .number_of_user_bins = user_bin_minimisers.size(), - .number_of_hash_functions = num_hash, - .maximum_fpr = fprs[0], - .threads = ibf_args.threads, - .track_occupancy = true}; - - // Construct the HIBF - seqan::hibf::hierarchical_interleaved_bloom_filter hibf{hibf_config}; + // HIBF input function + auto hibf_input = [&](size_t const user_bin_id, seqan::hibf::insert_iterator it) + { + for (auto hash : user_bin_minimisers[user_bin_id]) + it = hash; + }; + + // HIBF config + seqan::hibf::config hibf_config{.input_fn = hibf_input, + .number_of_user_bins = user_bin_minimisers.size(), + .number_of_hash_functions = num_hash, + .maximum_fpr = fprs[0], + .threads = ibf_args.threads, + .track_occupancy = true}; + hibf_config.validate_and_set_defaults(); + + // Construct the HIBF — prefer the ctor that consumes a precomputed layout if provided. + auto hibf = [&]() + { + if (fast_layout) + { + seqan::hibf::layout::layout hibf_layout = compute_fast_layout(hibf_config, ibf_args); + return seqan::hibf::hierarchical_interleaved_bloom_filter{hibf_config, hibf_layout}; + } + else + { + return seqan::hibf::hierarchical_interleaved_bloom_filter{hibf_config}; + } + }(); - // Store the HIBF - std::filesystem::path const filename = filenames::ibf(ibf_args.path_out, samplewise, 0, ibf_args); - store_ibf(hibf, filename); + // Store the HIBF + std::filesystem::path const filename = filenames::ibf(ibf_args.path_out, samplewise, 0, ibf_args); + store_ibf(hibf, filename); - // Store all expression thresholds per level. - if constexpr (samplewise) - { - std::ofstream outfile{filenames::levels(ibf_args.path_out)}; - for (unsigned j = 0; j < ibf_args.number_expression_thresholds; j++) + // Store all expression thresholds per level. + if constexpr (samplewise) { - for (size_t i = 0; i < num_files; i++) - outfile << expressions[i][j] << " "; - outfile << "\n"; + std::ofstream outfile{filenames::levels(ibf_args.path_out)}; + for (unsigned j = 0; j < ibf_args.number_expression_thresholds; j++) + { + for (size_t i = 0; i < num_files; i++) + outfile << expressions[i][j] << " "; + outfile << "\n"; + } + outfile << "/\n"; } - outfile << "/\n"; - } - store_fpr_information(hibf, num_files, ibf_args); + store_fpr_information(hibf, num_files, ibf_args); + } } // Create ibfs @@ -492,7 +657,8 @@ std::vector ibf(std::vector const & sequence_fi std::vector & fpr, std::vector & cutoffs, std::filesystem::path const & expression_by_genome_file, - size_t num_hash) + size_t num_hash, + bool const fast_layout) { // Declarations robin_hood::unordered_node_map hash_table{}; // Storage for minimisers @@ -522,7 +688,8 @@ std::vector ibf(std::vector const & sequence_fi cutoffs, num_hash, expression_by_genome_file, - minimiser_args); + minimiser_args, + fast_layout); else ibf_helper(sequence_files, fpr, @@ -530,7 +697,8 @@ std::vector ibf(std::vector const & sequence_fi cutoffs, num_hash, expression_by_genome_file, - minimiser_args); + minimiser_args, + fast_layout); store_args(ibf_args, filenames::data(ibf_args.path_out)); @@ -542,7 +710,8 @@ std::vector ibf(std::vector const & minimiser_f estimate_ibf_arguments & ibf_args, std::vector & fpr, std::filesystem::path const & expression_by_genome_file, - size_t num_hash) + size_t num_hash, + bool const fast_layout) { check_expression(ibf_args.expression_thresholds, ibf_args.number_expression_thresholds, expression_by_genome_file); check_fpr(ibf_args.number_expression_thresholds, fpr); @@ -551,9 +720,23 @@ std::vector ibf(std::vector const & minimiser_f std::vector cutoffs{}; if (ibf_args.samplewise) - ibf_helper(minimiser_files, fpr, ibf_args, cutoffs, num_hash, expression_by_genome_file); + ibf_helper(minimiser_files, + fpr, + ibf_args, + cutoffs, + num_hash, + expression_by_genome_file, + minimiser_file_input_arguments{}, + fast_layout); else - ibf_helper(minimiser_files, fpr, ibf_args, cutoffs, num_hash, expression_by_genome_file); + ibf_helper(minimiser_files, + fpr, + ibf_args, + cutoffs, + num_hash, + expression_by_genome_file, + minimiser_file_input_arguments{}, + fast_layout); store_args(ibf_args, filenames::data(ibf_args.path_out)); diff --git a/src/minimiser.cpp b/src/minimiser.cpp index 4c09971..afb85e2 100644 --- a/src/minimiser.cpp +++ b/src/minimiser.cpp @@ -4,6 +4,7 @@ #include "minimiser.hpp" +#include #include #include diff --git a/src/misc/stream.cpp b/src/misc/stream.cpp index fbd6d52..0b5bae7 100644 --- a/src/misc/stream.cpp +++ b/src/misc/stream.cpp @@ -4,6 +4,11 @@ #include "misc/stream.hpp" +#include +#include +#include +#include + void read_binary(std::filesystem::path const & filename, robin_hood::unordered_node_map & hash_table) { @@ -54,3 +59,30 @@ void read_binary_start(minimiser_arguments & args, else read_stream(fin, args.shape); } + +void iterate_minimiser_file(std::filesystem::path const & filename, std::function callback) +{ + std::ifstream fin{filename, std::ios::binary}; + if (!fin) + return; + + // Skip the first 22 bytes (num_minimisers, cutoff, k, w_size, s) + fin.ignore(22); + + bool ungapped{}; + if (!read_stream(fin, ungapped)) + return; + if (!ungapped) + { + fin.ignore(8); // args.shape + } + + uint64_t minimiser{}; + uint16_t minimiser_count{}; + while (read_stream(fin, minimiser)) + { + if (!read_stream(fin, minimiser_count)) + break; + callback(minimiser, minimiser_count); + } +} diff --git a/src/needle.cpp b/src/needle.cpp index fa0222f..22c6c11 100644 --- a/src/needle.cpp +++ b/src/needle.cpp @@ -319,6 +319,7 @@ int run_needle_ibf(sharg::parser & parser) std::filesystem::path expression_by_genome_file = ""; std::vector fpr{}; // The fpr of one IBF, can be different for different expression levels std::vector cutoffs{}; + bool fast_layout{}; initialise_minimiser_arguments(parser, ibf_args); initialise_arguments_ibf(parser, ibf_args, num_hash, fpr); @@ -336,27 +337,20 @@ int run_needle_ibf(sharg::parser & parser) .long_id = "experiment-names", .description = "Use experiment names from the given txt file."}); - parser.add_option(expression_by_genome_file, - sharg::config{.short_id = '\0', - .long_id = "levels-by-genome", - .description = "Sequence file containing minimizers, only those minimizers will be " - "considered for determining the expression thresholds.", - .validator = sharg::input_file_validator{}}); - - parser.add_flag(minimiser_args.ram_friendly, - sharg::config{.short_id = '\0', - .long_id = "ram", - .description = "When multithreading, prioritize lower RAM usage over speed."}); - + parser.add_option( + expression_by_genome_file, + sharg::config{.short_id = '\0', + .long_id = "levels-by-genome", + .description = "Sequence file containing minimizers to determine expression thresholds.", + .validator = sharg::input_file_validator{}}); + + parser.add_flag(fast_layout, + sharg::config{.short_id = '\0', // + .long_id = "fast-layout", + .description = "Use fast layout algorithm."}); parsing(parser, ibf_args); - if (sequence_files[0].extension() == ".lst") - { - input_file = sequence_files[0]; - sequence_files = {}; - read_input_file_list(sequence_files, input_file); - } - ibf(sequence_files, ibf_args, minimiser_args, fpr, cutoffs, expression_by_genome_file, num_hash); + ibf(sequence_files, ibf_args, minimiser_args, fpr, cutoffs, expression_by_genome_file, num_hash, fast_layout); return 0; }