From d83782833c91f29444f781a181681e12183aa396 Mon Sep 17 00:00:00 2001 From: Russell Davidson Date: Mon, 1 Jun 2026 15:53:59 -0500 Subject: [PATCH 1/4] Sort sequences largest to smallest before splitting the set into shards --- pipelines/shared/nextflow/blast.nf | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/pipelines/shared/nextflow/blast.nf b/pipelines/shared/nextflow/blast.nf index 2162bbbf..57bbe0d0 100644 --- a/pipelines/shared/nextflow/blast.nf +++ b/pipelines/shared/nextflow/blast.nf @@ -182,7 +182,7 @@ process restore_condensed { } -process split_fasta { +process sort_and_split_fasta { input: tuple val(fid), path(fasta_file) @@ -192,10 +192,11 @@ process split_fasta { script: """ mkdir parts - seqkit split2 \ - ${fasta_file} \ + seqkit sort -l --reverse -2 ${fasta_file} \ + | seqkit split2 \ -p ${params.num_fasta_shards} \ - --out-dir parts + --out-dir parts \ + --out-prefix "all_sequences.fasta_" """ } From 7510ac6888bf2f2bf2cf74dabbefd912da2bfeb9 Mon Sep 17 00:00:00 2001 From: Russell Davidson Date: Mon, 1 Jun 2026 15:55:53 -0500 Subject: [PATCH 2/4] Update process block name where it's used --- pipelines/convergenceratio/convergenceratio.nf | 4 ++-- pipelines/est/subworkflows/all_by_all.nf | 10 +++++----- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/pipelines/convergenceratio/convergenceratio.nf b/pipelines/convergenceratio/convergenceratio.nf index 0cd55fd7..bcedcc5c 100644 --- a/pipelines/convergenceratio/convergenceratio.nf +++ b/pipelines/convergenceratio/convergenceratio.nf @@ -1,5 +1,5 @@ -include { all_by_all_blast; blastreduce; blastreduce_transcode_fasta; condense_redundant; create_blast_db; restore_condensed; split_fasta } from "../shared/nextflow/blast.nf" +include { all_by_all_blast; blastreduce; blastreduce_transcode_fasta; condense_redundant; create_blast_db; restore_condensed; sort_and_split_fasta } from "../shared/nextflow/blast.nf" include { unzip_ssn } from "../shared/nextflow/util.nf" include
{ compute_clusters; get_conv_ratio_table; get_id_list; get_ssn_id_info } from "../shared/nextflow/color_workflow.nf" @@ -131,7 +131,7 @@ workflow { fasta_lengths_parquet = blastreduce_transcode_fasta(cluster_fasta) - fasta_shards = split_fasta(reduced_fasta.fasta_file) + fasta_shards = sort_and_split_fasta(reduced_fasta.fasta_file) blast_input = blast_databases.combine(fasta_shards.transpose(), by: 0) diff --git a/pipelines/est/subworkflows/all_by_all.nf b/pipelines/est/subworkflows/all_by_all.nf index 78b8e4a3..3c5812cb 100644 --- a/pipelines/est/subworkflows/all_by_all.nf +++ b/pipelines/est/subworkflows/all_by_all.nf @@ -1,11 +1,14 @@ -include { all_by_all_blast; blastreduce; blastreduce_transcode_fasta; condense_redundant; create_blast_db; restore_condensed; split_fasta } from "../../shared/nextflow/blast.nf" +include { all_by_all_blast; blastreduce; blastreduce_transcode_fasta; condense_redundant; create_blast_db; restore_condensed; sort_and_split_fasta } from "../../shared/nextflow/blast.nf" workflow ALL_BY_ALL { take: original_fasta main: + // For stats computation later; using the original fasta file + fasta_lengths_parquet = blastreduce_transcode_fasta(original_fasta) + // Cluster redundant sequences for BLAST computation (formerly known as multiplex). // Only performed when input sequences are from Uniprot, since sequence sets from // UniRef90 and UniRef50 are already sequence-unique. 
@@ -21,11 +24,8 @@ workflow ALL_BY_ALL { // Create BLAST database blastdb = create_blast_db(blast_input_fasta) - // For stats computation later - fasta_lengths_parquet = blastreduce_transcode_fasta(original_fasta) - // All-by-all BLAST - fasta_shards = split_fasta(blast_input_fasta) + fasta_shards = sort_and_split_fasta(blast_input_fasta) blast_input = blastdb.combine(fasta_shards.transpose(), by: 0) blast_fractions = all_by_all_blast( blast_input ).groupTuple() From b15c49b26ae28097d6a73e5d4003a30bfe95807c Mon Sep 17 00:00:00 2001 From: Russell Davidson Date: Tue, 2 Jun 2026 09:37:44 -0500 Subject: [PATCH 3/4] Add memory efficient file reading call and explicitly set the number of processors to be used --- pipelines/shared/nextflow/blast.nf | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/pipelines/shared/nextflow/blast.nf b/pipelines/shared/nextflow/blast.nf index 57bbe0d0..55768243 100644 --- a/pipelines/shared/nextflow/blast.nf +++ b/pipelines/shared/nextflow/blast.nf @@ -192,8 +192,14 @@ process sort_and_split_fasta { script: """ mkdir parts - seqkit sort -l --reverse -2 ${fasta_file} \ + seqkit sort --by-length \ + --reverse \ + --two-pass \ + --threads ${task.cpus} \ + ${fasta_file} \ | seqkit split2 \ + --two-pass \ + --threads ${task.cpus} \ -p ${params.num_fasta_shards} \ --out-dir parts \ --out-prefix "all_sequences.fasta_" From 11e11544adb66a825a647a2556a84f9ee189a97b Mon Sep 17 00:00:00 2001 From: Russell Davidson Date: Tue, 2 Jun 2026 10:21:53 -0500 Subject: [PATCH 4/4] Remove incorrect arg --- pipelines/shared/nextflow/blast.nf | 1 - 1 file changed, 1 deletion(-) diff --git a/pipelines/shared/nextflow/blast.nf b/pipelines/shared/nextflow/blast.nf index 55768243..305baf8e 100644 --- a/pipelines/shared/nextflow/blast.nf +++ b/pipelines/shared/nextflow/blast.nf @@ -198,7 +198,6 @@ process sort_and_split_fasta { --threads ${task.cpus} \ ${fasta_file} \ | seqkit split2 \ - --two-pass \ --threads ${task.cpus} \ -p 
${params.num_fasta_shards} \ --out-dir parts \