diff --git a/pipelines/convergenceratio/convergenceratio.nf b/pipelines/convergenceratio/convergenceratio.nf index 0cd55fd7..bcedcc5c 100644 --- a/pipelines/convergenceratio/convergenceratio.nf +++ b/pipelines/convergenceratio/convergenceratio.nf @@ -1,5 +1,5 @@ -include { all_by_all_blast; blastreduce; blastreduce_transcode_fasta; condense_redundant; create_blast_db; restore_condensed; split_fasta } from "../shared/nextflow/blast.nf" +include { all_by_all_blast; blastreduce; blastreduce_transcode_fasta; condense_redundant; create_blast_db; restore_condensed; sort_and_split_fasta } from "../shared/nextflow/blast.nf" include { unzip_ssn } from "../shared/nextflow/util.nf" include { compute_clusters; get_conv_ratio_table; get_id_list; get_ssn_id_info } from "../shared/nextflow/color_workflow.nf" @@ -131,7 +131,7 @@ workflow { fasta_lengths_parquet = blastreduce_transcode_fasta(cluster_fasta) - fasta_shards = split_fasta(reduced_fasta.fasta_file) + fasta_shards = sort_and_split_fasta(reduced_fasta.fasta_file) blast_input = blast_databases.combine(fasta_shards.transpose(), by: 0) diff --git a/pipelines/est/subworkflows/all_by_all.nf b/pipelines/est/subworkflows/all_by_all.nf index 78b8e4a3..3c5812cb 100644 --- a/pipelines/est/subworkflows/all_by_all.nf +++ b/pipelines/est/subworkflows/all_by_all.nf @@ -1,11 +1,14 @@ -include { all_by_all_blast; blastreduce; blastreduce_transcode_fasta; condense_redundant; create_blast_db; restore_condensed; split_fasta } from "../../shared/nextflow/blast.nf" +include { all_by_all_blast; blastreduce; blastreduce_transcode_fasta; condense_redundant; create_blast_db; restore_condensed; sort_and_split_fasta } from "../../shared/nextflow/blast.nf" workflow ALL_BY_ALL { take: original_fasta main: + // For stats computation later; using the original fasta file + fasta_lengths_parquet = blastreduce_transcode_fasta(original_fasta) + // Cluster redundant sequences for BLAST computation (formerly known as multiplex). // Only performed when input sequences are from Uniprot, since sequence sets from // UniRef90 and UniRef50 are already sequence-unique. @@ -21,11 +24,8 @@ workflow ALL_BY_ALL { // Create BLAST database blastdb = create_blast_db(blast_input_fasta) - // For stats computation later - fasta_lengths_parquet = blastreduce_transcode_fasta(original_fasta) - // All-by-all BLAST - fasta_shards = split_fasta(blast_input_fasta) + fasta_shards = sort_and_split_fasta(blast_input_fasta) blast_input = blastdb.combine(fasta_shards.transpose(), by: 0) blast_fractions = all_by_all_blast( blast_input ).groupTuple() diff --git a/pipelines/shared/nextflow/blast.nf b/pipelines/shared/nextflow/blast.nf index 2162bbbf..305baf8e 100644 --- a/pipelines/shared/nextflow/blast.nf +++ b/pipelines/shared/nextflow/blast.nf @@ -182,7 +182,7 @@ process restore_condensed { } -process split_fasta { +process sort_and_split_fasta { input: tuple val(fid), path(fasta_file) @@ -192,10 +192,16 @@ process split_fasta { script: """ mkdir parts - seqkit split2 \ + seqkit sort --by-length \ + --reverse \ + --two-pass \ + --threads ${task.cpus} \ ${fasta_file} \ + | seqkit split2 \ + --threads ${task.cpus} \ -p ${params.num_fasta_shards} \ - --out-dir parts + --out-dir parts \ + --out-prefix "all_sequences.fasta_" """ }