From a568629f6a0c8235b7f908ca90cd3529e04f29b1 Mon Sep 17 00:00:00 2001 From: Alex Raiyemo Date: Thu, 21 May 2026 01:41:20 -0400 Subject: [PATCH 01/14] Add coalescent species tree pipeline as step 4astral Introduces an optional fourth step that builds a coalescent-based species tree from the per-OG alignments produced by step 3combine, complementing the existing supermatrix approach. This addresses the limitation that concatenation ignores differing gene-tree histories across orthogroups. New step: read2tree --step 4astral - Drops sequences whose gap fraction (-, X, N) exceeds --max_gap, then keeps only OGs that retain at least --min_samples sequences, converting phylip-relaxed .fa files to clean FASTA - Optionally trims filtered alignments with ClipKIT (--trim flag) - Runs IQ-TREE per gene in parallel via multiprocessing.Pool using the LG+F+G model with SH-aLRT branch support (-alrt 1000) in fast mode - Collects gene trees and runs ASTER (astral3) to estimate the coalescent species tree, writing astral_tree_merge.nwk to the output directory New wrappers: Clipkit (wrappers/aligners/clipkit.py) and Aster (wrappers/treebuilders/aster.py) following existing wrapper conventions. New helper get_gene_tree_options() added to iqtree.py. New dependencies: aster and clipkit added to environment.yml. README updated with step 4 usage, output files, and installation notes. 
--- README.md | 34 ++- environment.yml | 2 + read2tree/CoalescentInference.py | 226 ++++++++++++++++++++ read2tree/main.py | 33 ++- read2tree/wrappers/aligners/__init__.py | 1 + read2tree/wrappers/aligners/clipkit.py | 71 ++++++ read2tree/wrappers/treebuilders/__init__.py | 1 + read2tree/wrappers/treebuilders/aster.py | 79 +++++++ read2tree/wrappers/treebuilders/iqtree.py | 12 ++ 9 files changed, 456 insertions(+), 3 deletions(-) create mode 100644 read2tree/CoalescentInference.py create mode 100644 read2tree/wrappers/aligners/clipkit.py create mode 100644 read2tree/wrappers/treebuilders/aster.py diff --git a/README.md b/README.md index 13282ea4..50a566a2 100644 --- a/README.md +++ b/README.md @@ -51,6 +51,11 @@ For this version, the `--read_type` argument accepts any minimap2 options string conda install -c bioconda mafft iqtree minimap2 samtools ``` +For the coalescent species tree (step 4), [ASTER](https://github.com/chaoszhang/ASTER) (the C++ implementation of ASTRAL-III) is required. [ClipKIT](https://github.com/JLSteenwyk/ClipKIT) is optional and used only when `--trim` is passed. +``` +conda install -c bioconda aster clipkit +``` + Then, you can install the read2tree package after downlaoding the package from this GitHub repo using ``` @@ -79,7 +84,9 @@ cat marker_genes/*.fna > dna_ref.fa ### output -The output of Read2Tree is the concatenated alignments as a fasta file where each record corresponds to one species. We also provide the option `--tree` for inferring the species tree using IQTREE as default. +The output of Read2Tree is the concatenated alignments as a fasta file where each record corresponds to one species. We also provide the option `--tree` for inferring the species tree using IQTREE as default (concatenation/supermatrix approach). + +For a coalescent-based species tree that accounts for incomplete lineage sorting and differing gene tree histories, run the optional **step 4** after step 3 (see below). 
### Single species mode @@ -121,6 +128,31 @@ Tunable filters (only active with `--meta`): **Note on false positives:** metagenomic mode is permissive by design and may include species that share marker reads only by chance. We recommend tuning `--meta_min_markers` and `--meta_marker_fraction` to your dataset (e.g. `50` and `0.5` as a starting point for typical microbial communities) to reduce false positives. +#### step4 (optional: coalescent species tree) + +Step 3 builds a supermatrix tree by concatenating all OG alignments. If you want a **coalescent-based species tree** instead — which better handles incomplete lineage sorting and the different evolutionary histories of individual genes — run step 4 after step 3: + +``` +read2tree --step 4astral --standalone_path marker_genes --dna_reference dna_ref.fa --output_path output --threads 24 +``` + +Step 4 does the following automatically: +1. Filters the per-OG alignments from step 3 by taxon occupancy (`--min_samples`, default 10) and gap fraction (`--max_gap`, default 0.80). +2. Runs IQ-TREE on each passing alignment in parallel (`-m LG+F+G`, `-alrt 1000`, `-fast`) to infer individual gene trees. +3. Collects all gene trees and passes them to [ASTER](https://github.com/chaoszhang/ASTER) (`astral3`) to produce the final coalescent species tree. 
+ +Optionally, pass `--trim` to run [ClipKIT](https://github.com/JLSteenwyk/ClipKIT) column-trimming on each alignment before gene tree inference: +``` +read2tree --step 4astral --standalone_path marker_genes --dna_reference dna_ref.fa --output_path output --threads 24 --trim +``` + +Key output files written to `output/`: +- `07_astral_filtered_aa/` — per-OG FASTA alignments that passed filtering +- `07_astral_trimmed_aa/` — ClipKIT-trimmed alignments (only when `--trim` is used) +- `08_gene_trees/` — individual IQ-TREE gene tree files +- `gene_trees_merge.nwk` — all gene trees concatenated into one file (input to ASTER) +- `astral_tree_merge.nwk` — the final coalescent species tree in Newick format + ### bootstraping To have bootstrap values a metric for quality of internal nodes, you can run the following diff --git a/environment.yml b/environment.yml index 5355b124..c798783c 100644 --- a/environment.yml +++ b/environment.yml @@ -23,3 +23,5 @@ dependencies: - samtools - filelock - pysam + - aster + - clipkit diff --git a/read2tree/CoalescentInference.py b/read2tree/CoalescentInference.py new file mode 100644 index 00000000..6b85867e --- /dev/null +++ b/read2tree/CoalescentInference.py @@ -0,0 +1,226 @@ +#!/usr/bin/env python +''' + CoalescentInference: filter per-OG alignments, infer per-gene trees with IQ-TREE, + and run ASTER to produce a coalescent species tree (step 4astral). + + -- Step 4 of the read2tree pipeline. +''' +import os +import glob +import time +import logging +from multiprocessing import Pool +from Bio import SeqIO, AlignIO + +from read2tree.wrappers.treebuilders.iqtree import Iqtree, get_gene_tree_options +from read2tree.wrappers.treebuilders.base_treebuilder import DataType +from read2tree.wrappers.treebuilders.aster import Aster +from read2tree.wrappers import WrapperError + +logger = logging.getLogger(__name__) + + +def _run_gene_tree(task): + """ + Module-level worker for multiprocessing pool. 
+ Runs IQ-TREE on a single alignment file and writes the treefile. + Returns the Newick tree string, or None on failure. + """ + alignment_file, gene_trees_folder = task + og_name = os.path.basename(alignment_file).rsplit('.', 1)[0] + try: + iqtree_wrapper = Iqtree(alignment_file, datatype=DataType.PROTEIN) + iqtree_wrapper.options = get_gene_tree_options() + tree = iqtree_wrapper() + if tree: + treefile = os.path.join(gene_trees_folder, og_name + '.treefile') + with open(treefile, 'w') as fh: + fh.write(tree.strip() + '\n') + return tree + except Exception as e: + logger.error('Gene tree failed for {}: {}'.format(og_name, e)) + return None + + +class CoalescentInference(object): + """ + Orchestrates the coalescent species tree pipeline (step 4astral): + + 1. Filter per-OG alignments from 06_align_merge_aa by gap fraction and + taxon occupancy, writing clean FASTA to 07_astral_filtered_aa. + 2. Optionally trim filtered alignments with ClipKIT (--trim flag), + writing results to 07_astral_trimmed_aa. + 3. Run IQ-TREE on each alignment in parallel using multiprocessing, + writing individual gene treefiles to 08_gene_trees. + 4. Concatenate gene trees into a single treefile and run ASTER to + produce the coalescent species tree. 
+ """ + + def __init__(self, args): + self.args = args + self._species_name = 'merge' + self._filtered_folder = self._make_output_path('07_astral_filtered_aa') + self._trimmed_folder = self._make_output_path('07_astral_trimmed_aa') if args.trim else None + self._gene_trees_folder = self._make_output_path('08_gene_trees') + self.elapsed_time = 0 + self.tree = None + self._run() + + def _run(self): + start = time.time() + + filtered_files = self._filter_alignments() + if not filtered_files: + logger.error('{}: No alignments passed filtering for step 4astral.'.format(self._species_name)) + return + + if self.args.trim: + input_files = self._trim_alignments(filtered_files) + if not input_files: + logger.error('{}: No alignments remain after ClipKIT trimming.'.format(self._species_name)) + return + else: + input_files = filtered_files + + gene_tree_file = self._infer_gene_trees(input_files) + self.tree = self._infer_species_tree(gene_tree_file) + + end = time.time() + self.elapsed_time = end - start + logger.info('{}: Step 4astral coalescent inference took {:.2f}s.'.format( + self._species_name, self.elapsed_time)) + + def _make_output_path(self, prefix): + path = os.path.join(self.args.output_path, prefix) + if not os.path.exists(path): + os.makedirs(path) + return path + + def _filter_alignments(self): + """ + Filter per-OG alignments from 06_align_merge_aa by gap fraction and occupancy. + + The .fa files produced by step 3combine are phylip-relaxed format despite their + extension; this method reads them correctly and writes passing alignments as + standard FASTA to 07_astral_filtered_aa. 
+ + :return: list of paths to filtered FASTA files + """ + input_folder = os.path.join(self.args.output_path, '06_align_merge_aa') + log_path = os.path.join(self._filtered_folder, 'filtering_summary.txt') + filtered_files = [] + total = 0 + passed = 0 + dropped = 0 + + with open(log_path, 'w') as log: + log.write('OG_Name\tOriginal_Sequences\tSequences_Passing_Gap_Filter\tStatus\n') + for filepath in sorted(glob.glob(os.path.join(input_folder, '*.fa'))): + total += 1 + og_name = os.path.basename(filepath) + orig_count = 0 + valid_records = [] + try: + alignment = AlignIO.read(filepath, 'phylip-relaxed') + for record in alignment: + orig_count += 1 + seq_str = str(record.seq).upper() + gap_count = seq_str.count('-') + seq_str.count('X') + seq_str.count('N') + if len(seq_str) > 0 and (gap_count / len(seq_str)) <= self.args.max_gap: + valid_records.append(record) + + if len(valid_records) >= self.args.min_samples: + out_path = os.path.join(self._filtered_folder, + og_name.replace('.fa', '.fasta')) + SeqIO.write(valid_records, out_path, 'fasta') + filtered_files.append(out_path) + passed += 1 + status = 'KEPT' + else: + dropped += 1 + status = 'DROPPED (Low Occupancy)' + except Exception as e: + dropped += 1 + status = 'ERROR: {}'.format(e) + + log.write('{}\t{}\t{}\t{}\n'.format(og_name, orig_count, len(valid_records), status)) + + log.write('\n=== TOTALS ===\n') + log.write('Total evaluated: {}\nKept: {}\nDropped/failed: {}\n'.format( + total, passed, dropped)) + if total > 0: + log.write('Retention rate: {:.2f}%\n'.format((passed / total) * 100)) + + logger.info('{}: Alignment filtering kept {} of {} OGs.'.format( + self._species_name, passed, total)) + return filtered_files + + def _trim_alignments(self, filtered_files): + """ + Run ClipKIT on each filtered alignment. 
+ + :param filtered_files: list of FASTA alignment paths + :return: list of trimmed FASTA paths that are non-empty after trimming + """ + from read2tree.wrappers.aligners.clipkit import Clipkit + trimmed_files = [] + for fasta_file in filtered_files: + og_name = os.path.basename(fasta_file) + trimmed_path = os.path.join(self._trimmed_folder, og_name) + try: + clipkit_wrapper = Clipkit(fasta_file, trimmed_path) + result = clipkit_wrapper() + if result: + trimmed_files.append(result) + else: + logger.warning('{}: ClipKIT produced empty output for {}, skipping.'.format( + self._species_name, og_name)) + except WrapperError as e: + logger.error('{}: ClipKIT failed for {}: {}'.format(self._species_name, og_name, e)) + logger.info('{}: ClipKIT trimming kept {} of {} alignments.'.format( + self._species_name, len(trimmed_files), len(filtered_files))) + return trimmed_files + + def _infer_gene_trees(self, alignment_files): + """ + Run IQ-TREE on each alignment in parallel using multiprocessing.Pool. + Collects all gene trees into a single Newick file for ASTER. 
+ + :param alignment_files: list of FASTA alignment paths + :return: path to the concatenated gene treefile + """ + tasks = [(f, self._gene_trees_folder) for f in alignment_files] + logger.info('{}: Running per-gene IQ-TREE on {} alignments with {} workers.'.format( + self._species_name, len(tasks), self.args.threads)) + + p = Pool(self.args.threads) + results = p.map(_run_gene_tree, tasks) + p.close() + p.join() + + trees = [t for t in results if t is not None] + gene_tree_file = os.path.join(self.args.output_path, + 'gene_trees_' + self._species_name + '.nwk') + with open(gene_tree_file, 'w') as fh: + for tree in trees: + fh.write(tree.strip() + '\n') + + logger.info('{}: {} of {} gene trees successfully inferred.'.format( + self._species_name, len(trees), len(tasks))) + return gene_tree_file + + def _infer_species_tree(self, gene_tree_file): + """ + Run ASTER (astral3) on the collected gene trees to estimate a coalescent species tree. + + :param gene_tree_file: path to file containing one gene tree (Newick) per line + :return: coalescent species tree in Newick format + """ + species_tree_file = os.path.join(self.args.output_path, + 'astral_tree_' + self._species_name + '.nwk') + aster_wrapper = Aster(gene_tree_file, species_tree_file) + aster_wrapper.options.options['-t'].set_value(self.args.threads) + tree = aster_wrapper() + logger.info('{}: Coalescent species tree written to {}'.format( + self._species_name, species_tree_file)) + return tree diff --git a/read2tree/main.py b/read2tree/main.py index 2afcdb52..320fc29b 100644 --- a/read2tree/main.py +++ b/read2tree/main.py @@ -25,6 +25,7 @@ from read2tree.Aligner import Aligner # from read2tree.Progress import Progress from read2tree.TreeInference import TreeInference +from read2tree.CoalescentInference import CoalescentInference from read2tree.parser import OMAOutputParser import argparse import glob @@ -172,8 +173,21 @@ def parse_args(argv, exe_name, desc): help='[Default is false] Compute tree, otherwise just 
' 'output concatenated alignment!') + arg_parser.add_argument('--min_samples', type=int, default=10, + help='[Default is 10] Minimum number of sequences per OG ' + 'after gap filtering. Used by step 4astral.') + + arg_parser.add_argument('--max_gap', type=float, default=0.80, + help='[Default is 0.80] Maximum allowed fraction of gaps ' + '(-, X, N) per sequence. Used by step 4astral.') + + arg_parser.add_argument('--trim', action='store_true', + help='[Default is false] Run ClipKIT trimming on filtered ' + 'alignments before per-gene IQ-TREE. Requires clipkit ' + 'in PATH. Used by step 4astral.') + arg_parser.add_argument('--step', default="all", - help='[Default is all 1marker 2map 3combine ') + help='[Default is all 1marker 2map 3combine 4astral') # arg_parser.add_argument('--merge_all_mappings', action='store_true', # help='[Default is off] In case multiple species were mapped to ' @@ -247,7 +261,7 @@ def parse_args(argv, exe_name, desc): if args.species_name: _species_name = args.species_name - if args.step == "3combine": # todo why is needed? + if args.step == "3combine" or args.step == "4astral": _species_name = 'merge' args.reads = _reads @@ -445,6 +459,21 @@ def main(argv, exe_name, desc=''): logger.info(' ------- Read2Tree finished -*- -------') + if args.step == "4astral": + input_align_folder = os.path.join(args.output_path, '06_align_merge_aa') + if not os.path.exists(input_align_folder): + logger.error( + 'Step 4astral requires completed step 3combine output. 
' + 'Folder not found: {}'.format(input_align_folder)) + sys.exit() + logger.info('{}: ------- Read2Tree step 4astral (coalescent species tree) -------'.format( + args.species_name)) + coalescent = CoalescentInference(args) + if coalescent.tree: + logger.info(str(coalescent.tree)) + print("done- 4astral") + logger.info(' ------- Read2Tree step 4astral finished -*- -------') + print("done- main ") # TODO: Check whether all the necessary binaries are available diff --git a/read2tree/wrappers/aligners/__init__.py b/read2tree/wrappers/aligners/__init__.py index 363b16b8..ed9f0fe7 100644 --- a/read2tree/wrappers/aligners/__init__.py +++ b/read2tree/wrappers/aligners/__init__.py @@ -2,4 +2,5 @@ from .muscle import Muscle from .prographmsa import ProGraphMSA from .probcons import ProbCons +from .clipkit import Clipkit from .base_aligner import AlignmentInput, DataType, WrapperError \ No newline at end of file diff --git a/read2tree/wrappers/aligners/clipkit.py b/read2tree/wrappers/aligners/clipkit.py new file mode 100644 index 00000000..9bbec9e9 --- /dev/null +++ b/read2tree/wrappers/aligners/clipkit.py @@ -0,0 +1,71 @@ +import os +import time +import logging +from ..abstract_cli import AbstractCLI +from ..options import StringOption, FloatOption, OptionSet +from read2tree.wrappers import WrapperError + +logger = logging.getLogger(__name__) + + +class ClipkitCLI(AbstractCLI): + @property + def _default_exe(self): + return 'clipkit' + + +class Clipkit(object): + """ + Wrapper for ClipKIT alignment trimmer. + + Takes an input alignment file and writes a trimmed alignment to output_file. + Returns the output file path on success, None if the output is empty. 
+ + :Example: + + :: + + clipkit_wrapper = Clipkit('alignment.fasta', 'alignment_trimmed.fasta') + result = clipkit_wrapper() + time_taken = clipkit_wrapper.elapsed_time + """ + + def __init__(self, input_file, output_file, binary=None): + self.input_file = input_file + self.output_file = output_file + self.options = get_default_options() + self.elapsed_time = None + self.stdout = None + self.stderr = None + self.result = None + try: + self.cli = ClipkitCLI(executable=binary) + except IOError as err: + raise WrapperError('Error searching for clipkit binary: {}'.format(err)) + + def __call__(self, *args, **kwargs): + start = time.time() + output, error = self._call(self.input_file, self.output_file) + self.stdout = output + self.stderr = error + self.result = self.output_file if os.path.exists(self.output_file) and os.path.getsize(self.output_file) > 0 else None + end = time.time() + self.elapsed_time = end - start + return self.result + + def _call(self, input_file, output_file): + self.cli('{} {} -o {}'.format(input_file, self.command(), output_file), wait=True) + return self.cli.get_stdout(), self.cli.get_stderr() + + def command(self): + return str(self.options) + + def _init_cli(self, binary): + return ClipkitCLI(executable=binary) + + +def get_default_options(): + return OptionSet([ + StringOption('-m', 'gappy', active=True), + FloatOption('-g', 0.8, active=True), + ]) diff --git a/read2tree/wrappers/treebuilders/__init__.py b/read2tree/wrappers/treebuilders/__init__.py index 1fa1f09b..4ff07347 100644 --- a/read2tree/wrappers/treebuilders/__init__.py +++ b/read2tree/wrappers/treebuilders/__init__.py @@ -2,3 +2,4 @@ from .raxml import Raxml from .iqtree import Iqtree from .fasttree import Fasttree +from .aster import Aster diff --git a/read2tree/wrappers/treebuilders/aster.py b/read2tree/wrappers/treebuilders/aster.py new file mode 100644 index 00000000..98c2ea7d --- /dev/null +++ b/read2tree/wrappers/treebuilders/aster.py @@ -0,0 +1,79 @@ +import os +import 
time +import logging +from ..abstract_cli import AbstractCLI +from ..options import IntegerOption, OptionSet +from read2tree.wrappers import WrapperError + +logger = logging.getLogger(__name__) +logger.addHandler(logging.StreamHandler()) +logger.setLevel(logging.INFO) + + +class AsterCLI(AbstractCLI): + @property + def _default_exe(self): + return 'astral3' + + +class Aster(object): + """ + Wrapper for ASTER (astral3) coalescent species tree estimator. + + Takes a file of gene trees (one Newick tree per line) and writes a species + tree to output_file using the ASTRAL-III algorithm. Returns the species + tree in Newick format. + + :Example: + + :: + + aster_wrapper = Aster('gene_trees.nwk', 'species_tree.nwk') + aster_wrapper.options.options['-t'].set_value(8) + result = aster_wrapper() + time_taken = aster_wrapper.elapsed_time + """ + + def __init__(self, gene_tree_file, output_file, binary=None): + self.gene_tree_file = gene_tree_file + self.output_file = output_file + self.options = get_default_options() + self.elapsed_time = None + self.stdout = None + self.stderr = None + self.result = None + try: + self.cli = AsterCLI(executable=binary) + except IOError as err: + raise WrapperError('Error searching for astral3 binary: {}'.format(err)) + + def __call__(self, *args, **kwargs): + start = time.time() + output, error = self._call(self.gene_tree_file, self.output_file) + self.stdout = output + self.stderr = error + self.result = self._read_result(self.output_file) + end = time.time() + self.elapsed_time = end - start + return self.result + + def _call(self, gene_tree_file, output_file): + self.cli('{} -i {} -o {}'.format(self.command(), gene_tree_file, output_file), wait=True) + return self.cli.get_stdout(), self.cli.get_stderr() + + def command(self): + return str(self.options) + + def _read_result(self, output_file): + try: + with open(output_file, 'r') as fh: + return fh.read().strip() + except IOError: + logger.error('Error reading ASTER output: 
{}'.format(output_file)) + return None + + +def get_default_options(): + return OptionSet([ + IntegerOption('-t', 1, active=True), + ]) diff --git a/read2tree/wrappers/treebuilders/iqtree.py b/read2tree/wrappers/treebuilders/iqtree.py index 4875a9cc..074d042a 100644 --- a/read2tree/wrappers/treebuilders/iqtree.py +++ b/read2tree/wrappers/treebuilders/iqtree.py @@ -147,3 +147,15 @@ def get_default_options(): # Bootstrap + ML tree + consensus tree (>=100) IntegerOption('-b', 0, active=False) ]) + + +def get_gene_tree_options(): + """Options for per-gene tree inference in the coalescent pipeline (step 4astral).""" + return OptionSet([ + IntegerOption('-nt', 1, active=True), + StringOption('-m', 'LG+F+G', active=True), + StringOption('-st', 'AA', active=True), + StringOption('-mem', '4G', active=True), + IntegerOption('-alrt', 1000, active=True), + FlagOption('-fast', True, active=True), + ]) From dbc5f96fdf6a6a82cea0468919cb552c62592c0b Mon Sep 17 00:00:00 2001 From: Alex Raiyemo Date: Thu, 21 May 2026 02:04:43 -0400 Subject: [PATCH 02/14] README: add empirical guidance on --min_samples threshold for large datasets --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 50a566a2..9ac89e05 100644 --- a/README.md +++ b/README.md @@ -141,6 +141,8 @@ Step 4 does the following automatically: 2. Runs IQ-TREE on each passing alignment in parallel (`-m LG+F+G`, `-alrt 1000`, `-fast`) to infer individual gene trees. 3. Collects all gene trees and passes them to [ASTER](https://github.com/chaoszhang/ASTER) (`astral3`) to produce the final coalescent species tree. +**Choosing `--min_samples` for large datasets.** The default of 10 is intentionally permissive so the tool works out of the box for small test datasets. For studies with many samples, the occupancy threshold has a large effect on how many OGs survive filtering and on the quality of the resulting gene trees. 
A useful empirical guideline is to require at least **30–40% taxon occupancy** — for example, `--min_samples 100` for a dataset of ~300 samples. In practice, applying a meaningful occupancy threshold together with `--max_gap 0.80` can reduce the number of OGs from tens of thousands to a few hundred; this is expected and desirable, as the surviving alignments are well-sampled across the tree and produce far more reliable gene trees for ASTRAL than a large set of sparse, gap-heavy alignments would. If the filtered set is very small (fewer than ~50 OGs), consider relaxing `--min_samples` slightly rather than `--max_gap`, since taxon occupancy drives gene tree resolution more than column-level gap content. + Optionally, pass `--trim` to run [ClipKIT](https://github.com/JLSteenwyk/ClipKIT) column-trimming on each alignment before gene tree inference: ``` read2tree --step 4astral --standalone_path marker_genes --dna_reference dna_ref.fa --output_path output --threads 24 --trim From b4047b3a397c7a57e974f3359bf92c10bfc3a3e0 Mon Sep 17 00:00:00 2001 From: Alex Raiyemo Date: Thu, 21 May 2026 02:16:05 -0400 Subject: [PATCH 03/14] Make ASTER binary configurable via --astral_binary; auto-detect astral3/astral-pro3/astral-pro2 --- read2tree/CoalescentInference.py | 3 ++- read2tree/main.py | 6 ++++++ read2tree/wrappers/treebuilders/aster.py | 2 +- 3 files changed, 9 insertions(+), 2 deletions(-) diff --git a/read2tree/CoalescentInference.py b/read2tree/CoalescentInference.py index 6b85867e..8ca6dea6 100644 --- a/read2tree/CoalescentInference.py +++ b/read2tree/CoalescentInference.py @@ -218,7 +218,8 @@ def _infer_species_tree(self, gene_tree_file): """ species_tree_file = os.path.join(self.args.output_path, 'astral_tree_' + self._species_name + '.nwk') - aster_wrapper = Aster(gene_tree_file, species_tree_file) + aster_wrapper = Aster(gene_tree_file, species_tree_file, + binary=getattr(self.args, 'astral_binary', None)) 
aster_wrapper.options.options['-t'].set_value(self.args.threads) tree = aster_wrapper() logger.info('{}: Coalescent species tree written to {}'.format( diff --git a/read2tree/main.py b/read2tree/main.py index 320fc29b..d73cc180 100644 --- a/read2tree/main.py +++ b/read2tree/main.py @@ -186,6 +186,12 @@ def parse_args(argv, exe_name, desc): 'alignments before per-gene IQ-TREE. Requires clipkit ' 'in PATH. Used by step 4astral.') + arg_parser.add_argument('--astral_binary', default=None, + help='[Default is auto-detect] Name or path of the ASTER ' + 'binary to use for coalescent species tree estimation ' + '(e.g. astral3, astral-pro3, astral-pro2). If not set, ' + 'the first available binary is used. Used by step 4astral.') + arg_parser.add_argument('--step', default="all", help='[Default is all 1marker 2map 3combine 4astral') diff --git a/read2tree/wrappers/treebuilders/aster.py b/read2tree/wrappers/treebuilders/aster.py index 98c2ea7d..3654747e 100644 --- a/read2tree/wrappers/treebuilders/aster.py +++ b/read2tree/wrappers/treebuilders/aster.py @@ -13,7 +13,7 @@ class AsterCLI(AbstractCLI): @property def _default_exe(self): - return 'astral3' + return ['astral3', 'astral-pro3', 'astral-pro2'] class Aster(object): From 42805ab8ce339288280d448d2241efbc6578128f Mon Sep 17 00:00:00 2001 From: Alex Raiyemo Date: Thu, 21 May 2026 02:17:51 -0400 Subject: [PATCH 04/14] README: document --astral_binary option and ASTER binary auto-detection --- README.md | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 9ac89e05..9f8a92fa 100644 --- a/README.md +++ b/README.md @@ -139,7 +139,12 @@ read2tree --step 4astral --standalone_path marker_genes --dna_reference dna_ref. Step 4 does the following automatically: 1. Filters the per-OG alignments from step 3 by taxon occupancy (`--min_samples`, default 10) and gap fraction (`--max_gap`, default 0.80). 2. 
Runs IQ-TREE on each passing alignment in parallel (`-m LG+F+G`, `-alrt 1000`, `-fast`) to infer individual gene trees. -3. Collects all gene trees and passes them to [ASTER](https://github.com/chaoszhang/ASTER) (`astral3`) to produce the final coalescent species tree. +3. Collects all gene trees and passes them to [ASTER](https://github.com/chaoszhang/ASTER) to produce the final coalescent species tree. + +**ASTER binary.** The ASTER suite provides several binaries (`astral3`, `astral-pro3`, `astral-pro2`). By default, step 4 auto-detects the first one available in your PATH in that order. To specify one explicitly — for example if you prefer `astral-pro3`, which handles both single and multi-copy gene trees: +``` +read2tree --step 4astral --standalone_path marker_genes --dna_reference dna_ref.fa --output_path output --threads 24 --astral_binary astral-pro3 +``` **Choosing `--min_samples` for large datasets.** The default of 10 is intentionally permissive so the tool works out of the box for small test datasets. For studies with many samples, the occupancy threshold has a large effect on how many OGs survive filtering and on the quality of the resulting gene trees. A useful empirical guideline is to require at least **30–40% taxon occupancy** — for example, `--min_samples 100` for a dataset of ~300 samples. In practice, applying a meaningful occupancy threshold together with `--max_gap 0.80` can reduce the number of OGs from tens of thousands to a few hundred; this is expected and desirable, as the surviving alignments are well-sampled across the tree and produce far more reliable gene trees for ASTRAL than a large set of sparse, gap-heavy alignments would. If the filtered set is very small (fewer than ~50 OGs), consider relaxing `--min_samples` slightly rather than `--max_gap`, since taxon occupancy drives gene tree resolution more than column-level gap content. 
From 61e02ab45b61214f508be047e86ce8c8f452ff6a Mon Sep 17 00:00:00 2001 From: Alex Raiyemo Date: Thu, 21 May 2026 15:24:12 -0400 Subject: [PATCH 05/14] Add wASTRAL and ASTRAL-IV support; include --abayes in per-gene IQ-TREE runs - get_gene_tree_options() now includes --abayes so aBayes posterior supports are annotated on gene trees alongside SH-aLRT, providing the best weighting signal for wASTRAL hybrid mode - AsterCLI auto-detection extended to wastral and astral4 as final fallbacks; astral3 remains the default - README updated with a comparison table of all ASTER binaries and usage examples for wastral and astral4, with guidance on when to choose each --- README.md | 16 ++++++++++++++-- read2tree/wrappers/treebuilders/aster.py | 2 +- read2tree/wrappers/treebuilders/iqtree.py | 8 +++++++- 3 files changed, 22 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 9f8a92fa..8b7cda67 100644 --- a/README.md +++ b/README.md @@ -141,9 +141,21 @@ Step 4 does the following automatically: 2. Runs IQ-TREE on each passing alignment in parallel (`-m LG+F+G`, `-alrt 1000`, `-fast`) to infer individual gene trees. 3. Collects all gene trees and passes them to [ASTER](https://github.com/chaoszhang/ASTER) to produce the final coalescent species tree. -**ASTER binary.** The ASTER suite provides several binaries (`astral3`, `astral-pro3`, `astral-pro2`). By default, step 4 auto-detects the first one available in your PATH in that order. To specify one explicitly — for example if you prefer `astral-pro3`, which handles both single and multi-copy gene trees: +**ASTER binary.** The ASTER suite provides several binaries, all installed by `conda install aster`. By default, step 4 auto-detects the first available in your PATH in this order: `astral3` → `astral-pro3` → `astral-pro2` → `wastral` → `astral4`. Use `--astral_binary` to opt into a specific estimator: + +| Binary | When to use | +|---|---| +| `astral3` | Default. Standard ASTRAL-III for single-copy orthologs. 
| +| `astral-pro3` | Multi-copy gene trees or allopolyploid taxa. | +| `wastral` | Recommended for noisy gene trees. Weights each quartet by the branch support and branch length of the gene tree branches that define it (hybrid mode), so poorly supported splits contribute less to the species tree. IQ-TREE's `--abayes` supports (already included in step 4) provide the best weighting signal. | +| `astral4` | Large datasets with substantial missing taxa, or when substitution-rate branch lengths on internal nodes are needed for downstream rate analyses. Implements ASTRAL-IV (Zhang et al., *MBE* 2025). | + ``` -read2tree --step 4astral --standalone_path marker_genes --dna_reference dna_ref.fa --output_path output --threads 24 --astral_binary astral-pro3 +# weighted ASTRAL — better accuracy when gene tree support is variable +read2tree --step 4astral --standalone_path marker_genes --dna_reference dna_ref.fa --output_path output --threads 24 --astral_binary wastral + +# ASTRAL-IV — better robustness under missing data +read2tree --step 4astral --standalone_path marker_genes --dna_reference dna_ref.fa --output_path output --threads 24 --astral_binary astral4 ``` **Choosing `--min_samples` for large datasets.** The default of 10 is intentionally permissive so the tool works out of the box for small test datasets. For studies with many samples, the occupancy threshold has a large effect on how many OGs survive filtering and on the quality of the resulting gene trees. A useful empirical guideline is to require at least **30–40% taxon occupancy** — for example, `--min_samples 100` for a dataset of ~300 samples. In practice, applying a meaningful occupancy threshold together with `--max_gap 0.80` can reduce the number of OGs from tens of thousands to a few hundred; this is expected and desirable, as the surviving alignments are well-sampled across the tree and produce far more reliable gene trees for ASTRAL than a large set of sparse, gap-heavy alignments would. 
If the filtered set is very small (fewer than ~50 OGs), consider relaxing `--min_samples` slightly rather than `--max_gap`, since taxon occupancy drives gene tree resolution more than column-level gap content. diff --git a/read2tree/wrappers/treebuilders/aster.py b/read2tree/wrappers/treebuilders/aster.py index 3654747e..7165a037 100644 --- a/read2tree/wrappers/treebuilders/aster.py +++ b/read2tree/wrappers/treebuilders/aster.py @@ -13,7 +13,7 @@ class AsterCLI(AbstractCLI): @property def _default_exe(self): - return ['astral3', 'astral-pro3', 'astral-pro2'] + return ['astral3', 'astral-pro3', 'astral-pro2', 'wastral', 'astral4'] class Aster(object): diff --git a/read2tree/wrappers/treebuilders/iqtree.py b/read2tree/wrappers/treebuilders/iqtree.py index 074d042a..189364e2 100644 --- a/read2tree/wrappers/treebuilders/iqtree.py +++ b/read2tree/wrappers/treebuilders/iqtree.py @@ -150,7 +150,12 @@ def get_default_options(): def get_gene_tree_options(): - """Options for per-gene tree inference in the coalescent pipeline (step 4astral).""" + """Options for per-gene tree inference in the coalescent pipeline (step 4astral). + + --abayes computes aBayes posterior branch supports alongside SH-aLRT values. + Both are annotated as node labels on the output tree, which wASTRAL uses for + quartet weighting when --astral_binary wastral is selected. 
+ """ return OptionSet([ IntegerOption('-nt', 1, active=True), StringOption('-m', 'LG+F+G', active=True), @@ -158,4 +163,5 @@ def get_gene_tree_options(): StringOption('-mem', '4G', active=True), IntegerOption('-alrt', 1000, active=True), FlagOption('-fast', True, active=True), + FlagOption('--abayes', True, active=True), ]) From 1c528ff3480594b3b1fa00e1b44ee1a433eb09a0 Mon Sep 17 00:00:00 2001 From: Alex Raiyemo Date: Thu, 21 May 2026 15:28:05 -0400 Subject: [PATCH 06/14] Fix stale astral3 references in aster.py and README Update docstring and error message in aster.py to reflect that multiple ASTER algorithms (astral3, wastral, astral4, etc.) are supported, not just ASTRAL-III. Add --abayes to the step 4 IQ-TREE invocation summary in the README. --- README.md | 2 +- read2tree/wrappers/treebuilders/aster.py | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 8b7cda67..3291cb32 100644 --- a/README.md +++ b/README.md @@ -138,7 +138,7 @@ read2tree --step 4astral --standalone_path marker_genes --dna_reference dna_ref. Step 4 does the following automatically: 1. Filters the per-OG alignments from step 3 by taxon occupancy (`--min_samples`, default 10) and gap fraction (`--max_gap`, default 0.80). -2. Runs IQ-TREE on each passing alignment in parallel (`-m LG+F+G`, `-alrt 1000`, `-fast`) to infer individual gene trees. +2. Runs IQ-TREE on each passing alignment in parallel (`-m LG+F+G`, `-alrt 1000`, `--abayes`, `-fast`) to infer individual gene trees. 3. Collects all gene trees and passes them to [ASTER](https://github.com/chaoszhang/ASTER) to produce the final coalescent species tree. **ASTER binary.** The ASTER suite provides several binaries, all installed by `conda install aster`. By default, step 4 auto-detects the first available in your PATH in this order: `astral3` → `astral-pro3` → `astral-pro2` → `wastral` → `astral4`. 
Use `--astral_binary` to opt into a specific estimator: diff --git a/read2tree/wrappers/treebuilders/aster.py b/read2tree/wrappers/treebuilders/aster.py index 7165a037..efd48d5b 100644 --- a/read2tree/wrappers/treebuilders/aster.py +++ b/read2tree/wrappers/treebuilders/aster.py @@ -18,11 +18,11 @@ def _default_exe(self): class Aster(object): """ - Wrapper for ASTER (astral3) coalescent species tree estimator. + Wrapper for the ASTER suite of coalescent species tree estimators. Takes a file of gene trees (one Newick tree per line) and writes a species - tree to output_file using the ASTRAL-III algorithm. Returns the species - tree in Newick format. + tree to output_file using the selected ASTER algorithm. Returns the + species tree in Newick format. :Example: @@ -45,7 +45,7 @@ def __init__(self, gene_tree_file, output_file, binary=None): try: self.cli = AsterCLI(executable=binary) except IOError as err: - raise WrapperError('Error searching for astral3 binary: {}'.format(err)) + raise WrapperError('Error searching for ASTER binary: {}'.format(err)) def __call__(self, *args, **kwargs): start = time.time() From e743dfc016e22af692cd9f9cfd632c5a3245894e Mon Sep 17 00:00:00 2001 From: Alex Raiyemo Date: Thu, 21 May 2026 15:50:08 -0400 Subject: [PATCH 07/14] Add --iqtree_model, --iqtree_args, --astral_args pass-through for step 4astral Expose three optional arguments scoped to step 4astral that let users override the built-in IQ-TREE substitution model and append arbitrary flags to both IQ-TREE and ASTER invocations. Defaults are unchanged. README documents the new options with example invocations. 
--- README.md | 19 +++++++++++++++++++ read2tree/CoalescentInference.py | 14 ++++++++++++-- read2tree/main.py | 19 +++++++++++++++++++ 3 files changed, 50 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 3291cb32..53bc61b8 100644 --- a/README.md +++ b/README.md @@ -165,6 +165,25 @@ Optionally, pass `--trim` to run [ClipKIT](https://github.com/JLSteenwyk/ClipKIT read2tree --step 4astral --standalone_path marker_genes --dna_reference dna_ref.fa --output_path output --threads 24 --trim ``` +**IQ-TREE and ASTER options.** Step 4 uses sensible defaults for per-gene tree inference (`-m LG+F+G -alrt 1000 --abayes -fast`). For cases where these need to be adjusted, three pass-through arguments are available: + +| Argument | Default | Purpose | +|---|---|---| +| `--iqtree_model` | `LG+F+G` | Substitution model for per-gene IQ-TREE runs (e.g. `WAG+G`, `LG+G`, `TEST` for ModelFinder) | +| `--iqtree_args` | none | Extra flags appended verbatim to every per-gene IQ-TREE call (e.g. `"-bb 1000 -redo"`) | +| `--astral_args` | none | Extra flags appended verbatim to the ASTER call (e.g. `"-C --root OUTGROUP"`) | + +``` +# Use WAG+G model instead of LG+F+G +read2tree --step 4astral --standalone_path marker_genes --dna_reference dna_ref.fa --output_path output --threads 24 --iqtree_model WAG+G + +# Run ModelFinder per gene (much slower, but selects best-fit model for each OG) +read2tree --step 4astral --standalone_path marker_genes --dna_reference dna_ref.fa --output_path output --threads 24 --iqtree_model TEST + +# Pass extra flags to ASTER (e.g. 
polytomy test) +read2tree --step 4astral --standalone_path marker_genes --dna_reference dna_ref.fa --output_path output --threads 24 --astral_args "-C" +``` + Key output files written to `output/`: - `07_astral_filtered_aa/` — per-OG FASTA alignments that passed filtering - `07_astral_trimmed_aa/` — ClipKIT-trimmed alignments (only when `--trim` is used) diff --git a/read2tree/CoalescentInference.py b/read2tree/CoalescentInference.py index 8ca6dea6..23df0e80 100644 --- a/read2tree/CoalescentInference.py +++ b/read2tree/CoalescentInference.py @@ -15,6 +15,7 @@ from read2tree.wrappers.treebuilders.iqtree import Iqtree, get_gene_tree_options from read2tree.wrappers.treebuilders.base_treebuilder import DataType from read2tree.wrappers.treebuilders.aster import Aster +from read2tree.wrappers.options import StringOption from read2tree.wrappers import WrapperError logger = logging.getLogger(__name__) @@ -26,11 +27,15 @@ def _run_gene_tree(task): Runs IQ-TREE on a single alignment file and writes the treefile. Returns the Newick tree string, or None on failure. 
""" - alignment_file, gene_trees_folder = task + alignment_file, gene_trees_folder, iqtree_model, iqtree_extra = task og_name = os.path.basename(alignment_file).rsplit('.', 1)[0] try: iqtree_wrapper = Iqtree(alignment_file, datatype=DataType.PROTEIN) iqtree_wrapper.options = get_gene_tree_options() + if iqtree_model: + iqtree_wrapper.options.options['-m'].set_value(iqtree_model) + if iqtree_extra: + iqtree_wrapper.options.options['_extra'] = StringOption('', iqtree_extra, active=True) tree = iqtree_wrapper() if tree: treefile = os.path.join(gene_trees_folder, og_name + '.treefile') @@ -189,7 +194,9 @@ def _infer_gene_trees(self, alignment_files): :param alignment_files: list of FASTA alignment paths :return: path to the concatenated gene treefile """ - tasks = [(f, self._gene_trees_folder) for f in alignment_files] + iqtree_model = getattr(self.args, 'iqtree_model', None) + iqtree_extra = getattr(self.args, 'iqtree_args', None) + tasks = [(f, self._gene_trees_folder, iqtree_model, iqtree_extra) for f in alignment_files] logger.info('{}: Running per-gene IQ-TREE on {} alignments with {} workers.'.format( self._species_name, len(tasks), self.args.threads)) @@ -221,6 +228,9 @@ def _infer_species_tree(self, gene_tree_file): aster_wrapper = Aster(gene_tree_file, species_tree_file, binary=getattr(self.args, 'astral_binary', None)) aster_wrapper.options.options['-t'].set_value(self.args.threads) + astral_extra = getattr(self.args, 'astral_args', None) + if astral_extra: + aster_wrapper.options.options['_extra'] = StringOption('', astral_extra, active=True) tree = aster_wrapper() logger.info('{}: Coalescent species tree written to {}'.format( self._species_name, species_tree_file)) diff --git a/read2tree/main.py b/read2tree/main.py index d73cc180..311afb69 100644 --- a/read2tree/main.py +++ b/read2tree/main.py @@ -192,6 +192,25 @@ def parse_args(argv, exe_name, desc): '(e.g. astral3, astral-pro3, astral-pro2). If not set, ' 'the first available binary is used. 
Used by step 4astral.') + arg_parser.add_argument('--iqtree_model', default=None, + help='[Default is LG+F+G] Substitution model passed to IQ-TREE ' + 'for per-gene tree inference in step 4astral. Overrides the ' + 'built-in default (e.g. --iqtree_model WAG+G, ' + '--iqtree_model LG+G, --iqtree_model TEST). ' + 'Used by step 4astral.') + + arg_parser.add_argument('--iqtree_args', default=None, + help='[Default is none] Extra IQ-TREE flags appended verbatim to ' + 'every per-gene tree invocation in step 4astral. Use quotes ' + 'for multiple flags (e.g. --iqtree_args "-bb 1000 -redo"). ' + 'Used by step 4astral.') + + arg_parser.add_argument('--astral_args', default=None, + help='[Default is none] Extra flags appended verbatim to the ASTER ' + 'command in step 4astral. Use quotes for multiple flags ' + '(e.g. --astral_args "-C --root OUTGROUP"). ' + 'Used by step 4astral.') + arg_parser.add_argument('--step', default="all", help='[Default is all 1marker 2map 3combine 4astral') From c2941540f7b23631cbe51d8863e6e96ca97411ee Mon Sep 17 00:00:00 2001 From: Alex Raiyemo Date: Fri, 22 May 2026 15:40:32 -0400 Subject: [PATCH 08/14] Refine ASTER binary descriptions in README --- README.md | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 53bc61b8..661479a2 100644 --- a/README.md +++ b/README.md @@ -146,9 +146,9 @@ Step 4 does the following automatically: | Binary | When to use | |---|---| | `astral3` | Default. Standard ASTRAL-III for single-copy orthologs. | -| `astral-pro3` | Multi-copy gene trees or allopolyploid taxa. | -| `wastral` | Recommended for noisy gene trees. Weights each quartet by the branch support and branch length of the gene tree branches that define it (hybrid mode), so poorly supported splits contribute less to the species tree. IQ-TREE's `--abayes` supports (already included in step 4) provide the best weighting signal. 
| -| `astral4` | Large datasets with substantial missing taxa, or when substitution-rate branch lengths on internal nodes are needed for downstream rate analyses. Implements ASTRAL-IV (Zhang et al., *MBE* 2025). | +| `astral-pro3` | Gene trees with paralogs (multi-copy gene families). | +| `wastral` | Recommended for noisy gene trees. Weights each quartet by gene tree branch support, so poorly supported splits contribute less to the species tree. IQ-TREE's `--abayes` supports (already included in step 4) provide the weighting signal. | +| `astral4` | Large datasets with substantial missing taxa, or when substitution-rate branch lengths on internal nodes are needed for downstream rate analyses. | ``` # weighted ASTRAL — better accuracy when gene tree support is variable @@ -171,7 +171,7 @@ read2tree --step 4astral --standalone_path marker_genes --dna_reference dna_ref. |---|---|---| | `--iqtree_model` | `LG+F+G` | Substitution model for per-gene IQ-TREE runs (e.g. `WAG+G`, `LG+G`, `TEST` for ModelFinder) | | `--iqtree_args` | none | Extra flags appended verbatim to every per-gene IQ-TREE call (e.g. `"-bb 1000 -redo"`) | -| `--astral_args` | none | Extra flags appended verbatim to the ASTER call (e.g. `"-C --root OUTGROUP"`) | +| `--astral_args` | none | Extra flags appended verbatim to the ASTER call; see the ASTER documentation for available options | ``` # Use WAG+G model instead of LG+F+G @@ -179,9 +179,6 @@ read2tree --step 4astral --standalone_path marker_genes --dna_reference dna_ref. # Run ModelFinder per gene (much slower, but selects best-fit model for each OG) read2tree --step 4astral --standalone_path marker_genes --dna_reference dna_ref.fa --output_path output --threads 24 --iqtree_model TEST - -# Pass extra flags to ASTER (e.g. 
polytomy test) -read2tree --step 4astral --standalone_path marker_genes --dna_reference dna_ref.fa --output_path output --threads 24 --astral_args "-C" ``` Key output files written to `output/`: From 0424027f586848017a60247bee9ba0f587d3d188 Mon Sep 17 00:00:00 2001 From: Alex Raiyemo Date: Sat, 23 May 2026 12:58:15 -0400 Subject: [PATCH 09/14] Minor README formatting and phrasing cleanup in step 4 section --- README.md | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 661479a2..1d49ea71 100644 --- a/README.md +++ b/README.md @@ -130,7 +130,7 @@ Tunable filters (only active with `--meta`): #### step4 (optional: coalescent species tree) -Step 3 builds a supermatrix tree by concatenating all OG alignments. If you want a **coalescent-based species tree** instead — which better handles incomplete lineage sorting and the different evolutionary histories of individual genes — run step 4 after step 3: +Step 3 builds a supermatrix tree by concatenating all OG alignments. If you want a **coalescent-based species tree** instead, which better handles incomplete lineage sorting and the different evolutionary histories of individual genes, run step 4 after step 3: ``` read2tree --step 4astral --standalone_path marker_genes --dna_reference dna_ref.fa --output_path output --threads 24 @@ -141,7 +141,7 @@ Step 4 does the following automatically: 2. Runs IQ-TREE on each passing alignment in parallel (`-m LG+F+G`, `-alrt 1000`, `--abayes`, `-fast`) to infer individual gene trees. 3. Collects all gene trees and passes them to [ASTER](https://github.com/chaoszhang/ASTER) to produce the final coalescent species tree. -**ASTER binary.** The ASTER suite provides several binaries, all installed by `conda install aster`. By default, step 4 auto-detects the first available in your PATH in this order: `astral3` → `astral-pro3` → `astral-pro2` → `wastral` → `astral4`. 
Use `--astral_binary` to opt into a specific estimator: +**ASTER binary.** The ASTER suite provides several binaries, all installed by `conda install aster`. By default, step 4 auto-detects the first available in your PATH, in this order: `astral3`, `astral-pro3`, `astral-pro2`, `wastral`, `astral4`. Use `--astral_binary` to opt into a specific estimator: | Binary | When to use | |---|---| @@ -158,7 +158,7 @@ read2tree --step 4astral --standalone_path marker_genes --dna_reference dna_ref. read2tree --step 4astral --standalone_path marker_genes --dna_reference dna_ref.fa --output_path output --threads 24 --astral_binary astral4 ``` -**Choosing `--min_samples` for large datasets.** The default of 10 is intentionally permissive so the tool works out of the box for small test datasets. For studies with many samples, the occupancy threshold has a large effect on how many OGs survive filtering and on the quality of the resulting gene trees. A useful empirical guideline is to require at least **30–40% taxon occupancy** — for example, `--min_samples 100` for a dataset of ~300 samples. In practice, applying a meaningful occupancy threshold together with `--max_gap 0.80` can reduce the number of OGs from tens of thousands to a few hundred; this is expected and desirable, as the surviving alignments are well-sampled across the tree and produce far more reliable gene trees for ASTRAL than a large set of sparse, gap-heavy alignments would. If the filtered set is very small (fewer than ~50 OGs), consider relaxing `--min_samples` slightly rather than `--max_gap`, since taxon occupancy drives gene tree resolution more than column-level gap content. +**Choosing `--min_samples` for large datasets.** The default of 10 is intentionally permissive so the tool works out of the box for small test datasets. For studies with many samples, the occupancy threshold has a large effect on how many OGs survive filtering and on the quality of the resulting gene trees.
A useful empirical guideline is to require at least 30–40% taxon occupancy; for example, use `--min_samples 100` for a dataset of ~300 samples. In practice, applying a meaningful occupancy threshold together with `--max_gap 0.80` can reduce the number of OGs from tens of thousands to a few hundred; this is expected and desirable, as the surviving alignments are well-sampled across the tree and produce far more reliable gene trees for ASTRAL than a large set of sparse, gap-heavy alignments would. If the filtered set is very small (fewer than ~50 OGs), consider relaxing `--min_samples` slightly rather than `--max_gap`, since taxon occupancy drives gene tree resolution more than column-level gap content. Optionally, pass `--trim` to run [ClipKIT](https://github.com/JLSteenwyk/ClipKIT) column-trimming on each alignment before gene tree inference: ``` @@ -182,11 +182,11 @@ read2tree --step 4astral --standalone_path marker_genes --dna_reference dna_ref. ``` Key output files written to `output/`: -- `07_astral_filtered_aa/` — per-OG FASTA alignments that passed filtering -- `07_astral_trimmed_aa/` — ClipKIT-trimmed alignments (only when `--trim` is used) -- `08_gene_trees/` — individual IQ-TREE gene tree files -- `gene_trees_merge.nwk` — all gene trees concatenated into one file (input to ASTER) -- `astral_tree_merge.nwk` — the final coalescent species tree in Newick format +- `07_astral_filtered_aa/` - per-OG FASTA alignments that passed filtering +- `07_astral_trimmed_aa/` - ClipKIT-trimmed alignments (only when `--trim` is used) +- `08_gene_trees/` - individual IQ-TREE gene tree files +- `gene_trees_merge.nwk` - all gene trees concatenated into one file (input to ASTER) +- `astral_tree_merge.nwk` - the final coalescent species tree in Newick format ### bootstraping From d66353a4ec1626361f7fb99ebc5c70ba1f44cdc9 Mon Sep 17 00:00:00 2001 From: Alex Raiyemo Date: Sat, 23 May 2026 13:05:56 -0400 Subject: [PATCH 10/14] Resume interrupted step 4astral runs by skipping completed
gene trees If a .treefile already exists and is non-empty in 08_gene_trees/, the worker skips the IQ-TREE call and returns the existing tree directly. This lets a user re-invoke --step 4astral after an interruption without re-running gene tree inference from scratch. --- read2tree/CoalescentInference.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/read2tree/CoalescentInference.py b/read2tree/CoalescentInference.py index 23df0e80..e16ae4ea 100644 --- a/read2tree/CoalescentInference.py +++ b/read2tree/CoalescentInference.py @@ -29,6 +29,10 @@ def _run_gene_tree(task): """ alignment_file, gene_trees_folder, iqtree_model, iqtree_extra = task og_name = os.path.basename(alignment_file).rsplit('.', 1)[0] + treefile = os.path.join(gene_trees_folder, og_name + '.treefile') + if os.path.exists(treefile) and os.path.getsize(treefile) > 0: + with open(treefile, 'r') as fh: + return fh.read().strip() try: iqtree_wrapper = Iqtree(alignment_file, datatype=DataType.PROTEIN) iqtree_wrapper.options = get_gene_tree_options() @@ -38,7 +42,6 @@ def _run_gene_tree(task): iqtree_wrapper.options.options['_extra'] = StringOption('', iqtree_extra, active=True) tree = iqtree_wrapper() if tree: - treefile = os.path.join(gene_trees_folder, og_name + '.treefile') with open(treefile, 'w') as fh: fh.write(tree.strip() + '\n') return tree From 0e1f04a303bd0ac1322ba3d9f0bb334430376edb Mon Sep 17 00:00:00 2001 From: Alex Raiyemo Date: Sat, 23 May 2026 13:14:33 -0400 Subject: [PATCH 11/14] Minor cleanup in --astral_args help text and docstring --- read2tree/CoalescentInference.py | 2 +- read2tree/main.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/read2tree/CoalescentInference.py b/read2tree/CoalescentInference.py index e16ae4ea..5f5d385f 100644 --- a/read2tree/CoalescentInference.py +++ b/read2tree/CoalescentInference.py @@ -221,7 +221,7 @@ def _infer_gene_trees(self, alignment_files): def _infer_species_tree(self, gene_tree_file): """ - Run ASTER 
(astral3) on the collected gene trees to estimate a coalescent species tree. + Run ASTER on the collected gene trees to estimate a coalescent species tree. :param gene_tree_file: path to file containing one gene tree (Newick) per line :return: coalescent species tree in Newick format diff --git a/read2tree/main.py b/read2tree/main.py index 311afb69..4be0fd6c 100644 --- a/read2tree/main.py +++ b/read2tree/main.py @@ -207,8 +207,8 @@ def parse_args(argv, exe_name, desc): arg_parser.add_argument('--astral_args', default=None, help='[Default is none] Extra flags appended verbatim to the ASTER ' - 'command in step 4astral. Use quotes for multiple flags ' - '(e.g. --astral_args "-C --root OUTGROUP"). ' + 'command in step 4astral. Use quotes for multiple flags. ' + 'See ASTER documentation for available options. ' 'Used by step 4astral.') arg_parser.add_argument('--step', default="all", From b42ae86c836f5c4bd6155e0f0804affb9bb1dfc3 Mon Sep 17 00:00:00 2001 From: Alex Raiyemo Date: Sat, 23 May 2026 13:17:00 -0400 Subject: [PATCH 12/14] Minor README edits --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 1d49ea71..dea3ab19 100644 --- a/README.md +++ b/README.md @@ -151,10 +151,10 @@ Step 4 does the following automatically: | `astral4` | Large datasets with substantial missing taxa, or when substitution-rate branch lengths on internal nodes are needed for downstream rate analyses. 
| ``` -# weighted ASTRAL — better accuracy when gene tree support is variable +# weighted ASTRAL - better accuracy when gene tree support is variable read2tree --step 4astral --standalone_path marker_genes --dna_reference dna_ref.fa --output_path output --threads 24 --astral_binary wastral -# ASTRAL-IV — better robustness under missing data +# ASTRAL-IV - better robustness under missing data read2tree --step 4astral --standalone_path marker_genes --dna_reference dna_ref.fa --output_path output --threads 24 --astral_binary astral4 ``` From 6ce94b1ca418619e5ae071395cadad685780b994 Mon Sep 17 00:00:00 2001 From: Alex Raiyemo Date: Sun, 31 May 2026 19:52:23 -0400 Subject: [PATCH 13/14] Add --no_fast flag; fix thread flag for IQ-TREE 2/3 compatibility Replace -nt with -T in get_gene_tree_options() as IQ-TREE 2/3 uses -T for thread specification. Add --no_fast to allow disabling the -fast flag for a full ML search, which is required when combining with bootstrap options such as -B via --iqtree_args. --- README.md | 6 +++++- read2tree/CoalescentInference.py | 7 +++++-- read2tree/main.py | 6 ++++++ read2tree/wrappers/treebuilders/iqtree.py | 3 ++- 4 files changed, 18 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index dea3ab19..6c134c9a 100644 --- a/README.md +++ b/README.md @@ -170,8 +170,9 @@ read2tree --step 4astral --standalone_path marker_genes --dna_reference dna_ref. | Argument | Default | Purpose | |---|---|---| | `--iqtree_model` | `LG+F+G` | Substitution model for per-gene IQ-TREE runs (e.g. `WAG+G`, `LG+G`, `TEST` for ModelFinder) | -| `--iqtree_args` | none | Extra flags appended verbatim to every per-gene IQ-TREE call (e.g. `"-bb 1000 -redo"`) | +| `--iqtree_args` | none | Extra flags appended verbatim to every per-gene IQ-TREE call (e.g. 
`"-B 1000"`) | | `--astral_args` | none | Extra flags appended verbatim to the ASTER call; see the ASTER documentation for available options | +| `--no_fast` | off | Disable the `-fast` flag to run a full ML tree search per gene. Required when using bootstrap via `--iqtree_args` (e.g. `--iqtree_args "-B 1000"`), as `-fast` and bootstrap are incompatible in IQ-TREE | ``` # Use WAG+G model instead of LG+F+G @@ -179,6 +180,9 @@ read2tree --step 4astral --standalone_path marker_genes --dna_reference dna_ref. # Run ModelFinder per gene (much slower, but selects best-fit model for each OG) read2tree --step 4astral --standalone_path marker_genes --dna_reference dna_ref.fa --output_path output --threads 24 --iqtree_model TEST + +# Full ML search with ultrafast bootstrap (--no_fast required when using -B) +read2tree --step 4astral --standalone_path marker_genes --dna_reference dna_ref.fa --output_path output --threads 24 --no_fast --iqtree_args "-B 1000" ``` Key output files written to `output/`: diff --git a/read2tree/CoalescentInference.py b/read2tree/CoalescentInference.py index 5f5d385f..47a5f531 100644 --- a/read2tree/CoalescentInference.py +++ b/read2tree/CoalescentInference.py @@ -27,7 +27,7 @@ def _run_gene_tree(task): Runs IQ-TREE on a single alignment file and writes the treefile. Returns the Newick tree string, or None on failure. 
""" - alignment_file, gene_trees_folder, iqtree_model, iqtree_extra = task + alignment_file, gene_trees_folder, iqtree_model, iqtree_extra, no_fast = task og_name = os.path.basename(alignment_file).rsplit('.', 1)[0] treefile = os.path.join(gene_trees_folder, og_name + '.treefile') if os.path.exists(treefile) and os.path.getsize(treefile) > 0: @@ -38,6 +38,8 @@ def _run_gene_tree(task): iqtree_wrapper.options = get_gene_tree_options() if iqtree_model: iqtree_wrapper.options.options['-m'].set_value(iqtree_model) + if no_fast: + iqtree_wrapper.options.options['-fast'].active = False if iqtree_extra: iqtree_wrapper.options.options['_extra'] = StringOption('', iqtree_extra, active=True) tree = iqtree_wrapper() @@ -199,7 +201,8 @@ def _infer_gene_trees(self, alignment_files): """ iqtree_model = getattr(self.args, 'iqtree_model', None) iqtree_extra = getattr(self.args, 'iqtree_args', None) - tasks = [(f, self._gene_trees_folder, iqtree_model, iqtree_extra) for f in alignment_files] + no_fast = getattr(self.args, 'no_fast', False) + tasks = [(f, self._gene_trees_folder, iqtree_model, iqtree_extra, no_fast) for f in alignment_files] logger.info('{}: Running per-gene IQ-TREE on {} alignments with {} workers.'.format( self._species_name, len(tasks), self.args.threads)) diff --git a/read2tree/main.py b/read2tree/main.py index 4be0fd6c..10290a16 100644 --- a/read2tree/main.py +++ b/read2tree/main.py @@ -211,6 +211,12 @@ def parse_args(argv, exe_name, desc): 'See ASTER documentation for available options. ' 'Used by step 4astral.') + arg_parser.add_argument('--no_fast', action='store_true', + help='[Default is false] Disable the -fast flag for per-gene ' + 'IQ-TREE runs in step 4astral, enabling a full ML tree ' + 'search. Required when using bootstrap via --iqtree_args ' + '(e.g. --iqtree_args "-B 1000"). 
Used by step 4astral.') + arg_parser.add_argument('--step', default="all", help='[Default is all 1marker 2map 3combine 4astral') diff --git a/read2tree/wrappers/treebuilders/iqtree.py b/read2tree/wrappers/treebuilders/iqtree.py index 189364e2..30310a68 100644 --- a/read2tree/wrappers/treebuilders/iqtree.py +++ b/read2tree/wrappers/treebuilders/iqtree.py @@ -152,12 +152,13 @@ def get_default_options(): def get_gene_tree_options(): """Options for per-gene tree inference in the coalescent pipeline (step 4astral). + Uses -T (IQ-TREE 2/3 thread flag) rather than the legacy -nt flag. --abayes computes aBayes posterior branch supports alongside SH-aLRT values. Both are annotated as node labels on the output tree, which wASTRAL uses for quartet weighting when --astral_binary wastral is selected. """ return OptionSet([ - IntegerOption('-nt', 1, active=True), + IntegerOption('-T', 1, active=True), StringOption('-m', 'LG+F+G', active=True), StringOption('-st', 'AA', active=True), StringOption('-mem', '4G', active=True), From 6eb9cb254d83ab20c19083e0b8c7092240ef961d Mon Sep 17 00:00:00 2001 From: Alex Raiyemo Date: Sun, 31 May 2026 20:13:10 -0400 Subject: [PATCH 14/14] Add --dna flag for DNA-based coalescent inference in step 4astral Adds --dna to switch step 4astral from amino acid to DNA alignments (06_align_merge_dna). Default IQ-TREE model is GTR+G for DNA runs. All output folders and files now carry an _aa or _dna suffix to distinguish runs: 08_gene_trees_aa/dna, gene_trees_merge_aa/dna.nwk, astral_tree_merge_aa/dna.nwk. --- README.md | 16 +++++++++----- read2tree/CoalescentInference.py | 27 ++++++++++++++--------- read2tree/main.py | 10 ++++++++- read2tree/wrappers/treebuilders/iqtree.py | 20 ++++++++++++++++- 4 files changed, 54 insertions(+), 19 deletions(-) diff --git a/README.md b/README.md index 6c134c9a..95d2e3e2 100644 --- a/README.md +++ b/README.md @@ -173,6 +173,7 @@ read2tree --step 4astral --standalone_path marker_genes --dna_reference dna_ref. 
| `--iqtree_args` | none | Extra flags appended verbatim to every per-gene IQ-TREE call (e.g. `"-B 1000"`) | | `--astral_args` | none | Extra flags appended verbatim to the ASTER call; see the ASTER documentation for available options | | `--no_fast` | off | Disable the `-fast` flag to run a full ML tree search per gene. Required when using bootstrap via `--iqtree_args` (e.g. `--iqtree_args "-B 1000"`), as `-fast` and bootstrap are incompatible in IQ-TREE | +| `--dna` | off | Use DNA alignments from `06_align_merge_dna` instead of amino acid alignments. Recommended for closely related species. Default model switches to `GTR+G` | ``` # Use WAG+G model instead of LG+F+G @@ -183,14 +184,17 @@ read2tree --step 4astral --standalone_path marker_genes --dna_reference dna_ref. # Full ML search with ultrafast bootstrap (--no_fast required when using -B) read2tree --step 4astral --standalone_path marker_genes --dna_reference dna_ref.fa --output_path output --threads 24 --no_fast --iqtree_args "-B 1000" + +# DNA-based coalescent tree (recommended for closely related species) +read2tree --step 4astral --standalone_path marker_genes --dna_reference dna_ref.fa --output_path output --threads 24 --dna ``` -Key output files written to `output/`: -- `07_astral_filtered_aa/` - per-OG FASTA alignments that passed filtering -- `07_astral_trimmed_aa/` - ClipKIT-trimmed alignments (only when `--trim` is used) -- `08_gene_trees/` - individual IQ-TREE gene tree files -- `gene_trees_merge.nwk` - all gene trees concatenated into one file (input to ASTER) -- `astral_tree_merge.nwk` - the final coalescent species tree in Newick format +Key output files written to `output/` (suffix is `aa` by default, `dna` when `--dna` is used): +- `07_astral_filtered_aa/` or `07_astral_filtered_dna/` - per-OG FASTA alignments that passed filtering +- `07_astral_trimmed_aa/` or `07_astral_trimmed_dna/` - ClipKIT-trimmed alignments (only when `--trim` is used) +- `08_gene_trees_aa/` or `08_gene_trees_dna/` - 
individual IQ-TREE gene tree files +- `gene_trees_merge_aa.nwk` or `gene_trees_merge_dna.nwk` - all gene trees concatenated into one file (input to ASTER) +- `astral_tree_merge_aa.nwk` or `astral_tree_merge_dna.nwk` - the final coalescent species tree in Newick format ### bootstraping diff --git a/read2tree/CoalescentInference.py b/read2tree/CoalescentInference.py index 47a5f531..f954a088 100644 --- a/read2tree/CoalescentInference.py +++ b/read2tree/CoalescentInference.py @@ -12,7 +12,7 @@ from multiprocessing import Pool from Bio import SeqIO, AlignIO -from read2tree.wrappers.treebuilders.iqtree import Iqtree, get_gene_tree_options +from read2tree.wrappers.treebuilders.iqtree import Iqtree, get_gene_tree_options, get_gene_tree_dna_options from read2tree.wrappers.treebuilders.base_treebuilder import DataType from read2tree.wrappers.treebuilders.aster import Aster from read2tree.wrappers.options import StringOption @@ -27,15 +27,19 @@ def _run_gene_tree(task): Runs IQ-TREE on a single alignment file and writes the treefile. Returns the Newick tree string, or None on failure. 
""" - alignment_file, gene_trees_folder, iqtree_model, iqtree_extra, no_fast = task + alignment_file, gene_trees_folder, iqtree_model, iqtree_extra, no_fast, seq_type = task og_name = os.path.basename(alignment_file).rsplit('.', 1)[0] treefile = os.path.join(gene_trees_folder, og_name + '.treefile') if os.path.exists(treefile) and os.path.getsize(treefile) > 0: with open(treefile, 'r') as fh: return fh.read().strip() try: - iqtree_wrapper = Iqtree(alignment_file, datatype=DataType.PROTEIN) - iqtree_wrapper.options = get_gene_tree_options() + if seq_type == 'dna': + iqtree_wrapper = Iqtree(alignment_file, datatype=DataType.DNA) + iqtree_wrapper.options = get_gene_tree_dna_options() + else: + iqtree_wrapper = Iqtree(alignment_file, datatype=DataType.PROTEIN) + iqtree_wrapper.options = get_gene_tree_options() if iqtree_model: iqtree_wrapper.options.options['-m'].set_value(iqtree_model) if no_fast: @@ -69,9 +73,10 @@ class CoalescentInference(object): def __init__(self, args): self.args = args self._species_name = 'merge' - self._filtered_folder = self._make_output_path('07_astral_filtered_aa') - self._trimmed_folder = self._make_output_path('07_astral_trimmed_aa') if args.trim else None - self._gene_trees_folder = self._make_output_path('08_gene_trees') + self._seq_type = 'dna' if getattr(args, 'dna', False) else 'aa' + self._filtered_folder = self._make_output_path('07_astral_filtered_' + self._seq_type) + self._trimmed_folder = self._make_output_path('07_astral_trimmed_' + self._seq_type) if args.trim else None + self._gene_trees_folder = self._make_output_path('08_gene_trees_' + self._seq_type) self.elapsed_time = 0 self.tree = None self._run() @@ -116,7 +121,7 @@ def _filter_alignments(self): :return: list of paths to filtered FASTA files """ - input_folder = os.path.join(self.args.output_path, '06_align_merge_aa') + input_folder = os.path.join(self.args.output_path, '06_align_merge_' + self._seq_type) log_path = os.path.join(self._filtered_folder, 
'filtering_summary.txt') filtered_files = [] total = 0 @@ -202,7 +207,7 @@ def _infer_gene_trees(self, alignment_files): iqtree_model = getattr(self.args, 'iqtree_model', None) iqtree_extra = getattr(self.args, 'iqtree_args', None) no_fast = getattr(self.args, 'no_fast', False) - tasks = [(f, self._gene_trees_folder, iqtree_model, iqtree_extra, no_fast) for f in alignment_files] + tasks = [(f, self._gene_trees_folder, iqtree_model, iqtree_extra, no_fast, self._seq_type) for f in alignment_files] logger.info('{}: Running per-gene IQ-TREE on {} alignments with {} workers.'.format( self._species_name, len(tasks), self.args.threads)) @@ -213,7 +218,7 @@ def _infer_gene_trees(self, alignment_files): trees = [t for t in results if t is not None] gene_tree_file = os.path.join(self.args.output_path, - 'gene_trees_' + self._species_name + '.nwk') + 'gene_trees_{}_{}.nwk'.format(self._species_name, self._seq_type)) with open(gene_tree_file, 'w') as fh: for tree in trees: fh.write(tree.strip() + '\n') @@ -230,7 +235,7 @@ def _infer_species_tree(self, gene_tree_file): :return: coalescent species tree in Newick format """ species_tree_file = os.path.join(self.args.output_path, - 'astral_tree_' + self._species_name + '.nwk') + 'astral_tree_{}_{}.nwk'.format(self._species_name, self._seq_type)) aster_wrapper = Aster(gene_tree_file, species_tree_file, binary=getattr(self.args, 'astral_binary', None)) aster_wrapper.options.options['-t'].set_value(self.args.threads) diff --git a/read2tree/main.py b/read2tree/main.py index 10290a16..ef1196a7 100644 --- a/read2tree/main.py +++ b/read2tree/main.py @@ -217,6 +217,13 @@ def parse_args(argv, exe_name, desc): 'search. Required when using bootstrap via --iqtree_args ' '(e.g. --iqtree_args "-B 1000"). Used by step 4astral.') + arg_parser.add_argument('--dna', action='store_true', + help='[Default is false] Use DNA alignments from ' + '06_align_merge_dna instead of amino acid alignments ' + 'for step 4astral. 
Recommended for closely related ' + 'species. Default IQ-TREE model switches to GTR+G. ' + 'Used by step 4astral.') + arg_parser.add_argument('--step', default="all", help='[Default is all 1marker 2map 3combine 4astral') @@ -491,7 +498,8 @@ def main(argv, exe_name, desc=''): logger.info(' ------- Read2Tree finished -*- -------') if args.step == "4astral": - input_align_folder = os.path.join(args.output_path, '06_align_merge_aa') + _seq_type = 'dna' if args.dna else 'aa' + input_align_folder = os.path.join(args.output_path, '06_align_merge_' + _seq_type) if not os.path.exists(input_align_folder): logger.error( 'Step 4astral requires completed step 3combine output. ' diff --git a/read2tree/wrappers/treebuilders/iqtree.py b/read2tree/wrappers/treebuilders/iqtree.py index 30310a68..3d824573 100644 --- a/read2tree/wrappers/treebuilders/iqtree.py +++ b/read2tree/wrappers/treebuilders/iqtree.py @@ -150,7 +150,7 @@ def get_default_options(): def get_gene_tree_options(): - """Options for per-gene tree inference in the coalescent pipeline (step 4astral). + """Options for per-gene protein tree inference in the coalescent pipeline (step 4astral). Uses -T (IQ-TREE 2/3 thread flag) rather than the legacy -nt flag. --abayes computes aBayes posterior branch supports alongside SH-aLRT values. @@ -166,3 +166,21 @@ def get_gene_tree_options(): FlagOption('-fast', True, active=True), FlagOption('--abayes', True, active=True), ]) + + +def get_gene_tree_dna_options(): + """Options for per-gene DNA tree inference in the coalescent pipeline (step 4astral). + + Uses GTR+G as the default model for nucleotide alignments. Recommended for + closely related species where DNA-level variation is more informative than + amino acid sequences. 
+ """ + return OptionSet([ + IntegerOption('-T', 1, active=True), + StringOption('-m', 'GTR+G', active=True), + StringOption('-st', 'DNA', active=True), + StringOption('-mem', '4G', active=True), + IntegerOption('-alrt', 1000, active=True), + FlagOption('-fast', True, active=True), + FlagOption('--abayes', True, active=True), + ])