From c6962fb4f07b878f7d2ae6835d3f82f200936238 Mon Sep 17 00:00:00 2001 From: Zubiriguitar <64590799+Zubiriguitar@users.noreply.github.com> Date: Mon, 11 Nov 2024 00:43:42 +0700 Subject: [PATCH 01/15] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 5d9a5b8..0511799 100644 --- a/README.md +++ b/README.md @@ -21,7 +21,7 @@ Pipeline to construct species phylogenies using [BUSCO](https://busco.ezlab.org/ To use this workflow, you can either download and extract the [latest release](https://github.com/tomarovsky/BuscoClade/releases) or clone the repository: ``` -git clone https://github.com/tomarovsky/BuscoClade.git +git clone https://github.com/Zubiriguitar/BuscoClade.git ``` ### Step 2. Add species genomes From 1ce26beda02c54d6260856d86f61cd4c5b57aef2 Mon Sep 17 00:00:00 2001 From: Zubiriguitar Date: Mon, 17 Mar 2025 00:19:15 +0700 Subject: [PATCH 02/15] raxml addition to the pipeline --- Snakefile | 9 ++++++ config/default.yaml | 35 +++++++++++++-------- workflow/envs/conda.yaml | 1 + workflow/rules/raxml.smk | 54 ++++++++++++++++++++++++++++++++ workflow/rules/visualization.smk | 29 +++++++++++++++++ 5 files changed, 115 insertions(+), 13 deletions(-) create mode 100644 workflow/rules/raxml.smk diff --git a/Snakefile b/Snakefile index 57fa010..eedb0c1 100644 --- a/Snakefile +++ b/Snakefile @@ -26,6 +26,7 @@ mrbayes_dir_path = output_dir_path / config["mrbayes_dir"] astral_dir_path = output_dir_path / config["astral_dir"] rapidnj_dir_path = output_dir_path / config["rapidnj_dir"] phylip_dir_path = output_dir_path / config["phylip_dir"] +raxml_dir_path = output_dir_path / config["raxml_dir"] if "species_list" not in config: config["species_list"] = [f.stem for f in genome_dir_path.iterdir() if f.is_file() and f.suffix == ".fasta"] @@ -44,6 +45,7 @@ astral_filtered_trees = "{0}.iqtree_per_fna.concat.{1}.treefile".format(config[" astral_tree = "{0}.{1}.fna.astral.treefile".format(config["alignment_file_prefix"], config["nodes_filtrataion_by_support"]) rapidnj_tree = "{}.fna.rapidnj.treefile".format(config["alignment_file_prefix"]) phylip_tree = "{}.fna.phy.namefix.treefile".format(config["alignment_file_prefix"]) +raxml_tree = "{}.fna.raxml.treefile".format(config["alignment_file_prefix"]) # ---- Necessary functions ---- @@ -109,6 +111,12 @@ if "dna_alignment" in config: if "draw_phylotrees" in config: if config["draw_phylotrees"]: output_files.append(phylip_dir_path / f"{fasta_dna_filename}.only_tree.svg") + if "raxml" in config: + if config["raxml"]: + output_files.append(raxml_dir_path / raxml_tree) + if "draw_phylotrees" in config: + if config["draw_phylotrees"]: + output_files.append(raxml_dir_path / f"{fasta_dna_filename}.only_tree.svg") #todo if "protein_alignment" in config: if config["protein_alignment"]: output_files.append(lambda w: expand_faa_from_merged_sequences(w, alignments_dir_path / "faa" / "{N}.faa")) @@ -155,3 +163,4 @@ include: "workflow/rules/visualization.smk" include: "workflow/rules/astral.smk" include: "workflow/rules/rapidnj.smk" include: "workflow/rules/phylip.smk" +include: "workflow/rules/raxml.smk" diff --git a/config/default.yaml b/config/default.yaml index 314544f..23e0678 100644 --- a/config/default.yaml +++ b/config/default.yaml @@ -7,7 +7,7 @@ ete3_conda_config: "workflow/envs/ete3_conda.yaml" # ---- Assembly statistics ---- quastcore: True # ---- Alignment ---- -dna_alignment: 'prank' # 'prank' or 'mafft' +dna_alignment: 'mafft' # 'prank' or 'mafft' protein_alignment: False # 'prank' 
or 'mafft'

 # ---- Filtration ----
 dna_filtration: 'gblocks' # 'gblocks' or 'trimal'
@@ -16,10 +16,11 @@ protein_filtration: False # 'gblocks' or 'trimal'
 iqtree_dna: True
 iqtree_protein: True
 mrbayes_dna: False
-mrbayes_protein: False
+mrbayes_protein: True
 astral: True # use iqtree for each dna alignment
 rapidnj: True
 phylip: True
+raxml: True

 # ---- Phylogenetic tree visualization ----
 draw_phylotrees: True
@@ -68,6 +69,9 @@ rapidnj_params: "-b 1000"
 # PHYLIP
 phylip_dnadist_params: "D\n" # use 'D\n' to set model to 'Kimura 2-parameter'. F84 model (default) if "".
 phylip_neighbor_params: "" # use 'N\n' to set UPGMA. NJ (default) if "".
+
+# RAXML
+raxml_params: "--model GTR+G --bs-trees 100"

 # ---- Phylogenetic tree visualization ----
 tree_visualization_params: "" # "--outgroup Homo_sapiens" or "--outgroup species_1,species_2"
@@ -90,6 +94,7 @@ mrbayes_dir: "phylogeny/mrbayes"
 astral_dir: "phylogeny/astral"
 rapidnj_dir: "phylogeny/rapidnj"
 phylip_dir: "phylogeny/phylip"
+raxml_dir: "phylogeny/raxml"
 log_dir: "logs"
 cluster_log_dir: "cluster_logs"
 benchmark_dir: "benchmarks"
@@ -106,21 +111,23 @@ mrbayes_queue: "main"
 astral_queue: "main"
 rapidnj_queue: "main"
 phylip_queue: "main"
+raxml_queue: "main"
 processing_queue: "main"

 # ---- Tool threads ----
-busco_threads: 8
-mafft_threads: 1
+busco_threads: 20 # for mammals, ~30-60 if resources allow (not required)
+mafft_threads: 2
 prank_threads: 1
-gblocks_threads: 1
-trimal_threads: 1
-iqtree_threads: 8
-mrbayes_threads: 8
-iqtree_per_fna_threads: 4
-astral_threads: 4
-rapidnj_threads: 4
-phylip_threads: 1
-processing_threads: 1
+gblocks_threads: 2
+trimal_threads: 2
+iqtree_threads: 16
+mrbayes_threads: 16
+iqtree_per_fna_threads: 8
+astral_threads: 8
+rapidnj_threads: 8
+phylip_threads: 2
+raxml_threads: 16
+processing_threads: 2

 # ---- Tool memory ----
 busco_mem_mb: 10000
@@ -134,6 +141,7 @@ iqtree_per_fna_mem_mb: 10000
 astral_mem_mb: 10000
 rapidnj_mem_mb: 8000
 phylip_mem_mb: 4000
+raxml_mem_mb: 10000
 processing_mem_mb: 2000

 # ---- Tool time ----
@@ -148,6 +156,7 @@ iqtree_per_fna_time: "10:00:00"
 astral_time: "5:00:00"
 rapidnj_time: "5:00:00"
 phylip_time: "5:00:00"
+raxml_time: "5:00:00"
 processing_time: "5:00:00"

diff --git a/workflow/envs/conda.yaml b/workflow/envs/conda.yaml
index 886e3fb..4ffc8e7 100644
--- a/workflow/envs/conda.yaml
+++ b/workflow/envs/conda.yaml
@@ -15,6 +15,7 @@ dependencies:
   - newick_utils
   - rapidnj
   - phylip
+  - raxml-ng
 #  - mrbayes
   - pip:
       - "--editable=git+https://github.com/tomarovsky/Biocrutch.git#egg=Biocrutch"
diff --git a/workflow/rules/raxml.smk b/workflow/rules/raxml.smk
new file mode 100644
index 0000000..460a5b4
--- /dev/null
+++ b/workflow/rules/raxml.smk
@@ -0,0 +1,54 @@
+if config["raxml"]:
+
+    rule raxml_tree:
+        input:
+            concat_alignments_dir_path / fasta_dna_filename,
+        output:
+            raxml_dir_path / f"{fasta_dna_filename}.raxml.bestTree",
+            raxml_dir_path / f"{fasta_dna_filename}.raxml.log",
+            raxml_dir_path / f"{fasta_dna_filename}.raxml.rba",
+        params:
+            options=config["raxml_params"],
+            outdir=raxml_dir_path,
+            prefix=fasta_dna_filename,
+        log:
+            std=log_dir_path / "raxml.log",
+            cluster_log=cluster_log_dir_path / "raxml.cluster.log",
+            cluster_err=cluster_log_dir_path / "raxml.cluster.err",
+        benchmark:
+            benchmark_dir_path / "raxml.benchmark.txt",
+        conda:
+            "../../%s" % config["conda_config"],
+        resources:
+            queue=config["raxml_queue"],
+            cpus=config["raxml_threads"],
+            time=config["raxml_time"],
+            mem_mb=config["raxml_mem_mb"],
+        threads: config["raxml_threads"],
+        shell:
+            "mkdir -p {params.outdir}; raxml-ng --all --msa {input} --prefix {params.outdir}/{params.prefix} "
+            "{params.options} 1> {log.std} 2>&1; "
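+
+    # With the default raxml_params ("--model GTR+G --bs-trees 100") the shell
+    # template above expands to roughly the following (concrete paths depend on
+    # the configured output_dir and alignment_file_prefix; shown here only as
+    # an illustration):
+    #   mkdir -p <output_dir>/phylogeny/raxml
+    #   raxml-ng --all --msa concat_alignment.fna \
+    #       --prefix <output_dir>/phylogeny/raxml/concat_alignment.fna \
+    #       --model GTR+G --bs-trees 100
+    # "raxml-ng --all" runs the ML search plus bootstrapping and writes its
+    # result files (.raxml.bestTree, .raxml.log, .raxml.rba, ...) under that
+    # prefix, which is what the declared outputs point to.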
+
+    rule raxml_tree_rename:
+        input:
+            raxml_dir_path / f"{fasta_dna_filename}.raxml.bestTree",
+        output:
+            raxml_dir_path / f"{fasta_dna_filename}.raxml.treefile",
+        params:
+            outdir=raxml_dir_path,
+        log:
+            std=log_dir_path / "raxml_rename.log",
+            cluster_log=cluster_log_dir_path / "raxml_rename.cluster.log",
+            cluster_err=cluster_log_dir_path / "raxml_rename.cluster.err",
+        benchmark:
+            benchmark_dir_path / "raxml_rename.benchmark.txt",
+        conda:
+            "../../%s" % config["conda_config"],
+        resources:
+            queue=config["raxml_queue"],
+            cpus=config["raxml_threads"],
+            time=config["raxml_time"],
+            mem_mb=config["raxml_mem_mb"],
+        threads: config["raxml_threads"],
+        shell:
+            "mv {input} {output} > {log.std} 2>&1; "
\ No newline at end of file
diff --git a/workflow/rules/visualization.smk b/workflow/rules/visualization.smk
index 3150099..2360e65 100644
--- a/workflow/rules/visualization.smk
+++ b/workflow/rules/visualization.smk
@@ -150,6 +150,35 @@ if config["draw_phylotrees"]:
             " QT_QPA_PLATFORM=offscreen workflow/scripts/draw_phylotrees.py -i {input} "
             " -o {params.prefix} {params.options} 1> {log.std} 2>&1; "

+if config["draw_phylotrees"]:
+
+    rule raxml_tree_visualization:
+        input:
+            raxml_dir_path / raxml_tree,
+        output:
+            raxml_dir_path / f"{fasta_dna_filename}.length_and_support_tree.svg",
+            raxml_dir_path / f"{fasta_dna_filename}.only_support_tree.svg",
+            raxml_dir_path / f"{fasta_dna_filename}.only_tree.svg",
+        params:
+            prefix=raxml_dir_path / f"{fasta_dna_filename}",
+            options=config["tree_visualization_params"],
+        log:
+            std=log_dir_path / "raxml_tree_visualization.log",
+            cluster_log=cluster_log_dir_path / "raxml_tree_visualization.cluster.log",
+            cluster_err=cluster_log_dir_path / "raxml_tree_visualization.cluster.err",
+        benchmark:
+            benchmark_dir_path / "raxml_tree_visualization.benchmark.txt",
+        conda:
+            "../../%s" % config["ete3_conda_config"],
+        resources:
+            queue=config["processing_queue"],
+            cpus=config["processing_threads"],
+            time=config["processing_time"],
+            mem_mb=config["processing_mem_mb"],
+        threads: config["processing_threads"],
+        shell:
+            "QT_QPA_PLATFORM=offscreen workflow/scripts/draw_phylotrees.py -i {input} "
+            "-o {params.prefix} {params.options} 1> {log.std} 2>&1; "

 rule species_ids_plot:
     input:

From 299ec8ad52e7574ae990d7177a4bc58120bc2c92 Mon Sep 17 00:00:00 2001
From: Zubiriguitar
Date: Fri, 21 Mar 2025 00:31:26 +0700
Subject: [PATCH 03/15] Tree legend + outgroup (0..n species)

---
 Snakefile                           | 12 ++--
 workflow/rules/visualization.smk    | 35 ++++++-----
 workflow/scripts/draw_phylotrees.py | 92 +++++++++++++++++++----------
 3 files changed, 87 insertions(+), 52 deletions(-)

diff --git a/Snakefile b/Snakefile
index eedb0c1..1c76119 100644
--- a/Snakefile
+++ b/Snakefile
@@ -90,33 +90,33 @@ if "dna_alignment" in config:
             output_files.append(iqtree_dir_path / "fna" / f"{fasta_dna_filename}.treefile")
             if "draw_phylotrees" in config:
                 if config["draw_phylotrees"]:
-                    output_files.append(iqtree_dir_path / "fna" / f"{fasta_dna_filename}.length_and_support_tree.svg")
+                    output_files.append(iqtree_dir_path / "fna" / f"{fasta_dna_filename}.length_and_support_tree.png")
 if "astral" in config:
     if config["astral"]:
         output_files.append(astral_dir_path / astral_tree)
         if "draw_phylotrees" in config:
             if config["draw_phylotrees"]:
-                output_files.append(astral_dir_path / f"{astral_tree}.svg")
+                output_files.append(astral_dir_path / f"{astral_tree}.png")
 if
"rapidnj" in config: if config["rapidnj"]: output_files.append(concat_alignments_dir_path / stockholm_dna_filename) output_files.append(rapidnj_dir_path / rapidnj_tree) if "draw_phylotrees" in config: if config["draw_phylotrees"]: - output_files.append(rapidnj_dir_path / f"{fasta_dna_filename}.only_tree.svg") + output_files.append(rapidnj_dir_path / f"{fasta_dna_filename}.only_tree.png") if "phylip" in config: if config["phylip"]: output_files.append(concat_alignments_dir_path / phylip_dna_filename) output_files.append(phylip_dir_path / phylip_tree) if "draw_phylotrees" in config: if config["draw_phylotrees"]: - output_files.append(phylip_dir_path / f"{fasta_dna_filename}.only_tree.svg") + output_files.append(phylip_dir_path / f"{fasta_dna_filename}.only_tree.png") if "raxml" in config: if config["raxml"]: output_files.append(raxml_dir_path / raxml_tree) if "draw_phylotrees" in config: if config["draw_phylotrees"]: - output_files.append(raxml_dir_path / f"{fasta_dna_filename}.only_tree.svg") #todo + output_files.append(raxml_dir_path / f"{fasta_dna_filename}.only_tree.png") #todo if "protein_alignment" in config: if config["protein_alignment"]: output_files.append(lambda w: expand_faa_from_merged_sequences(w, alignments_dir_path / "faa" / "{N}.faa")) @@ -132,7 +132,7 @@ if "protein_alignment" in config: output_files.append(iqtree_dir_path / "faa" / f"{fasta_protein_filename}.treefile") if "draw_phylotrees" in config: if config["draw_phylotrees"]: - output_files.append(iqtree_dir_path / "faa" / f"{fasta_protein_filename}.length_and_support_tree.svg") + output_files.append(iqtree_dir_path / "faa" / f"{fasta_protein_filename}.length_and_support_tree.png") if "mrbayes_dna" in config: if config["mrbayes_dna"]: # to-do: upgrade output_files.append(mrbayes_dir_path / "fna") diff --git a/workflow/rules/visualization.smk b/workflow/rules/visualization.smk index 2360e65..7951663 100644 --- a/workflow/rules/visualization.smk +++ b/workflow/rules/visualization.smk @@ -4,9 +4,10 @@ if config["draw_phylotrees"]: input: iqtree_dir_path / "fna" / f"{fasta_dna_filename}.treefile", output: - iqtree_dir_path / "fna" / f"{fasta_dna_filename}.length_and_support_tree.svg", - iqtree_dir_path / "fna" / f"{fasta_dna_filename}.only_support_tree.svg", - iqtree_dir_path / "fna" / f"{fasta_dna_filename}.only_tree.svg", + iqtree_dir_path / "fna" / f"{fasta_dna_filename}.length_and_support_tree.png", + iqtree_dir_path / "fna" / f"{fasta_dna_filename}.only_support_tree.png", + iqtree_dir_path / "fna" / f"{fasta_dna_filename}.only_tree.png", + iqtree_dir_path / "fna" / f"{fasta_dna_filename}.dots.png", params: prefix=iqtree_dir_path / "fna" / f"{fasta_dna_filename}", options=config["tree_visualization_params"], @@ -35,9 +36,10 @@ if config["draw_phylotrees"]: input: iqtree_dir_path / "faa" / f"{fasta_protein_filename}.treefile", output: - iqtree_dir_path / "faa" / f"{fasta_protein_filename}.length_and_support_tree.svg", - iqtree_dir_path / "faa" / f"{fasta_protein_filename}.only_support_tree.svg", - iqtree_dir_path / "faa" / f"{fasta_protein_filename}.only_tree.svg", + iqtree_dir_path / "faa" / f"{fasta_protein_filename}.length_and_support_tree.png", + iqtree_dir_path / "faa" / f"{fasta_protein_filename}.only_support_tree.png", + iqtree_dir_path / "faa" / f"{fasta_protein_filename}.only_tree.png", + iqtree_dir_path / "faa" / f"{fasta_protein_filename}.dots.png", params: prefix=iqtree_dir_path / "faa" / f"{fasta_protein_filename}", options=config["tree_visualization_params"], @@ -95,9 +97,10 @@ if 
config["draw_phylotrees"]: input: rapidnj_dir_path / rapidnj_tree, output: - rapidnj_dir_path / f"{fasta_dna_filename}.length_and_support_tree.svg", - rapidnj_dir_path / f"{fasta_dna_filename}.only_support_tree.svg", - rapidnj_dir_path / f"{fasta_dna_filename}.only_tree.svg", + rapidnj_dir_path / f"{fasta_dna_filename}.length_and_support_tree.png", + rapidnj_dir_path / f"{fasta_dna_filename}.only_support_tree.png", + rapidnj_dir_path / f"{fasta_dna_filename}.only_tree.png", + rapidnj_dir_path / f"{fasta_dna_filename}.dots.png", params: prefix=rapidnj_dir_path / f"{fasta_dna_filename}", options=config["tree_visualization_params"], @@ -126,9 +129,10 @@ if config["draw_phylotrees"]: input: phylip_dir_path / phylip_tree, output: - phylip_dir_path / f"{fasta_dna_filename}.length_and_support_tree.svg", - phylip_dir_path / f"{fasta_dna_filename}.only_support_tree.svg", - phylip_dir_path / f"{fasta_dna_filename}.only_tree.svg", + phylip_dir_path / f"{fasta_dna_filename}.length_and_support_tree.png", + phylip_dir_path / f"{fasta_dna_filename}.only_support_tree.png", + phylip_dir_path / f"{fasta_dna_filename}.only_tree.png", + phylip_dir_path / f"{fasta_dna_filename}.dots.png", params: prefix=phylip_dir_path / f"{fasta_dna_filename}", options=config["tree_visualization_params"], @@ -156,9 +160,10 @@ if config["draw_phylotrees"]: input: raxml_dir_path / raxml_tree, output: - raxml_dir_path / f"{fasta_dna_filename}.length_and_support_tree.svg", - raxml_dir_path / f"{fasta_dna_filename}.only_support_tree.svg", - raxml_dir_path / f"{fasta_dna_filename}.only_tree.svg", + raxml_dir_path / f"{fasta_dna_filename}.length_and_support_tree.png", + raxml_dir_path / f"{fasta_dna_filename}.only_support_tree.png", + raxml_dir_path / f"{fasta_dna_filename}.only_tree.png", + raxml_dir_path / f"{fasta_dna_filename}.dots.png", params: prefix=raxml_dir_path / f"{fasta_dna_filename}", options=config["tree_visualization_params"], diff --git a/workflow/scripts/draw_phylotrees.py b/workflow/scripts/draw_phylotrees.py index c875f99..0c180e3 100755 --- a/workflow/scripts/draw_phylotrees.py +++ b/workflow/scripts/draw_phylotrees.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 -__author__ = "tomarovsky" -from ete3 import Tree, faces, AttrFace, TreeStyle, NodeStyle +__author__ = 'tomarovsky' +from ete3 import TextFace, Tree, faces, AttrFace, TreeStyle, NodeStyle from argparse import ArgumentParser @@ -8,39 +8,60 @@ def mylayout(node): if node.is_leaf(): N = AttrFace("name", fgcolor="black", text_prefix=" ", fstyle="italic", fsize=12) faces.add_face_to_node(N, node, 0) + # F = AttrFace("support", fsize=8, fgcolor="green", text_prefix="") + # faces.add_face_to_node(F, node, 0) node.img_style["size"] = 1 node.img_style["shape"] = "circle" node.img_style["fgcolor"] = "Black" + else: + node.img_style["shape"] = "circle" + node.img_style["size"] = 5 + if node.support > 90: + node.img_style["fgcolor"] = "LimeGreen" + elif node.support > 70: + node.img_style["fgcolor"] = '#008cf0' + elif node.support > 50: + node.img_style["fgcolor"] = '#883ac2' + else: + node.img_style["fgcolor"] = '#ff0000' + + +def export_legend(palette): + legend = TreeStyle() + legend.show_leaf_name = False + legend.title.add_face(TextFace("LegendColors of \nnormalized values", fsize=10, bold=True), column=0) + + for label, color in palette.items(): + legend.legend.add_face(faces.CircleFace(6, color), column=0) + legend.legend.add_face(TextFace(f" {label}", fsize=8, fgcolor="black"), column=1) + + return legend def main(): - tree = 
"".join(open(args.input).readlines()).strip() - if "'" in tree: - tree = tree.replace("'", "") - t = Tree(tree) + palette = {"90": "LimeGreen", "70": '#008cf0', "50": '#883ac2', "<50": '#ff0000'} + + t = Tree(args.input) + for i in t.get_leaves(): + i.name = i.name.replace("_", " ").replace("GCA ", "GCA_").replace("GCF ", "GCF_") if args.outgroup: - outgroup = args.outgroup.replace(" ", "_") - if "," in args.outgroup: - try: - nodes_to_root = outgroup.split(",") - common_ancestor = t.get_common_ancestor(*nodes_to_root) - t.set_outgroup(common_ancestor) - except: - R = t.get_midpoint_outgroup() - t.set_outgroup(R) - nodes_to_root = outgroup.split(",") - common_ancestor = t.get_common_ancestor(*nodes_to_root) - t.set_outgroup(common_ancestor) + outgroup_species = args.outgroup.split(',') + if len(outgroup_species) == 1: + t.set_outgroup(outgroup_species[0]) else: - t.set_outgroup(outgroup) + common_ancestor = t.get_common_ancestor(outgroup_species) + t.set_outgroup(common_ancestor) else: t.unroot() - for i in t.get_leaves(): # 'Homo_sapiens' -> 'Homo sapiens' - i.name = i.name.replace("_", " ") + ts = TreeStyle() ts.mode = "r" ts.layout_fn = mylayout + ts.show_scale = True ts.show_leaf_name = False + ts.legend_position = 4 + ts.legend = export_legend(palette).legend + for n in t.traverse(): nstyle = NodeStyle() nstyle["fgcolor"] = "Blue" @@ -52,26 +73,35 @@ def main(): ts.show_branch_length = True ts.show_branch_support = True ts.branch_vertical_margin = -12 - t.render(f"{args.output}.length_and_support_tree.svg", w=500, units="px", tree_style=ts) + t.render(f"{args.output}.length_and_support_tree.png", w=3000, units="px", tree_style=ts) + ts.show_branch_length = False ts.show_branch_support = True ts.branch_vertical_margin = -4 - t.render(f"{args.output}.only_support_tree.svg", w=500, units="px", tree_style=ts) + t.render(f"{args.output}.only_support_tree.png", w=3000, units="px", tree_style=ts) + + ts.show_branch_length = False + ts.show_branch_support = False + ts.branch_vertical_margin = 0 + t.render(f"{args.output}.only_tree.png", w=6000, units="px", tree_style=ts) + ts.show_branch_length = False ts.show_branch_support = False - ts.branch_vertical_margin = -2 - t.render(f"{args.output}.only_tree.svg", w=500, units="px", tree_style=ts) + ts.branch_vertical_margin = 0 + t.render(f"{args.output}.dots.png", w=6000, units="px", tree_style=ts) + + if args.show: t.show(tree_style=ts) if __name__ == "__main__": parser = ArgumentParser(description="script to visualize phylogenetic trees using ete3 (required python < 3.10)") - group_required = parser.add_argument_group("Required options") - group_required.add_argument("-i", "--input", type=str, help="NEWICK file") - group_required.add_argument("-o", "--output", type=str, help="outfile prefix") - group_additional = parser.add_argument_group("Additional options") - group_additional.add_argument("-g", "--outgroup", type=str, default=False, help="outgroup species name (default = unrooted)") - group_additional.add_argument("--show", action="store_true", help="option to show tree using GUI") + group_required = parser.add_argument_group('Required options') + group_required.add_argument('-i', '--input', type=str, help="NEWICK file") + group_required.add_argument('-o', '--output', type=str, help="outfile name") + group_additional = parser.add_argument_group('Additional options') + group_additional.add_argument('-g', '--outgroup', type=str, default=False, help="outgroup species name (default = unrooted)") + group_additional.add_argument('--show', 
action="store_true", help="option to show tree using GUI") args = parser.parse_args() main() From 3de9bc46a8999632160691eb2a7822c2b91d961a Mon Sep 17 00:00:00 2001 From: Zubiriguitar Date: Fri, 21 Mar 2025 23:18:54 +0700 Subject: [PATCH 04/15] Tree support fix --- workflow/rules/raxml.smk | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/workflow/rules/raxml.smk b/workflow/rules/raxml.smk index 460a5b4..7bc39ec 100644 --- a/workflow/rules/raxml.smk +++ b/workflow/rules/raxml.smk @@ -7,6 +7,7 @@ if config["raxml"]: raxml_dir_path / f"{fasta_dna_filename}.raxml.bestTree", raxml_dir_path / f"{fasta_dna_filename}.raxml.log", raxml_dir_path / f"{fasta_dna_filename}.raxml.rba", + raxml_dir_path / f"{fasta_dna_filename}.raxml.support", params: options=config["raxml_params"], outdir=raxml_dir_path, @@ -31,7 +32,7 @@ if config["raxml"]: rule raxml_tree_rename: input: - raxml_dir_path / f"{fasta_dna_filename}.raxml.bestTree", + raxml_dir_path / f"{fasta_dna_filename}.raxml.support", output: raxml_dir_path / f"{fasta_dna_filename}.raxml.treefile", params: From d84187da3a5f661080bdfe0ed4509601c4bffd43 Mon Sep 17 00:00:00 2001 From: Zubiriguitar Date: Sun, 13 Apr 2025 23:19:27 +0700 Subject: [PATCH 05/15] extension fix --- workflow/rules/visualization.smk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflow/rules/visualization.smk b/workflow/rules/visualization.smk index 7951663..da75247 100644 --- a/workflow/rules/visualization.smk +++ b/workflow/rules/visualization.smk @@ -69,7 +69,7 @@ if config["draw_phylotrees"]: treefile=astral_dir_path / astral_tree, common_ids=rules.common_ids.output, output: - astral_dir_path / f"{astral_tree}.svg", + astral_dir_path / f"{astral_tree}.png", params: prefix=astral_dir_path / astral_tree, log: From f62117d5d9b93e0f67fef2d5b6f3b1c0fbffd9be Mon Sep 17 00:00:00 2001 From: Zubiriguitar Date: Wed, 16 Apr 2025 01:52:37 +0700 Subject: [PATCH 06/15] vcf to fasta reconstruction --- Snakefile | 138 +++-- profile/slurm/config.yaml | 2 +- workflow/envs/conda.yaml | 4 + workflow/rules/busco.smk | 4 +- workflow/rules/concat_alignments.smk | 4 +- workflow/rules/raxml.smk | 6 +- workflow/rules/vcf.smk | 225 ++++++++ workflow/rules/visualization.smk | 12 +- .../scripts/draw_phylotrees_from_astral.py | 2 +- .../draw_phylotrees_from_astral_extended.py | 2 +- workflow/scripts/vcf2phylip.py | 502 ++++++++++++++++++ 11 files changed, 851 insertions(+), 50 deletions(-) create mode 100644 workflow/rules/vcf.smk create mode 100644 workflow/scripts/vcf2phylip.py diff --git a/Snakefile b/Snakefile index 1c76119..5581495 100644 --- a/Snakefile +++ b/Snakefile @@ -1,7 +1,6 @@ from pathlib import Path import os - # ---- Setup config ---- configfile: "config/default.yaml" @@ -12,6 +11,8 @@ genome_dir_path = Path(config["genome_dir"]).resolve() log_dir_path = Path(config["log_dir"]) benchmark_dir_path = Path(config["benchmark_dir"]) output_dir_path = Path(config["output_dir"]) +vcf_dir_path = Path(config["vcf_dir"]).resolve() + quastcore_dir_path = output_dir_path / config["quastcore_dir"] busco_dir_path = output_dir_path / config["busco_dir"] @@ -28,13 +29,17 @@ rapidnj_dir_path = output_dir_path / config["rapidnj_dir"] phylip_dir_path = output_dir_path / config["phylip_dir"] raxml_dir_path = output_dir_path / config["raxml_dir"] -if "species_list" not in config: - config["species_list"] = [f.stem for f in genome_dir_path.iterdir() if f.is_file() and f.suffix == ".fasta"] +# if "species_list" not in config: +# config["species_list"] = [f.stem for f in 
genome_dir_path.iterdir() if f.is_file() and f.suffix == ".fasta"] + # ---- Setup filenames ---- -fasta_dna_filename = "{}.fna".format(config["alignment_file_prefix"]) +vcf_filename = "{}.{ext,vcf|vcf.gz}" +vcf_mask_filename = "{}.{ext,bed|bed.gz}" +vcf_reference_filename = "{}.{ext,fna|fasta}" +fasta_dna_filename = f"concat_alignment{config['altref_suffix']}.fna" fasta_protein_filename = "{}.faa".format(config["alignment_file_prefix"]) -nexus_dna_filename = "{}.fna.nex".format(config["alignment_file_prefix"]) +nexus_dna_filename = f"{fasta_dna_filename}.nex" nexus_protein_filename = "{}.faa.nex".format(config["alignment_file_prefix"]) phylip_dna_filename = "{}.fna.phy".format(config["alignment_file_prefix"]) phylip_protein_filename = "{}.faa.phy".format(config["alignment_file_prefix"]) @@ -45,7 +50,8 @@ astral_filtered_trees = "{0}.iqtree_per_fna.concat.{1}.treefile".format(config[" astral_tree = "{0}.{1}.fna.astral.treefile".format(config["alignment_file_prefix"], config["nodes_filtrataion_by_support"]) rapidnj_tree = "{}.fna.rapidnj.treefile".format(config["alignment_file_prefix"]) phylip_tree = "{}.fna.phy.namefix.treefile".format(config["alignment_file_prefix"]) -raxml_tree = "{}.fna.raxml.treefile".format(config["alignment_file_prefix"]) +raxml_tree = f"{fasta_dna_filename}.raxml.treefile" + # ---- Necessary functions ---- @@ -60,95 +66,157 @@ def expand_faa_from_merged_sequences(wildcards, template): N = glob_wildcards(os.path.join(checkpoint_output, "{N}.faa")).N return expand(str(template), N=N) +def get_samples_from_file(sample_list_path): + with open(sample_list_path) as f: + return [line.strip() for line in f if line.strip()] + + + +#---- VCF ---- +if config["vcf"]: + include: "workflow/rules/vcf.smk" + + checkpoint vcf_processed: + input: + expand( + genome_dir_path / "{sample}.AltRef.fasta", + sample=glob_wildcards(os.path.join(vcf_dir_path, "*", "{sample}_PASS.vcf.gz")).sample + ) + +def get_vcf_samples(vcf_dir): + vcf_files = list(Path(vcf_dir).rglob("*.vcf.gz")) + if not vcf_files: + raise ValueError(f"No VCF samples found in {vcf_dir} subdirectories.") + return sorted({f.stem.replace("_PASS", "") for f in vcf_files}) + +if "species_list" not in config: + if config.get("vcf", False): + config["species_list"] = [] + else: + config["species_list"] = [f.stem for f in genome_dir_path.glob("*.fasta") if f.is_file()] # +-----------------+ # | the "All" rule | # +-----------------+ output_files = [ - # ---- Busco ---- + # ---- VCF ---- + checkpoints.vcf_processed.get().output[0] if config["vcf"] else [], + *( + expand( + genome_dir_path / "{sample}.AltRef.fasta", + sample=config["species_list"] + ) + if config.get("vcf", False) + else [] + ), expand(busco_dir_path / "{species}/short_summary_{species}.txt", species=config["species_list"]), + # ---- Busco ---- + expand(busco_dir_path / + "{species}/short_summary_{species}.txt", species=config["species_list"]), # ---- Merge sequences with common ids ---- - lambda w: expand_fna_from_merged_sequences(w, merged_sequences_dir_path / "{N}.fna"), - lambda w: expand_faa_from_merged_sequences(w, merged_sequences_dir_path / "{N}.faa"), + lambda w: expand_fna_from_merged_sequences( + w, merged_sequences_dir_path / "{N}.fna"), + lambda w: expand_faa_from_merged_sequences( + w, merged_sequences_dir_path / "{N}.faa"), species_ids_dir_path / "unique_species_ids.png", busco_dir_path / "busco_summaries.svg", + + + ] +# if "vcf" in config: +# if config["vcf"]: +# expand("genomes/{sample}.AltRef.fasta", sample=get_sample_names()) if "quastcore" in 
config: if config["quastcore"]: output_files.append(quastcore_dir_path / "assembly_stats.csv") if "dna_alignment" in config: if config["dna_alignment"]: - output_files.append(lambda w: expand_fna_from_merged_sequences(w, alignments_dir_path / "fna" / "{N}.fna")) + output_files.append(lambda w: expand_fna_from_merged_sequences( + w, alignments_dir_path / "fna" / "{N}.fna")) if "dna_filtration" in config: if config["dna_filtration"]: - output_files.append(lambda w: expand_fna_from_merged_sequences(w, filtered_alignments_dir_path / "fna" / "{N}.fna")) - output_files.append(concat_alignments_dir_path / fasta_dna_filename) - output_files.append(concat_alignments_dir_path / nexus_dna_filename) + output_files.append(lambda w: expand_fna_from_merged_sequences( + w, filtered_alignments_dir_path / "fna" / "{N}.fna")) + output_files.append( + concat_alignments_dir_path / fasta_dna_filename) + output_files.append( + concat_alignments_dir_path / nexus_dna_filename) if "iqtree_dna" in config: if config["iqtree_dna"]: - output_files.append(iqtree_dir_path / "fna" / f"{fasta_dna_filename}.treefile") + output_files.append( + iqtree_dir_path / "fna" / f"{fasta_dna_filename}.treefile") if "draw_phylotrees" in config: if config["draw_phylotrees"]: - output_files.append(iqtree_dir_path / "fna" / f"{fasta_dna_filename}.length_and_support_tree.png") + output_files.append( + iqtree_dir_path / "fna" / f"{fasta_dna_filename}.length_and_support_tree.png") if "astral" in config: if config["astral"]: output_files.append(astral_dir_path / astral_tree) if "draw_phylotrees" in config: if config["draw_phylotrees"]: - output_files.append(astral_dir_path / f"{astral_tree}.png") + output_files.append( + astral_dir_path / f"{astral_tree}.png") if "rapidnj" in config: if config["rapidnj"]: - output_files.append(concat_alignments_dir_path / stockholm_dna_filename) + output_files.append( + concat_alignments_dir_path / stockholm_dna_filename) output_files.append(rapidnj_dir_path / rapidnj_tree) if "draw_phylotrees" in config: if config["draw_phylotrees"]: - output_files.append(rapidnj_dir_path / f"{fasta_dna_filename}.only_tree.png") + output_files.append( + rapidnj_dir_path / f"{fasta_dna_filename}.only_tree.png") if "phylip" in config: if config["phylip"]: - output_files.append(concat_alignments_dir_path / phylip_dna_filename) + output_files.append( + concat_alignments_dir_path / phylip_dna_filename) output_files.append(phylip_dir_path / phylip_tree) if "draw_phylotrees" in config: if config["draw_phylotrees"]: - output_files.append(phylip_dir_path / f"{fasta_dna_filename}.only_tree.png") + output_files.append( + phylip_dir_path / f"{fasta_dna_filename}.only_tree.png") if "raxml" in config: if config["raxml"]: output_files.append(raxml_dir_path / raxml_tree) - if "draw_phylotrees" in config: - if config["draw_phylotrees"]: - output_files.append(raxml_dir_path / f"{fasta_dna_filename}.only_tree.png") #todo +# if "draw_phylotrees" in config: +# if config["draw_phylotrees"]: +# output_files.append( +# raxml_dir_path / f"{fasta_dna_filename}.only_tree.png") if "protein_alignment" in config: if config["protein_alignment"]: - output_files.append(lambda w: expand_faa_from_merged_sequences(w, alignments_dir_path / "faa" / "{N}.faa")) + output_files.append(lambda w: expand_faa_from_merged_sequences( + w, alignments_dir_path / "faa" / "{N}.faa")) if "protein_filtration" in config: if config["protein_filtration"]: - output_files.append(lambda w: expand_fna_from_merged_sequences(w, filtered_alignments_dir_path / "faa" / "{N}.faa")) - 
output_files.append(concat_alignments_dir_path / fasta_protein_filename) + output_files.append(lambda w: expand_fna_from_merged_sequences( + w, filtered_alignments_dir_path / "faa" / "{N}.faa")) + output_files.append( + concat_alignments_dir_path / fasta_protein_filename) # output_files.append(concat_alignments_dir_path / nexus_protein_filename) # output_files.append(concat_alignments_dir_path / stockholm_protein_filename) # output_files.append(concat_alignments_dir_path / phylip_protein_filename) if "iqtree_protein" in config: if config["iqtree_protein"]: - output_files.append(iqtree_dir_path / "faa" / f"{fasta_protein_filename}.treefile") + output_files.append( + iqtree_dir_path / "faa" / f"{fasta_protein_filename}.treefile") if "draw_phylotrees" in config: if config["draw_phylotrees"]: - output_files.append(iqtree_dir_path / "faa" / f"{fasta_protein_filename}.length_and_support_tree.png") + output_files.append( + iqtree_dir_path / "faa" / f"{fasta_protein_filename}.length_and_support_tree.png") if "mrbayes_dna" in config: - if config["mrbayes_dna"]: # to-do: upgrade + if config["mrbayes_dna"]: output_files.append(mrbayes_dir_path / "fna") if "mrbayes_protein" in config: if config["mrbayes_protein"]: output_files.append(mrbayes_dir_path / "faa") -localrules: - all, - +localrules: all rule all: input: - output_files, - + output_files # ---- Load rules ---- include: "workflow/rules/quastcore.smk" @@ -161,6 +229,6 @@ include: "workflow/rules/iqtree.smk" include: "workflow/rules/mrbayes.smk" include: "workflow/rules/visualization.smk" include: "workflow/rules/astral.smk" +include: "workflow/rules/raxml.smk" include: "workflow/rules/rapidnj.smk" include: "workflow/rules/phylip.smk" -include: "workflow/rules/raxml.smk" diff --git a/profile/slurm/config.yaml b/profile/slurm/config.yaml index e320e35..801eaef 100644 --- a/profile/slurm/config.yaml +++ b/profile/slurm/config.yaml @@ -6,4 +6,4 @@ printshellcmds: True show-failed-logs: True rerun-incomplete: True cluster-cancel: "scancel" -cluster: "sbatch -p {resources.queue} -t {resources.time} --mem={resources.mem_mb} -c {resources.cpus} -o {log.cluster_log} -e {log.cluster_err}" +cluster: "sbatch -p {resources.queue} -t {resources.time} --mem={resources.mem_mb} -c {resources.cpus} -o {log.cluster_log} -e {log.cluster_err} --qos=high_all" diff --git a/workflow/envs/conda.yaml b/workflow/envs/conda.yaml index 4ffc8e7..7f01fe0 100644 --- a/workflow/envs/conda.yaml +++ b/workflow/envs/conda.yaml @@ -16,6 +16,10 @@ dependencies: - rapidnj - phylip - raxml-ng + - gatk4 + - bedtools + - bcftools + - samtools # - mrbayes - pip: - "--editable=git+https://github.com/tomarovsky/Biocrutch.git#egg=Biocrutch" diff --git a/workflow/rules/busco.smk b/workflow/rules/busco.smk index 9f96004..267e82f 100644 --- a/workflow/rules/busco.smk +++ b/workflow/rules/busco.smk @@ -2,7 +2,9 @@ if config["busco_gene_prediction_tool"] == "metaeuk": rule busco_metaeuk: input: - genome_dir_path / "{species}.fasta", + genome_dir_path / "{species}{suffix}.fasta".format( + species="{species}", + suffix=config["altref_suffix"]) output: busco_outdir=directory(busco_dir_path / "{species}"), single_copy_busco_sequences=directory(busco_dir_path / "{species}/busco_sequences/single_copy_busco_sequences"), diff --git a/workflow/rules/concat_alignments.smk b/workflow/rules/concat_alignments.smk index 04b7ccf..8d5312e 100644 --- a/workflow/rules/concat_alignments.smk +++ b/workflow/rules/concat_alignments.smk @@ -44,9 +44,9 @@ rule concat_fasta_protein: rule concat_nexus_dna: input: - 
rules.concat_fasta_dna.output, + rules.concat_fasta_dna.output, output: - concat_alignments_dir_path / nexus_dna_filename, + concat_alignments_dir_path / nexus_dna_filename params: type="DNA", block=config["mrbayes_block"], diff --git a/workflow/rules/raxml.smk b/workflow/rules/raxml.smk index 7bc39ec..95bfb83 100644 --- a/workflow/rules/raxml.smk +++ b/workflow/rules/raxml.smk @@ -2,13 +2,13 @@ if config["raxml"]: rule raxml_tree: input: - concat_alignments_dir_path / fasta_dna_filename, + concat_alignments_dir_path / fasta_dna_filename output: raxml_dir_path / f"{fasta_dna_filename}.raxml.bestTree", raxml_dir_path / f"{fasta_dna_filename}.raxml.log", raxml_dir_path / f"{fasta_dna_filename}.raxml.rba", - raxml_dir_path / f"{fasta_dna_filename}.raxml.support", - params: + raxml_dir_path / f"{fasta_dna_filename}.raxml.support" + params: options=config["raxml_params"], outdir=raxml_dir_path, prefix=fasta_dna_filename, diff --git a/workflow/rules/vcf.smk b/workflow/rules/vcf.smk new file mode 100644 index 0000000..6b0d3be --- /dev/null +++ b/workflow/rules/vcf.smk @@ -0,0 +1,225 @@ +if config["vcf"]: + + checkpoint vcf_discovery: + input: + vcf_dir_path + output: + touch(vcf_dir_path / "vcf_files.discovered") + shell: + "find {input} -name '*.vcf*' > {output}" + + if config["vcf_masking"]: + rule vcf_masking: + input: + vcf_file=lambda w: f"vcf_dir_path/{w.vcf_directory}/{w.sample}.vcf.gz", + mask_file=lambda w: f"vcf_dir_path/{w.vcf_directory}/mask.bed" + output: + vcf_dir_path / "{vcf_directory}/{sample}.masked.vcf.gz" + params: + type="DNA" + log: + std=log_dir_path / "{vcf_directory}/{sample}_vcf_masking.log", + cluster_log=cluster_log_dir_path / "{vcf_directory}/{sample}_vcf_masking.cluster.log", + cluster_err=cluster_log_dir_path / "{vcf_directory}/{sample}_vcf_masking.cluster.err", + benchmark: + benchmark_dir_path / "{vcf_directory}/{sample}_vcf_masking.benchmark.txt" + conda: + "../../%s" % config["conda_config"] + resources: + queue=config["processing_queue"], + cpus=config["processing_threads"], + time=config["processing_time"], + mem_mb=config["processing_mem_mb"], + shell: + "bedtools intersect -header -v -a {input.vcf_file} -b {input.mask_file} > {output} 2> {log.std}" + + + rule vcf_extract_sample_names: + input: + vcf=lambda w: f"vcf_dir_path/{w.vcf_directory}/sample.masked.vcf.gz", + checkpoint_file=checkpoints.vcf_discovery.get().output[0] + output: + f"vcf_dir_path/{{vcf_directory}}/sample_list.txt", + params: + type="DNA" + log: + std=log_dir_path / "{vcf_directory}/vcf_extract_sample_names.log", + cluster_log=cluster_log_dir_path / "{vcf_directory}/vcf_extract_sample_names.cluster.log", + cluster_err=cluster_log_dir_path / "{vcf_directory}/vcf_extract_sample_names.cluster.err", + benchmark: + benchmark_dir_path / "{vcf_directory}/vcf_extract_sample_names.benchmark.txt" + conda: + "../../%s" % config["conda_config"] + resources: + queue=config["processing_queue"], + cpus=config["processing_threads"], + time=config["processing_time"], + mem_mb=config["processing_mem_mb"], + shell: + "bcftools query -l {input.vcf} > {output}" + + + rule vcf_separation: + input: + vcf=lambda w: f"vcf_dir_path/{w.vcf_directory}/sample.masked.vcf.gz", + sample_list="vcf_dir_path/{vcf_directory}/sample_list.txt" + output: + directory(f"vcf_dir_path/{{vcf_directory}}/separated"), + params: + type="DNA" + log: + std=log_dir_path / "{vcf_directory}/vcf_separation.log", + cluster_log=cluster_log_dir_path / "{vcf_directory}/vcf_separation.cluster.log", + cluster_err=cluster_log_dir_path / 
"{vcf_directory}/vcf_separation.cluster.err", + benchmark: + benchmark_dir_path / "{vcf_directory}/vcf_separation.benchmark.txt" + conda: + "../../%s" % config["conda_config"] + resources: + queue=config["processing_queue"], + cpus=config["processing_threads"], + time=config["processing_time"], + mem_mb=config["processing_mem_mb"], + shell: + """ + mkdir -p {output} + while read -r SAMPLE; do + [ -z "$SAMPLE" ] && continue + bcftools view --threads 10 --min-ac 1 \\ + -s "$SAMPLE" \\ + -O z -o {output}/"$SAMPLE".separated.vcf.gz \\ + {input.vcf} + done < {input.sample_list} + """ + rule vcf_filtering: + input: + "vcf_dir_path/{vcf_directory}/separated/{sample}.separated.vcf.gz", + output: + "vcf_dir_path/{vcf_directory}/filtered/{sample}.filtered.vcf.gz", + params: + vcf_directory="{vcf_directory}", + common_filters=config["common_filters"], + haploid_filters=config["haploid_filters"], + diploid_filters=config["diploid_filters"], + log: + std=log_dir_path / "{vcf_directory}/{sample}_vcf_filtering.log", # Removed {ext} + cluster_log=cluster_log_dir_path / "{vcf_directory}/{sample}_vcf_filtering.cluster.log", + cluster_err=cluster_log_dir_path / "{vcf_directory}/{sample}_vcf_filtering.cluster.err", + benchmark: + benchmark_dir_path / "{vcf_directory}/{sample}_vcf_filtering.benchmark.txt" + conda: + "../../%s" % config["conda_config"] + resources: + queue=config["processing_queue"], + cpus=config["processing_threads"], + time=config["processing_time"], + mem_mb=config["processing_mem_mb"], + shell: + """ + mkdir -p {vcf_dir_path}/{wildcards.vcf_directory}/filtered + ploidy=$(bcftools query -f '%%GT\\n' {input} | \\ + awk '$1 ~ /\// || $1 ~ /\|/ {{print "diploid"; exit}} END {{print "haploid"}}') + + if [ "$ploidy" = "diploid" ]; then + bcftools filter -i {params.common_filters} \\ + -e {params.diploid_filters} {input} > {output} + else + bcftools filter -i {params.common_filters} \\ + -e {params.haploid_filters} {input} > {output} + fi + """ + + rule vcf_sequence_dictionary: + input: + "vcf_dir_path/{vcf_directory}/{reference}.{ext}", # Simplified pattern + output: + "vcf_dir_path/{vcf_directory}/{reference}.dict" + log: + std=log_dir_path / "{vcf_directory}/{reference}_vcf_sequence_dictionary.log", + cluster_log=cluster_log_dir_path / "{vcf_directory}/{reference}_vcf_sequence_dictionary.cluster.log", + cluster_err=cluster_log_dir_path / "{vcf_directory}/{reference}_vcf_sequence_dictionary.cluster.err", + benchmark: + benchmark_dir_path / "{vcf_directory}/{reference}_vcf_sequence_dictionary.benchmark.txt" + conda: + "../../%s" % config["conda_config"] + resources: + queue=config["processing_queue"], + cpus=config["processing_threads"], + time=config["processing_time"], + mem_mb=config["processing_mem_mb"], + shell: + "java -Xmx5g -jar picard.jar CreateSequenceDictionary R={input} O={output} 2> {log.std}" + + + rule vcf_indexing: + input: + "vcf_dir_path/{vcf_directory}/filtered/{sample}.filtered.{ext}", + output: + "vcf_dir_path/{vcf_directory}/filtered/{sample}.filtered.{ext}.idx", + log: + std=log_dir_path / "{vcf_directory}/{sample}.filtered.{ext}_vcf_indexing.log", + cluster_log=cluster_log_dir_path / "{vcf_directory}/{sample}.filtered.{ext}_vcf_indexing.cluster.log", # Fixed typo + cluster_err=cluster_log_dir_path / "{vcf_directory}/{sample}.filtered.{ext}_vcf_indexing.cluster.err", + benchmark: + benchmark_dir_path / "{vcf_directory}/{sample}.filtered.{ext}_vcf_indexing.benchmark.txt" + conda: + "../../%s" % config["conda_config"] + resources: + queue=config["processing_queue"], + 
cpus=config["processing_threads"], + time=config["processing_time"], + mem_mb=config["processing_mem_mb"], + shell: + "java -Xmx10g -jar gatk IndexFeatureFile -I {input} 1> {log.std} 2>&1" + + rule reference_indexing: + input: + "vcf_dir_path/{vcf_directory}/{reference}.{ext}", + output: + "vcf_dir_path/{vcf_directory}/{reference}.{ext}.fai", + log: + std=log_dir_path / "{vcf_directory}/{reference}.{ext}_reference_indexing.log", + cluster_log=cluster_log_dir_path / "{vcf_directory}/{reference}.{ext}_reference_indexing.cluster.log", + cluster_err=cluster_log_dir_path / "{vcf_directory}/{reference}.{ext}_reference_indexing.cluster.err", + benchmark: + benchmark_dir_path / "{vcf_directory}/{reference}.{ext}_reference_indexing.benchmark.txt" + conda: + "../../%s" % config["conda_config"] + resources: + queue=config["processing_queue"], + cpus=config["processing_threads"], + time=config["processing_time"], + mem_mb=config["processing_mem_mb"], + shell: + "samtools faidx {input} 2> {log.std}" + + rule vcf_to_fasta: + input: + vcf_file="vcf_dir_path/{vcf_directory}/filtered/{sample}.filtered.vcf.gz", + reference_file="vcf_dir_path/{vcf_directory}/reference.fna" + output: + genome_dir_path / "{sample}.AltRef.fasta", # Keep flat structure + params: + vcf_directory=lambda w: os.path.basename(os.path.dirname(os.path.dirname(input.vcf_file))), + log: + std=log_dir_path / "{sample}_vcf_to_fasta.log", + cluster_log=cluster_log_dir_path / "{sample}_vcf_to_fasta.cluster.log", + cluster_err=cluster_log_dir_path / "{sample}_vcf_to_fasta.cluster.err", + benchmark: + benchmark_dir_path / "{sample}_vcf_to_fasta.benchmark.txt" + params: + vcf_directory="{sample}" + conda: + "../../%s" % config["conda_config"] + resources: + queue=config["processing_queue"], + cpus=config["processing_threads"], + time=config["processing_time"], + mem_mb=config["processing_mem_mb"], + shell: + "java -Xmx10g -jar gatk FastaAlternateReferenceMaker " + "--output {output} " + "--reference {input.reference_file} " + "--variant {input.vcf_file} " + "--showHidden true " + "--use-iupac-sample {wildcards.sample} 2> {log.std}" \ No newline at end of file diff --git a/workflow/rules/visualization.smk b/workflow/rules/visualization.smk index da75247..3063cd2 100644 --- a/workflow/rules/visualization.smk +++ b/workflow/rules/visualization.smk @@ -158,14 +158,14 @@ if config["draw_phylotrees"]: rule raxml_tree_visualization: input: - raxml_dir_path / raxml_tree, + raxml_dir_path / f"{fasta_dna_filename}.raxml.treefile" output: - raxml_dir_path / f"{fasta_dna_filename}.length_and_support_tree.png", - raxml_dir_path / f"{fasta_dna_filename}.only_support_tree.png", - raxml_dir_path / f"{fasta_dna_filename}.only_tree.png", - raxml_dir_path / f"{fasta_dna_filename}.dots.png", + raxml_dir_path / f"{fasta_dna_filename}.raxml.length_and_support_tree.png", + raxml_dir_path / f"{fasta_dna_filename}.raxml.only_support_tree.png", + raxml_dir_path / f"{fasta_dna_filename}.raxml.only_tree.png", + raxml_dir_path / f"{fasta_dna_filename}.raxml.dots.png", params: - prefix=raxml_dir_path / f"{fasta_dna_filename}", + prefix=raxml_dir_path / f"{fasta_dna_filename}.raxml", options=config["tree_visualization_params"], log: std=log_dir_path / "raxml_tree_visualization.log", diff --git a/workflow/scripts/draw_phylotrees_from_astral.py b/workflow/scripts/draw_phylotrees_from_astral.py index 6065eb3..6f5a783 100755 --- a/workflow/scripts/draw_phylotrees_from_astral.py +++ b/workflow/scripts/draw_phylotrees_from_astral.py @@ -133,7 +133,7 @@ def main(): 
"--output_formats", dest="output_formats", type=lambda s: s.split(","), - default=("svg"), + default=("png"), help="Comma-separated list of formats (supported by ete3) of output figure. Default: svg,png", ) args = parser.parse_args() diff --git a/workflow/scripts/draw_phylotrees_from_astral_extended.py b/workflow/scripts/draw_phylotrees_from_astral_extended.py index daf65f5..14410b8 100755 --- a/workflow/scripts/draw_phylotrees_from_astral_extended.py +++ b/workflow/scripts/draw_phylotrees_from_astral_extended.py @@ -149,7 +149,7 @@ def main(): group_additional.add_argument('--width', type=int, default=1500, help="width for result rendering") group_additional.add_argument('--show', action="store_true", help="option to show tree using GUI") group_additional.add_argument("-e", "--output_formats", dest="output_formats", type=lambda s: s.split(","), - default=("svg"), help="Comma-separated list of formats (supported by ete3) of output figure. Default: svg,png") + default=("png"), help="Comma-separated list of formats (supported by ete3) of output figure. Default: svg,png") args = parser.parse_args() main() diff --git a/workflow/scripts/vcf2phylip.py b/workflow/scripts/vcf2phylip.py new file mode 100644 index 0000000..9a2cb5b --- /dev/null +++ b/workflow/scripts/vcf2phylip.py @@ -0,0 +1,502 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +""" +The script converts a collection of SNPs in VCF format into a PHYLIP, FASTA, +NEXUS, or binary NEXUS file for phylogenetic analysis. The code is optimized +to process VCF files with sizes >1GB. For small VCF files the algorithm slows +down as the number of taxa increases (but is still fast). + +Any ploidy is allowed, but binary NEXUS is produced only for diploid VCFs. +""" + +__author__ = "Edgardo M. Ortiz" +__credits__ = "Juan D. Palacio-Mejía" +__version__ = "2.9" +__email__ = "e.ortiz.v@gmail.com" +__date__ = "2023-07-07" + +import argparse +import gzip +import random +import sys +from pathlib import Path + +# Dictionary of IUPAC ambiguities for nucleotides +# '*' is a deletion in GATK, deletions are ignored in consensus, lowercase consensus is used when an +# 'N' or '*' is part of the genotype. 
Capitalization is used by some software but ignored by Geneious +# for example +AMBIG = { + "A" :"A", "C" :"C", "G" :"G", "N" :"N", "T" :"T", + "*A" :"a", "*C" :"c", "*G" :"g", "*N" :"n", "*T" :"t", + "AC" :"M", "AG" :"R", "AN" :"a", "AT" :"W", "CG" :"S", + "CN" :"c", "CT" :"Y", "GN" :"g", "GT" :"K", "NT" :"t", + "*AC" :"m", "*AG" :"r", "*AN" :"a", "*AT" :"w", "*CG" :"s", + "*CN" :"c", "*CT" :"y", "*GN" :"g", "*GT" :"k", "*NT" :"t", + "ACG" :"V", "ACN" :"m", "ACT" :"H", "AGN" :"r", "AGT" :"D", + "ANT" :"w", "CGN" :"s", "CGT" :"B", "CNT" :"y", "GNT" :"k", + "*ACG" :"v", "*ACN" :"m", "*ACT" :"h", "*AGN" :"r", "*AGT" :"d", + "*ANT" :"w", "*CGN" :"s", "*CGT" :"b", "*CNT" :"y", "*GNT" :"k", + "ACGN" :"v", "ACGT" :"N", "ACNT" :"h", "AGNT" :"d", "CGNT" :"b", + "*ACGN":"v", "*ACGT":"N", "*ACNT":"h", "*AGNT":"d", "*CGNT":"b", + "*" :"-", "*ACGNT":"N", +} + +# Dictionary for translating biallelic SNPs into SNAPP, only for diploid VCF +# 0 is homozygous reference +# 1 is heterozygous +# 2 is homozygous alternative +GEN_BIN = { + "./.":"?", + ".|.":"?", + "0/0":"0", + "0|0":"0", + "0/1":"1", + "0|1":"1", + "1/0":"1", + "1|0":"1", + "1/1":"2", + "1|1":"2", +} + + +def extract_sample_names(vcf_file): + """ + Extract sample names from VCF file + """ + if vcf_file.lower().endswith(".gz"): + opener = gzip.open + else: + opener = open + sample_names = [] + with opener(vcf_file, "rt") as vcf: + for line in vcf: + line = line.strip("\n") + if line.startswith("#CHROM"): + record = line.split("\t") + sample_names = [record[i].replace("./", "") for i in range(9, len(record))] + break + return sample_names + + +def is_anomalous(record, num_samples): + """ + Determine if the number of samples in current record corresponds to number of samples described + in the line '#CHROM' + """ + return bool(len(record) != num_samples + 9) + + +def is_snp(record): + """ + Determine if current VCF record is a SNP (single nucleotide polymorphism) as opposed to MNP + (multinucleotide polymorphism) + """ + # must be replaced by the REF in the ALT field for GVCFs from GATK + alt = record[4].replace("", record[3]) + return bool(len(record[3]) == 1 and len(alt) - alt.count(",") == alt.count(",") + 1) + + +def num_genotypes(record, num_samples): + """ + Get number of genotypes in VCF record, total number of samples - missing genotypes + """ + missing = 0 + for i in range(9, num_samples + 9): + if record[i].startswith("."): + missing += 1 + return num_samples - missing + + +def get_matrix_column(record, num_samples, resolve_IUPAC): + """ + Transform a VCF record into a phylogenetic matrix column with nucleotides instead of numbers + """ + nt_dict = {str(0): record[3].replace("-","*").upper(), ".": "N"} + # must be replaced by the REF in the ALT field for GVCFs from GATK + alt = record[4].replace("-", "*").replace("", nt_dict["0"]) + alt = alt.split(",") + for n in range(len(alt)): + nt_dict[str(n+1)] = alt[n] + column = "" + for i in range(9, num_samples + 9): + geno_num = record[i].split(":")[0].replace("/", "").replace("|", "") + try: + geno_nuc = "".join(sorted(set([nt_dict[j] for j in geno_num]))) + except KeyError: + return "malformed" + if resolve_IUPAC is False: + column += AMBIG[geno_nuc] + else: + column += AMBIG[nt_dict[random.choice(geno_num)]] + return column + + +def get_matrix_column_bin(record, num_samples): + """ + Return an alignment column in NEXUS binary from a VCF record, if genotype is not diploid with at + most two alleles it will return '?' 
as state + """ + column = "" + for i in range(9, num_samples + 9): + genotype = record[i].split(":")[0] + if genotype in GEN_BIN: + column += GEN_BIN[genotype] + else: + column += "?" + return column + + +def main(): + parser = argparse.ArgumentParser(description=__doc__, + formatter_class=argparse.RawDescriptionHelpFormatter) + parser.add_argument("-i", "--input", + action = "store", + dest = "filename", + required = True, + help = "Name of the input VCF file, can be gzipped") + parser.add_argument("--output-folder", + action = "store", + dest = "folder", + default = "./", + help = "Output folder name, it will be created if it does not exist (same folder as input by " + "default)") + parser.add_argument("--output-prefix", + action = "store", + dest = "prefix", + help = "Prefix for output filenames (same as the input VCF filename without the extension by " + "default)") + parser.add_argument("-m", "--min-samples-locus", + action = "store", + dest = "min_samples_locus", + type = int, + default = 4, + help = "Minimum of samples required to be present at a locus (default=4)") + parser.add_argument("-o", "--outgroup", + action = "store", + dest = "outgroup", + default = "", + help = "Name of the outgroup in the matrix. Sequence will be written as first taxon in the " + "alignment.") + parser.add_argument("-p", "--phylip-disable", + action = "store_true", + dest = "phylipdisable", + help = "A PHYLIP matrix is written by default unless you enable this flag") + parser.add_argument("-f", "--fasta", + action = "store_true", + dest = "fasta", + help = "Write a FASTA matrix (disabled by default)") + parser.add_argument("-n", "--nexus", + action = "store_true", + dest = "nexus", + help = "Write a NEXUS matrix (disabled by default)") + parser.add_argument("-b", "--nexus-binary", + action = "store_true", + dest = "nexusbin", + help = "Write a binary NEXUS matrix for analysis of biallelic SNPs in SNAPP, only diploid " + "genotypes will be processed (disabled by default)") + parser.add_argument("-r", "--resolve-IUPAC", + action = "store_true", + dest = "resolve_IUPAC", + help = "Randomly resolve heterozygous genotypes to avoid IUPAC ambiguities in the matrices " + "(disabled by default)") + parser.add_argument("-w", "--write-used-sites", + action = "store_true", + dest = "write_used", + help = "Save the list of coordinates that passed the filters and were used in the alignments " + "(disabled by default)") + parser.add_argument("-v", "--version", + action = "version", + version = "%(prog)s {version}".format(version=__version__)) + args = parser.parse_args() + + outgroup = args.outgroup.split(",")[0].split(";")[0] + + # Get samples names and number of samples in VCF + if Path(args.filename).exists(): + sample_names = extract_sample_names(args.filename) + else: + print("\nInput VCF file not found, please verify the provided path") + sys.exit() + num_samples = len(sample_names) + if num_samples == 0: + print("\nSample names not found in VCF, your file may be corrupt or missing the header.\n") + sys.exit() + print("\nConverting file '{}':\n".format(args.filename)) + print("Number of samples in VCF: {:d}".format(num_samples)) + + # If the 'min_samples_locus' is larger than the actual number of samples in VCF readjust it + args.min_samples_locus = min(num_samples, args.min_samples_locus) + + # Output filename will be the same as input file, indicating the minimum of samples specified + if not args.prefix: + parts = Path(args.filename).name.split(".") + args.prefix = [] + for p in parts: + if p.lower() == "vcf": 
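+                # stop at the first "vcf" component so that, e.g.,
+                # "sample1.vcf.gz" keeps only ["sample1"]; with the default
+                # --min-samples-locus of 4 the final prefix then becomes
+                # "sample1.min4"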
+ break + else: + args.prefix.append(p) + args.prefix = ".".join(args.prefix) + args.prefix += ".min" + str(args.min_samples_locus) + + # Check if outfolder exists, create it if it doesn't + if not Path(args.folder).exists(): + Path(args.folder).mkdir(parents=True) + + outfile = str(Path(args.folder, args.prefix)) + + # We need to create an intermediate file to hold the sequence data vertically and then transpose + # it to create the matrices + if args.fasta or args.nexus or not args.phylipdisable: + temporal = open(outfile+".tmp", "w") + + # If binary NEXUS is selected also create a separate temporal + if args.nexusbin: + temporalbin = open(outfile+".bin.tmp", "w") + + + ########################## + # PROCESS GENOTYPES IN VCF + + if args.write_used: + used_sites = open(outfile+".used_sites.tsv", "w") + used_sites.write("#CHROM\tPOS\tNUM_SAMPLES\n") + + if args.filename.lower().endswith(".gz"): + opener = gzip.open + else: + opener = open + + with opener(args.filename, "rt") as vcf: + # Initialize line counter + snp_num = 0 + snp_accepted = 0 + snp_shallow = 0 + mnp_num = 0 + snp_biallelic = 0 + + while 1: + # Load large chunks of file into memory + vcf_chunk = vcf.readlines(50000) + if not vcf_chunk: + break + + for line in vcf_chunk: + line = line.strip() + + if line and not line.startswith("#"): # skip empty and commented lines + # Split line into columns + record = line.split("\t") + # Keep track of number of genotypes processed + snp_num += 1 + # Print progress every 500000 lines + if snp_num % 500000 == 0: + print("{:d} genotypes processed.".format(snp_num)) + if is_anomalous(record, num_samples): + print("Skipping malformed line:\n{}".format(line)) + continue + else: + # Check if the SNP has the minimum number of samples required + num_samples_locus = num_genotypes(record, num_samples) + if num_samples_locus < args.min_samples_locus: + # Keep track of loci rejected due to exceeded missing data + snp_shallow += 1 + continue + else: + # Check that neither REF nor ALT contain MNPs + if is_snp(record): + # If nucleotide matrices are requested + if args.fasta or args.nexus or not args.phylipdisable: + # Uncomment for debugging + # print(record) + # Transform VCF record into an alignment column + site_tmp = get_matrix_column(record, num_samples, + args.resolve_IUPAC) + # Uncomment for debugging + # print(site_tmp) + # Write entire row of single nucleotide genotypes to temp file + if site_tmp == "malformed": + print("Skipping malformed line:\n{}".format(line)) + continue + else: + # Add to running sum of accepted SNPs + snp_accepted += 1 + temporal.write(site_tmp+"\n") + if args.write_used: + used_sites.write(record[0] + "\t" + + record[1] + "\t" + + str(num_samples_locus) + "\n") + # Write binary NEXUS for SNAPP if requested + if args.nexusbin: + # Check that the SNP only has two alleles + if len(record[4]) == 1: + # Add to running sum of biallelic SNPs + snp_biallelic += 1 + # Translate genotype into 0 for homozygous REF, 1 for + # heterozygous, and 2 for homozygous ALT + binsite_tmp = get_matrix_column_bin(record, num_samples) + # Write entire row to temporary file + temporalbin.write(binsite_tmp+"\n") + else: + # Keep track of loci rejected due to multinucleotide genotypes + mnp_num += 1 + + # Print useful information about filtering of SNPs + print("Total of genotypes processed: {:d}".format(snp_num)) + print("Genotypes excluded because they exceeded the amount " + "of missing data allowed: {:d}".format(snp_shallow)) + print("Genotypes that passed missing data filter but were " + 
"excluded for being MNPs: {:d}".format(mnp_num)) + print("SNPs that passed the filters: {:d}".format(snp_accepted)) + if args.nexusbin: + print("Biallelic SNPs selected for binary NEXUS: {:d}".format(snp_biallelic)) + + if args.write_used: + print("Used sites saved to: '" + outfile + ".used_sites.tsv'") + used_sites.close() + print("") + + if args.fasta or args.nexus or not args.phylipdisable: + temporal.close() + if args.nexusbin: + temporalbin.close() + + + ####################### + # WRITE OUTPUT MATRICES + + if not args.phylipdisable: + output_phy = open(outfile+".phy", "w") + output_phy.write("{:d} {:d}\n".format(len(sample_names), snp_accepted)) + + if args.fasta: + output_fas = open(outfile+".fasta", "w") + + if args.nexus: + output_nex = open(outfile+".nexus", "w") + output_nex.write("#NEXUS\n\nBEGIN DATA;\n\tDIMENSIONS NTAX={:d} NCHAR={:d};\n\tFORMAT " + "DATATYPE=DNA MISSING=N GAP=- ;\nMATRIX\n".format(len(sample_names), + snp_accepted)) + + if args.nexusbin: + output_nexbin = open(outfile+".bin.nexus", "w") + output_nexbin.write("#NEXUS\n\nBEGIN DATA;\n\tDIMENSIONS NTAX={:d} NCHAR={:d};\n\tFORMAT " + "DATATYPE=SNP MISSING=? GAP=- ;\nMATRIX\n".format(len(sample_names), + snp_biallelic)) + + # Get length of longest sequence name + len_longest_name = 0 + for name in sample_names: + if len(name) > len_longest_name: + len_longest_name = len(name) + + # Write outgroup as first sequence in alignment if the name is specified + idx_outgroup = None + if outgroup in sample_names: + idx_outgroup = sample_names.index(outgroup) + + if args.fasta or args.nexus or not args.phylipdisable: + with open(outfile+".tmp") as tmp_seq: + seqout = "" + + # This is where the transposing happens + for line in tmp_seq: + seqout += line[idx_outgroup] + + # Write FASTA line + if args.fasta: + output_fas.write(">"+sample_names[idx_outgroup]+"\n"+seqout+"\n") + + # Pad sequences names and write PHYLIP or NEXUS lines + padding = (len_longest_name + 3 - len(sample_names[idx_outgroup])) * " " + if not args.phylipdisable: + output_phy.write(sample_names[idx_outgroup]+padding+seqout+"\n") + if args.nexus: + output_nex.write(sample_names[idx_outgroup]+padding+seqout+"\n") + + # Print current progress + print("Outgroup, '{}', added to the matrix(ces).".format(outgroup)) + + if args.nexusbin: + with open(outfile+".bin.tmp") as bin_tmp_seq: + seqout = "" + + # This is where the transposing happens + for line in bin_tmp_seq: + seqout += line[idx_outgroup] + + # Write line of binary SNPs to NEXUS + padding = (len_longest_name + 3 - len(sample_names[idx_outgroup])) * " " + output_nexbin.write(sample_names[idx_outgroup]+padding+seqout+"\n") + + # Print current progress + print("Outgroup, '{}', added to the binary matrix.".format(outgroup)) + + # Write sequences of the ingroup + for s in range(0, len(sample_names)): + if s != idx_outgroup: + if args.fasta or args.nexus or not args.phylipdisable: + with open(outfile+".tmp") as tmp_seq: + seqout = "" + + # This is where the transposing happens + for line in tmp_seq: + seqout += line[s] + + # Write FASTA line + if args.fasta: + output_fas.write(">"+sample_names[s]+"\n"+seqout+"\n") + + # Pad sequences names and write PHYLIP or NEXUS lines + padding = (len_longest_name + 3 - len(sample_names[s])) * " " + if not args.phylipdisable: + output_phy.write(sample_names[s]+padding+seqout+"\n") + if args.nexus: + output_nex.write(sample_names[s]+padding+seqout+"\n") + + # Print current progress + print("Sample {:d} of {:d}, '{}', added to the nucleotide matrix(ces).".format( + s+1, 
len(sample_names), sample_names[s]))
+
+            if args.nexusbin:
+                with open(outfile+".bin.tmp") as bin_tmp_seq:
+                    seqout = ""
+
+                    # This is where the transposing happens
+                    for line in bin_tmp_seq:
+                        seqout += line[s]
+
+                    # Write line of binary SNPs to NEXUS
+                    padding = (len_longest_name + 3 - len(sample_names[s])) * " "
+                    output_nexbin.write(sample_names[s]+padding+seqout+"\n")
+
+                    # Print current progress
+                    print("Sample {:d} of {:d}, '{}', added to the binary matrix.".format(
+                        s+1, len(sample_names), sample_names[s]))
+
+    print()
+    if not args.phylipdisable:
+        print("PHYLIP matrix saved to: " + outfile+".phy")
+        output_phy.close()
+    if args.fasta:
+        print("FASTA matrix saved to: " + outfile+".fasta")
+        output_fas.close()
+    if args.nexus:
+        output_nex.write(";\nEND;\n")
+        print("NEXUS matrix saved to: " + outfile+".nexus")
+        output_nex.close()
+    if args.nexusbin:
+        output_nexbin.write(";\nEND;\n")
+        print("BINARY NEXUS matrix saved to: " + outfile+".bin.nexus")
+        output_nexbin.close()
+
+    if args.fasta or args.nexus or not args.phylipdisable:
+        Path(outfile+".tmp").unlink()
+    if args.nexusbin:
+        Path(outfile+".bin.tmp").unlink()
+
+    print("\nDone!\n")
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file

From 11fd5b933feeda1e5350d15f4ff8b7aaf4921ea2 Mon Sep 17 00:00:00 2001
From: Zubiriguitar
Date: Sun, 20 Apr 2025 04:13:47 +0700
Subject: [PATCH 07/15] wildcard fix + species_list function + sample_list iterator

---
 Snakefile                        | 42 ++++++++++++++++++++++++------
 workflow/rules/busco.smk         |  6 ++---
 workflow/rules/vcf.smk           | 31 ++++++++++++++++-------
 workflow/rules/visualization.smk | 12 ++++-----
 4 files changed, 64 insertions(+), 27 deletions(-)

diff --git a/Snakefile b/Snakefile
index 5581495..2fd7722 100644
--- a/Snakefile
+++ b/Snakefile
@@ -1,5 +1,7 @@
 from pathlib import Path
 import os
+from pathlib import Path
+import subprocess
 
 # ---- Setup config ----
 configfile: "config/default.yaml"
@@ -83,17 +85,41 @@ if config["vcf"]:
             sample=glob_wildcards(os.path.join(vcf_dir_path, "*", "{sample}_PASS.vcf.gz")).sample
         )
 
-def get_vcf_samples(vcf_dir):
-    vcf_files = list(Path(vcf_dir).rglob("*.vcf.gz"))
-    if not vcf_files:
-        raise ValueError(f"No VCF samples found in {vcf_dir} subdirectories.")
-    return sorted({f.stem.replace("_PASS", "") for f in vcf_files})
+#def get_vcf_samples(vcf_dir):
+#    vcf_files = list(Path(vcf_dir).rglob("*.vcf.gz"))
+#    if not vcf_files:
+#        raise ValueError(f"No VCF samples found in {vcf_dir} subdirectories.")
+#    return sorted({f.stem.replace("_PASS", "") for f in vcf_files})
 
-if "species_list" not in config:
+#if "species_list" not in config:
+#    if config.get("vcf", False):
+#        config["species_list"] = []
+#    else:
+#        config["species_list"] = [f.stem for f in genome_dir_path.glob("*.fasta") if f.is_file()]
+
+#    if "species_list" not in config:
+#        config["species_list"] = [f.stem for f in genome_dir_path.iterdir() if f.is_file() and f.suffix == ".fasta"]
+
+def get_species_list():
+    fasta_species = [f.stem for f in genome_dir_path.iterdir() if f.is_file() and f.suffix == ".fasta"]
     if config.get("vcf", False):
-        config["species_list"] = []
+        vcf_samples = set()
+        for vcf in vcf_dir_path.glob("**/*.vcf*"):
+            if vcf.suffix in [".vcf", ".vcf.gz"]:
+                try:
+                    samples = subprocess.check_output(
+                        ["bcftools", "query", "-l", str(vcf)],
+                        text=True
+                    ).splitlines()
+                    vcf_samples.update(samples)
+                except subprocess.CalledProcessError as e:
+                    print(f"Error processing {vcf}: {e}")
+        return list(set(fasta_species + list(vcf_samples)))
     else:
-        config["species_list"] = [f.stem for 
f in genome_dir_path.glob("*.fasta") if f.is_file()] + return fasta_species + +if "species_list" not in config: + config["species_list"] = get_species_list() # +-----------------+ # | the "All" rule | diff --git a/workflow/rules/busco.smk b/workflow/rules/busco.smk index 267e82f..34187b5 100644 --- a/workflow/rules/busco.smk +++ b/workflow/rules/busco.smk @@ -2,9 +2,7 @@ if config["busco_gene_prediction_tool"] == "metaeuk": rule busco_metaeuk: input: - genome_dir_path / "{species}{suffix}.fasta".format( - species="{species}", - suffix=config["altref_suffix"]) + genome_dir_path / "{species}.fasta", output: busco_outdir=directory(busco_dir_path / "{species}"), single_copy_busco_sequences=directory(busco_dir_path / "{species}/busco_sequences/single_copy_busco_sequences"), @@ -83,4 +81,4 @@ elif config["busco_gene_prediction_tool"] == "augustus": " mv run*/* . ; rm -r run* 1> $MYPWD/{log.std} 2>&1; " " mv full_table.tsv full_table_{params.output_prefix}.tsv 1> $MYPWD/{log.std} 2>&1; " " mv missing_busco_list.tsv missing_busco_list_{params.output_prefix}.tsv 1> $MYPWD/{log.std} 2>&1; " - " mv short_summary.txt short_summary_{params.output_prefix}.txt 1> $MYPWD/{log.std} 2>&1; " + " mv short_summary.txt short_summary_{params.output_prefix}.txt 1> $MYPWD/{log.std} 2>&1; " \ No newline at end of file diff --git a/workflow/rules/vcf.smk b/workflow/rules/vcf.smk index 6b0d3be..aa062fe 100644 --- a/workflow/rules/vcf.smk +++ b/workflow/rules/vcf.smk @@ -14,7 +14,7 @@ if config["vcf"]: vcf_file=lambda w: f"vcf_dir_path/{w.vcf_directory}/{w.sample}.vcf.gz", mask_file=lambda w: f"vcf_dir_path/{w.vcf_directory}/mask.bed" output: - vcf_dir_path / "{vcf_directory}/{sample}.masked.vcf.gz" + "vcf_dir_path / "{vcf_directory}/{sample}.masked.vcf.gz" params: type="DNA" log: @@ -36,10 +36,14 @@ if config["vcf"]: rule vcf_extract_sample_names: input: - vcf=lambda w: f"vcf_dir_path/{w.vcf_directory}/sample.masked.vcf.gz", + vcf_files=lambda w: ( + glob.glob(f"vcf_dir_path/{w.vcf_directory}/*.masked.vcf.gz") + if config["vcf_masking"] + else glob.glob(f"vcf_dir_path/{w.vcf_directory}/*.vcf.gz") + ), checkpoint_file=checkpoints.vcf_discovery.get().output[0] output: - f"vcf_dir_path/{{vcf_directory}}/sample_list.txt", + "vcf_dir_path/{{vcf_directory}}/sample_list.txt", params: type="DNA" log: @@ -56,12 +60,19 @@ if config["vcf"]: time=config["processing_time"], mem_mb=config["processing_mem_mb"], shell: - "bcftools query -l {input.vcf} > {output}" + """ + for vcf in {input.vcf_files}; do + bcftools query -l "$vcf" >> {output} + done + """ rule vcf_separation: input: - vcf=lambda w: f"vcf_dir_path/{w.vcf_directory}/sample.masked.vcf.gz", + if config["vcf_masking"]: + vcf=lambda w: f"vcf_dir_path/{w.vcf_directory}/{sample}.masked.vcf.gz", + else: + vcf=lambda w: f"vcf_dir_path/{w.vcf_directory}/{sample}.vcf.gz", sample_list="vcf_dir_path/{vcf_directory}/sample_list.txt" output: directory(f"vcf_dir_path/{{vcf_directory}}/separated"), @@ -91,6 +102,8 @@ if config["vcf"]: {input.vcf} done < {input.sample_list} """ + + rule vcf_filtering: input: "vcf_dir_path/{vcf_directory}/separated/{sample}.separated.vcf.gz", @@ -102,7 +115,7 @@ if config["vcf"]: haploid_filters=config["haploid_filters"], diploid_filters=config["diploid_filters"], log: - std=log_dir_path / "{vcf_directory}/{sample}_vcf_filtering.log", # Removed {ext} + std=log_dir_path / "{vcf_directory}/{sample}_vcf_filtering.log", cluster_log=cluster_log_dir_path / "{vcf_directory}/{sample}_vcf_filtering.cluster.log", cluster_err=cluster_log_dir_path / 
"{vcf_directory}/{sample}_vcf_filtering.cluster.err", benchmark: @@ -131,7 +144,7 @@ if config["vcf"]: rule vcf_sequence_dictionary: input: - "vcf_dir_path/{vcf_directory}/{reference}.{ext}", # Simplified pattern + "vcf_dir_path/{vcf_directory}/{reference}.{ext}", output: "vcf_dir_path/{vcf_directory}/{reference}.dict" log: @@ -153,7 +166,7 @@ if config["vcf"]: rule vcf_indexing: input: - "vcf_dir_path/{vcf_directory}/filtered/{sample}.filtered.{ext}", + "vcf_dir_path/{vcf_directory}/filtered/{sample}.filtered.vcf.gz", output: "vcf_dir_path/{vcf_directory}/filtered/{sample}.filtered.{ext}.idx", log: @@ -196,7 +209,7 @@ if config["vcf"]: rule vcf_to_fasta: input: vcf_file="vcf_dir_path/{vcf_directory}/filtered/{sample}.filtered.vcf.gz", - reference_file="vcf_dir_path/{vcf_directory}/reference.fna" + reference_file="vcf_dir_path/{vcf_directory}/{reference}.{ext}", output: genome_dir_path / "{sample}.AltRef.fasta", # Keep flat structure params: diff --git a/workflow/rules/visualization.smk b/workflow/rules/visualization.smk index 3063cd2..da75247 100644 --- a/workflow/rules/visualization.smk +++ b/workflow/rules/visualization.smk @@ -158,14 +158,14 @@ if config["draw_phylotrees"]: rule raxml_tree_visualization: input: - raxml_dir_path / f"{fasta_dna_filename}.raxml.treefile" + raxml_dir_path / raxml_tree, output: - raxml_dir_path / f"{fasta_dna_filename}.raxml.length_and_support_tree.png", - raxml_dir_path / f"{fasta_dna_filename}.raxml.only_support_tree.png", - raxml_dir_path / f"{fasta_dna_filename}.raxml.only_tree.png", - raxml_dir_path / f"{fasta_dna_filename}.raxml.dots.png", + raxml_dir_path / f"{fasta_dna_filename}.length_and_support_tree.png", + raxml_dir_path / f"{fasta_dna_filename}.only_support_tree.png", + raxml_dir_path / f"{fasta_dna_filename}.only_tree.png", + raxml_dir_path / f"{fasta_dna_filename}.dots.png", params: - prefix=raxml_dir_path / f"{fasta_dna_filename}.raxml", + prefix=raxml_dir_path / f"{fasta_dna_filename}", options=config["tree_visualization_params"], log: std=log_dir_path / "raxml_tree_visualization.log", From ef26cfd4cbe82d335a83b0a7db8a9de749ffe711 Mon Sep 17 00:00:00 2001 From: zubiriguitar Date: Thu, 15 May 2025 23:38:53 +0700 Subject: [PATCH 08/15] vcf_fix + visualisation --- Snakefile | 286 ++++++++++--------- config/default.yaml | 156 ++++++---- {genomes => input/genomes}/DESCRIPTION | 0 workflow/envs/{ete3_conda.yaml => ete3.yaml} | 1 + workflow/envs/gatk.yaml | 11 + workflow/envs/{conda.yaml => main.yaml} | 5 +- workflow/rules/alignment.smk | 4 +- workflow/rules/astral.smk | 201 ++++++------- workflow/rules/busco.smk | 4 +- workflow/rules/common_ids.smk | 173 +++++------ workflow/rules/concat_alignments.smk | 31 +- workflow/rules/filtration.smk | 8 +- workflow/rules/iqtree.smk | 4 +- workflow/rules/phylip.smk | 4 +- workflow/rules/quastcore.smk | 2 +- workflow/rules/rapidnj.smk | 2 +- workflow/rules/raxml.smk | 4 +- workflow/rules/vcf.smk | 238 --------------- workflow/rules/vcf_reconstruct.smk | 204 +++++++++++++ workflow/rules/visualization.smk | 195 ++++++------- workflow/scripts/draw_phylotrees.py | 45 ++- 21 files changed, 819 insertions(+), 759 deletions(-) rename {genomes => input/genomes}/DESCRIPTION (100%) rename workflow/envs/{ete3_conda.yaml => ete3.yaml} (94%) create mode 100644 workflow/envs/gatk.yaml rename workflow/envs/{conda.yaml => main.yaml} (87%) delete mode 100644 workflow/rules/vcf.smk create mode 100644 workflow/rules/vcf_reconstruct.smk diff --git a/Snakefile b/Snakefile index 2fd7722..0807c6f 100644 --- a/Snakefile +++ 
b/Snakefile @@ -1,22 +1,27 @@ from pathlib import Path import os -from pathlib import Path import subprocess +import pandas as pd + # ---- Setup config ---- configfile: "config/default.yaml" # ---- Setup paths ---- -cluster_log_dir_path = Path(config["cluster_log_dir"]) +# -- Input -- genome_dir_path = Path(config["genome_dir"]).resolve() +vcf_reconstruct_dir_path = Path(config["vcf_reconstruct_dir"]).resolve() + +# -- Logs and benchmarks -- +cluster_log_dir_path = Path(config["cluster_log_dir"]) log_dir_path = Path(config["log_dir"]) benchmark_dir_path = Path(config["benchmark_dir"]) output_dir_path = Path(config["output_dir"]) -vcf_dir_path = Path(config["vcf_dir"]).resolve() - +# -- Results -- quastcore_dir_path = output_dir_path / config["quastcore_dir"] +altref_dir_path = output_dir_path / config["altref_dir"] busco_dir_path = output_dir_path / config["busco_dir"] species_ids_dir_path = output_dir_path / config["species_ids_dir"] common_ids_dir_path = output_dir_path / config["common_ids_dir"] @@ -31,17 +36,10 @@ rapidnj_dir_path = output_dir_path / config["rapidnj_dir"] phylip_dir_path = output_dir_path / config["phylip_dir"] raxml_dir_path = output_dir_path / config["raxml_dir"] -# if "species_list" not in config: -# config["species_list"] = [f.stem for f in genome_dir_path.iterdir() if f.is_file() and f.suffix == ".fasta"] - - # ---- Setup filenames ---- -vcf_filename = "{}.{ext,vcf|vcf.gz}" -vcf_mask_filename = "{}.{ext,bed|bed.gz}" -vcf_reference_filename = "{}.{ext,fna|fasta}" -fasta_dna_filename = f"concat_alignment{config['altref_suffix']}.fna" +fasta_dna_filename = "{}.fna".format(config["alignment_file_prefix"]) fasta_protein_filename = "{}.faa".format(config["alignment_file_prefix"]) -nexus_dna_filename = f"{fasta_dna_filename}.nex" +nexus_dna_filename = "{}.fna.nex".format(config["alignment_file_prefix"]) nexus_protein_filename = "{}.faa.nex".format(config["alignment_file_prefix"]) phylip_dna_filename = "{}.fna.phy".format(config["alignment_file_prefix"]) phylip_protein_filename = "{}.faa.phy".format(config["alignment_file_prefix"]) @@ -51,200 +49,210 @@ astral_input_trees = "{}.iqtree_per_fna.concat.treefile".format(config["alignmen astral_filtered_trees = "{0}.iqtree_per_fna.concat.{1}.treefile".format(config["alignment_file_prefix"], config["nodes_filtrataion_by_support"]) astral_tree = "{0}.{1}.fna.astral.treefile".format(config["alignment_file_prefix"], config["nodes_filtrataion_by_support"]) rapidnj_tree = "{}.fna.rapidnj.treefile".format(config["alignment_file_prefix"]) +rapidnj_matrix = "{}.fna.rapidnj.matrix".format(config["alignment_file_prefix"]) phylip_tree = "{}.fna.phy.namefix.treefile".format(config["alignment_file_prefix"]) -raxml_tree = f"{fasta_dna_filename}.raxml.treefile" - +raxml_tree = "{}.fna.raxml.treefile".format(config["alignment_file_prefix"]) # ---- Necessary functions ---- -def expand_fna_from_merged_sequences(wildcards, template): +def get_vcf_reconstruct_map(vcf_dir: Path) -> dict: + """Returns a dictionary where keys are species names and values are dictionaries of VCF and FASTA files.""" + vcf_mapping = {} + for vcf_subdir in vcf_dir.iterdir(): + if vcf_subdir.is_dir(): + ref_files = list(vcf_subdir.glob("*.fasta")) + if not ref_files: + continue + ref_file = ref_files[0] + ref_prefix = ref_file.stem + + for vcf_file in vcf_subdir.glob("*.vcf.gz"): + vcf_id = vcf_file.stem.split('.')[0] + alt_name = f"{vcf_id}.{ref_prefix}.AltRef" + vcf_mapping[alt_name] = { + 'vcf': vcf_file, + 'reference': ref_file + } + return vcf_mapping + + +def 
get_species_list(vcf_species: list, genome_species: list) -> list: + """Merges and returns final species list""" + return list(set(vcf_species + genome_species)) + + +def expand_fna_from_merged_sequences(wildcards, template, busco_blacklist=None): checkpoint_output = checkpoints.merged_sequences.get(**wildcards).output[0] N = glob_wildcards(os.path.join(checkpoint_output, "{N}.fna")).N + print(f"busco_blacklist: {busco_blacklist}") + print(f"Length of busco_blacklist: {len(N)}") + if busco_blacklist is not None: + N = set(N) - set(list(map(lambda s: f"{s}.merged", busco_blacklist))) + print(f"Length of final common_ids: {len(N)}") return expand(str(template), N=N) -def expand_faa_from_merged_sequences(wildcards, template): +def expand_faa_from_merged_sequences(wildcards, template, busco_blacklist=None): checkpoint_output = checkpoints.merged_sequences.get(**wildcards).output[0] N = glob_wildcards(os.path.join(checkpoint_output, "{N}.faa")).N + if busco_blacklist is not None: + N = set(N) - set(list(map(lambda s: f"{s}.merged", busco_blacklist))) return expand(str(template), N=N) -def get_samples_from_file(sample_list_path): - with open(sample_list_path) as f: - return [line.strip() for line in f if line.strip()] - - - -#---- VCF ---- -if config["vcf"]: - include: "workflow/rules/vcf.smk" - - checkpoint vcf_processed: - input: - expand( - genome_dir_path / "{sample}.AltRef.fasta", - sample=glob_wildcards(os.path.join(vcf_dir_path, "*", "{sample}_PASS.vcf.gz")).sample - ) - -#def get_vcf_samples(vcf_dir): -# vcf_files = list(Path(vcf_dir).rglob("*.vcf.gz")) -# if not vcf_files: -# raise ValueError(f"No VCF samples found in {vcf_dir} subdirectories.") -# return sorted({f.stem.replace("_PASS", "") for f in vcf_files}) - -#if "species_list" not in config: -# if config.get("vcf", False): -# config["species_list"] = [] -# else: -# config["species_list"] = [f.stem for f in genome_dir_path.glob("*.fasta") if f.is_file()] - -# if "species_list" not in config: -# config["species_list"] = [f.stem for f in genome_dir_path.iterdir() if f.is_file() and f.suffix == ".fasta"] - -def get_species_list(): - fasta_species = [f.stem for f in genome_dir_path.iterdir() if f.is_file() and f.suffix == ".fasta"] - if config.get("vcf", False): - vcf_samples = set() - for vcf in vcf_dir_path.glob("**/*.vcf*"): - if vcf.suffix in [".vcf", ".vcf.gz"]: - try: - samples = subprocess.check_output( - ["bcftools", "query", "-l", str(vcf)], - text=True - ).splitlines() - vcf_samples.update(samples) - except subprocess.CalledProcessError as e: - print(f"Error processing {vcf}: {e}") - return list(set(fasta_species + list(vcf_samples))) - else: - return fasta_species + +# ------------------ TEMPORARY CODE!!!!!!!!!!!!! 
----------------------- +# blacklist is applied at the concatenation stage +busco_blacklist = None +busco_blacklist_path = Path("input/BUSCO.blacklist") +if busco_blacklist_path.exists() and (busco_blacklist_path.stat().st_size > 0): + busco_blacklist = pd.read_csv(busco_blacklist_path, sep="\t", header=None).squeeze() +# --------------------------------------------------------------------- + + +# ---- Input data ---- +genome_species = [f.stem for f in genome_dir_path.glob("*.fasta") if f.is_file()] +vcf_reconstruct_map = get_vcf_reconstruct_map(vcf_reconstruct_dir_path) +vcf_reconstruct_species = list(vcf_reconstruct_map.keys()) if "species_list" not in config: - config["species_list"] = get_species_list() - -# +-----------------+ -# | the "All" rule | -# +-----------------+ - -output_files = [ - # ---- VCF ---- - checkpoints.vcf_processed.get().output[0] if config["vcf"] else [], - *( - expand( - genome_dir_path / "{sample}.AltRef.fasta", - sample=config["species_list"] - ) - if config.get("vcf", False) - else [] - ), - expand(busco_dir_path / "{species}/short_summary_{species}.txt", species=config["species_list"]), - # ---- Busco ---- - expand(busco_dir_path / - "{species}/short_summary_{species}.txt", species=config["species_list"]), - # ---- Merge sequences with common ids ---- - lambda w: expand_fna_from_merged_sequences( - w, merged_sequences_dir_path / "{N}.fna"), - lambda w: expand_faa_from_merged_sequences( - w, merged_sequences_dir_path / "{N}.faa"), - species_ids_dir_path / "unique_species_ids.png", - busco_dir_path / "busco_summaries.svg", - - - -] -# if "vcf" in config: -# if config["vcf"]: -# expand("genomes/{sample}.AltRef.fasta", sample=get_sample_names()) + config["species_list"] = get_species_list(genome_species, vcf_reconstruct_species) + print(config["species_list"]) + + +# ---- "All" rule ---- +if config["vcf2phylip"] != True: + output_files = [ + # ---- Busco ---- + expand(busco_dir_path / "{species}/short_summary_{species}.txt", species=config["species_list"]), + # ---- Merge sequences with common ids ---- + lambda w: expand_fna_from_merged_sequences(w, merged_sequences_dir_path / "{N}.fna", busco_blacklist=busco_blacklist), + lambda w: expand_faa_from_merged_sequences(w, merged_sequences_dir_path / "{N}.faa", busco_blacklist=busco_blacklist), + species_ids_dir_path / "unique_species_ids.png", + busco_dir_path / "busco_summaries.svg", + ] +else: + output_files = [] + + + + +if "vcf2phylip" in config: + if config["vcf2phylip"]: + output_files.append(altref_dir_path / "consensus.fasta") + output_files.append(concat_alignments_dir_path / fasta_dna_filename) + if "iqtree_dna" in config: + if config["iqtree_dna"]: + output_files.append(iqtree_dir_path / "fna" / f"{fasta_dna_filename}.treefile") + if "draw_phylotrees" in config: + if config["draw_phylotrees"]: + output_files.append(iqtree_dir_path / "fna" / f"{fasta_dna_filename}.length_and_support_tree.png") + if config["rapidnj"]: + output_files.append(concat_alignments_dir_path / stockholm_dna_filename) + output_files.append(rapidnj_dir_path / rapidnj_tree) + if "draw_phylotrees" in config: + if config["draw_phylotrees"]: + output_files.append(rapidnj_dir_path / f"{fasta_dna_filename}.only_tree.png") + if "phylip" in config: + if config["phylip"]: + output_files.append(concat_alignments_dir_path / phylip_dna_filename) + output_files.append(phylip_dir_path / phylip_tree) + if "draw_phylotrees" in config: + if config["draw_phylotrees"]: + output_files.append(phylip_dir_path / f"{fasta_dna_filename}.only_tree.png") + if 
"raxml" in config: + if config["raxml"]: + output_files.append(raxml_dir_path / raxml_tree) + if "draw_phylotrees" in config: + if config["draw_phylotrees"]: + output_files.append(raxml_dir_path / f"{fasta_dna_filename}.only_tree.png") + if "quastcore" in config: if config["quastcore"]: output_files.append(quastcore_dir_path / "assembly_stats.csv") + + if "dna_alignment" in config: if config["dna_alignment"]: - output_files.append(lambda w: expand_fna_from_merged_sequences( - w, alignments_dir_path / "fna" / "{N}.fna")) + output_files.append(lambda w: expand_fna_from_merged_sequences(w, alignments_dir_path / "fna" / "{N}.fna", busco_blacklist=busco_blacklist)) if "dna_filtration" in config: if config["dna_filtration"]: - output_files.append(lambda w: expand_fna_from_merged_sequences( - w, filtered_alignments_dir_path / "fna" / "{N}.fna")) output_files.append( - concat_alignments_dir_path / fasta_dna_filename) - output_files.append( - concat_alignments_dir_path / nexus_dna_filename) + lambda w: expand_fna_from_merged_sequences(w, filtered_alignments_dir_path / "fna" / "{N}.fna", busco_blacklist=busco_blacklist) + ) + output_files.append(concat_alignments_dir_path / fasta_dna_filename) if "iqtree_dna" in config: if config["iqtree_dna"]: - output_files.append( - iqtree_dir_path / "fna" / f"{fasta_dna_filename}.treefile") + output_files.append(iqtree_dir_path / "fna" / f"{fasta_dna_filename}.treefile") if "draw_phylotrees" in config: if config["draw_phylotrees"]: - output_files.append( - iqtree_dir_path / "fna" / f"{fasta_dna_filename}.length_and_support_tree.png") + output_files.append(iqtree_dir_path / "fna" / f"{fasta_dna_filename}.length_and_support_tree.png") if "astral" in config: if config["astral"]: output_files.append(astral_dir_path / astral_tree) if "draw_phylotrees" in config: if config["draw_phylotrees"]: - output_files.append( - astral_dir_path / f"{astral_tree}.png") + output_files.append(astral_dir_path / f"{astral_tree}.png") if "rapidnj" in config: if config["rapidnj"]: - output_files.append( - concat_alignments_dir_path / stockholm_dna_filename) + output_files.append(concat_alignments_dir_path / stockholm_dna_filename) output_files.append(rapidnj_dir_path / rapidnj_tree) if "draw_phylotrees" in config: if config["draw_phylotrees"]: - output_files.append( - rapidnj_dir_path / f"{fasta_dna_filename}.only_tree.png") + output_files.append(rapidnj_dir_path / f"{fasta_dna_filename}.only_tree.png") if "phylip" in config: if config["phylip"]: - output_files.append( - concat_alignments_dir_path / phylip_dna_filename) + output_files.append(concat_alignments_dir_path / phylip_dna_filename) output_files.append(phylip_dir_path / phylip_tree) if "draw_phylotrees" in config: if config["draw_phylotrees"]: - output_files.append( - phylip_dir_path / f"{fasta_dna_filename}.only_tree.png") + output_files.append(phylip_dir_path / f"{fasta_dna_filename}.only_tree.png") if "raxml" in config: if config["raxml"]: output_files.append(raxml_dir_path / raxml_tree) -# if "draw_phylotrees" in config: -# if config["draw_phylotrees"]: -# output_files.append( -# raxml_dir_path / f"{fasta_dna_filename}.only_tree.png") + if "draw_phylotrees" in config: + if config["draw_phylotrees"]: + output_files.append(raxml_dir_path / f"{fasta_dna_filename}.only_tree.png") + if "protein_alignment" in config: if config["protein_alignment"]: - output_files.append(lambda w: expand_faa_from_merged_sequences( - w, alignments_dir_path / "faa" / "{N}.faa")) + output_files.append(lambda w: expand_faa_from_merged_sequences(w, 
alignments_dir_path / "faa" / "{N}.faa"))
         if "protein_filtration" in config:
             if config["protein_filtration"]:
-                output_files.append(lambda w: expand_fna_from_merged_sequences(
-                    w, filtered_alignments_dir_path / "faa" / "{N}.faa"))
                 output_files.append(
-                    concat_alignments_dir_path / fasta_protein_filename)
+                    lambda w: expand_fna_from_merged_sequences(w, filtered_alignments_dir_path / "faa" / "{N}.faa", busco_blacklist=busco_blacklist)
+                )
+                output_files.append(concat_alignments_dir_path / fasta_protein_filename)
                 # output_files.append(concat_alignments_dir_path / nexus_protein_filename)
                 # output_files.append(concat_alignments_dir_path / stockholm_protein_filename)
                 # output_files.append(concat_alignments_dir_path / phylip_protein_filename)
         if "iqtree_protein" in config:
             if config["iqtree_protein"]:
-                output_files.append(
-                    iqtree_dir_path / "faa" / f"{fasta_protein_filename}.treefile")
+                output_files.append(iqtree_dir_path / "faa" / f"{fasta_protein_filename}.treefile")
                 if "draw_phylotrees" in config:
                     if config["draw_phylotrees"]:
-                        output_files.append(
-                            iqtree_dir_path / "faa" / f"{fasta_protein_filename}.length_and_support_tree.png")
+                        output_files.append(iqtree_dir_path / "faa" / f"{fasta_protein_filename}.length_and_support_tree.svg")
+
 if "mrbayes_dna" in config:
-    if config["mrbayes_dna"]:
+    if config["mrbayes_dna"]: # TODO: upgrade
+        output_files.append(concat_alignments_dir_path / nexus_dna_filename)
         output_files.append(mrbayes_dir_path / "fna")
+
 if "mrbayes_protein" in config:
     if config["mrbayes_protein"]:
         output_files.append(mrbayes_dir_path / "faa")
 
-localrules: all
+localrules:
+    all,
+
 
 rule all:
     input:
-        output_files
+        output_files,
+
 
 # ---- Load rules ----
+include: "workflow/rules/vcf_reconstruct.smk"
 include: "workflow/rules/quastcore.smk"
 include: "workflow/rules/busco.smk"
 include: "workflow/rules/common_ids.smk"
@@ -255,6 +263,6 @@ include: "workflow/rules/iqtree.smk"
 include: "workflow/rules/mrbayes.smk"
 include: "workflow/rules/visualization.smk"
 include: "workflow/rules/astral.smk"
-include: "workflow/rules/raxml.smk"
 include: "workflow/rules/rapidnj.smk"
 include: "workflow/rules/phylip.smk"
+include: "workflow/rules/raxml.smk"
\ No newline at end of file
diff --git a/config/default.yaml b/config/default.yaml
index 23e0678..feda1fa 100644
--- a/config/default.yaml
+++ b/config/default.yaml
@@ -1,107 +1,148 @@
-conda_config: "workflow/envs/conda.yaml"
-ete3_conda_config: "workflow/envs/ete3_conda.yaml"
+# ---- Conda ----
+"conda":
+    "buscoclade":
+        "yaml": "workflow/envs/main.yaml"
+        "name": "buscoclade"
+    "buscoclade_ete3":
+        "yaml": "workflow/envs/ete3.yaml"
+        "name": "buscoclade_ete3"
+    "buscoclade_gatk":
+        "yaml": "workflow/envs/gatk.yaml"
+        "name": "buscoclade_gatk"
+
+"use_existing_envs": True # Pipeline will use pre-installed envs
 
 # +------------------------+
 # | Pipeline configuration |
 # False or comment line to disable
 # +------------------------+
+
 # ---- Assembly statistics ----
-quastcore: True
+quastcore: True
+
 # ---- Alignment ----
-dna_alignment: 'mafft' # 'prank' or 'mafft'
+dna_alignment: 'prank' # 'prank' or 'mafft'
 protein_alignment: False # 'prank' or 'mafft'
+
 # ---- Filtration ----
 dna_filtration: 'gblocks' # 'gblocks' or 'trimal'
 protein_filtration: False # 'gblocks' or 'trimal'
+
 # ---- Phylogenetic tree inference ----
 iqtree_dna: True
-iqtree_protein: True
+iqtree_protein: False
 mrbayes_dna: False
-mrbayes_protein: True
+mrbayes_protein: False
 astral: True # use iqtree for each dna alignment
 rapidnj: True
 phylip: True
 raxml: True
+
 # ---- Phylogenetic tree visualization ----
 draw_phylotrees: True
 
 # +-----------------+
 # | Tool parameters |
 # +-----------------+
+# ---- VCF ----
+vcf_gatk: False # .vcf to .fasta reconstruction using gatk package and reference .fasta
+vcf2phylip: True # .vcf to concat_alignment reconstruction
+gatk_path: "" # gatk installation directory
+
 # ---- Assembly statistics ----
-quastcore_params: "-m 0 150 500 1000" # cutoffs by minimum scaffold length
+quastcore_params: "-m 0 150 500 1000" # Cutoffs by minimum scaffold length
+
 # ---- BUSCO ----
 busco_gene_prediction_tool: "metaeuk" # 'metaeuk' or 'augustus'
-busco_augustus_species: "human" # used only if busco_gene_prediction_tool == 'augustus'
-busco_dataset_path: "" # path to orthodb directory
-busco_options: "--offline"
-busco_mode: "genome"
+busco_augustus_species: "human" # Used only if busco_gene_prediction_tool == 'augustus'
+busco_dataset_path: "/mnt/tank/scratch/ponomarev/busco/saccharomycetes_odb10/" # Path to orthodb directory
+busco_options: "--offline --metaeuk_parameters='--disk-space-limit=1000M,--remove-tmp-files=1' --metaeuk_rerun_parameters='--disk-space-limit=1000M,--remove-tmp-files=1'"
+busco_mode: "genome"
 busco_histogram_colors: "#23b4e8,#008dbf,#fbbc04,#ea4335" # per S, D, F, M BUSCOs
+
 # ---- main IDs ----
 gene_blacklist: # IDs of genes to be deleted from main_ids
   - 1155at40674 # long sequences
+
 # ---- Alignment ----
 prank_dna_params: "-codon"
 mafft_dna_params: ""
 prank_protein_params: ""
 mafft_protein_params: "--anysymbol"
+
 # ---- Filtration ----
 gblocks_dna_params: "-t=Codons"
 gblocks_protein_params: "-t=Protein"
 trimal_dna_params: "-automated1"
 trimal_protein_params: "-automated1"
+
 # ---- Concatenation of alignments ----
 alignment_file_prefix: "concat_alignment"
+
 # ---- Phylogenetic tree inference ----
 # IQtree:
 iqtree_dna_params: "-m TESTNEW -bb 1000" # -o homo_sapiens
 iqtree_protein_params: "-m TESTNEW -bb 1000" # -o homo_sapiens
+
 # Astral:
 iqtree_per_fna_params: "-m TESTNEW -bb 1000 -keep-ident"
 nodes_filtrataion_by_support: 70
 astral_params: "--branch-annotate 2 --reps 1000"
+
 # MrBayes:
 mrbayes_dna_params: ""
 mrbayes_protein_params: ""
 mrbayes_block: "workflow/envs/mrbayes.block"
-mrbayes_path: # path of MrBayes binary file
+mrbayes_path: # Path of MrBayes binary file
+
 # RapidNJ
 rapidnj_params: "-b 1000"
+
 # PHYLIP
-phylip_dnadist_params: "D\n" # use 'D\n' to set model to 'Kimura 2-parameter'. F84 model (default) if "".
-phylip_neighbor_params: "" # use 'N\n' to set UPGMA. NJ (default) if "".
+phylip_dnadist_params: "D\n" # Use 'D\n' to set model to 'Kimura 2-parameter'. F84 model (default) if "".
+phylip_neighbor_params: "" # Use 'N\n' to set UPGMA. NJ (default) if "".
 #RAXML
 raxml_params: "--model GTR+G --bs-trees 100"
+
 # ---- Phylogenetic tree visualization ----
-tree_visualization_params: "" # "--outgroup Homo_sapiens" or "--outgroup species_1,species_2"
+tree_visualization_params: "" # Specify the visualization outgroup as it is named in the alignment: "--outgroup Homo sapiens" or "--outgroup species 1,species 2"
 
 # +---------------------+
 # | Directory structure |
 # +---------------------+
-genome_dir: "genomes"
-output_dir: "results"
-quastcore_dir: "assembly_stats"
-busco_dir: "busco"
+# ---- Input ----
+genome_dir: "input/genomes/"
+vcf_reconstruct_dir: "input/vcf_reconstruct/"
+
+# ---- Output ----
+output_dir: "results/"
+altref_dir: "altref_genomes/"
+quastcore_dir: "assembly_stats/"
+busco_dir: "busco/"
 main_ids_dir: "ids/"
-species_ids_dir: "ids/species_ids"
-common_ids_dir: "ids/common_ids"
-merged_sequences_dir: "ids/merged_sequences"
+species_ids_dir: "ids/species_ids/"
+common_ids_dir: "ids/common_ids/"
+merged_sequences_dir: "ids/merged_sequences/"
 alignments_dir: "alignments/raw/"
 filtered_alignments_dir: "alignments/filtered/"
-concat_alignments_dir: "concat_alignments"
-iqtree_dir: "phylogeny/iqtree"
-mrbayes_dir: "phylogeny/mrbayes"
-astral_dir: "phylogeny/astral"
-rapidnj_dir: "phylogeny/rapidnj"
-phylip_dir: "phylogeny/phylip"
+concat_alignments_dir: "concat_alignments/"
+iqtree_dir: "phylogeny/iqtree/"
+mrbayes_dir: "phylogeny/mrbayes/"
+astral_dir: "phylogeny/astral/"
+rapidnj_dir: "phylogeny/rapidnj/"
+phylip_dir: "phylogeny/phylip/"
 raxml_dir: "phylogeny/raxml"
-log_dir: "logs"
-cluster_log_dir: "cluster_logs"
-benchmark_dir: "benchmarks"
+
+log_dir: "logs/"
+cluster_log_dir: "cluster_logs/"
+benchmark_dir: "benchmarks/"
 
 # +-----------+
 # | Resources |
 # +-----------+
+
+# NOTE : Be aware of Snakemake parallelization when specifying resources
+
 # ---- Slurm partition ----
 busco_queue: "main"
 alignment_queue: "main"
@@ -115,34 +156,35 @@ raxml_queue: "main"
 processing_queue: "main"
 
 # ---- Tool threads ----
-busco_threads: 20 # For mammals, around 30-60 if resources allow (not mandatory)
-mafft_threads: 2
-prank_threads: 1
-gblocks_threads: 2
-trimal_threads: 2
-iqtree_threads: 16
-mrbayes_threads: 16
-iqtree_per_fna_threads: 8
-astral_threads: 8
-rapidnj_threads: 8
-phylip_threads: 2
+busco_threads: 40
+mafft_threads: 8
+prank_threads: 8
+gblocks_threads: 8
+trimal_threads: 8
+iqtree_threads: 40
+mrbayes_threads: 40
+iqtree_per_fna_threads: 40
+astral_threads: 40
+rapidnj_threads: 40
+phylip_threads: 40
 raxml_threads: 16
-processing_threads: 2
+processing_threads: 20
 
 # ---- Tool memory ----
-busco_mem_mb: 10000
-mafft_mem_mb: 2000
-prank_mem_mb: 2000
-gblocks_mem_mb: 2000
-trimal_mem_mb: 2000
-iqtree_mem_mb: 10000
+busco_mem_mb: 120000
+mafft_mem_mb: 20000
+prank_mem_mb: 20000
+gblocks_mem_mb: 20000
+trimal_mem_mb: 20000
+iqtree_mem_mb: 40000
 mrbayes_mem_mb: 10000
-iqtree_per_fna_mem_mb: 10000
+iqtree_per_fna_mem_mb: 20000
 astral_mem_mb: 10000
-rapidnj_mem_mb: 8000
-phylip_mem_mb: 4000
-raxml_mem_mb: 10000
+rapidnj_mem_mb: 40000
+phylip_mem_mb: 40000
+raxml_mem_mb: 40000
-processing_mem_mb: 2000
+processing_mem_mb: 20000
 
 # ---- Tool time ----
 busco_time: "150:00:00"
@@ -153,9 +195,9 @@
 trimal_time: "10:00:00"
 iqtree_time: "100:00:00"
 mrbayes_time: "100:00:00"
 iqtree_per_fna_time: "10:00:00"
-astral_time: "5:00:00"
-rapidnj_time: "5:00:00"
-phylip_time: "5:00:00"
+astral_time: "50:00:00"
+rapidnj_time: "150:00:00"
+phylip_time: "50:00:00"
 raxml_time: "5:00:00"
 processing_time: "5:00:00"

diff --git a/genomes/DESCRIPTION 
b/input/genomes/DESCRIPTION similarity index 100% rename from genomes/DESCRIPTION rename to input/genomes/DESCRIPTION diff --git a/workflow/envs/ete3_conda.yaml b/workflow/envs/ete3.yaml similarity index 94% rename from workflow/envs/ete3_conda.yaml rename to workflow/envs/ete3.yaml index 91d9091..d178b2f 100644 --- a/workflow/envs/ete3_conda.yaml +++ b/workflow/envs/ete3.yaml @@ -10,3 +10,4 @@ dependencies: - ete3 - pip: - supervenn + - regex \ No newline at end of file diff --git a/workflow/envs/gatk.yaml b/workflow/envs/gatk.yaml new file mode 100644 index 0000000..d6e7a4e --- /dev/null +++ b/workflow/envs/gatk.yaml @@ -0,0 +1,11 @@ +name: buscoclade_gatk +channels: + - conda-forge + - bioconda +dependencies: + - openjdk=17 + - picard + - samtools + - bcftools + - gatk4 + - seqtk diff --git a/workflow/envs/conda.yaml b/workflow/envs/main.yaml similarity index 87% rename from workflow/envs/conda.yaml rename to workflow/envs/main.yaml index 7f01fe0..ea6b384 100644 --- a/workflow/envs/conda.yaml +++ b/workflow/envs/main.yaml @@ -1,10 +1,9 @@ -# ---- Conda config ---- name: buscoclade channels: - conda-forge - bioconda dependencies: - - busco=5.8.0 + - busco=5.5.0 - biopython - prank - mafft @@ -23,3 +22,5 @@ dependencies: # - mrbayes - pip: - "--editable=git+https://github.com/tomarovsky/Biocrutch.git#egg=Biocrutch" + - argparse + - indexed_gzip \ No newline at end of file diff --git a/workflow/rules/alignment.smk b/workflow/rules/alignment.smk index 27a473d..e3fb4f7 100644 --- a/workflow/rules/alignment.smk +++ b/workflow/rules/alignment.smk @@ -16,7 +16,7 @@ if "dna_alignment" in config: benchmark: benchmark_dir_path / "prank_dna.{N}.benchmark.txt" conda: - "../../%s" % config["conda_config"] + config["conda"]["buscoclade"]["name"] if config["use_existing_envs"] else ("../../%s" % config["conda"]["buscoclade"]["yaml"]) resources: queue=config["alignment_queue"], cpus=config["prank_threads"], @@ -43,7 +43,7 @@ if "dna_alignment" in config: benchmark: benchmark_dir_path / "mafft_dna.{N}.benchmark.txt" conda: - "../../%s" % config["conda_config"] + config["conda"]["buscoclade"]["name"] if config["use_existing_envs"] else ("../../%s" % config["conda"]["buscoclade"]["yaml"]) resources: queue=config["alignment_queue"], cpus=config["mafft_threads"], diff --git a/workflow/rules/astral.smk b/workflow/rules/astral.smk index 3a44079..1ef8916 100644 --- a/workflow/rules/astral.smk +++ b/workflow/rules/astral.smk @@ -1,104 +1,105 @@ -if "astral" in config: +if config["vcf2phylip"] != True: + if "astral" in config: - rule iqtree_per_fna: - input: - filtered_alignments_dir_path / "fna" / "{N}.fna", - output: - astral_dir_path / "iqtree_per_fna" / "{N}.fna.bionj", - astral_dir_path / "iqtree_per_fna" / "{N}.fna.ckp.gz", - astral_dir_path / "iqtree_per_fna" / "{N}.fna.iqtree", - astral_dir_path / "iqtree_per_fna" / "{N}.fna.log", - astral_dir_path / "iqtree_per_fna" / "{N}.fna.mldist", - astral_dir_path / "iqtree_per_fna" / "{N}.fna.model.gz", - astral_dir_path / "iqtree_per_fna" / "{N}.fna.treefile", - params: - options=config["iqtree_per_fna_params"], - outdir=astral_dir_path / "iqtree_per_fna", - prefix=lambda wildcards, output: output[0][:-6], - log: - std=log_dir_path / "iqtree_per_fna.{N}.log", - cluster_log=cluster_log_dir_path / "iqtree_per_fna.{N}.cluster.log", - cluster_err=cluster_log_dir_path / "iqtree_per_fna.{N}.cluster.err", - benchmark: - benchmark_dir_path / "iqtree_per_fna.{N}.benchmark.txt" - conda: - "../../%s" % config["conda_config"] - resources: - queue=config["astral_queue"], - 
cpus=config["iqtree_per_fna_threads"], - time=config["iqtree_per_fna_time"], - mem_mb=config["iqtree_per_fna_mem_mb"], - threads: config["iqtree_per_fna_threads"] - shell: - " mkdir -p {params.outdir}; " - " iqtree -nt {threads} -s {input} --prefix {params.prefix} {params.options} > {log.std} 2>&1; " + rule iqtree_per_fna: + input: + filtered_alignments_dir_path / "fna" / "{N}.fna", + output: + astral_dir_path / "iqtree_per_fna" / "{N}.fna.bionj", + astral_dir_path / "iqtree_per_fna" / "{N}.fna.ckp.gz", + astral_dir_path / "iqtree_per_fna" / "{N}.fna.iqtree", + astral_dir_path / "iqtree_per_fna" / "{N}.fna.log", + astral_dir_path / "iqtree_per_fna" / "{N}.fna.mldist", + astral_dir_path / "iqtree_per_fna" / "{N}.fna.model.gz", + astral_dir_path / "iqtree_per_fna" / "{N}.fna.treefile", + params: + options=config["iqtree_per_fna_params"], + outdir=astral_dir_path / "iqtree_per_fna", + prefix=lambda wildcards, output: output[0][:-6], + log: + std=log_dir_path / "iqtree_per_fna.{N}.log", + cluster_log=cluster_log_dir_path / "iqtree_per_fna.{N}.cluster.log", + cluster_err=cluster_log_dir_path / "iqtree_per_fna.{N}.cluster.err", + benchmark: + benchmark_dir_path / "iqtree_per_fna.{N}.benchmark.txt" + conda: + config["conda"]["buscoclade"]["name"] if config["use_existing_envs"] else ("../../%s" % config["conda"]["buscoclade"]["yaml"]) + resources: + queue=config["astral_queue"], + cpus=config["iqtree_per_fna_threads"], + time=config["iqtree_per_fna_time"], + mem_mb=config["iqtree_per_fna_mem_mb"], + threads: config["iqtree_per_fna_threads"] + shell: + " mkdir -p {params.outdir}; " + " iqtree -nt {threads} -s {input} --prefix {params.prefix} {params.options} > {log.std} 2>&1; " - rule concat_newick_files: - input: - lambda w: expand_fna_from_merged_sequences(w, astral_dir_path / "iqtree_per_fna" / "{N}.fna.treefile"), - output: - astral_dir_path / astral_input_trees, - log: - std=log_dir_path / "concat_newick_files.log", - cluster_log=cluster_log_dir_path / "concat_newick_files.cluster.log", - cluster_err=cluster_log_dir_path / "concat_newick_files.cluster.err", - benchmark: - benchmark_dir_path / "concat_newick_files.benchmark.txt" - conda: - "../../%s" % config["conda_config"] - resources: - queue=config["processing_queue"], - cpus=config["processing_threads"], - time=config["processing_time"], - mem_mb=config["processing_mem_mb"], - threads: config["processing_threads"] - shell: - " cat {input} > {output} 2> {log.std}; " + rule concat_newick_files: + input: + lambda w: expand_fna_from_merged_sequences(w, astral_dir_path / "iqtree_per_fna" / "{N}.fna.treefile"), + output: + astral_dir_path / astral_input_trees, + log: + std=log_dir_path / "concat_newick_files.log", + cluster_log=cluster_log_dir_path / "concat_newick_files.cluster.log", + cluster_err=cluster_log_dir_path / "concat_newick_files.cluster.err", + benchmark: + benchmark_dir_path / "concat_newick_files.benchmark.txt" + conda: + config["conda"]["buscoclade"]["name"] if config["use_existing_envs"] else ("../../%s" % config["conda"]["buscoclade"]["yaml"]) + resources: + queue=config["processing_queue"], + cpus=config["processing_threads"], + time=config["processing_time"], + mem_mb=config["processing_mem_mb"], + threads: config["processing_threads"] + shell: + " cat {input} > {output} 2> {log.std}; " - rule nodes_filtrataion_by_support: - input: - astral_dir_path / astral_input_trees, - output: - astral_dir_path / astral_filtered_trees, - params: - support=config["nodes_filtrataion_by_support"], - log: - std=log_dir_path / 
"nodes_filtrataion_by_support.log", - cluster_log=cluster_log_dir_path / "nodes_filtrataion_by_support.cluster.log", - cluster_err=cluster_log_dir_path / "nodes_filtrataion_by_support.cluster.err", - benchmark: - benchmark_dir_path / "nodes_filtrataion_by_support.benchmark.txt" - conda: - "../../%s" % config["conda_config"] - resources: - queue=config["processing_queue"], - cpus=config["processing_threads"], - time=config["processing_time"], - mem_mb=config["processing_mem_mb"], - threads: config["processing_threads"] - shell: - " nw_ed {input} 'i & b<{params.support}' o > {output} 2> {log.std}; " + rule nodes_filtrataion_by_support: + input: + astral_dir_path / astral_input_trees, + output: + astral_dir_path / astral_filtered_trees, + params: + support=config["nodes_filtrataion_by_support"], + log: + std=log_dir_path / "nodes_filtrataion_by_support.log", + cluster_log=cluster_log_dir_path / "nodes_filtrataion_by_support.cluster.log", + cluster_err=cluster_log_dir_path / "nodes_filtrataion_by_support.cluster.err", + benchmark: + benchmark_dir_path / "nodes_filtrataion_by_support.benchmark.txt" + conda: + config["conda"]["buscoclade"]["name"] if config["use_existing_envs"] else ("../../%s" % config["conda"]["buscoclade"]["yaml"]) + resources: + queue=config["processing_queue"], + cpus=config["processing_threads"], + time=config["processing_time"], + mem_mb=config["processing_mem_mb"], + threads: config["processing_threads"] + shell: + " nw_ed {input} 'i & b<{params.support}' o > {output} 2> {log.std}; " - rule astral_tree: - input: - astral_dir_path / astral_filtered_trees, - output: - astral_dir_path / astral_tree, - params: - config["astral_params"], - log: - std=log_dir_path / "astral_tree.log", - cluster_log=cluster_log_dir_path / "astral_tree.cluster.log", - cluster_err=cluster_log_dir_path / "astral_tree.cluster.err", - benchmark: - benchmark_dir_path / "astral_tree.benchmark.txt" - conda: - "../../%s" % config["conda_config"] - resources: - queue=config["astral_queue"], - cpus=config["astral_threads"], - time=config["astral_time"], - mem_mb=config["astral_mem_mb"], - threads: config["astral_threads"] - shell: - " astral -i {input} -o {output} {params} > {log.std} 2>&1" + rule astral_tree: + input: + astral_dir_path / astral_filtered_trees, + output: + astral_dir_path / astral_tree, + params: + config["astral_params"], + log: + std=log_dir_path / "astral_tree.log", + cluster_log=cluster_log_dir_path / "astral_tree.cluster.log", + cluster_err=cluster_log_dir_path / "astral_tree.cluster.err", + benchmark: + benchmark_dir_path / "astral_tree.benchmark.txt" + conda: + config["conda"]["buscoclade"]["name"] if config["use_existing_envs"] else ("../../%s" % config["conda"]["buscoclade"]["yaml"]) + resources: + queue=config["astral_queue"], + cpus=config["astral_threads"], + time=config["astral_time"], + mem_mb=config["astral_mem_mb"], + threads: config["astral_threads"] + shell: + " astral -i {input} -o {output} {params} > {log.std} 2>&1" diff --git a/workflow/rules/busco.smk b/workflow/rules/busco.smk index 34187b5..b7b03ed 100644 --- a/workflow/rules/busco.smk +++ b/workflow/rules/busco.smk @@ -20,7 +20,7 @@ if config["busco_gene_prediction_tool"] == "metaeuk": benchmark: benchmark_dir_path / "busco.{species}.benchmark.txt" conda: - "../../%s" % config["conda_config"] + config["conda"]["buscoclade"]["name"] if config["use_existing_envs"] else ("../../%s" % config["conda"]["buscoclade"]["yaml"]) resources: queue=config["busco_queue"], cpus=config["busco_threads"], @@ -64,7 +64,7 @@ elif 
config["busco_gene_prediction_tool"] == "augustus": benchmark: benchmark_dir_path / "busco.{species}.benchmark.txt" conda: - "../../%s" % config["conda_config"] + config["conda"]["buscoclade"]["name"] if config["use_existing_envs"] else ("../../%s" % config["conda"]["buscoclade"]["yaml"]) resources: queue=config["busco_queue"], cpus=config["busco_threads"], diff --git a/workflow/rules/common_ids.smk b/workflow/rules/common_ids.smk index 0c8d975..8be6a27 100644 --- a/workflow/rules/common_ids.smk +++ b/workflow/rules/common_ids.smk @@ -1,94 +1,95 @@ -localrules: - species_single_copy_ids, - species_multi_copy_ids, - common_ids, +if config["vcf2phylip"] != True: + localrules: + species_single_copy_ids, + species_multi_copy_ids, + common_ids, -rule species_single_copy_ids: # get single copy ids for each species - input: - busco_dir_path / "{species}/busco_sequences/single_copy_busco_sequences", - output: - species_ids_dir_path / "single_copy/{species}.ids", - log: - std=log_dir_path / "species_ids.{species}.log", - cluster_log=cluster_log_dir_path / "species_ids.{species}.cluster.log", - cluster_err=cluster_log_dir_path / "species_ids.{species}.cluster.err", - benchmark: - benchmark_dir_path / "species_ids.{species}.benchmark.txt" - resources: - queue=config["processing_queue"], - cpus=config["processing_threads"], - time=config["processing_time"], - mem_mb=config["processing_mem_mb"], - threads: config["processing_threads"] - shell: - " ls {input} | grep -P '.fna$' | sed 's/.fna//' > {output} 2> {log.std}; " + rule species_single_copy_ids: # get single copy ids for each species + input: + busco_dir_path / "{species}/busco_sequences/single_copy_busco_sequences", + output: + species_ids_dir_path / "single_copy/{species}.ids", + log: + std=log_dir_path / "species_ids.{species}.log", + cluster_log=cluster_log_dir_path / "species_ids.{species}.cluster.log", + cluster_err=cluster_log_dir_path / "species_ids.{species}.cluster.err", + benchmark: + benchmark_dir_path / "species_ids.{species}.benchmark.txt" + resources: + queue=config["processing_queue"], + cpus=config["processing_threads"], + time=config["processing_time"], + mem_mb=config["processing_mem_mb"], + threads: config["processing_threads"] + shell: + " ls {input} | grep -P '.fna$' | sed 's/.fna//' > {output} 2> {log.std}; " -rule species_multi_copy_ids: # get multi copy ids for each species - input: - busco_dir_path / "{species}/busco_sequences/multi_copy_busco_sequences", - output: - species_ids_dir_path / "multi_copy/{species}.ids", - log: - std=log_dir_path / "species_ids.{species}.log", - cluster_log=cluster_log_dir_path / "species_ids.{species}.cluster.log", - cluster_err=cluster_log_dir_path / "species_ids.{species}.cluster.err", - benchmark: - benchmark_dir_path / "species_ids.{species}.benchmark.txt" - resources: - queue=config["processing_queue"], - cpus=config["processing_threads"], - time=config["processing_time"], - mem_mb=config["processing_mem_mb"], - threads: config["processing_threads"] - shell: - " ls {input} | grep -P '.fna$' | sed 's/.fna//' > {output} 2> {log.std}; " + rule species_multi_copy_ids: # get multi copy ids for each species + input: + busco_dir_path / "{species}/busco_sequences/multi_copy_busco_sequences", + output: + species_ids_dir_path / "multi_copy/{species}.ids", + log: + std=log_dir_path / "species_ids.{species}.log", + cluster_log=cluster_log_dir_path / "species_ids.{species}.cluster.log", + cluster_err=cluster_log_dir_path / "species_ids.{species}.cluster.err", + benchmark: + benchmark_dir_path / 
"species_ids.{species}.benchmark.txt" + resources: + queue=config["processing_queue"], + cpus=config["processing_threads"], + time=config["processing_time"], + mem_mb=config["processing_mem_mb"], + threads: config["processing_threads"] + shell: + " ls {input} | grep -P '.fna$' | sed 's/.fna//' > {output} 2> {log.std}; " -rule common_ids: # get common IDs for all species given config["gene_blacklist"] and split them into files - input: - expand(species_ids_dir_path / "single_copy/{species}.ids", species=config["species_list"]), - output: - common_ids_dir_path / "common.ids", - params: - nfiles=len(config["species_list"]), - gene_blacklist=config["gene_blacklist"], - log: - std=log_dir_path / "common_ids.log", - cluster_log=cluster_log_dir_path / "common_ids.cluster.log", - cluster_err=cluster_log_dir_path / "common_ids.cluster.err", - benchmark: - benchmark_dir_path / "common_ids.benchmark.txt" - resources: - queue=config["processing_queue"], - cpus=config["processing_threads"], - time=config["processing_time"], - mem_mb=config["processing_mem_mb"], - threads: config["processing_threads"] - shell: - " cat {input} | sort | uniq -c | awk '{{if($1=={params.nfiles}){{print $2}}}}' > {output} 2> {log.std}; " - ' for id in {params.gene_blacklist}; do sed -i "/$id$/d" {output}; done 2> {log.std}; ' + rule common_ids: # get common IDs for all species given config["gene_blacklist"] and split them into files + input: + expand(species_ids_dir_path / "single_copy/{species}.ids", species=config["species_list"]), + output: + common_ids_dir_path / "common.ids", + params: + nfiles=len(config["species_list"]), + gene_blacklist=config["gene_blacklist"], + log: + std=log_dir_path / "common_ids.log", + cluster_log=cluster_log_dir_path / "common_ids.cluster.log", + cluster_err=cluster_log_dir_path / "common_ids.cluster.err", + benchmark: + benchmark_dir_path / "common_ids.benchmark.txt" + resources: + queue=config["processing_queue"], + cpus=config["processing_threads"], + time=config["processing_time"], + mem_mb=config["processing_mem_mb"], + threads: config["processing_threads"] + shell: + " cat {input} | sort | uniq -c | awk '{{if($1=={params.nfiles}){{print $2}}}}' > {output} 2> {log.std}; " + ' for id in {params.gene_blacklist}; do sed -i "/$id$/d" {output}; done 2> {log.std}; ' -checkpoint merged_sequences: # get merged sequences by common IDs - input: - rules.common_ids.output, - output: - directory(merged_sequences_dir_path), - params: - single_copy_files=expand(busco_dir_path / "{species}/busco_sequences/single_copy_busco_sequences", species=config["species_list"]), - log: - std=log_dir_path / "merged_sequences.log", - cluster_log=cluster_log_dir_path / "merged_sequences.cluster.log", - cluster_err=cluster_log_dir_path / "merged_sequences.cluster.err", - benchmark: - benchmark_dir_path / "merged_sequences.benchmark.txt" - resources: - queue=config["processing_queue"], - cpus=config["processing_threads"], - time=config["processing_time"], - mem_mb=config["processing_mem_mb"], - threads: config["processing_threads"] - shell: - " workflow/scripts/merge_common_ids.py -c {input} -s {params.single_copy_files} -o {output} > {log.std} 2>&1; " + checkpoint merged_sequences: # get merged sequences by common IDs + input: + rules.common_ids.output, + output: + directory(merged_sequences_dir_path), + params: + single_copy_files=expand(busco_dir_path / "{species}/busco_sequences/single_copy_busco_sequences", species=config["species_list"]), + log: + std=log_dir_path / "merged_sequences.log", + 
cluster_log=cluster_log_dir_path / "merged_sequences.cluster.log",
+            cluster_err=cluster_log_dir_path / "merged_sequences.cluster.err",
+        benchmark:
+            benchmark_dir_path / "merged_sequences.benchmark.txt"
+        resources:
+            queue=config["processing_queue"],
+            cpus=config["processing_threads"],
+            time=config["processing_time"],
+            mem_mb=config["processing_mem_mb"],
+        threads: config["processing_threads"]
+        shell:
+            " workflow/scripts/merge_common_ids.py -c {input} -s {params.single_copy_files} -o {output} > {log.std} 2>&1; "
diff --git a/workflow/rules/concat_alignments.smk b/workflow/rules/concat_alignments.smk
index 8d5312e..90026b7 100644
--- a/workflow/rules/concat_alignments.smk
+++ b/workflow/rules/concat_alignments.smk
@@ -1,8 +1,13 @@
 rule concat_fasta_dna:
     input:
-        lambda w: expand_fna_from_merged_sequences(w, filtered_alignments_dir_path / "fna" / "{N}.fna"),
+        lambda w: (
+            rules.consolidate_vcf2phylip.output
+            if config["vcf2phylip"]
+            else expand_fna_from_merged_sequences(w, filtered_alignments_dir_path / "fna" / "{N}.fna")),
     output:
         concat_alignments_dir_path / fasta_dna_filename,
+    params:
+        vcf2phylip=config["vcf2phylip"],
     log:
         std=log_dir_path / "concat_fasta_dna.log",
         cluster_log=cluster_log_dir_path / "concat_fasta_dna.cluster.log",
@@ -10,14 +15,20 @@ rule concat_fasta_dna:
     benchmark:
         benchmark_dir_path / "concat_fasta_dna.benchmark.txt"
     conda:
-        "../../%s" % config["conda_config"]
+        config["conda"]["buscoclade"]["name"] if config["use_existing_envs"] else ("../../%s" % config["conda"]["buscoclade"]["yaml"])
     resources:
         queue=config["processing_queue"],
        cpus=config["processing_threads"],
        time=config["processing_time"],
        mem_mb=config["processing_mem_mb"],
    shell:
-        " workflow/scripts/concat_fasta.py -i {input} -o {output} 1> {log.std} 2>&1; "
+        """
+        if [ "{params.vcf2phylip}" = "True" ]; then
+            mv {input} {output} 1> {log.std} 2>&1
+        else
+            workflow/scripts/concat_fasta.py -i {input} -o {output} 1> {log.std} 2>&1
+        fi
+        """
 
 
 rule concat_fasta_protein:
@@ -32,7 +43,7 @@
     benchmark:
         benchmark_dir_path / "concat_fasta_protein.benchmark.txt"
     conda:
-        "../../%s" % config["conda_config"]
+        config["conda"]["buscoclade"]["name"] if config["use_existing_envs"] else ("../../%s" % config["conda"]["buscoclade"]["yaml"])
     resources:
         queue=config["processing_queue"],
         cpus=config["processing_threads"],
@@ -57,7 +68,7 @@ rule concat_nexus_dna:
     benchmark:
         benchmark_dir_path / "concat_nexus_dna.benchmark.txt"
     conda:
-        "../../%s" % config["conda_config"]
+        config["conda"]["buscoclade"]["name"] if config["use_existing_envs"] else ("../../%s" % config["conda"]["buscoclade"]["yaml"])
     resources:
         queue=config["processing_queue"],
         cpus=config["processing_threads"],
@@ -83,7 +94,7 @@ rule concat_nexus_protein:
     benchmark:
         benchmark_dir_path / "concat_nexus_protein.benchmark.txt"
     conda:
-        "../../%s" % config["conda_config"]
+        config["conda"]["buscoclade"]["name"] if config["use_existing_envs"] else ("../../%s" % config["conda"]["buscoclade"]["yaml"])
     resources:
         queue=config["processing_queue"],
         cpus=config["processing_threads"],
@@ -106,7 +117,7 @@ rule concat_stockholm_dna:
     benchmark:
         benchmark_dir_path / "concat_stockholm_dna.benchmark.txt"
     conda:
-        "../../%s" % config["conda_config"]
+        config["conda"]["buscoclade"]["name"] if config["use_existing_envs"] else ("../../%s" % config["conda"]["buscoclade"]["yaml"])
     resources:
         queue=config["processing_queue"],
         cpus=config["processing_threads"],
@@ -128,7 +139,7 @@ rule concat_stockholm_protein:
     benchmark:
         benchmark_dir_path / 
"concat_stockholm_protein.benchmark.txt" conda: - "../../%s" % config["conda_config"] + config["conda"]["buscoclade"]["name"] if config["use_existing_envs"] else ("../../%s" % config["conda"]["buscoclade"]["yaml"]) resources: queue=config["processing_queue"], cpus=config["processing_threads"], @@ -152,7 +163,7 @@ rule concat_phylip_dna: benchmark: benchmark_dir_path / "concat_phylip_dna.benchmark.txt" conda: - "../../%s" % config["conda_config"] + config["conda"]["buscoclade"]["name"] if config["use_existing_envs"] else ("../../%s" % config["conda"]["buscoclade"]["yaml"]) resources: queue=config["processing_queue"], cpus=config["processing_threads"], @@ -177,7 +188,7 @@ rule concat_phylip_protein: benchmark: benchmark_dir_path / "concat_phylip_protein.benchmark.txt" conda: - "../../%s" % config["conda_config"] + config["conda"]["buscoclade"]["name"] if config["use_existing_envs"] else ("../../%s" % config["conda"]["buscoclade"]["yaml"]) resources: queue=config["processing_queue"], cpus=config["processing_threads"], diff --git a/workflow/rules/filtration.smk b/workflow/rules/filtration.smk index c2bf0be..82af0e1 100644 --- a/workflow/rules/filtration.smk +++ b/workflow/rules/filtration.smk @@ -16,7 +16,7 @@ if "dna_filtration" in config: benchmark: benchmark_dir_path / "{N}.fna.gblocks.benchmark.txt" conda: - "../../%s" % config["conda_config"] + config["conda"]["buscoclade"]["name"] if config["use_existing_envs"] else ("../../%s" % config["conda"]["buscoclade"]["yaml"]) resources: queue=config["filtration_queue"], cpus=config["gblocks_threads"], @@ -45,7 +45,7 @@ if "dna_filtration" in config: benchmark: benchmark_dir_path / "trimal_dna.{N}.benchmark.txt" conda: - "../../%s" % config["conda_config"] + config["conda"]["buscoclade"]["name"] if config["use_existing_envs"] else ("../../%s" % config["conda"]["buscoclade"]["yaml"]) resources: queue=config["filtration_queue"], cpus=config["trimal_threads"], @@ -75,7 +75,7 @@ if "protein_filtration" in config: benchmark: benchmark_dir_path / "{N}.faa.gblocks.benchmark.txt" conda: - "../../%s" % config["conda_config"] + config["conda"]["buscoclade"]["name"] if config["use_existing_envs"] else ("../../%s" % config["conda"]["buscoclade"]["yaml"]) resources: queue=config["filtration_queue"], cpus=config["gblocks_threads"], @@ -104,7 +104,7 @@ if "protein_filtration" in config: benchmark: benchmark_dir_path / "trimal_protein.{N}.benchmark.txt" conda: - "../../%s" % config["conda_config"] + config["conda"]["buscoclade"]["name"] if config["use_existing_envs"] else ("../../%s" % config["conda"]["buscoclade"]["yaml"]) resources: queue=config["filtration_queue"], cpus=config["trimal_threads"], diff --git a/workflow/rules/iqtree.smk b/workflow/rules/iqtree.smk index 2954867..a11a20e 100644 --- a/workflow/rules/iqtree.smk +++ b/workflow/rules/iqtree.smk @@ -22,7 +22,7 @@ if config["iqtree_dna"]: benchmark: benchmark_dir_path / "iqtree_dna.benchmark.txt" conda: - "../../%s" % config["conda_config"] + config["conda"]["buscoclade"]["name"] if config["use_existing_envs"] else ("../../%s" % config["conda"]["buscoclade"]["yaml"]) resources: queue=config["iqtree_queue"], cpus=config["iqtree_threads"], @@ -58,7 +58,7 @@ if config["iqtree_protein"]: benchmark: benchmark_dir_path / "iqtree_protein.benchmark.txt" conda: - "../../%s" % config["conda_config"] + config["conda"]["buscoclade"]["name"] if config["use_existing_envs"] else ("../../%s" % config["conda"]["buscoclade"]["yaml"]) resources: queue=config["iqtree_queue"], cpus=config["iqtree_threads"], diff --git 
a/workflow/rules/phylip.smk b/workflow/rules/phylip.smk index e71416b..9956959 100644 --- a/workflow/rules/phylip.smk +++ b/workflow/rules/phylip.smk @@ -17,7 +17,7 @@ rule phylip_dnadist: benchmark: benchmark_dir_path / "phylip_dnadist.benchmark.txt" conda: - "../../%s" % config["conda_config"] + config["conda"]["buscoclade"]["name"] if config["use_existing_envs"] else ("../../%s" % config["conda"]["buscoclade"]["yaml"]) resources: queue=config["phylip_queue"], cpus=config["phylip_threads"], @@ -48,7 +48,7 @@ rule phylip_neighbor: benchmark: benchmark_dir_path / "phylip_neighbor.benchmark.txt" conda: - "../../%s" % config["conda_config"] + config["conda"]["buscoclade"]["name"] if config["use_existing_envs"] else ("../../%s" % config["conda"]["buscoclade"]["yaml"]) resources: queue=config["phylip_queue"], cpus=config["phylip_threads"], diff --git a/workflow/rules/quastcore.smk b/workflow/rules/quastcore.smk index dc63498..0f329e4 100644 --- a/workflow/rules/quastcore.smk +++ b/workflow/rules/quastcore.smk @@ -12,7 +12,7 @@ rule quastcore: benchmark: benchmark_dir_path / "quastcore.benchmark.txt" conda: - "../../%s" % config["conda_config"] + config["conda"]["buscoclade"]["name"] if config["use_existing_envs"] else ("../../%s" % config["conda"]["buscoclade"]["yaml"]), resources: queue=config["processing_queue"], cpus=config["processing_threads"], diff --git a/workflow/rules/rapidnj.smk b/workflow/rules/rapidnj.smk index a64d807..e25f2da 100644 --- a/workflow/rules/rapidnj.smk +++ b/workflow/rules/rapidnj.smk @@ -12,7 +12,7 @@ rule rapidnj_tree: benchmark: benchmark_dir_path / "rapidnj_tree.benchmark.txt" conda: - "../../%s" % config["conda_config"] + config["conda"]["buscoclade"]["name"] if config["use_existing_envs"] else ("../../%s" % config["conda"]["buscoclade"]["yaml"]) resources: queue=config["rapidnj_queue"], cpus=config["rapidnj_threads"], diff --git a/workflow/rules/raxml.smk b/workflow/rules/raxml.smk index 95bfb83..bb39d87 100644 --- a/workflow/rules/raxml.smk +++ b/workflow/rules/raxml.smk @@ -19,7 +19,7 @@ if config["raxml"]: benchmark: benchmark_dir_path / "raxml.benchmark.txt", conda: - "../../%s" % config["conda_config"], + config["conda"]["buscoclade"]["name"] if config["use_existing_envs"] else ("../../%s" % config["conda"]["buscoclade"]["yaml"]) resources: queue=config["raxml_queue"], cpus=config["raxml_threads"], @@ -44,7 +44,7 @@ if config["raxml"]: benchmark: benchmark_dir_path / "raxml.benchmark.txt", conda: - "../../%s" % config["conda_config"], + config["conda"]["buscoclade"]["name"] if config["use_existing_envs"] else ("../../%s" % config["conda"]["buscoclade"]["yaml"]) resources: queue=config["raxml_queue"], cpus=config["raxml_threads"], diff --git a/workflow/rules/vcf.smk b/workflow/rules/vcf.smk deleted file mode 100644 index aa062fe..0000000 --- a/workflow/rules/vcf.smk +++ /dev/null @@ -1,238 +0,0 @@ -if config["vcf"]: - - checkpoint vcf_discovery: - input: - vcf_dir_path - output: - touch(vcf_dir_path / "vcf_files.discovered") - shell: - "find {input} -name '*.vcf*' > {output}" - - if config["vcf_masking"]: - rule vcf_masking: - input: - vcf_file=lambda w: f"vcf_dir_path/{w.vcf_directory}/{w.sample}.vcf.gz", - mask_file=lambda w: f"vcf_dir_path/{w.vcf_directory}/mask.bed" - output: - "vcf_dir_path / "{vcf_directory}/{sample}.masked.vcf.gz" - params: - type="DNA" - log: - std=log_dir_path / "{vcf_directory}/{sample}_vcf_masking.log", - cluster_log=cluster_log_dir_path / "{vcf_directory}/{sample}_vcf_masking.cluster.log", - cluster_err=cluster_log_dir_path / 
"{vcf_directory}/{sample}_vcf_masking.cluster.err", - benchmark: - benchmark_dir_path / "{vcf_directory}/{sample}_vcf_masking.benchmark.txt" - conda: - "../../%s" % config["conda_config"] - resources: - queue=config["processing_queue"], - cpus=config["processing_threads"], - time=config["processing_time"], - mem_mb=config["processing_mem_mb"], - shell: - "bedtools intersect -header -v -a {input.vcf_file} -b {input.mask_file} > {output} 2> {log.std}" - - - rule vcf_extract_sample_names: - input: - vcf_files=lambda w: ( - glob.glob(f"vcf_dir_path/{w.vcf_directory}/*.masked.vcf.gz") - if config["vcf_masking"] - else glob.glob(f"vcf_dir_path/{w.vcf_directory}/*.vcf.gz") - ), - checkpoint_file=checkpoints.vcf_discovery.get().output[0] - output: - "vcf_dir_path/{{vcf_directory}}/sample_list.txt", - params: - type="DNA" - log: - std=log_dir_path / "{vcf_directory}/vcf_extract_sample_names.log", - cluster_log=cluster_log_dir_path / "{vcf_directory}/vcf_extract_sample_names.cluster.log", - cluster_err=cluster_log_dir_path / "{vcf_directory}/vcf_extract_sample_names.cluster.err", - benchmark: - benchmark_dir_path / "{vcf_directory}/vcf_extract_sample_names.benchmark.txt" - conda: - "../../%s" % config["conda_config"] - resources: - queue=config["processing_queue"], - cpus=config["processing_threads"], - time=config["processing_time"], - mem_mb=config["processing_mem_mb"], - shell: - """ - for vcf in {input.vcf_files}; do - bcftools query -l "$vcf" >> {output} - done - """ - - - rule vcf_separation: - input: - if config["vcf_masking"]: - vcf=lambda w: f"vcf_dir_path/{w.vcf_directory}/{sample}.masked.vcf.gz", - else: - vcf=lambda w: f"vcf_dir_path/{w.vcf_directory}/{sample}.vcf.gz", - sample_list="vcf_dir_path/{vcf_directory}/sample_list.txt" - output: - directory(f"vcf_dir_path/{{vcf_directory}}/separated"), - params: - type="DNA" - log: - std=log_dir_path / "{vcf_directory}/vcf_separation.log", - cluster_log=cluster_log_dir_path / "{vcf_directory}/vcf_separation.cluster.log", - cluster_err=cluster_log_dir_path / "{vcf_directory}/vcf_separation.cluster.err", - benchmark: - benchmark_dir_path / "{vcf_directory}/vcf_separation.benchmark.txt" - conda: - "../../%s" % config["conda_config"] - resources: - queue=config["processing_queue"], - cpus=config["processing_threads"], - time=config["processing_time"], - mem_mb=config["processing_mem_mb"], - shell: - """ - mkdir -p {output} - while read -r SAMPLE; do - [ -z "$SAMPLE" ] && continue - bcftools view --threads 10 --min-ac 1 \\ - -s "$SAMPLE" \\ - -O z -o {output}/"$SAMPLE".separated.vcf.gz \\ - {input.vcf} - done < {input.sample_list} - """ - - - rule vcf_filtering: - input: - "vcf_dir_path/{vcf_directory}/separated/{sample}.separated.vcf.gz", - output: - "vcf_dir_path/{vcf_directory}/filtered/{sample}.filtered.vcf.gz", - params: - vcf_directory="{vcf_directory}", - common_filters=config["common_filters"], - haploid_filters=config["haploid_filters"], - diploid_filters=config["diploid_filters"], - log: - std=log_dir_path / "{vcf_directory}/{sample}_vcf_filtering.log", - cluster_log=cluster_log_dir_path / "{vcf_directory}/{sample}_vcf_filtering.cluster.log", - cluster_err=cluster_log_dir_path / "{vcf_directory}/{sample}_vcf_filtering.cluster.err", - benchmark: - benchmark_dir_path / "{vcf_directory}/{sample}_vcf_filtering.benchmark.txt" - conda: - "../../%s" % config["conda_config"] - resources: - queue=config["processing_queue"], - cpus=config["processing_threads"], - time=config["processing_time"], - mem_mb=config["processing_mem_mb"], - shell: - """ - 
mkdir -p {vcf_dir_path}/{wildcards.vcf_directory}/filtered - ploidy=$(bcftools query -f '%%GT\\n' {input} | \\ - awk '$1 ~ /\// || $1 ~ /\|/ {{print "diploid"; exit}} END {{print "haploid"}}') - - if [ "$ploidy" = "diploid" ]; then - bcftools filter -i {params.common_filters} \\ - -e {params.diploid_filters} {input} > {output} - else - bcftools filter -i {params.common_filters} \\ - -e {params.haploid_filters} {input} > {output} - fi - """ - - rule vcf_sequence_dictionary: - input: - "vcf_dir_path/{vcf_directory}/{reference}.{ext}", - output: - "vcf_dir_path/{vcf_directory}/{reference}.dict" - log: - std=log_dir_path / "{vcf_directory}/{reference}_vcf_sequence_dictionary.log", - cluster_log=cluster_log_dir_path / "{vcf_directory}/{reference}_vcf_sequence_dictionary.cluster.log", - cluster_err=cluster_log_dir_path / "{vcf_directory}/{reference}_vcf_sequence_dictionary.cluster.err", - benchmark: - benchmark_dir_path / "{vcf_directory}/{reference}_vcf_sequence_dictionary.benchmark.txt" - conda: - "../../%s" % config["conda_config"] - resources: - queue=config["processing_queue"], - cpus=config["processing_threads"], - time=config["processing_time"], - mem_mb=config["processing_mem_mb"], - shell: - "java -Xmx5g -jar picard.jar CreateSequenceDictionary R={input} O={output} 2> {log.std}" - - - rule vcf_indexing: - input: - "vcf_dir_path/{vcf_directory}/filtered/{sample}.filtered.vcf.gz", - output: - "vcf_dir_path/{vcf_directory}/filtered/{sample}.filtered.{ext}.idx", - log: - std=log_dir_path / "{vcf_directory}/{sample}.filtered.{ext}_vcf_indexing.log", - cluster_log=cluster_log_dir_path / "{vcf_directory}/{sample}.filtered.{ext}_vcf_indexing.cluster.log", # Fixed typo - cluster_err=cluster_log_dir_path / "{vcf_directory}/{sample}.filtered.{ext}_vcf_indexing.cluster.err", - benchmark: - benchmark_dir_path / "{vcf_directory}/{sample}.filtered.{ext}_vcf_indexing.benchmark.txt" - conda: - "../../%s" % config["conda_config"] - resources: - queue=config["processing_queue"], - cpus=config["processing_threads"], - time=config["processing_time"], - mem_mb=config["processing_mem_mb"], - shell: - "java -Xmx10g -jar gatk IndexFeatureFile -I {input} 1> {log.std} 2>&1" - - rule reference_indexing: - input: - "vcf_dir_path/{vcf_directory}/{reference}.{ext}", - output: - "vcf_dir_path/{vcf_directory}/{reference}.{ext}.fai", - log: - std=log_dir_path / "{vcf_directory}/{reference}.{ext}_reference_indexing.log", - cluster_log=cluster_log_dir_path / "{vcf_directory}/{reference}.{ext}_reference_indexing.cluster.log", - cluster_err=cluster_log_dir_path / "{vcf_directory}/{reference}.{ext}_reference_indexing.cluster.err", - benchmark: - benchmark_dir_path / "{vcf_directory}/{reference}.{ext}_reference_indexing.benchmark.txt" - conda: - "../../%s" % config["conda_config"] - resources: - queue=config["processing_queue"], - cpus=config["processing_threads"], - time=config["processing_time"], - mem_mb=config["processing_mem_mb"], - shell: - "samtools faidx {input} 2> {log.std}" - - rule vcf_to_fasta: - input: - vcf_file="vcf_dir_path/{vcf_directory}/filtered/{sample}.filtered.vcf.gz", - reference_file="vcf_dir_path/{vcf_directory}/{reference}.{ext}", - output: - genome_dir_path / "{sample}.AltRef.fasta", # Keep flat structure - params: - vcf_directory=lambda w: os.path.basename(os.path.dirname(os.path.dirname(input.vcf_file))), - log: - std=log_dir_path / "{sample}_vcf_to_fasta.log", - cluster_log=cluster_log_dir_path / "{sample}_vcf_to_fasta.cluster.log", - cluster_err=cluster_log_dir_path / 
"{sample}_vcf_to_fasta.cluster.err", - benchmark: - benchmark_dir_path / "{sample}_vcf_to_fasta.benchmark.txt" - params: - vcf_directory="{sample}" - conda: - "../../%s" % config["conda_config"] - resources: - queue=config["processing_queue"], - cpus=config["processing_threads"], - time=config["processing_time"], - mem_mb=config["processing_mem_mb"], - shell: - "java -Xmx10g -jar gatk FastaAlternateReferenceMaker " - "--output {output} " - "--reference {input.reference_file} " - "--variant {input.vcf_file} " - "--showHidden true " - "--use-iupac-sample {wildcards.sample} 2> {log.std}" \ No newline at end of file diff --git a/workflow/rules/vcf_reconstruct.smk b/workflow/rules/vcf_reconstruct.smk new file mode 100644 index 0000000..db01869 --- /dev/null +++ b/workflow/rules/vcf_reconstruct.smk @@ -0,0 +1,204 @@ +localrules: + link_altref_to_genomes_dir, + +if config["vcf_gatk"]: + rule picard_index: + input: + "{reference}.fasta", + output: + dict="{reference}.dict", + fai="{reference}.fasta.fai", + log: + std=log_dir_path / "picard_index.{reference}.log", + cluster_log=cluster_log_dir_path / "picard_index.{reference}.cluster.log", + cluster_err=cluster_log_dir_path / "picard_index.{reference}.cluster.err", + benchmark: + benchmark_dir_path / "picard_index.{reference}.benchmark.txt" + conda: + config["conda"]["buscoclade_gatk"]["name"] if config["use_existing_envs"] else ("../../%s" % config["conda"]["buscoclade_gatk"]["yaml"]), + resources: + queue=config["processing_queue"], + cpus=config["processing_threads"], + time=config["processing_time"], + mem_mb=config["processing_mem_mb"], + shell: + "picard CreateSequenceDictionary -R {input} && " + "samtools faidx {input} " + "1> {log.std} 2>&1" + + + rule gatk_vcf_index: + input: + vcf="{vcf}", + output: + tbi="{vcf}.tbi", + sample_list=temp("{vcf}.sample_list.txt"), + params: + gatk_path=config["gatk_path"], + log: + std=log_dir_path / "gatk_vcf_index.{vcf}.log", + cluster_log=cluster_log_dir_path / "gatk_vcf_index.{vcf}.cluster.log", + cluster_err=cluster_log_dir_path / "gatk_vcf_index.{vcf}.cluster.err", + benchmark: + benchmark_dir_path / "gatk_vcf_index.{vcf}.benchmark.txt" + conda: + config["conda"]["buscoclade_gatk"]["name"] if config["use_existing_envs"] else ("../../%s" % config["conda"]["buscoclade_gatk"]["yaml"]) + resources: + queue=config["processing_queue"], + cpus=config["processing_threads"], + time=config["processing_time"], + mem_mb=config["processing_mem_mb"], + shell: + """ + {params.gatk_path}/gatk --java-options -Xmx{resources.mem_mb}m IndexFeatureFile -I {input.vcf} + + bcftools query -l {input.vcf} | grep -v '^$' > {output.sample_list} + """ + + checkpoint vcf_separation: + input: + vcf="{vcf}", + tbi="{vcf}.tbi", + sample_list="{vcf}.sample_list.txt", + output: + separated=temp("{vcf}.separation.done"), + params: + gatk_path=config["gatk_path"], + vcf_basename=lambda wildcards: os.path.basename(wildcards.vcf).replace('.vcf.gz', ''), + output_dir=lambda wildcards: os.path.dirname(wildcards.vcf), + original_vcf_dir="results/original_vcf", + log: + std=log_dir_path / "vcf_separation.{vcf}.log", + cluster_log=cluster_log_dir_path / "vcf_separation.{vcf}.cluster.log", + cluster_err=cluster_log_dir_path / "vcf_separation.{vcf}.cluster.err", + benchmark: + benchmark_dir_path / "vcf_separation.{vcf}.benchmark.txt" + conda: + config["conda"]["buscoclade_gatk"]["name"] if config["use_existing_envs"] else ("../../%s" % config["conda"]["buscoclade_gatk"]["yaml"]) + resources: + queue=config["processing_queue"], + 
cpus=config["processing_threads"], + time=config["processing_time"], + mem_mb=config["processing_mem_mb"], + shell: + r""" + SAMPLES=($(grep -v '^$' {input.sample_list})) + + for SAMPLE in "${{SAMPLES[@]}}"; do + + bcftools view \ + --threads {resources.cpus} \ + --min-ac 1 \ + -s "$SAMPLE" \ + -Ov -o "{params.output_dir}/$SAMPLE.{params.vcf_basename}.unfiltered.vcf" \ + {input.vcf} + + bcftools filter \ + -e 'ALT ~ "[^ATCGN]"' \ + "{params.output_dir}/$SAMPLE.{params.vcf_basename}.unfiltered.vcf" \ + > "{params.output_dir}/$SAMPLE.{params.vcf_basename}.filtered.vcf" + + bgzip -c "{params.output_dir}/$SAMPLE.{params.vcf_basename}.filtered.vcf" \ + > "{params.output_dir}/$SAMPLE.{params.vcf_basename}.vcf.gz" + + tabix -p vcf "{params.output_dir}/$SAMPLE.{params.vcf_basename}.vcf.gz" + + {params.gatk_path}/gatk --java-options -Xmx{resources.mem_mb}m \ + IndexFeatureFile -I "{params.output_dir}/$SAMPLE.{params.vcf_basename}.vcf.gz" + + rm -f \ + "{params.output_dir}/$SAMPLE.{params.vcf_basename}.unfiltered.vcf" \ + "{params.output_dir}/$SAMPLE.{params.vcf_basename}.filtered.vcf" + + done + + touch {output.separated} + """ + + + rule gatk_altref: + input: + ref=lambda wc: vcf_reconstruct_map[wc.species]["reference"], + refidx=lambda wc: vcf_reconstruct_map[wc.species]["reference"].with_suffix(".dict"), + vcf=lambda wc: checkpoints.vcf_separation.get(vcf=vcf_reconstruct_map[wc.species]["vcf"]).output.separated, + vcfidx=lambda wc: vcf_reconstruct_map[wc.species]["vcf"].with_name(vcf_reconstruct_map[wc.species]["vcf"].name + ".tbi"), + output: + altref_dir_path / "{species}.fasta", + params: + gatk_path=config["gatk_path"], + sample=lambda wc: wc.species.split(".")[0], + log: + std=log_dir_path / "gatk_altref.{species}.log", + cluster_log=cluster_log_dir_path / "gatk_altref.{species}.cluster.log", + cluster_err=cluster_log_dir_path / "gatk_altref.{species}.cluster.err", + benchmark: + benchmark_dir_path / "gatk_altref.{species}.benchmark.txt" + conda: + config["conda"]["buscoclade_gatk"]["name"] if config["use_existing_envs"] else ("../../%s" % config["conda"]["buscoclade_gatk"]["yaml"]) + resources: + queue=config["processing_queue"], + cpus=config["processing_threads"], + time=config["processing_time"], + mem_mb=config["processing_mem_mb"], + shell: + " {params.gatk_path}/gatk --java-options -Xmx{resources.mem_mb}m FastaAlternateReferenceMaker " + " --output {output} --reference {input.ref} --variant {input.vcf} --showHidden true --use-iupac-sample {params.sample} 1> {log.std} 2>&1; " + + + + + checkpoint move_altref_to_genomes_dir: + input: + altref_dir_path / "{species}.fasta", + output: + genome_dir_path / "{species}.fasta", + log: + std=log_dir_path / "link_altref_to_genomes_dir.{species}.log", + cluster_log=cluster_log_dir_path / "link_altref_to_genomes_dir.{species}.cluster.log", + cluster_err=cluster_log_dir_path / "link_altref_to_genomes_dir.{species}.cluster.err", + shell: + " cp -a {input} {output} 1> {log.std} 2>&1; " + +if config["vcf2phylip"]: + from pathlib import Path, PurePath + + vcf_dir = Path(config["vcf_reconstruct_dir"]) + vcf_list = list(vcf_dir.rglob("*.vcf.gz")) + if len(vcf_list) != 1: + raise ValueError(f"vcf2phylip requires exactly one VCF in {vcf_dir}, found: {vcf_list}") + vcf_file = vcf_list[0] + prefix = vcf_file.stem + + checkpoint vcf2phylip: + input: + vcf=str(vcf_file), + output: + altref_dir_path / f"{prefix}.fasta", + params: + prefix=prefix, + log: + std=log_dir_path / f"vcf2phylip.{prefix}.log", + cluster_log=cluster_log_dir_path / 
f"vcf2phylip.{prefix}.cluster.log", + cluster_err=cluster_log_dir_path / f"vcf2phylip.{prefix}.cluster.err", + benchmark: + benchmark_dir_path / f"vcf2phylip.{prefix}.benchmark.txt", + conda: + config["conda"]["buscoclade_gatk"]["name"] if config["use_existing_envs"] + else ("../../%s" % config["conda"]["buscoclade_gatk"]["yaml"]), + resources: + queue=config["processing_queue"], + cpus=config["processing_threads"], + time=config["processing_time"], + mem_mb=config["processing_mem_mb"], + shell: + "workflow/scripts/vcf2phylip.py -i {input.vcf} --fasta --output-prefix {params.prefix} --resolve-IUPAC" + + rule consolidate_vcf2phylip: + input: + fasta=altref_dir_path / f"{prefix}.fasta", + output: + altref_dir_path / "consensus.fasta", + log: + std=log_dir_path / "consolidate_vcf2phylip.log", + run: + shell("mv {input.fasta} {output}") diff --git a/workflow/rules/visualization.smk b/workflow/rules/visualization.smk index da75247..86775ff 100644 --- a/workflow/rules/visualization.smk +++ b/workflow/rules/visualization.smk @@ -18,7 +18,7 @@ if config["draw_phylotrees"]: benchmark: benchmark_dir_path / "iqtree_dna_tree_visualization.benchmark.txt" conda: - "../../%s" % config["ete3_conda_config"] + config["conda"]["buscoclade_ete3"]["name"] if config["use_existing_envs"] else ("../../%s" % config["conda"]["buscoclade_ete3"]["yaml"]) resources: queue=config["processing_queue"], cpus=config["processing_threads"], @@ -50,7 +50,7 @@ if config["draw_phylotrees"]: benchmark: benchmark_dir_path / "iqtree_protein_tree_visualization.benchmark.txt" conda: - "../../%s" % config["ete3_conda_config"] + config["conda"]["buscoclade_ete3"]["name"] if config["use_existing_envs"] else ("../../%s" % config["conda"]["buscoclade_ete3"]["yaml"]) resources: queue=config["processing_queue"], cpus=config["processing_threads"], @@ -63,32 +63,32 @@ if config["draw_phylotrees"]: if config["draw_phylotrees"]: - - rule astral_tree_visualization: - input: - treefile=astral_dir_path / astral_tree, - common_ids=rules.common_ids.output, - output: - astral_dir_path / f"{astral_tree}.png", - params: - prefix=astral_dir_path / astral_tree, - log: - std=log_dir_path / "astral_tree_visualization.log", - cluster_log=cluster_log_dir_path / "astral_tree_visualization.cluster.log", - cluster_err=cluster_log_dir_path / "astral_tree_visualization.cluster.err", - benchmark: - benchmark_dir_path / "astral_tree_visualization.benchmark.txt" - conda: - "../../%s" % config["ete3_conda_config"] - resources: - queue=config["processing_queue"], - cpus=config["processing_threads"], - time=config["processing_time"], - mem_mb=config["processing_mem_mb"], - threads: config["processing_threads"] - shell: - " QT_QPA_PLATFORM=offscreen workflow/scripts/draw_phylotrees_from_astral.py " - " -i {input.treefile} -o {params.prefix} -n $(cat {input.common_ids} | wc -l) > {log.std} 2>&1; " + if config["vcf2phylip"] != True: + rule astral_tree_visualization: + input: + treefile=astral_dir_path / astral_tree, + common_ids=rules.common_ids.output, + output: + astral_dir_path / f"{astral_tree}.png", + params: + prefix=astral_dir_path / astral_tree, + log: + std=log_dir_path / "astral_tree_visualization.log", + cluster_log=cluster_log_dir_path / "astral_tree_visualization.cluster.log", + cluster_err=cluster_log_dir_path / "astral_tree_visualization.cluster.err", + benchmark: + benchmark_dir_path / "astral_tree_visualization.benchmark.txt" + conda: + config["conda"]["buscoclade_ete3"]["name"] if config["use_existing_envs"] else ("../../%s" % 
config["conda"]["buscoclade_ete3"]["yaml"]) + resources: + queue=config["processing_queue"], + cpus=config["processing_threads"], + time=config["processing_time"], + mem_mb=config["processing_mem_mb"], + threads: config["processing_threads"] + shell: + " QT_QPA_PLATFORM=offscreen workflow/scripts/draw_phylotrees_from_astral.py " + " -i {input.treefile} -o {params.prefix} -n $(cat {input.common_ids} | wc -l) > {log.std} 2>&1; " if config["draw_phylotrees"]: @@ -111,7 +111,7 @@ if config["draw_phylotrees"]: benchmark: benchmark_dir_path / "rapidnj_tree_visualization.benchmark.txt" conda: - "../../%s" % config["ete3_conda_config"] + config["conda"]["buscoclade_ete3"]["name"] if config["use_existing_envs"] else ("../../%s" % config["conda"]["buscoclade_ete3"]["yaml"]) resources: queue=config["processing_queue"], cpus=config["processing_threads"], @@ -143,7 +143,7 @@ if config["draw_phylotrees"]: benchmark: benchmark_dir_path / "phylip_tree_visualization.benchmark.txt" conda: - "../../%s" % config["ete3_conda_config"] + config["conda"]["buscoclade_ete3"]["name"] if config["use_existing_envs"] else ("../../%s" % config["conda"]["buscoclade_ete3"]["yaml"]) resources: queue=config["processing_queue"], cpus=config["processing_threads"], @@ -174,7 +174,7 @@ if config["draw_phylotrees"]: benchmark: benchmark_dir_path / "raxml_tree_visualization.benchmark.txt", conda: - "../../%s" % config["ete3_conda_config"], + config["conda"]["buscoclade_ete3"]["name"] if config["use_existing_envs"] else ("../../%s" % config["conda"]["buscoclade_ete3"]["yaml"]) resources: queue=config["processing_queue"], cpus=config["processing_threads"], @@ -185,72 +185,73 @@ if config["draw_phylotrees"]: "QT_QPA_PLATFORM=offscreen workflow/scripts/draw_phylotrees.py -i {input} " "-o {params.prefix} {params.options} 1> {log.std} 2>&1; " -rule species_ids_plot: - input: - single_copy_ids=expand(species_ids_dir_path / "single_copy/{species}.ids", species=config["species_list"]), - multi_copy_ids=expand(species_ids_dir_path / "multi_copy/{species}.ids", species=config["species_list"]), - output: - species_ids_dir_path / "unique_species_ids.png", - log: - std=log_dir_path / "species_ids_plot.log", - cluster_log=cluster_log_dir_path / "species_ids_plot.cluster.log", - cluster_err=cluster_log_dir_path / "species_ids_plot.cluster.err", - conda: - "../../%s" % config["ete3_conda_config"] - benchmark: - benchmark_dir_path / "species_ids_plot.benchmark.txt" - resources: - queue=config["processing_queue"], - cpus=config["processing_threads"], - time=config["processing_time"], - mem_mb=config["processing_mem_mb"], - threads: config["processing_threads"] - shell: - " workflow/scripts/unique_ids_plot.py --single_copy_ids_files {input.single_copy_ids} --multi_copy_ids_files {input.multi_copy_ids}" - " --outplot {output} > {log.std} 2>&1 " - +if config["vcf2phylip"] != True: + rule species_ids_plot: + input: + single_copy_ids=expand(species_ids_dir_path / "single_copy/{species}.ids", species=config["species_list"]), + multi_copy_ids=expand(species_ids_dir_path / "multi_copy/{species}.ids", species=config["species_list"]), + output: + species_ids_dir_path / "unique_species_ids.png", + log: + std=log_dir_path / "species_ids_plot.log", + cluster_log=cluster_log_dir_path / "species_ids_plot.cluster.log", + cluster_err=cluster_log_dir_path / "species_ids_plot.cluster.err", + conda: + config["conda"]["buscoclade_ete3"]["name"] if config["use_existing_envs"] else ("../../%s" % config["conda"]["buscoclade_ete3"]["yaml"]) + benchmark: + 
benchmark_dir_path / "species_ids_plot.benchmark.txt" + resources: + queue=config["processing_queue"], + cpus=config["processing_threads"], + time=config["processing_time"], + mem_mb=config["processing_mem_mb"], + threads: config["processing_threads"] + shell: + " workflow/scripts/unique_ids_plot.py --single_copy_ids_files {input.single_copy_ids} --multi_copy_ids_files {input.multi_copy_ids}" + " --outplot {output} > {log.std} 2>&1 " -rule busco_summaries_to_tsv: - input: - expand(busco_dir_path / "{species}/short_summary_{species}.txt", species=config["species_list"]), - output: - busco_dir_path / "busco_summaries.tsv", - log: - std=log_dir_path / "busco_summaries_to_tsv.log", - cluster_log=cluster_log_dir_path / "busco_summaries_to_tsv.cluster.log", - cluster_err=cluster_log_dir_path / "busco_summaries_to_tsv.cluster.err", - benchmark: - benchmark_dir_path / "busco_summaries_to_tsv.benchmark.txt" - resources: - queue=config["processing_queue"], - cpus=config["processing_threads"], - time=config["processing_time"], - mem_mb=config["processing_mem_mb"], - threads: config["processing_threads"] - shell: - " workflow/scripts/busco_summaries_to_tsv.py -i {input} -o {output} > {log.std} 2>&1; " +if config["vcf2phylip"] != True: + rule busco_summaries_to_tsv: + input: + expand(busco_dir_path / "{species}/short_summary_{species}.txt", species=config["species_list"]), + output: + busco_dir_path / "busco_summaries.tsv", + log: + std=log_dir_path / "busco_summaries_to_tsv.log", + cluster_log=cluster_log_dir_path / "busco_summaries_to_tsv.cluster.log", + cluster_err=cluster_log_dir_path / "busco_summaries_to_tsv.cluster.err", + conda: + config["conda"]["buscoclade"]["name"] if config["use_existing_envs"] else ("../../%s" % config["conda"]["buscoclade"]["yaml"]) + resources: + queue=config["processing_queue"], + cpus=config["processing_threads"], + time=config["processing_time"], + mem_mb=config["processing_mem_mb"], + threads: config["processing_threads"] + shell: + " workflow/scripts/busco_summaries_to_tsv.py -i {input} -o {output} > {log.std} 2>&1; " -rule busco_histogram: - input: - busco_dir_path / "busco_summaries.tsv", - output: - busco_dir_path / "busco_summaries.svg", - params: - colors=config["busco_histogram_colors"], - log: - std=log_dir_path / "busco_histogram.log", - cluster_log=cluster_log_dir_path / "busco_histogram.cluster.log", - cluster_err=cluster_log_dir_path / "busco_histogram.cluster.err", - benchmark: - benchmark_dir_path / "busco_histogram.benchmark.txt" - conda: - "../../%s" % config["conda_config"] - resources: - queue=config["processing_queue"], - cpus=config["processing_threads"], - time=config["processing_time"], - mem_mb=config["processing_mem_mb"], - threads: config["processing_threads"] - shell: - " workflow/scripts/busco_histogram.py -i {input} -o {output} -c '{params.colors}' > {log.std} 2>&1; " + rule busco_histogram: + input: + busco_dir_path / "busco_summaries.tsv", + output: + busco_dir_path / "busco_summaries.svg", + params: + colors=config["busco_histogram_colors"], + log: + std=log_dir_path / "busco_histogram.log", + cluster_log=cluster_log_dir_path / "busco_histogram.cluster.log", + cluster_err=cluster_log_dir_path / "busco_histogram.cluster.err", + benchmark: + benchmark_dir_path / "busco_histogram.benchmark.txt" + conda: + config["conda"]["buscoclade"]["name"] if config["use_existing_envs"] else ("../../%s" % config["conda"]["buscoclade"]["yaml"]) + resources: + queue=config["processing_queue"], + cpus=config["processing_threads"], + 
time=config["processing_time"], + mem_mb=config["processing_mem_mb"], + threads: config["processing_threads"] + shell: + " workflow/scripts/busco_histogram.py -i {input} -o {output} -c '{params.colors}' > {log.std} 2>&1; " diff --git a/workflow/scripts/draw_phylotrees.py b/workflow/scripts/draw_phylotrees.py index 0c180e3..d657cbc 100755 --- a/workflow/scripts/draw_phylotrees.py +++ b/workflow/scripts/draw_phylotrees.py @@ -1,8 +1,16 @@ #!/usr/bin/env python3 __author__ = 'tomarovsky' -from ete3 import TextFace, Tree, faces, AttrFace, TreeStyle, NodeStyle +from ete3 import TextFace, Tree, faces, AttrFace, TreeStyle, NodeStyle, CircleFace from argparse import ArgumentParser +import re +def format_name(name): + name = re.sub(r'(GCA|GCF)_', r'\1@', name) + name = name.replace('_', ' ') + name = name.replace('GCA@', 'GCA_').replace('GCF@', 'GCF_') + name = re.sub(r'(\d) (\d)', r'\1_\2', name) + + return name def mylayout(node): if node.is_leaf(): @@ -14,17 +22,21 @@ def mylayout(node): node.img_style["shape"] = "circle" node.img_style["fgcolor"] = "Black" else: - node.img_style["shape"] = "circle" - node.img_style["size"] = 5 - if node.support > 90: - node.img_style["fgcolor"] = "LimeGreen" - elif node.support > 70: - node.img_style["fgcolor"] = '#008cf0' - elif node.support > 50: - node.img_style["fgcolor"] = '#883ac2' + support_value = node.support + if support_value > 90: + color = "LimeGreen" + elif support_value > 70: + color = "#008cf0" + elif support_value > 50: + color = "#883ac2" else: - node.img_style["fgcolor"] = '#ff0000' - + color = "#ff0000" + + support_circle = CircleFace(radius=3, color=color, style="circle") + faces.add_face_to_node(support_circle, node, column=0, position="float") + node.img_style["size"] = 0 + + def export_legend(palette): legend = TreeStyle() @@ -39,11 +51,12 @@ def export_legend(palette): def main(): - palette = {"90": "LimeGreen", "70": '#008cf0', "50": '#883ac2', "<50": '#ff0000'} + palette = {"91 - 100": "LimeGreen", "71-90": '#008cf0', "51-70": '#883ac2', "<50": '#ff0000'} t = Tree(args.input) for i in t.get_leaves(): - i.name = i.name.replace("_", " ").replace("GCA ", "GCA_").replace("GCF ", "GCF_") + i.name = format_name(i.name) + #i.name = i.name.replace("_", " ").replace("GCA ", "GCA_").replace("GCF ", "GCF_") if args.outgroup: outgroup_species = args.outgroup.split(',') if len(outgroup_species) == 1: @@ -59,7 +72,7 @@ def main(): ts.layout_fn = mylayout ts.show_scale = True ts.show_leaf_name = False - ts.legend_position = 4 + ts.legend_position = 1 ts.legend = export_legend(palette).legend for n in t.traverse(): @@ -73,21 +86,25 @@ def main(): ts.show_branch_length = True ts.show_branch_support = True ts.branch_vertical_margin = -12 + t.ladderize(direction=1) t.render(f"{args.output}.length_and_support_tree.png", w=3000, units="px", tree_style=ts) ts.show_branch_length = False ts.show_branch_support = True ts.branch_vertical_margin = -4 + t.ladderize(direction=1) t.render(f"{args.output}.only_support_tree.png", w=3000, units="px", tree_style=ts) ts.show_branch_length = False ts.show_branch_support = False ts.branch_vertical_margin = 0 + t.ladderize(direction=1) t.render(f"{args.output}.only_tree.png", w=6000, units="px", tree_style=ts) ts.show_branch_length = False ts.show_branch_support = False ts.branch_vertical_margin = 0 + t.ladderize(direction=1) t.render(f"{args.output}.dots.png", w=6000, units="px", tree_style=ts) From d3292c4eddc2e7ae3116fb2107adce385f3e92cc Mon Sep 17 00:00:00 2001 From: zubiriguitar Date: Fri, 16 May 2025 01:02:20 +0700 
Subject: [PATCH 09/15] gatk_vcf_index fix --- workflow/rules/vcf_reconstruct.smk | 180 +++++++++++++++-------------- 1 file changed, 94 insertions(+), 86 deletions(-) diff --git a/workflow/rules/vcf_reconstruct.smk b/workflow/rules/vcf_reconstruct.smk index db01869..e270a93 100644 --- a/workflow/rules/vcf_reconstruct.smk +++ b/workflow/rules/vcf_reconstruct.smk @@ -27,100 +27,108 @@ if config["vcf_gatk"]: "1> {log.std} 2>&1" - rule gatk_vcf_index: - input: - vcf="{vcf}", - output: - tbi="{vcf}.tbi", - sample_list=temp("{vcf}.sample_list.txt"), - params: - gatk_path=config["gatk_path"], - log: - std=log_dir_path / "gatk_vcf_index.{vcf}.log", - cluster_log=cluster_log_dir_path / "gatk_vcf_index.{vcf}.cluster.log", - cluster_err=cluster_log_dir_path / "gatk_vcf_index.{vcf}.cluster.err", - benchmark: - benchmark_dir_path / "gatk_vcf_index.{vcf}.benchmark.txt" - conda: - config["conda"]["buscoclade_gatk"]["name"] if config["use_existing_envs"] else ("../../%s" % config["conda"]["buscoclade_gatk"]["yaml"]) - resources: - queue=config["processing_queue"], - cpus=config["processing_threads"], - time=config["processing_time"], - mem_mb=config["processing_mem_mb"], - shell: - """ - {params.gatk_path}/gatk --java-options -Xmx{resources.mem_mb}m IndexFeatureFile -I {input.vcf} +rule gatk_vcf_index: + input: + vcf="{vcf_dir}/{vcf}", + output: + tbi="{vcf_dir}/{vcf}.tbi", + sample_list=temp("{vcf_dir}/{vcf}.sample_list.txt"), + params: + gatk_path=config["gatk_path"], + log: + std_gatk= "{vcf_dir}/gatk_vcf_index.{vcf}.log", + std_bcf= "{vcf_dir}/gatk_vcf_index.{vcf}.bcf.log", + std_grep= "{vcf_dir}/gatk_vcf_index.{vcf}.grep.log", + cluster_log="{vcf_dir}/gatk_vcf_index.{vcf}.cluster.log", + cluster_err="{vcf_dir}/gatk_vcf_index.{vcf}.cluster.err", + benchmark: + "{vcf_dir}/gatk_vcf_index.{vcf}.benchmark.txt" + conda: + config["conda"]["buscoclade_gatk"]["name"] if config["use_existing_envs"] else ("../../%s" % config["conda"]["buscoclade_gatk"]["yaml"]) + resources: + queue=config["processing_queue"], + cpus=config["processing_threads"], + time=config["processing_time"], + mem_mb=config["processing_mem_mb"], + shell: + """ + {params.gatk_path}/gatk --java-options -Xmx{resources.mem_mb}m IndexFeatureFile -I {input.vcf} > {log.std_gatk} 2>&1; + bcftools query -l {input.vcf} 2>{log.std_bcf}| + grep -v '^$' > {output.sample_list} 2>{log.std_grep}; + """ + +checkpoint vcf_separation: + input: + vcf="{vcf_dir}/{vcf}", + tbi="{vcf_dir}/{vcf}.tbi", + sample_list="{vcf_dir}/{vcf}.sample_list.txt", + output: + separated="{vcf_dir}/{vcf}.separation.done", + params: + gatk_path=config["gatk_path"], + vcf_basename=lambda wildcards: os.path.basename(wildcards.vcf).replace('.vcf.gz', ''), + output_dir=lambda wildcards: os.path.dirname(wildcards.vcf), + original_vcf_dir="results/original_vcf", + log: + std="{vcf_dir}/vcf_separation.{vcf}.log", + std_bcf_view="{vcf_dir}/vcf_separation.{vcf}.bcf_view.log", + std_bcf_filter="{vcf_dir}/vcf_separation.{vcf}.bcf_filter.log", + std_zip="{vcf_dir}/vcf_separation.{vcf}.zip.log", + std_tabix="{vcf_dir}/vcf_separation.{vcf}.tabix.log", + std_iff="{vcf_dir}/vcf_separation.{vcf}.iff.log", + cluster_log="{vcf_dir}/vcf_separation.{vcf}.cluster.log", + cluster_err="{vcf_dir}/vcf_separation.{vcf}.cluster.err", + benchmark: + "{vcf_dir}/vcf_separation.{vcf}.benchmark.txt" + conda: + config["conda"]["buscoclade_gatk"]["name"] if config["use_existing_envs"] else ("../../%s" % config["conda"]["buscoclade_gatk"]["yaml"]) + resources: + queue=config["processing_queue"], + 
cpus=config["processing_threads"], + time=config["processing_time"], + mem_mb=config["processing_mem_mb"], + shell: + r""" + SAMPLES=($(grep -v '^$' {input.sample_list})) + + for SAMPLE in "${{SAMPLES[@]}}"; do + + bcftools view \ + --threads {resources.cpus} \ + --min-ac 1 \ + -s "$SAMPLE" \ + -Ov -o "{params.output_dir}/$SAMPLE.{params.vcf_basename}.unfiltered.vcf" \ + {input.vcf} > {log.std_bcf_view} 2>&1 + + bcftools filter \ + -e 'ALT ~ "[^ATCGN]"' \ + "{params.output_dir}/$SAMPLE.{params.vcf_basename}.unfiltered.vcf" \ + > "{params.output_dir}/$SAMPLE.{params.vcf_basename}.filtered.vcf" 2>{log.std_bcf_filter} + + bgzip -c "{params.output_dir}/$SAMPLE.{params.vcf_basename}.filtered.vcf" \ + > "{params.output_dir}/$SAMPLE.{params.vcf_basename}.vcf.gz" 2>{log.std_zip} + + tabix -p vcf "{params.output_dir}/$SAMPLE.{params.vcf_basename}.vcf.gz" > {log.std_tabix} 2>&1 + + {params.gatk_path}/gatk --java-options -Xmx{resources.mem_mb}m \ + IndexFeatureFile -I "{params.output_dir}/$SAMPLE.{params.vcf_basename}.vcf.gz" >{log.std_iff} 2>&1 - bcftools query -l {input.vcf} | grep -v '^$' > {output.sample_list} - """ + rm -f \ + "{params.output_dir}/$SAMPLE.{params.vcf_basename}.unfiltered.vcf" \ + "{params.output_dir}/$SAMPLE.{params.vcf_basename}.filtered.vcf" - checkpoint vcf_separation: - input: - vcf="{vcf}", - tbi="{vcf}.tbi", - sample_list="{vcf}.sample_list.txt", - output: - separated=temp("{vcf}.separation.done"), - params: - gatk_path=config["gatk_path"], - vcf_basename=lambda wildcards: os.path.basename(wildcards.vcf).replace('.vcf.gz', ''), - output_dir=lambda wildcards: os.path.dirname(wildcards.vcf), - original_vcf_dir="results/original_vcf", - log: - std=log_dir_path / "vcf_separation.{vcf}.log", - cluster_log=cluster_log_dir_path / "vcf_separation.{vcf}.cluster.log", - cluster_err=cluster_log_dir_path / "vcf_separation.{vcf}.cluster.err", - benchmark: - benchmark_dir_path / "vcf_separation.{vcf}.benchmark.txt" - conda: - config["conda"]["buscoclade_gatk"]["name"] if config["use_existing_envs"] else ("../../%s" % config["conda"]["buscoclade_gatk"]["yaml"]) - resources: - queue=config["processing_queue"], - cpus=config["processing_threads"], - time=config["processing_time"], - mem_mb=config["processing_mem_mb"], - shell: - r""" - SAMPLES=($(grep -v '^$' {input.sample_list})) - - for SAMPLE in "${{SAMPLES[@]}}"; do - - bcftools view \ - --threads {resources.cpus} \ - --min-ac 1 \ - -s "$SAMPLE" \ - -Ov -o "{params.output_dir}/$SAMPLE.{params.vcf_basename}.unfiltered.vcf" \ - {input.vcf} - - bcftools filter \ - -e 'ALT ~ "[^ATCGN]"' \ - "{params.output_dir}/$SAMPLE.{params.vcf_basename}.unfiltered.vcf" \ - > "{params.output_dir}/$SAMPLE.{params.vcf_basename}.filtered.vcf" - - bgzip -c "{params.output_dir}/$SAMPLE.{params.vcf_basename}.filtered.vcf" \ - > "{params.output_dir}/$SAMPLE.{params.vcf_basename}.vcf.gz" - - tabix -p vcf "{params.output_dir}/$SAMPLE.{params.vcf_basename}.vcf.gz" - - {params.gatk_path}/gatk --java-options -Xmx{resources.mem_mb}m \ - IndexFeatureFile -I "{params.output_dir}/$SAMPLE.{params.vcf_basename}.vcf.gz" - - rm -f \ - "{params.output_dir}/$SAMPLE.{params.vcf_basename}.unfiltered.vcf" \ - "{params.output_dir}/$SAMPLE.{params.vcf_basename}.filtered.vcf" - - done - - touch {output.separated} - """ + done + touch {output.separated} + """ +if config["vcf_gatk"]: rule gatk_altref: input: ref=lambda wc: vcf_reconstruct_map[wc.species]["reference"], refidx=lambda wc: vcf_reconstruct_map[wc.species]["reference"].with_suffix(".dict"), - vcf=lambda wc: 
checkpoints.vcf_separation.get(vcf=vcf_reconstruct_map[wc.species]["vcf"]).output.separated, + vcf=lambda wc: checkpoints.vcf_separation.get(vcf=Path(vcf_reconstruct_map[wc.species]["vcf"]).name, + vcf_dir=Path(vcf_reconstruct_map[wc.species]["vcf"]).parent).output.separated, vcfidx=lambda wc: vcf_reconstruct_map[wc.species]["vcf"].with_name(vcf_reconstruct_map[wc.species]["vcf"].name + ".tbi"), output: altref_dir_path / "{species}.fasta", From ed1f843a3d2afb77c4e759f775edf2bddf55dda5 Mon Sep 17 00:00:00 2001 From: zubiriguitar Date: Fri, 16 May 2025 01:04:36 +0700 Subject: [PATCH 10/15] gatk_vcf_index fix_2 --- .vscode/settings.json | 3 +++ workflow/rules/vcf_reconstruct.smk | 1 + 2 files changed, 4 insertions(+) create mode 100644 .vscode/settings.json diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..76c50c3 --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,3 @@ +{ + "editor.renderWhitespace": "all" +} \ No newline at end of file diff --git a/workflow/rules/vcf_reconstruct.smk b/workflow/rules/vcf_reconstruct.smk index e270a93..d52529a 100644 --- a/workflow/rules/vcf_reconstruct.smk +++ b/workflow/rules/vcf_reconstruct.smk @@ -27,6 +27,7 @@ if config["vcf_gatk"]: "1> {log.std} 2>&1" + rule gatk_vcf_index: input: vcf="{vcf_dir}/{vcf}", From 2f2e37be0b77f786c60cd0424fdfb6fa155fa553 Mon Sep 17 00:00:00 2001 From: zubiriguitar Date: Fri, 16 May 2025 01:56:38 +0700 Subject: [PATCH 11/15] gatk_altref fix --- workflow/rules/vcf_reconstruct.smk | 31 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 16 deletions(-) diff --git a/workflow/rules/vcf_reconstruct.smk b/workflow/rules/vcf_reconstruct.smk index d52529a..9153e37 100644 --- a/workflow/rules/vcf_reconstruct.smk +++ b/workflow/rules/vcf_reconstruct.smk @@ -27,13 +27,12 @@ if config["vcf_gatk"]: "1> {log.std} 2>&1" - rule gatk_vcf_index: input: vcf="{vcf_dir}/{vcf}", output: tbi="{vcf_dir}/{vcf}.tbi", - sample_list=temp("{vcf_dir}/{vcf}.sample_list.txt"), + sample_list="{vcf_dir}/{vcf}.sample_list.txt", params: gatk_path=config["gatk_path"], log: @@ -68,8 +67,8 @@ checkpoint vcf_separation: params: gatk_path=config["gatk_path"], vcf_basename=lambda wildcards: os.path.basename(wildcards.vcf).replace('.vcf.gz', ''), - output_dir=lambda wildcards: os.path.dirname(wildcards.vcf), original_vcf_dir="results/original_vcf", + reference_prefix=lambda wc: list(Path(wc.vcf_dir).glob("*.fasta"))[0].stem log: std="{vcf_dir}/vcf_separation.{vcf}.log", std_bcf_view="{vcf_dir}/vcf_separation.{vcf}.bcf_view.log", @@ -98,25 +97,25 @@ checkpoint vcf_separation: --threads {resources.cpus} \ --min-ac 1 \ -s "$SAMPLE" \ - -Ov -o "{params.output_dir}/$SAMPLE.{params.vcf_basename}.unfiltered.vcf" \ + -Ov -o "{wildcards.vcf_dir}/$SAMPLE.{reference_prefix}.{params.vcf_basename}.unfiltered.vcf" \ {input.vcf} > {log.std_bcf_view} 2>&1 bcftools filter \ -e 'ALT ~ "[^ATCGN]"' \ - "{params.output_dir}/$SAMPLE.{params.vcf_basename}.unfiltered.vcf" \ - > "{params.output_dir}/$SAMPLE.{params.vcf_basename}.filtered.vcf" 2>{log.std_bcf_filter} + "{wildcards.vcf_dir}/$SAMPLE.{reference_prefix}.{params.vcf_basename}.unfiltered.vcf" \ + > "{wildcards.vcf_dir}/$SAMPLE.{reference_prefix}.{params.vcf_basename}.filtered.vcf" 2>{log.std_bcf_filter} - bgzip -c "{params.output_dir}/$SAMPLE.{params.vcf_basename}.filtered.vcf" \ - > "{params.output_dir}/$SAMPLE.{params.vcf_basename}.vcf.gz" 2>{log.std_zip} + bgzip -c "{wildcards.vcf_dir}/$SAMPLE.{reference_prefix}.{params.vcf_basename}.filtered.vcf" \ + > 
"{wildcards.vcf_dir}/$SAMPLE.{reference_prefix}.{params.vcf_basename}.vcf.gz" 2>{log.std_zip} - tabix -p vcf "{params.output_dir}/$SAMPLE.{params.vcf_basename}.vcf.gz" > {log.std_tabix} 2>&1 + tabix -p vcf "{wildcards.vcf_dir}/$SAMPLE.{reference_prefix}.{params.vcf_basename}.vcf.gz" > {log.std_tabix} 2>&1 {params.gatk_path}/gatk --java-options -Xmx{resources.mem_mb}m \ - IndexFeatureFile -I "{params.output_dir}/$SAMPLE.{params.vcf_basename}.vcf.gz" >{log.std_iff} 2>&1 + IndexFeatureFile -I "{wildcards.vcf_dir}/$SAMPLE.{reference_prefix}.{params.vcf_basename}.vcf.gz" >{log.std_iff} 2>&1 rm -f \ - "{params.output_dir}/$SAMPLE.{params.vcf_basename}.unfiltered.vcf" \ - "{params.output_dir}/$SAMPLE.{params.vcf_basename}.filtered.vcf" + "{wildcards.vcf_dir}/$SAMPLE.{reference_prefix}.{params.vcf_basename}.unfiltered.vcf" \ + "{wildcards.vcf_dir}/$SAMPLE.{reference_prefix}.{params.vcf_basename}.filtered.vcf" done @@ -128,8 +127,10 @@ if config["vcf_gatk"]: input: ref=lambda wc: vcf_reconstruct_map[wc.species]["reference"], refidx=lambda wc: vcf_reconstruct_map[wc.species]["reference"].with_suffix(".dict"), - vcf=lambda wc: checkpoints.vcf_separation.get(vcf=Path(vcf_reconstruct_map[wc.species]["vcf"]).name, - vcf_dir=Path(vcf_reconstruct_map[wc.species]["vcf"]).parent).output.separated, + checkpoint=lambda wc: checkpoints.vcf_separation.get(vcf=Path(vcf_reconstruct_map[wc.species]["vcf"]).name, + vcf_dir=Path(vcf_reconstruct_map[wc.species]["vcf"]).parent).output.separated, + vcf=lambda wc: "%s/{species}.%s.vcf.gz" % (Path(vcf_reconstruct_map[wc.species]["vcf"]).parent, + Path(vcf_reconstruct_map[wc.species]["vcf"]).name.replace('.vcf.gz', '')), vcfidx=lambda wc: vcf_reconstruct_map[wc.species]["vcf"].with_name(vcf_reconstruct_map[wc.species]["vcf"].name + ".tbi"), output: altref_dir_path / "{species}.fasta", @@ -154,8 +155,6 @@ if config["vcf_gatk"]: " --output {output} --reference {input.ref} --variant {input.vcf} --showHidden true --use-iupac-sample {params.sample} 1> {log.std} 2>&1; " - - checkpoint move_altref_to_genomes_dir: input: altref_dir_path / "{species}.fasta", From 03f683c92e7fea543317f486902086845fd84111 Mon Sep 17 00:00:00 2001 From: zubiriguitar Date: Tue, 20 May 2025 01:34:30 +0700 Subject: [PATCH 12/15] Final commit --- README.md | 149 +++++++++++------ Snakefile | 65 +++++++- config/default.yaml | 6 + workflow/envs/main.yaml | 4 +- workflow/rules/alignment.smk | 4 +- workflow/rules/astral.smk | 8 +- workflow/rules/busco.smk | 41 ++--- workflow/rules/concat_alignments.smk | 18 +- workflow/rules/filtration.smk | 8 +- workflow/rules/iqtree.smk | 4 +- workflow/rules/phylip.smk | 4 +- workflow/rules/quastcore.smk | 2 +- workflow/rules/rapidnj.smk | 2 +- workflow/rules/raxml.smk | 4 +- workflow/rules/vcf_reconstruct.smk | 241 ++++++++++++--------------- workflow/rules/visualization.smk | 4 +- 16 files changed, 317 insertions(+), 247 deletions(-) diff --git a/README.md b/README.md index 0511799..0b8472b 100644 --- a/README.md +++ b/README.md @@ -3,83 +3,124 @@ [![Snakemake](https://img.shields.io/badge/snakemake-<8.0-brightgreen.svg)](https://snakemake.github.io) [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) -## Description +# BuscoClade Pipeline Documentation -Pipeline to construct species phylogenies using [BUSCO](https://busco.ezlab.org/). +**BuscoClade** is a **Snakemake**-based workflow that constructs species phylogenies using *BUSCO* (Benchmarking Universal Single-Copy Orthologs). 
It runs multiple analysis stages—from preparing inputs and running BUSCO on each genome, through multiple-sequence alignment (MSA), trimming, and tree-building—to produce a final phylogenetic tree and related visualizations. BuscoClade is organized as modular Snakemake rules; each rule corresponds to a step (e.g. alignment, filtering, tree inference). Snakemake automatically infers dependencies between rules based on input/output files. By leveraging Snakemake and Conda, BuscoClade ensures a reproducible, scalable pipeline that can run locally or on an HPC cluster with minimal user effort. -![Workflow scheme](./workflow.png) +Key components and stages of the pipeline include: -- Alignment: [PRANK](http://wasabiapp.org/software/prank/), [MAFFT](https://mafft.cbrc.jp/alignment/software/). -- Trimming: [GBlocks](https://academic.oup.com/mbe/article/17/4/540/1127654), [TrimAl](http://trimal.cgenomics.org/). -- Phylogenetic tree constraction: [IQTree](http://www.iqtree.org/), [MrBayes](https://nbisweden.github.io/MrBayes/), [ASTRAL III](https://bmcbioinformatics.biomedcentral.com/articles/10.1186/s12859-018-2129-y), [RapidNJ](https://birc.au.dk/software/rapidnj), [PHYLIP](https://phylipweb.github.io/phylip/). -- Visualization: [Etetoolkit](http://etetoolkit.org/), [Matplotlib](https://matplotlib.org/stable/). +* **Input Preparation:** Genome assemblies (FASTA files) and optionally variant data (VCF files) are placed in designated input directories. +* **[BUSCO](https://busco.ezlab.org/) Analysis:** Each genome is analyzed with BUSCO to identify conserved single-copy orthologous genes. BuscoClade can use either the *MetaEuk* or *AUGUSTUS* gene predictor (configurable). The pipeline collects single-copy BUSCO gene IDs that are common across all species. +* **Sequence Extraction:** BUSCO outputs are parsed to extract the DNA or protein sequences of the common single-copy genes from each genome. +* **Alignment:** Extracted gene sequences are aligned across species. The pipeline supports [PRANK](http://wasabiapp.org/software/prank/) or [MAFFT](https://mafft.cbrc.jp/alignment/software/) for alignment; users select by configuration (e.g. `dna_alignment: 'prank'` or `'mafft'`). +* **Filtering/Trimming:** Alignments are cleaned of poorly aligned or gap-rich regions. BuscoClade supports [GBlocks](https://academic.oup.com/mbe/article/17/4/540/1127654) or [TrimAl](http://trimal.cgenomics.org/) for trimming (configurable). This reduces noise before tree inference. +* **Concatenation:** Individual gene alignments are concatenated into a supermatrix (combined alignment). A concatenated alignment file (FASTA, PHYLIP, etc.) is produced for downstream analyses. +* **Phylogenetic Inference:** The pipeline can infer phylogenies using multiple methods: -## Usage + * **[IQTree](http://www.iqtree.org/) (Maximum Likelihood):** Fast ML tree reconstruction with model testing and bootstrapping. + * **[ASTRAL III](https://bmcbioinformatics.biomedcentral.com/articles/10.1186/s12859-018-2129-y) (Coalescent):** Builds a species tree from gene trees (produced by IQ-TREE) using the ASTRAL algorithm for coalescent-based inference. + * **[RapidNJ](https://birc.au.dk/software/rapidnj) (Neighbor Joining):** Quick distance-based tree (with bootstraps) from the concatenated alignment. + * **[PHYLIP](https://phylipweb.github.io/phylip/) (Neighbor Joining/UPGMA):** Tree building via the classic PHYLIP package. + * **[Raxml-NG](https://github.com/amkozlov/raxml-ng) (Maximum Likelihood):** Alternative ML tree builder. 
+ * **[MrBayes](https://nbisweden.github.io/MrBayes/) (Bayesian MCMC):** Optional Bayesian inference (requires more time). +* **VCF-Based Alternative:** Instead of BUSCO, BuscoClade can operate on aligned variant data. Provided multi-sample VCFs can be split per sample and converted to a phylogenetic alignment (via [vcf2phylip](https://github.com/edgardomortiz/vcf2phylip)), enabling phylogeny from SNP data. +* **Visualization:** Final trees are visualized ([Etetoolkit](http://etetoolkit.org/)) and saved as image files. The pipeline also produces summary plots such as BUSCO completeness histograms and charts of ortholog presence using [Matplotlib](https://matplotlib.org/stable/) scripts. -### Step 1. Deploy workflow -To use this workflow, you can either download and extract the [latest release](https://github.com/tomarovsky/BuscoClade/releases) or clone the repository: -``` -git clone https://github.com/Zubiriguitar/BuscoClade.git -``` -### Step 2. Add species genomes +## Quick Start Guide -Place your unpacked FASTA genome assemblies into the `genomes/` directory. Keep in mind that the file prefixes will influence the output phylogeny. Ensure that your files have a `.fasta` extension. +Follow these steps to run BuscoClade on your data. This guide assumes basic familiarity with the command line. -### Step 3. Configure workflow +1. **Clone the Repository:** -To set up the workflow, modify `config/default.yaml`. I recommend to copy config gile and do all modifications in this copy. Some of the options (all nonested options from default.yaml) could also be set via command line using `--config` flag. Sections of config file: + ```bash + git clone https://github.com/Zubiriguitar/BuscoClade.git + cd BuscoClade + ``` -- **Pipeline Configuration:** -This section outlines the workflow. By default, it includes alignments and following filtration of nucleotide sequences, and all tools for phylogeny reconstruction, except for MrBayes (it is recommended to run the GPU compiled version separately). To disable a tool, set its value to `False` or comment out the corresponding line. +2. **Install Requirements:** - **NB!** When constructing a phylogeny using the Neighbor-Joining (NJ) method with PHYLIP, ensure that the first 10 characters of each species name are unique and distinct from one another. + BuscoClade relies on Snakemake and various bioinformatics tools. It is recommended to use Conda to install dependencies. You can either install Snakemake and tools globally or let Snakemake create environments. For example: -- **Tool Parameters:** -Specify parameters for each tool. To perform BUSCO, it is important to specify: - - `busco_dataset_path`: Download the BUSCO dataset beforehand and specify its path here. - - `busco_params`: Use the `--offline` flag and the `--download_path` parameter, indicating the path to the `busco_downloads/` directory. + ```bash + # (Optional) Create a new Conda environment with Snakemake + conda create -n buscoclade_env snakemake python=3.8 -y + conda activate buscoclade_env -- **Directory structure:** -Define output file structure in the `results/` directory. It is recommended to leave it unchanged. + # Or install Snakemake and Mamba/Conda globally as needed + ``` -- **Resources:** -Specify Slurm queue, threads, memory, and runtime for each tool. +3. **Prepare Input Data:** -### Step 4. Execute workflow + * Create the input directory structure: put your genome FASTA files in `input/genomes/`. Each file should have a `.fasta` extension. 
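+
+   Keep in mind that the file stem (the part before `.fasta`) becomes the species label in all downstream alignments and trees, so stems must be unique; the PHYLIP format additionally distinguishes names only by their first 10 characters. A quick pre-flight check for collisions (a small illustrative shell snippet, not part of the pipeline, assuming the `input/genomes/` layout described above):
+
+   ```bash
+   # Print any species labels that are identical within their first 10 characters
+   for f in input/genomes/*.fasta; do basename "$f" .fasta; done | cut -c1-10 | sort | uniq -d
+   ```
+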
For example:
+
+   ```
+   input/genomes/
+       species_1_.fasta
+       ...
+       species_n_.fasta
+   ```
+   * (Optional) If you have variant data instead of raw genomes, place bgzip-compressed VCF files (`.vcf.gz`), together with the reference FASTA file required when using GATK, in `input/vcf_reconstruct/*any_folder*`. The pipeline will split multi-sample VCFs into per-sample VCFs and reconstruct alignments from them. Genome FASTA input can also be combined with VCF input:
+
+   ```
+   input/genomes/
+       species_1_.fasta
+       ...
+       species_n_.fasta
+   input/vcf_reconstruct/*folder_1*
+       reference_1.fasta
+       vcf_file_1.vcf.gz
+       ...
+       vcf_file_m.vcf.gz
+   input/vcf_reconstruct/*folder_k*
+       reference_k.fasta
+       vcf_file_1.vcf.gz
+       ...
+       vcf_file_i.vcf.gz
+   ```
+
+   * If you are using vcf2phylip, place exactly one VCF in `input/vcf_reconstruct/*any_folder*`.
+
+4. **Download BUSCO Datasets:**
+
+   BuscoClade requires a BUSCO lineage dataset. Download the appropriate BUSCO dataset (e.g. `saccharomycetes_odb10`) from the BUSCO website and note its path. Update the `busco_dataset_path` in the config (see next step).
+
+5. **Configure the Workflow:**
+
+   * Open `config/default.yaml` in an editor. Adjust the parameters as needed (see next section for details). At minimum, set:
+
+   ```yaml
+   genome_dir: "input/genomes/"
+   busco_dataset_path: "/path/to/busco/saccharomycetes_odb10/"
+   output_dir: "results/"
+   ```
+   * You can leave many defaults unchanged on first run. The default config enables DNA alignment with PRANK, trimming with Gblocks, and tree inference with IQ-TREE, ASTRAL, RapidNJ, PHYLIP, and RAxML.
+
+6. **Run the Pipeline:**
+
+   Use Snakemake to execute the workflow. For a local run (no cluster), a simple command is:
+
+   ```bash
+   snakemake --cores 4 --use-conda --configfile config/default.yaml
+   ```
+
+   This tells Snakemake to use at most 4 CPU cores and to create/use Conda environments as specified. The `--use-conda` flag enables environment management for reproducibility. Snakemake will execute the pipeline steps in order.
+
+   If you have a SLURM cluster, you can use the provided profile:
+
+   ```bash
+   snakemake --profile profile/slurm --configfile config/default.yaml
+   ```
+
+   This submits jobs to SLURM as per the `profile/slurm/config.yaml` settings. Adjust the `profile/slurm/config.yaml` (queues, time, etc.) if needed for your cluster.
+
+7. **Inspect Results:**
+
+   After completion, see the `results/` directory (or the directory you set).
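+
+   Before browsing the full output tree, a quick sanity check is to list the tree files the run produced. This is a minimal illustration assuming the default directory names from `config/default.yaml`; the exact paths depend on which tree builders you enabled:
+
+   ```bash
+   # Each enabled tree builder writes a Newick tree ending in .treefile
+   find results/ -name "*.treefile" | sort
+   ```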
It will contain subdirectories for alignments, phylogenetic trees, and summary outputs (see **Output Description** below). + +Throughout execution, Snakemake will produce logs in the `logs/` directory (and cluster logs in `cluster_logs/`). You can resume or rerun specific steps by adjusting the command (e.g. adding `-R` to rerun certain rules). ## Contact diff --git a/Snakefile b/Snakefile index 0807c6f..0b0fdda 100644 --- a/Snakefile +++ b/Snakefile @@ -14,10 +14,10 @@ genome_dir_path = Path(config["genome_dir"]).resolve() vcf_reconstruct_dir_path = Path(config["vcf_reconstruct_dir"]).resolve() # -- Logs and benchmarks -- -cluster_log_dir_path = Path(config["cluster_log_dir"]) -log_dir_path = Path(config["log_dir"]) -benchmark_dir_path = Path(config["benchmark_dir"]) -output_dir_path = Path(config["output_dir"]) +cluster_log_dir_path = Path(config["cluster_log_dir"]).resolve() +log_dir_path = Path(config["log_dir"]).resolve() +benchmark_dir_path = Path(config["benchmark_dir"]).resolve() +output_dir_path = Path(config["output_dir"]).resolve() # -- Results -- quastcore_dir_path = output_dir_path / config["quastcore_dir"] @@ -55,8 +55,8 @@ raxml_tree = "{}.fna.raxml.treefile".format(config["alignment_file_prefix"]) # ---- Necessary functions ---- -def get_vcf_reconstruct_map(vcf_dir: Path) -> dict: - """Returns a dictionary where keys are species names and values are dictionaries of VCF and FASTA files.""" +"""def get_vcf_reconstruct_map(vcf_dir: Path) -> dict: + Returns a dictionary where keys are species names and values are dictionaries of VCF and FASTA files. vcf_mapping = {} for vcf_subdir in vcf_dir.iterdir(): if vcf_subdir.is_dir(): @@ -73,9 +73,58 @@ def get_vcf_reconstruct_map(vcf_dir: Path) -> dict: 'vcf': vcf_file, 'reference': ref_file } + return vcf_mapping """ + +def get_vcf_reconstruct_map(vcf_dir: Path) -> dict: + """Returns a dictionary where keys are species names and values are dictionaries of VCF and FASTA files.""" + import gzip + + vcf_mapping = {} + for vcf_subdir in vcf_dir.iterdir(): + if vcf_subdir.is_dir(): + + ref_files = list(vcf_subdir.glob("*.fasta")) + + if not ref_files: + continue + vcf_mapping[vcf_subdir] = {} + ref_file = ref_files[0] + ref_prefix = ref_file.stem + + vcf_mapping[vcf_subdir]["reference_prefix"] = ref_prefix + vcf_mapping[vcf_subdir]["vcf_prefix_list"] = [vcf_file.name.replace(".vcf.gz", "") for vcf_file in vcf_subdir.glob("*.vcf.gz")] + vcf_mapping[vcf_subdir]["sample_dict"] = {} + + for vcf_prefix in vcf_mapping[vcf_subdir]["vcf_prefix_list"]: + vcf_file = vcf_subdir / f"{vcf_prefix}.vcf.gz" + + sample_names = [] + with gzip.open(vcf_file, "rt") as f: + for line in f: + if line.startswith("#CHROM"): + parts = line.strip().split("\t") + sample_names = parts[9:] + break + + vcf_mapping[vcf_subdir]["sample_dict"][vcf_prefix] = sample_names + return vcf_mapping +def vcf_species(vcf_reconstruct_map): + vcf_species_list = [] + vcf_species_location_dict = {} + for vcf_subdir in vcf_reconstruct_map: + for vcf_prefix in vcf_reconstruct_map[vcf_subdir]['vcf_prefix_list']: + for sample_name in vcf_reconstruct_map[vcf_subdir]['sample_dict'][vcf_prefix]: + vcf_species_list.append(f"{sample_name}.{vcf_prefix}.{vcf_reconstruct_map[vcf_subdir]['reference_prefix']}") + vcf_species_location_dict[vcf_species_list[-1]] = vcf_subdir + return vcf_species_list, vcf_species_location_dict +# altref_dir_path / "{sample}.{vcf_prefix}.{reference}.fasta", +# {vcf_subfolder: {"vcf_prefix_list": [], "reference_prefix": "reference_prefix", "sample_dict": {"vcf_prefix": 
[]}}, } +#{"{sample}.{vcf_prefix}.{reference}": vcf_subfolder} + + def get_species_list(vcf_species: list, genome_species: list) -> list: """Merges and returns final species list""" return list(set(vcf_species + genome_species)) @@ -108,11 +157,11 @@ if busco_blacklist_path.exists() and (busco_blacklist_path.stat().st_size > 0): busco_blacklist = pd.read_csv(busco_blacklist_path, sep="\t", header=None).squeeze() # --------------------------------------------------------------------- - # ---- Input data ---- genome_species = [f.stem for f in genome_dir_path.glob("*.fasta") if f.is_file()] vcf_reconstruct_map = get_vcf_reconstruct_map(vcf_reconstruct_dir_path) -vcf_reconstruct_species = list(vcf_reconstruct_map.keys()) +vcf_reconstruct_species, vcf_species_location_dict = vcf_species(vcf_reconstruct_map) + if "species_list" not in config: config["species_list"] = get_species_list(genome_species, vcf_reconstruct_species) diff --git a/config/default.yaml b/config/default.yaml index feda1fa..dc5afb8 100644 --- a/config/default.yaml +++ b/config/default.yaml @@ -144,6 +144,9 @@ benchmark_dir: "benchmarks/" # NOTE : Be aware of Snakemake parallelization when specifying resources # ---- Slurm partition ---- + +vcf_separation_queue: "main" + busco_queue: "main" alignment_queue: "main" filtration_queue: "main" @@ -156,6 +159,7 @@ raxml_queue: "main" processing_queue: "main" # ---- Tool threads ---- +vcf_separation_threads: 4 busco_threads: 40 mafft_threads: 8 prank_threads: 8 @@ -171,6 +175,7 @@ raxml_threads: 16 processing_threads: 20 # ---- Tool memory ---- +vcf_separation_mem_mb: 10000 busco_mem_mb: 120000 mafft_mem_mb: 20000 prank_mem_mb: 20000 @@ -187,6 +192,7 @@ raxml_mem_mb: 10000 processing_mem_mb: 20000 # ---- Tool time ---- +vcf_separation_time: "10:00:00" busco_time: "150:00:00" mafft_time: "10:00:00" prank_time: "100:00:00" diff --git a/workflow/envs/main.yaml b/workflow/envs/main.yaml index ea6b384..b5eb3fb 100644 --- a/workflow/envs/main.yaml +++ b/workflow/envs/main.yaml @@ -1,9 +1,9 @@ -name: buscoclade +name: buscoclade_main channels: - conda-forge - bioconda dependencies: - - busco=5.5.0 + - busco=5.8.0 - biopython - prank - mafft diff --git a/workflow/rules/alignment.smk b/workflow/rules/alignment.smk index e3fb4f7..389d8a9 100644 --- a/workflow/rules/alignment.smk +++ b/workflow/rules/alignment.smk @@ -16,7 +16,7 @@ if "dna_alignment" in config: benchmark: benchmark_dir_path / "prank_dna.{N}.benchmark.txt" conda: - config["conda"]["buscoclade"]["name"] if config["use_existing_envs"] else ("../../%s" % config["conda"]["buscoclade"]["yaml"]) + config["conda"]["buscoclade_main"]["name"] if config["use_existing_envs"] else ("../../%s" % config["conda"]["buscoclade_main"]["yaml"]) resources: queue=config["alignment_queue"], cpus=config["prank_threads"], @@ -43,7 +43,7 @@ if "dna_alignment" in config: benchmark: benchmark_dir_path / "mafft_dna.{N}.benchmark.txt" conda: - config["conda"]["buscoclade"]["name"] if config["use_existing_envs"] else ("../../%s" % config["conda"]["buscoclade"]["yaml"]) + config["conda"]["buscoclade_main"]["name"] if config["use_existing_envs"] else ("../../%s" % config["conda"]["buscoclade_main"]["yaml"]) resources: queue=config["alignment_queue"], cpus=config["mafft_threads"], diff --git a/workflow/rules/astral.smk b/workflow/rules/astral.smk index 1ef8916..d98fa6e 100644 --- a/workflow/rules/astral.smk +++ b/workflow/rules/astral.smk @@ -23,7 +23,7 @@ if config["vcf2phylip"] != True: benchmark: benchmark_dir_path / "iqtree_per_fna.{N}.benchmark.txt" conda: 
- config["conda"]["buscoclade"]["name"] if config["use_existing_envs"] else ("../../%s" % config["conda"]["buscoclade"]["yaml"]) + config["conda"]["buscoclade_main"]["name"] if config["use_existing_envs"] else ("../../%s" % config["conda"]["buscoclade_main"]["yaml"]) resources: queue=config["astral_queue"], cpus=config["iqtree_per_fna_threads"], @@ -46,7 +46,7 @@ if config["vcf2phylip"] != True: benchmark: benchmark_dir_path / "concat_newick_files.benchmark.txt" conda: - config["conda"]["buscoclade"]["name"] if config["use_existing_envs"] else ("../../%s" % config["conda"]["buscoclade"]["yaml"]) + config["conda"]["buscoclade_main"]["name"] if config["use_existing_envs"] else ("../../%s" % config["conda"]["buscoclade_main"]["yaml"]) resources: queue=config["processing_queue"], cpus=config["processing_threads"], @@ -70,7 +70,7 @@ if config["vcf2phylip"] != True: benchmark: benchmark_dir_path / "nodes_filtrataion_by_support.benchmark.txt" conda: - config["conda"]["buscoclade"]["name"] if config["use_existing_envs"] else ("../../%s" % config["conda"]["buscoclade"]["yaml"]) + config["conda"]["buscoclade_main"]["name"] if config["use_existing_envs"] else ("../../%s" % config["conda"]["buscoclade_main"]["yaml"]) resources: queue=config["processing_queue"], cpus=config["processing_threads"], @@ -94,7 +94,7 @@ if config["vcf2phylip"] != True: benchmark: benchmark_dir_path / "astral_tree.benchmark.txt" conda: - config["conda"]["buscoclade"]["name"] if config["use_existing_envs"] else ("../../%s" % config["conda"]["buscoclade"]["yaml"]) + config["conda"]["buscoclade_main"]["name"] if config["use_existing_envs"] else ("../../%s" % config["conda"]["buscoclade_main"]["yaml"]) resources: queue=config["astral_queue"], cpus=config["astral_threads"], diff --git a/workflow/rules/busco.smk b/workflow/rules/busco.smk index b7b03ed..4b18782 100644 --- a/workflow/rules/busco.smk +++ b/workflow/rules/busco.smk @@ -14,13 +14,14 @@ if config["busco_gene_prediction_tool"] == "metaeuk": busco_options=config["busco_options"], output_prefix="{species}", log: - std=log_dir_path / "busco.{species}.log", + std=(log_dir_path / "busco.{species}.log").resolve(), + busco_log=(log_dir_path / "busco_log.{species}.log").resolve(), cluster_log=cluster_log_dir_path / "busco.{species}.cluster.log", cluster_err=cluster_log_dir_path / "busco.{species}.cluster.err", benchmark: benchmark_dir_path / "busco.{species}.benchmark.txt" conda: - config["conda"]["buscoclade"]["name"] if config["use_existing_envs"] else ("../../%s" % config["conda"]["buscoclade"]["yaml"]) + config["conda"]["buscoclade_main"]["name"] if config["use_existing_envs"] else ("../../%s" % config["conda"]["buscoclade_main"]["yaml"]) resources: queue=config["busco_queue"], cpus=config["busco_threads"], @@ -28,17 +29,17 @@ if config["busco_gene_prediction_tool"] == "metaeuk": mem_mb=config["busco_mem_mb"], threads: config["busco_threads"] shell: - " MYPWD=$(pwd); mkdir -p {output.busco_outdir}; cd {output.busco_outdir}; " + " mkdir -p {output.busco_outdir}; cd {output.busco_outdir}; " " busco --metaeuk -m {params.mode} -i {input} -c {threads} -l {params.busco_dataset_path} " - " -o {params.output_prefix} {params.busco_options} 1> $MYPWD/{log.std} 2>&1; " - " mv {params.output_prefix}/* . 1> $MYPWD/{log.std} 2>&1; " - " rm -r {params.output_prefix}/ 1> $MYPWD/{log.std} 2>&1; " - " rm -r busco_sequences/ 1> $MYPWD/{log.std} 2>&1; " - " mv run*/* . 
1> $MYPWD/{log.std} 2>&1; " - " rm -r run* 1> $MYPWD/{log.std} 2>&1; " - " mv full_table.tsv full_table_{params.output_prefix}.tsv 1> $MYPWD/{log.std} 2>&1; " - " mv missing_busco_list.tsv missing_busco_list_{params.output_prefix}.tsv 1> $MYPWD/{log.std} 2>&1; " - " mv short_summary.txt short_summary_{params.output_prefix}.txt 1> $MYPWD/{log.std} 2>&1; " + " -o {params.output_prefix} {params.busco_options} 1> {log.busco_log} 2>&1; " + " mv {params.output_prefix}/* . 1> {log.std} 2>&1; " + " rm -r {params.output_prefix}/ 1>> {log.std} 2>&1; " + " rm -r busco_sequences/ 1>> {log.std} 2>&1; " + " mv run*/* . 1>> {log.std} 2>&1; " + " rm -r run* 1>> {log.std} 2>&1; " + " mv full_table.tsv full_table_{params.output_prefix}.tsv 1>> {log.std} 2>&1; " + " mv missing_busco_list.tsv missing_busco_list_{params.output_prefix}.tsv 1>> {log.std} 2>&1; " + " mv short_summary.txt short_summary_{params.output_prefix}.txt 1>> {log.std} 2>&1; " elif config["busco_gene_prediction_tool"] == "augustus": @@ -64,7 +65,7 @@ elif config["busco_gene_prediction_tool"] == "augustus": benchmark: benchmark_dir_path / "busco.{species}.benchmark.txt" conda: - config["conda"]["buscoclade"]["name"] if config["use_existing_envs"] else ("../../%s" % config["conda"]["buscoclade"]["yaml"]) + config["conda"]["buscoclade_main"]["name"] if config["use_existing_envs"] else ("../../%s" % config["conda"]["buscoclade_main"]["yaml"]) resources: queue=config["busco_queue"], cpus=config["busco_threads"], @@ -75,10 +76,10 @@ elif config["busco_gene_prediction_tool"] == "augustus": " MYPWD=$(pwd); mkdir -p {output.busco_outdir}; cd {output.busco_outdir}; " " busco --augustus --augustus_species {params.species} -m {params.mode} -i {input} -c {threads} " " -l {params.busco_dataset_path} -o {params.output_prefix} {params.busco_options} 1> $MYPWD/{log.std} 2>&1; " - " mv {params.output_prefix}/* ./ 1> $MYPWD/{log.std} 2>&1; " - " rm -r {params.output_prefix}/ 1> $MYPWD/{log.std} 2>&1; " - " rm -r augustus_output/ 1> $MYPWD/{log.std} 2>&1; " - " mv run*/* . ; rm -r run* 1> $MYPWD/{log.std} 2>&1; " - " mv full_table.tsv full_table_{params.output_prefix}.tsv 1> $MYPWD/{log.std} 2>&1; " - " mv missing_busco_list.tsv missing_busco_list_{params.output_prefix}.tsv 1> $MYPWD/{log.std} 2>&1; " - " mv short_summary.txt short_summary_{params.output_prefix}.txt 1> $MYPWD/{log.std} 2>&1; " \ No newline at end of file + " mv {params.output_prefix}/* ./ 1>> $MYPWD/{log.std} 2>&1; " + " rm -r {params.output_prefix}/ 1>> $MYPWD/{log.std} 2>&1; " + " rm -r augustus_output/ 1>> $MYPWD/{log.std} 2>&1; " + " mv run*/* . 
; rm -r run* 1>> $MYPWD/{log.std} 2>&1; "
+        " mv full_table.tsv full_table_{params.output_prefix}.tsv 1>> $MYPWD/{log.std} 2>&1; "
+        " mv missing_busco_list.tsv missing_busco_list_{params.output_prefix}.tsv 1>> $MYPWD/{log.std} 2>&1; "
+        " mv short_summary.txt short_summary_{params.output_prefix}.txt 1>> $MYPWD/{log.std} 2>&1; "
\ No newline at end of file
diff --git a/workflow/rules/concat_alignments.smk b/workflow/rules/concat_alignments.smk
index 90026b7..04b6ee3 100644
--- a/workflow/rules/concat_alignments.smk
+++ b/workflow/rules/concat_alignments.smk
@@ -15,7 +15,7 @@ rule concat_fasta_dna:
     benchmark:
         benchmark_dir_path / "concat_fasta_dna.benchmark.txt"
     conda:
-        config["conda"]["buscoclade"]["name"] if config["use_existing_envs"] else ("../../%s" % config["conda"]["buscoclade"]["yaml"])
+        config["conda"]["buscoclade_main"]["name"] if config["use_existing_envs"] else ("../../%s" % config["conda"]["buscoclade_main"]["yaml"])
     resources:
         queue=config["processing_queue"],
         cpus=config["processing_threads"],
@@ -23,7 +23,7 @@ rule concat_fasta_dna:
         mem_mb=config["processing_mem_mb"],
     shell:
         """
-        if {params.vcf2phylip}; then
+        if [ "{params.vcf2phylip}" = "True" ]; then
             mv {input} {output} 1> {log.std} 2>&1
         else
             workflow/scripts/concat_fasta.py -i {input} -o {output} 1> {log.std} 2>&1
@@ -43,7 +43,7 @@ rule concat_fasta_protein:
     benchmark:
         benchmark_dir_path / "concat_fasta_protein.benchmark.txt"
     conda:
-        config["conda"]["buscoclade"]["name"] if config["use_existing_envs"] else ("../../%s" % config["conda"]["buscoclade"]["yaml"])
+        config["conda"]["buscoclade_main"]["name"] if config["use_existing_envs"] else ("../../%s" % config["conda"]["buscoclade_main"]["yaml"])
     resources:
         queue=config["processing_queue"],
         cpus=config["processing_threads"],
@@ -68,7 +68,7 @@ rule concat_nexus_dna:
     benchmark:
         benchmark_dir_path / "concat_nexus_dna.benchmark.txt"
     conda:
-        config["conda"]["buscoclade"]["name"] if config["use_existing_envs"] else ("../../%s" % config["conda"]["buscoclade"]["yaml"])
+        config["conda"]["buscoclade_main"]["name"] if config["use_existing_envs"] else ("../../%s" % config["conda"]["buscoclade_main"]["yaml"])
     resources:
         queue=config["processing_queue"],
         cpus=config["processing_threads"],
@@ -94,7 +94,7 @@ rule concat_nexus_protein:
     benchmark:
         benchmark_dir_path / "concat_nexus_protein.benchmark.txt"
     conda:
-        config["conda"]["buscoclade"]["name"] if config["use_existing_envs"] else ("../../%s" % config["conda"]["buscoclade"]["yaml"])
+        config["conda"]["buscoclade_main"]["name"] if config["use_existing_envs"] else ("../../%s" % config["conda"]["buscoclade_main"]["yaml"])
     resources:
         queue=config["processing_queue"],
         cpus=config["processing_threads"],
@@ -117,7 +117,7 @@ rule concat_stockholm_dna:
     benchmark:
         benchmark_dir_path / "concat_stockholm_dna.benchmark.txt"
     conda:
-        config["conda"]["buscoclade"]["name"] if config["use_existing_envs"] else ("../../%s" % config["conda"]["buscoclade"]["yaml"])
+        config["conda"]["buscoclade_main"]["name"] if config["use_existing_envs"] else ("../../%s" % config["conda"]["buscoclade_main"]["yaml"])
     resources:
         queue=config["processing_queue"],
         cpus=config["processing_threads"],
@@ -139,7 +139,7 @@ rule concat_stockholm_protein:
     benchmark:
         benchmark_dir_path / "concat_stockholm_protein.benchmark.txt"
     conda:
-        config["conda"]["buscoclade"]["name"] if config["use_existing_envs"] else ("../../%s" % config["conda"]["buscoclade"]["yaml"])
+        config["conda"]["buscoclade_main"]["name"] if config["use_existing_envs"] else ("../../%s" % 
config["conda"]["buscoclade_main"]["yaml"]) resources: queue=config["processing_queue"], cpus=config["processing_threads"], @@ -163,7 +163,7 @@ rule concat_phylip_dna: benchmark: benchmark_dir_path / "concat_phylip_dna.benchmark.txt" conda: - config["conda"]["buscoclade"]["name"] if config["use_existing_envs"] else ("../../%s" % config["conda"]["buscoclade"]["yaml"]) + config["conda"]["buscoclade_main"]["name"] if config["use_existing_envs"] else ("../../%s" % config["conda"]["buscoclade_main"]["yaml"]) resources: queue=config["processing_queue"], cpus=config["processing_threads"], @@ -188,7 +188,7 @@ rule concat_phylip_protein: benchmark: benchmark_dir_path / "concat_phylip_protein.benchmark.txt" conda: - config["conda"]["buscoclade"]["name"] if config["use_existing_envs"] else ("../../%s" % config["conda"]["buscoclade"]["yaml"]) + config["conda"]["buscoclade_main"]["name"] if config["use_existing_envs"] else ("../../%s" % config["conda"]["buscoclade_main"]["yaml"]) resources: queue=config["processing_queue"], cpus=config["processing_threads"], diff --git a/workflow/rules/filtration.smk b/workflow/rules/filtration.smk index 82af0e1..d455e04 100644 --- a/workflow/rules/filtration.smk +++ b/workflow/rules/filtration.smk @@ -16,7 +16,7 @@ if "dna_filtration" in config: benchmark: benchmark_dir_path / "{N}.fna.gblocks.benchmark.txt" conda: - config["conda"]["buscoclade"]["name"] if config["use_existing_envs"] else ("../../%s" % config["conda"]["buscoclade"]["yaml"]) + config["conda"]["buscoclade_main"]["name"] if config["use_existing_envs"] else ("../../%s" % config["conda"]["buscoclade_main"]["yaml"]) resources: queue=config["filtration_queue"], cpus=config["gblocks_threads"], @@ -45,7 +45,7 @@ if "dna_filtration" in config: benchmark: benchmark_dir_path / "trimal_dna.{N}.benchmark.txt" conda: - config["conda"]["buscoclade"]["name"] if config["use_existing_envs"] else ("../../%s" % config["conda"]["buscoclade"]["yaml"]) + config["conda"]["buscoclade_main"]["name"] if config["use_existing_envs"] else ("../../%s" % config["conda"]["buscoclade_main"]["yaml"]) resources: queue=config["filtration_queue"], cpus=config["trimal_threads"], @@ -75,7 +75,7 @@ if "protein_filtration" in config: benchmark: benchmark_dir_path / "{N}.faa.gblocks.benchmark.txt" conda: - config["conda"]["buscoclade"]["name"] if config["use_existing_envs"] else ("../../%s" % config["conda"]["buscoclade"]["yaml"]) + config["conda"]["buscoclade_main"]["name"] if config["use_existing_envs"] else ("../../%s" % config["conda"]["buscoclade_main"]["yaml"]) resources: queue=config["filtration_queue"], cpus=config["gblocks_threads"], @@ -104,7 +104,7 @@ if "protein_filtration" in config: benchmark: benchmark_dir_path / "trimal_protein.{N}.benchmark.txt" conda: - config["conda"]["buscoclade"]["name"] if config["use_existing_envs"] else ("../../%s" % config["conda"]["buscoclade"]["yaml"]) + config["conda"]["buscoclade_main"]["name"] if config["use_existing_envs"] else ("../../%s" % config["conda"]["buscoclade_main"]["yaml"]) resources: queue=config["filtration_queue"], cpus=config["trimal_threads"], diff --git a/workflow/rules/iqtree.smk b/workflow/rules/iqtree.smk index a11a20e..133d890 100644 --- a/workflow/rules/iqtree.smk +++ b/workflow/rules/iqtree.smk @@ -22,7 +22,7 @@ if config["iqtree_dna"]: benchmark: benchmark_dir_path / "iqtree_dna.benchmark.txt" conda: - config["conda"]["buscoclade"]["name"] if config["use_existing_envs"] else ("../../%s" % config["conda"]["buscoclade"]["yaml"]) + config["conda"]["buscoclade_main"]["name"] 
if config["use_existing_envs"] else ("../../%s" % config["conda"]["buscoclade_main"]["yaml"]) resources: queue=config["iqtree_queue"], cpus=config["iqtree_threads"], @@ -58,7 +58,7 @@ if config["iqtree_protein"]: benchmark: benchmark_dir_path / "iqtree_protein.benchmark.txt" conda: - config["conda"]["buscoclade"]["name"] if config["use_existing_envs"] else ("../../%s" % config["conda"]["buscoclade"]["yaml"]) + config["conda"]["buscoclade_main"]["name"] if config["use_existing_envs"] else ("../../%s" % config["conda"]["buscoclade_main"]["yaml"]) resources: queue=config["iqtree_queue"], cpus=config["iqtree_threads"], diff --git a/workflow/rules/phylip.smk b/workflow/rules/phylip.smk index 9956959..0367fd2 100644 --- a/workflow/rules/phylip.smk +++ b/workflow/rules/phylip.smk @@ -17,7 +17,7 @@ rule phylip_dnadist: benchmark: benchmark_dir_path / "phylip_dnadist.benchmark.txt" conda: - config["conda"]["buscoclade"]["name"] if config["use_existing_envs"] else ("../../%s" % config["conda"]["buscoclade"]["yaml"]) + config["conda"]["buscoclade_main"]["name"] if config["use_existing_envs"] else ("../../%s" % config["conda"]["buscoclade_main"]["yaml"]) resources: queue=config["phylip_queue"], cpus=config["phylip_threads"], @@ -48,7 +48,7 @@ rule phylip_neighbor: benchmark: benchmark_dir_path / "phylip_neighbor.benchmark.txt" conda: - config["conda"]["buscoclade"]["name"] if config["use_existing_envs"] else ("../../%s" % config["conda"]["buscoclade"]["yaml"]) + config["conda"]["buscoclade_main"]["name"] if config["use_existing_envs"] else ("../../%s" % config["conda"]["buscoclade_main"]["yaml"]) resources: queue=config["phylip_queue"], cpus=config["phylip_threads"], diff --git a/workflow/rules/quastcore.smk b/workflow/rules/quastcore.smk index 0f329e4..43a112c 100644 --- a/workflow/rules/quastcore.smk +++ b/workflow/rules/quastcore.smk @@ -12,7 +12,7 @@ rule quastcore: benchmark: benchmark_dir_path / "quastcore.benchmark.txt" conda: - config["conda"]["buscoclade"]["name"] if config["use_existing_envs"] else ("../../%s" % config["conda"]["buscoclade"]["yaml"]), + config["conda"]["buscoclade_main"]["name"] if config["use_existing_envs"] else ("../../%s" % config["conda"]["buscoclade_main"]["yaml"]), resources: queue=config["processing_queue"], cpus=config["processing_threads"], diff --git a/workflow/rules/rapidnj.smk b/workflow/rules/rapidnj.smk index e25f2da..e11f03b 100644 --- a/workflow/rules/rapidnj.smk +++ b/workflow/rules/rapidnj.smk @@ -12,7 +12,7 @@ rule rapidnj_tree: benchmark: benchmark_dir_path / "rapidnj_tree.benchmark.txt" conda: - config["conda"]["buscoclade"]["name"] if config["use_existing_envs"] else ("../../%s" % config["conda"]["buscoclade"]["yaml"]) + config["conda"]["buscoclade_main"]["name"] if config["use_existing_envs"] else ("../../%s" % config["conda"]["buscoclade_main"]["yaml"]) resources: queue=config["rapidnj_queue"], cpus=config["rapidnj_threads"], diff --git a/workflow/rules/raxml.smk b/workflow/rules/raxml.smk index bb39d87..dc3e24d 100644 --- a/workflow/rules/raxml.smk +++ b/workflow/rules/raxml.smk @@ -19,7 +19,7 @@ if config["raxml"]: benchmark: benchmark_dir_path / "raxml.benchmark.txt", conda: - config["conda"]["buscoclade"]["name"] if config["use_existing_envs"] else ("../../%s" % config["conda"]["buscoclade"]["yaml"]) + config["conda"]["buscoclade_main"]["name"] if config["use_existing_envs"] else ("../../%s" % config["conda"]["buscoclade_main"]["yaml"]) resources: queue=config["raxml_queue"], cpus=config["raxml_threads"], @@ -44,7 +44,7 @@ if config["raxml"]: 
benchmark: benchmark_dir_path / "raxml.benchmark.txt", conda: - config["conda"]["buscoclade"]["name"] if config["use_existing_envs"] else ("../../%s" % config["conda"]["buscoclade"]["yaml"]) + config["conda"]["buscoclade_main"]["name"] if config["use_existing_envs"] else ("../../%s" % config["conda"]["buscoclade_main"]["yaml"]) resources: queue=config["raxml_queue"], cpus=config["raxml_threads"], diff --git a/workflow/rules/vcf_reconstruct.smk b/workflow/rules/vcf_reconstruct.smk index 9153e37..cd6d6a4 100644 --- a/workflow/rules/vcf_reconstruct.smk +++ b/workflow/rules/vcf_reconstruct.smk @@ -4,158 +4,115 @@ localrules: if config["vcf_gatk"]: rule picard_index: input: - "{reference}.fasta", + "{reference_dir}/{reference}.fasta", output: - dict="{reference}.dict", - fai="{reference}.fasta.fai", + dict="{reference_dir}/{reference}.dict", + fai="{reference_dir}/{reference}.fasta.fai", log: - std=log_dir_path / "picard_index.{reference}.log", - cluster_log=cluster_log_dir_path / "picard_index.{reference}.cluster.log", - cluster_err=cluster_log_dir_path / "picard_index.{reference}.cluster.err", + std= "{reference_dir}/picard_index.{reference}.log", + cluster_log= "{reference_dir}/picard_index.{reference}.cluster.log", + cluster_err= "{reference_dir}/picard_index.{reference}.cluster.err", benchmark: - benchmark_dir_path / "picard_index.{reference}.benchmark.txt" + "{reference_dir}/picard_index.{reference}.benchmark.txt" conda: config["conda"]["buscoclade_gatk"]["name"] if config["use_existing_envs"] else ("../../%s" % config["conda"]["buscoclade_gatk"]["yaml"]), resources: - queue=config["processing_queue"], - cpus=config["processing_threads"], - time=config["processing_time"], - mem_mb=config["processing_mem_mb"], + queue=config["vcf_processing_queue"], + cpus=config["vcf_processing_threads"], + time=config["vcf_processing_time"], + mem_mb=config["vcf_processing_mem_mb"], shell: "picard CreateSequenceDictionary -R {input} && " "samtools faidx {input} " "1> {log.std} 2>&1" -rule gatk_vcf_index: - input: - vcf="{vcf_dir}/{vcf}", - output: - tbi="{vcf_dir}/{vcf}.tbi", - sample_list="{vcf_dir}/{vcf}.sample_list.txt", - params: - gatk_path=config["gatk_path"], - log: - std_gatk= "{vcf_dir}/gatk_vcf_index.{vcf}.log", - std_bcf= "{vcf_dir}/gatk_vcf_index.{vcf}.bcf.log", - std_grep= "{vcf_dir}/gatk_vcf_index.{vcf}.grep.log", - cluster_log="{vcf_dir}/gatk_vcf_index.{vcf}.cluster.log", - cluster_err="{vcf_dir}/gatk_vcf_index.{vcf}.cluster.err", - benchmark: - "{vcf_dir}/gatk_vcf_index.{vcf}.benchmark.txt" - conda: - config["conda"]["buscoclade_gatk"]["name"] if config["use_existing_envs"] else ("../../%s" % config["conda"]["buscoclade_gatk"]["yaml"]) - resources: - queue=config["processing_queue"], - cpus=config["processing_threads"], - time=config["processing_time"], - mem_mb=config["processing_mem_mb"], - shell: - """ - {params.gatk_path}/gatk --java-options -Xmx{resources.mem_mb}m IndexFeatureFile -I {input.vcf} > {log.std_gatk} 2>&1; - bcftools query -l {input.vcf} 2>{log.std_bcf}| - grep -v '^$' > {output.sample_list} 2>{log.std_grep}; - """ - -checkpoint vcf_separation: - input: - vcf="{vcf_dir}/{vcf}", - tbi="{vcf_dir}/{vcf}.tbi", - sample_list="{vcf_dir}/{vcf}.sample_list.txt", - output: - separated="{vcf_dir}/{vcf}.separation.done", - params: - gatk_path=config["gatk_path"], - vcf_basename=lambda wildcards: os.path.basename(wildcards.vcf).replace('.vcf.gz', ''), - original_vcf_dir="results/original_vcf", - reference_prefix=lambda wc: list(Path(wc.vcf_dir).glob("*.fasta"))[0].stem - log: - 
std="{vcf_dir}/vcf_separation.{vcf}.log", - std_bcf_view="{vcf_dir}/vcf_separation.{vcf}.bcf_view.log", - std_bcf_filter="{vcf_dir}/vcf_separation.{vcf}.bcf_filter.log", - std_zip="{vcf_dir}/vcf_separation.{vcf}.zip.log", - std_tabix="{vcf_dir}/vcf_separation.{vcf}.tabix.log", - std_iff="{vcf_dir}/vcf_separation.{vcf}.iff.log", - cluster_log="{vcf_dir}/vcf_separation.{vcf}.cluster.log", - cluster_err="{vcf_dir}/vcf_separation.{vcf}.cluster.err", - benchmark: - "{vcf_dir}/vcf_separation.{vcf}.benchmark.txt" - conda: - config["conda"]["buscoclade_gatk"]["name"] if config["use_existing_envs"] else ("../../%s" % config["conda"]["buscoclade_gatk"]["yaml"]) - resources: - queue=config["processing_queue"], - cpus=config["processing_threads"], - time=config["processing_time"], - mem_mb=config["processing_mem_mb"], - shell: - r""" - SAMPLES=($(grep -v '^$' {input.sample_list})) - - for SAMPLE in "${{SAMPLES[@]}}"; do - - bcftools view \ - --threads {resources.cpus} \ - --min-ac 1 \ - -s "$SAMPLE" \ - -Ov -o "{wildcards.vcf_dir}/$SAMPLE.{reference_prefix}.{params.vcf_basename}.unfiltered.vcf" \ - {input.vcf} > {log.std_bcf_view} 2>&1 - - bcftools filter \ - -e 'ALT ~ "[^ATCGN]"' \ - "{wildcards.vcf_dir}/$SAMPLE.{reference_prefix}.{params.vcf_basename}.unfiltered.vcf" \ - > "{wildcards.vcf_dir}/$SAMPLE.{reference_prefix}.{params.vcf_basename}.filtered.vcf" 2>{log.std_bcf_filter} - - bgzip -c "{wildcards.vcf_dir}/$SAMPLE.{reference_prefix}.{params.vcf_basename}.filtered.vcf" \ - > "{wildcards.vcf_dir}/$SAMPLE.{reference_prefix}.{params.vcf_basename}.vcf.gz" 2>{log.std_zip} - - tabix -p vcf "{wildcards.vcf_dir}/$SAMPLE.{reference_prefix}.{params.vcf_basename}.vcf.gz" > {log.std_tabix} 2>&1 - - {params.gatk_path}/gatk --java-options -Xmx{resources.mem_mb}m \ - IndexFeatureFile -I "{wildcards.vcf_dir}/$SAMPLE.{reference_prefix}.{params.vcf_basename}.vcf.gz" >{log.std_iff} 2>&1 - - rm -f \ - "{wildcards.vcf_dir}/$SAMPLE.{reference_prefix}.{params.vcf_basename}.unfiltered.vcf" \ - "{wildcards.vcf_dir}/$SAMPLE.{reference_prefix}.{params.vcf_basename}.filtered.vcf" + rule gatk_vcf_index: + input: + vcf="{vcf_dir}/{vcf}", + output: + tbi="{vcf_dir}/{vcf}.tbi", + params: + gatk_path=config["gatk_path"], + log: + std_gatk= "{vcf_dir}/gatk_vcf_index.{vcf}.log", + cluster_log="{vcf_dir}/gatk_vcf_index.{vcf}.cluster.log", + cluster_err="{vcf_dir}/gatk_vcf_index.{vcf}.cluster.err", + benchmark: + "{vcf_dir}/gatk_vcf_index.{vcf}.benchmark.txt" + conda: + config["conda"]["buscoclade_gatk"]["name"] if config["use_existing_envs"] else ("../../%s" % config["conda"]["buscoclade_gatk"]["yaml"]) + resources: + queue=config["vcf_processing_queue"], + cpus=config["vcf_processing_threads"], + time=config["vcf_processing_time"], + mem_mb=config["vcf_processing_mem_mb"], + shell: + """ + {params.gatk_path}/gatk --java-options -Xmx{resources.mem_mb}m IndexFeatureFile -I {input.vcf} > {log.std_gatk} 2>&1; + """ - done + - touch {output.separated} - """ + rule vcf_separation: + input: + vcf="{vcf_dir}/{vcf_prefix}.vcf.gz", + output: + separated_vcf="{vcf_dir}/tmp/{sample}.{vcf_prefix}.vcf.gz", + params: + gatk_path=config["gatk_path"], + log: + std="{vcf_dir}/vcf_separation.{sample}.{vcf_prefix}.log", + std_bcf_view="{vcf_dir}/vcf_separation.{sample}.{vcf_prefix}.bcf_view.log", + std_bcf_filter="{vcf_dir}/vcf_separation.{sample}.{vcf_prefix}.bcf_filter.log", + std_zip="{vcf_dir}/vcf_separation.{sample}.{vcf_prefix}.zip.log", + cluster_log="{vcf_dir}/vcf_separation.{sample}.{vcf_prefix}.cluster.log", + 
cluster_err="{vcf_dir}/vcf_separation.{sample}.{vcf_prefix}.cluster.err", + benchmark: + "{vcf_dir}/vcf_separation.{sample}.{vcf_prefix}.benchmark.txt" + conda: + config["conda"]["buscoclade_gatk"]["name"] if config["use_existing_envs"] else ("../../%s" % config["conda"]["buscoclade_gatk"]["yaml"]) + resources: + queue=config["vcf_separation_queue"], + cpus=config["vcf_separation_threads"], + time=config["vcf_separation_time"], + mem_mb=config["vcf_separation_mem_mb"], + shell: + """ + bcftools view --threads {resources.cpus} --min-ac 1 -s {wildcards.sample} -Ov {input.vcf} 2> {log.std_bcf_view} |\ + bcftools filter -e 'ALT ~ "[^ATCGN]"' 2>{log.std_bcf_filter} |\ + bgzip -c > "{wildcards.vcf_dir}/tmp/{wildcards.sample}.{wildcards.vcf_prefix}.vcf.gz" 2>{log.std_zip} + """ -if config["vcf_gatk"]: rule gatk_altref: input: - ref=lambda wc: vcf_reconstruct_map[wc.species]["reference"], - refidx=lambda wc: vcf_reconstruct_map[wc.species]["reference"].with_suffix(".dict"), - checkpoint=lambda wc: checkpoints.vcf_separation.get(vcf=Path(vcf_reconstruct_map[wc.species]["vcf"]).name, - vcf_dir=Path(vcf_reconstruct_map[wc.species]["vcf"]).parent).output.separated, - vcf=lambda wc: "%s/{species}.%s.vcf.gz" % (Path(vcf_reconstruct_map[wc.species]["vcf"]).parent, - Path(vcf_reconstruct_map[wc.species]["vcf"]).name.replace('.vcf.gz', '')), - vcfidx=lambda wc: vcf_reconstruct_map[wc.species]["vcf"].with_name(vcf_reconstruct_map[wc.species]["vcf"].name + ".tbi"), + ref=lambda wc: "{0}/{1}.fasta".format(vcf_species_location_dict["{0}.{1}.{2}".format(wc.sample, wc.vcf_prefix, wc.reference)], wc.reference), + refidx=lambda wc: "{0}/{1}.dict".format(vcf_species_location_dict["{0}.{1}.{2}".format(wc.sample, wc.vcf_prefix, wc.reference)], wc.reference), + vcf=lambda wc: "{0}/tmp/{1}.{2}.vcf.gz".format(vcf_species_location_dict["{0}.{1}.{2}".format(wc.sample, wc.vcf_prefix, wc.reference)], wc.sample, wc.vcf_prefix), + vcfidx=lambda wc: "{0}/tmp/{1}.{2}.vcf.gz.tbi".format(vcf_species_location_dict["{0}.{1}.{2}".format(wc.sample, wc.vcf_prefix, wc.reference)], wc.sample, wc.vcf_prefix), output: - altref_dir_path / "{species}.fasta", + altref_dir_path / "{sample}.{vcf_prefix}.{reference}.fasta", params: gatk_path=config["gatk_path"], - sample=lambda wc: wc.species.split(".")[0], log: - std=log_dir_path / "gatk_altref.{species}.log", - cluster_log=cluster_log_dir_path / "gatk_altref.{species}.cluster.log", - cluster_err=cluster_log_dir_path / "gatk_altref.{species}.cluster.err", + std=log_dir_path / "gatk_altref.{sample}.{vcf_prefix}.{reference}.log", + cluster_log=cluster_log_dir_path / "gatk_altref.{sample}.{vcf_prefix}.{reference}.cluster.log", + cluster_err=cluster_log_dir_path / "gatk_altref.{sample}.{vcf_prefix}.{reference}.cluster.err", benchmark: - benchmark_dir_path / "gatk_altref.{species}.benchmark.txt" + benchmark_dir_path / "gatk_altref.{sample}.{vcf_prefix}.{reference}.benchmark.txt", conda: - config["conda"]["buscoclade_gatk"]["name"] if config["use_existing_envs"] else ("../../%s" % config["conda"]["buscoclade_gatk"]["yaml"]) + config["conda"]["buscoclade_gatk"]["name"] if config["use_existing_envs"] else ("../../%s" % config["conda"]["buscoclade_gatk"]["yaml"]), resources: - queue=config["processing_queue"], - cpus=config["processing_threads"], - time=config["processing_time"], - mem_mb=config["processing_mem_mb"], + queue=config["vcf_altref_queue"], + cpus=config["vcf_altref_threads"], + time=config["vcf_altref_time"], + mem_mb=config["vcf_altref_mem_mb"], shell: " {params.gatk_path}/gatk 
--java-options -Xmx{resources.mem_mb}m FastaAlternateReferenceMaker " - " --output {output} --reference {input.ref} --variant {input.vcf} --showHidden true --use-iupac-sample {params.sample} 1> {log.std} 2>&1; " + " --output {output} --reference {input.ref} --variant {input.vcf} --showHidden true --use-iupac-sample {wildcards.sample} 1> {log.std} 2>&1; " - checkpoint move_altref_to_genomes_dir: + rule move_altref_to_genomes_dir: input: altref_dir_path / "{species}.fasta", output: @@ -164,6 +121,11 @@ if config["vcf_gatk"]: std=log_dir_path / "link_altref_to_genomes_dir.{species}.log", cluster_log=cluster_log_dir_path / "link_altref_to_genomes_dir.{species}.cluster.log", cluster_err=cluster_log_dir_path / "link_altref_to_genomes_dir.{species}.cluster.err", + resources: + queue=config["vcf_separation_queue"], + cpus=config["vcf_separation_threads"], + time=config["vcf_separation_time"], + mem_mb=config["vcf_separation_mem_mb"], shell: " cp -a {input} {output} 1> {log.std} 2>&1; " @@ -175,15 +137,13 @@ if config["vcf2phylip"]: if len(vcf_list) != 1: raise ValueError(f"vcf2phylip requires exactly one VCF in {vcf_dir}, found: {vcf_list}") vcf_file = vcf_list[0] - prefix = vcf_file.stem + prefix = vcf_file.name.replace(".vcf.gz", "") checkpoint vcf2phylip: input: vcf=str(vcf_file), output: - altref_dir_path / f"{prefix}.fasta", - params: - prefix=prefix, + f"{prefix}.min4.fasta", log: std=log_dir_path / f"vcf2phylip.{prefix}.log", cluster_log=cluster_log_dir_path / f"vcf2phylip.{prefix}.cluster.log", @@ -191,22 +151,35 @@ if config["vcf2phylip"]: benchmark: benchmark_dir_path / f"vcf2phylip.{prefix}.benchmark.txt", conda: - config["conda"]["buscoclade_gatk"]["name"] if config["use_existing_envs"] - else ("../../%s" % config["conda"]["buscoclade_gatk"]["yaml"]), + config["conda"]["buscoclade_main"]["name"] if config["use_existing_envs"] + else ("../../%s" % config["conda"]["buscoclade_main"]["yaml"]), resources: queue=config["processing_queue"], cpus=config["processing_threads"], time=config["processing_time"], mem_mb=config["processing_mem_mb"], shell: - "workflow/scripts/vcf2phylip.py -i {input.vcf} --fasta --output-prefix {params.prefix} --resolve-IUPAC" + "workflow/scripts/vcf2phylip.py -i {input.vcf} --fasta " rule consolidate_vcf2phylip: input: - fasta=altref_dir_path / f"{prefix}.fasta", + fasta=f"{prefix}.min4.fasta", output: altref_dir_path / "consensus.fasta", log: std=log_dir_path / "consolidate_vcf2phylip.log", - run: - shell("mv {input.fasta} {output}") + cluster_log=cluster_log_dir_path / "consolidate_vcf2phylip.cluster.log", + cluster_err=cluster_log_dir_path / "consolidate_vcf2phylip.cluster.err", + benchmark: + benchmark_dir_path / f"consolidate_vcf2phylip.benchmark.txt", + conda: + config["conda"]["buscoclade_main"]["name"] + if config["use_existing_envs"] + else ("../../%s" % config["conda"]["buscoclade_main"]["yaml"]), + resources: + queue=config["processing_queue"], + cpus=config["processing_threads"], + time=config["processing_time"], + mem_mb=config["processing_mem_mb"], + shell: + "mv {input.fasta} {output}" diff --git a/workflow/rules/visualization.smk b/workflow/rules/visualization.smk index 86775ff..c1fa626 100644 --- a/workflow/rules/visualization.smk +++ b/workflow/rules/visualization.smk @@ -221,7 +221,7 @@ if config["vcf2phylip"] != True: cluster_log=cluster_log_dir_path / "busco_summaries_to_tsv.cluster.log", cluster_err=cluster_log_dir_path / "busco_summaries_to_tsv.cluster.err", conda: - config["conda"]["buscoclade"]["name"] if config["use_existing_envs"] else 
("../../%s" % config["conda"]["buscoclade"]["yaml"]) + config["conda"]["buscoclade_main"]["name"] if config["use_existing_envs"] else ("../../%s" % config["conda"]["buscoclade_main"]["yaml"]) resources: queue=config["processing_queue"], cpus=config["processing_threads"], @@ -246,7 +246,7 @@ if config["vcf2phylip"] != True: benchmark: benchmark_dir_path / "busco_histogram.benchmark.txt" conda: - config["conda"]["buscoclade"]["name"] if config["use_existing_envs"] else ("../../%s" % config["conda"]["buscoclade"]["yaml"]) + config["conda"]["buscoclade_main"]["name"] if config["use_existing_envs"] else ("../../%s" % config["conda"]["buscoclade_main"]["yaml"]) resources: queue=config["processing_queue"], cpus=config["processing_threads"], From a970583433c9fc4bccaa83c1d095745169dc2d94 Mon Sep 17 00:00:00 2001 From: zubiriguitar Date: Tue, 20 May 2025 01:34:44 +0700 Subject: [PATCH 13/15] Vis fix --- workflow/scripts/draw_phylotrees.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/workflow/scripts/draw_phylotrees.py b/workflow/scripts/draw_phylotrees.py index d657cbc..3261e34 100755 --- a/workflow/scripts/draw_phylotrees.py +++ b/workflow/scripts/draw_phylotrees.py @@ -5,10 +5,10 @@ import re def format_name(name): - name = re.sub(r'(GCA|GCF)_', r'\1@', name) - name = name.replace('_', ' ') - name = name.replace('GCA@', 'GCA_').replace('GCF@', 'GCF_') - name = re.sub(r'(\d) (\d)', r'\1_\2', name) + name = re.sub(r'\.(GCA|GCF)_\d+\.\d+$', '', name) + name = re.sub(r'[_\.]', ' ', name) + return name.strip() + return name From 4419bd181bab753504c7dba1c27d802ee8019892 Mon Sep 17 00:00:00 2001 From: Zubiriguitar <64590799+Zubiriguitar@users.noreply.github.com> Date: Tue, 20 May 2025 01:38:24 +0700 Subject: [PATCH 14/15] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 0b8472b..231a1e1 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,7 @@ [![Snakemake](https://img.shields.io/badge/snakemake-<8.0-brightgreen.svg)](https://snakemake.github.io) [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) -# BuscoClade Pipeline Documentation +# BuscoClade Pipeline **BuscoClade** is a **Snakemake**-based workflow that constructs species phylogenies using *BUSCO* (Benchmarking Universal Single-Copy Orthologs). It runs multiple analysis stages—from preparing inputs and running BUSCO on each genome, through multiple-sequence alignment (MSA), trimming, and tree-building—to produce a final phylogenetic tree and related visualizations. BuscoClade is organized as modular Snakemake rules; each rule corresponds to a step (e.g. alignment, filtering, tree inference). Snakemake automatically infers dependencies between rules based on input/output files. By leveraging Snakemake and Conda, BuscoClade ensures a reproducible, scalable pipeline that can run locally or on an HPC cluster with minimal user effort. From 67684a2a34fe312cb3328e7579e185e448e648b8 Mon Sep 17 00:00:00 2001 From: Zubiriguitar <64590799+Zubiriguitar@users.noreply.github.com> Date: Tue, 20 May 2025 01:39:20 +0700 Subject: [PATCH 15/15] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 231a1e1..0ec45c6 100644 --- a/README.md +++ b/README.md @@ -36,7 +36,7 @@ Follow these steps to run BuscoClade on your data. This guide assumes basic fami 1. 
**Clone the Repository:** ```bash - git clone https://github.com/Zubiriguitar/BuscoClade.git + git clone https://github.com/tomarovsky/BuscoClade.git cd BuscoClade ```