Pipeline working from aligned reads (Step 2 and onwards)

2815a2ad · Lihouck Flavien · fb13c0ac · 2815a2ad · 2815a2ad · 2815a2ad
Commit 2815a2ad authored May 17, 2022 by Lihouck Flavien
--- a/.gitignore
+++ b/.gitignore
@@ -8,4 +8,4 @@ data/
 .#*
 *.fasta
 MTool
-log/
\ No newline at end of file
+logs/
\ No newline at end of file
--- a/src/workflow/Snakefile
+++ b/src/workflow/Snakefile
 configfile: "./workflow/default_config.yaml"

-DATASET = expand("{data}/{format}", data=config["data"], format=config["format"])
+DATASET = expand("{data_in}/{format}", data_in=config["data_out"], format=config["format"])
 GRAPH_TYPES = ["identity", "similarity", "equivalence", "mismatches"]
 STAT_HEADER= "threshold, percent_identity, percent_similarity, total_equivalence, total_mismatches\n "


-
 rule all:
     input:
-          expand("consensus/{region}/{depth}/{align}_meta_consensus_r{region}_d{depth}_t1_{threshold}_t2_{value}.fasta", region=config["region"], depth=config["depth"], threshold=config["threshold_1"], value=config["threshold_2"], align="muscle")
-     message: "Meta consensus can be found in ./consensus/[region]/[depth]/"
-
-
+          expand("{output_folder}/{region}/{depth}/{align}_meta_consensus_r{region}_d{depth}_t1_{threshold}_t2_{value}.fasta", region=config["region"], depth=config["depth"], threshold=config["threshold_1"], value=config["threshold_2"], align="muscle", output_folder=config["output_folder"])
+     message: "Meta consensus can be found in ./[output_folder]/[region]/[depth]/"

+# Installs the tool used for consensus and meta-consensus #

 rule MTool:
     input:
@@ -23,12 +21,124 @@ rule MTool:
     shell:
          "cd msa_handler && make"

+
+
+# Creation of MSA from the reads pile using tools  #
+
+rule muscle :
+    input :
+        os.path.join(config["data_in"],'selected_read','reads_r{region_size}_d{depth}.fasta')
+    output :
+        out = os.path.join('{output_folder}', 'data','msa','MSA_muscle_r{region_size}_d{depth}.fasta')
+    message:
+        "Muscle for {wildcards.output_folder} (Region size={wildcards.region_size} & Depth={wildcards.depth})"
+    log:
+        os.path.join('{output_folder}','logs','6_muscle_r{region_size}_d{depth}.log')
+    conda:
+        "envs/muscle.yaml"
+    shell :
+        "muscle -in {input} -out {output.out} 2> {log}"
+
+rule mafft :
+    input :
+        os.path.join(config["data_in"],'selected_read','reads_r{region_size}_d{depth}.fasta')
+    output :
+        out = os.path.join('{output_folder}', 'data' ,'msa','mafft_uncorrected_r{region_size}_d{depth}.fasta')
+    message:
+        "Mafft for {wildcards.output_folder} (Region size={wildcards.region_size} & Depth={wildcards.depth})"
+    log:
+        os.path.join('{output_folder}','logs','6_mafft_r{region_size}_d{depth}.log')
+    conda:
+        "envs/mafft.yaml"
+    shell :
+       "mafft {input} > {output} 2> {log}"
+
+rule mafft_correction :
+    input :
+        os.path.join('{output_folder}', 'data' ,'msa','mafft_uncorrected_r{region_size}_d{depth}.fasta')
+    output :
+        os.path.join('{output_folder}', 'data' ,'msa','MSA_mafft_r{region_size}_d{depth}.fasta')
+    message:
+        "Mafft correction for {wildcards.output_folder} (Region size={wildcards.region_size} & Depth={wildcards.depth})"
+    shell :
+        'cat {input} | tr "atgc" "ATGC" >{output} ; rm -f {input}'
+
+rule spoa :
+    input :
+        os.path.join(config["data_in"],'selected_read','reads_r{region_size}_d{depth}.fasta')
+    output :
+        out = os.path.join('{output_folder}', 'data','msa','MSA_spoa_r{region_size}_d{depth}.fasta')
+    message:
+        "Spoa for {wildcards.output_folder} (Region size={wildcards.region_size} & Depth={wildcards.depth})"
+    log:
+        os.path.join('{output_folder}','logs','6_spoa_r{region_size}_d{depth}.log')
+    conda:
+        "envs/spoa.yaml"
+    shell :
+       "spoa -r 1 -l 1 {input} > {output} 2> {log}"
+
+rule kalign :
+    input :
+        os.path.join(config["data_in"],'selected_read','reads_r{region_size}_d{depth}.fasta')
+    output :
+        out = os.path.join('{output_folder}', 'data','msa','MSA_kalign_r{region_size}_d{depth}.fasta')
+    message:
+        "Kalign for {wildcards.output_folder} (Region size={wildcards.region_size} & Depth={wildcards.depth})"
+    log:
+        os.path.join('{output_folder}','logs','6_kalign_r{region_size}_d{depth}.log')
+    conda:
+        "envs/kalign2.yaml"
+    shell :
+        "kalign -i {input} -out {output.out} 1> {log} 2> {log}"
+
+rule kalign3 :
+    input :
+        os.path.join(config["data_in"],'selected_read','reads_r{region_size}_d{depth}.fasta')
+    output :
+        out = os.path.join('{output_folder}', 'data' ,'msa','MSA_kalign3_r{region_size}_d{depth}.fasta')
+    message:
+        "Kalign3 for {wildcards.output_folder} (Region size={wildcards.region_size} & Depth={wildcards.depth})"
+    log:
+        os.path.join('{output_folder}','logs','6_kalign3_r{region_size}_d{depth}.log')
+    conda:
+        "envs/kalign3.yaml"
+    shell :
+        "kalign -i {input} -o {output.out} 1>{log} 2> {log}"
+
+rule abpoa :
+    input :
+        os.path.join(config["data_in"],'selected_read','reads_r{region_size}_d{depth}.fasta')
+    output :
+        out = os.path.join('{output_folder}' , 'data' ,'msa','abpoa_uncorrected_r{region_size}_d{depth}.fasta')
+    message:
+        "Abpoa for {wildcards.output_folder} (Region size={wildcards.region_size} & Depth={wildcards.depth})"
+    log:
+        os.path.join('{output_folder}','logs','6_abpoa_r{region_size}_d{depth}.log')
+    conda:
+        "envs/abpoa.yaml"
+    shell :
+        "abpoa -r 1 {input} > {output} 2> {log}"
+rule abpoa_correction :
+    input :
+        os.path.join('{output_folder}' , 'data','msa','abpoa_uncorrected_r{region_size}_d{depth}.fasta')
+    output :
+        os.path.join('{output_folder}', 'data' ,'msa','MSA_abpoa_r{region_size}_d{depth}.fasta')
+    message:
+        "Abpoa correction for {wildcards.output_folder} (Region size={wildcards.region_size} & Depth={wildcards.depth})"
+    shell :
+        'tail -{wildcards.depth} {input}| sed "s/^/>seq\\n/" >{output};rm -f {input}'
+
+
+
+
+# Creation of the consensus #
+
 rule create_consensus_per_tool:
     input:
          rules.MTool.output, data=expand("{dataset}/MSA_{tool}_r{region}_d{depth}.fasta", allow_missing=True, dataset=DATASET)
     output:
-          "consensus/{region}/{depth}/individual_consensus/{threshold}/consensus_{tool}_{region}_{depth}.fasta"
-     log: "logs/consensus_per_tool_{tool}_r{region}_d{depth}_t{threshold}.log"
+          "{output_folder}/{region}/{depth}/individual_consensus/{threshold}/consensus_{tool}_{region}_{depth}.fasta"
+     log: "logs/{output_folder}/consensus_per_tool_{tool}_r{region}_d{depth}_t{threshold}.log"
     shell:
          "./msa_handler/MTool {input.data} {output} {wildcards.threshold}"

@@ -36,43 +146,49 @@ rule create_consensus:
     input:
          rules.MTool.output, "MTool_over_folder.py", expand("{dataset}/MSA_{tool}_r{region}_d{depth}.fasta", tool=config["tool"], allow_missing=True, dataset=DATASET)
     params: data=DATASET
-     log: "logs/consensus_r{region}_d{depth}_t{threshold}.log"
+     log: "logs/{output_folder}consensus_r{region}_d{depth}_t{threshold}.log"
     output:
-          "consensus/{region}/{depth}/test_consensus_{region}_{depth}_{threshold}.fasta"
+          "{output_folder}/{region}/{depth}/test_consensus_{region}_{depth}_{threshold}.fasta"
     message: "Consensus region: {wildcards.region} , depth: {wildcards.depth} , threshold: {wildcards.threshold}\n"  
     shell:
          "./MTool_over_folder.py -dir {params.data} -o {output} -region {wildcards.region} -depth {wildcards.depth} -threshold {wildcards.threshold}"

+
+# Alignment for the meta-consensus #
+
 rule align_mafft:
     input:
          rules.create_consensus.output
     output:
-          "consensus/{region}/{depth}/mafft_consensus_{region}_{depth}_{threshold}.fasta"
+          "{output_folder}/{region}/{depth}/mafft_consensus_{region}_{depth}_{threshold}.fasta"
     conda:
          "envs/mafft.yaml"
-     log: "logs/align_mafft_r{region}_d{depth}_t{threshold}.log"
+     log: "logs/{output_folder}/align_mafft_r{region}_d{depth}_t{threshold}.log"
     message: "Alignment with mafft region: {wildcards.region} , depth: {wildcards.depth} , threshold: {threshold}\n"
     shell:
-          "mafft --auto {input} > {output}"
+          "mafft --auto {input} > {output} 2> {log}"

 rule align_muscle:
     input:
          rules.create_consensus.output
     output:
-          "consensus/{region}/{depth}/muscle_consensus_{region}_{depth}_{threshold}.fasta"
+          "{output_folder}/{region}/{depth}/muscle_consensus_{region}_{depth}_{threshold}.fasta"
     conda:
          "envs/muscle.yaml"
-     log: "logs/align_muscle_r{region}_d{depth}_t{threshold}.log"
+     log: "logs/{output_folder}/align_muscle_r{region}_d{depth}_t{threshold}.log"
     message: "Alignment muscle region: {wildcards.region} , depth: {wildcards.depth} , threshold: {wildcards.threshold}\n"
     shell:
-          "muscle -align {input} -output {output} > /dev/null 2>/dev/null"
+          "muscle -in {input} -out {output} > {log} 2>{log}"
+
+
+# Meta consensus #

 rule meta_consensus:
     input:
          muscle=rules.align_muscle.output
     output:
-          "consensus/{region}/{depth}/{align}_meta_consensus_r{region}_d{depth}_t1_{threshold}_t2_{value}.fasta"
-     log: "logs/meta_consensus_{align}_r{region}_d{depth}_t1_{threshold}_t2_{value}.log"
+          "{output_folder}/{region}/{depth}/{align}_meta_consensus_r{region}_d{depth}_t1_{threshold}_t2_{value}.fasta"
+     log: "logs/{output_folder}/meta_consensus_{align}_r{region}_d{depth}_t1_{threshold}_t2_{value}.log"
     message: "Meta-consensus region: {wildcards.region} , depth: {wildcards.depth} , consensus threshold: {wildcards.threshold} , meta consensus threshold: {wildcards.value}\n"
     shell:
          "./msa_handler/MTool {input.muscle} {output} {wildcards.value}"
@@ -83,10 +199,10 @@ rule meta_consensus:
 rule exonerate_ref_result:
     input:
          files=expand(rules.meta_consensus.output, align="muscle", allow_missing=True) ,
-          ref=expand("{data}/seq_selectes_region/region_seq_r{region}.fasta", data=config["data"], allow_missing=True),
+          ref=expand("{data_in}/seq_selectes_region/region_seq_r{region}.fasta", data_in=config["data_in"], allow_missing=True),
          consensus_per_tool=expand(rules.create_consensus_per_tool.output, tool=config["tool"], allow_missing=True)
-     output: "consensus/stats/{region}/aln_all_consensus_r{region}_d{depth}_t1_{threshold}_t2_{value}.txt"
-     log: "logs/exonerate_ref_result_r{region}_d{depth}_t1_{threshold}_t2{value}.log"
+     output: "{output_folder}/stats/{region}/aln_all_consensus_r{region}_d{depth}_t1_{threshold}_t2_{value}.txt"
+     log: "logs/{output_folder}/exonerate_ref_result_r{region}_d{depth}_t1_{threshold}_t2{value}.log"
     params: format="\"%pi, %ps, %et, %em\n\""
     conda: "envs/exonerate.yaml"
     shell: "./exonerate_stats.sh {output} {input.ref} {input.files} {input.consensus_per_tool}"
@@ -94,49 +210,55 @@ rule exonerate_ref_result:


 rule plottable_exonerate_meta:
-     input: files=expand(rules.meta_consensus.output, align="muscle", value=config["threshold_2"], allow_missing=True) , ref=expand("{data}/seq_selectes_region/region_seq_r{region}.fasta", data=config["data"], allow_missing=True)
-     output: "consensus/plot/{region}/{depth}/{threshold}/aln_stats_meta.txt"
-     log: "logs/plot_exonerate_meta_r{region}_d{depth}_t{threshold}.log"
+     input: files=expand(rules.meta_consensus.output, align="muscle", value=config["threshold_2"], allow_missing=True) , ref=expand("{data_in}/seq_selectes_region/region_seq_r{region}.fasta", data_in=config["data_in"], allow_missing=True)
+     output: "{output_folder}/plot/{region}/{depth}/{threshold}/aln_stats_meta.txt"
+     log: "logs/{output_folder}plot_exonerate_meta_r{region}_d{depth}_t{threshold}.log"
     conda: "envs/exonerate.yaml"
+     message: "Exonerate comparison between meta-consensus and ref (region: {wildcards.region}, depth: {wildcards.depth}, preliminary consensus threshold: {wildcards.threshold}) for meta-consensus threshold in [ {config[threshold_2]} ]"
     shell: "./exonerate_stats.sh {output} {input.ref} {input.files}"

 rule plottable_exonerate_tools:
     input:
-          ref=expand("{data}/seq_selectes_region/region_seq_r{region}.fasta", data=config["data"], allow_missing=True),
+          ref=expand("{data_in}/seq_selectes_region/region_seq_r{region}.fasta", data_in=config["data_in"], allow_missing=True),
          consensus_per_tool=expand(rules.create_consensus_per_tool.output, value=config["threshold_2"], allow_missing=True)
-     output: "consensus/plot/{region}/{depth}/{threshold}/aln_stats_{tool}.txt"
-     log: "logs/plot_exonerate_tools_{tool}_r{region}_d{depth}_t{threshold}.log"
+     output: "{output_folder}/plot/{region}/{depth}/{threshold}/aln_stats_{tool}.txt"
+     log: "logs/{output_folder}/plot_exonerate_tools_{tool}_r{region}_d{depth}_t{threshold}.log"
     conda: "envs/exonerate.yaml"
-     shell: "./exonerate_stats.sh {output} {input.ref} {input.consensus_per_tool}"
+     message: "Exonerate comparison between consensus for tool {wildcards.tool} and ref (region: {wildcards.region}, depth: {wildcards.depth}, preliminary consensus threshold: {wildcards.threshold}) for meta-consensus threshold in [ {config[threshold_2]} ]"
+     shell: "./exonerate_stats.sh {output} {input.ref} {input.consensus_per_tool} 2> {log}"


 rule plot_depth:
     input: 
     	     expand(rules.plottable_exonerate_meta.output, depth=config["depth"], allow_missing=True),
     params: target_folder="consensus/plot/{region}/graph"
-     log: "logs/plot_depth_r{region}_t{threshold}.log"
-     output: expand("consensus/plot/{region}/graph/graph_r{region}_t{threshold}_{graph}.png", graph=GRAPH_TYPES, depth=config["depth"],  allow_missing=True)
-     shell: "./plot_results.py {params.target_folder}/graph_r{wildcards.region}_t{wildcards.threshold} {input}"
+     log: "logs/{output_folder}/plot_depth_r{region}_t{threshold}.log"
+     output: expand("{output_folder}/plot/{region}/graph/graph_r{region}_t{threshold}_{graph}.png", graph=GRAPH_TYPES, depth=config["depth"],  allow_missing=True)
+     message: "Exonerate comparison between meta-consensus and ref (region: {wildcards.region}, or preliminary consensus threshold in [ {config[threshold_1]} ],  meta-consensus threshold in [ {config[threshold_2]} ]"
+     shell: "./plot_results.py {params.target_folder}/graph_r{wildcards.region}_t{wildcards.threshold} {input} 2> {log}"

 rule plot_t1:
     input: 
     	     expand(rules.plottable_exonerate_meta.output, threshold=config["threshold_1"], allow_missing=True)
-     params: target_folder="consensus/plot/{region}/{depth}"
-     log: "logs/plot_t1_r{region}_d{depth}.log"
-     output: expand("consensus/plot/{region}/{depth}/graph_r{region}_d{depth}_{graph}.png", graph=GRAPH_TYPES, allow_missing=True)
-     shell: "./plot_results.py {params.target_folder}/graph_r{wildcards.region}_d{wildcards.depth} {input}"
+     params: target_folder="{output_folder}/plot/{region}/{depth}"
+     log: "logs/{output_folder}/plot_t1_r{region}_d{depth}.log"
+     output: expand("{output_folder}/plot/{region}/{depth}/graph_r{region}_d{depth}_{graph}.png", graph=GRAPH_TYPES, allow_missing=True)
+     message: "Creating plot for meta-consensus quality with preliminary threshold in [ {config[threshold_1]} ] and meta-consensus in [ {config[threshold_2]} ] (region: {wildcards.region}, depth: {wildcards.depth})"
+     shell: "./plot_results.py {params.target_folder}/graph_r{wildcards.region}_d{wildcards.depth} {input} 2> {log} "
     

 rule plot_t2:
     input: expand(rules.plottable_exonerate_tools.output, tool=config["tool"], allow_missing=True),
     	     rules.plottable_exonerate_meta.output
-     params: target_folder="consensus/plot/{region}/{depth}/{threshold}/"
-     log: "logs/plot_t2_r{region}_d{depth}_t_{threshold}.log"
-     output: expand("consensus/plot/{region}/{depth}/{threshold}/graph_r{region}_d{depth}_t1_{threshold}_{graph}.png", graph=GRAPH_TYPES, allow_missing=True)
-     shell: "./plot_results.py {params.target_folder}graph_r{wildcards.region}_d{wildcards.depth}_t1_{wildcards.threshold}  {input}"
+     params: target_folder="{output_folder}/plot/{region}/{depth}/{threshold}/"
+     log: "logs/{output_folder}/plot_t2_r{region}_d{depth}_t_{threshold}.log"
+     output: expand("{output_folder}/plot/{region}/{depth}/{threshold}/graph_r{region}_d{depth}_t1_{threshold}_{graph}.png", graph=GRAPH_TYPES, allow_missing=True)
+     message: "Creating plot for meta-consensus compared to preliminary consensus with meta-consensus threshold in {config[threshold_2]}  (region: {wildcards.region}, depth: {wildcards.depth}, threshold: {wildcards.threshold})"
+     shell: "./plot_results.py {params.target_folder}graph_r{wildcards.region}_d{wildcards.depth}_t1_{wildcards.threshold}  {input} 2> {log}"


 rule make_plots:
-     input: expand(rules.plot_t2.output, region=config["region"], depth=config["depth"], threshold=config["threshold_1"]),
-     	    expand(rules.plot_t1.output, region=config["region"], depth=config["depth"]),
-	    expand(rules.plot_depth.output, region=config["region"], threshold=config["threshold_1"])
\ No newline at end of file
+     input: expand(rules.plot_t2.output, region=config["region"], depth=config["depth"], threshold=config["threshold_1"], output_folder=config["output_folder"]),
+     	    expand(rules.plot_t1.output, region=config["region"], depth=config["depth"] , output_folder=config["output_folder"]),
+	    expand(rules.plot_depth.output, region=config["region"], threshold=config["threshold_1"], output_folder=config["output_folder"])
+     message: "Created all plots"
\ No newline at end of file
--- a/src/workflow/default_config.yaml
+++ b/src/workflow/default_config.yaml
-data: "../../data"
+data_in: "../../data"
+data_out: "consensus/data"
 format: "msa"
 tool:  ["abpoa", "kalign", "kalign3", "mafft", "muscle", "spoa"]
 threshold_1: ["30", "40", "50", "60", "70", "80"]
@@ -6,3 +7,4 @@ threshold_2 : ["30", "40", "50", "60", "70", "80"]
 region: ["100", "200", "500", "1000", "2000"]
 depth: ["10", "20", "50", "100", "150"]
 align: ["muscle", "mafft"]
+output_folder: "consensus"
\ No newline at end of file
--- a/src/workflow/envs/abpoa.yaml
+++ b/src/workflow/envs/abpoa.yaml
+name: abpoa
+channels:
+  - bioconda
+  - defaults
+dependencies:
+  - _libgcc_mutex=0.1=main
+  - _openmp_mutex=4.5=1_gnu
+  - abpoa=1.2.5=h5bf99c6_0
+  - libgcc-ng=9.3.0=h5101ec6_17
+  - libgomp=9.3.0=h5101ec6_17
+  - zlib=1.2.11=h7b6447c_3
--- a/src/workflow/envs/kalign2.yaml
+++ b/src/workflow/envs/kalign2.yaml
+name: kalign2
+channels:
+  - bioconda
+  - defaults
+dependencies:
+  - _libgcc_mutex=0.1=main
+  - _openmp_mutex=4.5=1_gnu
+  - kalign2=2.04=h779adbc_2
+  - libgcc-ng=9.3.0=h5101ec6_17
+  - libgomp=9.3.0=h5101ec6_17
--- a/src/workflow/envs/kalign3.yaml
+++ b/src/workflow/envs/kalign3.yaml
+name: kalign3
+channels:
+  - bioconda
+  - defaults
+dependencies:
+  - _libgcc_mutex=0.1=main
+  - _openmp_mutex=4.5=1_gnu
+  - kalign3=3.2.2=h779adbc_2
+  - libgcc-ng=9.3.0=h5101ec6_17
+  - libgomp=9.3.0=h5101ec6_17
--- a/src/workflow/envs/muscle.yaml
+++ b/src/workflow/envs/muscle.yaml
 name: muscle
 channels:
-  - conda-forge
  - bioconda
  - defaults
 dependencies:
-  - _libgcc_mutex=0.1=conda_forge
-  - _openmp_mutex=4.5=2_gnu
-  - libgcc-ng=11.2.0=h1d223b6_16
-  - libgomp=11.2.0=h1d223b6_16
-  - libstdcxx-ng=11.2.0=he4da1e4_16
-  - muscle=5.1=h9f5acd7_1
-prefix: /home/flav/anaconda3/envs/muscle
+  - _libgcc_mutex=0.1=main
+  - _openmp_mutex=4.5=1_gnu
+  - libgcc-ng=9.3.0=h5101ec6_17
+  - libgomp=9.3.0=h5101ec6_17
+  - libstdcxx-ng=9.3.0=hd4cf53a_17
+  - muscle=3.8.1551=h7d875b9_6
--- a/src/workflow/envs/spoa.yaml
+++ b/src/workflow/envs/spoa.yaml
+name: spoa
+channels:
+  - bioconda
+  - defaults
+dependencies:
+  - _libgcc_mutex=0.1=main
+  - _openmp_mutex=4.5=1_gnu
+  - libgcc-ng=9.3.0=h5101ec6_17
+  - libgomp=9.3.0=h5101ec6_17
+  - libstdcxx-ng=9.3.0=hd4cf53a_17
+  - spoa=4.0.7=h9a82719_1
+  - zlib=1.2.11=h7b6447c_3