Cleanup on the repository

0df69d1c · Lihouck Flavien · 2815a2ad · 0df69d1c · 0df69d1c · 0df69d1c
Commit 0df69d1c authored 3 years ago by Lihouck Flavien
--- a/src/workflow/Snakefile
+++ b/src/workflow/Snakefile
 configfile: "./workflow/default_config.yaml"

-DATASET = expand("{data_in}/{format}", data_in=config["data_out"], format=config["format"])
+DATASET = expand("{data}/{format}", data=config["data_out"], format=config["format"])
 GRAPH_TYPES = ["identity", "similarity", "equivalence", "mismatches"]
 STAT_HEADER= "threshold, percent_identity, percent_similarity, total_equivalence, total_mismatches\n "

@@ -10,6 +10,8 @@ rule all:
          expand("{output_folder}/{region}/{depth}/{align}_meta_consensus_r{region}_d{depth}_t1_{threshold}_t2_{value}.fasta", region=config["region"], depth=config["depth"], threshold=config["threshold_1"], value=config["threshold_2"], align="muscle", output_folder=config["output_folder"])
     message: "Meta consensus can be found in ./[output_folder]/[region]/[depth]/"

+
+
 # Installs the tool used for consensus and meta-consensus #

 rule MTool:
@@ -27,27 +29,27 @@ rule MTool:

 rule muscle :
    input :
-        os.path.join(config["data_in"],'selected_read','reads_r{region_size}_d{depth}.fasta')
+        os.path.join(config["data_in"],'reads_r{region}_d{depth}.fasta')
    output :
-        out = os.path.join('{output_folder}', 'data','msa','MSA_muscle_r{region_size}_d{depth}.fasta')
+        out = os.path.join('{output_folder}', "data", config["format"]  ,'MSA_muscle_r{region}_d{depth}.fasta')
    message:
-        "Muscle for {wildcards.output_folder} (Region size={wildcards.region_size} & Depth={wildcards.depth})"
+        "Muscle for {wildcards.output_folder} (Region size={wildcards.region} & Depth={wildcards.depth})"
    log:
-        os.path.join('{output_folder}','logs','6_muscle_r{region_size}_d{depth}.log')
+        os.path.join('{output_folder}','logs','6_muscle_r{region}_d{depth}.log')
    conda:
        "envs/muscle.yaml"
    shell :
-        "muscle -in {input} -out {output.out} 2> {log}"
+        "muscle -in {input} -out {output} 2> {log}"

 rule mafft :
    input :
-        os.path.join(config["data_in"],'selected_read','reads_r{region_size}_d{depth}.fasta')
+        os.path.join(config["data_in"],'reads_r{region}_d{depth}.fasta')
    output :
-        out = os.path.join('{output_folder}', 'data' ,'msa','mafft_uncorrected_r{region_size}_d{depth}.fasta')
+        out = os.path.join('{output_folder}', "data" ,config["format"] ,'mafft_uncorrected_r{region}_d{depth}.fasta')
    message:
-        "Mafft for {wildcards.output_folder} (Region size={wildcards.region_size} & Depth={wildcards.depth})"
+        "Mafft for {wildcards.output_folder} (Region size={wildcards.region} & Depth={wildcards.depth})"
    log:
-        os.path.join('{output_folder}','logs','6_mafft_r{region_size}_d{depth}.log')
+        os.path.join('{output_folder}','logs','6_mafft_r{region}_d{depth}.log')
    conda:
        "envs/mafft.yaml"
    shell :
@@ -55,23 +57,23 @@ rule mafft :

 rule mafft_correction :
    input :
-        os.path.join('{output_folder}', 'data' ,'msa','mafft_uncorrected_r{region_size}_d{depth}.fasta')
+        os.path.join('{output_folder}', "data" ,config["format"],'mafft_uncorrected_r{region}_d{depth}.fasta')
    output :
-        os.path.join('{output_folder}', 'data' ,'msa','MSA_mafft_r{region_size}_d{depth}.fasta')
+        os.path.join('{output_folder}', "data"  ,config["format"],'MSA_mafft_r{region}_d{depth}.fasta')
    message:
-        "Mafft correction for {wildcards.output_folder} (Region size={wildcards.region_size} & Depth={wildcards.depth})"
+        "Mafft correction for {wildcards.output_folder} (Region size={wildcards.region} & Depth={wildcards.depth})"
    shell :
        'cat {input} | tr "atgc" "ATGC" >{output} ; rm -f {input}'

 rule spoa :
    input :
-        os.path.join(config["data_in"],'selected_read','reads_r{region_size}_d{depth}.fasta')
+        os.path.join(config["data_in"],'reads_r{region}_d{depth}.fasta')
    output :
-        out = os.path.join('{output_folder}', 'data','msa','MSA_spoa_r{region_size}_d{depth}.fasta')
+        out = os.path.join('{output_folder}', "data" ,config["format"],'MSA_spoa_r{region}_d{depth}.fasta')
    message:
-        "Spoa for {wildcards.output_folder} (Region size={wildcards.region_size} & Depth={wildcards.depth})"
+        "Spoa for {wildcards.output_folder} (Region size={wildcards.region} & Depth={wildcards.depth})"
    log:
-        os.path.join('{output_folder}','logs','6_spoa_r{region_size}_d{depth}.log')
+        os.path.join('{output_folder}','logs','6_spoa_r{region}_d{depth}.log')
    conda:
        "envs/spoa.yaml"
    shell :
@@ -79,13 +81,13 @@ rule spoa :

 rule kalign :
    input :
-        os.path.join(config["data_in"],'selected_read','reads_r{region_size}_d{depth}.fasta')
+        os.path.join(config["data_in"],'reads_r{region}_d{depth}.fasta')
    output :
-        out = os.path.join('{output_folder}', 'data','msa','MSA_kalign_r{region_size}_d{depth}.fasta')
+        out = os.path.join('{output_folder}', "data" ,config["format"],'MSA_kalign_r{region}_d{depth}.fasta')
    message:
-        "Kalign for {wildcards.output_folder} (Region size={wildcards.region_size} & Depth={wildcards.depth})"
+        "Kalign for {wildcards.output_folder} (Region size={wildcards.region} & Depth={wildcards.depth})"
    log:
-        os.path.join('{output_folder}','logs','6_kalign_r{region_size}_d{depth}.log')
+        os.path.join('{output_folder}','logs','6_kalign_r{region}_d{depth}.log')
    conda:
        "envs/kalign2.yaml"
    shell :
@@ -93,13 +95,13 @@ rule kalign :

 rule kalign3 :
    input :
-        os.path.join(config["data_in"],'selected_read','reads_r{region_size}_d{depth}.fasta')
+        os.path.join(config["data_in"],'reads_r{region}_d{depth}.fasta')
    output :
-        out = os.path.join('{output_folder}', 'data' ,'msa','MSA_kalign3_r{region_size}_d{depth}.fasta')
+        out = os.path.join('{output_folder}', "data" ,config["format"],'MSA_kalign3_r{region}_d{depth}.fasta')
    message:
-        "Kalign3 for {wildcards.output_folder} (Region size={wildcards.region_size} & Depth={wildcards.depth})"
+        "Kalign3 for {wildcards.output_folder} (Region size={wildcards.region} & Depth={wildcards.depth})"
    log:
-        os.path.join('{output_folder}','logs','6_kalign3_r{region_size}_d{depth}.log')
+        os.path.join('{output_folder}','logs','6_kalign3_r{region}_d{depth}.log')
    conda:
        "envs/kalign3.yaml"
    shell :
@@ -107,24 +109,24 @@ rule kalign3 :

 rule abpoa :
    input :
-        os.path.join(config["data_in"],'selected_read','reads_r{region_size}_d{depth}.fasta')
+        os.path.join(config["data_in"],'reads_r{region}_d{depth}.fasta')
    output :
-        out = os.path.join('{output_folder}' , 'data' ,'msa','abpoa_uncorrected_r{region_size}_d{depth}.fasta')
+        out = os.path.join('{output_folder}' , "data" ,config["format"],'abpoa_uncorrected_r{region}_d{depth}.fasta')
    message:
-        "Abpoa for {wildcards.output_folder} (Region size={wildcards.region_size} & Depth={wildcards.depth})"
+        "Abpoa for {wildcards.output_folder} (Region size={wildcards.region} & Depth={wildcards.depth})"
    log:
-        os.path.join('{output_folder}','logs','6_abpoa_r{region_size}_d{depth}.log')
+        os.path.join('{output_folder}','logs','6_abpoa_r{region}_d{depth}.log')
    conda:
        "envs/abpoa.yaml"
    shell :
        "abpoa -r 1 {input} > {output} 2> {log}"
 rule abpoa_correction :
    input :
-        os.path.join('{output_folder}' , 'data','msa','abpoa_uncorrected_r{region_size}_d{depth}.fasta')
+        os.path.join('{output_folder}' , "data" ,config["format"],'abpoa_uncorrected_r{region}_d{depth}.fasta')
    output :
-        os.path.join('{output_folder}', 'data' ,'msa','MSA_abpoa_r{region_size}_d{depth}.fasta')
+        os.path.join('{output_folder}', "data" ,config["format"],'MSA_abpoa_r{region}_d{depth}.fasta')
    message:
-        "Abpoa correction for {wildcards.output_folder} (Region size={wildcards.region_size} & Depth={wildcards.depth})"
+        "Abpoa correction for {wildcards.output_folder} (Region size={wildcards.region} & Depth={wildcards.depth})"
    shell :
        'tail -{wildcards.depth} {input}| sed "s/^/>seq\\n/" >{output};rm -f {input}'

@@ -137,21 +139,22 @@ rule create_consensus_per_tool:
     input:
          rules.MTool.output, data=expand("{dataset}/MSA_{tool}_r{region}_d{depth}.fasta", allow_missing=True, dataset=DATASET)
     output:
-          "{output_folder}/{region}/{depth}/individual_consensus/{threshold}/consensus_{tool}_{region}_{depth}.fasta"
+          "{output_folder}/{region}/{depth}/{threshold}/consensus_{tool}_{region}_{depth}.fasta"
     log: "logs/{output_folder}/consensus_per_tool_{tool}_r{region}_d{depth}_t{threshold}.log"
+     message: "Consensus for tools
     shell:
          "./msa_handler/MTool {input.data} {output} {wildcards.threshold}"

 rule create_consensus:
     input:
-          rules.MTool.output, "MTool_over_folder.py", expand("{dataset}/MSA_{tool}_r{region}_d{depth}.fasta", tool=config["tool"], allow_missing=True, dataset=DATASET)
+          rules.MTool.output, "workflow/scripts/MTool_over_folder.py", expand("{dataset}/MSA_{tool}_r{region}_d{depth}.fasta", tool=config["tool"], allow_missing=True, dataset=DATASET)
     params: data=DATASET
     log: "logs/{output_folder}consensus_r{region}_d{depth}_t{threshold}.log"
     output:
          "{output_folder}/{region}/{depth}/test_consensus_{region}_{depth}_{threshold}.fasta"
     message: "Consensus region: {wildcards.region} , depth: {wildcards.depth} , threshold: {wildcards.threshold}\n"  
     shell:
-          "./MTool_over_folder.py -dir {params.data} -o {output} -region {wildcards.region} -depth {wildcards.depth} -threshold {wildcards.threshold}"
+          "./workflow/scripts/MTool_over_folder.py -dir {params.data} -o {output} -region {wildcards.region} -depth {wildcards.depth} -threshold {wildcards.threshold}"


 # Alignment for the meta-consensus #
@@ -199,33 +202,33 @@ rule meta_consensus:
 rule exonerate_ref_result:
     input:
          files=expand(rules.meta_consensus.output, align="muscle", allow_missing=True) ,
-          ref=expand("{data_in}/seq_selectes_region/region_seq_r{region}.fasta", data_in=config["data_in"], allow_missing=True),
+          ref=expand("{ref}/region_seq_r{region}.fasta", data_in=config["data_in"], ref=config["ref_folder"], allow_missing=True),
          consensus_per_tool=expand(rules.create_consensus_per_tool.output, tool=config["tool"], allow_missing=True)
     output: "{output_folder}/stats/{region}/aln_all_consensus_r{region}_d{depth}_t1_{threshold}_t2_{value}.txt"
     log: "logs/{output_folder}/exonerate_ref_result_r{region}_d{depth}_t1_{threshold}_t2{value}.log"
     params: format="\"%pi, %ps, %et, %em\n\""
     conda: "envs/exonerate.yaml"
-     shell: "./exonerate_stats.sh {output} {input.ref} {input.files} {input.consensus_per_tool}"
+     shell: "./workflow/scripts/exonerate_stats.sh {output} {input.ref} {input.files} {input.consensus_per_tool}"
 #          "echo {STAT_HEADER} >> {output} \n for FILE in {input.files}; do exonerate --bestn 1 -Q dna -E -m a:g --showalignment false --showsugar false --showvulgar false --showcigar false  --ryo {params.format} --verbose 0 -q {input.ref} -t $FILE >> {output}  2>/dev/null; done" #


 rule plottable_exonerate_meta:
-     input: files=expand(rules.meta_consensus.output, align="muscle", value=config["threshold_2"], allow_missing=True) , ref=expand("{data_in}/seq_selectes_region/region_seq_r{region}.fasta", data_in=config["data_in"], allow_missing=True)
+     input: files=expand(rules.meta_consensus.output, align="muscle", value=config["threshold_2"], allow_missing=True) , ref=expand("{ref}/region_seq_r{region}.fasta", data_in=config["data_in"], ref=config["ref_folder"], allow_missing=True)
     output: "{output_folder}/plot/{region}/{depth}/{threshold}/aln_stats_meta.txt"
     log: "logs/{output_folder}plot_exonerate_meta_r{region}_d{depth}_t{threshold}.log"
     conda: "envs/exonerate.yaml"
     message: "Exonerate comparison between meta-consensus and ref (region: {wildcards.region}, depth: {wildcards.depth}, preliminary consensus threshold: {wildcards.threshold}) for meta-consensus threshold in [ {config[threshold_2]} ]"
-     shell: "./exonerate_stats.sh {output} {input.ref} {input.files}"
+     shell: "./workflow/scripts/exonerate_stats.sh {output} {input.ref} {input.files}"

 rule plottable_exonerate_tools:
     input:
-          ref=expand("{data_in}/seq_selectes_region/region_seq_r{region}.fasta", data_in=config["data_in"], allow_missing=True),
+          ref=expand("{ref}/region_seq_r{region}.fasta", data_in=config["data_in"], ref=config["ref_folder"], allow_missing=True),
          consensus_per_tool=expand(rules.create_consensus_per_tool.output, value=config["threshold_2"], allow_missing=True)
     output: "{output_folder}/plot/{region}/{depth}/{threshold}/aln_stats_{tool}.txt"
     log: "logs/{output_folder}/plot_exonerate_tools_{tool}_r{region}_d{depth}_t{threshold}.log"
     conda: "envs/exonerate.yaml"
     message: "Exonerate comparison between consensus for tool {wildcards.tool} and ref (region: {wildcards.region}, depth: {wildcards.depth}, preliminary consensus threshold: {wildcards.threshold}) for meta-consensus threshold in [ {config[threshold_2]} ]"
-     shell: "./exonerate_stats.sh {output} {input.ref} {input.consensus_per_tool} 2> {log}"
+     shell: "./workflow/scripts/exonerate_stats.sh {output} {input.ref} {input.consensus_per_tool} 2> {log}"


 rule plot_depth:
@@ -235,7 +238,7 @@ rule plot_depth:
     log: "logs/{output_folder}/plot_depth_r{region}_t{threshold}.log"
     output: expand("{output_folder}/plot/{region}/graph/graph_r{region}_t{threshold}_{graph}.png", graph=GRAPH_TYPES, depth=config["depth"],  allow_missing=True)
     message: "Exonerate comparison between meta-consensus and ref (region: {wildcards.region}, or preliminary consensus threshold in [ {config[threshold_1]} ],  meta-consensus threshold in [ {config[threshold_2]} ]"
-     shell: "./plot_results.py {params.target_folder}/graph_r{wildcards.region}_t{wildcards.threshold} {input} 2> {log}"
+     shell: "./workflow/scripts/plot_results.py {params.target_folder}/graph_r{wildcards.region}_t{wildcards.threshold} '[{config[threshold_2]}]' {input} 2> {log}"

 rule plot_t1:
     input: 
@@ -244,7 +247,7 @@ rule plot_t1:
     log: "logs/{output_folder}/plot_t1_r{region}_d{depth}.log"
     output: expand("{output_folder}/plot/{region}/{depth}/graph_r{region}_d{depth}_{graph}.png", graph=GRAPH_TYPES, allow_missing=True)
     message: "Creating plot for meta-consensus quality with preliminary threshold in [ {config[threshold_1]} ] and meta-consensus in [ {config[threshold_2]} ] (region: {wildcards.region}, depth: {wildcards.depth})"
-     shell: "./plot_results.py {params.target_folder}/graph_r{wildcards.region}_d{wildcards.depth} {input} 2> {log} "
+     shell: "./workflow/scripts/plot_results.py {params.target_folder}/graph_r{wildcards.region}_d{wildcards.depth} '[{config[threshold_2]}]' {input} 2> {log} "
     

 rule plot_t2:
@@ -254,11 +257,13 @@ rule plot_t2:
     log: "logs/{output_folder}/plot_t2_r{region}_d{depth}_t_{threshold}.log"
     output: expand("{output_folder}/plot/{region}/{depth}/{threshold}/graph_r{region}_d{depth}_t1_{threshold}_{graph}.png", graph=GRAPH_TYPES, allow_missing=True)
     message: "Creating plot for meta-consensus compared to preliminary consensus with meta-consensus threshold in {config[threshold_2]}  (region: {wildcards.region}, depth: {wildcards.depth}, threshold: {wildcards.threshold})"
-     shell: "./plot_results.py {params.target_folder}graph_r{wildcards.region}_d{wildcards.depth}_t1_{wildcards.threshold}  {input} 2> {log}"
+     shell: "./workflow/scripts/plot_results.py {params.target_folder}graph_r{wildcards.region}_d{wildcards.depth}_t1_{wildcards.threshold} '[{config[threshold_2]}]'  {input} 2> {log}"


-rule make_plots:
+rule plot:
     input: expand(rules.plot_t2.output, region=config["region"], depth=config["depth"], threshold=config["threshold_1"], output_folder=config["output_folder"]),
     	    expand(rules.plot_t1.output, region=config["region"], depth=config["depth"] , output_folder=config["output_folder"]),
 	    expand(rules.plot_depth.output, region=config["region"], threshold=config["threshold_1"], output_folder=config["output_folder"])
     message: "Created all plots"
+
+# minimap2 : minimap2 -a  -c longest_read.fasta reads/diploid_reads_shuffle.fasta > test_minimap_aln.sam
--- a/src/workflow/default_config.yaml
+++ b/src/workflow/default_config.yaml
-data_in: "../../data"
+data_in: "../../data/selected_read"
 data_out: "consensus/data"
 format: "msa"
 tool:  ["abpoa", "kalign", "kalign3", "mafft", "muscle", "spoa"]
@@ -8,3 +8,4 @@ region: ["100", "200", "500", "1000", "2000"]
 depth: ["10", "20", "50", "100", "150"]
 align: ["muscle", "mafft"]
 output_folder: "consensus"
+ref_folder: "../../data/seq_selectes_region"
\ No newline at end of file
--- a/src/workflow/envs/samtools.yaml
+++ b/src/workflow/envs/samtools.yaml
+name: samtools
+channels:
+  - bioconda/label/cf201901
+  - conda-forge
+  - bioconda
+  - defaults
+dependencies:
+  - _libgcc_mutex=0.1=conda_forge
+  - _openmp_mutex=4.5=2_gnu
+  - bzip2=1.0.8=h7f98852_4
+  - c-ares=1.18.1=h7f98852_0
+  - ca-certificates=2021.10.8=ha878542_0
+  - curl=7.83.1=h2283fc2_0
+  - keyutils=1.6.1=h166bdaf_0
+  - krb5=1.19.3=h08a2579_0
+  - libcurl=7.83.1=h2283fc2_0
+  - libedit=3.1.20191231=he28a2e2_2
+  - libev=4.33=h516909a_1
+  - libgcc=7.2.0=h69d50b8_2
+  - libgcc-ng=12.1.0=h8d9b700_16
+  - libgomp=12.1.0=h8d9b700_16
+  - libnghttp2=1.47.0=he49606f_0
+  - libssh2=1.10.0=ha35d2d1_2
+  - libstdcxx-ng=12.1.0=ha89aaad_16
+  - libzlib=1.2.11=h166bdaf_1014
+  - ncurses=6.3=h27087fc_1
+  - openssl=3.0.3=h166bdaf_0
+  - samtools=1.7=1
+  - xz=5.2.5=h516909a_1
+  - zlib=1.2.11=h166bdaf_1014
+prefix: /home/flav/anaconda3/envs/samtools
--- a/src/workflow/envs/star.yaml
+++ b/src/workflow/envs/star.yaml
+name: star
+channels:
+  - conda-forge
+  - bioconda
+  - defaults
+dependencies:
+  - star=2.7.10a=h9ee0642_0
+prefix: /home/flav/anaconda3/envs/star
--- a/src/MTool_over_folder.py
+++ b/src/MTool_over_folder.py
--- a/src/exonerate_stats.sh
+++ b/src/exonerate_stats.sh
--- a/src/workflow/scripts/extract_longest_read.py
+++ b/src/workflow/scripts/extract_longest_read.py
+import sys
+
+
+def extract_longest_read(filename, nb_reads):
+    max_len = [0 for x in range(nb_reads)]
+    descr= ""
+    longest_read = ["" for x in range(nb_reads)]
+    is_descr = True
+    with open(filename, 'r') as f:
+        for line in f:
+            if not is_descr:
+                min_len = min(max_len)
+                if len(line) > min_len and line[0] in ['A','C','G','T','N']:
+                    index = max_len.index(min_len)
+                    max_len[index] = len(line)
+                    longest_read[index] = descr + line
+                is_descr = True
+            elif (line[0] == '>' or line[0] == '@'):
+                is_descr = False
+                descr = ">" +line
+                
+    return longest_read
+
+def main(arg):
+    results = extract_longest_read(arg, 10)
+    with open("longest_reads.fasta", 'w') as f:
+    	for read in results:   
+            f.write("".join(read))
+
+if (__name__ == "__main__"):
+    main(sys.argv[1])
--- a/src/plot_results.py
+++ b/src/plot_results.py
@@ -8,7 +8,6 @@ import re
 import sys
 import os

-th = [30, 40, 50, 60, 70, 80]

 def parse_file(filename, x_scale):
    identity, similarity, equivalence, mismatches = [ [], [] , [], []]
@@ -68,19 +67,21 @@ def plot_from_folder(args, target, labels, folder):
                plt.plot(th,[tab for x in range(len(th))], label=labels[graph_cnt])
            tab_cnt+=1
        legend = plt.legend(shadow=True, fontsize='x-large')
-        plt.savefig(target+"_" + target_att[graph_cnt] + ".png", format="png",)
+        plt.savefig(target+"_" + target_att[graph_cnt] + ".png", format="png")
        graph_cnt+=1

 if __name__ == "__main__":
+    th = list(map(int, sys.argv[2][1:-1].split(" ")))

-    if (len(sys.argv) < 3):
+    if (len(sys.argv) < 4):
        exit(-1)
-    if (len(sys.argv) == 3):
+    if (len(sys.argv) < 4):
        print(sys.argv)
-        folder_it = os.scandir(sys.argv[2])
+        
+        folder_it = os.scandir(sys.argv[3])
        name= [ x.name for x in folder_it]
        #folder_it= os.scandir(sys.argv[2])
-        plot_from_folder(folder_it, sys.argv[1], name, folder=sys.argv[2])
+        plot_from_folder(folder_it, sys.argv[1], name, folder=sys.argv[3])
        exit(0)
    lb= []
    for elem in th:

--- a/src/split_fasta.sh
+++ b/src/split_fasta.sh
--- a/src/workflow/test_all.yaml
+++ b/src/workflow/test_all.yaml
+data_in: "../../data"
+data_out: "consensus/data"
+format: "msa"
+tool:  ["abpoa", "kalign", "kalign3", "mafft", "muscle", "spoa"]
+threshold_1: ["40", "50", "60", "70"]
+threshold_2 : ["40", "50", "60", "70"]
+region: ["0"]
+depth: ["0"]
+align: ["muscle", "mafft"]
+output_folder: "consensus"
+ref_folder: "../../data"
\ No newline at end of file