Ajout de la progression du pipeline dans le cas ou l'on a pas de réferences

970d980c · Lihouck Flavien · e42c81a3 · 970d980c · 970d980c · 970d980c
Commit 970d980c authored 3 years ago by Lihouck Flavien
--- a/.gitignore
+++ b/.gitignore
@@ -9,3 +9,6 @@ data/
 *.fasta
 MTool
 logs/
+*.bam
+*.sam
+*.txt
\ No newline at end of file
--- a/src/msa_handler/MSA_cumulative.cpp
+++ b/src/msa_handler/MSA_cumulative.cpp
+#include "MSA_cumulative.h"
+
+string MSA_cumulative::consensus(int threshold, int ploidity) {
+  string consensus_seq("");
+  int size_alpha = int(alphabet.size());
+  int current_weight(1);
+  for (int i(0); i < length; ++i) {
+    int size_bl = int(text.size());
+    int scores[size_alpha] = {0};
+    int depth_index = 0;
+    int alpha_index = 0;
+    int highest_score =0;
+    while (depth_index < size_bl && threshold*current_weight > (highest_score*100)/size_bl) {
+      alpha_index = alphabet.find(text[depth_index][i]);
+      if (alpha_index != -1) {
+	scores[alpha_index] += current_weight;
+      }
+      else {
+	vector<char> multiple_basis = IUPAC_to_basis(text[depth_index][i]);
+	int size = multiple_basis.size();
+	if (current_weight % size != 0) {
+	  current_weight *= size;
+	  for( int k(0); k <(int)alphabet.size();++k) {
+	    scores[k] *= size;
+	  }
+	}
+	for (auto it = begin(multiple_basis); it != end(multiple_basis); ++it) {
+	  scores[alphabet.find(*it)] += (current_weight / size);
+	}
+      }
+      highest_score = max(highest_score, scores[alpha_index]);
+      
+      //cerr << "index " <<alpha_index << " score " << highest_score << " percentage : " << (highest_score*100)/size_bl << endl;
+      depth_index++;
+    }
+    if (depth_index < size_bl){
+      //cerr << "char at pos " << i << " is " << alphabet[alpha_index] << " end pos is " << length << endl;
+      if (alpha_index != 0 && alpha_index != 16)
+	consensus_seq += alphabet[alpha_index];
+    }
+    else {
+      vector<char> conserved_nuc;
+      while (threshold*current_weight > (highest_score*100)/size_bl) {
+	alpha_index = 0;
+	while (alpha_index < size_alpha && scores[alpha_index] <= highest_score) {
+	  ////cerr << "alpha_index " << alpha_index << "scores :" << scores[alpha_index] << endl;
+	  alpha_index++;
+	}
+	highest_score = scores[alpha_index];
+	conserved_nuc.insert(conserved_nuc.begin(), alphabet[alpha_index]);
+	for (int k(0); k < size_alpha; k++)
+	  scores[k] += highest_score;
+	scores[alpha_index] = 0;
+      }
+     
+      char nuc = basis_to_IUPAC(conserved_nuc);
+      //cerr << "char at pos " << i << " is " << nuc << endl;
+      if (nuc != '-' && nuc != '.')
+	consensus_seq += nuc;
+    }
+  }
+  return consensus_seq;
+}
--- a/src/msa_handler/MSA_cumulative.h
+++ b/src/msa_handler/MSA_cumulative.h
+#ifndef MSA_CUMUL_H
+#define MSA_CUMUL_H
+
+#include "MSA.h"
+
+class MSA_cumulative : public MSA
+{
+ public:
+  string consensus(int similarity_threshold, int ploidity);
+};
+
+#endif
--- a/src/msa_handler/Makefile
+++ b/src/msa_handler/Makefile
@@ -6,10 +6,19 @@ all: MTool clean
 MTool: main.o MSA.o
 	$(CC) -o MTool main.o MSA.o

+MTool2: main2.o MSA.o MSA_cumulative.o
+	$(CC) -o MTool main2.o MSA.o MSA_cumulative.o
+
+
+MSA_cumulative.o: MSA_cumulative.cpp MSA_cumulative.h MSA.h
+	$(CC) $(CFLAGS) MSA_cumulative.cpp

 MSA.o: MSA.cpp MSA.h
 	$(CC) $(CFLAGS) MSA.cpp

+main2.o: main2.cpp MSA.h MSA_cumulative.h
+	$(CC) $(CFLAGS) main2.cpp
+
 main.o: main.cpp MSA.h
 	$(CC) $(CFLAGS) main.cpp


--- a/src/msa_handler/main2.cpp
+++ b/src/msa_handler/main2.cpp
+#include "MSA.h"
+#include "MSA_cumulative.h"
+#include <fstream>
+#include <iostream>
+#include <string>
+
+#define THRESHOLD 70
+#define PLOIDITY 2
+
+
+using namespace std;
+
+int main(int argc, char** argv){
+  if (argc > 1) {
+    MSA msa_fasta = MSA_cumulative();
+    msa_fasta.parser_fasta(argv[1]);
+    ofstream output;
+    string filename;
+    if (argc > 2) {
+      filename = argv[2];
+    }
+    else
+      filename = "output.txt";
+    output.open (filename, ios::out | ios::app | ios::ate);
+    string cons_name = argv[1]; 
+    int index = cons_name.find_last_of('/') +1;
+    int threshold;
+    if (argc > 3)
+      threshold = stoi(argv[3]);
+    else
+      threshold = THRESHOLD;
+    int ploidity;
+    if (argc > 4)
+      ploidity = stoi(argv[4]);
+    else
+      ploidity = PLOIDITY;
+    cons_name = ">lcl|consensus_"+cons_name.substr(index) + " threshold = " + to_string(threshold);
+    output << cons_name << endl << msa_fasta.consensus(threshold, ploidity) << endl;
+    output.close();
+  }
+}
--- a/src/workflow/Snakefile
+++ b/src/workflow/Snakefile
-configfile: "./workflow/test_all.yaml"
+import os, sys
+
+configfile: "./workflow/lambda_phage.yaml"

 DATASET = expand("{data}/{format}", data=config["data_out"], format=config["format"])
 GRAPH_TYPES = ["identity", "similarity", "equivalence", "mismatches","sequence_length", "aligned_length"]
-STAT_HEADER= "threshold, percent_identity, percent_similarity, total_equivalence, total_mismatches, target length, alignement length\n "
+STAT_HEADER= "threshold, percent_identity, percent_similarity, total_equivalence, total_mismatches, target length, alignment length\n "
+
+REGION_SIZE = 20000
+REGION_HEAD_OVERLAP = 200

-#SPLIT_SIZE = 0#
-#REGIONS_STARTS = []#
+def cut_region_from_length(length):
+    start = 2
+    region = []
+    while start < int(length):
+        region.append(str(start)+"_"+str( min(start+REGION_SIZE, int(length))))
+        start += REGION_SIZE - REGION_HEAD_OVERLAP
+    return region

-#def split_points(amount, length):#
-#    SPLIT_SIZE = math.floor(amount / length)#
-#    REGION_START = [ x for x in range(1, length, SPLIT_SIZE) ]#
+if not "region" in config:
+     config["region"] = cut_region_from_length(config["length"])

-STARTS = [1, 4000, 8000]
-ENDS = [4000, 8000, 12000]
+# Generates the meta-consensus from given reads #

 rule all:
     input:
          expand("{output_folder}/{region}/{depth}/{align}_meta_consensus_r{region}_d{depth}_t1_{threshold}_t2_{value}.fasta", region=config["region"], depth=config["depth"], threshold=config["threshold_1"], value=config["threshold_2"], align="muscle", output_folder=config["output_folder"])
     message: "Meta consensus can be found in ./[output_folder]/[region]/[depth]/"

+
+
+# -------------------------------------------------------- #
 # Installs the tool used for consensus and meta-consensus #

 rule MTool:
@@ -28,18 +39,20 @@ rule MTool:
     output:
          "msa_handler/MTool"
     shell:
-          "cd msa_handler && make"
+          "cd msa_handler && make MTool2"

+# -------------------------------------------------------- #
 # Aligns the reads on a given reference with minimap2 #

+if "ref_file" in config:
    rule alignment_reads_on_ref :
          input :
               ref=os.path.join(config["data_in"],config["ref_file"]),
               read=os.path.join(config["data_in"], config["read_file"])
          output :
-        os.path.join(config["data_in"],'alignement','aln_reads_on_ref.sam')
+               os.path.join(config["data_out"],'alignement','aln_reads_on_ref.sam')
          message :
-        "Minimap for "
+               "Minimap for {config[data_in]}"
          log:
               os.path.join('logs','2_alignment_reads_on_ref.log')
          conda:
@@ -49,13 +62,11 @@ rule alignment_reads_on_ref :
            'echo "ORDER: $ORDER" >{log};'
            '$ORDER  >{output} 2>> {log}'

-
 # Cuts the reads into smaller regions to reduce the overall size and lower the computational cost, and creates fasta to build MSA#

-
    rule reads_map_region:
        input :
-        aln = os.path.join(config['data_in'],'alignement','aln_reads_on_ref.sam'),
+            aln = os.path.join(config['data_out'],'alignement','aln_reads_on_ref.sam')
        output :
            os.path.join('{data_set}','read_map_region','read_r{start}_{end}.fasta')
        message:
@@ -65,14 +76,80 @@ rule reads_map_region:
        conda:
            "envs/perl.yaml"
        shell :
-        'ORDER="./workflow/scripts/reads_map_region.pl -s {wildcards.start} -e {wildcards.end} {input.aln} {output}";'
+            'ORDER="./workflow/scripts/reads_map_region.pl -s {wildcards.start} -e {wildcards.end} -c 100 {input.aln} {output}";'
+            'echo "ORDER: $ORDER" >{log};'
+            '$ORDER  2>&1 >>{log}'
+
+else:
+# -------------------------------------------------------- #
+# Extracts the longest read from the read pile to use as an anchor for a prealignment #
+
+    rule extract_longest_read:
+        input : read=os.path.join(config["data_in"],config["read_file"])
+        output : os.path.join(config["data_out"],"anchor.fasta")
+        message: "Extracting the longest read from {config[read_file]}"
+        shell :
+     	   "./workflow/scripts/extract_longest_read.py -i {input.read} -o {output} -n 1 "
+
+
+# Align the reads on anchor read #
+
+    rule alignment_reads_on_anchor :
+        input :
+            ref=os.path.join(config["data_out"],"anchor.fasta"),
+            read=os.path.join(config["data_in"], config["read_file"])
+        output :
+            os.path.join(config["data_in"],'alignement','aln_reads_on_read.sam')
+        message :
+            "Minimap for {config[data_out]}"
+        log:
+            os.path.join('logs','2_alignment_reads_on_anchor.log')
+        conda:
+            "envs/minimap2.yaml"
+        shell :
+            'ORDER="minimap2 -cax map-ont -t 8  {input.ref} {input.read}";'
+            'echo "ORDER: $ORDER" >{log};'
+            '$ORDER  >{output} 2>> {log}'
+
+# Select the region to keep from the anchoring #
+
+    rule get_regions_for_read :
+        input : sam={rules.alignment_reads_on_anchor.output}
+        output: os.path.join(config["data_in"], 'regions.txt')
+        conda: "envs/samtools.yaml"
+        params: depth=max(config["depth"])
+        shell: "./workflow/scripts/prepare_region_from_read.sh {input.sam} {output} {params.depth}"
+
+#    rule set_region_elem:
+#        input: {rules.get_regions_for_read.output}
+#        output: os.path.join(config["data_out"], 'region_fixed.txt')
+#        run:
+#            for elem in {output}:
+#                with open(str(elem), 'w') as f:
+#	                f.write(set_region({input}))
+#            print(config['region'])
+
+
+    rule reads_map_regions_no_ref:
+        input: sam={rules.alignment_reads_on_anchor.output}, regions={rules.get_regions_for_read.output} #, fix={rules.set_region_elem.output}
+        output: os.path.join('{data_set}', "read_map_region",'read_r{start}_{end, \d+}.fasta')
+        conda: "envs/perl.yaml"
+        log:
+            os.path.join('{data_set}','logs','3_reads_map_region_r{start}_{end}.log')
+        shell :
+                'ORDER="./workflow/scripts/reads_map_region.pl -s {wildcards.start} -e {wildcards.end} -c 100 {input.sam} {output}";'
                'echo "ORDER: $ORDER" >{log};'
                '$ORDER  2>&1 >>{log}'

-rule duct_tape:
-     input: rules.reads_map_region.output
-     output: expand(os.path.join('{data_set}','read_map_region', 'read_r{region}.fasta'), region='{start}_{end}', allow_missing=True)
-     shell: "echo 'DuctTape'"
+#'./workflow/scripts/cut_regions_with_file {input.sam} {wildcards.region}'
+
+
+
+     
+
+
+# -------------------------------------------------------- #
+# Depth selection

 rule select_depth :
    input :
@@ -89,17 +166,18 @@ rule select_depth :
        '$ORDER >{output} 2>>{log}'


+# --------------------------------------------------------#
 # Creation of MSA from the reads pile using tools  #

 rule muscle :
    input :
-        os.path.join(config["data_in"],'reads_r{region}_d{depth}.fasta')
+        os.path.join(config["data_out"],'reads_r{region}_d{depth}.fasta')
    output :
        out = os.path.join('{output_folder}', "data", config["format"]  ,'MSA_muscle_r{region}_d{depth}.fasta')
    message:
        "Muscle for {wildcards.output_folder} (Region size={wildcards.region} & Depth={wildcards.depth})"
    log:
-        os.path.join('{output_folder}','logs','6_muscle_r{region}_d{depth}.log')
+        os.path.join('logs','{output_folder}','6_muscle_r{region}_d{depth}.log')
    conda:
        "envs/muscle.yaml"
    shell :
@@ -107,13 +185,13 @@ rule muscle :

 rule mafft :
    input :
-        os.path.join(config["data_in"],'reads_r{region}_d{depth}.fasta')
+        os.path.join(config["data_out"],'reads_r{region}_d{depth}.fasta')
    output :
        out = os.path.join('{output_folder}', "data" ,config["format"] ,'mafft_uncorrected_r{region}_d{depth}.fasta')
    message:
        "Mafft for {wildcards.output_folder} (Region size={wildcards.region} & Depth={wildcards.depth})"
    log:
-        os.path.join('{output_folder}','logs','6_mafft_r{region}_d{depth}.log')
+        os.path.join('logs','{output_folder}','6_mafft_r{region}_d{depth}.log')
    conda:
        "envs/mafft.yaml"
    shell :
@@ -131,13 +209,13 @@ rule mafft_correction :

 rule spoa :
    input :
-        os.path.join(config["data_in"],'reads_r{region}_d{depth}.fasta')
+        os.path.join(config["data_out"],'reads_r{region}_d{depth}.fasta')
    output :
        out = os.path.join('{output_folder}', "data" ,config["format"],'MSA_spoa_r{region}_d{depth}.fasta')
    message:
        "Spoa for {wildcards.output_folder} (Region size={wildcards.region} & Depth={wildcards.depth})"
    log:
-        os.path.join('{output_folder}','logs','6_spoa_r{region}_d{depth}.log')
+        os.path.join('logs','{output_folder}','6_spoa_r{region}_d{depth}.log')
    conda:
        "envs/spoa.yaml"
    shell :
@@ -145,13 +223,13 @@ rule spoa :

 rule kalign :
    input :
-        os.path.join(config["data_in"],'reads_r{region}_d{depth}.fasta')
+        os.path.join(config["data_out"],'reads_r{region}_d{depth}.fasta')
    output :
        out = os.path.join('{output_folder}', "data" ,config["format"],'MSA_kalign_r{region}_d{depth}.fasta')
    message:
        "Kalign for {wildcards.output_folder} (Region size={wildcards.region} & Depth={wildcards.depth})"
    log:
-        os.path.join('{output_folder}','logs','6_kalign_r{region}_d{depth}.log')
+        os.path.join('logs','{output_folder}','6_kalign_r{region}_d{depth}.log')
    conda:
        "envs/kalign2.yaml"
    shell :
@@ -159,13 +237,13 @@ rule kalign :

 rule kalign3 :
    input :
-        os.path.join(config["data_in"],'reads_r{region}_d{depth}.fasta')
+        os.path.join(config["data_out"],'reads_r{region}_d{depth}.fasta')
    output :
        out = os.path.join('{output_folder}', "data" ,config["format"],'MSA_kalign3_r{region}_d{depth}.fasta')
    message:
        "Kalign3 for {wildcards.output_folder} (Region size={wildcards.region} & Depth={wildcards.depth})"
    log:
-        os.path.join('{output_folder}','logs','6_kalign3_r{region}_d{depth}.log')
+        os.path.join('logs','{output_folder}','6_kalign3_r{region}_d{depth}.log')
    conda:
        "envs/kalign3.yaml"
    shell :
@@ -173,13 +251,13 @@ rule kalign3 :

 rule abpoa :
    input :
-        os.path.join(config["data_in"],'reads_r{region}_d{depth}.fasta')
+        os.path.join(config["data_out"],'reads_r{region}_d{depth}.fasta')
    output :
        out = os.path.join('{output_folder}' , "data" ,config["format"],'abpoa_uncorrected_r{region}_d{depth}.fasta')
    message:
        "Abpoa for {wildcards.output_folder} (Region size={wildcards.region} & Depth={wildcards.depth})"
    log:
-        os.path.join('{output_folder}','logs','6_abpoa_r{region}_d{depth}.log')
+        os.path.join('logs','{output_folder}','6_abpoa_r{region}_d{depth}.log')
    conda:
        "envs/abpoa.yaml"
    shell :
@@ -197,8 +275,8 @@ rule abpoa_correction :



-
-# Creation of the consensus #
+# --------------------------------------------------------#
+# Creation of the consensus for each tools #

 rule create_consensus_per_tool:
     input:
@@ -214,12 +292,14 @@ rule create_consensus:
     params: data=DATASET
     log: "logs/{output_folder}consensus_r{region}_d{depth}_t{threshold}.log"
     output:
-          "{output_folder}/{region}/{depth}/test_consensus_{region}_{depth}_{threshold}.fasta"
+          "{output_folder}/{region}/{depth}/all_tools_consensus_{region}_{depth}_{threshold}.fasta"
     message: "Consensus region: {wildcards.region} , depth: {wildcards.depth} , threshold: {wildcards.threshold}\n"  
     shell: 'cat {input} > {output}' 
        #  "./workflow/scripts/MTool_over_folder.py -dir {params.data} -o {output} -region {wildcards.region} -depth {wildcards.depth} -threshold {wildcards.threshold}"#

-# Alignment for the meta-consensus #
+
+# --------------------------------------------------------#
+# Alignment of the consensus for the meta-consensus #

 rule align_mafft:
     input:
@@ -229,7 +309,7 @@ rule align_mafft:
     conda:
          "envs/mafft.yaml"
     log: "logs/{output_folder}/align_mafft_r{region}_d{depth}_t{threshold}.log"
-     message: "Alignment with mafft region: {wildcards.region} , depth: {wildcards.depth} , threshold: {threshold}\n"
+     message: "Alignment with mafft region: {wildcards.region} , depth: {wildcards.depth} , threshold: {wildcards.threshold}\n"
     shell:
          "mafft --auto {input} > {output} 2> {log}"

@@ -246,6 +326,8 @@ rule align_muscle:
          "muscle -in {input} -out {output} > {log} 2>{log}"


+
+# --------------------------------------------------------#
 # Meta consensus #

 rule meta_consensus:
@@ -259,7 +341,32 @@ rule meta_consensus:
          "./msa_handler/MTool {input.muscle} {output} {wildcards.value}"


+
+
+# -------------------------------------------------------- #
+# Trying to merge regions #
+
+
+# --------------------------------------------------------#
 # This part handle comparison to a reference and creates plots in order to better visualize the efficiency of the consensus strategie used for both the preliminary consensus and the meta-consensus. #
+if ("ref_file" in config):
+
+
+   rule region_seq :
+    input :
+        ref =os.path.join(config["data_in"],config["ref_file"])
+    output :
+        "{data_set}/region_seq_r{start}_{end}.fasta"
+    message:
+        "Region's seq for {wildcards.data_set} (Region {wildcards.start} - {wildcards.end})"
+    log:
+        "{data_set}/logs/6_region_seq_r{start}_{end}.log"
+    shell :
+        'ORDER="./workflow/scripts/region_seq.sh {wildcards.start} {input.ref} {output} {wildcards.end}";'
+        'echo "ORDER: $ORDER" >{log};'
+        '$ORDER 2>&1 >>{log}'
+
+

   rule exonerate_ref_result:
     input:
@@ -294,12 +401,21 @@ rule plottable_exonerate_tools:


   rule plot_depth:
-     input: expand(rules.plottable_exonerate_meta.output, depth=config["depth"], allow_missing=True),
+     input: expand(rules.plottable_exonerate_meta.output, depth=config["depth"], allow_missing=True)
     params: target_folder="consensus/plot/{region}/graph"
     log: "logs/{output_folder}/plot_depth_r{region}_t{threshold}.log"
-     output: expand("{output_folder}/plot/{region}/graph/graph_r{region}_t{threshold}_{graph}.png", graph=GRAPH_TYPES, depth=config["depth"],  allow_missing=True)
+     output: expand("{output_folder}/plot/{region}/graph/graph_depth_r{region}_t{threshold}_{graph}.png", graph=GRAPH_TYPES, depth=config["depth"],  allow_missing=True)
     message: "Exonerate comparison between meta-consensus and ref (region: {wildcards.region}, or preliminary consensus threshold in [ {config[threshold_1]} ],  meta-consensus threshold in [ {config[threshold_2]} ]"
-     shell: "./workflow/scripts/plot_results.py {params.target_folder}/graph_r{wildcards.region}_t{wildcards.threshold} '[{config[threshold_2]}]' {input} 2> {log}"
+     shell: "./workflow/scripts/plot_results.py {wildcards.output_folder}/plot/{wildcards.region}/graph/graph_depth_r{wildcards.region}_t{wildcards.threshold} '[{config[threshold_2]}]' {input} 2> {log}"
+
+   rule plot_depth_all:
+     input: expand(rules.plottable_exonerate_meta.output, depth=config["depth"], allow_missing=True), expand(rules.plottable_exonerate_tools.output, tool=config["tool"], depth=config["depth"], allow_missing=True)
+     params: target_folder="consensus/plot/{region}/graph"
+     log: "logs/{output_folder}/plot_all_depth_r{region}_t{threshold}.log"
+     output: expand("{output_folder}/plot/{region}/graph/graph_depth_all_r{region}_t{threshold}_{graph}.png", graph=GRAPH_TYPES, depth=config["depth"],  allow_missing=True)
+     message: "Exonerate comparison between meta-consensus and ref (region: {wildcards.region}, or preliminary consensus threshold in [ {config[threshold_1]} ],  meta-consensus threshold in [ {config[threshold_2]} ]"
+     shell: "./workflow/scripts/plot_results.py {wildcards.output_folder}/plot/{wildcards.region}/graph/graph_depth_all_r{wildcards.region}_t{wildcards.threshold} '[{config[threshold_2]}]' {input} 2> {log}"
+

   rule plot_t1:
     input: 
@@ -308,8 +424,7 @@ rule plot_t1:
     log: "logs/{output_folder}/plot_t1_r{region}_d{depth}.log"
     output: expand("{output_folder}/plot/{region}/{depth}/graph_r{region}_d{depth}_{graph}.png", graph=GRAPH_TYPES, allow_missing=True)
     message: "Creating plot for meta-consensus quality with preliminary threshold in [ {config[threshold_1]} ] and meta-consensus in [ {config[threshold_2]} ] (region: {wildcards.region}, depth: {wildcards.depth})"
-     shell: "./workflow/scripts/plot_results.py {params.target_folder}/graph_r{wildcards.region}_d{wildcards.depth} '[{config[threshold_2]}]' {input} 2> {log} "
-     
+     shell: "./workflow/scripts/plot_results.py {wildcards.output_folder}/plot/{wildcards.region}/{wildcards.depth}/graph_r{wildcards.region}_d{wildcards.depth} '[{config[threshold_2]}]' {input} 2> {log} "
     
   rule plot_t2:
     input: expand(rules.plottable_exonerate_tools.output, tool=config["tool"], allow_missing=True),
@@ -318,13 +433,18 @@ rule plot_t2:
     log: "logs/{output_folder}/plot_t2_r{region}_d{depth}_t_{threshold}.log"
     output: expand("{output_folder}/plot/{region}/{depth}/{threshold}/graph_r{region}_d{depth}_t1_{threshold}_{graph}.png", graph=GRAPH_TYPES, allow_missing=True)
     message: "Creating plot for meta-consensus compared to preliminary consensus with meta-consensus threshold in {config[threshold_2]}  (region: {wildcards.region}, depth: {wildcards.depth}, threshold: {wildcards.threshold})"
-     shell: "./workflow/scripts/plot_results.py {params.target_folder}graph_r{wildcards.region}_d{wildcards.depth}_t1_{wildcards.threshold} '[{config[threshold_2]}]'  {input} 2> {log}"
+     shell: "./workflow/scripts/plot_results.py {wildcards.output_folder}/plot/{wildcards.region}/{wildcards.depth}/{wildcards.threshold}/graph_r{wildcards.region}_d{wildcards.depth}_t1_{wildcards.threshold} '[{config[threshold_2]}]'  {input} 2> {log}"
+


+# --------------------------------------------------------#
   rule plot:
     input: expand(rules.plot_t2.output, region=config["region"], depth=config["depth"], threshold=config["threshold_1"], output_folder=config["output_folder"]),
     	    expand(rules.plot_t1.output, region=config["region"], depth=config["depth"] , output_folder=config["output_folder"]),
-	    expand(rules.plot_depth.output, region=config["region"], threshold=config["threshold_1"], output_folder=config["output_folder"])
+	        expand(rules.plot_depth.output, region=config["region"], threshold=config["threshold_1"], output_folder=config["output_folder"]),
+	        expand(rules.plot_depth_all.output, region=config["region"], threshold=config["threshold_1"], output_folder=config["output_folder"])
     message: "Created all plots"

-# minimap2 : minimap2 -a  -c longest_read.fasta reads/diploid_reads_shuffle.fasta > test_minimap_aln.sam
+else:
+   rule plot:
+     run: sys.exit("Plotting requires a reference to create statistics")
\ No newline at end of file
--- a/src/workflow/envs/samtools.yaml
+++ b/src/workflow/envs/samtools.yaml
 name: samtools
 channels:
-  - bioconda/label/cf201901
  - conda-forge
  - bioconda
  - defaults
@@ -9,12 +8,14 @@ dependencies:
  - _openmp_mutex=4.5=2_gnu
  - bzip2=1.0.8=h7f98852_4
  - c-ares=1.18.1=h7f98852_0
-  - ca-certificates=2021.10.8=ha878542_0
+  - ca-certificates=2022.5.18=ha878542_0
  - curl=7.83.1=h2283fc2_0
+  - htslib=1.9=h4da6232_3
  - keyutils=1.6.1=h166bdaf_0
  - krb5=1.19.3=h08a2579_0
  - libcurl=7.83.1=h2283fc2_0
-  - libedit=3.1.20191231=he28a2e2_2
+  - libdeflate=1.2=h516909a_1
+  - libedit=3.1.20191231=h46ee950_2
  - libev=4.33=h516909a_1
  - libgcc=7.2.0=h69d50b8_2
  - libgcc-ng=12.1.0=h8d9b700_16
@@ -23,9 +24,9 @@ dependencies:
  - libssh2=1.10.0=ha35d2d1_2
  - libstdcxx-ng=12.1.0=ha89aaad_16
  - libzlib=1.2.11=h166bdaf_1014
-  - ncurses=6.3=h27087fc_1
+  - ncurses=6.1=hf484d3e_1002
  - openssl=3.0.3=h166bdaf_0
-  - samtools=1.7=1
+  - samtools=1.9=h10a08f8_12
  - xz=5.2.5=h516909a_1
  - zlib=1.2.11=h166bdaf_1014
 prefix: /home/flav/anaconda3/envs/samtools
--- a/src/workflow/lambda_phage.yaml
+++ b/src/workflow/lambda_phage.yaml
+data_in: "../../Lambda_phage"
+read_file: "lambda_1000x_10p.fa"
+#ref_file: "lambda_virus.fa"
+data_out: "lambda/data"
+format: "msa"
+tool:  [ "kalign", "kalign3", "mafft", "muscle"]
+threshold_1: ["50",  "60"]
+threshold_2 :  ["50", "60"]
+#region: ["100_1100","1100_2100", "2100_4100", "4100_6100"]
+length: '40500'
+depth: ["15","20","30"]
+align: ["muscle"]
+output_folder: "lambda/consensus"
+ref_folder: "lambda/data"
--- a/src/workflow/scripts/delimit_regions.py
+++ b/src/workflow/scripts/delimit_regions.py
+import sys
+
+
+def main(args):
+    start_r = 0
+    regions = []
+    amount = int(args[3])
+    with open(args[1], 'r') as f:
+        for line in f:
+            line.replace("\n", '')
+            pos, depth = line.split("\t")
+            if int(depth) > amount :
+                if start_r == 0:
+                    start_r=pos
+            else:
+                if start_r != 0:
+                    regions.append((start_r, pos))
+                    start_r= 0
+    with open(args[2], 'w') as out:
+        out.write('[')
+        for elem in regions:
+            out.write('"'+elem[0]+"_"+elem[1]+'",')
+        out.write(']')
+    
+
+
+if __name__ == "__main__":
+    if len(sys.argv) == 4:
+        main(sys.argv)
+    else:
+        sys.exit("Usage:\ndelimit_regions input output min_depth\n")
+        
--- a/src/workflow/scripts/extract_longest_read.py
+++ b/src/workflow/scripts/extract_longest_read.py
-import sys
+#!/usr/bin/env python3

+import sys
+import argparse

 def extract_longest_read(filename, nb_reads):
    max_len = [0 for x in range(nb_reads)]
@@ -21,11 +23,27 @@ def extract_longest_read(filename, nb_reads):
                
    return longest_read

-def main(arg):
-    results = extract_longest_read(arg, 10)
-    with open("longest_reads.fasta", 'w') as f:
+def main():
+    parser=argparse.ArgumentParser()
+    parser.add_argument("-input", help="the file to extract reads from", required=True)
+    parser.add_argument("-number", help="the amount of reads to extract (default=1)")
+    parser.add_argument("-output", help="the output file's name (the reads will be printed to stdout otherwise)") 
+    amount = 1
+    args = parser.parse_args()
+    if args.number:
+        try:
+            amount = int(args.number)
+        except:
+            sys.exit("'-n' requires a number")
+    results = extract_longest_read(args.input, amount)
+    if args.output:
+        output= args.output
+        with open(output, 'w') as f:
    	    for read in results:   
                f.write("".join(read))
+    else:
+        for read in results:
+            print("".join(read))
    
 if (__name__ == "__main__"):
-    main(sys.argv[1])
+    main()
--- a/src/workflow/scripts/prepare_region_from_read.sh
+++ b/src/workflow/scripts/prepare_region_from_read.sh
+#!/bin/bash
+
+INPUT=$1
+OUTPUT=$2
+MIN=$3
+samtools sort -O sam $INPUT -o tmp.sam
+samtools depth tmp.sam > depth.txt
+cut -f2- depth.txt > depths_clean.txt 
+python3 workflow/scripts/delimit_regions.py depths_clean.txt $OUTPUT $MIN
+rm -f tmp.sam depth.txt depths_clean.txt
+
+
--- a/src/workflow/scripts/region_seq.sh
+++ b/src/workflow/scripts/region_seq.sh
@@ -7,8 +7,8 @@ else
    INPUT_REF=$2
    OUTPUT=$3
    REGION_SIZE=$4
-    START_REGION=`cat $INPUT_START`
-    let "END_REGION = $REGION_SIZE + START_REGION -1"
+    let "START_REGION= $INPUT_START-1"
+    let "END_REGION = $REGION_SIZE"
    cat $INPUT_REF |grep ">" >$OUTPUT
    cat $INPUT_REF |grep -v ">"|tr -d "\n"|cut -c $START_REGION-$END_REGION >>$OUTPUT
 fi
--- a/src/workflow/test_all.yaml
+++ b/src/workflow/test_all.yaml
@@ -3,11 +3,11 @@ ref_file: "ref_diploid.fasta"
 read_file: "diploid_reads_shuffle.fastq"
 data_out: "consensus/data"
 format: "msa"
-tool:  ["abpoa", "kalign", "kalign3", "mafft", "muscle", "spoa"]
+tool:  [ "kalign", "kalign3", "mafft", "muscle"]
 threshold_1: ["35", "36", "37", "38", "39","40", "41", "42", "43", "44", "45"]
 threshold_2 :  ["35", "36", "37", "38", "39","40", "41", "42", "43", "44", "45"]
-region: ["1_2000", "2000_4000"]
-depth: ["10", "20", "30"]
-align: ["muscle", "mafft"]
+region: ["2000_4000","4000_6000"]
+depth: ["5","10","15","20","30"]
+align: ["muscle"]
 output_folder: "consensus"
 ref_folder: "../../data"