Fixed the file for pipeline launcher/installer, everything should work now....

Fixed the file for pipeline launcher/installer, everything should work now. Added concat strategy for joining regions

Fixed the file for pipeline launcher/installer, everything should work now....
ee47cc40 · Lihouck Flavien · 822a6f77 · ee47cc40 · ee47cc40 · ee47cc40
Commit ee47cc40 authored 3 years ago by Lihouck Flavien
--- a/mc_msa.py
+++ b/mc_msa.py
@@ -2,26 +2,32 @@

 import subprocess
 import os
-import sys , argparse
+import sys
+import argparse
+
+

 config_file = "workflow/config.yaml"

 def create_config() :
    pattern = ""
    parser = argparse.ArgumentParser(description="Creates the config file then runs the Meta-consensus pipeline")
-    parser.add_argument("-input", help="Reads file", required=True)
-    parser.add_argument("-output", help="Target directory for the pipeline results", required=True)
-    parser.add_argument("-reference", help="Reference for alignment and statistics")
-    parser.add_argument("-list", help="A list of regions to work on (format: [r1, r2, ...] or [rStart_End, ...]) (default: no region)")
-    parser.add_argument("-size", help="The desired region size (default: 2000)", default='2000')
-    parser.add_argument("-tools", help="The list of tools to use in the meta-consensus (default: ['abpoa', 'spoa', 'kalign2', 'kalign3', 'mafft', 'muscle'])", default="['abpoa', 'spoa', 'kalign2', 'kalign3', 'mafft', 'muscle']")
-    parser.add_argument("-consensus_threshold", help="Threshold(s) used for the MSA consensus step (default:  [70])", default="[70]")
-    parser.add_argument("-metaconsensus_threshold", help="Threshold(s) used for the Meta-consensus result (default: [60])", default="[60]")
-    parser.add_argument("-depth", help="The depth used in the process (default: [60])", default="[60]")
-    parser.add_argument("-plot", help="Analyse the meta-consensus and MSA consensus quality (requires reference)", action='store_true')
-    parser.add_argument("-region_overlap", help="The size of the overlap between regions", default="50")
-    parser.add_argument("-cores", help="The amount of cores to use in the pipeline run (default 1)", default=1)
-    args = parser.parse_args()
+    required = parser.add_argument_group("Required arguments")
+    basic = parser.add_argument_group("Standard arguments")
+    advanced = parser.add_argument_group("Advanced arguments")
+    required.add_argument( "-input", "-i", help="Reads file", required=True)
+    required.add_argument("-output", "-o", help="Target directory for the pipeline results", required=True)
+    basic.add_argument("-reference", "-r", help="Reference for alignment and statistics")
+    advanced.add_argument("-list", help="A list of regions to work on (format: [r1, r2, ...] or [rStart_End, ...]) (default: no region)")
+    basic.add_argument("-size", "-s", help="The desired region size (default: 2000)", default='2000')
+    basic.add_argument("-tools", "-t", help="The list of tools to use in the meta-consensus (default: ['abpoa', 'spoa', 'kalign2', 'kalign3', 'mafft', 'muscle'])", default="['abpoa', 'spoa', 'kalign2', 'kalign3', 'mafft', 'muscle']")
+    basic.add_argument("-consensus_threshold", "-ct", help="Threshold(s) used for the MSA consensus step (default:  [70])", default="[70]")
+    basic.add_argument("-metaconsensus_threshold", "-mt",  help="Threshold(s) used for the Meta-consensus result (default: [60])", default="[60]")
+    basic.add_argument("-depth", "-d", help="The depth used in the process (default: [60])", default="[60]")
+    #parser.add_argument("-plot", help="Analyse the meta-consensus and MSA consensus quality (requires reference)", action='store_true')
+    advanced.add_argument("-region_overlap", help="The size of the overlap between regions", default="50")
+    advanced.add_argument("-cores", help="The amount of cores to use in the pipeline run (default 1)", default=1)
+    args = parser.parse_args(['-h'])

    with open(config_file, 'w') as c_file:
        c_file.write('read_file: ' + args.input + "\n")
@@ -56,8 +62,8 @@ def create_config() :
        c_file.write("threshold_2: "+ args.metaconsensus_threshold +"\n")
        c_file.write("depth: "+args.depth +" \n")

-        plot = None
-        if args.plot and args.reference:
+        plot = 'plot'
+        if args.reference:
            plot = 'plot'
        return {"core": str(args.cores), "plot" : plot}

@@ -77,8 +83,6 @@ def config_conda():
 def main():
    config_conda()
    data = create_config()
-    if not data['plot']:
-        data['plot'] = ""
    result = subprocess.run(["./workflow/scripts/snakemake_launcher.sh",os.path.join("workflow","config.yaml"), data["core"] , data['plot']])

 if __name__ == "__main__":

--- a/src/util/exonerate_parser.py
+++ b/src/util/exonerate_parser.py
+#!/usr/bin/env python3
+
+import sys
+
+DEFAULT_DISTANCE = 10
+
+def main():
+    if len(sys.argv) > 3:
+        distance = int(sys.argv[3])
+    else:
+        distance = DEFAULT_DISTANCE
+    with open(sys.argv[1], 'r') as f:
+        lines = [line.replace("\n","") for line in f][12:-2]
+
+        line_cnt = 1
+        problems = []
+        while line_cnt < len(lines):
+            char_cnt = 8
+            while char_cnt < len(lines[line_cnt])-9:
+                if lines[line_cnt][char_cnt] != "|":
+                    print((char_cnt, lines[line_cnt-1][char_cnt], lines[line_cnt][char_cnt], lines[line_cnt+1][char_cnt]))
+                    problems.append((line_cnt, char_cnt-8))
+                    char_cnt += distance
+                else:
+                    char_cnt += 1
+            line_cnt += 4
+        region = []
+        with open(sys.argv[2], 'w') as output:
+            pb_cnt = 1
+            for pb in problems:
+                x, y = pb
+                target = lines[x-1].split(":")
+                query = lines[x+1].split(":")
+                target[1] = target[1].strip(" ")
+                query[1] = query[1].strip(" ")
+                value1 = int(target[0])
+                value2 = int(query[0])
+                start = y - distance if y - distance > 0 else 0
+                end = y + distance if y + distance < len(target[1]) else len(target[1])-1
+                output.write("Error "+str(pb_cnt) + ":\n" + str(value1 + start)+ ": " + target[1][start:end] + "\n")
+                output.write(str(value1+start)+": "+lines[x][start+8:end+8]+"`\n")
+                output.write(str(value2 + start) + ": " + query[1][start:end] + "\n\n")
+                pb_cnt += 1
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
--- a/workflow/Snakefile
+++ b/workflow/Snakefile
@@ -51,10 +51,10 @@ def get_length(file):
    return len(l)

 def cut_region_from_length(length):
-    start = 2
+    start = 1
    region = []
    if config["region_size"] == 0:
-        return ["2_"+str(length -1)]
+        return ["1_"+str(length -1)]
    while start+config["region_size"]  < int(length):
        region.append(str(start)+"_"+str( min(start+config["region_size"] , int(length)-2)))
        start += config["region_size"] - REGION_HEAD_OVERLAP
@@ -74,7 +74,9 @@ if not 'region' in config:

 rule all:
     input:
-          expand("{output_folder}/meta_consensus/{region}/meta_consensus_{align}_r{region}_d{depth}_t1_{threshold}_t2_{value}.fasta", region=config["region"], depth=config["depth"], threshold=config["threshold_1"], value=config["threshold_2"], align="muscle", output_folder=config["output_folder"])
+#          expand("{output_folder}/meta_consensus/{region}/meta_consensus_{align}_r{region}_d{depth}_t1_{threshold}_t2_{value}.fasta", region=config["region"], depth=config["depth"], threshold=config["threshold_1"], value=config["threshold_2"], align="muscle", output_folder=config["output_folder"])
+        expand(os.path.join("{output_folder}","meta_consensus","meta_consensus_d{depth}_t1_{threshold}_t2_{value}.fasta"), depth=config["depth"], threshold=config["threshold_1"], value=config["threshold_2"], align="muscle", output_folder=config["output_folder"])
+
     message: "Meta consensus can be found in {input}"


@@ -154,7 +156,7 @@ else :
        output :
            os.path.join(config["data_out"],'alignment','aln_reads_on_read.sam')
        message :
-            "Minimap for {config[data_out]}"
+            "Minimap for {config['read_file']}"
        log: os.path.join(config["output_folder"],'logs','2_alignment_reads_on_anchor.log')
        conda:
            "envs/minimap2.yaml"
@@ -172,6 +174,7 @@ else :
    rule get_regions_for_read :
        input : sam={rules.alignment_reads_on_anchor.output}
        output: os.path.join(config["data_out"], 'regions.txt')
+        message: "Getting the regions where the reads are aligned on the anchor"
        conda: "envs/samtools.yaml"
        params: depth=max(config["depth"])
        shell: "./workflow/scripts/prepare_region_from_read.sh {input.sam} {output} {params.depth}"
@@ -424,6 +427,14 @@ rule meta_consensus:
     shell:
          "./src/msa_handler/MTool {input.muscle} {output} {wildcards.value}"

+
+rule concat_region:
+    input: expand(rules.meta_consensus.output, region=config["region"], align="muscle" , allow_missing=True)
+    output: os.path.join("{output_folder}", "meta_consensus","meta_consensus_d{depth}_t1_{threshold}_t2_{value}.fasta")
+    shell:
+        "./workflow/scripts/concat_regions.py {output} {input}"
+
+
 # --------------------------------------------------------#
 # This part handle comparison to a reference and creates plots in order to better visualize the efficiency of the consensus strategy used for both the preliminary consensus and the meta-consensus. #
 if "ref_file" in config:
@@ -451,7 +462,7 @@ if "ref_file" in config:
     params: format="\"%pi, %ps, %et, %em\n\""
     conda: "envs/exonerate.yaml"
     shell: "./workflow/scripts/exonerate_stats.sh {output} {input.ref} {input.files} {input.consensus_per_tool}"
-#          "echo {STAT_HEADER} >> {output} \n for FILE in {input.files}; do exonerate --bestn 1 -Q dna -E -m a:g --showalignment false --showsugar false --showvulgar false --showcigar false  --ryo {params.format} --verbose 0 -q {input.ref} -t $FILE >> {output}  2>/dev/null; done" #
+#          "echo {STAT_HEADER} >> {output} \n for FILE in {input.files}; do exonerate --bestn 1 -Q dna -E -m a:g --showsugar false --showvulgar false --showcigar false  --ryo {params.format} --verbose 0

   rule exonerate_stats_meta:
     input: files=expand(rules.meta_consensus.output, align="muscle", value=config["threshold_2"], allow_missing=True) , ref=expand("{ref}/region_references/region_seq_r{region}.fasta", data_in=config["data_in"], ref=config["data_out"], allow_missing=True)
@@ -497,8 +508,8 @@ if "ref_file" in config:
     shell: "./workflow/scripts/plot_results.py {wildcards.plot_folder}/plot/{wildcards.region}/{wildcards.depth}/graph_r{wildcards.region}_d{wildcards.depth} '[{config[threshold_2]}]' {input} 2> {log} "
     
   rule plot_t2:
-     input: expand(rules.exonerate_stats_tools.output, tool=config["tool"], output_folder=config["consensus_folder"], allow_missing=True),
-            expand(rules.exonerate_stats_meta.output, output_folder=config["consensus_folder"], allow_missing=True)
+     input: expand(rules.exonerate_stats_tools.output, tool=config["tool"], output_folder=config["output_folder"], allow_missing=True),
+            expand(rules.exonerate_stats_meta.output, output_folder=config["output_folder"], allow_missing=True)
     params: target_folder="{plot_folder}/plot/{region}/{depth}/{threshold}/"
     log: "{plot_folder}/logs/plot_t2_r{region}_d{depth}_t_{threshold}.log"
     output: expand(os.path.join("{plot_folder}","plot","{region}","{depth}","{threshold}","graph_r{region}_d{depth}_t1_{threshold}_{graph}.png"), graph=GRAPH_TYPES, allow_missing=True)

--- a/workflow/config.yaml
+++ b/workflow/config.yaml
-read_file: ../Lambda_phage/lambda_1000x_5p.fa
-output_folder: lambda_config
+read_file: ../Lambda_phage/lambda_1000x_10p.fa
+output_folder: Lambda_10
 ref_file: ../Lambda_phage/lambda_virus.fa
-region_size: 2000
-region_overlap: 50
+region_size: 4000
+region_overlap: 0
 tool: ['abpoa', 'spoa', 'kalign2', 'kalign3', 'mafft', 'muscle']
 threshold_1: [70]
 threshold_2: [60]
-depth: [60] 
+depth: 30 
--- a/workflow/scripts/concat_regions.py
+++ b/workflow/scripts/concat_regions.py
+#!/usr/bin/env python3
+
+import sys
+
+
+def main():
+    args = sys.argv
+    with open(args[1], "w") as output:
+        with open(args[2], 'r') as file:
+            lines = file.readlines()
+            output.write(">meta_consensus\n"+ lines[1][:-1])
+        file_cnt = 3
+        while file_cnt < len(args):
+            with open(args[file_cnt], "r") as file:
+                lines = file.readlines()
+                output.write(lines[1][1:-1])
+            file_cnt += 1
+
+        output.write('\n')
+if __name__ == "__main__" :
+    main()
--- a/workflow/scripts/region_seq.sh
+++ b/workflow/scripts/region_seq.sh
@@ -14,7 +14,7 @@ else
    INPUT_REF=$2
    OUTPUT=$3
    REGION_SIZE=$4
-    let "START_REGION= $INPUT_START-1"
+    let "START_REGION= $INPUT_START"
    let "END_REGION = $REGION_SIZE"
    cat $INPUT_REF |grep ">" >$OUTPUT
    cat $INPUT_REF |grep -v ">"|tr -d "\n"|cut -c $START_REGION-$END_REGION >>$OUTPUT

--- a/workflow/scripts/snakemake_launcher.sh
+++ b/workflow/scripts/snakemake_launcher.sh
@@ -2,7 +2,7 @@
 source ./workflow/scripts/config_conda.sh
 if [[ ! -d .conda_snakemake ]]; then
  echo "Create a conda env for snakemake"
-  conda env create -p .conda_snakemake -f src/workflow/envs/snakemake.yaml  >/dev/null
+  conda env create -p .conda_snakemake -f workflow/envs/snakemake.yaml  >/dev/null
 fi
 CURRENT_PATH=`pwd`
 conda activate $CURRENT_PATH/.conda_snakemake