Version

fdcaab75 · Lihouck Flavien · e765ad48 · fdcaab75 · fdcaab75 · fdcaab75
Commit fdcaab75 authored Jul 11, 2022 by Lihouck Flavien
--- a/mc_msa.py
+++ b/mc_msa.py
@@ -11,7 +11,7 @@ config_file = "workflow/config.yaml"

 def create_config() :
    pattern = ""
-    parser = argparse.ArgumentParser(description="Creates the config file then runs the Meta-consensus pipeline", add_help=False)
+    parser = argparse.ArgumentParser(usage="mc_msa -i INPUT -o OUTPUT [-r REFERENCE -t TOOLS ...]" ,description="Creates the config file then runs the Meta-consensus pipeline", add_help=False)
    required = parser.add_argument_group("Required arguments")
    basic = parser.add_argument_group("Standard arguments")
    advanced = parser.add_argument_group("Advanced arguments")
@@ -20,20 +20,23 @@ def create_config() :
    required.add_argument("-output", "-o", help="Target directory for the pipeline results", required=True)
    basic.add_argument("-reference", "-r", help="Reference for alignment and statistics")
    advanced.add_argument("-list", help="A list of regions to work on (format: [r1, r2, ...] or [rStart_End, ...]) (default: no region)")
-
-    basic.add_argument("-size", "-s", help="The desired region size (default: maximum)", default='0')
+    advanced.add_argument("-size", "-s", help="The desired region size (default: maximum)", default='0')
    basic.add_argument("-tools", "-t", help="The list of tools to use in the meta-consensus (default: ['abpoa', 'spoa', 'kalign2', 'kalign3', 'mafft', 'muscle'])", default="['abpoa', 'spoa', 'kalign2', 'kalign3', 'mafft', 'muscle']")
-    basic.add_argument("-consensus_threshold", "-ct", help="Threshold(s) used for the MSA consensus step (default:  [70])", default="[70]")
-    basic.add_argument("-metaconsensus_threshold", "-mt",  help="Threshold(s) used for the Meta-consensus result (default: [60])", default="[60]")
-    basic.add_argument("-depth", "-d", help="The depth used in the process (default: max)", default="0")
+    advanced.add_argument("-consensus_threshold", "-ct", help="Threshold(s) used for the MSA consensus step (default:  [70])", default="[70]")
+    advanced.add_argument("-metaconsensus_threshold", "-mt",  help="Threshold(s) used for the Meta-consensus result (default: [60])", default="[60]")
+    advanced.add_argument("-depth", "-d", help="The depth used in the process (default: max)", default="0")
    #parser.add_argument("-plot", help="Analyse the meta-consensus and MSA consensus quality (requires reference)", action='store_true')
-    advanced.add_argument("-region_overlap", help="The size of the overlap between regions", default="50")
-    advanced.add_argument("-cores", help="The amount of cores to use in the pipeline run (default 1)", default=1)
+    advanced.add_argument("-region_overlap", help="The size of the overlap between regions", default="0")
+    basic.add_argument("-cores", "-c", help="The amount of cores to use in the pipeline run (default 1)", default=1)
    args = parser.parse_args()

    with open(config_file, 'w') as c_file:
        c_file.write('read_file: ' + args.input + "\n")
-        c_file.write("output_folder: "+args.output + "\n")
+        if args.output[-1] == "/":
+            output = args.output[:-1]
+        else:
+            output = args.output
+        c_file.write("output_folder: "+ output + "\n")
        if args.reference:
            c_file.write("ref_file: "+args.reference+"\n")
        region_param = False
@@ -64,7 +67,7 @@ def create_config() :
        c_file.write("threshold_2: "+ args.metaconsensus_threshold +"\n")
        c_file.write("depth: "+args.depth +" \n")

-        plot = 'plot'
+        plot = ''
        if args.reference:
            plot = 'plot'
        return {"core": str(args.cores), "plot" : plot}

--- a/workflow/Snakefile
+++ b/workflow/Snakefile
@@ -15,6 +15,7 @@ else :

 if not "data_out" in config:
    config["data_out"] = os.path.join(config["output_folder"],"data")
+
 if not "consensus_folder" in config:
    config["consensus_folder"] = os.path.join(config["output_folder"],"consensus")

@@ -34,10 +35,13 @@ if not "data_in" in config:
    config["data_in"] = ""


+#
+
 HIGH_DEPTH = 60
 HIGH_LENGTH = 5000
+DEPTH_MIN = 50

-REGION_HEAD_OVERLAP = int(config["region_overlap"]) if "region_overlap" in config else 20
+REGION_HEAD_OVERLAP = int(config["region_overlap"]) if "region_overlap" in config else 0

 DATASET = expand("{data}/{format}", data=config["data_out"], format="msa")
 GRAPH_TYPES = ["identity", "similarity", "equivalence", "mismatches","sequence_length", "aligned_length"]
@@ -72,15 +76,6 @@ if config["depth"] == 0:
 else:
    depth_check = config['depth']

-if int(depth_check) > HIGH_DEPTH:
-    eprint("\nWarning: a high depth may cause the Multiple Sequence Alignment phase to be extremely SLOW")
-    if "muscle" in config["tool"]:
-        eprint("\t- muscle may be extremely SLOW for high depth")
-    if "abpoa" in config["tool"]:
-        eprint("\t- abpoa may require a lot of memory to run, consider disabling it or lowering the depth if you encounter problems allocating memory")
-    if "spoa" in config["tool"]:
-        eprint("\t- spoa may require a lot of memory to run, consider disabling it or lowering the depth if you encounter problems allocating memory")
-    time.sleep(1.7)

 if not 'region' in config:
    if not 'length' in config:
@@ -91,6 +86,20 @@ if not 'region' in config:
            config['length'] = len(anchor)
    config['region']= cut_region_from_length(config['length'])

+# --------------------------------- #
+# Print warnings for high depth and length #
+
+if int(depth_check) > HIGH_DEPTH:
+    eprint("\nWarning: a high depth may cause the Multiple Sequence Alignment phase to be extremely SLOW")
+    if "muscle" in config["tool"]:
+        eprint("\t- muscle may be extremely SLOW for high depth")
+    if "abpoa" in config["tool"]:
+        eprint("\t- abpoa may require a lot of memory to run, consider disabling it or lowering the depth if you encounter problems allocating memory")
+    if "spoa" in config["tool"]:
+        eprint("\t- spoa may require a lot of memory to run, consider disabling it or lowering the depth if you encounter problems allocating memory")
+    time.sleep(1.3)
+
+
 if 'length' in config and int(config['length']) > HIGH_LENGTH:
    eprint("\nWarning: a high region length may cause the Multiple Sequence Alignment phase to be extremely SLOW")
    if "muscle" in config["tool"]:
@@ -99,9 +108,10 @@ if 'length' in config and int(config['length']) > HIGH_LENGTH:
        eprint("\t- abpoa may require a lot of memory to run, consider disabling it or working on smaller regions if you encounter problems allocating memory")
    if "spoa" in config["tool"]:
        eprint("\t- spoa may require a lot of memory to run, consider disabling it or working on smaller regions if you encounter problems allocating memory")
-    time.sleep(2)
-# Generates the meta-consensus from given reads #
+    time.sleep(1.3)
+

+# Generates the meta-consensus from given reads #
 rule all:
     input:
 #          expand("{output_folder}/meta_consensus/{region}/meta_consensus_{align}_r{region}_d{depth}_t1_{threshold}_t2_{value}.fasta", region=config["region"], depth=config["depth"], threshold=config["threshold_1"], value=config["threshold_2"], align="muscle", output_folder=config["output_folder"])
@@ -144,9 +154,6 @@ if "ref_file" in config:
            'echo "ORDER: $ORDER" >{log};'
            '$ORDER  >{output} 2>> {log}'

-#    config['length'] = get_length('os.path.join(config["data_in"],config["ref_file"])'))
- #   config['region'] = cut_region_from_length(config['length'])
-
 # Cuts the reads into smaller regions to reduce the overall size and lower the computational cost, and creates fasta to build MSA#

    rule reads_map_region:
@@ -186,7 +193,7 @@ else :
        output :
            os.path.join(config["data_out"],'alignment','aln_reads_on_read.sam')
        message :
-            "Minimap for {config['read_file']}"
+            "Minimap for {config[read_file]}"
        log: os.path.join(config["output_folder"],'logs','2_alignment_reads_on_anchor.log')
        conda:
            "envs/minimap2.yaml"
@@ -196,17 +203,12 @@ else :
            '$ORDER  >{output} 2>> {log}'


-
-# Select the region to keep from the anchoring #
-  #  config['length'] = get_length(config['os.path.join(config["data_in"],config["ref_file"])'))
-   # config['region'] = cut_region_from_length(config['length'])
-
    rule get_regions_for_read :
        input : sam={rules.alignment_reads_on_anchor.output}
        output: os.path.join(config["data_out"], 'regions.txt')
        message: "Getting the regions where the reads are aligned on the anchor"
        conda: "envs/samtools.yaml"
-        params: depth=max(config["depth"])
+        params: depth=DEPTH_MIN
        shell: "./workflow/scripts/prepare_region_from_read.sh {input.sam} {output} {params.depth}"

 #    rule set_region_elem:
@@ -225,16 +227,10 @@ else :
        log:
            os.path.join(config['output_folder'],'logs','3_reads_map_region_r{start}_{end}.log')
        shell :
-                'ORDER="./workflow/scripts/reads_map_region.pl -s {wildcards.start} -e {wildcards.end} -c 100 {input.sam} {output}";'
+                'ORDER="./workflow/scripts/reads_map_region.pl -s {wildcards.start} -e {wildcards.end} -c {DEPTH_MIN} {input.sam} {output}";'
                'echo "ORDER: $ORDER" >{log};'
                '$ORDER  2>&1 >>{log}'

-#'./workflow/scripts/cut_regions_with_file {input.sam} {wildcards.region}'
-
-
-
-     
-

 # -------------------------------------------------------- #
 # Depth selection
@@ -265,7 +261,7 @@ rule muscle :
    message:
        "Muscle for {wildcards.output_folder} (Region size={wildcards.region} & Depth={wildcards.depth})"
    log:
-        os.path.join('logs','{output_folder}','6_muscle_r{region}_d{depth}.log')
+        os.path.join('{output_folder}','logs','6_muscle_r{region}_d{depth}.log')
    conda:
        "envs/muscle.yaml"
    shell :
@@ -279,7 +275,7 @@ rule mafft :
    message:
        "Mafft for {wildcards.output_folder} (Region size={wildcards.region} & Depth={wildcards.depth})"
    log:
-        os.path.join('logs','{output_folder}','6_mafft_r{region}_d{depth}.log')
+        os.path.join('{output_folder}','logs','6_mafft_r{region}_d{depth}.log')
    conda:
        "envs/mafft.yaml"
    shell :
@@ -303,7 +299,7 @@ rule spoa :
    message:
        "Spoa for {wildcards.output_folder} (Region size={wildcards.region} & Depth={wildcards.depth})"
    log:
-        os.path.join('logs','{output_folder}','6_spoa_r{region}_d{depth}.log')
+        os.path.join('{output_folder}','logs','6_spoa_r{region}_d{depth}.log')
    conda:
        "envs/spoa.yaml"
    shell :
@@ -317,7 +313,7 @@ rule kalign2 :
    message:
        "Kalign2 for {wildcards.output_folder} (Region size={wildcards.region} & Depth={wildcards.depth})"
    log:
-        os.path.join('logs','{output_folder}','6_kalign_r{region}_d{depth}.log')
+        os.path.join('{output_folder}','logs','6_kalign_r{region}_d{depth}.log')
    conda:
        "envs/kalign2.yaml"
    shell :
@@ -331,7 +327,7 @@ rule kalign3 :
    message:
        "Kalign3 for {wildcards.output_folder} (Region size={wildcards.region} & Depth={wildcards.depth})"
    log:
-        os.path.join('logs','{output_folder}','6_kalign3_r{region}_d{depth}.log')
+        os.path.join('{output_folder}','logs','6_kalign3_r{region}_d{depth}.log')
    conda:
        "envs/kalign3.yaml"
    shell :
@@ -378,7 +374,7 @@ rule abpoa_correction :
 #    shell :
 #        "the shell command for your tool. with {input} as the input and {output} as the output"
 # This will only work if the command output the files as written in output.
-# If all is done correctly, you should 
+# If all is done correctly, you should be able to add the tool in the pipeline run



@@ -449,7 +445,7 @@ rule align_muscle:

 rule meta_consensus:
     input:
-          muscle=rules.align_muscle.output
+          muscle=expand(rules.align_muscle.output, output_folder=config['consensus_folder'], allow_missing=True)
     output:
         os.path.join("{output_folder}","meta_consensus", "{region}", "meta_consensus_{align}_r{region}_d{depth}_t1_{threshold}_t2_{value}.fasta")
     log: "{output_folder}/logs/meta_consensus_{align}_r{region}_d{depth}_t1_{threshold}_t2_{value}.log"
@@ -468,6 +464,10 @@ rule concat_region:
 # --------------------------------------------------------#
 # This part handle comparison to a reference and creates plots in order to better visualize the efficiency of the consensus strategy used for both the preliminary consensus and the meta-consensus. #
 if "ref_file" in config:
+
+# --------------------------------------------------------#
+# Cut the reference region
+
   rule region_seq :
    input :
        ref =os.path.join(config["data_in"],config["ref_file"])
@@ -492,7 +492,6 @@ if "ref_file" in config:
     params: format="\"%pi, %ps, %et, %em\n\""
     conda: "envs/exonerate.yaml"
     shell: "./workflow/scripts/exonerate_stats.sh {output} {input.ref} {input.files} {input.consensus_per_tool}"
-#          "echo {STAT_HEADER} >> {output} \n for FILE in {input.files}; do exonerate --bestn 1 -Q dna -E -m a:g --showsugar false --showvulgar false --showcigar false  --ryo {params.format} --verbose 0

   rule exonerate_stats_meta:
     input: files=expand(rules.meta_consensus.output, align="muscle", value=config["threshold_2"], allow_missing=True) , ref=expand("{ref}/region_references/region_seq_r{region}.fasta", data_in=config["data_in"], ref=config["data_out"], allow_missing=True)
@@ -547,6 +546,7 @@ if "ref_file" in config:
     shell: "./workflow/scripts/plot_results.py {wildcards.plot_folder}/plot/{wildcards.region}/{wildcards.depth}/{wildcards.threshold}/graph_r{wildcards.region}_d{wildcards.depth}_t1_{wildcards.threshold} '[{config[threshold_2]}]'  {input} 2> {log}"


+# Extracting the mismatching regions, using exonerate to align the meta-consensus with the reference to find them.

   rule comparison_ref_metaconsensus:
    input: query=rules.concat_region.output, target=config["ref_file"]

--- a/workflow/config.yaml
+++ b/workflow/config.yaml
 read_file: ../Lambda_phage/lambda_1000x_10p.fa
 output_folder: Lambda_10
 ref_file: ../Lambda_phage/lambda_virus.fa
-region_size: 0
-region_overlap: 50
+region_size: 5000
+region_overlap: 0
 tool: ['abpoa', 'spoa', 'kalign2', 'kalign3', 'mafft', 'muscle']
 threshold_1: [70]
 threshold_2: [60]
-depth: 0 
+depth: 30 
--- a/workflow/scripts/concat_regions.py
+++ b/workflow/scripts/concat_regions.py
@@ -17,5 +17,7 @@ def main():
            file_cnt += 1

        output.write('\n')
+
+
 if __name__ == "__main__" :
    main()
--- a/workflow/scripts/overlap_merge.py
+++ b/workflow/scripts/overlap_merge.py
+#!/usr/bin/env python3
+
+import sys
+
+
+def main():
+    args = sys.argv
+    overlap = int(args[2])
+    with open(args[1], "w") as output:
+        with open(args[3], 'r') as file:
+            lines = file.readlines()
+            output.write(">meta_consensus\n"+ lines[1][:-1])
+        file_cnt = 4
+        while file_cnt < len(args):
+            with open(args[file_cnt], "r") as file:
+                lines = file.readlines()
+                output.write(lines[1][1:-1])
+            file_cnt += 1
+
+        output.write('\n')
+
+
+if __name__ == "__main__" :
+    main()