Fixed default values

e765ad48 · Lihouck Flavien · 1500fc37 · e765ad48 · e765ad48 · e765ad48
Commit e765ad48 authored 3 years ago by Lihouck Flavien
--- a/mc_msa.py
+++ b/mc_msa.py
@@ -11,23 +11,25 @@ config_file = "workflow/config.yaml"

 def create_config() :
    pattern = ""
-    parser = argparse.ArgumentParser(description="Creates the config file then runs the Meta-consensus pipeline")
+    parser = argparse.ArgumentParser(description="Creates the config file then runs the Meta-consensus pipeline", add_help=False)
    required = parser.add_argument_group("Required arguments")
    basic = parser.add_argument_group("Standard arguments")
    advanced = parser.add_argument_group("Advanced arguments")
+    basic.add_argument("-h", "--help", action="help", help="show this help message and exit")
    required.add_argument( "-input", "-i", help="Reads file", required=True)
    required.add_argument("-output", "-o", help="Target directory for the pipeline results", required=True)
    basic.add_argument("-reference", "-r", help="Reference for alignment and statistics")
    advanced.add_argument("-list", help="A list of regions to work on (format: [r1, r2, ...] or [rStart_End, ...]) (default: no region)")
-    basic.add_argument("-size", "-s", help="The desired region size (default: 2000)", default='2000')
+
+    basic.add_argument("-size", "-s", help="The desired region size (default: maximum)", default='0')
    basic.add_argument("-tools", "-t", help="The list of tools to use in the meta-consensus (default: ['abpoa', 'spoa', 'kalign2', 'kalign3', 'mafft', 'muscle'])", default="['abpoa', 'spoa', 'kalign2', 'kalign3', 'mafft', 'muscle']")
    basic.add_argument("-consensus_threshold", "-ct", help="Threshold(s) used for the MSA consensus step (default:  [70])", default="[70]")
    basic.add_argument("-metaconsensus_threshold", "-mt",  help="Threshold(s) used for the Meta-consensus result (default: [60])", default="[60]")
-    basic.add_argument("-depth", "-d", help="The depth used in the process (default: [60])", default="[60]")
+    basic.add_argument("-depth", "-d", help="The depth used in the process (default: max)", default="0")
    #parser.add_argument("-plot", help="Analyse the meta-consensus and MSA consensus quality (requires reference)", action='store_true')
    advanced.add_argument("-region_overlap", help="The size of the overlap between regions", default="50")
    advanced.add_argument("-cores", help="The amount of cores to use in the pipeline run (default 1)", default=1)
-    args = parser.parse_args(['-h'])
+    args = parser.parse_args()

    with open(config_file, 'w') as c_file:
        c_file.write('read_file: ' + args.input + "\n")

--- a/src/util/exonerate_parser.py
+++ b/src/util/exonerate_parser.py
@@ -15,7 +15,7 @@ def main():
        line_cnt = 1
        problems = []
        while line_cnt < len(lines):
-            char_cnt = 8
+            char_cnt = 9
            while char_cnt < len(lines[line_cnt])-9:
                if lines[line_cnt][char_cnt] != "|":
                    print((char_cnt, lines[line_cnt-1][char_cnt], lines[line_cnt][char_cnt], lines[line_cnt+1][char_cnt]))
@@ -38,7 +38,7 @@ def main():
                start = y - distance if y - distance > 0 else 0
                end = y + distance if y + distance < len(target[1]) else len(target[1])-1
                output.write("Error "+str(pb_cnt) + ":\n" + str(value1 + start)+ ": " + target[1][start:end] + "\n")
-                output.write(str(value1+start)+": "+lines[x][start+8:end+8]+"`\n")
+                output.write(str(value1+start)+": "+lines[x][start+9:end+9]+"\n")
                output.write(str(value2 + start) + ": " + query[1][start:end] + "\n\n")
                pb_cnt += 1


--- a/workflow/Snakefile
+++ b/workflow/Snakefile
 import os, sys
+import time
 from scripts.extract_longest_read import extract_longest_read



 #configfile: "./workflow/config.yaml"

-
 # ------------------------------------ #
 # Value tests and initialization #
 if not "region_size" in config:
@@ -13,9 +13,6 @@ if not "region_size" in config:
 else :
    config["region_size"] = int(config["region_size"])

-if not "depth" in config:
-    config["depth"] =  "80"
-
 if not "data_out" in config:
    config["data_out"] = os.path.join(config["output_folder"],"data")
 if not "consensus_folder" in config:
@@ -37,6 +34,8 @@ if not "data_in" in config:
    config["data_in"] = ""


+HIGH_DEPTH = 60
+HIGH_LENGTH = 5000

 REGION_HEAD_OVERLAP = int(config["region_overlap"]) if "region_overlap" in config else 20

@@ -60,16 +59,47 @@ def cut_region_from_length(length):
        start += config["region_size"] - REGION_HEAD_OVERLAP
    return region

+def get_depth(file):
+    out = subprocess.run(["wc", "-l", file], capture_output=True, text=True)
+    return int(int(out.stdout.split(" ")[0]) /2)
+
+def eprint(*args, **kwargs):
+    print(*args, file=sys.stderr, **kwargs)
+
+
+if config["depth"] == 0:
+    depth_check = get_depth(config["read_file"])
+else:
+    depth_check = config['depth']
+
+if int(depth_check) > HIGH_DEPTH:
+    eprint("\nWarning: a high depth may cause the Multiple Sequence Alignment phase to be extremely SLOW")
+    if "muscle" in config["tool"]:
+        eprint("\t- muscle may be extremely SLOW for high depth")
+    if "abpoa" in config["tool"]:
+        eprint("\t- abpoa may require a lot of memory to run, consider disabling it or lowering the depth if you encounter problems allocating memory")
+    if "spoa" in config["tool"]:
+        eprint("\t- spoa may require a lot of memory to run, consider disabling it or lowering the depth if you encounter problems allocating memory")
+    time.sleep(1.7)

 if not 'region' in config:
   if not 'length' in config:
      if 'ref_file' in config:
         config['length'] = get_length(os.path.join(config["data_in"],config["ref_file"]))
      else:
-         config['length'] = len(extract_longest_read(os.path.join(config["data_in"], config["read_file"]), 1)[0])
+         anchor = extract_longest_read(os.path.join(config["data_in"], config["read_file"]), 1)[0]
+         config['length'] = len(anchor)
   config['region']= cut_region_from_length(config['length'])

-
+if 'length' in config and int(config['length']) > HIGH_LENGTH:
+    eprint("\nWarning: a high region length may cause the Multiple Sequence Alignment phase to be extremely SLOW")
+    if "muscle" in config["tool"]:
+        eprint("\t- muscle may be extremely SLOW for high length")
+    if "abpoa" in config["tool"]:
+        eprint("\t- abpoa may require a lot of memory to run, consider disabling it or working on smaller regions if you encounter problems allocating memory")
+    if "spoa" in config["tool"]:
+        eprint("\t- spoa may require a lot of memory to run, consider disabling it or working on smaller regions if you encounter problems allocating memory")
+    time.sleep(2)
 # Generates the meta-consensus from given reads #

 rule all:
@@ -518,12 +548,25 @@ if "ref_file" in config:



+   rule comparison_ref_metaconsensus:
+    input: query=rules.concat_region.output, target=config["ref_file"]
+    output: os.path.join("{output_folder}", "mismatches", "alignment_meta_ref_d{depth}_t1_{threshold}_t2_{value}.fasta")
+    conda: "envs/exonerate.yaml"
+    shell: "exonerate --bestn 1 -Q dna -E -m a:g --showvulgar false --showsugar false --showcigar false --showalignment true {input.target} {input.query} > {output} 2>/dev/null"
+
+   rule extract_mismatches:
+    input: rules.comparison_ref_metaconsensus.output
+    output: os.path.join("{output_folder}", "mismatches", "mismatches_d{depth}_t1_{threshold}_t2_{value}.fasta")
+    shell: "./src/util/exonerate_parser.py {input} {output} >/dev/null"
+
 # --------------------------------------------------------#
   rule plot:
     input: expand(rules.plot_t2.output, region=config["region"], depth=config["depth"], threshold=config["threshold_1"], plot_folder=config["output_folder"]),
            expand(rules.plot_t1.output, region=config["region"], depth=config["depth"] , plot_folder=config["output_folder"]),
            expand(rules.plot_depth.output, region=config["region"], threshold=config["threshold_1"], plot_folder=config["output_folder"]),
-            expand(rules.plot_depth_all.output, region=config["region"], threshold=config["threshold_1"], plot_folder=config["output_folder"])
+            expand(rules.plot_depth_all.output, region=config["region"], threshold=config["threshold_1"], plot_folder=config["output_folder"]),
+            expand(rules.extract_mismatches.output , threshold=config["threshold_1"], value=config["threshold_2"], output_folder=config["output_folder"], depth=config["depth"])
+
     message: "Created all plots"

 else :

--- a/workflow/config.yaml
+++ b/workflow/config.yaml
 read_file: ../Lambda_phage/lambda_1000x_10p.fa
 output_folder: Lambda_10
 ref_file: ../Lambda_phage/lambda_virus.fa
-region_size: 4000
-region_overlap: 0
+region_size: 0
+region_overlap: 50
 tool: ['abpoa', 'spoa', 'kalign2', 'kalign3', 'mafft', 'muscle']
 threshold_1: [70]
 threshold_2: [60]
-depth: 30 
+depth: 0