Lihouck Flavien / MC_MSA / Commits / e1386df6

Commit e1386df6 authored Jul 21, 2022 by Lihouck Flavien

Cleaned read cuts output

Parent: 73b56147
Branch: main
No related tags found
No related merge requests found

Showing 2 changed files with 57 additions and 30 deletions:
  src/util/exonerate_parser.py   (+50, -25)
  workflow/Snakefile             (+7, -5)
src/util/exonerate_parser.py  (+50, -25)
 #!/usr/bin/env python3
 #
-# This script aims to extract the mismatches found by exonerate and
-# all the problems with their "origin" region
+# This script aims to extract the mismatches found by exonerate from the matching reads region, creating a file of
+# all the problems with their "origin" region
 #
 import sys
 
 DEFAULT_DISTANCE = 10
 LINE_HEADER_SIZE = 9
 
-def get_region_file(regions, folder, value):
+def get_region_file(regions, folder, value, format_func="{0}/read_r{1}.fasta".format):
+    """
+    Find the read file
+    :param regions: (list/string) the list of every regions format to look for in
+    :param folder: the read regions' folder
+    :param value: the position to extract from target file
+    :param format_func: the read regions' file naming format
+    :return:
+    """
     target_region = ""
     if not isinstance(regions, list):
         regions = regions.split(",")
...
...
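The new format_func parameter replaces the hard-coded "<folder>/read_r<region>.fasta" concatenation in the return statement shown in the next hunk, so a caller can supply its own naming scheme. A minimal sketch, not part of the commit; folder, region list and positions are hypothetical:

# Sketch only; values below are made up.
regions = "0_5000,5000_10000"
folder = "out/data/read_map_region"
get_region_file(regions, folder, 4200)
# -> "out/data/read_map_region/read_r0_5000.fasta" (default "{0}/read_r{1}.fasta".format)
get_region_file(regions, folder, 4200, format_func="{0}/pile_{1}.fa".format)
# -> "out/data/read_map_region/pile_0_5000.fa"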
@@ -20,25 +29,39 @@ def get_region_file(regions, folder, value):
         val = elem.split("_")
         if int(val[0]) <= value < int(val[1]):
             target_region = elem
-    return folder + "/read_r" + target_region + ".fasta" if target_region != "" else ""
+    return format_func(folder, target_region) if target_region != "" else ""
 
 
-def extract_read_pile(file, value, distance, output, padding, safe):
+def extract_read_pile(file, position, distance, output, padding):
+    """
+    Extracts every reads matching the region from file and writes them into the output file
+    :param file: the reads file
+    :param position: the mismatch position
+    :param distance: the size of elements to pick before and after the position
+    :param output: the output file to write in
+    :param padding: the padding length to add at the beginning of each line
+    :return: the amount of reads written
+    """
     line_cnt = 0
-    print(value, safe)
     with open(file, "r") as input:
         for line in input:
             output.write(" " * padding)
             if line_cnt % 2 == 0:
                 output.write(line)
             else:
-                start = value - distance if value - distance > 0 else 0
-                end = value + distance if value + distance < len(line) else len(line)
-                output.write(line[start:end] + "\n")
+                start = position - distance if position - distance > 0 else 0
+                end = position + distance if position + distance < len(line) else len(line)
+                output.write(line[start:end].strip("\n") + "\n")
             line_cnt += 1
         output.write("\n\n")
     return line_cnt
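The [start:end] window above is clamped to the line boundaries, and the added .strip("\n") keeps a doubled newline from appearing when the window reaches the end of the read line. A small illustration with hypothetical values:

# Hypothetical values, only to illustrate the window written for each sequence line.
line = "ACGTACGTACGTACGTACGT\n"                                                 # 20 bases + newline
position, distance = 3, 10
start = position - distance if position - distance > 0 else 0                  # -7, clamped to 0
end = position + distance if position + distance < len(line) else len(line)    # 13
line[start:end].strip("\n")                                                     # "ACGTACGTACGTA"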
 def main():
-    assert (len(sys.argv) > 5)
+    if len(sys.argv) < 7:
+        print("Usage: ./exonerate_parser.py INPUT OUTPUT OUTPUT_LENGTH READ_REGION_LENGTH READ_REGION_FOLDER READ_REGION_LIST\n")
+        exit("Missing parameters")
     if len(sys.argv) > 3:
         distance = int(sys.argv[3])
     else:
...
...
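The usage message mirrors the way the Snakefile rules further down call the script. A hypothetical standalone invocation, where every path and the region list are placeholders, 40 plays the role of MISMATCH_AREA_SIZE and 5000 the role of region_size:

./src/util/exonerate_parser.py exonerate_comparison.txt mismatches.fasta \
    40 5000 'out/data/read_map_region' '0_5000,5000_10000'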
@@ -51,11 +74,13 @@ def main():
     lines = [line.replace("\n", "") for line in f][12:-2]
     line_cnt = 1
     problems = []
+    print(lines[0].find(":"))
+    header_size = lines[0].find(":") + 2
     while line_cnt < len(lines):
-        char_cnt = LINE_HEADER_SIZE
-        while char_cnt < len(lines[line_cnt]) - LINE_HEADER_SIZE:
+        char_cnt = header_size
+        while char_cnt < len(lines[line_cnt]) - header_size:
             if lines[line_cnt][char_cnt] != "|":
-                problems.append((line_cnt, char_cnt - LINE_HEADER_SIZE + 1))
+                problems.append((line_cnt, char_cnt - header_size + 1))
                 char_cnt += distance
             else:
                 char_cnt += 1
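header_size replaces the fixed LINE_HEADER_SIZE = 9: the width of the position header is now measured from the first alignment line rather than assumed. A sketch with a hypothetical exonerate-style line; the real layout may differ:

# Hypothetical alignment line, only to illustrate the header_size computation above.
first_line = "  1042 : ACGTACGTACGT"
header_size = first_line.find(":") + 2   # 9, the column where the sequence itself starts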
...
...
@@ -78,9 +103,9 @@ def main():
                 prefix_start = None
             if prefix_start and x > 1:
                 prefix_x = x - 4
-                prefix_1 = lines[prefix_x - 1].split(":")[1][prefix_start:-1]
-                prefix_2 = lines[prefix_x][prefix_start:-1]
-                prefix_3 = lines[prefix_x + 1].split(":")[1][prefix_start:-1]
+                prefix_1 = lines[prefix_x - 1].split(":")[1][prefix_start:-1] + "/"
+                prefix_2 = lines[prefix_x][prefix_start:-1] + "/"
+                prefix_3 = lines[prefix_x + 1].split(":")[1][prefix_start:-1] + "/"
             else:
                 prefix_x = None
                 prefix_1, prefix_2, prefix_3 = ("", "", "")
...
...
@@ -92,19 +117,19 @@ def main():
                 suffix_end = None
             if suffix_end and x < len(lines) - 2:
                 suffix_x = x + 4
-                suffix_1 = lines[suffix_x - 1].split(":")[1][1:suffix_end + 1]
-                suffix_2 = lines[suffix_x][LINE_HEADER_SIZE:suffix_end + LINE_HEADER_SIZE]
-                suffix_3 = lines[suffix_x + 1].split(":")[1][1:suffix_end + 1]
+                suffix_1 = "\\" + lines[suffix_x - 1].split(":")[1][1:suffix_end + 1]
+                suffix_2 = "\\" + lines[suffix_x][header_size:suffix_end + header_size]
+                suffix_3 = "\\" + lines[suffix_x + 1].split(":")[1][1:suffix_end + 1]
             else:
                 suffix_x = None
                 suffix_1, suffix_2, suffix_3 = ("", "", "")
-            output.write("Error " + str(pb_cnt) + " :\n" + str(value1 + start) + " : " + prefix_1 + target[1][start:end] + suffix_1 + "\n")
-            output.write(str(value1 + start) + " : " + prefix_2 + lines[x][start + LINE_HEADER_SIZE:end + LINE_HEADER_SIZE] + suffix_2 + "\n")
-            output.write(str(value2 + start) + " : " + prefix_3 + query[1][start:end] + suffix_3 + "\n\n")
+            output.write("Error {0} : Target position = {1} Query position = {2}\n{3}{4}{5}\n".format(str(pb_cnt), str(value1 + start), str(value2 + start), prefix_1, target[1][start:end], suffix_1))
+            output.write("{0}{1}{2}\n".format(prefix_2, lines[x][start + header_size:end + header_size], suffix_2))
+            output.write("{0}{1}{2}\n\n".format(prefix_3, query[1][start:end], suffix_3))
             pb_cnt += 1
             file = get_region_file(regions, folder, y + value2)
-            output.write(file + "\n")
-            extract_read_pile(file, value2 % size, distance, output, len(str(value1 + start) + " : "), value2)
+            output.write(file + "\n\n")
+            extract_read_pile(file, value2 % size, distance + 2, output, 0)
 
 
 if __name__ == "__main__":
     main()
\ No newline at end of file
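Taken together, each mismatch block in the cleaned output is now self-describing: one header line carrying both coordinates, then the target, alignment and query windows framed by the new "/" and "\\" continuation markers. A sketch of what the three format strings above produce, with made-up values:

# Made-up values, only to show the shape of the new output lines.
pb_cnt, target_pos, query_pos = 3, 1042, 1037
print("Error {0} : Target position = {1} Query position = {2}".format(pb_cnt, target_pos, query_pos))
print("{0}{1}{2}".format("TTACG/", "ACGTAACGT", "\\GGCAT"))
# Error 3 : Target position = 1042 Query position = 1037
# TTACG/ACGTAACGT\GGCAT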
workflow/Snakefile  (+7, -5)
...
...
@@ -38,10 +38,11 @@ config["early_concat"] = True
 HIGH_DEPTH = 60
 HIGH_LENGTH = 5000
-# This constant represent the minimal coverage percentage required for a read to be kept for a specific region
+# This constant represents the minimal coverage percentage required for a read to be kept for a specific region
 READ_COVERAGE_PERCENTAGE = int(50)
-MISMATCH_AREA = 20
+# This constant represents the expected region size extracted before and after the mismatch position.
+MISMATCH_AREA_SIZE = 40
 REGION_HEAD_OVERLAP = int(config["region_overlap"]) if "region_overlap" in config else 0
...
...
@@ -114,7 +115,8 @@ if not 'region' in config:
     anchor = extract_longest_read(os.path.join(config["data_in"], config["read_file"]), 1)[0]
     config['length'] = len(anchor)
     config['region'] = cut_region_from_length(config['length'])
-    # config['region_size'] = config['length']
+    if config['region_size'] == 0:
+        config['region_size'] = config['length'] + 1
 # --------------------------------- #
 # Print warnings for high depth and length #
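With this fallback, leaving region_size at 0 presumably makes the whole anchor read a single region; the +1 keeps the last position inside the lower <= position < upper test used by get_region_file. A hypothetical illustration:

# Hypothetical config: 4999-base anchor read, region_size left at 0.
config = {'length': 4999, 'region_size': 0}
if config['region_size'] == 0:
    config['region_size'] = config['length'] + 1   # 5000, so every position 0..4999 falls inside one region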
...
...
@@ -709,7 +711,7 @@ if "ref_file" in config:
     rule extract_mismatches:
         input: rules.comparison_ref_metaconsensus.output
         output: os.path.join("{output_folder}", "mismatches", "mismatches_d{depth}_t1_{threshold}_t2_{value}.fasta")
-        shell: "./src/util/exonerate_parser.py {input} {output} {MISMATCH_AREA} {config[region_size]} '{wildcards.output_folder}/data/read_map_region' '{config[region]}' "
+        shell: "./src/util/exonerate_parser.py {input} {output} {MISMATCH_AREA_SIZE} {config[region_size]} '{wildcards.output_folder}/data/read_map_region' '{config[region]}' "
 # ------------------------------------------------------ #
...
...
@@ -734,7 +736,7 @@ if "ref_file" in config:
     rule extract_mismatches_concat:
         input: rules.comparison_ref_metaconsensus_concat.output
         output: os.path.join("{output_folder}", "mismatches", "mismatches_concat_d{depth}_t1_{threshold}_t2_{value}.fasta")
-        shell: "./src/util/exonerate_parser.py {input} {output} {MISMATCH_AREA} {config[region_size]} '{wildcards.output_folder}/data/read_map_region' '{config[region]}' "
+        shell: "./src/util/exonerate_parser.py {input} {output} {MISMATCH_AREA_SIZE} {config[region_size]} '{wildcards.output_folder}/data/read_map_region' '{config[region]}' "
 # --------------------------------------------------------#
 # VI.d Comparisons between the two concatenation strategies
...
...