Skip to content
Snippets Groups Projects
Commit e1386df6 authored by Lihouck Flavien's avatar Lihouck Flavien
Browse files

Cleaned read cuts output

parent 73b56147
Branches main
No related tags found
No related merge requests found
#!/usr/bin/env python3
##
# This script aims to extract the mismatches found by exonerate and
# # This script aims to extract the mismatches found by exonerate from the matching reads region, creating a file of
# all the problems with their "origin" region #
import sys
DEFAULT_DISTANCE = 10
LINE_HEADER_SIZE = 9
def get_region_file(regions, folder, value):
def get_region_file(regions, folder, value, format_func="{0}/read_r{1}.fasta".format):
"""
Find the read file
:param regions: (list/string) the list of every regions format to look for in
:param folder: the read regions' folder
:param value: the position to extract from target file
:param format_func: the read regions' file naming format
:return:
"""
target_region = ""
if not isinstance(regions, list):
regions = regions.split(",")
......@@ -20,25 +29,39 @@ def get_region_file(regions, folder, value):
val = elem.split("_")
if int(val[0]) <= value < int(val[1]):
target_region = elem
return folder+"/read_r"+target_region+".fasta" if target_region != "" else ""
def extract_read_pile(file, value, distance, output, padding, safe):
return format_func(folder, target_region) if target_region != "" else ""
def extract_read_pile(file, position, distance, output, padding):
"""
Extracts every reads matching the region from file and writes them into the output file
:param file: the reads file
:param position: the mismatch position
:param distance: the size of elements to pick before and after the position
:param output: the output file to write in
:param padding: the padding length to add at the beginning of each line
:return: the amount of reads written
"""
line_cnt = 0
print(value, safe)
with open(file, "r") as input:
for line in input:
output.write(" "*padding)
if line_cnt % 2 == 0:
output.write(line)
else:
start = value - distance if value - distance > 0 else 0
end = value + distance if value + distance < len(line) else len(line)
output.write(line[start:end]+"\n")
start = position - distance if position - distance > 0 else 0
end = position + distance if position + distance < len(line) else len(line)
output.write(line[start:end].strip("\n")+"\n")
line_cnt += 1
output.write("\n\n")
return line_cnt
def main():
assert(len(sys.argv) > 5)
if len(sys.argv) < 7:
print("Usage: ./exonerate_parser.py INPUT OUTPUT OUTPUT_LENGTH READ_REGION_LENGTH READ_REGION_FOLDER READ_REGION_LIST\n")
exit("Missing parameters")
if len(sys.argv) > 3:
distance = int(sys.argv[3])
else:
......@@ -51,11 +74,13 @@ def main():
lines = [line.replace("\n","") for line in f][12:-2]
line_cnt = 1
problems = []
print(lines[0].find(":"))
header_size = lines[0].find(": ")+2
while line_cnt < len(lines):
char_cnt = LINE_HEADER_SIZE
while char_cnt < len(lines[line_cnt])-LINE_HEADER_SIZE:
char_cnt = header_size
while char_cnt < len(lines[line_cnt])-header_size:
if lines[line_cnt][char_cnt] != "|":
problems.append((line_cnt, char_cnt-LINE_HEADER_SIZE+1))
problems.append((line_cnt, char_cnt-header_size+1))
char_cnt += distance
else:
char_cnt += 1
......@@ -78,9 +103,9 @@ def main():
prefix_start = None
if prefix_start and x > 1:
prefix_x = x-4
prefix_1 = lines[prefix_x-1].split(":")[1][prefix_start:-1]
prefix_2 = lines[prefix_x][prefix_start:-1]
prefix_3 = lines[prefix_x+1].split(":")[1][prefix_start:-1]
prefix_1 = lines[prefix_x-1].split(":")[1][prefix_start:-1]+"/"
prefix_2 = lines[prefix_x][prefix_start:-1]+"/"
prefix_3 = lines[prefix_x+1].split(":")[1][prefix_start:-1]+"/"
else:
prefix_x = None
prefix_1, prefix_2, prefix_3 = ("", "", "")
......@@ -92,19 +117,19 @@ def main():
suffix_end = None
if suffix_end and x < len(lines) - 2:
suffix_x = x+4
suffix_1 = lines[suffix_x-1].split(":")[1][1:suffix_end+1]
suffix_2 = lines[suffix_x][LINE_HEADER_SIZE:suffix_end+LINE_HEADER_SIZE]
suffix_3 = lines[suffix_x+1].split(":")[1][1:suffix_end+1]
suffix_1 = "\\"+lines[suffix_x-1].split(":")[1][1:suffix_end+1]
suffix_2 = "\\"+lines[suffix_x][header_size:suffix_end+header_size]
suffix_3 = "\\"+lines[suffix_x+1].split(":")[1][1:suffix_end+1]
else:
suffix_x = None
suffix_1, suffix_2, suffix_3 = ("","", "")
output.write("Error "+str(pb_cnt) + ":\n" + str(value1 + start)+ ": " + prefix_1 + target[1][start:end] + suffix_1 + "\n")
output.write(str(value1+start)+": "+prefix_2 +lines[x][start+LINE_HEADER_SIZE:end+LINE_HEADER_SIZE]+ suffix_2 +"\n")
output.write(str(value2 + start) + ": " + prefix_3 + query[1][start:end] + suffix_3 + "\n\n")
output.write("Error {0} : Target position = {1} Query position = {2}\n{3}{4}{5}\n".format(str(pb_cnt), str(value1+start), str(value2 + start), prefix_1, target[1][start:end], suffix_1))
output.write("{0}{1}{2}\n".format(prefix_2, lines[x][start+header_size:end+header_size], suffix_2))
output.write( "{0}{1}{2}\n\n".format(prefix_3, query[1][start:end], suffix_3))
pb_cnt += 1
file = get_region_file(regions, folder, y+value2)
output.write(file+ "\n")
extract_read_pile(file, value2 % size, distance, output, len(str(value1+start)+": "), value2)
output.write(file+ "\n\n")
extract_read_pile(file, value2 % size, distance+2, output, 0)
if __name__ == "__main__":
main()
\ No newline at end of file
......@@ -38,10 +38,11 @@ config["early_concat"] = True
HIGH_DEPTH = 60
HIGH_LENGTH = 5000
# This constant represent the minimal coverage percentage required for a read to be kept for a specific region
# This constant represents the minimal coverage percentage required for a read to be kept for a specific region
READ_COVERAGE_PERCENTAGE = int(50)
MISMATCH_AREA = 20
# This constant represents the expected region size extracted before and after the mismatch position.
MISMATCH_AREA_SIZE = 40
REGION_HEAD_OVERLAP = int(config["region_overlap"]) if "region_overlap" in config else 0
......@@ -114,7 +115,8 @@ if not 'region' in config:
anchor = extract_longest_read(os.path.join(config["data_in"], config["read_file"]), 1)[0]
config['length'] = len(anchor)
config['region']= cut_region_from_length(config['length'])
# config['region_size'] = config['length']
if config['region_size'] == 0 :
config['region_size'] = config['length']+1
# --------------------------------- #
# Print warnings for high depth and length #
......@@ -709,7 +711,7 @@ if "ref_file" in config:
rule extract_mismatches:
input: rules.comparison_ref_metaconsensus.output
output: os.path.join("{output_folder}", "mismatches", "mismatches_d{depth}_t1_{threshold}_t2_{value}.fasta")
shell: "./src/util/exonerate_parser.py {input} {output} {MISMATCH_AREA} {config[region_size]} '{wildcards.output_folder}/data/read_map_region' '{config[region]}' "
shell: "./src/util/exonerate_parser.py {input} {output} {MISMATCH_AREA_SIZE} {config[region_size]} '{wildcards.output_folder}/data/read_map_region' '{config[region]}' "
# ------------------------------------------------------ #
......@@ -734,7 +736,7 @@ if "ref_file" in config:
rule extract_mismatches_concat:
input: rules.comparison_ref_metaconsensus_concat.output
output: os.path.join("{output_folder}", "mismatches", "mismatches_concat_d{depth}_t1_{threshold}_t2_{value}.fasta")
shell: "./src/util/exonerate_parser.py {input} {output} {MISMATCH_AREA} {config[region_size]} '{wildcards.output_folder}/data/read_map_region' '{config[region]}' "
shell: "./src/util/exonerate_parser.py {input} {output} {MISMATCH_AREA_SIZE} {config[region_size]} '{wildcards.output_folder}/data/read_map_region' '{config[region]}' "
# --------------------------------------------------------#
# VI.d Comparisons between the two concatenation strategies
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment