Skip to content
Snippets Groups Projects
Commit a07d2d16 authored by Lihouck Flavien's avatar Lihouck Flavien
Browse files

Extracting read regions seems to work

parent 40a0400e
No related branches found
No related tags found
No related merge requests found
......@@ -5,27 +5,54 @@ import sys
DEFAULT_DISTANCE = 10
LINE_HEADER_SIZE = 9
def get_region_file(regions, folder, value):
target_region = ""
for elem in regions:
val = elem.split("_")
#print(val, value)
if int(val[0]) <= value < int(val[1]):
target_region = elem
return folder+"/read_r"+target_region+".fasta" if target_region != "" else ""
def extract_read_pile(file, value, distance, output, padding):
line_cnt = 0
with open(file, "r") as input:
for line in input:
output.write(" "*padding)
if line_cnt % 2 == 0:
output.write(line)
else:
start = value - distance if value - distance > 0 else 0
end = value + distance if value + distance < len(line) else len(line)
output.write(line[start:end]+"\n")
line_cnt += 1
output.write("\n\n")
def main():
assert(len(sys.argv) > 5)
if len(sys.argv) > 3:
distance = int(sys.argv[3])
else:
distance = DEFAULT_DISTANCE
size = int(sys.argv[4])
folder = sys.argv[5]
regions = sys.argv[6].strip("[").strip("]").split(",")
with open(sys.argv[1], 'r') as f:
lines = [line.replace("\n","") for line in f][12:-2]
line_cnt = 1
problems = []
while line_cnt < len(lines):
char_cnt = LINE_HEADER_SIZE
while char_cnt < len(lines[line_cnt])-LINE_HEADER_SIZE:
if lines[line_cnt][char_cnt] != "|":
print((char_cnt, lines[line_cnt-1][char_cnt], lines[line_cnt][char_cnt], lines[line_cnt+1][char_cnt]))
#print((char_cnt, lines[line_cnt-1][char_cnt], lines[line_cnt][char_cnt], lines[line_cnt+1][char_cnt]))
problems.append((line_cnt, char_cnt-LINE_HEADER_SIZE+1))
char_cnt += distance
#print(distance)
else:
char_cnt += 1
line_cnt += 4
region = []
with open(sys.argv[2], 'w') as output:
pb_cnt = 1
for pb in problems:
......@@ -36,12 +63,39 @@ def main():
query[1] = query[1].strip(" ")
value1 = int(target[0])
value2 = int(query[0])
start = y - distance if y - distance > 0 else 0
end = y + distance if y + distance < len(target[1]) else len(target[1])-1
output.write("Error "+str(pb_cnt) + ":\n" + str(value1 + start)+ ": " + target[1][start:end] + "\n")
output.write(str(value1+start)+": "+lines[x][start+LINE_HEADER_SIZE-1:end+LINE_HEADER_SIZE-1]+"\n")
output.write(str(value2 + start) + ": " + query[1][start:end] + "\n\n")
if y - distance < 0:
start = 0
prefix_start = y - distance
else:
start = y - distance
prefix_start = None
if prefix_start and x > 1:
prefix_x = x-4
prefix_1 = lines[prefix_x-1].split(":")[1][prefix_start:-1]
prefix_2 = lines[prefix_x][prefix_start:-1]
prefix_3 = lines[prefix_x+1].split(":")[1][prefix_start:-1]
else:
prefix_1, prefix_2, prefix_3 = ("", "", "")
if y + distance > len(target[1]):
end = len(target[1])-1
suffix_end = y+distance - len(target)
else:
end = y + distance
suffix_end = None
if suffix_end and x < len(lines) -2 :
suffix_x = x+4
suffix_1 = lines[suffix_x-1].split(":")[1][1:suffix_end]
suffix_2 = lines[suffix_x][LINE_HEADER_SIZE-1:suffix_end-1]
suffix_3 = lines[suffix_x+1].split(":")[1][1:suffix_end]
else:
suffix_1, suffix_2, suffix_3 = ("","", "")
output.write("Error "+str(pb_cnt) + ":\n" + str(value1 + start)+ ": " + prefix_1 + target[1][start:end] + suffix_1 + "\n")
output.write(str(value1+start)+": "+prefix_2 +lines[x][start+LINE_HEADER_SIZE-1:end+LINE_HEADER_SIZE-1]+ suffix_2 +"\n")
output.write(str(value2 + start) + ": " + prefix_3 + query[1][start:end] + suffix_3 + "\n\n")
pb_cnt += 1
file = get_region_file(regions, folder, y)
print(file)
extract_read_pile(file, value2 % size, distance, output, len(str(value1+start)+": "))
if __name__ == "__main__":
main()
\ No newline at end of file
......@@ -34,12 +34,14 @@ if not "threshold_2" in config:
if not "data_in" in config:
config["data_in"] = ""
config["early_concat"] = True
#
HIGH_DEPTH = 60
HIGH_LENGTH = 5000
DEPTH_MIN = int(50)
MISMATCH_AREA = 20
REGION_HEAD_OVERLAP = int(config["region_overlap"]) if "region_overlap" in config else 0
......@@ -85,6 +87,7 @@ if not 'region' in config:
anchor = extract_longest_read(os.path.join(config["data_in"], config["read_file"]), 1)[0]
config['length'] = len(anchor)
config['region']= cut_region_from_length(config['length'])
config['region_size'] = config['length']
# --------------------------------- #
# Print warnings for high depth and length #
......@@ -467,7 +470,7 @@ rule concat_region:
# ------------------------------------------------------- #
# Concatenating region before consensus phase #
if "early_concat" in config:
rule create_consensus_per_tool_concat:
input:
expand(rules.create_consensus_per_tool.output, region=config["region"], allow_missing=True)
......@@ -660,7 +663,7 @@ if "ref_file" in config:
rule extract_mismatches:
input: rules.comparison_ref_metaconsensus.output
output: os.path.join("{output_folder}", "mismatches", "mismatches_d{depth}_t1_{threshold}_t2_{value}.fasta")
shell: "./src/util/exonerate_parser.py {input} {output} >/dev/null"
shell: "./src/util/exonerate_parser.py {input} {output} {MISMATCH_AREA} {config[region_size]} '{wildcards.output_folder}/data/read_map_region' '{config[region]}' "
# ------------------------------------------------------
# Regions concatenated after the consensus phase #
......@@ -680,7 +683,7 @@ if "ref_file" in config:
rule extract_mismatches_concat:
input: rules.comparison_ref_metaconsensus_concat.output
output: os.path.join("{output_folder}", "mismatches", "mismatches_concat_d{depth}_t1_{threshold}_t2_{value}.fasta")
shell: "./src/util/exonerate_parser.py {input} {output} >/dev/null"
shell: "./src/util/exonerate_parser.py {input} {output} {MISMATCH_AREA} {config[region_size]} '{wildcards.output_folder}/data/read_map_region' '{config[region]}' "
# --------------------------------------------------------#
# Plots comparing the two strategies of concatenation #
......
read_file: ../data_yeast_diploid/diploid_reads_shuffle.fastq
output_folder: ../TESTS/Yeast_Diplo
ref_file: ../data_yeast_diploid/ref_diploid.fasta
region_size: 2000
read_file: test_data/stup_virus_seq.fa
output_folder: stup_virus
ref_file: test_data/Stup_virus.fa
region_size: 0
tool: ['abpoa', 'spoa', 'kalign2', 'kalign3', 'mafft', 'muscle']
threshold_1: [50]
threshold_2: [50]
depth: [30,40]
depth: 0
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment