Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
M
MC_MSA
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Package registry
Model registry
Operate
Environments
Terraform modules
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
GitLab community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Lihouck Flavien
MC_MSA
Commits
a07d2d16
Commit
a07d2d16
authored
3 years ago
by
Lihouck Flavien
Browse files
Options
Downloads
Patches
Plain Diff
Extracting read regions seems to work
parent
40a0400e
No related branches found
No related tags found
No related merge requests found
Changes
3
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
src/util/exonerate_parser.py
+62
-8
62 additions, 8 deletions
src/util/exonerate_parser.py
workflow/Snakefile
+35
-32
35 additions, 32 deletions
workflow/Snakefile
workflow/config.yaml
+5
-5
5 additions, 5 deletions
workflow/config.yaml
with
102 additions
and
45 deletions
src/util/exonerate_parser.py
+
62
−
8
View file @
a07d2d16
...
...
@@ -5,27 +5,54 @@ import sys
DEFAULT_DISTANCE
=
10
LINE_HEADER_SIZE
=
9
def
get_region_file
(
regions
,
folder
,
value
):
target_region
=
""
for
elem
in
regions
:
val
=
elem
.
split
(
"
_
"
)
#print(val, value)
if
int
(
val
[
0
])
<=
value
<
int
(
val
[
1
]):
target_region
=
elem
return
folder
+
"
/read_r
"
+
target_region
+
"
.fasta
"
if
target_region
!=
""
else
""
def
extract_read_pile
(
file
,
value
,
distance
,
output
,
padding
):
line_cnt
=
0
with
open
(
file
,
"
r
"
)
as
input
:
for
line
in
input
:
output
.
write
(
"
"
*
padding
)
if
line_cnt
%
2
==
0
:
output
.
write
(
line
)
else
:
start
=
value
-
distance
if
value
-
distance
>
0
else
0
end
=
value
+
distance
if
value
+
distance
<
len
(
line
)
else
len
(
line
)
output
.
write
(
line
[
start
:
end
]
+
"
\n
"
)
line_cnt
+=
1
output
.
write
(
"
\n\n
"
)
def
main
():
assert
(
len
(
sys
.
argv
)
>
5
)
if
len
(
sys
.
argv
)
>
3
:
distance
=
int
(
sys
.
argv
[
3
])
else
:
distance
=
DEFAULT_DISTANCE
size
=
int
(
sys
.
argv
[
4
])
folder
=
sys
.
argv
[
5
]
regions
=
sys
.
argv
[
6
].
strip
(
"
[
"
).
strip
(
"
]
"
).
split
(
"
,
"
)
with
open
(
sys
.
argv
[
1
],
'
r
'
)
as
f
:
lines
=
[
line
.
replace
(
"
\n
"
,
""
)
for
line
in
f
][
12
:
-
2
]
line_cnt
=
1
problems
=
[]
while
line_cnt
<
len
(
lines
):
char_cnt
=
LINE_HEADER_SIZE
while
char_cnt
<
len
(
lines
[
line_cnt
])
-
LINE_HEADER_SIZE
:
if
lines
[
line_cnt
][
char_cnt
]
!=
"
|
"
:
print
((
char_cnt
,
lines
[
line_cnt
-
1
][
char_cnt
],
lines
[
line_cnt
][
char_cnt
],
lines
[
line_cnt
+
1
][
char_cnt
]))
#
print((char_cnt, lines[line_cnt-1][char_cnt], lines[line_cnt][char_cnt], lines[line_cnt+1][char_cnt]))
problems
.
append
((
line_cnt
,
char_cnt
-
LINE_HEADER_SIZE
+
1
))
char_cnt
+=
distance
#print(distance)
else
:
char_cnt
+=
1
line_cnt
+=
4
region
=
[]
with
open
(
sys
.
argv
[
2
],
'
w
'
)
as
output
:
pb_cnt
=
1
for
pb
in
problems
:
...
...
@@ -36,12 +63,39 @@ def main():
query
[
1
]
=
query
[
1
].
strip
(
"
"
)
value1
=
int
(
target
[
0
])
value2
=
int
(
query
[
0
])
start
=
y
-
distance
if
y
-
distance
>
0
else
0
end
=
y
+
distance
if
y
+
distance
<
len
(
target
[
1
])
else
len
(
target
[
1
])
-
1
output
.
write
(
"
Error
"
+
str
(
pb_cnt
)
+
"
:
\n
"
+
str
(
value1
+
start
)
+
"
:
"
+
target
[
1
][
start
:
end
]
+
"
\n
"
)
output
.
write
(
str
(
value1
+
start
)
+
"
:
"
+
lines
[
x
][
start
+
LINE_HEADER_SIZE
-
1
:
end
+
LINE_HEADER_SIZE
-
1
]
+
"
\n
"
)
output
.
write
(
str
(
value2
+
start
)
+
"
:
"
+
query
[
1
][
start
:
end
]
+
"
\n\n
"
)
if
y
-
distance
<
0
:
start
=
0
prefix_start
=
y
-
distance
else
:
start
=
y
-
distance
prefix_start
=
None
if
prefix_start
and
x
>
1
:
prefix_x
=
x
-
4
prefix_1
=
lines
[
prefix_x
-
1
].
split
(
"
:
"
)[
1
][
prefix_start
:
-
1
]
prefix_2
=
lines
[
prefix_x
][
prefix_start
:
-
1
]
prefix_3
=
lines
[
prefix_x
+
1
].
split
(
"
:
"
)[
1
][
prefix_start
:
-
1
]
else
:
prefix_1
,
prefix_2
,
prefix_3
=
(
""
,
""
,
""
)
if
y
+
distance
>
len
(
target
[
1
]):
end
=
len
(
target
[
1
])
-
1
suffix_end
=
y
+
distance
-
len
(
target
)
else
:
end
=
y
+
distance
suffix_end
=
None
if
suffix_end
and
x
<
len
(
lines
)
-
2
:
suffix_x
=
x
+
4
suffix_1
=
lines
[
suffix_x
-
1
].
split
(
"
:
"
)[
1
][
1
:
suffix_end
]
suffix_2
=
lines
[
suffix_x
][
LINE_HEADER_SIZE
-
1
:
suffix_end
-
1
]
suffix_3
=
lines
[
suffix_x
+
1
].
split
(
"
:
"
)[
1
][
1
:
suffix_end
]
else
:
suffix_1
,
suffix_2
,
suffix_3
=
(
""
,
""
,
""
)
output
.
write
(
"
Error
"
+
str
(
pb_cnt
)
+
"
:
\n
"
+
str
(
value1
+
start
)
+
"
:
"
+
prefix_1
+
target
[
1
][
start
:
end
]
+
suffix_1
+
"
\n
"
)
output
.
write
(
str
(
value1
+
start
)
+
"
:
"
+
prefix_2
+
lines
[
x
][
start
+
LINE_HEADER_SIZE
-
1
:
end
+
LINE_HEADER_SIZE
-
1
]
+
suffix_2
+
"
\n
"
)
output
.
write
(
str
(
value2
+
start
)
+
"
:
"
+
prefix_3
+
query
[
1
][
start
:
end
]
+
suffix_3
+
"
\n\n
"
)
pb_cnt
+=
1
file
=
get_region_file
(
regions
,
folder
,
y
)
print
(
file
)
extract_read_pile
(
file
,
value2
%
size
,
distance
,
output
,
len
(
str
(
value1
+
start
)
+
"
:
"
))
if
__name__
==
"
__main__
"
:
main
()
\ No newline at end of file
This diff is collapsed.
Click to expand it.
workflow/Snakefile
+
35
−
32
View file @
a07d2d16
...
...
@@ -34,12 +34,14 @@ if not "threshold_2" in config:
if not "data_in" in config:
config["data_in"] = ""
config["early_concat"] = True
#
HIGH_DEPTH = 60
HIGH_LENGTH = 5000
DEPTH_MIN = int(50)
MISMATCH_AREA = 20
REGION_HEAD_OVERLAP = int(config["region_overlap"]) if "region_overlap" in config else 0
...
...
@@ -85,6 +87,7 @@ if not 'region' in config:
anchor = extract_longest_read(os.path.join(config["data_in"], config["read_file"]), 1)[0]
config['length'] = len(anchor)
config['region']= cut_region_from_length(config['length'])
config['region_size'] = config['length']
# --------------------------------- #
# Print warnings for high depth and length #
...
...
@@ -467,7 +470,7 @@ rule concat_region:
# ------------------------------------------------------- #
# Concatenating region before consensus phase #
if "early_concat" in config:
rule create_consensus_per_tool_concat:
input:
expand(rules.create_consensus_per_tool.output, region=config["region"], allow_missing=True)
...
...
@@ -660,7 +663,7 @@ if "ref_file" in config:
rule extract_mismatches:
input: rules.comparison_ref_metaconsensus.output
output: os.path.join("{output_folder}", "mismatches", "mismatches_d{depth}_t1_{threshold}_t2_{value}.fasta")
shell: "./src/util/exonerate_parser.py {input} {output}
>/dev/null
"
shell: "./src/util/exonerate_parser.py {input} {output}
{MISMATCH_AREA} {config[region_size]} '{wildcards.output_folder}/data/read_map_region' '{config[region]}'
"
# ------------------------------------------------------
# Regions concatenated after the consensus phase #
...
...
@@ -680,7 +683,7 @@ if "ref_file" in config:
rule extract_mismatches_concat:
input: rules.comparison_ref_metaconsensus_concat.output
output: os.path.join("{output_folder}", "mismatches", "mismatches_concat_d{depth}_t1_{threshold}_t2_{value}.fasta")
shell: "./src/util/exonerate_parser.py {input} {output}
>/dev/null
"
shell: "./src/util/exonerate_parser.py {input} {output}
{MISMATCH_AREA} {config[region_size]} '{wildcards.output_folder}/data/read_map_region' '{config[region]}'
"
# --------------------------------------------------------#
# Plots comparing the two strategies of concatenation #
...
...
This diff is collapsed.
Click to expand it.
workflow/config.yaml
+
5
−
5
View file @
a07d2d16
read_file
:
../data_yeast_diploid/diploid_reads_shuffle.fastq
output_folder
:
../TESTS/Yeast_Diplo
ref_file
:
../data_yeast_diploid/ref_diploid.fast
a
region_size
:
200
0
read_file
:
test_data/stup_virus_seq.fa
output_folder
:
stup_virus
ref_file
:
test_data/Stup_virus.f
a
region_size
:
0
tool
:
[
'
abpoa'
,
'
spoa'
,
'
kalign2'
,
'
kalign3'
,
'
mafft'
,
'
muscle'
]
threshold_1
:
[
50
]
threshold_2
:
[
50
]
depth
:
[
30
,
40
]
depth
:
0
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment