Lihouck Flavien / MC_MSA / Commits / e1386df6

Commit e1386df6 authored Jul 21, 2022 by Lihouck Flavien

Cleaned read cuts output

Parent: 73b56147
Branch: main
No related tags found
No related merge requests found

Showing 2 changed files with 57 additions and 30 deletions:
  src/util/exonerate_parser.py   (+50, -25)
  workflow/Snakefile             (+7, -5)
src/util/exonerate_parser.py  (+50, -25)
 #!/usr/bin/env python3
 #
-# This script aims to extract the mismatches found by exonerate and
-# all the problems with their "origin" region
+# This script aims to extract the mismatches found by exonerate from the matching reads region, creating a file of
+# all the problems with their "origin" region
 #
 import sys
 
 DEFAULT_DISTANCE = 10
 LINE_HEADER_SIZE = 9
 
-def get_region_file(regions, folder, value):
+def get_region_file(regions, folder, value, format_func="{0}/read_r{1}.fasta".format):
+    """
+    Find the read file
+    :param regions: (list/string) the list of every regions format to look for in
+    :param folder: the read regions' folder
+    :param value: the position to extract from target file
+    :param format_func: the read regions' file naming format
+    :return:
+    """
     target_region = ""
     if not isinstance(regions, list):
         regions = regions.split(",")
...
...
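The new format_func parameter replaces the hard-coded "<folder>/read_r<region>.fasta" concatenation in the return statement shown in the next hunk, so a caller can supply its own naming scheme. A minimal sketch, not part of the commit; folder, region list and positions are hypothetical:

# Sketch only; values below are made up.
regions = "0_5000,5000_10000"
folder = "out/data/read_map_region"
get_region_file(regions, folder, 4200)
# -> "out/data/read_map_region/read_r0_5000.fasta" (default "{0}/read_r{1}.fasta".format)
get_region_file(regions, folder, 4200, format_func="{0}/pile_{1}.fa".format)
# -> "out/data/read_map_region/pile_0_5000.fa"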
@@ -20,25 +29,39 @@ def get_region_file(regions, folder, value):
         val = elem.split("_")
         if int(val[0]) <= value < int(val[1]):
             target_region = elem
-    return folder + "/read_r" + target_region + ".fasta" if target_region != "" else ""
+    return format_func(folder, target_region) if target_region != "" else ""
 
 
-def extract_read_pile(file, value, distance, output, padding, safe):
+def extract_read_pile(file, position, distance, output, padding):
+    """
+    Extracts every reads matching the region from file and writes them into the output file
+    :param file: the reads file
+    :param position: the mismatch position
+    :param distance: the size of elements to pick before and after the position
+    :param output: the output file to write in
+    :param padding: the padding length to add at the beginning of each line
+    :return: the amount of reads written
+    """
     line_cnt = 0
-    print(value, safe)
     with open(file, "r") as input:
         for line in input:
             output.write(" " * padding)
             if line_cnt % 2 == 0:
                 output.write(line)
             else:
-                start = value - distance if value - distance > 0 else 0
-                end = value + distance if value + distance < len(line) else len(line)
-                output.write(line[start:end] + "\n")
+                start = position - distance if position - distance > 0 else 0
+                end = position + distance if position + distance < len(line) else len(line)
+                output.write(line[start:end].strip("\n") + "\n")
             line_cnt += 1
         output.write("\n\n")
     return line_cnt
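The [start:end] window above is clamped to the line boundaries, and the added .strip("\n") keeps a doubled newline from appearing when the window reaches the end of the read line. A small illustration with hypothetical values:

# Hypothetical values, only to illustrate the window written for each sequence line.
line = "ACGTACGTACGTACGTACGT\n"                                                 # 20 bases + newline
position, distance = 3, 10
start = position - distance if position - distance > 0 else 0                  # -7, clamped to 0
end = position + distance if position + distance < len(line) else len(line)    # 13
line[start:end].strip("\n")                                                     # "ACGTACGTACGTA"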
 def main():
-    assert (len(sys.argv) > 5)
+    if len(sys.argv) < 7:
+        print("Usage: ./exonerate_parser.py INPUT OUTPUT OUTPUT_LENGTH READ_REGION_LENGTH READ_REGION_FOLDER READ_REGION_LIST\n")
+        exit("Missing parameters")
     if len(sys.argv) > 3:
         distance = int(sys.argv[3])
     else:
...
...
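The usage message mirrors the way the Snakefile rules further down call the script. A hypothetical standalone invocation, where every path and the region list are placeholders, 40 plays the role of MISMATCH_AREA_SIZE and 5000 the role of region_size:

./src/util/exonerate_parser.py exonerate_comparison.txt mismatches.fasta \
    40 5000 'out/data/read_map_region' '0_5000,5000_10000'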
@@ -51,11 +74,13 @@ def main():
     lines = [line.replace("\n", "") for line in f][12:-2]
     line_cnt = 1
     problems = []
+    print(lines[0].find(":"))
+    header_size = lines[0].find(":") + 2
     while line_cnt < len(lines):
-        char_cnt = LINE_HEADER_SIZE
-        while char_cnt < len(lines[line_cnt]) - LINE_HEADER_SIZE:
+        char_cnt = header_size
+        while char_cnt < len(lines[line_cnt]) - header_size:
             if lines[line_cnt][char_cnt] != "|":
-                problems.append((line_cnt, char_cnt - LINE_HEADER_SIZE + 1))
+                problems.append((line_cnt, char_cnt - header_size + 1))
                 char_cnt += distance
             else:
                 char_cnt += 1
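header_size replaces the fixed LINE_HEADER_SIZE = 9: the width of the position header is now measured from the first alignment line rather than assumed. A sketch with a hypothetical exonerate-style line; the real layout may differ:

# Hypothetical alignment line, only to illustrate the header_size computation above.
first_line = "  1042 : ACGTACGTACGT"
header_size = first_line.find(":") + 2   # 9, the column where the sequence itself starts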
...
...
@@ -78,9 +103,9 @@ def main():
                 prefix_start = None
             if prefix_start and x > 1:
                 prefix_x = x - 4
-                prefix_1 = lines[prefix_x - 1].split(":")[1][prefix_start:-1]
-                prefix_2 = lines[prefix_x][prefix_start:-1]
-                prefix_3 = lines[prefix_x + 1].split(":")[1][prefix_start:-1]
+                prefix_1 = lines[prefix_x - 1].split(":")[1][prefix_start:-1] + "/"
+                prefix_2 = lines[prefix_x][prefix_start:-1] + "/"
+                prefix_3 = lines[prefix_x + 1].split(":")[1][prefix_start:-1] + "/"
             else:
                 prefix_x = None
                 prefix_1, prefix_2, prefix_3 = ("", "", "")
...
...
@@ -92,19 +117,19 @@ def main():
                 suffix_end = None
             if suffix_end and x < len(lines) - 2:
                 suffix_x = x + 4
-                suffix_1 = lines[suffix_x - 1].split(":")[1][1:suffix_end + 1]
-                suffix_2 = lines[suffix_x][LINE_HEADER_SIZE:suffix_end + LINE_HEADER_SIZE]
-                suffix_3 = lines[suffix_x + 1].split(":")[1][1:suffix_end + 1]
+                suffix_1 = "\\" + lines[suffix_x - 1].split(":")[1][1:suffix_end + 1]
+                suffix_2 = "\\" + lines[suffix_x][header_size:suffix_end + header_size]
+                suffix_3 = "\\" + lines[suffix_x + 1].split(":")[1][1:suffix_end + 1]
             else:
                 suffix_x = None
                 suffix_1, suffix_2, suffix_3 = ("", "", "")
-            output.write("Error " + str(pb_cnt) + " :\n" + str(value1 + start) + " : " + prefix_1 + target[1][start:end] + suffix_1 + "\n")
-            output.write(str(value1 + start) + " : " + prefix_2 + lines[x][start + LINE_HEADER_SIZE:end + LINE_HEADER_SIZE] + suffix_2 + "\n")
-            output.write(str(value2 + start) + " : " + prefix_3 + query[1][start:end] + suffix_3 + "\n\n")
+            output.write("Error {0} : Target position = {1} Query position = {2}\n{3}{4}{5}\n".format(str(pb_cnt), str(value1 + start), str(value2 + start), prefix_1, target[1][start:end], suffix_1))
+            output.write("{0}{1}{2}\n".format(prefix_2, lines[x][start + header_size:end + header_size], suffix_2))
+            output.write("{0}{1}{2}\n\n".format(prefix_3, query[1][start:end], suffix_3))
             pb_cnt += 1
             file = get_region_file(regions, folder, y + value2)
-            output.write(file + "\n")
-            extract_read_pile(file, value2 % size, distance, output, len(str(value1 + start) + " : "), value2)
+            output.write(file + "\n\n")
+            extract_read_pile(file, value2 % size, distance + 2, output, 0)
 
 
 if __name__ == "__main__":
     main()
\ No newline at end of file
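Taken together, each mismatch block in the cleaned output is now self-describing: one header line carrying both coordinates, then the target, alignment and query windows framed by the new "/" and "\\" continuation markers. A sketch of what the three format strings above produce, with made-up values:

# Made-up values, only to show the shape of the new output lines.
pb_cnt, target_pos, query_pos = 3, 1042, 1037
print("Error {0} : Target position = {1} Query position = {2}".format(pb_cnt, target_pos, query_pos))
print("{0}{1}{2}".format("TTACG/", "ACGTAACGT", "\\GGCAT"))
# Error 3 : Target position = 1042 Query position = 1037
# TTACG/ACGTAACGT\GGCAT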
workflow/Snakefile  (+7, -5)
...
...
@@ -38,10 +38,11 @@ config["early_concat"] = True
 HIGH_DEPTH = 60
 HIGH_LENGTH = 5000
-# This constant represent the minimal coverage percentage required for a read to be kept for a specific region
+# This constant represents the minimal coverage percentage required for a read to be kept for a specific region
 READ_COVERAGE_PERCENTAGE = int(50)
-MISMATCH_AREA = 20
+# This constant represents the expected region size extracted before and after the mismatch position.
+MISMATCH_AREA_SIZE = 40
 REGION_HEAD_OVERLAP = int(config["region_overlap"]) if "region_overlap" in config else 0
...
...
@@ -114,7 +115,8 @@ if not 'region' in config:
     anchor = extract_longest_read(os.path.join(config["data_in"], config["read_file"]), 1)[0]
     config['length'] = len(anchor)
     config['region'] = cut_region_from_length(config['length'])
-    # config['region_size'] = config['length']
+    if config['region_size'] == 0:
+        config['region_size'] = config['length'] + 1
 # --------------------------------- #
 # Print warnings for high depth and length #
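With this fallback, leaving region_size at 0 presumably makes the whole anchor read a single region; the +1 keeps the last position inside the lower <= position < upper test used by get_region_file. A hypothetical illustration:

# Hypothetical config: 4999-base anchor read, region_size left at 0.
config = {'length': 4999, 'region_size': 0}
if config['region_size'] == 0:
    config['region_size'] = config['length'] + 1   # 5000, so every position 0..4999 falls inside one region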
...
...
@@ -709,7 +711,7 @@ if "ref_file" in config:
     rule extract_mismatches:
         input: rules.comparison_ref_metaconsensus.output
         output: os.path.join("{output_folder}", "mismatches", "mismatches_d{depth}_t1_{threshold}_t2_{value}.fasta")
-        shell: "./src/util/exonerate_parser.py {input} {output} {MISMATCH_AREA} {config[region_size]} '{wildcards.output_folder}/data/read_map_region' '{config[region]}' "
+        shell: "./src/util/exonerate_parser.py {input} {output} {MISMATCH_AREA_SIZE} {config[region_size]} '{wildcards.output_folder}/data/read_map_region' '{config[region]}' "
 # ------------------------------------------------------ #
...
...
@@ -734,7 +736,7 @@ if "ref_file" in config:
     rule extract_mismatches_concat:
         input: rules.comparison_ref_metaconsensus_concat.output
         output: os.path.join("{output_folder}", "mismatches", "mismatches_concat_d{depth}_t1_{threshold}_t2_{value}.fasta")
-        shell: "./src/util/exonerate_parser.py {input} {output} {MISMATCH_AREA} {config[region_size]} '{wildcards.output_folder}/data/read_map_region' '{config[region]}' "
+        shell: "./src/util/exonerate_parser.py {input} {output} {MISMATCH_AREA_SIZE} {config[region_size]} '{wildcards.output_folder}/data/read_map_region' '{config[region]}' "
 # --------------------------------------------------------#
 # VI.d Comparisons between the two concatenation strategies
...
...