Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
M
msa-limit
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Model registry
Operate
Environments
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
GitLab community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Rohmer Coralie
msa-limit
Commits
08241ced
Commit
08241ced
authored
3 years ago
by
Rohmer Coralie
Browse files
Options
Downloads
Patches
Plain Diff
add consensus of consensus
parent
74ba567c
No related branches found
No related tags found
No related merge requests found
Changes
4
Show whitespace changes
Inline
Side-by-side
Showing
4 changed files
Snakefile
+72
-1
72 additions, 1 deletion
Snakefile
src/separate_consensus.sh
+1
-0
1 addition, 0 deletions
src/separate_consensus.sh
src/sequence_consensus.py
+13
-4
13 additions, 4 deletions
src/sequence_consensus.py
src/tool_consensus.py
+40
-0
40 additions, 0 deletions
src/tool_consensus.py
with
126 additions
and
5 deletions
Snakefile
+
72
−
1
View file @
08241ced
...
...
@@ -113,7 +113,8 @@ rule all :
input :
expand(EXP + "/" + EXP_NAME + "/results/" + EXP_NAME + "_data_align_t{threshold}.csv", data_set=DATA_SETS, threshold=THRESHOLDS),
expand('{data_set}/results/' + EXP_NAME + '_graph_{attribute}.pdf', data_set=DATA_SETS, attribute=ATTRIBUTES_DATA),
expand(EXP + '/'+EXP_NAME + '/results/' + EXP_NAME + '_graph_{attribute}.pdf', data_set=DATA_SETS, attribute=ATTRIBUTES_DATA)
expand(EXP + '/'+EXP_NAME + '/results/' + EXP_NAME + '_graph_{attribute}.pdf', data_set=DATA_SETS, attribute=ATTRIBUTES_DATA),
expand("{data_set}/seq_consensus/t{threshold}/r{region_size}/align_consensus_consensus_ref_d{depth}.txt", data_set=DATA_SETS, threshold=THRESHOLDS,region_size=REGION_SIZES, depth=DEPTHS)
#-------------------------------------------------------------------------------
# Data set preparation
...
...
@@ -493,3 +494,73 @@ rule data_time :
'ORDER="./src/data_time.pl {input}";'
'echo "ORDER: $ORDER" >{log};'
'$ORDER >{output} 2>>{log}'
# -------------------------------------------------------------------------------
# Consensus of consensus
# -------------------------------------------------------------------------------
rule separate_consensus :
input :
"{data_set}/seq_consensus/t{threshold}/r{region_size}/seq_consensus.fasta"
output :
"{data_set}/seq_consensus/t{threshold}/r{region_size}/seq_consensus_d{depth}.fasta"
message:
"Separate consensus for {wildcards.data_set} (Threshold={wildcards.threshold}, Region size={wildcards.region_size} & Depth={wildcards.depth})"
log:
"{data_set}/logs/14_separate_consensus_t{threshold}_r{region_size}_d{depth}.log"
shell:
'ORDER="./src/separate_consensus.sh {input} {wildcards.depth} {output}";'
'echo "ORDER: $ORDER" >{log};'
'$ORDER >{output} 2>>{log}'
rule consensus_msa :
input :
"{data_set}/seq_consensus/t{threshold}/r{region_size}/seq_consensus_d{depth}.fasta"
output :
time = os.path.join('{data_set}','time','consensus_msa_t{threshold}_r{region_size}_d{depth}'),
out = os.path.join('{data_set}','seq_consensus','t{threshold}','r{region_size}','consensus_msa_d{depth}.fasta')
message:
"Consensus msa for {wildcards.data_set} (Threshold={wildcards.threshold}, Region size={wildcards.region_size} & Depth={wildcards.depth})"
log:
"{data_set}/logs/15_consensus_msa_t{threshold}_r{region_size}_d{depth}.log"
conda:
"env_conda/muscle.yaml"
shell:
'./src/run_MSA.sh "muscle -in {input} -out {output.out}" {input} {output.out} {output.time} {log} 1'
rule consensus_consensus:
input :
os.path.join('{data_set}','seq_consensus','t{threshold}','r{region_size}','consensus_msa_d{depth}.fasta')
output :
os.path.join('{data_set}','seq_consensus','t{threshold}','r{region_size}','consensus_consensus_d{depth}.fasta')
message:
"Consensus consensus for {wildcards.data_set} (Threshold={wildcards.threshold}, Region size={wildcards.region_size} & Depth={wildcards.depth})"
log:
"{data_set}/logs/16_consensus_consensus_t{threshold}_r{region_size}_d{depth}.log"
conda:
"env_conda/python3.yaml"
shell:
'ORDER="./src/sequence_consensus.py -m -in {input}";'
'echo "ORDER: $ORDER" >{log};'
'$ORDER >{output} 2>>{log}'
rule alignment_consensus_consensus_ref :
input :
consensus=os.path.join('{data_set}','seq_consensus','t{threshold}','r{region_size}','consensus_consensus_d{depth}.fasta'),
region="{data_set}/seq_selectes_region/region_seq_r{region_size}.fasta"
output :
"{data_set}/seq_consensus/t{threshold}/r{region_size}/align_consensus_consensus_ref_d{depth}.txt",
message:
"Alignment_consensus_consensus_ref for {wildcards.data_set} (Threshold={wildcards.threshold}, Region size={wildcards.region_size} & Depth={wildcards.depth})"
log:
"{data_set}/logs/17_alignment_consensus_consensus_ref_t{threshold}_r{region_size}_d{depth}.log"
conda:
"env_conda/exonerate.yaml"
shell :
'if [ `cat {input.consensus}|grep ">"|wc -l` -ne 0 ]; then'
' ORDER="exonerate -Q dna -E -m a:g {input.consensus} {input.region}";'
' echo "ORDER: $ORDER" >{log};'
' $ORDER >{output} 2>>{log};'
'else'
' echo "ERROR: No sequences" >>{log};'
' touch {output};'
'fi'
This diff is collapsed.
Click to expand it.
src/separate_consensus.sh
0 → 100755
+
1
−
0
View file @
08241ced
cat
$1
|tr
'\n'
'@'
|
sed
's/@>/\n>/g'
|grep d
$2
|tr
'@'
'\n'
>
$3
This diff is collapsed.
Click to expand it.
src/sequence_consensus.py
+
13
−
4
View file @
08241ced
...
...
@@ -28,6 +28,7 @@ def use():
"
-t : threshold above which the nucleotide is kept in percent.
"
,
"
-optional: -h: display help.
"
,
"
-I: use IUPAC code.
\n
"
,
"
-m: use majority consensus. (useless -t)
\n
"
,
"
Note: Nucleotide kept if highest frequency and above threshold.
"
,
"
In case of a tie, an N is kept (no IUPAC code). If equal
"
,
"
with a gap, it
'
s the nucleotide is preserved.
\n
"
,
sep
=
"
\n
"
)
...
...
@@ -62,10 +63,15 @@ else:
next_file
=
sys
.
argv
[
sys
.
argv
.
index
(
"
-in
"
)
+
i
]
except
:
end_files
=
1
try
:
if
(
sys
.
argv
[
sys
.
argv
.
index
(
"
-m
"
)]):
majority
=
1
except
:
majority
=
0
try
:
threshold
=
float
(
sys
.
argv
[
sys
.
argv
.
index
(
"
-t
"
)
+
1
])
/
100
except
:
if
(
majority
==
False
):
print
(
"
ERROR: The nucleotide selection threshold is mandatory.
\n
"
)
use
()
...
...
@@ -104,6 +110,9 @@ for file_name in file_names:
try
:
if
(
iupac
):
seq
=
tool_consensus
.
consensus_iupac
(
alignments
[
0
],
threshold
)
else
:
if
(
majority
):
seq
=
tool_consensus
.
majority_consensus
(
alignments
[
0
])
else
:
seq
=
tool_consensus
.
consensus
(
alignments
[
0
],
threshold
)
print
(
"
>
"
+
file_name
.
split
(
"
/
"
)[
-
1
].
split
(
"
.
"
)[
0
]
+
"
\n
"
+
seq
)
...
...
This diff is collapsed.
Click to expand it.
src/tool_consensus.py
+
40
−
0
View file @
08241ced
...
...
@@ -140,6 +140,46 @@ def consensus(alignment,threshold):
seq
+=
max_letters
[
0
]
return
seq
def
majority_consensus
(
alignment
):
threshold
=
0.5
summary_align
=
AlignInfo
.
SummaryInfo
(
alignment
)
nb_reads
=
len
(
summary_align
.
get_column
(
0
))
align_size
=
len
(
alignment
[
1
].
seq
)
letters
=
summary_align
.
_get_all_letters
()
seq
=
""
#Run the alignment column by column
for
i
in
range
(
align_size
):
pass
nucleotides
=
summary_align
.
get_column
(
i
)
frequency_letters
=
{}
max
=
[
0
,[]]
#Calculates the frequency of each letter
#and keeps the letter(s) with the maximum
max_frequency
=
0
max_letters
=
[]
for
letter
in
letters
:
pass
frequency_letter
=
(
len
(
re
.
findall
(
letter
,
nucleotides
))
/
nb_reads
)
if
max_frequency
<
frequency_letter
:
pass
max_frequency
=
frequency_letter
max_letters
=
[
letter
]
else
:
if
max_frequency
==
frequency_letter
:
pass
max_letters
=
max_letters
+
[
letter
]
if
len
(
max_letters
)
<=
1
:
if
max_letters
[
0
]
!=
'
-
'
:
pass
seq
+=
max_letters
[
0
]
else
:
max_letters
.
sort
()
id_iupac
=
""
.
join
((
max_letters
))
id_iupac
=
id_iupac
.
replace
(
'
-
'
,
''
)
seq
+=
nuc_to_iupac
[
id_iupac
]
return
seq
#Creation of the consensus sequence
def
index_jaccard
(
nuc_ref
,
nuc_cons
):
nb_nuc_ref
=
len
(
nuc_ref
)
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment