Skip to content
Snippets Groups Projects
Commit 08241ced authored by Rohmer Coralie's avatar Rohmer Coralie
Browse files

add consensus of consensus

parent 74ba567c
No related branches found
No related tags found
No related merge requests found
......@@ -113,7 +113,8 @@ rule all :
input :
expand(EXP + "/" + EXP_NAME + "/results/" + EXP_NAME + "_data_align_t{threshold}.csv", data_set=DATA_SETS, threshold=THRESHOLDS),
expand('{data_set}/results/' + EXP_NAME + '_graph_{attribute}.pdf', data_set=DATA_SETS, attribute=ATTRIBUTES_DATA),
expand(EXP + '/'+EXP_NAME + '/results/' + EXP_NAME + '_graph_{attribute}.pdf', data_set=DATA_SETS, attribute=ATTRIBUTES_DATA)
expand(EXP + '/'+EXP_NAME + '/results/' + EXP_NAME + '_graph_{attribute}.pdf', data_set=DATA_SETS, attribute=ATTRIBUTES_DATA),
expand("{data_set}/seq_consensus/t{threshold}/r{region_size}/align_consensus_consensus_ref_d{depth}.txt", data_set=DATA_SETS, threshold=THRESHOLDS,region_size=REGION_SIZES, depth=DEPTHS)
#-------------------------------------------------------------------------------
# Data set preparation
......@@ -493,3 +494,73 @@ rule data_time :
'ORDER="./src/data_time.pl {input}";'
'echo "ORDER: $ORDER" >{log};'
'$ORDER >{output} 2>>{log}'
# -------------------------------------------------------------------------------
# Consensus of consensus
# -------------------------------------------------------------------------------
rule separate_consensus :
input :
"{data_set}/seq_consensus/t{threshold}/r{region_size}/seq_consensus.fasta"
output :
"{data_set}/seq_consensus/t{threshold}/r{region_size}/seq_consensus_d{depth}.fasta"
message:
"Separate consensus for {wildcards.data_set} (Threshold={wildcards.threshold}, Region size={wildcards.region_size} & Depth={wildcards.depth})"
log:
"{data_set}/logs/14_separate_consensus_t{threshold}_r{region_size}_d{depth}.log"
shell:
'ORDER="./src/separate_consensus.sh {input} {wildcards.depth} {output}";'
'echo "ORDER: $ORDER" >{log};'
'$ORDER >{output} 2>>{log}'
rule consensus_msa :
input :
"{data_set}/seq_consensus/t{threshold}/r{region_size}/seq_consensus_d{depth}.fasta"
output :
time = os.path.join('{data_set}','time','consensus_msa_t{threshold}_r{region_size}_d{depth}'),
out = os.path.join('{data_set}','seq_consensus','t{threshold}','r{region_size}','consensus_msa_d{depth}.fasta')
message:
"Consensus msa for {wildcards.data_set} (Threshold={wildcards.threshold}, Region size={wildcards.region_size} & Depth={wildcards.depth})"
log:
"{data_set}/logs/15_consensus_msa_t{threshold}_r{region_size}_d{depth}.log"
conda:
"env_conda/muscle.yaml"
shell:
'./src/run_MSA.sh "muscle -in {input} -out {output.out}" {input} {output.out} {output.time} {log} 1'
rule consensus_consensus:
input :
os.path.join('{data_set}','seq_consensus','t{threshold}','r{region_size}','consensus_msa_d{depth}.fasta')
output :
os.path.join('{data_set}','seq_consensus','t{threshold}','r{region_size}','consensus_consensus_d{depth}.fasta')
message:
"Consensus consensus for {wildcards.data_set} (Threshold={wildcards.threshold}, Region size={wildcards.region_size} & Depth={wildcards.depth})"
log:
"{data_set}/logs/16_consensus_consensus_t{threshold}_r{region_size}_d{depth}.log"
conda:
"env_conda/python3.yaml"
shell:
'ORDER="./src/sequence_consensus.py -m -in {input}";'
'echo "ORDER: $ORDER" >{log};'
'$ORDER >{output} 2>>{log}'
rule alignment_consensus_consensus_ref :
input :
consensus=os.path.join('{data_set}','seq_consensus','t{threshold}','r{region_size}','consensus_consensus_d{depth}.fasta'),
region="{data_set}/seq_selectes_region/region_seq_r{region_size}.fasta"
output :
"{data_set}/seq_consensus/t{threshold}/r{region_size}/align_consensus_consensus_ref_d{depth}.txt",
message:
"Alignment_consensus_consensus_ref for {wildcards.data_set} (Threshold={wildcards.threshold}, Region size={wildcards.region_size} & Depth={wildcards.depth})"
log:
"{data_set}/logs/17_alignment_consensus_consensus_ref_t{threshold}_r{region_size}_d{depth}.log"
conda:
"env_conda/exonerate.yaml"
shell :
'if [ `cat {input.consensus}|grep ">"|wc -l` -ne 0 ]; then'
' ORDER="exonerate -Q dna -E -m a:g {input.consensus} {input.region}";'
' echo "ORDER: $ORDER" >{log};'
' $ORDER >{output} 2>>{log};'
'else'
' echo "ERROR: No sequences" >>{log};'
' touch {output};'
'fi'
cat $1 |tr '\n' '@'| sed 's/@>/\n>/g'|grep d$2|tr '@' '\n'>$3
......@@ -28,6 +28,7 @@ def use():
" -t : threshold above which the nucleotide is kept in percent.",
" -optional: -h: display help.",
" -I: use IUPAC code.\n",
" -m: use majority consensus. (useless -t)\n",
"Note: Nucleotide kept if highest frequency and above threshold.",
" In case of a tie, an N is kept (no IUPAC code). If equal",
" with a gap, it's the nucleotide is preserved.\n",sep="\n")
......@@ -62,10 +63,15 @@ else:
next_file = sys.argv[sys.argv.index("-in")+i]
except:
end_files = 1
try:
if (sys.argv[sys.argv.index("-m")]):
majority=1
except:
majority=0
try:
threshold=float(sys.argv[sys.argv.index("-t")+1])/100
except:
if (majority==False):
print("ERROR: The nucleotide selection threshold is mandatory.\n")
use()
......@@ -104,6 +110,9 @@ for file_name in file_names:
try:
if (iupac):
seq=tool_consensus.consensus_iupac(alignments[0],threshold)
else:
if(majority):
seq=tool_consensus.majority_consensus(alignments[0])
else:
seq=tool_consensus.consensus(alignments[0],threshold)
print(">" + file_name.split("/")[-1].split(".")[0] + "\n" + seq)
......
......@@ -140,6 +140,46 @@ def consensus(alignment,threshold):
seq+=max_letters[0]
return seq
def majority_consensus(alignment):
threshold=0.5
summary_align = AlignInfo.SummaryInfo(alignment)
nb_reads = len(summary_align.get_column(0))
align_size = len(alignment[1].seq)
letters = summary_align._get_all_letters()
seq=""
#Run the alignment column by column
for i in range(align_size):
pass
nucleotides=summary_align.get_column(i)
frequency_letters={}
max = [0,[]]
#Calculates the frequency of each letter
#and keeps the letter(s) with the maximum
max_frequency = 0
max_letters=[]
for letter in letters:
pass
frequency_letter=(len(re.findall(letter,nucleotides))/nb_reads)
if max_frequency < frequency_letter:
pass
max_frequency = frequency_letter
max_letters = [letter]
else:
if max_frequency==frequency_letter:
pass
max_letters = max_letters + [letter]
if len(max_letters) <= 1:
if max_letters[0] != '-':
pass
seq+=max_letters[0]
else:
max_letters.sort()
id_iupac = "".join((max_letters))
id_iupac = id_iupac.replace('-','')
seq+=nuc_to_iupac[id_iupac]
return seq
#Creation of the consensus sequence
def index_jaccard(nuc_ref,nuc_cons):
nb_nuc_ref = len(nuc_ref)
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment