Commit 69d79b42 authored by Rohmer Coralie

Meta consensus

parent cfea4c00
@@ -114,7 +114,7 @@ rule all :
expand(EXP + "/" + EXP_NAME + "/results/" + EXP_NAME + "_data_align_t{threshold}.csv", data_set=DATA_SETS, threshold=THRESHOLDS),
expand('{data_set}/results/' + EXP_NAME + '_graph_{attribute}.pdf', data_set=DATA_SETS, attribute=ATTRIBUTES_DATA),
expand(EXP + '/'+EXP_NAME + '/results/' + EXP_NAME + '_graph_{attribute}.pdf', data_set=DATA_SETS, attribute=ATTRIBUTES_DATA),
- expand(EXP + "/" + EXP_NAME + "/results/" + EXP_NAME + "_consensus_consensus_data_align_t{threshold}.csv", threshold=THRESHOLDS)
+ expand(EXP + "/" + EXP_NAME + "/results/" + EXP_NAME + "_meta_consensus_data_align_t{threshold}.csv", threshold=THRESHOLDS)
#-------------------------------------------------------------------------------
# Data set preparation
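For orientation, a minimal sketch of how Snakemake's expand() resolves the renamed meta-consensus target in rule all; EXP, EXP_NAME and THRESHOLDS below are illustrative values, not the project's configuration:

# Sketch only: expand() resolving the renamed target (values are illustrative).
from snakemake.io import expand

EXP, EXP_NAME = "experiments", "demo"
THRESHOLDS = [50, 60]
targets = expand(
    EXP + "/" + EXP_NAME + "/results/" + EXP_NAME
    + "_meta_consensus_data_align_t{threshold}.csv",
    threshold=THRESHOLDS,
)
# -> ['experiments/demo/results/demo_meta_consensus_data_align_t50.csv',
#     'experiments/demo/results/demo_meta_consensus_data_align_t60.csv']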
@@ -387,7 +387,7 @@ rule sequence_consensus :
input :
expand("{{data_set}}/msa/MSA_{msa}_r{{region_size}}_d{depth}.fasta" , msa = MSA, depth=DEPTHS)
output :
"{data_set}/seq_consensus/t{threshold}/r{region_size}/seq_consensus.fasta"
"{data_set}/seq_consensus/seq_consensus_t{threshold}_r{region_size}.fasta"
message:
"Sequence_consensus for {wildcards.data_set} (Threshold={wildcards.threshold} & Region size={wildcards.region_size})"
log:
@@ -404,10 +404,10 @@ rule sequence_consensus :
#-------------------------------------------------------------------------------
rule alignment_consensus_ref :
input :
consensus="{data_set}/seq_consensus/t{threshold}/r{region_size}/seq_consensus.fasta",
consensus="{data_set}/seq_consensus/seq_consensus_t{threshold}_r{region_size}.fasta",
region="{data_set}/seq_selectes_region/region_seq_r{region_size}.fasta"
output :
"{data_set}/seq_consensus/t{threshold}/r{region_size}/align_consensus_ref.txt",
"{data_set}/seq_consensus/align_consensus_ref_t{threshold}_r{region_size}.txt",
message:
"Alignment_consensus_ref for {wildcards.data_set} (Threshold={wildcards.threshold} & Region size={wildcards.region_size})"
log:
@@ -429,7 +429,7 @@ rule alignment_consensus_ref :
#-------------------------------------------------------------------------------
rule data_formatting :
input :
- exo = expand("{{data_set}}/seq_consensus/t{{threshold}}/r{region_size}/align_consensus_ref.txt" , region_size=REGION_SIZES),
+ exo = expand("{{data_set}}/seq_consensus/align_consensus_ref_t{{threshold}}_r{region_size}.txt" , region_size=REGION_SIZES),
time = "{data_set}/time/all_MSA_time.csv"
output :
"{data_set}/results/"+EXP_NAME+"_data_align_t{threshold}.csv"
@@ -496,13 +496,13 @@ rule data_time :
'$ORDER >{output} 2>>{log}'
# -------------------------------------------------------------------------------
- # Consensus of consensus
+ # Meta consensus
# -------------------------------------------------------------------------------
rule separate_consensus :
input :
"{data_set}/seq_consensus/t{threshold}/r{region_size}/seq_consensus.fasta"
"{data_set}/seq_consensus/seq_consensus_t{threshold}_r{region_size}.fasta"
output :
"{data_set}/seq_consensus/t{threshold}/r{region_size}/seq_consensus_r{region_size}_d{depth}.fasta"
"{data_set}/seq_consensus/separate_by_depth/seq_consensus_t{threshold}_r{region_size}_d{depth}.fasta"
message:
"Separate consensus for {wildcards.data_set} (Threshold={wildcards.threshold}, Region size={wildcards.region_size} & Depth={wildcards.depth})"
log:
@@ -512,31 +512,30 @@ rule separate_consensus :
'echo "ORDER: $ORDER" >{log};'
'$ORDER >{output} 2>>{log}'
- rule consensus_msa :
+ rule msa_consensus :
input :
"{data_set}/seq_consensus/t{threshold}/r{region_size}/seq_consensus_r{region_size}_d{depth}.fasta"
"{data_set}/seq_consensus/separate_by_depth/seq_consensus_t{threshold}_r{region_size}_d{depth}.fasta"
output :
- time = os.path.join('{data_set}','time','consensus_msa_t{threshold}_r{region_size}_d{depth}'),
- out = os.path.join('{data_set}','seq_consensus','t{threshold}','r{region_size}','msa_consensus_r{region_size}_d{depth}.fasta')
+ time = os.path.join('{data_set}','time','msa_consensus_t{threshold}_r{region_size}_d{depth}'),
+ out = os.path.join('{data_set}','meta_consensus','msa_consensus_t{threshold}_r{region_size}_d{depth}.fasta')
message:
"Consensus msa for {wildcards.data_set} (Threshold={wildcards.threshold}, Region size={wildcards.region_size} & Depth={wildcards.depth})"
"MSA Consensus for {wildcards.data_set} (Threshold={wildcards.threshold}, Region size={wildcards.region_size} & Depth={wildcards.depth})"
log:
"{data_set}/logs/15_consensus_msa_t{threshold}_r{region_size}_d{depth}.log"
"{data_set}/logs/15_msa_consensus_t{threshold}_r{region_size}_d{depth}.log"
conda:
"env_conda/muscle.yaml"
shell:
'./src/run_MSA.sh "muscle -in {input} -out {output.out}" {input} {output.out} {output.time} {log} 1'
- rule consensus_consensus:
+ rule meta_consensus:
input :
- expand('{{data_set}}/seq_consensus/t{{threshold}}/r{{region_size}}/msa_consensus_r{{region_size}}_d{depth}.fasta',depth=DEPTHS)
+ expand('{{data_set}}/meta_consensus/msa_consensus_t{{threshold}}_r{{region_size}}_d{depth}.fasta',depth=DEPTHS)
output :
"{data_set}/seq_consensus/t{threshold}/r{region_size}/consensus_consensus_r{region_size}.fasta"
"{data_set}/meta_consensus/meta_consensus_t{threshold}_r{region_size}.fasta"
message:
"Consensus consensus for {wildcards.data_set} (Threshold={wildcards.threshold} & Region size={wildcards.region_size})"
"Meta consensus for {wildcards.data_set} (Threshold={wildcards.threshold} & Region size={wildcards.region_size})"
log:
"{data_set}/logs/16_consensus_consensus_t{threshold}_r{region_size}.log"
"{data_set}/logs/16_meta_consensus_t{threshold}_r{region_size}.log"
conda:
"env_conda/python3.yaml"
shell:
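Taken together, the rules above split the consensus sequences by depth, re-align each group with MUSCLE, and then derive a single sequence from that alignment. A simplified sketch of that final step, assuming Biopython and a plain majority vote (the repository's own consensus script applies thresholds and IUPAC codes, so this is only an approximation):

# Simplified sketch: consensus of an MSA of consensus sequences
# (plain per-column majority vote, gaps dropped from the output).
from collections import Counter
from Bio import AlignIO

def meta_consensus(msa_fasta):
    aln = AlignIO.read(msa_fasta, "fasta")
    columns = (aln[:, i] for i in range(aln.get_alignment_length()))
    return "".join(Counter(col).most_common(1)[0][0] for col in columns).replace("-", "")

# e.g. meta_consensus("ds1/meta_consensus/msa_consensus_t50_r100_d10.fasta")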
@@ -544,16 +543,16 @@ rule consensus_consensus:
'echo "ORDER: $ORDER" >{log};'
'$ORDER >{output} 2>>{log}'
- rule alignment_consensus_consensus_ref :
+ rule alignment_meta_consensus_ref :
input :
- consensus=os.path.join('{data_set}','seq_consensus','t{threshold}','r{region_size}','consensus_consensus_r{region_size}.fasta'),
+ consensus=os.path.join('{data_set}','meta_consensus','meta_consensus_t{threshold}_r{region_size}.fasta'),
region="{data_set}/seq_selectes_region/region_seq_r{region_size}.fasta"
output :
"{data_set}/seq_consensus/t{threshold}/r{region_size}/align_consensus_consensus_ref_r{region_size}.txt",
"{data_set}/meta_consensus/align_meta_consensus_ref_t{threshold}_r{region_size}.txt",
message:
"Alignment_consensus_consensus_ref for {wildcards.data_set} (Threshold={wildcards.threshold} & Region size={wildcards.region_size})"
"Alignment meta consensus ref for {wildcards.data_set} (Threshold={wildcards.threshold} & Region size={wildcards.region_size})"
log:
"{data_set}/logs/17_alignment_consensus_consensus_ref_t{threshold}_r{region_size}.log"
"{data_set}/logs/17_alignment_meta_consensus_ref_t{threshold}_r{region_size}.log"
conda:
"env_conda/exonerate.yaml"
shell :
@@ -566,15 +565,15 @@ rule alignment_consensus_consensus_ref :
' touch {output};'
'fi'
- rule consensus_consensus_data_formatting :
+ rule meta_consensus_data_formatting :
input :
expand("{{data_set}}/seq_consensus/t{{threshold}}/r{region_size}/align_consensus_consensus_ref_r{region_size}.txt" , region_size=REGION_SIZES, depth=DEPTHS),
expand("{{data_set}}/meta_consensus/align_meta_consensus_ref_t{{threshold}}_r{region_size}.txt" , region_size=REGION_SIZES),
output :
"{data_set}/results/"+EXP_NAME+"_consensus_consensus_data_align_t{threshold}.csv"
"{data_set}/results/"+EXP_NAME+"_meta_consensus_data_align_t{threshold}.csv"
message:
"Consensus consensus data formatting for {wildcards.data_set} (Threshold={wildcards.threshold})"
"Meta consensus data formatting for {wildcards.data_set} (Threshold={wildcards.threshold})"
log:
"{data_set}/logs/18_consensus_consensus_data_formatting_t{threshold}.log"
"{data_set}/logs/18_meta_consensus_data_formatting_t{threshold}.log"
conda:
"env_conda/python3.yaml"
shell :
@@ -582,15 +581,15 @@ rule consensus_consensus_data_formatting :
'echo "ORDER: $ORDER" >{log};'
'$ORDER >{output} 2>>{log}'
- rule consensus_consensus_region_mean:
+ rule meta_consensus_region_mean:
input :
expand("{data_set}/results/"+EXP_NAME+"_consensus_consensus_data_align_t{{threshold}}.csv" , data_set = DATA_SETS)
expand("{data_set}/results/"+EXP_NAME+"_meta_consensus_data_align_t{{threshold}}.csv" , data_set = DATA_SETS)
output :
- EXP + '/'+ EXP_NAME + "/results/"+EXP_NAME+"_consensus_consensus_data_align_t{threshold}.csv"
+ EXP + '/'+ EXP_NAME + "/results/"+EXP_NAME+"_meta_consensus_data_align_t{threshold}.csv"
message:
"Consensus consensus region mean for " + EXP + '/'+ EXP_NAME + " (threshold={wildcards.threshold})"
"Meta consensus region mean for " + EXP + '/'+ EXP_NAME + " (threshold={wildcards.threshold})"
log:
- EXP + '/'+EXP_NAME + "/logs/19_consensus_consensus_region_mean_t{threshold}.log"
+ EXP + '/'+EXP_NAME + "/logs/19_meta_consensus_region_mean_t{threshold}.log"
conda:
"env_conda/python3.yaml"
shell :
......
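The meta_consensus_region_mean rule above averages the per-dataset CSVs into one experiment-level table. A hypothetical pandas equivalent, with column names invented purely for illustration (the actual script and its columns are not shown in this diff):

# Hypothetical aggregation in the spirit of meta_consensus_region_mean;
# the column names "region" and "identity" are invented for illustration.
import pandas as pd

def region_mean(csv_paths, out_csv):
    merged = pd.concat((pd.read_csv(p) for p in csv_paths), ignore_index=True)
    merged.groupby("region", as_index=False)["identity"].mean().to_csv(out_csv, index=False)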
D: [10,20,50]
S: [100,200]
T: [50]
- M: [muscle,mafft,poa,spoa,abpoa,kalign,kalign3,clustalo,tcoffee]
+ M: [muscle,mafft,poa,spoa,abpoa,kalign,kalign3]
O: 10
@@ -77,15 +77,10 @@ def config_conda():
sys.exit()
def test():
- answer = input("Do you want to run a test? (y/n) ")
- if (answer == 'y' or answer == 'Y' ):
- pass
print("Launch test")
if not os.path.exists("test_data"):
print("Preparation of the test data set")
result = subprocess.run("./src/test_data.sh")
print("Launch Snakemake")
result = subprocess.run(["./src/snakemake_launcher.sh","configuration_files/test.yaml"])
- else :
- usage()
def experiments_list():
if os.path.exists("experiments"):
@@ -111,8 +106,13 @@ def summary():
result = subprocess.Popen("ls experiments",shell=True,stdout=subprocess.PIPE)
exp_names=result.communicate()[0].decode("utf-8")
exp_names=re.sub('\n', r' ', exp_names)
result = subprocess.run("./src/total_data_format.py -m -n " + exp_names,shell=True)
if os.path.exists("results_mean"):
result = subprocess.run("rm -r results_mean",shell=True)
if os.path.exists("results_all_regions"):
result = subprocess.run("rm -r results_all_regions",shell=True)
result = subprocess.run("./src/total_data_format.py -n " + exp_names,shell=True)
result = subprocess.run("./src/total_data_format.py -m -n " + exp_names,shell=True)
print("See folders: results_mean & results_all_regions")
else:
print("No experiment has been launched yet")
......
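The reworked summary() clears stale result folders by shelling out to rm -r before regenerating them. A pure-Python alternative (not the repository's code) would use the standard library:

# Alternative to the "rm -r" subprocess calls: remove stale result folders
# with the standard library before regenerating them.
import shutil
from pathlib import Path

for folder in ("results_mean", "results_all_regions"):
    if Path(folder).exists():
        shutil.rmtree(folder)  # same effect as: rm -r <folder>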
@@ -97,6 +97,7 @@ for file_name in file_names:
if bool_file_type != None:
pass
bool_fasta=re.search(">", infile.read())
+ #alignments[0]=""
if bool_fasta != None:
pass
alignments = list(AlignIO.parse(file_name, "fasta"))
@@ -105,9 +106,8 @@ for file_name in file_names:
else:
print("ERROR:The format of",file_name,"isn't correct.\n")
use()
#Retrieves the consensus sequence
- try:
+ if (len(alignments) >= 1 ):
if (iupac):
seq=tool_consensus.consensus_iupac(alignments[0],threshold)
else:
@@ -116,5 +116,5 @@ for file_name in file_names:
else:
seq=tool_consensus.consensus(alignments[0],threshold)
print(">" + file_name.split("/")[-1].split(".")[0] + "\n" + seq)
- except:
+ else:
print("",end="")
@@ -13,5 +13,5 @@ if [[ $2 == "" ]]; then
else
CORES=$2
fi
echo "Launche Snakemake with $CONFIG_EXPERIMENT"
echo "Launch Snakemake with $CONFIG_EXPERIMENT"
snakemake --configfile $CONFIG_EXPERIMENT -c$CORES --use-conda --rerun-incomplete
@@ -7,6 +7,11 @@ from Bio.pairwise2 import format_alignment
from Bio import SeqIO
nuc_to_iupac={}
nuc_to_iupac["-"]="-"
nuc_to_iupac["A"]="A"
nuc_to_iupac["G"]="G"
nuc_to_iupac["C"]="C"
nuc_to_iupac["T"]="T"
nuc_to_iupac["AG"]="R"
nuc_to_iupac["CT"]="Y"
nuc_to_iupac["CG"]="S"
@@ -145,41 +150,67 @@ def majority_consensus(alignment):
summary_align = AlignInfo.SummaryInfo(alignment)
nb_reads = len(summary_align.get_column(0))
align_size = len(alignment[1].seq)
letters = summary_align._get_all_letters()
seq=""
#Run the alignment column by column
for i in range(align_size):
pass
- nucleotides=summary_align.get_column(i)
+ nucleotides_IUPAC=summary_align.get_column(i)
+ freq_gap=len(re.findall("-",nucleotides_IUPAC))/nb_reads
+ if(freq_gap < 0.5):
- frequency_letters={}
- max = [0,[]]
#Calculates the frequency of each letter
#and keeps the letter(s) with the maximum
- max_frequency = 0
+ nucleotides_no_IUPAC=[]
+ for nuc in nucleotides_IUPAC:
+ nucleotides_no_IUPAC.append(iupac_to_nuc[nuc])
+ max_counter = 0
max_letters=[]
- for letter in letters:
- pass
- frequency_letter=(len(re.findall(letter,nucleotides))/nb_reads)
- if max_frequency < frequency_letter:
- pass
- max_frequency = frequency_letter
for letter in "ATGC":
letter_counter=0;
for nuc in nucleotides_no_IUPAC:
if(nuc.find(letter) > -1):
letter_counter+=1/len(nuc)
if(letter_counter > max_counter):
max_counter = letter_counter
max_letters = [letter]
else:
- if max_frequency==frequency_letter:
- pass
- max_letters = max_letters + [letter]
- if len(max_letters) <= 1:
- if max_letters[0] != '-':
- pass
- seq+=max_letters[0]
- else:
- max_letters.sort()
+ if(letter_counter == max_counter):
+ max_letters.append(letter)
+ if ( len(max_letters) > 1):
id_iupac = "".join((max_letters))
id_iupac = id_iupac.replace('-','')
seq+=nuc_to_iupac[id_iupac]
+ else:
+ seq+=max_letters[0]
return seq
+ # max_frequency = 0
+ # max_letters=[]
+ # for letter in letters:
+ # pass
+ # frequency_letter=(len(re.findall(letter,nucleotides))/nb_reads)
+ # if max_frequency < frequency_letter:
+ # pass
+ # max_frequency = frequency_letter
+ # max_letters = [letter]
+ # else:
+ # if max_frequency==frequency_letter:
+ # pass
+ # max_letters = max_letters + [letter]
+ # if len(max_letters) <= 1:
+ # if max_letters[0] != '-':
+ # pass
+ # seq+=max_letters[0]
+ # else:
+ # max_letters.sort()
+ # id_iupac = "".join((max_letters))
+ # id_iupac = id_iupac.replace('-','')
+ # try:
+ # seq+=nuc_to_iupac[id_iupac]
+ # except:
+ # seq+="["+id_iupac+"]"
#Creation of the consensus sequence
def index_jaccard(nuc_ref,nuc_cons):
nb_nuc_ref = len(nuc_ref)
......
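Condensed, the rewritten majority_consensus handles each alignment column as follows: IUPAC symbols are expanded to plain bases, every expanded base contributes 1/len(expansion) to its count, columns that are at least half gaps are skipped, and a tie between top-scoring bases is re-encoded as an IUPAC code. A self-contained sketch of that per-column rule (the dictionaries stand in for the module's iupac_to_nuc and nuc_to_iupac):

# Per-column rule of the rewritten majority consensus, condensed.
IUPAC_TO_NUC = {"-": "-", "A": "A", "C": "C", "G": "G", "T": "T",
                "R": "AG", "Y": "CT", "S": "CG", "W": "AT", "K": "GT", "M": "AC",
                "B": "CGT", "D": "AGT", "H": "ACT", "V": "ACG", "N": "ACGT"}
NUC_TO_IUPAC = {v: k for k, v in IUPAC_TO_NUC.items()}

def column_consensus(column):
    # Consensus symbol for one alignment column; '' when gaps are the majority.
    if column.count("-") / len(column) >= 0.5:
        return ""
    weights = {b: 0.0 for b in "ACGT"}
    for symbol in column:
        bases = IUPAC_TO_NUC[symbol].replace("-", "")
        for b in bases:
            weights[b] += 1.0 / len(bases)  # ambiguous symbols are split fractionally
    best = max(weights.values())
    winners = sorted(b for b, w in weights.items() if w == best)
    return NUC_TO_IUPAC["".join(winners)]

# column_consensus("AAG-") -> 'A'    column_consensus("AGRR") -> 'R'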
@@ -7,7 +7,7 @@ ATTRIBUTES_TO_DISPLAY_THRESHOLD_INDEPENDANT=["time","memory"]
PREFIX="region_"
RESULT_FOLDER="results"
NAME_DATA_FILE="data_align_t"
NAME_META_CONSENSUS="consensus_consensus_"
NAME_META_CONSENSUS="meta_consensus_"
#Script use
def use():
print("\nScript:\ttotal_data_formating.py",
@@ -181,7 +181,6 @@ for threshold in files :
bases=sorted(bases)
MSA=sorted(MSA)
reads=sorted(reads)
- print(reads,bases,MSA)
for base in bases :
for read in reads :
output_mean.write(str(base) + "," + str(read) + ",")
@@ -206,4 +205,3 @@ for threshold in files :
output_all.write("\n")
output_mean.write("\n")
output_all.write("\n")
print("See folders: results_mean & results_all_regions")