Commit fa995991 authored by Rohmer Coralie

Diploid metric + new doc

parent da6034b8
doc/pipeline.jpg (87.8 KiB)

doc/pipeline.png (138 KiB)

@@ -2,6 +2,8 @@
 import sys,os,re
 import subprocess
 from datetime import datetime
+EXP = "experiments"
+START = "start_position"
 def usage():
     print("msa-limit: Launch the MSA analysis pipeline\n",
     "Usage:",
@@ -42,7 +44,7 @@ def usage():
     " -h",
     " help",
     "\nEx: ./msa-limit.py -i reads.fastq -r ref.fasta -b 1,15000 -n exp -d 10,100 -s 100,200 -t 50,75 -m mafft,poa",
-    "\nOther modes: ",
+    "\nOther modes: ./msa-limit.py [mode]",
     " test ",
     " Launches a pipeline test",
     " list ",
@@ -53,6 +55,9 @@ def usage():
     " -n <string> <string> <string> ...",
     " default: all the names of the experiments",
     " names of the experiments you want to display in the summary.",
+    " -p <string> ",
+    " default: nothing",
+    " Prefix for results folders.",
     " run_config <string> <string> ...",
     " Launches the pipeline from configuration file(s)",
     " required: path to the configuration file(s).",
@@ -60,6 +65,12 @@ def usage():
     " -c <int>",
     " default: 24",
     " number of cores",
+    " diploid_metric",
+    " Calculates more advanced metrics for the diploid case",
+    " optional: ",
+    " -n <string> <string> <string> ...",
+    " default: all the names of the experiments",
+    " names of the experiments for which you want to calculate the metrics.",
     " rulegraph",
     " Displays a graph of the snakemake rules"
     ,sep="\n")
@@ -83,13 +94,13 @@ def test():
     result = subprocess.run(["./src/snakemake_launcher.sh","configuration_files/test.yaml"])
 def experiments_list():
-    if os.path.exists("experiments"):
-        result = subprocess.run("ls experiments",shell=True)
+    if os.path.exists(EXP):
+        result = subprocess.run("ls " + EXP,shell=True)
     else:
         print("No experiment has been launched yet")
 def summary():
-    if os.path.exists("experiments"):
+    if os.path.exists(EXP):
         try:
             next_name=sys.argv[sys.argv.index("-n")+1]
             i=1
@@ -103,16 +114,27 @@ def summary():
                 except:
                     end_names = 1
         except:
-            result = subprocess.Popen("ls experiments",shell=True,stdout=subprocess.PIPE)
+            result = subprocess.Popen("ls " + EXP,shell=True,stdout=subprocess.PIPE)
             exp_names=result.communicate()[0].decode("utf-8")
             exp_names=re.sub('\n', r' ', exp_names)
-        if os.path.exists("results_mean"):
-            result = subprocess.run("rm -r results_mean",shell=True)
-        if os.path.exists("results_all_start_positions"):
-            result = subprocess.run("rm -r results_all_start_positions",shell=True)
-        result = subprocess.run("./src/total_data_format.py -n " + exp_names,shell=True)
-        result = subprocess.run("./src/total_data_format.py -m -n " + exp_names,shell=True)
-        print("See folders: results_mean & results_all_start_positions")
+        try:
+            prefix=sys.argv[sys.argv.index("-p")+1]
+            prefix=prefix+"_"
+        except:
+            prefix=""
+        if os.path.exists(prefix+"results_mean"):
+            result = subprocess.run("rm -r "+prefix+"results_mean",shell=True)
+        if os.path.exists(prefix+"results_all_start_positions"):
+            result = subprocess.run("rm -r "+prefix+"results_all_start_positions",shell=True)
+        result = subprocess.run("./src/total_data_format.py -o "+prefix+"results -n " + exp_names,shell=True)
+        result = subprocess.run("./src/total_data_format.py -o "+prefix+"results -m -n " + exp_names,shell=True)
+        try:
+            a = sys.argv[sys.argv.index("-d")]
+            result = subprocess.run("./src/total_data_format.py -o "+prefix+"results -d -n " + exp_names,shell=True)
+        except:
+            pass
+        print("See folders: "+prefix+"results_mean & "+prefix+"results_all_start_positions")
     else:
         print("No experiment has been launched yet")
@@ -143,6 +165,68 @@ def run_config():
             except:
                 end_files = 1
+def diploid_metric():
+    if os.path.exists(EXP):
+        try:
+            next_name=sys.argv[sys.argv.index("-n")+1]
+            i=1
+            end_names = 0
+            exp_names = []
+            while ( end_names == 0 and re.search("-", next_name) == None ):
+                if not os.path.exists(os.path.join(EXP,next_name)):
+                    print("ERROR: The name " + next_name + " isn't the name of an experiment")
+                else:
+                    exp_names.append(next_name)
+                i+=1
+                try:
+                    next_name = sys.argv[sys.argv.index("-n")+i]
+                except:
+                    end_names = 1
+        except:
+            result = subprocess.Popen("ls " + EXP,shell=True,stdout=subprocess.PIPE)
+            names=result.communicate()[0].decode("utf-8")
+            names=re.sub('\n', r' ', names)
+            exp_names=names.split(" ")
+        for exp_name in exp_names:
+            result = subprocess.Popen("ls " + os.path.join(EXP,exp_name) +"|grep "+ START ,shell=True,stdout=subprocess.PIPE)
+            dir=result.communicate()[0].decode("utf-8")
+            dir_start=dir.split("\n")
+            dir_start.pop()
+            threshold=""
+            output_files={}
+            for start in dir_start:
+                result = subprocess.Popen("ls " + os.path.join(EXP,exp_name,start,"seq_consensus")+"|grep align" ,shell=True,stdout=subprocess.PIPE)
+                files=result.communicate()[0].decode("utf-8")
+                p = re.compile('_t[0-9]+_')
+                thresholds=set(p.findall(files))
+                files=files.split('\n')
+                files.pop()
+                for threshold in thresholds:
+                    if threshold not in output_files:
+                        output_files[threshold]=""
+                    threshold_files=[]
+                    for file in files:
+                        p = re.compile(threshold)
+                        if (p.search(file)):
+                            threshold_files.append(os.path.join(EXP,exp_name,start,"seq_consensus",file))
+                    output_file=os.path.join(EXP,exp_name,start,"results",exp_name+"_diploid_data_align"+threshold[0:-1]+".csv")
+                    output_files[threshold]+=output_file + " "
+                    order=["./src/diploid_metric.py",'-in']
+                    for file in threshold_files:
+                        order.append(file)
+                    order.append("-o")
+                    order.append(output_file)
+                    result = subprocess.run(order)
+            for threshold in thresholds:
+                output_mean=os.path.join(EXP,exp_name,"results",exp_name+"_diploid_data_align"+threshold[0:-1]+".csv")
+                result = subprocess.run(["./src/region_mean_launcher.sh","-in ",output_files[threshold]
+                    ," -out ",output_mean," -t ",threshold[2:-1]])
+        print("Data generated. You will find them in the experiments results folder.")
+    else:
+        print("No experiment has been launched yet")
 def rulegraph():
     if not os.path.exists("test_data"):
         print("Preparation of the test data set")
@@ -175,6 +259,8 @@ elif mode == "summary":
     summary()
 elif mode == "run_config":
     run_config()
+elif mode == "diploid_metric":
+    diploid_metric()
 elif mode == "rulegraph":
     rulegraph()
 else:
......
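Throughout msa-limit.py, options such as -n and the new -p are parsed by scanning sys.argv by hand: the value(s) after a flag are consumed until a token containing "-" turns up, and a bare except falls back to a default. A minimal sketch of that pattern (not the commit's code; it uses startswith("-") where the scripts match "-" anywhere in the token):

# Sketch of the flag-scanning pattern used by summary() and diploid_metric().
import sys

def values_after(flag, argv=None):
    """Collect the tokens following `flag`, up to the next option-like token."""
    argv = sys.argv if argv is None else argv
    values = []
    try:
        i = argv.index(flag) + 1
    except ValueError:
        return values  # flag absent: plays the role of the scripts' bare `except`
    while i < len(argv) and not argv[i].startswith("-"):
        values.append(argv[i])
        i += 1
    return values

# ./msa-limit.py summary -n exp1 exp2 -p run3
# values_after("-n") -> ["exp1", "exp2"]; values_after("-p") -> ["run3"]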
@@ -20,7 +20,7 @@ def use():
     "To Run:",
     "./sequence_consensus.py -in <file1> <file2>... -t <file>\n",
     "Arguments: ",
-    " -required: -in : file alignement (fasta).",
+    " -required: -in : file alignement (exonerate).",
     " -t : output time_format.pl",sep="\n")
     sys.exit()
@@ -102,7 +102,7 @@ for file in files_alignement:
         if (read_size not in data[MSA]):
             data[MSA][read_size] = {}
         match=round(tool_consensus.score_between_two_iupac(hsp.hit.seq,seq.seq),1)
-        error=round(len(seq)-match,1)
+        error=round(len(seq)-nb_identity,1)
         nb_substitution = round(error - nb_deletion - nb_insertion,1)
         r_ambiguity=round( (nb_ambiguity/(size_seq)*100) ,1)
         r_identity=round( (nb_identity/len(seq)*100) ,1)
......
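The second hunk re-derives the error count from nb_identity instead of the IUPAC-aware match score, so identity and error now always sum to the alignment length. A worked sketch of the arithmetic, assuming nb_identity, nb_deletion and nb_insertion are the counts the script computes earlier from the exonerate alignment:

# Illustrative only: the corrected error breakdown, with invented counts.
def error_breakdown(aln_len, nb_identity, nb_deletion, nb_insertion):
    error = round(aln_len - nb_identity, 1)                 # was: aln_len - match
    nb_substitution = round(error - nb_deletion - nb_insertion, 1)
    r_identity = round(nb_identity / aln_len * 100, 1)
    return error, nb_substitution, r_identity

# 100 aligned columns, 92 identities, 3 deletions, 2 insertions
print(error_breakdown(100, 92, 3, 2))  # -> (8.0, 3.0, 92.0)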
#!/usr/bin/env python3
from Bio import SearchIO
import sys,re,os
from collections import Counter
import tool_consensus
"""
Project "The Limits of Multiple Aligners"
Authors: ROHMER Coralie
Date: 2020-02
Description:
"""
#Script use
def use():
    print("\nScript:\tdiploid_metric.py",
    "Objective: \n",
    "To Run:",
    "./diploid_metric.py -in <file1> <file2>... -o <file>",
    "Arguments: ",
    " -required: -in : alignment file(s) (exonerate).",
    "            -o : output file (csv).",sep="\n")
    sys.exit()
#Argument testing
try:
    files_alignement=[sys.argv[sys.argv.index("-in")+1]]
except:
    print("ERROR: The name of the input exonerate file(s) is missing.\n")
    use()
else:
    i=2
    try:
        next_file = sys.argv[sys.argv.index("-in")+i]
    except:
        end_files = 1
    else:
        end_files = 0
        while ( end_files == 0 and re.search("-", next_file) == None ):
            files_alignement.append(next_file)
            i+=1
            try:
                next_file = sys.argv[sys.argv.index("-in")+i]
            except:
                end_files = 1
try:
    output=sys.argv[sys.argv.index("-o")+1]
except:
    print("ERROR: The name of the output file is missing.\n")
    use()
#Main
data = {}
read_sizes=[]
depths=[]
for file in files_alignement:
    all_qresult = list(SearchIO.parse(file, 'exonerate-text'))
    for i in range(len(all_qresult)):
        qresult = all_qresult[i]
        id_query = qresult.id.split("_")
        nb_read = int(id_query[-1][1:])
        read_size = int(id_query[-2][1:])
        MSA = id_query[-3]
        hsp = qresult[0][0]
        similarity=hsp.aln_annotation['similarity']
        seq=hsp.query
        occurence_seq = Counter(seq)
        occurence_seq_hit = Counter(hsp.hit)
        occurence_similarity = Counter(similarity)
        #Count IUPAC ambiguity codes in the consensus (query) and reference (hit)
        nb_ambiguity_seq_consensus=0
        nb_ambiguity_ref=0
        for i in "RYSWKMBDHVN":
            if (i in occurence_seq):
                nb_ambiguity_seq_consensus += occurence_seq[i]
        for i in "RYSWKMBDHVN":
            if (i in occurence_seq_hit):
                nb_ambiguity_ref += occurence_seq_hit[i]
        #Classify each alignment column by what the IUPAC code faces
        identical_iupac=0
        iupac_vs_iupac=0
        iupac_vs_nuc=0
        iupac_vs_gap=0
        nuc_vs_iupac=0
        gap_vs_iupac=0
        for i in range(0,len(seq)):
            if(seq[i] in "RYSWKMBDHVN"):
                if( hsp.hit[i] in "RYSWKMBDHVN"):
                    if(seq[i]==hsp.hit[i]):
                        identical_iupac+=1
                    else:
                        iupac_vs_iupac+=1
                else:
                    if(hsp.hit[i]=="-"):
                        iupac_vs_gap+=1
                    else:
                        iupac_vs_nuc+=1
            else:
                if(hsp.hit[i] in "RYSWKMBDHVN"):
                    if(seq[i]=="-"):
                        gap_vs_iupac+=1
                    else:
                        nuc_vs_iupac+=1
        if (MSA not in data):
            data[MSA]={}
        if (read_size not in data[MSA]):
            data[MSA][read_size] = {}
        if (read_size not in read_sizes):
            read_sizes.append(read_size)
        if (nb_read not in depths):
            depths.append(nb_read)
        data[MSA][read_size][nb_read]=[nb_ambiguity_ref,nb_ambiguity_seq_consensus,identical_iupac,iupac_vs_iupac,iupac_vs_nuc,iupac_vs_gap,nuc_vs_iupac,gap_vs_iupac]
depths.sort()
read_sizes.sort()
#output
sep=","
output=open(output,"w")
output.write("MSA,region_size,depth,nb_iupac_ref,nb_iupac_seq_consensus,identical_iupac,iupac_vs_iupac,sc_iupac_vs_r_nuc,sc_iupac_vs_r_gap,sc_nuc_vs_r_iupac,sc_gap_vs_r_iupac\n")
for MSA in data:
    for read_size in read_sizes:
        if(read_size in data[MSA]):
            for nb_read in depths:
                if(nb_read in data[MSA][read_size]):
                    output.write(MSA + "," + str(read_size) + "," + str(nb_read))
                    for val in data[MSA][read_size][nb_read]:
                        output.write("," + str(val))
                    output.write("\n")
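The core of this new script is the column-by-column comparison above: each position where the consensus (query) or the reference (hit) carries an IUPAC ambiguity code is bucketed by what stands opposite it. A self-contained toy run of the same classification, with invented sequences already aligned one-to-one:

# Toy illustration of the IUPAC column classification (inputs invented).
IUPAC = set("RYSWKMBDHVN")

def classify_columns(consensus, reference):
    counts = {"identical_iupac": 0, "iupac_vs_iupac": 0, "iupac_vs_nuc": 0,
              "iupac_vs_gap": 0, "nuc_vs_iupac": 0, "gap_vs_iupac": 0}
    for c, r in zip(consensus, reference):
        if c in IUPAC and r in IUPAC:
            counts["identical_iupac" if c == r else "iupac_vs_iupac"] += 1
        elif c in IUPAC:
            counts["iupac_vs_gap" if r == "-" else "iupac_vs_nuc"] += 1
        elif r in IUPAC:
            counts["gap_vs_iupac" if c == "-" else "nuc_vs_iupac"] += 1
    return counts

# R/R identical, R/Y two different codes, R/A code vs nucleotide, -/N gap vs code
print(classify_columns("RRRA-", "RYAAN"))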
@@ -46,7 +46,6 @@ try:
 except:
     print("ERROR: The threshold is missing.\n")
     use()
-
 data={}
 data["order"]=[]
 attributes=[]
@@ -61,7 +60,6 @@ for file in files :
         msa=tab_line[0]
         region_size=tab_line[1]
         depth=tab_line[2]
-
         if (msa not in data):
             data[msa] = {}
             data["order"].append(msa)
@@ -90,9 +88,9 @@ for file in files :
         first_line+=attributes.pop(0)
         for attribute in attributes:
             first_line += "," + attribute + ",sd_" + attribute
         read.close()
     except:
         print(file,"doesn't exist.\n")
 out=open(output,"w")
 out.write(first_line)
 for msa in data["order"]:
......
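region_mean.py, invoked through the launcher added below, averages each attribute across start positions and pairs it with an sd_ column, which is what the first_line header construction above prepares for. A hedged sketch of that aggregation (mean_and_sd is an invented helper, not the script's API):

# One attribute measured at several start positions -> mean plus "sd_" column.
from statistics import mean, stdev

def mean_and_sd(values):
    m = round(mean(values), 1)
    s = round(stdev(values), 1) if len(values) > 1 else 0.0
    return m, s

# e.g. percentage_Identity at three start positions
print(mean_and_sd([91.8, 92.4, 92.1]))  # -> (92.1, 0.3)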
#!/bin/bash
source ./src/config_conda.sh
if [[ ! -d .conda_python3 ]]; then
    conda env create -p .conda_python3 -f env_conda/python3.yaml >/dev/null
fi
CURRENT_PATH=`pwd`
conda activate $CURRENT_PATH/.conda_python3
./src/region_mean.py "$@"
@@ -2,21 +2,31 @@
 import sys,re,os
 import subprocess
 EXP = "experiments"
-ATTRIBUTES_TO_DISPLAY=["percentage_Ambiguity","sd_percentage_Ambiguity","percentage_Identity","sd_percentage_Identity","percentage_Match","sd_percentage_Match","percentage_Error","sd_percentage_Error","percentage_Substitution","sd_percentage_Substitution","percentage_Deletion","sd_percentage_Deletion","percentage_Insertion","sd_percentage_Insertion"]
+ATTRIBUTES_TO_DISPLAY=["percentage_Ambiguity","sd_percentage_Ambiguity","percentage_Identity","sd_percentage_Identity","percentage_Match","sd_percentage_Match","percentage_Error","sd_percentage_Error","percentage_Substitution","sd_percentage_Substitution","percentage_Deletion","sd_percentage_Deletion","percentage_Insertion","sd_percentage_Insertion","size","sd_size"]
 ATTRIBUTES_TO_DISPLAY_THRESHOLD_INDEPENDANT=["time","memory","sd_time","sd_memory"]
 PREFIX="start_position_"
 RESULT_FOLDER="results"
 NAME_DATA_FILE="data_align_t"
 NAME_META_CONSENSUS="meta_consensus_"
+NAME_DIPLOID="diploid_data_align"
 #Script use
 def use():
     print("\nScript:\ttotal_data_formating.py",
     "Objective: \n",
     "To Run:",
-    "./total_data_formatting.py -e <exp_name1> <exp_name2> <exp_name3> ...\n",
+    "./total_data_formatting.py -n <exp_name1> <exp_name2> <exp_name3> ...\n",
     "Arguments: ",
-    " -required: -n : name of the experiments you want to display.",sep="\n")
+    " required: ",
+    " -n <string> <string>... : name of the experiments you want to display.",
+    " optional: ",
+    " -o <string> : output folders",
+    " -m : for meta consensus",
+    " -d : for diploid metric",sep="\n")
     sys.exit()
+try:
+    output_dir= sys.argv[sys.argv.index("-o")+1]
+except:
+    output_dir="results"
 try:
     exp_names= [sys.argv[sys.argv.index("-n")+1]]
@@ -41,15 +51,28 @@ else:
             end_names = 1
 META_CONSENSUS=True
+DIPLOID=False
 try:
-    a = [sys.argv[sys.argv.index("-m")]]
+    a = sys.argv[sys.argv.index("-m")]
 except:
     META_CONSENSUS=False
+try:
+    a = sys.argv[sys.argv.index("-d")]
+    DIPLOID=True
+    ATTRIBUTES_TO_DISPLAY=["nb_iupac_ref","nb_iupac_seq_consensus","identical_iupac",
+        "iupac_vs_iupac","sc_iupac_vs_r_nuc","sc_iupac_vs_r_gap","sc_nuc_vs_r_iupac",
+        "sc_gap_vs_r_iupac",
+        "sd_nb_iupac_ref","sd_nb_iupac_seq_consensus","sd_identical_iupac","sd_iupac_vs_iupac",
+        "sd_sc_iupac_vs_r_nuc","sd_sc_iupac_vs_r_gap","sd_sc_nuc_vs_r_iupac","sd_sc_gap_vs_r_iupac"]
+    ATTRIBUTES_TO_DISPLAY_THRESHOLD_INDEPENDANT=[]
+except:
+    pass
 files={}
 i=0
-result = subprocess.run("if [ ! -d results_mean ]; then mkdir results_mean;fi",shell=True)
-result = subprocess.run("if [ ! -d results_all_start_positions ]; then mkdir results_all_start_positions;fi",shell=True)
+result = subprocess.run("if [ ! -d "+output_dir+"_mean ]; then mkdir "+output_dir+"_mean;fi",shell=True)
+result = subprocess.run("if [ ! -d "+output_dir+"_all_start_positions ]; then mkdir "+output_dir+"_all_start_positions;fi",shell=True)
 #-----------------------------------------------------------------------------
 # Retrieve what is needed to read the files
 #-----------------------------------------------------------------------------
@@ -65,7 +88,10 @@ for i in range(len(exp_names)):
     for filename in current_files:
         if re.search(NAME_DATA_FILE,filename):
             search_meta_consensus=re.search(NAME_META_CONSENSUS,filename)
-            if ((search_meta_consensus and META_CONSENSUS) or (not search_meta_consensus and not META_CONSENSUS)):
+            search_diploid=re.search(NAME_DIPLOID,filename)
+            if ((search_meta_consensus and META_CONSENSUS)
+                or (not search_meta_consensus and not META_CONSENSUS and not DIPLOID and not search_diploid)
+                or (search_diploid and DIPLOID)):
                 threshold=filename.split(".")[0].split("_t")[-1]
                 if (threshold not in files):
                     files[threshold]={}
@@ -116,7 +142,6 @@ for threshold in files :
                 bases.append(base_pair)
             if (nb_read not in reads):
                 reads.append(nb_read)
-
             if (msa not in MSA[exp_name]):
                 MSA[exp_name].append(msa)
@@ -130,7 +155,6 @@ for threshold in files :
             MSA[exp_name]=sorted(MSA[exp_name])
     except:
         print(name_file , "doesn't exist.")
-
 #-----------------------------------------------------------------------------
 # Write to output files
 #-----------------------------------------------------------------------------
@@ -142,17 +166,18 @@ for threshold in files :
         attributes_to_display+=ATTRIBUTES_TO_DISPLAY_THRESHOLD_INDEPENDANT
     else:
         add_name_file_output=NAME_META_CONSENSUS
+    #print(attributes_to_display)
     for attribute in attributes_to_display:
         #Header for files
         if attribute in ATTRIBUTES_TO_DISPLAY_THRESHOLD_INDEPENDANT:
             pass
-            output_mean=open("results_mean/data_" + add_name_file_output + "mean_"+ attribute + ".csv","w")
+            output_mean=open(output_dir+"_mean/data_" + add_name_file_output + "mean_"+ attribute + ".csv","w")
             if(not re.search("^sd_",attribute)):
-                output_all=open("results_all_start_positions/data_" + add_name_file_output + "all_start_position_"+ attribute + ".csv","w")
+                output_all=open(output_dir+"_all_start_positions/data_" + add_name_file_output + "all_start_position_"+ attribute + ".csv","w")
         else:
-            output_mean=open("results_mean/data_" + add_name_file_output + "mean_"+ attribute + "_" + threshold + ".csv","w")
+            output_mean=open(output_dir+"_mean/data_" + add_name_file_output + "mean_"+ attribute + "_" + threshold + ".csv","w")
             if(not re.search("^sd_",attribute)):
-                output_all=open("results_all_start_positions/data_" + add_name_file_output + "all_start_position_"+ attribute + "_" + threshold + ".csv","w")
+                output_all=open(output_dir+"_all_start_positions/data_" + add_name_file_output + "all_start_position_"+ attribute + "_" + threshold + ".csv","w")
         output_mean.write(",,")
         if(not re.search("^sd_",attribute)):
             output_all.write(",,")
......
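With the new -o option every hard-coded results_* path becomes output_dir plus a suffix, which is what lets summary -p prepend a prefix to both output folders. A small illustration (folder names invented; os.makedirs is the pure-Python equivalent of the script's shell "if [ ! -d ... ]; then mkdir ...; fi" calls):

# How "-o run3_results" (set by "summary -p run3") derives the two folders.
import os

output_dir = "run3_results"                    # default when -o is absent: "results"
mean_dir = output_dir + "_mean"                # run3_results_mean
all_dir = output_dir + "_all_start_positions"  # run3_results_all_start_positions
for d in (mean_dir, all_dir):
    os.makedirs(d, exist_ok=True)              # create only if missing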