Commit fa995991 authored by Rohmer Coralie

Diploid metric + new doc

parent da6034b8
doc/pipeline.jpg (87.8 KiB)

doc/pipeline.png (138 KiB)

@@ -2,6 +2,8 @@
 import sys,os,re
 import subprocess
 from datetime import datetime
+EXP = "experiments"
+START = "start_position"
 def usage():
     print("msa-limit: Launch the MSA analysis pipeline\n",
     "Usage:",
@@ -42,7 +44,7 @@ def usage():
     " -h",
     " help",
     "\nEx: ./msa-limit.py -i reads.fastq -r ref.fasta -b 1,15000 -n exp -d 10,100 -s 100,200 -t 50,75 -m mafft,poa",
-    "\nOther modes: ",
+    "\nOther modes: ./msa-limit.py [mode]",
     " test ",
     " Launches a pipeline test",
     " list ",
@@ -53,6 +55,9 @@ def usage():
     " -n <string> <string> <string> ...",
     " default: all the names of the experiments",
     " names of the experiments you want to display in the summary.",
+    " -p <string> ",
+    " default: nothing",
+    " Prefix for results folders.",
     " run_config <string> <string> ...",
     " Launches the pipeline from configuration file(s)",
     " required: path to the configuration file(s).",
@@ -60,6 +65,12 @@ def usage():
     " -c <int>",
     " default: 24",
     " number of cores",
+    " diploid_metric",
+    " Calculates more advanced metrics for the diploid case",
+    " optional: ",
+    " -n <string> <string> <string> ...",
+    " default: all the names of the experiments",
+    " names of the experiments for which you want to calculate the metrics.",
     " rulegraph",
     " Displays a graph of the snakemake rules"
     ,sep="\n")
@@ -83,13 +94,13 @@ def test():
     result = subprocess.run(["./src/snakemake_launcher.sh","configuration_files/test.yaml"])
 def experiments_list():
-    if os.path.exists("experiments"):
-        result = subprocess.run("ls experiments",shell=True)
+    if os.path.exists(EXP):
+        result = subprocess.run("ls " + EXP,shell=True)
     else:
         print("No experiment has been launched yet")
 def summary():
-    if os.path.exists("experiments"):
+    if os.path.exists(EXP):
         try:
             next_name=sys.argv[sys.argv.index("-n")+1]
             i=1
@@ -103,16 +114,27 @@ def summary():
                 except:
                     end_names = 1
         except:
-            result = subprocess.Popen("ls experiments",shell=True,stdout=subprocess.PIPE)
+            result = subprocess.Popen("ls " + EXP,shell=True,stdout=subprocess.PIPE)
             exp_names=result.communicate()[0].decode("utf-8")
             exp_names=re.sub('\n', r' ', exp_names)
-        if os.path.exists("results_mean"):
-            result = subprocess.run("rm -r results_mean",shell=True)
-        if os.path.exists("results_all_start_positions"):
-            result = subprocess.run("rm -r results_all_start_positions",shell=True)
-        result = subprocess.run("./src/total_data_format.py -n " + exp_names,shell=True)
-        result = subprocess.run("./src/total_data_format.py -m -n " + exp_names,shell=True)
-        print("See folders: results_mean & results_all_start_positions")
+        try:
+            prefix=sys.argv[sys.argv.index("-p")+1]
+            prefix=prefix+"_"
+        except:
+            prefix=""
+        if os.path.exists(prefix+"results_mean"):
+            result = subprocess.run("rm -r "+prefix+"results_mean",shell=True)
+        if os.path.exists(prefix+"results_all_start_positions"):
+            result = subprocess.run("rm -r "+prefix+"results_all_start_positions",shell=True)
+        result = subprocess.run("./src/total_data_format.py -o "+prefix+"results -n " + exp_names,shell=True)
+        result = subprocess.run("./src/total_data_format.py -o "+prefix+"results -m -n " + exp_names,shell=True)
+        try:
+            a = sys.argv[sys.argv.index("-d")]
+            result = subprocess.run("./src/total_data_format.py -o "+prefix+"results -d -n " + exp_names,shell=True)
+        except:
+            pass
+        print("See folders: "+prefix+"results_mean & "+prefix+"results_all_start_positions")
     else:
         print("No experiment has been launched yet")
@@ -143,6 +165,68 @@ def run_config():
             except:
                 end_files = 1
+def diploid_metric():
+    if os.path.exists(EXP):
+        try:
+            next_name=sys.argv[sys.argv.index("-n")+1]
+            i=1
+            end_names = 0
+            exp_names = []
+            while ( end_names == 0 and re.search("-", next_name) == None ):
+                if not os.path.exists(os.path.join(EXP,next_name)):
+                    print("ERROR: The name " + next_name + " isn't the name of an experiment")
+                else:
+                    exp_names.append(next_name)
+                i+=1
+                try:
+                    next_name = sys.argv[sys.argv.index("-n")+i]
+                except:
+                    end_names = 1
+        except:
+            result = subprocess.Popen("ls " + EXP,shell=True,stdout=subprocess.PIPE)
+            names=result.communicate()[0].decode("utf-8")
+            names=re.sub('\n', r' ', names)
+            exp_names=names.split(" ")
+        for exp_name in exp_names:
+            result = subprocess.Popen("ls " + os.path.join(EXP,exp_name) +"|grep "+ START ,shell=True,stdout=subprocess.PIPE)
+            dir=result.communicate()[0].decode("utf-8")
+            dir_start=dir.split("\n")
+            dir_start.pop()
+            threshold=""
+            output_files={}
+            for start in dir_start:
+                result = subprocess.Popen("ls " + os.path.join(EXP,exp_name,start,"seq_consensus")+"|grep align" ,shell=True,stdout=subprocess.PIPE)
+                files=result.communicate()[0].decode("utf-8")
+                p = re.compile('_t[0-9]+_')
+                thresholds=set(p.findall(files))
+                files=files.split('\n')
+                files.pop()
+                for threshold in thresholds:
+                    if threshold not in output_files:
+                        output_files[threshold]=""
+                    threshold_files=[]
+                    for file in files:
+                        p = re.compile(threshold)
+                        if (p.search(file)):
+                            threshold_files.append(os.path.join(EXP,exp_name,start,"seq_consensus",file))
+                    output_file=os.path.join(EXP,exp_name,start,"results",exp_name+"_diploid_data_align"+threshold[0:-1]+".csv")
+                    output_files[threshold]+=output_file + " "
+                    order=["./src/diploid_metric.py",'-in']
+                    for file in threshold_files:
+                        order.append(file)
+                    order.append("-o")
+                    order.append(output_file)
+                    result = subprocess.run(order)
+            for threshold in thresholds:
+                output_mean=os.path.join(EXP,exp_name,"results",exp_name+"_diploid_data_align"+threshold[0:-1]+".csv")
+                result = subprocess.run(["./src/region_mean_launcher.sh","-in ",output_files[threshold]
+                    ," -out ",output_mean," -t ",threshold[2:-1]])
+        print("Data generated. You will find them in the experiments results folder.")
+    else:
+        print("No experiment has been launched yet")
 def rulegraph():
     if not os.path.exists("test_data"):
         print("Preparation of the test data set")
@@ -175,6 +259,8 @@ elif mode == "summary":
     summary()
 elif mode == "run_config":
     run_config()
+elif mode == "diploid_metric":
+    diploid_metric()
 elif mode == "rulegraph":
     rulegraph()
 else:
......
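Throughout msa-limit.py, options such as -n and the new -p are parsed by scanning sys.argv by hand: the value(s) after a flag are consumed until a token containing "-" turns up, and a bare except falls back to a default. A minimal sketch of that pattern (not the commit's code; it uses startswith("-") where the scripts match "-" anywhere in the token):

# Sketch of the flag-scanning pattern used by summary() and diploid_metric().
import sys

def values_after(flag, argv=None):
    """Collect the tokens following `flag`, up to the next option-like token."""
    argv = sys.argv if argv is None else argv
    values = []
    try:
        i = argv.index(flag) + 1
    except ValueError:
        return values  # flag absent: plays the role of the scripts' bare `except`
    while i < len(argv) and not argv[i].startswith("-"):
        values.append(argv[i])
        i += 1
    return values

# ./msa-limit.py summary -n exp1 exp2 -p run3
# values_after("-n") -> ["exp1", "exp2"]; values_after("-p") -> ["run3"]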
@@ -20,7 +20,7 @@ def use():
     "To Run:",
     "./sequence_consensus.py -in <file1> <file2>... -t <file>\n",
     "Arguments: ",
-    " -required: -in : file alignement (fasta).",
+    " -required: -in : file alignement (exonerate).",
     " -t : output time_format.pl",sep="\n")
     sys.exit()
@@ -102,7 +102,7 @@ for file in files_alignement:
         if (read_size not in data[MSA]):
             data[MSA][read_size] = {}
         match=round(tool_consensus.score_between_two_iupac(hsp.hit.seq,seq.seq),1)
-        error=round(len(seq)-match,1)
+        error=round(len(seq)-nb_identity,1)
         nb_substitution = round(error - nb_deletion - nb_insertion,1)
         r_ambiguity=round( (nb_ambiguity/(size_seq)*100) ,1)
         r_identity=round( (nb_identity/len(seq)*100) ,1)
......
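The second hunk re-derives the error count from nb_identity instead of the IUPAC-aware match score, so identity and error now always sum to the alignment length. A worked sketch of the arithmetic, assuming nb_identity, nb_deletion and nb_insertion are the counts the script computes earlier from the exonerate alignment:

# Illustrative only: the corrected error breakdown, with invented counts.
def error_breakdown(aln_len, nb_identity, nb_deletion, nb_insertion):
    error = round(aln_len - nb_identity, 1)                 # was: aln_len - match
    nb_substitution = round(error - nb_deletion - nb_insertion, 1)
    r_identity = round(nb_identity / aln_len * 100, 1)
    return error, nb_substitution, r_identity

# 100 aligned columns, 92 identities, 3 deletions, 2 insertions
print(error_breakdown(100, 92, 3, 2))  # -> (8.0, 3.0, 92.0)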
#!/usr/bin/env python3
from Bio import SearchIO
import sys,re,os
from collections import Counter
import tool_consensus
"""
Project "The Limits of Multiple Aligners"
Authors: ROHMER Coralie
Date: 2020-02
Description:
"""
#Script use
def use():
    print("\nScript:\tdiploid_metric.py",
    "Objective: \n",
    "To Run:",
    "./diploid_metric.py -in <file1> <file2>... -o <file>",
    "Arguments: ",
    " -required: -in : alignment file(s) (exonerate).",
    "            -o : output file (csv).",sep="\n")
    sys.exit()
#Argument testing
try:
    files_alignement=[sys.argv[sys.argv.index("-in")+1]]
except:
    print("ERROR: The name of the input exonerate file(s) is missing.\n")
    use()
else:
    i=2
    try:
        next_file = sys.argv[sys.argv.index("-in")+i]
    except:
        end_files = 1
    else:
        end_files = 0
        while ( end_files == 0 and re.search("-", next_file) == None ):
            files_alignement.append(next_file)
            i+=1
            try:
                next_file = sys.argv[sys.argv.index("-in")+i]
            except:
                end_files = 1
try:
    output=sys.argv[sys.argv.index("-o")+1]
except:
    print("ERROR: The name of the output file is missing.\n")
    use()
#Main
data = {}
read_sizes=[]
depths=[]
for file in files_alignement:
    all_qresult = list(SearchIO.parse(file, 'exonerate-text'))
    for i in range(len(all_qresult)):
        qresult = all_qresult[i]
        id_query = qresult.id.split("_")
        nb_read = int(id_query[-1][1:])
        read_size = int(id_query[-2][1:])
        MSA = id_query[-3]
        hsp = qresult[0][0]
        similarity=hsp.aln_annotation['similarity']
        seq=hsp.query
        occurence_seq = Counter(seq)
        occurence_seq_hit = Counter(hsp.hit)
        occurence_similarity = Counter(similarity)
        #Count IUPAC ambiguity codes in the consensus (query) and reference (hit)
        nb_ambiguity_seq_consensus=0
        nb_ambiguity_ref=0
        for i in "RYSWKMBDHVN":
            if (i in occurence_seq):
                nb_ambiguity_seq_consensus += occurence_seq[i]
        for i in "RYSWKMBDHVN":
            if (i in occurence_seq_hit):
                nb_ambiguity_ref += occurence_seq_hit[i]
        #Classify each alignment column by what the IUPAC code faces
        identical_iupac=0
        iupac_vs_iupac=0
        iupac_vs_nuc=0
        iupac_vs_gap=0
        nuc_vs_iupac=0
        gap_vs_iupac=0
        for i in range(0,len(seq)):
            if(seq[i] in "RYSWKMBDHVN"):
                if( hsp.hit[i] in "RYSWKMBDHVN"):
                    if(seq[i]==hsp.hit[i]):
                        identical_iupac+=1
                    else:
                        iupac_vs_iupac+=1
                else:
                    if(hsp.hit[i]=="-"):
                        iupac_vs_gap+=1
                    else:
                        iupac_vs_nuc+=1
            else:
                if(hsp.hit[i] in "RYSWKMBDHVN"):
                    if(seq[i]=="-"):
                        gap_vs_iupac+=1
                    else:
                        nuc_vs_iupac+=1
        if (MSA not in data):
            data[MSA]={}
        if (read_size not in data[MSA]):
            data[MSA][read_size] = {}
        if (read_size not in read_sizes):
            read_sizes.append(read_size)
        if (nb_read not in depths):
            depths.append(nb_read)
        data[MSA][read_size][nb_read]=[nb_ambiguity_ref,nb_ambiguity_seq_consensus,identical_iupac,iupac_vs_iupac,iupac_vs_nuc,iupac_vs_gap,nuc_vs_iupac,gap_vs_iupac]
depths.sort()
read_sizes.sort()
#output
sep=","
output=open(output,"w")
output.write("MSA,region_size,depth,nb_iupac_ref,nb_iupac_seq_consensus,identical_iupac,iupac_vs_iupac,sc_iupac_vs_r_nuc,sc_iupac_vs_r_gap,sc_nuc_vs_r_iupac,sc_gap_vs_r_iupac\n")
for MSA in data:
    for read_size in read_sizes:
        if(read_size in data[MSA]):
            for nb_read in depths:
                if(nb_read in data[MSA][read_size]):
                    output.write(MSA + "," + str(read_size) + "," + str(nb_read))
                    for val in data[MSA][read_size][nb_read]:
                        output.write("," + str(val))
                    output.write("\n")
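The core of this new script is the column-by-column comparison above: each position where the consensus (query) or the reference (hit) carries an IUPAC ambiguity code is bucketed by what stands opposite it. A self-contained toy run of the same classification, with invented sequences already aligned one-to-one:

# Toy illustration of the IUPAC column classification (inputs invented).
IUPAC = set("RYSWKMBDHVN")

def classify_columns(consensus, reference):
    counts = {"identical_iupac": 0, "iupac_vs_iupac": 0, "iupac_vs_nuc": 0,
              "iupac_vs_gap": 0, "nuc_vs_iupac": 0, "gap_vs_iupac": 0}
    for c, r in zip(consensus, reference):
        if c in IUPAC and r in IUPAC:
            counts["identical_iupac" if c == r else "iupac_vs_iupac"] += 1
        elif c in IUPAC:
            counts["iupac_vs_gap" if r == "-" else "iupac_vs_nuc"] += 1
        elif r in IUPAC:
            counts["gap_vs_iupac" if c == "-" else "nuc_vs_iupac"] += 1
    return counts

# R/R identical, R/Y two different codes, R/A code vs nucleotide, -/N gap vs code
print(classify_columns("RRRA-", "RYAAN"))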
@@ -46,7 +46,6 @@ try:
 except:
     print("ERROR: The threshold is missing.\n")
     use()
-
 data={}
 data["order"]=[]
 attributes=[]
@@ -61,7 +60,6 @@ for file in files :
         msa=tab_line[0]
         region_size=tab_line[1]
         depth=tab_line[2]
-
         if (msa not in data):
             data[msa] = {}
             data["order"].append(msa)
@@ -90,9 +88,9 @@ for file in files :
         first_line+=attributes.pop(0)
         for attribute in attributes:
             first_line += "," + attribute + ",sd_" + attribute
         read.close()
     except:
         print(file,"doesn't exist.\n")
 out=open(output,"w")
 out.write(first_line)
 for msa in data["order"]:
......
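region_mean.py, invoked through the launcher added below, averages each attribute across start positions and pairs it with an sd_ column, which is what the first_line header construction above prepares for. A hedged sketch of that aggregation (mean_and_sd is an invented helper, not the script's API):

# One attribute measured at several start positions -> mean plus "sd_" column.
from statistics import mean, stdev

def mean_and_sd(values):
    m = round(mean(values), 1)
    s = round(stdev(values), 1) if len(values) > 1 else 0.0
    return m, s

# e.g. percentage_Identity at three start positions
print(mean_and_sd([91.8, 92.4, 92.1]))  # -> (92.1, 0.3)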
#!/bin/bash
source ./src/config_conda.sh
if [[ ! -d .conda_python3 ]]; then
    conda env create -p .conda_python3 -f env_conda/python3.yaml >/dev/null
fi
CURRENT_PATH=`pwd`
conda activate $CURRENT_PATH/.conda_python3
./src/region_mean.py "$@"
@@ -2,21 +2,31 @@
 import sys,re,os
 import subprocess
 EXP = "experiments"
-ATTRIBUTES_TO_DISPLAY=["percentage_Ambiguity","sd_percentage_Ambiguity","percentage_Identity","sd_percentage_Identity","percentage_Match","sd_percentage_Match","percentage_Error","sd_percentage_Error","percentage_Substitution","sd_percentage_Substitution","percentage_Deletion","sd_percentage_Deletion","percentage_Insertion","sd_percentage_Insertion"]
+ATTRIBUTES_TO_DISPLAY=["percentage_Ambiguity","sd_percentage_Ambiguity","percentage_Identity","sd_percentage_Identity","percentage_Match","sd_percentage_Match","percentage_Error","sd_percentage_Error","percentage_Substitution","sd_percentage_Substitution","percentage_Deletion","sd_percentage_Deletion","percentage_Insertion","sd_percentage_Insertion","size","sd_size"]
 ATTRIBUTES_TO_DISPLAY_THRESHOLD_INDEPENDANT=["time","memory","sd_time","sd_memory"]
 PREFIX="start_position_"
 RESULT_FOLDER="results"
 NAME_DATA_FILE="data_align_t"
 NAME_META_CONSENSUS="meta_consensus_"
+NAME_DIPLOID="diploid_data_align"
 #Script use
 def use():
     print("\nScript:\ttotal_data_formating.py",
     "Objective: \n",
     "To Run:",
-    "./total_data_formatting.py -e <exp_name1> <exp_name2> <exp_name3> ...\n",
+    "./total_data_formatting.py -n <exp_name1> <exp_name2> <exp_name3> ...\n",
     "Arguments: ",
-    " -required: -n : name of the experiments you want to display.",sep="\n")
+    " required: ",
+    " -n <string> <string>... : name of the experiments you want to display.",
+    " optional: ",
+    " -o <string> : output folders",
+    " -m : for meta consensus",
+    " -d : for diploid metric",sep="\n")
     sys.exit()
+try:
+    output_dir= sys.argv[sys.argv.index("-o")+1]
+except:
+    output_dir="results"
 try:
     exp_names= [sys.argv[sys.argv.index("-n")+1]]
@@ -41,15 +51,28 @@ else:
             end_names = 1
 META_CONSENSUS=True
+DIPLOID=False
 try:
-    a = [sys.argv[sys.argv.index("-m")]]
+    a = sys.argv[sys.argv.index("-m")]
 except:
     META_CONSENSUS=False
+try:
+    a = sys.argv[sys.argv.index("-d")]
+    DIPLOID=True
+    ATTRIBUTES_TO_DISPLAY=["nb_iupac_ref","nb_iupac_seq_consensus","identical_iupac",
+        "iupac_vs_iupac","sc_iupac_vs_r_nuc","sc_iupac_vs_r_gap","sc_nuc_vs_r_iupac",
+        "sc_gap_vs_r_iupac",
+        "sd_nb_iupac_ref","sd_nb_iupac_seq_consensus","sd_identical_iupac","sd_iupac_vs_iupac",
+        "sd_sc_iupac_vs_r_nuc","sd_sc_iupac_vs_r_gap","sd_sc_nuc_vs_r_iupac","sd_sc_gap_vs_r_iupac"]
+    ATTRIBUTES_TO_DISPLAY_THRESHOLD_INDEPENDANT=[]
+except:
+    pass
 files={}
 i=0
-result = subprocess.run("if [ ! -d results_mean ]; then mkdir results_mean;fi",shell=True)
-result = subprocess.run("if [ ! -d results_all_start_positions ]; then mkdir results_all_start_positions;fi",shell=True)
+result = subprocess.run("if [ ! -d "+output_dir+"_mean ]; then mkdir "+output_dir+"_mean;fi",shell=True)
+result = subprocess.run("if [ ! -d "+output_dir+"_all_start_positions ]; then mkdir "+output_dir+"_all_start_positions;fi",shell=True)
 #-----------------------------------------------------------------------------
 # Retrieve what is needed to read the files
 #-----------------------------------------------------------------------------
@@ -65,7 +88,10 @@ for i in range(len(exp_names)):
     for filename in current_files:
         if re.search(NAME_DATA_FILE,filename):
             search_meta_consensus=re.search(NAME_META_CONSENSUS,filename)
-            if ((search_meta_consensus and META_CONSENSUS) or (not search_meta_consensus and not META_CONSENSUS)):
+            search_diploid=re.search(NAME_DIPLOID,filename)
+            if ((search_meta_consensus and META_CONSENSUS)
+                or (not search_meta_consensus and not META_CONSENSUS and not DIPLOID and not search_diploid)
+                or (search_diploid and DIPLOID)):
                 threshold=filename.split(".")[0].split("_t")[-1]
                 if (threshold not in files):
                     files[threshold]={}
@@ -116,7 +142,6 @@ for threshold in files :
                 bases.append(base_pair)
             if (nb_read not in reads):
                 reads.append(nb_read)
-
             if (msa not in MSA[exp_name]):
                 MSA[exp_name].append(msa)
@@ -130,7 +155,6 @@ for threshold in files :
             MSA[exp_name]=sorted(MSA[exp_name])
     except:
         print(name_file , "doesn't exist.")
-
 #-----------------------------------------------------------------------------
 # Write to output files
 #-----------------------------------------------------------------------------
@@ -142,17 +166,18 @@ for threshold in files :
         attributes_to_display+=ATTRIBUTES_TO_DISPLAY_THRESHOLD_INDEPENDANT
     else:
         add_name_file_output=NAME_META_CONSENSUS
+    #print(attributes_to_display)
     for attribute in attributes_to_display:
         #Header for files
         if attribute in ATTRIBUTES_TO_DISPLAY_THRESHOLD_INDEPENDANT:
             pass
-            output_mean=open("results_mean/data_" + add_name_file_output + "mean_"+ attribute + ".csv","w")
+            output_mean=open(output_dir+"_mean/data_" + add_name_file_output + "mean_"+ attribute + ".csv","w")
             if(not re.search("^sd_",attribute)):
-                output_all=open("results_all_start_positions/data_" + add_name_file_output + "all_start_position_"+ attribute + ".csv","w")
+                output_all=open(output_dir+"_all_start_positions/data_" + add_name_file_output + "all_start_position_"+ attribute + ".csv","w")
         else:
-            output_mean=open("results_mean/data_" + add_name_file_output + "mean_"+ attribute + "_" + threshold + ".csv","w")
+            output_mean=open(output_dir+"_mean/data_" + add_name_file_output + "mean_"+ attribute + "_" + threshold + ".csv","w")
             if(not re.search("^sd_",attribute)):
-                output_all=open("results_all_start_positions/data_" + add_name_file_output + "all_start_position_"+ attribute + "_" + threshold + ".csv","w")
+                output_all=open(output_dir+"_all_start_positions/data_" + add_name_file_output + "all_start_position_"+ attribute + "_" + threshold + ".csv","w")
         output_mean.write(",,")
         if(not re.search("^sd_",attribute)):
             output_all.write(",,")
......
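With the new -o option every hard-coded results_* path becomes output_dir plus a suffix, which is what lets summary -p prepend a prefix to both output folders. A small illustration (folder names invented; os.makedirs is the pure-Python equivalent of the script's shell "if [ ! -d ... ]; then mkdir ...; fi" calls):

# How "-o run3_results" (set by "summary -p run3") derives the two folders.
import os

output_dir = "run3_results"                    # default when -o is absent: "results"
mean_dir = output_dir + "_mean"                # run3_results_mean
all_dir = output_dir + "_all_start_positions"  # run3_results_all_start_positions
for d in (mean_dir, all_dir):
    os.makedirs(d, exist_ok=True)              # create only if missing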