Commit 69d79b42 authored by Rohmer Coralie

Meta consensus

parent cfea4c00
@@ -114,7 +114,7 @@ rule all :
expand(EXP + "/" + EXP_NAME + "/results/" + EXP_NAME + "_data_align_t{threshold}.csv", data_set=DATA_SETS, threshold=THRESHOLDS),
expand('{data_set}/results/' + EXP_NAME + '_graph_{attribute}.pdf', data_set=DATA_SETS, attribute=ATTRIBUTES_DATA),
expand(EXP + '/'+EXP_NAME + '/results/' + EXP_NAME + '_graph_{attribute}.pdf', data_set=DATA_SETS, attribute=ATTRIBUTES_DATA),
- expand(EXP + "/" + EXP_NAME + "/results/" + EXP_NAME + "_consensus_consensus_data_align_t{threshold}.csv", threshold=THRESHOLDS)
+ expand(EXP + "/" + EXP_NAME + "/results/" + EXP_NAME + "_meta_consensus_data_align_t{threshold}.csv", threshold=THRESHOLDS)
#-------------------------------------------------------------------------------
# Data set preparation
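For orientation, a minimal sketch of how Snakemake's expand() resolves the renamed meta-consensus target in rule all; EXP, EXP_NAME and THRESHOLDS below are illustrative values, not the project's configuration:

# Sketch only: expand() resolving the renamed target (values are illustrative).
from snakemake.io import expand

EXP, EXP_NAME = "experiments", "demo"
THRESHOLDS = [50, 60]
targets = expand(
    EXP + "/" + EXP_NAME + "/results/" + EXP_NAME
    + "_meta_consensus_data_align_t{threshold}.csv",
    threshold=THRESHOLDS,
)
# -> ['experiments/demo/results/demo_meta_consensus_data_align_t50.csv',
#     'experiments/demo/results/demo_meta_consensus_data_align_t60.csv']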
@@ -387,7 +387,7 @@ rule sequence_consensus :
input :
expand("{{data_set}}/msa/MSA_{msa}_r{{region_size}}_d{depth}.fasta" , msa = MSA, depth=DEPTHS)
output :
"{data_set}/seq_consensus/t{threshold}/r{region_size}/seq_consensus.fasta"
"{data_set}/seq_consensus/seq_consensus_t{threshold}_r{region_size}.fasta"
message:
"Sequence_consensus for {wildcards.data_set} (Threshold={wildcards.threshold} & Region size={wildcards.region_size})"
log:
@@ -404,10 +404,10 @@ rule sequence_consensus :
#-------------------------------------------------------------------------------
rule alignment_consensus_ref :
input :
consensus="{data_set}/seq_consensus/t{threshold}/r{region_size}/seq_consensus.fasta",
consensus="{data_set}/seq_consensus/seq_consensus_t{threshold}_r{region_size}.fasta",
region="{data_set}/seq_selectes_region/region_seq_r{region_size}.fasta"
output :
"{data_set}/seq_consensus/t{threshold}/r{region_size}/align_consensus_ref.txt",
"{data_set}/seq_consensus/align_consensus_ref_t{threshold}_r{region_size}.txt",
message:
"Alignment_consensus_ref for {wildcards.data_set} (Threshold={wildcards.threshold} & Region size={wildcards.region_size})"
log:
@@ -429,7 +429,7 @@ rule alignment_consensus_ref :
#-------------------------------------------------------------------------------
rule data_formatting :
input :
- exo = expand("{{data_set}}/seq_consensus/t{{threshold}}/r{region_size}/align_consensus_ref.txt" , region_size=REGION_SIZES),
+ exo = expand("{{data_set}}/seq_consensus/align_consensus_ref_t{{threshold}}_r{region_size}.txt" , region_size=REGION_SIZES),
time = "{data_set}/time/all_MSA_time.csv"
output :
"{data_set}/results/"+EXP_NAME+"_data_align_t{threshold}.csv"
@@ -496,13 +496,13 @@ rule data_time :
'$ORDER >{output} 2>>{log}'
# -------------------------------------------------------------------------------
- # Consensus of consensus
+ # Meta consensus
# -------------------------------------------------------------------------------
rule separate_consensus :
input :
"{data_set}/seq_consensus/t{threshold}/r{region_size}/seq_consensus.fasta"
"{data_set}/seq_consensus/seq_consensus_t{threshold}_r{region_size}.fasta"
output :
"{data_set}/seq_consensus/t{threshold}/r{region_size}/seq_consensus_r{region_size}_d{depth}.fasta"
"{data_set}/seq_consensus/separate_by_depth/seq_consensus_t{threshold}_r{region_size}_d{depth}.fasta"
message:
"Separate consensus for {wildcards.data_set} (Threshold={wildcards.threshold}, Region size={wildcards.region_size} & Depth={wildcards.depth})"
log:
@@ -512,31 +512,30 @@ rule separate_consensus :
'echo "ORDER: $ORDER" >{log};'
'$ORDER >{output} 2>>{log}'
- rule consensus_msa :
+ rule msa_consensus :
input :
"{data_set}/seq_consensus/t{threshold}/r{region_size}/seq_consensus_r{region_size}_d{depth}.fasta"
"{data_set}/seq_consensus/separate_by_depth/seq_consensus_t{threshold}_r{region_size}_d{depth}.fasta"
output :
- time = os.path.join('{data_set}','time','consensus_msa_t{threshold}_r{region_size}_d{depth}'),
- out = os.path.join('{data_set}','seq_consensus','t{threshold}','r{region_size}','msa_consensus_r{region_size}_d{depth}.fasta')
+ time = os.path.join('{data_set}','time','msa_consensus_t{threshold}_r{region_size}_d{depth}'),
+ out = os.path.join('{data_set}','meta_consensus','msa_consensus_t{threshold}_r{region_size}_d{depth}.fasta')
message:
"Consensus msa for {wildcards.data_set} (Threshold={wildcards.threshold}, Region size={wildcards.region_size} & Depth={wildcards.depth})"
"MSA Consensus for {wildcards.data_set} (Threshold={wildcards.threshold}, Region size={wildcards.region_size} & Depth={wildcards.depth})"
log:
"{data_set}/logs/15_consensus_msa_t{threshold}_r{region_size}_d{depth}.log"
"{data_set}/logs/15_msa_consensus_t{threshold}_r{region_size}_d{depth}.log"
conda:
"env_conda/muscle.yaml"
shell:
'./src/run_MSA.sh "muscle -in {input} -out {output.out}" {input} {output.out} {output.time} {log} 1'
- rule consensus_consensus:
+ rule meta_consensus:
input :
- expand('{{data_set}}/seq_consensus/t{{threshold}}/r{{region_size}}/msa_consensus_r{{region_size}}_d{depth}.fasta',depth=DEPTHS)
+ expand('{{data_set}}/meta_consensus/msa_consensus_t{{threshold}}_r{{region_size}}_d{depth}.fasta',depth=DEPTHS)
output :
"{data_set}/seq_consensus/t{threshold}/r{region_size}/consensus_consensus_r{region_size}.fasta"
"{data_set}/meta_consensus/meta_consensus_t{threshold}_r{region_size}.fasta"
message:
"Consensus consensus for {wildcards.data_set} (Threshold={wildcards.threshold} & Region size={wildcards.region_size})"
"Meta consensus for {wildcards.data_set} (Threshold={wildcards.threshold} & Region size={wildcards.region_size})"
log:
"{data_set}/logs/16_consensus_consensus_t{threshold}_r{region_size}.log"
"{data_set}/logs/16_meta_consensus_t{threshold}_r{region_size}.log"
conda:
"env_conda/python3.yaml"
shell:
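Taken together, the rules above split the consensus sequences by depth, re-align each group with MUSCLE, and then derive a single sequence from that alignment. A simplified sketch of that final step, assuming Biopython and a plain majority vote (the repository's own consensus script applies thresholds and IUPAC codes, so this is only an approximation):

# Simplified sketch: consensus of an MSA of consensus sequences
# (plain per-column majority vote, gaps dropped from the output).
from collections import Counter
from Bio import AlignIO

def meta_consensus(msa_fasta):
    aln = AlignIO.read(msa_fasta, "fasta")
    columns = (aln[:, i] for i in range(aln.get_alignment_length()))
    return "".join(Counter(col).most_common(1)[0][0] for col in columns).replace("-", "")

# e.g. meta_consensus("ds1/meta_consensus/msa_consensus_t50_r100_d10.fasta")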
@@ -544,16 +543,16 @@ rule consensus_consensus:
'echo "ORDER: $ORDER" >{log};'
'$ORDER >{output} 2>>{log}'
- rule alignment_consensus_consensus_ref :
+ rule alignment_meta_consensus_ref :
input :
- consensus=os.path.join('{data_set}','seq_consensus','t{threshold}','r{region_size}','consensus_consensus_r{region_size}.fasta'),
+ consensus=os.path.join('{data_set}','meta_consensus','meta_consensus_t{threshold}_r{region_size}.fasta'),
region="{data_set}/seq_selectes_region/region_seq_r{region_size}.fasta"
output :
"{data_set}/seq_consensus/t{threshold}/r{region_size}/align_consensus_consensus_ref_r{region_size}.txt",
"{data_set}/meta_consensus/align_meta_consensus_ref_t{threshold}_r{region_size}.txt",
message:
"Alignment_consensus_consensus_ref for {wildcards.data_set} (Threshold={wildcards.threshold} & Region size={wildcards.region_size})"
"Alignment meta consensus ref for {wildcards.data_set} (Threshold={wildcards.threshold} & Region size={wildcards.region_size})"
log:
"{data_set}/logs/17_alignment_consensus_consensus_ref_t{threshold}_r{region_size}.log"
"{data_set}/logs/17_alignment_meta_consensus_ref_t{threshold}_r{region_size}.log"
conda:
"env_conda/exonerate.yaml"
shell :
@@ -566,15 +565,15 @@ rule alignment_consensus_consensus_ref :
' touch {output};'
'fi'
- rule consensus_consensus_data_formatting :
+ rule meta_consensus_data_formatting :
input :
expand("{{data_set}}/seq_consensus/t{{threshold}}/r{region_size}/align_consensus_consensus_ref_r{region_size}.txt" , region_size=REGION_SIZES, depth=DEPTHS),
expand("{{data_set}}/meta_consensus/align_meta_consensus_ref_t{{threshold}}_r{region_size}.txt" , region_size=REGION_SIZES),
output :
"{data_set}/results/"+EXP_NAME+"_consensus_consensus_data_align_t{threshold}.csv"
"{data_set}/results/"+EXP_NAME+"_meta_consensus_data_align_t{threshold}.csv"
message:
"Consensus consensus data formatting for {wildcards.data_set} (Threshold={wildcards.threshold})"
"Meta consensus data formatting for {wildcards.data_set} (Threshold={wildcards.threshold})"
log:
"{data_set}/logs/18_consensus_consensus_data_formatting_t{threshold}.log"
"{data_set}/logs/18_meta_consensus_data_formatting_t{threshold}.log"
conda:
"env_conda/python3.yaml"
shell :
@@ -582,15 +581,15 @@ rule consensus_consensus_data_formatting :
'echo "ORDER: $ORDER" >{log};'
'$ORDER >{output} 2>>{log}'
- rule consensus_consensus_region_mean:
+ rule meta_consensus_region_mean:
input :
expand("{data_set}/results/"+EXP_NAME+"_consensus_consensus_data_align_t{{threshold}}.csv" , data_set = DATA_SETS)
expand("{data_set}/results/"+EXP_NAME+"_meta_consensus_data_align_t{{threshold}}.csv" , data_set = DATA_SETS)
output :
- EXP + '/'+ EXP_NAME + "/results/"+EXP_NAME+"_consensus_consensus_data_align_t{threshold}.csv"
+ EXP + '/'+ EXP_NAME + "/results/"+EXP_NAME+"_meta_consensus_data_align_t{threshold}.csv"
message:
"Consensus consensus region mean for " + EXP + '/'+ EXP_NAME + " (threshold={wildcards.threshold})"
"Meta consensus region mean for " + EXP + '/'+ EXP_NAME + " (threshold={wildcards.threshold})"
log:
- EXP + '/'+EXP_NAME + "/logs/19_consensus_consensus_region_mean_t{threshold}.log"
+ EXP + '/'+EXP_NAME + "/logs/19_meta_consensus_region_mean_t{threshold}.log"
conda:
"env_conda/python3.yaml"
shell :
......
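The meta_consensus_region_mean rule above averages the per-dataset CSVs into one experiment-level table. A hypothetical pandas equivalent, with column names invented purely for illustration (the actual script and its columns are not shown in this diff):

# Hypothetical aggregation in the spirit of meta_consensus_region_mean;
# the column names "region" and "identity" are invented for illustration.
import pandas as pd

def region_mean(csv_paths, out_csv):
    merged = pd.concat((pd.read_csv(p) for p in csv_paths), ignore_index=True)
    merged.groupby("region", as_index=False)["identity"].mean().to_csv(out_csv, index=False)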
D: [10,20,50]
S: [100,200]
T: [50]
- M: [muscle,mafft,poa,spoa,abpoa,kalign,kalign3,clustalo,tcoffee]
+ M: [muscle,mafft,poa,spoa,abpoa,kalign,kalign3]
O: 10
@@ -77,15 +77,10 @@ def config_conda():
sys.exit()
def test():
- answer = input("Do you want to run a test? (y/n) ")
- if (answer == 'y' or answer == 'Y' ):
- pass
print("Launch test")
if not os.path.exists("test_data"):
print("Preparation of the test data set")
result = subprocess.run("./src/test_data.sh")
print("Launch Snakemake")
result = subprocess.run(["./src/snakemake_launcher.sh","configuration_files/test.yaml"])
- else :
- usage()
def experiments_list():
if os.path.exists("experiments"):
@@ -111,8 +106,13 @@ def summary():
result = subprocess.Popen("ls experiments",shell=True,stdout=subprocess.PIPE)
exp_names=result.communicate()[0].decode("utf-8")
exp_names=re.sub('\n', r' ', exp_names)
result = subprocess.run("./src/total_data_format.py -m -n " + exp_names,shell=True)
if os.path.exists("results_mean"):
result = subprocess.run("rm -r results_mean",shell=True)
if os.path.exists("results_all_regions"):
result = subprocess.run("rm -r results_all_regions",shell=True)
result = subprocess.run("./src/total_data_format.py -n " + exp_names,shell=True)
result = subprocess.run("./src/total_data_format.py -m -n " + exp_names,shell=True)
print("See folders: results_mean & results_all_regions")
else:
print("No experiment has been launched yet")
......
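The reworked summary() clears stale result folders by shelling out to rm -r before regenerating them. A pure-Python alternative (not the repository's code) would use the standard library:

# Alternative to the "rm -r" subprocess calls: remove stale result folders
# with the standard library before regenerating them.
import shutil
from pathlib import Path

for folder in ("results_mean", "results_all_regions"):
    if Path(folder).exists():
        shutil.rmtree(folder)  # same effect as: rm -r <folder>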
@@ -97,6 +97,7 @@ for file_name in file_names:
if bool_file_type != None:
pass
bool_fasta=re.search(">", infile.read())
+ #alignments[0]=""
if bool_fasta != None:
pass
alignments = list(AlignIO.parse(file_name, "fasta"))
@@ -105,9 +106,8 @@ for file_name in file_names:
else:
print("ERROR:The format of",file_name,"isn't correct.\n")
use()
#Retrieves the consensus sequence
- try:
+ if (len(alignments) >= 1 ):
if (iupac):
seq=tool_consensus.consensus_iupac(alignments[0],threshold)
else:
@@ -116,5 +116,5 @@ for file_name in file_names:
else:
seq=tool_consensus.consensus(alignments[0],threshold)
print(">" + file_name.split("/")[-1].split(".")[0] + "\n" + seq)
- except:
+ else:
print("",end="")
@@ -13,5 +13,5 @@ if [[ $2 == "" ]]; then
else
CORES=$2
fi
echo "Launche Snakemake with $CONFIG_EXPERIMENT"
echo "Launch Snakemake with $CONFIG_EXPERIMENT"
snakemake --configfile $CONFIG_EXPERIMENT -c$CORES --use-conda --rerun-incomplete
@@ -7,6 +7,11 @@ from Bio.pairwise2 import format_alignment
from Bio import SeqIO
nuc_to_iupac={}
nuc_to_iupac["-"]="-"
nuc_to_iupac["A"]="A"
nuc_to_iupac["G"]="G"
nuc_to_iupac["C"]="C"
nuc_to_iupac["T"]="T"
nuc_to_iupac["AG"]="R"
nuc_to_iupac["CT"]="Y"
nuc_to_iupac["CG"]="S"
@@ -145,41 +150,67 @@ def majority_consensus(alignment):
summary_align = AlignInfo.SummaryInfo(alignment)
nb_reads = len(summary_align.get_column(0))
align_size = len(alignment[1].seq)
letters = summary_align._get_all_letters()
seq=""
#Run the alignment column by column
for i in range(align_size):
pass
- nucleotides=summary_align.get_column(i)
+ nucleotides_IUPAC=summary_align.get_column(i)
+ freq_gap=len(re.findall("-",nucleotides_IUPAC))/nb_reads
+ if(freq_gap < 0.5):
- frequency_letters={}
- max = [0,[]]
#Calculates the frequency of each letter
#and keeps the letter(s) with the maximum
- max_frequency = 0
+ nucleotides_no_IUPAC=[]
+ for nuc in nucleotides_IUPAC:
+ nucleotides_no_IUPAC.append(iupac_to_nuc[nuc])
+ max_counter = 0
max_letters=[]
- for letter in letters:
- pass
- frequency_letter=(len(re.findall(letter,nucleotides))/nb_reads)
- if max_frequency < frequency_letter:
- pass
- max_frequency = frequency_letter
for letter in "ATGC":
letter_counter=0;
for nuc in nucleotides_no_IUPAC:
if(nuc.find(letter) > -1):
letter_counter+=1/len(nuc)
if(letter_counter > max_counter):
max_counter = letter_counter
max_letters = [letter]
else:
- if max_frequency==frequency_letter:
- pass
- max_letters = max_letters + [letter]
- if len(max_letters) <= 1:
- if max_letters[0] != '-':
- pass
- seq+=max_letters[0]
- else:
- max_letters.sort()
+ if(letter_counter == max_counter):
+ max_letters.append(letter)
+ if ( len(max_letters) > 1):
id_iupac = "".join((max_letters))
id_iupac = id_iupac.replace('-','')
seq+=nuc_to_iupac[id_iupac]
+ else:
+ seq+=max_letters[0]
return seq
+ # max_frequency = 0
+ # max_letters=[]
+ # for letter in letters:
+ # pass
+ # frequency_letter=(len(re.findall(letter,nucleotides))/nb_reads)
+ # if max_frequency < frequency_letter:
+ # pass
+ # max_frequency = frequency_letter
+ # max_letters = [letter]
+ # else:
+ # if max_frequency==frequency_letter:
+ # pass
+ # max_letters = max_letters + [letter]
+ # if len(max_letters) <= 1:
+ # if max_letters[0] != '-':
+ # pass
+ # seq+=max_letters[0]
+ # else:
+ # max_letters.sort()
+ # id_iupac = "".join((max_letters))
+ # id_iupac = id_iupac.replace('-','')
+ # try:
+ # seq+=nuc_to_iupac[id_iupac]
+ # except:
+ # seq+="["+id_iupac+"]"
#Creation of the consensus sequence
def index_jaccard(nuc_ref,nuc_cons):
nb_nuc_ref = len(nuc_ref)
......
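Condensed, the rewritten majority_consensus handles each alignment column as follows: IUPAC symbols are expanded to plain bases, every expanded base contributes 1/len(expansion) to its count, columns that are at least half gaps are skipped, and a tie between top-scoring bases is re-encoded as an IUPAC code. A self-contained sketch of that per-column rule (the dictionaries stand in for the module's iupac_to_nuc and nuc_to_iupac):

# Per-column rule of the rewritten majority consensus, condensed.
IUPAC_TO_NUC = {"-": "-", "A": "A", "C": "C", "G": "G", "T": "T",
                "R": "AG", "Y": "CT", "S": "CG", "W": "AT", "K": "GT", "M": "AC",
                "B": "CGT", "D": "AGT", "H": "ACT", "V": "ACG", "N": "ACGT"}
NUC_TO_IUPAC = {v: k for k, v in IUPAC_TO_NUC.items()}

def column_consensus(column):
    # Consensus symbol for one alignment column; '' when gaps are the majority.
    if column.count("-") / len(column) >= 0.5:
        return ""
    weights = {b: 0.0 for b in "ACGT"}
    for symbol in column:
        bases = IUPAC_TO_NUC[symbol].replace("-", "")
        for b in bases:
            weights[b] += 1.0 / len(bases)  # ambiguous symbols are split fractionally
    best = max(weights.values())
    winners = sorted(b for b, w in weights.items() if w == best)
    return NUC_TO_IUPAC["".join(winners)]

# column_consensus("AAG-") -> 'A'    column_consensus("AGRR") -> 'R'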
@@ -7,7 +7,7 @@ ATTRIBUTES_TO_DISPLAY_THRESHOLD_INDEPENDANT=["time","memory"]
PREFIX="region_"
RESULT_FOLDER="results"
NAME_DATA_FILE="data_align_t"
NAME_META_CONSENSUS="consensus_consensus_"
NAME_META_CONSENSUS="meta_consensus_"
#Script use
def use():
print("\nScript:\ttotal_data_formating.py",
@@ -181,7 +181,6 @@ for threshold in files :
bases=sorted(bases)
MSA=sorted(MSA)
reads=sorted(reads)
- print(reads,bases,MSA)
for base in bases :
for read in reads :
output_mean.write(str(base) + "," + str(read) + ",")
@@ -206,4 +205,3 @@ for threshold in files :
output_all.write("\n")
output_mean.write("\n")
output_all.write("\n")
print("See folders: results_mean & results_all_regions")