add consensus of consensus

08241ced · Rohmer Coralie · 74ba567c · 08241ced · 08241ced · 08241ced
Commit 08241ced authored 3 years ago by Rohmer Coralie
--- a/Snakefile
+++ b/Snakefile
@@ -113,7 +113,8 @@ rule all :
    input :
        expand(EXP + "/" + EXP_NAME + "/results/" + EXP_NAME + "_data_align_t{threshold}.csv", data_set=DATA_SETS, threshold=THRESHOLDS),
        expand('{data_set}/results/' + EXP_NAME + '_graph_{attribute}.pdf', data_set=DATA_SETS, attribute=ATTRIBUTES_DATA),
-        expand(EXP + '/'+EXP_NAME + '/results/' + EXP_NAME + '_graph_{attribute}.pdf', data_set=DATA_SETS, attribute=ATTRIBUTES_DATA)
+        expand(EXP + '/'+EXP_NAME + '/results/' + EXP_NAME + '_graph_{attribute}.pdf', data_set=DATA_SETS, attribute=ATTRIBUTES_DATA),
+        expand("{data_set}/seq_consensus/t{threshold}/r{region_size}/align_consensus_consensus_ref_d{depth}.txt", data_set=DATA_SETS, threshold=THRESHOLDS,region_size=REGION_SIZES, depth=DEPTHS)

 #-------------------------------------------------------------------------------
 # Data set preparation
@@ -493,3 +494,73 @@ rule data_time :
        'ORDER="./src/data_time.pl {input}";'
        'echo "ORDER: $ORDER" >{log};'
        '$ORDER >{output} 2>>{log}'
+
+# -------------------------------------------------------------------------------
+# Consensus of consensus
+# -------------------------------------------------------------------------------
+rule separate_consensus :
+    input :
+        "{data_set}/seq_consensus/t{threshold}/r{region_size}/seq_consensus.fasta"
+    output :
+        "{data_set}/seq_consensus/t{threshold}/r{region_size}/seq_consensus_d{depth}.fasta"
+    message:
+        "Separate consensus for {wildcards.data_set} (Threshold={wildcards.threshold}, Region size={wildcards.region_size} & Depth={wildcards.depth})"
+    log:
+        "{data_set}/logs/14_separate_consensus_t{threshold}_r{region_size}_d{depth}.log"
+    shell:
+        'ORDER="./src/separate_consensus.sh {input} {wildcards.depth} {output}";'
+        'echo "ORDER: $ORDER" >{log};'
+        '$ORDER >{output} 2>>{log}'
+
+rule consensus_msa :
+    input :
+        "{data_set}/seq_consensus/t{threshold}/r{region_size}/seq_consensus_d{depth}.fasta"
+    output :
+        time = os.path.join('{data_set}','time','consensus_msa_t{threshold}_r{region_size}_d{depth}'),
+        out = os.path.join('{data_set}','seq_consensus','t{threshold}','r{region_size}','consensus_msa_d{depth}.fasta')
+    message:
+        "Consensus msa for {wildcards.data_set} (Threshold={wildcards.threshold}, Region size={wildcards.region_size} & Depth={wildcards.depth})"
+    log:
+        "{data_set}/logs/15_consensus_msa_t{threshold}_r{region_size}_d{depth}.log"
+    conda:
+        "env_conda/muscle.yaml"
+    shell:
+        './src/run_MSA.sh "muscle -in {input} -out {output.out}" {input} {output.out} {output.time} {log} 1'
+
+rule consensus_consensus:
+    input :
+        os.path.join('{data_set}','seq_consensus','t{threshold}','r{region_size}','consensus_msa_d{depth}.fasta')
+    output :
+        os.path.join('{data_set}','seq_consensus','t{threshold}','r{region_size}','consensus_consensus_d{depth}.fasta')
+    message:
+        "Consensus consensus for {wildcards.data_set} (Threshold={wildcards.threshold}, Region size={wildcards.region_size} & Depth={wildcards.depth})"
+    log:
+        "{data_set}/logs/16_consensus_consensus_t{threshold}_r{region_size}_d{depth}.log"
+    conda:
+        "env_conda/python3.yaml"
+    shell:
+        'ORDER="./src/sequence_consensus.py -m -in {input}";'
+        'echo "ORDER: $ORDER" >{log};'
+        '$ORDER >{output} 2>>{log}'
+
+rule alignment_consensus_consensus_ref :
+    input :
+        consensus=os.path.join('{data_set}','seq_consensus','t{threshold}','r{region_size}','consensus_consensus_d{depth}.fasta'),
+        region="{data_set}/seq_selectes_region/region_seq_r{region_size}.fasta"
+    output :
+        "{data_set}/seq_consensus/t{threshold}/r{region_size}/align_consensus_consensus_ref_d{depth}.txt",
+    message:
+        "Alignment_consensus_consensus_ref for {wildcards.data_set} (Threshold={wildcards.threshold}, Region size={wildcards.region_size} & Depth={wildcards.depth})"
+    log:
+        "{data_set}/logs/17_alignment_consensus_consensus_ref_t{threshold}_r{region_size}_d{depth}.log"
+    conda:
+        "env_conda/exonerate.yaml"
+    shell :
+        'if [ `cat {input.consensus}|grep ">"|wc -l` -ne 0 ]; then'
+        '   ORDER="exonerate -Q dna -E -m a:g {input.consensus} {input.region}";'
+        '   echo "ORDER: $ORDER" >{log};'
+        '   $ORDER >{output} 2>>{log};'
+        'else'
+        '   echo "ERROR: No sequences" >>{log};'
+        '   touch {output};'
+        'fi'
--- a/src/separate_consensus.sh
+++ b/src/separate_consensus.sh
+cat $1 |tr '\n' '@'| sed 's/@>/\n>/g'|grep d$2|tr '@' '\n'>$3
--- a/src/sequence_consensus.py
+++ b/src/sequence_consensus.py
@@ -28,6 +28,7 @@ def use():
          "             -t  : threshold above which the nucleotide is kept in percent.",
          "  -optional: -h: display help.",
 		  "             -I: use IUPAC code.\n",
+		  "             -m: use majority consensus. (useless -t)\n",
 		  "Note: Nucleotide kept if highest frequency and above threshold.",
 		  "      In case of a tie, an N is kept (no IUPAC code). If equal",
 		  "      with a gap, it's the nucleotide is preserved.\n",sep="\n")
@@ -62,10 +63,15 @@ else:
 			next_file = sys.argv[sys.argv.index("-in")+i]
 		except:
 			end_files = 1
-
+try:
+	if (sys.argv[sys.argv.index("-m")]):
+		majority=1
+except:
+	majority=0
 try:
    threshold=float(sys.argv[sys.argv.index("-t")+1])/100
 except:
+	if (majority==False):
 		print("ERROR: The nucleotide selection threshold is mandatory.\n")
 		use()

@@ -104,6 +110,9 @@ for file_name in file_names:
 	try:
 		if (iupac):
 			seq=tool_consensus.consensus_iupac(alignments[0],threshold)
+		else:
+			if(majority):
+				seq=tool_consensus.majority_consensus(alignments[0])
 			else:
 				seq=tool_consensus.consensus(alignments[0],threshold)
 		print(">" + file_name.split("/")[-1].split(".")[0] + "\n" + seq)

--- a/src/tool_consensus.py
+++ b/src/tool_consensus.py
@@ -140,6 +140,46 @@ def consensus(alignment,threshold):
 				seq+=max_letters[0]
 	return seq

+def majority_consensus(alignment):
+	threshold=0.5
+	summary_align = AlignInfo.SummaryInfo(alignment)
+	nb_reads = len(summary_align.get_column(0))
+	align_size = len(alignment[1].seq)
+	letters = summary_align._get_all_letters()
+	seq=""
+
+	#Run the alignment column by column
+	for i in range(align_size):
+		pass
+		nucleotides=summary_align.get_column(i)
+		frequency_letters={}
+		max = [0,[]]
+		#Calculates the frequency of each letter
+		#and keeps the letter(s) with the maximum
+		max_frequency = 0
+		max_letters=[]
+		for letter in letters:
+			pass
+			frequency_letter=(len(re.findall(letter,nucleotides))/nb_reads)
+			if max_frequency < frequency_letter:
+				pass
+				max_frequency = frequency_letter
+				max_letters = [letter]
+			else:
+				if max_frequency==frequency_letter:
+					pass
+					max_letters = max_letters + [letter]
+		if len(max_letters) <= 1:
+			if max_letters[0] != '-':
+				pass
+				seq+=max_letters[0]
+		else:
+			max_letters.sort()
+			id_iupac = "".join((max_letters))
+			id_iupac = id_iupac.replace('-','')
+			seq+=nuc_to_iupac[id_iupac]
+	return seq
+
 #Creation of the consensus sequence
 def index_jaccard(nuc_ref,nuc_cons):
 	nb_nuc_ref = len(nuc_ref)