Base version of pipeline (from MSA to comparison with refs)

948cadf0 · Lihouck Flavien · fe59070d · 948cadf0 · 948cadf0 · 948cadf0
Commit 948cadf0 authored 3 years ago by Lihouck Flavien
--- a/.gitignore
+++ b/.gitignore
@@ -2,3 +2,9 @@ consensus/
 *.o
 *~
 data/
+.*/
+\#*#
+*.gch
+.#*
+*.fasta
+MTool
\ No newline at end of file
--- a/src/MTool_over_folder.py
+++ b/src/MTool_over_folder.py
+#!/usr/bin/env python3
+
+import subprocess
+import os
+import sys, argparse
+
+directory = "../data/msa"
+region ="100"
+dimension ="10"
+
+def main() :
+    pattern = ""
+    parser=argparse.ArgumentParser()
+    parser.add_argument("-directory", help="target directory", required=True)
+    parser.add_argument("-output", help="output name")
+    parser.add_argument("-region", help="MSA region")
+    parser.add_argument("-dimension", help="MSA dimension")
+    parser.add_argument("-threshold", help="consensus threshold")
+    parser.add_argument("--ploidity", help="expected ploidity")
+    args = parser.parse_args()
+    if args.output:
+        output = args.output
+    else:
+        output = "default.txt"
+    if args.region:
+        pattern += "r"+args.region+"_"
+    if args.dimension:
+        pattern += "d"+args.dimension+"."
+    for filename in os.listdir(args.directory) :
+        command = ["./msa_handler/MTool", args.directory+"/"+filename, output]
+        if args.threshold:
+            command.append(args.threshold)
+        if pattern in filename and "fasta" in filename and not "_poa_" in filename:
+            subprocess.run(command, capture_output=True).stdout
+
+
+if __name__ == "__main__":
+    main()
--- a/src/msa_handler/MSA.cpp
+++ b/src/msa_handler/MSA.cpp
+#include "MSA.h"
+#include <iostream>
+
+
+using namespace std;
+
+
+char MSA::basis_to_IUPAC(vector<char> basis) {
+  int index(0);
+  if (basis.size() < 1) {
+    return '-';
+  }
+  for (auto it = std::begin(basis); it != std::end(basis); ++it) {
+    index = index | alphabet_IUPAC.find(*it);
+  }
+  return alphabet_IUPAC[index];
+}
+
+vector<char> MSA::IUPAC_to_basis(char code) {
+  vector<char> matching_basis;
+  int index(1);
+  while (index < 32) {
+    if ( (index & alphabet_IUPAC.find(code)) != 0) {
+      matching_basis.push_back(alphabet_IUPAC[index]);
+    }
+    index *= 2;
+  }
+  return matching_basis;
+}
+ 
+
+void MSA::add_sequence(const string& str){
+	text.push_back(str);
+	length=str.size();
+	++lines;
+}
+
+string MSA::consensus(int threshold, int ploidity) {
+	string consensus_seq("");
+	int score_index(0);
+	int current_weight(1);
+	for(int i(0);i<length;++i){
+		int scores[alphabet.size()]={0};
+		
+		for(int j(0);j<(int)text.size();++j){
+		  score_index = alphabet.find(text[j][i]);
+		  if (score_index != -1) {
+		    scores[score_index] += current_weight;
+		  }
+		  else {
+		    vector<char> multiple_basis = IUPAC_to_basis(text[j][i]);
+		    int size = multiple_basis.size();
+		    if (current_weight % size != 0) {
+		      current_weight *= size;
+		      for( int k(0); k <(int)alphabet.size();++k) {
+			scores[k] *= size;
+		      }
+		    }
+		    for (auto it = begin(multiple_basis); it != end(multiple_basis); ++it) {
+		      scores[alphabet.find(*it)] += (current_weight / size);
+		    }
+		  }
+		
+		}
+		vector<char> conserved_nuc(ploidity, '.');
+		//      int nuc_score[ploidity](0);
+	        int stored_nuc_cnt(0);
+		for(int k(0); k < (int)alphabet.size(); ++k) {
+		  if ( ((scores[k]*(100))/(int)text.size())> (threshold)*current_weight /ploidity) {
+		    conserved_nuc[stored_nuc_cnt] = alphabet[k];
+		    stored_nuc_cnt++;
+		  }
+		}
+		char nuc = basis_to_IUPAC(conserved_nuc);
+		if (nuc != '-' && nuc != '.') 
+		  consensus_seq += nuc;
+	}
+	return consensus_seq;
+}
+
+
+void MSA::parser_fasta(string file){
+	ifstream flux(file.c_str());
+	string seq("");
+	if(flux){
+		srand(time(NULL));
+		string ligne("");
+		while(getline(flux, ligne)){
+			if (ligne[0] == '>'){
+				name.push_back(ligne.substr(1, ligne.size()-1));
+				if (seq != ""){
+					add_sequence(seq);
+				}
+				seq = "";
+			}
+			else {
+				seq += ligne;
+			}
+		}
+		add_sequence(seq);
+	}
+	else{
+		cout << "ERROR: Unable to open the file.." << endl;
+		exit(0);
+	}
+}
--- a/src/msa_handler/MSA.h
+++ b/src/msa_handler/MSA.h
+#ifndef MSA_H
+#define MSA_H
+
+#include <string>
+#include <vector>
+#include <iostream>
+#include <fstream>
+
+using namespace std;
+
+class MSA
+{
+ public:
+  vector<string> text;
+  vector<string> name;
+  string alphabet=".ACGTN-acgtn";
+  // The IUPAC alphabet has been organised so that the element matches the binary code of - : (2^4) , T : (2^3), G : (2^2), C : (2^1), A : (2^0). This way, a mask is easy to create.
+  string alphabet_IUPAC=".ACMGRSVTWYHKDBN-acmgrsvtwyhkdbn";
+  int current_weight;
+  int length;
+  int lines;
+
+  //Takes a vector of basis and returns the matching (extended ?) IUPAC character.
+  char basis_to_IUPAC(vector<char> basis);
+  vector<char> IUPAC_to_basis(char code);
+  void parser_fasta(string file);
+  string consensus(int similarity_threshold, int ploidity);
+  void add_sequence(const string& str);
+
+
+  MSA() {
+    length =0;
+    lines = 0;
+    current_weight = 1;
+  };
+
+};
+#endif
--- a/src/msa_handler/Makefile
+++ b/src/msa_handler/Makefile
+CC=g++
+CFLAGS=-Wall -c -O3
+
+all: MTool clean
+
+MTool: main.o MSA.o
+	$(CC) -o MTool main.o MSA.o
+
+
+MSA.o: MSA.cpp MSA.h
+	$(CC) $(CFLAGS) MSA.cpp
+
+main.o: main.cpp MSA.h
+	$(CC) $(CFLAGS) main.cpp
+
+clean:
+	rm -f *.o
+	rm -f *~
+
+mr_proper:
+	rm *.o main
+	rm output.txt
--- a/src/msa_handler/main.cpp
+++ b/src/msa_handler/main.cpp
+#include "MSA.h"
+#include <fstream>
+#include <iostream>
+#include <string>
+
+#define THRESHOLD 70
+#define PLOIDITY 2
+
+
+using namespace std;
+
+int main(int argc, char** argv){
+  if (argc > 1) {
+    MSA msa_fasta = MSA();
+    msa_fasta.parser_fasta(argv[1]);
+    ofstream output;
+    string filename;
+    if (argc > 2) {
+      filename = argv[2];
+    }
+    else
+      filename = "output.txt";
+    output.open (filename, ios::out | ios::app | ios::ate);
+    string cons_name = argv[1]; 
+    int index = cons_name.find_last_of('/') +1;
+    int threshold;
+    if (argc > 3)
+      threshold = stoi(argv[3]);
+    else
+      threshold = THRESHOLD;
+    int ploidity;
+    if (argc > 4)
+      ploidity = stoi(argv[4]);
+    else
+      ploidity = PLOIDITY;
+    cons_name = ">lcl|consensus_"+cons_name.substr(index) + "threshold = " + to_string(threshold);
+    output << cons_name << endl << msa_fasta.consensus(threshold, ploidity) << endl;
+    output.close();
+  }
+}
--- a/src/split_fasta.sh
+++ b/src/split_fasta.sh
+#! /bin/bash
+input=$1/$2
+
+file_cnt=0
+create=0
+while IFS= read -r line
+do
+    if [ $create == 0 ]
+    then
+	echo "$line" > "${1}/${file_cnt}_${2}"
+	((create=1))
+    else
+	((create=0))
+	echo "$line" >> "${1}/${file_cnt}_${2}"
+	((file_cnt=file_cnt+1))
+    fi
+
+done < "$input"
+	       
--- a/src/util/clustalToFasta.py
+++ b/src/util/clustalToFasta.py
+#!/usr/bin/env python3
+import sys,re,os
+from Bio import SeqIO
+
+try:
+  file_clustal=sys.argv[sys.argv.index("-in") + 1]
+except:
+  pass
+  print("No file clustal format. -in file")
+  sys.exit()
+
+try:
+  output_fasta=sys.argv[sys.argv.index("-out") + 1]
+except:
+  pass
+  print("No output. -out file")
+  sys.exit()
+
+records = SeqIO.parse(file_clustal, "clustal")
+count = SeqIO.write(records, output_fasta, "fasta")
--- a/src/workflow/Snakefile
+++ b/src/workflow/Snakefile
+DATASET = "../../data/msa"
+TOOL = ["abpoa", "kalign", "kalign3", "mafft", "muscle", "spoa"]
+THRESHOLD = ["50", "60", "70", "80", "90"]
+REGION = ["100", "200", "500", "1000", "2000"]
+DEPTH = ["10", "20", "50", "100", "150"]
+ALIGN = ["muscle", "mafft"]
+
+rule all:
+     input: expand("consensus/{region}/{depth}/aln_all_consensus.txt", region=REGION, depth=DEPTH)
+
+
+rule MTool:
+     input: "msa_handler/MSA.cpp", "msa_handler/MSA.h", "msa_handler/main.cpp", "msa_handler/Makefile"
+     output: "msa_handler/MTool"
+     shell: "cd msa_handler && make"
+
+rule create_consensus:
+     input: rules.MTool.output, "MTool_over_folder.py", expand("{dataset}/MSA_{tool}_r{region}_d{depth}.fasta", tool=TOOL, allow_missing=True, dataset=DATASET)
+     output: "consensus/{region}/{depth}/test_consensus_{region}_{depth}_{threshold}.fasta"
+     shell: "./MTool_over_folder.py -dir {DATASET} -o {output} -region {wildcards.region} -depth {wildcards.depth} -threshold {wildcards.threshold}"
+
+rule align_mafft:
+     input: rules.create_consensus.output
+     output: "consensus/{region}/{depth}/mafft_consensus_{region}_{depth}_{threshold}.fasta"
+     shell: "mafft --auto {input} > {output}"
+
+rule align_muscle:
+     input: rules.create_consensus.output
+     output: "consensus/{region}/{depth}/muscle_consensus_{region}_{depth}_{threshold}.fasta"
+     shell: "muscle -align {input} -output {output} > /dev/null 2> /dev/null"
+
+rule final_consensus:
+     input: rules.align_mafft.output, rules.align_muscle.output
+     output: "consensus/{region}/{depth}/{align}_final_consensus_{region}_{depth}_{threshold}.fasta"
+     shell: "./msa_handler/MTool consensus/{wildcards.region}/{wildcards.depth}/{wildcards.align}_consensus_{wildcards.region}_{wildcards.depth}_{wildcards.threshold}.fasta consensus/{wildcards.region}/{wildcards.depth}/{wildcards.align}_final_consensus_{wildcards.region}_{wildcards.depth}_{wildcards.threshold}.fasta {wildcards.threshold} 2"
+
+rule exonerate_ref_result:
+     input:  files=expand("consensus/{region}/{depth}/muscle_final_consensus_{region}_{depth}_{threshold}.fasta", threshold=THRESHOLD, allow_missing=True) , ref="../data/seq_selectes_region/region_seq_r{region}.fasta"
+     output: "consensus/aln_all_consensus_r{region}_d{depth}.txt"
+     conda: "envs/exonerate.yaml"
+     shell: "for FILE in {input.files}; do exonerate --bestn 1 -Q dna -E -m a:g {input.ref} $FILE >> {output}; done"
+
--- a/src/workflow/envs/exonerate.yaml
+++ b/src/workflow/envs/exonerate.yaml
+name: exonerate
+channels:
+  - defaults
+  - conda-forge
+  - bioconda
+dependencies:
+  - glib=2.69.1=h5202010_0
+  - _libgcc_mutex=0.1=main
+  - _openmp_mutex=4.5=1_gnu
+  - exonerate=2.2.0=1
+  - libffi=3.3=he6710b0_2
+  - libgcc=7.2.0=h69d50b8_2
+  - libgcc-ng=9.3.0=h5101ec6_17
+  - libgomp=9.3.0=h5101ec6_17
+  - libstdcxx-ng=9.3.0=hd4cf53a_17
+  - pcre=8.45=h295c915_0
+  - zlib=1.2.11=h7b6447c_3