Skip to content
Snippets Groups Projects
Commit 948cadf0 authored by Lihouck Flavien's avatar Lihouck Flavien
Browse files

Base version of pipeline (from MSA to comparison with refs)

parent fe59070d
No related branches found
No related tags found
No related merge requests found
......@@ -2,3 +2,9 @@ consensus/
*.o
*~
data/
.*/
\#*#
*.gch
.#*
*.fasta
MTool
\ No newline at end of file
#!/usr/bin/env python3
import subprocess
import os
import sys, argparse
directory = "../data/msa"
region ="100"
dimension ="10"
def main() :
pattern = ""
parser=argparse.ArgumentParser()
parser.add_argument("-directory", help="target directory", required=True)
parser.add_argument("-output", help="output name")
parser.add_argument("-region", help="MSA region")
parser.add_argument("-dimension", help="MSA dimension")
parser.add_argument("-threshold", help="consensus threshold")
parser.add_argument("--ploidity", help="expected ploidity")
args = parser.parse_args()
if args.output:
output = args.output
else:
output = "default.txt"
if args.region:
pattern += "r"+args.region+"_"
if args.dimension:
pattern += "d"+args.dimension+"."
for filename in os.listdir(args.directory) :
command = ["./msa_handler/MTool", args.directory+"/"+filename, output]
if args.threshold:
command.append(args.threshold)
if pattern in filename and "fasta" in filename and not "_poa_" in filename:
subprocess.run(command, capture_output=True).stdout
if __name__ == "__main__":
main()
#include "MSA.h"
#include <iostream>
using namespace std;
char MSA::basis_to_IUPAC(vector<char> basis) {
int index(0);
if (basis.size() < 1) {
return '-';
}
for (auto it = std::begin(basis); it != std::end(basis); ++it) {
index = index | alphabet_IUPAC.find(*it);
}
return alphabet_IUPAC[index];
}
vector<char> MSA::IUPAC_to_basis(char code) {
vector<char> matching_basis;
int index(1);
while (index < 32) {
if ( (index & alphabet_IUPAC.find(code)) != 0) {
matching_basis.push_back(alphabet_IUPAC[index]);
}
index *= 2;
}
return matching_basis;
}
void MSA::add_sequence(const string& str){
text.push_back(str);
length=str.size();
++lines;
}
string MSA::consensus(int threshold, int ploidity) {
string consensus_seq("");
int score_index(0);
int current_weight(1);
for(int i(0);i<length;++i){
int scores[alphabet.size()]={0};
for(int j(0);j<(int)text.size();++j){
score_index = alphabet.find(text[j][i]);
if (score_index != -1) {
scores[score_index] += current_weight;
}
else {
vector<char> multiple_basis = IUPAC_to_basis(text[j][i]);
int size = multiple_basis.size();
if (current_weight % size != 0) {
current_weight *= size;
for( int k(0); k <(int)alphabet.size();++k) {
scores[k] *= size;
}
}
for (auto it = begin(multiple_basis); it != end(multiple_basis); ++it) {
scores[alphabet.find(*it)] += (current_weight / size);
}
}
}
vector<char> conserved_nuc(ploidity, '.');
// int nuc_score[ploidity](0);
int stored_nuc_cnt(0);
for(int k(0); k < (int)alphabet.size(); ++k) {
if ( ((scores[k]*(100))/(int)text.size())> (threshold)*current_weight /ploidity) {
conserved_nuc[stored_nuc_cnt] = alphabet[k];
stored_nuc_cnt++;
}
}
char nuc = basis_to_IUPAC(conserved_nuc);
if (nuc != '-' && nuc != '.')
consensus_seq += nuc;
}
return consensus_seq;
}
void MSA::parser_fasta(string file){
ifstream flux(file.c_str());
string seq("");
if(flux){
srand(time(NULL));
string ligne("");
while(getline(flux, ligne)){
if (ligne[0] == '>'){
name.push_back(ligne.substr(1, ligne.size()-1));
if (seq != ""){
add_sequence(seq);
}
seq = "";
}
else {
seq += ligne;
}
}
add_sequence(seq);
}
else{
cout << "ERROR: Unable to open the file.." << endl;
exit(0);
}
}
#ifndef MSA_H
#define MSA_H
#include <string>
#include <vector>
#include <iostream>
#include <fstream>
using namespace std;
class MSA
{
public:
vector<string> text;
vector<string> name;
string alphabet=".ACGTN-acgtn";
// The IUPAC alphabet has been organised so that the element matches the binary code of - : (2^4) , T : (2^3), G : (2^2), C : (2^1), A : (2^0). This way, a mask is easy to create.
string alphabet_IUPAC=".ACMGRSVTWYHKDBN-acmgrsvtwyhkdbn";
int current_weight;
int length;
int lines;
//Takes a vector of basis and returns the matching (extended ?) IUPAC character.
char basis_to_IUPAC(vector<char> basis);
vector<char> IUPAC_to_basis(char code);
void parser_fasta(string file);
string consensus(int similarity_threshold, int ploidity);
void add_sequence(const string& str);
MSA() {
length =0;
lines = 0;
current_weight = 1;
};
};
#endif
CC=g++
CFLAGS=-Wall -c -O3
all: MTool clean
MTool: main.o MSA.o
$(CC) -o MTool main.o MSA.o
MSA.o: MSA.cpp MSA.h
$(CC) $(CFLAGS) MSA.cpp
main.o: main.cpp MSA.h
$(CC) $(CFLAGS) main.cpp
clean:
rm -f *.o
rm -f *~
mr_proper:
rm *.o main
rm output.txt
#include "MSA.h"
#include <fstream>
#include <iostream>
#include <string>
#define THRESHOLD 70
#define PLOIDITY 2
using namespace std;
int main(int argc, char** argv){
if (argc > 1) {
MSA msa_fasta = MSA();
msa_fasta.parser_fasta(argv[1]);
ofstream output;
string filename;
if (argc > 2) {
filename = argv[2];
}
else
filename = "output.txt";
output.open (filename, ios::out | ios::app | ios::ate);
string cons_name = argv[1];
int index = cons_name.find_last_of('/') +1;
int threshold;
if (argc > 3)
threshold = stoi(argv[3]);
else
threshold = THRESHOLD;
int ploidity;
if (argc > 4)
ploidity = stoi(argv[4]);
else
ploidity = PLOIDITY;
cons_name = ">lcl|consensus_"+cons_name.substr(index) + "threshold = " + to_string(threshold);
output << cons_name << endl << msa_fasta.consensus(threshold, ploidity) << endl;
output.close();
}
}
#! /bin/bash
input=$1/$2
file_cnt=0
create=0
while IFS= read -r line
do
if [ $create == 0 ]
then
echo "$line" > "${1}/${file_cnt}_${2}"
((create=1))
else
((create=0))
echo "$line" >> "${1}/${file_cnt}_${2}"
((file_cnt=file_cnt+1))
fi
done < "$input"
#!/usr/bin/env python3
import sys,re,os
from Bio import SeqIO
try:
file_clustal=sys.argv[sys.argv.index("-in") + 1]
except:
pass
print("No file clustal format. -in file")
sys.exit()
try:
output_fasta=sys.argv[sys.argv.index("-out") + 1]
except:
pass
print("No output. -out file")
sys.exit()
records = SeqIO.parse(file_clustal, "clustal")
count = SeqIO.write(records, output_fasta, "fasta")
DATASET = "../../data/msa"
TOOL = ["abpoa", "kalign", "kalign3", "mafft", "muscle", "spoa"]
THRESHOLD = ["50", "60", "70", "80", "90"]
REGION = ["100", "200", "500", "1000", "2000"]
DEPTH = ["10", "20", "50", "100", "150"]
ALIGN = ["muscle", "mafft"]
rule all:
input: expand("consensus/{region}/{depth}/aln_all_consensus.txt", region=REGION, depth=DEPTH)
rule MTool:
input: "msa_handler/MSA.cpp", "msa_handler/MSA.h", "msa_handler/main.cpp", "msa_handler/Makefile"
output: "msa_handler/MTool"
shell: "cd msa_handler && make"
rule create_consensus:
input: rules.MTool.output, "MTool_over_folder.py", expand("{dataset}/MSA_{tool}_r{region}_d{depth}.fasta", tool=TOOL, allow_missing=True, dataset=DATASET)
output: "consensus/{region}/{depth}/test_consensus_{region}_{depth}_{threshold}.fasta"
shell: "./MTool_over_folder.py -dir {DATASET} -o {output} -region {wildcards.region} -depth {wildcards.depth} -threshold {wildcards.threshold}"
rule align_mafft:
input: rules.create_consensus.output
output: "consensus/{region}/{depth}/mafft_consensus_{region}_{depth}_{threshold}.fasta"
shell: "mafft --auto {input} > {output}"
rule align_muscle:
input: rules.create_consensus.output
output: "consensus/{region}/{depth}/muscle_consensus_{region}_{depth}_{threshold}.fasta"
shell: "muscle -align {input} -output {output} > /dev/null 2> /dev/null"
rule final_consensus:
input: rules.align_mafft.output, rules.align_muscle.output
output: "consensus/{region}/{depth}/{align}_final_consensus_{region}_{depth}_{threshold}.fasta"
shell: "./msa_handler/MTool consensus/{wildcards.region}/{wildcards.depth}/{wildcards.align}_consensus_{wildcards.region}_{wildcards.depth}_{wildcards.threshold}.fasta consensus/{wildcards.region}/{wildcards.depth}/{wildcards.align}_final_consensus_{wildcards.region}_{wildcards.depth}_{wildcards.threshold}.fasta {wildcards.threshold} 2"
rule exonerate_ref_result:
input: files=expand("consensus/{region}/{depth}/muscle_final_consensus_{region}_{depth}_{threshold}.fasta", threshold=THRESHOLD, allow_missing=True) , ref="../data/seq_selectes_region/region_seq_r{region}.fasta"
output: "consensus/aln_all_consensus_r{region}_d{depth}.txt"
conda: "envs/exonerate.yaml"
shell: "for FILE in {input.files}; do exonerate --bestn 1 -Q dna -E -m a:g {input.ref} $FILE >> {output}; done"
name: exonerate
channels:
- defaults
- conda-forge
- bioconda
dependencies:
- glib=2.69.1=h5202010_0
- _libgcc_mutex=0.1=main
- _openmp_mutex=4.5=1_gnu
- exonerate=2.2.0=1
- libffi=3.3=he6710b0_2
- libgcc=7.2.0=h69d50b8_2
- libgcc-ng=9.3.0=h5101ec6_17
- libgomp=9.3.0=h5101ec6_17
- libstdcxx-ng=9.3.0=hd4cf53a_17
- pcre=8.45=h295c915_0
- zlib=1.2.11=h7b6447c_3
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment