Skip to content
Snippets Groups Projects
Commit 970d980c authored by Lihouck Flavien's avatar Lihouck Flavien
Browse files

Ajout de la progression du pipeline dans le cas ou l'on a pas de réferences

parent e42c81a3
Branches
No related tags found
No related merge requests found
......@@ -9,3 +9,6 @@ data/
*.fasta
MTool
logs/
*.bam
*.sam
*.txt
\ No newline at end of file
#include "MSA_cumulative.h"
string MSA_cumulative::consensus(int threshold, int ploidity) {
string consensus_seq("");
int size_alpha = int(alphabet.size());
int current_weight(1);
for (int i(0); i < length; ++i) {
int size_bl = int(text.size());
int scores[size_alpha] = {0};
int depth_index = 0;
int alpha_index = 0;
int highest_score =0;
while (depth_index < size_bl && threshold*current_weight > (highest_score*100)/size_bl) {
alpha_index = alphabet.find(text[depth_index][i]);
if (alpha_index != -1) {
scores[alpha_index] += current_weight;
}
else {
vector<char> multiple_basis = IUPAC_to_basis(text[depth_index][i]);
int size = multiple_basis.size();
if (current_weight % size != 0) {
current_weight *= size;
for( int k(0); k <(int)alphabet.size();++k) {
scores[k] *= size;
}
}
for (auto it = begin(multiple_basis); it != end(multiple_basis); ++it) {
scores[alphabet.find(*it)] += (current_weight / size);
}
}
highest_score = max(highest_score, scores[alpha_index]);
//cerr << "index " <<alpha_index << " score " << highest_score << " percentage : " << (highest_score*100)/size_bl << endl;
depth_index++;
}
if (depth_index < size_bl){
//cerr << "char at pos " << i << " is " << alphabet[alpha_index] << " end pos is " << length << endl;
if (alpha_index != 0 && alpha_index != 16)
consensus_seq += alphabet[alpha_index];
}
else {
vector<char> conserved_nuc;
while (threshold*current_weight > (highest_score*100)/size_bl) {
alpha_index = 0;
while (alpha_index < size_alpha && scores[alpha_index] <= highest_score) {
////cerr << "alpha_index " << alpha_index << "scores :" << scores[alpha_index] << endl;
alpha_index++;
}
highest_score = scores[alpha_index];
conserved_nuc.insert(conserved_nuc.begin(), alphabet[alpha_index]);
for (int k(0); k < size_alpha; k++)
scores[k] += highest_score;
scores[alpha_index] = 0;
}
char nuc = basis_to_IUPAC(conserved_nuc);
//cerr << "char at pos " << i << " is " << nuc << endl;
if (nuc != '-' && nuc != '.')
consensus_seq += nuc;
}
}
return consensus_seq;
}
#ifndef MSA_CUMUL_H
#define MSA_CUMUL_H
#include "MSA.h"
class MSA_cumulative : public MSA
{
public:
string consensus(int similarity_threshold, int ploidity);
};
#endif
......@@ -6,10 +6,19 @@ all: MTool clean
MTool: main.o MSA.o
$(CC) -o MTool main.o MSA.o
MTool2: main2.o MSA.o MSA_cumulative.o
$(CC) -o MTool main2.o MSA.o MSA_cumulative.o
MSA_cumulative.o: MSA_cumulative.cpp MSA_cumulative.h MSA.h
$(CC) $(CFLAGS) MSA_cumulative.cpp
MSA.o: MSA.cpp MSA.h
$(CC) $(CFLAGS) MSA.cpp
main2.o: main2.cpp MSA.h MSA_cumulative.h
$(CC) $(CFLAGS) main2.cpp
main.o: main.cpp MSA.h
$(CC) $(CFLAGS) main.cpp
......
#include "MSA.h"
#include "MSA_cumulative.h"
#include <fstream>
#include <iostream>
#include <string>
#define THRESHOLD 70
#define PLOIDITY 2
using namespace std;
int main(int argc, char** argv){
if (argc > 1) {
MSA msa_fasta = MSA_cumulative();
msa_fasta.parser_fasta(argv[1]);
ofstream output;
string filename;
if (argc > 2) {
filename = argv[2];
}
else
filename = "output.txt";
output.open (filename, ios::out | ios::app | ios::ate);
string cons_name = argv[1];
int index = cons_name.find_last_of('/') +1;
int threshold;
if (argc > 3)
threshold = stoi(argv[3]);
else
threshold = THRESHOLD;
int ploidity;
if (argc > 4)
ploidity = stoi(argv[4]);
else
ploidity = PLOIDITY;
cons_name = ">lcl|consensus_"+cons_name.substr(index) + " threshold = " + to_string(threshold);
output << cons_name << endl << msa_fasta.consensus(threshold, ploidity) << endl;
output.close();
}
}
configfile: "./workflow/test_all.yaml"
import os, sys
configfile: "./workflow/lambda_phage.yaml"
DATASET = expand("{data}/{format}", data=config["data_out"], format=config["format"])
GRAPH_TYPES = ["identity", "similarity", "equivalence", "mismatches","sequence_length", "aligned_length"]
STAT_HEADER= "threshold, percent_identity, percent_similarity, total_equivalence, total_mismatches, target length, alignement length\n "
STAT_HEADER= "threshold, percent_identity, percent_similarity, total_equivalence, total_mismatches, target length, alignment length\n "
REGION_SIZE = 20000
REGION_HEAD_OVERLAP = 200
#SPLIT_SIZE = 0#
#REGIONS_STARTS = []#
def cut_region_from_length(length):
start = 2
region = []
while start < int(length):
region.append(str(start)+"_"+str( min(start+REGION_SIZE, int(length))))
start += REGION_SIZE - REGION_HEAD_OVERLAP
return region
#def split_points(amount, length):#
# SPLIT_SIZE = math.floor(amount / length)#
# REGION_START = [ x for x in range(1, length, SPLIT_SIZE) ]#
if not "region" in config:
config["region"] = cut_region_from_length(config["length"])
STARTS = [1, 4000, 8000]
ENDS = [4000, 8000, 12000]
# Generates the meta-consensus from given reads #
rule all:
input:
expand("{output_folder}/{region}/{depth}/{align}_meta_consensus_r{region}_d{depth}_t1_{threshold}_t2_{value}.fasta", region=config["region"], depth=config["depth"], threshold=config["threshold_1"], value=config["threshold_2"], align="muscle", output_folder=config["output_folder"])
message: "Meta consensus can be found in ./[output_folder]/[region]/[depth]/"
# -------------------------------------------------------- #
# Installs the tool used for consensus and meta-consensus #
rule MTool:
......@@ -28,18 +39,20 @@ rule MTool:
output:
"msa_handler/MTool"
shell:
"cd msa_handler && make"
"cd msa_handler && make MTool2"
# -------------------------------------------------------- #
# Aligns the reads on a given reference with minimap2 #
if "ref_file" in config:
rule alignment_reads_on_ref :
input :
ref=os.path.join(config["data_in"],config["ref_file"]),
read=os.path.join(config["data_in"], config["read_file"])
output :
os.path.join(config["data_in"],'alignement','aln_reads_on_ref.sam')
os.path.join(config["data_out"],'alignement','aln_reads_on_ref.sam')
message :
"Minimap for "
"Minimap for {config[data_in]}"
log:
os.path.join('logs','2_alignment_reads_on_ref.log')
conda:
......@@ -49,13 +62,11 @@ rule alignment_reads_on_ref :
'echo "ORDER: $ORDER" >{log};'
'$ORDER >{output} 2>> {log}'
# Cuts the reads into smaller regions to reduce the overall size and lower the computational cost, and creates fasta to build MSA#
rule reads_map_region:
input :
aln = os.path.join(config['data_in'],'alignement','aln_reads_on_ref.sam'),
aln = os.path.join(config['data_out'],'alignement','aln_reads_on_ref.sam')
output :
os.path.join('{data_set}','read_map_region','read_r{start}_{end}.fasta')
message:
......@@ -65,14 +76,80 @@ rule reads_map_region:
conda:
"envs/perl.yaml"
shell :
'ORDER="./workflow/scripts/reads_map_region.pl -s {wildcards.start} -e {wildcards.end} {input.aln} {output}";'
'ORDER="./workflow/scripts/reads_map_region.pl -s {wildcards.start} -e {wildcards.end} -c 100 {input.aln} {output}";'
'echo "ORDER: $ORDER" >{log};'
'$ORDER 2>&1 >>{log}'
else:
# -------------------------------------------------------- #
# Extracts the longest read from the read pile to use as an anchor for a prealignment #
rule extract_longest_read:
input : read=os.path.join(config["data_in"],config["read_file"])
output : os.path.join(config["data_out"],"anchor.fasta")
message: "Extracting the longest read from {config[read_file]}"
shell :
"./workflow/scripts/extract_longest_read.py -i {input.read} -o {output} -n 1 "
# Align the reads on anchor read #
rule alignment_reads_on_anchor :
input :
ref=os.path.join(config["data_out"],"anchor.fasta"),
read=os.path.join(config["data_in"], config["read_file"])
output :
os.path.join(config["data_in"],'alignement','aln_reads_on_read.sam')
message :
"Minimap for {config[data_out]}"
log:
os.path.join('logs','2_alignment_reads_on_anchor.log')
conda:
"envs/minimap2.yaml"
shell :
'ORDER="minimap2 -cax map-ont -t 8 {input.ref} {input.read}";'
'echo "ORDER: $ORDER" >{log};'
'$ORDER >{output} 2>> {log}'
# Select the region to keep from the anchoring #
rule get_regions_for_read :
input : sam={rules.alignment_reads_on_anchor.output}
output: os.path.join(config["data_in"], 'regions.txt')
conda: "envs/samtools.yaml"
params: depth=max(config["depth"])
shell: "./workflow/scripts/prepare_region_from_read.sh {input.sam} {output} {params.depth}"
# rule set_region_elem:
# input: {rules.get_regions_for_read.output}
# output: os.path.join(config["data_out"], 'region_fixed.txt')
# run:
# for elem in {output}:
# with open(str(elem), 'w') as f:
# f.write(set_region({input}))
# print(config['region'])
rule reads_map_regions_no_ref:
input: sam={rules.alignment_reads_on_anchor.output}, regions={rules.get_regions_for_read.output} #, fix={rules.set_region_elem.output}
output: os.path.join('{data_set}', "read_map_region",'read_r{start}_{end, \d+}.fasta')
conda: "envs/perl.yaml"
log:
os.path.join('{data_set}','logs','3_reads_map_region_r{start}_{end}.log')
shell :
'ORDER="./workflow/scripts/reads_map_region.pl -s {wildcards.start} -e {wildcards.end} -c 100 {input.sam} {output}";'
'echo "ORDER: $ORDER" >{log};'
'$ORDER 2>&1 >>{log}'
rule duct_tape:
input: rules.reads_map_region.output
output: expand(os.path.join('{data_set}','read_map_region', 'read_r{region}.fasta'), region='{start}_{end}', allow_missing=True)
shell: "echo 'DuctTape'"
#'./workflow/scripts/cut_regions_with_file {input.sam} {wildcards.region}'
# -------------------------------------------------------- #
# Depth selection
rule select_depth :
input :
......@@ -89,17 +166,18 @@ rule select_depth :
'$ORDER >{output} 2>>{log}'
# --------------------------------------------------------#
# Creation of MSA from the reads pile using tools #
rule muscle :
input :
os.path.join(config["data_in"],'reads_r{region}_d{depth}.fasta')
os.path.join(config["data_out"],'reads_r{region}_d{depth}.fasta')
output :
out = os.path.join('{output_folder}', "data", config["format"] ,'MSA_muscle_r{region}_d{depth}.fasta')
message:
"Muscle for {wildcards.output_folder} (Region size={wildcards.region} & Depth={wildcards.depth})"
log:
os.path.join('{output_folder}','logs','6_muscle_r{region}_d{depth}.log')
os.path.join('logs','{output_folder}','6_muscle_r{region}_d{depth}.log')
conda:
"envs/muscle.yaml"
shell :
......@@ -107,13 +185,13 @@ rule muscle :
rule mafft :
input :
os.path.join(config["data_in"],'reads_r{region}_d{depth}.fasta')
os.path.join(config["data_out"],'reads_r{region}_d{depth}.fasta')
output :
out = os.path.join('{output_folder}', "data" ,config["format"] ,'mafft_uncorrected_r{region}_d{depth}.fasta')
message:
"Mafft for {wildcards.output_folder} (Region size={wildcards.region} & Depth={wildcards.depth})"
log:
os.path.join('{output_folder}','logs','6_mafft_r{region}_d{depth}.log')
os.path.join('logs','{output_folder}','6_mafft_r{region}_d{depth}.log')
conda:
"envs/mafft.yaml"
shell :
......@@ -131,13 +209,13 @@ rule mafft_correction :
rule spoa :
input :
os.path.join(config["data_in"],'reads_r{region}_d{depth}.fasta')
os.path.join(config["data_out"],'reads_r{region}_d{depth}.fasta')
output :
out = os.path.join('{output_folder}', "data" ,config["format"],'MSA_spoa_r{region}_d{depth}.fasta')
message:
"Spoa for {wildcards.output_folder} (Region size={wildcards.region} & Depth={wildcards.depth})"
log:
os.path.join('{output_folder}','logs','6_spoa_r{region}_d{depth}.log')
os.path.join('logs','{output_folder}','6_spoa_r{region}_d{depth}.log')
conda:
"envs/spoa.yaml"
shell :
......@@ -145,13 +223,13 @@ rule spoa :
rule kalign :
input :
os.path.join(config["data_in"],'reads_r{region}_d{depth}.fasta')
os.path.join(config["data_out"],'reads_r{region}_d{depth}.fasta')
output :
out = os.path.join('{output_folder}', "data" ,config["format"],'MSA_kalign_r{region}_d{depth}.fasta')
message:
"Kalign for {wildcards.output_folder} (Region size={wildcards.region} & Depth={wildcards.depth})"
log:
os.path.join('{output_folder}','logs','6_kalign_r{region}_d{depth}.log')
os.path.join('logs','{output_folder}','6_kalign_r{region}_d{depth}.log')
conda:
"envs/kalign2.yaml"
shell :
......@@ -159,13 +237,13 @@ rule kalign :
rule kalign3 :
input :
os.path.join(config["data_in"],'reads_r{region}_d{depth}.fasta')
os.path.join(config["data_out"],'reads_r{region}_d{depth}.fasta')
output :
out = os.path.join('{output_folder}', "data" ,config["format"],'MSA_kalign3_r{region}_d{depth}.fasta')
message:
"Kalign3 for {wildcards.output_folder} (Region size={wildcards.region} & Depth={wildcards.depth})"
log:
os.path.join('{output_folder}','logs','6_kalign3_r{region}_d{depth}.log')
os.path.join('logs','{output_folder}','6_kalign3_r{region}_d{depth}.log')
conda:
"envs/kalign3.yaml"
shell :
......@@ -173,13 +251,13 @@ rule kalign3 :
rule abpoa :
input :
os.path.join(config["data_in"],'reads_r{region}_d{depth}.fasta')
os.path.join(config["data_out"],'reads_r{region}_d{depth}.fasta')
output :
out = os.path.join('{output_folder}' , "data" ,config["format"],'abpoa_uncorrected_r{region}_d{depth}.fasta')
message:
"Abpoa for {wildcards.output_folder} (Region size={wildcards.region} & Depth={wildcards.depth})"
log:
os.path.join('{output_folder}','logs','6_abpoa_r{region}_d{depth}.log')
os.path.join('logs','{output_folder}','6_abpoa_r{region}_d{depth}.log')
conda:
"envs/abpoa.yaml"
shell :
......@@ -197,8 +275,8 @@ rule abpoa_correction :
# Creation of the consensus #
# --------------------------------------------------------#
# Creation of the consensus for each tools #
rule create_consensus_per_tool:
input:
......@@ -214,12 +292,14 @@ rule create_consensus:
params: data=DATASET
log: "logs/{output_folder}consensus_r{region}_d{depth}_t{threshold}.log"
output:
"{output_folder}/{region}/{depth}/test_consensus_{region}_{depth}_{threshold}.fasta"
"{output_folder}/{region}/{depth}/all_tools_consensus_{region}_{depth}_{threshold}.fasta"
message: "Consensus region: {wildcards.region} , depth: {wildcards.depth} , threshold: {wildcards.threshold}\n"
shell: 'cat {input} > {output}'
# "./workflow/scripts/MTool_over_folder.py -dir {params.data} -o {output} -region {wildcards.region} -depth {wildcards.depth} -threshold {wildcards.threshold}"#
# Alignment for the meta-consensus #
# --------------------------------------------------------#
# Alignment of the consensus for the meta-consensus #
rule align_mafft:
input:
......@@ -229,7 +309,7 @@ rule align_mafft:
conda:
"envs/mafft.yaml"
log: "logs/{output_folder}/align_mafft_r{region}_d{depth}_t{threshold}.log"
message: "Alignment with mafft region: {wildcards.region} , depth: {wildcards.depth} , threshold: {threshold}\n"
message: "Alignment with mafft region: {wildcards.region} , depth: {wildcards.depth} , threshold: {wildcards.threshold}\n"
shell:
"mafft --auto {input} > {output} 2> {log}"
......@@ -246,6 +326,8 @@ rule align_muscle:
"muscle -in {input} -out {output} > {log} 2>{log}"
# --------------------------------------------------------#
# Meta consensus #
rule meta_consensus:
......@@ -259,7 +341,32 @@ rule meta_consensus:
"./msa_handler/MTool {input.muscle} {output} {wildcards.value}"
# -------------------------------------------------------- #
# Trying to merge regions #
# --------------------------------------------------------#
# This part handle comparison to a reference and creates plots in order to better visualize the efficiency of the consensus strategie used for both the preliminary consensus and the meta-consensus. #
if ("ref_file" in config):
rule region_seq :
input :
ref =os.path.join(config["data_in"],config["ref_file"])
output :
"{data_set}/region_seq_r{start}_{end}.fasta"
message:
"Region's seq for {wildcards.data_set} (Region {wildcards.start} - {wildcards.end})"
log:
"{data_set}/logs/6_region_seq_r{start}_{end}.log"
shell :
'ORDER="./workflow/scripts/region_seq.sh {wildcards.start} {input.ref} {output} {wildcards.end}";'
'echo "ORDER: $ORDER" >{log};'
'$ORDER 2>&1 >>{log}'
rule exonerate_ref_result:
input:
......@@ -294,12 +401,21 @@ rule plottable_exonerate_tools:
rule plot_depth:
input: expand(rules.plottable_exonerate_meta.output, depth=config["depth"], allow_missing=True),
input: expand(rules.plottable_exonerate_meta.output, depth=config["depth"], allow_missing=True)
params: target_folder="consensus/plot/{region}/graph"
log: "logs/{output_folder}/plot_depth_r{region}_t{threshold}.log"
output: expand("{output_folder}/plot/{region}/graph/graph_r{region}_t{threshold}_{graph}.png", graph=GRAPH_TYPES, depth=config["depth"], allow_missing=True)
output: expand("{output_folder}/plot/{region}/graph/graph_depth_r{region}_t{threshold}_{graph}.png", graph=GRAPH_TYPES, depth=config["depth"], allow_missing=True)
message: "Exonerate comparison between meta-consensus and ref (region: {wildcards.region}, or preliminary consensus threshold in [ {config[threshold_1]} ], meta-consensus threshold in [ {config[threshold_2]} ]"
shell: "./workflow/scripts/plot_results.py {params.target_folder}/graph_r{wildcards.region}_t{wildcards.threshold} '[{config[threshold_2]}]' {input} 2> {log}"
shell: "./workflow/scripts/plot_results.py {wildcards.output_folder}/plot/{wildcards.region}/graph/graph_depth_r{wildcards.region}_t{wildcards.threshold} '[{config[threshold_2]}]' {input} 2> {log}"
rule plot_depth_all:
input: expand(rules.plottable_exonerate_meta.output, depth=config["depth"], allow_missing=True), expand(rules.plottable_exonerate_tools.output, tool=config["tool"], depth=config["depth"], allow_missing=True)
params: target_folder="consensus/plot/{region}/graph"
log: "logs/{output_folder}/plot_all_depth_r{region}_t{threshold}.log"
output: expand("{output_folder}/plot/{region}/graph/graph_depth_all_r{region}_t{threshold}_{graph}.png", graph=GRAPH_TYPES, depth=config["depth"], allow_missing=True)
message: "Exonerate comparison between meta-consensus and ref (region: {wildcards.region}, or preliminary consensus threshold in [ {config[threshold_1]} ], meta-consensus threshold in [ {config[threshold_2]} ]"
shell: "./workflow/scripts/plot_results.py {wildcards.output_folder}/plot/{wildcards.region}/graph/graph_depth_all_r{wildcards.region}_t{wildcards.threshold} '[{config[threshold_2]}]' {input} 2> {log}"
rule plot_t1:
input:
......@@ -308,8 +424,7 @@ rule plot_t1:
log: "logs/{output_folder}/plot_t1_r{region}_d{depth}.log"
output: expand("{output_folder}/plot/{region}/{depth}/graph_r{region}_d{depth}_{graph}.png", graph=GRAPH_TYPES, allow_missing=True)
message: "Creating plot for meta-consensus quality with preliminary threshold in [ {config[threshold_1]} ] and meta-consensus in [ {config[threshold_2]} ] (region: {wildcards.region}, depth: {wildcards.depth})"
shell: "./workflow/scripts/plot_results.py {params.target_folder}/graph_r{wildcards.region}_d{wildcards.depth} '[{config[threshold_2]}]' {input} 2> {log} "
shell: "./workflow/scripts/plot_results.py {wildcards.output_folder}/plot/{wildcards.region}/{wildcards.depth}/graph_r{wildcards.region}_d{wildcards.depth} '[{config[threshold_2]}]' {input} 2> {log} "
rule plot_t2:
input: expand(rules.plottable_exonerate_tools.output, tool=config["tool"], allow_missing=True),
......@@ -318,13 +433,18 @@ rule plot_t2:
log: "logs/{output_folder}/plot_t2_r{region}_d{depth}_t_{threshold}.log"
output: expand("{output_folder}/plot/{region}/{depth}/{threshold}/graph_r{region}_d{depth}_t1_{threshold}_{graph}.png", graph=GRAPH_TYPES, allow_missing=True)
message: "Creating plot for meta-consensus compared to preliminary consensus with meta-consensus threshold in {config[threshold_2]} (region: {wildcards.region}, depth: {wildcards.depth}, threshold: {wildcards.threshold})"
shell: "./workflow/scripts/plot_results.py {params.target_folder}graph_r{wildcards.region}_d{wildcards.depth}_t1_{wildcards.threshold} '[{config[threshold_2]}]' {input} 2> {log}"
shell: "./workflow/scripts/plot_results.py {wildcards.output_folder}/plot/{wildcards.region}/{wildcards.depth}/{wildcards.threshold}/graph_r{wildcards.region}_d{wildcards.depth}_t1_{wildcards.threshold} '[{config[threshold_2]}]' {input} 2> {log}"
# --------------------------------------------------------#
rule plot:
input: expand(rules.plot_t2.output, region=config["region"], depth=config["depth"], threshold=config["threshold_1"], output_folder=config["output_folder"]),
expand(rules.plot_t1.output, region=config["region"], depth=config["depth"] , output_folder=config["output_folder"]),
expand(rules.plot_depth.output, region=config["region"], threshold=config["threshold_1"], output_folder=config["output_folder"])
expand(rules.plot_depth.output, region=config["region"], threshold=config["threshold_1"], output_folder=config["output_folder"]),
expand(rules.plot_depth_all.output, region=config["region"], threshold=config["threshold_1"], output_folder=config["output_folder"])
message: "Created all plots"
# minimap2 : minimap2 -a -c longest_read.fasta reads/diploid_reads_shuffle.fasta > test_minimap_aln.sam
else:
rule plot:
run: sys.exit("Plotting requires a reference to create statistics")
\ No newline at end of file
name: samtools
channels:
- bioconda/label/cf201901
- conda-forge
- bioconda
- defaults
......@@ -9,12 +8,14 @@ dependencies:
- _openmp_mutex=4.5=2_gnu
- bzip2=1.0.8=h7f98852_4
- c-ares=1.18.1=h7f98852_0
- ca-certificates=2021.10.8=ha878542_0
- ca-certificates=2022.5.18=ha878542_0
- curl=7.83.1=h2283fc2_0
- htslib=1.9=h4da6232_3
- keyutils=1.6.1=h166bdaf_0
- krb5=1.19.3=h08a2579_0
- libcurl=7.83.1=h2283fc2_0
- libedit=3.1.20191231=he28a2e2_2
- libdeflate=1.2=h516909a_1
- libedit=3.1.20191231=h46ee950_2
- libev=4.33=h516909a_1
- libgcc=7.2.0=h69d50b8_2
- libgcc-ng=12.1.0=h8d9b700_16
......@@ -23,9 +24,9 @@ dependencies:
- libssh2=1.10.0=ha35d2d1_2
- libstdcxx-ng=12.1.0=ha89aaad_16
- libzlib=1.2.11=h166bdaf_1014
- ncurses=6.3=h27087fc_1
- ncurses=6.1=hf484d3e_1002
- openssl=3.0.3=h166bdaf_0
- samtools=1.7=1
- samtools=1.9=h10a08f8_12
- xz=5.2.5=h516909a_1
- zlib=1.2.11=h166bdaf_1014
prefix: /home/flav/anaconda3/envs/samtools
data_in: "../../Lambda_phage"
read_file: "lambda_1000x_10p.fa"
#ref_file: "lambda_virus.fa"
data_out: "lambda/data"
format: "msa"
tool: [ "kalign", "kalign3", "mafft", "muscle"]
threshold_1: ["50", "60"]
threshold_2 : ["50", "60"]
#region: ["100_1100","1100_2100", "2100_4100", "4100_6100"]
length: '40500'
depth: ["15","20","30"]
align: ["muscle"]
output_folder: "lambda/consensus"
ref_folder: "lambda/data"
import sys
def main(args):
start_r = 0
regions = []
amount = int(args[3])
with open(args[1], 'r') as f:
for line in f:
line.replace("\n", '')
pos, depth = line.split("\t")
if int(depth) > amount :
if start_r == 0:
start_r=pos
else:
if start_r != 0:
regions.append((start_r, pos))
start_r= 0
with open(args[2], 'w') as out:
out.write('[')
for elem in regions:
out.write('"'+elem[0]+"_"+elem[1]+'",')
out.write(']')
if __name__ == "__main__":
if len(sys.argv) == 4:
main(sys.argv)
else:
sys.exit("Usage:\ndelimit_regions input output min_depth\n")
import sys
#!/usr/bin/env python3
import sys
import argparse
def extract_longest_read(filename, nb_reads):
max_len = [0 for x in range(nb_reads)]
......@@ -21,11 +23,27 @@ def extract_longest_read(filename, nb_reads):
return longest_read
def main(arg):
results = extract_longest_read(arg, 10)
with open("longest_reads.fasta", 'w') as f:
def main():
parser=argparse.ArgumentParser()
parser.add_argument("-input", help="the file to extract reads from", required=True)
parser.add_argument("-number", help="the amount of reads to extract (default=1)")
parser.add_argument("-output", help="the output file's name (the reads will be printed to stdout otherwise)")
amount = 1
args = parser.parse_args()
if args.number:
try:
amount = int(args.number)
except:
sys.exit("'-n' requires a number")
results = extract_longest_read(args.input, amount)
if args.output:
output= args.output
with open(output, 'w') as f:
for read in results:
f.write("".join(read))
else:
for read in results:
print("".join(read))
if (__name__ == "__main__"):
main(sys.argv[1])
main()
#!/bin/bash
INPUT=$1
OUTPUT=$2
MIN=$3
samtools sort -O sam $INPUT -o tmp.sam
samtools depth tmp.sam > depth.txt
cut -f2- depth.txt > depths_clean.txt
python3 workflow/scripts/delimit_regions.py depths_clean.txt $OUTPUT $MIN
rm -f tmp.sam depth.txt depths_clean.txt
......@@ -7,8 +7,8 @@ else
INPUT_REF=$2
OUTPUT=$3
REGION_SIZE=$4
START_REGION=`cat $INPUT_START`
let "END_REGION = $REGION_SIZE + START_REGION -1"
let "START_REGION= $INPUT_START-1"
let "END_REGION = $REGION_SIZE"
cat $INPUT_REF |grep ">" >$OUTPUT
cat $INPUT_REF |grep -v ">"|tr -d "\n"|cut -c $START_REGION-$END_REGION >>$OUTPUT
fi
......@@ -3,11 +3,11 @@ ref_file: "ref_diploid.fasta"
read_file: "diploid_reads_shuffle.fastq"
data_out: "consensus/data"
format: "msa"
tool: ["abpoa", "kalign", "kalign3", "mafft", "muscle", "spoa"]
tool: [ "kalign", "kalign3", "mafft", "muscle"]
threshold_1: ["35", "36", "37", "38", "39","40", "41", "42", "43", "44", "45"]
threshold_2 : ["35", "36", "37", "38", "39","40", "41", "42", "43", "44", "45"]
region: ["1_2000", "2000_4000"]
depth: ["10", "20", "30"]
align: ["muscle", "mafft"]
region: ["2000_4000","4000_6000"]
depth: ["5","10","15","20","30"]
align: ["muscle"]
output_folder: "consensus"
ref_folder: "../../data"
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment