Skip to content
Snippets Groups Projects
Commit 0df69d1c authored by Lihouck Flavien's avatar Lihouck Flavien
Browse files

Cleanup on the repository

parent 2815a2ad
No related branches found
No related tags found
No related merge requests found
configfile: "./workflow/default_config.yaml"
DATASET = expand("{data_in}/{format}", data_in=config["data_out"], format=config["format"])
DATASET = expand("{data}/{format}", data=config["data_out"], format=config["format"])
GRAPH_TYPES = ["identity", "similarity", "equivalence", "mismatches"]
STAT_HEADER= "threshold, percent_identity, percent_similarity, total_equivalence, total_mismatches\n "
......@@ -10,6 +10,8 @@ rule all:
expand("{output_folder}/{region}/{depth}/{align}_meta_consensus_r{region}_d{depth}_t1_{threshold}_t2_{value}.fasta", region=config["region"], depth=config["depth"], threshold=config["threshold_1"], value=config["threshold_2"], align="muscle", output_folder=config["output_folder"])
message: "Meta consensus can be found in ./[output_folder]/[region]/[depth]/"
# Installs the tool used for consensus and meta-consensus #
rule MTool:
......@@ -27,27 +29,27 @@ rule MTool:
rule muscle :
input :
os.path.join(config["data_in"],'selected_read','reads_r{region_size}_d{depth}.fasta')
os.path.join(config["data_in"],'reads_r{region}_d{depth}.fasta')
output :
out = os.path.join('{output_folder}', 'data','msa','MSA_muscle_r{region_size}_d{depth}.fasta')
out = os.path.join('{output_folder}', "data", config["format"] ,'MSA_muscle_r{region}_d{depth}.fasta')
message:
"Muscle for {wildcards.output_folder} (Region size={wildcards.region_size} & Depth={wildcards.depth})"
"Muscle for {wildcards.output_folder} (Region size={wildcards.region} & Depth={wildcards.depth})"
log:
os.path.join('{output_folder}','logs','6_muscle_r{region_size}_d{depth}.log')
os.path.join('{output_folder}','logs','6_muscle_r{region}_d{depth}.log')
conda:
"envs/muscle.yaml"
shell :
"muscle -in {input} -out {output.out} 2> {log}"
"muscle -in {input} -out {output} 2> {log}"
rule mafft :
input :
os.path.join(config["data_in"],'selected_read','reads_r{region_size}_d{depth}.fasta')
os.path.join(config["data_in"],'reads_r{region}_d{depth}.fasta')
output :
out = os.path.join('{output_folder}', 'data' ,'msa','mafft_uncorrected_r{region_size}_d{depth}.fasta')
out = os.path.join('{output_folder}', "data" ,config["format"] ,'mafft_uncorrected_r{region}_d{depth}.fasta')
message:
"Mafft for {wildcards.output_folder} (Region size={wildcards.region_size} & Depth={wildcards.depth})"
"Mafft for {wildcards.output_folder} (Region size={wildcards.region} & Depth={wildcards.depth})"
log:
os.path.join('{output_folder}','logs','6_mafft_r{region_size}_d{depth}.log')
os.path.join('{output_folder}','logs','6_mafft_r{region}_d{depth}.log')
conda:
"envs/mafft.yaml"
shell :
......@@ -55,23 +57,23 @@ rule mafft :
rule mafft_correction :
input :
os.path.join('{output_folder}', 'data' ,'msa','mafft_uncorrected_r{region_size}_d{depth}.fasta')
os.path.join('{output_folder}', "data" ,config["format"],'mafft_uncorrected_r{region}_d{depth}.fasta')
output :
os.path.join('{output_folder}', 'data' ,'msa','MSA_mafft_r{region_size}_d{depth}.fasta')
os.path.join('{output_folder}', "data" ,config["format"],'MSA_mafft_r{region}_d{depth}.fasta')
message:
"Mafft correction for {wildcards.output_folder} (Region size={wildcards.region_size} & Depth={wildcards.depth})"
"Mafft correction for {wildcards.output_folder} (Region size={wildcards.region} & Depth={wildcards.depth})"
shell :
'cat {input} | tr "atgc" "ATGC" >{output} ; rm -f {input}'
rule spoa :
input :
os.path.join(config["data_in"],'selected_read','reads_r{region_size}_d{depth}.fasta')
os.path.join(config["data_in"],'reads_r{region}_d{depth}.fasta')
output :
out = os.path.join('{output_folder}', 'data','msa','MSA_spoa_r{region_size}_d{depth}.fasta')
out = os.path.join('{output_folder}', "data" ,config["format"],'MSA_spoa_r{region}_d{depth}.fasta')
message:
"Spoa for {wildcards.output_folder} (Region size={wildcards.region_size} & Depth={wildcards.depth})"
"Spoa for {wildcards.output_folder} (Region size={wildcards.region} & Depth={wildcards.depth})"
log:
os.path.join('{output_folder}','logs','6_spoa_r{region_size}_d{depth}.log')
os.path.join('{output_folder}','logs','6_spoa_r{region}_d{depth}.log')
conda:
"envs/spoa.yaml"
shell :
......@@ -79,13 +81,13 @@ rule spoa :
rule kalign :
input :
os.path.join(config["data_in"],'selected_read','reads_r{region_size}_d{depth}.fasta')
os.path.join(config["data_in"],'reads_r{region}_d{depth}.fasta')
output :
out = os.path.join('{output_folder}', 'data','msa','MSA_kalign_r{region_size}_d{depth}.fasta')
out = os.path.join('{output_folder}', "data" ,config["format"],'MSA_kalign_r{region}_d{depth}.fasta')
message:
"Kalign for {wildcards.output_folder} (Region size={wildcards.region_size} & Depth={wildcards.depth})"
"Kalign for {wildcards.output_folder} (Region size={wildcards.region} & Depth={wildcards.depth})"
log:
os.path.join('{output_folder}','logs','6_kalign_r{region_size}_d{depth}.log')
os.path.join('{output_folder}','logs','6_kalign_r{region}_d{depth}.log')
conda:
"envs/kalign2.yaml"
shell :
......@@ -93,13 +95,13 @@ rule kalign :
rule kalign3 :
input :
os.path.join(config["data_in"],'selected_read','reads_r{region_size}_d{depth}.fasta')
os.path.join(config["data_in"],'reads_r{region}_d{depth}.fasta')
output :
out = os.path.join('{output_folder}', 'data' ,'msa','MSA_kalign3_r{region_size}_d{depth}.fasta')
out = os.path.join('{output_folder}', "data" ,config["format"],'MSA_kalign3_r{region}_d{depth}.fasta')
message:
"Kalign3 for {wildcards.output_folder} (Region size={wildcards.region_size} & Depth={wildcards.depth})"
"Kalign3 for {wildcards.output_folder} (Region size={wildcards.region} & Depth={wildcards.depth})"
log:
os.path.join('{output_folder}','logs','6_kalign3_r{region_size}_d{depth}.log')
os.path.join('{output_folder}','logs','6_kalign3_r{region}_d{depth}.log')
conda:
"envs/kalign3.yaml"
shell :
......@@ -107,24 +109,24 @@ rule kalign3 :
rule abpoa :
input :
os.path.join(config["data_in"],'selected_read','reads_r{region_size}_d{depth}.fasta')
os.path.join(config["data_in"],'reads_r{region}_d{depth}.fasta')
output :
out = os.path.join('{output_folder}' , 'data' ,'msa','abpoa_uncorrected_r{region_size}_d{depth}.fasta')
out = os.path.join('{output_folder}' , "data" ,config["format"],'abpoa_uncorrected_r{region}_d{depth}.fasta')
message:
"Abpoa for {wildcards.output_folder} (Region size={wildcards.region_size} & Depth={wildcards.depth})"
"Abpoa for {wildcards.output_folder} (Region size={wildcards.region} & Depth={wildcards.depth})"
log:
os.path.join('{output_folder}','logs','6_abpoa_r{region_size}_d{depth}.log')
os.path.join('{output_folder}','logs','6_abpoa_r{region}_d{depth}.log')
conda:
"envs/abpoa.yaml"
shell :
"abpoa -r 1 {input} > {output} 2> {log}"
rule abpoa_correction :
input :
os.path.join('{output_folder}' , 'data','msa','abpoa_uncorrected_r{region_size}_d{depth}.fasta')
os.path.join('{output_folder}' , "data" ,config["format"],'abpoa_uncorrected_r{region}_d{depth}.fasta')
output :
os.path.join('{output_folder}', 'data' ,'msa','MSA_abpoa_r{region_size}_d{depth}.fasta')
os.path.join('{output_folder}', "data" ,config["format"],'MSA_abpoa_r{region}_d{depth}.fasta')
message:
"Abpoa correction for {wildcards.output_folder} (Region size={wildcards.region_size} & Depth={wildcards.depth})"
"Abpoa correction for {wildcards.output_folder} (Region size={wildcards.region} & Depth={wildcards.depth})"
shell :
'tail -{wildcards.depth} {input}| sed "s/^/>seq\\n/" >{output};rm -f {input}'
......@@ -137,21 +139,22 @@ rule create_consensus_per_tool:
input:
rules.MTool.output, data=expand("{dataset}/MSA_{tool}_r{region}_d{depth}.fasta", allow_missing=True, dataset=DATASET)
output:
"{output_folder}/{region}/{depth}/individual_consensus/{threshold}/consensus_{tool}_{region}_{depth}.fasta"
"{output_folder}/{region}/{depth}/{threshold}/consensus_{tool}_{region}_{depth}.fasta"
log: "logs/{output_folder}/consensus_per_tool_{tool}_r{region}_d{depth}_t{threshold}.log"
message: "Consensus for tools
shell:
"./msa_handler/MTool {input.data} {output} {wildcards.threshold}"
rule create_consensus:
input:
rules.MTool.output, "MTool_over_folder.py", expand("{dataset}/MSA_{tool}_r{region}_d{depth}.fasta", tool=config["tool"], allow_missing=True, dataset=DATASET)
rules.MTool.output, "workflow/scripts/MTool_over_folder.py", expand("{dataset}/MSA_{tool}_r{region}_d{depth}.fasta", tool=config["tool"], allow_missing=True, dataset=DATASET)
params: data=DATASET
log: "logs/{output_folder}consensus_r{region}_d{depth}_t{threshold}.log"
output:
"{output_folder}/{region}/{depth}/test_consensus_{region}_{depth}_{threshold}.fasta"
message: "Consensus region: {wildcards.region} , depth: {wildcards.depth} , threshold: {wildcards.threshold}\n"
shell:
"./MTool_over_folder.py -dir {params.data} -o {output} -region {wildcards.region} -depth {wildcards.depth} -threshold {wildcards.threshold}"
"./workflow/scripts/MTool_over_folder.py -dir {params.data} -o {output} -region {wildcards.region} -depth {wildcards.depth} -threshold {wildcards.threshold}"
# Alignment for the meta-consensus #
......@@ -199,33 +202,33 @@ rule meta_consensus:
rule exonerate_ref_result:
input:
files=expand(rules.meta_consensus.output, align="muscle", allow_missing=True) ,
ref=expand("{data_in}/seq_selectes_region/region_seq_r{region}.fasta", data_in=config["data_in"], allow_missing=True),
ref=expand("{ref}/region_seq_r{region}.fasta", data_in=config["data_in"], ref=config["ref_folder"], allow_missing=True),
consensus_per_tool=expand(rules.create_consensus_per_tool.output, tool=config["tool"], allow_missing=True)
output: "{output_folder}/stats/{region}/aln_all_consensus_r{region}_d{depth}_t1_{threshold}_t2_{value}.txt"
log: "logs/{output_folder}/exonerate_ref_result_r{region}_d{depth}_t1_{threshold}_t2{value}.log"
params: format="\"%pi, %ps, %et, %em\n\""
conda: "envs/exonerate.yaml"
shell: "./exonerate_stats.sh {output} {input.ref} {input.files} {input.consensus_per_tool}"
shell: "./workflow/scripts/exonerate_stats.sh {output} {input.ref} {input.files} {input.consensus_per_tool}"
# "echo {STAT_HEADER} >> {output} \n for FILE in {input.files}; do exonerate --bestn 1 -Q dna -E -m a:g --showalignment false --showsugar false --showvulgar false --showcigar false --ryo {params.format} --verbose 0 -q {input.ref} -t $FILE >> {output} 2>/dev/null; done" #
rule plottable_exonerate_meta:
input: files=expand(rules.meta_consensus.output, align="muscle", value=config["threshold_2"], allow_missing=True) , ref=expand("{data_in}/seq_selectes_region/region_seq_r{region}.fasta", data_in=config["data_in"], allow_missing=True)
input: files=expand(rules.meta_consensus.output, align="muscle", value=config["threshold_2"], allow_missing=True) , ref=expand("{ref}/region_seq_r{region}.fasta", data_in=config["data_in"], ref=config["ref_folder"], allow_missing=True)
output: "{output_folder}/plot/{region}/{depth}/{threshold}/aln_stats_meta.txt"
log: "logs/{output_folder}plot_exonerate_meta_r{region}_d{depth}_t{threshold}.log"
conda: "envs/exonerate.yaml"
message: "Exonerate comparison between meta-consensus and ref (region: {wildcards.region}, depth: {wildcards.depth}, preliminary consensus threshold: {wildcards.threshold}) for meta-consensus threshold in [ {config[threshold_2]} ]"
shell: "./exonerate_stats.sh {output} {input.ref} {input.files}"
shell: "./workflow/scripts/exonerate_stats.sh {output} {input.ref} {input.files}"
rule plottable_exonerate_tools:
input:
ref=expand("{data_in}/seq_selectes_region/region_seq_r{region}.fasta", data_in=config["data_in"], allow_missing=True),
ref=expand("{ref}/region_seq_r{region}.fasta", data_in=config["data_in"], ref=config["ref_folder"], allow_missing=True),
consensus_per_tool=expand(rules.create_consensus_per_tool.output, value=config["threshold_2"], allow_missing=True)
output: "{output_folder}/plot/{region}/{depth}/{threshold}/aln_stats_{tool}.txt"
log: "logs/{output_folder}/plot_exonerate_tools_{tool}_r{region}_d{depth}_t{threshold}.log"
conda: "envs/exonerate.yaml"
message: "Exonerate comparison between consensus for tool {wildcards.tool} and ref (region: {wildcards.region}, depth: {wildcards.depth}, preliminary consensus threshold: {wildcards.threshold}) for meta-consensus threshold in [ {config[threshold_2]} ]"
shell: "./exonerate_stats.sh {output} {input.ref} {input.consensus_per_tool} 2> {log}"
shell: "./workflow/scripts/exonerate_stats.sh {output} {input.ref} {input.consensus_per_tool} 2> {log}"
rule plot_depth:
......@@ -235,7 +238,7 @@ rule plot_depth:
log: "logs/{output_folder}/plot_depth_r{region}_t{threshold}.log"
output: expand("{output_folder}/plot/{region}/graph/graph_r{region}_t{threshold}_{graph}.png", graph=GRAPH_TYPES, depth=config["depth"], allow_missing=True)
message: "Exonerate comparison between meta-consensus and ref (region: {wildcards.region}, or preliminary consensus threshold in [ {config[threshold_1]} ], meta-consensus threshold in [ {config[threshold_2]} ]"
shell: "./plot_results.py {params.target_folder}/graph_r{wildcards.region}_t{wildcards.threshold} {input} 2> {log}"
shell: "./workflow/scripts/plot_results.py {params.target_folder}/graph_r{wildcards.region}_t{wildcards.threshold} '[{config[threshold_2]}]' {input} 2> {log}"
rule plot_t1:
input:
......@@ -244,7 +247,7 @@ rule plot_t1:
log: "logs/{output_folder}/plot_t1_r{region}_d{depth}.log"
output: expand("{output_folder}/plot/{region}/{depth}/graph_r{region}_d{depth}_{graph}.png", graph=GRAPH_TYPES, allow_missing=True)
message: "Creating plot for meta-consensus quality with preliminary threshold in [ {config[threshold_1]} ] and meta-consensus in [ {config[threshold_2]} ] (region: {wildcards.region}, depth: {wildcards.depth})"
shell: "./plot_results.py {params.target_folder}/graph_r{wildcards.region}_d{wildcards.depth} {input} 2> {log} "
shell: "./workflow/scripts/plot_results.py {params.target_folder}/graph_r{wildcards.region}_d{wildcards.depth} '[{config[threshold_2]}]' {input} 2> {log} "
rule plot_t2:
......@@ -254,11 +257,13 @@ rule plot_t2:
log: "logs/{output_folder}/plot_t2_r{region}_d{depth}_t_{threshold}.log"
output: expand("{output_folder}/plot/{region}/{depth}/{threshold}/graph_r{region}_d{depth}_t1_{threshold}_{graph}.png", graph=GRAPH_TYPES, allow_missing=True)
message: "Creating plot for meta-consensus compared to preliminary consensus with meta-consensus threshold in {config[threshold_2]} (region: {wildcards.region}, depth: {wildcards.depth}, threshold: {wildcards.threshold})"
shell: "./plot_results.py {params.target_folder}graph_r{wildcards.region}_d{wildcards.depth}_t1_{wildcards.threshold} {input} 2> {log}"
shell: "./workflow/scripts/plot_results.py {params.target_folder}graph_r{wildcards.region}_d{wildcards.depth}_t1_{wildcards.threshold} '[{config[threshold_2]}]' {input} 2> {log}"
rule make_plots:
rule plot:
input: expand(rules.plot_t2.output, region=config["region"], depth=config["depth"], threshold=config["threshold_1"], output_folder=config["output_folder"]),
expand(rules.plot_t1.output, region=config["region"], depth=config["depth"] , output_folder=config["output_folder"]),
expand(rules.plot_depth.output, region=config["region"], threshold=config["threshold_1"], output_folder=config["output_folder"])
message: "Created all plots"
# minimap2 : minimap2 -a -c longest_read.fasta reads/diploid_reads_shuffle.fasta > test_minimap_aln.sam
data_in: "../../data"
data_in: "../../data/selected_read"
data_out: "consensus/data"
format: "msa"
tool: ["abpoa", "kalign", "kalign3", "mafft", "muscle", "spoa"]
......@@ -8,3 +8,4 @@ region: ["100", "200", "500", "1000", "2000"]
depth: ["10", "20", "50", "100", "150"]
align: ["muscle", "mafft"]
output_folder: "consensus"
ref_folder: "../../data/seq_selectes_region"
\ No newline at end of file
name: samtools
channels:
- bioconda/label/cf201901
- conda-forge
- bioconda
- defaults
dependencies:
- _libgcc_mutex=0.1=conda_forge
- _openmp_mutex=4.5=2_gnu
- bzip2=1.0.8=h7f98852_4
- c-ares=1.18.1=h7f98852_0
- ca-certificates=2021.10.8=ha878542_0
- curl=7.83.1=h2283fc2_0
- keyutils=1.6.1=h166bdaf_0
- krb5=1.19.3=h08a2579_0
- libcurl=7.83.1=h2283fc2_0
- libedit=3.1.20191231=he28a2e2_2
- libev=4.33=h516909a_1
- libgcc=7.2.0=h69d50b8_2
- libgcc-ng=12.1.0=h8d9b700_16
- libgomp=12.1.0=h8d9b700_16
- libnghttp2=1.47.0=he49606f_0
- libssh2=1.10.0=ha35d2d1_2
- libstdcxx-ng=12.1.0=ha89aaad_16
- libzlib=1.2.11=h166bdaf_1014
- ncurses=6.3=h27087fc_1
- openssl=3.0.3=h166bdaf_0
- samtools=1.7=1
- xz=5.2.5=h516909a_1
- zlib=1.2.11=h166bdaf_1014
prefix: /home/flav/anaconda3/envs/samtools
name: star
channels:
- conda-forge
- bioconda
- defaults
dependencies:
- star=2.7.10a=h9ee0642_0
prefix: /home/flav/anaconda3/envs/star
File moved
import sys
def extract_longest_read(filename, nb_reads):
max_len = [0 for x in range(nb_reads)]
descr= ""
longest_read = ["" for x in range(nb_reads)]
is_descr = True
with open(filename, 'r') as f:
for line in f:
if not is_descr:
min_len = min(max_len)
if len(line) > min_len and line[0] in ['A','C','G','T','N']:
index = max_len.index(min_len)
max_len[index] = len(line)
longest_read[index] = descr + line
is_descr = True
elif (line[0] == '>' or line[0] == '@'):
is_descr = False
descr = ">" +line
return longest_read
def main(arg):
results = extract_longest_read(arg, 10)
with open("longest_reads.fasta", 'w') as f:
for read in results:
f.write("".join(read))
if (__name__ == "__main__"):
main(sys.argv[1])
......@@ -8,7 +8,6 @@ import re
import sys
import os
th = [30, 40, 50, 60, 70, 80]
def parse_file(filename, x_scale):
identity, similarity, equivalence, mismatches = [ [], [] , [], []]
......@@ -68,19 +67,21 @@ def plot_from_folder(args, target, labels, folder):
plt.plot(th,[tab for x in range(len(th))], label=labels[graph_cnt])
tab_cnt+=1
legend = plt.legend(shadow=True, fontsize='x-large')
plt.savefig(target+"_" + target_att[graph_cnt] + ".png", format="png",)
plt.savefig(target+"_" + target_att[graph_cnt] + ".png", format="png")
graph_cnt+=1
if __name__ == "__main__":
th = list(map(int, sys.argv[2][1:-1].split(" ")))
if (len(sys.argv) < 3):
if (len(sys.argv) < 4):
exit(-1)
if (len(sys.argv) == 3):
if (len(sys.argv) < 4):
print(sys.argv)
folder_it = os.scandir(sys.argv[2])
folder_it = os.scandir(sys.argv[3])
name= [ x.name for x in folder_it]
#folder_it= os.scandir(sys.argv[2])
plot_from_folder(folder_it, sys.argv[1], name, folder=sys.argv[2])
plot_from_folder(folder_it, sys.argv[1], name, folder=sys.argv[3])
exit(0)
lb= []
for elem in th:
......
File moved
data_in: "../../data"
data_out: "consensus/data"
format: "msa"
tool: ["abpoa", "kalign", "kalign3", "mafft", "muscle", "spoa"]
threshold_1: ["40", "50", "60", "70"]
threshold_2 : ["40", "50", "60", "70"]
region: ["0"]
depth: ["0"]
align: ["muscle", "mafft"]
output_folder: "consensus"
ref_folder: "../../data"
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment