Skip to content
Snippets Groups Projects
Commit 2815a2ad authored by Lihouck Flavien's avatar Lihouck Flavien
Browse files

Pipeline working from aligned reads (Step 2 and onwards)

parent fb13c0ac
No related branches found
No related tags found
No related merge requests found
......@@ -8,4 +8,4 @@ data/
.#*
*.fasta
MTool
log/
\ No newline at end of file
logs/
\ No newline at end of file
configfile: "./workflow/default_config.yaml"
DATASET = expand("{data}/{format}", data=config["data"], format=config["format"])
DATASET = expand("{data_in}/{format}", data_in=config["data_out"], format=config["format"])
GRAPH_TYPES = ["identity", "similarity", "equivalence", "mismatches"]
STAT_HEADER= "threshold, percent_identity, percent_similarity, total_equivalence, total_mismatches\n "
rule all:
input:
expand("consensus/{region}/{depth}/{align}_meta_consensus_r{region}_d{depth}_t1_{threshold}_t2_{value}.fasta", region=config["region"], depth=config["depth"], threshold=config["threshold_1"], value=config["threshold_2"], align="muscle")
message: "Meta consensus can be found in ./consensus/[region]/[depth]/"
expand("{output_folder}/{region}/{depth}/{align}_meta_consensus_r{region}_d{depth}_t1_{threshold}_t2_{value}.fasta", region=config["region"], depth=config["depth"], threshold=config["threshold_1"], value=config["threshold_2"], align="muscle", output_folder=config["output_folder"])
message: "Meta consensus can be found in ./[output_folder]/[region]/[depth]/"
# Installs the tool used for consensus and meta-consensus #
rule MTool:
input:
......@@ -23,12 +21,124 @@ rule MTool:
shell:
"cd msa_handler && make"
# Creation of MSA from the reads pile using tools #
rule muscle :
input :
os.path.join(config["data_in"],'selected_read','reads_r{region_size}_d{depth}.fasta')
output :
out = os.path.join('{output_folder}', 'data','msa','MSA_muscle_r{region_size}_d{depth}.fasta')
message:
"Muscle for {wildcards.output_folder} (Region size={wildcards.region_size} & Depth={wildcards.depth})"
log:
os.path.join('{output_folder}','logs','6_muscle_r{region_size}_d{depth}.log')
conda:
"envs/muscle.yaml"
shell :
"muscle -in {input} -out {output.out} 2> {log}"
rule mafft :
input :
os.path.join(config["data_in"],'selected_read','reads_r{region_size}_d{depth}.fasta')
output :
out = os.path.join('{output_folder}', 'data' ,'msa','mafft_uncorrected_r{region_size}_d{depth}.fasta')
message:
"Mafft for {wildcards.output_folder} (Region size={wildcards.region_size} & Depth={wildcards.depth})"
log:
os.path.join('{output_folder}','logs','6_mafft_r{region_size}_d{depth}.log')
conda:
"envs/mafft.yaml"
shell :
"mafft {input} > {output} 2> {log}"
rule mafft_correction :
input :
os.path.join('{output_folder}', 'data' ,'msa','mafft_uncorrected_r{region_size}_d{depth}.fasta')
output :
os.path.join('{output_folder}', 'data' ,'msa','MSA_mafft_r{region_size}_d{depth}.fasta')
message:
"Mafft correction for {wildcards.output_folder} (Region size={wildcards.region_size} & Depth={wildcards.depth})"
shell :
'cat {input} | tr "atgc" "ATGC" >{output} ; rm -f {input}'
rule spoa :
input :
os.path.join(config["data_in"],'selected_read','reads_r{region_size}_d{depth}.fasta')
output :
out = os.path.join('{output_folder}', 'data','msa','MSA_spoa_r{region_size}_d{depth}.fasta')
message:
"Spoa for {wildcards.output_folder} (Region size={wildcards.region_size} & Depth={wildcards.depth})"
log:
os.path.join('{output_folder}','logs','6_spoa_r{region_size}_d{depth}.log')
conda:
"envs/spoa.yaml"
shell :
"spoa -r 1 -l 1 {input} > {output} 2> {log}"
rule kalign :
input :
os.path.join(config["data_in"],'selected_read','reads_r{region_size}_d{depth}.fasta')
output :
out = os.path.join('{output_folder}', 'data','msa','MSA_kalign_r{region_size}_d{depth}.fasta')
message:
"Kalign for {wildcards.output_folder} (Region size={wildcards.region_size} & Depth={wildcards.depth})"
log:
os.path.join('{output_folder}','logs','6_kalign_r{region_size}_d{depth}.log')
conda:
"envs/kalign2.yaml"
shell :
"kalign -i {input} -out {output.out} 1> {log} 2> {log}"
rule kalign3 :
input :
os.path.join(config["data_in"],'selected_read','reads_r{region_size}_d{depth}.fasta')
output :
out = os.path.join('{output_folder}', 'data' ,'msa','MSA_kalign3_r{region_size}_d{depth}.fasta')
message:
"Kalign3 for {wildcards.output_folder} (Region size={wildcards.region_size} & Depth={wildcards.depth})"
log:
os.path.join('{output_folder}','logs','6_kalign3_r{region_size}_d{depth}.log')
conda:
"envs/kalign3.yaml"
shell :
"kalign -i {input} -o {output.out} 1>{log} 2> {log}"
rule abpoa :
input :
os.path.join(config["data_in"],'selected_read','reads_r{region_size}_d{depth}.fasta')
output :
out = os.path.join('{output_folder}' , 'data' ,'msa','abpoa_uncorrected_r{region_size}_d{depth}.fasta')
message:
"Abpoa for {wildcards.output_folder} (Region size={wildcards.region_size} & Depth={wildcards.depth})"
log:
os.path.join('{output_folder}','logs','6_abpoa_r{region_size}_d{depth}.log')
conda:
"envs/abpoa.yaml"
shell :
"abpoa -r 1 {input} > {output} 2> {log}"
rule abpoa_correction :
input :
os.path.join('{output_folder}' , 'data','msa','abpoa_uncorrected_r{region_size}_d{depth}.fasta')
output :
os.path.join('{output_folder}', 'data' ,'msa','MSA_abpoa_r{region_size}_d{depth}.fasta')
message:
"Abpoa correction for {wildcards.output_folder} (Region size={wildcards.region_size} & Depth={wildcards.depth})"
shell :
'tail -{wildcards.depth} {input}| sed "s/^/>seq\\n/" >{output};rm -f {input}'
# Creation of the consensus #
rule create_consensus_per_tool:
input:
rules.MTool.output, data=expand("{dataset}/MSA_{tool}_r{region}_d{depth}.fasta", allow_missing=True, dataset=DATASET)
output:
"consensus/{region}/{depth}/individual_consensus/{threshold}/consensus_{tool}_{region}_{depth}.fasta"
log: "logs/consensus_per_tool_{tool}_r{region}_d{depth}_t{threshold}.log"
"{output_folder}/{region}/{depth}/individual_consensus/{threshold}/consensus_{tool}_{region}_{depth}.fasta"
log: "logs/{output_folder}/consensus_per_tool_{tool}_r{region}_d{depth}_t{threshold}.log"
shell:
"./msa_handler/MTool {input.data} {output} {wildcards.threshold}"
......@@ -36,43 +146,49 @@ rule create_consensus:
input:
rules.MTool.output, "MTool_over_folder.py", expand("{dataset}/MSA_{tool}_r{region}_d{depth}.fasta", tool=config["tool"], allow_missing=True, dataset=DATASET)
params: data=DATASET
log: "logs/consensus_r{region}_d{depth}_t{threshold}.log"
log: "logs/{output_folder}consensus_r{region}_d{depth}_t{threshold}.log"
output:
"consensus/{region}/{depth}/test_consensus_{region}_{depth}_{threshold}.fasta"
"{output_folder}/{region}/{depth}/test_consensus_{region}_{depth}_{threshold}.fasta"
message: "Consensus region: {wildcards.region} , depth: {wildcards.depth} , threshold: {wildcards.threshold}\n"
shell:
"./MTool_over_folder.py -dir {params.data} -o {output} -region {wildcards.region} -depth {wildcards.depth} -threshold {wildcards.threshold}"
# Alignment for the meta-consensus #
rule align_mafft:
input:
rules.create_consensus.output
output:
"consensus/{region}/{depth}/mafft_consensus_{region}_{depth}_{threshold}.fasta"
"{output_folder}/{region}/{depth}/mafft_consensus_{region}_{depth}_{threshold}.fasta"
conda:
"envs/mafft.yaml"
log: "logs/align_mafft_r{region}_d{depth}_t{threshold}.log"
log: "logs/{output_folder}/align_mafft_r{region}_d{depth}_t{threshold}.log"
message: "Alignment with mafft region: {wildcards.region} , depth: {wildcards.depth} , threshold: {threshold}\n"
shell:
"mafft --auto {input} > {output}"
"mafft --auto {input} > {output} 2> {log}"
rule align_muscle:
input:
rules.create_consensus.output
output:
"consensus/{region}/{depth}/muscle_consensus_{region}_{depth}_{threshold}.fasta"
"{output_folder}/{region}/{depth}/muscle_consensus_{region}_{depth}_{threshold}.fasta"
conda:
"envs/muscle.yaml"
log: "logs/align_muscle_r{region}_d{depth}_t{threshold}.log"
log: "logs/{output_folder}/align_muscle_r{region}_d{depth}_t{threshold}.log"
message: "Alignment muscle region: {wildcards.region} , depth: {wildcards.depth} , threshold: {wildcards.threshold}\n"
shell:
"muscle -align {input} -output {output} > /dev/null 2>/dev/null"
"muscle -in {input} -out {output} > {log} 2>{log}"
# Meta consensus #
rule meta_consensus:
input:
muscle=rules.align_muscle.output
output:
"consensus/{region}/{depth}/{align}_meta_consensus_r{region}_d{depth}_t1_{threshold}_t2_{value}.fasta"
log: "logs/meta_consensus_{align}_r{region}_d{depth}_t1_{threshold}_t2_{value}.log"
"{output_folder}/{region}/{depth}/{align}_meta_consensus_r{region}_d{depth}_t1_{threshold}_t2_{value}.fasta"
log: "logs/{output_folder}/meta_consensus_{align}_r{region}_d{depth}_t1_{threshold}_t2_{value}.log"
message: "Meta-consensus region: {wildcards.region} , depth: {wildcards.depth} , consensus threshold: {wildcards.threshold} , meta consensus threshold: {wildcards.value}\n"
shell:
"./msa_handler/MTool {input.muscle} {output} {wildcards.value}"
......@@ -83,10 +199,10 @@ rule meta_consensus:
rule exonerate_ref_result:
input:
files=expand(rules.meta_consensus.output, align="muscle", allow_missing=True) ,
ref=expand("{data}/seq_selectes_region/region_seq_r{region}.fasta", data=config["data"], allow_missing=True),
ref=expand("{data_in}/seq_selectes_region/region_seq_r{region}.fasta", data_in=config["data_in"], allow_missing=True),
consensus_per_tool=expand(rules.create_consensus_per_tool.output, tool=config["tool"], allow_missing=True)
output: "consensus/stats/{region}/aln_all_consensus_r{region}_d{depth}_t1_{threshold}_t2_{value}.txt"
log: "logs/exonerate_ref_result_r{region}_d{depth}_t1_{threshold}_t2{value}.log"
output: "{output_folder}/stats/{region}/aln_all_consensus_r{region}_d{depth}_t1_{threshold}_t2_{value}.txt"
log: "logs/{output_folder}/exonerate_ref_result_r{region}_d{depth}_t1_{threshold}_t2{value}.log"
params: format="\"%pi, %ps, %et, %em\n\""
conda: "envs/exonerate.yaml"
shell: "./exonerate_stats.sh {output} {input.ref} {input.files} {input.consensus_per_tool}"
......@@ -94,49 +210,55 @@ rule exonerate_ref_result:
rule plottable_exonerate_meta:
input: files=expand(rules.meta_consensus.output, align="muscle", value=config["threshold_2"], allow_missing=True) , ref=expand("{data}/seq_selectes_region/region_seq_r{region}.fasta", data=config["data"], allow_missing=True)
output: "consensus/plot/{region}/{depth}/{threshold}/aln_stats_meta.txt"
log: "logs/plot_exonerate_meta_r{region}_d{depth}_t{threshold}.log"
input: files=expand(rules.meta_consensus.output, align="muscle", value=config["threshold_2"], allow_missing=True) , ref=expand("{data_in}/seq_selectes_region/region_seq_r{region}.fasta", data_in=config["data_in"], allow_missing=True)
output: "{output_folder}/plot/{region}/{depth}/{threshold}/aln_stats_meta.txt"
log: "logs/{output_folder}plot_exonerate_meta_r{region}_d{depth}_t{threshold}.log"
conda: "envs/exonerate.yaml"
message: "Exonerate comparison between meta-consensus and ref (region: {wildcards.region}, depth: {wildcards.depth}, preliminary consensus threshold: {wildcards.threshold}) for meta-consensus threshold in [ {config[threshold_2]} ]"
shell: "./exonerate_stats.sh {output} {input.ref} {input.files}"
rule plottable_exonerate_tools:
input:
ref=expand("{data}/seq_selectes_region/region_seq_r{region}.fasta", data=config["data"], allow_missing=True),
ref=expand("{data_in}/seq_selectes_region/region_seq_r{region}.fasta", data_in=config["data_in"], allow_missing=True),
consensus_per_tool=expand(rules.create_consensus_per_tool.output, value=config["threshold_2"], allow_missing=True)
output: "consensus/plot/{region}/{depth}/{threshold}/aln_stats_{tool}.txt"
log: "logs/plot_exonerate_tools_{tool}_r{region}_d{depth}_t{threshold}.log"
output: "{output_folder}/plot/{region}/{depth}/{threshold}/aln_stats_{tool}.txt"
log: "logs/{output_folder}/plot_exonerate_tools_{tool}_r{region}_d{depth}_t{threshold}.log"
conda: "envs/exonerate.yaml"
shell: "./exonerate_stats.sh {output} {input.ref} {input.consensus_per_tool}"
message: "Exonerate comparison between consensus for tool {wildcards.tool} and ref (region: {wildcards.region}, depth: {wildcards.depth}, preliminary consensus threshold: {wildcards.threshold}) for meta-consensus threshold in [ {config[threshold_2]} ]"
shell: "./exonerate_stats.sh {output} {input.ref} {input.consensus_per_tool} 2> {log}"
rule plot_depth:
input:
expand(rules.plottable_exonerate_meta.output, depth=config["depth"], allow_missing=True),
params: target_folder="consensus/plot/{region}/graph"
log: "logs/plot_depth_r{region}_t{threshold}.log"
output: expand("consensus/plot/{region}/graph/graph_r{region}_t{threshold}_{graph}.png", graph=GRAPH_TYPES, depth=config["depth"], allow_missing=True)
shell: "./plot_results.py {params.target_folder}/graph_r{wildcards.region}_t{wildcards.threshold} {input}"
log: "logs/{output_folder}/plot_depth_r{region}_t{threshold}.log"
output: expand("{output_folder}/plot/{region}/graph/graph_r{region}_t{threshold}_{graph}.png", graph=GRAPH_TYPES, depth=config["depth"], allow_missing=True)
message: "Exonerate comparison between meta-consensus and ref (region: {wildcards.region}, or preliminary consensus threshold in [ {config[threshold_1]} ], meta-consensus threshold in [ {config[threshold_2]} ]"
shell: "./plot_results.py {params.target_folder}/graph_r{wildcards.region}_t{wildcards.threshold} {input} 2> {log}"
rule plot_t1:
input:
expand(rules.plottable_exonerate_meta.output, threshold=config["threshold_1"], allow_missing=True)
params: target_folder="consensus/plot/{region}/{depth}"
log: "logs/plot_t1_r{region}_d{depth}.log"
output: expand("consensus/plot/{region}/{depth}/graph_r{region}_d{depth}_{graph}.png", graph=GRAPH_TYPES, allow_missing=True)
shell: "./plot_results.py {params.target_folder}/graph_r{wildcards.region}_d{wildcards.depth} {input}"
params: target_folder="{output_folder}/plot/{region}/{depth}"
log: "logs/{output_folder}/plot_t1_r{region}_d{depth}.log"
output: expand("{output_folder}/plot/{region}/{depth}/graph_r{region}_d{depth}_{graph}.png", graph=GRAPH_TYPES, allow_missing=True)
message: "Creating plot for meta-consensus quality with preliminary threshold in [ {config[threshold_1]} ] and meta-consensus in [ {config[threshold_2]} ] (region: {wildcards.region}, depth: {wildcards.depth})"
shell: "./plot_results.py {params.target_folder}/graph_r{wildcards.region}_d{wildcards.depth} {input} 2> {log} "
rule plot_t2:
input: expand(rules.plottable_exonerate_tools.output, tool=config["tool"], allow_missing=True),
rules.plottable_exonerate_meta.output
params: target_folder="consensus/plot/{region}/{depth}/{threshold}/"
log: "logs/plot_t2_r{region}_d{depth}_t_{threshold}.log"
output: expand("consensus/plot/{region}/{depth}/{threshold}/graph_r{region}_d{depth}_t1_{threshold}_{graph}.png", graph=GRAPH_TYPES, allow_missing=True)
shell: "./plot_results.py {params.target_folder}graph_r{wildcards.region}_d{wildcards.depth}_t1_{wildcards.threshold} {input}"
params: target_folder="{output_folder}/plot/{region}/{depth}/{threshold}/"
log: "logs/{output_folder}/plot_t2_r{region}_d{depth}_t_{threshold}.log"
output: expand("{output_folder}/plot/{region}/{depth}/{threshold}/graph_r{region}_d{depth}_t1_{threshold}_{graph}.png", graph=GRAPH_TYPES, allow_missing=True)
message: "Creating plot for meta-consensus compared to preliminary consensus with meta-consensus threshold in {config[threshold_2]} (region: {wildcards.region}, depth: {wildcards.depth}, threshold: {wildcards.threshold})"
shell: "./plot_results.py {params.target_folder}graph_r{wildcards.region}_d{wildcards.depth}_t1_{wildcards.threshold} {input} 2> {log}"
rule make_plots:
input: expand(rules.plot_t2.output, region=config["region"], depth=config["depth"], threshold=config["threshold_1"]),
expand(rules.plot_t1.output, region=config["region"], depth=config["depth"]),
expand(rules.plot_depth.output, region=config["region"], threshold=config["threshold_1"])
\ No newline at end of file
input: expand(rules.plot_t2.output, region=config["region"], depth=config["depth"], threshold=config["threshold_1"], output_folder=config["output_folder"]),
expand(rules.plot_t1.output, region=config["region"], depth=config["depth"] , output_folder=config["output_folder"]),
expand(rules.plot_depth.output, region=config["region"], threshold=config["threshold_1"], output_folder=config["output_folder"])
message: "Created all plots"
\ No newline at end of file
data: "../../data"
data_in: "../../data"
data_out: "consensus/data"
format: "msa"
tool: ["abpoa", "kalign", "kalign3", "mafft", "muscle", "spoa"]
threshold_1: ["30", "40", "50", "60", "70", "80"]
......@@ -6,3 +7,4 @@ threshold_2 : ["30", "40", "50", "60", "70", "80"]
region: ["100", "200", "500", "1000", "2000"]
depth: ["10", "20", "50", "100", "150"]
align: ["muscle", "mafft"]
output_folder: "consensus"
\ No newline at end of file
name: abpoa
channels:
- bioconda
- defaults
dependencies:
- _libgcc_mutex=0.1=main
- _openmp_mutex=4.5=1_gnu
- abpoa=1.2.5=h5bf99c6_0
- libgcc-ng=9.3.0=h5101ec6_17
- libgomp=9.3.0=h5101ec6_17
- zlib=1.2.11=h7b6447c_3
name: kalign2
channels:
- bioconda
- defaults
dependencies:
- _libgcc_mutex=0.1=main
- _openmp_mutex=4.5=1_gnu
- kalign2=2.04=h779adbc_2
- libgcc-ng=9.3.0=h5101ec6_17
- libgomp=9.3.0=h5101ec6_17
name: kalign3
channels:
- bioconda
- defaults
dependencies:
- _libgcc_mutex=0.1=main
- _openmp_mutex=4.5=1_gnu
- kalign3=3.2.2=h779adbc_2
- libgcc-ng=9.3.0=h5101ec6_17
- libgomp=9.3.0=h5101ec6_17
name: muscle
channels:
- conda-forge
- bioconda
- defaults
dependencies:
- _libgcc_mutex=0.1=conda_forge
- _openmp_mutex=4.5=2_gnu
- libgcc-ng=11.2.0=h1d223b6_16
- libgomp=11.2.0=h1d223b6_16
- libstdcxx-ng=11.2.0=he4da1e4_16
- muscle=5.1=h9f5acd7_1
prefix: /home/flav/anaconda3/envs/muscle
- _libgcc_mutex=0.1=main
- _openmp_mutex=4.5=1_gnu
- libgcc-ng=9.3.0=h5101ec6_17
- libgomp=9.3.0=h5101ec6_17
- libstdcxx-ng=9.3.0=hd4cf53a_17
- muscle=3.8.1551=h7d875b9_6
name: spoa
channels:
- bioconda
- defaults
dependencies:
- _libgcc_mutex=0.1=main
- _openmp_mutex=4.5=1_gnu
- libgcc-ng=9.3.0=h5101ec6_17
- libgomp=9.3.0=h5101ec6_17
- libstdcxx-ng=9.3.0=hd4cf53a_17
- spoa=4.0.7=h9a82719_1
- zlib=1.2.11=h7b6447c_3
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please to comment