import os
import subprocess
import sys
import time
from scripts.extract_longest_read import extract_longest_read
# ------------------------------------ #
# Values tests and initialization #
if not "region_size" in config:
config["region_size"] = 0
else :
config["region_size"] = int(config["region_size"])
if not "data_out" in config:
config["data_out"] = os.path.join(config["output_folder"],"data")
if not "consensus_folder" in config:
config["consensus_folder"] = os.path.join(config["output_folder"],"consensus")
if not "tool" in config:
config["tool"] = [ "kalign2", "kalign3", "mafft", "muscle", "abpoa", "spoa"]
if not "align" in config:
config["align"] = "muscle"
if not "threshold_1" in config:
config["threshold_1"] = [50]
if not "threshold_2" in config:
config["threshold_2"] = [50]
if not "data_in" in config:
config["data_in"] = ""
config["early_concat"] = True
# These constants are modifiable values that need tuning to better adapt the warnings and the overall pipeline run. #
# HIGH_DEPTH and HIGH_LENGTH are the values above which warnings about slow runs are printed.
HIGH_DEPTH = 60
HIGH_LENGTH = 5000
# This constant represents the minimal coverage percentage required for a read to be kept for a specific region
READ_COVERAGE_PERCENTAGE = 50
# This constant represents the expected region size extracted before and after the mismatch position.
MISMATCH_AREA_SIZE = 40
REGION_HEAD_OVERLAP = int(config["region_overlap"]) if "region_overlap" in config else 0
# This constant is the relative path for MSA results #
DATASET = expand("{data}/{format}", data=config["data_out"], format="msa")
# This constant lists the metrics plotted in the graphs; it may need to be updated along with the exonerate stats and plotting rules
GRAPH_TYPES = ["identity", "similarity", "equivalence", "mismatches","sequence_length", "aligned_length"]
# Helper functions used when the pipeline runs without a reference, or without a defined length or depth
def get_length(file):
    """
    Gets the length of the first read in the target FASTA file.
    Assumes single-line records (one header line followed by one sequence line).
    :param file: the relative path to the file
    :return: the length of the first read
    """
    with open(file, 'r') as f:
        f.readline()                    # skip the header line
        seq = f.readline().strip()      # strip the newline so it is not counted in the length
    return len(seq)
def cut_region_from_length(length):
    """
    Creates region intervals of the configured size, with the last region ending at length - 1.
    Consecutive regions overlap by REGION_HEAD_OVERLAP bases.
    :param length: the total length to cut into regions
    :return: a list of regions written in the format "start_end"
    """
    if config["region_size"] == 0:
        return ["1_" + str(length - 1)]
    start = 1
    region = []
    while start < int(length):
        region.append(str(start) + "_" + str(min(start + config["region_size"], int(length) - 1)))
        start += config["region_size"] - REGION_HEAD_OVERLAP
    return region
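# Worked example (hypothetical values): with region_size=100 and region_overlap=0,
# cut_region_from_length(250) returns ["1_101", "101_201", "201_249"];
# with region_size=0 it returns the single region ["1_249"].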
def get_depth(file):
    """
    Gets the maximum depth of the target read file.
    Assumes single-line FASTA records, so the read count is the line count divided by two.
    :param file: the relative path to the read file
    :return: the maximum read depth
    """
    out = subprocess.run(["wc", "-l", file], capture_output=True, text=True)
    return int(out.stdout.split()[0]) // 2
def eprint(*args, **kwargs):
"""
This function is a wrapper to easily print in the error out
"""
print(*args, file=sys.stderr, **kwargs)
# depth_check is used in the pipeline to write the correct depth in the output file names.
if config["depth"] == 0:
    depth_check = get_depth(os.path.join(config["data_in"], config["read_file"]))
else:
    depth_check = config['depth']
if not isinstance(depth_check, list):
    depth_check = [depth_check]
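# Example (hypothetical): depth=0 with a 40-read single-line FASTA (80 lines) gives depth_check = [40],
# while an explicit depth=[10, 20] in the config is kept as-is.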
# This snippet creates the regions if they are not preset
if not 'region' in config:
if not 'length' in config:
if 'ref_file' in config:
config['length'] = get_length(os.path.join(config["data_in"],config["ref_file"]))
else:
anchor = extract_longest_read(os.path.join(config["data_in"], config["read_file"]), 1)[0]
config['length'] = len(anchor)
config['region']= cut_region_from_length(config['length'])
if config['region_size'] == 0 :
config['region_size'] = config['length']+1
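# Example (hypothetical): with neither 'region' nor 'length' in the config, a 950 bp reference (or longest
# read) and region_size=0, this yields config['region'] = ['1_949'] and then sets region_size to 951.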
# --------------------------------- #
# Print warnings for high depth and length #
if max(depth_check) > HIGH_DEPTH:
eprint("\nWarning: a high depth may cause the Multiple Sequence Alignment phase to be extremely SLOW")
if "muscle" in config["tool"]:
eprint("\t- muscle may be extremely SLOW for high depth")
if "abpoa" in config["tool"]:
eprint("\t- abpoa may require a lot of memory to run, consider disabling it or lowering the depth if you encounter problems allocating memory")
if "spoa" in config["tool"]:
eprint("\t- spoa may require a lot of memory to run, consider disabling it or lowering the depth if you encounter problems allocating memory")
time.sleep(1.3)
if 'length' in config and int(config['length']) > int(HIGH_LENGTH):
eprint("\nWarning: a high region length may cause the Multiple Sequence Alignment phase to be extremely SLOW")
if "muscle" in config["tool"]:
eprint("\t- muscle may be extremely SLOW for high length")
if "abpoa" in config["tool"]:
eprint("\t- abpoa may require a lot of memory to run, consider disabling it or working on smaller regions if you encounter problems allocating memory")
if "spoa" in config["tool"]:
eprint("\t- spoa may require a lot of memory to run, consider disabling it or working on smaller regions if you encounter problems allocating memory")
time.sleep(1.3)
# Generates the meta-consensus from given reads #
rule all:
input:
# expand("{output_folder}/meta_consensus/{region}/meta_consensus_{align}_r{region}_d{depth}_t1_{threshold}_t2_{value}.fasta", region=config["region"], depth=config["depth"], threshold=config["threshold_1"], value=config["threshold_2"], align="muscle", output_folder=config["output_folder"])
expand(os.path.join("{output_folder}","meta_consensus","meta_consensus_d{depth}_t1_{threshold}_t2_{value}.fasta"), depth=config["depth"], threshold=config["threshold_1"], value=config["threshold_2"], align="muscle", output_folder=config["output_folder"])
message: "Meta consensus can be found in {input}"
# -------------------------------------------------------- #
# Builds the MTool binary used for the consensus and meta-consensus, using the provided Makefile #
rule MTool:
input:
"src/msa_handler/MSA.cpp", "src/msa_handler/MSA.h", "src/msa_handler/main.cpp", "src/msa_handler/Makefile"
log: "logs/MTool.log"
output:
"src/msa_handler/MTool"
shell:
"cd src/msa_handler && make MTool"
# -------------------------------------------------------- #
# I. Creating a read pile
# -------------------------------------------------------- #
# The initial part of the pipeline differs depending on whether a reference file has been provided #
# This is the version with a reference #
if "ref_file" in config:
# -------------------------------------------------------- #
# Aligns the reads on a given reference with minimap2 #
rule alignment_reads_on_ref :
input :
ref=os.path.join(config["data_in"],config["ref_file"]),
read=os.path.join(config["data_in"], config["read_file"])
output :
os.path.join(config["data_out"],'alignment','aln_reads_on_ref.sam')
message :
"Minimap for {config[data_in]}"
log:
os.path.join(config["output_folder"],'logs','1_alignment_reads_on_ref.log')
conda:
"envs/minimap2.yaml"
shell :
'ORDER="minimap2 -cax map-ont -t 8 {input.ref} {input.read}";'
'echo "ORDER: $ORDER" >{log};'
'$ORDER >{output} 2>> {log}'
# Cuts the reads into smaller regions to reduce the overall size and lower the computational cost, and creates FASTA files used to build the MSAs #
rule reads_map_region:
input :
aln = os.path.join(config['data_out'],'alignment','aln_reads_on_ref.sam')
output :
os.path.join(config['data_out'],'read_map_region','read_r{start}_{end}.fasta')
message:
"Reads map region for (Region start= {wildcards.start} end= {wildcards.end})"
log:
os.path.join(config['output_folder'],'logs','1_reads_map_region_r{start}_{end}.log')
conda:
"envs/perl.yaml"
params: cvalue=READ_COVERAGE_PERCENTAGE
shell :
'ORDER="./workflow/scripts/reads_map_region.pl -s {wildcards.start} -e {wildcards.end} -c {READ_COVERAGE_PERCENTAGE} {input.aln} {output}";'
'echo "ORDER: $ORDER" >{log};'
'$ORDER 2>&1 >>{log}'
# -------------------------------------------------------- #
# This is the version without a reference
else :
# Extracts the longest read from the read pile to use as an anchor for a pre-alignment #
rule extract_longest_read:
input : read=os.path.join(config["data_in"],config["read_file"])
output : os.path.join(config["data_out"],"anchor.fasta")
message: "Extracting the longest read from {config[read_file]}"
shell : "./workflow/scripts/extract_longest_read.py -i {input.read} -o {output} -n 1 "
# Aligns the reads on the anchor read #
rule alignment_reads_on_anchor :
input :
ref=os.path.join(config["data_out"],"anchor.fasta"),
read=os.path.join(config["data_in"], config["read_file"])
output :
os.path.join(config["data_out"],'alignment','aln_reads_on_read.sam')
message :
"Minimap for {config[read_file]}"
log: os.path.join(config["output_folder"],'logs','1_alignment_reads_on_anchor.log')
conda:
"envs/minimap2.yaml"
shell :
'ORDER="minimap2 -cax map-ont -t 8 {input.ref} {input.read}";'
'echo "ORDER: $ORDER" >{log};'
'$ORDER >{output} 2>> {log}'
# This rule creates a file with possible region cuts where the anchor read appears to have insertion errors.
# It is not used by the pipeline yet, but may still be of interest overall.
rule get_regions_for_read :
    input : sam=rules.alignment_reads_on_anchor.output
output: os.path.join(config["data_out"], 'regions.txt')
message: "Getting the regions where the reads are aligned on the anchor"
conda: "envs/samtools.yaml"
params: depth=READ_COVERAGE_PERCENTAGE
shell: "./workflow/scripts/prepare_region_from_read.sh {input.sam} {output} {params.depth}"
# Cuts the reads into smaller regions to reduce the overall size and lower the computational cost, and creates FASTA files used to build the MSAs #
rule reads_map_regions_no_ref:
input: sam=rules.alignment_reads_on_anchor.output, regions=rules.get_regions_for_read.output #, fix={rules.set_region_elem.output}
output: os.path.join(config['data_out'], "read_map_region", r'read_r{start}_{end,\d+}.fasta')
conda: "envs/perl.yaml"
log:
os.path.join(config['output_folder'],'logs','1_reads_map_region_r{start}_{end}.log')
shell :
'ORDER="./workflow/scripts/reads_map_region.pl -s {wildcards.start} -e {wildcards.end} -c {READ_COVERAGE_PERCENTAGE} {input.sam} {output}";'
'echo "ORDER: $ORDER" >{log};'
'$ORDER 2>&1 >>{log}'
# -------------------------------------------------------- #
# This rule picks the number of reads kept for the target region
rule select_depth :
input :
os.path.join(config['data_out'],'read_map_region','read_r{region}.fasta')
output :
os.path.join(config['data_out'],'read_map_region','reads_r{region}_d{depth}.fasta')
message:
"Select depth for (Region: {wildcards.region} & Depth={wildcards.depth})"
log:
os.path.join(config['output_folder'],'logs','1_select_number_reads_r{region}_d{depth}.log')
shell :
'ORDER="./workflow/scripts/selected_nb_read.sh {input} {wildcards.depth} {output}";'
'echo "ORDER: $ORDER" >{log};'
'$ORDER >{output} 2>>{log}'
# -------------------------------------------------------- #
# II. MSA phase
# ------------------------------------------------------- #
# Creation of MSAs from the read pile using the selected tools #
# If you want to add an MSA tool to the pipeline, create a rule following this format, create a matching conda env, and add the tool *NAME* to the tool list in your config file.
#rule *NAME* :
# input :
# os.path.join(config["data_out"],"read_map_region",'reads_r{region}_d{depth}.fasta')
# output :
# out = os.path.join('{output_folder}', "data" ,"msa",'MSA_*NAME*_r{region}_d{depth}.fasta')
# message:
# "*NAME* for {wildcards.output_folder} (Region :{wildcards.region} & Depth={wildcards.depth})"
# log:
#        os.path.join('{output_folder}','logs','2_*NAME*_r{region}_d{depth}.log')
# conda:
# "envs/*NAME*.yaml"
# shell :
# "the shell command for your tool. with {input} as the input and {output} as the output"
# This will only work if the command output the files as written in output.
# If all is done correctly, you should be able to add the tool in the pipeline run
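# As an illustration of the template above, a hypothetical rule adding Clustal Omega (assuming a matching
# envs/clustalo.yaml exists and "clustalo" has been added to the tool list in the config) could look like:
#rule clustalo :
#    input :
#        os.path.join(config["data_out"],"read_map_region",'reads_r{region}_d{depth}.fasta')
#    output :
#        out = os.path.join('{output_folder}', "data", "msa", 'MSA_clustalo_r{region}_d{depth}.fasta')
#    message:
#        "Clustalo for {wildcards.output_folder} (Region :{wildcards.region} & Depth={wildcards.depth})"
#    log:
#        os.path.join('{output_folder}','logs','2_clustalo_r{region}_d{depth}.log')
#    conda:
#        "envs/clustalo.yaml"
#    shell :
#        "clustalo -i {input} -o {output.out} --outfmt=fasta --force 2> {log}"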
# -------------------------------------------------------- #
rule muscle :
input :
os.path.join('{output_folder}', 'data',"read_map_region",'reads_r{region}_d{depth}.fasta')
output :
out = os.path.join('{output_folder}', "data", "msa" ,'MSA_muscle_r{region}_d{depth}.fasta')
message:
"Muscle for {wildcards.output_folder} (Region :{wildcards.region} & Depth={wildcards.depth})"
log:
os.path.join('{output_folder}','logs','2_muscle_r{region}_d{depth}.log')
conda:
"envs/muscle.yaml"
shell :
"muscle -in {input} -out {output} 2> {log}"
rule mafft :
input :
os.path.join(config["data_out"],"read_map_region",'reads_r{region}_d{depth}.fasta')
output :
out = os.path.join('{output_folder}', "data" ,"msa" ,'mafft_uncorrected_r{region}_d{depth}.fasta')
message:
"Mafft for {wildcards.output_folder} (Region :{wildcards.region} & Depth={wildcards.depth})"
log:
os.path.join('{output_folder}','logs','2_mafft_r{region}_d{depth}.log')
conda:
"envs/mafft.yaml"
shell :
"mafft {input} > {output} 2> {log}"
rule mafft_correction :
input :
os.path.join('{output_folder}', "data" ,"msa",'mafft_uncorrected_r{region}_d{depth}.fasta')
output :
os.path.join('{output_folder}', "data" ,"msa",'MSA_mafft_r{region}_d{depth}.fasta')
message:
"Mafft correction for {wildcards.output_folder} (Region :{wildcards.region} & Depth={wildcards.depth})"
shell :
'cat {input} | tr "atgc" "ATGC" >{output} ; rm -f {input}'
rule spoa :
input :
os.path.join(config["data_out"],"read_map_region",'reads_r{region}_d{depth}.fasta')
output :
out = os.path.join('{output_folder}', "data" ,"msa",'MSA_spoa_r{region}_d{depth}.fasta')
message:
"Spoa for {wildcards.output_folder} (Region :{wildcards.region} & Depth={wildcards.depth})"
log:
os.path.join('{output_folder}','logs','2_spoa_r{region}_d{depth}.log')
conda:
"envs/spoa.yaml"
shell :
"spoa -r 1 -l 1 {input} > {output} 2> {log}"
rule kalign2 :
input :
os.path.join(config["data_out"],"read_map_region",'reads_r{region}_d{depth}.fasta')
output :
out = os.path.join('{output_folder}', "data" ,"msa",'MSA_kalign2_r{region}_d{depth}.fasta')
message:
"Kalign2 for {wildcards.output_folder} (Region :{wildcards.region} & Depth={wildcards.depth})"
log:
os.path.join('{output_folder}','logs','2_kalign_r{region}_d{depth}.log')
conda:
"envs/kalign2.yaml"
shell :
"kalign -i {input} -out {output.out} 1> {log} 2> {log}"
rule kalign3 :
input :
os.path.join(config["data_out"],"read_map_region",'reads_r{region}_d{depth}.fasta')
output :
out = os.path.join('{output_folder}', "data" ,"msa",'MSA_kalign3_r{region}_d{depth}.fasta')
message:
"Kalign3 for {wildcards.output_folder} (Region :{wildcards.region} & Depth={wildcards.depth})"
log:
os.path.join('{output_folder}','logs','2_kalign3_r{region}_d{depth}.log')
conda:
"envs/kalign3.yaml"
shell :
"kalign -i {input} -o {output.out} 1>{log} 2> {log}"
rule abpoa :
input :
os.path.join(config["data_out"],"read_map_region",'reads_r{region}_d{depth}.fasta')
output :
out = os.path.join('{output_folder}' , "data" ,"msa",'abpoa_uncorrected_r{region}_d{depth}.fasta')
message:
"Abpoa for {wildcards.output_folder} (Region :{wildcards.region} & Depth={wildcards.depth})"
log:
os.path.join('{output_folder}',"logs",'2_abpoa_r{region}_d{depth}.log')
conda:
"envs/abpoa.yaml"
shell :
"abpoa -r1 {input} > {output} 2> {log}"
rule abpoa_correction :
input :
os.path.join('{output_folder}' , "data" ,"msa",'abpoa_uncorrected_r{region}_d{depth}.fasta')
output :
os.path.join('{output_folder}', "data" ,"msa",'MSA_abpoa_r{region}_d{depth}.fasta')
message:
"Abpoa correction for {wildcards.output_folder} (Region :{wildcards.region} & Depth={wildcards.depth})"
shell :
'tail -n +2 {input} | sed "s/^/>seq\\n/" > {output} 2> /dev/null;'
# -------------------------------------------------------- #
# III. Consensus phase
# -------------------------------------------------------- #
# Creation of every individual tool's consensus for every region #
rule create_consensus_per_tool:
input:
rules.MTool.output, data=expand(os.path.join("{dataset}","MSA_{tool}_r{region}_d{depth}.fasta"), allow_missing=True, dataset=DATASET)
output:
os.path.join("{output_folder}", "consensus", "{region}","{depth}","{threshold}","consensus_{tool}_{region}_{depth}.fasta")
log: "{output_folder}/logs/consensus_per_tool_{tool}_r{region}_d{depth}_t{threshold}.log"
message: "Consensus for tools"
shell: "(./src/msa_handler/MTool {input.data} {output} {wildcards.threshold} ) 2> {log}"
# After the regional consensus, there are two ways to put the regions back together:
# a. build a meta-consensus per region and concatenate the regional meta-consensus at the end, or
# b. concatenate the per-tool consensus here, align the full-length sequences and build a single meta-consensus.
# Both options are implemented below.
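# As a quick reference: strategy a. produces one regional meta-consensus per region
# ("{output_folder}/meta_consensus/{region}/meta_consensus_{align}_r{region}_d{depth}_t1_{threshold}_t2_{value}.fasta")
# that rule meta_consensus then concatenates, while strategy b. concatenates the per-tool consensus first and
# produces "{output_folder}/meta_consensus/meta_consensus_concat_{align}_d{depth}_t1_{threshold}_t2_{value}.fasta".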
# -------------------------------------------------------- #
# a. Meta-consensus concatenation
# --------------------------------------------------------#
# III.a Consensus phase (end)
# --------------------------------------------------------#
# Creates the consensus pile in order to align it in the next phase
rule create_consensus:
input: expand(rules.create_consensus_per_tool.output, tool=config["tool"], allow_missing=True, dataset=DATASET)
params: data=DATASET
log: "{output_folder}/logs/3a_consensus_r{region}_d{depth}_t{threshold}.log"
output:
os.path.join("{output_folder}", "consensus" ,"{region}","{depth}","all_tools_consensus_{region}_{depth}_{threshold}.fasta")
message: "Consensus region: {wildcards.region} , depth: {wildcards.depth} , threshold: {wildcards.threshold}\n"
shell: 'cat {input} > {output} 2>{log}'
# "./workflow/scripts/MTool_over_folder.py -dir {params.data} -o {output} -region {wildcards.region} -depth {wildcards.depth} -threshold {wildcards.threshold}"#
# -------------------------------------------------------- #
# IV.a Meta-MSA phase
# -------------------------------------------------------- #
# Alignment of the consensus for the meta-consensus #
rule align_mafft:
input:
rules.create_consensus.output
output:
os.path.join("{output_folder}", "consensus","{region}","{depth}","mafft_consensus_{region}_{depth}_{threshold}.fasta")
conda:
"envs/mafft.yaml"
log: "{output_folder}/logs/4a_align_mafft_r{region}_d{depth}_t{threshold}.log"
message: "Alignment with mafft region: {wildcards.region} , depth: {wildcards.depth} , threshold: {wildcards.threshold}\n"
shell:
"mafft --auto {input} > {output} 2> {log}"
rule align_muscle:
input:
expand(rules.create_consensus.output, allow_missing=True)
output:
os.path.join("{output_folder}","consensus", "{region}","{depth}","muscle_consensus_{region}_{depth}_{threshold}.fasta")
conda:
"envs/muscle.yaml"
log: "{output_folder}/logs/4a_align_muscle_r{region}_d{depth}_t{threshold}.log"
message: "Alignment muscle region: {wildcards.region} , depth: {wildcards.depth} , threshold: {wildcards.threshold}\n"
shell:
"muscle -in {input} -out {output} > {log} 2>{log}"
# -------------------------------------------------------- #
# V.a Meta-consensus
# -------------------------------------------------------- #
# Individual meta consensus #
rule regional_meta_consensus:
input:
muscle=expand(rules.align_muscle.output, output_folder=config['output_folder'], allow_missing=True)
output:
os.path.join("{output_folder}","meta_consensus", "{region}", "meta_consensus_{align}_r{region}_d{depth}_t1_{threshold}_t2_{value}.fasta")
log: "{output_folder}/logs/5a_meta_consensus_{align}_r{region}_d{depth}_t1_{threshold}_t2_{value}.log"
message: "Meta-consensus region: {wildcards.region} , depth: {wildcards.depth} , consensus threshold: {wildcards.threshold} , meta consensus threshold: {wildcards.value}\n"
shell:
"./src/msa_handler/MTool {input.muscle} {output} {wildcards.value} 2> {log}"
rule meta_consensus:
input: expand(rules.regional_meta_consensus.output, region=config["region"], align="muscle" , allow_missing=True)
output: os.path.join("{output_folder}", "meta_consensus","meta_consensus_d{depth}_t1_{threshold}_t2_{value}.fasta")
log: "{output_folder}/logs/5a_meta_consensus_d{depth}_t1_{threshold}_t2_{value}.log"
message: "Concatenating regional meta-consensus into a single sequence"
shell:
"./workflow/scripts/concat_regions.py {output} {input} 2> {log}"
# ------------------------------------------------------- #
# b. Concatenating regions before the meta-consensus phase #
# ------------------------------------------------------- #
if "early_concat" in config:
# -------------------------------------------------------- #
# III.b Consensus phase (end)
# --------------------------------------------------------#
# Concatenates the consensus into a single sequence
rule create_consensus_per_tool_concat:
input:
expand(rules.create_consensus_per_tool.output, region=config["region"], allow_missing=True)
output:
os.path.join("{output_folder}", "consensus", "{depth}","{threshold}","consensus_{tool}_{depth}.fasta")
log: "{output_folder}/logs/3b_consensus_per_tool_concat_{tool}_d{depth}_t{threshold}.log"
message: "Consensus for tools"
shell: "./workflow/scripts/concat_regions.py {output} {input}" # >{log} 2>&1"
# Creates the consensus pile for the meta-MSA phase
rule create_consensus_concat:
input: expand(rules.create_consensus_per_tool_concat.output, tool=config["tool"], allow_missing=True, dataset=DATASET)
params: data=DATASET
log: "{output_folder}/logs/3b_consensus_d{depth}_t{threshold}.log"
output:
os.path.join("{output_folder}", "consensus", "{depth}","all_tools_consensus_{depth}_{threshold}.fasta")
message: "Consensus depth: {wildcards.depth} , threshold: {wildcards.threshold}\n"
shell: 'cat {input} > {output} 2>{log}'
# -------------------------------------------------------- #
# IV.b Meta-MSA phase
# -------------------------------------------------------- #
# Creates the meta-MSA using muscle
rule align_muscle_concat:
input:
expand(rules.create_consensus_concat.output, output_folder=config["output_folder"], allow_missing=True)
output:
os.path.join("{output_folder}","consensus", "{depth}","muscle_consensus_{depth}_{threshold}.fasta")
conda:
"envs/muscle.yaml"
log: "{output_folder}/logs/4b_align_muscle_d{depth}_t{threshold}.log"
message: "Alignment muscle depth: {wildcards.depth} , threshold: {wildcards.threshold}\n"
shell:
"muscle -in {input} -out {output} > {log} 2>{log}"
# -------------------------------------------------------- #
# V.b Meta consensus phase#
# -------------------------------------------------------- #
rule meta_consensus_concat:
input:
muscle=expand(rules.align_muscle_concat.output, output_folder=config['output_folder'], allow_missing=True)
output:
os.path.join("{output_folder}","meta_consensus", "meta_consensus_concat_{align}_d{depth}_t1_{threshold}_t2_{value}.fasta")
log: "{output_folder}/logs/5b_meta_consensus_{align}_d{depth}_t1_{threshold}_t2_{value}.log"
message: "Meta-consensus depth: {wildcards.depth} , consensus threshold: {wildcards.threshold} , meta consensus threshold: {wildcards.value}\n"
shell:
"./src/msa_handler/MTool {input.muscle} {output} {wildcards.value}"
rule concat_before_consensus:
input: expand(rules.meta_consensus_concat.output, output_folder=config["output_folder"], align="muscle", depth=config["depth"], threshold=config["threshold_1"], value=config["threshold_2"])
# --------------------------------------------------------#
# This part handles the comparison to a reference and creates plots to better visualize the efficiency of the consensus strategy used, for both the preliminary consensus and the meta-consensus. #
if "ref_file" in config:
# -------------------------------------------------------- #
# VI. Meta-consensus analysis
# -------------------------------------------------------- #
# VI.a Regional analysis
# -------------------------------------------------------- #
# Cut the reference for every region
rule region_seq :
input :
ref =os.path.join(config["data_in"],config["ref_file"])
output :
os.path.join(config['output_folder'], "data", "region_references","region_seq_r{start}_{end}.fasta")
message:
"Region's seq for (Region {wildcards.start} - {wildcards.end})"
log:
os.path.join(config['output_folder'],"logs","6_region_seq_r{start}_{end}.log")
shell :
'ORDER="./workflow/scripts/region_seq.sh {wildcards.start} {input.ref} {output} {wildcards.end}";'
'echo "ORDER: $ORDER" >{log};'
'$ORDER 2>&1 >>{log}'
# Exonerate sequence comparison between meta-consensus and reference for each region
rule exonerate_stats_meta:
input: files=expand(rules.regional_meta_consensus.output, align="muscle", value=config["threshold_2"], allow_missing=True) , ref=expand("{ref}/region_references/region_seq_r{region}.fasta", data_in=config["data_in"], ref=config["data_out"], allow_missing=True)
output: os.path.join("{output_folder}","plot","{region}","{depth}","{threshold}","aln_stats_meta.txt")
log: "{output_folder}/logs/6_plot_exonerate_meta_r{region}_d{depth}_t{threshold}.log"
conda: "envs/exonerate.yaml"
message: "Exonerate comparison between meta-consensus and ref (region: {wildcards.region}, depth: {wildcards.depth}, preliminary consensus threshold: {wildcards.threshold}) for meta-consensus threshold in [ {config[threshold_2]} ]"
shell: "./workflow/scripts/exonerate_stats.sh {output} {input.ref} {input.files}"
# Exonerate sequence comparison between consensus and reference for each region and each tools
rule exonerate_stats_tools:
input:
ref=expand("{ref}/region_references/region_seq_r{region}.fasta", data_in=config["data_in"], ref=config["data_out"], allow_missing=True),
consensus_per_tool=expand(rules.create_consensus_per_tool.output, value=config["threshold_2"], allow_missing=True)
output: os.path.join("{output_folder}","plot","{region}","{depth}","{threshold}","aln_stats_{tool}.txt")
log: "{output_folder}/logs/6_plot_exonerate_tools_{tool}_r{region}_d{depth}_t{threshold}.log"
conda: "envs/exonerate.yaml"
message: "Exonerate comparison between consensus for tool {wildcards.tool} and ref (region: {wildcards.region}, depth: {wildcards.depth}, preliminary consensus threshold: {wildcards.threshold}) for meta-consensus threshold in [ {config[threshold_2]} ]"
shell: "./workflow/scripts/exonerate_stats.sh {output} {input.ref} {input.consensus_per_tool} 2> {log}"
# Stats for every region for each tool
rule stats_region_align:
input: expand(rules.exonerate_stats_tools.output, region=config["region"], allow_missing=True)
output: os.path.join("{output_folder}","plot","{depth}","{threshold}","aln_stats_region_{tool}.txt")
log: "{output_folder}/logs/6_plot_exonerate_tools_regions_{tool}_d{depth}_t{threshold}.log"
conda: "envs/exonerate.yaml"
message: "Exonerate comparison between consensus for all regions tool {wildcards.tool} and ref (depth: {wildcards.depth}, preliminary consensus threshold: {wildcards.threshold})"
shell: "cat {input} > {output} 2> {log}"
# Stats for every region for the meta-consensus
rule stats_region_meta:
input: expand(rules.exonerate_stats_meta.output, region=config["region"], allow_missing=True)
output: os.path.join("{output_folder}","plot","{depth}","{threshold}","meta_aln_stats_region.txt")
log: "{output_folder}/logs/6_plot_exonerate_meta_regions_d{depth}_t{threshold}.log"
conda: "envs/exonerate.yaml"
message: "Exonerate comparison between meta-consensus for all regions and ref (depth: {wildcards.depth}, preliminary consensus threshold: {wildcards.threshold})"
shell: "cat {input} > {output} 2> {log}"
# Plots for every region, every consensus threshold for given depth
rule plot_region_all:
input: expand(rules.stats_region_align.output, threshold=config["threshold_1"], tool=config["tool"], output_folder=config["output_folder"], allow_missing=True) , expand(rules.stats_region_meta.output, threshold=config["threshold_1"], output_folder=config["output_folder"], allow_missing=True)
params: target_folder="{plot_folder}/plot/{depth}" , region=[elem for elem in config["region"]]
log: "{plot_folder}/logs/6_plot_t1_all_region_d{depth}.log"
output: expand(os.path.join("{plot_folder}","plot","{depth}","graph_all_region_d{depth}_{graph}.png"), graph=GRAPH_TYPES, allow_missing=True)
message: "Creating plot for meta-consensus quality with preliminary threshold in [ {config[threshold_1]} ] and meta-consensus in [ {config[threshold_2]} ] (depth: {wildcards.depth})"
shell: "./workflow/scripts/plot_results.py {wildcards.plot_folder}/plot/{wildcards.depth}/graph_all_region_d{wildcards.depth} '[{params.region}]' {input} 2> {log} "
# Plots by depth for meta-consensus regions
rule plot_depth:
input: expand(rules.exonerate_stats_meta.output, depth=config["depth"], output_folder=config['output_folder'], allow_missing=True)
params: target_folder="plot/{region}/graph"
log: "{plot_folder}/logs/6_plot_depth_r{region}_t{threshold}.log"
output: expand(os.path.join("{plot_folder}","plot","{region}","graph","graph_depth_r{region}_t{threshold}_{graph}.png"), graph=GRAPH_TYPES, depth=config["depth"], allow_missing=True)
message: "Exonerate comparison between meta-consensus and ref (region: {wildcards.region}, or preliminary consensus threshold in [ {config[threshold_1]} ], meta-consensus threshold in [ {config[threshold_2]} ]"
shell: "./workflow/scripts/plot_results.py {wildcards.plot_folder}/plot/{wildcards.region}/graph/graph_depth_r{wildcards.region}_t{wildcards.threshold} '[{config[threshold_2]}]' {input} 2> {log}"
# Plots by depth for every region
rule plot_depth_all:
input: expand(rules.exonerate_stats_meta.output, depth=config["depth"], output_folder=config["output_folder"], allow_missing=True), expand(rules.exonerate_stats_tools.output, tool=config["tool"], depth=config["depth"], output_folder=config["output_folder"], allow_missing=True)
params: target_folder="consensus/plot/{region}/graph"
log: "{plot_folder}/logs/plot_all_depth_r{region}_t{threshold}.log"
output: expand(os.path.join("{plot_folder}","plot","{region}","graph","graph_depth_all_r{region}_t{threshold}_{graph}.png"), graph=GRAPH_TYPES, depth=config["depth"], allow_missing=True)
message: "Exonerate comparison between meta-consensus and ref (region: {wildcards.region}, or preliminary consensus threshold in [ {config[threshold_1]} ], meta-consensus threshold in [ {config[threshold_2]} ]"
shell: "./workflow/scripts/plot_results.py {wildcards.plot_folder}/plot/{wildcards.region}/graph/graph_depth_all_r{wildcards.region}_t{wildcards.threshold} '[{config[threshold_2]}]' {input} 2> {log}"
# Plots for every consensus threshold
rule plot_t1:
input: expand(rules.exonerate_stats_meta.output, threshold=config["threshold_1"], output_folder=config["output_folder"], allow_missing=True)
params: target_folder="{plot_folder}/plot/{region}/{depth}"
log: "{plot_folder}/logs/plot_t1_r{region}_d{depth}.log"
output: expand(os.path.join("{plot_folder}","plot","{region}","{depth}","graph_r{region}_d{depth}_{graph}.png"), graph=GRAPH_TYPES, allow_missing=True)
message: "Creating plot for meta-consensus quality with preliminary threshold in [ {config[threshold_1]} ] and meta-consensus in [ {config[threshold_2]} ] (region: {wildcards.region}, depth: {wildcards.depth})"
shell: "./workflow/scripts/plot_results.py {wildcards.plot_folder}/plot/{wildcards.region}/{wildcards.depth}/graph_r{wildcards.region}_d{wildcards.depth} '[{config[threshold_2]}]' {input} 2> {log} "
# Plots for every meta-consensus threshold
rule plot_t2:
input: expand(rules.exonerate_stats_tools.output, tool=config["tool"], output_folder=config["output_folder"], allow_missing=True),
expand(rules.exonerate_stats_meta.output, output_folder=config["output_folder"], allow_missing=True)
params: target_folder="{plot_folder}/plot/{region}/{depth}/{threshold}/"
log: "{plot_folder}/logs/plot_t2_r{region}_d{depth}_t_{threshold}.log"
output: expand(os.path.join("{plot_folder}","plot","{region}","{depth}","{threshold}","graph_r{region}_d{depth}_t1_{threshold}_{graph}.png"), graph=GRAPH_TYPES, allow_missing=True)
message: "Creating plot for meta-consensus compared to preliminary consensus with meta-consensus threshold in {config[threshold_2]} (region: {wildcards.region}, depth: {wildcards.depth}, threshold: {wildcards.threshold})"
shell: "./workflow/scripts/plot_results.py {wildcards.plot_folder}/plot/{wildcards.region}/{wildcards.depth}/{wildcards.threshold}/graph_r{wildcards.region}_d{wildcards.depth}_t1_{wildcards.threshold} '[{config[threshold_2]}]' {input} 2> {log}"
# ------------------------------------------------------ #
# VI.b Regions concatenated after the meta-consensus #
# ------------------------------------------------------ #
# Exonerate for pair-wise alignment
rule comparison_ref_metaconsensus:
input: query=rules.meta_consensus.output, target=os.path.join(config["data_in"], config["ref_file"])
output: os.path.join("{output_folder}", "mismatches", "alignment_meta_ref_d{depth}_t1_{threshold}_t2_{value}.fasta")
conda: "envs/exonerate.yaml"
shell: "exonerate --bestn 1 -Q dna -E -m a:g --showvulgar false --showsugar false --showcigar false --showalignment true {input.target} {input.query} > {output} 2>/dev/null"
# Exonerate for stats
rule stats_ref_metaconsensus:
input: query=rules.meta_consensus.output, target=os.path.join(config["data_in"], config["ref_file"])
output: os.path.join("{output_folder}", "plot", "stats_meta_ref_d{depth}_t1_{threshold}_t2_{value}.fasta")
conda: "envs/exonerate.yaml"
shell: "./workflow/scripts/exonerate_stats.sh {output} {input.target} {input.query}"
# Extracting the mismatching regions, using exonerate to align the meta-consensus with the reference to find them.
rule extract_mismatches:
input: rules.comparison_ref_metaconsensus.output
output: os.path.join("{output_folder}", "mismatches", "mismatches_d{depth}_t1_{threshold}_t2_{value}.fasta")
shell: "./src/util/exonerate_parser.py {input} {output} {MISMATCH_AREA_SIZE} {config[region_size]} '{wildcards.output_folder}/data/read_map_region' '{config[region]}' "
# ------------------------------------------------------ #
# VI.c Regions concatenated after the consensus phase #
# ------------------------------------------------------ #
# Exonerate for pair-wise alignment
rule comparison_ref_metaconsensus_concat:
input: query=expand(rules.meta_consensus_concat.output, align="muscle", allow_missing=True), target=os.path.join(config["data_in"], config["ref_file"])
output: os.path.join("{output_folder}", "mismatches", "alignment_meta_concat_ref_d{depth}_t1_{threshold}_t2_{value}.fasta")
conda: "envs/exonerate.yaml"
shell: "exonerate --bestn 1 -Q dna -E -m a:g --showvulgar false --showsugar false --showcigar false --showalignment true {input.target} {input.query} > {output} 2>/dev/null"
# Exonerate for stats
rule stats_ref_metaconsensus_concat:
input: query=expand(rules.meta_consensus_concat.output, align="muscle", allow_missing=True), target=os.path.join(config["data_in"], config["ref_file"])
output: os.path.join("{output_folder}", "plot", "stats_meta_concat_ref_d{depth}_t1_{threshold}_t2_{value}.fasta")
conda: "envs/exonerate.yaml"
shell: "./workflow/scripts/exonerate_stats.sh {output} {input.target} {input.query}"
# Extracting the mismatching regions, using exonerate to align the meta-consensus with the reference to find them.
rule extract_mismatches_concat:
input: rules.comparison_ref_metaconsensus_concat.output
output: os.path.join("{output_folder}", "mismatches", "mismatches_concat_d{depth}_t1_{threshold}_t2_{value}.fasta")
shell: "./src/util/exonerate_parser.py {input} {output} {MISMATCH_AREA_SIZE} {config[region_size]} '{wildcards.output_folder}/data/read_map_region' '{config[region]}' "
# --------------------------------------------------------#
# VI.d Comparisons between the two concatenation strategies
# --------------------------------------------------------#
# Plots comparing the two strategies of concatenation #
# Stats for all meta_consensus threshold for concatenated meta-consensus
rule stats_ref_metaconsensus_all_t2:
input: expand(rules.stats_ref_metaconsensus.output, value=config["threshold_2"],allow_missing=True)
output: os.path.join("{output_folder}", "plot", "stats_meta_ref_d{depth}_t1_{threshold}_tALL.fasta")
message: "Regroup every meta-consensus stats "
shell: "cat {input} > {output}"
# Stats for all meta_consensus threshold for concatenated consensus
rule stats_ref_metaconsensus_all_t2_concat:
input: expand(rules.stats_ref_metaconsensus_concat.output, value=config["threshold_2"],allow_missing=True)
output: os.path.join("{output_folder}", "plot", "stats_meta_concat_ref_d{depth}_t1_{threshold}_tALL.fasta")
shell: "cat {input} > {output}"
# Plot for concatenated consensus, and both concatenated meta-consensus strategies
rule plot_depth_all_concat:
input: expand(rules.stats_ref_metaconsensus_all_t2.output, depth=config["depth"], output_folder=config["output_folder"], allow_missing=True), expand(rules.stats_ref_metaconsensus_all_t2_concat.output, depth=config["depth"], output_folder=config["output_folder"], allow_missing=True), expand(rules.exonerate_stats_tools.output, tool=config["tool"], depth=config["depth"], output_folder=config["output_folder"], allow_missing=True)
params: target_folder="consensus/plot/graph"
log: "{plot_folder}/logs/plot_all_depth_concat_t{threshold}.log"
output: expand(os.path.join("{plot_folder}","plot","graph","graph_depth_all_concat_t{threshold}_{graph}.png"), graph=GRAPH_TYPES, depth=config["depth"], allow_missing=True)
message: "Exonerate comparison between meta-consensus and ref (preliminary consensus threshold in [ {config[threshold_1]} ], meta-consensus threshold in [ {config[threshold_2]} ]"
shell: "./workflow/scripts/plot_results.py {wildcards.plot_folder}/plot/graph/graph_depth_all_concat_t{wildcards.threshold} '[{config[threshold_2]}]' {input} 2> {log}"
# Creates every plot (uncomment specific plots to add them to the analysis; adding sections will increase the run time)
rule plot:
input: # VI.a Regional comparisons
#expand(rules.plot_t1.output, region=config["region"], depth=config["depth"] , plot_folder=config["output_folder"]),
#expand(rules.plot_t2.output, region=config["region"], depth=config["depth"], threshold=config["threshold_1"], plot_folder=config["output_folder"]),
#expand(rules.plot_region_all.output, region=config["region"], depth=config["depth"] , plot_folder=config["output_folder"]),
#expand(rules.plot_depth.output, region=config["region"], threshold=config["threshold_1"], plot_folder=config["output_folder"]),
#expand(rules.plot_depth_all.output, region=config["region"], threshold=config["threshold_1"], plot_folder=config["output_folder"]),
#expand(rules.plot_depth_all_concat.output, threshold=config["threshold_1"], plot_folder=config["output_folder"]),
# VI.b Meta-consensus concatenation
expand(rules.extract_mismatches.output , threshold=config["threshold_1"], value=config["threshold_2"], output_folder=config["output_folder"], depth=config["depth"]),
expand(rules.stats_ref_metaconsensus.output , threshold=config["threshold_1"], value=config["threshold_2"], output_folder=config["output_folder"], depth=config["depth"])
# VI.c Consensus concatenation
#expand(rules.stats_ref_metaconsensus_concat.output , threshold=config["threshold_1"], value=config["threshold_2"], output_folder=config["output_folder"], depth=config["depth"])
#expand(rules.extract_mismatches_concat.output , threshold=config["threshold_1"], value=config["threshold_2"], output_folder=config["output_folder"], depth=config["depth"]),
# VI.d Strategies comparisons
#expand(rules.plot_depth_all_concat.output, threshold=config["threshold_1"], plot_folder=config["output_folder"]),
message: "Completed all tasks"
else :
rule plot:
run: sys.exit("Plotting requires a reference to create statistics")