Skip to content
Snippets Groups Projects
Commit fdcaab75 authored by Lihouck Flavien's avatar Lihouck Flavien
Browse files

Version

parent e765ad48
No related branches found
No related tags found
No related merge requests found
......@@ -11,7 +11,7 @@ config_file = "workflow/config.yaml"
def create_config() :
pattern = ""
parser = argparse.ArgumentParser(description="Creates the config file then runs the Meta-consensus pipeline", add_help=False)
parser = argparse.ArgumentParser(usage="mc_msa -i INPUT -o OUTPUT [-r REFERENCE -t TOOLS ...]" ,description="Creates the config file then runs the Meta-consensus pipeline", add_help=False)
required = parser.add_argument_group("Required arguments")
basic = parser.add_argument_group("Standard arguments")
advanced = parser.add_argument_group("Advanced arguments")
......@@ -20,20 +20,23 @@ def create_config() :
required.add_argument("-output", "-o", help="Target directory for the pipeline results", required=True)
basic.add_argument("-reference", "-r", help="Reference for alignment and statistics")
advanced.add_argument("-list", help="A list of regions to work on (format: [r1, r2, ...] or [rStart_End, ...]) (default: no region)")
basic.add_argument("-size", "-s", help="The desired region size (default: maximum)", default='0')
advanced.add_argument("-size", "-s", help="The desired region size (default: maximum)", default='0')
basic.add_argument("-tools", "-t", help="The list of tools to use in the meta-consensus (default: ['abpoa', 'spoa', 'kalign2', 'kalign3', 'mafft', 'muscle'])", default="['abpoa', 'spoa', 'kalign2', 'kalign3', 'mafft', 'muscle']")
basic.add_argument("-consensus_threshold", "-ct", help="Threshold(s) used for the MSA consensus step (default: [70])", default="[70]")
basic.add_argument("-metaconsensus_threshold", "-mt", help="Threshold(s) used for the Meta-consensus result (default: [60])", default="[60]")
basic.add_argument("-depth", "-d", help="The depth used in the process (default: max)", default="0")
advanced.add_argument("-consensus_threshold", "-ct", help="Threshold(s) used for the MSA consensus step (default: [70])", default="[70]")
advanced.add_argument("-metaconsensus_threshold", "-mt", help="Threshold(s) used for the Meta-consensus result (default: [60])", default="[60]")
advanced.add_argument("-depth", "-d", help="The depth used in the process (default: max)", default="0")
#parser.add_argument("-plot", help="Analyse the meta-consensus and MSA consensus quality (requires reference)", action='store_true')
advanced.add_argument("-region_overlap", help="The size of the overlap between regions", default="50")
advanced.add_argument("-cores", help="The amount of cores to use in the pipeline run (default 1)", default=1)
advanced.add_argument("-region_overlap", help="The size of the overlap between regions", default="0")
basic.add_argument("-cores", "-c", help="The amount of cores to use in the pipeline run (default 1)", default=1)
args = parser.parse_args()
with open(config_file, 'w') as c_file:
c_file.write('read_file: ' + args.input + "\n")
c_file.write("output_folder: "+args.output + "\n")
if args.output[-1] == "/":
output = args.output[:-1]
else:
output = args.output
c_file.write("output_folder: "+ output + "\n")
if args.reference:
c_file.write("ref_file: "+args.reference+"\n")
region_param = False
......@@ -64,7 +67,7 @@ def create_config() :
c_file.write("threshold_2: "+ args.metaconsensus_threshold +"\n")
c_file.write("depth: "+args.depth +" \n")
plot = 'plot'
plot = ''
if args.reference:
plot = 'plot'
return {"core": str(args.cores), "plot" : plot}
......
......@@ -15,6 +15,7 @@ else :
if not "data_out" in config:
config["data_out"] = os.path.join(config["output_folder"],"data")
if not "consensus_folder" in config:
config["consensus_folder"] = os.path.join(config["output_folder"],"consensus")
......@@ -34,10 +35,13 @@ if not "data_in" in config:
config["data_in"] = ""
#
HIGH_DEPTH = 60
HIGH_LENGTH = 5000
DEPTH_MIN = 50
REGION_HEAD_OVERLAP = int(config["region_overlap"]) if "region_overlap" in config else 20
REGION_HEAD_OVERLAP = int(config["region_overlap"]) if "region_overlap" in config else 0
DATASET = expand("{data}/{format}", data=config["data_out"], format="msa")
GRAPH_TYPES = ["identity", "similarity", "equivalence", "mismatches","sequence_length", "aligned_length"]
......@@ -72,15 +76,6 @@ if config["depth"] == 0:
else:
depth_check = config['depth']
if int(depth_check) > HIGH_DEPTH:
eprint("\nWarning: a high depth may cause the Multiple Sequence Alignment phase to be extremely SLOW")
if "muscle" in config["tool"]:
eprint("\t- muscle may be extremely SLOW for high depth")
if "abpoa" in config["tool"]:
eprint("\t- abpoa may require a lot of memory to run, consider disabling it or lowering the depth if you encounter problems allocating memory")
if "spoa" in config["tool"]:
eprint("\t- spoa may require a lot of memory to run, consider disabling it or lowering the depth if you encounter problems allocating memory")
time.sleep(1.7)
if not 'region' in config:
if not 'length' in config:
......@@ -91,6 +86,20 @@ if not 'region' in config:
config['length'] = len(anchor)
config['region']= cut_region_from_length(config['length'])
# --------------------------------- #
# Print warnings for high depth and length #
if int(depth_check) > HIGH_DEPTH:
eprint("\nWarning: a high depth may cause the Multiple Sequence Alignment phase to be extremely SLOW")
if "muscle" in config["tool"]:
eprint("\t- muscle may be extremely SLOW for high depth")
if "abpoa" in config["tool"]:
eprint("\t- abpoa may require a lot of memory to run, consider disabling it or lowering the depth if you encounter problems allocating memory")
if "spoa" in config["tool"]:
eprint("\t- spoa may require a lot of memory to run, consider disabling it or lowering the depth if you encounter problems allocating memory")
time.sleep(1.3)
if 'length' in config and int(config['length']) > HIGH_LENGTH:
eprint("\nWarning: a high region length may cause the Multiple Sequence Alignment phase to be extremely SLOW")
if "muscle" in config["tool"]:
......@@ -99,9 +108,10 @@ if 'length' in config and int(config['length']) > HIGH_LENGTH:
eprint("\t- abpoa may require a lot of memory to run, consider disabling it or working on smaller regions if you encounter problems allocating memory")
if "spoa" in config["tool"]:
eprint("\t- spoa may require a lot of memory to run, consider disabling it or working on smaller regions if you encounter problems allocating memory")
time.sleep(2)
# Generates the meta-consensus from given reads #
time.sleep(1.3)
# Generates the meta-consensus from given reads #
rule all:
input:
# expand("{output_folder}/meta_consensus/{region}/meta_consensus_{align}_r{region}_d{depth}_t1_{threshold}_t2_{value}.fasta", region=config["region"], depth=config["depth"], threshold=config["threshold_1"], value=config["threshold_2"], align="muscle", output_folder=config["output_folder"])
......@@ -144,9 +154,6 @@ if "ref_file" in config:
'echo "ORDER: $ORDER" >{log};'
'$ORDER >{output} 2>> {log}'
# config['length'] = get_length('os.path.join(config["data_in"],config["ref_file"])'))
# config['region'] = cut_region_from_length(config['length'])
# Cuts the reads into smaller regions to reduce the overall size and lower the computational cost, and creates fasta to build MSA#
rule reads_map_region:
......@@ -186,7 +193,7 @@ else :
output :
os.path.join(config["data_out"],'alignment','aln_reads_on_read.sam')
message :
"Minimap for {config['read_file']}"
"Minimap for {config[read_file]}"
log: os.path.join(config["output_folder"],'logs','2_alignment_reads_on_anchor.log')
conda:
"envs/minimap2.yaml"
......@@ -196,17 +203,12 @@ else :
'$ORDER >{output} 2>> {log}'
# Select the region to keep from the anchoring #
# config['length'] = get_length(config['os.path.join(config["data_in"],config["ref_file"])'))
# config['region'] = cut_region_from_length(config['length'])
rule get_regions_for_read :
input : sam={rules.alignment_reads_on_anchor.output}
output: os.path.join(config["data_out"], 'regions.txt')
message: "Getting the regions where the reads are aligned on the anchor"
conda: "envs/samtools.yaml"
params: depth=max(config["depth"])
params: depth=DEPTH_MIN
shell: "./workflow/scripts/prepare_region_from_read.sh {input.sam} {output} {params.depth}"
# rule set_region_elem:
......@@ -225,16 +227,10 @@ else :
log:
os.path.join(config['output_folder'],'logs','3_reads_map_region_r{start}_{end}.log')
shell :
'ORDER="./workflow/scripts/reads_map_region.pl -s {wildcards.start} -e {wildcards.end} -c 100 {input.sam} {output}";'
'ORDER="./workflow/scripts/reads_map_region.pl -s {wildcards.start} -e {wildcards.end} -c {DEPTH_MIN} {input.sam} {output}";'
'echo "ORDER: $ORDER" >{log};'
'$ORDER 2>&1 >>{log}'
#'./workflow/scripts/cut_regions_with_file {input.sam} {wildcards.region}'
# -------------------------------------------------------- #
# Depth selection
......@@ -265,7 +261,7 @@ rule muscle :
message:
"Muscle for {wildcards.output_folder} (Region size={wildcards.region} & Depth={wildcards.depth})"
log:
os.path.join('logs','{output_folder}','6_muscle_r{region}_d{depth}.log')
os.path.join('{output_folder}','logs','6_muscle_r{region}_d{depth}.log')
conda:
"envs/muscle.yaml"
shell :
......@@ -279,7 +275,7 @@ rule mafft :
message:
"Mafft for {wildcards.output_folder} (Region size={wildcards.region} & Depth={wildcards.depth})"
log:
os.path.join('logs','{output_folder}','6_mafft_r{region}_d{depth}.log')
os.path.join('{output_folder}','logs','6_mafft_r{region}_d{depth}.log')
conda:
"envs/mafft.yaml"
shell :
......@@ -303,7 +299,7 @@ rule spoa :
message:
"Spoa for {wildcards.output_folder} (Region size={wildcards.region} & Depth={wildcards.depth})"
log:
os.path.join('logs','{output_folder}','6_spoa_r{region}_d{depth}.log')
os.path.join('{output_folder}','logs','6_spoa_r{region}_d{depth}.log')
conda:
"envs/spoa.yaml"
shell :
......@@ -317,7 +313,7 @@ rule kalign2 :
message:
"Kalign2 for {wildcards.output_folder} (Region size={wildcards.region} & Depth={wildcards.depth})"
log:
os.path.join('logs','{output_folder}','6_kalign_r{region}_d{depth}.log')
os.path.join('{output_folder}','logs','6_kalign_r{region}_d{depth}.log')
conda:
"envs/kalign2.yaml"
shell :
......@@ -331,7 +327,7 @@ rule kalign3 :
message:
"Kalign3 for {wildcards.output_folder} (Region size={wildcards.region} & Depth={wildcards.depth})"
log:
os.path.join('logs','{output_folder}','6_kalign3_r{region}_d{depth}.log')
os.path.join('{output_folder}','logs','6_kalign3_r{region}_d{depth}.log')
conda:
"envs/kalign3.yaml"
shell :
......@@ -378,7 +374,7 @@ rule abpoa_correction :
# shell :
# "the shell command for your tool. with {input} as the input and {output} as the output"
# This will only work if the command output the files as written in output.
# If all is done correctly, you should
# If all is done correctly, you should be able to add the tool in the pipeline run
......@@ -449,7 +445,7 @@ rule align_muscle:
rule meta_consensus:
input:
muscle=rules.align_muscle.output
muscle=expand(rules.align_muscle.output, output_folder=config['consensus_folder'], allow_missing=True)
output:
os.path.join("{output_folder}","meta_consensus", "{region}", "meta_consensus_{align}_r{region}_d{depth}_t1_{threshold}_t2_{value}.fasta")
log: "{output_folder}/logs/meta_consensus_{align}_r{region}_d{depth}_t1_{threshold}_t2_{value}.log"
......@@ -468,6 +464,10 @@ rule concat_region:
# --------------------------------------------------------#
# This part handle comparison to a reference and creates plots in order to better visualize the efficiency of the consensus strategy used for both the preliminary consensus and the meta-consensus. #
if "ref_file" in config:
# --------------------------------------------------------#
# Cut the reference region
rule region_seq :
input :
ref =os.path.join(config["data_in"],config["ref_file"])
......@@ -492,7 +492,6 @@ if "ref_file" in config:
params: format="\"%pi, %ps, %et, %em\n\""
conda: "envs/exonerate.yaml"
shell: "./workflow/scripts/exonerate_stats.sh {output} {input.ref} {input.files} {input.consensus_per_tool}"
# "echo {STAT_HEADER} >> {output} \n for FILE in {input.files}; do exonerate --bestn 1 -Q dna -E -m a:g --showsugar false --showvulgar false --showcigar false --ryo {params.format} --verbose 0
rule exonerate_stats_meta:
input: files=expand(rules.meta_consensus.output, align="muscle", value=config["threshold_2"], allow_missing=True) , ref=expand("{ref}/region_references/region_seq_r{region}.fasta", data_in=config["data_in"], ref=config["data_out"], allow_missing=True)
......@@ -547,6 +546,7 @@ if "ref_file" in config:
shell: "./workflow/scripts/plot_results.py {wildcards.plot_folder}/plot/{wildcards.region}/{wildcards.depth}/{wildcards.threshold}/graph_r{wildcards.region}_d{wildcards.depth}_t1_{wildcards.threshold} '[{config[threshold_2]}]' {input} 2> {log}"
# Extracting the mismatching regions, using exonerate to align the meta-consensus with the reference to find them.
rule comparison_ref_metaconsensus:
input: query=rules.concat_region.output, target=config["ref_file"]
......
read_file: ../Lambda_phage/lambda_1000x_10p.fa
output_folder: Lambda_10
ref_file: ../Lambda_phage/lambda_virus.fa
region_size: 0
region_overlap: 50
region_size: 5000
region_overlap: 0
tool: ['abpoa', 'spoa', 'kalign2', 'kalign3', 'mafft', 'muscle']
threshold_1: [70]
threshold_2: [60]
depth: 0
depth: 30
......@@ -17,5 +17,7 @@ def main():
file_cnt += 1
output.write('\n')
if __name__ == "__main__" :
main()
#!/usr/bin/env python3
import sys
def main():
args = sys.argv
overlap = int(args[2])
with open(args[1], "w") as output:
with open(args[3], 'r') as file:
lines = file.readlines()
output.write(">meta_consensus\n"+ lines[1][:-1])
file_cnt = 4
while file_cnt < len(args):
with open(args[file_cnt], "r") as file:
lines = file.readlines()
output.write(lines[1][1:-1])
file_cnt += 1
output.write('\n')
if __name__ == "__main__" :
main()
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment