Snakefile

    import os, subprocess, sys
    import time
    from scripts.extract_longest_read import extract_longest_read
    
    # ------------------------------------ #
    # Config value checks and initialization #
    if not "region_size" in config:
        config["region_size"] = 0
    else :
        config["region_size"] = int(config["region_size"])
    
    if not "data_out" in config:
        config["data_out"] = os.path.join(config["output_folder"],"data")
    
    if not "consensus_folder" in config:
        config["consensus_folder"] = os.path.join(config["output_folder"],"consensus")
    
    if not "tool" in config:
        config["tool"] = [ "kalign2", "kalign3", "mafft", "muscle", "abpoa", "spoa"]
    
    if not "align" in config:
        config["align"] = "muscle"
    
    if not "threshold_1" in config:
        config["threshold_1"] = [50]
    
    if not "threshold_2" in config:
        config["threshold_2"] = [50]
    
    if not "data_in" in config:
        config["data_in"] = ""
    
    config["early_concat"] = True
    
    # These constants are modifiable values that need testing to better adapt the warnings and the overall pipeline run. #
    
    # HIGH_DEPTH and HIGH_LENGTH are the thresholds above which the warnings for slow runs are printed.
    HIGH_DEPTH = 60
    HIGH_LENGTH = 5000
    
    # This constant is the minimal coverage percentage required for a read to be kept for a specific region.
    READ_COVERAGE_PERCENTAGE = 50
    
    # This constant is the size of the region extracted before and after a mismatch position.
    MISMATCH_AREA_SIZE = 40
    
    REGION_HEAD_OVERLAP = int(config["region_overlap"]) if "region_overlap" in config else 0
    
    # This constant holds the path to the MSA results #
    DATASET = expand("{data}/{format}", data=config["data_out"], format="msa")
    
    # This constant lists the metrics plotted in the graphs; it might need to be updated along with the exonerate stats and plotting rules
    GRAPH_TYPES = ["identity", "similarity", "equivalence", "mismatches","sequence_length", "aligned_length"]
    
    
    # Here are some helper functions required for pipeline runs without a reference, or without a defined length or depth
    
    def get_length(file):
        """
        This function gets the length of the first read in target file
        :param file: the relative path to the file
        :return: the length of the first read
        """
        with open(file, 'r') as f:
            f.readline()
            l = f.readline()
        return len(l)
    
    def cut_region_from_length(length):
        """
        Creates region intervals of the correct size, with the last region ending at length
        :param length: the last value for the regions
        :return: a list of regions written in the format "start_end"
        """
        start = 1
        region = []
        if config["region_size"] == 0:
            return ["1_"+str(length -1)]
        while start < int(length):
            region.append(str(start)+"_"+str( min(start+config["region_size"] , int(length)-1)))
            start += config["region_size"] - REGION_HEAD_OVERLAP
        return region
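
    # Illustrative example of the function above: with region_size=400, REGION_HEAD_OVERLAP=50 and
    # length=1000, cut_region_from_length(1000) returns ["1_401", "351_751", "701_999"]
    # (the numbers are arbitrary and only chosen to show the overlap between consecutive regions).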
    
    def get_depth(file):
        """
        This function gets the maximum depth of target read file (two-line FASTA records are assumed)
        :param file: the relative path to the read file
        :return: the maximum read depth
        """
        out = subprocess.run(["wc", "-l", file], capture_output=True, text=True)
        return int(out.stdout.split()[0]) // 2
    
    def eprint(*args, **kwargs):
        """
        This function is a wrapper to easily print in the error out
        """
        print(*args, file=sys.stderr, **kwargs)
    
    
    
    # Depth_check is used in the pipeline to print the correct depth in the file names.
    if config["depth"] == 0:
        depth_check = get_depth(config["read_file"])
    else:
        depth_check = config['depth']
    if not isinstance(depth_check, list):
        depth_check = [depth_check]
    
    # This snippet of code creates the regions if they aren't preset
    if not 'region' in config:
        if not 'length' in config:
            if 'ref_file' in config:
                config['length'] = get_length(os.path.join(config["data_in"],config["ref_file"]))
            else:
                anchor = extract_longest_read(os.path.join(config["data_in"], config["read_file"]), 1)[0]
                config['length'] = len(anchor)
        config['region']= cut_region_from_length(config['length'])
        if config['region_size'] == 0 :
            config['region_size'] = config['length']+1
    
    # --------------------------------- #
    # Print warnings for high depth and length #
    if max(depth_check) > HIGH_DEPTH:
        eprint("\nWarning: a high depth may cause the Multiple Sequence Alignment phase to be extremely SLOW")
        if "muscle" in config["tool"]:
            eprint("\t- muscle may be extremely SLOW for high depth")
        if "abpoa" in config["tool"]:
            eprint("\t- abpoa may require a lot of memory to run, consider disabling it or lowering the depth if you encounter problems allocating memory")
        if "spoa" in config["tool"]:
            eprint("\t- spoa may require a lot of memory to run, consider disabling it or lowering the depth if you encounter problems allocating memory")
        time.sleep(1.3)
    if 'length' in config and int(config['length']) > int(HIGH_LENGTH):
        eprint("\nWarning: a high region length may cause the Multiple Sequence Alignment phase to be extremely SLOW")
        if "muscle" in config["tool"]:
            eprint("\t- muscle may be extremely SLOW for high length")
        if "abpoa" in config["tool"]:
            eprint("\t- abpoa may require a lot of memory to run, consider disabling it or working on smaller regions if you encounter problems allocating memory")
        if "spoa" in config["tool"]:
            eprint("\t- spoa may require a lot of memory to run, consider disabling it or working on smaller regions if you encounter problems allocating memory")
        time.sleep(1.3)
    
    
    # Generates the meta-consensus from given reads #
    rule all:
         input:
    #          expand("{output_folder}/meta_consensus/{region}/meta_consensus_{align}_r{region}_d{depth}_t1_{threshold}_t2_{value}.fasta", region=config["region"], depth=config["depth"], threshold=config["threshold_1"], value=config["threshold_2"], align="muscle", output_folder=config["output_folder"])
            expand(os.path.join("{output_folder}","meta_consensus","meta_consensus_d{depth}_t1_{threshold}_t2_{value}.fasta"), depth=config["depth"], threshold=config["threshold_1"], value=config["threshold_2"], align="muscle", output_folder=config["output_folder"])
    
         message: "Meta consensus can be found in {input}"
    
    # -------------------------------------------------------- #
    # Builds the MTool binary used for consensus and meta-consensus using the provided Makefile #
    rule MTool:
         input:
              "src/msa_handler/MSA.cpp", "src/msa_handler/MSA.h", "src/msa_handler/main.cpp", "src/msa_handler/Makefile"
         log: "logs/MTool.log"
         output:
              "src/msa_handler/MTool"
         shell:
              "cd src/msa_handler && make MTool"
    
    # -------------------------------------------------------- #
    
    # I. Creating a read pile
    
    # -------------------------------------------------------- #
    # The initial part of the pipeline differs depending on whether a reference file has been provided or not #
    # This is the version with a reference #
    if "ref_file" in config:
        # -------------------------------------------------------- #
        # Aligns the reads on a given reference with minimap2 #
        rule alignment_reads_on_ref :
              input :
                   ref=os.path.join(config["data_in"],config["ref_file"]),
                   read=os.path.join(config["data_in"], config["read_file"])
              output :
                   os.path.join(config["data_out"],'alignment','aln_reads_on_ref.sam')
              message :
                   "Minimap for {config[data_in]}"
              log:
                   os.path.join(config["output_folder"],'logs','1_alignment_reads_on_ref.log')
              conda:
                   "envs/minimap2.yaml"
              shell :
                'ORDER="minimap2 -cax map-ont -t 8  {input.ref} {input.read}";'
                'echo "ORDER: $ORDER" >{log};'
                '$ORDER  >{output} 2>> {log}'
    
        # Cuts the reads into smaller regions to reduce the overall size and lower the computational cost, and creates the FASTA files used to build the MSAs #
        rule reads_map_region:
            input :
                aln = os.path.join(config['data_out'],'alignment','aln_reads_on_ref.sam')
            output :
                os.path.join(config['data_out'],'read_map_region','read_r{start}_{end}.fasta')
            message:
                "Reads map region for (Region start= {wildcards.start} end= {wildcards.end})"
            log:
                os.path.join(config['output_folder'],'logs','1_reads_map_region_r{start}_{end}.log')
            conda:
                "envs/perl.yaml"
            params: cvalue=READ_COVERAGE_PERCENTAGE
            shell :
                'ORDER="./workflow/scripts/reads_map_region.pl -s {wildcards.start} -e {wildcards.end} -c {READ_COVERAGE_PERCENTAGE} {input.aln} {output}";'
                'echo "ORDER: $ORDER" >{log};'
                '$ORDER >>{log} 2>&1'
    
    # -------------------------------------------------------- #
    # This is the version without a reference
    else :
    
        # Extracts the longest read from the read pile to use as an anchor for a pre-alignment #
        rule extract_longest_read:
            input : read=os.path.join(config["data_in"],config["read_file"])
            output : os.path.join(config["data_out"],"anchor.fasta")
            message: "Extracting the longest read from {config[read_file]}"
            shell : "./workflow/scripts/extract_longest_read.py -i {input.read} -o {output} -n 1 "
    
    
        # Align the reads on anchor read #
        rule alignment_reads_on_anchor :
            input :
                ref=os.path.join(config["data_out"],"anchor.fasta"),
                read=os.path.join(config["data_in"], config["read_file"])
            output :
                os.path.join(config["data_out"],'alignment','aln_reads_on_read.sam')
            message :
                "Minimap for {config[read_file]}"
            log: os.path.join(config["output_folder"],'logs','1_alignment_reads_on_anchor.log')
            conda:
                "envs/minimap2.yaml"
            shell :
                'ORDER="minimap2 -cax map-ont -t 8  {input.ref} {input.read}";'
                'echo "ORDER: $ORDER" >{log};'
                '$ORDER  >{output} 2>> {log}'
    
    
        # This rule creates a file with possible region cuts where the anchor read appears to have insertion errors.
        # It has not found its use in the pipeline yet, but might still be of interest overall
        rule get_regions_for_read :
            input : sam=rules.alignment_reads_on_anchor.output
            output: os.path.join(config["data_out"], 'regions.txt')
            message: "Getting the regions where the reads are aligned on the anchor"
            conda: "envs/samtools.yaml"
            params: depth=READ_COVERAGE_PERCENTAGE
            shell: "./workflow/scripts/prepare_region_from_read.sh {input.sam} {output} {params.depth}"
    
        # Cuts the reads into smaller regions to reduce the overall size and lower the computational cost, and creates the FASTA files used to build the MSAs #
        rule reads_map_regions_no_ref:
            input: sam=rules.alignment_reads_on_anchor.output, regions=rules.get_regions_for_read.output #, fix={rules.set_region_elem.output}
            output: os.path.join(config['data_out'], "read_map_region", r'read_r{start}_{end,\d+}.fasta')
            conda: "envs/perl.yaml"
            log:
                os.path.join(config['output_folder'],'logs','1_reads_map_region_r{start}_{end}.log')
            shell :
                    'ORDER="./workflow/scripts/reads_map_region.pl -s {wildcards.start} -e {wildcards.end} -c {READ_COVERAGE_PERCENTAGE} {input.sam} {output}";'
                    'echo "ORDER: $ORDER" >{log};'
                    '$ORDER >>{log} 2>&1'
    
    
    # -------------------------------------------------------- #
    # This rule picks the number of reads kept for the target region
    rule select_depth :
        input :
            os.path.join(config['data_out'],'read_map_region','read_r{region}.fasta')
        output :
            os.path.join(config['data_out'],'read_map_region','reads_r{region}_d{depth}.fasta')
        message:
            "Select depth for (Region: {wildcards.region} & Depth={wildcards.depth})"
        log:
            os.path.join(config['output_folder'],'logs','1_select_number_reads_r{region}_d{depth}.log')
        shell :
            'ORDER="./workflow/scripts/selected_nb_read.sh {input} {wildcards.depth} {output}";'
            'echo "ORDER: $ORDER" >{log};'
            '$ORDER >{output} 2>>{log}'
    
    
    # -------------------------------------------------------- #
    
    # II. MSA phase
    
    # ------------------------------------------------------- #
    # Creation of MSAs from the read pile using the MSA tools #
    
    # If you want to add an MSA tool to the pipeline, you need to create a rule following this format, create a matching conda env, and add the tool *NAME* to the tools list in your configfile. A hypothetical filled-in example is given after the template below.
    
    #rule *NAME* :
    #    input :
    #        os.path.join(config["data_out"],"read_map_region",'reads_r{region}_d{depth}.fasta')
    #    output :
    #        out = os.path.join('{output_folder}', "data" ,"msa",'MSA_*NAME*_r{region}_d{depth}.fasta')
    #    message:
    #        "*NAME* for {wildcards.output_folder} (Region :{wildcards.region} & Depth={wildcards.depth})"
    #    log:
    #        os.path.join('{output_folder}','logs','2_*NAME*_r{region}_d{depth}.log')
    #    conda:
    #        "envs/*NAME*.yaml"
    #    shell :
    #        "the shell command for your tool. with {input} as the input and {output} as the output"
    # This will only work if the command outputs the files as written in output.
    # If all is done correctly, you should be able to add the tool to the pipeline run
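
    # For illustration only, here is the template above filled in for a hypothetical tool named clustalo.
    # The tool name, the envs/clustalo.yaml file and the clustalo flags are assumptions and not part of
    # this pipeline, so the example is left commented out.
    #rule clustalo :
    #    input :
    #        os.path.join(config["data_out"],"read_map_region",'reads_r{region}_d{depth}.fasta')
    #    output :
    #        out = os.path.join('{output_folder}', "data", "msa", 'MSA_clustalo_r{region}_d{depth}.fasta')
    #    message:
    #        "Clustalo for {wildcards.output_folder} (Region :{wildcards.region} & Depth={wildcards.depth})"
    #    log:
    #        os.path.join('{output_folder}','logs','2_clustalo_r{region}_d{depth}.log')
    #    conda:
    #        "envs/clustalo.yaml"
    #    shell :
    #        "clustalo -i {input} -o {output.out} --force 2> {log}"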
    
    # -------------------------------------------------------- #
    
    rule muscle :
        input :
            os.path.join('{output_folder}', 'data',"read_map_region",'reads_r{region}_d{depth}.fasta')
        output :
            out = os.path.join('{output_folder}', "data", "msa"  ,'MSA_muscle_r{region}_d{depth}.fasta')
        message:
            "Muscle for {wildcards.output_folder} (Region :{wildcards.region} & Depth={wildcards.depth})"
        log:
            os.path.join('{output_folder}','logs','2_muscle_r{region}_d{depth}.log')
        conda:
            "envs/muscle.yaml"
        shell :
            "muscle -in {input} -out {output} 2> {log}"
    
    rule mafft :
        input :
            os.path.join(config["data_out"],"read_map_region",'reads_r{region}_d{depth}.fasta')
        output :
            out = os.path.join('{output_folder}', "data" ,"msa" ,'mafft_uncorrected_r{region}_d{depth}.fasta')
        message:
            "Mafft for {wildcards.output_folder} (Region :{wildcards.region} & Depth={wildcards.depth})"
        log:
            os.path.join('{output_folder}','logs','2_mafft_r{region}_d{depth}.log')
        conda:
            "envs/mafft.yaml"
        shell :
           "mafft {input} > {output} 2> {log}"
    
    rule mafft_correction :
        input :
            os.path.join('{output_folder}', "data" ,"msa",'mafft_uncorrected_r{region}_d{depth}.fasta')
        output :
            os.path.join('{output_folder}', "data"  ,"msa",'MSA_mafft_r{region}_d{depth}.fasta')
        message:
            "Mafft correction for {wildcards.output_folder} (Region :{wildcards.region} & Depth={wildcards.depth})"
        shell :
            'cat {input} | tr "atgc" "ATGC" >{output} ; rm -f {input}'
    
    rule spoa :
        input :
            os.path.join(config["data_out"],"read_map_region",'reads_r{region}_d{depth}.fasta')
        output :
            out = os.path.join('{output_folder}', "data" ,"msa",'MSA_spoa_r{region}_d{depth}.fasta')
        message:
            "Spoa for {wildcards.output_folder} (Region :{wildcards.region} & Depth={wildcards.depth})"
        log:
            os.path.join('{output_folder}','logs','2_spoa_r{region}_d{depth}.log')
        conda:
            "envs/spoa.yaml"
        shell :
           "spoa -r 1 -l 1 {input} > {output} 2> {log}"
    
    rule kalign2 :
        input :
            os.path.join(config["data_out"],"read_map_region",'reads_r{region}_d{depth}.fasta')
        output :
            out = os.path.join('{output_folder}', "data" ,"msa",'MSA_kalign2_r{region}_d{depth}.fasta')
        message:
            "Kalign2 for {wildcards.output_folder} (Region :{wildcards.region} & Depth={wildcards.depth})"
        log:
            os.path.join('{output_folder}','logs','2_kalign_r{region}_d{depth}.log')
        conda:
            "envs/kalign2.yaml"
        shell :
            "kalign -i {input} -out {output.out} 1> {log} 2> {log}"
    
    rule kalign3 :
        input :
            os.path.join(config["data_out"],"read_map_region",'reads_r{region}_d{depth}.fasta')
        output :
            out = os.path.join('{output_folder}', "data" ,"msa",'MSA_kalign3_r{region}_d{depth}.fasta')
        message:
            "Kalign3 for {wildcards.output_folder} (Region :{wildcards.region} & Depth={wildcards.depth})"
        log:
            os.path.join('{output_folder}','logs','2_kalign3_r{region}_d{depth}.log')
        conda:
            "envs/kalign3.yaml"
        shell :
            "kalign -i {input} -o {output.out} 1>{log} 2> {log}"
    
    rule abpoa :
        input :
            os.path.join(config["data_out"],"read_map_region",'reads_r{region}_d{depth}.fasta')
        output :
            out = os.path.join('{output_folder}' , "data" ,"msa",'abpoa_uncorrected_r{region}_d{depth}.fasta')
        message:
            "Abpoa for {wildcards.output_folder} (Region :{wildcards.region} & Depth={wildcards.depth})"
        log:
            os.path.join('{output_folder}',"logs",'2_abpoa_r{region}_d{depth}.log')
        conda:
            "envs/abpoa.yaml"
        shell :
            "abpoa -r1 {input} > {output} 2> {log}"
    
    rule abpoa_correction :
        input :
            os.path.join('{output_folder}' , "data" ,"msa",'abpoa_uncorrected_r{region}_d{depth}.fasta')
        output :
            os.path.join('{output_folder}', "data" ,"msa",'MSA_abpoa_r{region}_d{depth}.fasta')
        message:
            "Abpoa correction for {wildcards.output_folder} (Region :{wildcards.region} & Depth={wildcards.depth})"
        shell :
            'tail -n +2 {input} | sed "s/^/>seq\\n/" > {output} 2> /dev/null;'
    
    
    # -------------------------------------------------------- #
    
    # III. Consensus phase
    
    # -------------------------------------------------------- #
    # Creation of every individual tool's consensus for every region #
    rule create_consensus_per_tool:
         input:
            rules.MTool.output, data=expand(os.path.join("{dataset}","MSA_{tool}_r{region}_d{depth}.fasta"), allow_missing=True, dataset=DATASET)
         output:
             os.path.join("{output_folder}", "consensus", "{region}","{depth}","{threshold}","consensus_{tool}_{region}_{depth}.fasta")
         log: "{output_folder}/logs/consensus_per_tool_{tool}_r{region}_d{depth}_t{threshold}.log"
         message: "Consensus for tools"
         shell: "(./src/msa_handler/MTool {input.data} {output} {wildcards.threshold} ) 2> {log}"
    
    
    # After the regional consensus, there are two possible ways to concatenate the regions.
    # We can either concatenate the regional consensuses at this point to align full-size sequences and build a single meta-consensus (early concat, b.), or build a meta-consensus per region and concatenate those afterwards (a.).

    # Both options are implemented below; see the sketch right after this comment.
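
    # Sketch of the two flows, using the rule names defined below:
    #   a. meta-consensus concatenation: create_consensus_per_tool -> create_consensus
    #      -> align_muscle (or align_mafft) -> regional_meta_consensus -> meta_consensus
    #   b. early concatenation: create_consensus_per_tool -> create_consensus_per_tool_concat
    #      -> create_consensus_concat -> align_muscle_concat -> meta_consensus_concat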
    
    # -------------------------------------------------------- #
    
    # a. Meta-consensus concatenation
    
    # --------------------------------------------------------#
    
    # III.a Consensus phase (end)
    
    # --------------------------------------------------------#
    
    # Creates the consensus pile in order to align it in the next phase
    rule create_consensus:
         input: expand(rules.create_consensus_per_tool.output, tool=config["tool"], allow_missing=True, dataset=DATASET)
         params: data=DATASET
         log: "{output_folder}/logs/3a_consensus_r{region}_d{depth}_t{threshold}.log"
         output:
             os.path.join("{output_folder}", "consensus" ,"{region}","{depth}","all_tools_consensus_{region}_{depth}_{threshold}.fasta")
         message: "Consensus region: {wildcards.region} , depth: {wildcards.depth} , threshold: {wildcards.threshold}\n"  
         shell: 'cat {input} > {output} 2>{log}'
            #  "./workflow/scripts/MTool_over_folder.py -dir {params.data} -o {output} -region {wildcards.region} -depth {wildcards.depth} -threshold {wildcards.threshold}"#
    
    # -------------------------------------------------------- #
    
    # IV.a Meta-MSA phase
    
    # -------------------------------------------------------- #
    # Alignment of the consensus for the meta-consensus #
    
    rule align_mafft:
         input:
              rules.create_consensus.output
         output:
             os.path.join("{output_folder}", "consensus","{region}","{depth}","mafft_consensus_{region}_{depth}_{threshold}.fasta")
         conda:
              "envs/mafft.yaml"
         log: "{output_folder}/logs/4a_align_mafft_r{region}_d{depth}_t{threshold}.log"
         message: "Alignment with mafft region: {wildcards.region} , depth: {wildcards.depth} , threshold: {wildcards.threshold}\n"
         shell:
              "mafft --auto {input} > {output} 2> {log}"
    
    rule align_muscle:
         input:
             expand(rules.create_consensus.output, allow_missing=True)
         output:
             os.path.join("{output_folder}","consensus", "{region}","{depth}","muscle_consensus_{region}_{depth}_{threshold}.fasta")
         conda:
              "envs/muscle.yaml"
         log: "{output_folder}/logs/4a_align_muscle_r{region}_d{depth}_t{threshold}.log"
         message: "Alignment muscle region: {wildcards.region} , depth: {wildcards.depth} , threshold: {wildcards.threshold}\n"
         shell:
              "muscle -in {input} -out {output} > {log} 2>{log}"
    
    # -------------------------------------------------------- #
    
    # V.a Meta-consensus
    
    # -------------------------------------------------------- #
    # Individual meta consensus #
    rule regional_meta_consensus:
         input:
              muscle=expand(rules.align_muscle.output, output_folder=config['output_folder'], allow_missing=True)
         output:
             os.path.join("{output_folder}","meta_consensus", "{region}", "meta_consensus_{align}_r{region}_d{depth}_t1_{threshold}_t2_{value}.fasta")
         log: "{output_folder}/logs/5a_meta_consensus_{align}_r{region}_d{depth}_t1_{threshold}_t2_{value}.log"
         message: "Meta-consensus region: {wildcards.region} , depth: {wildcards.depth} , consensus threshold: {wildcards.threshold} , meta consensus threshold: {wildcards.value}\n"
         shell:
              "./src/msa_handler/MTool {input.muscle} {output} {wildcards.value} 2> {log}"
    
    
    rule meta_consensus:
        input: expand(rules.regional_meta_consensus.output, region=config["region"], align="muscle" , allow_missing=True)
        output: os.path.join("{output_folder}", "meta_consensus","meta_consensus_d{depth}_t1_{threshold}_t2_{value}.fasta")
        log: "{output_folder}/logs/5a_meta_consensus_d{depth}_t1_{threshold}_t2_{value}.log"
        message: "Concatenating regional meta-consensus into a single sequence"
        shell:
            "./workflow/scripts/concat_regions.py {output} {input} 2> {log}"
    
    # ------------------------------------------------------- #
    # b. Concatenating regions before the meta-consensus phase #
    # ------------------------------------------------------- #
    
    
    if "early_concat" in config:
    
        # -------------------------------------------------------- #
    
        # III.b Consensus phase (end)
    
        # --------------------------------------------------------#
        # Concatenates each tool's regional consensuses into a single sequence
        rule create_consensus_per_tool_concat:
         input:
            expand(rules.create_consensus_per_tool.output, region=config["region"], allow_missing=True)
         output:
            os.path.join("{output_folder}", "consensus", "{depth}","{threshold}","consensus_{tool}_{depth}.fasta")
         log: "{output_folder}/logs/3b_consensus_per_tool_concat_{tool}_d{depth}_t{threshold}.log"
         message: "Consensus for tools"
         shell: "./workflow/scripts/concat_regions.py {output} {input}" # >{log} 2>&1"
    
        # Creates the consensus pile for the meta-MSA phase
        rule create_consensus_concat:
         input: expand(rules.create_consensus_per_tool_concat.output, tool=config["tool"], allow_missing=True, dataset=DATASET)
         params: data=DATASET
         log: "{output_folder}/logs/3b_consensus_d{depth}_t{threshold}.log"
         output:
            os.path.join("{output_folder}", "consensus", "{depth}","all_tools_consensus_{depth}_{threshold}.fasta")
         message: "Consensus depth: {wildcards.depth} , threshold: {wildcards.threshold}\n"
         shell: 'cat {input} > {output} 2>{log}'
    
        # -------------------------------------------------------- #
    
        # IV.b Meta-MSA phase
    
        # -------------------------------------------------------- #
    
        # Creates the meta-MSA using muscle
        rule align_muscle_concat:
         input:
            expand(rules.create_consensus_concat.output, output_folder=config["output_folder"], allow_missing=True)
         output:
            os.path.join("{output_folder}","consensus", "{depth}","muscle_consensus_{depth}_{threshold}.fasta")
         conda:
            "envs/muscle.yaml"
         log: "{output_folder}/logs/4b_align_muscle_d{depth}_t{threshold}.log"
         message: "Alignment muscle depth: {wildcards.depth} , threshold: {wildcards.threshold}\n"
         shell:
            "muscle -in {input} -out {output} > {log} 2>{log}"
    
        # -------------------------------------------------------- #
    
        # V.b Meta consensus phase#
    
        # -------------------------------------------------------- #
    
        rule meta_consensus_concat:
         input:
            muscle=expand(rules.align_muscle_concat.output, output_folder=config['output_folder'], allow_missing=True)
         output:
            os.path.join("{output_folder}","meta_consensus", "meta_consensus_concat_{align}_d{depth}_t1_{threshold}_t2_{value}.fasta")
         log: "{output_folder}/logs/5b_meta_consensus_{align}_d{depth}_t1_{threshold}_t2_{value}.log"
         message: "Meta-consensus depth: {wildcards.depth} , consensus threshold: {wildcards.threshold} , meta consensus threshold: {wildcards.value}\n"
         shell:
            "./src/msa_handler/MTool {input.muscle} {output} {wildcards.value}"
    
    
        rule concat_before_consensus:
         input: expand(rules.meta_consensus_concat.output, output_folder=config["output_folder"], align="muscle", depth=config["depth"], threshold=config["threshold_1"], value=config["threshold_2"])
    
    # --------------------------------------------------------#
    
    # This part handles the comparison to a reference and creates plots to better visualize the efficiency of the consensus strategy used for both the preliminary consensus and the meta-consensus. #
    
    if "ref_file" in config:
        # -------------------------------------------------------- #
    
        # VI. Meta-consensus analysis
    
        # -------------------------------------------------------- #
        # VI.a Regional analysis
        # -------------------------------------------------------- #
    
        # Cuts the reference for every region
        rule region_seq :
         input :
            ref =os.path.join(config["data_in"],config["ref_file"])
         output :
            os.path.join(config['output_folder'], "data", "region_references","region_seq_r{start}_{end}.fasta")
         message:
            "Region's seq for (Region {wildcards.start} - {wildcards.end})"
         log:
            os.path.join(config['output_folder'],"logs","6_region_seq_r{start}_{end}.log")
         shell :
            'ORDER="./workflow/scripts/region_seq.sh {wildcards.start} {input.ref} {output} {wildcards.end}";'
            'echo "ORDER: $ORDER" >{log};'
            '$ORDER >>{log} 2>&1'
    
        # Exonerate sequence comparison between meta-consensus and reference for each region
        rule exonerate_stats_meta:
          input: files=expand(rules.regional_meta_consensus.output, align="muscle", value=config["threshold_2"], allow_missing=True) , ref=expand("{ref}/region_references/region_seq_r{region}.fasta", data_in=config["data_in"], ref=config["data_out"], allow_missing=True)
          output: os.path.join("{output_folder}","plot","{region}","{depth}","{threshold}","aln_stats_meta.txt")
          log: "{output_folder}/logs/6_plot_exonerate_meta_r{region}_d{depth}_t{threshold}.log"
          conda: "envs/exonerate.yaml"
          message: "Exonerate comparison between meta-consensus and ref (region: {wildcards.region}, depth: {wildcards.depth}, preliminary consensus threshold: {wildcards.threshold}) for meta-consensus threshold in [ {config[threshold_2]} ]"
          shell: "./workflow/scripts/exonerate_stats.sh {output} {input.ref} {input.files}"
    
        # Exonerate sequence comparison between consensus and reference for each region and each tool
        rule exonerate_stats_tools:
          input:
              ref=expand("{ref}/region_references/region_seq_r{region}.fasta", data_in=config["data_in"], ref=config["data_out"], allow_missing=True),
              consensus_per_tool=expand(rules.create_consensus_per_tool.output, value=config["threshold_2"], allow_missing=True)
          output: os.path.join("{output_folder}","plot","{region}","{depth}","{threshold}","aln_stats_{tool}.txt")
          log: "{output_folder}/logs/6_plot_exonerate_tools_{tool}_r{region}_d{depth}_t{threshold}.log"
          conda: "envs/exonerate.yaml"
          message: "Exonerate comparison between consensus for tool {wildcards.tool} and ref (region: {wildcards.region}, depth: {wildcards.depth}, preliminary consensus threshold: {wildcards.threshold}) for meta-consensus threshold in [ {config[threshold_2]} ]"
          shell: "./workflow/scripts/exonerate_stats.sh {output} {input.ref} {input.consensus_per_tool} 2> {log}"
    
        # Stats for every region for each tool
        rule stats_region_align:
          input: expand(rules.exonerate_stats_tools.output, region=config["region"], allow_missing=True)
          output: os.path.join("{output_folder}","plot","{depth}","{threshold}","aln_stats_region_{tool}.txt")
          log: "{output_folder}/logs/6_plot_exonerate_tools_regions_{tool}_d{depth}_t{threshold}.log"
          conda: "envs/exonerate.yaml"
          message: "Exonerate comparison between consensus for all regions tool {wildcards.tool} and ref (depth: {wildcards.depth}, preliminary consensus threshold: {wildcards.threshold})"
          shell: "cat {input} > {output} 2> {log}"
    
        # Stats for every region for the meta-consensus
        rule stats_region_meta:
          input: expand(rules.exonerate_stats_meta.output, region=config["region"], allow_missing=True)
          output: os.path.join("{output_folder}","plot","{depth}","{threshold}","meta_aln_stats_region.txt")
          log: "{output_folder}/logs/6_plot_exonerate_meta_regions_d{depth}_t{threshold}.log"
          conda: "envs/exonerate.yaml"
          message: "Exonerate comparison between meta-consensus for all regions and ref (depth: {wildcards.depth}, preliminary consensus threshold: {wildcards.threshold})"
          shell: "cat {input} > {output} 2> {log}"
    
        # Plots for every region, every consensus threshold for given depth
        rule plot_region_all:
          input: expand(rules.stats_region_align.output, threshold=config["threshold_1"], tool=config["tool"], output_folder=config["output_folder"], allow_missing=True) , expand(rules.stats_region_meta.output, threshold=config["threshold_1"], output_folder=config["output_folder"], allow_missing=True)
          params: target_folder="{plot_folder}/plot/{depth}" , region=[elem for elem in config["region"]]
          log: "{plot_folder}/logs/6_plot_t1_all_region_d{depth}.log"
          output: expand(os.path.join("{plot_folder}","plot","{depth}","graph_all_region_d{depth}_{graph}.png"), graph=GRAPH_TYPES, allow_missing=True)
          message: "Creating plot for meta-consensus quality with preliminary threshold in [ {config[threshold_1]} ] and meta-consensus in [ {config[threshold_2]} ] (depth: {wildcards.depth})"
          shell: "./workflow/scripts/plot_results.py {wildcards.plot_folder}/plot/{wildcards.depth}/graph_all_region_d{wildcards.depth} '[{params.region}]' {input} 2> {log} "
    
        # Plots by depth for meta-consensus regions
        rule plot_depth:
          input: expand(rules.exonerate_stats_meta.output, depth=config["depth"], output_folder=config['output_folder'], allow_missing=True)
          params: target_folder="plot/{region}/graph"
          log: "{plot_folder}/logs/6_plot_depth_r{region}_t{threshold}.log"
          output: expand(os.path.join("{plot_folder}","plot","{region}","graph","graph_depth_r{region}_t{threshold}_{graph}.png"), graph=GRAPH_TYPES, depth=config["depth"],  allow_missing=True)
          message: "Exonerate comparison between meta-consensus and ref (region: {wildcards.region}, or preliminary consensus threshold in [ {config[threshold_1]} ],  meta-consensus threshold in [ {config[threshold_2]} ]"
          shell: "./workflow/scripts/plot_results.py {wildcards.plot_folder}/plot/{wildcards.region}/graph/graph_depth_r{wildcards.region}_t{wildcards.threshold} '[{config[threshold_2]}]' {input} 2> {log}"
    
        # Plots by depth for every region
        rule plot_depth_all:
          input: expand(rules.exonerate_stats_meta.output, depth=config["depth"], output_folder=config["output_folder"], allow_missing=True), expand(rules.exonerate_stats_tools.output, tool=config["tool"], depth=config["depth"], output_folder=config["output_folder"], allow_missing=True)
          params: target_folder="consensus/plot/{region}/graph"
          log: "{plot_folder}/logs/plot_all_depth_r{region}_t{threshold}.log"
          output: expand(os.path.join("{plot_folder}","plot","{region}","graph","graph_depth_all_r{region}_t{threshold}_{graph}.png"), graph=GRAPH_TYPES, depth=config["depth"],  allow_missing=True)
          message: "Exonerate comparison between meta-consensus and ref (region: {wildcards.region}, or preliminary consensus threshold in [ {config[threshold_1]} ],  meta-consensus threshold in [ {config[threshold_2]} ]"
          shell: "./workflow/scripts/plot_results.py {wildcards.plot_folder}/plot/{wildcards.region}/graph/graph_depth_all_r{wildcards.region}_t{wildcards.threshold} '[{config[threshold_2]}]' {input} 2> {log}"
    
        # Plots for every consensus threshold
        rule plot_t1:
          input: expand(rules.exonerate_stats_meta.output, threshold=config["threshold_1"], output_folder=config["output_folder"], allow_missing=True)
          params: target_folder="{plot_folder}/plot/{region}/{depth}"
          log: "{plot_folder}/logs/plot_t1_r{region}_d{depth}.log"
          output: expand(os.path.join("{plot_folder}","plot","{region}","{depth}","graph_r{region}_d{depth}_{graph}.png"), graph=GRAPH_TYPES, allow_missing=True)
          message: "Creating plot for meta-consensus quality with preliminary threshold in [ {config[threshold_1]} ] and meta-consensus in [ {config[threshold_2]} ] (region: {wildcards.region}, depth: {wildcards.depth})"
          shell: "./workflow/scripts/plot_results.py {wildcards.plot_folder}/plot/{wildcards.region}/{wildcards.depth}/graph_r{wildcards.region}_d{wildcards.depth} '[{config[threshold_2]}]' {input} 2> {log} "
    
        # Plots for every meta-consensus threshold
        rule plot_t2:
          input: expand(rules.exonerate_stats_tools.output, tool=config["tool"], output_folder=config["output_folder"], allow_missing=True),
                 expand(rules.exonerate_stats_meta.output, output_folder=config["output_folder"], allow_missing=True)
          params: target_folder="{plot_folder}/plot/{region}/{depth}/{threshold}/"
          log: "{plot_folder}/logs/plot_t2_r{region}_d{depth}_t_{threshold}.log"
          output: expand(os.path.join("{plot_folder}","plot","{region}","{depth}","{threshold}","graph_r{region}_d{depth}_t1_{threshold}_{graph}.png"), graph=GRAPH_TYPES, allow_missing=True)
          message: "Creating plot for meta-consensus compared to preliminary consensus with meta-consensus threshold in {config[threshold_2]}  (region: {wildcards.region}, depth: {wildcards.depth}, threshold: {wildcards.threshold})"
          shell: "./workflow/scripts/plot_results.py {wildcards.plot_folder}/plot/{wildcards.region}/{wildcards.depth}/{wildcards.threshold}/graph_r{wildcards.region}_d{wildcards.depth}_t1_{wildcards.threshold} '[{config[threshold_2]}]'  {input} 2> {log}"
    
    
        # ------------------------------------------------------ #
        #  VI.b Regions concatenated after the meta-consensus #
        # ------------------------------------------------------ #
    
        # Exonerate for pairwise alignment
        rule comparison_ref_metaconsensus:
          input: query=rules.meta_consensus.output, target=config["ref_file"]
          output: os.path.join("{output_folder}", "mismatches", "alignment_meta_ref_d{depth}_t1_{threshold}_t2_{value}.fasta")
          conda: "envs/exonerate.yaml"
          shell: "exonerate --bestn 1 -Q dna -E -m a:g --showvulgar false --showsugar false --showcigar false --showalignment true {input.target} {input.query} > {output} 2>/dev/null"
    
        # Exonerate for stats
        rule stats_ref_metaconsensus:
          input: query=rules.meta_consensus.output, target=config["ref_file"]
          output: os.path.join("{output_folder}", "plot", "stats_meta_ref_d{depth}_t1_{threshold}_t2_{value}.fasta")
          conda: "envs/exonerate.yaml"
          shell: "./workflow/scripts/exonerate_stats.sh {output} {input.target} {input.query}"
    
    
        # Extracting the mismatching regions, using exonerate to align the meta-consensus with the reference to find them.
        rule extract_mismatches:
          input: rules.comparison_ref_metaconsensus.output
          output: os.path.join("{output_folder}", "mismatches", "mismatches_d{depth}_t1_{threshold}_t2_{value}.fasta")
          shell: "./src/util/exonerate_parser.py {input} {output} {MISMATCH_AREA_SIZE} {config[region_size]} '{wildcards.output_folder}/data/read_map_region' '{config[region]}' "
    
    
        # ------------------------------------------------------ #
        # VI.c Regions concatenated after the consensus phase #
        # ------------------------------------------------------ #
    
        # Exonerate for pairwise alignment
        rule comparison_ref_metaconsensus_concat:
          input: query=expand(rules.meta_consensus_concat.output, align="muscle", allow_missing=True), target=config["ref_file"]
          output: os.path.join("{output_folder}", "mismatches", "alignment_meta_concat_ref_d{depth}_t1_{threshold}_t2_{value}.fasta")
          conda: "envs/exonerate.yaml"
          shell: "exonerate --bestn 1 -Q dna -E -m a:g --showvulgar false --showsugar false --showcigar false --showalignment true {input.target} {input.query} > {output} 2>/dev/null"
    
        # Exonerate for stats
        rule stats_ref_metaconsensus_concat:
          input: query=expand(rules.meta_consensus_concat.output, align="muscle", allow_missing=True), target=config["ref_file"]
          output: os.path.join("{output_folder}", "plot", "stats_meta_concat_ref_d{depth}_t1_{threshold}_t2_{value}.fasta")
          conda: "envs/exonerate.yaml"
          shell: "./workflow/scripts/exonerate_stats.sh {output} {input.target} {input.query}"
    
        # Extracting the mismatching regions, using exonerate to align the meta-consensus with the reference to find them.
        rule extract_mismatches_concat:
          input: rules.comparison_ref_metaconsensus_concat.output
          output: os.path.join("{output_folder}", "mismatches", "mismatches_concat_d{depth}_t1_{threshold}_t2_{value}.fasta")
          shell: "./src/util/exonerate_parser.py {input} {output}  {MISMATCH_AREA_SIZE} {config[region_size]} '{wildcards.output_folder}/data/read_map_region' '{config[region]}' "
    
        # --------------------------------------------------------#
        # VI.d Comparisons between the two concatenation strategies
        # --------------------------------------------------------#
    
        # Plots comparing the two strategies of concatenation #
        # Stats for all meta_consensus threshold for concatenated meta-consensus
        rule stats_ref_metaconsensus_all_t2:
          input: expand(rules.stats_ref_metaconsensus.output, value=config["threshold_2"],allow_missing=True)
          output: os.path.join("{output_folder}", "plot", "stats_meta_ref_d{depth}_t1_{threshold}_tALL.fasta")
          message: "Regroup every meta-consensus stats "
          shell: "cat {input} > {output}"
    
        # Stats for all meta_consensus threshold for concatenated consensus
        rule stats_ref_metaconsensus_all_t2_concat:
          input: expand(rules.stats_ref_metaconsensus_concat.output, value=config["threshold_2"],allow_missing=True)
          output: os.path.join("{output_folder}", "plot", "stats_meta_concat_ref_d{depth}_t1_{threshold}_tALL.fasta")
          shell: "cat {input} > {output}"
    
        # Plot for concatenated consensus, and both concatenated meta-consensus strategies
        rule plot_depth_all_concat:
          input: expand(rules.stats_ref_metaconsensus_all_t2.output, depth=config["depth"], output_folder=config["output_folder"], allow_missing=True), expand(rules.stats_ref_metaconsensus_all_t2_concat.output, depth=config["depth"], output_folder=config["output_folder"], allow_missing=True), expand(rules.exonerate_stats_tools.output, tool=config["tool"], depth=config["depth"], output_folder=config["output_folder"], allow_missing=True)
          params: target_folder="consensus/plot/graph"
          log: "{plot_folder}/logs/plot_all_depth_concat_t{threshold}.log"
          output: expand(os.path.join("{plot_folder}","plot","graph","graph_depth_all_concat_t{threshold}_{graph}.png"), graph=GRAPH_TYPES, depth=config["depth"],  allow_missing=True)
          message: "Exonerate comparison between meta-consensus and ref (preliminary consensus threshold in [ {config[threshold_1]} ],  meta-consensus threshold in [ {config[threshold_2]} ]"
          shell: "./workflow/scripts/plot_results.py {wildcards.plot_folder}/plot/graph/graph_depth_all_concat_t{wildcards.threshold} '[{config[threshold_2]}]' {input} 2> {log}"
    
    
        # Creates every plot (uncomment specific plots to add them to the analysis; adding sections will increase the run time)
        rule plot:
          input: # VI.a Regional comparisons
                 #expand(rules.plot_t1.output, region=config["region"], depth=config["depth"] , plot_folder=config["output_folder"]),
                 #expand(rules.plot_t2.output, region=config["region"], depth=config["depth"], threshold=config["threshold_1"], plot_folder=config["output_folder"]),
                 #expand(rules.plot_region_all.output, region=config["region"], depth=config["depth"] , plot_folder=config["output_folder"]),
                 #expand(rules.plot_depth.output, region=config["region"], threshold=config["threshold_1"], plot_folder=config["output_folder"]),
                 #expand(rules.plot_depth_all.output, region=config["region"], threshold=config["threshold_1"], plot_folder=config["output_folder"]),
                 #expand(rules.plot_depth_all_concat.output, threshold=config["threshold_1"], plot_folder=config["output_folder"]),
                 # VI.b Meta-consensus concatenation
                 expand(rules.extract_mismatches.output , threshold=config["threshold_1"], value=config["threshold_2"], output_folder=config["output_folder"], depth=config["depth"]),
                 expand(rules.stats_ref_metaconsensus.output , threshold=config["threshold_1"], value=config["threshold_2"], output_folder=config["output_folder"], depth=config["depth"])
                 # VI.c Consensus concatenation
                 #expand(rules.stats_ref_metaconsensus_concat.output , threshold=config["threshold_1"], value=config["threshold_2"], output_folder=config["output_folder"], depth=config["depth"])
                 #expand(rules.extract_mismatches_concat.output , threshold=config["threshold_1"], value=config["threshold_2"], output_folder=config["output_folder"], depth=config["depth"]),
                 # VI.d Strategies comparisons
                 #expand(rules.plot_depth_all_concat.output, threshold=config["threshold_1"], plot_folder=config["output_folder"]),
          message: "Completed all tasks"
    
    else :
        rule plot:
          run: sys.exit("Plotting requires a reference to create statistics")