Snakefile

    import os, subprocess, sys
    import time
    from scripts.extract_longest_read import extract_longest_read
    
    # ------------------------------------ #
    # Config value checks and initialization #
    if not "region_size" in config:
        config["region_size"] = 0
    else :
        config["region_size"] = int(config["region_size"])
    
    if not "data_out" in config:
        config["data_out"] = os.path.join(config["output_folder"],"data")
    
    if not "consensus_folder" in config:
        config["consensus_folder"] = os.path.join(config["output_folder"],"consensus")
    
    if not "tool" in config:
        config["tool"] = [ "kalign2", "kalign3", "mafft", "muscle", "abpoa", "spoa"]
    
    if not "align" in config:
        config["align"] = "muscle"
    
    if not "threshold_1" in config:
        config["threshold_1"] = [50]
    
    if not "threshold_2" in config:
        config["threshold_2"] = [50]
    
    if not "data_in" in config:
        config["data_in"] = ""
    
    config["early_concat"] = True
    
    # These constants are modifiable values that need testing to better adapt the warnings and the overall pipeline run. #
    
    # HIGH_DEPTH and HIGH_LENGTH are the thresholds above which the warnings for slow runs are printed.
    HIGH_DEPTH = 60
    HIGH_LENGTH = 5000
    
    # This constant is the minimal coverage percentage required for a read to be kept for a specific region.
    READ_COVERAGE_PERCENTAGE = 50
    
    # This constant is the size of the region extracted before and after a mismatch position.
    MISMATCH_AREA_SIZE = 40
    
    REGION_HEAD_OVERLAP = int(config["region_overlap"]) if "region_overlap" in config else 0
    
    # This constant holds the path to the MSA results #
    DATASET = expand("{data}/{format}", data=config["data_out"], format="msa")
    
    # This constant lists the metrics plotted in the graphs; it might need to be updated along with the exonerate stats and plotting rules
    GRAPH_TYPES = ["identity", "similarity", "equivalence", "mismatches","sequence_length", "aligned_length"]
    
    
    # Here are some helper functions required for pipeline runs without a reference, or without a defined length or depth
    
    def get_length(file):
        """
        This function gets the length of the first read in target file
        :param file: the relative path to the file
        :return: the length of the first read
        """
        with open(file, 'r') as f:
            f.readline()
            l = f.readline()
        return len(l)
    
    def cut_region_from_length(length):
        """
        Creates region intervals of the correct size, with the last region ending at length
        :param length: the last value for the regions
        :return: a list of regions written in the format "start_end"
        """
        start = 1
        region = []
        if config["region_size"] == 0:
            return ["1_"+str(length -1)]
        while start < int(length):
            region.append(str(start)+"_"+str( min(start+config["region_size"] , int(length)-1)))
            start += config["region_size"] - REGION_HEAD_OVERLAP
        return region
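
    # Illustrative example of the function above: with region_size=400, REGION_HEAD_OVERLAP=50 and
    # length=1000, cut_region_from_length(1000) returns ["1_401", "351_751", "701_999"]
    # (the numbers are arbitrary and only chosen to show the overlap between consecutive regions).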
    
    def get_depth(file):
        """
        This function gets the maximum depth of target read file (two-line FASTA records are assumed)
        :param file: the relative path to the read file
        :return: the maximum read depth
        """
        out = subprocess.run(["wc", "-l", file], capture_output=True, text=True)
        return int(out.stdout.split()[0]) // 2
    
    def eprint(*args, **kwargs):
        """
        This function is a wrapper to easily print in the error out
        """
        print(*args, file=sys.stderr, **kwargs)
    
    
    
    # Depth_check is used in the pipeline to print the correct depth in the file names.
    if config["depth"] == 0:
        depth_check = get_depth(config["read_file"])
    else:
        depth_check = config['depth']
    if not isinstance(depth_check, list):
        depth_check = [depth_check]
    
    # This snippet of code creates the regions if they aren't preset
    if not 'region' in config:
        if not 'length' in config:
            if 'ref_file' in config:
                config['length'] = get_length(os.path.join(config["data_in"],config["ref_file"]))
            else:
                anchor = extract_longest_read(os.path.join(config["data_in"], config["read_file"]), 1)[0]
                config['length'] = len(anchor)
        config['region']= cut_region_from_length(config['length'])
        if config['region_size'] == 0 :
            config['region_size'] = config['length']+1
    
    # --------------------------------- #
    # Print warnings for high depth and length #
    if max(depth_check) > HIGH_DEPTH:
        eprint("\nWarning: a high depth may cause the Multiple Sequence Alignment phase to be extremely SLOW")
        if "muscle" in config["tool"]:
            eprint("\t- muscle may be extremely SLOW for high depth")
        if "abpoa" in config["tool"]:
            eprint("\t- abpoa may require a lot of memory to run, consider disabling it or lowering the depth if you encounter problems allocating memory")
        if "spoa" in config["tool"]:
            eprint("\t- spoa may require a lot of memory to run, consider disabling it or lowering the depth if you encounter problems allocating memory")
        time.sleep(1.3)
    if 'length' in config and int(config['length']) > int(HIGH_LENGTH):
        eprint("\nWarning: a high region length may cause the Multiple Sequence Alignment phase to be extremely SLOW")
        if "muscle" in config["tool"]:
            eprint("\t- muscle may be extremely SLOW for high length")
        if "abpoa" in config["tool"]:
            eprint("\t- abpoa may require a lot of memory to run, consider disabling it or working on smaller regions if you encounter problems allocating memory")
        if "spoa" in config["tool"]:
            eprint("\t- spoa may require a lot of memory to run, consider disabling it or working on smaller regions if you encounter problems allocating memory")
        time.sleep(1.3)
    
    
    # Generates the meta-consensus from given reads #
    rule all:
         input:
    #          expand("{output_folder}/meta_consensus/{region}/meta_consensus_{align}_r{region}_d{depth}_t1_{threshold}_t2_{value}.fasta", region=config["region"], depth=config["depth"], threshold=config["threshold_1"], value=config["threshold_2"], align="muscle", output_folder=config["output_folder"])
            expand(os.path.join("{output_folder}","meta_consensus","meta_consensus_d{depth}_t1_{threshold}_t2_{value}.fasta"), depth=config["depth"], threshold=config["threshold_1"], value=config["threshold_2"], align="muscle", output_folder=config["output_folder"])
    
         message: "Meta consensus can be found in {input}"
    
    # -------------------------------------------------------- #
    # Builds the MTool binary used for consensus and meta-consensus using the provided Makefile #
    rule MTool:
         input:
              "src/msa_handler/MSA.cpp", "src/msa_handler/MSA.h", "src/msa_handler/main.cpp", "src/msa_handler/Makefile"
         log: "logs/MTool.log"
         output:
              "src/msa_handler/MTool"
         shell:
              "cd src/msa_handler && make MTool"
    
    # -------------------------------------------------------- #
    
    # I. Creating a read pile
    
    # -------------------------------------------------------- #
    # The initial part of the pipeline differs depending on whether a reference file has been provided or not #
    # This is the version with a reference #
    if "ref_file" in config:
        # -------------------------------------------------------- #
        # Aligns the reads on a given reference with minimap2 #
        rule alignment_reads_on_ref :
              input :
                   ref=os.path.join(config["data_in"],config["ref_file"]),
                   read=os.path.join(config["data_in"], config["read_file"])
              output :
                   os.path.join(config["data_out"],'alignment','aln_reads_on_ref.sam')
              message :
                   "Minimap for {config[data_in]}"
              log:
                   os.path.join(config["output_folder"],'logs','1_alignment_reads_on_ref.log')
              conda:
                   "envs/minimap2.yaml"
              shell :
                'ORDER="minimap2 -cax map-ont -t 8  {input.ref} {input.read}";'
                'echo "ORDER: $ORDER" >{log};'
                '$ORDER  >{output} 2>> {log}'
    
        # Cuts the reads into smaller regions to reduce the overall size and lower the computational cost, and creates the FASTA files used to build the MSAs #
        rule reads_map_region:
            input :
                aln = os.path.join(config['data_out'],'alignment','aln_reads_on_ref.sam')
            output :
                os.path.join(config['data_out'],'read_map_region','read_r{start}_{end}.fasta')
            message:
                "Reads map region for (Region start= {wildcards.start} end= {wildcards.end})"
            log:
                os.path.join(config['output_folder'],'logs','1_reads_map_region_r{start}_{end}.log')
            conda:
                "envs/perl.yaml"
            params: cvalue=READ_COVERAGE_PERCENTAGE
            shell :
                'ORDER="./workflow/scripts/reads_map_region.pl -s {wildcards.start} -e {wildcards.end} -c {READ_COVERAGE_PERCENTAGE} {input.aln} {output}";'
                'echo "ORDER: $ORDER" >{log};'
                '$ORDER >>{log} 2>&1'
    
    # -------------------------------------------------------- #
    # This is the version without a reference
    else :
    
        # Extracts the longest read from the read pile to use as an anchor for a pre-alignment #
        rule extract_longest_read:
            input : read=os.path.join(config["data_in"],config["read_file"])
            output : os.path.join(config["data_out"],"anchor.fasta")
            message: "Extracting the longest read from {config[read_file]}"
            shell : "./workflow/scripts/extract_longest_read.py -i {input.read} -o {output} -n 1 "
    
    
        # Align the reads on anchor read #
        rule alignment_reads_on_anchor :
            input :
                ref=os.path.join(config["data_out"],"anchor.fasta"),
                read=os.path.join(config["data_in"], config["read_file"])
            output :
                os.path.join(config["data_out"],'alignment','aln_reads_on_read.sam')
            message :
                "Minimap for {config[read_file]}"
            log: os.path.join(config["output_folder"],'logs','1_alignment_reads_on_anchor.log')
            conda:
                "envs/minimap2.yaml"
            shell :
                'ORDER="minimap2 -cax map-ont -t 8  {input.ref} {input.read}";'
                'echo "ORDER: $ORDER" >{log};'
                '$ORDER  >{output} 2>> {log}'
    
    
        # This rule creates a file with possible region cuts where the anchor read appears to have insertion errors.
        # It has not found its use in the pipeline yet, but might still be of interest overall
        rule get_regions_for_read :
            input : sam=rules.alignment_reads_on_anchor.output
            output: os.path.join(config["data_out"], 'regions.txt')
            message: "Getting the regions where the reads are aligned on the anchor"
            conda: "envs/samtools.yaml"
            params: depth=READ_COVERAGE_PERCENTAGE
            shell: "./workflow/scripts/prepare_region_from_read.sh {input.sam} {output} {params.depth}"
    
        # Cuts the reads into smaller regions to reduce the overall size and lower the computational cost, and creates the FASTA files used to build the MSAs #
        rule reads_map_regions_no_ref:
            input: sam=rules.alignment_reads_on_anchor.output, regions=rules.get_regions_for_read.output #, fix={rules.set_region_elem.output}
            output: os.path.join(config['data_out'], "read_map_region", r'read_r{start}_{end,\d+}.fasta')
            conda: "envs/perl.yaml"
            log:
                os.path.join(config['output_folder'],'logs','1_reads_map_region_r{start}_{end}.log')
            shell :
                    'ORDER="./workflow/scripts/reads_map_region.pl -s {wildcards.start} -e {wildcards.end} -c {READ_COVERAGE_PERCENTAGE} {input.sam} {output}";'
                    'echo "ORDER: $ORDER" >{log};'
                    '$ORDER >>{log} 2>&1'
    
    
    # -------------------------------------------------------- #
    # This rule picks the number of reads kept for the target region
    rule select_depth :
        input :
            os.path.join(config['data_out'],'read_map_region','read_r{region}.fasta')
        output :
            os.path.join(config['data_out'],'read_map_region','reads_r{region}_d{depth}.fasta')
        message:
            "Select depth for (Region: {wildcards.region} & Depth={wildcards.depth})"
        log:
            os.path.join(config['output_folder'],'logs','1_select_number_reads_r{region}_d{depth}.log')
        shell :
            'ORDER="./workflow/scripts/selected_nb_read.sh {input} {wildcards.depth} {output}";'
            'echo "ORDER: $ORDER" >{log};'
            '$ORDER >{output} 2>>{log}'
    
    
    # -------------------------------------------------------- #
    
    # II. MSA phase
    
    # ------------------------------------------------------- #
    # Creation of MSAs from the read pile using the MSA tools #
    
    # If you want to add an MSA tool to the pipeline, you need to create a rule following this format, create a matching conda env, and add the tool *NAME* to the tools list in your configfile. A hypothetical filled-in example is given after the template below.
    
    #rule *NAME* :
    #    input :
    #        os.path.join(config["data_out"],"read_map_region",'reads_r{region}_d{depth}.fasta')
    #    output :
    #        out = os.path.join('{output_folder}', "data" ,"msa",'MSA_*NAME*_r{region}_d{depth}.fasta')
    #    message:
    #        "*NAME* for {wildcards.output_folder} (Region :{wildcards.region} & Depth={wildcards.depth})"
    #    log:
    #        os.path.join('{output_folder}','logs','2_*NAME*_r{region}_d{depth}.log')
    #    conda:
    #        "envs/*NAME*.yaml"
    #    shell :
    #        "the shell command for your tool. with {input} as the input and {output} as the output"
    # This will only work if the command outputs the files as written in output.
    # If all is done correctly, you should be able to add the tool to the pipeline run
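
    # For illustration only, here is the template above filled in for a hypothetical tool named clustalo.
    # The tool name, the envs/clustalo.yaml file and the clustalo flags are assumptions and not part of
    # this pipeline, so the example is left commented out.
    #rule clustalo :
    #    input :
    #        os.path.join(config["data_out"],"read_map_region",'reads_r{region}_d{depth}.fasta')
    #    output :
    #        out = os.path.join('{output_folder}', "data", "msa", 'MSA_clustalo_r{region}_d{depth}.fasta')
    #    message:
    #        "Clustalo for {wildcards.output_folder} (Region :{wildcards.region} & Depth={wildcards.depth})"
    #    log:
    #        os.path.join('{output_folder}','logs','2_clustalo_r{region}_d{depth}.log')
    #    conda:
    #        "envs/clustalo.yaml"
    #    shell :
    #        "clustalo -i {input} -o {output.out} --force 2> {log}"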
    
    # -------------------------------------------------------- #
    
    rule muscle :
        input :
            os.path.join('{output_folder}', 'data',"read_map_region",'reads_r{region}_d{depth}.fasta')
        output :
            out = os.path.join('{output_folder}', "data", "msa"  ,'MSA_muscle_r{region}_d{depth}.fasta')
        message:
            "Muscle for {wildcards.output_folder} (Region :{wildcards.region} & Depth={wildcards.depth})"
        log:
            os.path.join('{output_folder}','logs','2_muscle_r{region}_d{depth}.log')
        conda:
            "envs/muscle.yaml"
        shell :
            "muscle -in {input} -out {output} 2> {log}"
    
    rule mafft :
        input :
            os.path.join(config["data_out"],"read_map_region",'reads_r{region}_d{depth}.fasta')
        output :
            out = os.path.join('{output_folder}', "data" ,"msa" ,'mafft_uncorrected_r{region}_d{depth}.fasta')
        message:
            "Mafft for {wildcards.output_folder} (Region :{wildcards.region} & Depth={wildcards.depth})"
        log:
            os.path.join('{output_folder}','logs','2_mafft_r{region}_d{depth}.log')
        conda:
            "envs/mafft.yaml"
        shell :
           "mafft {input} > {output} 2> {log}"
    
    rule mafft_correction :
        input :
            os.path.join('{output_folder}', "data" ,"msa",'mafft_uncorrected_r{region}_d{depth}.fasta')
        output :
            os.path.join('{output_folder}', "data"  ,"msa",'MSA_mafft_r{region}_d{depth}.fasta')
        message:
            "Mafft correction for {wildcards.output_folder} (Region :{wildcards.region} & Depth={wildcards.depth})"
        shell :
            'cat {input} | tr "atgc" "ATGC" >{output} ; rm -f {input}'
    
    rule spoa :
        input :
            os.path.join(config["data_out"],"read_map_region",'reads_r{region}_d{depth}.fasta')
        output :
            out = os.path.join('{output_folder}', "data" ,"msa",'MSA_spoa_r{region}_d{depth}.fasta')
        message:
            "Spoa for {wildcards.output_folder} (Region :{wildcards.region} & Depth={wildcards.depth})"
        log:
            os.path.join('{output_folder}','logs','2_spoa_r{region}_d{depth}.log')
        conda:
            "envs/spoa.yaml"
        shell :
           "spoa -r 1 -l 1 {input} > {output} 2> {log}"
    
    rule kalign2 :
        input :
            os.path.join(config["data_out"],"read_map_region",'reads_r{region}_d{depth}.fasta')
        output :
            out = os.path.join('{output_folder}', "data" ,"msa",'MSA_kalign2_r{region}_d{depth}.fasta')
        message:
            "Kalign2 for {wildcards.output_folder} (Region :{wildcards.region} & Depth={wildcards.depth})"
        log:
            os.path.join('{output_folder}','logs','2_kalign_r{region}_d{depth}.log')
        conda:
            "envs/kalign2.yaml"
        shell :
            "kalign -i {input} -out {output.out} 1> {log} 2> {log}"
    
    rule kalign3 :
        input :
            os.path.join(config["data_out"],"read_map_region",'reads_r{region}_d{depth}.fasta')
        output :
            out = os.path.join('{output_folder}', "data" ,"msa",'MSA_kalign3_r{region}_d{depth}.fasta')
        message:
            "Kalign3 for {wildcards.output_folder} (Region :{wildcards.region} & Depth={wildcards.depth})"
        log:
            os.path.join('{output_folder}','logs','2_kalign3_r{region}_d{depth}.log')
        conda:
            "envs/kalign3.yaml"
        shell :
            "kalign -i {input} -o {output.out} 1>{log} 2> {log}"
    
    rule abpoa :
        input :
            os.path.join(config["data_out"],"read_map_region",'reads_r{region}_d{depth}.fasta')
        output :
            out = os.path.join('{output_folder}' , "data" ,"msa",'abpoa_uncorrected_r{region}_d{depth}.fasta')
        message:
            "Abpoa for {wildcards.output_folder} (Region :{wildcards.region} & Depth={wildcards.depth})"
        log:
            os.path.join('{output_folder}',"logs",'2_abpoa_r{region}_d{depth}.log')
        conda:
            "envs/abpoa.yaml"
        shell :
            "abpoa -r1 {input} > {output} 2> {log}"
    
    rule abpoa_correction :
        input :
            os.path.join('{output_folder}' , "data" ,"msa",'abpoa_uncorrected_r{region}_d{depth}.fasta')
        output :
            os.path.join('{output_folder}', "data" ,"msa",'MSA_abpoa_r{region}_d{depth}.fasta')
        message:
            "Abpoa correction for {wildcards.output_folder} (Region :{wildcards.region} & Depth={wildcards.depth})"
        shell :
            'tail -n +2 {input} | sed "s/^/>seq\\n/" > {output} 2> /dev/null;'
    
    
    # -------------------------------------------------------- #
    
    # III. Consensus phase
    
    # -------------------------------------------------------- #
    # Creation of every individual tool's consensus for every region #
    rule create_consensus_per_tool:
         input:
            rules.MTool.output, data=expand(os.path.join("{dataset}","MSA_{tool}_r{region}_d{depth}.fasta"), allow_missing=True, dataset=DATASET)
         output:
             os.path.join("{output_folder}", "consensus", "{region}","{depth}","{threshold}","consensus_{tool}_{region}_{depth}.fasta")
         log: "{output_folder}/logs/consensus_per_tool_{tool}_r{region}_d{depth}_t{threshold}.log"
         message: "Consensus for tools"
         shell: "(./src/msa_handler/MTool {input.data} {output} {wildcards.threshold} ) 2> {log}"
    
    
    # After the regional consensus, there are two possible ways to concatenate the regions.
    # We can either concatenate the regional consensuses at this point to align full-size sequences and build a single meta-consensus (early concat, b.), or build a meta-consensus per region and concatenate those afterwards (a.).

    # Both options are implemented below; see the sketch right after this comment.
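
    # Sketch of the two flows, using the rule names defined below:
    #   a. meta-consensus concatenation: create_consensus_per_tool -> create_consensus
    #      -> align_muscle (or align_mafft) -> regional_meta_consensus -> meta_consensus
    #   b. early concatenation: create_consensus_per_tool -> create_consensus_per_tool_concat
    #      -> create_consensus_concat -> align_muscle_concat -> meta_consensus_concat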
    
    # -------------------------------------------------------- #
    
    # a. Meta-consensus concatenation
    
    # --------------------------------------------------------#
    
    # III.a Consensus phase (end)
    
    # --------------------------------------------------------#
    
    # Creates the consensus pile in order to align it in the next phase
    rule create_consensus:
         input: expand(rules.create_consensus_per_tool.output, tool=config["tool"], allow_missing=True, dataset=DATASET)
         params: data=DATASET
         log: "{output_folder}/logs/3a_consensus_r{region}_d{depth}_t{threshold}.log"
         output:
             os.path.join("{output_folder}", "consensus" ,"{region}","{depth}","all_tools_consensus_{region}_{depth}_{threshold}.fasta")
         message: "Consensus region: {wildcards.region} , depth: {wildcards.depth} , threshold: {wildcards.threshold}\n"  
         shell: 'cat {input} > {output} 2>{log}'
            #  "./workflow/scripts/MTool_over_folder.py -dir {params.data} -o {output} -region {wildcards.region} -depth {wildcards.depth} -threshold {wildcards.threshold}"#
    
    # -------------------------------------------------------- #
    
    # IV.a Meta-MSA phase
    
    # -------------------------------------------------------- #
    # Alignment of the consensus for the meta-consensus #
    
    rule align_mafft:
         input:
              rules.create_consensus.output
         output:
             os.path.join("{output_folder}", "consensus","{region}","{depth}","mafft_consensus_{region}_{depth}_{threshold}.fasta")
         conda:
              "envs/mafft.yaml"
         log: "{output_folder}/logs/4a_align_mafft_r{region}_d{depth}_t{threshold}.log"
         message: "Alignment with mafft region: {wildcards.region} , depth: {wildcards.depth} , threshold: {wildcards.threshold}\n"
         shell:
              "mafft --auto {input} > {output} 2> {log}"
    
    rule align_muscle:
         input:
             expand(rules.create_consensus.output, allow_missing=True)
         output:
             os.path.join("{output_folder}","consensus", "{region}","{depth}","muscle_consensus_{region}_{depth}_{threshold}.fasta")
         conda:
              "envs/muscle.yaml"
         log: "{output_folder}/logs/4a_align_muscle_r{region}_d{depth}_t{threshold}.log"
         message: "Alignment muscle region: {wildcards.region} , depth: {wildcards.depth} , threshold: {wildcards.threshold}\n"
         shell:
              "muscle -in {input} -out {output} > {log} 2>{log}"
    
    # -------------------------------------------------------- #
    
    # V.a Meta-consensus
    
    # -------------------------------------------------------- #
    # Individual meta consensus #
    rule regional_meta_consensus:
         input:
              muscle=expand(rules.align_muscle.output, output_folder=config['output_folder'], allow_missing=True)
         output:
             os.path.join("{output_folder}","meta_consensus", "{region}", "meta_consensus_{align}_r{region}_d{depth}_t1_{threshold}_t2_{value}.fasta")
         log: "{output_folder}/logs/5a_meta_consensus_{align}_r{region}_d{depth}_t1_{threshold}_t2_{value}.log"
         message: "Meta-consensus region: {wildcards.region} , depth: {wildcards.depth} , consensus threshold: {wildcards.threshold} , meta consensus threshold: {wildcards.value}\n"
         shell:
              "./src/msa_handler/MTool {input.muscle} {output} {wildcards.value} 2> {log}"
    
    
    rule meta_consensus:
        input: expand(rules.regional_meta_consensus.output, region=config["region"], align="muscle" , allow_missing=True)
        output: os.path.join("{output_folder}", "meta_consensus","meta_consensus_d{depth}_t1_{threshold}_t2_{value}.fasta")
        log: "{output_folder}/logs/5a_meta_consensus_d{depth}_t1_{threshold}_t2_{value}.log"
        message: "Concatenating regional meta-consensus into a single sequence"
        shell:
            "./workflow/scripts/concat_regions.py {output} {input} 2> {log}"
    
    # ------------------------------------------------------- #
    # b. Concatenating regions before the meta-consensus phase #
    # ------------------------------------------------------- #
    
    
    if "early_concat" in config:
    
        # -------------------------------------------------------- #
    
        # III.b Consensus phase (end)
    
        # --------------------------------------------------------#
        # Concatenates each tool's regional consensuses into a single sequence
        rule create_consensus_per_tool_concat:
         input:
            expand(rules.create_consensus_per_tool.output, region=config["region"], allow_missing=True)
         output:
            os.path.join("{output_folder}", "consensus", "{depth}","{threshold}","consensus_{tool}_{depth}.fasta")
         log: "{output_folder}/logs/3b_consensus_per_tool_concat_{tool}_d{depth}_t{threshold}.log"
         message: "Consensus for tools"
         shell: "./workflow/scripts/concat_regions.py {output} {input}" # >{log} 2>&1"
    
        # Creates the consensus pile for the meta-MSA phase
        rule create_consensus_concat:
         input: expand(rules.create_consensus_per_tool_concat.output, tool=config["tool"], allow_missing=True, dataset=DATASET)
         params: data=DATASET
         log: "{output_folder}/logs/3b_consensus_d{depth}_t{threshold}.log"
         output:
            os.path.join("{output_folder}", "consensus", "{depth}","all_tools_consensus_{depth}_{threshold}.fasta")
         message: "Consensus depth: {wildcards.depth} , threshold: {wildcards.threshold}\n"
         shell: 'cat {input} > {output} 2>{log}'
    
        # -------------------------------------------------------- #
    
        # IV.b Meta-MSA phase
    
        # -------------------------------------------------------- #
    
        # Creates the meta-MSA using muscle
        rule align_muscle_concat:
         input:
            expand(rules.create_consensus_concat.output, output_folder=config["output_folder"], allow_missing=True)
         output:
            os.path.join("{output_folder}","consensus", "{depth}","muscle_consensus_{depth}_{threshold}.fasta")
         conda:
            "envs/muscle.yaml"
         log: "{output_folder}/logs/4b_align_muscle_d{depth}_t{threshold}.log"
         message: "Alignment muscle depth: {wildcards.depth} , threshold: {wildcards.threshold}\n"
         shell:
            "muscle -in {input} -out {output} > {log} 2>{log}"
    
        # -------------------------------------------------------- #
    
        # V.b Meta consensus phase#
    
        # -------------------------------------------------------- #
    
        rule meta_consensus_concat:
         input:
            muscle=expand(rules.align_muscle_concat.output, output_folder=config['output_folder'], allow_missing=True)
         output:
            os.path.join("{output_folder}","meta_consensus", "meta_consensus_concat_{align}_d{depth}_t1_{threshold}_t2_{value}.fasta")
         log: "{output_folder}/logs/5b_meta_consensus_{align}_d{depth}_t1_{threshold}_t2_{value}.log"
         message: "Meta-consensus depth: {wildcards.depth} , consensus threshold: {wildcards.threshold} , meta consensus threshold: {wildcards.value}\n"
         shell:
            "./src/msa_handler/MTool {input.muscle} {output} {wildcards.value}"
    
    
        rule concat_before_consensus:
         input: expand(rules.meta_consensus_concat.output, output_folder=config["output_folder"], align="muscle", depth=config["depth"], threshold=config["threshold_1"], value=config["threshold_2"])
    
    # --------------------------------------------------------#
    
    # This part handles the comparison to a reference and creates plots to better visualize the efficiency of the consensus strategy used for both the preliminary consensus and the meta-consensus. #
    
    if "ref_file" in config:
        # -------------------------------------------------------- #
    
        # VI. Meta-consensus analysis
    
        # -------------------------------------------------------- #
        # VI.a Regional analysis
        # -------------------------------------------------------- #
    
        # Cuts the reference for every region
        rule region_seq :
         input :
            ref =os.path.join(config["data_in"],config["ref_file"])
         output :
            os.path.join(config['output_folder'], "data", "region_references","region_seq_r{start}_{end}.fasta")
         message:
            "Region's seq for (Region {wildcards.start} - {wildcards.end})"
         log:
            os.path.join(config['output_folder'],"logs","6_region_seq_r{start}_{end}.log")
         shell :
            'ORDER="./workflow/scripts/region_seq.sh {wildcards.start} {input.ref} {output} {wildcards.end}";'
            'echo "ORDER: $ORDER" >{log};'
            '$ORDER >>{log} 2>&1'
    
        # Exonerate sequence comparison between meta-consensus and reference for each region
        rule exonerate_stats_meta:
          input: files=expand(rules.regional_meta_consensus.output, align="muscle", value=config["threshold_2"], allow_missing=True) , ref=expand("{ref}/region_references/region_seq_r{region}.fasta", data_in=config["data_in"], ref=config["data_out"], allow_missing=True)
          output: os.path.join("{output_folder}","plot","{region}","{depth}","{threshold}","aln_stats_meta.txt")
          log: "{output_folder}/logs/6_plot_exonerate_meta_r{region}_d{depth}_t{threshold}.log"
          conda: "envs/exonerate.yaml"
          message: "Exonerate comparison between meta-consensus and ref (region: {wildcards.region}, depth: {wildcards.depth}, preliminary consensus threshold: {wildcards.threshold}) for meta-consensus threshold in [ {config[threshold_2]} ]"
          shell: "./workflow/scripts/exonerate_stats.sh {output} {input.ref} {input.files}"
    
        # Exonerate sequence comparison between consensus and reference for each region and each tool
        rule exonerate_stats_tools:
          input:
              ref=expand("{ref}/region_references/region_seq_r{region}.fasta", data_in=config["data_in"], ref=config["data_out"], allow_missing=True),
              consensus_per_tool=expand(rules.create_consensus_per_tool.output, value=config["threshold_2"], allow_missing=True)
          output: os.path.join("{output_folder}","plot","{region}","{depth}","{threshold}","aln_stats_{tool}.txt")
          log: "{output_folder}/logs/6_plot_exonerate_tools_{tool}_r{region}_d{depth}_t{threshold}.log"
          conda: "envs/exonerate.yaml"
          message: "Exonerate comparison between consensus for tool {wildcards.tool} and ref (region: {wildcards.region}, depth: {wildcards.depth}, preliminary consensus threshold: {wildcards.threshold}) for meta-consensus threshold in [ {config[threshold_2]} ]"
          shell: "./workflow/scripts/exonerate_stats.sh {output} {input.ref} {input.consensus_per_tool} 2> {log}"
    
        # Stats for every region for each tool
        rule stats_region_align:
          input: expand(rules.exonerate_stats_tools.output, region=config["region"], allow_missing=True)
          output: os.path.join("{output_folder}","plot","{depth}","{threshold}","aln_stats_region_{tool}.txt")
          log: "{output_folder}/logs/6_plot_exonerate_tools_regions_{tool}_d{depth}_t{threshold}.log"
          conda: "envs/exonerate.yaml"
          message: "Exonerate comparison between consensus for all regions tool {wildcards.tool} and ref (depth: {wildcards.depth}, preliminary consensus threshold: {wildcards.threshold})"
          shell: "cat {input} > {output} 2> {log}"
    
        # Stats for every region for the meta-consensus
        rule stats_region_meta:
          input: expand(rules.exonerate_stats_meta.output, region=config["region"], allow_missing=True)
          output: os.path.join("{output_folder}","plot","{depth}","{threshold}","meta_aln_stats_region.txt")
          log: "{output_folder}/logs/6_plot_exonerate_meta_regions_d{depth}_t{threshold}.log"
          conda: "envs/exonerate.yaml"
          message: "Exonerate comparison between meta-consensus for all regions and ref (depth: {wildcards.depth}, preliminary consensus threshold: {wildcards.threshold})"
          shell: "cat {input} > {output} 2> {log}"
    
        # Plots for every region, every consensus threshold for given depth
        rule plot_region_all:
          input: expand(rules.stats_region_align.output, threshold=config["threshold_1"], tool=config["tool"], output_folder=config["output_folder"], allow_missing=True) , expand(rules.stats_region_meta.output, threshold=config["threshold_1"], output_folder=config["output_folder"], allow_missing=True)
          params: target_folder="{plot_folder}/plot/{depth}" , region=[elem for elem in config["region"]]
          log: "{plot_folder}/logs/6_plot_t1_all_region_d{depth}.log"
          output: expand(os.path.join("{plot_folder}","plot","{depth}","graph_all_region_d{depth}_{graph}.png"), graph=GRAPH_TYPES, allow_missing=True)
          message: "Creating plot for meta-consensus quality with preliminary threshold in [ {config[threshold_1]} ] and meta-consensus in [ {config[threshold_2]} ] (depth: {wildcards.depth})"
          shell: "./workflow/scripts/plot_results.py {wildcards.plot_folder}/plot/{wildcards.depth}/graph_all_region_d{wildcards.depth} '[{params.region}]' {input} 2> {log} "
    
        # Plots by depth for meta-consensus regions
        rule plot_depth:
          input: expand(rules.exonerate_stats_meta.output, depth=config["depth"], output_folder=config['output_folder'], allow_missing=True)
          params: target_folder="plot/{region}/graph"
          log: "{plot_folder}/logs/6_plot_depth_r{region}_t{threshold}.log"
          output: expand(os.path.join("{plot_folder}","plot","{region}","graph","graph_depth_r{region}_t{threshold}_{graph}.png"), graph=GRAPH_TYPES, depth=config["depth"],  allow_missing=True)
          message: "Exonerate comparison between meta-consensus and ref (region: {wildcards.region}, or preliminary consensus threshold in [ {config[threshold_1]} ],  meta-consensus threshold in [ {config[threshold_2]} ]"
          shell: "./workflow/scripts/plot_results.py {wildcards.plot_folder}/plot/{wildcards.region}/graph/graph_depth_r{wildcards.region}_t{wildcards.threshold} '[{config[threshold_2]}]' {input} 2> {log}"
    
        # Plots by depth for every region
        rule plot_depth_all:
          input: expand(rules.exonerate_stats_meta.output, depth=config["depth"], output_folder=config["output_folder"], allow_missing=True), expand(rules.exonerate_stats_tools.output, tool=config["tool"], depth=config["depth"], output_folder=config["output_folder"], allow_missing=True)
          params: target_folder="consensus/plot/{region}/graph"
          log: "{plot_folder}/logs/plot_all_depth_r{region}_t{threshold}.log"
          output: expand(os.path.join("{plot_folder}","plot","{region}","graph","graph_depth_all_r{region}_t{threshold}_{graph}.png"), graph=GRAPH_TYPES, depth=config["depth"],  allow_missing=True)
          message: "Exonerate comparison between meta-consensus and ref (region: {wildcards.region}, or preliminary consensus threshold in [ {config[threshold_1]} ],  meta-consensus threshold in [ {config[threshold_2]} ]"
          shell: "./workflow/scripts/plot_results.py {wildcards.plot_folder}/plot/{wildcards.region}/graph/graph_depth_all_r{wildcards.region}_t{wildcards.threshold} '[{config[threshold_2]}]' {input} 2> {log}"
    
        # Plots for every consensus threshold
        rule plot_t1:
          input: expand(rules.exonerate_stats_meta.output, threshold=config["threshold_1"], output_folder=config["output_folder"], allow_missing=True)
          params: target_folder="{plot_folder}/plot/{region}/{depth}"
          log: "{plot_folder}/logs/plot_t1_r{region}_d{depth}.log"
          output: expand(os.path.join("{plot_folder}","plot","{region}","{depth}","graph_r{region}_d{depth}_{graph}.png"), graph=GRAPH_TYPES, allow_missing=True)
          message: "Creating plot for meta-consensus quality with preliminary threshold in [ {config[threshold_1]} ] and meta-consensus in [ {config[threshold_2]} ] (region: {wildcards.region}, depth: {wildcards.depth})"
          shell: "./workflow/scripts/plot_results.py {wildcards.plot_folder}/plot/{wildcards.region}/{wildcards.depth}/graph_r{wildcards.region}_d{wildcards.depth} '[{config[threshold_2]}]' {input} 2> {log} "
    
        # Plots for every meta-consensus threshold
        rule plot_t2:
          input: expand(rules.exonerate_stats_tools.output, tool=config["tool"], output_folder=config["output_folder"], allow_missing=True),
                 expand(rules.exonerate_stats_meta.output, output_folder=config["output_folder"], allow_missing=True)
          params: target_folder="{plot_folder}/plot/{region}/{depth}/{threshold}/"
          log: "{plot_folder}/logs/plot_t2_r{region}_d{depth}_t_{threshold}.log"
          output: expand(os.path.join("{plot_folder}","plot","{region}","{depth}","{threshold}","graph_r{region}_d{depth}_t1_{threshold}_{graph}.png"), graph=GRAPH_TYPES, allow_missing=True)
          message: "Creating plot for meta-consensus compared to preliminary consensus with meta-consensus threshold in {config[threshold_2]}  (region: {wildcards.region}, depth: {wildcards.depth}, threshold: {wildcards.threshold})"
          shell: "./workflow/scripts/plot_results.py {wildcards.plot_folder}/plot/{wildcards.region}/{wildcards.depth}/{wildcards.threshold}/graph_r{wildcards.region}_d{wildcards.depth}_t1_{wildcards.threshold} '[{config[threshold_2]}]'  {input} 2> {log}"
    
    
        # ------------------------------------------------------ #
        #  VI.b Regions concatenated after the meta-consensus #
        # ------------------------------------------------------ #
    
        # Exonerate for pairwise alignment
        rule comparison_ref_metaconsensus:
          input: query=rules.meta_consensus.output, target=config["ref_file"]
          output: os.path.join("{output_folder}", "mismatches", "alignment_meta_ref_d{depth}_t1_{threshold}_t2_{value}.fasta")
          conda: "envs/exonerate.yaml"
          shell: "exonerate --bestn 1 -Q dna -E -m a:g --showvulgar false --showsugar false --showcigar false --showalignment true {input.target} {input.query} > {output} 2>/dev/null"
    
        # Exonerate for stats
        rule stats_ref_metaconsensus:
          input: query=rules.meta_consensus.output, target=config["ref_file"]
          output: os.path.join("{output_folder}", "plot", "stats_meta_ref_d{depth}_t1_{threshold}_t2_{value}.fasta")
          conda: "envs/exonerate.yaml"
          shell: "./workflow/scripts/exonerate_stats.sh {output} {input.target} {input.query}"
    
    
        # Extracting the mismatching regions, using exonerate to align the meta-consensus with the reference to find them.
        rule extract_mismatches:
          input: rules.comparison_ref_metaconsensus.output
          output: os.path.join("{output_folder}", "mismatches", "mismatches_d{depth}_t1_{threshold}_t2_{value}.fasta")
          shell: "./src/util/exonerate_parser.py {input} {output} {MISMATCH_AREA_SIZE} {config[region_size]} '{wildcards.output_folder}/data/read_map_region' '{config[region]}' "
    
    
        # ------------------------------------------------------ #
        # VI.c Regions concatenated after the consensus phase #
        # ------------------------------------------------------ #
    
        # Exonerate for pairwise alignment
        rule comparison_ref_metaconsensus_concat:
          input: query=expand(rules.meta_consensus_concat.output, align="muscle", allow_missing=True), target=config["ref_file"]
          output: os.path.join("{output_folder}", "mismatches", "alignment_meta_concat_ref_d{depth}_t1_{threshold}_t2_{value}.fasta")
          conda: "envs/exonerate.yaml"
          shell: "exonerate --bestn 1 -Q dna -E -m a:g --showvulgar false --showsugar false --showcigar false --showalignment true {input.target} {input.query} > {output} 2>/dev/null"
    
        # Exonerate for stats
        rule stats_ref_metaconsensus_concat:
          input: query=expand(rules.meta_consensus_concat.output, align="muscle", allow_missing=True), target=config["ref_file"]
          output: os.path.join("{output_folder}", "plot", "stats_meta_concat_ref_d{depth}_t1_{threshold}_t2_{value}.fasta")
          conda: "envs/exonerate.yaml"
          shell: "./workflow/scripts/exonerate_stats.sh {output} {input.target} {input.query}"
    
        # Extracting the mismatching regions, using exonerate to align the meta-consensus with the reference to find them.
        rule extract_mismatches_concat:
          input: rules.comparison_ref_metaconsensus_concat.output
          output: os.path.join("{output_folder}", "mismatches", "mismatches_concat_d{depth}_t1_{threshold}_t2_{value}.fasta")
          shell: "./src/util/exonerate_parser.py {input} {output}  {MISMATCH_AREA_SIZE} {config[region_size]} '{wildcards.output_folder}/data/read_map_region' '{config[region]}' "
    
        # --------------------------------------------------------#
        # VI.d Comparisons between the two concatenation strategies
        # --------------------------------------------------------#
    
        # Plots comparing the two strategies of concatenation #
        # Stats for all meta_consensus threshold for concatenated meta-consensus
        rule stats_ref_metaconsensus_all_t2:
          input: expand(rules.stats_ref_metaconsensus.output, value=config["threshold_2"],allow_missing=True)
          output: os.path.join("{output_folder}", "plot", "stats_meta_ref_d{depth}_t1_{threshold}_tALL.fasta")
          message: "Regroup every meta-consensus stats "
          shell: "cat {input} > {output}"
    
        # Stats for all meta_consensus threshold for concatenated consensus
        rule stats_ref_metaconsensus_all_t2_concat:
          input: expand(rules.stats_ref_metaconsensus_concat.output, value=config["threshold_2"],allow_missing=True)
          output: os.path.join("{output_folder}", "plot", "stats_meta_concat_ref_d{depth}_t1_{threshold}_tALL.fasta")
          shell: "cat {input} > {output}"
    
        # Plot for concatenated consensus, and both concatenated meta-consensus strategies
        rule plot_depth_all_concat:
          input: expand(rules.stats_ref_metaconsensus_all_t2.output, depth=config["depth"], output_folder=config["output_folder"], allow_missing=True), expand(rules.stats_ref_metaconsensus_all_t2_concat.output, depth=config["depth"], output_folder=config["output_folder"], allow_missing=True), expand(rules.exonerate_stats_tools.output, tool=config["tool"], depth=config["depth"], output_folder=config["output_folder"], allow_missing=True)
          params: target_folder="consensus/plot/graph"
          log: "{plot_folder}/logs/plot_all_depth_concat_t{threshold}.log"
          output: expand(os.path.join("{plot_folder}","plot","graph","graph_depth_all_concat_t{threshold}_{graph}.png"), graph=GRAPH_TYPES, depth=config["depth"],  allow_missing=True)
          message: "Exonerate comparison between meta-consensus and ref (preliminary consensus threshold in [ {config[threshold_1]} ],  meta-consensus threshold in [ {config[threshold_2]} ]"
          shell: "./workflow/scripts/plot_results.py {wildcards.plot_folder}/plot/graph/graph_depth_all_concat_t{wildcards.threshold} '[{config[threshold_2]}]' {input} 2> {log}"
    
    
        # Creates every plot (uncomment specific plots to add them to the analysis; adding sections will increase the run time)
        rule plot:
          input: # VI.a Regional comparisons
                 #expand(rules.plot_t1.output, region=config["region"], depth=config["depth"] , plot_folder=config["output_folder"]),
                 #expand(rules.plot_t2.output, region=config["region"], depth=config["depth"], threshold=config["threshold_1"], plot_folder=config["output_folder"]),
                 #expand(rules.plot_region_all.output, region=config["region"], depth=config["depth"] , plot_folder=config["output_folder"]),
                 #expand(rules.plot_depth.output, region=config["region"], threshold=config["threshold_1"], plot_folder=config["output_folder"]),
                 #expand(rules.plot_depth_all.output, region=config["region"], threshold=config["threshold_1"], plot_folder=config["output_folder"]),
                 #expand(rules.plot_depth_all_concat.output, threshold=config["threshold_1"], plot_folder=config["output_folder"]),
                 # VI.b Meta-consensus concatenation
                 expand(rules.extract_mismatches.output , threshold=config["threshold_1"], value=config["threshold_2"], output_folder=config["output_folder"], depth=config["depth"]),
                 expand(rules.stats_ref_metaconsensus.output , threshold=config["threshold_1"], value=config["threshold_2"], output_folder=config["output_folder"], depth=config["depth"])
                 # VI.c Consensus concatenation
                 #expand(rules.stats_ref_metaconsensus_concat.output , threshold=config["threshold_1"], value=config["threshold_2"], output_folder=config["output_folder"], depth=config["depth"])
                 #expand(rules.extract_mismatches_concat.output , threshold=config["threshold_1"], value=config["threshold_2"], output_folder=config["output_folder"], depth=config["depth"]),
                 # VI.d Strategies comparisons
                 #expand(rules.plot_depth_all_concat.output, threshold=config["threshold_1"], plot_folder=config["output_folder"]),
          message: "Completed all tasks"
    
    else :
        rule plot:
          run: sys.exit("Plotting requires a reference to create statistics")