Commit 6b067bc6 authored by Rohmer Coralie

add dataset stats + rename region to frame

parent e6bfc7fe
@@ -3,7 +3,7 @@ import os
 configfile: "config.yaml"
 EXP = "experiments"
-PREFIX_REGION="region_"
+PREFIX_FRAME="frame_"
 #Script usage
 def usage():
 print("\nScript:\tsnakemake",
@@ -13,8 +13,8 @@ def usage():
 "Arguments: ",
 " -required: i : nanoport long reads",
 " r : reference sequences. (1 or 2)",
-" -optional: o : number of regions to be tested",
-" b : beginning(s) position of region(s) (replacing nbr).",
+" -optional: o : number of frames to be tested",
+" b : beginning(s) position of frame(s) (replacing nbr).",
 " d : depth(s).",
 " s : sizes of regions",
 " t : threshold for sequence consensus",
@@ -70,12 +70,12 @@ try:
 beginings = config['B']
 except:
 try:
-nb_region = int(config['O'])
+nb_frame = int(config['O'])
 except:
-nb_region = 1
+nb_frame = 1
 beginings=[]
-first_begining = (genome_size_min - int(REGION_SIZES[-1]))/(nb_region+1)
-for i in range(nb_region):
+first_begining = (genome_size_min - int(REGION_SIZES[-1]))/(nb_frame+1)
+for i in range(nb_frame):
 beginings.append(int((i+1)*first_begining))
 DATA_SETS=[]
@@ -84,9 +84,9 @@ i=0
 if type(beginings) != int: # equal : if len(beginings) > 1
 for start in beginings:
 i += 1
-DATA_SETS.append(os.path.join(EXP,EXP_NAME,PREFIX_REGION + str(start)))
+DATA_SETS.append(os.path.join(EXP,EXP_NAME,PREFIX_FRAME + str(start)))
 else:
-DATA_SETS.append(os.path.join(EXP,EXP_NAME,PREFIX_REGION + str(beginings)))
+DATA_SETS.append(os.path.join(EXP,EXP_NAME,PREFIX_FRAME + str(beginings)))
 beginings = [int(beginings)]
@@ -114,7 +114,8 @@ rule all :
 expand(EXP + "/" + EXP_NAME + "/results/" + EXP_NAME + "_data_align_t{threshold}.csv", data_set=DATA_SETS, threshold=THRESHOLDS),
 expand('{data_set}/results/' + EXP_NAME + '_graph_{attribute}.pdf', data_set=DATA_SETS, attribute=ATTRIBUTES_DATA),
 expand(EXP + '/'+EXP_NAME + '/results/' + EXP_NAME + '_graph_{attribute}.pdf', data_set=DATA_SETS, attribute=ATTRIBUTES_DATA),
-expand(EXP + "/" + EXP_NAME + "/results/" + EXP_NAME + "_meta_consensus_data_align_t{threshold}.csv", threshold=THRESHOLDS)
+expand(EXP + "/" + EXP_NAME + "/results/" + EXP_NAME + "_meta_consensus_data_align_t{threshold}.csv", threshold=THRESHOLDS),
+os.path.join(EXP,EXP_NAME,'results','dataset_stats.txt')
 #-------------------------------------------------------------------------------
 # Data set preparation
@@ -126,7 +127,7 @@ rule data_set_preparation :
 output :
 os.path.join(EXP,EXP_NAME,'data','reads.fasta'),
 os.path.join(EXP,EXP_NAME,'data','ref.fasta'),
-expand(os.path.join('{data_set}','region_start','region_start.txt') , data_set = DATA_SETS)
+expand(os.path.join('{data_set}','frame_start','frame_start.txt') , data_set = DATA_SETS)
 message :
 "Data set preparation for "+EXP + '/'+ EXP_NAME
 log:
@@ -134,7 +135,7 @@ rule data_set_preparation :
 conda:
 "env_conda/python3.yaml"
 shell:
-'ORDER="./src/data_set_preparation.py -i {input} -p '+PREFIX_REGION+' -n '+ EXP + '/'+EXP_NAME + ' -b ' + ' '.join(map(str,beginings)) + '";'
+'ORDER="./src/data_set_preparation.py -i {input} -p '+PREFIX_FRAME+' -n '+ EXP + '/'+EXP_NAME + ' -b ' + ' '.join(map(str,beginings)) + '";'
 'echo "ORDER: $ORDER" >{log};'
 '$ORDER 2>&1 >>{log}'
@@ -164,7 +165,7 @@ rule alignment_reads_on_ref :
 rule reads_map_region :
 input :
 aln = os.path.join(EXP,EXP_NAME,'alignement','aln_reads_on_ref.sam'),
-start = os.path.join('{data_set}','region_start','region_start.txt')
+start = os.path.join('{data_set}','frame_start','frame_start.txt')
 output :
 os.path.join('{data_set}','read_map_region','reads_r{region_size}.fasta')
 message:
@@ -371,7 +372,7 @@ rule abpoa_correction :
 rule region_seq :
 input :
 ref = EXP + '/'+ EXP_NAME + '/data/ref.fasta',
-start = "{data_set}/region_start/region_start.txt"
+start = "{data_set}/frame_start/frame_start.txt"
 output :
 "{data_set}/seq_selectes_region/region_seq_r{region_size}.fasta"
 message:
@@ -596,3 +597,18 @@ rule meta_consensus_region_mean:
 'ORDER="./src/region_mean.py -in {input} -out {output} -t {wildcards.threshold}";'
 'echo "ORDER: $ORDER" >{log};'
 '$ORDER 2>>{log}'
+rule dataset_stats:
+input :
+os.path.join(EXP,EXP_NAME,'alignement','aln_reads_on_ref.sam')
+output :
+os.path.join(EXP,EXP_NAME,'results','dataset_stats.txt')
+message:
+"Dataset stats for " + EXP + '/'+ EXP_NAME
+log:
+EXP + '/'+EXP_NAME + "/logs/20_dataset_stats.log"
+conda:
+"env_conda/samtools.yaml"
+shell :
+'set +o pipefail;'
+'samtools stats {input} | head -38 >{output} 2>>{log}'
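
As a side note on the renamed logic above: when no -b beginnings are supplied, the Snakefile spreads nb_frame start positions evenly along the shortest reference, keeping clear of the last REGION_SIZES entry at the end. A minimal sketch of that spacing rule, with invented values for genome_size_min, the frame size, and nb_frame (none of them taken from the repository):

# Illustrative only: evenly spaced frame starts, mirroring the Snakefile's formula.
genome_size_min = 10_000   # assumed length of the shortest reference
frame_size = 1_000         # assumed last entry of REGION_SIZES
nb_frame = 4               # assumed number of frames requested with -o

# Same spacing rule as in the diff: divide the usable length by (nb_frame + 1)
# so the frames never start inside the last frame_size bases.
first_begining = (genome_size_min - frame_size) / (nb_frame + 1)
beginings = [int((i + 1) * first_begining) for i in range(nb_frame)]
print(beginings)           # [1800, 3600, 5400, 7200]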
+name: samtools
+channels:
+  - bioconda
+  - defaults
+dependencies:
+  - _libgcc_mutex=0.1=main
+  - _openmp_mutex=4.5=1_gnu
+  - bzip2=1.0.8=h7b6447c_0
+  - c-ares=1.18.1=h7f8727e_0
+  - ca-certificates=2021.10.26=h06a4308_2
+  - curl=7.80.0=h7f8727e_0
+  - krb5=1.19.2=hac12032_0
+  - libcurl=7.80.0=h0b77cf5_0
+  - libedit=3.1.20210910=h7f8727e_0
+  - libev=4.33=h7f8727e_1
+  - libgcc=7.2.0=h69d50b8_2
+  - libgcc-ng=9.3.0=h5101ec6_17
+  - libgomp=9.3.0=h5101ec6_17
+  - libnghttp2=1.46.0=hce63b2e_0
+  - libssh2=1.9.0=h1ba5d50_1
+  - libstdcxx-ng=9.3.0=hd4cf53a_17
+  - ncurses=6.3=h7f8727e_2
+  - openssl=1.1.1m=h7f8727e_0
+  - samtools=1.7=1
+  - xz=5.2.5=h7b6447c_0
+  - zlib=1.2.11=h7f8727e_4
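
For context on the new dataset_stats rule: samtools stats prints a commented header followed by tab-separated "SN" (summary numbers) lines, and head -38 keeps roughly that summary section. A hypothetical post-processing sketch (not part of the repository), assuming the default SN layout and the output path written by the rule:

# Hypothetical helper: turn the "SN" summary lines of dataset_stats.txt into a dict.
import sys

def read_sn_summary(path):
    stats = {}
    with open(path) as handle:
        for line in handle:
            if line.startswith("SN"):
                # SN lines look like: "SN<TAB>raw total sequences:<TAB>1234"
                _, key, value = line.rstrip("\n").split("\t")[:3]
                stats[key.rstrip(":")] = value
    return stats

if __name__ == "__main__":
    # e.g. experiments/<experiment_name>/results/dataset_stats.txt
    print(read_sn_summary(sys.argv[1]))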
@@ -24,9 +24,9 @@ def usage():
 " name of the experiment",
 " -o <int> ",
 " default: 10",
-" number of regions to be tested",
+" number of frames to be tested",
 " -b <int>,<int>,...",
-" beginning(s) position of region(s) (replacing -o)",
+" beginning(s) position of frame(s) (replacing -o)",
 " -d <int>,<int>,...",
 " default: 10,20,50",
 " sequencing depth(s) (number of reads)",
@@ -108,11 +108,11 @@ def summary():
 exp_names=re.sub('\n', r' ', exp_names)
 if os.path.exists("results_mean"):
 result = subprocess.run("rm -r results_mean",shell=True)
-if os.path.exists("results_all_regions"):
-result = subprocess.run("rm -r results_all_regions",shell=True)
+if os.path.exists("results_all_frames"):
+result = subprocess.run("rm -r results_all_frames",shell=True)
 result = subprocess.run("./src/total_data_format.py -n " + exp_names,shell=True)
 result = subprocess.run("./src/total_data_format.py -m -n " + exp_names,shell=True)
-print("See folders: results_mean & results_all_regions")
+print("See folders: results_mean & results_all_frames")
 else:
 print("No experiment has been launched yet")
@@ -253,8 +253,8 @@ else:
 lines.append("B: [" + beginings + "]")
 except:
 try:
-nb_region = sys.argv[sys.argv.index("-o")+1]
-lines.append("O: [" + nb_region + "]")
+nb_frame = sys.argv[sys.argv.index("-o")+1]
+lines.append("O: [" + nb_frame + "]")
 except:
 pass
 cores=""
...
@@ -10,7 +10,7 @@ except:
 try:
 prefix = sys.argv[sys.argv.index("-p")+1]
 except:
-sys.stderr.write("ERROR: The prefix for regions is missing.\n")
+sys.stderr.write("ERROR: The prefix for frame is missing.\n")
 try:
 exp_name = sys.argv[sys.argv.index("-n")+1]
@@ -47,5 +47,5 @@ os.system(order)
 for start in starts:
-order = "echo " + start + " >" + exp_name + "/" + prefix + start + "/region_start/region_start.txt"
+order = "echo " + start + " >" + exp_name + "/" + prefix + start + "/frame_start/frame_start.txt"
 os.system(order)
@@ -4,7 +4,7 @@ import subprocess
 EXP = "experiments"
 ATTRIBUTES_TO_DISPLAY=["percentage_Identity","percentage_Error","percentage_Match"]
 ATTRIBUTES_TO_DISPLAY_THRESHOLD_INDEPENDANT=["time","memory"]
-PREFIX="region_"
+PREFIX="frame_"
 RESULT_FOLDER="results"
 NAME_DATA_FILE="data_align_t"
 NAME_META_CONSENSUS="meta_consensus_"
@@ -49,7 +49,7 @@ except:
 files={}
 i=0
 result = subprocess.run("if [ ! -d results_mean ]; then mkdir results_mean;fi",shell=True)
-result = subprocess.run("if [ ! -d results_all_regions ]; then mkdir results_all_regions;fi",shell=True)
+result = subprocess.run("if [ ! -d results_all_frames ]; then mkdir results_all_frames;fi",shell=True)
 #-----------------------------------------------------------------------------
 # Retrieved what was needed to read the files
 #-----------------------------------------------------------------------------
@@ -146,10 +146,10 @@ for threshold in files :
 if attribute in ATTRIBUTES_TO_DISPLAY_THRESHOLD_INDEPENDANT:
 pass
 output_mean=open("results_mean/data_" + add_name_file_output + "mean_"+ attribute + ".csv","w")
-output_all=open("results_all_regions/data_" + add_name_file_output + "all_region_"+ attribute + ".csv","w")
+output_all=open("results_all_frames/data_" + add_name_file_output + "all_frame_"+ attribute + ".csv","w")
 else:
 output_mean=open("results_mean/data_" + add_name_file_output + "mean_"+ attribute + "_" + threshold + ".csv","w")
-output_all=open("results_all_regions/data_" + add_name_file_output + "all_region_"+ attribute + "_" + threshold + ".csv","w")
+output_all=open("results_all_frames/data_" + add_name_file_output + "all_frame_"+ attribute + "_" + threshold + ".csv","w")
 output_mean.write(",,")
 output_all.write(",,")
 for exp_name in data["order"]:
...
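
To make the effect of the rename on the aggregated outputs concrete, the open(...) calls above now write under results_all_frames/ with an all_frame_ name component. An illustrative reconstruction of the file names, with placeholder values for add_name_file_output, attribute, and threshold (all hypothetical):

# Illustrative only: CSV naming pattern after the rename, mirroring the open(...) calls above.
add_name_file_output = "example_"    # placeholder for the run-time prefix
attribute = "percentage_Identity"
threshold = "60"                     # placeholder consensus threshold

mean_csv = "results_mean/data_" + add_name_file_output + "mean_" + attribute + "_" + threshold + ".csv"
all_csv = "results_all_frames/data_" + add_name_file_output + "all_frame_" + attribute + "_" + threshold + ".csv"
print(mean_csv)   # results_mean/data_example_mean_percentage_Identity_60.csv
print(all_csv)    # results_all_frames/data_example_all_frame_percentage_Identity_60.csv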