Skip to content
Snippets Groups Projects
Commit 4d9d8404 authored by Rohmer Coralie's avatar Rohmer Coralie
Browse files

ready: ecoli hifi illumina yeast simulated

parent 151b3efb
No related branches found
No related tags found
No related merge requests found
...@@ -30,7 +30,7 @@ READS_ILLUMINA_BMB="ERR1308675.fastq" ...@@ -30,7 +30,7 @@ READS_ILLUMINA_BMB="ERR1308675.fastq"
REF_CCN="ccnscafold.fa" REF_CCN="ccnscafold.fa"
REF_BMB="bmbscafold.fa" REF_BMB="bmbscafold.fa"
ID=["CCN_yeast","BMB_yeast","diploid_yeast","ecoli_hifi","ecoli_illumina"] ID=["CCN_yeast","BMB_yeast","diploid_yeast","ecoli_hifi","ecoli_illumina"]
CONFIGS=["","_clustal","_tcoffee"] CONFIGS=[""]#,"_clustal","_tcoffee"]
READS_HUMAN="reads.fastq" READS_HUMAN="reads.fastq"
REF_HUMAN="chm13.draft_v1.1.fasta" REF_HUMAN="chm13.draft_v1.1.fasta"
...@@ -93,260 +93,6 @@ rule align_human_reads_ref : ...@@ -93,260 +93,6 @@ rule align_human_reads_ref :
'minimap2 -cax map-ont -t '+THREAD+' {input.ref} {input.reads} >{output} 2>{log}' 'minimap2 -cax map-ont -t '+THREAD+' {input.ref} {input.reads} >{output} 2>{log}'
#------------------------------------------------------------------------------- #-------------------------------------------------------------------------------
# Ecoli hifi data
#-------------------------------------------------------------------------------
rule download_hifi_reads_ecoli :
output :
os.path.join('ecoli_hifi_data',READS_HIFI)
message :
"Download hifi reads for ecoli hifi data"
log:
os.path.join('logs','ecoli_hifi_data','1_download_hifi.log')
conda:
"env_conda/fastq-dump.yaml"
shell:
'fastq-dump --gzip SRR11434954 >{log} 2>&1;'
'mv ' + READS_HIFI + ' {output}'
rule download_nanopore_reads_ecoli_hifi :
output :
os.path.join('ecoli_hifi_data',READS_NANOPORE_ECOLI_HIFI + '.gz')
message :
"Download nanopore reads for ecoli hifi data"
log:
os.path.join('logs','ecoli_hifi_data','2_download_nanopore.log')
conda:
"env_conda/fastq-dump.yaml"
shell:
'fastq-dump --gzip SRR12801740 >{log} 2>&1;'
'mv ' + READS_NANOPORE_ECOLI_HIFI + '.gz {output}'
rule gunzip_nanopore_reads_ecoli_hifi :
input :
os.path.join('ecoli_hifi_data',READS_NANOPORE_ECOLI_HIFI + '.gz')
output :
os.path.join('ecoli_hifi_data',READS_NANOPORE_ECOLI_HIFI)
message :
"Decompress nanopore reads for ecoli hifi data"
log:
os.path.join('logs','ecoli_hifi_data','3_gunzip_nanopore_reads.log')
conda:
"env_conda/gzip.yaml"
shell:
'gunzip {input} >{log} 2>&1'
rule select_hifi_reads_ecoli :
input :
os.path.join('ecoli_hifi_data',READS_HIFI)
output:
os.path.join('ecoli_hifi_data','reads_hifi.fastq')
message :
"Select hifi reads for ecoli hifi data"
log:
os.path.join('logs','ecoli_hifi_data','4_select_hifi_reads.log')
conda:
"env_conda/seqkit.yaml"
shell:
'set +o pipefail;'
'seqkit sort -l -r {input} 2>{log}|head -100000 >{output} 2>{log}'
rule assembly_hifi_reads_ecoli :
input :
os.path.join('ecoli_hifi_data','reads_hifi.fastq')
output:
os.path.join('ecoli_hifi_data','assembly_hifi','assembly.fasta')
message :
"Assembly hifi reads for ecoli hifi data with flye"
log:
os.path.join('logs','ecoli_hifi_data','5_assembly_hifi_reads.log')
conda:
"env_conda/flye.yaml"
shell:
'flye --pacbio-hifi {input} --genome-size 5m --threads ' + THREAD + ' -i 2 -o ecoli_hifi_data/assembly_hifi >{log} 2>&1'
rule extract_first_contig_hifi_reads :
input :
os.path.join('ecoli_hifi_data','assembly_hifi','assembly.fasta')
output:
os.path.join('ecoli_hifi_data','ref_ecoli_hifi.fasta')
message :
"Extract first contig assembly for ecoli hifi data"
log:
os.path.join('logs','ecoli_hifi_data','6_extract_first_contig_hifi_reads.log')
shell:
'set +o pipefail;'
'cat {input}|tr "\\n" "@"|sed "s/@>/\\n>/g"|sed "s/@/\\n/;s/@//g"|head -2 >{output} 2>{log}'
rule configuration_file_ecoli_hifi_data:
input :
ref=os.path.join('ecoli_hifi_data','ref_ecoli_hifi.fasta'),
reads=os.path.join('ecoli_hifi_data',READS_NANOPORE_ECOLI_HIFI)
output :
ref=os.path.join('before_configuration_files','config_ecoli_hifi.yaml')
message :
"Configuration file for ecoli hifi"
log:
os.path.join('logs','ecoli_hifi_data','7_configuration_ecoli_hifi.log')
shell:
'PATH_DATA=`pwd`;'
'echo -e "N: ecoli_hifi" >{output};'
'echo -e "I: $PATH_DATA/{input.ref}" >>{output};'
'echo -e "R: $PATH_DATA/{input.reads}" >>{output}'
#-------------------------------------------------------------------------------
# Ecoli illumina data
#-------------------------------------------------------------------------------
rule download_illumina_reads_ecoli :
output :
os.path.join('ecoli_illumina_data',READS_ILLUMINA+ '.gz')
message :
"Download illumina reads for ecoli illumina data"
log:
os.path.join('logs','ecoli_illumina_data','1_download_illumina.log')
conda:
"env_conda/fastq-dump.yaml"
shell:
'fastq-dump --gzip --split-spot --clip SRR8333590 >{log} 2>&1;'
'mv ' + READS_ILLUMINA + '.gz {output}'
rule gunzip_illumina_reads_ecoli_illumina:
input :
os.path.join('ecoli_illumina_data',READS_ILLUMINA + '.gz')
output :
os.path.join('ecoli_illumina_data',READS_ILLUMINA)
message :
"Decompress illumina reads for ecoli illumina data"
log:
os.path.join('logs','ecoli_illumina_data','2_gunzip_illumina_reads.log')
conda:
"env_conda/gzip.yaml"
shell:
'gunzip {input} >{log} 2>&1'
rule download_nanopore_reads_ecoli_illumina :
output :
os.path.join('ecoli_illumina_data',READS_NANOPORE_ECOLI_ILLUMINA + '.gz')
message :
"Download nanopore reads for ecoli illumina data"
log:
os.path.join('logs','ecoli_illumina_data','3_download_nanopore.log')
conda:
"env_conda/fastq-dump.yaml"
shell:
'fastq-dump --gzip SRR8335315 >{log} 2>&1;'
'mv ' + READS_NANOPORE_ECOLI_ILLUMINA + '.gz {output}'
rule gunzip_nanopore_reads_ecoli_illumina :
input :
os.path.join('ecoli_illumina_data',READS_NANOPORE_ECOLI_ILLUMINA + '.gz')
output :
os.path.join('ecoli_illumina_data',READS_NANOPORE_ECOLI_ILLUMINA)
message :
"Decompress nanopore reads for ecoli illumina data"
log:
os.path.join('logs','ecoli_illumina_data','4_gunzip_nanopore_reads.log')
conda:
"env_conda/gzip.yaml"
shell:
'gunzip {input} >{log} 2>&1'
rule download_ref_ecoli :
output :
os.path.join('ecoli_illumina_data',REF_ECOLI + '.gz')
message :
"Download ref ecoli for ecoli illumina data"
log:
os.path.join('logs','ecoli_illumina_data','5_download_ref_ecoli.log')
shell:
'wget https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/004/358/385/GCF_004358385.1_ASM435838v1/GCF_004358385.1_ASM435838v1_genomic.fna.gz -o {log};'
'mv ' + REF_ECOLI + '.gz {output}'
rule gunzip_ref_ecoli :
input :
os.path.join('ecoli_illumina_data',REF_ECOLI+ '.gz')
output :
os.path.join('ecoli_illumina_data',REF_ECOLI)
message :
"Decompress ref ecoli for ecoli illumina data"
log:
os.path.join('logs','ecoli_illumina_data','6_gunzip_ref_ecoli.log')
conda:
"env_conda/gzip.yaml"
shell:
'gunzip {input} >{log} 2>&1'
rule assembly_illumina_reads_ecoli :
input :
os.path.join('ecoli_illumina_data',READS_ILLUMINA)
output:
os.path.join('ecoli_illumina_data','assembly_illumina','contigs.fasta')
message :
"Assembly illumina reads for ecoli illumina data with spades"
log:
os.path.join('logs','ecoli_illumina_data','7_assembly_illumina_reads.log')
conda:
"env_conda/spades.yaml"
shell:
'spades.py -s {input} -o ecoli_illumina_data/assembly_illumina --only-assembler -t '+THREAD+' >{log} 2>&1'
rule extract_first_chromosome_ref :
input :
os.path.join('ecoli_illumina_data',REF_ECOLI)
output:
os.path.join('ecoli_illumina_data','ref_ecoli.fasta')
message :
"Extract first chromosome ref for ecoli illumina data"
log:
os.path.join('logs','ecoli_illumina_data','8_extract_first_chromosome_ref.log')
shell:
'head -67952 {input} >{output} 2>{log}'
rule align_illumina_contig_ref :
input :
contigs=os.path.join('ecoli_illumina_data','assembly_illumina','contigs.fasta'),
ref=os.path.join('ecoli_illumina_data','ref_ecoli.fasta')
output:
os.path.join('ecoli_illumina_data','aln_contigs_illumina_ref_ecoli.sam')
message :
"Align illumina contigs with ref for ecoli illumina data with minimap"
log:
os.path.join('logs','ecoli_illumina_data','9_align_illumina_contig_ref.log')
conda:
"env_conda/minimap2.yaml"
shell:
'minimap2 -cax asm5 -t '+THREAD+' {input.ref} {input.contigs} >{output} 2>{log}'
rule reads_map_region_ecoli_illumina :
input :
os.path.join('ecoli_illumina_data','aln_contigs_illumina_ref_ecoli.sam'),
output :
os.path.join('ecoli_illumina_data','ref_illumina_ecoli.fasta')
message:
"Reads map region for ecoli illumina data"
log:
os.path.join('logs','ecoli_illumina_data','10_reads_map_region.log')
conda:
"env_conda/perl.yaml"
shell :
'DEB=4845200;END=5080000;'
'./src/reads_map_region.pl -s $DEB -e $END {input} {output} >{log} 2>&1'
rule configuration_file_ecoli_illumina_data:
input :
ref=os.path.join('ecoli_illumina_data','ref_illumina_ecoli.fasta'),
reads=os.path.join('ecoli_illumina_data',READS_NANOPORE_ECOLI_ILLUMINA)
output :
ref=os.path.join('before_configuration_files','config_ecoli_illumina.yaml')
message :
"Configuration file for ecoli illumina"
log:
os.path.join('logs','ecoli_illumina_data','11_configuration_ecoli_illumina.log')
shell:
'PATH_DATA=`pwd`;'
'echo -e "N: ecoli_illumina" >{output};'
'echo -e "I: $PATH_DATA/{input.ref}" >>{output};'
'echo -e "R: $PATH_DATA/{input.reads}" >>{output}'
#-------------------------------------------------------------------------------
# Yeast data # Yeast data
#------------------------------------------------------------------------------- #-------------------------------------------------------------------------------
rule download_CCN_yeast_reads : rule download_CCN_yeast_reads :
...@@ -463,7 +209,7 @@ rule assembly_illumina_reads_CCN : ...@@ -463,7 +209,7 @@ rule assembly_illumina_reads_CCN :
illumina=os.path.join('yeast_data',READS_ILLUMINA_CCN + '.gz'), illumina=os.path.join('yeast_data',READS_ILLUMINA_CCN + '.gz'),
nanopore=os.path.join('yeast_data',READS_CCN) nanopore=os.path.join('yeast_data',READS_CCN)
output: output:
os.path.join('yeast_data','assembly_illumina_CCN','contigs.fasta') os.path.join('yeast_data','assembly_illumina_ccn','contigs.fasta')
message : message :
"Assembly illumina reads for CCN yeast data with spades" "Assembly illumina reads for CCN yeast data with spades"
log: log:
...@@ -471,14 +217,14 @@ rule assembly_illumina_reads_CCN : ...@@ -471,14 +217,14 @@ rule assembly_illumina_reads_CCN :
conda: conda:
"env_conda/spades.yaml" "env_conda/spades.yaml"
shell: shell:
'spades.py --12 {input.illumina} --nanopore {input.nanopore} -o yeast_data/assembly_illumina_CCN --only-assembler -t '+THREAD+' >{log} 2>&1' 'spades.py --12 {input.illumina} --nanopore {input.nanopore} -o yeast_data/assembly_illumina_ccn --only-assembler -t '+THREAD+' >{log} 2>&1'
rule assembly_illumina_reads_BMB : rule assembly_illumina_reads_BMB :
input : input :
illumina=os.path.join('yeast_data',READS_ILLUMINA_BMB + '.gz'), illumina=os.path.join('yeast_data',READS_ILLUMINA_BMB + '.gz'),
nanopore=os.path.join('yeast_data',READS_BMB) nanopore=os.path.join('yeast_data',READS_BMB)
output: output:
os.path.join('yeast_data','assembly_illumina_BMB','contigs.fasta') os.path.join('yeast_data','assembly_illumina_bmb','contigs.fasta')
message : message :
"Assembly illumina reads for BMB yeast data with spades" "Assembly illumina reads for BMB yeast data with spades"
log: log:
...@@ -486,44 +232,47 @@ rule assembly_illumina_reads_BMB : ...@@ -486,44 +232,47 @@ rule assembly_illumina_reads_BMB :
conda: conda:
"env_conda/spades.yaml" "env_conda/spades.yaml"
shell: shell:
'spades.py --12 {input.illumina} --nanopore {input.nanopore} -o yeast_data/assembly_illumina_BMB --only-assembler -t '+THREAD+' >{log} 2>&1' 'spades.py --12 {input.illumina} --nanopore {input.nanopore} -o yeast_data/assembly_illumina_bmb --only-assembler -t '+THREAD+' >{log} 2>&1'
rule reduced_ref_CCN: rule reduced_ref_bmb:
input : input :
os.path.join('yeast_data','assembly_illumina_CCN','contigs.fasta') os.path.join('yeast_data','assembly_illumina_bmb','contigs.fasta')
output : output :
os.path.join('yeast_data',"reduced_CCN.fasta") os.path.join('yeast_data',"reduced_assembly_bmb.fasta")
message : message :
"Reduced reference for CCN yeast" "Reduced reference for bmb yeast"
log: log:
os.path.join('logs','yeast_data','11_reduced_CCN_ref.log') os.path.join('logs','yeast_data','11_reduced_bmb_ref.log')
shell: shell:
'set +o pipefail;' 'set +o pipefail;'
'head -42442 {input} |tail -11976 >{output} 2>>{log}' 'LINE=`cat {input} |grep -n ">"|head -2| cut -d: -f1|tail -1`;'
'head -$(($LINE-1)) {input} >{output} 2>>{log}'
rule reduced_ref_BMB: rule reduced_ref_ccn:
input : input :
os.path.join('yeast_data','assembly_illumina_BMB','contigs.fasta') os.path.join('yeast_data','assembly_illumina_ccn','contigs.fasta')
output : output :
os.path.join('yeast_data',"reduced_BMB.fasta") os.path.join('yeast_data',"reduced_assembly_ccn.fasta")
message : message :
"Reduced reference for BMB yeast" "Reduced reference for ccn yeast"
log: log:
os.path.join('logs','yeast_data','12_reduced_BMB_ref.log') os.path.join('logs','yeast_data','12_reduced_ccn_ref.log')
shell: shell:
'set +o pipefail;' 'set +o pipefail;'
'head -15288 {input} >{output} 2>>{log}' 'LINE_6=`cat {input} |grep -n ">"|head -6| cut -d: -f1|tail -1`;'
'LINE_5=`cat {input} |grep -n ">"|head -5| cut -d: -f1|tail -1`;'
'head -$(($LINE_6 - 1)) {input} | tail -$(($LINE_6 - $LINE_5)) >{output} 2>>{log}'
rule align_ref_BMB_ref_CCN : rule align_ref_BMB_ref_CCN :
input : input :
ccn=os.path.join('yeast_data',"reduced_CCN.fasta"), ccn=os.path.join('yeast_data',"reduced_assembly_ccn.fasta"),
bmb=os.path.join('yeast_data',"reduced_BMB.fasta") bmb=os.path.join('yeast_data',"reduced_assembly_bmb.fasta")
output: output:
os.path.join('yeast_data','aln_bmb_ccn_reduit.sam') os.path.join('yeast_data','aln_reduced_assembly_bmb_ccn.sam')
message : message :
"Align BMB ref with CCN ref with minimap" "Align BMB ref with CCN ref with minimap"
log: log:
os.path.join('logs','yeast_data','13_align_ref_BMB_ref_CCN.log') os.path.join('logs','yeast_data','13_align_ref_bmb_ref_ccn.log')
conda: conda:
"env_conda/minimap2.yaml" "env_conda/minimap2.yaml"
shell: shell:
...@@ -531,7 +280,7 @@ rule align_ref_BMB_ref_CCN : ...@@ -531,7 +280,7 @@ rule align_ref_BMB_ref_CCN :
rule reads_map_region_yeast : rule reads_map_region_yeast :
input : input :
os.path.join('yeast_data','aln_bmb_ccn_reduit.sam') os.path.join('yeast_data','aln_reduced_assembly_bmb_ccn.sam')
output : output :
os.path.join('yeast_data','ref_ccn.fasta') os.path.join('yeast_data','ref_ccn.fasta')
message: message:
...@@ -541,12 +290,12 @@ rule reads_map_region_yeast : ...@@ -541,12 +290,12 @@ rule reads_map_region_yeast :
conda: conda:
"env_conda/perl.yaml" "env_conda/perl.yaml"
shell : shell :
'DEB=158700;END=453600;' 'DEB=1356150;END=1791934;'
'./src/reads_map_region.pl -s $DEB -e $END {input} {output} >{log} 2>&1' './src/reads_map_region.pl -s $DEB -e $END {input} {output} >{log} 2>&1'
rule region_seq_yeast : rule region_seq_yeast :
input : input :
os.path.join('yeast_data',"reduced_BMB.fasta") os.path.join('yeast_data',"reduced_assembly_bmb.fasta")
output : output :
start=os.path.join('yeast_data','start_region.txt'), start=os.path.join('yeast_data','start_region.txt'),
ref=os.path.join('yeast_data','ref_bmb.fasta') ref=os.path.join('yeast_data','ref_bmb.fasta')
...@@ -555,7 +304,7 @@ rule region_seq_yeast : ...@@ -555,7 +304,7 @@ rule region_seq_yeast :
log: log:
os.path.join('logs','yeast_data','15_region_seq_yeast.log') os.path.join('logs','yeast_data','15_region_seq_yeast.log')
shell : shell :
'DEB=158700;END=453600;' 'DEB=1356150;END=1791934;'
'echo $DEB >{output.start};' 'echo $DEB >{output.start};'
'SIZE=$(($END-$DEB));' 'SIZE=$(($END-$DEB));'
'./src/region_seq.sh {output.start} {input} {output.ref} $SIZE >{log} 2>&1' './src/region_seq.sh {output.start} {input} {output.ref} $SIZE >{log} 2>&1'
...@@ -573,8 +322,8 @@ rule configuration_file_CCN_yeast_data: ...@@ -573,8 +322,8 @@ rule configuration_file_CCN_yeast_data:
shell: shell:
'PATH_DATA=`pwd`;' 'PATH_DATA=`pwd`;'
'echo -e "N: CCN_yeast" >{output};' 'echo -e "N: CCN_yeast" >{output};'
'echo -e "I: $PATH_DATA/{input.ref}" >>{output};' 'echo -e "I: $PATH_DATA/{input.reads}" >>{output};'
'echo -e "R: $PATH_DATA/{input.reads}" >>{output}' 'echo -e "R: $PATH_DATA/{input.ref}" >>{output}'
rule configuration_file_BMB_yeast_data: rule configuration_file_BMB_yeast_data:
input : input :
...@@ -589,8 +338,8 @@ rule configuration_file_BMB_yeast_data: ...@@ -589,8 +338,8 @@ rule configuration_file_BMB_yeast_data:
shell: shell:
'PATH_DATA=`pwd`;' 'PATH_DATA=`pwd`;'
'echo -e "N: BMB_yeast" >{output};' 'echo -e "N: BMB_yeast" >{output};'
'echo -e "I: $PATH_DATA/{input.ref}" >>{output};' 'echo -e "I: $PATH_DATA/{input.reads}" >>{output};'
'echo -e "R: $PATH_DATA/{input.reads}" >>{output}' 'echo -e "R: $PATH_DATA/{input.ref}" >>{output}'
rule diploid_reads: rule diploid_reads:
input : input :
...@@ -624,7 +373,7 @@ rule align_ref_BMB_ref_CCN_exonerate : ...@@ -624,7 +373,7 @@ rule align_ref_BMB_ref_CCN_exonerate :
ccn=os.path.join('yeast_data',"ref_ccn.fasta"), ccn=os.path.join('yeast_data',"ref_ccn.fasta"),
bmb=os.path.join('yeast_data',"ref_bmb.fasta") bmb=os.path.join('yeast_data',"ref_bmb.fasta")
output: output:
os.path.join('yeast_data','aln_refs_bmb_ccn.fasta') os.path.join('yeast_data','aln_refs_bmb_ccn.txt')
message : message :
"Align BMB ref with CCN ref with exonerate" "Align BMB ref with CCN ref with exonerate"
log: log:
...@@ -632,7 +381,19 @@ rule align_ref_BMB_ref_CCN_exonerate : ...@@ -632,7 +381,19 @@ rule align_ref_BMB_ref_CCN_exonerate :
conda: conda:
"env_conda/exonerate.yaml" "env_conda/exonerate.yaml"
shell: shell:
'exonerate -m a:l --bigseq yes --dnahspthreshold 120 {input.bmb} {input.ccn} | ./src/exonerate_to_fasta.pl >{output} 2>{log}' 'exonerate -m a:g --bigseq yes -Q dna -E --bestn 1 --dnahspthreshold 120 {input.bmb} {input.ccn} >{output} 2>{log}'
rule exonerate_to_fasta :
input :
os.path.join('yeast_data','aln_refs_bmb_ccn.txt')
output:
os.path.join('yeast_data','aln_refs_bmb_ccn.fasta')
message :
"Exonerate to fasta"
log:
os.path.join('logs','yeast_data','20_bis_exonerate_to_fasta.log')
shell:
'./src/exonerate_to_fasta.pl <{input} >{output} 2>{log}'
rule ref_diploide : rule ref_diploide :
input : input :
...@@ -661,8 +422,264 @@ rule configuration_file_diploid_yeast_data: ...@@ -661,8 +422,264 @@ rule configuration_file_diploid_yeast_data:
shell: shell:
'PATH_DATA=`pwd`;' 'PATH_DATA=`pwd`;'
'echo -e "N: diploid_yeast" >{output};' 'echo -e "N: diploid_yeast" >{output};'
'echo -e "I: $PATH_DATA/{input.ref}" >>{output};' 'echo -e "I: $PATH_DATA/{input.reads}" >>{output};'
'echo -e "R: $PATH_DATA/{input.reads}" >>{output}' 'echo -e "R: $PATH_DATA/{input.ref}" >>{output}'
#-------------------------------------------------------------------------------
# Ecoli hifi data
#-------------------------------------------------------------------------------
rule download_hifi_reads_ecoli :
output :
os.path.join('ecoli_hifi_data',READS_HIFI)
message :
"Download hifi reads for ecoli hifi data"
log:
os.path.join('logs','ecoli_hifi_data','1_download_hifi.log')
conda:
"env_conda/fastq-dump.yaml"
shell:
'fastq-dump --gzip SRR11434954 >{log} 2>&1;'
'mv ' + READS_HIFI + ' {output}'
rule download_nanopore_reads_ecoli_hifi :
output :
os.path.join('ecoli_hifi_data',READS_NANOPORE_ECOLI_HIFI + '.gz')
message :
"Download nanopore reads for ecoli hifi data"
log:
os.path.join('logs','ecoli_hifi_data','2_download_nanopore.log')
conda:
"env_conda/fastq-dump.yaml"
shell:
'fastq-dump --gzip SRR12801740 >{log} 2>&1;'
'mv ' + READS_NANOPORE_ECOLI_HIFI + '.gz {output}'
rule gunzip_nanopore_reads_ecoli_hifi :
input :
os.path.join('ecoli_hifi_data',READS_NANOPORE_ECOLI_HIFI + '.gz')
output :
os.path.join('ecoli_hifi_data',READS_NANOPORE_ECOLI_HIFI)
message :
"Decompress nanopore reads for ecoli hifi data"
log:
os.path.join('logs','ecoli_hifi_data','3_gunzip_nanopore_reads.log')
conda:
"env_conda/gzip.yaml"
shell:
'gunzip {input} >{log} 2>&1'
rule select_hifi_reads_ecoli :
input :
os.path.join('ecoli_hifi_data',READS_HIFI)
output:
os.path.join('ecoli_hifi_data','reads_hifi.fastq')
message :
"Select hifi reads for ecoli hifi data"
log:
os.path.join('logs','ecoli_hifi_data','4_select_hifi_reads.log')
conda:
"env_conda/seqkit.yaml"
shell:
'set +o pipefail;'
'seqkit sort -l -r {input} 2>{log}|head -100000 >{output} 2>{log}'
rule assembly_hifi_reads_ecoli :
input :
os.path.join('ecoli_hifi_data','reads_hifi.fastq')
output:
os.path.join('ecoli_hifi_data','assembly_hifi','assembly.fasta')
message :
"Assembly hifi reads for ecoli hifi data with flye"
log:
os.path.join('logs','ecoli_hifi_data','5_assembly_hifi_reads.log')
conda:
"env_conda/flye.yaml"
shell:
'flye --pacbio-hifi {input} --genome-size 5m --threads ' + THREAD + ' -i 2 -o ecoli_hifi_data/assembly_hifi >{log} 2>&1'
rule extract_first_contig_hifi_reads :
input :
os.path.join('ecoli_hifi_data','assembly_hifi','assembly.fasta')
output:
os.path.join('ecoli_hifi_data','ref_ecoli_hifi.fasta')
message :
"Extract first contig assembly for ecoli hifi data"
log:
os.path.join('logs','ecoli_hifi_data','6_extract_first_contig_hifi_reads.log')
shell:
'set +o pipefail;'
'cat {input}|tr "\\n" "@"|sed "s/@>/\\n>/g"|sed "s/@/\\n/;s/@//g"|head -2 >{output} 2>{log}'
rule configuration_file_ecoli_hifi_data:
input :
ref=os.path.join('ecoli_hifi_data','ref_ecoli_hifi.fasta'),
reads=os.path.join('ecoli_hifi_data',READS_NANOPORE_ECOLI_HIFI)
output :
ref=os.path.join('before_configuration_files','config_ecoli_hifi.yaml')
message :
"Configuration file for ecoli hifi"
log:
os.path.join('logs','ecoli_hifi_data','7_configuration_ecoli_hifi.log')
shell:
'PATH_DATA=`pwd`;'
'echo -e "N: ecoli_hifi" >{output};'
'echo -e "I: $PATH_DATA/{input.reads}" >>{output};'
'echo -e "R: $PATH_DATA/{input.ref}" >>{output}'
#-------------------------------------------------------------------------------
# Ecoli illumina data
#-------------------------------------------------------------------------------
rule download_illumina_reads_ecoli :
output :
os.path.join('ecoli_illumina_data',READS_ILLUMINA+ '.gz')
message :
"Download illumina reads for ecoli illumina data"
log:
os.path.join('logs','ecoli_illumina_data','1_download_illumina.log')
conda:
"env_conda/fastq-dump.yaml"
shell:
'fastq-dump --gzip --split-spot --clip SRR8333590 >{log} 2>&1;'
'mv ' + READS_ILLUMINA + '.gz {output}'
rule gunzip_illumina_reads_ecoli_illumina:
input :
os.path.join('ecoli_illumina_data',READS_ILLUMINA + '.gz')
output :
os.path.join('ecoli_illumina_data',READS_ILLUMINA)
message :
"Decompress illumina reads for ecoli illumina data"
log:
os.path.join('logs','ecoli_illumina_data','2_gunzip_illumina_reads.log')
conda:
"env_conda/gzip.yaml"
shell:
'gunzip {input} >{log} 2>&1'
rule download_nanopore_reads_ecoli_illumina :
output :
os.path.join('ecoli_illumina_data',READS_NANOPORE_ECOLI_ILLUMINA + '.gz')
message :
"Download nanopore reads for ecoli illumina data"
log:
os.path.join('logs','ecoli_illumina_data','3_download_nanopore.log')
conda:
"env_conda/fastq-dump.yaml"
shell:
'fastq-dump --gzip SRR8335315 >{log} 2>&1;'
'mv ' + READS_NANOPORE_ECOLI_ILLUMINA + '.gz {output}'
rule gunzip_nanopore_reads_ecoli_illumina :
input :
os.path.join('ecoli_illumina_data',READS_NANOPORE_ECOLI_ILLUMINA + '.gz')
output :
os.path.join('ecoli_illumina_data',READS_NANOPORE_ECOLI_ILLUMINA)
message :
"Decompress nanopore reads for ecoli illumina data"
log:
os.path.join('logs','ecoli_illumina_data','4_gunzip_nanopore_reads.log')
conda:
"env_conda/gzip.yaml"
shell:
'gunzip {input} >{log} 2>&1'
rule download_ref_ecoli :
output :
os.path.join('ecoli_illumina_data',REF_ECOLI + '.gz')
message :
"Download ref ecoli for ecoli illumina data"
log:
os.path.join('logs','ecoli_illumina_data','5_download_ref_ecoli.log')
shell:
'wget https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/004/358/385/GCF_004358385.1_ASM435838v1/GCF_004358385.1_ASM435838v1_genomic.fna.gz -o {log};'
'mv ' + REF_ECOLI + '.gz {output}'
rule gunzip_ref_ecoli :
input :
os.path.join('ecoli_illumina_data',REF_ECOLI+ '.gz')
output :
os.path.join('ecoli_illumina_data',REF_ECOLI)
message :
"Decompress ref ecoli for ecoli illumina data"
log:
os.path.join('logs','ecoli_illumina_data','6_gunzip_ref_ecoli.log')
conda:
"env_conda/gzip.yaml"
shell:
'gunzip {input} >{log} 2>&1'
rule assembly_illumina_reads_ecoli :
input :
os.path.join('ecoli_illumina_data',READS_ILLUMINA)
output:
os.path.join('ecoli_illumina_data','assembly_illumina','contigs.fasta')
message :
"Assembly illumina reads for ecoli illumina data with spades"
log:
os.path.join('logs','ecoli_illumina_data','7_assembly_illumina_reads.log')
conda:
"env_conda/spades.yaml"
shell:
'spades.py -s {input} -o ecoli_illumina_data/assembly_illumina --only-assembler -t '+THREAD+' >{log} 2>&1'
rule extract_first_chromosome_ref :
input :
os.path.join('ecoli_illumina_data',REF_ECOLI)
output:
os.path.join('ecoli_illumina_data','ref_ecoli.fasta')
message :
"Extract first chromosome ref for ecoli illumina data"
log:
os.path.join('logs','ecoli_illumina_data','8_extract_first_chromosome_ref.log')
shell:
'head -67952 {input} >{output} 2>{log}'
rule align_illumina_contig_ref :
input :
contigs=os.path.join('ecoli_illumina_data','assembly_illumina','contigs.fasta'),
ref=os.path.join('ecoli_illumina_data','ref_ecoli.fasta')
output:
os.path.join('ecoli_illumina_data','aln_contigs_illumina_ref_ecoli.sam')
message :
"Align illumina contigs with ref for ecoli illumina data with minimap"
log:
os.path.join('logs','ecoli_illumina_data','9_align_illumina_contig_ref.log')
conda:
"env_conda/minimap2.yaml"
shell:
'minimap2 -cax asm5 -t '+THREAD+' {input.ref} {input.contigs} >{output} 2>{log}'
rule reads_map_region_ecoli_illumina :
input :
os.path.join('ecoli_illumina_data','aln_contigs_illumina_ref_ecoli.sam'),
output :
os.path.join('ecoli_illumina_data','ref_illumina_ecoli.fasta')
message:
"Reads map region for ecoli illumina data"
log:
os.path.join('logs','ecoli_illumina_data','10_reads_map_region.log')
conda:
"env_conda/perl.yaml"
shell :
'DEB=4845200;END=5080000;'
'./src/reads_map_region.pl -s $DEB -e $END {input} {output} >{log} 2>&1'
rule configuration_file_ecoli_illumina_data:
input :
ref=os.path.join('ecoli_illumina_data','ref_illumina_ecoli.fasta'),
reads=os.path.join('ecoli_illumina_data',READS_NANOPORE_ECOLI_ILLUMINA)
output :
ref=os.path.join('before_configuration_files','config_ecoli_illumina.yaml')
message :
"Configuration file for ecoli illumina"
log:
os.path.join('logs','ecoli_illumina_data','11_configuration_ecoli_illumina.log')
shell:
'PATH_DATA=`pwd`;'
'echo -e "N: ecoli_illumina" >{output};'
'echo -e "I: $PATH_DATA/{input.reads}" >>{output};'
'echo -e "R: $PATH_DATA/{input.ref}" >>{output}'
#------------------------------------------------------------------------------- #-------------------------------------------------------------------------------
# Simulated data # Simulated data
#------------------------------------------------------------------------------- #-------------------------------------------------------------------------------
...@@ -792,8 +809,8 @@ rule configuration_file_simulated_data: ...@@ -792,8 +809,8 @@ rule configuration_file_simulated_data:
'PATH_DATA=`pwd`;' 'PATH_DATA=`pwd`;'
'NAME_SIMULATED=`echo {input.reads}|cut -d"/" -f 2|sed "s/_0001.fastq//"`;' 'NAME_SIMULATED=`echo {input.reads}|cut -d"/" -f 2|sed "s/_0001.fastq//"`;'
'echo -e "N: $NAME_SIMULATED" >{output};' 'echo -e "N: $NAME_SIMULATED" >{output};'
'echo -e "I: $PATH_DATA/{input.ref}" >>{output};' 'echo -e "I: $PATH_DATA/{input.reads}" >>{output};'
'echo -e "R: $PATH_DATA/{input.reads}" >>{output}' 'echo -e "R: $PATH_DATA/{input.ref}" >>{output}'
#------------------------------------------------------------------------------- #-------------------------------------------------------------------------------
# Configuration files # Configuration files
......
...@@ -38,3 +38,9 @@ while (<>){ ...@@ -38,3 +38,9 @@ while (<>){
} }
} }
} }
if($nb_align==1){
print(">$name_query\n");
print("$seq_1\n");
print(">$name_target\n");
print("$seq_2\n");
}
...@@ -13,4 +13,4 @@ else ...@@ -13,4 +13,4 @@ else
CORES=$1 CORES=$1
fi fi
echo "Launch Snakemake" echo "Launch Snakemake"
snakemake -c$CORES -np --use-conda --rerun-incomplete --keep-going snakemake -c$CORES --use-conda --rerun-incomplete --keep-going
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment