Skip to content
Snippets Groups Projects
Commit 1de90c54 authored by Rohmer Coralie's avatar Rohmer Coralie
Browse files

add human and levure

parent edbb6667
Branches
No related tags found
No related merge requests found
#!/bin/bash
THREAD="24"
if [ ! -d "ecoli_hifi_data" ];then
mkdir "ecoli_hifi_data"
fi
cd ecoli_hifi_data
echo "Download hifi reads..."
#fastq-dump --gzip SRR11434954
echo "Download nanopore reads..."
#fastq-dump --gzip SRR12801740
#gunzip SRR12801740.fastq.gz >/dev/null
echo "Select hifi reads..."
#seqkit sort -l -r SRR11434954.fastq.gz|head -100000 >read_hifi.fastq
echo "Assembly of hifi (flye)"
#flye --pacbio-hifi read_hifi.fastq --genome-size 5m --threads $THREAD -i 2 -o ass_hifi
echo "Extract first contig"
#cat ass_hifi/assembly.fasta|tr "\n" "@"|sed "s/@>/\n>/g"|sed "s/@/\n/;s/@//g"|head -2 >ref_ecoli_hifi.fasta
PATH_DATA=`pwd`
NAME="ecoli_hifi"
IN="$PATH_DATA/SRR12801740.fastq"
REF="$PATH_DATA/ref_ecoli_hifi.fasta"
echo -e "N: $NAME\nI: $IN\nR: $REF" >config_$NAME\.yaml
......@@ -7,37 +7,45 @@ fi
cd ecoli_illumina_nanopore_data
# fastq-dump --gzip SRR8335315 #Nanopore
# gunzip SRR8335315.fastq.gz
# fastq-dump --gzip --split-spot --clip SRR8333590 #illumina
# gunzip SRR8333590.fastq.gz
# wget https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/004/358/385/GCF_004358385.1_ASM435838v1/GCF_004358385.1_ASM435838v1_genomic.fna.gz
# gunzip GCF_004358385.1_ASM435838v1_genomic.fna.gz
#
# flye --nano-corr SRR8335315.fa --out ass_flye --genome-size 5500000 --threads $THREAD
#
# spades.py -s SRR8333590.fastq -o spades_output_illumina --only-assembler -t $THREAD
# spades.py -s SRR8333590.fastq --nanopore SRR8335315.fastq -o spades_output_hybride --only-assembler -t $THREAD
#
# head -67952 data.fna >ref_ecoli.fasta
echo "Download nanopore reads..."
#fastq-dump --gzip SRR8335315 >/dev/null #Nanopore
#gunzip SRR8335315.fastq.gz >/dev/null
echo "Download illumina reads..."
#fastq-dump --gzip --split-spot --clip SRR8333590 >/dev/null #illumina
#gunzip SRR8333590.fastq.gz >/dev/null
echo "Download ecoli reference"
#wget https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/004/358/385/GCF_004358385.1_ASM435838v1/GCF_004358385.1_ASM435838v1_genomic.fna.gz >/dev/null
#gunzip GCF_004358385.1_ASM435838v1_genomic.fna.gz >/dev/null
echo "Assembly of nanopore reads (flye)"
#flye --nano-raw SRR8335315.fastq --out ass_flye --genome-size 5500000 --threads $THREAD >/dev/null
echo "Assembly of illumina reads (spades)"
#spades.py -s SRR8333590.fastq -o spades_output_illumina --only-assembler -t $THREAD >/dev/null
echo "Assembly of illumina and nanopore reads (spades)"
#spades.py -s SRR8333590.fastq --nanopore SRR8335315.fastq -o spades_output_hybride --only-assembler -t $THREAD >/dev/null
#head -67952 GCF_004358385.1_ASM435838v1_genomic.fna >ref_ecoli.fasta
echo "Alignment of the nanopore contigs on the reference (minimap2)"
#minimap2 -cax asm5 -t $THREAD ref_ecoli.fasta assembly.fasta >aln_contigs_nanopore_ref_ecoli.sam
echo "Alignment of the illumina contigs on the reference (minimap2)"
#minimap2 -cax asm5 -t $THREAD ref_ecoli.fasta spades_output_illumina/contigs.fasta >aln_contigs_illumina_ref_ecoli.sam
echo "Alignment of the hybride contigs on the reference (minimap2)"
#minimap2 -cax asm5 -t $THREAD ref_ecoli.fasta spades_output_hybride/contigs.fasta >aln_contigs_hybride_ref_ecoli.sam
# DEB=4845200;END=5080000
# ../src/read_map_region_v2.pl -s $DEB -e $END aln_contigs_illumina_ref_ecoli.sam ref_illumina.fasta
DEB=4845200;END=5080000
echo "Selection of the part of the nanopore contig that matches on a region of the reference."
#../src/read_map_region_v2.pl -s $DEB -e $END aln_contigs_nanopore_ref_ecoli.sam ref_nanopore.fasta
#echo "Selection of the part of the illumina contig that matches on a region of the reference."
#../src/read_map_region_v2.pl -s $DEB -e $END aln_contigs_illumina_ref_ecoli.sam ref_illumina.fasta
#echo "Selection of the part of the hybride contig that matches on a region of the reference."
#../src/read_map_region_v2.pl -s $DEB -e $END aln_contigs_hybride_ref_ecoli.sam ref_hybride.fasta
PATH_DATA=`pwd`
cd ..
if [ ! -d "configuration_files" ];then
mkdir "configuration_files"
fi
NAME_I="e_coli_ref_illumina"
NAME_N="e_coli_ref_nanopore"
IN="$PATH_DATA/SRR8335315.fastq"
REF_I="$PATH_DATA/ref_illumina.fasta"
REF_N="$PATH_DATA/ref_nanopore.fasta"
echo -e "NAME: $NAME_I\nIN: $IN\nR: $REF_I" >configuration_files/config_$NAME_I.yaml
echo -e "NAME: $NAME_N\nIN: $IN\nR: $REF_N" >configuration_files/config_$NAME_N.yaml
echo -e "N: $NAME_I\nI: $IN\nR: $REF_I" >config_$NAME_I.yaml
echo -e "N: $NAME_N\nI: $IN\nR: $REF_N" >config_$NAME_N.yaml
#include <fstream>
#include <string.h>
#include <iostream>
#include <list>
using namespace std;
string split_1(string chaine, char del ){
string re ="";
int size(chaine.size());
int i(0);
while(chaine[i] != del && i<size){
re += chaine[i];
i++;
}
return re;
}
int main(int argc, char** argv){
string file_names_seq=argv[1];
string file_fasta=argv[2];
list <string> names_seq={};
ifstream flux(file_names_seq.c_str());
if(flux){
string ligne("");
while(getline(flux, ligne)){
names_seq.push_back(ligne);
}
}
else{
cout << "ERROR: Unable to open the file." << endl;
exit(0);
}
flux.close();
flux.open(file_fasta.c_str());
if(flux){
string ligne("");
bool found(false);
while(getline(flux, ligne)){
if (ligne[0] == '>'){
string name_seq_fasta=split_1(ligne.substr(1, ligne.size()-1),' ');
found=false;
list <string>::iterator p=names_seq.begin();
list <string>::iterator end=names_seq.end();
while( !found && p!=end ){
if(name_seq_fasta == *p){
found=true;
}
else{
p++;
}
}
if (found) {
names_seq.erase(p);
std::cout << ligne << '\n';
}
}
else{
if (found) {
std::cout << ligne << '\n';
}
}
}
}
else{
cout << "ERROR: Unable to open the file." << endl;
exit(0);
}
flux.close();
return 0;
}
#!/bin/bash
THREAD="24"
cd src/
g++ -Wall -o extract_fasta extract_seqs_fasta.cpp
cd ..
if [ ! -d "human_data" ];then
mkdir "human_data"
fi
cd human_data
echo "Download nanopore reads..."
#wget https://s3-us-west-2.amazonaws.com/human-pangenomics/T2T/CHM13/nanopore/rel8-guppy-5.0.7/reads.fastq.gz
#gunzip reads.fastq.gz
echo "Download human reference"
#wget https://s3-us-west-2.amazonaws.com/human-pangenomics/T2T/CHM13/assemblies/chm13.draft_v1.1.fasta.gz
#gunzip chm13.draft_v1.1.fasta.gz
echo "Extract chrX"
tail -2571272 chm13.draft_v1.1.fasta|head -2570994 >ref_chrX_W.fasta
minimap2 -cax map-ont -t $THREAD chm13.draft_v1.1.fasta reads.fastq >aln_nanopore_ref_chm13.sam
PATH_DATA=`pwd`
echo "Creating configuration files..."
NAME="human_ref_W"
IN="$PATH_DATA/reads.fastq"
REF="$PATH_DATA/ref_chrX_W.fasta"
echo -e "N: $NAME\nI: $IN\nR: $REF" >config_$NAME.yaml
#!/bin/bash
THREAD="24"
if [ ! -d "levure_data" ];then
mkdir "levure_data"
fi
cd levure_data
PATH_DATA=`pwd`
echo "Creating configuration files..."
NAME="levure_bmb"
IN="$PATH_DATA/reads.fastq"
REF="$PATH_DATA/ref_chrX_W.fasta"
IN="/data/bonsai/crohmer/data/ERR4352154.fastq"
REF=" /data/bonsai/crohmer/data/ref_bmb.fasta"
echo -e "N: $NAME\nI: $IN\nR: $REF" >config_$NAME.yaml
NAME="levure_ccn"
IN="$PATH_DATA/reads.fastq"
REF="$PATH_DATA/ref_chrX_W.fasta"
IN="/data/bonsai/crohmer/data/ERR4352155.fastq"
REF="/data/bonsai/crohmer/data/ref_ccn.fasta"
echo -e "N: $NAME\nI: $IN\nR: $REF" >config_$NAME.yaml
NAME="levure_diploide"
IN="$PATH_DATA/reads.fastq"
REF="$PATH_DATA/ref_chrX_W.fasta"
IN="/data/bonsai/crohmer/data/read_diploide_shuffle.fastq"
REF="/data/bonsai/crohmer/data/seq_consensus.fasta"
echo -e "N: $NAME\nI: $IN\nR: $REF" >config_$NAME.yaml
......@@ -5,47 +5,41 @@ fi
cd simulated_data
wget https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/005/845/GCF_000005845.2_ASM584v2/GCF_000005845.2_ASM584v2_genomic.fna.gz
gunzip GCF_000005845.2_ASM584v2_genomic.fna.gz
wget https://github.com/yukiteruono/pbsim2/raw/master/data/R103.model
echo "Download reference..."
#wget https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/005/845/GCF_000005845.2_ASM584v2/GCF_000005845.2_ASM584v2_genomic.fna.gz >/dev/null
#gunzip GCF_000005845.2_ASM584v2_genomic.fna.gz >/dev/null
echo "Download model..."
#wget https://github.com/yukiteruono/pbsim2/raw/master/data/R103.model >/dev/null
REF="GCF_000005845.2_ASM584v2_genomic.fna"
SIZE=10000
DEPTH=400
MODEL=R103.model
#Deletion 10%Error
pbsim $REF --hmm_model $MODEL --depth $DEPTH --difference-ratio 00:00:100 --accuracy-mean 0.90 --prefix simulated_del_10
for ERROR in 1 2 5 10 15 20 25 30;
do
ACCURACY=$((100-$ERROR))
#Insertion 10%Error
pbsim $REF --hmm_model $MODEL --depth $DEPTH --difference-ratio 00:100:00 --accuracy-mean 0.90 --prefix simulated_ins_10
#Deletion
echo "Simulation of the simulated_del_$ERROR dataset (pbsim2)"
#pbsim $REF --hmm_model $MODEL --depth $DEPTH --difference-ratio 00:00:100 --accuracy-mean 0.$ACCURACY --prefix simulated_del_$ERROR >/dev/null
#Substitution 10%Error
pbsim $REF --hmm_model $MODEL --depth $DEPTH --difference-ratio 100:00:00 --accuracy-mean 0.90 --prefix simulated_sub_10
#Insertion
echo "Simulation of the simulated_ins_$ERROR dataset (pbsim2)"
#pbsim $REF --hmm_model $MODEL --depth $DEPTH --difference-ratio 00:100:00 --accuracy-mean 0.$ACCURACY --prefix simulated_ins_$ERROR >/dev/null
#Mixte 10%Error
pbsim $REF --hmm_model $MODEL --depth $DEPTH --difference-ratio 23:31:46 --accuracy-mean 0.90 --prefix simulated_mixed_10
#Substitution
echo "Simulation of the simulated_sub_$ERROR dataset (pbsim2)"
#pbsim $REF --hmm_model $MODEL --depth $DEPTH --difference-ratio 100:00:00 --accuracy-mean 0.$ACCURACY --prefix simulated_sub_$ERROR >/dev/null
#Mixte 15%Error
pbsim $REF --hmm_model $MODEL --depth $DEPTH --difference-ratio 23:31:46 --accuracy-mean 0.85 --prefix simulated_mixed_15
#Mixte 20%Error
pbsim $REF --hmm_model $MODEL --depth $DEPTH --difference-ratio 23:31:46 --accuracy-mean 0.80 --prefix simulated_mixed_20
#Mixte 25%Error
pbsim $REF --hmm_model $MODEL --depth $DEPTH --difference-ratio 23:31:46 --accuracy-mean 0.75 --prefix simulated_mixed_25
#Mixte 30%Error
pbsim $REF --hmm_model $MODEL --depth $DEPTH --difference-ratio 23:31:46 --accuracy-mean 0.70 --prefix simulated_mixed_30
#Mixte
echo "Simulation of the simulated_mixed_$ERROR dataset (pbsim2)"
#pbsim $REF --hmm_model $MODEL --depth $DEPTH --difference-ratio 23:31:46 --accuracy-mean 0.$ACCURACY --prefix simulated_mixed_$ERROR >/dev/null
done
rm *.maf *.ref
#rm *.maf *.ref
PATH_DATA=`pwd`
cd ..
if [ ! -d "configuration_files" ];then
mkdir "configuration_files"
fi
for SIMULATED in `ls simulated_data|grep fastq`;
for SIMULATED in `ls |grep fastq`;
do NAME_SIMULATED=`echo $SIMULATED|sed "s/_0001.fastq//"`;
echo -e "NAME: $NAME_SIMULATED\nIN: $PATH_DATA/$SIMULATED\nR: $PATH_DATA/GCF_000005845.2_ASM584v2_genomic.fna" >configuration_files/config_$NAME_SIMULATED.yaml;
echo -e "N: $NAME_SIMULATED\nI: $PATH_DATA/$SIMULATED\nR: $PATH_DATA/GCF_000005845.2_ASM584v2_genomic.fna" >config_$NAME_SIMULATED.yaml;
done
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment