Skip to content
Snippets Groups Projects
Commit edbb6667 authored by Rohmer Coralie's avatar Rohmer Coralie
Browse files

add ecoli nanopore and simulated data

parent d378566a
Branches
No related tags found
No related merge requests found
name: msa-limit-data
channels:
- bioconda
- defaults
dependencies:
- _libgcc_mutex=0.1=main
- _openmp_mutex=4.5=1_gnu
- ca-certificates=2021.10.26=h06a4308_2
- certifi=2021.10.8=py37h06a4308_0
- flye=2.8.1=py37h1c8e9b9_0
- k8=0.2.5=h9a82719_1
- ld_impl_linux-64=2.35.1=h7274673_9
- libffi=3.3=he6710b0_2
- libgcc-ng=9.3.0=h5101ec6_17
- libgomp=9.3.0=h5101ec6_17
- libstdcxx-ng=9.3.0=hd4cf53a_17
- minimap2=2.17=h5bf99c6_4
- ncurses=6.3=h7f8727e_2
- openssl=1.1.1l=h7f8727e_0
- pbsim2=2.0.1=h7d875b9_0
- pip=21.2.2=py37h06a4308_0
- python=3.7.11=h12debd9_0
- readline=8.1=h27cfd23_0
- setuptools=58.0.4=py37h06a4308_0
- spades=3.13.0=0
- sqlite=3.36.0=hc218d9a_0
- sra-tools=2.8.0=0
- tk=8.6.11=h1ccaba5_0
- wheel=0.37.0=pyhd3eb1b0_1
- xz=5.2.5=h7b6447c_0
- zlib=1.2.11=h7b6447c_3
#!/bin/bash
#./src/simulated_data.sh
#./src/ecoli_illumina_nanopore_data.sh
cd configuration_files
for FILE in `ls`
do
echo "D: [10,20,50,100,150]" >>$FILE
echo "S: [100,200,500,1000,2000,5000,10000]">>$FILE
echo "T: 50">>$FILE
echo "M: [muscle,mafft,poa,kalign,spoa,abpoa,kalign3]">>$FILE
echo "NBR: 10">>$FILE
done
#!/bin/bash
THREAD="12"
if [ ! -d "ecoli_illumina_nanopore_data" ];then
mkdir "ecoli_illumina_nanopore_data"
fi
cd ecoli_illumina_nanopore_data
# fastq-dump --gzip SRR8335315 #Nanopore
# gunzip SRR8335315.fastq.gz
# fastq-dump --gzip --split-spot --clip SRR8333590 #illumina
# gunzip SRR8333590.fastq.gz
# wget https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/004/358/385/GCF_004358385.1_ASM435838v1/GCF_004358385.1_ASM435838v1_genomic.fna.gz
# gunzip GCF_004358385.1_ASM435838v1_genomic.fna.gz
#
# flye --nano-corr SRR8335315.fa --out ass_flye --genome-size 5500000 --threads $THREAD
#
# spades.py -s SRR8333590.fastq -o spades_output_illumina --only-assembler -t $THREAD
# spades.py -s SRR8333590.fastq --nanopore SRR8335315.fastq -o spades_output_hybride --only-assembler -t $THREAD
#
# head -67952 data.fna >ref_ecoli.fasta
# minimap2 -cax asm5 -t $THREAD ref_ecoli.fasta assembly.fasta >aln_contigs_nanopore_ref_ecoli.sam
# minimap2 -cax asm5 -t $THREAD ref_ecoli.fasta spades_output_illumina/contigs.fasta >aln_contigs_illumina_ref_ecoli.sam
# minimap2 -cax asm5 -t $THREAD ref_ecoli.fasta spades_output_hybride/contigs.fasta >aln_contigs_hybride_ref_ecoli.sam
# DEB=4845200;END=5080000
# ../src/read_map_region_v2.pl -s $DEB -e $END aln_contigs_illumina_ref_ecoli.sam ref_illumina.fasta
# ../src/read_map_region_v2.pl -s $DEB -e $END aln_contigs_nanopore_ref_ecoli.sam ref_nanopore.fasta
# ../src/read_map_region_v2.pl -s $DEB -e $END aln_contigs_hybride_ref_ecoli.sam ref_hybride.fasta
PATH_DATA=`pwd`
cd ..
if [ ! -d "configuration_files" ];then
mkdir "configuration_files"
fi
NAME_I="e_coli_ref_illumina"
NAME_N="e_coli_ref_nanopore"
IN="$PATH_DATA/SRR8335315.fastq"
REF_I="$PATH_DATA/ref_illumina.fasta"
REF_N="$PATH_DATA/ref_nanopore.fasta"
echo -e "NAME: $NAME_I\nIN: $IN\nR: $REF_I" >configuration_files/config_$NAME_I.yaml
echo -e "NAME: $NAME_N\nIN: $IN\nR: $REF_N" >configuration_files/config_$NAME_N.yaml
#!/usr/bin/perl -w
use List::Util qw(min max);
#------------------------------------------------------------------------------
# VERIFICATION D'USAGE
#------------------------------------------------------------------------------
sub usage{
print "\nRead_map_region uses a SAM file to retrieve in FASTA format the reads\n".
"mapped to a specific region of the reference. Reads are truncated to\n".
"retrieve only the mapped section but this step can be removed with the\n".
"uncut-end option.\n".
"\nUSAGE: ./read_map_region.pl [option] -s <int> -t <int> <in.sam> <output>\n".
"\t-s starting position of the region\n".
"\t-t size of the region\n".
"\t-e end position of the region (This option replaces the -t)\n".
"\nOPTION(S):\n".
"\t-c percentage of coverage from which the read is kept (default: 100)\n".
"\t-u --uncut-end (always put first) do not cut the ends of the read\n".
"\t not mapped to the region\n".
"\n";
exit;
}
if (!exists $ARGV[0] || $ARGV[0] =~ '-h'){
usage();
}
if ( $ARGV[0] =~ /(-u|--uncut-end)/ ){
$UNCUT_END = 1;
shift(@ARGV);
}
else{
$UNCUT_END = 0;
}
$OUTPUT = pop(@ARGV);
$FILE_INPUT = pop(@ARGV);
@TAB_FILE_INPUT = split(/\./,$FILE_INPUT);
$FORMAT = pop(@TAB_FILE_INPUT);
if ( !open(IN, "<", $FILE_INPUT) || $FORMAT ne 'sam' ){
print "The file could not be opened or the format is incorrect.\n";
usage();
}
%hash_argv = @ARGV;
if ( (exists $hash_argv{'-s'}) && ($hash_argv{'-s'} =~ /[0-9]+/) ) {
$REF_START = $hash_argv{'-s'}*1;
}
else{
print "The beginning of the region must be indicated.\n";
usage();
}
if ( (exists $hash_argv{'-t'}) && ($hash_argv{'-t'} =~ /[0-9]+/) ){
$REGION_SIZE = $hash_argv{'-t'}*1;
$REF_END = $REF_START + $REGION_SIZE - 1;
}
else{
if ( (exists $hash_argv{'-e'}) && ($hash_argv{'-e'} =~ /[0-9]+/) ){
$REF_END = $hash_argv{'-e'}*1;
$REGION_SIZE = $REF_END - $REF_START + 1;
}
else{
print "The size of the region or the end of the region must be indicated.\n";
usage();
}
}
if (exists $hash_argv{'-c'}){
$COVERAGE = $hash_argv{'-c'};
if ($COVERAGE > 100 || $COVERAGE < 0){
print "The coverage must be between 0 and 100.\n";
usage();
}
}
else{
$COVERAGE = 100;
}
#------------------------------------------------------------------------------
# DEBUT DU SCRIPT
#------------------------------------------------------------------------------
#Fonction: Renvoie true si le byte est présent dans l'octet
#Arg: $FLAG int flag/octet à tester
# $power int puissance du byte rechercher
#Renvoie: boolean
sub checksflag
{
my ($FLAG,$power)=@_;
return ((($FLAG/(2**$power)) % 2) != 0);
}
open(OUT, ">", $OUTPUT) || die ("You cannot create the file $OUTPUT");
#Lecture du fichier sam
while (<IN>)
{
unless(/^@/)
{
chomp;
($QNAME,$FLAG,$RNAME,$pos_start,$MAPQ,$CIGAR,$RNEXT,$PNEXT,$TLEN,$SEQ) = split(/\t/,$_);
#Ceux qui match et ont un alignement primaire
if (!checksflag($FLAG,2) && !checksflag($FLAG,8)){
#print("$QNAME $FLAG\n");
#Ceux qui match avant la fin de la région
if ($pos_start <= $REF_END) {
@nb_cigar = split(/[A-Z]/,$CIGAR);
@op_cigar = split(/[0-9]*/,$CIGAR);
shift(@op_cigar);
$nb_op_cigar = @op_cigar;
$pos_end = $pos_start-1;#Le premier nucléotide est utilisé
$i = 0;
$nb_insertion = 0;
$nb_deletion = 0;
$nb_segment = 0;
$read_finish = 0;
#Read qui commence avant la région
if ($pos_start < $REF_START){
#Tant que le cigar n'est pas fini ou que le début de la région n'est pas dépassé
while ($i < $nb_op_cigar && $pos_end < $REF_START-1) {
if (!($op_cigar[$i] =~ "H" )) {
if ($op_cigar[$i] =~ 'I'){
$nb_insertion += $nb_cigar[$i];
}
else{
if ($op_cigar[$i] =~ 'S'){
$nb_segment += $nb_cigar[$i];
}
else{
$pos_end += $nb_cigar[$i];
if ($op_cigar[$i] =~ 'D'){
$nb_deletion += $nb_cigar[$i];
}
}
}
}
$i++;
}
$diff_pressure_cigar = $pos_end - ($REF_START -1) ;
#Si la coupure ne ce fait pas entre deux opérateurs du cigar
if ($diff_pressure_cigar > 0){
$i--;
$pos_end -= $diff_pressure_cigar;
$nb_cigar[$i] = $diff_pressure_cigar;
$nb_deletion -= $diff_pressure_cigar if ($op_cigar[$i] =~ 'D');
}else{
#Si le read n'est pas fini
if ($i < $nb_op_cigar){
#S'il y a une insertion sur le prochain opérateur
#pour pouvoir couper après
if ($op_cigar[$i] =~ 'I'){
$nb_insertion += $nb_cigar[$i];
$i++;
}
}
#Sinon, le read match avant la région
else{
$read_finish = 1;
}
}
$start_match_on_read = $pos_end - $pos_start + $nb_insertion + $nb_segment - $nb_deletion +1;
}else{
$start_match_on_read=0;
}
#Elimine ceux qui termine avant la région
#après cette condition tout les reads sont bien mappé sur la région
if (!$read_finish) {
#Continue le parcours du cigar, s'arrête si la fn de la région est rencontré
while ($i < $nb_op_cigar && $pos_end < $REF_END) {
if (!($op_cigar[$i] =~ 'H')) {
if ($op_cigar[$i] =~ 'I'){
$nb_insertion += $nb_cigar[$i];
}
else{
if ($op_cigar[$i] =~ 'S'){
$nb_segment += $nb_cigar[$i];
}
else{
$pos_end += $nb_cigar[$i];
if ($op_cigar[$i] =~ 'D'){
$nb_deletion += $nb_cigar[$i];
}
}
}
}
$i++;
}
#Si la coupure ne ce fait pas entre deux opérateurs du cigar
$diff_pressure_cigar = $pos_end - $REF_END ;
if ($diff_pressure_cigar > 0){
$i--;
$pos_end -= $diff_pressure_cigar;
$nb_cigar[$i] = $diff_pressure_cigar;
$nb_deletion -= $diff_pressure_cigar if ($op_cigar[$i] =~ 'D');
}
$end_match_on_read = $pos_end - $pos_start + $nb_insertion + $nb_segment - $nb_deletion;
$region_coverage = max($REF_END,$pos_end) - min($REF_START,$pos_start) - abs($REF_END - $pos_end) - abs($REF_START - $pos_start) + 1;
if ($region_coverage >= $REGION_SIZE*($COVERAGE/100) ) {
if ($UNCUT_END){
print OUT ">$QNAME\n";
print OUT "$SEQ\n";
}
else{
$truncated_seq_size = $end_match_on_read - $start_match_on_read;
if ($truncated_seq_size > $REGION_SIZE/2) {
print OUT ">$QNAME\n";
@seq = split(//,$SEQ);
for (my $j = $start_match_on_read; $j <= $end_match_on_read; $j++) {
print OUT "$seq[$j]";
}
print OUT "\n";
}
}
}#else{print("$QNAME $FLAG Pas assez couvert\n")}
}#else{print("$QNAME $FLAG termine avant\n")}
}#else{print("$QNAME $FLAG match après\n")}
}#else{print("$QNAME $FLAG ignoré\n")}
}
}
close IN;
$MAPQ=$PNEXT=$RNAME=$TLEN=$RNEXT=0;
#!/bin/bash
if [ ! -d "simulated_data" ];then
mkdir "simulated_data"
fi
cd simulated_data
wget https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/005/845/GCF_000005845.2_ASM584v2/GCF_000005845.2_ASM584v2_genomic.fna.gz
gunzip GCF_000005845.2_ASM584v2_genomic.fna.gz
wget https://github.com/yukiteruono/pbsim2/raw/master/data/R103.model
REF="GCF_000005845.2_ASM584v2_genomic.fna"
SIZE=10000
DEPTH=400
MODEL=R103.model
#Deletion 10%Error
pbsim $REF --hmm_model $MODEL --depth $DEPTH --difference-ratio 00:00:100 --accuracy-mean 0.90 --prefix simulated_del_10
#Insertion 10%Error
pbsim $REF --hmm_model $MODEL --depth $DEPTH --difference-ratio 00:100:00 --accuracy-mean 0.90 --prefix simulated_ins_10
#Substitution 10%Error
pbsim $REF --hmm_model $MODEL --depth $DEPTH --difference-ratio 100:00:00 --accuracy-mean 0.90 --prefix simulated_sub_10
#Mixte 10%Error
pbsim $REF --hmm_model $MODEL --depth $DEPTH --difference-ratio 23:31:46 --accuracy-mean 0.90 --prefix simulated_mixed_10
#Mixte 15%Error
pbsim $REF --hmm_model $MODEL --depth $DEPTH --difference-ratio 23:31:46 --accuracy-mean 0.85 --prefix simulated_mixed_15
#Mixte 20%Error
pbsim $REF --hmm_model $MODEL --depth $DEPTH --difference-ratio 23:31:46 --accuracy-mean 0.80 --prefix simulated_mixed_20
#Mixte 25%Error
pbsim $REF --hmm_model $MODEL --depth $DEPTH --difference-ratio 23:31:46 --accuracy-mean 0.75 --prefix simulated_mixed_25
#Mixte 30%Error
pbsim $REF --hmm_model $MODEL --depth $DEPTH --difference-ratio 23:31:46 --accuracy-mean 0.70 --prefix simulated_mixed_30
rm *.maf *.ref
PATH_DATA=`pwd`
cd ..
if [ ! -d "configuration_files" ];then
mkdir "configuration_files"
fi
for SIMULATED in `ls simulated_data|grep fastq`;
do NAME_SIMULATED=`echo $SIMULATED|sed "s/_0001.fastq//"`;
echo -e "NAME: $NAME_SIMULATED\nIN: $PATH_DATA/$SIMULATED\nR: $PATH_DATA/GCF_000005845.2_ASM584v2_genomic.fna" >configuration_files/config_$NAME_SIMULATED.yaml;
done
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment