add ecoli nanopore and simulated data

edbb6667 · Rohmer Coralie · d378566a · edbb6667 · edbb6667 · edbb6667
Commit edbb6667 authored 3 years ago by Rohmer Coralie
--- a/env_conda/msa-limit-data.yaml
+++ b/env_conda/msa-limit-data.yaml
+name: msa-limit-data
+channels:
+  - bioconda
+  - defaults
+dependencies:
+  - _libgcc_mutex=0.1=main
+  - _openmp_mutex=4.5=1_gnu
+  - ca-certificates=2021.10.26=h06a4308_2
+  - certifi=2021.10.8=py37h06a4308_0
+  - flye=2.8.1=py37h1c8e9b9_0
+  - k8=0.2.5=h9a82719_1
+  - ld_impl_linux-64=2.35.1=h7274673_9
+  - libffi=3.3=he6710b0_2
+  - libgcc-ng=9.3.0=h5101ec6_17
+  - libgomp=9.3.0=h5101ec6_17
+  - libstdcxx-ng=9.3.0=hd4cf53a_17
+  - minimap2=2.17=h5bf99c6_4
+  - ncurses=6.3=h7f8727e_2
+  - openssl=1.1.1l=h7f8727e_0
+  - pbsim2=2.0.1=h7d875b9_0
+  - pip=21.2.2=py37h06a4308_0
+  - python=3.7.11=h12debd9_0
+  - readline=8.1=h27cfd23_0
+  - setuptools=58.0.4=py37h06a4308_0
+  - spades=3.13.0=0
+  - sqlite=3.36.0=hc218d9a_0
+  - sra-tools=2.8.0=0
+  - tk=8.6.11=h1ccaba5_0
+  - wheel=0.37.0=pyhd3eb1b0_1
+  - xz=5.2.5=h7b6447c_0
+  - zlib=1.2.11=h7b6447c_3
--- a/msa-limit-data.sh
+++ b/msa-limit-data.sh
+#!/bin/bash
+#./src/simulated_data.sh
+#./src/ecoli_illumina_nanopore_data.sh
+cd configuration_files
+for FILE in `ls`
+do
+  echo "D: [10,20,50,100,150]" >>$FILE
+  echo "S: [100,200,500,1000,2000,5000,10000]">>$FILE
+  echo "T: 50">>$FILE
+  echo "M: [muscle,mafft,poa,kalign,spoa,abpoa,kalign3]">>$FILE
+  echo "NBR: 10">>$FILE
+done
--- a/src/ecoli_illumina_nanopore_data.sh
+++ b/src/ecoli_illumina_nanopore_data.sh
+#!/bin/bash
+THREAD="12"
+if [ ! -d "ecoli_illumina_nanopore_data" ];then
+mkdir "ecoli_illumina_nanopore_data"
+fi
+cd ecoli_illumina_nanopore_data
+# fastq-dump --gzip SRR8335315 #Nanopore
+# gunzip SRR8335315.fastq.gz
+# fastq-dump --gzip --split-spot --clip SRR8333590 #illumina
+# gunzip SRR8333590.fastq.gz
+# wget  https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/004/358/385/GCF_004358385.1_ASM435838v1/GCF_004358385.1_ASM435838v1_genomic.fna.gz
+# gunzip GCF_004358385.1_ASM435838v1_genomic.fna.gz
+#
+# flye --nano-corr  SRR8335315.fa  --out ass_flye --genome-size 5500000 --threads $THREAD
+#
+# spades.py -s SRR8333590.fastq -o spades_output_illumina --only-assembler -t $THREAD
+# spades.py -s SRR8333590.fastq --nanopore SRR8335315.fastq -o spades_output_hybride --only-assembler -t $THREAD
+#
+# head -67952 data.fna >ref_ecoli.fasta
+# minimap2 -cax asm5  -t $THREAD ref_ecoli.fasta assembly.fasta >aln_contigs_nanopore_ref_ecoli.sam
+# minimap2 -cax asm5  -t $THREAD ref_ecoli.fasta spades_output_illumina/contigs.fasta >aln_contigs_illumina_ref_ecoli.sam
+# minimap2 -cax asm5  -t $THREAD ref_ecoli.fasta spades_output_hybride/contigs.fasta >aln_contigs_hybride_ref_ecoli.sam
+# DEB=4845200;END=5080000
+# ../src/read_map_region_v2.pl -s $DEB -e $END aln_contigs_illumina_ref_ecoli.sam ref_illumina.fasta
+# ../src/read_map_region_v2.pl -s $DEB -e $END aln_contigs_nanopore_ref_ecoli.sam ref_nanopore.fasta
+# ../src/read_map_region_v2.pl -s $DEB -e $END aln_contigs_hybride_ref_ecoli.sam ref_hybride.fasta
+PATH_DATA=`pwd`
+cd ..
+if [ ! -d "configuration_files" ];then
+  mkdir "configuration_files"
+fi
+NAME_I="e_coli_ref_illumina"
+NAME_N="e_coli_ref_nanopore"
+IN="$PATH_DATA/SRR8335315.fastq"
+REF_I="$PATH_DATA/ref_illumina.fasta"
+REF_N="$PATH_DATA/ref_nanopore.fasta"
+echo -e "NAME: $NAME_I\nIN: $IN\nR: $REF_I" >configuration_files/config_$NAME_I.yaml
+echo -e "NAME: $NAME_N\nIN: $IN\nR: $REF_N" >configuration_files/config_$NAME_N.yaml
--- a/src/read_map_region_v2.pl
+++ b/src/read_map_region_v2.pl
+#!/usr/bin/perl -w
+use List::Util qw(min max);
+#------------------------------------------------------------------------------
+# VERIFICATION D'USAGE
+#------------------------------------------------------------------------------
+sub usage{
+	print "\nRead_map_region uses a SAM file to retrieve in FASTA format the reads\n".
+		  "mapped to a specific region of the reference. Reads are truncated to\n".
+		  "retrieve only the mapped section but this step can be removed with the\n".
+			"uncut-end option.\n".
+		  "\nUSAGE: ./read_map_region.pl [option] -s <int> -t <int> <in.sam> <output>\n".
+	      "\t-s starting position of the region\n".
+	      "\t-t size of the region\n".
+	      "\t-e end position of the region (This option replaces the -t)\n".
+	      "\nOPTION(S):\n".
+	      "\t-c percentage of coverage from which the read is kept (default: 100)\n".
+	      "\t-u --uncut-end (always put first) do not cut the ends of the read\n".
+				"\t                not mapped to the region\n".
+	      "\n";
+	exit;
+}
+if (!exists $ARGV[0] || $ARGV[0] =~ '-h'){
+	usage();
+}
+if ( $ARGV[0] =~ /(-u|--uncut-end)/ ){
+	$UNCUT_END = 1;
+	shift(@ARGV);
+}
+else{
+	$UNCUT_END = 0;
+}
+$OUTPUT = pop(@ARGV);
+$FILE_INPUT = pop(@ARGV);
+@TAB_FILE_INPUT = split(/\./,$FILE_INPUT);
+$FORMAT = pop(@TAB_FILE_INPUT);
+if ( !open(IN, "<", $FILE_INPUT) || $FORMAT ne 'sam' ){
+	print "The file could not be opened or the format is incorrect.\n";
+	usage();
+}
+%hash_argv = @ARGV;
+if ( (exists $hash_argv{'-s'}) && ($hash_argv{'-s'} =~ /[0-9]+/) ) {
+    $REF_START = $hash_argv{'-s'}*1;
+}
+else{
+    print "The beginning of the region must be indicated.\n";
+    usage();
+}
+if ( (exists $hash_argv{'-t'}) && ($hash_argv{'-t'} =~ /[0-9]+/) ){
+    $REGION_SIZE = $hash_argv{'-t'}*1;
+    $REF_END = $REF_START + $REGION_SIZE - 1;
+}
+else{
+	if ( (exists $hash_argv{'-e'}) && ($hash_argv{'-e'} =~ /[0-9]+/) ){
+		$REF_END = $hash_argv{'-e'}*1;
+		$REGION_SIZE = $REF_END - $REF_START + 1;
+	}
+	else{
+		print "The size of the region or the end of the region must be indicated.\n";
+		usage();
+	}
+}
+if (exists $hash_argv{'-c'}){
+	$COVERAGE = $hash_argv{'-c'};
+	if ($COVERAGE > 100 || $COVERAGE < 0){
+		print "The coverage must be between 0 and 100.\n";
+		usage();
+	}
+}
+else{
+	$COVERAGE = 100;
+}
+#------------------------------------------------------------------------------
+# DEBUT DU SCRIPT
+#------------------------------------------------------------------------------
+#Fonction: Renvoie true si le byte est présent dans l'octet
+#Arg: $FLAG int flag/octet à tester
+#			$power int puissance du byte rechercher
+#Renvoie: boolean
+sub checksflag
+{
+	my ($FLAG,$power)=@_;
+	return ((($FLAG/(2**$power)) % 2) != 0);
+}
+open(OUT, ">", $OUTPUT) || die ("You cannot create the file $OUTPUT");
+#Lecture du fichier sam
+while (<IN>)
+{
+  unless(/^@/)
+  {
+  	chomp;
+    ($QNAME,$FLAG,$RNAME,$pos_start,$MAPQ,$CIGAR,$RNEXT,$PNEXT,$TLEN,$SEQ) = split(/\t/,$_);
+		#Ceux qui match et ont un alignement primaire
+		if (!checksflag($FLAG,2) && !checksflag($FLAG,8)){
+			#print("$QNAME $FLAG\n");
+	    #Ceux qui match avant la fin de la région
+	    if ($pos_start <= $REF_END) {
+	      @nb_cigar = split(/[A-Z]/,$CIGAR);
+	      @op_cigar = split(/[0-9]*/,$CIGAR);
+	      shift(@op_cigar);
+	      $nb_op_cigar = @op_cigar;
+	      $pos_end = $pos_start-1;#Le premier nucléotide est utilisé
+	      $i = 0;
+	      $nb_insertion = 0;
+	      $nb_deletion = 0;
+				$nb_segment = 0;
+	      $read_finish = 0;
+	      #Read qui commence avant la région
+	      if ($pos_start < $REF_START){
+					#Tant que le cigar n'est pas fini ou que le début de la région n'est pas dépassé
+					while ($i < $nb_op_cigar && $pos_end < $REF_START-1) {
+						if (!($op_cigar[$i] =~ "H" )) {
+							if ($op_cigar[$i] =~ 'I'){
+								$nb_insertion += $nb_cigar[$i];
+							}
+							else{
+								if ($op_cigar[$i] =~ 'S'){
+									$nb_segment += $nb_cigar[$i];
+								}
+								else{
+									$pos_end += $nb_cigar[$i];
+									if ($op_cigar[$i] =~ 'D'){
+									$nb_deletion += $nb_cigar[$i];
+									}
+								}
+							}
+						}
+						$i++;
+					}
+					$diff_pressure_cigar = $pos_end - ($REF_START -1) ;
+					#Si la coupure ne ce fait pas entre deux opérateurs du cigar
+					if ($diff_pressure_cigar > 0){
+						$i--;
+						$pos_end -= $diff_pressure_cigar;
+						$nb_cigar[$i] = $diff_pressure_cigar;
+						$nb_deletion -= $diff_pressure_cigar if ($op_cigar[$i] =~ 'D');
+					}else{
+						#Si le read n'est pas fini
+						if ($i < $nb_op_cigar){
+							#S'il y a une insertion sur le prochain opérateur
+							#pour pouvoir couper après
+							if ($op_cigar[$i] =~ 'I'){
+								$nb_insertion += $nb_cigar[$i];
+								$i++;
+							}
+						}
+						#Sinon, le read match avant la région
+						else{
+							$read_finish = 1;
+						}
+					}
+					$start_match_on_read = $pos_end - $pos_start + $nb_insertion + $nb_segment - $nb_deletion +1;
+	      }else{
+	        $start_match_on_read=0;
+	      }
+	      #Elimine ceux qui termine avant la région
+	      #après cette condition tout les reads sont bien mappé sur la région
+	      if (!$read_finish) {
+					#Continue le parcours du cigar, s'arrête si la fn de la région est rencontré
+	        while ($i < $nb_op_cigar && $pos_end < $REF_END) {
+						if (!($op_cigar[$i] =~ 'H')) {
+		          if ($op_cigar[$i] =~ 'I'){
+		            $nb_insertion += $nb_cigar[$i];
+		          }
+		          else{
+								if ($op_cigar[$i] =~ 'S'){
+									$nb_segment += $nb_cigar[$i];
+								}
+								else{
+									$pos_end += $nb_cigar[$i];
+									if ($op_cigar[$i] =~ 'D'){
+									$nb_deletion += $nb_cigar[$i];
+									}
+								}
+		          }
+						}
+	          $i++;
+	        }
+					#Si la coupure ne ce fait pas entre deux opérateurs du cigar
+					$diff_pressure_cigar = $pos_end - $REF_END ;
+					if ($diff_pressure_cigar > 0){
+						$i--;
+						$pos_end -= $diff_pressure_cigar;
+						$nb_cigar[$i] = $diff_pressure_cigar;
+						$nb_deletion -= $diff_pressure_cigar if ($op_cigar[$i] =~ 'D');
+					}
+					$end_match_on_read = $pos_end - $pos_start + $nb_insertion + $nb_segment - $nb_deletion;
+	        $region_coverage = max($REF_END,$pos_end) - min($REF_START,$pos_start) - abs($REF_END - $pos_end) - abs($REF_START - $pos_start) + 1;
+					if ($region_coverage >= $REGION_SIZE*($COVERAGE/100) ) {
+	        	if ($UNCUT_END){
+							print OUT ">$QNAME\n";
+	        		print OUT "$SEQ\n";
+	        	}
+	        	else{
+							$truncated_seq_size = $end_match_on_read - $start_match_on_read;
+							if ($truncated_seq_size > $REGION_SIZE/2) {
+								print OUT ">$QNAME\n";
+								@seq = split(//,$SEQ);
+								for (my $j = $start_match_on_read; $j <= $end_match_on_read; $j++) {
+									print OUT "$seq[$j]";
+								}
+								print OUT "\n";
+							}
+			      }
+	        }#else{print("$QNAME $FLAG Pas assez couvert\n")}
+	      }#else{print("$QNAME $FLAG termine avant\n")}
+	    }#else{print("$QNAME $FLAG match après\n")}
+		}#else{print("$QNAME $FLAG ignoré\n")}
+  }
+}
+close IN;
+$MAPQ=$PNEXT=$RNAME=$TLEN=$RNEXT=0;
--- a/src/simulated_data.sh
+++ b/src/simulated_data.sh
+#!/bin/bash
+if [ ! -d "simulated_data" ];then
+mkdir "simulated_data"
+fi
+cd simulated_data
+wget https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/005/845/GCF_000005845.2_ASM584v2/GCF_000005845.2_ASM584v2_genomic.fna.gz
+gunzip GCF_000005845.2_ASM584v2_genomic.fna.gz
+wget https://github.com/yukiteruono/pbsim2/raw/master/data/R103.model
+REF="GCF_000005845.2_ASM584v2_genomic.fna"
+SIZE=10000
+DEPTH=400
+MODEL=R103.model
+#Deletion 10%Error
+pbsim $REF --hmm_model $MODEL --depth $DEPTH --difference-ratio 00:00:100 --accuracy-mean 0.90 --prefix simulated_del_10
+#Insertion 10%Error
+pbsim $REF --hmm_model $MODEL --depth $DEPTH --difference-ratio 00:100:00 --accuracy-mean 0.90 --prefix simulated_ins_10
+#Substitution 10%Error
+pbsim $REF --hmm_model $MODEL --depth $DEPTH --difference-ratio 100:00:00 --accuracy-mean 0.90 --prefix simulated_sub_10
+#Mixte 10%Error
+pbsim $REF --hmm_model $MODEL --depth $DEPTH --difference-ratio 23:31:46 --accuracy-mean 0.90 --prefix simulated_mixed_10
+#Mixte 15%Error
+pbsim $REF --hmm_model $MODEL --depth $DEPTH --difference-ratio 23:31:46 --accuracy-mean 0.85 --prefix simulated_mixed_15
+#Mixte 20%Error
+pbsim $REF --hmm_model $MODEL --depth $DEPTH --difference-ratio 23:31:46 --accuracy-mean 0.80 --prefix simulated_mixed_20
+#Mixte 25%Error
+pbsim $REF --hmm_model $MODEL --depth $DEPTH --difference-ratio 23:31:46 --accuracy-mean 0.75 --prefix simulated_mixed_25
+#Mixte 30%Error
+pbsim $REF --hmm_model $MODEL --depth $DEPTH --difference-ratio 23:31:46 --accuracy-mean 0.70 --prefix simulated_mixed_30
+rm *.maf *.ref
+PATH_DATA=`pwd`
+cd ..
+if [ ! -d "configuration_files" ];then
+  mkdir "configuration_files"
+fi
+for SIMULATED in `ls simulated_data|grep fastq`;
+do NAME_SIMULATED=`echo $SIMULATED|sed "s/_0001.fastq//"`;
+   echo -e "NAME: $NAME_SIMULATED\nIN: $PATH_DATA/$SIMULATED\nR: $PATH_DATA/GCF_000005845.2_ASM584v2_genomic.fna" >configuration_files/config_$NAME_SIMULATED.yaml;
+done