diff --git a/headers/index.h b/headers/index.h index 8790f03446750d819608a2772afc48513d9ad893..cc0e06e50a6b88a8e676e02af5c1f247e5864ef0 100644 --- a/headers/index.h +++ b/headers/index.h @@ -19,28 +19,12 @@ struct occ_in_read { -struct kmer_to_mphf { - uint64_t kmer_int; - uint64_t kmer_mphf; - - public : - bool operator== (const kmer_to_mphf& item){ - if (item.kmer_int == kmer_int && item.kmer_mphf == kmer_mphf){ - return true; - } - return false; - } - }; - - - struct rank_kmer { uint64_t rank; uint32_t nb_apparition; }; typedef robin_hood::unordered_map<uint32_t, vector<uint32_t>*> rh_umap; -typedef robin_hood::unordered_map<uint32_t, uint8_t> rh_umap_test; typedef vector<occ_in_read> vect_occ_read; typedef uint64_t kmer; typedef vector<vector<uint32_t>*> index_vector; @@ -62,21 +46,11 @@ class Index{ void set_nb_kmers(uint32_t nb_kmers); uint32_t get_nb_kmers(); - rh_umap index_fasta_rh(const string& filename, uint16_t k); - index_vector index_fasta_rh_mpfh(const string& filename, uint16_t k, BloomFilter& bf); - sparse_vector_u32 index_fasta_rankselect(const string& read_file, uint16_t k); - vect_occ_read query_sequence(const rh_umap& index, const string& sequence); - vect_occ_read query_sequence_mphf(const index_vector& index, const string& sequence); - vect_occ_read query_sequence_rankselect(const sparse_vector_u32& index, const string& sequence); - vector<uint32_t> query_kmer(const rh_umap& index, const uint64_t& kmer); - vector<uint32_t> query_kmer_mphf(const index_vector& index, const uint64_t& kmer); - vector<uint32_t> query_kmer_rankselect(const sparse_vector_u32& index_vect, const uint64_t& kmer); - vect_occ_read query_fasta(const rh_umap& index, const string& filename); - vect_occ_read query_fasta_mphf(const index_vector& index, const string& filename); - vect_occ_read query_fasta_rankselect(const sparse_vector_u32& index_vect, const string& filename); - void write_index(const rh_umap& index, const string& output_file); - void write_index_mphf(const index_vector& index_vect, const string& output_file); - void write_index_rankselect(const sparse_vector_u32& read_vector, const string& output_file); + vector<uint32_t> index_fasta_rankselect(const string& read_file, uint16_t k); + vect_occ_read query_sequence_rankselect(const vector<uint32_t>& index, const string& sequence); + vector<uint32_t> query_kmer_rankselect(const vector<uint32_t>& index_vect, const string& kmer); + vect_occ_read query_fasta_rankselect(const vector<uint32_t>& index_vect, const string& filename); + void write_index_rankselect(const vector<uint32_t>& read_vector, const string& output_file); uint32_t uniqueKmers(); }; diff --git a/headers/utils.h b/headers/utils.h index 41d1dcbd952d14441cc46ec1cb4db4eeb8e39940..592308d406d6c990e0624262dd5bb8bd3e5ef791 100644 --- a/headers/utils.h +++ b/headers/utils.h @@ -25,6 +25,14 @@ kmer hash64shift(kmer key); uint32_t revhash(uint32_t x); uint32_t unrevhash(uint32_t x); uint64_t nuc2int(char c); +char revCompChar(const char& c); uint64_t shiftnucl (uint64_t kmerint, char c, uint16_t k); +uint64_t shiftnucl_revcomp(uint64_t val, const char& c, uint16_t k); +string revcomp(const string& s); +uint64_t min_kmer(uint64_t kmer1, uint64_t kmer2); +uint64_t find_canonical(uint64_t kmer_int, uint64_t revComp_int, uint64_t bitvector_size); +uint64_t modulo(uint64_t nb, uint32_t m); +void transform_file(const string& fich1, const string& fich2); +string compare_files(const string& file1, const string& file2); #endif \ No newline at end of file diff --git a/sources/index.cpp b/sources/index.cpp index 065e436597b77fad6e1f168595317a428343bfae..8c8ba0d56712bcc03e58b27dc87f993a57e58666 100644 --- a/sources/index.cpp +++ b/sources/index.cpp @@ -5,29 +5,18 @@ #include <fstream> #include <filesystem> #include <chrono> +#include <math.h> #include "../BitMagic/src/bm.h" #include "../BitMagic/src/bmundef.h" #include "../BitMagic/src/bmsparsevec.h" -#include "../pthash/include/pthash.hpp" #include "../headers/utils.h" #include "../include/robin_hood.h" -//#include "../headers/bloomfilter.h" using namespace std; -using namespace pthash; using namespace chrono; using std::find; -// PTHash - -// typedef single_phf<murmurhash2_64, // base hasher -// dictionary_dictionary, // encoder type -// true // minimal -// > -// pthash_type; -// pthash_type f; - Index::Index(string& fastaFilename, uint16_t kmerLength){ @@ -49,189 +38,50 @@ uint32_t Index::get_nb_kmers(){ - -// rh_umap Index::index_fasta_rh(const string& read_file, uint16_t k){ - -// /** -// * @brief Associate kmers to the reads to which they belong -// * @param read_file (string) the name of the file containing reads -// * @param k (int) the kmers length -// * @return (unordered_map) a hashmap (keys : kmers (string) ; values : vector<int> representing IDs for the reads) -// */ - -// rh_umap hmap; -// uint32_t num_kmers = 0; -// uint32_t num_read = 0; -// ifstream fichier(read_file, ios::in); -// if(fichier) -// { -// string ligne; -// while(!fichier.eof()){ -// getline(fichier,ligne); -// if (ligne[0] != '>' && !ligne.empty()){ // ONLY GET SEQUENCES -// for (long unsigned int i=0; i<=ligne.length()-k; i++){ -// string kmer = ligne.substr(i, k); -// num_kmers ++; -// uint64_t kmer_int = str2numstrand(kmer); -// if(hmap.find(kmer_int) == hmap.end()){ -// hmap[kmer_int]=new vector<uint32_t>; -// } -// hmap[kmer_int]->push_back(num_read); // CHECK IF THE KMER IS ALREADY IN THE HASHMAP AND INSERT -// } -// num_read ++; -// } -// } -// fichier.close(); -// //exit(0); -// //delta_encoding(hmap); -// vector_compression(hmap); -// set_nb_kmers(num_kmers); -// } -// else{ -// cerr << "Error opening the file." << endl; -// } -// return hmap; -// } - - - -// vector<kmer_to_mphf> int_to_mphf; // Association kmer int and kmer mphf -// index_vector Index::index_fasta_rh_mpfh(const string& read_file, uint16_t k, BloomFilter& bf){ - -// /** -// * @brief Associate kmers to the reads to which they belong -// * @param read_file (string) the name of the file containing reads -// * @param k (int) the kmers length -// * @return (unordered_map) a hashmap (keys : kmers (string) ; values : vector<int> representing IDs for the reads) -// */ - -// index_vector index; -// // vector avec struct position + taille - -// uint32_t num_kmers = 0; -// uint32_t num_read = 0; -// ifstream fichier(read_file, ios::in); - -// if(fichier) -// { -// // BLOOM FILTER CONSTRUCTION -// string ligne; -// while(!fichier.eof()){ -// getline(fichier,ligne); -// if (ligne[0] != '>' && !ligne.empty()){ -// for (long unsigned int i=0; i<=ligne.length()-k; i++){ -// string kmer = ligne.substr(i, k); -// num_kmers ++; -// uint64_t kmer_int = str2numstrand(kmer); -// bf.insert(kmer_int); -// } -// } -// } - -// fichier.close(); - -// // MPHF CONSTRUCTION - -// vector<uint32_t> keys = bf.filter_to_array(*bf.get_filter()); - -// build_configuration config; -// config.c = 6.0; -// config.alpha = 0.94; -// config.minimal_output = true; // mphf -// config.verbose_output = true; - -// f.build_in_internal_memory(keys.begin(), keys.size(), config); - -// // INDEX CONSTRUCTION -// ifstream fichier(read_file, ios::in); -// while(!fichier.eof()){ -// getline(fichier,ligne); -// if (ligne[0] != '>' && !ligne.empty()){ -// for (long unsigned int i=0; i<=ligne.length()-k; i++){ -// string kmer = ligne.substr(i, k); -// uint32_t kmer_hash = revhash(str2numstrand(kmer)); -// uint64_t bphf_value = f(kmer_hash); - -// kmer_to_mphf kmer_mphf; // association between kmer int (hashed) and its mphf value -// kmer_mphf.kmer_int = kmer_hash; -// kmer_mphf.kmer_mphf = bphf_value; - -// if(find(int_to_mphf.begin(), int_to_mphf.end(), kmer_mphf) == int_to_mphf.end()){ -// int_to_mphf.push_back(kmer_mphf); -// } - -// // we complete the index (vector of vectors) -// if (int_to_mphf.back().kmer_mphf == bphf_value){ // if it is the last added element -// index.push_back(new vector<uint32_t>); -// index.back()->push_back(num_read); -// } -// else{ -// auto it = find(int_to_mphf.begin(), int_to_mphf.end(), kmer_mphf); -// uint indice = it - int_to_mphf.begin(); -// index[indice]->push_back(num_read); - -// } -// } -// num_read ++; -// } -// } - -// fichier.close(); - -// set_nb_kmers(num_kmers); -// } -// else{ -// cerr << "Error opening the file." << endl; -// } -// return index; -// } - - - vector<uint32_t> vect_pos = {0}; bm::bvector<> bitvector; bvector_rankselect rs_idx(new bm::bvector<>::rs_index_type()); +vector<uint32_t> Index::index_fasta_rankselect(const string& read_file, uint16_t k){ -sparse_vector_u32 Index::index_fasta_rankselect(const string& read_file, uint16_t k){ - - /** - * @brief Associate kmers to the reads to which they belong - * @param read_file (string) the name of the file containing reads - * @param k (int) the kmers length - * @return (unordered_map) a hashmap (keys : kmers (string) ; values : vector<int> representing IDs for the reads) - */ - bitvector.resize(2000000); - sparse_vector_u32 index; - uint32_t total_count, num_read = 0; + //bitvector.resize(pow(2,29)); + uint32_t bvsize = bitvector.size(); + vector<uint32_t> index; uint32_t num_kmers = 1; - uint64_t kmer_int, kmer_hash; + uint32_t kmer_int, revComp_int, kmer_hash, revComp_hash; + unordered_map<uint32_t, uint32_t> hmap; auto start_bitvector = high_resolution_clock::now(); ifstream fichier(read_file, ios::in); if(fichier){ - // BLOOM FILTER CONSTRUCTION + + // BLOOM FILTER while(!fichier.eof()){ string ligne, kmer; getline(fichier,ligne); + bool is_first = true; if (ligne[0] != '>' && !ligne.empty()){ - kmer = ligne.substr(0, k); - kmer_int = str2numstrand(kmer); - kmer_hash = revhash(kmer_int) % bitvector.size(); - if(bitvector[kmer_hash] != 1){ - bitvector.set(kmer_hash); - num_kmers++; - } - for (long unsigned int i=k; i<ligne.length(); i++){ - kmer_int = shiftnucl(kmer_int, ligne[i], k); - kmer_hash = revhash(kmer_int) % bitvector.size(); + uint16_t i = k; + while(i<ligne.length()){ + if(is_first){ + kmer = ligne.substr(0, k); + kmer_int = str2numstrand(kmer); + revComp_int = str2numstrand(revcomp(kmer)); + kmer_hash = find_canonical(kmer_int, revComp_int, bvsize); + is_first = false; + } + else{ + kmer_int = shiftnucl(kmer_int, ligne[i], k); + revComp_int = shiftnucl_revcomp(revComp_int, ligne[i], k); + kmer_hash = find_canonical(kmer_int, revComp_int, bvsize); + i++; + } if(bitvector[kmer_hash] != 1){ bitvector.set(kmer_hash); - num_kmers ++; + num_kmers++; } } } - num_read ++; } set_nb_kmers(num_kmers); bitvector.build_rs_index(rs_idx.get()); @@ -239,47 +89,61 @@ sparse_vector_u32 Index::index_fasta_rankselect(const string& read_file, uint16_ else{ cerr << "Error opening the file." << endl; } + cout << endl; fichier.close(); auto end_bitvector = high_resolution_clock::now(); auto querying_bitvector = duration_cast<std::chrono::seconds>(end_bitvector- start_bitvector); cout << "ok bitvector : " << querying_bitvector.count() << " seconds." << endl; + // VECTOR RANK/NB_APPARITION + auto start_struct = high_resolution_clock::now(); vector<uint32_t> kmer_app(num_kmers); ifstream fichier_struct(read_file, ios::in); if(fichier_struct){ string ligne_struct, kmer_struct; while(!fichier_struct.eof()){ + bool is_first = true; getline(fichier_struct,ligne_struct); if (ligne_struct[0] != '>' && !ligne_struct.empty()){ - - kmer_struct = ligne_struct.substr(0, k); - kmer_int = str2numstrand(kmer_struct); - kmer_hash = revhash(kmer_int) % bitvector.size(); - auto rank_hash = bitvector.rank(kmer_hash, *rs_idx); - kmer_app[rank_hash-1] ++; - for (long unsigned int i=k; i<ligne_struct.length(); i++){ - kmer_int = shiftnucl(kmer_int, ligne_struct[i], k); - kmer_hash = revhash(kmer_int) % bitvector.size(); + uint16_t i = k; + while(i<ligne_struct.length()){ + if(is_first){ + kmer_struct = ligne_struct.substr(0, k); + kmer_int = str2numstrand(kmer_struct); + revComp_int = str2numstrand(revcomp(kmer_struct)); + kmer_hash = find_canonical(kmer_int, revComp_int, bvsize); + is_first = false; + } + else{ + kmer_int = shiftnucl(kmer_int, ligne_struct[i], k); + revComp_int = shiftnucl_revcomp(revComp_int, ligne_struct[i], k); + kmer_hash = find_canonical(kmer_int, revComp_int, bvsize); + i++; + } auto rank_hash = bitvector.rank(kmer_hash, *rs_idx); kmer_app[rank_hash-1] ++; - } + } + } } } - } fichier_struct.close(); auto end_struct = high_resolution_clock::now(); auto querying_struct = duration_cast<std::chrono::seconds>(end_struct - start_struct); cout << "ok struct : " << querying_struct.count() << " seconds." << endl; + // VECTOR WITH START POSITIONS FOR READS + + uint32_t total_count = 0; for(vector<uint32_t>::iterator i = kmer_app.begin() ; i != kmer_app.end(); i++){ total_count += *i; vect_pos.push_back(total_count); } cout << "ok vectpos" << endl; - + index.resize(total_count); + uint cpt = 0; auto start_index = high_resolution_clock::now(); uint32_t num_read_index = 0; @@ -287,30 +151,26 @@ sparse_vector_u32 Index::index_fasta_rankselect(const string& read_file, uint16_ if(fichier_index){ string ligne_index, kmer_index; while(!fichier_index.eof()){ + bool is_first = true; getline(fichier_index,ligne_index); if (ligne_index[0] != '>' && !ligne_index.empty()){ + uint16_t i = k; num_read_index ++; - kmer_index = ligne_index.substr(0, k); - kmer_int = str2numstrand(kmer_index); - kmer_hash = revhash(kmer_int) % bitvector.size(); - auto rank_hash = bitvector.rank(kmer_hash, *rs_idx); - uint32_t start_pos = vect_pos[rank_hash - 1]; - uint32_t length_max = vect_pos[rank_hash] - vect_pos[rank_hash-1]; - - bool present = false; - uint tmp = start_pos; - while(tmp<start_pos+length_max && !present){ - if(index[tmp] == false){ - index[tmp] = num_read_index; - present = true; + while(i<ligne_index.length()){ + if(is_first){ + kmer_index = ligne_index.substr(0, k); + kmer_int = str2numstrand(kmer_index); + revComp_int = str2numstrand(revcomp(kmer_index)); + kmer_hash = find_canonical(kmer_int, revComp_int, bvsize); + is_first = false; + } + else{ + kmer_int = shiftnucl(kmer_int, ligne_index[i], k); + revComp_int = shiftnucl_revcomp(revComp_int, ligne_index[i], k); + kmer_hash = find_canonical(kmer_int, revComp_int, bvsize); + i++; } - tmp++; - } - - for (long unsigned int i=k; i<ligne_index.length(); i++){ - kmer_int = shiftnucl(kmer_int, ligne_index[i], k); - kmer_hash = revhash(kmer_int) % bitvector.size(); auto rank_hash = bitvector.rank(kmer_hash, *rs_idx); uint32_t start_pos = vect_pos[rank_hash - 1]; uint32_t length_max = vect_pos[rank_hash] - vect_pos[rank_hash-1]; @@ -338,61 +198,10 @@ sparse_vector_u32 Index::index_fasta_rankselect(const string& read_file, uint16_ - - - - -// vector<uint32_t> Index::query_kmer(const rh_umap& index, const uint64_t& kmer){ - -// /** -// * @brief Query a k-mer in an index structure. -// * @param kmer (string) the k-mer we are searching for. -// * @return (vector<int>) the list of reads in which the k-mer appears. -// */ - -// try{ -// vector<uint32_t> reads = *index.at(kmer); -// vector_decompression(reads); -// return reads; -// } - -// catch(const out_of_range& err) { -// cerr << "K-mer not found." << '\n'; -// vector<uint32_t> v; -// return v; -// } - -// } - - - -// vector<uint32_t> Index::query_kmer_mphf(const index_vector& index_vect, const uint64_t& kmer){ -// uint64_t kmer_hash = revhash(kmer); -// uint64_t kmer_mphf = bphf_it->lookup(kmer_hash); - -// kmer_to_mphf kmer_int_to_mphf; // association between kmer int (hashed) and its mphf value -// kmer_int_to_mphf.kmer_int = kmer_hash; -// kmer_int_to_mphf.kmer_mphf = kmer_mphf; - -// if (kmer_mphf != ULLONG_MAX){ // if the kmer exists -// auto it = find(val_to_pos.begin(), val_to_pos.end(), kmer_int_to_mphf); -// uint indice = it - val_to_pos.begin(); -// return *index_vect[indice]; -// } -// else{ -// cout << "K-mer not found." << '\n'; -// vector<uint32_t> v; -// return v; -// } - -// vector<uint32_t> v; -// return v; -// } - - - -vector<uint32_t> Index::query_kmer_rankselect(const sparse_vector_u32& index, const uint64_t& kmer){ - uint64_t kmer_hash = revhash(kmer) % bitvector.size(); +vector<uint32_t> Index::query_kmer_rankselect(const vector<uint32_t>& index, const string& kmer){ + uint32_t kmer_int = str2numstrand(kmer); + uint32_t revComp_int = str2numstrand(revcomp(kmer)); + uint64_t kmer_hash = find_canonical(kmer_int, revComp_int, bitvector.size()); uint length = 0; uint start = 0; if (bitvector[kmer_hash] == 1){ // if the kmer exists @@ -416,92 +225,14 @@ vector<uint32_t> Index::query_kmer_rankselect(const sparse_vector_u32& index, co -// vect_occ_read Index::query_sequence(const rh_umap& index, const string& sequence){ - -// /** -// * @brief Query a read in an index structure. The read will be first divided into k-mers. -// * @param read (string) the read we'll divide in k-mers and search for in the index. -// * @return (vector<int, vector<int>>) the list of reads in which the k-mers appears and their number of occurences. -// */ - -// vect_occ_read res; -// unordered_map<uint32_t, uint32_t> tmp_reads; -// for (long unsigned int i=0; i<=sequence.length()-this->k; i++){ -// string kmer = sequence.substr(i, this->k); -// uint64_t kmer_int = str2numstrand(kmer); -// vector<uint32_t> reads_of_kmer = query_kmer(index, kmer_int); -// for (auto& idRead : reads_of_kmer){ -// if(tmp_reads.find(idRead) != tmp_reads.end()){ -// tmp_reads[idRead] ++; -// } -// else{ -// tmp_reads[idRead] = 1; -// } -// } -// } - -// for (auto const &pair: tmp_reads){ -// occ_in_read structRead; -// structRead.read_id = pair.first; -// structRead.total_count = pair.second; -// res.push_back(structRead); -// } - -// return res; -// } - - - -// vect_occ_read Index::query_sequence_mphf(const index_vector& index_vect, const string& sequence){ - -// /** -// * @brief Query a read in an index structure. The read will be first divided into k-mers. -// * @param read (string) the read we'll divide in k-mers and search for in the index. -// * @return (vector<int, vector<int>>) the list of reads in which the k-mers appears and their number of occurences. -// */ - -// vect_occ_read res; -// unordered_map<uint32_t, uint32_t> tmp_reads; -// for (long unsigned int i=0; i<=sequence.length()-this->k; i++){ -// string kmer = sequence.substr(i, this->k); -// uint64_t kmer_int = str2numstrand(kmer); -// vector<uint32_t> reads_of_kmer = query_kmer_mphf(index_vect, kmer_int); -// for (auto& idRead : reads_of_kmer){ -// if(tmp_reads.find(idRead) != tmp_reads.end()){ -// tmp_reads[idRead] ++; -// } -// else{ -// tmp_reads[idRead] = 1; -// } -// } -// } - -// for (auto const &pair: tmp_reads){ -// occ_in_read structRead; -// structRead.read_id = pair.first; -// structRead.total_count = pair.second; -// res.push_back(structRead); -// } - -// return res; -// } - - - -vect_occ_read Index::query_sequence_rankselect(const sparse_vector_u32& index, const string& sequence){ - - /** - * @brief Query a read in an index structure. The read will be first divided into k-mers. - * @param read (string) the read we'll divide in k-mers and search for in the index. - * @return (vector<int, vector<int>>) the list of reads in which the k-mers appear and their number of occurences. - */ +vect_occ_read Index::query_sequence_rankselect(const vector<uint32_t>& index, const string& sequence){ vect_occ_read res; unordered_map<uint32_t, uint32_t> tmp_reads; + bool is_first = true; for (long unsigned int i=0; i<=sequence.length()-this->k; i++){ string kmer = sequence.substr(i, this->k); - uint64_t kmer_int = str2numstrand(kmer); - vector<uint32_t> reads_of_kmer = query_kmer_rankselect(index, kmer_int); + vector<uint32_t> reads_of_kmer = query_kmer_rankselect(index, kmer); for (auto& idRead : reads_of_kmer){ if(tmp_reads.find(idRead) != tmp_reads.end()){ tmp_reads[idRead] ++; @@ -524,109 +255,8 @@ vect_occ_read Index::query_sequence_rankselect(const sparse_vector_u32& index, c -// vect_occ_read Index::query_fasta(const rh_umap& index, const string& filename){ -// /** -// * @brief Search, for all the reads in the file, and for all their kmers, in which read they appear. -// * @param filename (string) the file containing the reads. -// * @return (vector<vector<occ_in_read>>) for each read, the list of reads in which the kmers appear. -// */ - -// vect_occ_read res; -// unordered_map<uint32_t, uint32_t> tmp_reads; - -// ifstream fichier(filename, ios::in); - -// if(fichier){ -// string ligne; -// while( !fichier.eof()){ -// getline(fichier,ligne); -// if (ligne[0] != '>' && !ligne.empty()){ -// vect_occ_read occ_read = query_sequence(index, ligne); -// for (auto& struct_read : occ_read){ -// int read_id = struct_read.read_id; -// int total_count = struct_read.total_count; -// if(tmp_reads.find(read_id) != tmp_reads.end()){ -// tmp_reads[read_id] += total_count; -// } -// else{ -// tmp_reads[read_id] = total_count; -// } -// } -// } -// } -// for (auto const &pair: tmp_reads){ -// occ_in_read structRead; -// structRead.read_id = pair.first; -// structRead.total_count = pair.second; -// res.push_back(structRead); -// } -// fichier.close(); -// return res; -// } -// else{ -// cerr << "Error opening the file." << endl; -// return res; -// } -// } - - - -// vect_occ_read Index::query_fasta_mphf(const index_vector& index_vect, const string& filename){ - -// /** -// * @brief Search, for all the reads in the file, and for all their kmers, in which read they appear. -// * @param filename (string) the file containing the reads. -// * @return (vector<vector<occ_in_read>>) for each read, the list of reads in which the kmers appear. -// */ - -// vect_occ_read res; -// unordered_map<uint32_t, uint32_t> tmp_reads; - -// ifstream fichier(filename, ios::in); - -// if(fichier){ -// string ligne; -// while( !fichier.eof()){ -// getline(fichier,ligne); -// if (ligne[0] != '>' && !ligne.empty()){ -// vect_occ_read occ_read = query_sequence_mphf(index_vect, ligne); -// for (auto& struct_read : occ_read){ -// int read_id = struct_read.read_id; -// int total_count = struct_read.total_count; -// if(tmp_reads.find(read_id) != tmp_reads.end()){ -// tmp_reads[read_id] += total_count; -// } -// else{ -// tmp_reads[read_id] = total_count; -// } -// } -// } -// } -// for (auto const &pair: tmp_reads){ -// occ_in_read structRead; -// structRead.read_id = pair.first; -// structRead.total_count = pair.second; -// res.push_back(structRead); -// } -// fichier.close(); -// return res; -// } -// else{ -// cerr << "Error opening the file." << endl; -// return res; -// } -// } - - - -vect_occ_read Index::query_fasta_rankselect(const sparse_vector_u32& index, const string& filename){ - - /** - * @brief Search, for all the reads in the file, and for all their kmers, in which read they appear. - * @param filename (string) the file containing the reads. - * @return (vector<vector<occ_in_read>>) for each read, the list of reads in which the kmers appear. - */ +vect_occ_read Index::query_fasta_rankselect(const vector<uint32_t>& index, const string& filename){ vect_occ_read res; unordered_map<uint32_t, uint32_t> tmp_reads; @@ -667,57 +297,16 @@ vect_occ_read Index::query_fasta_rankselect(const sparse_vector_u32& index, cons } - -// void Index::write_index(const rh_umap& index, const string& output_file){ - -// /** -// * @brief Query a k-mer in an index structure. -// * @param kmer (string) the k-mer we are searching for. -// * @return (vector<int>) the list of reads in which the k-mer appears. -// */ -// ofstream outp(output_file); -// if (outp){ -// for (auto const &pair: index) { -// outp << pair.first << ": "; -// for (int i : *pair.second){ -// outp << i << ","; -// } -// outp << "\n"; -// } -// } - -// outp.close(); -// } - - - -// void Index::write_index_mphf(const index_vector& index_vect, const string& output_file){ - - -// ofstream outp(output_file); -// if (outp){ -// for(unsigned int i = 0; i < index_vect.size(); i++) { -// uint64_t kmer_hashed = int_to_mphf[i].kmer_int; -// uint64_t kmer_int = unrevhash(kmer_hashed); - -// outp << kmer_int << ": "; -// for (auto read : *index_vect[i]){ -// outp << read << ","; -// } -// outp << "\n"; -// } -// } -// outp.close(); -// } - - -void Index::write_index_rankselect(const sparse_vector_u32& index, const string& output_file){ +void Index::write_index_rankselect(const vector<uint32_t>& index, const string& output_file){ ofstream outp(output_file); + cout << "Writing the file..." << endl; if (outp){ uint rank, pos_begin, length; uint kmer_hash = bitvector.get_first(); - while(kmer_hash != 0){ + bool first = true; + + while(kmer_hash != 0 || first){ rank = bitvector.rank(kmer_hash, *rs_idx); pos_begin = vect_pos[rank-1]; length = vect_pos[rank] - vect_pos[rank-1]; @@ -726,10 +315,9 @@ void Index::write_index_rankselect(const sparse_vector_u32& index, const string& outp << index[i] << ","; } outp << "\n"; - + first = false; kmer_hash = bitvector.get_next(kmer_hash); } - cout << "File created. " << endl; } outp.close(); } diff --git a/sources/main.cpp b/sources/main.cpp index 331047edee0778ad0995f7d1fd3058a5904cd5b6..912add20bfbe81cf2467d33fb582f53c4e8fae16 100644 --- a/sources/main.cpp +++ b/sources/main.cpp @@ -12,7 +12,6 @@ #include "../headers/index.h" #include "../headers/utils.h" #include "../headers/bloomfilter.h" -//#include "bool_iterator.cpp" using namespace std; using namespace chrono; @@ -26,13 +25,18 @@ int main(int argc, char *argv[]) else{ + /*transform_file("TESTS/index_phage.txt", "TESTS/transform_phage.txt"); + transform_file("TESTS/phage_index_rankselect.txt", "TESTS/transform_phage_rank.txt"); + + cout << compare_files("TESTS/transform_rankselect_myco_sorted.txt", "TESTS/transform_rh_myco_sorted.txt");*/ + std::string filename(argv[1]); uint16_t k = stoi(argv[2]); float error_rate = stof(argv[3]); Index index = Index(filename, k); auto start_indexing = high_resolution_clock::now(); - sparse_vector_u32 read_vector = index.index_fasta_rankselect(filename, k); + vector<uint32_t> read_vector = index.index_fasta_rankselect(filename, k); auto end_indexing = high_resolution_clock::now(); auto indexing = duration_cast<seconds>(end_indexing - start_indexing); double pct_unique_kmers = (double)index.uniqueKmers()/index.get_nb_kmers()*100; @@ -43,7 +47,7 @@ int main(int argc, char *argv[]) uint64_t memory_used_index = getMemorySelfMaxUsed(); cout << "Resource usage (indexing) : " << intToString(memory_used_index) << " Ko" << endl; cout << "Resource usage (for 1 k-mer) : " << koToBit((double)memory_used_index/index.get_nb_kmers()) << " Bits" << endl; - index.write_index_rankselect(read_vector, "output_files/index_rankselect.txt"); + index.write_index_rankselect(read_vector, "output_files/phage_index_rankselect_order.txt"); int opt; @@ -56,10 +60,9 @@ int main(int argc, char *argv[]) // QUERYING KMER string kmer_to_search = seq_from_fasta(optarg); - uint64_t kmer_int = str2numstrand(kmer_to_search); auto start_querying_kmer = high_resolution_clock::now(); - vector<uint32_t> res_query_kmer = index.query_kmer_rankselect(read_vector, kmer_int); + vector<uint32_t> res_query_kmer = index.query_kmer_rankselect(read_vector, kmer_to_search); if(res_query_kmer.empty()){ cout << "The index does not contain this k-mer."; } diff --git a/sources/utils.cpp b/sources/utils.cpp index d42c3741c564085dde81df4f6a788cc850313677..1edaec77a1231cf0d0d5c7721bad51e473b502ec 100644 --- a/sources/utils.cpp +++ b/sources/utils.cpp @@ -166,20 +166,6 @@ void write_values(string output_file, float val1, const string& val2){ -kmer hash64shift(kmer key) { - key = (~key) + (key << 21); // key = (key << 21) - key - 1; - key = key ^ (key >> 24); - key = (key + (key << 3)) + (key << 8); // key * 265 - key = key ^ (key >> 14); - key = (key + (key << 2)) + (key << 4); // key * 21 - key = key ^ (key >> 28); - key = key + (key << 31); - return key; -} - - - - uint32_t revhash(uint32_t x) { x = ((x >> 16) ^ x) * 0x2c1b3c6d; x = ((x >> 16) ^ x) * 0x297a2d39; @@ -204,4 +190,100 @@ uint64_t shiftnucl(uint64_t val, char c, uint16_t k) { val = val + str2numstrand(string(1,c)); uint mask = pow(2,(k*2))-1; return val & mask; -} \ No newline at end of file +} + + +char revCompChar(const char& c){ + switch (c){ + case 'A': return 'T'; + case 'C': return 'G'; + case 'G': return 'C'; + } + return 'A'; +} + + + +string revcomp(const string& s){ + string res(s.size(), 0); + for(int i((int)s.length() - 1); i >= 0; i--){ + res[s.size()-1-i] = revCompChar(s[i]); + } + return res; +} + + + + +uint64_t shiftnucl_revcomp(uint64_t val, const char& c, uint16_t k) { + val = val >> 2; + return val + (str2numstrand(string(1,revCompChar(c))) << ((k-1)*2)); +} + + + + +uint64_t min_kmer(uint64_t kmer1, uint64_t kmer2){ + if(kmer1<kmer2){ + return kmer1; + } + return kmer2; +} + + + +uint64_t modulo(uint64_t nb, uint32_t m){ + return nb & (m-1); +} + + + +uint64_t find_canonical(uint64_t kmer_int, uint64_t revComp_int, uint64_t bitvector_size){ + return modulo(revhash(min_kmer(kmer_int, revComp_int)), bitvector_size); +} + + + +void transform_file(const string& fich1, const string& fich2){ + + ifstream fichier1(fich1, ios::in); + ofstream outp(fich2); + if(outp){ + string ligne; + while(!fichier1.eof()){ + getline(fichier1,ligne); + ligne.erase(0, ligne.find(":")+1); + ligne.erase(std::remove_if(ligne.begin(), ligne.end(), ::isspace), ligne.end()); + outp << ligne << "\n"; + } + } + fichier1.close(); + outp.close(); +} + + + +string compare_files(const string& file1, const string& file2){ + string res = ""; + int cpt = 1; + ifstream fichier1(file1, ios::in); + ifstream fichier2(file2, ios::in); + + if(fichier1 && fichier2){ + string ligne1, ligne2; + while(!fichier1.eof() && !fichier2.eof()){ + getline(fichier1,ligne1); + getline(fichier2,ligne2); + + if(ligne1 != ligne2){ + res += std::to_string(cpt); + res += " "; + } + cpt++; + } + } + return res; +} + + +