From c23ff7bb3396e8c4e94a7d295aeee3645fb1abd3 Mon Sep 17 00:00:00 2001
From: Lea Vandamme <lea.vandamme.etu@univ-lille.fr>
Date: Thu, 23 Jun 2022 17:43:59 +0200
Subject: [PATCH] version using rank bitvectors/rank ok

---
 headers/index.h   |  36 +--
 headers/utils.h   |   8 +
 sources/index.cpp | 574 +++++++---------------------------------------
 sources/main.cpp  |  13 +-
 sources/utils.cpp | 112 +++++++--
 5 files changed, 199 insertions(+), 544 deletions(-)

diff --git a/headers/index.h b/headers/index.h
index 8790f03..cc0e06e 100644
--- a/headers/index.h
+++ b/headers/index.h
@@ -19,28 +19,12 @@ struct occ_in_read {
 
 
 
-struct kmer_to_mphf {
-            uint64_t kmer_int;
-            uint64_t kmer_mphf;
-
-            public :
-                bool operator== (const kmer_to_mphf& item){
-                    if (item.kmer_int == kmer_int && item.kmer_mphf == kmer_mphf){
-                        return true;
-                    }
-                    return false;
-                }
-        };
-
-
-
 struct rank_kmer {
             uint64_t rank;
             uint32_t nb_apparition;
         };
 
 typedef robin_hood::unordered_map<uint32_t, vector<uint32_t>*> rh_umap;
-typedef robin_hood::unordered_map<uint32_t, uint8_t> rh_umap_test;
 typedef vector<occ_in_read> vect_occ_read;
 typedef uint64_t kmer;
 typedef vector<vector<uint32_t>*>  index_vector;
@@ -62,21 +46,11 @@ class Index{
         void set_nb_kmers(uint32_t nb_kmers);
         uint32_t get_nb_kmers();
 
-        rh_umap index_fasta_rh(const string& filename, uint16_t k);
-        index_vector index_fasta_rh_mpfh(const string& filename, uint16_t k, BloomFilter& bf);
-        sparse_vector_u32 index_fasta_rankselect(const string& read_file, uint16_t k);
-        vect_occ_read query_sequence(const rh_umap& index, const string& sequence);
-        vect_occ_read query_sequence_mphf(const index_vector& index, const string& sequence);
-        vect_occ_read query_sequence_rankselect(const sparse_vector_u32& index, const string& sequence);
-        vector<uint32_t> query_kmer(const rh_umap& index, const uint64_t& kmer);
-        vector<uint32_t> query_kmer_mphf(const index_vector& index, const uint64_t& kmer);
-        vector<uint32_t> query_kmer_rankselect(const sparse_vector_u32& index_vect, const uint64_t& kmer);
-        vect_occ_read query_fasta(const rh_umap& index, const string& filename);
-        vect_occ_read query_fasta_mphf(const index_vector& index, const string& filename);
-        vect_occ_read query_fasta_rankselect(const sparse_vector_u32& index_vect, const string& filename);
-        void write_index(const rh_umap& index, const string& output_file);
-        void write_index_mphf(const index_vector& index_vect, const string& output_file);
-        void write_index_rankselect(const sparse_vector_u32& read_vector, const string& output_file);
+        vector<uint32_t> index_fasta_rankselect(const string& read_file, uint16_t k);
+        vect_occ_read query_sequence_rankselect(const vector<uint32_t>& index, const string& sequence);
+        vector<uint32_t> query_kmer_rankselect(const vector<uint32_t>& index_vect, const string& kmer);
+        vect_occ_read query_fasta_rankselect(const vector<uint32_t>& index_vect, const string& filename);
+        void write_index_rankselect(const vector<uint32_t>& read_vector, const string& output_file);
         uint32_t uniqueKmers();
 };
 
diff --git a/headers/utils.h b/headers/utils.h
index 41d1dcb..592308d 100644
--- a/headers/utils.h
+++ b/headers/utils.h
@@ -25,6 +25,14 @@ kmer hash64shift(kmer key);
 uint32_t revhash(uint32_t x);
 uint32_t unrevhash(uint32_t x);
 uint64_t nuc2int(char c); 
+char revCompChar(const char& c);
 uint64_t shiftnucl (uint64_t kmerint, char c, uint16_t k);
+uint64_t shiftnucl_revcomp(uint64_t val, const char& c, uint16_t k);
+string revcomp(const string& s);
+uint64_t min_kmer(uint64_t kmer1, uint64_t kmer2);
+uint64_t find_canonical(uint64_t kmer_int, uint64_t revComp_int, uint64_t bitvector_size);
+uint64_t modulo(uint64_t nb, uint32_t m);
+void transform_file(const string& fich1, const string& fich2);
+string compare_files(const string& file1, const string& file2);
 
 #endif
\ No newline at end of file
diff --git a/sources/index.cpp b/sources/index.cpp
index 065e436..8c8ba0d 100644
--- a/sources/index.cpp
+++ b/sources/index.cpp
@@ -5,29 +5,18 @@
 #include <fstream>
 #include <filesystem>
 #include <chrono>
+#include <math.h>
 #include "../BitMagic/src/bm.h"
 #include "../BitMagic/src/bmundef.h"
 #include "../BitMagic/src/bmsparsevec.h"
-#include "../pthash/include/pthash.hpp"
 #include "../headers/utils.h"
 #include "../include/robin_hood.h"
-//#include "../headers/bloomfilter.h"
 
 
 using namespace std;
-using namespace pthash;
 using namespace chrono;
 using std::find;
 
-// PTHash
-
-// typedef single_phf<murmurhash2_64,         // base hasher
-//                     dictionary_dictionary,  // encoder type
-//                     true                    // minimal
-//                     >
-// pthash_type;
-// pthash_type f;
-
 
 
 Index::Index(string& fastaFilename, uint16_t kmerLength){
@@ -49,189 +38,50 @@ uint32_t Index::get_nb_kmers(){
 
 
 
-
-// rh_umap Index::index_fasta_rh(const string& read_file, uint16_t k){
-
-//     /**
-//      * @brief Associate kmers to the reads to which they belong
-//      * @param read_file (string) the name of the file containing reads
-//      * @param k (int) the kmers length
-//      * @return (unordered_map) a hashmap (keys : kmers (string) ; values : vector<int> representing IDs for the reads)
-//      */
-
-//     rh_umap hmap;
-//     uint32_t num_kmers = 0;
-//     uint32_t num_read = 0;
-//     ifstream fichier(read_file, ios::in);
-//     if(fichier)
-//         {   
-//             string ligne;
-//             while(!fichier.eof()){
-//                 getline(fichier,ligne);
-//                 if (ligne[0] != '>' && !ligne.empty()){ // ONLY GET SEQUENCES
-//                     for (long unsigned int i=0; i<=ligne.length()-k; i++){
-//                         string kmer = ligne.substr(i, k);
-//                         num_kmers ++;
-//                         uint64_t kmer_int = str2numstrand(kmer);
-//                         if(hmap.find(kmer_int) == hmap.end()){
-//                             hmap[kmer_int]=new vector<uint32_t>;
-//                         }
-//                         hmap[kmer_int]->push_back(num_read); // CHECK IF THE KMER IS ALREADY IN THE HASHMAP AND INSERT
-//                     }
-//                     num_read ++;
-//                 }
-//             }
-//             fichier.close();
-//             //exit(0);
-//             //delta_encoding(hmap);
-//             vector_compression(hmap);
-//             set_nb_kmers(num_kmers);
-//         }
-//     else{
-//         cerr << "Error opening the file." << endl;
-//     }
-//     return hmap;
-// }
-
-
-
-// vector<kmer_to_mphf> int_to_mphf; // Association kmer int and kmer mphf
-// index_vector Index::index_fasta_rh_mpfh(const string& read_file, uint16_t k, BloomFilter& bf){
-
-//     /**
-//      * @brief Associate kmers to the reads to which they belong
-//      * @param read_file (string) the name of the file containing reads
-//      * @param k (int) the kmers length
-//      * @return (unordered_map) a hashmap (keys : kmers (string) ; values : vector<int> representing IDs for the reads)
-//      */
-
-//     index_vector index;
-//     // vector avec struct position + taille
-
-//     uint32_t num_kmers = 0;
-//     uint32_t num_read = 0;
-//     ifstream fichier(read_file, ios::in);
-    
-//     if(fichier)
-//         {   
-//             // BLOOM FILTER CONSTRUCTION
-//             string ligne;
-//             while(!fichier.eof()){
-//                 getline(fichier,ligne);
-//                 if (ligne[0] != '>' && !ligne.empty()){
-//                     for (long unsigned int i=0; i<=ligne.length()-k; i++){
-//                         string kmer = ligne.substr(i, k);
-//                         num_kmers ++;
-//                         uint64_t kmer_int = str2numstrand(kmer);
-//                         bf.insert(kmer_int);
-//                     }
-//                 }
-//             }
-
-//             fichier.close();
-
-//             // MPHF CONSTRUCTION
-            
-//             vector<uint32_t> keys = bf.filter_to_array(*bf.get_filter());
-
-//             build_configuration config;
-//             config.c = 6.0;
-//             config.alpha = 0.94;
-//             config.minimal_output = true;  // mphf
-//             config.verbose_output = true;
-
-//             f.build_in_internal_memory(keys.begin(), keys.size(), config);
-            
-//             // INDEX CONSTRUCTION
-//             ifstream fichier(read_file, ios::in);
-//             while(!fichier.eof()){
-//                 getline(fichier,ligne);
-//                 if (ligne[0] != '>' && !ligne.empty()){
-//                     for (long unsigned int i=0; i<=ligne.length()-k; i++){
-//                         string kmer = ligne.substr(i, k);
-//                         uint32_t kmer_hash = revhash(str2numstrand(kmer));
-//                         uint64_t bphf_value = f(kmer_hash);
-
-//                         kmer_to_mphf kmer_mphf; // association between kmer int (hashed) and its mphf value
-//                         kmer_mphf.kmer_int = kmer_hash;
-//                         kmer_mphf.kmer_mphf = bphf_value;
-
-//                         if(find(int_to_mphf.begin(), int_to_mphf.end(), kmer_mphf) == int_to_mphf.end()){
-//                             int_to_mphf.push_back(kmer_mphf);
-//                         }
-                        
-//                         // we complete the index (vector of vectors)
-//                         if (int_to_mphf.back().kmer_mphf == bphf_value){ // if it is the last added element
-//                             index.push_back(new vector<uint32_t>);
-//                             index.back()->push_back(num_read);
-//                         }
-//                         else{
-//                             auto it = find(int_to_mphf.begin(), int_to_mphf.end(), kmer_mphf);
-//                             uint indice = it - int_to_mphf.begin();
-//                             index[indice]->push_back(num_read);
-
-//                         }
-//                     }
-//                     num_read ++;
-//                 }
-//             }
-
-//             fichier.close();
-            
-//             set_nb_kmers(num_kmers);
-//         }
-//     else{
-//         cerr << "Error opening the file." << endl;
-//     }
-//     return index;
-// }
-
-
-
 vector<uint32_t> vect_pos = {0};
 bm::bvector<> bitvector;
 bvector_rankselect rs_idx(new bm::bvector<>::rs_index_type());
+vector<uint32_t> Index::index_fasta_rankselect(const string& read_file, uint16_t k){
 
-sparse_vector_u32 Index::index_fasta_rankselect(const string& read_file, uint16_t k){
-
-    /**
-     * @brief Associate kmers to the reads to which they belong
-     * @param read_file (string) the name of the file containing reads
-     * @param k (int) the kmers length
-     * @return (unordered_map) a hashmap (keys : kmers (string) ; values : vector<int> representing IDs for the reads)
-     */
-    bitvector.resize(2000000);
-    sparse_vector_u32 index;
-    uint32_t total_count, num_read = 0;
+    //bitvector.resize(pow(2,29));
+    uint32_t bvsize = bitvector.size();
+    vector<uint32_t> index;
     uint32_t num_kmers = 1;
-    uint64_t kmer_int, kmer_hash;
+    uint32_t kmer_int, revComp_int, kmer_hash, revComp_hash;
+    unordered_map<uint32_t, uint32_t> hmap;
 
     auto start_bitvector = high_resolution_clock::now();
     ifstream fichier(read_file, ios::in);
     if(fichier){
-        // BLOOM FILTER CONSTRUCTION
+
+        // BLOOM FILTER
         
         while(!fichier.eof()){
             string ligne, kmer;
             getline(fichier,ligne);
+            bool is_first = true;
             if (ligne[0] != '>' && !ligne.empty()){
-                kmer = ligne.substr(0, k);
-                kmer_int = str2numstrand(kmer);
-                kmer_hash = revhash(kmer_int) % bitvector.size();
-                if(bitvector[kmer_hash] != 1){
-                    bitvector.set(kmer_hash);
-                    num_kmers++;
-                }
-                for (long unsigned int i=k; i<ligne.length(); i++){
-                    kmer_int = shiftnucl(kmer_int, ligne[i], k);
-                    kmer_hash = revhash(kmer_int) % bitvector.size();
+                uint16_t i = k;
+                while(i<ligne.length()){
+                    if(is_first){
+                        kmer = ligne.substr(0, k);
+                        kmer_int = str2numstrand(kmer);
+                        revComp_int = str2numstrand(revcomp(kmer));
+                        kmer_hash = find_canonical(kmer_int, revComp_int, bvsize);
+                        is_first = false;
+                    }
+                    else{
+                        kmer_int = shiftnucl(kmer_int, ligne[i], k);
+                        revComp_int = shiftnucl_revcomp(revComp_int, ligne[i], k);
+                        kmer_hash = find_canonical(kmer_int, revComp_int, bvsize);
+                        i++;
+                    }
                     if(bitvector[kmer_hash] != 1){
                         bitvector.set(kmer_hash);
-                        num_kmers ++;
+                        num_kmers++;
                     }
                 }
             }
-            num_read ++;
         }
         set_nb_kmers(num_kmers);
         bitvector.build_rs_index(rs_idx.get());
@@ -239,47 +89,61 @@ sparse_vector_u32 Index::index_fasta_rankselect(const string& read_file, uint16_
     else{
         cerr << "Error opening the file." << endl;
     }
+    cout << endl;
     fichier.close();
     auto end_bitvector = high_resolution_clock::now();
     auto querying_bitvector = duration_cast<std::chrono::seconds>(end_bitvector- start_bitvector);
     cout << "ok bitvector : " << querying_bitvector.count() << " seconds."  << endl;
 
+    // VECTOR RANK/NB_APPARITION
+
     auto start_struct = high_resolution_clock::now();
     vector<uint32_t> kmer_app(num_kmers);
     ifstream fichier_struct(read_file, ios::in);
     if(fichier_struct){
         string ligne_struct, kmer_struct;
         while(!fichier_struct.eof()){
+            bool is_first = true;
             getline(fichier_struct,ligne_struct);
             if (ligne_struct[0] != '>' && !ligne_struct.empty()){
-                    
-                kmer_struct = ligne_struct.substr(0, k);
-                kmer_int = str2numstrand(kmer_struct);
-                kmer_hash = revhash(kmer_int) % bitvector.size();
-                auto rank_hash = bitvector.rank(kmer_hash, *rs_idx);
-                kmer_app[rank_hash-1] ++;
-                for (long unsigned int i=k; i<ligne_struct.length(); i++){
-                    kmer_int = shiftnucl(kmer_int, ligne_struct[i], k);
-                    kmer_hash = revhash(kmer_int) % bitvector.size();
+                uint16_t i = k;
+                while(i<ligne_struct.length()){
+                    if(is_first){
+                        kmer_struct = ligne_struct.substr(0, k);
+                        kmer_int = str2numstrand(kmer_struct);
+                        revComp_int = str2numstrand(revcomp(kmer_struct));
+                        kmer_hash = find_canonical(kmer_int, revComp_int, bvsize);
+                        is_first = false;
+                    }
+                    else{
+                        kmer_int = shiftnucl(kmer_int, ligne_struct[i], k);
+                        revComp_int = shiftnucl_revcomp(revComp_int, ligne_struct[i], k);
+                        kmer_hash = find_canonical(kmer_int, revComp_int, bvsize);
+                        i++;
+                    }
                     auto rank_hash = bitvector.rank(kmer_hash, *rs_idx);
                     kmer_app[rank_hash-1] ++;
-                }  
+                    } 
+                }
             }
         }
-    }
 
     fichier_struct.close();
     auto end_struct = high_resolution_clock::now();
     auto querying_struct = duration_cast<std::chrono::seconds>(end_struct - start_struct);
     cout << "ok struct : " << querying_struct.count() << " seconds."  << endl;
 
+    // VECTOR WITH START POSITIONS FOR READS
+
+    uint32_t total_count = 0;
     for(vector<uint32_t>::iterator i = kmer_app.begin() ; i != kmer_app.end(); i++){
         total_count += *i;
         vect_pos.push_back(total_count);
     }
     cout << "ok vectpos" << endl;
-
+    
     index.resize(total_count);
+    uint cpt = 0;
         
     auto start_index = high_resolution_clock::now();
     uint32_t num_read_index = 0;
@@ -287,30 +151,26 @@ sparse_vector_u32 Index::index_fasta_rankselect(const string& read_file, uint16_
     if(fichier_index){
         string ligne_index, kmer_index;
         while(!fichier_index.eof()){
+            bool is_first = true;
             getline(fichier_index,ligne_index);
             if (ligne_index[0] != '>' && !ligne_index.empty()){
+                uint16_t i = k;
                 num_read_index ++;
 
-                kmer_index = ligne_index.substr(0, k);
-                kmer_int = str2numstrand(kmer_index);
-                kmer_hash = revhash(kmer_int) % bitvector.size();
-                auto rank_hash = bitvector.rank(kmer_hash, *rs_idx);
-                uint32_t start_pos = vect_pos[rank_hash - 1];
-                uint32_t length_max = vect_pos[rank_hash] - vect_pos[rank_hash-1];
-
-                bool present = false;
-                uint tmp = start_pos;
-                while(tmp<start_pos+length_max && !present){
-                    if(index[tmp] == false){
-                        index[tmp] = num_read_index;
-                        present = true; 
+                while(i<ligne_index.length()){
+                    if(is_first){
+                        kmer_index = ligne_index.substr(0, k);
+                        kmer_int = str2numstrand(kmer_index);
+                        revComp_int = str2numstrand(revcomp(kmer_index));
+                        kmer_hash = find_canonical(kmer_int, revComp_int, bvsize);
+                        is_first = false;
+                    }
+                    else{
+                        kmer_int = shiftnucl(kmer_int, ligne_index[i], k);
+                        revComp_int = shiftnucl_revcomp(revComp_int, ligne_index[i], k);
+                        kmer_hash = find_canonical(kmer_int, revComp_int, bvsize);
+                        i++;
                     }
-                    tmp++;
-                }
-
-                for (long unsigned int i=k; i<ligne_index.length(); i++){
-                    kmer_int = shiftnucl(kmer_int, ligne_index[i], k);
-                    kmer_hash = revhash(kmer_int) % bitvector.size();
                     auto rank_hash = bitvector.rank(kmer_hash, *rs_idx);
                     uint32_t start_pos = vect_pos[rank_hash - 1];
                     uint32_t length_max = vect_pos[rank_hash] - vect_pos[rank_hash-1];
@@ -338,61 +198,10 @@ sparse_vector_u32 Index::index_fasta_rankselect(const string& read_file, uint16_
 
 
 
-
-
-
-
-// vector<uint32_t> Index::query_kmer(const rh_umap& index, const uint64_t& kmer){
-
-//     /**
-//      * @brief Query a k-mer in an index structure.
-//      * @param kmer (string) the k-mer we are searching for.
-//      * @return (vector<int>) the list of reads in which the k-mer appears.
-//      */
-
-//     try{
-//         vector<uint32_t> reads = *index.at(kmer);
-//         vector_decompression(reads);
-//         return reads;
-//     }
-    
-//     catch(const out_of_range& err) {
-//         cerr << "K-mer not found." << '\n';
-//         vector<uint32_t> v;
-//         return v;
-//     }
-    
-// }
-
-
-
-// vector<uint32_t> Index::query_kmer_mphf(const index_vector& index_vect, const uint64_t& kmer){
-//     uint64_t kmer_hash = revhash(kmer);
-//     uint64_t kmer_mphf = bphf_it->lookup(kmer_hash);
-
-//     kmer_to_mphf kmer_int_to_mphf; // association between kmer int (hashed) and its mphf value
-//     kmer_int_to_mphf.kmer_int = kmer_hash;
-//     kmer_int_to_mphf.kmer_mphf = kmer_mphf;
-
-//     if (kmer_mphf != ULLONG_MAX){ // if the kmer exists
-//         auto it = find(val_to_pos.begin(), val_to_pos.end(), kmer_int_to_mphf);
-//         uint indice = it - val_to_pos.begin();
-//         return *index_vect[indice];
-//     }
-//     else{
-//         cout << "K-mer not found." << '\n';
-//         vector<uint32_t> v;
-//         return v;
-//     }
-    
-//    vector<uint32_t> v;
-//         return v;
-// }
-
-
-
-vector<uint32_t> Index::query_kmer_rankselect(const sparse_vector_u32& index, const uint64_t& kmer){
-    uint64_t kmer_hash = revhash(kmer) % bitvector.size();
+vector<uint32_t> Index::query_kmer_rankselect(const vector<uint32_t>& index, const string& kmer){
+    uint32_t kmer_int = str2numstrand(kmer);
+    uint32_t revComp_int = str2numstrand(revcomp(kmer));
+    uint64_t kmer_hash = find_canonical(kmer_int, revComp_int, bitvector.size());
     uint length = 0;
     uint start = 0;
     if (bitvector[kmer_hash] == 1){ // if the kmer exists
@@ -416,92 +225,14 @@ vector<uint32_t> Index::query_kmer_rankselect(const sparse_vector_u32& index, co
 
 
 
-// vect_occ_read Index::query_sequence(const rh_umap& index, const string& sequence){
-
-//     /**
-//      * @brief Query a read in an index structure. The read will be first divided into k-mers.
-//      * @param read (string) the read we'll divide in k-mers and search for in the index.
-//      * @return (vector<int, vector<int>>) the list of reads in which the k-mers appears and their number of occurences.
-//      */
-
-//     vect_occ_read res;
-//     unordered_map<uint32_t, uint32_t> tmp_reads;
-//     for (long unsigned int i=0; i<=sequence.length()-this->k; i++){
-//         string kmer = sequence.substr(i, this->k);
-//         uint64_t kmer_int = str2numstrand(kmer);
-//         vector<uint32_t> reads_of_kmer = query_kmer(index, kmer_int);
-//         for (auto& idRead : reads_of_kmer){
-//             if(tmp_reads.find(idRead) != tmp_reads.end()){
-//                 tmp_reads[idRead] ++;
-//             }
-//             else{
-//                 tmp_reads[idRead] = 1;
-//             }
-//         }
-//     }
-
-//     for (auto const &pair: tmp_reads){
-//         occ_in_read structRead;
-//         structRead.read_id = pair.first;
-//         structRead.total_count = pair.second;
-//         res.push_back(structRead);
-//     }
-
-//     return res;
-// }
-
-
-
-// vect_occ_read Index::query_sequence_mphf(const index_vector& index_vect, const string& sequence){
-
-//     /**
-//      * @brief Query a read in an index structure. The read will be first divided into k-mers.
-//      * @param read (string) the read we'll divide in k-mers and search for in the index.
-//      * @return (vector<int, vector<int>>) the list of reads in which the k-mers appears and their number of occurences.
-//      */
-
-//     vect_occ_read res;
-//     unordered_map<uint32_t, uint32_t> tmp_reads;
-//     for (long unsigned int i=0; i<=sequence.length()-this->k; i++){
-//         string kmer = sequence.substr(i, this->k);
-//         uint64_t kmer_int = str2numstrand(kmer);
-//         vector<uint32_t> reads_of_kmer = query_kmer_mphf(index_vect, kmer_int);
-//         for (auto& idRead : reads_of_kmer){
-//             if(tmp_reads.find(idRead) != tmp_reads.end()){
-//                 tmp_reads[idRead] ++;
-//             }
-//             else{
-//                 tmp_reads[idRead] = 1;
-//             }
-//         }
-//     }
-
-//     for (auto const &pair: tmp_reads){
-//         occ_in_read structRead;
-//         structRead.read_id = pair.first;
-//         structRead.total_count = pair.second;
-//         res.push_back(structRead);
-//     }
-
-//     return res;
-// }
-
-
-
-vect_occ_read Index::query_sequence_rankselect(const sparse_vector_u32& index, const string& sequence){
-
-    /**
-     * @brief Query a read in an index structure. The read will be first divided into k-mers.
-     * @param read (string) the read we'll divide in k-mers and search for in the index.
-     * @return (vector<int, vector<int>>) the list of reads in which the k-mers appear and their number of occurences.
-     */
+vect_occ_read Index::query_sequence_rankselect(const vector<uint32_t>& index, const string& sequence){
 
     vect_occ_read res;
     unordered_map<uint32_t, uint32_t> tmp_reads;
+    bool is_first = true;
     for (long unsigned int i=0; i<=sequence.length()-this->k; i++){
         string kmer = sequence.substr(i, this->k);
-        uint64_t kmer_int = str2numstrand(kmer);
-        vector<uint32_t> reads_of_kmer = query_kmer_rankselect(index, kmer_int);
+        vector<uint32_t> reads_of_kmer = query_kmer_rankselect(index, kmer);
         for (auto& idRead : reads_of_kmer){
             if(tmp_reads.find(idRead) != tmp_reads.end()){
                 tmp_reads[idRead] ++;
@@ -524,109 +255,8 @@ vect_occ_read Index::query_sequence_rankselect(const sparse_vector_u32& index, c
 
 
 
-// vect_occ_read Index::query_fasta(const rh_umap& index, const string& filename){
 
-//     /**
-//      * @brief Search, for all the reads in the file, and for all their kmers, in which read they appear.
-//      * @param filename (string) the file containing the reads.
-//      * @return (vector<vector<occ_in_read>>) for each read, the list of reads in which the kmers appear.
-//      */
-    
-//     vect_occ_read res;
-//     unordered_map<uint32_t, uint32_t> tmp_reads;
-
-//     ifstream fichier(filename, ios::in);
-
-//     if(fichier){   
-//         string ligne;
-//         while( !fichier.eof()){
-//             getline(fichier,ligne);
-//             if (ligne[0] != '>' && !ligne.empty()){
-//                 vect_occ_read occ_read = query_sequence(index, ligne);
-//                 for (auto& struct_read : occ_read){
-//                     int read_id = struct_read.read_id;
-//                     int total_count = struct_read.total_count;
-//                     if(tmp_reads.find(read_id) != tmp_reads.end()){
-//                         tmp_reads[read_id] += total_count;
-//                     }
-//                     else{
-//                         tmp_reads[read_id] = total_count;
-//                     }
-//                 }
-//             }
-//         }
-//         for (auto const &pair: tmp_reads){
-//             occ_in_read structRead;
-//             structRead.read_id = pair.first;
-//             structRead.total_count = pair.second;
-//             res.push_back(structRead);
-//         }
-//     fichier.close();
-//     return res;
-//     }
-//     else{
-//         cerr << "Error opening the file." << endl;
-//         return res;
-//     }
-// }
-
-
-
-// vect_occ_read Index::query_fasta_mphf(const index_vector& index_vect, const string& filename){
-
-//     /**
-//      * @brief Search, for all the reads in the file, and for all their kmers, in which read they appear.
-//      * @param filename (string) the file containing the reads.
-//      * @return (vector<vector<occ_in_read>>) for each read, the list of reads in which the kmers appear.
-//      */
-    
-//     vect_occ_read res;
-//     unordered_map<uint32_t, uint32_t> tmp_reads;
-
-//     ifstream fichier(filename, ios::in);
-
-//     if(fichier){   
-//         string ligne;
-//         while( !fichier.eof()){
-//             getline(fichier,ligne);
-//             if (ligne[0] != '>' && !ligne.empty()){
-//                 vect_occ_read occ_read = query_sequence_mphf(index_vect, ligne);
-//                 for (auto& struct_read : occ_read){
-//                     int read_id = struct_read.read_id;
-//                     int total_count = struct_read.total_count;
-//                     if(tmp_reads.find(read_id) != tmp_reads.end()){
-//                         tmp_reads[read_id] += total_count;
-//                     }
-//                     else{
-//                         tmp_reads[read_id] = total_count;
-//                     }
-//                 }
-//             }
-//         }
-//         for (auto const &pair: tmp_reads){
-//             occ_in_read structRead;
-//             structRead.read_id = pair.first;
-//             structRead.total_count = pair.second;
-//             res.push_back(structRead);
-//         }
-//     fichier.close();
-//     return res;
-//     }
-//     else{
-//         cerr << "Error opening the file." << endl;
-//         return res;
-//     }
-// }
-
-
-
-vect_occ_read Index::query_fasta_rankselect(const sparse_vector_u32& index, const string& filename){
-
-    /**
-     * @brief Search, for all the reads in the file, and for all their kmers, in which read they appear.
-     * @param filename (string) the file containing the reads.
-     * @return (vector<vector<occ_in_read>>) for each read, the list of reads in which the kmers appear.
-     */
+vect_occ_read Index::query_fasta_rankselect(const vector<uint32_t>& index, const string& filename){
     
     vect_occ_read res;
     unordered_map<uint32_t, uint32_t> tmp_reads;
@@ -667,57 +297,16 @@ vect_occ_read Index::query_fasta_rankselect(const sparse_vector_u32& index, cons
 }
 
 
-
-// void Index::write_index(const rh_umap& index, const string& output_file){
-
-//     /**
-//      * @brief Query a k-mer in an index structure.
-//      * @param kmer (string) the k-mer we are searching for.
-//      * @return (vector<int>) the list of reads in which the k-mer appears.
-//      */
-//     ofstream outp(output_file);
-//     if (outp){
-//         for (auto const &pair: index) {
-//             outp << pair.first << ": ";
-//             for (int i : *pair.second){
-//                 outp << i << ","; 
-//             }
-//         outp << "\n";
-//         }
-//     }
-
-//     outp.close();
-// }
-
-
-
-// void Index::write_index_mphf(const index_vector& index_vect, const string& output_file){
-
-   
-//     ofstream outp(output_file);
-//     if (outp){
-//         for(unsigned int i = 0; i < index_vect.size(); i++) {
-//             uint64_t kmer_hashed = int_to_mphf[i].kmer_int;
-//             uint64_t kmer_int = unrevhash(kmer_hashed);
-
-//             outp << kmer_int << ": ";
-//             for (auto read : *index_vect[i]){
-//                 outp << read << ","; 
-//             }
-//             outp << "\n";
-//         }
-//     }
-//     outp.close();
-// }
-
-
-void Index::write_index_rankselect(const sparse_vector_u32& index, const string& output_file){
+void Index::write_index_rankselect(const vector<uint32_t>& index, const string& output_file){
 
     ofstream outp(output_file);
+    cout << "Writing the file..." << endl;
     if (outp){
         uint rank, pos_begin, length;
         uint kmer_hash = bitvector.get_first();
-        while(kmer_hash != 0){
+        bool first = true;
+
+        while(kmer_hash != 0 || first){
             rank = bitvector.rank(kmer_hash, *rs_idx);
             pos_begin = vect_pos[rank-1];
             length = vect_pos[rank] - vect_pos[rank-1];
@@ -726,10 +315,9 @@ void Index::write_index_rankselect(const sparse_vector_u32& index, const string&
                 outp << index[i] << ",";
             }
             outp << "\n";
-
+            first = false;
             kmer_hash = bitvector.get_next(kmer_hash);
         }
-        cout << "File created. " << endl;
     }
     outp.close();
 }
diff --git a/sources/main.cpp b/sources/main.cpp
index 331047e..912add2 100644
--- a/sources/main.cpp
+++ b/sources/main.cpp
@@ -12,7 +12,6 @@
 #include "../headers/index.h"
 #include "../headers/utils.h"
 #include "../headers/bloomfilter.h"
-//#include "bool_iterator.cpp"
 
 using namespace std;
 using namespace chrono;
@@ -26,13 +25,18 @@ int main(int argc, char *argv[])
 
     else{
 
+        /*transform_file("TESTS/index_phage.txt", "TESTS/transform_phage.txt");
+        transform_file("TESTS/phage_index_rankselect.txt", "TESTS/transform_phage_rank.txt");
+
+        cout << compare_files("TESTS/transform_rankselect_myco_sorted.txt", "TESTS/transform_rh_myco_sorted.txt");*/
+
         std::string filename(argv[1]);
         uint16_t k = stoi(argv[2]);
         float error_rate = stof(argv[3]);
         
         Index index = Index(filename, k);
         auto start_indexing = high_resolution_clock::now();
-        sparse_vector_u32 read_vector = index.index_fasta_rankselect(filename, k);
+        vector<uint32_t> read_vector = index.index_fasta_rankselect(filename, k);
         auto end_indexing = high_resolution_clock::now();
         auto indexing = duration_cast<seconds>(end_indexing - start_indexing);
         double pct_unique_kmers = (double)index.uniqueKmers()/index.get_nb_kmers()*100;
@@ -43,7 +47,7 @@ int main(int argc, char *argv[])
         uint64_t memory_used_index = getMemorySelfMaxUsed();
         cout << "Resource usage (indexing) : " << intToString(memory_used_index) << " Ko" << endl;
         cout << "Resource usage (for 1 k-mer) : " << koToBit((double)memory_used_index/index.get_nb_kmers()) << " Bits" << endl;
-        index.write_index_rankselect(read_vector, "output_files/index_rankselect.txt");
+        index.write_index_rankselect(read_vector, "output_files/phage_index_rankselect_order.txt");
 
         int opt;
 
@@ -56,10 +60,9 @@ int main(int argc, char *argv[])
 
                         // QUERYING KMER
                     string kmer_to_search = seq_from_fasta(optarg);
-                    uint64_t kmer_int = str2numstrand(kmer_to_search);
 
                     auto start_querying_kmer = high_resolution_clock::now();
-                    vector<uint32_t> res_query_kmer = index.query_kmer_rankselect(read_vector, kmer_int);
+                    vector<uint32_t> res_query_kmer = index.query_kmer_rankselect(read_vector, kmer_to_search);
                     if(res_query_kmer.empty()){
                         cout << "The index does not contain this k-mer.";
                     }
diff --git a/sources/utils.cpp b/sources/utils.cpp
index d42c374..1edaec7 100644
--- a/sources/utils.cpp
+++ b/sources/utils.cpp
@@ -166,20 +166,6 @@ void write_values(string output_file, float val1, const string& val2){
 
 
 
-kmer hash64shift(kmer key) {
-	key = (~key) + (key << 21); // key = (key << 21) - key - 1;
-	key = key ^ (key >> 24);
-	key = (key + (key << 3)) + (key << 8); // key * 265
-	key = key ^ (key >> 14);
-	key = (key + (key << 2)) + (key << 4); // key * 21
-	key = key ^ (key >> 28);
-	key = key + (key << 31);
-	return key;
-}
-
-
-
-
 uint32_t revhash(uint32_t x) {
 	x = ((x >> 16) ^ x) * 0x2c1b3c6d;
 	x = ((x >> 16) ^ x) * 0x297a2d39;
@@ -204,4 +190,100 @@ uint64_t shiftnucl(uint64_t val, char c, uint16_t k) {
   val = val + str2numstrand(string(1,c));
   uint mask = pow(2,(k*2))-1;
   return val & mask;
-}
\ No newline at end of file
+}
+
+
+char revCompChar(const char& c){
+  switch (c){
+      case 'A': return 'T';
+      case 'C': return 'G';
+      case 'G': return 'C';
+    }
+    return 'A';
+}
+
+
+
+string revcomp(const string& s){
+  string res(s.size(), 0);
+  for(int i((int)s.length() - 1); i >= 0; i--){
+    res[s.size()-1-i] = revCompChar(s[i]);
+  }
+  return res;
+}
+
+
+
+
+uint64_t shiftnucl_revcomp(uint64_t val, const char& c, uint16_t k) {
+	val = val >> 2;
+  return val + (str2numstrand(string(1,revCompChar(c))) << ((k-1)*2));
+}
+
+
+
+
+uint64_t min_kmer(uint64_t kmer1, uint64_t kmer2){
+  if(kmer1<kmer2){
+    return kmer1;
+  }
+  return kmer2;
+}
+
+
+
+uint64_t modulo(uint64_t nb, uint32_t m){
+  return nb & (m-1);
+}
+
+
+
+uint64_t find_canonical(uint64_t kmer_int, uint64_t revComp_int, uint64_t bitvector_size){
+  return modulo(revhash(min_kmer(kmer_int, revComp_int)), bitvector_size);
+}
+
+
+
+void transform_file(const string& fich1, const string& fich2){
+  
+  ifstream fichier1(fich1, ios::in);
+  ofstream outp(fich2);
+  if(outp){
+    string ligne;
+    while(!fichier1.eof()){
+      getline(fichier1,ligne);
+      ligne.erase(0, ligne.find(":")+1);
+      ligne.erase(std::remove_if(ligne.begin(), ligne.end(), ::isspace), ligne.end());
+      outp << ligne << "\n";
+    }
+  }
+  fichier1.close();
+  outp.close();
+}
+
+
+
+string compare_files(const string& file1, const string& file2){
+  string res = "";
+  int cpt = 1;
+  ifstream fichier1(file1, ios::in);
+  ifstream fichier2(file2, ios::in);
+
+  if(fichier1 && fichier2){
+    string ligne1, ligne2;
+    while(!fichier1.eof() && !fichier2.eof()){
+      getline(fichier1,ligne1);
+      getline(fichier2,ligne2);
+
+      if(ligne1 != ligne2){
+        res += std::to_string(cpt);
+        res += " ";
+      }
+      cpt++;
+    }
+  }
+  return res;
+}
+
+
+
-- 
GitLab