Skip to content
Snippets Groups Projects
Commit c23ff7bb authored by Vandamme Léa's avatar Vandamme Léa
Browse files

version using rank bitvectors/rank ok

parent bec6f40e
No related branches found
No related tags found
No related merge requests found
...@@ -19,28 +19,12 @@ struct occ_in_read { ...@@ -19,28 +19,12 @@ struct occ_in_read {
struct kmer_to_mphf {
uint64_t kmer_int;
uint64_t kmer_mphf;
public :
bool operator== (const kmer_to_mphf& item){
if (item.kmer_int == kmer_int && item.kmer_mphf == kmer_mphf){
return true;
}
return false;
}
};
struct rank_kmer { struct rank_kmer {
uint64_t rank; uint64_t rank;
uint32_t nb_apparition; uint32_t nb_apparition;
}; };
typedef robin_hood::unordered_map<uint32_t, vector<uint32_t>*> rh_umap; typedef robin_hood::unordered_map<uint32_t, vector<uint32_t>*> rh_umap;
typedef robin_hood::unordered_map<uint32_t, uint8_t> rh_umap_test;
typedef vector<occ_in_read> vect_occ_read; typedef vector<occ_in_read> vect_occ_read;
typedef uint64_t kmer; typedef uint64_t kmer;
typedef vector<vector<uint32_t>*> index_vector; typedef vector<vector<uint32_t>*> index_vector;
...@@ -62,21 +46,11 @@ class Index{ ...@@ -62,21 +46,11 @@ class Index{
void set_nb_kmers(uint32_t nb_kmers); void set_nb_kmers(uint32_t nb_kmers);
uint32_t get_nb_kmers(); uint32_t get_nb_kmers();
rh_umap index_fasta_rh(const string& filename, uint16_t k); vector<uint32_t> index_fasta_rankselect(const string& read_file, uint16_t k);
index_vector index_fasta_rh_mpfh(const string& filename, uint16_t k, BloomFilter& bf); vect_occ_read query_sequence_rankselect(const vector<uint32_t>& index, const string& sequence);
sparse_vector_u32 index_fasta_rankselect(const string& read_file, uint16_t k); vector<uint32_t> query_kmer_rankselect(const vector<uint32_t>& index_vect, const string& kmer);
vect_occ_read query_sequence(const rh_umap& index, const string& sequence); vect_occ_read query_fasta_rankselect(const vector<uint32_t>& index_vect, const string& filename);
vect_occ_read query_sequence_mphf(const index_vector& index, const string& sequence); void write_index_rankselect(const vector<uint32_t>& read_vector, const string& output_file);
vect_occ_read query_sequence_rankselect(const sparse_vector_u32& index, const string& sequence);
vector<uint32_t> query_kmer(const rh_umap& index, const uint64_t& kmer);
vector<uint32_t> query_kmer_mphf(const index_vector& index, const uint64_t& kmer);
vector<uint32_t> query_kmer_rankselect(const sparse_vector_u32& index_vect, const uint64_t& kmer);
vect_occ_read query_fasta(const rh_umap& index, const string& filename);
vect_occ_read query_fasta_mphf(const index_vector& index, const string& filename);
vect_occ_read query_fasta_rankselect(const sparse_vector_u32& index_vect, const string& filename);
void write_index(const rh_umap& index, const string& output_file);
void write_index_mphf(const index_vector& index_vect, const string& output_file);
void write_index_rankselect(const sparse_vector_u32& read_vector, const string& output_file);
uint32_t uniqueKmers(); uint32_t uniqueKmers();
}; };
......
...@@ -25,6 +25,14 @@ kmer hash64shift(kmer key); ...@@ -25,6 +25,14 @@ kmer hash64shift(kmer key);
uint32_t revhash(uint32_t x); uint32_t revhash(uint32_t x);
uint32_t unrevhash(uint32_t x); uint32_t unrevhash(uint32_t x);
uint64_t nuc2int(char c); uint64_t nuc2int(char c);
char revCompChar(const char& c);
uint64_t shiftnucl (uint64_t kmerint, char c, uint16_t k); uint64_t shiftnucl (uint64_t kmerint, char c, uint16_t k);
uint64_t shiftnucl_revcomp(uint64_t val, const char& c, uint16_t k);
string revcomp(const string& s);
uint64_t min_kmer(uint64_t kmer1, uint64_t kmer2);
uint64_t find_canonical(uint64_t kmer_int, uint64_t revComp_int, uint64_t bitvector_size);
uint64_t modulo(uint64_t nb, uint32_t m);
void transform_file(const string& fich1, const string& fich2);
string compare_files(const string& file1, const string& file2);
#endif #endif
\ No newline at end of file
This diff is collapsed.
...@@ -12,7 +12,6 @@ ...@@ -12,7 +12,6 @@
#include "../headers/index.h" #include "../headers/index.h"
#include "../headers/utils.h" #include "../headers/utils.h"
#include "../headers/bloomfilter.h" #include "../headers/bloomfilter.h"
//#include "bool_iterator.cpp"
using namespace std; using namespace std;
using namespace chrono; using namespace chrono;
...@@ -26,13 +25,18 @@ int main(int argc, char *argv[]) ...@@ -26,13 +25,18 @@ int main(int argc, char *argv[])
else{ else{
/*transform_file("TESTS/index_phage.txt", "TESTS/transform_phage.txt");
transform_file("TESTS/phage_index_rankselect.txt", "TESTS/transform_phage_rank.txt");
cout << compare_files("TESTS/transform_rankselect_myco_sorted.txt", "TESTS/transform_rh_myco_sorted.txt");*/
std::string filename(argv[1]); std::string filename(argv[1]);
uint16_t k = stoi(argv[2]); uint16_t k = stoi(argv[2]);
float error_rate = stof(argv[3]); float error_rate = stof(argv[3]);
Index index = Index(filename, k); Index index = Index(filename, k);
auto start_indexing = high_resolution_clock::now(); auto start_indexing = high_resolution_clock::now();
sparse_vector_u32 read_vector = index.index_fasta_rankselect(filename, k); vector<uint32_t> read_vector = index.index_fasta_rankselect(filename, k);
auto end_indexing = high_resolution_clock::now(); auto end_indexing = high_resolution_clock::now();
auto indexing = duration_cast<seconds>(end_indexing - start_indexing); auto indexing = duration_cast<seconds>(end_indexing - start_indexing);
double pct_unique_kmers = (double)index.uniqueKmers()/index.get_nb_kmers()*100; double pct_unique_kmers = (double)index.uniqueKmers()/index.get_nb_kmers()*100;
...@@ -43,7 +47,7 @@ int main(int argc, char *argv[]) ...@@ -43,7 +47,7 @@ int main(int argc, char *argv[])
uint64_t memory_used_index = getMemorySelfMaxUsed(); uint64_t memory_used_index = getMemorySelfMaxUsed();
cout << "Resource usage (indexing) : " << intToString(memory_used_index) << " Ko" << endl; cout << "Resource usage (indexing) : " << intToString(memory_used_index) << " Ko" << endl;
cout << "Resource usage (for 1 k-mer) : " << koToBit((double)memory_used_index/index.get_nb_kmers()) << " Bits" << endl; cout << "Resource usage (for 1 k-mer) : " << koToBit((double)memory_used_index/index.get_nb_kmers()) << " Bits" << endl;
index.write_index_rankselect(read_vector, "output_files/index_rankselect.txt"); index.write_index_rankselect(read_vector, "output_files/phage_index_rankselect_order.txt");
int opt; int opt;
...@@ -56,10 +60,9 @@ int main(int argc, char *argv[]) ...@@ -56,10 +60,9 @@ int main(int argc, char *argv[])
// QUERYING KMER // QUERYING KMER
string kmer_to_search = seq_from_fasta(optarg); string kmer_to_search = seq_from_fasta(optarg);
uint64_t kmer_int = str2numstrand(kmer_to_search);
auto start_querying_kmer = high_resolution_clock::now(); auto start_querying_kmer = high_resolution_clock::now();
vector<uint32_t> res_query_kmer = index.query_kmer_rankselect(read_vector, kmer_int); vector<uint32_t> res_query_kmer = index.query_kmer_rankselect(read_vector, kmer_to_search);
if(res_query_kmer.empty()){ if(res_query_kmer.empty()){
cout << "The index does not contain this k-mer."; cout << "The index does not contain this k-mer.";
} }
......
...@@ -166,20 +166,6 @@ void write_values(string output_file, float val1, const string& val2){ ...@@ -166,20 +166,6 @@ void write_values(string output_file, float val1, const string& val2){
kmer hash64shift(kmer key) {
key = (~key) + (key << 21); // key = (key << 21) - key - 1;
key = key ^ (key >> 24);
key = (key + (key << 3)) + (key << 8); // key * 265
key = key ^ (key >> 14);
key = (key + (key << 2)) + (key << 4); // key * 21
key = key ^ (key >> 28);
key = key + (key << 31);
return key;
}
uint32_t revhash(uint32_t x) { uint32_t revhash(uint32_t x) {
x = ((x >> 16) ^ x) * 0x2c1b3c6d; x = ((x >> 16) ^ x) * 0x2c1b3c6d;
x = ((x >> 16) ^ x) * 0x297a2d39; x = ((x >> 16) ^ x) * 0x297a2d39;
...@@ -205,3 +191,99 @@ uint64_t shiftnucl(uint64_t val, char c, uint16_t k) { ...@@ -205,3 +191,99 @@ uint64_t shiftnucl(uint64_t val, char c, uint16_t k) {
uint mask = pow(2,(k*2))-1; uint mask = pow(2,(k*2))-1;
return val & mask; return val & mask;
} }
char revCompChar(const char& c){
switch (c){
case 'A': return 'T';
case 'C': return 'G';
case 'G': return 'C';
}
return 'A';
}
string revcomp(const string& s){
string res(s.size(), 0);
for(int i((int)s.length() - 1); i >= 0; i--){
res[s.size()-1-i] = revCompChar(s[i]);
}
return res;
}
uint64_t shiftnucl_revcomp(uint64_t val, const char& c, uint16_t k) {
val = val >> 2;
return val + (str2numstrand(string(1,revCompChar(c))) << ((k-1)*2));
}
uint64_t min_kmer(uint64_t kmer1, uint64_t kmer2){
if(kmer1<kmer2){
return kmer1;
}
return kmer2;
}
uint64_t modulo(uint64_t nb, uint32_t m){
return nb & (m-1);
}
uint64_t find_canonical(uint64_t kmer_int, uint64_t revComp_int, uint64_t bitvector_size){
return modulo(revhash(min_kmer(kmer_int, revComp_int)), bitvector_size);
}
void transform_file(const string& fich1, const string& fich2){
ifstream fichier1(fich1, ios::in);
ofstream outp(fich2);
if(outp){
string ligne;
while(!fichier1.eof()){
getline(fichier1,ligne);
ligne.erase(0, ligne.find(":")+1);
ligne.erase(std::remove_if(ligne.begin(), ligne.end(), ::isspace), ligne.end());
outp << ligne << "\n";
}
}
fichier1.close();
outp.close();
}
string compare_files(const string& file1, const string& file2){
string res = "";
int cpt = 1;
ifstream fichier1(file1, ios::in);
ifstream fichier2(file2, ios::in);
if(fichier1 && fichier2){
string ligne1, ligne2;
while(!fichier1.eof() && !fichier2.eof()){
getline(fichier1,ligne1);
getline(fichier2,ligne2);
if(ligne1 != ligne2){
res += std::to_string(cpt);
res += " ";
}
cpt++;
}
}
return res;
}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment