Skip to content
Snippets Groups Projects
Commit 5da2c5e9 authored by Salson Mikael's avatar Salson Mikael
Browse files

Add class to deal with multiple sequences

parent 4789e783
No related branches found
No related tags found
No related merge requests found
#ifndef MULTICOVER_HPP
#define MULTICOVER_HPP
#include "coverind.hpp"
#include <string>
#include <iostream>
#include <cstdlib>
#include <sdsl/bit_vectors.hpp>
#include <algorithm>
#include <fstream>
#include <exception>
#include "lib/gzstream.h"
extern "C" {
#include <htslib/hts.h>
#include <htslib/sam.h>
}
template<class bv>
class MultiCover {
private:
size_t nb_targets;
std::string *target_names;
Coverind<bv> **covers;
public:
/**
* Builds a coverage structure either from an indexed BAM file
* or from an already built structure, stored in files
* @param filename: filename of the BAM or basename of the index
* @param load: loads the index iff true
*/
MultiCover(const std::string filename, bool load = false);
~MultiCover();
/**
* @return the coverage structure from one chromosome, given its index
*/
Coverind<bv> &getChrCoverage(size_t i);
/**
* @return the coverage structure from one chromosome, given its name
*/
Coverind<bv> &getChrCoverage(const std::string chr);
/**
* @return the chromosome index given its name
*/
size_t getChrIndex(const std::string chr);
/**
* @return the name of the i-th chromosome
*/
std::string getChrName(size_t i) const;
/**
* @return the number of chromosomes
*/
size_t getNbChromosomes() const;
/**
* Generate a compressed BED file with the number of reads covering
* all the genome positions
*/
void generateBED(size_t range_size, const std::string bed) const;
/**
* Save the index
*/
void save(std::string filename);
/**
* Spaced used by all the bit vectors
*/
uint64_t size_in_bytes() const;
private:
void loadBam(const std::string filename);
void loadIndex(const std::string filename);
};
template<class bv>
MultiCover<bv>::MultiCover(const std::string filename, bool load) {
if (! load) {
loadBam(filename);
} else {
loadIndex(filename);
}
}
template <class bv>
MultiCover<bv>::~MultiCover() {
delete [] target_names;
for (size_t i = 0; i < nb_targets; i++)
delete covers[i];
free(covers);
}
template <class bv>
Coverind<bv> &MultiCover<bv>::getChrCoverage(size_t i) {
if (i >= nb_targets)
throw std::invalid_argument("There are only "+std::to_string(nb_targets)+" chromosomes");
return *covers[i];
}
template <class bv>
Coverind<bv> &MultiCover<bv>::getChrCoverage(const std::string chr) {
return getChrCoverage(getChrIndex(chr));
}
template <class bv>
std::string MultiCover<bv>::getChrName(size_t i) const {
if (i >= nb_targets)
throw std::invalid_argument("There are only "+std::to_string(nb_targets)+" chromosomes");
return target_names[i];
}
template <class bv>
size_t MultiCover<bv>::getChrIndex(const std::string chr) {
for (size_t i = 0; i < nb_targets; i++)
if (target_names[i] == chr)
return i;
throw new std::invalid_argument("No chromosome called "+chr);
}
template <class bv>
size_t MultiCover<bv>::getNbChromosomes() const {
return nb_targets;
}
template <class bv>
void MultiCover<bv>::generateBED(size_t range_size, const std::string bed) const {
ogzstream output(std::string(bed).c_str());
for (size_t chr_idx = 0; chr_idx < nb_targets; chr_idx++) {
size_t size_chr = covers[chr_idx]->nb_genome_positions();
for (size_t i = 0; i < size_chr - range_size + 1; i += range_size) {
output << target_names[chr_idx] << "\t" << i << "\t"
<< i+range_size << "\t"
<< covers[chr_idx]->count_reads_between(i, i + range_size - 1)
<< std::endl;
}
}
output.close();
}
template <class bv>
void MultiCover<bv>::save(const std::string filename) {
std::ofstream conf(filename+".conf");
conf << nb_targets << std::endl;
for (size_t i = 0; i < nb_targets; i++) {
conf << target_names[i] << std::endl;
// TODO: protect filename
covers[i]->save(filename+"."+target_names[i]);
}
conf.close();
}
template <class bv>
uint64_t MultiCover<bv>::size_in_bytes() const {
uint64_t size = 0;
for(size_t i = 0; i < nb_targets; i++) {
size += covers[i]->size_in_bytes();
}
return size;
}
template <class bv>
void MultiCover<bv>::loadBam(const std::string filename) {
htsFile *bam = hts_open(filename.c_str(), "r");
hts_idx_t *bam_idx = sam_index_load(bam, filename.c_str());
sam_hdr_t *bam_header = sam_hdr_read(bam);
bam1_t *read_entry = bam_init1();
nb_targets = bam_header->n_targets;
target_names = new std::string[nb_targets];
covers = (Coverind<bv> **)malloc(sizeof(Coverind<bv>*)*nb_targets);
if (! bam)
throw std::invalid_argument("Cannot open the BAM file");
if (! bam_idx)
throw std::invalid_argument("Cannot open index for the BAM file");
// Iterating over targets
for (size_t i = 0; i < nb_targets; i++) {
target_names[i] = std::string(bam_header->target_name[i]);
hts_itr_t *read_it = sam_itr_queryi(bam_idx, i, 0, bam_header->target_len[i]);
size_t nb_reads = 0;
std::vector<size_t> start_pos, end_pos;
if (read_it) {
while (sam_itr_next(bam, read_it, read_entry) > 0) {
// TODO: filter on flags supplementary, secondary and dup
// see read_entry->core.flag
start_pos.push_back(read_entry->core.pos);
end_pos.push_back(bam_endpos(read_entry) - 1);
nb_reads++;
}
}
// Create bits vectors for start and stop positions
bit_vector start_bv(nb_reads+bam_header->target_len[i]),
stop_bv(nb_reads+bam_header->target_len[i]);
std::sort(end_pos.begin(), end_pos.end());
for (size_t j = 0; j < start_pos.size(); j++) {
start_bv[j + start_pos[j]] = 1;
stop_bv[j + end_pos[j]] = 1;
}
covers[i] = new Coverind<bv>(start_bv, stop_bv);
}
sam_hdr_destroy(bam_header);
hts_idx_destroy(bam_idx);
hts_close(bam);
}
template <class bv>
void MultiCover<bv>::loadIndex(const std::string filename) {
std::ifstream conf(filename+".conf");
conf >> nb_targets;
target_names = new std::string[nb_targets];
covers = (Coverind<bv> **)malloc(sizeof(Coverind<bv>*)*nb_targets);
for (size_t i = 0; i < nb_targets; i++) {
conf >> target_names[i];
covers[i] = new Coverind<bv>(filename+"."+target_names[i]);
}
conf.close();
}
#endif
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment