Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
C
coverind
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Model registry
Operate
Environments
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
GitLab community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Salson Mikael
coverind
Commits
5da2c5e9
Commit
5da2c5e9
authored
Jan 16, 2020
by
Salson Mikael
Browse files
Options
Downloads
Patches
Plain Diff
Add class to deal with multiple sequences
parent
4789e783
No related branches found
No related tags found
No related merge requests found
Changes
1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
multicover.hpp
+229
-0
229 additions, 0 deletions
multicover.hpp
with
229 additions
and
0 deletions
multicover.hpp
0 → 100644
+
229
−
0
View file @
5da2c5e9
#ifndef MULTICOVER_HPP
#define MULTICOVER_HPP
#include
"coverind.hpp"
#include
<string>
#include
<iostream>
#include
<cstdlib>
#include
<sdsl/bit_vectors.hpp>
#include
<algorithm>
#include
<fstream>
#include
<exception>
#include
"lib/gzstream.h"
extern
"C"
{
#include
<htslib/hts.h>
#include
<htslib/sam.h>
}
template
<
class
bv
>
class
MultiCover
{
private:
size_t
nb_targets
;
std
::
string
*
target_names
;
Coverind
<
bv
>
**
covers
;
public:
/**
* Builds a coverage structure either from an indexed BAM file
* or from an already built structure, stored in files
* @param filename: filename of the BAM or basename of the index
* @param load: loads the index iff true
*/
MultiCover
(
const
std
::
string
filename
,
bool
load
=
false
);
~
MultiCover
();
/**
* @return the coverage structure from one chromosome, given its index
*/
Coverind
<
bv
>
&
getChrCoverage
(
size_t
i
);
/**
* @return the coverage structure from one chromosome, given its name
*/
Coverind
<
bv
>
&
getChrCoverage
(
const
std
::
string
chr
);
/**
* @return the chromosome index given its name
*/
size_t
getChrIndex
(
const
std
::
string
chr
);
/**
* @return the name of the i-th chromosome
*/
std
::
string
getChrName
(
size_t
i
)
const
;
/**
* @return the number of chromosomes
*/
size_t
getNbChromosomes
()
const
;
/**
* Generate a compressed BED file with the number of reads covering
* all the genome positions
*/
void
generateBED
(
size_t
range_size
,
const
std
::
string
bed
)
const
;
/**
* Save the index
*/
void
save
(
std
::
string
filename
);
/**
* Spaced used by all the bit vectors
*/
uint64_t
size_in_bytes
()
const
;
private:
void
loadBam
(
const
std
::
string
filename
);
void
loadIndex
(
const
std
::
string
filename
);
};
template
<
class
bv
>
MultiCover
<
bv
>::
MultiCover
(
const
std
::
string
filename
,
bool
load
)
{
if
(
!
load
)
{
loadBam
(
filename
);
}
else
{
loadIndex
(
filename
);
}
}
template
<
class
bv
>
MultiCover
<
bv
>::~
MultiCover
()
{
delete
[]
target_names
;
for
(
size_t
i
=
0
;
i
<
nb_targets
;
i
++
)
delete
covers
[
i
];
free
(
covers
);
}
template
<
class
bv
>
Coverind
<
bv
>
&
MultiCover
<
bv
>::
getChrCoverage
(
size_t
i
)
{
if
(
i
>=
nb_targets
)
throw
std
::
invalid_argument
(
"There are only "
+
std
::
to_string
(
nb_targets
)
+
" chromosomes"
);
return
*
covers
[
i
];
}
template
<
class
bv
>
Coverind
<
bv
>
&
MultiCover
<
bv
>::
getChrCoverage
(
const
std
::
string
chr
)
{
return
getChrCoverage
(
getChrIndex
(
chr
));
}
template
<
class
bv
>
std
::
string
MultiCover
<
bv
>::
getChrName
(
size_t
i
)
const
{
if
(
i
>=
nb_targets
)
throw
std
::
invalid_argument
(
"There are only "
+
std
::
to_string
(
nb_targets
)
+
" chromosomes"
);
return
target_names
[
i
];
}
template
<
class
bv
>
size_t
MultiCover
<
bv
>::
getChrIndex
(
const
std
::
string
chr
)
{
for
(
size_t
i
=
0
;
i
<
nb_targets
;
i
++
)
if
(
target_names
[
i
]
==
chr
)
return
i
;
throw
new
std
::
invalid_argument
(
"No chromosome called "
+
chr
);
}
template
<
class
bv
>
size_t
MultiCover
<
bv
>::
getNbChromosomes
()
const
{
return
nb_targets
;
}
template
<
class
bv
>
void
MultiCover
<
bv
>::
generateBED
(
size_t
range_size
,
const
std
::
string
bed
)
const
{
ogzstream
output
(
std
::
string
(
bed
).
c_str
());
for
(
size_t
chr_idx
=
0
;
chr_idx
<
nb_targets
;
chr_idx
++
)
{
size_t
size_chr
=
covers
[
chr_idx
]
->
nb_genome_positions
();
for
(
size_t
i
=
0
;
i
<
size_chr
-
range_size
+
1
;
i
+=
range_size
)
{
output
<<
target_names
[
chr_idx
]
<<
"
\t
"
<<
i
<<
"
\t
"
<<
i
+
range_size
<<
"
\t
"
<<
covers
[
chr_idx
]
->
count_reads_between
(
i
,
i
+
range_size
-
1
)
<<
std
::
endl
;
}
}
output
.
close
();
}
template
<
class
bv
>
void
MultiCover
<
bv
>::
save
(
const
std
::
string
filename
)
{
std
::
ofstream
conf
(
filename
+
".conf"
);
conf
<<
nb_targets
<<
std
::
endl
;
for
(
size_t
i
=
0
;
i
<
nb_targets
;
i
++
)
{
conf
<<
target_names
[
i
]
<<
std
::
endl
;
// TODO: protect filename
covers
[
i
]
->
save
(
filename
+
"."
+
target_names
[
i
]);
}
conf
.
close
();
}
template
<
class
bv
>
uint64_t
MultiCover
<
bv
>::
size_in_bytes
()
const
{
uint64_t
size
=
0
;
for
(
size_t
i
=
0
;
i
<
nb_targets
;
i
++
)
{
size
+=
covers
[
i
]
->
size_in_bytes
();
}
return
size
;
}
template
<
class
bv
>
void
MultiCover
<
bv
>::
loadBam
(
const
std
::
string
filename
)
{
htsFile
*
bam
=
hts_open
(
filename
.
c_str
(),
"r"
);
hts_idx_t
*
bam_idx
=
sam_index_load
(
bam
,
filename
.
c_str
());
sam_hdr_t
*
bam_header
=
sam_hdr_read
(
bam
);
bam1_t
*
read_entry
=
bam_init1
();
nb_targets
=
bam_header
->
n_targets
;
target_names
=
new
std
::
string
[
nb_targets
];
covers
=
(
Coverind
<
bv
>
**
)
malloc
(
sizeof
(
Coverind
<
bv
>*
)
*
nb_targets
);
if
(
!
bam
)
throw
std
::
invalid_argument
(
"Cannot open the BAM file"
);
if
(
!
bam_idx
)
throw
std
::
invalid_argument
(
"Cannot open index for the BAM file"
);
// Iterating over targets
for
(
size_t
i
=
0
;
i
<
nb_targets
;
i
++
)
{
target_names
[
i
]
=
std
::
string
(
bam_header
->
target_name
[
i
]);
hts_itr_t
*
read_it
=
sam_itr_queryi
(
bam_idx
,
i
,
0
,
bam_header
->
target_len
[
i
]);
size_t
nb_reads
=
0
;
std
::
vector
<
size_t
>
start_pos
,
end_pos
;
if
(
read_it
)
{
while
(
sam_itr_next
(
bam
,
read_it
,
read_entry
)
>
0
)
{
// TODO: filter on flags supplementary, secondary and dup
// see read_entry->core.flag
start_pos
.
push_back
(
read_entry
->
core
.
pos
);
end_pos
.
push_back
(
bam_endpos
(
read_entry
)
-
1
);
nb_reads
++
;
}
}
// Create bits vectors for start and stop positions
bit_vector
start_bv
(
nb_reads
+
bam_header
->
target_len
[
i
]),
stop_bv
(
nb_reads
+
bam_header
->
target_len
[
i
]);
std
::
sort
(
end_pos
.
begin
(),
end_pos
.
end
());
for
(
size_t
j
=
0
;
j
<
start_pos
.
size
();
j
++
)
{
start_bv
[
j
+
start_pos
[
j
]]
=
1
;
stop_bv
[
j
+
end_pos
[
j
]]
=
1
;
}
covers
[
i
]
=
new
Coverind
<
bv
>
(
start_bv
,
stop_bv
);
}
sam_hdr_destroy
(
bam_header
);
hts_idx_destroy
(
bam_idx
);
hts_close
(
bam
);
}
template
<
class
bv
>
void
MultiCover
<
bv
>::
loadIndex
(
const
std
::
string
filename
)
{
std
::
ifstream
conf
(
filename
+
".conf"
);
conf
>>
nb_targets
;
target_names
=
new
std
::
string
[
nb_targets
];
covers
=
(
Coverind
<
bv
>
**
)
malloc
(
sizeof
(
Coverind
<
bv
>*
)
*
nb_targets
);
for
(
size_t
i
=
0
;
i
<
nb_targets
;
i
++
)
{
conf
>>
target_names
[
i
];
covers
[
i
]
=
new
Coverind
<
bv
>
(
filename
+
"."
+
target_names
[
i
]);
}
conf
.
close
();
}
#endif
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment