37#include <unordered_map>
120 std::vector< std::pair<float, hts_pos_t> >::difference_type
windowSize{0};
252 void operator()(sam_hdr_t *header)
const { sam_hdr_destroy(header); }
260 void operator()(bam1_t *bamRecord)
const { bam_destroy1(bamRecord); }
268 void operator()(BGZF *bgzfHandle)
const { bgzf_close(bgzfHandle); }
303 *
this = std::move(toMove);
320 [[nodiscard]] std::unique_ptr<sam_hdr_t, BAMheaderDeleter>
getHeaderCopy()
const;
328 [[nodiscard]] std::pair<std::unique_ptr<bam1_t, BAMrecordDeleter>, int32_t>
getNextRecord();
331 static const uint16_t nRetries_;
333 BGZF *fileHandle_{
nullptr};
335 std::unique_ptr<sam_hdr_t, BAMheaderDeleter> headerUPointer_{
nullptr};
337 std::string fileName_;
339 std::filesystem::perms initialPermissions_{std::filesystem::perms::none};
398 [[nodiscard]]
bool empty() const noexcept {
return exonRanges_.empty(); };
403 [[nodiscard]] std::string
geneName()
const {
return geneName_; };
408 [[nodiscard]]
size_t nExons() const noexcept {
return exonRanges_.size(); };
413 [[nodiscard]]
char strand() const noexcept {
return isNegativeStrand_ ?
'-' :
'+'; };
419 [[nodiscard]] std::pair<hts_pos_t, hts_pos_t>
at(
const size_t &idx)
const {
return exonRanges_.at(idx);};
428 [[nodiscard]] std::pair<hts_pos_t, hts_pos_t>
geneSpan() const noexcept;
437 [[nodiscard]] std::pair<hts_pos_t, hts_pos_t>
firstExonSpan() const noexcept;
510 static const
size_t strandIDidx_;
512 static const
char gffDelimiter_;
514 static const
size_t spanStart_;
516 static const
size_t spanEnd_;
518 std::
string geneName_;
520 bool isNegativeStrand_{
false};
525 std::vector< std::pair<hts_pos_t, hts_pos_t> > exonRanges_;
546 leftProbability_{windowParameters.currentProbability},
547 rightProbability_{windowParameters.alternativeProbability},
548 kSuccesses_{calculateKsuccesses_(windowBegin, windowParameters)},
549 nTrials_{static_cast<float>(windowParameters.windowSize)} {};
585 float leftProbability_{0.0F};
591 float rightProbability_{0.0F};
593 float kSuccesses_{0.0F};
595 float nTrials_{0.0F};
602 [[nodiscard]]
static float calculateKsuccesses_(
const std::vector< std::pair<float, hts_pos_t> >::const_iterator &windowBegin,
const BinomialWindowParameters &windowParameters);
620 BAMrecord(
const bam1_t *alignmentRecord,
const sam_hdr_t *samHeader);
655 void addSecondaryAlignment(
const bam1_t *alignmentRecord,
const sam_hdr_t *samHeader,
const hts_pos_t localWindow = 500'000);
660 [[nodiscard]] std::string
getReadName()
const {
return readName_; };
667 [[nodiscard]] hts_pos_t
getMapStart() const noexcept {
return mapStart_; };
674 [[nodiscard]] hts_pos_t
getMapEnd() const noexcept {
return mapEnd_; };
682 return isRev_ ? mapEnd_ : mapStart_;
688 [[nodiscard]]
bool isRevComp() const noexcept {
return isRev_; };
693 [[nodiscard]]
bool isMapped() const noexcept {
return isMapped_; };
723 [[nodiscard]] hts_pos_t
getReadLength() const noexcept {
return static_cast<hts_pos_t
>( sequenceAndQuality_.size() ); };
786 static const uint16_t sequenceMask_;
788 static const uint16_t qualityShift_;
790 static const uint16_t suppSecondaryAlgn_;
795 static const std::array<hts_pos_t, 10> queryConsumption_;
800 static const std::array<hts_pos_t, 10> referenceConsumption_;
802 static const std::array<char, 16> seqNT16str_;
804 static const std::array<char, 16> complSeqNT16str_;
809 static const std::array<float, 10> sequenceMatch_;
811 bool isMapped_{
false};
815 uint16_t secondaryAlignmentCount_{0};
821 hts_pos_t mapStart_{0};
827 hts_pos_t mapEnd_{0};
829 std::string readName_;
831 std::string referenceName_;
833 std::vector<uint32_t> cigar_;
839 std::vector<uint16_t> sequenceAndQuality_;
841 std::vector<BAMsecondary> localSecondaryAlignments_;
934 static const uint16_t suppSecondaryAlgn_;
952 [[nodiscard]] std::vector<
ExonGroup>::const_iterator findOverlappingGene_(const std::vector<
ExonGroup> &chromosomeExonGroups,
953 const std::vector<
ExonGroup>::const_iterator &exonGroupSearchStart,
BAMrecord &alignedRead);
960 void processPrimaryAlignment_(const std::
string &referenceName, const
BAMrecord &alignmentRecord, std::unordered_map<std::
string, std::vector<
ExonGroup>::const_iterator> &latestExonGroupIts);
967 void processSecondaryAlignment_(const std::
string &referenceName, const
BAMrecord &alignmentRecord, const std::unordered_map<std::
string, std::vector<
ExonGroup>::const_iterator> &latestExonGroupIts);
1022 void addRemaps(const std::
string &remapBAMfileName, const
float &remapIdentityCutoff);
1028 [[nodiscard]] std::vector<std::
string>
saveRemappedBAM(const std::
string &outputBAMfileName) const;
1040 static const uint16_t secondaryOrUnmappedAlgn_;
Records from a BAM file.
Definition isoseqAlgn.hpp:975
BAMfile(BAMfile &&toMove) noexcept=default
Move constructor.
BAMfile & operator=(const BAMfile &toCopy)=delete
Copy assignment operator.
void addRemaps(const std::string &remapBAMfileName, const float &remapIdentityCutoff)
Add re-mapped read regions.
size_t getPrimaryAlignmentCount() const noexcept
Get the number of primary alignments.
BAMfile & operator=(BAMfile &&toMove) noexcept=default
Move assignment operator.
BAMfile(const std::string &BAMfileName)
Constructor with BAM file name.
BAMfile()=default
Default constructor.
~BAMfile()=default
Destructor.
std::vector< std::string > saveRemappedBAM(const std::string &outputBAMfileName) const
Save the reads with re-alignments to a BAM file.
BAMfile(const BAMfile &toCopy)=delete
Copy constructor.
std::vector< std::string > saveSortedRemappedBAM(const std::string &outputBAMfileName) const
Sort and save the reads with re-alignments to a BAM file.
Summary of a BAM record set.
Definition isoseqAlgn.hpp:609
std::vector< std::pair< float, hts_pos_t > > getReadCentricMatchStatus() const
Reference match status along the read.
bool isMapped() const noexcept
Is the read mapped?
Definition isoseqAlgn.hpp:693
void addSecondaryAlignment(const bam1_t *alignmentRecord, const sam_hdr_t *samHeader, const hts_pos_t localWindow=500 '000)
Add a secondary alignment record.
bool hasLocalSecondaryAlignments() const noexcept
Are there any local secondary alignments?
Definition isoseqAlgn.hpp:703
bool isRevComp() const noexcept
Is the read reverse-complemented?
Definition isoseqAlgn.hpp:688
BAMrecord(const bam1_t *alignmentRecord, const sam_hdr_t *samHeader)
Constructor with data.
hts_pos_t getmRNAstart() const noexcept
mRNA start position
Definition isoseqAlgn.hpp:681
BAMrecord(const BAMrecord &toCopy)=default
Copy constructor.
hts_pos_t getMapEnd() const noexcept
Map end position.
Definition isoseqAlgn.hpp:674
uint32_t getFirstCIGAR() const noexcept
Get the first cigar element with strand reversal.
std::string getReferenceName() const
Get reference name.
Definition isoseqAlgn.hpp:749
std::vector< uint32_t > getCIGARvector() const
CIGAR vector.
Definition isoseqAlgn.hpp:730
BAMrecord()=default
Default constructor.
hts_pos_t getMapStart() const noexcept
Map start position.
Definition isoseqAlgn.hpp:667
uint16_t localSecondaryAlignmentCount() const noexcept
Count of secondary alignments overlapping the primary.
Definition isoseqAlgn.hpp:713
uint16_t localReversedSecondaryAlignmentCount() const noexcept
Count of reversed secondary alignments overlapping the primary.
BAMrecord & operator=(const BAMrecord &toCopy)=default
Copy assignment operator.
std::string getReadName() const
Output read name.
Definition isoseqAlgn.hpp:660
bool hasSecondaryAlignments() const noexcept
Are there any secondary alignments?
Definition isoseqAlgn.hpp:698
std::string getSequenceAndQuality(const MappedReadInterval &segmentBoundaries) const
Get a segment of the sequence and the ASCII quality score.
uint16_t secondaryAlignmentCount() const noexcept
Count of all secondary alignments regardless of position.
Definition isoseqAlgn.hpp:708
~BAMrecord()=default
Destructor.
hts_pos_t getReadLength() const noexcept
Read length.
Definition isoseqAlgn.hpp:723
BAMrecord(BAMrecord &&toMove) noexcept=default
Move constructor.
std::vector< MappedReadInterval > getPoorlyMappedRegions(const BinomialWindowParameters &windowParameters) const
Identify unmapped portions of the read.
BAMrecord & operator=(BAMrecord &&toMove) noexcept=default
Move assignment operator.
std::string getCIGARstring() const
CIGAR string.
MappedReadMatchStatus getBestReferenceMatchStatus() const
Best reference-centric match status.
BAM file safe reader.
Definition isoseqAlgn.hpp:278
BAMsafeReader(BAMsafeReader &&toMove) noexcept
Move constructor.
Definition isoseqAlgn.hpp:302
std::unique_ptr< sam_hdr_t, BAMheaderDeleter > getHeaderCopy() const
Get BAM header.
BAMsafeReader(const BAMsafeReader &toCopy)=delete
Copy constructor.
BAMsafeReader()=default
Default constructor.
std::pair< std::unique_ptr< bam1_t, BAMrecordDeleter >, int32_t > getNextRecord()
Get the next BAM record.
BAMsafeReader & operator=(BAMsafeReader &&toMove) noexcept
Move assignment operator.
BAMsafeReader & operator=(const BAMsafeReader &toCopy)=delete
Copy assignment operator.
BAMsafeReader(const std::string &bamFileName)
Constructor with file name.
~BAMsafeReader()
Destructor.
Relate BAM alignments to exons.
Definition isoseqAlgn.hpp:849
size_t nChromosomes() const noexcept
Number of chromosomes/scaffolds/linkage groups.
BAMtoGenome & operator=(BAMtoGenome &&toMove) noexcept=default
Move assignment operator.
BAMtoGenome()=default
Default constructor.
~BAMtoGenome()=default
Destructor.
BAMtoGenome(BAMtoGenome &&toMove) noexcept=default
Move constructor.
void saveUnmappedRegions(const std::string &outFileName, const BinomialWindowParameters &windowParameters, const size_t &nThreads) const
Save unmapped read statistics to file.
BAMtoGenome & operator=(const BAMtoGenome &toCopy)=default
Copy assignment operator.
void saveReadCoverageStats(const std::string &outFileName, const size_t &nThreads) const
Save read coverage to file.
BAMtoGenome(const BamAndGffFiles &bamGFFfilePairNames)
Constructor intersecting iso-Seq alignments and exons.
BAMtoGenome(const BAMtoGenome &toCopy)=default
Copy constructor.
size_t nExonSets() const noexcept
Number of exon sets (genes with exons).
Group of exons from the same gene.
Definition isoseqAlgn.hpp:345
ExonGroup(const ExonGroup &toCopy)=default
Copy constructor.
char strand() const noexcept
Strand ID.
Definition isoseqAlgn.hpp:413
uint32_t firstExonAfter(const hts_pos_t &position) const noexcept
Index of the first exon after a given position.
uint32_t lastExonBefore(const hts_pos_t &position) const noexcept
Index of the last exon before a given position.
ExonGroup(std::string geneName, const char strand, std::set< std::pair< hts_pos_t, hts_pos_t > > &exonSet)
Constructor with lines from a GFF file.
std::pair< hts_pos_t, hts_pos_t > firstExonSpan() const noexcept
First exon span.
std::string geneName() const
Report the gene name.
Definition isoseqAlgn.hpp:403
ExonGroup & operator=(ExonGroup &&toMove) noexcept=default
Move assignment operator.
ExonGroup(ExonGroup &&toMove) noexcept=default
Move constructor.
hts_pos_t firstExonLength() const noexcept
First exon length.
size_t nExons() const noexcept
Number of exons in the gene.
Definition isoseqAlgn.hpp:408
uint32_t lastOverlappingExon(const hts_pos_t &position) const noexcept
Index of the last exon overlapping a given position.
std::pair< hts_pos_t, hts_pos_t > at(const size_t &idx) const
Range covered by a given exon.
Definition isoseqAlgn.hpp:419
uint32_t firstOverlappingExon(const hts_pos_t &position) const noexcept
Index of the first exon overlapping a given position.
ExonGroup()=default
Default constructor.
ExonGroup & operator=(const ExonGroup &toCopy)=default
Copy assignment operator.
ExonGroup(std::string geneName, const char strand, const std::vector< std::string > &exonLinesFomGFF)
Constructor with exon lines from a GFF file.
bool empty() const noexcept
Is the object empty?
Definition isoseqAlgn.hpp:398
std::vector< float > getBestExonCoverageQuality(const BAMrecord &alignment) const
Get best read coverage quality per exon.
std::vector< float > getExonCoverageQuality(const BAMrecord &alignment) const
Get read coverage quality per exon.
std::pair< hts_pos_t, hts_pos_t > getFirstIntronSpan() const
First intron span.
~ExonGroup()=default
Destructor.
std::pair< hts_pos_t, hts_pos_t > geneSpan() const noexcept
Gene span.
Binomial BIC for a read window.
Definition isoseqAlgn.hpp:536
float getBICdifference() const noexcept
Get BIC difference.
ReadMatchWindowBIC(ReadMatchWindowBIC &&toMove) noexcept=default
Move constructor.
ReadMatchWindowBIC()=default
Default constructor.
ReadMatchWindowBIC & operator=(ReadMatchWindowBIC &&toMove) noexcept=default
Move assignment operator.
~ReadMatchWindowBIC()=default
Destructor.
ReadMatchWindowBIC & operator=(const ReadMatchWindowBIC &toCopy)=default
Copy assignment operator.
ReadMatchWindowBIC(const std::vector< std::pair< float, hts_pos_t > >::const_iterator &windowBegin, const BinomialWindowParameters &windowParameters)
Constructor with a read match vector window.
Definition isoseqAlgn.hpp:545
ReadMatchWindowBIC(const ReadMatchWindowBIC &toCopy)=default
Copy constructor.
BAM record pointer deleter.
Definition isoseqAlgn.hpp:255
void operator()(bam1_t *bamRecord) const
Functor operator.
Definition isoseqAlgn.hpp:260
BAM secondary alignment.
Definition isoseqAlgn.hpp:236
hts_pos_t mapStart
Base-1 map start position on the reference.
Definition isoseqAlgn.hpp:238
std::vector< uint32_t > cigar
CIGAR string.
Definition isoseqAlgn.hpp:244
hts_pos_t mapEnd
Base-1 map end position on the reference.
Definition isoseqAlgn.hpp:240
bool sameStrandAsPrimary
Is it the same strand as the primary?
Definition isoseqAlgn.hpp:242
BGZF handle pointer deleter.
Definition isoseqAlgn.hpp:263
void operator()(BGZF *bgzfHandle) const
Functor operator.
Definition isoseqAlgn.hpp:268
BAM and GFF file name pair.
Definition isoseqAlgn.hpp:65
std::string gffFileName
GFF file name.
Definition isoseqAlgn.hpp:69
std::string bamFileName
BAM file name.
Definition isoseqAlgn.hpp:67
Binomial window parameters.
Definition isoseqAlgn.hpp:108
float bicDifferenceThreshold
Minimum difference in BIC between windows with current and alternative probabilities.
Definition isoseqAlgn.hpp:118
float alternativeProbability
Probability of success to consider as an alternative.
Definition isoseqAlgn.hpp:116
std::vector< std::pair< float, hts_pos_t > >::difference_type windowSize
Window size.
Definition isoseqAlgn.hpp:120
float currentProbability
Probability of success up to this window.
Definition isoseqAlgn.hpp:110
Delineates an interval in a mapped read.
Definition isoseqAlgn.hpp:90
hts_pos_t referenceStart
Base-1 start position on the reference.
Definition isoseqAlgn.hpp:96
hts_pos_t readStart
Base-0 start position on the read.
Definition isoseqAlgn.hpp:92
hts_pos_t referenceEnd
Base-1 end position on the reference.
Definition isoseqAlgn.hpp:98
hts_pos_t readEnd
Base-0 end position on the read.
Definition isoseqAlgn.hpp:94
Read match and map start.
Definition isoseqAlgn.hpp:101
hts_pos_t mapStart
Map start.
Definition isoseqAlgn.hpp:103
std::vector< float > matchStatus
Match status vector.
Definition isoseqAlgn.hpp:105
Exons covered by a read.
Definition isoseqAlgn.hpp:127
std::vector< float > bestExonCoverageScores
Best exon coverage scores.
Definition isoseqAlgn.hpp:221
std::string geneName
Gene name.
Definition isoseqAlgn.hpp:187
hts_pos_t alignmentEnd
Alignment end.
Definition isoseqAlgn.hpp:147
hts_pos_t bestAlignmentEnd
Best alignment end.
Definition isoseqAlgn.hpp:159
std::string readName
Read name.
Definition isoseqAlgn.hpp:135
uint16_t nSecondaryAlignments
Number of secondary alignments.
Definition isoseqAlgn.hpp:167
hts_pos_t alignmentStart
Alignment start.
Definition isoseqAlgn.hpp:141
std::vector< float > exonCoverageScores
Exon coverage scores.
Definition isoseqAlgn.hpp:215
uint32_t firstSoftClipLength
Length of the soft clip at read start.
Definition isoseqAlgn.hpp:165
char strand
Strand ID.
Definition isoseqAlgn.hpp:192
uint16_t nGoodSecondaryAlignments
Number of good secondary alignments.
Definition isoseqAlgn.hpp:173
hts_pos_t lastExonEnd
Last exon end.
Definition isoseqAlgn.hpp:209
uint16_t nLocalReversedAlignments
Number of locally mapping reverse-complemented reads.
Definition isoseqAlgn.hpp:179
hts_pos_t firstExonStart
First exon start.
Definition isoseqAlgn.hpp:203
std::string chromosomeName
Chromosome name.
Definition isoseqAlgn.hpp:133
hts_pos_t bestAlignmentStart
Best alignment start.
Definition isoseqAlgn.hpp:153
uint16_t nExons
Number of exons.
Definition isoseqAlgn.hpp:181
hts_pos_t firstExonLength
First exon length.
Definition isoseqAlgn.hpp:197
Read portion for re-mapping.
Definition isoseqAlgn.hpp:224
size_t end
Base-0 one past the read end.
Definition isoseqAlgn.hpp:228
std::string originalName
Original read name.
Definition isoseqAlgn.hpp:230
size_t start
Base-0 read start.
Definition isoseqAlgn.hpp:226
Mismatch statistics and FASTQ file name pair.
Definition isoseqAlgn.hpp:72
std::string statsFileName
Statistics file name.
Definition isoseqAlgn.hpp:74
std::string fastqFileName
FASTQ file name.
Definition isoseqAlgn.hpp:76
Token name and GFF attribute list pair.
Definition isoseqAlgn.hpp:79
std::string tokenName
GFF token name.
Definition isoseqAlgn.hpp:81
std::vector< std::string > attributeList
Attribute list.
Definition isoseqAlgn.hpp:83