isoSeqQC
Quality assessment and re-mapping of poorly mapped isoSeq read segments
Loading...
Searching...
No Matches
helperFunctions.hpp
Go to the documentation of this file.
1/*
2 * Copyright (c) 2024-2025 Anthony J. Greenberg and Rebekah Rogers
3 *
4 * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
5 *
6 * 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
7 *
8 * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
9 *
10 * 3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
11 *
12 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
13 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS
14 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
15 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
16 * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
17 * THE POSSIBILITY OF SUCH DAMAGE.
18 */
19
21
29
30#pragma once
31
32#include <memory>
33#include <array>
34#include <string>
35#include <utility> // for std::pair
36#include <vector>
37
38#include "sam.h"
39
40#include "isoseqAlgn.hpp"
41
42namespace isaSpace {
44 constexpr size_t nGFFfields{9UL};
46 using bamGFFvector = std::vector< std::pair<BAMrecord, ExonGroup> >;
47
54 [[nodiscard]] std::string extractAttributeName(const TokenAttibuteListPair &tokenAndAttrList);
55
63 [[nodiscard]] std::string extractParentName(const std::string &attributeString);
64
73 [[nodiscard]] bool rangesOverlap(const std::pair<hts_pos_t, hts_pos_t> &range1, const std::pair<hts_pos_t, hts_pos_t> &range2) noexcept;
74
83 [[nodiscard]] std::array<std::string, nGFFfields> parseGFFline(const std::string &gffLine);
84
93 [[nodiscard]] std::unordered_map< std::string, std::vector<ExonGroup> > parseGFF(const std::string &gffFileName);
94
105 [[nodiscard]] std::vector<std::vector<float>::const_iterator> getPeaks(const std::vector<float> &values, const float &threshold);
106
117 [[nodiscard]] std::vector<std::vector<float>::const_iterator> getValleys(const std::vector<float> &values, const float &threshold);
118
128 [[nodiscard]] std::vector<float> getReferenceMatchStatus(const std::vector<uint32_t> &cigar);
129
135 [[nodiscard]] ReadExonCoverage getExonCoverageStats(const std::pair<BAMrecord, ExonGroup> &readAndExons);
136
143 [[nodiscard]] std::string stringifyExonCoverage(const ReadExonCoverage &readRecord, char separator = '\t');
144
151 [[nodiscard]] std::string stringifyAlignmentRange(const bamGFFvector::const_iterator &begin, const bamGFFvector::const_iterator &end);
161 [[nodiscard]] std::string stringifyUnmappedRegions(const bamGFFvector::const_iterator &begin, const bamGFFvector::const_iterator &end, const BinomialWindowParameters &windowParameters);
171 [[nodiscard]] std::pair<std::string, std::string> getUnmappedRegionsAndFASTQ(const bamGFFvector::const_iterator &begin, const bamGFFvector::const_iterator &end, const BinomialWindowParameters &windowParameters);
172
182 [[nodiscard]] ReadPortion parseRemappedReadName(const std::string &remappedReadName);
191 [[nodiscard]] std::unique_ptr<bam1_t, BAMrecordDeleter> modifyCIGAR(const ReadPortion &modRange, const std::unique_ptr<bam1_t, BAMrecordDeleter> &bamRecord);
204 void addRemappedSecondaryAlignment(
205 const std::unique_ptr<sam_hdr_t, BAMheaderDeleter> &newRecordHeader, const std::unique_ptr<bam1_t, BAMrecordDeleter> &newRecord, const ReadPortion &remapInfo,
206 const std::unique_ptr<sam_hdr_t, BAMheaderDeleter> &originalHeader, const float &remapIdentityCutoff, std::vector< std::unique_ptr<bam1_t, BAMrecordDeleter> > &readMapVector);
207
214 std::unique_ptr<BGZF, BGZFhandleDeleter> openBGZFtoAppend(const std::string &bamFileName);
215
225 [[nodiscard]] std::vector< std::pair<bamGFFvector::const_iterator, bamGFFvector::const_iterator> >
226 makeThreadRanges(const bamGFFvector &targetVector, const size_t &threadCount);
227
236 [[nodiscard]] std::unordered_map<std::string, std::string> parseCL(int &argc, char **argv);
237
247 void extractCLinfo(const std::unordered_map<std::string, std::string> &parsedCLI,
248 std::unordered_map<std::string, int> &intVariables,
249 std::unordered_map<std::string, float> &floatVariables,
250 std::unordered_map<std::string, std::string> &stringVariables);
251}
std::array< std::string, nGFFfields > parseGFFline(const std::string &gffLine)
Parse a GFF line into fields.
ReadPortion parseRemappedReadName(const std::string &remappedReadName)
Extract original read name and coordinates.
std::string stringifyAlignmentRange(const bamGFFvector::const_iterator &begin, const bamGFFvector::const_iterator &end)
Produce a string from a range of read alignments.
void extractCLinfo(const std::unordered_map< std::string, std::string > &parsedCLI, std::unordered_map< std::string, int > &intVariables, std::unordered_map< std::string, float > &floatVariables, std::unordered_map< std::string, std::string > &stringVariables)
Extract parameters from parsed command line interface flags.
std::vector< std::vector< float >::const_iterator > getPeaks(const std::vector< float > &values, const float &threshold)
Identify peaks in numerical data.
std::vector< std::vector< float >::const_iterator > getValleys(const std::vector< float > &values, const float &threshold)
Identify valleys in numerical data.
ReadExonCoverage getExonCoverageStats(const std::pair< BAMrecord, ExonGroup > &readAndExons)
Extract exon coverage statistics for a read.
std::unordered_map< std::string, std::vector< ExonGroup > > parseGFF(const std::string &gffFileName)
Parse a GFF file.
std::vector< std::pair< BAMrecord, ExonGroup > > bamGFFvector
Alias for BAM record and exon group pair vector.
Definition helperFunctions.hpp:46
bool rangesOverlap(const std::pair< hts_pos_t, hts_pos_t > &range1, const std::pair< hts_pos_t, hts_pos_t > &range2) noexcept
Test for range overlap.
std::vector< std::pair< bamGFFvector::const_iterator, bamGFFvector::const_iterator > > makeThreadRanges(const bamGFFvector &targetVector, const size_t &threadCount)
Make per-thread alignment record/annotation vector ranges.
void addRemappedSecondaryAlignment(const std::unique_ptr< sam_hdr_t, BAMheaderDeleter > &newRecordHeader, const std::unique_ptr< bam1_t, BAMrecordDeleter > &newRecord, const ReadPortion &remapInfo, const std::unique_ptr< sam_hdr_t, BAMheaderDeleter > &originalHeader, const float &remapIdentityCutoff, std::vector< std::unique_ptr< bam1_t, BAMrecordDeleter > > &readMapVector)
Add a re-mapped secondary alignment.
std::unique_ptr< BGZF, BGZFhandleDeleter > openBGZFtoAppend(const std::string &bamFileName)
Open a BGZF file handle for appending.
constexpr size_t nGFFfields
Number of GFF file fields.
Definition helperFunctions.hpp:44
std::string stringifyExonCoverage(const ReadExonCoverage &readRecord, char separator='\t')
Convert ReadExonCoverage to string.
std::vector< float > getReferenceMatchStatus(const std::vector< uint32_t > &cigar)
Read match status along the reference.
std::string extractParentName(const std::string &attributeString)
Extract parent name.
std::pair< std::string, std::string > getUnmappedRegionsAndFASTQ(const bamGFFvector::const_iterator &begin, const bamGFFvector::const_iterator &end, const BinomialWindowParameters &windowParameters)
Produce a string of poorly aligned region statistics and corresponding FASTQ records from an alignment range.
std::string stringifyUnmappedRegions(const bamGFFvector::const_iterator &begin, const bamGFFvector::const_iterator &end, const BinomialWindowParameters &windowParameters)
Produce a string of poorly aligned region statistics from an alignment range.
std::unique_ptr< bam1_t, BAMrecordDeleter > modifyCIGAR(const ReadPortion &modRange, const std::unique_ptr< bam1_t, BAMrecordDeleter > &bamRecord)
Modify the BAM CIGAR string to erase alignment in a range.
std::unordered_map< std::string, std::string > parseCL(int &argc, char **argv)
Command line parser.
std::string extractAttributeName(const TokenAttibuteListPair &tokenAndAttrList)
Extract a name.
Read isoSeq alignments and compare to genome annotations.
Binomial window parameters.
Definition isoseqAlgn.hpp:108
Exons covered by a read.
Definition isoseqAlgn.hpp:127
Read portion for re-mapping.
Definition isoseqAlgn.hpp:224
Token name and GFF attribute list pair.
Definition isoseqAlgn.hpp:79