isoSeqQC
Quality assessment and re-mapping of poorly mapped isoSeq read segments
Loading...
Searching...
No Matches
helperFunctions.hpp File Reference

Genomic analyses helper functions. More...

#include <memory>
#include <array>
#include <string>
#include <utility>
#include <vector>
#include "sam.h"
#include "isoseqAlgn.hpp"
Include dependency graph for helperFunctions.hpp:

Go to the source code of this file.

Typedefs

using isaSpace::bamGFFvector = std::vector< std::pair<BAMrecord, ExonGroup> >
 Alias for BAM record and exon group pair vector.

Functions

std::string isaSpace::extractAttributeName (const TokenAttibuteListPair &tokenAndAttrList)
 Extract a name.
std::string isaSpace::extractParentName (const std::string &attributeString)
 Extract parent name.
bool isaSpace::rangesOverlap (const std::pair< hts_pos_t, hts_pos_t > &range1, const std::pair< hts_pos_t, hts_pos_t > &range2) noexcept
 Test for range overlap.
std::array< std::string, nGFFfields > isaSpace::parseGFFline (const std::string &gffLine)
 Parse a GFF line into fields.
std::unordered_map< std::string, std::vector< ExonGroup > > isaSpace::parseGFF (const std::string &gffFileName)
 Parse a GFF file.
std::vector< std::vector< float >::const_iterator > isaSpace::getPeaks (const std::vector< float > &values, const float &threshold)
 Identify peaks in numerical data.
std::vector< std::vector< float >::const_iterator > isaSpace::getValleys (const std::vector< float > &values, const float &threshold)
 Identify valleys in numerical data.
std::vector< float > isaSpace::getReferenceMatchStatus (const std::vector< uint32_t > &cigar)
 Read match status along the reference.
ReadExonCoverage isaSpace::getExonCoverageStats (const std::pair< BAMrecord, ExonGroup > &readAndExons)
 Extract exon coverage statistics for a read.
std::string isaSpace::stringifyExonCoverage (const ReadExonCoverage &readRecord, char separator='\t')
 Convert ReadExonCoverage to string.
std::string isaSpace::stringifyAlignmentRange (const bamGFFvector::const_iterator &begin, const bamGFFvector::const_iterator &end)
 Produce a string from a range of read alignments.
std::string isaSpace::stringifyUnmappedRegions (const bamGFFvector::const_iterator &begin, const bamGFFvector::const_iterator &end, const BinomialWindowParameters &windowParameters)
 Produce a string of poorly aligned region statistics from an alignment range.
std::pair< std::string, std::string > isaSpace::getUnmappedRegionsAndFASTQ (const bamGFFvector::const_iterator &begin, const bamGFFvector::const_iterator &end, const BinomialWindowParameters &windowParameters)
 Produce a string of poorly aligned region statistics and corresponding FASTQ records from an alignment range.
ReadPortion isaSpace::parseRemappedReadName (const std::string &remappedReadName)
 Extract original read name and coordinates.
std::unique_ptr< bam1_t, BAMrecordDeleter > isaSpace::modifyCIGAR (const ReadPortion &modRange, const std::unique_ptr< bam1_t, BAMrecordDeleter > &bamRecord)
 Modify the BAM CIGAR string to erase alignment in a range.
void isaSpace::addRemappedSecondaryAlignment (const std::unique_ptr< sam_hdr_t, BAMheaderDeleter > &newRecordHeader, const std::unique_ptr< bam1_t, BAMrecordDeleter > &newRecord, const ReadPortion &remapInfo, const std::unique_ptr< sam_hdr_t, BAMheaderDeleter > &originalHeader, const float &remapIdentityCutoff, std::vector< std::unique_ptr< bam1_t, BAMrecordDeleter > > &readMapVector)
 Add a re-mapped secondary alignment.
std::unique_ptr< BGZF, BGZFhandleDeleter > isaSpace::openBGZFtoAppend (const std::string &bamFileName)
 Open a BGZF file handle for appending.
std::vector< std::pair< bamGFFvector::const_iterator, bamGFFvector::const_iterator > > isaSpace::makeThreadRanges (const bamGFFvector &targetVector, const size_t &threadCount)
 Make per-thread alignment record/annotation vector ranges.
std::unordered_map< std::string, std::string > isaSpace::parseCL (int &argc, char **argv)
 Command line parser.
void isaSpace::extractCLinfo (const std::unordered_map< std::string, std::string > &parsedCLI, std::unordered_map< std::string, int > &intVariables, std::unordered_map< std::string, float > &floatVariables, std::unordered_map< std::string, std::string > &stringVariables)
 Extract parameters from parsed command line interface flags.

Variables

constexpr size_t isaSpace::nGFFfields {9UL}
 Number of GFF file fields.

Detailed Description

Genomic analyses helper functions.

Author
Anthony J. Greenberg and Rebekah Rogers
Version
0.2

Definitions of class-external functions needed by genomic analyses.

Function Documentation

◆ addRemappedSecondaryAlignment()

void isaSpace::addRemappedSecondaryAlignment ( const std::unique_ptr< sam_hdr_t, BAMheaderDeleter > & newRecordHeader,
const std::unique_ptr< bam1_t, BAMrecordDeleter > & newRecord,
const ReadPortion & remapInfo,
const std::unique_ptr< sam_hdr_t, BAMheaderDeleter > & originalHeader,
const float & remapIdentityCutoff,
std::vector< std::unique_ptr< bam1_t, BAMrecordDeleter > > & readMapVector )

Add a re-mapped secondary alignment.

Add a read portion remap as a secondary alignment to a vector of BAM records. Only reads that pass the identity threshold are added.

Parameters
[in] newRecordHeader: header corresponding to the re-mapped record
[in] newRecord: remapped BAM record
[in] remapInfo: original name and read segment range
[in] originalHeader: header corresponding to the original record
[in] remapIdentityCutoff: fraction of sites in the remapped read that are identical to the reference
[in,out] readMapVector: vector of alignments of a read, first element is the primary alignment

◆ extractAttributeName()

std::string isaSpace::extractAttributeName ( const TokenAttibuteListPair & tokenAndAttrList)
nodiscard

Extract a name.

Extract a token name from GFF attributes.

Parameters
[in] tokenAndAttrList: field token and the list of attributes

◆ extractCLinfo()

void isaSpace::extractCLinfo ( const std::unordered_map< std::string, std::string > & parsedCLI,
std::unordered_map< std::string, int > & intVariables,
std::unordered_map< std::string, float > & floatVariables,
std::unordered_map< std::string, std::string > & stringVariables )

Extract parameters from parsed command line interface flags.

Extracts needed variable values, indexed by std::string encoded variable names.

Parameters
[in] parsedCLI: flag values parsed from the command line
[out] intVariables: indexed int variables for use by main()
[out] floatVariables: indexed float variables for use by main()
[out] stringVariables: indexed std::string variables for use by main()

◆ extractParentName()

std::string isaSpace::extractParentName ( const std::string & attributeString)
nodiscard

Extract parent name.

Extract the value of the Parent= attribute from the provided GFF attribute string.

Parameters
[in] attributeString: GFF attribute string
Returns
parent name

◆ getExonCoverageStats()

ReadExonCoverage isaSpace::getExonCoverageStats ( const std::pair< BAMrecord, ExonGroup > & readAndExons)
nodiscard

Extract exon coverage statistics for a read.

Parameters
[in] readAndExons: read alignment with the corresponding exon group
Returns
exon coverage object

◆ getPeaks()

std::vector< std::vector< float >::const_iterator > isaSpace::getPeaks ( const std::vector< float > & values,
const float & threshold )
nodiscard

Identify peaks in numerical data.

Returns iterators to the elements in a vector that correspond to peaks above the provided threshold. Unless the input vector is empty, the last element of the returned vector is the const_iterator to the end of the input vector; it is the only element if there are no peaks.

Parameters
[in] values: vector of values
[in] threshold: value that must be exceeded for a peak call
Returns
vector of iterators to peak elements

◆ getReferenceMatchStatus()

std::vector< float > isaSpace::getReferenceMatchStatus ( const std::vector< uint32_t > & cigar)
nodiscard

Read match status along the reference.

Parses CIGAR to track read (query) match/mismatch (1.0 for match, 0.0 for mismatch) status along the reference. This means that insertions in the read are ignored. The vector start begins at the position closest to the first exon start regardless of strand.

Parameters
[in] cigar: CIGAR vector
Returns
vector of match status

◆ getUnmappedRegionsAndFASTQ()

std::pair< std::string, std::string > isaSpace::getUnmappedRegionsAndFASTQ ( const bamGFFvector::const_iterator & begin,
const bamGFFvector::const_iterator & end,
const BinomialWindowParameters & windowParameters )
nodiscard

Produce a string of poorly aligned region statistics and corresponding FASTQ records from an alignment range.

Only saves information from reads that have poorly mapped regions, potentially multiple per read.

Parameters
[in] begin: start iterator
[in] end: end iterator
[in] windowParameters: sliding window parameters
Returns
strings with coverage information (.first) and FASTQ (.second)

◆ getValleys()

std::vector< std::vector< float >::const_iterator > isaSpace::getValleys ( const std::vector< float > & values,
const float & threshold )
nodiscard

Identify valleys in numerical data.

Returns iterators to the elements in a vector that correspond to valleys below the provided threshold. Unless the input vector is empty, the last element of the returned vector is the const_iterator to the end of the input vector; it is the only element if there are no valleys.

Parameters
[in] values: vector of values
[in] threshold: value that must exceed the valley values
Returns
vector of iterators to valley elements

◆ makeThreadRanges()

std::vector< std::pair< bamGFFvector::const_iterator, bamGFFvector::const_iterator > > isaSpace::makeThreadRanges ( const bamGFFvector & targetVector,
const size_t & threadCount )
nodiscard

Make per-thread alignment record/annotation vector ranges.

Constructs a vector of iterator pairs bracketing chunks of a vector to be processed in parallel.

Parameters
[in] targetVector: the vector to be processed
[in] threadCount: number of threads
Returns
vector of iterator pairs for each thread

◆ modifyCIGAR()

std::unique_ptr< bam1_t, BAMrecordDeleter > isaSpace::modifyCIGAR ( const ReadPortion & modRange,
const std::unique_ptr< bam1_t, BAMrecordDeleter > & bamRecord )
nodiscard

Modify the BAM CIGAR string to erase alignment in a range.

Substitute operations in a BAM record within the given range with non-matching operations.

Parameters
[in] modRange: modification range
[in] bamRecord: BAM record to be modified
Returns
BAM record with the CIGAR vector replaced

◆ openBGZFtoAppend()

std::unique_ptr< BGZF, BGZFhandleDeleter > isaSpace::openBGZFtoAppend ( const std::string & bamFileName)

Open a BGZF file handle for appending.

Opens a handle to the BAM file for appending, deleting the original if it exists.

Parameters
[in] bamFileName: BAM file name

◆ parseCL()

std::unordered_map< std::string, std::string > isaSpace::parseCL ( int & argc,
char ** argv )
nodiscard

Command line parser.

Maps flags to values. Flags are assumed to be of the form --flag-name value.

Parameters
[in] argc: size of the argv array
[in] argv: command line input array
Returns
map of flags to values

◆ parseGFF()

std::unordered_map< std::string, std::vector< ExonGroup > > isaSpace::parseGFF ( const std::string & gffFileName)
nodiscard

Parse a GFF file.

Extract exons from a GFF file and group them by gene and chromosome/linkage group. The map keys are linkage groups, scaffolds, or chromosomes plus strand ID.

Parameters
[in] gffFileName: GFF file name
Returns
collection of exon group vectors by chromosome and strand

◆ parseGFFline()

std::array< std::string, nGFFfields > isaSpace::parseGFFline ( const std::string & gffLine)
nodiscard

Parse a GFF line into fields.

Places FAIL in the first element if the number of fields is not equal to nGFFfields, the field count required by the GFF specification.

Parameters
[in] gffLine: one line of a GFF file
Returns
each GFF field in a separate element

◆ parseRemappedReadName()

ReadPortion isaSpace::parseRemappedReadName ( const std::string & remappedReadName)
nodiscard

Extract original read name and coordinates.

The remapped read names are original names, with _-separated start and end coordinates. Underscores in the original read name are allowed. If the coordinates are absent, returns an empty object.

Parameters
[in] remappedReadName: re-mapped read portion name
Returns
original read name and segment coordinates

◆ rangesOverlap()

bool isaSpace::rangesOverlap ( const std::pair< hts_pos_t, hts_pos_t > & range1,
const std::pair< hts_pos_t, hts_pos_t > & range2 )
nodiscard noexcept

Test for range overlap.

The first position in each range need not be before the second in the same range.

Parameters
[in] range1: first range
[in] range2: second range
Returns
true if there is overlap

◆ stringifyAlignmentRange()

std::string isaSpace::stringifyAlignmentRange ( const bamGFFvector::const_iterator & begin,
const bamGFFvector::const_iterator & end )
nodiscard

Produce a string from a range of read alignments.

Parameters
[in] begin: start iterator
[in] end: end iterator
Returns
string with coverage information

◆ stringifyExonCoverage()

std::string isaSpace::stringifyExonCoverage ( const ReadExonCoverage & readRecord,
char separator = '\t' )
nodiscard

Convert ReadExonCoverage to string.

Parameters
[in] readRecord: individual read record
[in] separator: field separator
Returns
std::string with the read record elements, without a new line at the end

◆ stringifyUnmappedRegions()

std::string isaSpace::stringifyUnmappedRegions ( const bamGFFvector::const_iterator & begin,
const bamGFFvector::const_iterator & end,
const BinomialWindowParameters & windowParameters )
nodiscard

Produce a string of poorly aligned region statistics from an alignment range.

Only saves information from reads that have poorly mapped regions, potentially multiple per read.

Parameters
[in] begin: start iterator
[in] end: end iterator
[in] windowParameters: sliding window parameters
Returns
string with coverage information