vash
Fast genetic similarity estimation with hash tables
|
Auxiliary functions for variant hashing. More...
#include <cstdint>
#include <cstddef>
#include <vector>
#include <array>
#include <unordered_map>
#include "gvarHash.hpp"
#include "similarityMatrix.hpp"
Go to the source code of this file.
Functions | |
uint16_t | BayesicSpace::countSetBits (uint16_t inVal) |
Count set bits in a 16-bit word. | |
uint64_t | BayesicSpace::countSetBits (const std::vector< uint8_t > &inVec) |
Count set bits in a vector. | |
uint64_t | BayesicSpace::countSetBits (const std::vector< uint8_t > &inVec, const LocationWithLength &window) |
Count set bits in a range within a vector. | |
size_t | BayesicSpace::getAvailableRAM () |
Get available RAM. | |
uint32_t | BayesicSpace::murMurHashMixer (const std::array< uint32_t, SIZE_OF_SIZET > &key, const uint32_t &seed) |
MurMurHash mixer module of an index value. | |
uint32_t | BayesicSpace::murMurHashFinalizer (const uint32_t &inputHash) |
MurMurHash finalizer. | |
uint32_t | BayesicSpace::murMurHash (const std::array< uint32_t, SIZE_OF_SIZET > &key, const uint32_t &seed) |
MurMurHash of an index value. | |
uint32_t | BayesicSpace::murMurHash (const std::vector< size_t > &key, const uint32_t &seed) |
MurMurHash of a vector of indexes. | |
uint32_t | BayesicSpace::murMurHash (const std::vector< uint32_t > &key, const uint32_t &seed) |
MurMurHash of a vector of 32-bit unsigned integers. | |
uint32_t | BayesicSpace::murMurHash (const std::vector< uint16_t > &key, const LocationWithLength &keyWindow, const uint32_t &seed) |
MurMurHash of a vector of 16-bit unsigned integers. | |
void | BayesicSpace::testBedMagicBytes (const std::array< char, N_BED_TEST_BYTES > &bytesToTest) |
Test .bed magic bytes. | |
std::vector< std::pair< size_t, size_t > > | BayesicSpace::makeThreadRanges (const CountAndSize &threadPoolSizes) |
Build thread ranges. | |
std::vector< size_t > | BayesicSpace::makeChunkSizes (const size_t &nElements, const size_t &nChunks) |
Build chunk sizes. | |
std::vector< std::pair< RowColIdx, RowColIdx > > | BayesicSpace::makeChunkRanges (const LocationWithLength &startAndChunkSize, const size_t nChunks) |
Build chunk ranges. | |
std::pair< HashGroupItPairCount, HashGroupItPairCount > | BayesicSpace::makeGroupRanges (const std::vector< HashGroup > &groupVector, const HashGroupItPairCount &startHGPC, const size_t &chunkSize) |
Delimit a chunk of indexes. | |
void | BayesicSpace::binarizeBedLocus (const LocationWithLength &bedLocusWindow, const std::vector< char > &bedLocus, const size_t &nIndividuals, const LocationWithLength &binLocusWindow, std::vector< uint8_t > &binLocus) |
Convert a locus from .bed to binary format. | |
void | BayesicSpace::binarizeMacLocus (const std::vector< int > &macLocus, const LocationWithLength &binLocusWindow, std::vector< uint8_t > &binLocus) |
Convert a locus from a vector of minor allele counts. | |
std::vector< std::string > | BayesicSpace::getLocusNames (const std::string &bimFileName) |
Extract locus names. | |
void | BayesicSpace::parseCL (int &argc, char **argv, std::unordered_map< std::string, std::string > &cli) |
Command line parser. | |
void | BayesicSpace::extractCLinfo (const std::unordered_map< std::string, std::string > &parsedCLI, std::unordered_map< std::string, int > &intVariables, std::unordered_map< std::string, float > &floatVariables, std::unordered_map< std::string, std::string > &stringVariables) |
Extract parameters from parsed command line interface flags. | |
Auxiliary functions for variant hashing.
Definitions of class-external functions needed by hashing classes.
void BayesicSpace::binarizeBedLocus | ( | const LocationWithLength & | bedLocusWindow, |
const std::vector< char > & | bedLocus, | ||
const size_t & | nIndividuals, | ||
const LocationWithLength & | binLocusWindow, | ||
std::vector< uint8_t > & | binLocus ) |
Convert a locus from .bed to binary format.
Convert the .bed two-bit format to one-bit binary.
If the number of individuals is not divisible by eight, the last binary byte is padded with 0s.
[in] | bedLocusWindow | .bed locus window |
[in] | bedLocus | vector of .bed format bytes |
[in] | nIndividuals | number of individuals |
[in] | binLocusWindow | binary locus window |
[out] | binLocus | vector of binary format bytes |
void BayesicSpace::binarizeMacLocus | ( | const std::vector< int > & | macLocus, |
const LocationWithLength & | binLocusWindow, | ||
std::vector< uint8_t > & | binLocus ) |
Convert a locus from a vector of minor allele counts.
Convert minor allele counts to one-bit binary. Input is a vector of minor allele counts (0, 1, or 2) or -9 for missing data. Heterozygotes are assigned the major or minor allele at random; missing genotypes are assigned the major allele. The counts are checked and re-coded if necessary so that set bits represent the minor allele. This function should run faster if 0 is the major allele homozygote. While the above values are the norm, any negative number will be interpreted as missing, any odd number as 1, and any (non-0) even number as 2.
If the number of individuals is not divisible by eight, the last binary byte is padded with 0s.
[in] | macLocus | vector of minor allele counts at a locus |
[in] | binLocusWindow | window into the binary vector giving the locus index and length in bytes |
[out] | binLocus | vector of binary format bytes |
uint64_t BayesicSpace::countSetBits | ( | const std::vector< uint8_t > & | inVec | ) |
Count set bits in a vector.
Counting the set bits in a vector of bytes using Kernighan's method.
[in] | inVec | input vector |
uint64_t BayesicSpace::countSetBits | ( | const std::vector< uint8_t > & | inVec, |
const LocationWithLength & | window ) |
Count set bits in a range within a vector.
Counting the set bits in a range within a vector of bytes using Kernighan's method.
[in] | inVec | input vector |
[in] | window | vector window in bytes |
uint16_t BayesicSpace::countSetBits | ( | uint16_t | inVal | ) |
Count set bits in a 16-bit word.
Counting the set bits using Kernighan's method. Passing by value to modify the copy and also because the address is much bigger than 16 bits.
[in] | inVal | input value |
void BayesicSpace::extractCLinfo | ( | const std::unordered_map< std::string, std::string > & | parsedCLI, |
std::unordered_map< std::string, int > & | intVariables, | ||
std::unordered_map< std::string, float > & | floatVariables, | ||
std::unordered_map< std::string, std::string > & | stringVariables ) |
Extract parameters from parsed command line interface flags.
Extracts needed variable values, indexed by variable names encoded as std::string.
[in] | parsedCLI | flag values parsed from the command line |
[out] | intVariables | indexed int variables for use by main() |
[out] | floatVariables | indexed float variables for use by main() |
[out] | stringVariables | indexed std::string variables for use by main() |
size_t BayesicSpace::getAvailableRAM | ( | ) |
Get available RAM.
Estimates available RAM. If procfs is mounted, uses information from there. Otherwise, sets available RAM to 2 GiB.
std::vector< std::string > BayesicSpace::getLocusNames | ( | const std::string & | bimFileName | ) |
Extract locus names.
Extract locus names from a .bim file.
[in] | bimFileName | .bim file name |
std::vector< std::pair< RowColIdx, RowColIdx > > BayesicSpace::makeChunkRanges | ( | const LocationWithLength & | startAndChunkSize, |
const size_t | nChunks ) |
Build chunk ranges.
Build ranges of row/column index pairs for each chunk for a given length of a vectorized similarity matrix.
[in] | startAndChunkSize | full matrix index and chunk size |
[in] | nChunks | number of chunks |
std::vector< size_t > BayesicSpace::makeChunkSizes | ( | const size_t & | nElements, |
const size_t & | nChunks ) |
Build chunk sizes.
Build a vector of chunk sizes. If the number of elements is not evenly divisible by the number of chunks, the remainder is spread among the first (number of elements modulo number of chunks) chunks.
[in] | nElements | number of elements |
[in] | nChunks | number of chunks |
std::pair< HashGroupItPairCount, HashGroupItPairCount > BayesicSpace::makeGroupRanges | ( | const std::vector< HashGroup > & | groupVector, |
const HashGroupItPairCount & | startHGPC, | ||
const size_t & | chunkSize ) |
Delimit a chunk of indexes.
Identifies the start and end hash table buckets (groups of loci) and indexes within them for chunked pair-wise LD estimation.
[in] | groupVector | vector of hash table buckets |
[in] | startHGPC | start group iterator with index into the group |
[in] | chunkSize | size of the chunk to be processed |
std::vector< std::pair< size_t, size_t > > BayesicSpace::makeThreadRanges | ( | const CountAndSize & | threadPoolSizes | ) |
Build thread ranges.
Build index ranges to use within each thread.
[in] | threadPoolSizes | number of threads and number of loci per thread |
uint32_t BayesicSpace::murMurHash | ( | const std::array< uint32_t, SIZE_OF_SIZET > & | key, |
const uint32_t & | seed ) |
MurMurHash of an index value.
Generates a 32-bit hash of an index value using the MurMurHash3 algorithm.
[in] | key | the key to be hashed |
[in] | seed | the seed |
uint32_t BayesicSpace::murMurHash | ( | const std::vector< size_t > & | key, |
const uint32_t & | seed ) |
MurMurHash of a vector of indexes.
Generates a 32-bit hash of an index value vector using the MurMurHash3 algorithm.
[in] | key | the key vector to be hashed |
[in] | seed | the seed |
uint32_t BayesicSpace::murMurHash | ( | const std::vector< uint16_t > & | key, |
const LocationWithLength & | keyWindow, | ||
const uint32_t & | seed ) |
MurMurHash of a vector of 16-bit unsigned integers.
Generates a 32-bit hash of a vector of uint16_t values using the MurMurHash3 algorithm.
[in] | key | the vector to be hashed |
[in] | keyWindow | the range of elements in the key to hash |
[in] | seed | the hash seed |
uint32_t BayesicSpace::murMurHash | ( | const std::vector< uint32_t > & | key, |
const uint32_t & | seed ) |
MurMurHash of a vector of 32-bit unsigned integers.
Generates a 32-bit hash of a vector of unsigned 32-bit integers using the MurMurHash3 algorithm.
[in] | key | the key vector to be hashed |
[in] | seed | the seed |
uint32_t BayesicSpace::murMurHashFinalizer | ( | const uint32_t & | inputHash | ) |
MurMurHash finalizer.
MurMurHash3 finalizer for a hash value.
[in] | inputHash | input unfinalized hash value |
uint32_t BayesicSpace::murMurHashMixer | ( | const std::array< uint32_t, SIZE_OF_SIZET > & | key, |
const uint32_t & | seed ) |
MurMurHash mixer module of an index value.
Generates an unfinalized 32-bit hash of an index value using the MurMurHash3 algorithm.
[in] | key | the key to be hashed |
[in] | seed | the seed |
void BayesicSpace::parseCL | ( | int & | argc, |
char ** | argv, | ||
std::unordered_map< std::string, std::string > & | cli ) |
Command line parser.
Maps flags to values. Flags are assumed to be of the form --flag-name value.
[in] | argc | size of the argv array |
[in] | argv | command line input array |
[out] | cli | map of flags to values |
void BayesicSpace::testBedMagicBytes | ( | const std::array< char, N_BED_TEST_BYTES > & | bytesToTest | ) |
Test .bed magic bytes.
Throws if one of the input bytes does not match the three magic values in plink .bed files.
[in] | bytesToTest | the byte set to test |