vash
Fast genetic similarity estimation with hash tables
Loading...
Searching...
No Matches
vashFunctions.hpp
Go to the documentation of this file.
1/*
2 * Copyright (c) 2023 Anthony J. Greenberg
3 *
4 * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
5 *
6 * 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
7 *
8 * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
9 *
10 * 3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
11 *
12 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
13 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS
14 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
15 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
16 * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
17 * THE POSSIBILITY OF SUCH DAMAGE.
18 */
19
21
30#pragma once
31
32#include <cstdint>
33#include <cstddef>
34#include <vector>
35#include <array>
36#include <unordered_map>
37
38#include "gvarHash.hpp"
39#include "similarityMatrix.hpp"
40
/* Free utility functions and constants of the BayesicSpace namespace (declarations only; definitions are not in this header). */
/* NOTE(review): std::string and std::pair are used below, but <string> and <utility> are not included directly by this header; presumably they arrive through gvarHash.hpp or similarityMatrix.hpp -- confirm. */
41namespace BayesicSpace {
/* Number of 32-bit values in size_t: sizeof(size_t) divided by sizeof(uint32_t). */
43 constexpr size_t SIZE_OF_SIZET{sizeof(size_t) / sizeof(uint32_t)};
/* Number of test bytes in a .bed file. */
45 constexpr size_t N_BED_TEST_BYTES{3};
/* Count set bits in a 16-bit word. The result should not be discarded (warn_unused_result). */
53 [[gnu::warn_unused_result]] uint16_t countSetBits(uint16_t inVal);
/* Overload taking a vector of bytes; presumably counts set bits over the whole vector -- confirm against the definition. */
61 [[gnu::warn_unused_result]] uint64_t countSetBits(const std::vector<uint8_t> &inVec);
/* Overload taking a vector of bytes and a window (location and extent); presumably restricts the count to that window -- confirm against the definition. */
70 [[gnu::warn_unused_result]] uint64_t countSetBits(const std::vector<uint8_t> &inVec, const LocationWithLength &window);
/* Get available RAM. NOTE(review): units of the returned value are not visible here -- confirm. */
77 [[gnu::warn_unused_result]] size_t getAvailableRAM();
/* MurMurHash mixer module of an index value. The key is an array of SIZE_OF_SIZET 32-bit values. */
87 [[gnu::warn_unused_result]] uint32_t murMurHashMixer(const std::array<uint32_t, SIZE_OF_SIZET> &key, const uint32_t &seed);
/* MurMurHash finalizer. */
96 [[gnu::warn_unused_result]] uint32_t murMurHashFinalizer(const uint32_t &inputHash);
/* MurMurHash of an index value. The key is an array of SIZE_OF_SIZET 32-bit values. */
106 [[gnu::warn_unused_result]] uint32_t murMurHash(const std::array<uint32_t, SIZE_OF_SIZET> &key, const uint32_t &seed);
/* murMurHash overload for a vector of size_t keys and a seed; how the elements are combined is defined elsewhere. */
116 [[gnu::warn_unused_result]] uint32_t murMurHash(const std::vector<size_t> &key, const uint32_t &seed);
/* murMurHash overload for a vector of uint32_t keys and a seed; how the elements are combined is defined elsewhere. */
126 [[gnu::warn_unused_result]] uint32_t murMurHash(const std::vector<uint32_t> &key, const uint32_t &seed);
/* murMurHash overload for a vector of uint16_t keys, a key window, and a seed; presumably only elements inside keyWindow are hashed -- confirm against the definition. */
137 [[gnu::warn_unused_result]] uint32_t murMurHash(const std::vector<uint16_t> &key, const LocationWithLength &keyWindow, const uint32_t &seed);
/* Test .bed magic bytes. Returns nothing; NOTE(review): how a mismatch is reported (e.g. by throwing) is not visible here -- confirm. */
144 void testBedMagicBytes(const std::array<char, N_BED_TEST_BYTES> &bytesToTest);
/* Build thread ranges from a CountAndSize (number of items and their size); returns a vector of size_t pairs. */
152 [[gnu::warn_unused_result]] std::vector< std::pair<size_t, size_t> > makeThreadRanges(const CountAndSize &threadPoolSizes);
/* Build chunk sizes for nElements elements and nChunks chunks. */
162 [[gnu::warn_unused_result]] std::vector<size_t> makeChunkSizes(const size_t &nElements, const size_t &nChunks);
/* Build chunk ranges; returns a vector of RowColIdx pairs. */
171 [[gnu::warn_unused_result]] std::vector< std::pair<RowColIdx, RowColIdx> > makeChunkRanges(const LocationWithLength &startAndChunkSize, const size_t nChunks);
/* Delimit a chunk of indexes. Takes the hash group vector, a starting HashGroupItPairCount (hash group vector iterator and element number), and a chunk size; returns a pair of HashGroupItPairCount. */
182 [[gnu::warn_unused_result]] std::pair<HashGroupItPairCount, HashGroupItPairCount>
183 makeGroupRanges(const std::vector<HashGroup> &groupVector, const HashGroupItPairCount &startHGPC, const size_t &chunkSize);
/* Convert a locus from .bed to binary format. binLocus is the only non-const parameter, so it is presumably where the result is written -- confirm. */
201 void binarizeBedLocus(const LocationWithLength &bedLocusWindow, const std::vector<char> &bedLocus, const size_t &nIndividuals,
202 const LocationWithLength &binLocusWindow, std::vector<uint8_t> &binLocus);
/* Convert a locus from a vector of minor allele counts. binLocus is the only non-const parameter, so it is presumably where the result is written -- confirm. */
217 void binarizeMacLocus(const std::vector<int> &macLocus, const LocationWithLength &binLocusWindow, std::vector<uint8_t> &binLocus);
/* Extract locus names; takes the name of a .bim file (per the parameter name). */
225 [[gnu::warn_unused_result]] std::vector<std::string> getLocusNames(const std::string &bimFileName);
/* Command line parser. cli is a non-const map from string to string, presumably filled with the parsed flags and values -- confirm. */
234 void parseCL(int &argc, char **argv, std::unordered_map<std::string, std::string> &cli);
/* Extract parameters from parsed command line interface flags into separate int, float, and string variable maps. */
244 void extractCLinfo(const std::unordered_map<std::string, std::string> &parsedCLI, std::unordered_map<std::string, int> &intVariables,
245 std::unordered_map<std::string, float> &floatVariables, std::unordered_map<std::string, std::string> &stringVariables);
246}
Summarize variant tables by hashing.
Similarity matrix.
Number of items and their size.
Definition gvarHash.hpp:68
Hash group vector iterator and element number.
Definition gvarHash.hpp:142
Window location and extent.
Definition gvarHash.hpp:57
void binarizeMacLocus(const std::vector< int > &macLocus, const LocationWithLength &binLocusWindow, std::vector< uint8_t > &binLocus)
Convert a locus from a vector of minor allele counts.
std::pair< HashGroupItPairCount, HashGroupItPairCount > makeGroupRanges(const std::vector< HashGroup > &groupVector, const HashGroupItPairCount &startHGPC, const size_t &chunkSize)
Delimit a chunk of indexes.
uint32_t murMurHash(const std::array< uint32_t, SIZE_OF_SIZET > &key, const uint32_t &seed)
MurMurHash of an index value.
std::vector< std::pair< size_t, size_t > > makeThreadRanges(const CountAndSize &threadPoolSizes)
Build thread ranges.
void binarizeBedLocus(const LocationWithLength &bedLocusWindow, const std::vector< char > &bedLocus, const size_t &nIndividuals, const LocationWithLength &binLocusWindow, std::vector< uint8_t > &binLocus)
Convert a locus from .bed to binary format.
constexpr size_t N_BED_TEST_BYTES
Number of test bytes in a .bed file.
Definition vashFunctions.hpp:45
constexpr size_t SIZE_OF_SIZET
Number of 32-bit values in size_t.
Definition vashFunctions.hpp:43
uint32_t murMurHashFinalizer(const uint32_t &inputHash)
MurMurHash finalizer.
size_t getAvailableRAM()
Get available RAM.
std::vector< std::pair< RowColIdx, RowColIdx > > makeChunkRanges(const LocationWithLength &startAndChunkSize, const size_t nChunks)
Build chunk ranges.
void parseCL(int &argc, char **argv, std::unordered_map< std::string, std::string > &cli)
Command line parser.
void extractCLinfo(const std::unordered_map< std::string, std::string > &parsedCLI, std::unordered_map< std::string, int > &intVariables, std::unordered_map< std::string, float > &floatVariables, std::unordered_map< std::string, std::string > &stringVariables)
Extract parameters from parsed command line interface flags.
std::vector< size_t > makeChunkSizes(const size_t &nElements, const size_t &nChunks)
Build chunk sizes.
std::vector< std::string > getLocusNames(const std::string &bimFileName)
Extract locus names.
uint32_t murMurHashMixer(const std::array< uint32_t, SIZE_OF_SIZET > &key, const uint32_t &seed)
MurMurHash mixer module of an index value.
void testBedMagicBytes(const std::array< char, N_BED_TEST_BYTES > &bytesToTest)
Test .bed magic bytes.
uint16_t countSetBits(uint16_t inVal)
Count set bits in a 16-bit word.