vash
Fast genetic similarity estimation with hash tables
Loading...
Searching...
No Matches
gvarHash.hpp
Go to the documentation of this file.
1/*
2 * Copyright (c) 2021 Anthony J. Greenberg
3 *
4 * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
5 *
6 * 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
7 *
8 * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
9 *
10 * 3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
11 *
12 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
13 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS
14 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
15 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
16 * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
17 * THE POSSIBILITY OF SUCH DAMAGE.
18 */
19
21
30#pragma once
31
32#include <cstddef>
33#include <fstream>
34#include <vector>
35#include <utility> // for std::pair
36#include <string>
37#include <thread>
38
39#include "similarityMatrix.hpp"
40
41namespace BayesicSpace {
42 struct LocationWithLength;
43 struct CountAndSize;
44 struct IndividualAndSketchCounts;
45 struct BedDataStats;
46 struct InOutFileNames;
47 struct SparsityParameters;
48 struct HashGroup;
49 struct HashGroupItPairCount;
50 class GenoTableBin;
51 class GenoTableHash;
52
59 size_t start;
61 size_t length;
62 };
63
68 struct CountAndSize {
70 size_t count;
72 size_t size;
73 };
74
81 uint32_t nIndividuals;
83 uint16_t kSketches;
84 };
85
110 std::string inputFileName;
112 std::string outputFileName;
113 };
131 struct HashGroup {
135 std::vector<uint32_t> locusIndexes;
136 };
137
144 size_t pairCount{0};
146 std::vector<HashGroup>::const_iterator hgIterator;
147 };
148
156 public:
158 GenoTableBin() : nIndividuals_{0}, nLoci_{0}, binLocusSize_{0}, nThreads_{1} {};
169 GenoTableBin(const std::string &inputFileName, const uint32_t &nIndividuals, std::string logFileName) :
170 GenoTableBin( inputFileName, nIndividuals, std::move(logFileName), std::thread::hardware_concurrency() ) {};
183 GenoTableBin(const std::string &inputFileName, const uint32_t &nIndividuals, std::string logFileName, const size_t &nThreads);
196 GenoTableBin(const std::vector<int> &maCounts, const uint32_t &nIndividuals, std::string logFileName) :
197 GenoTableBin( maCounts, nIndividuals, std::move(logFileName), std::thread::hardware_concurrency() ) {};
212 GenoTableBin(const std::vector<int> &maCounts, const uint32_t &nIndividuals, std::string logFileName, const size_t &nThreads);
213
215 GenoTableBin(const GenoTableBin &toCopy) = delete;
217 GenoTableBin& operator=(const GenoTableBin &toCopy) = delete;
222 GenoTableBin(GenoTableBin &&toMove) noexcept = default;
228 GenoTableBin& operator=(GenoTableBin &&toMove) noexcept = default;
230 ~GenoTableBin() = default;
231
238 void saveGenoBinary(const std::string &outFileName) const;
250 void allJaccardLD( const InOutFileNames &bimAndLDnames, const size_t &suggestNchunks = static_cast<size_t>(1) ) const;
255 void saveLogFile() const;
256 private:
261 std::vector<uint8_t> binGenotypes_;
263 uint32_t nIndividuals_;
265 uint32_t nLoci_;
267 size_t binLocusSize_;
269 size_t nThreads_;
271 static const size_t nMagicBytes_;
273 static const uint8_t oneBit_;
275 static const uint8_t byteSize_;
277 static const uint8_t bedGenoPerByte_;
279 static const uint8_t llWordSize_;
281 mutable std::string logMessages_;
283 std::string logFileName_;
292 void bed2binBlk_(const std::vector<char> &bedData, const std::pair<size_t, size_t> &bedLocusIndRange, const LocationWithLength &locusSpan);
302 size_t bed2binThreaded_(const std::vector<char> &bedData, const std::vector< std::pair<size_t, size_t> > &threadRanges, const LocationWithLength &locusSpan);
311 size_t bed2bin_(const BedDataStats &locusGroupStats, std::fstream &bedStream);
319 void mac2binBlk_(const std::vector<int> &macData, const std::pair<size_t, size_t> &locusIndRange);
325 [[gnu::warn_unused_result]] SimilarityMatrix jaccardBlock_(const std::pair<RowColIdx, RowColIdx> &blockRange) const;
331 [[gnu::warn_unused_result]] SimilarityMatrix jaccardThreaded_(const std::vector< std::pair<RowColIdx, RowColIdx> > &indexPairs) const;
337 [[gnu::warn_unused_result]] JaccardPair makeJaccardPair_(const RowColIdx &rowColumn) const;
338 };
346 public:
348 GenoTableHash() : nIndividuals_{0}, kSketches_{0}, sketchSize_{0}, nLoci_{0}, locusSize_{0}, nFullWordBytes_{0}, nThreads_{1}, emptyBinIdxSeed_{0} {};
364 GenoTableHash(const std::string &inputFileName, const IndividualAndSketchCounts &indivSketchCounts, const size_t &nThreads, std::string logFileName);
379 GenoTableHash(const std::string &inputFileName, const IndividualAndSketchCounts &indivSketchCounts, std::string logFileName) :
380 GenoTableHash( inputFileName, indivSketchCounts, std::thread::hardware_concurrency(), std::move(logFileName) ) {};
398 GenoTableHash(const std::vector<int> &maCounts, const IndividualAndSketchCounts &indivSketchCounts, const size_t &nThreads, std::string logFileName);
413 GenoTableHash(const std::vector<int> &maCounts, const IndividualAndSketchCounts &indivSketchCounts, std::string logFileName) :
414 GenoTableHash( maCounts, indivSketchCounts, std::thread::hardware_concurrency(), std::move(logFileName) ) {};
415
417 GenoTableHash(const GenoTableHash &toCopy) = delete;
419 GenoTableHash& operator=(const GenoTableHash &toCopy) = delete;
424 GenoTableHash(GenoTableHash &&toMove) noexcept = default;
430 GenoTableHash& operator=(GenoTableHash &&toMove) noexcept = default;
432 ~GenoTableHash() = default;
433
449 void allHashLD( const float &similarityCutOff, const InOutFileNames &bimAndLDnames, const size_t &suggestNchunks = static_cast<size_t>(1) ) const;
461 [[gnu::warn_unused_result]] std::vector<HashGroup> makeLDgroups(const size_t &nRowsPerBand) const;
470 void makeLDgroups(const size_t &nRowsPerBand, const InOutFileNames &bimAndGroupNames) const;
484 void ldInGroups(const SparsityParameters &sparsityValues, const InOutFileNames &bimAndLDnames, const size_t &suggestNchunks = static_cast<size_t>(1) ) const;
489 void saveLogFile() const;
490 private:
495 std::vector<uint16_t> sketches_;
497 uint32_t nIndividuals_;
499 uint16_t kSketches_;
501 uint32_t sketchSize_;
503 uint32_t nLoci_;
505 uint32_t locusSize_;
507 size_t nFullWordBytes_;
509 size_t nThreads_;
515 uint64_t emptyBinIdxSeed_;
517 mutable std::string logMessages_;
519 std::string logFileName_;
521 static const size_t nMagicBytes_;
523 static const uint8_t oneBit_;
525 static const uint8_t byteSize_;
527 static const uint8_t bedGenoPerByte_;
529 static const uint8_t llWordSize_;
531 static const uint32_t roundMask_;
533 static const uint64_t allBitsSet_;
535 static const size_t wordSizeInBits_;
537 static const uint16_t emptyBinToken_;
547 void permuteBits_(const std::vector<size_t> &permutationIdx, std::vector<uint8_t> &binLocus) const;
556 void locusOPH_(const size_t &locusInd, const std::vector<size_t> &permutation, std::vector<uint8_t> &binLocus);
567 void bed2ophBlk_(const std::vector<char> &bedData, const std::pair<size_t, size_t> &bedLocusIndRange, const LocationWithLength &bedLocusSpan,
568 const std::vector<size_t> &permutation, const std::vector< std::pair<size_t, size_t> > &padIndiv);
580 size_t bed2ophThreaded_(const std::vector<char> &bedData, const std::vector< std::pair<size_t, size_t> > &threadRanges, const LocationWithLength &bedLocusSpan,
581 const std::vector<size_t> &permutation, const std::vector< std::pair<size_t, size_t> > &padIndiv);
592 size_t bed2oph_(const BedDataStats &locusGroupStats, std::fstream &bedStream, const std::vector<size_t> &permutation,
593 const std::vector< std::pair<size_t, size_t> > &padIndiv);
604 void mac2ophBlk_(const std::vector<int> &macData, const std::pair<size_t, size_t> &blockRange,
605 const std::vector<size_t> &permutation, const std::vector< std::pair<size_t, size_t> > &padIndiv);
617 size_t mac2ophThreaded_(const std::vector<int> &macData, const std::vector< std::pair<size_t, size_t> > &threadRanges, const LocationWithLength &locusBlock,
618 const std::vector<size_t> &permutation, const std::vector< std::pair<size_t, size_t> > &padIndiv);
629 [[gnu::warn_unused_result]] SimilarityMatrix hashJacBlock_(const std::pair<RowColIdx, RowColIdx> &blockRange, const std::vector<uint32_t> &locusIndexes, const float &similarityCutOff) const;
639 [[gnu::warn_unused_result]] SimilarityMatrix hashJacBlock_(const std::pair<HashGroupItPairCount, HashGroupItPairCount> &blockRange, const float &similarityCutOff) const;
650 [[gnu::warn_unused_result]] SimilarityMatrix hashJacThreaded_(const std::vector< std::pair<RowColIdx, RowColIdx> > &indexPairs,
651 const std::vector<uint32_t> &locusIndexes, const float &similarityCutOff) const;
661 [[gnu::warn_unused_result]] SimilarityMatrix hashJacThreaded_(const std::vector< std::pair<HashGroupItPairCount, HashGroupItPairCount> > &blockRanges, const float &similarityCutOff) const;
667 [[gnu::warn_unused_result]] JaccardPair makeJaccardPair_(const RowColIdx &rowColumn) const noexcept;
668 };
669}
Class to store binary compressed genotype tables.
Definition gvarHash.hpp:155
void allJaccardLD(const InOutFileNames &bimAndLDnames, const size_t &suggestNchunks=static_cast< size_t >(1)) const
All by all Jaccard similarity LD with locus names.
GenoTableBin(const std::vector< int > &maCounts, const uint32_t &nIndividuals, std::string logFileName)
Constructor with count vector.
Definition gvarHash.hpp:196
void saveLogFile() const
Save the log to a file.
GenoTableBin & operator=(const GenoTableBin &toCopy)=delete
Copy assignment operator (deleted)
GenoTableBin(GenoTableBin &&toMove) noexcept=default
Move constructor.
GenoTableBin()
Default constructor.
Definition gvarHash.hpp:158
GenoTableBin & operator=(GenoTableBin &&toMove) noexcept=default
Move assignment operator.
void saveGenoBinary(const std::string &outFileName) const
Save the binary genotype file.
GenoTableBin(const std::string &inputFileName, const uint32_t &nIndividuals, std::string logFileName, const size_t &nThreads)
Constructor with input file name and thread count.
GenoTableBin(const GenoTableBin &toCopy)=delete
Copy constructor (deleted)
GenoTableBin(const std::vector< int > &maCounts, const uint32_t &nIndividuals, std::string logFileName, const size_t &nThreads)
Constructor with count vector and thread count.
GenoTableBin(const std::string &inputFileName, const uint32_t &nIndividuals, std::string logFileName)
Constructor with input file name.
Definition gvarHash.hpp:169
~GenoTableBin()=default
Destructor.
Class to store compressed genotype tables.
Definition gvarHash.hpp:345
GenoTableHash(const std::vector< int > &maCounts, const IndividualAndSketchCounts &indivSketchCounts, const size_t &nThreads, std::string logFileName)
Constructor with count vector and thread number.
GenoTableHash(const std::vector< int > &maCounts, const IndividualAndSketchCounts &indivSketchCounts, std::string logFileName)
Constructor with count vector.
Definition gvarHash.hpp:413
GenoTableHash(const std::string &inputFileName, const IndividualAndSketchCounts &indivSketchCounts, std::string logFileName)
Constructor with input file name.
Definition gvarHash.hpp:379
GenoTableHash(const std::string &inputFileName, const IndividualAndSketchCounts &indivSketchCounts, const size_t &nThreads, std::string logFileName)
Constructor with input file name and thread number.
std::vector< HashGroup > makeLDgroups(const size_t &nRowsPerBand) const
Assign groups by linkage disequilibrium (LD)
void allHashLD(const float &similarityCutOff, const InOutFileNames &bimAndLDnames, const size_t &suggestNchunks=static_cast< size_t >(1)) const
All by all LD from hashes.
GenoTableHash(const GenoTableHash &toCopy)=delete
Copy constructor (deleted)
GenoTableHash & operator=(const GenoTableHash &toCopy)=delete
Copy assignment operator (deleted)
GenoTableHash(GenoTableHash &&toMove) noexcept=default
Move constructor.
~GenoTableHash()=default
Destructor.
void ldInGroups(const SparsityParameters &sparsityValues, const InOutFileNames &bimAndLDnames, const size_t &suggestNchunks=static_cast< size_t >(1)) const
LD in groups.
void makeLDgroups(const size_t &nRowsPerBand, const InOutFileNames &bimAndGroupNames) const
Assign groups by LD and save to a file with locus names.
GenoTableHash & operator=(GenoTableHash &&toMove) noexcept=default
Move assignment operator.
void saveLogFile() const
Save the log to a file.
GenoTableHash()
Default constructor.
Definition gvarHash.hpp:348
Similarity matrix.
Definition similarityMatrix.hpp:94
Similarity matrix.
Attributes of .bed format loci.
Definition gvarHash.hpp:90
size_t nLociPerThread
Number of loci per thread.
Definition gvarHash.hpp:94
size_t firstLocusIdx
First locus index.
Definition gvarHash.hpp:92
size_t nMemChunks
Number of chunks needed to fit data into RAM.
Definition gvarHash.hpp:102
size_t nBytesToRead
Number of bytes to read from a .bed file.
Definition gvarHash.hpp:98
size_t nLociToRead
Number of loci to read from a .bed file.
Definition gvarHash.hpp:100
size_t nBytesPerLocus
Number of bytes per locus.
Definition gvarHash.hpp:96
Number of items and their size.
Definition gvarHash.hpp:68
size_t size
Item size.
Definition gvarHash.hpp:72
size_t count
Number of items.
Definition gvarHash.hpp:70
Hash group vector iterator and element number.
Definition gvarHash.hpp:142
std::vector< HashGroup >::const_iterator hgIterator
HashGroup vector iterator
Definition gvarHash.hpp:146
size_t pairCount
Number of pairs already processed.
Definition gvarHash.hpp:144
Hash-derived group.
Definition gvarHash.hpp:131
std::vector< uint32_t > locusIndexes
Indexes of loci in the current groups.
Definition gvarHash.hpp:135
uint64_t cumulativeNpairs
Cumulative number of previously considered pairs.
Definition gvarHash.hpp:133
Input and output file names.
Definition gvarHash.hpp:108
std::string inputFileName
Input file name.
Definition gvarHash.hpp:110
std::string outputFileName
Output file name.
Definition gvarHash.hpp:112
Number of individuals and sketches.
Definition gvarHash.hpp:79
uint32_t nIndividuals
Number of individuals.
Definition gvarHash.hpp:81
uint16_t kSketches
Number of sketches.
Definition gvarHash.hpp:83
Pair of integers to calculate Jaccard similarity.
Definition similarityMatrix.hpp:61
Window location and extent.
Definition gvarHash.hpp:57
size_t length
Window size.
Definition gvarHash.hpp:61
size_t start
Window start index.
Definition gvarHash.hpp:59
Row and column index pair.
Definition similarityMatrix.hpp:47
LD matrix sparsity parameters.
Definition gvarHash.hpp:120
float similarityCutOff
Similarity cut-off value for saving pairs.
Definition gvarHash.hpp:124
size_t nRowsPerBand
Number of rows in a band of a banded hash.
Definition gvarHash.hpp:122