41namespace BayesicSpace {
42 struct LocationWithLength;
44 struct IndividualAndSketchCounts;
46 struct InOutFileNames;
47 struct SparsityParameters;
49 struct HashGroupItPairCount;
158 GenoTableBin() : nIndividuals_{0}, nLoci_{0}, binLocusSize_{0}, nThreads_{1} {};
169 GenoTableBin(
const std::string &inputFileName,
const uint32_t &nIndividuals, std::string logFileName) :
170 GenoTableBin( inputFileName, nIndividuals, std::move(logFileName), std::thread::hardware_concurrency() ) {};
183 GenoTableBin(
const std::string &inputFileName,
const uint32_t &nIndividuals, std::string logFileName,
const size_t &nThreads);
196 GenoTableBin(
const std::vector<int> &maCounts,
const uint32_t &nIndividuals, std::string logFileName) :
197 GenoTableBin( maCounts, nIndividuals, std::move(logFileName), std::thread::hardware_concurrency() ) {};
212 GenoTableBin(
const std::vector<int> &maCounts,
const uint32_t &nIndividuals, std::string logFileName,
const size_t &nThreads);
261 std::vector<uint8_t> binGenotypes_;
263 uint32_t nIndividuals_;
267 size_t binLocusSize_;
271 static const size_t nMagicBytes_;
273 static const uint8_t oneBit_;
275 static const uint8_t byteSize_;
277 static const uint8_t bedGenoPerByte_;
279 static const uint8_t llWordSize_;
281 mutable std::string logMessages_;
283 std::string logFileName_;
292 void bed2binBlk_(
const std::vector<char> &bedData,
const std::pair<size_t, size_t> &bedLocusIndRange,
const LocationWithLength &locusSpan);
302 size_t bed2binThreaded_(
const std::vector<char> &bedData,
const std::vector< std::pair<size_t, size_t> > &threadRanges,
const LocationWithLength &locusSpan);
311 size_t bed2bin_(
const BedDataStats &locusGroupStats, std::fstream &bedStream);
319 void mac2binBlk_(
const std::vector<int> &macData,
const std::pair<size_t, size_t> &locusIndRange);
325 [[gnu::warn_unused_result]]
SimilarityMatrix jaccardBlock_(
const std::pair<RowColIdx, RowColIdx> &blockRange)
const;
331 [[gnu::warn_unused_result]]
SimilarityMatrix jaccardThreaded_(
const std::vector< std::pair<RowColIdx, RowColIdx> > &indexPairs)
const;
348 GenoTableHash() : nIndividuals_{0}, kSketches_{0}, sketchSize_{0}, nLoci_{0}, locusSize_{0}, nFullWordBytes_{0}, nThreads_{1}, emptyBinIdxSeed_{0} {};
380 GenoTableHash( inputFileName, indivSketchCounts, std::thread::hardware_concurrency(), std::move(logFileName) ) {};
414 GenoTableHash( maCounts, indivSketchCounts, std::thread::hardware_concurrency(), std::move(logFileName) ) {};
449 void allHashLD(
const float &similarityCutOff,
const InOutFileNames &bimAndLDnames,
const size_t &suggestNchunks =
static_cast<size_t>(1) )
const;
461 [[gnu::warn_unused_result]] std::vector<HashGroup>
makeLDgroups(
const size_t &nRowsPerBand)
const;
495 std::vector<uint16_t> sketches_;
497 uint32_t nIndividuals_;
501 uint32_t sketchSize_;
507 size_t nFullWordBytes_;
515 uint64_t emptyBinIdxSeed_;
517 mutable std::string logMessages_;
519 std::string logFileName_;
521 static const size_t nMagicBytes_;
523 static const uint8_t oneBit_;
525 static const uint8_t byteSize_;
527 static const uint8_t bedGenoPerByte_;
529 static const uint8_t llWordSize_;
531 static const uint32_t roundMask_;
533 static const uint64_t allBitsSet_;
535 static const size_t wordSizeInBits_;
537 static const uint16_t emptyBinToken_;
547 void permuteBits_(
const std::vector<size_t> &permutationIdx, std::vector<uint8_t> &binLocus)
const;
556 void locusOPH_(
const size_t &locusInd,
const std::vector<size_t> &permutation, std::vector<uint8_t> &binLocus);
567 void bed2ophBlk_(
const std::vector<char> &bedData,
const std::pair<size_t, size_t> &bedLocusIndRange,
const LocationWithLength &bedLocusSpan,
568 const std::vector<size_t> &permutation,
const std::vector< std::pair<size_t, size_t> > &padIndiv);
580 size_t bed2ophThreaded_(
const std::vector<char> &bedData,
const std::vector< std::pair<size_t, size_t> > &threadRanges,
const LocationWithLength &bedLocusSpan,
581 const std::vector<size_t> &permutation,
const std::vector< std::pair<size_t, size_t> > &padIndiv);
592 size_t bed2oph_(
const BedDataStats &locusGroupStats, std::fstream &bedStream,
const std::vector<size_t> &permutation,
593 const std::vector< std::pair<size_t, size_t> > &padIndiv);
604 void mac2ophBlk_(
const std::vector<int> &macData,
const std::pair<size_t, size_t> &blockRange,
605 const std::vector<size_t> &permutation,
const std::vector< std::pair<size_t, size_t> > &padIndiv);
617 size_t mac2ophThreaded_(
const std::vector<int> &macData,
const std::vector< std::pair<size_t, size_t> > &threadRanges,
const LocationWithLength &locusBlock,
618 const std::vector<size_t> &permutation,
const std::vector< std::pair<size_t, size_t> > &padIndiv);
629 [[gnu::warn_unused_result]]
SimilarityMatrix hashJacBlock_(
const std::pair<RowColIdx, RowColIdx> &blockRange,
const std::vector<uint32_t> &locusIndexes,
const float &similarityCutOff)
const;
639 [[gnu::warn_unused_result]]
SimilarityMatrix hashJacBlock_(
const std::pair<HashGroupItPairCount, HashGroupItPairCount> &blockRange,
const float &similarityCutOff)
const;
650 [[gnu::warn_unused_result]]
SimilarityMatrix hashJacThreaded_(
const std::vector< std::pair<RowColIdx, RowColIdx> > &indexPairs,
651 const std::vector<uint32_t> &locusIndexes,
const float &similarityCutOff)
const;
661 [[gnu::warn_unused_result]]
SimilarityMatrix hashJacThreaded_(
const std::vector< std::pair<HashGroupItPairCount, HashGroupItPairCount> > &blockRanges,
const float &similarityCutOff)
const;
667 [[gnu::warn_unused_result]]
JaccardPair makeJaccardPair_(
const RowColIdx &rowColumn)
const noexcept;
Class to store binary compressed genotype tables.
Definition gvarHash.hpp:155
void allJaccardLD(const InOutFileNames &bimAndLDnames, const size_t &suggestNchunks=static_cast< size_t >(1)) const
All by all Jaccard similarity LD with locus names.
GenoTableBin(const std::vector< int > &maCounts, const uint32_t &nIndividuals, std::string logFileName)
Constructor with count vector.
Definition gvarHash.hpp:196
void saveLogFile() const
Save the log to a file.
GenoTableBin & operator=(const GenoTableBin &toCopy)=delete
Copy assignment operator (deleted)
GenoTableBin(GenoTableBin &&toMove) noexcept=default
Move constructor.
GenoTableBin()
Default constructor.
Definition gvarHash.hpp:158
GenoTableBin & operator=(GenoTableBin &&toMove) noexcept=default
Move assignment operator.
void saveGenoBinary(const std::string &outFileName) const
Save the binary genotype file.
GenoTableBin(const std::string &inputFileName, const uint32_t &nIndividuals, std::string logFileName, const size_t &nThreads)
Constructor with input file name and thread count.
GenoTableBin(const GenoTableBin &toCopy)=delete
Copy constructor (deleted)
GenoTableBin(const std::vector< int > &maCounts, const uint32_t &nIndividuals, std::string logFileName, const size_t &nThreads)
Constructor with count vector and thread count.
GenoTableBin(const std::string &inputFileName, const uint32_t &nIndividuals, std::string logFileName)
Constructor with input file name.
Definition gvarHash.hpp:169
~GenoTableBin()=default
Destructor.
Class to store compressed genotype tables.
Definition gvarHash.hpp:345
GenoTableHash(const std::vector< int > &maCounts, const IndividualAndSketchCounts &indivSketchCounts, const size_t &nThreads, std::string logFileName)
Constructor with count vector and thread number.
GenoTableHash(const std::vector< int > &maCounts, const IndividualAndSketchCounts &indivSketchCounts, std::string logFileName)
Constructor with count vector.
Definition gvarHash.hpp:413
GenoTableHash(const std::string &inputFileName, const IndividualAndSketchCounts &indivSketchCounts, std::string logFileName)
Constructor with input file name.
Definition gvarHash.hpp:379
GenoTableHash(const std::string &inputFileName, const IndividualAndSketchCounts &indivSketchCounts, const size_t &nThreads, std::string logFileName)
Constructor with input file name and thread number.
std::vector< HashGroup > makeLDgroups(const size_t &nRowsPerBand) const
Assign groups by linkage disequilibrium (LD)
void allHashLD(const float &similarityCutOff, const InOutFileNames &bimAndLDnames, const size_t &suggestNchunks=static_cast< size_t >(1)) const
All by all LD from hashes.
GenoTableHash(const GenoTableHash &toCopy)=delete
Copy constructor (deleted)
GenoTableHash & operator=(const GenoTableHash &toCopy)=delete
Copy assignment operator (deleted)
GenoTableHash(GenoTableHash &&toMove) noexcept=default
Move constructor.
~GenoTableHash()=default
Destructor.
void ldInGroups(const SparsityParameters &sparsityValues, const InOutFileNames &bimAndLDnames, const size_t &suggestNchunks=static_cast< size_t >(1)) const
LD in groups.
void makeLDgroups(const size_t &nRowsPerBand, const InOutFileNames &bimAndGroupNames) const
Assign groups by LD and save to a file with locus names.
GenoTableHash & operator=(GenoTableHash &&toMove) noexcept=default
Move assignment operator.
void saveLogFile() const
Save the log to a file.
GenoTableHash()
Default constructor.
Definition gvarHash.hpp:348
Similarity matrix.
Definition similarityMatrix.hpp:94
Attributes of .bed format loci.
Definition gvarHash.hpp:90
size_t nLociPerThread
Number of loci per thread.
Definition gvarHash.hpp:94
size_t firstLocusIdx
First locus index.
Definition gvarHash.hpp:92
size_t nMemChunks
Number of chunks needed to fit data into RAM.
Definition gvarHash.hpp:102
size_t nBytesToRead
Number of bytes to read from a .bed file.
Definition gvarHash.hpp:98
size_t nLociToRead
Number of loci to read from a .bed file.
Definition gvarHash.hpp:100
size_t nBytesPerLocus
Number of bytes per locus.
Definition gvarHash.hpp:96
Number of items and their size.
Definition gvarHash.hpp:68
size_t size
Item size.
Definition gvarHash.hpp:72
size_t count
Number of items.
Definition gvarHash.hpp:70
Hash group vector iterator and element number.
Definition gvarHash.hpp:142
std::vector< HashGroup >::const_iterator hgIterator
HashGroup vector iterator
Definition gvarHash.hpp:146
size_t pairCount
Number of pairs already processed.
Definition gvarHash.hpp:144
Hash-derived group.
Definition gvarHash.hpp:131
std::vector< uint32_t > locusIndexes
Indexes of loci in the current groups.
Definition gvarHash.hpp:135
uint64_t cumulativeNpairs
Cumulative number of previously considered pairs.
Definition gvarHash.hpp:133
Input and output file names.
Definition gvarHash.hpp:108
std::string inputFileName
Input file name.
Definition gvarHash.hpp:110
std::string outputFileName
Output file name.
Definition gvarHash.hpp:112
Number of individuals and sketches.
Definition gvarHash.hpp:79
uint32_t nIndividuals
Number of individuals.
Definition gvarHash.hpp:81
uint16_t kSketches
Number of sketches.
Definition gvarHash.hpp:83
Pair of integers to calculate Jaccard similarity.
Definition similarityMatrix.hpp:61
Window location and extent.
Definition gvarHash.hpp:57
size_t length
Window size.
Definition gvarHash.hpp:61
size_t start
Window start index.
Definition gvarHash.hpp:59
Row and column index pair.
Definition similarityMatrix.hpp:47
LD matrix sparsity parameters.
Definition gvarHash.hpp:120
float similarityCutOff
Similarity cut-off value for saving pairs.
Definition gvarHash.hpp:124
size_t nRowsPerBand
Number of rows in a band of a banded hash.
Definition gvarHash.hpp:122