BALL 1.5.0
Loading...
Searching...
No Matches
binaryFingerprintMethods.h
Go to the documentation of this file.
1// -*- Mode: C++; tab-width: 2; -*-
2// vi: set ts=2:
3//
4
5#ifndef BALL_STRUCTURE_BINARYFINGERPRINTMETHODS_H
6#define BALL_STRUCTURE_BINARYFINGERPRINTMETHODS_H
7
8#ifndef BALL_COMMON_H
9# include <BALL/common.h>
10#endif
11
12#ifndef BALL_DATATYPE_OPTIONS_H
14#endif
15
16
17#include <boost/graph/adjacency_list.hpp>
18#include <boost/graph/graph_traits.hpp>
19#include <boost/graph/incremental_components.hpp>
20#include <boost/thread/mutex.hpp>
21#include <boost/thread/thread.hpp>
22#include <boost/unordered_set.hpp>
23
24
25#include <map>
26#include <set>
27#include <vector>
28
29
30namespace BALL
31{
67 {
68 struct InvertedIndex;
69
74
75 typedef std::vector<InvertedIndex*> InvertedIndices;
76 typedef std::vector<std::vector<unsigned short> > FingerprintFeatures;
77
78 typedef boost::adjacency_list<boost::vecS, boost::vecS, boost::undirectedS> SimilarityGraph;
79 typedef boost::graph_traits<SimilarityGraph>::vertex_descriptor Vertex;
80 typedef boost::graph_traits<SimilarityGraph>::vertices_size_type VertexIndex;
81 typedef boost::disjoint_sets<Vertex*, VertexIndex*, boost::find_with_full_path_compression> DisjointSet;
82
84
85 public:
90
95 {
99 static const String BLOCKSIZE;
100
104 static const String SIM_CUTOFF;
105
109 static const String PRECISION;
110
114 static const String STORE_NN;
115
120 static const String MAX_CLUSTERS;
121
125 static const String N_THREADS;
126
130 static const String VERBOSITY;
131 };
132
137 {
138 static const unsigned short BLOCKSIZE;
139 static const float SIM_CUTOFF;
140 static const float PRECISION;
141 static const unsigned int MAX_CLUSTERS;
142 static const bool STORE_NN;
143 static const unsigned int N_THREADS;
144 static const int VERBOSITY;
145 };
146
147
152
157
158
164
165
171 BinaryFingerprintMethods(const Options& options, const FingerprintFeatures& lib_features);
172
173
180 BinaryFingerprintMethods(const Options& options, const FingerprintFeatures& lib_features, const FingerprintFeatures& query_features);
181
182
187
188
193
195
200
205
207
208
218 static bool parseBinaryFingerprint(const String& fprint, std::vector<unsigned short>& features, unsigned int fp_type, const char* delim=",");
219
220
225
231 bool setLibraryFeatures(const FingerprintFeatures& lib_features);
232
233
239 bool setQueryFeatures(const FingerprintFeatures& query_features);
240
241
246 unsigned int getTargetLibrarySize() const;
247
248
253 unsigned int getQueryLibrarySize() const;
254
255
260 const Options& getOptions() const;
261
262
267 void setVerbosityLevel(const int verbosity);
268
270
271
276
282 bool cutoffSearch(const float sim_cutoff, const String& outfile_name);
283
284
295 bool connectedComponents(const std::vector<unsigned int>& selection,
296 std::vector<std::vector<unsigned int> >& ccs,
297 std::vector<std::vector<std::pair<unsigned int, float> > >& nn_data,
298 const float cutoff,
299 const bool store_nns = false);
300
301
309 bool averageLinkageClustering(const std::vector<unsigned int>& selection,
310 std::vector<std::pair<unsigned int, float> >& nn_data,
311 std::map<unsigned int, std::vector<unsigned int> >& cluster_selection);
312
313
321 bool calculateSelectionMedoid(const std::vector<unsigned int>& selection, unsigned int& medoid_index, std::vector<float>& avg_sims);
322
324
325 private:
329 struct ThreadData
330 {
331 LongSize first;
332 LongSize last;
333
334 float nn_sim;
335 unsigned int blocksize;
336 unsigned int dataset_size;
337 unsigned int active_iids_size;
338
339 unsigned short* cc_row;
340 unsigned short** cc_matrix;
341
342 float* float_array;
343 double** dprec_matrix;
344 unsigned int* uint_array;
345
346 String outfile_name;
347 };
348
349
355 struct FeatureList
356 {
357 // FeatureID which corresponds in principle to the index of a bit in a 2D fingerprint.
358 unsigned short feature_id;
359
360 // Array which stores the sorted list of all molecules which all share the associated feature_id.
361 // Molecules are only represented by their positions in the Block they belong to.
362 unsigned short* block_positions;
363 };
364
365
369 struct InvertedIndex
370 {
371 // Number of molecules in the block.
372 unsigned int n_molecules;
373
374 // Array which stores the number of features of every molecule in the InvertedIndex
375 unsigned short* n_features;
376
377 // Array which stores the ID of the parent cluster to which the molecule at the InvertedIndex position belongs to.
378 unsigned int* parent_clusters;
379
380 // Array of FeatureLists implemented as a skip list.
381 FeatureList* feature_skip_list;
382 };
383
384
388 struct Cluster
389 {
390 // Molecule ID for leaf nodes. Otherwise internal Cluster ID.
391 unsigned int c_id;
392
393 // Cluster index [0, n_molecules-1] is used to map from 2D matrix to 1D array.
394 unsigned int c_index;
395
396 // Number of leaf nodes inside in the corresponding cluster.
397 unsigned int c_size;
398
399 // Left child. NULL if cluster is a leaf.
400 Cluster* l_child;
401
402 // Right child. NULL if cluster is a leaf.
403 Cluster* r_child;
404
405 // Pointer to the clusters current nearest neigbour.
406 Cluster* nn;
407
408 // Flag for various purposes.
409 bool is_rnn;
410
411 // Sum of all intra-cluster pairwise similarities within this cluster. Used finally for KGS penalty calculation.
412 double sim_sum;
413
414 // Pointer to predecessor cluster in the Nearest Neighbour Chain. NULL if not member of the Nearest Neighbour Chain.
415 Cluster* predecessor;
416
417 // Similarity to predecessor in Nearest Neighbour Chain. 0.0 if not member of Nearest Neighbour Chain.
418 float predecessor_sim;
419
420 // Sum of all inter-cluster pairwise similarities of clusters nn_chain[i] and nn_chain[i-1]. 0.0 if not member of the NNChain.
421 double predecessor_sim_sum;
422
423 // Inverted indices of all leaf molecules within this cluster.
424 boost::unordered_set<InvertedIndex*> c_members;
425
426 // Vector of pointers to FingerprintFeatures in lib_features_ of all leaf molecules within this cluster.
427 std::vector<const std::vector<unsigned short>* > leaf_members;
428 };
429
430
431 enum clustering_methods_
432 {
433 STORED_DATA_PARALLEL,
434 STORED_MATRIX
435 };
436
437
441 DisjointSet* ds;
442
443
447 std::vector<Vertex> parent;
448
449
453 std::vector<VertexIndex> rank;
454
455
459 Options options_;
460
461
465 unsigned int n_threads_;
466
467
472 const FingerprintFeatures* lib_features_;
473
474
478 InvertedIndices lib_iindices_;
479
480
485 const FingerprintFeatures* query_features_;
486
487
491 InvertedIndices query_iindices_;
492
493
497 boost::thread* threads_;
498
499
503 boost::mutex out_mutex_;
504
505
509 ThreadData* thread_data_;
510
511
515 unsigned short blocksize_;
516
517
521 LongSize cc_matrix_size_;
522
523
527 float cutoff_;
528
529
533 float precision_;
534
535
539 bool store_nns_;
540
541
547 LongSize n_comparisons_;
548 LongSize n_comparisons_backup_;
549
550
554 int verbosity_;
555
556
560 std::vector<Cluster*> leaf_clusters_;
561
562
566 std::map<float, std::vector<Cluster*> > internal_clusters_;
567
568
572 std::vector<Cluster*> vec_actives_;
573
577 std::vector<Cluster*> vec_inactives_;
578
579
583 float* sim_matrix_;
584
588 double* dprec_sim_matrix_;
589
590
594 std::vector<std::pair<std::vector<InvertedIndex*>, unsigned int> > active_iids_;
595
596
601 unsigned int active_iids_size_;
602
603
607 Cluster* nn_chain_tip_;
608
609
613 unsigned int nn_chain_size_;
614
615
619 std::vector<Cluster*>::iterator current_nn_;
620
621
625 float current_nn_sim_;
626
627
631 unsigned short clustering_method_;
632
633
637 unsigned int n_clusters_;
638
639
644 unsigned int max_clusters_;
645
646
651 void setup(const Options& options);
652
653
658 void clear();
659
660
665 void assign(const BinaryFingerprintMethods& bfm);
666
667
673 bool checkInputData(const std::vector<unsigned int>& selection) const;
674
675
682 void createThreadData(const unsigned int blocksize, const unsigned int dataset_size, const unsigned int active_iids_size);
683
684
688 void destroyThreadData();
689
690
696 InvertedIndex* createInvertedIndex(const std::vector<std::pair<const std::vector<unsigned short>*, unsigned int> >& members);
697
698
703 void destroyInvertedIndex(InvertedIndex* ii);
704
705
710 void destroyInvertedIndices(InvertedIndices& ii_destroy);
711
712
718 void createInvertedIndices(const std::vector<std::pair<const std::vector<unsigned short>*, unsigned int> >& molecules, InvertedIndices& ii_target);
719
720
725 void setBlockSize(const unsigned short blocksize);
726
727
734 void arrayToUpperTriangluarMatrix(LongSize& row, LongSize& col, const LongSize array_index) const;
735
736
743 void arrayToStrictUpperTriangularMatrix(LongSize& row, LongSize& col, const LongSize array_index) const;
744
745
752 LongSize strictUpperTriangularMatrixToArray(const LongSize row, const LongSize col) const;
753
754
760 bool getNextComparisonIndex(LongSize& index);
761
762
771 bool checkSimilaritySwitch(const float a_sim, const float b_sim, const unsigned int a_id, const unsigned int b_id) const;
772
773
780 void calculateCommonCounts_1_1(const FeatureList* ii1, const FeatureList* ii2, unsigned short& cc_count);
781
782
789 void calculateCommonCounts_1_N(const FeatureList* ii1, const FeatureList* ii2, unsigned short* cc_row);
790
791
798 void calculateCommonCounts_M_N(const InvertedIndex* ii_1, const InvertedIndex* ii_2, unsigned short** cc_matrix);
799
800
808 void cutoffSearchSimilarities(const unsigned int query_index, const unsigned int lib_index, unsigned short** cc_matrix, File& outfile);
809
810
815 void cutoffSearchThread(const unsigned int thread_id);
816
817
824 typedef void (BinaryFingerprintMethods::*PairwiseSimilaritiesBase)(const unsigned int ii1_index, const unsigned int ii2_index, ThreadData* t_data);
825 PairwiseSimilaritiesBase pairwiseSimilaritiesBase;
826
827
834 void pairwiseSimilaritiesNearestNeighbours(const unsigned int ii1_index, const unsigned int ii2_index, ThreadData* t_data);
835
836
843 void pairwiseSimilaritiesStoredMatrix(const unsigned int ii1_index, const unsigned int ii2_index, ThreadData* t_data);
844
845
853 void pairwiseSimilaritiesConnectedComponents(const unsigned int ii1_index, const unsigned int ii2_index, ThreadData* t_data);
854
855
862 void pairwiseSimilaritiesMedoids(const unsigned int ii1_index, const unsigned int ii2_index, ThreadData* t_data);
863
864
871 bool pairwiseSimilarities(const std::vector<unsigned int>& selection, std::vector<std::pair<unsigned int, float> >& nn_data);
872
873
878 void pairwiseSimilaritiesThread(const unsigned int thread_id);
879
880
888 void calculateParallelSimilaritiesActives(const InvertedIndex* ii1, const InvertedIndex* ii2, unsigned short** cc_matrix, double** sim_matrix);
889
890
895 void calculateParallelSimilaritiesActivesThread(const unsigned int thread_id);
896
897
905 void calculateParallelSimilarities(const InvertedIndex* ii1, const InvertedIndex* ii2, unsigned short** cc_matrix, double** sim_matrix);
906
907
912 void calculateParallelSimilaritiesThread(const unsigned int thread_id);
913
914
921 void similarityUpdateAverageLinkage(const Cluster* merged_cluster, const Cluster* current);
922
928 void similarityUpdateAverageLinkageThread(const unsigned int thread_id, const Cluster* merged_cluster);
929
930
938 void clusterSimilaritySum_1_N(const InvertedIndex* ii1, const InvertedIndex* ii2, const unsigned short* cc_row, double& sim_sum);
939
940
948 void clusterSimilaritySum_M_N(const InvertedIndex* ii1, const InvertedIndex* ii2, unsigned short** cc_matrix, double& sim_sum);
949
950
955 void similarityMatrixFromClustersThread(const unsigned int thread_id);
956
957
963 void averageLinkageParallel(Cluster*& root);
964
965
971 void NNChainCore(Cluster*& root);
972
973
979 void clusterSelectionKGS(std::map<unsigned int, std::vector<unsigned int> >& cluster_selection);
980
981
987 void enumerateClusterMembers(Cluster* cl, unsigned int cluster_id);
988
989
993 void nextNearestNeighbour();
994
995
999 void moveNearestNeighbour();
1000
1001
1009 Cluster* mergeClusters(Cluster* c1, Cluster* c2, double sim_sum);
1010
1011
1016 Cluster* createCluster();
1017
1021 void switchStorageMethod();
1022
1023
1027 void finalizeClustering();
1028 };
1029} // namespace BALL
1030
1031#endif // BALL_STRUCTURE_BINARYFINGERPRINTMETHODS_H
BALL_ULONG64_TYPE LongSize
This class provides efficient similarity calculation functionality for 2D binary fingerprints.
BinaryFingerprintMethods(const Options &options)
bool setQueryFeatures(const FingerprintFeatures &query_features)
bool calculateSelectionMedoid(const std::vector< unsigned int > &selection, unsigned int &medoid_index, std::vector< float > &avg_sims)
void setVerbosityLevel(const int verbosity)
static bool parseBinaryFingerprint(const String &fprint, std::vector< unsigned short > &features, unsigned int fp_type, const char *delim=",")
BinaryFingerprintMethods(const Options &options, const FingerprintFeatures &lib_features)
bool cutoffSearch(const float sim_cutoff, const String &outfile_name)
const Options & getOptions() const
BinaryFingerprintMethods(const BinaryFingerprintMethods &bfm)
bool connectedComponents(const std::vector< unsigned int > &selection, std::vector< std::vector< unsigned int > > &ccs, std::vector< std::vector< std::pair< unsigned int, float > > > &nn_data, const float cutoff, const bool store_nns=false)
BinaryFingerprintMethods(const Options &options, const FingerprintFeatures &lib_features, const FingerprintFeatures &query_features)
unsigned int getTargetLibrarySize() const
bool averageLinkageClustering(const std::vector< unsigned int > &selection, std::vector< std::pair< unsigned int, float > > &nn_data, std::map< unsigned int, std::vector< unsigned int > > &cluster_selection)
unsigned int getQueryLibrarySize() const
bool setLibraryFeatures(const FingerprintFeatures &lib_features)
#define BALL_EXPORT