RDKit
Open-source cheminformatics and machine learning.
BitOps.h
Go to the documentation of this file.
1 //
2 // Copyright (C) 2003-2012 greg Landrum and Rational Discovery LLC
3 //
4 // @@ All Rights Reserved @@
5 // This file is part of the RDKit.
6 // The contents are covered by the terms of the BSD license
7 // which is included in the file license.txt, found at the root
8 // of the RDKit source tree.
9 //
10 #include <RDGeneral/export.h>
11 #ifndef __RD_BITOPS_H__
12 #define __RD_BITOPS_H__
13 /*! \file BitOps.h
14 
15  \brief Contains general bit-comparison and similarity operations.
16 
17  The notation used to document the similarity metrics is:
18  - \c V1_n: number of bits in vector 1
19  - \c V1_o: number of on bits in vector 1
20  - <tt>(V1&V2)_o</tt>: number of on bits in the intersection of vectors 1 and
21  2
22 
23  */
24 
#include "BitVects.h"
#include <memory>
#include <string>
27 
//! general purpose wrapper for calculating the similarity between two bvs
//! that may be of unequal size (will automatically fold as appropriate)
/*!
  \param bv1 the first bit vector
  \param bv2 the second bit vector
  \param metric the similarity function to apply once both vectors have the
         same length
  \param returnDistance (optional) if set, <tt>1 - metric(...)</tt> is
         returned instead of the similarity itself

  \return the value produced by \c metric, or one minus that value when
          \c returnDistance is set

  <b>Note:</b> when the lengths differ, the longer vector is folded by the
  integer ratio of the two lengths before \c metric is called; the inputs
  themselves are never modified.
*/
template <typename T>
double SimilarityWrapper(const T& bv1, const T& bv2,
                         double (*metric)(const T&, const T&),
                         bool returnDistance = false) {
  double res = 0.0;
  if (bv1.getNumBits() > bv2.getNumBits()) {
    // FoldFingerprint() hands back an owning raw pointer; holding it in a
    // unique_ptr releases the folded copy even if metric() throws.
    const std::unique_ptr<T> folded(
        FoldFingerprint(bv1, bv1.getNumBits() / bv2.getNumBits()));
    res = metric(*folded, bv2);
  } else if (bv2.getNumBits() > bv1.getNumBits()) {
    const std::unique_ptr<T> folded(
        FoldFingerprint(bv2, bv2.getNumBits() / bv1.getNumBits()));
    res = metric(bv1, *folded);
  } else {
    // same length: no folding required
    res = metric(bv1, bv2);
  }
  if (returnDistance) {
    res = 1.0 - res;
  }
  return res;
}
//! \overload
/*!
  Variant for metrics that take two extra weighting arguments.

  \param bv1 the first bit vector
  \param bv2 the second bit vector
  \param a first weight, passed through to \c metric unchanged
  \param b second weight, passed through to \c metric unchanged
  \param metric the similarity function to apply once both vectors have the
         same length
  \param returnDistance (optional) if set, <tt>1 - metric(...)</tt> is
         returned instead of the similarity itself

  \return the value produced by \c metric, or one minus that value when
          \c returnDistance is set

  <b>Note:</b> when the lengths differ, the longer vector is folded by the
  integer ratio of the two lengths before \c metric is called.
*/
template <typename T>
double SimilarityWrapper(const T& bv1, const T& bv2, double a, double b,
                         double (*metric)(const T&, const T&, double, double),
                         bool returnDistance = false) {
  double res = 0.0;
  if (bv1.getNumBits() > bv2.getNumBits()) {
    // FoldFingerprint() hands back an owning raw pointer; holding it in a
    // unique_ptr releases the folded copy even if metric() throws.
    const std::unique_ptr<T> folded(
        FoldFingerprint(bv1, bv1.getNumBits() / bv2.getNumBits()));
    res = metric(*folded, bv2, a, b);
  } else if (bv2.getNumBits() > bv1.getNumBits()) {
    const std::unique_ptr<T> folded(
        FoldFingerprint(bv2, bv2.getNumBits() / bv1.getNumBits()));
    res = metric(bv1, *folded, a, b);
  } else {
    // same length: no folding required
    res = metric(bv1, bv2, a, b);
  }
  if (returnDistance) {
    res = 1.0 - res;
  }
  return res;
}
73 
74 RDKIT_DATASTRUCTS_EXPORT bool AllProbeBitsMatch(const char* probe,
75  const char* ref);
76 RDKIT_DATASTRUCTS_EXPORT bool AllProbeBitsMatch(const std::string& probe,
77  const std::string& ref);
78 RDKIT_DATASTRUCTS_EXPORT bool AllProbeBitsMatch(const ExplicitBitVect& probe,
79  const ExplicitBitVect& ref);
80 
81 template <typename T1>
82 RDKIT_DATASTRUCTS_EXPORT bool AllProbeBitsMatch(const T1& probe,
83  const std::string& pkl);
84 
85 template <typename T1>
86 RDKIT_DATASTRUCTS_EXPORT bool AllProbeBitsMatch(const T1& probe, const T1& ref);
87 
88 //! returns the number of on bits in common between two bit vectors
89 /*!
90  \return (bv1&bv2)_o
91 */
92 template <typename T1, typename T2>
93 RDKIT_DATASTRUCTS_EXPORT int NumOnBitsInCommon(const T1& bv1, const T2& bv2);
94 
95 RDKIT_DATASTRUCTS_EXPORT int NumOnBitsInCommon(const ExplicitBitVect& bv1,
96  const ExplicitBitVect& bv2);
97 
98 //! returns the Tanimoto similarity between two bit vects
99 /*!
100  \return <tt>(bv1&bv2)_o / [bv1_o + bv2_o - (bv1&bv2)_o]</tt>
101 */
102 template <typename T1, typename T2>
103 RDKIT_DATASTRUCTS_EXPORT double TanimotoSimilarity(const T1& bv1,
104  const T2& bv2);
105 
106 //! returns the Cosine similarity between two bit vects
107 /*!
108  \return <tt>(bv1&bv2)_o / sqrt(bv1_o * bv2_o)</tt>
109 */
110 template <typename T1, typename T2>
111 RDKIT_DATASTRUCTS_EXPORT double CosineSimilarity(const T1& bv1, const T2& bv2);
112 
113 //! returns the Kulczynski similarity between two bit vects
114 /*!
115  \return <tt>(bv1&bv2)_o * [bv1_o + bv2_o] / [2 * bv1_o * bv2_o]</tt>
116 */
117 template <typename T1, typename T2>
118 RDKIT_DATASTRUCTS_EXPORT double KulczynskiSimilarity(const T1& bv1,
119  const T2& bv2);
120 
121 //! returns the Dice similarity between two bit vects
122 /*!
123  \return <tt>2*(bv1&bv2)_o / [bv1_o + bv2_o]</tt>
124 */
125 template <typename T1, typename T2>
126 RDKIT_DATASTRUCTS_EXPORT double DiceSimilarity(const T1& bv1, const T2& bv2);
127 
128 //! returns the Tversky similarity between two bit vects
129 /*!
130  \return <tt>(bv1&bv2)_o / [a*bv1_o + b*bv2_o + (1 - a - b)*(bv1&bv2)_o]</tt>
131 
132  Notes:
133  # 0 <= a,b <= 1
134  # Tversky(a=1,b=1) = Tanimoto
135  # Tversky(a=1/2,b=1/2) = Dice
136 
137 */
138 template <typename T1, typename T2>
139 RDKIT_DATASTRUCTS_EXPORT double TverskySimilarity(const T1& bv1, const T2& bv2,
140  double a, double b);
141 
142 //! returns the Sokal similarity between two bit vects
143 /*!
144  \return <tt>(bv1&bv2)_o / [2*bv1_o + 2*bv2_o - 3*(bv1&bv2)_o]</tt>
145 */
146 template <typename T1, typename T2>
147 RDKIT_DATASTRUCTS_EXPORT double SokalSimilarity(const T1& bv1, const T2& bv2);
148 
149 //! returns the McConnaughey similarity between two bit vects
150 /*!
151  \return <tt>[(bv1&bv2)_o * (bv1_o + bv2_o) - (bv1_o * bv2_o)] / (bv1_o *
152  bv2_o)</tt>
153 */
154 template <typename T1, typename T2>
155 RDKIT_DATASTRUCTS_EXPORT double McConnaugheySimilarity(const T1& bv1,
156  const T2& bv2);
157 
158 //! returns the Asymmetric similarity between two bit vects
159 /*!
160  \return <tt>(bv1&bv2)_o / min(bv1_o,bv2_o)</tt>
161 */
162 template <typename T1, typename T2>
163 RDKIT_DATASTRUCTS_EXPORT double AsymmetricSimilarity(const T1& bv1,
164  const T2& bv2);
165 
166 //! returns the Braun-Blanquet similarity between two bit vects
167 /*!
168  \return <tt>(bv1&bv2)_o / max(bv1_o,bv2_o)</tt>
169 */
170 template <typename T1, typename T2>
171 RDKIT_DATASTRUCTS_EXPORT double BraunBlanquetSimilarity(const T1& bv1,
172  const T2& bv2);
173 
174 //! returns the Russel similarity between two bit vects
175 /*!
176  \return <tt>(bv1&bv2)_o / bv1_o</tt>
177 
178  <b>Note:</b> that this operation is non-commutative:
179  RusselSimilarity(bv1,bv2) != RusselSimilarity(bv2,bv1)
180 
181 */
182 template <typename T1, typename T2>
183 RDKIT_DATASTRUCTS_EXPORT double RusselSimilarity(const T1& bv1, const T2& bv2);
184 
185 //! returns the Rogot-Goldberg similarity between two bit vects
186 /*!
187  \return <tt>(bv1&bv2)_o / (bv1_o + bv2_o)
188  + (bv1_n - bv1_o - bv2_o + (bv1&bv2)_o) / (2*bv1_n - bv1_o - bv2_o) </tt>
189 */
190 template <typename T1, typename T2>
191 RDKIT_DATASTRUCTS_EXPORT double RogotGoldbergSimilarity(const T1& bv1,
192  const T2& bv2);
193 
194 //! returns the on bit similarity between two bit vects
195 /*!
196  \return <tt>(bv1&bv2)_o / (bv1|bv2)_o </tt>
197 */
198 template <typename T1, typename T2>
199 RDKIT_DATASTRUCTS_EXPORT double OnBitSimilarity(const T1& bv1, const T2& bv2);
200 
201 //! returns the number of common bits (on and off) between two bit vects
202 /*!
203  \return <tt>bv1_n - (bv1^bv2)_o</tt>
204 */
205 template <typename T1, typename T2>
206 RDKIT_DATASTRUCTS_EXPORT int NumBitsInCommon(const T1& bv1, const T2& bv2);
207 
208 RDKIT_DATASTRUCTS_EXPORT int NumBitsInCommon(const ExplicitBitVect& bv1,
209  const ExplicitBitVect& bv2);
210 
211 //! returns the common-bit similarity (on and off) between two bit vects
212 //! This is also called Manhattan similarity.
213 /*!
214  \return <tt>[bv1_n - (bv1^bv2)_o] / bv1_n</tt>
215 */
216 template <typename T1, typename T2>
217 RDKIT_DATASTRUCTS_EXPORT double AllBitSimilarity(const T1& bv1, const T2& bv2);
218 
219 //! returns an IntVect with indices of all on bits in common between two bit
220 /// vects
221 template <typename T1, typename T2>
222 RDKIT_DATASTRUCTS_EXPORT IntVect OnBitsInCommon(const T1& bv1, const T2& bv2);
223 
224 //! returns an IntVect with indices of all off bits in common between two bit
225 /// vects
226 template <typename T1, typename T2>
227 RDKIT_DATASTRUCTS_EXPORT IntVect OffBitsInCommon(const T1& bv1, const T2& bv2);
228 
229 //! returns the on-bit projected similarities between two bit vects
230 /*!
231  \return two values, as a DoubleVect:
232  - <tt>(bv1&bv2)_o / bv1_o</tt>
233  - <tt>(bv1&bv2)_o / bv2_o</tt>
234 */
235 template <typename T1, typename T2>
236 RDKIT_DATASTRUCTS_EXPORT DoubleVect OnBitProjSimilarity(const T1& bv1,
237  const T2& bv2);
238 
239 //! returns the off-bit projected similarities between two bit vects
240 /*!
241  \return two values, as a DoubleVect:
242  - <tt>[bv1_n - (bv1|bv2)_o] / [bv1_n - bv1_o]</tt>
243  - <tt>[bv2_n - (bv1|bv2)_o] / [bv2_n - bv2_o]</tt>
244 
245  <b>Note:</b> <tt>bv1_n = bv2_n</tt>
246 
247 */
248 template <typename T1, typename T2>
249 RDKIT_DATASTRUCTS_EXPORT DoubleVect OffBitProjSimilarity(const T1& bv1,
250  const T2& bv2);
251 
252 //! folds a bit vector \c factor times and returns the result
253 /*!
254  \param bv1 the vector to be folded
255  \param factor (optional) the number of times to fold it
256 
257  \return a pointer to the folded fingerprint, which is
258  <tt>bv1_n/factor</tt> long.
259 
260  <b>Note:</b> The caller is responsible for <tt>delete</tt>ing the result.
261  */
262 template <typename T1>
263 RDKIT_DATASTRUCTS_EXPORT T1* FoldFingerprint(const T1& bv1,
264  unsigned int factor = 2);
265 
266 //! returns a text representation of a bit vector (a string of 0s and 1s)
267 /*!
268  \param bv1 the vector to use
269 
270  \return an std::string
271 
272  */
273 template <typename T1>
274 RDKIT_DATASTRUCTS_EXPORT std::string BitVectToText(const T1& bv1);
275 
276 //! returns a hex representation of a bit vector compatible with Andrew Dalke's
277 /// FPS format
278 /*!
279  \param bv1 the vector to use
280 
281  \return an std::string
282 
283  */
284 template <typename T1>
285 RDKIT_DATASTRUCTS_EXPORT std::string BitVectToFPSText(const T1& bv1);
286 
287 //! returns a binary string representation of a bit vector (an array of bytes)
288 /*!
289  \param bv1 the vector to use
290 
291  \return an std::string
292 
293  */
294 template <typename T1>
295 RDKIT_DATASTRUCTS_EXPORT std::string BitVectToBinaryText(const T1& bv1);
296 
297 //! updates a bit vector from Andrew Dalke's FPS format
298 /*!
299  \param bv1 the vector to use
300  \param fps the FPS hex string
301 
302 
303  */
304 template <typename T1>
305 RDKIT_DATASTRUCTS_EXPORT void UpdateBitVectFromFPSText(T1& bv1,
306  const std::string& fps);
307 
308 //! updates a bit vector from a binary string representation of a bit vector (an
309 /// array of bytes)
310 /*!
311  \param bv1 the vector to use
312  \param fps the binary string
313 
314 
315  */
316 template <typename T1>
317 RDKIT_DATASTRUCTS_EXPORT void UpdateBitVectFromBinaryText(
318  T1& bv1, const std::string& fps);
319 
320 // FIX: docs and tests please
321 
322 RDKIT_DATASTRUCTS_EXPORT unsigned int CalcBitmapPopcount(
323  const unsigned char* bv1, unsigned int nBytes);
324 
325 RDKIT_DATASTRUCTS_EXPORT double CalcBitmapTanimoto(const unsigned char* bv1,
326  const unsigned char* bv2,
327  unsigned int nBytes);
328 RDKIT_DATASTRUCTS_EXPORT double CalcBitmapDice(const unsigned char* bv1,
329  const unsigned char* bv2,
330  unsigned int nBytes);
331 RDKIT_DATASTRUCTS_EXPORT double CalcBitmapTversky(const unsigned char* bv1,
332  const unsigned char* bv2,
333  unsigned int nBytes,
334  double ca, double cb);
335 RDKIT_DATASTRUCTS_EXPORT bool CalcBitmapAllProbeBitsMatch(
336  const unsigned char* probe, const unsigned char* ref, unsigned int nBytes);
337 #endif
RDKIT_DATASTRUCTS_EXPORT std::string BitVectToBinaryText(const T1 &bv1)
returns a binary string representation of a bit vector (an array of bytes)
RDKIT_DATASTRUCTS_EXPORT double McConnaugheySimilarity(const T1 &bv1, const T2 &bv2)
returns the McConnaughey similarity between two bit vects
RDKIT_DATASTRUCTS_EXPORT int NumBitsInCommon(const T1 &bv1, const T2 &bv2)
returns the number of common bits (on and off) between two bit vects
RDKIT_DATASTRUCTS_EXPORT double AsymmetricSimilarity(const T1 &bv1, const T2 &bv2)
returns the Asymmetric similarity between two bit vects
RDKIT_DATASTRUCTS_EXPORT bool AllProbeBitsMatch(const char *probe, const char *ref)
RDKIT_DATASTRUCTS_EXPORT unsigned int CalcBitmapPopcount(const unsigned char *bv1, unsigned int nBytes)
double SimilarityWrapper(const T &bv1, const T &bv2, double(*metric)(const T &, const T &), bool returnDistance=false)
Definition: BitOps.h:31
RDKIT_DATASTRUCTS_EXPORT std::string BitVectToFPSText(const T1 &bv1)
RDKIT_DATASTRUCTS_EXPORT double TverskySimilarity(const T1 &bv1, const T2 &bv2, double a, double b)
returns the Tversky similarity between two bit vects
RDKIT_DATASTRUCTS_EXPORT double RusselSimilarity(const T1 &bv1, const T2 &bv2)
returns the Russel similarity between two bit vects
RDKIT_DATASTRUCTS_EXPORT double AllBitSimilarity(const T1 &bv1, const T2 &bv2)
RDKIT_DATASTRUCTS_EXPORT std::string BitVectToText(const T1 &bv1)
returns a text representation of a bit vector (a string of 0s and 1s)
RDKIT_DATASTRUCTS_EXPORT double BraunBlanquetSimilarity(const T1 &bv1, const T2 &bv2)
returns the Braun-Blanquet similarity between two bit vects
RDKIT_DATASTRUCTS_EXPORT double OnBitSimilarity(const T1 &bv1, const T2 &bv2)
returns the on bit similarity between two bit vects
RDKIT_DATASTRUCTS_EXPORT bool CalcBitmapAllProbeBitsMatch(const unsigned char *probe, const unsigned char *ref, unsigned int nBytes)
RDKIT_DATASTRUCTS_EXPORT IntVect OnBitsInCommon(const T1 &bv1, const T2 &bv2)
RDKIT_DATASTRUCTS_EXPORT T1 * FoldFingerprint(const T1 &bv1, unsigned int factor=2)
folds a bit vector factor times and returns the result
RDKIT_DATASTRUCTS_EXPORT double SokalSimilarity(const T1 &bv1, const T2 &bv2)
returns the Sokal similarity between two bit vects
RDKIT_DATASTRUCTS_EXPORT double CosineSimilarity(const T1 &bv1, const T2 &bv2)
returns the Cosine similarity between two bit vects
RDKIT_DATASTRUCTS_EXPORT double DiceSimilarity(const T1 &bv1, const T2 &bv2)
returns the Dice similarity between two bit vects
RDKIT_DATASTRUCTS_EXPORT double KulczynskiSimilarity(const T1 &bv1, const T2 &bv2)
returns the Kulczynski similarity between two bit vects
RDKIT_DATASTRUCTS_EXPORT double RogotGoldbergSimilarity(const T1 &bv1, const T2 &bv2)
returns the Rogot-Goldberg similarity between two bit vects
RDKIT_DATASTRUCTS_EXPORT IntVect OffBitsInCommon(const T1 &bv1, const T2 &bv2)
RDKIT_DATASTRUCTS_EXPORT double CalcBitmapDice(const unsigned char *bv1, const unsigned char *bv2, unsigned int nBytes)
RDKIT_DATASTRUCTS_EXPORT double TanimotoSimilarity(const T1 &bv1, const T2 &bv2)
returns the Tanimoto similarity between two bit vects
RDKIT_DATASTRUCTS_EXPORT void UpdateBitVectFromBinaryText(T1 &bv1, const std::string &fps)
RDKIT_DATASTRUCTS_EXPORT double CalcBitmapTversky(const unsigned char *bv1, const unsigned char *bv2, unsigned int nBytes, double ca, double cb)
RDKIT_DATASTRUCTS_EXPORT void UpdateBitVectFromFPSText(T1 &bv1, const std::string &fps)
updates a bit vector from Andrew Dalke's FPS format
RDKIT_DATASTRUCTS_EXPORT double CalcBitmapTanimoto(const unsigned char *bv1, const unsigned char *bv2, unsigned int nBytes)
RDKIT_DATASTRUCTS_EXPORT DoubleVect OffBitProjSimilarity(const T1 &bv1, const T2 &bv2)
returns the off-bit projected similarities between two bit vects
RDKIT_DATASTRUCTS_EXPORT int NumOnBitsInCommon(const T1 &bv1, const T2 &bv2)
returns the number of on bits in common between two bit vectors
RDKIT_DATASTRUCTS_EXPORT DoubleVect OnBitProjSimilarity(const T1 &bv1, const T2 &bv2)
returns the on-bit projected similarities between two bit vects
std::vector< int > IntVect
Definition: BitVect.h:17
std::vector< double > DoubleVect
Definition: BitVect.h:19
Pulls in all the BitVect classes.
a class for bit vectors that are densely occupied
#define RDKIT_DATASTRUCTS_EXPORT
Definition: export.h:81