RDKit
Open-source cheminformatics and machine learning.
InfoBitRanker.h
Go to the documentation of this file.
1 // $Id$
2 //
3 // Copyright (C) 2003-2007 Greg Landrum and Rational Discovery LLC
4 // @@ All Rights Reserved @@
5 // This file is part of the RDKit.
6 // The contents are covered by the terms of the BSD license
7 // which is included in the file license.txt, found at the root
8 // of the RDKit source tree.
9 //
10 
11 #include <RDGeneral/export.h>
12 #ifndef _RD_INFORANKER_H_
13 #define _RD_INFORANKER_H_
14 
15 #include <RDGeneral/types.h>
16 #include <DataStructs/BitVects.h>
17 #include <iostream>
18 
19 /*! \brief Class used to rank bits based on a specified measure of information
20  *
21  * Basically a primitive mimic of the CombiChem "signal" functionality
22  * To use:
23  * - create an instance of this class
24  * - loop over the fingerprints in the dataset, calling the accumulateVotes
25  * method on each one
26  * - call getTopN to get the top n ranked bits
27  *
28  * Sample usage and results from the python wrapper:
29  * Here's a small set of vectors:
30  * >>> for i,bv in enumerate(bvs): print bv.ToBitString(),acts[i]
31  * ...
32  * 0001 0
33  * 0101 0
34  * 0010 1
35  * 1110 1
36  *
37  * Default ranker, using infogain:
38  * >>> ranker = InfoBitRanker(4,2)
39  * >>> for i,bv in enumerate(bvs): ranker.AccumulateVotes(bv,acts[i])
40  * ...
41  * >>> for bit,gain,n0,n1 in ranker.GetTopN(3): print
42  *int(bit),'%.3f'%gain,int(n0),int(n1)
43  * ...
44  * 3 1.000 2 0
45  * 2 1.000 0 2
46  * 0 0.311 0 1
47  *
48  * Using the biased infogain:
49  * >>> ranker = InfoBitRanker(4,2,InfoTheory.InfoType.BIASENTROPY)
50  * >>> ranker.SetBiasList((1,))
51  * >>> for i,bv in enumerate(bvs): ranker.AccumulateVotes(bv,acts[i])
52  * ...
53  * >>> for bit,gain,n0,n1 in ranker.GetTopN(3): print
54  *int(bit),'%.3f'%gain,int(n0),int(n1)
55  * ...
56  * 2 1.000 0 2
57  * 0 0.311 0 1
58  * 1 0.000 1 1
59  *
60  * A chi squared ranker is also available:
61  * >>> ranker = InfoBitRanker(4,2,InfoTheory.InfoType.CHISQUARE)
62  * >>> for i,bv in enumerate(bvs): ranker.AccumulateVotes(bv,acts[i])
63  * ...
64  * >>> for bit,gain,n0,n1 in ranker.GetTopN(3): print
65  *int(bit),'%.3f'%gain,int(n0),int(n1)
66  * ...
67  * 3 4.000 2 0
68  * 2 4.000 0 2
69  * 0 1.333 0 1
70  *
71  * As is a biased chi squared:
72  * >>> ranker = InfoBitRanker(4,2,InfoTheory.InfoType.BIASCHISQUARE)
73  * >>> ranker.SetBiasList((1,))
74  * >>> for i,bv in enumerate(bvs): ranker.AccumulateVotes(bv,acts[i])
75  * ...
76  * >>> for bit,gain,n0,n1 in ranker.GetTopN(3): print
77  *int(bit),'%.3f'%gain,int(n0),int(n1)
78  * ...
79  * 2 4.000 0 2
80  * 0 1.333 0 1
81  * 1 0.000 1 1
82  */
83 namespace RDInfoTheory {
84 typedef std::vector<RDKit::USHORT> USHORT_VECT; // a vector of unsigned short values
85 typedef std::vector<USHORT_VECT> VECT_USHORT_VECT; // a vector of USHORT_VECTs
86 
88  public:
89  /*! \brief the type of measure for information
90  * Selects the metric that is used to rank the bits
91  */
92  typedef enum {
93  ENTROPY = 1, // information gain (the constructor's default)
94  BIASENTROPY = 2, // information gain, biased using the bias list
95  CHISQUARE = 3, // chi squared
96  BIASCHISQUARE = 4 // chi squared, biased using the bias list
97  } InfoType;
98 
99  /*! \brief Constructor
100  *
101  * ARGUMENTS:
102  *
103  * - nBits: the dimension of the bit vectors or the fingerprint length
104  * - nClasses: the number of classes used in the classification problem
105  * (e.g. active,
106  * moderately active, inactive etc.). It is assumed that the
107  * classes are
108  * numbered from 0 to (nClasses - 1)
109  * - infoType: the type of information metric (defaults to ENTROPY)
110  */
111  InfoBitRanker(unsigned int nBits, unsigned int nClasses,
112  InfoType infoType = InfoBitRanker::ENTROPY)
113  : d_dims(nBits), d_classes(nClasses), d_type(infoType) {
114  d_counts.resize(0); // start from an empty count table
115  for (unsigned int i = 0; i < nClasses; i++) { // one row of counts per class
116  USHORT_VECT cCount;
117  cCount.resize(d_dims, 0); // one zeroed counter per bit
118  d_counts.push_back(cCount);
119  }
120  d_clsCount.resize(d_classes, 0); // per-class instance counters start at zero
121  d_nInst = 0; // no instances accumulated yet
122  d_top = 0; // no bits have been ranked yet
123  dp_topBits = nullptr; // no ranking storage yet
124  d_biasList.resize(0); // no bias classes by default
125  dp_maskBits = nullptr; // no mask set
126  }
127 
129  if (dp_topBits) {
130  delete[] dp_topBits;
131  }
132  if (dp_maskBits) {
133  delete dp_maskBits;
134  }
135  }
136 
137  /*! \brief Accumulate the votes for all the bits turned on in a bit vector
138  *
139  * ARGUMENTS:
140  *
141  * - bv : bit vector that supports [] operator
142  * - label : the class label for the bit vector. It is assumed that 0 <=
143  * label < nClasses
144  */
145  void accumulateVotes(const ExplicitBitVect &bv, unsigned int label);
146  void accumulateVotes(const SparseBitVect &bv, unsigned int label);
147 
148  /*! \brief Returns the top n bits ranked by the information metric
149  *
150  * This is actually the function where most of the work of ranking is
151  * happening
152  *
153  * \param num the number of top ranked bits that are required
154  *
155  * \return a pointer to an information array. The client should *not*
156  * delete this
157  */
158  double *getTopN(unsigned int num);
159 
160  /*! \brief return the number of labelled instances (examples) or fingerprints
161  * seen so far
162  *
163  */
164  unsigned int getNumInstances() const { return d_nInst; }
165 
166  /*! \brief return the number of classes
167  *
168  */
169  unsigned int getNumClasses() const { return d_classes; }
170 
171  /*! \brief Set the classes to which the entropy calculation should be biased
172  *
173  * This list contains a set of class ids used when in the BIASENTROPY (or
174  * BIASCHISQUARE) mode of ranking bits.
175  * In this mode, a bit must be correlated higher with one of the biased
176  * classes than all the
177  * other classes. For example, in a two class problem with actives and
178  * inactives, the fraction of
179  * actives that hit the bit has to be greater than the fraction of inactives
180  * that hit the bit
181  *
182  * ARGUMENTS:
183  * classList - list of class ids that we want a bias towards
184  */
185  void setBiasList(RDKit::INT_VECT &classList);
186 
187  /*! \brief Set the bits to be used as a mask
188  *
189  * If this function is called, only the bits which are present in the
190  * maskBits list will be used.
191  *
192  * ARGUMENTS:
193  * maskBits - the bits to be considered
194  */
195  void setMaskBits(RDKit::INT_VECT &maskBits);
196 
197  /*! \brief Write the top N bits to a stream
198  *
199  */
200  void writeTopBitsToStream(std::ostream *outStream) const;
201 
202  /*! \brief Write the top bits to a file
203  *
204  */
205  void writeTopBitsToFile(const std::string &fileName) const;
206 
207  private:
208  /*! \brief check if we want to compute the info content for a bit based on the
209  * bias list
210  *
211  * This is what happens here:
212  * - the fraction of items in each class that hit a particular bit is
213  * computed
214  * - the maximum of these fractions for classes that are not in the
215  * biasList is computed
216  * - If this maximum is less than the fraction for at least one of the
217  * classes in the biaslist, the bit is considered good
218  * ARGUMENTS:
219  * - resMat : the result matrix, one dimensional matrix of dimension (2*(num
220  * of classes))
221  * a 2D structure is assumed with the first row containing number
222  * of items of each class
223  * with the bit set and the second row the entries of each class
224  * with the bit turned off
225  */
226  bool BiasCheckBit(RDKit::USHORT *resMat) const;
227 
228  /*! \brief Compute the biased info entropy gain based on the bias list
229  *
230  * This is what happens here:
231  * - we call BiasCheckBit to see if the bit qualifies to compute the
232  * infocontent
233  * - If this bit is ok then we call InfoEntropyGain otherwise we return 0.0
234  *
235  * ARGUMENTS:
236  * - resMat : the result matrix, one dimensional matrix of dimension (2*(num
237  * of classes))
238  * a 2D structure is assumed with the first row containing number
239  * of items of each class
240  * with the bit set and the second row the entries of each class
241  * with the bit turned off
242  */
243  double BiasInfoEntropyGain(RDKit::USHORT *resMat) const;
244 
245  /*! \brief Compute the biased chi square value based on the bias list
246  *
247  * This is what happens here:
248  * - we call BiasCheckBit to see if the bit qualifies to compute the
249  * infocontent
250  * - If this bit is ok then the chi square value is computed (NOTE(review): the original text named InfoEntropyGain here - confirm against the implementation) otherwise we return 0.0
251  *
252  * ARGUMENTS:
253  * - resMat : the result matrix, one dimensional matrix of dimension (2*(num
254  * of classes))
255  * a 2D structure is assumed with the first row containing number
256  * of items of each class
257  * with the bit set and the second row the entries of each class
258  * with the bit turned off
259  */
260  double BiasChiSquareGain(RDKit::USHORT *resMat) const;
261 
262  unsigned int d_dims; // the number of bits in the fingerprints
263  unsigned int d_classes; // the number of classes (active, inactive,
264  // moderately active etc.)
265  InfoType d_type; // the type of information measure used for ranking
266  // (one of the InfoType values)
267  VECT_USHORT_VECT d_counts; // place holder of counting the number of hits for
268  // each bit for each class
269  USHORT_VECT d_clsCount; // counter for the number of instances of each class
270  double *dp_topBits; // storage for the top ranked bits and the corresponding
271  // statistics
272  unsigned int d_top; // the number of bits that have been ranked
273  unsigned int d_nInst; // total number of instances or fingerprints used
274  // accumulate votes
276  d_biasList; // if we want a bias towards certain classes in ranking bits
277  ExplicitBitVect *dp_maskBits; // allows only certain bits to be considered
278 };
279 } // namespace RDInfoTheory
280 #endif
Pulls in all the BitVect classes.
a class for bit vectors that are densely occupied
void accumulateVotes(const ExplicitBitVect &bv, unsigned int label)
Accumulate the votes for all the bits turned on in a bit vector.
InfoType
the type of measure for information
Definition: InfoBitRanker.h:92
void setMaskBits(RDKit::INT_VECT &maskBits)
Set the bits to be used as a mask.
void writeTopBitsToFile(const std::string &fileName) const
Write the top bits to a file.
InfoBitRanker(unsigned int nBits, unsigned int nClasses, InfoType infoType=InfoBitRanker::ENTROPY)
Constructor.
unsigned int getNumClasses() const
return the number of classes
void accumulateVotes(const SparseBitVect &bv, unsigned int label)
unsigned int getNumInstances() const
return the number of labelled instances(examples) or fingerprints seen so far
double * getTopN(unsigned int num)
Returns the top n bits ranked by the information metric.
void writeTopBitsToStream(std::ostream *outStream) const
Write the top N bits to a stream.
void setBiasList(RDKit::INT_VECT &classList)
Set the classes to which the entropy calculation should be biased.
a class for bit vectors that are sparsely occupied.
Definition: SparseBitVect.h:34
#define RDKIT_INFOTHEORY_EXPORT
Definition: export.h:241
Class used to rank bits based on a specified measure of information.
std::vector< RDKit::USHORT > USHORT_VECT
Definition: InfoBitRanker.h:84
std::vector< USHORT_VECT > VECT_USHORT_VECT
Definition: InfoBitRanker.h:85
std::vector< int > INT_VECT
Definition: types.h:278
unsigned short USHORT
Definition: types.h:275