RDKit
Open-source cheminformatics and machine learning.
SparseIntVect.h
Go to the documentation of this file.
1 // $Id$
2 //
3 // Copyright (C) 2007-2008 Greg Landrum
4 //
5 // @@ All Rights Reserved @@
6 // This file is part of the RDKit.
7 // The contents are covered by the terms of the BSD license
8 // which is included in the file license.txt, found at the root
9 // of the RDKit source tree.
10 //
11 #include <RDGeneral/export.h>
12 #ifndef __RD_SPARSE_INT_VECT_20070921__
13 #define __RD_SPARSE_INT_VECT_20070921__
14 
15 #include <map>
16 #include <string>
17 #include <RDGeneral/Invariant.h>
18 #include <sstream>
19 #include <RDGeneral/Exceptions.h>
20 #include <RDGeneral/StreamOps.h>
21 #include <cstdint>
22 
24  0x0001; //!< version number to use in pickles
25 namespace RDKit {
26 //! a class for efficiently storing sparse vectors of ints
27 template <typename IndexType>
29  public:
30  typedef std::map<IndexType, int> StorageType;
31 
32  SparseIntVect() : d_length(0) {}
33 
34  //! initialize with a particular length
35  SparseIntVect(IndexType length) : d_length(length) {}
36 
37  //! Copy constructor
39  d_length = other.d_length;
40  d_data.clear();
41  d_data.insert(other.d_data.begin(), other.d_data.end());
42  }
43 
44  //! constructor from a pickle
45  SparseIntVect(const std::string &pkl) {
46  initFromText(pkl.c_str(), pkl.size());
47  }
48  //! constructor from a pickle
49  SparseIntVect(const char *pkl, const unsigned int len) {
50  initFromText(pkl, len);
51  }
52 
54  if (this == &other) {
55  return *this;
56  }
57  d_length = other.d_length;
58  d_data.clear();
59  d_data.insert(other.d_data.begin(), other.d_data.end());
60  return *this;
61  }
62 
63  //! destructor (doesn't need to do anything)
64  ~SparseIntVect() = default;
65 
66 #ifdef __clang__
67 #pragma clang diagnostic push
68 #pragma clang diagnostic ignored "-Wtautological-compare"
69 #elif (defined(__GNUC__) || defined(__GNUG__)) && \
70  (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ > 1))
71 #if (__GNUC__ > 4 || __GNUC_MINOR__ > 5)
72 #pragma GCC diagnostic push
73 #endif
74 #pragma GCC diagnostic ignored "-Wtype-limits"
75 #endif
76  //! return the value at an index
77  int getVal(IndexType idx) const {
78  if (idx < 0 || idx >= d_length) {
79  throw IndexErrorException(static_cast<int>(idx));
80  }
81  int res = 0;
82  typename StorageType::const_iterator iter = d_data.find(idx);
83  if (iter != d_data.end()) {
84  res = iter->second;
85  }
86  return res;
87  }
88 
89  //! set the value at an index
90  void setVal(IndexType idx, int val) {
91  if (idx < 0 || idx >= d_length) {
92  throw IndexErrorException(static_cast<int>(idx));
93  }
94  if (val != 0) {
95  d_data[idx] = val;
96  } else {
97  d_data.erase(idx);
98  }
99  }
100 #ifdef __clang__
101 #pragma clang diagnostic pop
102 #elif (defined(__GNUC__) || defined(__GNUG__)) && \
103  (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ > 5))
104 #pragma GCC diagnostic pop
105 #endif
106  //! support indexing using []
107  int operator[](IndexType idx) const { return getVal(idx); }
108 
109  //! returns the length
110  IndexType getLength() const { return d_length; }
111 
112  //! returns the sum of all the elements in the vect
113  //! the doAbs argument toggles summing the absolute values of the elements
114  int getTotalVal(bool doAbs = false) const {
115  int res = 0;
116  typename StorageType::const_iterator iter;
117  for (iter = d_data.begin(); iter != d_data.end(); ++iter) {
118  if (!doAbs) {
119  res += iter->second;
120  } else {
121  res += abs(iter->second);
122  }
123  }
124  return res;
125  }
126  //! returns the length
127  unsigned int size() const { return getLength(); }
128 
129  //! returns our nonzero elements as a map(IndexType->int)
130  const StorageType &getNonzeroElements() const { return d_data; }
131 
132  //! this is a "fuzzy" intesection, the final value
133  //! of each element is equal to the minimum from
134  //! the two vects.
136  if (other.d_length != d_length) {
137  throw ValueErrorException("SparseIntVect size mismatch");
138  }
139 
140  typename StorageType::iterator iter = d_data.begin();
141  typename StorageType::const_iterator oIter = other.d_data.begin();
142  while (iter != d_data.end()) {
143  // we're relying on the fact that the maps are sorted:
144  while (oIter != other.d_data.end() && oIter->first < iter->first) {
145  ++oIter;
146  }
147  if (oIter != other.d_data.end() && oIter->first == iter->first) {
148  // found it:
149  if (oIter->second < iter->second) {
150  iter->second = oIter->second;
151  }
152  ++oIter;
153  ++iter;
154  } else {
155  // not there; our value is zero, which means
156  // we should remove this value:
157  typename StorageType::iterator tmpIter = iter;
158  ++tmpIter;
159  d_data.erase(iter);
160  iter = tmpIter;
161  }
162  }
163  return *this;
164  }
166  const SparseIntVect<IndexType> &other) const {
167  SparseIntVect<IndexType> res(*this);
168  return res &= other;
169  }
170 
171  //! this is a "fuzzy" union, the final value
172  //! of each element is equal to the maximum from
173  //! the two vects.
175  if (other.d_length != d_length) {
176  throw ValueErrorException("SparseIntVect size mismatch");
177  }
178 
179  typename StorageType::iterator iter = d_data.begin();
180  typename StorageType::const_iterator oIter = other.d_data.begin();
181  while (iter != d_data.end()) {
182  // we're relying on the fact that the maps are sorted:
183  while (oIter != other.d_data.end() && oIter->first < iter->first) {
184  d_data[oIter->first] = oIter->second;
185  ++oIter;
186  }
187  if (oIter != other.d_data.end() && oIter->first == iter->first) {
188  // found it:
189  if (oIter->second > iter->second) {
190  iter->second = oIter->second;
191  }
192  ++oIter;
193  }
194  ++iter;
195  }
196  // finish up the other vect:
197  while (oIter != other.d_data.end()) {
198  d_data[oIter->first] = oIter->second;
199  ++oIter;
200  }
201  return *this;
202  }
204  const SparseIntVect<IndexType> &other) const {
205  SparseIntVect<IndexType> res(*this);
206  return res |= other;
207  }
208 
210  if (other.d_length != d_length) {
211  throw ValueErrorException("SparseIntVect size mismatch");
212  }
213  typename StorageType::iterator iter = d_data.begin();
214  typename StorageType::const_iterator oIter = other.d_data.begin();
215  while (oIter != other.d_data.end()) {
216  while (iter != d_data.end() && iter->first < oIter->first) {
217  ++iter;
218  }
219  if (iter != d_data.end() && oIter->first == iter->first) {
220  // found it:
221  iter->second += oIter->second;
222  if (!iter->second) {
223  typename StorageType::iterator tIter = iter;
224  ++tIter;
225  d_data.erase(iter);
226  iter = tIter;
227  } else {
228  ++iter;
229  }
230  } else {
231  d_data[oIter->first] = oIter->second;
232  }
233  ++oIter;
234  }
235  return *this;
236  }
238  const SparseIntVect<IndexType> &other) const {
239  SparseIntVect<IndexType> res(*this);
240  return res += other;
241  }
242 
244  if (other.d_length != d_length) {
245  throw ValueErrorException("SparseIntVect size mismatch");
246  }
247  typename StorageType::iterator iter = d_data.begin();
248  typename StorageType::const_iterator oIter = other.d_data.begin();
249  while (oIter != other.d_data.end()) {
250  while (iter != d_data.end() && iter->first < oIter->first) {
251  ++iter;
252  }
253  if (iter != d_data.end() && oIter->first == iter->first) {
254  // found it:
255  iter->second -= oIter->second;
256  if (!iter->second) {
257  typename StorageType::iterator tIter = iter;
258  ++tIter;
259  d_data.erase(iter);
260  iter = tIter;
261  } else {
262  ++iter;
263  }
264  } else {
265  d_data[oIter->first] = -oIter->second;
266  }
267  ++oIter;
268  }
269  return *this;
270  }
272  const SparseIntVect<IndexType> &other) const {
273  SparseIntVect<IndexType> res(*this);
274  return res -= other;
275  }
277  typename StorageType::iterator iter = d_data.begin();
278  while (iter != d_data.end()) {
279  iter->second *= v;
280  ++iter;
281  }
282  return *this;
283  }
285  SparseIntVect<IndexType> res(*this);
286  return res *= v;
287  }
289  typename StorageType::iterator iter = d_data.begin();
290  while (iter != d_data.end()) {
291  iter->second /= v;
292  ++iter;
293  }
294  return *this;
295  }
297  SparseIntVect<IndexType> res(*this);
298  return res /= v;
299  }
301  typename StorageType::iterator iter = d_data.begin();
302  while (iter != d_data.end()) {
303  iter->second += v;
304  ++iter;
305  }
306  return *this;
307  }
309  SparseIntVect<IndexType> res(*this);
310  return res += v;
311  }
313  typename StorageType::iterator iter = d_data.begin();
314  while (iter != d_data.end()) {
315  iter->second -= v;
316  ++iter;
317  }
318  return *this;
319  }
321  SparseIntVect<IndexType> res(*this);
322  return res -= v;
323  }
324 
325  bool operator==(const SparseIntVect<IndexType> &v2) const {
326  if (d_length != v2.d_length) {
327  return false;
328  }
329  return d_data == v2.d_data;
330  }
331  bool operator!=(const SparseIntVect<IndexType> &v2) const {
332  return !(*this == v2);
333  }
334 
335  //! returns a binary string representation (pickle)
336  std::string toString() const {
337  std::stringstream ss(std::ios_base::binary | std::ios_base::out |
338  std::ios_base::in);
339  std::uint32_t tInt;
341  streamWrite(ss, tInt);
342  tInt = sizeof(IndexType);
343  streamWrite(ss, tInt);
344  streamWrite(ss, d_length);
345  IndexType nEntries = d_data.size();
346  streamWrite(ss, nEntries);
347 
348  typename StorageType::const_iterator iter = d_data.begin();
349  while (iter != d_data.end()) {
350  streamWrite(ss, iter->first);
351  std::int32_t tInt = iter->second;
352  streamWrite(ss, tInt);
353  ++iter;
354  }
355  return ss.str();
356  }
357 
358  void fromString(const std::string &txt) {
359  initFromText(txt.c_str(), txt.length());
360  }
361 
362  private:
363  IndexType d_length;
364  StorageType d_data;
365 
366  void initFromText(const char *pkl, const unsigned int len) {
367  d_data.clear();
368  std::stringstream ss(std::ios_base::binary | std::ios_base::out |
369  std::ios_base::in);
370  ss.write(pkl, len);
371 
372  std::uint32_t vers;
373  streamRead(ss, vers);
374  if (vers == 0x0001) {
375  std::uint32_t tInt;
376  streamRead(ss, tInt);
377  if (tInt > sizeof(IndexType)) {
378  throw ValueErrorException(
379  "IndexType cannot accommodate index size in SparseIntVect pickle");
380  }
381  switch (tInt) {
382  case sizeof(char):
383  readVals<unsigned char>(ss);
384  break;
385  case sizeof(std::int32_t):
386  readVals<std::uint32_t>(ss);
387  break;
388  case sizeof(boost::int64_t):
389  readVals<boost::uint64_t>(ss);
390  break;
391  default:
392  throw ValueErrorException("unreadable format");
393  }
394  } else {
395  throw ValueErrorException("bad version in SparseIntVect pickle");
396  }
397  }
398  template <typename T>
399  void readVals(std::stringstream &ss) {
400  PRECONDITION(sizeof(T) <= sizeof(IndexType), "invalid size");
401  T tVal;
402  streamRead(ss, tVal);
403  d_length = tVal;
404  T nEntries;
405  streamRead(ss, nEntries);
406  for (T i = 0; i < nEntries; ++i) {
407  streamRead(ss, tVal);
408  std::int32_t val;
409  streamRead(ss, val);
410  d_data[tVal] = val;
411  }
412  }
413 };
414 
415 template <typename IndexType, typename SequenceType>
417  const SequenceType &seq) {
418  typename SequenceType::const_iterator seqIt;
419  for (seqIt = seq.begin(); seqIt != seq.end(); ++seqIt) {
420  // EFF: probably not the most efficient approach
421  IndexType idx = *seqIt;
422  vect.setVal(idx, vect.getVal(idx) + 1);
423  }
424 }
425 
426 namespace {
427 template <typename IndexType>
428 void calcVectParams(const SparseIntVect<IndexType> &v1,
429  const SparseIntVect<IndexType> &v2, double &v1Sum,
430  double &v2Sum, double &andSum) {
431  if (v1.getLength() != v2.getLength()) {
432  throw ValueErrorException("SparseIntVect size mismatch");
433  }
434  v1Sum = v2Sum = andSum = 0.0;
435  // we're doing : (v1&v2).getTotalVal(), but w/o generating
436  // the other vector:
437  typename SparseIntVect<IndexType>::StorageType::const_iterator iter1, iter2;
438  iter1 = v1.getNonzeroElements().begin();
439  if (iter1 != v1.getNonzeroElements().end()) {
440  v1Sum += abs(iter1->second);
441  }
442  iter2 = v2.getNonzeroElements().begin();
443  if (iter2 != v2.getNonzeroElements().end()) {
444  v2Sum += abs(iter2->second);
445  }
446  while (iter1 != v1.getNonzeroElements().end()) {
447  while (iter2 != v2.getNonzeroElements().end() &&
448  iter2->first < iter1->first) {
449  ++iter2;
450  if (iter2 != v2.getNonzeroElements().end()) {
451  v2Sum += abs(iter2->second);
452  }
453  }
454  if (iter2 != v2.getNonzeroElements().end()) {
455  if (iter2->first == iter1->first) {
456  if (abs(iter2->second) < abs(iter1->second)) {
457  andSum += abs(iter2->second);
458  } else {
459  andSum += abs(iter1->second);
460  }
461  ++iter2;
462  if (iter2 != v2.getNonzeroElements().end()) {
463  v2Sum += abs(iter2->second);
464  }
465  }
466  ++iter1;
467  if (iter1 != v1.getNonzeroElements().end()) {
468  v1Sum += abs(iter1->second);
469  }
470  } else {
471  break;
472  }
473  }
474  if (iter1 != v1.getNonzeroElements().end()) {
475  ++iter1;
476  while (iter1 != v1.getNonzeroElements().end()) {
477  v1Sum += abs(iter1->second);
478  ++iter1;
479  }
480  }
481  if (iter2 != v2.getNonzeroElements().end()) {
482  ++iter2;
483  while (iter2 != v2.getNonzeroElements().end()) {
484  v2Sum += abs(iter2->second);
485  ++iter2;
486  }
487  }
488 }
489 } // namespace
490 
491 template <typename IndexType>
493  const SparseIntVect<IndexType> &v2,
494  bool returnDistance = false, double bounds = 0.0) {
495  if (v1.getLength() != v2.getLength()) {
496  throw ValueErrorException("SparseIntVect size mismatch");
497  }
498  double v1Sum = 0.0;
499  double v2Sum = 0.0;
500  if (!returnDistance && bounds > 0.0) {
501  v1Sum = v1.getTotalVal(true);
502  v2Sum = v2.getTotalVal(true);
503  double denom = v1Sum + v2Sum;
504  if (fabs(denom) < 1e-6) {
505  // no need to worry about returnDistance here
506  return 0.0;
507  }
508  double minV = v1Sum < v2Sum ? v1Sum : v2Sum;
509  if (2. * minV / denom < bounds) {
510  return 0.0;
511  }
512  v1Sum = 0.0;
513  v2Sum = 0.0;
514  }
515 
516  double numer = 0.0;
517 
518  calcVectParams(v1, v2, v1Sum, v2Sum, numer);
519 
520  double denom = v1Sum + v2Sum;
521  double sim;
522  if (fabs(denom) < 1e-6) {
523  sim = 0.0;
524  } else {
525  sim = 2. * numer / denom;
526  }
527  if (returnDistance) {
528  sim = 1. - sim;
529  }
530  // std::cerr<<" "<<v1Sum<<" "<<v2Sum<<" " << numer << " " << sim <<std::endl;
531  return sim;
532 }
533 
534 template <typename IndexType>
536  const SparseIntVect<IndexType> &v2, double a, double b,
537  bool returnDistance = false, double bounds = 0.0) {
538  RDUNUSED_PARAM(bounds);
539  if (v1.getLength() != v2.getLength()) {
540  throw ValueErrorException("SparseIntVect size mismatch");
541  }
542  double v1Sum = 0.0;
543  double v2Sum = 0.0;
544  double andSum = 0.0;
545 
546  calcVectParams(v1, v2, v1Sum, v2Sum, andSum);
547 
548  double denom = a * v1Sum + b * v2Sum + (1 - a - b) * andSum;
549  double sim;
550 
551  if (fabs(denom) < 1e-6) {
552  sim = 0.0;
553  } else {
554  sim = andSum / denom;
555  }
556  if (returnDistance) {
557  sim = 1. - sim;
558  }
559  // std::cerr<<" "<<v1Sum<<" "<<v2Sum<<" " << numer << " " << sim <<std::endl;
560  return sim;
561 }
562 
563 template <typename IndexType>
565  const SparseIntVect<IndexType> &v2,
566  bool returnDistance = false, double bounds = 0.0) {
567  return TverskySimilarity(v1, v2, 1.0, 1.0, returnDistance, bounds);
568 }
569 } // namespace RDKit
570 
571 #endif
#define RDUNUSED_PARAM(x)
Definition: Invariant.h:196
#define PRECONDITION(expr, mess)
Definition: Invariant.h:109
const int ci_SPARSEINTVECT_VERSION
version number to use in pickles
Definition: SparseIntVect.h:23
Class to allow us to throw an IndexError from C++ and have it make it back to Python.
Definition: Exceptions.h:20
a class for efficiently storing sparse vectors of ints
Definition: SparseIntVect.h:28
SparseIntVect< IndexType > & operator+=(int v)
SparseIntVect< IndexType > & operator/(int v)
SparseIntVect(IndexType length)
initialize with a particular length
Definition: SparseIntVect.h:35
unsigned int size() const
returns the length
const SparseIntVect< IndexType > operator+(const SparseIntVect< IndexType > &other) const
SparseIntVect< IndexType > & operator*(int v)
SparseIntVect< IndexType > & operator+=(const SparseIntVect< IndexType > &other)
bool operator==(const SparseIntVect< IndexType > &v2) const
SparseIntVect(const SparseIntVect< IndexType > &other)
Copy constructor.
Definition: SparseIntVect.h:38
~SparseIntVect()=default
destructor (doesn't need to do anything)
SparseIntVect< IndexType > & operator|=(const SparseIntVect< IndexType > &other)
const SparseIntVect< IndexType > operator-(const SparseIntVect< IndexType > &other) const
const SparseIntVect< IndexType > operator|(const SparseIntVect< IndexType > &other) const
SparseIntVect< IndexType > & operator/=(int v)
const SparseIntVect< IndexType > operator&(const SparseIntVect< IndexType > &other) const
SparseIntVect(const char *pkl, const unsigned int len)
constructor from a pickle
Definition: SparseIntVect.h:49
int operator[](IndexType idx) const
support indexing using []
void fromString(const std::string &txt)
SparseIntVect< IndexType > & operator*=(int v)
void setVal(IndexType idx, int val)
set the value at an index
Definition: SparseIntVect.h:90
SparseIntVect< IndexType > & operator&=(const SparseIntVect< IndexType > &other)
SparseIntVect & operator=(const SparseIntVect< IndexType > &other)
Definition: SparseIntVect.h:53
std::string toString() const
returns a binary string representation (pickle)
int getTotalVal(bool doAbs=false) const
SparseIntVect< IndexType > & operator-(int v)
std::map< IndexType, int > StorageType
Definition: SparseIntVect.h:30
SparseIntVect< IndexType > & operator-=(const SparseIntVect< IndexType > &other)
bool operator!=(const SparseIntVect< IndexType > &v2) const
SparseIntVect< IndexType > & operator-=(int v)
SparseIntVect(const std::string &pkl)
constructor from a pickle
Definition: SparseIntVect.h:45
SparseIntVect< IndexType > & operator+(int v)
IndexType getLength() const
returns the length
int getVal(IndexType idx) const
return the value at an index
Definition: SparseIntVect.h:77
const StorageType & getNonzeroElements() const
returns our nonzero elements as a map(IndexType->int)
Class to allow us to throw a ValueError from C++ and have it make it back to Python.
Definition: Exceptions.h:40
Std stuff.
Definition: Abbreviations.h:19
double TverskySimilarity(const SparseIntVect< IndexType > &v1, const SparseIntVect< IndexType > &v2, double a, double b, bool returnDistance=false, double bounds=0.0)
void updateFromSequence(SparseIntVect< IndexType > &vect, const SequenceType &seq)
double TanimotoSimilarity(const SparseIntVect< IndexType > &v1, const SparseIntVect< IndexType > &v2, bool returnDistance=false, double bounds=0.0)
double DiceSimilarity(const SparseIntVect< IndexType > &v1, const SparseIntVect< IndexType > &v2, bool returnDistance=false, double bounds=0.0)
void streamRead(std::istream &ss, T &loc)
does a binary read of an object from a stream
Definition: StreamOps.h:282
void streamWrite(std::ostream &ss, const T &val)
does a binary write of an object to a stream
Definition: StreamOps.h:260