RDKit
Open-source cheminformatics and machine learning.
python_streambuf.h
Go to the documentation of this file.
1 //
2 // This file is part of the CCTBX distribution:
3 // http://cctbx.sourceforge.net/
4 // Downloaded from here:
5 // http://cctbx.svn.sourceforge.net/viewvc/cctbx/trunk/boost_adaptbx/python_streambuf.h?revision=13619
6 //
7 // Copyright (c) 2006, The Regents of the University of
8 // California, through Lawrence Berkeley National Laboratory (subject to
9 // receipt of any required approvals from the U.S. Dept. of Energy). All
10 // rights reserved.
11 //
12 // The license is here:
13 // http://cctbx.svn.sourceforge.net/viewvc/cctbx/trunk/boost_adaptbx/LICENSE_2_0.txt?revision=5148
14 //
15 #include <RDGeneral/export.h>
16 #ifndef BOOST_ADAPTBX_PYTHON_STREAMBUF_H
17 #define BOOST_ADAPTBX_PYTHON_STREAMBUF_H
19 #include <boost/python/object.hpp>
20 #include <boost/python/str.hpp>
21 #include <boost/python/extract.hpp>
22 
23 #include <boost/optional.hpp>
24 #include <boost/utility/typed_in_place_factory.hpp>
26 
27 //#include <tbxx/error_utils.hpp>
28 #include <RDGeneral/Invariant.h>
29 #include <RDGeneral/Exceptions.h>
30 
31 #include <streambuf>
32 #include <iostream>
33 
34 namespace boost_adaptbx {
35 namespace python {
36 
37 namespace bp = boost::python;
38 
39 /// A stream buffer getting data from and putting data into a Python file object
40 /** The aims are as follows:
41 
42  - Given a C++ function acting on a standard stream, e.g.
43 
44  \code
45  void read_inputs(std::istream& input) {
46  ...
47  input >> something >> something_else;
48  }
49  \endcode
50 
51  and given a piece of Python code which creates a file-like object,
52  to be able to pass this file object to that C++ function, e.g.
53 
54  \code
55  import gzip
56  gzip_file_obj = gzip.GzipFile(...)
57  read_inputs(gzip_file_obj)
58  \endcode
59 
60  and have the standard stream pull data from and put data into the Python
61  file object.
62 
63  - When Python \c read_inputs() returns, the Python object is able to
64  continue reading or writing where the C++ code left off.
65 
66  - Operations in C++ on mere files should be competitively fast compared
67  to the direct use of \c std::fstream.
68 
69 
70  \b Motivation
71 
72 - the range of file-like objects offered by the standard Python library
73 (files, compressed files and archives, network, ...) is far richer than
74 the range of streams offered by the C++ standard library and the Boost C++ libraries.
75 
76  - i/o code involves a fair amount of text processing which is more
77  efficiently prototyped in Python but then one may need to rewrite
78  a time-critical part in C++, in as seamless a manner as possible.
79 
80  \b Usage
81 
82  This is 2-step:
83 
84  - a trivial wrapper function
85 
86  \code
87  using boost_adaptbx::python::streambuf;
88  void read_inputs_wrapper(streambuf& input)
89  {
90  streambuf::istream is(input);
91  read_inputs(is);
92  }
93 
94  def("read_inputs", read_inputs_wrapper);
95  \endcode
96 
97  which has to be written every time one wants a Python binding for
98  such a C++ function.
99 
100  - the Python side
101 
102  \code
103  from boost.python import streambuf
104  read_inputs(streambuf(python_file_obj=obj, buffer_size=1024))
105  \endcode
106 
107  \c buffer_size is optional. See also: \c default_buffer_size
108 
109  Note: references are to the C++ standard (the numbers between parentheses
110  at the end of references are margin markers).
111 */
112 class streambuf : public std::basic_streambuf<char> {
113  private:
114  typedef std::basic_streambuf<char> base_t;
115 
116  public:
117  /* The syntax
118  using base_t::char_type;
119  would be nicer but Visual Studio C++ 8 chokes on it
120  */
121  typedef base_t::char_type char_type;
122  typedef base_t::int_type int_type;
123  typedef base_t::pos_type pos_type;
124  typedef base_t::off_type off_type;
125  typedef base_t::traits_type traits_type;
126 
127  // work around Visual C++ 7.1 problem
128  inline static int traits_type_eof() { return traits_type::eof(); }
129 
130  /// The default size of the read and write buffer.
131 /** The read buffer holds data read from the Python file object and the
132 write buffer holds data to be written to it. This value is a constant;
133 pass a non-zero buffer_size_ to the constructor to use another size. */
134  const static std::size_t default_buffer_size = 1024;
135 
136  /// Construct from a Python file object
137  /** if buffer_size is 0 the current default_buffer_size is used.
138  */
139  streambuf(bp::object& python_file_obj, std::size_t buffer_size_ = 0)
140  : py_read(getattr(python_file_obj, "read", bp::object())),
141  py_write(getattr(python_file_obj, "write", bp::object())),
142  py_seek(getattr(python_file_obj, "seek", bp::object())),
143  py_tell(getattr(python_file_obj, "tell", bp::object())),
144  buffer_size(buffer_size_ != 0 ? buffer_size_ : default_buffer_size),
145  write_buffer(nullptr),
146  pos_of_read_buffer_end_in_py_file(0),
147  pos_of_write_buffer_end_in_py_file(buffer_size),
148  farthest_pptr(nullptr) {
149  TEST_ASSERT(buffer_size != 0);
150  /* Some Python file objects (e.g. sys.stdout and sys.stdin)
151  have non-functional seek and tell. If so, assign None to
152  py_tell and py_seek.
153  */
154  if (py_tell != bp::object()) {
155  try {
156  off_type py_pos = bp::extract<off_type>(py_tell());
157  if (py_seek != bp::object()) {
158  /* Make sure we can actually seek.
159  bzip2 readers from python have a seek method, but it fails
160  when they are in write mode.
161  */
162  py_seek(py_pos);
163  }
164  } catch (bp::error_already_set&) {
165  py_tell = bp::object();
166  py_seek = bp::object();
167  /* Boost.Python does not do any Python exception handling whatsoever
168  So we need to catch it by hand like so.
169  */
170  PyErr_Clear();
171  }
172  }
173 
174  if (py_write != bp::object()) {
175  // C-like string to make debugging easier
176  write_buffer = new char[buffer_size + 1];
177  write_buffer[buffer_size] = '\0';
178  setp(write_buffer, write_buffer + buffer_size); // 27.5.2.4.5 (5)
179  farthest_pptr = pptr();
180  } else {
181  // The first attempt at output will result in a call to overflow
182  setp(nullptr, nullptr);
183  }
184 
185  if (py_tell != bp::object()) {
186  off_type py_pos = bp::extract<off_type>(py_tell());
187  pos_of_read_buffer_end_in_py_file = py_pos;
188  pos_of_write_buffer_end_in_py_file = py_pos;
189  }
190  }
191 
192  /// constructor to enforce a mode (binary or text)
193  streambuf(bp::object& python_file_obj, char mode,
194  std::size_t buffer_size_ = 0)
195  : streambuf(python_file_obj, buffer_size_) {
196 #if 1
197  bp::object io_mod = bp::import("io");
198  CHECK_INVARIANT(io_mod, "module not found");
199  bp::object iobase = io_mod.attr("TextIOBase");
200  CHECK_INVARIANT(iobase, "base class not found");
201 #else
202  // using statics to save an undetermined amount of time results in
203  // alarming seg faults on windows. so we don't do it. Keep this here
204  // for the moment though in case someone manages to figure that out in
205  // the future
206  static bp::object io_mod = bp::object();
207  static bp::object iobase = bp::object();
208  if (!io_mod) io_mod = bp::import("io");
209  if (io_mod && !iobase) iobase = io_mod.attr("TextIOBase");
210  CHECK_INVARIANT(io_mod, "module not found");
211  CHECK_INVARIANT(iobase, "base class not found");
212 #endif
213 
214  df_isTextMode = PyObject_IsInstance(python_file_obj.ptr(), iobase.ptr());
215  switch (mode) {
216  case 's': /// yeah, is redundant, but it is somehow natural to do "s"
217  case 't':
218  if (!df_isTextMode) {
219  throw ValueErrorException(
220  "Need a text mode file object like StringIO or a file opened "
221  "with mode 't'");
222  }
223  break;
224  case 'b':
225  if (df_isTextMode) {
226  throw ValueErrorException(
227  "Need a binary mode file object like BytesIO or a file opened "
228  "with mode 'b'");
229  }
230  break;
231  default:
232  throw std::invalid_argument("bad mode character");
233  }
234  }
235 
236  /// Mundane destructor freeing the allocated resources
237  ~streambuf() override {
238  if (write_buffer) {
239  delete[] write_buffer;
240  }
241  }
242 
243  /// C.f. C++ standard section 27.5.2.4.3
244  /** It is essential to override this virtual function for the stream
245  member function readsome to work correctly (c.f. 27.6.1.3, alinea 30)
246  */
247  std::streamsize showmanyc() override {
248  int_type const failure = traits_type::eof();
249  int_type status = underflow();
250  if (status == failure) {
251  return -1;
252  }
253  return egptr() - gptr();
254  }
255 
256  /// C.f. C++ standard section 27.5.2.4.3
257  int_type underflow() override {
258  int_type const failure = traits_type::eof();
259  if (py_read == bp::object()) {
260  throw std::invalid_argument(
261  "That Python file object has no 'read' attribute");
262  }
263  read_buffer = py_read(buffer_size);
264  char* read_buffer_data;
265  bp::ssize_t py_n_read;
266  if (PyBytes_AsStringAndSize(read_buffer.ptr(), &read_buffer_data,
267  &py_n_read) == -1) {
268  setg(nullptr, nullptr, nullptr);
269  throw std::invalid_argument(
270  "The method 'read' of the Python file object "
271  "did not return a string.");
272  }
273  off_type n_read = (off_type)py_n_read;
274  pos_of_read_buffer_end_in_py_file += n_read;
275  setg(read_buffer_data, read_buffer_data, read_buffer_data + n_read);
276  // ^^^27.5.2.3.1 (4)
277  if (n_read == 0) {
278  return failure;
279  }
280  return traits_type::to_int_type(read_buffer_data[0]);
281  }
282 
283  /// C.f. C++ standard section 27.5.2.4.5
285  if (py_write == bp::object()) {
286  throw std::invalid_argument(
287  "That Python file object has no 'write' attribute");
288  }
289  farthest_pptr = std::max(farthest_pptr, pptr());
290  off_type n_written = (off_type)(farthest_pptr - pbase());
291  off_type orig_n_written = n_written;
292  const unsigned int STD_ASCII = 0x7F;
293  if (df_isTextMode && static_cast<unsigned int>(c) > STD_ASCII) {
294  // we're somewhere in the middle of a utf8 block. If we
295  // only write part of it we'll end up with an exception,
296  // so push everything that could be utf8 into the next block
297  while (n_written > 0 && static_cast<unsigned int>(
298  write_buffer[n_written - 1]) > STD_ASCII) {
299  --n_written;
300  }
301  }
302  bp::str chunk(pbase(), pbase() + n_written);
303  py_write(chunk);
304 
305  if ((!df_isTextMode || static_cast<unsigned int>(c) <= STD_ASCII) &&
306  !traits_type::eq_int_type(c, traits_type::eof())) {
307  py_write(traits_type::to_char_type(c));
308  n_written++;
309  }
310 
311  setp(pbase(), epptr());
312  // ^^^ 27.5.2.4.5 (5)
313  farthest_pptr = pptr();
314  if (n_written) {
315  pos_of_write_buffer_end_in_py_file += n_written;
316  if (df_isTextMode && static_cast<unsigned int>(c) > STD_ASCII &&
317  !traits_type::eq_int_type(c, traits_type::eof())) {
318  size_t n_to_copy = orig_n_written - n_written;
319 
320  for (size_t i = 0; i < n_to_copy; ++i) {
321  sputc(write_buffer[n_written + i]);
322  ++farthest_pptr;
323  }
324  sputc(c);
325  ++farthest_pptr;
326  }
327  }
328  return traits_type::eq_int_type(c, traits_type::eof())
329  ? traits_type::not_eof(c)
330  : c;
331  }
332 
333  /// Update the python file to reflect the state of this stream buffer
334  /** Empty the write buffer into the Python file object and set the seek
335  position of the latter accordingly (C++ standard section 27.5.2.4.2).
336  If there is no write buffer or it is empty, but there is a non-empty
337  read buffer, set the Python file object seek position to the
338  seek position in that read buffer.
339  */
340  int sync() override {
341  int result = 0;
342  farthest_pptr = std::max(farthest_pptr, pptr());
343  if (farthest_pptr && farthest_pptr > pbase()) {
344  off_type delta = pptr() - farthest_pptr;
345  int_type status = overflow();
346  if (traits_type::eq_int_type(status, traits_type::eof())) {
347  result = -1;
348  }
349  if (py_seek != bp::object()) {
350  py_seek(delta, 1);
351  }
352  } else if (gptr() && gptr() < egptr()) {
353  if (py_seek != bp::object()) {
354  py_seek(gptr() - egptr(), 1);
355  }
356  }
357  return result;
358  }
359 
360  /// C.f. C++ standard section 27.5.2.4.2
361  /** This implementation is optimised to look whether the position is within
362  the buffers, so as to avoid calling Python seek or tell. It is
363  important for many applications that the overhead of calling into Python
364  is avoided as much as possible (e.g. parsers which may do a lot of
365  backtracking)
366  */
367  pos_type seekoff(off_type off, std::ios_base::seekdir way,
368  std::ios_base::openmode which =
369  std::ios_base::in | std::ios_base::out) override {
370  /* In practice, "which" is either std::ios_base::in or out
371  since we end up here because either seekp or seekg was called
372  on the stream using this buffer. That simplifies the code
373  in a few places.
374  */
375  int const failure = off_type(-1);
376 
377  if (py_seek == bp::object()) {
378  throw std::invalid_argument(
379  "That Python file object has no 'seek' attribute");
380  }
381 
382  // we need the read buffer to contain something!
383  if (which == std::ios_base::in && !gptr()) {
384  if (traits_type::eq_int_type(underflow(), traits_type::eof())) {
385  return failure;
386  }
387  }
388 
389  // compute the whence parameter for Python seek
390  int whence;
391  switch (way) {
392  case std::ios_base::beg:
393  whence = 0;
394  break;
395  case std::ios_base::cur:
396  whence = 1;
397  break;
398  case std::ios_base::end:
399  whence = 2;
400  break;
401  default:
402  return failure;
403  }
404 
405  // Let's have a go
406  boost::optional<off_type> result =
407  seekoff_without_calling_python(off, way, which);
408  if (!result) {
409  // we need to call Python
410  if (which == std::ios_base::out) {
411  overflow();
412  }
413  if (way == std::ios_base::cur) {
414  if (which == std::ios_base::in) {
415  off -= egptr() - gptr();
416  } else if (which == std::ios_base::out) {
417  off += pptr() - pbase();
418  }
419  }
420  py_seek(off, whence);
421  result = off_type(bp::extract<off_type>(py_tell()));
422  if (which == std::ios_base::in) {
423  underflow();
424  }
425  }
426  return *result;
427  }
428 
429  /// C.f. C++ standard section 27.5.2.4.2
431  std::ios_base::openmode which =
432  std::ios_base::in | std::ios_base::out) override {
433  return streambuf::seekoff(sp, std::ios_base::beg, which);
434  }
435 
436  private:
437  bp::object py_read, py_write, py_seek, py_tell;
438 
439  std::size_t buffer_size;
440 
441  /* This is actually a Python string and the actual read buffer is
442  its internal data, i.e. an array of characters. We use a Boost.Python
443  object so as to hold on it: as a result, the actual buffer can't
444  go away.
445  */
446  bp::object read_buffer;
447 
448  /* A mere array of char's allocated on the heap at construction time and
449  de-allocated only at destruction time.
450  */
451  char* write_buffer;
452  bool df_isTextMode;
453 
454  off_type pos_of_read_buffer_end_in_py_file,
455  pos_of_write_buffer_end_in_py_file;
456 
457  // the farthest place the buffer has been written into
458  char* farthest_pptr;
459 
460  boost::optional<off_type> seekoff_without_calling_python(
461  off_type off, std::ios_base::seekdir way, std::ios_base::openmode which) {
462  boost::optional<off_type> const failure;
463 
464  // Buffer range and current position
465  off_type buf_begin, buf_end, buf_cur, upper_bound;
466  off_type pos_of_buffer_end_in_py_file;
467  if (which == std::ios_base::in) {
468  pos_of_buffer_end_in_py_file = pos_of_read_buffer_end_in_py_file;
469  buf_begin = reinterpret_cast<std::streamsize>(eback());
470  buf_cur = reinterpret_cast<std::streamsize>(gptr());
471  buf_end = reinterpret_cast<std::streamsize>(egptr());
472  upper_bound = buf_end;
473  } else if (which == std::ios_base::out) {
474  pos_of_buffer_end_in_py_file = pos_of_write_buffer_end_in_py_file;
475  buf_begin = reinterpret_cast<std::streamsize>(pbase());
476  buf_cur = reinterpret_cast<std::streamsize>(pptr());
477  buf_end = reinterpret_cast<std::streamsize>(epptr());
478  farthest_pptr = std::max(farthest_pptr, pptr());
479  upper_bound = reinterpret_cast<std::streamsize>(farthest_pptr) + 1;
480  } else {
481  CHECK_INVARIANT(0, "unreachable code");
482  }
483 
484  // Sought position in "buffer coordinate"
485  off_type buf_sought;
486  if (way == std::ios_base::cur) {
487  buf_sought = buf_cur + off;
488  } else if (way == std::ios_base::beg) {
489  buf_sought = buf_end + (off - pos_of_buffer_end_in_py_file);
490  } else if (way == std::ios_base::end) {
491  return failure;
492  } else {
493  CHECK_INVARIANT(0, "unreachable code");
494  }
495 
496  // if the sought position is not in the buffer, give up
497  if (buf_sought < buf_begin || buf_sought >= upper_bound) {
498  return failure;
499  }
500 
501  // we are in wonderland
502  if (which == std::ios_base::in) {
503  gbump(buf_sought - buf_cur);
504  } else if (which == std::ios_base::out) {
505  pbump(buf_sought - buf_cur);
506  }
507  return pos_of_buffer_end_in_py_file + (buf_sought - buf_end);
508  }
509 
510  public:
511  class istream : public std::istream {
512  public:
513  istream(streambuf& buf) : std::istream(&buf) {
514  exceptions(std::ios_base::badbit);
515  }
516 
517  ~istream() override {
518  // do nothing.
519  // This used to do:
520  // if (this->good()) this->sync();
521  // but that caused problems if the underlying file had been closed
522  // (see github #579) and really doesn't seem necessary for what we're
523  // doing.
524  }
525  };
526 
527  class ostream : public std::ostream {
528  public:
529  ostream(streambuf& buf) : std::ostream(&buf) {
530  exceptions(std::ios_base::badbit);
531  }
532 
533  ~ostream() override {
534  if (this->good()) {
535  this->flush();
536  }
537  }
538  };
539 };
540 
541 // std::size_t streambuf::default_buffer_size = 1024;
542 
545 
546  streambuf_capsule(bp::object& python_file_obj, std::size_t buffer_size = 0)
547  : python_streambuf(python_file_obj, buffer_size) {}
548 };
549 
551  ostream(bp::object& python_file_obj, std::size_t buffer_size = 0)
552  : streambuf_capsule(python_file_obj, buffer_size),
554 
555  ~ostream() noexcept override {
556  if (this->good()) {
557  this->flush();
558  }
559  }
560 };
561 } // namespace python
562 } // namespace boost_adaptbx
563 
564 #endif // GUARD
#define TEST_ASSERT(expr)
Definition: Invariant.h:152
#define CHECK_INVARIANT(expr, mess)
Definition: Invariant.h:101
Class to allow us to throw a ValueError from C++ and have it make it back to Python.
Definition: Exceptions.h:40
A stream buffer getting data from and putting data into a Python file object.
~streambuf() override
Mundane destructor freeing the allocated resources.
static const std::size_t default_buffer_size
The default size of the read and write buffer.
pos_type seekpos(pos_type sp, std::ios_base::openmode which=std::ios_base::in|std::ios_base::out) override
C.f. C++ standard section 27.5.2.4.2.
std::streamsize showmanyc() override
C.f. C++ standard section 27.5.2.4.3.
pos_type seekoff(off_type off, std::ios_base::seekdir way, std::ios_base::openmode which=std::ios_base::in|std::ios_base::out) override
C.f. C++ standard section 27.5.2.4.2.
streambuf(bp::object &python_file_obj, char mode, std::size_t buffer_size_=0)
constructor to enforce a mode (binary or text)
int sync() override
Update the python file to reflect the state of this stream buffer.
int_type overflow(int_type c=traits_type_eof()) override
C.f. C++ standard section 27.5.2.4.5.
int_type underflow() override
C.f. C++ standard section 27.5.2.4.3.
streambuf(bp::object &python_file_obj, std::size_t buffer_size_=0)
Construct from a Python file object.
ostream(bp::object &python_file_obj, std::size_t buffer_size=0)
streambuf_capsule(bp::object &python_file_obj, std::size_t buffer_size=0)