forpy  2
sampling.h
Go to the documentation of this file.
1 /* Author: Christoph Lassner */
2 #pragma once
3 #ifndef FORPY_UTIL_SAMPLING_H_
4 #define FORPY_UTIL_SAMPLING_H_
5 
6 #include <cereal/access.hpp>
7 #include <cereal/types/polymorphic.hpp>
8 
9 #include <cstdint>
10 #include <random>
11 #include <type_traits>
12 #include <vector>
13 #include <algorithm>
14 
15 #include "../global.h"
16 #include "./serialization/basics.h"
17 
18 namespace forpy {
30 static int64_t ibinom(const int &n, int k) {
31  FASSERT(n >= 0 && k >= 0);
32  int i;
33  int64_t b;
34 
35  if (0 == k || n == k) return 1LL;
36  if (k > n) return 0LL;
37 
38  if (k > (n - k)) k = n - k;
39  if (1 == k) return static_cast<int64_t>(n);
40 
41  b = 1LL;
42  for (i = 1; i <= k; ++i) {
43  b *= (n - (k - i));
44  if (b < 0) return -1LL; /* Overflow */
45  b /= i;
46  }
47  return b;
48 };
49 
61 template <typename T>
63  public:
64  SamplingWithoutReplacement(const T &min, const T &max,
65  const std::shared_ptr<std::mt19937> &random_engine)
67  FASSERT(max >= min);
68  indices = std::vector<T>(max - min + 1);
69  std::iota(indices.begin(), indices.end(), 0);
70  index = 0;
71  dist = std::uniform_int_distribution<size_t>(0, max - min);
72  };
73 
77  bool sample_available() const { return index < indices.size(); };
78 
82  T get_next() {
83  T return_value;
84  {
85  if (index >= indices.size())
86  throw ForpyException(
87  "Tried to redraw without replacement "
88  "from a limited set where the num of remaining examples was 0.");
89  size_t rand_index = dist(*random_engine);
90  std::swap(indices[index], indices[rand_index]);
91  if (index != indices.size() - 1)
92  dist.param(std::uniform_int_distribution<size_t>::param_type(
93  dist.min() + 1, dist.max()));
94  return_value = min + indices[index++];
95  }
96  return return_value;
97  };
98 
99  inline friend std::ostream &operator<<(
100  std::ostream &stream, const SamplingWithoutReplacement &self) {
101  stream << "forpy::SamplingWithoutReplacement[" << self.min
102  << " (inc):" << (self.min + self.indices.size() - 1) << " (inc), "
103  << (self.indices.size() - self.index) << " available]";
104  return stream;
105  };
106 
107  bool operator==(const SamplingWithoutReplacement<T> &rhs) const {
108  return (min == rhs.min && random_engine == rhs.random_engine &&
109  dist == rhs.dist && indices == rhs.indices && index == rhs.index);
110  };
111 
112  private:
114 
115  friend class cereal::access;
116  template <class Archive>
117  void serialize(Archive &ar, const uint &) {
118  ar(CEREAL_NVP(min), CEREAL_NVP(random_engine), CEREAL_NVP(dist),
119  CEREAL_NVP(indices), CEREAL_NVP(index));
120  };
121 
122  T min;
123  std::shared_ptr<std::mt19937> random_engine;
124  std::uniform_int_distribution<T> dist;
125  std::vector<T> indices;
126  size_t index;
127 };
128 
155 template <typename T>
156 static std::vector<T> unique_indices(T num, T min, const T &max,
157  std::mt19937 *random_engine,
158  bool return_sorted = false) {
159  static_assert(std::is_integral<T>::value, "T must be integral!");
160  if (max < min) throw ForpyException("Invalid sample range.");
161  if (num > max - min + 1)
162  throw ForpyException("Sample size larger than range.");
163  std::vector<T> result(num);
164  if (num == max - min + 1) {
165  // The full range of numbers is required.
166  std::iota(result.begin(), result.end(), min);
167  } else {
168  std::geometric_distribution<T> dist;
169  if (num + 1 < max - min + 1) {
170  // Draw unique samples by iterating once over the numbers.
171  // There are only as many 'steps' as there should be numbers drawn.
172  // The difference between two numbers uniquely drawn from the set is
173  // distributed with a geometric distribution.
174  // The mean difference between two numbers is (max-min+1)/(num+1).
175  dist = std::geometric_distribution<T>(static_cast<float>(num + 1) /
176  static_cast<float>(max - min + 1));
177  }
178  // A truncated geometric distribution must be used to guarantee that
179  // all numbers can be drawn from the range. r is the truncation level.
180  // i is the remaining number of elements to be drawn.
181  T r, i = num;
182  while (i--) {
183  r = (max - min + 1 - i);
184  if (r <= 1) {
185  // The next value must be used, since there is no more 'space' for
186  // additional numbers.
187  result[num - i - 1] = min;
188  } else {
189  // Do the next step and handle truncation.
190  result[num - i - 1] = min +=
191  std::min<T>(dist(*random_engine) + 1, r - 1);
192  }
193  }
194  }
195  // The numbers are now ready, sorted in the appropriate interval.
196  if (!return_sorted) {
197  std::shuffle(result.begin(), result.end(), *random_engine);
198  }
199  return result;
200 };
201 }; // namespace forpy
202 #endif // FORPY_UTIL_SAMPLING_H_
T get_next()
Gets the next sample.
Definition: sampling.h:82
A lazy evaluation sampling without replacement.
Definition: sampling.h:62
bool operator==(const SamplingWithoutReplacement< T > &rhs) const
Definition: sampling.h:107
#define FASSERT(condition)
Definition: global.h:124
std::shared_ptr< std::mt19937 > random_engine
Definition: sampling.h:123
static std::vector< T > unique_indices(T num, T min, const T &max, std::mt19937 *random_engine, bool return_sorted=false)
Sampling without replacement.
Definition: sampling.h:156
void serialize(Archive &ar, const uint &)
Definition: sampling.h:117
SamplingWithoutReplacement(const T &min, const T &max, const std::shared_ptr< std::mt19937 > &random_engine)
Definition: sampling.h:64
static int64_t ibinom(const int &n, int k)
Integer binomial with overflow detection.
Definition: sampling.h:30
unsigned int uint
Convenience typedef for unsigned int.
Definition: types.h:113
friend std::ostream & operator<<(std::ostream &stream, const SamplingWithoutReplacement &self)
Definition: sampling.h:99
bool sample_available() const
Returns true if a sample can be drawn without raising an exception.
Definition: sampling.h:77
std::uniform_int_distribution< T > dist
Definition: sampling.h:124