forpy  2
tree.h
Go to the documentation of this file.
1 /* Author: Christoph Lassner. */
2 #pragma once
3 #ifndef FORPY_TREE_H_
4 #define FORPY_TREE_H_
5 
6 #include "./global.h"
7 
8 #include "./util/serialization/basics.h"
9 
10 #include <atomic>
11 #include <fstream>
12 #include <future>
13 #include <mapbox/variant_cast.hpp>
14 #include <memory>
15 #include <string>
16 #include <tuple>
17 #include <utility>
18 #include <vector>
19 
22 #include "./deciders/idecider.h"
23 #include "./leafs/ileaf.h"
24 #include "./types.h"
25 #include "./util/desk.h"
26 
27 namespace forpy {
28 class Forest;
29 
// The main tree class for the forpy framework.
//
// NOTE(review): this is a documentation-page rendering of tree.h. The number
// at the start of each line is the line number in the original header; gaps
// in that numbering are lines the rendering left out (presumably the doc
// comments, plus a few declaration lines that are flagged individually below).
36 class Tree {
37  public:
    // The standard constructor for the forpy trees.
    // Every argument has a default; `decider` and `leaf_manager` default to
    // nullptr and `max_depth` to the largest representable `uint`.
55  Tree(const uint &max_depth = std::numeric_limits<uint>::max(),
56  const uint &min_samples_at_leaf = 1, const uint &min_samples_at_node = 2,
57  const std::shared_ptr<IDecider> &decider = nullptr,
58  const std::shared_ptr<ILeaf> &leaf_manager = nullptr,
59  const uint &random_seed = 1);
60 
    // Construct from a file name.
    // NOTE(review): presumably loads a tree written by `save` - confirm in the
    // implementation file.
67  Tree(std::string filename);
68 
    // Handle the creation of one tree node.
85  void make_node(const IDataProvider *data_provider, Desk *d);
86 
    // Do one DFS step with given completion level.
104  void DFS(const IDataProvider *data_provider,
105  const ECompletionLevel &completion, Desk *d);
    // Declared here, defined elsewhere; no description is available in this
    // listing for the next two DFS variants.
106  void parallel_DFS(Desk *d, TodoMark &mark, IDataProvider *data_provider,
107  const bool &finalize = true);
108  void DFS_and_store(Desk *d, TodoMark &mark, const IDataProvider *dprov,
109  const ECompletionLevel &comp);
110 
    // Depth of the tree; this is the value printed by operator<< below.
118  size_t get_depth() const;
119 
    // Standard fitting function.
    // `weights` defaults to an empty vector and `complete_dfs` to true.
    // NOTE(review): the returned Tree* is presumably `this` (sklearn-style
    // chaining) - confirm in the implementation file.
143  Tree *fit(const Data<MatCRef> &data_v, const Data<MatCRef> &annotation_v,
144  const size_t &n_threads, const bool &complete_dfs = true,
145  const std::vector<float> &weights = std::vector<float>());
146 
    // The fitting function for a single tree, driven by a data provider.
161  Tree *fit_dprov(std::shared_ptr<IDataProvider> data_provider,
162  const bool &complete_dfs = true);
163 
    // Get the leaf id of the leaf where the given data will arrive.
    // Traversal starts at `start_node` (default 0); `dptf` is an optional
    // callback taking a void* and defaults to nullptr.
172  id_t predict_leaf(const Data<MatCRef> &data, const id_t &start_node = 0,
173  const std::function<void(void *)> &dptf = nullptr) const;
174 
    // Prediction entry point; declared here, defined elsewhere.
201  Data<Mat> predict(const Data<MatCRef> &data_v, const int &num_threads = 1,
202  const bool &use_fast_prediction_if_available = true,
203  const bool &predict_proba = false,
204  const bool &for_forest = false);
205 
    // Overload for consistency with the sklearn interface.
211  Data<Mat> predict_proba(const Data<MatCRef> &data_v,
212  const int &num_threads = 1,
213  const bool &use_fast_prediction_if_available = true);
214 
    // Get the data prediction result for the given data: finds the leaf with
    // predict_leaf and returns what the leaf manager reports for that leaf.
    // NOTE(review): the opening line of this declaration (tree.h:218,
    // `Data<Mat> predict_leaf_result(` according to the member index at the
    // end of this page) is missing from the listing.
219  const Data<MatCRef> &data, const id_t &start_node = 0,
220  const std::function<void(void *)> &dptf = nullptr) const {
221  return leaf_manager->get_result(predict_leaf(data, start_node, dptf));
222  };
223 
    // Combine several leaf results by delegating to the leaf manager.
    // NOTE(review): the opening line of this declaration (tree.h:227,
    // `Data<Mat> combine_leaf_results(` according to the member index at the
    // end of this page) is missing from the listing.
228  const std::vector<Data<Mat>> &leaf_results,
229  const Vec<float> &weights = Vec<float>(),
230  const bool &predict_proba = false) const {
231  return leaf_manager->get_result(leaf_results, weights, predict_proba);
232  };
233 
    // Whether the tree's fit method has been called and its DFS and BFS
    // methods can now be used; returns the is_initialized_for_training flag.
238  inline bool is_initialized() const { return is_initialized_for_training; };
239 
    // The tree weight.
243  inline float get_weight() const { return weight; };
244 
    // The number of tree nodes (size of the `tree` vector).
248  inline size_t get_n_nodes() const { return tree.size(); };
249 
    // Sets the tree weight.
253  inline void set_weight(const float &new_weight) { weight = new_weight; };
254 
    // The data dimension that is required by this tree, as reported by the
    // decider. Dereferences `decider` without a null check.
258  inline size_t get_input_data_dimensions() const {
259  return decider->get_data_dim();
260  };
261 
    // The classifier manager used by this tree (read-only handle).
265  inline std::shared_ptr<const IDecider> get_decider() const {
266  return decider;
267  };
268 
    // The leaf manager used by this tree (read-only handle).
272  inline std::shared_ptr<const ILeaf> get_leaf_manager() const {
273  return leaf_manager;
274  };
275 
    // The number of samples stored in leafs (atomic load).
279  inline size_t get_samples_stored() const { return stored_in_leafs.load(); };
280 
    // Returns the node list by value, i.e. every call copies the vector.
281  inline const std::vector<std::pair<id_t, id_t>> get_tree() const {
282  return tree;
283  };
284 
    // Counterpart of disable_fast_prediction(); declared here, defined
    // elsewhere.
292  void enable_fast_prediction();
293 
    // Drops the fast-prediction structure held in `fast_tree` and logs the
    // action at verbosity level 9.
297  inline void disable_fast_prediction() {
298  VLOG(9) << "Disabling fast prediction; freeing memory.";
299  fast_tree.reset();
300  };
301 
302  bool operator==(Tree const &rhs) const;
303 
    // Save the tree.
310  void save(const std::string &filename) const;
311 
    // Streams a short description of the form "forpy::Tree[depth N]".
312  inline friend std::ostream &operator<<(std::ostream &stream,
313  const Tree &self) {
314  stream << "forpy::Tree[depth " << self.get_depth() << "]";
315  return stream;
316  };
317 
318  private:
319  friend class forpy::Forest;
320  friend class cereal::access;
    // cereal serialization hook. Archives the fields listed below as
    // name-value pairs; `fast_tree`, `futures` and `fut_mtx` are not part of
    // the archive. The second parameter is unnamed and unused here
    // (presumably the cereal class version - confirm).
321  template <class Archive>
322  void serialize(Archive &ar, const uint &) {
323  ar(CEREAL_NVP(max_depth), CEREAL_NVP(is_initialized_for_training),
324  CEREAL_NVP(min_samples_at_node), CEREAL_NVP(min_samples_at_leaf),
325  CEREAL_NVP(weight), CEREAL_NVP(decider), CEREAL_NVP(leaf_manager),
326  CEREAL_NVP(tree), CEREAL_NVP(stored_in_leafs), CEREAL_NVP(next_id),
327  CEREAL_NVP(random_seed));
328  };
329 
334  uint max_depth;
    // NOTE(review): `bool is_initialized_for_training` (tree.h:339 according
    // to the member index at the end of this page) is missing from the listing
    // at this point.
341  unsigned int min_samples_at_node;
343  unsigned int min_samples_at_leaf;
    // Weight of this tree; read and written through get_weight/set_weight.
345  float weight;
    // Counter behind get_samples_stored().
347  std::atomic<size_t> stored_in_leafs;
349  std::shared_ptr<IDecider> decider;
351  std::shared_ptr<ILeaf> leaf_manager;
    // One pair of ids per node; its size is the node count.
    // NOTE(review): the pair presumably holds the two child node ids - confirm.
353  std::vector<std::pair<id_t, id_t>> tree;
    // Optional fast-prediction structure: one vector of 4-tuples, with the
    // second tuple element typed per supported data type (float, double,
    // uint32_t, uint8_t).
360  std::unique_ptr<
361  mu::variant<std::vector<std::tuple<size_t, float, size_t, size_t>>,
362  std::vector<std::tuple<size_t, double, size_t, size_t>>,
363  std::vector<std::tuple<size_t, uint32_t, size_t, size_t>>,
364  std::vector<std::tuple<size_t, uint8_t, size_t, size_t>>>>
    // NOTE(review): the declarator line `fast_tree;` (tree.h:365 according to
    // the member index at the end of this page) is missing from the listing,
    // so the declaration above is unterminated as shown.
366  std::vector<std::future<void>> futures;
367  std::mutex fut_mtx;
368  std::atomic<id_t> next_id;
    // NOTE(review): `uint random_seed` (tree.h:369 according to the member
    // index at the end of this page) is missing from the listing at this point.
370  // Any copy of a tree would have to be a deep copy to guarantee consistency
371  // between the tree layout and the saved features, classifiers and leafs.
372  // Copying is therefore disallowed for now.
    // NOTE(review): line 373 is missing from the listing; the member index
    // lists DISALLOW_COPY_AND_ASSIGN(Tree), which presumably sits here.
374 };
375 
// Tree subclass for classification. It adds a parameter map with
// get_params/set_params accessors on top of Tree.
//
// NOTE(review): the number at the start of each line is the line number in
// the original header; this listing is a documentation-page rendering.
376 class ClassificationTree : public Tree {
377  public:
    // Construct from a file name by forwarding to Tree(filename).
378  inline ClassificationTree(const std::string &filename) : Tree(filename){};
    // Main constructor; every argument has a default. Declared here, defined
    // elsewhere.
379  ClassificationTree(const uint &max_depth = std::numeric_limits<uint>::max(),
380  const uint &min_samples_at_leaf = 1,
381  const uint &min_samples_at_node = 2,
382  const uint &n_valid_features_to_use = 0,
383  const bool &autoscale_valid_features = false,
384  const uint &random_seed = 1,
385  const size_t &n_thresholds = 0,
386  const float &gain_threshold = 1E-7f);
387 
    // Returns a copy of the stored parameter map. The `deep` flag is accepted
    // but ignored (the parameter is unnamed).
388  inline std::unordered_map<std::string, mu::variant<uint, size_t, float, bool>>
389  get_params(const bool & /*deep*/ = false) const {
390  return params;
391  }
392 
    // Builds a NEW ClassificationTree from the given map and returns it; the
    // current object is not modified by this function. A key missing from
    // `params` falls back to the default passed to GetWithDefVar, which
    // matches the constructor default for that argument.
    // Note that the argument `params` shadows the member of the same name.
393  inline std::shared_ptr<ClassificationTree> set_params(
394  const std::unordered_map<
395  std::string, mu::variant<uint, size_t, float, bool>> &params) {
396  return std::make_shared<ClassificationTree>(
397  GetWithDefVar<uint>(params, "max_depth",
398  std::numeric_limits<uint>::max()),
399  GetWithDefVar<uint>(params, "min_samples_at_leaf", 1),
400  GetWithDefVar<uint>(params, "min_samples_at_node", 2),
401  GetWithDefVar<uint>(params, "n_valid_features_to_use", 0),
402  GetWithDefVar<bool>(params, "autoscale_valid_features", false),
403  GetWithDefVar<uint>(params, "random_seed", 1),
404  GetWithDefVar<size_t>(params, "n_thresholds", 0),
405  GetWithDefVar<float>(params, "gain_threshold", 1E-7f));
406  }
407 
    // Streams a short description of the form
    // "forpy::ClassificationTree[depth N]".
408  inline friend std::ostream &operator<<(std::ostream &stream,
409  const ClassificationTree &self) {
410  stream << "forpy::ClassificationTree[depth " << self.get_depth() << "]";
411  return stream;
412  };
413 
414  private:
    // Parameter map returned by get_params().
    // NOTE(review): presumably filled by the main constructor - confirm.
415  std::unordered_map<std::string, mu::variant<uint, size_t, float, bool>>
416  params;
417  friend class cereal::access;
    // cereal serialization hook: archives the Tree base class under the name
    // "base", followed by `params`. The second parameter is unnamed and
    // unused here.
418  template <class Archive>
419  void serialize(Archive &ar, const uint &) {
420  ar(cereal::make_nvp("base", cereal::base_class<Tree>(this)),
421  CEREAL_NVP(params));
422  }
    // NOTE(review): line 423 is missing from the listing; the member index at
    // the end of this page lists DISALLOW_COPY_AND_ASSIGN(ClassificationTree),
    // which presumably sits here.
424 };
425 
// Tree subclass for regression. It adds a parameter map with
// get_params/set_params accessors on top of Tree; compared with
// ClassificationTree, the constructor takes two further flags,
// `store_variance` and `summarize`.
//
// NOTE(review): the number at the start of each line is the line number in
// the original header; this listing is a documentation-page rendering.
426 class RegressionTree : public Tree {
427  public:
    // Construct from a file name by forwarding to Tree(filename).
428  inline RegressionTree(const std::string &filename) : Tree(filename){};
    // Main constructor; every argument has a default. Declared here, defined
    // elsewhere.
429  RegressionTree(const uint &max_depth = std::numeric_limits<uint>::max(),
430  const uint &min_samples_at_leaf = 1,
431  const uint &min_samples_at_node = 2,
432  const uint &n_valid_features_to_use = 0,
433  const bool &autoscale_valid_features = false,
434  const uint &random_seed = 1, const size_t &n_thresholds = 0,
435  const float &gain_threshold = 1E-7f,
436  const bool &store_variance = false,
437  const bool &summarize = false);
438 
    // Returns a copy of the stored parameter map. The `deep` flag is accepted
    // but ignored (the parameter is unnamed).
439  inline std::unordered_map<std::string, mu::variant<uint, size_t, float, bool>>
440  get_params(const bool & /*deep*/ = false) const {
441  return params;
442  }
443 
    // Builds a NEW RegressionTree from the given map and returns it; the
    // current object is not modified by this function. A key missing from
    // `params` falls back to the default passed to GetWithDefVar, which
    // matches the constructor default for that argument.
    // Note that the argument `params` shadows the member of the same name.
444  inline std::shared_ptr<RegressionTree> set_params(
445  const std::unordered_map<
446  std::string, mu::variant<uint, size_t, float, bool>> &params) {
447  return std::make_shared<RegressionTree>(
448  GetWithDefVar<uint>(params, "max_depth",
449  std::numeric_limits<uint>::max()),
450  GetWithDefVar<uint>(params, "min_samples_at_leaf", 1),
451  GetWithDefVar<uint>(params, "min_samples_at_node", 2),
452  GetWithDefVar<uint>(params, "n_valid_features_to_use", 0),
453  GetWithDefVar<bool>(params, "autoscale_valid_features", false),
454  GetWithDefVar<uint>(params, "random_seed", 1),
455  GetWithDefVar<size_t>(params, "n_thresholds", 0),
456  GetWithDefVar<float>(params, "gain_threshold", 1E-7f),
457  GetWithDefVar<bool>(params, "store_variance", false),
458  GetWithDefVar<bool>(params, "summarize", false));
459  }
460 
    // Streams a short description of the form "forpy::RegressionTree[depth N]".
461  inline friend std::ostream &operator<<(std::ostream &stream,
462  const RegressionTree &self) {
463  stream << "forpy::RegressionTree[depth " << self.get_depth() << "]";
464  return stream;
465  };
466 
467  private:
    // Parameter map returned by get_params().
    // NOTE(review): presumably filled by the main constructor - confirm.
468  std::unordered_map<std::string, mu::variant<uint, size_t, float, bool>>
469  params;
470  friend class cereal::access;
    // cereal serialization hook: archives the Tree base class under the name
    // "base", followed by `params`. The second parameter is unnamed and
    // unused here.
471  template <class Archive>
472  void serialize(Archive &ar, const uint &) {
473  ar(cereal::make_nvp("base", cereal::base_class<Tree>(this)),
474  CEREAL_NVP(params));
475  }
    // NOTE(review): line 476 is missing from the listing; the member index at
    // the end of this page lists DISALLOW_COPY_AND_ASSIGN(RegressionTree),
    // which presumably sits here.
477 };
478 
479 }; // namespace forpy
480 #endif // FORPY_TREE_H_
uint random_seed
Definition: tree.h:369
friend class cereal::access
Definition: tree.h:320
unsigned int min_samples_at_node
Definition: tree.h:341
std::unordered_map< std::string, mu::variant< uint, size_t, float, bool > > get_params(const bool &=false) const
Definition: tree.h:389
std::unordered_map< std::string, mu::variant< uint, size_t, float, bool > > params
Definition: tree.h:412
std::atomic< size_t > stored_in_leafs
Definition: tree.h:347
void serialize(Archive &ar, const uint &)
Definition: tree.h:472
size_t get_input_data_dimensions() const
The data dimension that is required by this tree.
Definition: tree.h:258
void parallel_DFS(Desk *d, TodoMark &mark, IDataProvider *data_provider, const bool &finalize=true)
const std::vector< std::pair< id_t, id_t > > get_tree() const
Definition: tree.h:281
Tree * fit_dprov(std::shared_ptr< IDataProvider > data_provider, const bool &complete_dfs=true)
The fitting function for a single tree.
friend class cereal::access
Definition: tree.h:470
std::unique_ptr< mu::variant< std::vector< std::tuple< size_t, float, size_t, size_t > >, std::vector< std::tuple< size_t, double, size_t, size_t > >, std::vector< std::tuple< size_t, uint32_t, size_t, size_t > >, std::vector< std::tuple< size_t, uint8_t, size_t, size_t > > > > fast_tree
Definition: tree.h:365
void disable_fast_prediction()
Definition: tree.h:297
bool is_initialized_for_training
Definition: tree.h:339
void enable_fast_prediction()
A data provider for the training of one tree.
Definition: idataprovider.h:22
std::vector< std::future< void > > futures
Definition: tree.h:366
void DFS_and_store(Desk *d, TodoMark &mark, const IDataProvider *dprov, const ECompletionLevel &comp)
size_t id_t
Element id type.
Definition: types.h:106
void set_weight(const float &new_weight)
Sets the tree weight.
Definition: tree.h:253
The main tree class for the forpy framework.
Definition: tree.h:36
DISALLOW_COPY_AND_ASSIGN(ClassificationTree)
typename mu::variant< Empty, STOT< float >, STOT< double >, STOT< uint >, STOT< uint8_t > > Data
Storing a variant of the provided data container type.
Definition: storage.h:126
void make_node(const IDataProvider *data_provider, Desk *d)
Handle the creation of one tree node.
std::shared_ptr< ILeaf > leaf_manager
Definition: tree.h:351
friend std::ostream & operator<<(std::ostream &stream, const Tree &self)
Definition: tree.h:312
DISALLOW_COPY_AND_ASSIGN(Tree)
friend class cereal::access
Definition: tree.h:417
std::shared_ptr< IDecider > decider
Definition: tree.h:349
id_t predict_leaf(const Data< MatCRef > &data, const id_t &start_node=0, const std::function< void(void *)> &dptf=nullptr) const
Get the leaf id of the leaf where the given data will arrive.
Stores the parameters for one marked tree node.
Definition: types.h:152
uint max_depth
Definition: tree.h:328
std::shared_ptr< const IDecider > get_decider() const
The classifier manager used by this tree.
Definition: tree.h:265
Data< Mat > predict(const Data< MatCRef > &data_v, const int &num_threads=1, const bool &use_fast_prediction_if_available=true, const bool &predict_proba=false, const bool &for_forest=false)
Data< Mat > predict_proba(const Data< MatCRef > &data_v, const int &num_threads=1, const bool &use_fast_prediction_if_available=true)
Overload for consistency with the sklearn interface.
std::atomic< id_t > next_id
Definition: tree.h:368
float get_weight() const
The tree weight.
Definition: tree.h:243
Data< Mat > predict_leaf_result(const Data< MatCRef > &data, const id_t &start_node=0, const std::function< void(void *)> &dptf=nullptr) const
Get the data prediction result for the given data.
Definition: tree.h:218
bool is_initialized() const
Whether the tree's fit method has been called and its DFS and BFS methods can now be used...
Definition: tree.h:238
void serialize(Archive &ar, const uint &)
Definition: tree.h:419
std::mutex fut_mtx
Definition: tree.h:367
float weight
Definition: tree.h:345
std::shared_ptr< RegressionTree > set_params(const std::unordered_map< std::string, mu::variant< uint, size_t, float, bool >> &params)
Definition: tree.h:444
Tree(const uint &max_depth=std::numeric_limits< uint >::max(), const uint &min_samples_at_leaf=1, const uint &min_samples_at_node=2, const std::shared_ptr< IDecider > &decider=nullptr, const std::shared_ptr< ILeaf > &leaf_manager=nullptr, const uint &random_seed=1)
The standard constructor for the forpy trees.
RegressionTree(const std::string &filename)
Definition: tree.h:428
size_t get_samples_stored() const
The number of samples stored in leafs.
Definition: tree.h:279
std::shared_ptr< const ILeaf > get_leaf_manager() const
The leaf manager used by this tree.
Definition: tree.h:272
std::shared_ptr< ClassificationTree > set_params(const std::unordered_map< std::string, mu::variant< uint, size_t, float, bool >> &params)
Definition: tree.h:393
DISALLOW_COPY_AND_ASSIGN(RegressionTree)
std::vector< std::pair< id_t, id_t > > tree
Definition: tree.h:353
size_t get_depth() const
std::unordered_map< std::string, mu::variant< uint, size_t, float, bool > > get_params(const bool &=false) const
Definition: tree.h:440
Eigen::Matrix< DT, Eigen::Dynamic, 1, Eigen::ColMajor > Vec
Definition: types.h:73
Data< Mat > combine_leaf_results(const std::vector< Data< Mat >> &leaf_results, const Vec< float > &weights=Vec< float >(), const bool &predict_proba=false) const
Definition: tree.h:227
Main thread desk object.
Definition: desk.h:201
void serialize(Archive &ar, const uint &)
Definition: tree.h:322
bool operator==(Tree const &rhs) const
std::unordered_map< std::string, mu::variant< uint, size_t, float, bool > > params
Definition: tree.h:465
ClassificationTree(const std::string &filename)
Definition: tree.h:378
size_t get_n_nodes() const
The number of tree nodes.
Definition: tree.h:248
ECompletionLevel
Specifies the completion level for one training step.
Definition: types.h:95
void DFS(const IDataProvider *data_provider, const ECompletionLevel &completion, Desk *d)
Do one DFS step with given completion level.
void save(const std::string &filename) const
Save the tree.
unsigned int uint
Convenience typedef for unsigned int.
Definition: types.h:113
friend std::ostream & operator<<(std::ostream &stream, const RegressionTree &self)
Definition: tree.h:461
Tree * fit(const Data< MatCRef > &data_v, const Data< MatCRef > &annotation_v, const size_t &n_threads, const bool &complete_dfs=true, const std::vector< float > &weights=std::vector< float >())
Standard fitting function.
unsigned int min_samples_at_leaf
Definition: tree.h:343
friend std::ostream & operator<<(std::ostream &stream, const ClassificationTree &self)
Definition: tree.h:408