forpy
2
|
The main tree class for the forpy framework. More...
#include <tree.h>
Public Member Functions | |
Tree (const uint &max_depth=std::numeric_limits< uint >::max(), const uint &min_samples_at_leaf=1, const uint &min_samples_at_node=2, const std::shared_ptr< IDecider > &decider=nullptr, const std::shared_ptr< ILeaf > &leaf_manager=nullptr, const uint &random_seed=1) | |
The standard constructor for the forpy trees. More... | |
Tree (std::string filename) | |
Deserialization constructor for the forpy trees. More... | |
void | make_node (const IDataProvider *data_provider, Desk *d) |
Handle the creation of one tree node. More... | |
void | DFS (const IDataProvider *data_provider, const ECompletionLevel &completion, Desk *d) |
Do one DFS step with given completion level. More... | |
void | parallel_DFS (Desk *d, TodoMark &mark, IDataProvider *data_provider, const bool &finalize=true) |
void | DFS_and_store (Desk *d, TodoMark &mark, const IDataProvider *dprov, const ECompletionLevel &comp) |
size_t | get_depth () const |
Tree * | fit (const Data< MatCRef > &data_v, const Data< MatCRef > &annotation_v, const size_t &n_threads, const bool &complete_dfs=true, const std::vector< float > &weights=std::vector< float >()) |
Standard fitting function. More... | |
Tree * | fit_dprov (std::shared_ptr< IDataProvider > data_provider, const bool &complete_dfs=true) |
The fitting function for a single tree. More... | |
id_t | predict_leaf (const Data< MatCRef > &data, const id_t &start_node=0, const std::function< void(void *)> &dptf=nullptr) const |
Get the leaf id of the leaf where the given data will arrive. More... | |
Data< Mat > | predict (const Data< MatCRef > &data_v, const int &num_threads=1, const bool &use_fast_prediction_if_available=true, const bool &predict_proba=false, const bool &for_forest=false) |
Data< Mat > | predict_proba (const Data< MatCRef > &data_v, const int &num_threads=1, const bool &use_fast_prediction_if_available=true) |
Overload for consistency with the sklearn interface. More... | |
Data< Mat > | predict_leaf_result (const Data< MatCRef > &data, const id_t &start_node=0, const std::function< void(void *)> &dptf=nullptr) const |
Get the data prediction result for the given data. More... | |
Data< Mat > | combine_leaf_results (const std::vector< Data< Mat >> &leaf_results, const Vec< float > &weights=Vec< float >(), const bool &predict_proba=false) const |
bool | is_initialized () const |
Whether the tree's fit method has been called and its DFS and BFS methods can now be used. More... | |
float | get_weight () const |
The tree weight. More... | |
size_t | get_n_nodes () const |
The number of tree nodes. More... | |
void | set_weight (const float &new_weight) |
Sets the tree weight. More... | |
size_t | get_input_data_dimensions () const |
The data dimension that is required by this tree. More... | |
std::shared_ptr< const IDecider > | get_decider () const |
The classifier manager used by this tree. More... | |
std::shared_ptr< const ILeaf > | get_leaf_manager () const |
The leaf manager used by this tree. More... | |
size_t | get_samples_stored () const |
The number of samples stored in leaves. More... | |
const std::vector< std::pair< id_t, id_t > > | get_tree () const |
void | enable_fast_prediction () |
void | disable_fast_prediction () |
bool | operator== (Tree const &rhs) const |
void | save (const std::string &filename) const |
Save the tree. More... | |
Private Member Functions | |
template<class Archive > | |
void | serialize (Archive &ar, const uint &) |
DISALLOW_COPY_AND_ASSIGN (Tree) | |
Private Attributes | |
uint | max_depth |
bool | is_initialized_for_training |
unsigned int | min_samples_at_node |
unsigned int | min_samples_at_leaf |
float | weight |
std::atomic< size_t > | stored_in_leafs |
std::shared_ptr< IDecider > | decider |
std::shared_ptr< ILeaf > | leaf_manager |
std::vector< std::pair< id_t, id_t > > | tree |
std::unique_ptr< mu::variant< std::vector< std::tuple< size_t, float, size_t, size_t > >, std::vector< std::tuple< size_t, double, size_t, size_t > >, std::vector< std::tuple< size_t, uint32_t, size_t, size_t > >, std::vector< std::tuple< size_t, uint8_t, size_t, size_t > > > > | fast_tree |
std::vector< std::future< void > > | futures |
std::mutex | fut_mtx |
std::atomic< id_t > | next_id |
uint | random_seed |
Friends | |
class | forpy::Forest |
class | cereal::access |
std::ostream & | operator<< (std::ostream &stream, const Tree &self) |
The main tree class for the forpy framework.
This class is the core element of the framework. It can be used as a standalone tree or to form a forest.
forpy::Tree::Tree | ( | const uint & | max_depth = std::numeric_limits< uint >::max() , |
const uint & | min_samples_at_leaf = 1 , |
||
const uint & | min_samples_at_node = 2 , |
||
const std::shared_ptr< IDecider > & | decider = nullptr , |
||
const std::shared_ptr< ILeaf > & | leaf_manager = nullptr , |
||
const uint & | random_seed = 1 |
||
) |
The standard constructor for the forpy trees.
max_depth | uint > 0. The maximum tree depth, counting leaves; depths up to and including this value are allowed. |
min_samples_at_leaf | uint > 0. The minimum number of samples at a leaf (inclusive lower bound). |
min_samples_at_node | uint >= 2*min_samples_at_leaf. The minimum number of samples at a node (inclusive lower bound). |
decider | IDecider The decider that stores, optimizes and applies the decision rules for each inner tree node. |
leaf_manager | The leaf manager generates, stores and handles the return values of the leaf nodes. |
random_seed | uint>0 Seed for the random engine. |
forpy::Tree::Tree | ( | std::string | filename | ) |
Deserialization constructor for the forpy trees.
filename | string The filename to deserialize the tree from. |
void forpy::Tree::DFS | ( | const IDataProvider * | data_provider, |
const ECompletionLevel & | completion, | ||
Desk * | d | ||
) |
Do one DFS step with given completion level.
For CompletionLevel::Level, the branch of the tree below the currently marked node is completed.
The function is to be used within a thread (see forpy::Tree::parallel_DFS).
data_provider | forpy::IDataProvider* The data provider to use to get the samples with the relevant ids. |
completion | CompletionLevel The ECompletionLevel to reach before returning from the function. |
d | Desk Desk to use thread local memory from. |
void forpy::Tree::DFS_and_store | ( | Desk * | d, |
TodoMark & | mark, | ||
const IDataProvider * | dprov, | ||
const ECompletionLevel & | comp | ||
) |
|
inline |
|
private |
void forpy::Tree::enable_fast_prediction | ( | ) |
Unpack the hash maps for thresholds and feature IDs for fast predictions.
This only works for trees with threshold deciders and AlignedSurfaceCalculators for the features. It requires more memory than the default trees, but is significantly faster.
Tree* forpy::Tree::fit | ( | const Data< MatCRef > & | data_v, |
const Data< MatCRef > & | annotation_v, | ||
const size_t & | n_threads, | ||
const bool & | complete_dfs = true , |
||
const std::vector< float > & | weights = std::vector< float >() |
||
) |
Standard fitting function.
Fits this tree to the data given by the data provider. If complete_dfs is true, the tree is completely fitted to the data. Otherwise, just a node todo for the root node is added and the training may be performed step-by-step by calling the BFS or DFS functions.
Releases the GIL in Python!
data_v | Variant of 2D array, col-major contiguous Col-wise data points. |
annotation_v | Variant of 2D array, row-major contiguous Row-wise annotations. |
n_threads | size_t The number of threads to use. If set to 0, use all hardware threads. |
complete_dfs | bool If set to true, finishes training the tree. Otherwise, the training is just set up, and make_node must be called. Default: true. |
weights | vector<float> A vector with positive weights for each sample or an empty vector. |
Tree* forpy::Tree::fit_dprov | ( | std::shared_ptr< IDataProvider > | data_provider, |
const bool & | complete_dfs = true |
||
) |
The fitting function for a single tree.
Fits this tree to the data given by the data provider. If complete_dfs is true, the tree is completely fitted to the data. Otherwise, just a node todo for the root node is added and the training may be performed step-by-step by calling the BFS or DFS functions.
data_provider | shared(IDataProvider) The data provider for the fitting process. |
complete_dfs | bool If true, complete the fitting process. |
|
inline |
size_t forpy::Tree::get_depth | ( | ) | const |
Get the tree depth.
The depth is defined to be 0 for an "empty" tree (only a leaf/root node) and as the number of edges on the longest path in the tree otherwise.
|
inline |
|
inline |
|
inline |
|
inline |
|
inline |
|
inline |
void forpy::Tree::make_node | ( | const IDataProvider * | data_provider, |
Desk * | d | ||
) |
Handle the creation of one tree node.
Takes the next one of the list of marked nodes and fits it to the data. If necessary, creates two child nodes and a split criterion, otherwise makes it a leaf.
The function is to be used within a thread (see forpy::Tree::parallel_DFS). It is marked const
so as to avoid concurrent writes to member elements. Everything that is written to must be available in a forpy::Desk.
bool forpy::Tree::operator== | ( | Tree const & | rhs | ) | const |
void forpy::Tree::parallel_DFS | ( | Desk * | d, |
TodoMark & | mark, | ||
IDataProvider * | data_provider, | ||
const bool & | finalize = true |
||
) |
Data<Mat> forpy::Tree::predict | ( | const Data< MatCRef > & | data_v, |
const int & | num_threads = 1 , |
||
const bool & | use_fast_prediction_if_available = true , |
||
const bool & | predict_proba = false , |
||
const bool & | for_forest = false |
||
) |
Predicts new data points.
Releases the GIL in Python!
data_v | Variant of 2D data, row-major contiguous. The data to predict, with one sample per row. |
num_threads | int>0 The number of threads to use for prediction. The number of samples should be at least three times larger than the number of threads to observe good parallelization behavior. Currently disabled. |
use_fast_prediction_if_available | bool If set to true (default), this will create a compressed version of the tree that has particularly favorable properties for fast access and use it for predictions. You can trigger the creation manually by calling Tree::enable_fast_prediction. |
predict_proba | bool If enabled, will ask the leaf manager to provide probability information in addition to the prediction output. |
for_forest | bool If set to true, will create an intermediate result that can be fused to a whole forest result. Not relevant for end-users. |
id_t forpy::Tree::predict_leaf | ( | const Data< MatCRef > & | data, |
const id_t & | start_node = 0 , |
||
const std::function< void(void *)> & | dptf = nullptr |
||
) | const |
Get the leaf id of the leaf where the given data will arrive.
data | The data to propagate through the tree. |
start_node | The node to start from, doesn't have to be the root. |
dptf | Feature mapping function; disabled at the moment. |
Data<Mat> forpy::Tree::predict_proba | ( | const Data< MatCRef > & | data_v, |
const int & | num_threads = 1 , |
||
const bool & | use_fast_prediction_if_available = true |
||
) |
Overload for consistency with the sklearn interface.
void forpy::Tree::save | ( | const std::string & | filename | ) | const |
Save the tree.
filename | string The filename of the file to store the tree in. |
|
inlineprivate |
|
inline |
|
friend |
|
friend |
|
private |
|
private |
|
private |
|
private |
|
private |
|
private |
|
private |
|
private |
|
private |