UFJF - Machine Learning Toolkit  0.51.8
AutoWeightedVoting.hpp
1 //
2 // Created by mateuscmarim on 05/01/2021.
3 //
4 
5 #pragma once
6 
7 #include "VotingClassifier.hpp"
8 #include <ufjfmltk/Validation.hpp>
9 
10 namespace mltk {
11  namespace ensemble {
12  template<typename T>
13  class AutoWeightedVoting : public Ensemble<T>, public classifier::Classifier<T> {
14  private:
15  size_t p_size = 1000, fold = 5;
16  double F = 0.8, CR = 0.9;
17  std::vector<double> best_weights;
18  VotingClassifier<T> voter;
20  bool use_simplex = false;
21  std::mt19937 generator;
22  std::pair<Point<double>, double> Sbest;
23  size_t best_generation = 0;
24 
25  template<template<typename...> class WeakLearner,
26  template<typename...> class... WeakLearners>
27  void fillLearnersVector(WeakLearner<T> flearner) {
28  this->m_learners.push_back(std::make_shared<WeakLearner<T> >(flearner));
29  }
30 
31  template<template<typename...> class WeakLearner,
32  template<typename...> class... WeakLearners>
33  void fillLearnersVector(WeakLearner<T> flearner, WeakLearners<T>... weak_learners) {
34  this->m_learners.push_back(std::make_shared<WeakLearner<T> >(flearner));
35  fillLearnersVector(weak_learners...);
36  }
37 
38  Point<T> simplex_projection(Point<T> const& y) {
39  Point<double> t(y.size(), 0), x(y.size()), u(y.size());
40  double that = 0, tmpsum = 0, tmax = 0;
41  bool bget = false;
42  int i = y.size() - 2, n = y.size();
43 
44  u = y;
45  std::sort(u.X().begin(), u.X().end());
46 
47  do{
48  for (size_t j = i+1; j < n; j++) {
49  t[i] += u[j] - 1;
50  }
51  if (t[i] != 0) t[i] = t[i]/double(n - i);
52  tmpsum += u[i];
53  tmax = (tmpsum-1)/double(n-i);
54  if (tmax >= u[i]) {
55  that = tmax;
56  bget = true;
57  break;
58  }
59  i--;
60  }while(i >= 0);
61 
62  if (!bget) {
63  that = (tmpsum+u[n-1]-1)/n;
64  }
65  for(int j = 0; j < x.size(); j++){
66  x[j] = y[j] - that;
67  if(x[j] < 0) x[j] = 0;
68  }
69 
70  return x;
71  }
72 
73  double compute_acc(Data<T>& data, Learner<T>& learner){
74  auto acc = 0;
75  for(auto const& p: data){
76  auto pred = learner.evaluate(*p);
77  if(pred == p->Y()){
78  acc++;
79  }
80  }
81  return double(acc)/ data.size();
82  }
83 
84  double objective_function(Point<double> const& x){
85  double tp = 0, tn = 0, fp = 0, fn = 0;
86  int n = valid_pair.test.size();
87  voter.setWeights(x.X());
88 // for(size_t i = 0; i < n; i++){
89 // auto point = valid_pair.test[i];
90 // auto pred = voter.evaluate(*point);
91 //
92 // if(pred == 1){
93 // if(point->Y() == 1){
94 // tp++;
95 // }else fp++;
96 // }else{
97 // if(point->Y() == -1){
98 // tn++;
99 // }else fn++;
100 // }
101 // }
102 // double div = (tp+fp)*(tp+fn)*(tn+fp)*(tn+fn);
103 // double mcc = ((tp*tn)-(fp*fn))/((div > 0)?std::sqrt(div):1);
104  //return 100-validation::kfold(*this->samples, voter, 10, this->seed, 0);
105  return 100*compute_acc(valid_pair.test, voter);
106  }
107 
108  std::vector<Point<double>> init_population(){
109  std::vector<Point<double>> P;
110  std::uniform_real_distribution<double> dist(0., 1.);
111  //generator.seed(this->seed);
112 
113  for(size_t i = 0; i < p_size; i++){
114  std::vector<double> w(this->m_learners.size());
115  for(double & j : w){
116  j = dist(generator);
117  }
118  P.emplace_back(w);
119  if(use_simplex){
120  P[P.size()-1] = simplex_projection(P[P.size()-1]);
121  }else{
122  P[P.size()-1] = mltk::abs(P[P.size()-1]);
123  }
124  }
125  return P;
126  }
127 
128  std::vector<double> eval_population(std::vector<Point<double>> const& population){
129  std::vector<double> costs(population.size());
130 
131  for(size_t i = 0; i < population.size(); i++){
132  costs[i] = objective_function(population[i]);
133  }
134  return costs;
135  }
136 
137  std::pair<Point<double>, double> get_best_solution(std::vector<Point<double>> const& population, std::vector<double> const& costs){
138  auto best_pos = std::max_element(costs.begin(), costs.end()) - costs.begin();
139  return std::make_pair(population[best_pos], costs[best_pos]);
140  }
141 
142  Point<double> new_sample(Point<double> const& P0, std::vector<Point<double>> const& population){
143  std::uniform_int_distribution<size_t> dist(0, population.size()-1);
144  size_t pos = 0;
145  //generator.seed(this->seed++);
146 
147  // parents selection
148  Point<double> P1;
149  do{
150  pos = dist(generator);
151  P1 = population[pos];
152  }while(P1 == P0);
153  Point<double> P2;
154  do{
155  pos = dist(generator);
156  P2 = population[pos];
157  }while(P2 == P0 || P2 == P1);
158 
159  Point<double> P3;
160  do{
161  pos = dist(generator);
162  P3 = population[pos];
163  }while(P3 == P0 || P3 == P1 || P3 == P2);
164 
165  std::uniform_real_distribution<double> distCR(0., 1.);
166  std::uniform_int_distribution<size_t> distNP(0, this->m_learners.size() - 1);
167  size_t cut_point = distNP(generator);
168  Point<double> S(this->m_learners.size());
169  for(int i = 0; i < S.size(); i++){
170  double _cr = distCR(generator);
171 
172  if(i == cut_point || _cr < CR){
173  S[i] = P3[i] + F * (P1[i]-P2[i]);
174  }else{
175  S[i] = P0[i];
176  }
177  }
178  if(use_simplex){
179  S = mltk::abs(S);
180  S = simplex_projection(S);
181  }else{
182  S = mltk::abs(S/S.norm());
183  }
184  return S;
185  }
186 
187 
188  public:
189  AutoWeightedVoting() = default;
190 
191  template<template<typename...> class WeakLearner,
192  template<typename...> class... WeakLearners>
193  AutoWeightedVoting(Data<T> &samples, const bool simplex, WeakLearner<T> flearner, WeakLearners<T>... weak_learners){
194  this->samples = std::make_shared<Data<T> >(samples);
195  this->use_simplex = simplex;
196  fillLearnersVector(flearner, weak_learners...);
197  }
198 
199  bool train() override {
200  voter.setLearners(this->m_learners);
201  voter.setVotingType("soft");
202  valid_pair = validation::partTrainTest(*this->samples, 10, this->seed);
203  voter.setSamples(valid_pair.train);
204  voter.train();
205  generator.seed(this->seed);
206 
207  auto population = init_population();
208  auto p_costs = eval_population(population);
209  Sbest = get_best_solution(population, p_costs);
210 
211 // for(size_t i = 0; i < population.size(); i++){
212 // std::clog << population[i] << ", cost: " << p_costs[i] << std::endl;
213 // }
214 // std::clog << "Best weights: " << Sbest.first << ", Cost: " << Sbest.second << std::endl;
215  for(int G = 0; G < this->MAX_IT; G++){
216  std::clog << "\nGeneration " << G <<std::endl;
217  std::vector<Point<double> > new_population;
218  for(size_t i = 0; i < population.size(); i++){
219  auto Si = new_sample(Sbest.first, population);
220  double si_cost = objective_function(Si);
221  if(si_cost <= p_costs[i]){
222  new_population.push_back(Si);
223  }else{
224  new_population.push_back(population[i]);
225  }
226  }
227  population = new_population;
228  p_costs = eval_population(population);
229  for(size_t i = 0; i < population.size(); i++){
230  std::clog << population[i] << ", cost: " << p_costs[i] << std::endl;
231  }
232  auto candidate = get_best_solution(population, p_costs);
233  if(Sbest.second < candidate.second){
234  Sbest = candidate;
235  best_generation = G;
236  }
237  // std::clog << "Best weights: " << Sbest.first << ", Cost: " << Sbest.second << std::endl;
238  //std::cout << Sbest.second <<std::endl;
239  if(Sbest.second == 1){
240  break;
241  }
242  }
243  best_weights = Sbest.first.X();
244  voter.setWeights(best_weights);
245  std::cout << Sbest.first << " Best weights cost: " << objective_function(Sbest.first) << std::endl;
246  return true;
247  }
248 
249  double evaluate(const Point<T> &p, bool raw_value = false) override {
250  voter.setWeights(best_weights);
251  return voter.evaluate(p, raw_value);
252  }
253 
254  std::string getFormulationString() override {
255  return this->m_learners[0]->getFormulationString();
256  }
257 
258  Point<double> getBestWeights(){
259  return Point<double>(best_weights);
260  }
261 
262  size_t getBestGeneration() { return best_generation; }
263 
264  VotingClassifier<double> getVoter(){ return voter; }
265  auto getValidPair(){ return valid_pair; }
266  };
267  }
268 }
// End of AutoWeightedVoting.hpp (guarded by #pragma once, so no closing #endif is needed).
size_t size() const
Returns the size of the dataset.
Definition: Data.hpp:208
int MAX_IT
Max number of iterations.
Definition: Learner.hpp:37
std::shared_ptr< Data< T > > samples
Samples used in the model training.
Definition: Learner.hpp:21
virtual double evaluate(const Point< T > &p, bool raw_value=false)=0
Returns the class of a feature point based on the trained Learner.
size_t seed
Seed for random operations.
Definition: Learner.hpp:46
double norm(int p=NormType::NORM_L2) const
Returns the p-norm of the point.
Definition: Point.hpp:651
Rep const & X() const
Returns the attributes representation of the point (std::vector by default).
Definition: Point.hpp:139
std::size_t size() const
Returns the dimension of the point.
Definition: Point.hpp:133
Definition: classifier/Classifier.hpp:17
Definition: AutoWeightedVoting.hpp:13
bool train() override
Function that executes the training phase of a Learner.
Definition: AutoWeightedVoting.hpp:199
double evaluate(const Point< T > &p, bool raw_value=false) override
Returns the class of a feature point based on the trained Learner.
Definition: AutoWeightedVoting.hpp:249
std::string getFormulationString() override
getFormulationString Returns a string that represents the formulation of the learner (Primal or Dual)...
Definition: AutoWeightedVoting.hpp:254
Namespace for ensemble methods.
Definition: ensemble/Ensemble.hpp:16
void setSamples(DataPointer< T > samples) override
setSamples Set the samples used by the Learner.
Definition: ensemble/Ensemble.hpp:56
std::vector< LearnerPointer< T > > m_learners
Pointer to base learner used by the ensemble method.
Definition: ensemble/Ensemble.hpp:22
Definition: VotingClassifier.hpp:13
bool train() override
Function that executes the training phase of a Learner.
Definition: VotingClassifier.hpp:28
double evaluate(const Point< T > &p, bool raw_value=false) override
Returns the class of a feature point based on the trained Learner.
Definition: VotingClassifier.hpp:38
TrainTestPair< T > partTrainTest(Data< T > &data, size_t fold, bool stratified=true, bool keepIndex=true, size_t seed=0)
Divide the samples in training and test set.
Definition: valid/Validation.hpp:414
UFJF-MLTK main namespace for core functionalities.
Definition: classifier/Classifier.hpp:11
A struct representing a pair with training and test data.
Definition: valid/Validation.hpp:77
Data< T > test
Test data.
Definition: valid/Validation.hpp:81
Data< T > train
Train data.
Definition: valid/Validation.hpp:79