// UFJF - Machine Learning Toolkit  0.51.8
// RFE.hpp
//
// Created by Mateus Coutinho Mari on 6/26/2018.
//

#pragma once

#define first_decay 0.25

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <ctime>
#include <iostream>
#include <memory>
#include <vector>

#include "FeatureSelection.hpp"
10 
11 namespace mltk{
12  namespace featselect {
13  template<typename T = double>
14  class RFE : public FeatureSelection<T> {
15  private:
16  struct select_weight {
17  double w;
18  double val;
19  double fname;
20  };
21 
22  static int compare_weight_greater(const select_weight &a, const select_weight &b);
23 
24  public:
25  RFE() = default;
27  validation::CrossValidation *cv = nullptr, int skip = 1, int jump = 1,
28  bool leave_one_out = false);
29 
30  Data<T> selectFeatures() override;
31  };
32 
33  template<typename T>
34  RFE<T>::RFE(const Data<T>& samples, classifier::Classifier<T> *classifier, int final_dim,
35  validation::CrossValidation *cv, int skip, int jump, bool leave_one_out) {
36  this->samples = mltk::make_data<T>(samples);
37  this->classifier = classifier;
38  this->depth = this->samples->dim()-final_dim;
39  this->final_dim = final_dim;
40  this->skip = skip;
41  this->jump = jump;
42  this->cv = cv;
43  if(!this->cv){
44  this->cv = new validation::CrossValidation();
45  this->cv->seed = std::vector<unsigned int>(1, 0);
46  this->cv->fold = 10;
47  this->cv->qtde = 0;
48  this->cv->jump = this->jump;
49  }
50  this->leave_one_out = leave_one_out;
51  }
52 
53  template<typename T>
55  size_t dim = this->samples->dim(), partial_dim = 0, i = 0, j = 0;
56  std::vector<int> features, partial_features, choosen_feats, fnames;
57  std::vector<double> w, new_w;
58  std::vector<select_weight> weight;
59  std::shared_ptr<Data<T> > stmp_partial, stmp(std::make_shared<Data<T> >());
60  Solution sol;
61  int svcount = 0, level = 0, leveljump = 0, partial_svs = 0;
62  int partial = 0; //verifica se última solução é uma solução recuperada (parcial)
63  double max_time = this->classifier->getMaxTime(), time_mult = this->samples->getTime_mult();
64  double margin = 0, leave_oo = 0, kfolderror = 0, partial_time = 0, partial_margin = 0;
65  double START_TIME = 100.0f * clock() / CLOCKS_PER_SEC;
66  double n0 = 1;
67 
68  stmp->copy(*this->samples);
69  /*error check*/
70  if (this->depth < 1 || this->depth >= dim) {
71  std::cerr << "Invalid depth!\n";
72  return Data<T>();
73  }
74 
75  features.assign(this->depth, -1);
76  /*inicializando o cross-validation*/
77  if (this->cv->qtde > 0) {
78  //utils_initialize_random();
79  this->cv->seed.resize(this->cv->qtde);
80  for (i = 0; i < this->cv->qtde; i++)
81  this->cv->seed[i] = i; //rand();
82  this->cv->initial_error = 0;
83  this->cv->actual_error = 0;
84  }
85 
86  this->classifier->setVerbose(0);
87 
88  while (true) {
89  svcount = 0;
90  margin = 0;
91 
92  //if(level != 0) // || level == depth) //else stmp->max_time = max_time;
93  if (level == 1) {
94  n0 = max_time *= first_decay;
95  } else if (level > 1) {
96  max_time = n0 * std::exp(-time_mult * ((double) dim / (dim - level)));
97  }
98 
99  this->classifier->setGamma(margin);
100  this->classifier->setSolution(sol);
101  this->classifier->setSamples(stmp);
102 
103  /*training sample*/
104  if (!this->classifier->train()) {
105  sol = this->classifier->getSolution();
106  svcount = sol.svs;
107  margin = sol.margin;
108  sol.w.X().erase(sol.w.X().begin(), sol.w.X().end());
109 
110  if (this->verbose) std::cerr << "Training Failed!\n";
111  if (level > 0) {
112  std::cout << "---------------\n :: FINAL :: \n---------------\n";
113  choosen_feats = stmp_partial->getFeaturesNames();
114  std::cout << "Choosen Features: ";
115  for (i = 0; i < stmp_partial->dim() - 1; ++i) std::cout << choosen_feats[i] << ", ";
116  std::cout << choosen_feats[i] << std::endl;
117 
118  if (this->cv->qtde > 0) {
119  if ((dim - partial_dim) % this->cv->jump != 0) {
120  for (this->cv->actual_error = 0, i = 0; i < this->cv->qtde; i++) {
121  this->cv->actual_error += 100- validation::kfold(*stmp, *this->classifier,
122  this->cv->fold, true, this->cv->seed[i], 0).accuracy;
123  }
124  kfolderror = this->cv->actual_error / this->cv->qtde;
125  }
126  std::cout << "Dim: " << partial_dim << ", Margin: " << partial_dim << ", SVs: " << partial_svs
127  << ", Error " << this->cv->fold << "-fold: " << kfolderror << "%\n";
128  } else
129  std::cout << "Dim: " << partial_dim << ", Margin: " << partial_dim << ", SVs: " << partial_svs
130  << std::endl;
131  std::cout << "---------------\nTotal time: " << partial_time << "\n\n";
132  partial = 1;
133  //data_write(filename, stmp_parcial, 0);
134  weight.clear();
135  }
136  break;
137  }
138  sol = this->classifier->getSolution();
139  margin = sol.margin;
140  svcount = sol.svs;
141  partial_margin = margin;
142  partial_svs = svcount;
143  partial_time = (100.0f * clock() / CLOCKS_PER_SEC - START_TIME) / 100.0f;
144  partial_dim = dim - level;
145 
146  stmp_partial.reset();
147  stmp_partial = std::make_shared<Data<T> >();
148  stmp_partial->copy(*this->samples);
149 
150  partial_features.clear();
151 
152  int levelminusjump = (level - this->jump);
153 
154  if (levelminusjump > 0) {
155  partial_features.resize((size_t) (level - this->jump));
156  }
157 
158  for (i = 0; (i < levelminusjump && levelminusjump > 0); ++i) {
159  partial_features[i] = features[i];
160  }
161 
162  if (this->cv->qtde > 0) {
163  if (level == 0) {
164  for (this->cv->initial_error = 0, i = 0; i < this->cv->qtde; i++) {
165  auto report = validation::kfold(*stmp, *this->classifier, this->cv->fold, true, this->cv->seed[i], 0);
166  this->cv->initial_error = report.error;
167  }
168  kfolderror = this->cv->initial_error / this->cv->qtde;
169  } else if (level % this->cv->jump == 0) {
170  for (this->cv->actual_error = 0, i = 0; i < this->cv->qtde; i++) {
171  this->cv->actual_error += 100-validation::kfold(*stmp, *this->classifier, this->cv->fold, true,
172  this->cv->seed[i], 0).accuracy;
173  }
174  kfolderror = this->cv->actual_error / this->cv->qtde;
175  }
176  }
177 
178  /*leave one out*/
179  if (this->leave_one_out) {
180  // leave_oo = utils_leave_one_out(stmp, train, skip, 0);
181  std::cout << "LeaveOO -- Dim: " << (dim - level) << ", Margin: " << margin << ", LeaveOO: " << leave_oo
182  << ", SVs: " << svcount << std::endl;
183  } else if (this->verbose) {
184  if (this->cv->qtde > 0 && level % this->cv->jump == 0)
185  std::cout << "Dim: " << (dim - level) << ", Margin: " << margin << ", SVs: " << svcount << ", Erro "
186  << this->cv->fold << "-fold: " << kfolderror << "%\n";
187  else
188  std::cout << "Dim: " << (dim - level) << ", Margin: " << margin << ", SVs: " << svcount << std::endl;
189  //printf("Dim: %d, Margem: %lf, Distancia entre os centros: %f, SVs: %d\n", (dim-level), data_get_dist_centers(stmp), margin, svcount);
190  }
191 
192  w = this->classifier->getSolutionRef()->w.X();
193  weight.resize(stmp->dim());
194  auto fnames = stmp->getFeaturesNames();
195 
196  for (i = 0; i < stmp->dim(); ++i) {
197  weight[i].w = w[i];
198  weight[i].fname = fnames[i];
199  }
200 
201  std::sort(weight.begin(), weight.end(), compare_weight_greater);
202 
203  std::cout << "---------------------\n";
204  if (this->verbose > 1) {
205  for (i = 0; i < stmp->dim(); ++i)
206  std::cout << weight[i].fname << ": " << weight[i].w << std::endl;
207  std::cout << "---------------------\n";
208  }
209 
210  /*stopping criterion*/
211  if (level >= this->depth ||
212  (this->cv->qtde > 0 && (this->cv->actual_error - this->cv->initial_error) > this->cv->limit_error)) {
213  std::cout << "---------------\n :: FINAL :: \n---------------\n";
214  choosen_feats = stmp->getFeaturesNames();
215  std::cout << "Choosen Features: ";
216  for (i = 0; i < stmp->dim() - 1; ++i) std::cout << choosen_feats[i] << ",";
217  std::cout << choosen_feats[i] << std::endl;
218 
219  std::cout << "---------------\nEliminated Features: ";
220  for (i = 0; i < leveljump - 1; ++i) std::cout << features[i] << ",";
221  std::cout << features[i] << std::endl;
222 
223  if (this->cv->qtde > 0) {
224  if (level % this->cv->jump != 0) {
225  for (this->cv->actual_error = 0, i = 0; i < this->cv->qtde; i++) {
226  this->cv->actual_error += 100- validation::kfold(*stmp, *this->classifier, this->cv->fold, true,
227  this->cv->seed[i], 0).accuracy;
228  }
229  kfolderror = this->cv->actual_error / this->cv->qtde;
230  }
231  std::cout << "Dim: " << (dim - level) << ", Margin: " << margin << ", SVs: " << svcount << ", Erro "
232  << this->cv->fold << "-fold: " << kfolderror << "%\n";
233  } else
234  std::cout << "Dim: " << (dim - level) << ", Margem: " << margin << ", SVs: " << svcount << std::endl;
235 
236  std::cout << "---------------\nTotal time: " << (100.0f * clock() / CLOCKS_PER_SEC - START_TIME) / 100.0f
237  << "\n\n";
238 
239  //data_write(filename, stmp, 0);
240  weight.clear();
241  break;
242  }
243 
244  if (level + this->jump > this->depth)
245  leveljump = this->depth;
246  else
247  leveljump = level + this->jump;
248 
249  /*manutencao do w do pai para o IMA Primal*/
250  if (this->classifier->getFormulationString() == "Primal") {
251  for (j = 0; j < stmp->dim(); ++j)
252  for (i = level; i < leveljump; ++i)
253  if (weight[i - level].w == w[j])
254  w[j] = 0;
255 
256  new_w.resize(dim - leveljump);
257 
258  for (i = 0, j = 0; j < stmp->dim(); ++j)
259  if (w[j] != 0)
260  new_w[i++] = w[j];
261  //novo_w[i] = w[j]; //bias nao copia mais
262  w.clear();
263  w = new_w;
264  sol.w.clear();
265  sol.w = w;
266  } else //IMA Dual e SMO
267  {
268  w.clear();
269  }
270 
271  // if(*stmp != *this->samples){ stmp.reset(); }
272 
273  /*saving removed feature name*/
274  for (i = level; i < leveljump; ++i) {
275  std::cout << "Removing w = " << weight[i - level].w << "\n";
276  features[i] = weight[i - level].fname;
277  }
278  std::cout << "---------------------\n";
279  weight.clear();
280 
281  /*increment*/
282  if (level + this->jump > this->depth) {
283  level = this->depth;
284  this->jump = 0;
285  } else {
286  level += this->jump;
287 
288  }
289  /*get temp data struct*/
290  stmp->removeFeatures(features);
291  }
292  /*free stuff*/
293 
294  if (this->cv->qtde > 0) this->cv->seed.clear();
295 
296  //samples.reset();
297  if (partial) {
298  stmp.reset();
299  return *stmp_partial;
300  } else {
301  stmp_partial.reset();
302  return *stmp;
303  }
304  }
305 
306  /*----------------------------------------------------------*
307  * Returns 1 for a > b, -1 a < b, 0 if a = b *
308  *----------------------------------------------------------*/
309  template<typename T>
310  int RFE<T>::compare_weight_greater(const select_weight &a, const select_weight &b) {
311  /* V (greater)*/
312  //printf("%d\n",(fabs(ia->w) > fabs(ib->w)) - (fabs(ia->w) < fabs(ib->w)));
313  return fabs(a.w) < fabs(b.w);
314  //return (fabs(a.w) > fabs(b.w)) - (fabs(a.w) < fabs(b.w));
315  }
316  }
317 }
double getTime_mult() const
Return the time multiplier.
Definition: Data.hpp:1765
size_t dim() const
Returns the dimension of the dataset.
Definition: Data.hpp:213
virtual void setSamples(const Data< T > &data)
setSamples Set the samples used by the Learner.
Definition: Learner.hpp:150
virtual bool train()=0
Function that execute the training phase of a Learner.
double getMaxTime() const
getMaxTime Returns the maximum running time in the training phase of the Learner.
Definition: Learner.hpp:130
virtual std::string getFormulationString()=0
getFormulationString Returns a string that represents the formulation of the learner (Primal or Dual)...
void setVerbose(int _verbose)
Set the level of verbose.
Definition: Learner.hpp:175
Rep const & X() const
Returns the attributes representation of the point (std::vector by default).
Definition: Point.hpp:139
Definition: Solution.hpp:13
unsigned int svs
Number of support Vectors.
Definition: Solution.hpp:31
mltk::Point< double > w
Weights vector.
Definition: Solution.hpp:17
double margin
Margin generated from the classifier that generated the solution.
Definition: Solution.hpp:27
Definition: classifier/Classifier.hpp:17
Solution * getSolutionRef()
getSolution Returns a reference to the solution of the classifier.
Definition: classifier/Classifier.hpp:58
Solution getSolution() const
getSolution Returns the solution of the classifier.
Definition: classifier/Classifier.hpp:52
void setGamma(double gamma)
Set the gamma (margin) of the classifier.
Definition: classifier/Classifier.hpp:67
void setSolution(Solution solution)
setSolution Set a solution for the classifier.
Definition: classifier/Classifier.hpp:79
Definition: featselect/FeatureSelection.hpp:17
bool leave_one_out
Leave one out.
Definition: featselect/FeatureSelection.hpp:46
int skip
Number of levels to be skipped.
Definition: featselect/FeatureSelection.hpp:44
classifier::Classifier< double > * classifier
Classifier used by the method.
Definition: featselect/FeatureSelection.hpp:23
int final_dim
Final dimension.
Definition: featselect/FeatureSelection.hpp:40
validation::CrossValidation * cv
Structure to hold the cross-validation result.
Definition: featselect/FeatureSelection.hpp:25
std::shared_ptr< Data< double > > samples
Attributes.
Definition: featselect/FeatureSelection.hpp:21
int jump
Jump size.
Definition: featselect/FeatureSelection.hpp:42
Definition: RFE.hpp:14
Data< T > selectFeatures() override
Function that executes the feature selection phase.
Definition: RFE.hpp:54
ValidationReport kfold(Data< T > sample, classifier::Classifier< T > &classifier, size_t fold, bool stratified=true, size_t seed=0, int verbose=0)
Executes k-fold stratified cross-validation.
Definition: valid/Validation.hpp:312
UFJF-MLTK main namespace for core functionalities.
Definition: classifier/Classifier.hpp:11
Structure to manage cross validation.
Definition: valid/Validation.hpp:62
double accuracy
Accuracy of the validated model.
Definition: valid/Validation.hpp:24