UFJF - Machine Learning Toolkit  0.51.8
Fisher.hpp
//
// Created by Mateus Coutinho Mari on 7/27/2018.
//

#pragma once

#include "FeatureSelection.hpp"

// Standard headers used below (assumed; they may also come in transitively
// through FeatureSelection.hpp).
#include <algorithm>
#include <cmath>
#include <iostream>
#include <memory>
#include <vector>

9 
10 namespace mltk{
11  namespace featselect {
12  template<typename T = double>
13  class Fisher : public FeatureSelection<T> {
14  private:
15  int number = 0;
16 
17  struct fisher_select_score {
18  int fname;
19  double score;
20  };
21 
22  static int fisher_select_compare_score_greater(const fisher_select_score &a, const fisher_select_score &b);
23 
24  public:
25  Fisher() = default;
26  explicit Fisher(const Data<T>& samples,
27  classifier::Classifier<T> *classifier = nullptr, int number = 0);
28 
29  Data<T> selectFeatures() override;
30  };

        /* Despite the "greater" in its name, this comparator sorts scores in
           ascending order, so the lowest-scoring features come first and are
           the first to be removed in selectFeatures(). */
        template<typename T>
        int Fisher<T>::fisher_select_compare_score_greater(const Fisher::fisher_select_score &a,
                                                           const Fisher::fisher_select_score &b) {
            return a.score < b.score;
        }

        template<typename T>
        Fisher<T>::Fisher(const Data<T>& samples, classifier::Classifier<T> *classifier, int final_dim) {
            this->samples = mltk::make_data<T>(samples);
            this->classifier = classifier;
            this->final_dim = final_dim;
            this->number = final_dim;
        }

        template<typename T>
        Data<T> Fisher<T>::selectFeatures() {
            size_t i, j;
            size_t num_pos = 0, num_neg = 0, svs = 0, dim = this->samples->dim(), size = this->samples->size();
            int partial = 0;
            double margin = 0.0;
            std::vector<int> remove(dim - number, -1), fnames;
            std::vector<double> avg_neg(dim), avg_pos(dim), sd_neg(dim), sd_pos(dim), w;
            std::vector<fisher_select_score> scores(dim);
            std::shared_ptr<Data<T> > stmp(std::make_shared<Data<T> >()), stmp_partial(std::make_shared<Data<T> >());
            Solution sol;

            /*calc average*/
            for (i = 0; i < dim; ++i) {
                num_neg = 0;
                num_pos = 0;
                avg_neg[i] = 0;
                avg_pos[i] = 0;
                for (j = 0; j < size; ++j) {
                    if ((*this->samples)[j]->Y() == -1) {
                        avg_neg[i] += (*this->samples)[j]->X()[i];
                        ++num_neg;
                    } else {
                        avg_pos[i] += (*this->samples)[j]->X()[i];
                        ++num_pos;
                    }
                }
                avg_neg[i] /= num_neg;
                avg_pos[i] /= num_pos;
            }

            /*calc standard deviation*/
            for (i = 0; i < dim; ++i) {
                sd_neg[i] = 0;
                sd_pos[i] = 0;
                for (j = 0; j < size; ++j) {
                    if ((*this->samples)[j]->Y() == -1)
                        sd_neg[i] += std::pow((*this->samples)[j]->X()[i] - avg_neg[i], 2);
                    else sd_pos[i] += std::pow((*this->samples)[j]->X()[i] - avg_pos[i], 2);
                }
                sd_neg[i] = std::sqrt(sd_neg[i] / (num_neg - 1));
                sd_pos[i] = std::sqrt(sd_pos[i] / (num_pos - 1));
            }

            fnames = this->samples->getFeaturesNames();

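            /* Fisher-style score for feature i:
                   score_i = (avg_pos[i] - avg_neg[i])^2 / (sd_pos[i] + sd_neg[i])
               Note that this implementation divides by the sum of the class
               standard deviations (not the sum of variances, as in the textbook
               Fisher criterion). Larger scores indicate better class separation. */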
            /*calc scores*/
            for (i = 0; i < dim; ++i) {
                scores[i].score = std::pow(avg_pos[i] - avg_neg[i], 2) / (sd_pos[i] + sd_neg[i]);
                scores[i].fname = fnames[i];
                if (this->verbose)
                    std::cout << "Score: " << scores[i].score << ", Fname: " << scores[i].fname << std::endl;
            }
            if (this->verbose) std::cout << "----------------------------\n";

            if (this->verbose) std::cout << "Dim: " << dim << " -- ";

            /*training sample*/
            this->classifier->setVerbose(0);
            this->classifier->setGamma(margin);
            this->classifier->setSamples(this->samples);
            if (!this->classifier->train()) {
                w.erase(w.begin(), w.end());
                if (this->verbose) std::cout << "Training failed!\n";
                //break;
            } else {
                sol = this->classifier->getSolution();
                std::cout << "Training successful...\n";
                std::cout << "Margin = " << sol.margin << ", Support Vectors = " << sol.svs << "\n";
                std::cout << "----------------------------\n";
            }

            std::sort(scores.begin(), scores.end(), fisher_select_compare_score_greater);

            stmp_partial->copy(*this->samples);
            stmp->copy(*this->samples);

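            /* Greedy elimination: drop the (dim - number) lowest-scoring features
               one at a time, retraining the classifier after each removal. If a
               retraining step fails, fall back to the last subset that trained
               successfully (kept in stmp_partial). */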
            for (i = 0; i < (dim - this->number); ++i) {
                if (this->verbose) std::cout << "Score: " << scores[i].score << ", Fname: " << scores[i].fname << "\n";
                remove[i] = scores[i].fname;
                stmp->removeFeatures(remove);

                if (this->verbose)
                    std::cout << "Dim: " << dim - i - 1 << " -- ";

                /*training sample*/
                w.erase(w.begin(), w.end());
                this->classifier->setGamma(margin);
                this->classifier->setSamples(stmp);
                if (!this->classifier->train()) {
                    if (this->verbose) std::cout << "Training failed!\n";
                    partial = 1;
                    break;
                } else {
                    sol = this->classifier->getSolution();
                    std::cout << "Training successful...\n";
                    std::cout << "Margin = " << sol.margin << ", Support Vectors = " << sol.svs << "\n";
                    std::cout << "----------------------------\n";
                }
                stmp_partial.reset();
                stmp_partial = std::make_shared<Data<T> >();
                stmp_partial->copy(*stmp);
            }

            if (partial) {
                stmp.reset();
                return *stmp_partial;
            } else {
                stmp_partial.reset();
                return *stmp;
            }
        }
    }
}
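
A minimal usage sketch (not part of the header): it assumes a dataset already loaded into an mltk::Data<double> and a concrete classifier::Classifier<double> implementation from the toolkit supplied by the caller; the same classifier object is reused by selectFeatures() for the retraining steps.

#include <iostream>
#include "Fisher.hpp"

// Hypothetical driver: `data` and `clf` are assumed to be set up elsewhere.
// Any concrete mltk::classifier::Classifier<double> available in your build
// can be passed; Fisher keeps the `number` highest-scoring features.
mltk::Data<double> keep_top_features(const mltk::Data<double> &data,
                                     mltk::classifier::Classifier<double> *clf,
                                     int number) {
    mltk::featselect::Fisher<double> fisher(data, clf, number);
    mltk::Data<double> reduced = fisher.selectFeatures();

    std::cout << "original dim: " << data.dim()
              << ", reduced dim: " << reduced.dim() << std::endl;
    return reduced;
}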