UFJF - Machine Learning Toolkit 0.51.8
Golub.hpp
//
// Created by Mateus Coutinho Mari on 7/27/2018.
//

#pragma once
#include "FeatureSelection.hpp"

#include <algorithm>
#include <cmath>
#include <iostream>
#include <memory>
#include <vector>

namespace mltk{
    namespace featselect {
        /**
         * \brief Feature selection by Golub's signal-to-noise ranking, retraining
         * the given classifier as the lowest-scoring features are removed.
         */
        template<typename T = double>
        class Golub : public FeatureSelection<T> {
        private:
            /// Number of features to keep.
            int number = 0;

            struct golub_select_score {
                int fname;
                double score;
            };

            static bool golub_select_compare_score_greater(const golub_select_score &a, const golub_select_score &b);

        public:
            Golub() = default;
            explicit Golub(const Data<T>& samples, classifier::Classifier<T> *classifier = nullptr,
                           int number = 0);

            Data<T> selectFeatures() override;
        };

        template<typename T>
        Golub<T>::Golub(const Data<T>& samples, classifier::Classifier<T> *classifier, int number) {
            this->samples = mltk::make_data<T>(samples);
            this->classifier = classifier;
            this->number = number;
        }

        template<typename T>
        Data<T> Golub<T>::selectFeatures() {
            size_t i, j;
            size_t num_pos = 0, num_neg = 0, svs = 0, dim = this->samples->dim(), size = this->samples->size();
            int partial = 0;
            double margin = 0.0;
            std::vector<int> remove(dim - number), fnames;
            std::vector<double> avg_neg(dim), avg_pos(dim), sd_neg(dim), sd_pos(dim), w;
            std::vector<golub_select_score> scores(dim);
            std::shared_ptr<Data<T> > stmp(std::make_shared<Data<T> >()), stmp_partial(std::make_shared<Data<T> >());
            Solution sol;

            /* per-class feature averages */
            for (i = 0; i < dim; ++i) {
                num_neg = 0;
                num_pos = 0;
                avg_neg[i] = 0;
                avg_pos[i] = 0;
                for (j = 0; j < size; ++j) {
                    if ((*this->samples)[j]->Y() == -1) {
                        avg_neg[i] += (*this->samples)[j]->X()[i];
                        ++num_neg;
                    } else {
                        avg_pos[i] += (*this->samples)[j]->X()[i];
                        ++num_pos;
                    }
                }
                avg_neg[i] /= num_neg;
                avg_pos[i] /= num_pos;
            }

            /* per-class standard deviations */
            for (i = 0; i < dim; ++i) {
                sd_neg[i] = 0;
                sd_pos[i] = 0;
                for (j = 0; j < size; ++j) {
                    if ((*this->samples)[j]->Y() == -1)
                        sd_neg[i] += std::pow((*this->samples)[j]->X()[i] - avg_neg[i], 2);
                    else
                        sd_pos[i] += std::pow((*this->samples)[j]->X()[i] - avg_pos[i], 2);
                }
                sd_neg[i] = std::sqrt(sd_neg[i] / (num_neg - 1));
                sd_pos[i] = std::sqrt(sd_pos[i] / (num_pos - 1));
            }

            fnames = this->samples->getFeaturesNames();

            /* Golub's signal-to-noise score: |avg_pos - avg_neg| / (sd_pos + sd_neg) */
            for (i = 0; i < dim; ++i) {
                scores[i].score = std::fabs(avg_pos[i] - avg_neg[i]) / (sd_pos[i] + sd_neg[i]);
                scores[i].fname = fnames[i];
                if (this->verbose)
                    std::cout << "Score: " << scores[i].score << ", Fname: " << scores[i].fname << std::endl;
            }
            if (this->verbose) std::cout << "----------------------------\n";

            if (this->verbose) std::cout << "Dim: " << dim << " -- ";

            /* train on the full sample as a baseline */
            this->classifier->setVerbose(0);
            this->classifier->setGamma(margin);
            this->classifier->setSamples(this->samples);
            if (!this->classifier->train()) {
                w.erase(w.begin(), w.end());
                if (this->verbose) std::cout << "Training failed!\n";
            } else {
                sol = this->classifier->getSolution();
                std::cout << "Training successful...\n";
                std::cout << "Margin = " << sol.margin << ", Support Vectors = " << sol.svs << "\n";
                std::cout << "----------------------------\n";
            }

            /* ascending order: the least discriminative features come first */
            std::sort(scores.begin(), scores.end(), golub_select_compare_score_greater);

            stmp_partial->copy(*this->samples);
            stmp->copy(*this->samples);

            /* drop one low-scoring feature per iteration until `number` features remain */
            for (i = 0; i < (dim - this->number); ++i) {
                if (this->verbose) std::cout << "Score: " << scores[i].score << ", Fname: " << scores[i].fname << "\n";
                remove[i] = scores[i].fname;
                stmp->removeFeatures(remove);

                if (this->verbose)
                    std::cout << "Dim: " << dim - i - 1 << " -- ";

                /* retrain on the reduced sample */
                w.erase(w.begin(), w.end());
                this->classifier->setGamma(margin);
                this->classifier->setSamples(stmp);
                if (!this->classifier->train()) {
                    if (this->verbose) std::cout << "Training failed!\n";
                    partial = 1;
                    break;
                } else {
                    sol = this->classifier->getSolution();
                    std::cout << "Training successful...\n";
                    std::cout << "Margin = " << sol.margin << ", Support Vectors = " << sol.svs << "\n";
                    std::cout << "----------------------------\n";
                }
                /* keep the last reduced set for which training succeeded */
                stmp_partial.reset();
                stmp_partial = std::make_shared<Data<T> >();
                stmp_partial->copy(*stmp);
            }

            if (partial) {
                stmp.reset();
                return *stmp_partial;
            } else {
                stmp_partial.reset();
                return *stmp;
            }
        }

        /* Comparator used by std::sort: orders the scores in ascending order,
         * so the lowest-scoring (least discriminative) features are removed first. */
        template<typename T>
        bool Golub<T>::golub_select_compare_score_greater(const golub_select_score &a, const golub_select_score &b) {
            return a.score < b.score;
        }
    }
}
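
A minimal usage sketch (not part of Golub.hpp): it assumes the caller already has a loaded mltk::Data<double> and a concrete mltk classifier; the helper name rankFeatures and the choice of keeping 10 features are illustrative only.

#include <iostream>

#include "Golub.hpp"

// Hypothetical helper: the dataset and the classifier are supplied by the caller.
void rankFeatures(mltk::Data<double> &samples,
                  mltk::classifier::Classifier<double> &clf) {
    // Keep the 10 highest-scoring features under Golub's criterion,
    // retraining the classifier as the remaining ones are removed.
    mltk::featselect::Golub<double> selector(samples, &clf, 10);
    mltk::Data<double> reduced = selector.selectFeatures();

    std::cout << "Selected dimension: " << reduced.dim() << std::endl;
}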