UFJF - Machine Learning Toolkit  0.51.8
Sampling.hpp
1 
2 #pragma once
3 
4 #include "Data.hpp"
5 #include "DistanceMetric.hpp"
6 #include <random>
7 
8 namespace mltk{
9  template < typename T >
10  class DSM{
11  size_t seed{};
12  size_t n_dims{}, k{};
13  double alpha{};
14  Point<double> labels;
15  std::vector<std::vector<size_t>> subspaces;
16 
17  public:
18  DSM()=default;
19  DSM(const Data<T>& data, size_t k, size_t n, double alpha): alpha(alpha), n_dims(n), k(k) {
20  labels = data.getLabels();
21  subspaces.resize(k);
22  for(auto& Si: subspaces) {
23  Si.resize(n_dims, -1);
24  }
25  }
26 
27  double div_m(int i, size_t feat){
28  return (div_mx(feat)+div_ms(i))/2;
29  }
30 
31  double div_ms(int i){
32  double max_val = 0.0;
33  for(int j = 0; j < subspaces.size();j++){
34  if(j == i) continue;
35  double val = 0.0;
36  std::vector<size_t> intersection;
37  std::set_intersection(subspaces[i].begin(), subspaces[i].end(),
38  subspaces[j].begin(), subspaces[j].end(),
39  std::back_inserter(intersection));
40  val = double(intersection.size())/subspaces[j].size();
41  if(val > max_val) max_val = val;
42  }
43  return 1.0 - max_val;
44  }
45 
46  double div_mx(size_t feat){
47  double count = 0.0;
48  for(auto& Si: subspaces){
49  if(std::find(Si.begin(), Si.end(), feat) != Si.end()){
50  count++;
51  }
52  }
53  return 1.0 - (count/subspaces.size());
54  }
55 
56  double qual_correlation(const Data<T>& data, int feat){
57  Point<T> x = data.getFeature(feat);
58  return mltk::stats::covar(x, labels)/(mltk::stats::std_dev(x)*mltk::stats::std_dev(labels));
59  }
60 
61  inline void setAlpha(double _alpha){ this->alpha = _alpha; }
62 
63  std::vector<std::vector<size_t>> operator()(const Data< T > &data){
64  for(int i = 0; i < n_dims; i++) {
65  for(int j = 0; j < subspaces.size(); j++) {
66  Point<double> fscore(data.dim(), -1.01);
67  for (int c = 0; c < data.dim(); c++) {
68  if (std::find(subspaces[j].begin(), subspaces[j].end(), c) == subspaces[j].end()) {
69  fscore[c] = alpha*qual_correlation(data, c) + (1-alpha)*div_m(j, c);
70  }
71  }
72  auto best_x_id = std::max_element(fscore.X().begin(), fscore.X().end()) - fscore.X().begin();
73  subspaces[j][i] = best_x_id;
74  }
75  }
76  return subspaces;
77  }
78  };
79 
80  template < typename T >
81  class RSM{
82  size_t seed{};
83  size_t n_dims{};
84  double r{};
85  std::mt19937 generator;
86  std::vector<size_t> feats;
87 
88  public:
89  RSM()=default;
90  RSM(double r, size_t dims, size_t seed): r(r), seed(seed) {
91  generator.seed(seed);
92  n_dims = std::ceil(r * dims);
93  feats.resize(dims);
94  std::iota(feats.begin(), feats.end(), 0);
95  }
96  std::vector<size_t> operator()(Data< T > &data){
97  std::shuffle(feats.begin(), feats.end(), generator);
98  std::vector<size_t> new_feats(n_dims);
99  for(size_t j = 0; j < new_feats.size(); j++){
100  new_feats[j] = feats[j];
101  }
102  return new_feats;
103  }
104  };
105 
109  template < typename T, typename Callable = metrics::dist::Euclidean< T > >
111  protected:
112  Callable distance_metric;
113  public:
114  OverSampling()=default;
115  explicit OverSampling(Callable dist_metric): distance_metric(dist_metric) {}
116  virtual Data< T > operator()(Data< T > &data) = 0;
117  };
118 
122  template < typename T=double, typename Callable = metrics::dist::Euclidean< T > >
123  class SMOTE: public OverSampling< T, Callable > {
124  private:
126  size_t seed = 0;
128  size_t k = 5;
130  double r = 0.1;
131 
132  public:
133  explicit SMOTE(size_t k = 1, double r = 0.1, size_t seed = 0, Callable dist_metric = Callable())
134  : k(k), r(r), seed(seed), OverSampling<T>(dist_metric) {}
135 
136  Data< T > operator()(Data< T > &data) override {
137  std::random_device rd;
138  seed = (seed > 0)?seed:rd();
139  std::mt19937 generator(seed);
140  std::uniform_real_distribution<double> distribution(0.0,1.0);
141  // number of generated artificial points
142  size_t n_apoints = r * data.size();
143  // find the minority class
144  auto classes = data.classes();
145  auto class_distribution = data.classesDistribution();
146  int min_class = classes[std::min_element(class_distribution.begin(), class_distribution.end()) - class_distribution.begin()];
147  // copy all the points from the minority class to the Z set
148  std::vector<int> class_copy = {min_class};
149  std::vector<SamplePointer< T > > artificial_data;
150  Data< T > Z;
151  Z.classesCopy(data, class_copy);
152 
153  // iterate through all the elements from the Z set
154  for(auto z = Z.begin(); z != Z.end(); ++z){
155  std::vector<std::pair<size_t, double> > distance(Z.size());
156  std::vector<SamplePointer< T > > k_neighbors(k);
157  size_t id = 0;
158  auto _z = *(*z);
159 
160  // compute the metrics from a point to the rest
161  std::transform(Z.begin(), Z.end(), distance.begin(), [this, &_z, &id](auto p){
162  id++;
163  return std::make_pair(id, this->distance_metric(_z, *p));
164  });
165 
166  // sort the distances in increasing metrics order
167  std::sort(distance.begin(), distance.end(), [](auto &d1, auto &d2){
168  return d1.second < d2.second;
169  });
170 
171  // remove points with equal metrics or duplicate ids
172  distance.erase(std::unique(distance.begin(), distance.end(), [](auto &d1, auto &d2){
173  return (d1.second == d2.second) || ((d1.first == d2.first));
174  }), distance.end());
175 
176  distance.erase(std::remove_if(distance.begin(), distance.end(), [](auto &d){
177  return d.second == 0;
178  }), distance.end());
179 
180  // get the k neighbors
181  for(size_t i = 0; (i < k) && (i < distance.size()); i++){
182  k_neighbors[i] = data[distance[i].first-1];
183  }
184 
185  // create the artificial points and insert them to the dataset
186  for(auto p = k_neighbors.begin(); p != k_neighbors.end(); p++){
187  if(!*p) continue;
188  auto _k = *(*p);
189  Point< T > s(_z.size(), 0.0, 0);
190  double alpha = distribution(generator);
191 
192  s = _z + alpha * (_z - _k);
193  s.Y() = min_class;
194 
195  artificial_data.push_back(mltk::make_point< T >(s));
196  }
197  }
198 
199  std::shuffle(artificial_data.begin(), artificial_data.end(), std::default_random_engine(seed));
200  for (size_t i = 0; i < artificial_data.size(); i++) {
201  data.insertPoint(artificial_data[i]);
202  }
203  return data;
204  }
205  };
206 
210  template < typename T=double, typename Callable = metrics::dist::Euclidean< T > >
211  class BorderlineSMOTEOne: public OverSampling< T, Callable > {
212  private:
214  size_t seed = 0;
216  size_t k = 5;
218  size_t m = 5;
220  double r = 0.1;
221 
222  public:
223  explicit BorderlineSMOTEOne(size_t k = 1, double r = 0.1, size_t m = 1, size_t seed = 0, Callable dist_metric = Callable())
224  : k(k), r(r), m(m), seed(seed), OverSampling<T>(dist_metric) {}
225 
226  Data< T > operator()(Data< T > &data) override {
227  // Find the majority class
228  auto classes = data.classes();
229  auto class_distribution = data.classesDistribution();
230  int maj_class = classes[std::max_element(class_distribution.begin(), class_distribution.end()) - class_distribution.begin()];
231  // Initialize the danger subset
232  Data< T > danger_subset;
233  std::set<size_t> danger_ids;
234 
235  // iterate through all the elements from the Z set
236  for(auto z = data.begin(); z != data.end(); ++z){
237  std::vector<std::pair<size_t, double> > distance(data.size());
238  std::vector<SamplePointer< T > > M(m);
239  size_t id = 0;
240  auto _z = (*z);
241 
242  // compute the euclidean metrics from a point to the rest
243  std::transform(data.begin(), data.end(), distance.begin(), [this, &_z, &id](auto p){
244  id++;
245  return std::make_pair(id, this->distance_metric(*_z, *p));
246  });
247 
248  // sort the distances in increasing metrics order
249  std::sort(distance.begin(), distance.end(), [](auto &d1, auto &d2){
250  return d1.second < d2.second;
251  });
252 
253  // remove points with equal metrics or duplicate ids
254  distance.erase(std::unique(distance.begin(), distance.end(), [](auto &d1, auto &d2){
255  return (d1.second == d2.second) || ((d1.first == d2.first));
256  }), distance.end());
257 
258  distance.erase(std::remove_if(distance.begin(), distance.end(), [](auto &d){
259  return d.second == 0;
260  }), distance.end());
261 
262  // get the m neighbors
263  for(size_t i = 0; (i < m) && (i < distance.size()); i++){
264  M[i] = data[distance[i].first-1];
265  }
266 
267  // set m' as the number of points on the majority class set on m neighbors
268  size_t m_ = std::count_if(M.begin(), M.end(), [&maj_class](auto &p){
269  return p->Y() == maj_class;
270  });
271 
272  // insert the point to the danger subset if m/2 <= m' < m
273  if(m_ >= (m/2) && m_ < m){
274  danger_ids.insert(_z->Id());
275  danger_subset.insertPoint(_z);
276  }
277  }
278 
279  // apply SMOTE algorithm to the danger subset
280  if(danger_subset.size() > 0){
281  SMOTE< T > smote(k, r, seed);
282  smote(danger_subset);
283  }
284 
285  for(auto p = danger_subset.begin(); p != danger_subset.end(); p++){
286  auto _p = (*p);
287 
288  if(danger_ids.find(_p->Id()) != danger_ids.end()){
289  data.insertPoint(_p);
290  }
291  }
292 
293  return data;
294  }
295  };
296 }
Functor for the implementation of the Borderline SMOTE 1 over sampling algorithm.
Definition: Sampling.hpp:211
Definition: Sampling.hpp:10
bool insertPoint(const Data< T > &samples, int _index, bool keepIndex=false)
Insert a point to the data from another sample.
Definition: Data.hpp:1437
void classesCopy(const Data< T > &_data, std::vector< int > &classes)
Makes a deep copy from another data object.
Definition: Data.hpp:1520
size_t size() const
Returns the size of the dataset.
Definition: Data.hpp:208
const std::vector< int > classes() const
Returns a vector containing the numeric values of the classes.
Definition: Data.hpp:1831
size_t dim() const
Returns the dimension of the dataset.
Definition: Data.hpp:213
Point< T > getFeature(int index) const
Get the values of a feature from all points.
Definition: Data.hpp:2107
std::vector< size_t > classesDistribution() const
Returns a vector containing the frequency of the classes. Only valid for classification datasets.
Definition: Data.hpp:1826
Base class for the implementation of over sampling methods.
Definition: Sampling.hpp:110
Rep const & X() const
Returns the attributes representation of the point (std::vector by default).
Definition: Point.hpp:139
double const & Y() const
Returns the class or value of the point.
Definition: Point.hpp:152
Definition: Sampling.hpp:81
Functor for the implementation of the SMOTE over sampling algorithm.
Definition: Sampling.hpp:123
double covar(const mltk::Point< T, R > &p, const mltk::Point< T, R > &p1)
Compute the covariance between two points.
Definition: Statistics.hpp:180
double std_dev(const mltk::Point< T, R > &p)
Compute the standard deviation of a point.
Definition: Statistics.hpp:119
UFJF-MLTK main namespace for core functionalities.
Definition: classifier/Classifier.hpp:11