UFJF - Machine Learning Toolkit  0.51.8
KMeans.hpp
1 //
2 // Created by mateus558 on 26/03/2020.
3 //
4 
5 #ifndef UFJF_MLTK_KMEANS_HPP
6 #define UFJF_MLTK_KMEANS_HPP
7 #pragma once
8 
9 #include "ufjfmltk/clusterer/Clusterer.hpp"
10 #include <random>
11 
12 namespace mltk{
13  namespace clusterer {
14 
15  using Cluster = mltk::Point<size_t>;
16  using Clusters = std::vector<Cluster>;
17 
21  template<typename T = double, typename Callable = metrics::dist::Euclidean<T> >
22  class KMeans : public Clusterer<T> {
23  private:
25  std::string initialization;
26 
27  public:
28  KMeans() = default;
29  KMeans(const Data<T>& samples, size_t k, std::string initialization = "random", size_t seed = 0,
30  int verbose = 0);
31 
32  bool train() override;
33 
34  double assign_clusters(const std::vector<mltk::PointPointer<T>>& points);
35 
36  double clusters_variance(const Clusters& clusters);
37 
38  void compute_centers();
39 
40  double evaluate(const Point<T> &p, bool raw_value = false) override;
41 
42  std::vector<mltk::Point<size_t>> clusters() override;
43 
44  std::string getFormulationString() override;
45 
46  };
47 
48  template<typename T, typename Callable>
49  KMeans<T, Callable>::KMeans(const mltk::Data<T>& samples, size_t k, std::string initialization, size_t seed,
50  int verbose)
51  : Clusterer<T>(mltk::make_data<T>(samples), k), initialization(std::move(initialization)){
52  this->m_centers.assign(this->n_clusters, std::vector<T>(this->samples->dim(), 0.0));
53  this->m_clusters.assign(this->n_clusters, std::vector<size_t>());
54  this->verbose = verbose;
55  this->seed = seed;
56  }
57 
58  template<typename T, typename Callable>
59  double KMeans<T, Callable>::assign_clusters(const std::vector<mltk::PointPointer<T>>& points) {
60  double sum_d = 0;
61  double cost = 0.0;
62 
63  std::for_each(points.begin(), points.end(), [&](const std::shared_ptr<Point<T> > q){
64  std::vector<double> distances(this->m_centers.size(), 0.0);
65  std::transform(this->m_centers.begin(), this->m_centers.end(), distances.begin(), [&](const Point<T>& center){
66  return this->dist_function(*q, center);
67  });
68  size_t cluster_id = std::min_element(distances.begin(), distances.end()) - distances.begin();
69  this->m_clusters[cluster_id].X().push_back(q->Id()-1);
70  cost += distances[cluster_id] * distances[cluster_id];
71  });
72  return cost;
73  }
74 
75  template<typename T, typename Callable>
76  void KMeans<T, Callable>::compute_centers(){
77  int i = 0;
78  std::for_each(this->m_clusters.begin(), this->m_clusters.end(), [&](const auto& cluster){
79  for(const size_t id: cluster){
80  this->m_centers[i] += *this->samples->point(id);
81  }
82  i++;
83  });
84  i = 0;
85  std::for_each(this->m_centers.begin(), this->m_centers.end(), [&](auto& center){
86  center /= this->m_clusters[i].size();
87  i++;
88  });
89  }
90 
91  template<typename T, typename Callable>
93  double cost = 0.0, old_cost = 0.0;
94  size_t dim = this->samples->dim(), size = this->samples->size();
95  size_t time = this->start_time + this->max_time;
96  auto points = this->samples->points();
97  bool has_converged = true;
98  mltk::random::init(this->seed);
99 
100  std::shuffle(points.begin(), points.end(), std::mt19937(this->seed));
101 
102  if (initialization == "random") {
103  std::vector<size_t> centers_ids(this->n_clusters);
104 
105  // generate the ids of the centers in the dataset using the random number generator function
106  std::generate(centers_ids.begin(), centers_ids.end(), [&size](){
107  return mltk::random::intInRange(0, int(size));
108  });
109  // get the values from the dataset for the centers
110  std::transform(centers_ids.begin(), centers_ids.end(), this->m_centers.begin(),
111  [this](const size_t &center_id) {
112  return (*this->samples)[center_id]->X();
113  });
114  } else if (initialization == "kmeanspp") {
115  // choose the first center randomly
116  this->m_centers[0] = points[mltk::random::intInRange(0, int(size))]->X();
117  // choose the next cluster in points with a probability directly proportional to the metrics from the
118  // last chosen cluster.
119  for (size_t i = 1; i < this->m_centers.size(); i++) {
120  double sum_d = 0.0;
121  std::vector<double> distances(points.size(), 0.0);
122  //compute the distances from the points to the last cluster
123  std::transform(points.begin(), points.end(), distances.begin(),
124  [&i, &sum_d, this](const std::shared_ptr<Point<T> > q) {
125  double d = this->dist_function(Point<T>(this->m_centers[i - 1]), *q);
126  sum_d += d;
127  return d;
128  });
129  for (double &distance : distances) {
130  distance /= sum_d;
131  }
132  // use the distances as a probability distribution
133  std::discrete_distribution<size_t> _dist(distances.begin(), distances.end());
134  // generate the id for the next cluster
135  size_t center = mltk::random::intInRange(0, int(size));
136  this->m_centers[i] = points[center]->X();
137  }
138  }
139 
140  if (this->verbose) {
141  std::cout << "----------------------------------------------------------------------------------------\n";
142  std::cout << " steps updates cost_function diff variance milli(ms)\n";
143  std::cout << "----------------------------------------------------------------------------------------\n";
144  }
145  this->timer.reset();
146  do {
147  old_cost = cost;
148  has_converged = true;
149  // assign each point to the nearest cluster
150  cost = assign_clusters(points);
151  // update the centers of the clusters
152  compute_centers();
153 
154  this->steps++;
155  double secs = this->timer.elapsed();
156  if (this->verbose) {
157  auto diff = fabs(cost - old_cost);
158  std::cout << " " << this->steps << " " << this->ctot << " " << cost
159  << " " << diff << " " << clusters_variance(this->m_clusters) << " " << secs << "\n";
160  }
161  if (time - this->timer.elapsed() <= 0) break;
162  has_converged = fabs(cost - old_cost) <= this->EPS;
163  for(auto& cluster: this->m_clusters){
164  cluster.clear();
165  }
166  } while (!has_converged);
167 
168  return true;
169  }
170 
171  template<typename T, typename Callable>
172  double KMeans<T, Callable>::evaluate(const Point<T> &p, bool raw_value) {
173  std::vector<double> distances(this->m_centers.size(), 0.0);
174  std::transform(this->m_centers.begin(), this->m_centers.end(), distances.begin(), [&](const Point<T>& center){
175  return this->dist_function(center, p);
176  });
177  size_t cluster_id = std::min_element(distances.begin(), distances.end()) - distances.begin();
178  return cluster_id;
179  }
180 
181  template<typename T, typename Callable>
183  return "Clusterer";
184  }
185 
186  template<typename T, typename Callable>
187  std::vector<mltk::Point<size_t>> KMeans<T, Callable>::clusters() {
188  assign_clusters(this->samples->points());
189  return this->m_clusters;
190  }
191 
192  template<typename T, typename Callable>
193  double KMeans<T, Callable>::clusters_variance(const Clusters &clusters) {
194  double var = 0.0;
195  std::for_each(clusters.begin(), clusters.end(), [&](const Cluster& cluster){
196  Point<double> mean(this->samples->dim());
197 
198  for(const size_t id: cluster){
199  mean += (*this->samples)(id);
200  }
201  mean /= cluster.size();
202  for(const size_t id: cluster){
203  var += mltk::pow((*this->samples)(id)-mean, 2).sum();
204  }
205  });
206 
207  return var;
208  }
209 
210  }
211 }
212 
213 #endif //UFJF_MLTK_KMEANS_HPP
size_t size() const
Returns the size of the dataset.
Definition: Data.hpp:208
size_t dim() const
Returns the dimension of the dataset.
Definition: Data.hpp:213
std::vector< std::shared_ptr< Point< T > > > points()
Returns a shared pointer to the vector of Points of the sample.
Definition: Data.hpp:1685
std::shared_ptr< Data< T > > samples
Samples used in the model training.
Definition: Learner.hpp:21
int verbose
Verbose level of the output.
Definition: Learner.hpp:42
size_t seed
seed for random operations.
Definition: Learner.hpp:46
Definition: clusterer/Clusterer.hpp:18
size_t n_clusters
Number of clusters for the cluster method.
Definition: clusterer/Clusterer.hpp:23
std::vector< mltk::Point< size_t > > m_clusters
Clusters of points.
Definition: clusterer/Clusterer.hpp:27
std::vector< mltk::Point< double > > m_centers
Vector with the centers of the clusters.
Definition: clusterer/Clusterer.hpp:25
Wrapper for the implementation of the K-Means clustering algorithm.
Definition: KMeans.hpp:22
bool train() override
Function that execute the training phase of a Learner.
Definition: KMeans.hpp:92
double evaluate(const Point< T > &p, bool raw_value=false) override
Returns the class of a feature point based on the trained Learner.
Definition: KMeans.hpp:172
std::string getFormulationString() override
getFormulationString Returns a string that represents the formulation of the learner (Primal or Dual)...
Definition: KMeans.hpp:182
A helper class to measure execution time for benchmarking purposes.
Definition: ThreadPool.hpp:503
Integral intInRange(Integral low, Integral1 high)
Returns a integer between low and high.
Definition: Random.cpp:27
size_t init(unsigned int seed=0)
Initialize the mersenne twister pseudorandom number generator.
Definition: Random.cpp:12
double mean(const mltk::Point< T, R > &p)
Compute the mean (average) of a point.
Definition: Statistics.hpp:103
double var(const mltk::Point< T, R > &p)
Compute the variance of a point.
Definition: Statistics.hpp:142
UFJF-MLTK main namespace for core functionalities.
Definition: classifier/Classifier.hpp:11
DataPointer< T > make_data(Types... args)
Makes a shared_pointer for a data object.
Definition: Data.hpp:532