5 #ifndef UFJF_MLTK_KMEANS_HPP
6 #define UFJF_MLTK_KMEANS_HPP
9 #include "ufjfmltk/clusterer/Clusterer.hpp"
16 using Clusters = std::vector<Cluster>;
21 template<
typename T =
double,
typename Callable = metrics::dist::Eucl
idean<T> >
25 std::string initialization;
32 bool train()
override;
34 double assign_clusters(
const std::vector<mltk::PointPointer<T>>& points);
36 double clusters_variance(
const Clusters& clusters);
38 void compute_centers();
42 std::vector<mltk::Point<size_t>> clusters()
override;
48 template<
typename T,
typename Callable>
58 template<
typename T,
typename Callable>
59 double KMeans<T, Callable>::assign_clusters(
const std::vector<mltk::PointPointer<T>>& points) {
63 std::for_each(points.begin(), points.end(), [&](
const std::shared_ptr<
Point<T> > q){
64 std::vector<double> distances(this->m_centers.size(), 0.0);
65 std::transform(this->m_centers.begin(), this->m_centers.end(), distances.begin(), [&](const Point<T>& center){
66 return this->dist_function(*q, center);
68 size_t cluster_id = std::min_element(distances.begin(), distances.end()) - distances.begin();
69 this->m_clusters[cluster_id].X().push_back(q->Id()-1);
70 cost += distances[cluster_id] * distances[cluster_id];
75 template<
typename T,
typename Callable>
76 void KMeans<T, Callable>::compute_centers(){
78 std::for_each(this->m_clusters.begin(), this->m_clusters.end(), [&](
const auto& cluster){
79 for(const size_t id: cluster){
80 this->m_centers[i] += *this->samples->point(id);
85 std::for_each(this->m_centers.begin(), this->m_centers.end(), [&](
auto& center){
86 center /= this->m_clusters[i].size();
91 template<
typename T,
typename Callable>
93 double cost = 0.0, old_cost = 0.0;
94 size_t dim = this->samples->
dim(), size = this->samples->
size();
95 size_t time = this->start_time + this->max_time;
96 auto points = this->samples->
points();
97 bool has_converged =
true;
100 std::shuffle(points.begin(), points.end(), std::mt19937(this->seed));
102 if (initialization ==
"random") {
103 std::vector<size_t> centers_ids(this->n_clusters);
106 std::generate(centers_ids.begin(), centers_ids.end(), [&size](){
107 return mltk::random::intInRange(0, int(size));
110 std::transform(centers_ids.begin(), centers_ids.end(), this->m_centers.begin(),
111 [
this](
const size_t ¢er_id) {
112 return (*this->samples)[center_id]->X();
114 }
else if (initialization ==
"kmeanspp") {
119 for (
size_t i = 1; i < this->m_centers.size(); i++) {
121 std::vector<double> distances(points.size(), 0.0);
123 std::transform(points.begin(), points.end(), distances.begin(),
124 [&i, &sum_d,
this](
const std::shared_ptr<
Point<T> > q) {
125 double d = this->dist_function(Point<T>(this->m_centers[i - 1]), *q);
129 for (
double &distance : distances) {
133 std::discrete_distribution<size_t> _dist(distances.begin(), distances.end());
136 this->m_centers[i] = points[center]->X();
141 std::cout <<
"----------------------------------------------------------------------------------------\n";
142 std::cout <<
" steps updates cost_function diff variance milli(ms)\n";
143 std::cout <<
"----------------------------------------------------------------------------------------\n";
148 has_converged =
true;
150 cost = assign_clusters(points);
155 double secs = this->
timer.elapsed();
157 auto diff = fabs(cost - old_cost);
158 std::cout <<
" " << this->steps <<
" " << this->ctot <<
" " << cost
159 <<
" " << diff <<
" " << clusters_variance(this->m_clusters) <<
" " << secs <<
"\n";
161 if (time - this->
timer.elapsed() <= 0)
break;
162 has_converged = fabs(cost - old_cost) <= this->EPS;
163 for(
auto& cluster: this->m_clusters){
166 }
while (!has_converged);
171 template<
typename T,
typename Callable>
173 std::vector<double> distances(this->m_centers.size(), 0.0);
174 std::transform(this->m_centers.begin(), this->m_centers.end(), distances.begin(), [&](
const Point<T>& center){
175 return this->dist_function(center, p);
177 size_t cluster_id = std::min_element(distances.begin(), distances.end()) - distances.begin();
181 template<
typename T,
typename Callable>
186 template<
typename T,
typename Callable>
188 assign_clusters(this->samples->
points());
189 return this->m_clusters;
192 template<
typename T,
typename Callable>
193 double KMeans<T, Callable>::clusters_variance(
const Clusters &clusters) {
195 std::for_each(clusters.begin(), clusters.end(), [&](
const Cluster& cluster){
196 Point<double> mean(this->samples->dim());
198 for(const size_t id: cluster){
199 mean += (*this->samples)(id);
201 mean /= cluster.size();
202 for(
const size_t id: cluster){
203 var += mltk::pow((*this->samples)(id)-mean, 2).sum();
size_t size() const
Returns the size of the dataset.
Definition: Data.hpp:208
size_t dim() const
Returns the dimension of the dataset.
Definition: Data.hpp:213
std::vector< std::shared_ptr< Point< T > > > points()
Returns a shared pointer to the vector of Points of the sample.
Definition: Data.hpp:1685
std::shared_ptr< Data< T > > samples
Samples used in the model training.
Definition: Learner.hpp:21
int verbose
Verbose level of the output.
Definition: Learner.hpp:42
size_t seed
seed for random operations.
Definition: Learner.hpp:46
Definition: clusterer/Clusterer.hpp:18
size_t n_clusters
Number of clusters for the cluster method.
Definition: clusterer/Clusterer.hpp:23
std::vector< mltk::Point< size_t > > m_clusters
Clusters of points.
Definition: clusterer/Clusterer.hpp:27
std::vector< mltk::Point< double > > m_centers
Vector with the centers of the clusters.
Definition: clusterer/Clusterer.hpp:25
Wrapper for the implementation of the K-Means clustering algorithm.
Definition: KMeans.hpp:22
bool train() override
Function that execute the training phase of a Learner.
Definition: KMeans.hpp:92
double evaluate(const Point< T > &p, bool raw_value=false) override
Returns the class of a feature point based on the trained Learner.
Definition: KMeans.hpp:172
std::string getFormulationString() override
getFormulationString Returns a string that represents the formulation of the learner (Primal or Dual)...
Definition: KMeans.hpp:182
A helper class to measure execution time for benchmarking purposes.
Definition: ThreadPool.hpp:503
Integral intInRange(Integral low, Integral1 high)
Returns a integer between low and high.
Definition: Random.cpp:27
size_t init(unsigned int seed=0)
Initialize the mersenne twister pseudorandom number generator.
Definition: Random.cpp:12
double mean(const mltk::Point< T, R > &p)
Compute the mean (average) of a point.
Definition: Statistics.hpp:103
double var(const mltk::Point< T, R > &p)
Compute the variance of a point.
Definition: Statistics.hpp:142
UFJF-MLTK main namespace for core functionalities.
Definition: classifier/Classifier.hpp:11
DataPointer< T > make_data(Types... args)
Makes a shared_pointer for a data object.
Definition: Data.hpp:532