UFJF - Machine Learning Toolkit 0.51.8
KNNClassifier.hpp
//
// Created by mateus558 on 20/03/2020.
//

#ifndef UFJF_MLTK_KNN_HPP
#define UFJF_MLTK_KNN_HPP

#include "PrimalClassifier.hpp"
#include "ufjfmltk/core/DistanceMatrix.hpp"
#include "ufjfmltk/core/DistanceMetric.hpp"
#include "ufjfmltk/core/ThreadPool.hpp"

#include <algorithm>
#include <cassert>
#include <memory>
#include <numeric>
#include <string>
#include <thread>
#include <utility>
#include <vector>

namespace mltk{
    namespace classifier {
        /**
         * \brief Wrapper for the implementation of the K-Nearest Neighbors classifier algorithm.
         */
        template<typename T = double, typename Callable = metrics::dist::Euclidean<T> >
        class KNNClassifier : public PrimalClassifier<T> {
        private:
            /// Number of neighbors considered in the vote.
            size_t k = 3;
            /// Distance functor used to compare points.
            Callable dist_function;
            /// True when a precomputed distance matrix is available.
            bool precomputed = false;
            /// Neighbor search strategy (only "brute" is implemented).
            std::string algorithm = "brute";
            /// Precomputed pairwise distances between the training samples.
            metrics::dist::BaseMatrix distances;

        public:
            KNNClassifier() = default;

            explicit KNNClassifier(size_t _k, std::string _algorithm = "brute")
                : k(_k), algorithm(_algorithm) {}

            KNNClassifier(Data<T> &_samples, size_t _k, std::string _algorithm = "brute")
                : k(_k), algorithm(_algorithm) {
                this->samples = mltk::make_data<T>(_samples);
            }

            /// Executes the training phase of the learner.
            bool train() override;

            /// Returns the class of a feature point based on the trained learner.
            double evaluate(const Point<T> &p, bool raw_value = false) override;

            /// Gives access to the distance functor.
            Callable& metric(){ return dist_function; }

            /// Stores a precomputed distance matrix to be used during evaluation.
            void setPrecomputedDistances(metrics::dist::BaseMatrix _distances){
                this->distances = _distances;
                this->precomputed = true;
            }

            /// Computes the pairwise distance matrix of a dataset. The result is
            /// only returned, not stored; pass it to setPrecomputedDistances() to
            /// make the classifier use it.
            metrics::dist::DistanceMatrix<Callable> precomputeDistances(mltk::Data<T> &data, bool diagonal = false,
                                                                        const size_t threads = std::thread::hardware_concurrency()){
                this->precomputed = true;

                return metrics::dist::DistanceMatrix<Callable>(data, diagonal, threads);
            }
        };
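
        /* Usage sketch (illustrative only, not part of the original header). It
         * assumes a dataset file "iris.data" and that Data<T>::operator[] yields a
         * pointer to a Point<T>, as the samples container is used below:
         *
         *   mltk::Data<double> data("iris.data");
         *   mltk::classifier::KNNClassifier<double> knn(data, 5);
         *   knn.train();                            // no-op for brute-force KNN
         *   double label = knn.evaluate(*data[0]);  // predicted class of the first sample
         */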

        template<typename T, typename Callable>
        double KNNClassifier<T, Callable>::evaluate(const Point<T> &p, bool raw_value) {
            assert(this->samples->dim() == p.size());
            auto points = this->samples->points();
            mltk::Point<double> distances(this->samples->size());
            std::vector<int> classes = this->samples->classes();
            std::vector<size_t> idx(distances.size());
            std::vector<PointPointer<T>> neigh;
            auto p0 = std::make_shared<Point<T> >(p);

            if(algorithm == "brute"){
                // fill the index vector with 0, 1, ..., n-1
                std::iota(idx.begin(), idx.end(), 0);
                if(!precomputed) {
                    // compute the distance from the query point to every training sample
                    std::transform(points.begin(), points.end(), distances.begin(),
                                   [&p0, this](const std::shared_ptr<Point<T> > q) {
                                       return this->dist_function(*p0, *q);
                                   });

                    // partially sort the indexes so that the k nearest samples come first
                    std::nth_element(idx.begin(), idx.begin() + this->k, idx.end(),
                                     [&distances](size_t i1, size_t i2) {
                                         return distances[i1] < distances[i2];
                                     });
                }else if(!this->distances.isDiagonalMatrix()){
                    // full precomputed matrix: look distances up by the points' ids
                    size_t idp = p.Id()-1;
                    std::nth_element(idx.begin(), idx.begin() + this->k, idx.end(),
                                     [&idp, &points, this](size_t i1, size_t i2) {
                                         size_t id1 = points[i1]->Id()-1;
                                         size_t id2 = points[i2]->Id()-1;

                                         return this->distances[idp][id1] < this->distances[idp][id2];
                                     });
                } else {
                    // triangular (diagonal) precomputed matrix: index with the larger id first
                    std::nth_element(idx.begin(), idx.begin() + this->k, idx.end(),
                                     [&p, this, &points](size_t i1, size_t i2) {
                                         size_t id1 = points[i1]->Id()-1;
                                         size_t id2 = points[i2]->Id()-1;
                                         size_t idp = p.Id()-1;

                                         if(idp == id1 || idp == id2) return false;

                                         size_t idp1 = (idp > id1) ? idp : id1;
                                         size_t id1p = (idp > id1) ? id1 : idp;

                                         size_t idp2 = (idp > id2) ? idp : id2;
                                         size_t id2p = (idp > id2) ? id2 : idp;

                                         return this->distances[idp1][id1p] < this->distances[idp2][id2p];
                                     });
                }
            }
            // counts how many of the k nearest neighbors belong to class c
            // (only the "brute" path is implemented; neigh stays empty otherwise)
            auto calculateFrequency = [&idx, &points, &neigh, this](int c) {
                if (algorithm == "brute") {
                    return std::count_if(idx.begin(), idx.begin() + this->k, [&points, &c](size_t id) {
                        return points[id]->Y() == c;
                    });
                } else {
                    return std::count_if(neigh.begin(), neigh.end(), [&c](auto point) {
                        return point->Y() == c;
                    });
                }
            };

            // winning frequency and the index of the corresponding class
            std::pair<int, size_t> maxDetails{0, 0};

            double s = 0.0001;      // smoothing constant for the probability estimate
            double max_prob = 0.0;

            // majority vote among the k nearest neighbors
            for(size_t i = 0; i < classes.size(); ++i) {
                int freq = calculateFrequency(classes[i]);

                if(freq > maxDetails.first) {
                    // smoothed estimate of the probability of the winning class
                    double prob = (freq+s)/(k+classes.size()*s);

                    maxDetails.first = freq;
                    maxDetails.second = i;

                    max_prob = prob;
                }
            }

            // clamp probabilities that are numerically indistinguishable from 1
            this->pred_prob = (1-max_prob > 1E-7) ? max_prob : 1;

            return classes[maxDetails.second];
        }
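
        // Illustrative arithmetic for the smoothed probability above: with k = 5,
        // three classes and the winning class holding 4 of the 5 neighbors,
        // prob = (4 + 0.0001) / (5 + 3 * 0.0001), which is roughly 0.80.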

        // KNN is a lazy learner: there is no explicit training phase.
        template<typename T, typename Callable>
        bool KNNClassifier<T, Callable>::train() {
            return true;
        }
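
        /* Sketch of the precomputed-distance workflow (illustrative only): the
         * matrix returned by precomputeDistances() is not stored internally, so it
         * has to be handed back through setPrecomputedDistances(). The dataset
         * "data" is assumed to be loaded as in the usage sketch above:
         *
         *   mltk::classifier::KNNClassifier<double> knn(data, 5);
         *   auto dist_matrix = knn.precomputeDistances(data);
         *   knn.setPrecomputedDistances(dist_matrix);
         *   double label = knn.evaluate(*data[0]);
         */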
    }
}
#endif //UFJF_MLTK_KNN_HPP