UFJF - Machine Learning Toolkit  0.51.8
Statistics.hpp
Go to the documentation of this file.
1 
7 #pragma once
8 
9 #include <vector>
10 #include <cmath>
11 #include <iostream>
12 #include "Data.hpp"
13 
14 namespace mltk{
15  template < typename T > class Data;
16 }
17 
19 namespace mltk::stats {
/**
 * \brief Compute the mean (average) of the components of a point.
 * \param p Point whose components are averaged.
 * \return Sum of the components divided by the number of components.
 */
25  template < typename T, typename R >
26  double mean (const mltk::Point<T, R> &p);
/**
 * \brief Compute the mean of one feature over every point of a dataset.
 * \param data Dataset to read.
 * \param feat Position of the feature; the implementation asserts feat < data.dim().
 * \return Mean of the feature, or 0.0 when the dataset has no points.
 */
32  template <typename T>
33  double mean(const Data<T>& data, size_t feat);
/**
 * \brief Compute the standard deviation of the components of a point.
 * \param p Point to analyse.
 * \return Square root of the mean squared deviation (divisor is p.size()).
 */
39  template < typename T, typename R >
40  double std_dev(const mltk::Point<T, R> &p);
/**
 * \brief Compute the standard deviation of one feature over a dataset.
 * \param data Dataset to read.
 * \param feat Position of the feature.
 * \return Standard deviation using the divisor data.size() - 1; 0.0 for a single point.
 */
46  template < typename T >
47  double std_dev(const Data<T>& data, size_t feat);
/**
 * \brief Compute the variance of the components of a point.
 * \param p Point to analyse.
 * \return Mean squared deviation from the mean (divisor is p.size()).
 */
53  template < typename T, typename R >
54  double var(const mltk::Point<T, R> &p);
/**
 * \brief Compute the variance of a dataset around its per-feature means.
 * \param data Dataset to read.
 * \param feat Feature name to leave out: a feature whose entry in
 *             getFeaturesNames() equals feat does not contribute.
 * \return Average over the points of the squared Euclidean distance to the means.
 */
60  template <typename T>
61  double var(const Data<T>& data, size_t feat);
/**
 * \brief Compute the covariance between two points.
 * \param p First point.
 * \param p1 Second point; the implementation asserts both have the same size.
 * \return Sum of the products of the deviations divided by (size - 1).
 */
68  template < typename T, typename R >
69  double covar(const mltk::Point<T, R> &p, const mltk::Point<T, R> &p1);
/**
 * \brief Returns the radius of the ball that circumscribes the data.
 * \param data Dataset to read.
 * \param feat Feature name to leave out; a negative value keeps every feature.
 * \param q 2 selects the largest Euclidean distance to the per-feature mean,
 *          1 selects the largest absolute feature value; any other value yields 1.0.
 * \return The radius for the chosen q.
 */
77  template < typename T >
78  double radius(const Data<T>& data, int feat, double q);
/**
 * \brief Compute the distance between the centers of the binary classes.
 * \param data Dataset to read; points with Y() == 1 form one class, all others the second.
 * \param feat Feature name to leave out; a negative value keeps every feature.
 * \return Euclidean distance between the two class means.
 */
86  template < typename T >
87  double distCenters(const Data<T>& data, int feat);
/**
 * \brief Compute the distance between the centers of binary classes without given features.
 * \param data Dataset to read; points with Y() == 1 form one class, all others the second.
 * \param feats List of feature names the computation is restricted by.
 * \param index NOTE(review): not read by the implementation visible in this file - confirm intent.
 * \return Euclidean distance between the two class means.
 */
95  template < typename T >
96  double distCentersWithoutFeats(const Data<T>& data, const std::vector<int>& feats, int index);
97 
98  /*********************************************
99  * Implementation *
100  *********************************************/
101 
102  template < typename T, typename R >
103  double mean (const Point<T, R> &p){
104  assert(p.size() > 0);
105  return p.sum()/p.size();
106  }
107 
108  template <typename T>
109  double mean(const Data<T>& data, size_t feat){
110  assert(feat < data.dim());
111  double sum = 0.0;
112  for(size_t i = 0; i < data.size(); i++){
113  sum += (*data[i])[feat];
114  }
115  return (data.size()>0)?sum/data.size():0.0;
116  }
117 
118  template < typename T, typename R >
119  double std_dev(const Point<T, R> &p){
120  assert(p.size() > 0);
121  return std::sqrt((mltk::pow(p-mltk::stats::mean(p), 2)).sum()/p.size());
122  }
123 
124  template < typename T >
125  double std_dev(const Data<T>& data, size_t feat){
126  int i, size = data.size();
127  double avg, sd;
128  std::vector<std::shared_ptr<Point< T > > > points = data.points();
129 
130  if(size == 1) return 0.0;
131 
132  avg = mltk::stats::mean(data, feat);
133 
134  for(sd = 0.0, i = 0; i < data.size(); ++i){
135  sd += (points[i]->X()[feat] - avg)*(points[i]->X()[feat] - avg);
136  }
137 
138  return std::sqrt(sd/(data.size() - 1));
139  }
140 
141  template < typename T, typename R >
142  double var(const Point<T, R> &p){
143  assert(p.size() > 0);
144  return mltk::pow(p-mltk::stats::mean(p), 2).sum()/p.size();
145  }
146 
147  template <typename T>
148  double var(const Data<T>& data, size_t feat){
149  int i, j;
150  int dim = data.dim(), size = data.size();
151  std::vector<int> fnames = data.getFeaturesNames();
152  std::vector<double> avg(dim);
153  std::vector<std::shared_ptr<Point< T > > > points = data.points();
154 
155  for(j = 0; j < dim; ++j){
156  if(feat < 0 || fnames[j] != feat){
157  avg[j] = 0.0;
158  for(i = 0; i < size; ++i){
159  avg[j] += points[i]->X()[j];
160  }
161  avg[j] = avg[j] / size;
162  }
163  }
164 
165  double sum = 0.0;
166  for(i = 0; i < size; ++i){
167  double norm = 0.0;
168  for(j = 0; j < dim; ++j){
169  if(feat < 0 || fnames[j] != feat){
170  norm += std::pow(avg[j] - points[i]->X()[j], 2);
171  }
172  }
173  sum += norm;
174  }
175  sum = sum/size;
176  return sum;
177  }
178 
179  template < typename T, typename R >
180  double covar(const Point<T, R> &p, const Point<T, R> &p1){
181  assert(p.size() == p1.size());
182  return ((p-mltk::stats::mean(p))*(p1-mltk::stats::mean(p1))).sum()/(p1.size()-1.0);
183  }
184 
185  template < typename T >
186  double radius(const Data<T>& data, int feat, double q){
187  int i = 0, j = 0, dim = data.dim(), size = data.size();
188  double norm = 0.0;
189  double max = 1.0;
190  std::vector<int> fnames = data.getFeaturesNames();
191  std::vector<double> avg(dim, 0.0);
192  std::vector<std::shared_ptr<Point< T > > > points = data.points();
193 
194  if(q == 2){
195  for(j = 0; j < dim; ++j){
196  if(feat < 0 || fnames[j] != feat){
197  avg[j] = 0.0;
198  for(i = 0; i < size; ++i){
199  avg[j] += points[i]->X()[j];
200  }
201  avg[j] = avg[j] / size;
202  }
203  }
204 
205  for(max = 0, i = 0; i < size; ++i){
206  for(norm = 0, j = 0; j < dim; ++j){
207  if(feat < 0 || fnames[j] != feat){
208  norm += std::pow(avg[j] - points[i]->X()[j], 2);
209  }
210  }
211  norm = std::sqrt(norm);
212  if(max < norm) max = norm;
213  }
214 
215  }else if(q == 1){
216  for(max = 0, i = 0; i < size; ++i){
217  for(j = 0; j < dim; ++j){
218  if(feat < 0 || fnames[j] != feat)
219  if(max < fabs(points[i]->X()[j]))
220  max = fabs(points[i]->X()[j]);
221  }
222  }
223  }
224 
225  return max;
226  }
227 
228  template < typename T >
229  double distCenters(const Data<T>& data, int feat){
230  int i = 0, j = 0, dim = data.dim(), size = data.size();
231  double dist = 0.0;
232  int size_pos = 0, size_neg = 0;
233  std::vector<int> fnames = data.getFeaturesNames();
234  std::vector<double> avg_pos(dim, 0.0), avg_neg(dim, 0.0);
235  std::vector<std::shared_ptr<Point< T > > > points = data.points();
236 
237  for(size_pos = 0, size_neg = 0, i = 0; i < size; ++i){
238  if(points[i]->Y() == 1) size_pos++;
239  else size_neg++;
240  }
241 
242  for(j = 0; j < dim; ++j){
243  for(i = 0; i < size; ++i){
244  if(points[i]->Y() == 1){
245  avg_pos[j] += points[i]->X()[j];
246  }else
247  avg_neg[j] += points[i]->X()[j];
248  }
249 
250  avg_pos[j] /= (double)size_pos;
251  avg_neg[j] /= (double)size_neg;
252  }
253 
254  for(dist = 0.0, j = 0; j < dim; ++j){
255  if(feat < 0 || fnames[j] != feat)
256  dist += std::pow(avg_pos[j] - avg_neg[j], 2);
257  }
258 
259  return std::sqrt(dist);
260  }
261 
262  template < typename T >
263  double distCentersWithoutFeats(const Data<T>& data, const std::vector<int>& feats, int index){
264  int i = 0, j = 0, dim = data.dim(), size = data.size();
265  double dist = 0.0;
266  int size_pos = 0, size_neg = 0, featsize = feats.size();
267  std::vector<int> fnames = data.getFeaturesNames();
268  std::vector<double> avg_pos(dim, 0.0), avg_neg(dim, 0.0);
269  std::vector<std::shared_ptr<Point< T > > > points = data.points();
270 
271  for(size_pos = 0, size_neg = 0, i = 0; i < size; ++i){
272  if(points[i]->Y() == 1) size_pos++;
273  else size_neg++;
274  }
275 
276  for(j = 0; j < dim; ++j){
277  for(i = 0; i < size; ++i){
278  if(points[i]->Y() == 1)
279  avg_pos[j] += points[i]->X()[j];
280  else
281  avg_neg[j] += points[i]->X()[j];
282  }
283 
284  avg_pos[j] /= (double) size_pos;
285  avg_neg[j] /= (double) size_neg;
286  }
287 
288  for(dist = 0.0, j = 0; j < dim; ++j){
289  for(i = 0; i < featsize; ++i){
290  if(fnames[j] == feats[i])
291  dist -= std::pow(avg_pos[j] - avg_neg[j], 2);
292  }
293  }
294 
295  return std::sqrt(std::fabs(dist));
296  }
297 }
size_t size() const
Returns the size of the dataset.
Definition: Data.hpp:208
std::vector< int > getFeaturesNames() const
Returns the features names.
Definition: Data.hpp:1675
size_t dim() const
Returns the dimension of the dataset.
Definition: Data.hpp:213
std::vector< std::shared_ptr< Point< T > > > points()
Returns the vector of shared pointers to the Points of the sample.
Definition: Data.hpp:1685
Wrapper for the point data.
Definition: Point.hpp:42
T sum(const std::function< T(T)> &f=[](T const &t) { return t;}) const
Compute the sum of the components of the point.
Definition: Point.hpp:285
std::size_t size() const
Returns the dimension of the point.
Definition: Point.hpp:133
Namespace for statistical methods.
Definition: Statistics.hpp:19
double distCentersWithoutFeats(const Data< T > &data, const std::vector< int > &feats, int index)
Compute the distance between the centers of binary classes without given features.
Definition: Statistics.hpp:263
double mean(const mltk::Point< T, R > &p)
Compute the mean (average) of a point.
Definition: Statistics.hpp:103
double distCenters(const Data< T > &data, int feat)
Compute the distance between the centers of the binary classes, optionally leaving out one feature.
Definition: Statistics.hpp:229
double covar(const mltk::Point< T, R > &p, const mltk::Point< T, R > &p1)
Compute the covariance between two points.
Definition: Statistics.hpp:180
double var(const mltk::Point< T, R > &p)
Compute the variance of a point.
Definition: Statistics.hpp:142
double std_dev(const mltk::Point< T, R > &p)
Compute the standard deviation of a point.
Definition: Statistics.hpp:119
double radius(const Data< T > &data, int feat, double q)
Returns the radius of the ball that circumscribes the data.
Definition: Statistics.hpp:186
UFJF-MLTK main namespace for core functionalities.
Definition: classifier/Classifier.hpp:11
T max(const Point< T, R > &p)
Returns the max value of the point.
Definition: Point.hpp:544