6 #ifndef VALIDATION__HPP
7 #define VALIDATION__HPP
30 double sensitivity{0.0}, specificity{0.0};
33 size_t tp{0}, tn{0}, fp{0}, fn{0};
48 std::cout <<
"True positives: " << tp << std::endl;
49 std::cout <<
"True negatives: " << tn << std::endl;
50 std::cout <<
"False positives: " << fp << std::endl;
51 std::cout <<
"False negatives: " << fn << std::endl;
52 std::cout <<
"Precision: " <<
precision << std::endl;
53 std::cout <<
"Recall: " <<
precision << std::endl;
54 std::cout <<
"F1-score: " <<
precision << std::endl;
55 std::cout <<
"Accuracy: " <<
accuracy << std::endl;
66 std::vector<unsigned int> seed;
68 double initial_error = 0.0;
69 double actual_error = 0.0;
70 double limit_error = 0.0;
98 template<
typename T >
109 template <
typename T>
110 inline ValidationReport
metricsReport(
const Data<T>& data,
const std::vector<std::vector<size_t> > &cfm,
111 std::vector<int> positive_labels = std::vector<int>());
120 template<
typename T,
typename Classifier>
121 inline double accuracy(
const Data<T>& data, Classifier& model,
bool trained =
true);
130 inline ValidationReport classificationReport(
const Point<int> &real,
const Point<int> &predicted);
141 template<
typename T >
142 std::vector<TrainTestPair<T>>
kfoldsplit(
Data<T> &samples,
size_t folds=5,
bool stratified=
true,
bool keepIndex=
true,
155 template<
typename T >
156 std::vector<TrainTestPair<T>>
kfoldsplit(
Data<T> &samples,
size_t folds,
size_t qtde,
157 bool stratified=
true,
bool keepIndex=
true,
size_t seed=0);
167 TrainTestPair<T>
partTrainTest(
Data<T> &data,
size_t fold,
bool stratified=
true,
bool keepIndex=
true,
size_t seed=0);
177 template <
typename T>
179 bool stratified=
true,
size_t seed=0,
int verbose=0);
192 template <
typename T>
194 size_t fold,
bool stratified =
true,
size_t seed = 0,
int verbose = 0);
200 template <
typename T>
202 const size_t fold,
bool stratified,
const size_t seed,
const int verbose){
203 size_t _seed = (seed == 0) ? std::random_device{}() : seed;
204 auto valid_pair =
partTrainTest(samples, fold,
true,
true, _seed);
206 size_t fp = 0, fn = 0, tp = 0, tn = 0, erro=0;
207 double error = 0, errocross = 0, func = 0.0, margin = 0, bias;
208 std::vector<double> w;
209 auto classes = samples.
classes();
217 for(errocross = 0, i = 0; i < qtde; i++)
219 if(verbose) std::cout <<
"\nExecucao " << i + 1 <<
" / " << qtde <<
":\n";
220 errocross +=
kfold(samples, classifier, fold, stratified, _seed + i, verbose).error;
222 if(verbose >= 1)std::cout <<
"\n\nErro " << fold <<
"-Fold Cross Validation: " << errocross/qtde <<
"%\n";
223 solution.
accuracy = 100.0 - errocross/qtde;
224 solution.error = 100.0 - solution.
accuracy;
226 solution.
recall /= qtde*fold;
227 solution.
tnrate /= qtde*fold;
237 std::cout <<
"\nFinal Validation:\n";
238 std::cout <<
"Pts de Treino: " << valid_pair.train.size() <<
"\n";
239 std::cout <<
"Pts de Teste: " << valid_pair.test.size() <<
"\n";
242 valid_pair.train.setClasses(samples.
classes());
243 classifier.
setSamples(mltk::make_data<T>(valid_pair.train));
249 if(!classifier.
train()){
251 std::cerr <<
"Validation error: The convergency wasn't reached in the training set!\n";
258 for(
auto it = valid_pair.test.begin(); it != valid_pair.test.end(); it++, i++){
260 double _y = classifier.
evaluate(*point);
262 if(point->Y() != _y){
264 std::cerr <<
"[" << i+1 <<
"x] function: " << _y <<
", y: " << point->Y() << std::endl;
266 if(classes.size() == 2 && point->Y() == -1) fp++;
else fn++;
269 std::cerr <<
"[" << i+1 <<
"] function: " << _y <<
", y: " << point->Y() << std::endl;
270 if(classes.size() == 2 && point->Y() == -1) tn++;
else tp++;
277 std::shared_ptr<Data< T > > traintest_sample(mltk::make_data<T>());
279 *traintest_sample = valid_pair.test;
280 traintest_sample->join(valid_pair.train);
281 traintest_sample->setClasses(classes);
285 std::cerr <<
"Validation error: The convergency wasn't reached in the training set!\n";
289 for(
auto it = valid_pair.test.begin(); it != valid_pair.test.end(); it++, i++){
293 if(point->Y() != _y){
295 std::cerr <<
"[" << i+1 <<
"x] function: " << _y <<
", y: " << point->Y() << std::endl;
297 if(point->Y() == -1 && classes.size() == 2) fp++;
else fn++;
300 std::cerr <<
"[" << i+1 <<
"] function: " << _y <<
", y: " << point->Y() << std::endl;
301 if(point->Y() == -1 && classes.size() == 2) tn++;
else tp++;
304 if(verbose) std::cout.flush();
307 if(verbose >= 1) std::cout <<
"Validation Error: " << erro <<
" -- " << ((double)erro/(
double) valid_pair.test.size()) * 100.0f <<
"%\n";
311 template <
typename T>
313 bool stratified,
const size_t seed,
const int verbose){
315 std::vector<double> error_arr(fold);
316 auto classes = sample.
classes();
317 size_t _seed = (seed == 0) ? std::random_device{}() : seed;
319 std::vector<TrainTestPair<T>> folds =
kfoldsplit(sample, fold, stratified,
true, seed);
323 for(
size_t fp = 0, fn = 0, tp = 0, tn = 0, j = 0; j < fold; ++j){
324 auto _test_sample = folds[j].test;
325 auto _train_sample = folds[j].train;
328 std::cout <<
"\nCross-Validation " << j + 1 <<
": \n";
329 std::cout <<
"Train points: " << _train_sample.size() << std::endl;
330 std::cout <<
"Train distribution: " <<
mltk::Point<size_t>(_train_sample.classesDistribution()) << std::endl;
331 std::cout <<
"Test points: " << _test_sample.
size() << std::endl;
332 std::cout <<
"Test distribution: " <<
mltk::Point<size_t>(_test_sample.classesDistribution()) << std::endl;
333 std::cout << std::endl;
342 if(!classifier.
train()){
344 std::cerr <<
"Error at " << fold <<
"-fold: The convergency wasn't reached at the set " << j+1 <<
"!\n";
349 for(
auto it = _test_sample.begin(); it != _test_sample.end(); it++, i++){
351 double _y = classifier.
evaluate(*point);
353 if(point->Y() != _y){
355 std::cerr <<
"[" << i+1 <<
"x] function: " << _y <<
", y: " << point->Y() << std::endl;
357 if(classes.size() == 2 && point->Y() == -1) fp++;
else fn++;
360 std::cerr <<
"[" << i+1 <<
"] function: " << _y <<
", y: " << point->Y() << std::endl;
361 if(classes.size() == 2 && point->Y() == -1) tn++;
else tp++;
366 std::shared_ptr<Data< T > > traintest_sample(std::make_shared<
Data< T > >());
367 traintest_sample = mltk::make_data<T>(_test_sample);
368 traintest_sample->join(_train_sample);
369 traintest_sample->setClasses(classes);
371 dual->recomputeKernel();
374 std::cerr <<
"Validation error: The convergency wasn't reached in the training set!\n";
378 for(
auto it = _test_sample.begin(); it != _test_sample.end(); it++, i++){
381 if(point->Y() != _y){
383 std::cerr <<
"[" << i+1 <<
"x] function: " << _y <<
", y: " << point->Y() << std::endl;
385 if(point->Y() == -1 && classes.size() == 2) fp++;
else fn++;
388 std::cerr <<
"[" << i+1 <<
"] function: " << _y <<
", y: " << point->Y() << std::endl;
389 if(point->Y() == -1 && classes.size() == 2) tn++;
else tp++;
395 if(verbose) std::cout <<
"Error " << j + 1 <<
": " << error_arr[j] <<
" -- " << ((double)error_arr[j]/(
double) _test_sample.size()) * 100.0f <<
"%\n";
396 error += ((double)error_arr[j]/(
double) _test_sample.size()) * 100.0f;
397 if(classes.size() == 2){
398 solution.
accuracy += (double)(tp + tn)/(double)(tp + tn + fp + fn);
399 solution.
precision += (double)tp/(
double)(tp + fp);
400 solution.
recall += (double)tp/(
double)(tp + fn);
401 solution.
tnrate += (double)tn/(
double)(tn + fp);
408 solution.error = (((double)error)/(double)fold);
409 solution.
accuracy = 100.0 - solution.error;
416 size_t _seed = (seed == 0) ? std::random_device{}() : seed;
417 std::vector<Data<T> > folds = samples.
splitSample(fold, stratified, keepIndex, seed);
420 for(
auto it = folds.begin(); it != folds.end()-1; it++){
422 for(
auto p = _data.begin(); p != _data.end(); p++){
427 size_t last_fold = folds.size()-1;
428 for(
auto it = folds[last_fold].begin(); it != folds[last_fold].end(); it++){
441 template<
typename T >
442 std::vector<TrainTestPair<T>>
kfoldsplit(
Data<T> &samples,
const size_t folds,
const size_t qtde,
443 bool stratified,
bool keepIndex,
const size_t seed) {
444 std::vector<TrainTestPair<T> > kkfold_split;
445 size_t _seed = (seed == 0) ? std::random_device{}() : seed;
447 kkfold_split.reserve(qtde*folds);
448 for(
int i = 0; i < qtde; i++){
449 size_t other_seed = (seed == 0) ? std::random_device{}() : _seed+i;
450 auto kfold_split =
kfoldsplit(samples, folds, stratified, keepIndex, other_seed);
452 for(
size_t j = 0; j < kfold_split.size(); j++){
453 kfold_split[j].execution = i+1;
456 kkfold_split.insert(kkfold_split.end(), kfold_split.begin(), kfold_split.end());
461 template<
typename T >
462 std::vector<TrainTestPair<T>>
kfoldsplit(
Data<T> &data,
const size_t folds,
bool stratified,
bool keepIndex,
465 size_t _seed = (seed == 0) ? std::random_device{}() : seed;
467 std::vector<Data<T> > data_folds = samples.
splitSample(folds, stratified, keepIndex, seed);
468 std::vector<TrainTestPair<T> > kfold_split;
470 kfold_split.reserve(folds);
472 for(
int i = 0; i < folds; i++){
474 train.
join(data_folds[i]);
477 int next_j=(i+2) % folds;
479 for(
int j = (i+1) % folds; gone < folds-2; gone++, j = (j+1) % folds){
480 train.
join(data_folds[j]);
481 next_j = (j+1) % folds;
484 auto test = data_folds[(next_j)%folds];
487 test.shuffle(_seed+i);
489 train.
setName(data.
name() +
"_train_fold_"+std::to_string(i));
490 test.setName(data.
name() +
"_test_fold_"+std::to_string(i));
495 train.
setName(data.
name()+
"_train_fold_"+std::to_string(i+1));
496 test.setName(data.
name()+
"_test_fold_"+std::to_string(i+1));
498 kfold_split.emplace_back(train, test);
499 kfold_split.back().fold = i+1;
505 double errors = 0, total = 0;
506 for(
size_t i = 0; i < conf_matrix.size(); i++){
507 for(
size_t j = 0; j < conf_matrix[i].size(); j++){
509 errors += conf_matrix[i][j];
511 total += conf_matrix[i][j];
514 return (1 - errors/total)*100;
517 template<
typename T,
typename Classifier>
518 inline double accuracy(
const Data<T>& data, Classifier& model,
bool trained){
523 for(
int i = 0; i < data.
size(); i++){
524 auto point = data(i);
525 if(model.evaluate(point) == point.Y()){
529 return acc/data.
size();
532 template <
typename T>
534 std::vector<int> positive_labels){
538 std::vector<char> is_positive;
540 if(positive_labels.empty()){
541 is_positive.resize(classes.size(),
true);
543 is_positive.resize(classes.size(),
false);
544 std::for_each(positive_labels.begin(), positive_labels.end(), [&](
const int& label){
545 int pos = std::find(classes.begin(), classes.end(), label) - classes.begin();
546 assert((pos < classes.size()) &&
"Label not found.");
547 is_positive[pos] = true;
551 for(
int i = 0; i < cfm.size(); i++){
552 for(
int j = 0; j < cfm.size(); j++){
555 report.tp += cfm[i][j];
557 report.tn += cfm[i][j];
561 report.fn += cfm[i][j];
563 report.fp += cfm[i][j];
570 report.error = 1.0 - report.
accuracy;
571 report.errors = report.fp+report.fn;
572 report.sensitivity = (double)report.tp/(report.tp + report.fn);
573 report.specificity = (double)report.tn/(report.tn + report.fp);
574 report.
precision = (double)report.tp/(report.tp + report.fp);
575 report.
recall = report.sensitivity;
581 template<
typename T >
583 auto classes = samples.
classes();
584 size_t size = samples.
size(), i, j, idp, idy, n_classes = classes.size();
585 std::vector<std::vector<size_t> > confusion_m(n_classes, std::vector<size_t>(n_classes, 0));
588 for(i = 0; i < size; i++){
589 int pred = learner.
evaluate(*(samples[i]));
590 for(j = 0, idp = 0, idy = 0; j < n_classes; j++) {
591 if (classes[j] == pred) {
594 if (classes[j] == samples[i]->Y()) {
597 if (idp && idy)
break;
599 confusion_m[idp][idy]++;
605 assert((real.
size() == predicted.
size()) &&
"Desired and predicted values points of different sizes. ");
606 ValidationReport report;
607 std::set<int> unique_labels;
608 for(
int i = 0; i < real.
size(); i++){
609 unique_labels.insert(real[i]);
610 unique_labels.insert(predicted[i]);
612 for(
int i = 0; i < real.
size(); i++){
613 if(real[i] == predicted[i]){
615 if(unique_labels.size() == 2){
622 }
else if(unique_labels.size() == 2){
630 report.accuracy /= real.
size();
631 report.sensitivity = report.tp/(report.tp+report.fn);
632 report.specificity = report.tn/(report.fn+report.tn);
633 report.precision = report.tp/(report.tp+report.fp);
634 report.recall = report.tp/(report.tp+report.fn);
635 report.fscore = 2*(report.precision*report.recall)/(report.precision + report.recall);
bool insertPoint(const Data< T > &samples, int _index, bool keepIndex=false)
Insert a point to the data from another sample.
Definition: Data.hpp:1437
std::vector< Data< T > > splitSample(const std::size_t &split_size, bool stratified=true, bool keepIndex=false, size_t seed=0)
Split the data by a given size.
Definition: Data.hpp:1855
void join(const Data< T > &data)
Merge one dataset with another.
Definition: Data.hpp:1598
size_t size() const
Returns the size of the dataset.
Definition: Data.hpp:208
void resetIndex()
Reset the index vector.
Definition: Data.hpp:1759
std::string name() const
The name of the dataset is defined as the name of the file from which it was loaded.
Definition: Data.hpp:203
const std::vector< int > classes() const
Returns a vector containing the numeric values of the classes.
Definition: Data.hpp:1831
void setName(const std::string &name)
Set the name of the dataset.
Definition: Data.hpp:277
mltk::Data< T > copy() const
Returns a copy of itself.
Definition: Data.hpp:1551
void shuffle(const size_t &seed=42)
Shuffle the data with a given seed.
Definition: Data.hpp:1349
virtual void setSamples(const Data< T > &data)
setSamples Set the samples used by the Learner.
Definition: Learner.hpp:150
void setSeed(const size_t _seed)
Set the seed to be used by the learner.
Definition: Learner.hpp:145
virtual bool train()=0
Function that execute the training phase of a Learner.
virtual std::string getFormulationString()=0
getFormulationString Returns a string that represents the formulation of the learner (Primal or Dual)...
virtual double evaluate(const Point< T > &p, bool raw_value=false)=0
Returns the class of a feature point based on the trained Learner.
void setVerbose(int _verbose)
Set the level of verbose.
Definition: Learner.hpp:175
Wrapper for the point data.
Definition: Point.hpp:42
Rep const & X() const
Returns the attributes representation of the point (std::vector by default).
Definition: Point.hpp:139
std::size_t size() const
Returns the dimension of the point.
Definition: Point.hpp:133
Definition: Solution.hpp:13
double bias
Bias of the solution.
Definition: Solution.hpp:23
mltk::Point< double > w
Weights vector.
Definition: Solution.hpp:17
Definition: classifier/Classifier.hpp:17
Solution getSolution() const
getSolution Returns the solution of the classifier.
Definition: classifier/Classifier.hpp:52
Definition: DualClassifier.hpp:16
virtual double evaluate(const Point< T > &p, bool raw_value=false) override
Returns the class of a feature point based on the trained Learner.
Definition: DualClassifier.hpp:28
Validation methods namespace.
Definition: valid/Validation.hpp:17
double confusionMatrixAccuracy(const std::vector< std::vector< size_t > > &conf_matrix)
Compute the accuracy based on a confusion matrix.
Definition: valid/Validation.hpp:504
std::vector< std::vector< size_t > > generateConfusionMatrix(Data< T > &samples, Learner< T > &learner)
Compute the confusion matrix for a given trained classifier.
Definition: valid/Validation.hpp:582
ValidationReport metricsReport(const Data< T > &data, const std::vector< std::vector< size_t > > &cfm, std::vector< int > positive_labels=std::vector< int >())
Generates a report with classifiers metrics.
Definition: valid/Validation.hpp:533
TrainTestPair< T > partTrainTest(Data< T > &data, size_t fold, bool stratified=true, bool keepIndex=true, size_t seed=0)
Divide the samples in training and test set.
Definition: valid/Validation.hpp:414
std::vector< TrainTestPair< T > > kfoldsplit(Data< T > &samples, size_t folds=5, bool stratified=true, bool keepIndex=true, size_t seed=0)
Split the data in k folds.
Definition: valid/Validation.hpp:462
ValidationReport kkfold(Data< T > samples, classifier::Classifier< T > &classifier, size_t qtde, size_t fold, bool stratified=true, size_t seed=0, int verbose=0)
Executes the validation with several executions of the k fold algorithm.
Definition: valid/Validation.hpp:201
ValidationReport kfold(Data< T > sample, classifier::Classifier< T > &classifier, size_t fold, bool stratified=true, size_t seed=0, int verbose=0)
Executes k-fold stratified cross-validation.
Definition: valid/Validation.hpp:312
Structure to manage cross validation.
Definition: valid/Validation.hpp:62
A struct representing a pair with training and test data.
Definition: valid/Validation.hpp:77
Data< T > test
Test data.
Definition: valid/Validation.hpp:81
Data< T > train
Train data.
Definition: valid/Validation.hpp:79
Solution for the validation of a ML method.
Definition: valid/Validation.hpp:21
size_t trueNegative
Number of true negatives generated by the classifier in the evaluation.
Definition: valid/Validation.hpp:43
double precision
Precision of the validated model.
Definition: valid/Validation.hpp:27
double recall
Recall of the validated model.
Definition: valid/Validation.hpp:29
double tprate
True positive rate.
Definition: valid/Validation.hpp:37
size_t truePositive
Number of true positives generated by the classifier in the evaluation.
Definition: valid/Validation.hpp:45
double accuracy
Accuracy of the validated model.
Definition: valid/Validation.hpp:24
size_t falseNegative
Number of false negatives generated by the classifier in the evaluation.
Definition: valid/Validation.hpp:41
double tnrate
True negative rate.
Definition: valid/Validation.hpp:35
size_t falsePositive
Number of false positives generated by the classifier in the evaluation.
Definition: valid/Validation.hpp:39