UFJF - Machine Learning Toolkit  0.51.8
All Classes Namespaces Files Functions Variables Pages
Data.hpp
Go to the documentation of this file.
1 /*
2  _oo0oo_
3  o8888888o
4  88" . "88
5  (| -_- |)
6  0\ = /0
7  ___/`---'\___
8  .' \\| |// '.
9  / \\||| : |||// \
10  / _||||| -:- |||||- \
11  | | \\\ - /// | |
12  | \_| ''\---/'' |_/ |
13  \ .-\__ '-' ___/-. /
14  ___'. .' /--.--\ `. .'___
15  ."" '< `.___\_<|>_/___.' >' "".
16  | | : `- \`.;`\ _ /`;.`/ - ` : | |
17  \ \ `_. \_ __\ /__ _/ .-` / /
18  =====`-.____`.___ \_____/___.-`___.-'=====
19  `=---='
20 
21 
22  ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
23 
24  A Buddha statue to bless your
25  code to be bug free
26 */
27 
33 #pragma once
34 
35 #include <utility>
36 #include <cmath>
37 #include <vector>
38 #include <cstring>
39 #include <string>
40 #include <numeric>
41 #include <algorithm>
42 #include <sstream>
43 #include <iostream>
44 #include <fstream>
45 #include <memory>
46 #include <random>
47 #include <set>
48 #include <chrono>
49 #include <map>
50 
51 #include "Point.hpp"
52 #include "Statistics.hpp"
53 #include "Utils.hpp"
54 
55 namespace mltk{
/// File extensions known to the toolkit. The first four positions line up with
/// the non-negative values of `Type`; "plt" has no matching enumerator.
static const std::vector<std::string> types {
    "data",
    "csv",
    "arff",
    "txt",
    "plt"
};

/// Identifier for the on-disk format of a dataset file.
enum Type {
    TYPE_INVALID = -1,
    TYPE_DATA    = 0,
    TYPE_CSV     = 1,
    TYPE_ARFF    = 2,
    TYPE_TXT     = 3
};
58 
59  template < typename T >
60  class Statistics;
61 
62  template < typename T >
63  class Data;
64 
65  template < class T > using DataPointer = std::shared_ptr<mltk::Data< T > >;
66  template < class T > using SamplePointer = std::shared_ptr<mltk::Point< T > >;
67  template < class T > using SampleIterator = typename std::vector<SamplePointer< T > >::iterator;
68  template < class T > using ConstSampleIterator = typename std::vector<SamplePointer< T > >::const_iterator;
69 
/**
 * Container for a dataset: a vector of shared Point<T> samples plus
 * bookkeeping (feature names, sample index, class names / values /
 * distribution, cached size and dimension, and a free-form task `type`
 * string). Most member functions are only declared here; comments below
 * describe behaviour only where a definition is visible in this file.
 */
73  template < typename T = double >
74  class Data {
75  // Associations
76  // Attributes
77  private :
// Name of the dataset; load() assigns the result of discover_dataset_name().
78  std::string dataset_name{""};
// The samples themselves, held through shared pointers.
80  std::vector<SamplePointer<T> > m_points = {};
// Integer feature names; the loaders fill it with 1..dim (load_data may
// overwrite entries with the names read from the file).
82  std::vector<int> fnames = {};
// Sample index; the loaders fill it with 0..size-1.
84  std::vector<int> index = {};
// Class labels as text, appended by process_class().
86  std::vector<std::string> class_names = {};
// Numeric class values, appended by process_class().
88  std::vector<int> m_classes = {};
// Per-class sample counts, parallel to m_classes (see process_class()).
90  std::vector<size_t> class_distribution = {};
// Cached number of samples. Note that size() reads m_points.size() instead.
92  size_t m_size = 0;
// Cached number of features. Note that dim() reads the first point instead.
94  size_t m_dim = 0;
96  double time_mult = 0.0;
// Label spellings compared against parsed items in load_csv()/load_arff().
98  std::string pos_class = std::string("1"), neg_class = std::string("-1");
// Set to false by the loaders and by the (size, dim, val) constructor.
100  bool is_empty = true;
// When true, load_csv()/load_arff() read the label from the last column.
102  bool atEnd = false;
// Value reported by isNormalized().
104  bool normalized = false;
// Set to true by both load() overloads.
106  bool cdist_computed = false;
// Task kind; isClassification() accepts "Classification",
// "MultiClassification" and "BinClassification".
108  std::string type = "Classification";
109  public:
114  [[nodiscard]] const std::string &getType() const;
115 
116  private:
117  // Private Operations
// Converts a textual label to its numeric class and updates the class
// bookkeeping (class_names, m_classes, class_distribution).
123  int process_class(std::string& item);
// Derives the file format from the file name's extension.
129  Type identifyFileType (std::string file);
// Format-specific readers; each returns false when the file cannot be opened
// or the lines have inconsistent lengths.
135  bool load_csv (const std::string& path);
141  bool load_arff (const std::string& path);
147  bool load_data (const std::string& path);
153  bool load_txt (const std::string& path);
// Definition not visible here; load() stores its result in dataset_name.
159  std::string discover_dataset_name(const std::string& path);
160 
161  public :
162  void setType(const std::string &type);
163 
164  Data() = default;
165  Data(const Data<T>& other);
// Loads `dataset` through load(file, atEnd); prints a message on failure.
170  Data(const std::string &dataset, bool atEnd);
// Stores the label spellings and task type, then loads `dataset`.
177  explicit Data (const char* dataset, const char* datasetType, const char* pos_class = "1", const char* neg_class = "-1");
// Builds `size` points of `dim` features, all equal to `val`, with ids
// 1..size and label 0.
184  Data (size_t size, size_t dim, T val = T());
// Loads `dataset` through load(file); prints a message on failure.
190  explicit Data (const char* dataset);
191 
192  /*********************************************
193  * Getters *
194  *********************************************/
198  [[nodiscard]] bool isClassification() const { return (type == "Classification" || type == "MultiClassification" || type == "BinClassification");}
203  [[nodiscard]] std::string name() const{ return dataset_name; }
// Number of stored points (taken from the vector, not from m_size).
208  [[nodiscard]] size_t size() const{ return m_points.size();};
// Size of the first point, or 0 when there are no points.
213  [[nodiscard]] size_t dim() const{ return (!m_points.empty()) ? m_points.front()->size() : 0; }
218  std::vector<std::shared_ptr<Point< T > > > points ();
223  std::vector<std::shared_ptr<Point< T > > > points () const;
228  [[nodiscard]] const std::vector<int> classes() const;
234  PointPointer<T> point (int index) const;
240  Point< T > getFeature(int index) const;
241  [[nodiscard]] Point< double > getLabels() const;
242  Point< double > labels() const;
247  [[nodiscard]] std::vector<size_t> classesDistribution() const ;
252  [[nodiscard]] std::vector<std::string> classesNames() const;
257  [[nodiscard]] std::vector<int> getFeaturesNames() const;
258  mltk::Point<int> featuresNames() const;
263  [[nodiscard]] std::vector<int> getIndex() const;
268  [[nodiscard]] double getTime_mult() const;
269 
270  /*********************************************
271  * Setters *
272  *********************************************/
277  void setName(const std::string& name){ this->dataset_name = name; }
282  inline void setClassesNames(const std::vector<std::string> class_names){ this->class_names = class_names; }
287  void setClasses(const std::vector<int> &classes);
292  void setClassesAtEnd(bool atEnd){ this->atEnd = atEnd; }
298  void setPoint (int index, std::shared_ptr<Point< T > > p);
303  void setFeaturesNames(const std::vector<int>& fnames);
308  void setIndex(std::vector<int> index);
313  void setDim(size_t dim);
314 
315  /*********************************************
316  * Other operations *
317  *********************************************/
318 
// Shuffles the points with std::mt19937; a seed of 0 draws the seed from
// std::random_device instead.
327  void shuffle(const size_t& seed = 42);
// Picks the reader from the file extension. This overload reads the label
// from the first column, except for ARFF where it is read from the last.
333  bool load (const std::string& file);
// Same as above, with the label position chosen by `_atEnd`.
339  bool load (const std::string& file, bool _atEnd);
// Writes to fname + "." + ext; only "plt", "data" and "csv" produce output.
345  void write(const std::string& fname, std::string ext);
350  [[nodiscard]] bool isEmpty () const;
355  [[nodiscard]] bool isNormalized() const{ return normalized; };
359  void clear();
364  void classesCopy (const Data< T > &_data, std::vector<int> &classes);
374  void copy (const Data< T > &_data);
379  void copyZero (const Data< T >& other);
384  std::vector< Data< T > > splitByClasses(bool keepIndex = false);
392  std::vector< Data< T > > splitSample(const std::size_t &split_size, bool stratified = true, bool keepIndex = false, size_t seed = 0);
393  //std::vector< Data< T > > splitSample1(const std::size_t &split_size, bool stratified = true, bool keepIndex = false, size_t seed = 0);
394 
401  Data< T > selectFeatures(std::vector<size_t> feats, int size=-1);
409  Data< T > sampling(const size_t& samp_size, bool with_replacement = true, const int &seed=0);
414  void apply(std::function<void(mltk::PointPointer<T> point)> f);
420  void join(const Data< T >& data);
// Inserts samples.point(_index); returns false when _index is out of bounds.
427  bool insertPoint (const Data< T >& samples, int _index, bool keepIndex = false);
433  bool insertPoint (std::shared_ptr<Point< T > > p, bool keepIndex = false);
439  bool insertPoint (Point< T > p, bool keepIndex = false);
// Removes every point whose id is listed. The returned flags follow the
// sorted order of `ids`; true means that id was not found.
445  std::vector<bool> removePoints (std::vector<int> ids);
// Removes the point with id `pid`; refuses when only one point is left.
451  bool removePoint (int pid);
// Returns a new Data holding only the listed features of every point.
457  Data< T > insertFeatures(std::vector<int> ins_feat, bool keepIndex = false);
// Works on a copy: removes the first `fsize` entries of `feats` (all of them
// when fsize is -1) and returns the copy.
463  Data<T> removeFeatures (std::vector<int> feats, int fsize) const;
// Removes the listed features in place from every point and from fnames.
469  bool removeFeatures (std::vector<int> feats);
476  bool updatePointValue(const size_t &idx, double value);
482  void changeXVector(std::vector<int> index);
487  void normalize(double p = 2);
491  void resetIndex();
492 
493  /*********************************************
494  * Overloaded operators for the Data class. *
495  *********************************************/
496 
// Iteration over the stored sample pointers.
497  SampleIterator<T> begin() { return m_points.begin(); }
498 
499  SampleIterator<T> end() { return m_points.end(); }
500 
501  ConstSampleIterator<T> begin() const { return m_points.cbegin(); }
502 
503  ConstSampleIterator<T> end() const { return m_points.cend(); }
504 
// operator[] yields the shared pointer at position i (no bounds check).
505  std::shared_ptr<Point< T > > operator[](size_t i) const {return m_points[i];}
506 
507  std::shared_ptr<Point< T > > & operator[](size_t i) {return m_points[i];}
508 
// operator() yields the pointed-to Point at position i: a copy in the const
// overload, a reference in the non-const one (no bounds check).
509  Point< T > operator()(size_t i) const {return *(m_points[i]);}
510  Point< T >& operator()(size_t i) {return *m_points[i];}
511 
512  Data< T >& operator=(const Data< T >&);
513 
514  bool operator==(const Data< T > &rhs) const;
515 
516  bool operator!=(const Data< T > &rhs) const;
517 
// Streams every point, one per line.
518  template< typename U >
519  friend std::ostream &operator<<( std::ostream &output, const Data< U > &data );
520 
521  ~Data();
522  };
523 
531  template < typename T, typename... Types >
532  DataPointer< T > make_data(Types... args) {
533  return std::make_shared< Data < T > >(args...);
534  }
535 
536  template < typename T >
537  std::ostream &operator<<( std::ostream &output, const Data< T > &data ){
538  for(auto p : data.m_points){
539  output << *p << std::endl;
540  }
541 
542  return output;
543  }
544 
545  template<typename T>
546  mltk::Data< T >::Data(const char* dataset){
547  if(!load(std::string(dataset))){
548  std::cerr << "Couldn't read the dataset." << std::endl;
549  }
550  }
551 
552  template<typename T>
553  Data<T>::Data(const std::string &dataset, bool atEnd): atEnd(atEnd) {
554  if(!load(std::string(dataset), atEnd)){
555  std::cerr << "Couldn't read the dataset." << std::endl;
556  }
557  }
558 
559  template<typename T>
560  mltk::Data<T>::Data(size_t size, size_t dim, T val) {
561  size_t i;
562 
563  this->m_points.resize(size);
564  this->fnames.resize(dim);
565  this->index.resize(size);
566 
567  for(i = 0; i < size; i++){
568  this->m_points[i] = std::make_shared<Point< T > >(dim, val);
569  this->m_points[i]->Id() = i+1;
570  this->m_points[i]->Y() = 0;
571  }
572 
573  std::iota(fnames.begin(), fnames.end(), 1);
574  std::iota(index.begin(), index.end(), 0);
575  this->m_size = size;
576  this->m_dim = dim;
577  if(size > 0) this->is_empty = false;
578  }
579 
580  template<typename T>
581  mltk::Data< T >::Data(const char* dataset, const char* datasetType, const char* pos_class, const char* neg_class){
582  this->pos_class = std::string(pos_class);
583  this->neg_class = std::string(neg_class);
584  this->type = std::string(datasetType);
585 
586  if(!load(std::string(dataset))) {
587  std::cerr << "Couldn't read the dataset." << std::endl;
588  }
589  }
590 
591  template<typename T>
592  Type mltk::Data< T >::identifyFileType(std::string file){
593  size_t l , k, i, j, tsize = types.size();
594 
595  for(k = file.size() - 1; (k >= 0 && file[k] != '.') ; k--){
596  for(i = 0; i < tsize; ++i){
597  if(file[k] == types[i][types[i].size()-1]){
598  for(j = types[i].size() - 1, l = 0; (j >= 0 && file[k-l] != '.') ; j--, l++){
599  if(j == 0 && types[i][j] == file[k-l]){
600  switch(i){
601  case 0:
602  return Type::TYPE_DATA;
603  break;
604  case 1:
605  return Type::TYPE_CSV;
606  break;
607  case 2:
608  return Type::TYPE_ARFF;
609  break;
610  case 3:
611  return Type::TYPE_TXT;
612  break;
613  default:
614  return Type::TYPE_INVALID;
615  break;
616  }
617  }else if(types[i][j] != file[k-l]){
618  break;
619  }
620  }
621  }
622  }
623  }
624 
625  return Type::TYPE_INVALID;
626  }
627 
628  template<typename T>
629  bool mltk::Data< T >::load(const std::string& file){
630  Type t = identifyFileType(file);
631  this->dataset_name = discover_dataset_name(file);
632  this->cdist_computed = true;
633 
634  this->atEnd = false;
635 
636  switch (t) {
637  case TYPE_ARFF:
638  this->atEnd = true;
639  return load_arff(file);
640  case TYPE_CSV:
641  return load_csv(file);
642  case TYPE_DATA:
643  return load_data(file);
644  case TYPE_TXT:
645  return load_txt(file);
646  default:
647  std::cerr << "Invalid file type." << std::endl;
648  return false;
649  }
650  fnames.assign(this->dim(), 0);
651  std::iota(fnames.begin(), fnames.end(), 1);
652  return true;
653  }
654 
655  template<typename T>
656  bool mltk::Data< T >::load(const std::string& file, bool _atEnd){
657  Type t = identifyFileType(file);
658  this->dataset_name = discover_dataset_name(file);
659  this->cdist_computed = true;
660 
661  this->atEnd = _atEnd;
662 
663  switch (t) {
664  case TYPE_ARFF:
665  return load_arff(file);
666  case TYPE_CSV:
667  return load_csv(file);
668  case TYPE_DATA:
669  return load_data(file);
670  case TYPE_TXT:
671  return load_txt(file);
672  default:
673  std::cerr << "Invalid file type." << std::endl;
674  return false;
675  }
676  fnames.assign(this->dim(), 0);
677  std::iota(fnames.begin(), fnames.end(), 1);
678  return true;
679  }
680 
/**
 * Reads a delimiter-separated file in two passes.
 *
 * Pass 1 counts the lines and the items per line and rejects files whose
 * lines differ in length. Pass 2 rewinds the file and builds one Point per
 * line. The label is the last item when `atEnd` is set, otherwise the first.
 *
 * @param path file to read.
 * @return false when the file cannot be opened or the line lengths differ,
 *         true otherwise.
 */
681  template<typename T>
682  bool mltk::Data< T >::load_csv(const std::string& path){
683  std::ifstream input(path.c_str());
684  std::stringstream ss;
685  std::string str, item;
686  int _dim, ldim, _size, ssize;
687  char deli = ',';
688  bool cond, flag;
689 
690  flag = false;
691  _dim = ldim = _size = 0;
692 
693  if(!input){
694  std::cout << "File could not be opened!" << std::endl;
695  return false;
696  }
697 
698  //Verify if the class is at the begining or at the end and error check
699  while(std::getline(input, str)){
700  str = mltk::utils::trim_copy(str);
701  ssize = str.size();
// Starts at -1 so that after the item loop _dim equals (items - 1), i.e. the
// number of feature columns when one column holds the label.
702  _dim = -1;
703  ss.str(str);
704 
705  //Define csv file delimitator
// Only the first line is inspected: whichever of ',' or ';' appears first
// becomes the delimiter (',' stays the default when neither is present).
706  if(_size == 0){
707  for(int i = 0; i < ssize; ++i){
708  if(str[i] == ','){
709  deli = ',';
710  break;
711  }else if(str[i] == ';'){
712  deli = ';';
713  break;
714  }
715  }
716  }
717 
718  while(getline(ss, item, deli)){
719  //check for invalid feature or class
// NOTE(review): `flag` is only written and tested inside this loop; nothing
// after the loop reads it.
720  if(this->isClassification()) {
721  if (_dim == -1 && !flag) {
722  if (!((item == pos_class) || (item == neg_class))) {
723  flag = true;
724  }
725  } else if (ss.eof() && !flag) {
726  if (!((item == pos_class) || (item == neg_class))) {
727  flag = true;
728  }
729  }
730  }
731  _dim++;
732  }
733 
// The comparison is skipped while the previous count is 0.
734  if(ldim != _dim && ldim != 0){
735  std::cerr << "All the samples must have the same dimension!" << std::endl;
736  return false;
737  }
738 
739  ldim = _dim;
740  _size++;
// Resets the eof/fail bits left by the item loop so the stream can be reused.
741  ss.clear();
742  }
743 
// Rewind for the second pass.
744  input.clear();
745  input.seekg(0, std::ios::beg);
746 
747  //initialize _dim and _size
// NOTE(review): _dim is -1 here if the last line read was empty; the
// assignments and fnames.assign below would then receive a negative count.
748  this->m_dim = _dim;
749  this->m_size = _size;
750 
751  //reserve memory for fnames array and set feature names
// Feature names become 1..dim and the index 0..size-1.
752  fnames.assign(_dim, 0);
753  index.assign(_size, 0);
754  std::iota(fnames.begin(), fnames.end(), 1);
755  std::iota(index.begin(), index.end(), 0);
756 
757  //reserve memory for points array
758  m_points.resize(_size);
759  _size = 0;
760  //Read sample (line) from file
761  while(std::getline(input, str)){
762  str = mltk::utils::trim_copy(str);
763  auto new_point = std::make_shared<Point< T > >();
764 
765  ss.str(str);
766  _dim = -1;
767 
768  //reserve memory for x array
769  new_point->X().resize(this->m_dim, 0.0);
770 
771  //Read features from line
772  while(std::getline(ss, item, deli)){
// cond is true for feature columns: every item but the last when the label is
// at the end, every item but the first otherwise.
773  if(atEnd)
774  cond = !ss.eof();
775  else
776  cond = _dim != -1;
777 
778  if(cond){
// Items that are not numbers are skipped, leaving the feature at 0.
779  if(utils::is_number(item))
780  new_point->X()[(!atEnd) ? _dim : _dim + 1] = utils::atod(item.c_str());
781  }else{
782  double c;
783 
// Classification labels go through process_class (which also updates the
// class bookkeeping); otherwise the label is parsed as a number.
784  if(this->isClassification()) {
785  c = process_class(item);
786  }else{
787  c = utils::atod(item.c_str());
788  }
789  new_point->Y() = c;
790  }
791  _dim++;
792  }
793 
// Ids are the 1-based line numbers.
794  m_points[_size++] = std::move(new_point);
795  m_points[_size - 1]->Id() = _size;
796  ss.clear();
797  }
798 
799  is_empty = false;
800 
801  return true;
802  }
803 
804  template<typename T>
805  bool mltk::Data< T >::load_data(const std::string& path){
806  std::ifstream input(path.c_str());
807  std::string str, item, buffer;
808  std::stringstream ss, ss1;
809  int _dim, ldim, _size;
810  double c;
811  bool is_feature, type_detect = false;
812 
813  if(!input){
814  std::cout << "File could not be opened!" << std::endl;
815  return false;
816  }
817  _dim = ldim = _size = c = 0;
818  //get dimension of the points and do error check
819  while(std::getline(input, str)){
820  str = mltk::utils::trim_copy(str);
821  _dim = -1;
822 
823  ss.str(str);
824  ss.clear();
825 
826  while(std::getline(ss, item, ' ')){
827  if(item == "") continue;
828  item = mltk::utils::trim_copy(item);
829  const char * pch = std::strchr(item.c_str(), ':');
830  _dim++;
831  /*if(_size > 0 && _dim < ldim && pch == nullptr){
832  std::cerr << "Error (line: " << _size << ", _dim: " << _dim << "): file isn't in the .data format." << std::endl;
833  return false;
834  }*/
835 
836  ss1.str(item);
837  ss1.clear();
838  while(_size > 0 && std::getline(ss1, item, ':')) {
839  if(_dim == ldim && !type_detect){
840  if(strchr(item.c_str(), '.')){
841  this->type = "Regression";
842  }else{
843  this->type = "Classification";
844  }
845  type_detect = true;
846  }else if(_dim < ldim && !utils::is_number(item)){
847  std::clog << "Warning (line: " << _size << "): feature " << _dim << " is not a number." << std::endl;
848  }
849  }
850  }
851  if(ldim != _dim && ldim != 0){
852  std::cerr << "Error (line: " << _size << "): all the samples must have the same dimension! (_dim: " << _dim << ", last_dim: " << ldim << ")" << std::endl;
853  return false;
854  }
855 
856  ldim = _dim;
857  _size++;
858  }
859 
860  input.clear();
861  input.seekg(0, std::ios::beg);
862 
863  //initialize _dim and _size
864  this->m_dim = _dim;
865  this->m_size = _size;
866 
867  //reserve memory for fnames array and set feature names
868  fnames.assign(_dim, 0);
869  index.assign(_size, 0);
870  std::iota(fnames.begin(), fnames.end(), 1);
871  std::iota(index.begin(), index.end(), 0);
872 
873  //reserve memory for points array
874  m_points.resize(_size);
875  _size = 0;
876 
877  //get lines from file
878  while(std::getline(input, str)){
879  str = mltk::utils::trim_copy(str);
880  auto new_point = std::make_shared<Point< T > >();
881 
882  ss.str(str);
883  ss.clear();
884  new_point->X().resize(this->m_dim, 0.0);
885 
886  //Read features from line
887  _dim = 0;
888  while(std::getline(ss, item, ' ')){
889  item = mltk::utils::trim_copy(item);
890  const char * pch = std::strchr(item.c_str(), ':');
891  if(!pch){
892  if(this->isClassification()) {
893  c = process_class(item);
894  }else{
895  c = utils::atod(item.c_str());
896  }
897  new_point->Y() = c;
898  }else{
899  //Verify if the class is at the beggining or at the end
900  is_feature = false; //Verify if it's including value or fname
901  ss1.str(item);
902  ss1.clear();
903  //Get feature name and value
904  while(std::getline(ss1, item, ':')){
905  item.erase( std::remove(item.begin(), item.end(), '\r'), item.end() );
906  if(!is_feature){
907  fnames[_dim] = utils::stoin(item);
908  is_feature = true;
909  }else{
910  if(utils::is_number(item)){
911  new_point->X()[_dim++] = utils::atod(item.c_str());
912  }
913  is_feature = false;
914  }
915  }
916  }
917  }
918  m_points[_size++] = std::move(new_point);
919  m_points[_size - 1]->Id() = _size;
920  }
921 
922  if(m_classes.size() == 2){
923  type = "BinClassification";
924  }else{
925  type = "MultiClassification";
926  }
927 
928  input.close();
929  is_empty = false;
930  return true;
931  }
932 
933  template<typename T>
934  bool mltk::Data< T >::load_arff(const std::string& path){
935  std::ifstream input(path.c_str());
936  std::istringstream ss;
937  std::string str, item;
938  int _dim, ldim, _size, c;
939  bool atBegin, flag, cond;
940 
941  if(!input){
942  std::cout << "File could not be opened! \nError: " << std::strerror(errno) << std::endl;
943  return false;
944  }
945 
946  _dim = ldim = _size = c = 0;
947  atBegin = flag = cond = false;
948 
949  //Verify if the class is at the begining or at the end and error check
950  while(std::getline(input, str)){
951  str = mltk::utils::trim_copy(str);
952  _dim = 0;
953  ss.str(str);
954 
955  while(std::getline(ss, item, ',')){
956  //check for invalid feature or class
957  auto found_class = std::find(this->class_names.begin(), this->class_names.end(), item);
958 
959  if(!utils::is_number(item) && (found_class != this->class_names.end())){
960  std::clog << "Warning: point[" << _size << "] " << _dim + 1 << " feature is not a number. (" << item << ")" << std::endl;
961  _dim--;
962  }
963  if(this->isClassification()) {
964  if (_dim == 0 && !flag) {
965  if (!((item == pos_class) || (item == neg_class))) {
966  //atEnd = true;
967  flag = true;
968  }
969  } else if (ss.eof() && !flag) {
970  if (!((item == pos_class) || (item == neg_class))) {
971  atBegin = true;
972  flag = true;
973  }
974  }
975  }
976  _dim++;
977  }
978 
979  if(ldim != _dim && ldim != 0){
980  std::cerr << "All the samples must have the same dimension!" << std::endl;
981  return false;
982  }
983 
984  ldim = _dim;
985  _size++;
986  ss.clear();
987  }
988  input.clear();
989  input.seekg(0, std::ios::beg);
990 
991  //initialize _dim and _size
992  _dim--;
993  this->m_dim = _dim;
994  this->m_size = _size;
995 
996  //reserve memory for fnames array and set feature names
997  fnames.assign(_dim, 0);
998  index.assign(_size, 0);
999  std::iota(fnames.begin(), fnames.end(), 1);
1000  std::iota(index.begin(), index.end(), 0);
1001 
1002  //reserve memory for points array
1003  m_points.resize(_size);
1004 
1005  _size = 0;
1006 
1007  //Read line (sample) from file
1008  while(std::getline(input, str)){
1009  str = mltk::utils::trim_copy(str);
1010  auto new_point = std::make_shared<Point< T > >();
1011  _dim = -1;
1012  ss.str(str);
1013 
1014  //reserve memory for features
1015  new_point->X().assign(this->m_dim, 0.0);
1016 
1017  //Read features from line
1018  while(std::getline(ss, item, ',')){
1019  if(atEnd)
1020  cond = (!ss.eof() && atEnd);
1021  else
1022  cond = _dim != 0;
1023 
1024  if(cond){
1025  if(utils::is_number(item)){
1026  new_point->X()[_dim + 1] = utils::atod(item.c_str());
1027  }
1028  }else{
1029  if(this->isClassification()){
1030  c = process_class(item);
1031  }
1032  new_point->Y() = c;
1033  }
1034  _dim++;
1035  }
1036 
1037  m_points[_size++] = std::move(new_point);
1038  m_points[_size - 1]->Id() = _size;
1039  ss.clear();
1040  }
1041 
1042  is_empty = false;
1043 
1044  return true;
1045  }
1046 
/**
 * Reads a space-separated text file in two passes. On each line the first two
 * items are not features; every following item is one feature.
 *
 * Pass 1 counts lines and features and rejects files whose lines differ in
 * feature count. Pass 2 rewinds and builds one Point per line with label 0.
 *
 * @param path file to read.
 * @return false when the file cannot be opened or the feature counts differ,
 *         true otherwise.
 */
1047  template<typename T>
1048  bool mltk::Data< T >::load_txt(const std::string& path){
1049  std::ifstream input(path.c_str());
1050  std::istringstream ss;
1051  std::string str, item;
1052  int n, _dim = 0, n1, d, _size;
1053 
1054  if(!input){
1055  std::cout << "File could not be opened!" << std::endl;
1056  return false;
1057  }
1058 
1059  n1 = d = _size = 0;
1060 
1061  //error check
1062  while(std::getline(input, str)){
1063  str = mltk::utils::trim_copy(str);
1064  ss.str(str);
1065  n1 = 0;
1066  _dim = 0;
1067 
// Items at positions 0 and 1 are skipped; the rest count as features.
1068  while(std::getline(ss, item, ' ')){
1069  if(n1 >= 2){
1070  _dim++;
1071  }
1072  n1++;
1073  }
1074 
// NOTE(review): the first message prints this->m_dim, which this function has
// not assigned yet at this point; the expected count is held in `d`.
1075  if(_size != 0 && _dim != d ){
1076  std::cerr << _dim << " " << this->m_dim << std::endl;
1077  std::cerr << "All the samples must have the same dimension!" << std::endl;
1078  return false;
1079  }
1080 
1081  d = _dim;
1082  _size++;
// Resets the eof/fail bits left by the item loop so the stream can be reused.
1083  ss.clear();
1084  }
1085 
1086  //Initialize _size and _dim
1087  this->m_size = _size;
1088  this->m_dim = _dim;
1089 
1090  //Set features names
// Feature names become 1..dim and the index 0..size-1.
1091  fnames.assign(_dim, 0);
1092  index.assign(_size, 0);
1093  std::iota(fnames.begin(), fnames.end(), 1);
1094  std::iota(index.begin(), index.end(), 0);
1095 
// Rewind for the second pass.
1096  input.clear();
1097  input.seekg(0, std::ios::beg);
1098 
1099  //Allocate memory for points array
1100  m_points.resize(_size);
1101 
1102  _size = 0;
1103 
1104  //get line from file (sample)
1105  while(std::getline(input, str)){
1106  str = mltk::utils::trim_copy(str);
1107  auto new_point = std::make_shared<Point< T > >();
1108 
1109  //Allocate memory for features
1110  new_point->X().resize(_dim, 0.0);
1111  ss.str(str);
1112  n = 0;
1113 
1114  //read features from line
1115  while(std::getline(ss, item, ' ')){
1116  if(n >= 2){
// Non-numeric items only produce a warning; the feature stays at 0.
1117  if(utils::is_number(item))
1118  new_point->X()[n - 2] = utils::atod(item.c_str());
1119  else{ std::clog << "Warning: point[" << _size << "] " << n - 2 << " feature is not a number." << std::endl; }
1120  new_point->Y() = 0;
1121  }else if(n == 1){
// The second item is parsed into the id here, but the id is replaced by the
// 1-based line number after the loop.
1122  new_point->Id() = utils::atod(item.c_str());
1123  }
1124  n++;
1125  }
1126 
1127  m_points[_size++] = std::move(new_point);
1128  m_points[_size - 1]->Id() = _size;
1129  ss.clear();
1130  }
1131 
1132  is_empty = false;
1133 
1134  return true;
1135  }
1136 
1137  template<typename T>
1138  int mltk::Data<T>::process_class(std::string& item) {
1139  int c = 0;
1140  auto class_name_it = std::find(this->class_names.begin(), this->class_names.end(), item);
1141 
1142  item.erase(std::remove_if(item.begin(), item.end(), ::isspace), item.end());
1143  if(item.empty()) return std::numeric_limits<int>::infinity();
1144 
1145  if(class_name_it == class_names.end()){
1146  this->class_names.push_back(item);
1147  auto has_mone = std::find(class_names.begin(), class_names.end(), "-1");
1148  if(has_mone == class_names.end()){
1149  if(utils::is_number(item)) {
1150  c = std::stoi(item);
1151  }else{
1152  auto pos = std::find(class_names.begin(), class_names.end(), item);
1153  c = (int)(pos - this->class_names.begin())+1;
1154  }
1155  }else{
1156  c = 1;
1157  }
1158  if(item == "-1"){
1159  c = -1;
1160  }
1161 
1162  }else{
1163  if(utils::is_number(item)) {
1164  c = std::stoi(item);
1165  }else{
1166  c = (int)(class_name_it - this->class_names.begin())+1;
1167  }
1168  }
1169  auto class_it = std::find(this->m_classes.begin(), this->m_classes.end(), c);
1170  if(class_it == this->m_classes.end()){
1171  this->m_classes.push_back(c);
1172  this->class_distribution.push_back(0);
1173  }
1174  auto it_pos = std::find_if(this->m_classes.begin(), this->m_classes.end(), [&c](const int &_class){
1175  return (_class == c);
1176  });
1177  this->class_distribution[(it_pos-this->m_classes.begin())]++;
1178 
1179  return c;
1180  }
1181 
1182  template<typename T>
1184  int i;
1185 
1186  if(m_size == 1){ std::cout << "Error: RemovePoint, only one point left\n"; return false; }
1187  //Ids bound verification
1188  if(pid > m_points[m_size - 1]->Id() || pid <= 0) return false;
1189  int y;
1190  if(!index.empty()){
1191  index.resize(m_size);
1192 
1193  for(i = 0; i < m_size; ++i){
1194  if(i >= pid){
1195  index[i-1] = index[i] - 1;
1196  }
1197  }
1198  }
1199 
1200  //Find the point by its id and erase it
1201  for(i = 0; i < m_size; i++){
1202  if(m_points[i]->Id() == pid){
1203  y = m_points[i]->Y();
1204  m_points.erase(m_points.begin() + i);
1205  break;
1206  }
1207  }
1208  m_size--;
1209  auto pos = std::find(m_classes.begin(), m_classes.end(), y) - m_classes.begin();
1210  class_distribution[pos]--;
1211  return true;
1212  }
1213 
1214  template<typename T>
1215  void mltk::Data< T >::write(const std::string& fname, std::string ext){
1216  int i, j;
1217  std::string path = fname + "." + ext;
1218  std::ofstream outstream(path.c_str(), std::ios::out);
1219  m_dim =this->dim();
1220 
1221  if(!outstream.is_open()){
1222  std::cerr << "Can't write in file." << std::endl;
1223  return;
1224  }
1225 
1226  for(i = 0; i < m_size; i++){
1227  if(ext == "plt"){
1228  outstream << m_points[i]->Y() << " ";
1229  for(j = 0; j < m_dim-1; j++){
1230  outstream << m_points[i]->X()[j] << " ";
1231  }
1232  outstream << m_points[i]->X()[j] << std::endl;
1233  }else if(ext == "data"){
1234  outstream << m_points[i]->Y() << " ";
1235  for(j = 0; j < m_dim-1; j++){
1236  outstream << fnames[j] << ":" << m_points[i]->X()[j] << " ";
1237  }
1238  outstream << fnames[j] << ":" << m_points[i]->X()[j] << "\n";
1239  }else if(ext == "csv"){
1240  outstream << m_points[i]->Y() << ",";
1241  for(j = 0; j < m_dim-1; j++){
1242  outstream << m_points[i]->X()[j] << ",";
1243  }
1244  outstream << m_points[i]->X()[j] << "\n";
1245  }
1246  }
1247 
1248  outstream.close();
1249  }
1250 
1251  template<typename T>
1252  std::vector<bool> mltk::Data< T >::removePoints(std::vector<int> ids){
1253  int idsize = ids.size(), i, j;
1254  bool save;
1255  std::shared_ptr<Point< T > > po;
1256  auto p = m_points.begin();
1257  std::vector<bool> notFound(idsize, true);
1258 
1259  std::sort(ids.begin(), ids.end());
1260 
1261  for(; p != m_points.end();){
1262  save = true;
1263  po = (*p);
1264  for(i = 0; i < idsize; i++){
1265  if(po->Id() == ids[i]){
1266  save = false;
1267  notFound[i] = false;
1268  break;
1269  }
1270  }
1271 
1272  if(save) p++;
1273  else{
1274  p = m_points.erase(p);
1275  int c = po->Y();
1276  if(!index.empty()){
1277  index.resize(m_size);
1278 
1279  for(j = 0; j < m_size; ++j){
1280  if(j >= ids[i]){
1281  index[j-1] = index[j] - 1;
1282  }
1283  }
1284  }
1285  //Size verification.
1286  if(m_size == 1){ std::clog << "Error: RemovePoint, only one point left." << std::endl; break;}
1287  auto class_pos = std::find(m_classes.begin(), m_classes.end(), c) - m_classes.begin();
1288  class_distribution[class_pos]--;
1289  m_size--;
1290  }
1291  }
1292 
1293  return notFound;
1294  }
1295 
1296  template<typename T>
1297  mltk::Data< T > mltk::Data< T >::insertFeatures(std::vector<int> ins_feat, bool keepIndex){
1298  size_t i, j, s, offset = 0, fsize = ins_feat.size();
1299  bool saveflag = false;
1300  std::vector<int> new_fnames(fsize, 0);
1301  std::shared_ptr<Point< T > > p;
1302  auto *smout = new mltk::Data< T >;
1303 
1304  if(fsize == 0) return this->copy();
1305  sort(ins_feat.begin(), ins_feat.end());
1306 
1307  //error check
1308  if(fsize > m_dim){ std::cerr << "Error: InsertFeature, fsize(" << ins_feat.size() << ")>dim(" << m_dim << ")\n"; return *smout; }
1309  smout->setDim(fsize);
1310 
1311  //Copying information to new data array
1312  for(i = 0; i < m_size; i++){
1313  p = std::make_shared<Point< T > >();
1314  p->X().resize(fsize);
1315  p->Alpha() = m_points[i]->Alpha();
1316  p->Id() = m_points[i]->Id();
1317  p->Y() = m_points[i]->Y();
1318 
1319  //Copying features
1320  s = 0, offset = 0;
1321  for(j = 0; j < m_dim; j++){
1322  if(offset < fsize && fnames[j] == ins_feat[offset]){
1323  saveflag = true;
1324  offset++;
1325  }
1326 
1327  if(saveflag){
1328  p->X()[s] = m_points[i]->X()[j];
1329  new_fnames[s] = fnames[j];
1330  s++;
1331  saveflag = false;
1332  }
1333  }
1334  //error check
1335  if(s != fsize){
1336  std::cerr << "Error: Something went wrong on InsertFeature\n";
1337  std::cerr << "s = " << s << ", dim = " << m_dim << ", fsize = " << fsize << std::endl;
1338  smout->clear();
1339  return *smout;
1340  }
1341  smout->insertPoint(p, keepIndex);
1342  }
1343  smout->setFeaturesNames(new_fnames);
1344 
1345  return *smout;
1346  }
1347 
1348  template<typename T>
1349  void mltk::Data< T >::shuffle(const size_t &seed){
1350  std::mt19937 gen((seed == 0) ? std::random_device{}() : seed);
1351  std::shuffle(m_points.begin(), m_points.end(), gen);
1352 
1353  for(size_t i = 0; i < m_size; i++){
1354  index[i] = i;
1355  }
1356  }
1357 
1358  template < typename T >
1359  Data<T> Data<T>::removeFeatures(std::vector<int> feats, int fsize) const {
1360  auto data_copy = this->copy();
1361  size_t feats_size = (fsize==-1)?feats.size():fsize;
1362  std::vector<int> to_remove(feats.begin(), feats.begin()+feats_size);
1363  data_copy.removeFeatures(to_remove);
1364  return data_copy;
1365  }
1366 
1367  template<typename T>
1368  bool mltk::Data< T >::removeFeatures(std::vector<int> feats){
1369  size_t i, j, k, psize = m_points.size(), rsize = feats.size();
1370  typename std::vector< T >::iterator itr;
1371  std::vector<int>::iterator fitr;
1372  std::vector<bool> exist(rsize, true);
1373 
1374  if(feats.empty()) return true;
1375 
1376  if(fnames.size() == 1){
1377  std::cerr << "Error: RemoveFeature, only one feature left.\n";
1378  return false;
1379  }
1380  /*if(feats.size() >= fnames.size()){
1381  cerr << "Error: RemoveFeature, more or equal features to remove than exist.\n";
1382  return false;
1383  }*/
1384 
1385  //Sort feats for remove features easily
1386  sort(feats.begin(), feats.end());
1387 
1388  //Check the existence of the features to be removed
1389  for(i = 0; i < rsize; i++){
1390  for(j = 0; j < m_dim; j++){
1391  if((feats[i] > -1) && feats[i] == fnames[j]){
1392  break;
1393  }
1394  }
1395  if(j == m_dim){
1396  exist[i] = false;
1397  }
1398  }
1399 
1400  //Remove features from each point
1401  for(i = 0; i < psize; i++){
1402  if(m_points[i] == nullptr) std::clog << "WARNING: point is null." << std::endl;
1403 
1404  // Iterate through the point features
1405  for(itr = m_points[i]->X().begin(),k = 0, j = 0; itr != m_points[i]->X().end();){
1406  while(k < rsize && !exist[k]) k++; // go to next existent feature
1407  if(k == rsize) break; // Verify if is in the end of the feats vector
1408 
1409  // Feature to remove found, remove it from the point and go to the next feat to remove
1410  if(fnames[j] == feats[k]){
1411  itr = m_points[i]->X().erase(itr);
1412  k++;
1413  }else{
1414  itr++;
1415  }
1416  j++;
1417  }
1418  }
1419 
1420  //remove names of removed features
1421  for(k = 0; k < rsize; k++){
1422  for(fitr = fnames.begin(); fitr != fnames.end();){
1423  if((*fitr) == feats[k]){
1424  fitr = fnames.erase(fitr);
1425  m_dim--;
1426  break;
1427  }else{
1428  fitr++;
1429  }
1430  }
1431  }
1432 
1433  return true;
1434  }
1435 
1436  template<typename T>
1437  bool mltk::Data< T >::insertPoint(const Data< T >& samples, int _index, bool keepIndex){
1438  if(_index > samples.size() - 1){
1439  std::cerr << "Index out of bounds. (insertPoint)" << std::endl;
1440  return false;
1441  }
1442 
1443  insertPoint(samples.point(_index), keepIndex);
1444 
1445  return true;
1446  }
1447 
1448  template<typename T>
1449  bool mltk::Data< T >::insertPoint(std::shared_ptr<Point< T > > p, bool keepIndex){
1450  //Dimension verification
1451  if(size() > 0 && p && p->size() != this->dim()){
1452  std::cerr << "Point with dimension different from the data. (insertPoint)" << std::endl;
1453  std::cerr << "Point dim = " << p->size() << " dim = " << m_dim << std::endl;
1454  return false;
1455  }
1456 
1457  if(p->size() > 0) {
1458  this->fnames = std::vector<int>(p->size(), 0);
1459  std::iota(this->fnames.begin(), this->fnames.end(), 1);
1460  }
1461  //Insert the point p at the end of the points vector
1462 
1463  m_points.insert(m_points.end(), p);
1464  m_size++;
1465  if(is_empty) is_empty = false;
1466  if(m_dim == 0) m_dim = p->size();
1467 
1468  if(this->isClassification()){
1469  auto class_pos = std::find(this->m_classes.begin(), this->m_classes.end(), p->Y());
1470 
1471  if(class_pos == this->m_classes.end()){
1472  this->class_names.push_back(std::to_string(int(m_points[m_size - 1]->Y())));
1473  this->m_classes.push_back(m_points[m_size - 1]->Y());
1474  this->class_distribution.push_back(1);
1475  }else{
1476  this->class_distribution[int(class_pos - this->m_classes.begin())]++;
1477  }
1478  }
1479 
1480  //Give a new id to the point equal to the previous point id plus 1
1481  size_t last_pos = m_size - 1;
1482  if(!keepIndex){
1483  m_points[last_pos]->Id() = m_size;
1484  }
1485  index.push_back(m_points.size()-1);
1486  return true;
1487  }
1488 
1489  template<typename T>
1490  bool mltk::Data< T >::insertPoint(Point< T > p, bool keepIndex){
1491  return this->insertPoint(std::make_shared<Point< T > >(p), keepIndex);
1492  }
1493 
1494  template<typename T>
1495  void mltk::Data< T >::changeXVector(std::vector<int> _index){
1496  int i;
1497  std::vector<std::shared_ptr<Point< T > > > nPoints(m_size);
1498 
1499  //Copy features and classes of the points making the changes
1500  for(i = 0; i < m_size; i++){
1501  nPoints[i]->X() = m_points[_index[i]]->X();
1502  nPoints[i]->X() = m_points[_index[i]]->X();
1503  }
1504 
1505  //Save changes in the class
1506  m_points = nPoints;
1507  }
1508 
1509  template<typename T>
1510  PointPointer<T> mltk::Data< T >::point(int _index) const{
1511  return m_points[_index];
1512  }
1513 
1514  template<typename T>
1515  void mltk::Data< T >::setPoint(int _index, std::shared_ptr<Point< T > > p){
1516  m_points[_index] = p;
1517  }
1518 
1519  template<typename T>
1520  void mltk::Data< T >::classesCopy(const mltk::Data< T > &_data, std::vector<int> &classes){
1521  size_t _size = 0;
1522  std::set<int> _classes;
1523  for(auto c: classes){
1524  _classes.insert(c);
1525  }
1526  for(size_t i = 0; i < _data.size(); i++){
1527  if(_classes.find(_data[i]->Y()) != _classes.end()){
1528  this->m_points.push_back(std::make_shared<Point< T > >());
1529  size_t curr = this->m_points.size()-1;
1530  this->m_points[curr]->X() = _data[i]->X();
1531  this->m_points[curr]->Y() = _data[i]->Y();
1532  this->m_points[curr]->Alpha() = _data[i]->Alpha();
1533  this->m_points[curr]->Id() = _data[i]->Id();
1534  _size++;
1535  }
1536  }
1537 
1538  this->fnames = _data.getFeaturesNames();
1539  this->m_size = _size;
1540  this->m_classes = classes;
1541  this->m_dim = _data.dim();
1542  this->type = _data.getType();
1543  this->index = _data.getIndex();
1544  this->is_empty = _data.isEmpty();
1545  this->normalized = _data.isNormalized();
1546  this->time_mult = _data.getTime_mult();
1547  this->cdist_computed = false;
1548  }
1549 
1550  template<typename T>
1552  mltk::Data<T> cp;
1553  cp.copy(*this);
1554  return cp;
1555  }
1556 
1557  template<typename T>
1559  size_t _size = _data.size();
1560  if(this->m_points.size() > 0) this->m_points.clear();
1561  //this->m_points.reserve(_size);
1562  for(size_t i = 0; i < _size; i++){
1563  auto point = std::make_shared<Point< T > >();
1564  point->X().clear();
1565  point->X().resize(_data[i]->X().size());
1566  std::copy(_data[i]->X().begin(), _data[i]->X().end(), point->X().begin());
1567  point->Y() = _data[i]->Y();
1568  point->Alpha() = _data[i]->Alpha();
1569  point->Id() = _data[i]->Id();
1570  this->m_points.push_back(point);
1571  }
1572  this->fnames = _data.getFeaturesNames();
1573  this->m_size = _data.size();
1574  this->m_classes = _data.classes();
1575  this->class_names = _data.classesNames();
1576  this->class_distribution = _data.classesDistribution();
1577  this->m_dim = _data.dim();
1578  this->type = _data.getType();
1579  this->index = _data.getIndex();
1580  this->is_empty = _data.isEmpty();
1581  this->normalized = _data.isNormalized();
1582  this->time_mult = _data.getTime_mult();
1583  this->cdist_computed = _data.cdist_computed;
1584  this->dataset_name = _data.dataset_name;
1585  }
1586 
1587  template<typename T>
1589  fnames = other.fnames;
1590  m_dim = other.m_dim;
1591  m_size = 0;
1592  is_empty = other.is_empty;
1593  normalized = other.normalized;
1594  cdist_computed = false;
1595  }
1596 
1597  template<typename T>
1599  size_t i, j, dim1 = data.dim(), antsize = m_size, size1 = data.size();
1600  std::vector<int> index1 = data.getIndex(), antindex = index;
1601  auto points1 = data.points();
1602 
1603  if(m_dim > dim1){
1604  std::cerr << "Error: sample1 dimension must be less or equal to sample2\n";
1605  exit(1);
1606  }
1607 
1608  m_size += size1;
1609 
1610  m_points.resize(m_size);
1611 
1612  for(i = antsize, j = 0; i < m_size && j < size1; i++, j++){
1613  m_points[i] = mltk::make_point<T>();
1614  m_points[i]->X() = points1[j]->X();
1615  m_points[i]->Y() = points1[j]->Y();
1616  m_points[i]->Alpha() = points1[j]->Alpha();
1617  m_points[i]->Id() = points1[j]->Id();
1618  }
1619 
1620  this->resetIndex();
1621 
1622  std::vector<int> diff, classes1 = data.classes(), classes = m_classes;
1623  std::sort(classes.begin(), classes.end());
1624  std::sort(classes1.begin(), classes1.end());
1625  std::set_difference(classes1.begin(), classes1.end(), classes.begin(), classes.end(),
1626  std::inserter(diff, diff.begin()));
1627  m_classes.reserve(m_classes.size()+diff.size());
1628  m_classes.insert(m_classes.end(), diff.begin(), diff.end());
1629 
1630  std::vector<std::string> diff_names, classes_names1 = data.classesNames(), classes_names = class_names;
1631  std::sort(classes_names.begin(), classes_names.end());
1632  std::sort(classes_names1.begin(), classes_names1.end());
1633  std::set_difference(classes_names1.begin(), classes_names1.end(), classes_names.begin(), classes_names.end(),
1634  std::inserter(diff_names, diff_names.begin()));
1635  class_names.reserve(class_names.size()+diff_names.size());
1636  class_names.insert(class_names.end(), diff_names.begin(), diff_names.end());
1637 
1638  class_distribution.assign(m_classes.size(), 0);
1639  computeClassesDistribution();
1640  if(fnames.empty()){
1641  fnames = data.getFeaturesNames();
1642  }
1643  }
1644 
1645  template<typename T>
1647  int i = 0, j = 0;
1648  int old_dim = this->dim();
1649  double norm = 0.0;
1650 
1651  for(i = 0; i < m_size; ++i){
1652  for(norm = 0, j = 0; j < old_dim; ++j){
1653  norm += std::pow(fabs(m_points[i]->X()[j]),p);
1654  }
1655  m_points[i]->X().resize(old_dim+1);
1656  m_points[i]->X()[j] = 1;
1657  norm += std::pow(fabs(m_points[i]->X()[j]),p);
1658  norm = std::pow(norm, 1.0/p);
1659  for(j = 0; j < old_dim+1; ++j){
1660  m_points[i]->X()[j] /= norm;
1661  }
1662  }
1663  m_dim = old_dim + 1;
1664  fnames.push_back(this->dim());
1665 
1666  normalized = true;
1667  }
1668 
1669  template<typename T>
1670  void mltk::Data< T >::setDim(size_t _dim){
1671  this->m_dim = _dim;
1672  }
1673 
1674  template<typename T>
1675  std::vector<int> mltk::Data< T >::getFeaturesNames() const{
1676  return fnames;
1677  }
1678 
1679  template<typename T>
1680  void mltk::Data< T >::setFeaturesNames(const std::vector<int>& _fnames){
1681  this->fnames = _fnames;
1682  }
1683 
1684  template<typename T>
1685  std::vector<std::shared_ptr<Point< T > > > mltk::Data< T >::points(){
1686  return m_points;
1687  }
1688 
1689  template<typename T>
1690  std::vector<std::shared_ptr<Point< T > > > mltk::Data< T >::points() const {
1691  return m_points;
1692  }
1693 
1694  template<typename T>
1695  std::vector<int> mltk::Data< T >::getIndex() const{
1696  return index;
1697  }
1698 
1699  template<typename T>
1701  return is_empty;
1702  }
1703 
// NOTE(review): the line carrying this definition's signature (source line 1705)
// is missing from this listing. The body reads like Data<T>'s copy-assignment
// operator (it reads members of `data` and returns *this) - confirm against the header.
//
// Member-wise assignment from `data`. m_points is assigned as a vector of
// shared pointers, so - presumably, given `data` is another Data<T> - the Point
// objects end up shared with `data` rather than cloned (unlike copy(), which
// clones every point). cdist_computed is not assigned here.
1704  template<typename T>
1706  m_points = data.m_points;
1707  fnames = data.fnames;
1708  index = data.index;
1709  m_size = data.m_size;
1710  m_dim = data.m_dim;
1711  m_classes = data.m_classes;
1712  class_distribution = data.class_distribution;
1713  class_names = data.class_names;
1714  time_mult = data.time_mult;
1715  pos_class = data.pos_class;
1716  neg_class = data.neg_class;
1717  is_empty = data.is_empty;
1718  normalized = data.normalized;
1719  dataset_name = data.dataset_name;
1720  type = data.type;
1721 
1722  return *this;
1723  }
1724 
1725 
1726  template<typename T>
1728  m_points.clear();
1729  fnames.clear();
1730  index.clear();
1731  m_classes.clear();
1732  class_names.clear();
1733  m_size = 0;
1734  m_dim = 0;
1735  normalized = false;
1736  is_empty = true;
1737  cdist_computed = false;
1738  }
1739 
// NOTE(review): the line carrying this definition's signature (source line 1741)
// is missing from this listing, so the function's name and return type cannot be
// determined from here - confirm against the header.
//
// The body resets the bookkeeping members (names, index, classes, size,
// dimension, flags) but does not touch m_points or class_distribution.
1740  template<typename T>
1742  fnames.clear();
1743  index.clear();
1744  m_classes.clear();
1745  class_names.clear();
1746  m_size = 0;
1747  m_dim = 0;
1748  normalized = false;
1749  is_empty = true;
1750  cdist_computed = false;
1751  }
1752 
1753  template<typename T>
1754  void mltk::Data< T >::setIndex(std::vector<int> _index) {
1755  this->index = _index;
1756  }
1757 
1758  template<typename T>
1760  index.assign(m_points.size(), 0);
1761  iota(index.begin(), index.end(), 0);
1762  }
1763 
1764  template<typename T>
1766  return time_mult;
1767  }
1768 
1769  template<typename T>
1770  bool mltk::Data< T >::operator==(const mltk::Data< T > &rhs) const {
1771  if(m_points.size() != rhs.m_points.size()) return false;
1772 
1773  size_t i, _size = m_points.size();
1774 
1775  for(i = 0; i < _size; i++){
1776  if(*m_points[i] != *rhs.m_points[i]){
1777  return false;
1778  }
1779  }
1780 
1781  if(m_classes.size() != rhs.classes().size()) return false;
1782  for(i = 0; i < m_classes.size(); i++){
1783  if(m_classes[i] != rhs.classes()[i]){
1784  return false;
1785  }
1786  }
1787 
1788  for(i = 0; i < class_distribution.size(); i++){
1789  if(class_distribution[i] != rhs.class_distribution[i]){
1790  return false;
1791  }
1792  }
1793 
1794  return fnames == rhs.fnames &&
1795  index == rhs.index &&
1796  _size == rhs.m_size &&
1797  m_dim == rhs.m_dim &&
1798  time_mult == rhs.time_mult &&
1799  pos_class == rhs.pos_class &&
1800  neg_class == rhs.neg_class &&
1801  is_empty == rhs.is_empty &&
1802  normalized == rhs.normalized;
1803  }
1804 
1805  template<typename T>
1806  bool mltk::Data< T >::operator!=(const mltk::Data< T > &rhs) const {
1807  return !(rhs == *this);
1808  }
1809 
1810  template<typename T>
1811  void mltk::Data<T>::setType(const std::string &_type) {
1812  this->type = _type;
1813  }
1814 
1815  template<typename T>
1816  const std::string &mltk::Data<T>::getType() const {
1817  return type;
1818  }
1819 
1820  template<typename T>
1821  std::vector<std::string> mltk::Data<T>::classesNames() const{
1822  return this->class_names;
1823  }
1824 
1825  template<typename T>
1826  std::vector<size_t> mltk::Data<T>::classesDistribution() const{
1827  return this->class_distribution;
1828  }
1829 
1830  template<typename T>
1831  const std::vector<int> mltk::Data<T>::classes() const {
1832  return this->m_classes;
1833  }
1834 
1835  template<typename T>
1836  void mltk::Data<T>::setClasses(const std::vector<int> &_classes) {
1837  this->m_classes = _classes;
1838  this->class_distribution.resize(_classes.size());
1839  this->cdist_computed = false;
1840  }
1841 
1842  template <typename T>
1844  if(cdist_computed) return;
1845  this->class_distribution = std::vector<size_t>(this->m_classes.size(), 0);
1846  for(auto p: m_points){
1847  int c = p->Y();
1848  auto class_it = std::find(this->m_classes.begin(), this->m_classes.end(), c);
1849  size_t class_pos = class_it - this->m_classes.begin();
1850  class_distribution[class_pos]++;
1851  }
1852  }
1853 
1854  template<typename T>
1855  std::vector<Data<T>> Data<T>::splitSample(const std::size_t &split_size, bool stratified, bool keepIndex, const size_t seed) {
1856  std::multimap<std::string, mltk::PointPointer<T>> classified_objects;
1857  std::map<std::string, int> label_map;
1858  std::vector<mltk::Data<T>> partitions(split_size);
1859  mltk::Data<T> data = copy();
1860  size_t new_size = std::floor(double(size()) / split_size);
1861  size_t _seed = (seed == 0) ? std::random_device{}() : seed;
1862 
1863  data.shuffle(_seed);
1864 
1865  if(this->isClassification() && stratified) {
1866  for (const mltk::PointPointer<T> obj : data.points()) {
1867  classified_objects.insert({std::to_string(obj->Y()), obj});
1868  }
1869 
1870  int i = 0;
1871  for(const auto& label: data.classes()) {
1872  label_map.insert({std::to_string(label), i});
1873  i++;
1874  }
1875 
1876  // To keep track which partition should be filled next
1877  std::vector<int> fillIndex(split_size, 0);
1878 
1879  // Smarter distribution
1880  for (const auto pair : classified_objects) {
1881  // Calculate the index based on respective label's count
1882  int i = fillIndex[label_map[pair.first]] % split_size;
1883  fillIndex[label_map[pair.first]]++;
1884  partitions[i].insertPoint(pair.second, keepIndex);
1885  }
1886 
1887  return partitions;
1888  }
1889 
1890  size_t counter = 0;
1891  for(size_t i = 0; i < partitions.size(); i++) {
1892  for(size_t j = 0; j < new_size; j++) {
1893  partitions[i].insertPoint(data[counter], keepIndex);
1894  counter++;
1895  if(counter == this->size()){
1896  return partitions;
1897  }
1898  }
1899  }
1900  return partitions;
1901  }
1902 
1903  // template<typename T>
1904  // std::vector<Data<T>> Data<T>::splitSample(const std::size_t &split_size, bool stratified, bool keepIndex, const size_t seed) {
1905  // std::vector<Data<T>> split(split_size);
1906  // size_t _seed = (seed == 0) ? std::random_device{}() : seed;
1907  // auto new_size = std::floor(double(size()) / split_size);
1908 
1909  // if(this->isClassification() && stratified){
1910  // this->computeClassesDistribution();
1911  // Point< double > dist(class_distribution.size());
1912  // for(size_t i = 0; i < class_distribution.size(); i++){
1913  // dist[i] = std::ceil((class_distribution[i]/double(size()))*new_size);
1914  // }
1915  // auto classes_split = this->splitByClasses(keepIndex);
1916  // std::sort(classes_split.begin(), classes_split.end(), [](const Data<T> &a, const Data<T> &b){
1917  // return a.size() > b.size();
1918  // });
1919  // bool try_next = false;
1920  // int tries = 0;
1921  // for(int i = 0, j = 0, k = 0, l = split_size-1; j < classes_split.size(), i < size(); i++, k++, l--){
1922  // if(l < 0){
1923  // l = split_size-1;
1924  // }
1925  // //std::cout << i << " " << j << " " << k << " " << l << std::endl;
1926  // // std::cout<< split[l].size() << " " <<new_size << " " << dist << " " << classes_split[j].size() << std::endl;
1927  // // for(auto slice: split){
1928  // // auto slice_dist = mltk::Point<size_t>(slice.classesDistribution());
1929  // // if(!slice_dist.empty()) std::cout << " " << slice.size() << " " << slice_dist;
1930  // // }
1931  // //std::cout << std::endl;
1932  // if(split[l].size() == new_size && !try_next){
1933  // i--;
1934  // k--;
1935  // try_next = true;
1936  // tries++;
1937  // continue;
1938  // }
1939  // if(k < classes_split[j].size()){
1940  // int current_class = classes_split[j][k]->Y();
1941  // auto count = split[l].classesDistribution();
1942  // //if(!count.empty()) std::cout << mltk::Point<size_t>(count) << " " << dist[j] << std::endl;
1943  // if((split[l].classes().empty() || count.empty() || count[j] < dist[j]) || split[j].size() < new_size || tries == split_size-1){
1944  // split[l].insertPoint(classes_split[j][k], keepIndex);
1945  // try_next = false;
1946  // tries = 0;
1947  // }else{
1948  // try_next = true;
1949  // tries++;
1950  // i--;
1951  // k--;
1952  // }
1953  // }else{
1954  // i--;
1955  // l++;
1956  // j++;
1957  // k = -1;
1958  // }
1959  // }
1960  // for(size_t i = 0; i < split.size(); i++){
1961  // split[i].shuffle(_seed+i);
1962  // }
1963  // }else{
1964  // auto data = this->copy();
1965  // data.shuffle(_seed);
1966  // size_t counter = 0;
1967  // for(size_t i = 0; i < split.size(); i++){
1968  // for(size_t j = 0; j < new_size; j++){
1969  // split[i].insertPoint(data[counter], keepIndex);
1970  // counter++;
1971  // if(counter == this->size()){
1972  // return split;
1973  // }
1974  // }
1975  // }
1976  // }
1977  // return split;
1978  // }
1979 
1980  template<typename T>
1981  bool Data<T>::updatePointValue(const size_t &idx, const double value) {
1982  if(idx >= m_size){
1983  std::cerr << "Error [Data]: idx bigger than data size.\n";
1984  return false;
1985  }
1986  double old_value = m_points[idx]->Y();
1987 
1988  if(isClassification()){
1989  int _c = int(value);
1990  auto class_pos = std::find(m_classes.begin(), m_classes.end(), _c);
1991  if(class_pos == m_classes.end()){
1992  m_classes.push_back(_c);
1993  class_names.push_back(std::to_string(_c));
1994  class_distribution.push_back(1);
1995  }
1996  auto oldclass_pos = std::find(m_classes.begin(), m_classes.end(), int(old_value))- m_classes.begin();
1997  class_distribution[class_pos - m_classes.begin()]++;
1998  class_distribution[oldclass_pos]--;
1999  if(class_distribution[oldclass_pos] == 0){
2000  class_distribution.erase(class_distribution.begin() + oldclass_pos);
2001  m_classes.erase(m_classes.begin() + oldclass_pos);
2002  class_names.erase(class_names.begin() + oldclass_pos);
2003  }
2004 
2005  }
2006  m_points[idx]->Y() = value;
2007  return true;
2008  }
2009 
2010  template<typename T>
2011  std::vector<Data<T>> Data<T>::splitByClasses(bool keepIndex) {
2012  int last_c = std::numeric_limits<int>::max();
2013  std::vector<Data<T>> class_split(m_classes.size());
2014  auto data = copy();
2015  auto classes = data.classes();
2016  auto points = data.points();
2017 
2018  std::for_each(points.begin(), points.end(), [&](const auto pointPtr){
2019  auto class_pos = std::distance(classes.begin(), std::find(classes.begin(), classes.end(), int(pointPtr->Y())));
2020  class_split[class_pos].insertPoint(*pointPtr, keepIndex);
2021  });
2022 
2023  std::for_each(class_split.begin(), class_split.end(), [](auto& split){
2024  split.computeClassesDistribution();
2025  });
2026 
2027  return class_split;
2028  }
2029 
2030  template<typename T>
2031  Data<T> Data<T>::sampling(const size_t &samp_size, bool with_replacement, const int &seed) {
2032  assert(samp_size <= size());
2033  std::random_device rd;
2034  std::mt19937 gen((seed == 0)? std::random_device()() : seed);
2035  Data< T > sample;
2036  std::set<std::size_t> ids;
2037  auto classes_split = splitByClasses();
2038  Point<double> class_dist(m_classes.size());
2039  std::vector<std::uniform_int_distribution<size_t>> dist;
2040 
2041  auto temp_dist = classesDistribution();
2042  for(int i = 0; i < temp_dist.size(); i++){
2043  class_dist[i] = (double(temp_dist[i]) / size()) * samp_size;
2044  }
2045  for(int i = 0; i < class_dist.size(); i++){
2046  dist.emplace_back(0, class_dist[i]-1);
2047  }
2048  for(size_t i = 0; i < class_dist.size(); i++){
2049  class_dist[i] = (class_dist[i] < 1) ? 1 : std::floor(class_dist[i]);
2050  }
2051 
2052  for(size_t i = 0; i < class_dist.size(); i++){
2053  for(size_t j = 0; j < class_dist[i]; j++){
2054  std::size_t idx = dist[i](gen);
2055  if(!with_replacement) {
2056  while (ids.find(idx) != ids.end()) {
2057  idx = dist[i](gen);
2058  }
2059  ids.insert(idx);
2060  }
2061  sample.insertPoint(classes_split[i][idx]);
2062  }
2063  }
2064 
2065  return sample;
2066  }
2067 
2068  template<typename T>
2069  Data<T> Data<T>::selectFeatures(std::vector<size_t> feats, int size) {
2070  std::sort(feats.begin(), feats.end());
2071  size_t _size = (size == -1)?feats.size():size;
2072  Data<T> new_data;
2073  std::vector<size_t> feats_pos(feats.size());
2074  int i, j, invalid;
2075 
2076  for(i = 0, j = 0, invalid=0; (i < fnames.size()) && (invalid < feats.size()) && (j < feats.size()); ){
2077  if(feats[j] < 1) { invalid++; j++; continue; }
2078  if(fnames[i] == feats[j]){
2079  feats_pos[j] = i;
2080  j++;
2081  }
2082  i++;
2083  }
2084 
2085  assert((j == feats.size()) && "There are non-existing features on remove set.");
2086 
2087  for(auto const& point: this->m_points){
2088  auto new_point = make_point<T>(_size-invalid);
2089  for(i = 0; i < (_size-invalid); i++){
2090  (*new_point)[i] = (*point)[feats_pos[i]];
2091  (*new_point).Y() = (*point).Y();
2092  (*new_point).Id() = (*point).Id();
2093  (*new_point).Alpha() = (*point).Alpha();
2094  }
2095  new_data.setName(this->name());
2096  new_data.insertPoint(new_point, true);
2097  }
2098  return new_data;
2099  }
2100 
2101  template<typename T>
2102  Data<T>::Data(const Data<T> &other) {
2103  this->copy(other);
2104  }
2105 
2106  template<typename T>
2108  Point<T> feat(m_size, T());
2109  for(int i = 0; i < m_size; i++){
2110  feat[i] = (*m_points[i])[idx];
2111  }
2112  return feat;
2113  }
2114 
// NOTE(review): the line carrying this definition's signature (source line 2116)
// is missing from this listing. Presumably this is Data<T>::getLabels() const,
// since labels() below returns getLabels() as a Point<double> - confirm against the header.
//
// Collects the Y() value of every point into a Point<double> with m_size entries.
2115  template<typename T>
2117  Point<double> labels(m_size, double());
2118  for(int i = 0; i < m_size; i++){
2119  labels[i] = m_points[i]->Y();
2120  }
2121  return labels;
2122  }
2123 
2124  template<typename T>
2125  Point<double> Data<T>::labels() const{
2126  return getLabels();
2127  }
2128 
2129  template<typename T>
2130  void Data<T>::apply(std::function<void(mltk::PointPointer<T> point)> f) {
2131  std::for_each(this->m_points.begin(),this->m_points.end(), f);
2132  }
2133 
2134  template<typename T>
2135  std::string Data<T>::discover_dataset_name(const std::string &path) {
2136  auto tokens = mltk::utils::tokenize(path, '/');
2137  tokens = mltk::utils::tokenize(tokens.back(), '.');
2138  return (tokens.empty())?std::string():tokens[0];
2139  }
2140 
2141  template<typename T>
2142  mltk::Point<int> Data<T>::featuresNames() const {
2143  return getFeaturesNames();
2144  }
2145 }
Wrapper for the dataset data.
Definition: Data.hpp:74
void clear()
clear Clear the data.
Definition: Data.hpp:1727
bool insertPoint(const Data< T > &samples, int _index, bool keepIndex=false)
Insert a point to the data from another sample.
Definition: Data.hpp:1437
bool load(const std::string &file, bool _atEnd)
Load a dataset from a file.
Definition: Data.hpp:656
double getTime_mult() const
Return the time multiplier.
Definition: Data.hpp:1765
void copy(const Data< T > &_data)
Makes a deep copy from another data object.
Definition: Data.hpp:1558
std::vector< Data< T > > splitSample(const std::size_t &split_size, bool stratified=true, bool keepIndex=false, size_t seed=0)
Split the data by a given size.
Definition: Data.hpp:1855
void apply(std::function< void(mltk::PointPointer< T > point)> f)
Apply a function to all points on the dataset.
Definition: Data.hpp:2130
void setClassesAtEnd(bool atEnd)
Inform whether the classes of the dataset to be loaded are at the beginning or at the end of the attributes.
Definition: Data.hpp:292
void classesCopy(const Data< T > &_data, std::vector< int > &classes)
Makes a deep copy of the points of another data object whose class is in the given list of classes.
Definition: Data.hpp:1520
std::vector< bool > removePoints(std::vector< int > ids)
Remove several points from the sample.
Definition: Data.hpp:1252
Data(const char *dataset, const char *datasetType, const char *pos_class="1", const char *neg_class="-1")
Data constructor to load a dataset from a file.
Definition: Data.hpp:581
void join(const Data< T > &data)
Merge one dataset with another.
Definition: Data.hpp:1598
bool load(const std::string &file)
Load a dataset from a file.
Definition: Data.hpp:629
PointPointer< T > point(int index) const
Returns a shared pointer to the point with the given index.
Definition: Data.hpp:1510
size_t size() const
Returns the size of the dataset.
Definition: Data.hpp:208
bool removeFeatures(std::vector< int > feats)
Remove features from the dataset.
Definition: Data.hpp:1368
void resetIndex()
Reset the index vector.
Definition: Data.hpp:1759
void changeXVector(std::vector< int > index)
Change the x vector of a sample.
Definition: Data.hpp:1495
void setFeaturesNames(const std::vector< int > &fnames)
setFeaturesNames Set the name of the features of the data.
Definition: Data.hpp:1680
void copyZero(const Data< T > &other)
Copies the feature names, dimension and flags of another data object without copying any points.
Definition: Data.hpp:1588
std::string name() const
The name of the dataset is defined as the name of the file from which it was loaded.
Definition: Data.hpp:203
bool isNormalized() const
Returns if the dataset is normalized.
Definition: Data.hpp:355
Data< T > sampling(const size_t &samp_size, bool with_replacement=true, const int &seed=0)
Sample the dataset with the given size.
Definition: Data.hpp:2031
const std::string & getType() const
Returns the type of the dataset.
Definition: Data.hpp:1816
void write(const std::string &fname, std::string ext)
write Write the data to a file with the given extension.
Definition: Data.hpp:1215
void setClassesNames(const std::vector< std::string > class_names)
Set classes names.
Definition: Data.hpp:282
void setIndex(std::vector< int > index)
Set the index vector for the data.
Definition: Data.hpp:1754
const std::vector< int > classes() const
Returns a vector containing the numeric values of the classes.
Definition: Data.hpp:1831
bool updatePointValue(const size_t &idx, double value)
Updates a Point value.
Definition: Data.hpp:1981
bool insertPoint(Point< T > p, bool keepIndex=false)
Insert a point to the end of points vector.
Definition: Data.hpp:1490
Data(const std::string &dataset, bool atEnd)
Definition: Data.hpp:553
void setName(const std::string &name)
Set the name of the dataset.
Definition: Data.hpp:277
Data(size_t size, size_t dim, T val=T())
Constructor for instantiation with an initial size and dimension.
Definition: Data.hpp:560
Data(const char *dataset)
Constructor for empty data.
Definition: Data.hpp:546
std::vector< int > getFeaturesNames() const
Returns the features names.
Definition: Data.hpp:1675
Data< T > insertFeatures(std::vector< int > ins_feat, bool keepIndex=false)
insertFeatures Returns Data object with only features in array.
Definition: Data.hpp:1297
bool removePoint(int pid)
Remove a point from the data.
Definition: Data.hpp:1183
bool isEmpty() const
Returns if there's a dataset loaded.
Definition: Data.hpp:1700
bool insertPoint(std::shared_ptr< Point< T > > p, bool keepIndex=false)
Insert a point to the end of points vector.
Definition: Data.hpp:1449
void setPoint(int index, std::shared_ptr< Point< T > > p)
setPoint Set the point in a position of the data.
Definition: Data.hpp:1515
void setClasses(const std::vector< int > &classes)
Set the classes to use in the dataset.
Definition: Data.hpp:1836
std::vector< int > getIndex() const
Returns the vector of indexes.
Definition: Data.hpp:1695
size_t dim() const
Returns the dimension of the dataset.
Definition: Data.hpp:213
void setDim(size_t dim)
setDim Set the dimension of the points.
Definition: Data.hpp:1670
void normalize(double p=2)
normalize Normalize the dataset using a Lp-norm.
Definition: Data.hpp:1646
std::vector< std::shared_ptr< Point< T > > > points()
Returns the vector of shared pointers to the Points of the sample.
Definition: Data.hpp:1685
void computeClassesDistribution()
Compute the frequency of each class in the dataset.
Definition: Data.hpp:1843
std::vector< Data< T > > splitByClasses(bool keepIndex=false)
Split the dataset by its labels. Only valid for classification datasets.
Definition: Data.hpp:2011
Data< T > removeFeatures(std::vector< int > feats, int fsize) const
Remove several features from the sample.
Definition: Data.hpp:1359
std::vector< std::shared_ptr< Point< T > > > points() const
Returns the vector of shared pointers to the Points of the sample.
Definition: Data.hpp:1690
mltk::Data< T > copy() const
Returns a copy of itself.
Definition: Data.hpp:1551
Data< T > selectFeatures(std::vector< size_t > feats, int size=-1)
Returns a Data object with selected features.
Definition: Data.hpp:2069
Point< T > getFeature(int index) const
Get the values of a feature from all points.
Definition: Data.hpp:2107
std::vector< std::string > classesNames() const
Returns a vector containing the name of the classes. Only valid for classification datasets.
Definition: Data.hpp:1821
void shuffle(const size_t &seed=42)
Shuffle the data with a given seed.
Definition: Data.hpp:1349
std::vector< size_t > classesDistribution() const
Returns a vector containing the frequency of the classes. Only valid for classification datasets.
Definition: Data.hpp:1826
bool isClassification() const
Inform if the dataset is used for classification.
Definition: Data.hpp:198
std::size_t size() const
Returns the dimension of the point.
Definition: Point.hpp:133
Definition: Data.hpp:60
UFJF-MLTK main namespace for core functionalities.
Definition: classifier/Classifier.hpp:11
DataPointer< T > make_data(Types... args)
Makes a shared_pointer for a data object.
Definition: Data.hpp:532
T max(const Point< T, R > &p)
Returns the max value of the point.
Definition: Point.hpp:544