/// File extensions known to the dataset loader. The position of the first
/// four entries matches the numeric value of the corresponding Type
/// enumerator ("data" = 0, "csv" = 1, "arff" = 2, "txt" = 3); "plt" has no
/// enumerator of its own.
static const std::vector<std::string> types{"data", "csv", "arff", "txt", "plt"};
/// Dataset file formats returned by the file-type detection.
/// TYPE_INVALID marks a file whose extension was not recognised.
enum Type {
    TYPE_INVALID = -1,
    TYPE_DATA = 0,
    TYPE_CSV = 1,
    TYPE_ARFF = 2,
    TYPE_TXT = 3
};
59 template <
typename T >
62 template <
typename T >
65 template <
class T >
using DataPointer = std::shared_ptr<mltk::Data< T > >;
66 template <
class T >
using SamplePointer = std::shared_ptr<mltk::Point< T > >;
67 template <
class T >
using SampleIterator =
typename std::vector<SamplePointer< T > >::iterator;
68 template <
class T >
using ConstSampleIterator =
typename std::vector<SamplePointer< T > >::const_iterator;
73 template <
typename T =
double >
/// Name of the dataset; returned by name() and assigned by load() from the file path.
78 std::string dataset_name{
""};
/// Points (samples) of the dataset, held through shared pointers.
80 std::vector<SamplePointer<T> > m_points = {};
/// Integer feature names; the loaders fill this with 1, 2, ... via std::iota.
82 std::vector<int> fnames = {};
/// Index vector; the loaders and resetIndex() fill it with 0, 1, ... via std::iota.
84 std::vector<int> index = {};
/// Names of the classes.
86 std::vector<std::string> class_names = {};
/// Integer labels of the classes.
88 std::vector<int> m_classes = {};
/// Per-class point counts, indexed by the position of the class in m_classes.
90 std::vector<size_t> class_distribution = {};
/// Time multiplier (see getTime_mult()).
96 double time_mult = 0.0;
/// Tokens the loaders compare against when probing for the class column; default "1" and "-1".
98 std::string pos_class = std::string(
"1"), neg_class = std::string(
"-1");
/// Cleared once points are inserted or created.
100 bool is_empty =
true;
/// Normalization flag (see isNormalized()).
104 bool normalized =
false;
/// When true, the class-distribution computation returns early without recounting.
106 bool cdist_computed =
false;
/// Dataset type string; isClassification() accepts "Classification",
/// "MultiClassification" and "BinClassification". A loader may also set "Regression".
108 std::string type =
"Classification";
/// Accessor for the dataset type string.
/// NOTE(review): the definition is not visible in this excerpt; presumably returns `type`.
114 [[nodiscard]]
const std::string &
getType()
const;
/// Converts a class token read from a file into an integer label.
/// @param item token to convert; taken by non-const reference.
123 int process_class(std::string& item);
/// Determines the Type of a file from its name.
129 Type identifyFileType (std::string file);
/// Loads a dataset from a csv file.
/// @return bool status. NOTE(review): presumably true on success - confirm against the definition.
135 bool load_csv (
const std::string& path);
/// Loads a dataset from an arff file.
141 bool load_arff (
const std::string& path);
/// Loads a dataset from a data file.
147 bool load_data (
const std::string& path);
/// Loads a dataset from a txt file.
153 bool load_txt (
const std::string& path);
/// Derives the dataset name from a path: takes the last '/'-separated token and
/// returns the part of it before the first '.' (empty string when there is none).
159 std::string discover_dataset_name(
const std::string& path);
/// Sets the dataset type string.
/// NOTE(review): the definition is not visible in this excerpt.
162 void setType(
const std::string &type);
/// Constructs the data by loading `dataset` with load(dataset, atEnd); prints
/// "Couldn't read the dataset." to std::cerr when loading fails.
170 Data(
const std::string &dataset,
bool atEnd);
/// Constructs the data by loading a dataset from a file. Stores `pos_class`,
/// `neg_class` and `datasetType` before loading, and prints
/// "Couldn't read the dataset." to std::cerr when loading fails.
177 explicit Data (
const char* dataset,
const char* datasetType,
const char* pos_class =
"1",
const char* neg_class =
"-1");
/// Constructs the data by loading `dataset`; prints "Couldn't read the dataset."
/// to std::cerr when loading fails.
190 explicit Data (
const char* dataset);
198 [[nodiscard]]
bool isClassification()
const {
return (type ==
"Classification" || type ==
"MultiClassification" || type ==
"BinClassification");}
203 [[nodiscard]] std::string
name()
const{
return dataset_name; }
208 [[nodiscard]]
size_t size()
const{
return m_points.size();};
213 [[nodiscard]]
size_t dim()
const{
return (!m_points.empty()) ? m_points.front()->size() : 0; }
/// Returns the points of the dataset as a vector of shared pointers (returned by value).
218 std::vector<std::shared_ptr<Point< T > > >
points ();
/// Const overload of points().
223 std::vector<std::shared_ptr<Point< T > > >
points ()
const;
/// Returns the integer class labels (by value).
/// NOTE(review): presumably a copy of m_classes - confirm against the definition.
228 [[nodiscard]]
const std::vector<int>
classes()
const;
/// Returns a shared pointer to the point with the given index.
234 PointPointer<T>
point (
int index)
const;
282 inline void setClassesNames(
const std::vector<std::string> class_names){ this->class_names = class_names; }
/// Loads a dataset from a file. Identifies the file type from the name, derives
/// the dataset name from the path and dispatches to the matching loader; prints
/// "Invalid file type." to std::cerr for an unrecognised type.
333 bool load (
const std::string& file);
/// Same as load(file), additionally storing `_atEnd` in the `atEnd` member
/// before dispatching to the loader.
339 bool load (
const std::string& file,
bool _atEnd);
/// Writes the dataset to the file named `fname + "." + ext`.
/// @param ext file extension; the "data" and "csv" layouts are handled explicitly.
345 void write(
const std::string& fname, std::string ext);
/// Splits the data into `split_size` partitions.
/// @param stratified when true and the data is a classification set, points are
///        grouped by label before being dealt to the partitions.
/// @param keepIndex forwarded to insertPoint for every inserted point.
/// @param seed seed value; 0 means one is drawn from std::random_device.
392 std::vector< Data< T > >
splitSample(
const std::size_t &split_size,
bool stratified =
true,
bool keepIndex =
false,
size_t seed = 0);
/// Applies `f` to every point of the dataset (std::for_each over the stored points).
414 void apply(std::function<
void(mltk::PointPointer<T>
point)> f);
497 SampleIterator<T> begin() {
return m_points.begin(); }
499 SampleIterator<T> end() {
return m_points.end(); }
501 ConstSampleIterator<T> begin()
const {
return m_points.cbegin(); }
503 ConstSampleIterator<T> end()
const {
return m_points.cend(); }
505 std::shared_ptr<Point< T > > operator[](
size_t i)
const {
return m_points[i];}
507 std::shared_ptr<Point< T > > & operator[](
size_t i) {
return m_points[i];}
509 Point< T > operator()(
size_t i)
const {
return *(m_points[i]);}
510 Point< T >& operator()(
size_t i) {
return *m_points[i];}
/// Equality comparison: compares the points element-wise, then the class
/// labels, the class distribution and the remaining metadata members.
514 bool operator==(
const Data< T > &rhs)
const;
/// Inequality comparison; defined as the negation of operator==.
516 bool operator!=(
const Data< T > &rhs)
const;
/// Stream insertion: writes every point of `data` followed by std::endl.
518 template<
typename U >
519 friend std::ostream &operator<<( std::ostream &output,
const Data< U > &data );
531 template <
typename T,
typename... Types >
533 return std::make_shared< Data < T > >(args...);
536 template <
typename T >
537 std::ostream &operator<<( std::ostream &output,
const Data< T > &data ){
538 for(
auto p : data.m_points){
539 output << *p << std::endl;
547 if(!load(std::string(dataset))){
548 std::cerr <<
"Couldn't read the dataset." << std::endl;
554 if(!
load(std::string(dataset), atEnd)){
555 std::cerr <<
"Couldn't read the dataset." << std::endl;
563 this->m_points.resize(size);
564 this->fnames.resize(dim);
565 this->index.resize(size);
567 for(i = 0; i < size; i++){
568 this->m_points[i] = std::make_shared<Point< T > >(dim, val);
569 this->m_points[i]->Id() = i+1;
570 this->m_points[i]->Y() = 0;
573 std::iota(fnames.begin(), fnames.end(), 1);
574 std::iota(index.begin(), index.end(), 0);
577 if(size > 0) this->is_empty =
false;
582 this->pos_class = std::string(pos_class);
583 this->neg_class = std::string(neg_class);
584 this->type = std::string(datasetType);
586 if(!load(std::string(dataset))) {
587 std::cerr <<
"Couldn't read the dataset." << std::endl;
// Body fragment of the file-type detection: scans `file` backwards from its
// last character towards the last '.', comparing the tail against each entry
// of `types`. The signature and several interior lines are not visible here.
// NOTE(review): l, k, i and j are size_t, so the `k >= 0` and `j >= 0` loop
// conditions below are always true and cannot stop the loops; only the '.'
// test and the returns do. With an empty `file`, `file.size() - 1` wraps to
// the maximum size_t value before `file[k]` is evaluated.
593 size_t l , k, i, j, tsize = types.size();
595 for(k = file.size() - 1; (k >= 0 && file[k] !=
'.') ; k--){
596 for(i = 0; i < tsize; ++i){
597 if(file[k] == types[i][types[i].size()-1]){
598 for(j = types[i].size() - 1, l = 0; (j >= 0 && file[k-l] !=
'.') ; j--, l++){
599 if(j == 0 && types[i][j] == file[k-l]){
602 return Type::TYPE_DATA;
605 return Type::TYPE_CSV;
608 return Type::TYPE_ARFF;
611 return Type::TYPE_TXT;
614 return Type::TYPE_INVALID;
617 }
else if(types[i][j] != file[k-l]){
625 return Type::TYPE_INVALID;
630 Type t = identifyFileType(file);
631 this->dataset_name = discover_dataset_name(file);
632 this->cdist_computed =
true;
639 return load_arff(file);
641 return load_csv(file);
643 return load_data(file);
645 return load_txt(file);
647 std::cerr <<
"Invalid file type." << std::endl;
650 fnames.assign(this->dim(), 0);
651 std::iota(fnames.begin(), fnames.end(), 1);
657 Type t = identifyFileType(file);
658 this->dataset_name = discover_dataset_name(file);
659 this->cdist_computed =
true;
661 this->atEnd = _atEnd;
665 return load_arff(file);
667 return load_csv(file);
669 return load_data(file);
671 return load_txt(file);
673 std::cerr <<
"Invalid file type." << std::endl;
676 fnames.assign(this->dim(), 0);
677 std::iota(fnames.begin(), fnames.end(), 1);
683 std::ifstream input(path.c_str());
684 std::stringstream ss;
685 std::string str, item;
686 int _dim, ldim, _size, ssize;
691 _dim = ldim = _size = 0;
694 std::cout <<
"File could not be opened!" << std::endl;
699 while(std::getline(input, str)){
700 str = mltk::utils::trim_copy(str);
707 for(
int i = 0; i < ssize; ++i){
711 }
else if(str[i] ==
';'){
718 while(getline(ss, item, deli)){
720 if(this->isClassification()) {
721 if (_dim == -1 && !flag) {
722 if (!((item == pos_class) || (item == neg_class))) {
725 }
else if (ss.eof() && !flag) {
726 if (!((item == pos_class) || (item == neg_class))) {
734 if(ldim != _dim && ldim != 0){
735 std::cerr <<
"All the samples must have the same dimension!" << std::endl;
745 input.seekg(0, std::ios::beg);
749 this->m_size = _size;
752 fnames.assign(_dim, 0);
753 index.assign(_size, 0);
754 std::iota(fnames.begin(), fnames.end(), 1);
755 std::iota(index.begin(), index.end(), 0);
758 m_points.resize(_size);
761 while(std::getline(input, str)){
762 str = mltk::utils::trim_copy(str);
763 auto new_point = std::make_shared<Point< T > >();
769 new_point->X().resize(this->m_dim, 0.0);
772 while(std::getline(ss, item, deli)){
779 if(utils::is_number(item))
780 new_point->X()[(!atEnd) ? _dim : _dim + 1] = utils::atod(item.c_str());
784 if(this->isClassification()) {
785 c = process_class(item);
787 c = utils::atod(item.c_str());
794 m_points[_size++] = std::move(new_point);
795 m_points[_size - 1]->Id() = _size;
806 std::ifstream input(path.c_str());
807 std::string str, item, buffer;
808 std::stringstream ss, ss1;
809 int _dim, ldim, _size;
811 bool is_feature, type_detect =
false;
814 std::cout <<
"File could not be opened!" << std::endl;
817 _dim = ldim = _size = c = 0;
819 while(std::getline(input, str)){
820 str = mltk::utils::trim_copy(str);
826 while(std::getline(ss, item,
' ')){
827 if(item ==
"")
continue;
828 item = mltk::utils::trim_copy(item);
829 const char * pch = std::strchr(item.c_str(),
':');
838 while(_size > 0 && std::getline(ss1, item,
':')) {
839 if(_dim == ldim && !type_detect){
840 if(strchr(item.c_str(),
'.')){
841 this->type =
"Regression";
843 this->type =
"Classification";
846 }
else if(_dim < ldim && !utils::is_number(item)){
847 std::clog <<
"Warning (line: " << _size <<
"): feature " << _dim <<
" is not a number." << std::endl;
851 if(ldim != _dim && ldim != 0){
852 std::cerr <<
"Error (line: " << _size <<
"): all the samples must have the same dimension! (_dim: " << _dim <<
", last_dim: " << ldim <<
")" << std::endl;
861 input.seekg(0, std::ios::beg);
865 this->m_size = _size;
868 fnames.assign(_dim, 0);
869 index.assign(_size, 0);
870 std::iota(fnames.begin(), fnames.end(), 1);
871 std::iota(index.begin(), index.end(), 0);
874 m_points.resize(_size);
878 while(std::getline(input, str)){
879 str = mltk::utils::trim_copy(str);
880 auto new_point = std::make_shared<Point< T > >();
884 new_point->X().resize(this->m_dim, 0.0);
888 while(std::getline(ss, item,
' ')){
889 item = mltk::utils::trim_copy(item);
890 const char * pch = std::strchr(item.c_str(),
':');
892 if(this->isClassification()) {
893 c = process_class(item);
895 c = utils::atod(item.c_str());
904 while(std::getline(ss1, item,
':')){
905 item.erase( std::remove(item.begin(), item.end(),
'\r'), item.end() );
907 fnames[_dim] = utils::stoin(item);
910 if(utils::is_number(item)){
911 new_point->X()[_dim++] = utils::atod(item.c_str());
918 m_points[_size++] = std::move(new_point);
919 m_points[_size - 1]->Id() = _size;
922 if(m_classes.size() == 2){
923 type =
"BinClassification";
925 type =
"MultiClassification";
935 std::ifstream input(path.c_str());
936 std::istringstream ss;
937 std::string str, item;
938 int _dim, ldim, _size, c;
939 bool atBegin, flag, cond;
942 std::cout <<
"File could not be opened! \nError: " << std::strerror(errno) << std::endl;
946 _dim = ldim = _size = c = 0;
947 atBegin = flag = cond =
false;
950 while(std::getline(input, str)){
951 str = mltk::utils::trim_copy(str);
955 while(std::getline(ss, item,
',')){
957 auto found_class = std::find(this->class_names.begin(), this->class_names.end(), item);
959 if(!utils::is_number(item) && (found_class != this->class_names.end())){
960 std::clog <<
"Warning: point[" << _size <<
"] " << _dim + 1 <<
" feature is not a number. (" << item <<
")" << std::endl;
963 if(this->isClassification()) {
964 if (_dim == 0 && !flag) {
965 if (!((item == pos_class) || (item == neg_class))) {
969 }
else if (ss.eof() && !flag) {
970 if (!((item == pos_class) || (item == neg_class))) {
979 if(ldim != _dim && ldim != 0){
980 std::cerr <<
"All the samples must have the same dimension!" << std::endl;
989 input.seekg(0, std::ios::beg);
994 this->m_size = _size;
997 fnames.assign(_dim, 0);
998 index.assign(_size, 0);
999 std::iota(fnames.begin(), fnames.end(), 1);
1000 std::iota(index.begin(), index.end(), 0);
1003 m_points.resize(_size);
1008 while(std::getline(input, str)){
1009 str = mltk::utils::trim_copy(str);
1010 auto new_point = std::make_shared<Point< T > >();
1015 new_point->X().assign(this->m_dim, 0.0);
1018 while(std::getline(ss, item,
',')){
1020 cond = (!ss.eof() && atEnd);
1025 if(utils::is_number(item)){
1026 new_point->X()[_dim + 1] = utils::atod(item.c_str());
1029 if(this->isClassification()){
1030 c = process_class(item);
1037 m_points[_size++] = std::move(new_point);
1038 m_points[_size - 1]->Id() = _size;
1047 template<
typename T>
1049 std::ifstream input(path.c_str());
1050 std::istringstream ss;
1051 std::string str, item;
1052 int n, _dim = 0, n1, d, _size;
1055 std::cout <<
"File could not be opened!" << std::endl;
1062 while(std::getline(input, str)){
1063 str = mltk::utils::trim_copy(str);
1068 while(std::getline(ss, item,
' ')){
1075 if(_size != 0 && _dim != d ){
1076 std::cerr << _dim <<
" " << this->m_dim << std::endl;
1077 std::cerr <<
"All the samples must have the same dimension!" << std::endl;
1087 this->m_size = _size;
1091 fnames.assign(_dim, 0);
1092 index.assign(_size, 0);
1093 std::iota(fnames.begin(), fnames.end(), 1);
1094 std::iota(index.begin(), index.end(), 0);
1097 input.seekg(0, std::ios::beg);
1100 m_points.resize(_size);
1105 while(std::getline(input, str)){
1106 str = mltk::utils::trim_copy(str);
1107 auto new_point = std::make_shared<Point< T > >();
1110 new_point->X().resize(_dim, 0.0);
1115 while(std::getline(ss, item,
' ')){
1117 if(utils::is_number(item))
1118 new_point->X()[n - 2] = utils::atod(item.c_str());
1119 else{ std::clog <<
"Warning: point[" << _size <<
"] " << n - 2 <<
" feature is not a number." << std::endl; }
1122 new_point->Id() = utils::atod(item.c_str());
1127 m_points[_size++] = std::move(new_point);
1128 m_points[_size - 1]->Id() = _size;
// Body fragment that converts a class token `item` into an integer label `c`
// and updates class_names, m_classes and class_distribution. Presumably the
// definition of process_class; the signature line is not visible here.
1137 template<
typename T>
// NOTE(review): this lookup runs on `item` before whitespace is stripped on
// the next line, while the stripped token is what gets stored in class_names.
1140 auto class_name_it = std::find(this->class_names.begin(), this->class_names.end(), item);
1142 item.erase(std::remove_if(item.begin(), item.end(), ::isspace), item.end());
// NOTE(review): int has no infinity, so std::numeric_limits<int>::infinity()
// evaluates to 0; an empty token therefore yields the label 0.
1143 if(item.empty())
return std::numeric_limits<int>::infinity();
1145 if(class_name_it == class_names.end()){
1146 this->class_names.push_back(item);
1147 auto has_mone = std::find(class_names.begin(), class_names.end(),
"-1");
1148 if(has_mone == class_names.end()){
1149 if(utils::is_number(item)) {
1150 c = std::stoi(item);
// Non-numeric token: label is the 1-based position of the name in class_names.
1152 auto pos = std::find(class_names.begin(), class_names.end(), item);
1153 c = (int)(pos - this->class_names.begin())+1;
1163 if(utils::is_number(item)) {
1164 c = std::stoi(item);
1166 c = (int)(class_name_it - this->class_names.begin())+1;
// Register the label if it is new, then bump the counter of its class.
1169 auto class_it = std::find(this->m_classes.begin(), this->m_classes.end(), c);
1170 if(class_it == this->m_classes.end()){
1171 this->m_classes.push_back(c);
1172 this->class_distribution.push_back(0);
1174 auto it_pos = std::find_if(this->m_classes.begin(), this->m_classes.end(), [&c](
const int &_class){
1175 return (_class == c);
1177 this->class_distribution[(it_pos-this->m_classes.begin())]++;
1182 template<
typename T>
1186 if(m_size == 1){ std::cout <<
"Error: RemovePoint, only one point left\n";
return false; }
1188 if(pid > m_points[m_size - 1]->Id() || pid <= 0)
return false;
1191 index.resize(m_size);
1193 for(i = 0; i < m_size; ++i){
1195 index[i-1] = index[i] - 1;
1201 for(i = 0; i < m_size; i++){
1202 if(m_points[i]->Id() == pid){
1203 y = m_points[i]->Y();
1204 m_points.erase(m_points.begin() + i);
1209 auto pos = std::find(m_classes.begin(), m_classes.end(), y) - m_classes.begin();
1210 class_distribution[pos]--;
1214 template<
typename T>
1217 std::string path = fname +
"." + ext;
1218 std::ofstream outstream(path.c_str(), std::ios::out);
1221 if(!outstream.is_open()){
1222 std::cerr <<
"Can't write in file." << std::endl;
1226 for(i = 0; i < m_size; i++){
1228 outstream << m_points[i]->Y() <<
" ";
1229 for(j = 0; j < m_dim-1; j++){
1230 outstream << m_points[i]->X()[j] <<
" ";
1232 outstream << m_points[i]->X()[j] << std::endl;
1233 }
else if(ext ==
"data"){
1234 outstream << m_points[i]->Y() <<
" ";
1235 for(j = 0; j < m_dim-1; j++){
1236 outstream << fnames[j] <<
":" << m_points[i]->X()[j] <<
" ";
1238 outstream << fnames[j] <<
":" << m_points[i]->X()[j] <<
"\n";
1239 }
else if(ext ==
"csv"){
1240 outstream << m_points[i]->Y() <<
",";
1241 for(j = 0; j < m_dim-1; j++){
1242 outstream << m_points[i]->X()[j] <<
",";
1244 outstream << m_points[i]->X()[j] <<
"\n";
1251 template<
typename T>
1253 int idsize = ids.size(), i, j;
1255 std::shared_ptr<Point< T > > po;
1256 auto p = m_points.begin();
1257 std::vector<bool> notFound(idsize,
true);
1259 std::sort(ids.begin(), ids.end());
1261 for(; p != m_points.end();){
1264 for(i = 0; i < idsize; i++){
1265 if(po->Id() == ids[i]){
1267 notFound[i] =
false;
1274 p = m_points.erase(p);
1277 index.resize(m_size);
1279 for(j = 0; j < m_size; ++j){
1281 index[j-1] = index[j] - 1;
1286 if(m_size == 1){ std::clog <<
"Error: RemovePoint, only one point left." << std::endl;
break;}
1287 auto class_pos = std::find(m_classes.begin(), m_classes.end(), c) - m_classes.begin();
1288 class_distribution[class_pos]--;
1296 template<
typename T>
1298 size_t i, j, s, offset = 0, fsize = ins_feat.
size();
1299 bool saveflag =
false;
1300 std::vector<int> new_fnames(fsize, 0);
1301 std::shared_ptr<Point< T > > p;
1304 if(fsize == 0)
return this->copy();
1305 sort(ins_feat.begin(), ins_feat.end());
1308 if(fsize > m_dim){ std::cerr <<
"Error: InsertFeature, fsize(" << ins_feat.size() <<
")>dim(" << m_dim <<
")\n";
return *smout; }
1309 smout->setDim(fsize);
1312 for(i = 0; i < m_size; i++){
1313 p = std::make_shared<Point< T > >();
1314 p->X().resize(fsize);
1315 p->Alpha() = m_points[i]->Alpha();
1316 p->Id() = m_points[i]->Id();
1317 p->Y() = m_points[i]->Y();
1321 for(j = 0; j < m_dim; j++){
1322 if(offset < fsize && fnames[j] == ins_feat[offset]){
1328 p->X()[s] = m_points[i]->X()[j];
1329 new_fnames[s] = fnames[j];
1336 std::cerr <<
"Error: Something went wrong on InsertFeature\n";
1337 std::cerr <<
"s = " << s <<
", dim = " << m_dim <<
", fsize = " << fsize << std::endl;
1341 smout->insertPoint(p, keepIndex);
1343 smout->setFeaturesNames(new_fnames);
1348 template<
typename T>
1350 std::mt19937 gen((seed == 0) ? std::random_device{}() : seed);
1351 std::shuffle(m_points.begin(), m_points.end(), gen);
1353 for(
size_t i = 0; i < m_size; i++){
1358 template <
typename T >
1360 auto data_copy = this->copy();
1361 size_t feats_size = (fsize==-1)?feats.size():fsize;
1362 std::vector<int> to_remove(feats.begin(), feats.begin()+feats_size);
1363 data_copy.removeFeatures(to_remove);
1367 template<
typename T>
1369 size_t i, j, k, psize = m_points.
size(), rsize = feats.size();
1370 typename std::vector< T >::iterator itr;
1371 std::vector<int>::iterator fitr;
1372 std::vector<bool> exist(rsize,
true);
1374 if(feats.empty())
return true;
1376 if(fnames.size() == 1){
1377 std::cerr <<
"Error: RemoveFeature, only one feature left.\n";
1386 sort(feats.begin(), feats.end());
1389 for(i = 0; i < rsize; i++){
1390 for(j = 0; j < m_dim; j++){
1391 if((feats[i] > -1) && feats[i] == fnames[j]){
1401 for(i = 0; i < psize; i++){
1402 if(m_points[i] ==
nullptr) std::clog <<
"WARNING: point is null." << std::endl;
1405 for(itr = m_points[i]->X().begin(),k = 0, j = 0; itr != m_points[i]->X().end();){
1406 while(k < rsize && !exist[k]) k++;
1407 if(k == rsize)
break;
1410 if(fnames[j] == feats[k]){
1411 itr = m_points[i]->X().erase(itr);
1421 for(k = 0; k < rsize; k++){
1422 for(fitr = fnames.begin(); fitr != fnames.end();){
1423 if((*fitr) == feats[k]){
1424 fitr = fnames.erase(fitr);
1436 template<
typename T>
1438 if(_index > samples.
size() - 1){
1439 std::cerr <<
"Index out of bounds. (insertPoint)" << std::endl;
1443 insertPoint(samples.
point(_index), keepIndex);
1448 template<
typename T>
1451 if(size() > 0 && p && p->size() != this->dim()){
1452 std::cerr <<
"Point with dimension different from the data. (insertPoint)" << std::endl;
1453 std::cerr <<
"Point dim = " << p->size() <<
" dim = " << m_dim << std::endl;
1458 this->fnames = std::vector<int>(p->size(), 0);
1459 std::iota(this->fnames.begin(), this->fnames.end(), 1);
1463 m_points.insert(m_points.end(), p);
1465 if(is_empty) is_empty =
false;
1466 if(m_dim == 0) m_dim = p->size();
1468 if(this->isClassification()){
1469 auto class_pos = std::find(this->m_classes.begin(), this->m_classes.end(), p->Y());
1471 if(class_pos == this->m_classes.end()){
1472 this->class_names.push_back(std::to_string(
int(m_points[m_size - 1]->Y())));
1473 this->m_classes.push_back(m_points[m_size - 1]->Y());
1474 this->class_distribution.push_back(1);
1476 this->class_distribution[int(class_pos - this->m_classes.begin())]++;
1481 size_t last_pos = m_size - 1;
1483 m_points[last_pos]->Id() = m_size;
1485 index.push_back(m_points.size()-1);
1489 template<
typename T>
1491 return this->insertPoint(std::make_shared<
Point< T > >(p), keepIndex);
1494 template<
typename T>
1497 std::vector<std::shared_ptr<Point< T > > > nPoints(m_size);
1500 for(i = 0; i < m_size; i++){
1501 nPoints[i]->X() = m_points[_index[i]]->X();
1502 nPoints[i]->X() = m_points[_index[i]]->X();
1509 template<
typename T>
1511 return m_points[_index];
1514 template<
typename T>
1516 m_points[_index] = p;
1519 template<
typename T>
1522 std::set<int> _classes;
1523 for(
auto c: classes){
1526 for(
size_t i = 0; i < _data.
size(); i++){
1527 if(_classes.find(_data[i]->Y()) != _classes.end()){
1528 this->m_points.push_back(std::make_shared<
Point< T > >());
1529 size_t curr = this->m_points.size()-1;
1530 this->m_points[curr]->X() = _data[i]->X();
1531 this->m_points[curr]->Y() = _data[i]->Y();
1532 this->m_points[curr]->Alpha() = _data[i]->Alpha();
1533 this->m_points[curr]->Id() = _data[i]->Id();
1539 this->m_size = _size;
1540 this->m_classes = classes;
1541 this->m_dim = _data.
dim();
1544 this->is_empty = _data.
isEmpty();
1547 this->cdist_computed =
false;
1550 template<
typename T>
1557 template<
typename T>
1559 size_t _size = _data.
size();
1560 if(this->m_points.size() > 0) this->m_points.clear();
1562 for(
size_t i = 0; i < _size; i++){
1563 auto point = std::make_shared<Point< T > >();
1565 point->X().resize(_data[i]->X().
size());
1566 std::copy(_data[i]->X().begin(), _data[i]->X().end(), point->X().begin());
1567 point->Y() = _data[i]->Y();
1568 point->Alpha() = _data[i]->Alpha();
1569 point->Id() = _data[i]->Id();
1570 this->m_points.push_back(point);
1573 this->m_size = _data.
size();
1574 this->m_classes = _data.
classes();
1577 this->m_dim = _data.
dim();
1580 this->is_empty = _data.
isEmpty();
1583 this->cdist_computed = _data.cdist_computed;
1584 this->dataset_name = _data.dataset_name;
1587 template<
typename T>
1589 fnames = other.fnames;
1590 m_dim = other.m_dim;
1592 is_empty = other.is_empty;
1593 normalized = other.normalized;
1594 cdist_computed =
false;
1597 template<
typename T>
1599 size_t i, j, dim1 = data.
dim(), antsize = m_size, size1 = data.
size();
1600 std::vector<int> index1 = data.
getIndex(), antindex = index;
1601 auto points1 = data.
points();
1604 std::cerr <<
"Error: sample1 dimension must be less or equal to sample2\n";
1610 m_points.resize(m_size);
1612 for(i = antsize, j = 0; i < m_size && j < size1; i++, j++){
1613 m_points[i] = mltk::make_point<T>();
1614 m_points[i]->X() = points1[j]->X();
1615 m_points[i]->Y() = points1[j]->Y();
1616 m_points[i]->Alpha() = points1[j]->Alpha();
1617 m_points[i]->Id() = points1[j]->Id();
1622 std::vector<int> diff, classes1 = data.
classes(), classes = m_classes;
1623 std::sort(classes.begin(), classes.end());
1624 std::sort(classes1.begin(), classes1.end());
1625 std::set_difference(classes1.begin(), classes1.end(), classes.begin(), classes.end(),
1626 std::inserter(diff, diff.begin()));
1627 m_classes.reserve(m_classes.size()+diff.size());
1628 m_classes.insert(m_classes.end(), diff.begin(), diff.end());
1630 std::vector<std::string> diff_names, classes_names1 = data.
classesNames(), classes_names = class_names;
1631 std::sort(classes_names.begin(), classes_names.end());
1632 std::sort(classes_names1.begin(), classes_names1.end());
1633 std::set_difference(classes_names1.begin(), classes_names1.end(), classes_names.begin(), classes_names.end(),
1634 std::inserter(diff_names, diff_names.begin()));
1635 class_names.reserve(class_names.size()+diff_names.size());
1636 class_names.insert(class_names.end(), diff_names.begin(), diff_names.end());
1638 class_distribution.assign(m_classes.size(), 0);
1639 computeClassesDistribution();
1645 template<
typename T>
1648 int old_dim = this->dim();
1651 for(i = 0; i < m_size; ++i){
1652 for(norm = 0, j = 0; j < old_dim; ++j){
1653 norm += std::pow(fabs(m_points[i]->X()[j]),p);
1655 m_points[i]->X().resize(old_dim+1);
1656 m_points[i]->X()[j] = 1;
1657 norm += std::pow(fabs(m_points[i]->X()[j]),p);
1658 norm = std::pow(norm, 1.0/p);
1659 for(j = 0; j < old_dim+1; ++j){
1660 m_points[i]->X()[j] /= norm;
1663 m_dim = old_dim + 1;
1664 fnames.push_back(this->dim());
1669 template<
typename T>
1674 template<
typename T>
1679 template<
typename T>
1681 this->fnames = _fnames;
1684 template<
typename T>
1689 template<
typename T>
1694 template<
typename T>
1699 template<
typename T>
1704 template<
typename T>
1706 m_points = data.m_points;
1707 fnames = data.fnames;
1709 m_size = data.m_size;
1711 m_classes = data.m_classes;
1712 class_distribution = data.class_distribution;
1713 class_names = data.class_names;
1714 time_mult = data.time_mult;
1715 pos_class = data.pos_class;
1716 neg_class = data.neg_class;
1717 is_empty = data.is_empty;
1718 normalized = data.normalized;
1719 dataset_name = data.dataset_name;
1726 template<
typename T>
1732 class_names.clear();
1737 cdist_computed =
false;
1740 template<
typename T>
1745 class_names.clear();
1750 cdist_computed =
false;
1753 template<
typename T>
1755 this->index = _index;
1758 template<
typename T>
1760 index.assign(m_points.size(), 0);
1761 iota(index.begin(), index.end(), 0);
1764 template<
typename T>
1769 template<
typename T>
1771 if(m_points.size() != rhs.m_points.size())
return false;
1773 size_t i, _size = m_points.
size();
1775 for(i = 0; i < _size; i++){
1776 if(*m_points[i] != *rhs.m_points[i]){
1781 if(m_classes.size() != rhs.
classes().size())
return false;
1782 for(i = 0; i < m_classes.size(); i++){
1783 if(m_classes[i] != rhs.
classes()[i]){
1788 for(i = 0; i < class_distribution.size(); i++){
1789 if(class_distribution[i] != rhs.class_distribution[i]){
1794 return fnames == rhs.fnames &&
1795 index == rhs.index &&
1796 _size == rhs.m_size &&
1797 m_dim == rhs.m_dim &&
1798 time_mult == rhs.time_mult &&
1799 pos_class == rhs.pos_class &&
1800 neg_class == rhs.neg_class &&
1801 is_empty == rhs.is_empty &&
1802 normalized == rhs.normalized;
1805 template<
typename T>
1807 return !(rhs == *
this);
1810 template<
typename T>
1815 template<
typename T>
1820 template<
typename T>
1822 return this->class_names;
1825 template<
typename T>
1827 return this->class_distribution;
1830 template<
typename T>
1832 return this->m_classes;
1835 template<
typename T>
1837 this->m_classes = _classes;
1838 this->class_distribution.resize(_classes.size());
1839 this->cdist_computed =
false;
1842 template <
typename T>
1844 if(cdist_computed)
return;
1845 this->class_distribution = std::vector<size_t>(this->m_classes.size(), 0);
1846 for(
auto p: m_points){
1848 auto class_it = std::find(this->m_classes.begin(), this->m_classes.end(), c);
1849 size_t class_pos = class_it - this->m_classes.begin();
1850 class_distribution[class_pos]++;
1854 template<
typename T>
1855 std::vector<Data<T>>
Data<T>::splitSample(
const std::size_t &split_size,
bool stratified,
bool keepIndex,
const size_t seed) {
1856 std::multimap<std::string, mltk::PointPointer<T>> classified_objects;
1857 std::map<std::string, int> label_map;
1858 std::vector<mltk::Data<T>> partitions(split_size);
1860 size_t new_size = std::floor(
double(size()) / split_size);
1861 size_t _seed = (seed == 0) ? std::random_device{}() : seed;
1865 if(this->isClassification() && stratified) {
1866 for (
const mltk::PointPointer<T> obj : data.
points()) {
1867 classified_objects.insert({std::to_string(obj->Y()), obj});
1871 for(
const auto& label: data.
classes()) {
1872 label_map.insert({std::to_string(label), i});
1877 std::vector<int> fillIndex(split_size, 0);
1880 for (
const auto pair : classified_objects) {
1882 int i = fillIndex[label_map[pair.first]] % split_size;
1883 fillIndex[label_map[pair.first]]++;
1884 partitions[i].insertPoint(pair.second, keepIndex);
1891 for(
size_t i = 0; i < partitions.size(); i++) {
1892 for(
size_t j = 0; j < new_size; j++) {
1893 partitions[i].insertPoint(data[counter], keepIndex);
1895 if(counter == this->size()){
1980 template<
typename T>
1983 std::cerr <<
"Error [Data]: idx bigger than data size.\n";
1986 double old_value = m_points[idx]->Y();
1988 if(isClassification()){
1989 int _c = int(value);
1990 auto class_pos = std::find(m_classes.begin(), m_classes.end(), _c);
1991 if(class_pos == m_classes.end()){
1992 m_classes.push_back(_c);
1993 class_names.push_back(std::to_string(_c));
1994 class_distribution.push_back(1);
1996 auto oldclass_pos = std::find(m_classes.begin(), m_classes.end(),
int(old_value))- m_classes.begin();
1997 class_distribution[class_pos - m_classes.begin()]++;
1998 class_distribution[oldclass_pos]--;
1999 if(class_distribution[oldclass_pos] == 0){
2000 class_distribution.erase(class_distribution.begin() + oldclass_pos);
2001 m_classes.erase(m_classes.begin() + oldclass_pos);
2002 class_names.erase(class_names.begin() + oldclass_pos);
2006 m_points[idx]->Y() = value;
2010 template<
typename T>
2013 std::vector<Data<T>> class_split(m_classes.size());
2015 auto classes = data.
classes();
2016 auto points = data.
points();
2018 std::for_each(points.begin(), points.end(), [&](
const auto pointPtr){
2019 auto class_pos = std::distance(classes.begin(), std::find(classes.begin(), classes.end(), int(pointPtr->Y())));
2020 class_split[class_pos].insertPoint(*pointPtr, keepIndex);
2023 std::for_each(class_split.begin(), class_split.end(), [](
auto& split){
2024 split.computeClassesDistribution();
2030 template<
typename T>
2032 assert(samp_size <= size());
2033 std::random_device rd;
2034 std::mt19937 gen((seed == 0)? std::random_device()() : seed);
2036 std::set<std::size_t> ids;
2037 auto classes_split = splitByClasses();
2039 std::vector<std::uniform_int_distribution<size_t>> dist;
2041 auto temp_dist = classesDistribution();
2042 for(
int i = 0; i < temp_dist.size(); i++){
2043 class_dist[i] = (double(temp_dist[i]) / size()) * samp_size;
2045 for(
int i = 0; i < class_dist.
size(); i++){
2046 dist.emplace_back(0, class_dist[i]-1);
2048 for(
size_t i = 0; i < class_dist.
size(); i++){
2049 class_dist[i] = (class_dist[i] < 1) ? 1 : std::floor(class_dist[i]);
2052 for(
size_t i = 0; i < class_dist.
size(); i++){
2053 for(
size_t j = 0; j < class_dist[i]; j++){
2054 std::size_t idx = dist[i](gen);
2055 if(!with_replacement) {
2056 while (ids.find(idx) != ids.end()) {
2068 template<
typename T>
2070 std::sort(feats.begin(), feats.end());
2071 size_t _size = (size == -1)?feats.size():size;
2073 std::vector<size_t> feats_pos(feats.size());
2076 for(i = 0, j = 0, invalid=0; (i < fnames.size()) && (invalid < feats.size()) && (j < feats.size()); ){
2077 if(feats[j] < 1) { invalid++; j++;
continue; }
2078 if(fnames[i] == feats[j]){
2085 assert((j == feats.size()) &&
"There are non-existing features on remove set.");
2087 for(
auto const& point: this->m_points){
2088 auto new_point = make_point<T>(_size-invalid);
2089 for(i = 0; i < (_size-invalid); i++){
2090 (*new_point)[i] = (*point)[feats_pos[i]];
2091 (*new_point).Y() = (*point).Y();
2092 (*new_point).Id() = (*point).Id();
2093 (*new_point).Alpha() = (*point).Alpha();
2095 new_data.
setName(this->name());
2101 template<
typename T>
2106 template<
typename T>
2109 for(
int i = 0; i < m_size; i++){
2110 feat[i] = (*m_points[i])[idx];
2115 template<
typename T>
2118 for(
int i = 0; i < m_size; i++){
2119 labels[i] = m_points[i]->Y();
2124 template<
typename T>
2125 Point<double> Data<T>::labels()
const{
2129 template<
typename T>
2131 std::for_each(this->m_points.begin(),this->m_points.end(), f);
2134 template<
typename T>
2136 auto tokens = mltk::utils::tokenize(path,
'/');
2137 tokens = mltk::utils::tokenize(tokens.back(),
'.');
2138 return (tokens.empty())?std::string():tokens[0];
2141 template<
typename T>
2143 return getFeaturesNames();
Wrapper for the dataset data.
Definition: Data.hpp:74
void clear()
Clear the data.
Definition: Data.hpp:1727
bool insertPoint(const Data< T > &samples, int _index, bool keepIndex=false)
Insert a point to the data from another sample.
Definition: Data.hpp:1437
bool load(const std::string &file, bool _atEnd)
Load a dataset from a file.
Definition: Data.hpp:656
double getTime_mult() const
Return the time multiplier.
Definition: Data.hpp:1765
void copy(const Data< T > &_data)
Makes a deep copy from another data object.
Definition: Data.hpp:1558
std::vector< Data< T > > splitSample(const std::size_t &split_size, bool stratified=true, bool keepIndex=false, size_t seed=0)
Split the data by a given size.
Definition: Data.hpp:1855
void apply(std::function< void(mltk::PointPointer< T > point)> f)
Apply a function to all points on the dataset.
Definition: Data.hpp:2130
void setClassesAtEnd(bool atEnd)
Inform if the classes of the dataset to be loaded are at the beginning or the end of the attributes.
Definition: Data.hpp:292
void classesCopy(const Data< T > &_data, std::vector< int > &classes)
Makes a deep copy from another data object.
Definition: Data.hpp:1520
std::vector< bool > removePoints(std::vector< int > ids)
Remove several points from the sample.
Definition: Data.hpp:1252
Data(const char *dataset, const char *datasetType, const char *pos_class="1", const char *neg_class="-1")
Data constructor to load a dataset from a file.
Definition: Data.hpp:581
void join(const Data< T > &data)
Merge one dataset with another.
Definition: Data.hpp:1598
bool load(const std::string &file)
Load a dataset from a file.
Definition: Data.hpp:629
PointPointer< T > point(int index) const
Returns a shared pointer to the point with the given index.
Definition: Data.hpp:1510
size_t size() const
Returns the size of the dataset.
Definition: Data.hpp:208
bool removeFeatures(std::vector< int > feats)
Remove features from the dataset.
Definition: Data.hpp:1368
void resetIndex()
Reset the index vector.
Definition: Data.hpp:1759
void changeXVector(std::vector< int > index)
Change the x vector of a sample.
Definition: Data.hpp:1495
void setFeaturesNames(const std::vector< int > &fnames)
setFeaturesNames Set the name of the features of the data.
Definition: Data.hpp:1680
void copyZero(const Data< T > &other)
Returns a copy of the data with zero points.
Definition: Data.hpp:1588
std::string name() const
The name of the dataset is defined as the name of the file from which it was loaded.
Definition: Data.hpp:203
bool isNormalized() const
Returns whether the dataset is normalized.
Definition: Data.hpp:355
Data< T > sampling(const size_t &samp_size, bool with_replacement=true, const int &seed=0)
Sample the dataset with the given size.
Definition: Data.hpp:2031
const std::string & getType() const
Returns the type of the dataset.
Definition: Data.hpp:1816
void write(const std::string &fname, std::string ext)
write Write the data to a file with the given extension.
Definition: Data.hpp:1215
void setClassesNames(const std::vector< std::string > class_names)
Set classes names.
Definition: Data.hpp:282
void setIndex(std::vector< int > index)
Set the index vector for the data.
Definition: Data.hpp:1754
const std::vector< int > classes() const
Returns a vector containing the numeric values of the classes.
Definition: Data.hpp:1831
bool updatePointValue(const size_t &idx, double value)
Updates a Point value.
Definition: Data.hpp:1981
bool insertPoint(Point< T > p, bool keepIndex=false)
Insert a point to the end of points vector.
Definition: Data.hpp:1490
Data(const std::string &dataset, bool atEnd)
Definition: Data.hpp:553
void setName(const std::string &name)
Set the name of the dataset.
Definition: Data.hpp:277
Data(size_t size, size_t dim, T val=T())
Constructor for instantiation with an initial size and dimension.
Definition: Data.hpp:560
Data(const char *dataset)
Constructor for empty data.
Definition: Data.hpp:546
std::vector< int > getFeaturesNames() const
Returns the features names.
Definition: Data.hpp:1675
Data< T > insertFeatures(std::vector< int > ins_feat, bool keepIndex=false)
insertFeatures Returns Data object with only features in array.
Definition: Data.hpp:1297
bool removePoint(int pid)
Remove a point from the data.
Definition: Data.hpp:1183
bool isEmpty() const
Returns if there's a dataset loaded.
Definition: Data.hpp:1700
bool insertPoint(std::shared_ptr< Point< T > > p, bool keepIndex=false)
Insert a point to the end of points vector.
Definition: Data.hpp:1449
void setPoint(int index, std::shared_ptr< Point< T > > p)
setPoint Set the point in a position of the data.
Definition: Data.hpp:1515
void setClasses(const std::vector< int > &classes)
Set the classes to use in the dataset.
Definition: Data.hpp:1836
std::vector< int > getIndex() const
Returns the vector of indexes.
Definition: Data.hpp:1695
size_t dim() const
Returns the dimension of the dataset.
Definition: Data.hpp:213
void setDim(size_t dim)
setDim Set the dimension of the points.
Definition: Data.hpp:1670
void normalize(double p=2)
normalize Normalize the dataset using a Lp-norm.
Definition: Data.hpp:1646
std::vector< std::shared_ptr< Point< T > > > points()
Returns the vector of shared pointers to the Points of the sample.
Definition: Data.hpp:1685
void computeClassesDistribution()
Compute the frequency of each class in the dataset.
Definition: Data.hpp:1843
std::vector< Data< T > > splitByClasses(bool keepIndex=false)
Split the dataset by its labels. Only valid for classification datasets.
Definition: Data.hpp:2011
Data< T > removeFeatures(std::vector< int > feats, int fsize) const
Remove several features from the sample.
Definition: Data.hpp:1359
std::vector< std::shared_ptr< Point< T > > > points() const
Returns the vector of shared pointers to the Points of the sample.
Definition: Data.hpp:1690
mltk::Data< T > copy() const
Returns a copy of itself.
Definition: Data.hpp:1551
Data< T > selectFeatures(std::vector< size_t > feats, int size=-1)
Returns a Data object with selected features.
Definition: Data.hpp:2069
Point< T > getFeature(int index) const
Get the values of a feature from all points.
Definition: Data.hpp:2107
std::vector< std::string > classesNames() const
Returns a vector containing the name of the classes. Only valid for classification datasets.
Definition: Data.hpp:1821
void shuffle(const size_t &seed=42)
Shuffle the data with a given seed.
Definition: Data.hpp:1349
std::vector< size_t > classesDistribution() const
Returns a vector containing the frequency of the classes. Only valid for classification datasets.
Definition: Data.hpp:1826
bool isClassification() const
Inform if the dataset is used for classification.
Definition: Data.hpp:198
std::size_t size() const
Returns the dimension of the point.
Definition: Point.hpp:133
UFJF-MLTK main namespace for core functionalities.
Definition: classifier/Classifier.hpp:11
DataPointer< T > make_data(Types... args)
Makes a shared pointer to a data object.
Definition: Data.hpp:532
T max(const Point< T, R > &p)
Returns the max value of the point.
Definition: Point.hpp:544