![]() |
UFJF - Machine Learning Toolkit
0.51.8
|
Wrapper for the dataset data. More...
#include <Data.hpp>
Public Member Functions | |
const std::string & | getType () const |
Returns the type of the dataset. More... | |
void | setType (const std::string &type) |
Data (const Data< T > &other) | |
Data (const std::string &dataset, bool atEnd) | |
Data (const char *dataset, const char *datasetType, const char *pos_class="1", const char *neg_class="-1") | |
Data constructor to load a dataset from a file. More... | |
Data (size_t size, size_t dim, T val=T()) | |
Constructor for instantiation with an initial size and dimension. More... | |
Data (const char *dataset) | |
Constructor for empty data. More... | |
bool | isClassification () const |
Inform if the dataset is used for classification. More... | |
std::string | name () const |
The name of the dataset is defined as the name of the file from which it was loaded. More... | |
size_t | size () const |
Returns the size of the dataset. More... | |
size_t | dim () const |
Returns the dimension of the dataset. More... | |
std::vector< std::shared_ptr< Point< T > > > | points () |
Returns the vector of shared pointers to the Points of the sample. More... | |
std::vector< std::shared_ptr< Point< T > > > | points () const |
Returns the vector of shared pointers to the Points of the sample. More... | |
const std::vector< int > | classes () const |
Returns a vector containing the numeric values of the classes. More... | |
PointPointer< T > | point (int index) const |
Returns a shared pointer to the point with the given index. More... | |
Point< T > | getFeature (int index) const |
Get the values of a feature from all points. More... | |
Point< double > | getLabels () const |
Point< double > | labels () const |
std::vector< size_t > | classesDistribution () const |
Returns a vector containing the frequency of the classes. Only valid for classification datasets. More... | |
std::vector< std::string > | classesNames () const |
Returns a vector containing the name of the classes. Only valid for classification datasets. More... | |
std::vector< int > | getFeaturesNames () const |
Returns the features names. More... | |
mltk::Point< int > | featuresNames () const |
std::vector< int > | getIndex () const |
Returns the vector of indexes. More... | |
double | getTime_mult () const |
Return the time multiplier. More... | |
void | setName (const std::string &name) |
Set the name of the dataset. More... | |
void | setClassesNames (const std::vector< std::string > class_names) |
Set classes names. More... | |
void | setClasses (const std::vector< int > &classes) |
Set the classes to use in the dataset. More... | |
void | setClassesAtEnd (bool atEnd) |
Inform if the classes of the dataset to be loaded are at the beginning or end of the attributes. More... | |
void | setPoint (int index, std::shared_ptr< Point< T > > p) |
setPoint Set the point in a position of the data. More... | |
void | setFeaturesNames (const std::vector< int > &fnames) |
setFeaturesNames Set the name of the features of the data. More... | |
void | setIndex (std::vector< int > index) |
Set the index vector for the data. More... | |
void | setDim (size_t dim) |
setDim Set the dimension of the points. More... | |
void | computeClassesDistribution () |
Compute the frequency of each class in the dataset. More... | |
void | shuffle (const size_t &seed=42) |
Shuffle the data with a given seed. More... | |
bool | load (const std::string &file) |
Load a dataset from a file. More... | |
bool | load (const std::string &file, bool _atEnd) |
Load a dataset from a file. More... | |
void | write (const std::string &fname, std::string ext) |
write Write the data to a file with the given extension. More... | |
bool | isEmpty () const |
Returns if there's a dataset loaded. More... | |
bool | isNormalized () const |
Returns if the dataset is normalized. More... | |
void | clear () |
clear Clear the data. More... | |
void | classesCopy (const Data< T > &_data, std::vector< int > &classes) |
Makes a deep copy from another data object. More... | |
mltk::Data< T > | copy () const |
Returns a copy of itself. More... | |
void | copy (const Data< T > &_data) |
Makes a deep copy from another data object. More... | |
void | copyZero (const Data< T > &other) |
Returns a copy of the data with zero points. More... | |
std::vector< Data< T > > | splitByClasses (bool keepIndex=false) |
Split the dataset by its labels. Only valid for classification datasets. More... | |
std::vector< Data< T > > | splitSample (const std::size_t &split_size, bool stratified=true, bool keepIndex=false, size_t seed=0) |
Split the data by a given size. More... | |
Data< T > | selectFeatures (std::vector< size_t > feats, int size=-1) |
Returns a Data object with selected features. More... | |
Data< T > | sampling (const size_t &samp_size, bool with_replacement=true, const int &seed=0) |
Sample the dataset with the given size. More... | |
void | apply (std::function< void(mltk::PointPointer< T > point)> f) |
Apply a function to all points on the dataset. More... | |
void | join (const Data< T > &data) |
Merge one dataset with another. More... | |
bool | insertPoint (const Data< T > &samples, int _index, bool keepIndex=false) |
Insert a point to the data from another sample. More... | |
bool | insertPoint (std::shared_ptr< Point< T > > p, bool keepIndex=false) |
Insert a point to the end of points vector. More... | |
bool | insertPoint (Point< T > p, bool keepIndex=false) |
Insert a point to the end of points vector. More... | |
std::vector< bool > | removePoints (std::vector< int > ids) |
Remove several points from the sample. More... | |
bool | removePoint (int pid) |
Remove a point from the data. More... | |
Data< T > | insertFeatures (std::vector< int > ins_feat, bool keepIndex=false) |
insertFeatures Returns Data object with only features in array. More... | |
Data< T > | removeFeatures (std::vector< int > feats, int fsize) const |
Remove several features from the sample. More... | |
bool | removeFeatures (std::vector< int > feats) |
Remove features from the dataset. More... | |
bool | updatePointValue (const size_t &idx, double value) |
Updates a Point value. More... | |
void | changeXVector (std::vector< int > index) |
Change the x vector of a sample. More... | |
void | normalize (double p=2) |
normalize Normalize the dataset using a Lp-norm. More... | |
void | resetIndex () |
Reset the index vector. More... | |
SampleIterator< T > | begin () |
SampleIterator< T > | end () |
ConstSampleIterator< T > | begin () const |
ConstSampleIterator< T > | end () const |
std::shared_ptr< Point< T > > | operator[] (size_t i) const |
std::shared_ptr< Point< T > > & | operator[] (size_t i) |
Point< T > | operator() (size_t i) const |
Point< T > & | operator() (size_t i) |
Data< T > & | operator= (const Data< T > &) |
bool | operator== (const Data< T > &rhs) const |
bool | operator!= (const Data< T > &rhs) const |
Friends | |
template<typename U > | |
std::ostream & | operator<< (std::ostream &output, const Data< U > &data) |
Wrapper for the dataset data.
mltk::Data< T >::Data | ( | const std::string & | dataset, |
bool | atEnd | ||
) |
dataset | Dataset path. |
atEnd | Indicates if labels are at end of the columns of the dataset. |
|
explicit |
Data constructor to load a dataset from a file.
dataset | Path to the dataset to be loaded. |
pos_class | String representing the positive class on the dataset. |
neg_class | String representing the negative class on the dataset. |
mltk::Data< T >::Data | ( | size_t | size, |
size_t | dim, | ||
T | val = T() |
||
) |
Constructor for instantiation with an initial size and dimension.
size | The initial size of the data. |
dim | The initial dimension of the data. |
val | The default value for the initialization, if not given, it'll call the default constructor of T. |
|
explicit |
Constructor for empty data.
pos_class | String representing the positive class on the dataset. |
neg_class | String representing the negative class on the dataset. |
void mltk::Data< T >::apply | ( | std::function< void(mltk::PointPointer< T > point)> | f | ) |
Apply a function to all points on the dataset.
f | Function to be applied to the data, must receive a PointPointer. |
void mltk::Data< T >::changeXVector | ( | std::vector< int > | index | ) |
Change the x vector of a sample.
index | Indexes of the change to be made. |
const std::vector< int > mltk::Data< T >::classes |
Returns a vector containing the numeric values of the classes.
void mltk::Data< T >::classesCopy | ( | const Data< T > & | _data, |
std::vector< int > & | classes | ||
) |
Makes a deep copy from another data object.
std::vector< size_t > mltk::Data< T >::classesDistribution |
Returns a vector containing the frequency of the classes. Only valid for classification datasets.
std::vector< std::string > mltk::Data< T >::classesNames |
Returns a vector containing the name of the classes. Only valid for classification datasets.
void mltk::Data< T >::clear |
clear Clear the data.
void mltk::Data< T >::computeClassesDistribution |
Compute the frequency of each class in the dataset.
Data< T > mltk::Data< T >::copy |
Returns a copy of itself.
void mltk::Data< T >::copy | ( | const Data< T > & | _data | ) |
Makes a deep copy from another data object.
void mltk::Data< T >::copyZero | ( | const Data< T > & | other | ) |
Returns a copy of the data with zero points.
|
inline |
Returns the dimension of the dataset.
Point< T > mltk::Data< T >::getFeature | ( | int | index | ) | const |
Get the values of a feature from all points.
index | Index of the feature on the dataset. |
std::vector< int > mltk::Data< T >::getFeaturesNames |
Returns the features names.
std::vector< int > mltk::Data< T >::getIndex |
Returns the vector of indexes.
double mltk::Data< T >::getTime_mult |
Return the time multiplier.
const std::string & mltk::Data< T >::getType |
Returns the type of the dataset.
mltk::Data< T > mltk::Data< T >::insertFeatures | ( | std::vector< int > | ins_feat, |
bool | keepIndex = false |
||
) |
bool mltk::Data< T >::insertPoint | ( | const Data< T > & | samples, |
int | _index, | ||
bool | keepIndex = false |
||
) |
Insert a point to the data from another sample.
samples | Sample with the point to be added. |
_index | Index of the point to be added. |
bool mltk::Data< T >::insertPoint | ( | Point< T > | p, |
bool | keepIndex = false |
||
) |
bool mltk::Data< T >::insertPoint | ( | std::shared_ptr< Point< T > > | p, |
bool | keepIndex = false |
||
) |
|
inline |
Inform if the dataset is used for classification.
bool mltk::Data< T >::isEmpty |
Returns if there's a dataset loaded.
|
inline |
Returns if the dataset is normalized.
void mltk::Data< T >::join | ( | const Data< T > & | data | ) |
Merge one dataset with another.
data | Dataset to be joined. |
bool mltk::Data< T >::load | ( | const std::string & | file | ) |
Load a dataset from a file.
file | Path to dataset file. |
bool mltk::Data< T >::load | ( | const std::string & | file, |
bool | _atEnd | ||
) |
Load a dataset from a file.
file | Path to dataset file. |
|
inline |
The name of the dataset is defined as the name of the file from which it was loaded.
void mltk::Data< T >::normalize | ( | double | p = 2 | ) |
normalize Normalize the dataset using a Lp-norm.
p | Norm to be utilized. |
PointPointer< T > mltk::Data< T >::point | ( | int | index | ) | const |
Returns a shared pointer to the point with the given index.
index | Position of a point in the points array. |
std::vector< std::shared_ptr< Point< T > > > mltk::Data< T >::points |
Returns the vector of shared pointers to the Points of the sample.
std::vector< std::shared_ptr< Point< T > > > mltk::Data< T >::points |
Returns the vector of shared pointers to the Points of the sample.
bool mltk::Data< T >::removeFeatures | ( | std::vector< int > | feats | ) |
Remove features from the dataset.
feats | Features to be removed. |
Data< T > mltk::Data< T >::removeFeatures | ( | std::vector< int > | feats, |
int | fsize | ||
) | const |
Remove several features from the sample.
feats | Names of the features to be removed (must be sorted). |
bool mltk::Data< T >::removePoint | ( | int | pid | ) |
Remove a point from the data.
pid | Index of the point to be removed. |
std::vector< bool > mltk::Data< T >::removePoints | ( | std::vector< int > | ids | ) |
Remove several points from the sample.
ids | Ids of the points to be removed (must be sorted). |
void mltk::Data< T >::resetIndex |
Reset the index vector.
Data< T > mltk::Data< T >::sampling | ( | const size_t & | samp_size, |
bool | with_replacement = true , |
||
const int & | seed = 0 |
||
) |
Sample the dataset with the given size.
samp_size | Sampling size. |
with_replacement | Tells if sampling must be made with replacement. |
seed | Random generator seed. |
Data< T > mltk::Data< T >::selectFeatures | ( | std::vector< size_t > | feats, |
int | size = -1 |
||
) |
void mltk::Data< T >::setClasses | ( | const std::vector< int > & | classes | ) |
Set the classes to use in the dataset.
classes | Vector of classes that will be in the dataset. |
|
inline |
Inform if the classes of the dataset to be loaded are at the beginning or end of the attributes.
atEnd | Boolean informing if classes are at end. |
|
inline |
Set classes names.
class_names | Strings representing the classes names. |
void mltk::Data< T >::setDim | ( | size_t | dim | ) |
setDim Set the dimension of the points.
dim | Dimension to be set. |
void mltk::Data< T >::setFeaturesNames | ( | const std::vector< int > & | fnames | ) |
setFeaturesNames Set the name of the features of the data.
fnames | Name of the features. |
void mltk::Data< T >::setIndex | ( | std::vector< int > | index | ) |
Set the index vector for the data.
index | Index vector. |
|
inline |
Set the name of the dataset.
name | Name of the dataset. |
void mltk::Data< T >::setPoint | ( | int | index, |
std::shared_ptr< Point< T > > | p | ||
) |
setPoint Set the point in a position of the data.
index | Index of the point that will be set. |
p | Point to be set. |
void mltk::Data< T >::shuffle | ( | const size_t & | seed = 42 | ) |
Shuffle the data with a given seed.
seed | Seed given for randomization. |
|
inline |
Returns the size of the dataset.
std::vector< Data< T > > mltk::Data< T >::splitByClasses | ( | bool | keepIndex = false | ) |
Split the dataset by its labels. Only valid for classification datasets.
std::vector< Data< T > > mltk::Data< T >::splitSample | ( | const std::size_t & | split_size, |
bool | stratified = true , |
||
bool | keepIndex = false , |
||
size_t | seed = 0 |
||
) |
Split the data by a given size.
split_size | Number of samples on each split. |
stratified | If true, the split will be done in a stratified manner. |
seed | Random generator seed. |
bool mltk::Data< T >::updatePointValue | ( | const size_t & | idx, |
double | value | ||
) |
Updates a Point value.
idx | Index of the point to be updated. |
value | New value of the point. |
void mltk::Data< T >::write | ( | const std::string & | fname, |
std::string | ext | ||
) |
write Write the data to a file with the given extension.
fname | Name of the file. |
ext | Extension of the file. |