UFJF - Machine Learning Toolkit  0.51.8
mltk::Data< T > Class Template Reference

Wrapper for the dataset data. More...

#include <Data.hpp>

Public Member Functions

const std::string & getType () const
 Returns the type of the dataset. More...
 
void setType (const std::string &type)
 
 Data (const Data< T > &other)
 
 Data (const std::string &dataset, bool atEnd)
 
 Data (const char *dataset, const char *datasetType, const char *pos_class="1", const char *neg_class="-1")
 Data constructor to load a dataset from a file. More...
 
 Data (size_t size, size_t dim, T val=T())
 Constructor for instantiation with an initial size and dimension. More...
 
 Data (const char *dataset)
 Constructor for empty data. More...
 
bool isClassification () const
 Inform if the dataset is used for classification. More...
 
std::string name () const
 The name of the dataset is defined as the name of the file from which it was loaded. More...
 
size_t size () const
 Returns the size of the dataset. More...
 
size_t dim () const
 Returns the dimension of the dataset. More...
 
std::vector< std::shared_ptr< Point< T > > > points ()
 Returns the vector of shared pointers to the Points of the sample. More...
 
std::vector< std::shared_ptr< Point< T > > > points () const
 Returns the vector of shared pointers to the Points of the sample. More...
 
const std::vector< int > classes () const
 Returns a vector containing the numeric values of the classes. More...
 
PointPointer< T > point (int index) const
 Returns a shared pointer to the point with the given index. More...
 
Point< T > getFeature (int index) const
 Get the values of a feature from all points. More...
 
Point< double > getLabels () const
 
Point< double > labels () const
 
std::vector< size_t > classesDistribution () const
 Returns a vector containing the frequency of the classes. Only valid for classification datasets. More...
 
std::vector< std::string > classesNames () const
 Returns a vector containing the name of the classes. Only valid for classification datasets. More...
 
std::vector< int > getFeaturesNames () const
 Returns the features names. More...
 
mltk::Point< int > featuresNames () const
 
std::vector< int > getIndex () const
 Returns the vector of indexes. More...
 
double getTime_mult () const
 Return the time multiplier. More...
 
void setName (const std::string &name)
 Set the name of the dataset. More...
 
void setClassesNames (const std::vector< std::string > class_names)
 Set classes names. More...
 
void setClasses (const std::vector< int > &classes)
 Set the classes to use in the dataset. More...
 
void setClassesAtEnd (bool atEnd)
 Inform if the classes of the dataset to be loaded are at the beginning or end of the attributes. More...
 
void setPoint (int index, std::shared_ptr< Point< T > > p)
 setPoint Set the point in a position of the data. More...
 
void setFeaturesNames (const std::vector< int > &fnames)
 setFeaturesNames Set the name of the features of the data. More...
 
void setIndex (std::vector< int > index)
 Set the index vector for the data. More...
 
void setDim (size_t dim)
 setDim Set the dimension of the points. More...
 
void computeClassesDistribution ()
 Compute the frequency of each class in the dataset. More...
 
void shuffle (const size_t &seed=42)
 Shuffle the data with a given seed. More...
 
bool load (const std::string &file)
 Load a dataset from a file. More...
 
bool load (const std::string &file, bool _atEnd)
 Load a dataset from a file. More...
 
void write (const std::string &fname, std::string ext)
 write Write the data to a file with the given extension. More...
 
bool isEmpty () const
 Returns if there's a dataset loaded. More...
 
bool isNormalized () const
 Returns if the dataset is normalized. More...
 
void clear ()
 clear Clear the data. More...
 
void classesCopy (const Data< T > &_data, std::vector< int > &classes)
 Makes a deep copy from another data object. More...
 
mltk::Data< T > copy () const
 Returns a copy of itself. More...
 
void copy (const Data< T > &_data)
 Makes a deep copy from another data object. More...
 
void copyZero (const Data< T > &other)
 Returns a copy of the data with zero points. More...
 
std::vector< Data< T > > splitByClasses (bool keepIndex=false)
 Split the dataset by its labels. Only valid for classification datasets. More...
 
std::vector< Data< T > > splitSample (const std::size_t &split_size, bool stratified=true, bool keepIndex=false, size_t seed=0)
 Split the data by a given size. More...
 
Data< T > selectFeatures (std::vector< size_t > feats, int size=-1)
 Returns a Data object with selected features. More...
 
Data< T > sampling (const size_t &samp_size, bool with_replacement=true, const int &seed=0)
 Sample the dataset with the given size. More...
 
void apply (std::function< void(mltk::PointPointer< T > point)> f)
 Apply a function to all points on the dataset. More...
 
void join (const Data< T > &data)
 Merge one dataset with another. More...
 
bool insertPoint (const Data< T > &samples, int _index, bool keepIndex=false)
 Insert a point to the data from another sample. More...
 
bool insertPoint (std::shared_ptr< Point< T > > p, bool keepIndex=false)
 Insert a point to the end of points vector. More...
 
bool insertPoint (Point< T > p, bool keepIndex=false)
 Insert a point to the end of points vector. More...
 
std::vector< bool > removePoints (std::vector< int > ids)
 Remove several points from the sample. More...
 
bool removePoint (int pid)
 Remove a point from the data. More...
 
Data< T > insertFeatures (std::vector< int > ins_feat, bool keepIndex=false)
 insertFeatures Returns Data object with only features in array. More...
 
Data< T > removeFeatures (std::vector< int > feats, int fsize) const
 Remove several features from the sample. More...
 
bool removeFeatures (std::vector< int > feats)
 Remove features from the dataset. More...
 
bool updatePointValue (const size_t &idx, double value)
 Updates a Point value. More...
 
void changeXVector (std::vector< int > index)
 Change the x vector of a sample. More...
 
void normalize (double p=2)
 normalize Normalize the dataset using a Lp-norm. More...
 
void resetIndex ()
 Reset the index vector. More...
 
SampleIterator< T > begin ()
 
SampleIterator< T > end ()
 
ConstSampleIterator< T > begin () const
 
ConstSampleIterator< T > end () const
 
std::shared_ptr< Point< T > > operator[] (size_t i) const
 
std::shared_ptr< Point< T > > & operator[] (size_t i)
 
Point< T > operator() (size_t i) const
 
Point< T > & operator() (size_t i)
 
Data< T > & operator= (const Data< T > &)
 
bool operator== (const Data< T > &rhs) const
 
bool operator!= (const Data< T > &rhs) const
 

Friends

template<typename U >
std::ostream & operator<< (std::ostream &output, const Data< U > &data)
 

Detailed Description

template<typename T = double>
class mltk::Data< T >

Wrapper for the dataset data.

Constructor & Destructor Documentation

◆ Data() [1/4]

template<typename T >
mltk::Data< T >::Data ( const std::string &  dataset,
bool  atEnd 
)
Parameters
dataset: Dataset path.
atEnd: Indicates if labels are at the end of the columns of the dataset.

◆ Data() [2/4]

template<typename T >
mltk::Data< T >::Data ( const char *  dataset,
const char *  datasetType,
const char *  pos_class = "1",
const char *  neg_class = "-1" 
)
explicit

Data constructor to load a dataset from a file.

Parameters
dataset: Path to the dataset to be loaded.
pos_class: String representing the positive class on the dataset.
neg_class: String representing the negative class on the dataset.

◆ Data() [3/4]

template<typename T >
mltk::Data< T >::Data ( size_t  size,
size_t  dim,
T  val = T() 
)

Constructor for instantiation with an initial size and dimension.

Parameters
size: The initial size of the data.
dim: The initial dimension of the data.
val: The default value for the initialization; if not given, the default constructor of T is used.

◆ Data() [4/4]

template<typename T >
mltk::Data< T >::Data ( const char *  dataset)
explicit

Constructor for empty data.

Parameters
pos_class: String representing the positive class on the dataset.
neg_class: String representing the negative class on the dataset.

Member Function Documentation

◆ apply()

template<typename T >
void mltk::Data< T >::apply ( std::function< void(mltk::PointPointer< T > point)>  f)

Apply a function to all points on the dataset.

Parameters
f: Function to be applied to the data; must receive a PointPointer.

◆ changeXVector()

template<typename T >
void mltk::Data< T >::changeXVector ( std::vector< int >  index)

Change the x vector of a sample.

Parameters
index: Indexes of the change to be made.
Returns
void

◆ classes()

template<typename T >
const std::vector< int > mltk::Data< T >::classes

Returns a vector containing the numeric values of the classes.

Returns
std::vector<int>

◆ classesCopy()

template<typename T >
void mltk::Data< T >::classesCopy ( const Data< T > &  _data,
std::vector< int > &  classes 
)

Makes a deep copy from another data object.

Returns
void

◆ classesDistribution()

template<typename T >
std::vector< size_t > mltk::Data< T >::classesDistribution

Returns a vector containing the frequency of the classes. Only valid for classification datasets.

Returns
std::vector<size_t> containing the distribution of the classes.

◆ classesNames()

template<typename T >
std::vector< std::string > mltk::Data< T >::classesNames

Returns a vector containing the name of the classes. Only valid for classification datasets.

Returns
std::vector<std::string> containing the names of the classes.

◆ clear()

template<typename T >
void mltk::Data< T >::clear

clear Clear the data.

◆ computeClassesDistribution()

template<typename T >
void mltk::Data< T >::computeClassesDistribution

Compute the frequency of each class in the dataset.

◆ copy() [1/2]

template<typename T >
Data< T > mltk::Data< T >::copy

Returns a copy of itself.

Returns
Data

◆ copy() [2/2]

template<typename T >
void mltk::Data< T >::copy ( const Data< T > &  _data)

Makes a deep copy from another data object.

Returns
void

◆ copyZero()

template<typename T >
void mltk::Data< T >::copyZero ( const Data< T > &  other)

Returns a copy of the data with zero points.

Returns
void

◆ dim()

template<typename T = double>
size_t mltk::Data< T >::dim ( ) const
inline

Returns the dimension of the dataset.

Returns
size_t

◆ getFeature()

template<typename T >
Point< T > mltk::Data< T >::getFeature ( int  index) const

Get the values of a feature from all points.

Parameters
index: Index of the feature on the dataset.
Returns
Point containing feature values.

◆ getFeaturesNames()

template<typename T >
std::vector< int > mltk::Data< T >::getFeaturesNames

Returns the features names.

Returns
std::vector<int>

◆ getIndex()

template<typename T >
std::vector< int > mltk::Data< T >::getIndex

Returns the vector of indexes.

Returns
std::vector<int>

◆ getTime_mult()

template<typename T >
double mltk::Data< T >::getTime_mult

Return the time multiplier.

Returns
double

◆ getType()

template<typename T >
const std::string & mltk::Data< T >::getType

Returns the type of the dataset.

Returns
std::string

◆ insertFeatures()

template<typename T >
mltk::Data< T > mltk::Data< T >::insertFeatures ( std::vector< int >  ins_feat,
bool  keepIndex = false 
)

insertFeatures Returns Data object with only features in array.

Parameters
ins_feat: Array with features that will be in the Data object.
Returns
Data object; if it is empty, something went wrong.

◆ insertPoint() [1/3]

template<typename T >
bool mltk::Data< T >::insertPoint ( const Data< T > &  samples,
int  _index,
bool  keepIndex = false 
)

Insert a point to the data from another sample.

Parameters
samples: Sample with the point to be added.
_index: Index of the point to be added.
Returns
bool

◆ insertPoint() [2/3]

template<typename T >
bool mltk::Data< T >::insertPoint ( Point< T >  p,
bool  keepIndex = false 
)

Insert a point to the end of points vector.

Parameters
p: Point to be inserted.
Returns
bool

◆ insertPoint() [3/3]

template<typename T >
bool mltk::Data< T >::insertPoint ( std::shared_ptr< Point< T > >  p,
bool  keepIndex = false 
)

Insert a point to the end of points vector.

Parameters
p: Point to be inserted.
Returns
bool

◆ isClassification()

template<typename T = double>
bool mltk::Data< T >::isClassification ( ) const
inline

Inform if the dataset is used for classification.

◆ isEmpty()

template<typename T >
bool mltk::Data< T >::isEmpty

Returns if there's a dataset loaded.

Returns
bool

◆ isNormalized()

template<typename T = double>
bool mltk::Data< T >::isNormalized ( ) const
inline

Returns if the dataset is normalized.

Returns
bool

◆ join()

template<typename T >
void mltk::Data< T >::join ( const Data< T > &  data)

Merge one dataset with another.

Parameters
data: Dataset to be joined.
Returns
void

◆ load() [1/2]

template<typename T >
bool mltk::Data< T >::load ( const std::string &  file)

Load a dataset from a file.

Parameters
file: Path to dataset file.
Returns
bool

◆ load() [2/2]

template<typename T >
bool mltk::Data< T >::load ( const std::string &  file,
bool  _atEnd 
)

Load a dataset from a file.

Parameters
file: Path to dataset file.
Returns
bool

◆ name()

template<typename T = double>
std::string mltk::Data< T >::name ( ) const
inline

The name of the dataset is defined as the name of the file from which it was loaded.

Returns
Name of the dataset.

◆ normalize()

template<typename T >
void mltk::Data< T >::normalize ( double  p = 2)

normalize Normalize the dataset using a Lp-norm.

Parameters
p: Norm to be utilized.

◆ point()

template<typename T >
PointPointer< T > mltk::Data< T >::point ( int  index) const

Returns a shared pointer to the point with the given index.

Parameters
index: Position of a point in the points array.
Returns
PointPointer< T >

◆ points() [1/2]

template<typename T >
std::vector< std::shared_ptr< Point< T > > > mltk::Data< T >::points

Returns the vector of shared pointers to the Points of the sample.

Returns
std::vector<std::shared_ptr<Point< T > > >

◆ points() [2/2]

template<typename T >
std::vector< std::shared_ptr< Point< T > > > mltk::Data< T >::points

Returns the vector of shared pointers to the Points of the sample.

Returns
std::vector<std::shared_ptr<Point< T > > >

◆ removeFeatures() [1/2]

template<typename T >
bool mltk::Data< T >::removeFeatures ( std::vector< int >  feats)

Remove features from the dataset.

Parameters
feats: Features to be removed.
Returns
Success of feature removal.

◆ removeFeatures() [2/2]

template<typename T >
Data< T > mltk::Data< T >::removeFeatures ( std::vector< int >  feats,
int  fsize 
) const

Remove several features from the sample.

Parameters
feats: Names of the features to be removed (must be sorted).
Returns
boolean informing if all features were successfully removed.

◆ removePoint()

template<typename T >
bool mltk::Data< T >::removePoint ( int  pid)

Remove a point from the data.

Parameters
pid: Index of the point to be removed.
Returns
bool

◆ removePoints()

template<typename T >
std::vector< bool > mltk::Data< T >::removePoints ( std::vector< int >  ids)

Remove several points from the sample.

Parameters
ids: Ids of the points to be removed (must be sorted).
Returns
booleans informing which points were removed successfully.

◆ resetIndex()

template<typename T >
void mltk::Data< T >::resetIndex

Reset the index vector.

◆ sampling()

template<typename T >
Data< T > mltk::Data< T >::sampling ( const size_t &  samp_size,
bool  with_replacement = true,
const int &  seed = 0 
)

Sample the dataset with the given size.

Parameters
samp_size: Sampling size.
with_replacement: Tells if sampling must be made with replacement.
seed: Random generator seed.
Returns
Data object with sampled data.

◆ selectFeatures()

template<typename T >
Data< T > mltk::Data< T >::selectFeatures ( std::vector< size_t >  feats,
int  size = -1 
)

Returns a Data object with selected features.

Parameters
feats: Features to be selected from the dataset.
size
Returns
Data object with selected features.

◆ setClasses()

template<typename T >
void mltk::Data< T >::setClasses ( const std::vector< int > &  classes)

Set the classes to use in the dataset.

Parameters
classes: Vector of classes that will be in the dataset.

◆ setClassesAtEnd()

template<typename T = double>
void mltk::Data< T >::setClassesAtEnd ( bool  atEnd)
inline

Inform if the classes of the dataset to be loaded are at the beginning or end of the attributes.

Parameters
atEnd: Boolean informing if classes are at end.

◆ setClassesNames()

template<typename T = double>
void mltk::Data< T >::setClassesNames ( const std::vector< std::string >  class_names)
inline

Set classes names.

Parameters
class_names: Strings representing the classes names.

◆ setDim()

template<typename T >
void mltk::Data< T >::setDim ( size_t  dim)

setDim Set the dimension of the points.

Parameters
dim: Dimension to be set.

◆ setFeaturesNames()

template<typename T >
void mltk::Data< T >::setFeaturesNames ( const std::vector< int > &  fnames)

setFeaturesNames Set the name of the features of the data.

Parameters
fnames: Name of the features.

◆ setIndex()

template<typename T >
void mltk::Data< T >::setIndex ( std::vector< int >  index)

Set the index vector for the data.

Parameters
index: Index vector.

◆ setName()

template<typename T = double>
void mltk::Data< T >::setName ( const std::string &  name)
inline

Set the name of the dataset.

Parameters
name: Name of the dataset.

◆ setPoint()

template<typename T >
void mltk::Data< T >::setPoint ( int  index,
std::shared_ptr< Point< T > >  p 
)

setPoint Set the point in a position of the data.

Parameters
index: Index of the point that will be set.
p: Point to be set.

◆ shuffle()

template<typename T >
void mltk::Data< T >::shuffle ( const size_t &  seed = 42)

Shuffle the data with a given seed.

Parameters
seed: Seed given for randomization.

◆ size()

template<typename T = double>
size_t mltk::Data< T >::size ( ) const
inline

Returns the size of the dataset.

Returns
size_t

◆ splitByClasses()

template<typename T >
std::vector< Data< T > > mltk::Data< T >::splitByClasses ( bool  keepIndex = false)

Split the dataset by its labels. Only valid for classification datasets.

Returns
Vector containing Data split by classes.

◆ splitSample()

template<typename T >
std::vector< Data< T > > mltk::Data< T >::splitSample ( const std::size_t &  split_size,
bool  stratified = true,
bool  keepIndex = false,
size_t  seed = 0 
)

Split the data by a given size.

Parameters
split_size: Number of samples on each split.
stratified: If true, the split will be done in a stratified manner.
seed: Random generator seed.
Returns
Vector containing the data split.

◆ updatePointValue()

template<typename T >
bool mltk::Data< T >::updatePointValue ( const size_t &  idx,
double  value 
)

Updates a Point value.

Parameters
idx: Index of the point to be updated.
value: New value of the point.
Returns
Success of the point value update.

◆ write()

template<typename T >
void mltk::Data< T >::write ( const std::string &  fname,
std::string  ext 
)

write Write the data to a file with the given extension.

Parameters
fname: Name of the file.
ext: Extension of the file.

The documentation for this class was generated from the following file: