5 #include "DistanceMetric.hpp"
// Fragment of the DSM class template, parameterised on the sample value type T.
// NOTE(review): this text is a source-listing extract; lines between the numbered
// ones (including the class header and closing braces) are not visible here.
9 template <
typename T >
// One list of feature indices per subspace; operator() writes into subspaces[j][i].
15 std::vector<std::vector<size_t>> subspaces;
// Constructor. Stores `alpha`, stores `n` as n_dims and stores `k`, then reads the
// labels from `data` and sizes every subspace to n_dims entries.
// NOTE(review): how many subspaces exist (presumably k) is set on a line not shown — confirm.
19 DSM(
const Data<T>& data,
size_t k,
size_t n,
double alpha): alpha(alpha), n_dims(n), k(k) {
// Keep the labels reported by the dataset.
20 labels = data.getLabels();
// The fill value -1 is converted to size_t, i.e. the largest size_t value, so new
// slots hold a value that is presumably meant as "no feature selected yet".
22 for(
auto& Si: subspaces) {
23 Si.resize(n_dims, -1);
// Combined diversity score for feature `feat` with respect to subspace `i`:
// the arithmetic mean of div_mx(feat) and div_ms(i).
27 double div_m(
int i,
size_t feat){
28 return (div_mx(feat)+div_ms(i))/2;
// NOTE(review): the enclosing function header is not visible in this extract; since
// div_m calls div_ms(i) and this code reads subspaces[i], it is presumably the body
// of div_ms — confirm. The declarations of `val` and `max_val` are also not shown.
// Walks the subspaces by index j and keeps the largest overlap ratio seen in max_val.
// NOTE(review): `int j` is compared against the unsigned subspaces.size().
33 for(
int j = 0; j < subspaces.size();j++){
// Collect the features that subspaces i and j have in common.
// NOTE(review): std::set_intersection requires both input ranges to be sorted;
// verify that the subspaces are kept sorted before this call.
36 std::vector<size_t> intersection;
37 std::set_intersection(subspaces[i].begin(), subspaces[i].end(),
38 subspaces[j].begin(), subspaces[j].end(),
39 std::back_inserter(intersection));
// Overlap expressed as a fraction of the size of subspace j.
40 val = double(intersection.size())/subspaces[j].size();
41 if(val > max_val) max_val = val;
// Diversity term for feature `feat`: checks each subspace for the presence of `feat`
// and returns 1.0 minus count divided by the number of subspaces.
// NOTE(review): the statement executed when `feat` is found is not visible here;
// presumably it increments `count` — confirm.
46 double div_mx(
size_t feat){
48 for(
auto& Si: subspaces){
// True when `feat` already belongs to subspace Si.
49 if(std::find(Si.begin(), Si.end(), feat) != Si.end()){
// NOTE(review): the declaration of `count` is not visible. If it is an integer type,
// count/subspaces.size() is an integer division and the result collapses to 0 or 1
// — confirm that `count` is floating point.
53 return 1.0 - (count/subspaces.size());
// Quality term for feature `feat` of `data`; operator() multiplies its result by
// alpha when scoring a candidate feature.
// NOTE(review): the body is not visible in this extract, so how the value is
// computed cannot be documented from here.
56 double qual_correlation(
const Data<T>& data,
int feat){
// Replaces the stored alpha with `_alpha`. operator() uses alpha as the weight of the
// quality term and (1 - alpha) as the weight of the diversity term.
61 inline void setAlpha(
double _alpha){ this->alpha = _alpha; }
// Fills the subspaces slot by slot: for every slot i in [0, n_dims) and every
// subspace j, scores the features of `data` and stores the index of the
// highest-scoring one in subspaces[j][i].
// NOTE(review): the declaration/reset of `fscore` and the return statement are on
// lines not shown in this extract.
63 std::vector<std::vector<size_t>> operator()(
const Data< T > &data){
64 for(
int i = 0; i < n_dims; i++) {
65 for(
int j = 0; j < subspaces.size(); j++) {
// Only features that are not already part of subspace j receive a score.
67 for (
int c = 0; c < data.
dim(); c++) {
68 if (std::find(subspaces[j].begin(), subspaces[j].end(), c) == subspaces[j].end()) {
// Score = alpha * quality + (1 - alpha) * diversity.
69 fscore[c] = alpha*qual_correlation(data, c) + (1-alpha)*div_m(j, c);
// Position of the largest score, i.e. the chosen feature index.
72 auto best_x_id = std::max_element(fscore.
X().begin(), fscore.
X().end()) - fscore.
X().begin();
73 subspaces[j][i] = best_x_id;
// Fragment of the RSM class template, parameterised on the sample value type T.
// NOTE(review): this text is a source-listing extract; the class header and several
// lines between the numbered ones are not visible here.
80 template <
typename T >
// Random engine handed to std::shuffle in operator().
85 std::mt19937 generator;
// Pool of feature indices that operator() shuffles and draws from.
86 std::vector<size_t> feats;
// Constructor. Stores `r` and `seed`, sets n_dims to ceil(r * dims) and fills the
// feature pool with consecutive indices starting at 0.
90 RSM(
double r,
size_t dims,
size_t seed): r(r), seed(seed) {
92 n_dims = std::ceil(r * dims);
// NOTE(review): std::iota only writes into existing elements, so `feats` must be
// sized before this line; the sizing (and the seeding of `generator`) is presumably
// on lines not shown — confirm.
94 std::iota(feats.begin(), feats.end(), 0);
// Draws a random feature subset: shuffles the feature pool with `generator` and
// copies its first n_dims entries into new_feats.
// NOTE(review): `data` is not used on the visible lines, and the return statement
// is not shown (presumably new_feats is returned — confirm).
96 std::vector<size_t> operator()(
Data< T > &data){
97 std::shuffle(feats.begin(), feats.end(), generator);
98 std::vector<size_t> new_feats(n_dims);
// NOTE(review): reads feats[j] for j < n_dims; if n_dims can exceed feats.size()
// (e.g. r > 1) this indexes past the end — confirm the constructor rules that out.
99 for(
size_t j = 0; j < new_feats.size(); j++){
100 new_feats[j] = feats[j];
// Fragment of the OverSampling class template. T is the sample value type and
// Callable is the type of the distance functor (a Euclidean metric over T by default).
// NOTE(review): the class header line is not visible in this extract.
109 template <
typename T,
typename Callable = metrics::dist::Eucl
idean< T > >
// Distance functor; the sampling code invokes it as distance_metric(a, b).
112 Callable distance_metric;
// Stores a copy of `dist_metric` as the distance functor.
115 explicit OverSampling(Callable dist_metric): distance_metric(dist_metric) {}
// Fragment of the SMOTE class template. T defaults to double and Callable (the
// distance functor type) defaults to a Euclidean metric over T.
// NOTE(review): the class header and the constructor's initialiser list/body are
// not visible in this extract.
122 template <
typename T=
double,
typename Callable = metrics::dist::Eucl
idean< T > >
// Constructor; every argument has a default.
// NOTE(review): the sampling code reads `k` (neighbour count), `r` (multiplied by the
// dataset size) and `seed` (values <= 0 are replaced by a std::random_device draw);
// presumably these parameters are stored in same-named members — confirm.
133 explicit SMOTE(
size_t k = 1,
double r = 0.1,
size_t seed = 0, Callable dist_metric = Callable())
// Body fragment of the SMOTE sampling routine.
// NOTE(review): the function header and the declarations of `data`, `classes`,
// `class_distribution`, `Z`, `_z`, `id`, `_k` and `s` are on lines not shown in this
// extract; comments below describe only what the visible lines do.
// A seed of 0 is replaced by a value drawn from std::random_device.
137 std::random_device rd;
138 seed = (seed > 0)?seed:rd();
139 std::mt19937 generator(seed);
// Source of the interpolation factor; values fall in [0, 1).
140 std::uniform_real_distribution<double> distribution(0.0,1.0);
// r times the dataset size, with the fractional part discarded by the size_t conversion.
142 size_t n_apoints = r * data.
size();
// Entry of `classes` at the position of the smallest value in class_distribution.
// NOTE(review): assumes `classes` and `class_distribution` are index-aligned — confirm.
146 int min_class = classes[std::min_element(class_distribution.begin(), class_distribution.end()) - class_distribution.begin()];
// NOTE(review): presumably used to build Z as the min_class-only subset — confirm.
148 std::vector<int> class_copy = {min_class};
// Synthetic points produced by the loop below.
149 std::vector<SamplePointer< T > > artificial_data;
// Process every element z of Z.
154 for(
auto z = Z.begin(); z != Z.end(); ++z){
// One (id, distance) pair per element of Z, and k slots for the selected neighbours.
155 std::vector<std::pair<size_t, double> > distance(Z.
size());
156 std::vector<SamplePointer< T > > k_neighbors(k);
// Pair the current value of `id` with the distance from _z to each element of Z.
// NOTE(review): `id` is captured by reference; how it is updated per element is on
// a line not shown.
161 std::transform(Z.begin(), Z.end(), distance.begin(), [
this, &_z, &
id](
auto p){
163 return std::make_pair(id, this->distance_metric(_z, *p));
// Order the pairs by ascending distance.
167 std::sort(distance.begin(), distance.end(), [](
auto &d1,
auto &d2){
168 return d1.second < d2.second;
// Collapse consecutive pairs that share a distance or an id.
// NOTE(review): this also discards distinct points that happen to lie at the same
// distance from _z — confirm that is intended.
172 distance.erase(std::unique(distance.begin(), distance.end(), [](
auto &d1,
auto &d2){
173 return (d1.second == d2.second) || ((d1.first == d2.first));
// Drop zero-distance entries.
176 distance.erase(std::remove_if(distance.begin(), distance.end(), [](
auto &d){
177 return d.second == 0;
// Take at most k of the closest remaining entries as neighbours.
// NOTE(review): the ids were generated while iterating Z but are used here to index
// `data` (with a -1 offset, so ids look 1-based) — confirm both containers share the
// same ordering. If fewer than k entries remain, trailing slots of k_neighbors stay
// default-constructed.
181 for(
size_t i = 0; (i < k) && (i < distance.size()); i++){
182 k_neighbors[i] = data[distance[i].first-1];
// Create one synthetic point per neighbour slot.
// NOTE(review): this iterates all k slots, including any left unfilled above.
186 for(
auto p = k_neighbors.begin(); p != k_neighbors.end(); p++){
190 double alpha = distribution(generator);
// NOTE(review): _z + alpha * (_z - _k) moves away from the neighbour; the published
// SMOTE algorithm interpolates toward it (_z + alpha * (_k - _z)) — confirm intent.
192 s = _z + alpha * (_z - _k);
195 artificial_data.push_back(mltk::make_point< T >(s));
// Randomise the order of the synthetic points with an engine seeded from `seed`.
199 std::shuffle(artificial_data.begin(), artificial_data.end(), std::default_random_engine(seed));
// Walk the synthetic points; the loop body is not visible in this extract.
200 for (
size_t i = 0; i < artificial_data.size(); i++) {
// Fragment of the BorderlineSMOTEOne class template. T defaults to double and
// Callable (the distance functor type) defaults to a Euclidean metric over T.
// NOTE(review): the class header and the constructor's initialiser list/body are
// not visible in this extract.
210 template <
typename T=
double,
typename Callable = metrics::dist::Eucl
idean< T > >
// Constructor; every argument has a default.
// NOTE(review): the sampling code reads `m` as the size of the neighbourhood examined
// around each point; how `k`, `r` and `seed` are used is not visible here — confirm.
223 explicit BorderlineSMOTEOne(
size_t k = 1,
double r = 0.1,
size_t m = 1,
size_t seed = 0, Callable dist_metric = Callable())
// Body fragment of the Borderline SMOTE 1 sampling routine.
// NOTE(review): the function header and the declarations of `data`, `classes`,
// `class_distribution`, `_z`, `id`, `danger_subset`, `smote` and `_p` are on lines
// not shown in this extract; comments below describe only what the visible lines do.
// Entry of `classes` at the position of the largest value in class_distribution.
// NOTE(review): assumes `classes` and `class_distribution` are index-aligned — confirm.
230 int maj_class = classes[std::max_element(class_distribution.begin(), class_distribution.end()) - class_distribution.begin()];
// Ids of the points classified as "danger" by the loop below.
233 std::set<size_t> danger_ids;
// Examine every point of the dataset.
236 for(
auto z = data.begin(); z != data.end(); ++z){
// One (id, distance) pair per point, and m slots for the nearest neighbours.
237 std::vector<std::pair<size_t, double> > distance(data.
size());
238 std::vector<SamplePointer< T > > M(m);
// Pair the current value of `id` with the distance from *_z to each point.
// NOTE(review): `id` is captured by reference; how it is updated per element is on
// a line not shown.
243 std::transform(data.begin(), data.end(), distance.begin(), [
this, &_z, &
id](
auto p){
245 return std::make_pair(id, this->distance_metric(*_z, *p));
// Order the pairs by ascending distance.
249 std::sort(distance.begin(), distance.end(), [](
auto &d1,
auto &d2){
250 return d1.second < d2.second;
// Collapse consecutive pairs that share a distance or an id.
// NOTE(review): this also discards distinct points that happen to lie at the same
// distance from *_z — confirm that is intended.
254 distance.erase(std::unique(distance.begin(), distance.end(), [](
auto &d1,
auto &d2){
255 return (d1.second == d2.second) || ((d1.first == d2.first));
// Drop zero-distance entries.
258 distance.erase(std::remove_if(distance.begin(), distance.end(), [](
auto &d){
259 return d.second == 0;
// Take at most m of the closest remaining entries; ids are used with a -1 offset,
// so they look 1-based.
// NOTE(review): if fewer than m entries remain, trailing slots of M stay
// default-constructed.
263 for(
size_t i = 0; (i < m) && (i < distance.size()); i++){
264 M[i] = data[distance[i].first-1];
// Number of neighbours in M whose label equals maj_class.
// NOTE(review): dereferences every slot of M, including any left unfilled above.
268 size_t m_ = std::count_if(M.begin(), M.end(), [&maj_class](
auto &p){
269 return p->Y() == maj_class;
// "Danger" test: at least m/2 but not all of the m neighbours are majority-class.
// NOTE(review): m/2 is an integer division on size_t, so odd m rounds down.
273 if(m_ >= (m/2) && m_ < m){
274 danger_ids.insert(_z->Id());
// Run the inner SMOTE only when there is at least one danger point.
280 if(danger_subset.
size() > 0){
282 smote(danger_subset);
// Walk danger_subset after the smote call and test each point's id against
// danger_ids; the action taken on a match is not visible in this extract.
285 for(
auto p = danger_subset.begin(); p != danger_subset.end(); p++){
288 if(danger_ids.find(_p->Id()) != danger_ids.end()){
Functor for the implementation of the Borderline SMOTE 1 oversampling algorithm.
Definition: Sampling.hpp:211
Definition: Sampling.hpp:10
bool insertPoint(const Data< T > &samples, int _index, bool keepIndex=false)
Inserts a point into the data from another sample.
Definition: Data.hpp:1437
void classesCopy(const Data< T > &_data, std::vector< int > &classes)
Makes a deep copy from another data object.
Definition: Data.hpp:1520
size_t size() const
Returns the size of the dataset.
Definition: Data.hpp:208
const std::vector< int > classes() const
Returns a vector containing the numeric values of the classes.
Definition: Data.hpp:1831
size_t dim() const
Returns the dimension of the dataset.
Definition: Data.hpp:213
Point< T > getFeature(int index) const
Get the values of a feature from all points.
Definition: Data.hpp:2107
std::vector< size_t > classesDistribution() const
Returns a vector containing the frequency of the classes. Only valid for classification datasets.
Definition: Data.hpp:1826
Base class for the implementation of oversampling methods.
Definition: Sampling.hpp:110
Rep const & X() const
Returns the attributes representation of the point (std::vector by default).
Definition: Point.hpp:139
double const & Y() const
Returns the class or value of the point.
Definition: Point.hpp:152
Definition: Sampling.hpp:81
Functor for the implementation of the SMOTE oversampling algorithm.
Definition: Sampling.hpp:123
double covar(const mltk::Point< T, R > &p, const mltk::Point< T, R > &p1)
Compute the covariance between two points.
Definition: Statistics.hpp:180
double std_dev(const mltk::Point< T, R > &p)
Compute the standard deviation of a point.
Definition: Statistics.hpp:119
UFJF-MLTK main namespace for core functionalities.
Definition: classifier/Classifier.hpp:11