SHOGUN  3.2.1
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
FeatureSelection.cpp
Go to the documentation of this file.
1 /*
2  * Copyright (c) The Shogun Machine Learning Toolbox
3  * Written (w) 2014 Soumyajit De
4  * All rights reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions are met:
8  *
9  * 1. Redistributions of source code must retain the above copyright notice, this
10  * list of conditions and the following disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright notice,
12  * this list of conditions and the following disclaimer in the documentation
13  * and/or other materials provided with the distribution.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
16  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
17  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18  * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
19  * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
20  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
21  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
22  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25  *
26  * The views and conclusions contained in the software and documentation are those
27  * of the authors and should not be interpreted as representing official policies,
28  * either expressed or implied, of the Shogun Development Team.
29  */
30 
31 #include <shogun/labels/Labels.h>
36 
37 namespace shogun
38 {
39 
// Body that only forwards to init(), which registers the parameters and sets
// the member defaults.
// NOTE(review): the declarator (listing line 41) is missing from this extract;
// presumably this is the CFeatureSelection<ST> constructor - confirm against
// the original file.
40 template <class ST>
42 {
43  init();
44 }
45 
46 template <class ST>
48 {
49  SG_ADD(&m_target_dim, "target_dim", "target dimension",
51  SG_ADD((machine_int_t*)&m_algorithm, "algorithm",
52  "the feature selectiona algorithm", MS_NOT_AVAILABLE);
53  SG_ADD((machine_int_t*)&m_policy, "policy", "feature removal policy",
55  SG_ADD(&m_num_remove, "num_remove", "number or percentage of features to "
56  "be removed", MS_NOT_AVAILABLE);
57  SG_ADD((CSGObject**)&m_labels, "labels",
58  "the class labels for the features", MS_NOT_AVAILABLE);
59 
60  m_target_dim=0;
61  m_algorithm=BACKWARD_ELIMINATION;
62  m_policy=N_SMALLEST;
63  m_num_remove=1;
64  m_labels=NULL;
65 }
66 
// Releases the reference held on m_labels via SG_UNREF.
// NOTE(review): the declarator (listing line 68) is missing from this extract;
// presumably this is the CFeatureSelection<ST> destructor - confirm against
// the original file.
67 template <class ST>
69 {
70  SG_UNREF(m_labels);
71 }
72 
73 
// Member definition with an intentionally empty body.
// NOTE(review): the declarator (listing line 75) is missing from this extract,
// so the member name cannot be read here - confirm against the original file.
74 template <class ST>
76 {
77 }
78 
79 template <class ST>
81 {
82  SG_DEBUG("Entering!\n");
83 
84  // precompute whenever appropriate for performing the rest of the tasks
85  precompute();
86 
87  // NULL check for features is handled in get_num_features
88  index_t num_features=get_num_features(features);
89  SG_DEBUG("Initial number of features %d!\n", num_features);
90 
91  // the main loop
92  while (num_features>m_target_dim)
93  {
94  // tune the measurement parameters whenever necessary based on current
95  // features
96  adapt_params(features);
97 
98  // compute the measures for each of the current dimensions
99  SGVector<float64_t> measures(num_features);
100  for (index_t i=0; i<num_features; ++i)
101  measures[i]=compute_measures(features, i);
102 
103  if (io->get_loglevel()==MSG_DEBUG || io->get_loglevel()==MSG_GCDEBUG)
104  measures.display_vector("measures");
105 
106  // rank the measures
107  SGVector<index_t> argsorted=measures.argsort();
108 
109  if (io->get_loglevel()==MSG_DEBUG || io->get_loglevel()==MSG_GCDEBUG)
110  argsorted.display_vector("argsorted");
111 
112  // make sure that we don't end up with lesser feats than target dim
113  index_t to_remove;
114  if (m_policy==N_SMALLEST || m_policy==N_LARGEST)
115  to_remove=m_num_remove;
116  else
117  to_remove=num_features*m_num_remove*0.01;
118 
119  index_t can_remove=num_features-m_target_dim;
120 
121  // if policy is to remove N feats corresponding to smallest/largest
122  // measures, we just replace N with can_remove. if policy is to remove
123  // N% feats, then we change the policy temporarily and remove a fixed
124  // can_remove number of feats instead
125  index_t orig_remove=m_num_remove;
126  EFeatureRemovalPolicy orig_policy=m_policy;
127 
128  if (to_remove>can_remove)
129  {
130  m_num_remove=can_remove;
131  SG_DEBUG("Can only remove %d features in this iteration!\n",
132  can_remove);
133 
134  if (m_policy==PERCENTILE_SMALLEST)
135  m_policy=N_SMALLEST;
136  else if (m_policy==PERCENTILE_LARGEST)
137  m_policy=N_LARGEST;
138  }
139 
140  // remove appropriate number of features based on the measures and the
141  // removal policy
142  features=remove_feats(features, argsorted);
143 
144  // restore original removal policy and numbers if necessary for the
145  // sake of consistency
146  if (to_remove>can_remove)
147  {
148  m_policy=orig_policy;
149  m_num_remove=orig_remove;
150  }
151 
152  // update the number of features
153  num_features=get_num_features(features);
154  SG_DEBUG("Current number of features %d!\n", num_features);
155  }
156 
157  SG_DEBUG("Leaving!\n");
158  return features;
159 }
160 
161 template <class ST>
163 {
164  SG_DEBUG("Entering!\n");
165 
166  // sanity checks
167  REQUIRE(features, "Features cannot be NULL!\n");
168  REQUIRE(features->get_num_vectors()>0,
169  "Number of feature vectors has to be positive!\n");
170  REQUIRE(m_target_dim>0, "Target dimension (%d) has to be positive! Set "
171  "a higher number via set_target_dim().\n", m_target_dim);
172 
173  index_t num_features=get_num_features(features);
174  REQUIRE(num_features>0, "Invalid number of features (%d)! Most likely "
175  "feature selection cannot be performed for %s!\n",
176  num_features, features->get_name());
177  REQUIRE(num_features>m_target_dim,
178  "Number of original features (dimensions of the feature vectors) "
179  "(%d) has to be greater that the target dimension (%d)!\n",
180  num_features, m_target_dim);
181 
182  // this method makes a deep copy of the feature object and performs
183  // feature selection on it. This is already SG_REF'ed because of the
184  // implementation of clone()
185  CFeatures* feats_copy=(CFeatures*)features->clone();
186 
187  switch (m_algorithm)
188  {
190  return apply_backward_elimination(feats_copy);
191  default:
192  SG_ERROR("Specified algorithm not yet supported!\n");
193  return features;
194  }
195 
196  SG_DEBUG("Leaving!\n");
197 }
198 
// Member definition with an intentionally empty body.
// NOTE(review): the declarator (listing line 200) is missing from this
// extract; presumably this is the default precompute() hook called at the
// start of apply_backward_elimination() - confirm against the original file.
199 template <class ST>
201 {
202 }
203 
// Member definition with an intentionally empty body.
// NOTE(review): the declarator (listing line 205) is missing from this
// extract; presumably this is the default adapt_params(CFeatures* features)
// hook listed in the member index of this page - confirm against the original
// file.
204 template <class ST>
206 {
207 }
208 
// Returns the number of features (dimensions) of the given feature object.
// The object must not be NULL. Depending on get_feature_class() it is
// down-cast to CDenseFeatures<ST> or CSparseFeatures<ST>; a failed cast is
// rejected via REQUIRE. Any other feature class is reported with SG_ERROR, and
// 0 is returned should SG_ERROR return.
// NOTE(review): the declarator (listing line 210) is missing from this
// extract; the member index of this page lists
// `index_t get_num_features(CFeatures *features) const` - confirm.
209 template <class ST>
211 {
212  REQUIRE(features, "Features not initialized!\n");
213 
214  EFeatureClass f_class=features->get_feature_class();
215 
216  switch (f_class)
217  {
218  case C_DENSE:
219  {
// dynamic_cast yields NULL when the element type of the dense features is
// not ST, which the REQUIRE below turns into an error
220  CDenseFeatures<ST>* d_feats=dynamic_cast<CDenseFeatures<ST>*>(features);
221  REQUIRE(d_feats, "Type mismatch for dense features!\n");
222  return d_feats->get_num_features();
223  }
224  case C_SPARSE:
225  {
// same check as above, for the sparse representation
226  CSparseFeatures<ST>* s_feats=dynamic_cast<CSparseFeatures<ST>*>(features);
227  REQUIRE(s_feats, "Type mismatch for sparse features!\n");
228  return s_feats->get_num_features();
229  }
230  default:
231  SG_ERROR("Number of features not available for %s!\n",
232  features->get_name());
233  break;
234  }
235 
236  return 0;
237 }
238 
// Stores the given target_dim in m_target_dim; no validation happens here
// (apply() requires the value to be positive).
// NOTE(review): the declarator (listing line 240) is missing from this
// extract; the member index of this page lists
// `void set_target_dim(index_t target_dim)` - confirm.
239 template <class ST>
241 {
242  m_target_dim=target_dim;
243 }
244 
// Returns the current value of m_target_dim.
// NOTE(review): the declarator (listing line 246) is missing from this
// extract; presumably get_target_dim() - confirm against the original file.
245 template <class ST>
247 {
248  return m_target_dim;
249 }
250 
// Returns the current value of m_algorithm.
// NOTE(review): the declarator (listing line 252) is missing from this
// extract; the member index of this page lists
// `EFeatureSelectionAlgorithm get_algorithm() const` - confirm.
251 template <class ST>
253 {
254  return m_algorithm;
255 }
256 
// Returns the current value of m_policy.
// NOTE(review): the declarator (listing line 258) is missing from this
// extract; the member index of this page lists
// `EFeatureRemovalPolicy get_policy() const` - confirm.
257 template <class ST>
259 {
260  return m_policy;
261 }
262 
// Stores the given num_remove in m_num_remove without validation.
// apply_backward_elimination() reads the value as an absolute count for the
// N_SMALLEST/N_LARGEST policies and as a percentage otherwise.
// NOTE(review): the declarator (listing line 264) is missing from this
// extract; the member index of this page lists
// `void set_num_remove(index_t num_remove)` - confirm.
263 template <class ST>
265 {
266  m_num_remove=num_remove;
267 }
268 
// Returns the current value of m_num_remove.
// NOTE(review): the declarator (listing line 270) is missing from this
// extract; presumably get_num_remove() - confirm against the original file.
269 template <class ST>
271 {
272  return m_num_remove;
273 }
274 
// Replaces m_labels with the given labels object.
// The new object is SG_REF'ed before the old one is SG_UNREF'ed - presumably
// so that passing the currently stored object again cannot release it in
// between; verify against the SG_REF/SG_UNREF semantics.
// NOTE(review): the declarator (listing line 276) is missing from this
// extract; the member index of this page lists
// `virtual void set_labels(CLabels *labels)` - confirm.
275 template <class ST>
277 {
278  SG_REF(labels);
279  SG_UNREF(m_labels);
280  m_labels=labels;
281 }
282 
// Returns m_labels after taking an additional reference on it with SG_REF;
// presumably the caller is expected to SG_UNREF the result - confirm.
// NOTE(review): the declarator (listing line 284) is missing from this
// extract; the member index of this page lists
// `CLabels * get_labels() const` - confirm.
283 template <class ST>
285 {
286  SG_REF(m_labels);
287  return m_labels;
288 }
289 
// Always returns C_ANY.
// NOTE(review): the declarator (listing line 291) is missing from this
// extract; the member index of this page lists
// `virtual EFeatureClass get_feature_class()` - confirm.
290 template <class ST>
292 {
293  return C_ANY;
294 }
295 
// Always returns P_UNKNOWN.
// NOTE(review): the declarator (listing line 297) is missing from this
// extract; the member index of this page lists
// `virtual EPreprocessorType get_type() const` - confirm.
296 template <class ST>
298 {
299  return P_UNKNOWN;
300 }
301 
// Thirteen full specializations, each returning one fixed EFeatureType value.
// NOTE(review): every declarator (listing lines 303, 309, ..., 375) is missing
// from this extract, so the template argument of each specialization cannot be
// read here; presumably these are CFeatureSelection<T>::get_feature_type() for
// the thirteen types instantiated at the end of the file - confirm against the
// original file.
302 template<>
304 {
305  return F_LONGREAL;
306 }
307 
308 template<>
310 {
311  return F_DREAL;
312 }
313 
314 template<>
316 {
317  return F_SHORTREAL;
318 }
319 
320 template<>
322 {
323  return F_SHORT;
324 }
325 
326 template<>
328 {
329  return F_WORD;
330 }
331 
// NOTE(review): this and the next specialization both return F_CHAR; which two
// element types they belong to (and whether sharing F_CHAR is intended) cannot
// be determined from this extract - verify.
332 template<>
334 {
335  return F_CHAR;
336 }
337 
338 template<>
340 {
341  return F_CHAR;
342 }
343 
344 template<>
346 {
347  return F_BYTE;
348 }
349 
350 template<>
352 {
353  return F_INT;
354 }
355 
356 template<>
358 {
359  return F_UINT;
360 }
361 
362 template<>
364 {
365  return F_LONG;
366 }
367 
368 template<>
370 {
371  return F_ULONG;
372 }
373 
374 template<>
376 {
377  return F_BOOL;
378 }
379 
// Explicit instantiations of CFeatureSelection for the thirteen supported
// element types. The member definitions above live in this .cpp file, so these
// are the instantiations it provides to other translation units.
380 template class CFeatureSelection<bool>;
381 template class CFeatureSelection<char>;
382 template class CFeatureSelection<int8_t>;
383 template class CFeatureSelection<uint8_t>;
384 template class CFeatureSelection<int16_t>;
385 template class CFeatureSelection<uint16_t>;
386 template class CFeatureSelection<int32_t>;
387 template class CFeatureSelection<uint32_t>;
388 template class CFeatureSelection<int64_t>;
389 template class CFeatureSelection<uint64_t>;
390 template class CFeatureSelection<float32_t>;
391 template class CFeatureSelection<float64_t>;
392 template class CFeatureSelection<floatmax_t>;
393 
394 }
virtual const char * get_name() const =0
virtual void adapt_params(CFeatures *features)
EPreprocessorType
Definition: Preprocessor.h:32
int32_t index_t
Definition: common.h:62
The class Labels models labels, i.e. class assignments of objects.
Definition: Labels.h:43
virtual CSGObject * clone()
Definition: SGObject.cpp:1302
#define SG_UNREF(x)
Definition: SGRefObject.h:35
virtual int32_t get_num_vectors() const =0
#define SG_ERROR(...)
Definition: SGIO.h:130
#define REQUIRE(x,...)
Definition: SGIO.h:207
int32_t get_num_features() const
virtual EPreprocessorType get_type() const
EFeatureClass
shogun feature class
Definition: FeatureTypes.h:38
Template class CFeatureSelection, base class for all feature selection preprocessors which select a s...
void set_num_remove(index_t num_remove)
EFeatureSelectionAlgorithm get_algorithm() const
virtual void set_labels(CLabels *labels)
virtual CFeatures * apply_backward_elimination(CFeatures *features)
Class SGObject is the base class of all shogun objects.
Definition: SGObject.h:98
EFeatureRemovalPolicy get_policy() const
#define SG_REF(x)
Definition: SGRefObject.h:34
virtual EFeatureClass get_feature_class()
virtual EFeatureClass get_feature_class() const =0
int32_t get_num_features() const
EFeatureType
shogun feature type
Definition: FeatureTypes.h:19
The class DenseFeatures implements dense feature matrices.
Definition: LDA.h:41
#define SG_DEBUG(...)
Definition: SGIO.h:108
virtual EFeatureType get_feature_type()
index_t get_num_features(CFeatures *features) const
int machine_int_t
Definition: common.h:59
The class Features is the base class of all feature objects.
Definition: Features.h:68
EFeatureSelectionAlgorithm
Class Preprocessor defines a preprocessor interface.
Definition: Preprocessor.h:75
void set_target_dim(index_t target_dim)
#define SG_ADD(...)
Definition: SGObject.h:67
SGVector< index_t > argsort()
Definition: SGVector.cpp:194
void display_vector(const char *name="vector", const char *prefix="") const
Definition: SGVector.cpp:405
virtual CFeatures * apply(CFeatures *features)
CLabels * get_labels() const

SHOGUN Machine Learning Toolbox - Documentation