SHOGUN  3.2.1
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
KLInferenceMethod.cpp
Go to the documentation of this file.
1  /*
2  * Copyright (c) The Shogun Machine Learning Toolbox
3  * Written (w) 2014 Wu Lin
4  * All rights reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions are met:
8  *
9  * 1. Redistributions of source code must retain the above copyright notice, this
10  * list of conditions and the following disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright notice,
12  * this list of conditions and the following disclaimer in the documentation
13  * and/or other materials provided with the distribution.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
16  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
17  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18  * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
19  * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
20  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
21  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
22  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25  *
26  * The views and conclusions contained in the software and documentation are those
27  * of the authors and should not be interpreted as representing official policies,
28  * either expressed or implied, of the Shogun Development Team.
29  *
30  * This code specifically adapted from function in approxKL.m and infKL.m
31  */
32 
34 
35 #ifdef HAVE_EIGEN3
37 
38 using namespace Eigen;
39 
40 namespace shogun
41 {
42 
43 CKLInferenceMethod::CKLInferenceMethod() : CInferenceMethod()
44 {
45  init();
46 }
47 
49  CFeatures* feat, CMeanFunction* m, CLabels* lab, CLikelihoodModel* mod)
50  : CInferenceMethod(kern, feat, m, lab, mod)
51 {
52  init();
54 }
55 
57 {
58  REQUIRE(mod, "the likelihood model must not be NULL\n")
60  REQUIRE(lik,
61  "The provided likelihood model (%s) must support variational Gaussian inference. ",
62  "Please use a Variational Gaussian Likelihood model\n",
63  mod->get_name());
64 }
65 
67 {
70 }
71 
72 void CKLInferenceMethod::init()
73 {
74  m_noise_factor=1e-10;
75  m_max_attempt=0;
76  m_exp_factor=2;
77  m_min_coeff_kernel=1e-5;
78  SG_ADD(&m_noise_factor, "noise_factor",
79  "The noise factor used for correcting Kernel matrix",
81  SG_ADD(&m_exp_factor, "exp_factor",
82  "The exponential factor used for increasing noise_factor",
84  SG_ADD(&m_max_attempt, "max_attempt",
85  "The max number of attempt to correct Kernel matrix",
87  SG_ADD(&m_min_coeff_kernel, "min_coeff_kernel",
88  "The minimum coeefficient of kernel matrix in LDLT factorization used to check whether the kernel matrix is positive definite or not",
90 
92  SG_ADD(&m_m, "m",
93  "The number of corrections to approximate the inverse Hessian matrix",
95  SG_ADD(&m_max_linesearch, "max_linesearch",
96  "The maximum number of trials to do line search for each L-BFGS update",
98  SG_ADD(&m_linesearch, "linesearch",
99  "The line search algorithm",
101  SG_ADD(&m_max_iterations, "max_iterations",
102  "The maximum number of iterations for L-BFGS update",
104  SG_ADD(&m_delta, "delta",
105  "Delta for convergence test based on the change of function value",
107  SG_ADD(&m_past, "past",
108  "Distance for delta-based convergence test",
110  SG_ADD(&m_epsilon, "epsilon",
111  "Epsilon for convergence test based on the change of gradient",
113  SG_ADD(&m_min_step, "min_step",
114  "The minimum step of the line search",
116  SG_ADD(&m_max_step, "max_step",
117  "The maximum step of the line search",
119  SG_ADD(&m_ftol, "ftol",
120  "A parameter used in Armijo condition",
122  SG_ADD(&m_wolfe, "wolfe",
123  "A parameter used in curvature condition",
125  SG_ADD(&m_gtol, "gtol",
126  "A parameter used in Morethuente linesearch to control the accuracy",
128  SG_ADD(&m_xtol, "xtol",
129  "The machine precision for floating-point values",
131  SG_ADD(&m_orthantwise_c, "orthantwise_c",
132  "Coeefficient for the L1 norm of variables",
134  SG_ADD(&m_orthantwise_start, "orthantwise_start",
135  "Start index for computing L1 norm of the variables",
137  SG_ADD(&m_orthantwise_end, "orthantwise_end",
138  "End index for computing L1 norm of the variables",
140  SG_ADD(&m_s2, "s2",
141  "Variational parameter sigma2",
143  SG_ADD(&m_mu, "mu",
144  "Variational parameter mu and posterior mean",
146  SG_ADD(&m_Sigma, "Sigma",
147  "Posterior covariance matrix Sigma",
149 }
150 
152 {
153 }
154 
156 {
157  SG_DEBUG("entering\n");
158 
160  update_init();
161  update_alpha();
162  update_chol();
164  update_deriv();
166 
167  SG_DEBUG("leaving\n");
168 }
169 
171 {
172  REQUIRE(noise_factor>=0, "The noise_factor %.20f should be non-negative\n", noise_factor);
173  m_noise_factor=noise_factor;
174 }
175 
177 {
178  REQUIRE(min_coeff_kernel>=0, "The min_coeff_kernel %.20f should be non-negative\n", min_coeff_kernel);
179  m_min_coeff_kernel=min_coeff_kernel;
180 }
181 
183 {
184  REQUIRE(max_attempt>=0, "The max_attempt %d should be non-negative. 0 means inifity attempts\n", max_attempt);
185  m_max_attempt=max_attempt;
186 }
187 
189 {
190  REQUIRE(exp_factor>1.0, "The exp_factor %f should be greater than 1.0.\n", exp_factor);
191  m_exp_factor=exp_factor;
192 }
193 
195 {
197 }
198 
199 Eigen::LDLT<Eigen::MatrixXd> CKLInferenceMethod::update_init_helper()
200 {
201  Map<MatrixXd> eigen_K(m_ktrtr.matrix, m_ktrtr.num_rows, m_ktrtr.num_cols);
202 
203  eigen_K=eigen_K+m_noise_factor*MatrixXd::Identity(m_ktrtr.num_rows, m_ktrtr.num_cols);
204 
205  Eigen::LDLT<Eigen::MatrixXd> ldlt;
206  ldlt.compute(eigen_K*CMath::sq(m_scale));
207 
208  float64_t attempt_count=0;
209  MatrixXd Kernel_D=ldlt.vectorD();
210  float64_t noise_factor=m_noise_factor;
211 
212  while (Kernel_D.minCoeff()<=m_min_coeff_kernel)
213  {
214  if (m_max_attempt>0 && attempt_count>m_max_attempt)
215  SG_ERROR("The Kernel matrix is highly non-positive definite since the min_coeff_kernel is less than %.20f",
216  " even when adding %.20f noise to the diagonal elements at max %d attempts\n",
217  m_min_coeff_kernel, noise_factor, m_max_attempt);
218  attempt_count++;
219  float64_t pre_noise_factor=noise_factor;
220  noise_factor*=m_exp_factor;
221  //updat the noise eigen_K=eigen_K+noise_factor*(m_exp_factor^attempt_count)*Identity()
222  eigen_K=eigen_K+(noise_factor-pre_noise_factor)*MatrixXd::Identity(m_ktrtr.num_rows, m_ktrtr.num_cols);
223  ldlt.compute(eigen_K*CMath::sq(m_scale));
224  Kernel_D=ldlt.vectorD();
225  }
226 
227  return ldlt;
228 }
229 
231 {
233  update();
234 
235  return SGVector<float64_t>(m_mu);
236 }
237 
239 {
241  update();
242 
244 }
245 
246 float64_t CKLInferenceMethod::evaluate(void *obj, const float64_t *parameters,
247  float64_t *gradient, const int dim, const float64_t step)
248 {
249  /* Note that parameters = parameters_pre_iter - step * gradient_pre_iter */
250  CKLInferenceMethod * obj_prt
251  = static_cast<CKLInferenceMethod *>(obj);
252 
253  REQUIRE(obj_prt, "The instance object passed to L-BFGS optimizer should not be NULL\n");
254 
255  obj_prt->lbfgs_precompute();
256  float64_t nlml=obj_prt->get_nlml_wrt_parameters();
257 
258  SGVector<float64_t> sg_gradient(gradient, dim, false);
259  obj_prt->get_gradient_of_nlml_wrt_parameters(sg_gradient);
260 
261  return nlml;
262 }
263 
265 {
268  return lik;
269 }
270 
272 {
276 }
277 
279  int m,
280  int max_linesearch,
281  int linesearch,
282  int max_iterations,
284  int past,
286  float64_t min_step,
287  float64_t max_step,
288  float64_t ftol,
289  float64_t wolfe,
290  float64_t gtol,
291  float64_t xtol,
292  float64_t orthantwise_c,
293  int orthantwise_start,
294  int orthantwise_end)
295 {
296  m_m = m;
297  m_max_linesearch = max_linesearch;
298  m_linesearch = linesearch;
299  m_max_iterations = max_iterations;
300  m_delta = delta;
301  m_past = past;
302  m_epsilon = epsilon;
303  m_min_step = min_step;
304  m_max_step = max_step;
305  m_ftol = ftol;
306  m_wolfe = wolfe;
307  m_gtol = gtol;
308  m_xtol = xtol;
309  m_orthantwise_c = orthantwise_c;
310  m_orthantwise_start = orthantwise_start;
311  m_orthantwise_end = orthantwise_end;
312 }
313 
315 {
317  update();
318 
320 }
321 
323 {
326  return SGVector<float64_t> ();
327 
328  //%lp_dhyp = likKL(v,lik,hyp.lik,y,K*post.alpha+m,[],[],j);
330  Map<VectorXd> eigen_lp_dhyp(lp_dhyp.vector, lp_dhyp.vlen);
331  SGVector<float64_t> result(1);
332  //%dnlZ.lik(j) = -sum(lp_dhyp);
333  result[0]=-eigen_lp_dhyp.sum();
334 
335  return result;
336 }
337 
339 {
340  // create eigen representation of K, Z, dfhat and alpha
341  Map<VectorXd> eigen_alpha(m_alpha.vector, m_alpha.vlen/2);
342 
343  SGVector<float64_t> result;
344 
345  if (param->m_datatype.m_ctype==CT_VECTOR ||
346  param->m_datatype.m_ctype==CT_SGVECTOR)
347  {
349  "Length of the parameter %s should not be NULL\n", param->m_name)
350 
351  result=SGVector<float64_t>(*(param->m_datatype.m_length_y));
352  }
353  else
354  {
355  result=SGVector<float64_t>(1);
356  }
357 
358  for (index_t i=0; i<result.vlen; i++)
359  {
361 
362  //%dm_t = feval(mean{:}, hyp.mean, x, j);
363  if (result.vlen==1)
365  else
367 
368  Map<VectorXd> eigen_dmu(dmu.vector, dmu.vlen);
369 
370  //%dnlZ.mean(j) = -alpha'*dm_t;
371  result[i]=-eigen_alpha.dot(eigen_dmu);
372  }
373 
374  return result;
375 }
376 
378 {
379  lbfgs_parameter_t lbfgs_param;
380  lbfgs_param.m = m_m;
381  lbfgs_param.max_linesearch = m_max_linesearch;
382  lbfgs_param.linesearch = m_linesearch;
383  lbfgs_param.max_iterations = m_max_iterations;
384  lbfgs_param.delta = m_delta;
385  lbfgs_param.past = m_past;
386  lbfgs_param.epsilon = m_epsilon;
387  lbfgs_param.min_step = m_min_step;
388  lbfgs_param.max_step = m_max_step;
389  lbfgs_param.ftol = m_ftol;
390  lbfgs_param.wolfe = m_wolfe;
391  lbfgs_param.gtol = m_gtol;
392  lbfgs_param.xtol = m_xtol;
393  lbfgs_param.orthantwise_c = m_orthantwise_c;
395  lbfgs_param.orthantwise_end = m_orthantwise_end;
396 
397  float64_t nlml_opt=0;
398  void * obj_prt = static_cast<void *>(this);
399  lbfgs(m_alpha.vlen, m_alpha.vector, &nlml_opt,
400  CKLInferenceMethod::evaluate,
401  NULL, obj_prt, &lbfgs_param);
402 
403  return nlml_opt;
404 }
405 
407 {
408  REQUIRE(!strcmp(param->m_name, "scale"), "Can't compute derivative of "
409  "the nagative log marginal likelihood wrt %s.%s parameter\n",
410  get_name(), param->m_name)
411 
412  Map<MatrixXd> eigen_K(m_ktrtr.matrix, m_ktrtr.num_rows, m_ktrtr.num_cols);
413  // compute derivative K wrt scale
414  MatrixXd eigen_dK=eigen_K*m_scale*2.0;
415 
416  SGVector<float64_t> result(1);
417 
418  result[0]=get_derivative_related_cov(eigen_dK);
419 
420  return result;
421 }
422 
424 {
425  SGVector<float64_t> result;
426 
427  if (param->m_datatype.m_ctype==CT_VECTOR ||
428  param->m_datatype.m_ctype==CT_SGVECTOR)
429  {
431  "Length of the parameter %s should not be NULL\n", param->m_name)
432  result=SGVector<float64_t>(*(param->m_datatype.m_length_y));
433  }
434  else
435  {
436  result=SGVector<float64_t>(1);
437  }
438 
439  for (index_t i=0; i<result.vlen; i++)
440  {
442 
443  //dK = feval(covfunc{:},hyper,x,j);
444  if (result.vlen==1)
445  dK=m_kernel->get_parameter_gradient(param);
446  else
447  dK=m_kernel->get_parameter_gradient(param, i);
448 
449  Map<MatrixXd> eigen_dK(dK.matrix, dK.num_cols, dK.num_rows);
450 
451  result[i]=get_derivative_related_cov(eigen_dK*CMath::sq(m_scale));
452  }
453 
454  return result;
455 }
456 
458 {
460  update();
461 
462  return SGMatrix<float64_t>(m_L);
463 }
464 
465 } /* namespace shogun */
466 
467 #endif /* HAVE_EIGEN3 */
virtual const char * get_name() const =0
virtual void set_model(CLikelihoodModel *mod)
static float64_t dot(const bool *v1, const bool *v2, int32_t n)
compute dot product between v1 and v2 (blas optimized)
Definition: SGVector.h:345
virtual void update_parameter_hash()
Definition: SGObject.cpp:196
virtual SGVector< float64_t > get_derivative_wrt_likelihood_model(const TParameter *param)
virtual void update_alpha()=0
SGVector< float64_t > m_alpha
virtual SGMatrix< float64_t > get_cholesky()
virtual SGVector< float64_t > get_first_derivative_wrt_hyperparameter(const TParameter *param) const =0
int32_t lbfgs(int32_t n, float64_t *x, float64_t *ptr_fx, lbfgs_evaluate_t proc_evaluate, lbfgs_progress_t proc_progress, void *instance, lbfgs_parameter_t *_param, lbfgs_adjust_step_t proc_adjust_step)
Definition: lbfgs.cpp:207
The Inference Method base class.
virtual void set_exp_factor(float64_t exp_factor)
int32_t index_t
Definition: common.h:62
The class Labels models labels, i.e. class assignments of objects.
Definition: Labels.h:43
static T sum(T *vec, int32_t len)
return sum(vec)
Definition: SGVector.h:507
The variational Gaussian Likelihood base class. The variational distribution is Gaussian.
static T sq(T x)
x^2
Definition: Math.h:324
float64_t orthantwise_c
Definition: lbfgs.h:309
parameter struct
Definition: Parameter.h:32
virtual void update_approx_cov()=0
#define SG_ERROR(...)
Definition: SGIO.h:130
#define REQUIRE(x,...)
Definition: SGIO.h:207
virtual float64_t get_derivative_related_cov(Eigen::MatrixXd eigen_dK)=0
virtual SGVector< float64_t > get_derivative_wrt_mean(const TParameter *param)
An abstract class of the mean function.
Definition: MeanFunction.h:28
virtual const char * get_name() const
virtual void set_variational_distribution(SGVector< float64_t > mu, SGVector< float64_t > s2, const CLabels *lab)
static const float64_t epsilon
Definition: libbmrm.cpp:24
SGMatrix< float64_t > m_Sigma
TSGDataType m_datatype
Definition: Parameter.h:165
SGMatrix< float64_t > m_L
virtual float64_t get_negative_log_marginal_likelihood()
virtual void set_min_coeff_kernel(float64_t min_coeff_kernel)
virtual void check_variational_likelihood(CLikelihoodModel *mod) const
virtual float64_t lbfgs_optimization()
double float64_t
Definition: common.h:50
virtual SGVector< float64_t > get_derivative_wrt_inference_method(const TParameter *param)
virtual void set_max_attempt(index_t max_attempt)
virtual SGVector< float64_t > get_posterior_mean()
index_t num_rows
Definition: SGMatrix.h:298
virtual SGMatrix< float64_t > get_posterior_covariance()
index_t num_cols
Definition: SGMatrix.h:300
virtual Eigen::LDLT< Eigen::MatrixXd > update_init_helper()
The KL approximation inference method class.
index_t * m_length_y
Definition: DataType.h:78
virtual void set_lbfgs_parameters(int m=100, int max_linesearch=1000, int linesearch=LBFGS_LINESEARCH_DEFAULT, int max_iterations=1000, float64_t delta=0.0, int past=0, float64_t epsilon=1e-5, float64_t min_step=1e-20, float64_t max_step=1e+20, float64_t ftol=1e-4, float64_t wolfe=0.9, float64_t gtol=0.9, float64_t xtol=1e-16, float64_t orthantwise_c=0.0, int orthantwise_start=0, int orthantwise_end=1)
virtual void lbfgs_precompute()=0
virtual float64_t get_nlml_wrt_parameters()
virtual SGVector< float64_t > get_parameter_derivative(const CFeatures *features, const TParameter *param, index_t index=-1)
Definition: MeanFunction.h:52
EContainerType m_ctype
Definition: DataType.h:71
#define SG_DEBUG(...)
Definition: SGIO.h:108
virtual void set_noise_factor(float64_t noise_factor)
virtual SGVector< float64_t > get_derivative_wrt_kernel(const TParameter *param)
The class Features is the base class of all feature objects.
Definition: Features.h:68
virtual void update_chol()=0
virtual SGMatrix< float64_t > get_parameter_gradient(const TParameter *param, index_t index=-1)
Definition: Kernel.h:732
The Variational Likelihood base class.
SGVector< float64_t > m_mu
SGVector< float64_t > m_s2
The Kernel base class.
Definition: Kernel.h:153
virtual void get_gradient_of_nlml_wrt_parameters(SGVector< float64_t > gradient)=0
virtual CVariationalGaussianLikelihood * get_variational_likelihood() const
#define SG_ADD(...)
Definition: SGObject.h:67
virtual bool supports_derivative_wrt_hyperparameter() const =0
virtual void update_deriv()=0
virtual void set_model(CLikelihoodModel *mod)
#define delta
Definition: sfa.cpp:23
virtual bool parameter_hash_changed()
Definition: SGObject.cpp:209
The Likelihood model base class.
SGMatrix< float64_t > m_ktrtr
virtual float64_t get_negative_log_marginal_likelihood_helper()=0
CLikelihoodModel * m_model
index_t vlen
Definition: SGVector.h:707

SHOGUN Machine Learning Toolbox - Documentation