SHOGUN 3.2.1
KLDualInferenceMethod.cpp
Go to the documentation of this file.
/*
 * Copyright (c) The Shogun Machine Learning Toolbox
 * Written (w) 2014 Wu Lin
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 * list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * The views and conclusions contained in the software and documentation are those
 * of the authors and should not be interpreted as representing official policies,
 * either expressed or implied, of the Shogun Development Team.
 *
 * Code adapted from
 * http://hannes.nickisch.org/code/approxXX.tar.gz
 * and Gaussian Process Machine Learning Toolbox
 * http://www.gaussianprocess.org/gpml/code/matlab/doc/
 * and the reference paper is
 * Mohammad Emtiyaz Khan, Aleksandr Y. Aravkin, Michael P. Friedlander, Matthias Seeger
 * Fast Dual Variational Inference for Non-Conjugate Latent Gaussian Models. ICML 2013
 *
 * This code is specifically adapted from functions in approxKL.m and infKL.m
 */
#include <shogun/machine/gp/KLDualInferenceMethod.h>

#ifdef HAVE_EIGEN3
#include <shogun/machine/gp/MatrixOperations.h>
#include <shogun/machine/gp/DualVariationalGaussianLikelihood.h>
#include <shogun/labels/BinaryLabels.h>
#include <shogun/mathematics/Math.h>
#include <shogun/mathematics/eigen3.h>
#include <shogun/optimization/lbfgs/lbfgs.h>

using namespace Eigen;

namespace shogun
{

CKLDualInferenceMethod::CKLDualInferenceMethod() : CKLInferenceMethod()
{
	init();
}

CKLDualInferenceMethod::CKLDualInferenceMethod(CKernel* kern,
	CFeatures* feat, CMeanFunction* m, CLabels* lab, CLikelihoodModel* mod)
	: CKLInferenceMethod(kern, feat, m, lab, mod)
{
	init();
}

SGVector<float64_t> CKLDualInferenceMethod::get_alpha()
{
	if (parameter_hash_changed())
		update();

	SGVector<float64_t> result(m_alpha);
	return result;
}

CKLDualInferenceMethod::~CKLDualInferenceMethod()
{
}

void CKLDualInferenceMethod::check_dual_inference(CLikelihoodModel* mod) const
{
	CDualVariationalGaussianLikelihood* lik=dynamic_cast<CDualVariationalGaussianLikelihood*>(mod);
	REQUIRE(lik,
		"The provided likelihood model is not a variational dual Likelihood model.\n");
}

void CKLDualInferenceMethod::set_model(CLikelihoodModel* mod)
{
	check_dual_inference(mod);
	CKLInferenceMethod::set_model(mod);
}

CDualVariationalGaussianLikelihood* CKLDualInferenceMethod::get_dual_variational_likelihood() const
{
	check_dual_inference(m_model);
	CDualVariationalGaussianLikelihood* lik=dynamic_cast<CDualVariationalGaussianLikelihood*>(m_model);
	return lik;
}

void CKLDualInferenceMethod::init()
{
	SG_ADD(&m_W, "W",
		"noise matrix W",
		MS_NOT_AVAILABLE);
	SG_ADD(&m_sW, "sW",
		"Square root of noise matrix W",
		MS_NOT_AVAILABLE);
	SG_ADD(&m_dv, "dv",
		"the gradient of the variational expectation wrt sigma2",
		MS_NOT_AVAILABLE);
	SG_ADD(&m_df, "df",
		"the gradient of the variational expectation wrt mu",
		MS_NOT_AVAILABLE);
	SG_ADD(&m_is_dual_valid, "is_dual_valid",
		"whether the lambda (m_W) is valid or not",
		MS_NOT_AVAILABLE);
	m_is_dual_valid=false;
}

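/* Precompute all quantities that depend on the current dual variable
 * lambda (stored in m_W). Following Khan et al. (ICML 2013), alpha is the
 * negated dual mean parameter, L=chol(I+diag(sW)*K*diag(sW)) with
 * sW=sqrt(W), s2=diag(Sigma) with Sigma=inv(inv(K)+diag(W)), and
 * mu=K*alpha+m. If lambda leaves the valid dual domain, the flag
 * m_is_dual_valid is cleared and the computation is skipped. */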
void CKLDualInferenceMethod::lbfgs_precompute()
{
	Map<MatrixXd> eigen_K(m_ktrtr.matrix, m_ktrtr.num_rows, m_ktrtr.num_cols);
	CDualVariationalGaussianLikelihood* lik=get_dual_variational_likelihood();
	Map<VectorXd> eigen_W(m_W.vector, m_W.vlen);

	lik->set_dual_parameters(m_W, m_labels);
	m_is_dual_valid=lik->dual_parameters_valid();

	if (!m_is_dual_valid)
		return;

	//construct alpha
	m_alpha=lik->get_mu_dual_parameter();
	Map<VectorXd> eigen_alpha(m_alpha.vector, m_alpha.vlen);
	eigen_alpha=-eigen_alpha;

	Map<VectorXd> eigen_sW(m_sW.vector, m_sW.vlen);
	eigen_sW=eigen_W.array().sqrt().matrix();

	//L=chol(I+diag(sW)*(K*scale^2)*diag(sW))
	m_L=CMatrixOperations::get_choleksy(m_W, m_sW, m_ktrtr, m_scale);
	Map<MatrixXd> eigen_L(m_L.matrix, m_L.num_rows, m_L.num_cols);

	//solve L'*V=diag(sW)*K
	Map<MatrixXd> eigen_V(m_V.matrix, m_V.num_rows, m_V.num_cols);
	eigen_V=eigen_L.triangularView<Upper>().adjoint().solve(eigen_sW.asDiagonal()*eigen_K*CMath::sq(m_scale));
	Map<VectorXd> eigen_s2(m_s2.vector, m_s2.vlen);
	//Sigma=inv(inv(K)+diag(W))=K-K*diag(sW)*inv(L)'*inv(L)*diag(sW)*K
	//v=abs(diag(Sigma))
	eigen_s2=(eigen_K.diagonal().array()*CMath::sq(m_scale)-(eigen_V.array().pow(2).colwise().sum().transpose())).abs().matrix();

	//construct mu
	SGVector<float64_t> mean=m_mean->get_mean_vector(m_features);
	Map<VectorXd> eigen_mean(mean.vector, mean.vlen);
	Map<VectorXd> eigen_mu(m_mu.vector, m_mu.vlen);
	//mu=K*alpha+m
	eigen_mu=eigen_K*CMath::sq(m_scale)*eigen_alpha+eigen_mean;
}

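/* Dual objective minimized by L-BFGS: the likelihood's dual objective
 * value plus the Gaussian terms 0.5*alpha'*(mu-m)+m'*alpha-
 * sum(log(diag(L))). Returns infinity for an invalid lambda so that the
 * line search backtracks into the feasible region. */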
float64_t CKLDualInferenceMethod::get_dual_objective_wrt_parameters()
{
	if (!m_is_dual_valid)
		return CMath::INFTY;

	SGVector<float64_t> mean=m_mean->get_mean_vector(m_features);
	Map<VectorXd> eigen_mean(mean.vector, mean.vlen);
	Map<VectorXd> eigen_mu(m_mu.vector, m_mu.vlen);
	Map<VectorXd> eigen_alpha(m_alpha.vector, m_alpha.vlen);
	Map<MatrixXd> eigen_L(m_L.matrix, m_L.num_rows, m_L.num_cols);

	CDualVariationalGaussianLikelihood* lik=get_dual_variational_likelihood();
	float64_t a=SGVector<float64_t>::sum(lik->get_dual_objective_value());

	float64_t result=0.5*eigen_alpha.dot(eigen_mu-eigen_mean)+a;
	result+=eigen_mean.dot(eigen_alpha);
	result-=eigen_L.diagonal().array().log().sum();

	return result;
}

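/* Gradient of the dual objective wrt lambda (m_W):
 * gradient=-mu-0.5*s2+d(dual objective)/d(lambda). */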
void CKLDualInferenceMethod::get_gradient_of_dual_objective_wrt_parameters(SGVector<float64_t> gradient)
{
	REQUIRE(gradient.vlen==m_alpha.vlen,
		"The length of gradients (%d) should be the same as the length of parameters (%d)\n",
		gradient.vlen, m_alpha.vlen);

	if (!m_is_dual_valid)
		return;

	Map<VectorXd> eigen_gradient(gradient.vector, gradient.vlen);

	CDualVariationalGaussianLikelihood* lik=get_dual_variational_likelihood();

	TParameter* lambda_param=lik->m_parameters->get_parameter("lambda");
	SGVector<float64_t> d_lambda=lik->get_dual_first_derivative(lambda_param);
	Map<VectorXd> eigen_d_lambda(d_lambda.vector, d_lambda.vlen);

	Map<VectorXd> eigen_mu(m_mu.vector, m_mu.vlen);
	Map<VectorXd> eigen_s2(m_s2.vector, m_s2.vlen);

	eigen_gradient=-eigen_mu-0.5*eigen_s2+eigen_d_lambda;
}

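/* Negative log marginal likelihood (the KL bound) for a given alpha, mu
 * and Cholesky factor L: nlZ=-a+sum(log(diag(L)))+0.5*(-n+alpha'*(mu-m)+
 * trace(V*inv(K))), where a sums the variational expectations of the
 * likelihood and the trace is accumulated column by column from inv(L). */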
float64_t CKLDualInferenceMethod::get_nlml_wrapper(SGVector<float64_t> alpha, SGVector<float64_t> mu, SGMatrix<float64_t> L)
{
	Map<MatrixXd> eigen_L(L.matrix, L.num_rows, L.num_cols);
	Map<VectorXd> eigen_alpha(alpha.vector, alpha.vlen);
	Map<MatrixXd> eigen_K(m_ktrtr.matrix, m_ktrtr.num_rows, m_ktrtr.num_cols);
	//get mean vector and create eigen representation of it
	SGVector<float64_t> mean=m_mean->get_mean_vector(m_features);
	Map<VectorXd> eigen_mu(mu.vector, mu.vlen);
	Map<VectorXd> eigen_mean(mean.vector, mean.vlen);

	CDualVariationalGaussianLikelihood* lik=get_dual_variational_likelihood();

	SGVector<float64_t> lab=((CBinaryLabels*)m_labels)->get_labels();
	Map<VectorXd> eigen_lab(lab.vector, lab.vlen);

	float64_t a=SGVector<float64_t>::sum(lik->get_variational_expection());

	float64_t trace=0;
	//L_inv=L\eye(n);
	//trace(L_inv'*L_inv) %trace(V*inv(K))
	MatrixXd eigen_t=eigen_L.triangularView<Upper>().adjoint().solve(MatrixXd::Identity(eigen_L.rows(),eigen_L.cols()));

	for(index_t idx=0; idx<eigen_t.rows(); idx++)
		trace+=(eigen_t.col(idx).array().pow(2)).sum();

	//nlZ = -a -logdet(V*inv(K))/2 -n/2 +(alpha'*K*alpha)/2 +trace(V*inv(K))/2;
	float64_t result=-a+eigen_L.diagonal().array().log().sum();

	result+=0.5*(-eigen_K.rows()+eigen_alpha.dot(eigen_mu-eigen_mean)+trace);
	return result;
}

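/* Evaluate the KL bound at the current posterior approximation
 * (m_alpha, m_mu, m_L). */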
float64_t CKLDualInferenceMethod::get_negative_log_marginal_likelihood_helper()
{
	CDualVariationalGaussianLikelihood* lik=get_dual_variational_likelihood();
	lik->set_variational_distribution(m_mu, m_s2, m_labels);
	return get_nlml_wrapper(m_alpha, m_mu, m_L);
}

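/* Derivative of the bound wrt a kernel hyperparameter, given dK:
 * result=alpha'*dK*(alpha/2-df)-z'*dv, where A=I-K*diag(sW)*inv(L)*
 * inv(L')*diag(sW) and z collects the induced change of the marginal
 * variances; df and dv are the derivatives of the variational expectation
 * wrt mu and sigma2 cached by update_alpha(). */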
float64_t CKLDualInferenceMethod::get_derivative_related_cov(MatrixXd eigen_dK)
{
	Map<MatrixXd> eigen_K(m_ktrtr.matrix, m_ktrtr.num_rows, m_ktrtr.num_cols);
	Map<VectorXd> eigen_W(m_W.vector, m_W.vlen);
	Map<MatrixXd> eigen_L(m_L.matrix, m_L.num_rows, m_L.num_cols);
	Map<VectorXd> eigen_sW(m_sW.vector, m_sW.vlen);
	Map<MatrixXd> eigen_Sigma(m_Sigma.matrix, m_Sigma.num_rows, m_Sigma.num_cols);
	Map<VectorXd> eigen_alpha(m_alpha.vector, m_alpha.vlen);

	Map<VectorXd> eigen_dv(m_dv.vector, m_dv.vlen);
	Map<VectorXd> eigen_df(m_df.vector, m_df.vlen);

	index_t len=m_W.vlen;
	//U=inv(L')*diag(sW)
	MatrixXd eigen_U=eigen_L.triangularView<Upper>().adjoint().solve(MatrixXd(eigen_sW.asDiagonal()));
	//A=I-K*diag(sW)*inv(L)*inv(L')*diag(sW)
	Map<MatrixXd> eigen_V(m_V.matrix, m_V.num_rows, m_V.num_cols);
	MatrixXd eigen_A=MatrixXd::Identity(len, len)-eigen_V.transpose()*eigen_U;

	//AdK = A*dK;
	MatrixXd AdK=eigen_A*eigen_dK;

	//z = diag(AdK) + sum(A.*AdK,2) - sum(A'.*AdK,1)';
	VectorXd z=AdK.diagonal()+(eigen_A.array()*AdK.array()).rowwise().sum().matrix()
		-(eigen_A.transpose().array()*AdK.array()).colwise().sum().transpose().matrix();

	float64_t result=eigen_alpha.dot(eigen_dK*(eigen_alpha/2.0-eigen_df))-z.dot(eigen_dv);

	return result;
}

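/* (Re)start the dual optimization. If a previous solution exists, its
 * bound is compared against the bound of the default initialization
 * (lambda=0.5) and the better starting point is kept; otherwise all
 * buffers are (re)allocated. L-BFGS is then run on lambda, and the
 * derivatives of the variational expectation are cached for later
 * hyperparameter gradients. */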
void CKLDualInferenceMethod::update_alpha()
{
	float64_t nlml_new=0;
	float64_t nlml_def=0;

	Map<MatrixXd> eigen_K(m_ktrtr.matrix, m_ktrtr.num_rows, m_ktrtr.num_cols);
	CDualVariationalGaussianLikelihood* lik=get_dual_variational_likelihood();

	if (m_alpha.vlen == m_labels->get_num_labels())
	{
		nlml_new=get_negative_log_marginal_likelihood_helper();
		index_t len=m_alpha.vlen;
		SGVector<float64_t> W_tmp(len);
		Map<VectorXd> eigen_W(W_tmp.vector, W_tmp.vlen);
		eigen_W.fill(0.5);
		SGVector<float64_t> sW_tmp(len);
		Map<VectorXd> eigen_sW(sW_tmp.vector, sW_tmp.vlen);
		eigen_sW=eigen_W.array().sqrt().matrix();
		SGMatrix<float64_t> L_tmp=CMatrixOperations::get_choleksy(W_tmp, sW_tmp, m_ktrtr, m_scale);
		Map<MatrixXd> eigen_L(L_tmp.matrix, L_tmp.num_rows, L_tmp.num_cols);

		lik->set_dual_parameters(W_tmp, m_labels);

		//construct alpha
		SGVector<float64_t> alpha_tmp=lik->get_mu_dual_parameter();
		Map<VectorXd> eigen_alpha(alpha_tmp.vector, alpha_tmp.vlen);
		eigen_alpha=-eigen_alpha;
		//construct mu
		SGVector<float64_t> mean=m_mean->get_mean_vector(m_features);
		Map<VectorXd> eigen_mean(mean.vector, mean.vlen);
		SGVector<float64_t> mu_tmp(len);
		Map<VectorXd> eigen_mu(mu_tmp.vector, mu_tmp.vlen);
		//mu=K*alpha+m
		eigen_mu=eigen_K*CMath::sq(m_scale)*eigen_alpha+eigen_mean;
		//construct s2
		MatrixXd eigen_V=eigen_L.triangularView<Upper>().adjoint().solve(eigen_sW.asDiagonal()*eigen_K*CMath::sq(m_scale));
		SGVector<float64_t> s2_tmp(len);
		Map<VectorXd> eigen_s2(s2_tmp.vector, s2_tmp.vlen);
		eigen_s2=(eigen_K.diagonal().array()*CMath::sq(m_scale)-(eigen_V.array().pow(2).colwise().sum().transpose())).abs().matrix();

		lik->set_variational_distribution(mu_tmp, s2_tmp, m_labels);

		nlml_def=get_nlml_wrapper(alpha_tmp, mu_tmp, L_tmp);

		if (nlml_new<=nlml_def)
		{
			lik->set_dual_parameters(m_W, m_labels);
			lik->set_variational_distribution(m_mu, m_s2, m_labels);
		}
	}

	if (m_alpha.vlen != m_labels->get_num_labels() || nlml_def<nlml_new)
	{
		m_alpha=SGVector<float64_t>(m_labels->get_num_labels());
		m_alpha.zero();

		index_t len=m_alpha.vlen;

		m_W=SGVector<float64_t>(len);
		for (index_t i=0; i<m_W.vlen; i++)
			m_W[i]=0.5;

		lik->set_dual_parameters(m_W, m_labels);
		m_sW=SGVector<float64_t>(len);
		m_mu=SGVector<float64_t>(len);
		m_s2=SGVector<float64_t>(len);
		m_Sigma=SGMatrix<float64_t>(len, len);
		m_V=SGMatrix<float64_t>(len, len);
	}

	nlml_new=lbfgs_optimization();
	lik->set_variational_distribution(m_mu, m_s2, m_labels);
	TParameter* s2_param=lik->m_parameters->get_parameter("sigma2");
	m_dv=lik->get_variational_first_derivative(s2_param);
	TParameter* mu_param=lik->m_parameters->get_parameter("mu");
	m_df=lik->get_variational_first_derivative(mu_param);
}

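/* L-BFGS callback that shrinks a proposed step so the updated dual
 * variable stays inside the feasible domain of the dual likelihood. */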
float64_t CKLDualInferenceMethod::adjust_step(void *obj,
	const float64_t *parameters, const float64_t *direction,
	const int dim, const float64_t step)
{
	/* Note that parameters = parameters_pre_iter - step * gradient_pre_iter */
	CKLDualInferenceMethod * obj_prt
		= static_cast<CKLDualInferenceMethod *>(obj);

	ASSERT(obj_prt != NULL);

	float64_t *non_const_direction=const_cast<float64_t *>(direction);
	SGVector<float64_t> sg_direction(non_const_direction, dim, false);

	CDualVariationalGaussianLikelihood* lik=obj_prt->get_dual_variational_likelihood();

	float64_t adjust_stp=lik->adjust_step_wrt_dual_parameter(sg_direction, step);
	return adjust_stp;
}

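/* L-BFGS callback: refresh the quantities that depend on the current
 * lambda, then return the dual objective value and write its gradient. */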
float64_t CKLDualInferenceMethod::evaluate(void *obj, const float64_t *parameters,
	float64_t *gradient, const int dim, const float64_t step)
{
	/* Note that parameters = parameters_pre_iter - step * gradient_pre_iter */
	CKLDualInferenceMethod * obj_prt
		= static_cast<CKLDualInferenceMethod *>(obj);

	ASSERT(obj_prt != NULL);

	obj_prt->lbfgs_precompute();
	float64_t nlml=obj_prt->get_dual_objective_wrt_parameters();

	SGVector<float64_t> sg_gradient(gradient, dim, false);
	Map<VectorXd> eigen_g(sg_gradient.vector, sg_gradient.vlen);
	obj_prt->get_gradient_of_dual_objective_wrt_parameters(sg_gradient);

	return nlml;
}

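/* Run L-BFGS over the dual variable m_W using the optimizer settings
 * configured on this inference method; returns the optimal objective. */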
float64_t CKLDualInferenceMethod::lbfgs_optimization()
{
	lbfgs_parameter_t lbfgs_param;
	lbfgs_param.m = m_m;
	lbfgs_param.max_linesearch = m_max_linesearch;
	lbfgs_param.linesearch = m_linesearch;
	lbfgs_param.max_iterations = m_max_iterations;
	lbfgs_param.delta = m_delta;
	lbfgs_param.past = m_past;
	lbfgs_param.epsilon = m_epsilon;
	lbfgs_param.min_step = m_min_step;
	lbfgs_param.max_step = m_max_step;
	lbfgs_param.ftol = m_ftol;
	lbfgs_param.wolfe = m_wolfe;
	lbfgs_param.gtol = m_gtol;
	lbfgs_param.xtol = m_xtol;
	lbfgs_param.orthantwise_c = m_orthantwise_c;
	lbfgs_param.orthantwise_start = m_orthantwise_start;
	lbfgs_param.orthantwise_end = m_orthantwise_end;

	float64_t nlml_opt=0;
	void * obj_prt = static_cast<void *>(this);

	Map<VectorXd> eigen_W(m_W.vector, m_W.vlen);
	lbfgs(m_W.vlen, m_W.vector, &nlml_opt,
		CKLDualInferenceMethod::evaluate,
		NULL, obj_prt, &lbfgs_param, CKLDualInferenceMethod::adjust_step);
	return nlml_opt;
}

SGVector<float64_t> CKLDualInferenceMethod::get_diagonal_vector()
{
	if (parameter_hash_changed())
		update();

	return SGVector<float64_t>(m_sW);
}

void CKLDualInferenceMethod::update_deriv()
{
	/* get_derivative_related_cov(MatrixXd eigen_dK) does a similar job;
	 * therefore, this function body is empty
	 */
}

void CKLDualInferenceMethod::update_chol()
{
	/* L is automatically updated when update_alpha is called;
	 * therefore, this function body is empty
	 */
}

void CKLDualInferenceMethod::update_approx_cov()
{
	m_Sigma=CMatrixOperations::get_inverse(m_L, m_ktrtr, m_sW, m_V, m_scale);
}

} /* namespace shogun */

#endif /* HAVE_EIGEN3 */
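A minimal usage sketch for this inference method (not part of the file above). It assumes a concrete dual variational likelihood such as CLogitDVGLikelihood is available in this Shogun version; header paths and class names follow the Shogun 3.2 API and may need adjusting to your build.

#include <shogun/base/init.h>
#include <shogun/features/DenseFeatures.h>
#include <shogun/labels/BinaryLabels.h>
#include <shogun/kernel/GaussianKernel.h>
#include <shogun/machine/gp/ZeroMean.h>
#include <shogun/machine/gp/LogitDVGLikelihood.h>
#include <shogun/machine/gp/KLDualInferenceMethod.h>

using namespace shogun;

int main()
{
	init_shogun_with_defaults();

	// toy binary data: 2 features x 4 examples, labels in {-1,+1}
	SGMatrix<float64_t> X(2, 4);
	X.set_const(0.0);
	X(0,0)=-2; X(0,1)=-1; X(0,2)=1; X(0,3)=2;
	SGVector<float64_t> y(4);
	y[0]=-1; y[1]=-1; y[2]=1; y[3]=1;

	CDenseFeatures<float64_t>* feats=new CDenseFeatures<float64_t>(X);
	CBinaryLabels* labels=new CBinaryLabels(y);
	CGaussianKernel* kernel=new CGaussianKernel(10, 2.0);
	CZeroMean* mean=new CZeroMean();
	CLogitDVGLikelihood* lik=new CLogitDVGLikelihood();

	// dual KL approximation: the dual variable lambda (m_W) is optimized
	// internally with L-BFGS when inference results are requested
	CKLDualInferenceMethod* inf=new CKLDualInferenceMethod(
		kernel, feats, mean, labels, lik);
	SG_REF(inf);

	float64_t nlml=inf->get_negative_log_marginal_likelihood();
	SG_SPRINT("negative log marginal likelihood=%f\n", nlml);

	SG_UNREF(inf);
	exit_shogun();
	return 0;
}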
SHOGUN Machine Learning Toolbox - Project Documentation