diff --git a/stan/math/opencl/kernels/multinomial_logit_glm_lpmf.hpp b/stan/math/opencl/kernels/multinomial_logit_glm_lpmf.hpp new file mode 100644 index 00000000000..20d512642c1 --- /dev/null +++ b/stan/math/opencl/kernels/multinomial_logit_glm_lpmf.hpp @@ -0,0 +1,141 @@ +#ifndef STAN_MATH_OPENCL_KERNELS_MULTINOMIAL_LOGIT_GLM_LPMF_HPP +#define STAN_MATH_OPENCL_KERNELS_MULTINOMIAL_LOGIT_GLM_LPMF_HPP +#ifdef STAN_OPENCL + +#include +#include + +namespace stan { +namespace math { +namespace opencl_kernels { + +// \cond +static constexpr const char* multinomial_logit_glm_kernel_code = STRINGIFY( + // \endcond + /** \ingroup opencl_kernels + * GPU kernel for the Generalized Linear Model (GLM) with multinomial + * distribution and softmax (logit) link function. + * + * Must be run with at least N_instances threads and local size LOCAL_SIZE_. + * Each thread handles one instance n = gid. + * + * The kernel performs two passes over the K classes for instance n: + * 1. find max(eta[n,:]) for numerical stability, + * 2. accumulate sum_exp, S_n, and logp using shifted eta + * (eta[n,k] - max) to avoid catastrophic cancellation; skips + * skips y_nk=0 terms to implement the 0*log(0)=0 convention; + * if need_delta, stash exp(eta[n,k] - max) into delta_global. + * A final loop normalizes delta (if need_delta) and subtracts + * lgamma(y_nk+1) terms (if need_logp_gamma), reading only y_global + * and delta_global, without re-reading x_beta_global or alpha_global. + * + * @param[out] logp_global partial logp sums, one per work group + * @param[out] delta_global residual matrix N_instances x N_classes + * (col-major) + * @param[in] y_global outcome counts, N_instances x N_classes (col-major) + * @param[in] x_beta_global product x*beta, N_instances x N_classes + * (col-major) + * @param[in] alpha_global intercepts: K values if is_alpha_vector, else + * N_instances x N_classes (col-major) + * @param N_instances number of instances + * @param N_classes number of outcome classes + * @param is_alpha_vector 1 if alpha is shared 1xK row, 0 if NxK + * @param need_delta 1 if delta_global should be computed and written + * @param need_logp_gamma 1 if lgamma terms should be included in logp + */ + __kernel void multinomial_logit_glm( + __global double* logp_global, __global double* delta_global, + const __global int* y_global, const __global double* x_beta_global, + const __global double* alpha_global, const int N_instances, + const int N_classes, const int is_alpha_vector, const int need_delta, + const int need_logp_gamma) { + const int gid = get_global_id(0); + const int lid = get_local_id(0); + const int lsize = get_local_size(0); + const int wg_id = get_group_id(0); + + __local double local_storage[LOCAL_SIZE_]; + + double logp = 0; + if (gid < N_instances) { + // Pass 1: row-wise max of eta for numerical stability. + double eta_max = -INFINITY; + for (int k = 0; k < N_classes; k++) { + int nk = k * N_instances + gid; + int alpha_idx = is_alpha_vector ? k : nk; + double eta_k = x_beta_global[nk] + alpha_global[alpha_idx]; + if (eta_k > eta_max) + eta_max = eta_k; + } + + // Pass 2: sum_exp, S_n, logp; if need_delta stash exp_k in + // delta_global. + double sum_exp = 0; + int S_n = 0; + for (int k = 0; k < N_classes; k++) { + int nk = k * N_instances + gid; + int alpha_idx = is_alpha_vector ? 
k : nk; + double shifted_eta_k + = x_beta_global[nk] + alpha_global[alpha_idx] - eta_max; + double exp_k = exp(shifted_eta_k); + sum_exp += exp_k; + int y_nk = y_global[nk]; + S_n += y_nk; + if (y_nk != 0) + logp += y_nk * shifted_eta_k; + if (need_delta) + delta_global[nk] = exp_k; + } + logp -= S_n * log(sum_exp); + + if (need_logp_gamma) + logp += lgamma(S_n + 1.0); + + // Normalize delta and/or subtract lgamma(y_nk+1) in one pass. + if (need_delta || need_logp_gamma) { + for (int k = 0; k < N_classes; k++) { + int nk = k * N_instances + gid; + int y_nk = y_global[nk]; + if (need_logp_gamma) + logp -= lgamma(y_nk + 1.0); + if (need_delta) + delta_global[nk] = y_nk - S_n * delta_global[nk] / sum_exp; + } + } + } + + // Work-group reduction of logp. + local_storage[lid] = logp; + barrier(CLK_LOCAL_MEM_FENCE); + for (int step = lsize / REDUCTION_STEP_SIZE; step > 0; + step /= REDUCTION_STEP_SIZE) { + if (lid < step) { + for (int i = 1; i < REDUCTION_STEP_SIZE; i++) { + local_storage[lid] += local_storage[lid + step * i]; + } + } + barrier(CLK_LOCAL_MEM_FENCE); + } + if (lid == 0) { + logp_global[wg_id] = local_storage[0]; + } + } + // \cond +); +// \endcond + +/** \ingroup opencl_kernels + * See the docs for \link kernels/multinomial_logit_glm_lpmf.hpp + * multinomial_logit_glm() \endlink + */ +const kernel_cl + multinomial_logit_glm("multinomial_logit_glm", + {multinomial_logit_glm_kernel_code}, + {{"REDUCTION_STEP_SIZE", 4}, {"LOCAL_SIZE_", 64}}); + +} // namespace opencl_kernels +} // namespace math +} // namespace stan +#endif +#endif diff --git a/stan/math/opencl/prim.hpp b/stan/math/opencl/prim.hpp index 4ae8ca1649c..2d3d2ef93ef 100644 --- a/stan/math/opencl/prim.hpp +++ b/stan/math/opencl/prim.hpp @@ -195,6 +195,7 @@ #include #include #include +#include #include #include #include diff --git a/stan/math/opencl/prim/multinomial_logit_glm_lpmf.hpp b/stan/math/opencl/prim/multinomial_logit_glm_lpmf.hpp new file mode 100644 index 00000000000..e521c929524 --- /dev/null +++ b/stan/math/opencl/prim/multinomial_logit_glm_lpmf.hpp @@ -0,0 +1,146 @@ +#ifndef STAN_MATH_OPENCL_PRIM_MULTINOMIAL_LOGIT_GLM_LPMF_HPP +#define STAN_MATH_OPENCL_PRIM_MULTINOMIAL_LOGIT_GLM_LPMF_HPP +#ifdef STAN_OPENCL + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include + +namespace stan { +namespace math { + +/** \ingroup opencl + * Returns the log PMF of the Generalized Linear Model (GLM) + * with multinomial distribution and softmax (logit) link function. + * This is an OpenCL overload of + * `prim/prob/multinomial_logit_glm_lpmf.hpp`. + * Alpha can be either a shared 1×K row vector or an N×K per-instance matrix. 
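+ *
+ * Rough usage sketch, mirroring the unit tests (x, alpha and beta below
+ * stand for host-side Eigen matrices of the documented shapes):
+ * @code
+ * std::vector<std::vector<int>> y = ...;   // N count vectors of length K
+ * matrix_cl<double> x_cl(x), alpha_cl(alpha), beta_cl(beta);
+ * double lp = multinomial_logit_glm_lpmf(y, x_cl, alpha_cl, beta_cl);
+ * @endcode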
+ * + * @tparam T_x type of the design matrix (N×M kernel expression) + * @tparam T_alpha type of the intercept (1×K or N×K kernel expression) + * @tparam T_beta type of the weight matrix (M×K kernel expression) + * @param y outcome count vectors: `y[n]` is a length-K vector of non-negative + * integer counts for instance n + * @param x design matrix (N×M) on OpenCL device + * @param alpha intercept: 1×K broadcast row or N×K per-instance matrix + * @param beta weight matrix (M×K) on OpenCL device + * @return log sum of multinomial log PMFs over all N instances + * @throw std::domain_error if any element of x or beta is infinite or NaN, + * or if alpha contains `+inf` or NaN (`-inf` forces the corresponding softmax + * probability to zero and is allowed) + * @throw std::domain_error if any count in y is negative + * @throw std::invalid_argument if container sizes mismatch + */ +template * = nullptr> +inline return_type_t multinomial_logit_glm_lpmf( + const std::vector>& y, const T_x& x, const T_alpha& alpha, + const T_beta& beta) { + using T_partials_return = partials_return_t; + static constexpr const char* function = "multinomial_logit_glm_lpmf"; + + const int N_instances = x.rows(); + const int N_classes = beta.cols(); + + check_size_match(function, "Rows of", "x", N_instances, "size of", "y", + y.size()); + check_size_match(function, "Columns of", "beta", N_classes, "columns of", + "alpha", alpha.cols()); + check_size_match(function, "Columns of", "x", x.cols(), "rows of", "beta", + beta.rows()); + + const int alpha_rows = alpha.rows(); + const bool is_alpha_vector = alpha_rows == 1; + if (!is_alpha_vector) { + check_size_match(function, "Rows of", "alpha", alpha_rows, "rows of", "x", + N_instances); + } + + if (N_instances == 0) { + return 0; + } + for (int n = 0; n < N_instances; ++n) { + check_size_match(function, "Size of outcome vector", y[n].size(), + "number of classes", N_classes); + check_nonnegative(function, "outcome counts", y[n]); + } + + if constexpr (!include_summand::value) { + return 0; + } + + // Flatten nested y into an N×K matrix for upload. 
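+  // Eigen::MatrixXi is column-major by default, so the uploaded buffer has
+  // the same [k * N_instances + n] layout the kernel uses to read y_global;
+  // x_beta and alpha are passed to the kernel in the same col-major layout.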
+ Eigen::MatrixXi y_mat(N_instances, N_classes); + for (int n = 0; n < N_instances; ++n) + for (int k = 0; k < N_classes; ++k) + y_mat(n, k) = y[n][k]; + matrix_cl y_cl(y_mat); + + const auto& x_val = eval(value_of(x)); + const auto& alpha_val = eval(value_of(alpha)); + const auto& beta_val = eval(value_of(beta)); + + matrix_cl x_beta_cl = x_val * beta_val; + + const int local_size + = opencl_kernels::multinomial_logit_glm.get_option("LOCAL_SIZE_"); + const int wgs = (N_instances + local_size - 1) / local_size; + + constexpr bool need_delta = is_any_autodiff_v; + + matrix_cl logp_cl(wgs, 1); + matrix_cl delta_cl(0, 0); + if constexpr (need_delta) + delta_cl = matrix_cl(N_instances, N_classes); + + try { + opencl_kernels::multinomial_logit_glm( + cl::NDRange(local_size * wgs), cl::NDRange(local_size), logp_cl, + delta_cl, y_cl, x_beta_cl, alpha_val, N_instances, N_classes, + is_alpha_vector, need_delta, !propto); + } catch (const cl::Error& e) { + check_opencl_error(function, e); + } + + T_partials_return logp = sum(from_matrix_cl(logp_cl)); + + if (!std::isfinite(logp)) { + check_cl(function, "Design matrix", x_val, "finite") = isfinite(x_val); + check_cl(function, "Intercept", alpha_val, "finite") = isfinite(alpha_val); + check_cl(function, "Weight matrix", beta_val, "finite") + = isfinite(beta_val); + } + + auto ops_partials = make_partials_propagator(x, alpha, beta); + if constexpr (need_delta) { + // dlogp/deta[n,k] = delta[n,k]; chain rule: dx = delta*beta^T, + // dalpha = delta (or colwise_sum when 1xK), dbeta = x^T*delta. + if constexpr (is_autodiff_v) + partials<0>(ops_partials) = delta_cl * transpose(beta_val); + if constexpr (is_autodiff_v) + partials<1>(ops_partials) + = is_alpha_vector ? colwise_sum(delta_cl) : delta_cl; + if constexpr (is_autodiff_v) + partials<2>(ops_partials) = transpose(x_val) * delta_cl; + } + return ops_partials.build(logp); +} + +} // namespace math +} // namespace stan + +#endif +#endif diff --git a/stan/math/prim/prob.hpp b/stan/math/prim/prob.hpp index 4fae64c1ace..671de48fdaa 100644 --- a/stan/math/prim/prob.hpp +++ b/stan/math/prim/prob.hpp @@ -179,6 +179,7 @@ #include #include #include +#include #include #include #include diff --git a/stan/math/prim/prob/multinomial_logit_glm_lpmf.hpp b/stan/math/prim/prob/multinomial_logit_glm_lpmf.hpp new file mode 100644 index 00000000000..0734a2c91de --- /dev/null +++ b/stan/math/prim/prob/multinomial_logit_glm_lpmf.hpp @@ -0,0 +1,196 @@ +#ifndef STAN_MATH_PRIM_PROB_MULTINOMIAL_LOGIT_GLM_LPMF_HPP +#define STAN_MATH_PRIM_PROB_MULTINOMIAL_LOGIT_GLM_LPMF_HPP + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace stan { +namespace math { + +/** \ingroup multivar_dists + * Returns the log PMF of the Generalized Linear Model (GLM) + * with multinomial distribution and softmax (logit) link function. + * Efficiently computes + * \f$\sum_n \mathrm{multinomial\_logit\_lpmf}(y_n \mid \eta_n)\f$ + * with \f$\eta_n = x_n \beta + \alpha_n\f$ and analytically derived gradients. + * + * Writing \f$S_n = \sum_k y_{nk}\f$ for the per-instance count total and + * \f$p_{nk} = \mathrm{softmax}(\eta_n)_k + * = \exp(\eta_{nk}) / \sum_{k'} \exp(\eta_{nk'})\f$ + * for the category probabilities, the log PMF is + * + * \f[ + * \log p(y\,|\,x,\alpha,\beta) = \sum_{n=1}^N \left[ + * \log\Gamma(S_n + 1) - \sum_{k=1}^K \log\Gamma(y_{nk} + 1) + * + \sum_{k=1}^K y_{nk}\,\log p_{nk} + * \right]. 
+ * \f] + * + * Defining the residual + * \f$\delta_{nk} = y_{nk} - S_n\,p_{nk}\f$ + * (so that \f$\partial\log p/\partial\eta_{nk} = \delta_{nk}\f$), the + * gradients are + * + * \f[ + * \frac{\partial\log p}{\partial\alpha_{nk}} = \delta_{nk}, + * \f] + * + * \f[ + * \frac{\partial\log p}{\partial\beta_{mk}} = \sum_n x_{nm}\,\delta_{nk}, + * \f] + * + * \f[ + * \frac{\partial\log p}{\partial x_{nm}} = \sum_k \delta_{nk}\,\beta_{mk}. + * \f] + * + * When \f$\alpha\f$ is broadcast (a single \f$1\times K\f$ row used for all + * instances), \f$\partial\log p/\partial\alpha_k = \sum_n \delta_{nk}\f$. + * + * @tparam T_x type of the design matrix (N x M) + * @tparam T_alpha type of the intercept; either a row vector (1 x K) broadcast + * over all instances, or a matrix (N x K) with per-instance intercepts. + * Analogous to a scalar vs vector alpha in univariate GLMs: the K dimension + * corresponds to the class (output) space, not the instance dimension. + * @tparam T_beta type of the weight matrix (M x K) + * @param y outcome count vectors: `y[n]` is a length-K vector of non-negative + * integer counts for instance n; the counts need not sum to a fixed total + * @param x design matrix (N x M) + * @param alpha intercept; row vector (1 x K) broadcast across all instances, + * or matrix (N x K) with one bias row per instance + * @param beta weight matrix (M x K) + * @return log sum of multinomial log PMFs over all N instances + * @throw std::domain_error if any element of x or beta is infinite or NaN, or + * if alpha contains `+inf` or NaN (`-inf` forces the corresponding softmax + * probability to zero and is allowed; if all classes have `-inf` alpha for + * some instance but that instance has nonzero counts, the result is undefined), + * or if any count in y is negative + * @throw std::invalid_argument if container sizes mismatch + */ +template * = nullptr, + require_matrix_t* = nullptr, + require_matrix_t* = nullptr> +inline return_type_t multinomial_logit_glm_lpmf( + const std::vector>& y, const T_x& x, const T_alpha& alpha, + const T_beta& beta) { + using T_partials_return = partials_return_t; + using Eigen::Array; + using Eigen::Dynamic; + using T_x_ref = ref_type_if_not_constant_t; + using T_alpha_ref = ref_type_if_not_constant_t; + using T_beta_ref = ref_type_if_not_constant_t; + constexpr int T_alpha_rows = T_alpha::RowsAtCompileTime; + constexpr bool need_delta = is_any_autodiff_v; + + const size_t N_instances = x.rows(); + const size_t N_classes = beta.cols(); + + static constexpr const char* function = "multinomial_logit_glm_lpmf"; + check_size_match(function, "Rows of outcome vectors", y.size(), + "number of instances", N_instances); + check_size_match(function, "Columns of intercept", alpha.cols(), + "number of classes", N_classes); + if constexpr (T_alpha_rows != 1) { + check_size_match(function, "Rows of intercept", alpha.rows(), + "rows of design matrix", x.rows()); + } + check_size_match(function, "Columns of design matrix", x.cols(), + "rows of weight matrix", beta.rows()); + + if (size_zero(y)) { + return 0; + } + for (size_t n = 0; n < N_instances; ++n) { + check_size_match(function, "Size of outcome vector", y[n].size(), + "number of classes", N_classes); + check_nonnegative(function, "outcome counts", y[n]); + } + + if constexpr (!include_summand::value) { + return 0; + } + + T_x_ref x_ref = x; + T_alpha_ref alpha_ref = alpha; + T_beta_ref beta_ref = beta; + + const auto& x_val = to_ref_if>(value_of(x_ref)); + const auto& beta_val = to_ref_if>(value_of(beta_ref)); + + 
Array eta; + if constexpr (T_alpha_rows == 1) { + eta = ((x_val * beta_val).rowwise() + value_of(alpha_ref)).array(); + } else { + eta = (x_val * beta_val + value_of(alpha_ref)).array(); + } + + // Row-max shift for numerical stability; cancels in log-softmax. + const Array shifted_eta + = (eta.colwise() - eta.rowwise().maxCoeff()).eval(); + const Array exp_eta = exp(shifted_eta); + const Array softmax_mat + = exp_eta.colwise() / exp_eta.rowwise().sum(); + + const Array y_mat = to_matrix(y).array(); + const Array instance_totals = y_mat.rowwise().sum(); + + // lmultiply implements 0*log(0)=0: classes with softmax=0 and y=0 contribute + // 0. + T_partials_return logp = lmultiply(y_mat, softmax_mat).sum(); + if constexpr (include_summand::value) { + logp += lgamma(instance_totals + 1.0).sum() - lgamma(y_mat + 1.0).sum(); + } + + if (!std::isfinite(logp)) { + check_finite(function, "Weight matrix", beta_ref); + check_finite(function, "Intercept", alpha_ref); + check_finite(function, "Matrix of independent variables", x_ref); + } + + auto ops_partials = make_partials_propagator(x_ref, alpha_ref, beta_ref); + if constexpr (need_delta) { + // δ[n,k] = y[n,k] - S_n·p[n,k] + const Array delta + = y_mat.template cast() + - softmax_mat.colwise() + * instance_totals.template cast(); + + if constexpr (is_autodiff_v) { + if constexpr (T_alpha_rows == 1) + partials<1>(ops_partials) = delta.colwise().sum(); + else + partials<1>(ops_partials) = delta; + } + if constexpr (is_autodiff_v) { + partials<2>(ops_partials) + = x_val.transpose().template cast() + * delta.matrix(); + } + if constexpr (is_autodiff_v) { + edge<0>(ops_partials).partials_ = delta.matrix() * beta_val.transpose(); + } + } + return ops_partials.build(logp); +} + +template +inline return_type_t multinomial_logit_glm_lpmf( + const std::vector>& y, const T_x& x, const T_alpha& alpha, + const T_beta& beta) { + return multinomial_logit_glm_lpmf(y, x, alpha, beta); +} + +} // namespace math +} // namespace stan +#endif diff --git a/test/unit/math/opencl/rev/multinomial_logit_glm_lpmf_test.cpp b/test/unit/math/opencl/rev/multinomial_logit_glm_lpmf_test.cpp new file mode 100644 index 00000000000..ac2f9a005d3 --- /dev/null +++ b/test/unit/math/opencl/rev/multinomial_logit_glm_lpmf_test.cpp @@ -0,0 +1,291 @@ +#ifdef STAN_OPENCL +#include +#include +#include +#include +#include + +using Eigen::Dynamic; +using Eigen::Matrix; +using stan::math::matrix_cl; +using stan::math::var; +using std::vector; + +TEST(ProbDistributionsMultinomialLogitGLM, error_checking) { + int N = 3, M = 2, K = 3; + + vector> y{{1, 2, 0}, {0, 3, 1}, {2, 0, 2}}; + vector> y_bad_n{{1, 2, 0}, {0, 3, 1}}; + vector> y_bad_k{{1, 2}, {0, 3}, {2, 0}}; + vector> y_neg{{-1, 2, 0}, {0, 3, 1}, {2, 0, 2}}; + + Matrix x(N, M); + x << 1.0, -0.5, 0.3, 0.7, -0.2, 1.1; + Matrix x_bad_n(N + 1, M); + Matrix x_bad_m(N, M + 1); + Matrix x_inf = x; + x_inf(0, 0) = stan::math::INFTY; + + Matrix beta(M, K); + beta << 0.3, -0.2, 0.1, -0.1, 0.4, -0.3; + Matrix beta_bad_m(M + 1, K); + Matrix beta_bad_k(M, K + 1); + Matrix beta_inf = beta; + beta_inf(0, 0) = stan::math::INFTY; + + Matrix alpha(1, K); + alpha << 0.1, -0.3, 0.2; + Matrix alpha_bad_k(1, K + 1); + Matrix alpha_inf = alpha; + alpha_inf(0) = stan::math::INFTY; + + Matrix alpha_mat(N, K); + alpha_mat << 0.1, -0.3, 0.2, -0.2, 0.1, 0.3, 0.0, 0.2, -0.1; + Matrix alpha_mat_bad_n(N + 1, K); + Matrix alpha_mat_bad_k(N, K + 1); + + matrix_cl x_cl(x), x_bad_n_cl(x_bad_n), x_bad_m_cl(x_bad_m), + x_inf_cl(x_inf); + matrix_cl beta_cl(beta), 
beta_bad_m_cl(beta_bad_m), + beta_bad_k_cl(beta_bad_k), beta_inf_cl(beta_inf); + matrix_cl alpha_cl(alpha), alpha_bad_k_cl(alpha_bad_k), + alpha_inf_cl(alpha_inf); + matrix_cl alpha_mat_cl(alpha_mat), + alpha_mat_bad_n_cl(alpha_mat_bad_n), alpha_mat_bad_k_cl(alpha_mat_bad_k); + + EXPECT_NO_THROW( + stan::math::multinomial_logit_glm_lpmf(y, x_cl, alpha_cl, beta_cl)); + EXPECT_NO_THROW( + stan::math::multinomial_logit_glm_lpmf(y, x_cl, alpha_mat_cl, beta_cl)); + + EXPECT_THROW( + stan::math::multinomial_logit_glm_lpmf(y_bad_n, x_cl, alpha_cl, beta_cl), + std::invalid_argument); + EXPECT_THROW( + stan::math::multinomial_logit_glm_lpmf(y_bad_k, x_cl, alpha_cl, beta_cl), + std::invalid_argument); + EXPECT_THROW( + stan::math::multinomial_logit_glm_lpmf(y_neg, x_cl, alpha_cl, beta_cl), + std::domain_error); + + EXPECT_THROW( + stan::math::multinomial_logit_glm_lpmf(y, x_bad_n_cl, alpha_cl, beta_cl), + std::invalid_argument); + EXPECT_THROW( + stan::math::multinomial_logit_glm_lpmf(y, x_bad_m_cl, alpha_cl, beta_cl), + std::invalid_argument); + + EXPECT_THROW( + stan::math::multinomial_logit_glm_lpmf(y, x_cl, alpha_bad_k_cl, beta_cl), + std::invalid_argument); + EXPECT_THROW(stan::math::multinomial_logit_glm_lpmf( + y, x_cl, alpha_mat_bad_n_cl, beta_cl), + std::invalid_argument); + EXPECT_THROW(stan::math::multinomial_logit_glm_lpmf( + y, x_cl, alpha_mat_bad_k_cl, beta_cl), + std::invalid_argument); + + EXPECT_THROW( + stan::math::multinomial_logit_glm_lpmf(y, x_cl, alpha_cl, beta_bad_m_cl), + std::invalid_argument); + EXPECT_THROW( + stan::math::multinomial_logit_glm_lpmf(y, x_cl, alpha_cl, beta_bad_k_cl), + std::invalid_argument); + + EXPECT_THROW( + stan::math::multinomial_logit_glm_lpmf(y, x_inf_cl, alpha_cl, beta_cl), + std::domain_error); + EXPECT_THROW( + stan::math::multinomial_logit_glm_lpmf(y, x_cl, alpha_inf_cl, beta_cl), + std::domain_error); + EXPECT_THROW( + stan::math::multinomial_logit_glm_lpmf(y, x_cl, alpha_cl, beta_inf_cl), + std::domain_error); +} + +TEST(ProbDistributionsMultinomialLogitGLM, + opencl_matches_cpu_small_broadcast_alpha) { + int N = 3, M = 2, K = 3; + vector> y{{1, 2, 0}, {0, 3, 1}, {2, 0, 2}}; + Matrix x(N, M); + x << 1.0, -0.5, 0.3, 0.7, -0.2, 1.1; + Matrix alpha(1, K); + alpha << 0.1, -0.3, 0.2; + Matrix beta(M, K); + beta << 0.3, -0.2, 0.1, -0.1, 0.4, -0.3; + + auto f = [&y](const auto& x_, const auto& a_, const auto& b_) { + return stan::math::multinomial_logit_glm_lpmf(y, x_, a_, b_); + }; + auto f_propto = [&y](const auto& x_, const auto& a_, const auto& b_) { + return stan::math::multinomial_logit_glm_lpmf(y, x_, a_, b_); + }; + stan::math::test::compare_cpu_opencl_prim_rev(f, x, alpha, beta); + stan::math::test::compare_cpu_opencl_prim_rev(f_propto, x, alpha, beta); +} + +TEST(ProbDistributionsMultinomialLogitGLM, + opencl_matches_cpu_small_matrix_alpha) { + int N = 3, M = 2, K = 3; + vector> y{{1, 2, 0}, {0, 3, 1}, {2, 0, 2}}; + Matrix x(N, M); + x << 1.0, -0.5, 0.3, 0.7, -0.2, 1.1; + Matrix alpha(N, K); + alpha << 0.1, -0.3, 0.2, -0.2, 0.1, 0.3, 0.0, 0.2, -0.1; + Matrix beta(M, K); + beta << 0.3, -0.2, 0.1, -0.1, 0.4, -0.3; + + auto f = [&y](const auto& x_, const auto& a_, const auto& b_) { + return stan::math::multinomial_logit_glm_lpmf(y, x_, a_, b_); + }; + auto f_propto = [&y](const auto& x_, const auto& a_, const auto& b_) { + return stan::math::multinomial_logit_glm_lpmf(y, x_, a_, b_); + }; + stan::math::test::compare_cpu_opencl_prim_rev(f, x, alpha, beta); + stan::math::test::compare_cpu_opencl_prim_rev(f_propto, x, alpha, beta); +} + 
+TEST(ProbDistributionsMultinomialLogitGLM, opencl_matches_cpu_zero_instances) { + int M = 2, K = 3; + vector> y{}; + Matrix x(0, M); + Matrix alpha(1, K); + alpha << 0.1, -0.3, 0.2; + Matrix beta(M, K); + beta << 0.3, -0.2, 0.1, -0.1, 0.4, -0.3; + + auto f = [&y](const auto& x_, const auto& a_, const auto& b_) { + return stan::math::multinomial_logit_glm_lpmf(y, x_, a_, b_); + }; + auto f_propto = [&y](const auto& x_, const auto& a_, const auto& b_) { + return stan::math::multinomial_logit_glm_lpmf(y, x_, a_, b_); + }; + stan::math::test::compare_cpu_opencl_prim_rev(f, x, alpha, beta); + stan::math::test::compare_cpu_opencl_prim_rev(f_propto, x, alpha, beta); +} + +TEST(ProbDistributionsMultinomialLogitGLM, opencl_matches_cpu_zero_attributes) { + int N = 3, K = 3; + vector> y{{1, 2, 0}, {0, 3, 1}, {2, 0, 2}}; + Matrix x(N, 0); + Matrix alpha(1, K); + alpha << 0.1, -0.3, 0.2; + Matrix beta(0, K); + + auto f = [&y](const auto& x_, const auto& a_, const auto& b_) { + return stan::math::multinomial_logit_glm_lpmf(y, x_, a_, b_); + }; + auto f_propto = [&y](const auto& x_, const auto& a_, const auto& b_) { + return stan::math::multinomial_logit_glm_lpmf(y, x_, a_, b_); + }; + stan::math::test::compare_cpu_opencl_prim_rev(f, x, alpha, beta); + stan::math::test::compare_cpu_opencl_prim_rev(f_propto, x, alpha, beta); +} + +TEST(ProbDistributionsMultinomialLogitGLM, opencl_matches_cpu_single_instance) { + int N = 1, M = 3, K = 4; + vector> y{{2, 0, 3, 1}}; + Matrix x(N, M); + x << 0.5, -1.2, 0.8; + Matrix alpha(1, K); + alpha << 0.1, -0.3, 0.5, -0.2; + Matrix beta(M, K); + beta << 0.2, -0.1, 0.4, 0.0, -0.3, 0.5, -0.2, 0.1, 0.1, 0.0, 0.3, -0.4; + + auto f = [&y](const auto& x_, const auto& a_, const auto& b_) { + return stan::math::multinomial_logit_glm_lpmf(y, x_, a_, b_); + }; + stan::math::test::compare_cpu_opencl_prim_rev(f, x, alpha, beta); +} + +TEST(ProbDistributionsMultinomialLogitGLM, opencl_matches_cpu_all_zeros_y) { + int N = 2, M = 2, K = 3; + vector> y{{0, 0, 0}, {0, 0, 0}}; + Matrix x + = Matrix::Random(N, M); + Matrix alpha = Matrix::Random(1, K); + Matrix beta + = Matrix::Random(M, K); + + auto f = [&y](const auto& x_, const auto& a_, const auto& b_) { + return stan::math::multinomial_logit_glm_lpmf(y, x_, a_, b_); + }; + stan::math::test::compare_cpu_opencl_prim_rev(f, x, alpha, beta); +} + +TEST(ProbDistributionsMultinomialLogitGLM, opencl_matches_cpu_large) { + int N = 153, M = 17, K = 11; + vector> y(N, vector(K)); + for (int n = 0; n < N; ++n) + for (int k = 0; k < K; ++k) + y[n][k] = (n * K + k) % 5; + + Matrix x + = Matrix::Random(N, M); + Matrix alpha = Matrix::Random(1, K); + Matrix beta + = Matrix::Random(M, K); + + auto f = [&y](const auto& x_, const auto& a_, const auto& b_) { + return stan::math::multinomial_logit_glm_lpmf(y, x_, a_, b_); + }; + auto f_propto = [&y](const auto& x_, const auto& a_, const auto& b_) { + return stan::math::multinomial_logit_glm_lpmf(y, x_, a_, b_); + }; + stan::math::test::compare_cpu_opencl_prim_rev(f, x, alpha, beta); + stan::math::test::compare_cpu_opencl_prim_rev(f_propto, x, alpha, beta); +} + +TEST(ProbDistributionsMultinomialLogitGLM, + opencl_matches_cpu_large_matrix_alpha) { + int N = 153, M = 17, K = 11; + vector> y(N, vector(K)); + for (int n = 0; n < N; ++n) + for (int k = 0; k < K; ++k) + y[n][k] = (n * K + k) % 5; + + Matrix x + = Matrix::Random(N, M); + Matrix alpha + = Matrix::Random(N, K); + Matrix beta + = Matrix::Random(M, K); + + auto f = [&y](const auto& x_, const auto& a_, const auto& b_) { + return 
stan::math::multinomial_logit_glm_lpmf(y, x_, a_, b_); + }; + auto f_propto = [&y](const auto& x_, const auto& a_, const auto& b_) { + return stan::math::multinomial_logit_glm_lpmf(y, x_, a_, b_); + }; + stan::math::test::compare_cpu_opencl_prim_rev(f, x, alpha, beta); + stan::math::test::compare_cpu_opencl_prim_rev(f_propto, x, alpha, beta); +} + +TEST(ProbDistributionsMultinomialLogitGLM, opencl_neg_inf_alpha) { + // alpha[n,k]=-inf forces softmax probability to 0; y[n,k]=0 for those + // classes. Result must be finite and match the CPU prim result. + int N = 2, M = 2, K = 3; + vector> y{{2, 1, 0}, {0, 3, 2}}; + + Matrix x(N, M); + x << 1.0, 0.5, 0.3, -0.7; + + Matrix beta(M, K); + beta << 0.3, -0.2, 0.1, -0.1, 0.4, -0.3; + + Matrix alpha(N, K); + alpha << 0.2, -0.1, -stan::math::INFTY, -stan::math::INFTY, 0.4, 0.1; + + const double logp_cpu + = stan::math::multinomial_logit_glm_lpmf(y, x, alpha, beta); + ASSERT_TRUE(std::isfinite(logp_cpu)); + + matrix_cl x_cl(x), alpha_cl(alpha), beta_cl(beta); + const double logp_cl + = stan::math::multinomial_logit_glm_lpmf(y, x_cl, alpha_cl, beta_cl); + + EXPECT_TRUE(std::isfinite(logp_cl)); + EXPECT_FLOAT_EQ(logp_cpu, logp_cl); +} + +#endif diff --git a/test/unit/math/prim/prob/multinomial_logit_glm_lpmf_test.cpp b/test/unit/math/prim/prob/multinomial_logit_glm_lpmf_test.cpp new file mode 100644 index 00000000000..26d82626780 --- /dev/null +++ b/test/unit/math/prim/prob/multinomial_logit_glm_lpmf_test.cpp @@ -0,0 +1,260 @@ +#include +#include +#include +#include + +// ----------------------------------------------------------------------- +// Equivalence with multinomial_logit_lpmf (single instance, N=1) +// ----------------------------------------------------------------------- +TEST(ProbMultinomialLogitGLM, matchesMultinomialLogit) { + // N=1, K=4 classes, M=3 features + Eigen::MatrixXd x(1, 3); + x << 0.5, -1.2, 0.8; + Eigen::RowVectorXd alpha(4); + alpha << 0.1, -0.3, 0.5, -0.2; + Eigen::MatrixXd beta(3, 4); + beta << 0.2, -0.1, 0.4, 0.0, -0.3, 0.5, -0.2, 0.1, 0.1, 0.0, 0.3, -0.4; + + // eta = alpha^T + beta^T x^T (K-vector for multinomial_logit_lpmf) + Eigen::VectorXd eta = alpha.transpose() + beta.transpose() * x.transpose(); + + std::vector ns{3, 1, 0, 2}; + std::vector> y{ns}; + + EXPECT_FLOAT_EQ(stan::math::multinomial_logit_lpmf(ns, eta), + stan::math::multinomial_logit_glm_lpmf(y, x, alpha, beta)); +} + +// ----------------------------------------------------------------------- +// Equivalence with binomial_logit_lpmf (K=2 special case) +// +// With K=2, alpha=[0,0], and beta_mult = [beta_bin | 0_col] (M x 2), +// the linear predictor per instance is lin_n = [x_n·beta_bin, 0]. +// The softmax of [a, 0] is [sigmoid(a), sigmoid(-a)], so the +// multinomial log-PMF reduces to binomial_logit_lpmf(y0 | y0+y1, x·beta_bin). 
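+//
+// Worked out: with p = sigmoid(a) and counts (y0, y1),
+//   multinomial(y0, y1 | p, 1 - p)
+//     = (y0 + y1)! / (y0! y1!) * p^y0 * (1 - p)^y1
+//     = binomial(y0 | y0 + y1, p),
+// so the two lpmf values agree term by term, including the lgamma
+// normalizing constants.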
+// ----------------------------------------------------------------------- +TEST(ProbMultinomialLogitGLM, matchesBinomialLogitLpmf) { + // N=2 instances, M=2 features, K=2 classes + Eigen::MatrixXd x(2, 2); + x << 1.0, -0.5, 0.3, 0.7; + + Eigen::VectorXd beta_bin(2); + beta_bin << 0.4, -0.6; + + // Multinomial parameters: alpha=[0,0], beta_mult = [[beta_bin], [0]] + const Eigen::RowVectorXd alpha_mult = Eigen::RowVectorXd::Zero(2); + Eigen::MatrixXd beta_mult(2, 2); + beta_mult << beta_bin(0), 0.0, beta_bin(1), 0.0; + + const std::vector n_success{3, 2}; + const std::vector n_trials{5, 4}; + + std::vector> y(2); + for (int i = 0; i < 2; ++i) { + y[i] = {n_success[i], n_trials[i] - n_success[i]}; + } + + // Binomial logit uses eta = x * beta_bin (no intercept, matches alpha=[0,0]) + const Eigen::VectorXd eta = x * beta_bin; + EXPECT_FLOAT_EQ( + stan::math::binomial_logit_lpmf(n_success, n_trials, eta), + stan::math::multinomial_logit_glm_lpmf(y, x, alpha_mult, beta_mult)); +} + +// ----------------------------------------------------------------------- +// K=3, N=3 instances, M=2 features. +// GLM result must equal the sum of per-instance multinomial_logit_lpmf calls. +// ----------------------------------------------------------------------- +TEST(ProbMultinomialLogitGLM, K3_N3_manual) { + Eigen::MatrixXd x(3, 2); + x << 1.0, 0.5, -0.5, 1.0, 0.0, -1.0; + + Eigen::RowVectorXd alpha(3); + alpha << 0.2, -0.1, 0.5; + + Eigen::MatrixXd beta(2, 3); + beta << 0.3, -0.2, 0.1, -0.1, 0.4, -0.3; + + std::vector> y{{2, 1, 3}, {0, 4, 1}, {3, 0, 2}}; + + double expected = 0; + for (int n = 0; n < 3; ++n) { + Eigen::VectorXd eta + = alpha.transpose() + beta.transpose() * x.row(n).transpose(); + expected += stan::math::multinomial_logit_lpmf(y[n], eta); + } + + EXPECT_FLOAT_EQ(expected, + stan::math::multinomial_logit_glm_lpmf(y, x, alpha, beta)); +} + +// ----------------------------------------------------------------------- +// Matrix alpha (N x K): per-instance intercepts, full design matrix. +// Result must equal the sum of per-instance multinomial_logit_lpmf calls +// each using its own alpha row. +// ----------------------------------------------------------------------- +TEST(ProbMultinomialLogitGLM, matrixAlpha_fullX) { + const int N = 4, K = 3, M = 2; + Eigen::MatrixXd x(N, M); + x << 1.0, 0.5, -0.5, 1.0, 0.0, -1.0, 0.7, 0.3; + + Eigen::MatrixXd alpha(N, K); + alpha << 0.2, -0.1, 0.5, -0.3, 0.4, 0.1, 0.1, 0.0, -0.2, 0.5, -0.3, 0.2; + + Eigen::MatrixXd beta(M, K); + beta << 0.3, -0.2, 0.1, -0.1, 0.4, -0.3; + + std::vector> y{{2, 1, 3}, {0, 4, 1}, {3, 0, 2}, {1, 2, 1}}; + + double expected = 0; + for (int n = 0; n < N; ++n) { + Eigen::VectorXd eta + = alpha.row(n).transpose() + beta.transpose() * x.row(n).transpose(); + expected += stan::math::multinomial_logit_lpmf(y[n], eta); + } + + EXPECT_FLOAT_EQ(expected, + stan::math::multinomial_logit_glm_lpmf(y, x, alpha, beta)); +} + +// ----------------------------------------------------------------------- +// propto=true: with non-autodiff parameters the entire log PMF is constant +// w.r.t. parameters, so Stan math's early-return yields 0. +// propto=false: full log PMF must be finite and negative. 
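+//
+// (Mechanically: with double-only x, alpha and beta the include_summand
+// check in the lpmf evaluates to false for propto=true, so it returns 0
+// before touching the data; the other GLM lpmfs follow the same convention.)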
+// ----------------------------------------------------------------------- +TEST(ProbMultinomialLogitGLM, propto) { + Eigen::MatrixXd x(2, 2); + x << 1.0, 0.0, 0.0, 1.0; + Eigen::RowVectorXd alpha(3); + alpha << 0.1, 0.2, 0.3; + Eigen::MatrixXd beta(2, 3); + beta << 0.1, 0.2, 0.3, 0.4, 0.5, 0.6; + + std::vector> y{{1, 2, 3}, {2, 1, 0}}; + + const double logp_full + = stan::math::multinomial_logit_glm_lpmf(y, x, alpha, beta); + EXPECT_TRUE(std::isfinite(logp_full)); + EXPECT_LT(logp_full, 0.0); + + // With all-double parameters and propto=true, Stan math elides all terms + // (the full log PMF is a constant) and returns 0. + EXPECT_EQ(stan::math::multinomial_logit_glm_lpmf(y, x, alpha, beta), + 0.0); +} + +// ----------------------------------------------------------------------- +// alpha[n,k] = -inf forces softmax probability to 0 for class k on instance n. +// When y[n,k] = 0 the 0*log(0) term must evaluate to 0 (not NaN). +// ----------------------------------------------------------------------- +TEST(ProbMultinomialLogitGLM, negInfAlpha) { + // N=2, K=3, M=2. Some alpha entries are -inf, forcing p=0 for those classes. + const int N = 2, K = 3, M = 2; + + Eigen::MatrixXd x(N, M); + x << 1.0, 0.5, 0.3, -0.7; + + Eigen::MatrixXd beta(M, K); + beta << 0.3, -0.2, 0.1, -0.1, 0.4, -0.3; + + // Per-instance (N x K) alpha with -inf entries; y is 0 for those classes. + Eigen::MatrixXd alpha(N, K); + alpha << 0.2, -0.1, -stan::math::INFTY, -stan::math::INFTY, 0.4, 0.1; + + std::vector> y{{2, 1, 0}, {0, 3, 2}}; + + const double logp = stan::math::multinomial_logit_glm_lpmf(y, x, alpha, beta); + EXPECT_TRUE(std::isfinite(logp)); + EXPECT_LT(logp, 0.0); + + // Reference: -inf drops the class from the softmax, equivalent to + // multinomial_logit_lpmf over the remaining classes. + // lgamma(0 + 1) = 0 so the normalizing constant is unaffected. 
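+  // Concretely: row 0 of alpha is (0.2, -0.1, -inf) and y[0] = (2, 1, 0), so
+  // the third class contributes 0 * log(0) = 0 and the reference below sums
+  // only the two finite-eta classes; likewise for row 1.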
+ double expected = 0; + for (int n = 0; n < N; ++n) { + const Eigen::VectorXd eta + = beta.transpose() * x.row(n).transpose() + alpha.row(n).transpose(); + std::vector y_in; + std::vector eta_in; + for (int k = 0; k < K; ++k) { + if (std::isfinite(eta(k))) { + y_in.push_back(y[n][k]); + eta_in.push_back(eta(k)); + } + } + expected += stan::math::multinomial_logit_lpmf( + y_in, Eigen::Map(eta_in.data(), eta_in.size())); + } + + EXPECT_FLOAT_EQ(expected, logp); +} + +// ----------------------------------------------------------------------- +// Error handling +// ----------------------------------------------------------------------- +TEST(ProbMultinomialLogitGLM, throwsCorrectly) { + Eigen::MatrixXd x(2, 2); + x << 1.0, 0.5, -0.5, 1.0; + Eigen::RowVectorXd alpha(3); + alpha << 0.1, 0.2, 0.3; + Eigen::MatrixXd beta(2, 3); + beta << 0.1, 0.2, 0.3, 0.4, 0.5, 0.6; + + std::vector> y{{1, 2, 3}, {2, 1, 0}}; + + // Mismatched number of instances (y has 1 row, x has 2) + EXPECT_THROW( + stan::math::multinomial_logit_glm_lpmf({{1, 2, 3}}, x, alpha, beta), + std::invalid_argument); + + // Mismatched number of classes in one outcome vector + EXPECT_THROW(stan::math::multinomial_logit_glm_lpmf({{1, 2}, {2, 1, 0}}, x, + alpha, beta), + std::invalid_argument); + + // Negative counts + EXPECT_THROW(stan::math::multinomial_logit_glm_lpmf({{-1, 2, 3}, {2, 1, 0}}, + x, alpha, beta), + std::domain_error); + + // Infinite alpha + Eigen::RowVectorXd alpha_inf = alpha; + alpha_inf[0] = stan::math::INFTY; + EXPECT_THROW(stan::math::multinomial_logit_glm_lpmf(y, x, alpha_inf, beta), + std::domain_error); + + // Infinite beta + Eigen::MatrixXd beta_inf = beta; + beta_inf(0, 0) = stan::math::INFTY; + EXPECT_THROW(stan::math::multinomial_logit_glm_lpmf(y, x, alpha, beta_inf), + std::domain_error); + + // Infinite x + Eigen::MatrixXd x_inf = x; + x_inf(0, 0) = stan::math::INFTY; + EXPECT_THROW(stan::math::multinomial_logit_glm_lpmf(y, x_inf, alpha, beta), + std::domain_error); + + // alpha cols mismatch with K (beta has 3 cols but alpha has 2 cols) + Eigen::RowVectorXd alpha_bad(2); + alpha_bad << 0.1, 0.2; + EXPECT_THROW(stan::math::multinomial_logit_glm_lpmf(y, x, alpha_bad, beta), + std::invalid_argument); + + // Matrix alpha with wrong number of rows + Eigen::MatrixXd alpha_mat_bad_rows(3, 3); + EXPECT_THROW( + stan::math::multinomial_logit_glm_lpmf(y, x, alpha_mat_bad_rows, beta), + std::invalid_argument); + + // Matrix alpha with wrong number of columns + Eigen::MatrixXd alpha_mat_bad_cols(2, 2); + EXPECT_THROW( + stan::math::multinomial_logit_glm_lpmf(y, x, alpha_mat_bad_cols, beta), + std::invalid_argument); + + // beta rows mismatch with x cols (x has 2 cols, beta has 3 rows) + EXPECT_THROW(stan::math::multinomial_logit_glm_lpmf(y, x, alpha, + Eigen::MatrixXd(3, 3)), + std::invalid_argument); +} diff --git a/test/unit/math/rev/prob/multinomial_logit_glm_lpmf_test.cpp b/test/unit/math/rev/prob/multinomial_logit_glm_lpmf_test.cpp new file mode 100644 index 00000000000..e8f75d29d1f --- /dev/null +++ b/test/unit/math/rev/prob/multinomial_logit_glm_lpmf_test.cpp @@ -0,0 +1,216 @@ +#include +#include +#include +#include + +// Reference: sum multinomial_logit_lpmf over all N instances +template +inline stan::return_type_t +multinomial_logit_glm_simple_lpmf(const std::vector>& y, + const T_x& x, const T_alpha& alpha, + const T_beta& beta) { + using T_return = stan::return_type_t; + const int N = x.rows(); + // alpha is 1xK; as_column_vector_or_scalar+transpose gives a row-vector + // expression whose decay type lets 
rep_matrix deduce its Ret parameter. + const auto& alpha_row + = stan::math::as_column_vector_or_scalar(alpha).transpose(); + auto lin = stan::math::to_ref( + stan::math::multiply(x, beta) + + stan::math::rep_matrix>(alpha_row, + N)); + T_return lpmf = 0; + for (int n = 0; n < N; ++n) + lpmf += stan::math::multinomial_logit_lpmf( + y[n], lin.row(n).transpose().eval()); + return lpmf; +} + +TEST_F(AgradRev, MultinomialLogitGLM_matches_simple_doubles) { + using stan::math::multinomial_logit_glm_lpmf; + const size_t N = 5, M = 2, K = 3; + std::vector> y{ + {2, 1, 3}, {0, 4, 1}, {3, 0, 2}, {1, 2, 0}, {0, 1, 4}}; + Eigen::MatrixXd x(N, M); + x << -1.2, 0.5, 0.3, -0.7, 1.0, 0.2, -0.4, 0.8, 0.6, -1.1; + Eigen::MatrixXd beta(M, K); + beta << 0.3, -0.2, 0.1, -0.1, 0.4, -0.3; + Eigen::RowVectorXd alpha(K); + alpha << 0.2, -0.1, 0.5; + const double eps = 1e-13; + + EXPECT_NEAR(multinomial_logit_glm_simple_lpmf(y, x, alpha, beta), + multinomial_logit_glm_lpmf(y, x, alpha, beta), eps); + EXPECT_NEAR(multinomial_logit_glm_simple_lpmf(y, x, alpha, beta), + multinomial_logit_glm_lpmf(y, x, alpha, beta), eps); +} + +template +class ProbDistributionsMultinomialLogitGLM + : public stan::math::test::VarMatrixTypedTests {}; + +TYPED_TEST_SUITE(ProbDistributionsMultinomialLogitGLM, + stan::math::test::VarMatImpls); + +TYPED_TEST(ProbDistributionsMultinomialLogitGLM, glm_matches_simple_vars) { + using stan::math::multinomial_logit_glm_lpmf; + using stan::math::var; + using matrix_v = typename TypeParam::matrix_v; + using row_vector_v = typename TypeParam::row_vector_v; + const size_t N = 5, M = 2, K = 3; + const double eps = 1e-13; + std::vector> y{ + {2, 1, 3}, {0, 4, 1}, {3, 0, 2}, {1, 2, 0}, {0, 1, 4}}; + Eigen::MatrixXd x_val(N, M); + x_val << -1.2, 0.5, 0.3, -0.7, 1.0, 0.2, -0.4, 0.8, 0.6, -1.1; + Eigen::MatrixXd beta_val(M, K); + beta_val << 0.3, -0.2, 0.1, -0.1, 0.4, -0.3; + Eigen::RowVectorXd alpha_val(K); + alpha_val << 0.2, -0.1, 0.5; + + // Reference always uses Matrix so multinomial_logit_lpmf can extract + // rows. 
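+  // Both results are summed before grad() so a single reverse sweep fills
+  // the adjoints of the reference operands (x1, alpha1, beta1) and of the
+  // GLM operands (x2, alpha2, beta2); the loops below then compare the two
+  // sets of adjoints entry by entry.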
+ Eigen::Matrix x1 = x_val, + beta1 = beta_val; + Eigen::Matrix alpha1 = alpha_val; + matrix_v x2 = x_val; + matrix_v beta2 = beta_val; + row_vector_v alpha2 = alpha_val; + + var res1 = multinomial_logit_glm_simple_lpmf(y, x1, alpha1, beta1); + var res2 = multinomial_logit_glm_lpmf(y, x2, alpha2, beta2); + (res1 + res2).grad(); + EXPECT_NEAR(res1.val(), res2.val(), eps); + for (size_t i = 0; i < M; ++i) { + for (size_t j = 0; j < N; ++j) + EXPECT_NEAR(x1.adj()(j, i), x2.adj()(j, i), eps); + for (size_t j = 0; j < K; ++j) + EXPECT_NEAR(beta1.adj()(i, j), beta2.adj()(i, j), eps); + } + for (size_t i = 0; i < K; ++i) + EXPECT_NEAR(alpha1.adj()[i], alpha2.adj()[i], eps); + + stan::math::set_zero_all_adjoints(); + + res1 = multinomial_logit_glm_simple_lpmf(y, x1, alpha1, beta1); + res2 = multinomial_logit_glm_lpmf(y, x2, alpha2, beta2); + (res1 + res2).grad(); + EXPECT_NEAR(res1.val(), res2.val(), eps); + for (size_t i = 0; i < M; ++i) { + for (size_t j = 0; j < N; ++j) + EXPECT_NEAR(x1.adj()(j, i), x2.adj()(j, i), eps); + for (size_t j = 0; j < K; ++j) + EXPECT_NEAR(beta1.adj()(i, j), beta2.adj()(i, j), eps); + } + for (size_t i = 0; i < K; ++i) + EXPECT_NEAR(alpha1.adj()[i], alpha2.adj()[i], eps); +} + +TYPED_TEST(ProbDistributionsMultinomialLogitGLM, glm_matches_simple_big) { + using stan::math::multinomial_logit_glm_lpmf; + using stan::math::var; + using matrix_v = typename TypeParam::matrix_v; + using row_vector_v = typename TypeParam::row_vector_v; + const size_t N = 37, M = 11, K = 7; + const double eps = 1e-10; + + std::vector> y(N, std::vector(K)); + for (size_t n = 0; n < N; ++n) + for (size_t k = 0; k < K; ++k) + y[n][k] = (n + k) % 5; + + Eigen::MatrixXd x_val = Eigen::MatrixXd::Random(N, M); + Eigen::MatrixXd beta_val = Eigen::MatrixXd::Random(M, K); + Eigen::RowVectorXd alpha_val = Eigen::RowVectorXd::Random(K); + + Eigen::Matrix x1 = x_val, + beta1 = beta_val; + Eigen::Matrix alpha1 = alpha_val; + matrix_v x2 = x_val; + matrix_v beta2 = beta_val; + row_vector_v alpha2 = alpha_val; + + var res1 = multinomial_logit_glm_simple_lpmf(y, x1, alpha1, beta1); + var res2 = multinomial_logit_glm_lpmf(y, x2, alpha2, beta2); + (res1 + res2).grad(); + EXPECT_NEAR(res1.val(), res2.val(), eps); + for (size_t i = 0; i < M; ++i) { + for (size_t j = 0; j < N; ++j) + EXPECT_NEAR(x1.adj()(j, i), x2.adj()(j, i), eps); + for (size_t j = 0; j < K; ++j) + EXPECT_NEAR(beta1.adj()(i, j), beta2.adj()(i, j), eps); + } + for (size_t i = 0; i < K; ++i) + EXPECT_NEAR(alpha1.adj()[i], alpha2.adj()[i], eps); +} + +TYPED_TEST(ProbDistributionsMultinomialLogitGLM, matrix_alpha_grads) { + using stan::math::multinomial_logit_glm_lpmf; + using stan::math::var; + using matrix_v = typename TypeParam::matrix_v; + const size_t N = 4, M = 2, K = 3; + const double eps = 1e-13; + std::vector> y{{2, 1, 3}, {0, 4, 1}, {3, 0, 2}, {1, 2, 0}}; + Eigen::MatrixXd x_val(N, M); + x_val << -1.2, 0.5, 0.3, -0.7, 1.0, 0.2, -0.4, 0.8; + Eigen::MatrixXd alpha_val(N, K); + alpha_val << 0.2, -0.1, 0.5, -0.3, 0.4, 0.1, 0.1, 0.0, -0.2, 0.5, -0.3, 0.2; + Eigen::MatrixXd beta_val(M, K); + beta_val << 0.3, -0.2, 0.1, -0.1, 0.4, -0.3; + // full x + matrix alpha: exercises the T_alpha_rows != 1 gradient path + // Reference uses Matrix so multinomial_logit_lpmf can extract Eigen + // rows. 
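+  // (Per the gradient derivation in the prim doc, the analytic adjoint of
+  // alpha(n, k) is delta(n, k) = y[n][k] - S_n * softmax(eta_n)[k], so a
+  // mismatch in the alpha comparison below would point at the delta
+  // computation in the N x K intercept path.)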
+ Eigen::Matrix x1 = x_val, + alpha1 = alpha_val, + beta1 = beta_val; + matrix_v x2 = x_val; + matrix_v alpha2 = alpha_val; + matrix_v beta2 = beta_val; + + var res1 = 0.0; + { + auto lin = stan::math::to_ref(stan::math::multiply(x1, beta1) + alpha1); + for (size_t n = 0; n < N; ++n) + res1 += stan::math::multinomial_logit_lpmf(y[n], + lin.row(n).transpose().eval()); + } + var res2 = multinomial_logit_glm_lpmf(y, x2, alpha2, beta2); + (res1 + res2).grad(); + EXPECT_NEAR(res1.val(), res2.val(), eps); + for (size_t i = 0; i < M; ++i) { + for (size_t j = 0; j < N; ++j) + EXPECT_NEAR(x1.adj()(j, i), x2.adj()(j, i), eps); + for (size_t j = 0; j < K; ++j) + EXPECT_NEAR(beta1.adj()(i, j), beta2.adj()(i, j), eps); + } + for (size_t n = 0; n < N; ++n) + for (size_t k = 0; k < K; ++k) + EXPECT_NEAR(alpha1.adj()(n, k), alpha2.adj()(n, k), eps); +} + +TYPED_TEST(ProbDistributionsMultinomialLogitGLM, interfaces) { + using stan::math::multinomial_logit_glm_lpmf; + using matrix_v = typename TypeParam::matrix_v; + using row_vector_v = typename TypeParam::row_vector_v; + const size_t N = 3, M = 2, K = 3; + std::vector> y{{1, 2, 0}, {0, 1, 3}, {2, 0, 1}}; + Eigen::MatrixXd x_d(N, M); + x_d << 1.0, -0.5, 0.3, 0.7, -0.2, 0.4; + Eigen::MatrixXd beta_d(M, K); + beta_d << 0.1, 0.2, 0.3, 0.4, 0.5, 0.6; + Eigen::RowVectorXd alpha_d(K); + alpha_d << 0.1, 0.2, 0.3; + + matrix_v x_v = x_d; + matrix_v beta_v = beta_d; + row_vector_v alpha_v = alpha_d; + + EXPECT_NO_THROW(multinomial_logit_glm_lpmf(y, x_d, alpha_d, beta_d)); + EXPECT_NO_THROW(multinomial_logit_glm_lpmf(y, x_v, alpha_d, beta_d)); + EXPECT_NO_THROW(multinomial_logit_glm_lpmf(y, x_d, alpha_v, beta_d)); + EXPECT_NO_THROW(multinomial_logit_glm_lpmf(y, x_d, alpha_d, beta_v)); + EXPECT_NO_THROW(multinomial_logit_glm_lpmf(y, x_v, alpha_v, beta_d)); + EXPECT_NO_THROW(multinomial_logit_glm_lpmf(y, x_v, alpha_d, beta_v)); + EXPECT_NO_THROW(multinomial_logit_glm_lpmf(y, x_d, alpha_v, beta_v)); + EXPECT_NO_THROW(multinomial_logit_glm_lpmf(y, x_v, alpha_v, beta_v)); +}