% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/susie.R, R/susie_ss.R
\name{susie}
\alias{susie}
\alias{susie_suff_stat}
\title{Sum of Single Effects (SuSiE) Regression}
\usage{
susie(
  X,
  y,
  L = min(10, ncol(X)),
  scaled_prior_variance = 0.2,
  residual_variance = NULL,
  prior_weights = NULL,
  null_weight = NULL,
  standardize = TRUE,
  intercept = TRUE,
  estimate_residual_variance = TRUE,
  estimate_prior_variance = TRUE,
  estimate_prior_method = c("optim", "EM", "simple"),
  check_null_threshold = 0,
  prior_tol = 1e-09,
  residual_variance_upperbound = Inf,
  s_init = NULL,
  coverage = 0.95,
  min_abs_corr = 0.5,
  compute_univariate_zscore = FALSE,
  na.rm = FALSE,
  max_iter = 100,
  tol = 0.001,
  verbose = FALSE,
  track_fit = FALSE,
  residual_variance_lowerbound = var(drop(y))/10000,
  refine = FALSE,
  n_purity = 100
)

susie_suff_stat(
  bhat,
  shat,
  R,
  n,
  var_y,
  XtX,
  Xty,
  yty,
  X_colmeans = NA,
  y_mean = NA,
  maf = NULL,
  maf_thresh = 0,
  L = 10,
  scaled_prior_variance = 0.2,
  residual_variance = NULL,
  estimate_residual_variance = TRUE,
  estimate_prior_variance = TRUE,
  estimate_prior_method = c("optim", "EM", "simple"),
  check_null_threshold = 0,
  prior_tol = 1e-09,
  r_tol = 1e-08,
  prior_weights = NULL,
  null_weight = NULL,
  standardize = TRUE,
  max_iter = 100,
  s_init = NULL,
  coverage = 0.95,
  min_abs_corr = 0.5,
  tol = 0.001,
  verbose = FALSE,
  track_fit = FALSE,
  check_input = FALSE,
  refine = FALSE,
  check_prior = FALSE,
  n_purity = 100
)
}
\arguments{
\item{X}{An n by p matrix of covariates.}

\item{y}{The observed responses, a vector of length n.}

\item{L}{Maximum number of non-zero effects in the susie
regression model. If L is larger than the number of covariates, p,
L is set to p.}

\item{scaled_prior_variance}{The prior variance, divided by
\code{var(y)} (or by \code{(1/(n-1))yty} for
\code{susie_suff_stat}); that is, the prior variance of each
non-zero element of b is \code{var(y) * scaled_prior_variance}. The
value provided should be either a scalar or a vector of length
\code{L}. If \code{estimate_prior_variance = TRUE}, this provides
initial estimates of the prior variances.}

\item{residual_variance}{Variance of the residual. If
\code{estimate_residual_variance = TRUE}, this value provides the
initial estimate of the residual variance. By default, it is set to
\code{var(y)} in \code{susie} and \code{(1/(n-1))yty} in
\code{susie_suff_stat}.}

\item{prior_weights}{A vector of length p, in which each entry
gives the prior probability that the corresponding column of X has
a nonzero effect on the outcome, y.}

\item{null_weight}{Prior probability of no effect (a number between
0 and 1, and cannot be exactly 1).}

\item{standardize}{If \code{standardize = TRUE}, standardize the
columns of X to unit variance prior to fitting (or equivalently
standardize XtX and Xty to have the same effect). Note that
\code{scaled_prior_variance} specifies the prior on the
coefficients of X \emph{after} standardization (if it is
performed). If you do not standardize, you may need to think more
carefully about specifying \code{scaled_prior_variance}. Whatever
your choice, the coefficients returned by \code{coef} are given for
\code{X} on the original input scale. Any column of \code{X} that
has zero variance is not standardized.}

\item{intercept}{If \code{intercept = TRUE}, the intercept is
fitted; if \code{intercept = FALSE}, the intercept is set to
zero. Setting \code{intercept = FALSE} is generally not
recommended.}

\item{estimate_residual_variance}{If
\code{estimate_residual_variance = TRUE}, the residual variance is
estimated, using \code{residual_variance} as an initial value. If
\code{estimate_residual_variance = FALSE}, the residual variance is
fixed to the value supplied by \code{residual_variance}.}

\item{estimate_prior_variance}{If \code{estimate_prior_variance =
TRUE}, the prior variance is estimated (this is a separate
parameter for each of the L effects). If provided,
\code{scaled_prior_variance} is then used as an initial value for
the optimization. When \code{estimate_prior_variance = FALSE}, the
prior variance for each of the L effects is determined by the
value supplied to \code{scaled_prior_variance}.}

\item{estimate_prior_method}{The method used for estimating prior
variance. When \code{estimate_prior_method = "simple"} is used, the
likelihood at the specified prior variance is compared to the
likelihood at a variance of zero, and the setting with the larger
likelihood is retained.}

\item{check_null_threshold}{When the prior variance is estimated,
compare the estimate with the null, and set the prior variance to
zero unless the log-likelihood using the estimate is larger by this
threshold amount. For example, if you set
\code{check_null_threshold = 0.1}, this will "nudge" the estimate
towards zero when the difference in log-likelihoods is small. A
note of caution that setting this to a value greater than zero may
lead the IBSS fitting procedure to occasionally decrease the ELBO.}

\item{prior_tol}{When the prior variance is estimated, compare the
estimated value to \code{prior_tol} at the end of the computation,
and exclude a single effect from PIP computation if the estimated
prior variance is smaller than this tolerance value.}

\item{residual_variance_upperbound}{Upper limit on the estimated
residual variance. It is only relevant when
\code{estimate_residual_variance = TRUE}.}

\item{s_init}{A previous susie fit with which to initialize.}

\item{coverage}{A number between 0 and 1 specifying the
\dQuote{coverage} of the estimated credible sets.}

\item{min_abs_corr}{Minimum absolute correlation allowed in a
credible set. The default, 0.5, corresponds to a squared
correlation of 0.25, which is a commonly used threshold for
genotype data in genetic studies.}

\item{compute_univariate_zscore}{If \code{compute_univariate_zscore
= TRUE}, the univariate regression z-scores are outputted for each
variable.}

\item{na.rm}{Drop any missing values in y from both X and y.}

\item{max_iter}{Maximum number of IBSS iterations to perform.}

\item{tol}{A small, non-negative number specifying the convergence
tolerance for the IBSS fitting procedure. The fitting procedure
will halt when the difference in the variational lower bound, or
\dQuote{ELBO} (the objective function to be maximized), is
less than \code{tol}.}

\item{verbose}{If \code{verbose = TRUE}, the algorithm's progress,
and a summary of the optimization settings, are printed to the
console.}

\item{track_fit}{If \code{track_fit = TRUE}, \code{trace}
is also returned containing detailed information about the
estimates at each iteration of the IBSS fitting procedure.}

\item{residual_variance_lowerbound}{Lower limit on the estimated
residual variance. It is only relevant when
\code{estimate_residual_variance = TRUE}.}

\item{refine}{If \code{refine = TRUE}, then an additional
iterative refinement procedure is used, after the IBSS algorithm,
to check and escape from local optima (see details).}

\item{n_purity}{Passed as argument \code{n_purity} to
\code{\link{susie_get_cs}}.}

\item{bhat}{A p-vector of estimated effects.}

\item{shat}{A p-vector of standard errors.}

\item{R}{A p by p correlation matrix. It should be estimated from
the same samples used to compute \code{bhat} and \code{shat}. Using
an out-of-sample matrix may produce unreliable results.}

\item{n}{The sample size.}

\item{var_y}{The sample variance of y, defined as \eqn{y'y/(n-1)}.
When the sample variance cannot be provided, the coefficients
(returned from \code{coef}) are computed on the "standardized" X, y
scale.}

\item{XtX}{A p by p matrix \eqn{X'X} in which the columns of X
are centered to have mean zero.}

\item{Xty}{A p-vector \eqn{X'y} in which y and the columns of X are
centered to have mean zero.}

\item{yty}{A scalar \eqn{y'y} in which y is centered to have mean
zero.}

\item{X_colmeans}{A p-vector of column means of \code{X}. If both
\code{X_colmeans} and \code{y_mean} are provided, the intercept
is estimated; otherwise, the intercept is NA.}

\item{y_mean}{A scalar containing the mean of \code{y}. If both
\code{X_colmeans} and \code{y_mean} are provided, the intercept
is estimated; otherwise, the intercept is NA.}

\item{maf}{Minor allele frequency; to be used along with
\code{maf_thresh} to filter input summary statistics.}

\item{maf_thresh}{Variants having a minor allele frequency smaller
than this threshold are not used.}

\item{r_tol}{Tolerance level for the eigenvalue check that the
matrix R is positive semidefinite.}

\item{check_input}{If \code{check_input = TRUE},
\code{susie_suff_stat} performs additional checks on \code{XtX} and
\code{Xty}. The checks are: (1) check that \code{XtX} is positive
semidefinite; (2) check that \code{Xty} is in the space spanned by
the non-zero eigenvectors of \code{XtX}.}

\item{check_prior}{If \code{check_prior = TRUE}, it checks if the
estimated prior variance becomes unreasonably large (comparing with
10 * max(abs(z))^2).}
}
\value{
A \code{"susie"} object with some or all of the following
  elements:

\item{alpha}{An L by p matrix of posterior inclusion probabilities.}

\item{mu}{An L by p matrix of posterior means, conditional on
  inclusion.}

\item{mu2}{An L by p matrix of posterior second moments,
  conditional on inclusion.}

\item{Xr}{A vector of length n, equal to \code{X \%*\% colSums(alpha
  * mu)}.}

\item{lbf}{log-Bayes Factor for each single effect.}

\item{lbf_variable}{log-Bayes Factor for each variable and single effect.}

\item{intercept}{Intercept (fixed or estimated).}

\item{sigma2}{Residual variance (fixed or estimated).}

\item{V}{Prior variance of the non-zero elements of b, equal to
  \code{scaled_prior_variance * var(y)}.}

\item{elbo}{The value of the variational lower bound, or
  \dQuote{ELBO} (objective function to be maximized), achieved at
  each iteration of the IBSS fitting procedure.}

\item{fitted}{Vector of length n containing the fitted values of
  the outcome.}

\item{sets}{Credible sets estimated from model fit; see
  \code{\link{susie_get_cs}} for details.}

\item{pip}{A vector of length p giving the (marginal) posterior
  inclusion probabilities for all p covariates.}

\item{z}{A vector of univariate z-scores.}

\item{niter}{Number of IBSS iterations that were performed.}

\item{converged}{\code{TRUE} or \code{FALSE} indicating whether
  the IBSS converged to a solution within the chosen tolerance
  level.}

\code{susie_suff_stat} also outputs:

\item{XtXr}{A p-vector of \code{t(X)} times the fitted values,
  \code{X \%*\% colSums(alpha*mu)}.}
}
\description{
Performs a sparse Bayesian multiple linear regression
  of y on X, using the "Sum of Single Effects" model from Wang et al
  (2020). In brief, this function fits the regression model \eqn{y =
  \mu + X b + e}, where elements of \eqn{e} are \emph{i.i.d.} normal
  with zero mean and variance \code{residual_variance}, \eqn{\mu} is
  an intercept term and \eqn{b} is a vector of length p representing
  the effects to be estimated. The \dQuote{susie assumption} is that
  \eqn{b = \sum_{l=1}^L b_l} where each \eqn{b_l} is a vector of
  length p with exactly one non-zero element. The prior on the
  non-zero element is normal with zero mean and variance \code{var(y)
  * scaled_prior_variance}. The value of \code{L} is fixed, and
  should be chosen to provide a reasonable upper bound on the number
  of non-zero effects to be detected. Typically, the hyperparameters
  \code{residual_variance} and \code{scaled_prior_variance} will be
  estimated during model fitting, although they can also be fixed as
  specified by the user. See functions \code{\link{susie_get_cs}} and
  other functions of form \code{susie_get_*} to extract the most
  commonly-used results from a susie fit.
}
\details{
The function \code{susie} implements the IBSS algorithm
from Wang et al (2020). The option \code{refine = TRUE} implements
an additional step to help reduce problems caused by convergence of
the IBSS algorithm to poor local optima (which is rare in our
experience, but can provide misleading results when it occurs). The
refinement step incurs additional computational expense that
increases with the number of CSs found in the initial run.

The function \code{susie_suff_stat} implements essentially the same
algorithms, but using sufficient statistics. (The statistics are
sufficient for the regression coefficients \eqn{b}, but not for the
intercept \eqn{\mu}; see below for how the intercept is treated.)
If the sufficient statistics are computed correctly then the
results from \code{susie_suff_stat} should be the same as (or very
similar to) \code{susie}, although runtimes will differ as
discussed below. The simplest sufficient statistics are the sample
size \code{n}, and then the p by p matrix \eqn{X'X}, the p-vector
\eqn{X'y}, and the sum of squared y values \eqn{y'y}, all computed
after centering the columns of \eqn{X} and the vector \eqn{y} to
have mean 0; these can be computed using \code{compute_suff_stat}.
Alternatively the user can provide \code{n} and \code{bhat} (the
univariate OLS estimates from regressing y on each column of X),
\code{shat} (the standard errors from these OLS regressions), the
p by p symmetric, positive semidefinite correlation
matrix \eqn{R = cov2cor(X'X)}, and the variance of \eqn{y}, again
all computed from centered \eqn{X} and \eqn{y}. Note that here
\code{R} and \code{bhat} should be computed using the same matrix
\eqn{X}. If you do not have access to the original \eqn{X} to
compute the matrix \code{R} then use \code{\link{susie_rss}}.

The handling of the intercept term in \code{susie_suff_stat} needs
some additional explanation. Computing the summary data after
centering \code{X} and \code{y} effectively ensures that the
resulting posterior quantities for \eqn{b} allow for an intercept
in the model; however, the actual value of the intercept cannot be
estimated from these centered data. To estimate the intercept term
the user must also provide the column means of \eqn{X} and the mean
of \eqn{y} (\code{X_colmeans} and \code{y_mean}). If these are not
provided, they are treated as \code{NA}, which results in the
intercept being \code{NA}. If for some reason you prefer to have
the intercept be 0 instead of \code{NA} then set
\code{X_colmeans = 0,y_mean = 0}.

For completeness, we note that if \code{susie_suff_stat} is run on
\eqn{X'X, X'y, y'y} computed \emph{without} centering \eqn{X} and
\eqn{y}, and with \code{X_colmeans = 0,y_mean = 0}, this is
equivalent to \code{susie} applied to \eqn{X, y} with
\code{intercept = FALSE} (although results may differ due to
different initializations of \code{residual_variance} and
\code{scaled_prior_variance}). However, this usage is not
recommended for most situations.

The computational complexity of \code{susie} is \eqn{O(npL)} per
iteration, whereas \code{susie_suff_stat} is \eqn{O(p^2L)} per
iteration (not including the cost of computing the sufficient
statistics, which is dominated by the \eqn{O(np^2)} cost of
computing \eqn{X'X}). Because of the cost of computing \eqn{X'X},
\code{susie} will usually be faster. However, if \eqn{n >> p},
and/or if \eqn{X'X} is already computed, then
\code{susie_suff_stat} may be faster.
}
\examples{
# susie example
set.seed(1)
n = 1000
p = 1000
beta = rep(0,p)
beta[1:4] = 1
X = matrix(rnorm(n*p),nrow = n,ncol = p)
X = scale(X,center = TRUE,scale = TRUE)
y = drop(X \%*\% beta + rnorm(n))
res1 = susie(X,y,L = 10)
susie_get_cs(res1) # extract credible sets from fit
plot(beta,coef(res1)[-1])
abline(a = 0,b = 1,col = "skyblue",lty = "dashed")
plot(y,predict(res1))
abline(a = 0,b = 1,col = "skyblue",lty = "dashed")

# susie_suff_stat example
input_ss = compute_suff_stat(X,y)
res2 = with(input_ss,
            susie_suff_stat(XtX = XtX,Xty = Xty,yty = yty,n = n,
                            X_colmeans = X_colmeans,y_mean = y_mean,L = 10))
plot(coef(res1),coef(res2))
abline(a = 0,b = 1,col = "skyblue",lty = "dashed")

}
\references{
G. Wang, A. Sarkar, P. Carbonetto and M. Stephens (2020). A simple
  new approach to variable selection in regression, with application
  to genetic fine-mapping. \emph{Journal of the Royal Statistical
  Society, Series B} \bold{82}, 1273-1300 \doi{10.1111/rssb.12388}.

  Y. Zou, P. Carbonetto, G. Wang and M. Stephens (2021).
  Fine-mapping from summary data with the \dQuote{Sum of Single Effects}
  model. \emph{bioRxiv} \doi{10.1101/2021.11.03.467167}.
}
\seealso{
\code{\link{susie_get_cs}} and other \code{susie_get_*}
  functions for extracting results; \code{\link{susie_trendfilter}} for
  applying the SuSiE model to non-parametric regression, particularly
  changepoint problems, and \code{\link{susie_rss}} for applying the
  SuSiE model when one only has access to limited summary statistics
  related to \eqn{X} and \eqn{y} (typically in genetic applications).
}
