Source code for pypeit.core.pca

"""
Implement principle-component-analysis tools.

.. include common links, assuming primary doc root is up one directory
.. include:: ../include/links.rst
"""
from IPython import embed

import numpy as np

from matplotlib import pyplot as plt

from sklearn.decomposition import PCA

from pypeit import msgs
from pypeit import utils
from pypeit.core import fitting

[docs]def pca_decomposition(vectors, npca=None, pca_explained_var=99.0, mean=None):
    r"""
    Perform principle-component analysis (PCA) for a set of 1D vectors.

    The vectors are first passed to an unconstrained PCA to determine
    the growth curve of the accounted variance as a function of the
    PCA component. If specifying a number of PCA components to use
    (see `npca`), this yields the percentage of the variance
    accounted for in the analysis. If instead specifying the target
    variance percentage (see `pca_explained_var`), this is used to
    determine the number of PCA components to use in the final
    analysis.

    .. note::

        This is a fully generalized convenience function for a
        specific use of `sklearn.decomposition.PCA`_. When used
        within PypeIt, the vectors to decompose typically have the
        length of the spectral axis. This means that, within PypeIt,
        arrays are typically transposed when passed to this function.

    Args:
        vectors (`numpy.ndarray`_):
            A 2D array with vectors to analyze with shape
            :math:`(N_{\rm vec}, N_{\rm pix})`. All vectors must be
            the same length and cannot be masked.
        npca (:obj:`bool`, optional):
            The number of PCA components to keep, which must be less
            than :math:`N_{\rm vec}`. If `npca==nvec`, no PCA
            compression occurs. If None, `npca` is automatically
            determined by calculating the minimum number of
            components required to explain a given percentage of
            variance in the data. (see `pca_explained_var`).
        pca_explained_var (:obj:`float`, optional):
            The percentage (i.e., not the fraction) of the variance
            in the data accounted for by the PCA used to truncate the
            number of PCA coefficients to keep (see `npca`). Ignored
            if `npca` is provided directly.
        mean (`numpy.ndarray`_, optional):
            The mean value of each vector to subtract from the data
            before performing the PCA. If None, this is determined
            directly from the data. Shape must be :math:`N_{\rm
            vec}`.

    Returns:
        Returns four `numpy.ndarray`_ objects:
            - The coefficients of each PCA component, `coeffs`. Shape
              is :math:`(N_{\rm vec},N_{\rm comp})`.
            - The PCA component vectors, `components`. Shape is
              :math:`(N_{\rm comp},N_{\rm pix})`.
            - The mean offset of each PCA for each pixel, `pca_mean`.
              Shape is :math:`(N_{\rm pix},)`.
            - The mean offset applied to each vector before the PCA,
              `vec_mean`. Shape is :math:`(N_{\rm vec},)`.

        To reconstruct the PCA representation of the input vectors, compute::

            np.dot(coeffs, components) + pca_mean[None,:] + vec_mean[:,None]

    """
    # Check input
    if vectors.ndim != 2:
        raise ValueError('Input trace data must be a 2D array')
    nvec = vectors.shape[0]
    if nvec < 2:
        raise ValueError('There must be at least 2 vectors for the PCA analysis.')

    # Take out the mean value of each vector
    if mean is None:
        mean = np.mean(vectors, axis=1)
    vec_pca = vectors - mean[:,None]

    # Perform unconstrained PCA of the vectors
    pca = PCA()
    pca.fit(vec_pca)

    # Compute the cumulative distribution of the variance explained by the PCA components.
    # TODO: Why round to 6 decimals?  Why work in percentages?
    var_growth = np.cumsum(np.round(pca.explained_variance_ratio_, decimals=6) * 100)
    # Number of components for a full decomposition
    npca_tot = var_growth.size

    msgs.info('The unconstrained PCA yields {0} components.'.format(npca_tot))
    if npca is None:
        # Assign the number of components to use based on the variance
        # percentage
        if pca_explained_var is None:
            raise ValueError('Must provide percentage explained variance.')
        npca = int(np.ceil(np.interp(pca_explained_var, var_growth, np.arange(npca_tot)+1))) \
                    if var_growth[0] < pca_explained_var else 1
    elif npca_tot < npca:
        raise ValueError('Too few vectors for a PCA of the requested dimensionality.  '
                         'The full (uncompressing) PCA has {0} component(s)'.format(npca_tot)
                         + ', which is less than the requested {0} component(s).'.format(npca)
                         + '  Lower the number of requested PCA component(s) or turn off the PCA.')

    msgs.info('PCA will include {0} component(s), '.format(npca)
              + 'containing {0:.3f}% of the total variance.'.format(var_growth[npca-1]))

    # Determine the PCA coefficients with the revised number of
    # components, and return the results
    pca = PCA(n_components=npca)
    pca_coeffs = pca.fit_transform(vec_pca)
    return pca_coeffs, pca.components_, pca.mean_, mean


[docs]def fit_pca_coefficients(coeff, order, ivar=None, weights=None, function='legendre', lower=3.0,
                         upper=3.0, maxrej=1, maxiter=25, coo=None, minx=None, maxx=None, 
                         debug=False):
    r"""
    Fit a parameterized function to a set of PCA coefficients,
    primarily for the purpose of predicting coefficients at
    intermediate locations.

    The coefficients of each PCA component are fit by a low-order
    polynomial, where the abscissa is set by the `coo` argument (see
    :func:`~pypeit.core.fitting.robust_fit`).

    .. note::
        This is a general function, not really specific to the PCA;
        and is really just a wrapper for
        :func:`~pypeit.core.fitting.robust_fit`.

    Args:
        coeff (`numpy.ndarray`_):
            PCA component coefficients. If the PCA decomposition used
            :math:`N_{\rm comp}` components for :math:`N_{\rm vec}`
            vectors, the shape of this array must be :math:`(N_{\rm
            vec}, N_{\rm comp})`. The array can be 1D with shape
            :math:`(N_{\rm vec},)` if there was only one PCA
            component.
        order (:obj:`int`, `numpy.ndarray`_):
            The order, :math:`o`, of the function used to fit the PCA
            coefficients. Can be a single number for all PCA
            components, or an array with an order specific to each
            component. If the latter, the shape must be
            :math:`(N_{\rm comp},)`.
        ivar (`numpy.ndarray`_, optional):
            Inverse variance in the PCA coefficients to use during
            the fit; see the `invvar` parameter of
            :func:`~pypeit.core.fitting.robust_fit`. If None, fit is
            not error weighted. If a vector with shape :math:`(N_{\rm
            vec},)`, the same error will be assumed for all PCA
            components (i.e., `ivar` will be expanded to match the
            shape of `coeff`). If a 2D array, the shape must match
            `coeff`.
        weights (`numpy.ndarray`_, optional):
            Weights to apply to the PCA coefficients during the fit;
            see the `weights` parameter of
            :func:`~pypeit.core.fitting.robust_fit`. If None, the
            weights are uniform. If a vector with shape
            :math:`(N_{\rm vec},)`, the same weights will be assumed
            for all PCA components (i.e., `weights` will be expanded
            to match the shape of `coeff`). If a 2D array, the shape
            must match `coeff`.
        function (:obj:`str`, optional):
            Type of function used to fit the data.
        lower (:obj:`float`, optional):
            Number of standard deviations used for rejecting data
            **below** the mean residual. If None, no rejection is
            performed. See :func:`~pypeit.core.fitting.robust_fit`.
        upper (:obj:`float`, optional):
            Number of standard deviations used for rejecting data
            **above** the mean residual. If None, no rejection is
            performed. See :func:`~pypeit.core.fitting.robust_fit`.
        maxrej (:obj:`int`, optional):
            Maximum number of points to reject during fit iterations.
            See :func:`~pypeit.core.fitting.robust_fit`.
        maxiter (:obj:`int`, optional):
            Maximum number of rejection iterations allows. To force
            no rejection iterations, set to 0.
        coo (`numpy.ndarray`_, optional):
            Floating-point array with the independent coordinates to
            use when fitting the PCA coefficients. If None, simply
            uses a running number. Shape must be :math:`(N_{\rm
            vec},)`.
        minx, maxx (:obj:`float`, optional):
            Minimum and maximum values used to rescale the
            independent axis data. If None, the minimum and maximum
            values of `coo` are used. See
            :func:`~pypeit.core.fitting.robust_fit`.
        debug (:obj:`bool`, optional):
            Show plots useful for debugging.

    Returns:
        `numpy.ndarray`_: One or more
        :class:`~pypeit.core.fitting.PypeItFit` instances, one per
        PCA component, that models the PCA component coefficients as
        a function of the reference coordinates. These can be used to
        predict new vectors that follow the PCA model at a new
        coordinate; see :func:`pca_predict`.
    """
    # Check the input
    #   - Get the shape of the input data to fit
    _coeff = np.asarray(coeff)
    if _coeff.ndim == 1:
        _coeff = np.expand_dims(_coeff, 1)
    if _coeff.ndim != 2:
        raise ValueError('Array with coefficiencts cannot be more than 2D')
    nvec, npca = _coeff.shape
    #   - Check the inverse variance
    _ivar = None if ivar is None else np.atleast_2d(ivar)
    if _ivar is not None and _ivar.shape != _coeff.shape:
        raise ValueError('Inverse variance array does not match input coefficients.')
    #   - Check the weights
    _weights = np.ones(_coeff.shape, dtype=float) if weights is None else np.asarray(weights)
    if _weights.ndim == 1:
        _weights = np.tile(_weights, (_coeff.shape[1],1)).T
    if _weights.shape != _coeff.shape:
        raise ValueError('Weights array does not match input coefficients.')
    #   - Set the abscissa of the data if not provided and check its
    #   shape
    if coo is None:
        coo = np.arange(nvec, dtype=float)
    if coo.size != nvec:
        raise ValueError('Vector coordinates have incorrect shape.')
    #   - Check the order of the functions to fit
    _order = np.atleast_1d(order)
    if _order.size == 1:
        _order = np.full(npca, order, dtype=int)
    if _order.size != npca:
        raise ValueError('Function order must be a single number or one number per PCA component.')
    #   - Force the values of minx and maxx if they're not provided directly
    if minx is None:
        minx = np.amin(coo)
    if maxx is None:
        maxx = np.amax(coo)

    # Instantiate the output

    # TODO: This fitting is fast. Maybe we should determine the best
    #  order for each PCA component, up to some maximum, by comparing
    #  reduction in chi-square vs added number of parameters?

    # Fit the coefficients of each PCA component so that they can be
    # interpolated to other coordinates.

    inmask = np.ones_like(coo, dtype=bool)
    model = np.empty(npca, dtype=fitting.PypeItFit)
    for i in range(npca):
        model[i] = fitting.robust_fit(coo, _coeff[:,i], _order[i], in_gpm=inmask,
                                      invvar=None if _ivar is None else _ivar[:,i],
                                      weights=_weights[:,i], function=function, maxiter=maxiter,
                                      lower=lower, upper=upper, maxrej=maxrej, sticky=False,
                                      use_mad=_ivar is None, minx=minx, maxx=maxx)
        if debug:
            # Visually check the fits
            xvec = np.linspace(np.amin(coo), np.amax(coo), num=100)
            rejected = np.logical_not(model[i].gpm) & inmask
            plt.scatter(coo[inmask], _coeff[inmask,i], marker='.', color='k', s=100,
                        facecolor='none', label='pca coeff')
            plt.scatter(coo[np.logical_not(inmask)], _coeff[np.logical_not(inmask),i], marker='.',
                        color='orange', s=100, facecolor='none',
                        label='pca coeff, masked from previous')
            if np.any(rejected):
                plt.scatter(coo[rejected], _coeff[rejected,i], marker='x', color='C3', s=80, 
                            label='robust_polyfit_djs rejected')
            plt.plot(xvec, model[i].eval(xvec), linestyle='--', color='C0',
                     label='Polynomial fit of order={0}'.format(_order[i]))
            plt.xlabel('Trace Coordinate', fontsize=14)
            plt.ylabel('PCA Coefficient', fontsize=14)
            plt.title('PCA Fit for Dimension #{0}/{1}'.format(i+1, npca))
            plt.legend()
            plt.show()

        # Propagate rejection of coeffs for this component to the next
        # component.
        # TODO: Can we put in a comment here or in the docstring
        # explaining why we do this?
        inmask = model[i].gpm.astype(bool)

    # Return the fitted model
    return model


[docs]def pca_predict(x, pca_coeffs_model, pca_components, pca_mean, mean):
    r"""
    Use a model of the PCA coefficients to predict vectors at the
    specified coordinates.

    Args:
        x (:obj:`float`, `numpy.ndarray`_):
            One or more trace coordinates at which to sample the PCA
            coefficients and produce the PCA-driven model. As used
            within PypeIt, this is typically the spatial pixel
            coordinate or echelle order number.
        pca_coeffs_model (`numpy.ndarray`_):
            An array of :class:`~pypeit.core.fitting.PypeItFit`
            objects, one PCA component, used to calculate the PCA
            coefficients at the provided position, ``x``. See
            :func:`fit_pca_coefficients`.
        pca_components (`numpy.ndarray`_):
            Vectors with the PCA components.  Shape must be
            :math:`(N_{\rm comp}, N_{\rm pix})`.
        pca_mean (`numpy.ndarray`_):
            The mean offset of the PCA decomposition for each pixel.
            Shape is :math:`(N_{\rm pix},)`.
        mean (:obj:`float`, `numpy.ndarray`_):
            The mean offset of each trace coordinate to use for the
            PCA prediction. This is typically identical to ``x``, and
            its shape must match ``x``.
    
    Returns:
        `numpy.ndarray`_: PCA constructed vectors, one per position
        ``x``. Shape is either :math:`(N_{\rm pix},)` or
        :math:`(N_{\rm x},N_{\rm pix})`, depending on the input
        shape/type of ``x``.
    """
    _x = np.atleast_1d(x)
    _mean = np.atleast_1d(mean)
    if _x.ndim != 1:
        raise ValueError('Coordinates for predicted vectors must be no more than 1D.')
    if _mean.shape != _x.shape:
        raise ValueError('Input mean must match the shape of the input prediction coordinates.')
    # Calculate the coefficients using the best fitting function
    npca = pca_components.shape[0]
    c = np.zeros((_x.size, npca), dtype=float)
    for i in range(npca):
        c[:,i] = pca_coeffs_model[i].eval(_x)
    # Calculate the predicted vectors and return them
    vectors = np.dot(c, pca_components) + pca_mean[None,:] + _mean[:,None]
    return vectors if isinstance(x, np.ndarray) else vectors[0,:]