Source code for pypeit.cache

# -*- coding: utf-8 -*-
"""
PypeIt uses the `astropy.utils.data`_ caching system to limit the size of its
package distribution in PyPI by enabling on-demand downloading of reference
files needed for specific data-reduction steps.  This module provides the
low-level utility functions that interface with the cache.

Access to the data files is handled in the code base using the
:class:`~pypeit.pypeitdata.PypeItDataPaths` object instantiated every time
PypeIt is imported.

To get the location of your pypeit cache (by default ``~/.pypeit/cache``) you
can run:

.. code-block:: python

    import astropy.config.paths
    print(astropy.config.paths.get_cache_dir('pypeit'))

.. note::

    If the hostname URL for the telluric atmospheric grids on S3 changes, the
    only place that needs to change is the file ``pypeit/data/s3_url.txt``.

.. include:: ../include/links.rst
"""
from functools import reduce
from importlib import resources
import pathlib
import urllib.error
from urllib.parse import urljoin, urlparse
from datetime import datetime

import packaging

from IPython import embed

import numpy as np

import astropy.utils.data
import github
import requests


# NOTE: pygit2 is only used for testing purposes.  It is not a requirement for a
# general user.  Hence the try block below.
try:
    from pygit2 import Repository
except ImportError:
    Repository = None
    GitError = None
else:
    from pygit2 import GitError


# NOTE: To avoid circular imports, avoid (if possible) importing anything from
# pypeit into this module!  Objects created or available in pypeit/__init__.py
# are the exceptions, for now.
from pypeit import log
from pypeit import PypeItError, PypeItPathError
from pypeit import __version__


# Location of the ``data`` directory inside the installed ``pypeit`` package;
# used in this module as the fallback location of ``s3_url.txt``.
__PYPEIT_DATA__ = resources.files('pypeit') / 'data'
# Default ``owner/name`` path of the GitHub repository, returned by
# ``git_remote_path`` when no local repository or "origin" remote is available.
__PYPEIT_REPO_PATH__ = 'pypeit/PypeIt'


def git_repo():
    """
    Open the git repository that contains the installed ``pypeit`` package.

    Returns:
        ``pygit2.Repository``: Handle to the local repository, or None if
        ``pygit2`` is not installed or the package directory is not inside a
        git repository.
    """
    repo = None
    if Repository is not None:
        try:
            repo = Repository(resources.files('pypeit'))
        except GitError:
            # The package directory is not part of a git repository
            repo = None
    return repo
# For development versions, try to get the branch name
def git_branch():
    """
    Return the name (or commit hash) of what is currently checked out.

    Returns:
        :obj:`str`: The branch shorthand name, or the hash of the commit that
        ``HEAD`` points to when the head is detached.  If no local repository
        is available (see :func:`git_repo`), this is ``'develop'`` for
        development versions (``'.dev'`` in the version string) and the
        PypeIt version string otherwise.
    """
    local_repo = git_repo()

    if local_repo is None:
        # No repository to query; infer the branch/tag from the version
        if '.dev' in __version__:
            return 'develop'
        return __version__

    if local_repo.head_is_detached:
        return str(local_repo.head.target)
    return str(local_repo.head.shorthand)
def git_remote_path():
    """
    The main path to the GitHub repository.

    This defaults to the main repository if the repository cannot be defined
    (see :func:`git_repo`), if the "origin" remote URL cannot be determined, or
    if no path can be extracted from that URL.

    Both URL-style remotes (``https://github.com/owner/repo.git``,
    ``ssh://git@github.com/owner/repo.git``) and scp-like SSH remotes
    (``git@github.com:owner/repo.git``) are supported.

    Returns:
        :obj:`str`: Remote path in the form ``owner/repo``, without leading or
        trailing slashes and without the ``.git`` suffix.
    """
    repo = git_repo()
    if repo is None:
        return __PYPEIT_REPO_PATH__
    try:
        url = repo.remotes['origin'].url
    except KeyError:
        return __PYPEIT_REPO_PATH__

    parsed = urlparse(url)
    path = parsed.path
    if not parsed.scheme and not parsed.netloc and ':' in path:
        # scp-like syntax ("[user@]host:owner/repo.git") is not a URL, so
        # urlparse leaves the host in the path; drop everything up to the colon.
        path = path.split(':', 1)[1]

    # Only strip a *trailing* ".git"; a blanket replace would also mangle
    # names that merely contain ".git" (e.g., ".github").
    path = path.strip('/').removesuffix('.git')
    return path or __PYPEIT_REPO_PATH__
def github_contents(repo, branch, path, recursive=True):
    """
    (Recursively) Acquire a listing of the contents of a repository directory.

    Args:
        repo (`github.Repository`_):
            Repository to search
        branch (:obj:`str`):
            Name of the branch or commit hash
        path (:obj:`str`):
            Path relative to the top-level directory of the repository to
            search.
        recursive (:obj:`bool`, optional):
            Flag to search the directory recursively.  If False, subdirectory
            names are included in the list of returned objects (and the result
            of the GitHub query is returned unaltered).  If True,
            subdirectories are removed from the listing and replaced by their
            contents; in this case the list of all objects should only include
            repository files.  If True and ``path`` points to a single file,
            a one-element list is returned.

    Returns:
        :obj:`list`: A list of `github.ContentFile`_ objects with the repo
        contents.  Entries found directly in ``path`` come first, followed by
        the contents of each subdirectory in the order the subdirectories were
        listed.

    Raises:
        PypeItPathError: Raised if the GitHub query for ``path`` fails.
    """
    try:
        # Collect the contents
        contents = repo.get_contents(path, branch)
    except github.GithubException as e:
        raise PypeItPathError(f'{path} not found in the {branch} of the GitHub tree.') from e

    # If not searching recursively, we're done
    if not recursive:
        return contents

    # The query returns a single object (not a list) when the path is a file;
    # there is nothing to descend into.
    if not isinstance(contents, list):
        return [contents]

    # Keep everything that is not a directory, then append the (recursively
    # expanded) contents of each directory.
    files = [c for c in contents if c.type != 'dir']
    for c in contents:
        if c.type == 'dir':
            files.extend(github_contents(repo, branch, c.path, recursive=True))
    return files
def git_most_recent_tag():
    """
    Return the version number for the most recent tag and the date of its last
    commit.

    Tags whose names cannot be parsed as version numbers are ignored.

    Returns:
        :obj:`tuple`: The version number and an ISO format string with the date
        of the last commit included in the tag.  If no local repository is
        available (see :func:`git_repo`) or no version tags are found, the
        returned version is the same as ``pypeit.__version__`` and the date is
        None.
    """
    # Import the submodule explicitly: a bare ``import packaging`` does not
    # guarantee that ``packaging.version`` has been loaded.
    import packaging.version

    # Use the shared helper so that both a missing pygit2 and a package that
    # is not inside a git repository are handled the same way.
    repo = git_repo()
    if repo is None:
        return __version__, None

    # Map each parsed version to the reference it came from so that the lookup
    # below uses the real reference name, not the normalized version string.
    tags = {}
    for ref in repo.references:
        if 'refs/tags' not in ref:
            continue
        try:
            tags[packaging.version.parse(ref.split('/')[-1])] = ref
        except packaging.version.InvalidVersion:
            # Not a version tag
            continue

    if not tags:
        log.warning('Unable to find any tags in pypeit repository.')
        return __version__, None

    latest = max(tags)
    timestamp = repo.resolve_refish(tags[latest])[0].author.time
    return str(latest), datetime.fromtimestamp(timestamp).isoformat()
# AstroPy download/cache infrastructure ======================================#
def _remote_file_missing(url, timeout=10):
    """
    Check whether a HEAD request reports ``url`` as forbidden or not found.

    Any failure to perform the request itself (e.g., no network) is treated as
    "unknown" and returns False, so that the caller can fall through to its
    generic error message.
    """
    if url is None:
        return False
    try:
        status = requests.head(url, timeout=timeout).status_code
    except requests.exceptions.RequestException:
        return False
    return status in (requests.codes.forbidden, requests.codes.not_found)


def fetch_remote_file(
    filename: str,
    filetype: str,
    remote_host: str='github',
    install_script: bool=False,
    force_update: bool=False,
    full_url: str=None,
    return_none: bool=False,
) -> pathlib.Path:
    """
    Use `astropy.utils.data`_ to fetch file from remote or cache

    The function ``download_file()`` will first look in the local cache (the
    option ``cache=True`` is used with this function to retrieve downloaded
    files from the cache, as needed) before downloading the file from the
    remote server.

    The remote file can be forcibly downloaded through the use of
    ``force_update``.

    Args:
        filename (str):
            The base filename to search for
        filetype (str):
            The subdirectory of ``pypeit/data/`` in which to find the file
            (e.g., ``arc_lines/reid_arxiv`` or ``sensfuncs``)
        remote_host (:obj:`str`, optional):
            The remote host scheme.  Currently only 'github' and 's3_cloud' are
            supported.  Defaults to 'github'.
        install_script (:obj:`bool`, optional):
            This function is being called from an install script (i.e.,
            ``pypeit_install_telluric``) -- relates to warnings displayed.
            Defaults to False.
        force_update (:obj:`bool`, optional):
            Force `astropy.utils.data.download_file`_ to update the cache by
            downloading the latest version.  Defaults to False.
        full_url (:obj:`str`, optional):
            The full url.  If None, use :func:`_build_remote_url`.  Defaults to
            None.
        return_none (:obj:`bool`, optional):
            Return None if the file is not found.  This is only consulted after
            the more specific failure cases (file not hosted on S3, missing
            ``arc_lines/lists`` or ``extinction`` files), which always raise.
            Defaults to False.

    Returns:
        `Path`_: The local path to the desired file in the cache, or None if
        the download failed and ``return_none`` applies.

    Raises:
        PypeItError: Raised if the download fails or times out.
    """
    # In some cases, we have the full URL already, but most of the time not
    if full_url:
        remote_url, sources = full_url, None
    else:
        remote_url, sources = _build_remote_url(filename, filetype, remote_host=remote_host)

    if remote_host == "s3_cloud" and not install_script:
        # Display a warning that this may take a while, and the user may wish
        # to use an install script
        log.warning('Note: If this file takes a while to download, you may wish to use one of '
                    'the install scripts (e.g., pypeit_install_telluric) to install the file '
                    'independent of this processing script.')

    # Get the file from cache, if available, or download from the remote server
    # TODO: Make timeout a function argument?
    try:
        cache_fn = astropy.utils.data.download_file(
            remote_url,
            sources=sources,
            timeout=10,
            cache="update" if force_update else True,
            pkgname="pypeit",
        )
    except urllib.error.URLError as error:
        # ``sources`` is None when ``full_url`` was provided; probe that URL
        # instead of indexing into None.
        probe_url = sources[0] if sources else remote_url
        if remote_host == "s3_cloud" and _remote_file_missing(probe_url):
            err_msg = (
                f"The file {filename}\n"
                "is not hosted in the cloud. Please download this file from "
                "the PypeIt Google Drive and install it using the script "
                "pypeit_install_telluric --local. See instructions at "
                "https://pypeit.readthedocs.io/en/latest/installing.html#additional-data"
            )
        elif filetype == "arc_lines/lists":
            err_msg = (
                f"Cannot find local arc line list {filename}\n"
                "Use the script `pypeit_install_linelist` to install "
                "your custom line list into the cache. See instructions at "
                "https://pypeit.readthedocs.io/en/latest/wave_calib.html#line-lists"
            )
        elif filetype == "extinction":
            err_msg = (
                f"Cannot find local extinction file {filename}\n"
                "Use the script `pypeit_install_extinctfile` to install "
                "your custom extinction file into the cache. See instructions at "
                "https://pypeit.readthedocs.io/en/latest/fluxing.html#extinction-correction"
            )
        elif return_none:
            return None
        else:
            err_msg = (
                f"Error downloading {filename}: {error}\n"
                f"URL attempted: {remote_url}\n"
                "If the error relates to the server not being found, "
                "check your internet connection. If the remote server "
                "name has changed, please contact the PypeIt development "
                "team."
            )

        # Raise the appropriate error message
        raise PypeItError(err_msg) from error

    except TimeoutError as error:
        raise PypeItError(f"Timeout Error encountered: {error}") from error

    # If no error, return the pathlib object
    return pathlib.Path(cache_fn).resolve()
def search_cache(pattern: str, path_only=True):
    """
    Find entries in the PypeIt cache whose source URL contains a pattern.

    Args:
        pattern (:obj:`str`):
            Substring to look for in the source URL of each cached file.  Can
            be None, in which case every entry in the cache is selected.  Note
            that combining ``pattern=None`` with ``path_only=True`` is of
            limited use because the local file names are abstracted.
        path_only (:obj:`bool`, optional):
            If True, only the local paths of the selected files are returned.
            If False, a dictionary keyed by source URL is returned.

    Returns:
        :obj:`list`, :obj:`dict`: If ``path_only`` is True, a :obj:`list` with
        the local path of every selected entry.  Otherwise, a :obj:`dict` that
        maps the source URL of each selected entry to its local path.
    """
    # Full mapping of source URL -> local file for the pypeit cache
    cached = astropy.utils.data.cache_contents(pkgname="pypeit")

    matches = {}
    for source_url, local_file in cached.items():
        if pattern is None or pattern in source_url:
            matches[source_url] = pathlib.Path(local_file)

    if path_only:
        return list(matches.values())
    return matches
def write_file_to_cache(filename: str, cachename: str, filetype: str, remote_host: str="github"):
    """
    Use `astropy.utils.data`_ to save a local file to the cache

    The local file is registered in the PypeIt cache under the URL it would
    have if it had been downloaded from the remote server.  This allows
    locally created or separately downloaded files to be used in place of
    PypeIt-distributed versions.

    Args:
        filename (str):
            The filename of the local file to save
        cachename (str):
            The name of the cached version of the file
        filetype (str):
            The subdirectory of ``pypeit/data/`` in which to find the file
            (e.g., ``arc_lines/reid_arxiv`` or ``sensfuncs``)
        remote_host (:obj:`str`, optional):
            The remote host scheme.  Currently only 'github' and 's3_cloud' are
            supported.  Defaults to 'github'.
    """
    # The cache key is the URL the file would have on the remote host; the
    # list of download sources is not needed here.
    cache_key = _build_remote_url(cachename, filetype, remote_host=remote_host)[0]
    astropy.utils.data.import_file_to_cache(cache_key, filename, pkgname="pypeit")
def remove_from_cache(cache_url=None, pattern=None, allow_multiple=False):
    """
    Delete one or more entries from the pypeit-specific `astropy.utils.data`_
    cache.

    The entries to delete are identified either directly by URL or by
    searching the cache for a pattern (see :func:`search_cache`).  The search
    is only performed if ``cache_url`` is None.

    Args:
        cache_url (:obj:`list`, :obj:`str`, optional):
            One or more URLs in the cache to be deleted (if they exist in the
            cache).  If ``allow_multiple`` is False, this must be a single
            string.
        pattern (:obj:`str`, optional):
            A pattern to use when searching the cache for the relevant
            file(s).  If ``allow_multiple`` is False, this must return a single
            file, otherwise the function will issue a warning and nothing will
            be deleted.
        allow_multiple (:obj:`bool`, optional):
            If multiple entries are provided or found, remove them all.
    """
    if cache_url is not None:
        # Explicit URL(s) take precedence over the pattern
        targets = cache_url if isinstance(cache_url, list) else [cache_url]
    else:
        found = search_cache(pattern, path_only=False)
        if not found:
            log.warning(f'Cache does not include a file matching the pattern {pattern}.')
            return
        targets = list(found)

    if not allow_multiple and len(targets) > 1:
        log.warning('Function found or was provided with multiple entries to be removed.  Either '
                    'set allow_multiple=True, or try again with a single url or more specific '
                    'pattern.  URLs passed/found are:\n' + '\n'.join(targets))
        return

    # Use `clear_download_cache` to remove each file
    for target in targets:
        astropy.utils.data.clear_download_cache(hashorurl=target, pkgname='pypeit')
def parse_cache_url(url):
    """
    Parse a URL from the cache into its relevant components.

    GitHub URLs are expected to have a path of the form
    ``/<owner>/<repo>/<branch>/pypeit/data/<dir>/<file>``, where ``<branch>``
    may itself contain slashes; the branch is taken to be everything between
    the repository and the first ``pypeit/data`` pair.  S3 URLs are expected to
    have a path of the form ``/pypeit/<dir>/<file>``.

    Parameters
    ----------
    url : :obj:`str`
        URL of a file in the pypeit cache.  A valid cache URL must include
        either ``'github'`` or ``'s3.cloud'`` in its address.

    Returns
    -------
    host : :obj:`str`
        Host name, either ``'github'`` or ``'s3_cloud'``.  None if the ``url``
        is not valid.
    fork : :obj:`str`
        Fork name, including the leading slash (e.g., ``'/pypeit/PypeIt'``).
        None if the ``url`` is not valid or if the host is ``'s3_cloud'``.
    branch : :obj:`str`
        Branch name.  None if the ``url`` is not valid or if the host is
        ``'s3_cloud'``.
    dir : :obj:`str`
        Directory name; ``'.'`` if the file sits directly in the data
        directory.  None if the ``url`` is not valid.
    file : :obj:`str`
        File name.  None if the ``url`` is not valid.
    """
    url_parts = urlparse(url)
    parts = pathlib.PurePosixPath(url_parts.path).parts

    if 'github' in url_parts.netloc:
        # Expected: ('/', owner, repo, *branch, 'pypeit', 'data', *dir, file)
        if len(parts) > 3 and parts[0] == '/':
            fork = pathlib.PurePosixPath(*parts[:3])
            rest = parts[3:]
            # The branch has at least one component and the file name must
            # follow 'pypeit/data', which bounds where the pair can start.
            for i in range(1, len(rest) - 2):
                if rest[i:i+2] == ('pypeit', 'data'):
                    branch = '/'.join(rest[:i])
                    f_type = str(pathlib.PurePosixPath(*rest[i+2:-1]))
                    return 'github', str(fork), branch, f_type, rest[-1]
    elif 's3.cloud' in url_parts.netloc:
        # NOTE: I'm assuming "s3.cloud" will always be in the url ...
        # Expected: ('/', 'pypeit', *dir, file)
        if len(parts) > 2 and parts[:2] == ('/', 'pypeit'):
            f_type = str(pathlib.PurePosixPath(*parts[2:-1]))
            return 's3_cloud', None, None, f_type, parts[-1]

    # Unknown host or unexpected path layout
    log.warning(f'URL not recognized as a pypeit cache url:\n\t{url}')
    return None, None, None, None, None
def list_cache_contents(contents):
    """
    Print the list of cache contents

    One row is printed per recognized cache URL, under a header with matching
    column widths.  URLs that :func:`parse_cache_url` cannot parse are skipped
    (that function issues its own warning for them).

    Parameters
    ----------
    contents : :obj:`dict`
        A dictionary with key-value pairs that provide the original source url
        (key) and the path to the local file (value).  This can be generated
        using :func:`search_cache`.
    """
    # Header and rows share the same column widths so the table lines up
    print(f' {"HOST":>10} {"FORK":>20} {"BRANCH":>20} {"SUBDIR":>20} {"FILE":<30}')
    for url in contents:
        host, fork, branch, subdir, fname = parse_cache_url(url)
        if host is None:
            # Unrecognized URL; None cannot be formatted with a width
            continue
        fork = '...' if fork is None else fork
        branch = '...' if branch is None else branch
        print(f' {host:>10} {fork:>20} {branch:>20} {subdir:>20} {fname:<30}')
def _build_remote_url(f_name: str, f_type: str, remote_host: str=None):
    """
    Construct the remote URL used by the `astropy.utils.data`_ functions

    All URL construction is kept in this one function, so that a change in
    where files are hosted (GitHub or S3_Cloud) only needs to be made here.

    Parameters
    ----------
    f_name : str
        The base filename to search for
    f_type : str
        The subdirectory of ``pypeit/data/`` in which to find the file
        (e.g., ``arc_lines/reid_arxiv`` or ``sensfuncs``)
    remote_host : :obj:`str`, optional
        The remote host scheme.  Currently only 'github' and 's3_cloud' are
        supported.  Defaults to None.

    Returns
    -------
    url : str
        The URL of the ``f_name`` of ``f_type`` on server ``remote_host``
    sources : :obj:`list` or :obj:`None`
        For 's3_cloud', the list of URLs to actually try, passed to
        `astropy.utils.data.download_file`_, used in the event that the S3
        location changes.  The static ``url`` is kept as the cache name so
        that large data files are not re-downloaded when the S3 location
        changes (but the file itself is unchanged).  If None (e.g. for
        'github'), `astropy.utils.data.download_file`_ is unaffected and the
        ``url`` controls the download.

    Raises
    ------
    PypeItError
        Raised if ``remote_host`` is not one of the supported hosts.
    """
    if remote_host not in ("github", "s3_cloud"):
        raise PypeItError(f"Remote host type {remote_host} is not supported for package data caching.")

    # Trailing URL segments shared by both hosts: each subdirectory (with a
    # trailing slash so urljoin appends rather than replaces) and the file.
    tail = [f'{p}/' for p in pathlib.Path(f_type).parts]
    tail.append(f'{f_name}')

    if remote_host == "github":
        head = ['https://raw.githubusercontent.com',
                f'/{git_remote_path()}/',
                f'{git_branch()}/',
                'pypeit/',
                'data/']
        return reduce(urljoin, head + tail), None

    # S3 Cloud: a fixed URL used as the cache key, and the URL on the current
    # S3 host that is actually downloaded from.
    cache_key = reduce(urljoin, ['https://s3.cloud.com/pypeit/'] + tail)
    download_url = reduce(urljoin, [f'https://{_get_s3_hostname()}/pypeit/'] + tail)
    return cache_key, [download_url]
def _get_s3_hostname() -> str:
    """
    Read the current S3 hostname from ``pypeit/data/s3_url.txt``

    Because the hostname of the S3 server holding package data (such as the
    telluric atmospheric grids) may change periodically, the current hostname
    is kept in a separate file, ``pypeit/data/s3_url.txt``.  This function
    first tries to fetch that file from the ``release`` branch of the PypeIt
    GitHub repository.

    .. note::

        When/if the S3 URL changes, the ``release`` branch version of
        ``pypeit/data/s3_url.txt`` can be updated with a hotfix PR, and this
        routine will pull it.  If the file cannot be retrieved from GitHub,
        the copy of ``pypeit/data/s3_url.txt`` included with the package
        distribution is used instead.

    Returns:
        str: The current hostname URL of the S3 server holding package data
    """
    # Failures that trigger the fallback to the packaged copy of the file
    fallback_errors = (
        requests.exceptions.ConnectionError,
        requests.exceptions.RequestException,
        urllib.error.URLError,
        github.GithubException,
        TimeoutError,
    )

    try:
        # Locate the file on the release branch, then refresh the cached copy
        remote_repo = github.Github().get_repo("pypeit/PypeIt")
        remote_file = remote_repo.get_contents("pypeit/data/s3_url.txt", "release")
        filepath = astropy.utils.data.download_file(
            remote_file.download_url, cache="update", timeout=10, pkgname="pypeit"
        )
    except fallback_errors:
        filepath = __PYPEIT_DATA__ / 's3_url.txt'

    # Read the file and return its contents, stripped of surrounding whitespace
    with open(filepath, "r", encoding="utf-8") as fileobj:
        hostname = fileobj.read()
    return hostname.strip()