New Dockerfile and scripts for running tutorials in container

andrewdelman · andrewdelman · commit ac7e461869e9 · 2024-08-13T06:20:15.000Z
diff --git a/Docker/Dockerfile b/Docker/Dockerfile
@@ -1,19 +1,94 @@
 # syntax=docker/dockerfile:1
-FROM redhat/ubi8:latest
+# Image that sets up the Python environment for the ECCOv4 Python tutorials
+# and runs them. Derived from pangeo/base-image.
+FROM ubuntu:22.04
 
-# clone repository and set up in container
-RUN dnf update && dnf install git -y
-RUN cd ~ && git clone https://github.com/ECCO-GROUP/ECCO-v4-Python-Tutorial.git
-COPY ~/ECCO-v4-Python-Tutorial /
+# Name and numeric id of the non-root user that owns and runs the tutorials.
+# Normally supplied with --build-arg; the defaults below (an assumption,
+# following the jovyan/1000 convention of Jupyter images) keep the
+# user-creation step from failing on empty values when they are omitted.
+ARG NB_USER=jovyan
+ARG NB_UID=1000
 
-# expose port to be used later with Jupyterlab
-EXPOSE 9889
+# Set up the environment to match the variables set by repo2docker as much as possible.
+# CONDA_ENV is the name of the conda environment into which the requested packages are installed.
 
-# run jupyter set up script
-CMD ["chmod", "755", "/ECCO-v4-Python-Tutorial/Cloud_Setup/jupyter_env_setup.sh"]
-CMD ["/ECCO-v4-Python-Tutorial/Cloud_Setup/jupyter_env_setup.sh"]
+ENV CONDA_ENV=jupyter \
+    # conda lives in the same location that repo2docker uses
+    CONDA_DIR=/srv/conda \
+    # /bin/bash instead of the default /bin/sh, in which arrow keys etc. do not work
+    SHELL=/bin/bash \
+    # UTF-8 locale, to avoid encoding errors that are hard to debug
+    LANG=C.UTF-8 \
+    LC_ALL=C.UTF-8
 
-## note: might have to work out how user can input Earthdata credentials when jupyter_env_setup.sh is run, and port number/password for jupyter_lab_start.sh
 
-# run jupyter lab start script
-CMD ["~/jupyter_lab_start.sh"]
+# A variable that is built from other ENV values has to be set in a later
+# ENV instruction than the values it refers to.
+#   HOME              home directory of the non-root user
+#   NB_PYTHON_PREFIX  root of the conda environment that holds the jupyter
+#                     notebook packages
+ENV HOME=/home/${NB_USER} \
+    NB_PYTHON_PREFIX=${CONDA_DIR}/envs/${CONDA_ENV}
+
+#   PATH              the notebook environment's bin directory is listed first,
+#                     ahead of the default conda installation, so that a
+#                     `python` process (for kernels, notebooks, etc.) is the
+#                     one from the notebook environment
+#   DASK_ROOT_CONFIG  makes dask read its config from ${CONDA_DIR}/etc rather
+#                     than the default /etc, since the non-root user can write
+#                     to ${CONDA_DIR}/etc but not to /etc
+ENV PATH=${NB_PYTHON_PREFIX}/bin:${CONDA_DIR}/bin:${PATH} \
+    DASK_ROOT_CONFIG=${CONDA_DIR}/etc
+
+
+# The user-setup steps below are run as root
+USER root
+
+# Create the non-root user: first a group whose gid equals the uid, then the
+# user itself (same uid/gid) together with its home directory.
+RUN echo "Creating ${NB_USER} user..." \
+    && groupadd --gid ${NB_UID} ${NB_USER} \
+    && useradd --uid ${NB_UID} --gid ${NB_UID} --create-home --no-log-init ${NB_USER}
+
+# Give /srv to the non-root user so that things can be installed there
+# without root privileges
+RUN chown -R ${NB_USER}:${NB_USER} /srv
+
+# Write a profile script that sources conda's shell hook and activates the
+# notebook environment, so users don't have to type `conda activate` themselves.
+# Only shells that read /etc/profile.d pick this up; the jupyter notebook does
+# not. It finds the right `python` because ${NB_PYTHON_PREFIX}/bin was put at
+# the front of PATH earlier.
+RUN printf '%s\n' ". ${CONDA_DIR}/etc/profile.d/conda.sh ; conda activate ${CONDA_ENV}" \
+    > /etc/profile.d/init_conda.sh
+
+# Install basic apt packages.
+# DEBIAN_FRONTEND=noninteractive is set for the install command only, so it is
+# not baked into the runtime environment. It keeps package configuration
+# (tzdata's geographic-area question in particular) from waiting for keyboard
+# input during the build, which would be invisible here because the output is
+# sent to /dev/null.
+RUN echo "Installing Apt-get packages..." \
+    && apt-get update --fix-missing > /dev/null \
+    && DEBIAN_FRONTEND=noninteractive apt-get install -y apt-utils wget tmux zip tzdata > /dev/null \
+    && apt-get clean \
+    && rm -rf /var/lib/apt/lists/*
+
+# Add TZ configuration - https://github.com/PrefectHQ/prefect/issues/3061
+ENV TZ=UTC
+# ========================
+
+# The remaining build steps run as the non-root user, from its home directory
+USER ${NB_USER}
+WORKDIR ${HOME}
+
+# Install the latest Miniforge release for this CPU architecture in ${CONDA_DIR}.
+# The installer is removed and the package caches are cleaned in the same layer.
+# Static libraries (*.a) are deleted as well to reduce the image size - see
+# https://jcristharif.com/conda-docker-tips.html
+# .pyc files are deliberately *not* deleted, as that seems to slow down startup
+# quite a bit - see https://github.com/2i2c-org/infrastructure/issues/2047
+RUN echo "Installing Miniforge..." \
+    && wget --quiet -O installer.sh \
+        "https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-Linux-$(uname -m).sh" \
+    && /bin/bash installer.sh -u -b -p ${CONDA_DIR} \
+    && rm installer.sh \
+    && mamba clean -afy \
+    && find ${CONDA_DIR} -follow -type f -name '*.a' -delete
+
+
+# Copy the tutorial repository from the build context into the non-root
+# user's home directory, owned by that user.
+COPY --chown=${NB_USER}:${NB_USER} ./ECCO-v4-Python-Tutorial /home/${NB_USER}/ECCO-v4-Python-Tutorial
+
+# Create the conda environment ${CONDA_ENV} from the repository's environment.yml.
+# The commands are chained with `&&` (a single `&` would send the echo to the
+# background instead of running the two steps in sequence).
+RUN echo "Using environment.yml to create conda environment ${CONDA_ENV}" \
+    && mamba env create --name ${CONDA_ENV} \
+        -f ./ECCO-v4-Python-Tutorial/Docker/environment.yml
+
+# Declare 8888 as the container port (EXPOSE documents the port, it does not publish it)
+EXPOSE 8888
+
+# Start Jupyter lab inside the container through the repository's start script.
+# The relative path is resolved against the WORKDIR set above.
+ENTRYPOINT ["./ECCO-v4-Python-Tutorial/Docker/jupyter_lab_start_docker.sh"]
diff --git a/Docker/README.md b/Docker/README.md
@@ -0,0 +1,43 @@
+# Run tutorials on an AWS EC2 instance using a Docker container
+
+(valid as of 2024-08-13)
+
+## Getting started
+
+Before setting up the Docker container, you will need to start an EC2 instance. Please follow the [AWS Cloud: getting started](https://ecco-v4-python-tutorial.readthedocs.io/AWS_Cloud_getting_started.html) tutorial up to the part in Step 3 where the tutorial repository is cloned using `git clone`. However, do not run `jupyter_env_setup.sh`. Instead, run `sudo dnf install docker` to install the Docker software on your instance.
+
+## Build the Docker image
+
+The `ECCO-v4-Python-Tutorial/Docker` directory has the files that you need to build a Docker image and then run it and use the tutorials. In that directory, run `./docker_image_build.sh` and it will build a Docker image named `localhost/ecco_tut_image:latest`.
+
+The build process takes a few minutes, and the image will occupy 4-5GB of storage, so make sure your instance has sufficient storage. You will also likely need about that much memory to complete the build process, so at least a `large` instance on AWS is strongly recommended.
+
+## Run the Docker image
+
+When the build completes, you will run the image, which will activate a container within your EC2 instance, and start running Jupyter lab in that container. This is done with the following command:
+
+```bash
+docker run -it -p 8888:8888 localhost/ecco_tut_image:latest
+```
+
+Note the port numbers specified under the `-p` option. The port listed after the colon is the container port, which is always 8888 unless this is changed manually in the `Dockerfile` (on the `EXPOSE 8888` line) prior to building the image. The port before the colon is the port on the host EC2 instance that is mapped to the container port; you can choose a different host port if 8888 is not available on your instance.
+
+When the command above is run, you will first be queried for NASA Earthdata credentials if those are not already stored in a `~/.netrc` file under your user home directory. After entering the credentials, you will be queried for the container port number (8888 by default unless changed as described above), and an optional password (if no password is entered, none will be needed to log in to Jupyter lab).
+
+As Jupyter lab is launched, you will see a lot of output tagged `ServerApp` or `LabApp`. To free up this window, press `Ctrl-p` followed by `Ctrl-q`; this detaches your terminal from the container but, importantly, leaves the container running. To check the status of Docker containers, run `docker ps -a`.
+
+## Open Jupyter lab in your browser
+
+Now you need to open a connection between your local machine and the EC2 instance with the correct port forwarding. This will look something like
+
+```bash
+ssh -i ~/.ssh/aws_ec2.pem -L 9889:localhost:8888 ec2-user@100.104.70.127
+```
+
+Then, in a browser window on your local machine, open the local port that you specified before `localhost` in the command above (9889 in this example):
+
+```text
+http://localhost:9889
+```
+
+You will see a screen that asks for a password, but if you didn't enter any before, you can just go ahead and click `Login`. Now you have access to the tutorial repository, and the tutorials are in the directory `Tutorials_as_Jupyter_Notebooks`. Enjoy!
diff --git a/Docker/docker_image_build.sh b/Docker/docker_image_build.sh
@@ -0,0 +1,14 @@
+#!/bin/bash
+
+# build docker image with build context in parent user directory
+# and Dockerfile in ECCO-v4-Python-Tutorial/Docker subdirectory,
+# passing current user info as build arguments
+
+
+cd /home/${USER}
+
+docker build . \
+    --build-arg NB_USER=${USER} \
+    --build-arg NB_UID=$(id -u ${USER}) \
+    -t ecco_tut_image \
+    -f ./ECCO-v4-Python-Tutorial/Docker/Dockerfile
diff --git a/Docker/earthdata_auth_docker.sh b/Docker/earthdata_auth_docker.sh
@@ -0,0 +1,32 @@
+#!/bin/bash
+
+# Shell script for adding NASA Earthdata authentication credentials to ~/.netrc
+# if they are not already in the file.
+#
+# When ~/.netrc has no entry for urs.earthdata.nasa.gov, the user is prompted
+# for a username and password and the entry is appended. In all cases the
+# file is finally made read-only for its owner.
+
+# # Start body of script
+
+# Color codes, stored as backslash escapes and expanded by printf's %b
+red_start='\033[0;31m'
+blue_start='\033[0;34m'
+nocolor_start='\033[0m'
+
+# Set up NASA Earthdata credential
+
+# printf is used instead of echo throughout: bash's builtin echo does not
+# interpret \033 or \n unless -e is given, and would print them literally.
+printf '%b\n' "${red_start}Setting up NASA Earthdata authentication${nocolor_start}"
+# NASA Earthdata authentication
+# check if credentials are already archived in ~/.netrc, and if not then prompt the user for them
+earthdata_cred_stored=0
+if [ -f ~/.netrc ]; then
+    if grep -q "machine urs.earthdata.nasa.gov" ~/.netrc; then
+        earthdata_cred_stored=1
+        printf '%b\n' "${red_start}Earthdata credentials already archived ${nocolor_start}"
+    fi
+fi
+if [ "${earthdata_cred_stored}" -eq 0 ]; then
+    # Make sure the file exists and is accessible only by its owner before
+    # any credentials are written to it
+    [ -f ~/.netrc ] || touch ~/.netrc
+    chmod 600 ~/.netrc
+    # -r keeps backslashes in the input as typed; -s keeps the password off the screen
+    read -r -p 'NASA Earthdata username: ' uservar
+    read -r -s -p 'NASA Earthdata password: ' passvar
+    # read -s does not echo the Enter key, so finish the prompt line here
+    printf '\n'
+    # %s writes the credentials verbatim; the \n in the format string are real newlines
+    printf 'machine urs.earthdata.nasa.gov\n    login %s\n    password %s\n\n' "${uservar}" "${passvar}" >> ~/.netrc
+
+    printf '\n%b\n' "${red_start}NASA Earthdata authentication info archived in ~/.netrc${nocolor_start}"
+fi
+chmod 400 ~/.netrc
diff --git a/Docker/environment.yml b/Docker/environment.yml
diff --git a/Docker/jupyter_lab_start_docker.sh b/Docker/jupyter_lab_start_docker.sh