Compare revisions

Changes are shown as if the source revision was being merged into the target revision.
Target project: ssa/workspaces
Commits on Source (54), showing 299 additions and 300 deletions
[paths]
source =
./
/code/
/packages/
/code/packages
\ No newline at end of file
......@@ -65,11 +65,12 @@ services/capability/capability.log
services/capability/workflow.log
services/workflow/workflow.log
services/notification/notification.log
**/.coverage
**/.coverage*
**/htmlcov
**/coverage.xml
**/coverage.json
pyproject.toml
package-lock.json
# Ignore docker volume mount points
services/**/**/apps
......
stages:
- build-base
- push-base
- run-schema
- build-dev
- unit-test-dev
- test-coverage
- push-dev
- clean-images
# - deploy-dev
- deploy-coverage-page
- deploy-dev
- .post
# - deploy-test
variables:
PROJECT_NAME: "workspaces"
DEFAULT_DEPLOY_ENV: "dev"
# Postgres Service Variables
POSTGRES_DB: archive
POSTGRES_USER: "archive"
POSTGRES_PASSWORD: "docker"
# CI Postgres Service
services:
- name: marconi.aoc.nrao.edu/ops/ci/db:workspaces
alias: db
image: docker:19.03.12
workflow:
rules:
- if: $CI_MERGE_REQUEST_TITLE =~ /^WIP:|^Draft:/
when: never
- if: $CI_MERGE_REQUEST_IID
- if: $CI_COMMIT_TAG
- if: $CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH
......@@ -38,6 +51,7 @@ build base image:
- if: '$CI_PIPELINE_SOURCE == "merge_request_event" && $CI_COMMIT_MESSAGE =~ /\A(?i)-ops/'
changes:
- Dockerfile.base
- docker.properties
# Push Base Image Stage
push base image:
......@@ -50,24 +64,48 @@ push base image:
- if: '$CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH && $CI_COMMIT_MESSAGE =~ /\A(?i)-ops/'
changes:
- Dockerfile.base
- docker.properties
# Run Schema
run schema:
stage: run-schema
image: ${REGISTRY_URL}/ops/base:${PROJECT_NAME}
script:
- export PGPASSWORD=$POSTGRES_PASSWORD
- cd schema && ./bin/run-migrations.sh "docker"
rules:
- if: $CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH
when: always
- if: '$CI_PIPELINE_SOURCE == "merge_request_event"'
when: always
# Build Stages
build dev workflow:
stage: build-dev
variables:
SERVICE_NAME: "workflow"
PATH_PREFIX: "services/"
extends: .build
build dev capability:
stage: build-dev
variables:
SERVICE_NAME: "capability"
PATH_PREFIX: "services/"
extends: .build
build dev notification:
stage: build-dev
variables:
SERVICE_NAME: "notification"
PATH_PREFIX: "services/"
extends: .build
build dev web:
stage: build-dev
variables:
SERVICE_NAME: "web"
PATH_PREFIX: "apps/"
extends: .build
## Test Stages ##
......@@ -97,25 +135,31 @@ unit test dev notification:
needs:
- build dev notification
# disabled stage
.unit test coverage:
# Generate Coverage reports
unit test coverage:
stage: test-coverage
image: python:3.8-slim
variables:
SERVICE_NAME_1: "capability"
SERVICE_NAME_2: "workflow"
SERVICE_NAME_3: "notification"
CONTAINER_NAME_1: ${SERVICE_NAME_1}_${CI_COMMIT_SHORT_SHA}
CONTAINER_NAME_2: ${SERVICE_NAME_2}_${CI_COMMIT_SHORT_SHA}
CONTAINER_NAME_3: ${SERVICE_NAME_3}_${CI_COMMIT_SHORT_SHA}
before_script:
- pip install pytest pytest-cov
script:
- docker container run --name ${CONTAINER_NAME_1} ${REGISTRY_URL}/${PROJECT_NAME}/${SERVICE_NAME_1}:${CI_COMMIT_SHORT_SHA} ./bin/run-tests.sh -cr xml -o ${SERVICE_NAME_1}.xml
- docker container run --name ${CONTAINER_NAME_2} ${REGISTRY_URL}/${PROJECT_NAME}/${SERVICE_NAME_2}:${CI_COMMIT_SHORT_SHA} ./bin/run-tests.sh -cr xml -o ${SERVICE_NAME_2}.xml
- docker container run --name ${CONTAINER_NAME_3} ${REGISTRY_URL}/${PROJECT_NAME}/${SERVICE_NAME_3}:${CI_COMMIT_SHORT_SHA} ./bin/run-tests.sh -cr xml -o ${SERVICE_NAME_3}.xml
- docker cp ${CONTAINER_NAME_1}:coverage.xml ./${CONTAINER_NAME_1}.xml
- docker cp ${CONTAINER_NAME_2}:coverage.xml ./${CONTAINER_NAME_2}.xml
- docker cp ${CONTAINER_NAME_3}:coverage.xml ./${CONTAINER_NAME_3}.xml
- coverage combine --append
- coverage report
- coverage xml
- coverage html
artifacts:
reports:
cobertura: coverage.xml
paths:
- coverage.xml
- htmlcov/
dependencies:
- unit test dev workflow
- unit test dev capability
- unit test dev notification
rules:
- if: $CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH
- if: '$CI_PIPELINE_SOURCE == "merge_request_event"'
# Push Stages
push dev workflow:
......@@ -142,35 +186,69 @@ push dev notification:
needs:
- unit test dev notification
push dev web:
stage: push-dev
variables:
SERVICE_NAME: "web"
extends: .push
# UI tests coming soon!
# needs:
# - unit test dev ui
# Cleanup
clean build workflow:
stage: clean-images
stage: .post
variables:
SERVICE_NAME: "workflow"
extends: .cleanup
rules:
- if: '$CI_PIPELINE_SOURCE == "merge_request_event" && $CI_COMMIT_MESSAGE =~ /\A(?i)-debug/'
when: never
allow_failure: true
clean build capability:
stage: clean-images
stage: .post
variables:
SERVICE_NAME: "capability"
extends: .cleanup
rules:
- if: '$CI_PIPELINE_SOURCE == "merge_request_event" && $CI_COMMIT_MESSAGE =~ /\A(?i)-debug/'
when: never
allow_failure: true
clean build notification:
stage: clean-images
stage: .post
variables:
SERVICE_NAME: "notification"
extends: .cleanup
rules:
- if: '$CI_PIPELINE_SOURCE == "merge_request_event" && $CI_COMMIT_MESSAGE =~ /\A(?i)-debug/'
when: never
allow_failure: true
clean build web:
stage: .post
variables:
SERVICE_NAME: "web"
extends: .cleanup
allow_failure: true
# Deploy Stages
pages:
stage: deploy-coverage-page
image: python:3.8-slim
dependencies:
- unit test coverage
script:
- mkdir public
- mv htmlcov/* public/
artifacts:
paths:
- public
expire_in: 2 weeks
rules:
- if: $CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH
# Development
deploy dev:
stage: deploy-dev
variables:
ENV: "dev"
script:
- SWARM_NODE_ENV=$ENV TAG_TO_DEPLOY=$CI_DEFAULT_BRANCH docker stack deploy --compose-file docker-compose.${ENV}.yml workspaces-${ENV}
rules:
- if: $CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH
# Development
# deploy dev:
......@@ -229,4 +307,4 @@ clean build notification:
# SWARM_NODE_ENV="test" TAG_TO_DEPLOY="${CI_COMMIT_TAG}" docker stack deploy --compose-file docker-compose.dev.yml workspaces-dev
# rules:
# - if: $CI_COMMIT_TAG
# when: manual
\ No newline at end of file
# when: manual
SHELL := /bin/bash
.PHONY: check-build test-dev test dev setup alembic-update docker-base db build coverage clean
all: dev check-build
# Check if local code will pass CI build
.PHONY: check-build
check-build: docker-base docker-dev-images-locally test-dev
# Run tests on Dockerfile.dev images
.PHONY: test-dev
test-dev:
docker run nrao:workflow ./bin/run-tests.sh
docker run nrao:capability ./bin/run-tests.sh
docker run nrao:notification ./bin/run-tests.sh
# Run tests on Dockerfile.local containers
.PHONY: test
test:
docker exec workspaces_workflow_1 ./bin/run-tests.sh
docker exec workspaces_capability_1 ./bin/run-tests.sh
docker exec workspaces_notification_1 ./bin/run-tests.sh
# Setup local development environment
.PHONY: dev
dev: docker-base
.PHONY: setup
setup:
docker exec workspaces_capability_1 /bin/bash -c 'python -m pip install -r requirements.txt'
docker exec workspaces_capability_1 /bin/bash -c 'python -m pip install -e .'
docker exec workspaces_workflow_1 /bin/bash -c 'python -m pip install -r requirements.txt'
docker exec workspaces_workflow_1 /bin/bash -c 'python -m pip install -e .'
docker exec workspaces_notification_1 /bin/bash -c 'python -m pip install -r requirements.txt'
docker exec workspaces_notification_1 /bin/bash -c 'python -m pip install -e .'
docker-compose restart
.PHONY: alembic-update
alembic-update:
cd schema; \
env CAPO_PROFILE=local alembic upgrade head
# Build images from Dockerfile.dev
.PHONY: docker-dev-images-locally
docker-dev-images-locally:
docker build -t nrao:workflow -f services/workflow/Dockerfile.local . --build-arg capo_env=docker
docker build -t nrao:capability -f services/capability/Dockerfile.local . --build-arg capo_env=docker
docker build -t nrao:notification -f services/notification/Dockerfile.local . --build-arg capo_env=docker
# Build base image
.PHONY: docker-base
docker-base:
docker-base: db
docker build -t marconi.aoc.nrao.edu/ops/base:workspaces -f Dockerfile.base .
docker build -t marconi.aoc.nrao.edu/ops/base:nodejs-14 -f apps/web/Dockerfile.base .
# Build DB image
db:
docker build -t marconi.aoc.nrao.edu/ops/ci/db:workspaces -f ./ci/psql/Dockerfile.db .
# Build docker images
build: docker-base db
docker-compose build --no-cache
# Generate HTML coverage report
coverage:
docker exec workspaces_workflow_1 ./bin/run-tests.sh -b
cp services/workflow/.coverage ./.coverage.wf
docker exec workspaces_capability_1 ./bin/run-tests.sh -b
cp services/capability/.coverage ./.coverage.cap
docker exec workspaces_notification_1 ./bin/run-tests.sh -b
cp services/notification/.coverage ./.coverage.no
coverage combine --append
coverage html
# Clean up environment
clean:
# Clean up Docker environment
docker-compose down
-docker images -aq | xargs docker rmi
docker system prune --volumes -af
docker volume prune -f
# Delete Python cache directories
find . \( -name "*.egg-info" -o -name ".pytest_cache" \) | xargs rm -r
......@@ -30,7 +30,6 @@ setup(
install_requires=requires,
keywords=[],
packages=["datafetcher"],
package_dir={"": "src"},
classifiers=["Programming Language :: Python :: 3.8"],
entry_points={"console_scripts": ["datafetcher = datafetcher.commands:main"]},
entry_points={"console_scripts": ["datafetcher = datafetcher.datafetcher:main"]},
)
# datafetcher Dockerfile
#
# TO BUILD the docker image: -don't- "docker build" directly!
# use docker_build.sh:
# from apps/cli/executables/datafetcher,
#
# ./docker_build.sh datafetcher_test[:N]
#
# where '-t' specifies a name and 'N' is the version.
# (If ':N' is omitted, version is 'latest' by default.)
# tag is not required for the build, but without it
# the container name is an unhelpful hexadecimal value.
FROM continuumio/miniconda3:latest
COPY environment.yml .
ENV PATH $HOME/miniconda3/bin/conda:$PATH
# docker_build.sh should have copied environment.yml from data/;
# it will be used in the command below
RUN conda env update
# get what we'll need for the build
COPY . .
# get application files and tests
COPY src/ .
COPY test/ .
# install the application
RUN ["conda", "run", "-n", "data", "python", "setup.py", "develop"]
# we'll need a Capo profile
ENV CAPO_PROFILE local
ENV CAPO_PATH test/
# finally, run the tests. be verbose. log stuff.
# (for more detailed output, use "-vv" and/or "--log-level=DEBUG";
# to quit after first failure, use "-x")
ENTRYPOINT ["conda", "run", "-n", "data", "pytest", "-vv", "--log-level=DEBUG", "--showlocals", "test/"]
......@@ -24,7 +24,7 @@ def get_project_root() -> Path:
"""
my_path = Path(__file__)
path = my_path
while not path.name.endswith("workspaces") and not path.name.endswith("code"):
while not path.name.endswith("workspaces") and not path.name.endswith("packages"):
path = path.parent
return path
......@@ -53,7 +53,7 @@ from datafetcher.utilities import (
RetrievalMode,
)
TEST_PROFILE = "local"
TEST_PROFILE = "docker"
MISSING_SETTING = ReturnCode.MISSING_SETTING.value["code"]
MISSING_PROFILE = ReturnCode.MISSING_PROFILE.value["code"]
RUN_ALL = True
......
#!/bin/bash
# Building a Docker image in which to execute tests
# will require a copy of the local Capo properties
# file, which can be found at /home/casa/capo
# on boxes that can see /home, but which on boxes
# that can't is likely to be at ~/.capo for
# any given user. Find local.properties and
# copy it to our test directory. Dockerfiles
# do not support conditional logic; hence this script.
# Execute script from apps/executables/cli/datafetcher/
FILENAME=local.properties
CONTAINER_NAME=$1;shift
CACHE_FLAG=$1;shift
USAGE="Usage: $0 <container_name> [--NO-CACHE]"
if [[ -z "${CONTAINER_NAME}" ]]
then
echo "${USAGE}"
exit 1
fi
if [ -n "${CACHE_FLAG}" ]
then
shopt -s nocasematch
if [[ "${CACHE_FLAG}" =~ ^(--)?NO[-_]CACHE$ ]]
then
USE_CACHE=0
else
echo 'invalid cache flag: '"${CACHE_FLAG}"
exit 1
fi
else
USE_CACHE=1
fi
# conda will need the environment.yml
export ENV_YML=environment.yml
export YML_DIR=../../../../
cp $YML_DIR${ENV_YML} ${ENV_YML}
# The preferred version of Capo .properties files is always
# the one at /home/casa/capo, -if- this is visible
# (i.e., NRAO internal system). If not (i.e., developer laptop),
# get the one in the user's .capo directory
if [ -e /home/casa/capo/${FILENAME} ]
then
SOURCE=/home/casa/capo/${FILENAME}
elif [ -e ~/.capo/${FILENAME} ]
then
SOURCE=~/.capo/${FILENAME}
else
echo "${FILENAME} not found!"
exit 1
fi
NEW_FILE=./test/${FILENAME}
cp ${SOURCE} ${NEW_FILE}
# remove extended attributes, which would cause Capo to balk
/usr/bin/xattr -c ${NEW_FILE}
## where the magic happens
if [ "${USE_CACHE}" == 1 ]
then
echo '>>>> Using cache, if possible'
docker build . -f test/Dockerfile -t ${CONTAINER_NAME}
else
echo '>>>> no cache'
docker build . -f test/Dockerfile --no-cache -t ${CONTAINER_NAME}
fi
# now get rid of the properties file; containing sensitive info, it must NOT be saved or committed
rm -f ${NEW_FILE}
# get rid of the .yml, too
rm -f ${ENV_YML}
# to run the image: docker run ${CONTAINER_NAME}[:latest]
......@@ -66,12 +66,22 @@ def test_omitted_profile_returns_expected_code(make_tempdir, settings):
:return:
"""
# store existing CAPO_PROFILE
existing_capo_profile = os.environ["CAPO_PROFILE"]
# remove existing CAPO_PROFILE
os.environ["CAPO_PROFILE"] = ""
# omit --profile arg
args = ['--product-locator', settings.test_data['product_locator'],
'--output-dir', str(make_tempdir)]
return_code = launch_datafetcher(args, settings.capo_settings)
assert return_code == MISSING_PROFILE
# restore the existing CAPO_PROFILE
os.environ["CAPO_PROFILE"] = existing_capo_profile
@pytest.mark.skipif(not RUN_ALL, reason='debug')
def test_omitted_capo_value_returns_expected_code(make_tempdir, settings):
......
# Delivery Architecture
What is delivery? Delivery is what happens after the active processing portion of the workflow concludes. It is the
step that moves the retrieved or generated products from the processing area to a place where they can be accessed by
the requesting user.
Most workflows proceed by retrieving some files from NGAS and running CASA on those files to produce new products. The
files are large and CASA is quite heavy, so we retrieve the files into a spool area on the Lustre filesystem and then
launch the CASA jobs on the cluster. Once CASA is finished, the files the user wants are still sitting in that spool
area on Lustre. Delivery is what gets the files from there to where the user can retrieve them.
The simplest kind of delivery is just copying files from the spool area to another location—a mere `cp`. However, we
have several complications:
- CASA mandates a certain filesystem layout for the spool area
- The filesystem layout of the delivery destination varies based on the _type_ of the product
- Users can request `tar` archives, optionally
- Users can request delivery to their own areas in Lustre
- Not specifying a delivery location implies creating a unique location under a web root
We also want to be somewhat flexible in case new streaming kinds of deliveries are mandated in the future, such as
Globus (formerly GridFTP).
The result is that the behavior of the delivery process, which is fundamentally `cp`, varies according to both the
options given by the user and various facts about the data we happen to be delivering.
## Handling files
At the bottom of every delivery process is a process of being supplied files and told to deliver them. The
_Destination_ system is the core of this portion of the process. The goal here is to decouple the idea of "here is a
file to deliver" from the details of how that delivery happens. We have one concrete class here, `LocalDestination`,
which represents the common `cp` case of copying a file into the destination. If the simplest delivery
is `cp source dest`, you can think of `LocalDestination` as embodying the idea of `cp ... dest`.
The _Destination_ classes make no sense on their own; their purpose is to be passed around to other objects in the
system that know about files that need to be delivered. The _Destination_ classes just hide the details about where
those files are actually going and how they're getting there.
If we were going to support something like Globus, I expect it would appear as a peer of `LocalDestination`, as another
concrete implementation of `Destination`.
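
To make the shape of this concrete, here is a minimal sketch of what a `Destination` hierarchy along these lines could
look like. Only the names `Destination`, `LocalDestination`, and the `close()` method mentioned later come from this
document; the `add_file()` method name, its signature, and the constructor argument are illustrative assumptions, not
the actual interface.

```python
# Hypothetical sketch of the Destination idea; add_file() is an assumed method name.
import abc
import pathlib
import shutil


class Destination(abc.ABC):
    """Something that accepts files and hides where (and how) they end up."""

    @abc.abstractmethod
    def add_file(self, file: pathlib.Path, relative_path: str) -> None:
        """Deliver a single file under the given relative path."""

    def close(self) -> None:
        """Hook for wrappers that need a finalization step; plain destinations do nothing."""


class LocalDestination(Destination):
    """The common case: `cp ... dest` into a local directory."""

    def __init__(self, directory: pathlib.Path):
        self.directory = directory

    def add_file(self, file: pathlib.Path, relative_path: str) -> None:
        target = self.directory / relative_path
        target.parent.mkdir(parents=True, exist_ok=True)
        shutil.copy2(file, target)
```
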
### Checksums and compression
Thinking along these lines, one can treat a checksum file as just another file to be added to the destination. Because
`Destination` is 1) handed every file to be delivered and 2) knows where the files are ultimately going to be placed,
we can see a way to handle creating a checksum file as a kind of "pass-through" step that happens automatically. The
algorithm would look something like this:
1. Make a checksum wrapper for the local destination
2. For every file we get asked to deliver, calculate its checksum before handing it off to the wrapped destination for
delivery
3. After we are done delivering files, pass a fake file containing the checksums to the wrapped destination
This kind of "wrapper" or "pass-through" thing happens often enough in object-oriented programming that it is called
the "Decorator pattern." We can handle compression the same way:
1. Make a tar archive in a scratch area somewhere
2. For every file we get asked to deliver, instead place it in the archive in the scratch area
3. After we are done delivering files, finalize the archive and pass it to the wrapped destination
The key idea here is that the next part of the system which finds files to deliver has _no idea_ about whether we are
using compression or calculating checksums or not—in fact, these wrappers are stackable. The part of the system that
finds files to deliver just passes them to the destination, and as long as the stack of wrappers and destinations has
been constructed by someone in the right order, everything will happen as it should.
The purpose of the `DestinationBuilder` is to ensure that the stack is constructed in the right way. The reason
`Destination` has a `close()` method is for these wrappers to know when we are done delivering files so they can take
their finalization steps.
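
As an illustration of the wrapper idea, here is a hedged sketch of a checksum decorator. It duck-types the same
`add_file()`/`close()` methods assumed in the earlier sketch; the hash algorithm and the `SHA1SUMS` file name are
placeholders rather than the project's actual choices.

```python
# Hypothetical checksum wrapper; the wrapped object only needs add_file() and close().
import hashlib
import pathlib
import tempfile


class ChecksumDecorator:
    """Passes every file through to the wrapped destination, then emits a checksum file on close()."""

    def __init__(self, underlying):
        self.underlying = underlying
        self.sums = []

    def add_file(self, file: pathlib.Path, relative_path: str) -> None:
        digest = hashlib.sha1(file.read_bytes()).hexdigest()
        self.sums.append(f"{digest}  {relative_path}")
        self.underlying.add_file(file, relative_path)

    def close(self) -> None:
        # The "fake file containing the checksums", handed to the wrapped destination last.
        with tempfile.TemporaryDirectory() as scratch:
            sums_file = pathlib.Path(scratch) / "SHA1SUMS"
            sums_file.write_text("\n".join(self.sums) + "\n")
            self.underlying.add_file(sums_file, "SHA1SUMS")
        self.underlying.close()
```

A tar wrapper would have the same shape, writing into a scratch archive in `add_file()` and handing the finished
archive to the wrapped destination in `close()`. Stacking them in whatever order is wanted, for example
`ChecksumDecorator(TarDecorator(LocalDestination(...)))`, is exactly the construction job the `DestinationBuilder`
exists to get right.
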
## Handling products
If you look at
the [delivery directory requirements](https://open-confluence.nrao.edu/display/SPR/Delivery+Directory+Improvements),
you'll see that there are a number of requirements to group things together based on their project or their telescope,
and the directory names are based on the type of product. Knowing what you have in hand affects the layout in the
delivery directory. This means that we are not always going to have a straightforward `cp` command, because the way
files rest in the spool area doesn't necessarily match the way that they need to be laid out in the delivery directory.
The key idea here is that somebody, eventually, knows what these products _are_, and the knowledge about how that
_type_ is delivered should live with that _type_, rather than being spread around the system. Execution blocks should know what
execution blocks are supposed to look like when they get delivered; images should know what images should look like when
they are delivered, and so forth. If a new type of product is invented, supporting a wacky delivery format for that
product should be a matter of defining that product type and adding the logic just to that product. This is why we have
a `SpooledProduct` with a single method: `deliver_to(Destination)`. We expect to have a driver that at some level is
passing a destination to each of these products and saying, "write yourself to this destination."
This suggests that "deliver from here to there" does not mean quite the same thing as `cp`. `cp` says "copy these files
from here to there"; we are actually saying "copy all the products from here to there, according to how each of these
products _should_ be copied." In the beginning, a simple product like an execution block _will_ simply deliver the
files in its directory directly, but as we support more complex products, like OUS requests with images, more
interesting things will happen.
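
A sketch of what the product side might look like follows. Only `SpooledProduct` and its single
`deliver_to(Destination)` method are named in this document; `ExecutionBlock`, its constructor, and the directory walk
are assumptions about the simplest case.

```python
# Hypothetical product types; `destination` is anything exposing the assumed add_file() method.
import abc
import pathlib


class SpooledProduct(abc.ABC):
    """A product sitting in the spool area that knows how its own type gets delivered."""

    @abc.abstractmethod
    def deliver_to(self, destination) -> None:
        """Write this product to the destination, in the layout this product type requires."""


class ExecutionBlock(SpooledProduct):
    """Simplest case: deliver the files under this product's directory as they are."""

    def __init__(self, path: pathlib.Path):
        self.path = path

    def deliver_to(self, destination) -> None:
        for file in sorted(self.path.rglob("*")):
            if file.is_file():
                destination.add_file(file, str(file.relative_to(self.path.parent)))
```
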
## Finding products
How will we know what the products are that need to be delivered? We can assume we are given a source directory with
products in it, but how do we enumerate them in order to deliver them? The most straightforward answer is that we can
simply iterate over the entire directory and match filename patterns to product types: if it ends with `.ms` it's a measurement
set, if it looks like `PPR.xml` it's a pipeline request, etc. Doing this amounts to having a dispatch table of common
filename patterns, which is tedious, but exhaustive and gives our code a fair amount of control.
There is a second way to figure out the products, which is by examining CASA's `piperesults` output file. This file
isn't necessarily present (after all, CASA is not _required_ for every workflow) so this method cannot ever be the
_only_ means of determining the products. But it may eventually be a requirement that we support using the
`piperesults` file. So rather than having a single class here called `ProductFinder`, we instead have an interface
called `ProductFinder` and a `HeuristicProductFinder` that does the filename dispatch approach and a
`PiperesultsProductFinder` that uses the `piperesults` file to figure it out.
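
The split described above could look roughly like the sketch below. The class names `ProductFinder`,
`HeuristicProductFinder`, and `PiperesultsProductFinder` come from this document; the dispatch table contents and the
placeholder product type are invented for illustration.

```python
# Hypothetical finder sketch; the patterns and product constructors are placeholders.
import abc
import pathlib
from typing import Callable, Dict, Iterator


class MeasurementSet:
    """Stand-in for a real SpooledProduct subclass."""

    def __init__(self, path: pathlib.Path):
        self.path = path


class ProductFinder(abc.ABC):
    @abc.abstractmethod
    def find_products(self) -> Iterator:
        """Yield the products found in the spool area."""


class HeuristicProductFinder(ProductFinder):
    """Filename-pattern dispatch: the tedious-but-exhaustive approach."""

    # glob pattern -> product constructor; a real table would be much longer
    DISPATCH: Dict[str, Callable] = {
        "*.ms": MeasurementSet,
        # "PPR.xml": PipelineRequest, ...and so on for the other known patterns
    }

    def __init__(self, spool_dir: pathlib.Path):
        self.spool_dir = spool_dir

    def find_products(self) -> Iterator:
        for pattern, make_product in self.DISPATCH.items():
            for match in self.spool_dir.glob(pattern):
                yield make_product(match)
```

A `PiperesultsProductFinder` would implement the same `find_products()` by parsing CASA's `piperesults` file instead of
globbing the spool directory.
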
## Bringing it all together
So we have a system that finds products, products that know how to write themselves to a destination, and
destinations that know how to handle local filesystem writes, compression and checksumming. This is most of what is
needed. We can see now that we want to have a main loop that looks like this:
```python
for product in finder.find_products():
    product.deliver_to(destination)
```
What is still missing is a small amount of plumbing to get us from here to there. We need a device for processing
the command line arguments. Some aspects of delivery are based on user-supplied options: whether or not we are building
tar archives, whether we are delivering the raw data retrieved by the data fetcher or the products generated by
CASA. Eventually we will have to support a local delivery command line option. Basically, anything the user chooses
in the archive UI that affects delivery is going to arrive to us through the command line options. So we have to add
a command line parser, which we have in `Context`.
A few lessons learned from the legacy delivery system are also baked into the `Context`. We assume that a few
"services" are available in `Context` to the `Destination` and `ProductFinder` schemes. For web delivery, we will
eventually need to be able to generate random codes for the URL, but we want those random codes to be stable
throughout the delivery process, so there is a way to do that in the `Context`. Creating temporary files is also
provided via the `Context`, which is something the tar and checksum wrappers will eventually need. So the `Context`
is available to these classes at construction time so they can call these services as needed, or peek at command
line arguments they may care about.
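
A guess at the shape of the `Context` follows: argparse-backed options plus the two "services" mentioned above, a
stable random code and temporary-file creation. The flag names are illustrative only, not the actual command line.

```python
# Hypothetical Context sketch; the option names are not the real CLI.
import argparse
import secrets
import tempfile


class Context:
    def __init__(self, argv):
        parser = argparse.ArgumentParser(description="deliver products from the spool area")
        parser.add_argument("--tar", action="store_true", help="deliver tar archives")
        parser.add_argument("--local-destination", help="deliver to this directory instead of the web root")
        self.args = parser.parse_args(argv)
        self._token = None  # generated once, stable for the whole delivery

    def token(self) -> str:
        """Random code for web delivery URLs, stable across the delivery run."""
        if self._token is None:
            self._token = secrets.token_hex(16)
        return self._token

    def create_tempfile(self, suffix: str = ""):
        """Scratch file for wrappers like tar and checksum to use."""
        return tempfile.NamedTemporaryFile(suffix=suffix, delete=False)
```
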
And that's the theory behind delivery in a nutshell.
......@@ -8,138 +8,4 @@ This is the delivery thing.
https://open-confluence.nrao.edu/display/AAT/Proposed+Delivery+Redesign
https://open-confluence.nrao.edu/display/SPR/Delivery+Directory+Improvements
-->