Skip to content

Commit 1344602

Browse files
Use existing build output (#94)
* specify stage for make * correct default pipeline input * remove limit param validation for now * increase timeout and memory on validation Lambda * set release pattern regex * fix premature exit on build errors causing container to stop * capture error output to disk as ndjson * consolidate error handling within process_allele function; remove sqs * remove failed alleles queue * script proceeds if some alleles fail * validate build for single release version * fix restore target * fix input paths in state machine * use existing build output if available * wait for backup document to finish * refactor input array for cleaner output * add use_existing_build to pipeline params * evaluate use_existing_build after file validation * refactor environment validation target * add skip_load to pipeline params * update README
1 parent 4e87b56 commit 1344602

File tree

19 files changed

+448
-900
lines changed

19 files changed

+448
-900
lines changed

Makefile

Lines changed: 11 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,11 @@ export PIPELINE_STATE_PATH := config/IMGTHLA-repository-state.json
3838
export PIPELINE_PARAMS_PATH := config/pipeline-input.json
3939
export FUNCTIONS_PATH := ${APP_NAME}/pipeline/functions
4040

41+
# Required environment variables
42+
REQUIRED_VARS := STAGE APP_NAME AWS_ACCOUNT AWS_REGION AWS_PROFILE SUBSCRIBE_EMAILS \
43+
GITHUB_REPOSITORY_OWNER GITHUB_REPOSITORY_NAME GITHUB_PERSONAL_ACCESS_TOKEN \
44+
HOST_DOMAIN SUBDOMAIN ADMIN_EMAIL NEO4J_AMI_ID APOC_VERSION GDS_VERSION
45+
4146
# print colors
4247
define blue
4348
@tput setaf 4
@@ -170,51 +175,8 @@ else
170175
endif
171176

172177
env.validate: check.dependencies
173-
ifndef STAGE
174-
$(error STAGE is not set. Please add STAGE to the environment variables.)
175-
endif
176-
ifndef APP_NAME
177-
$(error APP_NAME is not set. Please add APP_NAME to the environment variables.)
178-
endif
179-
ifndef AWS_ACCOUNT
180-
$(error AWS_ACCOUNT is not set. Please add AWS_ACCOUNT to the environment variables.)
181-
endif
182-
ifndef AWS_REGION
183-
$(error AWS_REGION is not set. Please add AWS_REGION to the environment variables.)
184-
endif
185-
ifndef AWS_PROFILE
186-
$(error AWS_PROFILE is not set. Please select an AWS profile to use.)
187-
endif
188-
ifndef SUBSCRIBE_EMAILS
189-
$(error SUBSCRIBE_EMAILS is not set. Please add SUBSCRIBE_EMAILS to the environment variables.)
190-
endif
191-
ifndef GITHUB_REPOSITORY_OWNER
192-
$(error GITHUB_REPOSITORY_OWNER is not set. Please add GITHUB_REPOSITORY_OWNER to the environment variables.)
193-
endif
194-
ifndef GITHUB_REPOSITORY_NAME
195-
$(error GITHUB_REPOSITORY_NAME is not set. Please add GITHUB_REPOSITORY_NAME to the environment variables.)
196-
endif
197-
ifndef GITHUB_PERSONAL_ACCESS_TOKEN
198-
$(error GITHUB_PERSONAL_ACCESS_TOKEN is not set. Please add GITHUB_PERSONAL_ACCESS_TOKEN to the environment variables.)
199-
endif
200-
ifndef HOST_DOMAIN
201-
$(error HOST_DOMAIN is not set. Please add HOST_DOMAIN to the environment variables.)
202-
endif
203-
ifndef SUBDOMAIN
204-
$(error SUBDOMAIN is not set. Please add SUBDOMAIN to the environment variables.)
205-
endif
206-
ifndef ADMIN_EMAIL
207-
$(error ADMIN_EMAIL is not set. Please add ADMIN_EMAIL to the environment variables.)
208-
endif
209-
ifndef NEO4J_AMI_ID
210-
$(error NEO4J_AMI_ID is not set. Please add NEO4J_AMI_ID to the environment variables.)
211-
endif
212-
ifndef APOC_VERSION
213-
$(error APOC_VERSION is not set. Please add APOC_VERSION to the environment variables.)
214-
endif
215-
ifndef GDS_VERSION
216-
$(error GDS_VERSION is not set. Please add GDS_VERSION to the environment variables.)
217-
endif
178+
$(foreach var,$(REQUIRED_VARS),\
179+
$(if $(value $(var)),,$(error $(var) is not set. Please add $(var) to the environment variables.)))
218180
ifndef CREATE_VPC
219181
$(info 'CREATE_VPC' is not set. Defaulting to 'false')
220182
$(eval export CREATE_VPC := false)
@@ -264,7 +226,9 @@ database.load.run: # args: align, kir, limit, releases
264226
[ "$$kir" ] && kir="$$kir" || kir=false && \
265227
[ "$$limit" ] && limit="$$limit" || limit="" && \
266228
[ "$$releases" ] && releases="$$releases" || releases="" && \
267-
payload="{ \"align\": $$align, \"kir\": $$kir, \"limit\": \"$$limit\", \"releases\": \"$$releases\", \"mem_profile\": false }" && \
229+
[ "$$use_existing_build" ] && use_existing_build="$$use_existing_build" || use_existing_build=false && \
230+
[ "$$skip_load" ] && skip_load="$$skip_load" || skip_load=false && \
231+
payload="{\"align\":$$align,\"kir\":$$kir,\"limit\":\"$$limit\",\"releases\":\"$$releases\",\"mem_profile\":false,\"use_existing_build\":$$use_existing_build,\"skip_load\":$$skip_load}"&&\
268232
echo "$$payload" | jq -r && \
269233
echo "$$payload" | jq > payload.json
270234
@echo "Run pipeline with this payload? [y/N] \c " && read ans && [ $${ans:-N} = y ]
@@ -370,7 +334,7 @@ database.delete:
370334
pipeline.delete:
371335
$(MAKE) -C ${APP_NAME}/pipeline/ service.delete
372336

373-
pipeline.functions.delete:
337+
pipeline.service.delete:
374338
$(MAKE) -C ${APP_NAME}/pipeline/ service.functions.delete
375339

376340
pipeline.jobs.delete:

README.md

Lines changed: 43 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -265,8 +265,14 @@ STAGE=<stage> make deploy
265265
# Deploy config files and scripts to S3
266266
STAGE=<stage> make config.deploy
267267

268-
# Run the StepFunctions State Machine to load Neo4j
269-
STAGE=<stage> make database.load.run releases=<version> align=<boolean> kir=<boolean> limit=<int>
268+
# Run the Step Functions State Machine to load Neo4j
269+
STAGE=<stage> make database.load.run \
270+
releases=<version> \
271+
align=<boolean> \
272+
kir=<boolean> \
273+
limit=<int> \
274+
use_existing_build=<boolean> \
275+
skip_load=<boolean>
270276

271277
# Retrieve Neo4j credentials after deployment
272278
STAGE=<stage> make database.get.credentials
@@ -281,7 +287,7 @@ STAGE=<stage> make get.logs
281287
STAGE=<stage> make get.data
282288

283289
# Delete all CloudFormation based services and data, default is data=false
284-
STAGE=<stage> make delete data=<true/false>
290+
STAGE=<stage> make delete data=<boolean>
285291

286292
# Delete a specific layer
287293
STAGE=<stage> make pipeline.delete
@@ -344,23 +350,33 @@ Base input parameters (excluding the `releases` value) are passed to the Step Fu
344350
```json
345351
// pipeline-input.json
346352
{
347-
"align": "False",
348-
"kir": "False",
349-
"mem_profile": "False",
350-
"limit": ""
353+
"align": false,
354+
"kir": false,
355+
"mem_profile": false,
356+
"limit": "", // Optional, defaults to "" (build all alleles)
357+
"use_existing_build": false, // Optional, defaults to false
358+
"skip_load": false // Optional, defaults to false
351359
}
352360

353361
```
354-
| Variable | Example Value | Type | Description |
355-
| ----------- | ------------- | ------ | ------------------------------------------------------------------ |
356-
| LIMIT | 1000 | string | Number of alleles to build. Leave blank ("") to build all alleles. |
357-
| ALIGN | False | string | Include or exclude alignments in the build |
358-
| KIR | False | string | Include or exclude KIR data alignments in the build |
359-
| MEM_PROFILE | False | string | Enable memory profiling (for catching memory leaks during build) |
362+
| Variable | Example Value | Type | Description |
363+
| ------------------ | ------------- | ------ | ------------------------------------------------------------------------------ |
364+
| LIMIT | 1000 | string | Number of alleles to build. Leave blank ("") to build all alleles. |
365+
| ALIGN | false | boolean | Include or exclude alignments in the build |
366+
| KIR | false | boolean | Include or exclude KIR data alignments in the build |
367+
| MEM_PROFILE | false | boolean | Enable memory profiling (for catching memory leaks during build) |
368+
| USE_EXISTING_BUILD | false | boolean | Use existing build files in S3 (if available) instead of building from scratch |
369+
| SKIP_LOAD | false | boolean | Skip loading the database after building |
360370

361371
The data pipeline can also be invoked from the command line:
362372
```bash
363-
STAGE=<stage> make database.load.run releases=<version> align=<boolean> kir=<boolean> limit=<int>
373+
STAGE=<stage> make database.load.run \
374+
releases=<version> \
375+
align=<boolean> \
376+
kir=<boolean> \
377+
limit=<int> \
378+
use_existing_build=<boolean> \
379+
skip_load=<boolean>
364380
```
365381

366382
#### IMGT/HLA Release Versions State
@@ -393,14 +409,20 @@ STAGE=<stage> make database.load.run releases="<version>"
393409
# Example for single version
394410
STAGE=<stage> make database.load.run releases="3510"
395411

396-
# Example for multiple versions
397-
STAGE=<stage> make database.load.run releases="3490,3500,3510"
412+
# Example for multiple versions where 3510 has already been built
413+
STAGE=<stage> make database.load.run \
414+
releases="3490,3500,3510" \
415+
use_existing_build=true
398416

399417
# Example with limit
400418
STAGE=<stage> make database.load.run releases="3510" limit="1000"
401419

402420
# Example with all arguments included
403-
STAGE=<stage> make database.load.run releases="3510" limit="" align="False" kir="False"
421+
STAGE=<stage> make database.load.run releases="3510" limit="" align=false kir=false
422+
423+
# Example of how to build all releases and skip loading
424+
STAGE=dev make database.load.run releases="300,310,320,330,340,350,360,370,380,390,3100,3110,3120,3130,3140,3150,3160,3170,3180,3190,3200,3210,3220,3230,3240,3250,3260,3270,3280,3290,3300,3310,3320,3330,3340,3350,3360,3370,3380,3390,3400,3410,3420,3430,3440,3450,3460,3470,3480,3490,3500,3510,3520,3530" skip_load=true
425+
```
404426

405427
These commands build an event payload to send to the `invoke-gfe-db-pipeline` Lambda.
406428
```json
@@ -422,9 +444,9 @@ The Lambda function returns the following object which can be viewed in CloudWat
422444
"message": "Pipeline triggered",
423445
"input": [
424446
{
425-
"ALIGN": "False",
426-
"KIR": "False",
427-
"MEM_PROFILE": "False",
447+
"ALIGN": false,
448+
"KIR": false,
449+
"MEM_PROFILE": false,
428450
"LIMIT": "",
429451
"RELEASES": "3510"
430452
},
@@ -436,7 +458,7 @@ The Lambda function returns the following object which can be viewed in CloudWat
436458
### Clean Up
437459
To tear down resources run the command. You will need to manually delete the data in the S3 bucket first to avoid an error in CloudFormation.
438460
```bash
439-
STAGE=<stage> make delete data=<true/false>
461+
STAGE=<stage> make delete data=<boolean>
440462
```
441463
Use the following commands to tear down individual services. Make sure to [backup](#backup--restore) your data first.
442464
```bash

gfe-db/database/Makefile

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -118,10 +118,10 @@ service.restore: #from_date=<YYYY/MM/DD/HH>
118118

119119
service.restore.pre-validate:
120120
@echo "Validating restore pre-conditions"
121-
@echo "Checking if backup exists"
122-
@backup_object=$$(aws s3 ls --recursive s3://${DATA_BUCKET_NAME}/backups/neo4j/$$from_date/ | cut -d' ' -f7 | cut -d'/' -f3-6) && \
121+
@echo "Checking if backup exists for \`s3://${DATA_BUCKET_NAME}/backups/neo4j/$$from_date/\`"
122+
@backup_object=$$(aws s3 ls --recursive s3://${DATA_BUCKET_NAME}/backups/neo4j/$$from_date/ | cut -d ' ' -f 6-) && \
123123
[ "$$backup_object" != "" ] || (echo "ERROR: Backup does not exist" && exit 1) && \
124-
echo "Found backup for $$backup_object"
124+
echo "Found backup target for $$backup_object"
125125

126126
delete: ##=> Delete resources
127127
@echo "$$(gdate -u +'%Y-%m-%d %H:%M:%S.%3N') - Deleting ${SERVICE} service" 2>&1 | tee -a $$CFN_LOG_PATH

gfe-db/pipeline/functions/invoke_backup_script/app.py

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -42,10 +42,16 @@ def lambda_handler(event, context):
4242
raise Exception("Failed to send command")
4343
else:
4444
logger.info(f"Neo4j backup invoked on instance {neo4j_database_instance_id}")
45+
46+
# wait for the command to complete successfully
47+
waiter = ssm.get_waiter('command_executed')
48+
waiter.wait(
49+
CommandId=response['Command']['CommandId'],
50+
InstanceId=neo4j_database_instance_id
51+
)
52+
53+
logger.info(f"Neo4j backup completed on instance {neo4j_database_instance_id}")
4554

46-
# TODO poll SSM until command is complete
47-
# try: poll; except Failed Command: raise Exception
48-
4955
except Exception as err:
5056
logger.error(err)
5157
raise err

gfe-db/pipeline/functions/invoke_pipeline/app.py

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@
2929
s3 = session.client('s3')
3030
sfn = session.client('stepfunctions')
3131

32-
release_pattern = r"^\d{3}0$"
32+
release_pattern = r"^\d{2,3}0$"
3333

3434
def lambda_handler(event, context):
3535
"""Checks for new IMGT/HLA releases and triggers the update
@@ -41,8 +41,15 @@ def lambda_handler(event, context):
4141
if "releases" in event:
4242

4343
# align, kir, mem_profile are booleans
44-
if not all([ isinstance(event[arg], bool) for arg in [ 'align', 'kir', 'mem_profile' ] ]):
45-
raise ValueError('align, kir, and mem_profile must be boolean values')
44+
execution_input_bool_keys = [
45+
'align',
46+
'kir',
47+
'mem_profile' ,
48+
'use_existing_build',
49+
'skip_load'
50+
]
51+
if not all([ isinstance(event[arg], bool) for arg in execution_input_bool_keys if arg in event ]):
52+
raise ValueError(f'{", ".join(execution_input_bool_keys)} must be boolean values')
4653

4754
# conform booleans to the current argument format
4855
event = { arg: str(val) for arg, val in event.items() }
@@ -304,7 +311,7 @@ def parse_state(state_path, params_path):
304311
import os
305312
from pathlib import Path
306313

307-
path = Path(__file__).parent / "bad-event.json"
314+
path = Path(__file__).parent / "event-use-existing-true.json"
308315
with open(path, "r") as f:
309316
event = json.load(f)
310317

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
{
2+
"align": false,
3+
"kir": false,
4+
"limit": "",
5+
"releases": "310",
6+
"mem_profile": false,
7+
"skip_load": true
8+
}
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
{
2+
"align": false,
3+
"kir": false,
4+
"limit": "",
5+
"releases": "310",
6+
"mem_profile": false,
7+
"use_existing_build": false
8+
}
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
{
2+
"align": false,
3+
"kir": false,
4+
"limit": "",
5+
"releases": "310",
6+
"mem_profile": false,
7+
"use_existing_build": true,
8+
"skip_load": false
9+
}

gfe-db/pipeline/functions/invoke_pipeline/event.json

Lines changed: 0 additions & 7 deletions
This file was deleted.

gfe-db/pipeline/functions/invoke_pipeline/invalid-release-event.json

Lines changed: 0 additions & 7 deletions
This file was deleted.

0 commit comments

Comments
 (0)