Applied advice from review around just extracting sqlite file path (S3_KEY) from the S3_OBJECT_ARN, rather than having a separate parameter for local running
cpcundill committed Jul 16, 2024
1 parent a9ed240 commit d637d51
Showing 3 changed files with 25 additions and 18 deletions.
9 changes: 5 additions & 4 deletions README.md
@@ -21,8 +21,9 @@ To see how the values for bucket and key are extracted have a [look here](https:

## Running locally to load data into local postgres

Since running locally does not download the Digital Land Sqlite database from S3, it is necessary to set the
$SQLITE_FILE_PATH environment variable rather than $S3_OBJECT_ARN.
Running locally does not download the Digital Land Sqlite database from S3 directly but via a CDN, so it is
necessary to ensure the $S3_OBJECT_ARN contains the correct file path. The bucket name portion of the ARN will
be ignored and the file path appended to https://files.planning.data.gov.uk/.
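The ARN-to-URL mapping described above can be sketched in plain bash (the bucket name `placeholder` and the key here are illustrative only):

```shell
#!/usr/bin/env bash
# Hypothetical ARN: the bucket name portion is ignored when running locally.
S3_OBJECT_ARN="arn:aws:s3:::placeholder/digital-land-builder/dataset/digital-land.sqlite3"

# Drop the "arn:aws:s3:::" prefix, then the bucket name up to the first "/".
S3_KEY="${S3_OBJECT_ARN#arn:aws:s3:::}"
S3_KEY="${S3_KEY#*/}"

# The remaining key is appended to the CDN base URL.
echo "https://files.planning.data.gov.uk/${S3_KEY}"
```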

**Prerequisites**

@@ -41,7 +42,7 @@ application)

With a fresh checkout that file configures the scripts in this repo to load the digital-land database.

To load the entity database change the $SQLITE_FILE_PATH to the correct key for the entity sqlite database (see below).
To load the entity database ensure the $S3_OBJECT_ARN has the correct key for the entity sqlite database (see below).


2. **Create a virtualenv and install requirements**
@@ -58,7 +59,7 @@ Remember the .env file is already set to load the digital-land db. However in or

6. **Run the load script to load entity database**

Update the $SQLITE_FILE_PATH in the .env file to $SQLITE_FILE_PATH=entity-builder/dataset/entity.sqlite3
Update the $S3_OBJECT_ARN in the .env file to $S3_OBJECT_ARN=arn:aws:s3:::placeholder/entity-builder/dataset/entity.sqlite3

./load_local.sh
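As a sanity check, the values the load script derives from that entity ARN can be reproduced with bash parameter expansions (equivalent to the regex extraction in load_local.sh; `placeholder` stands in for a bucket name):

```shell
#!/usr/bin/env bash
# The entity ARN from the step above.
S3_OBJECT_ARN="arn:aws:s3:::placeholder/entity-builder/dataset/entity.sqlite3"

# Key after the bucket name, then the file name, then its stem.
S3_KEY="${S3_OBJECT_ARN#arn:aws:s3:::}"
S3_KEY="${S3_KEY#*/}"
DATABASE=${S3_KEY##*/}
DATABASE_NAME=${DATABASE%.*}

echo "key=$S3_KEY db=$DATABASE name=$DATABASE_NAME"
```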

1 change: 0 additions & 1 deletion task/.env.example
@@ -1,2 +1 @@
export S3_OBJECT_ARN=arn:aws:s3:::digital-land-production-collection-dataset/digital-land-builder/dataset/digital-land.sqlite3
export SQLITE_FILE_PATH=digital-land-builder/dataset/digital-land.sqlite3
33 changes: 20 additions & 13 deletions task/load_local.sh
@@ -1,5 +1,21 @@
#! /usr/bin/env bash

s3_object_arn_regex="^arn:aws:s3:::([0-9A-Za-z-]*/)(.*)$"

if ! [[ "$S3_OBJECT_ARN" =~ $s3_object_arn_regex ]]; then
  echo "Received invalid S3 Object ARN: $S3_OBJECT_ARN, skipping"
  exit 1
fi

S3_KEY=${BASH_REMATCH[2]}

# need to use the files cdn instead of the bucket name when loading locally without logging into aws
DATABASE=${S3_KEY##*/}

export DATABASE_NAME=${DATABASE%.*}
echo "DATABASE NAME: $DATABASE_NAME"
echo "$EVENT_ID: running with settings: S3_KEY=$S3_KEY, DATABASE=$DATABASE, DATABASE_NAME=$DATABASE_NAME"

# download specification
export SOURCE_URL=https://raw.githubusercontent.com/digital-land/
mkdir -p specification/
@@ -19,27 +35,18 @@ curl -qfsL $SOURCE_URL/specification/main/specification/dataset-schema.csv > spe
curl -qfsL $SOURCE_URL/specification/main/specification/schema.csv > specification/schema.csv
curl -qfsL $SOURCE_URL/specification/main/specification/schema-field.csv > specification/schema-field.csv


# need to use the files cdn instead of the bucket name when loading locally without logging into aws
DATABASE=${SQLITE_FILE_PATH##*/}
export DATABASE_NAME=${DATABASE%.*}
echo "DATABASE NAME: $DATABASE_NAME"
echo "$EVENT_ID: running with settings: SQLITE_FILE_PATH=$SQLITE_FILE_PATH, DATABASE=$DATABASE, DATABASE_NAME=$DATABASE_NAME"



# if [[ $DATABASE_NAME != "entity" && $DATABASE_NAME != "digital-land" ]]; then
# echo "$EVENT_ID: wrong database, skipping"
# exit 1
# fi


if ! [ -f "$DATABASE_NAME.sqlite3" ]; then
echo "$EVENT_ID: attempting download from https://files.planning.data.gov.uk/$SQLITE_FILE_PATH"
if curl --fail --show-error --location "https://files.planning.data.gov.uk/$SQLITE_FILE_PATH" > "$DATABASE_NAME.sqlite3"; then
echo "$EVENT_ID: finished downloading from https://files.planning.data.gov.uk/$SQLITE_FILE_PATH"
echo "$EVENT_ID: attempting download from https://files.planning.data.gov.uk/$S3_KEY"
if curl --fail --show-error --location "https://files.planning.data.gov.uk/$S3_KEY" > "$DATABASE_NAME.sqlite3"; then
echo "$EVENT_ID: finished downloading from https://files.planning.data.gov.uk/$S3_KEY"
else
echo "$EVENT_ID: failed to download from https://files.planning.data.gov.uk/$SQLITE_FILE_PATH"
echo "$EVENT_ID: failed to download from https://files.planning.data.gov.uk/$S3_KEY"
rm "$DATABASE_NAME.sqlite3" # remove the file if it was created
exit 1
fi
