Skip to content

Commit

Permalink
chore: Update weaviate example to use Sagemaker instance and pre-popu…
Browse files Browse the repository at this point in the history
…late environment (#39)
  • Loading branch information
bryantbiggs authored Jul 18, 2023
1 parent 947c368 commit 1cef6ad
Show file tree
Hide file tree
Showing 4 changed files with 155 additions and 40 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -38,3 +38,4 @@ terraform.rc

# Artifacts
*.zip
*.csv
4 changes: 2 additions & 2 deletions weaviate/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -65,8 +65,8 @@ statefulset.apps/weaviate 1/1 127m
kubectl get svc -n weaviate weaviate -o jsonpath='{.status.loadBalancer.ingress[0].hostname}'
```

6. Open up Sagemaker in the AWS console - navigate to Studio and select the `weaviate` profile and click `Open Studio`
7. Once the Studio is launched, click `Create Notebook`
6. Open SageMaker in the AWS console, navigate to Notebook, and select the `weaviate` instance
7. Once the notebook is launched, click `Create Notebook`
8. In the first cell of the notebook, install the Weaviate Python client:

```python
Expand Down
4 changes: 4 additions & 0 deletions weaviate/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,10 @@ terraform {
source = "hashicorp/kubernetes"
version = ">= 2.20"
}
null = {
source = "hashicorp/null"
version = ">= 3.0"
}
}

# backend "s3" {
Expand Down
186 changes: 148 additions & 38 deletions weaviate/sagemaker.tf
Original file line number Diff line number Diff line change
@@ -1,36 +1,114 @@
resource "aws_sagemaker_domain" "this" {
domain_name = local.name
auth_mode = "IAM"

# To reach VPC private resources
app_network_access_type = "VpcOnly"
app_security_group_management = "Customer"

vpc_id = module.vpc.vpc_id
subnet_ids = module.vpc.private_subnets

default_space_settings {
execution_role = aws_iam_role.this.arn
security_groups = [
module.sagemaker_sg.security_group_id,
module.vpc_endpoints.security_group_id,
]
}
resource "aws_sagemaker_notebook_instance" "this" {
name = local.name
role_arn = aws_iam_role.this.arn

instance_type = "ml.t2.medium"
platform_identifier = "notebook-al2-v2"
volume_size = 128

subnet_id = element(module.vpc.private_subnets, 0)
security_groups = [module.sagemaker_sg.security_group_id]

default_user_settings {
execution_role = aws_iam_role.this.arn
security_groups = [
module.sagemaker_sg.security_group_id,
module.vpc_endpoints.security_group_id,
]
instance_metadata_service_configuration {
minimum_instance_metadata_service_version = 2
}

lifecycle_config_name = aws_sagemaker_notebook_instance_lifecycle_configuration.this.name

tags = module.tags.tags
}

# Lifecycle configuration for the SageMaker notebook instance. Only `on_start`
# is active; the `on_create` script further down is commented out and kept as a
# reference example.
resource "aws_sagemaker_notebook_instance_lifecycle_configuration" "this" {
name = local.name

# Disabled example: an `on_create` script that would install the
# `weaviate-client` conda package into `base` and into every conda environment
# except `JupyterSystemEnv`.
# on_create = base64encode(
# <<-EOT
# #!/bin/bash

# set -e

# # OVERVIEW
# # This script installs a single conda package in all SageMaker conda environments, apart from the JupyterSystemEnv
# # which is a system environment reserved for Jupyter.

# # NOTE: if the total runtime of this script exceeds 5 minutes, the Notebook Instance will fail to start up. If you
# # would like to run this script in the background, then replace "sudo" with "nohup sudo -b". This will allow the
# # Notebook Instance to start up while the installation happens in the background.

# sudo -u ec2-user -i <<'EOF'

# # PARAMETERS
# PACKAGE=weaviate-client

# # Note that "base" is a special environment name, include it there as well.
# conda install "$PACKAGE" --name base --yes

# for env in /home/ec2-user/anaconda3/envs/*; do
# env_name=$(basename "$env")
# if [ $env_name = 'JupyterSystemEnv' ]; then
# continue
# fi

# conda install "$PACKAGE" --name "$env_name" --yes
# done

# EOF
# EOT
# )

# Base64-encoded bash script that:
#   1. appends `export WEAVIATE_S3_BUCKET=<bucket id>` to
#      /etc/profile.d/jupyter-env.sh (Terraform interpolates the bucket id from
#      module.s3_bucket before encoding), and
#   2. restarts jupyter-server, using `initctl` when /etc/os-release contains
#      the Amazon Linux AMI URL and `systemctl` otherwise.
# Only `${...}` sequences are interpolated by Terraform; `$CURR_VERSION` and
# `$(...)` reach the shell unchanged.
# NOTE(review): the restart is presumably what makes the new variable visible to
# notebook kernels - confirm against the SageMaker lifecycle documentation.
on_start = base64encode(
<<-EOT
#!/bin/bash
set -e
# Set environment variables for notebooks
touch /etc/profile.d/jupyter-env.sh
echo "export WEAVIATE_S3_BUCKET=${module.s3_bucket.s3_bucket_id}" >> /etc/profile.d/jupyter-env.sh
# Restart command is dependent on current running Amazon Linux and JupyterLab
CURR_VERSION=$(cat /etc/os-release)
if [[ $CURR_VERSION == *$"http://aws.amazon.com/amazon-linux-ami/"* ]]; then
sudo initctl restart jupyter-server --no-wait
else
sudo systemctl --no-block restart jupyter-server.service
fi
EOT
)
}

################################################################################
# IAM Role
################################################################################

resource "aws_iam_role" "this" {
name = local.name
path = "/"
name = local.name

assume_role_policy = data.aws_iam_policy_document.this.json
managed_policy_arns = ["arn:aws:iam::aws:policy/AmazonSageMakerFullAccess"]

inline_policy {
name = local.name

policy = jsonencode({
Version = "2012-10-17"
Statement = [
{
Sid = "ListBucket",
Effect = "Allow",
Action = ["s3:ListBucket"],
Resource = [module.s3_bucket.s3_bucket_arn]
},
{
Sid = "AllObjectActions",
Effect = "Allow",
Action = "s3:*Object",
Resource = ["${module.s3_bucket.s3_bucket_arn}/*"]
}
]
})
}

tags = module.tags.tags
}

data "aws_iam_policy_document" "this" {
Expand All @@ -44,18 +122,9 @@ data "aws_iam_policy_document" "this" {
}
}

resource "aws_sagemaker_user_profile" "this" {
domain_id = aws_sagemaker_domain.this.id
user_profile_name = local.name

user_settings {
execution_role = aws_iam_role.this.arn
security_groups = [
module.sagemaker_sg.security_group_id,
module.vpc_endpoints.security_group_id,
]
}
}
################################################################################
# Security group
################################################################################

module "sagemaker_sg" {
source = "terraform-aws-modules/security-group/aws"
Expand Down Expand Up @@ -91,3 +160,44 @@ module "sagemaker_sg" {

tags = module.tags.tags
}

################################################################################
# S3 Bucket w/ Data Set
################################################################################

# S3 bucket for the example data set. The bucket name is generated from
# `local.name` plus a random suffix (via `bucket_prefix`).
module "s3_bucket" {
  source  = "terraform-aws-modules/s3-bucket/aws"
  version = "~> 3.0"

  bucket_prefix = "${local.name}-"

  # Allow deletion of a non-empty bucket.
  # NOTE: This is enabled for example usage only; do not enable it for
  # production workloads.
  force_destroy = true

  # Block every form of public access explicitly instead of relying on the
  # module's defaults for the pinned major version.
  block_public_acls       = true
  block_public_policy     = true
  ignore_public_acls      = true
  restrict_public_buckets = true

  # Bucket policy statements: reject non-TLS requests and outdated TLS versions.
  attach_deny_insecure_transport_policy = true
  attach_require_latest_tls_policy      = true

  control_object_ownership = true
  object_ownership         = "BucketOwnerPreferred"

  # Encrypt objects at rest with S3-managed keys (SSE-S3).
  server_side_encryption_configuration = {
    rule = {
      apply_server_side_encryption_by_default = {
        sse_algorithm = "AES256"
      }
    }
  }

  tags = module.tags.tags
}

# Downloads the pre-embedded Wikipedia articles archive on the machine running
# Terraform, extracts it into `embeddings_data`, and syncs the extracted files
# to the S3 bucket. Requires `curl`, `unzip`, and the AWS CLI locally.
resource "null_resource" "s3_data" {
  # Re-run the upload whenever the target bucket changes (e.g. it is replaced);
  # without a trigger the provisioner only ever runs on first creation.
  triggers = {
    bucket_id = module.s3_bucket.s3_bucket_id
  }

  provisioner "local-exec" {
    # --fail:     exit non-zero on HTTP errors instead of saving the error page
    #             as embeddings.zip
    # --location: follow redirects
    # unzip -o:   overwrite existing files without prompting, so a re-run does
    #             not block waiting for input
    command = <<-EOT
      curl --fail --location https://cdn.openai.com/API/examples/data/vector_database_wikipedia_articles_embedded.zip --output embeddings.zip && \
        unzip -o embeddings.zip -d "embeddings_data" && \
        aws s3 sync embeddings_data s3://${module.s3_bucket.s3_bucket_id}/
    EOT
  }
}

0 comments on commit 1cef6ad

Please sign in to comment.