Skip to content

Commit

Permalink
Resolve conflicts.
Browse files Browse the repository at this point in the history
  • Loading branch information
jlewi committed Jan 18, 2018
2 parents 8e6fb87 + 4c9217d commit 11d989c
Show file tree
Hide file tree
Showing 6 changed files with 301 additions and 23 deletions.
9 changes: 7 additions & 2 deletions kubeflow/core/prototypes/all.jsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -90,8 +90,13 @@ std.prune(k.core.v1.list.new([
tfjob.parts(namespace).tfJobDeploy(tfJobImage),
tfjob.parts(namespace).configMap(cloud, tfDefaultImage),
tfjob.parts(namespace).serviceAccount,
tfjob.parts(namespace).operatorRole,
tfjob.parts(namespace).operatorRoleBinding,

// TfJob control UI
tfjob.parts(namespace).ui(tfJobImage),
tfjob.parts(namespace).uiService(tfJobUiServiceType),
tfjob.parts(namespace).ui(tfJobImage),
tfjob.parts(namespace).uiService(tfJobUiServiceType),
tfjob.parts(namespace).uiServiceAccount,
tfjob.parts(namespace).uiRole,
tfjob.parts(namespace).uiRoleBinding,
] + nfsComponents))
234 changes: 231 additions & 3 deletions kubeflow/core/tf-job.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,8 @@
{
"command": [
"/opt/mlkube/tf_operator",
"--controller-config-file=/etc/config/controller_config_file.yaml",
"--alsologtostderr",
"--controller-config-file=/etc/config/controller_config_file.yaml",
"--alsologtostderr",
"-v=1",
],
"env": [
Expand Down Expand Up @@ -132,6 +132,114 @@
}
},

operatorRole: {
  // ClusterRole named "tf-job-operator". Every rule below grants all verbs
  // ("*") on the listed resources of the listed API groups.
  apiVersion: "rbac.authorization.k8s.io/v1beta1",
  kind: "ClusterRole",
  metadata: {
    labels: { app: "tf-job-operator" },
    name: "tf-job-operator",
  },
  rules: [
    // The TfJob custom resource itself.
    {
      apiGroups: ["tensorflow.org"],
      resources: ["tfjobs"],
      verbs: ["*"],
    },
    // Custom resource definitions.
    {
      apiGroups: ["apiextensions.k8s.io"],
      resources: ["customresourcedefinitions"],
      verbs: ["*"],
    },
    {
      apiGroups: ["storage.k8s.io"],
      resources: ["storageclasses"],
      verbs: ["*"],
    },
    {
      apiGroups: ["batch"],
      resources: ["jobs"],
      verbs: ["*"],
    },
    // Core ("") API group resources.
    {
      apiGroups: [""],
      resources: [
        "configmaps",
        "pods",
        "services",
        "endpoints",
        "persistentvolumeclaims",
        "events",
      ],
      verbs: ["*"],
    },
    {
      apiGroups: ["apps", "extensions"],
      resources: ["deployments"],
      verbs: ["*"],
    },
  ],
},  // operator-role

operatorRoleBinding:: {
  // Binds the "tf-job-operator" ClusterRole to the "tf-job-operator"
  // ServiceAccount in `namespace`.
  apiVersion: "rbac.authorization.k8s.io/v1beta1",
  kind: "ClusterRoleBinding",
  metadata: {
    labels: { app: "tf-job-operator" },
    name: "tf-job-operator",
  },
  roleRef: {
    apiGroup: "rbac.authorization.k8s.io",
    kind: "ClusterRole",
    name: "tf-job-operator",
  },
  subjects: [
    {
      kind: "ServiceAccount",
      name: "tf-job-operator",
      namespace: namespace,
    },
  ],
},  // operator-role binding

uiService(serviceType):: {
"apiVersion": "v1",
"kind": "Service",
Expand All @@ -153,6 +261,18 @@
}
}, // uiService

uiServiceAccount: {
  // ServiceAccount "tf-job-dashboard", created in `namespace`.
  apiVersion: "v1",
  kind: "ServiceAccount",
  metadata: {
    labels: { app: "tf-job-dashboard" },
    name: "tf-job-dashboard",
    namespace: namespace,
  },
},  // uiServiceAccount

ui(image):: {
"apiVersion": "extensions/v1beta1",
"kind": "Deployment",
Expand Down Expand Up @@ -181,11 +301,119 @@
}
]
}
]
],
"serviceAccountName": "tf-job-dashboard",
}
}
},
}, // ui

uiRole:: {
  // ClusterRole named "tf-job-dashboard". Every rule below grants all verbs
  // ("*") on the listed resources of the listed API groups.
  apiVersion: "rbac.authorization.k8s.io/v1beta1",
  kind: "ClusterRole",
  metadata: {
    labels: { app: "tf-job-dashboard" },
    name: "tf-job-dashboard",
  },
  rules: [
    // The TfJob custom resource itself.
    {
      apiGroups: ["tensorflow.org"],
      resources: ["tfjobs"],
      verbs: ["*"],
    },
    // Custom resource definitions.
    {
      apiGroups: ["apiextensions.k8s.io"],
      resources: ["customresourcedefinitions"],
      verbs: ["*"],
    },
    {
      apiGroups: ["storage.k8s.io"],
      resources: ["storageclasses"],
      verbs: ["*"],
    },
    {
      apiGroups: ["batch"],
      resources: ["jobs"],
      verbs: ["*"],
    },
    // Core ("") API group resources.
    {
      apiGroups: [""],
      resources: [
        "configmaps",
        "pods",
        "services",
        "endpoints",
        "persistentvolumeclaims",
        "events",
      ],
      verbs: ["*"],
    },
    {
      apiGroups: ["apps", "extensions"],
      resources: ["deployments"],
      verbs: ["*"],
    },
  ],
},  // uiRole

uiRoleBinding:: {
  // Binds the "tf-job-dashboard" ClusterRole to the "tf-job-dashboard"
  // ServiceAccount in `namespace`.
  apiVersion: "rbac.authorization.k8s.io/v1beta1",
  kind: "ClusterRoleBinding",
  metadata: {
    labels: { app: "tf-job-dashboard" },
    name: "tf-job-dashboard",
  },
  roleRef: {
    apiGroup: "rbac.authorization.k8s.io",
    kind: "ClusterRole",
    name: "tf-job-dashboard",
  },
  subjects: [
    {
      kind: "ServiceAccount",
      name: "tf-job-dashboard",
      namespace: namespace,
    },
  ],
},  // uiRoleBinding
},
}
7 changes: 4 additions & 3 deletions kubeflow/tf-job/prototypes/tf-cnn-benchmarks.jsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -92,8 +92,9 @@ local job =
error "num_ps must be >= 1"
else
tfJob.parts.tfJob(name, namespace, replicas) + {
tfImage: image,
terminationPolicy: {chief:{replicaName: "WORKER", replicaIndex: 0 }}
};
spec+: {
tfImage: image,
terminationPolicy: {chief:{replicaName: "WORKER", replicaIndex: 0 }}
}};

std.prune(k.core.v1.list.new([job]))
31 changes: 31 additions & 0 deletions testing/test-infra/debug_pod.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# This Job starts a pod whose container just idles (tail -f /dev/null), so you
# can open a shell in it and interactively debug our tests.
apiVersion: batch/v1
kind: Job
metadata:
  name: test-job
  namespace: kubeflow-test-infra
spec:
  template:
    spec:
      containers:
      - name: test-container
        image: gcr.io/mlkube-testing/kubeflow-testing:latest
        # Keep the container alive without doing any work.
        command: ["tail", "-f", "/dev/null"]
        volumeMounts:
        - mountPath: /mnt/test-data-volume
          name: kubeflow-test-volume
        - mountPath: /secret/gcp-credentials
          name: gcp-credentials
        env:
        # Points at the key file mounted from the gcp-credentials volume.
        - name: GOOGLE_APPLICATION_CREDENTIALS
          value: /secret/gcp-credentials/key.json
      restartPolicy: Never
      volumes:
      - name: kubeflow-test-volume
        persistentVolumeClaim:
          claimName: kubeflow-testing
      - name: gcp-credentials
        secret:
          secretName: kubeflow-testing-credentials

  backoffLimit: 4
33 changes: 23 additions & 10 deletions testing/test_deploy.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
import json
import logging
import os
import shutil
import tempfile
import uuid

Expand Down Expand Up @@ -43,7 +44,8 @@ def _setup_test(api_client, run_label):

try:
logging.info("Creating namespace %s", namespace.metadata.name)
api.create_namespace(namespace)
namespace = api.create_namespace(namespace)
logging.info("Namespace %s created.", namespace.metadata.name)
except rest.ApiException as e:
if e.status == 409:
logging.info("Namespace %s already exists.", namespace.metadata.name)
Expand Down Expand Up @@ -96,30 +98,41 @@ def run():

app_dir = os.path.join(args.test_dir, app_name)

# TODO(jlewi): In presubmits we probably want to change this so we can
# pull the changes on a branch. It's not clear whether that's well supported
# in ksonnet yet.
kubeflow_registry = "github.com/google/kubeflow/tree/master/kubeflow"
util.run(["ks", "registry", "add", "kubeflow", kubeflow_registry], cwd=app_dir)

# Install required packages
# TODO(jlewi): For presubmits, how do we pull the package from the desired
# branch at the desired commit?
packages = ["kubeflow/core", "kubeflow/tf-serving", "kubeflow/tf-job"]

for p in packages:
util.run(["ks", "pkg", "install", p], cwd=app_dir)

# Delete the vendor directory and replace with a symlink to the src
# so that we use the code at the desired commit.
target_dir = os.path.join(app_dir, "vendor", "kubeflow")

logging.info("Deleting %s", target_dir)
shutil.rmtree(target_dir)

source = os.path.join(args.test_dir, "src", "kubeflow")
logging.info("Creating link %s -> %s", target_dir, source)
os.symlink(source, target_dir)

# Deploy Kubeflow
util.run(["ks", "generate", "core", "kubeflow-core", "--name=kubeflow-core",
"--namespace=" + namespace.metadata.name], cwd=app_dir)

# TODO(jlewi): For reasons I don't understand even though we ran
# configure_kubectl above, if we don't rerun it we get rbac errors
# when we do ks apply; I think because we aren't using the proper service
# account. This might have something to do with the way ksonnet gets
# its credentials; maybe we need to configure credentials after calling
# ks init?
if args.cluster:
util.configure_kubectl(args.project, args.zone, args.cluster)

apply_command = ["ks", "apply", "default", "-c", "kubeflow-core",]

if os.getenv("GOOGLE_APPLICATION_CREDENTIALS"):
with open(os.getenv("GOOGLE_APPLICATION_CREDENTIALS")) as hf:
key = json.load(hf)
apply_command.append("--as=" + key["client_email"])
util.run(apply_command, cwd=app_dir)

# Verify that the TfJob operator is actually deployed.
Expand Down
Loading

0 comments on commit 11d989c

Please sign in to comment.