Commit ea08e6c

* automl initial commit

* lint

* fix import groupings

* add requirements.txt

* address review comments
sirtorry authored Jul 20, 2018
1 parent 85502a0 commit ea08e6c
Showing 8 changed files with 958 additions and 0 deletions.
297 changes: 297 additions & 0 deletions samples/snippets/automl/automl_natural_language_dataset.py
@@ -0,0 +1,297 @@
#!/usr/bin/env python

# Copyright 2018 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""This application demonstrates how to perform basic operations on Dataset
with the Google AutoML Natural Language API.
For more information, see the tutorial page at
https://cloud.google.com/natural-language/automl/docs/
"""

import argparse
import os


def create_dataset(project_id, compute_region, dataset_name, multilabel=False):
    """Create a dataset."""
    # [START automl_natural_language_create_dataset]
    # TODO(developer): Uncomment and set the following variables
    # project_id = 'PROJECT_ID_HERE'
    # compute_region = 'COMPUTE_REGION_HERE'
    # dataset_name = 'DATASET_NAME_HERE'
    # multilabel = True for multilabel or False for multiclass

    from google.cloud import automl_v1beta1 as automl

    client = automl.AutoMlClient()

    # A resource that represents a Google Cloud Platform location.
    project_location = client.location_path(project_id, compute_region)

    # The classification type is assigned based on the multilabel value.
    classification_type = "MULTICLASS"
    if multilabel:
        classification_type = "MULTILABEL"

    # Specify the text classification type for the dataset.
    dataset_metadata = {"classification_type": classification_type}

    # Set the dataset name and metadata.
    my_dataset = {
        "display_name": dataset_name,
        "text_classification_dataset_metadata": dataset_metadata,
    }

    # Create a dataset with the dataset metadata in the region.
    dataset = client.create_dataset(project_location, my_dataset)

    # Display the dataset information.
    print("Dataset name: {}".format(dataset.name))
    print("Dataset id: {}".format(dataset.name.split("/")[-1]))
    print("Dataset display name: {}".format(dataset.display_name))
    print("Text classification dataset metadata:")
    print("\t{}".format(dataset.text_classification_dataset_metadata))
    print("Dataset example count: {}".format(dataset.example_count))
    print("Dataset create time:")
    print("\tseconds: {}".format(dataset.create_time.seconds))
    print("\tnanos: {}".format(dataset.create_time.nanos))

    # [END automl_natural_language_create_dataset]
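
# Example call (a sketch; the project, region, and dataset name are
# hypothetical):
#   create_dataset("my-project", "us-central1", "my_dataset", multilabel=True)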


def list_datasets(project_id, compute_region, filter_):
    """List all datasets."""
    # [START automl_natural_language_list_datasets]
    # TODO(developer): Uncomment and set the following variables
    # project_id = 'PROJECT_ID_HERE'
    # compute_region = 'COMPUTE_REGION_HERE'
    # filter_ = 'filter expression here'

    from google.cloud import automl_v1beta1 as automl

    client = automl.AutoMlClient()

    # A resource that represents a Google Cloud Platform location.
    project_location = client.location_path(project_id, compute_region)

    # List all the datasets available in the region by applying the filter.
    response = client.list_datasets(project_location, filter_)

    print("List of datasets:")
    for dataset in response:
        # Display the dataset information.
        print("Dataset name: {}".format(dataset.name))
        print("Dataset id: {}".format(dataset.name.split("/")[-1]))
        print("Dataset display name: {}".format(dataset.display_name))
        print("Text classification dataset metadata:")
        print("\t{}".format(dataset.text_classification_dataset_metadata))
        print("Dataset example count: {}".format(dataset.example_count))
        print("Dataset create time:")
        print("\tseconds: {}".format(dataset.create_time.seconds))
        print("\tnanos: {}".format(dataset.create_time.nanos))

    # [END automl_natural_language_list_datasets]
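
# Example call (hypothetical values; the filter expression matches text
# classification datasets and mirrors the CLI default further down):
#   list_datasets("my-project", "us-central1",
#                 "text_classification_dataset_metadata:*")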


def get_dataset(project_id, compute_region, dataset_id):
    """Get the dataset."""
    # [START automl_natural_language_get_dataset]
    # TODO(developer): Uncomment and set the following variables
    # project_id = 'PROJECT_ID_HERE'
    # compute_region = 'COMPUTE_REGION_HERE'
    # dataset_id = 'DATASET_ID_HERE'

    from google.cloud import automl_v1beta1 as automl

    client = automl.AutoMlClient()

    # Get the full path of the dataset.
    dataset_full_id = client.dataset_path(
        project_id, compute_region, dataset_id
    )

    # Get the complete details of the dataset.
    dataset = client.get_dataset(dataset_full_id)

    # Display the dataset information.
    print("Dataset name: {}".format(dataset.name))
    print("Dataset id: {}".format(dataset.name.split("/")[-1]))
    print("Dataset display name: {}".format(dataset.display_name))
    print("Text classification dataset metadata:")
    print("\t{}".format(dataset.text_classification_dataset_metadata))
    print("Dataset example count: {}".format(dataset.example_count))
    print("Dataset create time:")
    print("\tseconds: {}".format(dataset.create_time.seconds))
    print("\tnanos: {}".format(dataset.create_time.nanos))

    # [END automl_natural_language_get_dataset]
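
# Example call (a sketch; the dataset ID is a hypothetical placeholder,
# taken from the "Dataset id" printed by create_dataset):
#   get_dataset("my-project", "us-central1", "TCN1234567890123456789")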


def import_data(project_id, compute_region, dataset_id, path):
    """Import labeled items."""
    # [START automl_natural_language_import_data]
    # TODO(developer): Uncomment and set the following variables
    # project_id = 'PROJECT_ID_HERE'
    # compute_region = 'COMPUTE_REGION_HERE'
    # dataset_id = 'DATASET_ID_HERE'
    # path = 'gs://path/to/file.csv'

    from google.cloud import automl_v1beta1 as automl

    client = automl.AutoMlClient()

    # Get the full path of the dataset.
    dataset_full_id = client.dataset_path(
        project_id, compute_region, dataset_id
    )

    # Get the multiple Google Cloud Storage URIs (comma-separated).
    input_uris = path.split(",")
    input_config = {"gcs_source": {"input_uris": input_uris}}

    # Import data from the input URIs.
    response = client.import_data(dataset_full_id, input_config)

    print("Processing import...")
    # Synchronous check of operation status.
    print("Data imported. {}".format(response.result()))

    # [END automl_natural_language_import_data]
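
# Example call (a sketch; the bucket and dataset ID are hypothetical).
# Several CSV files can be passed as a single comma-separated string, which
# matches the path.split(",") above; each CSV row is expected to pair a text
# item (or a gs:// document URI) with its label(s):
#   import_data("my-project", "us-central1", "TCN1234567890123456789",
#               "gs://my-bucket/train.csv,gs://my-bucket/test.csv")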


def export_data(project_id, compute_region, dataset_id, output_uri):
    """Export a dataset to a Google Cloud Storage bucket."""
    # [START automl_natural_language_export_data]
    # TODO(developer): Uncomment and set the following variables
    # project_id = 'PROJECT_ID_HERE'
    # compute_region = 'COMPUTE_REGION_HERE'
    # dataset_id = 'DATASET_ID_HERE'
    # output_uri = 'gs://location/to/export/data'

    from google.cloud import automl_v1beta1 as automl

    client = automl.AutoMlClient()

    # Get the full path of the dataset.
    dataset_full_id = client.dataset_path(
        project_id, compute_region, dataset_id
    )

    # Set the output URI.
    output_config = {"gcs_destination": {"output_uri_prefix": output_uri}}

    # Export the data to the output URI.
    response = client.export_data(dataset_full_id, output_config)

    print("Processing export...")
    # Synchronous check of operation status.
    print("Data exported. {}".format(response.result()))

    # [END automl_natural_language_export_data]
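
# Example call (hypothetical values; the URI is used as a prefix for the
# exported files):
#   export_data("my-project", "us-central1", "TCN1234567890123456789",
#               "gs://my-bucket/export/")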


def delete_dataset(project_id, compute_region, dataset_id):
    """Delete a dataset."""
    # [START automl_natural_language_delete_dataset]
    # TODO(developer): Uncomment and set the following variables
    # project_id = 'PROJECT_ID_HERE'
    # compute_region = 'COMPUTE_REGION_HERE'
    # dataset_id = 'DATASET_ID_HERE'

    from google.cloud import automl_v1beta1 as automl

    client = automl.AutoMlClient()

    # Get the full path of the dataset.
    dataset_full_id = client.dataset_path(
        project_id, compute_region, dataset_id
    )

    # Delete the dataset.
    response = client.delete_dataset(dataset_full_id)

    # Synchronous check of operation status.
    print("Dataset deleted. {}".format(response.result()))

    # [END automl_natural_language_delete_dataset]
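
# Example call (a sketch; values are hypothetical):
#   delete_dataset("my-project", "us-central1", "TCN1234567890123456789")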


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    subparsers = parser.add_subparsers(dest="command")

    create_dataset_parser = subparsers.add_parser(
        "create_dataset", help=create_dataset.__doc__
    )
    create_dataset_parser.add_argument("dataset_name")
    create_dataset_parser.add_argument(
        "multilabel", nargs="?", choices=["False", "True"], default="False"
    )

    list_datasets_parser = subparsers.add_parser(
        "list_datasets", help=list_datasets.__doc__
    )
    list_datasets_parser.add_argument(
        "filter_", nargs="?", default="text_classification_dataset_metadata:*"
    )

    get_dataset_parser = subparsers.add_parser(
        "get_dataset", help=get_dataset.__doc__
    )
    get_dataset_parser.add_argument("dataset_id")

    import_data_parser = subparsers.add_parser(
        "import_data", help=import_data.__doc__
    )
    import_data_parser.add_argument("dataset_id")
    import_data_parser.add_argument("path")

    export_data_parser = subparsers.add_parser(
        "export_data", help=export_data.__doc__
    )
    export_data_parser.add_argument("dataset_id")
    export_data_parser.add_argument("output_uri")

    delete_dataset_parser = subparsers.add_parser(
        "delete_dataset", help=delete_dataset.__doc__
    )
    delete_dataset_parser.add_argument("dataset_id")

    project_id = os.environ["PROJECT_ID"]
    compute_region = os.environ["REGION_NAME"]

    args = parser.parse_args()

    if args.command == "create_dataset":
        multilabel = args.multilabel == "True"
        create_dataset(
            project_id, compute_region, args.dataset_name, multilabel
        )
    if args.command == "list_datasets":
        list_datasets(project_id, compute_region, args.filter_)
    if args.command == "get_dataset":
        get_dataset(project_id, compute_region, args.dataset_id)
    if args.command == "import_data":
        import_data(project_id, compute_region, args.dataset_id, args.path)
    if args.command == "export_data":
        export_data(
            project_id, compute_region, args.dataset_id, args.output_uri
        )
    if args.command == "delete_dataset":
        delete_dataset(project_id, compute_region, args.dataset_id)
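
# Example CLI usage (a sketch; the project and region are hypothetical, and
# PROJECT_ID / REGION_NAME must be set in the environment as read above):
#   PROJECT_ID=my-project REGION_NAME=us-central1 \
#       python automl_natural_language_dataset.py create_dataset my_dataset True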