diff --git a/data_labeling_examples/.gitignore b/data_labeling_examples/.gitignore index 508f1c9d..cc578b3c 100644 --- a/data_labeling_examples/.gitignore +++ b/data_labeling_examples/.gitignore @@ -2,4 +2,3 @@ .classpath /target/ .settings/ -src/main/java/com/oracle/.DS_Store diff --git a/data_labeling_examples/README.md b/data_labeling_examples/README.md index c2e1dd47..261920c8 100644 --- a/data_labeling_examples/README.md +++ b/data_labeling_examples/README.md @@ -56,9 +56,44 @@ Result of CUSTOM_LABELS_MATCH algorithm: For more information [SDK for Java](https://docs.oracle.com/en-us/iaas/Content/API/SDKDocs/javasdk.htm) +### Running the Utility +1. Open Terminal on your system. +2. Verify that Java 8 or higher is installed in the system. In case you do not have java installed on your system, download it from https://www.oracle.com/java/technologies/downloads/ + +``` +java -version +``` +3. Clone the repository. + +``` +git clone https://github.com/oracle-samples/oci-data-science-ai-samples.git +``` +4. Go to data_labeling_examples directory + +``` +cd data_labeling_examples +``` +5. Run the below command to bulk label by "FIRST_LETTER_MATCH" labeling algorithm. + +``` +java -DCONFIG_FILE_PATH='~/.oci/config' -DCONFIG_PROFILE=DEFAULT -DDLS_DP_URL=https://dlsprod-dp.us-ashburn-1.oci.oraclecloud.com -DTHREAD_COUNT=20 -DDATASET_ID=ocid1.compartment.oc1..aaaaaaaawob4faujxaqxqzrb555b44wxxrfkcpapjxwp4s4hwjthu46idr5a -DLABELING_ALGORITHM=FIRST_LETTER_MATCH -DLABELS=cat,dog -cp libs/bulklabelutility-v1.jar com.oracle.datalabelingservicesamples.scripts.SingleLabelDatasetBulkLabelingScript +``` +6. Run the below command to bulk label by "FIRST_REGEX_MATCH" labeling algorithm. 
+ +``` +java -DCONFIG_FILE_PATH='~/.oci/config' -DCONFIG_PROFILE=DEFAULT -DDLS_DP_URL=https://dlsprod-dp.us-ashburn-1.oci.oraclecloud.com -DTHREAD_COUNT=20 -DDATASET_ID=ocid1.compartment.oc1..aaaaaaaawob4faujxaqxqzrb555b44wxxrfkcpapjxwp4s4hwjthu46idr5a -DLABELING_ALGORITHM=FIRST_REGEX_MATCH -DFIRST_MATCH_REGEX_PATTERN=^abc* -DLABELS=cat,dog -cp libs/bulklabelutility-v1.jar com.oracle.datalabelingservicesamples.scripts.SingleLabelDatasetBulkLabelingScript +``` +7. Run the below command to bulk label by "CUSTOM_LABELS_MATCH" labeling algorithm. + +``` +java -DCONFIG_FILE_PATH='~/.oci/config' -DCONFIG_PROFILE=DEFAULT -DDLS_DP_URL=https://dlsprod-dp.us-ashburn-1.oci.oraclecloud.com -DTHREAD_COUNT=20 -DDATASET_ID=ocid1.compartment.oc1..aaaaaaaawob4faujxaqxqzrb555b44wxxrfkcpapjxwp4s4hwjthu46idr5a -DLABELING_ALGORITHM=CUSTOM_LABELS_MATCH -DCUSTOM_LABELS='{"dog/": ["dog"], "cat/": ["cat"] }' -cp libs/bulklabelutility-v1.jar com.oracle.datalabelingservicesamples.scripts.CustomBulkLabelingScript +``` + +Note: You can override any configuration by passing -D followed by the configuration name. All supported configurations are listed in the following section. 
+ ### Configurations -Add the following configurations in config.properties file in the project to run the scripts: +Following is the list of all configurations (src/main/resources/config.properties file) supported by the bulk labeling script: ``` #Path of Config File @@ -68,10 +103,7 @@ CONFIG_FILE_PATH=~/.oci/config CONFIG_PROFILE=DEFAULT #DLS DP URL -DLS_DP_URL=https://dlstest-dp.${REGION}.oci.oraclecloud.com - -#Region where dataset is created -REGION=uk-london-1 +DLS_DP_URL=https://dlsprod-dp.uk-london-1.oci.oraclecloud.com #Dataset Id whose record you want to bulk label DATASET_ID=ocid1.compartment.oc1..aaaaaaaawob4faujxaqxqzrb555b44wxxrfkcpapjxwp4s4hwjthu46idr5a diff --git a/data_labeling_examples/libs/bulklabelutility-v1.jar b/data_labeling_examples/libs/bulklabelutility-v1.jar new file mode 100644 index 00000000..77645ca5 Binary files /dev/null and b/data_labeling_examples/libs/bulklabelutility-v1.jar differ diff --git a/data_labeling_examples/pom.xml b/data_labeling_examples/pom.xml index 7d30cdae..d1b99e61 100644 --- a/data_labeling_examples/pom.xml +++ b/data_labeling_examples/pom.xml @@ -7,7 +7,8 @@ 0.0.1-SNAPSHOT OCI Data Labeling Service Examples This repository contains code samples for OCI Data - Labeling Service + Labeling Service + com.oracle.oci.sdk @@ -19,12 +20,6 @@ oci-java-sdk-datalabelingservicedataplane 2.19.0 - - org.projectlombok - lombok - 1.18.22 - provided - org.apache.logging.log4j log4j-api @@ -40,5 +35,43 @@ slf4j-api 1.7.32 + + org.projectlombok + lombok + 1.18.22 + provided + + + 8 + 8 + + + + + org.apache.maven.plugins + maven-assembly-plugin + + + package + + single + + + + + + com.oracle.datalabelingservicesamples.scripts.SingleLabelDatasetBulkLabelingScript + + + + + jar-with-dependencies + + + + + + + \ No newline at end of file diff --git a/data_labeling_examples/src/main/java/com/oracle/datalabelingservicesamples/constants/DataLabelingConstants.java 
b/data_labeling_examples/src/main/java/com/oracle/datalabelingservicesamples/constants/DataLabelingConstants.java index 8973fb15..40508fac 100644 --- a/data_labeling_examples/src/main/java/com/oracle/datalabelingservicesamples/constants/DataLabelingConstants.java +++ b/data_labeling_examples/src/main/java/com/oracle/datalabelingservicesamples/constants/DataLabelingConstants.java @@ -2,5 +2,17 @@ public class DataLabelingConstants { - public static final int MAX_LIST_RECORDS_LIMITS = 1000; + public static final int MAX_LIST_RECORDS_LIMITS = 1000; + public static final int DEFAULT_THREAD_COUNT = 30; + + public static final String CONFIG_FILE_PATH = "CONFIG_FILE_PATH"; + public static final String CONFIG_PROFILE = "CONFIG_PROFILE"; + public static final String DLS_DP_URL = "DLS_DP_URL"; + public static final String DATASET_ID = "DATASET_ID"; + public static final String REGION = "REGION"; + public static final String LABELING_ALGORITHM = "LABELING_ALGORITHM"; + public static final String THREAD_COUNT = "THREAD_COUNT"; + public static final String LABELS = "LABELS"; + public static final String CUSTOM_LABELS="CUSTOM_LABELS"; + public static final String FIRST_MATCH_REGEX_PATTERN = "FIRST_MATCH_REGEX_PATTERN"; } diff --git a/data_labeling_examples/src/main/java/com/oracle/datalabelingservicesamples/labelingstrategies/FirstRegexMatch.java b/data_labeling_examples/src/main/java/com/oracle/datalabelingservicesamples/labelingstrategies/FirstRegexMatch.java index 9badb682..a3ccd583 100644 --- a/data_labeling_examples/src/main/java/com/oracle/datalabelingservicesamples/labelingstrategies/FirstRegexMatch.java +++ b/data_labeling_examples/src/main/java/com/oracle/datalabelingservicesamples/labelingstrategies/FirstRegexMatch.java @@ -3,18 +3,15 @@ import java.util.Arrays; import java.util.List; import java.util.regex.Matcher; -import java.util.regex.Pattern; import com.oracle.bmc.datalabelingservicedataplane.model.RecordSummary; import 
com.oracle.datalabelingservicesamples.requests.Config; public class FirstRegexMatch implements LabelingStrategy { - private static final Pattern pattern = Pattern.compile(Config.INSTANCE.getRegexPattern()); - @Override public List getLabel(RecordSummary record) { - Matcher m = pattern.matcher(record.getName()); + Matcher m = Config.INSTANCE.getPattern().matcher(record.getName()); if (m.find()) { String firstGroup = m.group(0); for (String label : Config.INSTANCE.getLabels()) { diff --git a/data_labeling_examples/src/main/java/com/oracle/datalabelingservicesamples/requests/Config.java b/data_labeling_examples/src/main/java/com/oracle/datalabelingservicesamples/requests/Config.java index ecec5046..76d06660 100644 --- a/data_labeling_examples/src/main/java/com/oracle/datalabelingservicesamples/requests/Config.java +++ b/data_labeling_examples/src/main/java/com/oracle/datalabelingservicesamples/requests/Config.java @@ -5,7 +5,9 @@ import java.util.List; import java.util.Map; import java.util.Properties; +import java.util.regex.Pattern; +import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.exception.ExceptionUtils; import com.fasterxml.jackson.core.JsonProcessingException; @@ -14,6 +16,7 @@ import com.oracle.bmc.auth.AuthenticationDetailsProvider; import com.oracle.bmc.auth.ConfigFileAuthenticationDetailsProvider; import com.oracle.bmc.datalabelingservicedataplane.DataLabelingClient; +import com.oracle.datalabelingservicesamples.constants.DataLabelingConstants; import com.oracle.datalabelingservicesamples.labelingstrategies.CustomLabelMatch; import com.oracle.datalabelingservicesamples.labelingstrategies.FirstLetterMatch; import com.oracle.datalabelingservicesamples.labelingstrategies.FirstRegexMatch; @@ -33,77 +36,95 @@ public enum Config { private String configProfile; private String dpEndpoint; private String datasetId; - private String region; private List labels; private Map> customLabels; private String labelingAlgorithm; private 
LabelingStrategy labelingStrategy; private String regexPattern; + private Pattern pattern; private int threadCount; private Config() { try { Properties config = new Properties(); config.load(getClass().getClassLoader().getResourceAsStream("config.properties")); - configFilePath = config.getProperty("CONFIG_FILE_PATH"); - configProfile = config.getProperty("CONFIG_PROFILE"); - dpEndpoint = config.getProperty("DLS_DP_URL"); - datasetId = config.getProperty("DATASET_ID"); - region = config.getProperty("REGION"); - labelingAlgorithm = config.getProperty("LABELING_ALGORITHM"); - String threadConfig = config.getProperty("THREAD_COUNT"); - if (!threadConfig.isEmpty()) { - threadCount = Integer.parseInt(threadConfig); - } else { - threadCount = 20; - } + configFilePath = StringUtils.isEmpty(System.getProperty(DataLabelingConstants.CONFIG_FILE_PATH)) + ? config.getProperty(DataLabelingConstants.CONFIG_FILE_PATH) + : System.getProperty(DataLabelingConstants.CONFIG_FILE_PATH); + configProfile = StringUtils.isEmpty(System.getProperty(DataLabelingConstants.CONFIG_PROFILE)) + ? config.getProperty(DataLabelingConstants.CONFIG_PROFILE) + : System.getProperty(DataLabelingConstants.CONFIG_PROFILE); + dpEndpoint = StringUtils.isEmpty(System.getProperty(DataLabelingConstants.DLS_DP_URL)) + ? config.getProperty(DataLabelingConstants.DLS_DP_URL) + : System.getProperty(DataLabelingConstants.DLS_DP_URL); + datasetId = StringUtils.isEmpty(System.getProperty(DataLabelingConstants.DATASET_ID)) + ? config.getProperty(DataLabelingConstants.DATASET_ID) + : System.getProperty(DataLabelingConstants.DATASET_ID); + labelingAlgorithm = StringUtils.isEmpty(System.getProperty(DataLabelingConstants.LABELING_ALGORITHM)) + ? config.getProperty(DataLabelingConstants.LABELING_ALGORITHM) + : System.getProperty(DataLabelingConstants.LABELING_ALGORITHM); + String threadConfig = StringUtils.isEmpty(System.getProperty(DataLabelingConstants.THREAD_COUNT)) + ? 
config.getProperty(DataLabelingConstants.THREAD_COUNT) + : System.getProperty(DataLabelingConstants.THREAD_COUNT); + threadCount = (!threadConfig.isEmpty()) ? Integer.parseInt(threadConfig) + : DataLabelingConstants.DEFAULT_THREAD_COUNT; performAssertionOninput(); initializeLabelingStrategy(); validateAndInitializeLabels(config); - dpEndpoint = dpEndpoint.replace("${REGION}", region); dlsDpClient = initializeDpClient(); } catch (IOException ex) { ExceptionUtils.wrapAndThrow(ex); } } + private void initializeLabelingStrategy() { + switch (labelingAlgorithm) { + case "FIRST_LETTER_MATCH": + labelingStrategy = new FirstLetterMatch(); + break; + + case "FIRST_REGEX_MATCH": + labelingStrategy = new FirstRegexMatch(); + break; + + case "CUSTOM_LABELS_MATCH": + labelingStrategy = new CustomLabelMatch(); + break; + } + } + @SuppressWarnings("unchecked") private void validateAndInitializeLabels(Properties config) { switch (labelingAlgorithm) { case "FIRST_LETTER_MATCH": case "FIRST_REGEX_MATCH": - labels = Arrays.asList(config.getProperty("LABELS").split(",")); + String inputlLabels = StringUtils.isEmpty(System.getProperty(DataLabelingConstants.LABELS)) + ? config.getProperty(DataLabelingConstants.LABELS) + : System.getProperty(DataLabelingConstants.LABELS); + labels = Arrays.asList(inputlLabels.split(",")); assert null != labels && labels.isEmpty() == false : "Labels Cannot be empty"; break; + case "CUSTOM_LABELS_MATCH": try { + String customLabel = StringUtils.isEmpty(System.getProperty(DataLabelingConstants.CUSTOM_LABELS)) + ? 
config.getProperty(DataLabelingConstants.CUSTOM_LABELS) + : System.getProperty(DataLabelingConstants.CUSTOM_LABELS); ObjectMapper mapper = new ObjectMapper(); - customLabels = mapper.readValue(config.getProperty("CUSTOM_LABELS"), Map.class); + customLabels = mapper.readValue(customLabel, Map.class); } catch (JsonProcessingException e) { log.error("Invalid Custom Labels Provided as Input"); ExceptionUtils.wrapAndThrow(e); } - - } - if (labelingAlgorithm.equals("FIRST_REGEX_MATCH")) { - regexPattern = config.getProperty("FIRST_MATCH_REGEX_PATTERN"); - } - } - - private void initializeLabelingStrategy() { - switch (labelingAlgorithm) { - case "FIRST_LETTER_MATCH": - labelingStrategy = new FirstLetterMatch(); break; + } - case "FIRST_REGEX_MATCH": - labelingStrategy = new FirstRegexMatch(); - break; - - case "CUSTOM_LABELS_MATCH": - labelingStrategy = new CustomLabelMatch(); - break; + if (labelingAlgorithm.equals("FIRST_REGEX_MATCH")) { + regexPattern = StringUtils.isEmpty(System.getProperty(DataLabelingConstants.FIRST_MATCH_REGEX_PATTERN)) + ? 
config.getProperty(DataLabelingConstants.FIRST_MATCH_REGEX_PATTERN) + : System.getProperty(DataLabelingConstants.FIRST_MATCH_REGEX_PATTERN); + pattern = Pattern.compile(regexPattern); } } @@ -118,7 +139,6 @@ private DataLabelingClient initializeDpClient() { final AuthenticationDetailsProvider configFileProvider = new ConfigFileAuthenticationDetailsProvider( configFile); dlsDpClient = new DataLabelingClient(configFileProvider); - dlsDpClient.setRegion(region); dlsDpClient.setEndpoint(dpEndpoint); return dlsDpClient; } @@ -128,7 +148,6 @@ private void performAssertionOninput() { assert configProfile != null : "Config Profile cannot be empty"; assert dpEndpoint != null : "DLS DP URL cannot be empty"; assert datasetId != null : "Dataset Id cannot be empty"; - assert region != null : "Region Cannot be empty"; assert labelingAlgorithm != null : "Labeling Strategy cannot be empty"; assert threadCount >= 1 : "Invalid Thread Count Passed"; } diff --git a/data_labeling_examples/src/main/java/com/oracle/datalabelingservicesamples/scripts/CustomBulkLabelingScript.java b/data_labeling_examples/src/main/java/com/oracle/datalabelingservicesamples/scripts/CustomBulkLabelingScript.java index 85486f0c..f1c927e1 100644 --- a/data_labeling_examples/src/main/java/com/oracle/datalabelingservicesamples/scripts/CustomBulkLabelingScript.java +++ b/data_labeling_examples/src/main/java/com/oracle/datalabelingservicesamples/scripts/CustomBulkLabelingScript.java @@ -88,6 +88,7 @@ public static void main(String[] args) throws InterruptedException, ExecutionExc .runAsync(() -> processAnnotationForRecord(record, label), executorService); completableFutures.add(future); } else { + log.error("Label is null for record {}",record); failedRecordIds.add(record.getId()); } } diff --git a/data_labeling_examples/src/main/java/com/oracle/datalabelingservicesamples/scripts/SingleLabelDatasetBulkLabelingScript.java 
b/data_labeling_examples/src/main/java/com/oracle/datalabelingservicesamples/scripts/SingleLabelDatasetBulkLabelingScript.java index 7a95eb7e..3ad550f9 100644 --- a/data_labeling_examples/src/main/java/com/oracle/datalabelingservicesamples/scripts/SingleLabelDatasetBulkLabelingScript.java +++ b/data_labeling_examples/src/main/java/com/oracle/datalabelingservicesamples/scripts/SingleLabelDatasetBulkLabelingScript.java @@ -78,6 +78,7 @@ public static void main(String[] args) throws InterruptedException, ExecutionExc .runAsync(() -> processAnnotationForRecord(record, label), executorService); completableFutures.add(future); } else { + log.error("Label is null for record {}",record); failedRecordIds.add(record.getId()); } } diff --git a/data_labeling_examples/src/main/resources/config.properties b/data_labeling_examples/src/main/resources/config.properties index bfdb4a23..86c65eb7 100644 --- a/data_labeling_examples/src/main/resources/config.properties +++ b/data_labeling_examples/src/main/resources/config.properties @@ -1,7 +1,6 @@ CONFIG_FILE_PATH=~/.oci/config CONFIG_PROFILE=DEFAULT -DLS_DP_URL=https://dlstest-dp.${REGION}.oci.oraclecloud.com -REGION=uk-london-1 +DLS_DP_URL=https://dlsprod-dp.uk-london-1.oci.oraclecloud.com THREAD_COUNT=30 DATASET_ID=ocid1.datalabelingdatasetint.oc1.uk-london-1.amaaaaaaniob46iarr2zttq7c5th3jfqwab7d3vrq4daa52tcnnwhkgrowca @@ -16,9 +15,4 @@ LABELS=cat,dog FIRST_MATCH_REGEX_PATTERN=^abc* #Used for CUSTOM_LABELS_MATCH labeling algorithm -CUSTOM_LABELS={ "dog/": ["dog","pup"], "cat/": ["cat","kitten"] } - - - - - +CUSTOM_LABELS={ "dog/": ["dog","pup"], "cat/": ["cat","kitten"] } \ No newline at end of file