
Azure Open AI Custom Skills: Ignite demo refinements #207

Open · wants to merge 12 commits into base: main
11 changes: 7 additions & 4 deletions AzureOpenAICustomInferenceSkill/api-test.http

Large diffs are not rendered by default.

68 changes: 68 additions & 0 deletions AzureOpenAICustomInferenceSkill/currently_used_skillset.json
@@ -0,0 +1,68 @@
{
"@odata.context": "https://amitabh-basic-canary.search.windows.net/$metadata#skillsets/$entity",
"@odata.etag": "\"0x8DCEFB4EAF1115B\"",
"name": "ignite-demo-skillset",
"description": "Customized Ignite Demo Skillset using a custom skill to describe an image and then merge its text",
"skills": [
{
"@odata.type": "#Microsoft.Skills.Custom.WebApiSkill",
"name": "Image Captioning Custom Skill",
"description": "The skill which calls a local or deployed Azure function to describe the image",
"context": "/document/normalized_images/*",
"uri": "https://shiny-enigma-5x5rrgwgwxx3p776-7071.app.github.dev/api/custom_skill",
Contributor Author commented:
I am deliberately pointing to a locally running endpoint to showcase that the code can be run end-to-end without deploying to Azure Functions. It's also MUCH easier to debug and test the code this way.

"httpMethod": "POST",
"timeout": "PT30S",
"batchSize": 1000,
"degreeOfParallelism": null,
"authResourceId": null,
"inputs": [
{
"name": "image",
"source": "/document/normalized_images/*"
}
],
"outputs": [
{
"name": "generative-caption",
"targetName": "caption"
}
],
"httpHeaders": {
"scenario": "image-captioning"
},
"authIdentity": null
},
{
"@odata.type": "#Microsoft.Skills.Text.MergeSkill",
"name": "MSFT Text Merge Skill",
"description": "this is the text merging skill provided by Microsoft",
"context": "/document",
"insertPreTag": " ",
"insertPostTag": " ",
"inputs": [
{
"name": "text",
"source": "/document/content"
},
{
"name": "itemsToInsert",
"source": "/document/normalized_images/*/text"
},
{
"name": "offsets",
"source": "/document/normalized_images/*/contentOffset"
}
],
"outputs": [
{
"name": "mergedText",
"targetName": "merged_content"
}
]
}
],
"cognitiveServices": null,
"knowledgeStore": null,
"indexProjections": null,
"encryptionKey": null
}
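For reference, Azure AI Search invokes a WebApiSkill by POSTing a JSON body containing a "values" array of records to the skill's URI. The sketch below is a hypothetical test call, assuming the function is running locally on port 7071 as the inline comment above describes; the base64 string is a placeholder, and the payload shape mirrors the skill's declared "image" input.

import requests

# Hypothetical payload in the WebApiSkill input format: a "values" array of
# records, each with a "recordId" and a "data" object matching the skill's
# declared inputs ("image" here, per the skillset above).
payload = {
    "values": [
        {
            "recordId": "0",
            "data": {
                "image": {
                    "data": "<base64-encoded image>",  # placeholder, not a real image
                    "contentType": "image/jpeg",
                },
            },
        },
    ],
}

# The "scenario" header selects the capability, mirroring the skillset's httpHeaders.
response = requests.post(
    "http://localhost:7071/api/custom_skill",
    json=payload,
    headers={"scenario": "image-captioning"},
)
print(response.json())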
3 changes: 2 additions & 1 deletion AzureOpenAICustomInferenceSkill/custom_prompts.json
@@ -2,5 +2,6 @@
"summarize-default-system-prompt": "You are a useful AI assistant who is an expert at succinctly summarizing long form text into a simple summary. Summarize the text given to you in about 200 words or less.",
"entity-recognition-default-system-prompt": "You are a useful AI assistant. I need you to help me recognize entities in this piece of text. From the text given to you, identity all people names, addresses, email addresses, engineering job titles and present them as individual lists in a JSON object.",
"image-captioning-machine-info-default-prompt": "You are a useful AI assistant who is an expert in machines. You will identify all the parts for the machines in the image sent to you and will formulate the response as a list in JSON.",
"image-captioning-celebrity-recognition-prompt": "You are a useful AI assistant who can recognize celebrities. For the image sent to you as a base64 encoded string, identify all the celebrities in there and present them as a list in a JSON object."
"image-captioning-celebrity-recognition-prompt": "You are a useful AI assistant who can recognize celebrities. For the image sent to you as a base64 encoded string, identify all the celebrities in there and present them as a list in a JSON object.",
"image-captioning-simple-description-prompt": "You are a useful AI assistant who can provide captions for images. You will be provided an image as a base64 encoded string."
}
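Each key in this file is looked up by name in function_app.py below, which loads the file on every request. A minimal sketch of that lookup, mirroring the code further down; the key shown is the one added in this change:

import json

# Load the prompt catalog and select a system prompt by scenario key.
with open("custom_prompts.json", "r") as file:
    custom_prompts = json.load(file)

print(custom_prompts.get("image-captioning-simple-description-prompt"))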
24 changes: 11 additions & 13 deletions AzureOpenAICustomInferenceSkill/function_app.py
@@ -6,8 +6,7 @@

app = func.FunctionApp()

# A healthcheck endpoint. It can be accessed via <base_url>/api/health
@app.route(route="health", auth_level=func.AuthLevel.ANONYMOUS)
def HealthCheck(req: func.HttpRequest) -> func.HttpResponse:
logging.info('Calling the healthcheck endpoint')
@@ -20,7 +19,7 @@ def HealthCheck(req: func.HttpRequest) -> func.HttpResponse:
@app.function_name(name="AOAICustomSkill")
@app.route(route="custom_skill", auth_level=func.AuthLevel.ANONYMOUS)
def custom_skill(req: func.HttpRequest) -> func.HttpResponse:
logging.info("calling the custom skill endpoint")
logging.info("calling the aoai custom skill endpoint")
request_json = dict(req.get_json())
input_values = []
api_key = None
@@ -55,22 +54,18 @@ def call_chat_completion_model(request_body: dict, scenario: str):
"Content-Type": "application/json",
"api-key": api_key,
}
# default our chat completion context to be for summarization
chat_completion_system_context = {}
messages = []
custom_prompts = {}
# read the prompts for the different scenarios from a JSON file called custom_prompts.json
with open('custom_prompts.json', 'r') as file:
custom_prompts = json.load(file)

if scenario == SUMMARIZATION_HEADER:
logging.info("calling into the summarization capability")
chat_completion_system_context = {
"role": "system",
"content": [ # this context has to be dynamic according to the request header
"content": [
{
"type": "text",
# Note: this is a sample summarization prompt which can be tweaked according to your exact needs
"text": custom_prompts.get("summarize-default-system-prompt")
}
]
@@ -93,7 +88,6 @@ def call_chat_completion_model(request_body: dict, scenario: str):
"content": [
{
"type": "text",
# Note: this is a sample prompt which can be tweaked according to your exact needs
"text": custom_prompts.get("entity-recognition-default-system-prompt")
}
]
@@ -110,15 +104,18 @@ def call_chat_completion_model(request_body: dict, scenario: str):
}
]
elif scenario == IMAGE_CAPTIONING_HEADER:
logging.info("calling into the image captioning capability")
image_base64encoded = request_body.get("data", {}).get("image", "")
logging.info("calling the image captioning capability")
raw_image_data = request_body.get("data", {}).get("image", "")
image_data = raw_image_data.get("data")
image_type = raw_image_data.get("contentType")
image_base64encoded = f'data:{image_type};base64,{image_data}'
messages = [ {
"role": "system",
"content":
[
{
"type": "text",
"text": custom_prompts.get("image-captioning-machine-info-default-prompt")
"text": custom_prompts.get("image-captioning-simple-description-prompt")
}
]
},
@@ -131,7 +128,7 @@ def call_chat_completion_model(request_body: dict, scenario: str):
},
{
"type": "text",
"text": "Tell me what this is and what's required to make this."
"text": "I want you to describe this image in 1-2 simple sentences."
},
]
}
@@ -166,4 +163,5 @@ def call_chat_completion_model(request_body: dict, scenario: str):
response_body["data"] = {"entities": top_response_text}
elif scenario == IMAGE_CAPTIONING_HEADER:
response_body["data"] = {"generative-caption": top_response_text}
logging.info(f"the response body is: {response_body}")
return response_body
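For context, the per-record dictionary assembled here appears to be wrapped into the WebApiSkill response format before being returned to Azure AI Search (presumably by the custom_skill handler, outside this diff). A hypothetical full response for the image-captioning scenario, with an illustrative recordId and caption:

# Hypothetical WebApiSkill response; "generative-caption" matches the output name
# declared in the skillset, which maps it to targetName "caption".
sample_response = {
    "values": [
        {
            "recordId": "0",
            "data": {"generative-caption": "A red bicycle leaning against a brick wall."},
            "errors": None,
            "warnings": None,
        }
    ]
}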