feat(ai-model): merge ai planning and insight call to accelerate the aiAction (#97)

--------- Co-authored-by: zhouxiao.shaw <[email protected]> * feat(ai-model): optimize AI model for element inspection * feat(ai-model): optimize AI model and add quick answer functionality --------- Co-authored-by: yuyutaotao <[email protected]> * feat(ai-model): implement quick answer functionality for element inspection --------- Co-authored-by: zhouxiao.shaw <[email protected]>
web-infra-dev · Oct 12, 2024 · f9dc0f6 · f9dc0f6
1 parent 18b5e92
commit f9dc0f6
Show file tree

Hide file tree

Showing 115 changed files with 7,407 additions and 9,311 deletions.
diff --git a/.github/workflows/ai.yml b/.github/workflows/ai.yml
@@ -55,7 +55,7 @@ jobs:
         fi
 
     - name: Build project
-      run: pnpm run build:pkg
+      run: pnpm run build
 
     - name: Run e2e tests
       run: pnpm run e2e

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -48,7 +48,7 @@ jobs:
       run: pnpm install --frozen-lockfile
 
     - name: Build project
-      run: pnpm run build:pkg
+      run: pnpm run build
 
     - name: Run tests
       run: pnpm run test
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
@@ -101,7 +101,7 @@ npx nx build @midscene/web
 Build all packages:
 
 ```sh
-pnpm run build:pkg
+pnpm run build
 ```
 
 ---

diff --git a/package.json b/package.json
@@ -3,14 +3,14 @@
   "private": true,
   "version": "0.5.1",
   "scripts": {
-    "build:pkg": "nx run-many --target=build --projects=@midscene/core,@midscene/shared,@midscene/visualizer,@midscene/web,@midscene/cli --verbose",
+    "build": "nx run-many --target=build --projects=@midscene/core,@midscene/shared,@midscene/visualizer,@midscene/web,@midscene/cli --verbose",
     "test": "nx run-many --target=test --projects=@midscene/core,--projects=@midscene/shared,@midscene/visualizer,@midscene/web,@midscene/cli --verbose",
     "test:ai": "nx run-many --target=test:ai --projects=@midscene/core,@midscene/web --verbose",
     "e2e": "nx run @midscene/web:e2e --verbose",
     "e2e:cache": "nx run @midscene/web:e2e:cache --verbose",
     "e2e:report": "nx run @midscene/web:e2e:report --verbose",
     "test:ai:all": "npm run e2e && npm run e2e:cache && npm run e2e:report && npm run test:ai",
-    "prepare": "pnpm run build:pkg && simple-git-hooks",
+    "prepare": "pnpm run build && simple-git-hooks",
     "check-dependency-version": "check-dependency-version-consistency .",
     "lint": "npx biome check . --diagnostic-level=warn --no-errors-on-unmatched --fix",
     "format:ci": "pretty-quick --since HEAD~1",

diff --git a/packages/cli/package.json b/packages/cli/package.json
@@ -1,6 +1,6 @@
 {
   "name": "@midscene/cli",
-  "description": "Cli for Midscene.js",
+  "description": "An AI-powered automation SDK can control the page, perform assertions, and extract data in JSON format using natural language. See https://midscenejs.com/ for details.",
   "version": "0.5.1",
   "jsnext:source": "./src/index.ts",
   "main": "./dist/lib/index.js",

diff --git a/packages/midscene/package.json b/packages/midscene/package.json
@@ -1,43 +1,18 @@
 {
   "name": "@midscene/core",
-  "description": "Hello, It's Midscene",
+  "description": "An AI-powered automation SDK can control the page, perform assertions, and extract data in JSON format using natural language. See https://midscenejs.com/ for details.",
   "version": "0.5.1",
   "jsnext:source": "./src/index.ts",
+  "type": "commonjs",
   "main": "./dist/lib/index.js",
-  "module": "./dist/es/index.js",
   "types": "./dist/types/index.d.ts",
   "files": ["dist", "report", "README.md"],
   "exports": {
-    ".": {
-      "types": "./dist/types/index.d.ts",
-      "import": "./dist/es/index.js",
-      "require": "./dist/lib/index.js"
-    },
-    "./query": {
-      "types": "./dist/types/query/index.d.ts",
-      "import": "./dist/es/query/index.js",
-      "require": "./dist/lib/query/index.js"
-    },
-    "./demo_data": {
-      "types": "./demo_data/index.d.ts",
-      "import": "./demo_data/index.js",
-      "require": "./demo_data/index.js"
-    },
-    "./utils": {
-      "types": "./dist/types/utils.d.ts",
-      "import": "./dist/es/utils.js",
-      "require": "./dist/lib/utils.js"
-    },
-    "./ai-model": {
-      "types": "./dist/types/ai-model.d.ts",
-      "import": "./dist/es/ai-model.js",
-      "require": "./dist/lib/ai-model.js"
-    },
-    "./image": {
-      "types": "./dist/types/image.d.ts",
-      "import": "./dist/es/image.js",
-      "require": "./dist/lib/image.js"
-    }
+    ".": "./dist/lib/index.js",
+    "./query": "./dist/types/query/index.d.ts",
+    "./utils": "./dist/lib/utils.js",
+    "./ai-model": "./dist/lib/ai-model.js",
+    "./image": "./dist/lib/image.js"
   },
   "typesVersions": {
     "*": {
@@ -55,6 +30,8 @@
     "upgrade": "modern upgrade",
     "test": "vitest --run",
     "test:ai": "AITEST=true npm run test",
+    "evaluate": "npm run test:ai -- tests/ai/evaluate/inspect.test.ts",
+    "evaluate:plan": "PLAN_INSPECT=true npm run test:ai -- tests/ai/evaluate/inspect.test.ts",
     "prepublishOnly": "npm run build"
   },
   "dependencies": {

diff --git a/packages/midscene/src/ai-model/automation/index.ts b/packages/midscene/src/ai-model/automation/index.ts
@@ -6,8 +6,8 @@ import {
   callAiFn,
   transformUserMessages,
 } from '../common';
+import { systemPromptToTaskPlanning } from '../prompt/planning';
 import { describeUserPage } from '../prompt/util';
-import { systemPromptToTaskPlanning } from './planning';
 
 export async function plan(
   userPrompt: string,
@@ -16,7 +16,9 @@ export async function plan(
     callAI?: typeof callAiFn<PlanningAIResponse>;
   },
   useModel?: 'coze' | 'openAI',
-): Promise<{ plans: PlanningAction[] }> {
+): Promise<{
+  plans: PlanningAction[];
+}> {
   const { callAI, context } = opts || {};
   const { screenshotBase64 } = context;
   const { description: pageDescription } = await describeUserPage(context);
@@ -51,19 +53,12 @@ export async function plan(
     },
   ];
 
-  if (callAI) {
-    planFromAI = await callAI({
-      msgs,
-      AIActionType: AIActionType.PLAN,
-      useModel,
-    });
-  } else {
-    planFromAI = await callAiFn({
-      msgs,
-      AIActionType: AIActionType.PLAN,
-      useModel,
-    });
-  }
+  const call = callAI || callAiFn;
+  planFromAI = await call({
+    msgs,
+    AIActionType: AIActionType.PLAN,
+    useModel,
+  });
 
   const actions = planFromAI?.actions || [];
 
@@ -74,11 +69,5 @@ export async function plan(
     throw new Error(planFromAI.error);
   }
 
-  // actions.forEach((task) => {
-  //   if (task.type === 'Error') {
-  //     throw new Error(task.thought);
-  //   }
-  // });
-
   return { plans: actions };
 }
diff --git a/packages/midscene/src/ai-model/automation/planning.ts b/packages/midscene/src/ai-model/automation/planning.ts
diff --git a/packages/midscene/src/ai-model/inspect.ts b/packages/midscene/src/ai-model/inspect.ts
@@ -3,6 +3,7 @@ import type {
   AIAssertionResponse,
   AIElementParseResponse,
   AISectionParseResponse,
+  AISingleElementResponse,
   BaseElement,
   UIContext,
 } from '@/types';
@@ -31,16 +32,27 @@ export async function AiInspectElement<
 >(options: {
   context: UIContext<ElementType>;
   multi: boolean;
-  findElementDescription: string;
+  targetElementDescription: string;
   callAI?: typeof callAiFn<AIElementParseResponse>;
   useModel?: 'coze' | 'openAI';
+  quickAnswer?: AISingleElementResponse;
 }) {
-  const { context, multi, findElementDescription, callAI, useModel } = options;
+  const { context, multi, targetElementDescription, callAI, useModel } =
+    options;
   const { screenshotBase64 } = context;
   const { description, elementById } = await describeUserPage(context);
 
-  const systemPrompt = systemPromptToFindElement();
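+  // Quick-answer hit: the caller supplied an element whose id still resolves via elementById, so return it directly instead of building the prompt below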
+  if (options.quickAnswer?.id && elementById(options.quickAnswer.id)) {
+    return {
+      parseResult: {
+        elements: [options.quickAnswer],
+      },
+      elementById,
+    };
+  }
 
+  const systemPrompt = systemPromptToFindElement();
   const msgs: AIArgs = [
     { role: 'system', content: systemPrompt },
     {
@@ -58,10 +70,10 @@ export async function AiInspectElement<
     pageDescription: \n
     ${description}
 
-    Here is the description of the findElement. Just go ahead:
+    Here is the item user want to find. Just go ahead:
     =====================================
     ${JSON.stringify({
-      description: findElementDescription,
+      description: targetElementDescription,
       multi: multiDescription(multi),
     })}
     =====================================

diff --git a/packages/midscene/src/ai-model/openai/index.ts b/packages/midscene/src/ai-model/openai/index.ts
@@ -3,9 +3,9 @@ import { AIResponseFormat } from '@/types';
 import { wrapOpenAI } from 'langsmith/wrappers';
 import OpenAI, { type ClientOptions, AzureOpenAI } from 'openai';
 import type { ChatCompletionMessageParam } from 'openai/resources';
-import { planSchema } from '../automation/planning';
 import { AIActionType } from '../common';
 import { findElementSchema } from '../prompt/element_inspector';
+import { planSchema } from '../prompt/planning';
 import { assertSchema } from '../prompt/util';
 
 export const MIDSCENE_OPENAI_INIT_CONFIG_JSON =
@@ -51,7 +51,7 @@ async function createOpenAI() {
 
   if (process.env[MIDSCENE_LANGSMITH_DEBUG]) {
     console.log('DEBUGGING MODE: langsmith wrapper enabled');
-    const openai = wrapOpenAI(new OpenAI());
+    const openai = wrapOpenAI(new OpenAI(extraConfig));
     return openai;
   }
 
@@ -73,7 +73,7 @@ export async function call(
     model,
     messages,
     response_format: responseFormat,
-    temperature: 0.2,
+    temperature: 0.1,
     stream: false,
   });
   shouldPrintTiming && console.timeEnd('Midscene - AI call');

diff --git a/packages/midscene/src/ai-model/prompt/element_inspector.ts b/packages/midscene/src/ai-model/prompt/element_inspector.ts
@@ -62,7 +62,8 @@ Input Example:
       },
       "elementInfos": [
         {
-          "id": "3", // ID of the element
+          "id": "we23xsfwe", // ID of the element
+          "indexId": "0", // Index of the element，The image is labeled to the left of the element
           "attributes": { // Attributes of the element
             "nodeType": "IMG Node", // Type of element, types include: TEXT Node, IMG Node, BUTTON Node, INPUT Node
             "src": "https://ap-southeast-3.m",
@@ -77,7 +78,8 @@ Input Example:
           }
         },
         {
-          "id": "4", // ID of the element
+          "id": "wefew2222few2", // ID of the element
+          "indexId": "1", // Index of the element，The image is labeled to the left of the element
           "attributes": { // Attributes of the element
             "nodeType": "IMG Node", // Type of element, types include: TEXT Node, IMG Node, BUTTON Node, INPUT Node
             "src": "data:image/png;base64,iVBORw0KGgoAAAANSU...",
@@ -93,7 +95,8 @@ Input Example:
         },
         ...
         {
-          "id": "27",
+          "id": "kwekfj2323",
+          "indexId": "2", // Index of the element，The image is labeled to the left of the element
           "attributes": {
             "nodeType": "TEXT Node",
             "class": ".product-name"
@@ -125,7 +128,7 @@ Output Example:
       "reason": "Reason for finding element 4: It is located in the upper right corner, is an image type, and according to the screenshot, it is a shopping cart icon button",
       "text": "",
       // ID of this element, replace with actual value in practice
-      "id": "4"
+      "id": "wefew2222few2"
     }
   ],
   "errors": []