web-infra-dev · zhoushaw · Sep 29, 2024 · Sep 12, 2024 · Sep 13, 2024 · Sep 13, 2024
diff --git a/.vscode/settings.json b/.vscode/settings.json
@@ -3,6 +3,5 @@
     "source.organizeImports.biome": "explicit"
   },
   "editor.defaultFormatter": "biomejs.biome",
-  "editor.formatOnSave": true,
-  "editor.formatOnSaveMode": "modifications"
+  "editor.formatOnSave": true
 }
diff --git a/biome.json b/biome.json
@@ -9,6 +9,8 @@
       "**/dist",
       "**/test-data/**",
       "dist",
+      "__ai_responses__",
+      "ai-data/**",
       "**/doc_build",
       "*-dump.json",
       "script_get_all_texts.tmp.js",

diff --git a/packages/midscene/src/ai-model/automation/index.ts b/packages/midscene/src/ai-model/automation/index.ts
@@ -1,6 +1,11 @@
 import assert from 'node:assert';
 import type { PlanningAIResponse, PlanningAction, UIContext } from '@/types';
-import { AIActionType, type AIArgs, callAiFn } from '../common';
+import {
+  AIActionType,
+  type AIArgs,
+  callAiFn,
+  transformUserMessages,
+} from '../common';
 import { describeUserPage } from '../prompt/util';
 import { systemPromptToTaskPlanning } from './planning';
 
@@ -22,7 +27,7 @@ export async function plan(
     { role: 'system', content: systemPrompt },
     {
       role: 'user',
-      content: [
+      content: transformUserMessages([
         {
           type: 'image_url',
           image_url: {
@@ -33,19 +38,16 @@ export async function plan(
         {
           type: 'text',
           text: `
-            pageDescription: ${pageDescription}
+            pageDescription:\n 
+            ${pageDescription}
+            \n
+            Here is the description of the task. Just go ahead:
+            =====================================
+            ${userPrompt}
+            =====================================
           `,
         },
-        {
-          type: 'text',
-          text: `
-                Here is the description of the task. Just go ahead:
-                =====================================
-                ${userPrompt}
-                =====================================
-            `,
-        },
-      ],
+      ]),
     },
   ];
 

diff --git a/packages/midscene/src/ai-model/automation/planning.ts b/packages/midscene/src/ai-model/automation/planning.ts
@@ -37,8 +37,8 @@ export function systemPromptToTaskPlanning() {
   2. In most cases, you should Locate one element first, then do other actions on it. For example, alway Find one element, then hover on it. But if you think it's necessary to do other actions first (like global scroll, global key press), you can do that.
 
   If the planned tasks are sequential and tasks may appear only after the execution of previous tasks, this is considered normal. Thoughts, prompts, and error messages should all be in the same language as the user query.
-  
-  Return in the following JSON format:
+
+  Please return the result in JSON format as follows:
   {
     queryLanguage: '', // language of the description of the task
     actions: [ // always return in Array

diff --git a/packages/midscene/src/ai-model/common.ts b/packages/midscene/src/ai-model/common.ts
@@ -1,5 +1,6 @@
 import type OpenAI from 'openai';
 import type {
+  ChatCompletionContentPart,
   ChatCompletionSystemMessageParam,
   ChatCompletionUserMessageParam,
 } from 'openai/resources';
@@ -12,7 +13,11 @@ import {
   transfromOpenAiArgsToCoze,
   useCozeModel,
 } from './coze';
-import { callToGetJSONObject, useOpenAIModel } from './openai';
+import {
+  MIDSCENE_MODEL_TEXT_ONLY,
+  callToGetJSONObject,
+  useOpenAIModel,
+} from './openai';
 
 export type AIArgs = [
   ChatCompletionSystemMessageParam,
@@ -64,3 +69,15 @@ export async function callAiFn<T>(options: {
     'Cannot find Coze or OpenAI config. You should set at least one of them.',
   );
 }
+/**
+ * Adapts the content parts of a user message to the configured model mode.
+ *
+ * Reads the environment variable whose name is held in MIDSCENE_MODEL_TEXT_ONLY.
+ * When that variable is unset or empty, msgs is returned unchanged. Otherwise
+ * the text of every part whose type is 'text' is concatenated, in order and
+ * with no separator, into a single string; parts of any other type (for
+ * example image_url parts) are left out.
+ *
+ * NOTE(review): the flag is evaluated with Boolean() on a string, so values
+ * such as "0" or "false" also switch text-only mode on - confirm this is intended.
+ * NOTE(review): the result is either the original array or a string; callers
+ * have to accept both shapes.
+ *
+ * @param msgs - Content parts that make up one user message.
+ * @returns msgs itself in the default mode, or the joined text in text-only mode.
+ */
+export function transformUserMessages(msgs: ChatCompletionContentPart[]) {
+  const textOnly = Boolean(process.env[MIDSCENE_MODEL_TEXT_ONLY]); // true for any non-empty value
+  if (!textOnly) return msgs; // default mode: pass the parts through untouched
+
+  return msgs.reduce((res, msg) => {
+    if (msg.type === 'text') {
+      res += msg.text; // keep text parts only; other part types are skipped
+    }
+    return res;
+  }, ''); // accumulator starts as an empty string, so the result is a string
+}
diff --git a/packages/midscene/src/ai-model/inspect.ts b/packages/midscene/src/ai-model/inspect.ts
@@ -10,7 +10,7 @@ import type {
   ChatCompletionSystemMessageParam,
   ChatCompletionUserMessageParam,
 } from 'openai/resources';
-import { AIActionType, callAiFn } from './common';
+import { AIActionType, callAiFn, transformUserMessages } from './common';
 import {
   multiDescription,
   systemPromptToFindElement,
@@ -45,34 +45,29 @@ export async function AiInspectElement<
     { role: 'system', content: systemPrompt },
     {
       role: 'user',
-      content: [
+      content: transformUserMessages([
         {
           type: 'image_url',
           image_url: {
             url: screenshotBase64,
-            detail: 'high',
           },
         },
         {
           type: 'text',
           text: `
-            pageDescription: \n
-            ${description}
-          `,
-        },
-        {
-          type: 'text',
-          text: `
-          Here is the description of the findElement. Just go ahead:
-          =====================================
-          ${JSON.stringify({
-            description: findElementDescription,
-            multi: multiDescription(multi),
-          })}
-          =====================================
-          `,
+    pageDescription: \n
+    ${description}
+
+    Here is the description of the findElement. Just go ahead:
+    =====================================
+    ${JSON.stringify({
+      description: findElementDescription,
+      multi: multiDescription(multi),
+    })}
+    =====================================
+  `,
         },
-      ],
+      ]),
     },
   ];
 
@@ -117,7 +112,7 @@ export async function AiExtractElementInfo<
     { role: 'system', content: systemPrompt },
     {
       role: 'user',
-      content: [
+      content: transformUserMessages([
         {
           type: 'image_url',
           image_url: {
@@ -142,7 +137,7 @@ ${typeof dataQuery === 'string' ? dataQuery : JSON.stringify(dataQuery, null, 2)
 DATA_DEMAND ends.
           `,
         },
-      ],
+      ]),
     },
   ];
 
@@ -176,7 +171,7 @@ export async function AiAssert<
     { role: 'system', content: systemPrompt },
     {
       role: 'user',
-      content: [
+      content: transformUserMessages([
         {
           type: 'image_url',
           image_url: {
@@ -186,20 +181,15 @@ export async function AiAssert<
         {
           type: 'text',
           text: `
-            pageDescription: \n
-            ${description}
-          `,
-        },
-        {
-          type: 'text',
-          text: `
-            Here is the description of the assertion. Just go ahead:
-            =====================================
-            ${assertion}
-            =====================================
-          `,
+    pageDescription: \n
+    ${description}
+    Here is the description of the assertion. Just go ahead:
+    =====================================
+    ${assertion}
+    =====================================
+  `,
         },
-      ],
+      ]),
     },
   ];
 

diff --git a/packages/midscene/src/ai-model/openai/index.ts b/packages/midscene/src/ai-model/openai/index.ts
@@ -14,6 +14,7 @@ export const MIDSCENE_MODEL_NAME = 'MIDSCENE_MODEL_NAME';
 export const MIDSCENE_LANGSMITH_DEBUG = 'MIDSCENE_LANGSMITH_DEBUG';
 export const MIDSCENE_DEBUG_AI_PROFILE = 'MIDSCENE_DEBUG_AI_PROFILE';
 export const OPENAI_API_KEY = 'OPENAI_API_KEY';
+export const MIDSCENE_MODEL_TEXT_ONLY = 'MIDSCENE_MODEL_TEXT_ONLY';
 
 const OPENAI_USE_AZURE = 'OPENAI_USE_AZURE';
 
@@ -73,6 +74,7 @@ export async function call(
     messages,
     response_format: responseFormat,
     temperature: 0.2,
+    stream: false,
   });
   shouldPrintTiming && console.timeEnd('Midscene - AI call');
   shouldPrintTiming && console.log('Midscene - AI usage', completion.usage);
@@ -110,7 +112,35 @@ export async function callToGetJSONObject<T>(
     }
   }
 
+  if (model.startsWith('gemini')) {
+    responseFormat = { type: AIResponseFormat.TEXT };
+  }
+
   const response = await call(messages, responseFormat);
   assert(response, 'empty response');
-  return JSON.parse(response.replace(/^```json\n|\n```$/g, ''));
+  const jsonContent = extractJSONFromCodeBlock(response);
+  return JSON.parse(jsonContent);
+}
+/**
+ * Pulls the text of a JSON object out of a model response.
+ *
+ * Three patterns are tried in order and the first match wins:
+ *   1. the whole response, ignoring surrounding whitespace, runs from { to };
+ *   2. a triple-backtick fenced block, optionally tagged json, that wraps { ... };
+ *   3. the span from the first { to the last } anywhere in the response.
+ * If none matches, the response is returned unchanged.
+ *
+ * The returned text is not parsed or validated here; the caller visible in
+ * this diff (callToGetJSONObject) hands it to JSON.parse.
+ *
+ * NOTE(review): every pattern requires braces, so a response whose top-level
+ * value is an array is never isolated correctly - confirm that all prompts
+ * request an object.
+ *
+ * @param response - Raw text returned by the model.
+ * @returns The extracted object text, or response itself when nothing matched.
+ */
+export function extractJSONFromCodeBlock(response: string) {
+  // 1) The response as a whole, apart from surrounding whitespace, starts with { and ends with }
+  const jsonMatch = response.match(/^\s*(\{[\s\S]*\})\s*$/);
+  if (jsonMatch) {
+    return jsonMatch[1];
+  }
+
+  // 2) A fenced code block (the json tag is optional) wrapping an object; the lazy match must end right before the closing fence
+  const codeBlockMatch = response.match(/```(?:json)?\s*(\{[\s\S]*?\})\s*```/);
+  if (codeBlockMatch) {
+    return codeBlockMatch[1];
+  }
+
+  // 3) Fall back to a greedy match: everything from the first { to the last } in the text
+  const jsonLikeMatch = response.match(/\{[\s\S]*\}/);
+  if (jsonLikeMatch) {
+    return jsonLikeMatch[0];
+  }
+
+  // 4) Nothing brace-delimited was found: return the response unchanged
+  return response;
 }
diff --git a/packages/midscene/src/ai-model/prompt/element_inspector.ts b/packages/midscene/src/ai-model/prompt/element_inspector.ts
@@ -28,6 +28,9 @@ You are an expert in software page image (2D) and page element text analysis.
 - The returned data must conform to the specified JSON format.
 
 ## Output Format:
+
+Please return the result in JSON format as follows:
+
 \`\`\`json
 {
   "elements": [