Skip to content

Commit

Permalink
improve DemoApp
Browse files Browse the repository at this point in the history
  • Loading branch information
bartekpacia committed Aug 22, 2024
1 parent 9ae8a6d commit 92466ff
Show file tree
Hide file tree
Showing 5 changed files with 74 additions and 62 deletions.
8 changes: 7 additions & 1 deletion maestro-ai/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,14 @@

This project implements AI support for use in Maestro.

It's both a library and a demo-app executable.
It's both a library and an executable demo app.

### Demo app

An API key is required. Set it with the `MAESTRO_CLI_AI_KEY` environment variable. Examples:
- OpenAI: `export MAESTRO_CLI_AI_KEY=sk-...`
- Anthropic: `export MAESTRO_CLI_AI_KEY=sk-ant-api-...`

Build it:

```console
Expand All @@ -17,3 +21,5 @@ then learn how to use it:
```console
./maestro-ai/build/install/maestro-ai-demo/bin/maestro-ai-demo --help
```

Finally, run it:
72 changes: 38 additions & 34 deletions maestro-ai/src/main/java/maestro/ai/DemoApp.kt
Original file line number Diff line number Diff line change
Expand Up @@ -21,13 +21,6 @@ import java.nio.file.Path

// Program entry point: delegates argument parsing and execution to Clikt's
// CliktCommand.main(), which runs DemoApp.run() after parsing options.
fun main(args: Array<String>) = DemoApp().main(args)

// TODO(bartekpacia): Improvement ideas:
// * --only-fail - show only failing test cases
// * --json – to allow for easy filtering with JQ
// * --show-prompts – show prompts that were used
// * Possibility to pass a single screenshot
// Note: maybe instead of building this purpose CLI program, we can use something
// purpose-made for this.
/**
* This is a small helper program to help evaluate LLM results against a directory of screenshots and prompts.
*
Expand Down Expand Up @@ -59,6 +52,8 @@ class DemoApp : CliktCommand() {

private val model: String by option(help = "LLM to use").default("gpt-4o-2024-08-06")

private val showOnlyFails: Boolean by option(help = "Show only failed tests").flag()

private val showPrompts: Boolean by option(help = "Show prompts").flag()

private val showRawResponse: Boolean by option(help = "Show raw LLM response").flag()
Expand All @@ -67,35 +62,42 @@ class DemoApp : CliktCommand() {

private val parallel: Boolean by option(help = "Run in parallel. May get rate limited").flag()

// IDEA: "--json" flag to allow for easy filtering with jq

override fun run() = runBlocking {
val apiKey = System.getenv("MAESTRO_CLI_AI_KEY")
require(apiKey != null) { "OpenAI API key is not provided" }

val testCases = inputFiles.map { it.toFile() }.map { file ->
require(!file.isDirectory) { "Provided file is a directory, not a file" }
require(file.exists()) { "Provided file does not exist" }
require(file.extension == "png") { "Provided file is not a PNG file" }
file
}.map { file ->
val filename = file.nameWithoutExtension
val parts = filename.split("_")
require(parts.size == 3) { "Screenshot name is invalid: ${file.name}" }

val appName = parts[0]
val index = parts[2].toInt()

val promptFile = "${file.parent}/${appName}_${parts[1]}_$index.txt"
println("Prompt file: $promptFile")
val prompt = File(promptFile).run { if (exists()) readText() else null }

TestCase(
screenshot = file,
appName = appName,
hasDefects = parts[1] == "bad",
index = index,
prompt = prompt,
)
}.toList()
require(!file.isDirectory) { "Provided file is a directory, not a file" }
require(file.exists()) { "Provided file does not exist" }
require(file.extension == "png") { "Provided file is not a PNG file" }
file
}.map { file ->
val filename = file.nameWithoutExtension
val parts = filename.split("_")
require(parts.size == 3) { "Screenshot name is invalid: ${file.name}" }

val appName = parts[0]
val index = parts[1].toIntOrNull() ?: throw IllegalArgumentException("Invalid screenshot name: ${file.name}")
val status = parts[2]

val promptFile = "${file.parent}/${appName}_${index}_${status}.txt"
val prompt = File(promptFile).run {
if (exists()) {
println("Found prompt file: $promptFile")
readText()
} else null
}

TestCase(
screenshot = file,
appName = appName,
hasDefects = status == "bad",
index = index,
prompt = prompt,
)
}.toList()

val aiClient: AI = when {
model.startsWith("gpt") -> OpenAI(
Expand Down Expand Up @@ -129,16 +131,16 @@ class DemoApp : CliktCommand() {
verify(testCase, defects)
}

if (parallel) job.await()
if (!parallel) job.await()
}

println("Exited, bye!")
}

private fun verify(testCase: TestCase, defects: List<Defect>) {
if (testCase.hasDefects) {
// Check LLM found defects as well (i.e. didn't commit false negative)
if (defects.isNotEmpty()) {
if (showOnlyFails) return

println(
"""
PASS ${testCase.screenshot.name}: ${defects.size} defects found (as expected)
Expand All @@ -152,6 +154,8 @@ class DemoApp : CliktCommand() {
} else {
// Check that LLM didn't raise false positives
if (defects.isEmpty()) {
if (showOnlyFails) return

println(
"""
PASS ${testCase.screenshot.name}: No defects found (as expected)
Expand Down
49 changes: 25 additions & 24 deletions maestro-ai/src/main/java/maestro/ai/Prediction.kt
Original file line number Diff line number Diff line change
Expand Up @@ -78,29 +78,30 @@ object Prediction {

// Claude doesn't have a JSON mode as of 21-08-2024
// https://docs.anthropic.com/en/docs/test-and-evaluate/strengthen-guardrails/increase-consistency
if (aiClient is Claude) {
append(
"""
|
|* You must provide result as a valid JSON object, matching this structure:
|
| {
| "defects": [
| {
| "category": "<defect category, string>",
| "reasoning": "<reasoning, string>"
| },
| {
| "category": "<defect category, string>",
| "reasoning": "<reasoning, string>"
| }
| ]
| }
|
|DO NOT output any other information in the JSON object.
""".trimMargin("|")
)
}
// We could do "if (aiClient is Claude)", but actually, this also helps with gpt-4o generating
// never-ending stream of output.
append(
"""
|
|* You must provide result as a valid JSON object, matching this structure:
|
| {
| "defects": [
| {
| "category": "<defect category, string>",
| "reasoning": "<reasoning, string>"
| },
| {
| "category": "<defect category, string>",
| "reasoning": "<reasoning, string>"
| }
| ]
| }
|
|DO NOT output any other information in the JSON object.
""".trimMargin("|")
)


if (previousFalsePositives.isNotEmpty()) {
appendLine("Additionally, the following defects are false positives:")
Expand All @@ -109,7 +110,7 @@ object Prediction {
}
}

appendLine("Be brief.")
appendLine("There are usually only a few defects in the screenshot. Don't generate tens of them.")
}

if (printPrompt) {
Expand Down
4 changes: 4 additions & 0 deletions maestro-ai/src/main/java/maestro/ai/antrophic/Client.kt
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ import io.ktor.http.HttpStatusCode
import io.ktor.http.contentType
import io.ktor.http.isSuccess
import io.ktor.util.encodeBase64
import kotlinx.serialization.SerializationException
import kotlinx.serialization.encodeToString
import kotlinx.serialization.json.Json
import kotlinx.serialization.json.JsonObject
Expand Down Expand Up @@ -103,6 +104,9 @@ class Claude(
}

json.decodeFromString<Response>(httpResponse.bodyAsText())
} catch (e: SerializationException) {
logger.error("Failed to parse response from Antrophic", e)
throw e
} catch (e: Exception) {
logger.error("Failed to complete request to Antrophic", e)
throw e
Expand Down
3 changes: 0 additions & 3 deletions maestro-ai/src/main/java/maestro/ai/openai/Client.kt
Original file line number Diff line number Diff line change
Expand Up @@ -104,12 +104,9 @@ class OpenAI(
throw Exception("Failed to complete request to OpenAI: ${httpResponse.status}, $body")
}

print(body)

json.decodeFromString<ChatCompletionResponse>(body)
} catch (e: SerializationException) {
logger.error("Failed to parse response from OpenAI", e)
logger.error("Response body: ${e.message}")
throw e
}
catch (e: Exception) {
Expand Down

0 comments on commit 92466ff

Please sign in to comment.