diff --git a/maestro-ai/src/main/java/maestro/ai/AI.kt b/maestro-ai/src/main/java/maestro/ai/AI.kt index 1817c1916f..119a630fad 100644 --- a/maestro-ai/src/main/java/maestro/ai/AI.kt +++ b/maestro-ai/src/main/java/maestro/ai/AI.kt @@ -41,13 +41,6 @@ abstract class AI( // * OpenAI: https://platform.openai.com/docs/guides/structured-outputs // * Gemini: https://ai.google.dev/gemini-api/docs/json-mode - val checkAssertion: String = run { - val resourceStream = this::class.java.getResourceAsStream("/checkAssertion_schema.json") - ?: throw IllegalStateException("Could not find checkAssertion_schema.json in resources") - - resourceStream.bufferedReader().use { it.readText() } - } - val askForDefectsSchema: String = run { val resourceStream = this::class.java.getResourceAsStream("/askForDefects_schema.json") ?: throw IllegalStateException("Could not find askForDefects_schema.json in resources") diff --git a/maestro-ai/src/main/java/maestro/ai/DemoApp.kt b/maestro-ai/src/main/java/maestro/ai/DemoApp.kt index c64d10cbcb..d75266b321 100644 --- a/maestro-ai/src/main/java/maestro/ai/DemoApp.kt +++ b/maestro-ai/src/main/java/maestro/ai/DemoApp.kt @@ -117,13 +117,25 @@ class DemoApp : CliktCommand() { val bytes = testCase.screenshot.readBytes() val job = async { - val defects = Prediction.findDefects( + val defects = if (testCase.prompt == null) Prediction.findDefects( aiClient = aiClient, screen = bytes, previousFalsePositives = listOf(), printPrompt = showPrompts, printRawResponse = showRawResponse, - ) + ) else { + val result = Prediction.performAssertion( + aiClient = aiClient, + screen = bytes, + assertion = testCase.prompt, + printPrompt = showPrompts, + printRawResponse = showRawResponse, + ) + + if (result == null) emptyList() + else listOf(result) + } + verify(testCase, defects) } diff --git a/maestro-ai/src/main/java/maestro/ai/Prediction.kt b/maestro-ai/src/main/java/maestro/ai/Prediction.kt index c416ea4253..aa85462b1a 100644 --- 
a/maestro-ai/src/main/java/maestro/ai/Prediction.kt +++ b/maestro-ai/src/main/java/maestro/ai/Prediction.kt @@ -12,16 +12,10 @@ data class Defect( ) @Serializable -private data class FindDefectsResponse( +private data class ModelResponse( val defects: List<Defect>, ) -@Serializable -data class PerformAssertionResult( - val passed: Boolean, - val reasoning: String, -) - object Prediction { private val json = Json { ignoreUnknownKeys = true } @@ -30,6 +24,8 @@ object Prediction { "layout" to "Some UI elements are overlapping or are cropped", ) + private val allDefectCategories = defectCategories + listOf("assertion" to "The assertion is not true") + suspend fun findDefects( aiClient: AI, screen: ByteArray, @@ -126,7 +122,7 @@ object Prediction { println("--- RAW RESPONSE END ---") } - val defects = json.decodeFromString<FindDefectsResponse>(aiResponse.response) + val defects = json.decodeFromString<ModelResponse>(aiResponse.response) return defects.defects } @@ -136,7 +132,7 @@ object Prediction { assertion: String, printPrompt: Boolean = false, printRawResponse: Boolean = false, - ): PerformAssertionResult { + ): Defect? { val prompt = buildString { appendLine( @@ -150,22 +146,37 @@ object Prediction { """.trimMargin("|") ) + append( + """ + | + |RULES: + |* Provide response as a valid JSON, with structure described below. + |* If the assertion is true, the list in the JSON output MUST be empty. + |* If assertion is false: + | * Your response MUST only include a single defect with category "assertion". + | * Provide detailed reasoning to explain why you think the assertion is false. + """.trimMargin("|") + ) + // Claude doesn't have a JSON mode as of 21-08-2024 // https://docs.anthropic.com/en/docs/test-and-evaluate/strengthen-guardrails/increase-consistency // We could do "if (aiClient is Claude)", but actually, this also helps with gpt-4o sometimes - // generating never-ending stream of output. + // generating never-ending stream of output. 
append( """ | |* You must provide result as a valid JSON object, matching this structure: | | { - | "result": { - | "passed": "", - | "reasoning": "" - | }, + | "defects": [ + | { + | "category": "assertion", + | "reasoning": "" + | } + | ] | } | + |The "defects" array MUST contain at most a single JSON object. |DO NOT output any other information in the JSON object. """.trimMargin("|") ) @@ -184,7 +195,7 @@ object Prediction { identifier = "perform-assertion", imageDetail = "high", images = listOf(screen), - jsonSchema = if (aiClient is OpenAI) json.parseToJsonElement(AI.checkAssertion).jsonObject else null, + jsonSchema = if (aiClient is OpenAI) json.parseToJsonElement(AI.askForDefectsSchema).jsonObject else null, ) if (printRawResponse) { @@ -193,7 +204,7 @@ object Prediction { println("--- RAW RESPONSE END ---") } - val result = json.decodeFromString<PerformAssertionResult>(aiResponse.response) - return result + val response = json.decodeFromString<ModelResponse>(aiResponse.response) + return response.defects.firstOrNull() } } diff --git a/maestro-ai/src/main/resources/checkAssertion_schema.json b/maestro-ai/src/main/resources/checkAssertion_schema.json deleted file mode 100644 index 2191a9ba9a..0000000000 --- a/maestro-ai/src/main/resources/checkAssertion_schema.json +++ /dev/null @@ -1,25 +0,0 @@ -{ - "name": "checkAssertion", - "description": "Return whether the provided assertion about the mobile app's UI is true", - "strict": true, - "schema": { - "type": "object", - "required": ["defects"], - "additionalProperties": false, - "properties": { - "result": { - "type": "object", - "required": ["passed", "reasoning"], - "additionalProperties": false, - "properties": { - "passed": { - "type": "boolean" - }, - "reasoning": { - "type": "string" - } - } - } - } - } -} diff --git a/maestro-cli/src/main/java/maestro/cli/runner/resultview/AnsiResultView.kt b/maestro-cli/src/main/java/maestro/cli/runner/resultview/AnsiResultView.kt index 5e620bf72b..3b7fb574ff 100644 --- 
a/maestro-cli/src/main/java/maestro/cli/runner/resultview/AnsiResultView.kt +++ b/maestro-cli/src/main/java/maestro/cli/runner/resultview/AnsiResultView.kt @@ -203,8 +203,8 @@ class AnsiResultView( CommandStatus.COMPLETED -> "✅" CommandStatus.FAILED -> "❌" CommandStatus.RUNNING -> "⏳" - CommandStatus.PENDING -> "\uD83D\uDD32" - CommandStatus.SKIPPED -> "⚠️️" + CommandStatus.PENDING -> "\uD83D\uDD32 " // 🔲 + CommandStatus.SKIPPED -> "⚪️" } } diff --git a/maestro-orchestra-models/src/main/java/maestro/orchestra/Commands.kt b/maestro-orchestra-models/src/main/java/maestro/orchestra/Commands.kt index 577f430c74..dd8dd59939 100644 --- a/maestro-orchestra-models/src/main/java/maestro/orchestra/Commands.kt +++ b/maestro-orchestra-models/src/main/java/maestro/orchestra/Commands.kt @@ -385,7 +385,7 @@ data class AssertWithAICommand( override fun description(): String { if (label != null) return label - return "Assert no defects with AI: $assertion" + return "Assert with AI: $assertion" } override fun evaluateScripts(jsEngine: JsEngine): Command { diff --git a/maestro-orchestra/src/main/java/maestro/orchestra/Orchestra.kt b/maestro-orchestra/src/main/java/maestro/orchestra/Orchestra.kt index 923d84be0c..f4a397b67d 100644 --- a/maestro-orchestra/src/main/java/maestro/orchestra/Orchestra.kt +++ b/maestro-orchestra/src/main/java/maestro/orchestra/Orchestra.kt @@ -351,7 +351,6 @@ class Orchestra( val defects = Prediction.findDefects( aiClient = ai, - assertion = null, screen = imageData.copy().readByteArray(), previousFalsePositives = listOf(), // TODO(bartekpacia): take it from WorkspaceConfig (or MaestroConfig?) ) @@ -363,7 +362,7 @@ class Orchestra( val word = if (defects.size == 1) "defect" else "defects" throw MaestroException.AssertionFailure( - "Ffound ${defects.size} possible $word. See the report after the test completes to learn more.", + "Found ${defects.size} possible $word. 
See the report after the test completes to learn more.", maestro.viewHierarchy().root, ) } @@ -381,21 +380,19 @@ class Orchestra( val imageData = Buffer() maestro.takeScreenshot(imageData, compressed = false) - val defects = Prediction.findDefects( + val defect = Prediction.performAssertion( aiClient = ai, - assertion = command.assertion, screen = imageData.copy().readByteArray(), - previousFalsePositives = listOf(), // TODO(bartekpacia): take it from WorkspaceConfig (or MaestroConfig?) + assertion = command.assertion, ) - if (defects.isNotEmpty()) { - onCommandGeneratedOutput(command, defects, imageData) + if (defect != null) { + onCommandGeneratedOutput(command, listOf(defect), imageData) if (command.optional) throw CommandSkipped - val word = if (defects.size == 1) "defect" else "defects" throw MaestroException.AssertionFailure( - "Visual AI found ${defects.size} possible $word. See the report to learn more.", + "Assertion failed. See the report to learn more.", maestro.viewHierarchy().root, ) }