diff --git a/maestro-ai/src/main/java/maestro/ai/AI.kt b/maestro-ai/src/main/java/maestro/ai/AI.kt
index 73cbfa3746..1817c1916f 100644
--- a/maestro-ai/src/main/java/maestro/ai/AI.kt
+++ b/maestro-ai/src/main/java/maestro/ai/AI.kt
@@ -41,7 +41,14 @@ abstract class AI(
         // * OpenAI: https://platform.openai.com/docs/guides/structured-outputs
         // * Gemini: https://ai.google.dev/gemini-api/docs/json-mode
 
-        val assertVisualSchema: String = run {
+        val checkAssertion: String = run {
+            val resourceStream = this::class.java.getResourceAsStream("/checkAssertion_schema.json")
+                ?: throw IllegalStateException("Could not find checkAssertion_schema.json in resources")
+
+            resourceStream.bufferedReader().use { it.readText() }
+        }
+
+        val askForDefectsSchema: String = run {
             val resourceStream = this::class.java.getResourceAsStream("/askForDefects_schema.json")
                 ?: throw IllegalStateException("Could not find askForDefects_schema.json in resources")
 
diff --git a/maestro-ai/src/main/java/maestro/ai/DemoApp.kt b/maestro-ai/src/main/java/maestro/ai/DemoApp.kt
index 2d725262d8..c64d10cbcb 100644
--- a/maestro-ai/src/main/java/maestro/ai/DemoApp.kt
+++ b/maestro-ai/src/main/java/maestro/ai/DemoApp.kt
@@ -8,10 +8,7 @@ import com.github.ajalt.clikt.parameters.options.flag
 import com.github.ajalt.clikt.parameters.options.option
 import com.github.ajalt.clikt.parameters.types.float
 import com.github.ajalt.clikt.parameters.types.path
-import kotlinx.coroutines.CoroutineScope
-import kotlinx.coroutines.Dispatchers
 import kotlinx.coroutines.async
-import kotlinx.coroutines.launch
 import kotlinx.coroutines.runBlocking
 import maestro.ai.antrophic.Claude
 import maestro.ai.openai.OpenAI
@@ -79,7 +76,8 @@ class DemoApp : CliktCommand() {
                 require(parts.size == 3) { "Screenshot name is invalid: ${file.name}" }
 
                 val appName = parts[0]
-                val index = parts[1].toIntOrNull() ?: throw IllegalArgumentException("Invalid screenshot name: ${file.name}")
+                val index =
+                    parts[1].toIntOrNull() ?: throw IllegalArgumentException("Invalid screenshot name: ${file.name}")
                 val status = parts[2]
 
                 val promptFile = "${file.parent}/${appName}_${index}_${status}.txt"
@@ -93,7 +91,7 @@ class DemoApp : CliktCommand() {
                 TestCase(
                     screenshot = file,
                     appName = appName,
-                    hasDefects = status == "bad",
+                    shouldPass = status == "good",
                     index = index,
                     prompt = prompt,
                 )
@@ -123,11 +121,9 @@ class DemoApp : CliktCommand() {
                     aiClient = aiClient,
                     screen = bytes,
                     previousFalsePositives = listOf(),
-                    assertion = testCase.prompt,
                     printPrompt = showPrompts,
                     printRawResponse = showRawResponse,
                 )
-
                 verify(testCase, defects)
             }
 
@@ -136,8 +132,8 @@ class DemoApp : CliktCommand() {
     }
 
     private fun verify(testCase: TestCase, defects: List) {
-        if (testCase.hasDefects) {
-            // Check LLM found defects as well (i.e. didn't commit false negative)
+        if (!testCase.shouldPass) {
+            // Check if LLM found defects (i.e. didn't commit false negative)
             if (defects.isNotEmpty()) {
                 if (showOnlyFails) return
 
@@ -177,6 +173,6 @@ data class TestCase(
     val screenshot: File,
     val appName: String,
     val prompt: String?,
-    val hasDefects: Boolean,
+    val shouldPass: Boolean,
     val index: Int,
 )
diff --git a/maestro-ai/src/main/java/maestro/ai/Prediction.kt b/maestro-ai/src/main/java/maestro/ai/Prediction.kt
index 5152404ea1..c416ea4253 100644
--- a/maestro-ai/src/main/java/maestro/ai/Prediction.kt
+++ b/maestro-ai/src/main/java/maestro/ai/Prediction.kt
@@ -3,7 +3,6 @@ package maestro.ai
 import kotlinx.serialization.Serializable
 import kotlinx.serialization.json.Json
 import kotlinx.serialization.json.jsonObject
-import maestro.ai.antrophic.Claude
 import maestro.ai.openai.OpenAI
 
 @Serializable
@@ -17,10 +16,16 @@ private data class FindDefectsResponse(
     val defects: List,
 )
 
+@Serializable
+data class PerformAssertionResult(
+    val passed: Boolean,
+    val reasoning: String,
+)
+
 object Prediction {
     private val json = Json { ignoreUnknownKeys = true }
 
-    private val categories = listOf(
+    private val defectCategories = listOf(
         "localization" to "Inconsistent use of language, for example mixed English and Portuguese",
         "layout" to "Some UI elements are overlapping or are cropped",
     )
@@ -28,7 +33,6 @@ object Prediction {
     suspend fun findDefects(
         aiClient: AI,
         screen: ByteArray,
-        assertion: String?,
         previousFalsePositives: List,
         printPrompt: Boolean = false,
         printRawResponse: Boolean = false,
@@ -54,7 +58,7 @@ object Prediction {
                 |
                 |RULES:
                 |* All defects you find must belong to one of the following categories:
-                |${categories.joinToString(separator = "\n") { " * ${it.first}: ${it.second}" }}
+                |${defectCategories.joinToString(separator = "\n") { " * ${it.first}: ${it.second}" }}
                 |* If you see defects, your response MUST only include defect name and detailed reasoning for each defect.
                 |* Provide response as a list of JSON objects, each representing :
                 |* Do not raise false positives. Some example responses that have a high chance of being a false positive:
@@ -63,23 +67,10 @@ object Prediction {
                 """.trimMargin("|")
             )
 
-            if (assertion != null) {
-                append(
-                    """
-                    |
-                    |
-                    |Additionally, if the following assertion isn't true, consider it as a defect with category "assertion":
-                    |
-                    | "${assertion.removeSuffix("\n")}"
-                    |
-                    |""".trimMargin("|")
-                )
-            }
-
             // Claude doesn't have a JSON mode as of 21-08-2024
             // https://docs.anthropic.com/en/docs/test-and-evaluate/strengthen-guardrails/increase-consistency
-            // We could do "if (aiClient is Claude)", but actually, this also helps with gpt-4o generating
-            // never-ending stream of output.
+            // We could do "if (aiClient is Claude)", but actually, this also helps with gpt-4o sometimes
+            // generating never-ending stream of output.
             append(
                 """
                 |
@@ -126,7 +117,7 @@ object Prediction {
             identifier = "find-defects",
             imageDetail = "high",
             images = listOf(screen),
-            jsonSchema = if (aiClient is OpenAI) json.parseToJsonElement(AI.assertVisualSchema).jsonObject else null,
+            jsonSchema = if (aiClient is OpenAI) json.parseToJsonElement(AI.askForDefectsSchema).jsonObject else null,
         )
 
         if (printRawResponse) {
@@ -138,4 +129,84 @@ object Prediction {
         val defects = json.decodeFromString(aiResponse.response)
         return defects.defects
     }
+
+    /**
+     * Asks [aiClient] whether [assertion] holds for the UI shown in [screen].
+     *
+     * @param aiClient client used to run the chat completion (with its default model)
+     * @param screen screenshot bytes, sent as the only image
+     * @param assertion statement to verify; one trailing newline is stripped before use
+     * @param printPrompt when true, prints the generated prompt to stdout
+     * @param printRawResponse when true, prints the raw model response to stdout
+     * @return whether the assertion passed, together with the model's reasoning
+     */
+    suspend fun performAssertion(
+        aiClient: AI,
+        screen: ByteArray,
+        assertion: String,
+        printPrompt: Boolean = false,
+        printRawResponse: Boolean = false,
+    ): PerformAssertionResult {
+        val prompt = buildString {
+
+            appendLine(
+                """
+                |You are a QA engineer performing quality assurance for a mobile application.
+                |You are given a screenshot of the application and an assertion about the UI.
+                |Your task is to identify if the following assertion is true:
+                |
+                | "${assertion.removeSuffix("\n")}"
+                |
+                """.trimMargin("|")
+            )
+
+            // Claude doesn't have a JSON mode as of 21-08-2024
+            // https://docs.anthropic.com/en/docs/test-and-evaluate/strengthen-guardrails/increase-consistency
+            // We could do "if (aiClient is Claude)", but actually, this also helps with gpt-4o sometimes
+            // generating never-ending stream of output.
+            append(
+                """
+                |
+                |* You must provide result as a valid JSON object, matching this structure:
+                |
+                | {
+                |   "result": {
+                |     "passed": "",
+                |     "reasoning": ""
+                |   }
+                | }
+                |
+                |DO NOT output any other information in the JSON object.
+                """.trimMargin("|")
+            )
+        }
+
+        if (printPrompt) {
+            println("--- PROMPT START ---")
+            println(prompt)
+            println("--- PROMPT END ---")
+        }
+
+        val aiResponse = aiClient.chatCompletion(
+            prompt,
+            model = aiClient.defaultModel,
+            maxTokens = 4096,
+            identifier = "perform-assertion",
+            imageDetail = "high",
+            images = listOf(screen),
+            jsonSchema = if (aiClient is OpenAI) json.parseToJsonElement(AI.checkAssertion).jsonObject else null,
+        )
+
+        if (printRawResponse) {
+            println("--- RAW RESPONSE START ---")
+            println(aiResponse.response)
+            println("--- RAW RESPONSE END ---")
+        }
+
+        // Both the prompt and checkAssertion_schema.json wrap the verdict in a top-level "result" object,
+        // so unwrap it before decoding. A flat object is accepted too, in case the model omits the wrapper.
+        val root = json.parseToJsonElement(aiResponse.response).jsonObject
+        val payload = root["result"] ?: root
+        return json.decodeFromJsonElement(PerformAssertionResult.serializer(), payload)
+    }
 }
diff --git a/maestro-ai/src/main/resources/askForDefects_schema.json b/maestro-ai/src/main/resources/askForDefects_schema.json
index c375c509e9..008d4063bc 100644
--- a/maestro-ai/src/main/resources/askForDefects_schema.json
+++ b/maestro-ai/src/main/resources/askForDefects_schema.json
@@ -1,6 +1,6 @@
 {
   "name": "askForDefects",
-  "description": "List of possible defects found in the mobile app's UI",
+  "description": "Returns a list of possible defects found in the mobile app's UI",
   "strict": true,
   "schema": {
     "type": "object",
diff --git a/maestro-ai/src/main/resources/checkAssertion_schema.json b/maestro-ai/src/main/resources/checkAssertion_schema.json
new file mode 100644
index 0000000000..2191a9ba9a
--- /dev/null
+++ b/maestro-ai/src/main/resources/checkAssertion_schema.json
@@ -0,0 +1,25 @@
+{
+  "name": "checkAssertion",
+  "description": "Return whether the provided assertion about the mobile app's UI is true",
+  "strict": true,
+  "schema": {
+    "type": "object",
+    "required": ["result"],
+    "additionalProperties": false,
+    "properties": {
+      "result": {
+        "type": "object",
+        "required": ["passed", "reasoning"],
+        "additionalProperties": false,
+        "properties": {
+          "passed": {
+            "type": "boolean"
+          },
+          "reasoning": {
+            "type": "string"
+          }
+        }
+      }
+    }
+  }
+}