
Feature/maestro ai #1906

Merged: 53 commits, Aug 29, 2024

Commits
a7b5969
implement a basic `assertVisualAI` command
bartekpacia Aug 9, 2024
3916aff
MaestroCommandRunner: remove mini-cleanup
bartekpacia Aug 12, 2024
e58ceb9
Orchestra: fix nasty typo
bartekpacia Aug 12, 2024
0d7f025
basic done
bartekpacia Aug 12, 2024
8237e63
work on saving files
bartekpacia Aug 13, 2024
2b29c53
saving LLM output: text done, screenshots WIP
bartekpacia Aug 14, 2024
d089f0c
more wip
bartekpacia Aug 15, 2024
fc57417
Merge branch 'main' into feature/maestro_ai
bartekpacia Aug 16, 2024
d23a25f
fix screenshots taken by `assertVisualAI` being empty
bartekpacia Aug 16, 2024
0f7b418
make assertVisualAI save screenshots when running many flows at once
bartekpacia Aug 16, 2024
b76d5e1
Orchestra; create sealed class CommandOutput
bartekpacia Aug 16, 2024
c7e3c8f
generate HTML output
bartekpacia Aug 19, 2024
33f0c84
display possible defect count
bartekpacia Aug 19, 2024
1c0072b
take assertion into account
bartekpacia Aug 20, 2024
18345a3
App: add debug output from HTML AI reporter
bartekpacia Aug 20, 2024
f2e9db6
generate nicer HTML
bartekpacia Aug 20, 2024
b928f67
styling improvements
bartekpacia Aug 20, 2024
19a60c4
introduce TestDebugReporter.saveSuggestions()
bartekpacia Aug 20, 2024
0dca65e
more UI improvements
bartekpacia Aug 20, 2024
85f46c6
fixie
bartekpacia Aug 20, 2024
b264371
extract AI features from maestro-orchestra into maestro-ai
bartekpacia Aug 21, 2024
a49ed5a
add AI client for antrophic
bartekpacia Aug 21, 2024
c73aedf
use clikt
bartekpacia Aug 21, 2024
e88635b
improve Claude prompt
bartekpacia Aug 21, 2024
dde135a
more prompt impros
bartekpacia Aug 21, 2024
cbad634
HTML AI reporter: load <script/> and <style/> from jar resources
bartekpacia Aug 21, 2024
c50192f
HTML AI report: improve output
bartekpacia Aug 22, 2024
9ae8a6d
fix wrong gradle dep
bartekpacia Aug 22, 2024
92466ff
improve DemoApp
bartekpacia Aug 22, 2024
9726ea8
formatting changes
bartekpacia Aug 22, 2024
6ccf1b0
AnsiResultView: change SKIPPED icon (⚪ -> ⚠️ )️
bartekpacia Aug 22, 2024
8a5a418
code discovery (add comments, renames)
bartekpacia Aug 22, 2024
14ffa2a
add support for selecting model with env var
bartekpacia Aug 23, 2024
87a46f7
MaestroFlowRunner -> TestRunner
bartekpacia Aug 23, 2024
0fb5550
Merge branch 'main' into feature/maestro_ai
bartekpacia Aug 23, 2024
c706d48
Merge branch 'main' into feature/maestro_ai
bartekpacia Aug 23, 2024
8ecdd8b
Merge branch 'main' into feature/maestro_ai
bartekpacia Aug 23, 2024
d9e060e
add Xjdk-release param
bartekpacia Aug 23, 2024
6f041c9
split `assertVisualAI` into `assertNoDefectsWithAI` and `assertWithAI`
bartekpacia Aug 27, 2024
c0b69c5
implement the new commands
bartekpacia Aug 27, 2024
ba69273
finish adding the 2 new commands
bartekpacia Aug 27, 2024
ba04550
delete obsolete comments
bartekpacia Aug 28, 2024
2ef50d5
move `askForDefectsSchema` from `AI` to `Prediction`
bartekpacia Aug 28, 2024
df6a274
DemoApp: update comment to reflect updated photo naming schema
bartekpacia Aug 28, 2024
78c04a3
DemoApp: specify output format
bartekpacia Aug 28, 2024
674a717
DemoApp: add more command invocation examples
bartekpacia Aug 28, 2024
c2e8623
update comment to make more sense
bartekpacia Aug 29, 2024
f848f31
delete scaffold code for `previousFalsePositives` feature, we are not…
bartekpacia Aug 29, 2024
b8e08d7
update comment
bartekpacia Aug 29, 2024
0bfac77
make HttpClient a dependency of AI clients
bartekpacia Aug 29, 2024
a927e05
remove debug printlns
bartekpacia Aug 29, 2024
8668cad
add link to issue
bartekpacia Aug 29, 2024
507afea
improve CLI logs when AI assertion evaluates to false
bartekpacia Aug 29, 2024
5 changes: 5 additions & 0 deletions gradle/libs.versions.toml
@@ -47,6 +47,7 @@ wiremock = "2.35.0"
logback = "1.2.6"
coroutines = "1.8.0"
kotlinx-html = "0.8.0"
clikt = "4.2.2"

[libraries]
kotlinx-coroutines-core = { module = "org.jetbrains.kotlinx:kotlinx-coroutines-core", version.ref = "coroutines" }
@@ -87,11 +88,14 @@ junit-jupiter-api = { module = "org.junit.jupiter:junit-jupiter-api", version.ref = "junit" }
junit-jupiter-engine = { module = "org.junit.jupiter:junit-jupiter-engine", version.ref = "junit" }
junit-jupiter-params = { module = "org.junit.jupiter:junit-jupiter-params", version.ref = "junit" }
kotlin-result = { module = "com.michael-bull.kotlin-result:kotlin-result", version.ref = "kotlinResult" }
clikt = { module = "com.github.ajalt.clikt:clikt", version.ref = "clikt" }
ktor-client-cio = { module = "io.ktor:ktor-client-cio", version.ref = "ktor" }
ktor-client-core = { module = "io.ktor:ktor-client-core", version.ref = "ktor" }
ktor-serial-gson = { module = "io.ktor:ktor-serialization-gson", version.ref = "ktor" }
ktor-serial-json = { module = "io.ktor:ktor-serialization-kotlinx-json", version.ref = "ktor" }
ktor-server-cio = { module = "io.ktor:ktor-server-cio", version.ref = "ktor" }
ktor-server-content-negotiation = { module = "io.ktor:ktor-server-content-negotiation", version.ref = "ktor" }
ktor-client-content-negotiation = { module = "io.ktor:ktor-client-content-negotiation", version.ref = "ktor" }
ktor-server-core = { module = "io.ktor:ktor-server-core", version.ref = "ktor" }
ktor-server-cors = { module = "io.ktor:ktor-server-cors", version.ref = "ktor" }
ktor-server-netty = { module = "io.ktor:ktor-server-netty", version.ref = "ktor" }
@@ -116,6 +120,7 @@ detekt = { id = "io.gitlab.arturbosch.detekt", version.ref = "detekt" }
protobuf = { id = "com.google.protobuf", version.ref = "googleProtobufPlugin" }
kotlin-jvm = { id = "org.jetbrains.kotlin.jvm", version.ref = "kotlin" }
kotlin-android = { id = "org.jetbrains.kotlin.android", version.ref = "kotlin" }
kotlin-serialization = { id = "org.jetbrains.kotlin.plugin.serialization", version.ref = "kotlin" }
mavenPublish = { id = "com.vanniktech.maven.publish", version = "0.19.0" }
jreleaser = { id = "org.jreleaser", version = "1.13.1" }
shadow = { id = "com.github.johnrengelman.shadow", version = "7.1.2" }
41 changes: 41 additions & 0 deletions maestro-ai/README.md
@@ -0,0 +1,41 @@
# maestro-ai

This project implements AI support for use in Maestro.

It's both a library and an executable demo app.

## Demo app

An API key is required. Set it with the `MAESTRO_CLI_AI_KEY` env var. Examples:

- OpenAI: `export MAESTRO_CLI_AI_KEY=sk-...`
- Anthropic: `export MAESTRO_CLI_AI_KEY=sk-ant-api-...`

### Build

```console
./gradlew :maestro-ai:installDist
```

The startup script will be generated in `./maestro-ai/build/install/maestro-ai-demo/bin/maestro-ai-demo`.

### How to use

First of all, try out the `--help` flag.

Run a test for a single screenshot that contains defects (i.e. is bad):

```console
maestro-ai-demo foo_1_bad.png
```

Run tests for all Uber screenshots that contain defects (i.e. are bad), additionally showing the prompts and the raw LLM response:

```console
maestro-ai-demo \
--model gpt-4o-2024-08-06 \
--show-prompts \
--show-raw-response \
test-ai-fixtures/uber_*_bad.png
```
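The fixture naming convention used above (`{app_name}_{index}_{good|bad}.png`) can be sketched as a small parser. This is an illustrative standalone snippet, not code from this PR; the `Fixture` and `parseFixtureName` names are hypothetical:

```kotlin
// Parses the demo fixture naming convention: {app_name}_{index}_{good|bad}.png
data class Fixture(val appName: String, val index: Int, val shouldPass: Boolean)

fun parseFixtureName(fileName: String): Fixture {
    val parts = fileName.removeSuffix(".png").split("_")
    require(parts.size == 3) { "Invalid fixture name: $fileName" }
    val index = parts[1].toIntOrNull()
        ?: throw IllegalArgumentException("Invalid index in fixture name: $fileName")
    require(parts[2] == "good" || parts[2] == "bad") { "Invalid status in fixture name: $fileName" }
    // A "good" screenshot has no defects, so the AI check is expected to pass.
    return Fixture(appName = parts[0], index = index, shouldPass = parts[2] == "good")
}
```

For example, `parseFixtureName("foo_1_bad.png")` yields a fixture for app `foo`, index 1, that is expected to fail the defect check.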
54 changes: 54 additions & 0 deletions maestro-ai/build.gradle.kts
@@ -0,0 +1,54 @@
import org.jetbrains.kotlin.gradle.tasks.KotlinCompilationTask

plugins {
application
id("maven-publish")
alias(libs.plugins.kotlin.jvm)
alias(libs.plugins.kotlin.serialization)
alias(libs.plugins.mavenPublish)
}

application {
applicationName = "maestro-ai-demo"
mainClass.set("maestro.ai.DemoAppKt")
}

tasks.named<Jar>("jar") {
manifest {
attributes["Main-Class"] = "maestro.ai.DemoAppKt"
}
}

dependencies {
api(libs.kotlin.result)
api(libs.square.okio)

api(libs.slf4j)
api(libs.logback) {
exclude(group = "org.slf4j", module = "slf4j-api")
}

api(libs.ktor.client.core)
implementation(libs.ktor.client.cio)
implementation(libs.ktor.serial.json)
implementation(libs.ktor.client.content.negotiation)
implementation(libs.kotlinx.coroutines.core)
implementation(libs.clikt)

testImplementation(libs.junit.jupiter.api)
testRuntimeOnly(libs.junit.jupiter.engine)
testImplementation(libs.google.truth)
testImplementation(libs.square.mock.server)
testImplementation(libs.junit.jupiter.params)
}

java {
sourceCompatibility = JavaVersion.VERSION_1_8
targetCompatibility = JavaVersion.VERSION_1_8
}

tasks.named("compileKotlin", KotlinCompilationTask::class.java) {
compilerOptions {
freeCompilerArgs.addAll("-Xjdk-release=1.8")
}
}
3 changes: 3 additions & 0 deletions maestro-ai/gradle.properties
@@ -0,0 +1,3 @@
POM_NAME=Maestro AI
POM_ARTIFACT_ID=maestro-ai
POM_PACKAGING=jar
59 changes: 59 additions & 0 deletions maestro-ai/src/main/java/maestro/ai/AI.kt
@@ -0,0 +1,59 @@
package maestro.ai

import io.ktor.client.HttpClient
import io.ktor.client.plugins.HttpTimeout
import io.ktor.client.plugins.contentnegotiation.ContentNegotiation
import kotlinx.serialization.json.Json
import kotlinx.serialization.json.JsonObject
import java.io.Closeable

data class CompletionData(
val prompt: String,
val model: String,
val temperature: Float,
val maxTokens: Int,
val images: List<String>,
val response: String,
)

abstract class AI(
val defaultModel: String,
protected val httpClient: HttpClient,
) : Closeable {

/**
* Chat completion with the AI model.
*
* Caveats:
* - `jsonSchema` is only supported by OpenAI ("Structured Outputs" feature)
*/
abstract suspend fun chatCompletion(
prompt: String,
images: List<ByteArray> = listOf(),
temperature: Float? = null,
model: String? = null,
maxTokens: Int? = null,
imageDetail: String? = null,
identifier: String? = null,
jsonSchema: JsonObject? = null,
): CompletionData

companion object {
const val AI_KEY_ENV_VAR = "MAESTRO_CLI_AI_KEY"
const val AI_MODEL_ENV_VAR = "MAESTRO_CLI_AI_MODEL"

val defaultHttpClient = HttpClient {
install(ContentNegotiation) {
Json {
ignoreUnknownKeys = true
}
}

install(HttpTimeout) {
connectTimeoutMillis = 10000
socketTimeoutMillis = 60000
requestTimeoutMillis = 60000
}
}
}
}
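To make the `chatCompletion` contract concrete, here is a hedged, standalone sketch of how a subclass might satisfy it, mirrored without the Ktor and coroutines machinery (the real method is `suspend` and also takes images, temperature, max tokens, and an optional JSON schema; `ChatAI` and `CannedAI` are illustrative names, not classes from this PR):

```kotlin
// Standalone, simplified mirror of the AI.chatCompletion contract.
data class CompletionData(val prompt: String, val model: String, val response: String)

abstract class ChatAI(val defaultModel: String) {
    abstract fun chatCompletion(prompt: String, model: String? = null): CompletionData
}

// A canned test double: always returns a fixed response, falling back to
// the default model when none is requested (as the real clients do).
class CannedAI(
    defaultModel: String = "canned-model",
    private val cannedResponse: String = """{"defects": []}""",
) : ChatAI(defaultModel) {
    override fun chatCompletion(prompt: String, model: String?): CompletionData =
        CompletionData(prompt = prompt, model = model ?: defaultModel, response = cannedResponse)
}
```

A double like this lets callers of the library exercise prompt construction and response parsing without hitting a paid API.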
194 changes: 194 additions & 0 deletions maestro-ai/src/main/java/maestro/ai/DemoApp.kt
@@ -0,0 +1,194 @@
package maestro.ai

import com.github.ajalt.clikt.core.CliktCommand
import com.github.ajalt.clikt.parameters.arguments.argument
import com.github.ajalt.clikt.parameters.arguments.multiple
import com.github.ajalt.clikt.parameters.options.default
import com.github.ajalt.clikt.parameters.options.flag
import com.github.ajalt.clikt.parameters.options.option
import com.github.ajalt.clikt.parameters.types.float
import com.github.ajalt.clikt.parameters.types.path
import kotlinx.coroutines.async
import kotlinx.coroutines.runBlocking
import maestro.ai.antrophic.Claude
import maestro.ai.openai.OpenAI
import java.io.File
import java.nio.file.Path


fun main(args: Array<String>) = DemoApp().main(args)

/**
* This is a small helper program to help evaluate LLM results against a directory of screenshots and prompts.
*
* ### Input format
[Review thread]

Collaborator: What should the expected output be? It would be helpful to document that as well.

Contributor Author: Added expected output.

Also, I think we should add a link where the fixtures can be downloaded from; otherwise the barrier to entry for this DemoApp is very high, because of the need to take the screenshots manually.

Currently the fixtures dataset I use is from https://github.com/mobile-dev-inc/copilot/pull/188.

I will upload that dataset to GCP and paste the link here. Do you see any problem with it? I think the customer apps that are in https://github.com/mobile-dev-inc/copilot/pull/188 are okay with that?

Collaborator: Let's not publish the GCS link for now; you can make it work for any storage link. Let's write an internal runbook for this in case someone from us wants to evaluate. This repository should ideally have access to general data (from open-source apps) instead of customer apps.

Contributor Author: Good points, though I don't agree.

I'd prefer to have this dataset public: internal runbooks tend to rot, and no one uses them unless really needed. But if we made this testing dataset public, anybody could play around with LLM outputs and submit a PR that improves them.

If this requires removing apps from our customers, and using screenshots from more popular, well-known apps instead (like Uber or Bolt), then I still think it's worth the effort. We should strive to make working on Maestro possible and easy for people outside of @mobile-dev-inc.

PS 1: The perfect situation would be generating screenshots for each app with takeScreenshot, but I think it's too much work to automate this, and what we have now is enough.

Contributor Author: That said: I will take no action for now.

*
* Screenshot name format:
* - {app_name}_{screenshot_number}_{good|bad}.png
*
* A screenshot can optionally have a prompt. In this case, the model will treat the prompt as the assertion command.
* To associate a prompt with a screenshot, prompt text file name must have
* the following format:
* - {app_name}_{screenshot_number}_{good|bad}.txt
*
* For example:
* - foo_1_bad.png
* - bar_2_good.png
*
* ### Output format
*
* The output for a single screenshot indicates PASS or FAIL, the screenshot name, and the defects found (if any).
*
* For example:
*
* ```text
* PASS uber_2_bad.png: 1 defects found (as expected)
* * layout: The prompt for entering a verification code is visible, indicating that the 2-factor authentication process is present. The screen instructs the user to enter a verification code generated for Uber, which is a typical 2-factor authentication step.
* ```
*
* Some of the flags change the output format.
*/
class DemoApp : CliktCommand() {
private val inputFiles: List<Path> by argument(help = "screenshots to use").path(mustExist = true).multiple()

private val model: String by option(help = "LLM to use").default("gpt-4o-2024-08-06")

private val showOnlyFails: Boolean by option(help = "Show only failed tests").flag()

private val showPrompts: Boolean by option(help = "Show prompts").flag()

private val showRawResponse: Boolean by option(help = "Show raw LLM response").flag()

private val temperature: Float by option(help = "Temperature for LLM").float().default(0.2f)

private val parallel: Boolean by option(help = "Run in parallel. May get rate limited").flag()

// IDEA: "--json" flag to allow for easy filtering with jq

override fun run() = runBlocking {
val apiKey = System.getenv("MAESTRO_CLI_AI_KEY")
require(apiKey != null) { "OpenAI API key is not provided" }

val testCases = inputFiles.map { it.toFile() }.map { file ->
require(!file.isDirectory) { "Provided file is a directory, not a file" }
require(file.exists()) { "Provided file does not exist" }
require(file.extension == "png") { "Provided file is not a PNG file" }
file
}.map { file ->
val filename = file.nameWithoutExtension
val parts = filename.split("_")
require(parts.size == 3) { "Screenshot name is invalid: ${file.name}" }

val appName = parts[0]
val index =
parts[1].toIntOrNull() ?: throw IllegalArgumentException("Invalid screenshot name: ${file.name}")
val status = parts[2]

val promptFile = "${file.parent}/${appName}_${index}_${status}.txt"
val prompt = File(promptFile).run {
if (exists()) {
println("Found prompt file: $promptFile")
readText()
} else null
}

TestCase(
screenshot = file,
appName = appName,
shouldPass = status == "good",
index = index,
prompt = prompt,
)
}.toList()

val aiClient: AI = when {
model.startsWith("gpt") -> OpenAI(
apiKey = apiKey,
defaultModel = model,
defaultTemperature = temperature,
)

model.startsWith("claude") -> Claude(
apiKey = apiKey,
defaultModel = model,
defaultTemperature = temperature,
)

else -> throw IllegalArgumentException("Unknown model: $model")
}

testCases.forEach { testCase ->
val bytes = testCase.screenshot.readBytes()

val job = async {
val defects = if (testCase.prompt == null) Prediction.findDefects(
aiClient = aiClient,
screen = bytes,
printPrompt = showPrompts,
printRawResponse = showRawResponse,
) else {
val result = Prediction.performAssertion(
aiClient = aiClient,
screen = bytes,
assertion = testCase.prompt,
printPrompt = showPrompts,
printRawResponse = showRawResponse,
)

if (result == null) emptyList()
else listOf(result)
}

verify(testCase, defects)
}

if (!parallel) job.await()
}
}

private fun verify(testCase: TestCase, defects: List<Defect>) {
if (!testCase.shouldPass) {
// Check if LLM found defects (i.e. didn't commit false negative)
if (defects.isNotEmpty()) {
if (showOnlyFails) return

println(
"""
PASS ${testCase.screenshot.name}: ${defects.size} defects found (as expected)
${defects.joinToString("\n") { "\t* ${it.category}: ${it.reasoning}" }}
""".trimIndent()
)
} else {
println("FAIL ${testCase.screenshot.name} false-negative: No defects found but some were expected")
}

} else {
// Check that LLM didn't raise false positives
if (defects.isEmpty()) {
if (showOnlyFails) return

println(
"""
PASS ${testCase.screenshot.name}: No defects found (as expected)
""".trimIndent()
)
} else {
println(
"""
FAIL ${testCase.screenshot.name} false-positive: ${defects.size} defects found but none were expected
${defects.joinToString("\n") { "\t* ${it.category}: ${it.reasoning}" }}
""".trimIndent()
)
}
}
}
}

data class TestCase(
val screenshot: File,
val appName: String,
val prompt: String?,
val shouldPass: Boolean,
val index: Int,
)
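The `verify` method above reduces to a small truth table over the pair (expected to pass, defects found). As a standalone sketch with illustrative names (not code from this PR):

```kotlin
// Standalone sketch of DemoApp.verify's classification logic: a test passes
// when the AI's verdict matches the fixture's expectation.
enum class Verdict { PASS, FALSE_NEGATIVE, FALSE_POSITIVE }

fun classify(shouldPass: Boolean, defectsFound: Boolean): Verdict = when {
    !shouldPass && defectsFound -> Verdict.PASS             // bad screen, defects reported
    !shouldPass && !defectsFound -> Verdict.FALSE_NEGATIVE  // bad screen, nothing reported
    shouldPass && !defectsFound -> Verdict.PASS             // good screen, nothing reported
    else -> Verdict.FALSE_POSITIVE                          // good screen, defects reported
}
```

Separating the classification from the printing like this would also make the pass/fail logic directly unit-testable.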