Skip to content

Commit

Permalink
improve DemoApp
Browse files Browse the repository at this point in the history
  • Loading branch information
bartekpacia committed Aug 22, 2024
1 parent 9ae8a6d commit 92466ff
Show file tree
Hide file tree
Showing 5 changed files with 74 additions and 62 deletions.
8 changes: 7 additions & 1 deletion maestro-ai/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,14 @@

This project implements AI support for use in Maestro.

It's both a library and a demo-app executable.
It's both a library and an executable demo app.

### Demo app

An API key is required. Set it with the `MAESTRO_CLI_AI_KEY` environment variable. Examples:
- OpenAI: `export MAESTRO_CLI_AI_KEY=sk-...`
- Anthropic: `export MAESTRO_CLI_AI_KEY=sk-ant-api-...`

Build it:

```console
Expand All @@ -17,3 +21,5 @@ then learn how to use it:
```console
./maestro-ai/build/install/maestro-ai-demo/bin/maestro-ai-demo --help
```

Finally, run it:
72 changes: 38 additions & 34 deletions maestro-ai/src/main/java/maestro/ai/DemoApp.kt
Original file line number Diff line number Diff line change
Expand Up @@ -21,13 +21,6 @@ import java.nio.file.Path

// Program entry point: delegates argument parsing and execution to Clikt's
// CliktCommand.main(), which runs DemoApp.run() after parsing options.
fun main(args: Array<String>) = DemoApp().main(args)

// TODO(bartekpacia): Improvement ideas:
// * --only-fail - show only failing test cases
// * --json – to allow for easy filtering with JQ
// * --show-prompts – show prompts that were used
// * Possibility to pass a single screenshot
// Note: maybe instead of building this purpose CLI program, we can use something
// purpose-made for this.
/**
* This is a small helper program to help evaluate LLM results against a directory of screenshots and prompts.
*
Expand Down Expand Up @@ -59,6 +52,8 @@ class DemoApp : CliktCommand() {

private val model: String by option(help = "LLM to use").default("gpt-4o-2024-08-06")

private val showOnlyFails: Boolean by option(help = "Show only failed tests").flag()

private val showPrompts: Boolean by option(help = "Show prompts").flag()

private val showRawResponse: Boolean by option(help = "Show raw LLM response").flag()
Expand All @@ -67,35 +62,42 @@ class DemoApp : CliktCommand() {

private val parallel: Boolean by option(help = "Run in parallel. May get rate limited").flag()

// IDEA: "--json" flag to allow for easy filtering with jq

override fun run() = runBlocking {
val apiKey = System.getenv("MAESTRO_CLI_AI_KEY")
require(apiKey != null) { "OpenAI API key is not provided" }

val testCases = inputFiles.map { it.toFile() }.map { file ->
require(!file.isDirectory) { "Provided file is a directory, not a file" }
require(file.exists()) { "Provided file does not exist" }
require(file.extension == "png") { "Provided file is not a PNG file" }
file
}.map { file ->
val filename = file.nameWithoutExtension
val parts = filename.split("_")
require(parts.size == 3) { "Screenshot name is invalid: ${file.name}" }

val appName = parts[0]
val index = parts[2].toInt()

val promptFile = "${file.parent}/${appName}_${parts[1]}_$index.txt"
println("Prompt file: $promptFile")
val prompt = File(promptFile).run { if (exists()) readText() else null }

TestCase(
screenshot = file,
appName = appName,
hasDefects = parts[1] == "bad",
index = index,
prompt = prompt,
)
}.toList()
require(!file.isDirectory) { "Provided file is a directory, not a file" }
require(file.exists()) { "Provided file does not exist" }
require(file.extension == "png") { "Provided file is not a PNG file" }
file
}.map { file ->
val filename = file.nameWithoutExtension
val parts = filename.split("_")
require(parts.size == 3) { "Screenshot name is invalid: ${file.name}" }

val appName = parts[0]
val index = parts[1].toIntOrNull() ?: throw IllegalArgumentException("Invalid screenshot name: ${file.name}")
val status = parts[2]

val promptFile = "${file.parent}/${appName}_${index}_${status}.txt"
val prompt = File(promptFile).run {
if (exists()) {
println("Found prompt file: $promptFile")
readText()
} else null
}

TestCase(
screenshot = file,
appName = appName,
hasDefects = status == "bad",
index = index,
prompt = prompt,
)
}.toList()

val aiClient: AI = when {
model.startsWith("gpt") -> OpenAI(
Expand Down Expand Up @@ -129,16 +131,16 @@ class DemoApp : CliktCommand() {
verify(testCase, defects)
}

if (parallel) job.await()
if (!parallel) job.await()
}

println("Exited, bye!")
}

private fun verify(testCase: TestCase, defects: List<Defect>) {
if (testCase.hasDefects) {
// Check LLM found defects as well (i.e. didn't commit false negative)
if (defects.isNotEmpty()) {
if (showOnlyFails) return

println(
"""
PASS ${testCase.screenshot.name}: ${defects.size} defects found (as expected)
Expand All @@ -152,6 +154,8 @@ class DemoApp : CliktCommand() {
} else {
// Check that LLM didn't raise false positives
if (defects.isEmpty()) {
if (showOnlyFails) return

println(
"""
PASS ${testCase.screenshot.name}: No defects found (as expected)
Expand Down
49 changes: 25 additions & 24 deletions maestro-ai/src/main/java/maestro/ai/Prediction.kt
Original file line number Diff line number Diff line change
Expand Up @@ -78,29 +78,30 @@ object Prediction {

// Claude doesn't have a JSON mode as of 21-08-2024
// https://docs.anthropic.com/en/docs/test-and-evaluate/strengthen-guardrails/increase-consistency
if (aiClient is Claude) {
append(
"""
|
|* You must provide result as a valid JSON object, matching this structure:
|
| {
| "defects": [
| {
| "category": "<defect category, string>",
| "reasoning": "<reasoning, string>"
| },
| {
| "category": "<defect category, string>",
| "reasoning": "<reasoning, string>"
| }
| ]
| }
|
|DO NOT output any other information in the JSON object.
""".trimMargin("|")
)
}
// We could do "if (aiClient is Claude)", but actually, this also helps with gpt-4o generating
// never-ending stream of output.
append(
"""
|
|* You must provide result as a valid JSON object, matching this structure:
|
| {
| "defects": [
| {
| "category": "<defect category, string>",
| "reasoning": "<reasoning, string>"
| },
| {
| "category": "<defect category, string>",
| "reasoning": "<reasoning, string>"
| }
| ]
| }
|
|DO NOT output any other information in the JSON object.
""".trimMargin("|")
)


if (previousFalsePositives.isNotEmpty()) {
appendLine("Additionally, the following defects are false positives:")
Expand All @@ -109,7 +110,7 @@ object Prediction {
}
}

appendLine("Be brief.")
appendLine("There are usually only a few defects in the screenshot. Don't generate tens of them.")
}

if (printPrompt) {
Expand Down
4 changes: 4 additions & 0 deletions maestro-ai/src/main/java/maestro/ai/antrophic/Client.kt
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ import io.ktor.http.HttpStatusCode
import io.ktor.http.contentType
import io.ktor.http.isSuccess
import io.ktor.util.encodeBase64
import kotlinx.serialization.SerializationException
import kotlinx.serialization.encodeToString
import kotlinx.serialization.json.Json
import kotlinx.serialization.json.JsonObject
Expand Down Expand Up @@ -103,6 +104,9 @@ class Claude(
}

json.decodeFromString<Response>(httpResponse.bodyAsText())
} catch (e: SerializationException) {
logger.error("Failed to parse response from Antrophic", e)
throw e
} catch (e: Exception) {
logger.error("Failed to complete request to Antrophic", e)
throw e
Expand Down
3 changes: 0 additions & 3 deletions maestro-ai/src/main/java/maestro/ai/openai/Client.kt
Original file line number Diff line number Diff line change
Expand Up @@ -104,12 +104,9 @@ class OpenAI(
throw Exception("Failed to complete request to OpenAI: ${httpResponse.status}, $body")
}

print(body)

json.decodeFromString<ChatCompletionResponse>(body)
} catch (e: SerializationException) {
logger.error("Failed to parse response from OpenAI", e)
logger.error("Response body: ${e.message}")
throw e
}
catch (e: Exception) {
Expand Down

0 comments on commit 92466ff

Please sign in to comment.