From 9c9842bfb6f2191b11d097d3259118297f83b5e1 Mon Sep 17 00:00:00 2001 From: Josh Bickett Date: Fri, 19 Jan 2024 08:08:29 -0800 Subject: [PATCH 1/3] Add `initialize_google` and fix `require_api_key` --- operate/config.py | 14 ++++++++++---- operate/models/apis.py | 25 ++++++++++++++++++------- operate/operate.py | 2 +- 3 files changed, 29 insertions(+), 12 deletions(-) diff --git a/operate/config.py b/operate/config.py index e3ca3e49..67b3a55a 100644 --- a/operate/config.py +++ b/operate/config.py @@ -3,6 +3,7 @@ from dotenv import load_dotenv from openai import OpenAI from prompt_toolkit.shortcuts import input_dialog +import google.generativeai as genai class Config: @@ -18,15 +19,19 @@ class Config: def __init__(self): load_dotenv() self.verbose = False - self.openai_api_key = os.getenv("OPENAI_API_KEY", "") - self.google_api_key = os.getenv("GOOGLE_API_KEY", "") def initialize_openai(self): client = OpenAI() - client.api_key = self.openai_api_key + client.api_key = os.getenv("OPENAI_API_KEY") client.base_url = os.getenv("OPENAI_API_BASE_URL", client.base_url) return client + def initialize_google(self): + genai.configure(api_key=os.getenv("GOOGLE_API_KEY"), transport="rest") + model = genai.GenerativeModel("gemini-pro-vision") + + return model + def validation(self, model, voice_mode): """ Validate the input parameters for the dialog operation. @@ -39,7 +44,8 @@ def validation(self, model, voice_mode): ) def require_api_key(self, key_name, key_description, is_required): - if is_required and not getattr(self, key_name.lower()): + key_exists = bool(os.environ.get(key_name)) + if is_required and not key_exists: self.prompt_and_save_api_key(key_name, key_description) def prompt_and_save_api_key(self, key_name, key_description): diff --git a/operate/models/apis.py b/operate/models/apis.py index c1f94e30..a9f95912 100644 --- a/operate/models/apis.py +++ b/operate/models/apis.py @@ -8,7 +8,7 @@ from PIL import Image from ultralytics import YOLO -import google.generativeai as genai + from operate.config import Config from operate.exceptions import ModelNotRecognizedException from operate.utils.screenshot import ( @@ -35,9 +35,14 @@ # Load configuration VERBOSE = Config().verbose +config = Config() +client = config.initialize_openai() async def get_next_action(model, messages, objective, session_id): + if VERBOSE: + print("[Self-Operating Computer][get_next_action]") + print("[Self-Operating Computer][get_next_action] model", model) if model == "gpt-4": return call_gpt_4_vision_preview(messages), None if model == "gpt-4-with-som": @@ -52,8 +57,6 @@ async def get_next_action(model, messages, objective, session_id): def call_gpt_4_vision_preview(messages): - config = Config() - client = config.initialize_openai() if VERBOSE: print("[Self Operating Computer][get_next_action][call_gpt_4_v]") time.sleep(1) @@ -137,7 +140,10 @@ def call_gemini_pro_vision(messages, objective): """ Get the next action for Self-Operating Computer using Gemini Pro Vision """ - config = Config() + if VERBOSE: + print( + "[Self Operating Computer][call_gemini_pro_vision]", + ) # sleep for a second time.sleep(1) try: @@ -152,11 +158,18 @@ def call_gemini_pro_vision(messages, objective): time.sleep(1) prompt = get_system_prompt(objective) - model = genai.GenerativeModel("gemini-pro-vision") + model = config.initialize_google() + if VERBOSE: + print("[Self Operating Computer][call_gemini_pro_vision] model", model) response = model.generate_content([prompt, Image.open(screenshot_filename)]) content = response.text[1:] + if VERBOSE: + print( + "[Self Operating Computer][call_gemini_pro_vision] response", response + ) + print("[Self Operating Computer][call_gemini_pro_vision] content", content) content = json.loads(content) if VERBOSE: @@ -176,8 +189,6 @@ def call_gemini_pro_vision(messages, objective): async def call_gpt_4_vision_preview_labeled(messages, objective): - config = Config() - client = config.initialize_openai() time.sleep(1) try: yolo_model = YOLO("./operate/models/weights/best.pt") # Load your trained model diff --git a/operate/operate.py b/operate/operate.py index 8050956e..da1963d5 100644 --- a/operate/operate.py +++ b/operate/operate.py @@ -45,6 +45,7 @@ def main(model, terminal_prompt, voice_mode=False): Returns: None """ + mic = None # Initialize `WhisperMic`, if `voice_mode` is True @@ -109,7 +110,6 @@ def main(model, terminal_prompt, voice_mode=False): while True: if VERBOSE: - print("[Self Operating Computer]") print("[Self Operating Computer] loop_count", loop_count) try: operations, session_id = asyncio.run( From 9ba17f4ba0389e8c6d48fc2f52f4978163b9a974 Mon Sep 17 00:00:00 2001 From: Josh Bickett Date: Fri, 19 Jan 2024 08:11:31 -0800 Subject: [PATCH 2/3] fix duplicate `save_api_key_to_env` call --- operate/config.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/operate/config.py b/operate/config.py index 67b3a55a..f476837b 100644 --- a/operate/config.py +++ b/operate/config.py @@ -61,11 +61,6 @@ def prompt_and_save_api_key(self, key_name, key_description): load_dotenv() # Reload environment variables # Update the instance attribute with the new key - if key_value: - self.save_api_key_to_env(key_name, key_value) - load_dotenv() # Reload environment variables - setattr(self, key_name.lower(), key_value) - @staticmethod def save_api_key_to_env(key_name, key_value): with open(".env", "a") as file: From 4c61db46538ff71b3caaddf309ea3ae2ddfa8524 Mon Sep 17 00:00:00 2001 From: Josh Bickett Date: Fri, 19 Jan 2024 08:25:25 -0800 Subject: [PATCH 3/3] Fix `initialize_openai` issue --- operate/config.py | 5 +++++ operate/models/apis.py | 3 ++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/operate/config.py b/operate/config.py index f476837b..290e0fa3 100644 --- a/operate/config.py +++ b/operate/config.py @@ -45,6 +45,11 @@ def validation(self, model, voice_mode): def require_api_key(self, key_name, key_description, is_required): key_exists = bool(os.environ.get(key_name)) + if self.verbose: + print("[Config] require_api_key") + print("[Config] key_name", key_name) + print("[Config] key_description", key_description) + print("[Config] key_exists", key_exists) if is_required and not key_exists: self.prompt_and_save_api_key(key_name, key_description) diff --git a/operate/models/apis.py b/operate/models/apis.py index a9f95912..a99f2b80 100644 --- a/operate/models/apis.py +++ b/operate/models/apis.py @@ -36,7 +36,6 @@ # Load configuration VERBOSE = Config().verbose config = Config() -client = config.initialize_openai() async def get_next_action(model, messages, objective, session_id): @@ -60,6 +59,7 @@ def call_gpt_4_vision_preview(messages): if VERBOSE: print("[Self Operating Computer][get_next_action][call_gpt_4_v]") time.sleep(1) + client = config.initialize_openai() try: screenshots_dir = "screenshots" if not os.path.exists(screenshots_dir): @@ -190,6 +190,7 @@ def call_gemini_pro_vision(messages, objective): async def call_gpt_4_vision_preview_labeled(messages, objective): time.sleep(1) + client = config.initialize_openai() try: yolo_model = YOLO("./operate/models/weights/best.pt") # Load your trained model screenshots_dir = "screenshots"