From 5cc4dd4cabad770e5548d278a052242d2525ee10 Mon Sep 17 00:00:00 2001 From: Jeremy D <115047575+bmosaicml@users.noreply.github.com> Date: Thu, 14 Dec 2023 13:06:59 -0600 Subject: [PATCH] update openai wrapper to work with tiktoken interface and newest openai version (#794) * update openai wrapper to work with tiktoken interface * update openai wrapper to work with tiktoken interface * add deprecation note * fix completion endpoint * update to newest openai version * monkey patch api key * fix type * fix issues * fix issues * edit * fix typing * openai --------- Co-authored-by: Daniel King <43149077+dakinggg@users.noreply.github.com> Co-authored-by: Max Marion --- .../models/inference_api_wrapper/interface.py | 7 +- .../inference_api_wrapper/openai_causal_lm.py | 88 +++++++----- mcli/mcli-openai-eval.yaml | 43 ++---- .../{lm_tasks.yaml => lm_tasks_v0.2.yaml} | 42 +++--- scripts/eval/yamls/openai_eval.yaml | 30 ++-- setup.py | 2 +- .../test_inference_api_eval_wrapper.py | 134 ++++++++++-------- 7 files changed, 164 insertions(+), 182 deletions(-) rename scripts/eval/yamls/{lm_tasks.yaml => lm_tasks_v0.2.yaml} (69%) diff --git a/llmfoundry/models/inference_api_wrapper/interface.py b/llmfoundry/models/inference_api_wrapper/interface.py index 2d84599772..9d0ce7deb3 100644 --- a/llmfoundry/models/inference_api_wrapper/interface.py +++ b/llmfoundry/models/inference_api_wrapper/interface.py @@ -39,8 +39,7 @@ def __init__(self, model_cfg: Dict, tokenizer: AutoTokenizer): def get_metrics(self, is_train: bool = False): if is_train: - raise NotImplementedError( - 'You cannot use inference wrappers for training') + metrics = None else: metrics = self.eval_metrics @@ -55,6 +54,7 @@ def rebatch(self, batch: Batch): return batch def eval_forward(self, batch: Batch, outputs: Optional[Any] = None): + padding_tok = self.tokenizer.pad_token_id if self.tokenizer.pad_token_id else self.tokenizer.eos_token_id # If the batch mode is generate, we will generate a requested number of tokens using the underlying # model's generate function. Extra generation kwargs can be passed in via the batch. 
Strings will # be returned from eval_forward @@ -80,8 +80,7 @@ def eval_forward(self, batch: Batch, outputs: Optional[Any] = None): [output_logits, next_logit_tensor.reshape(1, -1)]) padding = torch.nn.functional.one_hot( - torch.full((seqlen - output_logits.shape[0],), - self.tokenizer.pad_token_id), + torch.full((seqlen - output_logits.shape[0],), padding_tok), num_classes=self.tokenizer.vocab_size) output_logits = torch.cat([output_logits, padding]) output_logits_batch.append(output_logits) diff --git a/llmfoundry/models/inference_api_wrapper/openai_causal_lm.py b/llmfoundry/models/inference_api_wrapper/openai_causal_lm.py index 609112b944..7257b98bd8 100644 --- a/llmfoundry/models/inference_api_wrapper/openai_causal_lm.py +++ b/llmfoundry/models/inference_api_wrapper/openai_causal_lm.py @@ -5,8 +5,9 @@ import logging import os +import random from time import sleep -from typing import Any, Dict, List, Optional, Union +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union import torch from composer.core.types import Batch @@ -22,6 +23,9 @@ 'OpenAICausalLMEvalWrapper', 'OpenAIChatAPIEvalWrapper', ] +from openai.types.chat.chat_completion import ChatCompletion +from openai.types.completion import Completion +from openai.types.completion_choice import Logprobs MAX_RETRIES = 10 @@ -30,6 +34,9 @@ class OpenAIEvalInterface(InferenceAPIEvalWrapper): def __init__(self, model_cfg: Dict, tokenizer: AutoTokenizer) -> None: super().__init__(model_cfg, tokenizer) + assert os.getenv( + 'OPENAI_API_KEY' + ) is not None, 'No OpenAI API Key found. Ensure it is saved as an environmental variable called OPENAI_API_KEY.' try: import openai except ImportError as e: @@ -37,13 +44,13 @@ def __init__(self, model_cfg: Dict, tokenizer: AutoTokenizer) -> None: extra_deps_group='openai', conda_package='openai', conda_channel='conda-forge') from e - openai.api_key = os.getenv('OPENAI_API_KEY') + self.client = openai.OpenAI() self.model_name = model_cfg['version'] def generate_completion(self, prompt: str, num_tokens: int): raise NotImplementedError() - def process_result(self, completion: Optional[dict]): + def process_result(self, completion): # pyright: ignore raise NotImplementedError() def get_next_token_logit_tensor(self, prompt: str, num_tokens: int = 1): @@ -52,7 +59,7 @@ def get_next_token_logit_tensor(self, prompt: str, num_tokens: int = 1): def try_generate_completion(self, prompt: str, num_tokens: int): try: - from openai.error import RateLimitError + from openai import APITimeoutError, RateLimitError except ImportError as e: raise MissingConditionalImportError( extra_deps_group='openai', @@ -60,19 +67,24 @@ def try_generate_completion(self, prompt: str, num_tokens: int): conda_channel='conda-forge') from e tries = 0 completion = None + delay = 1 while tries < MAX_RETRIES: tries += 1 try: - completion = self.generate_completion(prompt, num_tokens) break except RateLimitError as e: - if 'You exceeded your current quota' in str(e._message): + if 'You exceeded your current quota' in str( + e._message): # pyright: ignore raise e - sleep(60) + delay *= 2 * (1 + random.random()) + sleep(delay) continue - except Exception: + except APITimeoutError as e: + delay *= 2 * (1 + random.random()) + sleep(delay) continue + return completion @@ -80,17 +92,16 @@ class OpenAIChatAPIEvalWrapper(OpenAIEvalInterface): def __init__(self, model_cfg: Dict, tokenizer: AutoTokenizer) -> None: super().__init__(model_cfg, tokenizer) - try: - import openai - except ImportError as e: - raise MissingConditionalImportError( 
- extra_deps_group='openai', - conda_package='openai', - conda_channel='conda-forge') from e - self.generate_completion = lambda prompt, num_tokens: openai.ChatCompletion.create( - self.model_name, + self.generate_completion = lambda prompt, num_tokens: self.client.chat.completions.create( + model=self.model_name, messages=[{ + 'role': + 'system', + 'content': + model_cfg.get('system_role_prompt', + 'Please complete the following text: ') + }, { 'role': 'user', 'content': prompt }], @@ -162,6 +173,7 @@ def eval_forward(self, batch: Batch, outputs: Optional[Any] = None): # than what the continuation would expect. # Get around this issue by retokenizing the batch to remove spacing from the continuation as well as # decoding the whole continuation at once. + padding_tok = self.tokenizer.pad_token_id if self.tokenizer.pad_token_id else self.tokenizer.eos_token_id output_logits_batch = [] batch = self.rebatch(batch) for tokens, cont_idxs in zip(batch['input_ids'], @@ -182,20 +194,21 @@ def eval_forward(self, batch: Batch, outputs: Optional[Any] = None): if next_logit_tensor is not None: output_logits = torch.cat([output_logits, next_logit_tensor]) padding = torch.nn.functional.one_hot( - torch.full((seqlen - output_logits.shape[0],), - self.tokenizer.pad_token_id), + torch.full((seqlen - output_logits.shape[0],), padding_tok), num_classes=self.tokenizer.vocab_size) output_logits = torch.cat([output_logits, padding]) output_logits_batch.append(output_logits) return torch.stack(output_logits_batch).to(batch['input_ids'].device) - def process_result(self, completion: Optional[dict]): - assert isinstance(completion, dict) - if len(completion['choices']) > 0: + def process_result(self, completion: Optional[ChatCompletion]): + if completion is None: + raise ValueError("Couldn't generate model output") + + if len(completion.choices) > 0: tensors = [] - for t in self.tokenizer(completion['choices'][0]['message'] - ['content'])['input_ids']: + for t in self.tokenizer( + completion.choices[0].message.content)['input_ids']: tensors.append( self.tokenizer.construct_logit_tensor( {self.tokenizer.decode([t]): 0.0})) @@ -213,29 +226,26 @@ class OpenAICausalLMEvalWrapper(OpenAIEvalInterface): def __init__(self, model_cfg: Dict, tokenizer: AutoTokenizer) -> None: super().__init__(model_cfg, tokenizer) - try: - import openai - except ImportError as e: - raise MissingConditionalImportError( - extra_deps_group='openai', - conda_package='openai', - conda_channel='conda-forge') from e - - self.generate_completion = lambda prompt, num_tokens: openai.Completion.create( - engine=self.model_name, + # TODO: this will be deprecated + self.generate_completion = lambda prompt, num_tokens: self.client.completions.create( + model=self.model_name, prompt=prompt, - max_tokens=1, + max_tokens=num_tokens, logprobs=5, temperature=0.0) - def process_result(self, completion: Optional[dict]): + def process_result(self, completion: Optional[Completion]): if completion is None: raise ValueError("Couldn't generate model output") - assert isinstance(completion, dict) - if len(completion['choices'][0]['logprobs']['top_logprobs']) > 0: + if TYPE_CHECKING: + assert isinstance(completion, Completion) + assert isinstance(completion.choices[0].logprobs, Logprobs) + assert isinstance(completion.choices[0].logprobs.top_logprobs, list) + + if len(completion.choices[0].logprobs.top_logprobs[0]) > 0: tensor = self.tokenizer.construct_logit_tensor( -
dict(completion.choices[0].logprobs.top_logprobs[0])) return tensor else: # the model sometimes stops early even though we are still requesting tokens! diff --git a/mcli/mcli-openai-eval.yaml b/mcli/mcli-openai-eval.yaml index 179b078fb6..dbccee83ba 100644 --- a/mcli/mcli-openai-eval.yaml +++ b/mcli/mcli-openai-eval.yaml @@ -12,8 +12,8 @@ command: | # Mosaic Cloud will use run_name (with a unique suffix) to populate the env var $RUN_NAME run_name: openai-eval -# gpu_num: # -# gpu_type: # +gpu_num: # +gpu_type: # cluster: # replace with your cluster here! image: mosaicml/llm-foundry:2.1.0_cu121_flash2-latest @@ -25,41 +25,22 @@ parameters: device_eval_batch_size: 4 models: - - model_name: openai/davinci - model: - name: openai_causal_lm - version: davinci - tokenizer: - name: openai - kwargs: - name: davinci - - - model_name: openai/ada - model: - name: openai_causal_lm - version: ada - tokenizer: - name: openai - kwargs: - name: ada - - - model_name: openai/gpt-4 + model_name: openai/gpt-3.5-turbo model: name: openai_chat - version: gpt-4 + version: gpt-3.5-turbo tokenizer: - name: openai + name: tiktoken kwargs: - name: gpt-4 + model_name: gpt-3.5-turbo - - model_name: openai/gpt-3.5-turbo + model_name: openai/davinci model: - name: openai_chat - version: gpt-3.5-turbo + name: openai_causal_lm + version: davinci tokenizer: - name: openai + name: tiktoken kwargs: - name: gpt-3.5-turbo + model_name: davinci - icl_tasks: 'eval/yamls/lm_tasks.yaml' - eval_gauntlet: 'eval/yamls/eval_gauntlet.yaml' + icl_tasks: 'eval/yamls/lm_tasks_v0.2.yaml' diff --git a/scripts/eval/yamls/lm_tasks.yaml b/scripts/eval/yamls/lm_tasks_v0.2.yaml similarity index 69% rename from scripts/eval/yamls/lm_tasks.yaml rename to scripts/eval/yamls/lm_tasks_v0.2.yaml index a8b00ba75c..32d4c9f718 100644 --- a/scripts/eval/yamls/lm_tasks.yaml +++ b/scripts/eval/yamls/lm_tasks_v0.2.yaml @@ -1,31 +1,26 @@ icl_tasks: - label: jeopardy - dataset_uri: eval/local_data/world_knowledge/jeopardy_all.jsonl # ADD YOUR OWN DATASET URI - num_fewshot: [10] + dataset_uri: eval/local_data/world_knowledge/jeopardy_all.jsonl + num_fewshot: [3] icl_task_type: language_modeling continuation_delimiter: "\nAnswer: " # this separates questions from answers has_categories: true - label: bigbench_qa_wikidata - dataset_uri: eval/local_data/world_knowledge/bigbench_qa_wikidata.jsonl # ADD YOUR OWN DATASET URI - num_fewshot: [10] + dataset_uri: eval/local_data/world_knowledge/bigbench_qa_wikidata.jsonl + num_fewshot: [3] icl_task_type: language_modeling - - label: lambada_openai - dataset_uri: eval/local_data/language_understanding/lambada_openai.jsonl - num_fewshot: [0] + label: bigbench_dyck_languages + dataset_uri: eval/local_data/symbolic_problem_solving/bigbench_dyck_languages.jsonl + num_fewshot: [5] icl_task_type: language_modeling - - label: bigbench_conlang_translation - dataset_uri: eval/local_data/language_understanding/bigbench_conlang_translation.jsonl + label: lambada_openai + dataset_uri: eval/local_data/language_understanding/lambada_openai.jsonl num_fewshot: [0] icl_task_type: language_modeling -- - label: bigbench_dyck_languages - dataset_uri: eval/local_data/symbolic_problem_solving/bigbench_dyck_languages.jsonl - num_fewshot: [10] - icl_task_type: language_modeling - label: bigbench_cs_algorithms dataset_uri: eval/local_data/symbolic_problem_solving/bigbench_cs_algorithms.jsonl @@ -34,35 +29,30 @@ icl_tasks: - label: bigbench_operators dataset_uri: eval/local_data/symbolic_problem_solving/bigbench_operators.jsonl - num_fewshot: [10] 
- icl_task_type: language_modeling -- - label: bigbench_repeat_copy_logic - dataset_uri: eval/local_data/symbolic_problem_solving/bigbench_repeat_copy_logic.jsonl - num_fewshot: [10] + num_fewshot: [3] icl_task_type: language_modeling - label: simple_arithmetic_nospaces dataset_uri: eval/local_data/symbolic_problem_solving/simple_arithmetic_nospaces.jsonl - num_fewshot: [10] + num_fewshot: [5] icl_task_type: language_modeling - label: simple_arithmetic_withspaces dataset_uri: eval/local_data/symbolic_problem_solving/simple_arithmetic_withspaces.jsonl - num_fewshot: [10] + num_fewshot: [5] icl_task_type: language_modeling - label: pubmed_qa_labeled - dataset_uri: eval/local_data/reading_comprehension/pubmed_qa_labeled.jsonl # ADD YOUR OWN DATASET URI + dataset_uri: eval/local_data/reading_comprehension/pubmed_qa_labeled.jsonl num_fewshot: [10] icl_task_type: language_modeling - label: squad - dataset_uri: eval/local_data/reading_comprehension/squad.jsonl # ADD YOUR OWN DATASET URI - num_fewshot: [10] + dataset_uri: eval/local_data/reading_comprehension/squad.jsonl + num_fewshot: [3] icl_task_type: language_modeling - label: coqa - dataset_uri: eval/local_data/reading_comprehension/coqa.jsonl # ADD YOUR OWN DATASET URI + dataset_uri: eval/local_data/reading_comprehension/coqa.jsonl num_fewshot: [0] icl_task_type: language_modeling diff --git a/scripts/eval/yamls/openai_eval.yaml b/scripts/eval/yamls/openai_eval.yaml index e1afe78015..9f4da1435f 100644 --- a/scripts/eval/yamls/openai_eval.yaml +++ b/scripts/eval/yamls/openai_eval.yaml @@ -3,32 +3,22 @@ max_seq_len: 1024 device_eval_batch_size: 4 models: - - model_name: openai/davinci - model: - name: openai_causal_lm - version: davinci - tokenizer: - name: openai - kwargs: - name: davinci -- - model_name: openai/gpt-4 + model_name: openai/gpt-3.5-turbo model: name: openai_chat - version: gpt-4 + version: gpt-3.5-turbo tokenizer: - name: openai + name: tiktoken kwargs: - name: gpt-4 + model_name: gpt-3.5-turbo - - model_name: openai/gpt-3.5-turbo + model_name: openai/davinci model: - name: openai_chat - version: gpt-3.5-turbo + name: openai_causal_lm + version: davinci tokenizer: - name: openai + name: tiktoken kwargs: - name: gpt-3.5-turbo + model_name: davinci -icl_tasks: 'eval/yamls/lm_tasks.yaml' -eval_gauntlet: 'eval/yamls/eval_gauntlet.yaml' +icl_tasks: 'eval/yamls/lm_tasks_v0.2.yaml' diff --git a/setup.py b/setup.py index b4ff85f992..9853aa17bf 100644 --- a/setup.py +++ b/setup.py @@ -115,7 +115,7 @@ ] extra_deps['openai'] = [ - 'openai==0.27.8', + 'openai==1.3.8', 'tiktoken==0.4.0', ] extra_deps['all-cpu'] = set( diff --git a/tests/models/inference_api_wrapper/test_inference_api_eval_wrapper.py b/tests/models/inference_api_wrapper/test_inference_api_eval_wrapper.py index 6e5f91de00..a125203e19 100644 --- a/tests/models/inference_api_wrapper/test_inference_api_eval_wrapper.py +++ b/tests/models/inference_api_wrapper/test_inference_api_eval_wrapper.py @@ -1,6 +1,7 @@ # Copyright 2022 MosaicML LLM Foundry authors # SPDX-License-Identifier: Apache-2.0 +import os from typing import Dict from unittest.mock import patch @@ -13,6 +14,12 @@ from llmfoundry.utils.builders import build_icl_evaluators +@pytest.fixture(scope='module') +def openai_api_key_env_var() -> str: + os.environ['OPENAI_API_KEY'] = 'dummy' + return os.environ['OPENAI_API_KEY'] + + def load_icl_config(): return DictConfig({ 'icl_tasks': @@ -34,60 +41,69 @@ def load_icl_config(): }) +class MockTopLogProb: + + def __init__(self, expected_token: str) -> None: + self.top_logprobs 
= [{expected_token: 0}] + + +class MockLogprob: + + def __init__(self, expected_token: str) -> None: + self.logprobs = MockTopLogProb(expected_token) + + +class MockCompletion: + + def __init__(self, expected_token: str) -> None: + self.choices = [MockLogprob(expected_token)] + + +class MockContent: + + def __init__(self, expected_token: str) -> None: + setattr(self, 'content', expected_token) + + +class MockMessage: + + def __init__(self, expected_token: str) -> None: + setattr(self, 'message', MockContent(expected_token)) + + +class MockChatCompletion: + + def __init__(self, expected_token: str) -> None: + setattr(self, 'choices', [MockMessage(expected_token)]) + + def mock_create(**kwargs: Dict[str, str]): prompt = kwargs['prompt'] if prompt == 'AMERICAN HISTORY: On May 29, 1765 Patrick Henrys Stamp Act protest was interrupted with this one word\nAnswer:': # pyright: ignore[reportUnnecessaryComparison] - return { - 'choices': [{ - 'logprobs': { - 'top_logprobs': [{ - ' Tre': 0, - }], - }, - }], - } + return MockCompletion(' Tre') + elif prompt == 'AMERICAN HISTORY: On May 29, 1765 Patrick Henrys Stamp Act protest was interrupted with this one word\nAnswer: Tre': # pyright: ignore[reportUnnecessaryComparison] - return { - 'choices': [{ - 'logprobs': { - 'top_logprobs': [{ - 'ason': 0, - }], - }, - }], - } + return MockCompletion('ason') + elif prompt == 'AMERICAN HISTORY: On May 29, 1765 Patrick Henrys Stamp Act protest was interrupted with this one word\nAnswer: Treason': # pyright: ignore[reportUnnecessaryComparison] - return { - 'choices': [{ - 'logprobs': { - 'top_logprobs': [{ - '!': 0, - }], - }, - }], - } + return MockCompletion('!') + else: # dummy token to make sure the model is incorrect on any other prompt - return { - 'choices': [{ - 'logprobs': { - 'top_logprobs': [{ - ' ': 0, - }], - }, - }], - } - - -def test_openai_api_eval_wrapper(tmp_path: str): + return MockCompletion(' ') + + +def test_openai_api_eval_wrapper(tmp_path: str, openai_api_key_env_var: str): _ = pytest.importorskip('openai') - with patch('openai.Completion') as mock: - mock.create = mock_create - model_name = 'davinci' - tokenizer = TiktokenTokenizerWrapper(model_name=model_name, - pad_token='<|endoftext|>') - model = OpenAICausalLMEvalWrapper(model_cfg={'version': model_name}, - tokenizer=tokenizer) + + model_name = 'davinci' + tokenizer = TiktokenTokenizerWrapper(model_name=model_name, + pad_token='<|endoftext|>') + model = OpenAICausalLMEvalWrapper(model_cfg={'version': model_name}, + tokenizer=tokenizer) + with patch.object(model, 'client') as mock: + mock.completions.create = mock_create + task_cfg = load_icl_config() evaluators, _ = build_icl_evaluators(task_cfg.icl_tasks, tokenizer, @@ -107,22 +123,18 @@ def test_openai_api_eval_wrapper(tmp_path: str): assert acc == 0.5 -def test_chat_api_eval_wrapper(tmp_path: str): +def test_chat_api_eval_wrapper(tmp_path: str, openai_api_key_env_var: str): _ = pytest.importorskip('openai') - with patch('openai.ChatCompletion') as mock: - mock.create.return_value = { - 'choices': [{ - 'message': { - 'role': 'assistant', - 'content': 'Treason!' 
- }, - }], - } - model_name = 'gpt-3.5-turbo' - tokenizer = TiktokenTokenizerWrapper(model_name=model_name, - pad_token='<|endoftext|>') - chatmodel = OpenAIChatAPIEvalWrapper(model_cfg={'version': model_name}, - tokenizer=tokenizer) + + model_name = 'gpt-3.5-turbo' + tokenizer = TiktokenTokenizerWrapper(model_name=model_name, + pad_token='<|endoftext|>') + chatmodel = OpenAIChatAPIEvalWrapper(model_cfg={'version': model_name}, + tokenizer=tokenizer) + with patch.object(chatmodel, 'client') as mock: + mock.chat.completions.create.return_value = MockChatCompletion( + 'Treason!') + task_cfg = load_icl_config() evaluators, _ = build_icl_evaluators(task_cfg.icl_tasks, tokenizer,
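
For reference, the pattern this patch standardizes on — the openai>=1.0 client object plus retries with jittered exponential backoff — is sketched below as a minimal, self-contained example. It mirrors the updated wrapper logic but is not lifted verbatim from the diff: the helper name completion_with_backoff is hypothetical, MAX_RETRIES=10 and the backoff formula come from the patch, the 'davinci' model name comes from the example YAMLs, and the quota check uses str(e) rather than the private e._message attribute the patch inspects.

import os
import random
from time import sleep

import openai
from openai import APITimeoutError, RateLimitError

MAX_RETRIES = 10

# The wrappers assert this up front; openai.OpenAI() reads the key from the
# environment, replacing the old module-level `openai.api_key = ...` assignment.
assert os.getenv('OPENAI_API_KEY') is not None, 'Set OPENAI_API_KEY first.'
client = openai.OpenAI()


def completion_with_backoff(prompt: str, num_tokens: int = 1):
    """Call the legacy completions endpoint, retrying on rate limits and timeouts."""
    delay = 1.0
    for _ in range(MAX_RETRIES):
        try:
            # client.completions.create replaces the removed openai.Completion.create
            return client.completions.create(
                model='davinci',  # model name from the example YAMLs; any completions-capable model works
                prompt=prompt,
                max_tokens=num_tokens,
                logprobs=5,
                temperature=0.0,
            )
        except RateLimitError as e:
            if 'You exceeded your current quota' in str(e):
                raise  # a hard quota error will not go away; surface it
            delay *= 2 * (1 + random.random())  # exponential backoff with jitter
            sleep(delay)
        except APITimeoutError:
            delay *= 2 * (1 + random.random())
            sleep(delay)
    return None  # the wrapper treats None as "couldn't generate model output"


completion = completion_with_backoff('The capital of France is')
if completion is not None:
    # top_logprobs[0] is a {token: logprob} dict, which the wrapper turns into a logit tensor
    print(dict(completion.choices[0].logprobs.top_logprobs[0]))

The chat path is analogous: client.chat.completions.create(model=..., messages=[...]) with a system message drawn from model_cfg, wrapped in the same retry loop.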
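
Below is a short sketch of wiring the updated wrappers to the tiktoken-backed tokenizer, mirroring the revised tests and YAMLs (tokenizer name 'tiktoken' with a model_name kwarg, replacing the old 'openai' tokenizer entry). The import paths are assumptions based on the module names appearing in this diff rather than something the patch itself shows.

import os

from llmfoundry.models.inference_api_wrapper import (OpenAICausalLMEvalWrapper,
                                                     OpenAIChatAPIEvalWrapper)
from llmfoundry.tokenizers.tiktoken import TiktokenTokenizerWrapper

# Must be set before the wrappers are built; the tests use a dummy value,
# but a real key is needed before either wrapper issues requests.
os.environ.setdefault('OPENAI_API_KEY', 'dummy')

# Completions-style model (YAML: model.name: openai_causal_lm, tokenizer.name: tiktoken)
lm_tokenizer = TiktokenTokenizerWrapper(model_name='davinci',
                                        pad_token='<|endoftext|>')
lm_model = OpenAICausalLMEvalWrapper(model_cfg={'version': 'davinci'},
                                     tokenizer=lm_tokenizer)

# Chat-style model (YAML: model.name: openai_chat)
chat_tokenizer = TiktokenTokenizerWrapper(model_name='gpt-3.5-turbo',
                                          pad_token='<|endoftext|>')
chat_model = OpenAIChatAPIEvalWrapper(model_cfg={'version': 'gpt-3.5-turbo'},
                                      tokenizer=chat_tokenizer)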