
Openai compatible gauntlet #1017

Draft: wants to merge 167 commits into base: main

Commits (167)
cd18e74
start
bmosaicml Jan 23, 2024
1fffbad
still need to migrate fixtures
bmosaicml Feb 2, 2024
5a6e81c
Merge branch 'main' into migrate_subclasses_to_foundry
bmosaicml Feb 2, 2024
4aac81e
wip onboarding tests
bmosaicml Feb 2, 2024
946a4af
still workin'
bmosaicml Feb 2, 2024
289ca55
still wip
bmosaicml Feb 2, 2024
3696f8d
maybe done; test out on mcli now
bmosaicml Feb 2, 2024
a20877d
mcli
bmosaicml Feb 2, 2024
53da3ea
remove calibration error
bmosaicml Feb 2, 2024
16b8e32
merge
bmosaicml Feb 7, 2024
a90766e
migration
bmosaicml Feb 7, 2024
72ce793
migration
bmosaicml Feb 7, 2024
667bdec
Merge branch 'migrate_subclasses_to_foundry' of github.com:mosaicml/l…
bmosaicml Feb 9, 2024
ceff0c4
full migration
bmosaicml Feb 9, 2024
5bb06cc
precommit
bmosaicml Feb 12, 2024
fe83828
fix
bmosaicml Feb 12, 2024
b54a12b
fix pytests
bmosaicml Feb 12, 2024
71e8391
refactor QA
bmosaicml Feb 16, 2024
414153e
update
bmosaicml Feb 22, 2024
a3f5a31
restore
bmosaicml Feb 23, 2024
820069a
Merge branch 'main' into migrate_subclasses_to_foundry
bmosaicml Feb 23, 2024
4a1cd79
add
bmosaicml Feb 23, 2024
d265979
Merge branch 'migrate_subclasses_to_foundry' of github.com:mosaicml/l…
bmosaicml Feb 23, 2024
6cbaad4
wip
bmosaicml Feb 26, 2024
ddfd7b5
Merge branch 'main' into migrate_subclasses_to_foundry
bmosaicml Feb 26, 2024
71f77e3
fix
bmosaicml Feb 26, 2024
cb3725b
wip
bmosaicml Feb 27, 2024
5135152
update readme
bmosaicml Feb 27, 2024
18bae17
Merge branch 'main' into migrate_subclasses_to_foundry
bmosaicml Feb 27, 2024
c6162dd
final pyright
bmosaicml Feb 27, 2024
25d431e
Merge branch 'migrate_subclasses_to_foundry' of github.com:mosaicml/l…
bmosaicml Feb 27, 2024
f1b334d
done
bmosaicml Feb 27, 2024
c4ed644
pass prelimiter into ALL the ICL task datasets
eitanturok Feb 27, 2024
2516c24
Merge branch 'main' into migrate_subclasses_to_foundry
bmosaicml Mar 4, 2024
f213a40
allow QA task name stil lfor backward compatibility
bmosaicml Mar 4, 2024
35fd2f1
Merge branch 'migrate_subclasses_to_foundry' of github.com:mosaicml/l…
bmosaicml Mar 4, 2024
d570e5d
fix
bmosaicml Mar 5, 2024
a5cd308
fix test
bmosaicml Mar 5, 2024
0fb37cd
Merge branch 'main' into migrate_subclasses_to_foundry
bmosaicml Mar 5, 2024
901fc69
add generation length
bmosaicml Mar 5, 2024
a313499
Merge branch 'migrate_subclasses_to_foundry' of github.com:mosaicml/l…
bmosaicml Mar 5, 2024
df19c0d
remove max_new_tokens
bmosaicml Mar 5, 2024
54bb4c7
fix cpu trsts
bmosaicml Mar 6, 2024
9ebeaa0
Merge branch 'main' into migrate_subclasses_to_foundry
bmosaicml Mar 6, 2024
ca9816c
Merge branch 'main' into migrate_subclasses_to_foundry
maxisawesome Mar 6, 2024
b9d6cd1
try and fix lm eval test
bmosaicml Mar 7, 2024
691ab20
Merge branch 'migrate_subclasses_to_foundry' of github.com:mosaicml/l…
bmosaicml Mar 7, 2024
c207cd9
temp disable lm task eval test
bmosaicml Mar 7, 2024
0662d59
add data
bmosaicml Mar 7, 2024
7b06a9f
Merge branch 'main' into openai_compatible_gauntlet
bmosaicml Mar 7, 2024
ee440b5
test
bmosaicml Mar 7, 2024
6083891
Merge branch 'main' into openai_compatible_gauntlet
bmosaicml Mar 7, 2024
d6c1ae7
finish
bmosaicml Mar 8, 2024
c85813b
fix test?
bmosaicml Mar 8, 2024
110eaf4
push dataset
bmosaicml Mar 8, 2024
4642578
fix
bmosaicml Mar 8, 2024
b6beb49
finish
bmosaicml Mar 8, 2024
c3ef3a7
fix bug
bmosaicml Mar 11, 2024
08ef908
fix tet
bmosaicml Mar 11, 2024
f1e50dc
remove human eval
bmosaicml Mar 11, 2024
aca0e63
finish
bmosaicml Mar 11, 2024
30fcedd
fix
bmosaicml Mar 12, 2024
877e5eb
modify mlci
bmosaicml Mar 12, 2024
59daa26
Merge branch 'main' into migrate_subclasses_to_foundry
maxisawesome Mar 13, 2024
4217a78
Update scripts/eval/README.md
bmosaicml Mar 13, 2024
6f597a9
fix comments
bmosaicml Mar 13, 2024
8c6e622
Merge branch 'migrate_subclasses_to_foundry' of github.com:mosaicml/l…
bmosaicml Mar 13, 2024
f387a73
fix bug with seq len
bmosaicml Mar 14, 2024
cbfa3da
Merge branch 'main' into migrate_subclasses_to_foundry
bmosaicml Mar 14, 2024
2f405d9
restore mcli
bmosaicml Mar 14, 2024
76e600a
Merge branch 'migrate_subclasses_to_foundry' of github.com:mosaicml/l…
bmosaicml Mar 14, 2024
898928e
Merge branch 'main' into migrate_subclasses_to_foundry
bmosaicml Mar 15, 2024
d1c1b33
fix
bmosaicml Mar 15, 2024
dd95f2d
add output logging
bmosaicml Mar 15, 2024
4ad2b2c
fix generation length
bmosaicml Mar 15, 2024
132a179
fix generation length
bmosaicml Mar 15, 2024
650de02
add example dep deployment eval
bmosaicml Mar 15, 2024
8178589
finish
bmosaicml Mar 15, 2024
dbfebba
update dep deployment mcli
bmosaicml Mar 15, 2024
72cffea
update dep deployment mcli
bmosaicml Mar 16, 2024
07fb59e
Merge branch 'main' into migrate_subclasses_to_foundry
bmosaicml Mar 18, 2024
587971f
Merge branch 'main' into migrate_subclasses_to_foundry
bmosaicml Mar 18, 2024
eed61da
change temp
bmosaicml Mar 18, 2024
6abc07b
change mcli
bmosaicml Mar 18, 2024
cb49cf6
dep deployment is kinda jank
bmosaicml Mar 18, 2024
0dd6897
finish merge
bmosaicml Mar 18, 2024
0684aef
add init file
bmosaicml Mar 19, 2024
8278298
add init file
bmosaicml Mar 19, 2024
d651ed5
finish
bmosaicml Mar 19, 2024
b83761b
finish
bmosaicml Mar 19, 2024
53b812c
modify formatting to match eleuther
bmosaicml Mar 20, 2024
79660c7
modify long context
bmosaicml Mar 20, 2024
8904d27
add custom api keys
bmosaicml Mar 20, 2024
8b3328b
add custom api keys
bmosaicml Mar 20, 2024
1ca727b
fix api key
bmosaicml Mar 20, 2024
a585bdf
fix api key
bmosaicml Mar 20, 2024
5129dd4
fix api key
bmosaicml Mar 20, 2024
8e1159c
fix api key
bmosaicml Mar 20, 2024
4d83efa
fix yaml
bmosaicml Mar 21, 2024
c425390
add 16k tasks
bmosaicml Mar 21, 2024
6f580ef
add 16k tasks
bmosaicml Mar 21, 2024
4b2aa6a
add 16k tasks
bmosaicml Mar 21, 2024
cc7bc1e
add 16k tasks
bmosaicml Mar 21, 2024
1455833
add api key to config
bmosaicml Mar 22, 2024
b0dbbb3
fix
bmosaicml Mar 22, 2024
c70c7f7
fix ctor
bmosaicml Mar 22, 2024
09017ff
gemini chat
bmosaicml Mar 25, 2024
c601eba
gemini
bmosaicml Mar 25, 2024
1f04738
restore tasks
bmosaicml Mar 25, 2024
47bb064
fault tolerance for content filter
bmosaicml Mar 25, 2024
e307dd7
fix
bmosaicml Mar 25, 2024
18efa86
Merge branch 'main' into migrate_subclasses_to_foundry
bmosaicml Mar 29, 2024
7faeb78
merge
bmosaicml Apr 1, 2024
343e115
fix builder
bmosaicml Apr 1, 2024
bf6231e
add deprecation warning
bmosaicml Apr 1, 2024
58859a3
Merge branch 'main' into migrate_subclasses_to_foundry
bmosaicml Apr 1, 2024
501d4cc
add deprecation warning
bmosaicml Apr 1, 2024
22f6759
Merge branch 'migrate_subclasses_to_foundry' of github.com:mosaicml/l…
bmosaicml Apr 1, 2024
414467a
merge
bmosaicml Apr 1, 2024
b60066e
finish merge
bmosaicml Apr 1, 2024
65fbbed
merge
bmosaicml Apr 1, 2024
5696f09
add logging necessities to nlp.py
maxisawesome Apr 1, 2024
91a2b18
add attention_mask test update
maxisawesome Apr 1, 2024
79877ee
fix generation_length in tests
maxisawesome Apr 1, 2024
9c50795
Merge branch 'main' into migrate_subclasses_to_foundry
maxisawesome Apr 1, 2024
eac919a
fix bug
maxisawesome Apr 1, 2024
965c20c
Merge branch 'migrate_subclasses_to_foundry' of github.com:mosaicml/l…
maxisawesome Apr 1, 2024
6db05bd
rm conflicts
maxisawesome Apr 2, 2024
ce3e19c
rm extra arg
maxisawesome Apr 2, 2024
57e902a
Merge branch 'main' into migrate_subclasses_to_foundry
bmosaicml Apr 2, 2024
7c0996f
Merge branch 'main' into migrate_subclasses_to_foundry
maxisawesome Apr 2, 2024
599695c
Merge branch 'main' into migrate_subclasses_to_foundry
maxisawesome Apr 2, 2024
e10086f
restore yamls
bmosaicml Apr 3, 2024
0b5da4b
merge
bmosaicml Apr 3, 2024
c889fd2
fix?
bmosaicml Apr 3, 2024
1b2f6bc
fix yaml; testing
bmosaicml Apr 3, 2024
7afee25
fix api key
maxisawesome Apr 3, 2024
5720676
add formatting
bmosaicml Apr 3, 2024
c303b91
Merge branch 'openai_compatible_gauntlet' of github.com:mosaicml/llm-…
bmosaicml Apr 3, 2024
a60ef1d
Merge branch 'main' into migrate_subclasses_to_foundry
bmosaicml Apr 4, 2024
d78d783
Merge branch 'main' into migrate_subclasses_to_foundry
maxisawesome Apr 4, 2024
1ddf194
Merge branch 'main' into migrate_subclasses_to_foundry
maxisawesome Apr 9, 2024
78ac2d9
Merge branch 'main' into openai_compatible_gauntlet
bmosaicml Apr 10, 2024
34c967b
working
bmosaicml Apr 10, 2024
d5aebc8
fix typos
bmosaicml Apr 10, 2024
d7272b1
Merge branch 'migrate_subclasses_to_foundry' of github.com:mosaicml/l…
bmosaicml Apr 10, 2024
a5082b0
add deprecation warning for code
maxisawesome Apr 10, 2024
3c8ac56
Merge branch 'migrate_subclasses_to_foundry' of github.com:mosaicml/l…
maxisawesome Apr 10, 2024
642ad40
pyright wip
maxisawesome Apr 10, 2024
f30db14
Merge branch 'main' into migrate_subclasses_to_foundry
maxisawesome Apr 11, 2024
de321b2
fix pyright
bmosaicml Apr 11, 2024
019c58a
fix pyright error again
bmosaicml Apr 11, 2024
779f490
fix pyright
bmosaicml Apr 11, 2024
03f7e91
fix pyright
bmosaicml Apr 11, 2024
bb2728b
Merge branch 'openai_compatible_gauntlet' of github.com:mosaicml/llm-…
bmosaicml Apr 12, 2024
65f1a3e
Merge branch 'migrate_subclasses_to_foundry' into openai_compatible_g…
bmosaicml Apr 12, 2024
f493e35
Merge branch 'main' into openai_compatible_gauntlet
maxisawesome Apr 12, 2024
e62f584
Merge branch 'main' into openai_compatible_gauntlet
maxisawesome Apr 13, 2024
d23aa5f
add api key to test
maxisawesome Apr 19, 2024
558538b
Merge branch 'main' into openai_compatible_gauntlet
maxisawesome Apr 19, 2024
47a0cb9
put openai imports behind typechecking
maxisawesome Apr 19, 2024
21de07a
correct typecheckign import
maxisawesome Apr 19, 2024
7c1c32c
Merge branch 'openai_compatible_gauntlet' of github.com:mosaicml/llm-…
maxisawesome Apr 19, 2024
5dce21d
linting
maxisawesome Apr 19, 2024
619e2ce
fix gemni import errors
maxisawesome Apr 19, 2024
eec82a1
fix hf_eval
maxisawesome Apr 19, 2024
6fc714b
linting
maxisawesome Apr 19, 2024
37 changes: 37 additions & 0 deletions llmfoundry/__init__.py
@@ -23,6 +23,23 @@
from llmfoundry.data import (ConcatTokensDataset, NoConcatDataset,
Seq2SeqFinetuningCollator,
build_finetuning_dataloader)
from llmfoundry.eval import (InContextLearningCodeEvalAccuracy,
InContextLearningCodeEvalDataset,
InContextLearningDataset,
InContextLearningGenerationExactMatchAccuracy,
InContextLearningGenerationTaskWithAnswersDataset,
InContextLearningLMAccuracy,
InContextLearningLMExpectedCalibrationError,
InContextLearningLMTaskDataset,
InContextLearningMCExpectedCalibrationError,
InContextLearningMetric,
InContextLearningMultipleChoiceAccuracy,
InContextLearningMultipleChoiceTaskDataset,
InContextLearningSchemaTaskDataset,
get_continuation_span, get_fewshot_sample_idxs,
get_icl_task_dataloader, make_padded_input,
strip_data, tokenizer_needs_prefix_space,
trim_context)
from llmfoundry.models.hf import ComposerHFCausalLM, ComposerHFT5
from llmfoundry.models.layers.attention import (
MultiheadAttention, attn_bias_shape, build_alibi_bias, build_attn_bias,
@@ -59,6 +76,26 @@
'algorithms',
'callbacks',
'TiktokenTokenizerWrapper',
'InContextLearningLMAccuracy',
'InContextLearningMultipleChoiceAccuracy',
'InContextLearningGenerationExactMatchAccuracy',
'InContextLearningMCExpectedCalibrationError',
'InContextLearningLMExpectedCalibrationError',
'InContextLearningMetric',
'InContextLearningCodeEvalAccuracy',
'InContextLearningDataset',
'InContextLearningGenerationTaskWithAnswersDataset',
'InContextLearningLMTaskDataset',
'InContextLearningCodeEvalDataset',
'InContextLearningMultipleChoiceTaskDataset',
'InContextLearningSchemaTaskDataset',
'get_icl_task_dataloader',
'strip_data',
'tokenizer_needs_prefix_space',
'trim_context',
'get_continuation_span',
'get_fewshot_sample_idxs',
'make_padded_input',
'registry',
]

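Since the ICL helper functions above are now part of the public llmfoundry namespace, they can be composed directly when hand-assembling a batch. A minimal sketch, assuming the helpers keep the signatures of their private composer predecessors; 'gpt2' is only a placeholder tokenizer:

from transformers import AutoTokenizer

from llmfoundry import (get_continuation_span, make_padded_input,
                        trim_context)

tokenizer = AutoTokenizer.from_pretrained('gpt2')

context_enc = tokenizer('The capital of France is')['input_ids']
continuation_enc = tokenizer(' Paris')['input_ids']

# Trim the context so context + continuation fits in max_seq_len, then
# pad the concatenated ids and locate the continuation token positions.
max_seq_len = 32
pad_tok_id = tokenizer.eos_token_id
context_enc = trim_context(context_enc, continuation_enc, max_seq_len)
continuation_span = get_continuation_span(context_enc, continuation_enc)
padded_input = make_padded_input(context_enc, continuation_enc,
                                 max_seq_len, pad_tok_id)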
39 changes: 39 additions & 0 deletions llmfoundry/eval/__init__.py
@@ -1,2 +1,41 @@
# Copyright 2024 MosaicML LLM Foundry authors
# SPDX-License-Identifier: Apache-2.0

"""Natively supported datasets."""

from llmfoundry.eval.datasets import (
InContextLearningCodeEvalDataset, InContextLearningDataset,
InContextLearningGenerationTaskWithAnswersDataset,
InContextLearningLMTaskDataset, InContextLearningMultipleChoiceTaskDataset,
InContextLearningSchemaTaskDataset, get_continuation_span,
get_fewshot_sample_idxs, get_icl_task_dataloader, make_padded_input,
strip_data, tokenizer_needs_prefix_space, trim_context)
from llmfoundry.eval.metrics import (
InContextLearningCodeEvalAccuracy,
InContextLearningGenerationExactMatchAccuracy, InContextLearningLMAccuracy,
InContextLearningLMExpectedCalibrationError,
InContextLearningMCExpectedCalibrationError, InContextLearningMetric,
InContextLearningMultipleChoiceAccuracy)

__all__ = [
'InContextLearningLMAccuracy',
'InContextLearningMultipleChoiceAccuracy',
'InContextLearningGenerationExactMatchAccuracy',
'InContextLearningMCExpectedCalibrationError',
'InContextLearningLMExpectedCalibrationError',
'InContextLearningMetric',
'InContextLearningCodeEvalAccuracy',
'InContextLearningDataset',
'InContextLearningGenerationTaskWithAnswersDataset',
'InContextLearningLMTaskDataset',
'InContextLearningCodeEvalDataset',
'InContextLearningMultipleChoiceTaskDataset',
'InContextLearningSchemaTaskDataset',
'get_icl_task_dataloader',
'strip_data',
'tokenizer_needs_prefix_space',
'trim_context',
'get_continuation_span',
'get_fewshot_sample_idxs',
'make_padded_input',
]
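The re-exported get_icl_task_dataloader keeps the call pattern it had before this migration. A minimal sketch, assuming the signature matches the composer version the PR migrates; the dataset URI and delimiter values are placeholders:

from transformers import AutoTokenizer

from llmfoundry.eval import get_icl_task_dataloader

tokenizer = AutoTokenizer.from_pretrained('gpt2')  # placeholder model

# Build an eval dataloader for a language-modeling ICL task from a jsonl
# file whose rows carry 'context' and 'continuation' fields.
dl = get_icl_task_dataloader(
    icl_task_type='language_modeling',
    dataset_uri='./my_task.jsonl',  # placeholder path
    tokenizer=tokenizer,
    batch_size=8,
    max_seq_len=1024,
    pad_tok_id=tokenizer.eos_token_id,
    num_fewshot=0,
    prompt_string='',
    example_delimiter='\n',
    continuation_delimiter=' ',
)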
llmfoundry/eval/datasets/in_context_learning_evaluation.py
@@ -46,7 +46,6 @@

class InContextLearningDataset(Dataset):
r"""A base dataset that constructs batches for in-context learning task.

evaluations. The dataset format is expected to be a local jsonl file, a
cloud link to a jsonl file, or a Hugging Face dataset link. 'context' refers
to the input a model will recieve before generating an output. For example,
2 changes: 2 additions & 0 deletions llmfoundry/models/__init__.py
@@ -4,6 +4,7 @@
from llmfoundry.models.hf import ComposerHFCausalLM, ComposerHFT5
from llmfoundry.models.inference_api_wrapper import (FMAPICasualLMEvalWrapper,
FMAPIChatAPIEvalWrapper,
GeminiChatAPIEvalrapper,
OpenAICausalLMEvalWrapper,
OpenAIChatAPIEvalWrapper)
from llmfoundry.models.mpt import (ComposerMPTCausalLM, MPTConfig,
@@ -17,6 +18,7 @@
models.register('fmapi_causal_lm', func=FMAPICasualLMEvalWrapper)
models.register('openai_chat', func=OpenAIChatAPIEvalWrapper)
models.register('fmapi_chat', func=FMAPIChatAPIEvalWrapper)
models.register('gemini_chat', func=GeminiChatAPIEvalrapper)

__all__ = [
'ComposerHFCausalLM',
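With gemini_chat registered alongside the OpenAI and FMAPI wrappers, an eval config can select it by registry name. A hypothetical sketch of the model section such a config might carry; the version value is a placeholder, and only keys visible in this PR are used:

from omegaconf import DictConfig

# Hypothetical model config resolved through the 'models' registry.
model_cfg = DictConfig({
    'name': 'gemini_chat',    # registry key registered above
    'version': 'gemini-pro',  # placeholder; forwarded to GenerativeModel
    'api_key': None,          # None falls back to the GEMINI_API_KEY env var
})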
3 changes: 3 additions & 0 deletions llmfoundry/models/inference_api_wrapper/__init__.py
@@ -3,13 +3,16 @@

from llmfoundry.models.inference_api_wrapper.fmapi import (
FMAPICasualLMEvalWrapper, FMAPIChatAPIEvalWrapper)
from llmfoundry.models.inference_api_wrapper.gemini_chat import \
GeminiChatAPIEvalrapper
from llmfoundry.models.inference_api_wrapper.interface import \
InferenceAPIEvalWrapper
from llmfoundry.models.inference_api_wrapper.openai_causal_lm import (
OpenAICausalLMEvalWrapper, OpenAIChatAPIEvalWrapper)

__all__ = [
'OpenAICausalLMEvalWrapper',
'GeminiChatAPIEvalrapper',
'OpenAIChatAPIEvalWrapper',
'InferenceAPIEvalWrapper',
'FMAPICasualLMEvalWrapper',
3 changes: 2 additions & 1 deletion llmfoundry/models/inference_api_wrapper/fmapi.py
@@ -49,6 +49,7 @@ def block_until_ready(self, base_url: str):

def __init__(self, om_model_config: DictConfig, tokenizer: AutoTokenizer):
is_local = om_model_config.pop('local', False)
api_key = om_model_config.pop('api_key', None)
if is_local:
base_url = os.environ.get('MOSAICML_MODEL_ENDPOINT',
'http://0.0.0.0:8080/v2')
@@ -60,7 +61,7 @@ def __init__(self, om_model_config: DictConfig, tokenizer: AutoTokenizer):
'Must specify base_url or use local=True in model_cfg for FMAPIsEvalWrapper'
)

super().__init__(om_model_config, tokenizer)
super().__init__(om_model_config, tokenizer, api_key)


class FMAPICasualLMEvalWrapper(FMAPIEvalInterface, OpenAICausalLMEvalWrapper):
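The new api_key pop lets a remote deployment authenticate through the model config instead of the environment. A sketch of a config this constructor would now accept; the endpoint URL and env-var name are placeholders:

import os

from omegaconf import DictConfig

# Hypothetical config for a remote (non-local) FMAPI deployment.
fmapi_cfg = DictConfig({
    'local': False,
    'base_url': 'https://serving.example.com/v2',  # placeholder endpoint
    'api_key': os.environ.get('MY_FMAPI_KEY'),     # popped before super().__init__
})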
153 changes: 153 additions & 0 deletions llmfoundry/models/inference_api_wrapper/gemini_chat.py
@@ -0,0 +1,153 @@
# Copyright 2022 MosaicML LLM Foundry authors
# SPDX-License-Identifier: Apache-2.0

import logging
import os
import random
from time import sleep
from typing import TYPE_CHECKING, Any, List, Optional, Union

from composer.core.types import Batch
from composer.utils.import_helpers import MissingConditionalImportError
from omegaconf import DictConfig
from transformers import AutoTokenizer

log = logging.getLogger(__name__)

from transformers import AutoTokenizer

from llmfoundry.models.inference_api_wrapper.interface import \
InferenceAPIEvalWrapper

if TYPE_CHECKING:
from openai.types.chat.chat_completion import ChatCompletion

MAX_RETRIES = 3

__all__ = [
'GeminiChatAPIEvalrapper',
]

log = logging.getLogger(__name__)


class GeminiChatAPIEvalrapper(InferenceAPIEvalWrapper):
"""Databricks Foundational Model API wrapper for causal LM models."""

def __init__(self, om_model_config: DictConfig,
tokenizer: AutoTokenizer) -> None:
api_key = om_model_config.pop('api_key', None)
if api_key is None:
api_key = os.environ.get('GEMINI_API_KEY')
try:
import google.generativeai as google_genai
except ImportError as e:
# TODO: should google-generativeai be grouped with openai in setup.py?
raise MissingConditionalImportError(
extra_deps_group='openai',
conda_package='google-generativeai',
conda_channel='conda-forge') from e
google_genai.configure(api_key=api_key)
super().__init__(om_model_config, tokenizer)
self.model_cfg = om_model_config
self.model = google_genai.GenerativeModel(
om_model_config.get('version', ''))
ignore = [
google_genai.types.HarmCategory.HARM_CATEGORY_HARASSMENT,
google_genai.types.HarmCategory.HARM_CATEGORY_HATE_SPEECH,
google_genai.types.HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT,
google_genai.types.HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT,
]
self.safety_settings = {
category: google_genai.types.HarmBlockThreshold.BLOCK_NONE
for category in ignore
}

def generate_completion(
self,
prompt: str, #
num_tokens: int,
generation_kwargs: Optional[dict] = None) -> 'ChatCompletion':
if generation_kwargs is None:
generation_kwargs = {}
if isinstance(prompt, str):
generation_config = google_genai.types.GenerationConfig(
candidate_count=1,
max_output_tokens=num_tokens,
temperature=generation_kwargs.get('temperature', 0))
response = self.model.generate_content(
prompt,
safety_settings=self.safety_settings,
generation_config=generation_config)
return response
else:
raise ValueError(f'Prompt must be str: {prompt}')

def completion_to_string(self, completion: 'ChatCompletion'):
try:
# sometimes gemini will block outputs due to content filters
return [completion.text]
except:
return ['']

def eval_forward(self, batch: Batch, outputs: Optional[Any] = None):
# Override the base class because Chat's API always strips spacing from model outputs resulting in different tokens
# than what the continuation would expect.
# Get around this issue by retokenizing the batch to remove spacing from the continuation as well as
# decoding the whole continuation at once.
padding_tok = self.tokenizer.pad_token_id if self.tokenizer.pad_token_id else self.tokenizer.eos_token_id
if batch.get('mode', '') == 'generate':
outputs = []
# generate-based implementation
for tokens, _ in zip(batch['input_ids'], batch['labels']):

tokens = tokens.tolist()
tokens = [t for t in tokens if t != padding_tok]
prompt = self.tokenizer.decode(tokens)

if 'generation_length' in batch:
num_tokens = batch['generation_length']
elif 'generation_kwargs' in batch:
num_tokens = batch['generation_kwargs'].get(
'max_new_tokens', 2)

for _ in range(
0,
batch.get('generation_kwargs',
{}).get('num_return_sequences', 1)):
api_output = self.try_generate_completion( #
prompt,
num_tokens=num_tokens,
generation_kwargs=batch.get('generation_kwargs', {}))

assert api_output is not None
sample_output = self.completion_to_string(
api_output)[ # pyright: ignore
0]
outputs.append(sample_output)
return outputs
else:
raise ValueError("Only 'generate' tasks are supported.")

def try_generate_completion(self,
prompt: Union[str, List],
num_tokens: int,
generation_kwargs: Optional[dict] = None):
if generation_kwargs is None:
generation_kwargs = {}

tries = 0
completion = None
delay = 1
while tries < MAX_RETRIES:
tries += 1
try:
completion = self.generate_completion(prompt, num_tokens,
generation_kwargs)
break
except Exception as e:
delay *= 2 * (1 + random.random())
sleep(delay)
continue

return completion
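End to end, the wrapper is used like the other inference-API wrappers: construct it from an omegaconf config plus a tokenizer, call try_generate_completion (which retries with exponential backoff), and decode with completion_to_string. A minimal usage sketch, assuming GEMINI_API_KEY is set; the model version and tokenizer are placeholders:

from omegaconf import DictConfig
from transformers import AutoTokenizer

from llmfoundry.models.inference_api_wrapper import GeminiChatAPIEvalrapper

cfg = DictConfig({'version': 'gemini-pro'})        # api_key read from GEMINI_API_KEY
tokenizer = AutoTokenizer.from_pretrained('gpt2')  # placeholder tokenizer

wrapper = GeminiChatAPIEvalrapper(cfg, tokenizer)
completion = wrapper.try_generate_completion('Say hello.', num_tokens=16)
print(wrapper.completion_to_string(completion)[0])  # '' if blocked by a content filter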