diff --git a/llmfoundry/data/text_data.py b/llmfoundry/data/text_data.py
index 936c5d66ba..6dff05bc50 100644
--- a/llmfoundry/data/text_data.py
+++ b/llmfoundry/data/text_data.py
@@ -51,6 +51,11 @@ class StreamingTextDataset(StreamingDataset):
             smaller epoch size. Defaults to ``None``.
         predownload (int, optional): Target number of samples ahead to download the shards of
             while iterating. Defaults to ``100_000``.
+        cache_limit (Union[int, str], optional): Maximum size in bytes of this StreamingDataset's
+            shard cache. Before downloading a shard, the least recently used resident shard(s) may
+            be evicted (deleted from the local cache) in order to stay under the limit. Set to None
+            to disable shard eviction. Supports integer bytes as well as string human-readable
+            bytes (e.g., 100b, 64kb, 77mb, and so on). Defaults to None.
         partition_algo (str): Which partitioning algorithm to use. Defaults to ``orig``.
         num_canonical_nodes (int, optional): Canonical number of nodes for shuffling with
             resumption. Defaults to ``None``, which is interpreted as the number of nodes of the
@@ -77,6 +82,7 @@ def __init__(self,
                  keep_zip: bool = False,
                  epoch_size: Optional[int] = None,
                  predownload: int = 100_000,
+                 cache_limit: Optional[Union[int, str]] = None,
                  partition_algo: str = 'orig',
                  num_canonical_nodes: Optional[int] = None,
                  batch_size: Optional[int] = None,
@@ -118,6 +124,7 @@ def __init__(self,
             keep_zip=keep_zip,
             epoch_size=epoch_size,
             predownload=predownload,
+            cache_limit=cache_limit,
             partition_algo=partition_algo,
             num_canonical_nodes=num_canonical_nodes,
             batch_size=batch_size,
diff --git a/scripts/inference/hf_generate.py b/scripts/inference/hf_generate.py
index aeda54189a..5b28e0b598 100644
--- a/scripts/inference/hf_generate.py
+++ b/scripts/inference/hf_generate.py
@@ -9,6 +9,7 @@
 from argparse import ArgumentParser, ArgumentTypeError, Namespace
 from contextlib import nullcontext
 
+import numpy as np
 import torch
 from transformers import (AutoConfig, AutoModelForCausalLM, AutoTokenizer,
                           pipeline)
@@ -318,17 +319,37 @@ def _generate(encoded_inp):
 
         # Print generations
         delimiter = '#' * 100
-        for prompt, gen in zip(batch, decoded_gen):
-            continuation = gen[len(prompt):]
+        # decode the encoded prompt to handle the case when the tokenizer
+        # trims extra spaces or does other pre-tokenization things
+        effective_prompts = tokenizer.batch_decode(encoded_inp['input_ids'],
+                                                   skip_special_tokens=True)
+        for idx, (effective_prompt, prompt, gen) in enumerate(
+                zip(effective_prompts, batch, decoded_gen)):
+            continuation = gen[len(effective_prompt):]
             print(delimiter)
-            print('\033[92m' + prompt + '\033[0m' + continuation)
+            if len(continuation) > 0:
+                print('\033[92m' + prompt + '\033[0m' + continuation)
+            else:
+                print('Warning. No non-special output tokens generated.')
+                print(
+                    'This can happen if the generation only contains padding/eos tokens.'
+                )
+                print('Debug:')
+                full_generation = tokenizer.batch_decode(
+                    encoded_gen, skip_special_tokens=False)[idx]
+                print('\033[92m' + 'Prompt:\n' + prompt + '\033[0m')
+                print('Full generation:\n' + full_generation)
+
         print(delimiter)
 
         # Print timing info
         bs = len(batch)
+        # ensure that gen_tokens >= 1 in case model only generated padding tokens
+        gen_tokens = np.maximum(gen_tokens, np.ones_like(gen_tokens))
         output_tokens = gen_tokens - input_tokens
         total_input_tokens = input_tokens.sum()
         total_output_tokens = output_tokens.sum()
+
         encode_latency = 1000 * (encode_end - encode_start)
         gen_latency = 1000 * (gen_end - gen_start)
         decode_latency = 1000 * (decode_end - decode_start)
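
Usage notes (illustrative sketches, not part of the diff):

The snippet below shows how the new cache_limit argument could be passed when constructing the dataset. It assumes StreamingTextDataset's existing tokenizer, max_seq_len, remote, local, and batch_size arguments keep their usual meaning; the tokenizer name, paths, and values are placeholders.

    # Hypothetical usage sketch: bound the local shard cache so the least recently
    # used shards are evicted before new ones are downloaded.
    from transformers import AutoTokenizer

    from llmfoundry.data.text_data import StreamingTextDataset

    tokenizer = AutoTokenizer.from_pretrained('gpt2')  # placeholder tokenizer

    dataset = StreamingTextDataset(
        tokenizer=tokenizer,
        max_seq_len=2048,
        remote='s3://my-bucket/my-dataset',  # placeholder remote shard location
        local='/tmp/streaming-cache',        # placeholder local cache directory
        cache_limit='10gb',                  # human-readable string; integer bytes also accepted
        batch_size=8,
    )

For the hf_generate.py change, slicing with a re-decoded prompt matters because encode-then-decode is not always an exact round trip (some tokenizers trim repeated spaces or apply other normalization), so gen[len(prompt):] can start the continuation at the wrong offset. A minimal check, reusing the placeholder tokenizer from above:

    prompt = 'Hello,   world'  # extra internal spaces may not survive tokenization
    effective_prompt = tokenizer.decode(tokenizer(prompt)['input_ids'],
                                        skip_special_tokens=True)
    # Depending on the tokenizer, effective_prompt may differ from prompt;
    # slicing the generation by len(effective_prompt) keeps the offsets consistent.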