fix V: the empire fixes back
milocress committed Apr 19, 2024
1 parent bc2d5d3 commit cd73f60
Showing 5 changed files with 37 additions and 37 deletions.
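
The diffs below share one pattern: config objects that were previously read as OmegaConf attributes (for example `cfg.max_seq_len`) are now treated as plain dictionaries (`cfg['max_seq_len']`). A minimal sketch of the difference, assuming omegaconf is installed; the config contents here are made up for illustration and are not taken from this commit:

    # Illustrative only; this config is a made-up example, not code from this commit.
    from omegaconf import OmegaConf

    cfg = OmegaConf.create({'max_seq_len': 2048})
    print(cfg.max_seq_len)                              # DictConfig supports attribute access

    plain = OmegaConf.to_container(cfg, resolve=True)   # convert once to a plain dict
    print(plain['max_seq_len'])                          # plain dicts require key access
    # plain.max_seq_len would raise AttributeError
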
3 changes: 2 additions & 1 deletion llmfoundry/data/dataloader.py
@@ -22,13 +22,14 @@ def build_dataloader(cfg: Dict[str, Any], tokenizer: PreTrainedTokenizerBase,
         device_batch_size (int): The size of the batches (number of examples)
             that the dataloader will produce.
     """
+    name = cfg.pop('name')
     kwargs: Dict[str, Any] = {
         **cfg, 'tokenizer': tokenizer,
         'device_batch_size': device_batch_size
     }
 
     return construct_from_registry(
-        name=cfg.name,
+        name=name,
         registry=registry.dataloaders,
         partial_function=False,
         pre_validation_function=None,
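
The hunk above pops the registry key out of the config before forwarding the remaining entries as keyword arguments. A rough sketch of that pop-then-forward pattern on a plain dict; the builder and config here are hypothetical stand-ins, not llmfoundry's registry machinery:

    from typing import Any, Dict

    def build_from_config(cfg: Dict[str, Any], **extra: Any) -> Dict[str, Any]:
        # Remove the lookup key first so it is not forwarded as an unexpected kwarg.
        name = cfg.pop('name')
        kwargs = {**cfg, **extra}
        print(f'building {name!r} with {kwargs}')
        return kwargs

    build_from_config({'name': 'text', 'num_workers': 2}, device_batch_size=8)
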
12 changes: 6 additions & 6 deletions llmfoundry/data/finetuning/dataloader.py
@@ -195,14 +195,14 @@ def build_finetuning_dataloader(
             sampling_method=dataset_cfg.get('sampling_method', 'balanced'),
             sampling_granularity=dataset_cfg.get('sampling_granularity', 1),
             batching_method=dataset_cfg.get('batching_method', 'random'),
-            max_seq_len=dataset_cfg.max_seq_len,
+            max_seq_len=dataset_cfg['max_seq_len'],
             allow_unsafe_types=dataset_cfg.get('allow_unsafe_types', False),
             replication=dataset_cfg.get('replication', None),
         )
 
     else:
         # Build HF dataloader
-        dataset_name_or_path = dataset_cfg.hf_name
+        dataset_name_or_path = dataset_cfg['hf_name']
         split = dataset_cfg.get('split')
         if split is None:
             raise MissingHuggingFaceURLSplitError()
@@ -228,14 +228,14 @@ def build_finetuning_dataloader(
             dataset_name=dataset_name_or_path,
             split=split,
             safe_load=dataset_cfg.get('safe_load', False),
-            max_seq_len=dataset_cfg.max_seq_len,
+            max_seq_len=dataset_cfg['max_seq_len'],
             preprocessing_fn=preprocessing_fn,
             tokenizer=tokenizer,
             target_prompts=dataset_cfg.get('target_prompts',
                                            _DEFAULT_TARGET_PROMPTS),
             target_responses=dataset_cfg.get('target_responses',
                                              _DEFAULT_TARGET_RESPONSES),
-            decoder_only_format=dataset_cfg.decoder_only_format,
+            decoder_only_format=dataset_cfg['decoder_only_format'],
             hf_kwargs=dataset_cfg.get('hf_kwargs', {}))
 
     # Ensure dataset is large enough.
@@ -246,7 +246,7 @@ def build_finetuning_dataloader(
         full_dataset_size = len(streaming_dataset)
         if full_dataset_size < minimum_dataset_size:
             raise NotEnoughDatasetSamplesError(
-                dataset_name=dataset_cfg.hf_name,
+                dataset_name=dataset_cfg['hf_name'],
                 split=split,
                 dataloader_batch_size=dataloader_batch_size,
                 world_size=world_size,
@@ -255,7 +255,7 @@ def build_finetuning_dataloader(
     # Initialize sampler.
     sampler = dist.get_sampler(streaming_dataset,
                                drop_last=drop_last,
-                               shuffle=dataset_cfg.shuffle)
+                               shuffle=dataset_cfg['shuffle'])
 
     assert streaming_dataset is not None  # for pyright
     dl = DataLoader(
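
With the dataset config as a plain dict, required fields use bracket access (which fails loudly with a KeyError) while optional fields keep `.get()` with a default. A toy illustration of that distinction, not taken from the repository:

    # Hypothetical dataset config, for illustration only.
    dataset_cfg = {'hf_name': 'my-org/my-dataset', 'max_seq_len': 2048}

    max_seq_len = dataset_cfg['max_seq_len']           # required: KeyError if missing
    safe_load = dataset_cfg.get('safe_load', False)    # optional: falls back to a default

    try:
        dataset_cfg['decoder_only_format']             # required key absent in this toy dict
    except KeyError as err:
        print(f'missing required config key: {err}')
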
18 changes: 9 additions & 9 deletions llmfoundry/data/packing.py
@@ -3,7 +3,7 @@
 
 import logging
 import tempfile
-from typing import Callable, Dict, Iterable, List, Literal, Optional, Tuple
+from typing import Any, Callable, Dict, Iterable, List, Literal, Optional, Tuple
 
 import numpy as np
 import torch
@@ -361,7 +361,7 @@ def auto_packing_ratio(dataloader_cfg: DictConfig,
 
 
 def profile_packing(
-    dataloader_cfg: DictConfig,
+    dataloader_cfg: Dict[str, Any],
     tokenizer: PreTrainedTokenizerBase,
     min_ratio: float,
     max_ratio: float,
@@ -385,7 +385,7 @@ def profile_packing(
 
     from llmfoundry.data.dataloader import build_dataloader
 
-    dataset_cfg = dataloader_cfg.dataset
+    dataset_cfg = dataloader_cfg['dataset']
     max_seq_len = dataset_cfg.get('max_seq_len')
     max_leftovers_to_keep = dataset_cfg.get('max_leftovers_to_keep', None)
 
@@ -397,22 +397,22 @@ def profile_packing(
         'prefetch_factor': None,
         'persistent_workers': False,
     })
-    dataloader_cfg.dataset.packing_ratio = 1.0
+    dataloader_cfg['dataset_cfg']['packing_ratio'] = 1.0
 
     # If streaming dataset, use a temporary local folder for profiling
     local_rank_zero = dist.get_global_rank() - dist.get_local_rank()
-    if dataloader_cfg.dataset.get('remote') is not None:
+    if dataloader_cfg['dataset'].get('remote') is not None:
         tmp_path_to_broadcast = tempfile.TemporaryDirectory().name
         gathered_paths = dist.all_gather_object(tmp_path_to_broadcast)
         tmp_path = gathered_paths[local_rank_zero]
-        dataloader_cfg.dataset.local = tmp_path
+        dataloader_cfg['dataset']['local'] = tmp_path
 
-    if dataloader_cfg.dataset.get('streams') is not None:
-        for stream_config in dataloader_cfg.dataset.streams.values():
+    if dataloader_cfg['dataset'].get('streams') is not None:
+        for stream_config in dataloader_cfg['dataset']['streams'].values():
             tmp_path_to_broadcast = tempfile.TemporaryDirectory().name
             gathered_paths = dist.all_gather_object(tmp_path_to_broadcast)
             tmp_path = gathered_paths[local_rank_zero]
-            stream_config.local = tmp_path
+            stream_config['local'] = tmp_path
 
     # Determine the packing_ratio values we'll try
     packing_ratios, raw_batch_sizes = [], []
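
The profiling hunks above rewrite nested entries of the dataloader config (the dataset's `local` path and each stream's `local` path) through dictionary indexing. A minimal sketch of that nested-dict mutation under an assumed config shape; it only mirrors the keys shown in the diff and is not the repository's code:

    import copy
    import tempfile

    # Assumed config shape, for illustration only.
    dataloader_cfg = {
        'dataset': {
            'remote': 's3://bucket/data',
            'streams': {'a': {'remote': 's3://bucket/a'}, 'b': {'remote': 's3://bucket/b'}},
        },
    }

    profiling_cfg = copy.deepcopy(dataloader_cfg)   # avoid mutating the caller's config
    tmp_path = tempfile.mkdtemp()

    if profiling_cfg['dataset'].get('remote') is not None:
        profiling_cfg['dataset']['local'] = tmp_path

    for stream_config in profiling_cfg['dataset'].get('streams', {}).values():
        stream_config['local'] = tmp_path

    print(profiling_cfg['dataset']['local'])
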
29 changes: 15 additions & 14 deletions llmfoundry/utils/builders.py
@@ -29,6 +29,7 @@
 from llmfoundry.eval.datasets.in_context_learning_evaluation import \
     get_icl_task_dataloader
 from llmfoundry.tokenizers.tiktoken import TiktokenTokenizerWrapper
+from llmfoundry.utils.config_utils import to_str_dict
 from llmfoundry.utils.registry_utils import construct_from_registry
 from llmfoundry.utils.warnings import VersionedDeprecationWarning
 
@@ -50,9 +51,9 @@
 
 
 def build_evaluators(
-    eval_loader_config: Optional[Union[DictConfig, ListConfig]],
-    icl_tasks_config: Optional[Union[str, ListConfig]],
-    eval_gauntlet_config: Optional[Union[str, DictConfig]],
+    eval_loader_config: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]],
+    icl_tasks_config: Optional[Union[str, List[Dict[str, Any]]]],
+    eval_gauntlet_config: Optional[Union[str, Dict[str, Any]]],
     *,
     tokenizer: PreTrainedTokenizerBase,
     device_eval_batch_size: int,
@@ -85,23 +86,23 @@ def build_evaluators(
 
 
 def build_eval_loaders(
-    eval_loader_config: Union[DictConfig, ListConfig],
+    eval_loader_config: Union[Dict[str, Any], List[Dict[str, Any]]],
    tokenizer: PreTrainedTokenizerBase,
    device_eval_batch_size: int,
 ) -> List[Evaluator]:
     evaluators: List[Evaluator] = []
-    if isinstance(eval_loader_config, ListConfig):
-        eval_configs: ListConfig = eval_loader_config
+    if isinstance(eval_loader_config, list):
+        eval_configs = eval_loader_config
         is_multi_eval = True
     else:
-        eval_configs = ListConfig([eval_loader_config])
+        eval_configs = [eval_loader_config]
         is_multi_eval = False
 
     for eval_config in eval_configs:
         eval_dataloader = build_dataloader(eval_config, tokenizer,
                                            device_eval_batch_size)
         eval_loader: Evaluator = Evaluator(
-            label=f'eval/{eval_config.label}' if is_multi_eval else 'eval',
+            label=f"eval/{eval_config['label']}" if is_multi_eval else 'eval',
             dataloader=eval_dataloader,
             # Load the eval data to fail fast. metrics will get added
             # later in add_metrics_to_eval_loaders, after the model is loaded
@@ -129,8 +130,8 @@ def add_metrics_to_eval_loaders(
 
 
 def build_icl_data_and_gauntlet(
-    icl_tasks_config: Union[str, ListConfig],
-    eval_gauntlet_config: Optional[Union[str, DictConfig]],
+    icl_tasks_config: Union[str, List[Dict[str, Any]]],
+    eval_gauntlet_config: Optional[Union[str, Dict[str, Any]]],
     tokenizer: PreTrainedTokenizerBase,
     device_eval_batch_size: int,
     icl_seq_len: int,
@@ -147,15 +148,15 @@ def build_icl_data_and_gauntlet(
     if isinstance(eval_gauntlet_config, str):
         with open(eval_gauntlet_config, 'r') as icl_f:
             eval_gauntlet_cfg = om.load(icl_f)
-        eval_gauntlet = eval_gauntlet_cfg.eval_gauntlet
-    elif isinstance(eval_gauntlet_config, DictConfig):  # pyright: ignore
+        eval_gauntlet = to_str_dict(eval_gauntlet_cfg['eval_gauntlet'])
+    elif isinstance(eval_gauntlet_config, dict):  # pyright: ignore
         eval_gauntlet = eval_gauntlet_config
     else:
         raise ValueError(
             f'Got invalid type for eval_gauntlet_config: {type(eval_gauntlet_config)}'
         )
-    eval_gauntlet.logger_keys = logger_keys
-    eval_gauntlet.benchmark_sizes = {
+    eval_gauntlet['logger_keys'] = logger_keys
+    eval_gauntlet['benchmark_sizes'] = {
         e.label: e.dataloader.num_samples for e in icl_evaluators
     }
     eval_gauntlet_cb = EvalGauntlet(**eval_gauntlet)
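
In `build_eval_loaders`, the isinstance check now normalizes a single config dict and a list of config dicts into one iterable shape. A minimal, self-contained sketch of that normalization pattern; the function name is illustrative, not part of the repository:

    from typing import Any, Dict, List, Union

    def normalize_eval_configs(
        eval_loader_config: Union[Dict[str, Any], List[Dict[str, Any]]],
    ) -> List[Dict[str, Any]]:
        # Wrap a single config in a list so callers can always iterate.
        if isinstance(eval_loader_config, list):
            return eval_loader_config
        return [eval_loader_config]

    print(normalize_eval_configs({'label': 'val'}))
    print(normalize_eval_configs([{'label': 'a'}, {'label': 'b'}]))
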
12 changes: 5 additions & 7 deletions scripts/eval/eval.py
@@ -262,10 +262,8 @@ def main(cfg: DictConfig) -> Tuple[List[Trainer], pd.DataFrame]:
     for code_path in (eval_config.code_paths or []):
         import_file(code_path)
 
-    model_configs = ListConfig(eval_config.models)
-    eval_gauntlet_config = DictConfig(
-        eval_config.eval_gauntlet
-    ) if eval_config.eval_gauntlet else eval_config.eval_gauntlet_str
+    model_configs = eval_config.models
+    eval_gauntlet_config = eval_config.eval_gauntlet if eval_config.eval_gauntlet else eval_config.eval_gauntlet_str
 
     # the below line fixes a strange issue where the fsdp_config is a DictConfig rather than a Dict,
     # despite the type hint being Dict[str, Any] and the `cfg` object being sent to `to_container`.
@@ -280,9 +278,9 @@ def main(cfg: DictConfig) -> Tuple[List[Trainer], pd.DataFrame]:
     } if fsdp_config else None  # pyright fix
 
     # Mandatory Evaluation Parameters
-    icl_tasks: Union[ListConfig, str, None] = ListConfig(
-        eval_config.icl_tasks
-    ) if eval_config.icl_tasks else eval_config.icl_tasks_str
+    icl_tasks: Union[
+        ListConfig, str,
+        None] = eval_config.icl_tasks if eval_config.icl_tasks else eval_config.icl_tasks_str
     assert icl_tasks is not None, 'icl_tasks must be specified in the config'
 
     # Optional Evaluation Parameters with default values
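
Dropping the explicit `ListConfig(...)`/`DictConfig(...)` wrapping here relies on the config already being resolved to native Python containers at the entry point. A hedged sketch of that one-time conversion with omegaconf's `to_container` (mentioned in the comment above); the config values are invented placeholders:

    # Assumes omegaconf is installed; the config values are placeholders.
    from omegaconf import OmegaConf

    cfg = OmegaConf.create({
        'icl_tasks': [{'label': 'task_a', 'num_fewshot': [0]}],
        'eval_gauntlet': {'weighting': 'EQUAL'},
    })

    resolved = OmegaConf.to_container(cfg, resolve=True)
    assert isinstance(resolved, dict)
    icl_tasks = resolved['icl_tasks']           # already a plain list
    eval_gauntlet = resolved['eval_gauntlet']   # already a plain dict
    print(type(icl_tasks).__name__, type(eval_gauntlet).__name__)
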
