Commit 4b2cfc1

precommit

dakinggg committed Aug 24, 2023
1 parent 141b4c6
Showing 2 changed files with 11 additions and 6 deletions.
llmfoundry/data/finetuning/dataloader.py (10 changes: 6 additions & 4 deletions)

@@ -1,15 +1,15 @@
 # Copyright 2022 MosaicML LLM Foundry authors
 # SPDX-License-Identifier: Apache-2.0
-from typing import Union
 import logging
 import os
+from typing import Union
 
+import datasets as hf_datasets
 import torch
 from composer.utils import dist, get_file, parse_uri
 from omegaconf import DictConfig
 from torch.utils.data import DataLoader
 from transformers import PreTrainedTokenizerBase
-import datasets as hf_datasets
 
 from llmfoundry.data.finetuning.collator import Seq2SeqFinetuningCollator
 from llmfoundry.data.finetuning.tasks import dataset_constructor
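The commit message "precommit" suggests these are mechanical changes from an automated formatting hook rather than behavioral ones. The resulting import order matches the usual isort convention; that attribution is an assumption, since the repo's pre-commit config is not part of this diff. Annotated, the final import block reads:

# Final import block from the hunk above, annotated with the (assumed)
# isort grouping rules that would produce it.

# Group 1: standard library, alphabetized.
import logging
import os
from typing import Union

# Group 2: third-party packages. Plain 'import' statements come first,
# alphabetized ('datasets' sorts before 'torch', which is why
# 'import datasets as hf_datasets' moved up), then 'from' imports.
import datasets as hf_datasets
import torch
from composer.utils import dist, get_file, parse_uri
from omegaconf import DictConfig
from torch.utils.data import DataLoader
from transformers import PreTrainedTokenizerBase

# Group 3: first-party (llmfoundry) imports last.
from llmfoundry.data.finetuning.collator import Seq2SeqFinetuningCollator
from llmfoundry.data.finetuning.tasks import dataset_constructor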
@@ -255,8 +255,10 @@ def _validate_config(dataset_cfg: DictConfig):
     )
 
 
-def _build_hf_dataset_from_remote(cfg: DictConfig,
-                                  tokenizer: PreTrainedTokenizerBase) -> Union[hf_datasets.DatasetDict, hf_datasets.Dataset, hf_datasets.IterableDatasetDict, hf_datasets.IterableDataset]:
+def _build_hf_dataset_from_remote(
+        cfg: DictConfig, tokenizer: PreTrainedTokenizerBase
+) -> Union[hf_datasets.DatasetDict, hf_datasets.Dataset,
+           hf_datasets.IterableDatasetDict, hf_datasets.IterableDataset]:
     """Builds a dataset from a remote object store.
 
     This function supports 'jsonl', 'csv', and 'parquet' file formats for the dataset. It will attempt to download
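The rewrapped return annotation spans two lines because the Union covers all four dataset flavors that hf_datasets.load_dataset can return. A minimal sketch of an alternative (not part of this commit): naming the Union once with a hypothetical alias, here called HFDatasetTypes.

# Hypothetical refactor sketch; HFDatasetTypes is an invented name,
# not something this commit introduces.
from typing import Union

import datasets as hf_datasets
from omegaconf import DictConfig
from transformers import PreTrainedTokenizerBase

# The four dataset flavors hf_datasets.load_dataset() can return.
HFDatasetTypes = Union[hf_datasets.DatasetDict, hf_datasets.Dataset,
                       hf_datasets.IterableDatasetDict,
                       hf_datasets.IterableDataset]


def _build_hf_dataset_from_remote(
        cfg: DictConfig,
        tokenizer: PreTrainedTokenizerBase) -> HFDatasetTypes:
    ...  # body elided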
llmfoundry/data/finetuning/tasks.py (7 changes: 5 additions & 2 deletions)

@@ -254,8 +254,11 @@ def get_preprocessing_fn_from_str(self,
 
         return preprocessing_fn
 
-    def build_from_hf(self, cfg: DictConfig, max_seq_len: int,
-                      tokenizer: PreTrainedTokenizerBase) -> Union[hf_datasets.DatasetDict, hf_datasets.Dataset, hf_datasets.IterableDatasetDict, hf_datasets.IterableDataset]:
+    def build_from_hf(
+            self, cfg: DictConfig, max_seq_len: int,
+            tokenizer: PreTrainedTokenizerBase
+    ) -> Union[hf_datasets.DatasetDict, hf_datasets.Dataset,
+               hf_datasets.IterableDatasetDict, hf_datasets.IterableDataset]:
         """Load a HuggingFace Datasets, preprocess, and tokenize.
 
         Note: This function will drop examples where the prompt is longer than the max_seq_len
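The docstring's note is the behavioral detail worth keeping in mind: examples whose prompt alone exceeds max_seq_len are dropped. A minimal sketch of that filter, under stated assumptions (that the prompt lives under a 'prompt' key and length is measured in tokens; the real implementation in tasks.py is not shown in this diff):

from transformers import PreTrainedTokenizerBase


def prompt_fits(example: dict, tokenizer: PreTrainedTokenizerBase,
                max_seq_len: int) -> bool:
    # Keep only examples whose tokenized prompt leaves room for a
    # response inside the max_seq_len context window.
    prompt_ids = tokenizer(example['prompt'])['input_ids']
    return len(prompt_ids) < max_seq_len

# Hypothetical usage on a datasets.Dataset named dset:
#   dset = dset.filter(lambda ex: prompt_fits(ex, tokenizer, max_seq_len))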
