
Commit

training script improved
AdityaNG committed May 2, 2024
1 parent 58e064d commit c17aa64
Showing 14 changed files with 1,901 additions and 1,002 deletions.
58 changes: 31 additions & 27 deletions kan_gpt/dataset.py
@@ -1,15 +1,16 @@
 import os
 import pickle
-import pandas as pd
+
+import pandas as pd
 import torch
 from torch.utils.data import Dataset
-from transformers import GPT2Tokenizer, GPT2LMHeadModel
+from transformers import GPT2LMHeadModel, GPT2Tokenizer

 from kan_gpt.mingpt.bpe import BPETokenizer


 class WebTextDataset(Dataset):
-    """
+    """
     Dataset for the Sort problem. E.g. for problem length 6:
     Input: 0 0 2 1 0 1 -> Output: 0 0 0 1 1 2
     Which will feed into the transformer concatenated as:
@@ -19,7 +20,7 @@ class WebTextDataset(Dataset):
     """

     def __init__(self, split, model_type, block_size=1024, vocab_size=50257):
-        assert split in {'train', 'test', 'valid'}
+        assert split in {"train", "test", "valid"}

         self.split = split
         self.block_size = block_size
@@ -39,58 +40,61 @@ def __init__(self, split, model_type, block_size=1024, vocab_size=50257):
         tokenized_lengths = []

         for _, row in self.data.iterrows():
-            text = row['text']
+            text = row["text"]

-            tokenized = self.tokenizer.encode(text=text, add_special_tokens=False)
+            tokenized = self.tokenizer.encode(
+                text=text, add_special_tokens=False
+            )
             tokenized_length = len(tokenized)

             tokenized_data.append(tokenized)
             tokenized_lengths.append(tokenized_length)
-        self.data['tokenized'] = tokenized_data
-        self.data['tokenized_length'] = tokenized_lengths
+
+        self.data["tokenized"] = tokenized_data
+        self.data["tokenized_length"] = tokenized_lengths

         self.dataset = {
-            'x': [],
-            'y': [],
+            "x": [],
+            "y": [],
         }

         for _, row in self.data.iterrows():
-            tokenized = row['tokenized']
-            tokenized_length = row['tokenized_length']
+            tokenized = row["tokenized"]
+            tokenized_length = row["tokenized_length"]

-            for index in range(0, tokenized_length - (self.block_size-1), 1):
-                mid = index + (self.block_size-1)
+            for index in range(
+                0, tokenized_length - (self.block_size - 1), 1
+            ):
+                mid = index + (self.block_size - 1)
                 x = tokenized[index:mid]
-                y = tokenized[index+1:mid+1]
+                y = tokenized[index + 1 : mid + 1]
+
+                self.dataset["x"].append(x)
+                self.dataset["y"].append(y)

-                self.dataset['x'].append(x)
-                self.dataset['y'].append(y)
-
         # Write to pkl
-        with open(self.pickel_path, 'wb') as f:
+        with open(self.pickel_path, "wb") as f:
             pickle.dump(self.dataset, f)

         # Read from pkl
-        with open(self.pickel_path, 'rb') as f:
+        with open(self.pickel_path, "rb") as f:
             self.dataset = pickle.load(f)

         self.dataset = pd.DataFrame(self.dataset)


     def __len__(self):
         return len(self.dataset)

     def get_vocab_size(self):
         return self.vocab_size

     def get_block_size(self):
         return self.block_size

     def __getitem__(self, idx):
-        x = self.dataset['x'][idx]
-        y = self.dataset['y'][idx]
+
+        x = self.dataset["x"][idx]
+        y = self.dataset["y"][idx]

         x = torch.tensor(x, dtype=torch.long)
         y = torch.tensor(y, dtype=torch.long)
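
For reference, the loop in __init__ slides a window of block_size - 1 tokens over each tokenized article and pairs it with the same window shifted one position to the right, so every target y[t] is the token that follows x[t]. The standalone sketch below (not part of the commit; the helper name sliding_window_pairs and the toy token values are made up for illustration) reproduces that construction:

import torch


def sliding_window_pairs(tokenized, block_size):
    # Mirrors the construction in WebTextDataset.__init__ above:
    # x = tokenized[index:mid], y = tokenized[index + 1 : mid + 1],
    # with mid = index + (block_size - 1), so y is x shifted by one token.
    pairs = []
    for index in range(0, len(tokenized) - (block_size - 1), 1):
        mid = index + (block_size - 1)
        x = torch.tensor(tokenized[index:mid], dtype=torch.long)
        y = torch.tensor(tokenized[index + 1 : mid + 1], dtype=torch.long)
        pairs.append((x, y))
    return pairs


if __name__ == "__main__":
    tokens = [10, 11, 12, 13, 14, 15]  # stand-in for tokenizer.encode(text) output
    for x, y in sliding_window_pairs(tokens, block_size=4):
        print(x.tolist(), "->", y.tolist())
    # [10, 11, 12] -> [11, 12, 13]
    # [11, 12, 13] -> [12, 13, 14]
    # [12, 13, 14] -> [13, 14, 15]

Each window therefore yields block_size - 1 input tokens and their next-token targets, which __getitem__ returns as long tensors.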