Skip to content

Commit

Permalink
feat: added model saving after every iteration
Browse files Browse the repository at this point in the history
  • Loading branch information
jcabrero committed Aug 9, 2024
1 parent c3e0fbd commit efb1538
Show file tree
Hide file tree
Showing 14 changed files with 140 additions and 33 deletions.
6 changes: 6 additions & 0 deletions examples/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# Examples

- Basic Logistic Regression: It uses a mock dataset and a Logistic Regression model. It can serve to test the speed of training and check various model parameters.
- Basic Neural Network: Shows how to build a more complex model, yet simple enough. It uses two consecutive linear layers and a Sigmoid activation.
- Basic Neural Network with 5 parties: Uses 5 parties to train a model on a distributed dataset.
- Convolutional Neural Network: It uses a more complex CIFAR 10 model and permits training on it.
35 changes: 31 additions & 4 deletions examples/conv_net/client.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
import argparse
import os
import uuid
from collections import OrderedDict

import numpy as np
Expand All @@ -23,6 +25,28 @@ def __init__(self, net, trainloader, valloader, config):
self.criterion = nn.CrossEntropyLoss()
self.optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)

self.__iteration = 0
self.path = os.path.join(f"/tmp/", f"model_{uuid.uuid4()}")
while os.path.exists(self.path):
self.path = os.path.join(f"/tmp/", f"model_{uuid.uuid4()}")
os.makedirs(self.path)

@property
def iteration(self):
self.__iteration += 1
return self.__iteration - 1

def save_model(self):
"""
Saves the model to a file.
Returns:
None
"""
path = os.path.join(self.path, f"iteration_{self.iteration}.pth")
logger.info(f"Saving model to {self.path}")
torch.save(self.net.state_dict(), path)

def train(self):
"""
Trains the network for the given number of epochs using the given optimizer and criterion.
Expand All @@ -49,11 +73,12 @@ def train(self):

# print statistics
running_loss += loss.item()
if i % 2000 == 1999: # print every 2000 mini-batches
print(f'[{epoch + 1}, {i + 1:5d}] loss: {running_loss / 2000:.3f}')
if i % 2000 == 1999: # print every 2000 mini-batches
print(f"[{epoch + 1}, {i + 1:5d}] loss: {running_loss / 2000:.3f}")
running_loss = 0.0

print(f"Epoch [{epoch+1}/{epochs}], Loss: {running_loss:.4f}")
self.save_model()

def local_evaluate(self):
"""
Expand All @@ -76,7 +101,9 @@ def local_evaluate(self):
_, predicted = torch.max(outputs.data, 1)
total += labels.size(0)
correct += (predicted == labels).sum().item()
logger.warning(f"Accuracy of the network on the {total} validation images: {100 * correct / total}%")
logger.warning(
f"Accuracy of the network on the {total} validation images: {100 * correct / total}%"
)
return correct / total


Expand Down
12 changes: 6 additions & 6 deletions examples/conv_net/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,16 +9,18 @@
def load_datasets(num_clients: int):
# Download and transform CIFAR-10 (train and test)
transform = transforms.Compose(
[transforms.ToTensor(),
transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])
[transforms.ToTensor(), transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))]
)
trainset = CIFAR10("./dataset", train=True, download=True, transform=transform)
testset = CIFAR10("./dataset", train=False, download=True, transform=transform)

# Split training set into `num_clients` partitions to simulate different local datasets
if 1 == 1:
partition_size = 4000 # For faster training on local datasets
partition_size = 4000 # For faster training on local datasets
num_partitions = len(trainset) // partition_size
lengths = [partition_size] * num_partitions + [len(trainset) - (partition_size * num_partitions)]
lengths = [partition_size] * num_partitions + [
len(trainset) - (partition_size * num_partitions)
]

else:
partition_size = len(trainset) // num_clients
Expand All @@ -36,5 +38,3 @@ def load_datasets(num_clients: int):
trainloaders.append(DataLoader(ds_train, shuffle=True))
valloaders.append(DataLoader(ds_val))
return trainloaders, valloaders


21 changes: 12 additions & 9 deletions examples/conv_net/model.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import torch
from torch import nn, optim
import torch.nn.functional as F
from torch import nn, optim


class NeuralNet(nn.Module):
Expand All @@ -16,14 +16,13 @@ def __init__(self):
def forward(self, x):
x = self.pool(F.relu(self.conv1(x)))
x = self.pool(F.relu(self.conv2(x)))
x = torch.flatten(x, 1) # flatten all dimensions except batch
x = torch.flatten(x, 1) # flatten all dimensions except batch
x = F.relu(self.fc1(x))
x = F.relu(self.fc2(x))
x = self.fc3(x)
return x



if __name__ == "__main__":
from examples.conv_net.dataset import load_datasets

Expand All @@ -37,7 +36,9 @@ def forward(self, x):
optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)

# Generate data
trainloaders, valloaders = load_datasets(1) # We're using only one client for this example
trainloaders, valloaders = load_datasets(
1
) # We're using only one client for this example

for epoch in range(num_epochs):
running_loss = 0.0
Expand All @@ -56,12 +57,12 @@ def forward(self, x):

# print statistics
running_loss += loss.item()
if i % 2000 == 1999: # print every 2000 mini-batches
print(f'[{epoch + 1}, {i + 1:5d}] loss: {running_loss / 2000:.3f}')
if i % 2000 == 1999: # print every 2000 mini-batches
print(f"[{epoch + 1}, {i + 1:5d}] loss: {running_loss / 2000:.3f}")
running_loss = 0.0

print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {running_loss:.4f}")

correct = 0
total = 0
# since we're not training, we don't need to calculate the gradients for our outputs
Expand All @@ -75,6 +76,8 @@ def forward(self, x):
total += labels.size(0)
correct += (predicted == labels).sum().item()

print(f'Accuracy of the network on the 10000 test images: {100 * correct // total} %')
print(
f"Accuracy of the network on the 10000 test images: {100 * correct // total} %"
)
print("Finished Training")
print(model.state_dict())
25 changes: 25 additions & 0 deletions examples/logistic_regression/client.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
import argparse
import os
import uuid
from collections import OrderedDict

import numpy as np
Expand All @@ -25,6 +27,28 @@ def __init__(self, net, trainloader, valloader, config):
self.net.parameters(), lr=self.config["learning_rate"]
)

self.__iteration = 0
self.path = os.path.join(f"/tmp/", f"model_{uuid.uuid4()}")
while os.path.exists(self.path):
self.path = os.path.join(f"/tmp/", f"model_{uuid.uuid4()}")
os.makedirs(self.path)

@property
def iteration(self):
self.__iteration += 1
return self.__iteration - 1

def save_model(self):
"""
Saves the model to a file.
Returns:
None
"""
path = os.path.join(self.path, f"iteration_{self.iteration}.pth")
logger.info(f"Saving model to {self.path}")
torch.save(self.net.state_dict(), path)

def train(self):
"""
Trains the network for the given number of epochs using the given optimizer and criterion.
Expand Down Expand Up @@ -53,6 +77,7 @@ def train(self):

avg_loss = total_loss / len(self.trainloader)
logger.warning(f"Epoch [{epoch+1}/{epochs}], Loss: {avg_loss:.4f}")
self.save_model()

def local_evaluate(self):
"""
Expand Down
25 changes: 25 additions & 0 deletions examples/neural_net/client.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
import argparse
import os
import uuid
from collections import OrderedDict

import numpy as np
Expand All @@ -25,6 +27,28 @@ def __init__(self, net, trainloader, valloader, config):
self.net.parameters(), lr=self.config["learning_rate"]
)

self.__iteration = 0
self.path = os.path.join(f"/tmp/", f"model_{uuid.uuid4()}")
while os.path.exists(self.path):
self.path = os.path.join(f"/tmp/", f"model_{uuid.uuid4()}")
os.makedirs(self.path)

@property
def iteration(self):
self.__iteration += 1
return self.__iteration - 1

def save_model(self):
"""
Saves the model to a file.
Returns:
None
"""
path = os.path.join(self.path, f"iteration_{self.iteration}.pth")
logger.info(f"Saving model to {self.path}")
torch.save(self.net.state_dict(), path)

def train(self):
"""
Trains the network for the given number of epochs using the given optimizer and criterion.
Expand Down Expand Up @@ -53,6 +77,7 @@ def train(self):

avg_loss = total_loss / len(self.trainloader)
logger.warning(f"Epoch [{epoch+1}/{epochs}], Loss: {avg_loss:.4f}")
self.save_model()

def local_evaluate(self):
"""
Expand Down
4 changes: 1 addition & 3 deletions examples/neural_net/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,9 +37,7 @@ def load_datasets(
num_samples: int = 1000,
num_features: int = 10,
):
dataset = NeuralNetDataset(
num_samples, num_features
) # 1000 samples, 10 features
dataset = NeuralNetDataset(num_samples, num_features) # 1000 samples, 10 features

# Split dataset into `num_clients` partitions
partition_size = len(dataset) // num_clients
Expand Down
2 changes: 1 addition & 1 deletion examples/neural_net/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ def forward(self, x):
predicted = (predicted > 0.5).float()
total += predicted.size(0)
correct += (predicted == labels).sum().item()
#if predicted != labels:
# if predicted != labels:
# print(f"Predicted: {predicted}, Actual: {labels}")

accuracy = 100 * correct / total
Expand Down
26 changes: 26 additions & 0 deletions examples/neural_net_5/client.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
import argparse
import os
import uuid
from collections import OrderedDict

import numpy as np
Expand All @@ -25,6 +27,28 @@ def __init__(self, net, trainloader, valloader, config):
self.net.parameters(), lr=self.config["learning_rate"]
)

self.__iteration = 0
self.path = os.path.join(f"/tmp/", f"model_{uuid.uuid4()}")
while os.path.exists(self.path):
self.path = os.path.join(f"/tmp/", f"model_{uuid.uuid4()}")
os.makedirs(self.path)

@property
def iteration(self):
self.__iteration += 1
return self.__iteration - 1

def save_model(self):
"""
Saves the model to a file.
Returns:
None
"""
path = os.path.join(self.path, f"iteration_{self.iteration}.pth")
logger.info(f"Saving model to {self.path}")
torch.save(self.net.state_dict(), path)

def train(self):
"""
Trains the network for the given number of epochs using the given optimizer and criterion.
Expand Down Expand Up @@ -54,6 +78,8 @@ def train(self):
avg_loss = total_loss / len(self.trainloader)
logger.warning(f"Epoch [{epoch+1}/{epochs}], Loss: {avg_loss:.4f}")

self.save_model()

def local_evaluate(self):
"""
Evaluates the network on the validation dataset and returns the accuracy.
Expand Down
4 changes: 1 addition & 3 deletions examples/neural_net_5/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,9 +37,7 @@ def load_datasets(
num_samples: int = 1000,
num_features: int = 10,
):
dataset = NeuralNetDataset(
num_samples, num_features
) # 1000 samples, 10 features
dataset = NeuralNetDataset(num_samples, num_features) # 1000 samples, 10 features

# Split dataset into `num_clients` partitions
partition_size = len(dataset) // num_clients
Expand Down
2 changes: 1 addition & 1 deletion examples/neural_net_5/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ def forward(self, x):
predicted = (predicted > 0.5).float()
total += predicted.size(0)
correct += (predicted == labels).sum().item()
#if predicted != labels:
# if predicted != labels:
# print(f"Predicted: {predicted}, Actual: {labels}")

accuracy = 100 * correct / total
Expand Down
4 changes: 1 addition & 3 deletions nillion_fl/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,8 +61,6 @@ def learning_iteration(self, learning_request):
if modulo > 0:
expected_results += 1



def store_secrets_thread():
"""
Thread function to store secrets.
Expand All @@ -82,7 +80,7 @@ def store_secrets_thread():
new_parameters = self.nillion_client.get_compute_result(expected_results)
new_parameters = sorted(new_parameters, key=lambda x: x[0])
new_parameters = np.concatenate([x[1] for x in new_parameters])
self.parameters = new_parameters[:self.num_parameters]
self.parameters = new_parameters[: self.num_parameters]

# Wait for the secrets storage thread to finish
logger.debug("Waiting for thread to join()...")
Expand Down
2 changes: 1 addition & 1 deletion nillion_fl/nilvm/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -177,7 +177,7 @@ async def get_compute_results_from_nillion(self, num_results):
logger.debug(
"✅ Compute complete for compute_id %s", compute_event.uuid
)
#logger.debug("🖥️ The result is %s", compute_event.result.value)
# logger.debug("🖥️ The result is %s", compute_event.result.value)
return compute_results

async def get_compute_result(self, num_results=1):
Expand Down
5 changes: 3 additions & 2 deletions nillion_fl/nilvm/fed_avg/src/custom_fed_avg.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,11 @@

# Step 0: Nada Numpy is imported with this line
import nada_numpy as na
from nada_dsl import Output, PublicInteger, Input, SecretInteger
from nada_dsl import Input, Output, PublicInteger, SecretInteger

NUM_PARTIES = 2
DIM = 2500
DIM = 500


def nada_main() -> List[Output]:
"""
Expand Down

0 comments on commit efb1538

Please sign in to comment.