feat: added model saving after every iteration

NillionNetwork · Aug 9, 2024 · efb1538 · efb1538
1 parent c3e0fbd
commit efb1538
Show file tree

Hide file tree

Showing 14 changed files with 140 additions and 33 deletions.
diff --git a/examples/README.md b/examples/README.md
@@ -0,0 +1,6 @@
+# Examples
+
+- Basic Logistic Regression: It uses a mock dataset and a Logistic Regression model. It can serve to test the speed of training and check various model parameters.
+- Basic Neural Network: Shows how to build a more complex model, yet simple enough. It uses two consecutive linear layers and a Sigmoid activation.
+- Basic Neural Network with 5 parties: Uses 5 parties to train a model on a distributed dataset.
+- Convolutional Neural Network: It uses a more complex CIFAR 10 model and permits training on it.
diff --git a/examples/conv_net/client.py b/examples/conv_net/client.py
@@ -1,4 +1,6 @@
 import argparse
+import os
+import uuid
 from collections import OrderedDict
 
 import numpy as np
@@ -23,6 +25,28 @@ def __init__(self, net, trainloader, valloader, config):
         self.criterion = nn.CrossEntropyLoss()
         self.optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)
 
+        self.__iteration = 0
+        self.path = os.path.join(f"/tmp/", f"model_{uuid.uuid4()}")
+        while os.path.exists(self.path):
+            self.path = os.path.join(f"/tmp/", f"model_{uuid.uuid4()}")
+        os.makedirs(self.path)
+
+    @property
+    def iteration(self):
+        self.__iteration += 1
+        return self.__iteration - 1
+
+    def save_model(self):
+        """
+        Saves the model to a file.
+
+        Returns:
+            None
+        """
+        path = os.path.join(self.path, f"iteration_{self.iteration}.pth")
+        logger.info(f"Saving model to {self.path}")
+        torch.save(self.net.state_dict(), path)
+
     def train(self):
         """
         Trains the network for the given number of epochs using the given optimizer and criterion.
@@ -49,11 +73,12 @@ def train(self):
 
                 # print statistics
                 running_loss += loss.item()
-                if i % 2000 == 1999:    # print every 2000 mini-batches
-                    print(f'[{epoch + 1}, {i + 1:5d}] loss: {running_loss / 2000:.3f}')
+                if i % 2000 == 1999:  # print every 2000 mini-batches
+                    print(f"[{epoch + 1}, {i + 1:5d}] loss: {running_loss / 2000:.3f}")
                     running_loss = 0.0
-    
+
         print(f"Epoch [{epoch+1}/{epochs}], Loss: {running_loss:.4f}")
+        self.save_model()
 
     def local_evaluate(self):
         """
@@ -76,7 +101,9 @@ def local_evaluate(self):
                 _, predicted = torch.max(outputs.data, 1)
                 total += labels.size(0)
                 correct += (predicted == labels).sum().item()
-        logger.warning(f"Accuracy of the network on the {total} validation images: {100 * correct / total}%")
+        logger.warning(
+            f"Accuracy of the network on the {total} validation images: {100 * correct / total}%"
+        )
         return correct / total
 
 

diff --git a/examples/conv_net/dataset.py b/examples/conv_net/dataset.py
@@ -9,16 +9,18 @@
 def load_datasets(num_clients: int):
     # Download and transform CIFAR-10 (train and test)
     transform = transforms.Compose(
-        [transforms.ToTensor(),
-        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])
+        [transforms.ToTensor(), transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))]
+    )
     trainset = CIFAR10("./dataset", train=True, download=True, transform=transform)
     testset = CIFAR10("./dataset", train=False, download=True, transform=transform)
 
     # Split training set into `num_clients` partitions to simulate different local datasets
     if 1 == 1:
-        partition_size = 4000 # For faster training on local datasets
+        partition_size = 4000  # For faster training on local datasets
         num_partitions = len(trainset) // partition_size
-        lengths = [partition_size] * num_partitions + [len(trainset) - (partition_size * num_partitions)]
+        lengths = [partition_size] * num_partitions + [
+            len(trainset) - (partition_size * num_partitions)
+        ]
 
     else:
         partition_size = len(trainset) // num_clients
@@ -36,5 +38,3 @@ def load_datasets(num_clients: int):
         trainloaders.append(DataLoader(ds_train, shuffle=True))
         valloaders.append(DataLoader(ds_val))
     return trainloaders, valloaders
-
-
diff --git a/examples/conv_net/model.py b/examples/conv_net/model.py
@@ -1,6 +1,6 @@
 import torch
-from torch import nn, optim
 import torch.nn.functional as F
+from torch import nn, optim
 
 
 class NeuralNet(nn.Module):
@@ -16,14 +16,13 @@ def __init__(self):
     def forward(self, x):
         x = self.pool(F.relu(self.conv1(x)))
         x = self.pool(F.relu(self.conv2(x)))
-        x = torch.flatten(x, 1) # flatten all dimensions except batch
+        x = torch.flatten(x, 1)  # flatten all dimensions except batch
         x = F.relu(self.fc1(x))
         x = F.relu(self.fc2(x))
         x = self.fc3(x)
         return x
 
 
-
 if __name__ == "__main__":
     from examples.conv_net.dataset import load_datasets
 
@@ -37,7 +36,9 @@ def forward(self, x):
     optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
 
     # Generate data
-    trainloaders, valloaders = load_datasets(1)  # We're using only one client for this example
+    trainloaders, valloaders = load_datasets(
+        1
+    )  # We're using only one client for this example
 
     for epoch in range(num_epochs):
         running_loss = 0.0
@@ -56,12 +57,12 @@ def forward(self, x):
 
             # print statistics
             running_loss += loss.item()
-            if i % 2000 == 1999:    # print every 2000 mini-batches
-                print(f'[{epoch + 1}, {i + 1:5d}] loss: {running_loss / 2000:.3f}')
+            if i % 2000 == 1999:  # print every 2000 mini-batches
+                print(f"[{epoch + 1}, {i + 1:5d}] loss: {running_loss / 2000:.3f}")
                 running_loss = 0.0
-    
+
         print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {running_loss:.4f}")
-    
+
         correct = 0
         total = 0
         # since we're not training, we don't need to calculate the gradients for our outputs
@@ -75,6 +76,8 @@ def forward(self, x):
                 total += labels.size(0)
                 correct += (predicted == labels).sum().item()
 
-        print(f'Accuracy of the network on the 10000 test images: {100 * correct // total} %')
+        print(
+            f"Accuracy of the network on the 10000 test images: {100 * correct // total} %"
+        )
     print("Finished Training")
     print(model.state_dict())
diff --git a/examples/logistic_regression/client.py b/examples/logistic_regression/client.py
@@ -1,4 +1,6 @@
 import argparse
+import os
+import uuid
 from collections import OrderedDict
 
 import numpy as np
@@ -25,6 +27,28 @@ def __init__(self, net, trainloader, valloader, config):
             self.net.parameters(), lr=self.config["learning_rate"]
         )
 
+        self.__iteration = 0
+        self.path = os.path.join(f"/tmp/", f"model_{uuid.uuid4()}")
+        while os.path.exists(self.path):
+            self.path = os.path.join(f"/tmp/", f"model_{uuid.uuid4()}")
+        os.makedirs(self.path)
+
+    @property
+    def iteration(self):
+        self.__iteration += 1
+        return self.__iteration - 1
+
+    def save_model(self):
+        """
+        Saves the model to a file.
+
+        Returns:
+            None
+        """
+        path = os.path.join(self.path, f"iteration_{self.iteration}.pth")
+        logger.info(f"Saving model to {self.path}")
+        torch.save(self.net.state_dict(), path)
+
     def train(self):
         """
         Trains the network for the given number of epochs using the given optimizer and criterion.
@@ -53,6 +77,7 @@ def train(self):
 
             avg_loss = total_loss / len(self.trainloader)
             logger.warning(f"Epoch [{epoch+1}/{epochs}], Loss: {avg_loss:.4f}")
+        self.save_model()
 
     def local_evaluate(self):
         """

diff --git a/examples/neural_net/client.py b/examples/neural_net/client.py
@@ -1,4 +1,6 @@
 import argparse
+import os
+import uuid
 from collections import OrderedDict
 
 import numpy as np
@@ -25,6 +27,28 @@ def __init__(self, net, trainloader, valloader, config):
             self.net.parameters(), lr=self.config["learning_rate"]
         )
 
+        self.__iteration = 0
+        self.path = os.path.join(f"/tmp/", f"model_{uuid.uuid4()}")
+        while os.path.exists(self.path):
+            self.path = os.path.join(f"/tmp/", f"model_{uuid.uuid4()}")
+        os.makedirs(self.path)
+
+    @property
+    def iteration(self):
+        self.__iteration += 1
+        return self.__iteration - 1
+
+    def save_model(self):
+        """
+        Saves the model to a file.
+
+        Returns:
+            None
+        """
+        path = os.path.join(self.path, f"iteration_{self.iteration}.pth")
+        logger.info(f"Saving model to {self.path}")
+        torch.save(self.net.state_dict(), path)
+
     def train(self):
         """
         Trains the network for the given number of epochs using the given optimizer and criterion.
@@ -53,6 +77,7 @@ def train(self):
 
             avg_loss = total_loss / len(self.trainloader)
             logger.warning(f"Epoch [{epoch+1}/{epochs}], Loss: {avg_loss:.4f}")
+        self.save_model()
 
     def local_evaluate(self):
         """

diff --git a/examples/neural_net/dataset.py b/examples/neural_net/dataset.py
@@ -37,9 +37,7 @@ def load_datasets(
     num_samples: int = 1000,
     num_features: int = 10,
 ):
-    dataset = NeuralNetDataset(
-        num_samples, num_features
-    )  # 1000 samples, 10 features
+    dataset = NeuralNetDataset(num_samples, num_features)  # 1000 samples, 10 features
 
     # Split dataset into `num_clients` partitions
     partition_size = len(dataset) // num_clients

diff --git a/examples/neural_net/model.py b/examples/neural_net/model.py
@@ -61,7 +61,7 @@ def forward(self, x):
                 predicted = (predicted > 0.5).float()
                 total += predicted.size(0)
                 correct += (predicted == labels).sum().item()
-                #if predicted != labels:
+                # if predicted != labels:
                 #    print(f"Predicted: {predicted}, Actual: {labels}")
 
         accuracy = 100 * correct / total

diff --git a/examples/neural_net_5/client.py b/examples/neural_net_5/client.py
@@ -1,4 +1,6 @@
 import argparse
+import os
+import uuid
 from collections import OrderedDict
 
 import numpy as np
@@ -25,6 +27,28 @@ def __init__(self, net, trainloader, valloader, config):
             self.net.parameters(), lr=self.config["learning_rate"]
         )
 
+        self.__iteration = 0
+        self.path = os.path.join(f"/tmp/", f"model_{uuid.uuid4()}")
+        while os.path.exists(self.path):
+            self.path = os.path.join(f"/tmp/", f"model_{uuid.uuid4()}")
+        os.makedirs(self.path)
+
+    @property
+    def iteration(self):
+        self.__iteration += 1
+        return self.__iteration - 1
+
+    def save_model(self):
+        """
+        Saves the model to a file.
+
+        Returns:
+            None
+        """
+        path = os.path.join(self.path, f"iteration_{self.iteration}.pth")
+        logger.info(f"Saving model to {self.path}")
+        torch.save(self.net.state_dict(), path)
+
     def train(self):
         """
         Trains the network for the given number of epochs using the given optimizer and criterion.
@@ -54,6 +78,8 @@ def train(self):
             avg_loss = total_loss / len(self.trainloader)
             logger.warning(f"Epoch [{epoch+1}/{epochs}], Loss: {avg_loss:.4f}")
 
+        self.save_model()
+
     def local_evaluate(self):
         """
         Evaluates the network on the validation dataset and returns the accuracy.

diff --git a/examples/neural_net_5/dataset.py b/examples/neural_net_5/dataset.py
@@ -37,9 +37,7 @@ def load_datasets(
     num_samples: int = 1000,
     num_features: int = 10,
 ):
-    dataset = NeuralNetDataset(
-        num_samples, num_features
-    )  # 1000 samples, 10 features
+    dataset = NeuralNetDataset(num_samples, num_features)  # 1000 samples, 10 features
 
     # Split dataset into `num_clients` partitions
     partition_size = len(dataset) // num_clients

diff --git a/examples/neural_net_5/model.py b/examples/neural_net_5/model.py
@@ -61,7 +61,7 @@ def forward(self, x):
                 predicted = (predicted > 0.5).float()
                 total += predicted.size(0)
                 correct += (predicted == labels).sum().item()
-                #if predicted != labels:
+                # if predicted != labels:
                 #    print(f"Predicted: {predicted}, Actual: {labels}")
 
         accuracy = 100 * correct / total

diff --git a/nillion_fl/client.py b/nillion_fl/client.py
@@ -61,8 +61,6 @@ def learning_iteration(self, learning_request):
         if modulo > 0:
             expected_results += 1
 
-
-
         def store_secrets_thread():
             """
             Thread function to store secrets.
@@ -82,7 +80,7 @@ def store_secrets_thread():
         new_parameters = self.nillion_client.get_compute_result(expected_results)
         new_parameters = sorted(new_parameters, key=lambda x: x[0])
         new_parameters = np.concatenate([x[1] for x in new_parameters])
-        self.parameters = new_parameters[:self.num_parameters]
+        self.parameters = new_parameters[: self.num_parameters]
 
         # Wait for the secrets storage thread to finish
         logger.debug("Waiting for thread to join()...")

diff --git a/nillion_fl/nilvm/client.py b/nillion_fl/nilvm/client.py
@@ -177,7 +177,7 @@ async def get_compute_results_from_nillion(self, num_results):
                 logger.debug(
                     "✅  Compute complete for compute_id %s", compute_event.uuid
                 )
-                #logger.debug("🖥️  The result is %s", compute_event.result.value)
+                # logger.debug("🖥️  The result is %s", compute_event.result.value)
         return compute_results
 
     async def get_compute_result(self, num_results=1):

diff --git a/nillion_fl/nilvm/fed_avg/src/custom_fed_avg.py b/nillion_fl/nilvm/fed_avg/src/custom_fed_avg.py
@@ -4,10 +4,11 @@
 
 # Step 0: Nada Numpy is imported with this line
 import nada_numpy as na
-from nada_dsl import Output, PublicInteger, Input, SecretInteger
+from nada_dsl import Input, Output, PublicInteger, SecretInteger
 
 NUM_PARTIES = 2
-DIM = 2500
+DIM = 500
+
 
 def nada_main() -> List[Output]:
     """