[ML Integration] Integration Branch Combining C++ and Python Work #23

Open · wants to merge 14 commits into main
2 changes: 2 additions & 0 deletions .gitignore
@@ -24,8 +24,10 @@ MODULE.bazel.lock
bazel-CloudMesh

data/*
subtaskIndex_*.txt
third_party/cppzmq*
third_party/libzmq*

# ml dependencies
*.pyc
CIFAR10/*.txt
2 changes: 1 addition & 1 deletion README.md
@@ -1,4 +1,4 @@
# CloudMesh
# CloudMesh

## Dependencies
Follow this to install bazel - https://bazel.build/install
2 changes: 2 additions & 0 deletions include/IPC/zmq_receiver.h
@@ -9,10 +9,12 @@ class ZMQReceiver {
unsigned int port;
zmq::context_t context;
zmq::socket_t socket;
std::string address;

public:
ZMQReceiver();
std::string receive();
std::string getAddress();
};

#endif // _ZMQ_RECEIVER_H_
4 changes: 3 additions & 1 deletion include/IPC/zmq_sender.h
@@ -1,18 +1,20 @@
#ifndef _ZMQ_SENDER_H_
#define _ZMQ_SENDER_H_

#include "../utility.h"
#include <string>
#include <zmq.hpp>
#include "../utility.h"

class ZMQSender {
unsigned int port;
zmq::context_t context;
zmq::socket_t socket;
std::string address;

public:
ZMQSender();
void send(const std::string& message);
std::string getAddress();
};

#endif // _ZMQ_SENDER_H_
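
Note: these ZMQ wrapper classes are the C++ end of the bridge to the Python ML scripts. Their implementations are not in this diff, so the socket types are an assumption; judging from the Python side (REQ/REP throughout), a minimal Python counterpart that a ZMQSender could talk to might look like this sketch:

import zmq

# Minimal REP-side sketch. Assumption: the C++ ZMQSender uses a REQ socket
# and sends a raw byte string, as the sockets in ml/aggregator.py do.
context = zmq.Context()
responder = context.socket(zmq.REP)
responder.bind("tcp://*:5555")  # hypothetical port; must match the C++ sender

message = responder.recv()      # the bytes passed to ZMQSender::send(...)
responder.send_string("ACK")    # REQ/REP requires a reply before the next recv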
12 changes: 8 additions & 4 deletions include/Peers/provider.h
@@ -19,8 +19,12 @@ class Provider : public Peer {
bool isLeader;
std::unique_ptr<TaskRequest> taskRequest;
std::unique_ptr<TaskResponse> taskResponse;
ZMQSender zmq_sender;
ZMQReceiver zmq_receiver;

ZMQSender ml_zmq_sender;
ZMQReceiver ml_zmq_receiver;

ZMQSender aggregator_zmq_sender;
ZMQReceiver aggregator_zmq_receiver;

public:
Provider(const char* port, std::string uuid);
@@ -32,9 +36,9 @@ class Provider : public Peer {
void followerHandleTaskRequest();
void processData();
void processWorkload(); // worker function to manipulate the TaskRequest
std::vector<int>
std::string
ingestTrainingData(); // worker function to load training data into memory
TaskResponse aggregateResults(std::vector<std::vector<int>> followerData);
TaskResponse aggregateResults(std::vector<std::string> followerData);
};

#endif
9 changes: 5 additions & 4 deletions include/RequestResponse/task_response.h
@@ -8,14 +8,15 @@
#include "payload.h"

class TaskResponse : public Payload {
std::vector<int> trainingData;
  // pickled model state_dict, carried as raw bytes
std::string modelStateDict;

public:
TaskResponse();
TaskResponse(const std::vector<int>& trainingData);
TaskResponse(const std::string& modelStateDict);

std::vector<int> getTrainingData() const;
void setTrainingData(const std::vector<int>& trainingData);
std::string getTrainingData() const;
void setTrainingData(const std::string& modelStateDict);

google::protobuf::Message* serializeToProto() const override;
void deserializeFromProto(
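
Note: replacing std::vector<int> with std::string mirrors the Python side, where a model's state_dict is pickled into a byte string (std::string can carry arbitrary bytes). A minimal sketch of the Python end of that contract, assuming pickle is the serialization as in ml/aggregator.py:

import pickle
import torch

# A state_dict maps parameter names to tensors.
state_dict = {"conv1.weight": torch.zeros(6, 3, 5, 5)}  # hypothetical key/shape

blob = pickle.dumps(state_dict)  # bytes; carried as a std::string on the C++ side
restored = pickle.loads(blob)    # round-trips back to the dict of tensors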
2 changes: 1 addition & 1 deletion include/utility.h
@@ -34,7 +34,7 @@ namespace fs = std::filesystem;
/*
* Defines the data location of training files.
*/
const std::string DATA_DIR = "data";
const std::string DATA_DIR = "CIFAR10/train";

struct IpAddress {
std::string host;
2 changes: 1 addition & 1 deletion main.cpp
@@ -38,7 +38,7 @@ int main(int argc, char* argv[]) {

if (requestType == "c") {
TaskRequest request =
TaskRequest(numRequestedWorkers, ".*subtaskData_.*\\.txt$",
TaskRequest(numRequestedWorkers, ".*\\.jpg$",
TaskRequest::GLOB_PATTERN);
r.setTaskRequest(request);
// sends the task request to the leader and provider peers
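
Note: the request's file filter is now a regular expression over image names. Assuming GLOB_PATTERN is matched with std::regex semantics (the matching code is not in this diff), the pattern behaves like this quick check (file names are hypothetical):

import re

pattern = re.compile(r".*\.jpg$")
print(bool(pattern.match("CIFAR10/train/cat_00042.jpg")))  # True
print(bool(pattern.match("subtaskData_3.txt")))            # False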
74 changes: 74 additions & 0 deletions ml/aggregator.py
@@ -0,0 +1,74 @@
# Aggregator script: receives trained SimpleCNN state_dicts over ZMQ,
# averages them, and sends the averaged model back.
import os
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms
import time
import pickle
import zmq

from networks import SimpleCNN
from dataloader import CIFAR10Dataset, get_data_loaders
from utils import train, val, test


def nn_aggregator(state_dicts):
"""
    Average the parameter tensors of a list of model state_dicts, key by key.
    """
    # take the parameter keys from the first state_dict
keys = state_dicts[0].keys()

# initialize the average state_dict and do the averaging
avg_state_dict = {}
for key in keys:
tensors = [
state_dict[key].float()
for state_dict in state_dicts
if key in state_dict.keys()
]
avg_state_dict[key] = torch.stack(tensors).mean(dim=0)

return avg_state_dict


def main():
# Set up the context and responder socket
    port_rec = int(input("Enter the port to receive models on: "))
    port_send = int(input("Enter the port to send the averaged model on: "))

context = zmq.Context()
responder = context.socket(zmq.REP)
responder.setsockopt(zmq.LINGER, 0)
responder.bind("tcp://*:" + str(port_rec))

sender = context.socket(zmq.REQ)
sender.setsockopt(zmq.LINGER, 0)
sender.connect("tcp://localhost:" + str(port_send))

    # receive the pickled state_dicts from fake_peer.py
state_dicts = []
for i in range(3):
sd = responder.recv()
sd = pickle.loads(sd)
state_dicts.append(sd)
responder.send_string("ACK")

# average the models
avg_state_dict = nn_aggregator(state_dicts)

# send the averaged model back to fake_peer.py
avg_model = SimpleCNN()
avg_model.load_state_dict(avg_state_dict)
avg_model = pickle.dumps(avg_model)

sender.send(avg_model)
_ = sender.recv_string()

print("Model averaging complete.")
return


if __name__ == "__main__":
main()
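
Note: nn_aggregator is a plain federated-averaging step: for each parameter key, stack that tensor from every state_dict and take the element-wise mean. A tiny worked example with toy dicts (not real SimpleCNN weights):

import torch
from aggregator import nn_aggregator  # assumes running from the ml/ directory

a = {"w": torch.tensor([1.0, 3.0])}
b = {"w": torch.tensor([3.0, 5.0])}

avg = nn_aggregator([a, b])
print(avg["w"])  # tensor([2., 4.]) -- the element-wise mean of the two dicts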
13 changes: 9 additions & 4 deletions ml/dataloader/dataset.py
@@ -6,7 +6,7 @@


class CIFAR10Dataset(Dataset):
def __init__(self, root, transform=None):
def __init__(self, root, data_file_names, transform=None):
self.root = root
self.transform = transform
self.classes = (
@@ -22,9 +22,14 @@ def __init__(self, root, transform=None):
"truck",
)
self.class_to_idx = {cls_name: i for i, cls_name in enumerate(self.classes)}
self.samples = [
s for s in os.listdir(root) if os.path.isfile(os.path.join(root, s))
]
samples = [s for s in os.listdir(root) if os.path.isfile(os.path.join(root, s))]

self.samples = []
for s in samples:
for dfn in data_file_names:
if dfn in s:
self.samples.append(s)
break

def __len__(self):
return len(self.samples)
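
Note: the new data_file_names argument restricts the dataset to the files listed in one of the generated split files, so each worker only sees its own shard. A minimal usage sketch with the paths this PR uses elsewhere:

from dataloader import CIFAR10Dataset  # as imported in ml/fake_peer.py

# Names come from one of the split files written by ml/generate_temp_glob.py.
with open("ml/train_first.txt") as f:
    data_file_names = f.read().splitlines()

dataset = CIFAR10Dataset("CIFAR10/train", data_file_names)
print(len(dataset))  # number of this shard's files present under CIFAR10/train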
123 changes: 123 additions & 0 deletions ml/fake_peer.py
@@ -0,0 +1,123 @@
import os
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms
import time
import zmq
import pickle

from utils import test
from dataloader import CIFAR10Dataset, get_data_loaders


def main():
port_first = 5555
port_second = 5557
port_third = 5559
port_aggregator = 5561

context = zmq.Context()

responder1 = context.socket(zmq.REP)
responder1.setsockopt(zmq.LINGER, 0)
responder1.bind("tcp://*:" + str(port_first))
sender1 = context.socket(zmq.REQ)
sender1.connect("tcp://localhost:" + str(port_first + 1))

responder2 = context.socket(zmq.REP)
responder2.setsockopt(zmq.LINGER, 0)
responder2.bind("tcp://*:" + str(port_second))
sender2 = context.socket(zmq.REQ)
sender2.connect("tcp://localhost:" + str(port_second + 1))

responder3 = context.socket(zmq.REP)
responder3.setsockopt(zmq.LINGER, 0)
responder3.bind("tcp://*:" + str(port_third))
sender3 = context.socket(zmq.REQ)
sender3.connect("tcp://localhost:" + str(port_third + 1))

responder_aggregator = context.socket(zmq.REP)
responder_aggregator.setsockopt(zmq.LINGER, 0)
responder_aggregator.bind("tcp://*:" + str(port_aggregator))
sender_aggregator = context.socket(zmq.REQ)
sender_aggregator.connect("tcp://localhost:" + str(port_aggregator + 1))

payload = [
"train_first.txt",
"train_second.txt",
"train_third.txt",
]

# send the payload to the workers
print("Sending payload to workers")
sender1.send_string(payload[0])
sender2.send_string(payload[1])
sender3.send_string(payload[2])
_ = sender1.recv_string()
_ = sender2.recv_string()
_ = sender3.recv_string()

    # receive the trained models from the workers
print("Receiving models from workers")
models = []

model = responder1.recv()
model = pickle.loads(model)
models.append(model)
responder1.send_string("ACK")

model = responder2.recv()
model = pickle.loads(model)
models.append(model)
responder2.send_string("ACK")

model = responder3.recv()
model = pickle.loads(model)
models.append(model)
responder3.send_string("ACK")

# send the models to the aggregator
print("Sending models to aggregator")

for model in models:
sender_aggregator.send(pickle.dumps(model))
ack = sender_aggregator.recv_string()
print(ack)

    # receive the averaged model from the aggregator
print("Receiving final model from aggregator")
final_model = responder_aggregator.recv()
final_model = pickle.loads(final_model)
responder_aggregator.send_string("ACK")
print("Final model received")
print(final_model)

# test the final model
print("Testing the final model")
transform = transforms.Compose(
[
transforms.Resize((32, 32)),
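            # Assumption: CIFAR10Dataset yields tensors here. Normalize expects a
            # float tensor, so if __getitem__ (not shown in this diff) returns PIL
            # images, a transforms.ToTensor() is needed before Normalize.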
transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
]
)
payload = "test.txt"
data_path = "CIFAR10/"
data_file_names = []
with open(os.path.join(data_path, payload)) as f:
data_file_names = f.read().splitlines()
test_dataset = CIFAR10Dataset(
os.path.join(data_path, "test"), data_file_names, transform=transform
)
batch_size = 64
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
final_model.to(device)
criterion = nn.CrossEntropyLoss()
test(final_model, device, test_loader, criterion, data_path)

return


if __name__ == "__main__":
main()
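
Note: the worker script this drives is not part of the diff. For orientation, a minimal sketch of the counterpart protocol it implies, using the first worker's port pair (bind 5556 to receive, connect 5555 to reply back, mirroring the bind/connect pattern above); train_on_split is a placeholder, not a real function in this repo:

import pickle
import zmq

context = zmq.Context()
responder = context.socket(zmq.REP)
responder.bind("tcp://*:5556")          # fake_peer's sender1 connects here
sender = context.socket(zmq.REQ)
sender.connect("tcp://localhost:5555")  # fake_peer's responder1 binds here

split_name = responder.recv_string()    # e.g. "train_first.txt"
responder.send_string("ACK")

model = train_on_split(split_name)      # placeholder for the actual training step

sender.send(pickle.dumps(model))
_ = sender.recv_string()                # fake_peer replies with "ACK"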
39 changes: 39 additions & 0 deletions ml/generate_temp_glob.py
@@ -0,0 +1,39 @@
import os
import numpy as np

np.random.seed(0)


def generate_temp_glob(folder):
files = os.listdir(folder)
files = [file for file in files if file.endswith(".jpg")]
files = np.array(files)

    # shuffle and split into three roughly equal-sized parts
np.random.shuffle(files)
one_third, two_third = int(len(files) / 3), int(2 * len(files) / 3)
first, second, third = (
files[:one_third],
files[one_third:two_third],
files[two_third:],
)

return first, second, third


def save_as_txt(files, filename):
with open(filename, "w") as f:
for file in files:
f.write(file + "\n")


if __name__ == "__main__":
first, second, third = generate_temp_glob("CIFAR10/train")
save_as_txt(first, "ml/train_first.txt")
save_as_txt(second, "ml/train_second.txt")
save_as_txt(third, "ml/train_third.txt")

files = os.listdir("CIFAR10/test")
files = [file for file in files if file.endswith(".jpg")]
save_as_txt(files, "ml/test.txt")