Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

DQN: batch_shape = (None,) + tuple(shape) TypeError: 'int' object is not iterable #34

Open
Nazanin-87 opened this issue Aug 16, 2022 · 0 comments

Comments

@Nazanin-87
Copy link

Hi everyone,
I modified the DQN algorithm in this repository to a multi-agent DQN approach for a wireless network environment. Actually, I wrote this code inspired by a repository on GitHub. Although the original code works well, when I change the environment, the following error occurs.
Traceback (most recent call last): File "D:/main -DQN.py", line 452, in <module> main() File "D:/main -DQN.py", line 432, in main algo = DQN( args) # n_clusters is the action dimension in DQN File "D:/main -DQN.py", line 158, in __init__ self.agent = Agent( args, self.tau) File "D:/main -DQN.py", line 246, in __init__ self.model = self.network() File "D:/main -DQN.py", line 254, in network inp = Input((self.state_dim)) File "C:\Users\AppData\Roaming\Python\Python37\site-packages\keras\engine\topology.py", line 1451, in Input batch_shape = (None,) + tuple(shape) TypeError: 'int' object is not iterable
The complete code is as follows:
`
from keras.backend.tensorflow_backend import set_session
import tensorflow as tf
import pandas as pd
import numpy as np
import sys
import os
import copy, json, argparse
from numpy import pi
from random import random, uniform, choices, randint, sample, randrange
import random
import math
from tqdm import tqdm
import keras.backend as K
from keras.optimizers import Adam
from keras.models import Model
from keras.layers import Dense, Flatten, Input
from collections import deque

class Environ:

def __init__(self, args):
    self.args=args
    self.state_dim= (self.args.A, )
    self.action_dim=args.C
    self.bs = complex((500 / 2), (500/ 2))
    self.S=(np.zeros(self.args.A)).reshape(-1)

def Location(self):
    rx = uniform(0, 500)
    ry = uniform(0, 500)
    Loc = complex(rx, ry)
    return Loc

def PathGain(self,Loc):
    d = abs(Loc- self.bs)
    d=d  **(-3)
    u = np.random.rand(1, 1)
    sigma = 1
    x = sigma * np.sqrt(-2 * np.log(u))
    h=  d* x
    return h

def reset(self):  # Reset the states
    s=np.zeros(self.args.A)
    return s.reshape(-1)

def RecievePower(self,UsersLoc):
    H=self.PathGain(UsersLoc)
    UsersRecievePower=self.args.P*H
    return UsersRecievePower

def TotalRate(self, actionRB_i,actionRB):
    interference = self.args.Noise
    Loc_i=self.Location()
    for j in range(self.args.A):
        if actionRB_i ==actionRB[j] :
            Loc_j = self.Location()
            RecievePower_j = self.RecievePower(Loc_j)
            interference = interference + RecievePower_j
        else:
            interference= interference
    RecievePower_i = self.RecievePower(Loc_i)
    SINR = interference / (interference-RecievePower_i)
    Rate =self.args.BW*( np.log2( SINR))
    return Rate

def computeQoS(self,actionRB,actionRB_i):
    TotalRate=self.TotalRate(actionRB,actionRB_i)
    if TotalRate >=self.args.Rmin:
        QoS=1.0
    else:
        QoS=0.0
    return QoS

def ComputeState(self,actionRB):
    QoS=np.zeros(self.args.A)
    for i in range(self.args.A):
        actionRB_i=actionRB[i]
        QoS[i] = self.computeQoS(actionRB,actionRB_i)
    S = np.zeros( self.args.A)
    for i in range(self.args.A):
        S[i]=QoS[i]
    self.S=S
    return self.S.reshape(-1)

def Reward(self,actionRB,actionRB_i):
    Rate = np.zeros(self.args.A)
    Satisfied_Users = 0
    for i in range(self.args.A):
        Rate[i] = self.TotalRate(actionRB, actionRB_i)
        Satisfied_Users = Satisfied_Users + self.computeQoS(actionRB)
    TotalRate = 0.0
    TotalPower = self.args.circuitPower
    for i in range(self.args.A):
        TotalRate = TotalRate + Rate[i]
        TotalPower = TotalPower + self.args.P
    if Satisfied_Users == self.args.A:
        reward = TotalRate / TotalPower
    else:
        reward = self.args.negative_cost
    return reward

def step(self,actionRB):
    next_s = self.ComputeState(actionRB)
    r = self.Reward(actionRB)
    done = False
    info = None
    return next_s, r, done, info

class Environment(object):

def __init__(self, gym_env, action_repeat):
    self.env = gym_env
    self.timespan = action_repeat
    self.gym_actions = 2  # range(gym_env.action_space.n)
    self.state_buffer = deque()

def get_action_size(self):
    return self.env.action_dim

def get_state_size(self):
    return self.env.state_dim

def reset(self):
    # Clear the state buffer
    self.state_buffer = deque()
    x_t = self.env.reset()
    s_t = np.stack([x_t for i in range(self.timespan)], axis=0)
    for i in range(self.timespan - 1):
        self.state_buffer.append(x_t)
    return s_t

def step(self, action):
    x_t1, r_t, terminal, info = self.env.step(action)
    previous_states = np.array(self.state_buffer)
    s_t1 = np.empty((self.timespan, *self.env.state_dim))
    s_t1[:self.timespan - 1, :] = previous_states
    s_t1[self.timespan - 1] = x_t1
    # Pop the oldest frame, add the current frame to the queue
    self.state_buffer.popleft()
    self.state_buffer.append(x_t1)
    return s_t1, r_t, terminal, info

def render(self):
    return self.env.render()

class DQN:
def init(self, args):
# Environment and DQN parameters
self.args=args
self.action_dim = self.args.C
self.state_dim = self.args.A
self.buffer_size = self.args.capacity
# Memory Buffer for Experience Replay
self.buffer = MemoryBuffer(self.buffer_size)
self.epsilon=self.args.eps
self.tau = 1.0
self.agent = Agent( args, self.tau)

def policy_action(self, s):
    if random() <= self.epsilon:
        return randrange(self.action_dim)
    else:
        return np.argmax(self.agent.predict(s)[0])

def train_agent(self):
    # Sample experience from memory buffer
    s, a, r, d, new_s, idx = self.buffer.sample_batch(self.batch_size)
    # Apply Bellman Equation on batch samples to train our DQN
    q  = self.agent.predict(s)
    next_q  = self.agent.predict(new_s)
    q_targ  = self.agent.target_predict(new_s)
    for i in range(s.shape[0]):
        if d[i]:
            q[i, a[i]] = r[i]
        else:
            next_best_action = np.argmax(next_q[i, :])
            q[i, a[i]] = r[i] + self.args.gamma * q_targ[i, next_best_action]
    # Train on batch
    self.agent.fit(s, q)
    # Decay epsilon
    self.epsilon *= self.args.eps_decay

def train(self, env, args, summary_writer):
    results = []
    tqdm_e = tqdm(range(self.args.nepisodes), desc='Score', leave=True, unit=" episodes")
    for e in tqdm_e:
        # Reset episode
        time, cumul_reward, done = 0, 0, False
        old_state = env.reset()

        while not done:
            # if args.render:
            #     env.render()
            # Actor picks an action (following the policy)
            a=[]
            for i in range(self.args.A):
                a[i]= self.policy_action(old_state)

            # Retrieve new state, reward, and whether the state is terminal
            new_state, r, done, _ = env.step(a)
            # Memorize for experience replay
            self.memorize(old_state, a, r, done, new_state)
            # Update current state
            old_state = new_state
            cumul_reward += r
            time += 1
            # Train DDQN and transfer weights to target network
            if(self.buffer.size() > args.batch_size):
                self.train_agent(self.args.batch_size)
                self.agent.transfer_weights()
       # Gather stats every episode for plotting
        if(args.gather_stats):
            mean, stdev = gather_stats(self, env)
            results.append([e, mean, stdev])

        # Export results for Tensorboard
        score = tfSummary('score', cumul_reward)
        summary_writer.add_summary(score, global_step=e)
        summary_writer.flush()

        # Display score
        tqdm_e.set_description("Score: " + str(cumul_reward))
        tqdm_e.refresh()

    return results

def memorize(self, state, action, reward, done, new_state):
    self.buffer.memorize(state, action, reward, done, new_state)

def save_weights(self, path):
    path += '_LR_{}'.format(self.args.learningrate)
    self.agent.save(path)

def load_weights(self, path):
    self.agent.load_weights(path)

class Agent:
def init(self, args, tau):
self.args=args
self.state_dim = self.args.A
self.action_dim = self.args.C
self.tau = tau
self.lr=self.args.learningrate
# Initialize Deep Q-Network
self.model = self.network()
self.model.compile(Adam(self.lr), 'mse')
# Build target Q-Network
self.target_model = self.network()
self.target_model.compile(Adam(self.lr), 'mse')
self.target_model.set_weights(self.model.get_weights())

def network(self):
    inp = Input((self.state_dim))

    if(len(self.state_dim) > 2):
        inp = Input((self.state_dim[1:]))
        x = conv_block(inp, 32, (2, 2), 8)
        x = conv_block(x, 64, (2, 2), 4)
        x = conv_block(x, 64, (2, 2), 3)
        x = Flatten()(x)
        x = Dense(256, activation='relu')(x)
    else:
        x = Flatten()(inp)
        x = Dense(64, activation='relu')(x)
        x = Dense(64, activation='relu')(x)

    x = Dense(self.action_dim, activation='linear')(x)
    return Model(inp, x)

def transfer_weights(self):
    W = self.model.get_weights()
    tgt_W = self.target_model.get_weights()
    for i in range(len(W)):
    #  updated based on Polyak averaging method
        tgt_W[i] = self.tau * W[i] + (1 - self.tau) * tgt_W[i]
    self.target_model.set_weights(tgt_W)

def fit(self, inp, targ):
    self.model.fit(self.reshape(inp), targ, epochs=1, verbose=0)

def predict(self, inp):
    return self.model.predict(self.reshape(inp))

def target_predict(self, inp):
    return self.target_model.predict(self.reshape(inp))

def reshape(self, x):
    if len(x.shape) < 4 and len(self.state_dim) > 2:
        return np.expand_dims(x, axis=-1)
    elif len(x.shape) < 3:
        return np.expand_dims(x, axis=-1)
    else:
        return x

def save(self, path):
    self.model.save_weights(path + '.h5')

def load_weights(self, path):
    self.model.load_weights(path)

class MemoryBuffer(object):
def init(self, buffer_size):
# Standard Buffer
self.buffer = deque()
self.count = 0
self.buffer_size = buffer_size

def memorize(self, state, action, reward, done, new_state):
    experience = (state, action, reward, done, new_state)
    # Check if buffer is already full
    if self.count < self.buffer_size:
        self.buffer.append(experience)
        self.count += 1
    else:
        self.buffer.popleft()
        self.buffer.append(experience)

def size(self):
    return self.count

def sample_batch(self, batch_size):
    batch = []
    if self.count < batch_size:
        idx = None
        batch = random.sample(self.buffer, self.count)
    else:
        idx = None
        batch = random.sample(self.buffer, batch_size)

    # Return a batch of experience
    s_batch = np.array([i[0] for i in batch])
    a_batch = np.array([i[1] for i in batch])
    r_batch = np.array([i[2] for i in batch])
    d_batch = np.array([i[3] for i in batch])
    new_s_batch = np.array([i[4] for i in batch])
    return s_batch, a_batch, r_batch, d_batch, new_s_batch, idx

def update(self, idx):
    self.buffer.update(idx)

def clear(self):
    self.buffer = deque()
    self.count = 0

def get_session():
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
return tf.Session(config=config)

def tfSummary(tag, val):
return tf.Summary(value=[tf.Summary.Value(tag=tag, simple_value=val)])

def gather_stats(agent, env):
score = []
for k in range(10):
old_state = env.reset()
cumul_r, done = 0, False
while not done:
a = agent.policy_action(old_state)
old_state, r, done, _ = env.step(a)
cumul_r += r
score.append(cumul_r)
return np.mean(np.array(score)), np.std(np.array(score))

def conv_block(inp, d=3, pool_size=(2, 2), k=3):
conv = conv_layer(d, k)(inp)
return MaxPooling2D(pool_size=pool_size)(conv)

os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

def parse_args(args):
parser = argparse.ArgumentParser(description='Training parameters')
#
parser.add_argument('--out_dir', type=str, default='experiments', help="Name of the output directory")
parser.add_argument('--consecutive_frames', type=int, default=2, help="Number of consecutive frames (action repeat)")
parser.add_argument('--gather_stats', dest='gather_stats', action='store_true', help="Compute Average reward per episode (slower)")
parser.add_argument('--A', type=int, default='10', help="The number of agents")
parser.add_argument('--C', type=int, default='30', help="The number of Resources")
parser.add_argument('--Noise', type=float, default='0.00000000000001', help="The background noise")
parser.add_argument('--BW', type=int, default='180000', help="The bandwidth")
parser.add_argument('--Rmin', type=int, default='1000000', help="Agents' QoS")
parser.add_argument('--P', type=float, default='0.01', help="Agents' transmit power")
parser.add_argument('--circuitPower', type=float, default='0.05', help="The circuit Power")
parser.add_argument('--negative_cost', type=float, default='-1.0', help="The negative cost")
parser.add_argument('--capacity', type=int, default='500', help="Capacity of Replay Buffer")
parser.add_argument('--learningrate', type=float, default='0.01', help="The learning rate")
parser.add_argument('--eps', type=float, default='0.8', help="The epsilon")
parser.add_argument('--eps_decay', type=float, default='0.99', help="The epsilon decay")
parser.add_argument('--eps_increment', type=float, default='0.003', help="The epsilon increment")
parser.add_argument('--batch_size', type=int, default='8', help="The batch size")
parser.add_argument('--gamma', type=float, default='0.99', help="The discount factor")
parser.add_argument('--nepisodes', type=int, default='500', help="The number of episodes")
parser.add_argument('--nsteps', type=int, default='500', help="The number of steps")
parser.add_argument('--env', type=str, default='Environ', help="Wireless environment")
parser.add_argument('--gpu', type=str, default="", help='GPU ID')

args=parser.parse_args(args)

parser.set_defaults(render=False)
return args

def main(args=None):
# Parse arguments
if args is None:
args = sys.argv[1:]
args = parse_args(args)
# Check if a GPU ID was set
if args.gpu:
os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu
set_session(get_session())

summary_writer = tf.summary.FileWriter("/tensorboard_" + args.env)
# Initialize the wireless environment
users_env = Environ(args)
# print(users_env)

# Wrap the environment to use consecutive frames
env = Environment(users_env, args.consecutive_frames)
env.reset()

# Define parameters for the DDQN and DDPG algorithms
state_dim = env.get_state_size()
action_dim = users_env.action_dim
# The maximum and minimum values for precoding vectors
# act_range = 1
# act_min = 0

# Initialize the DQN algorithm for the clustering optimization
algo = DQN( args)  # n_clusters is the action dimension in DQN
# if args.step == "train":
    # Train
stats = algo.train(env, args, summary_writer)
# Export results to CSV
if(args.gather_stats):
    df = pd.DataFrame(np.array(stats))
    df.to_csv(args.out_dir + "/logs.csv", header=['Episode', 'Mean', 'Stddev'], float_format='%10.5f')
    # df.to_csv(args.type + "/logs.csv", header=['Episode', 'Mean', 'Stddev'], float_format='%10.5f')

    # Save weights and close environments
exp_dir = '{}/models_A_{}_C_{}_Rmin_{}/'.format(args.out_dir, args.A, args.C, args.Rmin)
# exp_dir = '{}/models/'.format(args.type)
if not os.path.exists(exp_dir):
    os.makedirs(exp_dir)
# Save DDQN
export_path = '{}_{}_NB_EP_{}_BS_{}'.format(exp_dir, "DQN", args.nepisodes, args.batch_size)
algo.save_weights(export_path)

if name == "main":
main()
`
Thanks in advance for your help.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

1 participant