class Dense(Layer):
    """Fully connected layer computing ``inputs . W^T + b``.

    The weight matrix ``W`` and bias vector ``b`` are created lazily on the
    first call to ``fprop``, because the input width is only known then.
    ``self.units`` is read but not set here; it is presumably stored by
    ``Layer.__init__`` (defined outside this block).
    """

    def __init__(self, units):
        super(Dense, self).__init__(units)
        # Parameters stay unset until the first forward pass sizes them.
        self.W = None
        self.b = None

    def fprop(self, inputs, pass_type='train'):
        """Run the forward pass and cache ``inputs`` for the weight update.

        Args:
            inputs: Array whose second dimension (``inputs.shape[1]``) is the
                number of input features.
            pass_type: Accepted for interface compatibility; not used by
                this layer.

        Returns:
            ``np.dot(inputs, W.T) + b``.
        """
        self.inputs = inputs
        if self.W is None:
            n_out = self.units
            n_in = inputs.shape[1]
            # Small symmetric uniform initialisation; W is drawn before b.
            self.W = np.random.uniform(low=-0.01, high=0.01, size=(n_out, n_in))
            self.b = np.random.uniform(low=-0.01, high=0.01, size=n_out)
        projected = np.dot(inputs, self.W.T)
        return projected + self.b

    def bprop(self, outputs_deriv):
        """Return the derivative with respect to this layer's inputs.

        Args:
            outputs_deriv: Derivative with respect to this layer's outputs.

        Returns:
            ``np.dot(outputs_deriv, W)`` using the current weights.
        """
        weights = self.W
        return np.dot(outputs_deriv, weights)

    def update_weights(self, outputs_deriv, learning_rate):
        """Apply one in-place gradient-descent step to ``W`` and ``b``.

        Uses the inputs cached by the most recent ``fprop`` call.

        Args:
            outputs_deriv: Derivative with respect to this layer's outputs.
            learning_rate: Step-size multiplier applied to both gradients.
        """
        grad_W = np.dot(outputs_deriv.T, self.inputs)
        self.W -= learning_rate * grad_W
        # The bias gradient is the output derivative summed over axis 0.
        grad_b = outputs_deriv.sum(axis=0)
        self.b -= learning_rate * grad_b


# forward-propagation step
# Feed the batch through the layers in order; each layer's output becomes
# the next layer's input.
output = x_batch
for layer in self.layers:
    output = layer.fprop(output, pass_type='train')

# backward-propagation step
# Walk the layers from last to first. The derivative handed to the previous
# layer must be computed with the weights that produced this forward pass,
# so bprop runs BEFORE update_weights modifies them.
outputs_deriv = self.loss.grad(y_batch, output)
for layer in reversed(self.layers):
    inputs_deriv = layer.bprop(outputs_deriv)
    layer.update_weights(outputs_deriv, learning_rate)
    outputs_deriv = inputs_deriv