diff --git a/src/layer.rs b/src/layer.rs index 8cf8517b..afb5687a 100644 --- a/src/layer.rs +++ b/src/layer.rs @@ -210,8 +210,8 @@ impl Layer { } let backend: Rc> = self.backend.clone(); - blob_data = Arc::new(RwLock::new(SharedTensor::new(backend.device(), &vec![1,1,1]).unwrap())); // [1,1,1] for CUDA - blob_gradient = Arc::new(RwLock::new(SharedTensor::new(backend.device(), &vec![1,1,1]).unwrap())); // [1,1,1] for CUDA + blob_data = Arc::new(RwLock::new(SharedTensor::new(&[1,1,1]))); // [1,1,1] for CUDA + blob_gradient = Arc::new(RwLock::new(SharedTensor::new(&[1,1,1]))); // [1,1,1] for CUDA } self.output_blob_names.push(blob_name.clone()); self.output_blobs_data.push(blob_data.clone()); @@ -234,8 +234,8 @@ impl Layer { info!("{} -> {}", self.name, blob_name); let backend: Rc> = self.backend.clone(); - let output_data = Arc::new(RwLock::new(SharedTensor::new(backend.device(), &vec![1,1,1]).unwrap())); // [1,1,1] for CUDA - let output_gradient = Arc::new(RwLock::new(SharedTensor::new(backend.device(), &vec![1,1,1]).unwrap())); // [1,1,1] for CUDA + let output_data = Arc::new(RwLock::new(SharedTensor::new(&[1,1,1]))); // [1,1,1] for CUDA + let output_gradient = Arc::new(RwLock::new(SharedTensor::new(&[1,1,1]))); // [1,1,1] for CUDA self.output_blobs_data.push(output_data); self.output_blobs_gradient.push(output_gradient); } @@ -264,8 +264,8 @@ impl Layer { let net_weight_id = weights_len; let output_data = self.output_blobs_data[weight_id].read().unwrap(); debug!("Layer {} - creating weight and gradient of size {:?}", &layer_config.name, output_data.desc()); - let weight_data = Arc::new(RwLock::new(SharedTensor::::new(output_data.latest_device(), output_data.desc()).unwrap())); - let weight_gradient = Arc::new(RwLock::new(SharedTensor::::new(output_data.latest_device(), output_data.desc()).unwrap())); + let weight_data = Arc::new(RwLock::new(SharedTensor::new(output_data.desc()))); + let weight_gradient = Arc::new(RwLock::new(SharedTensor::new(output_data.desc()))); self.weights_data.push(weight_data.clone()); self.weights_gradient.push(weight_gradient.clone()); @@ -460,11 +460,6 @@ impl Layer { self.input_blobs_data[input_i].write().unwrap().reshape(&reshaped_shape).unwrap(); } - self.worker.sync(&self.backend, - &mut self.input_blobs_data, &mut self.input_blobs_gradient, - &mut self.weights_data, &mut self.weights_gradient, - &mut self.output_blobs_data, &mut self.output_blobs_gradient); - let forward_time = timeit_loops!(1, { if self.is_using_in_place() { self.worker.forward(&self.backend, &vec![], &self.weights_data, &mut self.output_blobs_data); @@ -497,11 +492,6 @@ impl Layer { self.output_blobs_gradient[output_i] = output.clone(); } - self.worker.sync(&self.backend, - &mut self.input_blobs_data, &mut self.input_blobs_gradient, - &mut self.weights_data, &mut self.weights_gradient, - &mut self.output_blobs_data, &mut self.output_blobs_gradient); - if self.is_using_in_place() { self.worker.backward_input(&self.backend, &self.weights_data, @@ -527,11 +517,6 @@ impl Layer { /// /// This method is mostly used when doing backpropagation. pub fn backward_parameters(&mut self) { - self.worker.sync(&self.backend, - &mut self.input_blobs_data, &mut self.input_blobs_gradient, - &mut self.weights_data, &mut self.weights_gradient, - &mut self.output_blobs_data, &mut self.output_blobs_gradient); - self.worker.backward_parameters(&self.backend, &self.output_blobs_data, &self.output_blobs_gradient, @@ -553,13 +538,11 @@ impl Layer { /// /// [3]: ../solver/enum.LRPolicy.html pub fn update_weights>(&mut self, backend: &SolverB) { - let mut shared_a = ::util::native_scalar(-1f32); - let _ = shared_a.add_device(IBackend::device(backend)); - shared_a.sync(IBackend::device(backend)).unwrap(); + // PERF: allocate this scalar once + let shared_a = ::util::native_scalar(-1f32); for (weight_gradient, weight_data) in self.learnable_weights_gradients().iter().zip(&mut self.learnable_weights_data()) { - weight_gradient.write().unwrap().sync(IBackend::device(backend)).unwrap(); - weight_data.write().unwrap().sync(IBackend::device(backend)).unwrap(); - backend.axpy_plain(&shared_a, &weight_gradient.read().unwrap(), &mut weight_data.write().unwrap()).unwrap(); + backend.axpy(&shared_a, &weight_gradient.read().unwrap(), + &mut weight_data.write().unwrap()).unwrap(); } } @@ -695,7 +678,6 @@ impl Layer { } let mut weight_lock = weight.write().unwrap(); - weight_lock.sync(native_backend.device()).unwrap(); let capnp_tensor = capnp_weight.get_tensor().unwrap(); let mut shape = Vec::new(); @@ -705,7 +687,7 @@ impl Layer { } weight_lock.reshape(&shape).unwrap(); - let mut native_slice = weight_lock.get_mut(native_backend.device()).unwrap().as_mut_native().unwrap().as_mut_slice::(); + let mut native_slice = weight_lock.write_only(native_backend.device()).unwrap().as_mut_native().unwrap().as_mut_slice::(); let data = capnp_tensor.get_data().unwrap(); for k in 0..data.len() { native_slice[k as usize] = data.get(k); @@ -814,8 +796,7 @@ impl<'a, B: IBackend> CapnpWrite<'a> for Layer { let mut capnp_weight = weights.borrow().get(i as u32); capnp_weight.set_name(name); - let mut weight_lock = weight.write().unwrap(); - weight_lock.sync(native_backend.device()).unwrap(); + let weight_lock = weight.write().unwrap(); let mut tensor = capnp_weight.init_tensor(); { @@ -825,7 +806,8 @@ impl<'a, B: IBackend> CapnpWrite<'a> for Layer { } } { - let native_slice = weight_lock.get(native_backend.device()).unwrap().as_native().unwrap().as_slice::(); + let native_slice = weight_lock.read(native_backend.device()) + .unwrap().as_native().unwrap().as_slice::(); let mut tensor_data = tensor.borrow().init_data(native_slice.len() as u32); for (i, datum) in native_slice.iter().enumerate() { tensor_data.set(i as u32, *datum); @@ -1025,74 +1007,6 @@ pub trait ILayer : ComputeOutput + ComputeInputGradient>], - input_gradients: &mut [ArcLock>], - weights_data: &mut [ArcLock>], - weights_gradients: &mut [ArcLock>], - output_data: &mut Vec>>, - output_gradients: &mut Vec>>) { - if self.sync_native() { - let backend = native_backend(); - for tensor in input_data { - let mut sync = tensor.write().unwrap(); - match sync.add_device(backend.device()) { _ => sync.sync(backend.device()).unwrap() } - } - for tensor in input_gradients { - let mut sync = tensor.write().unwrap(); - match sync.add_device(backend.device()) { _ => sync.sync(backend.device()).unwrap() } - } - for tensor in weights_data { - let mut sync = tensor.write().unwrap(); - match sync.add_device(backend.device()) { _ => sync.sync(backend.device()).unwrap() } - } - for tensor in weights_gradients { - let mut sync = tensor.write().unwrap(); - match sync.add_device(backend.device()) { _ => sync.sync(backend.device()).unwrap() } - } - for tensor in output_data { - let mut sync = tensor.write().unwrap(); - match sync.add_device(backend.device()) { _ => sync.sync(backend.device()).unwrap() } - } - for tensor in output_gradients { - let mut sync = tensor.write().unwrap(); - match sync.add_device(backend.device()) { _ => sync.sync(backend.device()).unwrap() } - } - } else { - for tensor in input_data { - let mut sync = tensor.write().unwrap(); - match sync.add_device(backend.device()) { _ => sync.sync(backend.device()).unwrap() } - } - for tensor in input_gradients { - let mut sync = tensor.write().unwrap(); - match sync.add_device(backend.device()) { _ => sync.sync(backend.device()).unwrap() } - } - for tensor in weights_data { - let mut sync = tensor.write().unwrap(); - match sync.add_device(backend.device()) { _ => sync.sync(backend.device()).unwrap() } - } - for tensor in weights_gradients { - let mut sync = tensor.write().unwrap(); - match sync.add_device(backend.device()) { _ => sync.sync(backend.device()).unwrap() } - } - for tensor in output_data { - let mut sync = tensor.write().unwrap(); - match sync.add_device(backend.device()) { _ => sync.sync(backend.device()).unwrap() } - } - for tensor in output_gradients { - let mut sync = tensor.write().unwrap(); - match sync.add_device(backend.device()) { _ => sync.sync(backend.device()).unwrap() } - } - } - } - /// Return whether "anonymous" output blobs are created automatically for the layer. /// /// If this method returns true, Network::init will create enough "anonymous" output diff --git a/src/layers/activation/relu.rs b/src/layers/activation/relu.rs index ecb266d6..c995eae3 100644 --- a/src/layers/activation/relu.rs +++ b/src/layers/activation/relu.rs @@ -56,8 +56,8 @@ impl + ReluPointwise> ComputeOutput for ReL input_data: &[&SharedTensor], output_data: &mut [&mut SharedTensor]) { match input_data.get(0) { - Some(input) => backend.relu_plain(input, output_data[0]).unwrap(), - None => backend.relu_pointwise_plain(output_data[0]).unwrap(), + Some(input) => backend.relu(input, output_data[0]).unwrap(), + None => backend.relu_pointwise(output_data[0]).unwrap(), } } } @@ -72,8 +72,8 @@ impl + ReluPointwise> ComputeInputGradient input_data: &[&SharedTensor], input_gradients: &mut [&mut SharedTensor]) { match output_data.get(0) { - Some(_) => backend.relu_grad_plain(output_data[0], output_gradients[0], input_data[0], input_gradients[0]).unwrap(), - None => backend.relu_pointwise_grad_plain(input_data[0], input_gradients[0]).unwrap(), + Some(_) => backend.relu_grad(output_data[0], output_gradients[0], input_data[0], input_gradients[0]).unwrap(), + None => backend.relu_pointwise_grad(input_data[0], input_gradients[0]).unwrap(), } } } @@ -115,7 +115,7 @@ impl> ComputeOutput for ReLU { input_data: &[&SharedTensor], output_data: &mut [&mut SharedTensor]) { match input_data.get(0) { - Some(input) => backend.relu_plain(input, output_data[0]).unwrap(), + Some(input) => backend.relu(input, output_data[0]).unwrap(), None => panic!("No input provided for ReLU layer."), } } @@ -131,7 +131,7 @@ impl> ComputeInputGradient for ReLU { input_data: &[&SharedTensor], input_gradients: &mut [&mut SharedTensor]) { match output_data.get(0) { - Some(_) => backend.relu_grad_plain(output_data[0], output_gradients[0], input_data[0], input_gradients[0]).unwrap(), + Some(_) => backend.relu_grad(output_data[0], output_gradients[0], input_data[0], input_gradients[0]).unwrap(), None => panic!("No output_data provided for ReLU layer backward."), } } diff --git a/src/layers/activation/sigmoid.rs b/src/layers/activation/sigmoid.rs index fb5a051c..4d4a6253 100644 --- a/src/layers/activation/sigmoid.rs +++ b/src/layers/activation/sigmoid.rs @@ -60,8 +60,8 @@ impl + conn::SigmoidPointwise> ComputeOutp input_data: &[&SharedTensor], output_data: &mut [&mut SharedTensor]) { match input_data.get(0) { - Some(input) => backend.sigmoid_plain(input, output_data[0]).unwrap(), - None => backend.sigmoid_pointwise_plain(output_data[0]).unwrap(), + Some(input) => backend.sigmoid(input, output_data[0]).unwrap(), + None => backend.sigmoid_pointwise(output_data[0]).unwrap(), } } } @@ -76,8 +76,9 @@ impl + conn::SigmoidPointwise> ComputeInpu input_data: &[&SharedTensor], input_gradients: &mut [&mut SharedTensor]) { match output_data.get(0) { - Some(_) => backend.sigmoid_grad_plain(output_data[0], output_gradients[0], input_data[0], input_gradients[0]).unwrap(), - None => backend.sigmoid_pointwise_grad_plain(input_data[0], input_gradients[0]).unwrap(), + Some(_) => backend.sigmoid_grad(output_data[0], output_gradients[0], + input_data[0], input_gradients[0]).unwrap(), + None => backend.sigmoid_pointwise_grad(input_data[0], input_gradients[0]).unwrap(), } } } @@ -119,7 +120,7 @@ impl> ComputeOutput for Sigmoid { input_data: &[&SharedTensor], output_data: &mut [&mut SharedTensor]) { match input_data.get(0) { - Some(input) => backend.sigmoid_plain(input, output_data[0]).unwrap(), + Some(input) => backend.sigmoid(input, output_data[0]).unwrap(), None => panic!("No input provided for Sigmoid layer."), } } @@ -135,7 +136,8 @@ impl> ComputeInputGradient for Sigmoid input_data: &[&SharedTensor], input_gradients: &mut [&mut SharedTensor]) { match output_data.get(0) { - Some(_) => backend.sigmoid_grad_plain(output_data[0], output_gradients[0], input_data[0], input_gradients[0]).unwrap(), + Some(_) => backend.sigmoid_grad(output_data[0], output_gradients[0], + input_data[0], input_gradients[0]).unwrap(), None => panic!("No output_data provided for Sigmoid layer backward."), } } diff --git a/src/layers/activation/tanh.rs b/src/layers/activation/tanh.rs index 5ec2ad85..78544d94 100644 --- a/src/layers/activation/tanh.rs +++ b/src/layers/activation/tanh.rs @@ -57,8 +57,8 @@ impl + conn::TanhPointwise> ComputeOutput], output_data: &mut [&mut SharedTensor]) { match input_data.get(0) { - Some(input) => backend.tanh_plain(input, output_data[0]).unwrap(), - None => backend.tanh_pointwise_plain(output_data[0]).unwrap(), + Some(input) => backend.tanh(input, output_data[0]).unwrap(), + None => backend.tanh_pointwise(output_data[0]).unwrap(), } } } @@ -73,8 +73,9 @@ impl + conn::TanhPointwise> ComputeInputGradi input_data: &[&SharedTensor], input_gradients: &mut [&mut SharedTensor]) { match output_data.get(0) { - Some(_) => backend.tanh_grad_plain(output_data[0], output_gradients[0], input_data[0], input_gradients[0]).unwrap(), - None => backend.tanh_pointwise_grad_plain(input_data[0], input_gradients[0]).unwrap(), + Some(_) => backend.tanh_grad(output_data[0], output_gradients[0], + input_data[0], input_gradients[0]).unwrap(), + None => backend.tanh_pointwise_grad(input_data[0], input_gradients[0]).unwrap(), } } } @@ -116,7 +117,7 @@ impl> ComputeOutput for TanH { input_data: &[&SharedTensor], output_data: &mut [&mut SharedTensor]) { match input_data.get(0) { - Some(input) => backend.tanh_plain(input, output_data[0]).unwrap(), + Some(input) => backend.tanh(input, output_data[0]).unwrap(), None => panic!("No input provided for TanH layer."), } } @@ -132,7 +133,8 @@ impl> ComputeInputGradient for TanH { input_data: &[&SharedTensor], input_gradients: &mut [&mut SharedTensor]) { match output_data.get(0) { - Some(_) => backend.tanh_grad_plain(output_data[0], output_gradients[0], input_data[0], input_gradients[0]).unwrap(), + Some(_) => backend.tanh_grad(output_data[0], output_gradients[0], + input_data[0], input_gradients[0]).unwrap(), None => panic!("No output_data provided for TanH layer backward."), } } diff --git a/src/layers/common/convolution.rs b/src/layers/common/convolution.rs index 4016b861..6f9b9534 100644 --- a/src/layers/common/convolution.rs +++ b/src/layers/common/convolution.rs @@ -64,7 +64,7 @@ impl> Convolution { fn create_filter(&self, device: &DeviceType, input_shape: &[usize]) -> SharedTensor { let filter_shape = self.calculate_filter_shape(input_shape); - SharedTensor::::new(device, &filter_shape).unwrap() + SharedTensor::::new(&filter_shape) } } @@ -156,12 +156,12 @@ impl> ILayer for Convolution { fn resize_shared_workspace(&mut self, backend: Rc, workspace: Option>>) -> Option>> { let required_size = self.convolution_config.as_ref().unwrap().workspace_size(); let new_workspace = if workspace.is_none() { - Arc::new(RwLock::new(SharedTensor::::new(IBackend::device(&*backend), &(required_size)).unwrap())) + Arc::new(RwLock::new(SharedTensor::::new(&[required_size]))) } else { let old_workspace = workspace.as_ref().unwrap().clone(); let old_workspace_size = old_workspace.read().unwrap().capacity(); if old_workspace_size < required_size { - Arc::new(RwLock::new(SharedTensor::::new(IBackend::device(&*backend), &(required_size)).unwrap())) + Arc::new(RwLock::new(SharedTensor::::new(&[required_size]))) } else { workspace.unwrap() } @@ -181,7 +181,8 @@ impl> ComputeOutput for Convolution let filter_data = weights[0]; let conv_config = self.convolution_config.as_ref().unwrap(); let mut workspace = self.workspace.as_ref().unwrap().write().unwrap(); - backend.convolution_plain(filter_data, input_data[0], output_data[0], &mut workspace, conv_config).unwrap(); + backend.convolution(filter_data, input_data[0], output_data[0], + &mut workspace, conv_config).unwrap(); } } @@ -197,7 +198,9 @@ impl> ComputeInputGradient for Conv let conv_config = self.convolution_config.as_ref().unwrap(); let mut workspace = self.workspace.as_ref().unwrap().write().unwrap(); // compute gradient w.r.t. input - backend.convolution_grad_data_plain(filter_data, output_gradients[0], input_gradients[0], &mut workspace, conv_config).unwrap(); + backend.convolution_grad_data(filter_data, + output_gradients[0], input_gradients[0], + &mut workspace, conv_config).unwrap(); } } @@ -213,7 +216,9 @@ impl> ComputeParametersGradient for let conv_config = self.convolution_config.as_ref().unwrap(); let mut workspace = self.workspace.as_ref().unwrap().write().unwrap(); // compute gradient w.r.t. filter - backend.convolution_grad_filter_plain(input_data[0], output_gradients[0], filter_gradient, &mut workspace, conv_config).unwrap(); + backend.convolution_grad_filter(input_data[0], output_gradients[0], + filter_gradient, &mut workspace, + conv_config).unwrap(); } } diff --git a/src/layers/common/linear.rs b/src/layers/common/linear.rs index c7ec6391..47a71cca 100644 --- a/src/layers/common/linear.rs +++ b/src/layers/common/linear.rs @@ -18,7 +18,6 @@ //! //! In the context of convolutional neural networks this layer is also //! called a "fully-connected layer" if it is used at the end of the network. -use std::rc::Rc; use co::backend::IBackend; use co::tensor::SharedTensor; use coblas::transpose::Transpose; @@ -75,14 +74,6 @@ impl> ILayer for Linear { true } - fn init(&mut self, backend: Rc) { - let device = ::device(&backend); - let _ = self.one.add_device(device); - self.one.sync(device).unwrap(); - let _ = self.zero.add_device(device); - self.zero.sync(device).unwrap(); - } - fn reshape(&mut self, backend: ::std::rc::Rc, input_data: &mut Vec>>, @@ -106,10 +97,6 @@ impl> ILayer for Linear { output_size: self.output_size, }; filler.fill(&mut weight.write().unwrap()); - - let native_backend = ::util::native_backend(); - let bound_weight = weight.read().unwrap(); - let native_output = bound_weight.get(native_backend.device()).unwrap().as_native().unwrap(); } if let Some(weight) = weights_gradient.get(0) { weight.write().unwrap().resize(&weight_shape).unwrap(); @@ -123,12 +110,20 @@ impl> ComputeOutput for Linear { weights: &[&SharedTensor], input_data: &[&SharedTensor], output_data: &mut [&mut SharedTensor]) { - backend.gemm_plain(&self.one, Transpose::NoTrans, input_data[0], Transpose::Trans, weights[0], &self.zero, output_data[0]).unwrap(); + backend.gemm(&self.one, + Transpose::NoTrans, input_data[0], + Transpose::Trans, weights[0], + &self.zero, + output_data[0]).unwrap(); let has_bias_term = false; // TODO: implement bias term if has_bias_term { let bias_multiplier = unimplemented!(); let bias_data = unimplemented!(); - backend.gemm_plain(&self.one, Transpose::NoTrans, bias_multiplier, Transpose::NoTrans, bias_data, &self.one, output_data[0]).unwrap(); + backend.gemm(&self.one, + Transpose::NoTrans, bias_multiplier, + Transpose::NoTrans, bias_data, + &self.one, + output_data[0]).unwrap(); } } } @@ -142,7 +137,11 @@ impl> ComputeInputGradient for Linear { input_data: &[&SharedTensor], input_gradients: &mut [&mut SharedTensor]) { // Gradient with respect to input data - backend.gemm_plain(&self.one, Transpose::NoTrans, output_gradients[0], Transpose::NoTrans, weights_data[0], &self.zero, input_gradients[0]).unwrap(); + backend.gemm(&self.one, + Transpose::NoTrans, output_gradients[0], + Transpose::NoTrans, weights_data[0], + &self.zero, + input_gradients[0]).unwrap(); } } @@ -154,7 +153,11 @@ impl> ComputeParametersGradient for Linear { input_data: &[&SharedTensor], parameters_gradients: &mut [&mut SharedTensor]) { // gradient w.r.t. weights - backend.gemm_plain(&self.one, Transpose::Trans, output_gradients[0], Transpose::NoTrans, input_data[0], &self.zero, parameters_gradients[0]).unwrap(); + backend.gemm(&self.one, + Transpose::Trans, output_gradients[0], + Transpose::NoTrans, input_data[0], + &self.zero, + parameters_gradients[0]).unwrap(); // TODO: implement gradient w.r.t bias // if (bias_term_ && this->param_propagate_down_[1]) { diff --git a/src/layers/common/log_softmax.rs b/src/layers/common/log_softmax.rs index 476f2fb5..d7c06363 100644 --- a/src/layers/common/log_softmax.rs +++ b/src/layers/common/log_softmax.rs @@ -32,7 +32,7 @@ impl> ComputeOutput for LogSoftmax { _weights: &[&SharedTensor], input_data: &[&SharedTensor], output_data: &mut [&mut SharedTensor]) { - backend.log_softmax_plain(input_data[0], output_data[0]).unwrap(); + backend.log_softmax(input_data[0], output_data[0]).unwrap(); } } @@ -44,7 +44,8 @@ impl> ComputeInputGradient for LogSo output_gradients: &[&SharedTensor], input_data: &[&SharedTensor], input_gradients: &mut [&mut SharedTensor]) { - backend.log_softmax_grad_plain(output_data[0], output_gradients[0], input_gradients[0]).unwrap(); + backend.log_softmax_grad(output_data[0], output_gradients[0], + input_gradients[0]).unwrap(); } } diff --git a/src/layers/common/pooling.rs b/src/layers/common/pooling.rs index 1c1e91fa..ca033660 100644 --- a/src/layers/common/pooling.rs +++ b/src/layers/common/pooling.rs @@ -123,7 +123,8 @@ impl> ComputeOutput for Pooling output_data: &mut [&mut SharedTensor]) { let config = &self.pooling_configs[0]; match self.mode { - PoolingMode::Max => backend.pooling_max_plain(input_data[0], output_data[0], &*config).unwrap(), + PoolingMode::Max => backend.pooling_max(input_data[0], output_data[0], + &*config).unwrap(), // TODO: implement average pooling // PoolingMode::Average => unimplemented!(), } @@ -140,7 +141,9 @@ impl> ComputeInputGradient for Pooling< input_gradients: &mut [&mut SharedTensor]) { let config = &self.pooling_configs[0]; match self.mode { - PoolingMode::Max => backend.pooling_max_grad_plain(output_data[0], output_gradients[0], input_data[0], input_gradients[0], config).unwrap() + PoolingMode::Max => backend.pooling_max_grad( + output_data[0], output_gradients[0], + input_data[0], input_gradients[0], config).unwrap() } } } diff --git a/src/layers/common/softmax.rs b/src/layers/common/softmax.rs index 0ab38c6e..dfac9cdf 100644 --- a/src/layers/common/softmax.rs +++ b/src/layers/common/softmax.rs @@ -33,7 +33,7 @@ impl> ComputeOutput for Softmax { _weights: &[&SharedTensor], input_data: &[&SharedTensor], output_data: &mut [&mut SharedTensor]) { - backend.softmax_plain(input_data[0], output_data[0]).unwrap(); + backend.softmax(input_data[0], output_data[0]).unwrap(); } } @@ -45,7 +45,8 @@ impl> ComputeInputGradient for Softmax output_gradients: &[&SharedTensor], input_data: &[&SharedTensor], input_gradients: &mut [&mut SharedTensor]) { - backend.softmax_grad_plain(output_data[0], output_gradients[0], input_gradients[0]).unwrap(); + backend.softmax_grad(output_data[0], output_gradients[0], + input_gradients[0]).unwrap(); } } diff --git a/src/layers/container/sequential.rs b/src/layers/container/sequential.rs index 90b005bb..175493ba 100644 --- a/src/layers/container/sequential.rs +++ b/src/layers/container/sequential.rs @@ -158,8 +158,10 @@ impl + 'static> Sequential { info!("Input {} -> {}", self.input_data_tensors.len(), tensor_name); let ibackend: Rc> = backend; - let data_tensor: ArcLock> = Arc::new(RwLock::new(SharedTensor::new(ibackend.device(), &input_shape).unwrap())); - let gradient_tensor: ArcLock> = Arc::new(RwLock::new(SharedTensor::new(ibackend.device(), &input_shape).unwrap())); + let data_tensor: ArcLock> = Arc::new(RwLock::new( + SharedTensor::new(&input_shape))); + let gradient_tensor: ArcLock> = Arc::new(RwLock::new( + SharedTensor::new(&input_shape))); self.input_data_tensors.push(data_tensor.clone()); self.input_gradient_tensors.push(gradient_tensor.clone()); diff --git a/src/layers/loss/negative_log_likelihood.rs b/src/layers/loss/negative_log_likelihood.rs index 6082299c..f0c080f0 100644 --- a/src/layers/loss/negative_log_likelihood.rs +++ b/src/layers/loss/negative_log_likelihood.rs @@ -73,8 +73,10 @@ impl ComputeOutput for NegativeLogLikelihood { let batch_size = Self::batch_size(labels.desc()); let native = native_backend(); - let native_labels = labels.get(native.device()).unwrap().as_native().unwrap().as_slice::(); - let native_probabilities = probabilities.get(native.device()).unwrap().as_native().unwrap().as_slice::(); + let native_labels = labels.read(native.device()).unwrap() + .as_native().unwrap().as_slice::(); + let native_probabilities = probabilities.read(native.device()).unwrap() + .as_native().unwrap().as_slice::(); let mut writable_loss = Vec::::new(); for &label_value in native_labels { @@ -86,7 +88,8 @@ impl ComputeOutput for NegativeLogLikelihood { loss = loss / (batch_size as f32); writable_loss = vec![loss]; - ::util::write_to_memory(output_data[0].get_mut(native.device()).unwrap(), &writable_loss); + ::util::write_to_memory(output_data[0].write_only(native.device()).unwrap(), + &writable_loss); } } @@ -103,15 +106,16 @@ impl ComputeInputGradient for NegativeLogLikelihood { let num_classes = self.num_classes; let native = native_backend(); - let native_labels = labels.get(native.device()).unwrap().as_native().unwrap().as_slice::(); + let native_labels = labels.read(native.device()).unwrap() + .as_native().unwrap().as_slice::(); let mut writable_gradient = vec![0f32; input_gradients[0].desc().size()]; for (batch_n, &label_value) in native_labels.iter().enumerate() { let index = (num_classes * batch_n) + label_value as usize; writable_gradient[index] = -1f32; } - input_gradients[0].sync(native.device()).unwrap(); - ::util::write_to_memory(input_gradients[0].get_mut(native.device()).unwrap(), &writable_gradient); + ::util::write_to_memory(input_gradients[0].write_only(native.device()).unwrap(), + &writable_gradient); } } diff --git a/src/solver/confusion_matrix.rs b/src/solver/confusion_matrix.rs index b5b7e349..0b32bc74 100644 --- a/src/solver/confusion_matrix.rs +++ b/src/solver/confusion_matrix.rs @@ -48,7 +48,8 @@ impl ConfusionMatrix { /// The prediction for each sample of the batch is found by /// determining which output value had the smallest loss. pub fn get_predictions(&self, network_out: &mut SharedTensor) -> Vec { - let native_infered = network_out.get(native_backend().device()).unwrap().as_native().unwrap(); + let native_infered = network_out.read(native_backend().device()).unwrap() + .as_native().unwrap(); let predictions_slice = native_infered.as_slice::(); let mut predictions = Vec::::new(); diff --git a/src/solvers/mod.rs b/src/solvers/mod.rs index fcda65f0..e0e70c65 100644 --- a/src/solvers/mod.rs +++ b/src/solvers/mod.rs @@ -68,13 +68,14 @@ trait SGDSolver, NetB: IBackend + LayerOps::new(IBackend::device(backend), &1).unwrap(); + // PERF: preallocate tensor once + let mut result = SharedTensor::new(&[1]); // gradient.sumsq_diff(self.backend(), &mut result); - self.backend().dot_plain(&gradient, &gradient, &mut result); + self.backend().dot(&gradient, &gradient, &mut result); - let mut result = SharedTensor::::new(IBackend::device(backend), &1).unwrap(); - match result.add_device(native.device()) { _ => result.sync(native.device()).unwrap() } - match result.get(native.device()).unwrap() { + // FIXME: I've removed redefinition of `result` that was here. + // Code was invalid. Not sure what it meant. It may explode. + match result.read(native.device()).unwrap() { &MemoryType::Native(ref sumsq_result) => { let sumsq_diff_slice = sumsq_result.as_slice::(); sumsq_diff += sumsq_diff_slice[0]; diff --git a/src/solvers/sgd/mod.rs b/src/solvers/sgd/mod.rs index 64cab199..6159ecb9 100644 --- a/src/solvers/sgd/mod.rs +++ b/src/solvers/sgd/mod.rs @@ -31,8 +31,7 @@ macro_rules! impl_isolver_sgd { for weight_gradient in net.learnable_weights_gradients() { let shape = weight_gradient.read().unwrap().desc().clone(); - let mut tensor = SharedTensor::new(IBackend::device(&*self.backend), - &shape).unwrap(); + let mut tensor = SharedTensor::new(&shape); let filler = ::weight::FillerType::Constant { value: 0f32 }; filler.fill(&mut tensor); diff --git a/src/solvers/sgd/momentum.rs b/src/solvers/sgd/momentum.rs index 15dbb759..334e7d4b 100644 --- a/src/solvers/sgd/momentum.rs +++ b/src/solvers/sgd/momentum.rs @@ -46,19 +46,12 @@ impl> Momentum { /// /// [2]: ../../../solver/struct.Solver.html#method.from_config pub fn new(backend: Rc) -> Momentum { - let (lr, momentum) = { - let device = IBackend::device(backend.as_ref()); - - (SharedTensor::::new(device, &1).unwrap(), - SharedTensor::::new(device, &1).unwrap()) - }; - Momentum { history: Vec::new(), backend: backend, - lr: lr, - momentum: momentum, + lr: SharedTensor::::new(&[1]), + momentum: SharedTensor::::new(&[1]), } } @@ -71,6 +64,7 @@ impl, NetB: IBackend + LayerOps + 'static> SGD history_blob_id: usize, global_lr: &f32, blob_lr: &f32) { + // PERF: check if value is changed before writing it ::weight::FillerType::Constant { value: global_lr * blob_lr }.fill(&mut self.lr); @@ -83,20 +77,14 @@ impl, NetB: IBackend + LayerOps + 'static> SGD let device = IBackend::device(backend); let history_blob = &self.history[history_blob_id]; + Axpby::axpby(backend, + &self.lr, + &weight_gradient.read().unwrap(), + &self.momentum, + &mut history_blob.write().unwrap()).unwrap(); - let _ = weight_gradient.write().unwrap().add_device(device); - weight_gradient.write().unwrap().sync(device).unwrap(); - let _ = history_blob.write().unwrap().add_device(device); - history_blob.write().unwrap().sync(device).unwrap(); - - Axpby::axpby_plain(backend, - &self.lr, - &weight_gradient.read().unwrap(), - &self.momentum, - &mut history_blob.write().unwrap()).unwrap(); - - backend.copy_plain( - &history_blob.read().unwrap(), &mut weight_gradient.write().unwrap()).unwrap(); + backend.copy(&history_blob.read().unwrap(), + &mut weight_gradient.write().unwrap()).unwrap(); } } diff --git a/src/util.rs b/src/util.rs index 91f4c3ef..b7a29cf0 100644 --- a/src/util.rs +++ b/src/util.rs @@ -51,16 +51,16 @@ pub fn write_batch_sample(tensor: &mut SharedT let batch_size = tensor.desc().size(); let sample_size = batch_size / tensor.desc()[0]; - let _ = tensor.add_device(native_backend.device()); - tensor.sync(native_backend.device()).unwrap(); - write_to_memory_offset(tensor.get_mut(native_backend.device()).unwrap(), &data, i * sample_size); + write_to_memory_offset(tensor.write_only(native_backend.device()).unwrap(), + &data, + i * sample_size); } /// Create a Collenchyma SharedTensor for a scalar value. pub fn native_scalar(scalar: T) -> SharedTensor { let native = native_backend(); - let mut shared_scalar = SharedTensor::::new(native.device(), &vec![1]).unwrap(); - write_to_memory(shared_scalar.get_mut(native.device()).unwrap(), &[scalar]); + let mut shared_scalar = SharedTensor::::new(&[1]); + write_to_memory(shared_scalar.write_only(native.device()).unwrap(), &[scalar]); shared_scalar } @@ -79,20 +79,12 @@ pub trait Axpby : Axpy + Scal { /// Performs the operation y := a*x + b*y . /// /// Consists of a scal(b, y) followed by a axpby(a,x,y). - fn axpby(&self, a: &mut SharedTensor, x: &mut SharedTensor, b: &mut SharedTensor, y: &mut SharedTensor) -> Result<(), ::co::error::Error> { + fn axpby(&self, a: &SharedTensor, x: &SharedTensor, b: &SharedTensor, + y: &mut SharedTensor) -> Result<(), ::co::error::Error> { try!(self.scal(b, y)); try!(self.axpy(a, x, y)); Ok(()) } - - /// Performs the operation y := a*x + b*y . - /// - /// Consists of a scal(b, y) followed by a axpby(a,x,y). - fn axpby_plain(&self, a: &SharedTensor, x: &SharedTensor, b: &SharedTensor, y: &mut SharedTensor) -> Result<(), ::co::error::Error> { - try!(self.scal_plain(b, y)); - try!(self.axpy_plain(a, x, y)); - Ok(()) - } } impl + Scal> Axpby for T {} diff --git a/src/weight.rs b/src/weight.rs index 09fc631e..e4a8a114 100644 --- a/src/weight.rs +++ b/src/weight.rs @@ -169,23 +169,20 @@ impl FillerType { pub fn fill(&self, weight: &mut SharedTensor) { let native = native_backend(); let native_device = native.device(); - let actual_device = weight.latest_device().clone(); - // sync to native so we can fill - match weight.add_device(native_device) { _ => weight.sync(native_device).unwrap() } match *self { - FillerType::Constant { value } => Self::fill_constant(weight, value), - FillerType::Glorot { input_size, output_size } => Self::fill_glorot(weight, input_size, output_size), + FillerType::Constant { value } => + Self::fill_constant(weight, value), + FillerType::Glorot { input_size, output_size } => + Self::fill_glorot(weight, input_size, output_size), } - - // sync back to the actual device - weight.sync(&actual_device).unwrap(); } /// Directly use the [Constant Filler](#variant.Constant). pub fn fill_constant(weight: &mut SharedTensor, value: f32) { let native = native_backend(); - let native_weight = weight.get_mut(native.device()).unwrap().as_mut_native().unwrap(); + let native_weight = weight.write_only(native.device()).unwrap() + .as_mut_native().unwrap(); for e in native_weight.as_mut_slice::() { *e = value; @@ -195,7 +192,8 @@ impl FillerType { /// Directly use the [Glorot Filler](#variant.Glorot). pub fn fill_glorot(weight: &mut SharedTensor, num_inputs: usize, num_outputs: usize) { let native = native_backend(); - let native_weight = weight.get_mut(native.device()).unwrap().as_mut_native().unwrap(); + let native_weight = weight.write_only(native.device()).unwrap() + .as_mut_native().unwrap(); let init_range = (6.0f32 / (num_inputs as f32 + num_outputs as f32)).sqrt();