diff --git a/bmtrain/block_layer.py b/bmtrain/block_layer.py
index 98200465..aa154b96 100644
--- a/bmtrain/block_layer.py
+++ b/bmtrain/block_layer.py
@@ -1,4 +1,5 @@
 from typing import Dict, Iterable, Iterator, Union, List
+import gc
 from .utils import (round_up, tp_split_tensor)
 from .global_var import config
 
@@ -215,6 +216,9 @@ def init_param_storage(self):
                 param.data[:] = \
                     torch.tensor([], dtype=d_dtype, device=d_device).set_(contiguous_param.storage(), offset_st, (offset_end - offset_st,))[:]
                 del contiguous_param
+
+                gc.collect()
+                torch.cuda.empty_cache()
             else:
                 param.data = torch.tensor([], dtype=param.dtype, device=param.device)
             setattr(param, "_start_partition", None)
@@ -374,6 +378,9 @@ def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict,
                     torch.tensor([], dtype=d_dtype, device=d_device).set_(self._storage_params[kw_name].storage(), to_offset_st, (to_offset_end - to_offset_st,))[:] = \
                         torch.tensor([], dtype=d_dtype, device=d_device).set_(contiguous_param.storage(), offset_st, (offset_end - offset_st,))[:]
                     del contiguous_param
+
+                    gc.collect()
+                    torch.cuda.empty_cache()
             elif strict:
                 missing_keys.append(key)
 
diff --git a/bmtrain/nccl/__init__.py b/bmtrain/nccl/__init__.py
index 0f4129d5..544dd3e3 100644
--- a/bmtrain/nccl/__init__.py
+++ b/bmtrain/nccl/__init__.py
@@ -119,7 +119,7 @@ def allReduce(
     If src == dst, the operation is performed in-place.
 
     """
-    assert src.dtype == dst.dtype, "send and recv buffers must be the same time"
+    assert src.dtype == dst.dtype, "send and recv buffers must be the same type"
     assert src.is_cuda and dst.is_cuda
 
     sendbuff = src.data_ptr()
@@ -197,7 +197,7 @@ def broadcast(
 
 
     """
-    assert src.dtype == dst.dtype, "send and recv buffers must be the same time"
+    assert src.dtype == dst.dtype, "send and recv buffers must be the same type"
     assert src.is_cuda and dst.is_cuda
 
    sendbuff = src.data_ptr()
@@ -237,7 +237,7 @@ def reduce(
     If src == dst, the operation is performed in-place.
 
     """
-    assert src.dtype == dst.dtype, "send and recv buffers must be the same time"
+    assert src.dtype == dst.dtype, "send and recv buffers must be the same type"
     assert src.is_cuda and dst.is_cuda
 
     sendbuff = src.data_ptr()
@@ -266,7 +266,7 @@ def allGather(
     The dst buffer is only used on rank root.
 
     """
-    assert src.dtype == dst.dtype, "send and recv buffers must be the same time"
+    assert src.dtype == dst.dtype, "send and recv buffers must be the same type"
     assert src.is_cuda and dst.is_cuda
 
     sendbuff = src.data_ptr()
@@ -303,7 +303,7 @@ def reduceScatter(
     The dst buffer on rank `i` will contail the i-th block of the reduced result.
 
     """
-    assert src.dtype == dst.dtype, "send and recv buffers must be the same time"
+    assert src.dtype == dst.dtype, "send and recv buffers must be the same type"
     assert src.is_cuda and dst.is_cuda
 
     sendbuff = src.data_ptr()
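Note (not part of the patch): below is a minimal, self-contained sketch of the cleanup pattern the block_layer.py hunks add after `del contiguous_param`. The helper name `build_then_release` and the tensor size are illustrative only; the sketch assumes a CUDA-capable PyTorch install and simply shows why `gc.collect()` plus `torch.cuda.empty_cache()` lowers reserved device memory after a temporary copy is dropped.

import gc
import torch

def build_then_release(numel: int = 1 << 24) -> None:
    # Sketch only: `tmp` stands in for the temporary contiguous_param copy
    # created while filling the long-lived parameter storage.
    if not torch.cuda.is_available():
        return
    tmp = torch.empty(numel, dtype=torch.float16, device="cuda")
    # ... copy `tmp` into the persistent storage here ...
    del tmp                    # drop the only reference to the temporary tensor
    gc.collect()               # ensure the freed tensor is collected promptly
    torch.cuda.empty_cache()   # return cached blocks to the CUDA driver
    print(f"reserved after cleanup: {torch.cuda.memory_reserved() // 2**20} MiB")

if __name__ == "__main__":
    build_then_release()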