diff --git a/test/dtypes/test_nf4.py b/test/dtypes/test_nf4.py
index e3b25e3c3..55bbe0bcb 100644
--- a/test/dtypes/test_nf4.py
+++ b/test/dtypes/test_nf4.py
@@ -192,6 +192,20 @@ def test_to_copy(self, dtype: torch.dtype):
         nf4_to_dtype = inpt_tensor_nf4.to(dtype)
         torch.testing.assert_allclose(inpt_tensor, nf4_to_dtype, atol=0.13, rtol=0.13)
 
+    @unittest.skipIf(not torch.cuda.is_available(), "Need cuda for test")
+    def test_to_copy_device(self):
+        inpt_tensor = torch.rand(128, device='cpu')
+        t = to_nf4(inpt_tensor, 32, 2)
+        assert t.device == torch.device('cpu')
+        z = t.cuda()
+        assert z.device.type == "cuda"  # Because the device could be cuda:0
+        x = z.cpu()
+        assert x.device == torch.device('cpu')
+
+        inpt_tensor = torch.rand(128, device='cuda')
+        t = to_nf4(inpt_tensor, 32, 2)
+        assert t.device.type == "cuda"
+
     @parametrize("dtype", [torch.bfloat16, torch.float16, torch.float32])
     def test_to_dtype(self, dtype: torch.dtype):
         inpt_tensor = torch.rand(128, dtype=dtype)
diff --git a/torchao/dtypes/nf4tensor.py b/torchao/dtypes/nf4tensor.py
index 886eb6c0a..f09d53821 100644
--- a/torchao/dtypes/nf4tensor.py
+++ b/torchao/dtypes/nf4tensor.py
@@ -47,7 +47,7 @@ def _to_copy(func, *args, **kwargs):
     if not args[0][0].is_contiguous():
         assert args[0][0].t().is_contiguous()
         return func(args[0][0].t()).t()
-    return args[0][0].get_original_weight().to(args[1]["dtype"])
+    return args[0][0].get_original_weight().to(args[1]["dtype"]).to(args[1]["device"])
 
 
 @implements([torch.ops.aten.to.dtype])
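
As a quick sanity check outside the test suite, here is a minimal sketch of the device round-trip this patch enables; it assumes torchao with this change applied and mirrors the block sizes used in test_to_copy_device above:

import torch
from torchao.dtypes.nf4tensor import to_nf4

# Quantize a CPU tensor to NF4 (block_size=32, scaler_block_size=2, as in the test).
weight = torch.rand(128, device="cpu")
nf4_weight = to_nf4(weight, 32, 2)
assert nf4_weight.device == torch.device("cpu")

if torch.cuda.is_available():
    # Cross-device copies dispatch to aten._to_copy, which with this patch
    # forwards the requested device instead of only casting the dtype.
    on_gpu = nf4_weight.cuda()
    assert on_gpu.device.type == "cuda"  # compare the type, since it may be cuda:0
    back_on_cpu = on_gpu.cpu()
    assert back_on_cpu.device == torch.device("cpu")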