Coverage for src/flag_gems/experimental_ops/rsqrt_.py: 0% (38 statements)
import torch
import triton
import triton.language as tl


@triton.jit
def rsqrt_(x_ptr, n_elements, BLOCK_SIZE: tl.constexpr):
    # Each program instance handles one contiguous block of BLOCK_SIZE elements.
    pid = tl.program_id(axis=0)
    block_start = pid * BLOCK_SIZE
    offsets = block_start + tl.arange(0, BLOCK_SIZE)
    mask = offsets < n_elements

    # Compute the reciprocal square root in fp32, cast back to the input dtype,
    # and store the result in place.
    x = tl.load(x_ptr + offsets, mask=mask)
    x_fp32 = x.to(tl.float32)
    res_fp32 = 1.0 / tl.sqrt(x_fp32)
    res = res_fp32.to(x.dtype)
    tl.store(x_ptr + offsets, res, mask=mask)

# Keep a handle to the Triton kernel before defining the Python wrapper with the same name.
rsqrt__triton_kernel = rsqrt_

def rsqrt_(*args, **kwargs):
    # Resolve the input tensor from positional or keyword arguments
    # (callers may pass it as "input" or "self").
    x = None
    if len(args) >= 1:
        x = args[0]
    else:
        x = kwargs.get("input", None)
        if x is None:
            x = kwargs.get("self", None)

    if x is None:
        raise ValueError("rsqrt_ expects a tensor as its first argument")
    if not isinstance(x, torch.Tensor):
        raise TypeError("rsqrt_ expects a torch.Tensor")
    if not x.is_cuda:
        raise AssertionError("Input tensor must be on CUDA device")
    if not x.is_contiguous():
        raise AssertionError("Input tensor must be contiguous")
    if x.dtype not in (torch.float16, torch.bfloat16, torch.float32, torch.float64):
        raise TypeError(
            "rsqrt_ only supports floating point tensors (float16, bfloat16, float32, float64)"
        )

    # Nothing to launch for empty tensors; return the input to preserve in-place semantics.
    n_elements = x.numel()
    if n_elements == 0:
        return x

    # Launch one program per BLOCK_SIZE-element block; the kernel writes back in place.
    grid = lambda meta: (triton.cdiv(n_elements, meta["BLOCK_SIZE"]),)
    rsqrt__triton_kernel[grid](x, n_elements, BLOCK_SIZE=1024)
    return x
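
A minimal usage sketch, not part of the covered module: it assumes the file is importable as flag_gems.experimental_ops.rsqrt_ (the path is inferred from the coverage header and may differ in the actual package layout), applies the in-place op to a CUDA tensor, and checks the result against torch.rsqrt.

import torch
from flag_gems.experimental_ops.rsqrt_ import rsqrt_  # assumed import path

# Positive values so rsqrt is well defined; fp16 exercises the fp32 compute path.
x = torch.rand(10_000, device="cuda", dtype=torch.float16) + 0.5
expected = torch.rsqrt(x)  # out-of-place reference, computed before mutation

rsqrt_(x)  # mutates x in place and returns it
torch.testing.assert_close(x, expected, rtol=1e-3, atol=1e-3)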