Coverage for src/flag_gems/experimental_ops/reciprocal_.py: 0% (24 statements)
import torch
import triton
import triton.language as tl


@triton.jit
def reciprocal_(x_ptr, n_elements, BLOCK_SIZE: tl.constexpr):
    # Each program instance handles one contiguous block of BLOCK_SIZE elements.
    pid = tl.program_id(axis=0)
    block_start = pid * BLOCK_SIZE
    offsets = block_start + tl.arange(0, BLOCK_SIZE)
    # Mask off out-of-bounds lanes in the final, possibly partial, block.
    mask = offsets < n_elements
    x = tl.load(x_ptr + offsets, mask=mask)
    out = 1.0 / x
    # In-place update: write the reciprocals back over the input buffer.
    tl.store(x_ptr + offsets, out, mask=mask)


# Preserve a reference to the Triton kernel before defining the Python wrapper
# with the same name.
reciprocal___kernel = reciprocal_


def reciprocal_(x: torch.Tensor):
    # Fall back to the ATen in-place op for inputs the Triton kernel does not
    # support: non-tensor arguments, CPU tensors, non-contiguous layouts, and
    # dtypes outside the supported set.
    supported_dtypes = {torch.float16, torch.bfloat16, torch.float32}
    if (
        not isinstance(x, torch.Tensor)
        or not x.is_cuda
        or not x.is_contiguous()
        or x.dtype not in supported_dtypes
    ):
        return torch.ops.aten.reciprocal_(x)

    n_elements = x.numel()
    if n_elements == 0:
        # Nothing to launch for an empty tensor.
        return x

    BLOCK_SIZE = 1024
    # One program per BLOCK_SIZE-element chunk, sized from the kernel's
    # compile-time BLOCK_SIZE constant.
    grid = lambda meta: (triton.cdiv(n_elements, meta["BLOCK_SIZE"]),)  # noqa: E731
    reciprocal___kernel[grid](x, n_elements, BLOCK_SIZE=BLOCK_SIZE)
    return x
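

# A minimal smoke test (a sketch, not part of the original module): it assumes
# a CUDA device is available and checks the in-place Triton path against
# torch.reciprocal on a size that forces a partial final block (4097 = 4 * 1024 + 1).
if __name__ == "__main__":
    t = torch.randn(4097, device="cuda", dtype=torch.float32)
    expected = torch.reciprocal(t)
    reciprocal_(t)  # mutates t in place and returns it
    torch.testing.assert_close(t, expected)
    print("reciprocal_ matches torch.reciprocal")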