Coverage for src/flag_gems/experimental_ops/sinc_.py: 0% of 32 statements (coverage.py v7.6.9, created at 2026-03-24 15:40 +0800)
import torch
import triton
import triton.language as tl


@triton.jit
def _sinc_kernel(x_ptr, n_elements, BLOCK_SIZE: tl.constexpr):
    # Each program instance processes one contiguous block of elements.
    pid = tl.program_id(axis=0)
    block_start = pid * BLOCK_SIZE
    offsets = block_start + tl.arange(0, BLOCK_SIZE)
    mask = offsets < n_elements

    x = tl.load(x_ptr + offsets, mask=mask, other=0)
    x_f32 = x.to(tl.float32)  # compute in float32 for accuracy

    # Normalized sinc: sinc(x) = sin(pi * x) / (pi * x), with sinc(0) = 1.
    # For example, sinc(0.5) = sin(pi / 2) / (pi / 2) = 2 / pi ≈ 0.6366.
    pi = 3.141592653589793
    z = x_f32 * pi
    is_zero = x_f32 == 0.0
    denom = tl.where(is_zero, 1.0, z)  # substitute 1.0 to avoid dividing by zero
    s = tl.sin(z)
    y_f32 = s / denom
    y_f32 = tl.where(is_zero, 1.0, y_f32)  # patch the x == 0 lanes back to 1

    y = y_f32.to(x.dtype)
    tl.store(x_ptr + offsets, y, mask=mask)
def sinc_(x: torch.Tensor):
    # In-place sinc: overwrites x with sin(pi * x) / (pi * x), elementwise.
    assert x.is_cuda, "Input tensor must be on a CUDA device."
    assert x.is_contiguous(), "Input tensor must be contiguous."
    assert x.is_floating_point(), "sinc_ expects a floating point tensor."

    n_elements = x.numel()
    if n_elements == 0:
        return x

    BLOCK_SIZE = 1024
    grid = lambda meta: (triton.cdiv(n_elements, meta["BLOCK_SIZE"]),)
    _sinc_kernel[grid](x, n_elements, BLOCK_SIZE=BLOCK_SIZE)
    return x
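
A minimal usage sketch, assuming a CUDA device and a working Triton install; it is not part of the original module. It checks the in-place result against torch.sinc, which computes the same normalized sinc out of place:

    # Hypothetical usage example, not from the original module.
    x = torch.linspace(-3.0, 3.0, steps=1024, device="cuda")
    expected = torch.sinc(x)  # out-of-place reference
    sinc_(x)                  # overwrites x with sinc(x)
    torch.testing.assert_close(x, expected)

Note that the kernel computes in float32 and casts back to the input dtype on store, so float16 and bfloat16 inputs go through a float32 intermediate.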