Coverage for src/flag_gems/ops/i0.py: 52%
61 statements
« prev ^ index » next — coverage.py v7.6.9, created at 2026-03-24 15:40 +0800
1# Generated by KernelGen: https://github.com/flagos-ai/KernelGen
2import logging
4import torch
5import triton
6import triton.language as tl
# Module-level logger named after this module, per the standard logging convention.
logger = logging.getLogger(__name__)
@triton.jit
def i0_kernel(x_ptr, out_ptr, n_elements, BLOCK_SIZE: tl.constexpr):
    """Elementwise modified Bessel function of the first kind, order 0.

    Each program instance handles one contiguous tile of BLOCK_SIZE elements.
    Computation is done in float32 regardless of the input dtype; the store
    casts back to the dtype of ``out_ptr``.

    The two polynomials below appear to be the classic Abramowitz & Stegun
    rational approximations for I0 (the same coefficients as Numerical
    Recipes' ``bessi0``) — small-argument fit for |x| <= 3.75 and the
    asymptotic form exp(|x|)/sqrt(|x|) * P(3.75/|x|) otherwise.
    """
    pid = tl.program_id(axis=0)
    block_start = pid * BLOCK_SIZE
    offsets = block_start + tl.arange(0, BLOCK_SIZE)
    mask = offsets < n_elements
    # Out-of-range lanes load 0; their results are discarded by the masked store.
    x = tl.load(x_ptr + offsets, mask=mask, other=0)
    x_f32 = x.to(tl.float32)
    ax = tl.abs(x_f32)
    # Small region: |x| <= 3.75.  Uses x/3.75 (not |x|/3.75); since only the
    # square ``y`` enters the polynomial, the sign is irrelevant.
    t = x_f32 / 3.75
    y = t * t
    p_small = 1.0 + y * (
        3.5156229
        + y
        * (
            3.0899424
            + y * (1.2067492 + y * (0.2659732 + y * (0.0360768 + y * 0.0045813)))
        )
    )
    # Large region: |x| > 3.75.  yb = 3.75/|x| is in (0, 1) on the lanes that
    # actually use this branch.
    yb = 3.75 / ax
    p_big = 0.39894228 + yb * (
        0.01328592
        + yb
        * (
            0.00225319
            + yb
            * (
                -0.00157565
                + yb
                * (
                    0.00916281
                    + yb
                    * (
                        -0.02057706
                        + yb * (0.02635537 + yb * (-0.01647633 + yb * 0.00392377))
                    )
                )
            )
        )
    )
    # NOTE(review): the big-branch expression IS evaluated for every lane
    # (including ax == 0, where it divides by zero); the bogus values are
    # simply not selected by tl.where below, and out-of-range lanes are
    # dropped by the masked store.  Not masking in the load sense.
    res_big = tl.exp(ax) * p_big / tl.sqrt(ax)
    use_small = ax <= 3.75
    res = tl.where(use_small, p_small, res_big)
    # Store result; Triton casts to the dtype of out_ptr as needed.
    tl.store(out_ptr + offsets, res, mask=mask)
def _launch_i0(out: torch.Tensor, x: torch.Tensor):
    """Launch ``i0_kernel`` over ``x`` and write the results into ``out``.

    Both tensors must be CUDA tensors on the same device with equal element
    counts.  The kernel reads/writes flat contiguous buffers, so a
    non-contiguous ``out`` is computed into a contiguous scratch tensor and
    copied back at the end.  Returns ``out``.
    """
    assert x.is_cuda and out.is_cuda, "Input and output must be CUDA tensors"
    assert (
        out.numel() == x.numel()
    ), "Input and output must have the same number of elements"
    assert out.device == x.device, "Input and output must be on the same device"

    src = x
    dst = out

    # Promote integer/bool inputs to a floating dtype before the kernel runs.
    if not src.is_floating_point():
        src = src.to(torch.get_default_dtype())

    # Match the output dtype up front; the kernel computes in fp32 internally
    # and the store casts to the output buffer's dtype.
    if src.dtype != dst.dtype:
        src = src.to(dst.dtype)

    src_flat = src.contiguous()
    copy_back = not dst.is_contiguous()
    dst_flat = dst.contiguous() if copy_back else dst

    total = dst_flat.numel()
    BLOCK_SIZE = 1024

    def grid(meta):
        # One program per BLOCK_SIZE-element tile.
        return (triton.cdiv(total, meta["BLOCK_SIZE"]),)

    i0_kernel[grid](src_flat, dst_flat, total, BLOCK_SIZE=BLOCK_SIZE)

    if copy_back:
        dst.copy_(dst_flat)
    return dst
def i0(x: torch.Tensor) -> torch.Tensor:
    """Compute the modified Bessel function I0 of ``x`` elementwise on CUDA.

    Args:
        x: CUDA tensor.  Non-floating inputs are promoted to the default
           floating dtype (mirroring PyTorch's type-promotion behavior for
           ``torch.i0``); floating inputs keep their dtype.

    Returns:
        A new tensor of the result dtype with the same shape as ``x``.

    Raises:
        ValueError: if ``x`` is not on a CUDA device.
    """
    logger.debug("GEMS I0")
    if not x.is_cuda:
        raise ValueError("i0: input tensor must be on CUDA device")
    # Result dtype follows PyTorch's floating type behavior.
    out_dtype = x.dtype if x.is_floating_point() else torch.get_default_dtype()
    # Allocate directly from x's metadata.  The previous code did
    # ``empty_like(x.to(dtype=out_dtype), ...)``, which materialized a full
    # cast copy of x just to borrow its shape/strides/device; empty_like with
    # a dtype override yields the same allocation (both default to
    # preserve_format on x's device) without the O(n) copy.
    out = torch.empty_like(x, dtype=out_dtype)
    _launch_i0(out, x)
    return out
def i0_out(x: torch.Tensor, out: torch.Tensor):
    """Out-variant of ``i0``: write I0(x) elementwise into ``out``.

    Args:
        x:   CUDA input tensor.
        out: CUDA floating-point tensor with the same number of elements
             as ``x``; receives the result.

    Returns:
        ``out``.

    Raises:
        ValueError: if either tensor is not on CUDA, or the element
            counts differ.
        TypeError:  if ``out`` is not a floating-point tensor.
    """
    logger.debug("GEMS I0_OUT")
    # Guard clauses, cheapest checks first.
    both_cuda = x.is_cuda and out.is_cuda
    if not both_cuda:
        raise ValueError("i0_out: input and output tensors must be on CUDA device")
    if not out.is_floating_point():
        raise TypeError("i0_out: output tensor must be a floating point type")
    if x.numel() != out.numel():
        raise ValueError(
            "i0_out: input and output must have the same number of elements"
        )
    _launch_i0(out, x)
    return out