Coverage for src/flag_gems/ops/arcsinh.py: 68%
53 statements
« prev ^ index » next coverage.py v7.6.9, created at 2026-03-29 04:01 +0800
« prev ^ index » next coverage.py v7.6.9, created at 2026-03-29 04:01 +0800
1# Generated by KernelGen: https://github.com/flagos-ai/KernelGen
2import logging
4import torch
5import triton
6import triton.language as tl
8logger = logging.getLogger(__name__)
@triton.jit
def arcsinh_kernel(x_ptr, out_ptr, n_elements, BLOCK_SIZE: tl.constexpr):
    """Elementwise inverse hyperbolic sine over a flat buffer.

    Each program instance handles one BLOCK_SIZE chunk of the 1-D range
    [0, n_elements); out-of-range lanes are masked off.
    """
    pid = tl.program_id(axis=0)
    block_start = pid * BLOCK_SIZE
    offsets = block_start + tl.arange(0, BLOCK_SIZE)
    mask = offsets < n_elements

    x = tl.load(x_ptr + offsets, mask=mask, other=0)

    # Compute asinh in float32 via the odd-symmetry identity:
    #   asinh(x) = sign(x) * log(|x| + sqrt(x*x + 1))
    # Working with |x| avoids catastrophic cancellation for large negative
    # inputs: the naive log(x + sqrt(x*x + 1)) underflows to log(0) = -inf
    # (e.g. x = -1e10 in float32), whereas |x| + sqrt(...) is always >= 1.
    x_f32 = x.to(tl.float32)
    abs_x = tl.abs(x_f32)
    sqrt_term = tl.sqrt(abs_x * abs_x + 1.0)
    y_abs = tl.log(abs_x + sqrt_term)
    y_f32 = tl.where(x_f32 < 0.0, -y_abs, y_abs)

    # Store result; tl.store casts float32 to the output buffer's dtype.
    tl.store(out_ptr + offsets, y_f32, mask=mask)
30def _ensure_cuda_tensor(t):
31 if not isinstance(t, torch.Tensor):
32 raise TypeError("Expected a torch.Tensor")
33 if not t.is_cuda:
34 raise ValueError("Input tensors must be on CUDA device")
35 if t.is_complex():
36 raise NotImplementedError(
37 "Complex dtypes are not supported by this Triton kernel"
38 )
def _arcsinh_impl(input_tensor: torch.Tensor, out_tensor: torch.Tensor = None):
    """Compute asinh of *input_tensor* with the Triton kernel.

    Floating-point inputs keep their dtype; all other dtypes produce a
    float32 result, mirroring basic type promotion. When *out_tensor* is
    supplied it must be a CUDA tensor with the promoted dtype and the same
    element count as the input; it is filled and returned.

    Raises:
        ValueError: *out_tensor* has a mismatched element count.
        TypeError: *out_tensor* has a dtype other than the promoted one.
    """
    _ensure_cuda_tensor(input_tensor)

    # Promotion rule: float -> same dtype, everything else -> float32.
    result_dtype = (
        input_tensor.dtype if input_tensor.is_floating_point() else torch.float32
    )

    x = input_tensor
    n_elements = x.numel()

    if out_tensor is not None:
        _ensure_cuda_tensor(out_tensor)
        if out_tensor.numel() != n_elements:
            raise ValueError(
                "Output tensor must have the same number of elements as input"
            )
        # Enforce dtype consistent with promotion.
        if out_tensor.dtype != result_dtype:
            raise TypeError(
                f"Output tensor has dtype {out_tensor.dtype}, expected {result_dtype}"
            )
        out = out_tensor
    else:
        out = torch.empty_like(x, dtype=result_dtype, device=x.device)

    # The kernel indexes flat contiguous memory, so flatten strides first.
    x_contig = x.contiguous()
    out_contig = out if out.is_contiguous() else out.contiguous()

    BLOCK_SIZE = 1024
    grid = lambda meta: (triton.cdiv(n_elements, meta["BLOCK_SIZE"]),)
    arcsinh_kernel[grid](x_contig, out_contig, n_elements, BLOCK_SIZE=BLOCK_SIZE)

    # .contiguous() on a non-contiguous `out` made a copy; propagate the
    # kernel's results back into the caller's tensor.
    if out_contig.data_ptr() != out.data_ptr():
        out.copy_(out_contig)

    return out
def arcsinh(input_tensor: torch.Tensor):
    """Return the elementwise inverse hyperbolic sine as a new tensor."""
    logger.debug("GEMS ARCSINH")
    result = _arcsinh_impl(input_tensor)
    return result
def arcsinh_out(input_tensor: torch.Tensor, out: torch.Tensor):
    """Elementwise inverse hyperbolic sine written into *out*; returns *out*."""
    logger.debug("GEMS ARCSINH_OUT")
    result = _arcsinh_impl(input_tensor, out)
    return result