Coverage for src/flag_gems/runtime/backend/_kunlunxin/ops/randn_like.py: 0%
24 statements
« prev ^ index » next coverage.py v7.6.9, created at 2026-03-24 15:40 +0800
« prev ^ index » next coverage.py v7.6.9, created at 2026-03-24 15:40 +0800
1import logging
3import torch
4import triton
6from flag_gems.runtime import torch_device_fn
7from flag_gems.utils.random_utils import philox_backend_seed_offset
9from .randn import randn_kernel
# Module-level logger: a child of the shared "flag_gems" logger, named after
# this module's __name__ with any leading dots stripped.
11logger = logging.getLogger("flag_gems").getChild(__name__.lstrip("."))
# Unroll factor used by randn_like when deriving the block size, the grid size
# and the Philox offset increment from the element count.
# NOTE(review): presumably the number of values each kernel lane produces per
# Philox call -- confirm against randn_kernel.
12UNROLL = 4
def randn_like(
    x, *, dtype=None, layout=None, device=None, pin_memory=None, memory_format=None
):
    """Return a new tensor shaped like ``x`` and filled by ``randn_kernel``.

    The signature mirrors ``torch.randn_like``; the kernel presumably writes
    standard-normal samples (confirm against ``randn_kernel``).

    Args:
        x: Reference tensor. Supplies the output shape and, when not
            overridden, the dtype and device.
        dtype: Output dtype. Defaults to ``x.dtype``.
        layout: Accepted for signature compatibility; not used here.
        device: Output device. Defaults to ``x.device.index``.
        pin_memory: Accepted for signature compatibility; not used here.
        memory_format: Accepted for signature compatibility; not used here.

    Returns:
        The tensor allocated with ``torch.empty_like(x, device=device,
        dtype=dtype)`` after ``randn_kernel`` has populated it. For an empty
        ``x`` the (empty) tensor is returned without launching the kernel.
    """
    logger.debug("GEMS RANDN_LIKE")
    if device is None:
        device = x.device.index
    if dtype is None:
        dtype = x.dtype
    out = torch.empty_like(x, device=device, dtype=dtype)
    N = x.numel()
    if N == 0:
        # Nothing to generate. Without this guard BLOCK_SIZE would evaluate to
        # next_power_of_2(0) == 0 and the grid-size cdiv below would raise
        # ZeroDivisionError.
        return out
    # NOTE(review): 12 is presumably the number of compute clusters on the
    # target Kunlunxin device -- confirm.
    cluster_num = 12
    # Spread the work over cluster_num * UNROLL lanes, rounded up to a power of
    # two and capped at 1024 elements per block.
    per_cluster = triton.cdiv(N, cluster_num * UNROLL)
    BLOCK_SIZE = min(triton.next_power_of_2(per_cluster), 1024)
    grid_size = triton.cdiv(N, BLOCK_SIZE * UNROLL)
    # (TODO) Using Triton autotuner makes kernel parameters opaque to the caller,
    # hence we cannot obtain the per thread offset as in Pytorch.
    increment = triton.cdiv(N, UNROLL)
    philox_seed, philox_offset = philox_backend_seed_offset(increment)
    # NOTE(review): the launch is scoped to x.device even when an explicit
    # `device` was passed for `out` -- confirm this is intended.
    with torch_device_fn.device(x.device):
        randn_kernel[(grid_size,)](out, N, philox_seed, philox_offset, BLOCK_SIZE)
    return out