Coverage for src/flag_gems/runtime/backend/_kunlunxin/ops/rand_like.py: 0%

26 statements  

« prev     ^ index     » next       coverage.py v7.6.9, created at 2026-03-25 02:48 +0800

1import logging 

2 

3import torch 

4import triton 

5 

6from flag_gems.runtime import torch_device_fn 

7from flag_gems.utils.random_utils import philox_backend_seed_offset 

8 

9from .rand import choose_unroll, rand_kernel_1, rand_kernel_2 

10 

# Child logger under the package-wide "flag_gems" logger; lstrip(".") guards
# against a leading dot when the module is imported relatively.
logger = logging.getLogger("flag_gems").getChild(__name__.lstrip("."))
# UNROLL = 4

13 

14 

def rand_like(
    x, *, dtype=None, layout=None, device=None, pin_memory=None, memory_format=None
):
    """Return a tensor shaped like ``x`` filled with uniform randoms in [0, 1).

    Mirrors the ``torch.rand_like`` signature: ``dtype`` and ``device``
    default to those of ``x``. ``layout``, ``pin_memory`` and
    ``memory_format`` are accepted for signature compatibility but are not
    used by this Triton implementation.

    Args:
        x: Reference tensor providing shape (and default dtype/device).
        dtype: Output dtype; defaults to ``x.dtype``.
        device: Output device; defaults to ``x.device``.

    Returns:
        A new tensor of the same shape as ``x`` filled with uniform random
        values generated by a Philox-based Triton kernel.
    """
    logger.debug("GEMS RAND_LIKE")
    if device is None:
        device = x.device
    if dtype is None:
        dtype = x.dtype
    out = torch.empty_like(x, device=device, dtype=dtype)
    N = x.numel()
    # Empty tensor: nothing to fill. Returning early also avoids computing
    # next_power_of_2(0) and launching a kernel with a zero-sized grid below.
    if N == 0:
        return out
    # grid_fn = lambda meta: (triton.cdiv(N, meta["BLOCK"] * UNROLL),)
    # Target device exposes 12 clusters; choose the per-thread unroll factor
    # from the element count, then size the block so the launch roughly fills
    # the device (block size capped at 1024).
    cluster_num = 12
    UNROLL = choose_unroll(N)
    BLOCK_SIZE = min(triton.next_power_of_2(triton.cdiv(N, cluster_num * UNROLL)), 1024)
    grid = triton.cdiv(N, BLOCK_SIZE * UNROLL)
    # (TODO) Using Triton autotuner makes kernel parameters opaque to the caller,
    # hence we cannot obtain the per thread offset as in Pytorch.
    increment = triton.cdiv(N, UNROLL)
    philox_seed, philox_offset = philox_backend_seed_offset(increment)
    with torch_device_fn.device(x.device):
        # Two kernel variants specialized for small vs. large unroll factors.
        if UNROLL <= 4:
            rand_kernel_1[(grid,)](
                out, N, philox_seed, philox_offset, BLOCK_SIZE, UNROLL
            )
        else:
            rand_kernel_2[(grid,)](
                out, N, philox_seed, philox_offset, BLOCK_SIZE, UNROLL
            )
    return out