# src/flag_gems/utils/random_utils.py

import torch
import triton
import triton.language as tl

import flag_gems
from flag_gems.runtime import torch_device_fn

try:
    uint_to_uniform_float = tl.uint_to_uniform_float
except AttributeError:
    # Copied from the triton.language package for compatibility with older
    # Triton versions that do not export it.
    @triton.jit
    def uint_to_uniform_float(x):
        """
        Numerically stable function to convert a random uint into a random float uniformly sampled in [0, 1).
        """
        # TODO: fix frontend issues and cleanup
        # conditions can be simplified
        # scale is ((2**23 - 1) / 2**23) * 2**(N_BITS - 1)
        if tl.constexpr(x.dtype == tl.uint32) or tl.constexpr(x.dtype == tl.int32):
            # maximum value such that `MAX_INT * scale < 1.0` (with float rounding)
            x = x.to(tl.int32, bitcast=True)
            scale = 4.6566127342e-10
        else:
            tl.static_assert(
                tl.constexpr(x.dtype == tl.uint64) or tl.constexpr(x.dtype == tl.int64)
            )
            x = x.to(tl.int64, bitcast=True)
            scale = 1.0842020432385337e-19
        # Fold negative values back into the non-negative range before scaling.
        x = tl.where(x < 0, -x - 1, x)
        return x * scale
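

# Sanity-check sketch (hypothetical; not part of the original module). It
# mirrors the 32-bit branch above in plain Python to show where the extreme
# uint32 inputs land in [0, 1): 0x7FFFFFFF (the largest int32 after the
# bitcast) maps just below 1.0, and inputs that bitcast to negative int32
# are folded back into the non-negative range by `-x - 1` before scaling.
def _check_uint32_mapping():
    scale = 4.6566127342e-10  # same constant as the 32-bit branch above

    def to_uniform(u32):
        # bitcast uint32 -> int32, mirroring `x.to(tl.int32, bitcast=True)`
        x = u32 - (1 << 32) if u32 >= (1 << 31) else u32
        # fold negatives, mirroring `tl.where(x < 0, -x - 1, x)`
        x = -x - 1 if x < 0 else x
        return x * scale

    assert to_uniform(0) == 0.0
    assert 0.0 <= to_uniform(0x7FFFFFFF) < 1.0  # largest positive int32
    assert 0.0 <= to_uniform(0xFFFFFFFF) < 1.0  # bitcasts to int32 -1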


# This function is roughly a Python wrapper of CUDAGeneratorImpl::philox_cuda_state in PyTorch.
# https://github.com/pytorch/pytorch/blob/8a4597980c2692b73f35fb3c7145eaeaf2273e77/aten/src/ATen/cuda/CUDAGeneratorImpl.cpp#L452
# It returns the current state of the default Philox RNG as (seed, offset) and
# advances the generator's offset by `increment`.
def philox_backend_seed_offset(increment, generator=None):
    if generator is None:
        device = torch_device_fn.current_device()
        generator = torch_device_fn.default_generators[device]
    state_copy = generator.get_state()
    # TODO[kunlunxin]: we will upgrade torch version in 2025.04
    if flag_gems.vendor_name in ("kunlunxin", "aipu"):
        c0, c1 = state_copy.view(torch.int64)[-2], state_copy.view(torch.int64)[-1]
    else:
        c0, c1 = state_copy.view(torch.int64)
    seed, offset = int(c0), int(c1)
    # Philox produces 4 values per round, so round the increment up to a multiple of 4.
    increment = (increment + 3) // 4 * 4
    c1 += increment
    # get_state returns a new tensor, so set_state is needed to write the updated
    # offset back into the actual generator state.
    generator.set_state(state_copy)
    return seed, offset
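

# Usage sketch (hypothetical; assumes a visible accelerator device with a
# default generator). Two back-to-back reservations receive the same seed but
# non-overlapping offsets, because each call advances the generator's offset
# by the rounded-up increment.
def _demo_reserve_twice(n_elements=1000):
    seed0, offset0 = philox_backend_seed_offset(n_elements)
    seed1, offset1 = philox_backend_seed_offset(n_elements)
    # Same seed; the second reservation starts where the first one ended.
    assert seed0 == seed1
    assert offset1 == offset0 + (n_elements + 3) // 4 * 4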


def set_philox_state(seed, offset, device=None):
    # Write seed and offset directly into the default generator's packed state.
    assert offset % 4 == 0
    device = device or torch_device_fn.current_device()
    gen = torch_device_fn.default_generators[device]
    state_copy = gen.get_state()
    state_copy.view(torch.int64)[0] = seed
    state_copy.view(torch.int64)[1] = offset
    gen.set_state(state_copy)
    return
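

# Reproducibility sketch (hypothetical; assumes a visible accelerator
# device). Pinning the Philox state before reserving it replays the same
# stream; the offset must be a multiple of 4, as the assert above enforces.
def _demo_replay(seed=42):
    set_philox_state(seed, 0)
    a, off_a = philox_backend_seed_offset(1024)
    set_philox_state(seed, 0)
    b, off_b = philox_backend_seed_offset(1024)
    # The same (seed, offset) pair is handed out both times.
    assert a == b == seed
    assert off_a == off_b == 0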


def per_thread_offset(N, num_blocks, num_warps, warp_threads=32):
    # Number of elements each thread must cover so that the whole grid spans N.
    block_threads = num_warps * warp_threads
    max_threads = num_blocks * block_threads
    offset = (N + max_threads - 1) // max_threads  # ceil(N / max_threads)
    return offset
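

# Worked example (illustrative numbers): 1024 blocks x 4 warps x 32 lanes
# gives 131072 threads in flight, so covering N = 1_000_000 elements takes
# ceil(1_000_000 / 131072) = 8 elements per thread.
def _demo_per_thread_offset():
    assert per_thread_offset(1_000_000, 1024, 4) == 8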


@triton.jit
def uniform(seed, philox_offset, offset):
    seed = seed.to(tl.int64)
    philox_offset = philox_offset.to(tl.int64)
    # Split the 64-bit Philox offset into the two 32-bit counter words.
    c0 = (philox_offset & 0xFFFFFFFF).to(tl.uint32)
    c1 = ((philox_offset >> 32) & 0xFFFFFFFF).to(tl.uint32)
    i4 = offset
    c0 += i4
    _O = c0 * 0
    # One Philox round yields four independent uint32s; map each into [0, 1).
    r0, r1, r2, r3 = tl.philox(seed, c0, c1, _O, _O)
    r0 = uint_to_uniform_float(r0)
    r1 = uint_to_uniform_float(r1)
    r2 = uint_to_uniform_float(r2)
    r3 = uint_to_uniform_float(r3)
    return r0, r1, r2, r3
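

# End-to-end sketch (hypothetical; `_rand_kernel`, `_rand`, and the launch
# parameters are illustrative, not part of this module, and the "cuda"
# default device is an assumption). A host wrapper reserves a Philox state
# for N draws, then each program instance offsets the counter by its element
# index and calls `uniform`. Only r0 is stored here; a real kernel would
# consume all four outputs per counter instead of discarding r1..r3.
@triton.jit
def _rand_kernel(out_ptr, N, seed, philox_offset, BLOCK: tl.constexpr):
    pid = tl.program_id(0)
    offs = pid * BLOCK + tl.arange(0, BLOCK)
    r0, r1, r2, r3 = uniform(seed, philox_offset, offs)
    tl.store(out_ptr + offs, r0, mask=offs < N)


def _rand(N, device="cuda"):
    out = torch.empty(N, device=device)
    # Reserve enough Philox state for N draws from the default generator.
    seed, philox_offset = philox_backend_seed_offset(N)
    grid = (triton.cdiv(N, 1024),)
    _rand_kernel[grid](out, N, seed, philox_offset, BLOCK=1024)
    return out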