Coverage for src/flag_gems/runtime/backend/_aipu/ops/multinomial.py: 0%
52 statements
« prev ^ index » next coverage.py v7.6.9, created at 2026-03-11 02:28 +0800
« prev ^ index » next coverage.py v7.6.9, created at 2026-03-11 02:28 +0800
1import logging
3import torch
4import triton
5import triton.language as tl
7from flag_gems.ops.runtime.backend._aipu.ops.cumsum import normed_cumsum
8from flag_gems.utils import libentry
9from flag_gems.utils.random_utils import philox_backend_seed_offset, uniform
# Module-level logger, keyed by module path per project convention.
logger = logging.getLogger(__name__)
@libentry()
@triton.jit(do_not_specialize=["K", "N", "philox_seed", "philox_offset"])
def multinomial_with_replacement(
    cdf_ptr, out_ptr, K, N, philox_seed, philox_offset, NBLOCK: tl.constexpr = 128
):
    # Draw N samples per distribution from the cumulative distribution
    # functions at `cdf_ptr` (one row of K cumulative probabilities per
    # grid.y index) and store the selected category indices to `out_ptr`.
    # NOTE(review): assumes each CDF row is normalized so its last entry
    # is ~1.0 (see the normed_cumsum call on the host side) — confirm.
    #
    # The computation is arranged in a 2d grid of blocks, each producing
    # a batch of samples for a particular distribution.
    #        <------------------- grid.x --------------------->
    #        | dist0.batch0 | dist0.batch1 | dist0.batch2 ...
    # grid.y | dist1.batch0 | dist1.batch1 | dist1.batch2 ...
    #        | dist2.batch0 | dist2.batch1 | dist2.batch2 ...
    y_off = tl.program_id(1) * N
    n = tl.program_id(0) * NBLOCK + tl.arange(0, NBLOCK)
    # The Philox counter is the flat output offset, so each output slot
    # receives an independent uniform variate in [0, 1).
    rv, _, _, _ = uniform(philox_seed, philox_offset, y_off + n)

    # Do a binary search for each random number on the cumulative probabilities.
    # Each random number always selects the leftmost index of the data greater
    # than or equal to itself. However, this is likely to give a wrong result
    # in case the first probability is zero, which is not expected to be
    # selected. This error happens when the tossed random number is also zero.
    # To avoid this mistake, we simply perturb the random variable by a small
    # number, and clamp it below 1 so the perturbation cannot push it past the
    # end of the CDF.
    rv += 0.0001
    rv = tl.where(rv > 0.9999, 0.9999, rv)

    # Advance the CDF pointer to this program's distribution row.
    cdf_ptr += tl.program_id(1) * K
    start = tl.zeros((NBLOCK,), dtype=tl.int32)
    end = tl.zeros((NBLOCK,), dtype=tl.int32) + K - 1
    # floor(log2(K)) + 1 halving steps close any [start, end] range of width K.
    steps = tl.math.log2(K.to(tl.float32)).to(tl.int32) + 1
    for _ in range(steps):
        mid = start + (end - start) // 2
        # Lanes with n >= N are masked out; their loaded value is unused.
        x = tl.load(cdf_ptr + mid, mask=n < N)
        # Keep the leftmost index whose cumulative probability >= rv.
        start = tl.where(x < rv, mid + 1, start)
        end = tl.where(x < rv, end, mid)

    # Returns the last index in case of an overflow
    start = tl.where(start >= K, K - 1, start)

    tl.store(out_ptr + y_off + n, start, mask=n < N)
def multinomial(prob, n_samples, with_replacement=False, *, gen=None):
    """Sample category indices from the distribution(s) in `prob`.

    Args:
        prob: 1-D or 2-D tensor of (unnormalized) probabilities; the last
            dimension indexes categories.
        n_samples: number of samples to draw per distribution.
        with_replacement: draw with replacement when True.
        gen: optional generator forwarded to the Philox seed/offset helper.

    Returns:
        An int64 tensor of sampled indices: shape ``(n_samples,)`` for a
        1-D `prob`, else ``(prob.size(0), n_samples)``.
    """
    logger.debug("GEMS MULTINOMIAL")
    assert prob.dtype in (torch.float16, torch.float32, torch.bfloat16, torch.float64)
    assert 0 < prob.dim() <= 2, "prob_dist must be 1 or 2 dim"
    n_categories = prob.size(-1)
    assert n_categories <= (1 << 24), "number of categories cannot exceed 2^24"
    assert (
        with_replacement or n_samples <= n_categories
    ), "cannot sample n_samples > prob.size(-1) samples without replacement."

    if with_replacement and n_samples != 1:
        # Replacement path: build a normalized CDF per distribution and let
        # the Triton kernel binary-search it with Philox-generated uniforms.
        cum_prob = normed_cumsum(prob, dim=-1)

        if cum_prob.dim() == 1:
            n_dist = 1
            shape = (n_samples,)
        else:
            n_dist = cum_prob.size(0)
            shape = (n_dist, n_samples)
        out = torch.empty(shape, device=prob.device, dtype=torch.int64)

        # One counter value is consumed per output element.
        philox_seed, philox_offset = philox_backend_seed_offset(
            n_dist * n_samples, generator=gen
        )

        # grid.x tiles the sample batches, grid.y indexes the distributions.
        def grid(meta):
            return (triton.cdiv(n_samples, meta["NBLOCK"]), n_dist)

        multinomial_with_replacement[grid](
            cum_prob, out, n_categories, n_samples, philox_seed, philox_offset
        )
        return out

    # Without replacement (or a single draw): exponential-perturbation trick,
    # s = argmax(p / q) with q ~ Exp(1), evaluated entirely with torch ops.
    noise = torch.empty_like(prob).exponential_(1.0)
    keys = torch.div(prob, noise, out=noise)
    if n_samples == 1:
        return torch.argmax(keys, dim=-1, keepdim=True).to(torch.int64)
    _, picked = torch.topk(keys, n_samples, dim=-1)
    return picked.to(torch.int64)