Coverage for src/flag_gems/runtime/backend/_cambricon/ops/multinomial.py: 0%
62 statements
« prev ^ index » next coverage.py v7.6.9, created at 2026-03-11 02:28 +0800
1import logging
3import torch
4import triton
5import triton.language as tl
6from triton.language.extra.mlu.libdevice import philox as _philox
8from flag_gems.utils import libentry
9from flag_gems.utils.random_utils import (
10 philox_backend_seed_offset,
11 uint_to_uniform_float,
12)
# Child logger under the shared "flag_gems" root logger; lstrip(".") drops any
# leading dots so getChild receives a clean dotted suffix.
logger = logging.getLogger("flag_gems").getChild(__name__.lstrip("."))
@libentry()
@triton.jit(do_not_specialize=["K", "N", "philox_seed", "philox_offset"])
def multinomial_with_replacement(
    cdf_ptr, out_ptr, K, N, philox_seed, philox_offset, NBLOCK: tl.constexpr = 128
):
    # Sample with replacement from a (batch of) categorical distribution(s)
    # given as inclusive cumulative probabilities.
    #
    # cdf_ptr: base pointer to the normalized CDF(s); row y starts at y * K.
    # out_ptr: base pointer to the int output; row y starts at y * N.
    # K: number of categories per distribution.
    # N: number of samples to draw per distribution.
    # philox_seed / philox_offset: 64-bit philox state split into 32-bit halves
    #     below so each block draws an independent counter stream.
    # NBLOCK: samples produced per program instance.
    #
    # The computation is arranged in a 2d grid of blocks, each producing
    # a batch of samples for a particular distribution.
    # <------------------- grid.x --------------------->
    #        | dist0.batch0 | dist0.batch1 | dist0.batch2 ...
    # grid.y | dist1.batch0 | dist1.batch1 | dist1.batch2 ...
    #        | dist2.batch0 | dist2.batch1 | dist2.batch2 ...
    y_off = tl.program_id(1) * N
    n = tl.program_id(0) * NBLOCK + tl.arange(0, NBLOCK)
    y_off_step = tl.program_id(0) * NBLOCK

    # Split the 64-bit seed/offset into (low, high) 32-bit words for the
    # philox counter; c0 is perturbed per block so no two blocks reuse a
    # counter value.
    philox_seed = philox_seed.to(tl.int64)
    philox_offset = philox_offset.to(tl.int64)
    sl = (philox_seed & 0xFFFFFFFF).to(tl.uint32)
    sh = ((philox_seed >> 32) & 0xFFFFFFFF).to(tl.uint32)
    c0 = (philox_offset & 0xFFFFFFFF).to(tl.uint32)
    c1 = ((philox_offset >> 32) & 0xFFFFFFFF).to(tl.uint32)
    # NOTE(review): _philox is the MLU libdevice generator; the trailing
    # (0, 0, 10) arguments presumably select key words and 10 rounds — confirm
    # against the MLU libdevice documentation.
    r = _philox(NBLOCK, sl, sh, c0 + y_off + y_off_step, c1, 0, 0, 10)
    r = uint_to_uniform_float(r)
    # r[:, 0] implies _philox returns a 2D tensor of NBLOCK rows; only the
    # first column of random words is consumed here.
    rv = r[:, 0]

    # Do a binary search for each random number on the cumulative probabilities.
    # Each random number always selects the leftmost index of the data greater
    # than or equal to itself. However, this is likely to give a wrong result
    # in case the first probability is zero which is not expected to selected.
    # This error happens when the tossed random number is also zero. To avoid
    # this mistake, we simply perturb random variable with a small number.
    rv += 0.0001
    rv = tl.where(rv > 0.9999, 0.9999, rv)

    # Advance to this program's distribution row of the CDF.
    cdf_ptr += tl.program_id(1) * K
    start = tl.zeros((NBLOCK,), dtype=tl.int32)
    end = tl.zeros((NBLOCK,), dtype=tl.int32) + K - 1
    # floor(log2(K)) + 1 iterations suffice to shrink [start, end] to a point.
    steps = tl.math.log2(K.to(tl.float32)).to(tl.int32) + 1
    for _ in range(steps):
        mid = start + (end - start) // 2
        x = tl.load(cdf_ptr + mid, mask=n < N)
        # Keep the invariant: answer lies in [start, end] with cdf[end] >= rv.
        start = tl.where(x < rv, mid + 1, start)
        end = tl.where(x < rv, end, mid)

    # Returns the last index in case of an overflow
    start = tl.where(start >= K, K - 1, start)

    tl.store(out_ptr + y_off + n, start, mask=n < N)
def multinomial(prob, n_samples, with_replacement=False, *, gen=None):
    """Draw category indices from one or more multinomial distributions.

    Args:
        prob: 1D or 2D floating-point tensor of (unnormalized) probabilities;
            the last dimension indexes categories (at most 2**24 of them).
        n_samples: number of samples to draw per distribution.
        with_replacement: if False, each category is drawn at most once, which
            requires ``n_samples <= prob.size(-1)``.
        gen: optional generator forwarded to the philox seed/offset helper
            (only consulted on the with-replacement path).

    Returns:
        An int64 tensor of sampled indices: shape ``(n_samples,)`` for a 1D
        ``prob``; ``(n_dist, n_samples)`` for a 2D ``prob``.
    """
    logger.debug("GEMS_CAMBRICON MULTINOMIAL")
    assert prob.dtype in (torch.float16, torch.float32, torch.bfloat16, torch.float64)
    assert 0 < prob.dim() <= 2, "prob_dist must be 1 or 2 dim"
    n_categories = prob.size(-1)
    assert n_categories <= (1 << 24), "number of categories cannot exceed 2^24"
    assert (
        with_replacement or n_samples <= n_categories
    ), "cannot sample n_samples > prob.size(-1) samples without replacement."

    if (not with_replacement) or n_samples == 1:
        # Sampling without replacement (and the single-draw case) uses the
        # exponential-race trick: with q ~ Exp(1), argmax(p / q) selects index
        # i with probability proportional to p[i]; the top-k of p / q yields a
        # without-replacement sample.
        q = torch.empty_like(prob).exponential_(1.0)
        s = torch.div(prob, q, out=q)  # reuse q's storage for the scores
        if n_samples == 1:
            return torch.argmax(s, dim=-1, keepdim=True).to(torch.int64)
        _, indices = torch.topk(s, n_samples, dim=-1)
        return indices.to(torch.int64)

    # Sampling with replacement: binary-search uniform variates against the
    # normalized inclusive CDF of each distribution (kernel above).
    from . import normed_cumsum

    cum_prob = normed_cumsum(prob, dim=-1)

    if cum_prob.dim() == 1:
        n_dist = 1
        out = torch.empty((n_samples,), device=prob.device, dtype=torch.int64)
    else:
        n_dist = cum_prob.size(0)
        out = torch.empty((n_dist, n_samples), device=prob.device, dtype=torch.int64)
    # The CTA level parallelism is framed in a 2d grid of blocks with grid.y
    # indexing into distributions and grid.x output sample batches
    increment = n_dist * n_samples
    philox_seed, philox_offset = philox_backend_seed_offset(increment, generator=gen)
    grid = lambda META: (triton.cdiv(n_samples, META["NBLOCK"]), n_dist)
    multinomial_with_replacement[grid](
        cum_prob, out, n_categories, n_samples, philox_seed, philox_offset, num_warps=1
    )
    return out