Coverage report for src/flag_gems/runtime/backend/_mthreads/ops/sort.py: 0% of 210 statements covered.
Generated by coverage.py v7.6.9 at 2026-03-11 02:28 +0800.
1import logging
3import torch
4import triton
5import triton.language as tl
7from flag_gems.ops.topk import _get_finfo_val, _get_iinfo_val, argsort
8from flag_gems.runtime import torch_device_fn
9from flag_gems.utils import libentry
# Module-level logger named after this backend op module, e.g.
# "flag_gems.runtime.backend._mthreads.ops.sort".
logger = logging.getLogger(
    f'flag_gems.runtime.backend._mthreads.ops.{__name__.split(".")[-1]}'
)
def unwrap_if_constexpr(o):
    """Return the plain Python value inside a ``tl.constexpr``, or ``o`` unchanged."""
    if isinstance(o, tl.constexpr):
        return o.value
    return o
@tl.constexpr
def get_int_t(num_bits: tl.constexpr, signed: tl.constexpr) -> tl.dtype:
    """Return the Triton integer dtype with the given bit width and signedness."""
    num_bits = unwrap_if_constexpr(num_bits)
    signed = unwrap_if_constexpr(signed)
    return tl.core.get_int_dtype(num_bits, signed)
@tl.constexpr
def one_zeros(num_bits: tl.constexpr) -> int:
    """Bit pattern ``1000...0``: only the top bit of a ``num_bits``-wide word set."""
    width = unwrap_if_constexpr(num_bits)
    return 1 << (width - 1)
@tl.constexpr
def zero_ones(num_bits: tl.constexpr) -> int:
    """Bit pattern ``0111...1``: every bit of a ``num_bits``-wide word set except the top one."""
    width = unwrap_if_constexpr(num_bits)
    return (1 << (width - 1)) - 1
@triton.jit
def uint_to_uint(x, descending: tl.constexpr = False):
    """Radix-key transform for unsigned ints.

    Unsigned values already compare in radix order; for a descending sort,
    flipping every bit reverses the unsigned order.
    """
    out = ~x if descending else x
    return out
@triton.jit
def int_to_uint(x, descending: tl.constexpr = False):
    """Map signed ints to unsigned ints whose unsigned order matches the
    requested (ascending or descending) signed order of ``x``."""
    num_bits: tl.constexpr = x.dtype.primitive_bitwidth
    udtype = get_int_t(num_bits, False)
    ux = tl.cast(x, udtype, bitcast=True)
    if descending:
        # 0111111....1 — XOR with this equals the bitwise NOT of the
        # ascending transform below, which reverses the unsigned order.
        bit_mask: tl.constexpr = zero_ones(num_bits)
        bit_mask_tensor = tl.full((), value=bit_mask, dtype=udtype)
        out = ux ^ bit_mask_tensor
    else:
        # 1000000...0 — flipping the sign bit biases values so that an
        # unsigned comparison matches the original signed comparison.
        sign_bit_mask: tl.constexpr = one_zeros(num_bits)
        sign_bit_mask_tensor = tl.full((), value=sign_bit_mask, dtype=udtype)
        out = ux ^ sign_bit_mask_tensor
    return out
@triton.jit
def floating_to_uint(x, descending: tl.constexpr = False):
    """Map floats to unsigned ints that compare in the requested order.

    Classic radix-sort key transform: positive floats get their sign bit set,
    negative floats get every bit flipped. ``descending`` applies the inverted
    mask so the resulting unsigned order is reversed.
    """
    num_bits: tl.constexpr = x.dtype.primitive_bitwidth
    sdtype = get_int_t(num_bits, True)
    udtype = get_int_t(num_bits, False)
    sx = x.to(sdtype, bitcast=True)
    ux = x.to(udtype, bitcast=True)

    sign_bit_mask_v: tl.constexpr = one_zeros(num_bits)
    sign_bit_mask = tl.full((), value=sign_bit_mask_v, dtype=udtype)
    # mind the dtype, right_shift for signed is arithmetic right shift:
    # it yields 0 for non-negative inputs and all-ones for negative inputs.
    # Fix for triton 3.1 or else `sx >> rshift_bits` is promoted to int32
    rshift_bits = tl.full((), value=num_bits - 1, dtype=sdtype)
    mask = sign_bit_mask | (sx >> rshift_bits).to(udtype, bitcast=True)
    tl.static_assert(mask.dtype == udtype, "type mismatch")
    # mask is:
    # 1000000000...0 for positive
    # 1111111111...1 for negative
    if descending:
        out = ux ^ (~mask)
    else:
        out = ux ^ mask
    return out.to(udtype, bitcast=True)
@triton.jit
def convert_to_uint_preverse_order(x: tl.tensor, descending: tl.constexpr = False):
    """Map ``x`` to unsigned ints whose ascending unsigned order equals the
    requested (ascending/descending) order of ``x``.

    Dispatches on dtype. NOTE(review): there is no fallback branch, so a
    dtype that is neither floating nor integer leaves ``out`` unassigned.
    """
    if x.dtype.is_floating():
        out = floating_to_uint(x, descending)
    elif x.dtype.is_int_signed():
        out = int_to_uint(x, descending)
    elif x.dtype.is_int_unsigned():
        out = uint_to_uint(x, descending)
    return out
@triton.jit
def compute_global_hist_kernel(
    arr_ptr,
    out_ptr,
    num_passes,
    m,
    n,
    tiles_n_per_cta,
    TILE_N: tl.constexpr,
    TILE_R: tl.constexpr,
    num_bits_per_pass: tl.constexpr,
    descending: tl.constexpr,
):
    """Build per-row, per-pass digit histograms for the radix sort.

    arr_ptr: (m, n) input keys.
    out_ptr: (m, num_passes, r) histogram, where r = 2 ** num_bits_per_pass
        is the number of bins. Must be zero-initialized by the caller;
        counts are accumulated with atomic adds across programs.
    Each program covers TILE_N * tiles_n_per_cta contiguous elements of one row.
    """
    pid = tl.program_id(0)
    pid_n = pid // m
    pid_m = pid % m

    r: tl.constexpr = 2**num_bits_per_pass
    bfe_mask: tl.constexpr = (1 << num_bits_per_pass) - 1  # a.k.a. 2 ** k_bits - 1
    CTA_TILE_N: tl.constexpr = TILE_N * tiles_n_per_cta
    cta_n_start = CTA_TILE_N * pid_n
    cta_n_end = tl.minimum(cta_n_start + CTA_TILE_N, n)

    for p in range(0, num_passes):  # parallel
        bit_offset = p * num_bits_per_pass
        for r_start in range(0, r, TILE_R):  # parallel
            bin_indices = r_start + tl.arange(0, TILE_R)
            acc = tl.zeros((TILE_R, TILE_N), dtype=tl.int64)
            for n_start in range(cta_n_start, cta_n_end, TILE_N):  # sequential
                n_offsets = n_start + tl.arange(0, TILE_N)  # (TILE_N, )
                mask = n_offsets < cta_n_end
                arr = tl.load(arr_ptr + pid_m * n + n_offsets, mask=mask)
                # Map keys to unsigned ints whose order matches the requested
                # sort order, then extract this pass's digit.
                arr = convert_to_uint_preverse_order(arr, descending)
                key = (arr >> bit_offset) & bfe_mask  # (TILE_N, )
                matches = tl.where(
                    mask, (bin_indices[:, None] == key), False
                )  # (TILE_R, TILE_N)
                acc += matches
            local_sum = tl.sum(acc, axis=1)
            # Accumulate this tile's per-bin counts into the global histogram.
            tl.atomic_add(
                out_ptr + pid_m * num_passes * r + p * r + bin_indices,
                local_sum,
                sem="relaxed",
            )
@triton.jit
def sweep(
    arr_ptr,
    associate_arr_ptr,  # inputs: (key & value)
    out_ptr,
    associate_out_ptr,  # outputs: (key & value)
    excumsum_bins_ptr,
    status_ptr,  # aux input and status
    n_passes,
    pass_id,
    bit_offset,
    m,
    N,
    OUT_N,
    TILE_N: tl.constexpr,
    TILE_R: tl.constexpr,
    k_bits: tl.constexpr,
    descending: tl.constexpr,
):
    """One radix-sort pass: scatter keys (and paired values) into their
    globally sorted positions for the current k_bits digit, using a
    decoupled-lookback prefix sum across tiles.

    Shapes:
      r: num_bins = 2 ** k_bits
      OUT_N: number of n-tiles = cdiv(N, TILE_N)
      arr_ptr / out_ptr: (m, N)
      excumsum_bins_ptr: (m, n_passes, r) — exclusive cumsum of bin counts
      status_ptr: (m, r, OUT_N) — per-tile status words; caller must
          zero-initialize before each pass (zero means "not ready")

    Grid: axis 0 covers (m, n-tiles), axis 1 covers groups of TILE_R bins.
    """
    pid = tl.program_id(0)
    pid_m = pid % m
    pid_n = pid // m
    pid_r = tl.program_id(1)

    # Status word layout: bit 30 = local aggregate ready,
    # bit 31 = inclusive prefix ready, low 30 bits = the count itself.
    aggregate_mask: tl.constexpr = 1 << 30
    inclusive_prefix_mask: tl.constexpr = 1 << 31
    v_mask: tl.constexpr = (1 << 30) - 1
    bfe_mask: tl.constexpr = (1 << k_bits) - 1  # a.k.a. 2 ** k_bits - 1

    # Range of bins handled by this program on grid axis 1.
    r: tl.constexpr = 2**k_bits
    cta_r_start = pid_r * TILE_R
    cta_r_end = tl.minimum(cta_r_start + TILE_R, r)

    # Load this tile's keys and extract the current digit.
    n_offsets = pid_n * TILE_N + tl.arange(0, TILE_N)  # (TILE_N, )
    mask = n_offsets < N
    arr = tl.load(arr_ptr + pid_m * N + n_offsets, mask=mask)
    arr_u = convert_to_uint_preverse_order(arr, descending)
    key = (arr_u >> bit_offset) & bfe_mask  # (TILE_N, )

    # since triton can only use scalar as condition, loop by bin_index
    # status must be pre zero-initialized, or else we have to initialize it
    for bin_index in range(cta_r_start, cta_r_end):
        matches = tl.where(mask, key == bin_index, False)  # (TILE_N, ) bool
        # cta level cumsum per bin
        # CAUTION: tl.sum in triton 3.2 does not promote type
        local_sum = tl.sum(matches.to(tl.uint32), axis=0)
        # Publish "aggregate ready" with the local count in the low bits.
        pack0 = aggregate_mask | local_sum
        status_offset = pid_m * (r * OUT_N) + bin_index * OUT_N + pid_n
        tl.store(status_ptr + status_offset, pack0, cache_modifier=".cg")

        # Decoupled lookback: walk left over predecessor tiles, summing their
        # counts until one exposes an inclusive prefix (bit 31), which covers
        # all tiles further left.
        exclusive_prefix = tl.zeros((), dtype=tl.uint32)
        i_lookback = pid_n - 1
        while i_lookback >= 0:
            flag_offset_i = pid_m * (r * OUT_N) + bin_index * OUT_N + i_lookback
            pack1 = 0
            # Spin until the predecessor publishes a non-zero status word.
            while pack1 == 0:
                # pack1 = tl.load(status_ptr + flag_offset_i, volatile=True) # uint32
                pack1 = tl.atomic_cas(status_ptr + flag_offset_i, 0, 0, sem="acquire")
            exclusive_prefix += pack1 & v_mask
            if (pack1 & aggregate_mask) == aggregate_mask:
                i_lookback -= 1
            else:
                i_lookback = -1
        # Publish our inclusive prefix so tiles to the right can stop early.
        pack2 = inclusive_prefix_mask | (exclusive_prefix + local_sum)
        tl.store(status_ptr + status_offset, pack2, cache_modifier=".cg")

        local_ex_cumsum = (
            tl.cumsum(matches.to(tl.uint32), axis=0) - matches
        )  # (TILE_N, )
        ex_cumsum_in_bin = (
            exclusive_prefix + local_ex_cumsum
        )  # global ex_cumsum_in_bin (TILE_N, )

        # ex_cumsum_bins (m, n_passes, r): where this bin starts in the output.
        ex_cumsum_bins = tl.load(
            excumsum_bins_ptr + pid_m * (n_passes * r) + pass_id * r + bin_index
        )  # scalar
        pos = ex_cumsum_bins + ex_cumsum_in_bin  # (TILE_N, )

        # Scatter keys (and the associated values, if any) to sorted positions.
        tl.store(out_ptr + pid_m * N + pos, arr, mask=matches)
        if associate_arr_ptr is not None:
            associate_arr = tl.load(
                associate_arr_ptr + pid_m * N + n_offsets, mask=mask
            )
            tl.store(associate_out_ptr + pid_m * N + pos, associate_arr, mask=matches)
def radix_sort(arr, k_bits=8, descending=False):
    """Stable LSD radix sort along the last dimension of ``arr``.

    Treats ``arr`` as (m, n) where n is the last-dim size. First builds one
    global digit histogram per (row, pass), then runs one ``sweep`` scatter
    pass per k_bits digit, ping-ponging between two buffer pairs.

    Returns ``(sorted_values, sorted_indices)`` where indices are int64
    positions into the last dimension.
    """
    n = arr.shape[-1]
    m = arr.numel() // n
    assert n < (1 << 30), "we have not implemented 2**30 per launch"
    dtype = arr.dtype
    # bool keys carry a single significant bit.
    num_bits = 1 if dtype == torch.bool else (arr.itemsize * 8)

    TILE_N = 1024
    tiles_n_per_cta = 8
    CTA_TILE_N = tiles_n_per_cta * TILE_N

    num_bins = 2**k_bits
    n_passes = triton.cdiv(num_bits, k_bits)
    TILE_R = 16

    grid_n = triton.cdiv(n, CTA_TILE_N)
    grid_for_global_hist = (m * grid_n, 1, 1)
    with torch_device_fn.device(arr.device):
        # Zero-initialized: the histogram kernel accumulates via atomic adds.
        global_hist = torch.zeros(
            (m, n_passes, num_bins), device=arr.device, dtype=torch.int32
        )
        compute_global_hist_kernel[grid_for_global_hist](
            arr,
            global_hist,
            n_passes,
            m,
            n,
            tiles_n_per_cta,
            TILE_N,
            TILE_R,
            k_bits,
            descending,
        )
        # Exclusive prefix sum over bins: starting output offset of each bin.
        ex_cumsum_bins = torch.cumsum(global_hist, -1) - global_hist
        ex_cumsum_bins = ex_cumsum_bins.to(torch.int32)

        # sort — ping-pong buffers for keys and their original indices.
        arr_in = torch.clone(arr)
        indices_in = (
            torch.arange(0, n, dtype=torch.int64, device=arr_in.device)
            .broadcast_to(arr.shape)
            .contiguous()
        )
        arr_out = torch.empty_like(arr)
        indices_out = torch.empty_like(indices_in)

        # The sweep kernel uses a different tiling than the histogram kernel.
        TILE_R = 8
        grid_r = triton.cdiv(num_bins, TILE_R)
        TILE_N = 2048
        grid_n = triton.cdiv(n, TILE_N)
        grid_for_sweep = (m * grid_n, grid_r)

        # Decoupled-lookback status words: one per (row, bin, n-tile).
        status = torch.empty(
            (m, num_bins, grid_n), device=arr.device, dtype=torch.int32
        )

        for i in range(0, n_passes):
            bit_offset = i * k_bits
            # Zero means "tile not ready"; must be reset before every pass.
            status.zero_()
            sweep[grid_for_sweep](
                arr_in,
                indices_in,
                arr_out,
                indices_out,
                ex_cumsum_bins,
                status,
                n_passes,
                i,
                bit_offset,
                m,
                n,
                grid_n,
                TILE_N,
                TILE_R,
                k_bits,
                descending,
            )
            # print(f"< sorted last {bit_offset + k_bits:>2d} bits: {arr_out}")
            arr_in, arr_out = arr_out, arr_in
            indices_in, indices_out = indices_out, indices_in
    # After the final swap, arr_in/indices_in hold the fully sorted results.
    return arr_in, indices_in
@libentry()
@triton.jit()
def sort_kernel(
    in_ptr,
    out_ptr,
    out_index_ptr,
    N: tl.constexpr,
    BLOCK_SIZE: tl.constexpr,
    DESCENDING: tl.constexpr,
    IS_FLOAT: tl.constexpr,
):
    """Sort one row of N elements entirely within a single program.

    Each program handles row ``tl.program_id(0)``. Out-of-range lanes
    (cols >= N) are filled with the dtype's extreme value chosen so they
    sort after every real element, then ``argsort`` orders the whole block.
    Writes sorted values and their within-row source indices.
    """
    cols = tl.arange(0, BLOCK_SIZE)
    mask = cols < N
    offset = tl.program_id(0) * N + cols
    in_ptr += offset
    out_ptr += offset
    out_index_ptr += offset

    if IS_FLOAT:
        # return_max=not DESCENDING: pad with max for ascending sorts,
        # with the lowest value for descending sorts.
        mask_val = _get_finfo_val(in_ptr.dtype.element_ty, return_max=not DESCENDING)
        in_val = tl.load(in_ptr, mask=mask, other=mask_val)
    else:
        mask_val = _get_iinfo_val(in_ptr.dtype.element_ty, return_max=not DESCENDING)
        in_val = tl.load(in_ptr, mask=mask, other=mask_val)

    index_val = tl.arange(0, BLOCK_SIZE)

    sorted_in_val, sorted_index_val = argsort(
        in_val, index_val, 0, descending=DESCENDING
    )
    tl.store(out_ptr, sorted_in_val, mask=mask)
    tl.store(out_index_ptr, sorted_index_val, mask=mask)
def sort(inp, dim=-1, descending=False):
    """Sort ``inp`` along ``dim``, delegating to the stable radix-sort path."""
    logger.debug("GEMS_MTHREADS SORT")
    result = sort_stable(inp, stable=False, dim=dim, descending=descending)
    return result
def sort_stable(inp, *, stable, dim=-1, descending=False):
    """Stable sort of ``inp`` along ``dim`` using the LSD radix sort above.

    Returns ``(sorted_values, sorted_indices)`` with int64 indices.
    ``stable`` is accepted for API compatibility but ignored — the radix
    sort implemented here is always stable.
    """
    logger.debug("GEMS_MTHREADS SORT.STABLE")
    # We only implement stable radix sort here
    _ = stable
    sort_elem_cnt = inp.shape[dim]
    if sort_elem_cnt == 1:
        # Single-element dim: already sorted; every index is 0.
        # NOTE(review): this returns `inp` itself rather than a copy —
        # torch.sort returns new tensors; confirm aliasing is acceptable.
        return inp, torch.zeros_like(inp, dtype=torch.int64)

    if dim < 0:
        dim = dim + inp.ndim
    if dim != inp.ndim - 1:
        # radix_sort operates on the last dim; move the target dim there.
        inp = torch.movedim(inp, dim, -1).contiguous()
    else:
        inp = inp.contiguous()

    dtype = inp.dtype
    # bool keys need only one bit per pass; all other dtypes use 4-bit digits.
    num_bits_per_pass = 1 if dtype == torch.bool else 4
    out, out_index = radix_sort(inp, num_bits_per_pass, descending)

    if dim != inp.ndim - 1:
        # Undo the earlier movedim so outputs match the caller's layout.
        out = torch.movedim(out, -1, dim)
        out_index = torch.movedim(out_index, -1, dim)
    return out, out_index