Coverage for src/flag_gems/ops/std.py: 45%

110 statements  

« prev     ^ index     » next       coverage.py v7.6.9, created at 2026-03-19 02:32 +0800

1import logging 

2 

3import torch 

4import triton 

5import triton.language as tl 

6 

7from flag_gems import runtime 

8from flag_gems.utils import dim_compress 

9 

# Module-level logger; used by the ``std`` dispatcher for path-selection messages.
logger = logging.getLogger(__name__)

11 

12 

@triton.jit
def _std_map_kernel(X, Tmp_sum, Tmp_sum_sq, N, BLOCK_N: tl.constexpr):
    """Partial-reduction (map) pass for the global std.

    Each program loads one BLOCK_N tile of the flattened input, accumulates
    sum(x) and sum(x*x) in float32, and writes the two partials to
    Tmp_sum[pid] / Tmp_sum_sq[pid] for the follow-up reduce kernel.
    Out-of-range lanes load 0.0 and therefore do not affect either sum.
    """
    block_id = tl.program_id(0)
    idx = block_id * BLOCK_N + tl.arange(0, BLOCK_N)
    in_bounds = idx < N
    vals = tl.load(X + idx, mask=in_bounds, other=0.0).to(tl.float32)
    tl.store(Tmp_sum + block_id, tl.sum(vals, axis=0))
    tl.store(Tmp_sum_sq + block_id, tl.sum(vals * vals, axis=0))

23 

24 

@triton.jit
def _std_reduce_kernel(
    Tmp_sum, Tmp_sum_sq, Out, N, correction, BLOCK_NUM, BLOCK_SIZE: tl.constexpr
):
    """Single-program reduce pass for the global std.

    Folds the per-block partials produced by ``_std_map_kernel`` into the
    scalar standard deviation via var = E[x^2] - E[x]^2, rescaled to the
    requested correction, and stores sqrt(var) into ``Out`` (a 0-d tensor).
    The host only launches this kernel when N - correction > 0.
    """
    total_sum_acc = tl.zeros((BLOCK_SIZE,), dtype=tl.float32)
    total_sum_sq_acc = tl.zeros((BLOCK_SIZE,), dtype=tl.float32)
    for off in range(0, BLOCK_NUM, BLOCK_SIZE):
        offset = off + tl.arange(0, BLOCK_SIZE)
        mask = offset < BLOCK_NUM
        tmp_sum_vals = tl.load(Tmp_sum + offset, mask=mask, other=0.0).to(tl.float32)
        tmp_sum_sq_vals = tl.load(Tmp_sum_sq + offset, mask=mask, other=0.0).to(
            tl.float32
        )
        total_sum_acc += tmp_sum_vals
        total_sum_sq_acc += tmp_sum_sq_vals
    total_sum = tl.sum(total_sum_acc, axis=0)
    total_sum_sq = tl.sum(total_sum_sq_acc, axis=0)
    mean = total_sum / N
    # E[x^2] - E[x]^2 may go slightly negative from float rounding; clamped below.
    var = (total_sum_sq / N) - (mean * mean)
    # BUG FIX: previously divided by tl.maximum(N - correction, 1.0), which
    # silently used a denominator of 1 for fractional corrections with
    # 0 < N - correction < 1. The host guarantees N - correction > 0 here, so
    # divide by it directly; the 1e-12 floor (matching _std_fused_dim_kernel)
    # exists only to rule out division by zero.
    var = var * N / tl.maximum(N - correction, 1e-12)
    safe_var = tl.maximum(var, 0.0)
    std_dev = tl.sqrt(safe_var)
    tl.store(Out, std_dev.to(Out.dtype.element_ty))

48 

49 

@triton.autotune(configs=runtime.get_tuned_config("naive_reduction"), key=["M", "N"])
@triton.jit
def _std_fused_dim_kernel(
    X,
    Out,
    stride_x_row,
    stride_x_col,
    M,
    N,
    correction,
    BLOCK_M: tl.constexpr,
    BLOCK_N: tl.constexpr,
):
    """Row-wise std over the reduced (last) dimension of an (M, N) view.

    Each program owns BLOCK_M rows. Pass one accumulates the per-row sum in
    float32 to obtain the mean; pass two accumulates the masked sum of
    squared deviations; finally sqrt(var) is written to Out for the valid
    rows. Out-of-range lanes load 0.0 and are excluded from the squared sum.
    """
    tile_id = tl.program_id(axis=0)
    rows = tile_id * BLOCK_M + tl.arange(0, BLOCK_M)
    rows_ok = rows < M
    row_ptrs = X + rows[:, None] * stride_x_row

    # Pass 1: per-row sums -> mean.
    sum_tile = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
    for col_start in range(0, N, BLOCK_N):
        cols = col_start + tl.arange(0, BLOCK_N)
        valid = rows_ok[:, None] & (cols < N)[None, :]
        tile = tl.load(row_ptrs + cols[None, :] * stride_x_col, mask=valid, other=0.0)
        sum_tile += tile.to(tl.float32)
    mean = tl.sum(sum_tile, axis=1) / N

    # Pass 2: per-row sum of squared deviations from the mean.
    sq_tile = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
    for col_start in range(0, N, BLOCK_N):
        cols = col_start + tl.arange(0, BLOCK_N)
        valid = rows_ok[:, None] & (cols < N)[None, :]
        tile = tl.load(row_ptrs + cols[None, :] * stride_x_col, mask=valid, other=0.0)
        centered = tile.to(tl.float32) - mean[:, None]
        # Masked-out lanes hold -mean after centering; zero them explicitly.
        sq_tile += tl.where(valid, centered * centered, 0.0)

    sq_sum = tl.sum(sq_tile, axis=1)
    # The epsilon floor only guards against /0; host ensures N - correction > 0.
    denom = N - correction
    var = sq_sum / tl.maximum(denom, 1e-12)
    std_dev = tl.sqrt(tl.maximum(var, 0.0))
    tl.store(Out + rows, std_dev.to(Out.dtype.element_ty), mask=rows_ok)

100 

101 

def std(x, dim=None, *, correction=None, keepdim=False):
    """Standard deviation of ``x``, following ``torch.std`` semantics.

    Args:
        x: input tensor.
        dim: ``None`` for a global reduction, or an int / sequence of ints
            naming the dimensions to reduce over.
        correction: difference between N and the divisor (Bessel's
            correction); defaults to 1 (sample std).
        keepdim: if True, reduced dimensions are kept with size 1.

    Returns:
        Tensor of standard deviations; NaN-filled wherever the reduction is
        degenerate (N == 0 or N - correction <= 0), matching torch.
    """
    effective_correction = 1.0 if correction is None else float(correction)
    original_shape = x.shape
    input_ndim = x.ndim

    if dim is None:
        logger.debug("GEMS STD (Global Simple Map-Reduce Path)")
        N = x.numel()
        # Degenerate global reduction: empty input or non-positive dof.
        if N == 0 or N - effective_correction <= 0:
            return torch.full([], float("nan"), device=x.device, dtype=x.dtype)

        BLOCK_N_MAP = 1024
        BLOCK_NUM = triton.cdiv(N, BLOCK_N_MAP)
        tmp_sum = torch.empty((BLOCK_NUM,), dtype=torch.float32, device=x.device)
        tmp_sum_sq = torch.empty((BLOCK_NUM,), dtype=torch.float32, device=x.device)
        _std_map_kernel[(BLOCK_NUM,)](
            x.contiguous(), tmp_sum, tmp_sum_sq, N, BLOCK_N_MAP
        )
        out = torch.empty([], device=x.device, dtype=x.dtype)
        BLOCK_SIZE_REDUCE = 1024
        _std_reduce_kernel[(1,)](
            tmp_sum,
            tmp_sum_sq,
            out,
            N,
            effective_correction,
            BLOCK_NUM,
            BLOCK_SIZE_REDUCE,
        )
        return out.view([1] * input_ndim) if keepdim else out

    else:
        logger.warning(
            f"GEMS std: Using compatible but non-optimal path for dim={dim} (dim_compress)."
        )

        if isinstance(dim, int):
            dim_list = [dim]
        else:
            dim_list = list(dim)
        dim_list_normalized = [d % input_ndim for d in dim_list]

        # N = product of reduced-dim sizes; M = number of output rows.
        N = 1
        for d in dim_list_normalized:
            N *= original_shape[d]

        output_shape_kept = list(original_shape)
        for d in dim_list_normalized:
            output_shape_kept[d] = 1

        # BUG FIX: with an empty reduced dim (N == 0) the old code computed
        # ``M = x.numel() // N`` and raised ZeroDivisionError; torch.std
        # instead returns a NaN-filled result of the reduced shape.
        if N == 0:
            out = torch.full(
                output_shape_kept, float("nan"), device=x.device, dtype=x.dtype
            )
            return out if keepdim else out.squeeze(dim=tuple(dim_list_normalized))

        M = x.numel() // N

        # Non-empty input but non-positive degrees of freedom -> NaN output.
        if M > 0 and N - effective_correction <= 0:
            final_shape = [
                s for i, s in enumerate(original_shape) if i not in dim_list_normalized
            ]
            return torch.full(
                final_shape if not keepdim else output_shape_kept,
                float("nan"),
                device=x.device,
                dtype=x.dtype,
            )

        out = torch.empty(output_shape_kept, device=x.device, dtype=x.dtype)
        if M == 0:
            # Zero output elements: nothing to launch, empty result is correct.
            return out if keepdim else out.squeeze(dim=tuple(dim_list_normalized))

        # dim_compress moves the reduced dims to the end; assumed to yield a
        # contiguous (M, N) layout — matches the row/col strides below.
        x_view = dim_compress(x, dim_list_normalized)
        stride_x_row, stride_x_col = N, 1

        grid = lambda META: (triton.cdiv(M, META["BLOCK_M"]),)
        _std_fused_dim_kernel[grid](
            x_view, out.view(M), stride_x_row, stride_x_col, M, N, effective_correction
        )
        return out if keepdim else out.squeeze(dim=tuple(dim_list_normalized))