Coverage for src/flag_gems/ops/triu.py: 66%

106 statements  

« prev     ^ index     » next       coverage.py v7.6.9, created at 2026-03-26 15:32 +0800

1import logging 

2 

3import torch 

4import triton 

5import triton.language as tl 

6 

7from flag_gems import runtime 

8from flag_gems.runtime import torch_device_fn 

9from flag_gems.utils import libentry 

10from flag_gems.utils import triton_lang_extension as tle 

11 

12logger = logging.getLogger(__name__) 

13 

14 

@libentry()
@triton.autotune(configs=runtime.get_tuned_config("triu"), key=["M", "N"])
@triton.jit(do_not_specialize=["diagonal"])
def triu_kernel(
    X,
    Y,
    M,
    N,
    diagonal,
    M_BLOCK_SIZE: tl.constexpr,
    N_BLOCK_SIZE: tl.constexpr,
):
    """Write the upper-triangular part of a row-major (M, N) matrix X into Y.

    Elements with ``col >= row + diagonal`` are copied from X; all other
    elements of Y are set to 0. Each program instance owns a strip of
    M_BLOCK_SIZE rows and sweeps all N columns in chunks of N_BLOCK_SIZE.

    X and Y are assumed contiguous 2-D buffers (row stride == N), as enforced
    by the Python wrappers below.
    """
    pid = tle.program_id(0)
    # Row indices for this program, shaped (M_BLOCK_SIZE, 1) so they broadcast
    # against the (1, N_BLOCK_SIZE) column indices.
    row = pid * M_BLOCK_SIZE + tl.arange(0, M_BLOCK_SIZE)[:, None]
    m_mask = row < M
    # Advance base pointers to the first element of each row (row stride N).
    X += row * N
    Y += row * N

    for n_offset in range(0, N, N_BLOCK_SIZE):
        cols = n_offset + tl.arange(0, N_BLOCK_SIZE)[None, :]
        n_mask = cols < N
        # `and` performs elementwise logical-and on tensors in Triton.
        mask = m_mask and n_mask

        x = tl.load(X + cols, mask, other=0.0)
        # Keep on/above the diagonal shifted by `diagonal`, zero the rest.
        y = tl.where(row + diagonal <= cols, x, 0.0)
        tl.store(Y + cols, y, mask=mask)

41 

42 

@libentry()
@triton.autotune(
    configs=runtime.get_tuned_config("triu_batch"),
    key=["batch", "MN", "N", "diagonal"],
)
@triton.jit(do_not_specialize=["diagonal"])
def triu_batch_kernel(
    X,
    Y,
    batch,
    MN,
    N,
    diagonal,
    BATCH_BLOCK_SIZE: tl.constexpr,
    MN_BLOCK_SIZE: tl.constexpr,
):
    """Batched triu over X viewed as a contiguous (batch, MN) buffer.

    MN is the flattened size of one (M, N) matrix; the row/column of each
    flat offset is recovered with `// N` and `% N`. Grid axis 0 tiles the
    batch dimension, axis 1 tiles the flattened matrix.
    """
    batch_id = tle.program_id(0)
    mn_id = tle.program_id(1)
    # Batch indices for this program, shaped (BATCH_BLOCK_SIZE, 1).
    row = batch_id * BATCH_BLOCK_SIZE + tl.arange(0, BATCH_BLOCK_SIZE)[:, None]
    batch_mask = row < batch
    # Advance base pointers to the start of each matrix (batch stride MN).
    X += row * MN
    Y += row * MN

    cols = mn_id * MN_BLOCK_SIZE + tl.arange(0, MN_BLOCK_SIZE)[None, :]
    mn_mask = cols < MN
    # `and` performs elementwise logical-and on tensors in Triton.
    mask = batch_mask and mn_mask
    x = tl.load(X + cols, mask, other=0.0)
    # Recover the 2-D (row, col) position of each flat offset within a matrix.
    m = cols // N
    n = cols % N
    # Keep on/above the diagonal shifted by `diagonal`, zero the rest.
    y = tl.where(m + diagonal <= n, x, 0.0)
    tl.store(Y + cols, y, mask=mask)

74 

75 

76def _check_batch_contiguous(tensor, allow_zero_stride=True): 

77 if tensor.is_contiguous(): 

78 return True, tensor 

79 

80 dims = tensor.dim() 

81 

82 if dims >= 2: 

83 n = tensor.size(-1) 

84 stride_row, stride_col = tensor.stride(-2), tensor.stride(-1) 

85 

86 if not (stride_col == 1 and stride_row == n): 

87 return False, tensor.contiguous() 

88 

89 if allow_zero_stride and dims <= 3: 

90 return True, tensor 

91 

92 expected_stride = tensor.size(-1) * tensor.size(-2) 

93 for i in range(dims - 3, -1, -1): 

94 if ( 

95 allow_zero_stride 

96 and i == 0 

97 and (tensor.stride(i) == 0 or tensor.size(i) == 1) 

98 ): 

99 continue 

100 

101 if tensor.stride(i) != expected_stride: 

102 return False, tensor.contiguous() 

103 

104 expected_stride *= tensor.size(i) 

105 

106 return True, tensor 

107 

108 

def triu(A, diagonal=0):
    """Return the upper-triangular part of `A`, batched over leading dims.

    Equivalent to ``torch.triu``: elements below the `diagonal`-th diagonal
    of the last two dimensions are zeroed. A contiguous copy of the input is
    made when its layout does not satisfy the kernels' requirements.

    Args:
        A: tensor with at least 2 dimensions.
        diagonal: integer diagonal offset (0 = main, >0 above, <0 below).

    Returns:
        A new contiguous tensor with the same shape and dtype as `A`.
    """
    logger.debug("GEMS TRIU")

    assert len(A.shape) > 1, "Input tensor must have at least 2 dimensions"
    # Cast for consistency with triu_; the kernels expect an integer offset.
    diagonal = int(diagonal)

    # The flag is irrelevant here: A_input is already the tensor to use
    # (a contiguous copy was substituted when the layout was unsuitable).
    _, A_input = _check_batch_contiguous(A, allow_zero_stride=False)

    out = torch.empty(
        A.shape, dtype=A.dtype, device=A.device, memory_format=torch.contiguous_format
    )

    M, N = A_input.shape[-2:]

    with torch_device_fn.device(A_input.device):
        if A_input.dim() == 2:
            grid = lambda meta: (triton.cdiv(M, meta["M_BLOCK_SIZE"]),)
            triu_kernel[grid](A_input, out, M, N, diagonal)
        else:
            # Integer floor-division: exact for any tensor size, unlike the
            # float division it replaces (which could round for huge numel).
            batch = A_input.numel() // (M * N)
            B = A_input.view(batch, -1)
            grid = lambda meta: (
                triton.cdiv(batch, meta["BATCH_BLOCK_SIZE"]),
                triton.cdiv(M * N, meta["MN_BLOCK_SIZE"]),
            )
            triu_batch_kernel[grid](B, out, batch, M * N, N, diagonal)
            out = out.view(A.shape)

    return out

137 

138 

def triu_(A, diagonal=0):
    """In-place triu: zero elements below the `diagonal`-th diagonal of the
    last two dimensions of `A`.

    When `A` already satisfies the kernels' layout requirements the kernels
    read and write `A` directly; otherwise the result is computed into a
    temporary contiguous tensor and copied back into `A`.

    Args:
        A: tensor with at least 2 dimensions; modified in place.
        diagonal: integer diagonal offset (0 = main, >0 above, <0 below).

    Returns:
        `A`, after in-place modification.
    """
    logger.debug("GEMS TRIU_ (inplace)")

    assert len(A.shape) > 1, "Input tensor must have at least 2 dimensions"
    diagonal = int(diagonal)
    M, N = A.shape[-2:]

    can_use_directly, A_to_use = _check_batch_contiguous(A, allow_zero_stride=True)

    if not can_use_directly:
        logger.debug(
            "Input tensor does not satisfy contiguity requirements, "
            "using temporary tensor for computation"
        )

        result_temp = torch.empty_like(A_to_use, memory_format=torch.contiguous_format)

        with torch_device_fn.device(A.device):
            if A.dim() == 2:
                grid = lambda meta: (triton.cdiv(M, meta["M_BLOCK_SIZE"]),)
                triu_kernel[grid](A_to_use, result_temp, M, N, diagonal)
            else:
                # Integer floor-division: exact for any tensor size, unlike
                # the float division it replaces.
                batch = A.numel() // (M * N)
                B = A_to_use.view(batch, -1)
                result_temp_flat = result_temp.view(batch, -1)
                grid = lambda meta: (
                    triton.cdiv(batch, meta["BATCH_BLOCK_SIZE"]),
                    triton.cdiv(M * N, meta["MN_BLOCK_SIZE"]),
                )
                triu_batch_kernel[grid](B, result_temp_flat, batch, M * N, N, diagonal)

        A.copy_(result_temp)
    else:
        # NOTE(review): a stride-0 batch dim accepted by allow_zero_stride=True
        # may not be viewable as (batch, -1) below — confirm callers never pass
        # expanded tensors to the in-place op.
        with torch_device_fn.device(A.device):
            if A.dim() == 2:
                grid = lambda meta: (triton.cdiv(M, meta["M_BLOCK_SIZE"]),)
                # Same buffer as input and output: each program loads an
                # element before its masked store, so this is self-consistent.
                triu_kernel[grid](A, A, M, N, diagonal)
            else:
                batch = A.numel() // (M * N)
                B = A.view(batch, -1)
                grid = lambda meta: (
                    triton.cdiv(batch, meta["BATCH_BLOCK_SIZE"]),
                    triton.cdiv(M * N, meta["MN_BLOCK_SIZE"]),
                )
                triu_batch_kernel[grid](B, B, batch, M * N, N, diagonal)

    return A