Coverage for src/flag_gems/fused/FLA/index.py: 69%
13 statements
« prev ^ index » next coverage.py v7.6.9, created at 2026-03-07 22:33 +0800
1# This file contains code copied from the flash-linear-attention project.
2# The original source code was licensed under the MIT license and included
3# the following copyright notice:
4# Copyright (c) 2023-2025, Songlin Yang, Yu Zhang
5# ruff: noqa: E501
6import torch
7import triton
9from flag_gems.fused.FLA.utils import tensor_cache
@tensor_cache
def prepare_lens(cu_seqlens: torch.LongTensor) -> torch.LongTensor:
    """Return the per-sequence lengths encoded by cumulative offsets.

    Args:
        cu_seqlens: 1-D tensor of cumulative sequence boundaries
            (e.g. ``[0, l0, l0+l1, ...]``).

    Returns:
        1-D tensor of consecutive differences, i.e. each sequence's length.
    """
    # torch.diff(x) is exactly x[1:] - x[:-1] for a 1-D tensor.
    return torch.diff(cu_seqlens)
@tensor_cache
def prepare_chunk_indices(
    cu_seqlens: torch.LongTensor, chunk_size: int
) -> torch.LongTensor:
    """Map every chunk to its (sequence index, chunk-within-sequence index).

    Args:
        cu_seqlens: 1-D tensor of cumulative sequence boundaries.
        chunk_size: number of positions per chunk; each sequence
            contributes ``ceil(len / chunk_size)`` chunks.

    Returns:
        Tensor of shape ``(total_chunks, 2)`` — column 0 is the owning
        sequence's index, column 1 the chunk's position inside that
        sequence — cast to the dtype/device of ``cu_seqlens``.
    """
    chunks_per_seq = triton.cdiv(prepare_lens(cu_seqlens), chunk_size).tolist()
    # Concatenated ramps [0..n_i) — one ramp per sequence.
    within_seq = torch.cat([torch.arange(count) for count in chunks_per_seq])
    # Each ramp restarts at 0, so counting zeros (minus one) yields the
    # zero-based index of the sequence each chunk belongs to.
    seq_idx = within_seq.eq(0).cumsum(0) - 1
    return torch.stack((seq_idx, within_seq), 1).to(cu_seqlens)
@tensor_cache
def prepare_chunk_offsets(
    cu_seqlens: torch.LongTensor, chunk_size: int
) -> torch.LongTensor:
    """Return cumulative chunk offsets, one boundary per sequence plus a leading 0.

    Args:
        cu_seqlens: 1-D tensor of cumulative sequence boundaries.
        chunk_size: number of positions per chunk.

    Returns:
        1-D tensor ``[0, c0, c0+c1, ...]`` where ``c_i`` is the chunk
        count of sequence ``i`` — the chunk-level analogue of ``cu_seqlens``.
    """
    chunk_counts = triton.cdiv(prepare_lens(cu_seqlens), chunk_size)
    # new_zeros keeps the dtype/device of cu_seqlens for the leading 0.
    leading_zero = cu_seqlens.new_zeros(1)
    return torch.cat((leading_zero, chunk_counts)).cumsum(-1)