Coverage for src/flag_gems/ops/lift_fresh_copy.py: 38%
55 statements
« prev ^ index » next coverage.py v7.6.9, created at 2026-03-25 02:48 +0800
1# Generated by KernelGen: https://github.com/flagos-ai/KernelGen
2import logging
4import torch
5import triton
6import triton.language as tl
8logger = logging.getLogger(__name__)
@triton.jit
def _copy_kernel(in_ptr, out_ptr, n_elements, BLOCK_SIZE: tl.constexpr):
    """Flat element-wise copy: out[i] = in[i] for every i < n_elements.

    Each program instance handles one BLOCK_SIZE-wide slice of the
    flattened buffer; the bounds mask guards the ragged tail block.
    """
    slice_start = tl.program_id(axis=0) * BLOCK_SIZE
    idx = slice_start + tl.arange(0, BLOCK_SIZE)
    in_bounds = idx < n_elements
    vals = tl.load(in_ptr + idx, mask=in_bounds)
    tl.store(out_ptr + idx, vals, mask=in_bounds)
def lift_fresh_copy(*args, **kwargs):
    """Return a fresh contiguous CUDA copy of the first Tensor argument.

    The input tensor is located by priority: first positional argument,
    then the ``self`` keyword, then any remaining tensor-valued argument.
    The copy is produced on-device by the Triton ``_copy_kernel``.

    Raises:
        ValueError: if no Tensor argument is found, or it is not on CUDA.
    """
    logger.debug("GEMS LIFT_FRESH_COPY")
    # Locate the input tensor (positional first, then kwargs['self'],
    # then any other tensor-valued argument).
    if args and isinstance(args[0], torch.Tensor):
        x = args[0]
    elif isinstance(kwargs.get("self"), torch.Tensor):
        x = kwargs["self"]
    else:
        x = next(
            (v for v in (*args, *kwargs.values()) if isinstance(v, torch.Tensor)),
            None,
        )
    if x is None:
        raise ValueError("lift_fresh_copy expects a Tensor argument")
    if not x.is_cuda:
        raise ValueError("lift_fresh_copy Triton kernel requires a CUDA tensor")

    x_contig = x.contiguous()
    out = torch.empty_like(x_contig, memory_format=torch.contiguous_format)

    total = x_contig.numel()
    grid = lambda meta: (triton.cdiv(total, meta["BLOCK_SIZE"]),)
    _copy_kernel[grid](x_contig, out, total, BLOCK_SIZE=1024)
    return out.view_as(x_contig)
def lift_fresh_copy_out(x: torch.Tensor, out: torch.Tensor = None):
    """Copy ``x`` into ``out`` (allocating it if needed) with a Triton kernel.

    Args:
        x: CUDA input tensor.
        out: optional CUDA destination tensor with ``x``'s dtype.  It is
            resized to ``x``'s shape only when its element count differs
            (mirroring torch's out-variant convention); a non-contiguous
            ``out`` with a matching element count keeps its layout.

    Returns:
        The destination tensor, viewed with the input's shape.

    Raises:
        ValueError: if ``x`` is not a CUDA Tensor, or ``out`` is on the
            wrong device / has a mismatched dtype.
    """
    logger.debug("GEMS LIFT_FRESH_COPY_OUT")
    if x is None or not isinstance(x, torch.Tensor):
        raise ValueError("lift_fresh_copy_out expects 'x' to be a Tensor")
    if not x.is_cuda:
        raise ValueError("lift_fresh_copy_out Triton kernel requires CUDA tensors")

    x_contig = x.contiguous()

    if out is None:
        out = torch.empty_like(x_contig, memory_format=torch.contiguous_format)
    else:
        if not out.is_cuda:
            raise ValueError("Output tensor 'out' must be on CUDA")
        if out.dtype != x_contig.dtype:
            raise ValueError("Output tensor 'out' must have the same dtype as input")
        # Only a capacity mismatch forces a resize; resize_ also leaves the
        # tensor contiguous.  (Previously non-contiguity alone triggered a
        # destructive resize_ of the caller's tensor.)
        if out.numel() != x_contig.numel():
            out.resize_(x_contig.shape)

    # BUG FIX: the kernel writes through flat offsets, so it needs a
    # contiguous destination.  The old code rebound a non-contiguous `out`
    # to `out.contiguous()` — a detached copy — so the caller's tensor was
    # never filled.  Stage through a scratch buffer and copy back instead.
    dst = out if out.is_contiguous() else torch.empty_like(
        x_contig, memory_format=torch.contiguous_format
    )

    n_elements = x_contig.numel()
    if n_elements > 0:  # skip the launch entirely for empty tensors
        grid = lambda meta: (triton.cdiv(n_elements, meta["BLOCK_SIZE"]),)
        _copy_kernel[grid](x_contig, dst, n_elements, BLOCK_SIZE=1024)

    if dst is not out:
        # Propagate the staged result into the caller-provided tensor.
        out.copy_(dst.view(out.shape))
    # Hand back the destination shaped like the input, matching the
    # original return convention.
    return out if out.shape == x_contig.shape else out.reshape(x_contig.shape)