from __future__ import annotations

import torch

import helion
import helion.language as hl
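
# This example compares three ways to sum along a very long reduction dim:
#   - longsum: load the entire row at once and reduce in registers (naive)
#   - longsum_w_red_loop: let Helion generate a reduction loop via reduction_loops
#   - longsum_manual: the same looped reduction written out by hand

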
def baseline_sum(x: torch.Tensor) -> torch.Tensor:
    return x.sum(-1)


# Naive Reduction: Load the entire reduction dim at once, and reduce in registers.
@helion.kernel(
    config=helion.Config(
        block_sizes=[[1]],
        reduction_loops=[None],
        num_warps=32,
        num_stages=4,
        indexing="block_ptr",
    )
)
def longsum(x: torch.Tensor) -> torch.Tensor:
    m, _ = x.size()
    out = torch.empty([m], dtype=x.dtype, device=x.device)

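    # hl.tile(m) becomes the kernel grid; x[tile_m, :] loads the full
    # reduction dim for each row tile, so the sum happens in registers.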
    for tile_m in hl.tile(m):
        out[tile_m] = x[tile_m, :].sum(-1)
    return out


# Looped Reduction: Helion generates the loop over the reduction dim from the
# reduction_loops config below.
@helion.kernel(
    config=helion.Config(
        block_sizes=[[1]],
        reduction_loops=[
            32768
        ],  # [None] for naive reduction, [tile_size] for looped reduction
        num_warps=16,
        num_stages=5,
        indexing="pointer",
    )
)
def longsum_w_red_loop(x: torch.Tensor) -> torch.Tensor:
    m, _ = x.size()
    out = torch.empty([m], dtype=x.dtype, device=x.device)

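    # Identical loop body to longsum: the reduction_loops=[32768] config is what
    # makes Helion emit a loop over 32768-element chunks of the reduction dim.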
    for tile_m in hl.tile(m):
        out[tile_m] = x[tile_m, :].sum(-1)
    return out


# This generates the same code as above, but manually implements looped reduction.
@helion.kernel(
    config=helion.Config(
        block_sizes=[[32768], [1]], num_warps=16, num_stages=5, indexing="pointer"
    )
)
def longsum_manual(x: torch.Tensor) -> torch.Tensor:
    m, n = x.size()
    out = torch.empty([m], dtype=x.dtype, device=x.device)

    # Call register_block_size so block_size_n is known outside the reduction loop.
    block_size_n = hl.register_block_size(n)

    for tile_m in hl.tile(m):
        acc = hl.zeros([tile_m, block_size_n], dtype=x.dtype)
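        # Accumulate partial sums one block_size_n-wide chunk at a time; the
        # final reduction over the accumulator happens after the loop.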
        for tile_n in hl.tile(n, block_size=block_size_n):  # Reduction loop
            acc += x[tile_m, tile_n]
        out[tile_m] = acc.sum(-1)
    return out


def check(m: int, n: int) -> None:
    x = torch.randn([m, n], device="cuda", dtype=torch.float32)

    helion_out = longsum(x)
    torch.testing.assert_close(helion_out, baseline_sum(x), rtol=1e-2, atol=1e-1)
    print("✅ Results Match ✅ naive reduction")

    helion_red_loop_out = longsum_w_red_loop(x)
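    # Tolerances are loose: reducing ~130k fp32 values accumulates rounding
    # error, and different reduction orders give slightly different sums.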
    torch.testing.assert_close(
        helion_red_loop_out, baseline_sum(x), rtol=1e-2, atol=1e-1
    )
    print("✅ Results Match ✅ Reduction Loop")

    helion_manual_out = longsum_manual(x)
    torch.testing.assert_close(
        helion_manual_out, baseline_sum(x), rtol=1e-2, atol=1e-1
    )
    print("✅ Results Match ✅ Manual Reduction Loop")


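def bench(m: int, n: int) -> None:
    # A minimal timing sketch, assuming triton's do_bench is available; the
    # original example may benchmark these kernels differently.
    from triton.testing import do_bench

    x = torch.randn([m, n], device="cuda", dtype=torch.float32)
    for name, fn in [
        ("naive reduction", longsum),
        ("reduction loop", longsum_w_red_loop),
        ("manual reduction loop", longsum_manual),
        ("torch baseline", baseline_sum),
    ]:
        ms = do_bench(lambda: fn(x))  # median runtime in milliseconds
        print(f"{name}: {ms:.3f} ms")

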
def main() -> None:
    check(4, 130000)  # seq_len ≈ 128k


if __name__ == "__main__":
    main()