// Two-level scf.parallel nest: a 2-D outer loop whose bounds and first step
// are runtime values, wrapping a 1-D inner loop whose bounds and step are
// constants. The body adds one element from each of two buffers and stores
// the sum into a third.
module {
  // %arg0, %arg1 and %arg2 size the three buffers and also determine the
  // outer loop's lower bounds, upper bounds and first step.
  func.func @nested_parallel(%arg0: index, %arg1: index, %arg2: index) {
    // Buffer shapes: %0 is arg0 x arg1, %1 is arg1 x arg2, %2 is arg0 x arg2.
    // Nothing in this function writes %0 or %1 before they are loaded.
    %0 = memref.alloc(%arg0, %arg1) : memref<?x?xf32>
    %1 = memref.alloc(%arg1, %arg2) : memref<?x?xf32>
    %2 = memref.alloc(%arg0, %arg2) : memref<?x?xf32>

    // Outer upper bounds: %3 = arg0 + arg1, %4 = arg1 + arg2.
    %3 = arith.addi %arg0, %arg1 : index
    %4 = arith.addi %arg1, %arg2 : index
    %c0 = arith.constant 0 : index
    %c1 = arith.constant 1 : index
    %c2 = arith.constant 2 : index
    %c10 = arith.constant 10 : index

    // Outer parallel loop with dynamic bounds and stride:
    //   %i runs over [arg0, arg0 + arg1) with step arg1,
    //   %j runs over [arg1, arg1 + arg2) with step 2.
    // NOTE(review): the step %arg1 is zero when arg1 == 0 — confirm callers
    // never pass 0, or that this function is compile-only.
    // NOTE(review): %i starts at arg0 and %j starts at arg1, which equal the
    // sizes of the dimensions they index below, so the accesses look out of
    // bounds if this is ever executed. Presumably this is only an input for a
    // compiler pass — confirm before running it.
    scf.parallel (%i, %j) = (%arg0, %arg1) to (%3, %4) step (%arg1, %c2) {
      // Inner parallel loop with constant bounds and stride: %k = 0, 1, ..., 9.
      scf.parallel (%k) = (%c0) to (%c10) step (%c1) {
        %val0 = memref.load %0[%i, %j] : memref<?x?xf32>
        %val1 = memref.load %1[%j, %k] : memref<?x?xf32>
        %result = arith.addf %val0, %val1 : f32
        // NOTE(review): the destination does not depend on %j, so different
        // %j iterations of the outer loop write the same element of %2.
        memref.store %result, %2[%i, %k] : memref<?x?xf32>
      }
    }

    // Release the buffers in allocation order.
    memref.dealloc %0 : memref<?x?xf32>
    memref.dealloc %1 : memref<?x?xf32>
    memref.dealloc %2 : memref<?x?xf32>
    return
  }
}