-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathProgram.clef
More file actions
468 lines (423 loc) · 19.9 KB
/
Program.clef
File metadata and controls
468 lines (423 loc) · 19.9 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
/// HelloNappy — NPU Element-wise Multiply on XDNA2
///
/// Demonstrates Clef dispatching computation to the NPU via XRT's C++ API.
/// Generates two integer signal patterns on the host (triangular wave and
/// square wave), stages them to NPU buffers, runs an element-wise multiply
/// kernel across 4 AIE tiles, reads back the result, and displays all three
/// signals as numeric output on the console.
///
/// NPU dispatch path (C++ hw_context):
/// xrt::device(0) → xrt::xclbin(path) → device.register_xclbin(xclbin)
/// → extract uuid (C API) → xrt::hw_context(device, uuid, shared)
/// → xrt::kernel(context, name) → xrt::bo allocation → xrt::run dispatch
///
/// The xclbin (HelloNappyKernel.xclbin) and instruction binary
/// (HelloNappyKernel_insts.bin) are produced by the kernel project
/// (HelloNappyKernel.fidproj, target=npu) using the Composer AIE backend.
/// Both files must be in the working directory at runtime.
///
/// MLIR_AIE DPU kernel dispatch protocol:
/// arg 0: opcode (uint64) — 0 for standard dispatch
/// arg 1: instr (BO) — instruction buffer object
/// arg 2: ninstr (uint32) — number of instruction words
/// arg 3: bo0 (BO) — input buffer A
/// arg 4: bo1 (BO) — input buffer B
/// arg 5: bo2 (BO) — output buffer C
module HelloNappy.Program
open Console
open Format
open HelloNappy.XrtCpp
[<EntryPoint>]
// Entry point. Builds two 64-sample integer signals on the host, multiplies
// them element-wise on the NPU through the XRT C++ bindings, and prints the
// two inputs and the product.
//
// The argument array is ignored. The return value is `err`: 0 when every
// pipeline step succeeded, 1 once any step has failed. After a failure the
// remaining `if err = 0` steps are skipped, but the cleanup section still runs
// for every object whose `*Alive` flag was set.
let main _ =
    Console.writeln ""
    Console.writeln " ============================================================"
    Console.writeln " HelloNappy - NPU Element-wise Multiply"
    Console.writeln " Clef + Fidelity on AMD XDNA2 (Strix Halo)"
    Console.writeln " ============================================================"
    Console.writeln ""
    Console.writeln " Samples: 64 (4 tiles x 16 elements)"
    Console.writeln " Kernel: MLIR_AIE (element-wise multiply)"
    Console.writeln " API: XRT C++ (hw_context path)"
    Console.writeln ""

    // ─── Signal generation (host-side) ───
    // Three 64-element int buffers on the stack: inputs A and B, and the result.
    let sigA = NativePtr.stackalloc<int> 64
    let sigB = NativePtr.stackalloc<int> 64
    let sigOut = NativePtr.stackalloc<int> 64

    // Generate triangular wave: period=32, peak=100
    Console.writeln " [host] Generating input signals..."
    let mutable gi = 0
    while gi < 64 do
        let phase = gi % 32
        if phase < 16 then
            // Rising half: -100 at phase 0, climbing in integer steps
            NativePtr.set sigA gi ((2 * 100 * phase / 16) - 100)
        else
            // Falling half: +100 at phase 16, descending in integer steps
            NativePtr.set sigA gi (100 - (2 * 100 * (phase - 16) / 16))
        gi <- gi + 1

    // Generate square wave: period=16, +1/-1
    let mutable si = 0
    while si < 64 do
        let phase = si % 16
        if phase < 8 then
            NativePtr.set sigB si 1
        else
            NativePtr.set sigB si -1
        si <- si + 1

    // Display signal A
    Console.writeln ""
    Console.writeln " Signal A: Triangular (period=32, peak=100)"
    Console.writeln " ----------------------------------------"
    let mutable pi = 0
    while pi < 64 do
        Console.write " ["
        Console.write (Format.int pi)
        Console.write "] = "
        Console.writeln (Format.int (NativePtr.get sigA pi))
        pi <- pi + 1

    // Display signal B
    Console.writeln ""
    Console.writeln " Signal B: Square (period=16, +1/-1)"
    Console.writeln " ----------------------------------------"
    let mutable qi = 0
    while qi < 64 do
        Console.write " ["
        Console.write (Format.int qi)
        Console.write "] = "
        Console.writeln (Format.int (NativePtr.get sigB qi))
        qi <- qi + 1

    // ═══════════════════════════════════════════════════════════
    // NPU dispatch pipeline (C++ hw_context path)
    // ═══════════════════════════════════════════════════════════
    Console.writeln ""
    Console.writeln " [npu] Dispatching A * B to NPU..."
    Console.writeln ""

    // Sticky failure flag: 0 = OK so far, 1 = a step failed. Also the exit code.
    let mutable err = 0

    // ─── Pimpl storage: 16 bytes each (shared_ptr<impl>) ───
    let deviceStorage = NativePtr.stackalloc<byte> 16
    let xclbinStorage = NativePtr.stackalloc<byte> 16
    let hwCtxStorage = NativePtr.stackalloc<byte> 16
    let kernelStorage = NativePtr.stackalloc<byte> 16
    let runStorage = NativePtr.stackalloc<byte> 16
    let boInstrStorage = NativePtr.stackalloc<byte> 16
    let boAStorage = NativePtr.stackalloc<byte> 16
    let boBStorage = NativePtr.stackalloc<byte> 16
    let boOutStorage = NativePtr.stackalloc<byte> 16

    // Zero-initialize all pimpl storage (zeroed shared_ptr = null)
    let mutable zi = 0
    while zi < 16 do
        NativePtr.set deviceStorage zi 0uy
        NativePtr.set xclbinStorage zi 0uy
        NativePtr.set hwCtxStorage zi 0uy
        NativePtr.set kernelStorage zi 0uy
        NativePtr.set runStorage zi 0uy
        NativePtr.set boInstrStorage zi 0uy
        NativePtr.set boAStorage zi 0uy
        NativePtr.set boBStorage zi 0uy
        NativePtr.set boOutStorage zi 0uy
        zi <- zi + 1

    // Track which objects need destruction
    // (xclbinAlive is recorded but never consumed: the xclbin destructor is
    // not called in the cleanup section below.)
    let mutable deviceAlive = false
    let mutable xclbinAlive = false
    let mutable hwCtxAlive = false
    let mutable kernelAlive = false
    let mutable runAlive = false
    let mutable boInstrAlive = false
    let mutable boAAlive = false
    let mutable boBAlive = false
    let mutable boOutAlive = false

    // Addresses for passing to FidelityExtern (nativeint = pointer)
    let deviceAddr = NativePtr.toNativeInt deviceStorage
    let xclbinAddr = NativePtr.toNativeInt xclbinStorage
    let hwCtxAddr = NativePtr.toNativeInt hwCtxStorage
    let kernelAddr = NativePtr.toNativeInt kernelStorage
    let runAddr = NativePtr.toNativeInt runStorage
    let boInstrAddr = NativePtr.toNativeInt boInstrStorage
    let boAAddr = NativePtr.toNativeInt boAStorage
    let boBAddr = NativePtr.toNativeInt boBStorage
    let boOutAddr = NativePtr.toNativeInt boOutStorage

    // ─── Step 1: Construct xrt::device(0) ───
    Console.writeln " [npu] Constructing xrt::device(0)..."
    deviceConstruct deviceAddr 0u
    deviceAlive <- true
    Console.writeln " [npu] Device constructed."

    // ─── Step 2: Read xclbin file into memory, construct xrt::xclbin ───
    // Uses the const axlf* constructor, not the string_view variant. File I/O
    // stays in Clef's control via libc open/read/close. The axlf* constructor
    // parses an in-memory buffer and does not throw on file-not-found.
    // (The string_view constructor throws std::bad_alloc on XRT 2.21.0.)
    // The mapping address and size are kept so cleanup can munmap them.
    let mutable xclbinBufPtr = 0n
    let mutable xclbinFileSize = 0L
    if err = 0 then
        Console.writeln " [npu] Loading xclbin: HelloNappyKernel.xclbin"
        // dirfd -100 = AT_FDCWD, flags 0 = O_RDONLY (Linux values)
        let xclbinFd = Fidelity.Libc.IO.openat -100 (Some "HelloNappyKernel.xclbin".Pointer) 0
        if xclbinFd < 0 then
            Console.writeln " FAILED: HelloNappyKernel.xclbin not found in working directory"
            err <- 1
        else
            // whence 2 = SEEK_END yields the file size; whence 0 = SEEK_SET rewinds
            xclbinFileSize <- Fidelity.Libc.IO.lseek xclbinFd 0L 2
            let _ = Fidelity.Libc.IO.lseek xclbinFd 0L 0
            if xclbinFileSize <= 0L then
                Console.writeln " FAILED: xclbin file is empty or lseek failed"
                let _ = Fidelity.Libc.IO.close xclbinFd
                err <- 1
            else
                Console.write " [npu] Xclbin file: "
                Console.write (Format.int (int xclbinFileSize))
                Console.writeln " bytes"
                // mmap the file: PROT_READ=1, MAP_PRIVATE=2
                match Fidelity.Libc.Memory.mmap None (unativeint xclbinFileSize) 1 2 xclbinFd 0L with
                | None ->
                    Console.writeln " FAILED: mmap returned NULL"
                    let _ = Fidelity.Libc.IO.close xclbinFd
                    err <- 1
                | Some mapResult ->
                    // The descriptor is closed right after mmap on both outcomes
                    let _ = Fidelity.Libc.IO.close xclbinFd
                    if mapResult = -1n then // MAP_FAILED = (void*)-1
                        Console.writeln " FAILED: mmap of xclbin file (MAP_FAILED)"
                        err <- 1
                    else
                        xclbinBufPtr <- mapResult
                        Console.writeln " [npu] Constructing xrt::xclbin from buffer..."
                        xclbinConstructFromBuffer xclbinAddr xclbinBufPtr
                        xclbinAlive <- true
                        Console.writeln " [npu] Xclbin parsed."

    // ─── Step 3: Register xclbin with device ───
    // register_xclbin returns xrt::uuid (16 bytes) via hidden sret pointer.
    // Caller provides return storage as the first argument.
    let regUuidStorage = NativePtr.stackalloc<byte> 16
    let mutable rui = 0
    while rui < 16 do
        NativePtr.set regUuidStorage rui 0uy
        rui <- rui + 1
    let regUuidAddr = NativePtr.toNativeInt regUuidStorage
    if err = 0 then
        Console.writeln " [npu] Registering xclbin with device..."
        deviceRegisterXclbin regUuidAddr deviceAddr xclbinAddr
        Console.writeln " [npu] Xclbin registered."

    // ─── Step 4: Extract UUID via C API ───
    // C API path avoids struct-return ABI; writes 16 bytes into caller buffer.
    let uuidBuf = NativePtr.stackalloc<byte> 16
    let mutable ui = 0
    while ui < 16 do
        NativePtr.set uuidBuf ui 0uy
        ui <- ui + 1
    let uuidAddr = NativePtr.toNativeInt uuidBuf
    if err = 0 then
        Console.writeln " [npu] Extracting UUID (C API)..."
        match xclbinAllocFilename "HelloNappyKernel.xclbin".Pointer with
        | None ->
            Console.writeln " FAILED: C API xclbin alloc for UUID extraction"
            err <- 1
        | Some xhdl ->
            let uuidResult = xclbinGetUUID xhdl uuidAddr
            // The C handle is only needed for the UUID; free it before checking the result
            let _ = xclbinFreeHandle xhdl
            if uuidResult <> 0 then
                Console.writeln $" FAILED: UUID extraction returned {Format.int uuidResult}"
                err <- 1
            else
                Console.writeln " [npu] UUID extracted."

    // ─── Step 5: Construct xrt::hw_context(device, uuid, shared) ───
    if err = 0 then
        Console.writeln " [npu] Constructing xrt::hw_context (shared mode)..."
        hwContextConstruct hwCtxAddr deviceAddr uuidAddr 1 // 1 = shared
        hwCtxAlive <- true
        Console.writeln " [npu] Hardware context constructed."

    // ─── Step 6: Skip xclbin destruct ───
    // Phase 1 limitation: xrt::xclbin::~xclbin() is not exported from libxrt_coreutil.so
    // (compiler-synthesized inline shared_ptr destructor). The xclbin object leaks;
    // acceptable for this short-lived process. Phase 2 (Farscape OnClass) will generate
    // the shared_ptr destructor sequence directly.

    // ─── Step 7: Construct xrt::kernel(hw_context, "MLIR_AIE") ───
    // Build GCC __cxx11 std::string with SSO for "MLIR_AIE" (8 chars, fits SSO ≤ 15).
    //
    // SSO layout (32 bytes):
    // [0..7] _M_dataplus._M_p = pointer to _M_local_buf (self + 16)
    // [8..15] _M_string_length = 8
    // [16..31] _M_local_buf = "MLIR_AIE\0" + padding
    if err = 0 then
        Console.writeln " [npu] Constructing xrt::kernel(MLIR_AIE)..."
        let stdStr = NativePtr.stackalloc<byte> 32
        let stdStrAddr = NativePtr.toNativeInt stdStr
        // _M_p points to internal buffer at offset 16
        let ptrSlot = NativePtr.ofNativeInt<nativeint> stdStrAddr
        NativePtr.set ptrSlot 0 (stdStrAddr + 16n)
        // _M_string_length = 8
        let lenSlot = NativePtr.ofNativeInt<nativeint> (stdStrAddr + 8n)
        NativePtr.set lenSlot 0 8n
        // Copy "MLIR_AIE\0" into local buffer at offset 16
        let kernName = "MLIR_AIE"
        let mutable ki = 0
        while ki < 8 do
            NativePtr.set stdStr (16 + ki) (NativePtr.get kernName.Pointer ki)
            ki <- ki + 1
        NativePtr.set stdStr 24 0uy // null terminator
        kernelConstruct kernelAddr hwCtxAddr stdStrAddr
        kernelAlive <- true
        Console.writeln " [npu] Kernel constructed."

    // ─── Step 8: Load NPU instructions from file ───
    // ninstr stays 0 unless the whole instruction file was read successfully.
    let mutable ninstr = 0u
    if err = 0 then
        Console.writeln " [npu] Loading instructions: HelloNappyKernel_insts.bin"
        // dirfd -100 = AT_FDCWD, flags 0 = O_RDONLY (Linux values)
        let instrFd = Fidelity.Libc.IO.openat -100 (Some "HelloNappyKernel_insts.bin".Pointer) 0
        if instrFd < 0 then
            Console.writeln " FAILED: could not open instruction file"
            err <- 1
        else
            // Size via SEEK_END (2), then rewind with SEEK_SET (0)
            let instrFileSize = Fidelity.Libc.IO.lseek instrFd 0L 2
            let _ = Fidelity.Libc.IO.lseek instrFd 0L 0
            if instrFileSize <= 0L then
                Console.writeln " FAILED: instruction file is empty or lseek failed"
                let _ = Fidelity.Libc.IO.close instrFd
                err <- 1
            else
                let instrBytes = unativeint instrFileSize
                Console.write " [npu] Instruction file: "
                Console.write (Format.int (int instrFileSize))
                Console.writeln " bytes"
                // Allocate instruction BO: XCL_BO_FLAGS_CACHEABLE = 0x1000000
                // Memory group is looked up for kernel argument 1 (instr)
                let grp1 = kernelGroupId kernelAddr 1
                boConstructDevice boInstrAddr deviceAddr instrBytes 0x1000000u (uint grp1)
                boInstrAlive <- true
                // Map, read instructions from file, sync to device
                let instrMap = boMap boInstrAddr
                if instrMap = 0n then
                    Console.writeln " FAILED: instruction buffer map"
                    let _ = Fidelity.Libc.IO.close instrFd
                    err <- 1
                else
                    // Read the file straight into the mapped BO
                    let bytesRead = Fidelity.Libc.IO.read instrFd (Some instrMap) instrBytes
                    let _ = Fidelity.Libc.IO.close instrFd
                    if int64 bytesRead <> instrFileSize then
                        Console.writeln " FAILED: incomplete instruction read"
                        err <- 1
                    else
                        boSync boInstrAddr 0 instrBytes 0un
                        // Instruction count is in 4-byte words
                        ninstr <- uint (int instrFileSize / 4)
                        Console.write " [npu] Loaded "
                        Console.write (Format.int (int ninstr))
                        Console.writeln " instruction words"

    // ─── Step 9: Allocate data buffers ───
    // 64 int32 samples * 4 bytes = 256 bytes
    // XRT_BO_FLAGS_HOST_ONLY = 0x2000000
    let dataSz = unativeint 256
    if err = 0 then
        // Memory groups for kernel arguments 3, 4, 5 (input A, input B, output C)
        let grp3 = kernelGroupId kernelAddr 3
        let grp4 = kernelGroupId kernelAddr 4
        let grp5 = kernelGroupId kernelAddr 5
        boConstructDevice boAAddr deviceAddr dataSz 0x2000000u (uint grp3)
        boAAlive <- true
        boConstructDevice boBAddr deviceAddr dataSz 0x2000000u (uint grp4)
        boBAlive <- true
        boConstructDevice boOutAddr deviceAddr dataSz 0x2000000u (uint grp5)
        boOutAlive <- true
        Console.writeln " [npu] Data buffers allocated."

    // ─── Step 10: Map, stage input data, sync to device ───
    if err = 0 then
        Console.writeln " [npu] Staging input data..."
        let mapA = boMap boAAddr
        let mapB = boMap boBAddr
        if mapA = 0n || mapB = 0n then
            Console.writeln " FAILED: map data buffers"
            err <- 1
        else
            let ptrA = NativePtr.ofNativeInt<int> mapA
            let ptrB = NativePtr.ofNativeInt<int> mapB
            let mutable ci = 0
            while ci < 64 do
                NativePtr.set ptrA ci (NativePtr.get sigA ci)
                NativePtr.set ptrB ci (NativePtr.get sigB ci)
                ci <- ci + 1
            // Direction 0 = host-to-device
            boSync boAAddr 0 dataSz 0un
            boSync boBAddr 0 dataSz 0un
            Console.writeln " [npu] Input data staged."

    // ─── Step 11: Construct run, set DPU kernel args, dispatch ───
    if err = 0 then
        Console.writeln " [npu] Setting DPU kernel arguments..."
        runConstruct runAddr kernelAddr
        runAlive <- true
        // DPU kernel args per MLIR_AIE xclbin metadata:
        // arg 0: opcode = 0 (uint64)
        // arg 1: instr = instruction BO
        // arg 2: ninstr = instruction count (uint32 via uint64)
        // arg 3: bo0 = input A
        // arg 4: bo1 = input B
        // arg 5: bo2 = output C
        // arg 0: opcode = 0 (scalar, 8 bytes)
        let opcodeVal = NativePtr.stackalloc<uint64> 1
        NativePtr.set opcodeVal 0 0UL
        runSetArgScalar runAddr 0 (NativePtr.toNativeInt opcodeVal) (unativeint 8)
        // arg 1: instruction BO
        runSetArgBO runAddr 1 boInstrAddr
        // arg 2: ninstr (scalar, 4 bytes as uint32)
        let ninstrVal = NativePtr.stackalloc<uint> 1
        NativePtr.set ninstrVal 0 ninstr
        runSetArgScalar runAddr 2 (NativePtr.toNativeInt ninstrVal) (unativeint 4)
        // arg 3,4,5: data buffer BOs
        runSetArgBO runAddr 3 boAAddr
        runSetArgBO runAddr 4 boBAddr
        runSetArgBO runAddr 5 boOutAddr
        Console.writeln " [npu] Dispatching kernel..."
        runStart runAddr
        // Wait with 30 second timeout
        let timeoutVal = NativePtr.stackalloc<int64> 1
        NativePtr.set timeoutVal 0 30000L
        let state = runWait runAddr (NativePtr.toNativeInt timeoutVal)
        // Only ERT_CMD_STATE_COMPLETED (4) means the output buffer is valid.
        // Any other state (timeout, error, abort, ...) is a failed dispatch, so
        // flag it and let the read-back and display steps be skipped.
        // NOTE(review): assumes runWait surfaces xrt::run::wait's ert_cmd_state
        // unchanged — confirm against the XrtCpp binding.
        if state = 4 then
            Console.writeln $" [npu] Kernel completed (state={Format.int state})."
        else
            Console.writeln $" FAILED: kernel did not complete (state={Format.int state})"
            err <- 1

    // ─── Step 12: Read back results ───
    if err = 0 then
        boSync boOutAddr 1 dataSz 0un // device-to-host
        Console.writeln " [npu] Reading back result..."
        let mapOut = boMap boOutAddr
        if mapOut = 0n then
            Console.writeln " FAILED: map output buffer"
            err <- 1
        else
            // Copy out of the mapped BO before the BO is destroyed in cleanup
            let ptrOut = NativePtr.ofNativeInt<int> mapOut
            let mutable ri = 0
            while ri < 64 do
                NativePtr.set sigOut ri (NativePtr.get ptrOut ri)
                ri <- ri + 1

    // ═══════════════════════════════════════════════════════════
    // Cleanup: destruct in reverse construction order
    // Phase 1: explicit destructor calls. Future: PSG escape
    // analysis will place these automatically via PimplLifecycle
    // coeffect and pimpl-lifecycle-lowering MLIR plugin.
    // ═══════════════════════════════════════════════════════════
    if runAlive then
        runDestruct runAddr
    if boOutAlive then
        boDestruct boOutAddr
    if boBAlive then
        boDestruct boBAddr
    if boAAlive then
        boDestruct boAAddr
    if boInstrAlive then
        boDestruct boInstrAddr
    if kernelAlive then
        kernelDestruct kernelAddr
    if hwCtxAlive then
        hwContextDestruct hwCtxAddr
    // xclbin destructor: not exported (compiler-synthesized inline shared_ptr).
    // Phase 1: leaks on process exit. Phase 2: Farscape generates shared_ptr teardown.
    if deviceAlive then
        deviceDestruct deviceAddr
    // Release mmap'd xclbin buffer
    if xclbinBufPtr <> 0n then
        let _ = Fidelity.Libc.Memory.munmap (Some xclbinBufPtr) (unativeint xclbinFileSize)
        ()
    if err = 0 then
        Console.writeln " [npu] Cleanup done."

    // ─── Display output ───
    if err = 0 then
        Console.writeln ""
        Console.writeln " Output: A * B (computed on NPU, 4 tiles)"
        Console.writeln " ----------------------------------------"
        let mutable oi = 0
        while oi < 64 do
            Console.write " ["
            Console.write (Format.int oi)
            Console.write "] = "
            Console.writeln (Format.int (NativePtr.get sigOut oi))
            oi <- oi + 1
        // NOTE(review): the reviewed copy had lost its indentation; the closing
        // blank line and " Done." are assumed to belong to the success branch —
        // confirm against the original file.
        Console.writeln ""
        Console.writeln " Done."

    // Process exit code: 0 on success, 1 on any failure
    err