高层次综合设计乒乓buffer(double-buffer/pingpong-buffer)

高层次综合设计乒乓buffer(double-buffer/pingpong-buffer)

一、数组优化为乒乓缓存或者FIFO
1. double-buffer/ping-pong buffer
2. FIFO
3. 流水线设计，吞吐量优化

二、乒乓缓存 ping-pong buffer
1. 通常用于将一块内存分成两个区域，或者使用两个独立内存，在数据处理的时候交替读写，使得读写可以同时进行
2. 在 vivado hls 中把数组显式配置为乒乓缓存 ping-pong buffer 或者 FIFO，消除访存瓶颈，实现任务流水线和数据驱动

三、将数组实现为乒乓缓存
1. 需要在一块存储区上同时进行 load 和 compute，计算需要 compute 和 store，并且这些操作之间有数据依赖
2. vivado hls 没有专门的 ping_pong 指令，通过数组分区、手动索引来切换和改造，并且配合 dataflow 实现重叠

四、乒乓buffer的实现方式
1. 可以使用 array_partition block 方式将一个大数组切成两个独立的物理内存
2. 通过 index 索引进行读写目标的切换
3. 使用 dataflow 指令让生产者和消费者函数重叠运行，各自访问不同的内存块

五、双缓冲处理的循环
1. 案例一
#define N 1024
void top(int in[N], int out[N]) {
#pragma HLS dataflow
    int buf[2][N]; // 两块缓冲
#pragma HLS ARRAY_PARTITION variable=buf complete dim=1
    // 注意：dim=1 将第一维(2)完全分割，得到 buf0[N] 和 buf1[N] 两个独立 RAM
    // 第一趟加载到 buf0，同时处理 buf1（第一次无效，需主循环中控制）
    // 这里用乒乓循环模型
    load(in, buf[0], 0); // 加载块0
    for (int i = 0; i < M; i++) {
        if (i % 2 == 0) {
            load(in, buf[0], i); // 生产者写入 buf0
            process(buf[1], out, i); // 消费者处理 buf1
        } else {
            load(in, buf[1], i);
            process(buf[0], out, i);
        }
    }
}

2. 案例二
void load(int* in, int* buf, int block_idx);
void process(int* buf, int* out, int block_idx);
void top(int in[N], int out[N]) {
#pragma HLS dataflow
    int buf0[N], buf1[N]; // 两个独立数组
#pragma HLS STREAM variable=buf0 depth=...
// 不一定是 stream，可保留为 BRAM
#pragma HLS STREAM variable=buf1 ...
    for (int i = 0; i < BLOCKS; i++) {
        if (i % 2 == 0) {
            load(in, buf0, i);
            process(buf1, out, i); // 上一个块已就绪
        } else {
            load(in, buf1, i);
            process(buf0, out, i);
        }
    }
}

上述的两个案例中，load 和 process 没办法重叠：因为有了 if-else，dataflow 指令没办法作用到 load 和 process 上让其重叠。

3. 如何解决问题
此写法不能让 load 和 process 真正重叠，因为它们在同一个循环内顺序调用。要实现重叠，必须启动两个并行执行的函数体，这正是 dataflow 擅长的地方。可以使用两个“永久循环”，或更实际的做法：手动将乒乓缓冲写成显式两段流水线，通过 hls::stream 传递块索引同步。

实现真正重叠的改写方案：采用“双进程 + 令牌同步”模式，在 Vivado HLS dataflow 区域中让 load 与 process 完全并发执行。

#include <hls_stream.h>
#define N 1024
#define NUM_BLOCKS 10
// Load function: reads from input, writes to buffer, sends token
void load_process(int* in, int buf0[N], int buf1[N],
                  hls::stream<int>& load_sync,
                  hls::stream<int>& process_sync) {
    for (int blk = 0; blk < NUM_BLOCKS; blk++) {
        int token = load_sync.read(); // wait for available buffer token (0 or 1)
        int* buf = (token == 0) ? buf0 : buf1;
        // Load data into buf
        for (int i = 0; i < N; i++) {
            buf[i] = in[blk * N + i];
        }
        // Notify process which buffer is ready
        process_sync.write(token);
    }
}
// Process function: waits for token, processes buffer, sends back token
void process_process(int* out, int buf0[N], int buf1[N],
                     hls::stream<int>& process_sync,
                     hls::stream<int>& load_sync) {
    for (int blk = 0; blk < NUM_BLOCKS; blk++) {
        int token = process_sync.read(); // wait for ready buffer
        int* buf = (token == 0) ? buf0 : buf1;
        // Process data
        for (int i = 0; i < N; i++) {
            out[blk * N + i] = buf[i] * 2; // example
        }
        // Notify load that this buffer is free
        load_sync.write(token);
    }
}
void top(int in[N * NUM_BLOCKS], int out[N * NUM_BLOCKS]) {
#pragma HLS dataflow
    int buf0[N], buf1[N];
#pragma HLS STREAM variable=buf0 ... // optional if needed as FIFO? but here we use BRAM. We may just partition to avoid false dependency.
#pragma HLS ARRAY_PARTITION variable=buf0 cyclic factor=...
// if needed
    // We don't necessarily need STREAM pragma if we keep them as RAM.
    // Synchronization streams
    hls::stream<int> load2proc; // token indicating which buffer has been loaded
    hls::stream<int> proc2load; // token indicating which buffer has been processed and free
    // Initially, both buffers are free: send two tokens
    proc2load.write(0);
    proc2load.write(1);
    // Launch concurrent processes
    load_process(in, buf0, buf1, proc2load, load2proc);
    process_process(out, buf0, buf1, load2proc, proc2load);
}

或者

#include <hls_stream.h>
#define N 1024
#define NUM_BLOCKS 10
// Load function: reads from input, writes to buffer, sends token
void load_process(int* in, int buf0[N], int buf1[N],
                  hls::stream<int>& load_sync,
                  hls::stream<int>& process_sync) {
    for (int blk = 0; blk < NUM_BLOCKS; blk++) {
        int token = load_sync.read(); // wait for available buffer token (0 or 1)
        int* buf = (token == 0) ? buf0 : buf1;
        // Load data into buf
        for (int i = 0; i < N; i++) {
            buf[i] = in[blk * N + i];
        }
        // Notify process which buffer is ready
        process_sync.write(token);
    }
}
// Process function: waits for token, processes buffer, sends back token
void process_process(int* out, int buf0[N], int buf1[N],
                     hls::stream<int>& process_sync,
                     hls::stream<int>& load_sync) {
    for (int blk = 0; blk < NUM_BLOCKS; blk++) {
        int token = process_sync.read(); // wait for ready buffer
        int* buf = (token == 0) ? buf0 : buf1;
        // Process data
        for (int i = 0; i < N; i++) {
            out[blk * N + i] = buf[i] * 2; // example
        }
        // Notify load that this buffer is free
        load_sync.write(token);
    }
}
void top(int in[N * NUM_BLOCKS], int out[N * NUM_BLOCKS]) {
#pragma HLS dataflow
    int buf0[N], buf1[N];
#pragma HLS STREAM variable=buf0 ... // optional if needed as FIFO? but here we use BRAM. We may just partition to avoid false dependency.
#pragma HLS ARRAY_PARTITION variable=buf0 cyclic factor=...
// if needed
    // We don't necessarily need STREAM pragma if we keep them as RAM.
    // Synchronization streams
    hls::stream<int> load2proc; // token indicating which buffer has been loaded
    hls::stream<int> proc2load; // token indicating which buffer has been processed and free
    // Initially, both buffers are free: send two tokens
    proc2load.write(0);
    proc2load.write(1);
    // Launch concurrent processes
    load_process(in, buf0, buf1, proc2load, load2proc);
    process_process(out, buf0, buf1, load2proc, proc2load);
}

六、使用 ap_memory + FSM 实现乒乓buffer
#include <hls_stream.h>
// 常量定义
#define BLOCK_SIZE 1024 // 每次处理的块大小
#define NUM_BLOCKS 4 // 总块数
// 状态机状态
enum State { LOAD_BUF0, LOAD_BUF1, PROC_BUF0, PROC_BUF1, IDLE };
void pingpong_fsm(
    int* in, // 输入：ap_memory 接口
    int* out, // 输出：ap_memory 接口
    int total_size // 总数据量，必须为 BLOCK_SIZE * NUM_BLOCKS
) {
    // 将顶层接口指定为 ap_memory（类似 BRAM 端口：addr, ce, d, q）
#pragma HLS INTERFACE ap_memory port=in
#pragma HLS INTERFACE ap_memory port=out
#pragma HLS INTERFACE s_axilite port=total_size
#pragma HLS INTERFACE s_axilite port=return
    // 局部乒乓缓冲，指定为单端口块 RAM（ap_memory 资源）
    int buf0[BLOCK_SIZE];
    int buf1[BLOCK_SIZE];
#pragma HLS RESOURCE variable=buf0 core=RAM_1P_BRAM
#pragma HLS RESOURCE variable=buf1 core=RAM_1P_BRAM
    // 消除由于重复使用同一数组造成的迭代间假依赖（手动 FSM 会顺序读写，但需告知工具）
#pragma HLS dependence variable=buf0 inter false
#pragma HLS dependence variable=buf1 inter false
    // 状态寄存器及辅助变量
    State state = LOAD_BUF0;
    int in_addr = 0; // 输入数据地址（全局索引）
    int out_addr = 0; // 输出数据地址（全局索引）
    int local_addr = 0; // 局部缓冲内的地址
    int block_cnt = 0; // 已处理的块数
    // 主循环：每个周期执行一个操作（读或写）
    while (block_cnt < NUM_BLOCKS) {
#pragma HLS pipeline II=1 // 关键目标：单周期吞吐，让状态机步进成为流水
        switch (state) {
        case LOAD_BUF0: {
            if (in_addr < total_size) {
                buf0[local_addr] = in[in_addr]; // 从 ap_memory 读入
                in_addr++;
                local_addr++;
            }
            // 一块加载完毕，切换到处理 buf0，同时准备加载 buf1
            if (local_addr == BLOCK_SIZE) {
                local_addr = 0;
                state = PROC_BUF0; // 如果此时 buf1 未满，不能立即加载，需再判断
                // 但若要同时加载 buf1，则需在这里启动加载进程。
                // 由于是单 FSM，无法在同一时刻做两件事，因此采用顺序交替：处理 buf0 完后再加载 buf1。
            }
            break;
        }
        case LOAD_BUF1: {
            if (in_addr < total_size) {
                buf1[local_addr] = in[in_addr];
                in_addr++;
                local_addr++;
            }
            if (local_addr ==
                BLOCK_SIZE) {
                local_addr = 0;
                state = PROC_BUF1;
            }
            break;
        }
        case PROC_BUF0: {
            if (out_addr < total_size) {
                int tmp = buf0[local_addr] * 2; // 简单处理：乘以2
                out[out_addr] = tmp;
                out_addr++;
                local_addr++;
            }
            if (local_addr == BLOCK_SIZE) {
                local_addr = 0;
                block_cnt++;
                // 一块处理结束，如果还有数据，则开始加载下一块到刚刚释放的 buf0
                if (in_addr < total_size) {
                    state = LOAD_BUF0; // 注意：此处交替，处理 buf0 后立即开始加载 buf0（乒乓）
                } else {
                    state = IDLE;
                }
            }
            break;
        }
        case PROC_BUF1: {
            if (out_addr < total_size) {
                int tmp = buf1[local_addr] * 2;
                out[out_addr] = tmp;
                out_addr++;
                local_addr++;
            }
            if (local_addr == BLOCK_SIZE) {
                local_addr = 0;
                block_cnt++;
                if (in_addr < total_size) {
                    state = LOAD_BUF1;
                } else {
                    state = IDLE;
                }
            }
            break;
        }
        default: // IDLE
            break;
        }
    }
}