IPCP ChampSim

成员变量

class IPCP_L1 : public Prefetcher
{
public:
   CACHE *m_parent_cache;
   IP_TABLE_L1 trackers_l1[NUM_CPUS][NUM_IP_TABLE_L1_ENTRIES];
   DELTA_PRED_TABLE DPT_l1[NUM_CPUS][4096];
   uint64_t ghb_l1[NUM_CPUS][NUM_GHB_ENTRIES];
   uint64_t prev_cpu_cycle[NUM_CPUS];
   uint64_t num_misses[NUM_CPUS];
   float mpkc[NUM_CPUS] = {0};
   int spec_nl[NUM_CPUS] = {0};
   ...

1. IP table的结构

Pasted image 20230411133525.png

// One entry of the L1 IP table: per-IP state shared by the CS, CPLX and
// stream (GS) classes.
class IP_TABLE_L1
{
public:
    uint64_t ip_tag;          // IP tag stored for this entry
    uint64_t last_page;       // last page seen by IP
    uint64_t last_cl_offset;  // last cache-line offset in the 4KB page
    int64_t last_stride;      // last delta observed
    uint16_t ip_valid;        // valid IP or not
    int conf;                 // CS confidence
    uint16_t signature;       // CPLX signature
    uint16_t str_dir;         // stream direction
    uint16_t str_valid;       // stream valid
    uint16_t str_strength;    // stream strength
};  // terminating ';' is required after a class definition

2. DELTA_PRED_TABLE

table里面的每个entry存一个delta和一个conf

// One prediction-table entry: a delta paired with its confidence counter.
struct DELTA_PRED_TABLE {
    int delta;  // delta held by this entry
    int conf;   // confidence associated with `delta`
};

3. Region Stream Table

Pasted image 20230411141820.png

// One Region Stream Table (RST) entry, tracking accesses to a single region.
struct REGION_STREAM_TABLE {
	uint64_t region_id;        // identifier of the tracked region
	uint64_t tentative_dense;  // tentative dense bit
	uint64_t trained_dense;    // trained dense bit
	uint64_t pos_neg_count;    // positive/negative stream counter
	uint64_t dir;              // stream direction: 1 for +ve, 0 for -ve
	uint64_t lru;              // LRU rank used for replacement
	// Per-line flags recording which lines of the 2KB region have been accessed.
	uint8_t line_access[NUM_OF_LINES_IN_REGION];
};

2. Process

a. GS

i. demand access来了之后,检查rstable,是否落在trained page,设置flag

//Checking if IP is already classified as a part of the GS class, so that for the new region we will set the tentative (spec_dense) bit.
for(int i = 0; i < NUM_RST_ENTRIES; i++) {
	if(rstable[cpu][i].region_id == ((trackers_l1[cpu][index].last_vpage << 1) 
		| (trackers_l1[cpu][index].last_line_offset >> 5))) {
		if(rstable[cpu][i].trained_dense == 1)
			flag = 1;
		break;
	}
}

ii. train rstable

找到对应的region

// Train the Region Stream Table (RST) entry that covers the current access.
// The region id is (curr_page << 1) | (line_offset >> 5). If no entry matches,
// the loop ends with c == NUM_RST_ENTRIES.
for(c = 0; c < NUM_RST_ENTRIES; c++) {
	if(((curr_page << 1) | (line_offset >> 5)) == rstable[cpu][c].region_id) {
		// Record that this line of the region has been touched.
		if(rstable[cpu][c].line_access[line_offset & REGION_OFFSET_MASK] == 0) {
			rstable[cpu][c].line_access[line_offset & REGION_OFFSET_MASK] = 1;
		}

		// Re-centre the direction counter once it hits either end of its range.
		if(rstable[cpu][c].pos_neg_count >= MAX_POS_NEG_COUNT 
			|| rstable[cpu][c].pos_neg_count <= 0) {
			rstable[cpu][c].pos_neg_count = MAX_POS_NEG_COUNT/2;
		}

		// A positive stride votes for a +ve stream, anything else for a -ve stream.
		if(stride > 0)
			rstable[cpu][c].pos_neg_count++;
		else
			rstable[cpu][c].pos_neg_count--;

		if(rstable[cpu][c].trained_dense == 0) {
			// Count the lines of the region accessed so far. The index must be
			// the loop variable: testing the current line's slot on every
			// iteration (as before) always yields the full line count, because
			// that slot was set just above, so the region was marked dense on
			// its first access.
			int count = 0;
			for(int i = 0; i < NUM_OF_LINES_IN_REGION; i++)
				if(rstable[cpu][c].line_access[i] == 1)
					count++;

			if(count > 24)	//75% of the cache lines in the region are accessed.       
				rstable[cpu][c].trained_dense = 1;   
		}
		// `flag` was raised earlier when the IP's previous region was trained dense.
		if (flag == 1)
			rstable[cpu][c].tentative_dense = 1;

		if(rstable[cpu][c].tentative_dense == 1 || rstable[cpu][c].trained_dense == 1) {
			// Dense (or tentatively dense) region: derive the stream direction
			// from the counter and mark the IP as a stream IP.
			if(rstable[cpu][c].pos_neg_count > (MAX_POS_NEG_COUNT/2))
				rstable[cpu][c].dir = 1;	//1 for positive direction
			else
				rstable[cpu][c].dir = 0;	//0 for negative direction
			trackers_l1[cpu][index].str_valid = 1;
			
			trackers_l1[cpu][index].str_dir = rstable[cpu][c].dir;
		}
		else
			trackers_l1[cpu][index].str_valid = 0; 
		break;
	}
}

没有找到region

分配一个新的region, 根据lru进行替换
初始化rsentry
根据flag设置tentative_dense bit

// The current region has no RST entry: recycle the least recently used one.
if (c == NUM_RST_ENTRIES) {
	// The victim is the entry whose LRU rank equals NUM_RST_ENTRIES - 1.
	c = 0;
	while (c < NUM_RST_ENTRIES && rstable[cpu][c].lru != (NUM_RST_ENTRIES - 1))
		c++;

	// Every entry ranked below the victim moves up by one.
	for (int entry = 0; entry < NUM_RST_ENTRIES; entry++) {
		if (rstable[cpu][entry].lru < rstable[cpu][c].lru)
			rstable[cpu][entry].lru++;
	}

	// Re-initialise the victim for the new region; it becomes rank 0.
	rstable[cpu][c].tentative_dense = (flag == 1) ? 1 : 0;
	rstable[cpu][c].region_id = (curr_page << 1) | (line_offset >> 5);
	rstable[cpu][c].trained_dense = 0;
	rstable[cpu][c].pos_neg_count = MAX_POS_NEG_COUNT / 2;
	rstable[cpu][c].dir = 0;
	rstable[cpu][c].lru = 0;
	for (int line = 0; line < NUM_OF_LINES_IN_REGION; line++)
		rstable[cpu][c].line_access[line] = 0;
}

iii. prefetch

如果stream valid, 根据prefetch degree和stream direction发送prefetch请求
发送前用RR filter进行过滤
如果stream非valid则设置flag为1,开启后面的CS

// GS (stream) prefetch stage: runs when the IP's table entry is marked as a
// valid stream. Issues up to prefetch_degree[cpu][1] sequential line prefetches
// in the stream direction; otherwise sets `flag` so the following stage can run.
if (trackers_l1[cpu][index].str_valid == 1) { // stream IP
	// for stream, prefetch with twice the usual degree
	// With a GS degree below 3, flag is raised as well (the CS stage tests flag == 1).
	if (prefetch_degree[cpu][1] < 3)
		flag = 1;
	meta_counter[cpu][0]++;
	total_count[cpu]++;
	for (int i = 0; i < prefetch_degree[cpu][1]; i++) {
		uint64_t pf_address = 0;

		if (trackers_l1[cpu][index].str_dir == 1) { // +ve stream
			pf_address = (line_addr + i + 1) << LOG2_BLOCK_SIZE;
			metadata = encode_metadata(1, S_TYPE, spec_nl[cpu]); // stride is 1
		} else { // -ve stream
			pf_address = (line_addr - i - 1) << LOG2_BLOCK_SIZE;
			metadata = encode_metadata(-1, S_TYPE, spec_nl[cpu]); // stride is -1
		}

		// When acc[cpu][1] is below 75, the metadata carries a stride of 0 instead.
		if (acc[cpu][1] < 75)
			metadata = encode_metadata(0, S_TYPE, spec_nl[cpu]);
		// Check if prefetch address is in same 4 KB page
		// Leaving the page ends the whole loop, not just this iteration.
		if ((pf_address >> LOG2_PAGE_SIZE) != (addr >> LOG2_PAGE_SIZE))
			break;

		trackers_l1[cpu][index].pref_type = S_TYPE;

		// Scan the recent-request (RR) filter for this line's tag.
		// NOTE(review): this inner `i` shadows the outer loop index; the shift
		// uses a literal 6 rather than LOG2_BLOCK_SIZE — confirm they agree.
		int found_in_filter = 0;
		for (int i = 0; i < recent_request_filter.size(); i++) {
			if (recent_request_filter[i] == ((pf_address >> 6) & RR_TAG_MASK)) {
				// Prefetch address is present in RR filter
				found_in_filter = 1;
			}
		}
		// Issue prefetch request only if prefetch address is not present in RR filter
		if (found_in_filter == 0) {
			prefetch_line(ip, addr, pf_address, FILL_L1, metadata);
			// Add to RR filter
			// The oldest tag is dropped once the filter exceeds NUM_OF_RR_ENTRIES.
			recent_request_filter.push_back((pf_address >> 6) & RR_TAG_MASK);
			if (recent_request_filter.size() > NUM_OF_RR_ENTRIES)
				recent_request_filter.erase(recent_request_filter.begin());
		}
		// Counted even when the RR filter suppressed the request.
		num_prefs++;
	}
}
else
	flag = 1;

b. CS

i. train

// update constant stride(CS) confidence
trackers_l1[cpu][index].conf = update_conf(stride, trackers_l1[cpu][index].last_stride, trackers_l1[cpu][index].conf);
// update CS only if confidence is zero
if (trackers_l1[cpu][index].conf == 0)
	trackers_l1[cpu][index].last_stride = stride;

对应的update_conf函数

/* Confidence update for a stride predictor.
 * A matching stride raises the counter (never above 3); a mismatching stride
 * lowers it (never below 0). Returns the updated counter value. */
int update_conf(int stride, int pred_stride, int conf) {
    // Prediction was right: step up, capped at 3.
    if (stride == pred_stride)
        return (conf >= 3) ? 3 : conf + 1;

    // Prediction was wrong: step down, floored at 0.
    return (conf <= 0) ? 0 : conf - 1;
}

ii. prefetch

如果confidence足够的话,根据stride和prefetch degree发送prefetch请求
发送前用RR filter进行过滤

// CS (constant stride) prefetch stage: runs only when the CS confidence is
// above 1, the stored stride is non-zero, and `flag` is still 1. Otherwise
// `flag` is set to 1 so the next stage can run.
if (trackers_l1[cpu][index].conf > 1 && trackers_l1[cpu][index].last_stride != 0 && flag == 1) {
        meta_counter[cpu][1]++;
        total_count[cpu]++;

        // A CS degree below 2 keeps flag at 1; any larger degree clears it.
        if (prefetch_degree[cpu][2] < 2)
            flag = 1;
        else
            flag = 0;

        for (int i = 0; i < prefetch_degree[cpu][2]; i++) {
            // Target is (i + 1) strides ahead of the current line.
            uint64_t pf_address = (line_addr + (trackers_l1[cpu][index].last_stride * (i + 1))) << LOG2_BLOCK_SIZE;

            // Check if prefetch address is in same 4 KB page
            // Leaving the page ends the whole loop, not just this iteration.
            if ((pf_address >> LOG2_PAGE_SIZE) != (addr >> LOG2_PAGE_SIZE))
                break;

            trackers_l1[cpu][index].pref_type = CS_TYPE;
            bl_index = hash_bloom(pf_address);
            stats[cpu][CS_TYPE].bl_request[bl_index] = 1;
            // The stride is encoded in the metadata only when acc[cpu][2] exceeds 75;
            // otherwise a stride of 0 is encoded.
            if (acc[cpu][2] > 75)
                metadata = encode_metadata(trackers_l1[cpu][index].last_stride, CS_TYPE, spec_nl[cpu]);
            else
                metadata = encode_metadata(0, CS_TYPE, spec_nl[cpu]);

            // Scan the recent-request (RR) filter for this line's tag.
            // NOTE(review): this inner `i` shadows the outer loop index.
            int found_in_filter = 0;
            for (int i = 0; i < recent_request_filter.size(); i++) {
                if (recent_request_filter[i] == ((pf_address >> 6) & RR_TAG_MASK))
                    // Prefetch address is present in RR filter
                    found_in_filter = 1;
            }
            // Issue prefetch request only if prefetch address is not present in RR filter
            if (found_in_filter == 0) {
                prefetch_line(ip, addr, pf_address, FILL_L1, metadata);
                // Add to RR filter
                // The oldest tag is dropped once the filter exceeds NUM_OF_RR_ENTRIES.
                recent_request_filter.push_back((pf_address >> 6) & RR_TAG_MASK);
                if (recent_request_filter.size() > NUM_OF_RR_ENTRIES)
                    recent_request_filter.erase(recent_request_filter.begin());
            }
            // Counted even when the RR filter suppressed the request.
            num_prefs++;
        }
    }
    else
        flag = 1;

c. CPLX

i. training

last_signature = trackers_l1[cpu][index].signature;
// update complex stride(CPLX) confidence
CSPT_l1[cpu][last_signature].conf = update_conf(stride, CSPT_l1[cpu][last_signature].stride, CSPT_l1[cpu][last_signature].conf);

// update CPLX only if confidence is zero
if (CSPT_l1[cpu][last_signature].conf == 0)
	CSPT_l1[cpu][last_signature].stride = stride;

// calculate and update new signature in IP table
signature = update_sig_l1(last_signature, stride);
trackers_l1[cpu][index].signature = signature;

其中update_conf和CS是一样的
update_sig_l1函数如下

// Fold a delta into a CPLX signature.
// A negative delta is encoded as its magnitude plus bit 6 (7-bit
// sign-magnitude, intended for deltas from -63 to +63). The encoded value is
// XORed into the old signature shifted left by one, and the result is
// truncated to NUM_SIG_BITS bits.
uint16_t update_sig_l1(uint16_t old_sig, int delta) {
    const int sign_bit = 1 << 6;
    const int encoded_delta = (delta >= 0) ? delta : (sign_bit - delta);
    const int sig_mask = (1 << NUM_SIG_BITS) - 1;

    return static_cast<uint16_t>(((old_sig << 1) ^ encoded_delta) & sig_mask);
}

ii. prefetch

跟SPP一样speculate lookahead的prefetch

// CPLX (complex stride) prefetch stage: follows the chain of signatures through
// the CSPT, accumulating predicted strides into a running offset. Runs only
// when the entry for the current signature has conf >= 0 and a non-zero
// stride, and `flag` is 1.
// if conf>=0, continue looking for stride
if (CSPT_l1[cpu][signature].conf >= 0 && CSPT_l1[cpu][signature].stride != 0 && flag == 1) {
	int pref_offset = 0, i = 0; // CPLX IP
	meta_counter[cpu][2]++;
	total_count[cpu]++;

	// The first CPLX_DIST iterations only advance the lookahead; requests are
	// issued from iteration CPLX_DIST onwards (see the i >= CPLX_DIST test below).
	for (i = 0; i < prefetch_degree[cpu][3] + CPLX_DIST; i++) {
		pref_offset += CSPT_l1[cpu][signature].stride;
		uint64_t pf_address = ((line_addr + pref_offset) << LOG2_BLOCK_SIZE);

		// Check if prefetch address is in same 4 KB page
		if (((pf_address >> LOG2_PAGE_SIZE) != (addr >> LOG2_PAGE_SIZE)) ||
			(CSPT_l1[cpu][signature].conf == -1) ||
			(CSPT_l1[cpu][signature].stride == 0))
			// if new entry in CSPT or stride is zero, break
			break;

		// we are not prefetching at L2 for CPLX type, so encode stride as 0
		trackers_l1[cpu][index].pref_type = CPLX_TYPE;
		metadata = encode_metadata(0, CPLX_TYPE, spec_nl[cpu]);

		// prefetch only when conf>0 for CPLX
		if (CSPT_l1[cpu][signature].conf > 0 && i >= CPLX_DIST) {
			bl_index = hash_bloom(pf_address);
			stats[cpu][CPLX_TYPE].bl_request[bl_index] = 1;
			// NOTE(review): literal 3 overwrites the CPLX_TYPE value set above;
			// presumably CPLX_TYPE == 3 — confirm.
			trackers_l1[cpu][index].pref_type = 3;

			// Scan the recent-request (RR) filter for this line's tag.
			// NOTE(review): this inner `i` shadows the outer loop index.
			int found_in_filter = 0;
			for (int i = 0; i < recent_request_filter.size(); i++) {
				if (recent_request_filter[i] == ((pf_address >> 6) & RR_TAG_MASK))
					// Prefetch address is present in RR filter
					found_in_filter = 1;
			}
			// Issue prefetch request only if prefetch address is not present in RR filter
			if (found_in_filter == 0) {
				prefetch_line(ip, addr, pf_address, FILL_L1, metadata);
				// Add to RR filter
				// The oldest tag is dropped once the filter exceeds NUM_OF_RR_ENTRIES.
				recent_request_filter.push_back((pf_address >> 6) & RR_TAG_MASK);
				if (recent_request_filter.size() > NUM_OF_RR_ENTRIES)
					recent_request_filter.erase(recent_request_filter.begin());
			}
			// Counted even when the RR filter suppressed the request.
			num_prefs++;
		}
		// Advance to the next signature using the stride just predicted.
		signature = update_sig_l1(signature, CSPT_l1[cpu][signature].stride);
	}
}

d. NL prefetcher

如果GS/CS/CPLX都没有发送预取,则到NL Prefetcher

// NL (next line) fallback stage: considered only when the earlier stages
// counted no prefetches and speculative NL is enabled for this cpu.
// if no prefetches are issued till now, speculatively issue a next_line prefetch
if (num_prefs == 0 && spec_nl[cpu] == 1) {
	// flag_nl gates the request: when it is 0 this access only sets it to 1,
	// and no prefetch is issued.
	if (flag_nl[cpu] == 0)
		flag_nl[cpu] = 1;
	else {
		// Address of the cache line immediately after the demand address.
		uint64_t pf_address = ((addr >> LOG2_BLOCK_SIZE) + 1) << LOG2_BLOCK_SIZE;
		if ((pf_address >> LOG2_PAGE_SIZE) != (addr >> LOG2_PAGE_SIZE)) {
			// update the IP table entries and return if NL request is not to the same 4 KB page
			trackers_l1[cpu][index].last_line_offset = line_offset;
			trackers_l1[cpu][index].last_vpage = curr_page;
			return;
		}
		bl_index = hash_bloom(pf_address);
		stats[cpu][NL_TYPE].bl_request[bl_index] = 1;
		metadata = encode_metadata(1, NL_TYPE, spec_nl[cpu]);

		// Scan the recent-request (RR) filter for this line's tag.
		int found_in_filter = 0;
		for (int i = 0; i < recent_request_filter.size(); i++){
			if (recent_request_filter[i] == ((pf_address >> 6) & RR_TAG_MASK))
				// Prefetch address is present in RR filter
				found_in_filter = 1;
		}
		// Issue prefetch request only if prefetch address is not present in RR filter
		if (found_in_filter == 0) {
			prefetch_line(ip, addr, pf_address, FILL_L1, metadata);
			// Add to RR filter
			// The oldest tag is dropped once the filter exceeds NUM_OF_RR_ENTRIES.
			recent_request_filter.push_back((pf_address >> 6) & RR_TAG_MASK);
			if (recent_request_filter.size() > NUM_OF_RR_ENTRIES)
				recent_request_filter.erase(recent_request_filter.begin());
		}
		// The bookkeeping below runs whether or not the RR filter suppressed the request.
		trackers_l1[cpu][index].pref_type = NL_TYPE;
		meta_counter[cpu][3]++;
		total_count[cpu]++;

		// When acc[cpu][4] is below 40, flag_nl is cleared again, so the next
		// eligible access only re-arms it.
		if (acc[cpu][4] < 40)
			flag_nl[cpu] = 0;
	} // NL IP
}

注意NL Prefetcher会根据MPKI进行控制

// On a miss, whenever the miss count is a multiple of 256, recompute the
// misses-per-kilo-instruction figure and refresh the speculative NL bit.
if (num_misses[cpu] % 256 == 0 && cache_hit == 0) {
	mpki[cpu] = ((num_misses[cpu] * 1000.0) / (ooo_cpu[cpu].num_retired - ooo_cpu[cpu].warmup_instructions));

	// Speculative NL is switched off above the threshold and on otherwise.
	spec_nl[cpu] = (mpki[cpu] > spec_nl_threshold) ? 0 : 1;
}

4. Others

prefetch degree throttle

// Updating prefetch degree based on accuracy
for (int i = 0; i < 5; i++) {
	if (pref_filled[cpu][i] % 256 == 0) {
		acc_useful[cpu][i] = acc_useful[cpu][i] / 2.0 + (pref_useful[cpu][i] - acc_useful[cpu][i]) / 2.0;
		acc_filled[cpu][i] = acc_filled[cpu][i] / 2.0 + (pref_filled[cpu][i] - acc_filled[cpu][i]) / 2.0;

		if (acc_filled[cpu][i] != 0)
			acc[cpu][i] = 100.0 * acc_useful[cpu][i] / (acc_filled[cpu][i]);
		else
			acc[cpu][i] = 60;

		if (acc[cpu][i] > 75) {
			prefetch_degree[cpu][i]++;
			if (i == 1) {
				// For GS class, degree is incremented/decremented by 2.
				prefetch_degree[cpu][i]++;
				if (prefetch_degree[cpu][i] > 6)
					prefetch_degree[cpu][i] = 6;
			}
			else if (prefetch_degree[cpu][i] > 3)
				prefetch_degree[cpu][i] = 3;
		}
		else if (acc[cpu][i] < 40) {
			prefetch_degree[cpu][i]--;
			if (i == 1)
				prefetch_degree[cpu][i]--;
			if (prefetch_degree[cpu][i] < 1)
				prefetch_degree[cpu][i] = 1;
		}
	}
}