CSAPP Cachelab总结

Part A

第一部分是用C语言实现一个Cache模拟器,实现代码如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
#include "cachelab.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
int s,S,E,b;
FILE *fp;
int hit_count,miss_count,eviction_count;
typedef struct _Cache{
int valid,tag,last_visited;
}Cache;
Cache ** cache= NULL;
char input[100];
unsigned int char2hex(char c){
//printf("arg=%c\n",c);
if(c >= '0'&& c <= '9'){
//printf("value = %d\n",c - '0');
return c - '0';
}
else if(c >= 'a' && c <= 'f'){
return c - 'a' + 10;
}
else if(c >= 'A' && c <= 'F'){
return c - 'A' + 10;
}
return 0;
}

void parse(int argc, char **argv)
{
for(int i = 0; i < argc; i++){
if(argv[i][0] == '-'){
if(argv[i][1] == 's')
{
s = atoi(argv[++i]);
S = 1 << s;
}
else if(argv[i][1] == 'E'){
E = atoi(argv[++i]);
}
else if(argv[i][1] == 'b'){
b = atoi(argv[++i]);
}
else if(argv[i][1] == 't'){
i++;
fp = fopen((argv[i]),"r");
}
}
}
}
void cache_option(int op,unsigned int addr,int size,int time)
{
int tag,index;
index = (addr >> b) & ((-1U) >> (32 - s));
tag = addr >> (s + b);
printf("op=%d addr=%x tag:%x index=%x\n",op,addr,tag,index);
if(op == 1){ // I_type
return;
}
if(op == 2 || op == 4){ // S_type or M_type need to write to the cache
int hit = 0;
for(int i = 0;i < E;i++){
if(cache[index][i].tag == tag){ //hit
cache[index][i].last_visited = 0;
hit_count++;
hit = 1;
printf("hit\n");
break;
}
}
if(hit == 0){
int sign = 0;
for(int i = 0;i < E;i++){
if(cache[index][i].valid == 0){
cache[index][i].valid = 1;
cache[index][i].last_visited = 0;
cache[index][i].tag = tag;
sign = 1;
miss_count++;
printf("miss\n");
break;
}
}
if(sign == 0){
eviction_count++;
miss_count++;
printf("eviction\nmiss\n");
int flag = 0,max_last_visited = 0;
for(int i = 0;i < E;i++){
if(cache[index][i].last_visited > max_last_visited){
max_last_visited = cache[index][i].last_visited;
flag = i;
}
}
cache[index][flag].last_visited = 0;
cache[index][flag].tag = tag;
}
}
}
if(op == 3 || op == 4){ // L_type or M_type need to write the cache
int hit = 0;
for(int i = 0;i < E;i++){
if(cache[index][i].tag == tag){ //hit
cache[index][i].last_visited = 0;
hit_count++;
printf("hit\n");
hit = 1;
break;
}
}
if(hit == 0){
int sign = 0;
for(int i = 0;i < E;i++){
if(cache[index][i].valid == 0){
cache[index][i].valid = 1;
cache[index][i].last_visited = 0;
cache[index][i].tag = tag;
sign = 1;
miss_count++;
printf("miss\n");
break;
}
}
if(sign == 0){
eviction_count++;
miss_count++;
printf("eviction\nmiss\n");
int flag = 0,max_last_visited = 0;
for(int i = 0;i < E;i++){
if(cache[index][i].last_visited > max_last_visited){
max_last_visited = cache[index][i].last_visited;
flag = i;
}
}
cache[index][flag].last_visited = 0;
cache[index][flag].tag = tag;
}
}
}
for(int i = 0;i < S;i++){
for(int j = 0;j < E;j++){
if(cache[i][j].valid == 1){
cache[i][j].last_visited++;
}
}
}
}
int main(int argc, char* argv[])
{
parse(argc,argv);
//printf("s:%d S:%d E:%d b:%d",s,S,E,b);
hit_count = 0;
miss_count = 0;
eviction_count = 0;
cache = (Cache **)malloc(sizeof(Cache *) * S);
for(int i = 0;i < S;i++){
cache[i] = (Cache *)malloc(sizeof(Cache) * E);
}
for(int i = 0;i < S;i++){
for(int j = 0;j < E;j++){
cache[i][j].last_visited = -1;
cache[i][j].valid = 0;
cache[i][j].tag = -1;
}
}
int time = 0;
while(fgets(input,200,fp)){
//printf("%s\n",input);
int flag = 0;
int size = 0;
unsigned int addr = 0;
int len = strlen(input) - 1;
int op = 0;
for(int i = 0;i < len;i++){
if(input[i] == 'I'){
op = 1;
}
else if(input[i] == 'S'){
op = 2;
}
else if(input[i] == 'L'){
op = 3;
}
else if(input[i] == 'M'){
op = 4;
}
else if(input[i] == ' '){
continue;
}
else if(input[i] == ','){
flag = 1;
//op = 0;
}
else{
if(flag == 0){
//printf("%c\n",input[i]);
addr = addr * 16 + char2hex(input[i]);
//printf("ADDR=%x\n",addr);
}
else{
size = size * 10 + (input[i] - '0');
}
}
//cache_option(op,addr,size,time);
}
//printf("addr=%x size=%x\n",addr,size);
cache_option(op,addr,size,time);
}
printSummary(hit_count,miss_count,eviction_count);
for(int i = 0;i < S;i++){
free(cache[i]);
}
free(cache);
fclose(fp);
return 0;
}

s,S用来记录组索引位数和组数(S = 2^s)

E记录每组行数

b记录块索引位数

(上述参数与书中描述相同)

然后定义一个Cache结构体来模拟

1
2
3
typedef struct _Cache{
int valid,tag,last_visited;
}Cache;

char2hex函数将字符转为十六进制下的数值,该函数原理简单不过多赘述

parse函数处理命令行输入的命令,为Cache的各个参数进行赋值,直接按定义实现:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
void parse(int argc, char **argv)
{
for(int i = 0; i < argc; i++){
if(argv[i][0] == '-'){
if(argv[i][1] == 's')
{
s = atoi(argv[++i]);
S = 1 << s;
}
else if(argv[i][1] == 'E'){
E = atoi(argv[++i]);
}
else if(argv[i][1] == 'b'){
b = atoi(argv[++i]);
}
else if(argv[i][1] == 't'){
i++;
fp = fopen((argv[i]),"r");
}
}
}
}

接下来是最重要的Cache模拟函数

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
void cache_option(int op,unsigned int addr,int size,int time)
{
int tag,index;
index = (addr >> b) & ((-1U) >> (32 - s));
tag = addr >> (s + b);
printf("op=%d addr=%x tag:%x index=%x\n",op,addr,tag,index);
if(op == 1){ // I_type
return;
}
if(op == 2 || op == 4){ // S_type or M_type need to write to the cache
int hit = 0;
for(int i = 0;i < E;i++){
if(cache[index][i].tag == tag){ //hit
cache[index][i].last_visited = 0;
hit_count++;
hit = 1;
printf("hit\n");
break;
}
}
if(hit == 0){
int sign = 0;
for(int i = 0;i < E;i++){
if(cache[index][i].valid == 0){
cache[index][i].valid = 1;
cache[index][i].last_visited = 0;
cache[index][i].tag = tag;
sign = 1;
miss_count++;
printf("miss\n");
break;
}
}
if(sign == 0){
eviction_count++;
miss_count++;
printf("eviction\nmiss\n");
int flag = 0,max_last_visited = 0;
for(int i = 0;i < E;i++){
if(cache[index][i].last_visited > max_last_visited){
max_last_visited = cache[index][i].last_visited;
flag = i;
}
}
cache[index][flag].last_visited = 0;
cache[index][flag].tag = tag;
}
}
}
if(op == 3 || op == 4){ // L_type or M_type need to write the cache
int hit = 0;
for(int i = 0;i < E;i++){
if(cache[index][i].tag == tag){ //hit
cache[index][i].last_visited = 0;
hit_count++;
printf("hit\n");
hit = 1;
break;
}
}
if(hit == 0){
int sign = 0;
for(int i = 0;i < E;i++){
if(cache[index][i].valid == 0){
cache[index][i].valid = 1;
cache[index][i].last_visited = 0;
cache[index][i].tag = tag;
sign = 1;
miss_count++;
printf("miss\n");
break;
}
}
if(sign == 0){
eviction_count++;
miss_count++;
printf("eviction\nmiss\n");
int flag = 0,max_last_visited = 0;
for(int i = 0;i < E;i++){
if(cache[index][i].last_visited > max_last_visited){
max_last_visited = cache[index][i].last_visited;
flag = i;
}
}
cache[index][flag].last_visited = 0;
cache[index][flag].tag = tag;
}
}
}
for(int i = 0;i < S;i++){
for(int j = 0;j < E;j++){
if(cache[i][j].valid == 1){
cache[i][j].last_visited++;
}
}
}
}

访问地址的index和tag位按定义设置即可,通过op值来区分I/S/L/M四种type的指令,需要注意的一点是,Mtype指令既需要读也需要写,所以在读写的if中均加入Mtype的情况将其在读写两种情况中均执行一遍即可

  • 如果是Itype指令,那么对Cache无读写操作,直接返回即可
  • 如果是Stype或Mtype,那么会对Cache执行一次写操作,对Cache的每一行搜索tag值,如果命中则更新hit_count(总命中数)和上次访问时间last_count,标志本次访问标记hit为1(代表命中);若未命中则按照定义向Cache中执行写入,如果有valid的行则写入valid行并给miss_count计数,修改Cache该行的参数即可;若没有valid行则通过LRU策略选最久未访问的行进行替换,给eviction_count和miss_count计数并更新Cache的参数
  • 如果是Ltype或Mtype,那么会对Cache执行一次读操作,由于我们的模拟器对Cache只执行模拟不执行实际的读写,故读操作和写操作在模拟中的行为是一样的,在此不重复赘述,参考上面写操作过程即可

这样就完成了Cache的模拟

在主函数中按顺序处理参数-按参数对Cache初始化(由于s,E,B参数不固定故采取堆分配初始化Cache)-读入模拟指令-调用cache_option进行模拟cache-打印hit/miss/eviction三种行为的数量-free cache即可

Part B

第二部分要完成一个矩阵转置,但要求Cache miss数量尽可能少,测试情况有三种:32x32,64x64,61x67

由于b = 5,故块大小为2^5=32,又由于sizeof(int) = 4,所以每个块内可放32 / sizeof(int) = 8个数据

,并且有2^5组(s = 5),每组1行

32x32

由于Cache的一行能存储八个数据,所以很自然的会考虑8x8的矩阵分块形式

注意到writeup中写到我们只能使用12个变量(也就是寄存器),那么为什么会提到这一点呢:这是因为变量是寄存器,如果我们把数据存到变量再存储到另一个矩阵中,这样就比正常的B[j][i] = A[i][j]少了当i=j时,B[j]这一行会替换A[i]那一行导致的miss,可以在一定程度上减少miss数量

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
for(int i = 0;i < 4;i++){
for(int j = 0;j < 4;j++){
for(int k = (i << 3); k < ((i + 1) << 3);k++){
int h = j << 3;
r1 = A[k][h];
r2 = A[k][h + 1];
r3 = A[k][h + 2];
r4 = A[k][h + 3];
r5 = A[k][h + 4];
r6 = A[k][h + 5];
r7 = A[k][h + 6];
r8 = A[k][h + 7];

B[h][k] = r1;
B[h + 1][k] = r2;
B[h + 2][k] = r3;
B[h + 3][k] = r4;
B[h + 4][k] = r5;
B[h + 5][k] = r6;
B[h + 6][k] = r7;
B[h + 7][k] = r8;
}
}
}

这样我们使用8个变量(不超过限制),就可以将miss数缩小到满分的300以内()

64x64

由于这次矩阵变为64x64的int矩阵,所以一个Cache只能存储四行,很自然的会想到将矩阵进行4x4分块,但是这样分块又会导致Cache一行八块不能充分利用(不能全中),所以为了更好的利用题目中给定参数的Cache,我们将矩阵先分成8x8的块,再将8x8的块分成4x4的块

注意到采用这种方式的分块会导致每隔四行Cache就会被覆盖,所以我们采取先对8x8中的上半部的两个4x4先转置后复制的策略(这样可以避免4x4带来的浪费),然后将下半部每隔分隔4列的两列中的8个元素依次放到寄存器中,然后不断对这八个元素找到对应位置然后换位赋值(就是原来先复制后赋值过程中一开始错放的位置),这样不断重复完成赋值:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
for(int i = 0;i < 64;i += 8){
for(int j = 0;j < 64;j += 8){
for(int k = i;k < i + 4;k++){
r1 = A[k][j];
r2 = A[k][j + 1];
r3 = A[k][j + 2];
r4 = A[k][j + 3];
r5 = A[k][j + 4];
r6 = A[k][j + 5];
r7 = A[k][j + 6];
r8 = A[k][j + 7];

B[j][k] = r1;
B[j + 1][k] = r2;
B[j + 2][k] = r3;
B[j + 3][k] = r4;
B[j][k + 4] = r5;
B[j + 1][k + 4] = r6;
B[j + 2][k + 4] = r7;
B[j + 3][k + 4] = r8;
}
for(int k = j;k < j + 4;k++){
r1 = A[i + 4][k];
r5 = A[i + 4][k + 4];
r2 = A[i + 5][k];
r6 = A[i + 5][k + 4];
r3 = A[i + 6][k];
r7 = A[i + 6][k + 4];
r4 = A[i + 7][k];
r8 = A[i + 7][k + 4];

tran = B[k][i + 4];
B[k][i + 4] = r1;
r1 = tran;
tran = B[k][i + 5];
B[k][i + 5] = r2;
r2 = tran;
tran = B[k][i + 6];
B[k][i + 6] = r3;
r3 = tran;
tran = B[k][i + 7];
B[k][i + 7] = r4;
r4 = tran;

B[k + 4][i] = r1;
B[k + 4][i + 4] = r5;
B[k + 4][i + 1] = r2;
B[k + 4][i + 5] = r6;
B[k + 4][i + 2] = r3;
B[k + 4][i + 6] = r7;
B[k + 4][i + 3] = r4;
B[k + 4][i + 7] = r8;
}
}
}

61x67

最后一种情况比64x64简单一些,我们直接矩阵进行8x8进行分块然后对其他块直接转置即可,测试可知这样即可满足要求:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
for(int i = 0;i < N;i += 8){
for(int j = 0;j < M;j++){
if(i + 8 <= N && j < M){
r1 = A[i][j];
r2 = A[i + 1][j];
r3 = A[i + 2][j];
r4 = A[i + 3][j];
r5 = A[i + 4][j];
r6 = A[i + 5][j];
r7 = A[i + 6][j];
r8 = A[i + 7][j];

B[j][i] = r1;
B[j][i + 1] = r2;
B[j][i + 2] = r3;
B[j][i + 3] = r4;
B[j][i + 4] = r5;
B[j][i + 5] = r6;
B[j][i + 6] = r7;
B[j][i + 7] = r8;
}
else{
for(int k = i; k < i + 8 && k < N; k++){
B[j][k] = A[k][j];
}
}
}
}

trans.c的源代码如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
/* 
* trans.c - Matrix transpose B = A^T
*
* Each transpose function must have a prototype of the form:
* void trans(int M, int N, int A[N][M], int B[M][N]);
*
* A transpose function is evaluated by counting the number of misses
* on a 1KB direct mapped cache with a block size of 32 bytes.
*/
#include <stdio.h>
#include "cachelab.h"

int is_transpose(int M, int N, int A[N][M], int B[M][N]);

/*
* transpose_submit - This is the solution transpose function that you
* will be graded on for Part B of the assignment. Do not change
* the description string "Transpose submission", as the driver
* searches for that string to identify the transpose function to
* be graded.
*/
char transpose_submit_desc[] = "Transpose submission";
void transpose_submit(int M, int N, int A[N][M], int B[M][N])
{
int r1,r2,r3,r4,r5,r6,r7,r8;
if(N == 32){
for(int i = 0;i < 4;i++){
for(int j = 0;j < 4;j++){
for(int k = (i << 3); k < ((i + 1) << 3);k++){
int h = j << 3;
r1 = A[k][h];
r2 = A[k][h + 1];
r3 = A[k][h + 2];
r4 = A[k][h + 3];
r5 = A[k][h + 4];
r6 = A[k][h + 5];
r7 = A[k][h + 6];
r8 = A[k][h + 7];

B[h][k] = r1;
B[h + 1][k] = r2;
B[h + 2][k] = r3;
B[h + 3][k] = r4;
B[h + 4][k] = r5;
B[h + 5][k] = r6;
B[h + 6][k] = r7;
B[h + 7][k] = r8;
}
}
}
}
else if(N == 64){
int tran;
for(int i = 0;i < 64;i += 8){
for(int j = 0;j < 64;j += 8){
for(int k = i;k < i + 4;k++){
r1 = A[k][j];
r2 = A[k][j + 1];
r3 = A[k][j + 2];
r4 = A[k][j + 3];
r5 = A[k][j + 4];
r6 = A[k][j + 5];
r7 = A[k][j + 6];
r8 = A[k][j + 7];

B[j][k] = r1;
B[j + 1][k] = r2;
B[j + 2][k] = r3;
B[j + 3][k] = r4;
B[j][k + 4] = r5;
B[j + 1][k + 4] = r6;
B[j + 2][k + 4] = r7;
B[j + 3][k + 4] = r8;
}
for(int k = j;k < j + 4;k++){
r1 = A[i + 4][k];
r5 = A[i + 4][k + 4];
r2 = A[i + 5][k];
r6 = A[i + 5][k + 4];
r3 = A[i + 6][k];
r7 = A[i + 6][k + 4];
r4 = A[i + 7][k];
r8 = A[i + 7][k + 4];

tran = B[k][i + 4];
B[k][i + 4] = r1;
r1 = tran;
tran = B[k][i + 5];
B[k][i + 5] = r2;
r2 = tran;
tran = B[k][i + 6];
B[k][i + 6] = r3;
r3 = tran;
tran = B[k][i + 7];
B[k][i + 7] = r4;
r4 = tran;

B[k + 4][i] = r1;
B[k + 4][i + 4] = r5;
B[k + 4][i + 1] = r2;
B[k + 4][i + 5] = r6;
B[k + 4][i + 2] = r3;
B[k + 4][i + 6] = r7;
B[k + 4][i + 3] = r4;
B[k + 4][i + 7] = r8;
}
}
}
}
else{
for(int i = 0;i < N;i += 8){
for(int j = 0;j < M;j++){
if(i + 8 <= N && j < M){
r1 = A[i][j];
r2 = A[i + 1][j];
r3 = A[i + 2][j];
r4 = A[i + 3][j];
r5 = A[i + 4][j];
r6 = A[i + 5][j];
r7 = A[i + 6][j];
r8 = A[i + 7][j];

B[j][i] = r1;
B[j][i + 1] = r2;
B[j][i + 2] = r3;
B[j][i + 3] = r4;
B[j][i + 4] = r5;
B[j][i + 5] = r6;
B[j][i + 6] = r7;
B[j][i + 7] = r8;
}
else{
for(int k = i; k < i + 8 && k < N; k++){
B[j][k] = A[k][j];
}
}
}
}
}
}

/*
* You can define additional transpose functions below. We've defined
* a simple one below to help you get started.
*/

/*
* trans - A simple baseline transpose function, not optimized for the cache.
*/
char trans_desc[] = "Simple row-wise scan transpose";
void trans(int M, int N, int A[N][M], int B[M][N])
{
int i, j, tmp;

for (i = 0; i < N; i++) {
for (j = 0; j < M; j++) {
tmp = A[i][j];
B[j][i] = tmp;
}
}

}

/*
* registerFunctions - This function registers your transpose
* functions with the driver. At runtime, the driver will
* evaluate each of the registered functions and summarize their
* performance. This is a handy way to experiment with different
* transpose strategies.
*/
void registerFunctions()
{
/* Register your solution function */
registerTransFunction(transpose_submit, transpose_submit_desc);

/* Register any additional transpose functions */
registerTransFunction(trans, trans_desc);

}

/*
* is_transpose - This helper function checks if B is the transpose of
* A. You can check the correctness of your transpose by calling
* it before returning from the transpose function.
*/
int is_transpose(int M, int N, int A[N][M], int B[M][N])
{
int i, j;

for (i = 0; i < N; i++) {
for (j = 0; j < M; ++j) {
if (A[i][j] != B[j][i]) {
return 0;
}
}
}
return 1;
}



总结

本次实验通过对Cache的模拟以及针对Cache命中率进行优化的编程让我对Cache的原理有了更深层次的理解,也让我发现了在第一次学习Cache概念时有一定的误解(例如在何时进行替换以及怎样替换),而矩阵转置的部分也十分考验思维,也花费了许久的时间,总体来说本次实验在概念上和思维上都得到了极大的收获与锻炼


本博客所有文章除特别声明外,均采用 CC BY-SA 4.0 协议 ,转载请注明出处!