Part A
第一部分是用C语言实现一个Cache模拟器,实现代码如下:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 #include "cachelab.h" #include <stdio.h> #include <stdlib.h> #include <string.h> int s,S,E,b; FILE *fp;int hit_count,miss_count,eviction_count;typedef struct _Cache { int valid,tag,last_visited; }Cache; Cache ** cache= NULL ; char input[100 ];unsigned int char2hex (char c) { if (c >= '0' && c <= '9' ){ return c - '0' ; } else if (c >= 'a' && c <= 'f' ){ return c - 'a' + 10 ; } else if (c >= 'A' && c <= 'F' ){ return c - 'A' + 10 ; } return 0 ; }void parse (int argc, char **argv) { for (int i = 0 ; i < argc; i++){ if (argv[i][0 ] == '-' ){ if (argv[i][1 ] == 's' ) { s = atoi(argv[++i]); S = 1 << s; } else if (argv[i][1 ] == 'E' ){ E = atoi(argv[++i]); } else if (argv[i][1 ] == 'b' ){ b = atoi(argv[++i]); } else if (argv[i][1 ] == 't' ){ i++; fp = fopen((argv[i]),"r" ); } } } }void cache_option (int op,unsigned int addr,int size,int time) { int tag,index; index = (addr >> b) & ((-1U ) >> (32 - s)); tag = addr >> (s + b); printf ("op=%d addr=%x tag:%x index=%x\n" ,op,addr,tag,index); if (op == 1 ){ return ; } if (op == 2 || op == 4 ){ int hit = 0 ; for (int i = 0 ;i < E;i++){ if (cache[index][i].tag == tag){ cache[index][i].last_visited = 0 ; hit_count++; hit = 1 ; printf ("hit\n" ); break ; } } if (hit == 0 ){ int sign = 0 ; for (int i = 0 ;i < E;i++){ if (cache[index][i].valid == 0 ){ cache[index][i].valid = 1 ; cache[index][i].last_visited = 0 ; cache[index][i].tag = tag; sign = 1 ; miss_count++; printf ("miss\n" ); break ; } } if (sign == 0 ){ eviction_count++; miss_count++; printf ("eviction\nmiss\n" ); int flag = 0 ,max_last_visited = 0 ; for (int i = 0 ;i < E;i++){ if (cache[index][i].last_visited > max_last_visited){ max_last_visited = cache[index][i].last_visited; flag = i; } } cache[index][flag].last_visited = 0 ; cache[index][flag].tag = tag; } } } if (op == 3 || op == 4 ){ int hit = 0 ; for (int i = 0 ;i < E;i++){ if (cache[index][i].tag == tag){ cache[index][i].last_visited = 0 ; hit_count++; printf ("hit\n" ); hit = 1 ; break ; } } if (hit == 0 ){ int sign = 0 ; for (int i = 0 ;i < E;i++){ if (cache[index][i].valid == 0 ){ cache[index][i].valid = 1 ; cache[index][i].last_visited = 0 ; cache[index][i].tag = tag; sign = 1 ; miss_count++; printf ("miss\n" ); break ; } } if (sign == 0 ){ eviction_count++; miss_count++; printf ("eviction\nmiss\n" ); int flag = 0 ,max_last_visited = 0 ; for (int i = 0 ;i < E;i++){ if (cache[index][i].last_visited > max_last_visited){ max_last_visited = cache[index][i].last_visited; flag = i; } } cache[index][flag].last_visited = 0 ; cache[index][flag].tag = tag; } } } for (int i = 0 ;i < S;i++){ for (int j = 0 ;j < E;j++){ if (cache[i][j].valid == 1 ){ cache[i][j].last_visited++; } } } }int main (int argc, char * argv[]) { parse(argc,argv); hit_count = 0 ; miss_count = 0 ; eviction_count = 0 ; cache = (Cache **)malloc (sizeof (Cache *) * S); for (int i = 0 ;i < S;i++){ cache[i] = (Cache *)malloc (sizeof (Cache) * E); } for (int i = 0 ;i < S;i++){ for (int j = 0 ;j < E;j++){ cache[i][j].last_visited = -1 ; cache[i][j].valid = 0 ; cache[i][j].tag = -1 ; } } int time = 0 ; while (fgets(input,200 ,fp)){ int flag = 0 ; int size = 0 ; unsigned int addr = 0 ; int len = strlen (input) - 1 ; int op = 0 ; for (int i = 0 ;i < len;i++){ if (input[i] == 'I' ){ op = 1 ; } else if (input[i] == 'S' ){ op = 2 ; } else if (input[i] == 'L' ){ op = 3 ; } else if (input[i] == 'M' ){ op = 4 ; } else if (input[i] == ' ' ){ continue ; } else if (input[i] == ',' ){ flag = 1 ; } else { if (flag == 0 ){ addr = addr * 16 + char2hex(input[i]); } else { size = size * 10 + (input[i] - '0' ); } } } cache_option(op,addr,size,time); } printSummary(hit_count,miss_count,eviction_count); for (int i = 0 ;i < S;i++){ free (cache[i]); } free (cache); fclose(fp); return 0 ; }
s,S用来记录组索引位数和组数(S = 2^s
)
E记录每组行数
b记录块索引位数
(上述参数与书中描述相同)
然后定义一个Cache结构体来模拟
1 2 3 typedef struct _Cache { int valid,tag,last_visited; }Cache;
char2hex函数将字符转为十六进制下的数值,该函数原理简单不过多赘述
parse函数处理命令行输入的命令,为Cache的各个参数进行赋值,直接按定义实现:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 void parse (int argc, char **argv) { for (int i = 0 ; i < argc; i++){ if (argv[i][0 ] == '-' ){ if (argv[i][1 ] == 's' ) { s = atoi(argv[++i]); S = 1 << s; } else if (argv[i][1 ] == 'E' ){ E = atoi(argv[++i]); } else if (argv[i][1 ] == 'b' ){ b = atoi(argv[++i]); } else if (argv[i][1 ] == 't' ){ i++; fp = fopen((argv[i]),"r" ); } } } }
接下来是最重要的Cache模拟函数
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 void cache_option (int op,unsigned int addr,int size,int time) { int tag,index; index = (addr >> b) & ((-1U ) >> (32 - s)); tag = addr >> (s + b); printf ("op=%d addr=%x tag:%x index=%x\n" ,op,addr,tag,index); if (op == 1 ){ return ; } if (op == 2 || op == 4 ){ int hit = 0 ; for (int i = 0 ;i < E;i++){ if (cache[index][i].tag == tag){ cache[index][i].last_visited = 0 ; hit_count++; hit = 1 ; printf ("hit\n" ); break ; } } if (hit == 0 ){ int sign = 0 ; for (int i = 0 ;i < E;i++){ if (cache[index][i].valid == 0 ){ cache[index][i].valid = 1 ; cache[index][i].last_visited = 0 ; cache[index][i].tag = tag; sign = 1 ; miss_count++; printf ("miss\n" ); break ; } } if (sign == 0 ){ eviction_count++; miss_count++; printf ("eviction\nmiss\n" ); int flag = 0 ,max_last_visited = 0 ; for (int i = 0 ;i < E;i++){ if (cache[index][i].last_visited > max_last_visited){ max_last_visited = cache[index][i].last_visited; flag = i; } } cache[index][flag].last_visited = 0 ; cache[index][flag].tag = tag; } } } if (op == 3 || op == 4 ){ int hit = 0 ; for (int i = 0 ;i < E;i++){ if (cache[index][i].tag == tag){ cache[index][i].last_visited = 0 ; hit_count++; printf ("hit\n" ); hit = 1 ; break ; } } if (hit == 0 ){ int sign = 0 ; for (int i = 0 ;i < E;i++){ if (cache[index][i].valid == 0 ){ cache[index][i].valid = 1 ; cache[index][i].last_visited = 0 ; cache[index][i].tag = tag; sign = 1 ; miss_count++; printf ("miss\n" ); break ; } } if (sign == 0 ){ eviction_count++; miss_count++; printf ("eviction\nmiss\n" ); int flag = 0 ,max_last_visited = 0 ; for (int i = 0 ;i < E;i++){ if (cache[index][i].last_visited > max_last_visited){ max_last_visited = cache[index][i].last_visited; flag = i; } } cache[index][flag].last_visited = 0 ; cache[index][flag].tag = tag; } } } for (int i = 0 ;i < S;i++){ for (int j = 0 ;j < E;j++){ if (cache[i][j].valid == 1 ){ cache[i][j].last_visited++; } } } }
访问地址的index和tag位按定义设置即可,通过op值来区分I/S/L/M四种type的指令,需要注意的一点是,Mtype指令既需要读也需要写,所以在读写的if中均加入Mtype的情况将其在读写两种情况中均执行一遍即可
如果是Itype指令,那么对Cache无读写操作,直接返回即可
如果是Stype或Mtype,那么会对Cache执行一次写操作,对Cache的每一行搜索tag值,如果命中则更新hit_count(总命中数)和上次访问时间last_count,标志本次访问标记hit为1(代表命中);若未命中则按照定义向Cache中执行写入,如果有valid的行则写入valid行并给miss_count计数,修改Cache该行的参数即可;若没有valid行则通过LRU策略选最久未访问的行进行替换,给eviction_count和miss_count计数并更新Cache的参数
如果是Ltype或Mtype,那么会对Cache执行一次读操作,由于我们的模拟器对Cache只执行模拟不执行实际的读写,故读操作和写操作在模拟中的行为是一样的,在此不重复赘述,参考上面写操作过程即可
这样就完成了Cache的模拟
在主函数中按顺序处理参数-按参数对Cache初始化(由于s,E,B参数不固定故采取堆分配初始化Cache)-读入模拟指令-调用cache_option进行模拟cache-打印hit/miss/eviction三种行为的数量-free
cache即可
Part B
第二部分要完成一个矩阵转置,但要求Cache
miss数量尽可能少,测试情况有三种:32x32,64x64,61x67
由于b = 5
,故块大小为2^5=32
,又由于sizeof(int) = 4
,所以每个块内可放32 / sizeof(int) = 8
个数据
,并且有2^5组(s = 5
),每组1行
32x32
由于Cache的一行能存储八个数据,所以很自然的会考虑8x8的矩阵分块形式
注意到writeup中写到我们只能使用12个变量(也就是寄存器),那么为什么会提到这一点呢:这是因为变量是寄存器,如果我们把数据存到变量再存储到另一个矩阵中,这样就比正常的B[j][i] = A[i][j]
少了当i=j
时,B[j]
这一行会替换A[i]
那一行导致的miss,可以在一定程度上减少miss数量
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 for (int i = 0 ;i < 4 ;i++){ for (int j = 0 ;j < 4 ;j++){ for (int k = (i << 3 ); k < ((i + 1 ) << 3 );k++){ int h = j << 3 ; r1 = A[k][h]; r2 = A[k][h + 1 ]; r3 = A[k][h + 2 ]; r4 = A[k][h + 3 ]; r5 = A[k][h + 4 ]; r6 = A[k][h + 5 ]; r7 = A[k][h + 6 ]; r8 = A[k][h + 7 ]; B[h][k] = r1; B[h + 1 ][k] = r2; B[h + 2 ][k] = r3; B[h + 3 ][k] = r4; B[h + 4 ][k] = r5; B[h + 5 ][k] = r6; B[h + 6 ][k] = r7; B[h + 7 ][k] = r8; } } }
这样我们使用8个变量(不超过限制),就可以将miss数缩小到满分的300以内()
64x64
由于这次矩阵变为64x64的int矩阵,所以一个Cache只能存储四行,很自然的会想到将矩阵进行4x4分块,但是这样分块又会导致Cache一行八块不能充分利用(不能全中),所以为了更好的利用题目中给定参数的Cache,我们将矩阵先分成8x8的块,再将8x8的块分成4x4的块
注意到采用这种方式的分块会导致每隔四行Cache就会被覆盖,所以我们采取先对8x8中的上半部的两个4x4先转置后复制的策略(这样可以避免4x4带来的浪费),然后将下半部每隔分隔4列的两列中的8个元素依次放到寄存器中,然后不断对这八个元素找到对应位置然后换位赋值(就是原来先复制后赋值过程中一开始错放的位置),这样不断重复完成赋值:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 for (int i = 0 ;i < 64 ;i += 8 ){ for (int j = 0 ;j < 64 ;j += 8 ){ for (int k = i;k < i + 4 ;k++){ r1 = A[k][j]; r2 = A[k][j + 1 ]; r3 = A[k][j + 2 ]; r4 = A[k][j + 3 ]; r5 = A[k][j + 4 ]; r6 = A[k][j + 5 ]; r7 = A[k][j + 6 ]; r8 = A[k][j + 7 ]; B[j][k] = r1; B[j + 1 ][k] = r2; B[j + 2 ][k] = r3; B[j + 3 ][k] = r4; B[j][k + 4 ] = r5; B[j + 1 ][k + 4 ] = r6; B[j + 2 ][k + 4 ] = r7; B[j + 3 ][k + 4 ] = r8; } for (int k = j;k < j + 4 ;k++){ r1 = A[i + 4 ][k]; r5 = A[i + 4 ][k + 4 ]; r2 = A[i + 5 ][k]; r6 = A[i + 5 ][k + 4 ]; r3 = A[i + 6 ][k]; r7 = A[i + 6 ][k + 4 ]; r4 = A[i + 7 ][k]; r8 = A[i + 7 ][k + 4 ]; tran = B[k][i + 4 ]; B[k][i + 4 ] = r1; r1 = tran; tran = B[k][i + 5 ]; B[k][i + 5 ] = r2; r2 = tran; tran = B[k][i + 6 ]; B[k][i + 6 ] = r3; r3 = tran; tran = B[k][i + 7 ]; B[k][i + 7 ] = r4; r4 = tran; B[k + 4 ][i] = r1; B[k + 4 ][i + 4 ] = r5; B[k + 4 ][i + 1 ] = r2; B[k + 4 ][i + 5 ] = r6; B[k + 4 ][i + 2 ] = r3; B[k + 4 ][i + 6 ] = r7; B[k + 4 ][i + 3 ] = r4; B[k + 4 ][i + 7 ] = r8; } } }
61x67
最后一种情况比64x64简单一些,我们直接矩阵进行8x8进行分块然后对其他块直接转置即可,测试可知这样即可满足要求:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 for (int i = 0 ;i < N;i += 8 ){ for (int j = 0 ;j < M;j++){ if (i + 8 <= N && j < M){ r1 = A[i][j]; r2 = A[i + 1 ][j]; r3 = A[i + 2 ][j]; r4 = A[i + 3 ][j]; r5 = A[i + 4 ][j]; r6 = A[i + 5 ][j]; r7 = A[i + 6 ][j]; r8 = A[i + 7 ][j]; B[j][i] = r1; B[j][i + 1 ] = r2; B[j][i + 2 ] = r3; B[j][i + 3 ] = r4; B[j][i + 4 ] = r5; B[j][i + 5 ] = r6; B[j][i + 6 ] = r7; B[j][i + 7 ] = r8; } else { for (int k = i; k < i + 8 && k < N; k++){ B[j][k] = A[k][j]; } } } }
trans.c的源代码如下:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 #include <stdio.h> #include "cachelab.h" int is_transpose (int M, int N, int A[N][M], int B[M][N]) ;char transpose_submit_desc[] = "Transpose submission" ;void transpose_submit (int M, int N, int A[N][M], int B[M][N]) { int r1,r2,r3,r4,r5,r6,r7,r8; if (N == 32 ){ for (int i = 0 ;i < 4 ;i++){ for (int j = 0 ;j < 4 ;j++){ for (int k = (i << 3 ); k < ((i + 1 ) << 3 );k++){ int h = j << 3 ; r1 = A[k][h]; r2 = A[k][h + 1 ]; r3 = A[k][h + 2 ]; r4 = A[k][h + 3 ]; r5 = A[k][h + 4 ]; r6 = A[k][h + 5 ]; r7 = A[k][h + 6 ]; r8 = A[k][h + 7 ]; B[h][k] = r1; B[h + 1 ][k] = r2; B[h + 2 ][k] = r3; B[h + 3 ][k] = r4; B[h + 4 ][k] = r5; B[h + 5 ][k] = r6; B[h + 6 ][k] = r7; B[h + 7 ][k] = r8; } } } } else if (N == 64 ){ int tran; for (int i = 0 ;i < 64 ;i += 8 ){ for (int j = 0 ;j < 64 ;j += 8 ){ for (int k = i;k < i + 4 ;k++){ r1 = A[k][j]; r2 = A[k][j + 1 ]; r3 = A[k][j + 2 ]; r4 = A[k][j + 3 ]; r5 = A[k][j + 4 ]; r6 = A[k][j + 5 ]; r7 = A[k][j + 6 ]; r8 = A[k][j + 7 ]; B[j][k] = r1; B[j + 1 ][k] = r2; B[j + 2 ][k] = r3; B[j + 3 ][k] = r4; B[j][k + 4 ] = r5; B[j + 1 ][k + 4 ] = r6; B[j + 2 ][k + 4 ] = r7; B[j + 3 ][k + 4 ] = r8; } for (int k = j;k < j + 4 ;k++){ r1 = A[i + 4 ][k]; r5 = A[i + 4 ][k + 4 ]; r2 = A[i + 5 ][k]; r6 = A[i + 5 ][k + 4 ]; r3 = A[i + 6 ][k]; r7 = A[i + 6 ][k + 4 ]; r4 = A[i + 7 ][k]; r8 = A[i + 7 ][k + 4 ]; tran = B[k][i + 4 ]; B[k][i + 4 ] = r1; r1 = tran; tran = B[k][i + 5 ]; B[k][i + 5 ] = r2; r2 = tran; tran = B[k][i + 6 ]; B[k][i + 6 ] = r3; r3 = tran; tran = B[k][i + 7 ]; B[k][i + 7 ] = r4; r4 = tran; B[k + 4 ][i] = r1; B[k + 4 ][i + 4 ] = r5; B[k + 4 ][i + 1 ] = r2; B[k + 4 ][i + 5 ] = r6; B[k + 4 ][i + 2 ] = r3; B[k + 4 ][i + 6 ] = r7; B[k + 4 ][i + 3 ] = r4; B[k + 4 ][i + 7 ] = r8; } } } } else { for (int i = 0 ;i < N;i += 8 ){ for (int j = 0 ;j < M;j++){ if (i + 8 <= N && j < M){ r1 = A[i][j]; r2 = A[i + 1 ][j]; r3 = A[i + 2 ][j]; r4 = A[i + 3 ][j]; r5 = A[i + 4 ][j]; r6 = A[i + 5 ][j]; r7 = A[i + 6 ][j]; r8 = A[i + 7 ][j]; B[j][i] = r1; B[j][i + 1 ] = r2; B[j][i + 2 ] = r3; B[j][i + 3 ] = r4; B[j][i + 4 ] = r5; B[j][i + 5 ] = r6; B[j][i + 6 ] = r7; B[j][i + 7 ] = r8; } else { for (int k = i; k < i + 8 && k < N; k++){ B[j][k] = A[k][j]; } } } } } } char trans_desc[] = "Simple row-wise scan transpose" ;void trans (int M, int N, int A[N][M], int B[M][N]) { int i, j, tmp; for (i = 0 ; i < N; i++) { for (j = 0 ; j < M; j++) { tmp = A[i][j]; B[j][i] = tmp; } } }void registerFunctions () { registerTransFunction(transpose_submit, transpose_submit_desc); registerTransFunction(trans, trans_desc); }int is_transpose (int M, int N, int A[N][M], int B[M][N]) { int i, j; for (i = 0 ; i < N; i++) { for (j = 0 ; j < M; ++j) { if (A[i][j] != B[j][i]) { return 0 ; } } } return 1 ; }
总结
本次实验通过对Cache的模拟以及针对Cache命中率进行优化的编程让我对Cache的原理有了更深层次的理解,也让我发现了在第一次学习Cache概念时有一定的误解(例如在何时进行替换以及怎样替换),而矩阵转置的部分也十分考验思维,也花费了许久的时间,总体来说本次实验在概念上和思维上都得到了极大的收获与锻炼