csapp arch_lab

csapp lab arch_lab

初见arch lab

In this lab, you will learn about the design and implementation of a pipelined Y86-64 processor, optimizing both it and a benchmark program to maximize performance. You are allowed to make any semantics-preserving transformation to the benchmark program, or to make enhancements to the pipelined processor, or both. When you have completed the lab, you will have a keen appreciation for the interactions between code and hardware that affect the performance of your programs.

The lab is organized into three parts, each with its own handin. In Part A you will write some simple Y86-64 programs and become familiar with the Y86-64 tools. In Part B, you will extend the SEQ simulator with a new instruction. These two parts will prepare you for Part C, the heart of the lab, where you will optimize the Y86-64 benchmark program and the processor design.

Part A

这一部分主要使用 Y86-64 汇编语言改写 C 语言程序,对应的 C 代码示例在 examples.c 中。

1
2
3
4
5
/* linked list element */
typedef struct ELE {
long val;           /* value stored in this node */
struct ELE *next;   /* next node; the sample lists in this file end with 0 */
} *list_ptr;        /* list_ptr is a pointer to struct ELE */

可以看到其给出了链表的数据结构定义

sumlist

第一个程序要求迭代求和链表元素之和,我们的程序应该包括设计栈空间,引用函数,并且停止。同时给出了测试所用的案例。

1
2
3
4
5
6
7
8
9
10
/*
 * sum_list - Iteratively add up the val field of every node in a list.
 *
 * ls:      first node of the list, or a null pointer for an empty list.
 * returns: the sum of all val fields (0 for an empty list).
 */
long sum_list(list_ptr ls)
{
    long val = 0;

    /* Walk the chain until the null terminator is reached. */
    for (; ls; ls = ls->next)
        val += ls->val;

    return val;
}

一个简单但标准的Y86-64程序结构可以参考书上的252页。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
# sum_list - iteratively sum the elements of a linked list (Y86-64)
# Execution begins at address 0
        .pos 0
        irmovq stack, %rsp      # point %rsp at the top of the stack area
        call main               # run the test driver
        halt                    # stop once main returns

# Sample linked list: three {val, next} nodes, the last next field is 0
        .align 8
ele1:
        .quad 0x00a
        .quad ele2
ele2:
        .quad 0x0b0
        .quad ele3
ele3:
        .quad 0xc00
        .quad 0

# main - calls sum_list(ele1); the result is left in %rax
main:
        irmovq ele1, %rdi
        call sum_list
        ret

# long sum_list(list_ptr ls)
# In:   %rdi = ls (address of the first node, 0 for an empty list)
# Out:  %rax = sum of the val fields
# Uses: %rsi as scratch for the current val
sum_list:
        xorq %rax, %rax         # val = 0
sum_next:
        andq %rdi, %rdi         # ls == 0 ?
        je sum_done             # yes: end of list
        mrmovq (%rdi), %rsi     # %rsi = ls->val
        addq %rsi, %rax         # val += ls->val
        mrmovq 8(%rdi), %rdi    # ls = ls->next
        jmp sum_next
sum_done:
        ret

# Stack starts here and grows to lower addresses
        .pos 0x200
stack:

使用yas编译,使用yis模拟运行程序

sum.yo

看到rax寄存器中结果为0xcba,程序运行正确。

rsum_list

使用递归计算链表元素之和。

1
2
3
4
5
6
7
8
9
10
11
/*
 * rsum_list - Recursive version of sum_list.
 *
 * ls:      first node of the list, or a null pointer for an empty list.
 * returns: ls->val plus the sum of the rest of the list; 0 when ls is null.
 */
long rsum_list(list_ptr ls)
{
    /* Base case: nothing left to add. */
    if (!ls)
        return 0;

    long val = ls->val;
    long rest = rsum_list(ls->next);
    return val + rest;
}

代码如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
# rsum_list - recursive version of sum_list (Y86-64)
# Execution begins at address 0
        .pos 0
        irmovq stack, %rsp      # Set up stack pointer
        call main               # Execute main program
        halt                    # Terminate program

# Sample linked list: three {val, next} nodes, the last next field is 0
        .align 8
ele1:
        .quad 0x00a
        .quad ele2
ele2:
        .quad 0x0b0
        .quad ele3
ele3:
        .quad 0xc00
        .quad 0

# main - calls rsum_list(ele1); the result is left in %rax
main:
        irmovq ele1,%rdi
        call rsum_list
        ret

# long rsum_list(list_ptr ls)
# In:    %rdi = ls (address of the first node, 0 for an empty list)
# Out:   %rax = sum of the val fields (0 for an empty list)
# Saves: %rbx is a callee-saved register, so the caller's value is pushed
#        before %rbx is reused for val and popped again before returning.
# Stack: one 8-byte slot per recursion level, plus the return address.
rsum_list:
        andq %rdi, %rdi         # ls == 0 ?
        je rsum_empty           # if (!ls) return 0
        pushq %rbx              # preserve the caller's %rbx
        mrmovq (%rdi), %rbx     # val = ls->val (must survive the call)
        mrmovq 8(%rdi), %rdi    # ls = ls->next
        call rsum_list          # rest = rsum_list(ls->next), in %rax
        addq %rbx, %rax         # return val + rest
        popq %rbx               # restore the caller's %rbx
        ret
rsum_empty:
        irmovq $0, %rax         # empty list sums to 0
        ret


# Stack starts here and grows to lower addresses
        .pos 0x200
stack:

测试结果如下

rsum.yo

rax寄存器显示0xcba,结果正确。

copy

要求将 src 数组的内容复制到 dest,并返回 src 各元素的按位异或校验和。

1
2
3
4
5
6
7
8
9
10
11
12
/*
 * copy_block - Copy len words from src to dest.
 *
 * src:     first word to read.
 * dest:    first word to write.
 * len:     number of words to copy; nothing is copied when len <= 0.
 * returns: the xor of every word that was read from src (0 if none).
 */
long copy_block(long *src, long *dest, long len)
{
    long result = 0;

    for (; len > 0; len--) {
        long val = *src++;
        *dest++ = val;
        result ^= val;
    }

    return result;
}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
# copy_block - Copy src to dest and return xor checksum of src (Y86-64)
# Execution begins at address 0
        .pos 0
        irmovq stack, %rsp      # Set up stack pointer
        call main               # Execute main program
        halt                    # Terminate program

# Sample data
        .align 8
# Source block
src:
        .quad 0x00a
        .quad 0x0b0
        .quad 0xc00

# Destination block
dest:
        .quad 0x111
        .quad 0x222
        .quad 0x333

# main - calls copy_block(src, dest, 3); the checksum is left in %rax
main:
        irmovq src, %rdi        # src
        irmovq dest, %rsi       # dest
        irmovq $3, %rdx         # len
        call copy_block
        ret

# long copy_block(long *src, long *dest, long len)
# In:   %rdi = src, %rsi = dest, %rdx = len
# Out:  %rax = xor of all words read from src (0 when len <= 0)
# Uses: %r8 = 8 (word size), %r9 = 1, %r10 = current word
copy_block:
        irmovq $8, %r8          # pointer stride
        irmovq $1, %r9          # loop decrement
        xorq %rax, %rax         # result = 0
        andq %rdx, %rdx         # set CC from len for the first test
        jmp test
loop:
        mrmovq (%rdi), %r10     # val = *src
        addq %r8, %rdi          # src++
        rmmovq %r10, (%rsi)     # *dest = val
        addq %r8, %rsi          # dest++
        xorq %r10, %rax         # result ^= val
        subq %r9, %rdx          # len--, sets CC for the test below
test:
        jg loop                 # continue only while len > 0, as in the C
                                # source; jne would keep looping for len < 0
        ret

# Stack starts here and grows to lower addresses
        .pos 0x200
stack:

image-20230910133633223

结果正确

Part B

to extend the SEQ processor to support the iaddq

让处理器支持iaddq指令,修改hcl文件。

书中P264Y86-64处理器对一条指令的处理包括以下几个步骤:

  • 取指:根据 PC 的值从内存中读取指令字节
    • 指令指示符字节的两个四位部分,为icode:ifun
    • 寄存器指示符字节,为 rA, rB
    • 8字节常数字,为 valC
    • 计算下一条指令地址,为 valP
  • 译码:从寄存器读入最多两个操作数
    • rA, rB 指明的寄存器,读为 valA, valB
    • 对于指令popq, pushq, call, ret也可能从%rsp中读
  • 执行:根据ifun计算,或计算内存引用的有效地址,或增加或减少栈指针
    • 对上述三者之一进行的操作得到的值为valE
    • 如果是计算,则设置条件码
    • 对于条件传送指令,检验条件码和传送条件,并据此更新目标寄存器
    • 对于跳转指令,决定是否选择分支
  • 访存:将数据写入内存,或从内存读出数据
    • 若是从内存中读出数据,则读出的值为valM
  • 写回:最多写两个结果到寄存器
  • 更新 PC:将 PC 设置成下一条指令的地址

iaddq 的执行过程与 OPq 和 irmovq 类似,

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
指令为:iaddq V, rB
取指:
icode:ifun <- M_1[PC]
rA:rB <- M_1[PC+1]
valC <- M_8[PC+2]
valP <- PC+10

译码:
valB <- R[rB]

执行:
valE <- valB + valC
Set CC

访存:

写回:
R[rB] <- valE

更新PC:
PC <- valP

根据上述代码修改hcl文件,

其中取指阶段的 instr_valid、need_regids、need_valC 都需要加上 IIADDQ

1
2
3
# Set of instruction codes accepted as valid.
# IIADDQ is listed so that iaddq is accepted.
bool instr_valid =
        icode in { IHALT, INOP,
                   IRRMOVQ, IIRMOVQ, IRMMOVQ, IMRMOVQ,
                   IOPQ, IIADDQ,
                   IJXX, ICALL, IRET,
                   IPUSHQ, IPOPQ };
1
2
3
# True when the instruction has a register-specifier byte (rA:rB).
# iaddq names its destination in rB, so IIADDQ is included.
bool need_regids =
        icode in { IRRMOVQ, IIRMOVQ, IRMMOVQ, IMRMOVQ,
                   IOPQ, IIADDQ,
                   IPUSHQ, IPOPQ };
1
2
# True when the instruction carries an 8-byte constant word (valC).
# iaddq takes its immediate operand from valC, so IIADDQ is included.
bool need_valC =
        icode in { IIRMOVQ, IIADDQ,
                   IRMMOVQ, IMRMOVQ,
                   IJXX, ICALL };

译码和写回阶段:srcB 是产生 valB 的源寄存器,需要在选择 rB 的那一行集合中加上 IIADDQ

1
2
3
4
5
# Register read on port B (its value becomes valB).
word srcB = [
        # Instructions that read rB; iaddq reads it as the value to add to
        icode in { IOPQ, IIADDQ, IRMMOVQ, IMRMOVQ } : rB;
        # Stack instructions read the stack pointer
        icode in { IPUSHQ, IPOPQ, ICALL, IRET } : RRSP;
        1 : RNONE;  # Don't need register
];

dstE 表示写端口 E 的目的寄存器,ALU 计算出的 valE 将写入其中。iaddq 的结果要存放在 rB 中,因此需要在选择 rB 的集合中加上 IIADDQ

1
2
3
4
5
6
# Destination register for write port E (receives valE).
word dstE = [
        # rrmovq/cmovXX writes rB only when the condition holds
        icode == IRRMOVQ && Cnd : rB;
        # irmovq, OPq and iaddq always write their result to rB
        icode in { IIRMOVQ, IOPQ, IIADDQ } : rB;
        # Stack instructions update the stack pointer
        icode in { IPUSHQ, IPOPQ, ICALL, IRET } : RRSP;
        1 : RNONE;  # Don't write any register
];

执行阶段 ALU 对 aluA 和 aluB 进行计算:aluA 可以是 valA、valC、8 或 -8;aluB 可以是 valB 或 0,对 iaddq 而言 aluA 取 valC,aluB 取 valB

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
## Select input A to ALU
word aluA = [
        # Register operand
        icode in { IRRMOVQ, IOPQ } : valA;
        # Constant operand; for iaddq this is the immediate to add
        icode in { IIRMOVQ, IIADDQ, IRMMOVQ, IMRMOVQ } : valC;
        # Stack pointer adjustment
        icode in { ICALL, IPUSHQ } : -8;
        icode in { IRET, IPOPQ } : 8;
        # Other instructions don't need ALU
];

## Select input B to ALU
word aluB = [
        # Instructions that combine valB with input A; iaddq adds valC to valB
        icode in { IOPQ, IIADDQ, IRMMOVQ, IMRMOVQ,
                   ICALL, IRET, IPUSHQ, IPOPQ } : valB;
        # Moves use 0 as the second ALU input
        icode in { IRRMOVQ, IIRMOVQ } : 0;
        # Other instructions don't need ALU
];

同时需要更新条件码寄存器

1
# Condition codes are updated by OPq and by iaddq.
bool set_cc = icode in { IIADDQ, IOPQ };

iaddq 不涉及访存和跳转操作,因此无需修改访存阶段和更新 PC 阶段。

使用lab附的SEQ模拟器的TTY模式对HCL文件进行测试。(同时也建议大家去看看GUI模式,很惊艳

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
voidsolar@admin:~/csapp_lab/archlab-handout/sim/seq$ ./ssim -t ../y86-code/asumi.yo
Y86-64 Processor: seq-full.hcl
137 bytes of code read
IF: Fetched irmovq at 0x0. ra=----, rb=%rsp, valC = 0x100
IF: Fetched call at 0xa. ra=----, rb=----, valC = 0x38
Wrote 0x13 to address 0xf8
IF: Fetched irmovq at 0x38. ra=----, rb=%rdi, valC = 0x18
IF: Fetched irmovq at 0x42. ra=----, rb=%rsi, valC = 0x4
IF: Fetched call at 0x4c. ra=----, rb=----, valC = 0x56
Wrote 0x55 to address 0xf0
IF: Fetched xorq at 0x56. ra=%rax, rb=%rax, valC = 0x0
IF: Fetched andq at 0x58. ra=%rsi, rb=%rsi, valC = 0x0
IF: Fetched jmp at 0x5a. ra=----, rb=----, valC = 0x83
IF: Fetched jne at 0x83. ra=----, rb=----, valC = 0x63
IF: Fetched mrmovq at 0x63. ra=%r10, rb=%rdi, valC = 0x0
IF: Fetched addq at 0x6d. ra=%r10, rb=%rax, valC = 0x0
IF: Fetched iaddq at 0x6f. ra=----, rb=%rdi, valC = 0x8
IF: Fetched iaddq at 0x79. ra=----, rb=%rsi, valC = 0xffffffffffffffff
IF: Fetched jne at 0x83. ra=----, rb=----, valC = 0x63
IF: Fetched mrmovq at 0x63. ra=%r10, rb=%rdi, valC = 0x0
IF: Fetched addq at 0x6d. ra=%r10, rb=%rax, valC = 0x0
IF: Fetched iaddq at 0x6f. ra=----, rb=%rdi, valC = 0x8
IF: Fetched iaddq at 0x79. ra=----, rb=%rsi, valC = 0xffffffffffffffff
IF: Fetched jne at 0x83. ra=----, rb=----, valC = 0x63
IF: Fetched mrmovq at 0x63. ra=%r10, rb=%rdi, valC = 0x0
IF: Fetched addq at 0x6d. ra=%r10, rb=%rax, valC = 0x0
IF: Fetched iaddq at 0x6f. ra=----, rb=%rdi, valC = 0x8
IF: Fetched iaddq at 0x79. ra=----, rb=%rsi, valC = 0xffffffffffffffff
IF: Fetched jne at 0x83. ra=----, rb=----, valC = 0x63
IF: Fetched mrmovq at 0x63. ra=%r10, rb=%rdi, valC = 0x0
IF: Fetched addq at 0x6d. ra=%r10, rb=%rax, valC = 0x0
IF: Fetched iaddq at 0x6f. ra=----, rb=%rdi, valC = 0x8
IF: Fetched iaddq at 0x79. ra=----, rb=%rsi, valC = 0xffffffffffffffff
IF: Fetched jne at 0x83. ra=----, rb=----, valC = 0x63
IF: Fetched ret at 0x8c. ra=----, rb=----, valC = 0x0
IF: Fetched ret at 0x55. ra=----, rb=----, valC = 0x0
IF: Fetched halt at 0x13. ra=----, rb=----, valC = 0x0
32 instructions executed
Status = HLT
Condition Codes: Z=1 S=0 O=0
Changed Register State:
%rax: 0x0000000000000000 0x0000abcdabcdabcd
%rsp: 0x0000000000000000 0x0000000000000100
%rdi: 0x0000000000000000 0x0000000000000038
%r10: 0x0000000000000000 0x0000a000a000a000
Changed Memory State:
0x00f0: 0x0000000000000000 0x0000000000000055
0x00f8: 0x0000000000000000 0x0000000000000013
ISA Check Succeeds

基准测试

运行基准测试保证指令集原有指令没有被破坏

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
voidsolar@admin:~/csapp_lab/archlab-handout/sim/seq$ cd ../y86-code; make testssim
../seq/ssim -t asum.yo > asum.seq
../seq/ssim -t asumr.yo > asumr.seq
../seq/ssim -t cjr.yo > cjr.seq
../seq/ssim -t j-cc.yo > j-cc.seq
../seq/ssim -t poptest.yo > poptest.seq
../seq/ssim -t pushquestion.yo > pushquestion.seq
../seq/ssim -t pushtest.yo > pushtest.seq
../seq/ssim -t prog1.yo > prog1.seq
../seq/ssim -t prog2.yo > prog2.seq
../seq/ssim -t prog3.yo > prog3.seq
../seq/ssim -t prog4.yo > prog4.seq
../seq/ssim -t prog5.yo > prog5.seq
../seq/ssim -t prog6.yo > prog6.seq
../seq/ssim -t prog7.yo > prog7.seq
../seq/ssim -t prog8.yo > prog8.seq
../seq/ssim -t ret-hazard.yo > ret-hazard.seq
grep "ISA Check" *.seq
asum.seq:ISA Check Succeeds
asumr.seq:ISA Check Succeeds
cjr.seq:ISA Check Succeeds
j-cc.seq:ISA Check Succeeds
poptest.seq:ISA Check Succeeds
prog1.seq:ISA Check Succeeds
prog2.seq:ISA Check Succeeds
prog3.seq:ISA Check Succeeds
prog4.seq:ISA Check Succeeds
prog5.seq:ISA Check Succeeds
prog6.seq:ISA Check Succeeds
prog7.seq:ISA Check Succeeds
prog8.seq:ISA Check Succeeds
pushquestion.seq:ISA Check Succeeds
pushtest.seq:ISA Check Succeeds
ret-hazard.seq:ISA Check Succeeds
rm asum.seq asumr.seq cjr.seq j-cc.seq poptest.seq pushquestion.seq pushtest.seq prog1.seq prog2.seq prog3.seq prog4.seq prog5.seq prog6.seq prog7.seq prog8.seq ret-hazard.seq

回归测试

测试除了iaddq以外的指令

1
2
3
4
5
6
7
8
9
10
11
12
13
voidsolar@admin:~/csapp_lab/archlab-handout/sim/y86-code$ cd ../ptest; make SIM=../seq/ssim
./optest.pl -s ../seq/ssim
Simulating with ../seq/ssim
All 49 ISA Checks Succeed
./jtest.pl -s ../seq/ssim
Simulating with ../seq/ssim
All 64 ISA Checks Succeed
./ctest.pl -s ../seq/ssim
Simulating with ../seq/ssim
All 22 ISA Checks Succeed
./htest.pl -s ../seq/ssim
Simulating with ../seq/ssim
All 600 ISA Checks Succeed

测试自己实现的iaddq

1
2
3
4
5
6
7
8
9
10
11
12
13
voidsolar@admin:~/csapp_lab/archlab-handout/sim/ptest$ cd ../ptest; make SIM=../seq/ssim TFLAGS=-i
./optest.pl -s ../seq/ssim -i
Simulating with ../seq/ssim
All 58 ISA Checks Succeed
./jtest.pl -s ../seq/ssim -i
Simulating with ../seq/ssim
All 96 ISA Checks Succeed
./ctest.pl -s ../seq/ssim -i
Simulating with ../seq/ssim
All 22 ISA Checks Succeed
./htest.pl -s ../seq/ssim -i
Simulating with ../seq/ssim
All 756 ISA Checks Succeed

至此Part B全部完成

Part C

我们需要修改HCL和ncopy来优化程序,通过程序的效率计算分数

首先iaddq是一条效率很高的指令,它能够将两步化为一步。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
/*
 * ncopy - copy len words from src to dst.
 *
 * src:     first word to read.
 * dst:     first word to write.
 * len:     number of words to copy; nothing is copied when len <= 0.
 * returns: how many of the copied words were greater than zero.
 */
word_t ncopy(word_t *src, word_t *dst, word_t len)
{
    word_t count = 0;

    for (; len > 0; len--) {
        word_t val = *src++;

        *dst++ = val;
        if (val > 0)
            count++;
    }

    return count;
}

原汇编代码如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
# You can modify this portion
# Baseline ncopy loop: copies one element per iteration.
# Registers (per the comments below): %rdi = src, %rsi = dst, %rdx = len,
# %rax = count; %r10 holds the current value and is then reused for constants.
# The label Done is defined outside this listing.
# Loop header
xorq %rax,%rax # count = 0;
andq %rdx,%rdx # len <= 0?
jle Done # if so, goto Done:

Loop: mrmovq (%rdi), %r10 # read val from src...
rmmovq %r10, (%rsi) # ...and store it to dst
andq %r10, %r10 # val <= 0?
jle Npos # if so, goto Npos:
irmovq $1, %r10 # constant 1 must be loaded into a register first
addq %r10, %rax # count++
Npos: irmovq $1, %r10 # constant 1 for the decrement
subq %r10, %rdx # len--
irmovq $8, %r10 # constant 8 = size of one element
addq %r10, %rdi # src++
addq %r10, %rsi # dst++
andq %rdx,%rdx # len > 0?
jg Loop # if so, goto Loop:

利用iaddq替换原有的赋值指令。替换后

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
# You can modify this portion
# ncopy loop rewritten with iaddq: copies one element per iteration.
# Registers: %rdi = src, %rsi = dst, %rdx = len, %rax = count, %r10 = val.
# The label Done is defined outside this listing.
# Loop header
        xorq %rax,%rax          # count = 0;
        andq %rdx,%rdx          # len <= 0?
        jle Done                # if so, goto Done:

Loop:
        mrmovq (%rdi), %r10     # read val from src...
        rmmovq %r10, (%rsi)     # ...and store it to dst
        andq %r10, %r10         # val <= 0?
        jle Npos                # if so, goto Npos:
        iaddq $1, %rax          # count++
Npos:
        iaddq $8, %rdi          # src++
        iaddq $8, %rsi          # dst++
        iaddq $-1, %rdx         # len--; done last so that its condition codes
                                # feed the branch (no separate andq needed)
        jg Loop                 # if len > 0, goto Loop:

根据文档的提示,我们尝试使用循环展开对程序进行优化,循环展开通过增加每次迭代计算的元素的数量,减少循环的迭代次数来提升效率。

本文在参考了别人的博文的基础上使用6路循环展开

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
# Loop header
# ncopy body: 6-way unrolled copy loop followed by a one-element tail loop.
# Registers: %rdi = src, %rsi = dst, %rdx = len / elements remaining,
#            %rax = count of positive elements, %r8 = current value.
# Falls through to the code that follows this listing when finished.
        xorq %rax,%rax          # count = 0; do not rely on %rax being 0 on entry
        jmp test                # CC for the first test is produced at test:
Loop:
        mrmovq (%rdi),%r8       # element 0
        rmmovq %r8,(%rsi)
        andq %r8,%r8            # val <= 0?
        jle Loop1
        iaddq $1,%rax           # count++
Loop1:
        mrmovq 8(%rdi),%r8      # element 1
        rmmovq %r8,8(%rsi)
        andq %r8,%r8
        jle Loop2
        iaddq $1,%rax
Loop2:
        mrmovq 16(%rdi),%r8     # element 2
        rmmovq %r8,16(%rsi)
        andq %r8,%r8
        jle Loop3
        iaddq $1,%rax
Loop3:
        mrmovq 24(%rdi),%r8     # element 3
        rmmovq %r8,24(%rsi)
        andq %r8,%r8
        jle Loop4
        iaddq $1,%rax
Loop4:
        mrmovq 32(%rdi),%r8     # element 4
        rmmovq %r8,32(%rsi)
        andq %r8,%r8
        jle Loop5
        iaddq $1,%rax
Loop5:
        mrmovq 40(%rdi),%r8     # element 5
        rmmovq %r8,40(%rsi)
        iaddq $48,%rdi          # src += 6
        iaddq $48,%rsi          # dst += 6
        andq %r8,%r8            # test element 5 after the pointer updates
        jle test
        iaddq $1,%rax
test:
        iaddq $-6, %rdx         # subtract first: are at least 6 elements left?
        jge Loop                # yes: run the 6-way unrolled body
        iaddq $-8,%rdi          # bias both pointers by one element because
        iaddq $-8,%rsi          # test2 advances them before every copy
        iaddq $6, %rdx          # undo the subtraction: %rdx = elements left
        jmp test2               # handle the remainder one at a time
Lore:
        mrmovq (%rdi),%r8
        rmmovq %r8,(%rsi)
        andq %r8,%r8
        jle test2
        iaddq $1,%rax
test2:
        iaddq $8,%rdi           # advance to the next element
        iaddq $8,%rsi
        iaddq $-1, %rdx         # one fewer remaining
        jge Lore                # copy it if there was one left

逻辑简单:每次循环都对6个数进行复制,每次复制就设置一个条件语句判断返回时是否加1,对于剩下的数据每次循环只对1个数进行复制。

此时测试发现对于小数据的CPE值非常大,需要考虑对小数据进行优化。

于是对剩余数据采取3路循环展开。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
# Tail handling for the 6-way unrolled ncopy: 3-way unrolled loop (L) and a
# final 0-2 element remainder. Loop and Done are defined outside this listing.
test:
iaddq $-6, %rdx # subtract first: are at least 6 elements left?
jge Loop # yes: 6-way unrolled body
iaddq $6, %rdx # undo the subtraction: %rdx = elements left
jmp test2 # handle the remainder

# 3-way unrolled copy of elements 0..2 at (%rdi) -> (%rsi)
L:
mrmovq (%rdi),%r8
rmmovq %r8,(%rsi)
andq %r8,%r8 # val <= 0?
jle L1
iaddq $1,%rax # count++
L1:
mrmovq 8(%rdi),%r8
rmmovq %r8,8(%rsi)
andq %r8,%r8
jle L2
iaddq $1,%rax
L2:
mrmovq 16(%rdi),%r8
rmmovq %r8,16(%rsi)
iaddq $24,%rdi # src += 3
iaddq $24,%rsi # dst += 3
andq %r8,%r8 # test the third element after the pointer updates
jle test2
iaddq $1,%rax
test2:
iaddq $-3, %rdx # subtract first: are at least 3 elements left?
jge L
iaddq $2, %rdx # now -1: none left (Done), 0: one left, 1: two left
je R0
jl Done
# Two elements left
mrmovq (%rdi),%r8
rmmovq %r8,(%rsi)
andq %r8,%r8
jle R2
iaddq $1,%rax
R2:
mrmovq 8(%rdi),%r8
rmmovq %r8,8(%rsi)
andq %r8,%r8
jle Done
iaddq $1,%rax
jmp Done
# One element left
R0:
mrmovq (%rdi),%r8
rmmovq %r8,(%rsi)
andq %r8,%r8
jle Done
iaddq $1,%rax

消除气泡

程序多次使用

1
2
mrmovq (%rdi),%r8 # load val from src into %r8
rmmovq %r8,(%rsi) # store %r8 to dst; reads %r8 right after the load

rmmovq 紧接着使用了 mrmovq 刚从内存读出的 %r8,这属于加载/使用(load/use)数据冒险:即使使用转发,也至少会产生一个气泡。

另外一种优化方法是多取一个寄存器,连续进行两次数据复制

1
2
3
4
mrmovq (%rdi), %r8 # load element 0 into %r8
mrmovq 8(%rdi), %r9 # load element 1 into a second register
rmmovq %r8, (%rsi) # store element 0; no longer directly after its load
rmmovq %r9, 8(%rsi) # store element 1

源程序如下

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
# Loop header
# ncopy body: 6-way unrolled main loop, 3-way unrolled tail loop, and a final
# 0-2 element remainder.
# Registers: %rdi = src, %rsi = dst, %rdx = len / elements remaining,
#            %rax = count of positive elements, %r8/%r9 = values being copied.
# Where two elements are handled together, both are loaded before either is
# stored, so the store of %r8 does not directly follow its own load.
# rmmovq does not change the condition codes, so a branch may test CC set by
# an andq that precedes the stores.
# Done is defined outside this listing; NOTE(review): the last instruction is
# expected to fall through to Done - confirm against the full ncopy.ys.
        xorq %rax,%rax          # count = 0; do not rely on %rax being 0 on entry
        jmp test                # CC for the first test is produced at test:
Loop:
        mrmovq (%rdi),%r8       # elements 0 and 1
        mrmovq 8(%rdi),%r9
        andq %r8,%r8            # CC for element 0
        rmmovq %r8,(%rsi)
        rmmovq %r9,8(%rsi)
        jle Loop1               # element 0 <= 0?
        iaddq $1,%rax           # count++
Loop1:
        andq %r9,%r9            # element 1 <= 0?
        jle Loop2
        iaddq $1,%rax
Loop2:
        mrmovq 16(%rdi),%r8     # elements 2 and 3
        mrmovq 24(%rdi),%r9
        andq %r8,%r8
        rmmovq %r8,16(%rsi)
        rmmovq %r9,24(%rsi)
        jle Loop3
        iaddq $1,%rax
Loop3:
        andq %r9,%r9
        jle Loop4
        iaddq $1,%rax
Loop4:
        mrmovq 32(%rdi),%r8     # elements 4 and 5
        mrmovq 40(%rdi),%r9
        andq %r8,%r8
        rmmovq %r8,32(%rsi)
        rmmovq %r9,40(%rsi)
        jle Loop5
        iaddq $1,%rax
Loop5:
        iaddq $48,%rdi          # src += 6
        iaddq $48,%rsi          # dst += 6
        andq %r9,%r9            # element 5 <= 0?
        jle test
        iaddq $1,%rax
test:
        iaddq $-6, %rdx         # subtract first: are at least 6 elements left?
        jge Loop                # yes: 6-way unrolled body
        iaddq $6, %rdx          # undo the subtraction: %rdx = elements left
        jmp test2               # handle the remainder

# 3-way unrolled tail loop
L:
        mrmovq (%rdi),%r8
        andq %r8,%r8
        rmmovq %r8,(%rsi)
        jle L1
        iaddq $1,%rax
L1:
        mrmovq 8(%rdi),%r8
        andq %r8,%r8
        rmmovq %r8,8(%rsi)
        jle L2
        iaddq $1,%rax
L2:
        mrmovq 16(%rdi),%r8
        iaddq $24,%rdi          # src += 3 (placed between the load and its use)
        rmmovq %r8,16(%rsi)     # dst has not been advanced yet
        iaddq $24,%rsi          # dst += 3
        andq %r8,%r8
        jle test2
        iaddq $1,%rax
test2:
        iaddq $-3, %rdx         # subtract first: are at least 3 elements left?
        jge L
        iaddq $2, %rdx          # now -1: none left (Done), 0: one left, 1: two left
        je R0
        jl Done
        mrmovq (%rdi),%r8       # two elements left: load both, then store both
        mrmovq 8(%rdi),%r9
        rmmovq %r8,(%rsi)
        rmmovq %r9,8(%rsi)
        andq %r8,%r8
        jle R2
        iaddq $1,%rax
R2:
        andq %r9,%r9
        jle Done
        iaddq $1,%rax
        jmp Done
R0:
        mrmovq (%rdi),%r8       # one element left
        andq %r8,%r8
        rmmovq %r8,(%rsi)
        jle Done
        iaddq $1,%rax
1
2
Average CPE     8.16
Score 46.7/60.0

总结

  • csapp第四章对于我来讲太难了,但在自己亲手设计指令的过程中能模拟流水线的工作流程并尝试优化,让我这个noob得以一窥处理器体系结构的冰山一角。
  • 本lab同之前的lab一样,诚意满满,yas、yis、ssim、psim等模拟、测试工具一应俱全,整个过程如同游戏闯关一样令人着迷。

csapp arch_lab
http://htwzxwj.github.io/2023/09/10/csapp-arch-lab/
作者
End0rph1n
发布于
2023年9月10日
许可协议