From 4dbe36b134ab7bffa24a334aa676163937247ba7 Mon Sep 17 00:00:00 2001 From: Lucas-Wye Date: Fri, 9 Aug 2024 08:21:21 +0000 Subject: [PATCH] [rtl] support zvk --- configgen/generated/blastoise.json | 3 +- configgen/generated/machamp.json | 3 +- configgen/generated/psyduck.json | 20 +- configgen/generated/sandslash.json | 3 +- configgen/src/Main.scala | 17 +- ipemu/src/TestBench.scala | 2 +- t1/src/Bundles.scala | 4 +- t1/src/Lane.scala | 235 ++++++++++++++++- t1/src/LaneZvk.scala | 47 ++++ t1/src/T1.scala | 35 ++- t1/src/VectorFunctionUnit.scala | 6 +- t1/src/decoder/Decoder.scala | 25 +- t1/src/decoder/InstructionDocumentation.scala | 26 ++ t1/src/decoder/T1DecodePattern.scala | 1 + t1/src/decoder/attribute/isItype.scala | 7 + t1/src/decoder/attribute/isUnsigned0.scala | 26 ++ t1/src/decoder/attribute/isUnsigned1.scala | 26 ++ t1/src/decoder/attribute/isVtype.scala | 16 ++ t1/src/decoder/attribute/isZvk.scala | 56 ++++ t1/src/decoder/attribute/zvkUop.scala | 80 ++++++ t1/src/laneStage/LaneExecutionBridge.scala | 44 +++- t1/src/laneStage/LaneStage1.scala | 241 +++++++++++++++++- t1/src/laneStage/LaneStage3.scala | 32 ++- t1/src/laneStage/SlotTokenManager.scala | 27 +- t1/src/laneStage/ZvkCrossReadUnit.scala | 133 ++++++++++ t1/src/vrf/VRF.scala | 25 ++ 26 files changed, 1099 insertions(+), 41 deletions(-) create mode 100644 t1/src/LaneZvk.scala create mode 100644 t1/src/decoder/attribute/isZvk.scala create mode 100644 t1/src/decoder/attribute/zvkUop.scala create mode 100644 t1/src/laneStage/ZvkCrossReadUnit.scala diff --git a/configgen/generated/blastoise.json b/configgen/generated/blastoise.json index 290ef86c1..88c465075 100644 --- a/configgen/generated/blastoise.json +++ b/configgen/generated/blastoise.json @@ -167,7 +167,8 @@ ] ] ], - "zvbbModuleParameters": [] + "zvbbModuleParameters": [], + "zvkModuleParameters": [] } }, "generator": "org.chipsalliance.t1.rtl.T1" diff --git a/configgen/generated/machamp.json b/configgen/generated/machamp.json index 
ceeaf5e59..865f6c13a 100644 --- a/configgen/generated/machamp.json +++ b/configgen/generated/machamp.json @@ -151,7 +151,8 @@ ] ], "floatModuleParameters": [], - "zvbbModuleParameters": [] + "zvbbModuleParameters": [], + "zvkModuleParameters": [] } }, "generator": "org.chipsalliance.t1.rtl.T1" diff --git a/configgen/generated/psyduck.json b/configgen/generated/psyduck.json index 04a2f3572..6abca9424 100644 --- a/configgen/generated/psyduck.json +++ b/configgen/generated/psyduck.json @@ -4,7 +4,8 @@ "dLen": 256, "extensions": [ "Zve32f", - "Zvbb" + "Zvbb", + "Zvk" ], "t1customInstructions": [], "vrfBankSize": 1, @@ -184,6 +185,23 @@ 3 ] ] + ], + "zvkModuleParameters": [ + [ + { + "parameter": { + "datapathWidth": 32, + "latency": 3 + }, + "generator": "org.chipsalliance.t1.rtl.LaneZvk" + }, + [ + 0, + 1, + 2, + 3 + ] + ] ] } }, diff --git a/configgen/generated/sandslash.json b/configgen/generated/sandslash.json index 688085fe1..25f76a682 100644 --- a/configgen/generated/sandslash.json +++ b/configgen/generated/sandslash.json @@ -151,7 +151,8 @@ ] ], "floatModuleParameters": [], - "zvbbModuleParameters": [] + "zvbbModuleParameters": [], + "zvkModuleParameters": [] } }, "generator": "org.chipsalliance.t1.rtl.T1" diff --git a/configgen/src/Main.scala b/configgen/src/Main.scala index 88e3bc326..d2b155491 100644 --- a/configgen/src/Main.scala +++ b/configgen/src/Main.scala @@ -100,14 +100,15 @@ object Main { Seq(0, 1, 2, 3))), floatModuleParameters = Seq((SerializableModuleGenerator(classOf[LaneFloat], LaneFloatParam(32, 3)), Seq(0, 1, 2, 3))), - zvbbModuleParameters = Seq() + zvbbModuleParameters = Seq(), + zvkModuleParameters = Seq(), ) ) if (doEmit) param.emit(targetFile) param } - // DLEN256 VLEN256; FP; VRF p0rw,p1rw bank1; LSU bank8 beatbyte 8; Zvbb + // DLEN256 VLEN256; FP; VRF p0rw,p1rw bank1; LSU bank8 beatbyte 8; Zvbb; Zvk @main def psyduck( @arg(name = "target-file", short = 't') targetFile: os.Path, @arg(name = "emit", short = 'e', doc = "emit config") 
doEmit: Boolean = true @@ -117,7 +118,7 @@ object Main { val param = T1Parameter( vLen, dLen, - extensions = Seq("Zve32f", "Zvbb"), + extensions = Seq("Zve32f", "Zvbb", "Zvk"), t1customInstructions = Nil, vrfBankSize = 1, vrfRamType = RamType.p0rwp1rw, @@ -151,7 +152,9 @@ object Main { floatModuleParameters = Seq((SerializableModuleGenerator(classOf[LaneFloat], LaneFloatParam(32, 3)), Seq(0, 1, 2, 3))), zvbbModuleParameters = - Seq((SerializableModuleGenerator(classOf[LaneZvbb], LaneZvbbParam(32, 3)), Seq(0, 1, 2, 3))) + Seq((SerializableModuleGenerator(classOf[LaneZvbb], LaneZvbbParam(32, 3)), Seq(0, 1, 2, 3))), + zvkModuleParameters = + Seq((SerializableModuleGenerator(classOf[LaneZvk], LaneZvkParam(32, 3)), Seq(0, 1, 2, 3))), ) ) if (doEmit) param.emit(targetFile) @@ -201,7 +204,8 @@ object Main { ), Seq(0, 1, 2, 3))), floatModuleParameters = Seq(), - zvbbModuleParameters = Seq() // TODO + zvbbModuleParameters = Seq(), + zvkModuleParameters = Seq(), ) ) if (doEmit) param.emit(targetFile) @@ -251,7 +255,8 @@ object Main { ), Seq(0, 1, 2, 3))), floatModuleParameters = Seq(), - zvbbModuleParameters = Seq() // TODO + zvbbModuleParameters = Seq(), + zvkModuleParameters = Seq(), ) ) if (doEmit) param.emit(targetFile) diff --git a/ipemu/src/TestBench.scala b/ipemu/src/TestBench.scala index c3c96b318..e74e47350 100644 --- a/ipemu/src/TestBench.scala +++ b/ipemu/src/TestBench.scala @@ -253,7 +253,7 @@ class TestBench(generator: SerializableModuleGenerator[T1, T1Parameter]) laneProbes.flatMap(laneProbe => laneProbe.slots.map(slot => slot.writeTag === tag.U && slot.writeQueueEnq && slot.writeMask.orR) ) ++ laneProbes.flatMap(laneProbe => - laneProbe.crossWriteProbe.map(cp => cp.bits.writeTag === tag.U && cp.valid && cp.bits.writeMask.orR) + laneProbe.crossWriteProbe.map(cp => cp.bits.writeTag === tag.U && cp.valid && cp.bits.writeMask.orR) // TODO: zvkCrossWriteProbe ) ++ // vrf write from lsu lsuProbe.slots.map(slot => slot.dataInstruction === tag.U && slot.writeValid && 
slot.dataMask.orR) ++ diff --git a/t1/src/Bundles.scala b/t1/src/Bundles.scala index e873f946b..7bf67f298 100644 --- a/t1/src/Bundles.scala +++ b/t1/src/Bundles.scala @@ -619,8 +619,10 @@ class ExecutionUnitRecord(parameter: LaneParameter)(isLastSlot: Boolean) extends val maskForFilter: UInt = UInt(4.W) // false -> lsb of cross read group val executeIndex: Bool = Bool() + val zvkExecuteIndex: Option[UInt] = Option.when(parameter.zvkEnable)(UInt(2.W)) val source: Vec[UInt] = Vec(3, UInt(parameter.datapathWidth.W)) val crossReadSource: Option[UInt] = Option.when(isLastSlot)(UInt((parameter.datapathWidth * 2).W)) + val zvkCrossReadSource: Option[UInt] = Option.when(isLastSlot && parameter.zvkEnable)(UInt((parameter.datapathWidth * 4).W)) /** groupCounter need use to update `Lane.maskFormatResultForGroup` */ val groupCounter: UInt = UInt(parameter.groupNumberBits.W) val sSendResponse: Option[Bool] = Option.when(isLastSlot)(Bool()) @@ -725,4 +727,4 @@ class T1Retire(xLen: Int) extends Bundle { val rd: ValidIO[T1RdRetire] = Valid(new T1RdRetire(xLen)) val csr: ValidIO[T1CSRRetire] = Valid(new T1CSRRetire) val mem: ValidIO[EmptyBundle] = Valid(new EmptyBundle) -} \ No newline at end of file +} diff --git a/t1/src/Lane.scala b/t1/src/Lane.scala index 6b7dec0c2..9388ac96e 100644 --- a/t1/src/Lane.scala +++ b/t1/src/Lane.scala @@ -62,6 +62,7 @@ class LaneProbe(parameter: LaneParameter) extends Bundle { val instructionValid: UInt = UInt(parameter.chainingSize.W) val crossWriteProbe: Vec[ValidIO[LaneWriteProbe]] = Vec(2, Valid(new LaneWriteProbe(parameter.instructionIndexBits))) + val zvkCrossWriteProbe: Option[Vec[ValidIO[LaneWriteProbe]]] = Option.when(parameter.zvkEnable)(Vec(4, Valid(new LaneWriteProbe(parameter.instructionIndexBits)))) val vrfProbe: VRFProbe = new VRFProbe(parameter.vrfParam) } @@ -86,7 +87,9 @@ case class LaneParameter( laneNumber: Int, chainingSize: Int, crossLaneVRFWriteEscapeQueueSize: Int, + crossLaneVRFWriteEscapeZvkQueueSize: Int, fpuEnable: 
Boolean, + zvkEnable: Boolean, portFactor: Int, vrfRamType: RamType, decoderParam: DecoderParam, @@ -132,7 +135,7 @@ case class LaneParameter( * * for each number in table below, it represent a [[datapathWidth]] * {{{ - * lane0 | lane1 | ... | lane8 + * lane0 | lane1 | ... | lane7 * offset0 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 * offset1 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 * offset2 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 @@ -178,7 +181,7 @@ case class LaneParameter( val executionQueueSize: Int = 4 /** Parameter for [[VRF]] */ - def vrfParam: VRFParam = VRFParam(vLen, laneNumber, datapathWidth, chainingSize, portFactor, vrfRamType) + def vrfParam: VRFParam = VRFParam(vLen, laneNumber, datapathWidth, chainingSize, portFactor, zvkEnable, vrfRamType) } /** Instantiate [[Lane]] from [[T1]], @@ -210,8 +213,27 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ * TODO: benchmark the usecase for tuning the Ring Bus width. * find a real world case for using `narrow` and `widen` aggressively. */ + // 0: 0.0 - 0.1 + // 1: 0.2 - 0.3 + // 2: 0.4 - 0.5 + // 3: 0.6 - 0.7 + // 4: 1.0 - 1.1 + // 5: 1.2 - 1.3 + // 6: 1.4 - 1.5 + // 7: 1.6 - 1.7 + + // 0: 0.0 - 0.1 - 0.2 - 0.3 + // 1: 0.4 - 0.5 - 0.6 - 0.7 + // 2: 1.0 - 1.1 - 1.2 - 1.3 + // 3: 1.4 - 1.5 - 1.6 - 1.7 + // 4: 2.0 - 2.1 - 2.2 - 2.3 + // 5: 2.4 - 2.5 - 2.6 - 2.7 + // 6: 3.0 - 3.1 - 3.2 - 3.3 + // 7: 3.4 - 3.5 - 3.6 - 3.7 @public val readBusPort: Vec[RingPort[ReadBusData]] = IO(Vec(2, new RingPort(new ReadBusData(parameter)))) + @public + val zvkReadBusPort: Option[Vec[RingPort[ReadBusData]]] = Option.when(parameter.zvkEnable)(IO(Vec(4, new RingPort(new ReadBusData(parameter))))) /** VRF Write Interface. 
* only used for `narrow` an `widen` @@ -220,6 +242,7 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ */ @public val writeBusPort: Vec[RingPort[WriteBusData]] = IO(Vec(2, new RingPort(new WriteBusData(parameter)))) + @public val zvkWriteBusPort: Option[Vec[RingPort[WriteBusData]]] = Option.when(parameter.zvkEnable)(IO(Vec(4, new RingPort(new WriteBusData(parameter))))) /** request from [[T1.decode]] to [[Lane]]. */ @public @@ -301,7 +324,7 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ @public val loadDataInLSUWriteQueue: UInt = IO(Input(UInt(parameter.chainingSize.W))) - /** How many dataPath will writ by instruction in this lane */ + /** How many dataPath will be written by instruction in this lane */ @public val writeCount: UInt = IO(Input(UInt((parameter.vlMaxBits - log2Ceil(parameter.laneNumber) - log2Ceil(parameter.dataPathByteWidth)).W))) @@ -322,6 +345,9 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ // TODO: remove dontTouch(writeBusPort) + if(parameter.zvkEnable) { + dontTouch(zvkWriteBusPort.get) + } /** VRF instantces.
*/ val vrf: Instance[VRF] = Instantiate(new VRF(parameter.vrfParam)) @@ -380,10 +406,17 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ val allVrfWriteAfterCheck: Seq[VRFWriteRequest] = Seq.tabulate(parameter.chainingSize + 3) { i => RegInit(0.U.asTypeOf(vrfWriteArbiter.head.bits)) } + val zvkAllVrfWriteAfterCheck: Option[Seq[VRFWriteRequest]] = Option.when(parameter.zvkEnable)(Seq.tabulate(parameter.chainingSize + 5) { i => + RegInit(0.U.asTypeOf(vrfWriteArbiter.head.bits)) + }) val afterCheckValid: Seq[Bool] = Seq.tabulate(parameter.chainingSize + 3) { _ => RegInit(false.B)} val afterCheckDequeueReady: Vec[Bool] = Wire(Vec(parameter.chainingSize + 3, Bool())) val afterCheckDequeueFire: Seq[Bool] = afterCheckValid.zip(afterCheckDequeueReady).map {case (v, r) => v && r} + val zvkAfterCheckValid: Option[Seq[Bool]] = Option.when(parameter.zvkEnable)(Seq.tabulate(parameter.chainingSize + 5) { _ => RegInit(false.B)}) + val zvkAfterCheckDequeueReady: Option[Vec[Bool]] = Option.when(parameter.zvkEnable)(Wire(Vec(parameter.chainingSize + 5, Bool()))) + val zvkAfterCheckDequeueFire: Option[Seq[Bool]] = Option.when(parameter.zvkEnable)(zvkAfterCheckValid.get.zip(zvkAfterCheckDequeueReady.get).map {case (v, r) => v && r}) + /** for each slot, assert when it is asking [[T1]] to change mask */ val slotMaskRequestVec: Vec[ValidIO[UInt]] = Wire( Vec( @@ -442,8 +475,12 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ val readCheckRequestVec: Vec[VRFReadRequest] = Wire(Vec(parameter.chainingSize * 3 + 2, new VRFReadRequest(parameter.vrfParam.regNumBits, parameter.vrfOffsetBits, parameter.instructionIndexBits) )) + val zvkReadCheckRequestVec: Option[Vec[VRFReadRequest]] = Option.when(parameter.zvkEnable)(Wire(Vec(parameter.chainingSize * 3 + 4, + new VRFReadRequest(parameter.vrfParam.regNumBits, parameter.vrfOffsetBits, parameter.instructionIndexBits) + ))) val readCheckResult: Vec[Bool] = 
Wire(Vec(parameter.chainingSize * 3 + 2, Bool())) + val zvkReadCheckResult: Option[Vec[Bool]] = Option.when(parameter.zvkEnable)(Wire(Vec(parameter.chainingSize * 3 + 4, Bool()))) /** signal used for prohibiting slots to access VRF. * a slot will become inactive when: @@ -467,7 +504,7 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ val slotCanShift: Vec[Bool] = Wire(Vec(parameter.chainingSize, Bool())) /** Which data group is waiting for the result of the cross-lane read */ - val readBusDequeueGroup: UInt = Wire(UInt(parameter.groupNumberBits.W)) + val readBusDequeueGroup: UInt = Wire(UInt(parameter.groupNumberBits.W)) // TODO: readBusDequeueGroup is currently unused /** enqueue valid for execution unit */ val executeEnqueueValid: Vec[Bool] = Wire(Vec(parameter.chainingSize, Bool())) @@ -524,6 +561,18 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ pipe = true ) )) + val zvkCrossLaneWriteQueue: Option[Seq[Queue[VRFWriteRequest]]] = Option.when(parameter.zvkEnable)(Seq.tabulate(4)(i => Module( + new Queue( + new VRFWriteRequest( + parameter.vrfParam.regNumBits, + parameter.vrfOffsetBits, + parameter.instructionIndexBits, + parameter.datapathWidth + ), + parameter.crossLaneVRFWriteEscapeZvkQueueSize, + pipe = true + ) + ))) val maskedWriteUnit: Instance[MaskedWrite] = Instantiate(new MaskedWrite(parameter)) val tokenManager: Instance[SlotTokenManager] = Instantiate(new SlotTokenManager(parameter)) slotControl.zipWithIndex.foreach { @@ -671,6 +720,13 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ readCheckRequestVec((parameter.chainingSize - index - 1) * 3 + portIndex) := stage1.vrfCheckRequest(portIndex) stage1.checkResult(portIndex) := readCheckResult((parameter.chainingSize - index - 1) * 3 + portIndex) } + val zvkCheckSize = if (isLastSlot && parameter.zvkEnable) 7 else 3 + if(parameter.zvkEnable) { + Seq.tabulate(zvkCheckSize){ portIndex => + 
zvkReadCheckRequestVec.get((parameter.chainingSize - index - 1) * 3 + portIndex) := stage1.zvkVrfCheckRequest.get(portIndex) + stage1.zvkCheckResult.get(portIndex) := zvkReadCheckResult.get((parameter.chainingSize - index - 1) * 3 + portIndex) + } + } // connect cross read bus if(isLastSlot) { val tokenSize = parameter.crossLaneVRFWriteEscapeQueueSize @@ -695,6 +751,30 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ // dequeue to cross read unit stage1.readBusDequeue.get(portIndex) <> queue.io.deq } + val zvKTokenSize = parameter.crossLaneVRFWriteEscapeZvkQueueSize + if(parameter.zvkEnable) { + zvkReadBusPort.get.zipWithIndex.foreach {case (readPort, portIndex) => + // tx + val tokenReg = RegInit(0.U(log2Ceil(zvKTokenSize + 1).W)) + val tokenReady: Bool = tokenReg =/= zvKTokenSize.U + stage1.zvkReadBusRequest.get(portIndex).ready := tokenReady + readPort.deq.valid := stage1.zvkReadBusRequest.get(portIndex).valid && tokenReady + readPort.deq.bits := stage1.zvkReadBusRequest.get(portIndex).bits + val tokenUpdate = Mux(readPort.deq.valid, 1.U, -1.S(tokenReg.getWidth.W).asUInt) + when(readPort.deq.valid ^ readPort.deqRelease) { + tokenReg := tokenReg + tokenUpdate + } + // rx + // rx queue + val queue = Module(new Queue(chiselTypeOf(readPort.deq.bits), zvKTokenSize, pipe=true)) + queue.io.enq.valid := readPort.enq.valid + queue.io.enq.bits := readPort.enq.bits + readPort.enqRelease := queue.io.deq.fire + assert(queue.io.enq.ready || !readPort.enq.valid) + // dequeue to cross read unit + stage1.zvkReadBusDequeue.get(portIndex) <> queue.io.deq + } + } // cross write writeBusPort.zipWithIndex.foreach {case (writePort, portIndex) => @@ -710,6 +790,21 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ tokenReg := tokenReg + tokenUpdate } } + if(parameter.zvkEnable) { + zvkWriteBusPort.get.zipWithIndex.foreach {case (writePort, portIndex) => + val tokenReg = RegInit(0.U(log2Ceil(zvKTokenSize + 1).W)) + val 
tokenReady: Bool = tokenReg =/= zvKTokenSize.U + writePort.deq.valid := stage3.zvkCrossWritePort.get(portIndex).valid && tokenReady + writePort.deq.bits := stage3.zvkCrossWritePort.get(portIndex).bits + stage3.zvkCrossWritePort.get(portIndex).ready := tokenReady + + // update token + val tokenUpdate = Mux(writePort.deq.valid, 1.U, -1.S(tokenReg.getWidth.W).asUInt) + when(writePort.deq.valid ^ writePort.deqRelease) { + tokenReg := tokenReg + tokenUpdate + } + } + } } stage2.enqueue.valid := stage1.dequeue.valid && executionUnit.enqueue.ready @@ -742,6 +837,9 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ executionUnit.enqueue.bits.crossReadSource.zip(stage1.dequeue.bits.crossReadSource).foreach { case (sink, source) => sink := source } + executionUnit.enqueue.bits.zvkCrossReadSource.zip(stage1.dequeue.bits.zvkCrossReadSource).foreach { case (sink, source) => + sink := source + } executionUnit.ffoByOtherLanes := ffoRecord.ffoByOtherLanes executionUnit.selfCompleted := ffoRecord.selfCompleted @@ -788,6 +886,9 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ stage3.enqueue.bits.pipeData := stage2.dequeue.bits.pipeData.getOrElse(DontCare) stage3.enqueue.bits.ffoIndex := executionUnit.dequeue.bits.ffoIndex executionUnit.dequeue.bits.crossWriteData.foreach(data => stage3.enqueue.bits.crossWriteData := data) + if(parameter.zvkEnable) { + executionUnit.dequeue.bits.zvkCrossWriteData.foreach(data => stage3.enqueue.bits.zvkCrossWriteData.get := data) + } stage2.dequeue.bits.sSendResponse.foreach(_ => stage3.enqueue.bits.sSendResponse := _) executionUnit.dequeue.bits.ffoSuccess.foreach(_ => stage3.enqueue.bits.ffoSuccess := _) @@ -835,7 +936,6 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ // probeWire.slots(index).probeStage1 := ??? 
} - // cross write bus <> write queue crossLaneWriteQueue.zipWithIndex.foreach {case (queue, index) => val port = writeBusPort(index) @@ -853,6 +953,24 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ assert(queue.io.enq.ready || !port.enq.valid) port.enqRelease := queue.io.deq.fire } + if(parameter.zvkEnable) { + zvkCrossLaneWriteQueue.get.zipWithIndex.foreach {case (queue, index) => + val port = zvkWriteBusPort.get(index) + // ((counter << 1) >> parameter.vrfParam.vrfOffsetBits).low(3) + val registerIncreaseBase = parameter.vrfParam.vrfOffsetBits - 1 + queue.io.enq.valid := port.enq.valid + queue.io.enq.bits.vd := + // 3: 8 reg => log(2, 8) + slotControl.head.laneRequest.vd + port.enq.bits.counter(registerIncreaseBase + 3 - 1, registerIncreaseBase) + queue.io.enq.bits.offset := port.enq.bits.counter ## index.U(2.W)(0) + queue.io.enq.bits.data := port.enq.bits.data + queue.io.enq.bits.last := DontCare + queue.io.enq.bits.instructionIndex := port.enq.bits.instructionIndex + queue.io.enq.bits.mask := FillInterleaved(2, port.enq.bits.mask) + assert(queue.io.enq.ready || !port.enq.valid) + port.enqRelease := queue.io.deq.fire + } + } val vfus: Seq[Instance[VFUModule]] = instantiateVFU(parameter.vfuInstantiateParameter)( requestVec, @@ -870,7 +988,7 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ val queueBeforeMaskWrite: Queue[VRFWriteRequest] = Module(new Queue(chiselTypeOf(maskedWriteUnit.enqueue.bits), entries = 1, pipe = true)) val writeSelect: UInt = Wire(UInt((parameter.chainingSize + 3).W)) - val writeCavitation: UInt = VecInit(allVrfWriteAfterCheck.map(_.mask === 0.U)).asUInt + val writeCavitation: UInt = VecInit(allVrfWriteAfterCheck.map(_.mask === 0.U)).asUInt // TODO // 处理 rf { @@ -899,7 +1017,13 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ } // all vrf write - val allVrfWrite: Seq[DecoupledIO[VRFWriteRequest]] = vrfWriteArbiter ++ 
crossLaneWriteQueue.map(_.io.deq) + val allVrfWrite: Seq[DecoupledIO[VRFWriteRequest]] = vrfWriteArbiter ++ crossLaneWriteQueue.map(_.io.deq) ++ { + if(parameter.zvkEnable) { + zvkCrossLaneWriteQueue.get.map(_.io.deq) + } else { + Seq() + } + } // check all write vrf.writeCheck.zip(allVrfWrite).foreach {case (check, write) => check.vd := write.bits.vd @@ -909,6 +1033,10 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ vrf.readCheck.zip(readCheckRequestVec).foreach{case (sink, source) => sink := source} readCheckResult.zip(vrf.readCheckResult).foreach{case (sink, source) => sink := source} + if(parameter.zvkEnable) { + vrf.zvkReadCheck.get.zip(zvkReadCheckRequestVec.get).foreach{case (sink, source) => sink := source} + zvkReadCheckResult.get.zip(vrf.zvkReadCheckResult.get).foreach{case (sink, source) => sink := source} + } allVrfWriteAfterCheck.zipWithIndex.foreach { case (req, i) => val check = vrf.writeAllow(i) @@ -923,16 +1051,59 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ afterCheckValid(i) := enqFire } } + if(parameter.zvkEnable) { + zvkAllVrfWriteAfterCheck.get.zipWithIndex.foreach { case (req, i) => + val check = { + if(i < 7) { + vrf.writeAllow(i) + } else { + false.B + } + } + val enqReady = check && (!zvkAfterCheckValid.get(i) || zvkAfterCheckDequeueReady.get(i)) + val enqFire = enqReady && allVrfWrite(i).valid + allVrfWrite(i).ready := enqReady + when(enqFire) { + req := allVrfWrite(i).bits + } + val deqFire = zvkAfterCheckDequeueFire.get(i) + when(deqFire ^ enqFire) { + zvkAfterCheckValid.get(i) := enqFire + } + } + } // Arbiter writeSelect := ffo(VecInit(afterCheckValid).asUInt & (~writeCavitation).asUInt) afterCheckDequeueReady.zipWithIndex.foreach { case (p, i) => p := (writeSelect(i) && queueBeforeMaskWrite.io.enq.ready) || writeCavitation(i) } + if(parameter.zvkEnable) { + zvkAfterCheckDequeueReady.get.zipWithIndex.foreach { case (p, i) => + p := { + if(i < 6) { + 
(writeSelect(i) && queueBeforeMaskWrite.io.enq.ready) || writeCavitation(i) + } else { + (queueBeforeMaskWrite.io.enq.ready) + } + } + } + } maskedWriteUnit.enqueue <> queueBeforeMaskWrite.io.deq queueBeforeMaskWrite.io.enq.valid := writeSelect.orR - queueBeforeMaskWrite.io.enq.bits := Mux1H(writeSelect, allVrfWriteAfterCheck) + + queueBeforeMaskWrite.io.enq.bits := { + // if(parameter.zvkEnable) { + // Mux( + // laneRequest.bits.decodeResult(Decoder.zvk), + // Mux1H(writeSelect, zvkAllVrfWriteAfterCheck.get), + // Mux1H(writeSelect, allVrfWriteAfterCheck) + // ) + // } else { + Mux1H(writeSelect, allVrfWriteAfterCheck) + // } + } // TODO vrf.write <> maskedWriteUnit.dequeue readBeforeMaskedWrite <> maskedWriteUnit.vrfReadRequest @@ -1192,6 +1363,12 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ rpt.valid := afterCheckDequeueFire(parameter.chainingSize + 1 + rptIndex) rpt.bits := allVrfWriteAfterCheck(parameter.chainingSize + 1 + rptIndex).instructionIndex } + if(parameter.zvkEnable) { + tokenManager.zvkCrossWriteReports.get.zipWithIndex.foreach {case (rpt, rptIndex) => + rpt.valid := zvkAfterCheckDequeueFire.get(parameter.chainingSize + 1 + rptIndex) + rpt.bits := zvkAllVrfWriteAfterCheck.get(parameter.chainingSize + 1 + rptIndex).instructionIndex + } + } // todo: add mask unit write token tokenManager.responseReport.valid := laneResponse.valid tokenManager.responseReport.bits := laneResponse.bits.instructionIndex @@ -1209,8 +1386,21 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ // slot write tokenManager.slotWriteReport.zipWithIndex.foreach {case (rpt, rptIndex) => // All masks are also removed here - rpt.valid := afterCheckDequeueFire(rptIndex) - rpt.bits := allVrfWriteAfterCheck(rptIndex).instructionIndex + if(parameter.zvkEnable) { + rpt.valid := Mux( + laneRequest.bits.decodeResult(Decoder.zvk), + zvkAfterCheckDequeueFire.get(rptIndex), + afterCheckDequeueFire(rptIndex) + ) + rpt.bits 
:= Mux( + laneRequest.bits.decodeResult(Decoder.zvk), + zvkAllVrfWriteAfterCheck.get(rptIndex).instructionIndex, + allVrfWriteAfterCheck(rptIndex).instructionIndex + ) + } else { + rpt.valid := afterCheckDequeueFire(rptIndex) + rpt.bits := allVrfWriteAfterCheck(rptIndex).instructionIndex + } } tokenManager.writePipeEnqReport.valid := queueBeforeMaskWrite.io.enq.fire @@ -1223,8 +1413,22 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ tokenManager.topWriteEnq.valid := vrfWriteChannel.fire tokenManager.topWriteEnq.bits := vrfWriteChannel.bits.instructionIndex - tokenManager.topWriteDeq.valid := afterCheckDequeueFire(parameter.chainingSize) - tokenManager.topWriteDeq.bits := allVrfWriteAfterCheck(parameter.chainingSize).instructionIndex + tokenManager.topWriteDeq.valid := { + if(parameter.zvkEnable) { + zvkAfterCheckDequeueFire.get(parameter.chainingSize) + } else { + afterCheckDequeueFire(parameter.chainingSize) + } + } + if(parameter.zvkEnable) { + tokenManager.topWriteDeq.bits := Mux( + laneRequest.bits.decodeResult(Decoder.zvk), + zvkAllVrfWriteAfterCheck.get(parameter.chainingSize).instructionIndex, + allVrfWriteAfterCheck(parameter.chainingSize).instructionIndex + ) + } else { + tokenManager.topWriteDeq.bits := allVrfWriteAfterCheck(parameter.chainingSize).instructionIndex + } // probe wire probeWire.laneRequestStall := laneRequest.valid && !laneRequest.ready @@ -1236,5 +1440,12 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ pb.bits.writeTag := port.deq.bits.instructionIndex pb.bits.writeMask := port.deq.bits.mask } + if(parameter.zvkEnable) { + probeWire.zvkCrossWriteProbe.get.zip(zvkWriteBusPort.get).foreach {case (pb, port) => + pb.valid := port.deq.valid + pb.bits.writeTag := port.deq.bits.instructionIndex + pb.bits.writeMask := port.deq.bits.mask + } + } probeWire.vrfProbe := probe.read(vrf.vrfProbe) } diff --git a/t1/src/LaneZvk.scala b/t1/src/LaneZvk.scala new file mode 100644 index 
000000000..87933c74c --- /dev/null +++ b/t1/src/LaneZvk.scala @@ -0,0 +1,47 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: 2022 Jiuyang Liu + +package org.chipsalliance.t1.rtl + +import chisel3.experimental.hierarchy.instantiable +import chisel3._ +import chisel3.experimental.{SerializableModule, SerializableModuleParameter} +import chisel3.util._ +import org.chipsalliance.t1.rtl.decoder.{BoolField, Decoder} + +object LaneZvkParam { + implicit def rw: upickle.default.ReadWriter[LaneZvkParam] = upickle.default.macroRW +} + +case class LaneZvkParam(datapathWidth: Int, latency: Int) extends VFUParameter with SerializableModuleParameter { + val inputBundle = new LaneZvkRequest(datapathWidth) + val decodeField: BoolField = Decoder.zvk // NOTE(review): presumably the decode bit that selects this unit - confirm in VFUParameter + val outputBundle = new LaneZvkResponse(datapathWidth) + override val NeedSplit: Boolean = false +} + +class LaneZvkRequest(datapathWidth: Int) extends VFUPipeBundle { + val src = Vec(3, UInt(datapathWidth.W)) // three source operands, each datapathWidth bits wide + val opcode = UInt(4.W) // 4-bit operation select; NOTE(review): presumably the zvk uop encoding from Decoder - confirm + val vSew = UInt(2.W) + val shifterSize = UInt(log2Ceil(datapathWidth).W) +} + +class LaneZvkResponse(datapathWidth: Int) extends VFUPipeBundle { + val data = UInt(datapathWidth.W) +} + +@instantiable +class LaneZvk(val parameter: LaneZvkParam) + extends VFUModule(parameter) with SerializableModule[LaneZvkParam]{ + val response: LaneZvkResponse = Wire(new LaneZvkResponse(parameter.datapathWidth)) + val request : LaneZvkRequest = connectIO(response).asTypeOf(parameter.inputBundle) + + val src: UInt = request.src(1) // vs2 + val rs: UInt = request.src(0) // vs1 or rs1; not referenced anywhere else in this module yet + val vSew: UInt = UIntToOH(request.vSew) // sew = 0, 1, 2 (one-hot encoded here); not referenced anywhere else in this module yet + + response.data := src // currently forwards src (request.src(1)) unchanged; the opcode-based Mux1H below is still commented out + // response.data := Mux1H(UIntToOH(request.opcode), Seq( )) +} + diff --git a/t1/src/T1.scala b/t1/src/T1.scala index afc7ebb68..0770eb69b 100644 --- a/t1/src/T1.scala +++ b/t1/src/T1.scala @@ -122,6 +122,14 @@ case class T1Parameter( instruction => instruction.instructionSet.name match { case "rv_v" => true case "rv_zvbb" => if
(zvbbEnable) true else false + // Zvk + case "rv_zvkg" => if (zvkEnable) true else false + // case "rv_zvkn" => if (zvkEnable) true else false // TODO: no implementations for SEW=64 + case "rv_zvkned" => if (zvkEnable) true else false + case "rv_zvknha" => if (zvkEnable) true else false + // case "rv_zvknhb" => if (zvkEnable) true else false // TODO: no implementations for SEW=64 + case "rv_zvksed" => if (zvkEnable) true else false + case "rv_zvksh" => if (zvkEnable) true else false case _ => false }} ++ t1customInstructions.map(_.instruction) @@ -132,7 +140,7 @@ case class T1Parameter( } } - require(extensions.forall(Seq("Zve32x", "Zve32f", "Zvbb").contains), "unsupported extension.") + require(extensions.forall(Seq("Zve32x", "Zve32f", "Zvbb", "Zvk").contains), "unsupported extension.") // TODO: require bank not overlap /** xLen of T1, we currently only support 32. */ val xLen: Int = 32 @@ -143,15 +151,19 @@ case class T1Parameter( /** TODO: configure it. */ val instructionQueueSize: Int = 4 - /** crosslane write token size */ + /** crosslane write token size, unclear how many would be good */ val vrfWriteQueueSize: Int = 4 + val vrfWriteZvkQueueSize: Int = 8 /** does t1 has floating datapath? */ val fpuEnable: Boolean = extensions.contains("Zve32f") - /** support of zvbb */ + /** support of Zvbb */ lazy val zvbbEnable: Boolean = extensions.contains("Zvbb") + /** support of Zvk */ + lazy val zvkEnable: Boolean = extensions.contains("Zvk") + /** how many chaining does T1 support, this is not a parameter yet. */ val chainingSize: Int = 4 @@ -225,7 +237,7 @@ case class T1Parameter( // and the values are their respective delays. val crossLaneConnectCycles: Seq[Seq[Int]] = Seq.tabulate(laneNumber)(_ => Seq(1, 1)) - val decoderParam: DecoderParam = DecoderParam(fpuEnable, zvbbEnable, allInstructions) + val decoderParam: DecoderParam = DecoderParam(fpuEnable, zvbbEnable, zvkEnable, allInstructions) /** paraemter for AXI4. 
*/ val axi4BundleParameter: AXI4BundleParameter = AXI4BundleParameter( @@ -261,7 +273,9 @@ case class T1Parameter( laneNumber = laneNumber, chainingSize = chainingSize, crossLaneVRFWriteEscapeQueueSize = vrfWriteQueueSize, + crossLaneVRFWriteEscapeZvkQueueSize = vrfWriteZvkQueueSize, fpuEnable = fpuEnable, + zvkEnable = zvkEnable, portFactor = vrfBankSize, vrfRamType = vrfRamType, decoderParam = decoderParam, @@ -287,7 +301,7 @@ case class T1Parameter( axi4BundleParameter = axi4BundleParameter, name = "main" ) - def vrfParam: VRFParam = VRFParam(vLen, laneNumber, datapathWidth, chainingSize, vrfBankSize, vrfRamType) + def vrfParam: VRFParam = VRFParam(vLen, laneNumber, datapathWidth, chainingSize, vrfBankSize, zvkEnable, vrfRamType) require(xLen == datapathWidth) def adderParam: LaneAdderParam = LaneAdderParam(datapathWidth, 0) } @@ -1628,6 +1642,17 @@ class T1(val parameter: T1Parameter) laneVec(index).readBusPort(portIndex).enq ) + if (parameter.zvkEnable) { // zvkReadBusPort is None unless Zvk is enabled, so .get must be guarded. NOTE(review): portIndex appears to cover only the 2 regular cross-lane ports, leaving zvk ports 2-3 unwired - confirm + laneVec(readSourceIndex).zvkReadBusPort.get(readSourcePort).deqRelease := Pipe( + laneVec(index).zvkReadBusPort.get(portIndex).enqRelease, + 0.U.asTypeOf(new EmptyBundle), + cycle + ).valid + connectWithShifter(cycle)( + laneVec(readSourceIndex).zvkReadBusPort.get(readSourcePort).deq, + laneVec(index).zvkReadBusPort.get(portIndex).enq + ) + } // write connect laneVec(index).writeBusPort(portIndex).deqRelease := Pipe( laneVec(readSourceIndex).writeBusPort(readSourcePort).enqRelease, diff --git a/t1/src/VectorFunctionUnit.scala b/t1/src/VectorFunctionUnit.scala index cf06a66af..cee525256 100644 --- a/t1/src/VectorFunctionUnit.scala +++ b/t1/src/VectorFunctionUnit.scala @@ -106,7 +106,8 @@ case class VFUInstantiateParameter( divfpModuleParameters: Seq[(SerializableModuleGenerator[LaneDivFP, LaneDivFPParam], Seq[Int])], otherModuleParameters: Seq[(SerializableModuleGenerator[OtherUnit, OtherUnitParam], Seq[Int])], floatModuleParameters: Seq[(SerializableModuleGenerator[LaneFloat, LaneFloatParam], Seq[Int])], - zvbbModuleParameters:
Seq[(SerializableModuleGenerator[LaneZvbb, LaneZvbbParam], Seq[Int])] + zvbbModuleParameters: Seq[(SerializableModuleGenerator[LaneZvbb, LaneZvbbParam], Seq[Int])], + zvkModuleParameters: Seq[(SerializableModuleGenerator[LaneZvk, LaneZvkParam], Seq[Int])], ) { val genVec: Seq[(SerializableModuleGenerator[_ <: VFUModule, _ <: VFUParameter], Seq[Int])] = logicModuleParameters ++ @@ -117,7 +118,8 @@ case class VFUInstantiateParameter( divfpModuleParameters ++ otherModuleParameters ++ floatModuleParameters ++ - zvbbModuleParameters + zvbbModuleParameters ++ + zvkModuleParameters genVec.foreach { case (_, connect) => connect.foreach(connectIndex => require(connectIndex < slotCount)) diff --git a/t1/src/decoder/Decoder.scala b/t1/src/decoder/Decoder.scala index 3a0299389..112e34f74 100644 --- a/t1/src/decoder/Decoder.scala +++ b/t1/src/decoder/Decoder.scala @@ -13,7 +13,7 @@ import org.chipsalliance.t1.rtl.decoder.attribute._ object DecoderParam { implicit def rwP: upickle.default.ReadWriter[DecoderParam] = upickle.default.macroRW } -case class DecoderParam(fpuEnable: Boolean, zvbbEnable: Boolean, allInstructions: Seq[Instruction]) +case class DecoderParam(fpuEnable: Boolean, zvbbEnable: Boolean, zvkEnable: Boolean, allInstructions: Seq[Instruction]) trait T1DecodeFiled[D <: Data] extends DecodeField[T1DecodePattern, D] with FieldName @@ -225,6 +225,10 @@ object Decoder { override def getTriState(pattern: T1DecodePattern): TriState = pattern.isZvbb.value } + object zvk extends BoolField { + override def getTriState(pattern: T1DecodePattern): TriState = pattern.isZvk.value + } + object topUop extends T1TopUopField { override def genTable(pattern: T1DecodePattern): BitPat = pattern.topUop.value match { case _: TopT0.type => BitPat("b000") @@ -345,6 +349,19 @@ object Decoder { case _: zvbbUop8.type => BitPat("b1000") // andn case _ => BitPat.dontCare(4) } + case zvkCase: ZvkUOPType => + zvkCase match { + case _: zvkUop0.type => BitPat("b0000") // + case _: zvkUop1.type => 
BitPat("b0001") // + case _: zvkUop2.type => BitPat("b0010") // + case _: zvkUop3.type => BitPat("b0011") // + case _: zvkUop4.type => BitPat("b0100") // + case _: zvkUop5.type => BitPat("b0101") // + case _: zvkUop6.type => BitPat("b0110") // + case _: zvkUop7.type => BitPat("b0111") // + case _: zvkUop8.type => BitPat("b1000") // + case _ => BitPat.dontCare(4) + } case _ => BitPat.dontCare(4) } } @@ -422,6 +439,12 @@ object Decoder { zvbb, ) else Seq() + } ++ { + if (param.zvkEnable) + Seq( + zvk, + ) + else Seq() } def allDecodePattern(param: DecoderParam): Seq[T1DecodePattern] = param.allInstructions.map(T1DecodePattern(_, param)).toSeq.sortBy(_.instruction.name) diff --git a/t1/src/decoder/InstructionDocumentation.scala b/t1/src/decoder/InstructionDocumentation.scala index 86c5a7e35..b506c61c3 100644 --- a/t1/src/decoder/InstructionDocumentation.scala +++ b/t1/src/decoder/InstructionDocumentation.scala @@ -439,5 +439,31 @@ case class InstructionDocumentation(instruction: Instruction, param: DecoderPara case "vwsll.vv" => "TODO!" case "vwsll.vx" => "TODO!" case "vwsll.vi" => "TODO!" + // rv_zvkg + case "vghsh.vv" => "TODO!" + case "vgmul.vv" => "TODO!" + // rv_zvkned + case "vaesdf.vv" => "TODO!" + case "vaesdf.vs" => "TODO!" + case "vaesdm.vv" => "TODO!" + case "vaesdm.vs" => "TODO!" + case "vaesef.vv" => "TODO!" + case "vaesef.vs" => "TODO!" + case "vaesem.vv" => "TODO!" + case "vaesem.vs" => "TODO!" + case "vaesz.vs" => "TODO!" + case "vaeskf1.vi" => "TODO!" + case "vaeskf2.vi" => "TODO!" + // rv_zvknha + case "vsha2ms.vv" => "TODO!" + case "vsha2ch.vv" => "TODO!" + case "vsha2cl.vv" => "TODO!" + // rv_zvksed + case "vsm4k.vi" => "TODO!" + case "vsm4r.vv" => "TODO!" + case "vsm4r.vs" => "TODO!" + // rv_zvksh + case "vsm3c.vi" => "TODO!" + case "vsm3me.vv" => "TODO!" 
} } diff --git a/t1/src/decoder/T1DecodePattern.scala b/t1/src/decoder/T1DecodePattern.scala index 5c7d10733..3b7d9b3a7 100644 --- a/t1/src/decoder/T1DecodePattern.scala +++ b/t1/src/decoder/T1DecodePattern.scala @@ -108,6 +108,7 @@ case class T1DecodePattern(instruction: Instruction, param: DecoderParam) extend def isVwmacc: isVwmacc = attribute.isVwmacc(this) def isWidenreduce: isWidenreduce = attribute.isWidenreduce(this) def isZvbb: isZvbb = attribute.isZvbb(this) + def isZvk: isZvk = attribute.isZvk(this) def fpExecutionType: FpExecutionType.Type = attribute.FpExecutionType(this) def topUop: TopUop = attribute.TopUop(this) def decoderUop: DecoderUop = attribute.DecoderUop(this) diff --git a/t1/src/decoder/attribute/isItype.scala b/t1/src/decoder/attribute/isItype.scala index 5ba9baf2e..c3db64750 100644 --- a/t1/src/decoder/attribute/isItype.scala +++ b/t1/src/decoder/attribute/isItype.scala @@ -54,6 +54,13 @@ object isItype { // rv_zvbb "vror.vi", "vwsll.vi", + // rv_zvkned + "vaeskf1.vi", + "vaeskf2.vi", + // rv_zvksed + "vsm4k.vi", + // rv_zvksh + "vsm3c.vi", ) allMatched.contains(t1DecodePattern.instruction.name) } diff --git a/t1/src/decoder/attribute/isUnsigned0.scala b/t1/src/decoder/attribute/isUnsigned0.scala index fb041c3c7..29b2bf9d3 100644 --- a/t1/src/decoder/attribute/isUnsigned0.scala +++ b/t1/src/decoder/attribute/isUnsigned0.scala @@ -146,6 +146,32 @@ object isUnsigned0 { "vwsll.vv", "vwsll.vx", "vwsll.vi", + // rv_zvkg + "vghsh.vv", + "vgmul.vv", + // rv_zvkned + "vaesdf.vv", + "vaesdf.vs", + "vaesdm.vv", + "vaesdm.vs", + "vaesef.vv", + "vaesef.vs", + "vaesem.vv", + "vaesem.vs", + "vaesz.vs", + "vaeskf1.vi", + "vaeskf2.vi", + // rv_zvknha + "vsha2ms.vv", + "vsha2ch.vv", + "vsha2cl.vv", + // rv_zvksed + "vsm4k.vi", + "vsm4r.vv", + "vsm4r.vs", + // rv_zvksh + "vsm3c.vi", + "vsm3me.vv", ) allMatched.contains(t1DecodePattern.instruction.name) } diff --git a/t1/src/decoder/attribute/isUnsigned1.scala b/t1/src/decoder/attribute/isUnsigned1.scala 
index cf4f517a0..595ecfbab 100644 --- a/t1/src/decoder/attribute/isUnsigned1.scala +++ b/t1/src/decoder/attribute/isUnsigned1.scala @@ -118,6 +118,32 @@ object isUnsigned1 { "vwsll.vv", "vwsll.vx", "vwsll.vi", + // rv_zvkg + "vghsh.vv", + "vgmul.vv", + // rv_zvkned + "vaesdf.vv", + "vaesdf.vs", + "vaesdm.vv", + "vaesdm.vs", + "vaesef.vv", + "vaesef.vs", + "vaesem.vv", + "vaesem.vs", + "vaesz.vs", + "vaeskf1.vi", + "vaeskf2.vi", + // rv_zvknha + "vsha2ms.vv", + "vsha2ch.vv", + "vsha2cl.vv", + // rv_zvksed + "vsm4k.vi", + "vsm4r.vv", + "vsm4r.vs", + // rv_zvksh + "vsm3c.vi", + "vsm3me.vv", ) allMatched.contains(t1DecodePattern.instruction.name) } diff --git a/t1/src/decoder/attribute/isVtype.scala b/t1/src/decoder/attribute/isVtype.scala index 7649d715a..0ecb480e5 100644 --- a/t1/src/decoder/attribute/isVtype.scala +++ b/t1/src/decoder/attribute/isVtype.scala @@ -186,6 +186,22 @@ object isVtype { "vrol.vv", "vror.vv", "vwsll.vv", + // rv_zvkg + "vghsh.vv", + "vgmul.vv", + // rv_zvkned + "vaesdf.vv", + "vaesdm.vv", + "vaesef.vv", + "vaesem.vv", + // rv_zvknha + "vsha2ms.vv", + "vsha2ch.vv", + "vsha2cl.vv", + // rv_zvksed + "vsm4r.vv", + // rv_zvksh + "vsm3me.vv", ) allMatched.contains(t1DecodePattern.instruction.name) } diff --git a/t1/src/decoder/attribute/isZvk.scala b/t1/src/decoder/attribute/isZvk.scala new file mode 100644 index 000000000..459423778 --- /dev/null +++ b/t1/src/decoder/attribute/isZvk.scala @@ -0,0 +1,56 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: 2022 Jiuyang Liu + +package org.chipsalliance.t1.rtl.decoder.attribute + +import org.chipsalliance.t1.rtl.decoder.T1DecodePattern + +object isZvk { + def apply(t1DecodePattern: T1DecodePattern): isZvk = + Seq( + y _ -> Y, + n _ -> N, + dc _ -> DC + ).collectFirst { + case (fn, tri) if fn(t1DecodePattern) => isZvk(tri) + }.get + + def y(t1DecodePattern: T1DecodePattern): Boolean = { + val allMatched = if(t1DecodePattern.param.zvkEnable) Seq( + "vghsh.vv", + "vgmul.vv", + 
"vaesdf.vv", + "vaesdf.vs", + "vaesdm.vv", + "vaesdm.vs", + "vaesef.vv", + "vaesef.vs", + "vaesem.vv", + "vaesem.vs", + "vaesz.vs", + "vaeskf1.vi", + "vaeskf2.vi", + "vsha2ms.vv", + "vsha2ch.vv", + "vsha2cl.vv", + "vsm4k.vi", + "vsm4r.vv", + "vsm4r.vs", + "vsm3c.vi", + "vsm3me.vv", + ) else Seq() + allMatched.contains(t1DecodePattern.instruction.name) + } + def n(t1DecodePattern: T1DecodePattern): Boolean = { + val allMatched = t1DecodePattern.param.allInstructions.filter(i => + !(y(t1DecodePattern) || dc(t1DecodePattern)) + ) + allMatched.contains(t1DecodePattern.instruction) + } + + def dc(t1DecodePattern: T1DecodePattern): Boolean = false +} + +case class isZvk(value: TriState) extends BooleanDecodeAttribute { + override val description: String = "goes to [[org.chipsalliance.t1.rtl.LaneZvk]]." +} diff --git a/t1/src/decoder/attribute/zvkUop.scala b/t1/src/decoder/attribute/zvkUop.scala new file mode 100644 index 000000000..6194e3234 --- /dev/null +++ b/t1/src/decoder/attribute/zvkUop.scala @@ -0,0 +1,80 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: 2022 Jiuyang Liu + +package org.chipsalliance.t1.rtl.decoder.attribute + +import org.chipsalliance.t1.rtl.decoder.T1DecodePattern + +trait ZvkUOPType extends Uop +object zvkUop0 extends ZvkUOPType // +object zvkUop1 extends ZvkUOPType // +object zvkUop2 extends ZvkUOPType // +object zvkUop3 extends ZvkUOPType // +object zvkUop4 extends ZvkUOPType // +object zvkUop5 extends ZvkUOPType // +object zvkUop6 extends ZvkUOPType // +object zvkUop7 extends ZvkUOPType // +object zvkUop8 extends ZvkUOPType // + +object ZvkUOP { + def apply(t1DecodePattern: T1DecodePattern): Uop = { + Seq( + t0 _ -> zvkUop0, + t1 _ -> zvkUop1, + t2 _ -> zvkUop2, + t3 _ -> zvkUop3, + t4 _ -> zvkUop4, + t5 _ -> zvkUop5, + t6 _ -> zvkUop6, + t7 _ -> zvkUop7, + t8 _ -> zvkUop8, + ).collectFirst { + case (fn, tpe) if fn(t1DecodePattern) => tpe + }.getOrElse(UopDC) + } + def t0(t1DecodePattern: T1DecodePattern): Boolean = { + 
val allMatched: Seq[String] = Seq( + ) + allMatched.contains(t1DecodePattern.instruction.name) + } + def t1(t1DecodePattern: T1DecodePattern): Boolean = { + val allMatched: Seq[String] = Seq( + ) + allMatched.contains(t1DecodePattern.instruction.name) + } + def t2(t1DecodePattern: T1DecodePattern): Boolean = { + val allMatched: Seq[String] = Seq( + ) + allMatched.contains(t1DecodePattern.instruction.name) + } + def t3(t1DecodePattern: T1DecodePattern): Boolean = { + val allMatched: Seq[String] = Seq( + ) + allMatched.contains(t1DecodePattern.instruction.name) + } + def t4(t1DecodePattern: T1DecodePattern): Boolean = { + val allMatched: Seq[String] = Seq( + ) + allMatched.contains(t1DecodePattern.instruction.name) + } + def t5(t1DecodePattern: T1DecodePattern): Boolean = { + val allMatched: Seq[String] = Seq( + ) + allMatched.contains(t1DecodePattern.instruction.name) + } + def t6(t1DecodePattern: T1DecodePattern): Boolean = { + val allMatched: Seq[String] = Seq( + ) + allMatched.contains(t1DecodePattern.instruction.name) + } + def t7(t1DecodePattern: T1DecodePattern): Boolean = { + val allMatched: Seq[String] = Seq( + ) + allMatched.contains(t1DecodePattern.instruction.name) + } + def t8(t1DecodePattern: T1DecodePattern): Boolean = { + val allMatched: Seq[String] = Seq( + ) + allMatched.contains(t1DecodePattern.instruction.name) + } +} diff --git a/t1/src/laneStage/LaneExecutionBridge.scala b/t1/src/laneStage/LaneExecutionBridge.scala index cf3cf1c9d..520b0ff26 100644 --- a/t1/src/laneStage/LaneExecutionBridge.scala +++ b/t1/src/laneStage/LaneExecutionBridge.scala @@ -13,6 +13,7 @@ import org.chipsalliance.t1.rtl.decoder.Decoder class LaneExecuteRequest(parameter: LaneParameter, isLastSlot: Boolean) extends Bundle { val src: Vec[UInt] = Vec(3, UInt(parameter.datapathWidth.W)) val crossReadSource: Option[UInt] = Option.when(isLastSlot)(UInt((parameter.datapathWidth * 2).W)) + val zvkCrossReadSource: Option[UInt] = Option.when(isLastSlot && 
parameter.zvkEnable)(UInt((parameter.datapathWidth * 4).W)) val bordersForMaskLogic: Bool = Bool() val mask: UInt = UInt((parameter.datapathWidth / 8).W) val maskForFilter: UInt = UInt((parameter.datapathWidth / 8).W) @@ -32,6 +33,7 @@ class LaneExecuteResponse(parameter: LaneParameter, isLastSlot: Boolean) extends val data: UInt = UInt(parameter.datapathWidth.W) val ffoIndex: UInt = UInt(log2Ceil(parameter.vLen / 8).W) val crossWriteData: Option[Vec[UInt]] = Option.when(isLastSlot)(Vec(2, UInt(parameter.datapathWidth.W))) + val zvkCrossWriteData: Option[Vec[UInt]] = Option.when(isLastSlot && parameter.zvkEnable)(Vec(4, UInt(parameter.datapathWidth.W))) val ffoSuccess: Option[Bool] = Option.when(isLastSlot)(Bool()) val fpReduceValid: Option[Bool] = Option.when(parameter.fpuEnable && isLastSlot)(Bool()) } @@ -84,6 +86,9 @@ class LaneExecutionBridge(parameter: LaneParameter, isLastSlot: Boolean, slotInd // execution result from execute unit val executionResult = RegInit(0.U(parameter.datapathWidth.W)) val crossWriteLSB: Option[UInt] = Option.when(isLastSlot)(RegInit(0.U(parameter.datapathWidth.W))) + val zvkCrossWriteLSB0: Option[UInt] = Option.when(isLastSlot && parameter.zvkEnable)(RegInit(0.U(parameter.datapathWidth.W))) + val zvkCrossWriteLSB1: Option[UInt] = Option.when(isLastSlot && parameter.zvkEnable)(RegInit(0.U(parameter.datapathWidth.W))) + val zvkCrossWriteLSB2: Option[UInt] = Option.when(isLastSlot && parameter.zvkEnable)(RegInit(0.U(parameter.datapathWidth.W))) val responseFinish: Bool = RegInit(true.B) when(vfuRequest.fire ^ dataResponse.fire) { responseFinish := dataResponse.fire @@ -149,6 +154,9 @@ class LaneExecutionBridge(parameter: LaneParameter, isLastSlot: Boolean, slotInd executionRecord.maskForFilter := enqueue.bits.maskForFilter executionRecord.source := enqueue.bits.src executionRecord.crossReadSource.foreach(_ := enqueue.bits.crossReadSource.get) + if(parameter.zvkEnable) { + executionRecord.zvkCrossReadSource.foreach(_ := 
enqueue.bits.zvkCrossReadSource.get) + } executionRecord.sSendResponse.foreach(_ := enqueue.bits.sSendResponse.get) executionRecord.groupCounter := enqueue.bits.groupCounter executionRecord.decodeResult := enqueue.bits.decodeResult @@ -166,6 +174,15 @@ class LaneExecutionBridge(parameter: LaneParameter, isLastSlot: Boolean, slotInd val cutCrossReadData: Vec[UInt] = cutUInt(executionRecord.crossReadSource.get, parameter.datapathWidth) Mux(executionRecord.executeIndex, cutCrossReadData(1), cutCrossReadData(0)) } + val quadrupleCollapse: Option[UInt] = Option.when(isLastSlot && parameter.zvkEnable) { + val cutCrossReadData: Vec[UInt] = cutUInt(executionRecord.zvkCrossReadSource.get, parameter.datapathWidth) + Mux1H(UIntToOH(executionRecord.zvkExecuteIndex.get), Seq( + cutCrossReadData(0), + cutCrossReadData(1), + cutCrossReadData(2), + cutCrossReadData(3), + )) + } // For cross read, extend 32 bit source1 to 64 bit, then select by executeIndex def dataExtend(data: UInt, sign: Bool): UInt = { @@ -205,7 +222,7 @@ class LaneExecutionBridge(parameter: LaneParameter, isLastSlot: Boolean, slotInd ) } else { normalSource1 - } + } // TODO: vs1 cross val reduceFoldSource2: Option[UInt] = Option.when(isLastSlot)(Wire(UInt(parameter.datapathWidth.W))) /** src2 for the execution, @@ -214,7 +231,16 @@ class LaneExecutionBridge(parameter: LaneParameter, isLastSlot: Boolean, slotInd val finalSource2: UInt = if (isLastSlot) { Mux( executionRecord.crossReadVS2, - doubleCollapse.get, + { + if(parameter.zvkEnable) { + Mux(executionRecord.decodeResult(Decoder.zvk), + quadrupleCollapse.get, + doubleCollapse.get, + ) + } else { + doubleCollapse.get + } + }, Mux( executionRecord.decodeResult(Decoder.crossWrite) || (executionRecord.decodeResult(Decoder.widenReduce) && !sendFoldReduce.get), extendSource2, @@ -362,6 +388,17 @@ class LaneExecutionBridge(parameter: LaneParameter, isLastSlot: Boolean, slotInd crossWriteLSB.foreach { crossWriteData => crossWriteData := dataDequeue } + 
if(parameter.zvkEnable) { + zvkCrossWriteLSB0.foreach { crossWriteData => + crossWriteData := dataDequeue + } + zvkCrossWriteLSB1.zip(zvkCrossWriteLSB0).foreach {case (zvkCrossWriteData1, zvkCrossWriteData0) => + zvkCrossWriteData1 := zvkCrossWriteData0 + } + zvkCrossWriteLSB2.zip(zvkCrossWriteLSB1).foreach {case (zvkCrossWriteData2, zvkCrossWriteData1) => + zvkCrossWriteData2 := zvkCrossWriteData1 + } + } } /** update value for [[maskFormatResultForGroup]] */ @@ -542,6 +579,9 @@ class LaneExecutionBridge(parameter: LaneParameter, isLastSlot: Boolean, slotInd } queue.io.enq.bits.ffoIndex := recordQueue.io.deq.bits.groupCounter ## dataResponse.bits.data(4, 0) queue.io.enq.bits.crossWriteData.foreach(_ := VecInit((crossWriteLSB ++ Seq(dataDequeue)).toSeq)) + if(parameter.zvkEnable) { + queue.io.enq.bits.zvkCrossWriteData.foreach(_ := VecInit((zvkCrossWriteLSB0 ++ zvkCrossWriteLSB1 ++ zvkCrossWriteLSB2 ++ Seq(dataDequeue)).toSeq)) + } queue.io.enq.bits.ffoSuccess.foreach(_ := dataResponse.bits.ffoSuccess) queue.io.enq.bits.fpReduceValid.foreach(_ := !waitFirstValidFire.get) recordQueue.io.deq.ready := dataResponse.valid || (recordNotExecute && queue.io.enq.ready) diff --git a/t1/src/laneStage/LaneStage1.scala b/t1/src/laneStage/LaneStage1.scala index f44826e79..d4354bbc6 100644 --- a/t1/src/laneStage/LaneStage1.scala +++ b/t1/src/laneStage/LaneStage1.scala @@ -9,7 +9,7 @@ import chisel3.probe.{Probe, ProbeValue, define} import chisel3.util._ import chisel3.util.experimental.decode.DecodeBundle import org.chipsalliance.t1.rtl.decoder.Decoder -import org.chipsalliance.t1.rtl.lane.{CrossReadUnit, LaneState, VrfReadPipe} +import org.chipsalliance.t1.rtl.lane.{CrossReadUnit, ZvkCrossReadUnit, LaneState, VrfReadPipe} class LaneStage1Enqueue(parameter: LaneParameter, isLastSlot: Boolean) extends Bundle { val groupCounter: UInt = UInt(parameter.groupNumberBits.W) @@ -46,6 +46,7 @@ class LaneStage1Dequeue(parameter: LaneParameter, isLastSlot: Boolean) extends B // read result 
val src: Vec[UInt] = Vec(3, UInt(parameter.datapathWidth.W)) val crossReadSource: Option[UInt] = Option.when(isLastSlot)(UInt((parameter.datapathWidth * 2).W)) + val zvkCrossReadSource: Option[UInt] = Option.when(isLastSlot && parameter.zvkEnable)(UInt((parameter.datapathWidth * 4).W)) // pipe state // for exe stage @@ -83,8 +84,14 @@ class LaneStage1(parameter: LaneParameter, isLastSlot: Boolean) extends Module { @public val vrfCheckRequest: Vec[VRFReadRequest] = IO(Vec(readCheckSize, Output(readRequestType))) + val zvkReadCheckSize: Int = if(isLastSlot && parameter.zvkEnable) 7 else 3 + @public + val zvkVrfCheckRequest: Option[Vec[VRFReadRequest]] = Option.when(parameter.zvkEnable)(IO(Vec(zvkReadCheckSize, Output(readRequestType)))) + @public val checkResult: Vec[Bool] = IO(Vec(readCheckSize, Input(Bool()))) + @public + val zvkCheckResult: Option[Vec[Bool]] = Option.when(parameter.zvkEnable)(IO(Vec(zvkReadCheckSize, Input(Bool())))) /** VRF read result for each slot, * 3 is for [[source1]] [[source2]] [[source3]] @@ -96,11 +103,19 @@ class LaneStage1(parameter: LaneParameter, isLastSlot: Boolean) extends Module { val readBusDequeue: Option[Vec[DecoupledIO[ReadBusData]]] = Option.when(isLastSlot)(IO( Vec(2, Flipped(Decoupled(new ReadBusData(parameter: LaneParameter)))) )) + @public + val zvkReadBusDequeue: Option[Vec[DecoupledIO[ReadBusData]]] = Option.when(isLastSlot & parameter.zvkEnable)(IO( + Vec(4, Flipped(Decoupled(new ReadBusData(parameter: LaneParameter)))) + )) @public val readBusRequest: Option[Vec[DecoupledIO[ReadBusData]]] = Option.when(isLastSlot)(IO(Vec(2, Decoupled(new ReadBusData(parameter))))) + @public + val zvkReadBusRequest: Option[Vec[DecoupledIO[ReadBusData]]] = + Option.when(isLastSlot & parameter.zvkEnable)(IO(Vec(4, Decoupled(new ReadBusData(parameter))))) + val groupCounter: UInt = enqueue.bits.groupCounter // todo: param @@ -125,12 +140,30 @@ class LaneStage1(parameter: LaneParameter, isLastSlot: Boolean) extends Module { val 
queueAfterCheckMSB: Option[Queue[VRFReadQueueEntry]] = Option.when(isLastSlot)(Module(new Queue(vrfReadEntryType, readRequestQueueSizeAfterCheck))) + val queueAfterCheckZvkLSBLSB: Option[Queue[VRFReadQueueEntry]] = + Option.when(isLastSlot && parameter.zvkEnable)(Module(new Queue(vrfReadEntryType, readRequestQueueSizeAfterCheck))) + val queueAfterCheckZvkLSBMSB: Option[Queue[VRFReadQueueEntry]] = + Option.when(isLastSlot && parameter.zvkEnable)(Module(new Queue(vrfReadEntryType, readRequestQueueSizeAfterCheck))) + val queueAfterCheckZvkMSBLSB: Option[Queue[VRFReadQueueEntry]] = + Option.when(isLastSlot && parameter.zvkEnable)(Module(new Queue(vrfReadEntryType, readRequestQueueSizeAfterCheck))) + val queueAfterCheckZvkMSBMSB: Option[Queue[VRFReadQueueEntry]] = + Option.when(isLastSlot && parameter.zvkEnable)(Module(new Queue(vrfReadEntryType, readRequestQueueSizeAfterCheck))) + // read request queue for cross read lsb & msb val queueBeforeCheckLSB: Option[Queue[VRFReadQueueEntry]] = Option.when(isLastSlot)(Module(new Queue(vrfReadEntryType, readRequestQueueSizeBeforeCheck))) val queueBeforeCheckMSB: Option[Queue[VRFReadQueueEntry]] = Option.when(isLastSlot)(Module(new Queue(vrfReadEntryType, readRequestQueueSizeBeforeCheck))) + val queueBeforeCheckZvkLSBLSB: Option[Queue[VRFReadQueueEntry]] = + Option.when(isLastSlot && parameter.zvkEnable)(Module(new Queue(vrfReadEntryType, readRequestQueueSizeBeforeCheck))) + val queueBeforeCheckZvkLSBMSB: Option[Queue[VRFReadQueueEntry]] = + Option.when(isLastSlot && parameter.zvkEnable)(Module(new Queue(vrfReadEntryType, readRequestQueueSizeBeforeCheck))) + val queueBeforeCheckZvkMSBLSB: Option[Queue[VRFReadQueueEntry]] = + Option.when(isLastSlot && parameter.zvkEnable)(Module(new Queue(vrfReadEntryType, readRequestQueueSizeBeforeCheck))) + val queueBeforeCheckZvkMSBMSB: Option[Queue[VRFReadQueueEntry]] = + Option.when(isLastSlot && parameter.zvkEnable)(Module(new Queue(vrfReadEntryType, readRequestQueueSizeBeforeCheck))) + // 
pipe from enqueue val pipeQueue: Queue[LaneStage1Enqueue] = Module( @@ -147,11 +180,35 @@ class LaneStage1(parameter: LaneParameter, isLastSlot: Boolean) extends Module { val afterCheckQueueVec: Seq[Queue[VRFReadQueueEntry]] = Seq(queueAfterCheck1, queueAfterCheck2, queueAfterCheckVd) ++ queueAfterCheckLSB ++ queueAfterCheckMSB - val allReadQueueReady: Bool = beforeCheckQueueVec.map(_.io.enq.ready).reduce(_ && _) + + val beforeCheckZvkQueueVec: Seq[Queue[VRFReadQueueEntry]] = + Seq(queueBeforeCheck1, queueBeforeCheck2, queueBeforeCheckVd) ++ + queueBeforeCheckZvkLSBLSB ++ queueBeforeCheckZvkLSBMSB ++ + queueBeforeCheckZvkMSBLSB ++ queueBeforeCheckZvkMSBMSB + val afterCheckZvkQueueVec: Seq[Queue[VRFReadQueueEntry]] = + Seq(queueAfterCheck1, queueAfterCheck2, queueAfterCheckVd) ++ + queueAfterCheckZvkLSBLSB ++ queueAfterCheckZvkLSBMSB ++ + queueAfterCheckZvkMSBLSB ++ queueAfterCheckZvkMSBMSB + + val allReadQueueReady: Bool = { + val ready = beforeCheckQueueVec.map(_.io.enq.ready).reduce(_ && _) + if(parameter.zvkEnable) { + val zvkReady = beforeCheckZvkQueueVec.map(_.io.enq.ready).reduce(_ && _) + Mux(enqueue.bits.decodeResult(Decoder.crossRead) & enqueue.bits.decodeResult(Decoder.zvk), zvkReady, ready) + } else { + ready + } + } beforeCheckQueueVec.foreach{ q => q.io.enq.bits.instructionIndex := enqueue.bits.instructionIndex q.io.enq.bits.groupIndex := enqueue.bits.groupCounter } + if(parameter.zvkEnable) { + beforeCheckZvkQueueVec.foreach{ q => + q.io.enq.bits.instructionIndex := enqueue.bits.instructionIndex + q.io.enq.bits.groupIndex := enqueue.bits.groupCounter + } + } enqueue.ready := allReadQueueReady && pipeQueue.io.enq.ready @@ -162,6 +219,14 @@ class LaneStage1(parameter: LaneParameter, isLastSlot: Boolean) extends Module { after.io.enq.valid := before.io.deq.valid && checkResult(i) after.io.enq.bits := before.io.deq.bits } + if(parameter.zvkEnable) { + beforeCheckZvkQueueVec.zip(afterCheckZvkQueueVec).zipWithIndex.foreach { case ((before, after), i) => + 
zvkVrfCheckRequest.get(i) := before.io.deq.bits + before.io.deq.ready := after.io.enq.ready && zvkCheckResult.get(i) + after.io.enq.valid := before.io.deq.valid && zvkCheckResult.get(i) + after.io.enq.bits := before.io.deq.bits + } + } // request enqueue queueBeforeCheck1.io.enq.valid := enqueue.fire && enqueue.bits.decodeResult(Decoder.vtype) && !enqueue.bits.skipRead queueBeforeCheck2.io.enq.valid := enqueue.fire && !enqueue.bits.skipRead @@ -169,6 +234,11 @@ class LaneStage1(parameter: LaneParameter, isLastSlot: Boolean) extends Module { (queueBeforeCheckLSB ++ queueBeforeCheckMSB).foreach { q => q.io.enq.valid := enqueue.valid && allReadQueueReady && enqueue.bits.decodeResult(Decoder.crossRead) } + if(parameter.zvkEnable) { + (queueBeforeCheckZvkLSBLSB ++ queueBeforeCheckZvkLSBMSB ++ queueBeforeCheckZvkMSBLSB ++ queueBeforeCheckZvkMSBMSB).foreach { q => + q.io.enq.valid := enqueue.valid && allReadQueueReady && enqueue.bits.decodeResult(Decoder.crossRead) && enqueue.bits.decodeResult(Decoder.zvk) + } + } // calculate vs queueBeforeCheck1.io.enq.bits.vs := Mux( @@ -223,6 +293,29 @@ class LaneStage1(parameter: LaneParameter, isLastSlot: Boolean) extends Module { q.io.enq.bits.offset := groupCounter(parameter.vrfOffsetBits - 2, 0) ## true.B } + if(parameter.zvkEnable) { // TODO: check here + queueBeforeCheckZvkLSBLSB.foreach { q => + q.io.enq.bits.vs := enqueue.bits.vs2 + groupCounter(parameter.groupNumberBits - 2, parameter.vrfOffsetBits - 1) + q.io.enq.bits.readSource := 1.U + q.io.enq.bits.offset := groupCounter(parameter.vrfOffsetBits - 2, 0) ## true.B + } + queueBeforeCheckZvkLSBMSB.foreach { q => + q.io.enq.bits.vs := enqueue.bits.vs2 + groupCounter(parameter.groupNumberBits - 2, parameter.vrfOffsetBits - 1) + q.io.enq.bits.readSource := 1.U + q.io.enq.bits.offset := groupCounter(parameter.vrfOffsetBits - 2, 0) ## true.B + } + queueBeforeCheckZvkMSBLSB.foreach { q => + q.io.enq.bits.vs := enqueue.bits.vs2 + groupCounter(parameter.groupNumberBits - 2, 
parameter.vrfOffsetBits - 1) + q.io.enq.bits.readSource := 1.U + q.io.enq.bits.offset := groupCounter(parameter.vrfOffsetBits - 2, 0) ## true.B + } + queueBeforeCheckZvkMSBMSB.foreach { q => + q.io.enq.bits.vs := enqueue.bits.vs2 + groupCounter(parameter.groupNumberBits - 2, parameter.vrfOffsetBits - 1) + q.io.enq.bits.readSource := 1.U + q.io.enq.bits.offset := groupCounter(parameter.vrfOffsetBits - 2, 0) ## true.B + } + } + // read pipe val readPipe0: Instance[VrfReadPipe] = Instantiate(new VrfReadPipe(parameter, arbitrate = false)) val readPipe1: Instance[VrfReadPipe] = Instantiate(new VrfReadPipe(parameter, arbitrate = isLastSlot)) @@ -240,6 +333,10 @@ class LaneStage1(parameter: LaneParameter, isLastSlot: Boolean) extends Module { // cross lane queue val dataQueueLSB = Option.when(isLastSlot)(Module(new Queue(UInt(parameter.datapathWidth.W), dataQueueSize))) val dataQueueMSB = Option.when(isLastSlot)(Module(new Queue(UInt(parameter.datapathWidth.W), dataQueueSize))) + val dataQueueZvkLSBLSB = Option.when(isLastSlot && parameter.zvkEnable)(Module(new Queue(UInt(parameter.datapathWidth.W), dataQueueSize))) + val dataQueueZvkLSBMSB = Option.when(isLastSlot && parameter.zvkEnable)(Module(new Queue(UInt(parameter.datapathWidth.W), dataQueueSize))) // TODO + val dataQueueZvkMSBLSB = Option.when(isLastSlot && parameter.zvkEnable)(Module(new Queue(UInt(parameter.datapathWidth.W), dataQueueSize))) // TODO + val dataQueueZvkMSBMSB = Option.when(isLastSlot && parameter.zvkEnable)(Module(new Queue(UInt(parameter.datapathWidth.W), dataQueueSize))) val dataQueueNotFull2: Bool = { val counterReg = RegInit(0.U(log2Ceil(dataQueueSize + 1).W)) @@ -295,17 +392,80 @@ class LaneStage1(parameter: LaneParameter, isLastSlot: Boolean) extends Module { blockingHandshake(port, queue.io.deq, dataQueueNotFullMSB) } + if(parameter.zvkEnable) { + readPipe1.contender.zip(queueAfterCheckZvkLSBLSB).foreach { case (port, queue) => + val dataQueueNotFullLSBLSB: Bool = { + val counterReg = 
RegInit(0.U(log2Ceil(dataQueueSize + 1).W)) + val doEnq = queue.io.deq.fire + val doDeq = dataQueueZvkLSBLSB.get.io.deq.fire + val countChange = Mux(doEnq, 1.U, -1.S(log2Ceil(dataQueueSize + 1).W).asUInt) + when(doEnq ^ doDeq) { + counterReg := counterReg + countChange + } + !counterReg(log2Ceil(dataQueueSize)) + } + blockingHandshake(port, queue.io.deq, dataQueueNotFullLSBLSB) + } + readPipe1.contender.zip(queueAfterCheckZvkLSBMSB).foreach { case (port, queue) => + val dataQueueNotFullLSBMSB: Bool = { + val counterReg = RegInit(0.U(log2Ceil(dataQueueSize + 1).W)) + val doEnq = queue.io.deq.fire + val doDeq = dataQueueZvkLSBMSB.get.io.deq.fire + val countChange = Mux(doEnq, 1.U, -1.S(log2Ceil(dataQueueSize + 1).W).asUInt) + when(doEnq ^ doDeq) { + counterReg := counterReg + countChange + } + !counterReg(log2Ceil(dataQueueSize)) + } + blockingHandshake(port, queue.io.deq, dataQueueNotFullLSBMSB) + } + readPipe2.contender.zip(queueAfterCheckZvkMSBLSB).foreach { case (port, queue) => + val dataQueueNotFullMSBLSB: Bool = { + val counterReg = RegInit(0.U(log2Ceil(dataQueueSize + 1).W)) + val doEnq = queue.io.deq.fire + val doDeq = dataQueueZvkMSBLSB.get.io.deq.fire + val countChange = Mux(doEnq, 1.U, -1.S(log2Ceil(dataQueueSize + 1).W).asUInt) + when(doEnq ^ doDeq) { + counterReg := counterReg + countChange + } + !counterReg(log2Ceil(dataQueueSize)) + } + blockingHandshake(port, queue.io.deq, dataQueueNotFullMSBLSB) + } + readPipe2.contender.zip(queueAfterCheckZvkMSBMSB).foreach { case (port, queue) => + val dataQueueNotFullMSBMSB: Bool = { + val counterReg = RegInit(0.U(log2Ceil(dataQueueSize + 1).W)) + val doEnq = queue.io.deq.fire + val doDeq = dataQueueZvkMSBMSB.get.io.deq.fire + val countChange = Mux(doEnq, 1.U, -1.S(log2Ceil(dataQueueSize + 1).W).asUInt) + when(doEnq ^ doDeq) { + counterReg := counterReg + countChange + } + !counterReg(log2Ceil(dataQueueSize)) + } + blockingHandshake(port, queue.io.deq, dataQueueNotFullMSBMSB) + } + } + // data: pipe <-> queue if 
(isLastSlot) { // pipe1 <-> dataQueueVs2 dataQueueVs2.io.enq <> readPipe1.dequeue // pipe1 <> dataQueueLSB dataQueueLSB.zip(readPipe1.contenderDequeue).foreach { case (sink, source) => sink.io.enq <> source } + if(parameter.zvkEnable) { + dataQueueZvkLSBLSB.zip(readPipe1.contenderDequeue).foreach { case (sink, source) => sink.io.enq <> source } + dataQueueZvkLSBMSB.zip(readPipe1.contenderDequeue).foreach { case (sink, source) => sink.io.enq <> source } + } // pipe2 <-> dataQueueVd dataQueueVd.io.enq <> readPipe2.dequeue // pipe2 <-> dataQueueMSB dataQueueMSB.zip(readPipe2.contenderDequeue).foreach { case (sink, source) => sink.io.enq <> source } + if(parameter.zvkEnable) { + dataQueueZvkMSBLSB.zip(readPipe2.contenderDequeue).foreach { case (sink, source) => sink.io.enq <> source } + dataQueueZvkMSBMSB.zip(readPipe2.contenderDequeue).foreach { case (sink, source) => sink.io.enq <> source } + } } else { dataQueueVs2.io.enq <> readPipe1.dequeue dataQueueVd.io.enq <> readPipe2.dequeue @@ -316,6 +476,12 @@ class LaneStage1(parameter: LaneParameter, isLastSlot: Boolean) extends Module { Option.when(isLastSlot)(Module(new Queue(UInt((parameter.datapathWidth * 2).W), 1))) val crossReadStageFree: Option[Bool] = Option.when(isLastSlot)(Wire(Bool())) val crossReadUnitOp: Option[Instance[CrossReadUnit]] = Option.when(isLastSlot)(Instantiate(new CrossReadUnit(parameter))) + + val zvkCrossReadResultQueue: Option[Queue[UInt]] = + Option.when(isLastSlot && parameter.zvkEnable)(Module(new Queue(UInt((parameter.datapathWidth * 4).W), 1))) + val zvkCrossReadStageFree: Option[Bool] = Option.when(isLastSlot && parameter.zvkEnable)(Wire(Bool())) + val zvkCrossReadUnitOp: Option[Instance[ZvkCrossReadUnit]] = Option.when(isLastSlot && parameter.zvkEnable)(Instantiate(new ZvkCrossReadUnit(parameter))) + if (isLastSlot) { val dataGroupQueue: Queue[UInt] = Module( @@ -341,7 +507,34 @@ class LaneStage1(parameter: LaneParameter, isLastSlot: Boolean) extends Module { 
assert(dataGroupQueue.io.enq.ready || !dataGroupQueue.io.enq.valid) dataGroupQueue.io.enq.bits := enqueue.bits.groupCounter dataGroupQueue.io.deq.ready := crossReadUnit.dataInputLSB.fire - dequeue.bits.readBusDequeueGroup.get := crossReadUnitOp.get.currentGroup + dequeue.bits.readBusDequeueGroup.get := crossReadUnitOp.get.currentGroup // TODO: readBusDequeueGroup is currently unused + + if(parameter.zvkEnable) { + val zvkDataGroupQueue: Queue[UInt] = + Module( + new Queue( + UInt(parameter.groupNumberBits.W), + readRequestQueueSizeBeforeCheck + readRequestQueueSizeBeforeCheck + dataQueueSize + 2 + ) + ) + val zvkCrossReadUnit = zvkCrossReadUnitOp.get + zvkCrossReadUnit.dataInputLSBLSB <> dataQueueZvkLSBLSB.get.io.deq + zvkCrossReadUnit.dataInputLSBMSB <> dataQueueZvkLSBMSB.get.io.deq + zvkCrossReadUnit.dataInputMSBLSB <> dataQueueZvkMSBLSB.get.io.deq + zvkCrossReadUnit.dataInputMSBMSB <> dataQueueZvkMSBMSB.get.io.deq + zvkCrossReadUnit.laneIndex := laneIndexReg + zvkCrossReadUnit.dataGroup := zvkDataGroupQueue.io.deq.bits + zvkReadBusRequest.get.zip(zvkCrossReadUnit.readBusRequest.get).foreach { case (sink, source) => sink <> source} + zvkCrossReadUnit.readBusDequeue.get.zip(zvkReadBusDequeue.get).foreach { case (sink, source) => sink <> source} + zvkCrossReadResultQueue.get.io.enq <> zvkCrossReadUnit.crossReadDequeue + zvkCrossReadStageFree.get := zvkCrossReadUnit.crossReadStageFree + + // data group + zvkDataGroupQueue.io.enq.valid := enqueue.fire && enqueue.bits.decodeResult(Decoder.crossRead) + assert(zvkDataGroupQueue.io.enq.ready || !zvkDataGroupQueue.io.enq.valid) + zvkDataGroupQueue.io.enq.bits := enqueue.bits.groupCounter + zvkDataGroupQueue.io.deq.ready := zvkCrossReadUnit.dataInputLSBLSB.fire + } } val source1Select: UInt = Mux( @@ -353,6 +546,9 @@ class LaneStage1(parameter: LaneParameter, isLastSlot: Boolean) extends Module { dequeue.bits.groupCounter := pipeQueue.io.deq.bits.groupCounter dequeue.bits.src := VecInit(Seq(source1Select, 
dataQueueVs2.io.deq.bits, dataQueueVd.io.deq.bits)) dequeue.bits.crossReadSource.foreach(_ := crossReadResultQueue.get.io.deq.bits) + if(parameter.zvkEnable) { + dequeue.bits.zvkCrossReadSource.foreach(_ := zvkCrossReadResultQueue.get.io.deq.bits) + } dequeue.bits.sSendResponse.foreach(_ := pipeQueue.io.deq.bits.sSendResponse.get) dequeue.bits.decodeResult := pipeQueue.io.deq.bits.decodeResult dequeue.bits.vSew1H := pipeQueue.io.deq.bits.vSew1H @@ -374,7 +570,12 @@ class LaneStage1(parameter: LaneParameter, isLastSlot: Boolean) extends Module { dataQueueVs2.io.deq.valid || pipeQueue.io.deq.bits.skipRead, dataQueueVd.io.deq.valid || (pipeQueue.io.deq.bits.decodeResult(Decoder.sReadVD)) ) ++ - crossReadResultQueue.map(_.io.deq.valid || !pipeQueue.io.deq.bits.decodeResult(Decoder.crossRead)) + crossReadResultQueue.map(_.io.deq.valid || !pipeQueue.io.deq.bits.decodeResult(Decoder.crossRead)) ++ { + if(parameter.zvkEnable) zvkCrossReadResultQueue.map(_.io.deq.valid || + (!pipeQueue.io.deq.bits.decodeResult(Decoder.crossRead) && + !pipeQueue.io.deq.bits.decodeResult(Decoder.zvk)) + ) else Seq() + } val allDataQueueValid: Bool = VecInit(dataQueueValidVec).asUInt.andR dequeue.valid := allDataQueueValid && pipeQueue.io.deq.valid dataQueueVs1.ready := allDataQueueValid && dequeue.ready && pipeQueue.io.deq.bits.decodeResult(Decoder.vtype) @@ -382,6 +583,9 @@ class LaneStage1(parameter: LaneParameter, isLastSlot: Boolean) extends Module { dataQueueVd.io.deq.ready := allDataQueueValid && dequeue.ready && !pipeQueue.io.deq.bits.decodeResult(Decoder.sReadVD) crossReadResultQueue.foreach(_.io.deq.ready := allDataQueueValid && dequeue.ready && pipeQueue.io.deq.bits.decodeResult(Decoder.crossRead)) + if(parameter.zvkEnable) { + zvkCrossReadResultQueue.foreach(_.io.deq.ready := allDataQueueValid && dequeue.ready && (pipeQueue.io.deq.bits.decodeResult(Decoder.crossRead) && pipeQueue.io.deq.bits.decodeResult(Decoder.zvk))) + } stageValid := pipeQueue.io.deq.valid val stageFinish = 
!stageValid @@ -395,15 +599,34 @@ class LaneStage1(parameter: LaneParameter, isLastSlot: Boolean) extends Module { val stageFinishProbe = IO(Output(Probe(Bool()))) @public val readFinishProbe = Option.when(isLastSlot)(IO(Output(Probe(Bool())))) + @public val sSendCrossReadResultLSBProbe = Option.when(isLastSlot)(IO(Output(Probe(Bool())))) @public val sSendCrossReadResultMSBProbe = Option.when(isLastSlot)(IO(Output(Probe(Bool())))) + + @public + val sSendZvkCrossReadResultLSBLSBProbe = Option.when(isLastSlot && parameter.zvkEnable)(IO(Output(Probe(Bool())))) + @public + val sSendZvkCrossReadResultLSBMSBProbe = Option.when(isLastSlot && parameter.zvkEnable)(IO(Output(Probe(Bool())))) + @public + val sSendZvkCrossReadResultMSBLSBProbe = Option.when(isLastSlot && parameter.zvkEnable)(IO(Output(Probe(Bool())))) + @public + val sSendZvkCrossReadResultMSBMSBProbe = Option.when(isLastSlot && parameter.zvkEnable)(IO(Output(Probe(Bool())))) + @public val wCrossReadLSBProbe = Option.when(isLastSlot)(IO(Output(Probe(Bool())))) @public val wCrossReadMSBProbe = Option.when(isLastSlot)(IO(Output(Probe(Bool())))) @public + val wZvkCrossReadLSBLSBProbe = Option.when(isLastSlot && parameter.zvkEnable)(IO(Output(Probe(Bool())))) + @public + val wZvkCrossReadLSBMSBProbe = Option.when(isLastSlot && parameter.zvkEnable)(IO(Output(Probe(Bool())))) + @public + val wZvkCrossReadMSBLSBProbe = Option.when(isLastSlot && parameter.zvkEnable)(IO(Output(Probe(Bool())))) + @public + val wZvkCrossReadMSBMSBProbe = Option.when(isLastSlot && parameter.zvkEnable)(IO(Output(Probe(Bool())))) + @public val vrfReadRequestProbe: Seq[(Bool, Bool)] = Seq.fill(3)((IO(Output(Probe(Bool()))),IO(Output(Probe(Bool()))))) @@ -418,6 +641,16 @@ class LaneStage1(parameter: LaneParameter, isLastSlot: Boolean) extends Module { sSendCrossReadResultMSBProbe.foreach(p => define(p, ProbeValue(crossReadUnitOp.get.crossWriteState.sSendCrossReadResultMSB))) wCrossReadLSBProbe.foreach(p => define(p, 
ProbeValue(crossReadUnitOp.get.crossWriteState.wCrossReadLSB))) wCrossReadMSBProbe.foreach(p => define(p, ProbeValue(crossReadUnitOp.get.crossWriteState.wCrossReadMSB))) + if (parameter.zvkEnable) { + sSendZvkCrossReadResultLSBLSBProbe.foreach(p => define(p, ProbeValue(zvkCrossReadUnitOp.get.crossWriteState.sSendCrossReadResult(0)))) + sSendZvkCrossReadResultLSBMSBProbe.foreach(p => define(p, ProbeValue(zvkCrossReadUnitOp.get.crossWriteState.sSendCrossReadResult(1)))) + sSendZvkCrossReadResultMSBLSBProbe.foreach(p => define(p, ProbeValue(zvkCrossReadUnitOp.get.crossWriteState.sSendCrossReadResult(2)))) + sSendZvkCrossReadResultMSBMSBProbe.foreach(p => define(p, ProbeValue(zvkCrossReadUnitOp.get.crossWriteState.sSendCrossReadResult(3)))) + wZvkCrossReadLSBLSBProbe.foreach(p => define(p, ProbeValue(zvkCrossReadUnitOp.get.crossWriteState.wCrossRead(0)))) + wZvkCrossReadLSBMSBProbe.foreach(p => define(p, ProbeValue(zvkCrossReadUnitOp.get.crossWriteState.wCrossRead(1)))) + wZvkCrossReadMSBLSBProbe.foreach(p => define(p, ProbeValue(zvkCrossReadUnitOp.get.crossWriteState.wCrossRead(2)))) + wZvkCrossReadMSBMSBProbe.foreach(p => define(p, ProbeValue(zvkCrossReadUnitOp.get.crossWriteState.wCrossRead(3)))) + } } vrfReadRequestProbe.zipWithIndex.foreach { case((ready, valid), i) => diff --git a/t1/src/laneStage/LaneStage3.scala b/t1/src/laneStage/LaneStage3.scala index 8ccc6fbc1..6e29a181e 100644 --- a/t1/src/laneStage/LaneStage3.scala +++ b/t1/src/laneStage/LaneStage3.scala @@ -17,6 +17,7 @@ class LaneStage3Enqueue(parameter: LaneParameter, isLastSlot: Boolean) extends B val mask: UInt = UInt((parameter.datapathWidth/8).W) val ffoIndex: UInt = UInt(log2Ceil(parameter.vLen / 8).W) val crossWriteData: Vec[UInt] = Vec(2, UInt(parameter.datapathWidth.W)) + val zvkCrossWriteData: Option[Vec[UInt]] = Option.when(parameter.zvkEnable)(Vec(4, UInt(parameter.datapathWidth.W))) val sSendResponse: Bool = Bool() val ffoSuccess: Bool = Bool() val fpReduceValid: Option[Bool] = 
Option.when(parameter.fpuEnable && isLastSlot)(Bool()) @@ -56,6 +57,9 @@ class LaneStage3(parameter: LaneParameter, isLastSlot: Boolean) extends Module { @public val crossWritePort: Option[Vec[DecoupledIO[WriteBusData]]] = Option.when(isLastSlot)(IO(Vec(2, Decoupled(new WriteBusData(parameter))))) + @public + val zvkCrossWritePort: Option[Vec[DecoupledIO[WriteBusData]]] = + Option.when(isLastSlot & parameter.zvkEnable)(IO(Vec(4, Decoupled(new WriteBusData(parameter))))) val stageValidReg: Option[Bool] = Option.when(isLastSlot) (RegInit(false.B)) @@ -65,6 +69,11 @@ class LaneStage3(parameter: LaneParameter, isLastSlot: Boolean) extends Module { /** schedule cross lane write MSB */ val sCrossWriteMSB: Option[Bool] = Option.when(isLastSlot)(RegInit(true.B)) + val sZvkCrossWriteLSBLSB: Option[Bool] = Option.when(isLastSlot && parameter.zvkEnable)(RegInit(true.B)) + val sZvkCrossWriteLSBMSB: Option[Bool] = Option.when(isLastSlot && parameter.zvkEnable)(RegInit(true.B)) + val sZvkCrossWriteMSBLSB: Option[Bool] = Option.when(isLastSlot && parameter.zvkEnable)(RegInit(true.B)) + val sZvkCrossWriteMSBMSB: Option[Bool] = Option.when(isLastSlot && parameter.zvkEnable)(RegInit(true.B)) + // state for response to scheduler /** schedule send [[LaneResponse]] to scheduler */ val sSendResponse: Option[Bool] = Option.when(isLastSlot)(RegInit(true.B)) @@ -76,6 +85,9 @@ class LaneStage3(parameter: LaneParameter, isLastSlot: Boolean) extends Module { when(enqueue.fire) { pipeEnqueue.foreach(_ := enqueue.bits) (sCrossWriteLSB ++ sCrossWriteMSB).foreach(_ := !enqueue.bits.decodeResult(Decoder.crossWrite)) + if(parameter.zvkEnable) { + (sZvkCrossWriteLSBLSB ++ sZvkCrossWriteLSBMSB ++ sZvkCrossWriteMSBLSB ++ sZvkCrossWriteMSBMSB).foreach(_ := !(enqueue.bits.decodeResult(Decoder.crossWrite) & enqueue.bits.decodeResult(Decoder.zvk))) + } (sSendResponse ++ wResponseFeedback).foreach( _ := enqueue.bits.decodeResult(Decoder.scheduler) || enqueue.bits.sSendResponse ) @@ -107,6 +119,19 @@ class 
LaneStage3(parameter: LaneParameter, isLastSlot: Boolean) extends Module { sendState(index) := true.B } } + if(parameter.zvkEnable) { + val zvkSendState = (sZvkCrossWriteLSBLSB ++ sZvkCrossWriteLSBMSB ++ sZvkCrossWriteMSBLSB ++ sZvkCrossWriteMSBMSB).toSeq + zvkCrossWritePort.get.zipWithIndex.foreach { case (port, index) => + port.valid := stageValidReg.get && !zvkSendState(index) + port.bits.mask := 0.U((parameter.datapathWidth / 2 / 8).W)// Note: leave it for empty + port.bits.data := pipeEnqueue.get.zvkCrossWriteData.get(index) + port.bits.counter := pipeEnqueue.get.groupCounter + port.bits.instructionIndex := pipeEnqueue.get.instructionIndex + when(port.fire) { + zvkSendState(index) := true.B + } + } + } // scheduler synchronization val schedulerFinish: Bool = (sSendResponse ++ wResponseFeedback).reduce(_ && _) @@ -151,7 +176,10 @@ class LaneStage3(parameter: LaneParameter, isLastSlot: Boolean) extends Module { // Handshake /** Cross-lane writing is over */ - val CrossLaneWriteOver: Bool = (sCrossWriteLSB ++ sCrossWriteMSB).reduce(_ && _) + val CrossLaneWriteOver: Bool = (sCrossWriteLSB ++ sCrossWriteMSB ++ + sZvkCrossWriteLSBLSB ++ sZvkCrossWriteLSBMSB ++ + sZvkCrossWriteMSBLSB ++ sZvkCrossWriteMSBMSB + ).reduce(_ && _) enqueue.ready := !stageValidReg.get || (CrossLaneWriteOver && schedulerFinish && vrfWriteReady) val dequeueFire = stageValidReg.get && CrossLaneWriteOver && schedulerFinish && vrfWriteReady @@ -185,4 +213,4 @@ class LaneStage3(parameter: LaneParameter, isLastSlot: Boolean) extends Module { vrfWriteRequest <> vrfWriteQueue.io.deq vrfWriteRequest.bits.offset := vrfPtrReplica.io.deq.bits vrfWriteRequest.valid := vrfPtrReplica.io.deq.valid -} \ No newline at end of file +} diff --git a/t1/src/laneStage/SlotTokenManager.scala b/t1/src/laneStage/SlotTokenManager.scala index ae8bb531e..10cf593dc 100644 --- a/t1/src/laneStage/SlotTokenManager.scala +++ b/t1/src/laneStage/SlotTokenManager.scala @@ -73,6 +73,9 @@ class SlotTokenManager(parameter: 
LaneParameter) extends Module { @public val crossWriteReports: Vec[ValidIO[UInt]] = IO(Vec(2, Flipped(Valid(UInt(parameter.instructionIndexBits.W))))) + @public + val zvkCrossWriteReports: Option[Vec[ValidIO[UInt]]] = Option.when(parameter.zvkEnable)(IO(Vec(4, Flipped(Valid(UInt(parameter.instructionIndexBits.W)))))) + @public val responseReport: ValidIO[UInt] = IO(Flipped(Valid(UInt(parameter.instructionIndexBits.W)))) @@ -148,6 +151,10 @@ class SlotTokenManager(parameter: LaneParameter) extends Module { val feedbackToken: Seq[UInt] = Seq.tabulate(parameter.chainingSize)(_ => RegInit(0.U(tokenWith.W))) val crossWriteTokenLSB: Seq[UInt] = Seq.tabulate(parameter.chainingSize)(_ => RegInit(0.U(tokenWith.W))) val crossWriteTokenMSB: Seq[UInt] = Seq.tabulate(parameter.chainingSize)(_ => RegInit(0.U(tokenWith.W))) + val zvkCrossWriteTokenLSBLSB: Option[Seq[UInt]] = Option.when(parameter.zvkEnable)(Seq.tabulate(parameter.chainingSize)(_ => RegInit(0.U(tokenWith.W)))) + val zvkCrossWriteTokenLSBMSB: Option[Seq[UInt]] = Option.when(parameter.zvkEnable)(Seq.tabulate(parameter.chainingSize)(_ => RegInit(0.U(tokenWith.W)))) + val zvkCrossWriteTokenMSBLSB: Option[Seq[UInt]] = Option.when(parameter.zvkEnable)(Seq.tabulate(parameter.chainingSize)(_ => RegInit(0.U(tokenWith.W)))) + val zvkCrossWriteTokenMSBMSB: Option[Seq[UInt]] = Option.when(parameter.zvkEnable)(Seq.tabulate(parameter.chainingSize)(_ => RegInit(0.U(tokenWith.W)))) // Feedback is not accurate (index load/store may have already finished the instruction) val responseIndexQueue = Module(new Queue(UInt(parameter.instructionIndexBits.W), parameter.chainingSize + 1, flow = true)) @@ -165,9 +172,23 @@ class SlotTokenManager(parameter: LaneParameter) extends Module { val crossWriteDeqMSB = maskAnd(crossWriteReports.last.valid, indexToOH(crossWriteReports.last.bits, parameter.chainingSize)).asUInt + val zvkCrossWriteDeqLSBLSB = + Option.when(parameter.zvkEnable)(maskAnd(zvkCrossWriteReports.get.head.valid, 
indexToOH(zvkCrossWriteReports.get.head.bits, parameter.chainingSize)).asUInt) + /* each zvk token is released by its own report port; index order assumed 0 = LSBLSB, 1 = LSBMSB, 2 = MSBLSB, 3 = MSBMSB, matching the zvk port order used elsewhere in this patch */ val zvkCrossWriteDeqLSBMSB = + Option.when(parameter.zvkEnable)(maskAnd(zvkCrossWriteReports.get(1).valid, indexToOH(zvkCrossWriteReports.get(1).bits, parameter.chainingSize)).asUInt) + val zvkCrossWriteDeqMSBLSB = + Option.when(parameter.zvkEnable)(maskAnd(zvkCrossWriteReports.get(2).valid, indexToOH(zvkCrossWriteReports.get(2).bits, parameter.chainingSize)).asUInt) + val zvkCrossWriteDeqMSBMSB = + Option.when(parameter.zvkEnable)(maskAnd(zvkCrossWriteReports.get.last.valid, indexToOH(zvkCrossWriteReports.get.last.bits, parameter.chainingSize)).asUInt) + val pendingCrossWriteLSB = tokenUpdate(crossWriteTokenLSB, crossWriteDoEnq, crossWriteDeqLSB) val pendingCrossWriteMSB = tokenUpdate(crossWriteTokenMSB, crossWriteDoEnq, crossWriteDeqMSB) + val zvkPendingCrossWriteLSBLSB = Option.when(parameter.zvkEnable)(tokenUpdate(zvkCrossWriteTokenLSBLSB.get, crossWriteDoEnq, zvkCrossWriteDeqLSBLSB.get)) + val zvkPendingCrossWriteLSBMSB = Option.when(parameter.zvkEnable)(tokenUpdate(zvkCrossWriteTokenLSBMSB.get, crossWriteDoEnq, zvkCrossWriteDeqLSBMSB.get)) + val zvkPendingCrossWriteMSBLSB = Option.when(parameter.zvkEnable)(tokenUpdate(zvkCrossWriteTokenMSBLSB.get, crossWriteDoEnq, zvkCrossWriteDeqMSBLSB.get)) + val zvkPendingCrossWriteMSBMSB = Option.when(parameter.zvkEnable)(tokenUpdate(zvkCrossWriteTokenMSBMSB.get, crossWriteDoEnq, zvkCrossWriteDeqMSBMSB.get)) + // response & feedback update val responseDoEnq: UInt = maskAnd(enqReport.valid && !enqReport.bits.sSendResponse, enqOH).asUInt @@ -186,7 +207,11 @@ class SlotTokenManager(parameter: LaneParameter) extends Module { val pendingResponse = tokenUpdate(responseToken, responseDoEnq, responseDoDeq) // todo: Precise feedback val pendingFeedback = feedbackUpdate(feedbackToken, responseDoEnq, feedbackDoDeq) - pendingSlotWrite | pendingCrossWriteLSB | pendingCrossWriteMSB | pendingResponse | pendingFeedback + if(parameter.zvkEnable) { +
pendingSlotWrite | pendingCrossWriteLSB | pendingCrossWriteMSB | zvkPendingCrossWriteLSBLSB.get | zvkPendingCrossWriteLSBMSB.get | zvkPendingCrossWriteMSBLSB.get | zvkPendingCrossWriteMSBMSB.get | pendingResponse | pendingFeedback + } else { + pendingSlotWrite | pendingCrossWriteLSB | pendingCrossWriteMSB | pendingResponse | pendingFeedback + } } else { pendingSlotWrite } diff --git a/t1/src/laneStage/ZvkCrossReadUnit.scala b/t1/src/laneStage/ZvkCrossReadUnit.scala new file mode 100644 index 000000000..f6e790130 --- /dev/null +++ b/t1/src/laneStage/ZvkCrossReadUnit.scala @@ -0,0 +1,133 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: 2022 Jiuyang Liu + +package org.chipsalliance.t1.rtl.lane + +import chisel3._ +import chisel3.experimental.hierarchy.{instantiable, public} +import chisel3.util._ +import org.chipsalliance.t1.rtl.{LaneParameter, ReadBusData} + +class ZvkCrossReadState extends Bundle { + val sSendCrossReadResult: Vec[Bool] = Vec(4, Bool()) + val wCrossRead: Vec[Bool] = Vec(4, Bool()) +} + +@instantiable +class ZvkCrossReadUnit(parameter: LaneParameter) extends Module { + @public + val dataInputLSBLSB: DecoupledIO[UInt] = IO(Flipped(Decoupled(UInt(parameter.datapathWidth.W)))) + @public + val dataInputLSBMSB: DecoupledIO[UInt] = IO(Flipped(Decoupled(UInt(parameter.datapathWidth.W)))) + @public + val dataInputMSBLSB: DecoupledIO[UInt] = IO(Flipped(Decoupled(UInt(parameter.datapathWidth.W)))) + @public + val dataInputMSBMSB: DecoupledIO[UInt] = IO(Flipped(Decoupled(UInt(parameter.datapathWidth.W)))) + @public + val laneIndex: UInt = IO(Input(UInt(parameter.laneNumberBits.W))) + @public + val dataGroup: UInt = IO(Input(UInt(parameter.groupNumberBits.W))) + @public + val currentGroup: UInt = IO(Output(UInt(parameter.groupNumberBits.W))) + + @public + val readBusDequeue: Option[Vec[DecoupledIO[ReadBusData]]] = + Option.when(parameter.zvkEnable)( + IO( + Vec(4, Flipped(Decoupled(new ReadBusData(parameter: LaneParameter))) + )) + ) + 
@public + val readBusRequest: Option[Vec[DecoupledIO[ReadBusData]]] = + Option.when(parameter.zvkEnable)(IO(Vec(4, Decoupled(new ReadBusData(parameter))))) + + @public + val crossReadDequeue: DecoupledIO[UInt] = IO(Decoupled(UInt((parameter.datapathWidth * 4).W))) + @public + val crossReadStageFree: Bool = IO(Output(Bool())) + @public + val crossWriteState = IO(Output(new ZvkCrossReadState)) + + val stageValid: Bool = RegInit(false.B) + val sSendCrossReadResultLSBLSB, sSendCrossReadResultMSBLSB, wCrossReadLSBLSB, wCrossReadMSBLSB = RegInit(true.B) + val sSendCrossReadResultLSBMSB, sSendCrossReadResultMSBMSB, wCrossReadLSBMSB, wCrossReadMSBMSB = RegInit(true.B) + val stateVec: Seq[Bool] = Seq( + sSendCrossReadResultLSBLSB, + sSendCrossReadResultLSBMSB, + sSendCrossReadResultMSBLSB, + sSendCrossReadResultMSBMSB, + wCrossReadLSBLSB, + wCrossReadLSBMSB, + wCrossReadMSBLSB, + wCrossReadMSBMSB, + ) + val sendDataVec: Vec[UInt] = RegInit(VecInit(Seq.fill(4)(0.U(parameter.datapathWidth.W)))) + val groupCounter: UInt = RegInit(0.U(parameter.groupNumberBits.W)) + val receiveDataVec: Vec[UInt] = RegInit(VecInit(Seq.fill(4)(0.U(parameter.datapathWidth.W)))) + val sendState = Seq( + sSendCrossReadResultLSBLSB, + sSendCrossReadResultLSBMSB, + sSendCrossReadResultMSBLSB, + sSendCrossReadResultMSBMSB, + ) + val receiveState = Seq( + wCrossReadLSBLSB, + wCrossReadLSBMSB, + wCrossReadMSBLSB, + wCrossReadMSBMSB, + ) + + readBusRequest.get.zipWithIndex.foreach { case (port, index) => + port.valid := stageValid && !sendState(index) + port.bits.data := sendDataVec(index) + when(port.fire) { sendState(index) := true.B} + } + + readBusDequeue.get.zipWithIndex.foreach { case (port, index) => + when(port.fire) { + receiveState(index) := true.B + receiveDataVec(index) := port.bits.data + } + port.ready := !receiveState(index) + } + val allStateReady: Bool = stateVec.reduce(_ && _) + val stageReady: Bool = !stageValid || (allStateReady && crossReadDequeue.ready) + val allSourceValid: Bool = 
Seq( + dataInputLSBLSB.valid, + dataInputLSBMSB.valid, + dataInputMSBLSB.valid, + dataInputMSBMSB.valid, + ).reduce(_ && _) + val enqueueFire: Bool = stageReady && allSourceValid + dataInputLSBLSB.ready := allSourceValid && stageReady + dataInputLSBMSB.ready := allSourceValid && stageReady + dataInputMSBLSB.ready := allSourceValid && stageReady + dataInputMSBMSB.ready := allSourceValid && stageReady + + when(enqueueFire ^ crossReadDequeue.fire) { + stageValid := enqueueFire + } + when(enqueueFire) { + stateVec.foreach(_ := false.B) + sendDataVec := VecInit(Seq( + dataInputLSBLSB.bits, + dataInputLSBMSB.bits, + dataInputMSBLSB.bits, + dataInputMSBMSB.bits, + )) + groupCounter := dataGroup + } + currentGroup := groupCounter + crossReadDequeue.bits := receiveDataVec.asUInt + crossReadDequeue.valid := allStateReady && stageValid + crossReadStageFree := (!stageValid) && stateVec.reduce(_ && _) + + crossWriteState.sSendCrossReadResult(0) := sSendCrossReadResultLSBLSB + crossWriteState.sSendCrossReadResult(1) := sSendCrossReadResultLSBMSB + crossWriteState.sSendCrossReadResult(2) := sSendCrossReadResultMSBLSB + crossWriteState.sSendCrossReadResult(3) := sSendCrossReadResultMSBMSB + crossWriteState.wCrossRead(0) := wCrossReadLSBLSB + crossWriteState.wCrossRead(1) := wCrossReadLSBMSB + crossWriteState.wCrossRead(2) := wCrossReadMSBLSB + crossWriteState.wCrossRead(3) := wCrossReadMSBMSB +} diff --git a/t1/src/vrf/VRF.scala b/t1/src/vrf/VRF.scala index 724ef6372..85e157acd 100644 --- a/t1/src/vrf/VRF.scala +++ b/t1/src/vrf/VRF.scala @@ -51,6 +51,7 @@ case class VRFParam( datapathWidth: Int, chainingSize: Int, portFactor: Int, + zvkEnable: Boolean, ramType: RamType) extends SerializableModuleParameter { @@ -150,9 +151,15 @@ class VRF(val parameter: VRFParam) extends Module with SerializableModule[VRFPar val readCheck: Vec[VRFReadRequest] = IO(Vec(parameter.chainingSize * 3 + 2, Input( new VRFReadRequest(parameter.regNumBits, parameter.vrfOffsetBits, 
parameter.instructionIndexBits) ))) + @public + val zvkReadCheck: Option[Vec[VRFReadRequest]] = Option.when(parameter.zvkEnable)(IO(Vec(parameter.chainingSize * 3 + 4, Input( + new VRFReadRequest(parameter.regNumBits, parameter.vrfOffsetBits, parameter.instructionIndexBits) + )))) @public val readCheckResult: Vec[Bool] = IO(Vec(parameter.chainingSize * 3 + 2, Output(Bool()))) + @public + val zvkReadCheckResult: Option[Vec[Bool]] = Option.when(parameter.zvkEnable)(IO(Vec(parameter.chainingSize * 3 + 4, Output(Bool())))) /** VRF read results. */ @public @@ -274,6 +281,24 @@ class VRF(val parameter: VRFParam) extends Module with SerializableModule[VRFPar }.reduce(_ && _) } + if(parameter.zvkEnable) { + zvkReadCheck.get.zip(zvkReadCheckResult.get).foreach { case (req, res) => + val recordSelect = chainingRecord + val readRecord = + Mux1H(recordSelect.map(_.bits.instIndex === req.instructionIndex), recordSelect.map(_.bits)) + res := + recordSelect.zip(recordValidVec).zipWithIndex.map { + case ((r, f), recordIndex) => + val checkModule = Instantiate(new ChainingCheck(parameter)) + checkModule.read := req + checkModule.readRecord := readRecord + checkModule.record := r + checkModule.recordValid := f + checkModule.checkResult + }.reduce(_ && _) + } + } + val checkSize: Int = readRequests.size val (firstOccupied, secondOccupied) = readRequests.zipWithIndex.foldLeft( (0.U(parameter.rfBankNum.W), 0.U(parameter.rfBankNum.W))