/***************************************************************************************
* Copyright (c) 2020-2021 Institute of Computing Technology, Chinese Academy of Sciences
* Copyright (c) 2020-2021 Peng Cheng Laboratory
*
* XiangShan is licensed under Mulan PSL v2.
* You can use this software according to the terms and conditions of the Mulan PSL v2.
* You may obtain a copy of Mulan PSL v2 at:
* http://license.coscl.org.cn/MulanPSL2
*
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
* EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
* MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
*
* See the Mulan PSL v2 for more details.
***************************************************************************************/
package xiangshan.mem
import org.chipsalliance.cde.config.Parameters
import chisel3._
import chisel3.util._
import utils._
import utility._
import xiangshan._
import xiangshan.ExceptionNO._
import xiangshan.backend.Bundles.{DynInst, MemExuInput, MemExuOutput}
import xiangshan.backend.fu.PMPRespBundle
import xiangshan.backend.fu.FuConfig._
import xiangshan.backend.fu.FuType
import xiangshan.backend.ctrlblock.{DebugLsInfoBundle, LsTopdownInfo}
import xiangshan.backend.rob.RobPtr
import xiangshan.backend.fu.NewCSR._
import xiangshan.backend.fu.util.SdtrigExt
import xiangshan.mem.mdp._
import xiangshan.mem.Bundles._
import xiangshan.cache._
import xiangshan.cache.wpu.ReplayCarry
import xiangshan.cache.mmu._
class LoadToLsqReplayIO(implicit p: Parameters) extends XSBundle
with HasDCacheParameters
with HasTlbConst
{
// mshr refill index
val mshr_id = UInt(log2Up(cfg.nMissEntries).W)
// get full data from store queue and sbuffer
val full_fwd = Bool()
// wait for data from store inst's store queue index
val data_inv_sq_idx = new SqPtr
// wait for address from store queue index
val addr_inv_sq_idx = new SqPtr
// replay carry
val rep_carry = new ReplayCarry(nWays)
// data in last beat
val last_beat = Bool()
// replay cause
val cause = Vec(LoadReplayCauses.allCauses, Bool())
// performance debug information
val debug = new PerfDebugInfo
// tlb hint
val tlb_id = UInt(log2Up(loadfiltersize).W)
val tlb_full = Bool()
// alias
def mem_amb = cause(LoadReplayCauses.C_MA)
def tlb_miss = cause(LoadReplayCauses.C_TM)
def fwd_fail = cause(LoadReplayCauses.C_FF)
def dcache_rep = cause(LoadReplayCauses.C_DR)
def dcache_miss = cause(LoadReplayCauses.C_DM)
def wpu_fail = cause(LoadReplayCauses.C_WF)
def bank_conflict = cause(LoadReplayCauses.C_BC)
def rar_nack = cause(LoadReplayCauses.C_RAR)
def raw_nack = cause(LoadReplayCauses.C_RAW)
def misalign_nack = cause(LoadReplayCauses.C_MF)
def nuke = cause(LoadReplayCauses.C_NK)
def need_rep = cause.asUInt.orR
}
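// Hedged note: `cause` is a per-reason flag vector rather than an encoded enum, so
// several causes may be set at once and `need_rep` is simply their OR-reduction.
// A minimal consumer-side sketch (hypothetical, not part of this file):
//   when (rep.need_rep && rep.tlb_miss) { /* park the load until the tlb refill hint */ }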
class LoadToLsqIO(implicit p: Parameters) extends XSBundle {
// ldu -> lsq UncacheBuffer
val ldin = DecoupledIO(new LqWriteBundle)
// uncache-mmio -> ldu
val uncache = Flipped(DecoupledIO(new MemExuOutput))
val ld_raw_data = Input(new LoadDataFromLQBundle)
// uncache-nc -> ldu
val nc_ldin = Flipped(DecoupledIO(new LsPipelineBundle))
// storequeue -> ldu
val forward = new PipeLoadForwardQueryIO
// ldu -> lsq LQRAW
val stld_nuke_query = new LoadNukeQueryIO
// ldu -> lsq LQRAR
val ldld_nuke_query = new LoadNukeQueryIO
}
class LoadToLoadIO(implicit p: Parameters) extends XSBundle {
val valid = Bool()
val data = UInt(XLEN.W) // the load-to-load fast path is limited to ld (64-bit) results used as vaddr src1 only
val dly_ld_err = Bool()
}
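// Load-to-load forwarding: the 64-bit result of a completing ld is fed back as a
// speculative base address (src1) for a dependent load, skipping a full issue round
// trip; `dly_ld_err` marks a delayed load error detected on the forwarded data.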
class LoadUnitTriggerIO(implicit p: Parameters) extends XSBundle {
val tdata2 = Input(UInt(64.W))
val matchType = Input(UInt(2.W))
val tEnable = Input(Bool()) // timing is calculated before this
val addrHit = Output(Bool())
}
class LoadUnit(implicit p: Parameters) extends XSModule
with HasLoadHelper
with HasPerfEvents
with HasDCacheParameters
with HasCircularQueuePtrHelper
with HasVLSUParameters
with SdtrigExt
{
val io = IO(new Bundle() {
// control
val redirect = Flipped(ValidIO(new Redirect))
val csrCtrl = Flipped(new CustomCSRCtrlIO)
// int issue path
val ldin = Flipped(Decoupled(new MemExuInput))
val ldout = Decoupled(new MemExuOutput)
// vec issue path
val vecldin = Flipped(Decoupled(new VecPipeBundle))
val vecldout = Decoupled(new VecPipelineFeedbackIO(isVStore = false))
// misalignBuffer issue path
val misalign_ldin = Flipped(Decoupled(new LsPipelineBundle))
val misalign_ldout = Valid(new LqWriteBundle)
// data path
val tlb = new TlbRequestIO(2)
val pmp = Flipped(new PMPRespBundle()) // arrives at the same time as the tlb response now
val dcache = new DCacheLoadIO
val sbuffer = new LoadForwardQueryIO
val ubuffer = new LoadForwardQueryIO
val lsq = new LoadToLsqIO
val tl_d_channel = Input(new DcacheToLduForwardIO)
val forward_mshr = Flipped(new LduToMissqueueForwardIO)
// val refill = Flipped(ValidIO(new Refill))
val l2_hint = Input(Valid(new L2ToL1Hint))
val tlb_hint = Flipped(new TlbHintReq)
// fast wakeup
// TODO: implement vector fast wakeup
val fast_uop = ValidIO(new DynInst) // early wakeup signal generated in load_s1, send to RS in load_s2
// trigger
val fromCsrTrigger = Input(new CsrTriggerBundle)
// prefetch
val prefetch_train = ValidIO(new LdPrefetchTrainBundle()) // provide prefetch info to sms
val prefetch_train_l1 = ValidIO(new LdPrefetchTrainBundle()) // provide prefetch info to stream & stride
// speculative for gated control
val s1_prefetch_spec = Output(Bool())
val s2_prefetch_spec = Output(Bool())
val prefetch_req = Flipped(ValidIO(new L1PrefetchReq)) // hardware prefetch to l1 cache req
val canAcceptLowConfPrefetch = Output(Bool())
val canAcceptHighConfPrefetch = Output(Bool())
// ifetchPrefetch
val ifetchPrefetch = ValidIO(new SoftIfetchPrefetchBundle)
// load to load fast path
val l2l_fwd_in = Input(new LoadToLoadIO)
val l2l_fwd_out = Output(new LoadToLoadIO)
val ld_fast_match = Input(Bool())
val ld_fast_fuOpType = Input(UInt())
val ld_fast_imm = Input(UInt(12.W))
// rs feedback
val wakeup = ValidIO(new DynInst)
val feedback_fast = ValidIO(new RSFeedback) // stage 2
val feedback_slow = ValidIO(new RSFeedback) // stage 3
val ldCancel = Output(new LoadCancelIO()) // used to cancel the uops woken up by this load, and to cancel the load itself
// load ecc error
val s3_dly_ld_err = Output(Bool()) // Note that io.s3_dly_ld_err and io.lsq.s3_dly_ld_err are different
// schedule error query
val stld_nuke_query = Flipped(Vec(StorePipelineWidth, Valid(new StoreNukeQueryIO)))
// queue-based replay
val replay = Flipped(Decoupled(new LsPipelineBundle))
val lq_rep_full = Input(Bool())
// misc
val s2_ptr_chasing = Output(Bool()) // provide right pc for hw prefetch
// Load fast replay path
val fast_rep_in = Flipped(Decoupled(new LqWriteBundle))
val fast_rep_out = Decoupled(new LqWriteBundle)
// to misalign buffer
val misalign_buf = Decoupled(new LqWriteBundle)
// Load RAR rollback
val rollback = Valid(new Redirect)
// perf
val debug_ls = Output(new DebugLsInfoBundle)
val lsTopdownInfo = Output(new LsTopdownInfo)
val correctMissTrain = Input(Bool())
})
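// Pipeline overview (broadly, per the stage comments below): s0 arbitrates among the
// flow sources and launches the tlb and dcache queries; s1 consumes the tlb response
// and issues forwarding and st-ld nuke queries; the remaining stages handle dcache
// responses, replay cause selection, and writeback.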
val s1_ready, s2_ready, s3_ready = WireInit(false.B)
// Pipeline
// --------------------------------------------------------------------------------
// stage 0
// --------------------------------------------------------------------------------
// generate addr, use addr to query DCache and DTLB
val s0_valid = Wire(Bool())
val s0_mmio_select = Wire(Bool())
val s0_nc_select = Wire(Bool())
val s0_misalign_select = Wire(Bool())
val s0_kill = Wire(Bool())
val s0_can_go = s1_ready
val s0_fire = s0_valid && s0_can_go
val s0_mmio_fire = s0_mmio_select && s0_can_go
val s0_nc_fire = s0_nc_select && s0_can_go
val s0_out = Wire(new LqWriteBundle)
val s0_tlb_valid = Wire(Bool())
val s0_tlb_hlv = Wire(Bool())
val s0_tlb_hlvx = Wire(Bool())
val s0_tlb_vaddr = Wire(UInt(VAddrBits.W))
val s0_tlb_fullva = Wire(UInt(XLEN.W))
val s0_dcache_vaddr = Wire(UInt(VAddrBits.W))
val s0_is128bit = Wire(Bool())
val s0_misalign_wakeup_fire = s0_misalign_select && s0_can_go &&
io.dcache.req.ready &&
io.misalign_ldin.bits.misalignNeedWakeUp
// flow source bundle
class FlowSource extends Bundle {
val vaddr = UInt(VAddrBits.W)
val mask = UInt((VLEN/8).W)
val uop = new DynInst
val try_l2l = Bool()
val has_rob_entry = Bool()
val rep_carry = new ReplayCarry(nWays)
val mshrid = UInt(log2Up(cfg.nMissEntries).W)
val isFirstIssue = Bool()
val fast_rep = Bool()
val ld_rep = Bool()
val l2l_fwd = Bool()
val prf = Bool()
val prf_rd = Bool()
val prf_wr = Bool()
val prf_i = Bool()
val sched_idx = UInt(log2Up(LoadQueueReplaySize+1).W)
// Record the issue port idx of load issue queue. This signal is used by load cancel.
val deqPortIdx = UInt(log2Ceil(LoadPipelineWidth).W)
val frm_mabuf = Bool()
// vec only
val isvec = Bool()
val is128bit = Bool()
val uop_unit_stride_fof = Bool()
val reg_offset = UInt(vOffsetBits.W)
val vecActive = Bool() // 1: active vector element or scalar mem operation, 0: inactive vector element
val is_first_ele = Bool()
// val flowPtr = new VlflowPtr
val usSecondInv = Bool()
val mbIndex = UInt(vlmBindexBits.W)
val elemIdx = UInt(elemIdxBits.W)
val elemIdxInsideVd = UInt(elemIdxBits.W)
val alignedType = UInt(alignTypeBits.W)
val vecBaseVaddr = UInt(VAddrBits.W)
//for Svpbmt NC
val isnc = Bool()
val paddr = UInt(PAddrBits.W)
val data = UInt((VLEN+1).W)
}
val s0_sel_src = Wire(new FlowSource)
// load flow select/gen
// src 0: misalignBuffer load (io.misalign_ldin)
// src 1: super load replayed by LSQ (cache miss replay) (io.replay)
// src 2: fast load replay (io.fast_rep_in)
// src 3: mmio (io.lsq.uncache)
// src 4: nc (io.lsq.nc_ldin)
// src 5: load replayed by LSQ (io.replay)
// src 6: hardware prefetch from prefetcher (high confidence) (io.prefetch_req)
// NOTE: vec/int loads are now sent from the same RS.
// A vec load will be split into multiple uops,
// so once one uop has been issued,
// the remaining uops should have higher priority
// src 7: vec read from RS (io.vecldin)
// src 8: int read / software prefetch first issue from RS (io.ldin)
// src 9: load tries pointer chasing when there is no issued or replayed load (io.l2l_fwd_in)
// src10: hardware prefetch from prefetcher (low confidence) (io.prefetch_req)
// priority: high to low
val s0_rep_stall = io.ldin.valid && isAfter(io.replay.bits.uop.lqIdx, io.ldin.bits.uop.lqIdx) ||
io.vecldin.valid && isAfter(io.replay.bits.uop.lqIdx, io.vecldin.bits.uop.lqIdx)
private val SRC_NUM = 11
private val Seq(
mab_idx, super_rep_idx, fast_rep_idx, mmio_idx, nc_idx, lsq_rep_idx,
high_pf_idx, vec_iss_idx, int_iss_idx, l2l_fwd_idx, low_pf_idx
) = (0 until SRC_NUM).toSeq
// load flow source valid
val s0_src_valid_vec = WireInit(VecInit(Seq(
io.misalign_ldin.valid,
io.replay.valid && io.replay.bits.forward_tlDchannel,
io.fast_rep_in.valid,
io.lsq.uncache.valid,
io.lsq.nc_ldin.valid,
io.replay.valid && !io.replay.bits.forward_tlDchannel && !s0_rep_stall,
io.prefetch_req.valid && io.prefetch_req.bits.confidence > 0.U,
io.vecldin.valid,
io.ldin.valid, // int flow first issue or software prefetch
io.l2l_fwd_in.valid,
io.prefetch_req.valid && io.prefetch_req.bits.confidence === 0.U,
)))
// load flow source ready
val s0_src_ready_vec = Wire(Vec(SRC_NUM, Bool()))
s0_src_ready_vec(0) := true.B
for(i <- 1 until SRC_NUM){
s0_src_ready_vec(i) := !s0_src_valid_vec.take(i).reduce(_ || _)
}
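// Fixed-priority arbitration: source i is ready only if no higher-priority source
// (any index < i) is valid, so `valid && ready` below forms a one-hot select.
// Example: if only fast_rep_idx (2) and int_iss_idx (8) are valid, index 2 wins.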
// load flow source select (OH)
val s0_src_select_vec = WireInit(VecInit((0 until SRC_NUM).map{i => s0_src_valid_vec(i) && s0_src_ready_vec(i)}))
val s0_hw_prf_select = s0_src_select_vec(high_pf_idx) || s0_src_select_vec(low_pf_idx)
val s0_tlb_no_query = s0_hw_prf_select || s0_sel_src.prf_i ||
s0_src_select_vec(fast_rep_idx) || s0_src_select_vec(mmio_idx) ||
s0_src_select_vec(nc_idx)
s0_valid := !s0_kill && (s0_src_select_vec(nc_idx) || ((
s0_src_valid_vec(mab_idx) ||
s0_src_valid_vec(super_rep_idx) ||
s0_src_valid_vec(fast_rep_idx) ||
s0_src_valid_vec(lsq_rep_idx) ||
s0_src_valid_vec(high_pf_idx) ||
s0_src_valid_vec(vec_iss_idx) ||
s0_src_valid_vec(int_iss_idx) ||
s0_src_valid_vec(l2l_fwd_idx) ||
s0_src_valid_vec(low_pf_idx)
) && !s0_src_select_vec(mmio_idx) && io.dcache.req.ready))
s0_mmio_select := s0_src_select_vec(mmio_idx) && !s0_kill
s0_nc_select := s0_src_select_vec(nc_idx) && !s0_kill
// Determine whether this is an NC access carrying data.
// If true, it comes from `io.lsq.nc_ldin` or `io.fast_rep_in`.
val s0_nc_with_data = s0_sel_src.isnc && !s0_kill
s0_misalign_select := s0_src_select_vec(mab_idx) && !s0_kill
// if is hardware prefetch or fast replay, don't send valid to tlb
s0_tlb_valid := (
s0_src_valid_vec(mab_idx) ||
s0_src_valid_vec(super_rep_idx) ||
s0_src_valid_vec(lsq_rep_idx) ||
s0_src_valid_vec(vec_iss_idx) ||
s0_src_valid_vec(int_iss_idx) ||
s0_src_valid_vec(l2l_fwd_idx)
) && io.dcache.req.ready
// i.e., s0's output is ready and the dcache can accept the request
val s0_try_ptr_chasing = s0_src_select_vec(l2l_fwd_idx)
val s0_do_try_ptr_chasing = s0_try_ptr_chasing && s0_can_go && io.dcache.req.ready
val s0_ptr_chasing_vaddr = io.l2l_fwd_in.data(5, 0) +& io.ld_fast_imm(5, 0)
val s0_ptr_chasing_canceled = WireInit(false.B)
s0_kill := s0_ptr_chasing_canceled
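// Pointer chasing adds only the low 6 bits (the offset inside a 64-byte cache line);
// the widening `+&` keeps the carry so a line-crossing sum can be detected. The
// speculation is presumably validated in s1, which drives s0_ptr_chasing_canceled
// (assigned later in the file) and thus s0_kill.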
// prefetch related ctrl signal
io.canAcceptLowConfPrefetch := s0_src_ready_vec(low_pf_idx) && io.dcache.req.ready
io.canAcceptHighConfPrefetch := s0_src_ready_vec(high_pf_idx) && io.dcache.req.ready
// query DTLB
io.tlb.req.valid := s0_tlb_valid
io.tlb.req.bits.cmd := Mux(s0_sel_src.prf,
Mux(s0_sel_src.prf_wr, TlbCmd.write, TlbCmd.read),
TlbCmd.read
)
io.tlb.req.bits.isPrefetch := s0_sel_src.prf
io.tlb.req.bits.vaddr := s0_tlb_vaddr
io.tlb.req.bits.fullva := s0_tlb_fullva
io.tlb.req.bits.checkfullva := s0_src_select_vec(vec_iss_idx) || s0_src_select_vec(int_iss_idx)
io.tlb.req.bits.hyperinst := s0_tlb_hlv
io.tlb.req.bits.hlvx := s0_tlb_hlvx
io.tlb.req.bits.size := Mux(s0_sel_src.isvec, s0_sel_src.alignedType(2,0), LSUOpType.size(s0_sel_src.uop.fuOpType))
io.tlb.req.bits.kill := s0_kill || s0_tlb_no_query // if the request does not need translation, kill it
io.tlb.req.bits.memidx.is_ld := true.B
io.tlb.req.bits.memidx.is_st := false.B
io.tlb.req.bits.memidx.idx := s0_sel_src.uop.lqIdx.value
io.tlb.req.bits.debug.robIdx := s0_sel_src.uop.robIdx
io.tlb.req.bits.no_translate := s0_tlb_no_query // hardware prefetch and fast replay do not need translation; this signal is still needed for the pmp check
io.tlb.req.bits.debug.pc := s0_sel_src.uop.pc
io.tlb.req.bits.debug.isFirstIssue := s0_sel_src.isFirstIssue
// query DCache
io.dcache.req.valid := s0_valid && !s0_sel_src.prf_i && !s0_nc_with_data
io.dcache.req.bits.cmd := Mux(s0_sel_src.prf_rd,
MemoryOpConstants.M_PFR,
Mux(s0_sel_src.prf_wr, MemoryOpConstants.M_PFW, MemoryOpConstants.M_XRD)
)
io.dcache.req.bits.vaddr := s0_dcache_vaddr
io.dcache.req.bits.vaddr_dup := s0_dcache_vaddr
io.dcache.req.bits.mask := s0_sel_src.mask
io.dcache.req.bits.data := DontCare
io.dcache.req.bits.isFirstIssue := s0_sel_src.isFirstIssue
io.dcache.req.bits.instrtype := Mux(s0_sel_src.prf, DCACHE_PREFETCH_SOURCE.U, LOAD_SOURCE.U)
io.dcache.req.bits.debug_robIdx := s0_sel_src.uop.robIdx.value
io.dcache.req.bits.replayCarry := s0_sel_src.rep_carry
io.dcache.req.bits.id := DontCare // TODO: update cache meta
io.dcache.req.bits.lqIdx := s0_sel_src.uop.lqIdx
io.dcache.pf_source := Mux(s0_hw_prf_select, io.prefetch_req.bits.pf_source.value, L1_HW_PREFETCH_NULL)
io.dcache.is128Req := s0_is128bit
// load flow priority mux
def fromNullSource(): FlowSource = {
val out = WireInit(0.U.asTypeOf(new FlowSource))
out
}
def fromMisAlignBufferSource(src: LsPipelineBundle): FlowSource = {
val out = WireInit(0.U.asTypeOf(new FlowSource))
out.vaddr := src.vaddr
out.mask := src.mask
out.uop := src.uop
out.try_l2l := false.B
out.has_rob_entry := false.B
out.rep_carry := src.replayCarry
out.mshrid := src.mshrid
out.frm_mabuf := true.B
out.isFirstIssue := false.B
out.fast_rep := false.B
out.ld_rep := false.B
out.l2l_fwd := false.B
out.prf := false.B
out.prf_rd := false.B
out.prf_wr := false.B
out.sched_idx := src.schedIndex
out.isvec := src.isvec
out.is128bit := src.is128bit
out.vecActive := true.B
out
}
def fromFastReplaySource(src: LqWriteBundle): FlowSource = {
val out = WireInit(0.U.asTypeOf(new FlowSource))
out.vaddr := src.vaddr
out.paddr := src.paddr
out.mask := src.mask
out.uop := src.uop
out.try_l2l := false.B
out.has_rob_entry := src.hasROBEntry
out.rep_carry := src.rep_info.rep_carry
out.mshrid := src.rep_info.mshr_id
out.frm_mabuf := src.isFrmMisAlignBuf
out.isFirstIssue := false.B
out.fast_rep := true.B
out.ld_rep := src.isLoadReplay
out.l2l_fwd := false.B
out.prf := LSUOpType.isPrefetch(src.uop.fuOpType) && !src.isvec
out.prf_rd := src.uop.fuOpType === LSUOpType.prefetch_r
out.prf_wr := src.uop.fuOpType === LSUOpType.prefetch_w
out.prf_i := false.B
out.sched_idx := src.schedIndex
out.isvec := src.isvec
out.is128bit := src.is128bit
out.uop_unit_stride_fof := src.uop_unit_stride_fof
out.reg_offset := src.reg_offset
out.vecActive := src.vecActive
out.is_first_ele := src.is_first_ele
out.usSecondInv := src.usSecondInv
out.mbIndex := src.mbIndex
out.elemIdx := src.elemIdx
out.elemIdxInsideVd := src.elemIdxInsideVd
out.alignedType := src.alignedType
out.isnc := src.nc
out.data := src.data
out
}
// TODO: implement vector mmio
def fromMmioSource(src: MemExuOutput) = {
val out = WireInit(0.U.asTypeOf(new FlowSource))
out.mask := 0.U
out.uop := src.uop
out.try_l2l := false.B
out.has_rob_entry := false.B
out.rep_carry := 0.U.asTypeOf(out.rep_carry)
out.mshrid := 0.U
out.frm_mabuf := false.B
out.isFirstIssue := false.B
out.fast_rep := false.B
out.ld_rep := false.B
out.l2l_fwd := false.B
out.prf := false.B
out.prf_rd := false.B
out.prf_wr := false.B
out.prf_i := false.B
out.sched_idx := 0.U
out.vecActive := true.B
out
}
def fromNcSource(src: LsPipelineBundle): FlowSource = {
val out = WireInit(0.U.asTypeOf(new FlowSource))
out.vaddr := src.vaddr
out.paddr := src.paddr
out.mask := genVWmask(src.vaddr, src.uop.fuOpType(1,0))
out.uop := src.uop
out.has_rob_entry := true.B
out.sched_idx := src.schedIndex
out.isvec := src.isvec
out.is128bit := src.is128bit
out.vecActive := src.vecActive
out.isnc := true.B
out.data := src.data
out
}
def fromNormalReplaySource(src: LsPipelineBundle): FlowSource = {
val out = WireInit(0.U.asTypeOf(new FlowSource))
out.mask := Mux(src.isvec, src.mask, genVWmask(src.vaddr, src.uop.fuOpType(1, 0)))
out.uop := src.uop
out.try_l2l := false.B
out.has_rob_entry := true.B
out.rep_carry := src.replayCarry
out.mshrid := src.mshrid
out.frm_mabuf := false.B
out.isFirstIssue := false.B
out.fast_rep := false.B
out.ld_rep := true.B
out.l2l_fwd := false.B
out.prf := LSUOpType.isPrefetch(src.uop.fuOpType) && !src.isvec
out.prf_rd := src.uop.fuOpType === LSUOpType.prefetch_r
out.prf_wr := src.uop.fuOpType === LSUOpType.prefetch_w
out.prf_i := false.B
out.sched_idx := src.schedIndex
out.isvec := src.isvec
out.is128bit := src.is128bit
out.uop_unit_stride_fof := src.uop_unit_stride_fof
out.reg_offset := src.reg_offset
out.vecActive := src.vecActive
out.is_first_ele := src.is_first_ele
out.usSecondInv := src.usSecondInv
out.mbIndex := src.mbIndex
out.elemIdx := src.elemIdx
out.elemIdxInsideVd := src.elemIdxInsideVd
out.alignedType := src.alignedType
out
}
// TODO: implement vector prefetch
def fromPrefetchSource(src: L1PrefetchReq): FlowSource = {
val out = WireInit(0.U.asTypeOf(new FlowSource))
out.mask := 0.U
out.uop := DontCare
out.try_l2l := false.B
out.has_rob_entry := false.B
out.rep_carry := 0.U.asTypeOf(out.rep_carry)
out.mshrid := 0.U
out.frm_mabuf := false.B
out.isFirstIssue := false.B
out.fast_rep := false.B
out.ld_rep := false.B
out.l2l_fwd := false.B
out.prf := true.B
out.prf_rd := !src.is_store
out.prf_wr := src.is_store
out.prf_i := false.B
out.sched_idx := 0.U
out
}
def fromVecIssueSource(src: VecPipeBundle): FlowSource = {
val out = WireInit(0.U.asTypeOf(new FlowSource))
out.mask := src.mask
out.uop := src.uop
out.try_l2l := false.B
out.has_rob_entry := true.B
// TODO: VLSU, implement replay carry
out.rep_carry := 0.U.asTypeOf(out.rep_carry)
out.mshrid := 0.U
out.frm_mabuf := false.B
// TODO: VLSU, implement first issue
// out.isFirstIssue := src.isFirstIssue
out.fast_rep := false.B
out.ld_rep := false.B
out.l2l_fwd := false.B
out.prf := false.B
out.prf_rd := false.B
out.prf_wr := false.B
out.prf_i := false.B
out.sched_idx := 0.U
// Vector load interface
out.isvec := true.B
// whether a vector access takes the 128-bit path is derived from its aligned type
out.is128bit := is128Bit(src.alignedType)
out.uop_unit_stride_fof := src.uop_unit_stride_fof
// out.rob_idx_valid := src.rob_idx_valid
// out.inner_idx := src.inner_idx
// out.rob_idx := src.rob_idx
out.reg_offset := src.reg_offset
// out.offset := src.offset
out.vecActive := src.vecActive
out.is_first_ele := src.is_first_ele
// out.flowPtr := src.flowPtr
out.usSecondInv := src.usSecondInv
out.mbIndex := src.mBIndex
out.elemIdx := src.elemIdx
out.elemIdxInsideVd := src.elemIdxInsideVd
out.vecBaseVaddr := src.basevaddr
out.alignedType := src.alignedType
out
}
def fromIntIssueSource(src: MemExuInput): FlowSource = {
val out = WireInit(0.U.asTypeOf(new FlowSource))
val addr = io.ldin.bits.src(0) + SignExt(io.ldin.bits.uop.imm(11, 0), VAddrBits)
out.mask := genVWmask(addr, src.uop.fuOpType(1,0))
out.uop := src.uop
out.try_l2l := false.B
out.has_rob_entry := true.B
out.rep_carry := 0.U.asTypeOf(out.rep_carry)
out.mshrid := 0.U
out.frm_mabuf := false.B
out.isFirstIssue := true.B
out.fast_rep := false.B
out.ld_rep := false.B
out.l2l_fwd := false.B
out.prf := LSUOpType.isPrefetch(src.uop.fuOpType)
out.prf_rd := src.uop.fuOpType === LSUOpType.prefetch_r
out.prf_wr := src.uop.fuOpType === LSUOpType.prefetch_w
out.prf_i := src.uop.fuOpType === LSUOpType.prefetch_i
out.sched_idx := 0.U
out.vecActive := true.B // true for scalar load
out
}
// TODO: implement vector l2l
def fromLoadToLoadSource(src: LoadToLoadIO): FlowSource = {
val out = WireInit(0.U.asTypeOf(new FlowSource))
out.mask := genVWmask(0.U, LSUOpType.ld)
// When there's no valid instruction from RS and LSQ, we try the load-to-load forwarding.
// Assume the pointer chasing is always ld.
out.uop.fuOpType := LSUOpType.ld
out.try_l2l := true.B
// we don't care about out.isFirstIssue, out.rsIdx, or s0_sqIdx in S0 when trying pointer chasing
// because these signals will be updated in S1
out.has_rob_entry := false.B
out.mshrid := 0.U
out.frm_mabuf := false.B
out.rep_carry := 0.U.asTypeOf(out.rep_carry)
out.isFirstIssue := true.B
out.fast_rep := false.B
out.ld_rep := false.B
out.l2l_fwd := true.B
out.prf := false.B
out.prf_rd := false.B
out.prf_wr := false.B
out.prf_i := false.B
out.sched_idx := 0.U
out
}
// set default
val s0_src_selector = WireInit(s0_src_valid_vec)
if (!EnableLoadToLoadForward) { s0_src_selector(l2l_fwd_idx) := false.B }
val s0_src_format = Seq(
fromMisAlignBufferSource(io.misalign_ldin.bits),
fromNormalReplaySource(io.replay.bits),
fromFastReplaySource(io.fast_rep_in.bits),
fromMmioSource(io.lsq.uncache.bits),
fromNcSource(io.lsq.nc_ldin.bits),
fromNormalReplaySource(io.replay.bits),
fromPrefetchSource(io.prefetch_req.bits),
fromVecIssueSource(io.vecldin.bits),
fromIntIssueSource(io.ldin.bits),
(if (EnableLoadToLoadForward) fromLoadToLoadSource(io.l2l_fwd_in) else fromNullSource()),
fromPrefetchSource(io.prefetch_req.bits)
)
s0_sel_src := ParallelPriorityMux(s0_src_selector, s0_src_format)
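// ParallelPriorityMux pairs each selector bit with the formatted FlowSource at the
// same position; since s0_src_selector follows the high-to-low priority order above,
// the highest-priority valid source's fields are selected.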
// fast replay and hardware prefetch don't need to query tlb
val int_issue_vaddr = io.ldin.bits.src(0) + SignExt(io.ldin.bits.uop.imm(11, 0), VAddrBits)
val int_vec_vaddr = Mux(s0_src_valid_vec(vec_iss_idx), io.vecldin.bits.vaddr(VAddrBits - 1, 0), int_issue_vaddr)
s0_tlb_vaddr := Mux(
s0_src_valid_vec(mab_idx),
io.misalign_ldin.bits.vaddr,
Mux(
s0_src_valid_vec(super_rep_idx) || s0_src_valid_vec(lsq_rep_idx),
io.replay.bits.vaddr,
int_vec_vaddr
)
)
s0_dcache_vaddr := Mux(
s0_src_select_vec(fast_rep_idx), io.fast_rep_in.bits.vaddr,
Mux(s0_hw_prf_select, io.prefetch_req.bits.getVaddr(),
Mux(s0_src_select_vec(nc_idx), io.lsq.nc_ldin.bits.vaddr, // not for dcache access, but for address alignment check
s0_tlb_vaddr))
)
val s0_alignType = Mux(s0_sel_src.isvec, s0_sel_src.alignedType(1,0), s0_sel_src.uop.fuOpType(1, 0))
val s0_addr_aligned = LookupTree(s0_alignType, List(
"b00".U -> true.B, //b
"b01".U -> (s0_dcache_vaddr(0) === 0.U), //h
"b10".U -> (s0_dcache_vaddr(1, 0) === 0.U), //w
"b11".U -> (s0_dcache_vaddr(2, 0) === 0.U) //d
))
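// Example: a word access (s0_alignType = "b10") with vaddr ending in "b10" has
// vaddr(1,0) =/= 0, so s0_addr_aligned is false and the access is misaligned.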
// address align check
XSError(s0_sel_src.isvec && s0_dcache_vaddr(3, 0) =/= 0.U && s0_sel_src.alignedType(2), "unit-stride 128 bit element is not aligned!")
val s0_check_vaddr_low = s0_dcache_vaddr(4, 0)
val s0_check_vaddr_Up_low = LookupTree(s0_alignType, List(
"b00".U -> 0.U,
"b01".U -> 1.U,
"b10".U -> 3.U,
"b11".U -> 7.U
)) + s0_check_vaddr_low
//TODO vec?
val s0_rs_cross16Bytes = s0_check_vaddr_Up_low(4) =/= s0_check_vaddr_low(4)
val s0_misalignWith16Byte = !s0_rs_cross16Bytes && !s0_addr_aligned && !s0_hw_prf_select
val s0_misalignNeedWakeUp = s0_sel_src.frm_mabuf && io.misalign_ldin.bits.misalignNeedWakeUp
val s0_finalSplit = s0_sel_src.frm_mabuf && io.misalign_ldin.bits.isFinalSplit
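// A misaligned access that does not cross a 16-byte boundary (s0_misalignWith16Byte)
// can be serviced as a single 128-bit dcache access (see s0_is128bit below) instead
// of being split by the misalign buffer.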
s0_is128bit := s0_sel_src.is128bit || s0_misalignWith16Byte
// only the first issue of int / vec load instructions needs to check the full vaddr
s0_tlb_fullva := Mux(s0_src_valid_vec(mab_idx),
io.misalign_ldin.bits.fullva,
Mux(s0_src_select_vec(vec_iss_idx),
io.vecldin.bits.vaddr,
Mux(
s0_src_select_vec(int_iss_idx),
io.ldin.bits.src(0) + SignExt(io.ldin.bits.uop.imm(11, 0), XLEN),
s0_dcache_vaddr
)
)
)
s0_tlb_hlv := Mux(
s0_src_valid_vec(mab_idx),
LSUOpType.isHlv(io.misalign_ldin.bits.uop.fuOpType),
Mux(
s0_src_valid_vec(super_rep_idx) || s0_src_valid_vec(lsq_rep_idx),
LSUOpType.isHlv(io.replay.bits.uop.fuOpType),
Mux(
s0_src_valid_vec(int_iss_idx),
LSUOpType.isHlv(io.ldin.bits.uop.fuOpType),
false.B
)
)
)
s0_tlb_hlvx := Mux(
s0_src_valid_vec(mab_idx),
LSUOpType.isHlvx(io.misalign_ldin.bits.uop.fuOpType),
Mux(
s0_src_valid_vec(super_rep_idx) || s0_src_valid_vec(lsq_rep_idx),
LSUOpType.isHlvx(io.replay.bits.uop.fuOpType),
Mux(
s0_src_valid_vec(int_iss_idx),
LSUOpType.isHlvx(io.ldin.bits.uop.fuOpType),
false.B
)
)
)
// accept load flow if dcache ready (tlb is always ready)
// TODO: prefetch need writeback to loadQueueFlag
s0_out := DontCare
s0_out.vaddr := Mux(s0_nc_with_data, s0_sel_src.vaddr, s0_dcache_vaddr)
s0_out.fullva := s0_tlb_fullva
s0_out.mask := s0_sel_src.mask
s0_out.uop := s0_sel_src.uop
s0_out.isFirstIssue := s0_sel_src.isFirstIssue
s0_out.hasROBEntry := s0_sel_src.has_rob_entry
s0_out.isPrefetch := s0_sel_src.prf
s0_out.isHWPrefetch := s0_hw_prf_select
s0_out.isFastReplay := s0_sel_src.fast_rep
s0_out.isLoadReplay := s0_sel_src.ld_rep
s0_out.isFastPath := s0_sel_src.l2l_fwd
s0_out.mshrid := s0_sel_src.mshrid
s0_out.isvec := s0_sel_src.isvec
s0_out.is128bit := s0_is128bit
s0_out.isFrmMisAlignBuf := s0_sel_src.frm_mabuf
s0_out.uop_unit_stride_fof := s0_sel_src.uop_unit_stride_fof
s0_out.paddr :=
Mux(s0_src_valid_vec(nc_idx), io.lsq.nc_ldin.bits.paddr,
Mux(s0_src_valid_vec(fast_rep_idx), io.fast_rep_in.bits.paddr,
Mux(s0_src_select_vec(int_iss_idx) && s0_sel_src.prf_i, 0.U,
io.prefetch_req.bits.paddr))) // only for nc, fast_rep, prefetch
s0_out.tlbNoQuery := s0_tlb_no_query
// s0_out.rob_idx_valid := s0_rob_idx_valid
// s0_out.inner_idx := s0_inner_idx
// s0_out.rob_idx := s0_rob_idx
s0_out.reg_offset := s0_sel_src.reg_offset
// s0_out.offset := s0_offset
s0_out.vecActive := s0_sel_src.vecActive
s0_out.usSecondInv := s0_sel_src.usSecondInv
s0_out.is_first_ele := s0_sel_src.is_first_ele
s0_out.elemIdx := s0_sel_src.elemIdx
s0_out.elemIdxInsideVd := s0_sel_src.elemIdxInsideVd
s0_out.alignedType := s0_sel_src.alignedType
s0_out.mbIndex := s0_sel_src.mbIndex
s0_out.vecBaseVaddr := s0_sel_src.vecBaseVaddr
// s0_out.flowPtr := s0_sel_src.flowPtr
s0_out.uop.exceptionVec(loadAddrMisaligned) := (!s0_addr_aligned || s0_sel_src.uop.exceptionVec(loadAddrMisaligned)) && s0_sel_src.vecActive && !s0_misalignWith16Byte
s0_out.isMisalign := (!s0_addr_aligned || s0_sel_src.uop.exceptionVec(loadAddrMisaligned)) && s0_sel_src.vecActive
s0_out.forward_tlDchannel := s0_src_select_vec(super_rep_idx)
when(io.tlb.req.valid && s0_sel_src.isFirstIssue) {
s0_out.uop.debugInfo.tlbFirstReqTime := GTimer()
}.otherwise{
s0_out.uop.debugInfo.tlbFirstReqTime := s0_sel_src.uop.debugInfo.tlbFirstReqTime
}
s0_out.schedIndex := s0_sel_src.sched_idx
//for Svpbmt Nc
s0_out.nc := s0_sel_src.isnc
s0_out.data := s0_sel_src.data
s0_out.misalignWith16Byte := s0_misalignWith16Byte
s0_out.misalignNeedWakeUp := s0_misalignNeedWakeUp
s0_out.isFinalSplit := s0_finalSplit
// load fast replay
io.fast_rep_in.ready := (s0_can_go && io.dcache.req.ready && s0_src_ready_vec(fast_rep_idx))
// mmio
io.lsq.uncache.ready := s0_mmio_fire
io.lsq.nc_ldin.ready := s0_src_ready_vec(nc_idx) && s0_can_go
// load flow source ready
// cache-missed loads have the highest priority:
// always accept cache-missed load flows from the load replay queue
io.replay.ready := (s0_can_go && io.dcache.req.ready && (s0_src_ready_vec(lsq_rep_idx) && !s0_rep_stall || s0_src_select_vec(super_rep_idx)))
// accept load flow from rs when:
// 1) there is no lsq-replayed load
// 2) there is no fast replayed load
// 3) there is no high confidence prefetch request
io.vecldin.ready := s0_can_go && io.dcache.req.ready && s0_src_ready_vec(vec_iss_idx)
io.ldin.ready := s0_can_go && io.dcache.req.ready && s0_src_ready_vec(int_iss_idx)
io.misalign_ldin.ready := s0_can_go && io.dcache.req.ready && s0_src_ready_vec(mab_idx)
// for hw prefetch load flow feedback, to be added later
// io.prefetch_in.ready := s0_hw_prf_select
// dcache replacement extra info
// TODO: should prefetch load update replacement?
io.dcache.replacementUpdated := Mux(s0_src_select_vec(lsq_rep_idx) || s0_src_select_vec(super_rep_idx), io.replay.bits.replacementUpdated, false.B)
// load wakeup
// TODO: vector load wakeup? frm_mabuf wakeup?
val s0_wakeup_selector = Seq(
s0_misalign_wakeup_fire,
s0_src_valid_vec(super_rep_idx),
s0_src_valid_vec(fast_rep_idx),
s0_mmio_fire,
s0_nc_fire,
s0_src_valid_vec(lsq_rep_idx),
s0_src_valid_vec(int_iss_idx)
)
val s0_wakeup_format = Seq(
io.misalign_ldin.bits.uop,
io.replay.bits.uop,
io.fast_rep_in.bits.uop,
io.lsq.uncache.bits.uop,
io.lsq.nc_ldin.bits.uop,
io.replay.bits.uop,
io.ldin.bits.uop,
)
val s0_wakeup_uop = ParallelPriorityMux(s0_wakeup_selector, s0_wakeup_format)
io.wakeup.valid := s0_fire && !s0_sel_src.isvec && !s0_sel_src.frm_mabuf && (
s0_src_valid_vec(super_rep_idx) ||
s0_src_valid_vec(fast_rep_idx) ||
s0_src_valid_vec(lsq_rep_idx) ||
(s0_src_valid_vec(int_iss_idx) && !s0_sel_src.prf &&
!s0_src_valid_vec(vec_iss_idx) && !s0_src_valid_vec(high_pf_idx))
) || s0_mmio_fire || s0_nc_fire || s0_misalign_wakeup_fire
io.wakeup.bits := s0_wakeup_uop
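// Early wakeup: dependent uops are speculatively woken as the load enters the
// pipeline so their issue overlaps the load latency; if the load later replays,
// io.ldCancel presumably revokes the speculative wakeup (see its declaration above).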
// prefetch.i(Zicbop)
io.ifetchPrefetch.valid := RegNext(s0_src_select_vec(int_iss_idx) && s0_sel_src.prf_i)
io.ifetchPrefetch.bits.vaddr := RegEnable(s0_out.vaddr, 0.U, s0_src_select_vec(int_iss_idx) && s0_sel_src.prf_i)
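// The prefetch.i request is delayed one cycle: valid via RegNext and vaddr captured
// by RegEnable under the same select condition, keeping both fields consistent.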
XSDebug(io.dcache.req.fire,
p"[DCACHE LOAD REQ] pc ${Hexadecimal(s0_sel_src.uop.pc)}, vaddr ${Hexadecimal(s0_dcache_vaddr)}\n"
)
XSDebug(s0_valid,
p"S0: pc ${Hexadecimal(s0_out.uop.pc)}, lId ${Hexadecimal(s0_out.uop.lqIdx.asUInt)}, " +
p"vaddr ${Hexadecimal(s0_out.vaddr)}, mask ${Hexadecimal(s0_out.mask)}\n")
// Pipeline
// --------------------------------------------------------------------------------
// stage 1
// --------------------------------------------------------------------------------
// TLB resp (send paddr to dcache)
val s1_valid = RegInit(false.B)
val s1_in = Wire(new LqWriteBundle)
val s1_out = Wire(new LqWriteBundle)
val s1_kill = Wire(Bool())
val s1_can_go = s2_ready
val s1_fire = s1_valid && !s1_kill && s1_can_go
val s1_vecActive = RegEnable(s0_out.vecActive, true.B, s0_fire)
val s1_nc_with_data = RegNext(s0_nc_with_data)
s1_ready := !s1_valid || s1_kill || s2_ready
when (s0_fire) { s1_valid := true.B }
.elsewhen (s1_fire) { s1_valid := false.B }
.elsewhen (s1_kill) { s1_valid := false.B }
s1_in := RegEnable(s0_out, s0_fire)
val s1_fast_rep_dly_kill = RegEnable(io.fast_rep_in.bits.lateKill, io.fast_rep_in.valid) && s1_in.isFastReplay
val s1_fast_rep_dly_err = RegEnable(io.fast_rep_in.bits.delayedLoadError, io.fast_rep_in.valid) && s1_in.isFastReplay
val s1_l2l_fwd_dly_err = RegEnable(io.l2l_fwd_in.dly_ld_err, io.l2l_fwd_in.valid) && s1_in.isFastPath
val s1_dly_err = s1_fast_rep_dly_err || s1_l2l_fwd_dly_err
val s1_vaddr_hi = Wire(UInt())
val s1_vaddr_lo = Wire(UInt())
val s1_vaddr = Wire(UInt())
val s1_paddr_dup_lsu = Wire(UInt())
val s1_gpaddr_dup_lsu = Wire(UInt())
val s1_paddr_dup_dcache = Wire(UInt())
val s1_exception = ExceptionNO.selectByFu(s1_out.uop.exceptionVec, LduCfg).asUInt.orR // af & pf exceptions are modified below.
val s1_tlb_miss = io.tlb.resp.bits.miss && io.tlb.resp.valid && s1_valid
val s1_tlb_fast_miss = io.tlb.resp.bits.fastMiss && io.tlb.resp.valid && s1_valid
val s1_tlb_hit = !io.tlb.resp.bits.miss && io.tlb.resp.valid && s1_valid
val s1_pbmt = Mux(s1_tlb_hit, io.tlb.resp.bits.pbmt.head, 0.U(Pbmt.width.W))
val s1_nc = s1_in.nc
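// s1_pbmt holds the Svpbmt page-based memory type from the tlb response and is only
// trusted on a tlb hit; s1_nc carries the nc attribute already attached to this flow.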
val s1_prf = s1_in.isPrefetch
val s1_hw_prf = s1_in.isHWPrefetch
val s1_sw_prf = s1_prf && !s1_hw_prf
val s1_tlb_memidx = io.tlb.resp.bits.memidx
s1_vaddr_hi := s1_in.vaddr(VAddrBits - 1, 6)
s1_vaddr_lo := s1_in.vaddr(5, 0)
s1_vaddr := Cat(s1_vaddr_hi, s1_vaddr_lo)
s1_paddr_dup_lsu := Mux(s1_in.tlbNoQuery, s1_in.paddr, io.tlb.resp.bits.paddr(0))
s1_paddr_dup_dcache := Mux(s1_in.tlbNoQuery, s1_in.paddr, io.tlb.resp.bits.paddr(1))
s1_gpaddr_dup_lsu := Mux(s1_in.isFastReplay, s1_in.paddr, io.tlb.resp.bits.gpaddr(0))
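// The physical address is duplicated for the lsu and dcache consumers, presumably to
// reduce fanout on the timing-critical tlb-response path; tlbNoQuery flows (fast
// replay, hw prefetch, nc) already carry a paddr and bypass the tlb result.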
when (s1_tlb_memidx.is_ld && io.tlb.resp.valid && !s1_tlb_miss && s1_tlb_memidx.idx === s1_in.uop.lqIdx.value) {
// printf("load idx = %d\n", s1_tlb_memidx.idx)
s1_out.uop.debugInfo.tlbRespTime := GTimer()
}
io.tlb.req_kill := s1_kill || s1_dly_err
io.tlb.req.bits.pmp_addr := s1_in.paddr
io.tlb.resp.ready := true.B
io.dcache.s1_paddr_dup_lsu <> s1_paddr_dup_lsu
io.dcache.s1_paddr_dup_dcache <> s1_paddr_dup_dcache
io.dcache.s1_kill := s1_kill || s1_dly_err || s1_tlb_miss || s1_exception
io.dcache.s1_kill_data_read := s1_kill || s1_dly_err || s1_tlb_fast_miss
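// The dcache access launched in s0 is squashed here on a kill, delayed error, tlb
// miss, or exception; the data-read kill additionally uses the fast tlb-miss hint,
// presumably to avoid a useless data-array read.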
// store to load forwarding
io.sbuffer.valid := s1_valid && !(s1_exception || s1_tlb_miss || s1_kill || s1_dly_err || s1_prf)
io.sbuffer.vaddr := s1_vaddr
io.sbuffer.paddr := s1_paddr_dup_lsu
io.sbuffer.uop := s1_in.uop
io.sbuffer.sqIdx := s1_in.uop.sqIdx
io.sbuffer.mask := s1_in.mask
io.sbuffer.pc := s1_in.uop.pc // FIXME: remove it
io.ubuffer.valid := s1_valid && s1_nc_with_data && !(s1_exception || s1_tlb_miss || s1_kill || s1_dly_err || s1_prf)
io.ubuffer.vaddr := s1_vaddr
io.ubuffer.paddr := s1_paddr_dup_lsu
io.ubuffer.uop := s1_in.uop
io.ubuffer.sqIdx := s1_in.uop.sqIdx
io.ubuffer.mask := s1_in.mask
io.ubuffer.pc := s1_in.uop.pc // FIXME: remove it
io.lsq.forward.valid := s1_valid && !(s1_exception || s1_tlb_miss || s1_kill || s1_dly_err || s1_prf)
io.lsq.forward.vaddr := s1_vaddr
io.lsq.forward.paddr := s1_paddr_dup_lsu
io.lsq.forward.uop := s1_in.uop
io.lsq.forward.sqIdx := s1_in.uop.sqIdx
io.lsq.forward.sqIdxMask := 0.U
io.lsq.forward.mask := s1_in.mask
io.lsq.forward.pc := s1_in.uop.pc // FIXME: remove it
// st-ld violation query
// if store unit is 128-bits memory access, need match 128-bit
private val s1_isMatch128 = io.stld_nuke_query.map(x => (x.bits.matchLine || ((s1_in.isvec || s1_in.misalignWith16Byte) && s1_in.is128bit)))
val s1_nuke_paddr_match = VecInit((0 until StorePipelineWidth).zip(s1_isMatch128).map{case (w, s) => {Mux(s,
s1_paddr_dup_lsu(PAddrBits-1, 4) === io.stld_nuke_query(w).bits.paddr(PAddrBits-1, 4),
s1_paddr_dup_lsu(PAddrBits-1, 3) === io.stld_nuke_query(w).bits.paddr(PAddrBits-1, 3))}})
val s1_nuke = VecInit((0 until StorePipelineWidth).map(w => {
io.stld_nuke_query(w).valid && // query valid
isAfter(s1_in.uop.robIdx, io.stld_nuke_query(w).bits.robIdx) && // older store
s1_nuke_paddr_match(w) && // paddr match
(s1_in.mask & io.stld_nuke_query(w).bits.mask).orR // data masks overlap
})).asUInt.orR && !s1_tlb_miss
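// st-ld violation (nuke): an older in-flight store whose paddr and byte mask overlap
// this load forces a replay, since the load may have observed stale data. The match
// granularity widens from 8 bytes to 16 bytes for 128-bit accesses (s1_isMatch128).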
s1_out := s1_in
s1_out.vaddr := s1_vaddr
s1_out.fullva := io.tlb.resp.bits.fullva
s1_out.vaNeedExt := io.tlb.resp.bits.excp(0).vaNeedExt
s1_out.isHyper := io.tlb.resp.bits.excp(0).isHyper
s1_out.paddr := s1_paddr_dup_lsu
s1_out.gpaddr := s1_gpaddr_dup_lsu
s1_out.isForVSnonLeafPTE := io.tlb.resp.bits.isForVSnonLeafPTE
s1_out.tlbMiss := s1_tlb_miss
s1_out.ptwBack := io.tlb.resp.bits.ptwBack
s1_out.rep_info.debug := s1_in.uop.debugInfo
s1_out.rep_info.nuke := s1_nuke && !s1_sw_prf