Hi,
You can find the configuration I usually use. There are a few things that
are a bit unrealistic, for example, large SQ size, but I usually do so to
account for better features on a real machine that we do not have at the
moment. The command would have:
--cpu-type=DerivO3CPU --bp-type=TAGE_SC_L_64KB
--l1d-hwp-type=IndirectMemoryPrefetcher --l2-hwp-type=L2MultiPrefetcher .
I am still working on a better design, especially a better prefetcher
configuration, but wanted to post it here to open up the discussion. With
this, I usually get 6GB/s for stream on a single core compared to 10-11GB/s
on a real machine. This is significantly better than the default
configuration that would deliver something around 1-2GB/s.
diff --git a/configs/common/Caches.py b/configs/common/Caches.py
index 1468b953c..fe22a7c27 100644
--- a/configs/common/Caches.py
+++ b/configs/common/Caches.py
@@ -47,11 +47,12 @@ from m5.objects import *
# specific instantiations.
class L1Cache(Cache):
- assoc = 2
- tag_latency = 2
- data_latency = 2
- response_latency = 2
- mshrs = 4
+ assoc = 8
+ tag_latency = 4
+ data_latency = 4
+ response_latency = 4
+ mshrs = 20
+ write_buffers = 20
tgts_per_mshr = 20
class L1_ICache(L1Cache):
@@ -63,13 +64,13 @@ class L1_DCache(L1Cache):
pass
class L2Cache(Cache):
- assoc = 8
- tag_latency = 20
- data_latency = 20
- response_latency = 20
- mshrs = 20
+ assoc = 16
+ tag_latency = 12
+ data_latency = 12
+ response_latency = 12
+ mshrs = 32
tgts_per_mshr = 12
- write_buffers = 8
+ write_buffers = 32
class IOCache(Cache):
assoc = 8
@@ -81,7 +82,7 @@ class IOCache(Cache):
tgts_per_mshr = 12
class PageTableWalkerCache(Cache):
- assoc = 2
+ assoc = 4
tag_latency = 2
data_latency = 2
response_latency = 2
diff --git a/configs/common/Options.py b/configs/common/Options.py
index a63cc7b08..ad3a6b25e 100644
--- a/configs/common/Options.py
+++ b/configs/common/Options.py
@@ -148,9 +148,9 @@ def addNoISAOptions(parser):
parser.add_argument("--l1i_size", type=str, default="32kB")
parser.add_argument("--l2_size", type=str, default="2MB")
parser.add_argument("--l3_size", type=str, default="16MB")
- parser.add_argument("--l1d_assoc", type=int, default=2)
- parser.add_argument("--l1i_assoc", type=int, default=2)
- parser.add_argument("--l2_assoc", type=int, default=8)
+ parser.add_argument("--l1d_assoc", type=int, default=8)
+ parser.add_argument("--l1i_assoc", type=int, default=8)
+ parser.add_argument("--l2_assoc", type=int, default=16)
parser.add_argument("--l3_assoc", type=int, default=16)
parser.add_argument("--cacheline_size", type=int, default=64)
@@ -238,7 +238,7 @@ def addCommonOptions(parser):
the selected cache)""")
parser.add_argument("--checker", action="store_true")
parser.add_argument("--cpu-clock", action="store", type=str,
- default='2GHz',
+ default='3.66GHz',
help="Clock for blocks running at CPU speed")
parser.add_argument("--smt", action="store_true", default=False,
help="""
diff --git a/src/arch/x86/X86TLB.py b/src/arch/x86/X86TLB.py
index 8abc93c19..d5139c162 100644
--- a/src/arch/x86/X86TLB.py
+++ b/src/arch/x86/X86TLB.py
@@ -54,7 +54,7 @@ class X86TLB(BaseTLB):
cxx_class = 'gem5::X86ISA::TLB'
cxx_header = 'arch/x86/tlb.hh'
- size = Param.Unsigned(64, "TLB size")
+ size = Param.Unsigned(128, "TLB size")
system = Param.System(Parent.any, "system object")
walker = Param.X86PagetableWalker(\
X86PagetableWalker(), "page table walker")
diff --git a/src/cpu/o3/O3CPU.py b/src/cpu/o3/O3CPU.py
index fb1a9dc9d..e39e73267 100644
--- a/src/cpu/o3/O3CPU.py
+++ b/src/cpu/o3/O3CPU.py
@@ -73,9 +73,9 @@ class O3CPU(BaseCPU):
activity = Param.Unsigned(0, "Initial count")
- cacheStorePorts = Param.Unsigned(200, "Cache Ports. "
+ cacheStorePorts = Param.Unsigned(4, "Cache Ports. "
"Constrains stores only.")
- cacheLoadPorts = Param.Unsigned(200, "Cache Ports. "
+ cacheLoadPorts = Param.Unsigned(4, "Cache Ports. "
"Constrains loads only.")
decodeToFetchDelay = Param.Cycles(1, "Decode to fetch delay")
@@ -85,7 +85,7 @@ class O3CPU(BaseCPU):
commitToFetchDelay = Param.Cycles(1, "Commit to fetch delay")
fetchWidth = Param.Unsigned(8, "Fetch width")
fetchBufferSize = Param.Unsigned(64, "Fetch buffer size in bytes")
- fetchQueueSize = Param.Unsigned(32, "Fetch queue size in micro-ops "
+ fetchQueueSize = Param.Unsigned(128, "Fetch queue size in micro-ops "
"per-thread")
renameToDecodeDelay = Param.Cycles(1, "Rename to decode delay")
@@ -123,15 +123,15 @@ class O3CPU(BaseCPU):
backComSize = Param.Unsigned(5, "Time buffer size for backwards
communication")
forwardComSize = Param.Unsigned(5, "Time buffer size for forward
communication")
- LQEntries = Param.Unsigned(32, "Number of load queue entries")
- SQEntries = Param.Unsigned(32, "Number of store queue entries")
+ LQEntries = Param.Unsigned(128, "Number of load queue entries")
+ SQEntries = Param.Unsigned(128, "Number of store queue entries")
LSQDepCheckShift = Param.Unsigned(4, "Number of places to shift addr
before check")
LSQCheckLoads = Param.Bool(True,
"Should dependency violations be checked for loads & stores or
just stores")
store_set_clear_period = Param.Unsigned(250000,
"Number of load/store insts before the dep predictor should be
invalidated")
- LFSTSize = Param.Unsigned(1024, "Last fetched store table size")
- SSITSize = Param.Unsigned(1024, "Store set ID table size")
+ LFSTSize = Param.Unsigned(2048, "Last fetched store table size")
+ SSITSize = Param.Unsigned(2048, "Store set ID table size")
numRobs = Param.Unsigned(1, "Number of Reorder Buffers");
@@ -154,8 +154,8 @@ class O3CPU(BaseCPU):
"registers")
numPhysCCRegs = Param.Unsigned(_defaultNumPhysCCRegs,
"Number of physical cc registers")
- numIQEntries = Param.Unsigned(64, "Number of instruction queue
entries")
- numROBEntries = Param.Unsigned(192, "Number of reorder buffer entries")
+ numIQEntries = Param.Unsigned(128, "Number of instruction queue
entries")
+ numROBEntries = Param.Unsigned(320, "Number of reorder buffer entries")
smtNumFetchingThreads = Param.Unsigned(1, "SMT Number of Fetching
Threads")
smtFetchPolicy = Param.SMTFetchPolicy('RoundRobin', "SMT Fetch policy")
diff --git a/src/cpu/o3/lsq.cc b/src/cpu/o3/lsq.cc
index 78999ee46..6d921bb2b 100644
--- a/src/cpu/o3/lsq.cc
+++ b/src/cpu/o3/lsq.cc
@@ -115,6 +115,8 @@ LSQ::LSQ(CPU *cpu_ptr, IEW *iew_ptr, const O3CPUParams
¶ms)
thread[tid].init(cpu, iew_ptr, params, this, tid);
thread[tid].setDcachePort(&dcachePort);
}
+
+ std::cout<<"maxLQEntries "<<maxLQEntries<<" maxSQEntries
"<<maxSQEntries<<std::endl;
}
diff --git a/src/mem/MemCtrl.py b/src/mem/MemCtrl.py
index 90d0e5004..84b2b6f39 100644
--- a/src/mem/MemCtrl.py
+++ b/src/mem/MemCtrl.py
@@ -70,11 +70,11 @@ class MemCtrl(QoSMemCtrl):
# threshold in percent for when to forcefully trigger writes and
# start emptying the write buffer
- write_high_thresh_perc = Param.Percent(85, "Threshold to force writes")
+ write_high_thresh_perc = Param.Percent(60, "Threshold to force writes")
# threshold in percentage for when to start writes if the read
# queue is empty
- write_low_thresh_perc = Param.Percent(50, "Threshold to start writes")
+ write_low_thresh_perc = Param.Percent(40, "Threshold to start writes")
# minimum write bursts to schedule before switching back to reads
min_writes_per_switch = Param.Unsigned(16, "Minimum write bursts
before "
diff --git a/src/mem/cache/prefetch/Prefetcher.py
b/src/mem/cache/prefetch/Prefetcher.py
index 7d704881b..a6974c863 100644
--- a/src/mem/cache/prefetch/Prefetcher.py
+++ b/src/mem/cache/prefetch/Prefetcher.py
@@ -120,13 +120,16 @@ class MultiPrefetcher(BasePrefetcher):
prefetchers = VectorParam.BasePrefetcher([], "Array of prefetchers")
+
+
+
class QueuedPrefetcher(BasePrefetcher):
type = "QueuedPrefetcher"
abstract = True
cxx_class = 'gem5::prefetch::Queued'
cxx_header = "mem/cache/prefetch/queued.hh"
latency = Param.Int(1, "Latency for generated prefetches")
- queue_size = Param.Int(32, "Maximum number of queued prefetches")
+ queue_size = Param.Int(64, "Maximum number of queued prefetches")
max_prefetch_requests_with_pending_translation = Param.Int(32,
"Maximum number of queued prefetches that have a missing
translation")
queue_squash = Param.Bool(True, "Squash queued prefetch on demand
access")
@@ -191,7 +194,7 @@ class IndirectMemoryPrefetcher(QueuedPrefetcher):
type = 'IndirectMemoryPrefetcher'
cxx_class = 'gem5::prefetch::IndirectMemory'
cxx_header = "mem/cache/prefetch/indirect_memory.hh"
- pt_table_entries = Param.MemorySize("16",
+ pt_table_entries = Param.MemorySize("32",
"Number of entries of the Prefetch Table")
pt_table_assoc = Param.Unsigned(16, "Associativity of the Prefetch
Table")
pt_table_indexing_policy = Param.BaseIndexingPolicy(
@@ -257,9 +260,9 @@ class SignaturePathPrefetcher(QueuedPrefetcher):
pattern_table_replacement_policy = Param.BaseReplacementPolicy(LRURP(),
"Replacement policy of the pattern table")
- prefetch_confidence_threshold = Param.Float(0.5,
+ prefetch_confidence_threshold = Param.Float(0.25,
"Minimum confidence to issue prefetches")
- lookahead_confidence_threshold = Param.Float(0.75,
+ lookahead_confidence_threshold = Param.Float(0.275,
"Minimum confidence to continue exploring lookahead entries")
class SignaturePathPrefetcherV2(SignaturePathPrefetcher):
@@ -529,3 +532,6 @@ class PIFPrefetcher(QueuedPrefetcher):
if not isinstance(simObj, SimObject):
raise TypeError("argument must be of SimObject type")
self.addEvent(HWPProbeEventRetiredInsts(self,
simObj,"RetiredInstsPC"))
+
+class L2MultiPrefetcher(MultiPrefetcher):
+ prefetchers = VectorParam.BasePrefetcher([SignaturePathPrefetcher(),
AMPMPrefetcher(), DCPTPrefetcher()], "Array of prefetchers")
\ No newline at end of file
On Sat, Apr 23, 2022 at 12:21 PM Jason Lowe-Power <[email protected]>
wrote:
> Majid,
>
> These are all great suggestions! Do you have a configuration file that you
> would be willing to share? It would be a huge benefit to the community if
> we had some better default configurations in the "examples" for gem5
> configuration files.
>
> We're also trying to use the new standard library for these kinds of
> "good" configurations. We can work with you to create a "prebuilt board"
> with all of these parameters and even run nightly/weekly tests to make sure
> there are no performance regressions.
>
> Thanks!
> Jason
>
> On Fri, Apr 22, 2022 at 7:52 PM Majid Jalili <[email protected]> wrote:
>
>> I think it is hard to get to a real machine level in terms of BW. But by
>> looking at your stats, I found the lsqFullEvents is high.
>> You can go after the CPU to make it more aggressive, increasing
>> Load/Store queue size, and ROB depth are the minimal changes you can make.
>> I usually do at least ROB sizes of 256 or 320. With that, you may set the
>> LSQ size to at least 1/4 of ROB size.
>> For MSHRs, your numbers are good now, 10 is too little even in intel
>> machines, I found recently they increased that to 16-20.
>> The other thing you can try to set is the cache latencies; make sure that
>> they are reasonable.
>> For prefetcher, you can use IMPPrefetcher in addition to DCPT, it has a
>> pretty aggressive stream prefetcher inside.
>> Also, DRAM memory mapping is important, I do not remember what is the
>> default for the mem type you are using
>>
>> Majid
>>
>>
>>
>> On Sat, Apr 16, 2022 at 2:12 AM 王子聪 <[email protected]> wrote:
>>
>>> Hi Majid,
>>>
>>> Thanks for your suggestion! I check the default number of MSHRs (in
>>> configs/common/Caches.py), and found the default #MSHR of L1/L2 are 4 and
>>> 20 respectively.
>>>
>>> According to the PACT’18 paper "Cimple: Instruction and Memory Level
>>> Parallelism: A DSL for Uncovering ILP and MLP”, it says that "Modern
>>> processors typically have 6–10 L1 cache MSHRs”, and "Intel’s Haswell
>>> microarchitecture uses 10 L1 MSHRs (Line Fill Buffers) for
>>> handling outstanding L1 misses”. So I change to L1 #MSHRs to 16 and L2
>>> #MSHRs to 32 (which I think it’s enough to handling outstanding misses),
>>> and then change the L1/L2 prefetcher type to DCPT. Then I got the STREAM
>>> output as shown in below:
>>>
>>> ./build/X86/gem5.opt configs/example/se.py --cpu-type=O3CPU --caches
>>> --l1d_size=256kB --l1i_size=256kB
>>> --param="system.cpu[0].dcache.mshrs=16;system.cpu[0].icache.mshrs=16;system.l2.mshrs=32"
>>> --l2cache --l2_size=8MB --l1i-hwp-type=DCPTPrefetcher
>>> --l1d-hwp-type=DCPTPrefetcher --l2-hwp-type=DCPTPrefetcher
>>> --mem-type=DDR3_1600_8x8 -c ../stream/stream
>>> -------------------------------------------------------------
>>> Function Best Rate MB/s Avg time Min time Max time
>>> Copy: 3479.8 0.004598 0.004598 0.004598
>>> Scale: 3554.0 0.004502 0.004502 0.004502
>>> Add: 4595.0 0.005223 0.005223 0.005223
>>> Triad: 4705.9 0.005100 0.005100 0.005100
>>> -------------------------------------------------------------
>>>
>>> The busutil of DRAM also improved:
>>> -------------------------------------------------------------
>>> system.mem_ctrls.dram.bytesRead 239947840 # Total bytes read
>>> (Byte)
>>> system.mem_ctrls.dram.bytesWritten 121160640 # Total bytes
>>> written (Byte)
>>> system.mem_ctrls.dram.avgRdBW 1611.266685 # Average DRAM read
>>> bandwidth in MiBytes/s ((Byte/Second))
>>> system.mem_ctrls.dram.avgWrBW 813.602251 # Average DRAM write
>>> bandwidth in MiBytes/s ((Byte/Second))
>>> system.mem_ctrls.dram.peakBW 12800.00 # Theoretical peak
>>> bandwidth in MiByte/s ((Byte/Second))
>>> system.mem_ctrls.dram.busUtil 18.94 # Data bus
>>> utilization in percentage (Ratio)
>>> system.mem_ctrls.dram.busUtilRead 12.59 # Data bus
>>> utilization in percentage for reads (Ratio)
>>> system.mem_ctrls.dram.busUtilWrite 6.36 # Data bus
>>> utilization in percentage for writes (Ratio)
>>> system.mem_ctrls.dram.pageHitRate 89.16 # Row buffer hit
>>> rate, read and write combined (Ratio)
>>> -------------------------------------------------------------
>>>
>>> It’s indeed improving the achieved bandwidth, but still a little far
>>> away from the peak bandwidth of DDR3_1600 (12800 MiB/s). stats.txt is
>>> uploaded for reference (
>>> https://gist.github.com/wzc314/cf29275f853ee0b2fcd865f9b492c355)
>>>
>>> Any idea is appreciated!
>>> Thank you in advance!
>>>
>>> Bests,
>>> Zicong
>>>
>>>
>>>
>>> 2022年4月16日 00:08,Majid Jalili <[email protected]> 写道:
>>>
>>> Hi,
>>> Make sure your system has enough MSHRs, out of the box, L1, and L2 are
>>> set to have a few MSHR entries.
>>> Also, stride prefetcher is not the best, you may try something better:
>>> DCPT gives me better numbers.
>>>
>>> On Fri, Apr 15, 2022 at 4:57 AM Zicong Wang via gem5-users <
>>> [email protected]> wrote:
>>> Hi Jason,
>>>
>>> We are testing the memory bandwidth program STREAM (
>>> https://www.cs.virginia.edu/stream/), but the results show that the
>>> CPU cannot fully utilize the DDR bandwidth, and the achieved bandwidth is
>>> quite low and about 1/10 of the peak bandwidth (peakBW in stats.txt). I
>>> tested the STREAM binary on my x86 computer and got the near
>>> peak bandwidth, so I believe the program is ok.
>>>
>>> I've seen the maillist dialogue
>>> https://www.mail-archive.com/[email protected]/msg12965.html, and
>>> I think I've met the similar problem. So I tried the suggestions proposed
>>> by Andreas, including enable l1/l2 prefetcher, using ARM
>>> detailed CPU. Although these methods can improve the bandwidth, the results
>>> show it has limited effect. Besides, I've also tested the STREAM program in
>>> FS mode with x86 O3/Minor/TimingSimple CPU, and tested it in SE mode with
>>> ruby option, but all the results are similar and there is no essential
>>> difference.
>>>
>>> I guess it is a general problem in simulation with gem5. I'm wondering
>>> if the result is expected or is there something wrong with the system model?
>>>
>>> Two of the experimental results are attached for reference:
>>>
>>> 1. X86 O3CPU, SE-mode, w/o l2 prefetcher:
>>>
>>> ./build/X86/gem5.opt --outdir=m5out-stream configs/example/se.py
>>> --cpu-type=O3CPU --caches --l1d_size=256kB --l1i_size=256kB --l2cache
>>> --l2_size=8MB --mem-type=DDR3_1600_8x8 -c ../stream/stream
>>>
>>> STREAM output:
>>>
>>> -------------------------------------------------------------
>>> Function Best Rate MB/s Avg time Min time Max time
>>> Copy: 1099.0 0.014559 0.014559 0.014559
>>> Scale: 1089.7 0.014683 0.014683 0.014683
>>> Add: 1213.0 0.019786 0.019786 0.019786
>>> Triad: 1222.1 0.019639 0.019639 0.019639
>>> -------------------------------------------------------------
>>>
>>> stats.txt (dram related):
>>>
>>> system.mem_ctrls.dram.bytesRead 238807808 # Total bytes read
>>> (Byte)
>>> system.mem_ctrls.dram.bytesWritten 121179776 # Total bytes
>>> written (Byte)
>>> system.mem_ctrls.dram.avgRdBW 718.689026 # Average DRAM read
>>> bandwidth in MiBytes/s ((Byte/Second))
>>> system.mem_ctrls.dram.avgWrBW 364.688977 # Average DRAM
>>> write bandwidth in MiBytes/s ((Byte/Second))
>>> system.mem_ctrls.dram.peakBW 12800.00 # Theoretical peak
>>> bandwidth in MiByte/s ((Byte/Second))
>>> system.mem_ctrls.dram.busUtil 8.46 # Data bus
>>> utilization in percentage (Ratio)
>>> system.mem_ctrls.dram.busUtilRead 5.61 # Data bus
>>> utilization in percentage for reads (Ratio)
>>> system.mem_ctrls.dram.busUtilWrite 2.85 # Data bus
>>> utilization in percentage for writes (Ratio)
>>> system.mem_ctrls.dram.pageHitRate 40.57 # Row buffer hit
>>> rate, read and write combined (Ratio)
>>>
>>>
>>>
>>> 2. X86 O3CPU, SE-mode, w/ l2 prefetcher:
>>>
>>> ./build/X86/gem5.opt --outdir=m5out-stream-l2hwp configs/example/se.py
>>> --cpu-type=O3CPU --caches --l1d_size=256kB --l1i_size=256kB --l2cache
>>> --l2_size=8MB --l2-hwp-typ=StridePrefetcher --mem-type=DDR3_1600_8x8 -c
>>> ../stream/stream
>>>
>>> STREAM output:
>>>
>>> -------------------------------------------------------------
>>> Function Best Rate MB/s Avg time Min time Max time
>>> Copy: 1703.9 0.009390 0.009390 0.009390
>>> Scale: 1718.6 0.009310 0.009310 0.009310
>>> Add: 2087.3 0.011498 0.011498 0.011498
>>> Triad: 2227.2 0.010776 0.010776 0.010776
>>> -------------------------------------------------------------
>>> stats.txt (dram related):
>>>
>>> system.mem_ctrls.dram.bytesRead 238811712 # Total bytes read
>>> (Byte)
>>> system.mem_ctrls.dram.bytesWritten 121179840 # Total bytes
>>> written (Byte)
>>> system.mem_ctrls.dram.avgRdBW 1014.129912 # Average DRAM read
>>> bandwidth in MiBytes/s ((Byte/Second))
>>> system.mem_ctrls.dram.avgWrBW 514.598298 # Average DRAM
>>> write bandwidth in MiBytes/s ((Byte/Second))
>>> system.mem_ctrls.dram.peakBW 12800.00 # Theoretical peak
>>> bandwidth in MiByte/s ((Byte/Second))
>>> system.mem_ctrls.dram.busUtil 11.94 # Data bus
>>> utilization in percentage (Ratio)
>>> system.mem_ctrls.dram.busUtilRead 7.92 # Data bus
>>> utilization in percentage for reads (Ratio)
>>> system.mem_ctrls.dram.busUtilWrite 4.02 # Data bus
>>> utilization in percentage for writes (Ratio)
>>> system.mem_ctrls.dram.pageHitRate 75.37 # Row buffer hit
>>> rate, read and write combined (Ratio)
>>>
>>>
>>>
>>> STREAM compiling options:
>>>
>>> gcc -O2 -static -DSTREAM_ARRAY_SIZE=1000000 -DNTIMES=2 stream.c -o
>>> stream
>>>
>>> All the experiments are performed on the latest stable
>>> version (141cc37c2d4b93959d4c249b8f7e6a8b2ef75338, v21.2.1).
>>>
>>> Thank you very much!
>>>
>>>
>>>
>>> Best Regards,
>>>
>>> Zicong
>>>
>>>
>>>
>>> _______________________________________________
>>> gem5-users mailing list -- [email protected]
>>> To unsubscribe send an email to [email protected]
>>> %(web_page_url)slistinfo%(cgiext)s/%(_internal_name)s
>>>
>>>
>>>
_______________________________________________
gem5-users mailing list -- [email protected]
To unsubscribe send an email to [email protected]
%(web_page_url)slistinfo%(cgiext)s/%(_internal_name)s