Hi,
You can find the configuration I usually use. There are a few things that
are a bit unrealistic, for example, large SQ size, but I usually do so to
account for better features on a real machine that we do not have at the
moment. The command would have:
--cpu-type=DerivO3CPU --bp-type=TAGE_SC_L_64KB
--l1d-hwp-type=IndirectMemoryPrefetcher --l2-hwp-type=L2MultiPrefetcher .
I am still working on a better design, especially a better prefetcher
configuration, but wanted to post it here to open up the discussion. With
this, I usually get 6GB/s for stream on a single core compared to 10-11GB/s
on a real machine. This is significantly better than the default
configuration that would deliver something around 1-2GB/s.
diff --git a/configs/common/Caches.py b/configs/common/Caches.py
index 1468b953c..fe22a7c27 100644
--- a/configs/common/Caches.py
+++ b/configs/common/Caches.py
@@ -47,11 +47,12 @@ from m5.objects import *
# specific instantiations.
class L1Cache(Cache):
- assoc = 2
- tag_latency = 2
- data_latency = 2
- response_latency = 2
- mshrs = 4
+ assoc = 8
+ tag_latency = 4
+ data_latency = 4
+ response_latency = 4
+ mshrs = 20
+ write_buffers = 20
tgts_per_mshr = 20
class L1_ICache(L1Cache):
@@ -63,13 +64,13 @@ class L1_DCache(L1Cache):
pass
class L2Cache(Cache):
- assoc = 8
- tag_latency = 20
- data_latency = 20
- response_latency = 20
- mshrs = 20
+ assoc = 16
+ tag_latency = 12
+ data_latency = 12
+ response_latency = 12
+ mshrs = 32
tgts_per_mshr = 12
- write_buffers = 8
+ write_buffers = 32
class IOCache(Cache):
assoc = 8
@@ -81,7 +82,7 @@ class IOCache(Cache):
tgts_per_mshr = 12
class PageTableWalkerCache(Cache):
- assoc = 2
+ assoc = 4
tag_latency = 2
data_latency = 2
response_latency = 2
diff --git a/configs/common/Options.py b/configs/common/Options.py
index a63cc7b08..ad3a6b25e 100644
--- a/configs/common/Options.py
+++ b/configs/common/Options.py
@@ -148,9 +148,9 @@ def addNoISAOptions(parser):
parser.add_argument("--l1i_size", type=str, default="32kB")
parser.add_argument("--l2_size", type=str, default="2MB")
parser.add_argument("--l3_size", type=str, default="16MB")
- parser.add_argument("--l1d_assoc", type=int, default=2)
- parser.add_argument("--l1i_assoc", type=int, default=2)
- parser.add_argument("--l2_assoc", type=int, default=8)
+ parser.add_argument("--l1d_assoc", type=int, default=8)
+ parser.add_argument("--l1i_assoc", type=int, default=8)
+ parser.add_argument("--l2_assoc", type=int, default=16)
parser.add_argument("--l3_assoc", type=int, default=16)
parser.add_argument("--cacheline_size", type=int, default=64)
@@ -238,7 +238,7 @@ def addCommonOptions(parser):
the selected cache)""")
parser.add_argument("--checker", action="store_true")
parser.add_argument("--cpu-clock", action="store", type=str,
- default='2GHz',
+ default='3.66GHz',
help="Clock for blocks running at CPU speed")
parser.add_argument("--smt", action="store_true", default=False,
help="""
diff --git a/src/arch/x86/X86TLB.py b/src/arch/x86/X86TLB.py
index 8abc93c19..d5139c162 100644
--- a/src/arch/x86/X86TLB.py
+++ b/src/arch/x86/X86TLB.py
@@ -54,7 +54,7 @@ class X86TLB(BaseTLB):
cxx_class = 'gem5::X86ISA::TLB'
cxx_header = 'arch/x86/tlb.hh'
- size = Param.Unsigned(64, "TLB size")
+ size = Param.Unsigned(128, "TLB size")
system = Param.System(Parent.any, "system object")
walker = Param.X86PagetableWalker(\
X86PagetableWalker(), "page table walker")
diff --git a/src/cpu/o3/O3CPU.py b/src/cpu/o3/O3CPU.py
index fb1a9dc9d..e39e73267 100644
--- a/src/cpu/o3/O3CPU.py
+++ b/src/cpu/o3/O3CPU.py
@@ -73,9 +73,9 @@ class O3CPU(BaseCPU):
activity = Param.Unsigned(0, "Initial count")
- cacheStorePorts = Param.Unsigned(200, "Cache Ports. "
+ cacheStorePorts = Param.Unsigned(4, "Cache Ports. "
"Constrains stores only.")
- cacheLoadPorts = Param.Unsigned(200, "Cache Ports. "
+ cacheLoadPorts = Param.Unsigned(4, "Cache Ports. "
"Constrains loads only.")
decodeToFetchDelay = Param.Cycles(1, "Decode to fetch delay")
@@ -85,7 +85,7 @@ class O3CPU(BaseCPU):
commitToFetchDelay = Param.Cycles(1, "Commit to fetch delay")
fetchWidth = Param.Unsigned(8, "Fetch width")
fetchBufferSize = Param.Unsigned(64, "Fetch buffer size in bytes")
- fetchQueueSize = Param.Unsigned(32, "Fetch queue size in micro-ops "
+ fetchQueueSize = Param.Unsigned(128, "Fetch queue size in micro-ops "
"per-thread")
renameToDecodeDelay = Param.Cycles(1, "Rename to decode delay")
@@ -123,15 +123,15 @@ class O3CPU(BaseCPU):
backComSize = Param.Unsigned(5, "Time buffer size for backwards
communication")
forwardComSize = Param.Unsigned(5, "Time buffer size for forward
communication")
- LQEntries = Param.Unsigned(32, "Number of load queue entries")
- SQEntries = Param.Unsigned(32, "Number of store queue entries")
+ LQEntries = Param.Unsigned(128, "Number of load queue entries")
+ SQEntries = Param.Unsigned(128, "Number of store queue entries")
LSQDepCheckShift = Param.Unsigned(4, "Number of places to shift addr
before check")
LSQCheckLoads = Param.Bool(True,
"Should dependency violations be checked for loads & stores or
just stores")
store_set_clear_period = Param.Unsigned(250000,
"Number of load/store insts before the dep predictor should be
invalidated")
- LFSTSize = Param.Unsigned(1024, "Last fetched store table size")
- SSITSize = Param.Unsigned(1024, "Store set ID table size")
+ LFSTSize = Param.Unsigned(2048, "Last fetched store table size")
+ SSITSize = Param.Unsigned(2048, "Store set ID table size")
numRobs = Param.Unsigned(1, "Number of Reorder Buffers");
@@ -154,8 +154,8 @@ class O3CPU(BaseCPU):
"registers")
numPhysCCRegs = Param.Unsigned(_defaultNumPhysCCRegs,
"Number of physical cc registers")
- numIQEntries = Param.Unsigned(64, "Number of instruction queue
entries")
- numROBEntries = Param.Unsigned(192, "Number of reorder buffer entries")
+ numIQEntries = Param.Unsigned(128, "Number of instruction queue
entries")
+ numROBEntries = Param.Unsigned(320, "Number of reorder buffer entries")
smtNumFetchingThreads = Param.Unsigned(1, "SMT Number of Fetching
Threads")
smtFetchPolicy = Param.SMTFetchPolicy('RoundRobin', "SMT Fetch policy")
diff --git a/src/cpu/o3/lsq.cc b/src/cpu/o3/lsq.cc
index 78999ee46..6d921bb2b 100644
--- a/src/cpu/o3/lsq.cc
+++ b/src/cpu/o3/lsq.cc
@@ -115,6 +115,8 @@ LSQ::LSQ(CPU *cpu_ptr, IEW *iew_ptr, const O3CPUParams
¶ms)
thread[tid].init(cpu, iew_ptr, params, this, tid);
thread[tid].setDcachePort(&dcachePort);
}
+
+ std::cout<<"maxLQEntries "<<maxLQEntries<<" maxSQEntries
"<<maxSQEntries<<std::endl;
}
diff --git a/src/mem/MemCtrl.py b/src/mem/MemCtrl.py
index 90d0e5004..84b2b6f39 100644
--- a/src/mem/MemCtrl.py
+++ b/src/mem/MemCtrl.py
@@ -70,11 +70,11 @@ class MemCtrl(QoSMemCtrl):
# threshold in percent for when to forcefully trigger writes and
# start emptying the write buffer
- write_high_thresh_perc = Param.Percent(85, "Threshold to force writes")
+ write_high_thresh_perc = Param.Percent(60, "Threshold to force writes")
# threshold in percentage for when to start writes if the read
# queue is empty
- write_low_thresh_perc = Param.Percent(50, "Threshold to start writes")
+ write_low_thresh_perc = Param.Percent(40, "Threshold to start writes")
# minimum write bursts to schedule before switching back to reads
min_writes_per_switch = Param.Unsigned(16, "Minimum write bursts
before "
diff --git a/src/mem/cache/prefetch/Prefetcher.py
b/src/mem/cache/prefetch/Prefetcher.py
index 7d704881b..a6974c863 100644
--- a/src/mem/cache/prefetch/Prefetcher.py
+++ b/src/mem/cache/prefetch/Prefetcher.py
@@ -120,13 +120,16 @@ class MultiPrefetcher(BasePrefetcher):
prefetchers = VectorParam.BasePrefetcher([], "Array of prefetchers")
+
+
+
class QueuedPrefetcher(BasePrefetcher):
type = "QueuedPrefetcher"
abstract = True
cxx_class = 'gem5::prefetch::Queued'
cxx_header = "mem/cache/prefetch/queued.hh"
latency = Param.Int(1, "Latency for generated prefetches")
- queue_size = Param.Int(32, "Maximum number of queued prefetches")
+ queue_size = Param.Int(64, "Maximum number of queued prefetches")
max_prefetch_requests_with_pending_translation = Param.Int(32,
"Maximum number of queued prefetches that have a missing
translation")
queue_squash = Param.Bool(True, "Squash queued prefetch on demand
access")
@@ -191,7 +194,7 @@ class IndirectMemoryPrefetcher(QueuedPrefetcher):
type = 'IndirectMemoryPrefetcher'
cxx_class = 'gem5::prefetch::IndirectMemory'
cxx_header = "mem/cache/prefetch/indirect_memory.hh"
- pt_table_entries = Param.MemorySize("16",
+ pt_table_entries = Param.MemorySize("32",
"Number of entries of the Prefetch Table")
pt_table_assoc = Param.Unsigned(16, "Associativity of the Prefetch
Table")
pt_table_indexing_policy = Param.BaseIndexingPolicy(
@@ -257,9 +260,9 @@ class SignaturePathPrefetcher(QueuedPrefetcher):
pattern_table_replacement_policy = Param.BaseReplacementPolicy(LRURP(),
"Replacement policy of the pattern table")
- prefetch_confidence_threshold = Param.Float(0.5,
+ prefetch_confidence_threshold = Param.Float(0.25,
"Minimum confidence to issue prefetches")
- lookahead_confidence_threshold = Param.Float(0.75,
+ lookahead_confidence_threshold = Param.Float(0.275,
"Minimum confidence to continue exploring lookahead entries")
class SignaturePathPrefetcherV2(SignaturePathPrefetcher):
@@ -529,3 +532,6 @@ class PIFPrefetcher(QueuedPrefetcher):
if not isinstance(simObj, SimObject):
raise TypeError("argument must be of SimObject type")
self.addEvent(HWPProbeEventRetiredInsts(self,
simObj,"RetiredInstsPC"))
+
+class L2MultiPrefetcher(MultiPrefetcher):
+ prefetchers = VectorParam.BasePrefetcher([SignaturePathPrefetcher(),
AMPMPrefetcher(), DCPTPrefetcher()], "Array of prefetchers")
\ No newline at end of file
On Sat, Apr 23, 2022 at 12:21 PM Jason Lowe-Power <[email protected]>
wrote:
> Majid,
>
> These are all great suggestions! Do you have a configuration file that you
> would be willing to share? It would be a huge benefit to the community if
> we had some better default configurations in the "examples" for gem5
> configuration files.
>
> We're also trying to use the new standard library for these kinds of
> "good" configurations. We can work with you to create a "prebuilt board"
> with all of these parameters and even run nightly/weekly tests to make sure
> there are no performance regressions.
>
> Thanks!
> Jason
>
> On Fri, Apr 22, 2022 at 7:52 PM Majid Jalili <[email protected]> wrote:
>
>> I think it is hard to get to a real machine level in terms of BW. But by
>> looking at your stats, I found the lsqFullEvents is high.
>> You can go after the CPU to make it more aggressive, increasing
>> Load/Store queue size, and ROB depth are the minimal changes you can make.
>> I usually do at least ROB sizes of 256 or 320. With that, you may set the
>> LSQ size to at least 1/4 of ROB size.
>> For MSHRs, your numbers are good now, 10 is too little even in intel
>> machines, I found recently they increased that to 16-20.
>> The other thing you can try to set is the cache latencies; make sure that
>> they are reasonable.
>> For prefetcher, you can use IMPPrefetcher in addition to DCPT, it has a
>> pretty aggressive stream prefetcher inside.
>> Also, DRAM memory mapping is important, I do not remember what is the
>> default for the mem type you are using
>>
>> Majid
>>
>>
>>
>> On Sat, Apr 16, 2022 at 2:12 AM 王子聪 <[email protected]> wrote:
>>
>>> Hi Majid,
>>>
>>> Thanks for your suggestion! I check the default number of MSHRs (in
>>> configs/common/Caches.py), and found the default #MSHR of L1/L2 are 4 and
>>> 20 respectively.
>>>
>>> According to the PACT’18 paper "Cimple: Instruction and Memory Level
>>> Parallelism: A DSL for Uncovering ILP and MLP”, it says that "Modern
>>> processors typically have 6–10 L1 cache MSHRs”, and "Intel’s Haswell
>>> microarchitecture uses 10 L1 MSHRs (Line Fill Buffers) for
>>> handling outstanding L1 misses”. So I change to L1 #MSHRs to 16 and L2
>>> #MSHRs to 32 (which I think it’s enough to handling outstanding misses),
>>> and then change the L1/L2 prefetcher type to DCPT. Then I got the STREAM
>>> output as shown in below:
>>>
>>> ./build/X86/gem5.opt configs/example/se.py --cpu-type=O3CPU --caches
>>> --l1d_size=256kB --l1i_size=256kB
>>> --param="system.cpu[0].dcache.mshrs=16;system.cpu[0].icache.mshrs=16;system.l2.mshrs=32"
>>> --l2cache --l2_size=8MB --l1i-hwp-type=DCPTPrefetcher
>>> --l1d-hwp-type=DCPTPrefetcher --l2-hwp-type=DCPTPrefetcher
>>> --mem-type=DDR3_1600_8x8 -c ../stream/stream
>>> -------------------------------------------------------------
>>> Function Best Rate MB/s Avg time Min time Max time
>>> Copy: 3479.8 0.004598 0.004598 0.004598
>>> Scale: 3554.0 0.004502 0.004502 0.004502
>>> Add: 4595.0 0.005223 0.005223 0.005223
>>> Triad: 4705.9 0.005100 0.005100 0.005100
>>> -------------------------------------------------------------
>>>
>>> The busutil of DRAM also improved:
>>> -------------------------------------------------------------
>>> system.mem_ctrls.dram.bytesRead 239947840 # Total bytes read
>>> (Byte)
>>> system.mem_ctrls.dram.bytesWritten 121160640 # Total bytes
>>> written (Byte)
>>> system.mem_ctrls.dram.avgRdBW 1611.266685 # Average DRAM read
>>> bandwidth in MiBytes/s ((Byte/Second))
>>> system.mem_ctrls.dram.avgWrBW 813.602251 # Average DRAM write
>>> bandwidth in MiBytes/s ((Byte/Second))
>>> system.mem_ctrls.dram.peakBW 12800.00 # Theoretical peak
>>> bandwidth in MiByte/s ((Byte/Second))
>>> system.mem_ctrls.dram.busUtil 18.94 # Data bus
>>> utilization in percentage (Ratio)
>>> system.mem_ctrls.dram.busUtilRead 12.59 # Data bus
>>> utilization in percentage for reads (Ratio)
>>> system.mem_ctrls.dram.busUtilWrite 6.36 # Data bus
>>> utilization in percentage for writes (Ratio)
>>> system.mem_ctrls.dram.pageHitRate 89.16 # Row buffer hit
>>> rate, read and write combined (Ratio)
>>> -------------------------------------------------------------
>>>
>>> It’s indeed improving the achieved bandwidth, but still a little far
>>> away from the peak bandwidth of DDR3_1600 (12800 MiB/s). stats.txt is
>>> uploaded for reference (
>>> https://gist.github.com/wzc314/cf29275f853ee0b2fcd865f9b492c355)
>>>
>>> Any idea is appreciated!
>>> Thank you in advance!
>>>
>>> Bests,
>>> Zicong
>>>
>>>
>>>
>>> 2022年4月16日 00:08,Majid Jalili <[email protected]> 写道:
>>>
>>> Hi,
>>> Make sure your system has enough MSHRs, out of the box, L1, and L2 are
>>> set to have a few MSHR entries.
>>> Also, stride prefetcher is not the best, you may try something better:
>>> DCPT gives me better numbers.
>>>
>>> On Fri, Apr 15, 2022 at 4:57 AM Zicong Wang via gem5-users <
>>> [email protected]> wrote:
>>> Hi Jason,
>>>
>>> We are testing the memory bandwidth program STREAM (
>>> https://www.cs.virginia.edu/stream/), but the results show that the
>>> CPU cannot fully utilize the DDR bandwidth, and the achieved bandwidth is
>>> quite low and about 1/10 of the peak bandwidth (peakBW in stats.txt). I
>>> tested the STREAM binary on my x86 computer and got the near
>>> peak bandwidth, so I believe the program is ok.
>>>
>>> I've seen the maillist dialogue
>>> https://www.mail-archive.com/[email protected]/msg12965.html, and
>>> I think I've met the similar problem. So I tried the suggestions proposed
>>> by Andreas, including enable l1/l2 prefetcher, using ARM
>>> detailed CPU. Although these methods can improve the bandwidth, the results
>>> show it has limited effect. Besides, I've also tested the STREAM program in
>>> FS mode with x86 O3/Minor/TimingSimple CPU, and tested it in SE mode with
>>> ruby option, but all the results are similar and there is no essential
>>> difference.
>>>
>>> I guess it is a general problem in simulation with gem5. I'm wondering
>>> if the result is expected or is there something wrong with the system model?
>>>
>>> Two of the experimental results are attached for reference:
>>>
>>> 1. X86 O3CPU, SE-mode, w/o l2 prefetcher:
>>>
>>> ./build/X86/gem5.opt --outdir=m5out-stream configs/example/se.py
>>> --cpu-type=O3CPU --caches --l1d_size=256kB --l1i_size=256kB --l2cache
>>> --l2_size=8MB --mem-type=DDR3_1600_8x8 -c ../stream/stream
>>>
>>> STREAM output:
>>>
>>> -------------------------------------------------------------
>>> Function Best Rate MB/s Avg time Min time Max time
>>> Copy: 1099.0 0.014559 0.014559 0.014559
>>> Scale: 1089.7 0.014683 0.014683 0.014683
>>> Add: 1213.0 0.019786 0.019786 0.019786
>>> Triad: 1222.1 0.019639 0.019639 0.019639
>>> -------------------------------------------------------------
>>>
>>> stats.txt (dram related):
>>>
>>> system.mem_ctrls.dram.bytesRead 238807808 # Total bytes read
>>> (Byte)
>>> system.mem_ctrls.dram.bytesWritten 121179776 # Total bytes
>>> written (Byte)
>>> system.mem_ctrls.dram.avgRdBW 718.689026 # Average DRAM read
>>> bandwidth in MiBytes/s ((Byte/Second))
>>> system.mem_ctrls.dram.avgWrBW 364.688977 # Average DRAM
>>> write bandwidth in MiBytes/s ((Byte/Second))
>>> system.mem_ctrls.dram.peakBW 12800.00 # Theoretical peak
>>> bandwidth in MiByte/s ((Byte/Second))
>>> system.mem_ctrls.dram.busUtil 8.46 # Data bus
>>> utilization in percentage (Ratio)
>>> system.mem_ctrls.dram.busUtilRead 5.61 # Data bus
>>> utilization in percentage for reads (Ratio)
>>> system.mem_ctrls.dram.busUtilWrite 2.85 # Data bus
>>> utilization in percentage for writes (Ratio)
>>> system.mem_ctrls.dram.pageHitRate 40.57 # Row buffer hit
>>> rate, read and write combined (Ratio)
>>>
>>>
>>>
>>> 2. X86 O3CPU, SE-mode, w/ l2 prefetcher:
>>>
>>> ./build/X86/gem5.opt --outdir=m5out-stream-l2hwp configs/example/se.py
>>> --cpu-type=O3CPU --caches --l1d_size=256kB --l1i_size=256kB --l2cache
>>> --l2_size=8MB --l2-hwp-typ=StridePrefetcher --mem-type=DDR3_1600_8x8 -c
>>> ../stream/stream
>>>
>>> STREAM output:
>>>
>>> -------------------------------------------------------------
>>> Function Best Rate MB/s Avg time Min time Max time
>>> Copy: 1703.9 0.009390 0.009390 0.009390
>>> Scale: 1718.6 0.009310 0.009310 0.009310
>>> Add: 2087.3 0.011498 0.011498 0.011498
>>> Triad: 2227.2 0.010776 0.010776 0.010776
>>> -------------------------------------------------------------
>>> stats.txt (dram related):
>>>
>>> system.mem_ctrls.dram.bytesRead 238811712 # Total bytes read
>>> (Byte)
>>> system.mem_ctrls.dram.bytesWritten 121179840 # Total bytes
>>> written (Byte)
>>> system.mem_ctrls.dram.avgRdBW 1014.129912 # Average DRAM read
>>> bandwidth in MiBytes/s ((Byte/Second))
>>> system.mem_ctrls.dram.avgWrBW 514.598298 # Average DRAM
>>> write bandwidth in MiBytes/s ((Byte/Second))
>>> system.mem_ctrls.dram.peakBW 12800.00 # Theoretical peak
>>> bandwidth in MiByte/s ((Byte/Second))
>>> system.mem_ctrls.dram.busUtil 11.94 # Data bus
>>> utilization in percentage (Ratio)
>>> system.mem_ctrls.dram.busUtilRead 7.92 # Data bus
>>> utilization in percentage for reads (Ratio)
>>> system.mem_ctrls.dram.busUtilWrite 4.02 # Data bus
>>> utilization in percentage for writes (Ratio)
>>> system.mem_ctrls.dram.pageHitRate 75.37 # Row buffer hit
>>> rate, read and write combined (Ratio)
>>>
>>>
>>>
>>> STREAM compiling options:
>>>
>>> gcc -O2 -static -DSTREAM_ARRAY_SIZE=1000000 -DNTIMES=2 stream.c -o
>>> stream
>>>
>>> All the experiments are performed on the latest stable
>>> version (141cc37c2d4b93959d4c249b8f7e6a8b2ef75338, v21.2.1).
>>>
>>> Thank you very much!
>>>
>>>
>>>
>>> Best Regards,
>>>
>>> Zicong
>>>
>>>
>>>
>>> _______________________________________________
>>> gem5-users mailing list -- [email protected]
>>> To unsubscribe send an email to [email protected]
>>> %(web_page_url)slistinfo%(cgiext)s/%(_internal_name)s
>>>
>>>
>>>
_______________________________________________
gem5-users mailing list -- [email protected]
To unsubscribe send an email to [email protected]
%(web_page_url)slistinfo%(cgiext)s/%(_internal_name)s