aganea updated this revision to Diff 237197.
aganea marked 13 inline comments as done.
aganea added a comment.
Herald added subscribers: lucyrfox, mgester, arpith-jacob, nicolasvasilache,
antiagainst, shauheen, burmako, jpienaar, rriddle.
Updated as suggested by @rnk.
I've also removed `ThreadPoolStrategy::PinThreads` because the results weren't
conclusive. In the best case, pinning threads to a core / hyperthread performed
about the same as using a full CPU-socket affinity; in the worst case, it
degraded performance. The NT scheduler seems to be doing a pretty good job
here, so we'll stick with it :-)
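For reference, here's a minimal sketch of how the strategies introduced by this
patch are meant to be used (the thread-count value is illustrative):

  #include "llvm/Support/ThreadPool.h"
  #include "llvm/Support/Threading.h"

  // Default strategy: all hardware threads allowed by the process affinity
  // mask.
  llvm::ThreadPool Pool(llvm::hardware_concurrency());

  // One thread per physical core; better for memory-bound workloads.
  llvm::ThreadPool HeavyPool(llvm::heavyweight_hardware_concurrency());

  // Suggest an upper bound of 8 threads; the runtime may use fewer, never
  // more.
  llvm::ThreadPool CappedPool(llvm::hardware_concurrency(8));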
CHANGES SINCE LAST ACTION
https://reviews.llvm.org/D71775/new/
https://reviews.llvm.org/D71775
Files:
clang-tools-extra/clang-doc/tool/ClangDocMain.cpp
clang-tools-extra/clangd/TUScheduler.cpp
clang-tools-extra/clangd/index/Background.cpp
clang-tools-extra/clangd/index/Background.h
clang-tools-extra/clangd/index/BackgroundRebuild.h
clang/lib/Tooling/AllTUsExecution.cpp
clang/lib/Tooling/DependencyScanning/DependencyScanningFilesystem.cpp
clang/tools/clang-scan-deps/ClangScanDeps.cpp
lld/ELF/SyntheticSections.cpp
llvm/include/llvm/ADT/BitVector.h
llvm/include/llvm/ADT/SmallBitVector.h
llvm/include/llvm/LTO/LTO.h
llvm/include/llvm/Support/ThreadPool.h
llvm/include/llvm/Support/Threading.h
llvm/lib/CodeGen/ParallelCG.cpp
llvm/lib/ExecutionEngine/Orc/LLJIT.cpp
llvm/lib/LTO/LTO.cpp
llvm/lib/LTO/LTOBackend.cpp
llvm/lib/LTO/ThinLTOCodeGenerator.cpp
llvm/lib/Support/Host.cpp
llvm/lib/Support/Parallel.cpp
llvm/lib/Support/ThreadPool.cpp
llvm/lib/Support/Threading.cpp
llvm/lib/Support/Unix/Threading.inc
llvm/lib/Support/Windows/Threading.inc
llvm/tools/dsymutil/DwarfLinkerForBinary.cpp
llvm/tools/dsymutil/dsymutil.cpp
llvm/tools/gold/gold-plugin.cpp
llvm/tools/llvm-cov/CodeCoverage.cpp
llvm/tools/llvm-cov/CoverageExporterJson.cpp
llvm/tools/llvm-cov/CoverageReport.cpp
llvm/tools/llvm-lto2/llvm-lto2.cpp
llvm/tools/llvm-profdata/llvm-profdata.cpp
llvm/unittests/Support/Host.cpp
llvm/unittests/Support/TaskQueueTest.cpp
llvm/unittests/Support/ThreadPool.cpp
llvm/unittests/Support/Threading.cpp
mlir/lib/Pass/Pass.cpp
Index: mlir/lib/Pass/Pass.cpp
===================================================================
--- mlir/lib/Pass/Pass.cpp
+++ mlir/lib/Pass/Pass.cpp
@@ -411,7 +411,8 @@
// Create the async executors if they haven't been created, or if the main
// pipeline has changed.
if (asyncExecutors.empty() || hasSizeMismatch(asyncExecutors.front(), mgrs))
- asyncExecutors.assign(llvm::hardware_concurrency(), mgrs);
+ asyncExecutors.assign(llvm::hardware_concurrency().compute_thread_count(),
+ mgrs);
// Run a prepass over the module to collect the operations to execute over.
// This ensures that an analysis manager exists for each operation, as well as
Index: llvm/unittests/Support/Threading.cpp
===================================================================
--- llvm/unittests/Support/Threading.cpp
+++ llvm/unittests/Support/Threading.cpp
@@ -21,7 +21,8 @@
auto Num = heavyweight_hardware_concurrency();
// Since Num is unsigned this will also catch us trying to
// return -1.
- ASSERT_LE(Num, thread::hardware_concurrency());
+ ASSERT_LE(Num.compute_thread_count(),
+ hardware_concurrency().compute_thread_count());
}
#if LLVM_ENABLE_THREADS
Index: llvm/unittests/Support/ThreadPool.cpp
===================================================================
--- llvm/unittests/Support/ThreadPool.cpp
+++ llvm/unittests/Support/ThreadPool.cpp
@@ -8,11 +8,13 @@
#include "llvm/Support/ThreadPool.h"
+#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Triple.h"
#include "llvm/Support/Host.h"
#include "llvm/Support/TargetSelect.h"
+#include "llvm/Support/Threading.h"
#include "gtest/gtest.h"
@@ -69,6 +71,8 @@
void SetUp() override { MainThreadReady = false; }
+ void TestAllThreads(ThreadPoolStrategy S);
+
std::condition_variable WaitMainThread;
std::mutex WaitMainThreadMutex;
bool MainThreadReady = false;
@@ -131,7 +135,7 @@
TEST_F(ThreadPoolTest, GetFuture) {
CHECK_UNSUPPORTED();
- ThreadPool Pool{2};
+ ThreadPool Pool(hardware_concurrency(2));
std::atomic_int i{0};
Pool.async([this, &i] {
waitForMainThread();
@@ -162,3 +166,45 @@
}
ASSERT_EQ(5, checked_in);
}
+
+#if LLVM_ENABLE_THREADS == 1
+
+void ThreadPoolTest::TestAllThreads(ThreadPoolStrategy S) {
+  // FIXME: Skip these tests on non-Windows because multi-socket systems were
+  // not tested on Unix yet, and llvm::get_thread_affinity_mask() isn't
+  // implemented for Unix.
+ Triple Host(Triple::normalize(sys::getProcessTriple()));
+ if (!Host.isOSWindows())
+ return;
+
+ llvm::DenseSet<llvm::BitVector> ThreadsUsed;
+ std::mutex Lock;
+ unsigned Threads = 0;
+ {
+ ThreadPool Pool(S);
+ Threads = Pool.getThreadCount();
+ for (size_t I = 0; I < 10000; ++I) {
+ Pool.async([&] {
+ waitForMainThread();
+ std::lock_guard<std::mutex> Guard(Lock);
+ auto Mask = llvm::get_thread_affinity_mask();
+ ThreadsUsed.insert(Mask);
+ });
+ }
+    ASSERT_TRUE(ThreadsUsed.empty());
+ setMainThreadReady();
+ }
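+  // Every processor group should have been used, i.e. we expect one distinct
+  // affinity mask (BitVector) per CPU socket.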
+ ASSERT_EQ(llvm::get_cpus(), ThreadsUsed.size());
+}
+
+TEST_F(ThreadPoolTest, AllThreads_UseAllResources) {
+ CHECK_UNSUPPORTED();
+ TestAllThreads({});
+}
+
+TEST_F(ThreadPoolTest, AllThreads_OneThreadPerCore) {
+ CHECK_UNSUPPORTED();
+ TestAllThreads(llvm::heavyweight_hardware_concurrency());
+}
+
+#endif
Index: llvm/unittests/Support/TaskQueueTest.cpp
===================================================================
--- llvm/unittests/Support/TaskQueueTest.cpp
+++ llvm/unittests/Support/TaskQueueTest.cpp
@@ -22,7 +22,7 @@
};
TEST_F(TaskQueueTest, OrderedFutures) {
- ThreadPool TP(1);
+ ThreadPool TP(hardware_concurrency(1));
TaskQueue TQ(TP);
std::atomic<int> X{ 0 };
std::atomic<int> Y{ 0 };
@@ -66,7 +66,7 @@
}
TEST_F(TaskQueueTest, UnOrderedFutures) {
- ThreadPool TP(1);
+ ThreadPool TP(hardware_concurrency(1));
TaskQueue TQ(TP);
std::atomic<int> X{ 0 };
std::atomic<int> Y{ 0 };
@@ -96,7 +96,7 @@
}
TEST_F(TaskQueueTest, FutureWithReturnValue) {
- ThreadPool TP(1);
+ ThreadPool TP(hardware_concurrency(1));
TaskQueue TQ(TP);
std::future<std::string> F1 = TQ.async([&] { return std::string("Hello"); });
std::future<int> F2 = TQ.async([&] { return 42; });
Index: llvm/unittests/Support/Host.cpp
===================================================================
--- llvm/unittests/Support/Host.cpp
+++ llvm/unittests/Support/Host.cpp
@@ -37,7 +37,8 @@
// Initially this is only testing detection of the number of
// physical cores, which is currently only supported/tested for
// x86_64 Linux and Darwin.
- return (Host.getArch() == Triple::x86_64 &&
+ return Host.isOSWindows() ||
+ (Host.getArch() == Triple::x86_64 &&
(Host.isOSDarwin() || Host.getOS() == Triple::Linux));
}
Index: llvm/tools/llvm-profdata/llvm-profdata.cpp
===================================================================
--- llvm/tools/llvm-profdata/llvm-profdata.cpp
+++ llvm/tools/llvm-profdata/llvm-profdata.cpp
@@ -307,8 +307,11 @@
// If NumThreads is not specified, auto-detect a good default.
if (NumThreads == 0)
- NumThreads =
- std::min(hardware_concurrency(), unsigned((Inputs.size() + 1) / 2));
+ NumThreads = std::min(hardware_concurrency().compute_thread_count(),
+ unsigned((Inputs.size() + 1) / 2));
+ // FIXME: There's a bug here, where setting NumThreads = Inputs.size() fails
+ // the merge_empty_profile.test because the InstrProfWriter.ProfileKind isn't
+ // merged, thus the emitted file ends up with a PF_Unknown kind.
// Initialize the writer contexts.
SmallVector<std::unique_ptr<WriterContext>, 4> Contexts;
@@ -320,7 +323,7 @@
for (const auto &Input : Inputs)
loadInput(Input, Remapper, Contexts[0].get());
} else {
- ThreadPool Pool(NumThreads);
+ ThreadPool Pool(hardware_concurrency(NumThreads));
// Load the inputs in parallel (N/NumThreads serial steps).
unsigned Ctx = 0;
Index: llvm/tools/llvm-lto2/llvm-lto2.cpp
===================================================================
--- llvm/tools/llvm-lto2/llvm-lto2.cpp
+++ llvm/tools/llvm-lto2/llvm-lto2.cpp
@@ -65,8 +65,8 @@
"import files for the "
"distributed backend case"));
-static cl::opt<int> Threads("thinlto-threads",
- cl::init(llvm::heavyweight_hardware_concurrency()));
+// Default to using all hardware cores in the system.
+static cl::opt<int> Threads("thinlto-threads", cl::init(0));
static cl::list<std::string> SymbolResolutions(
"r",
Index: llvm/tools/llvm-cov/CoverageReport.cpp
===================================================================
--- llvm/tools/llvm-cov/CoverageReport.cpp
+++ llvm/tools/llvm-cov/CoverageReport.cpp
@@ -356,11 +356,8 @@
// If NumThreads is not specified, auto-detect a good default.
if (NumThreads == 0)
- NumThreads =
- std::max(1U, std::min(llvm::heavyweight_hardware_concurrency(),
- unsigned(Files.size())));
-
- ThreadPool Pool(NumThreads);
+ NumThreads = Files.size();
+ ThreadPool Pool(heavyweight_hardware_concurrency(NumThreads));
std::vector<FileCoverageSummary> FileReports;
FileReports.reserve(Files.size());
Index: llvm/tools/llvm-cov/CoverageExporterJson.cpp
===================================================================
--- llvm/tools/llvm-cov/CoverageExporterJson.cpp
+++ llvm/tools/llvm-cov/CoverageExporterJson.cpp
@@ -163,11 +163,9 @@
ArrayRef<FileCoverageSummary> FileReports,
const CoverageViewOptions &Options) {
auto NumThreads = Options.NumThreads;
- if (NumThreads == 0) {
- NumThreads = std::max(1U, std::min(llvm::heavyweight_hardware_concurrency(),
- unsigned(SourceFiles.size())));
- }
- ThreadPool Pool(NumThreads);
+ if (NumThreads == 0)
+ NumThreads = SourceFiles.size();
+ ThreadPool Pool(heavyweight_hardware_concurrency(NumThreads));
json::Array FileArray;
std::mutex FileArrayMutex;
Index: llvm/tools/llvm-cov/CodeCoverage.cpp
===================================================================
--- llvm/tools/llvm-cov/CodeCoverage.cpp
+++ llvm/tools/llvm-cov/CodeCoverage.cpp
@@ -944,9 +944,7 @@
// If NumThreads is not specified, auto-detect a good default.
if (NumThreads == 0)
- NumThreads =
- std::max(1U, std::min(llvm::heavyweight_hardware_concurrency(),
- unsigned(SourceFiles.size())));
+ NumThreads = SourceFiles.size();
if (!ViewOpts.hasOutputDirectory() || NumThreads == 1) {
for (const std::string &SourceFile : SourceFiles)
@@ -954,7 +952,7 @@
ShowFilenames);
} else {
// In -output-dir mode, it's safe to use multiple threads to print files.
- ThreadPool Pool(NumThreads);
+ ThreadPool Pool(heavyweight_hardware_concurrency(NumThreads));
for (const std::string &SourceFile : SourceFiles)
Pool.async(&CodeCoverageTool::writeSourceFileView, this, SourceFile,
Coverage.get(), Printer.get(), ShowFilenames);
Index: llvm/tools/gold/gold-plugin.cpp
===================================================================
--- llvm/tools/gold/gold-plugin.cpp
+++ llvm/tools/gold/gold-plugin.cpp
@@ -134,8 +134,8 @@
static unsigned OptLevel = 2;
// Default parallelism of 0 used to indicate that user did not specify.
// Actual parallelism default value depends on implementation.
- // Currently only affects ThinLTO, where the default is
- // llvm::heavyweight_hardware_concurrency.
+  // Currently only affects ThinLTO, where the default is the number of
+  // hardware cores in the system.
static unsigned Parallelism = 0;
// Default regular LTO codegen parallelism (number of partitions).
static unsigned ParallelCodeGenParallelismLevel = 1;
Index: llvm/tools/dsymutil/dsymutil.cpp
===================================================================
--- llvm/tools/dsymutil/dsymutil.cpp
+++ llvm/tools/dsymutil/dsymutil.cpp
@@ -258,7 +258,7 @@
if (opt::Arg *NumThreads = Args.getLastArg(OPT_threads))
Options.LinkOpts.Threads = atoi(NumThreads->getValue());
else
- Options.LinkOpts.Threads = thread::hardware_concurrency();
+ Options.LinkOpts.Threads = 0; // Use all available hardware threads
if (Options.DumpDebugMap || Options.LinkOpts.Verbose)
Options.LinkOpts.Threads = 1;
@@ -540,9 +540,10 @@
// Shared a single binary holder for all the link steps.
BinaryHolder BinHolder;
- unsigned ThreadCount =
- std::min<unsigned>(Options.LinkOpts.Threads, DebugMapPtrsOrErr->size());
- ThreadPool Threads(ThreadCount);
+ unsigned ThreadCount = Options.LinkOpts.Threads;
+ if (!ThreadCount)
+ ThreadCount = DebugMapPtrsOrErr->size();
+ ThreadPool Threads(hardware_concurrency(ThreadCount));
// If there is more than one link to execute, we need to generate
// temporary files.
Index: llvm/tools/dsymutil/DwarfLinkerForBinary.cpp
===================================================================
--- llvm/tools/dsymutil/DwarfLinkerForBinary.cpp
+++ llvm/tools/dsymutil/DwarfLinkerForBinary.cpp
@@ -3020,7 +3020,7 @@
// errors from unchecked llvm::Error objects.
Error RemarkLinkAllError = Error::success();
- ThreadPool pool(3);
+ ThreadPool pool(hardware_concurrency(3));
pool.async(AnalyzeAll);
pool.async(CloneAll);
pool.async(RemarkLinkAll, std::ref(RemarkLinkAllError));
Index: llvm/lib/Support/Windows/Threading.inc
===================================================================
--- llvm/lib/Support/Windows/Threading.inc
+++ llvm/lib/Support/Windows/Threading.inc
@@ -16,6 +16,8 @@
#include "WindowsSupport.h"
#include <process.h>
+#include <bitset>
+
// Windows will at times define MemoryFence.
#ifdef MemoryFence
#undef MemoryFence
@@ -122,3 +124,163 @@
? SetThreadPriorityResult::SUCCESS
: SetThreadPriorityResult::FAILURE;
}
+
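+// Describes a Windows processor group: a set of up to 64 hardware threads
+// (typically one CPU socket or NUMA node) that a thread can be affinitized to.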
+struct ProcessorGroup {
+ unsigned ID;
+ unsigned AllThreads;
+ unsigned UsableThreads;
+ unsigned ThreadsPerCore;
+ uint64_t Affinity;
+};
+
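+// Calls \p Fn once for each entry of the requested \p Relationship returned
+// by GetLogicalProcessorInformationEx(). Returns false if the system call
+// fails.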
+template <typename F>
+static bool IterateProcInfo(LOGICAL_PROCESSOR_RELATIONSHIP Relationship, F Fn) {
+ DWORD Len = 0;
+ BOOL R = ::GetLogicalProcessorInformationEx(Relationship, NULL, &Len);
+ if (R || GetLastError() != ERROR_INSUFFICIENT_BUFFER) {
+ return false;
+ }
+ auto *Info = (SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX *)calloc(1, Len);
+ R = ::GetLogicalProcessorInformationEx(Relationship, Info, &Len);
+ if (R) {
+ auto *End =
+ (SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX *)((uint8_t *)Info + Len);
+ for (auto *Curr = Info; Curr < End;
+ Curr = (SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX *)((uint8_t *)Curr +
+ Curr->Size)) {
+ if (Curr->Relationship != Relationship)
+ continue;
+ Fn(Curr);
+ }
+ }
+ free(Info);
+ return true;
+}
+
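+// Enumerates the processor groups in the system, computed once and cached.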
+static ArrayRef<ProcessorGroup> getProcessorGroups() {
+ auto computeGroups = []() {
+ SmallVector<ProcessorGroup, 4> Groups;
+
+ auto HandleGroup = [&](SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX *ProcInfo) {
+ GROUP_RELATIONSHIP &El = ProcInfo->Group;
+ for (unsigned J = 0; J < El.ActiveGroupCount; ++J) {
+ ProcessorGroup G;
+ G.ID = Groups.size();
+ G.AllThreads = El.GroupInfo[J].MaximumProcessorCount;
+ G.UsableThreads = El.GroupInfo[J].ActiveProcessorCount;
+ assert(G.UsableThreads <= 64);
+ G.Affinity = El.GroupInfo[J].ActiveProcessorMask;
+ Groups.push_back(G);
+ }
+ };
+
+ if (!IterateProcInfo(RelationGroup, HandleGroup))
+ return std::vector<ProcessorGroup>();
+
+ auto HandleProc = [&](SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX *ProcInfo) {
+ PROCESSOR_RELATIONSHIP &El = ProcInfo->Processor;
+ assert(El.GroupCount == 1);
+ unsigned NumHyperThreads = 1;
+ // If the flag is set, each core supports more than one hyper-thread.
+ if (El.Flags & LTP_PC_SMT)
+ NumHyperThreads = std::bitset<64>(El.GroupMask[0].Mask).count();
+ unsigned I = El.GroupMask[0].Group;
+ Groups[I].ThreadsPerCore = NumHyperThreads;
+ };
+
+ if (!IterateProcInfo(RelationProcessorCore, HandleProc))
+ return std::vector<ProcessorGroup>();
+
+ // If there's an affinity mask set on one of the CPUs, then assume the user
+ // wants to constrain the current process to only a single CPU.
+ for (auto &G : Groups) {
+ if (G.UsableThreads != G.AllThreads) {
+ ProcessorGroup NewG{G};
+ Groups.clear();
+ Groups.push_back(NewG);
+ break;
+ }
+ }
+
+ return std::vector<ProcessorGroup>(Groups.begin(), Groups.end());
+ };
+ static auto Groups = computeGroups();
+ return ArrayRef<ProcessorGroup>(Groups);
+}
+
+template <typename R, typename UnaryPredicate>
+static unsigned aggregate(R &&Range, UnaryPredicate P) {
+ unsigned I{};
+ for (const auto &It : Range)
+ I += P(It);
+ return I;
+}
+
+// for sys::getHostNumPhysicalCores
+int computeHostNumPhysicalCores() {
+ static unsigned Cores =
+ aggregate(getProcessorGroups(), [](const ProcessorGroup &G) {
+ return G.UsableThreads / G.ThreadsPerCore;
+ });
+ return Cores;
+}
+
+int computeHostNumHardwareThreads() {
+ static unsigned Threads =
+ aggregate(getProcessorGroups(),
+ [](const ProcessorGroup &G) { return G.UsableThreads; });
+ return Threads;
+}
+
+// Assign the current thread to a more appropriate CPU socket or CPU group
+void llvm::ThreadPoolStrategy::apply_thread_strategy(
+ unsigned ThreadPoolNum) const {
+ ArrayRef<ProcessorGroup> Groups = getProcessorGroups();
+
+ assert(ThreadPoolNum < compute_thread_count() &&
+ "The thread index is not within thread strategy's range!");
+
+  // In this mode, the ThreadPoolNum represents the core number, not the
+  // hyper-thread number. Assumes all NUMA groups have the same number of
+  // hyper-threads per core.
+ if (!UseHyperThreads)
+ ThreadPoolNum *= Groups[0].ThreadsPerCore;
+
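+  // Find the processor group whose thread range contains ThreadPoolNum, and
+  // bind the current thread to all the cores of that group.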
+ unsigned ThreadRangeStart = 0;
+ for (unsigned I = 0; I < Groups.size(); ++I) {
+ const ProcessorGroup &G = Groups[I];
+ if (ThreadPoolNum >= ThreadRangeStart &&
+ ThreadPoolNum < ThreadRangeStart + G.UsableThreads) {
+
+ GROUP_AFFINITY Affinity{};
+ Affinity.Group = G.ID;
+ Affinity.Mask = G.Affinity;
+ SetThreadGroupAffinity(GetCurrentThread(), &Affinity, nullptr);
+ }
+ ThreadRangeStart += G.UsableThreads;
+ }
+}
+
+llvm::BitVector llvm::get_thread_affinity_mask() {
+ GROUP_AFFINITY Affinity{};
+ GetThreadGroupAffinity(GetCurrentThread(), &Affinity);
+
+ static unsigned All =
+ aggregate(getProcessorGroups(),
+ [](const ProcessorGroup &G) { return G.AllThreads; });
+
+ unsigned StartOffset =
+ aggregate(getProcessorGroups(), [&](const ProcessorGroup &G) {
+ return G.ID < Affinity.Group ? G.AllThreads : 0;
+ });
+
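+  // Flatten the group-relative affinity mask into a BitVector indexed over
+  // all the hardware threads in the system.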
+ llvm::BitVector V;
+ V.resize(All);
+ for (unsigned I = 0; I < sizeof(KAFFINITY) * 8; ++I) {
+ if ((Affinity.Mask >> I) & 1)
+ V.set(StartOffset + I);
+ }
+ return V;
+}
+
+unsigned llvm::get_cpus() { return getProcessorGroups().size(); }
Index: llvm/lib/Support/Unix/Threading.inc
===================================================================
--- llvm/lib/Support/Unix/Threading.inc
+++ llvm/lib/Support/Unix/Threading.inc
@@ -267,3 +267,27 @@
#endif
return SetThreadPriorityResult::FAILURE;
}
+
+#include <thread>
+
+int computeHostNumHardwareThreads() {
+#if defined(HAVE_SCHED_GETAFFINITY) && defined(HAVE_CPU_COUNT)
+ cpu_set_t Set;
+  // Note: sched_getaffinity() returns 0 on success.
+  if (sched_getaffinity(0, sizeof(Set), &Set) == 0)
+    return CPU_COUNT(&Set);
+#endif
+ // Guard against std::thread::hardware_concurrency() returning 0.
+ if (unsigned Val = std::thread::hardware_concurrency())
+ return Val;
+ return 1;
+}
+
+void llvm::ThreadPoolStrategy::apply_thread_strategy(
+ unsigned ThreadPoolNum) const {}
+
+llvm::BitVector llvm::get_thread_affinity_mask() {
+ // FIXME: Implement
+ llvm_unreachable("Not implemented!");
+}
+
+unsigned llvm::get_cpus() { return 1; }
Index: llvm/lib/Support/Threading.cpp
===================================================================
--- llvm/lib/Support/Threading.cpp
+++ llvm/lib/Support/Threading.cpp
@@ -45,10 +45,6 @@
Fn(UserData);
}
-unsigned llvm::heavyweight_hardware_concurrency() { return 1; }
-
-unsigned llvm::hardware_concurrency() { return 1; }
-
uint64_t llvm::get_threadid() { return 0; }
uint32_t llvm::get_max_thread_name_length() { return 0; }
@@ -57,6 +53,14 @@
void llvm::get_thread_name(SmallVectorImpl<char> &Name) { Name.clear(); }
+llvm::BitVector llvm::get_thread_affinity_mask() { return {}; }
+
+unsigned llvm::ThreadPoolStrategy::compute_thread_count() const {
+  // When threads are disabled, return 1 so that clients will still loop at
+  // least once.
+ return 1;
+}
+
#if LLVM_ENABLE_THREADS == 0
void llvm::llvm_execute_on_thread_async(
llvm::unique_function<void()> Func,
@@ -78,30 +82,19 @@
#else
-#include <thread>
-unsigned llvm::heavyweight_hardware_concurrency() {
- // Since we can't get here unless LLVM_ENABLE_THREADS == 1, it is safe to use
- // `std::thread` directly instead of `llvm::thread` (and indeed, doing so
- // allows us to not define `thread` in the llvm namespace, which conflicts
- // with some platforms such as FreeBSD whose headers also define a struct
- // called `thread` in the global namespace which can cause ambiguity due to
- // ADL.
- int NumPhysical = sys::getHostNumPhysicalCores();
- if (NumPhysical == -1)
- return std::thread::hardware_concurrency();
- return NumPhysical;
-}
+int computeHostNumHardwareThreads();
-unsigned llvm::hardware_concurrency() {
-#if defined(HAVE_SCHED_GETAFFINITY) && defined(HAVE_CPU_COUNT)
- cpu_set_t Set;
- if (sched_getaffinity(0, sizeof(Set), &Set))
- return CPU_COUNT(&Set);
-#endif
- // Guard against std::thread::hardware_concurrency() returning 0.
- if (unsigned Val = std::thread::hardware_concurrency())
- return Val;
- return 1;
+unsigned llvm::ThreadPoolStrategy::compute_thread_count() const {
+ int MaxThreadCount = UseHyperThreads ? computeHostNumHardwareThreads()
+ : sys::getHostNumPhysicalCores();
+ if (MaxThreadCount <= 0)
+ MaxThreadCount = 1;
+
+  // No need to create more threads than there are hardware threads; it would
+  // uselessly induce more context switching and cache eviction.
+ if (!ThreadsRequested || ThreadsRequested > (unsigned)MaxThreadCount)
+ return MaxThreadCount;
+ return ThreadsRequested;
}
namespace {
Index: llvm/lib/Support/ThreadPool.cpp
===================================================================
--- llvm/lib/Support/ThreadPool.cpp
+++ llvm/lib/Support/ThreadPool.cpp
@@ -20,16 +20,15 @@
#if LLVM_ENABLE_THREADS
-// Default to hardware_concurrency
-ThreadPool::ThreadPool() : ThreadPool(hardware_concurrency()) {}
-
-ThreadPool::ThreadPool(unsigned ThreadCount)
- : ActiveThreads(0), EnableFlag(true) {
+ThreadPool::ThreadPool(ThreadPoolStrategy S)
+ : ActiveThreads(0), EnableFlag(true),
+ ThreadCount(S.compute_thread_count()) {
// Create ThreadCount threads that will loop forever, wait on QueueCondition
// for tasks to be queued or the Pool to be destroyed.
Threads.reserve(ThreadCount);
for (unsigned ThreadID = 0; ThreadID < ThreadCount; ++ThreadID) {
- Threads.emplace_back([&] {
+ Threads.emplace_back([S, ThreadID, this] {
+ S.apply_thread_strategy(ThreadID);
while (true) {
PackagedTaskTy Task;
{
@@ -108,12 +107,10 @@
#else // LLVM_ENABLE_THREADS Disabled
-ThreadPool::ThreadPool() : ThreadPool(0) {}
-
// No threads are launched, issue a warning if ThreadCount is not 0
-ThreadPool::ThreadPool(unsigned ThreadCount)
- : ActiveThreads(0) {
- if (ThreadCount) {
+ThreadPool::ThreadPool(ThreadPoolStrategy S)
+ : ActiveThreads(0), ThreadCount(S.compute_thread_count()) {
+ if (ThreadCount != 1) {
errs() << "Warning: request a ThreadPool with " << ThreadCount
<< " threads, but LLVM_ENABLE_THREADS has been turned off\n";
}
@@ -138,8 +135,6 @@
return Future;
}
-ThreadPool::~ThreadPool() {
- wait();
-}
+ThreadPool::~ThreadPool() { wait(); }
#endif
Index: llvm/lib/Support/Parallel.cpp
===================================================================
--- llvm/lib/Support/Parallel.cpp
+++ llvm/lib/Support/Parallel.cpp
@@ -36,13 +36,16 @@
/// in filo order.
class ThreadPoolExecutor : public Executor {
public:
- explicit ThreadPoolExecutor(unsigned ThreadCount = hardware_concurrency())
- : Done(ThreadCount) {
+ explicit ThreadPoolExecutor(ThreadPoolStrategy S = hardware_concurrency())
+ : Strategy(S), Done(S.compute_thread_count()) {
// Spawn all but one of the threads in another thread as spawning threads
// can take a while.
- std::thread([&, ThreadCount] {
- for (size_t i = 1; i < ThreadCount; ++i) {
- std::thread([=] { work(); }).detach();
+ std::thread([&] {
+ for (size_t I = 1; I < Strategy.compute_thread_count(); ++I) {
+ std::thread([=] {
+ Strategy.apply_thread_strategy(I);
+ work();
+ }).detach();
}
work();
}).detach();
@@ -82,6 +85,7 @@
std::stack<std::function<void()>> WorkStack;
std::mutex Mutex;
std::condition_variable Cond;
+ ThreadPoolStrategy Strategy;
parallel::detail::Latch Done;
};
Index: llvm/lib/Support/Host.cpp
===================================================================
--- llvm/lib/Support/Host.cpp
+++ llvm/lib/Support/Host.cpp
@@ -1266,7 +1266,7 @@
// On Linux, the number of physical cores can be computed from /proc/cpuinfo,
// using the number of unique physical/core id pairs. The following
// implementation reads the /proc/cpuinfo format on an x86_64 system.
-static int computeHostNumPhysicalCores() {
+int computeHostNumPhysicalCores() {
// Read /proc/cpuinfo as a stream (until EOF reached). It cannot be
// mmapped because it appears to have 0 size.
llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> Text =
@@ -1312,7 +1312,7 @@
#include <sys/sysctl.h>
// Gets the number of *physical cores* on the machine.
-static int computeHostNumPhysicalCores() {
+int computeHostNumPhysicalCores() {
uint32_t count;
size_t len = sizeof(count);
sysctlbyname("hw.physicalcpu", &count, &len, NULL, 0);
@@ -1326,6 +1326,9 @@
}
return count;
}
+#elif defined(_WIN32)
+// Defined in llvm/lib/Support/Windows/Threading.inc
+int computeHostNumPhysicalCores();
#else
// On other systems, return -1 to indicate unknown.
static int computeHostNumPhysicalCores() { return -1; }
Index: llvm/lib/LTO/ThinLTOCodeGenerator.cpp
===================================================================
--- llvm/lib/LTO/ThinLTOCodeGenerator.cpp
+++ llvm/lib/LTO/ThinLTOCodeGenerator.cpp
@@ -81,8 +81,8 @@
namespace {
-static cl::opt<int>
- ThreadCount("threads", cl::init(llvm::heavyweight_hardware_concurrency()));
+// Default to using one job per hardware core in the system
+static cl::opt<int> ThreadCount("threads", cl::init(0));
// Simple helper to save temporary files for debug.
static void saveTempBitcode(const Module &TheModule, StringRef TempDir,
@@ -1037,7 +1037,7 @@
// Parallel optimizer + codegen
{
- ThreadPool Pool(ThreadCount);
+ ThreadPool Pool(heavyweight_hardware_concurrency(ThreadCount));
for (auto IndexCount : ModulesOrdering) {
auto &Mod = Modules[IndexCount];
Pool.async([&](int count) {
Index: llvm/lib/LTO/LTOBackend.cpp
===================================================================
--- llvm/lib/LTO/LTOBackend.cpp
+++ llvm/lib/LTO/LTOBackend.cpp
@@ -375,7 +375,8 @@
void splitCodeGen(Config &C, TargetMachine *TM, AddStreamFn AddStream,
unsigned ParallelCodeGenParallelismLevel,
std::unique_ptr<Module> Mod) {
- ThreadPool CodegenThreadPool(ParallelCodeGenParallelismLevel);
+ ThreadPool CodegenThreadPool(
+ heavyweight_hardware_concurrency(ParallelCodeGenParallelismLevel));
unsigned ThreadCount = 0;
const Target *T = &TM->getTarget();
Index: llvm/lib/LTO/LTO.cpp
===================================================================
--- llvm/lib/LTO/LTO.cpp
+++ llvm/lib/LTO/LTO.cpp
@@ -475,8 +475,7 @@
LTO::ThinLTOState::ThinLTOState(ThinBackend Backend)
: Backend(Backend), CombinedIndex(/*HaveGVs*/ false) {
if (!Backend)
- this->Backend =
- createInProcessThinBackend(llvm::heavyweight_hardware_concurrency());
+ this->Backend = createInProcessThinBackend();
}
LTO::LTO(Config Conf, ThinBackend Backend,
@@ -1067,7 +1066,8 @@
const StringMap<GVSummaryMapTy> &ModuleToDefinedGVSummaries,
AddStreamFn AddStream, NativeObjectCache Cache)
: ThinBackendProc(Conf, CombinedIndex, ModuleToDefinedGVSummaries),
- BackendThreadPool(ThinLTOParallelismLevel),
+ BackendThreadPool(
+ heavyweight_hardware_concurrency(ThinLTOParallelismLevel)),
AddStream(std::move(AddStream)), Cache(std::move(Cache)) {
for (auto &Name : CombinedIndex.cfiFunctionDefs())
CfiFunctionDefs.insert(
Index: llvm/lib/ExecutionEngine/Orc/LLJIT.cpp
===================================================================
--- llvm/lib/ExecutionEngine/Orc/LLJIT.cpp
+++ llvm/lib/ExecutionEngine/Orc/LLJIT.cpp
@@ -153,7 +153,8 @@
if (S.NumCompileThreads > 0) {
CompileLayer->setCloneToNewContextOnEmit(true);
- CompileThreads = std::make_unique<ThreadPool>(S.NumCompileThreads);
+ CompileThreads =
+ std::make_unique<ThreadPool>(hardware_concurrency(S.NumCompileThreads));
ES->setDispatchMaterialization(
[this](JITDylib &JD, std::unique_ptr<MaterializationUnit> MU) {
// FIXME: Switch to move capture once we have c++14.
Index: llvm/lib/CodeGen/ParallelCG.cpp
===================================================================
--- llvm/lib/CodeGen/ParallelCG.cpp
+++ llvm/lib/CodeGen/ParallelCG.cpp
@@ -51,7 +51,7 @@
// Create ThreadPool in nested scope so that threads will be joined
// on destruction.
{
- ThreadPool CodegenThreadPool(OSs.size());
+ ThreadPool CodegenThreadPool(hardware_concurrency(OSs.size()));
int ThreadCount = 0;
SplitModule(
Index: llvm/include/llvm/Support/Threading.h
===================================================================
--- llvm/include/llvm/Support/Threading.h
+++ llvm/include/llvm/Support/Threading.h
@@ -14,6 +14,7 @@
#ifndef LLVM_SUPPORT_THREADING_H
#define LLVM_SUPPORT_THREADING_H
+#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/FunctionExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Config/llvm-config.h" // for LLVM_ON_UNIX
@@ -143,20 +144,57 @@
#endif
}
- /// Get the amount of currency to use for tasks requiring significant
- /// memory or other resources. Currently based on physical cores, if
- /// available for the host system, otherwise falls back to
- /// thread::hardware_concurrency().
- /// Returns 1 when LLVM is configured with LLVM_ENABLE_THREADS=OFF
- unsigned heavyweight_hardware_concurrency();
+ /// This tells how a thread pool will be used
+ class ThreadPoolStrategy {
+ public:
+    // The default value (0) means use all available threads, except those
+    // excluded by the process affinity mask. If set, this value only
+    // represents a suggested upper bound; the runtime may use fewer threads
+    // (never more).
+ unsigned ThreadsRequested = 0;
+
+    // If true and SMT is active, spawn one std::thread per hyper-thread.
+    // If false, spawn only one std::thread per physical core.
+ bool UseHyperThreads = true;
+
+ /// Retrieves the max available threads for the current strategy. This
+ /// accounts for affinity masks and takes advantage of all CPU sockets.
+ unsigned compute_thread_count() const;
+
+    /// Assign the current thread to an ideal hardware CPU or NUMA group. In a
+    /// multi-socket system, this ensures threads are spread over all CPU
+    /// sockets. \p ThreadPoolNum is a number in the range
+    /// [0, compute_thread_count()). See the ThreadPoolStrategy fields above.
+ void apply_thread_strategy(unsigned ThreadPoolNum) const;
+ };
+
+  /// Returns a thread strategy for tasks requiring significant memory or other
+  /// resources. To be used for workloads where hardware_concurrency() proves to
+  /// be less efficient. Avoid this strategy if doing lots of I/O. Currently
+  /// based on physical cores, if available for the host system; otherwise falls
+  /// back to hardware_concurrency(). The computed thread count is 1 when LLVM
+  /// is configured with LLVM_ENABLE_THREADS=OFF.
+ inline ThreadPoolStrategy
+ heavyweight_hardware_concurrency(unsigned ThreadCount = 0) {
+ ThreadPoolStrategy S;
+ S.UseHyperThreads = false;
+ S.ThreadsRequested = ThreadCount;
+ return S;
+ }
/// Get the number of threads that the current program can execute
- /// concurrently. On some systems std::thread::hardware_concurrency() returns
- /// the total number of cores, without taking affinity into consideration.
- /// Returns 1 when LLVM is configured with LLVM_ENABLE_THREADS=OFF.
- /// Fallback to std::thread::hardware_concurrency() if sched_getaffinity is
- /// not available.
- unsigned hardware_concurrency();
+  /// concurrently. Returns the default thread strategy, which uses all
+  /// available hardware resources except those initially excluded by the
+  /// process affinity mask. On some systems std::thread::hardware_concurrency()
+  /// returns the total number of cores, without taking affinity into
+  /// consideration. The computed thread count is 1 when LLVM is configured
+  /// with LLVM_ENABLE_THREADS=OFF.
+ inline ThreadPoolStrategy hardware_concurrency(unsigned ThreadCount = 0) {
+ ThreadPoolStrategy S;
+ S.ThreadsRequested = ThreadCount;
+ return S;
+ }
/// Return the current thread id, as used in various OS system calls.
/// Note that not all platforms guarantee that the value returned will be
@@ -184,6 +222,14 @@
/// the operation succeeded or failed is returned.
void get_thread_name(SmallVectorImpl<char> &Name);
+  /// Returns a mask that represents the hardware threads (and thus the cores,
+  /// CPUs and NUMA groups) on which the calling thread can execute. On
+  /// Windows, threads cannot cross CPU boundaries.
+ llvm::BitVector get_thread_affinity_mask();
+
+ /// Returns how many physical CPUs or NUMA groups the system has.
+ unsigned get_cpus();
+
enum class ThreadPriority {
Background = 0,
Default = 1,
Index: llvm/include/llvm/Support/ThreadPool.h
===================================================================
--- llvm/include/llvm/Support/ThreadPool.h
+++ llvm/include/llvm/Support/ThreadPool.h
@@ -13,7 +13,9 @@
#ifndef LLVM_SUPPORT_THREAD_POOL_H
#define LLVM_SUPPORT_THREAD_POOL_H
+#include "llvm/ADT/BitVector.h"
#include "llvm/Config/llvm-config.h"
+#include "llvm/Support/Threading.h"
#include "llvm/Support/thread.h"
#include <future>
@@ -38,12 +40,11 @@
using TaskTy = std::function<void()>;
using PackagedTaskTy = std::packaged_task<void()>;
- /// Construct a pool with the number of threads found by
- /// hardware_concurrency().
- ThreadPool();
-
- /// Construct a pool of \p ThreadCount threads
- ThreadPool(unsigned ThreadCount);
+  /// Construct a pool using the hardware strategy \p S for mapping hardware
+  /// execution resources (threads, cores, CPUs).
+  /// Defaults to using the maximum execution resources in the system, minus
+  /// any resources excluded by the process affinity mask.
+ ThreadPool(ThreadPoolStrategy S = hardware_concurrency());
/// Blocking destructor: the pool will wait for all the threads to complete.
~ThreadPool();
@@ -68,6 +69,8 @@
/// It is an error to try to add new tasks while blocking on this call.
void wait();
+ unsigned getThreadCount() const { return ThreadCount; }
+
private:
/// Asynchronous submission of a task to the pool. The returned future can be
/// used to wait for the task to finish and is *non-blocking* on destruction.
@@ -94,6 +97,8 @@
/// Signal for the destruction of the pool, asking thread to exit.
bool EnableFlag;
#endif
+
+ unsigned ThreadCount;
};
}
Index: llvm/include/llvm/LTO/LTO.h
===================================================================
--- llvm/include/llvm/LTO/LTO.h
+++ llvm/include/llvm/LTO/LTO.h
@@ -227,7 +227,8 @@
AddStreamFn AddStream, NativeObjectCache Cache)>;
/// This ThinBackend runs the individual backend jobs in-process.
-ThinBackend createInProcessThinBackend(unsigned ParallelismLevel);
+/// The default value means to use one job per hardware core (not hyper-thread).
+ThinBackend createInProcessThinBackend(unsigned ParallelismLevel = 0);
/// This ThinBackend writes individual module indexes to files, instead of
/// running the individual backend jobs. This backend is for distributed builds
Index: llvm/include/llvm/ADT/SmallBitVector.h
===================================================================
--- llvm/include/llvm/ADT/SmallBitVector.h
+++ llvm/include/llvm/ADT/SmallBitVector.h
@@ -662,6 +662,16 @@
getPointer()->clearBitsNotInMask(Mask, MaskWords);
}
+ void invalid() {
+ assert(empty());
+ X = (uintptr_t)-1;
+ }
+ bool isInvalid() const { return X == (uintptr_t)-1; }
+
+ ArrayRef<uintptr_t> getData() const {
+ return isSmall() ? makeArrayRef(X) : getPointer()->getData();
+ }
+
private:
template <bool AddBits, bool InvertMask>
void applyMask(const uint32_t *Mask, unsigned MaskWords) {
@@ -699,6 +709,23 @@
return Result;
}
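+// Allow SmallBitVector to be used as a key in DenseMap/DenseSet; the
+// tombstone key is a vector marked with invalid().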
+template <> struct DenseMapInfo<SmallBitVector> {
+ static inline SmallBitVector getEmptyKey() { return SmallBitVector(); }
+ static inline SmallBitVector getTombstoneKey() {
+ SmallBitVector V;
+ V.invalid();
+ return V;
+ }
+  static unsigned getHashValue(const SmallBitVector &V) {
+ return DenseMapInfo<std::pair<unsigned, ArrayRef<uintptr_t>>>::getHashValue(
+ std::make_pair(V.size(), V.getData()));
+ }
+ static bool isEqual(const SmallBitVector &LHS, const SmallBitVector &RHS) {
+ if (LHS.isInvalid() || RHS.isInvalid())
+ return LHS.isInvalid() == RHS.isInvalid();
+ return LHS == RHS;
+ }
+};
} // end namespace llvm
namespace std {
Index: llvm/include/llvm/ADT/BitVector.h
===================================================================
--- llvm/include/llvm/ADT/BitVector.h
+++ llvm/include/llvm/ADT/BitVector.h
@@ -14,6 +14,7 @@
#define LLVM_ADT_BITVECTOR_H
#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMapInfo.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Support/MathExtras.h"
#include <algorithm>
@@ -758,6 +759,14 @@
std::swap(Size, RHS.Size);
}
+ void invalid() {
+ assert(!Size && Bits.empty());
+ Size = (unsigned)-1;
+ }
+ bool isInvalid() const { return Size == (unsigned)-1; }
+
+ ArrayRef<BitWord> getData() const { return Bits; }
+
//===--------------------------------------------------------------------===//
// Portable bit mask operations.
//===--------------------------------------------------------------------===//
@@ -932,6 +941,23 @@
return X.getMemorySize();
}
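+// Allow BitVector to be used as a key in DenseMap/DenseSet; the tombstone key
+// is a vector marked with invalid() (Size == -1).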
+template <> struct DenseMapInfo<BitVector> {
+ static inline BitVector getEmptyKey() { return BitVector(); }
+ static inline BitVector getTombstoneKey() {
+ BitVector V;
+ V.invalid();
+ return V;
+ }
+ static unsigned getHashValue(const BitVector &V) {
+ return DenseMapInfo<std::pair<unsigned, ArrayRef<uintptr_t>>>::getHashValue(
+ std::make_pair(V.size(), V.getData()));
+ }
+ static bool isEqual(const BitVector &LHS, const BitVector &RHS) {
+ if (LHS.isInvalid() || RHS.isInvalid())
+ return LHS.isInvalid() == RHS.isInvalid();
+ return LHS == RHS;
+ }
+};
} // end namespace llvm
namespace std {
Index: lld/ELF/SyntheticSections.cpp
===================================================================
--- lld/ELF/SyntheticSections.cpp
+++ lld/ELF/SyntheticSections.cpp
@@ -2668,8 +2668,8 @@
size_t numShards = 32;
size_t concurrency = 1;
if (threadsEnabled)
- concurrency =
- std::min<size_t>(PowerOf2Floor(hardware_concurrency()), numShards);
+ concurrency = std::min<size_t>(
+ hardware_concurrency().compute_thread_count(), numShards);
// A sharded map to uniquify symbols by name.
std::vector<DenseMap<CachedHashStringRef, size_t>> map(numShards);
@@ -3112,8 +3112,8 @@
// operations in the following tight loop.
size_t concurrency = 1;
if (threadsEnabled)
- concurrency =
- std::min<size_t>(PowerOf2Floor(hardware_concurrency()), numShards);
+ concurrency = std::min<size_t>(
+ hardware_concurrency().compute_thread_count(), numShards);
// Add section pieces to the builders.
parallelForEachN(0, concurrency, [&](size_t threadId) {
Index: clang/tools/clang-scan-deps/ClangScanDeps.cpp
===================================================================
--- clang/tools/clang-scan-deps/ClangScanDeps.cpp
+++ clang/tools/clang-scan-deps/ClangScanDeps.cpp
@@ -17,6 +17,7 @@
#include "llvm/Support/InitLLVM.h"
#include "llvm/Support/Program.h"
#include "llvm/Support/Signals.h"
+#include "llvm/Support/ThreadPool.h"
#include "llvm/Support/Threading.h"
#include <mutex>
#include <thread>
@@ -299,14 +300,9 @@
DependencyScanningService Service(ScanMode, Format, ReuseFileManager,
SkipExcludedPPRanges);
-#if LLVM_ENABLE_THREADS
- unsigned NumWorkers =
- NumThreads == 0 ? llvm::hardware_concurrency() : NumThreads;
-#else
- unsigned NumWorkers = 1;
-#endif
+ llvm::ThreadPool Pool(llvm::hardware_concurrency(NumThreads));
std::vector<std::unique_ptr<DependencyScanningTool>> WorkerTools;
- for (unsigned I = 0; I < NumWorkers; ++I)
+ for (unsigned I = 0; I < Pool.getThreadCount(); ++I)
WorkerTools.push_back(std::make_unique<DependencyScanningTool>(Service));
std::vector<SingleCommandCompilationDatabase> Inputs;
@@ -314,18 +310,17 @@
AdjustingCompilations->getAllCompileCommands())
Inputs.emplace_back(Cmd);
- std::vector<std::thread> WorkerThreads;
std::atomic<bool> HadErrors(false);
std::mutex Lock;
size_t Index = 0;
if (Verbose) {
llvm::outs() << "Running clang-scan-deps on " << Inputs.size()
- << " files using " << NumWorkers << " workers\n";
+ << " files using " << Pool.getThreadCount() << " workers\n";
}
- for (unsigned I = 0; I < NumWorkers; ++I) {
- auto Worker = [I, &Lock, &Index, &Inputs, &HadErrors, &WorkerTools,
- &DependencyOS, &Errs]() {
+ for (unsigned I = 0; I < Pool.getThreadCount(); ++I) {
+ Pool.async([I, &Lock, &Index, &Inputs, &HadErrors, &WorkerTools,
+ &DependencyOS, &Errs]() {
while (true) {
const SingleCommandCompilationDatabase *Input;
std::string Filename;
@@ -345,16 +340,9 @@
if (handleDependencyToolResult(Filename, MaybeFile, DependencyOS, Errs))
HadErrors = true;
}
- };
-#if LLVM_ENABLE_THREADS
- WorkerThreads.emplace_back(std::move(Worker));
-#else
- // Run the worker without spawning a thread when threads are disabled.
- Worker();
-#endif
+ });
}
- for (auto &W : WorkerThreads)
- W.join();
+ Pool.wait();
return HadErrors;
}
Index: clang/lib/Tooling/DependencyScanning/DependencyScanningFilesystem.cpp
===================================================================
--- clang/lib/Tooling/DependencyScanning/DependencyScanningFilesystem.cpp
+++ clang/lib/Tooling/DependencyScanning/DependencyScanningFilesystem.cpp
@@ -106,7 +106,8 @@
// sharding gives a performance edge by reducing the lock contention.
// FIXME: A better heuristic might also consider the OS to account for
// the different cost of lock contention on different OSes.
- NumShards = std::max(2u, llvm::hardware_concurrency() / 4);
+ NumShards =
+ std::max(2u, llvm::hardware_concurrency().compute_thread_count() / 4);
CacheShards = std::make_unique<CacheShard[]>(NumShards);
}
Index: clang/lib/Tooling/AllTUsExecution.cpp
===================================================================
--- clang/lib/Tooling/AllTUsExecution.cpp
+++ clang/lib/Tooling/AllTUsExecution.cpp
@@ -114,8 +114,7 @@
auto &Action = Actions.front();
{
- llvm::ThreadPool Pool(ThreadCount == 0 ? llvm::hardware_concurrency()
- : ThreadCount);
+ llvm::ThreadPool Pool(llvm::hardware_concurrency(ThreadCount));
for (std::string File : Files) {
Pool.async(
[&](std::string Path) {
Index: clang-tools-extra/clangd/index/BackgroundRebuild.h
===================================================================
--- clang-tools-extra/clangd/index/BackgroundRebuild.h
+++ clang-tools-extra/clangd/index/BackgroundRebuild.h
@@ -49,7 +49,9 @@
public:
BackgroundIndexRebuilder(SwapIndex *Target, FileSymbols *Source,
unsigned Threads)
- : TUsBeforeFirstBuild(Threads), Target(Target), Source(Source) {}
+ : TUsBeforeFirstBuild(llvm::heavyweight_hardware_concurrency(Threads)
+ .compute_thread_count()),
+ Target(Target), Source(Source) {}
// Called to indicate a TU has been indexed.
// May rebuild, if enough TUs have been indexed.
Index: clang-tools-extra/clangd/index/Background.h
===================================================================
--- clang-tools-extra/clangd/index/Background.h
+++ clang-tools-extra/clangd/index/Background.h
@@ -117,11 +117,10 @@
/// If BuildIndexPeriodMs is greater than 0, the symbol index will only be
/// rebuilt periodically (one per \p BuildIndexPeriodMs); otherwise, index is
/// rebuilt for each indexed file.
- BackgroundIndex(
- Context BackgroundContext, const FileSystemProvider &,
- const GlobalCompilationDatabase &CDB,
- BackgroundIndexStorage::Factory IndexStorageFactory,
- size_t ThreadPoolSize = llvm::heavyweight_hardware_concurrency());
+ BackgroundIndex(Context BackgroundContext, const FileSystemProvider &,
+ const GlobalCompilationDatabase &CDB,
+ BackgroundIndexStorage::Factory IndexStorageFactory,
+ size_t ThreadPoolSize = 0); // 0 = use all hardware threads
~BackgroundIndex(); // Blocks while the current task finishes.
// Enqueue translation units for indexing.
Index: clang-tools-extra/clangd/index/Background.cpp
===================================================================
--- clang-tools-extra/clangd/index/Background.cpp
+++ clang-tools-extra/clangd/index/Background.cpp
@@ -146,9 +146,10 @@
CDB.watch([&](const std::vector<std::string> &ChangedFiles) {
enqueue(ChangedFiles);
})) {
- assert(ThreadPoolSize > 0 && "Thread pool size can't be zero.");
+ assert(Rebuilder.TUsBeforeFirstBuild > 0 &&
+ "Thread pool size can't be zero.");
assert(this->IndexStorageFactory && "Storage factory can not be null!");
- for (unsigned I = 0; I < ThreadPoolSize; ++I) {
+ for (unsigned I = 0; I < Rebuilder.TUsBeforeFirstBuild; ++I) {
ThreadPool.runAsync("background-worker-" + llvm::Twine(I + 1), [this] {
WithContext Ctx(this->BackgroundContext.clone());
Queue.work([&] { Rebuilder.idle(); });
Index: clang-tools-extra/clangd/TUScheduler.cpp
===================================================================
--- clang-tools-extra/clangd/TUScheduler.cpp
+++ clang-tools-extra/clangd/TUScheduler.cpp
@@ -824,13 +824,7 @@
} // namespace
unsigned getDefaultAsyncThreadsCount() {
- unsigned HardwareConcurrency = llvm::heavyweight_hardware_concurrency();
- // heavyweight_hardware_concurrency may fall back to hardware_concurrency.
- // C++ standard says that hardware_concurrency() may return 0; fallback to 1
- // worker thread in that case.
- if (HardwareConcurrency == 0)
- return 1;
- return HardwareConcurrency;
+ return llvm::heavyweight_hardware_concurrency().compute_thread_count();
}
FileStatus TUStatus::render(PathRef File) const {
Index: clang-tools-extra/clang-doc/tool/ClangDocMain.cpp
===================================================================
--- clang-tools-extra/clang-doc/tool/ClangDocMain.cpp
+++ clang-tools-extra/clang-doc/tool/ClangDocMain.cpp
@@ -268,8 +268,7 @@
Error = false;
llvm::sys::Mutex IndexMutex;
// ExecutorConcurrency is a flag exposed by AllTUsExecution.h
- llvm::ThreadPool Pool(ExecutorConcurrency == 0 ? llvm::hardware_concurrency()
- : ExecutorConcurrency);
+ llvm::ThreadPool Pool(llvm::hardware_concurrency(ExecutorConcurrency));
for (auto &Group : USRToBitcode) {
Pool.async([&]() {
std::vector<std::unique_ptr<doc::Info>> Infos;