[PATCH] libgfortran: Replace mutex with rwlock

2022-12-20 Thread Lipeng Zhu via Gcc-patches
This patch introduces an rwlock and splits the read and write accesses
to the unit_root tree and the unit_cache, using the rwlock instead of
the mutex to increase CPU efficiency. In the get_gfc_unit function,
only around 30% of calls need to step into the insert_unit function;
in most instances the unit can be found while merely reading the
unit_cache or the unit_root tree, so splitting the read and write
phases with an rwlock lets those lookups proceed in parallel.

As a data point, the IPC metric increases from 0.25 to 2.2 on an Intel
SRP server with 220 cores. The benchmark we used is
https://github.com/rwesson/NEAT
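
To make the intended locking pattern concrete, here is a minimal sketch
of the read-mostly lookup described above. This is NOT the actual
libgfortran code: find_unit_nolock() and insert_unit_nolock() are
placeholder names, and plain POSIX rwlock calls are used so the sketch
is self-contained.

/* Readers share the lock; the write lock is only taken on the ~30% of
   calls that have to insert a new unit.  */
#include <pthread.h>
#include <stddef.h>

typedef struct gfc_unit gfc_unit;

static pthread_rwlock_t unit_rwlock = PTHREAD_RWLOCK_INITIALIZER;

extern gfc_unit *find_unit_nolock (int n);    /* placeholder: search cache/tree */
extern gfc_unit *insert_unit_nolock (int n);  /* placeholder: create and insert */

static gfc_unit *
get_unit_sketch (int n)
{
  gfc_unit *u;

  pthread_rwlock_rdlock (&unit_rwlock);       /* fast path: shared lock */
  u = find_unit_nolock (n);
  pthread_rwlock_unlock (&unit_rwlock);
  if (u != NULL)
    return u;

  pthread_rwlock_wrlock (&unit_rwlock);       /* slow path: exclusive lock */
  u = find_unit_nolock (n);                   /* re-check: another thread  */
  if (u == NULL)                              /* may have inserted it      */
    u = insert_unit_nolock (n);
  pthread_rwlock_unlock (&unit_rwlock);
  return u;
}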

libgcc/ChangeLog:

* gthr-posix.h (__GTHREAD_RWLOCK_INIT): New macro
(__gthrw): New function
(__gthread_rwlock_rdlock): New function
(__gthread_rwlock_tryrdlock): New function
(__gthread_rwlock_wrlock): New function
(__gthread_rwlock_trywrlock): New function
(__gthread_rwlock_unlock): New function

libgfortran/ChangeLog:

* io/async.c (DEBUG_LINE): New
* io/async.h (RWLOCK_DEBUG_ADD): New macro
(CHECK_RDLOCK): New macro
(CHECK_WRLOCK): New macro
(TAIL_RWLOCK_DEBUG_QUEUE): New macro
(IN_RWLOCK_DEBUG_QUEUE): New macro
(RDLOCK): New macro
(WRLOCK): New macro
(RWUNLOCK): New macro
(RD_TO_WRLOCK): New macro
(INTERN_RDLOCK): New macro
(INTERN_WRLOCK): New macro
(INTERN_RWUNLOCK): New macro
* io/io.h (internal_proto): Define unit_rwlock
* io/transfer.c (st_read_done_worker): Replace unit_lock with unit_rwlock
(st_write_done_worker): Replace unit_lock with unit_rwlock
* io/unit.c (get_gfc_unit): Replace unit_lock with unit_rwlock
(if): Replace unit_lock with unit_rwlock
(close_unit_1): Replace unit_lock with unit_rwlock
(close_units): Replace unit_lock with unit_rwlock
(newunit_alloc): Replace unit_lock with unit_rwlock
* io/unix.c (flush_all_units): Replace unit_lock with unit_rwlock
---
 libgcc/gthr-posix.h   |  52 +
 libgfortran/io/async.c|   4 +
 libgfortran/io/async.h| 151 ++
 libgfortran/io/io.h   |  15 ++--
 libgfortran/io/transfer.c |   8 +-
 libgfortran/io/unit.c |  65 
 libgfortran/io/unix.c |  16 ++--
 7 files changed, 265 insertions(+), 46 deletions(-)

diff --git a/libgcc/gthr-posix.h b/libgcc/gthr-posix.h
index f1a5ab8e075..358948e8ae8 100644
--- a/libgcc/gthr-posix.h
+++ b/libgcc/gthr-posix.h
@@ -48,6 +48,7 @@ typedef pthread_t __gthread_t;
 typedef pthread_key_t __gthread_key_t;
 typedef pthread_once_t __gthread_once_t;
 typedef pthread_mutex_t __gthread_mutex_t;
+typedef pthread_rwlock_t __gthread_rwlock_t;
 typedef pthread_mutex_t __gthread_recursive_mutex_t;
 typedef pthread_cond_t __gthread_cond_t;
 typedef struct timespec __gthread_time_t;
@@ -58,6 +59,7 @@ typedef struct timespec __gthread_time_t;
 
 #define __GTHREAD_MUTEX_INIT PTHREAD_MUTEX_INITIALIZER
 #define __GTHREAD_MUTEX_INIT_FUNCTION __gthread_mutex_init_function
+#define __GTHREAD_RWLOCK_INIT PTHREAD_RWLOCK_INITIALIZER
 #define __GTHREAD_ONCE_INIT PTHREAD_ONCE_INIT
 #if defined(PTHREAD_RECURSIVE_MUTEX_INITIALIZER)
 #define __GTHREAD_RECURSIVE_MUTEX_INIT PTHREAD_RECURSIVE_MUTEX_INITIALIZER
@@ -135,6 +137,11 @@ __gthrw(pthread_mutexattr_init)
 __gthrw(pthread_mutexattr_settype)
 __gthrw(pthread_mutexattr_destroy)
 
+__gthrw(pthread_rwlock_rdlock)
+__gthrw(pthread_rwlock_tryrdlock)
+__gthrw(pthread_rwlock_wrlock)
+__gthrw(pthread_rwlock_trywrlock)
+__gthrw(pthread_rwlock_unlock)
 
 #if defined(_LIBOBJC) || defined(_LIBOBJC_WEAK)
 /* Objective-C.  */
@@ -885,6 +892,51 @@ __gthread_cond_destroy (__gthread_cond_t* __cond)
   return __gthrw_(pthread_cond_destroy) (__cond);
 }
 
+static inline int
+__gthread_rwlock_rdlock (__gthread_rwlock_t *__rwlock)
+{
+  if (__gthread_active_p ())
+    return __gthrw_(pthread_rwlock_rdlock) (__rwlock);
+  else
+    return 0;
+}
+
+static inline int
+__gthread_rwlock_tryrdlock (__gthread_rwlock_t *__rwlock)
+{
+  if (__gthread_active_p ())
+    return __gthrw_(pthread_rwlock_tryrdlock) (__rwlock);
+  else
+    return 0;
+}
+
+static inline int
+__gthread_rwlock_wrlock (__gthread_rwlock_t *__rwlock)
+{
+  if (__gthread_active_p ())
+    return __gthrw_(pthread_rwlock_wrlock) (__rwlock);
+  else
+    return 0;
+}
+
+static inline int
+__gthread_rwlock_trywrlock (__gthread_rwlock_t *__rwlock)
+{
+  if (__gthread_active_p ())
+    return __gthrw_(pthread_rwlock_trywrlock) (__rwlock);
+  else
+    return 0;
+}
+
+static inline int
+__gthread_rwlock_unlock (__gthread_rwlock_t *__rwlock)
+{
+  if (__gthread_active_p ())
+    return __gthrw_(pthread_rwlock_unlock) (__rwlock);
+  else
+    return 0;
+}
+
 #endif /* _LIBOBJC */
 
 #endif /* ! GCC_GTHR_POSIX_H */
diff --git a/libgfortran/io/async.c b/libgfortran/io/async.c
index 912b39ea302..f0bde979da4 100644
--- a/libgfortran/io/async.c
+++ b/libgfortran/io/async.c
@@ -42,6 +42,10 @@ DEBUG_LINE (__thread const char *aio_prefix = MPREFIX);
 
 DEBUG_LINE (__gthread_mutex_t debug_queue_lock = __

[PATCH] libgfortran: Replace mutex with rwlock

2022-12-20 Thread Lipeng Zhu via Gcc-patches
This patch introduces an rwlock and splits the read and write accesses
to the unit_root tree and the unit_cache, using the rwlock instead of
the mutex to increase CPU efficiency. In the get_gfc_unit function,
only around 30% of calls need to step into the insert_unit function;
in most instances the unit can be found while merely reading the
unit_cache or the unit_root tree, so splitting the read and write
phases with an rwlock lets those lookups proceed in parallel.

As a data point, the IPC metric improves by roughly 9x on our test
server with 220 cores. The benchmark we used is
https://github.com/rwesson/NEAT

libgcc/ChangeLog:

* gthr-posix.h (__GTHREAD_RWLOCK_INIT): New macro
(__gthrw): New function
(__gthread_rwlock_rdlock): New function
(__gthread_rwlock_tryrdlock): New function
(__gthread_rwlock_wrlock): New function
(__gthread_rwlock_trywrlock): New function
(__gthread_rwlock_unlock): New function

libgfortran/ChangeLog:

* io/async.c (DEBUG_LINE): New
* io/async.h (RWLOCK_DEBUG_ADD): New macro
(CHECK_RDLOCK): New macro
(CHECK_WRLOCK): New macro
(TAIL_RWLOCK_DEBUG_QUEUE): New macro
(IN_RWLOCK_DEBUG_QUEUE): New macro
(RDLOCK): New macro
(WRLOCK): New macro
(RWUNLOCK): New macro
(RD_TO_WRLOCK): New macro
(INTERN_RDLOCK): New macro
(INTERN_WRLOCK): New macro
(INTERN_RWUNLOCK): New macro
* io/io.h (internal_proto): Define unit_rwlock
* io/transfer.c (st_read_done_worker): Replace unit_lock with unit_rwlock
(st_write_done_worker): Replace unit_lock with unit_rwlock
* io/unit.c (get_gfc_unit): Replace unit_lock with unit_rwlock
(if): Replace unit_lock with unit_rwlock
(close_unit_1): Replace unit_lock with unit_rwlock
(close_units): Replace unit_lock with unit_rwlock
(newunit_alloc): Replace unit_lock with unit_rwlock
* io/unix.c (flush_all_units): Replace unit_lock with unit_rwlock
---
 libgcc/gthr-posix.h   |  52 +
 libgfortran/io/async.c|   4 +
 libgfortran/io/async.h| 151 ++
 libgfortran/io/io.h   |  15 ++--
 libgfortran/io/transfer.c |   8 +-
 libgfortran/io/unit.c |  65 
 libgfortran/io/unix.c |  16 ++--
 7 files changed, 265 insertions(+), 46 deletions(-)

diff --git a/libgcc/gthr-posix.h b/libgcc/gthr-posix.h
index f1a5ab8e075..358948e8ae8 100644
--- a/libgcc/gthr-posix.h
+++ b/libgcc/gthr-posix.h
@@ -48,6 +48,7 @@ typedef pthread_t __gthread_t;
 typedef pthread_key_t __gthread_key_t;
 typedef pthread_once_t __gthread_once_t;
 typedef pthread_mutex_t __gthread_mutex_t;
+typedef pthread_rwlock_t __gthread_rwlock_t;
 typedef pthread_mutex_t __gthread_recursive_mutex_t;
 typedef pthread_cond_t __gthread_cond_t;
 typedef struct timespec __gthread_time_t;
@@ -58,6 +59,7 @@ typedef struct timespec __gthread_time_t;
 
 #define __GTHREAD_MUTEX_INIT PTHREAD_MUTEX_INITIALIZER
 #define __GTHREAD_MUTEX_INIT_FUNCTION __gthread_mutex_init_function
+#define __GTHREAD_RWLOCK_INIT PTHREAD_RWLOCK_INITIALIZER
 #define __GTHREAD_ONCE_INIT PTHREAD_ONCE_INIT
 #if defined(PTHREAD_RECURSIVE_MUTEX_INITIALIZER)
 #define __GTHREAD_RECURSIVE_MUTEX_INIT PTHREAD_RECURSIVE_MUTEX_INITIALIZER
@@ -135,6 +137,11 @@ __gthrw(pthread_mutexattr_init)
 __gthrw(pthread_mutexattr_settype)
 __gthrw(pthread_mutexattr_destroy)
 
+__gthrw(pthread_rwlock_rdlock)
+__gthrw(pthread_rwlock_tryrdlock)
+__gthrw(pthread_rwlock_wrlock)
+__gthrw(pthread_rwlock_trywrlock)
+__gthrw(pthread_rwlock_unlock)
 
 #if defined(_LIBOBJC) || defined(_LIBOBJC_WEAK)
 /* Objective-C.  */
@@ -885,6 +892,51 @@ __gthread_cond_destroy (__gthread_cond_t* __cond)
   return __gthrw_(pthread_cond_destroy) (__cond);
 }
 
+static inline int
+__gthread_rwlock_rdlock (__gthread_rwlock_t *__rwlock)
+{
+  if (__gthread_active_p ())
+    return __gthrw_(pthread_rwlock_rdlock) (__rwlock);
+  else
+    return 0;
+}
+
+static inline int
+__gthread_rwlock_tryrdlock (__gthread_rwlock_t *__rwlock)
+{
+  if (__gthread_active_p ())
+    return __gthrw_(pthread_rwlock_tryrdlock) (__rwlock);
+  else
+    return 0;
+}
+
+static inline int
+__gthread_rwlock_wrlock (__gthread_rwlock_t *__rwlock)
+{
+  if (__gthread_active_p ())
+    return __gthrw_(pthread_rwlock_wrlock) (__rwlock);
+  else
+    return 0;
+}
+
+static inline int
+__gthread_rwlock_trywrlock (__gthread_rwlock_t *__rwlock)
+{
+  if (__gthread_active_p ())
+    return __gthrw_(pthread_rwlock_trywrlock) (__rwlock);
+  else
+    return 0;
+}
+
+static inline int
+__gthread_rwlock_unlock (__gthread_rwlock_t *__rwlock)
+{
+  if (__gthread_active_p ())
+    return __gthrw_(pthread_rwlock_unlock) (__rwlock);
+  else
+    return 0;
+}
+
 #endif /* _LIBOBJC */
 
 #endif /* ! GCC_GTHR_POSIX_H */
diff --git a/libgfortran/io/async.c b/libgfortran/io/async.c
index 912b39ea302..f0bde979da4 100644
--- a/libgfortran/io/async.c
+++ b/libgfortran/io/async.c
@@ -42,6 +42,10 @@ DEBUG_LINE (__thread const char *aio_prefix = MPREFIX);
 
 DEBUG_LINE (__gthread_mutex_t debug_queue_lock = __GTHREAD_MUTEX_IN

[PATCH v2] libgfortran: Replace mutex with rwlock

2022-12-28 Thread Lipeng Zhu via Gcc-patches
This patch introduces an rwlock and splits the read and write accesses
to the unit_root tree and the unit_cache, using the rwlock instead of
the mutex to increase CPU efficiency. In the get_gfc_unit function,
only around 30% of calls need to step into the insert_unit function;
in most instances the unit can be found while merely reading the
unit_cache or the unit_root tree, so splitting the read and write
phases with an rwlock lets those lookups proceed in parallel.

As a data point, the IPC metric improves by roughly 9x on our test
server with 220 cores. The benchmark we used is
https://github.com/rwesson/NEAT
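
For reference, a hedged usage sketch of the __gthread_rwlock_* wrappers
this patch adds. It assumes the POSIX gthread model, a C translation
unit (the v2 change below keeps the rwlock API out of C++ builds), and
that gthr.h is reachable on the include path; table_rwlock and the
table array are invented for the example.

#include "gthr.h"

static __gthread_rwlock_t table_rwlock = __GTHREAD_RWLOCK_INIT;
static int table[64];

int
read_entry (int i)
{
  int v;
  __gthread_rwlock_rdlock (&table_rwlock);    /* shared: readers overlap */
  v = table[i];
  __gthread_rwlock_unlock (&table_rwlock);
  return v;
}

void
write_entry (int i, int v)
{
  __gthread_rwlock_wrlock (&table_rwlock);    /* exclusive: one writer */
  table[i] = v;
  __gthread_rwlock_unlock (&table_rwlock);
}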

libgcc/ChangeLog:

* gthr-posix.h (__GTHREAD_RWLOCK_INIT): New macro
(__gthrw): New function
(__gthread_rwlock_rdlock): New function
(__gthread_rwlock_tryrdlock): New function
(__gthread_rwlock_wrlock): New function
(__gthread_rwlock_trywrlock): New function
(__gthread_rwlock_unlock): New function

libgfortran/ChangeLog:

* io/async.c (DEBUG_LINE): New
* io/async.h (RWLOCK_DEBUG_ADD): New macro
(CHECK_RDLOCK): New macro
(CHECK_WRLOCK): New macro
(TAIL_RWLOCK_DEBUG_QUEUE): New macro
(IN_RWLOCK_DEBUG_QUEUE): New macro
(RDLOCK): New macro
(WRLOCK): New macro
(RWUNLOCK): New macro
(RD_TO_WRLOCK): New macro
(INTERN_RDLOCK): New macro
(INTERN_WRLOCK): New macro
(INTERN_RWUNLOCK): New macro
* io/io.h (internal_proto): Define unit_rwlock
* io/transfer.c (st_read_done_worker): Replace unit_lock with unit_rwlock
(st_write_done_worker): Replace unit_lock with unit_rwlock
* io/unit.c (get_gfc_unit): Replace unit_lock with unit_rwlock
(if): Replace unit_lock with unit_rwlock
(close_unit_1): Replace unit_lock with unit_rwlock
(close_units): Replace unit_lock with unit_rwlock
(newunit_alloc): Replace unit_lock with unit_rwlock
* io/unix.c (flush_all_units): Replace unit_lock with unit_rwlock

---
v1 -> v2:
Limit the pthread_rwlock usage in libgcc only when __cplusplus isn't defined.

Signed-off-by: Lipeng Zhu 
---
 libgcc/gthr-posix.h   |  60 +++
 libgfortran/io/async.c|   4 +
 libgfortran/io/async.h| 151 ++
 libgfortran/io/io.h   |  15 ++--
 libgfortran/io/transfer.c |   8 +-
 libgfortran/io/unit.c |  65 
 libgfortran/io/unix.c |  16 ++--
 7 files changed, 273 insertions(+), 46 deletions(-)

diff --git a/libgcc/gthr-posix.h b/libgcc/gthr-posix.h
index f1a5ab8e075..cbe7cba20d0 100644
--- a/libgcc/gthr-posix.h
+++ b/libgcc/gthr-posix.h
@@ -48,6 +48,9 @@ typedef pthread_t __gthread_t;
 typedef pthread_key_t __gthread_key_t;
 typedef pthread_once_t __gthread_once_t;
 typedef pthread_mutex_t __gthread_mutex_t;
+#ifndef __cplusplus
+typedef pthread_rwlock_t __gthread_rwlock_t;
+#endif
 typedef pthread_mutex_t __gthread_recursive_mutex_t;
 typedef pthread_cond_t __gthread_cond_t;
 typedef struct timespec __gthread_time_t;
@@ -58,6 +61,9 @@ typedef struct timespec __gthread_time_t;
 
 #define __GTHREAD_MUTEX_INIT PTHREAD_MUTEX_INITIALIZER
 #define __GTHREAD_MUTEX_INIT_FUNCTION __gthread_mutex_init_function
+#ifndef __cplusplus
+#define __GTHREAD_RWLOCK_INIT PTHREAD_RWLOCK_INITIALIZER
+#endif
 #define __GTHREAD_ONCE_INIT PTHREAD_ONCE_INIT
 #if defined(PTHREAD_RECURSIVE_MUTEX_INITIALIZER)
 #define __GTHREAD_RECURSIVE_MUTEX_INIT PTHREAD_RECURSIVE_MUTEX_INITIALIZER
@@ -135,6 +141,13 @@ __gthrw(pthread_mutexattr_init)
 __gthrw(pthread_mutexattr_settype)
 __gthrw(pthread_mutexattr_destroy)
 
+#ifndef __cplusplus
+__gthrw(pthread_rwlock_rdlock)
+__gthrw(pthread_rwlock_tryrdlock)
+__gthrw(pthread_rwlock_wrlock)
+__gthrw(pthread_rwlock_trywrlock)
+__gthrw(pthread_rwlock_unlock)
+#endif
 
 #if defined(_LIBOBJC) || defined(_LIBOBJC_WEAK)
 /* Objective-C.  */
@@ -885,6 +898,53 @@ __gthread_cond_destroy (__gthread_cond_t* __cond)
   return __gthrw_(pthread_cond_destroy) (__cond);
 }
 
+#ifndef __cplusplus
+static inline int
+__gthread_rwlock_rdlock (__gthread_rwlock_t *__rwlock)
+{
+  if (__gthread_active_p ())
+    return __gthrw_(pthread_rwlock_rdlock) (__rwlock);
+  else
+    return 0;
+}
+
+static inline int
+__gthread_rwlock_tryrdlock (__gthread_rwlock_t *__rwlock)
+{
+  if (__gthread_active_p ())
+    return __gthrw_(pthread_rwlock_tryrdlock) (__rwlock);
+  else
+    return 0;
+}
+
+static inline int
+__gthread_rwlock_wrlock (__gthread_rwlock_t *__rwlock)
+{
+  if (__gthread_active_p ())
+    return __gthrw_(pthread_rwlock_wrlock) (__rwlock);
+  else
+    return 0;
+}
+
+static inline int
+__gthread_rwlock_trywrlock (__gthread_rwlock_t *__rwlock)
+{
+  if (__gthread_active_p ())
+    return __gthrw_(pthread_rwlock_trywrlock) (__rwlock);
+  else
+    return 0;
+}
+
+static inline int
+__gthread_rwlock_unlock (__gthread_rwlock_t *__rwlock)
+{
+  if (__gthread_active_p ())
+    return __gthrw_(pthread_rwlock_unlock) (__rwlock);
+  else
+    return 0;
+}
+#endif
+
 #endif /* _LIBOBJC */
 
 #endif /* ! GCC_GTHR_POSIX_H */
diff --git a/libgfortran/io/async.c b/libgfortran/io/async.c
in

[PATCH v3] libgfortran: Replace mutex with rwlock

2023-04-19 Thread Lipeng Zhu via Gcc-patches
This patch introduces an rwlock and splits the read and write accesses
to the unit_root tree and the unit_cache, using the rwlock instead of
the mutex to increase CPU efficiency. In the get_gfc_unit function,
only around 30% of calls need to step into the insert_unit function;
in most instances the unit can be found while merely reading the
unit_cache or the unit_root tree, so splitting the read and write
phases with an rwlock lets those lookups proceed in parallel.

As a data point, the IPC metric improves by roughly 9x on our test
server with 220 cores. The benchmark we used is
https://github.com/rwesson/NEAT
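
The patch also wraps the try variants. A hypothetical sketch of how
they might be used (assuming gthr.h is on the include path; the
function and its arguments are invented for the example): attempt a
shared lock without blocking and fall back to the blocking call when
the lock is held in a conflicting mode (a non-zero return, typically
EBUSY).

static int
read_counter_nonblocking (__gthread_rwlock_t *rwlock, const int *counter)
{
  int v;
  if (__gthread_rwlock_tryrdlock (rwlock) != 0)
    __gthread_rwlock_rdlock (rwlock);   /* contended: block instead */
  v = *counter;
  __gthread_rwlock_unlock (rwlock);
  return v;
}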

libgcc/ChangeLog:

* gthr-posix.h (__GTHREAD_RWLOCK_INIT): New macro
(__gthrw): New function
(__gthread_rwlock_rdlock): New function
(__gthread_rwlock_tryrdlock): New function
(__gthread_rwlock_wrlock): New function
(__gthread_rwlock_trywrlock): New function
(__gthread_rwlock_unlock): New function

libgfortran/ChangeLog:

* io/async.c (DEBUG_LINE): New
* io/async.h (RWLOCK_DEBUG_ADD): New macro
(CHECK_RDLOCK): New macro
(CHECK_WRLOCK): New macro
(TAIL_RWLOCK_DEBUG_QUEUE): New macro
(IN_RWLOCK_DEBUG_QUEUE): New macro
(RDLOCK): New macro
(WRLOCK): New macro
(RWUNLOCK): New macro
(RD_TO_WRLOCK): New macro
(INTERN_RDLOCK): New macro
(INTERN_WRLOCK): New macro
(INTERN_RWUNLOCK): New macro
* io/io.h (internal_proto): Define unit_rwlock
* io/transfer.c (st_read_done_worker): Replace unit_lock with unit_rwlock
(st_write_done_worker): Replace unit_lock with unit_rwlock
* io/unit.c (get_gfc_unit): Replace unit_lock with unit_rwlock
(if): Replace unit_lock with unit_rwlock
(close_unit_1): Replace unit_lock with unit_rwlock
(close_units): Replace unit_lock with unit_rwlock
(newunit_alloc): Replace unit_lock with unit_rwlock
* io/unix.c (flush_all_units): Replace unit_lock with unit_rwlock

---
v1 -> v2:
Limit the pthread_rwlock usage in libgcc only when __cplusplus isn't defined.

v2 -> v3:
Rebase the patch with trunk branch.

Signed-off-by: Lipeng Zhu 
---
 libgcc/gthr-posix.h   |  60 +++
 libgfortran/io/async.c|   4 +
 libgfortran/io/async.h| 151 ++
 libgfortran/io/io.h   |  15 ++--
 libgfortran/io/transfer.c |   8 +-
 libgfortran/io/unit.c |  65 
 libgfortran/io/unix.c |  16 ++--
 7 files changed, 273 insertions(+), 46 deletions(-)

diff --git a/libgcc/gthr-posix.h b/libgcc/gthr-posix.h
index aebcfdd9f4c..73283082997 100644
--- a/libgcc/gthr-posix.h
+++ b/libgcc/gthr-posix.h
@@ -48,6 +48,9 @@ typedef pthread_t __gthread_t;
 typedef pthread_key_t __gthread_key_t;
 typedef pthread_once_t __gthread_once_t;
 typedef pthread_mutex_t __gthread_mutex_t;
+#ifndef __cplusplus
+typedef pthread_rwlock_t __gthread_rwlock_t;
+#endif
 typedef pthread_mutex_t __gthread_recursive_mutex_t;
 typedef pthread_cond_t __gthread_cond_t;
 typedef struct timespec __gthread_time_t;
@@ -58,6 +61,9 @@ typedef struct timespec __gthread_time_t;
 
 #define __GTHREAD_MUTEX_INIT PTHREAD_MUTEX_INITIALIZER
 #define __GTHREAD_MUTEX_INIT_FUNCTION __gthread_mutex_init_function
+#ifndef __cplusplus
+#define __GTHREAD_RWLOCK_INIT PTHREAD_RWLOCK_INITIALIZER
+#endif
 #define __GTHREAD_ONCE_INIT PTHREAD_ONCE_INIT
 #if defined(PTHREAD_RECURSIVE_MUTEX_INITIALIZER)
 #define __GTHREAD_RECURSIVE_MUTEX_INIT PTHREAD_RECURSIVE_MUTEX_INITIALIZER
@@ -135,6 +141,13 @@ __gthrw(pthread_mutexattr_init)
 __gthrw(pthread_mutexattr_settype)
 __gthrw(pthread_mutexattr_destroy)
 
+#ifndef __cplusplus
+__gthrw(pthread_rwlock_rdlock)
+__gthrw(pthread_rwlock_tryrdlock)
+__gthrw(pthread_rwlock_wrlock)
+__gthrw(pthread_rwlock_trywrlock)
+__gthrw(pthread_rwlock_unlock)
+#endif
 
 #if defined(_LIBOBJC) || defined(_LIBOBJC_WEAK)
 /* Objective-C.  */
@@ -885,6 +898,53 @@ __gthread_cond_destroy (__gthread_cond_t* __cond)
   return __gthrw_(pthread_cond_destroy) (__cond);
 }
 
+#ifndef __cplusplus
+static inline int
+__gthread_rwlock_rdlock (__gthread_rwlock_t *__rwlock)
+{
+  if (__gthread_active_p ())
+    return __gthrw_(pthread_rwlock_rdlock) (__rwlock);
+  else
+    return 0;
+}
+
+static inline int
+__gthread_rwlock_tryrdlock (__gthread_rwlock_t *__rwlock)
+{
+  if (__gthread_active_p ())
+    return __gthrw_(pthread_rwlock_tryrdlock) (__rwlock);
+  else
+    return 0;
+}
+
+static inline int
+__gthread_rwlock_wrlock (__gthread_rwlock_t *__rwlock)
+{
+  if (__gthread_active_p ())
+    return __gthrw_(pthread_rwlock_wrlock) (__rwlock);
+  else
+    return 0;
+}
+
+static inline int
+__gthread_rwlock_trywrlock (__gthread_rwlock_t *__rwlock)
+{
+  if (__gthread_active_p ())
+    return __gthrw_(pthread_rwlock_trywrlock) (__rwlock);
+  else
+    return 0;
+}
+
+static inline int
+__gthread_rwlock_unlock (__gthread_rwlock_t *__rwlock)
+{
+  if (__gthread_active_p ())
+    return __gthrw_(pthread_rwlock_unlock) (__rwlock);
+  else
+    return 0;
+}
+#endif
+
 #endif /* _LIBOBJC */
 
 #endif /* ! GCC_GTHR_POSIX_H */
diff --git a/lib

[PATCH v4] libgfortran: Replace mutex with rwlock

2023-05-08 Thread Lipeng Zhu via Gcc-patches
This patch introduces an rwlock and splits the read and write accesses
to the unit_root tree and the unit_cache, using the rwlock instead of
the mutex to increase CPU efficiency. In the get_gfc_unit function,
only around 30% of calls need to step into the insert_unit function;
in most instances the unit can be found while merely reading the
unit_cache or the unit_root tree, so splitting the read and write
phases with an rwlock lets those lookups proceed in parallel.

As a data point, the IPC metric improves by roughly 9x on our test
server with 220 cores. The benchmark we used is
https://github.com/rwesson/NEAT
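
The io.h/async.h hunks are not quoted in this excerpt, so the following
is only a plausible sketch of the portability shim such a change needs:
if the target gthread model does not provide __GTHREAD_RWLOCK_INIT, the
new RDLOCK/WRLOCK/RWUNLOCK macros named in the ChangeLog could simply
fall back to the existing mutex operations, for example:

/* Hypothetical fallback, not the quoted patch text: use the rwlock
   wrappers when the gthread model provides them, otherwise degrade to
   the pre-existing mutex so single-lock targets keep working.  */
#ifdef __GTHREAD_RWLOCK_INIT
#define RDLOCK(rwlock)   __gthread_rwlock_rdlock (rwlock)
#define WRLOCK(rwlock)   __gthread_rwlock_wrlock (rwlock)
#define RWUNLOCK(rwlock) __gthread_rwlock_unlock (rwlock)
#else
#define RDLOCK(mutex)    __gthread_mutex_lock (mutex)
#define WRLOCK(mutex)    __gthread_mutex_lock (mutex)
#define RWUNLOCK(mutex)  __gthread_mutex_unlock (mutex)
#endif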

libgcc/ChangeLog:

* gthr-posix.h (__GTHREAD_RWLOCK_INIT): New macro
(__gthrw): New function
(__gthread_rwlock_rdlock): New function
(__gthread_rwlock_tryrdlock): New function
(__gthread_rwlock_wrlock): New function
(__gthread_rwlock_trywrlock): New function
(__gthread_rwlock_unlock): New function

libgfortran/ChangeLog:

* io/async.c (DEBUG_LINE): New
* io/async.h (RWLOCK_DEBUG_ADD): New macro
(CHECK_RDLOCK): New macro
(CHECK_WRLOCK): New macro
(TAIL_RWLOCK_DEBUG_QUEUE): New macro
(IN_RWLOCK_DEBUG_QUEUE): New macro
(RDLOCK): New macro
(WRLOCK): New macro
(RWUNLOCK): New macro
(RD_TO_WRLOCK): New macro
(INTERN_RDLOCK): New macro
(INTERN_WRLOCK): New macro
(INTERN_RWUNLOCK): New macro
* io/io.h (internal_proto): Define unit_rwlock
* io/transfer.c (st_read_done_worker): Replace unit_lock with unit_rwlock
(st_write_done_worker): Replace unit_lock with unit_rwlock
* io/unit.c (get_gfc_unit): Replace unit_lock with unit_rwlock
(if): Replace unit_lock with unit_rwlock
(close_unit_1): Replace unit_lock with unit_rwlock
(close_units): Replace unit_lock with unit_rwlock
(newunit_alloc): Replace unit_lock with unit_rwlock
* io/unix.c (flush_all_units): Replace unit_lock with unit_rwlock

---
v1 -> v2:
Limit the pthread_rwlock usage in libgcc only when __cplusplus isn't defined.

v2 -> v3:
Rebase the patch with trunk branch.

v3 -> v4:
Update the comments.

Reviewed-by: Hongjiu Lu 
Signed-off-by: Lipeng Zhu 
---
 libgcc/gthr-posix.h   |  60 +++
 libgfortran/io/async.c|   4 +
 libgfortran/io/async.h| 151 ++
 libgfortran/io/io.h   |  15 ++--
 libgfortran/io/transfer.c |   8 +-
 libgfortran/io/unit.c |  75 +++
 libgfortran/io/unix.c |  16 ++--
 7 files changed, 283 insertions(+), 46 deletions(-)

diff --git a/libgcc/gthr-posix.h b/libgcc/gthr-posix.h
index aebcfdd9f4c..73283082997 100644
--- a/libgcc/gthr-posix.h
+++ b/libgcc/gthr-posix.h
@@ -48,6 +48,9 @@ typedef pthread_t __gthread_t;
 typedef pthread_key_t __gthread_key_t;
 typedef pthread_once_t __gthread_once_t;
 typedef pthread_mutex_t __gthread_mutex_t;
+#ifndef __cplusplus
+typedef pthread_rwlock_t __gthread_rwlock_t;
+#endif
 typedef pthread_mutex_t __gthread_recursive_mutex_t;
 typedef pthread_cond_t __gthread_cond_t;
 typedef struct timespec __gthread_time_t;
@@ -58,6 +61,9 @@ typedef struct timespec __gthread_time_t;
 
 #define __GTHREAD_MUTEX_INIT PTHREAD_MUTEX_INITIALIZER
 #define __GTHREAD_MUTEX_INIT_FUNCTION __gthread_mutex_init_function
+#ifndef __cplusplus
+#define __GTHREAD_RWLOCK_INIT PTHREAD_RWLOCK_INITIALIZER
+#endif
 #define __GTHREAD_ONCE_INIT PTHREAD_ONCE_INIT
 #if defined(PTHREAD_RECURSIVE_MUTEX_INITIALIZER)
 #define __GTHREAD_RECURSIVE_MUTEX_INIT PTHREAD_RECURSIVE_MUTEX_INITIALIZER
@@ -135,6 +141,13 @@ __gthrw(pthread_mutexattr_init)
 __gthrw(pthread_mutexattr_settype)
 __gthrw(pthread_mutexattr_destroy)
 
+#ifndef __cplusplus
+__gthrw(pthread_rwlock_rdlock)
+__gthrw(pthread_rwlock_tryrdlock)
+__gthrw(pthread_rwlock_wrlock)
+__gthrw(pthread_rwlock_trywrlock)
+__gthrw(pthread_rwlock_unlock)
+#endif
 
 #if defined(_LIBOBJC) || defined(_LIBOBJC_WEAK)
 /* Objective-C.  */
@@ -885,6 +898,53 @@ __gthread_cond_destroy (__gthread_cond_t* __cond)
   return __gthrw_(pthread_cond_destroy) (__cond);
 }
 
+#ifndef __cplusplus
+static inline int
+__gthread_rwlock_rdlock (__gthread_rwlock_t *__rwlock)
+{
+  if (__gthread_active_p ())
+    return __gthrw_(pthread_rwlock_rdlock) (__rwlock);
+  else
+    return 0;
+}
+
+static inline int
+__gthread_rwlock_tryrdlock (__gthread_rwlock_t *__rwlock)
+{
+  if (__gthread_active_p ())
+    return __gthrw_(pthread_rwlock_tryrdlock) (__rwlock);
+  else
+    return 0;
+}
+
+static inline int
+__gthread_rwlock_wrlock (__gthread_rwlock_t *__rwlock)
+{
+  if (__gthread_active_p ())
+    return __gthrw_(pthread_rwlock_wrlock) (__rwlock);
+  else
+    return 0;
+}
+
+static inline int
+__gthread_rwlock_trywrlock (__gthread_rwlock_t *__rwlock)
+{
+  if (__gthread_active_p ())
+    return __gthrw_(pthread_rwlock_trywrlock) (__rwlock);
+  else
+    return 0;
+}
+
+static inline int
+__gthread_rwlock_unlock (__gthread_rwlock_t *__rwlock)
+{
+  if (__gthread_active_p ())
+    return __gthrw_(pthread_rwlock_unlock) (__rwlock);
+  else
+    return 0;
+}
+#endif
+
 #endif /* _LI

[PATCH v5] libgfortran: Replace mutex with rwlock

2023-05-08 Thread Lipeng Zhu via Gcc-patches
This patch introduces an rwlock and splits the read and write accesses
to the unit_root tree and the unit_cache, using the rwlock instead of
the mutex to increase CPU efficiency. In the get_gfc_unit function,
only around 30% of calls need to step into the insert_unit function;
in most instances the unit can be found while merely reading the
unit_cache or the unit_root tree, so splitting the read and write
phases with an rwlock lets those lookups proceed in parallel.

As a data point, the IPC metric improves by roughly 9x on our test
server with 220 cores. The benchmark we used is
https://github.com/rwesson/NEAT
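
The RD_TO_WRLOCK macro named in the ChangeLog is not visible in the
quoted hunks. Assuming it simply releases the read lock and then takes
the write lock, a minimal form built from the RWUNLOCK/WRLOCK macros
this patch introduces would be:

/* Illustration only: assumed shape of RD_TO_WRLOCK.  The lock is dropped
   between the two steps, so callers must re-check shared state after the
   upgrade.  */
#define RD_TO_WRLOCK(rwlock) \
  do                         \
    {                        \
      RWUNLOCK (rwlock);     \
      WRLOCK (rwlock);       \
    }                        \
  while (0)

A caller such as get_gfc_unit would therefore search the unit_cache or
unit_root tree again after RD_TO_WRLOCK and only then insert, since
another thread may have created the unit while the lock was released.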

libgcc/ChangeLog:

* gthr-posix.h (__GTHREAD_RWLOCK_INIT): New macro
(__gthrw): New function
(__gthread_rwlock_rdlock): New function
(__gthread_rwlock_tryrdlock): New function
(__gthread_rwlock_wrlock): New function
(__gthread_rwlock_trywrlock): New function
(__gthread_rwlock_unlock): New function

libgfortran/ChangeLog:

* io/async.c (DEBUG_LINE): New
* io/async.h (RWLOCK_DEBUG_ADD): New macro
(CHECK_RDLOCK): New macro
(CHECK_WRLOCK): New macro
(TAIL_RWLOCK_DEBUG_QUEUE): New macro
(IN_RWLOCK_DEBUG_QUEUE): New macro
(RDLOCK): New macro
(WRLOCK): New macro
(RWUNLOCK): New macro
(RD_TO_WRLOCK): New macro
(INTERN_RDLOCK): New macro
(INTERN_WRLOCK): New macro
(INTERN_RWUNLOCK): New macro
* io/io.h (internal_proto): Define unit_rwlock
* io/transfer.c (st_read_done_worker): Replace unit_lock with unit_rwlock
(st_write_done_worker): Replace unit_lock with unit_rwlock
* io/unit.c (get_gfc_unit): Replace unit_lock with unit_rwlock
(if): Replace unit_lock with unit_rwlock
(close_unit_1): Replace unit_lock with unit_rwlock
(close_units): Replace unit_lock with unit_rwlock
(newunit_alloc): Replace unit_lock with unit_rwlock
* io/unix.c (flush_all_units): Replace unit_lock with unit_rwlock

---
v1 -> v2:
Limit the pthread_rwlock usage in libgcc only when __cplusplus isn't defined.

v2 -> v3:
Rebase the patch with trunk branch.

v3 -> v4:
Update the comments.

v4 -> v5:
Fix typos and code formatter.

Reviewed-by: Hongjiu Lu 
Signed-off-by: Lipeng Zhu 
---
 libgcc/gthr-posix.h   |  60 +++
 libgfortran/io/async.c|   4 +
 libgfortran/io/async.h| 154 +-
 libgfortran/io/io.h   |  15 ++--
 libgfortran/io/transfer.c |   8 +-
 libgfortran/io/unit.c |  71 +++---
 libgfortran/io/unix.c |  16 ++--
 7 files changed, 281 insertions(+), 47 deletions(-)

diff --git a/libgcc/gthr-posix.h b/libgcc/gthr-posix.h
index aebcfdd9f4c..73283082997 100644
--- a/libgcc/gthr-posix.h
+++ b/libgcc/gthr-posix.h
@@ -48,6 +48,9 @@ typedef pthread_t __gthread_t;
 typedef pthread_key_t __gthread_key_t;
 typedef pthread_once_t __gthread_once_t;
 typedef pthread_mutex_t __gthread_mutex_t;
+#ifndef __cplusplus
+typedef pthread_rwlock_t __gthread_rwlock_t;
+#endif
 typedef pthread_mutex_t __gthread_recursive_mutex_t;
 typedef pthread_cond_t __gthread_cond_t;
 typedef struct timespec __gthread_time_t;
@@ -58,6 +61,9 @@ typedef struct timespec __gthread_time_t;
 
 #define __GTHREAD_MUTEX_INIT PTHREAD_MUTEX_INITIALIZER
 #define __GTHREAD_MUTEX_INIT_FUNCTION __gthread_mutex_init_function
+#ifndef __cplusplus
+#define __GTHREAD_RWLOCK_INIT PTHREAD_RWLOCK_INITIALIZER
+#endif
 #define __GTHREAD_ONCE_INIT PTHREAD_ONCE_INIT
 #if defined(PTHREAD_RECURSIVE_MUTEX_INITIALIZER)
 #define __GTHREAD_RECURSIVE_MUTEX_INIT PTHREAD_RECURSIVE_MUTEX_INITIALIZER
@@ -135,6 +141,13 @@ __gthrw(pthread_mutexattr_init)
 __gthrw(pthread_mutexattr_settype)
 __gthrw(pthread_mutexattr_destroy)
 
+#ifndef __cplusplus
+__gthrw(pthread_rwlock_rdlock)
+__gthrw(pthread_rwlock_tryrdlock)
+__gthrw(pthread_rwlock_wrlock)
+__gthrw(pthread_rwlock_trywrlock)
+__gthrw(pthread_rwlock_unlock)
+#endif
 
 #if defined(_LIBOBJC) || defined(_LIBOBJC_WEAK)
 /* Objective-C.  */
@@ -885,6 +898,53 @@ __gthread_cond_destroy (__gthread_cond_t* __cond)
   return __gthrw_(pthread_cond_destroy) (__cond);
 }
 
+#ifndef __cplusplus
+static inline int
+__gthread_rwlock_rdlock (__gthread_rwlock_t *__rwlock)
+{
+  if (__gthread_active_p ())
+    return __gthrw_(pthread_rwlock_rdlock) (__rwlock);
+  else
+    return 0;
+}
+
+static inline int
+__gthread_rwlock_tryrdlock (__gthread_rwlock_t *__rwlock)
+{
+  if (__gthread_active_p ())
+    return __gthrw_(pthread_rwlock_tryrdlock) (__rwlock);
+  else
+    return 0;
+}
+
+static inline int
+__gthread_rwlock_wrlock (__gthread_rwlock_t *__rwlock)
+{
+  if (__gthread_active_p ())
+    return __gthrw_(pthread_rwlock_wrlock) (__rwlock);
+  else
+    return 0;
+}
+
+static inline int
+__gthread_rwlock_trywrlock (__gthread_rwlock_t *__rwlock)
+{
+  if (__gthread_active_p ())
+    return __gthrw_(pthread_rwlock_trywrlock) (__rwlock);
+  else
+    return 0;
+}
+
+static inline int
+__gthread_rwlock_unlock (__gthread_rwlock_t *__rwlock)
+{
+  if (__gthread_active_p ())
+    return __gthrw_(pthread_rwlock_unlock) (__rwlock);
+  else
+