Discussion:
[PATCH, libgfortran] Remove recursion check
Janne Blomqvist
2018-09-24 08:56:58 UTC
Permalink
libgfortran has a recursion check in the error handling paths. This
works by checking the value of a static variable, and if it matches,
aborting immediately instead of continuing error processing.
Unfortunately, in a multi-threaded program, if two threads report an
error at the same time, this can trigger this check, and thus the
underlying reason for the error is never reported.

The downside is that without the recursion check, it might be possible
to get into some infinite recursion in the error handling. But by my
audit of the code, this shouldn't happen.

Regtested on x86_64-pc-linux-gnu, Ok for trunk?

libgfortran/ChangeLog:

2018-09-24 Janne Blomqvist <***@gcc.gnu.org>

* runtime/error.c (MAGIC): Remove.
(recursion_check): Remove function.
(os_error): Remove recursion_check call.
(runtime_error): Likewise.
(runtime_error_at): Likewise.
(internal_error): Likewise.
(generate_error_common): Likewise.
(notify_std): Likewise.
* runtime/minimal.c (MAGIC): Remove.
(recursion_check): Remove function.
(os_error): Remove recursion_check call.
(runtime_error): Likewise.
(runtime_error_at): Likewise.
(internal_error): Likewise.
---
libgfortran/runtime/error.c | 25 -------------------------
libgfortran/runtime/minimal.c | 22 ----------------------
2 files changed, 47 deletions(-)

diff --git a/libgfortran/runtime/error.c b/libgfortran/runtime/error.c
index b07a4c0b12a..b6cbd14ff6d 100644
--- a/libgfortran/runtime/error.c
+++ b/libgfortran/runtime/error.c
@@ -330,25 +330,6 @@ show_locus (st_parameter_common *cmp)
}


-/* recursion_check()-- It's possible for additional errors to occur
- * during fatal error processing. We detect this condition here and
- * exit with code 4 immediately. */
-
-#define MAGIC 0x20DE8101
-
-static void
-recursion_check (void)
-{
- static int magic = 0;
-
- /* Don't even try to print something at this point */
- if (magic == MAGIC)
- sys_abort ();
-
- magic = MAGIC;
-}
-
-
#define STRERR_MAXSZ 256

/* os_error()-- Operating system error. We get a message from the
@@ -360,7 +341,6 @@ os_error (const char *message)
{
char errmsg[STRERR_MAXSZ];
struct iovec iov[5];
- recursion_check ();
iov[0].iov_base = (char*) "Operating system error: ";
iov[0].iov_len = strlen (iov[0].iov_base);
iov[1].iov_base = gf_strerror (errno, errmsg, STRERR_MAXSZ);
@@ -388,7 +368,6 @@ runtime_error (const char *message, ...)
va_list ap;
int written;

- recursion_check ();
iov[0].iov_base = (char*) "Fortran runtime error: ";
iov[0].iov_len = strlen (iov[0].iov_base);
va_start (ap, message);
@@ -417,7 +396,6 @@ runtime_error_at (const char *where, const char *message, ...)
struct iovec iov[4];
int written;

- recursion_check ();
iov[0].iov_base = (char*) where;
iov[0].iov_len = strlen (where);
iov[1].iov_base = (char*) "\nFortran runtime error: ";
@@ -473,7 +451,6 @@ internal_error (st_parameter_common *cmp, const char *message)
{
struct iovec iov[3];

- recursion_check ();
show_locus (cmp);
iov[0].iov_base = (char*) "Internal Error: ";
iov[0].iov_len = strlen (iov[0].iov_base);
@@ -674,7 +651,6 @@ generate_error_common (st_parameter_common *cmp, int family, const char *message
/* Return code, caller is responsible for terminating
the program if necessary. */

- recursion_check ();
show_locus (cmp);
struct iovec iov[3];
iov[0].iov_base = (char*) "Fortran runtime error: ";
@@ -766,7 +742,6 @@ notify_std (st_parameter_common *cmp, int std, const char * message)

if (!warning)
{
- recursion_check ();
show_locus (cmp);
iov[0].iov_base = (char*) "Fortran runtime error: ";
iov[0].iov_len = strlen (iov[0].iov_base);
diff --git a/libgfortran/runtime/minimal.c b/libgfortran/runtime/minimal.c
index b6d26fd2228..44ebd99af07 100644
--- a/libgfortran/runtime/minimal.c
+++ b/libgfortran/runtime/minimal.c
@@ -43,24 +43,6 @@ options_t options;
static int argc_save;
static char **argv_save;

-/* recursion_check()-- It's possible for additional errors to occur
- * during fatal error processing. We detect this condition here and
- * exit with code 4 immediately. */
-
-#define MAGIC 0x20DE8101
-
-static void
-recursion_check (void)
-{
- static int magic = 0;
-
- /* Don't even try to print something at this point */
- if (magic == MAGIC)
- sys_abort ();
-
- magic = MAGIC;
-}
-

/* os_error()-- Operating system error. We get a message from the
* operating system, show it and leave. Some operating system errors
@@ -69,7 +51,6 @@ recursion_check (void)
void
os_error (const char *message)
{
- recursion_check ();
printf ("Operating system error: ");
printf ("%s\n", message);
exit (1);
@@ -85,7 +66,6 @@ runtime_error (const char *message, ...)
{
va_list ap;

- recursion_check ();
printf ("Fortran runtime error: ");
va_start (ap, message);
vprintf (message, ap);
@@ -103,7 +83,6 @@ runtime_error_at (const char *where, const char *message, ...)
{
va_list ap;

- recursion_check ();
printf ("%s", where);
printf ("\nFortran runtime error: ");
va_start (ap, message);
@@ -136,7 +115,6 @@ iexport(runtime_warning_at);
void
internal_error (st_parameter_common *cmp, const char *message)
{
- recursion_check ();
printf ("Internal Error: ");
printf ("%s", message);
printf ("\n");
--
2.17.1
Thomas Koenig
2018-09-24 16:48:12 UTC
Permalink
Hi Janne,
Post by Janne Blomqvist
libgfortran has a recursion check in the error handling paths. This
works by checking the value of a static variable, and if it matches,
aborting immediately instead of continuing error processing.
Unfortunately, in a multi-threaded program, if two threads report an
error at the same time, this can trigger this check, and thus the
underlying reason for the error is never reported.
I agree that this is a problem that should be addressed. Hm...

What do you think, would it be desirable / possible to keep the
recursive error check, but ensure thread safety by a suitable
locking mechanism? As a first step, we should probably specify
what exactly we would like to happen.

Let us assume that the aim is to keep the current behavior for normal
processes and allow concurrent error processing for multiple threads.

This could be done by making the static variable thread-local.
I'm not sure that this would work reliably, or if some sort of
locking mechanism (probably involving PTHREAD_MUTEX_RECURSIVE)
would be required - only have one thread at a time process
an error.

Do we actually want to keep the current behavior for non-threaded
programs? I'd be in favor, but I do not feel strongly about that.

Regards

Thomas
Janne Blomqvist
2018-09-24 19:18:49 UTC
Permalink
Post by Thomas Koenig
Hi Janne,
Post by Janne Blomqvist
libgfortran has a recursion check in the error handling paths. This
works by checking the value of a static variable, and if it matches,
aborting immediately instead of continuing error processing.
Unfortunately, in a multi-threaded program, if two threads report an
error at the same time, this can trigger this check, and thus the
underlying reason for the error is never reported.
I agree that this is a problem that should be addressed. Hm...
What do you think, would it be desirable / possible to keep the
recursive error check, but ensure thread safety by a suitable
locking mechanism? As a first step, we should probably specify
what exactly we would like to happen.
Let us assume that the aim is to keep the current behavior for normal
processes and allow concurrent error processing for multiple threads.
This could be done by making the static variable thread-local.
I'm not sure that this would work reliably, or if some sort of
locking mechanism (probably involving PTHREAD_MUTEX_RECURSIVE)
would be required - only have one thread at a time process
an error.
Do we actually want to keep the current behavior for non-threaded
programs? I'd be in favor, but I do not feel strongly about that.
What I briefly was thinking about doing, was to use TLS. Or rather, since
TLS is not supported on all targets, something like I did in
intrinsics/random.c:get_rand_state(). But, since the error handling stuff
should be async-signal-safe, the allocation in the setup path should be
done separately, e.g. as part of library initialization.

In the end I decided against it because

1) It's more complicated, in order to handle a quite unlikely edge case.

2) If recursion happens anyway, it's a bug in our error handling flow which
should be fixed and not be papered over.

Anyway, I'm not massively against this, if people have any particular
opinion lets hear it.
--
Janne Blomqvist
Janne Blomqvist
2018-10-06 17:00:04 UTC
Permalink
Post by Janne Blomqvist
Post by Thomas Koenig
Hi Janne,
Post by Janne Blomqvist
libgfortran has a recursion check in the error handling paths. This
works by checking the value of a static variable, and if it matches,
aborting immediately instead of continuing error processing.
Unfortunately, in a multi-threaded program, if two threads report an
error at the same time, this can trigger this check, and thus the
underlying reason for the error is never reported.
I agree that this is a problem that should be addressed. Hm...
What do you think, would it be desirable / possible to keep the
recursive error check, but ensure thread safety by a suitable
locking mechanism? As a first step, we should probably specify
what exactly we would like to happen.
Let us assume that the aim is to keep the current behavior for normal
processes and allow concurrent error processing for multiple threads.
This could be done by making the static variable thread-local.
I'm not sure that this would work reliably, or if some sort of
locking mechanism (probably involving PTHREAD_MUTEX_RECURSIVE)
would be required - only have one thread at a time process
an error.
Do we actually want to keep the current behavior for non-threaded
programs? I'd be in favor, but I do not feel strongly about that.
What I briefly was thinking about doing, was to use TLS. Or rather, since
TLS is not supported on all targets, something like I did in
intrinsics/random.c:get_rand_state(). But, since the error handling stuff
should be async-signal-safe, the allocation in the setup path should be
done separately, e.g. as part of library initialization.
In the end I decided against it because
1) It's more complicated, in order to handle a quite unlikely edge case.
2) If recursion happens anyway, it's a bug in our error handling flow
which should be fixed and not be papered over.
Anyway, I'm not massively against this, if people have any particular
opinion lets hear it.
PING! Any opinions on the above?
--
Janne Blomqvist
Jerry DeLisle
2018-10-06 17:21:58 UTC
Permalink
Post by Janne Blomqvist
Post by Janne Blomqvist
Post by Thomas Koenig
Hi Janne,
Post by Janne Blomqvist
libgfortran has a recursion check in the error handling paths. This
works by checking the value of a static variable, and if it matches,
aborting immediately instead of continuing error processing.
Unfortunately, in a multi-threaded program, if two threads report an
error at the same time, this can trigger this check, and thus the
underlying reason for the error is never reported.
I agree that this is a problem that should be addressed. Hm...
What do you think, would it be desirable / possible to keep the
recursive error check, but ensure thread safety by a suitable
locking mechanism? As a first step, we should probably specify
what exactly we would like to happen.
Let us assume that the aim is to keep the current behavior for normal
processes and allow concurrent error processing for multiple threads.
This could be done by making the static variable thread-local.
I'm not sure that this would work reliably, or if some sort of
locking mechanism (probably involving PTHREAD_MUTEX_RECURSIVE)
would be required - only have one thread at a time process
an error.
Do we actually want to keep the current behavior for non-threaded
programs? I'd be in favor, but I do not feel strongly about that.
What I briefly was thinking about doing, was to use TLS. Or rather, since
TLS is not supported on all targets, something like I did in
intrinsics/random.c:get_rand_state(). But, since the error handling stuff
should be async-signal-safe, the allocation in the setup path should be
done separately, e.g. as part of library initialization.
In the end I decided against it because
1) It's more complicated, in order to handle a quite unlikely edge case.
2) If recursion happens anyway, it's a bug in our error handling flow
which should be fixed and not be papered over.
Anyway, I'm not massively against this, if people have any particular
opinion lets hear it.
PING! Any opinions on the above?
Agree it should be fixed. I would think a mutex lock should work. Lock,
issue error message, unlock. Even if there is recursion, since there is
at least one error somewhere, It should be OK to pause the world to
issue the error, and come back and finish. AS I think about it, maybe
one global lock variable so that only one thread can get it at a time
and others wait until it is unlcoked to proceed with their lock and error.

Would there be a problem with some sort of endless loop waiting for the
unlock?

Jerry
Thomas Koenig
2018-10-06 17:55:02 UTC
Permalink
Hi Jerry,
Post by Jerry DeLisle
Agree it should be fixed. I would think a mutex lock should work. Lock,
issue error message, unlock. Even if there is recursion, since there is
at least one error somewhere, It should be OK to pause the world to
issue the error, and come back and finish. AS I think about it, maybe
one global lock variable so that only one thread can get it at a time
and others wait until it is unlcoked to proceed with their lock and error.
Would there be a problem with some sort of endless loop waiting for the
unlock?
I think that as long as we

a) make the lock recursive

b) make sure to unlock it afterwards

we should be fine, we also should not enter an endless loop on
the rare case of aa double error.

I can prepare a patch, unless somebody else wants to do it.

Regards

Thomas
Janne Blomqvist
2018-10-06 18:05:02 UTC
Permalink
Post by Thomas Koenig
Hi Jerry,
Post by Jerry DeLisle
Agree it should be fixed. I would think a mutex lock should work. Lock,
issue error message, unlock. Even if there is recursion, since there is
at least one error somewhere, It should be OK to pause the world to
issue the error, and come back and finish. AS I think about it, maybe
one global lock variable so that only one thread can get it at a time
and others wait until it is unlcoked to proceed with their lock and
error.
Post by Jerry DeLisle
Would there be a problem with some sort of endless loop waiting for the
unlock?
I think that as long as we
a) make the lock recursive
b) make sure to unlock it afterwards
we should be fine, we also should not enter an endless loop on
the rare case of aa double error.
I can prepare a patch, unless somebody else wants to do it.
The error handling functions can be called from a signal handler, so they
need to be async-signal-safe. See a list e.g. at
http://www.man7.org/linux/man-pages/man7/signal-safety.7.html . Notably,
none of the pthread mutex functions are there.

And come to think of it, neither are pthread_{g,s}etspecific(), so my idea
of using TLS a few messages earlier in this thread is probably not Ok
either..
--
Janne Blomqvist
Thomas Koenig
2018-10-06 21:36:27 UTC
Permalink
Hi Janne,
Post by Janne Blomqvist
The error handling functions can be called from a signal handler, so they
need to be async-signal-safe.
I didn't know that. How can this happen?

Regards

Thomas
Janne Blomqvist
2018-11-08 19:48:12 UTC
Permalink
Post by Thomas Koenig
Hi Janne,
Post by Janne Blomqvist
The error handling functions can be called from a signal handler, so they
need to be async-signal-safe.
I didn't know that. How can this happen?
Hmm, seems I was imagining things, I can't find anything like that in the
code.

Anyway, the more I think about it the better I like my original patch

1) It's KISS

2) I can't find anything in the code that would lead to endless recursive
invocation of the error printing functions.

So, Ok for trunk?
--
Janne Blomqvist
Thomas Koenig
2018-11-10 18:36:22 UTC
Permalink
Hi Janne,
Post by Janne Blomqvist
1) It's KISS
2) I can't find anything in the code that would lead to endless
recursive invocation of the error printing functions.
So, Ok for trunk?
With async I/O, I think the possibilities of hitting
concurrent errors have increased, so I'd still prefer the
solution with wrapping a recursive lock around error handling.

Regards

Thomas
Janne Blomqvist
2018-11-23 19:58:15 UTC
Permalink
With multiple threads, using an unprotected static variable to check
whether recursion has occured isn't valid, as one thread might have
modified the variable, thus causing another thread to incorrectly
conclude that recursion has occured. This patch avoids this problem
by using a thread-specific variable for the recursion check.

Regtested on x86_64-pc-linux-gnu, Ok for trunk?

libgfortran/ChangeLog:

2018-11-23 Janne Blomqvist <***@gcc.gnu.org>

* runtime/error.c (MAGIC): Remove.
(recursion_key): New variable.
(recursion_check): Use thread-specific variable for recursion
check if threads are active.
(constructor_recursion_check): New function.
(destructor_recursion_check): New funcion.
---
libgfortran/runtime/error.c | 43 +++++++++++++++++++++++++++++++------
1 file changed, 36 insertions(+), 7 deletions(-)

diff --git a/libgfortran/runtime/error.c b/libgfortran/runtime/error.c
index b07a4c0b12a..7c52733c19c 100644
--- a/libgfortran/runtime/error.c
+++ b/libgfortran/runtime/error.c
@@ -332,21 +332,50 @@ show_locus (st_parameter_common *cmp)

/* recursion_check()-- It's possible for additional errors to occur
* during fatal error processing. We detect this condition here and
- * exit with code 4 immediately. */
+ * abort immediately. */

-#define MAGIC 0x20DE8101
+static __gthread_key_t recursion_key;

static void
recursion_check (void)
{
- static int magic = 0;
+ if (__gthread_active_p ())
+ {
+ bool* p = __gthread_getspecific (recursion_key);
+ if (!p)
+ {
+ p = xcalloc (1, sizeof (bool));
+ __gthread_setspecific (recursion_key, p);
+ }
+ if (*p)
+ sys_abort ();
+ *p = true;
+ }
+ else
+ {
+ static bool recur;
+ if (recur)
+ sys_abort ();
+ recur = true;
+ }
+}

- /* Don't even try to print something at this point */
- if (magic == MAGIC)
- sys_abort ();
+#ifdef __GTHREADS
+static void __attribute__((constructor))
+constructor_recursion_check (void)
+{
+ if (__gthread_active_p ())
+ __gthread_key_create (&recursion_key, &free);
+}

- magic = MAGIC;
+static void __attribute__((destructor))
+destructor_recursion_check (void)
+{
+ if (__gthread_active_p ())
+ __gthread_key_delete (recursion_key);
}
+#endif
+


#define STRERR_MAXSZ 256
--
2.17.1
Janne Blomqvist
2018-11-23 20:43:08 UTC
Permalink
Post by Janne Blomqvist
With multiple threads, using an unprotected static variable to check
whether recursion has occured isn't valid, as one thread might have
modified the variable, thus causing another thread to incorrectly
conclude that recursion has occured. This patch avoids this problem
by using a thread-specific variable for the recursion check.
Regtested on x86_64-pc-linux-gnu, Ok for trunk?
OK and thanks Janne.
Thanks for the quick review, committed as r266419.
--
Janne Blomqvist
Loading...