[ptest-runner][PATCH 3/3] utils.c: add system data collection when a test gets stuck.


Alexander Kanavin
 

Currently, ptest-runner simply kills the offending test without further ado,
which is not at all helpful when trying to figure out why it happens
(especially if such hangs are intermittent and rare). There's now a script
that gets executed before killing the test, so ideas on what to have in it
are welcome.

Signed-off-by: Alexander Kanavin <alex@linutronix.de>
---
Makefile | 2 +-
ptest-runner-collect-system-data | 5 +++++
utils.c | 24 ++++++++++++++++++++++++
3 files changed, 30 insertions(+), 1 deletion(-)
create mode 100755 ptest-runner-collect-system-data

diff --git a/Makefile b/Makefile
index a6372de..168cf5a 100644
--- a/Makefile
+++ b/Makefile
@@ -43,7 +43,7 @@ $(TEST_EXECUTABLE): $(TEST_OBJECTS)
$(CC) $(LDFLAGS) $(TEST_OBJECTS) -o $@ $(TEST_LIBSTATIC) $(TEST_LDFLAGS)

check: $(TEST_EXECUTABLE)
- ./$(TEST_EXECUTABLE) -d $(TEST_DATA)
+ PATH=.:$(PATH) ./$(TEST_EXECUTABLE) -d $(TEST_DATA)

.c.o:
$(CC) $(CFLAGS) -c $< -o $@
diff --git a/ptest-runner-collect-system-data b/ptest-runner-collect-system-data
new file mode 100755
index 0000000..5bfeaf3
--- /dev/null
+++ b/ptest-runner-collect-system-data
@@ -0,0 +1,5 @@
+#!/bin/sh
+# Other ideas on what to do when a ptest gets stuck welcome.
+pstree -a -l
+df
+free
diff --git a/utils.c b/utils.c
index 58c3aa1..a67ac11 100644
--- a/utils.c
+++ b/utils.c
@@ -281,6 +281,27 @@ close_fds(void)
}
}

+static void
+collect_system_state(FILE* fout)
+{
+ char *cmd = "ptest-runner-collect-system-data";
+
+ char buf[1024];
+ FILE *fp;
+
+ if ((fp = popen(cmd, "r")) == NULL) {
+ fprintf(fout, "Error opening pipe!\n");
+ }
+
+ while (fgets(buf, 1024, fp) != NULL) {
+ fprintf(fout, "%s", buf);
+ }
+
+ if(pclose(fp)) {
+ fprintf(fout, "Command not found or exited with error status\n");
+ }
+}
+
static void *
read_child(void *arg)
{
@@ -313,6 +334,9 @@ read_child(void *arg)
}

} else if (r == 0) {
+ // no output from the test after a timeout; the test is stuck, so collect
+ // as much data from the system as possible and kill the test
+ collect_system_state(_child_reader.fps[0]);
_child_reader.timeouted = 1;
kill(-_child_reader.pid, SIGKILL);
}
--
2.33.0


Richard Purdie
 

On Thu, 2021-09-16 at 14:46 +0200, Alexander Kanavin wrote:
Currently, ptest-runner simply kills the offending test without further ado,
which is not at all helpful when trying to figure out why it happens
(especially if such hangs are intermittent and rare). There's now a script
that gets executed before killing the test, so ideas on what to have in it
are welcome.

Signed-off-by: Alexander Kanavin <alex@linutronix.de>
---
Makefile | 2 +-
ptest-runner-collect-system-data | 5 +++++
utils.c | 24 ++++++++++++++++++++++++
3 files changed, 30 insertions(+), 1 deletion(-)
create mode 100755 ptest-runner-collect-system-data

diff --git a/Makefile b/Makefile
index a6372de..168cf5a 100644
--- a/Makefile
+++ b/Makefile
@@ -43,7 +43,7 @@ $(TEST_EXECUTABLE): $(TEST_OBJECTS)
$(CC) $(LDFLAGS) $(TEST_OBJECTS) -o $@ $(TEST_LIBSTATIC) $(TEST_LDFLAGS)

check: $(TEST_EXECUTABLE)
- ./$(TEST_EXECUTABLE) -d $(TEST_DATA)
+ PATH=.:$(PATH) ./$(TEST_EXECUTABLE) -d $(TEST_DATA)

.c.o:
$(CC) $(CFLAGS) -c $< -o $@
diff --git a/ptest-runner-collect-system-data b/ptest-runner-collect-system-data
new file mode 100755
index 0000000..5bfeaf3
--- /dev/null
+++ b/ptest-runner-collect-system-data
@@ -0,0 +1,5 @@
+#!/bin/sh
+# Other ideas on what to do when a ptest gets stuck welcome.
+pstree -a -l
+df
+free
It is great to see this. I'd suggest dmesg in here since we've seen components
of tests segfault before (e.g. lttng-relayd in lttng-tools).

Cheers,

Richard


Alexander Kanavin
 


On Thu, 16 Sept 2021 at 18:18, Richard Purdie <richard.purdie@...> wrote:
On Thu, 2021-09-16 at 14:46 +0200, Alexander Kanavin wrote:
> Currently, ptest-runner simply kills the offending test without further ado,
> which is not at all helpful when trying to figure out why it happens
> (especially if such hangs are intermittent and rare). There's now a script
> that gets executed before killing the test, so ideas on what to have in it
> are welcome.
>
> Signed-off-by: Alexander Kanavin <alex@...>
> ---
>  Makefile                         |  2 +-
>  ptest-runner-collect-system-data |  5 +++++
>  utils.c                          | 24 ++++++++++++++++++++++++
>  3 files changed, 30 insertions(+), 1 deletion(-)
>  create mode 100755 ptest-runner-collect-system-data
>
> diff --git a/Makefile b/Makefile
> index a6372de..168cf5a 100644
> --- a/Makefile
> +++ b/Makefile
> @@ -43,7 +43,7 @@ $(TEST_EXECUTABLE): $(TEST_OBJECTS)
>       $(CC) $(LDFLAGS) $(TEST_OBJECTS) -o $@ $(TEST_LIBSTATIC) $(TEST_LDFLAGS)

>  check: $(TEST_EXECUTABLE)
> -     ./$(TEST_EXECUTABLE) -d $(TEST_DATA)
> +     PATH=.:$(PATH) ./$(TEST_EXECUTABLE) -d $(TEST_DATA)

>  .c.o:
>       $(CC) $(CFLAGS) -c $< -o $@
> diff --git a/ptest-runner-collect-system-data b/ptest-runner-collect-system-data
> new file mode 100755
> index 0000000..5bfeaf3
> --- /dev/null
> +++ b/ptest-runner-collect-system-data
> @@ -0,0 +1,5 @@
> +#!/bin/sh
> +# Other ideas on what to do when a ptest gets stuck welcome.
> +pstree -a -l
> +df
> +free

It is great to see this. I'd suggest dmesg in here since we've seen components
of tests segfault before (e.g. lttng-relayd in lttng-tools).

Cheers,

Richard



Anibal Limon
 

Hi Alex,

Do you have a repo/branch for this patch?

I'm having issues applying...

...
alimon@blackbox:~/upstream/ptest-runner2$ git am -3 ~/Downloads/\[ptest-runner\]\[PATCH\ 3_3\]\ utils.c_\ add\ system\ data\ collection\ when\ a\ test\ gets\ stuck..eml
Applying: utils.c: add system data collection when a test gets stuck.
error: sha1 information is lacking or useless (utils.c).
error: could not build fake ancestor
Patch failed at 0001 utils.c: add system data collection when a test gets stuck.
hint: Use 'git am --show-current-patch=diff' to see the failed patch
When you have resolved this problem, run "git am --continue".
If you prefer to skip this patch, run "git am --skip" instead.
To restore the original branch and stop patching, run "git am --abort".
...

Regards,
Anibal

On Thu, 16 Sept 2021 at 07:46, Alexander Kanavin <alex.kanavin@...> wrote:
Currently, ptest-runner simply kills the offending test without further ado,
which is not at all helpful when trying to figure out why it happens
(especially if such hangs are intermittent and rare). There's now a script
that gets executed before killing the test, so ideas on what to have in it
are welcome.

Signed-off-by: Alexander Kanavin <alex@...>
---
 Makefile                         |  2 +-
 ptest-runner-collect-system-data |  5 +++++
 utils.c                          | 24 ++++++++++++++++++++++++
 3 files changed, 30 insertions(+), 1 deletion(-)
 create mode 100755 ptest-runner-collect-system-data

diff --git a/Makefile b/Makefile
index a6372de..168cf5a 100644
--- a/Makefile
+++ b/Makefile
@@ -43,7 +43,7 @@ $(TEST_EXECUTABLE): $(TEST_OBJECTS)
        $(CC) $(LDFLAGS) $(TEST_OBJECTS) -o $@ $(TEST_LIBSTATIC) $(TEST_LDFLAGS)

 check: $(TEST_EXECUTABLE)
-       ./$(TEST_EXECUTABLE) -d $(TEST_DATA)
+       PATH=.:$(PATH) ./$(TEST_EXECUTABLE) -d $(TEST_DATA)

 .c.o:
        $(CC) $(CFLAGS) -c $< -o $@
diff --git a/ptest-runner-collect-system-data b/ptest-runner-collect-system-data
new file mode 100755
index 0000000..5bfeaf3
--- /dev/null
+++ b/ptest-runner-collect-system-data
@@ -0,0 +1,5 @@
+#!/bin/sh
+# Other ideas on what to do when a ptest gets stuck welcome.
+pstree -a -l
+df
+free
diff --git a/utils.c b/utils.c
index 58c3aa1..a67ac11 100644
--- a/utils.c
+++ b/utils.c
@@ -281,6 +281,27 @@ close_fds(void)
        }
 }

+static void
+collect_system_state(FILE* fout)
+{
+       char *cmd = "ptest-runner-collect-system-data";
+
+       char buf[1024];
+       FILE *fp;
+
+       if ((fp = popen(cmd, "r")) == NULL) {
+               fprintf(fout, "Error opening pipe!\n");
+       }
+
+       while (fgets(buf, 1024, fp) != NULL) {
+               fprintf(fout, "%s", buf);
+       }
+
+       if(pclose(fp))  {
+               fprintf(fout, "Command not found or exited with error status\n");
+       }
+}
+
 static void *
 read_child(void *arg)
 {
@@ -313,6 +334,9 @@ read_child(void *arg)
                        }

                } else if (r == 0) {
+                       // no output from the test after a timeout; the test is stuck, so collect
+                       // as much data from the system as possible and kill the test
+                       collect_system_state(_child_reader.fps[0]);
                        _child_reader.timeouted = 1;
                        kill(-_child_reader.pid, SIGKILL);
                 }
--
2.33.0


Alexander Kanavin
 

Not sure what happened here. I have published the patches at https://github.com/kanavin/ptest-runner2

Alex


On Thu, 23 Sept 2021 at 18:33, Anibal Limon <anibal.limon@...> wrote:
Hi Alex,

Do you have a repo/branch for this patch?

I'm having issues applying...

...
alimon@blackbox:~/upstream/ptest-runner2$ git am -3 ~/Downloads/\[ptest-runner\]\[PATCH\ 3_3\]\ utils.c_\ add\ system\ data\ collection\ when\ a\ test\ gets\ stuck..eml
Applying: utils.c: add system data collection when a test gets stuck.
error: sha1 information is lacking or useless (utils.c).
error: could not build fake ancestor
Patch failed at 0001 utils.c: add system data collection when a test gets stuck.
hint: Use 'git am --show-current-patch=diff' to see the failed patch
When you have resolved this problem, run "git am --continue".
If you prefer to skip this patch, run "git am --skip" instead.
To restore the original branch and stop patching, run "git am --abort".
...

Regards,
Anibal

On Thu, 16 Sept 2021 at 07:46, Alexander Kanavin <alex.kanavin@...> wrote:
Currently, ptest-runner simply kills the offending test without further ado,
which is not at all helpful when trying to figure out why it happens
(especially if such hangs are intermittent and rare). There's now a script
that gets executed before killing the test, so ideas on what to have in it
are welcome.

Signed-off-by: Alexander Kanavin <alex@...>
---
 Makefile                         |  2 +-
 ptest-runner-collect-system-data |  5 +++++
 utils.c                          | 24 ++++++++++++++++++++++++
 3 files changed, 30 insertions(+), 1 deletion(-)
 create mode 100755 ptest-runner-collect-system-data

diff --git a/Makefile b/Makefile
index a6372de..168cf5a 100644
--- a/Makefile
+++ b/Makefile
@@ -43,7 +43,7 @@ $(TEST_EXECUTABLE): $(TEST_OBJECTS)
        $(CC) $(LDFLAGS) $(TEST_OBJECTS) -o $@ $(TEST_LIBSTATIC) $(TEST_LDFLAGS)

 check: $(TEST_EXECUTABLE)
-       ./$(TEST_EXECUTABLE) -d $(TEST_DATA)
+       PATH=.:$(PATH) ./$(TEST_EXECUTABLE) -d $(TEST_DATA)

 .c.o:
        $(CC) $(CFLAGS) -c $< -o $@
diff --git a/ptest-runner-collect-system-data b/ptest-runner-collect-system-data
new file mode 100755
index 0000000..5bfeaf3
--- /dev/null
+++ b/ptest-runner-collect-system-data
@@ -0,0 +1,5 @@
+#!/bin/sh
+# Other ideas on what to do when a ptest gets stuck welcome.
+pstree -a -l
+df
+free
diff --git a/utils.c b/utils.c
index 58c3aa1..a67ac11 100644
--- a/utils.c
+++ b/utils.c
@@ -281,6 +281,27 @@ close_fds(void)
        }
 }

+static void
+collect_system_state(FILE* fout)
+{
+       char *cmd = "ptest-runner-collect-system-data";
+
+       char buf[1024];
+       FILE *fp;
+
+       if ((fp = popen(cmd, "r")) == NULL) {
+               fprintf(fout, "Error opening pipe!\n");
+       }
+
+       while (fgets(buf, 1024, fp) != NULL) {
+               fprintf(fout, "%s", buf);
+       }
+
+       if(pclose(fp))  {
+               fprintf(fout, "Command not found or exited with error status\n");
+       }
+}
+
 static void *
 read_child(void *arg)
 {
@@ -313,6 +334,9 @@ read_child(void *arg)
                        }

                } else if (r == 0) {
+                       // no output from the test after a timeout; the test is stuck, so collect
+                       // as much data from the system as possible and kill the test
+                       collect_system_state(_child_reader.fps[0]);
                        _child_reader.timeouted = 1;
                        kill(-_child_reader.pid, SIGKILL);
                 }
--
2.33.0