Hi there,
thanks for your library. I came across a livelock in our use of the library and wondered whether this could be considered a bug.
To reproduce, add the following patch to the current master branch and run report server and then report client:
diff --git a/lib60870-C/examples/CMakeLists.txt b/lib60870-C/examples/CMakeLists.txt
index fdb7533..e1457cb 100644
--- a/lib60870-C/examples/CMakeLists.txt
+++ b/lib60870-C/examples/CMakeLists.txt
@@ -9,6 +9,7 @@ add_subdirectory(cs104_server_no_threads)
add_subdirectory(cs104_server_files)
add_subdirectory(cs104_redundancy_server)
add_subdirectory(multi_client_server)
+add_subdirectory(report)
if (WITH_MBEDTLS OR WITH_MBEDTLS3)
add_subdirectory(tls_client)
diff --git a/lib60870-C/examples/report/CMakeLists.txt b/lib60870-C/examples/report/CMakeLists.txt
new file mode 100644
index 0000000..095084e
--- /dev/null
+++ b/lib60870-C/examples/report/CMakeLists.txt
@@ -0,0 +1,2 @@
+add_executable(report report.c)
+target_link_libraries(report lib60870)
diff --git a/lib60870-C/examples/report/report.c b/lib60870-C/examples/report/report.c
new file mode 100644
index 0000000..cc490ec
--- /dev/null
+++ b/lib60870-C/examples/report/report.c
@@ -0,0 +1,110 @@
+#include <stdio.h>
+#include <string.h>
+
+#include "cs104_slave.h"
+#include "cs104_connection.h"
+
+#include "hal_thread.h"
+#include "hal_time.h"
+
+struct client_state {
+ Semaphore semaphore;
+ bool connected;
+};
+
+static const int port = 2404;
+
+static bool isConnected(struct client_state *state) {
+ Semaphore_wait(state->semaphore);
+ bool result = state->connected;
+ Semaphore_post(state->semaphore);
+ return result;
+}
+
+static void setConnected(struct client_state *state, bool connected) {
+ Semaphore_wait(state->semaphore);
+ state->connected = connected;
+ Semaphore_post(state->semaphore);
+}
+
+static void
+connectionHandler (void* parameter, CS104_Connection connection, CS104_ConnectionEvent event)
+{
+ switch (event) {
+ case CS104_CONNECTION_OPENED:
+ puts("Connection established");
+ setConnected((struct client_state*) parameter, true);
+ break;
+ case CS104_CONNECTION_CLOSED:
+ puts("Connection closed");
+ setConnected((struct client_state*) parameter, false);
+ break;
+ case CS104_CONNECTION_FAILED:
+ puts("Failed to connect");
+ setConnected((struct client_state*) parameter, false);
+ break;
+ case CS104_CONNECTION_STARTDT_CON_RECEIVED:
+ puts("Received STARTDT_CON");
+ break;
+ case CS104_CONNECTION_STOPDT_CON_RECEIVED:
+ puts("Received STOPDT_CON");
+ break;
+ default:
+ printf("Received unimplemented connection event %d\n", (int) event);
+ break;
+ }
+}
+
+static void client() {
+ struct client_state state;
+ state.semaphore = Semaphore_create(1);
+ state.connected = false;
+
+ CS104_Connection con = CS104_Connection_create("127.0.0.1", port);
+
+ CS104_Connection_setConnectionHandler(con, connectionHandler, &state);
+
+ while (true) {
+ if (!isConnected(&state)) {
+ puts("Client not connected, connecting...");
+ if (CS104_Connection_connect(con)) {
+ puts("Connected!");
+ } else {
+ puts("Connection failed");
+ }
+ }
+ }
+
+ CS104_Connection_destroy(con);
+}
+
+static void server() {
+ CS104_Slave slave = CS104_Slave_create(10, 10);
+
+ CS104_Slave_setLocalAddress(slave, "127.0.0.1");
+ CS104_Slave_setLocalPort(slave, port);
+ CS104_Slave_setServerMode(slave, CS104_MODE_SINGLE_REDUNDANCY_GROUP);
+
+ CS104_Slave_start(slave);
+
+ puts("Started server");
+ while (CS104_Slave_isRunning(slave) == true) {
+ Thread_sleep(1000);
+ }
+}
+
+int main(int argc, char* argv[]) {
+ if (argc != 2) {
+ fputs("Need exactly one argument: client or server\n", stderr);
+ return 1;
+ }
+ if (0 == strcmp(argv[1], "client")) {
+ client();
+ } else if (0 == strcmp(argv[1], "server")) {
+ server();
+ } else {
+ fputs("Need exactly one argument: client or server\n", stderr);
+ }
+
+ return 0;
+}
diff --git a/lib60870-C/src/iec60870/cs104/cs104_connection.c b/lib60870-C/src/iec60870/cs104/cs104_connection.c
index f4692f4..950b054 100644
--- a/lib60870-C/src/iec60870/cs104/cs104_connection.c
+++ b/lib60870-C/src/iec60870/cs104/cs104_connection.c
@@ -947,6 +947,8 @@ handleConnection(void* parameter)
if (isRunning(self))
{
+ // Make the bug much more likely to occur - this should just influence the order in which threads are scheduled
+ Thread_sleep(1000);
#if (CONFIG_USE_SEMAPHORES == 1)
Semaphore_wait(self->conStateLock);
#endif /* (CONFIG_USE_SEMAPHORES == 1) */
For me, the client outputs the following and then hangs:
$ ./report client
Client not connected, connecting...
Connected!
Client not connected, connecting...
Connection established
According to gdb, in this situation the main thread hangs here:
|
Thread_destroy(self->connectionHandlingThread); |
What happens is the following:
- The main thread sees that we are not connected and calls
CS104_Connection_connect to reconnect.
- The
handleConnection thread starts and gets to here:
- This allows the main thread to return
true from CS104_Connection_connect (after the semaphore was put):
|
while ((isRunning(self) == false) && (isFailure(self) == false)) |
|
Thread_sleep(1); |
|
|
|
return isRunning(self); |
- The
reproducer makes another loop iteration and sees that its connected variable is still false, so calls CS104_Connection_connect again. This causes a call to Thread_destroy, which waits for the worker thread to exit.
- Thanks to the
sleep that I added in my above patch, the worker thread only continues now. It now calls the connectionHandler callback here:
|
self->connectionHandler(self->connectionHandlerParameter, self, CS104_CONNECTION_OPENED); |
- Now the worker thread will handle the connection until the server closes the connection. The main thread is stuck waiting for this disconnect.
I have seen the same race occurring even without this extra sleep, but not in this reproducer example (but I have not tried that often to cause it).
In my actual program, I worked around this race by setting connected to true before calling CS104_Connection_connect. I think I cannot do it afterwards, since that would allow another race: The connection could establish and immediately fail. The connectionHandler callback could thus be called before the main thread sets connected to true and my state would be inconsistent again. Threads are tricky.
I hope the above reports makes a bit of sense. In the end, I think this was caused by an overly aggressive reconnection policy. It still took quite a while to figure out the details and I thought you might be interested.
My question now:
- Do you think it would make sense to ensure that the
connectionHandler callback gets called before CS104_Connection_connect returns? I think that would have avoided all my problems.
- Do you think adding something like
CS104_Slave_isRunning but for CS104_Connection would make sense? Or does such a function perhaps already exist and I just missed it? With such a function, I would no longer have to track the connection state myself to trigger reconnects.
Hi there,
thanks for your library. I came across a livelock in our use of the library and wondered whether this could be considered a bug.
To reproduce, add the following patch to the current
masterbranch and runreport serverand thenreport client:For me, the client outputs the following and then hangs:
According to
gdb, in this situation the main thread hangs here:lib60870/lib60870-C/src/iec60870/cs104/cs104_connection.c
Line 1146 in 94f6e65
What happens is the following:
CS104_Connection_connectto reconnect.handleConnectionthread starts and gets to here:lib60870/lib60870-C/src/iec60870/cs104/cs104_connection.c
Line 941 in 94f6e65
truefromCS104_Connection_connect(after the semaphore was put):lib60870/lib60870-C/src/iec60870/cs104/cs104_connection.c
Lines 1162 to 1165 in 94f6e65
reproducermakes another loop iteration and sees that itsconnectedvariable is stillfalse, so callsCS104_Connection_connectagain. This causes a call toThread_destroy, which waits for the worker thread to exit.sleepthat I added in my above patch, the worker thread only continues now. It now calls theconnectionHandlercallback here:lib60870/lib60870-C/src/iec60870/cs104/cs104_connection.c
Line 962 in 94f6e65
I have seen the same race occurring even without this extra
sleep, but not in this reproducer example (but I have not tried that often to cause it).In my actual program, I worked around this race by setting
connectedtotruebefore callingCS104_Connection_connect. I think I cannot do it afterwards, since that would allow another race: The connection could establish and immediately fail. TheconnectionHandlercallback could thus be called before the main thread setsconnectedtotrueand my state would be inconsistent again. Threads are tricky.I hope the above reports makes a bit of sense. In the end, I think this was caused by an overly aggressive reconnection policy. It still took quite a while to figure out the details and I thought you might be interested.
My question now:
connectionHandlercallback gets called beforeCS104_Connection_connectreturns? I think that would have avoided all my problems.CS104_Slave_isRunningbut forCS104_Connectionwould make sense? Or does such a function perhaps already exist and I just missed it? With such a function, I would no longer have to track the connection state myself to trigger reconnects.