SIGPIPE 与正在运行的程序

发布于 2024-12-13 21:38:22 字数 4511 浏览 0 评论 0原文

我有两个守护进程，A 与 B 通信。B 在某个端口上侦听，A 打开到该端口的 TCP 连接。A 能够打开到 B 的套接字，但是当它尝试实际向该套接字写入时，我收到了 SIGPIPE，因此我试图找出 B 可能在哪里关闭了这个已打开的套接字。

但是，如果我用 gdb 附加到这两个守护进程，SIGPIPE 会在任何处理数据的代码被调用之前发生。这说得通，因为初始写入从未成功，而侦听器是由接收到数据触发的。我的问题是——什么可能导致守护进程 B 在任何数据发送之前就关闭套接字？套接字在打开后不到一微秒就被关闭了，所以我认为这不可能是超时之类的情况。我希望得到一份可供逐项排查的可能原因清单，因为这个问题我已经琢磨了好几天，几乎想不出别的办法了。

根据要求，以下是接受和处理通信的代码：

{
/*
 * Body of the connection-polling routine (it identifies itself as
 * "wait_request" in its log messages); the function header is not part of
 * this excerpt.
 *
 * Steps:
 *   1. select() for up to `waittime` seconds on a private snapshot of
 *      GlobalSocketReadSet.
 *   2. For every ready descriptor, call the handler stored in svr_conn[],
 *      or close the connection when its slot is marked Idle.
 *   3. Close FromClientDIS connections whose last activity is more than
 *      PBS_NET_MAXCONNECTIDLE seconds old, unless they carry
 *      PBS_NET_CONN_NOTIMEOUT.
 *
 * Returns 0 on success (including an interrupted select() and an early exit
 * after *SState changes), -1 when the snapshot cannot be allocated or
 * select() fails.
 */
extern char *PAddrToString(pbs_net_t *);

int i;
int n;

time_t now;

fd_set *SelectSet = NULL;
int SelectSetSize = 0;

int MaxNumDescriptors = 0;

char id[] = "wait_request";
char tmpLine[1024];

struct timeval timeout;

long OrigState = 0;

if (SState != NULL)
  OrigState = *SState;

timeout.tv_usec = 0;

timeout.tv_sec  = waittime;

SelectSetSize = sizeof(char) * get_fdset_size();
SelectSet = (fd_set *)calloc(1,SelectSetSize);

if (SelectSet == NULL)
  {
  /* without a private copy of the read set there is nothing to select on */
  log_err(ENOMEM, id, "Unable to allocate memory for select set");

  return(-1);
  }

/* snapshot the shared read set so select() never touches the global copy */
pthread_mutex_lock(global_sock_read_mutex);

memcpy(SelectSet,GlobalSocketReadSet,SelectSetSize);

MaxNumDescriptors = get_max_num_descriptors();

pthread_mutex_unlock(global_sock_read_mutex);
n = select(MaxNumDescriptors, SelectSet, (fd_set *)0, (fd_set *)0, &timeout);

if (n == -1)
  {
  if (errno == EINTR)
    {
    n = 0; /* interrupted, cycle around */
    }
  else
    {
    /* save errno now: fstat() and the calls below may overwrite it */
    int select_errno = errno;

    struct stat fbuf;

    /* check all file descriptors to verify they are valid */

    /* NOTE: SelectSet may be modified by failed select(), so the global */
    /*       set is examined instead; hold its mutex for the whole scan  */

    pthread_mutex_lock(global_sock_read_mutex);

    for (i = 0; i < MaxNumDescriptors; i++)
      {
      if (FD_ISSET(i, GlobalSocketReadSet) == 0)
        continue;

      if (fstat(i, &fbuf) == 0)
        continue;

      /* descriptor is no longer valid - stop selecting on it */

      FD_CLR(i, GlobalSocketReadSet);
      } /* END for each socket in global read set */

    pthread_mutex_unlock(global_sock_read_mutex);

    free(SelectSet);

    log_err(select_errno, id, "Unable to select sockets to read requests");

    return(-1);
    }  /* END else (errno == EINTR) */
  }    /* END if (n == -1) */

for (i = 0; (i < max_connection) && (n != 0); i++)
  {
  pthread_mutex_lock(svr_conn[i].cn_mutex);

  if (FD_ISSET(i, SelectSet))
    {
    /* this socket has data */
    n--;

    svr_conn[i].cn_lasttime = time(NULL);

    if (svr_conn[i].cn_active != Idle)
      {
      void *(*func)(void *) = svr_conn[i].cn_func;

      netcounter_incr();

      /* the handler runs without the connection mutex held */
      pthread_mutex_unlock(svr_conn[i].cn_mutex);

      func((void *)&i);

      /* NOTE:  breakout if state changed (probably received shutdown request) */

      if ((SState != NULL) &&
          (OrigState != *SState))
        break;
      }
    else
      {
      /* descriptor is readable but its slot is Idle: drop it from the */
      /* global read set and close it                                   */

      pthread_mutex_lock(global_sock_read_mutex);
      FD_CLR(i, GlobalSocketReadSet);
      pthread_mutex_unlock(global_sock_read_mutex);

      close_conn(i, TRUE);

      pthread_mutex_unlock(svr_conn[i].cn_mutex);
      pthread_mutex_lock(num_connections_mutex);

      snprintf(tmpLine, sizeof(tmpLine), "closed connections to fd %d - num_connections=%d (select bad socket)",
        i,
        num_connections);

      pthread_mutex_unlock(num_connections_mutex);
      log_err(-1, id, tmpLine);
      }
    }
  else
    pthread_mutex_unlock(svr_conn[i].cn_mutex);
  } /* END for i */

/* the snapshot is not needed past this point; release it before any return */
free(SelectSet);
SelectSet = NULL;

/* NOTE:  break out if shutdown request received */

if ((SState != NULL) && (OrigState != *SState))
  return(0);

/* have any connections timed out ?? */
now = time((time_t *)0);

for (i = 0;i < max_connection;i++)
  {
  struct connection *cp;

  pthread_mutex_lock(svr_conn[i].cn_mutex);

  cp = &svr_conn[i];

  if (cp->cn_active != FromClientDIS)
    {
    pthread_mutex_unlock(svr_conn[i].cn_mutex);

    continue;
    }

  if ((now - cp->cn_lasttime) <= PBS_NET_MAXCONNECTIDLE)
    {
    pthread_mutex_unlock(svr_conn[i].cn_mutex);

    continue;
    }

  if (cp->cn_authen & PBS_NET_CONN_NOTIMEOUT)
    {
    pthread_mutex_unlock(svr_conn[i].cn_mutex);

    continue; /* do not time-out this connection */
    }

  /* NOTE:  add info about node associated with connection - NYI */

  snprintf(tmpLine, sizeof(tmpLine), "connection %d to host %s has timed out after %d seconds - closing stale connection\n",
    i,
    PAddrToString(&cp->cn_addr),
    PBS_NET_MAXCONNECTIDLE);

  log_err(-1, id, tmpLine);

  /* locate node associated with interface, mark node as down until node responds */
  /* NYI */
  close_conn(i, TRUE);

  pthread_mutex_unlock(svr_conn[i].cn_mutex);
  }  /* END for (i) */

return(0);
}

注意：我没有编写此代码。

原文

I have two daemons, and A is speaking to B. B is listening on a port, and A opens a tcp connection to that port. A is able to open a socket to B, but when it attempts to actually write to said socket, I get a SIGPIPE, so I'm trying to figure out where B could be closing the open socket.

However, if I attach to both daemons in gdb, the SIGPIPE happens before any of the code for handling data is called. This kind of makes sense, because the initial write is never successful, and the listeners are triggered from receiving data. My question is - what could cause daemon B to close the socket before any data is sent? The socket is closed less than a microsecond after opening it, so I'm thinking it can't be a timeout or anything of the sort. I would love a laundry list of possibilities to track down, as I've been chewing on this one for a few days and I'm pretty much out of ideas.

As requested, here is the code that accepts and handles communication:

{
/*
 * Body of the connection-polling routine (it identifies itself as
 * "wait_request" in its log messages); the function header is not part of
 * this excerpt.
 *
 * Steps:
 *   1. select() for up to `waittime` seconds on a private snapshot of
 *      GlobalSocketReadSet.
 *   2. For every ready descriptor, call the handler stored in svr_conn[],
 *      or close the connection when its slot is marked Idle.
 *   3. Close FromClientDIS connections whose last activity is more than
 *      PBS_NET_MAXCONNECTIDLE seconds old, unless they carry
 *      PBS_NET_CONN_NOTIMEOUT.
 *
 * Returns 0 on success (including an interrupted select() and an early exit
 * after *SState changes), -1 when the snapshot cannot be allocated or
 * select() fails.
 */
extern char *PAddrToString(pbs_net_t *);

int i;
int n;

time_t now;

fd_set *SelectSet = NULL;
int SelectSetSize = 0;

int MaxNumDescriptors = 0;

char id[] = "wait_request";
char tmpLine[1024];

struct timeval timeout;

long OrigState = 0;

if (SState != NULL)
  OrigState = *SState;

timeout.tv_usec = 0;

timeout.tv_sec  = waittime;

SelectSetSize = sizeof(char) * get_fdset_size();
SelectSet = (fd_set *)calloc(1,SelectSetSize);

if (SelectSet == NULL)
  {
  /* without a private copy of the read set there is nothing to select on */
  log_err(ENOMEM, id, "Unable to allocate memory for select set");

  return(-1);
  }

/* snapshot the shared read set so select() never touches the global copy */
pthread_mutex_lock(global_sock_read_mutex);

memcpy(SelectSet,GlobalSocketReadSet,SelectSetSize);

MaxNumDescriptors = get_max_num_descriptors();

pthread_mutex_unlock(global_sock_read_mutex);
n = select(MaxNumDescriptors, SelectSet, (fd_set *)0, (fd_set *)0, &timeout);

if (n == -1)
  {
  if (errno == EINTR)
    {
    n = 0; /* interrupted, cycle around */
    }
  else
    {
    /* save errno now: fstat() and the calls below may overwrite it */
    int select_errno = errno;

    struct stat fbuf;

    /* check all file descriptors to verify they are valid */

    /* NOTE: SelectSet may be modified by failed select(), so the global */
    /*       set is examined instead; hold its mutex for the whole scan  */

    pthread_mutex_lock(global_sock_read_mutex);

    for (i = 0; i < MaxNumDescriptors; i++)
      {
      if (FD_ISSET(i, GlobalSocketReadSet) == 0)
        continue;

      if (fstat(i, &fbuf) == 0)
        continue;

      /* descriptor is no longer valid - stop selecting on it */

      FD_CLR(i, GlobalSocketReadSet);
      } /* END for each socket in global read set */

    pthread_mutex_unlock(global_sock_read_mutex);

    free(SelectSet);

    log_err(select_errno, id, "Unable to select sockets to read requests");

    return(-1);
    }  /* END else (errno == EINTR) */
  }    /* END if (n == -1) */

for (i = 0; (i < max_connection) && (n != 0); i++)
  {
  pthread_mutex_lock(svr_conn[i].cn_mutex);

  if (FD_ISSET(i, SelectSet))
    {
    /* this socket has data */
    n--;

    svr_conn[i].cn_lasttime = time(NULL);

    if (svr_conn[i].cn_active != Idle)
      {
      void *(*func)(void *) = svr_conn[i].cn_func;

      netcounter_incr();

      /* the handler runs without the connection mutex held */
      pthread_mutex_unlock(svr_conn[i].cn_mutex);

      func((void *)&i);

      /* NOTE:  breakout if state changed (probably received shutdown request) */

      if ((SState != NULL) &&
          (OrigState != *SState))
        break;
      }
    else
      {
      /* descriptor is readable but its slot is Idle: drop it from the */
      /* global read set and close it                                   */

      pthread_mutex_lock(global_sock_read_mutex);
      FD_CLR(i, GlobalSocketReadSet);
      pthread_mutex_unlock(global_sock_read_mutex);

      close_conn(i, TRUE);

      pthread_mutex_unlock(svr_conn[i].cn_mutex);
      pthread_mutex_lock(num_connections_mutex);

      snprintf(tmpLine, sizeof(tmpLine), "closed connections to fd %d - num_connections=%d (select bad socket)",
        i,
        num_connections);

      pthread_mutex_unlock(num_connections_mutex);
      log_err(-1, id, tmpLine);
      }
    }
  else
    pthread_mutex_unlock(svr_conn[i].cn_mutex);
  } /* END for i */

/* the snapshot is not needed past this point; release it before any return */
free(SelectSet);
SelectSet = NULL;

/* NOTE:  break out if shutdown request received */

if ((SState != NULL) && (OrigState != *SState))
  return(0);

/* have any connections timed out ?? */
now = time((time_t *)0);

for (i = 0;i < max_connection;i++)
  {
  struct connection *cp;

  pthread_mutex_lock(svr_conn[i].cn_mutex);

  cp = &svr_conn[i];

  if (cp->cn_active != FromClientDIS)
    {
    pthread_mutex_unlock(svr_conn[i].cn_mutex);

    continue;
    }

  if ((now - cp->cn_lasttime) <= PBS_NET_MAXCONNECTIDLE)
    {
    pthread_mutex_unlock(svr_conn[i].cn_mutex);

    continue;
    }

  if (cp->cn_authen & PBS_NET_CONN_NOTIMEOUT)
    {
    pthread_mutex_unlock(svr_conn[i].cn_mutex);

    continue; /* do not time-out this connection */
    }

  /* NOTE:  add info about node associated with connection - NYI */

  snprintf(tmpLine, sizeof(tmpLine), "connection %d to host %s has timed out after %d seconds - closing stale connection\n",
    i,
    PAddrToString(&cp->cn_addr),
    PBS_NET_MAXCONNECTIDLE);

  log_err(-1, id, tmpLine);

  /* locate node associated with interface, mark node as down until node responds */
  /* NYI */
  close_conn(i, TRUE);

  pthread_mutex_unlock(svr_conn[i].cn_mutex);
  }  /* END for (i) */

return(0);
}

NOTE: I didn't write this code.

分享到QQ

分享到微博