Post

Quagga zebra between kernel interface method

Quagga开源路由协议的核心功能,就是维护内核中的路由表,这是通过用户态的路由协议和内核交互实现的。那么用户态程序如何与内核进行交互,本文就来探究一下。

在 Quagga 1.2.4的 configure.ac 文件中有下面这一段代码:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
dnl ------------------------------------
dnl Determine routing get and set method
dnl ------------------------------------
dnl On GNU/Linux the netlink interface is used (rt_netlink.o, HAVE_NETLINK);
dnl on every other platform zebra falls back to the BSD routing socket
dnl (rt_socket.o + kernel_socket.o).
AC_MSG_CHECKING(zebra between kernel interface method)
if test x"$opsys" = x"gnu-linux"; then
  AC_MSG_RESULT(netlink)
  RT_METHOD=rt_netlink.o
  AC_DEFINE(HAVE_NETLINK,,netlink)
  netlink=yes
else
  AC_MSG_RESULT(Route socket)
  KERNEL_METHOD="kernel_socket.o"
  RT_METHOD="rt_socket.o"
fi
dnl Export the chosen object-file names to Makefile.am via @RT_METHOD@ /
dnl @KERNEL_METHOD@ substitutions.
AC_SUBST(RT_METHOD)
AC_SUBST(KERNEL_METHOD)
AM_CONDITIONAL([HAVE_NETLINK], [test "x$netlink" = "xyes"])

通过判断操作系统的类型(opsys),当是 gnu-linux 的时候,RT_METHOD = rt_netlink.o,否则 RT_METHOD = rt_socket.o 且 KERNEL_METHOD = kernel_socket.o。RT_METHOD 和 KERNEL_METHOD 在 zebra 目录中的 Makefile.am 引用,用来赋值最终需要编译的目标文件变量,具体如下:

1
2
3
4
5
6
7
8
9
# Object-file names chosen by configure; each @VAR@ is an AC_SUBST
# substitution from configure.ac.
ipforward = @IPFORWARD@
if_method = @IF_METHOD@
rt_method = @RT_METHOD@
rtread_method = @RTREAD_METHOD@
kernel_method = @KERNEL_METHOD@
ioctl_method = @IOCTL_METHOD@

# Platform-specific objects linked into the zebra daemon.
otherobj = $(ioctl_method) $(ipforward) $(if_method) \
	$(rt_method) $(rtread_method) $(kernel_method)

接下来我们看一下具体的代码。在 zebra_rib.c 中 rib_process 最后通过 rib_update_kernel 将路由表安装到内核中, rib_update_kernel 最终通过 kernel_route_rib 将路由实际下发到内核路由表。而kernel_route_rib 有两种不同的实现,分别对应前面提到的 rt_netlink.c 以及 rt_socket.c 和 kernel_socket.c 文件。

首先让我们看看 rt_netlink.c中 的实现。

1
2
3
4
5
6
7
8
9
10
11
12
13
/* Install, withdraw or replace a route in the kernel via netlink.
 * old/new describe the previous and desired RIB entries for prefix p;
 * exactly which operation is issued depends on which of them is set. */
int
kernel_route_rib (struct prefix *p, struct rib *old, struct rib *new)
{
  /* Pure install: there was no previous route. */
  if (new && !old)
    return netlink_route_multipath (RTM_NEWROUTE, p, new);

  /* Pure withdraw: the route is going away. */
  if (old && !new)
    return netlink_route_multipath (RTM_DELROUTE, p, old);

  /* Replace, can be done atomically if metric does not change;
   * netlink uses [prefix, tos, priority] to identify prefix.
   * Now metric is not sent to kernel, so we can just do atomic replace. */
  return netlink_route_multipath (RTM_NEWROUTE, p, new);
}

根据 old 和 new 的状态决定是添加 (RTM_NEWROUTE) 还是删除 (RTM_DELROUTE) 路由, 或者是更新路由。不论哪种情况最终都调用 netlink_route_multipath。因为netlink_route_multipath的实现比较长,我们只选取比较重要的进行分析。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
/* Routing table change via netlink interface. */
static int
netlink_route_multipath (int cmd, struct prefix *p, struct rib *rib)
{
  int bytelen;
  struct sockaddr_nl snl;
  struct nexthop *nexthop = NULL, *tnexthop;
  int recursing;
  int nexthop_num;
  int discard;
  int family = PREFIX_FAMILY(p);
  const char *routedesc;

  /* Request message layout: member n is the generic netlink message
     header, member r is the routing (rtmsg) header, and buf carries the
     route attribute payload. */
  struct
  {
    struct nlmsghdr n;
    struct rtmsg r;
    char buf[NL_PKT_BUF_SIZE];
  } req;

  struct zebra_vrf *zvrf = vrf_info_lookup (rib->vrf_id);

  /* Zero only the two fixed headers; the large payload buffer need not be
     cleared since nlmsg_len bounds how much of it is sent. */
  memset (&req, 0, sizeof req - NL_PKT_BUF_SIZE);

  /* Address length in bytes: 4 for IPv4, 16 for IPv6. */
  bytelen = (family == AF_INET ? 4 : 16);

  req.n.nlmsg_len = NLMSG_LENGTH (sizeof (struct rtmsg));
  req.n.nlmsg_flags = NLM_F_CREATE | NLM_F_REPLACE | NLM_F_REQUEST;
  req.n.nlmsg_type = cmd;
  req.r.rtm_family = family;
  req.r.rtm_table = rib->table;
  req.r.rtm_dst_len = p->prefixlen;
  req.r.rtm_protocol = RTPROT_ZEBRA;
  req.r.rtm_scope = RT_SCOPE_LINK;

  /* Elided in this excerpt: filling in the route attributes based on the
     cmd, p and rib arguments. */

  /* Destination netlink address. */
  memset (&snl, 0, sizeof snl);
  snl.nl_family = AF_NETLINK;

  /* Besides the message n itself, netlink_talk is handed the command
     netlink socket (netlink_cmd) and the VRF whose table is being
     programmed. */

  /* Talk to netlink socket. */
  return netlink_talk (&req.n, &zvrf->netlink_cmd, zvrf);
}

我们再看看netlink_talk的函数实现。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
/* sendmsg() to netlink socket then recvmsg(). */
static int
netlink_talk (struct nlmsghdr *n, struct nlsock *nl, struct zebra_vrf *zvrf)
{
  int status;
  struct sockaddr_nl snl;

  /* Build the scatter/gather entry (iovec) and message header (msghdr).
     Note that the caller's nlmsghdr is used directly as the send buffer,
     with nlmsg_len giving the buffer length. */
  struct iovec iov = {
    .iov_base = (void *) n,
    .iov_len = n->nlmsg_len
  };
  struct msghdr msg = {
    .msg_name = (void *) &snl,
    .msg_namelen = sizeof snl,
    .msg_iov = &iov,
    .msg_iovlen = 1,
  };
  int save_errno;

  memset (&snl, 0, sizeof snl);
  snl.nl_family = AF_NETLINK;

  n->nlmsg_seq = ++nl->seq;

  /* Ask the kernel to acknowledge this message. */

  /* Request an acknowledgement by setting NLM_F_ACK */
  n->nlmsg_flags |= NLM_F_ACK;

  if (IS_ZEBRA_DEBUG_KERNEL)
    zlog_debug ("netlink_talk: %s type %s(%u), seq=%u", nl->name,
               lookup (nlmsg_str, n->nlmsg_type), n->nlmsg_type,
               n->nlmsg_seq);

  /* Hand the message to the kernel with sendmsg(). */

  /* Send message to netlink interface. */
  if (zserv_privs.change (ZPRIVS_RAISE))
    zlog (NULL, LOG_ERR, "Can't raise privileges");
  status = sendmsg (nl->sock, &msg, 0);
  /* Save errno before the privilege drop below can clobber it. */
  save_errno = errno;
  if (zserv_privs.change (ZPRIVS_LOWER))
    zlog (NULL, LOG_ERR, "Can't lower privileges");

  if (status < 0)
    {
      zlog (NULL, LOG_ERR, "netlink_talk sendmsg() error: %s",
            safe_strerror (save_errno));
      return -1;
    }

   /* Read the kernel's response and parse it via netlink_parse_info. */

  /* 
   * Get reply from netlink socket. 
   * The reply should either be an acknowlegement or an error.
   */
  return netlink_parse_info (netlink_talk_filter, nl, zvrf);
}

我们再看看netlink_parse_info的实现,因为代码比较长,所以只选取重点段落分析。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
/* Receive message from netlink interface and pass those information
   to the given function. */
static int
netlink_parse_info (int (*filter) (struct sockaddr_nl *, struct nlmsghdr *,
                                   vrf_id_t),
                    struct nlsock *nl, struct zebra_vrf *zvrf)
{
  int status;
  int ret = 0;
  int error;

  while (1)
    {
      /* Set up the receive buffer and message structures. */
      struct iovec iov = {
        .iov_base = nl_rcvbuf.p,
        .iov_len = nl_rcvbuf.size,
      };
      struct sockaddr_nl snl;
      struct msghdr msg = {
        .msg_name = (void *) &snl,
        .msg_namelen = sizeof snl,
        .msg_iov = &iov,
        .msg_iovlen = 1
      };
      struct nlmsghdr *h;

      /* Receive the kernel's response with recvmsg(). */
      status = recvmsg (nl->sock, &msg, 0);
      if (status < 0)
        {
          if (errno == EINTR)
            continue;
          /* Non-blocking socket drained: all pending messages consumed. */
          if (errno == EWOULDBLOCK || errno == EAGAIN)
            break;
          zlog (NULL, LOG_ERR, "%s recvmsg overrun: %s",
	  	nl->name, safe_strerror(errno));
          continue;
        }

      if (status == 0)
        {
          zlog (NULL, LOG_ERR, "%s EOF", nl->name);
          return -1;
        }

      if (msg.msg_namelen != sizeof snl)
        {
          zlog (NULL, LOG_ERR, "%s sender address length error: length %d",
                nl->name, msg.msg_namelen);
          return -1;
        }
      
      /* Walk every nlmsghdr packed into the receive buffer. */
      for (h = (struct nlmsghdr *) nl_rcvbuf.p; 
           NLMSG_OK (h, (unsigned int) status);
           h = NLMSG_NEXT (h, status))
        {
             /* Elided here: handling of the individual message. */
		}

          /* NOTE(review): in this excerpt the debug block below sits after
             the loop yet dereferences h; in the full source it is inside the
             per-message loop body — the article elided the surrounding
             code. Verify against the original rt_netlink.c. */

          /* OK we got netlink message. */
          if (IS_ZEBRA_DEBUG_KERNEL)
            zlog_debug ("netlink_parse_info: %s type %s(%u), seq=%u, pid=%u",
                       nl->name,
                       lookup (nlmsg_str, h->nlmsg_type), h->nlmsg_type,
                       h->nlmsg_seq, h->nlmsg_pid);

        /* Elided here: error-message handling. */
    }

  return ret;
}

经过上面的分析,我们就知道了zebra如何使用netlink和内核交互路由配置信息,接下来我们看看socket方式是如何实现的。

在rt_socket.c中我们找到了老朋友kernel_route_rib函数:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
/* Program a route change into the kernel through the routing socket.
 * The old entry (if any) is deleted first, then the new one (if any)
 * is added; the kernel_rtm() results are OR-ed into the return value. */
int
kernel_route_rib (struct prefix *p, struct rib *old, struct rib *new)
{
  int result = 0;

  if (zserv_privs.change(ZPRIVS_RAISE))
    zlog (NULL, LOG_ERR, "Can't raise privileges");

  if (old != NULL)
    result |= kernel_rtm (RTM_DELETE, p, old);
  if (new != NULL)
    result |= kernel_rtm (RTM_ADD, p, new);

  if (zserv_privs.change(ZPRIVS_LOWER))
    zlog (NULL, LOG_ERR, "Can't lower privileges");

  return result;
}

可以看到不论是添加还是删除路由,最终都通过kernel_rtm实现的,而kernel_rtm根据Address Family类型AF_INET和AF_INET6,分别调用kernel_rtm_ipv4和kernel_rtm_ipv6。我们以kernel_rtm_ipv4为例看一下具体流程, kernel_rtm_ipv6的实现是类似的。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
/* Interface between zebra message and rtm message. */
static int
kernel_rtm_ipv4 (int cmd, struct prefix *p, struct rib *rib)

{
  struct sockaddr_in *mask = NULL;
  struct sockaddr_in sin_dest, sin_mask, sin_gate;
  struct nexthop *nexthop, *tnexthop;
  int recursing;
  int nexthop_num = 0;
  ifindex_t ifindex = 0;
  int gate = 0;
  int error;
  char prefix_buf[PREFIX_STRLEN];

  /* Initialise the address-family specific sockaddr structures. */

  if (IS_ZEBRA_DEBUG_RIB)
    prefix2str (p, prefix_buf, sizeof(prefix_buf));
  memset (&sin_dest, 0, sizeof (struct sockaddr_in));
  sin_dest.sin_family = AF_INET;
#ifdef HAVE_STRUCT_SOCKADDR_IN_SIN_LEN
  sin_dest.sin_len = sizeof (struct sockaddr_in);
#endif /* HAVE_STRUCT_SOCKADDR_IN_SIN_LEN */
  sin_dest.sin_addr = p->u.prefix4;

  memset (&sin_mask, 0, sizeof (struct sockaddr_in));

  memset (&sin_gate, 0, sizeof (struct sockaddr_in));
  sin_gate.sin_family = AF_INET;
#ifdef HAVE_STRUCT_SOCKADDR_IN_SIN_LEN
  sin_gate.sin_len = sizeof (struct sockaddr_in);
#endif /* HAVE_STRUCT_SOCKADDR_IN_SIN_LEN */

  /* Make gateway. */
  for (ALL_NEXTHOPS_RO(rib->nexthop, nexthop, tnexthop, recursing))
    {
      if (CHECK_FLAG (nexthop->flags, NEXTHOP_FLAG_RECURSIVE))
        continue;

      gate = 0;
      char gate_buf[INET_ADDRSTRLEN] = "NULL";

      /* Elided here: per-nexthop sanity checks; in the full source they
         also set gate, ifindex and mask — this excerpt leaves them at
         their initial values. */

      /* Send the route down to the kernel via rtm_write. */
	  error = rtm_write (cmd,
			     (union sockunion *)&sin_dest, 
			     (union sockunion *)mask, 
			     gate ? (union sockunion *)&sin_gate : NULL,
			     ifindex,
			     rib->flags,
			     rib->metric);

           if (IS_ZEBRA_DEBUG_RIB)
           {
             if (!gate)
             {
               zlog_debug ("%s: %s: attention! gate not found for rib %p",
                 __func__, prefix_buf, rib);
               rib_dump (p, rib);
             }
             else
               inet_ntop (AF_INET, &sin_gate.sin_addr, gate_buf, INET_ADDRSTRLEN);
           }

           /* Elided here: handling of the rtm_write error codes. */
 
     } /* for (ALL_NEXTHOPS_RO(...))*/

  return 0; /*XXX*/
}

我们再来看看rtm_write的实现。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
/* Interface function for the kernel routing table updates.  Support
 * for RTM_CHANGE will be needed.
 * Exported only for rt_socket.c
 */
/* Interface function for the kernel routing table updates.  Support
 * for RTM_CHANGE will be needed.
 * Exported only for rt_socket.c
 */
int
rtm_write (int message,
	   union sockunion *dest,
	   union sockunion *mask,
	   union sockunion *gate,
	   unsigned int index,
	   int zebra_flags,
	   int metric)
{
  int ret;
  caddr_t pnt;
  struct interface *ifp;

  /* Sequencial number of routing message. */
  static int msg_seq = 0;

  /* Struct of rt_msghdr and buffer for storing socket's data. */
  struct 
  {
    struct rt_msghdr rtm;
    char buf[512];
  } msg;
  
  if (routing_sock < 0)
    return ZEBRA_ERR_EPERM;

  /* Fill in the rt_msghdr. */

  /* Clear and set rt_msghdr values */
  /* Only the header is zeroed; buf is filled below and bounded by
     rtm_msglen. */
  memset (&msg, 0, sizeof (struct rt_msghdr));
  msg.rtm.rtm_version = RTM_VERSION;
  msg.rtm.rtm_type = message;
  msg.rtm.rtm_seq = msg_seq++;
  msg.rtm.rtm_addrs = RTA_DST;
  msg.rtm.rtm_addrs |= RTA_GATEWAY;
  msg.rtm.rtm_flags = RTF_UP;
  msg.rtm.rtm_index = index;

  /* Flag setup. */
  if (metric != 0)
    {
      msg.rtm.rtm_rmx.rmx_hopcount = metric;
      msg.rtm.rtm_inits |= RTV_HOPCOUNT;
    }

  ifp = if_lookup_by_index (index);

  if (gate && (message == RTM_ADD || message == RTM_CHANGE))
    msg.rtm.rtm_flags |= RTF_GATEWAY;

  /* No netmask on an add/change means a host route. */
  if (mask)
    msg.rtm.rtm_addrs |= RTA_NETMASK;
  else if (message == RTM_ADD || message == RTM_CHANGE)
    msg.rtm.rtm_flags |= RTF_HOST;

  /* Tagging route with flags */
  msg.rtm.rtm_flags |= (RTF_PROTO1);

  /* Additional flags. */
  if (zebra_flags & ZEBRA_FLAG_BLACKHOLE)
    msg.rtm.rtm_flags |= RTF_BLACKHOLE;
  if (zebra_flags & ZEBRA_FLAG_REJECT)
    msg.rtm.rtm_flags |= RTF_REJECT;

  /* Append destination prefix, netmask and gateway sockaddrs to buf.
     SOCKADDRSET copies X into the buffer (rounded up to the alignment
     SAROUNDUP computes) only when its RTA_* bit is set in rtm_addrs,
     advancing the write cursor pnt. */
#define SOCKADDRSET(X,R) \
  if (msg.rtm.rtm_addrs & (R)) \
    { \
      int len = SAROUNDUP (X); \
      memcpy (pnt, (caddr_t)(X), len); \
      pnt += len; \
    }

  pnt = (caddr_t) msg.buf;

  /* Write each socket data into rtm message buffer */
  SOCKADDRSET (dest, RTA_DST);
  SOCKADDRSET (gate, RTA_GATEWAY);
  SOCKADDRSET (mask, RTA_NETMASK);

  msg.rtm.rtm_msglen = pnt - (caddr_t) &msg;

  /* Hand the message to the kernel with a plain write(). */
  ret = write (routing_sock, &msg, msg.rtm.rtm_msglen);

  /* Map a short/failed write onto a zebra error code. */
  if (ret != msg.rtm.rtm_msglen) 
    {
      if (errno == EEXIST) 
	return ZEBRA_ERR_RTEXIST;
      if (errno == ENETUNREACH)
	return ZEBRA_ERR_RTUNREACH;
      if (errno == ESRCH)
	return ZEBRA_ERR_RTNOEXIST;
      
      zlog_warn ("%s: write : %s (%d)", __func__, safe_strerror (errno), errno);
      return ZEBRA_ERR_KERNEL;
    }
  return ZEBRA_ERR_NOERROR;
}

根据上述分析,我们大概知道了netlink消息通过nlmsghdr,rtmsg和buf下发路由配置,而socket通过rt_msghdr和buf来下发路由配置。netlink消息之所以分为nlmsghdr和rtmsg两个部分,是因为nlmsghdr是netlink通用信息头,而rtmsg才是路由相关头信息。不论哪种方式,最终的路由配置记录都存放在后面的buf中,通过前面的头信息中的len字段来标识buf的边界。

虽然上面分析的是 Quagga,但是后面fork出的 Frr 和上面的实现也差不多,这里就不展开对比了。需要注意的一点是,不管是 Quagga 还是 Frr,在使用socket类型将路由信息写入内核的时候,都没有通过read/recv读取返回状态信息,而netlink方式会通过recvmsg读取返回信息。

我们再看看两种socket的初始化流程。

netlink套接字的初始化代码:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
/* Make socket for Linux netlink interface.
 * Creates an AF_NETLINK/NETLINK_ROUTE raw socket in the given VRF, binds
 * it to the requested multicast groups, and records the kernel-assigned
 * address plus the fd in *nl.  Returns getsockname()'s result (0) on
 * success, -1 on any failure. */
static int
netlink_socket (struct nlsock *nl, unsigned long groups, vrf_id_t vrf_id)
{
  int ret;
  struct sockaddr_nl snl;
  int sock;
  int namelen;
  int save_errno;

  if (zserv_privs.change (ZPRIVS_RAISE))
    {
      zlog (NULL, LOG_ERR, "Can't raise privileges");
      return -1;
    }

  /* AF_NETLINK raw socket with protocol NETLINK_ROUTE. */
  sock = vrf_socket (AF_NETLINK, SOCK_RAW, NETLINK_ROUTE, vrf_id);
  if (sock < 0)
    {
      zlog (NULL, LOG_ERR, "Can't open %s socket: %s", nl->name,
            safe_strerror (errno));
      /* Fix: drop the elevated privileges on this error path too; the
         original returned while still privileged. */
      if (zserv_privs.change (ZPRIVS_LOWER))
        zlog (NULL, LOG_ERR, "Can't lower privileges");
      return -1;
    }

  memset (&snl, 0, sizeof snl);
  snl.nl_family = AF_NETLINK;
  snl.nl_groups = groups;

  /* Bind the socket to the netlink structure for anything. */
  ret = bind (sock, (struct sockaddr *) &snl, sizeof snl);
  /* Save errno before the privilege drop can clobber it. */
  save_errno = errno;
  if (zserv_privs.change (ZPRIVS_LOWER))
    zlog (NULL, LOG_ERR, "Can't lower privileges");

  if (ret < 0)
    {
      zlog (NULL, LOG_ERR, "Can't bind %s socket to group 0x%x: %s",
            nl->name, snl.nl_groups, safe_strerror (save_errno));
      close (sock);
      return -1;
    }

  /* multiple netlink sockets will have different nl_pid */
  namelen = sizeof snl;
  ret = getsockname (sock, (struct sockaddr *) &snl, (socklen_t *) &namelen);
  if (ret < 0 || namelen != sizeof snl)
    {
      zlog (NULL, LOG_ERR, "Can't get %s socket name: %s", nl->name,
            safe_strerror (errno));
      close (sock);
      return -1;
    }

  nl->snl = snl;
  nl->sock = sock;
  return ret;
}

socket套接字的初始化代码:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
/* Make routing socket. */
static void
routing_socket (struct zebra_vrf *zvrf)
{
  /* Only the default VRF gets a routing socket. */
  if (zvrf->vrf_id != VRF_DEFAULT)
    return;

  if ( zserv_privs.change (ZPRIVS_RAISE) )
    zlog_err ("routing_socket: Can't raise privileges");

  /* AF_ROUTE raw socket, protocol 0. */
  routing_sock = socket (AF_ROUTE, SOCK_RAW, 0);

  if (routing_sock < 0) 
    {
      if ( zserv_privs.change (ZPRIVS_LOWER) )
        zlog_err ("routing_socket: Can't lower privileges");
      zlog_warn ("Can't init kernel routing socket");
      return;
    }

  /* XXX: Socket should be NONBLOCK, however as we currently 
   * discard failed writes, this will lead to inconsistencies.
   * For now, socket must be blocking.
   */
  /*if (fcntl (routing_sock, F_SETFL, O_NONBLOCK) < 0) 
    zlog_warn ("Can't set O_NONBLOCK to routing socket");*/
    
  if ( zserv_privs.change (ZPRIVS_LOWER) )
    zlog_err ("routing_socket: Can't lower privileges");

  /* kernel_read needs rewrite. */
  thread_add_read (zebrad.master, kernel_read, NULL, routing_sock);
}

需要补充的一点是,在Linux下,AF_ROUTE实际使用的就是AF_NETLINK。但是你并不能使用这种套接字兼容UNIX下的route消息格式rt_msghdr。

1
2
/* From the Linux socket headers: AF_ROUTE is merely an alias for
   AF_NETLINK, so the BSD rt_msghdr wire format is NOT accepted. */
#define AF_NETLINK	16
#define AF_ROUTE	AF_NETLINK /* Alias to emulate 4.4BSD */
This post is licensed under CC BY 4.0 by the author.