User applications use the BSD Socket Interface to access network services. The BSD Interface is used to communicate between two or more application programs. Programs can use the file system (AF_UNIX), or the network (AF_INET) as the communication medium. When AF_INET is used, user data is handled by various network layers and then sent out onto the physical network. At the receiving side, the data travels upward through various network layers to the receiving user application.
There are various layers of code in the Linux kernel that user data travels through to reach the network. Generally it goes through the following:
BSD Layer (/net/socket.c)
INET Layer (/net/ipv4/af_inet.c)
TRANSPORT Layer
TCP (/net/ipv4/tcp*.c)
UDP (/net/ipv4/udp.c)
XTP (/net/ipv4/xtp*.c)
NETWORK Layer
IP (/net/ipv4/ip*.c)
?Routing?
DATA LINK Layer
??
Drivers
Ethernet
Drivers
The BSD Interface provides the following system calls to the user level application:
accept()
connect()
send()
sendto()
sendmsg()
recv()
recvfrom()
recvmsg()
getpeername()
getsockopt()
setsockopt()
shutdown()
close() [this is not part of BSD API but is used to terminate a BSD socket]
The BSD layer is coded in the /net/socket.c module. Each system call has a wrapper of the form sys_xxx, where xxx is the BSD system call. Generally these BSD functions simply point to functions in the next layer, INET. They take user-level arguments and translate them to kernel-level arguments. All BSD functions translate the file descriptor to the SOCKET data structure, do minor error checking and then call the corresponding inet_xxx procedure.
The SOCKET data structure contains general state information and pointers
to INET functions. All network (AF_INET) related details are below
this layer.
sock_create(family,type,protocol,&sock)
~~ allocates socket data structure
- does various checks
sock=sock_alloc()
~~ creates socket ds
netfamilies[family]->create(sock,protocol)
~~ points to inet_create()
get_fd(sock->inode)
~~ sets up fd that the user app uses
sock=sockfd_lookup(fd, err)
~~ finds socket data structure via fd
sock->ops->bind(sock,address,addrlen)
~~ points to inet_bind
sock=sockfd_lookup(fd, err)
~~ finds socket data structure via fd
sock->ops->listen(sock,backlog)
~~ points to inet_listen
sock=sockfd_lookup(fd, err)
newsock=sock_alloc()
sock->ops->dup(newsock,
sock)
~~ points to inet_dup; creates a duplicate socket entry
newsock->ops->accept(sock,
newsock, flags) ~~ points to inet_accept;
newsock=socki_lookup(inode)
~~ lookup via inode
get_fd(inode)
newsock->ops->getname(newsock,
address, &len) ~~ points to inet_getname; gets address
of accepted
socket and passes it back to the app
sockfd_lookup()
move_addr_to_kernel(...)
sock->ops->connect(sock,address,addrlen,flags) ~~ points to either inet_dgram_connect [for UDP and XTP] or inet_stream_connect [for TCP]
sockfd_lookup()
sock_sendmsg()
~~ see sys_sendmsg()
sockfd_lookup()
move addr to kernel space and
assign it to msghdr
sock_sendmsg()
~~ see sys_sendmsg()
copy data from user to kernel
space
sockfd_lookup()
sock_sendmsg
sock->ops->sendmsg(sock,msg,size) ~~ points to
inet_sendmsg
sock=sockfd_lookup(fd, err)
sock->ops->shutdown(sock,how)
~~ points to inet_shutdown
sockfd_put(sock)
sock_release(sock) ~~ found in /net/core/sock.c sock_release() sock->ops->release(sock,NULL) ~~ points to inet_release
The INET layer is coded in the /net/ipv4/af_inet.c module. Each inet function performs some minor "housekeeping" and then calls the transport-specific function via the prot data structure.
The SOCK data structure is the main structure for the transport
layer. However, it contains (via unions) or points to other data structures
that are more transport specific. For example, SOCK can point to
a tcp_opt data structure which has more detailed information about a TCP
connection. For XTP it points to an xtp_ctxt where all XTP context
data is kept. The SOCK and other transport-specific data structures
are in a state of flux as they gradually move to an even more modular
structure. (For example, SOCK still has much IP-level data.)
sock->state=SS_UNCONNECTED
sk=sk_alloc(GFP_KERNEL)
// assign transport protocol: TCP, UDP, XTP, RAW, PACKET
sk->no_check=NO_CHECK
sock_init_data(sock, sk) ~~ initialization
of sock ds members
sk->timer.function=&net_timer
sk->prot->init(sk)
~~ transport specific stuff, used only by TCP (tcp_v4_init_sock())
and XTP (xtp_init_sock())
// various checks
snum=ntohs(addr->sin_port)
if (snum==0)
snum=sk->prot->good_socknum()
~~ gets an unused port number
chk_addr_ret=...
// check if ip addr is local/ok
if (sk->prot->verify_bind(sk,snum))
error
sk->num=snum
sk->dummy_th.source=htons(snum)
sk->daddr=0
sk->dummy_th.dst=0
sk->prot->rehash(sk)
add_to_prot_sklist(sk)
dst_release(sk->dst_cache)
sk->dst_cache=NULL
// allow only stream and UNCONNECTED
inet_autobind(sk)
// checks on backlog parameter
sk->max_ack_backlog=backlog
if (state != TCP_LISTEN)
sk->ack_backlog=0
state=TCP_LISTEN
prot->rehash(sk)
add_to_prot_sklist(sk)
sk->socket->flags |= SO_ACCEPTCON
inet_autobind(sk)
sk->prot->connect(sk,uaddr,addrlen) ~~ points to udp_connect and xtp_connect
// various checks
inet_autobind(sk)
sk->prot->connect(sk,uaddr,uaddrlen) ~~ points to tcp_v4_connect
// sleep till connection is successful
// various checks
if ((sk2=sk1->prot->accept(sk1,flags)) == NULL) ~~ points to tcp_accept or xtp_accept
return error
sk2->sleep=newsock->sleep
newsock->sk=sk2
sk2->socket=newsock
newsk->socket=NULL
while (sk2->state==TCP_SYN_RECV)
sleep(.....)
.....
if (sk2->state == TCP_ESTABLISHED)
goto "success"
if (sk2->err > 0)
goto "err"
if (sk2->state == TCP_CLOSE)
goto "dobadconn"
success:
destroy_sock(newsk)
newsock->state=SS_CONNECTED
return 0
err:
err=sockerror(sk2)
return
dobadconn:
.....
inet_create(newsock, oldsock->sk->protocol) ~ wrapper to inet_create
sk=sock->sk
if (sock->state != SS_UNCONNECTED)
sock->state=SS_DISCONNECTING
sk->state_change(sk)
// if linger, timeout=1, else
=0
sock->sk=NULL
sk->socket=NULL
sk->prot->close(sk,timeout)
~~ points to transport xxx_close; each transport protocol must have a xxx_close
function [TCP: tcp_close, XTP: xtp_close]
// peer: get addr info for
local connection or peer connection
if (peer)
// get addr info from the socket you are connected to
else
// get your own (local) addr info
sk->prot->poll(sock,wait)
sk->prot->setsockopt(sock,
level, optname, optval, optlen)
sk->prot->getsockopt(sock, level, optname, optval, optlen)
sk=sock->sk
// various checks
sk->prot->shutdown(sk,
how)
initialize:
sk->receive_queue
sk->write_queue
sk->back_log
sk->error_queue
init_timer(sk->timer)
--various sk members
-- various callbacks
There are 3 main traditional transport protocols: TCP, UDP, and RAW. The Xpress Transport Protocol (XTP) is the latest addition to the Linux kernel.
// sets service type, error control, rate control, flow control
// sets xtp timers
// sets ???
sk->state=TCP_CLOSE
sk->dead=1
xtp_unhash(sk)
if (CT[sk->tp_pinfo.af_xtp.key.key] == NULL) ~~ destroy sk only if the xtp context is valid
return
destroy_sock(sk)
//various checks
TCP provides a reliable connection-oriented service.
// sets mostly sk->tp_pinfo.af_tcp
sk->priority=1
sk->state=TCP_CLOSE
sk->max_unacked=2048
sk->max_ack_backlog=SOMAXCONN
sk->mtu=576
sk->mss=536
sk->dummy_th.ack=1
sk->dummy_th.doff=sizeof(struct
tcphdr) >> 2
// various checks
ip_route_connect(......) ~~ sets up mac level stuff
// check for tcp_unique_address
buff=sock_wmalloc(sk, MAX_SYN_SIZE, 0, GFP_KERNEL) ~~ skbuff allocation
ip_build_header(buff,sk)
skb_put()
tp->... ~~ various tcp settings
// settings for mtu/mss
tcp_set_state()
tcp_v4_rehash(sk)
tcp_init_xmit_timers(sk)
skb_queue_tail(&sk->write_queue,buff)
skb1=skb_clone(buff,GFP_KERNEL)
ip_queue_xmit(skb1)
sk->shutdown |= SEND_SHUTDOWN
if (tcp_close_state(sk))
tcp_send_fin(sk)
release_sock(sk)
start_bh_atomic()
....
end_bh_atomic()
if (sk->state==TCP_LISTEN)
tcp_set_state(sk, TCP_CLOSE)
tcp_close_pending(sk)
release_sock(sk)
sk->prot->unhash(sk)
return
if (!sk->dead)
sk->state_change(sk)
// flush receive_queue
if (tcp_close_state(sk,1)==1)
tcp_send_fin(sk)
if (timeout)
....
sleep
....
if ( ...TCP_FIN_WAIT2
[zombie] )
set time
sk->dead=1
if (sk->state==TCP_CLOSE)
sk->prot->unhash(sk)
tcp_clean_xmit_timers(sk)
if (keepopen)
tcp_dec_slow_timer()
// flush write_queue
// flush out_of_order_queue
UDP provides an unreliable connectionless service.
// various checks
ip_route_connect(&rt, uaddr.addr, sk->saddr, sk->ip_tos|sk->localroute) ~~ sets up mac level stuff
if (rt->rt_flags & RTF_BROADCAST && !sk->broadcast)
ip_rt_put(rt)
return error
if (!sk->saddr)
sk->saddr=rt->rt_src
if (!sk->rcv_addr)
sk->rcv_addr=rt->rt_src
sk->daddr=rt->rt_dst
sk->dummy_th.dst=usin->sin_port
sk->state=TCP_ESTABLISHED
ip_rt_put(rt)
sk->state=TCP_CLOSE
sk->dead=1
udp_v4_unhash(sk)
destroy_sock(sk)
net_delete_timer(sk)
if (sk->prot->destroy)
sk->prot->destroy(sk)
kill_sk_queue(sk)
kill_sk_now(sk)
del_from_prot_list(sk)
dst_release(sk->dst_cache)
sk_free(sk);
The network layer protocol is the Internet Protocol (IP). The TRANSPORT layer uses the services of the NETWORK layer to send its data to the network. The NETWORK layer protocol is usually IP (Internet Protocol).
rr=ip_route_output(rp,dst,src,tos,NULL)
dst=(*rp)->rt_dst
src=(*rp)->rt_src
ip_rt_put(*rp)
*rp=NULL
return ip_route_output(...)