/*
* This file contains pieces of the Linux TCP/IP stack needed for modular
* TOE support.
*
* Copyright (C) 2006-2009 Chelsio Communications. All rights reserved.
 * See the corresponding files in the Linux tree for the copyrights of the
 * original Linux code that much of this file is based on.
*
* Additional code written by Dimitris Michailidis (dm@chelsio.com)
*
* This program is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the LICENSE file included in this
* release for licensing terms and conditions.
*/
/* The following tags are used by the out-of-kernel Makefile to identify
* supported kernel versions if a module_support-<kver> file is not found.
* Do not remove these tags.
* $SUPPORTED KERNEL 2.6.12$
*/
#ifndef AUTOCONF_INCLUDED
#include <linux/autoconf.h>
#endif
#include <net/tcp.h>
#include <linux/random.h>
#include <linux/kallsyms.h>
#include <linux/toedev.h>
#include <net/offload.h>
#include <linux/sunrpc/xprt.h>
static int toe_sysctl_tcp_tw_recycle = 0;
static struct proto orig_tcp_prot;
static struct proto_ops *inet_stream_ops;
/* Enable TCP options by default in case we can't locate the actual sysctls. */
static int tcp_options_sysctl = 1;
int *sysctl_tcp_timestamps_p = &tcp_options_sysctl;
int *sysctl_tcp_sack_p = &tcp_options_sysctl;
int *sysctl_tcp_window_scaling_p = &tcp_options_sysctl;
/* The next few definitions track the data_ready callbacks for RPC and iSCSI */
static void (*iscsi_tcp_data_ready_p)(struct sock *sk, int bytes);
static sk_read_actor_t iscsi_tcp_recv_p;
static void (*xs_tcp_data_ready_p)(struct sock *sk, int bytes);
static sk_read_actor_t xs_tcp_data_recv_p;
/*
* The next two definitions provide a replacement for route.h:rt_get_peer(),
* which is not exported to modules.
*/
static void (*rt_bind_peer_p)(struct rtable *rt, int create);
static inline struct inet_peer *rt_get_peer_offload(struct rtable *rt)
{
if (rt->peer)
return rt->peer;
if (rt_bind_peer_p)
rt_bind_peer_p(rt, 0);
return rt->peer;
}
static void find_rpc_iscsi_callbacks(void)
{
/* All of these may fail since RPC/iSCSI may not be loaded */
iscsi_tcp_data_ready_p =
(void *)kallsyms_lookup_name("iscsi_tcp_data_ready");
iscsi_tcp_recv_p = (void *)kallsyms_lookup_name("iscsi_tcp_recv");
xs_tcp_data_ready_p =
(void *)kallsyms_lookup_name("tcp_data_ready");
xs_tcp_data_recv_p = (void *)kallsyms_lookup_name("tcp_data_recv");
}
static int (*tcp_v4_hash_connect_p)(struct sock *sk);
static inline int tcp_v4_hash_connect_offload(struct sock *sk)
{
if (tcp_v4_hash_connect_p)
return tcp_v4_hash_connect_p(sk);
else
return -1;
}
static int (*__ip_route_output_key_p)(struct rtable **rp, const struct flowi *flp);
static inline int __ip_route_output_key_offload(struct rtable **rp,
const struct flowi *flp)
{
if (__ip_route_output_key_p)
return __ip_route_output_key_p(rp, flp);
else
return -1;
}
static int (*ip_route_output_flow_p)(struct rtable **rp, struct flowi *flp,
struct sock *sk, int flags);
static inline int ip_route_output_flow_offload(struct rtable **rp,
struct flowi *flp,
struct sock *sk, int flags)
{
if (ip_route_output_flow_p)
return ip_route_output_flow_p(rp, flp, sk, flags);
else
return -1;
}
static inline int ip_route_connect_offload(struct rtable **rp, u32 dst,
u32 src, u32 tos, int oif, u8 protocol,
u16 sport, u16 dport, struct sock *sk)
{
struct flowi fl = { .oif = oif,
.nl_u = { .ip4_u = { .daddr = dst,
.saddr = src,
.tos = tos } },
.proto = protocol,
.uli_u = { .ports =
{ .sport = sport,
.dport = dport } } };
int err;
if (!dst || !src) {
err = __ip_route_output_key_offload(rp, &fl);
if (err)
return err;
fl.fl4_dst = (*rp)->rt_dst;
fl.fl4_src = (*rp)->rt_src;
ip_rt_put(*rp);
*rp = NULL;
}
return ip_route_output_flow_offload(rp, &fl, sk, 0);
}
static inline int ip_route_newports_offload(struct rtable **rp, u16 sport,
u16 dport, struct sock *sk)
{
if (sport != (*rp)->fl.fl_ip_sport ||
dport != (*rp)->fl.fl_ip_dport) {
struct flowi fl;
memcpy(&fl, &(*rp)->fl, sizeof(fl));
fl.fl_ip_sport = sport;
fl.fl_ip_dport = dport;
ip_rt_put(*rp);
*rp = NULL;
return ip_route_output_flow_offload(rp, &fl, sk, 0);
}
return 0;
}
/*
* The functions below replace some of the original methods of tcp_prot to
* support offloading.
*/
static void tcp_v4_hash_offload(struct sock *sk)
{
orig_tcp_prot.hash(sk);
if (sk->sk_state == TCP_LISTEN)
start_listen_offload(sk);
}
static void tcp_unhash_offload(struct sock *sk)
{
if (sk->sk_state == TCP_LISTEN)
stop_listen_offload(sk);
orig_tcp_prot.unhash(sk);
}
static __inline__ int tcp_hashfn(__u32 laddr, __u16 lport,
__u32 faddr, __u16 fport)
{
int h = (laddr ^ lport) ^ (faddr ^ fport);
h ^= h >> 16;
h ^= h >> 8;
return h & (tcp_ehash_size - 1);
}
static __inline__ int tcp_sk_hashfn(struct sock *sk)
{
struct inet_sock *inet = inet_sk(sk);
__u32 laddr = inet->rcv_saddr;
__u16 lport = inet->num;
__u32 faddr = inet->daddr;
__u16 fport = inet->dport;
return tcp_hashfn(laddr, lport, faddr, fport);
}
static __inline__ void __tcp_v4_hash(struct sock *sk, const int listen_possible)
{
struct hlist_head *list;
rwlock_t *lock;
BUG_TRAP(sk_unhashed(sk));
if (listen_possible && sk->sk_state == TCP_LISTEN) {
list = &tcp_listening_hash[tcp_sk_listen_hashfn(sk)];
lock = &tcp_lhash_lock;
tcp_listen_wlock();
} else {
list = &tcp_ehash[(sk->sk_hashent = tcp_sk_hashfn(sk))].chain;
lock = &tcp_ehash[sk->sk_hashent].lock;
write_lock(lock);
}
__sk_add_node(sk, list);
sock_prot_inc_use(sk->sk_prot);
write_unlock(lock);
if (listen_possible && sk->sk_state == TCP_LISTEN)
wake_up(&tcp_lhash_wait);
}
static int __tcp_v4_check_established(struct sock *sk, __u16 lport,
struct tcp_tw_bucket **twp)
{
struct inet_sock *inet = inet_sk(sk);
u32 daddr = inet->rcv_saddr;
u32 saddr = inet->daddr;
int dif = sk->sk_bound_dev_if;
TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
__u32 ports = TCP_COMBINED_PORTS(inet->dport, lport);
int hash = tcp_hashfn(daddr, lport, saddr, inet->dport);
struct tcp_ehash_bucket *head = &tcp_ehash[hash];
struct sock *sk2;
struct hlist_node *node;
struct tcp_tw_bucket *tw;
write_lock(&head->lock);
/* Check TIME-WAIT sockets first. */
sk_for_each(sk2, node, &(head + tcp_ehash_size)->chain) {
tw = (struct tcp_tw_bucket *)sk2;
if (TCP_IPV4_TW_MATCH(sk2, acookie, saddr, daddr, ports, dif)) {
struct tcp_sock *tp = tcp_sk(sk);
			/* With PAWS, it is safe from the viewpoint
			   of data integrity. Even without PAWS it
			   is safe provided sequence spaces do not
			   overlap, i.e. at data rates <= 80Mbit/sec.
			   The idea is close to VJ's: the timestamp
			   cache is held not per host but per port
			   pair, and the TW bucket is used as the
			   state holder.
			   If the TW bucket has already been destroyed
			   we fall back to VJ's scheme and use the
			   initial timestamp retrieved from the peer
			   table.
			 */
if (tw->tw_ts_recent_stamp &&
(!twp || (sysctl_tcp_tw_reuse &&
xtime.tv_sec -
tw->tw_ts_recent_stamp > 1))) {
if ((tp->write_seq =
tw->tw_snd_nxt + 65535 + 2) == 0)
tp->write_seq = 1;
tp->rx_opt.ts_recent = tw->tw_ts_recent;
tp->rx_opt.ts_recent_stamp = tw->tw_ts_recent_stamp;
sock_hold(sk2);
goto unique;
} else
goto not_unique;
}
}
tw = NULL;
/* And established part... */
sk_for_each(sk2, node, &head->chain) {
if (TCP_IPV4_MATCH(sk2, acookie, saddr, daddr, ports, dif))
goto not_unique;
}
unique:
	/* Must record num and sport now. Otherwise we will see
	 * the socket in the hash table with a funny identity. */
inet->num = lport;
inet->sport = htons(lport);
sk->sk_hashent = hash;
BUG_TRAP(sk_unhashed(sk));
__sk_add_node(sk, &head->chain);
sock_prot_inc_use(sk->sk_prot);
write_unlock(&head->lock);
if (twp) {
*twp = tw;
NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
} else if (tw) {
/* Silly. Should hash-dance instead... */
tcp_tw_deschedule(tw);
NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
tcp_tw_put(tw);
}
return 0;
not_unique:
write_unlock(&head->lock);
return -EADDRNOTAVAIL;
}
static u32 (*secure_tcp_port_ephemeral_p)(__u32 saddr, __u32 daddr, __u16 dport);
static inline u32 secure_tcp_port_ephemeral_offload(__u32 saddr, __u32 daddr,
__u16 dport)
{
if (secure_tcp_port_ephemeral_p)
return secure_tcp_port_ephemeral_p(saddr, daddr, dport);
else
return -1;
}
static inline u32 connect_port_offset(const struct sock *sk)
{
const struct inet_sock *inet = inet_sk(sk);
return secure_tcp_port_ephemeral_offload(inet->rcv_saddr, inet->daddr,
inet->dport);
}
static inline int tcp_v4_hash_connect(struct sock *sk)
{
unsigned short snum = inet_sk(sk)->num;
struct tcp_bind_hashbucket *head;
struct tcp_bind_bucket *tb;
int ret;
if (!snum) {
int low = sysctl_local_port_range[0];
int high = sysctl_local_port_range[1];
int range = high - low;
int i;
int port;
static u32 hint;
		u32 offset = connect_port_offset(sk);
		struct hlist_node *node;
		struct tcp_tw_bucket *tw = NULL;

		/* connect_port_offset() returns (u32)-1 if the
		 * secure_tcp_port_ephemeral symbol was not found.
		 */
		if (offset == -1)
			return -EADDRNOTAVAIL;
		offset += hint;
local_bh_disable();
for (i = 1; i <= range; i++) {
port = low + (i + offset) % range;
head = &tcp_bhash[tcp_bhashfn(port)];
spin_lock(&head->lock);
/* Does not bother with rcv_saddr checks,
* because the established check is already
* unique enough.
*/
tb_for_each(tb, node, &head->chain) {
if (tb->port == port) {
BUG_TRAP(!hlist_empty(&tb->owners));
if (tb->fastreuse >= 0)
goto next_port;
if (!__tcp_v4_check_established(sk,
port,
&tw))
goto ok;
goto next_port;
}
}
tb = tcp_bucket_create(head, port);
if (!tb) {
spin_unlock(&head->lock);
break;
}
tb->fastreuse = -1;
goto ok;
next_port:
spin_unlock(&head->lock);
}
local_bh_enable();
return -EADDRNOTAVAIL;
ok:
hint += i;
/* Head lock still held and bh's disabled */
tcp_bind_hash(sk, tb, port);
if (sk_unhashed(sk)) {
inet_sk(sk)->sport = htons(port);
__tcp_v4_hash(sk, 0);
}
spin_unlock(&head->lock);
if (tw) {
tcp_tw_deschedule(tw);
tcp_tw_put(tw);
}
ret = 0;
goto out;
}
head = &tcp_bhash[tcp_bhashfn(snum)];
tb = tcp_sk(sk)->bind_hash;
spin_lock_bh(&head->lock);
if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
__tcp_v4_hash(sk, 0);
spin_unlock_bh(&head->lock);
return 0;
} else {
spin_unlock(&head->lock);
/* No definite answer... Walk to established hash table */
ret = __tcp_v4_check_established(sk, snum, NULL);
out:
local_bh_enable();
return ret;
}
}
static int tcp_v4_connect_offload(struct sock *sk, struct sockaddr *uaddr,
int addr_len)
{
struct inet_sock *inet = inet_sk(sk);
struct tcp_sock *tp = tcp_sk(sk);
struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
struct rtable *rt;
u32 daddr, nexthop;
int tmp;
int err;
if (addr_len < sizeof(struct sockaddr_in))
return -EINVAL;
if (usin->sin_family != AF_INET)
return -EAFNOSUPPORT;
nexthop = daddr = usin->sin_addr.s_addr;
if (inet->opt && inet->opt->srr) {
if (!daddr)
return -EINVAL;
nexthop = inet->opt->faddr;
}
tmp = ip_route_connect_offload(&rt, nexthop, inet->saddr,
RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
IPPROTO_TCP,
inet->sport, usin->sin_port, sk);
if (tmp < 0)
return tmp;
if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
ip_rt_put(rt);
return -ENETUNREACH;
}
if (!inet->opt || !inet->opt->srr)
daddr = rt->rt_dst;
if (!inet->saddr)
inet->saddr = rt->rt_src;
inet->rcv_saddr = inet->saddr;
if (tp->rx_opt.ts_recent_stamp && inet->daddr != daddr) {
/* Reset inherited state */
tp->rx_opt.ts_recent = 0;
tp->rx_opt.ts_recent_stamp = 0;
tp->write_seq = 0;
}
if (toe_sysctl_tcp_tw_recycle &&
!tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
struct inet_peer *peer = rt_get_peer_offload(rt);
/* VJ's idea. We save last timestamp seen from
* the destination in peer table, when entering state TIME-WAIT
* and initialize rx_opt.ts_recent from it, when trying new connection.
*/
if (peer && peer->tcp_ts_stamp + TCP_PAWS_MSL >= xtime.tv_sec) {
tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
tp->rx_opt.ts_recent = peer->tcp_ts;
}
}
inet->dport = usin->sin_port;
inet->daddr = daddr;
tp->ext_header_len = 0;
if (inet->opt)
tp->ext_header_len = inet->opt->optlen;
tp->rx_opt.mss_clamp = 536;
	/* Socket identity is still unknown (sport may be zero).
	 * However we set state to SYN-SENT and, without releasing the socket
	 * lock, select a source port, enter ourselves into the hash tables and
	 * complete initialization after this.
	 */
tcp_set_state(sk, TCP_SYN_SENT);
err = tcp_v4_hash_connect(sk);
if (err)
goto failure;
err = ip_route_newports_offload(&rt, inet->sport, inet->dport, sk);
if (err)
goto failure;
/* OK, now commit destination to socket. */
__sk_dst_set(sk, &rt->u.dst);
tcp_v4_setup_caps(sk, &rt->u.dst);
if (tcp_connect_offload(sk))
return 0;
if (!tp->write_seq)
tp->write_seq = secure_tcp_sequence_number(inet->saddr,
inet->daddr,
inet->sport,
usin->sin_port);
inet->id = tp->write_seq ^ jiffies;
err = tcp_connect(sk);
rt = NULL;
if (err)
goto failure;
return 0;
failure:
/* This unhashes the socket and releases the local port, if necessary. */
tcp_set_state(sk, TCP_CLOSE);
ip_rt_put(rt);
sk->sk_route_caps = 0;
inet->dport = 0;
return err;
}
ssize_t tcp_sendpage_offload(struct socket *sock, struct page *page,
int offset, size_t size, int flags)
{
struct sock *sk = sock->sk;
if (sk->sk_prot->sendpage)
return sk->sk_prot->sendpage(sk, page, offset, size, flags);
return tcp_sendpage(sock, page, offset, size, flags);
}
EXPORT_SYMBOL(tcp_sendpage_offload);
#define FIND_SYSCTL(name) do { \
	int *p = (void *)kallsyms_lookup_name("sysctl_tcp_" # name); \
if (p) \
sysctl_tcp_ ## name ## _p = p; \
} while (0)
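
/*
 * As an illustration only, FIND_SYSCTL(timestamps) expands roughly to
 *
 *	int *p = (void *)kallsyms_lookup_name("sysctl_tcp_timestamps");
 *	if (p)
 *		sysctl_tcp_timestamps_p = p;
 *
 * so a failed lookup simply leaves the pointer at its &tcp_options_sysctl
 * default declared near the top of this file.
 */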
int prepare_tcp_for_offload(void)
{
int err;
struct socket *sock;
if (inet_stream_ops) /* already done */
return 0;
err = sock_create_kern(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock);
if (err < 0) {
printk(KERN_ERR "Could not create TCP socket, error %d\n", err);
return err;
}
inet_stream_ops = (struct proto_ops *)sock->ops;
inet_stream_ops->sendpage = tcp_sendpage_offload;
sock_release(sock);
orig_tcp_prot = tcp_prot;
tcp_prot.hash = tcp_v4_hash_offload;
tcp_prot.unhash = tcp_unhash_offload;
tcp_prot.connect = tcp_v4_connect_offload;
__ip_route_output_key_p = (void *)kallsyms_lookup_name("__ip_route_output_key");
if (!__ip_route_output_key_p) {
printk(KERN_ERR "Could not locate __ip_route_output_key_p");
return -1;
}
ip_route_output_flow_p = (void *)kallsyms_lookup_name("ip_route_output_flow");
if (!ip_route_output_flow_p) {
printk(KERN_ERR "Could not locate ip_route_output_flow");
return -1;
}
secure_tcp_port_ephemeral_p = (void *)kallsyms_lookup_name("secure_tcp_port_ephemeral");
if (!secure_tcp_port_ephemeral_p) {
printk(KERN_ERR "Could not locate secure_tcp_port_ephemeral_p");
return -1;
}
	/*
	 * rt_bind_peer is not a critical function; it's OK if we are unable
	 * to locate it.
	 */
rt_bind_peer_p = (void *)kallsyms_lookup_name("rt_bind_peer");
return 0;
}
void restore_tcp_to_nonoffload(void)
{
if (inet_stream_ops) {
inet_stream_ops->sendpage = tcp_sendpage;
tcp_prot.hash = orig_tcp_prot.hash;
tcp_prot.unhash = orig_tcp_prot.unhash;
tcp_prot.connect = orig_tcp_prot.connect;
}
}
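
/*
 * Minimal usage sketch, illustrative only: a TOE driver would typically pair
 * these calls in its module init and exit paths.  The function names
 * offload_stub_init/offload_stub_exit below are hypothetical and not part of
 * this file.
 */
#if 0
static int __init offload_stub_init(void)
{
	/* Patch tcp_prot and inet_stream_ops before offloading any sockets. */
	return prepare_tcp_for_offload();
}

static void __exit offload_stub_exit(void)
{
	/* Put the original hash/unhash/connect and sendpage methods back. */
	restore_tcp_to_nonoffload();
}

module_init(offload_stub_init);
module_exit(offload_stub_exit);
#endif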
static inline int ofld_read_sock(struct sock *sk, read_descriptor_t *desc,
sk_read_actor_t recv_actor)
{
if (sock_flag(sk, SOCK_OFFLOADED)) {
const struct sk_ofld_proto *p = (void *)sk->sk_prot;
return p->read_sock(sk, desc, recv_actor);
}
return tcp_read_sock(sk, desc, recv_actor);
}
/* Replacement for RPC's ->data_ready callback */
static void xs_ofld_tcp_data_ready(struct sock *sk, int bytes)
{
struct rpc_xprt *xprt;
read_descriptor_t rd_desc;
read_lock(&sk->sk_callback_lock);
if (!(xprt = sk->sk_user_data))
goto out;
if (xprt->shutdown)
goto out;
/* We use rd_desc to pass struct xprt to xs_tcp_data_recv */
rd_desc.arg.data = xprt;
rd_desc.count = 65536;
ofld_read_sock(sk, &rd_desc, xs_tcp_data_recv_p);
out:
read_unlock(&sk->sk_callback_lock);
}
#if 0
/* Copy of iscsi_tcp_segment_unmap */
static inline void iscsi_tcp_segment_unmap(struct iscsi_segment *segment)
{
if (segment->sg_mapped) {
kunmap_atomic(segment->sg_mapped, KM_SOFTIRQ0);
segment->sg_mapped = NULL;
segment->data = NULL;
}
}
/* Replacement for iSCSI's ->data_ready callback */
static void iscsi_ofld_tcp_data_ready(struct sock *sk, int bytes)
{
struct iscsi_conn *conn = sk->sk_user_data;
struct iscsi_tcp_conn *tcp_conn = conn->dd_data;
read_descriptor_t rd_desc;
read_lock(&sk->sk_callback_lock);
rd_desc.arg.data = conn;
rd_desc.count = 1;
ofld_read_sock(sk, &rd_desc, iscsi_tcp_recv_p);
read_unlock(&sk->sk_callback_lock);
iscsi_tcp_segment_unmap(&tcp_conn->in.segment);
}
#endif
int install_special_data_ready(struct sock *sk)
{
if (!sk->sk_user_data)
return 0;
/* sysctls are also best effort */
FIND_SYSCTL(timestamps);
FIND_SYSCTL(sack);
FIND_SYSCTL(window_scaling);
find_rpc_iscsi_callbacks();
if (sk->sk_data_ready == xs_tcp_data_ready_p)
sk->sk_data_ready = xs_ofld_tcp_data_ready;
#if 0
else if (sk->sk_data_ready == iscsi_tcp_data_ready_p)
sk->sk_data_ready = iscsi_ofld_tcp_data_ready;
#endif
else
return 0;
return 1;
}
EXPORT_SYMBOL(install_special_data_ready);
void restore_special_data_ready(struct sock *sk)
{
if (sk->sk_data_ready == xs_ofld_tcp_data_ready)
sk->sk_data_ready = xs_tcp_data_ready_p;
#if 0
else if (sk->sk_data_ready == iscsi_ofld_tcp_data_ready)
sk->sk_data_ready = iscsi_tcp_data_ready_p;
#endif
}
EXPORT_SYMBOL(restore_special_data_ready);
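
/*
 * Usage sketch, illustrative only: install_special_data_ready() returns 1 if
 * an offload-aware ->data_ready was installed and 0 otherwise, and
 * restore_special_data_ready() undoes the substitution.  The callers below
 * are hypothetical.
 */
#if 0
static int example_offload_ulp_socket(struct sock *sk)
{
	/* Returns 1 if the RPC ->data_ready was replaced, 0 otherwise. */
	return install_special_data_ready(sk);
}

static void example_restore_ulp_socket(struct sock *sk)
{
	restore_special_data_ready(sk);
}
#endif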