blob: 5b0753c80d3cf190368dd98f20792adaddbdba0b [file] [log] [blame]
/*
* This file contains pieces of the Linux TCP/IP stack needed for modular
* TOE support.
*
* Copyright (C) 2006-2009 Chelsio Communications. All rights reserved.
* See the corresponding files in the Linux tree for copyrights of the
* original Linux code a lot of this file is based on.
*
* Additional code written by Dimitris Michailidis (dm@chelsio.com)
*
* This program is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the LICENSE file included in this
* release for licensing terms and conditions.
*/
/* The following tags are used by the out-of-kernel Makefile to identify
* supported kernel versions if a module_support-<kver> file is not found.
* Do not remove these tags.
* $SUPPORTED KERNEL 2.6.20$
*/
#include <net/tcp.h>
#include <linux/random.h>
#include "toe_compat.h"
#include <linux/toedev.h>
#include <net/inet_common.h>
#include <net/offload.h>
#include <linux/mm.h>
#include <linux/kprobes.h>
#include <linux/sunrpc/xprt.h>
#include <asm/page.h>
#include <asm/cacheflush.h>
#include <linux/highmem.h>
#include "toe_iscsi.h"
/*
* Copy of tcp_prot taken by prepare_tcp_for_offload() before the hash, unhash
* and connect methods are overridden. The offload wrappers chain to the
* methods saved here and restore_tcp_to_nonoffload() puts them back.
*/
static struct proto orig_tcp_prot;
/* Address of kallsyms_lookup_name(), set by find_kallsyms_lookup_name(). */
static unsigned long (*kallsyms_lookup_name_p)(const char *name);
/* Enable TCP options by default in case we can't locate the actual sysctls. */
static int tcp_options_sysctl = 1;
/*
* Each of these initially points at tcp_options_sysctl; FIND_SYSCTL()
* redirects a pointer only when the corresponding sysctl_tcp_* symbol is found.
*/
int *sysctl_tcp_timestamps_p = &tcp_options_sysctl;
int *sysctl_tcp_sack_p = &tcp_options_sysctl;
int *sysctl_tcp_window_scaling_p = &tcp_options_sysctl;
/*
* The next few definitions track the data_ready callbacks for RPC and iSCSI.
* They are (re)resolved by name in find_rpc_iscsi_callbacks(); any of the
* lookups may fail, so users must cope with unresolved entries.
*/
static void (*iscsi_tcp_data_ready_p)(struct sock *sk, int bytes);
static void (*iscsi_sw_tcp_data_ready_p)(struct sock *sk, int bytes);
static sk_read_actor_t iscsi_tcp_recv_p;
static sk_read_actor_t iscsi_sw_tcp_recv_p;
static sk_read_actor_t iscsi_tcp_data_recv_p;
static void (*xs_tcp_data_ready_p)(struct sock *sk, int bytes);
static sk_read_actor_t xs_tcp_data_recv_p;
/*
* The next two definitions provide a replacement for route.h:rt_get_peer(),
* which is not exported to modules.
*/
static void (*rt_bind_peer_p)(struct rtable *rt, int create);

/*
 * Return the inet_peer currently attached to @rt.  When none is attached and
 * rt_bind_peer() was located, it is invoked first with create == 0; the
 * result is whatever rt->peer holds afterwards (possibly still NULL).
 */
static inline struct inet_peer *rt_get_peer_offload(struct rtable *rt)
{
	if (!rt->peer && rt_bind_peer_p)
		rt_bind_peer_p(rt, 0);

	return rt->peer;
}
static void find_rpc_iscsi_callbacks(void)
{
/* All of these may fail since RPC/iSCSI may not be loaded */
iscsi_tcp_data_ready_p =
(void *)kallsyms_lookup_name_p("iscsi_tcp_data_ready");
iscsi_sw_tcp_data_ready_p =
(void *)kallsyms_lookup_name_p("iscsi_sw_tcp_data_ready");
iscsi_tcp_recv_p = (void *)kallsyms_lookup_name_p("iscsi_tcp_recv");
iscsi_sw_tcp_recv_p =
(void *)kallsyms_lookup_name_p("iscsi_sw_tcp_recv");
iscsi_tcp_data_recv_p =
(void *)kallsyms_lookup_name_p("iscsi_tcp_data_recv");
xs_tcp_data_ready_p =
(void *)kallsyms_lookup_name_p("xs_tcp_data_ready");
xs_tcp_data_recv_p = (void *)kallsyms_lookup_name_p("xs_tcp_data_recv");
}
/*
* Exported forwarder to security_inet_conn_established(); passes @sk and @skb
* through unchanged.
* NOTE(review): presumably exists so that modules can reach a hook that is
* not itself exported — confirm.
*/
void security_inet_conn_estab(struct sock *sk, struct sk_buff *skb)
{
security_inet_conn_established(sk, skb);
}
EXPORT_SYMBOL(security_inet_conn_estab);
/*
* The functions below replace some of the original methods of tcp_prot to
* support offloading.
*/
/*
 * tcp_prot.hash replacement: run the saved original hash method, then start
 * listen offload if the socket is in the LISTEN state afterwards.
 */
static void tcp_v4_hash_offload(struct sock *sk)
{
	orig_tcp_prot.hash(sk);

	if (sk->sk_state != TCP_LISTEN)
		return;

	start_listen_offload(sk);
}
/*
 * tcp_prot.unhash replacement: stop listen offload for a socket that is in
 * the LISTEN state, then hand over to the saved original unhash method.
 */
static void tcp_unhash_offload(struct sock *sk)
{
	const int listening = (sk->sk_state == TCP_LISTEN);

	if (listening) {
		stop_listen_offload(sk);
	}

	orig_tcp_prot.unhash(sk);
}
/*
* Replacement for tcp_prot.connect (installed by prepare_tcp_for_offload()).
* Validates the destination address, resolves a route, fills in the socket's
* addresses, moves the socket to SYN-SENT and hashes it, and then gives
* tcp_connect_offload() the first chance to handle the connection. Only when
* that returns zero is the regular tcp_connect() path taken.
*
* @sk: socket being connected
* @uaddr: destination, interpreted as a struct sockaddr_in
* @addr_len: size of the buffer behind @uaddr
*
* Returns 0 on success. Fails with -EINVAL (address too short, or source
* routing with a zero destination), -EAFNOSUPPORT (family is not AF_INET),
* -ENETUNREACH (route is multicast/broadcast), or the error handed back by
* ip_route_connect(), inet_hash_connect(), ip_route_newports() or
* tcp_connect().
*/
static int tcp_v4_connect_offload(struct sock *sk, struct sockaddr *uaddr,
int addr_len)
{
struct inet_sock *inet = inet_sk(sk);
struct tcp_sock *tp = tcp_sk(sk);
struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
struct rtable *rt;
u32 daddr, nexthop;
int tmp;
int err;
if (addr_len < sizeof(struct sockaddr_in))
return -EINVAL;
if (usin->sin_family != AF_INET)
return -EAFNOSUPPORT;
nexthop = daddr = usin->sin_addr.s_addr;
/* With a source-route option the first hop comes from the option. */
if (inet->opt && inet->opt->srr) {
if (!daddr)
return -EINVAL;
nexthop = inet->opt->faddr;
}
tmp = ip_route_connect(&rt, nexthop, inet->saddr,
RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
IPPROTO_TCP,
inet->sport, usin->sin_port, sk);
if (tmp < 0)
return tmp;
if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
ip_rt_put(rt);
return -ENETUNREACH;
}
/* Without source routing, take the destination from the route. */
if (!inet->opt || !inet->opt->srr)
daddr = rt->rt_dst;
/* Unbound socket: adopt the source address chosen by the route. */
if (!inet->saddr)
inet->saddr = rt->rt_src;
inet->rcv_saddr = inet->saddr;
if (tp->rx_opt.ts_recent_stamp && inet->daddr != daddr) {
/* Reset inherited state */
tp->rx_opt.ts_recent = 0;
tp->rx_opt.ts_recent_stamp = 0;
tp->write_seq = 0;
}
if (tcp_death_row.sysctl_tw_recycle &&
!tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
struct inet_peer *peer = rt_get_peer_offload(rt);
/* VJ's idea. We save last timestamp seen from
* the destination in peer table, when entering state TIME-WAIT
* and initialize rx_opt.ts_recent from it, when trying new
* connection.
*/
if (peer &&
peer->tcp_ts_stamp + TCP_PAWS_MSL >= xtime.tv_sec) {
tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
tp->rx_opt.ts_recent = peer->tcp_ts;
}
}
inet->dport = usin->sin_port;
inet->daddr = daddr;
/* Extension header length is the IP option length, or 0 without options. */
inet_csk(sk)->icsk_ext_hdr_len = 0;
if (inet->opt)
inet_csk(sk)->icsk_ext_hdr_len = inet->opt->optlen;
tp->rx_opt.mss_clamp = 536;
/* Socket identity is still unknown (sport may be zero).
* However, we set the state to SYN-SENT and, without releasing the
* socket lock, select a source port, enter ourselves into the hash
* tables, and complete initialization after this.
*/
tcp_set_state(sk, TCP_SYN_SENT);
err = inet_hash_connect(&tcp_death_row, sk);
if (err)
goto failure;
err = ip_route_newports(&rt, IPPROTO_TCP, inet->sport,
inet->dport, sk);
if (err)
goto failure;
/* OK, now commit destination to socket. */
sk->sk_gso_type = SKB_GSO_TCPV4;
sk_setup_caps(sk, &rt->u.dst);
/*
* Offload hook: a nonzero return ends the function with success and skips
* the software connect below.
* NOTE(review): presumably nonzero means an offload device took over the
* connection — confirm against tcp_connect_offload().
*/
if (tcp_connect_offload(sk))
return 0;
if (!tp->write_seq)
tp->write_seq = secure_tcp_sequence_number(inet->saddr,
inet->daddr,
inet->sport,
usin->sin_port);
inet->id = tp->write_seq ^ jiffies;
err = tcp_connect(sk);
/*
* Clear rt so the failure path below passes NULL to ip_rt_put().
* NOTE(review): presumably the route reference belongs to the socket after
* sk_setup_caps() — confirm.
*/
rt = NULL;
if (err)
goto failure;
return 0;
failure:
/* This unhashes the socket and releases the local port,
if necessary */
tcp_set_state(sk, TCP_CLOSE);
ip_rt_put(rt);
sk->sk_route_caps = 0;
inet->dport = 0;
return err;
}
/*
 * Socket-level sendpage entry point installed in place of tcp_sendpage().
 * Dispatches to the socket protocol's own sendpage method when one is set,
 * otherwise falls back to the stock tcp_sendpage().
 */
ssize_t tcp_sendpage_offload(struct socket *sock, struct page *page,
			     int offset, size_t size, int flags)
{
	struct sock *sk = sock->sk;

	if (!sk->sk_prot->sendpage)
		return tcp_sendpage(sock, page, offset, size, flags);

	return sk->sk_prot->sendpage(sk, page, offset, size, flags);
}
EXPORT_SYMBOL(tcp_sendpage_offload);
/*
 * Socket-level sendmsg entry point installed in place of tcp_sendmsg().
 * Dispatches to the socket protocol's own sendmsg method when one is set,
 * otherwise falls back to the stock tcp_sendmsg(), which is handed the
 * struct socket pointer through a void cast.
 */
int tcp_sendmsg_offload(struct kiocb *iocb, struct socket *sock,
			struct msghdr *msg,
			size_t size)
{
	struct sock *sk = sock->sk;

	if (!sk->sk_prot->sendmsg)
		return tcp_sendmsg(iocb, (void *)sock, msg, size);

	return sk->sk_prot->sendmsg(iocb, sk, msg, size);
}
EXPORT_SYMBOL(tcp_sendmsg_offload);
/*
* Nonzero once prepare_tcp_for_offload() has patched the TCP methods;
* cleared again by restore_tcp_to_nonoffload().
*/
static int offload_enabled;
#ifdef CONFIG_DEBUG_RODATA
/*
* Private copy of inet_stream_ops whose sendmsg/sendpage point at the offload
* wrappers; filled in by prepare_tcp_for_offload() and swapped into sockets
* by offload_socket_ops().
*/
static struct proto_ops offload_inet_stream_ops;
/*
 * Switch @sk's struct socket over to offload_inet_stream_ops.  Nothing is
 * changed when the sock has no socket attached or when the socket is not
 * currently using inet_stream_ops.
 */
void offload_socket_ops(struct sock *sk)
{
	struct socket *sock = sk->sk_socket;

	if (!sock)
		return;
	if (sock->ops != &inet_stream_ops)
		return;

	sock->ops = &offload_inet_stream_ops;
}
/*
 * Inverse of offload_socket_ops(): point @sk's struct socket back at
 * inet_stream_ops.  Nothing is changed when the sock has no socket attached
 * or when the socket is not currently using offload_inet_stream_ops.
 */
void restore_socket_ops(struct sock *sk)
{
	struct socket *sock = sk->sk_socket;

	if (!sock)
		return;
	if (sock->ops != &offload_inet_stream_ops)
		return;

	sock->ops = &inet_stream_ops;
}
EXPORT_SYMBOL(restore_socket_ops);
/*
* walk_listens() callback used by prepare_tcp_for_offload(): switches the
* visited socket to the offload socket ops. @dummy is unused; always
* returns 0.
*/
static int offload_listen_cb(void *dummy, struct sock *sk)
{
offload_socket_ops(sk);
return 0;
}
/*
* walk_listens() callback used by restore_tcp_to_nonoffload(): switches the
* visited socket back to the regular socket ops. @dummy is unused; always
* returns 0.
*/
static int restore_listen_cb(void *dummy, struct sock *sk)
{
restore_socket_ops(sk);
return 0;
}
#endif
/*
 * Locate kallsyms_lookup_name() and store its address in
 * kallsyms_lookup_name_p.
 *
 * With KPROBES_KALLSYMS defined the address is obtained by registering (and
 * immediately unregistering) a kprobe on the symbol; otherwise the
 * build-time KALLSYMS_LOOKUP value is used.
 *
 * Returns 0 on success, the error from register_kprobe() if the probe could
 * not be registered, or -ENOENT if the resulting address is NULL.  All
 * failures follow the kernel 0/-errno convention so the value can be
 * propagated as-is by prepare_tcp_for_offload().
 */
static int find_kallsyms_lookup_name(void)
{
#if defined(KPROBES_KALLSYMS)
	struct kprobe kp;
	int err;

	memset(&kp, 0, sizeof kp);
	kp.symbol_name = "kallsyms_lookup_name";
	err = register_kprobe(&kp);
	if (err)
		return err;

	/* Registration resolved the symbol; we only need its address. */
	kallsyms_lookup_name_p = (void *)kp.addr;
	unregister_kprobe(&kp);
#else
	kallsyms_lookup_name_p = (void *)KALLSYMS_LOOKUP;
#endif
	if (!kallsyms_lookup_name_p)
		return -ENOENT;

	return 0;
}
/*
* FIND_SYSCTL(name): look up the symbol "sysctl_tcp_<name>" through
* kallsyms_lookup_name_p and, only if the lookup yields a non-NULL address,
* make sysctl_tcp_<name>_p point at it. On failure the pointer keeps its
* previous value. Requires kallsyms_lookup_name_p to be resolved.
*/
#define FIND_SYSCTL(name) do { \
int *p = (void *)kallsyms_lookup_name_p("sysctl_tcp_" # name); \
if (p) \
sysctl_tcp_ ## name ## _p = p; \
} while (0)
int prepare_tcp_for_offload(void)
{
if (offload_enabled) /* already done */
return 0;
if (!kallsyms_lookup_name_p) {
int err = find_kallsyms_lookup_name();
if (err)
return err;
}
/*
* rt_bind_peer is not a critical function, it's ok if we are unable
* to locate it.
*/
rt_bind_peer_p = (void *)kallsyms_lookup_name_p("rt_bind_peer");
#ifdef CONFIG_DEBUG_RODATA
offload_inet_stream_ops = inet_stream_ops;
offload_inet_stream_ops.sendmsg = tcp_sendmsg_offload;
offload_inet_stream_ops.sendpage = tcp_sendpage_offload;
walk_listens(NULL, offload_listen_cb);
#else
{
struct proto_ops *iso = (struct proto_ops *)&inet_stream_ops;
iso->sendmsg = tcp_sendmsg_offload;
iso->sendpage = tcp_sendpage_offload;
}
#endif
orig_tcp_prot = tcp_prot;
tcp_prot.hash = tcp_v4_hash_offload;
tcp_prot.unhash = tcp_unhash_offload;
tcp_prot.connect = tcp_v4_connect_offload;
offload_enabled = 1;
return 0;
}
void restore_tcp_to_nonoffload(void)
{
if (offload_enabled) {
#ifdef CONFIG_DEBUG_RODATA
walk_listens(NULL, restore_listen_cb);
#else
{
struct proto_ops *iso;
iso = (struct proto_ops *)&inet_stream_ops;
iso->sendmsg = tcp_sendmsg;
iso->sendpage = tcp_sendpage;
}
#endif
tcp_prot.hash = orig_tcp_prot.hash;
tcp_prot.unhash = orig_tcp_prot.unhash;
tcp_prot.connect = orig_tcp_prot.connect;
offload_enabled = 0;
}
}
/*
 * read_sock dispatcher: sockets flagged SOCK_OFFLOADED use the read_sock
 * method of their protocol (viewed as a struct sk_ofld_proto); all other
 * sockets go through tcp_read_sock().  Returns whatever the chosen
 * implementation returns.
 */
static inline int ofld_read_sock(struct sock *sk, read_descriptor_t *desc,
				 sk_read_actor_t recv_actor)
{
	const struct sk_ofld_proto *ofld;

	if (!sock_flag(sk, SOCK_OFFLOADED))
		return tcp_read_sock(sk, desc, recv_actor);

	ofld = (void *)sk->sk_prot;
	return ofld->read_sock(sk, desc, recv_actor);
}
/*
 * Replacement for RPC's ->data_ready callback.
 *
 * Under sk_callback_lock (read side), fetches the transport from
 * sk_user_data and, unless it is missing or shutting down, reads from the
 * socket via ofld_read_sock() using the resolved xs_tcp_data_recv actor.
 * @bytes is not used.
 */
static void xs_ofld_tcp_data_ready(struct sock *sk, int bytes)
{
	struct rpc_xprt *xprt;
	read_descriptor_t rd_desc;

	read_lock(&sk->sk_callback_lock);

	xprt = sk->sk_user_data;
	if (xprt && !xprt->shutdown) {
		/* rd_desc carries the transport to xs_tcp_data_recv */
		rd_desc.arg.data = xprt;
		rd_desc.count = 65536;
		ofld_read_sock(sk, &rd_desc, xs_tcp_data_recv_p);
	}

	read_unlock(&sk->sk_callback_lock);
}
/*
 * Local copy of iSCSI's iscsi_tcp_segment_unmap(): if the segment has an
 * atomic mapping, release it and clear the mapping and data pointers.
 */
static inline void iscsi_tcp_segment_unmap(struct iscsi_segment *segment)
{
	if (!segment->sg_mapped)
		return;

	kunmap_atomic(segment->sg_mapped, KM_SOFTIRQ0);
	segment->sg_mapped = NULL;
	segment->data = NULL;
}
/*
 * Replacement for iSCSI's ->data_ready callback (iscsi_tcp_recv variant).
 *
 * Reads from the socket through ofld_read_sock() with the resolved
 * iscsi_tcp_recv actor while holding sk_callback_lock for reading, then
 * unmaps the connection's inbound segment.  @bytes is not used.
 */
static void iscsi_ofld_tcp_data_ready_0(struct sock *sk, int bytes)
{
	struct iscsi_conn *iconn = sk->sk_user_data;
	struct iscsi_tcp_conn *tconn = iconn->dd_data;
	read_descriptor_t desc;

	read_lock(&sk->sk_callback_lock);

	desc.count = 1;
	desc.arg.data = iconn;	/* handed to the read actor */
	ofld_read_sock(sk, &desc, iscsi_tcp_recv_p);

	read_unlock(&sk->sk_callback_lock);

	iscsi_tcp_segment_unmap(&tconn->in.segment);
}
/*
 * Replacement for iSCSI's ->data_ready callback (iscsi_sw_tcp_recv variant).
 *
 * Reads from the socket through ofld_read_sock() with the resolved
 * iscsi_sw_tcp_recv actor while holding sk_callback_lock for reading, then
 * unmaps the connection's inbound segment.  @bytes is not used.
 */
static void iscsi_ofld_tcp_data_ready_2(struct sock *sk, int bytes)
{
	struct iscsi_conn *iconn = sk->sk_user_data;
	struct iscsi_tcp_conn *tconn = iconn->dd_data;
	read_descriptor_t desc;

	read_lock(&sk->sk_callback_lock);

	desc.count = 1;
	desc.arg.data = iconn;	/* handed to the read actor */
	ofld_read_sock(sk, &desc, iscsi_sw_tcp_recv_p);

	read_unlock(&sk->sk_callback_lock);

	iscsi_tcp_segment_unmap(&tconn->in.segment);
}
/*
 * Replacement for iSCSI's ->data_ready callback, old api
 * (iscsi_tcp_data_recv variant).
 *
 * Reads from the socket through ofld_read_sock() with the resolved
 * iscsi_tcp_data_recv actor while holding sk_callback_lock for reading.
 * @bytes is not used.
 */
static void iscsi_ofld_tcp_data_ready_1(struct sock *sk, int bytes)
{
	struct iscsi_conn *iconn = sk->sk_user_data;
	read_descriptor_t desc;

	read_lock(&sk->sk_callback_lock);

	desc.count = 1;
	desc.arg.data = iconn;	/* handed to the read actor */
	ofld_read_sock(sk, &desc, iscsi_tcp_data_recv_p);

	read_unlock(&sk->sk_callback_lock);
}
/*
 * If @sk currently uses one of the known RPC or iSCSI ->data_ready
 * callbacks, swap in the matching offload-aware replacement.
 *
 * Sockets without sk_user_data are left alone.  Otherwise the TCP option
 * sysctls and the RPC/iSCSI symbols are (re)resolved on every call before
 * the comparison is made.
 *
 * Returns 1 when sk_data_ready matched one of the known callbacks, 0
 * otherwise.
 * NOTE(review): for the iscsi_tcp_data_ready match, 1 is returned even when
 * neither read actor was resolved and sk_data_ready was therefore left
 * unchanged — confirm this is what callers expect.
 */
int install_special_data_ready(struct sock *sk)
{
	if (!sk->sk_user_data)
		return 0;

	/* sysctls are also best effort */
	FIND_SYSCTL(timestamps);
	FIND_SYSCTL(sack);
	FIND_SYSCTL(window_scaling);
	find_rpc_iscsi_callbacks();

	if (sk->sk_data_ready == xs_tcp_data_ready_p) {
		sk->sk_data_ready = xs_ofld_tcp_data_ready;
		return 1;
	}

	if (sk->sk_data_ready == iscsi_tcp_data_ready_p) {
		/* Pick the replacement that matches the available read actor. */
		if (iscsi_tcp_recv_p)
			sk->sk_data_ready = iscsi_ofld_tcp_data_ready_0;
		else if (iscsi_tcp_data_recv_p)
			sk->sk_data_ready = iscsi_ofld_tcp_data_ready_1;
		return 1;
	}

	if (sk->sk_data_ready == iscsi_sw_tcp_data_ready_p) {
		sk->sk_data_ready = iscsi_ofld_tcp_data_ready_2;
		return 1;
	}

	return 0;
}
EXPORT_SYMBOL(install_special_data_ready);
/*
 * Inverse of install_special_data_ready(): if @sk's ->data_ready is one of
 * the offload replacements defined in this file, put back the corresponding
 * resolved RPC/iSCSI callback.  Any other callback is left untouched.
 */
void restore_special_data_ready(struct sock *sk)
{
	void (*current_cb)(struct sock *sk, int bytes) = sk->sk_data_ready;

	if (current_cb == xs_ofld_tcp_data_ready) {
		sk->sk_data_ready = xs_tcp_data_ready_p;
	} else if (current_cb == iscsi_ofld_tcp_data_ready_0 ||
		   current_cb == iscsi_ofld_tcp_data_ready_1) {
		/* Both variants replace the same iSCSI callback. */
		sk->sk_data_ready = iscsi_tcp_data_ready_p;
	} else if (current_cb == iscsi_ofld_tcp_data_ready_2) {
		sk->sk_data_ready = iscsi_sw_tcp_data_ready_p;
	}
}
EXPORT_SYMBOL(restore_special_data_ready);