/*
* This file implements the Chelsio CPL5 message processing.
*
* Copyright (C) 2003-2010 Chelsio Communications. All rights reserved.
*
* Written by Dimitris Michailidis (dm@chelsio.com)
*
* This program is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the LICENSE file included in this
* release for licensing terms and conditions.
*/
#include "defs.h"
#include <linux/module.h>
#include <linux/vmalloc.h>
#include <linux/ip.h>
#include <linux/netdevice.h>
#include <linux/inetdevice.h>
#include <linux/toedev.h>
#include <linux/if_vlan.h>
#include <net/tcp.h>
#include <net/offload.h>
#include <net/route.h>
#include <asm/atomic.h>
#include "tom.h"
#include "cpl_io_state.h"
#include "t3_ddp.h"
#include "t3cdev.h"
#include "l2t.h"
#include "tcb.h"
#include "cxgb3_defs.h"
#include "cxgb3_ctl_defs.h"
#include "firmware_exports.h"
#include "trace.h"
#include "tom_compat.h"
#define DEBUG_WR 0
extern struct sk_ofld_proto t3_tcp_prot;
extern struct request_sock_ops t3_rsk_ops;
/*
* For ULP connections HW may add headers, e.g., for digests, that aren't part
* of the messages sent by the host but that are part of the TCP payload and
* therefore consume TCP sequence space. Tx connection parameters that
* operate in TCP sequence space are affected by the HW additions and need to
* compensate for them to accurately track TCP sequence numbers. This array
* contains the compensating extra lengths for ULP packets. It is indexed by
* a packet's ULP submode.
*/
const unsigned int t3_ulp_extra_len[] = {0, 4, 4, 8};
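/*
* For example, submodes 1 and 2 likely correspond to a single 4-byte digest
* (header or data) and submode 3 to both digests (8 bytes total); the exact
* mapping depends on the ULP's submode encoding.
*/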
/*
* TOS values for HW scheduling classes. If an offload policy assigns a
* connection to a class we use a value from this table as its TOS. These
* are special values and we do not otherwise use them as TOS.
*/
static const u8 sched_class_tos[] = {
0x30, 0x32, 0x34, 0x36, 0x31, 0x33, 0x35, 0x37
};
/*
* This sk_buff holds a fake header-only TCP segment that we use whenever we
* need to exploit SW TCP functionality that expects TCP headers, such as
* tcp_create_openreq_child(). It's a RO buffer that may be used by multiple
* CPUs without locking.
*/
static struct sk_buff *tcphdr_skb __read_mostly;
/*
* Size of WRs in bytes. Note that we assume all devices we are handling have
* the same WR size.
*/
static unsigned int wrlen __read_mostly;
/*
* The number of WRs needed for an skb depends on the number of page fragments
* in the skb and whether it has any payload in its main body. This maps the
* length of the gather list represented by an skb into the # of necessary WRs.
*/
static unsigned int skb_wrs[MAX_SKB_FRAGS + 2] __read_mostly;
/*
* Socket filter that drops everything by specifying a 0-length filter program.
*/
static struct sk_filter drop_all = { .refcnt = ATOMIC_INIT(1) };
/*
* TOE information returned through inet_diag for offloaded connections.
*/
struct t3_inet_diag_info {
u32 toe_id; /* determines how to interpret the rest of the fields */
u32 tid;
u8 wrs;
u8 queue;
u8 ulp_mode:4;
u8 sched_class:4;
u8 ddp_enabled;
char dev_name[TOENAMSIZ];
};
/*
* Similar to process_cpl_msg() but takes an extra socket reference around the
* call to the handler. Should be used if the handler may drop a socket
* reference.
*/
static inline void process_cpl_msg_ref(void (*fn)(struct sock *,
struct sk_buff *),
struct sock *sk, struct sk_buff *skb)
{
sock_hold(sk);
process_cpl_msg(fn, sk, skb);
sock_put(sk);
}
static inline int is_t3a(const struct toedev *dev)
{
return dev->ttid == TOE_ID_CHELSIO_T3;
}
/*
* Returns an sk_buff for a reply CPL message of size len. If the input
* sk_buff has no other users it is trimmed and reused, otherwise a new buffer
* is allocated. The input skb must be of size at least len. Note that this
* operation does not destroy the original skb data even if it decides to reuse
* the buffer.
*/
static struct sk_buff *get_cpl_reply_skb(struct sk_buff *skb, size_t len,
int gfp)
{
if (likely(!skb_cloned(skb))) {
BUG_ON(skb->len < len);
__skb_trim(skb, len);
skb_get(skb);
} else {
skb = alloc_skb(len, gfp);
if (skb)
__skb_put(skb, len);
}
return skb;
}
/*
* Like get_cpl_reply_skb() but the returned buffer starts out empty.
*/
static struct sk_buff *__get_cpl_reply_skb(struct sk_buff *skb, size_t len,
int gfp)
{
if (likely(!skb_cloned(skb) && !skb->data_len)) {
__skb_trim(skb, 0);
skb_get(skb);
} else
skb = alloc_skb(len, gfp);
return skb;
}
/*
* Determine whether to send a CPL message now or defer it. A message is
* deferred if the connection is in SYN_SENT since we don't know the TID yet.
* For connections in other states the message is sent immediately.
* If through_l2t is set the message is subject to ARP processing, otherwise
* it is sent directly.
*/
static inline void send_or_defer(struct sock *sk, struct tcp_sock *tp,
struct sk_buff *skb, int through_l2t)
{
struct t3cdev *cdev = T3C_DEV(sk);
struct cpl_io_state *cplios = CPL_IO_STATE(sk);
if (unlikely(sk->sk_state == TCP_SYN_SENT))
__skb_queue_tail(&tp->out_of_order_queue, skb); // defer
else if (through_l2t)
l2t_send(cdev, skb, cplios->l2t_entry); // send through L2T
else
cxgb3_ofld_send(cdev, skb); // send directly
}
/*
* Populate a TID_RELEASE WR. The skb must already be properly sized.
*/
static inline void mk_tid_release(struct sk_buff *skb, const struct sock *sk,
unsigned int tid)
{
struct cpl_tid_release *req;
skb->priority = mkprio(CPL_PRIORITY_SETUP, sk);
req = (struct cpl_tid_release *)__skb_put(skb, sizeof(*req));
req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_TID_RELEASE, tid));
}
/*
* Insert a socket to the TID table and take an extra reference.
*/
static inline void sk_insert_tid(struct tom_data *d, struct sock *sk,
unsigned int tid)
{
sock_hold(sk);
cxgb3_insert_tid(d->cdev, d->client, sk, tid);
}
/**
* find_best_mtu - find the entry in the MTU table closest to an MTU
* @d: the t3c_data holding the MTU table
* @mtu: the target MTU
*
* Returns the index of the value in the MTU table that is closest to but
* does not exceed the target MTU.
*/
static unsigned int find_best_mtu(const struct t3c_data *d, unsigned short mtu)
{
int i = 0;
while (i < d->nmtus - 1 && d->mtus[i + 1] <= mtu)
++i;
return i;
}
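/*
* Select the MSS for a connection: clamp the advertised MSS to the user
* setting, the path MTU, and the range of the HW MTU table, then return the
* index of the best matching MTU table entry (the 40 bytes account for the
* IP and TCP headers).
*/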
static unsigned int select_mss(struct sock *sk, unsigned int pmtu)
{
struct cpl_io_state *cplios = CPL_IO_STATE(sk);
unsigned int idx;
struct tcp_sock *tp = tcp_sk(sk);
struct dst_entry *dst = __sk_dst_get(sk);
struct tom_data *d = TOM_DATA(cplios->toedev);
const struct t3c_data *td = T3C_DATA(d->cdev);
tp->advmss = dst_metric(dst, RTAX_ADVMSS);
if (USER_MSS(tp) && tp->advmss > USER_MSS(tp))
tp->advmss = USER_MSS(tp);
if (tp->advmss > pmtu - 40)
tp->advmss = pmtu - 40;
if (tp->advmss < td->mtus[0] - 40)
tp->advmss = td->mtus[0] - 40;
idx = find_best_mtu(td, tp->advmss + 40);
tp->advmss = td->mtus[idx] - 40;
inet_csk(sk)->icsk_pmtu_cookie = pmtu;
return idx;
}
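/*
* Recompute the receive window for a connection and grow it if the socket's
* receive buffer has been enlarged.  The window is bounded by the HW limit
* (PR 5138 on pre-T3C parts) and is never shrunk here.
*/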
void t3_select_window(struct sock *sk, int request)
{
struct toedev *dev = CPL_IO_STATE(sk)->toedev;
struct tom_data *d = TOM_DATA(dev);
struct tcp_sock *tp = tcp_sk(sk);
unsigned int wnd = tp->rcv_wnd;
unsigned int max_rcv_wnd;
if ((tp->copied_seq - tp->rcv_wup) > (tp->rcv_wnd >> 1))
wnd = tp->advmss * (tp->rcv_wnd / tp->advmss) << 1;
wnd = max_t(unsigned int, wnd, tcp_full_space(sk));
wnd = max_t(unsigned int, request, wnd);
/* PR 5138 */
max_rcv_wnd = (dev->ttid < TOE_ID_CHELSIO_T3C ?
(u32)d->rx_page_size * 23 :
MAX_RCV_WND);
if (wnd > max_rcv_wnd)
wnd = max_rcv_wnd;
/*
* Check if we need to grow the receive window in response to an increase in
* the socket's receive buffer size. Some applications increase the buffer
* size dynamically and rely on the window to grow accordingly.
*/
if (wnd > tp->rcv_wnd) {
tp->rcv_wup -= wnd - tp->rcv_wnd;
tp->rcv_wnd = wnd;
/* Mark the receive window as updated */
cplios_reset_flag(sk, CPLIOS_UPDATE_RCV_WND);
}
}
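/*
* Pick a delayed-ACK mode for a connection based on the delack tunable, the
* MSS clamp, and the current receive window.
*/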
unsigned int t3_select_delack(struct sock *sk)
{
struct cpl_io_state *cplios = CPL_IO_STATE(sk);
struct tcp_sock *tp = tcp_sk(sk);
struct toedev *dev = cplios->toedev;
unsigned int dack_mode;
dack_mode = TOM_TUNABLE(dev, delack);
if (!dack_mode)
return 0;
if ((dack_mode == 2) && (MSS_CLAMP(tp) > 1680))
dack_mode = 3;
if ((dack_mode == 3) && (tp->rcv_wnd < 2 * 26880))
dack_mode = 1;
if ((dack_mode == 2) && (tp->rcv_wnd < 2 * 16 * MSS_CLAMP(tp)))
dack_mode = 1;
if ((dev->ttid >= TOE_ID_CHELSIO_T3C) && (cplios->delack_mode == 0) &&
(tp->rcv_wnd > 2 * 2 * MSS_CLAMP(tp)))
dack_mode = 1;
return dack_mode;
}
#if VALIDATE_TID
/*
* Returns true if a connection TID is in range and currently unused.
*/
static int valid_new_tid(const struct tid_info *t, unsigned int tid)
{
return tid < t->ntids && !t->tid_tab[tid].ctx;
}
#define VALIDATE_SOCK(sk) \
do { \
if (unlikely(!(sk))) \
return CPL_RET_UNKNOWN_TID | CPL_RET_BUF_DONE; \
} while (0)
#else
#define VALIDATE_SOCK(sk) do {} while (0)
#endif
/*
* Called when we receive the last message from HW for a connection. A
* connection cannot transition to TCP_CLOSE prior to this event.
* Resources related to the offload state of a connection (e.g., L2T entries)
* must have been relinquished prior to calling this.
*/
static void connection_done(struct sock *sk)
{
#if 0
printk("connection_done: TID: %u, state: %d, dead %d, refs %d\n",
CPL_IO_STATE(sk)->tid, sk->sk_state, sock_flag(sk, SOCK_DEAD),
atomic_read(&sk->sk_refcnt));
// dump_stack();
#endif
#ifdef T3_TRACE
T3_TRACE1(TIDTB(sk),
"connection_done: GTS rpl pending %d, if pending wake",
cplios_flag(sk, CPLIOS_ABORT_RPL_PENDING));
#endif
sk_wakeup_sleepers(sk, 0);
tcp_done(sk);
}
/*
* Min receive window. We want it to be large enough to accommodate receive
* coalescing, handle jumbo frames, and not trigger sender SWS avoidance.
*/
#define MIN_RCV_WND (24 * 1024U)
/*
* Determine the receive window scaling factor given a target max
* receive window.
*/
static inline int select_rcv_wscale(int space, int wscale_ok, int window_clamp)
{
int wscale = 0;
if (space > MAX_RCV_WND)
space = MAX_RCV_WND;
if (window_clamp && window_clamp < space)
space = window_clamp;
if (wscale_ok)
for (; space > 65535 && wscale < 14; space >>= 1, ++wscale) ;
return wscale;
}
/* Returns bits 2:7 of a socket's TOS field */
#define SK_TOS(sk) ((inet_sk(sk)->tos >> 2) & M_TOS)
/*
* The next two functions calculate the option 0 value for a socket.
*/
static inline unsigned int calc_opt0h(struct sock *sk)
{
struct cpl_io_state *cplios = CPL_IO_STATE(sk);
struct tcp_sock *tp = tcp_sk(sk);
return V_NAGLE((tp->nonagle & TCP_NAGLE_OFF) == 0) |
V_KEEP_ALIVE(sock_flag(sk, SOCK_KEEPOPEN) != 0) | F_TCAM_BYPASS |
V_WND_SCALE(RCV_WSCALE(tp)) | V_MSS_IDX(cplios->mtu_idx);
}
static inline unsigned int calc_opt0l(struct sock *sk)
{
struct cpl_io_state *cplios = CPL_IO_STATE(sk);
struct tcp_sock *tp = tcp_sk(sk);
unsigned int tos;
if (cplios->sched_cls < ARRAY_SIZE(sched_class_tos))
tos = sched_class_tos[cplios->sched_cls];
else {
tos = SK_TOS(sk);
if ((tos & 0x38) == 0x30) /* suppress values in special range */
tos = 0;
}
return V_TOS(tos) | V_ULP_MODE(cplios->ulp_mode) |
V_RCV_BUFSIZ(min(tp->rcv_wnd >> 10, (u32)M_RCV_BUFSIZ));
}
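/*
* Calculate the option 2 value for a connection, encoding the RSS CPU index
* and, if offload settings are supplied, the Rx coalescing and congestion
* control/pacing flavors.
*/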
static unsigned int calc_opt2(const struct sock *sk,
const struct offload_settings *s)
{
u32 opt2 = (F_CPU_INDEX_VALID |
V_CPU_INDEX(CPL_IO_STATE(sk)->rss_cpu_idx));
if (unlikely(!s))
return opt2;
if (s->rx_coalesce >= 0)
opt2 |= F_RX_COALESCE_VALID |
V_RX_COALESCE(s->rx_coalesce ? 3 : 0);
if (s->cong_algo >= 0)
opt2 |= F_FLAVORS_VALID | V_CONG_CONTROL_FLAVOR(s->cong_algo) |
V_PACING_FLAVOR(1);
return opt2;
}
#ifdef CTRL_SKB_CACHE
/*
* This function is intended for allocations of small control messages.
* Such messages go as immediate data and usually the packets are freed
* immediately. We maintain a cache of one small sk_buff and use it whenever
* it is available (has a user count of 1). Otherwise we get a fresh buffer.
*/
static struct sk_buff *alloc_ctrl_skb(const struct tcp_sock *tp, int len)
{
struct cpl_io_state *cplios = CPL_IO_STATE((struct sock *)tp); /* tcp_sock embeds struct sock */
struct sk_buff *skb = cplios->ctrl_skb_cache;
if (likely(skb && !skb_shared(skb) && !skb_cloned(skb))) {
__skb_trim(skb, 0);
atomic_set(&skb->users, 2);
} else if (likely(!in_atomic()))
skb = alloc_skb_nofail(len);
else
skb = alloc_skb(len, GFP_ATOMIC);
return skb;
}
#else
# define alloc_ctrl_skb(tp, len) alloc_skb_nofail(len)
#endif
static inline void free_wr_skb(struct sk_buff *skb)
{
#if defined(CONFIG_T3_ZCOPY_SENDMSG) || defined(CONFIG_T3_ZCOPY_SENDMSG_MODULE)
if (skb->data[0] == FW_WROPCODE_OFLD_TX_DATA)
t3_zcopy_cleanup_skb(skb);
#endif
kfree_skb(skb);
}
static void purge_wr_queue(struct sock *sk)
{
struct sk_buff *skb;
while ((skb = dequeue_wr(sk)) != NULL)
free_wr_skb(skb);
}
/*
* Returns true if an sk_buff carries urgent data.
*/
static inline int skb_urgent(struct sk_buff *skb)
{
return (ULP_SKB_CB(skb)->flags & ULPCB_FLAG_URG) != 0;
}
/*
* Generic ARP failure handler that discards the buffer.
*/
static void arp_failure_discard(struct t3cdev *cdev, struct sk_buff *skb)
{
kfree_skb(skb);
}
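/*
* Prepend a TX_DATA_WR work request header to an skb holding payload that is
* about to be handed to the TOE.  The first WR of a connection additionally
* carries the send buffer size and Tx CPU index.
*/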
static inline void make_tx_data_wr(struct sock *sk, struct sk_buff *skb,
int len)
{
struct cpl_io_state *cplios = CPL_IO_STATE(sk);
struct tx_data_wr *req;
struct tcp_sock *tp = tcp_sk(sk);
skb_reset_transport_header(skb);
req = (struct tx_data_wr *)__skb_push(skb, sizeof(*req));
req->wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_TX_DATA));
req->wr_lo = htonl(V_WR_TID(cplios->tid));
req->sndseq = htonl(tp->snd_nxt);
/* len includes the length of any HW ULP additions */
req->len = htonl(len);
req->param = htonl(V_TX_PORT(cplios->l2t_entry->chan_idx));
/* V_TX_ULP_SUBMODE sets both the mode and submode */
req->flags = htonl(V_TX_ULP_SUBMODE(skb_ulp_mode(skb)) |
V_TX_URG(skb_urgent(skb)) |
V_TX_SHOVE((!cplios_flag(sk, CPLIOS_TX_MORE_DATA)) &&
(skb_peek(&sk->sk_write_queue) ? 0 : 1)));
if (!cplios_flag(sk, CPLIOS_TX_DATA_SENT)) {
req->flags |= htonl(V_TX_ACK_PAGES(2) | F_TX_INIT |
V_TX_CPU_IDX(cplios->rss_cpu_idx));
/* Send buffer is in units of 32KB. */
req->param |= htonl(V_TX_SNDBUF(sk->sk_sndbuf >> 15));
cplios_set_flag(sk, CPLIOS_TX_DATA_SENT);
}
}
/*
* Prepends TX_DATA_WR or CPL_CLOSE_CON_REQ headers to buffers waiting in a
* socket's send queue and sends them on to the TOE. Must be called with the
* socket lock held. Returns the amount of send buffer space that was freed
* as a result of sending queued data to the TOE.
*/
int t3_push_frames(struct sock *sk, int req_completion)
{
struct cpl_io_state *cplios = CPL_IO_STATE(sk);
int total_size = 0;
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb;
struct t3cdev *cdev;
struct tom_data *d;
if (unlikely(sk_in_state(sk, TCPF_SYN_SENT | TCPF_CLOSE)))
return 0;
/*
* We shouldn't really be called at all after an abort but check just
* in case.
*/
if (unlikely(cplios_flag(sk, CPLIOS_ABORT_SHUTDOWN)))
return 0;
d = TOM_DATA(cplios->toedev);
cdev = d->cdev;
while (cplios->wr_avail && (skb = skb_peek(&sk->sk_write_queue)) != NULL &&
!cplios_flag(sk, CPLIOS_TX_WAIT_IDLE) &&
(!(ULP_SKB_CB(skb)->flags & ULPCB_FLAG_HOLD) ||
skb_queue_len(&sk->sk_write_queue) > 1)) {
int len = skb->len; /* length before skb_push */
int frags = skb_shinfo(skb)->nr_frags + (len != skb->data_len);
int wrs_needed = skb_wrs[frags];
if (wrs_needed > 1 && len + sizeof(struct tx_data_wr) <= wrlen)
wrs_needed = 1;
WARN_ON(frags >= ARRAY_SIZE(skb_wrs) || wrs_needed < 1);
if (cplios->wr_avail < wrs_needed)
break;
__skb_unlink(skb, &sk->sk_write_queue);
skb->priority = mkprio(CPL_PRIORITY_DATA, sk);
skb->csum = wrs_needed; /* remember this until the WR_ACK */
cplios->wr_avail -= wrs_needed;
cplios->wr_unacked += wrs_needed;
enqueue_wr(sk, skb);
if (likely(ULP_SKB_CB(skb)->flags & ULPCB_FLAG_NEED_HDR)) {
len += ulp_extra_len(skb);
make_tx_data_wr(sk, skb, len);
tp->snd_nxt += len;
tp->lsndtime = tcp_time_stamp;
#if defined(CONFIG_T3_ZCOPY_SENDMSG) || defined(CONFIG_T3_ZCOPY_SENDMSG_MODULE)
atomic_add(skb->len - sizeof (struct tx_data_wr),
&d->tx_dma_pending);
skb->sk = sk;
#endif
if ((req_completion && cplios->wr_unacked == wrs_needed) ||
(ULP_SKB_CB(skb)->flags & ULPCB_FLAG_COMPL) ||
cplios->wr_unacked >= cplios->wr_max / 2) {
struct work_request_hdr *wr = cplhdr(skb);
wr->wr_hi |= htonl(F_WR_COMPL);
cplios->wr_unacked = 0;
}
ULP_SKB_CB(skb)->flags &= ~ULPCB_FLAG_NEED_HDR;
} else if (skb->data[0] == FW_WROPCODE_OFLD_CLOSE_CON)
cplios_set_flag(sk, CPLIOS_CLOSE_CON_REQUESTED);
total_size += skb->truesize;
if (ULP_SKB_CB(skb)->flags & ULPCB_FLAG_BARRIER)
cplios_set_flag(sk, CPLIOS_TX_WAIT_IDLE);
set_arp_failure_handler(skb, arp_failure_discard);
l2t_send(cdev, skb, cplios->l2t_entry);
}
sk->sk_wmem_queued -= total_size;
return total_size;
}
EXPORT_SYMBOL(t3_push_frames);
#ifndef TCP_CONGESTION_CONTROL
struct tcp_congestion_ops tcp_init_congestion_ops = {
.name = "",
.owner = THIS_MODULE,
};
#endif
static inline void free_atid(struct t3cdev *cdev, unsigned int tid)
{
struct sock *sk = cxgb3_free_atid(cdev, tid);
if (sk)
sock_put(sk);
}
/*
* Release resources held by an offload connection (TID, L2T entry, etc.)
*/
void t3_release_offload_resources(struct sock *sk)
{
struct cpl_io_state *cplios = CPL_IO_STATE(sk);
struct tcp_sock *tp = tcp_sk(sk);
struct toedev *tdev = cplios->toedev;
struct t3cdev *cdev;
unsigned int tid = cplios->tid;
if (!tdev)
return;
cdev = T3C_DEV(sk);
if (!cdev)
return;
cplios->rss_cpu_idx = 0;
t3_release_ddp_resources(sk);
#ifdef CTRL_SKB_CACHE
kfree_skb(cplios->ctrl_skb_cache);
cplios->ctrl_skb_cache = NULL;
#endif
if (cplios->wr_avail != cplios->wr_max) {
purge_wr_queue(sk);
reset_wr_list(sk);
}
if (cplios->l2t_entry) {
l2t_release(L2DATA(cdev), cplios->l2t_entry);
cplios->l2t_entry = NULL;
}
if (sk->sk_state == TCP_SYN_SENT) { // we have ATID
free_atid(cdev, tid);
__skb_queue_purge(&tp->out_of_order_queue);
} else { // we have TID
cxgb3_remove_tid(cdev, (void *)sk, tid);
sock_put(sk);
}
t3_set_ca_ops(sk, &tcp_init_congestion_ops);
cplios->toedev = NULL;
#if 0
printk(KERN_INFO "closing TID %u, state %u\n", tid, sk->sk_state);
#endif
}
/*
* Returns whether a CPL message is not expected in the socket backlog of a
* closed connection. Most messages are illegal at that point except
* ABORT_RPL_RSS and GET_TCB_RPL sent by DDP.
*/
static int bad_backlog_msg(unsigned int opcode)
{
return opcode != CPL_ABORT_RPL_RSS && opcode != CPL_GET_TCB_RPL;
}
/*
* Called for each sk_buff in a socket's receive backlog during
* backlog processing.
*/
static int t3_backlog_rcv(struct sock *sk, struct sk_buff *skb)
{
#if VALIDATE_TID
unsigned int opcode = ntohl(skb->csum) >> 24;
if (unlikely(sk->sk_state == TCP_CLOSE && bad_backlog_msg(opcode))) {
printk(KERN_ERR "unexpected CPL message with opcode %x for "
"closed TID %u\n", opcode, CPL_IO_STATE(sk)->tid);
kfree_skb(skb);
return 0;
}
#endif
BLOG_SKB_CB(skb)->backlog_rcv(sk, skb);
return 0;
}
#ifdef CONFIG_TCP_OFFLOAD_MODULE
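/*
* No-op replacement for the SW keepalive timer; keepalive for offloaded
* connections is handled by the HW (see t3_set_keepalive).
*/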
static void dummy_tcp_keepalive_timer(unsigned long data)
{
}
#endif
/*
* Switch a socket to the offload protocol operations. Note that the offload
* operations do not contain the offload backlog handler, we install that
* directly to the socket.
*/
static void install_offload_ops(struct sock *sk)
{
sk->sk_prot = &t3_tcp_prot.proto;
sk->sk_backlog_rcv = t3_backlog_rcv;
if (sk->sk_write_space == sk_stream_write_space)
sk->sk_write_space = t3_write_space;
#ifdef LINUX_2_4
if (sk->filter)
sk_filter_release(sk, sk->filter);
sk->filter = &drop_all;
sk_filter_charge(sk, sk->filter);
#else
if (sk->sk_filter)
sk_filter_uncharge(sk, sk->sk_filter);
sk->sk_filter = &drop_all;
sk_filter_charge(sk, sk->sk_filter);
#endif /* LINUX_2_4 */
#ifdef CONFIG_TCP_OFFLOAD_MODULE
sk->sk_timer.function = dummy_tcp_keepalive_timer;
#endif
sock_set_flag(sk, SOCK_OFFLOADED);
}
#if DEBUG_WR
static void dump_wrs(struct sock *sk)
{
u64 *d;
struct sk_buff *p;
printk("TID %u info:\n", CPL_IO_STATE(sk)->tid);
skb_queue_walk(&sk->sk_write_queue, p) {
d = cplhdr(p);
printk(" len %u, frags %u, flags %x, data %llx\n",
p->len, skb_shinfo(p)->nr_frags, ULP_SKB_CB(p)->flags,
(unsigned long long)be64_to_cpu(*d));
}
printk("outstanding:\n");
wr_queue_walk(sk, p) {
d = cplhdr(p);
printk(" len %u, frags %u, flags %x, data %llx,%llx,%llx\n",
p->len, skb_shinfo(p)->nr_frags, ULP_SKB_CB(p)->flags,
(unsigned long long)be64_to_cpu(*d),
(unsigned long long)be64_to_cpu(d[1]),
(unsigned long long)be64_to_cpu(d[2]));
}
}
static int count_pending_wrs(const struct sock *sk)
{
int n = 0;
const struct sk_buff *p;
wr_queue_walk(sk, p)
n += p->csum;
return n;
}
static void check_wr_invariants(const struct sock *sk)
{
struct cpl_io_state *cplios = CPL_IO_STATE(sk);
int pending = count_pending_wrs(sk);
if (unlikely(cplios->wr_avail + pending != cplios->wr_max))
printk(KERN_ERR "TID %u: credit imbalance: avail %u, "
"pending %u, total should be %u\n", cplios->tid,
cplios->wr_avail, pending, cplios->wr_max);
}
#endif
static void t3_idiag_get_info(struct sock *sk, u32 ext, struct sk_buff *skb)
{
struct cpl_io_state *cplios = CPL_IO_STATE(sk);
#if DEBUG_WR
if (ext & (1 << (INET_DIAG_MEMINFO - 1))) {
bh_lock_sock(sk);
if (!sock_owned_by_user(sk))
dump_wrs(sk);
bh_unlock_sock(sk);
}
#endif
if (ext & (1 << INET_DIAG_MAX)) {
struct rtattr *rta;
struct t3_inet_diag_info *info;
rta = __RTA_PUT(skb, INET_DIAG_MAX + 1, sizeof(*info));
info = RTA_DATA(rta);
info->toe_id = TOE_ID_CHELSIO_T3;
info->tid = cplios->tid;
info->wrs = cplios->wr_max - cplios->wr_avail;
info->queue = cplios->qset_idx;
info->ulp_mode = cplios->ulp_mode;
info->sched_class = cplios->sched_cls != SCHED_CLS_NONE ?
cplios->sched_cls : 0;
info->ddp_enabled = DDP_STATE(sk)->ddp_setup;
strcpy(info->dev_name, cplios->toedev->name);
rtattr_failure: ;
}
}
#define T3_CONG_OPS(s) \
{ .name = s, .owner = THIS_MODULE, .get_info = t3_idiag_get_info }
static struct tcp_congestion_ops t3_cong_ops[] = {
T3_CONG_OPS("reno"), T3_CONG_OPS("tahoe"),
T3_CONG_OPS("newreno"), T3_CONG_OPS("highspeed")
};
static void mk_act_open_req(struct sock *sk, struct sk_buff *skb,
unsigned int atid, const struct l2t_entry *e,
const struct offload_settings *s)
{
struct cpl_act_open_req *req;
skb->priority = mkprio(CPL_PRIORITY_SETUP, sk);
req = (struct cpl_act_open_req *)__skb_put(skb, sizeof(*req));
req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ACT_OPEN_REQ, atid));
#ifdef LINUX_2_4
req->local_port = sk->inet_sport;
req->peer_port = sk->inet_dport;
req->local_ip = sk->inet_saddr;
req->peer_ip = sk->inet_daddr;
#else
req->local_port = inet_sk(sk)->inet_sport;
req->peer_port = inet_sk(sk)->inet_dport;
req->local_ip = inet_sk(sk)->inet_saddr;
req->peer_ip = inet_sk(sk)->inet_daddr;
#endif /* LINUX_2_4 */
req->opt0h = htonl(calc_opt0h(sk) | V_L2T_IDX(e->idx) |
V_TX_CHANNEL(e->chan_idx));
req->opt0l = htonl(calc_opt0l(sk));
req->params = 0;
/*
* Because we may need to retransmit an ACT_OPEN_REQ and we don't want
* to keep the offload settings around we use the following hack:
*
* - if we are given offload settings we use them and store the
* resulting opt2 in rcv_tstamp
* - otherwise we use the previously saved opt2
*/
if (likely(s))
tcp_sk(sk)->rcv_tstamp = calc_opt2(sk, s);
req->opt2 = htonl(tcp_sk(sk)->rcv_tstamp);
}
/*
* Convert an ACT_OPEN_RPL status to a Linux errno.
*/
static int act_open_rpl_status_to_errno(int status)
{
switch (status) {
case CPL_ERR_CONN_RESET:
return ECONNREFUSED;
case CPL_ERR_ARP_MISS:
return EHOSTUNREACH;
case CPL_ERR_CONN_TIMEDOUT:
return ETIMEDOUT;
case CPL_ERR_TCAM_FULL:
return ENOMEM;
case CPL_ERR_CONN_EXIST:
printk(KERN_ERR "ACTIVE_OPEN_RPL: 4-tuple in use\n");
return EADDRINUSE;
default:
return EIO;
}
}
static void act_open_req_arp_failure(struct t3cdev *dev, struct sk_buff *skb);
static void fail_act_open(struct sock *sk, int errno)
{
sk->sk_err = errno;
sk->sk_error_report(sk);
t3_release_offload_resources(sk);
connection_done(sk);
T3_TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_ATTEMPTFAILS);
}
static void act_open_retry_timer(unsigned long data)
{
struct sk_buff *skb;
struct sock *sk = (struct sock *)data;
struct cpl_io_state *cplios = CPL_IO_STATE(sk);
struct inet_connection_sock *icsk = inet_csk(sk);
bh_lock_sock(sk);
if (sock_owned_by_user(sk)) /* try in a bit */
sk_reset_timer(sk, &icsk->icsk_retransmit_timer,
jiffies + HZ / 20);
else {
skb = alloc_skb(sizeof(struct cpl_act_open_req), GFP_ATOMIC);
if (!skb)
fail_act_open(sk, ENOMEM);
else {
skb->sk = sk;
set_arp_failure_handler(skb, act_open_req_arp_failure);
mk_act_open_req(sk, skb, cplios->tid,
cplios->l2t_entry, NULL);
l2t_send(T3C_DEV(sk), skb, cplios->l2t_entry);
}
}
bh_unlock_sock(sk);
sock_put(sk);
}
/*
* Handle active open failures.
*/
static void active_open_failed(struct sock *sk, struct sk_buff *skb)
{
struct cpl_act_open_rpl *rpl = cplhdr(skb);
struct inet_connection_sock *icsk = inet_csk(sk);
if (rpl->status == CPL_ERR_CONN_EXIST &&
icsk->icsk_retransmit_timer.function != act_open_retry_timer) {
icsk->icsk_retransmit_timer.function = act_open_retry_timer;
sk_reset_timer(sk, &icsk->icsk_retransmit_timer,
jiffies + HZ / 2);
} else
fail_act_open(sk, act_open_rpl_status_to_errno(rpl->status));
__kfree_skb(skb);
}
/*
* Return whether a failed active open has allocated a TID
*/
static inline int act_open_has_tid(int status)
{
return status != CPL_ERR_TCAM_FULL && status != CPL_ERR_CONN_EXIST &&
status != CPL_ERR_ARP_MISS;
}
/*
* Process an ACT_OPEN_RPL CPL message.
*/
static int do_act_open_rpl(struct t3cdev *cdev, struct sk_buff *skb, void *ctx)
{
struct sock *sk = (struct sock *)ctx;
struct cpl_act_open_rpl *rpl = cplhdr(skb);
VALIDATE_SOCK(sk);
if (cdev->type != T3A && act_open_has_tid(rpl->status))
cxgb3_queue_tid_release(cdev, GET_TID(rpl));
process_cpl_msg_ref(active_open_failed, sk, skb);
return 0;
}
/*
* Handle an ARP failure for an active open. XXX purge ofo queue
*
* XXX badly broken for crossed SYNs as the ATID is no longer valid.
* XXX crossed SYN errors should be generated by PASS_ACCEPT_RPL which should
* check SOCK_DEAD or sk->sk_sock. Or maybe generate the error here but don't
* free the atid. Hmm.
*/
static void act_open_req_arp_failure(struct t3cdev *dev, struct sk_buff *skb)
{
struct sock *sk = skb->sk;
sock_hold(sk);
bh_lock_sock(sk);
if (sk->sk_state == TCP_SYN_SENT || sk->sk_state == TCP_SYN_RECV) {
if (!sock_owned_by_user(sk)) {
fail_act_open(sk, EHOSTUNREACH);
__kfree_skb(skb);
} else {
/*
* Smart solution: Synthesize an ACTIVE_OPEN_RPL in the
* existing sk_buff and queue it to the backlog. We
* are certain the sk_buff is not shared. We also
* don't bother trimming the buffer.
*/
struct cpl_act_open_rpl *rpl = cplhdr(skb);
rpl->ot.opcode = CPL_ACT_OPEN_RPL;
rpl->status = CPL_ERR_ARP_MISS;
BLOG_SKB_CB(skb)->backlog_rcv = active_open_failed;
__sk_add_backlog(sk, skb);
/*
* XXX Make sure a PASS_ACCEPT_RPL behind us doesn't
* destroy the socket. Unfortunately we can't go into
* SYN_SENT because we don't have an atid.
* Needs more thought.
*/
}
}
bh_unlock_sock(sk);
sock_put(sk);
}
/*
* Determine the receive window size for a socket.
*/
static unsigned int select_rcv_wnd(struct sock *sk)
{
struct toedev *dev = CPL_IO_STATE(sk)->toedev;
struct tom_data *d = TOM_DATA(dev);
unsigned int wnd = tcp_full_space(sk);
unsigned int max_rcv_wnd;
/*
* For receive coalescing to work effectively we need a receive window
* that can accommodate a coalesced segment.
*/
if (wnd < MIN_RCV_WND)
wnd = MIN_RCV_WND;
/* PR 5138 */
max_rcv_wnd = (dev->ttid < TOE_ID_CHELSIO_T3C ?
(u32)d->rx_page_size * 23 :
MAX_RCV_WND);
cplios_set_flag(sk, CPLIOS_UPDATE_RCV_WND);
return min(wnd, max_rcv_wnd);
}
#if defined(TCP_CONGESTION_CONTROL)
static void pivot_ca_ops(struct sock *sk, int cong)
{
struct inet_connection_sock *icsk = inet_csk(sk);
if (icsk->icsk_ca_ops->release)
icsk->icsk_ca_ops->release(sk);
module_put(icsk->icsk_ca_ops->owner);
icsk->icsk_ca_ops = &t3_cong_ops[cong < 0 ? 2 : cong];
}
#endif
#define CTRL_SKB_LEN 120
/*
* Assign offload parameters to some socket fields. This code is used by
* both active and passive opens.
*/
static void init_offload_sk(struct sock *sk, struct toedev *dev,
unsigned int tid, struct l2t_entry *e,
struct dst_entry *dst,
struct net_device *egress_dev,
const struct offload_settings *s)
{
struct cpl_io_state *cplios = CPL_IO_STATE(sk);
struct tcp_sock *tp = tcp_sk(sk);
cplios->toedev = dev;
cplios->tid = tid;
cplios->l2t_entry = e;
cplios->wr_max = cplios->wr_avail = TOM_TUNABLE(dev, max_wrs);
cplios->wr_unacked = 0;
cplios->delack_mode = 0;
cplios->mtu_idx = select_mss(sk, dst_mtu(dst));
tp->rcv_wnd = select_rcv_wnd(sk);
cplios->ulp_mode = (TOM_TUNABLE(dev, ddp) &&
!sock_flag(sk, SOCK_NO_DDP) &&
tp->rcv_wnd >= MIN_DDP_RCV_WIN
? ULP_MODE_TCPDDP
: ULP_MODE_NONE);
cplios->sched_cls = (s->sched_class >= 0
? s->sched_class
: SCHED_CLS_NONE);
cplios->qset_idx = 0;
cplios->rss_cpu_idx = 0;
if (s->rssq >= 0) {
unsigned int id = s->rssq;
if (dev->ctl(dev, GET_CPUIDX_OF_QSET, &id) == 0) {
cplios->qset_idx = s->rssq;
cplios->rss_cpu_idx = id;
}
}
#ifdef CTRL_SKB_CACHE
cplios->ctrl_skb_cache = alloc_skb(CTRL_SKB_LEN, gfp_any());
#endif
reset_wr_list(sk);
if (!tp->window_clamp)
tp->window_clamp = dst_metric(dst, RTAX_WINDOW);
/*
* Set sk_sndbuf so that t3_write_space and sk_stream_write_space
* calculate available socket space the same way. This allows us to
* keep the original ->sk_write_space callback in cases of kernel
* sockets that provide their own version and expect
* sk_stream_write_space's method to be working.
*
* The only case we don't handle are sockets that have their own
* ->sk_write_space callback and set SOCK_SNDBUF_LOCK.
*/
if (!(sk->sk_userlocks & SOCK_SNDBUF_LOCK))
sk->sk_sndbuf = TOM_TUNABLE(dev, max_host_sndbuf);
#if defined(TCP_CONGESTION_CONTROL)
pivot_ca_ops(sk, s->cong_algo);
#endif
}
static inline void check_sk_callbacks(struct sock *sk)
{
if (unlikely(sk->sk_user_data &&
!cplios_flag(sk, CPLIOS_CALLBACKS_CHKD))) {
if (install_special_data_ready(sk) > 0)
sock_set_flag(sk, SOCK_NO_DDP);
cplios_set_flag(sk, CPLIOS_CALLBACKS_CHKD);
}
}
/*
* Send an active open request.
*/
int t3_connect(struct toedev *tdev, struct sock *sk,
struct net_device *egress_dev)
{
int atid;
struct sk_buff *skb;
struct l2t_entry *e;
struct tom_data *d = TOM_DATA(tdev);
struct tcp_sock *tp = tcp_sk(sk);
struct dst_entry *dst = __sk_dst_get(sk);
struct cpl_io_state *cplios;
struct offload_req orq;
struct offload_settings settings;
offload_req_from_sk(&orq, sk, OPEN_TYPE_ACTIVE);
settings = *lookup_ofld_policy(tdev, &orq, d->conf.cop_managed_offloading);
#ifndef LINUX_2_4
rcu_read_unlock();
#else
read_unlock(&tdev->policy_lock);
#endif
if (!settings.offload)
goto out_err;
atid = cxgb3_alloc_atid(d->cdev, d->client, sk);
if (atid < 0)
goto out_err;
cplios = kzalloc(sizeof *cplios, GFP_KERNEL);
if (cplios == NULL)
goto out_err;
e = t3_l2t_get(d->cdev, dst->neighbour, egress_dev);
if (!e)
goto free_tid;
skb = alloc_skb_nofail(sizeof(struct cpl_act_open_req));
skb->sk = sk;
set_arp_failure_handler(skb, act_open_req_arp_failure);
sock_hold(sk);
CPL_IO_STATE(sk) = cplios;
install_offload_ops(sk);
check_sk_callbacks(sk);
init_offload_sk(sk, tdev, atid, e, dst, egress_dev, &settings);
RCV_WSCALE(tp) = select_rcv_wscale(tcp_full_space(sk),
sysctl_tcp_window_scaling,
tp->window_clamp);
sk->sk_err = 0;
sock_reset_flag(sk, SOCK_DONE);
T3_TCP_INC_STATS(sock_net(sk), TCP_MIB_ACTIVEOPENS);
mk_act_open_req(sk, skb, atid, e, &settings);
l2t_send(d->cdev, skb, e);
if (cplios->ulp_mode == ULP_MODE_TCPDDP)
t3_enable_ddp(sk, 0);
return 0;
free_tid:
free_atid(d->cdev, atid);
out_err:
return -1;
}
/*
* Handle an ARP failure for a CPL_ABORT_REQ. Change it into a no RST variant
* and send it along.
*/
static void abort_arp_failure(struct t3cdev *cdev, struct sk_buff *skb)
{
struct cpl_abort_req *req = cplhdr(skb);
req->cmd = CPL_ABORT_NO_RST;
cxgb3_ofld_send(cdev, skb);
}
/*
* Send an ABORT_REQ message. Cannot fail. This routine makes sure we do
* not send multiple ABORT_REQs for the same connection and also that we do
* not try to send a message after the connection has closed. Returns 1 if
* an ABORT_REQ wasn't generated after all, 0 otherwise.
*/
int t3_send_reset(struct sock *sk, int mode, struct sk_buff *skb)
{
struct cpl_io_state *cplios = CPL_IO_STATE(sk);
struct cpl_abort_req *req;
struct tcp_sock *tp = tcp_sk(sk);
unsigned int tid = cplios->tid;
if (unlikely(cplios_flag(sk, CPLIOS_ABORT_SHUTDOWN) ||
!cplios->toedev)) {
if (skb)
__kfree_skb(skb);
return 1;
}
cplios_set_flag(sk, CPLIOS_ABORT_RPL_PENDING);
cplios_set_flag(sk, CPLIOS_ABORT_SHUTDOWN);
/* Purge the send queue so we don't send anything after an abort. */
t3_purge_write_queue(sk);
if (cplios_flag(sk, CPLIOS_CLOSE_CON_REQUESTED) && is_t3a(cplios->toedev))
mode |= CPL_ABORT_POST_CLOSE_REQ;
if (!skb)
skb = alloc_skb_nofail(sizeof(*req));
skb->priority = mkprio(CPL_PRIORITY_DATA, sk);
set_arp_failure_handler(skb, abort_arp_failure);
req = (struct cpl_abort_req *)skb_put(skb, sizeof(*req));
req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_REQ));
req->wr.wr_lo = htonl(V_WR_TID(tid));
OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ABORT_REQ, tid));
req->rsvd0 = htonl(tp->snd_nxt);
req->rsvd1 = !cplios_flag(sk, CPLIOS_TX_DATA_SENT);
req->cmd = mode;
if (sk->sk_state == TCP_SYN_SENT)
__skb_queue_tail(&tp->out_of_order_queue, skb); // defer
else
l2t_send(T3C_DEV(sk), skb, cplios->l2t_entry);
return 0;
}
EXPORT_SYMBOL(t3_send_reset);
/*
* Reset a connection that is on a listener's SYN queue or accept queue,
* i.e., one that has not had a struct socket associated with it.
* Must be called from process context.
*
* Modeled after code in inet_csk_listen_stop().
*/
static void reset_listen_child(struct sock *child)
{
struct sk_buff *skb = alloc_skb_nofail(sizeof(struct cpl_abort_req));
sock_hold(child); // need to survive past inet_csk_destroy_sock()
local_bh_disable();
bh_lock_sock(child);
t3_send_reset(child, CPL_ABORT_SEND_RST, skb);
sock_orphan(child);
INC_ORPHAN_COUNT(child);
if (child->sk_state == TCP_CLOSE)
inet_csk_destroy_sock(child);
bh_unlock_sock(child);
local_bh_enable();
sock_put(child);
}
/*
* The reap list is the list of passive open sockets that were orphaned when
* their listening parent went away and wasn't able to nuke them for whatever
* reason. These sockets are terminated through a work request from process
* context.
*/
static struct sock *reap_list;
static spinlock_t reap_list_lock = SPIN_LOCK_UNLOCKED;
/*
* Process the reap list.
*/
DECLARE_TASK_FUNC(process_reap_list, task_param)
{
spin_lock_bh(&reap_list_lock);
while (reap_list) {
struct sock *sk = reap_list;
reap_list = sk->sk_user_data;
sk->sk_user_data = NULL;
spin_unlock_bh(&reap_list_lock);
reset_listen_child(sk);
spin_lock_bh(&reap_list_lock);
}
spin_unlock_bh(&reap_list_lock);
}
static T3_DECLARE_WORK(reap_task, process_reap_list, NULL);
/*
* Add a socket to the reap list and schedule a work request to process it.
* We thread sockets through their sk_user_data pointers. May be called
* from softirq context and any associated open request must have already
* been freed.
*/
static void add_to_reap_list(struct sock *sk)
{
BUG_ON(sk->sk_user_data);
release_tcp_port(sk); // release the port immediately, it may be reused
spin_lock_bh(&reap_list_lock);
sk->sk_user_data = reap_list;
reap_list = sk;
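/* schedule the work only if the list was empty before this insertion */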
if (!sk->sk_user_data)
schedule_work(&reap_task);
spin_unlock_bh(&reap_list_lock);
}
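/*
* Populate a SET_TCB_FIELD CPL message to change part of a connection's TCB.
* The caller supplies an skb with enough room for the request.
*/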
static void __set_tcb_field(struct sock *sk, struct sk_buff *skb, u16 word,
u64 mask, u64 val, int no_reply)
{
struct cpl_io_state *cplios = CPL_IO_STATE(sk);
struct cpl_set_tcb_field *req;
req = (struct cpl_set_tcb_field *)__skb_put(skb, sizeof(*req));
req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, cplios->tid));
req->reply = V_NO_REPLY(no_reply);
req->cpu_idx = cplios->rss_cpu_idx;
req->word = htons(word);
req->mask = cpu_to_be64(mask);
req->val = cpu_to_be64(val);
skb->priority = mkprio(CPL_PRIORITY_CONTROL, sk);
}
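/*
* Send a SET_TCB_FIELD CPL message.  The message is dropped for closed or
* aborted connections and deferred while the connection is in SYN_SENT,
* since we don't know the TID yet.
*/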
void t3_set_tcb_field(struct sock *sk, u16 word, u64 mask, u64 val)
{
struct sk_buff *skb;
if (sk->sk_state == TCP_CLOSE || cplios_flag(sk, CPLIOS_ABORT_SHUTDOWN))
return;
skb = alloc_ctrl_skb(tcp_sk(sk), sizeof(struct cpl_set_tcb_field));
__set_tcb_field(sk, skb, word, mask, val, 1);
send_or_defer(sk, tcp_sk(sk), skb, 0);
}
/*
* Set one of the t_flags bits in the TCB.
*/
static void set_tcb_tflag(struct sock *sk, unsigned int bit_pos, int val)
{
t3_set_tcb_field(sk, W_TCB_T_FLAGS1, 1ULL << bit_pos, (u64)val << bit_pos);
}
/*
* Send a SET_TCB_FIELD CPL message to change a connection's Nagle setting.
*/
void t3_set_nagle(struct sock *sk)
{
set_tcb_tflag(sk, S_TF_NAGLE, !(tcp_sk(sk)->nonagle & TCP_NAGLE_OFF));
}
/*
* Send a SET_TCB_FIELD CPL message to change a connection's keepalive setting.
*/
void t3_set_keepalive(struct sock *sk, int on_off)
{
set_tcb_tflag(sk, S_TF_KEEPALIVE, on_off);
}
void t3_set_rcv_coalesce_enable(struct sock *sk, int on_off)
{
set_tcb_tflag(sk, S_TF_RCV_COALESCE_ENABLE, on_off);
}
void t3_set_dack(struct sock *sk, int on_off)
{
set_tcb_tflag(sk, S_TF_DACK, on_off);
}
void t3_set_dack_mss(struct sock *sk, int on_off)
{
set_tcb_tflag(sk, S_TF_DACK_MSS, on_off);
}
void t3_set_migrating(struct sock *sk, int on_off)
{
set_tcb_tflag(sk, S_TF_MIGRATING, on_off);
}
void t3_set_non_offload(struct sock *sk, int on_off)
{
set_tcb_tflag(sk, S_TF_NON_OFFLOAD, on_off);
}
/*
* Send a SET_TCB_FIELD CPL message to change a connection's TOS setting.
*/
void t3_set_tos(struct sock *sk)
{
t3_set_tcb_field(sk, W_TCB_TOS, V_TCB_TOS(M_TCB_TOS),
V_TCB_TOS(SK_TOS(sk)));
}
/*
* In DDP mode, TP fails to schedule a timer to push RX data to the host when
* DDP is disabled (data is delivered to the freelist). [Note that the peer should
* set the PSH bit in the last segment, which would trigger delivery.]
* We work around the issue by setting a DDP buffer in a partial placed state,
* which guarantees that TP will schedule a timer.
*/
#define TP_DDP_TIMER_WORKAROUND_MASK\
(V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(1) |\
((V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) |\
V_TCB_RX_DDP_BUF0_LEN(3)) << 32))
#define TP_DDP_TIMER_WORKAROUND_VAL\
(V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(0) |\
((V_TCB_RX_DDP_BUF0_OFFSET((u64)1) | V_TCB_RX_DDP_BUF0_LEN((u64)2)) <<\
32))
void t3_enable_ddp(struct sock *sk, int on)
{
if (on)
t3_set_tcb_field(sk, W_TCB_RX_DDP_FLAGS, V_TF_DDP_OFF(1),
V_TF_DDP_OFF(0));
else
t3_set_tcb_field(sk, W_TCB_RX_DDP_FLAGS,
V_TF_DDP_OFF(1) |
TP_DDP_TIMER_WORKAROUND_MASK,
V_TF_DDP_OFF(1) |
TP_DDP_TIMER_WORKAROUND_VAL);
}
void t3_set_ddp_tag(struct sock *sk, int buf_idx, unsigned int tag_color)
{
t3_set_tcb_field(sk, W_TCB_RX_DDP_BUF0_TAG + buf_idx,
V_TCB_RX_DDP_BUF0_TAG(M_TCB_RX_DDP_BUF0_TAG),
tag_color);
}
void t3_set_ddp_buf(struct sock *sk, int buf_idx, unsigned int offset,
unsigned int len)
{
if (buf_idx == 0)
t3_set_tcb_field(sk, W_TCB_RX_DDP_BUF0_OFFSET,
V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) |
V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN),
V_TCB_RX_DDP_BUF0_OFFSET((u64)offset) |
V_TCB_RX_DDP_BUF0_LEN((u64)len));
else
t3_set_tcb_field(sk, W_TCB_RX_DDP_BUF1_OFFSET,
V_TCB_RX_DDP_BUF1_OFFSET(M_TCB_RX_DDP_BUF1_OFFSET) |
V_TCB_RX_DDP_BUF1_LEN((u64)M_TCB_RX_DDP_BUF1_LEN << 32),
V_TCB_RX_DDP_BUF1_OFFSET((u64)offset) |
V_TCB_RX_DDP_BUF1_LEN(((u64)len) << 32));
}
int t3_set_cong_control(struct sock *sk, const char *name)
{
int cong_algo;
for (cong_algo = 0; cong_algo < ARRAY_SIZE(t3_cong_ops); cong_algo++)
if (!strcmp(name, t3_cong_ops[cong_algo].name))
break;
if (cong_algo >= ARRAY_SIZE(t3_cong_ops))
return -EINVAL;
return 0;
}
int t3_get_tcb(struct sock *sk)
{
struct cpl_io_state *cplios = CPL_IO_STATE(sk);
struct cpl_get_tcb *req;
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb = alloc_skb(sizeof(*req), gfp_any());
if (!skb)
return -ENOMEM;
skb->priority = mkprio(CPL_PRIORITY_CONTROL, sk);
req = (struct cpl_get_tcb *)__skb_put(skb, sizeof(*req));
req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_GET_TCB, cplios->tid));
req->cpuno = htons(cplios->rss_cpu_idx);
if (sk->sk_state == TCP_SYN_SENT)
__skb_queue_tail(&tp->out_of_order_queue, skb); // defer
else
cxgb3_ofld_send(T3C_DEV(sk), skb);
return 0;
}
/*
* Send RX credits through an RX_DATA_ACK CPL message. If nofail is 0 we are
* permitted to return without sending the message in case we cannot allocate
* an sk_buff. Returns the number of credits sent.
*/
u32 t3_send_rx_credits(struct sock *sk, u32 credits, u32 dack, int nofail)
{
struct cpl_io_state *cplios = CPL_IO_STATE(sk);
struct sk_buff *skb;
struct cpl_rx_data_ack *req;
skb = nofail ? alloc_ctrl_skb(tcp_sk(sk), sizeof(*req)) :
alloc_skb(sizeof(*req), GFP_ATOMIC);
if (!skb)
return 0;
req = (struct cpl_rx_data_ack *)__skb_put(skb, sizeof(*req));
req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, cplios->tid));
req->credit_dack = htonl(dack | V_RX_CREDITS(credits));
skb->priority = mkprio(CPL_PRIORITY_ACK, sk);
cxgb3_ofld_send(T3C_DEV(sk), skb);
return credits;
}
/*
* Send RX_DATA_ACK CPL message to request a modulation timer to be scheduled.
* This is only used in DDP mode, so we take the opportunity to also set the
* DACK mode and flush any Rx credits.
*/
void t3_send_rx_modulate(struct sock *sk)
{
struct cpl_io_state *cplios = CPL_IO_STATE(sk);
struct sk_buff *skb;
struct cpl_rx_data_ack *req;
struct tcp_sock *tp = tcp_sk(sk);
u32 dack;
dack = t3_select_delack(sk);
skb = alloc_ctrl_skb(tp, sizeof(*req));
req = (struct cpl_rx_data_ack *)__skb_put(skb, sizeof(*req));
req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, cplios->tid));
req->credit_dack = htonl(F_RX_MODULATE | F_RX_DACK_CHANGE |
V_RX_DACK_MODE(dack) |
V_RX_CREDITS(tp->copied_seq - tp->rcv_wup));
skb->priority = mkprio(CPL_PRIORITY_CONTROL, sk);
cxgb3_ofld_send(T3C_DEV(sk), skb);
tp->rcv_wup = tp->copied_seq;
}
/*
* Handle receipt of an urgent pointer.
*/
static void handle_urg_ptr(struct sock *sk, u32 urg_seq)
{
struct tcp_sock *tp = tcp_sk(sk);
urg_seq--; /* initially points past the urgent data, per BSD */
if (tp->urg_data && !after(urg_seq, tp->urg_seq))
return; /* duplicate pointer */
sk_send_sigurg(sk);
if (tp->urg_seq == tp->copied_seq && tp->urg_data &&
!sock_flag(sk, SOCK_URGINLINE) && tp->copied_seq != tp->rcv_nxt) {
struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
tp->copied_seq++;
if (skb && tp->copied_seq - ULP_SKB_CB(skb)->seq >= skb->len)
tom_eat_skb(sk, skb, 0);
}
tp->urg_data = TCP_URG_NOTYET;
tp->urg_seq = urg_seq;
}
/*
* Returns true if a socket cannot accept new Rx data.
*/
static inline int sk_no_receive(const struct sock *sk)
{
return (sk->sk_shutdown & RCV_SHUTDOWN);
}
/*
* Process an urgent data notification.
*/
static void rx_urg_notify(struct sock *sk, struct sk_buff *skb)
{
struct cpl_rx_urg_notify *hdr = cplhdr(skb);
if (!sk_no_receive(sk))
handle_urg_ptr(sk, ntohl(hdr->seq));
__kfree_skb(skb);
}
/*
* Handler for RX_URG_NOTIFY CPL messages.
*/
static int do_rx_urg_notify(struct t3cdev *cdev, struct sk_buff *skb, void *ctx)
{
struct sock *sk = (struct sock *)ctx;
VALIDATE_SOCK(sk);
process_cpl_msg(rx_urg_notify, sk, skb);
return 0;
}
/*
* A helper function that aborts a connection and increments the given MIB
* counter. The supplied skb is used to generate the ABORT_REQ message if
* possible. Must be called with softirqs disabled.
*/
static inline void abort_conn(struct sock *sk, struct sk_buff *skb, int mib)
{
struct sk_buff *abort_skb;
abort_skb = __get_cpl_reply_skb(skb, sizeof(struct cpl_abort_req),
GFP_ATOMIC);
if (abort_skb) {
T3_NET_INC_STATS_BH(sock_net(sk), mib);
t3_send_reset(sk, CPL_ABORT_SEND_RST, abort_skb);
}
}
/*
* Returns true if we need to explicitly request RST when we receive new data
* on an RX-closed connection.
*/
static inline int need_rst_on_excess_rx(const struct sock *sk)
{
return 1;
}
/*
* Handles Rx data that arrives in a state where the socket isn't accepting
* new data.
*/
static void handle_excess_rx(struct sock *sk, struct sk_buff *skb)
{
if (need_rst_on_excess_rx(sk) && !cplios_flag(sk, CPLIOS_ABORT_SHUTDOWN))
abort_conn(sk, skb, LINUX_MIB_TCPABORTONDATA);
kfree_skb(skb); /* can't use __kfree_skb here */
}
/*
* Process a get_tcb_rpl as a DDP completion (similar to RX_DDP_COMPLETE)
* by getting the DDP offset from the TCB.
*/
static void tcb_rpl_as_ddp_complete(struct sock *sk, struct sk_buff *skb)
{
struct cpl_io_state *cplios = CPL_IO_STATE(sk);
struct tcp_sock *tp = tcp_sk(sk);
struct ddp_state *q = DDP_STATE(sk);
struct ddp_buf_state *bsp;
struct cpl_get_tcb_rpl *hdr;
unsigned int ddp_offset, dack, dack_mss;
u64 t;
__be64 *tcb;
if (unlikely(!(tp = tcp_sk(sk)))) {
kfree_skb(skb);
return;
}
/* Note that we only account for CPL_GET_TCB issued by the DDP code. We
* really need a cookie in order to dispatch the RPLs.
*/
q->get_tcb_count--;
/* It is possible that a previous CPL already invalidated UBUF DDP
* and moved the cur_buf idx and hence no further processing of this
* skb is required. However, the app might be sleeping on
* !q->get_tcb_count and we need to wake it up.
*/
if (q->cancel_ubuf && !t3_ddp_ubuf_pending(sk)) {
kfree_skb(skb);
if (!sock_flag(sk, SOCK_DEAD))
sk->sk_data_ready(sk, 0);
return;
}
bsp = &q->buf_state[q->cur_buf];
hdr = cplhdr(skb);
tcb = (__be64 *)(hdr + 1);
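/*
* The TCB is returned as 32-bit words packed into big-endian 64-bit
* quantities in reverse word order: 32-bit word W lives in 64-bit entry
* (31 - W) / 2, in the upper or lower half depending on parity.
*/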
if (q->cur_buf == 0) {
t = be64_to_cpu(tcb[(31 - W_TCB_RX_DDP_BUF0_OFFSET) / 2]);
ddp_offset = t >> (32 + S_TCB_RX_DDP_BUF0_OFFSET);
} else {
t = be64_to_cpu(tcb[(31 - W_TCB_RX_DDP_BUF1_OFFSET) / 2]);
ddp_offset = t >> S_TCB_RX_DDP_BUF1_OFFSET;
}
ddp_offset &= M_TCB_RX_DDP_BUF0_OFFSET;
t = be64_to_cpu(tcb[(31 - W_TCB_T_FLAGS1) / 2]);
dack = (t >> (32 + S_TF_DACK)) & 0x1;
t = be64_to_cpu(tcb[(31 - W_TCB_T_FLAGS2) / 2]);
dack_mss = (t >> (S_TF_DACK_MSS - 32)) & 0x1;
dack |= dack_mss << 1;
if (unlikely(dack != cplios->delack_mode)) {
cplios->delack_mode = dack;
cplios->delack_seq = tp->rcv_nxt;
}
#ifdef T3_TRACE
T3_TRACE4(TIDTB(sk),
"tcb_rpl_as_ddp_complete: seq 0x%x hwbuf %u ddp_offset %u delack_mode %u",
tp->rcv_nxt, q->cur_buf, ddp_offset, cplios->delack_mode);
#endif
#if 0
{
unsigned int ddp_flags, rcv_nxt, rx_hdr_offset, buf_idx;
t = be64_to_cpu(tcb[(31 - W_TCB_RX_DDP_FLAGS) / 2]);
ddp_flags = (t >> S_TCB_RX_DDP_FLAGS) & M_TCB_RX_DDP_FLAGS;
t = be64_to_cpu(tcb[(31 - W_TCB_RCV_NXT) / 2]);
rcv_nxt = t >> S_TCB_RCV_NXT;
rcv_nxt &= M_TCB_RCV_NXT;
t = be64_to_cpu(tcb[(31 - W_TCB_RX_HDR_OFFSET) / 2]);
rx_hdr_offset = t >> (32 + S_TCB_RX_HDR_OFFSET);
rx_hdr_offset &= M_TCB_RX_HDR_OFFSET;
T3_TRACE2(TIDTB(sk),
"tcb_rpl_as_ddp_complete: DDP FLAGS 0x%x dma up to 0x%x",
ddp_flags, rcv_nxt - rx_hdr_offset);
T3_TRACE4(TB(q),
"tcb_rpl_as_ddp_complete: rcvnxt 0x%x hwbuf %u cur_offset %u cancel %u",
tp->rcv_nxt, q->cur_buf, bsp->cur_offset, q->cancel_ubuf);
T3_TRACE3(TB(q),
"tcb_rpl_as_ddp_complete: TCB rcvnxt 0x%x hwbuf 0x%x ddp_offset %u",
rcv_nxt - rx_hdr_offset, ddp_flags, ddp_offset);
T3_TRACE2(TB(q),
"tcb_rpl_as_ddp_complete: flags0 0x%x flags1 0x%x",
q->buf_state[0].flags, q->buf_state[1].flags);
}
#endif
skb_ulp_ddp_offset(skb) = bsp->cur_offset;
bsp->cur_offset = ddp_offset;
skb->len = ddp_offset - skb_ulp_ddp_offset(skb);
if (unlikely(sk_no_receive(sk) && skb->len)) {
handle_excess_rx(sk, skb);
return;
}
#ifdef T3_TRACE
if ((int)skb->len < 0) {
T3_TRACE0(TIDTB(sk), "tcb_rpl_as_ddp_complete: neg len");
}
#endif
if (bsp->flags & DDP_BF_NOCOPY) {
#ifdef T3_TRACE
T3_TRACE0(TIDTB(sk),
"tcb_rpl_as_ddp_complete: CANCEL UBUF");
if (!q->cancel_ubuf && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
printk("!cancel_ubuf");
}
#endif
skb_ulp_ddp_flags(skb) = DDP_BF_PSH | DDP_BF_NOCOPY | 1;
bsp->flags &= ~(DDP_BF_NOCOPY|DDP_BF_NODATA);
q->cur_buf ^= 1;
} else if (bsp->flags & DDP_BF_NOFLIP) {
skb_ulp_ddp_flags(skb) = 1; /* always a kernel buffer */
/* now HW buffer carries a user buffer */
bsp->flags &= ~DDP_BF_NOFLIP;
bsp->flags |= DDP_BF_NOCOPY;
/* It is possible that the CPL_GET_TCB_RPL doesn't indicate
* any new data in which case we're done. If in addition the
* offset is 0, then there wasn't a completion for the kbuf
* and we need to decrement the posted count.
*/
if (!skb->len) {
if (!ddp_offset) {
q->kbuf_posted--;
bsp->flags |= DDP_BF_NODATA;
}
BUG_ON(skb->len);
kfree_skb(skb);
return;
}
} else {
/* This reply is for a CPL_GET_TCB_RPL to cancel the UBUF DDP,
* but it got here way late and nobody cares anymore.
*/
kfree_skb(skb);
return;
}
skb_gl_set(skb, bsp->gl);
ULP_SKB_CB(skb)->seq = tp->rcv_nxt;
tp->rcv_nxt += skb->len;
skb_reset_transport_header(skb);
tcp_hdr(skb)->fin = 0; /* changes original TCB */
inet_csk(sk)->icsk_ack.lrcvtime = tcp_time_stamp;
#ifdef T3_TRACE
T3_TRACE3(TIDTB(sk),
"tcb_rpl_as_ddp_complete: seq 0x%x hwbuf %u lskb->len %u",
ULP_SKB_CB(skb)->seq, q->cur_buf, skb->len);
#endif
__skb_queue_tail(&sk->sk_receive_queue, skb);
if (!sock_flag(sk, SOCK_DEAD))
sk->sk_data_ready(sk, 0);
}
/*
* Process a CPL_GET_TCB_RPL. These can also be generated by the DDP code,
* in which case they are handled like DDP completions.
*/
static int do_get_tcb_rpl(struct t3cdev *cdev, struct sk_buff *skb, void *ctx)
{
struct sock *sk = (struct sock *)ctx;
/* OK if socket doesn't exist */
if (!sk)
return CPL_RET_BUF_DONE;
process_cpl_msg(tcb_rpl_as_ddp_complete, sk, skb);
return 0;
}
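/*
* Handle data that HW placed directly into host memory (DDP) ahead of an
* RX_DATA message: any bytes between the expected sequence number and the
* sequence number in the RX_DATA header were DDPed, so synthesize a
* header-only clone describing that placement and queue it ahead of the
* RX_DATA payload.
*/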
static void handle_ddp_data(struct sock *sk, struct sk_buff *origskb)
{
struct cpl_io_state *cplios = CPL_IO_STATE(sk);
struct tcp_sock *tp = tcp_sk(sk);
struct ddp_state *q;
struct ddp_buf_state *bsp;
struct cpl_rx_data *hdr = cplhdr(origskb);
unsigned int rcv_nxt = ntohl(hdr->seq);
struct sk_buff *skb;
/* If the sequence number in the RX_DATA header is not beyond the expected
 * sequence number, DDP has placed no new data and the assumptions that
 * follow do not apply.
 */
if (tp->rcv_nxt >= rcv_nxt)
return;
q = DDP_STATE(sk);
if (!q->ddp_setup)
return;
skb = skb_clone(origskb, GFP_ATOMIC);
if (!skb)
return;
bsp = &q->buf_state[q->cur_buf];
/* We assume that the data placed into host memory by DDP covers the gap
 * between the expected sequence number and the sequence number in the
 * RX_DATA header.  The check above ensures this difference is positive,
 * so the computed skb->len cannot go negative.
 */
skb->len = rcv_nxt - tp->rcv_nxt;
#ifdef T3_TRACE
if ((int)skb->len < 0) {
T3_TRACE0(TIDTB(sk), "handle_ddp_data: neg len");
}
#endif
skb_gl_set(skb, bsp->gl);
skb_ulp_ddp_offset(skb) = bsp->cur_offset;
skb_ulp_ddp_flags(skb) =
DDP_BF_PSH | (bsp->flags & DDP_BF_NOCOPY) | 1;
if (bsp->flags & DDP_BF_NOCOPY)
bsp->flags &= ~DDP_BF_NOCOPY;
if (unlikely(hdr->dack_mode != cplios->delack_mode)) {
cplios->delack_mode = hdr->dack_mode;
cplios->delack_seq = tp->rcv_nxt;
}
ULP_SKB_CB(skb)->seq = tp->rcv_nxt;
tp->rcv_nxt = rcv_nxt;
bsp->cur_offset += skb->len;
if (!(bsp->flags & DDP_BF_NOFLIP))
q->cur_buf ^= 1;
inet_csk(sk)->icsk_ack.lrcvtime = tcp_time_stamp;
__skb_queue_tail(&sk->sk_receive_queue, skb);
/* For now, don't re-enable DDP after a connection fell out of DDP
* mode.
*/
q->ubuf_ddp_ready = 0;
}
/*
* Process new data received for a connection.
*/
static void new_rx_data(struct sock *sk, struct sk_buff *skb)
{
struct cpl_io_state *cplios = CPL_IO_STATE(sk);
struct cpl_rx_data *hdr = cplhdr(skb);
struct tcp_sock *tp = tcp_sk(sk);
if (unlikely(sk_no_receive(sk))) {
handle_excess_rx(sk, skb);
return;
}
if (cplios->ulp_mode == ULP_MODE_TCPDDP)
handle_ddp_data(sk, skb);
ULP_SKB_CB(skb)->seq = ntohl(hdr->seq);
ULP_SKB_CB(skb)->flags = 0;
skb_ulp_mode(skb) = ULP_MODE_NONE; /* for iSCSI */
skb_ulp_ddp_flags(skb) = 0; /* for DDP */
#if VALIDATE_SEQ
if (unlikely(ULP_SKB_CB(skb)->seq != tp->rcv_nxt)) {
printk(KERN_ERR
"%s: TID %u: Bad sequence number %u, expected %u\n",
cplios->toedev->name, cplios->tid, ULP_SKB_CB(skb)->seq,
tp->rcv_nxt);
__kfree_skb(skb);
return;
}
#endif
skb_reset_transport_header(skb);
__skb_pull(skb, sizeof(*hdr));
if (!skb->data_len)
__skb_trim(skb, ntohs(hdr->len));
if (unlikely(hdr->urg))
handle_urg_ptr(sk, tp->rcv_nxt + ntohs(hdr->urg));
if (unlikely(tp->urg_data == TCP_URG_NOTYET &&
tp->urg_seq - tp->rcv_nxt < skb->len))
tp->urg_data = TCP_URG_VALID | skb->data[tp->urg_seq -
tp->rcv_nxt];
if (unlikely(hdr->dack_mode != cplios->delack_mode)) {
cplios->delack_mode = hdr->dack_mode;
cplios->delack_seq = tp->rcv_nxt;
}
tcp_hdr(skb)->fin = 0; /* modifies original hdr->urg */
tp->rcv_nxt += skb->len;
#ifdef T3_TRACE
T3_TRACE2(TIDTB(sk),
"new_rx_data: seq 0x%x len %u",
ULP_SKB_CB(skb)->seq, skb->len);
#endif
inet_csk(sk)->icsk_ack.lrcvtime = tcp_time_stamp;
__skb_queue_tail(&sk->sk_receive_queue, skb);
if (!sock_flag(sk, SOCK_DEAD)) {
check_sk_callbacks(sk);
sk->sk_data_ready(sk, 0);
}
}
/*
* Handler for RX_DATA CPL messages.
*/
static int do_rx_data(struct t3cdev *cdev, struct sk_buff *skb, void *ctx)
{
struct sock *sk = (struct sock *)ctx;
VALIDATE_SOCK(sk);
skb_gl_set(skb, NULL); /* indicates packet is RX_DATA */
process_cpl_msg(new_rx_data, sk, skb);
return 0;
}
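/*
* Process an RX_DATA_DDP message: the payload has already been placed into a
* DDP buffer, so we only update sequence numbers and queue an skb that
* describes where the data landed.
*/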
static void new_rx_data_ddp(struct sock *sk, struct sk_buff *skb)
{
struct cpl_io_state *cplios = CPL_IO_STATE(sk);
struct tcp_sock *tp;
struct ddp_state *q;
struct ddp_buf_state *bsp;
struct cpl_rx_data_ddp *hdr;
unsigned int ddp_len, rcv_nxt, ddp_report, end_offset, buf_idx;
unsigned int nomoredata=0;
unsigned int delack_mode;
if (unlikely(sk_no_receive(sk))) {
handle_excess_rx(sk, skb);
return;
}
tp = tcp_sk(sk);
q = DDP_STATE(sk);
hdr = cplhdr(skb);
ddp_report = ntohl(hdr->ddp_report);
buf_idx = (ddp_report >> S_DDP_BUF_IDX) & 1;
bsp = &q->buf_state[buf_idx];
#ifdef T3_TRACE
T3_TRACE5(TIDTB(sk),
"new_rx_data_ddp: tp->rcv_nxt 0x%x cur_offset %u "
"hdr seq 0x%x len %u offset %u",
tp->rcv_nxt, bsp->cur_offset, ntohl(hdr->seq),
ntohs(hdr->len), G_DDP_OFFSET(ddp_report));
T3_TRACE1(TIDTB(sk),
"new_rx_data_ddp: ddp_report 0x%x",
ddp_report);
#endif
ddp_len = ntohs(hdr->len);
rcv_nxt = ntohl(hdr->seq) + ddp_len;
delack_mode = G_DDP_DACK_MODE(ddp_report);
if (unlikely(G_DDP_DACK_MODE(ddp_report) != cplios->delack_mode)) {
cplios->delack_mode = delack_mode;
cplios->delack_seq = tp->rcv_nxt;
}
ULP_SKB_CB(skb)->seq = tp->rcv_nxt;
tp->rcv_nxt = rcv_nxt;
/*
* Store the length in skb->len. We are changing the meaning of
* skb->len here, we need to be very careful that nothing from now on
* interprets ->len of this packet the usual way.
*/
skb->len = tp->rcv_nxt - ULP_SKB_CB(skb)->seq;
/*
* Figure out where the new data was placed in the buffer and record it in
* the skb's DDP offset.  Assumes the buffer offset starts at 0; the consumer
* needs to account for the page pod's pg_offset.
*/
end_offset = G_DDP_OFFSET(ddp_report) + ddp_len;
skb_ulp_ddp_offset(skb) = end_offset - skb->len;
/*
* We store in mac.raw the address of the gather list where the
* placement happened.
*/
skb_gl_set(skb, bsp->gl);
bsp->cur_offset = end_offset;
/*
* Bit 0 of DDP flags stores whether the DDP buffer is completed.
* Note that other parts of the code depend on this being in bit 0.
*/
if ((bsp->flags & DDP_BF_NOINVAL) && end_offset != bsp->gl->length) {
skb_ulp_ddp_flags(skb) = 0; /* potential spurious completion */
BUG_ON(1);
} else {
skb_ulp_ddp_flags(skb) = !!(ddp_report & F_DDP_BUF_COMPLETE);
if (skb_ulp_ddp_flags(skb) && !(bsp->flags & DDP_BF_NOFLIP)) {
q->cur_buf ^= 1; /* flip buffers */
if (end_offset < q->kbuf[0]->length)
nomoredata=1;
}
}
if (bsp->flags & DDP_BF_NOCOPY) {
skb_ulp_ddp_flags(skb) |= (bsp->flags & DDP_BF_NOCOPY);
bsp->flags &= ~DDP_BF_NOCOPY;
}
if (ddp_report & F_DDP_PSH)
skb_ulp_ddp_flags(skb) |= DDP_BF_PSH;
if (nomoredata)
skb_ulp_ddp_flags(skb) |= DDP_BF_NODATA;
skb_reset_transport_header(skb);
tcp_hdr(skb)->fin = 0; /* changes original hdr->ddp_report */
inet_csk(sk)->icsk_ack.lrcvtime = tcp_time_stamp;
__skb_queue_tail(&sk->sk_receive_queue, skb);
if (!sock_flag(sk, SOCK_DEAD))
sk->sk_data_ready(sk, 0);
}
#define DDP_ERR (F_DDP_PPOD_MISMATCH | F_DDP_LLIMIT_ERR | F_DDP_ULIMIT_ERR |\
F_DDP_PPOD_PARITY_ERR | F_DDP_PADDING_ERR | F_DDP_OFFSET_ERR |\
F_DDP_INVALID_TAG | F_DDP_COLOR_ERR | F_DDP_TID_MISMATCH |\
F_DDP_INVALID_PPOD)
/*
* Handler for RX_DATA_DDP CPL messages.
*/
static int do_rx_data_ddp(struct t3cdev *cdev, struct sk_buff *skb, void *ctx)
{
struct sock *sk = ctx;
const struct cpl_rx_data_ddp *hdr = cplhdr(skb);
VALIDATE_SOCK(sk);
if (unlikely(ntohl(hdr->ddpvld_status) & DDP_ERR)) {
printk(KERN_ERR "RX_DATA_DDP for TID %u reported error 0x%x\n",
GET_TID(hdr), G_DDP_VALID(ntohl(hdr->ddpvld_status)));
return CPL_RET_BUF_DONE;
}
process_cpl_msg(new_rx_data_ddp, sk, skb);
return 0;
}
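/*
* Process an RX_DDP_COMPLETE message: a DDP buffer has been completed, so
* compute how many bytes were placed since the last report and queue an skb
* describing them.
*/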
static void process_ddp_complete(struct sock *sk, struct sk_buff *skb)
{
struct cpl_io_state *cplios = CPL_IO_STATE(sk);
struct tcp_sock *tp = tcp_sk(sk);
struct ddp_state *q;
struct ddp_buf_state *bsp;
struct cpl_rx_ddp_complete *hdr;
unsigned int ddp_report, buf_idx;
unsigned int nomoredata=0;
unsigned int delack_mode;
if (unlikely(sk_no_receive(sk))) {
handle_excess_rx(sk, skb);
return;
}
tp = tcp_sk(sk);
q = DDP_STATE(sk);
hdr = cplhdr(skb);
ddp_report = ntohl(hdr->ddp_report);
buf_idx = (ddp_report >> S_DDP_BUF_IDX) & 1;
bsp = &q->buf_state[buf_idx];
skb_ulp_ddp_offset(skb) = bsp->cur_offset;
skb->len = G_DDP_OFFSET(ddp_report) - skb_ulp_ddp_offset(skb);
#ifdef T3_TRACE
T3_TRACE5(TIDTB(sk),
"process_ddp_complete: tp->rcv_nxt 0x%x cur_offset %u "
"ddp_report 0x%x offset %u, len %u",
tp->rcv_nxt, bsp->cur_offset, ddp_report,
G_DDP_OFFSET(ddp_report), skb->len);
#endif
bsp->cur_offset += skb->len;
if (!(bsp->flags & DDP_BF_NOFLIP)) {
q->cur_buf ^= 1; /* flip buffers */
if (G_DDP_OFFSET(ddp_report) < q->kbuf[0]->length)
nomoredata = 1;
}
#ifdef T3_TRACE
T3_TRACE4(TIDTB(sk),
"process_ddp_complete: tp->rcv_nxt 0x%x cur_offset %u "
"ddp_report %u offset %u",
tp->rcv_nxt, bsp->cur_offset, ddp_report,
G_DDP_OFFSET(ddp_report));
#endif
skb_gl_set(skb, bsp->gl);
skb_ulp_ddp_flags(skb) = (bsp->flags & DDP_BF_NOCOPY) | 1;
if (bsp->flags & DDP_BF_NOCOPY)
bsp->flags &= ~DDP_BF_NOCOPY;
if (nomoredata)
skb_ulp_ddp_flags(skb) |= DDP_BF_NODATA;
delack_mode = G_DDP_DACK_MODE(ddp_report);
if (unlikely(delack_mode != cplios->delack_mode)) {
cplios->delack_mode = delack_mode;
cplios->delack_seq = tp->rcv_nxt;
}
ULP_SKB_CB(skb)->seq = tp->rcv_nxt;
tp->rcv_nxt += skb->len;
skb_reset_transport_header(skb);
tcp_hdr(skb)->fin = 0; /* changes valid memory past CPL */
inet_csk(sk)->icsk_ack.lrcvtime = tcp_time_stamp;
__skb_queue_tail(&sk->sk_receive_queue, skb);
if (!sock_flag(sk, SOCK_DEAD))
sk->sk_data_ready(sk, 0);
}
/*
* Handler for RX_DDP_COMPLETE CPL messages.
*/
static int do_rx_ddp_complete(struct t3cdev *cdev, struct sk_buff *skb,
void *ctx)
{
struct sock *sk = ctx;
VALIDATE_SOCK(sk);
process_cpl_msg(process_ddp_complete, sk, skb);
return 0;
}
/*
* Move a socket to TIME_WAIT state. We need to make some adjustments to the
* socket state before calling tcp_time_wait to comply with its expectations.
*/
static void enter_timewait(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
/*
* Bump rcv_nxt for the peer FIN. We don't do this at the time we
* process peer_close because we don't want to carry the peer FIN in
* the socket's receive queue and if we increment rcv_nxt without
* having the FIN in the receive queue we'll confuse facilities such
* as SIOCINQ.
*/
tp->rcv_nxt++;
TS_RECENT_STAMP(tp) = 0; /* defeat recycling */
tp->srtt = 0; /* defeat tcp_update_metrics */
tcp_time_wait(sk, TCP_TIME_WAIT, 0); /* calls tcp_done */
}
/*
* For TCP DDP a PEER_CLOSE may also be an implicit RX_DDP_COMPLETE. This
* function deals with the data that may be reported along with the FIN.
* Returns -1 if no further processing of the PEER_CLOSE is needed, >= 0 to
* perform normal FIN-related processing. In the latter case 1 indicates that
* there was an implicit RX_DDP_COMPLETE and the skb should not be freed,
* while 0 means the skb can be freed.
*/
static int handle_peer_close_data(struct sock *sk, struct sk_buff *skb)
{
struct tcp_sock *tp = tcp_sk(sk);
struct ddp_state *q;
struct ddp_buf_state *bsp;
struct cpl_peer_close *req = cplhdr(skb);
unsigned int rcv_nxt = ntohl(req->rcv_nxt) - 1; /* exclude FIN */
if (tp->rcv_nxt == rcv_nxt) /* no data */
return 0;
if (unlikely(sk_no_receive(sk))) {
handle_excess_rx(sk, skb);
/*
* Although we discard the data we want to process the FIN so
* that PEER_CLOSE + data behaves the same as RX_DATA_DDP +
* PEER_CLOSE without data. In particular this PEER_CLOSE
* may be what will close the connection. We return 1 because
* handle_excess_rx() already freed the packet.
*/
return 1;
}
q = DDP_STATE(sk);
bsp = &q->buf_state[q->cur_buf];
skb->len = rcv_nxt - tp->rcv_nxt;
skb_gl_set(skb, bsp->gl);
skb_ulp_ddp_offset(skb) = bsp->cur_offset;
skb_ulp_ddp_flags(skb) =
DDP_BF_PSH | (bsp->flags & DDP_BF_NOCOPY) | 1;
ULP_SKB_CB(skb)->seq = tp->rcv_nxt;
tp->rcv_nxt = rcv_nxt;
bsp->cur_offset += skb->len;
if (!(bsp->flags & DDP_BF_NOFLIP))
q->cur_buf ^= 1;
skb_reset_transport_header(skb);
tcp_hdr(skb)->fin = 0; /* changes valid memory past CPL */
inet_csk(sk)->icsk_ack.lrcvtime = tcp_time_stamp;
__skb_queue_tail(&sk->sk_receive_queue, skb);
if (!sock_flag(sk, SOCK_DEAD))
sk->sk_data_ready(sk, 0);
return 1;
}
/*
* Handle a peer FIN.
*/
static void do_peer_fin(struct sock *sk, struct sk_buff *skb)
{
struct cpl_io_state *cplios = CPL_IO_STATE(sk);
int keep = 0, dead = sock_flag(sk, SOCK_DEAD);
#ifdef T3_TRACE
T3_TRACE0(TIDTB(sk),"do_peer_fin:");
#endif
if (!is_t3a(cplios->toedev) &&
cplios_flag(sk, CPLIOS_ABORT_RPL_PENDING))
goto out;
if (cplios->ulp_mode == ULP_MODE_TCPDDP) {
keep = handle_peer_close_data(sk, skb);
if (keep < 0)
return;
}
sk->sk_shutdown |= RCV_SHUTDOWN;
sock_set_flag(sk, SOCK_DONE);
switch (sk->sk_state) {
case TCP_SYN_RECV:
case TCP_ESTABLISHED:
tcp_set_state(sk, TCP_CLOSE_WAIT);
break;
case TCP_FIN_WAIT1:
tcp_set_state(sk, TCP_CLOSING);
break;
case TCP_FIN_WAIT2:
/*
* If we've sent an abort_req we must have sent it too late,
* HW will send us a reply telling us so, and this peer_close
* is really the last message for this connection and needs to
* be treated as an abort_rpl, i.e., transition the connection
* to TCP_CLOSE (note that the host stack does this at the
* time of generating the RST but we must wait for HW).
* Otherwise we enter TIME_WAIT.
*/
t3_release_offload_resources(sk);
if (cplios_flag(sk, CPLIOS_ABORT_RPL_PENDING))
connection_done(sk);
else
enter_timewait(sk);
break;
default:
printk(KERN_ERR
"%s: TID %u received PEER_CLOSE in bad state %d\n",
cplios->toedev->name, cplios->tid, sk->sk_state);
}
if (!dead) {
sk->sk_state_change(sk);
/* Do not send POLL_HUP for half duplex close. */
if ((sk->sk_shutdown & SEND_SHUTDOWN) ||
sk->sk_state == TCP_CLOSE)
sk_wake_async(sk, 1, POLL_HUP);
else
sk_wake_async(sk, 1, POLL_IN);
}
out: if (!keep)
__kfree_skb(skb);
}
/*
* Handler for PEER_CLOSE CPL messages.
*/
static int do_peer_close(struct t3cdev *cdev, struct sk_buff *skb, void *ctx)
{
struct sock *sk = (struct sock *)ctx;
VALIDATE_SOCK(sk);
process_cpl_msg_ref(do_peer_fin, sk, skb);
return 0;
}
/*
* Process a peer ACK to our FIN.
*/
static void process_close_con_rpl(struct sock *sk, struct sk_buff *skb)
{
struct cpl_io_state *cplios = CPL_IO_STATE(sk);
struct tcp_sock *tp = tcp_sk(sk);
struct cpl_close_con_rpl *rpl = cplhdr(skb);
tp->snd_una = ntohl(rpl->snd_nxt) - 1; /* exclude FIN */
if (!is_t3a(cplios->toedev) && cplios_flag(sk, CPLIOS_ABORT_RPL_PENDING))
goto out;
switch (sk->sk_state) {
case TCP_CLOSING: /* see FIN_WAIT2 case in do_peer_fin */
t3_release_offload_resources(sk);
if (cplios_flag(sk, CPLIOS_ABORT_RPL_PENDING))
connection_done(sk);
else
enter_timewait(sk);
break;
case TCP_LAST_ACK:
/*
* In this state we don't care about pending abort_rpl.
* If we've sent abort_req it was post-close and was sent too
* late, this close_con_rpl is the actual last message.
*/
t3_release_offload_resources(sk);
connection_done(sk);
break;
case TCP_FIN_WAIT1:
tcp_set_state(sk, TCP_FIN_WAIT2);
sk->sk_shutdown |= SEND_SHUTDOWN;
dst_confirm(sk->sk_dst_cache);
if (!sock_flag(sk, SOCK_DEAD))
sk->sk_state_change(sk); // Wake up lingering close()
else if (tcp_sk(sk)->linger2 < 0 &&
!cplios_flag(sk, CPLIOS_ABORT_SHUTDOWN))
abort_conn(sk, skb, LINUX_MIB_TCPABORTONLINGER);
break;
default:
printk(KERN_ERR
"%s: TID %u received CLOSE_CON_RPL in bad state %d\n",
cplios->toedev->name, cplios->tid, sk->sk_state);
}
out: kfree_skb(skb); /* can't use __kfree_skb here */
}
/*
* Handler for CLOSE_CON_RPL CPL messages.
*/
static int do_close_con_rpl(struct t3cdev *cdev, struct sk_buff *skb,
void *ctx)
{
struct sock *sk = (struct sock *)ctx;
VALIDATE_SOCK(sk);
process_cpl_msg_ref(process_close_con_rpl, sk, skb);
return 0;
}
/*
* Process abort replies. We only process these messages if we anticipate
* them as the coordination between SW and HW in this area is somewhat lacking
* and sometimes we get ABORT_RPLs after we are done with the connection that
* originated the ABORT_REQ.
*/
static void process_abort_rpl(struct sock *sk, struct sk_buff *skb)
{
struct cpl_io_state *cplios = CPL_IO_STATE(sk);
#ifdef T3_TRACE
T3_TRACE1(TIDTB(sk),
"process_abort_rpl: GTS rpl pending %d",
cplios_flag(sk, CPLIOS_ABORT_RPL_PENDING));
#endif
if (cplios_flag(sk, CPLIOS_ABORT_RPL_PENDING)) {
if (!cplios_flag(sk, CPLIOS_ABORT_RPL_RCVD) &&
!is_t3a(cplios->toedev))
cplios_set_flag(sk, CPLIOS_ABORT_RPL_RCVD);
else {
cplios_reset_flag(sk, CPLIOS_ABORT_RPL_RCVD);
cplios_reset_flag(sk, CPLIOS_ABORT_RPL_PENDING);
if (!cplios_flag(sk, CPLIOS_ABORT_REQ_RCVD) ||
!is_t3a(cplios->toedev)) {
BUG_ON(cplios_flag(sk, CPLIOS_ABORT_REQ_RCVD));
t3_release_offload_resources(sk);
connection_done(sk);
}
}
}
__kfree_skb(skb);
}
/*
* Handle an ABORT_RPL_RSS CPL message.
*/
static int do_abort_rpl(struct t3cdev *cdev, struct sk_buff *skb, void *ctx)
{
struct sock *sk;
struct cpl_abort_rpl_rss *rpl = cplhdr(skb);
/*
* Ignore replies to post-close aborts indicating that the abort was
* requested too late. These connections are terminated when we get
* PEER_CLOSE or CLOSE_CON_RPL and by the time the abort_rpl_rss
* arrives the TID is either no longer used or it has been recycled.
*/
if (rpl->status == CPL_ERR_ABORT_FAILED) {
discard:
__kfree_skb(skb);
return 0;
}
sk = (struct sock *)ctx;
/*
* Sometimes we've already closed the socket, e.g., a post-close
* abort races with ABORT_REQ_RSS, the latter frees the socket
* expecting the ABORT_REQ will fail with CPL_ERR_ABORT_FAILED,
* but FW turns the ABORT_REQ into a regular one and so we get
* ABORT_RPL_RSS with status 0 and no socket. Only on T3A.
*/
if (!sk)
goto discard;
process_cpl_msg_ref(process_abort_rpl, sk, skb);
return 0;
}
/*
* Convert the status code of an ABORT_REQ into a Linux error code. Also
* indicate whether RST should be sent in response.
*/
static int abort_status_to_errno(struct sock *sk, int abort_reason,
int *need_rst)
{
switch (abort_reason) {
case CPL_ERR_BAD_SYN:
T3_NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONSYN);
// fall through
case CPL_ERR_CONN_RESET:
// XXX need to handle SYN_RECV due to crossed SYNs
return sk->sk_state == TCP_CLOSE_WAIT ? EPIPE : ECONNRESET;
case CPL_ERR_XMIT_TIMEDOUT:
case CPL_ERR_PERSIST_TIMEDOUT:
case CPL_ERR_FINWAIT2_TIMEDOUT:
case CPL_ERR_KEEPALIVE_TIMEDOUT:
T3_NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONTIMEOUT);
return ETIMEDOUT;
default:
return EIO;
}
}
static inline void set_abort_rpl_wr(struct sk_buff *skb, unsigned int tid,
int cmd)
{
struct cpl_abort_rpl *rpl = cplhdr(skb);
rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_RPL));
rpl->wr.wr_lo = htonl(V_WR_TID(tid));
OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_ABORT_RPL, tid));
rpl->cmd = cmd;
}
static void send_deferred_abort_rpl(struct toedev *tdev, struct sk_buff *skb)
{
struct sk_buff *reply_skb;
struct cpl_abort_req_rss *req = cplhdr(skb);
reply_skb = alloc_skb_nofail(sizeof(struct cpl_abort_rpl));
reply_skb->priority = CPL_PRIORITY_DATA;
__skb_put(reply_skb, sizeof(struct cpl_abort_rpl));
set_abort_rpl_wr(reply_skb, GET_TID(req), req->status);
cxgb3_ofld_send(TOM_DATA(tdev)->cdev, reply_skb);
kfree_skb(skb);
}
/*
* Returns whether an ABORT_REQ_RSS message is a negative advice.
*/
static inline int is_neg_adv_abort(unsigned int status)
{
return status == CPL_ERR_RTX_NEG_ADVICE ||
status == CPL_ERR_PERSIST_NEG_ADVICE;
}
static void send_abort_rpl(struct sk_buff *skb, struct toedev *tdev,
int rst_status)
{
struct sk_buff *reply_skb;
struct cpl_abort_req_rss *req = cplhdr(skb);
reply_skb = get_cpl_reply_skb(skb, sizeof(struct cpl_abort_rpl),
gfp_any());
if (!reply_skb) {
/* Defer the reply. Stick rst_status into req->status. */
req->status = rst_status;
t3_defer_reply(skb, tdev, send_deferred_abort_rpl);
return;
}
reply_skb->priority = CPL_PRIORITY_DATA;
set_abort_rpl_wr(reply_skb, GET_TID(req), rst_status);
kfree_skb(skb); /* can't use __kfree_skb here */
/*
* XXX need to sync with ARP as for SYN_RECV connections we can send
* these messages while ARP is pending. For other connection states
* it's not a problem.
*/
cxgb3_ofld_send(TOM_DATA(tdev)->cdev, reply_skb);
}
static void cleanup_syn_rcv_conn(struct sock *child, struct sock *parent)
{
struct request_sock *req = child->sk_user_data;
inet_csk_reqsk_queue_removed(parent, req);
synq_remove(child);
__reqsk_free(req);
child->sk_user_data = NULL;
}
/*
* Performs the actual work to abort a SYN_RECV connection.
*/
static void do_abort_syn_rcv(struct sock *child, struct sock *parent)
{
/*
* If the server is still open we clean up the child connection,
* otherwise the server already did the clean up as it was purging
* its SYN queue and the skb was just sitting in its backlog.
*/
if (likely(parent->sk_state == TCP_LISTEN)) {
cleanup_syn_rcv_conn(child, parent);
t3_release_offload_resources(child);
connection_done(child);
}
}
/*
* This is run from a listener's backlog to abort a child connection in
* SYN_RCV state (i.e., one on the listener's SYN queue).
*/
static void bl_abort_syn_rcv(struct sock *lsk, struct sk_buff *skb)
{
struct sock *child = skb->sk;
skb->sk = NULL;
do_abort_syn_rcv(child, lsk);
send_abort_rpl(skb, BLOG_SKB_CB(skb)->dev, CPL_ABORT_NO_RST);
}
/*
* Handle abort requests for a SYN_RECV connection. These need extra work
* because the socket is on its parent's SYN queue.
*/
static int abort_syn_rcv(struct sock *sk, struct sk_buff *skb)
{
struct sock *parent;
struct toedev *tdev = CPL_IO_STATE(sk)->toedev;
struct t3cdev *cdev = TOM_DATA(tdev)->cdev;
const struct request_sock *oreq = sk->sk_user_data;
struct t3c_tid_entry *t3c_stid;
struct tid_info *t;
if (!oreq)
return -1; /* somehow we are not on the SYN queue */
t = &(T3C_DATA(cdev))->tid_maps;
t3c_stid = lookup_stid(t, oreq->ts_recent);
parent = ((struct listen_ctx *)t3c_stid->ctx)->lsk;
bh_lock_sock(parent);
if (!sock_owned_by_user(parent)) {
do_abort_syn_rcv(sk, parent);
send_abort_rpl(skb, tdev, CPL_ABORT_NO_RST);
} else {
skb->sk = sk;
BLOG_SKB_CB(skb)->backlog_rcv = bl_abort_syn_rcv;
__sk_add_backlog(parent, skb);
}
bh_unlock_sock(parent);
return 0;
}
/*
* Process abort requests. If we are waiting for an ABORT_RPL we ignore this
* request except that we need to reply to it.
*/
static void process_abort_req(struct sock *sk, struct sk_buff *skb)
{
int rst_status = CPL_ABORT_NO_RST;
const struct cpl_abort_req_rss *req = cplhdr(skb);
if (!cplios_flag(sk, CPLIOS_ABORT_REQ_RCVD)) {
cplios_set_flag(sk, CPLIOS_ABORT_REQ_RCVD);
cplios_set_flag(sk, CPLIOS_ABORT_SHUTDOWN);
__kfree_skb(skb);
return;
}
cplios_reset_flag(sk, CPLIOS_ABORT_REQ_RCVD);
/*
* Three cases to consider:
* a) We haven't sent an abort_req; close the connection.
* b) We have sent a post-close abort_req that will get to TP too late
* and will generate a CPL_ERR_ABORT_FAILED reply. The reply will
* be ignored and the connection should be closed now.
* c) We have sent a regular abort_req that will get to TP too late.
* That will generate an abort_rpl with status 0, wait for it.
*/
if (!cplios_flag(sk, CPLIOS_ABORT_RPL_PENDING) ||
(is_t3a(CPL_IO_STATE(sk)->toedev) &&
cplios_flag(sk, CPLIOS_CLOSE_CON_REQUESTED))) {
sk->sk_err = abort_status_to_errno(sk, req->status,
&rst_status);
if (!sock_flag(sk, SOCK_DEAD))
sk->sk_error_report(sk);
/*
* SYN_RECV needs special processing. If abort_syn_rcv()
* returns 0 it has taken care of the abort.
*/
if (sk->sk_state == TCP_SYN_RECV && !abort_syn_rcv(sk, skb))
return;
t3_release_offload_resources(sk);
connection_done(sk);
}
send_abort_rpl(skb, BLOG_SKB_CB(skb)->dev, rst_status);
}
/*
* Handle an ABORT_REQ_RSS CPL message.
*/
static int do_abort_req(struct t3cdev *cdev, struct sk_buff *skb, void *ctx)
{
const struct cpl_abort_req_rss *req = cplhdr(skb);
struct sock *sk = (struct sock *)ctx;
if (is_neg_adv_abort(req->status)) {
__kfree_skb(skb);
return 0;
}
VALIDATE_SOCK(sk);
/*
* Save the offload device in the skb, we may process this message
* after the socket has closed.
*/
BLOG_SKB_CB(skb)->dev = CPL_IO_STATE(sk)->toedev;
process_cpl_msg_ref(process_abort_req, sk, skb);
return 0;
}
static void pass_open_abort(struct sock *child, struct sock *parent,
struct sk_buff *skb)
{
struct toedev *tdev = BLOG_SKB_CB(skb)->dev;
do_abort_syn_rcv(child, parent);
if (tdev->ttid == TOE_ID_CHELSIO_T3) {
struct cpl_pass_accept_rpl *rpl = cplhdr(skb);
rpl->opt0h = htonl(F_TCAM_BYPASS);
rpl->opt0l_status = htonl(CPL_PASS_OPEN_REJECT);
cxgb3_ofld_send(TOM_DATA(tdev)->cdev, skb);
} else
kfree_skb(skb);
}
/*
* Runs from a listener's backlog to abort a child connection that had an
* ARP failure.
*/
static void bl_pass_open_abort(struct sock *lsk, struct sk_buff *skb)
{
pass_open_abort(skb->sk, lsk, skb);
}
static void handle_pass_open_arp_failure(struct sock *sk, struct sk_buff *skb)
{
struct t3cdev *cdev;
struct sock *parent;
const struct request_sock *oreq;
struct t3c_tid_entry *t3c_stid;
struct tid_info *t;
/*
* If the connection is being aborted due to the parent listening
* socket going away there's nothing to do, the ABORT_REQ will close
* the connection.
*/
if (cplios_flag(sk, CPLIOS_ABORT_RPL_PENDING)) {
kfree_skb(skb);
return;
}
oreq = sk->sk_user_data;
cdev = T3C_DEV(sk);
t = &(T3C_DATA(cdev))->tid_maps;
t3c_stid = lookup_stid(t, oreq->ts_recent);
parent = ((struct listen_ctx *)t3c_stid->ctx)->lsk;
bh_lock_sock(parent);
if (!sock_owned_by_user(parent))
pass_open_abort(sk, parent, skb);
else {
BLOG_SKB_CB(skb)->backlog_rcv = bl_pass_open_abort;
__sk_add_backlog(parent, skb);
}
bh_unlock_sock(parent);
}
/*
* Handle an ARP failure for a CPL_PASS_ACCEPT_RPL. This is treated similarly
* to an ABORT_REQ_RSS in SYN_RECV as both events need to tear down a SYN_RECV
* connection.
*/
static void pass_accept_rpl_arp_failure(struct t3cdev *cdev, struct sk_buff *skb)
{
T3_TCP_INC_STATS_BH(sock_net(skb->sk), TCP_MIB_ATTEMPTFAILS);
BLOG_SKB_CB(skb)->dev = CPL_IO_STATE(skb->sk)->toedev;
process_cpl_msg_ref(handle_pass_open_arp_failure, skb->sk, skb);
}
#if defined(ROUTE_REQ)
static struct dst_entry *route_req(struct sock *sk, struct open_request *req)
{
struct rtable *rt;
struct flowi fl = { .oif = sk->sk_bound_dev_if,
.nl_u = { .ip4_u =
{ .daddr = req->af.v4_req.rmt_addr,
.saddr = req->af.v4_req.loc_addr,
.tos = RT_CONN_FLAGS(sk)}},
.proto = IPPROTO_TCP,
.uli_u = { .ports =
#ifdef LINUX_2_4
{ .sport = sk->sport,
#else
{ .sport = inet_sk(sk)->inet_sport,
#endif /* LINUX_2_4 */
.dport = req->rmt_port}}
};
if (ip_route_output_flow(&rt, &fl, sk, 0)) {
IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
return NULL;
}
return &rt->u.dst;
}
#endif
/*
* Create a new socket as a child of the listening socket 'lsk' and initialize
* with the information in the supplied PASS_ACCEPT_REQ message.
*
* 'retry' indicates to the caller whether a failure is device-related and the
* connection should be passed to the host stack, or connection-related and
* the connection request should be rejected.
*/
static struct sock *mk_pass_sock(struct sock *lsk, struct toedev *dev, int tid,
const struct cpl_pass_accept_req *req,
int *retry,
const struct offload_settings *s)
{
struct sock *newsk;
struct cpl_io_state *newcplios;
struct l2t_entry *e;
struct dst_entry *dst;
struct tcp_sock *newtp;
struct net_device *egress;
struct request_sock *oreq = reqsk_alloc(&t3_rsk_ops);
*retry = 0;
if (!oreq)
goto out_err;
tcp_rsk(oreq)->rcv_isn = ntohl(req->rcv_isn);
inet_rsk(oreq)->rmt_port = req->peer_port;
t3_set_req_addr(oreq, req->local_ip, req->peer_ip);
t3_set_req_opt(oreq, NULL);
if (sysctl_tcp_window_scaling) {
inet_rsk(oreq)->wscale_ok = 1;
inet_rsk(oreq)->snd_wscale = req->tcp_options.wsf;
}
#ifdef CONFIG_SECURITY_NETWORK
if (security_inet_conn_request(lsk, tcphdr_skb, oreq))
goto free_or;
#endif
dst = route_req(lsk, oreq);
if (!dst)
goto free_or;
egress = offload_get_phys_egress(dst->neighbour->dev, NULL, TOE_OPEN);
if (!egress || TOEDEV(egress) != dev) {
*retry = 1; /* asymmetric route */
goto free_dst;
}
e = t3_l2t_get(TOM_DATA(dev)->cdev, dst->neighbour, egress);
if (!e) {
*retry = 1; /* out of HW resources */
goto free_dst;
}
newcplios = kzalloc(sizeof *newcplios, GFP_ATOMIC);
if (!newcplios)
goto free_l2t;
newsk = tcp_create_openreq_child(lsk, oreq, tcphdr_skb);
if (!newsk) {
kfree(newcplios);
goto free_l2t;
}
CPL_IO_STATE(newsk) = newcplios;
if (sock_flag(newsk, SOCK_KEEPOPEN))
inet_csk_delete_keepalive_timer(newsk);
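/*
 * Stash the parent's STID in the request's ts_recent field; the abort and
 * ARP-failure paths use it to look up the listening socket later.
 */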
oreq->ts_recent = G_PASS_OPEN_TID(ntohl(req->tos_tid));
newsk->sk_user_data = oreq;
sk_setup_caps(newsk, dst);
newtp = tcp_sk(newsk);
init_offload_sk(newsk, dev, tid, e, dst, egress, s);
newcplios->delack_seq = newtp->rcv_nxt;
RCV_WSCALE(newtp) = select_rcv_wscale(tcp_full_space(newsk),
WSCALE_OK(newtp),
newtp->window_clamp);
#ifdef LINUX_2_4
newsk->daddr = req->peer_ip;
newsk->rcv_saddr = req->local_ip;
newsk->saddr = req->local_ip;
#else
inet_sk(newsk)->inet_daddr = req->peer_ip;
inet_sk(newsk)->inet_rcv_saddr = req->local_ip;
inet_sk(newsk)->inet_saddr = req->local_ip;
#endif /* LINUX_2_4 */
lsk->sk_prot->hash(newsk);
t3_inet_inherit_port(&tcp_hashinfo, lsk, newsk);
install_offload_ops(newsk);
bh_unlock_sock(newsk); // counters tcp_create_openreq_child()
return newsk;
free_l2t:
l2t_release(L2DATA(dev), e);
free_dst:
dst_release(dst);
free_or:
__reqsk_free(oreq);
out_err:
return NULL;
}
/*
* Populate a reject/tunnel CPL_PASS_ACCEPT_RPL WR.
*/
static void mk_pass_accept_rpl(struct sk_buff *reply_skb,
struct sk_buff *req_skb, int cmd)
{
struct cpl_pass_accept_req *req = cplhdr(req_skb);
struct cpl_pass_accept_rpl *rpl = cplhdr(reply_skb);
unsigned int tid = GET_TID(req);
reply_skb->priority = CPL_PRIORITY_SETUP;
rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_PASS_ACCEPT_RPL, tid));
rpl->peer_ip = req->peer_ip; // req->peer_ip not overwritten yet
rpl->opt0h = htonl(F_TCAM_BYPASS);
rpl->opt0l_status = htonl(cmd);
rpl->opt2 = 0;
rpl->rsvd = rpl->opt2; /* workaround for HW bug */
}
/*
* Send a deferred reject to an accept request.
*/
static void reject_pass_request(struct toedev *tdev, struct sk_buff *skb)
{
struct sk_buff *reply_skb;
reply_skb = alloc_skb_nofail(sizeof(struct cpl_pass_accept_rpl));
__skb_put(reply_skb, sizeof(struct cpl_pass_accept_rpl));
mk_pass_accept_rpl(reply_skb, skb, CPL_PASS_OPEN_REJECT);
cxgb3_ofld_send(TOM_DATA(tdev)->cdev, reply_skb);
kfree_skb(skb);
}
static void offload_req_from_pass_accept_req(struct offload_req *oreq,
const struct cpl_pass_accept_req *req,
const struct sock *listen_sk)
{
oreq->sip[0] = req->peer_ip;
oreq->sip[1] = oreq->sip[2] = oreq->sip[3] = 0;
oreq->dip[0] = req->local_ip;
oreq->dip[1] = oreq->dip[2] = oreq->dip[3] = 0;
oreq->sport = req->peer_port;
oreq->dport = req->local_port;
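/* low nibble: IP version (4), high nibble: open type (passive) */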
oreq->ipvers_opentype = (OPEN_TYPE_PASSIVE << 4) | 4;
oreq->tos = G_PASS_OPEN_TOS(ntohl(req->tos_tid));
oreq->vlan = req->vlan_tag ? req->vlan_tag & htons(VLAN_VID_MASK) :
htons(0xfff);
#ifdef SO_MARK
oreq->mark = listen_sk->sk_mark;
#else
oreq->mark = 0;
#endif
}
/*
* Process a CPL_PASS_ACCEPT_REQ message. Does the part that needs the socket
* lock held. Note that the sock here is a listening socket that is not owned
* by the TOE.
*/
static void process_pass_accept_req(struct sock *sk, struct sk_buff *skb)
{
int rt_flags;
int pass2host;
struct sock *newsk;
struct l2t_entry *e;
struct iff_mac tim;
struct offload_req orq;
struct offload_settings settings;
struct sk_buff *reply_skb, *ddp_skb = NULL;
struct cpl_pass_accept_rpl *rpl;
struct cpl_pass_accept_req *req = cplhdr(skb);
unsigned int tid = GET_TID(req);
struct toedev *tdev = BLOG_SKB_CB(skb)->dev;
struct tom_data *d = TOM_DATA(tdev);
struct t3cdev *cdev = d->cdev;
reply_skb = get_cpl_reply_skb(skb, sizeof(*rpl), GFP_ATOMIC);
if (unlikely(!reply_skb)) {
if (tdev->ttid == TOE_ID_CHELSIO_T3)
t3_defer_reply(skb, tdev, reject_pass_request);
else {
cxgb3_queue_tid_release(cdev, tid);
kfree_skb(skb);
}
goto out;
}
if (sk->sk_state != TCP_LISTEN)
goto reject;
if (inet_csk_reqsk_queue_is_full(sk))
goto reject;
if (sk_acceptq_is_full(sk) && d->conf.soft_backlog_limit)
goto reject;
tim.mac_addr = req->dst_mac;
tim.vlan_tag = ntohs(req->vlan_tag);
if (cdev->ctl(cdev, GET_IFF_FROM_MAC, &tim) < 0 || !tim.dev)
goto reject;
if (ip_route_input(skb, req->local_ip, req->peer_ip,
G_PASS_OPEN_TOS(ntohl(req->tos_tid)), tim.dev))
goto reject;
rt_flags = ((struct rtable *)skb_dst(skb))->rt_flags &
(RTCF_BROADCAST | RTCF_MULTICAST | RTCF_LOCAL);
dst_release(skb_dst(skb)); // done with the input route, release it
skb_dst_set(skb, NULL);
if (rt_flags != RTCF_LOCAL)
goto reject;
offload_req_from_pass_accept_req(&orq, req, sk);
settings = *lookup_ofld_policy(tdev, &orq, d->conf.cop_managed_offloading);
#ifndef LINUX_2_4
rcu_read_unlock();
#else
read_unlock(&tdev->policy_lock);
#endif
newsk = mk_pass_sock(sk, tdev, tid, req, &pass2host, &settings);
if (!newsk)
goto reject;
/*
* Our use of sk_user_data for sockets on the SYNQ can confuse the
* sanitization of socket callbacks in the RX_DATA handler. Since
* there aren't any kernel apps that need to sanitize the callbacks
* of passively opened sockets we solve the problem by skipping
* the sanitization on such sockets.
*/
cplios_set_flag(newsk, CPLIOS_CALLBACKS_CHKD);
inet_csk_reqsk_queue_added(sk, TCP_TIMEOUT_INIT);
synq_add(sk, newsk);
/* Don't get a reference, newsk starts out with ref count 2 */
cxgb3_insert_tid(cdev, d->client, newsk, tid);
if (CPL_IO_STATE(newsk)->ulp_mode == ULP_MODE_TCPDDP) {
ddp_skb = alloc_skb(sizeof(struct cpl_set_tcb_field),
GFP_ATOMIC);
if (!ddp_skb)
CPL_IO_STATE(newsk)->ulp_mode = ULP_MODE_NONE;
}
reply_skb->sk = newsk;
set_arp_failure_handler(reply_skb, pass_accept_rpl_arp_failure);
e = CPL_IO_STATE(newsk)->l2t_entry;
rpl = cplhdr(reply_skb);
rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_PASS_ACCEPT_RPL, tid));
rpl->peer_ip = req->peer_ip; // req->peer_ip is not overwritten
rpl->opt0h = htonl(calc_opt0h(newsk) | V_L2T_IDX(e->idx) |
V_TX_CHANNEL(e->chan_idx));
rpl->opt0l_status = htonl(calc_opt0l(newsk) |
CPL_PASS_OPEN_ACCEPT);
rpl->opt2 = htonl(calc_opt2(newsk, &settings));
rpl->rsvd = rpl->opt2; /* workaround for HW bug */
reply_skb->priority = mkprio(CPL_PRIORITY_SETUP, newsk);
l2t_send(cdev, reply_skb, e);
kfree_skb(skb);
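/*
 * If this connection was set up for DDP, send a SET_TCB_FIELD that starts
 * it with DDP off and applies the TP DDP timer workaround bits.
 */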
if (ddp_skb) {
set_arp_failure_handler(ddp_skb, arp_failure_discard);
__set_tcb_field(newsk, ddp_skb, W_TCB_RX_DDP_FLAGS,
V_TF_DDP_OFF(1) |
TP_DDP_TIMER_WORKAROUND_MASK,
V_TF_DDP_OFF(1) |
TP_DDP_TIMER_WORKAROUND_VAL, 1);
l2t_send(cdev, ddp_skb, e);
}
return;
reject:
if (tdev->ttid == TOE_ID_CHELSIO_T3)
mk_pass_accept_rpl(reply_skb, skb, CPL_PASS_OPEN_REJECT);
else {
__skb_trim(reply_skb, 0);
mk_tid_release(reply_skb, NULL, tid);
}
cxgb3_ofld_send(cdev, reply_skb);
kfree_skb(skb);
out:
T3_TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_ATTEMPTFAILS);
}
/*
* Handle a CPL_PASS_ACCEPT_REQ message.
*/
static int do_pass_accept_req(struct t3cdev *cdev, struct sk_buff *skb,
void *ctx)
{
struct cpl_pass_accept_req *req = cplhdr(skb);
struct listen_ctx *listen_ctx = (struct listen_ctx *)ctx;
struct sock *lsk = listen_ctx->lsk;
struct tom_data *d = listen_ctx->tom_data;
#if VALIDATE_TID
unsigned int tid = GET_TID(req);
struct tid_info *t = &(T3C_DATA(cdev))->tid_maps;
if (unlikely(!lsk)) {
printk(KERN_ERR "%s: PASS_ACCEPT_REQ had unknown STID %lu\n",
cdev->name,
(unsigned long)((union listen_entry *)ctx -
t->stid_tab));
return CPL_RET_BUF_DONE;
}
if (unlikely(tid >= t->ntids)) {
printk(KERN_ERR "%s: passive open TID %u too large\n",
cdev->name, tid);
return CPL_RET_BUF_DONE;
}
/*
* For T3A the current user of the TID may have closed but its last
* message(s) may have been backlogged so the TID appears to be still
* in use. Just take the TID away, the connection can close at its
* own leisure. For T3B this situation is a bug.
*/
if (!valid_new_tid(t, tid) &&
cdev->type != T3A) {
printk(KERN_ERR "%s: passive open uses existing TID %u\n",
cdev->name, tid);
return CPL_RET_BUF_DONE;
}
#endif
BLOG_SKB_CB(skb)->dev = &d->tdev;
process_cpl_msg(process_pass_accept_req, lsk, skb);
return 0;
}
/*
* Add a passively open socket to its parent's accept queue. Note that the
* child may be in any state by now, including TCP_CLOSE. We can guarantee
* though that it has not been orphaned yet.
*/
static void add_pass_open_to_parent(struct sock *child, struct sock *lsk,
struct toedev *dev)
{
struct request_sock *oreq;
/*
* If the server is closed it has already killed its embryonic
* children. There is nothing further to do about child.
*/
if (lsk->sk_state != TCP_LISTEN)
return;
oreq = child->sk_user_data;
child->sk_user_data = NULL;
inet_csk_reqsk_queue_removed(lsk, oreq);
synq_remove(child);
if (sk_acceptq_is_full(lsk) && !TOM_TUNABLE(dev, soft_backlog_limit)) {
T3_NET_INC_STATS_BH(sock_net(lsk), LINUX_MIB_LISTENOVERFLOWS);
T3_NET_INC_STATS_BH(sock_net(lsk), LINUX_MIB_LISTENDROPS);
__reqsk_free(oreq);
add_to_reap_list(child);
} else {
inet_csk_reqsk_queue_add(lsk, oreq, child);
lsk->sk_data_ready(lsk, 0);
}
}
/*
* This is run from a listener's backlog to add a child socket to its accept
* queue. Note that at this point the child is not locked and we intentionally
* do not bother locking it as the only fields we may be using are
* sk_user_data and the open request, and there aren't any concurrent users
* of them.
*/
static void bl_add_pass_open_to_parent(struct sock *lsk, struct sk_buff *skb)
{
struct sock *child = skb->sk;
skb->sk = NULL;
add_pass_open_to_parent(child, lsk, BLOG_SKB_CB(skb)->dev);
__kfree_skb(skb);
}
/*
* Called when a connection is established to translate the TCP options
* reported by HW to Linux's native format.
*/
static void assign_rxopt(struct sock *sk, unsigned int opt)
{
const struct t3c_data *td = T3C_DATA(T3C_DEV(sk));
struct tcp_sock *tp = tcp_sk(sk);
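/*
 * The MSS index reported in the option word selects an entry from the
 * per-device MTU table; subtract 40 bytes of IP and TCP headers to get the
 * MSS. Timestamps, if negotiated, cost a further TCPOLEN_TSTAMP_ALIGNED
 * bytes per segment (adjusted below).
 */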
MSS_CLAMP(tp) = td->mtus[G_TCPOPT_MSS(opt)] - 40;
tp->mss_cache = MSS_CLAMP(tp);
tp->tcp_header_len = sizeof(struct tcphdr);
TSTAMP_OK(tp) = G_TCPOPT_TSTAMP(opt);
SACK_OK(tp) = G_TCPOPT_SACK(opt);
WSCALE_OK(tp) = G_TCPOPT_WSCALE_OK(opt);
SND_WSCALE(tp) = G_TCPOPT_SND_WSCALE(opt);
if (!WSCALE_OK(tp))
RCV_WSCALE(tp) = 0;
if (TSTAMP_OK(tp)) {
tp->tcp_header_len += TCPOLEN_TSTAMP_ALIGNED;
tp->mss_cache -= TCPOLEN_TSTAMP_ALIGNED;
}
}
/*
* Completes some final bits of initialization for just established connections
* and changes their state to TCP_ESTABLISHED.
*
* snd_isn here is the ISN after the SYN, i.e., the true ISN + 1.
*/
static void make_established(struct sock *sk, u32 snd_isn, unsigned int opt)
{
struct tcp_sock *tp = tcp_sk(sk);
tp->pushed_seq = tp->write_seq = tp->snd_nxt = tp->snd_una = snd_isn;
inet_sk(sk)->inet_id = tp->write_seq ^ jiffies;
assign_rxopt(sk, opt);
/*
* Causes the first RX_DATA_ACK to supply any Rx credits we couldn't
* pass through opt0.
*/
if (tp->rcv_wnd > (M_RCV_BUFSIZ << 10))
tp->rcv_wup -= tp->rcv_wnd - (M_RCV_BUFSIZ << 10);
dst_confirm(sk->sk_dst_cache);
/*
* tcp_poll() does not lock socket, make sure initial values are
* committed before changing to ESTABLISHED.
*/
mb();
tcp_set_state(sk, TCP_ESTABLISHED);
}
/*
* Process a CPL_PASS_ESTABLISH message. XXX a lot of the locking doesn't work
* if we are in TCP_SYN_RECV due to crossed SYNs
*/
static int do_pass_establish(struct t3cdev *cdev, struct sk_buff *skb,
void *ctx)
{
struct cpl_pass_establish *req = cplhdr(skb);
struct sock *lsk, *sk = (struct sock *)ctx;
struct cpl_io_state *cplios = CPL_IO_STATE(sk);
struct toedev *tdev = cplios->toedev;
VALIDATE_SOCK(sk);
bh_lock_sock(sk);
if (unlikely(sock_owned_by_user(sk))) {
// This can only happen in simultaneous opens. XXX TBD
__kfree_skb(skb);
} else {
// Complete socket initialization now that we have the SND_ISN
struct t3c_tid_entry *t3c_stid;
struct tid_info *t;
unsigned int stid;
cplios->wr_max = cplios->wr_avail = TOM_TUNABLE(tdev, max_wrs);
cplios->wr_unacked = 0;
cplios->rss_cpu_idx = G_QNUM(ntohl(skb->csum));
make_established(sk, ntohl(req->snd_isn), ntohs(req->tcp_opt));
if (unlikely(sk->sk_socket)) { // simultaneous opens only
sk->sk_state_change(sk);
sk_wake_async(sk, 0, POLL_OUT);
}
/*
* The state for the new connection is now up to date.
* Next check if we should add the connection to the parent's
* accept queue. When the parent closes it resets connections
* on its SYN queue, so check if we are being reset. If so we
* don't need to do anything more, the coming ABORT_RPL will
* destroy this socket. Otherwise move the connection to the
* accept queue.
*
* Note that we reset the synq before closing the server so if
* we are not being reset the stid is still open.
*/
if (unlikely(synq_empty(sk))) {
/* removed from synq */
__kfree_skb(skb);
goto unlock;
}
stid = G_PASS_OPEN_TID(ntohl(req->tos_tid));
t = &(T3C_DATA(cdev))->tid_maps;
t3c_stid = lookup_stid(t, stid);
lsk = ((struct listen_ctx *)t3c_stid->ctx)->lsk;
bh_lock_sock(lsk);
if (likely(!sock_owned_by_user(lsk))) {
__kfree_skb(skb);
add_pass_open_to_parent(sk, lsk, tdev);
} else {
skb->sk = sk;
BLOG_SKB_CB(skb)->dev = tdev;
BLOG_SKB_CB(skb)->backlog_rcv = bl_add_pass_open_to_parent;
__sk_add_backlog(lsk, skb);
}
bh_unlock_sock(lsk);
}
unlock:
bh_unlock_sock(sk);
return 0;
}
/*
* Fill in the right TID for CPL messages waiting in the out-of-order queue
* and send them to the TOE.
*/
static void fixup_and_send_ofo(struct sock *sk)
{
struct cpl_io_state *cplios = CPL_IO_STATE(sk);
struct sk_buff *skb;
struct toedev *tdev = cplios->toedev;
struct tcp_sock *tp = tcp_sk(sk);
unsigned int tid = cplios->tid;
while ((skb = __skb_dequeue(&tp->out_of_order_queue)) != NULL) {
/*
* A variety of messages can be waiting but the fields we'll
* be touching are common to all so any message type will do.
*/
struct cpl_close_con_req *p = cplhdr(skb);
p->wr.wr_lo = htonl(V_WR_TID(tid));
OPCODE_TID(p) = htonl(MK_OPCODE_TID(p->ot.opcode, tid));
cxgb3_ofld_send(TOM_DATA(tdev)->cdev, skb);
}
}
/*
* Adjust buffers already in write queue after a SYN_SENT->ESTABLISHED
* transition. For TX_DATA we need to adjust the start sequence numbers, and
* for other packets we need to adjust the TID. TX_DATA packets don't have
* headers yet and so not TIDs.
*/
static void fixup_pending_writeq_buffers(struct sock *sk)
{
struct sk_buff *skb;
struct tcp_sock *tp = tcp_sk(sk);
unsigned int tid = CPL_IO_STATE(sk)->tid;
skb_queue_walk(&sk->sk_write_queue, skb) {
if (ULP_SKB_CB(skb)->flags & ULPCB_FLAG_NEED_HDR) {
ULP_SKB_CB(skb)->seq = tp->write_seq;
tp->write_seq += skb->len + ulp_extra_len(skb);
} else {
struct cpl_close_con_req *p = cplhdr(skb);
p->wr.wr_lo = htonl(V_WR_TID(tid));
OPCODE_TID(p) = htonl(MK_OPCODE_TID(p->ot.opcode, tid));
}
}
}
/*
* Updates socket state from an active establish CPL message. Runs with the
* socket lock held.
*/
static void sock_act_establish(struct sock *sk, struct sk_buff *skb)
{
struct cpl_io_state *cplios = CPL_IO_STATE(sk);
struct cpl_act_establish *req = cplhdr(skb);
u32 rcv_isn = ntohl(req->rcv_isn); /* real RCV_ISN + 1 */
struct tcp_sock *tp = tcp_sk(sk);
if (unlikely(sk->sk_state != TCP_SYN_SENT))
printk(KERN_ERR "TID %u expected SYN_SENT, found %d\n",
cplios->tid, sk->sk_state);
tp->rcv_tstamp = tcp_time_stamp;
cplios->delack_seq = tp->copied_seq = tp->rcv_wup = tp->rcv_nxt = rcv_isn;
make_established(sk, ntohl(req->snd_isn), ntohs(req->tcp_opt));
#if defined(CONFIG_SECURITY_NETWORK) && defined(SEC_INET_CONN_ESTABLISHED)
security_inet_conn_estab(sk, tcphdr_skb);
#endif
/*
* Now that we finally have a TID send any CPL messages that we had to
* defer for lack of a TID.
*/
if (skb_queue_len(&tp->out_of_order_queue))
fixup_and_send_ofo(sk);
if (likely(!sock_flag(sk, SOCK_DEAD))) {
sk->sk_state_change(sk);
sk_wake_async(sk, 0, POLL_OUT);
}
__kfree_skb(skb);
/*
* Currently the send queue must be empty at this point because the
* socket layer does not send anything before a connection is
* established. To be future proof though we handle the possibility
* that there are pending buffers to send (either TX_DATA or
* CLOSE_CON_REQ). First we need to adjust the sequence number of the
* buffers according to the just learned write_seq, and then we send
* them on their way.
*/
fixup_pending_writeq_buffers(sk);
if (t3_push_frames(sk, 1))
sk->sk_write_space(sk);
}
/*
* Process a CPL_ACT_ESTABLISH message.
*/
static int do_act_establish(struct t3cdev *cdev, struct sk_buff *skb, void *ctx)
{
struct cpl_act_establish *req = cplhdr(skb);
unsigned int tid = GET_TID(req);
unsigned int atid = G_PASS_OPEN_TID(ntohl(req->tos_tid));
struct sock *sk = (struct sock *)ctx;
struct cpl_io_state *cplios = CPL_IO_STATE(sk);
struct toedev *tdev = cplios->toedev;
struct tom_data *d = TOM_DATA(tdev);
/*
* It's OK if the TID is currently in use, the owning socket may have
* backlogged its last CPL message(s). Just take it away.
*/
CPL_IO_STATE(sk)->tid = tid;
sk_insert_tid(d, sk, tid);
free_atid(cdev, atid);
cplios->rss_cpu_idx = G_QNUM(ntohl(skb->csum));
process_cpl_msg(sock_act_establish, sk, skb);
return 0;
}
/*
* Process an acknowledgment of WR completion. Advance snd_una and send the
* next batch of work requests from the write queue.
*/
static void wr_ack(struct sock *sk, struct sk_buff *skb)
{
struct cpl_io_state *cplios = CPL_IO_STATE(sk);
struct tcp_sock *tp = tcp_sk(sk);
struct cpl_wr_ack *hdr = cplhdr(skb);
unsigned int credits = ntohs(hdr->credits);
u32 snd_una = ntohl(hdr->snd_una);
cplios->wr_avail += credits;
/*
* If the last write request in the queue with a request completion
* flag has been consumed, reset our bookkeeping.
*/
if (cplios->wr_unacked > cplios->wr_max - cplios->wr_avail)
cplios->wr_unacked = cplios->wr_max - cplios->wr_avail;
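/*
 * Retire acknowledged work requests from the pending-WR queue. Each skb's
 * csum field holds the number of WR credits it consumed; a partially
 * acknowledged WR at the head of the queue keeps its remaining credit count.
 */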
while (credits) {
struct sk_buff *p = peek_wr(sk);
if (unlikely(!p)) {
printk(KERN_ERR "%u WR_ACK credits for TID %u with "
"nothing pending, state %u\n",
credits, cplios->tid, sk->sk_state);
break;
}
if (unlikely(credits < p->csum)) {
#if DEBUG_WR > 1
struct tx_data_wr *w = cplhdr(p);
printk(KERN_ERR
"TID %u got %u WR credits, need %u, len %u, "
"main body %u, frags %u, seq # %u, ACK una %u,"
" ACK nxt %u, WR_AVAIL %u, WRs pending %u\n",
cplios->tid, credits, p->csum, p->len,
p->len - p->data_len, skb_shinfo(p)->nr_frags,
ntohl(w->sndseq), snd_una, ntohl(hdr->snd_nxt),
cplios->wr_avail, count_pending_wrs(sk) - credits);
#endif
p->csum -= credits;
break;
} else {
dequeue_wr(sk);
credits -= p->csum;
free_wr_skb(p);
}
}
#if DEBUG_WR
check_wr_invariants(sk);
#endif
if (unlikely(before(snd_una, tp->snd_una))) {
#if VALIDATE_SEQ
struct tom_data *d = TOM_DATA(cplios->toedev);
printk(KERN_ERR "%s: unexpected sequence # %u in WR_ACK "
"for TID %u, snd_una %u\n", (&d->tdev)->name, snd_una,
cplios->tid, tp->snd_una);
#endif
goto out_free;
}
if (tp->snd_una != snd_una) {
tp->snd_una = snd_una;
dst_confirm(sk->sk_dst_cache);
tp->rcv_tstamp = tcp_time_stamp;
if (tp->snd_una == tp->snd_nxt)
cplios_reset_flag(sk, CPLIOS_TX_WAIT_IDLE);
}
/*
* If there's more data queued up, see if we can get it into the write
* queue ... If we're able to push any data into the write queue,
* free up socket send buffer space.
*/
if (skb_queue_len(&sk->sk_write_queue) && t3_push_frames(sk, 0))
sk->sk_write_space(sk);
out_free:
__kfree_skb(skb);
}
/*
* Handler for TX_DATA_ACK CPL messages.
*/
static int do_wr_ack(struct t3cdev *dev, struct sk_buff *skb, void *ctx)
{
struct sock *sk = (struct sock *)ctx;
VALIDATE_SOCK(sk);
process_cpl_msg(wr_ack, sk, skb);
return 0;
}
/*
* Handler for TRACE_PKT CPL messages. Just sink these packets.
*/
static int do_trace_pkt(struct t3cdev *dev, struct sk_buff *skb, void *ctx)
{
__kfree_skb(skb);
return 0;
}
/*
* Disconnect offloaded established but not yet accepted connections sitting
* on a server's accept_queue. We just send an ABORT_REQ at this point and
* finish off the disconnect later as we may need to wait for the ABORT_RPL.
*/
void t3_disconnect_acceptq(struct sock *listen_sk)
{
struct request_sock **pprev;
pprev = ACCEPT_QUEUE(listen_sk);
while (*pprev) {
struct request_sock *req = *pprev;
if (req->rsk_ops == RSK_OPS(&t3_rsk_ops)) { // one of ours
struct sock *child = req->sk;
*pprev = req->dl_next;
sk_acceptq_removed(listen_sk);
__reqsk_free(req);
release_tcp_port(child);
reset_listen_child(child);
} else
pprev = &req->dl_next;
}
}
/*
* Reset offloaded connections sitting on a server's syn queue. As above
* we send ABORT_REQ and finish off when we get ABORT_RPL.
*/
void t3_reset_synq(struct sock *listen_sk)
{
struct sock **nextsk = &synq_next(listen_sk);
/*
* Note: the while predicate below is a little tricky because the
* fields used to implement the doubly linked list have been hijacked
* out of the (struct tcp_sock) portion of the socket. If the fields
* were solely ours to use, then the test of "*nextsk != listen_sk"
* would be enough. But when we empty the SYN queue, the state of
* those hijacked fields is reset to the values expected by Linux
* and "*nextsk" will no longer have any legitimate meaning for us.
* Thus the double predicate of testing for both the SYN queue being
* empty (which is implemented in a Linux version-dependent fashion)
* and making sure the next socket to process isn't our listen
* socket ...
*/
while (!synq_empty(listen_sk) && *nextsk != listen_sk) {
struct sock *child = *nextsk;
if (child->sk_prot == &t3_tcp_prot.proto) {
/* one of ours */
cleanup_syn_rcv_conn(child, listen_sk);
release_tcp_port(child);
reset_listen_child(child);
} else {
/* some other offloaded socket ... */
nextsk = &synq_next(*nextsk);
}
}
}
int t3_setup_ppods(struct sock *sk, const struct ddp_gather_list *gl,
unsigned int nppods, unsigned int tag, unsigned int maxoff,
unsigned int pg_off, unsigned int color)
{
struct cpl_io_state *cplios = CPL_IO_STATE(sk);
unsigned int i, j, pidx;
struct pagepod *p;
struct sk_buff *skb;
struct ulp_mem_io *req;
struct tcp_sock *tp = tcp_sk(sk);
unsigned int tid = cplios->tid;
const struct tom_data *td = TOM_DATA(cplios->toedev);
unsigned int ppod_addr = tag * PPOD_SIZE + td->ddp_llimit;
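/*
 * Write the page pods one at a time, each as a ULP_MEM_WRITE of PPOD_SIZE
 * bytes. Consecutive pods overlap by one page address (4 new entries plus
 * one repeated entry of the 5-slot addr[] array); the trailing sentinel
 * pods are written with their valid bit clear.
 */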
for (i = 0; i < nppods; ++i) {
skb = alloc_ctrl_skb(tp, sizeof(*req) + PPOD_SIZE);
skb->priority = mkprio(CPL_PRIORITY_CONTROL, sk);
req = (struct ulp_mem_io *)__skb_put(skb,
sizeof(*req) + PPOD_SIZE);
req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS));
req->cmd_lock_addr = htonl(V_ULP_MEMIO_ADDR(ppod_addr >> 5) |
V_ULPTX_CMD(ULP_MEM_WRITE));
req->len = htonl(V_ULP_MEMIO_DATA_LEN(PPOD_SIZE / 32) |
V_ULPTX_NFLITS(PPOD_SIZE / 8 + 1));
p = (struct pagepod *)(req + 1);
if (likely(i < nppods - NUM_SENTINEL_PPODS)) {
p->vld_tid = htonl(F_PPOD_VALID | V_PPOD_TID(tid));
p->pgsz_tag_color = htonl(V_PPOD_TAG(tag) |
V_PPOD_COLOR(color));
p->max_offset = htonl(maxoff);
p->page_offset = htonl(pg_off);
p->rsvd = 0;
for (pidx = 4 * i, j = 0; j < 5; ++j, ++pidx)
p->addr[j] = pidx < gl->nelem ?
cpu_to_be64(gl->phys_addr[pidx]) : 0;
} else
p->vld_tid = 0; /* mark sentinel page pods invalid */
send_or_defer(sk, tp, skb, 0);
ppod_addr += PPOD_SIZE;
}
return 0;
}
/*
* Build a CPL_BARRIER message as payload of a ULP_TX_PKT command.
*/
static inline void mk_cpl_barrier_ulp(struct cpl_barrier *b)
{
struct ulp_txpkt *txpkt = (struct ulp_txpkt *)b;
txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT));
txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*b) / 8));
b->opcode = CPL_BARRIER;
}
/*
* Build a CPL_GET_TCB message as payload of a ULP_TX_PKT command.
*/
static inline void mk_get_tcb_ulp(struct cpl_get_tcb *req, unsigned int tid,
unsigned int cpuno)
{
struct ulp_txpkt *txpkt = (struct ulp_txpkt *)req;
txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT));
txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*req) / 8));
OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_GET_TCB, tid));
req->cpuno = htons(cpuno);
}
/*
* Build a CPL_SET_TCB_FIELD message as payload of a ULP_TX_PKT command.
*/
static inline void mk_set_tcb_field_ulp(struct cpl_set_tcb_field *req,
unsigned int tid, unsigned int word,
u64 mask, u64 val)
{
struct ulp_txpkt *txpkt = (struct ulp_txpkt *)req;
txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT));
txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*req) / 8));
OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, tid));
req->reply = V_NO_REPLY(1);
req->cpu_idx = 0;
req->word = htons(word);
req->mask = cpu_to_be64(mask);
req->val = cpu_to_be64(val);
}
/*
* Build a CPL_RX_DATA_ACK message as payload of a ULP_TX_PKT command.
*/
static void mk_rx_data_ack_ulp(struct sock *sk, struct cpl_rx_data_ack *ack,
unsigned int tid,
unsigned int credits)
{
struct ulp_txpkt *txpkt = (struct ulp_txpkt *)ack;
u32 dack;
dack = t3_select_delack(sk);
txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT));
txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*ack) / 8));
OPCODE_TID(ack) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, tid));
ack->credit_dack = htonl(F_RX_MODULATE | F_RX_DACK_CHANGE |
V_RX_DACK_MODE(dack) |
V_RX_CREDITS(credits));
}
void t3_cancel_ddpbuf(struct sock *sk, unsigned int bufidx)
{
struct cpl_io_state *cplios = CPL_IO_STATE(sk);
unsigned int wrlen;
struct sk_buff *skb;
struct work_request_hdr *wr;
struct cpl_barrier *lock;
struct cpl_set_tcb_field *req;
struct cpl_get_tcb *getreq;
struct tcp_sock *tp = tcp_sk(sk);
struct ddp_state *p = DDP_STATE(sk);
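/*
 * Build a compound BYPASS WR: a CPL_BARRIER, a SET_TCB_FIELD that
 * invalidates the chosen buffer and switches the active buffer, a GET_TCB
 * (presumably to learn how much data had already been placed), and a
 * closing CPL_BARRIER.
 */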
wrlen = sizeof(*wr) + sizeof(*req) + 2 * sizeof(*lock) +
sizeof(*getreq);
skb = alloc_ctrl_skb(tp, wrlen);
skb->priority = mkprio(CPL_PRIORITY_CONTROL, sk);
wr = (struct work_request_hdr *)__skb_put(skb, wrlen);
wr->wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS));
lock = (struct cpl_barrier *)(wr + 1);
mk_cpl_barrier_ulp(lock);
req = (struct cpl_set_tcb_field *)(lock + 1);
/* Hmmm, not sure if this is actually a good thing: reactivating
* the other buffer might be an issue if it has been completed
* already. However, that is unlikely, since the fact that the UBUF
* is not completed indicates that there is no outstanding data.
*/
if (bufidx == 0)
mk_set_tcb_field_ulp(req, cplios->tid, W_TCB_RX_DDP_FLAGS,
V_TF_DDP_ACTIVE_BUF(1) |
V_TF_DDP_BUF0_VALID(1),
V_TF_DDP_ACTIVE_BUF(1));
else
mk_set_tcb_field_ulp(req, cplios->tid, W_TCB_RX_DDP_FLAGS,
V_TF_DDP_ACTIVE_BUF(1) |
V_TF_DDP_BUF1_VALID(1), 0);
getreq = (struct cpl_get_tcb *)(req + 1);
mk_get_tcb_ulp(getreq, cplios->tid, cplios->rss_cpu_idx);
mk_cpl_barrier_ulp((struct cpl_barrier *)(getreq + 1));
/* Keep track of the number of outstanding CPL_GET_TCB requests
*/
p->get_tcb_count++;
#ifdef T3_TRACE
T3_TRACE1(TIDTB(sk),
"t3_cancel_ddpbuf: bufidx %u", bufidx);
#endif
cxgb3_ofld_send(T3C_DEV(sk), skb);
}
/**
* t3_overlay_ddpbuf - overlay an existing DDP buffer with a new one
* @sk: the socket associated with the buffers
* @bufidx: index of HW DDP buffer (0 or 1)
* @tag0: new tag for HW buffer 0
* @tag1: new tag for HW buffer 1
* @len: new length for HW buf @bufidx
*
* Sends a compound WR to overlay a new DDP buffer on top of an existing
* buffer by changing the buffer tag and length and setting the valid and
* active flag accordingly. The caller must ensure the new buffer is at
* least as big as the existing one. Since we typically reprogram both HW
* buffers this function sets both tags for convenience. Read the TCB to
* determine how much data was written into the buffer before the overlay
* took place.
*/
void t3_overlay_ddpbuf(struct sock *sk, unsigned int bufidx, unsigned int tag0,
unsigned int tag1, unsigned int len)
{
struct cpl_io_state *cplios = CPL_IO_STATE(sk);
unsigned int wrlen;
struct sk_buff *skb;
struct work_request_hdr *wr;
struct cpl_get_tcb *getreq;
struct cpl_set_tcb_field *req;
struct tcp_sock *tp = tcp_sk(sk);
struct ddp_state *p = DDP_STATE(sk);
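/*
 * The compound WR carries three SET_TCB_FIELDs (both buffer tags, the new
 * buffer length, and the DDP flags) followed by a GET_TCB that reads back
 * how much data was placed before the overlay.
 */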
wrlen = sizeof(*wr) + 3 * sizeof(*req) + sizeof(*getreq);
skb = alloc_ctrl_skb(tp, wrlen);
skb->priority = mkprio(CPL_PRIORITY_CONTROL, sk);
wr = (struct work_request_hdr *)__skb_put(skb, wrlen);
/* Set the ATOMIC flag to make sure that TP processes the following
* CPLs in an atomic manner and no wire segments can be interleaved.
*/
wr->wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS) | F_WR_ATOMIC);
req = (struct cpl_set_tcb_field *)(wr + 1);
mk_set_tcb_field_ulp(req, cplios->tid, W_TCB_RX_DDP_BUF0_TAG,
V_TCB_RX_DDP_BUF0_TAG(M_TCB_RX_DDP_BUF0_TAG) |
V_TCB_RX_DDP_BUF1_TAG(M_TCB_RX_DDP_BUF1_TAG) << 32,
V_TCB_RX_DDP_BUF0_TAG(tag0) |
V_TCB_RX_DDP_BUF1_TAG((u64)tag1) << 32);
req++;
if (bufidx == 0) {
mk_set_tcb_field_ulp(req, cplios->tid, W_TCB_RX_DDP_BUF0_LEN,
V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN),
V_TCB_RX_DDP_BUF0_LEN((u64)len));
req++;
mk_set_tcb_field_ulp(req, cplios->tid, W_TCB_RX_DDP_FLAGS,
V_TF_DDP_PUSH_DISABLE_0(1) |
V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(1),
V_TF_DDP_PUSH_DISABLE_0(0) |
V_TF_DDP_BUF0_VALID(1));
} else {
mk_set_tcb_field_ulp(req, cplios->tid, W_TCB_RX_DDP_BUF1_LEN,
V_TCB_RX_DDP_BUF1_LEN(M_TCB_RX_DDP_BUF1_LEN),
V_TCB_RX_DDP_BUF1_LEN((u64)len));
req++;
mk_set_tcb_field_ulp(req, cplios->tid, W_TCB_RX_DDP_FLAGS,
V_TF_DDP_PUSH_DISABLE_1(1) |
V_TF_DDP_BUF1_VALID(1) | V_TF_DDP_ACTIVE_BUF(1),
V_TF_DDP_PUSH_DISABLE_1(0) |
V_TF_DDP_BUF1_VALID(1) | V_TF_DDP_ACTIVE_BUF(1));
}
getreq = (struct cpl_get_tcb *)(req + 1);
mk_get_tcb_ulp(getreq, cplios->tid, cplios->rss_cpu_idx);
/* Keep track of the number of outstanding CPL_GET_TCB requests
*/
p->get_tcb_count++;
#ifdef T3_TRACE
T3_TRACE4(TIDTB(sk),
"t3_overlay_ddpbuf: bufidx %u tag0 %u tag1 %u "
"len %d",
bufidx, tag0, tag1, len);
#endif
cxgb3_ofld_send(T3C_DEV(sk), skb);
}
/*
* Sends a compound WR containing all the CPL messages needed to program the
* two HW DDP buffers, namely optionally setting up the length and offset of
* each buffer, programming the DDP flags, and optionally sending RX_DATA_ACK.
*/
void t3_setup_ddpbufs(struct sock *sk, unsigned int len0, unsigned int offset0,
unsigned int len1, unsigned int offset1,
u64 ddp_flags, u64 flag_mask, int modulate)
{
struct cpl_io_state *cplios = CPL_IO_STATE(sk);
unsigned int wrlen;
struct sk_buff *skb;
struct work_request_hdr *wr;
struct cpl_set_tcb_field *req;
struct tcp_sock *tp = tcp_sk(sk);
wrlen = sizeof(*wr) + sizeof(*req) + (len0 ? sizeof(*req) : 0) +
(len1 ? sizeof(*req) : 0) +
(modulate ? sizeof(struct cpl_rx_data_ack) : 0);
skb = alloc_ctrl_skb(tp, wrlen);
skb->priority = mkprio(CPL_PRIORITY_CONTROL, sk);
wr = (struct work_request_hdr *)__skb_put(skb, wrlen);
wr->wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS));
req = (struct cpl_set_tcb_field *)(wr + 1);
if (len0) { /* program buffer 0 offset and length */
mk_set_tcb_field_ulp(req, cplios->tid, W_TCB_RX_DDP_BUF0_OFFSET,
V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) |
V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN),
V_TCB_RX_DDP_BUF0_OFFSET((u64)offset0) |
V_TCB_RX_DDP_BUF0_LEN((u64)len0));
req++;
}
if (len1) { /* program buffer 1 offset and length */
mk_set_tcb_field_ulp(req, cplios->tid, W_TCB_RX_DDP_BUF1_OFFSET,
V_TCB_RX_DDP_BUF1_OFFSET(M_TCB_RX_DDP_BUF1_OFFSET) |
V_TCB_RX_DDP_BUF1_LEN(M_TCB_RX_DDP_BUF1_LEN) << 32,
V_TCB_RX_DDP_BUF1_OFFSET((u64)offset1) |
V_TCB_RX_DDP_BUF1_LEN((u64)len1) << 32);
req++;
}
mk_set_tcb_field_ulp(req, cplios->tid, W_TCB_RX_DDP_FLAGS, flag_mask,
ddp_flags);
if (modulate) {
mk_rx_data_ack_ulp(sk, (struct cpl_rx_data_ack *)(req + 1),
cplios->tid,
tp->copied_seq - tp->rcv_wup);
tp->rcv_wup = tp->copied_seq;
}
#ifdef T3_TRACE
T3_TRACE5(TIDTB(sk),
"t3_setup_ddpbufs: len0 %u len1 %u ddp_flags 0x%08x%08x "
"modulate %d",
len0, len1, ddp_flags >> 32, ddp_flags & 0xffffffff,
modulate);
#endif
cxgb3_ofld_send(T3C_DEV(sk), skb);
}
void t3_init_wr_tab(unsigned int wr_len)
{
int i;
if (skb_wrs[1]) /* already initialized */
return;
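/*
 * A gather list of i entries takes ceil(3*i/2) flits of SGL plus 3 flits of
 * headers. If that fits within one WR (wr_len flits) a single WR suffices;
 * otherwise the remainder is spread over continuation WRs that each carry
 * roughly wr_len - 1 flits of payload.
 */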
for (i = 1; i < ARRAY_SIZE(skb_wrs); i++) {
int sgl_len = (3 * i) / 2 + (i & 1);
sgl_len += 3;
skb_wrs[i] = sgl_len <= wr_len ?
1 : 1 + (sgl_len - 2) / (wr_len - 1);
}
wrlen = wr_len * 8;
}
int __init t3_init_cpl_io(void)
{
tcphdr_skb = alloc_skb(sizeof(struct tcphdr), GFP_KERNEL);
if (!tcphdr_skb) {
printk(KERN_ERR
"Chelsio TCP offload: can't allocate sk_buff\n");
return -ENOMEM;
}
skb_put(tcphdr_skb, sizeof(struct tcphdr));
skb_reset_transport_header(tcphdr_skb);
memset(tcphdr_skb->data, 0, tcphdr_skb->len);
/* CIPSO_V4_OPTEXIST is false for tcphdr_skb without anything extra */
t3tom_register_cpl_handler(CPL_PASS_ESTABLISH, do_pass_establish);
t3tom_register_cpl_handler(CPL_ACT_ESTABLISH, do_act_establish);
t3tom_register_cpl_handler(CPL_ACT_OPEN_RPL, do_act_open_rpl);
t3tom_register_cpl_handler(CPL_PASS_ACCEPT_REQ, do_pass_accept_req);
t3tom_register_cpl_handler(CPL_RX_URG_NOTIFY, do_rx_urg_notify);
t3tom_register_cpl_handler(CPL_RX_DATA, do_rx_data);
t3tom_register_cpl_handler(CPL_RX_DATA_DDP, do_rx_data_ddp);
t3tom_register_cpl_handler(CPL_RX_DDP_COMPLETE, do_rx_ddp_complete);
t3tom_register_cpl_handler(CPL_TX_DMA_ACK, do_wr_ack);
t3tom_register_cpl_handler(CPL_PEER_CLOSE, do_peer_close);
t3tom_register_cpl_handler(CPL_ABORT_REQ_RSS, do_abort_req);
t3tom_register_cpl_handler(CPL_ABORT_RPL_RSS, do_abort_rpl);
t3tom_register_cpl_handler(CPL_CLOSE_CON_RPL, do_close_con_rpl);
t3tom_register_cpl_handler(CPL_TRACE_PKT, do_trace_pkt);
t3tom_register_cpl_handler(CPL_GET_TCB_RPL, do_get_tcb_rpl);
return 0;
}