| /* |
| * This file implements the Chelsio CPL5 message processing. |
| * |
| * Copyright (C) 2003-2010 Chelsio Communications. All rights reserved. |
| * |
| * Written by Dimitris Michailidis (dm@chelsio.com) |
| * |
| * This program is distributed in the hope that it will be useful, but WITHOUT |
| * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or |
| * FITNESS FOR A PARTICULAR PURPOSE. See the LICENSE file included in this |
| * release for licensing terms and conditions. |
| */ |
| |
| #include "defs.h" |
| #include <linux/module.h> |
| #include <linux/vmalloc.h> |
| #include <linux/ip.h> |
| #include <linux/netdevice.h> |
| #include <linux/inetdevice.h> |
| #include <linux/toedev.h> |
| #include <linux/if_vlan.h> |
| #include <net/tcp.h> |
| #include <net/offload.h> |
| #include <net/route.h> |
| #include <asm/atomic.h> |
| #include "tom.h" |
| #include "cpl_io_state.h" |
| #include "t3_ddp.h" |
| #include "t3cdev.h" |
| #include "l2t.h" |
| #include "tcb.h" |
| #include "cxgb3_defs.h" |
| #include "cxgb3_ctl_defs.h" |
| #include "firmware_exports.h" |
| #include "trace.h" |
| #include "tom_compat.h" |
| |
| #define DEBUG_WR 0 |
| |
| extern struct sk_ofld_proto t3_tcp_prot; |
| extern struct request_sock_ops t3_rsk_ops; |
| |
| /* |
| * For ULP connections HW may add headers, e.g., for digests, that aren't part |
| * of the messages sent by the host but that are part of the TCP payload and |
| * therefore consume TCP sequence space. Tx connection parameters that |
| * operate in TCP sequence space are affected by the HW additions and need to |
| * compensate for them to accurately track TCP sequence numbers. This array |
| * contains the compensating extra lengths for ULP packets. It is indexed by |
| * a packet's ULP submode. |
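 * For example, for iSCSI offload these likely correspond to a 4-byte header
 * digest (submode 1), a 4-byte data digest (submode 2), or both digests
 * (submode 3); the exact meaning of each submode is an assumption here, only
 * the byte counts below are authoritative.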
| */ |
| const unsigned int t3_ulp_extra_len[] = {0, 4, 4, 8}; |
| |
| /* |
| * TOS values for HW scheduling classes. If an offload policy assigns a |
| * connection to a class we use a value from this table as its TOS. These |
| * are special values and we do not otherwise use them as TOS. |
| */ |
| static const u8 sched_class_tos[] = { |
| 0x30, 0x32, 0x34, 0x36, 0x31, 0x33, 0x35, 0x37 |
| }; |
| |
| /* |
| * This sk_buff holds a fake header-only TCP segment that we use whenever we |
| * need to exploit SW TCP functionality that expects TCP headers, such as |
| * tcp_create_openreq_child(). It's a RO buffer that may be used by multiple |
| * CPUs without locking. |
| */ |
| static struct sk_buff *tcphdr_skb __read_mostly; |
| |
| /* |
| * Size of WRs in bytes. Note that we assume all devices we are handling have |
| * the same WR size. |
| */ |
| static unsigned int wrlen __read_mostly; |
| |
| /* |
| * The number of WRs needed for an skb depends on the number of page fragments |
| * in the skb and whether it has any payload in its main body. This maps the |
| * length of the gather list represented by an skb into the # of necessary WRs. |
| */ |
| static unsigned int skb_wrs[MAX_SKB_FRAGS + 2] __read_mostly; |
| |
| /* |
| * Socket filter that drops everything by specifying a 0-length filter program. |
| */ |
| static struct sk_filter drop_all = { .refcnt = ATOMIC_INIT(1) }; |
| |
| /* |
| * TOE information returned through inet_diag for offloaded connections. |
| */ |
| struct t3_inet_diag_info { |
| u32 toe_id; /* determines how to interpret the rest of the fields */ |
| u32 tid; |
| u8 wrs; |
| u8 queue; |
| u8 ulp_mode:4; |
| u8 sched_class:4; |
| u8 ddp_enabled; |
| char dev_name[TOENAMSIZ]; |
| }; |
| |
| /* |
| * Similar to process_cpl_msg() but takes an extra socket reference around the |
| * call to the handler. Should be used if the handler may drop a socket |
| * reference. |
| */ |
| static inline void process_cpl_msg_ref(void (*fn)(struct sock *, |
| struct sk_buff *), |
| struct sock *sk, struct sk_buff *skb) |
| { |
| sock_hold(sk); |
| process_cpl_msg(fn, sk, skb); |
| sock_put(sk); |
| } |
| |
| static inline int is_t3a(const struct toedev *dev) |
| { |
| return dev->ttid == TOE_ID_CHELSIO_T3; |
| } |
| |
| /* |
| * Returns an sk_buff for a reply CPL message of size len. If the input |
| * sk_buff has no other users it is trimmed and reused, otherwise a new buffer |
| * is allocated. The input skb must be of size at least len. Note that this |
| * operation does not destroy the original skb data even if it decides to reuse |
| * the buffer. |
| */ |
| static struct sk_buff *get_cpl_reply_skb(struct sk_buff *skb, size_t len, |
| int gfp) |
| { |
| if (likely(!skb_cloned(skb))) { |
| BUG_ON(skb->len < len); |
| __skb_trim(skb, len); |
| skb_get(skb); |
| } else { |
| skb = alloc_skb(len, gfp); |
| if (skb) |
| __skb_put(skb, len); |
| } |
| return skb; |
| } |
| |
| /* |
| * Like get_cpl_reply_skb() but the returned buffer starts out empty. |
| */ |
| static struct sk_buff *__get_cpl_reply_skb(struct sk_buff *skb, size_t len, |
| int gfp) |
| { |
| if (likely(!skb_cloned(skb) && !skb->data_len)) { |
| __skb_trim(skb, 0); |
| skb_get(skb); |
| } else |
| skb = alloc_skb(len, gfp); |
| return skb; |
| } |
| |
| /* |
| * Determine whether to send a CPL message now or defer it. A message is |
| * deferred if the connection is in SYN_SENT since we don't know the TID yet. |
| * For connections in other states the message is sent immediately. |
| * If through_l2t is set the message is subject to ARP processing, otherwise |
| * it is sent directly. |
| */ |
| static inline void send_or_defer(struct sock *sk, struct tcp_sock *tp, |
| struct sk_buff *skb, int through_l2t) |
| { |
| struct t3cdev *cdev = T3C_DEV(sk); |
| struct cpl_io_state *cplios = CPL_IO_STATE(sk); |
| |
| if (unlikely(sk->sk_state == TCP_SYN_SENT)) |
| __skb_queue_tail(&tp->out_of_order_queue, skb); // defer |
| else if (through_l2t) |
| l2t_send(cdev, skb, cplios->l2t_entry); // send through L2T |
| else |
| cxgb3_ofld_send(cdev, skb); // send directly |
| } |
| |
| /* |
 * Populate a TID_RELEASE WR.  The skb must already be properly sized.
| */ |
| static inline void mk_tid_release(struct sk_buff *skb, const struct sock *sk, |
| unsigned int tid) |
| { |
| struct cpl_tid_release *req; |
| |
| skb->priority = mkprio(CPL_PRIORITY_SETUP, sk); |
| req = (struct cpl_tid_release *)__skb_put(skb, sizeof(*req)); |
| req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); |
| OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_TID_RELEASE, tid)); |
| } |
| |
| /* |
 * Insert a socket into the TID table and take an extra reference.
| */ |
| static inline void sk_insert_tid(struct tom_data *d, struct sock *sk, |
| unsigned int tid) |
| { |
| sock_hold(sk); |
| cxgb3_insert_tid(d->cdev, d->client, sk, tid); |
| } |
| |
| /** |
| * find_best_mtu - find the entry in the MTU table closest to an MTU |
 * @d: the adapter offload data holding the MTU table
| * @mtu: the target MTU |
| * |
| * Returns the index of the value in the MTU table that is closest to but |
| * does not exceed the target MTU. |
| */ |
| static unsigned int find_best_mtu(const struct t3c_data *d, unsigned short mtu) |
| { |
| int i = 0; |
| |
| while (i < d->nmtus - 1 && d->mtus[i + 1] <= mtu) |
| ++i; |
| return i; |
| } |
| |
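/*
 * Select the MSS to advertise for a connection given the path MTU.  The MSS
 * is clamped to any user-specified value and to the HW MTU table, and the
 * index of the chosen MTU table entry is returned.
 */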
| static unsigned int select_mss(struct sock *sk, unsigned int pmtu) |
| { |
| struct cpl_io_state *cplios = CPL_IO_STATE(sk); |
| unsigned int idx; |
| struct tcp_sock *tp = tcp_sk(sk); |
| struct dst_entry *dst = __sk_dst_get(sk); |
| struct tom_data *d = TOM_DATA(cplios->toedev); |
| const struct t3c_data *td = T3C_DATA(d->cdev); |
| |
| tp->advmss = dst_metric(dst, RTAX_ADVMSS); |
| if (USER_MSS(tp) && tp->advmss > USER_MSS(tp)) |
| tp->advmss = USER_MSS(tp); |
| if (tp->advmss > pmtu - 40) |
| tp->advmss = pmtu - 40; |
| if (tp->advmss < td->mtus[0] - 40) |
| tp->advmss = td->mtus[0] - 40; |
| idx = find_best_mtu(td, tp->advmss + 40); |
| tp->advmss = td->mtus[idx] - 40; |
| inet_csk(sk)->icsk_pmtu_cookie = pmtu; |
| return idx; |
| } |
| |
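/*
 * Determine the receive window to advertise for a connection and grow
 * tp->rcv_wnd if the chosen value exceeds the current window.  The window is
 * bounded below by the socket's receive buffer space and the caller's
 * request, and above by a HW-dependent maximum.
 */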
| void t3_select_window(struct sock *sk, int request) |
| { |
| struct toedev *dev = CPL_IO_STATE(sk)->toedev; |
| struct tom_data *d = TOM_DATA(dev); |
| struct tcp_sock *tp = tcp_sk(sk); |
| unsigned int wnd = tp->rcv_wnd; |
| unsigned int max_rcv_wnd; |
| |
| if ((tp->copied_seq - tp->rcv_wup) > (tp->rcv_wnd >> 1)) |
		wnd = tp->advmss * (tp->rcv_wnd / tp->advmss) << 1;
| |
| wnd = max_t(unsigned int, wnd, tcp_full_space(sk)); |
| wnd = max_t(unsigned int, request, wnd); |
| |
| /* PR 5138 */ |
| max_rcv_wnd = (dev->ttid < TOE_ID_CHELSIO_T3C ? |
| (u32)d->rx_page_size * 23 : |
| MAX_RCV_WND); |
| |
| if (wnd > max_rcv_wnd) |
| wnd = max_rcv_wnd; |
| /* |
| * Check if we need to grow the receive window in response to an increase in |
| * the socket's receive buffer size. Some applications increase the buffer |
| * size dynamically and rely on the window to grow accordingly. |
| */ |
| |
| if (wnd > tp->rcv_wnd) { |
| tp->rcv_wup -= wnd - tp->rcv_wnd; |
| tp->rcv_wnd = wnd; |
		/* Mark the receive window as updated */
| cplios_reset_flag(sk, CPLIOS_UPDATE_RCV_WND); |
| } |
| |
| } |
| |
| unsigned int t3_select_delack(struct sock *sk) |
| { |
| struct cpl_io_state *cplios = CPL_IO_STATE(sk); |
| struct tcp_sock *tp = tcp_sk(sk); |
| struct toedev *dev = cplios->toedev; |
| unsigned int dack_mode; |
| |
| dack_mode = TOM_TUNABLE(dev, delack); |
| if (!dack_mode) |
| return 0; |
| |
| if ((dack_mode == 2) && (MSS_CLAMP(tp) > 1680)) |
| dack_mode = 3; |
| |
| if ((dack_mode == 3) && (tp->rcv_wnd < 2 * 26880)) |
| dack_mode = 1; |
| |
| if ((dack_mode == 2) && (tp->rcv_wnd < 2 * 16 * MSS_CLAMP(tp))) |
| dack_mode = 1; |
| |
| if ((dev->ttid >= TOE_ID_CHELSIO_T3C) && (cplios->delack_mode == 0) && |
| (tp->rcv_wnd > 2 * 2 * MSS_CLAMP(tp))) |
| dack_mode = 1; |
| |
| return dack_mode; |
| } |
| |
| #if VALIDATE_TID |
| /* |
| * Returns true if a connection TID is in range and currently unused. |
| */ |
| static int valid_new_tid(const struct tid_info *t, unsigned int tid) |
| { |
| return tid < t->ntids && !t->tid_tab[tid].ctx; |
| } |
| |
| #define VALIDATE_SOCK(sk) \ |
| do { \ |
| if (unlikely(!(sk))) \ |
| return CPL_RET_UNKNOWN_TID | CPL_RET_BUF_DONE; \ |
| } while (0) |
| #else |
| #define VALIDATE_SOCK(sk) do {} while (0) |
| #endif |
| |
| /* |
| * Called when we receive the last message from HW for a connection. A |
| * connection cannot transition to TCP_CLOSE prior to this event. |
| * Resources related to the offload state of a connection (e.g., L2T entries) |
| * must have been relinquished prior to calling this. |
| */ |
| static void connection_done(struct sock *sk) |
| { |
| #if 0 |
| printk("connection_done: TID: %u, state: %d, dead %d, refs %d\n", |
| CPL_IO_STATE(sk)->tid, sk->sk_state, sock_flag(sk, SOCK_DEAD), |
| atomic_read(&sk->sk_refcnt)); |
| // dump_stack(); |
| #endif |
| |
| #ifdef T3_TRACE |
| T3_TRACE1(TIDTB(sk), |
| "connection_done: GTS rpl pending %d, if pending wake", |
| cplios_flag(sk, CPLIOS_ABORT_RPL_PENDING)); |
| #endif |
| |
| sk_wakeup_sleepers(sk, 0); |
| tcp_done(sk); |
| } |
| |
| /* |
| * Min receive window. We want it to be large enough to accommodate receive |
| * coalescing, handle jumbo frames, and not trigger sender SWS avoidance. |
| */ |
| #define MIN_RCV_WND (24 * 1024U) |
| |
| /* |
| * Determine the receive window scaling factor given a target max |
| * receive window. |
| */ |
| static inline int select_rcv_wscale(int space, int wscale_ok, int window_clamp) |
| { |
| int wscale = 0; |
| |
| if (space > MAX_RCV_WND) |
| space = MAX_RCV_WND; |
| if (window_clamp && window_clamp < space) |
| space = window_clamp; |
| |
| if (wscale_ok) |
| for (; space > 65535 && wscale < 14; space >>= 1, ++wscale) ; |
| return wscale; |
| } |
| |
| /* Returns bits 2:7 of a socket's TOS field */ |
| #define SK_TOS(sk) ((inet_sk(sk)->tos >> 2) & M_TOS) |
| |
| /* |
| * The next two functions calculate the option 0 value for a socket. |
| */ |
| static inline unsigned int calc_opt0h(struct sock *sk) |
| { |
| struct cpl_io_state *cplios = CPL_IO_STATE(sk); |
| struct tcp_sock *tp = tcp_sk(sk); |
| |
| return V_NAGLE((tp->nonagle & TCP_NAGLE_OFF) == 0) | |
| V_KEEP_ALIVE(sock_flag(sk, SOCK_KEEPOPEN) != 0) | F_TCAM_BYPASS | |
| V_WND_SCALE(RCV_WSCALE(tp)) | V_MSS_IDX(cplios->mtu_idx); |
| } |
| |
| static inline unsigned int calc_opt0l(struct sock *sk) |
| { |
| struct cpl_io_state *cplios = CPL_IO_STATE(sk); |
| struct tcp_sock *tp = tcp_sk(sk); |
| unsigned int tos; |
| |
| if (cplios->sched_cls < ARRAY_SIZE(sched_class_tos)) |
| tos = sched_class_tos[cplios->sched_cls]; |
| else { |
| tos = SK_TOS(sk); |
| if ((tos & 0x38) == 0x30) /* suppress values in special range */ |
| tos = 0; |
| } |
| |
| return V_TOS(tos) | V_ULP_MODE(cplios->ulp_mode) | |
| V_RCV_BUFSIZ(min(tp->rcv_wnd >> 10, (u32)M_RCV_BUFSIZ)); |
| } |
| |
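/*
 * Calculate the option 2 value for a socket, applying any RX coalescing and
 * congestion control settings supplied by the offload policy.
 */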
| static unsigned int calc_opt2(const struct sock *sk, |
| const struct offload_settings *s) |
| { |
| u32 opt2 = (F_CPU_INDEX_VALID | |
| V_CPU_INDEX(CPL_IO_STATE(sk)->rss_cpu_idx)); |
| |
| if (unlikely(!s)) |
| return opt2; |
| |
| if (s->rx_coalesce >= 0) |
| opt2 |= F_RX_COALESCE_VALID | |
| V_RX_COALESCE(s->rx_coalesce ? 3 : 0); |
| if (s->cong_algo >= 0) |
| opt2 |= F_FLAVORS_VALID | V_CONG_CONTROL_FLAVOR(s->cong_algo) | |
| V_PACING_FLAVOR(1); |
| return opt2; |
| } |
| |
| #ifdef CTRL_SKB_CACHE |
| /* |
| * This function is intended for allocations of small control messages. |
 * Such messages go as immediate data and usually the packets are freed
| * immediately. We maintain a cache of one small sk_buff and use it whenever |
| * it is available (has a user count of 1). Otherwise we get a fresh buffer. |
| */ |
static struct sk_buff *alloc_ctrl_skb(const struct tcp_sock *tp, int len)
{
	/* the connection's cpl_io_state hangs off the socket containing tp */
	struct cpl_io_state *cplios = CPL_IO_STATE((struct sock *)tp);
	struct sk_buff *skb = cplios->ctrl_skb_cache;
| |
| if (likely(skb && !skb_shared(skb) && !skb_cloned(skb))) { |
| __skb_trim(skb, 0); |
| atomic_set(&skb->users, 2); |
| } else if (likely(!in_atomic())) |
| skb = alloc_skb_nofail(len); |
| else |
| skb = alloc_skb(len, GFP_ATOMIC); |
| return skb; |
| } |
| #else |
| # define alloc_ctrl_skb(tp, len) alloc_skb_nofail(len) |
| #endif |
| |
| static inline void free_wr_skb(struct sk_buff *skb) |
| { |
| #if defined(CONFIG_T3_ZCOPY_SENDMSG) || defined(CONFIG_T3_ZCOPY_SENDMSG_MODULE) |
| if (skb->data[0] == FW_WROPCODE_OFLD_TX_DATA) |
| t3_zcopy_cleanup_skb(skb); |
| #endif |
| kfree_skb(skb); |
| } |
| |
| static void purge_wr_queue(struct sock *sk) |
| { |
| struct sk_buff *skb; |
| while ((skb = dequeue_wr(sk)) != NULL) |
| free_wr_skb(skb); |
| } |
| |
| /* |
| * Returns true if an sk_buff carries urgent data. |
| */ |
| static inline int skb_urgent(struct sk_buff *skb) |
| { |
| return (ULP_SKB_CB(skb)->flags & ULPCB_FLAG_URG) != 0; |
| } |
| |
| /* |
| * Generic ARP failure handler that discards the buffer. |
| */ |
| static void arp_failure_discard(struct t3cdev *cdev, struct sk_buff *skb) |
| { |
| kfree_skb(skb); |
| } |
| |
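/*
 * Write a TX_DATA work request header in front of an skb's payload.  len is
 * the amount of TCP payload, including any HW ULP additions.
 */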
| static inline void make_tx_data_wr(struct sock *sk, struct sk_buff *skb, |
| int len) |
| { |
| struct cpl_io_state *cplios = CPL_IO_STATE(sk); |
| struct tx_data_wr *req; |
| struct tcp_sock *tp = tcp_sk(sk); |
| |
| skb_reset_transport_header(skb); |
| req = (struct tx_data_wr *)__skb_push(skb, sizeof(*req)); |
| req->wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_TX_DATA)); |
| req->wr_lo = htonl(V_WR_TID(cplios->tid)); |
| req->sndseq = htonl(tp->snd_nxt); |
| /* len includes the length of any HW ULP additions */ |
| req->len = htonl(len); |
| req->param = htonl(V_TX_PORT(cplios->l2t_entry->chan_idx)); |
| /* V_TX_ULP_SUBMODE sets both the mode and submode */ |
| req->flags = htonl(V_TX_ULP_SUBMODE(skb_ulp_mode(skb)) | |
| V_TX_URG(skb_urgent(skb)) | |
| V_TX_SHOVE((!cplios_flag(sk, CPLIOS_TX_MORE_DATA)) && |
| (skb_peek(&sk->sk_write_queue) ? 0 : 1))); |
| |
| if (!cplios_flag(sk, CPLIOS_TX_DATA_SENT)) { |
| req->flags |= htonl(V_TX_ACK_PAGES(2) | F_TX_INIT | |
| V_TX_CPU_IDX(cplios->rss_cpu_idx)); |
| |
		/* Send buffer is in units of 32KB. */
| req->param |= htonl(V_TX_SNDBUF(sk->sk_sndbuf >> 15)); |
| cplios_set_flag(sk, CPLIOS_TX_DATA_SENT); |
| } |
| } |
| |
| /* |
| * Prepends TX_DATA_WR or CPL_CLOSE_CON_REQ headers to buffers waiting in a |
| * socket's send queue and sends them on to the TOE. Must be called with the |
| * socket lock held. Returns the amount of send buffer space that was freed |
| * as a result of sending queued data to the TOE. |
| */ |
| int t3_push_frames(struct sock *sk, int req_completion) |
| { |
| struct cpl_io_state *cplios = CPL_IO_STATE(sk); |
| int total_size = 0; |
| struct tcp_sock *tp = tcp_sk(sk); |
| struct sk_buff *skb; |
| struct t3cdev *cdev; |
| struct tom_data *d; |
| |
| if (unlikely(sk_in_state(sk, TCPF_SYN_SENT | TCPF_CLOSE))) |
| return 0; |
| |
| /* |
| * We shouldn't really be called at all after an abort but check just |
| * in case. |
| */ |
| if (unlikely(cplios_flag(sk, CPLIOS_ABORT_SHUTDOWN))) |
| return 0; |
| |
| d = TOM_DATA(cplios->toedev); |
| cdev = d->cdev; |
| |
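	/*
	 * Send queued skbs while we have WR credits and are not required to
	 * wait for the TX pipeline to drain.  An skb marked HOLD is sent only
	 * if more data is queued behind it.
	 */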
| while (cplios->wr_avail && (skb = skb_peek(&sk->sk_write_queue)) != NULL && |
| !cplios_flag(sk, CPLIOS_TX_WAIT_IDLE) && |
| (!(ULP_SKB_CB(skb)->flags & ULPCB_FLAG_HOLD) || |
| skb_queue_len(&sk->sk_write_queue) > 1)) { |
| |
| int len = skb->len; /* length before skb_push */ |
| int frags = skb_shinfo(skb)->nr_frags + (len != skb->data_len); |
| int wrs_needed = skb_wrs[frags]; |
| |
| if (wrs_needed > 1 && len + sizeof(struct tx_data_wr) <= wrlen) |
| wrs_needed = 1; |
| |
| WARN_ON(frags >= ARRAY_SIZE(skb_wrs) || wrs_needed < 1); |
| if (cplios->wr_avail < wrs_needed) |
| break; |
| |
| __skb_unlink(skb, &sk->sk_write_queue); |
| skb->priority = mkprio(CPL_PRIORITY_DATA, sk); |
| skb->csum = wrs_needed; /* remember this until the WR_ACK */ |
| cplios->wr_avail -= wrs_needed; |
| cplios->wr_unacked += wrs_needed; |
| enqueue_wr(sk, skb); |
| |
| if (likely(ULP_SKB_CB(skb)->flags & ULPCB_FLAG_NEED_HDR)) { |
| len += ulp_extra_len(skb); |
| make_tx_data_wr(sk, skb, len); |
| tp->snd_nxt += len; |
| tp->lsndtime = tcp_time_stamp; |
| #if defined(CONFIG_T3_ZCOPY_SENDMSG) || defined(CONFIG_T3_ZCOPY_SENDMSG_MODULE) |
| atomic_add(skb->len - sizeof (struct tx_data_wr), |
| &d->tx_dma_pending); |
| skb->sk = sk; |
| #endif |
| if ((req_completion && cplios->wr_unacked == wrs_needed) || |
| (ULP_SKB_CB(skb)->flags & ULPCB_FLAG_COMPL) || |
| cplios->wr_unacked >= cplios->wr_max / 2) { |
| struct work_request_hdr *wr = cplhdr(skb); |
| |
| wr->wr_hi |= htonl(F_WR_COMPL); |
| cplios->wr_unacked = 0; |
| } |
| ULP_SKB_CB(skb)->flags &= ~ULPCB_FLAG_NEED_HDR; |
| } else if (skb->data[0] == FW_WROPCODE_OFLD_CLOSE_CON) |
| cplios_set_flag(sk, CPLIOS_CLOSE_CON_REQUESTED); |
| |
| total_size += skb->truesize; |
| if (ULP_SKB_CB(skb)->flags & ULPCB_FLAG_BARRIER) |
| cplios_set_flag(sk, CPLIOS_TX_WAIT_IDLE); |
| set_arp_failure_handler(skb, arp_failure_discard); |
| |
| l2t_send(cdev, skb, cplios->l2t_entry); |
| } |
| sk->sk_wmem_queued -= total_size; |
| return total_size; |
| } |
| EXPORT_SYMBOL(t3_push_frames); |
| |
| #ifndef TCP_CONGESTION_CONTROL |
| struct tcp_congestion_ops tcp_init_congestion_ops = { |
| .name = "", |
| .owner = THIS_MODULE, |
| }; |
| #endif |
| |
| static inline void free_atid(struct t3cdev *cdev, unsigned int tid) |
| { |
| struct sock *sk = cxgb3_free_atid(cdev, tid); |
| if (sk) |
| sock_put(sk); |
| } |
| /* |
| * Release resources held by an offload connection (TID, L2T entry, etc.) |
| */ |
| void t3_release_offload_resources(struct sock *sk) |
| { |
| struct cpl_io_state *cplios = CPL_IO_STATE(sk); |
| struct tcp_sock *tp = tcp_sk(sk); |
| struct toedev *tdev = cplios->toedev; |
| struct t3cdev *cdev; |
| unsigned int tid = cplios->tid; |
| |
| if (!tdev) |
| return; |
| |
| cdev = T3C_DEV(sk); |
| if (!cdev) |
| return; |
| |
| cplios->rss_cpu_idx = 0; |
| t3_release_ddp_resources(sk); |
| |
| #ifdef CTRL_SKB_CACHE |
| kfree_skb(cplios->ctrl_skb_cache); |
| cplios->ctrl_skb_cache = NULL; |
| #endif |
| |
| if (cplios->wr_avail != cplios->wr_max) { |
| purge_wr_queue(sk); |
| reset_wr_list(sk); |
| } |
| |
| if (cplios->l2t_entry) { |
| l2t_release(L2DATA(cdev), cplios->l2t_entry); |
| cplios->l2t_entry = NULL; |
| } |
| |
| if (sk->sk_state == TCP_SYN_SENT) { // we have ATID |
| free_atid(cdev, tid); |
| __skb_queue_purge(&tp->out_of_order_queue); |
| } else { // we have TID |
| cxgb3_remove_tid(cdev, (void *)sk, tid); |
| sock_put(sk); |
| } |
| |
| t3_set_ca_ops(sk, &tcp_init_congestion_ops); |
| cplios->toedev = NULL; |
| #if 0 |
| printk(KERN_INFO "closing TID %u, state %u\n", tid, sk->sk_state); |
| #endif |
| } |
| |
| /* |
| * Returns whether a CPL message is not expected in the socket backlog of a |
| * closed connection. Most messages are illegal at that point except |
| * ABORT_RPL_RSS and GET_TCB_RPL sent by DDP. |
| */ |
| static int bad_backlog_msg(unsigned int opcode) |
| { |
| return opcode != CPL_ABORT_RPL_RSS && opcode != CPL_GET_TCB_RPL; |
| } |
| |
| /* |
| * Called for each sk_buff in a socket's receive backlog during |
| * backlog processing. |
| */ |
| static int t3_backlog_rcv(struct sock *sk, struct sk_buff *skb) |
| { |
| #if VALIDATE_TID |
| unsigned int opcode = ntohl(skb->csum) >> 24; |
| |
| if (unlikely(sk->sk_state == TCP_CLOSE && bad_backlog_msg(opcode))) { |
| printk(KERN_ERR "unexpected CPL message with opcode %x for " |
| "closed TID %u\n", opcode, CPL_IO_STATE(sk)->tid); |
| kfree_skb(skb); |
| return 0; |
| } |
| #endif |
| |
| BLOG_SKB_CB(skb)->backlog_rcv(sk, skb); |
| return 0; |
| } |
| |
| #ifdef CONFIG_TCP_OFFLOAD_MODULE |
| static void dummy_tcp_keepalive_timer(unsigned long data) |
| { |
| } |
| #endif |
| |
| /* |
| * Switch a socket to the offload protocol operations. Note that the offload |
 * operations do not contain the offload backlog handler; we install that
 * directly on the socket.
| */ |
| static void install_offload_ops(struct sock *sk) |
| { |
| sk->sk_prot = &t3_tcp_prot.proto; |
| sk->sk_backlog_rcv = t3_backlog_rcv; |
| if (sk->sk_write_space == sk_stream_write_space) |
| sk->sk_write_space = t3_write_space; |
| |
| #ifdef LINUX_2_4 |
| if (sk->filter) |
| sk_filter_release(sk, sk->filter); |
| sk->filter = &drop_all; |
| sk_filter_charge(sk, sk->filter); |
| #else |
| if (sk->sk_filter) |
| sk_filter_uncharge(sk, sk->sk_filter); |
| sk->sk_filter = &drop_all; |
| sk_filter_charge(sk, sk->sk_filter); |
| #endif /* LINUX_2_4 */ |
| |
| #ifdef CONFIG_TCP_OFFLOAD_MODULE |
| sk->sk_timer.function = dummy_tcp_keepalive_timer; |
| #endif |
| sock_set_flag(sk, SOCK_OFFLOADED); |
| } |
| |
| #if DEBUG_WR |
| static void dump_wrs(struct sock *sk) |
| { |
| u64 *d; |
| struct sk_buff *p; |
| |
| printk("TID %u info:\n", CPL_IO_STATE(sk)->tid); |
| skb_queue_walk(&sk->sk_write_queue, p) { |
| d = cplhdr(p); |
| printk(" len %u, frags %u, flags %x, data %llx\n", |
| p->len, skb_shinfo(p)->nr_frags, ULP_SKB_CB(p)->flags, |
| (unsigned long long)be64_to_cpu(*d)); |
| } |
| printk("outstanding:\n"); |
| wr_queue_walk(sk, p) { |
| d = cplhdr(p); |
| printk(" len %u, frags %u, flags %x, data %llx,%llx,%llx\n", |
| p->len, skb_shinfo(p)->nr_frags, ULP_SKB_CB(p)->flags, |
| (unsigned long long)be64_to_cpu(*d), |
| (unsigned long long)be64_to_cpu(d[1]), |
| (unsigned long long)be64_to_cpu(d[2])); |
| } |
| } |
| |
| static int count_pending_wrs(const struct sock *sk) |
| { |
| int n = 0; |
| const struct sk_buff *p; |
| |
| wr_queue_walk(sk, p) |
| n += p->csum; |
| return n; |
| } |
| |
| static void check_wr_invariants(const struct sock *sk) |
| { |
| struct cpl_io_state *cplios = CPL_IO_STATE(sk); |
| int pending = count_pending_wrs(sk); |
| |
| if (unlikely(cplios->wr_avail + pending != cplios->wr_max)) |
| printk(KERN_ERR "TID %u: credit imbalance: avail %u, " |
| "pending %u, total should be %u\n", cplios->tid, |
| cplios->wr_avail, pending, cplios->wr_max); |
| } |
| #endif |
| |
| static void t3_idiag_get_info(struct sock *sk, u32 ext, struct sk_buff *skb) |
| { |
| struct cpl_io_state *cplios = CPL_IO_STATE(sk); |
| #if DEBUG_WR |
| if (ext & (1 << (INET_DIAG_MEMINFO - 1))) { |
| bh_lock_sock(sk); |
| if (!sock_owned_by_user(sk)) |
| dump_wrs(sk); |
| bh_unlock_sock(sk); |
| } |
| #endif |
| if (ext & (1 << INET_DIAG_MAX)) { |
| struct rtattr *rta; |
| struct t3_inet_diag_info *info; |
| |
| rta = __RTA_PUT(skb, INET_DIAG_MAX + 1, sizeof(*info)); |
| info = RTA_DATA(rta); |
| info->toe_id = TOE_ID_CHELSIO_T3; |
| info->tid = cplios->tid; |
| info->wrs = cplios->wr_max - cplios->wr_avail; |
| info->queue = cplios->qset_idx; |
| info->ulp_mode = cplios->ulp_mode; |
| info->sched_class = cplios->sched_cls != SCHED_CLS_NONE ? |
| cplios->sched_cls : 0; |
| info->ddp_enabled = DDP_STATE(sk)->ddp_setup; |
| strcpy(info->dev_name, cplios->toedev->name); |
| rtattr_failure: ; |
| } |
| } |
| |
| #define T3_CONG_OPS(s) \ |
| { .name = s, .owner = THIS_MODULE, .get_info = t3_idiag_get_info } |
| |
| static struct tcp_congestion_ops t3_cong_ops[] = { |
| T3_CONG_OPS("reno"), T3_CONG_OPS("tahoe"), |
| T3_CONG_OPS("newreno"), T3_CONG_OPS("highspeed") |
| }; |
| |
| static void mk_act_open_req(struct sock *sk, struct sk_buff *skb, |
| unsigned int atid, const struct l2t_entry *e, |
| const struct offload_settings *s) |
| { |
| struct cpl_act_open_req *req; |
| |
| skb->priority = mkprio(CPL_PRIORITY_SETUP, sk); |
| req = (struct cpl_act_open_req *)__skb_put(skb, sizeof(*req)); |
| req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); |
| OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ACT_OPEN_REQ, atid)); |
| #ifdef LINUX_2_4 |
| req->local_port = sk->inet_sport; |
| req->peer_port = sk->inet_dport; |
| req->local_ip = sk->inet_saddr; |
| req->peer_ip = sk->inet_daddr; |
| #else |
| req->local_port = inet_sk(sk)->inet_sport; |
| req->peer_port = inet_sk(sk)->inet_dport; |
| req->local_ip = inet_sk(sk)->inet_saddr; |
| req->peer_ip = inet_sk(sk)->inet_daddr; |
| #endif /* LINUX_2_4 */ |
| req->opt0h = htonl(calc_opt0h(sk) | V_L2T_IDX(e->idx) | |
| V_TX_CHANNEL(e->chan_idx)); |
| req->opt0l = htonl(calc_opt0l(sk)); |
| req->params = 0; |
| |
| /* |
| * Because we may need to retransmit an ACT_OPEN_REQ and we don't want |
| * to keep the offload settings around we use the following hack: |
| * |
| * - if we are given offload settings we use them and store the |
| * resulting opt2 in rcv_tstamp |
| * - otherwise we use the previously saved opt2 |
| */ |
| if (likely(s)) |
| tcp_sk(sk)->rcv_tstamp = calc_opt2(sk, s); |
| req->opt2 = htonl(tcp_sk(sk)->rcv_tstamp); |
| } |
| |
| /* |
| * Convert an ACT_OPEN_RPL status to a Linux errno. |
| */ |
| static int act_open_rpl_status_to_errno(int status) |
| { |
| switch (status) { |
| case CPL_ERR_CONN_RESET: |
| return ECONNREFUSED; |
| case CPL_ERR_ARP_MISS: |
| return EHOSTUNREACH; |
| case CPL_ERR_CONN_TIMEDOUT: |
| return ETIMEDOUT; |
| case CPL_ERR_TCAM_FULL: |
| return ENOMEM; |
| case CPL_ERR_CONN_EXIST: |
| printk(KERN_ERR "ACTIVE_OPEN_RPL: 4-tuple in use\n"); |
| return EADDRINUSE; |
| default: |
| return EIO; |
| } |
| } |
| |
| static void act_open_req_arp_failure(struct t3cdev *dev, struct sk_buff *skb); |
| |
| static void fail_act_open(struct sock *sk, int errno) |
| { |
| sk->sk_err = errno; |
| sk->sk_error_report(sk); |
| t3_release_offload_resources(sk); |
| connection_done(sk); |
| T3_TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_ATTEMPTFAILS); |
| } |
| |
| static void act_open_retry_timer(unsigned long data) |
| { |
| struct sk_buff *skb; |
| struct sock *sk = (struct sock *)data; |
| struct cpl_io_state *cplios = CPL_IO_STATE(sk); |
| struct inet_connection_sock *icsk = inet_csk(sk); |
| |
| bh_lock_sock(sk); |
| if (sock_owned_by_user(sk)) /* try in a bit */ |
| sk_reset_timer(sk, &icsk->icsk_retransmit_timer, |
| jiffies + HZ / 20); |
| else { |
| skb = alloc_skb(sizeof(struct cpl_act_open_req), GFP_ATOMIC); |
| if (!skb) |
| fail_act_open(sk, ENOMEM); |
| else { |
| skb->sk = sk; |
| set_arp_failure_handler(skb, act_open_req_arp_failure); |
| mk_act_open_req(sk, skb, cplios->tid, |
| cplios->l2t_entry, NULL); |
| l2t_send(T3C_DEV(sk), skb, cplios->l2t_entry); |
| } |
| } |
| bh_unlock_sock(sk); |
| sock_put(sk); |
| } |
| |
| /* |
| * Handle active open failures. |
| */ |
| static void active_open_failed(struct sock *sk, struct sk_buff *skb) |
| { |
| struct cpl_act_open_rpl *rpl = cplhdr(skb); |
| struct inet_connection_sock *icsk = inet_csk(sk); |
| |
| if (rpl->status == CPL_ERR_CONN_EXIST && |
| icsk->icsk_retransmit_timer.function != act_open_retry_timer) { |
| icsk->icsk_retransmit_timer.function = act_open_retry_timer; |
| sk_reset_timer(sk, &icsk->icsk_retransmit_timer, |
| jiffies + HZ / 2); |
| } else |
| fail_act_open(sk, act_open_rpl_status_to_errno(rpl->status)); |
| __kfree_skb(skb); |
| } |
| |
| /* |
| * Return whether a failed active open has allocated a TID |
| */ |
| static inline int act_open_has_tid(int status) |
| { |
| return status != CPL_ERR_TCAM_FULL && status != CPL_ERR_CONN_EXIST && |
| status != CPL_ERR_ARP_MISS; |
| } |
| |
| /* |
| * Process an ACT_OPEN_RPL CPL message. |
| */ |
| static int do_act_open_rpl(struct t3cdev *cdev, struct sk_buff *skb, void *ctx) |
| { |
| struct sock *sk = (struct sock *)ctx; |
| struct cpl_act_open_rpl *rpl = cplhdr(skb); |
| |
| VALIDATE_SOCK(sk); |
| |
| if (cdev->type != T3A && act_open_has_tid(rpl->status)) |
| cxgb3_queue_tid_release(cdev, GET_TID(rpl)); |
| |
| process_cpl_msg_ref(active_open_failed, sk, skb); |
| return 0; |
| } |
| |
| /* |
| * Handle an ARP failure for an active open. XXX purge ofo queue |
| * |
| * XXX badly broken for crossed SYNs as the ATID is no longer valid. |
| * XXX crossed SYN errors should be generated by PASS_ACCEPT_RPL which should |
| * check SOCK_DEAD or sk->sk_sock. Or maybe generate the error here but don't |
| * free the atid. Hmm. |
| */ |
| static void act_open_req_arp_failure(struct t3cdev *dev, struct sk_buff *skb) |
| { |
| struct sock *sk = skb->sk; |
| |
| sock_hold(sk); |
| bh_lock_sock(sk); |
| if (sk->sk_state == TCP_SYN_SENT || sk->sk_state == TCP_SYN_RECV) { |
| if (!sock_owned_by_user(sk)) { |
| fail_act_open(sk, EHOSTUNREACH); |
| __kfree_skb(skb); |
| } else { |
| /* |
| * Smart solution: Synthesize an ACTIVE_OPEN_RPL in the |
| * existing sk_buff and queue it to the backlog. We |
| * are certain the sk_buff is not shared. We also |
| * don't bother trimming the buffer. |
| */ |
| struct cpl_act_open_rpl *rpl = cplhdr(skb); |
| |
| rpl->ot.opcode = CPL_ACT_OPEN_RPL; |
| rpl->status = CPL_ERR_ARP_MISS; |
| BLOG_SKB_CB(skb)->backlog_rcv = active_open_failed; |
| __sk_add_backlog(sk, skb); |
| |
| /* |
| * XXX Make sure a PASS_ACCEPT_RPL behind us doesn't |
| * destroy the socket. Unfortunately we can't go into |
| * SYN_SENT because we don't have an atid. |
| * Needs more thought. |
| */ |
| } |
| } |
| bh_unlock_sock(sk); |
| sock_put(sk); |
| } |
| |
| /* |
| * Determine the receive window size for a socket. |
| */ |
| static unsigned int select_rcv_wnd(struct sock *sk) |
| { |
| struct toedev *dev = CPL_IO_STATE(sk)->toedev; |
| struct tom_data *d = TOM_DATA(dev); |
| unsigned int wnd = tcp_full_space(sk); |
| unsigned int max_rcv_wnd; |
| |
| /* |
| * For receive coalescing to work effectively we need a receive window |
 * that can accommodate a coalesced segment.
| */ |
| if (wnd < MIN_RCV_WND) |
| wnd = MIN_RCV_WND; |
| |
| /* PR 5138 */ |
| max_rcv_wnd = (dev->ttid < TOE_ID_CHELSIO_T3C ? |
| (u32)d->rx_page_size * 23 : |
| MAX_RCV_WND); |
| |
| cplios_set_flag(sk, CPLIOS_UPDATE_RCV_WND); |
| |
| return min(wnd, max_rcv_wnd); |
| } |
| |
| #if defined(TCP_CONGESTION_CONTROL) |
| static void pivot_ca_ops(struct sock *sk, int cong) |
| { |
| struct inet_connection_sock *icsk = inet_csk(sk); |
| |
| if (icsk->icsk_ca_ops->release) |
| icsk->icsk_ca_ops->release(sk); |
| module_put(icsk->icsk_ca_ops->owner); |
| icsk->icsk_ca_ops = &t3_cong_ops[cong < 0 ? 2 : cong]; |
| } |
| #endif |
| |
| #define CTRL_SKB_LEN 120 |
| |
| /* |
| * Assign offload parameters to some socket fields. This code is used by |
| * both active and passive opens. |
| */ |
| static void init_offload_sk(struct sock *sk, struct toedev *dev, |
| unsigned int tid, struct l2t_entry *e, |
| struct dst_entry *dst, |
| struct net_device *egress_dev, |
| const struct offload_settings *s) |
| { |
| struct cpl_io_state *cplios = CPL_IO_STATE(sk); |
| struct tcp_sock *tp = tcp_sk(sk); |
| |
| cplios->toedev = dev; |
| cplios->tid = tid; |
| cplios->l2t_entry = e; |
| cplios->wr_max = cplios->wr_avail = TOM_TUNABLE(dev, max_wrs); |
| cplios->wr_unacked = 0; |
| cplios->delack_mode = 0; |
| cplios->mtu_idx = select_mss(sk, dst_mtu(dst)); |
| tp->rcv_wnd = select_rcv_wnd(sk); |
| cplios->ulp_mode = (TOM_TUNABLE(dev, ddp) && |
| !sock_flag(sk, SOCK_NO_DDP) && |
| tp->rcv_wnd >= MIN_DDP_RCV_WIN |
| ? ULP_MODE_TCPDDP |
| : ULP_MODE_NONE); |
| |
| cplios->sched_cls = (s->sched_class >= 0 |
| ? s->sched_class |
| : SCHED_CLS_NONE); |
| cplios->qset_idx = 0; |
| cplios->rss_cpu_idx = 0; |
| if (s->rssq >= 0) { |
| unsigned int id = s->rssq; |
| |
| if (dev->ctl(dev, GET_CPUIDX_OF_QSET, &id) == 0) { |
| cplios->qset_idx = s->rssq; |
| cplios->rss_cpu_idx = id; |
| } |
| } |
| |
| #ifdef CTRL_SKB_CACHE |
| cplios->ctrl_skb_cache = alloc_skb(CTRL_SKB_LEN, gfp_any()); |
| #endif |
| reset_wr_list(sk); |
| |
| if (!tp->window_clamp) |
| tp->window_clamp = dst_metric(dst, RTAX_WINDOW); |
| |
| /* |
| * Set sk_sndbuf so that t3_write_space and sk_stream_write_space |
| * calculate available socket space the same way. This allows us to |
| * keep the original ->sk_write_space callback in cases of kernel |
| * sockets that provide their own version and expect |
| * sk_stream_write_space's method to be working. |
| * |
 * The only case we don't handle is sockets that have their own
| * ->sk_write_space callback and set SOCK_SNDBUF_LOCK. |
| */ |
| if (!(sk->sk_userlocks & SOCK_SNDBUF_LOCK)) |
| sk->sk_sndbuf = TOM_TUNABLE(dev, max_host_sndbuf); |
| |
| #if defined(TCP_CONGESTION_CONTROL) |
| pivot_ca_ops(sk, s->cong_algo); |
| #endif |
| } |
| |
| static inline void check_sk_callbacks(struct sock *sk) |
| { |
| if (unlikely(sk->sk_user_data && |
| !cplios_flag(sk, CPLIOS_CALLBACKS_CHKD))) { |
| if (install_special_data_ready(sk) > 0) |
| sock_set_flag(sk, SOCK_NO_DDP); |
| cplios_set_flag(sk, CPLIOS_CALLBACKS_CHKD); |
| } |
| } |
| |
| /* |
| * Send an active open request. |
| */ |
| int t3_connect(struct toedev *tdev, struct sock *sk, |
| struct net_device *egress_dev) |
| { |
| int atid; |
| struct sk_buff *skb; |
| struct l2t_entry *e; |
| struct tom_data *d = TOM_DATA(tdev); |
| struct tcp_sock *tp = tcp_sk(sk); |
| struct dst_entry *dst = __sk_dst_get(sk); |
| struct cpl_io_state *cplios; |
| struct offload_req orq; |
| struct offload_settings settings; |
| |
| offload_req_from_sk(&orq, sk, OPEN_TYPE_ACTIVE); |
| settings = *lookup_ofld_policy(tdev, &orq, d->conf.cop_managed_offloading); |
| #ifndef LINUX_2_4 |
| rcu_read_unlock(); |
| #else |
| read_unlock(&tdev->policy_lock); |
| #endif |
| if (!settings.offload) |
| goto out_err; |
| |
| atid = cxgb3_alloc_atid(d->cdev, d->client, sk); |
| if (atid < 0) |
| goto out_err; |
| |
| cplios = kzalloc(sizeof *cplios, GFP_KERNEL); |
| if (cplios == NULL) |
| goto out_err; |
| |
| e = t3_l2t_get(d->cdev, dst->neighbour, egress_dev); |
| if (!e) |
| goto free_tid; |
| |
| skb = alloc_skb_nofail(sizeof(struct cpl_act_open_req)); |
| skb->sk = sk; |
| set_arp_failure_handler(skb, act_open_req_arp_failure); |
| |
| sock_hold(sk); |
| |
| CPL_IO_STATE(sk) = cplios; |
| install_offload_ops(sk); |
| check_sk_callbacks(sk); |
| |
| init_offload_sk(sk, tdev, atid, e, dst, egress_dev, &settings); |
| RCV_WSCALE(tp) = select_rcv_wscale(tcp_full_space(sk), |
| sysctl_tcp_window_scaling, |
| tp->window_clamp); |
| sk->sk_err = 0; |
| sock_reset_flag(sk, SOCK_DONE); |
| T3_TCP_INC_STATS(sock_net(sk), TCP_MIB_ACTIVEOPENS); |
| |
| mk_act_open_req(sk, skb, atid, e, &settings); |
| l2t_send(d->cdev, skb, e); |
| if (cplios->ulp_mode == ULP_MODE_TCPDDP) |
| t3_enable_ddp(sk, 0); |
| return 0; |
| |
| free_tid: |
| free_atid(d->cdev, atid); |
| out_err: |
| return -1; |
| } |
| |
| /* |
| * Handle an ARP failure for a CPL_ABORT_REQ. Change it into a no RST variant |
| * and send it along. |
| */ |
| static void abort_arp_failure(struct t3cdev *cdev, struct sk_buff *skb) |
| { |
| struct cpl_abort_req *req = cplhdr(skb); |
| |
| req->cmd = CPL_ABORT_NO_RST; |
| cxgb3_ofld_send(cdev, skb); |
| } |
| |
| /* |
| * Send an ABORT_REQ message. Cannot fail. This routine makes sure we do |
| * not send multiple ABORT_REQs for the same connection and also that we do |
| * not try to send a message after the connection has closed. Returns 1 if |
| * an ABORT_REQ wasn't generated after all, 0 otherwise. |
| */ |
| int t3_send_reset(struct sock *sk, int mode, struct sk_buff *skb) |
| { |
| struct cpl_io_state *cplios = CPL_IO_STATE(sk); |
| struct cpl_abort_req *req; |
| struct tcp_sock *tp = tcp_sk(sk); |
| unsigned int tid = cplios->tid; |
| |
| if (unlikely(cplios_flag(sk, CPLIOS_ABORT_SHUTDOWN) || |
| !cplios->toedev)) { |
| if (skb) |
| __kfree_skb(skb); |
| return 1; |
| } |
| |
| cplios_set_flag(sk, CPLIOS_ABORT_RPL_PENDING); |
| cplios_set_flag(sk, CPLIOS_ABORT_SHUTDOWN); |
| |
| /* Purge the send queue so we don't send anything after an abort. */ |
| t3_purge_write_queue(sk); |
| |
| if (cplios_flag(sk, CPLIOS_CLOSE_CON_REQUESTED) && is_t3a(cplios->toedev)) |
| mode |= CPL_ABORT_POST_CLOSE_REQ; |
| |
| if (!skb) |
| skb = alloc_skb_nofail(sizeof(*req)); |
| skb->priority = mkprio(CPL_PRIORITY_DATA, sk); |
| set_arp_failure_handler(skb, abort_arp_failure); |
| |
| req = (struct cpl_abort_req *)skb_put(skb, sizeof(*req)); |
| req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_REQ)); |
| req->wr.wr_lo = htonl(V_WR_TID(tid)); |
| OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ABORT_REQ, tid)); |
| req->rsvd0 = htonl(tp->snd_nxt); |
| req->rsvd1 = !cplios_flag(sk, CPLIOS_TX_DATA_SENT); |
| req->cmd = mode; |
| if (sk->sk_state == TCP_SYN_SENT) |
| __skb_queue_tail(&tp->out_of_order_queue, skb); // defer |
| else |
| l2t_send(T3C_DEV(sk), skb, cplios->l2t_entry); |
| return 0; |
| } |
| EXPORT_SYMBOL(t3_send_reset); |
| |
| /* |
| * Reset a connection that is on a listener's SYN queue or accept queue, |
| * i.e., one that has not had a struct socket associated with it. |
| * Must be called from process context. |
| * |
| * Modeled after code in inet_csk_listen_stop(). |
| */ |
| static void reset_listen_child(struct sock *child) |
| { |
| struct sk_buff *skb = alloc_skb_nofail(sizeof(struct cpl_abort_req)); |
| |
| sock_hold(child); // need to survive past inet_csk_destroy_sock() |
| local_bh_disable(); |
| bh_lock_sock(child); |
| |
| t3_send_reset(child, CPL_ABORT_SEND_RST, skb); |
| sock_orphan(child); |
| INC_ORPHAN_COUNT(child); |
| if (child->sk_state == TCP_CLOSE) |
| inet_csk_destroy_sock(child); |
| |
| bh_unlock_sock(child); |
| local_bh_enable(); |
| sock_put(child); |
| } |
| |
| /* |
| * The reap list is the list of passive open sockets that were orphaned when |
| * their listening parent went away and wasn't able to nuke them for whatever |
| * reason. These sockets are terminated through a work request from process |
| * context. |
| */ |
| static struct sock *reap_list; |
| static spinlock_t reap_list_lock = SPIN_LOCK_UNLOCKED; |
| |
| /* |
| * Process the reap list. |
| */ |
| DECLARE_TASK_FUNC(process_reap_list, task_param) |
| { |
| spin_lock_bh(&reap_list_lock); |
| while (reap_list) { |
| struct sock *sk = reap_list; |
| |
| reap_list = sk->sk_user_data; |
| sk->sk_user_data = NULL; |
| spin_unlock_bh(&reap_list_lock); |
| reset_listen_child(sk); |
| spin_lock_bh(&reap_list_lock); |
| } |
| spin_unlock_bh(&reap_list_lock); |
| } |
| |
| static T3_DECLARE_WORK(reap_task, process_reap_list, NULL); |
| |
| /* |
| * Add a socket to the reap list and schedule a work request to process it. |
| * We thread sockets through their sk_user_data pointers. May be called |
| * from softirq context and any associated open request must have already |
| * been freed. |
| */ |
| static void add_to_reap_list(struct sock *sk) |
| { |
| BUG_ON(sk->sk_user_data); |
| |
| release_tcp_port(sk); // release the port immediately, it may be reused |
| |
| spin_lock_bh(&reap_list_lock); |
| sk->sk_user_data = reap_list; |
| reap_list = sk; |
| if (!sk->sk_user_data) |
| schedule_work(&reap_task); |
| spin_unlock_bh(&reap_list_lock); |
| } |
| |
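/*
 * Populate a CPL_SET_TCB_FIELD message to change one word of a connection's
 * TCB.
 */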
| static void __set_tcb_field(struct sock *sk, struct sk_buff *skb, u16 word, |
| u64 mask, u64 val, int no_reply) |
| { |
| struct cpl_io_state *cplios = CPL_IO_STATE(sk); |
| struct cpl_set_tcb_field *req; |
| |
| req = (struct cpl_set_tcb_field *)__skb_put(skb, sizeof(*req)); |
| req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); |
| OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, cplios->tid)); |
| req->reply = V_NO_REPLY(no_reply); |
| req->cpu_idx = cplios->rss_cpu_idx; |
| req->word = htons(word); |
| req->mask = cpu_to_be64(mask); |
| req->val = cpu_to_be64(val); |
| |
| skb->priority = mkprio(CPL_PRIORITY_CONTROL, sk); |
| } |
| |
| void t3_set_tcb_field(struct sock *sk, u16 word, u64 mask, u64 val) |
| { |
| struct sk_buff *skb; |
| |
| if (sk->sk_state == TCP_CLOSE || cplios_flag(sk, CPLIOS_ABORT_SHUTDOWN)) |
| return; |
| |
| skb = alloc_ctrl_skb(tcp_sk(sk), sizeof(struct cpl_set_tcb_field)); |
| __set_tcb_field(sk, skb, word, mask, val, 1); |
| send_or_defer(sk, tcp_sk(sk), skb, 0); |
| } |
| |
| /* |
| * Set one of the t_flags bits in the TCB. |
| */ |
| static void set_tcb_tflag(struct sock *sk, unsigned int bit_pos, int val) |
| { |
| t3_set_tcb_field(sk, W_TCB_T_FLAGS1, 1ULL << bit_pos, val << bit_pos); |
| } |
| |
| /* |
| * Send a SET_TCB_FIELD CPL message to change a connection's Nagle setting. |
| */ |
| void t3_set_nagle(struct sock *sk) |
| { |
| set_tcb_tflag(sk, S_TF_NAGLE, !(tcp_sk(sk)->nonagle & TCP_NAGLE_OFF)); |
| } |
| |
| /* |
| * Send a SET_TCB_FIELD CPL message to change a connection's keepalive setting. |
| */ |
| void t3_set_keepalive(struct sock *sk, int on_off) |
| { |
| set_tcb_tflag(sk, S_TF_KEEPALIVE, on_off); |
| } |
| |
| void t3_set_rcv_coalesce_enable(struct sock *sk, int on_off) |
| { |
| set_tcb_tflag(sk, S_TF_RCV_COALESCE_ENABLE, on_off); |
| } |
| |
| void t3_set_dack(struct sock *sk, int on_off) |
| { |
| set_tcb_tflag(sk, S_TF_DACK, on_off); |
| } |
| |
| void t3_set_dack_mss(struct sock *sk, int on_off) |
| { |
| set_tcb_tflag(sk, S_TF_DACK_MSS, on_off); |
| } |
| |
| void t3_set_migrating(struct sock *sk, int on_off) |
| { |
| set_tcb_tflag(sk, S_TF_MIGRATING, on_off); |
| } |
| |
| void t3_set_non_offload(struct sock *sk, int on_off) |
| { |
| set_tcb_tflag(sk, S_TF_NON_OFFLOAD, on_off); |
| } |
| |
| /* |
| * Send a SET_TCB_FIELD CPL message to change a connection's TOS setting. |
| */ |
| void t3_set_tos(struct sock *sk) |
| { |
| t3_set_tcb_field(sk, W_TCB_TOS, V_TCB_TOS(M_TCB_TOS), |
| V_TCB_TOS(SK_TOS(sk))); |
| } |
| |
| /* |
| * In DDP mode, TP fails to schedule a timer to push RX data to the host when |
| * DDP is disabled (data is delivered to freelist). [Note that, the peer should |
| * set the PSH bit in the last segment, which would trigger delivery.] |
| * We work around the issue by setting a DDP buffer in a partial placed state, |
| * which guarantees that TP will schedule a timer. |
| */ |
| #define TP_DDP_TIMER_WORKAROUND_MASK\ |
| (V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(1) |\ |
| ((V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) |\ |
| V_TCB_RX_DDP_BUF0_LEN(3)) << 32)) |
| #define TP_DDP_TIMER_WORKAROUND_VAL\ |
| (V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(0) |\ |
| ((V_TCB_RX_DDP_BUF0_OFFSET((u64)1) | V_TCB_RX_DDP_BUF0_LEN((u64)2)) <<\ |
| 32)) |
| |
| void t3_enable_ddp(struct sock *sk, int on) |
| { |
| if (on) |
| t3_set_tcb_field(sk, W_TCB_RX_DDP_FLAGS, V_TF_DDP_OFF(1), |
| V_TF_DDP_OFF(0)); |
| else |
| t3_set_tcb_field(sk, W_TCB_RX_DDP_FLAGS, |
| V_TF_DDP_OFF(1) | |
| TP_DDP_TIMER_WORKAROUND_MASK, |
| V_TF_DDP_OFF(1) | |
| TP_DDP_TIMER_WORKAROUND_VAL); |
| } |
| |
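/*
 * Set the DDP tag/color of one of a connection's two HW DDP buffers.
 */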
| void t3_set_ddp_tag(struct sock *sk, int buf_idx, unsigned int tag_color) |
| { |
| t3_set_tcb_field(sk, W_TCB_RX_DDP_BUF0_TAG + buf_idx, |
| V_TCB_RX_DDP_BUF0_TAG(M_TCB_RX_DDP_BUF0_TAG), |
| tag_color); |
| } |
| |
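/*
 * Program the offset and length of one of a connection's HW DDP buffers.
 */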
| void t3_set_ddp_buf(struct sock *sk, int buf_idx, unsigned int offset, |
| unsigned int len) |
| { |
| if (buf_idx == 0) |
| t3_set_tcb_field(sk, W_TCB_RX_DDP_BUF0_OFFSET, |
| V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) | |
| V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN), |
| V_TCB_RX_DDP_BUF0_OFFSET((u64)offset) | |
| V_TCB_RX_DDP_BUF0_LEN((u64)len)); |
| else |
| t3_set_tcb_field(sk, W_TCB_RX_DDP_BUF1_OFFSET, |
| V_TCB_RX_DDP_BUF1_OFFSET(M_TCB_RX_DDP_BUF1_OFFSET) | |
				 V_TCB_RX_DDP_BUF1_LEN((u64)M_TCB_RX_DDP_BUF1_LEN << 32),
| V_TCB_RX_DDP_BUF1_OFFSET((u64)offset) | |
| V_TCB_RX_DDP_BUF1_LEN(((u64)len) << 32)); |
| } |
| |
| int t3_set_cong_control(struct sock *sk, const char *name) |
| { |
| int cong_algo; |
| |
| for (cong_algo = 0; cong_algo < ARRAY_SIZE(t3_cong_ops); cong_algo++) |
| if (!strcmp(name, t3_cong_ops[cong_algo].name)) |
| break; |
| |
| if (cong_algo >= ARRAY_SIZE(t3_cong_ops)) |
| return -EINVAL; |
| return 0; |
| } |
| |
| int t3_get_tcb(struct sock *sk) |
| { |
| struct cpl_io_state *cplios = CPL_IO_STATE(sk); |
| struct cpl_get_tcb *req; |
| struct tcp_sock *tp = tcp_sk(sk); |
| struct sk_buff *skb = alloc_skb(sizeof(*req), gfp_any()); |
| |
| if (!skb) |
| return -ENOMEM; |
| |
| skb->priority = mkprio(CPL_PRIORITY_CONTROL, sk); |
| req = (struct cpl_get_tcb *)__skb_put(skb, sizeof(*req)); |
| req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); |
| OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_GET_TCB, cplios->tid)); |
| req->cpuno = htons(cplios->rss_cpu_idx); |
| if (sk->sk_state == TCP_SYN_SENT) |
| __skb_queue_tail(&tp->out_of_order_queue, skb); // defer |
| else |
| cxgb3_ofld_send(T3C_DEV(sk), skb); |
| return 0; |
| } |
| |
| |
| /* |
| * Send RX credits through an RX_DATA_ACK CPL message. If nofail is 0 we are |
| * permitted to return without sending the message in case we cannot allocate |
| * an sk_buff. Returns the number of credits sent. |
| */ |
| u32 t3_send_rx_credits(struct sock *sk, u32 credits, u32 dack, int nofail) |
| { |
| struct cpl_io_state *cplios = CPL_IO_STATE(sk); |
| struct sk_buff *skb; |
| struct cpl_rx_data_ack *req; |
| |
	skb = nofail ? alloc_ctrl_skb(tcp_sk(sk), sizeof(*req)) :
| alloc_skb(sizeof(*req), GFP_ATOMIC); |
| if (!skb) |
| return 0; |
| |
| req = (struct cpl_rx_data_ack *)__skb_put(skb, sizeof(*req)); |
| req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); |
| OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, cplios->tid)); |
| req->credit_dack = htonl(dack | V_RX_CREDITS(credits)); |
| skb->priority = mkprio(CPL_PRIORITY_ACK, sk); |
| cxgb3_ofld_send(T3C_DEV(sk), skb); |
| return credits; |
| } |
| |
| /* |
| * Send RX_DATA_ACK CPL message to request a modulation timer to be scheduled. |
| * This is only used in DDP mode, so we take the opportunity to also set the |
| * DACK mode and flush any Rx credits. |
| */ |
| void t3_send_rx_modulate(struct sock *sk) |
| { |
| struct cpl_io_state *cplios = CPL_IO_STATE(sk); |
| struct sk_buff *skb; |
| struct cpl_rx_data_ack *req; |
| struct tcp_sock *tp = tcp_sk(sk); |
| u32 dack; |
| |
| dack = t3_select_delack(sk); |
| |
| skb = alloc_ctrl_skb(tp, sizeof(*req)); |
| |
| req = (struct cpl_rx_data_ack *)__skb_put(skb, sizeof(*req)); |
| req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); |
| OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, cplios->tid)); |
| req->credit_dack = htonl(F_RX_MODULATE | F_RX_DACK_CHANGE | |
| V_RX_DACK_MODE(dack) | |
| V_RX_CREDITS(tp->copied_seq - tp->rcv_wup)); |
| skb->priority = mkprio(CPL_PRIORITY_CONTROL, sk); |
| cxgb3_ofld_send(T3C_DEV(sk), skb); |
| tp->rcv_wup = tp->copied_seq; |
| } |
| |
| /* |
| * Handle receipt of an urgent pointer. |
| */ |
| static void handle_urg_ptr(struct sock *sk, u32 urg_seq) |
| { |
| struct tcp_sock *tp = tcp_sk(sk); |
| |
| urg_seq--; /* initially points past the urgent data, per BSD */ |
| |
| if (tp->urg_data && !after(urg_seq, tp->urg_seq)) |
| return; /* duplicate pointer */ |
| |
| sk_send_sigurg(sk); |
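	/*
	 * If the previous urgent byte sits exactly at copied_seq and is not
	 * delivered inline, skip over it so it isn't returned as normal data.
	 */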
| if (tp->urg_seq == tp->copied_seq && tp->urg_data && |
| !sock_flag(sk, SOCK_URGINLINE) && tp->copied_seq != tp->rcv_nxt) { |
| struct sk_buff *skb = skb_peek(&sk->sk_receive_queue); |
| |
| tp->copied_seq++; |
| if (skb && tp->copied_seq - ULP_SKB_CB(skb)->seq >= skb->len) |
| tom_eat_skb(sk, skb, 0); |
| } |
| tp->urg_data = TCP_URG_NOTYET; |
| tp->urg_seq = urg_seq; |
| } |
| |
| /* |
| * Returns true if a socket cannot accept new Rx data. |
| */ |
| static inline int sk_no_receive(const struct sock *sk) |
| { |
| return (sk->sk_shutdown & RCV_SHUTDOWN); |
| } |
| |
| /* |
| * Process an urgent data notification. |
| */ |
| static void rx_urg_notify(struct sock *sk, struct sk_buff *skb) |
| { |
| struct cpl_rx_urg_notify *hdr = cplhdr(skb); |
| |
| if (!sk_no_receive(sk)) |
| handle_urg_ptr(sk, ntohl(hdr->seq)); |
| |
| __kfree_skb(skb); |
| } |
| |
| /* |
| * Handler for RX_URG_NOTIFY CPL messages. |
| */ |
| static int do_rx_urg_notify(struct t3cdev *cdev, struct sk_buff *skb, void *ctx) |
| { |
| struct sock *sk = (struct sock *)ctx; |
| |
| VALIDATE_SOCK(sk); |
| |
| process_cpl_msg(rx_urg_notify, sk, skb); |
| return 0; |
| } |
| |
| /* |
| * A helper function that aborts a connection and increments the given MIB |
| * counter. The supplied skb is used to generate the ABORT_REQ message if |
| * possible. Must be called with softirqs disabled. |
| */ |
| static inline void abort_conn(struct sock *sk, struct sk_buff *skb, int mib) |
| { |
| struct sk_buff *abort_skb; |
| |
| abort_skb = __get_cpl_reply_skb(skb, sizeof(struct cpl_abort_req), |
| GFP_ATOMIC); |
| if (abort_skb) { |
| T3_NET_INC_STATS_BH(sock_net(sk), mib); |
| t3_send_reset(sk, CPL_ABORT_SEND_RST, abort_skb); |
| } |
| } |
| |
| /* |
| * Returns true if we need to explicitly request RST when we receive new data |
| * on an RX-closed connection. |
| */ |
| static inline int need_rst_on_excess_rx(const struct sock *sk) |
| { |
| return 1; |
| } |
| |
| /* |
| * Handles Rx data that arrives in a state where the socket isn't accepting |
| * new data. |
| */ |
| static void handle_excess_rx(struct sock *sk, struct sk_buff *skb) |
| { |
| if (need_rst_on_excess_rx(sk) && !cplios_flag(sk, CPLIOS_ABORT_SHUTDOWN)) |
| abort_conn(sk, skb, LINUX_MIB_TCPABORTONDATA); |
| |
| kfree_skb(skb); /* can't use __kfree_skb here */ |
| } |
| |
| /* |
| * Process a get_tcb_rpl as a DDP completion (similar to RX_DDP_COMPLETE) |
| * by getting the DDP offset from the TCB. |
| */ |
| static void tcb_rpl_as_ddp_complete(struct sock *sk, struct sk_buff *skb) |
| { |
| struct cpl_io_state *cplios = CPL_IO_STATE(sk); |
| struct tcp_sock *tp = tcp_sk(sk); |
| struct ddp_state *q = DDP_STATE(sk); |
| struct ddp_buf_state *bsp; |
| struct cpl_get_tcb_rpl *hdr; |
| unsigned int ddp_offset, dack, dack_mss; |
| u64 t; |
| __be64 *tcb; |
| |
| if (unlikely(!(tp = tcp_sk(sk)))) { |
| kfree_skb(skb); |
| return; |
| } |
| |
	/* Note that we only account for CPL_GET_TCB issued by the DDP code.  We
| * really need a cookie in order to dispatch the RPLs. |
| */ |
| q->get_tcb_count--; |
| |
	/* It is possible that a previous CPL already invalidated UBUF DDP
	 * and moved the cur_buf idx, in which case no further processing of
	 * this skb is required.  However, the app might be sleeping on
| * !q->get_tcb_count and we need to wake it up. |
| */ |
| if (q->cancel_ubuf && !t3_ddp_ubuf_pending(sk)) { |
| kfree_skb(skb); |
| |
| if (!sock_flag(sk, SOCK_DEAD)) |
| sk->sk_data_ready(sk, 0); |
| |
| return; |
| } |
| |
| bsp = &q->buf_state[q->cur_buf]; |
| hdr = cplhdr(skb); |
| tcb = (__be64 *)(hdr + 1); |
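	/*
	 * The TCB is returned as big-endian 64-bit words, apparently with the
	 * highest-numbered 32-bit TCB word first, so 32-bit word W is found in
	 * 64-bit word (31 - W) / 2; hence the index arithmetic below.
	 */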
| if (q->cur_buf == 0) { |
| t = be64_to_cpu(tcb[(31 - W_TCB_RX_DDP_BUF0_OFFSET) / 2]); |
| ddp_offset = t >> (32 + S_TCB_RX_DDP_BUF0_OFFSET); |
| } else { |
| t = be64_to_cpu(tcb[(31 - W_TCB_RX_DDP_BUF1_OFFSET) / 2]); |
| ddp_offset = t >> S_TCB_RX_DDP_BUF1_OFFSET; |
| } |
| ddp_offset &= M_TCB_RX_DDP_BUF0_OFFSET; |
| t = be64_to_cpu(tcb[(31 - W_TCB_T_FLAGS1) /2]); |
| dack = (t >> (32 + S_TF_DACK)) & 0x1; |
| t = be64_to_cpu(tcb[(31 - W_TCB_T_FLAGS2) /2]); |
| dack_mss = (t >> (S_TF_DACK_MSS - 32)) & 0x1; |
| dack |= dack_mss << 1; |
| if (unlikely(dack != cplios->delack_mode)) { |
| cplios->delack_mode = dack; |
| cplios->delack_seq = tp->rcv_nxt; |
| } |
| |
| #ifdef T3_TRACE |
| T3_TRACE4(TIDTB(sk), |
| "tcb_rpl_as_ddp_complete: seq 0x%x hwbuf %u ddp_offset %u delack_mode %u", |
| tp->rcv_nxt, q->cur_buf, ddp_offset, cplios->delack_mode); |
| #endif |
| |
| #if 0 |
| { |
| unsigned int ddp_flags, rcv_nxt, rx_hdr_offset, buf_idx; |
| |
| t = be64_to_cpu(tcb[(31 - W_TCB_RX_DDP_FLAGS) / 2]); |
| ddp_flags = (t >> S_TCB_RX_DDP_FLAGS) & M_TCB_RX_DDP_FLAGS; |
| |
| t = be64_to_cpu(tcb[(31 - W_TCB_RCV_NXT) / 2]); |
| rcv_nxt = t >> S_TCB_RCV_NXT; |
| rcv_nxt &= M_TCB_RCV_NXT; |
| |
| t = be64_to_cpu(tcb[(31 - W_TCB_RX_HDR_OFFSET) / 2]); |
| rx_hdr_offset = t >> (32 + S_TCB_RX_HDR_OFFSET); |
| rx_hdr_offset &= M_TCB_RX_HDR_OFFSET; |
| |
| T3_TRACE2(TIDTB(sk), |
| "tcb_rpl_as_ddp_complete: DDP FLAGS 0x%x dma up to 0x%x", |
| ddp_flags, rcv_nxt - rx_hdr_offset); |
| T3_TRACE4(TB(q), |
| "tcb_rpl_as_ddp_complete: rcvnxt 0x%x hwbuf %u cur_offset %u cancel %u", |
| tp->rcv_nxt, q->cur_buf, bsp->cur_offset, q->cancel_ubuf); |
| T3_TRACE3(TB(q), |
| "tcb_rpl_as_ddp_complete: TCB rcvnxt 0x%x hwbuf 0x%x ddp_offset %u", |
| rcv_nxt - rx_hdr_offset, ddp_flags, ddp_offset); |
| T3_TRACE2(TB(q), |
| "tcb_rpl_as_ddp_complete: flags0 0x%x flags1 0x%x", |
| q->buf_state[0].flags, q->buf_state[1].flags); |
| |
| } |
| #endif |
| |
| skb_ulp_ddp_offset(skb) = bsp->cur_offset; |
| bsp->cur_offset = ddp_offset; |
| skb->len = ddp_offset - skb_ulp_ddp_offset(skb); |
| |
| if (unlikely(sk_no_receive(sk) && skb->len)) { |
| handle_excess_rx(sk, skb); |
| return; |
| } |
| |
| #ifdef T3_TRACE |
| if ((int)skb->len < 0) { |
| T3_TRACE0(TIDTB(sk), "tcb_rpl_as_ddp_complete: neg len"); |
| } |
| #endif |
| if (bsp->flags & DDP_BF_NOCOPY) { |
| #ifdef T3_TRACE |
| T3_TRACE0(TIDTB(sk), |
| "tcb_rpl_as_ddp_complete: CANCEL UBUF"); |
| |
| if (!q->cancel_ubuf && !(sk->sk_shutdown & RCV_SHUTDOWN)) { |
| printk("!cancel_ubuf"); |
| } |
| #endif |
| skb_ulp_ddp_flags(skb) = DDP_BF_PSH | DDP_BF_NOCOPY | 1; |
| bsp->flags &= ~(DDP_BF_NOCOPY|DDP_BF_NODATA); |
| q->cur_buf ^= 1; |
| } else if (bsp->flags & DDP_BF_NOFLIP) { |
| |
| skb_ulp_ddp_flags(skb) = 1; /* always a kernel buffer */ |
| |
| /* now HW buffer carries a user buffer */ |
| bsp->flags &= ~DDP_BF_NOFLIP; |
| bsp->flags |= DDP_BF_NOCOPY; |
| |
| /* It is possible that the CPL_GET_TCB_RPL doesn't indicate |
| * any new data in which case we're done. If in addition the |
| * offset is 0, then there wasn't a completion for the kbuf |
| * and we need to decrement the posted count. |
| */ |
| if (!skb->len) { |
| if (!ddp_offset) { |
| q->kbuf_posted--; |
| bsp->flags |= DDP_BF_NODATA; |
| } |
| BUG_ON(skb->len); |
| kfree_skb(skb); |
| return; |
| } |
| } else { |
| /* This reply is for a CPL_GET_TCB_RPL to cancel the UBUF DDP, |
| * but it got here way late and nobody cares anymore. |
| */ |
| kfree_skb(skb); |
| return; |
| } |
| |
| skb_gl_set(skb, bsp->gl); |
| ULP_SKB_CB(skb)->seq = tp->rcv_nxt; |
| tp->rcv_nxt += skb->len; |
| |
| skb_reset_transport_header(skb); |
| tcp_hdr(skb)->fin = 0; /* changes original TCB */ |
| |
| inet_csk(sk)->icsk_ack.lrcvtime = tcp_time_stamp; |
| |
| #ifdef T3_TRACE |
| T3_TRACE3(TIDTB(sk), |
| "tcb_rpl_as_ddp_complete: seq 0x%x hwbuf %u lskb->len %u", |
| ULP_SKB_CB(skb)->seq, q->cur_buf, skb->len); |
| #endif |
| |
| __skb_queue_tail(&sk->sk_receive_queue, skb); |
| |
| if (!sock_flag(sk, SOCK_DEAD)) |
| sk->sk_data_ready(sk, 0); |
| } |
| |
| /* |
| * Process a CPL_GET_TCB_RPL. These can also be generated by the DDP code, |
| * in that case they are similar to DDP completions. |
| */ |
| static int do_get_tcb_rpl(struct t3cdev *cdev, struct sk_buff *skb, void *ctx) |
| { |
| struct sock *sk = (struct sock *)ctx; |
| |
| /* OK if socket doesn't exist */ |
| if (!sk) |
| return CPL_RET_BUF_DONE; |
| |
| process_cpl_msg(tcb_rpl_as_ddp_complete, sk, skb); |
| return 0; |
| } |
| |
| static void handle_ddp_data(struct sock *sk, struct sk_buff *origskb) |
| { |
| struct cpl_io_state *cplios = CPL_IO_STATE(sk); |
| struct tcp_sock *tp = tcp_sk(sk); |
| struct ddp_state *q; |
| struct ddp_buf_state *bsp; |
| struct cpl_rx_data *hdr = cplhdr(origskb); |
| unsigned int rcv_nxt = ntohl(hdr->seq); |
| struct sk_buff *skb; |
| |
	/*
	 * If the sequence number received is not beyond what we already
	 * expect, no data was placed by DDP and the assumptions that follow
	 * do not apply.
	 */
| |
| if (tp->rcv_nxt >= rcv_nxt) |
| return; |
| |
| q = DDP_STATE(sk); |
| if (!q->ddp_setup) |
| return; |
| |
| skb = skb_clone(origskb, GFP_ATOMIC); |
| if (!skb) |
| return; |
| |
| bsp = &q->buf_state[q->cur_buf]; |
| |
| 	/*
| 	 * Here we assume that the data placed into host memory by DDP
| 	 * corresponds to the difference between the sequence number in the
| 	 * RX_DATA header and the expected sequence number.  Since we checked
| 	 * the sequence above, the computed skb->len is positive and we won't
| 	 * panic later on.
| 	 */
| |
| skb->len = rcv_nxt - tp->rcv_nxt; |
| |
| #ifdef T3_TRACE |
| if ((int)skb->len < 0) { |
| T3_TRACE0(TIDTB(sk), "handle_ddp_data: neg len"); |
| } |
| #endif |
| |
| skb_gl_set(skb, bsp->gl); |
| |
| skb_ulp_ddp_offset(skb) = bsp->cur_offset; |
| skb_ulp_ddp_flags(skb) = |
| DDP_BF_PSH | (bsp->flags & DDP_BF_NOCOPY) | 1; |
| if (bsp->flags & DDP_BF_NOCOPY) |
| bsp->flags &= ~DDP_BF_NOCOPY; |
| |
| if (unlikely(hdr->dack_mode != cplios->delack_mode)) { |
| cplios->delack_mode = hdr->dack_mode; |
| cplios->delack_seq = tp->rcv_nxt; |
| } |
| |
| ULP_SKB_CB(skb)->seq = tp->rcv_nxt; |
| tp->rcv_nxt = rcv_nxt; |
| bsp->cur_offset += skb->len; |
| if (!(bsp->flags & DDP_BF_NOFLIP)) |
| q->cur_buf ^= 1; |
| inet_csk(sk)->icsk_ack.lrcvtime = tcp_time_stamp; |
| __skb_queue_tail(&sk->sk_receive_queue, skb); |
| |
| /* For now, don't re-enable DDP after a connection fell out of DDP |
| * mode. |
| */ |
| q->ubuf_ddp_ready = 0; |
| }
|
| /*
| * Process new data received for a connection. |
| */ |
| static void new_rx_data(struct sock *sk, struct sk_buff *skb) |
| { |
| struct cpl_io_state *cplios = CPL_IO_STATE(sk); |
| struct cpl_rx_data *hdr = cplhdr(skb); |
| struct tcp_sock *tp = tcp_sk(sk); |
| |
| if (unlikely(sk_no_receive(sk))) { |
| handle_excess_rx(sk, skb); |
| return; |
| } |
| |
| if (cplios->ulp_mode == ULP_MODE_TCPDDP) |
| handle_ddp_data(sk, skb); |
| |
| ULP_SKB_CB(skb)->seq = ntohl(hdr->seq); |
| ULP_SKB_CB(skb)->flags = 0; |
| skb_ulp_mode(skb) = ULP_MODE_NONE; /* for iSCSI */ |
| skb_ulp_ddp_flags(skb) = 0; /* for DDP */ |
| |
| #if VALIDATE_SEQ |
| if (unlikely(ULP_SKB_CB(skb)->seq != tp->rcv_nxt)) { |
| printk(KERN_ERR |
| "%s: TID %u: Bad sequence number %u, expected %u\n", |
| cplios->toedev->name, cplios->tid, ULP_SKB_CB(skb)->seq, |
| tp->rcv_nxt); |
| __kfree_skb(skb); |
| return; |
| } |
| #endif |
| skb_reset_transport_header(skb); |
| __skb_pull(skb, sizeof(*hdr)); |
| if (!skb->data_len) |
| __skb_trim(skb, ntohs(hdr->len)); |
| |
| if (unlikely(hdr->urg)) |
| handle_urg_ptr(sk, tp->rcv_nxt + ntohs(hdr->urg)); |
| if (unlikely(tp->urg_data == TCP_URG_NOTYET && |
| tp->urg_seq - tp->rcv_nxt < skb->len)) |
| tp->urg_data = TCP_URG_VALID | skb->data[tp->urg_seq - |
| tp->rcv_nxt]; |
| |
| if (unlikely(hdr->dack_mode != cplios->delack_mode)) { |
| cplios->delack_mode = hdr->dack_mode; |
| cplios->delack_seq = tp->rcv_nxt; |
| } |
| |
| tcp_hdr(skb)->fin = 0; /* modifies original hdr->urg */ |
| tp->rcv_nxt += skb->len; |
| |
| #ifdef T3_TRACE |
| T3_TRACE2(TIDTB(sk), |
| "new_rx_data: seq 0x%x len %u", |
| ULP_SKB_CB(skb)->seq, skb->len); |
| #endif |
| |
| inet_csk(sk)->icsk_ack.lrcvtime = tcp_time_stamp; |
| __skb_queue_tail(&sk->sk_receive_queue, skb); |
| |
| if (!sock_flag(sk, SOCK_DEAD)) { |
| check_sk_callbacks(sk); |
| sk->sk_data_ready(sk, 0); |
| } |
| } |
| |
| /* |
| * Handler for RX_DATA CPL messages. |
| */ |
| static int do_rx_data(struct t3cdev *cdev, struct sk_buff *skb, void *ctx) |
| { |
| struct sock *sk = (struct sock *)ctx; |
| |
| VALIDATE_SOCK(sk); |
| |
| skb_gl_set(skb, NULL); /* indicates packet is RX_DATA */ |
| |
| process_cpl_msg(new_rx_data, sk, skb); |
| return 0; |
| } |
| |
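| /*
|  * Process new data received on a connection in DDP mode, i.e., an
|  * RX_DATA_DDP message.  The payload has already been placed in the current
|  * DDP buffer, so the skb carries only the CPL header; we overload skb->len
|  * and the ULP fields to describe the placed region and queue the skb on
|  * the receive queue.
|  */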
| static void new_rx_data_ddp(struct sock *sk, struct sk_buff *skb) |
| { |
| struct cpl_io_state *cplios = CPL_IO_STATE(sk); |
| struct tcp_sock *tp; |
| struct ddp_state *q; |
| struct ddp_buf_state *bsp; |
| struct cpl_rx_data_ddp *hdr; |
| unsigned int ddp_len, rcv_nxt, ddp_report, end_offset, buf_idx; |
| 	unsigned int nomoredata = 0;
| unsigned int delack_mode; |
| |
| if (unlikely(sk_no_receive(sk))) { |
| handle_excess_rx(sk, skb); |
| return; |
| } |
| |
| tp = tcp_sk(sk); |
| q = DDP_STATE(sk); |
| hdr = cplhdr(skb); |
| ddp_report = ntohl(hdr->ddp_report); |
| buf_idx = (ddp_report >> S_DDP_BUF_IDX) & 1; |
| bsp = &q->buf_state[buf_idx]; |
| |
| #ifdef T3_TRACE |
| T3_TRACE5(TIDTB(sk), |
| "new_rx_data_ddp: tp->rcv_nxt 0x%x cur_offset %u " |
| "hdr seq 0x%x len %u offset %u", |
| tp->rcv_nxt, bsp->cur_offset, ntohl(hdr->seq), |
| ntohs(hdr->len), G_DDP_OFFSET(ddp_report)); |
| T3_TRACE1(TIDTB(sk), |
| "new_rx_data_ddp: ddp_report 0x%x", |
| ddp_report); |
| #endif |
| |
| ddp_len = ntohs(hdr->len); |
| rcv_nxt = ntohl(hdr->seq) + ddp_len; |
| |
| delack_mode = G_DDP_DACK_MODE(ddp_report); |
| if (unlikely(G_DDP_DACK_MODE(ddp_report) != cplios->delack_mode)) { |
| cplios->delack_mode = delack_mode; |
| cplios->delack_seq = tp->rcv_nxt; |
| } |
| |
| ULP_SKB_CB(skb)->seq = tp->rcv_nxt; |
| tp->rcv_nxt = rcv_nxt; |
| |
| /* |
| * Store the length in skb->len. We are changing the meaning of |
| * skb->len here, we need to be very careful that nothing from now on |
| * interprets ->len of this packet the usual way. |
| */ |
| skb->len = tp->rcv_nxt - ULP_SKB_CB(skb)->seq; |
| |
| /* |
| 	 * Figure out where the new data was placed in the buffer and store
| 	 * it in the skb.  Assumes the buffer offset starts at 0, the consumer
| 	 * needs to account for the page pod's pg_offset.
| */ |
| end_offset = G_DDP_OFFSET(ddp_report) + ddp_len; |
| skb_ulp_ddp_offset(skb) = end_offset - skb->len; |
| |
| /* |
| * We store in mac.raw the address of the gather list where the |
| * placement happened. |
| */ |
| skb_gl_set(skb, bsp->gl); |
| bsp->cur_offset = end_offset; |
| |
| /* |
| * Bit 0 of DDP flags stores whether the DDP buffer is completed. |
| * Note that other parts of the code depend on this being in bit 0. |
| */ |
| if ((bsp->flags & DDP_BF_NOINVAL) && end_offset != bsp->gl->length) { |
| skb_ulp_ddp_flags(skb) = 0; /* potential spurious completion */ |
| BUG_ON(1); |
| } else { |
| skb_ulp_ddp_flags(skb) = !!(ddp_report & F_DDP_BUF_COMPLETE); |
| if (skb_ulp_ddp_flags(skb) && !(bsp->flags & DDP_BF_NOFLIP)) { |
| q->cur_buf ^= 1; /* flip buffers */ |
| if (end_offset < q->kbuf[0]->length) |
| 				nomoredata = 1;
| } |
| } |
| |
| if (bsp->flags & DDP_BF_NOCOPY) { |
| skb_ulp_ddp_flags(skb) |= (bsp->flags & DDP_BF_NOCOPY); |
| bsp->flags &= ~DDP_BF_NOCOPY; |
| } |
| |
| if (ddp_report & F_DDP_PSH) |
| skb_ulp_ddp_flags(skb) |= DDP_BF_PSH; |
| |
| if (nomoredata) |
| skb_ulp_ddp_flags(skb) |= DDP_BF_NODATA; |
| |
| skb_reset_transport_header(skb); |
| tcp_hdr(skb)->fin = 0; /* changes original hdr->ddp_report */ |
| |
| inet_csk(sk)->icsk_ack.lrcvtime = tcp_time_stamp; |
| __skb_queue_tail(&sk->sk_receive_queue, skb); |
| if (!sock_flag(sk, SOCK_DEAD)) |
| sk->sk_data_ready(sk, 0); |
| } |
| |
| #define DDP_ERR (F_DDP_PPOD_MISMATCH | F_DDP_LLIMIT_ERR | F_DDP_ULIMIT_ERR |\ |
| F_DDP_PPOD_PARITY_ERR | F_DDP_PADDING_ERR | F_DDP_OFFSET_ERR |\ |
| F_DDP_INVALID_TAG | F_DDP_COLOR_ERR | F_DDP_TID_MISMATCH |\ |
| F_DDP_INVALID_PPOD) |
| |
| /* |
| * Handler for RX_DATA_DDP CPL messages. |
| */ |
| static int do_rx_data_ddp(struct t3cdev *cdev, struct sk_buff *skb, void *ctx) |
| { |
| struct sock *sk = ctx; |
| const struct cpl_rx_data_ddp *hdr = cplhdr(skb); |
| |
| VALIDATE_SOCK(sk); |
| |
| if (unlikely(ntohl(hdr->ddpvld_status) & DDP_ERR)) { |
| printk(KERN_ERR "RX_DATA_DDP for TID %u reported error 0x%x\n", |
| GET_TID(hdr), G_DDP_VALID(ntohl(hdr->ddpvld_status))); |
| return CPL_RET_BUF_DONE; |
| } |
| |
| process_cpl_msg(new_rx_data_ddp, sk, skb); |
| return 0; |
| } |
| |
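| /*
|  * Process an RX_DDP_COMPLETE message, i.e., the completion of a DDP buffer.
|  * Account for the remaining bytes placed in the buffer, flip to the other
|  * buffer unless flipping is disallowed, and queue an skb describing the
|  * completion on the receive queue.
|  */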
| static void process_ddp_complete(struct sock *sk, struct sk_buff *skb) |
| { |
| struct cpl_io_state *cplios = CPL_IO_STATE(sk); |
| struct tcp_sock *tp = tcp_sk(sk); |
| struct ddp_state *q; |
| struct ddp_buf_state *bsp; |
| struct cpl_rx_ddp_complete *hdr; |
| unsigned int ddp_report, buf_idx; |
| 	unsigned int nomoredata = 0;
| unsigned int delack_mode; |
| |
| if (unlikely(sk_no_receive(sk))) { |
| handle_excess_rx(sk, skb); |
| return; |
| } |
| |
| q = DDP_STATE(sk); |
| hdr = cplhdr(skb); |
| ddp_report = ntohl(hdr->ddp_report); |
| buf_idx = (ddp_report >> S_DDP_BUF_IDX) & 1; |
| bsp = &q->buf_state[buf_idx]; |
| |
| skb_ulp_ddp_offset(skb) = bsp->cur_offset; |
| skb->len = G_DDP_OFFSET(ddp_report) - skb_ulp_ddp_offset(skb); |
| |
| #ifdef T3_TRACE |
| T3_TRACE5(TIDTB(sk), |
| "process_ddp_complete: tp->rcv_nxt 0x%x cur_offset %u " |
| "ddp_report 0x%x offset %u, len %u", |
| tp->rcv_nxt, bsp->cur_offset, ddp_report, |
| G_DDP_OFFSET(ddp_report), skb->len); |
| #endif |
| |
| bsp->cur_offset += skb->len; |
| |
| if (!(bsp->flags & DDP_BF_NOFLIP)) { |
| q->cur_buf ^= 1; /* flip buffers */ |
| if (G_DDP_OFFSET(ddp_report) < q->kbuf[0]->length) |
| 			nomoredata = 1;
| } |
|
| #ifdef T3_TRACE |
| T3_TRACE4(TIDTB(sk), |
| "process_ddp_complete: tp->rcv_nxt 0x%x cur_offset %u " |
| "ddp_report %u offset %u", |
| tp->rcv_nxt, bsp->cur_offset, ddp_report, |
| G_DDP_OFFSET(ddp_report)); |
| #endif |
| skb_gl_set(skb, bsp->gl); |
| skb_ulp_ddp_flags(skb) = (bsp->flags & DDP_BF_NOCOPY) | 1; |
| |
| if (bsp->flags & DDP_BF_NOCOPY) |
| bsp->flags &= ~DDP_BF_NOCOPY; |
| if (nomoredata) |
| skb_ulp_ddp_flags(skb) |= DDP_BF_NODATA; |
| |
| delack_mode = G_DDP_DACK_MODE(ddp_report); |
| if (unlikely(G_DDP_DACK_MODE(ddp_report) != cplios->delack_mode)) { |
| cplios->delack_mode = delack_mode; |
| cplios->delack_seq = tp->rcv_nxt; |
| } |
| |
| ULP_SKB_CB(skb)->seq = tp->rcv_nxt; |
| tp->rcv_nxt += skb->len; |
| |
| skb_reset_transport_header(skb); |
| tcp_hdr(skb)->fin = 0; /* changes valid memory past CPL */ |
| |
| inet_csk(sk)->icsk_ack.lrcvtime = tcp_time_stamp; |
| __skb_queue_tail(&sk->sk_receive_queue, skb); |
| if (!sock_flag(sk, SOCK_DEAD)) |
| sk->sk_data_ready(sk, 0); |
| } |
| |
| /* |
| * Handler for RX_DDP_COMPLETE CPL messages. |
| */ |
| static int do_rx_ddp_complete(struct t3cdev *cdev, struct sk_buff *skb, |
| void *ctx) |
| { |
| struct sock *sk = ctx; |
| |
| VALIDATE_SOCK(sk); |
| |
| process_cpl_msg(process_ddp_complete, sk, skb); |
| return 0; |
| } |
| |
| /* |
| * Move a socket to TIME_WAIT state. We need to make some adjustments to the |
| * socket state before calling tcp_time_wait to comply with its expectations. |
| */ |
| static void enter_timewait(struct sock *sk) |
| { |
| struct tcp_sock *tp = tcp_sk(sk); |
| |
| /* |
| * Bump rcv_nxt for the peer FIN. We don't do this at the time we |
| * process peer_close because we don't want to carry the peer FIN in |
| * the socket's receive queue and if we increment rcv_nxt without |
| * having the FIN in the receive queue we'll confuse facilities such |
| * as SIOCINQ. |
| */ |
| tp->rcv_nxt++; |
| |
| TS_RECENT_STAMP(tp) = 0; /* defeat recycling */ |
| tp->srtt = 0; /* defeat tcp_update_metrics */ |
| tcp_time_wait(sk, TCP_TIME_WAIT, 0); /* calls tcp_done */ |
| } |
| |
| /* |
| * For TCP DDP a PEER_CLOSE may also be an implicit RX_DDP_COMPLETE. This |
| * function deals with the data that may be reported along with the FIN. |
| * Returns -1 if no further processing of the PEER_CLOSE is needed, >= 0 to |
| * perform normal FIN-related processing. In the latter case 1 indicates that |
|  * there was an implicit RX_DDP_COMPLETE and the skb should not be freed, and
|  * 0 that the skb can be freed.
| */ |
| static int handle_peer_close_data(struct sock *sk, struct sk_buff *skb) |
| { |
| struct tcp_sock *tp = tcp_sk(sk); |
| struct ddp_state *q; |
| struct ddp_buf_state *bsp; |
| struct cpl_peer_close *req = cplhdr(skb); |
| unsigned int rcv_nxt = ntohl(req->rcv_nxt) - 1; /* exclude FIN */ |
| |
| if (tp->rcv_nxt == rcv_nxt) /* no data */ |
| return 0; |
| |
| if (unlikely(sk_no_receive(sk))) { |
| handle_excess_rx(sk, skb); |
| |
| /* |
| * Although we discard the data we want to process the FIN so |
| * that PEER_CLOSE + data behaves the same as RX_DATA_DDP + |
| * PEER_CLOSE without data. In particular this PEER_CLOSE |
| * may be what will close the connection. We return 1 because |
| * handle_excess_rx() already freed the packet. |
| */ |
| return 1; |
| } |
| |
| q = DDP_STATE(sk); |
| bsp = &q->buf_state[q->cur_buf]; |
| skb->len = rcv_nxt - tp->rcv_nxt; |
| skb_gl_set(skb, bsp->gl); |
| skb_ulp_ddp_offset(skb) = bsp->cur_offset; |
| skb_ulp_ddp_flags(skb) = |
| DDP_BF_PSH | (bsp->flags & DDP_BF_NOCOPY) | 1; |
| ULP_SKB_CB(skb)->seq = tp->rcv_nxt; |
| tp->rcv_nxt = rcv_nxt; |
| bsp->cur_offset += skb->len; |
| if (!(bsp->flags & DDP_BF_NOFLIP)) |
| q->cur_buf ^= 1; |
| |
| skb_reset_transport_header(skb); |
| tcp_hdr(skb)->fin = 0; /* changes valid memory past CPL */ |
| |
| inet_csk(sk)->icsk_ack.lrcvtime = tcp_time_stamp; |
| __skb_queue_tail(&sk->sk_receive_queue, skb); |
| if (!sock_flag(sk, SOCK_DEAD)) |
| sk->sk_data_ready(sk, 0); |
| return 1; |
| } |
| |
| /* |
| * Handle a peer FIN. |
| */ |
| static void do_peer_fin(struct sock *sk, struct sk_buff *skb) |
| { |
| struct cpl_io_state *cplios = CPL_IO_STATE(sk); |
| int keep = 0, dead = sock_flag(sk, SOCK_DEAD); |
| |
| #ifdef T3_TRACE |
| T3_TRACE0(TIDTB(sk),"do_peer_fin:"); |
| #endif |
| |
| if (!is_t3a(cplios->toedev) && |
| cplios_flag(sk, CPLIOS_ABORT_RPL_PENDING)) |
| goto out; |
| |
| if (cplios->ulp_mode == ULP_MODE_TCPDDP) { |
| keep = handle_peer_close_data(sk, skb); |
| if (keep < 0) |
| return; |
| } |
| |
| sk->sk_shutdown |= RCV_SHUTDOWN; |
| sock_set_flag(sk, SOCK_DONE); |
| switch (sk->sk_state) { |
| case TCP_SYN_RECV: |
| case TCP_ESTABLISHED: |
| tcp_set_state(sk, TCP_CLOSE_WAIT); |
| break; |
| case TCP_FIN_WAIT1: |
| tcp_set_state(sk, TCP_CLOSING); |
| break; |
| case TCP_FIN_WAIT2: |
| /* |
| * If we've sent an abort_req we must have sent it too late, |
| * HW will send us a reply telling us so, and this peer_close |
| * is really the last message for this connection and needs to |
| * be treated as an abort_rpl, i.e., transition the connection |
| * to TCP_CLOSE (note that the host stack does this at the |
| * time of generating the RST but we must wait for HW). |
| * Otherwise we enter TIME_WAIT. |
| */ |
| t3_release_offload_resources(sk); |
| if (cplios_flag(sk, CPLIOS_ABORT_RPL_PENDING)) |
| connection_done(sk); |
| else |
| enter_timewait(sk); |
| break; |
| default: |
| printk(KERN_ERR |
| "%s: TID %u received PEER_CLOSE in bad state %d\n", |
| cplios->toedev->name, cplios->tid, sk->sk_state); |
| } |
| |
| if (!dead) { |
| sk->sk_state_change(sk); |
| |
| /* Do not send POLL_HUP for half duplex close. */ |
| if ((sk->sk_shutdown & SEND_SHUTDOWN) || |
| sk->sk_state == TCP_CLOSE) |
| sk_wake_async(sk, 1, POLL_HUP); |
| else |
| sk_wake_async(sk, 1, POLL_IN); |
| } |
| out: if (!keep) |
| __kfree_skb(skb); |
| } |
| |
| /* |
| * Handler for PEER_CLOSE CPL messages. |
| */ |
| static int do_peer_close(struct t3cdev *cdev, struct sk_buff *skb, void *ctx) |
| { |
| struct sock *sk = (struct sock *)ctx; |
| |
| VALIDATE_SOCK(sk); |
| |
| process_cpl_msg_ref(do_peer_fin, sk, skb); |
| return 0; |
| } |
| |
| /* |
| * Process a peer ACK to our FIN. |
| */ |
| static void process_close_con_rpl(struct sock *sk, struct sk_buff *skb) |
| { |
| struct cpl_io_state *cplios = CPL_IO_STATE(sk); |
| struct tcp_sock *tp = tcp_sk(sk); |
| struct cpl_close_con_rpl *rpl = cplhdr(skb); |
| |
| tp->snd_una = ntohl(rpl->snd_nxt) - 1; /* exclude FIN */ |
| |
| if (!is_t3a(cplios->toedev) && cplios_flag(sk, CPLIOS_ABORT_RPL_PENDING)) |
| goto out; |
| |
| switch (sk->sk_state) { |
| case TCP_CLOSING: /* see FIN_WAIT2 case in do_peer_fin */ |
| t3_release_offload_resources(sk); |
| if (cplios_flag(sk, CPLIOS_ABORT_RPL_PENDING)) |
| connection_done(sk); |
| else |
| enter_timewait(sk); |
| break; |
| case TCP_LAST_ACK: |
| /* |
| * In this state we don't care about pending abort_rpl. |
| * If we've sent abort_req it was post-close and was sent too |
| * late, this close_con_rpl is the actual last message. |
| */ |
| t3_release_offload_resources(sk); |
| connection_done(sk); |
| break; |
| case TCP_FIN_WAIT1: |
| tcp_set_state(sk, TCP_FIN_WAIT2); |
| sk->sk_shutdown |= SEND_SHUTDOWN; |
| dst_confirm(sk->sk_dst_cache); |
| |
| if (!sock_flag(sk, SOCK_DEAD)) |
| sk->sk_state_change(sk); // Wake up lingering close() |
| else if (tcp_sk(sk)->linger2 < 0 && |
| !cplios_flag(sk, CPLIOS_ABORT_SHUTDOWN)) |
| abort_conn(sk, skb, LINUX_MIB_TCPABORTONLINGER); |
| break; |
| default: |
| printk(KERN_ERR |
| "%s: TID %u received CLOSE_CON_RPL in bad state %d\n", |
| cplios->toedev->name, cplios->tid, sk->sk_state); |
| } |
| out: kfree_skb(skb); /* can't use __kfree_skb here */ |
| } |
| |
| /* |
| * Handler for CLOSE_CON_RPL CPL messages. |
| */ |
| static int do_close_con_rpl(struct t3cdev *cdev, struct sk_buff *skb, |
| void *ctx) |
| { |
| struct sock *sk = (struct sock *)ctx; |
| |
| VALIDATE_SOCK(sk); |
| |
| process_cpl_msg_ref(process_close_con_rpl, sk, skb); |
| return 0; |
| } |
| |
| /* |
| * Process abort replies. We only process these messages if we anticipate |
| * them as the coordination between SW and HW in this area is somewhat lacking |
| * and sometimes we get ABORT_RPLs after we are done with the connection that |
| * originated the ABORT_REQ. |
| */ |
| static void process_abort_rpl(struct sock *sk, struct sk_buff *skb) |
| { |
| struct cpl_io_state *cplios = CPL_IO_STATE(sk); |
| #ifdef T3_TRACE |
| T3_TRACE1(TIDTB(sk), |
| "process_abort_rpl: GTS rpl pending %d", |
| cplios_flag(sk, CPLIOS_ABORT_RPL_PENDING)); |
| #endif |
| |
| if (cplios_flag(sk, CPLIOS_ABORT_RPL_PENDING)) { |
| if (!cplios_flag(sk, CPLIOS_ABORT_RPL_RCVD) && |
| !is_t3a(cplios->toedev)) |
| cplios_set_flag(sk, CPLIOS_ABORT_RPL_RCVD); |
| else { |
| cplios_reset_flag(sk, CPLIOS_ABORT_RPL_RCVD); |
| cplios_reset_flag(sk, CPLIOS_ABORT_RPL_PENDING); |
| if (!cplios_flag(sk, CPLIOS_ABORT_REQ_RCVD) || |
| !is_t3a(cplios->toedev)) { |
| BUG_ON(cplios_flag(sk, CPLIOS_ABORT_REQ_RCVD)); |
| t3_release_offload_resources(sk); |
| connection_done(sk); |
| } |
| } |
| } |
| __kfree_skb(skb); |
| } |
| |
| /* |
| * Handle an ABORT_RPL_RSS CPL message. |
| */ |
| static int do_abort_rpl(struct t3cdev *cdev, struct sk_buff *skb, void *ctx) |
| { |
| struct sock *sk; |
| struct cpl_abort_rpl_rss *rpl = cplhdr(skb); |
| |
| /* |
| * Ignore replies to post-close aborts indicating that the abort was |
| * requested too late. These connections are terminated when we get |
| * PEER_CLOSE or CLOSE_CON_RPL and by the time the abort_rpl_rss |
| * arrives the TID is either no longer used or it has been recycled. |
| */ |
| if (rpl->status == CPL_ERR_ABORT_FAILED) { |
| discard: |
| __kfree_skb(skb); |
| return 0; |
| } |
| |
| sk = (struct sock *)ctx; |
| |
| /* |
| * Sometimes we've already closed the socket, e.g., a post-close |
| * abort races with ABORT_REQ_RSS, the latter frees the socket |
| * expecting the ABORT_REQ will fail with CPL_ERR_ABORT_FAILED, |
| * but FW turns the ABORT_REQ into a regular one and so we get |
| * ABORT_RPL_RSS with status 0 and no socket. Only on T3A. |
| */ |
| if (!sk) |
| goto discard; |
| |
| process_cpl_msg_ref(process_abort_rpl, sk, skb); |
| return 0; |
| } |
| |
| /* |
| * Convert the status code of an ABORT_REQ into a Linux error code. Also |
| * indicate whether RST should be sent in response. |
| */ |
| static int abort_status_to_errno(struct sock *sk, int abort_reason, |
| int *need_rst) |
| { |
| switch (abort_reason) { |
| case CPL_ERR_BAD_SYN: |
| 		T3_NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONSYN);
| 		// fall through
| case CPL_ERR_CONN_RESET: |
| // XXX need to handle SYN_RECV due to crossed SYNs |
| return sk->sk_state == TCP_CLOSE_WAIT ? EPIPE : ECONNRESET; |
| case CPL_ERR_XMIT_TIMEDOUT: |
| case CPL_ERR_PERSIST_TIMEDOUT: |
| case CPL_ERR_FINWAIT2_TIMEDOUT: |
| case CPL_ERR_KEEPALIVE_TIMEDOUT: |
| T3_NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONTIMEOUT); |
| return ETIMEDOUT; |
| default: |
| return EIO; |
| } |
| } |
| |
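| /*
|  * Fill in an ABORT_RPL work request for the given TID.  'cmd' determines
|  * whether HW should send an RST to the peer.
|  */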
| static inline void set_abort_rpl_wr(struct sk_buff *skb, unsigned int tid, |
| int cmd) |
| { |
| struct cpl_abort_rpl *rpl = cplhdr(skb); |
| |
| rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_RPL)); |
| rpl->wr.wr_lo = htonl(V_WR_TID(tid)); |
| OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_ABORT_RPL, tid)); |
| rpl->cmd = cmd; |
| } |
| |
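| /*
|  * Deferred transmission of an ABORT_RPL.  Used when the original handler
|  * could not allocate a reply skb; the RST disposition was stashed in
|  * req->status by send_abort_rpl() below.
|  */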
| static void send_deferred_abort_rpl(struct toedev *tdev, struct sk_buff *skb) |
| { |
| struct sk_buff *reply_skb; |
| struct cpl_abort_req_rss *req = cplhdr(skb); |
| |
| reply_skb = alloc_skb_nofail(sizeof(struct cpl_abort_rpl)); |
| reply_skb->priority = CPL_PRIORITY_DATA; |
| __skb_put(reply_skb, sizeof(struct cpl_abort_rpl)); |
| set_abort_rpl_wr(reply_skb, GET_TID(req), req->status); |
| cxgb3_ofld_send(TOM_DATA(tdev)->cdev, reply_skb); |
| kfree_skb(skb); |
| } |
| |
| /* |
| * Returns whether an ABORT_REQ_RSS message is a negative advice. |
| */ |
| static inline int is_neg_adv_abort(unsigned int status) |
| { |
| return status == CPL_ERR_RTX_NEG_ADVICE || |
| status == CPL_ERR_PERSIST_NEG_ADVICE; |
| } |
| |
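| /*
|  * Reply to an ABORT_REQ_RSS message with the given RST disposition.  If no
|  * reply skb can be obtained the reply is deferred via t3_defer_reply().
|  */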
| static void send_abort_rpl(struct sk_buff *skb, struct toedev *tdev, |
| int rst_status) |
| { |
| struct sk_buff *reply_skb; |
| struct cpl_abort_req_rss *req = cplhdr(skb); |
| |
| reply_skb = get_cpl_reply_skb(skb, sizeof(struct cpl_abort_rpl), |
| gfp_any()); |
| if (!reply_skb) { |
| 		/* Defer the reply.  Stick rst_status into req->status. */
| req->status = rst_status; |
| t3_defer_reply(skb, tdev, send_deferred_abort_rpl); |
| return; |
| } |
| |
| reply_skb->priority = CPL_PRIORITY_DATA; |
| set_abort_rpl_wr(reply_skb, GET_TID(req), rst_status); |
| kfree_skb(skb); /* can't use __kfree_skb here */ |
| /* |
| * XXX need to sync with ARP as for SYN_RECV connections we can send |
| * these messages while ARP is pending. For other connection states |
| * it's not a problem. |
| */ |
| cxgb3_ofld_send(TOM_DATA(tdev)->cdev, reply_skb); |
| } |
| |
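| /*
|  * Remove a SYN_RECV child connection from its listening parent: take it off
|  * the parent's request and SYN queues and free its open request.
|  */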
| static void cleanup_syn_rcv_conn(struct sock *child, struct sock *parent) |
| { |
| struct request_sock *req = child->sk_user_data; |
| |
| inet_csk_reqsk_queue_removed(parent, req); |
| synq_remove(child); |
| __reqsk_free(req); |
| child->sk_user_data = NULL; |
| } |
| |
| /* |
| * Performs the actual work to abort a SYN_RECV connection. |
| */ |
| static void do_abort_syn_rcv(struct sock *child, struct sock *parent) |
| { |
| /* |
| * If the server is still open we clean up the child connection, |
| * otherwise the server already did the clean up as it was purging |
| * its SYN queue and the skb was just sitting in its backlog. |
| */ |
| if (likely(parent->sk_state == TCP_LISTEN)) { |
| cleanup_syn_rcv_conn(child, parent); |
| t3_release_offload_resources(child); |
| connection_done(child); |
| } |
| } |
| |
| /* |
| * This is run from a listener's backlog to abort a child connection in |
| * SYN_RCV state (i.e., one on the listener's SYN queue). |
| */ |
| static void bl_abort_syn_rcv(struct sock *lsk, struct sk_buff *skb) |
| { |
| struct sock *child = skb->sk; |
| |
| skb->sk = NULL; |
| do_abort_syn_rcv(child, lsk); |
| send_abort_rpl(skb, BLOG_SKB_CB(skb)->dev, CPL_ABORT_NO_RST); |
| } |
| |
| /* |
| * Handle abort requests for a SYN_RECV connection. These need extra work |
| * because the socket is on its parent's SYN queue. |
| */ |
| static int abort_syn_rcv(struct sock *sk, struct sk_buff *skb) |
| { |
| struct sock *parent; |
| struct toedev *tdev = CPL_IO_STATE(sk)->toedev; |
| struct t3cdev *cdev = TOM_DATA(tdev)->cdev; |
| const struct request_sock *oreq = sk->sk_user_data; |
| struct t3c_tid_entry *t3c_stid; |
| struct tid_info *t; |
| |
| if (!oreq) |
| return -1; /* somehow we are not on the SYN queue */ |
| |
| t = &(T3C_DATA(cdev))->tid_maps; |
| t3c_stid = lookup_stid(t, oreq->ts_recent); |
| parent = ((struct listen_ctx *)t3c_stid->ctx)->lsk; |
| |
| bh_lock_sock(parent); |
| if (!sock_owned_by_user(parent)) { |
| do_abort_syn_rcv(sk, parent); |
| send_abort_rpl(skb, tdev, CPL_ABORT_NO_RST); |
| } else { |
| skb->sk = sk; |
| BLOG_SKB_CB(skb)->backlog_rcv = bl_abort_syn_rcv; |
| __sk_add_backlog(parent, skb); |
| } |
| bh_unlock_sock(parent); |
| return 0; |
| } |
| |
| /* |
| * Process abort requests. If we are waiting for an ABORT_RPL we ignore this |
| * request except that we need to reply to it. |
| */ |
| static void process_abort_req(struct sock *sk, struct sk_buff *skb) |
| { |
| int rst_status = CPL_ABORT_NO_RST; |
| const struct cpl_abort_req_rss *req = cplhdr(skb); |
| |
| if (!cplios_flag(sk, CPLIOS_ABORT_REQ_RCVD)) { |
| cplios_set_flag(sk, CPLIOS_ABORT_REQ_RCVD); |
| cplios_set_flag(sk, CPLIOS_ABORT_SHUTDOWN); |
| __kfree_skb(skb); |
| return; |
| } |
| cplios_reset_flag(sk, CPLIOS_ABORT_REQ_RCVD); |
| |
| /* |
| * Three cases to consider: |
| * a) We haven't sent an abort_req; close the connection. |
| * b) We have sent a post-close abort_req that will get to TP too late |
| * and will generate a CPL_ERR_ABORT_FAILED reply. The reply will |
| * be ignored and the connection should be closed now. |
| * c) We have sent a regular abort_req that will get to TP too late. |
| * That will generate an abort_rpl with status 0, wait for it. |
| */ |
| if (!cplios_flag(sk, CPLIOS_ABORT_RPL_PENDING) || |
| (is_t3a(CPL_IO_STATE(sk)->toedev) && |
| cplios_flag(sk, CPLIOS_CLOSE_CON_REQUESTED))) { |
| sk->sk_err = abort_status_to_errno(sk, req->status, |
| &rst_status); |
| if (!sock_flag(sk, SOCK_DEAD)) |
| sk->sk_error_report(sk); |
| /* |
| * SYN_RECV needs special processing. If abort_syn_rcv() |
| 		 * returns 0 it has taken care of the abort.
| */ |
| if (sk->sk_state == TCP_SYN_RECV && !abort_syn_rcv(sk, skb)) |
| return; |
| |
| t3_release_offload_resources(sk); |
| connection_done(sk); |
| } |
| |
| send_abort_rpl(skb, BLOG_SKB_CB(skb)->dev, rst_status); |
| } |
| |
| /* |
| * Handle an ABORT_REQ_RSS CPL message. |
| */ |
| static int do_abort_req(struct t3cdev *cdev, struct sk_buff *skb, void *ctx) |
| { |
| const struct cpl_abort_req_rss *req = cplhdr(skb); |
| struct sock *sk = (struct sock *)ctx; |
| |
| if (is_neg_adv_abort(req->status)) { |
| __kfree_skb(skb); |
| return 0; |
| } |
| |
| VALIDATE_SOCK(sk); |
| |
| /* |
| * Save the offload device in the skb, we may process this message |
| * after the socket has closed. |
| */ |
| BLOG_SKB_CB(skb)->dev = CPL_IO_STATE(sk)->toedev; |
| |
| process_cpl_msg_ref(process_abort_req, sk, skb); |
| return 0; |
| } |
| |
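| /*
|  * Tear down a SYN_RECV connection whose PASS_ACCEPT_RPL could not be sent
|  * because of an ARP failure.  On T3 the skb is reused to tell HW to reject
|  * the passive open so the TID is released.
|  */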
| static void pass_open_abort(struct sock *child, struct sock *parent, |
| struct sk_buff *skb) |
| { |
| struct toedev *tdev = BLOG_SKB_CB(skb)->dev; |
| |
| do_abort_syn_rcv(child, parent); |
| if (tdev->ttid == TOE_ID_CHELSIO_T3) { |
| struct cpl_pass_accept_rpl *rpl = cplhdr(skb); |
| |
| rpl->opt0h = htonl(F_TCAM_BYPASS); |
| rpl->opt0l_status = htonl(CPL_PASS_OPEN_REJECT); |
| cxgb3_ofld_send(TOM_DATA(tdev)->cdev, skb); |
| } else |
| kfree_skb(skb); |
| } |
| |
| /* |
| * Runs from a listener's backlog to abort a child connection that had an |
| * ARP failure. |
| */ |
| static void bl_pass_open_abort(struct sock *lsk, struct sk_buff *skb) |
| { |
| pass_open_abort(skb->sk, lsk, skb); |
| } |
| |
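| /*
|  * Handle an ARP failure for a PASS_ACCEPT_RPL with the child socket lock
|  * held.  Unless an abort is already pending, locate the parent listening
|  * socket through the STID stashed in the open request and either abort the
|  * embryonic connection directly or defer the work to the parent's backlog.
|  */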
| static void handle_pass_open_arp_failure(struct sock *sk, struct sk_buff *skb) |
| { |
| struct t3cdev *cdev; |
| struct sock *parent; |
| const struct request_sock *oreq; |
| struct t3c_tid_entry *t3c_stid; |
| struct tid_info *t; |
| /* |
| * If the connection is being aborted due to the parent listening |
| * socket going away there's nothing to do, the ABORT_REQ will close |
| * the connection. |
| */ |
| if (cplios_flag(sk, CPLIOS_ABORT_RPL_PENDING)) { |
| kfree_skb(skb); |
| return; |
| } |
| |
| oreq = sk->sk_user_data; |
| cdev = T3C_DEV(sk); |
| t = &(T3C_DATA(cdev))->tid_maps; |
| t3c_stid = lookup_stid(t, oreq->ts_recent); |
| parent = ((struct listen_ctx *)t3c_stid->ctx)->lsk; |
| |
| bh_lock_sock(parent); |
| if (!sock_owned_by_user(parent)) |
| pass_open_abort(sk, parent, skb); |
| else { |
| BLOG_SKB_CB(skb)->backlog_rcv = bl_pass_open_abort; |
| __sk_add_backlog(parent, skb); |
| } |
| bh_unlock_sock(parent); |
| } |
| |
| /* |
| * Handle an ARP failure for a CPL_PASS_ACCEPT_RPL. This is treated similarly |
| * to an ABORT_REQ_RSS in SYN_RECV as both events need to tear down a SYN_RECV |
| * connection. |
| */ |
| static void pass_accept_rpl_arp_failure(struct t3cdev *cdev, struct sk_buff *skb) |
| { |
| T3_TCP_INC_STATS_BH(sock_net(skb->sk), TCP_MIB_ATTEMPTFAILS); |
| BLOG_SKB_CB(skb)->dev = CPL_IO_STATE(skb->sk)->toedev; |
| process_cpl_msg_ref(handle_pass_open_arp_failure, skb->sk, skb); |
| } |
| |
| #if defined(ROUTE_REQ) |
| static struct dst_entry *route_req(struct sock *sk, struct open_request *req) |
| { |
| struct rtable *rt; |
| struct flowi fl = { .oif = sk->sk_bound_dev_if, |
| .nl_u = { .ip4_u = |
| { .daddr = req->af.v4_req.rmt_addr, |
| .saddr = req->af.v4_req.loc_addr, |
| .tos = RT_CONN_FLAGS(sk)}}, |
| .proto = IPPROTO_TCP, |
| .uli_u = { .ports = |
| #ifdef LINUX_2_4 |
| { .sport = sk->sport, |
| #else |
| { .sport = inet_sk(sk)->inet_sport, |
| #endif /* LINUX_2_4 */ |
| .dport = req->rmt_port}} |
| }; |
| |
| if (ip_route_output_flow(&rt, &fl, sk, 0)) { |
| IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES); |
| return NULL; |
| } |
| return &rt->u.dst; |
| } |
| #endif |
| |
| /* |
| * Create a new socket as a child of the listening socket 'lsk' and initialize |
| * with the information in the supplied PASS_ACCEPT_REQ message. |
| * |
| * 'retry' indicates to the caller whether a failure is device-related and the |
| * connection should be passed to the host stack, or connection-related and |
| * the connection request should be rejected. |
| */ |
| static struct sock *mk_pass_sock(struct sock *lsk, struct toedev *dev, int tid, |
| const struct cpl_pass_accept_req *req, |
| int *retry, |
| const struct offload_settings *s) |
| { |
| struct sock *newsk; |
| struct cpl_io_state *newcplios; |
| struct l2t_entry *e; |
| struct dst_entry *dst; |
| struct tcp_sock *newtp; |
| struct net_device *egress; |
| struct request_sock *oreq = reqsk_alloc(&t3_rsk_ops); |
| |
| *retry = 0; |
| if (!oreq) |
| goto out_err; |
| |
| tcp_rsk(oreq)->rcv_isn = ntohl(req->rcv_isn); |
| inet_rsk(oreq)->rmt_port = req->peer_port; |
| t3_set_req_addr(oreq, req->local_ip, req->peer_ip); |
| t3_set_req_opt(oreq, NULL); |
| if (sysctl_tcp_window_scaling) { |
| inet_rsk(oreq)->wscale_ok = 1; |
| inet_rsk(oreq)->snd_wscale = req->tcp_options.wsf; |
| } |
| |
| #ifdef CONFIG_SECURITY_NETWORK |
| if (security_inet_conn_request(lsk, tcphdr_skb, oreq)) |
| goto free_or; |
| #endif |
| |
| dst = route_req(lsk, oreq); |
| if (!dst) |
| goto free_or; |
| |
| egress = offload_get_phys_egress(dst->neighbour->dev, NULL, TOE_OPEN); |
| if (!egress || TOEDEV(egress) != dev) { |
| *retry = 1; /* asymmetric route */ |
| goto free_dst; |
| } |
| |
| e = t3_l2t_get(TOM_DATA(dev)->cdev, dst->neighbour, egress); |
| if (!e) { |
| *retry = 1; /* out of HW resources */ |
| goto free_dst; |
| } |
| |
| newcplios = kzalloc(sizeof *newcplios, GFP_ATOMIC); |
| if (!newcplios) |
| goto free_l2t; |
| newsk = tcp_create_openreq_child(lsk, oreq, tcphdr_skb); |
| if (!newsk) { |
| kfree(newcplios); |
| goto free_l2t; |
| } |
| CPL_IO_STATE(newsk) = newcplios; |
| |
| if (sock_flag(newsk, SOCK_KEEPOPEN)) |
| inet_csk_delete_keepalive_timer(newsk); |
| oreq->ts_recent = G_PASS_OPEN_TID(ntohl(req->tos_tid)); |
| newsk->sk_user_data = oreq; |
| sk_setup_caps(newsk, dst); |
| |
| newtp = tcp_sk(newsk); |
| init_offload_sk(newsk, dev, tid, e, dst, egress, s); |
| newcplios->delack_seq = newtp->rcv_nxt; |
| RCV_WSCALE(newtp) = select_rcv_wscale(tcp_full_space(newsk), |
| WSCALE_OK(newtp), |
| newtp->window_clamp); |
| |
| #ifdef LINUX_2_4 |
| newsk->daddr = req->peer_ip; |
| newsk->rcv_saddr = req->local_ip; |
| newsk->saddr = req->local_ip; |
| #else |
| inet_sk(newsk)->inet_daddr = req->peer_ip; |
| inet_sk(newsk)->inet_rcv_saddr = req->local_ip; |
| inet_sk(newsk)->inet_saddr = req->local_ip; |
| #endif /* LINUX_2_4 */ |
| |
| lsk->sk_prot->hash(newsk); |
| t3_inet_inherit_port(&tcp_hashinfo, lsk, newsk); |
| install_offload_ops(newsk); |
| bh_unlock_sock(newsk); // counters tcp_create_openreq_child() |
| return newsk; |
| |
| free_l2t: |
| l2t_release(L2DATA(dev), e); |
| free_dst: |
| dst_release(dst); |
| free_or: |
| __reqsk_free(oreq); |
| out_err: |
| return NULL; |
| } |
| |
| /* |
| * Populate a reject/tunnel CPL_PASS_ACCEPT_RPL WR. |
| */ |
| static void mk_pass_accept_rpl(struct sk_buff *reply_skb, |
| struct sk_buff *req_skb, int cmd) |
| { |
| struct cpl_pass_accept_req *req = cplhdr(req_skb); |
| struct cpl_pass_accept_rpl *rpl = cplhdr(reply_skb); |
| unsigned int tid = GET_TID(req); |
| |
| reply_skb->priority = CPL_PRIORITY_SETUP; |
| rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); |
| OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_PASS_ACCEPT_RPL, tid)); |
| rpl->peer_ip = req->peer_ip; // req->peer_ip not overwritten yet |
| rpl->opt0h = htonl(F_TCAM_BYPASS); |
| rpl->opt0l_status = htonl(cmd); |
| rpl->opt2 = 0; |
| rpl->rsvd = rpl->opt2; /* workaround for HW bug */ |
| } |
| |
| /* |
| * Send a deferred reject to an accept request. |
| */ |
| static void reject_pass_request(struct toedev *tdev, struct sk_buff *skb) |
| { |
| struct sk_buff *reply_skb; |
| |
| reply_skb = alloc_skb_nofail(sizeof(struct cpl_pass_accept_rpl)); |
| __skb_put(reply_skb, sizeof(struct cpl_pass_accept_rpl)); |
| mk_pass_accept_rpl(reply_skb, skb, CPL_PASS_OPEN_REJECT); |
| cxgb3_ofld_send(TOM_DATA(tdev)->cdev, reply_skb); |
| kfree_skb(skb); |
| } |
| |
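| /*
|  * Build an offload policy lookup key (struct offload_req) from the fields
|  * of a PASS_ACCEPT_REQ message and the listening socket.
|  */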
| static void offload_req_from_pass_accept_req(struct offload_req *oreq, |
| const struct cpl_pass_accept_req *req, |
| const struct sock *listen_sk) |
| { |
| oreq->sip[0] = req->peer_ip; |
| oreq->sip[1] = oreq->sip[2] = oreq->sip[3] = 0; |
| oreq->dip[0] = req->local_ip; |
| oreq->dip[1] = oreq->dip[2] = oreq->dip[3] = 0; |
| oreq->sport = req->peer_port; |
| oreq->dport = req->local_port; |
| oreq->ipvers_opentype = (OPEN_TYPE_PASSIVE << 4) | 4; |
| oreq->tos = G_PASS_OPEN_TOS(ntohl(req->tos_tid)); |
| oreq->vlan = req->vlan_tag ? req->vlan_tag & htons(VLAN_VID_MASK) : |
| htons(0xfff); |
| #ifdef SO_MARK |
| oreq->mark = listen_sk->sk_mark; |
| #else |
| oreq->mark = 0; |
| #endif |
| } |
| |
| /* |
| * Process a CPL_PASS_ACCEPT_REQ message. Does the part that needs the socket |
| * lock held. Note that the sock here is a listening socket that is not owned |
| * by the TOE. |
| */ |
| static void process_pass_accept_req(struct sock *sk, struct sk_buff *skb) |
| { |
| int rt_flags; |
| int pass2host; |
| struct sock *newsk; |
| struct l2t_entry *e; |
| struct iff_mac tim; |
| struct offload_req orq; |
| struct offload_settings settings; |
| struct sk_buff *reply_skb, *ddp_skb = NULL; |
| struct cpl_pass_accept_rpl *rpl; |
| struct cpl_pass_accept_req *req = cplhdr(skb); |
| unsigned int tid = GET_TID(req); |
| struct toedev *tdev = BLOG_SKB_CB(skb)->dev; |
| struct tom_data *d = TOM_DATA(tdev); |
| struct t3cdev *cdev = d->cdev; |
| |
| reply_skb = get_cpl_reply_skb(skb, sizeof(*rpl), GFP_ATOMIC); |
| if (unlikely(!reply_skb)) { |
| if (tdev->ttid == TOE_ID_CHELSIO_T3) |
| t3_defer_reply(skb, tdev, reject_pass_request); |
| else { |
| cxgb3_queue_tid_release(cdev, tid); |
| kfree_skb(skb); |
| } |
| goto out; |
| } |
| |
| if (sk->sk_state != TCP_LISTEN) |
| goto reject; |
| if (inet_csk_reqsk_queue_is_full(sk)) |
| goto reject; |
| if (sk_acceptq_is_full(sk) && d->conf.soft_backlog_limit) |
| goto reject; |
| |
| tim.mac_addr = req->dst_mac; |
| tim.vlan_tag = ntohs(req->vlan_tag); |
| if (cdev->ctl(cdev, GET_IFF_FROM_MAC, &tim) < 0 || !tim.dev) |
| goto reject; |
| |
| if (ip_route_input(skb, req->local_ip, req->peer_ip, |
| G_PASS_OPEN_TOS(ntohl(req->tos_tid)), tim.dev)) |
| goto reject; |
| rt_flags = ((struct rtable *)skb_dst(skb))->rt_flags & |
| (RTCF_BROADCAST | RTCF_MULTICAST | RTCF_LOCAL); |
| dst_release(skb_dst(skb)); // done with the input route, release it |
| skb_dst_set(skb, NULL); |
| if (rt_flags != RTCF_LOCAL) |
| goto reject; |
| |
| offload_req_from_pass_accept_req(&orq, req, sk); |
| settings = *lookup_ofld_policy(tdev, &orq, d->conf.cop_managed_offloading); |
| #ifndef LINUX_2_4 |
| rcu_read_unlock(); |
| #else |
| read_unlock(&tdev->policy_lock); |
| #endif |
| |
| newsk = mk_pass_sock(sk, tdev, tid, req, &pass2host, &settings); |
| if (!newsk) |
| goto reject; |
| |
| /* |
| * Our use of sk_user_data for sockets on the SYNQ can confuse the |
| * sanitization of socket callbacks in the RX_DATA handler. Since |
| * there aren't any kernel apps that need to sanitize the callbacks |
| * of passively opened sockets we solve the problem by skipping |
| * the sanitization on such sockets. |
| */ |
| cplios_set_flag(newsk, CPLIOS_CALLBACKS_CHKD); |
| |
| inet_csk_reqsk_queue_added(sk, TCP_TIMEOUT_INIT); |
| synq_add(sk, newsk); |
| |
| /* Don't get a reference, newsk starts out with ref count 2 */ |
| cxgb3_insert_tid(cdev, d->client, newsk, tid); |
| |
| if (CPL_IO_STATE(newsk)->ulp_mode == ULP_MODE_TCPDDP) { |
| ddp_skb = alloc_skb(sizeof(struct cpl_set_tcb_field), |
| GFP_ATOMIC); |
| if (!ddp_skb) |
| CPL_IO_STATE(newsk)->ulp_mode = ULP_MODE_NONE; |
| } |
| |
| reply_skb->sk = newsk; |
| set_arp_failure_handler(reply_skb, pass_accept_rpl_arp_failure); |
| |
| e = CPL_IO_STATE(newsk)->l2t_entry; |
| |
| rpl = cplhdr(reply_skb); |
| rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); |
| OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_PASS_ACCEPT_RPL, tid)); |
| rpl->peer_ip = req->peer_ip; // req->peer_ip is not overwritten |
| rpl->opt0h = htonl(calc_opt0h(newsk) | V_L2T_IDX(e->idx) | |
| V_TX_CHANNEL(e->chan_idx)); |
| rpl->opt0l_status = htonl(calc_opt0l(newsk) | |
| CPL_PASS_OPEN_ACCEPT); |
| rpl->opt2 = htonl(calc_opt2(newsk, &settings)); |
| |
| rpl->rsvd = rpl->opt2; /* workaround for HW bug */ |
| reply_skb->priority = mkprio(CPL_PRIORITY_SETUP, newsk); |
| l2t_send(cdev, reply_skb, e); |
| kfree_skb(skb); |
| |
| if (ddp_skb) { |
| set_arp_failure_handler(ddp_skb, arp_failure_discard); |
| __set_tcb_field(newsk, ddp_skb, W_TCB_RX_DDP_FLAGS, |
| V_TF_DDP_OFF(1) | |
| TP_DDP_TIMER_WORKAROUND_MASK, |
| V_TF_DDP_OFF(1) | |
| TP_DDP_TIMER_WORKAROUND_VAL, 1); |
| l2t_send(cdev, ddp_skb, e); |
| } |
| return; |
| |
| reject: |
| if (tdev->ttid == TOE_ID_CHELSIO_T3) |
| mk_pass_accept_rpl(reply_skb, skb, CPL_PASS_OPEN_REJECT); |
| else { |
| __skb_trim(reply_skb, 0); |
| mk_tid_release(reply_skb, NULL, tid); |
| } |
| cxgb3_ofld_send(cdev, reply_skb); |
| kfree_skb(skb); |
| out: |
| T3_TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_ATTEMPTFAILS); |
| } |
| |
| /* |
| * Handle a CPL_PASS_ACCEPT_REQ message. |
| */ |
| static int do_pass_accept_req(struct t3cdev *cdev, struct sk_buff *skb, |
| void *ctx) |
| { |
| struct cpl_pass_accept_req *req = cplhdr(skb); |
| struct listen_ctx *listen_ctx = (struct listen_ctx *)ctx; |
| struct sock *lsk = listen_ctx->lsk; |
| struct tom_data *d = listen_ctx->tom_data; |
| |
| #if VALIDATE_TID |
| unsigned int tid = GET_TID(req); |
| struct tid_info *t = &(T3C_DATA(cdev))->tid_maps; |
| |
| if (unlikely(!lsk)) { |
| printk(KERN_ERR "%s: PASS_ACCEPT_REQ had unknown STID %lu\n", |
| cdev->name, |
| (unsigned long)((union listen_entry *)ctx - |
| t->stid_tab)); |
| return CPL_RET_BUF_DONE; |
| } |
| if (unlikely(tid >= t->ntids)) { |
| printk(KERN_ERR "%s: passive open TID %u too large\n", |
| cdev->name, tid); |
| return CPL_RET_BUF_DONE; |
| } |
| /* |
| * For T3A the current user of the TID may have closed but its last |
| * message(s) may have been backlogged so the TID appears to be still |
| * in use. Just take the TID away, the connection can close at its |
| * own leisure. For T3B this situation is a bug. |
| */ |
| if (!valid_new_tid(t, tid) && |
| cdev->type != T3A) { |
| printk(KERN_ERR "%s: passive open uses existing TID %u\n", |
| cdev->name, tid); |
| return CPL_RET_BUF_DONE; |
| } |
| #endif |
| |
| BLOG_SKB_CB(skb)->dev = &d->tdev; |
| process_cpl_msg(process_pass_accept_req, lsk, skb); |
| return 0; |
| } |
| |
| /* |
| * Add a passively open socket to its parent's accept queue. Note that the |
| * child may be in any state by now, including TCP_CLOSE. We can guarantee |
| * though that it has not been orphaned yet. |
| */ |
| static void add_pass_open_to_parent(struct sock *child, struct sock *lsk, |
| struct toedev *dev) |
| { |
| struct request_sock *oreq; |
| |
| /* |
| * If the server is closed it has already killed its embryonic |
| * children. There is nothing further to do about child. |
| */ |
| if (lsk->sk_state != TCP_LISTEN) |
| return; |
| |
| oreq = child->sk_user_data; |
| child->sk_user_data = NULL; |
| |
| inet_csk_reqsk_queue_removed(lsk, oreq); |
| synq_remove(child); |
| |
| if (sk_acceptq_is_full(lsk) && !TOM_TUNABLE(dev, soft_backlog_limit)) { |
| T3_NET_INC_STATS_BH(sock_net(lsk), LINUX_MIB_LISTENOVERFLOWS); |
| T3_NET_INC_STATS_BH(sock_net(lsk), LINUX_MIB_LISTENDROPS); |
| __reqsk_free(oreq); |
| add_to_reap_list(child); |
| } else { |
| inet_csk_reqsk_queue_add(lsk, oreq, child); |
| lsk->sk_data_ready(lsk, 0); |
| } |
| } |
| |
| /* |
| * This is run from a listener's backlog to add a child socket to its accept |
| * queue. Note that at this point the child is not locked and we intentionally |
|  * do not bother locking it as the only fields we may be using are
|  * sk_user_data and the open request, and there aren't any concurrent users
|  * of them.
| */ |
| static void bl_add_pass_open_to_parent(struct sock *lsk, struct sk_buff *skb) |
| { |
| struct sock *child = skb->sk; |
| |
| skb->sk = NULL; |
| add_pass_open_to_parent(child, lsk, BLOG_SKB_CB(skb)->dev); |
| __kfree_skb(skb); |
| } |
| |
| /* |
| * Called when a connection is established to translate the TCP options |
| * reported by HW to Linux's native format. |
| */ |
| static void assign_rxopt(struct sock *sk, unsigned int opt) |
| { |
| const struct t3c_data *td = T3C_DATA(T3C_DEV(sk)); |
| struct tcp_sock *tp = tcp_sk(sk); |
| |
| MSS_CLAMP(tp) = td->mtus[G_TCPOPT_MSS(opt)] - 40; |
| tp->mss_cache = MSS_CLAMP(tp); |
| tp->tcp_header_len = sizeof(struct tcphdr); |
| TSTAMP_OK(tp) = G_TCPOPT_TSTAMP(opt); |
| SACK_OK(tp) = G_TCPOPT_SACK(opt); |
| WSCALE_OK(tp) = G_TCPOPT_WSCALE_OK(opt); |
| SND_WSCALE(tp) = G_TCPOPT_SND_WSCALE(opt); |
| if (!WSCALE_OK(tp)) |
| RCV_WSCALE(tp) = 0; |
| if (TSTAMP_OK(tp)) { |
| tp->tcp_header_len += TCPOLEN_TSTAMP_ALIGNED; |
| tp->mss_cache -= TCPOLEN_TSTAMP_ALIGNED; |
| } |
| } |
| |
| /* |
| * Completes some final bits of initialization for just established connections |
| * and changes their state to TCP_ESTABLISHED. |
| * |
| * snd_isn here is the ISN after the SYN, i.e., the true ISN + 1. |
| */ |
| static void make_established(struct sock *sk, u32 snd_isn, unsigned int opt) |
| { |
| struct tcp_sock *tp = tcp_sk(sk); |
| |
| tp->pushed_seq = tp->write_seq = tp->snd_nxt = tp->snd_una = snd_isn; |
| inet_sk(sk)->inet_id = tp->write_seq ^ jiffies; |
| assign_rxopt(sk, opt); |
| |
| /* |
| * Causes the first RX_DATA_ACK to supply any Rx credits we couldn't |
| * pass through opt0. |
| */ |
| if (tp->rcv_wnd > (M_RCV_BUFSIZ << 10)) |
| tp->rcv_wup -= tp->rcv_wnd - (M_RCV_BUFSIZ << 10); |
| |
| dst_confirm(sk->sk_dst_cache); |
| |
| /* |
| * tcp_poll() does not lock socket, make sure initial values are |
| * committed before changing to ESTABLISHED. |
| */ |
| mb(); |
| tcp_set_state(sk, TCP_ESTABLISHED); |
| } |
| |
| /* |
| * Process a CPL_PASS_ESTABLISH message. XXX a lot of the locking doesn't work |
| * if we are in TCP_SYN_RECV due to crossed SYNs |
| */ |
| static int do_pass_establish(struct t3cdev *cdev, struct sk_buff *skb, |
| void *ctx) |
| { |
| struct cpl_pass_establish *req = cplhdr(skb); |
| struct sock *lsk, *sk = (struct sock *)ctx; |
| struct cpl_io_state *cplios = CPL_IO_STATE(sk); |
| struct toedev *tdev = cplios->toedev; |
| |
| VALIDATE_SOCK(sk); |
| |
| bh_lock_sock(sk); |
| if (unlikely(sock_owned_by_user(sk))) { |
| // This can only happen in simultaneous opens. XXX TBD |
| __kfree_skb(skb); |
| } else { |
| // Complete socket initialization now that we have the SND_ISN |
| struct t3c_tid_entry *t3c_stid; |
| struct tid_info *t; |
| unsigned int stid; |
| |
| cplios->wr_max = cplios->wr_avail = TOM_TUNABLE(tdev, max_wrs); |
| cplios->wr_unacked = 0; |
| cplios->rss_cpu_idx = G_QNUM(ntohl(skb->csum)); |
| make_established(sk, ntohl(req->snd_isn), ntohs(req->tcp_opt)); |
| |
| if (unlikely(sk->sk_socket)) { // simultaneous opens only |
| sk->sk_state_change(sk); |
| sk_wake_async(sk, 0, POLL_OUT); |
| } |
| |
| /* |
| * The state for the new connection is now up to date. |
| * Next check if we should add the connection to the parent's |
| * accept queue. When the parent closes it resets connections |
| * on its SYN queue, so check if we are being reset. If so we |
| * don't need to do anything more, the coming ABORT_RPL will |
| * destroy this socket. Otherwise move the connection to the |
| * accept queue. |
| * |
| * Note that we reset the synq before closing the server so if |
| * we are not being reset the stid is still open. |
| */ |
| if (unlikely(synq_empty(sk))) { |
| /* removed from synq */ |
| __kfree_skb(skb); |
| goto unlock; |
| } |
| |
| stid = G_PASS_OPEN_TID(ntohl(req->tos_tid)); |
| t = &(T3C_DATA(cdev))->tid_maps; |
| t3c_stid = lookup_stid(t, stid); |
| lsk = ((struct listen_ctx *)t3c_stid->ctx)->lsk; |
| |
| bh_lock_sock(lsk); |
| if (likely(!sock_owned_by_user(lsk))) { |
| __kfree_skb(skb); |
| add_pass_open_to_parent(sk, lsk, tdev); |
| } else { |
| skb->sk = sk; |
| BLOG_SKB_CB(skb)->dev = tdev; |
| BLOG_SKB_CB(skb)->backlog_rcv = bl_add_pass_open_to_parent; |
| __sk_add_backlog(lsk, skb); |
| } |
| bh_unlock_sock(lsk); |
| } |
| unlock: |
| bh_unlock_sock(sk); |
| return 0; |
| } |
| |
| /* |
| * Fill in the right TID for CPL messages waiting in the out-of-order queue |
| * and send them to the TOE. |
| */ |
| static void fixup_and_send_ofo(struct sock *sk) |
| { |
| struct cpl_io_state *cplios = CPL_IO_STATE(sk); |
| struct sk_buff *skb; |
| struct toedev *tdev = cplios->toedev; |
| struct tcp_sock *tp = tcp_sk(sk); |
| unsigned int tid = cplios->tid; |
| |
| while ((skb = __skb_dequeue(&tp->out_of_order_queue)) != NULL) { |
| /* |
| * A variety of messages can be waiting but the fields we'll |
| * be touching are common to all so any message type will do. |
| */ |
| struct cpl_close_con_req *p = cplhdr(skb); |
| |
| p->wr.wr_lo = htonl(V_WR_TID(tid)); |
| OPCODE_TID(p) = htonl(MK_OPCODE_TID(p->ot.opcode, tid)); |
| cxgb3_ofld_send(TOM_DATA(tdev)->cdev, skb); |
| } |
| } |
| |
| /* |
| * Adjust buffers already in write queue after a SYN_SENT->ESTABLISHED |
| * transition. For TX_DATA we need to adjust the start sequence numbers, and |
|  * for other packets we need to adjust the TID.  TX_DATA packets don't have
|  * headers yet and so no TIDs either.
| */ |
| static void fixup_pending_writeq_buffers(struct sock *sk) |
| { |
| struct sk_buff *skb; |
| struct tcp_sock *tp = tcp_sk(sk); |
| unsigned int tid = CPL_IO_STATE(sk)->tid; |
| |
| skb_queue_walk(&sk->sk_write_queue, skb) { |
| if (ULP_SKB_CB(skb)->flags & ULPCB_FLAG_NEED_HDR) { |
| ULP_SKB_CB(skb)->seq = tp->write_seq; |
| tp->write_seq += skb->len + ulp_extra_len(skb); |
| } else { |
| struct cpl_close_con_req *p = cplhdr(skb); |
| |
| p->wr.wr_lo = htonl(V_WR_TID(tid)); |
| OPCODE_TID(p) = htonl(MK_OPCODE_TID(p->ot.opcode, tid)); |
| } |
| } |
| } |
| |
| /* |
| * Updates socket state from an active establish CPL message. Runs with the |
| * socket lock held. |
| */ |
| static void sock_act_establish(struct sock *sk, struct sk_buff *skb) |
| { |
| struct cpl_io_state *cplios = CPL_IO_STATE(sk); |
| struct cpl_act_establish *req = cplhdr(skb); |
| u32 rcv_isn = ntohl(req->rcv_isn); /* real RCV_ISN + 1 */ |
| struct tcp_sock *tp = tcp_sk(sk); |
| |
| if (unlikely(sk->sk_state != TCP_SYN_SENT)) |
| printk(KERN_ERR "TID %u expected SYN_SENT, found %d\n", |
| cplios->tid, sk->sk_state); |
| |
| tp->rcv_tstamp = tcp_time_stamp; |
| cplios->delack_seq = tp->copied_seq = tp->rcv_wup = tp->rcv_nxt = rcv_isn; |
| make_established(sk, ntohl(req->snd_isn), ntohs(req->tcp_opt)); |
| |
| #if defined(CONFIG_SECURITY_NETWORK) && defined(SEC_INET_CONN_ESTABLISHED) |
| security_inet_conn_estab(sk, tcphdr_skb); |
| #endif |
| |
| /* |
| * Now that we finally have a TID send any CPL messages that we had to |
| * defer for lack of a TID. |
| */ |
| if (skb_queue_len(&tp->out_of_order_queue)) |
| fixup_and_send_ofo(sk); |
| |
| if (likely(!sock_flag(sk, SOCK_DEAD))) { |
| sk->sk_state_change(sk); |
| sk_wake_async(sk, 0, POLL_OUT); |
| } |
| |
| __kfree_skb(skb); |
| |
| /* |
| * Currently the send queue must be empty at this point because the |
| * socket layer does not send anything before a connection is |
| * established. To be future proof though we handle the possibility |
| * that there are pending buffers to send (either TX_DATA or |
| * CLOSE_CON_REQ). First we need to adjust the sequence number of the |
| * buffers according to the just learned write_seq, and then we send |
| * them on their way. |
| */ |
| fixup_pending_writeq_buffers(sk); |
| if (t3_push_frames(sk, 1)) |
| sk->sk_write_space(sk); |
| } |
| |
| /* |
| * Process a CPL_ACT_ESTABLISH message. |
| */ |
| static int do_act_establish(struct t3cdev *cdev, struct sk_buff *skb, void *ctx) |
| { |
| struct cpl_act_establish *req = cplhdr(skb); |
| unsigned int tid = GET_TID(req); |
| unsigned int atid = G_PASS_OPEN_TID(ntohl(req->tos_tid)); |
| struct sock *sk = (struct sock *)ctx; |
| struct cpl_io_state *cplios = CPL_IO_STATE(sk); |
| struct toedev *tdev = cplios->toedev; |
| struct tom_data *d = TOM_DATA(tdev); |
| |
| /* |
| * It's OK if the TID is currently in use, the owning socket may have |
| * backlogged its last CPL message(s). Just take it away. |
| */ |
| CPL_IO_STATE(sk)->tid = tid; |
| sk_insert_tid(d, sk, tid); |
| free_atid(cdev, atid); |
| |
| cplios->rss_cpu_idx = G_QNUM(ntohl(skb->csum)); |
| |
| process_cpl_msg(sock_act_establish, sk, skb); |
| return 0; |
| } |
| |
| /* |
| * Process an acknowledgment of WR completion. Advance snd_una and send the |
| * next batch of work requests from the write queue. |
| */ |
| static void wr_ack(struct sock *sk, struct sk_buff *skb) |
| { |
| struct cpl_io_state *cplios = CPL_IO_STATE(sk); |
| struct tcp_sock *tp = tcp_sk(sk); |
| struct cpl_wr_ack *hdr = cplhdr(skb); |
| unsigned int credits = ntohs(hdr->credits); |
| u32 snd_una = ntohl(hdr->snd_una); |
| |
| cplios->wr_avail += credits; |
| |
| /* |
| * If the last write request in the queue with a request completion |
| 	 * flag has been consumed, reset our bookkeeping.
| */ |
| if (cplios->wr_unacked > cplios->wr_max - cplios->wr_avail) |
| cplios->wr_unacked = cplios->wr_max - cplios->wr_avail; |
| |
| while (credits) { |
| struct sk_buff *p = peek_wr(sk); |
| |
| if (unlikely(!p)) { |
| printk(KERN_ERR "%u WR_ACK credits for TID %u with " |
| "nothing pending, state %u\n", |
| credits, cplios->tid, sk->sk_state); |
| break; |
| } |
| if (unlikely(credits < p->csum)) { |
| #if DEBUG_WR > 1 |
| struct tx_data_wr *w = cplhdr(p); |
| |
| printk(KERN_ERR |
| "TID %u got %u WR credits, need %u, len %u, " |
| "main body %u, frags %u, seq # %u, ACK una %u," |
| " ACK nxt %u, WR_AVAIL %u, WRs pending %u\n", |
| cplios->tid, credits, p->csum, p->len, |
| p->len - p->data_len, skb_shinfo(p)->nr_frags, |
| ntohl(w->sndseq), snd_una, ntohl(hdr->snd_nxt), |
| cplios->wr_avail, count_pending_wrs(sk) - credits); |
| #endif |
| p->csum -= credits; |
| break; |
| } else { |
| dequeue_wr(sk); |
| credits -= p->csum; |
| free_wr_skb(p); |
| } |
| } |
| |
| #if DEBUG_WR |
| check_wr_invariants(sk); |
| #endif |
| |
| if (unlikely(before(snd_una, tp->snd_una))) { |
| #if VALIDATE_SEQ |
| struct tom_data *d = TOM_DATA(cplios->toedev); |
| |
| printk(KERN_ERR "%s: unexpected sequence # %u in WR_ACK " |
| "for TID %u, snd_una %u\n", (&d->tdev)->name, snd_una, |
| cplios->tid, tp->snd_una); |
| #endif |
| goto out_free; |
| } |
| |
| if (tp->snd_una != snd_una) { |
| tp->snd_una = snd_una; |
| dst_confirm(sk->sk_dst_cache); |
| tp->rcv_tstamp = tcp_time_stamp; |
| if (tp->snd_una == tp->snd_nxt) |
| cplios_reset_flag(sk, CPLIOS_TX_WAIT_IDLE); |
| } |
| |
| /* |
| * If there's more data queued up, see if we can get it into the write |
| * queue ... If we're able to push any data into the write queue, |
| * free up socket send buffer space. |
| */ |
| if (skb_queue_len(&sk->sk_write_queue) && t3_push_frames(sk, 0)) |
| sk->sk_write_space(sk); |
| out_free: |
| __kfree_skb(skb); |
| } |
| |
| /* |
| * Handler for TX_DATA_ACK CPL messages. |
| */ |
| static int do_wr_ack(struct t3cdev *dev, struct sk_buff *skb, void *ctx) |
| { |
| struct sock *sk = (struct sock *)ctx; |
| |
| VALIDATE_SOCK(sk); |
| |
| process_cpl_msg(wr_ack, sk, skb); |
| return 0; |
| } |
| |
| /* |
| * Handler for TRACE_PKT CPL messages. Just sink these packets. |
| */ |
| static int do_trace_pkt(struct t3cdev *dev, struct sk_buff *skb, void *ctx) |
| { |
| __kfree_skb(skb); |
| return 0; |
| } |
| |
| /* |
| * Disconnect offloaded established but not yet accepted connections sitting |
| * on a server's accept_queue. We just send an ABORT_REQ at this point and |
| * finish off the disconnect later as we may need to wait for the ABORT_RPL. |
| */ |
| void t3_disconnect_acceptq(struct sock *listen_sk) |
| { |
| struct request_sock **pprev; |
| |
| pprev = ACCEPT_QUEUE(listen_sk); |
| while (*pprev) { |
| struct request_sock *req = *pprev; |
| |
| if (req->rsk_ops == RSK_OPS(&t3_rsk_ops)) { // one of ours |
| struct sock *child = req->sk; |
| |
| *pprev = req->dl_next; |
| sk_acceptq_removed(listen_sk); |
| __reqsk_free(req); |
| release_tcp_port(child); |
| reset_listen_child(child); |
| } else |
| pprev = &req->dl_next; |
| } |
| } |
| |
| /* |
| * Reset offloaded connections sitting on a server's syn queue. As above |
| * we send ABORT_REQ and finish off when we get ABORT_RPL. |
| */ |
| void t3_reset_synq(struct sock *listen_sk) |
| { |
| struct sock **nextsk = &synq_next(listen_sk); |
| |
| /* |
| * Note: the while predicate below is a little tricky because the |
| * fields used to implement the doubly linked list have been hijacked |
| * out of the (struct tcp_sock) portion of the socket. If the fields |
| * were solely ours to use, then the test of "*nextsk != listen_sk" |
| * would be enough. But when we empty the SYN queue, the state of |
| * those hijacked fields are reset to the values expected by Linux |
| * and "*nextsk" will no longer have any legitimate meaning for us. |
| * Thus the double predicate of testing for both the SYN queue being |
| * empty (which is implemented in a Linux version-dependent fashion) |
| * and making sure the next socket to process isn't our listen |
| * socket ... |
| */ |
| while (!synq_empty(listen_sk) && *nextsk != listen_sk) { |
| struct sock *child = *nextsk; |
| |
| if (child->sk_prot == &t3_tcp_prot.proto) { |
| /* one of ours */ |
| cleanup_syn_rcv_conn(child, listen_sk); |
| release_tcp_port(child); |
| reset_listen_child(child); |
| } else { |
| /* some other offloaded socket ... */ |
| nextsk = &synq_next(*nextsk); |
| } |
| } |
| } |
| |
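| /*
|  * Write the page pods describing a DDP buffer to adapter memory.  Each pod
|  * is sent as a BYPASS work request carrying a ULP_MEM_WRITE command and
|  * covers 4 pages of the gather list; a fifth address is included so
|  * consecutive pods overlap by one entry, and the trailing sentinel pods are
|  * marked invalid.
|  */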
| int t3_setup_ppods(struct sock *sk, const struct ddp_gather_list *gl, |
| unsigned int nppods, unsigned int tag, unsigned int maxoff, |
| unsigned int pg_off, unsigned int color) |
| { |
| struct cpl_io_state *cplios = CPL_IO_STATE(sk); |
| unsigned int i, j, pidx; |
| struct pagepod *p; |
| struct sk_buff *skb; |
| struct ulp_mem_io *req; |
| struct tcp_sock *tp = tcp_sk(sk); |
| unsigned int tid = cplios->tid; |
| const struct tom_data *td = TOM_DATA(cplios->toedev); |
| unsigned int ppod_addr = tag * PPOD_SIZE + td->ddp_llimit; |
| |
| for (i = 0; i < nppods; ++i) { |
| skb = alloc_ctrl_skb(tp, sizeof(*req) + PPOD_SIZE); |
| skb->priority = mkprio(CPL_PRIORITY_CONTROL, sk); |
| req = (struct ulp_mem_io *)__skb_put(skb, |
| sizeof(*req) + PPOD_SIZE); |
| req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS)); |
| req->cmd_lock_addr = htonl(V_ULP_MEMIO_ADDR(ppod_addr >> 5) | |
| V_ULPTX_CMD(ULP_MEM_WRITE)); |
| req->len = htonl(V_ULP_MEMIO_DATA_LEN(PPOD_SIZE / 32) | |
| V_ULPTX_NFLITS(PPOD_SIZE / 8 + 1)); |
| |
| p = (struct pagepod *)(req + 1); |
| if (likely(i < nppods - NUM_SENTINEL_PPODS)) { |
| p->vld_tid = htonl(F_PPOD_VALID | V_PPOD_TID(tid)); |
| p->pgsz_tag_color = htonl(V_PPOD_TAG(tag) | |
| V_PPOD_COLOR(color)); |
| p->max_offset = htonl(maxoff); |
| p->page_offset = htonl(pg_off); |
| p->rsvd = 0; |
| for (pidx = 4 * i, j = 0; j < 5; ++j, ++pidx) |
| p->addr[j] = pidx < gl->nelem ? |
| cpu_to_be64(gl->phys_addr[pidx]) : 0; |
| } else |
| p->vld_tid = 0; /* mark sentinel page pods invalid */ |
| send_or_defer(sk, tp, skb, 0); |
| ppod_addr += PPOD_SIZE; |
| } |
| return 0; |
| } |
| |
| /* |
| * Build a CPL_BARRIER message as payload of a ULP_TX_PKT command. |
| */ |
| static inline void mk_cpl_barrier_ulp(struct cpl_barrier *b) |
| { |
| struct ulp_txpkt *txpkt = (struct ulp_txpkt *)b; |
| |
| txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT)); |
| txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*b) / 8)); |
| b->opcode = CPL_BARRIER; |
| } |
| |
| /* |
| * Build a CPL_GET_TCB message as payload of a ULP_TX_PKT command. |
| */ |
| static inline void mk_get_tcb_ulp(struct cpl_get_tcb *req, unsigned int tid, |
| unsigned int cpuno) |
| { |
| struct ulp_txpkt *txpkt = (struct ulp_txpkt *)req; |
| |
| txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT)); |
| txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*req) / 8)); |
| OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_GET_TCB, tid)); |
| req->cpuno = htons(cpuno); |
| } |
| |
| /* |
| * Build a CPL_SET_TCB_FIELD message as payload of a ULP_TX_PKT command. |
| */ |
| static inline void mk_set_tcb_field_ulp(struct cpl_set_tcb_field *req, |
| unsigned int tid, unsigned int word, |
| u64 mask, u64 val) |
| { |
| struct ulp_txpkt *txpkt = (struct ulp_txpkt *)req; |
| |
| txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT)); |
| txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*req) / 8)); |
| OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, tid)); |
| req->reply = V_NO_REPLY(1); |
| req->cpu_idx = 0; |
| req->word = htons(word); |
| req->mask = cpu_to_be64(mask); |
| req->val = cpu_to_be64(val); |
| } |
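| 
| /*
| * The 64-bit mask/val pair may span two adjacent 32-bit TCB fields.  For
| * example, t3_overlay_ddpbuf() below programs both DDP buffer tags with a
| * single field write:
| *
| *	mk_set_tcb_field_ulp(req, cplios->tid, W_TCB_RX_DDP_BUF0_TAG,
| *			V_TCB_RX_DDP_BUF0_TAG(M_TCB_RX_DDP_BUF0_TAG) |
| *			V_TCB_RX_DDP_BUF1_TAG(M_TCB_RX_DDP_BUF1_TAG) << 32,
| *			V_TCB_RX_DDP_BUF0_TAG(tag0) |
| *			V_TCB_RX_DDP_BUF1_TAG((u64)tag1) << 32);
| */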
| |
| /* |
| * Build a CPL_RX_DATA_ACK message as payload of a ULP_TX_PKT command. |
| */ |
| static void mk_rx_data_ack_ulp(struct sock *sk, struct cpl_rx_data_ack *ack, |
| unsigned int tid, |
| unsigned int credits) |
| { |
| struct ulp_txpkt *txpkt = (struct ulp_txpkt *)ack; |
| u32 dack; |
| |
| dack = t3_select_delack(sk); |
| |
| txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT)); |
| txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*ack) / 8)); |
| OPCODE_TID(ack) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, tid)); |
| ack->credit_dack = htonl(F_RX_MODULATE | F_RX_DACK_CHANGE | |
| V_RX_DACK_MODE(dack) | |
| V_RX_CREDITS(credits)); |
| } |
| |
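| /**
| * t3_cancel_ddpbuf - invalidate a HW DDP buffer
| * @sk: the socket the DDP buffer belongs to
| * @bufidx: index of the HW DDP buffer to invalidate (0 or 1)
| *
| * Sends a compound BYPASS work request carrying a CPL_BARRIER, a
| * CPL_SET_TCB_FIELD, a CPL_GET_TCB, and a closing CPL_BARRIER.  The
| * SET_TCB_FIELD clears the buffer's valid flag and makes the other buffer
| * the active one; the GET_TCB reads the TCB back so the reply handler can
| * see the buffer state at the time of the cancellation.  The outstanding
| * reply is accounted for in p->get_tcb_count.
| */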
| void t3_cancel_ddpbuf(struct sock *sk, unsigned int bufidx) |
| { |
| struct cpl_io_state *cplios = CPL_IO_STATE(sk); |
| unsigned int wrlen; |
| struct sk_buff *skb; |
| struct work_request_hdr *wr; |
| struct cpl_barrier *lock; |
| struct cpl_set_tcb_field *req; |
| struct cpl_get_tcb *getreq; |
| 	struct ddp_state *p = DDP_STATE(sk);
| 	struct tcp_sock *tp = tcp_sk(sk);
| |
| wrlen = sizeof(*wr) + sizeof(*req) + 2 * sizeof(*lock) + |
| sizeof(*getreq); |
| skb = alloc_ctrl_skb(tp, wrlen); |
| skb->priority = mkprio(CPL_PRIORITY_CONTROL, sk); |
| |
| wr = (struct work_request_hdr *)__skb_put(skb, wrlen); |
| wr->wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS)); |
| |
| lock = (struct cpl_barrier *)(wr + 1); |
| mk_cpl_barrier_ulp(lock); |
| |
| req = (struct cpl_set_tcb_field *)(lock + 1); |
| |
| 	/* Hmmm, not sure if this is actually a good thing: reactivating
| 	 * the other buffer might be an issue if it has been completed
| 	 * already.  However, that is unlikely, since the fact that the UBUF
| 	 * is not completed indicates that there is no outstanding data.
| 	 */
| if (bufidx == 0) |
| mk_set_tcb_field_ulp(req, cplios->tid, W_TCB_RX_DDP_FLAGS, |
| V_TF_DDP_ACTIVE_BUF(1) | |
| V_TF_DDP_BUF0_VALID(1), |
| V_TF_DDP_ACTIVE_BUF(1)); |
| else |
| mk_set_tcb_field_ulp(req, cplios->tid, W_TCB_RX_DDP_FLAGS, |
| V_TF_DDP_ACTIVE_BUF(1) | |
| V_TF_DDP_BUF1_VALID(1), 0); |
| |
| getreq = (struct cpl_get_tcb *)(req + 1); |
| mk_get_tcb_ulp(getreq, cplios->tid, cplios->rss_cpu_idx); |
| |
| mk_cpl_barrier_ulp((struct cpl_barrier *)(getreq + 1)); |
| |
| 	/* Keep track of the number of outstanding CPL_GET_TCB requests. */
| p->get_tcb_count++; |
| |
| #ifdef T3_TRACE |
| T3_TRACE1(TIDTB(sk), |
| "t3_cancel_ddpbuf: bufidx %u", bufidx); |
| #endif |
| cxgb3_ofld_send(T3C_DEV(sk), skb); |
| } |
| |
| /** |
| * t3_overlay_ddpbuf - overlay an existing DDP buffer with a new one |
| * @sk: the socket associated with the buffers |
| * @bufidx: index of HW DDP buffer (0 or 1) |
| * @tag0: new tag for HW buffer 0 |
| * @tag1: new tag for HW buffer 1 |
| * @len: new length for HW buf @bufidx |
| * |
| * Sends a compound WR to overlay a new DDP buffer on top of an existing |
| * buffer by changing the buffer tag and length and setting the valid and |
| * active flag accordingly. The caller must ensure the new buffer is at |
| * least as big as the existing one. Since we typically reprogram both HW |
| * buffers this function sets both tags for convenience. Read the TCB to |
| * determine how much data was written into the buffer before the overlay
| * took place. |
| */ |
| void t3_overlay_ddpbuf(struct sock *sk, unsigned int bufidx, unsigned int tag0, |
| unsigned int tag1, unsigned int len) |
| { |
| struct cpl_io_state *cplios = CPL_IO_STATE(sk); |
| unsigned int wrlen; |
| struct sk_buff *skb; |
| struct work_request_hdr *wr; |
| struct cpl_get_tcb *getreq; |
| struct cpl_set_tcb_field *req; |
| 	struct ddp_state *p = DDP_STATE(sk);
| 	struct tcp_sock *tp = tcp_sk(sk);
| |
| wrlen = sizeof(*wr) + 3 * sizeof(*req) + sizeof(*getreq); |
| skb = alloc_ctrl_skb(tp, wrlen); |
| skb->priority = mkprio(CPL_PRIORITY_CONTROL, sk); |
| |
| wr = (struct work_request_hdr *)__skb_put(skb, wrlen); |
| |
| 	/* Set the ATOMIC flag to make sure that TP processes the following
| 	 * CPLs in an atomic manner and no wire segments can be interleaved.
| 	 */
| wr->wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS) | F_WR_ATOMIC); |
| |
| req = (struct cpl_set_tcb_field *)(wr + 1); |
| mk_set_tcb_field_ulp(req, cplios->tid, W_TCB_RX_DDP_BUF0_TAG, |
| V_TCB_RX_DDP_BUF0_TAG(M_TCB_RX_DDP_BUF0_TAG) | |
| V_TCB_RX_DDP_BUF1_TAG(M_TCB_RX_DDP_BUF1_TAG) << 32, |
| V_TCB_RX_DDP_BUF0_TAG(tag0) | |
| V_TCB_RX_DDP_BUF1_TAG((u64)tag1) << 32); |
| req++; |
| if (bufidx == 0) { |
| mk_set_tcb_field_ulp(req, cplios->tid, W_TCB_RX_DDP_BUF0_LEN, |
| V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN), |
| V_TCB_RX_DDP_BUF0_LEN((u64)len)); |
| req++; |
| mk_set_tcb_field_ulp(req, cplios->tid, W_TCB_RX_DDP_FLAGS, |
| V_TF_DDP_PUSH_DISABLE_0(1) | |
| V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(1), |
| V_TF_DDP_PUSH_DISABLE_0(0) | |
| V_TF_DDP_BUF0_VALID(1)); |
| } else { |
| mk_set_tcb_field_ulp(req, cplios->tid, W_TCB_RX_DDP_BUF1_LEN, |
| V_TCB_RX_DDP_BUF1_LEN(M_TCB_RX_DDP_BUF1_LEN), |
| V_TCB_RX_DDP_BUF1_LEN((u64)len)); |
| req++; |
| mk_set_tcb_field_ulp(req, cplios->tid, W_TCB_RX_DDP_FLAGS, |
| V_TF_DDP_PUSH_DISABLE_1(1) | |
| V_TF_DDP_BUF1_VALID(1) | V_TF_DDP_ACTIVE_BUF(1), |
| V_TF_DDP_PUSH_DISABLE_1(0) | |
| V_TF_DDP_BUF1_VALID(1) | V_TF_DDP_ACTIVE_BUF(1)); |
| } |
| |
| getreq = (struct cpl_get_tcb *)(req + 1); |
| mk_get_tcb_ulp(getreq, cplios->tid, cplios->rss_cpu_idx); |
| |
| 	/* Keep track of the number of outstanding CPL_GET_TCB requests. */
| p->get_tcb_count++; |
| |
| #ifdef T3_TRACE |
| T3_TRACE4(TIDTB(sk), |
| "t3_overlay_ddpbuf: bufidx %u tag0 %u tag1 %u " |
| "len %d", |
| bufidx, tag0, tag1, len); |
| #endif |
| cxgb3_ofld_send(T3C_DEV(sk), skb); |
| } |
| |
| /* |
| * Sends a compound WR containing all the CPL messages needed to program the |
| * two HW DDP buffers, namely optionally setting up the length and offset of |
| * each buffer, programming the DDP flags, and optionally sending RX_DATA_ACK. |
| */ |
| void t3_setup_ddpbufs(struct sock *sk, unsigned int len0, unsigned int offset0, |
| unsigned int len1, unsigned int offset1, |
| u64 ddp_flags, u64 flag_mask, int modulate) |
| { |
| struct cpl_io_state *cplios = CPL_IO_STATE(sk); |
| unsigned int wrlen; |
| struct sk_buff *skb; |
| struct work_request_hdr *wr; |
| struct cpl_set_tcb_field *req; |
| struct tcp_sock *tp = tcp_sk(sk); |
| |
| wrlen = sizeof(*wr) + sizeof(*req) + (len0 ? sizeof(*req) : 0) + |
| (len1 ? sizeof(*req) : 0) + |
| (modulate ? sizeof(struct cpl_rx_data_ack) : 0); |
| skb = alloc_ctrl_skb(tp, wrlen); |
| skb->priority = mkprio(CPL_PRIORITY_CONTROL, sk); |
| |
| wr = (struct work_request_hdr *)__skb_put(skb, wrlen); |
| wr->wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS)); |
| |
| req = (struct cpl_set_tcb_field *)(wr + 1); |
| if (len0) { /* program buffer 0 offset and length */ |
| mk_set_tcb_field_ulp(req, cplios->tid, W_TCB_RX_DDP_BUF0_OFFSET, |
| V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) | |
| V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN), |
| V_TCB_RX_DDP_BUF0_OFFSET((u64)offset0) | |
| V_TCB_RX_DDP_BUF0_LEN((u64)len0)); |
| req++; |
| } |
| if (len1) { /* program buffer 1 offset and length */ |
| mk_set_tcb_field_ulp(req, cplios->tid, W_TCB_RX_DDP_BUF1_OFFSET, |
| V_TCB_RX_DDP_BUF1_OFFSET(M_TCB_RX_DDP_BUF1_OFFSET) | |
| V_TCB_RX_DDP_BUF1_LEN(M_TCB_RX_DDP_BUF1_LEN) << 32, |
| V_TCB_RX_DDP_BUF1_OFFSET((u64)offset1) | |
| V_TCB_RX_DDP_BUF1_LEN((u64)len1) << 32); |
| req++; |
| } |
| |
| mk_set_tcb_field_ulp(req, cplios->tid, W_TCB_RX_DDP_FLAGS, flag_mask, |
| ddp_flags); |
| |
| if (modulate) { |
| mk_rx_data_ack_ulp(sk, (struct cpl_rx_data_ack *)(req + 1), |
| cplios->tid, |
| tp->copied_seq - tp->rcv_wup); |
| tp->rcv_wup = tp->copied_seq; |
| } |
| |
| #ifdef T3_TRACE |
| T3_TRACE5(TIDTB(sk), |
| "t3_setup_ddpbufs: len0 %u len1 %u ddp_flags 0x%08x%08x " |
| "modulate %d", |
| len0, len1, ddp_flags >> 32, ddp_flags & 0xffffffff, |
| modulate); |
| #endif |
| |
| cxgb3_ofld_send(T3C_DEV(sk), skb); |
| } |
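| 
| /*
| * Illustrative call (not taken from this file): to make a freshly
| * programmed buffer 0 of length len the active DDP buffer and piggyback an
| * RX credit return, a caller could issue
| *
| *	t3_setup_ddpbufs(sk, len, 0, 0, 0,
| *			 V_TF_DDP_BUF0_VALID(1),
| *			 V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(1), 1);
| *
| * which sets BUF0_VALID, clears ACTIVE_BUF (selecting buffer 0), and
| * appends a modulated RX_DATA_ACK.
| */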
| |
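| /*
| * Initialize the skb_wrs[] table that maps the number of gather-list
| * entries in an skb to the number of work requests needed to send it, and
| * record the WR size in bytes in wrlen.  A gather list with i entries
| * takes 3 * i / 2 + (i & 1) flits of SGL plus 3 more flits; if that fits
| * within wr_len (the WR size in 8-byte flits) a single WR suffices,
| * otherwise the remainder spills into continuation WRs.  Illustrative
| * example, assuming wr_len == 16: for i == 10 the total is 15 + 0 + 3 = 18
| * flits, so skb_wrs[10] = 1 + (18 - 2) / (16 - 1) = 2.
| */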
| void t3_init_wr_tab(unsigned int wr_len) |
| { |
| int i; |
| |
| if (skb_wrs[1]) /* already initialized */ |
| 		return;
| |
| for (i = 1; i < ARRAY_SIZE(skb_wrs); i++) { |
| int sgl_len = (3 * i) / 2 + (i & 1); |
| |
| sgl_len += 3; |
| skb_wrs[i] = sgl_len <= wr_len ? |
| 1 : 1 + (sgl_len - 2) / (wr_len - 1); |
| } |
| |
| wrlen = wr_len * 8; |
| } |
| |
| int __init t3_init_cpl_io(void) |
| { |
| tcphdr_skb = alloc_skb(sizeof(struct tcphdr), GFP_KERNEL); |
| if (!tcphdr_skb) { |
| printk(KERN_ERR |
| "Chelsio TCP offload: can't allocate sk_buff\n"); |
| 		return -ENOMEM;
| } |
| skb_put(tcphdr_skb, sizeof(struct tcphdr)); |
| skb_reset_transport_header(tcphdr_skb); |
| memset(tcphdr_skb->data, 0, tcphdr_skb->len); |
| /* CIPSO_V4_OPTEXIST is false for tcphdr_skb without anything extra */ |
| |
| t3tom_register_cpl_handler(CPL_PASS_ESTABLISH, do_pass_establish); |
| t3tom_register_cpl_handler(CPL_ACT_ESTABLISH, do_act_establish); |
| t3tom_register_cpl_handler(CPL_ACT_OPEN_RPL, do_act_open_rpl); |
| t3tom_register_cpl_handler(CPL_PASS_ACCEPT_REQ, do_pass_accept_req); |
| t3tom_register_cpl_handler(CPL_RX_URG_NOTIFY, do_rx_urg_notify); |
| t3tom_register_cpl_handler(CPL_RX_DATA, do_rx_data); |
| t3tom_register_cpl_handler(CPL_RX_DATA_DDP, do_rx_data_ddp); |
| t3tom_register_cpl_handler(CPL_RX_DDP_COMPLETE, do_rx_ddp_complete); |
| t3tom_register_cpl_handler(CPL_TX_DMA_ACK, do_wr_ack); |
| t3tom_register_cpl_handler(CPL_PEER_CLOSE, do_peer_close); |
| t3tom_register_cpl_handler(CPL_ABORT_REQ_RSS, do_abort_req); |
| t3tom_register_cpl_handler(CPL_ABORT_RPL_RSS, do_abort_rpl); |
| t3tom_register_cpl_handler(CPL_CLOSE_CON_RPL, do_close_con_rpl); |
| t3tom_register_cpl_handler(CPL_TRACE_PKT, do_trace_pkt); |
| t3tom_register_cpl_handler(CPL_GET_TCB_RPL, do_get_tcb_rpl); |
| return 0; |
| } |