/*
* This file implements the interface between the socket layer and
* the HW TCP/CPL, including the protocol operations for Chelsio's HW TCP.
*
* Large portions of this file are taken from net/ipv4/tcp.c.
* See that file for copyrights of the original code.
* Any additional code is
*
* Copyright (C) 2003-2009 Chelsio Communications. All rights reserved.
*
* Written by Dimitris Michailidis (dm@chelsio.com)
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License as published by the Free
* Software Foundation; either version 2 of the License, or (at your option)
* any later version.
*
* This program is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
* more details.
*
* You should have received a copy of the GNU General Public License along with
* this program; if not, write to the Free Software Foundation, Inc., 59
* Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*/
#include "defs.h"
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/init.h>
#include <linux/fs.h>
#include <linux/toedev.h>
#include <linux/module.h>
#if defined(CONFIG_T3_ZCOPY_SENDMSG) || defined(CONFIG_T3_ZCOPY_SENDMSG_MODULE)
#include <linux/pagemap.h>
#include <linux/mm.h>
#endif
#include <net/offload.h>
#include <net/tcp.h>
#include <net/ip.h>
#include <asm/uaccess.h>
#include <asm/ioctls.h>
#include "t3_ddp.h"
#include "tom.h"
#include "tcb.h"
#include "firmware_exports.h"
#include "trace.h"
/*
* This must be called with the socket locked, otherwise dev may be NULL.
*/
static inline int chelsio_wspace(const struct sock *sk)
{
struct toedev *dev = CPL_IO_STATE(sk)->toedev;
return dev ? TOM_TUNABLE(dev, max_host_sndbuf) - sk->sk_wmem_queued : 0;
}
/*
* TCP socket write_space callback. Follows sk_stream_write_space().
*/
void t3_write_space(struct sock *sk)
{
struct socket *sock = sk->sk_socket;
if (chelsio_wspace(sk) >= sk_stream_min_wspace(sk) && sock) {
clear_bit(SOCK_NOSPACE, &sock->flags);
sk_wakeup_sleepers(sk, 1);
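/* how == 2 requests the write-space wakeup (SOCK_WAKE_SPACE on newer kernels) */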
sk_wake_async(sk, 2, POLL_OUT);
}
}
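/*
* Returns true if the connection still has host send buffer space available.
*/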
static inline int tcp_memory_free(struct sock *sk)
{
return chelsio_wspace(sk) > 0;
}
/*
* Wait for memory to become available, either space in a socket's send buffer
* or system memory.
*/
static int wait_for_mem(struct sock *sk, long *timeout)
{
int sndbuf, err = 0;
long vm_wait = 0;
long current_timeo = *timeout;
#ifdef LINUX_2_4
DECLARE_WAITQUEUE(wait, current);
#else
DEFINE_WAIT(wait);
#endif /* LINUX_2_4 */
/*
* We open code tcp_memory_free() because we need it outside the
* socket lock and chelsio_wspace() isn't safe there.
*/
sndbuf = TOM_TUNABLE(CPL_IO_STATE(sk)->toedev, max_host_sndbuf);
if (sndbuf > sk->sk_wmem_queued)
current_timeo = vm_wait = (net_random() % (HZ / 5)) + 2;
#ifdef LINUX_2_4
add_wait_queue(sk->sleep, &wait);
#endif /* LINUX_2_4 */
for (;;) {
set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
#ifdef LINUX_2_4
set_current_state(TASK_INTERRUPTIBLE);
#else
prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
#endif /* LINUX_2_4 */
if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) {
err = -EPIPE;
break;
}
if (!*timeout) {
err = -EAGAIN;
break;
}
if (signal_pending(current)) {
err = sock_intr_errno(*timeout);
break;
}
clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
if (sndbuf > sk->sk_wmem_queued && !vm_wait)
break;
set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
sk->sk_write_pending++;
release_sock(sk);
if (!sk->sk_err && !(sk->sk_shutdown & SEND_SHUTDOWN) &&
(sndbuf <= sk->sk_wmem_queued || vm_wait))
current_timeo = schedule_timeout(current_timeo);
lock_sock(sk);
sk->sk_write_pending--;
if (vm_wait) {
vm_wait -= current_timeo;
current_timeo = *timeout;
if (current_timeo != MAX_SCHEDULE_TIMEOUT &&
(current_timeo -= vm_wait) < 0)
current_timeo = 0;
vm_wait = 0;
}
*timeout = current_timeo;
}
#ifdef LINUX_2_4
current->state = TASK_RUNNING;
remove_wait_queue(sk->sleep, &wait);
#else
finish_wait(sk_sleep(sk), &wait);
#endif /* LINUX_2_4 */
return err;
}
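/*
* Append an skb to the socket's send queue, stamp it with the current write
* sequence number, and charge its memory to the socket.
*/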
static void skb_entail(struct sock *sk, struct sk_buff *skb, int flags)
{
struct tcp_sock *tp = tcp_sk(sk);
ULP_SKB_CB(skb)->seq = tp->write_seq;
ULP_SKB_CB(skb)->flags = flags;
__skb_queue_tail(&sk->sk_write_queue, skb);
sk->sk_wmem_queued += skb->truesize;
// tcp_charge_skb(sk, skb);
// Do not share pages across sk_buffs
if (TCP_PAGE(sk) && TCP_OFF(sk)) {
put_page(TCP_PAGE(sk));
TCP_PAGE(sk) = NULL;
TCP_OFF(sk) = 0;
}
}
/*
* Returns true if a connection should send more data to the TOE ASAP.
*/
static inline int should_push(struct sock *sk)
{
struct cpl_io_state *cplios = CPL_IO_STATE(sk);
struct tcp_sock *tp = tcp_sk(sk);
struct toedev *dev = cplios->toedev;
/*
* If there aren't any work requests in flight, or there isn't enough
* data in flight, or Nagle is off, then send the current TX_DATA;
* otherwise hold it and wait to accumulate more data.
*/
return cplios->wr_avail == cplios->wr_max ||
tp->snd_nxt - tp->snd_una <= TOM_TUNABLE(dev, tx_hold_thres) ||
(tp->nonagle & TCP_NAGLE_OFF);
}
/*
* Returns true if a TCP socket is corked.
*/
static inline int corked(const struct tcp_sock *tp, int flags)
{
return (flags & MSG_MORE) | (tp->nonagle & TCP_NAGLE_CORK);
}
/*
* Returns true if a send should try to push new data.
*/
static inline int send_should_push(struct sock *sk, int flags)
{
return should_push(sk) && !corked(tcp_sk(sk), flags);
}
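/*
* Mark an skb as final: nothing more may be appended to it and it needs a
* work request header before transmission.
*/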
static inline void tx_skb_finalize(struct sk_buff *skb)
{
struct ulp_skb_cb *cb = ULP_SKB_CB(skb);
#if defined(CONFIG_T3_ZCOPY_SENDMSG) || defined(CONFIG_T3_ZCOPY_SENDMSG_MODULE)
/*
* XXX We don't want to finalize an skb if it's flagged for ZCOPY
* XXX since we'll end up losing the flag. This needs to be looked
* XXX at more closely since we're blindly clearing a bunch of flags
* XXX here. Most of these flags (including those for ZCOPY)
* XXX probably ought to be retained rather than tossed and we
* XXX should certainly have an assert for flags that shouldn't
* XXX find their way into this routine ...
*/
if (cb->flags & (ULPCB_FLAG_ZCOPY|ULPCB_FLAG_ZCOPY_COW))
return;
#endif
cb->flags = ULPCB_FLAG_NO_APPEND | ULPCB_FLAG_NEED_HDR;
}
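/*
* For an OOB send, record the urgent pointer and flag the skb so it is marked
* urgent and transmitted immediately.
*/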
static inline void mark_urg(struct tcp_sock *tp, int flags,
struct sk_buff *skb)
{
if (unlikely(flags & MSG_OOB)) {
tp->snd_up = tp->write_seq;
ULP_SKB_CB(skb)->flags = ULPCB_FLAG_URG | ULPCB_FLAG_BARRIER |
ULPCB_FLAG_NO_APPEND |
ULPCB_FLAG_NEED_HDR;
}
}
/*
* Decide if the last frame on the send queue needs any special annotations
* (e.g., marked URG) and whether it should be transmitted immediately or
* held for additional data. This is the only routine that performs the full
* suite of tests for a Tx packet and therefore must be called for the last
* packet added by the various send*() APIs.
*/
static void tcp_push(struct sock *sk, int flags)
{
int qlen = skb_queue_len(&sk->sk_write_queue);
if (likely(qlen)) {
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb = sk->sk_write_queue.prev;
mark_urg(tp, flags, skb);
if (!(ULP_SKB_CB(skb)->flags & ULPCB_FLAG_NO_APPEND) &&
corked(tp, flags)) {
ULP_SKB_CB(skb)->flags |= ULPCB_FLAG_HOLD;
return;
}
ULP_SKB_CB(skb)->flags &= ~ULPCB_FLAG_HOLD;
if (qlen == 1 &&
((ULP_SKB_CB(skb)->flags & ULPCB_FLAG_NO_APPEND) ||
should_push(sk)))
t3_push_frames(sk, 1);
}
}
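/*
* Uncork the socket: clear TCP_NAGLE_CORK and push out any data held back
* while the socket was corked.
*/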
static void tcp_uncork(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
if (tp->nonagle & TCP_NAGLE_CORK) {
tp->nonagle &= ~TCP_NAGLE_CORK;
tcp_push(sk, 0);
}
}
/*
* Try to transmit the send queue if it has just one packet. This is intended
* to be called as full packets are added to the send queue by the various
* send*() APIs when we expect additional packets to be generated by the
* current API call. It should not be called for the last packet generated,
* use the full tcp_push call above for that.
*/
static inline void push_frames_if_head(struct sock *sk)
{
if (skb_queue_len(&sk->sk_write_queue) == 1)
t3_push_frames(sk, 1);
}
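/*
* Allocate a send skb with headroom reserved for the work request header and
* queue it on the socket's send queue.
*/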
static struct sk_buff *alloc_tx_skb(struct sock *sk, int size)
{
struct sk_buff *skb;
skb = alloc_skb(size + TX_HEADER_LEN, sk->sk_allocation);
if (likely(skb)) {
skb_reserve(skb, TX_HEADER_LEN);
skb_entail(sk, skb, ULPCB_FLAG_NEED_HDR);
}
return skb;
}
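/*
* sendpage() for offloaded sockets. Appends page fragments to the send queue,
* coalescing with the tail skb where possible, and pushes the data to the TOE.
*/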
static int chelsio_sendpage(struct sock *sk, struct page *page, int offset,
size_t size, int flags)
{
long timeo;
int mss, err, copied = 0;
struct tcp_sock *tp = tcp_sk(sk);
lock_sock(sk);
timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
/* Wait for connection establishment to finish. */
if (!sk_in_state(sk, TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) &&
(err = sk_stream_wait_connect(sk, &timeo)) != 0)
goto out_err;
clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
err = -EPIPE;
if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
goto out_err;
mss = TOM_TUNABLE(CPL_IO_STATE(sk)->toedev, mss);
cplios_set_flag(sk, CPLIOS_TX_MORE_DATA);
while (size > 0) {
int copy, i;
struct sk_buff *skb = skb_peek_tail(&sk->sk_write_queue);
if (!skb || (ULP_SKB_CB(skb)->flags & ULPCB_FLAG_NO_APPEND) ||
(copy = mss - skb->len) <= 0) {
new_buf:
if (!tcp_memory_free(sk))
goto wait_for_sndbuf;
skb = alloc_tx_skb(sk, 0);
if (!skb)
goto wait_for_memory;
copy = mss;
}
if (copy > size)
copy = size;
i = skb_shinfo(skb)->nr_frags;
if (skb_can_coalesce(skb, i, page, offset)) {
skb_shinfo(skb)->frags[i - 1].size += copy;
} else if (i < MAX_SKB_FRAGS) {
get_page(page);
skb_fill_page_desc(skb, i, page, offset, copy);
} else {
tx_skb_finalize(skb);
push_frames_if_head(sk);
goto new_buf;
}
skb->len += copy;
if (skb->len == mss)
tx_skb_finalize(skb);
skb->data_len += copy;
skb->truesize += copy;
sk->sk_wmem_queued += copy;
tp->write_seq += copy;
copied += copy;
offset += copy;
size -= copy;
if (!size)
break;
if (unlikely(ULP_SKB_CB(skb)->flags & ULPCB_FLAG_NO_APPEND))
push_frames_if_head(sk);
continue;
wait_for_sndbuf:
set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
wait_for_memory:
if ((err = wait_for_mem(sk, &timeo)) != 0)
goto do_error;
}
out:
cplios_reset_flag(sk, CPLIOS_TX_MORE_DATA);
if (copied)
tcp_push(sk, flags);
done:
release_sock(sk);
return copied;
do_error:
if (copied)
goto out;
out_err:
cplios_reset_flag(sk, CPLIOS_TX_MORE_DATA);
copied = sk_stream_error(sk, flags, err);
goto done;
}
/*
* Add a list of skbs to a socket send queue. This interface is intended for
* use by in-kernel ULPs. The skbs must comply with the max size limit of the
* device and have a headroom of at least TX_HEADER_LEN bytes.
*/
int t3_sendskb(struct sock *sk, struct sk_buff *skb, int flags)
{
struct sk_buff *next;
struct tcp_sock *tp = tcp_sk(sk);
int mss, err, copied = 0;
long timeo;
lock_sock(sk);
timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
if (!sk_in_state(sk, TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) &&
(err = sk_stream_wait_connect(sk, &timeo)) != 0)
goto out_err;
clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
err = -EPIPE;
if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
goto out_err;
/*
* We check for send buffer space once for the whole skb list. It
* isn't critical if we end up overrunning the send buffer limit as we
* do not allocate any new memory. The benefit is we don't need to
* perform intermediate packet pushes.
*/
while (!tcp_memory_free(sk)) {
set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
if ((err = wait_for_mem(sk, &timeo)) != 0)
goto out_err;
}
mss = TOM_TUNABLE(CPL_IO_STATE(sk)->toedev, mss);
while (skb) {
if (unlikely(skb_headroom(skb) < TX_HEADER_LEN)) {
err = -EINVAL;
goto out_err;
}
if (unlikely(skb->len > mss)) {
err = -EMSGSIZE;
goto out_err;
}
next = skb->next;
skb->next = NULL;
skb_entail(sk, skb, ULPCB_FLAG_NO_APPEND | ULPCB_FLAG_NEED_HDR);
copied += skb->len;
tp->write_seq += skb->len + ulp_extra_len(skb);
skb = next;
}
done:
if (likely(skb_queue_len(&sk->sk_write_queue)))
t3_push_frames(sk, 1);
release_sock(sk);
return copied;
out_err:
if (copied == 0)
copied = sk_stream_error(sk, flags, err);
goto done;
}
EXPORT_SYMBOL(t3_sendskb);
/*
* Add data to an sk_buff page fragment.
*/
static int tcp_copy_to_page(struct sock *sk, const void __user *from,
struct sk_buff *skb, struct page *page, int off,
int copy)
{
if (copy_from_user(page_address(page) + off, from, copy))
return -EFAULT;
skb->len += copy;
skb->data_len += copy;
skb->truesize += copy;
sk->sk_wmem_queued += copy;
return 0;
}
/*
* Add data to the main data portion of an sk_buff.
*/
static inline int ch_skb_add_data(struct sk_buff *skb, const void __user *from,
unsigned int copy)
{
int orig_len = skb->len;
if (!copy_from_user(skb_put(skb, copy), from, copy))
return 0;
__skb_trim(skb, orig_len);
return -EFAULT;
}
/*
* Calculate the size for a new send sk_buff. We use the maximum size so we can
* pack lots of data into it, unless we plan to send it immediately, in which
* case we size it more tightly.
*
* Note: we don't bother compensating for MSS < PAGE_SIZE because it doesn't
* arise in normal cases and when it does we are just wasting memory.
*/
static inline int select_size(struct sock *sk, int io_len, int flags)
{
const int pgbreak = SKB_MAX_HEAD(TX_HEADER_LEN);
/*
* If the data wouldn't fit in the main body anyway, put only the
* header in the main body so it can use immediate data and place all
* the payload in page fragments.
*/
if (io_len > pgbreak)
return 0;
/*
* If we will be accumulating payload get a large main body.
*/
if (!send_should_push(sk, flags))
return pgbreak;
return io_len;
}
#if defined(CONFIG_T3_ZCOPY_SENDMSG) || defined(CONFIG_T3_ZCOPY_SENDMSG_MODULE)
/*
* ZCOPY_SENDMSG maps (if necessary) and pins a user space buffer instead of
* copying the payload from user- to kernel space. In normal mode of
* operation, we block until the DMA has completed and it is safe to return
* (considering that the user might modify the buffer). Since host bus
* performance (PCI-E x8 and PCI-X 2.0) now exceeds the wire speed, this
* actually works pretty well. In addition, I added some tunables to do a
* hybrid scheme where the end of the user space buffer is copied (at the same
* time the beginning of the buffer is DMAed). The mechanism provides enough
* pipelining to achieve 10Gbps line rate on a single connection with moderate
* CPU utilization.
*
* Now, the exception (which as usual makes up for most of the code and
* complexity): while unlikely, there are scenarios where we want to return
* before the DMA completes (e.g. the DMA might not complete if a connection
* doesn't drain (somebody unplugged the cable *&%!), or we want to return for
* another reason, e.g. because we got a signal. In that case, we must make
* sure that the user doesn't modify the buffer before the DMA has
* completed... yes, you guessed correctly, by remapping the buffer as COW and
* yes, that has some cost associated with it starting with mandatory TLB
* flush and potential page fault and buffer copy (what we wanted to avoid).
* However, it is NOT THE NORMAL case and rare!
*
* Written by Felix Marti (felix@chelsio.com)
*/
#include <asm/pgtable.h>
#ifndef LINUX_2_4
#include <asm/tlbflush.h>
#endif /* LINUX_2_4 */
#include <linux/hugetlb.h>
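/* Debug printout hook for the zero-copy path, compiled out by default. */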
#define ZCOPY_PRT(m)
/*
* zcopy_to_skb() maps the user space buffer (from/size) and fills in the skb
* page descriptors to point to the buffer.
*/
static int zcopy_to_skb(struct sock *sk, struct sk_buff *skb,
unsigned long from, size_t size)
{
struct tcp_sock *tp = tcp_sk(sk);
struct page *pages[MAX_SKB_FRAGS];
struct vm_area_struct *vmas[MAX_SKB_FRAGS];
unsigned int off = from & (PAGE_SIZE - 1);
int i, res, numpages = (size + off + (PAGE_SIZE - 1)) / PAGE_SIZE;
unsigned int copied = 0;
int err = 0;
ZCOPY_PRT(("zcopy_to_skb: TID %u from %lx size %lu skb %p\n",
CPL_IO_STATE(sk)->tid, from, size, skb));
BUG_ON(numpages > MAX_SKB_FRAGS);
down_read(&current->mm->mmap_sem);
res = get_user_pages(current, current->mm,
from & PAGE_MASK, numpages,
0, 0,
pages, vmas);
up_read(&current->mm->mmap_sem);
if (unlikely(res != numpages)) {
ZCOPY_PRT(("zcopy_to_skb: get_user_pages() returned %u instead"
" of %u pages\n", res, numpages));
if (res < 0) {
err = res;
res = 0;
} else
err = -EFAULT;
goto no_zcopy;
}
/*
* Scan through all of the returned pages to make sure they are
* appropriate zero copy candidates. If any of the pages are
* problematic or if the address range crosses a VMA boundary, we just
* reject the zero copy effort.
*/
for (i = 0; i < numpages; i++)
if (!zcopy_vma(vmas[i]) || vmas[i] != vmas[0]) {
err = -EINVAL;
goto no_zcopy;
}
for (i = 0; i < numpages; i++) {
unsigned int page_off, page_size;
if (i == 0) {
page_off = off;
page_size = ((numpages == 1) ? size : PAGE_SIZE - off);
} else if (i == (numpages - 1)) {
page_off = 0;
page_size = size;
} else {
page_off = 0;
page_size = PAGE_SIZE;
}
BUG_ON(vmas[i] == 0 || pages[i] == 0);
skb_fill_page_desc(skb, i, pages[i], page_off, page_size);
copied += page_size;
size -= page_size;
ZCOPY_PRT(("zcopy_to_skb: p[%d] %p off %d size %d vma %p\n",
i, pages[i], 0, page_size, vmas[i]));
}
BUG_ON(size);
skb->len += copied;
skb->data_len += copied;
skb->truesize += copied;
atomic_add(copied, &sk->sk_omem_alloc);
sk->sk_wmem_queued += copied;
tp->write_seq += copied;
skb_vaddr_set(skb, from);
return err;
no_zcopy:
for (i = 0; i < res; i++)
page_cache_release(pages[i]);
return err;
}
/*
* If we're on an older kernel, we don't have the pte_offset_map_lock() macro
* available to prevent race conditions accessing PTEs in an atomic fashion.
* But on newer kernels, we use that mechanism exclusively and don't take the
* memory map spin lock ... This code is modeled on the mprotect() code
* which does exactly what we want but isn't exported from the kernel.
*/
#if defined(pte_offset_map_lock)
# define mprotect_page_table_lock(mm) \
do { } while (0)
# define mprotect_page_table_unlock(mm) \
do { } while (0)
#else
# define mprotect_page_table_lock(mm) \
do { spin_lock(&(mm)->page_table_lock); } while (0)
# define mprotect_page_table_unlock(mm) \
do { spin_unlock(&(mm)->page_table_lock); } while (0)
# define pgd_none_or_clear_bad(pgd) \
(pgd_none(*(pgd)) || unlikely(pgd_bad(*(pgd))))
# define pud_none_or_clear_bad(pud) \
(pud_none(*(pud)) || unlikely(pud_bad(*(pud))))
# define pmd_none_or_clear_bad(pmd) \
(pmd_none(*(pmd)) || unlikely(pmd_bad(*(pmd))))
# define pte_offset_map_lock(mm, pmd, address, ptl) \
pte_offset_map(pmd, address)
# define pte_unmap_unlock(pte, ptl) \
pte_unmap(pte)
#endif /* !defined(pte_offset_map_lock) */
/*
* We have an skb which has outstanding zero-copy DMA references to user pages
* but we need to return to the user. This sometimes happens when an
* application sets up a timer or the user types a ^C. Since the DMA hasn't
* been acknowledged yet, we need to mark all of the pages referenced by the
* skb as copy-on-write in order to fulfill standard UNIX write() semantics.
* (I.e. writes to application memory buffers after a write() call returns cannot
* affect the actual write results.)
*/
static int zcopy_skb_dma_pending(struct sock *sk, struct sk_buff *skb)
{
struct vm_area_struct *vma;
unsigned int wr_hdr_len =
ULP_SKB_CB(skb)->flags & ULPCB_FLAG_NEED_HDR ?
0 : sizeof (struct tx_data_wr);
unsigned int len = skb->len - wr_hdr_len;
unsigned long address = skb_vaddr(skb);
unsigned long end = PAGE_ALIGN(address + len);
int i;
address &= PAGE_MASK;
down_write(&current->mm->mmap_sem);
vma = find_vma(current->mm, skb_vaddr(skb));
#if defined(CONFIG_T3_ZCOPY_HUGEPAGES) && defined(CONFIG_HUGETLB_PAGE)
if (is_vm_hugetlb_page(vma)) {
pte_t *ptep = t3_huge_pte_offset(current->mm, vma->vm_start);
if (ptep) {
spin_lock(&current->mm->page_table_lock);
if (!pte_none(*ptep)) {
t3_ptep_set_wrprotect(current->mm, address, ptep);
pte_unmap(ptep);
}
spin_unlock(&current->mm->page_table_lock);
}
for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
atomic_inc(&frag->page->_mapcount);
}
} else
#endif
{
mprotect_page_table_lock(current->mm);
for (i = 0; i < skb_shinfo(skb)->nr_frags; i++, address += PAGE_SIZE) {
pgd_t *pgd;
pud_t *pud;
pmd_t *pmd;
skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
/* make sure the page doesn't go away */
atomic_inc(&frag->page->_mapcount);
/*
* Dive down the PGD/PUD/PMD/PTE hierarchy for the page and
* mark it COW. When we have a ZERO_PAGE() mapping, some
* portions of the hierarchy may be missing. Since the
* ZERO_PAGE() is already COW and can never change, there's
* nothing we need to do.
*/
if ((pgd = pgd_offset(current->mm, address),
!(pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))) &&
(pud = pud_offset(pgd, address),
!(pud_none(*pud) || unlikely(pud_bad(*pud)))) &&
(pmd = pmd_offset(pud, address),
!(pmd_none(*pmd) || unlikely(pmd_bad(*pmd))))) {
spinlock_t *ptl __attribute__((unused));
pte_t *pte = pte_offset_map_lock(current->mm, pmd,
address, &ptl);
if (pte != NULL && pte_present(*pte))
t3_ptep_set_wrprotect(current->mm, address, pte);
pte_unmap_unlock(pte, ptl);
}
}
mprotect_page_table_unlock(current->mm);
}
t3_flush_tlb_range(vma, skb_vaddr(skb), end);
up_write(&current->mm->mmap_sem);
ULP_SKB_CB(skb)->flags &= ~ULPCB_FLAG_ZCOPY;
ULP_SKB_CB(skb)->flags |= ULPCB_FLAG_ZCOPY_COW;
atomic_sub(len, &sk->sk_omem_alloc);
#ifdef T3_TRACE
T3_TRACE5(TIDTB(sk),
"zcopy_skb_dma_pending: address 0x%lx len %u mm %p "
"mm_count %d need_hdr %d",
address, len, current->mm,
atomic_read(&current->mm->mm_count),
ULP_SKB_CB(skb)->flags & ULPCB_FLAG_NEED_HDR);
#endif
return 0;
}
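/*
* The DMA for a COW-protected zero-copy skb has completed: drop the page
* map-count references taken in zcopy_skb_dma_pending() and clear the COW
* flag.
*/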
static void zcopy_skb_dma_complete(struct sock *sk, struct sk_buff *skb)
{
int i;
for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
atomic_dec(&frag->page->_mapcount);
}
ULP_SKB_CB(skb)->flags &= ~ULPCB_FLAG_ZCOPY_COW;
}
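/*
* Walk every skb with outstanding zero-copy DMA, on both the unacknowledged
* WR queue and the send queue, and mark its pages copy-on-write. Returns
* non-zero if any skb could not be handled.
*/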
static int zcopy_dma_pending(struct sock *sk)
{
struct sk_buff *skb;
int ret = 0;
wr_queue_walk(sk, skb) {
if (ULP_SKB_CB(skb)->flags & ULPCB_FLAG_ZCOPY) {
ret = zcopy_skb_dma_pending(sk, skb);
if (ret)
return ret;
}
}
skb_queue_walk(&sk->sk_write_queue, skb) {
if (ULP_SKB_CB(skb)->flags & ULPCB_FLAG_ZCOPY) {
ret = zcopy_skb_dma_pending(sk, skb);
if (ret)
return ret;
}
}
return 0;
}
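/*
* Release an skb's zero-copy state once the HW is done with it: update the
* DMA-pending and omem accounting, wake any sender blocked in zcopy_wait()
* when all zero-copy data has been accounted for, and finish the COW case if
* the buffer was remapped.
*/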
void t3_zcopy_cleanup_skb(struct sk_buff *skb)
{
struct sock *sk = skb->sk;
unsigned int hdr_len = 0;
if (!(ULP_SKB_CB(skb)->flags & ULPCB_FLAG_NEED_HDR)) {
struct tom_data *d = TOM_DATA(CPL_IO_STATE(sk)->toedev);
hdr_len = sizeof (struct tx_data_wr);
atomic_sub(skb->len - hdr_len, &d->tx_dma_pending);
}
if (ULP_SKB_CB(skb)->flags & ULPCB_FLAG_ZCOPY) {
ULP_SKB_CB(skb)->flags &= ~ULPCB_FLAG_ZCOPY;
atomic_sub(skb->len - hdr_len, &sk->sk_omem_alloc);
if (!atomic_read(&sk->sk_omem_alloc))
__wake_up(sk_sleep(sk), TASK_INTERRUPTIBLE, 0, NULL);
} else if (ULP_SKB_CB(skb)->flags & ULPCB_FLAG_ZCOPY_COW)
zcopy_skb_dma_complete(sk, skb);
skb_vaddr_set(skb, 0);
}
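/*
* Wait for all outstanding zero-copy send DMA on this socket to finish. If
* the tunable permits returning with DMA still pending and we are interrupted
* or time out, fall back to marking the pending buffers copy-on-write.
*/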
static void zcopy_wait(struct sock *sk, long timeout)
{
#ifdef LINUX_2_4
DECLARE_WAITQUEUE(wait, current);
#else
DEFINE_WAIT(wait);
#endif /* LINUX_2_4 */
timeout = max_t(long, HZ / 2, timeout);
#ifdef LINUX_2_4
add_wait_queue(sk->sleep, &wait);
#endif /* LINUX_2_4 */
while (atomic_read(&sk->sk_omem_alloc) && !sk->sk_err) {
#ifdef LINUX_2_4
set_current_state(TASK_INTERRUPTIBLE);
#else
prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
#endif /* LINUX_2_4 */
if (TOM_TUNABLE(CPL_IO_STATE(sk)->toedev,
zcopy_sendmsg_ret_pending_dma)) {
if (signal_pending(current) || !timeout) {
#ifdef T3_TRACE
T3_TRACE4(TIDTB(sk), "zcopy_wait: sk_err %d "
"signal_pending 0x%x timeout %ld "
"sk_omem_alloc %d", sk->sk_err,
signal_pending(current), timeout,
atomic_read(&sk->sk_omem_alloc));
#endif
if (!zcopy_dma_pending(sk)) {
BUG_ON(atomic_read(&sk->sk_omem_alloc));
break;
}
}
} else if (!timeout)
timeout = HZ / 2;
#ifdef T3_TRACE
T3_TRACE1(TIDTB(sk), "zcopy_wait: GTS sk_omem_alloc %d",
atomic_read(&sk->sk_omem_alloc));
#endif
release_sock(sk);
timeout = schedule_timeout(timeout);
lock_sock(sk);
}
#ifdef LINUX_2_4
current->state = TASK_RUNNING;
remove_wait_queue(sk->sleep, &wait);
#else
finish_wait(sk_sleep(sk), &wait);
#endif /* LINUX_2_4 */
}
#endif
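/*
* sendmsg() for offloaded sockets. Copies user data into skbs on the send
* queue, or zero-copies it by pinning user pages when the ZCOPY_SENDMSG path
* is enabled and the request is large enough, and pushes the data to the TOE.
*/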
#ifdef LINUX_2_4
static int chelsio_sendmsg(struct sock *sk,
struct msghdr *msg, int size)
#else
static int chelsio_sendmsg(struct kiocb *iocb, struct sock *sk,
struct msghdr *msg, size_t size)
#endif /* LINUX_2_4 */
{
long timeo;
struct iovec *iov;
struct sk_buff *skb = NULL;
struct tcp_sock *tp = tcp_sk(sk);
struct toedev *tdev = CPL_IO_STATE(sk)->toedev;
int mss, iovlen, flags, err, copied = 0, zcopy_size = 0, zcopied = 0;
#if defined(CONFIG_T3_ZCOPY_SENDMSG) || defined(CONFIG_T3_ZCOPY_SENDMSG_MODULE)
struct tom_data *d;
int omem_alloc;
#endif
lock_sock(sk);
#if defined(CONFIG_T3_ZCOPY_SENDMSG) || defined(CONFIG_T3_ZCOPY_SENDMSG_MODULE)
omem_alloc = atomic_read(&sk->sk_omem_alloc);
atomic_set(&sk->sk_omem_alloc, 0);
#endif
flags = msg->msg_flags;
timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
if (!sk_in_state(sk, TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) &&
(err = sk_stream_wait_connect(sk, &timeo)) != 0)
goto out_err;
/* This should be in poll */
clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
err = -EPIPE;
if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
goto out_err;
mss = TOM_TUNABLE(tdev, mss);
#if defined(CONFIG_T3_ZCOPY_SENDMSG) || defined(CONFIG_T3_ZCOPY_SENDMSG_MODULE)
d = TOM_DATA(tdev);
if (size >= TOM_TUNABLE(tdev, zcopy_sendmsg_partial_thres) &&
!corked(tp, flags) && !segment_eq(get_fs(), KERNEL_DS)) {
int pending = atomic_read(&d->tx_dma_pending);
int thres = TOM_TUNABLE(tdev, zcopy_sendmsg_thres);
if (pending >= thres /*|| size >= thres*/)
zcopy_size = size -
TOM_TUNABLE(tdev, zcopy_sendmsg_copy);
else
zcopy_size = size -
TOM_TUNABLE(tdev, zcopy_sendmsg_partial_copy);
}
/* For non-blocking I/O we don't want to exceed the send buffer at all,
* since doing so could cause delays in the zcopy path.
*/
if ((zcopy_size > 0) && (flags & MSG_DONTWAIT)) {
int rem = sk->sk_sndbuf - sk->sk_wmem_queued;
if (rem <= 0) {
err = -EAGAIN;
goto do_error;
} else if (size > rem)
size = rem;
}
#endif
cplios_set_flag(sk, CPLIOS_TX_MORE_DATA);
for (iovlen = msg->msg_iovlen, iov = msg->msg_iov; iovlen--; iov++) {
int seglen = min(iov->iov_len, size);
unsigned char __user *from = iov->iov_base;
while (seglen > 0) {
int copy, tailroom;
skb = skb_peek_tail(&sk->sk_write_queue);
if (!skb || zcopy_size > 0 ||
(ULP_SKB_CB(skb)->flags & ULPCB_FLAG_NO_APPEND) ||
(copy = mss - skb->len) <= 0) {
new_buf:
/*
* If we're shy on configured allowable buffer
* space, let's see if we can ship some to the
* card and get our payload queued. Otherwise
* we'll have to wait for buffer space to
* become available ...
*/
if (skb) {
tx_skb_finalize(skb);
push_frames_if_head(sk);
}
if (!tcp_memory_free(sk))
goto wait_for_sndbuf;
skb = alloc_tx_skb(sk, select_size(sk, size,
flags));
if (unlikely(!skb))
goto wait_for_memory;
copy = mss;
}
if (copy > seglen)
copy = seglen;
#if defined(CONFIG_T3_ZCOPY_SENDMSG) || defined(CONFIG_T3_ZCOPY_SENDMSG_MODULE)
if (zcopy_size > 0) {
copy = min(copy, (int)((MAX_SKB_FRAGS - 2) *
PAGE_SIZE));
copy = min(copy, zcopy_size);
err = zcopy_to_skb(sk, skb,
(unsigned long)from, copy);
if (err) {
if (err == -EFAULT)
goto do_fault;
/*
* The zcopy failed -- probably
* because the buffer is shared or
* spans multiple VMAs: revert to
* non-zcopy mode. Disable zcopy and
* try again with the normal path ...
*/
zcopy_size = 0;
continue;
}
from += copy;
copied += copy;
zcopied += copy;
seglen -= copy;
size -= copy;
zcopy_size -= copy;
tx_skb_finalize(skb);
ULP_SKB_CB(skb)->flags |=
ULPCB_FLAG_COMPL | ULPCB_FLAG_ZCOPY;
if (!size) {
cplios_reset_flag(sk, CPLIOS_TX_MORE_DATA);
t3_push_frames(sk, 1);
goto done;
} else {
t3_push_frames(sk, 1);
continue;
}
}
#endif
/*
* There are two ways for an skb to become full:
* a) skb->len == mss
* b) the skb's max capacity is reached
*/
tailroom = skb_tailroom(skb);
if (tailroom >= copy) {
err = ch_skb_add_data(skb, from, copy);
if (err)
goto do_fault;
} else {
int i = skb_shinfo(skb)->nr_frags;
struct page *page = TCP_PAGE(sk);
int merge, off = TCP_OFF(sk);
if (off < PAGE_SIZE &&
skb_can_coalesce(skb, i, page, off)) {
merge = 1;
goto copy;
}
merge = 0;
if (i == MAX_SKB_FRAGS)
goto new_buf;
if (page && off == PAGE_SIZE) {
put_page(page);
TCP_PAGE(sk) = page = NULL;
}
if (!page) {
page = alloc_pages(sk->sk_allocation,
0);
if (!page)
goto wait_for_memory;
off = 0;
}
copy:
if (copy > PAGE_SIZE - off)
copy = PAGE_SIZE - off;
err = tcp_copy_to_page(sk, from, skb, page,
off, copy);
if (unlikely(err)) {
/*
* If the page was new, give it to the
* socket so it does not get leaked.
*/
if (!TCP_PAGE(sk)) {
TCP_PAGE(sk) = page;
TCP_OFF(sk) = 0;
}
goto do_error;
}
/* Update the skb. */
if (merge)
skb_shinfo(skb)->frags[i - 1].size +=
copy;
else {
skb_fill_page_desc(skb, i, page, off,
copy);
if (off + copy < PAGE_SIZE) {
/* space left, keep page */
get_page(page);
TCP_PAGE(sk) = page;
} else
TCP_PAGE(sk) = NULL;
}
TCP_OFF(sk) = off + copy;
}
if (unlikely(skb->len == mss))
tx_skb_finalize(skb);
tp->write_seq += copy;
from += copy;
copied += copy;
seglen -= copy;
size -= copy;
if (size == 0)
goto out;
if (ULP_SKB_CB(skb)->flags & ULPCB_FLAG_NO_APPEND)
push_frames_if_head(sk);
continue;
wait_for_sndbuf:
set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
wait_for_memory:
if ((err = wait_for_mem(sk, &timeo)) != 0)
goto do_error;
}
}
out:
cplios_reset_flag(sk, CPLIOS_TX_MORE_DATA);
if (copied != zcopied) {
if (zcopied && skb) {
tx_skb_finalize(skb);
t3_push_frames(sk, 1);
} else {
tcp_push(sk, flags);
}
}
done:
#if defined(CONFIG_T3_ZCOPY_SENDMSG) || defined(CONFIG_T3_ZCOPY_SENDMSG_MODULE)
if (zcopied > 0)
zcopy_wait(sk, timeo);
atomic_set(&sk->sk_omem_alloc, omem_alloc);
#endif
release_sock(sk);
return copied;
do_fault:
if (!skb->len) {
__skb_unlink(skb, &sk->sk_write_queue);
// tcp_free_skb(sk, skb);
sk->sk_wmem_queued -= skb->truesize;
__kfree_skb(skb);
}
do_error:
if (copied)
goto out;
out_err:
cplios_reset_flag(sk, CPLIOS_TX_MORE_DATA);
copied = sk_stream_error(sk, flags, err);
goto done;
}
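/*
* Returns true if the HW delayed-ACK mode may be adjusted for this
* connection's ULP mode and adapter type.
*/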
static inline int is_delack_mode_valid(struct toedev *dev, struct sock *sk)
{
struct cpl_io_state *cplios = CPL_IO_STATE(sk);
return cplios->ulp_mode == ULP_MODE_NONE ||
(cplios->ulp_mode == ULP_MODE_TCPDDP &&
dev->ttid >= TOE_ID_CHELSIO_T3);
}
/*
* Set of states for which we should return RX credits.
*/
#define CREDIT_RETURN_STATE (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2)
/*
* Called after some received data has been read. It returns RX credits
* to the HW for the amount of data processed.
*/
void t3_cleanup_rbuf(struct sock *sk, int copied, int request)
{
struct cpl_io_state *cplios = CPL_IO_STATE(sk);
struct tcp_sock *tp;
struct toedev *dev;
int dack_mode, must_send;
u32 thres, credits, dack = 0;
unsigned int req_win = (request < (M_TCB_RX_DDP_BUF0_LEN >> 1)) ? request : (M_TCB_RX_DDP_BUF0_LEN >> 1);
if (!sk_in_state(sk, CREDIT_RETURN_STATE))
return;
t3_select_window(sk, req_win + 32768);
tp = tcp_sk(sk);
credits = tp->copied_seq - tp->rcv_wup;
if (unlikely(!credits))
return;
dev = cplios->toedev;
thres = TOM_TUNABLE(dev, rx_credit_thres);
if (unlikely(thres == 0))
return;
if (is_delack_mode_valid(dev, sk)) {
dack_mode = t3_select_delack(sk);
if (unlikely(dack_mode != cplios->delack_mode)) {
u32 r = tp->rcv_nxt - cplios->delack_seq;
if (r >= tp->rcv_wnd || r >= 16 * MSS_CLAMP(tp))
dack = F_RX_DACK_CHANGE |
V_RX_DACK_MODE(dack_mode);
}
} else
dack = F_RX_DACK_CHANGE | V_RX_DACK_MODE(1);
/*
* For coalescing to work effectively ensure the receive window has
* at least 16KB left.
*/
must_send = credits + 16384 >= tp->rcv_wnd;
if (must_send || credits >= thres)
tp->rcv_wup += t3_send_rx_credits(sk, credits, dack, must_send);
}
EXPORT_SYMBOL(t3_cleanup_rbuf);
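/*
* Locate the skb in the receive queue that contains sequence number 'seq' and
* return the offset of that byte within the skb.
*/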
static inline struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
{
struct sk_buff *skb;
skb_queue_walk(&sk->sk_receive_queue, skb) {
u32 offset = seq - ULP_SKB_CB(skb)->seq;
if (offset < skb->len) {
*off = offset;
return skb;
}
}
return NULL;
}
/*
* Returns whether a connection should enable DDP. This happens when all of
* the following conditions are met:
* - the connection's ULP mode is DDP
* - DDP is not already enabled
* - the last receive was above the DDP threshold
* - receive buffers are in user space
* - receive side isn't shut down (handled by caller)
* - the connection's receive window is big enough so that sizable buffers
* can be posted without closing the window in the middle of DDP (checked
* when the connection is offloaded)
*/
static int sk_should_ddp(const struct sock *sk, const struct tcp_sock *tp,
int last_recv_len)
{
struct cpl_io_state *cplios = CPL_IO_STATE(sk);
return cplios->ulp_mode == ULP_MODE_TCPDDP && !DDP_STATE(sk)->ddp_setup &&
last_recv_len > TOM_TUNABLE(cplios->toedev, ddp_thres) &&
(!segment_eq(get_fs(), KERNEL_DS) || TOM_TUNABLE(cplios->toedev, kseg_ddp)) &&
tcp_sk(sk)->rcv_wnd >
(TOM_TUNABLE(cplios->toedev, ddp_copy_limit) +
DDP_RSVD_WIN);
}
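/*
* Returns true if an skb's payload was placed by DDP, i.e., it carries a
* gather list instead of data in its body.
*/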
static inline int is_ddp(const struct sk_buff *skb)
{
return skb_gl(skb) != NULL;
}
static inline int is_ddp_psh(const struct sk_buff *skb)
{
return is_ddp(skb) && (skb_ulp_ddp_flags(skb) & DDP_BF_PSH);
}
/*
* Copy data from an sk_buff to an iovec. Deals with RX_DATA, which carries the
* data in the sk_buff body, and with RX_DATA_DDP, which places the data in a
* DDP buffer.
*/
static inline int copy_data(const struct sk_buff *skb, int offset,
struct iovec *to, int len)
{
if (likely(!is_ddp(skb))) /* RX_DATA */
return skb_copy_datagram_iovec(skb, offset, to, len);
if (likely(skb_ulp_ddp_flags(skb) & DDP_BF_NOCOPY)) { /* user DDP */
to->iov_len -= len;
to->iov_base += len;
return 0;
}
return t3_ddp_copy(skb, offset, to, len); /* kernel DDP */
}
/*
* Peek at data in a socket's receive buffer.
*/
#ifdef LINUX_2_4
static int peekmsg(struct sock *sk, struct msghdr *msg,
int len, int nonblock, int flags)
#else
static int peekmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
size_t len, int nonblock, int flags)
#endif /* LINUX_2_4 */
{
long timeo;
struct sk_buff *skb;
struct tcp_sock *tp = tcp_sk(sk);
int copied = 0;
u32 peek_seq, offset;
size_t avail; /* amount of available data in current skb */
lock_sock(sk);
timeo = sock_rcvtimeo(sk, nonblock);
peek_seq = tp->copied_seq;
do {
if (unlikely(tp->urg_data && tp->urg_seq == peek_seq)) {
if (copied)
break;
if (signal_pending(current)) {
copied = timeo ? sock_intr_errno(timeo) :
-EAGAIN;
break;
}
}
skb_queue_walk(&sk->sk_receive_queue, skb) {
offset = peek_seq - ULP_SKB_CB(skb)->seq;
if (offset < skb->len)
goto found_ok_skb;
}
/* empty receive queue */
if (copied)
break;
if (sock_flag(sk, SOCK_DONE))
break;
if (sk->sk_err) {
copied = sock_error(sk);
break;
}
if (sk->sk_shutdown & RCV_SHUTDOWN)
break;
if (sk->sk_state == TCP_CLOSE) {
copied = -ENOTCONN;
break;
}
if (!timeo) {
copied = -EAGAIN;
break;
}
if (signal_pending(current)) {
copied = sock_intr_errno(timeo);
break;
}
if (sk->sk_backlog.tail) {
/* Do not sleep, just process backlog. */
release_sock(sk);
lock_sock(sk);
} else
sk_wait_data(sk, &timeo);
if (unlikely(peek_seq != tp->copied_seq)) {
if (net_ratelimit())
printk(KERN_DEBUG "TCP(%s:%d): Application "
"bug, race in MSG_PEEK.\n",
current->comm, current->pid);
peek_seq = tp->copied_seq;
}
continue;
found_ok_skb:
avail = skb->len - offset;
if (len < avail)
avail = len;
/*
* Do we have urgent data here? We need to skip over the
* urgent byte.
*/
if (unlikely(tp->urg_data)) {
u32 urg_offset = tp->urg_seq - peek_seq;
if (urg_offset < avail) {
/*
* The amount of data we are preparing to copy
* contains urgent data.
*/
if (!urg_offset) { /* First byte is urgent */
if (!sock_flag(sk, SOCK_URGINLINE)) {
peek_seq++;
offset++;
avail--;
if (!avail)
continue;
}
} else {
/* stop short of the urgent data */
avail = urg_offset;
}
}
}
/*
* If MSG_TRUNC is specified the data is discarded.
*/
if (likely(!(flags & MSG_TRUNC)))
if (copy_data(skb, offset, msg->msg_iov, avail)) {
if (!copied)
copied = -EFAULT;
break;
}
peek_seq += avail;
copied += avail;
len -= avail;
} while (len > 0);
release_sock(sk);
return copied;
}
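/*
* Wait for the receive queue to become non-empty. Unlike sk_wait_data() this
* is meant to sleep without being interrupted by signals.
*/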
static int sk_wait_data_uninterruptible(struct sock *sk)
{
int rc;
long timeo = MAX_SCHEDULE_TIMEOUT;
#ifdef LINUX_2_4
DECLARE_WAITQUEUE(wait, current);
add_wait_queue(sk->sleep, &wait);
set_current_state(TASK_INTERRUPTIBLE);
set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
rc = sk_wait_event(sk, &timeo, !skb_queue_empty(&sk->sk_receive_queue));
clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
current->state = TASK_RUNNING;
remove_wait_queue(sk->sleep, &wait);
return rc;
#else
DEFINE_WAIT(wait);
prepare_to_wait(sk_sleep(sk), &wait, TASK_UNINTERRUPTIBLE);
set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
rc = sk_wait_event(sk, &timeo, !skb_queue_empty(&sk->sk_receive_queue));
clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
finish_wait(sk_sleep(sk), &wait);
return rc;
#endif /* LINUX_2_4 */
}
/*
* Called after a user buffer is posted to await DDP completion. The waiting
* mode depends on the receive flags, which in turn determine the HW DDP flags.
*
* - Without MSG_WAITALL we set up the DDP buffer with non-zero initial offset
* and enable the HW timeout. In this case we sleep uninterruptibly since we
* know the buffer will complete or time out in reasonable time.
* - With MSG_WAITALL the HW timeout is initially disabled. If a signal arrives
* and the DDP is still ongoing we turn on the timer and disable
* no-invalidate, then sleep uninterruptibly until the buffer completes.
*/
static inline int await_ddp_completion(struct sock *sk, int rcv_flags,
long *timeo)
{
if (unlikely(rcv_flags & MSG_WAITALL)) {
sk_wait_data(sk, timeo);
if (sk->sk_err || sk->sk_state == TCP_CLOSE ||
(sk->sk_shutdown & RCV_SHUTDOWN))
return 0;
/* Got signal or timed out */
t3_set_tcb_field(sk, W_TCB_RX_DDP_FLAGS,
V_TF_DDP_PSH_NO_INVALIDATE1(1) |
V_TF_DDP_PUSH_DISABLE_1(1), 0);
}
return sk_wait_data_uninterruptible(sk);
}
#if 0
/* Controls whether to post DDP kernel and user buffers in parallel. */
#define PARALLEL_DDP_BUFS 1
/* Controls whether we post the DDP user buffer before copying the kernel buf */
#define EARLY_USERBUF_POST 1
/*
* Receive data from a socket into an application buffer.
*/
#ifdef LINUX_2_4
static int chelsio_recvmsg(struct sock *sk,
struct msghdr *msg, int len, int nonblock,
int flags, int *addr_len)
#else
static int chelsio_recvmsg(struct kiocb *iocb, struct sock *sk,
struct msghdr *msg, size_t len, int nonblock,
int flags, int *addr_len)
#endif /* LINUX_2_4 */
{
struct tcp_sock *tp = tcp_sk(sk);
int copied = 0, buffers_freed = 0, kern_ddp_done = 0;
unsigned long avail; /* amount of available data in current skb */
int target; /* Read at least this many bytes */
long timeo;
int user_ddp_ok, user_ddp_pending = 0;
/* Urgent data is handled by the SW stack's receive */
#ifdef LINUX_2_4
if (unlikely(flags & MSG_OOB))
return tcp_prot.recvmsg(sk, msg, len, nonblock, flags,
addr_len);
if (unlikely(flags & MSG_PEEK))
return peekmsg(sk, msg, len, nonblock, flags);
#else
if (unlikely(flags & MSG_OOB))
return tcp_prot.recvmsg(iocb, sk, msg, len, nonblock, flags,
addr_len);
if (unlikely(flags & MSG_PEEK))
return peekmsg(iocb, sk, msg, len, nonblock, flags);
#endif /* LINUX_2_4 */
/*
* Note: the code below depends on kern_ddp_done and user_ddp_ok
* having only values 0 and 1, or more precisely on the two variables
* having values either 0 or odd. This is due to the logical &s below.
* It also depends on DDP buffer completions reported in bit 0 of skb
* flags.
*/
lock_sock(sk);
timeo = sock_rcvtimeo(sk, nonblock);
target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
user_ddp_ok = msg->msg_iovlen == 1;
do {
struct sk_buff *skb;
u32 offset;
if (unlikely(tp->urg_data && tp->urg_seq == tp->copied_seq)) {
if (copied)
break;
if (signal_pending(current)) {
copied = timeo ? sock_intr_errno(timeo) :
-EAGAIN;
break;
}
}
skb = skb_peek(&sk->sk_receive_queue);
if (skb)
goto found_ok_skb;
/* empty receive queue */
if (copied >= target && !sk->sk_backlog.tail &&
!(kern_ddp_done & user_ddp_ok))
break;
if (copied) {
if (sk->sk_err || sk->sk_state == TCP_CLOSE ||
(sk->sk_shutdown & RCV_SHUTDOWN) || !timeo ||
signal_pending(current))
break;
} else {
if (sock_flag(sk, SOCK_DONE))
break;
if (sk->sk_err) {
copied = sock_error(sk);
break;
}
if (sk->sk_shutdown & RCV_SHUTDOWN)
break;
if (sk->sk_state == TCP_CLOSE) {
copied = -ENOTCONN; /* SOCK_DONE is off here */
break;
}
if (!timeo) {
copied = -EAGAIN;
break;
}
if (signal_pending(current)) {
copied = sock_intr_errno(timeo);
break;
}
}
if (sk->sk_backlog.tail && !user_ddp_pending) {
/* Do not sleep, just process backlog. */
release_sock(sk);
lock_sock(sk);
t3_cleanup_rbuf(sk, copied);
continue;
}
if (user_ddp_pending ||
((kern_ddp_done & user_ddp_ok) &&
!t3_post_ubuf(sk, msg->msg_iov, nonblock, flags, 1,
PARALLEL_DDP_BUFS && copied >= target))) {
/* One shot at DDP if we already have enough data */
if (copied >= target) {
#if PARALLEL_DDP_BUFS
# if EARLY_USERBUF_POST
if (user_ddp_pending)
t3_repost_kbuf(sk, 1, 0);
# endif
kern_ddp_done = 0;
#endif
user_ddp_ok = 0;
}
await_ddp_completion(sk, flags, &timeo);
user_ddp_pending = 0;
} else if (copied >= target)
break;
else {
if (kern_ddp_done) {
t3_repost_kbuf(sk, 1, 1);
kern_ddp_done = 0;
} else
t3_cleanup_rbuf(sk, copied);
sk_wait_data(sk, &timeo);
}
continue;
found_ok_skb:
offset = tp->copied_seq - ULP_SKB_CB(skb)->seq;
BUG_ON(offset >= skb->len);
avail = skb->len - offset;
if (len < avail)
avail = len;
/*
* Check if the data we are preparing to copy contains urgent
* data. Either stop short of urgent data or skip it if it's
* first and we are not delivering urgent data inline.
*/
if (unlikely(tp->urg_data)) {
u32 urg_offset = tp->urg_seq - tp->copied_seq;
if (urg_offset < avail) {
if (urg_offset) {
/* stop short of the urgent data */
avail = urg_offset;
} else if (!sock_flag(sk, SOCK_URGINLINE)) {
/* First byte is urgent, skip */
tp->copied_seq++;
offset++;
avail--;
if (!avail)
goto skip_copy;
}
}
}
#if EARLY_USERBUF_POST
if (user_ddp_ok && avail + offset >= skb->len && len > avail &&
(skb_ulp_ddp_flags(skb) & 1)) {
struct iovec iov;
iov.iov_len = msg->msg_iov->iov_len - avail;
iov.iov_base = msg->msg_iov->iov_base + avail;
user_ddp_pending = !t3_post_ubuf(sk, &iov, nonblock,
flags, 1, 0);
}
#endif
/*
* If MSG_TRUNC is specified the data is discarded.
*/
if (likely(!(flags & MSG_TRUNC))) {
if (copy_data(skb, offset, iov, avail)) {
if (!copied)
copied = -EFAULT;
break;
}
} else if (user_ddp_ok) {
/*
* Even though we skipped the copy we need to update
* msg->msg_iov since we may be using it for user DDP.
*/
msg->msg_iov->iov_len -= avail;
msg->msg_iov->iov_base += avail;
}
tp->copied_seq += avail;
copied += avail;
len -= avail;
skip_copy:
if (tp->urg_data && after(tp->copied_seq, tp->urg_seq))
tp->urg_data = 0;
/*
* If the buffer is fully consumed free it. If it's a DDP
* buffer also handle any events it indicates.
*/
if (avail + offset >= skb->len) {
unsigned int fl = skb_ulp_ddp_flags(skb);
tom_eat_skb(sk, skb, 0);
buffers_freed++;
if ((fl & DDP_BF_NOCOPY) && !user_ddp_ok)
break;
/* only DDP completions have bit 0 of ->flags set */
kern_ddp_done |= (fl & 1);
}
} while (len > 0);
/*
* If we can still receive decide what to do in preparation for the
* next receive. Note that RCV_SHUTDOWN is set if the connection
* transitioned to CLOSE but not if it was in that state to begin with.
*/
if (likely(!(sk->sk_shutdown & RCV_SHUTDOWN))) {
if (kern_ddp_done) {
t3_repost_kbuf(sk, 1, 1);
} else if (sk_should_ddp(sk, tp, copied) && !nonblock &&
msg->msg_iovlen == 1)
t3_enter_ddp(sk, TOM_TUNABLE(cplios->toedev,
ddp_copy_limit), 0);
}
if (buffers_freed)
t3_cleanup_rbuf(sk, copied);
release_sock(sk);
return copied;
}
#endif
/*
* Receive data from a socket into an application buffer.
*/
#ifdef LINUX_2_4
static int chelsio_recvmsg(struct sock *sk,
struct msghdr *msg, int len, int nonblock,
int flags, int *addr_len)
#else
static int chelsio_recvmsg(struct kiocb *iocb, struct sock *sk,
struct msghdr *msg, size_t len, int nonblock,
int flags, int *addr_len)
#endif /* LINUX_2_4 */
{
struct cpl_io_state *cplios = CPL_IO_STATE(sk);
struct tcp_sock *tp = tcp_sk(sk);
int copied = 0, buffers_freed = 0;
unsigned long avail; /* amount of available data in current skb */
int target; /* Read at least this many bytes */
int request;
long timeo;
int user_ddp_ok, user_ddp_pending = 0;
struct ddp_state *p;
struct iovec *iov = msg->msg_iov;
/* Urgent data is handled by the SW stack's receive */
#ifdef LINUX_2_4
if (unlikely(flags & MSG_OOB))
return tcp_prot.recvmsg(sk, msg, len, nonblock, flags,
addr_len);
if (unlikely(flags & MSG_PEEK))
return peekmsg(sk, msg, len, nonblock, flags);
#else
if (unlikely(flags & MSG_OOB))
return tcp_prot.recvmsg(iocb, sk, msg, len, nonblock, flags,
addr_len);
if (unlikely(flags & MSG_PEEK))
return peekmsg(iocb, sk, msg, len, nonblock, flags);
#endif /* LINUX_2_4 */
/*
* Note: the code below treats user_ddp_ok and user_ddp_pending as booleans
* with values 0 or 1. It also depends on DDP buffer completions being
* reported in bit 0 of the skb flags.
*/
lock_sock(sk);
timeo = sock_rcvtimeo(sk, nonblock);
target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
request = len;
user_ddp_ok = (target <= iov->iov_len) && !((flags & MSG_WAITALL) && (msg->msg_iovlen > 1));
p = DDP_STATE(sk);
/*
* Check to see if we need to grow receive window.
*/
if (unlikely(cplios_flag(sk, CPLIOS_UPDATE_RCV_WND)))
t3_cleanup_rbuf(sk, copied, request);
if (p->ddp_setup && !p->ubuf_ddp_ready)
user_ddp_ok = 0;
if (p->ddp_setup) {
p->cancel_ubuf = 0;
}
do {
struct sk_buff *skb;
u32 offset;
p = DDP_STATE(sk);
again:
#ifdef T3_TRACE
T3_TRACE4(TIDTB(sk),
"chelsio_recvmsg: loop start len %d copied %d "
"user_ddp_pending %u signal 0x%x",
len, copied, user_ddp_pending,
signal_pending(current));
#endif
if (unlikely(tp->urg_data && tp->urg_seq == tp->copied_seq)) {
if (copied)
break;
if (signal_pending(current)) {
copied = timeo ? sock_intr_errno(timeo) :
-EAGAIN;
break;
}
}
skb = skb_peek(&sk->sk_receive_queue);
if (skb)
goto found_ok_skb;
/*
* The receive queue is empty and here we are asking for more
* data. Before we do anything else, check to see if we have
* data queued up to send and if there's available write
* space. If so, push it along and free up the write space.
* This is a major win for request-response style
* communication patterns and doesn't hurt bulk data
* applications.
*/
if (cplios->wr_avail &&
skb_queue_len(&sk->sk_write_queue) &&
t3_push_frames(sk, cplios->wr_avail == cplios->wr_max))
sk->sk_write_space(sk);
if (copied >= target && !sk->sk_backlog.tail &&
!user_ddp_pending)
break;
if (copied) {
#ifdef T3_TRACE
T3_TRACE5(TIDTB(sk),
"chelsio_recvmsg: copied - break %d %d %d %d %d",
sk->sk_err, sk->sk_state == TCP_CLOSE,
(sk->sk_shutdown & RCV_SHUTDOWN), !timeo,
signal_pending(current));
#endif
if (sk->sk_err || sk->sk_state == TCP_CLOSE ||
(sk->sk_shutdown & RCV_SHUTDOWN) || !timeo ||
signal_pending(current))
break;
} else {
#ifdef T3_TRACE
T3_TRACE5(TIDTB(sk),
"chelsio_recvmsg: !copied - break %d %d %d %d %d",
sock_flag(sk, SOCK_DONE), sk->sk_err,
(sk->sk_shutdown & RCV_SHUTDOWN),
sk->sk_state == TCP_CLOSE, !timeo);
#endif
if (sock_flag(sk, SOCK_DONE))
break;
if (sk->sk_err) {
copied = sock_error(sk);
break;
}
if (sk->sk_shutdown & RCV_SHUTDOWN)
break;
if (sk->sk_state == TCP_CLOSE) {
copied = -ENOTCONN; /* SOCK_DONE is off here */
break;
}
if (!timeo) {
copied = -EAGAIN;
break;
}
if (signal_pending(current)) {
copied = sock_intr_errno(timeo);
break;
}
}
if (sk->sk_backlog.tail && !user_ddp_pending) {
/* Do not sleep, just process backlog. */
release_sock(sk);
lock_sock(sk);
t3_cleanup_rbuf(sk, copied, request);
continue;
}
#ifdef T3_TRACE
T3_TRACE3(TIDTB(sk),
"user_ddp_ok %d ubuf_ddp_ready %d iov_len %d",
user_ddp_ok, p->ddp_setup ? p->ubuf_ddp_ready : -1, iov->iov_len);
#endif
if (p->ddp_setup && user_ddp_ok && !user_ddp_pending &&
iov->iov_len > p->kbuf[0]->length &&
p->ubuf_ddp_ready) {
user_ddp_pending =
!t3_overlay_ubuf(sk, iov, nonblock, flags, 1, 1);
if (user_ddp_pending) {
p->kbuf_posted++;
user_ddp_ok = 0;
}
#ifdef T3_TRACE
T3_TRACE3(TIDTB(sk),
"overlay_ubuf kbuf_posted %d iov_len %d len %d",
p->kbuf_posted, iov->iov_len, request);
#endif
}
if (p->ddp_setup && !p->kbuf_posted) {
t3_post_kbuf(sk, 1, nonblock);
p->kbuf_posted++;
#ifdef T3_TRACE
T3_TRACE3(TIDTB(sk),
"post overlay_buf kbuf_posted %d copied %d len %d",
p->kbuf_posted, copied, request);
#endif
}
if (user_ddp_pending) {
/* One shot at DDP if we already have enough data */
if (copied >= target) {
user_ddp_ok = 0;
}
#ifdef T3_TRACE
T3_TRACE0(TIDTB(sk), "chelsio_recvmsg: AWAIT");
#endif
sk_wait_data(sk, &timeo);
// XXX for timers to work
// XXX await_ddp_completion(sk, flags, &timeo);
#ifdef T3_TRACE
T3_TRACE0(TIDTB(sk), "chelsio_recvmsg: AWAITed");
#endif
} else if (copied >= target)
break;
else {
t3_cleanup_rbuf(sk, copied, request);
#ifdef T3_TRACE
T3_TRACE0(TIDTB(sk), "chelsio_recvmsg: DATA AWAIT");
#endif
sk_wait_data(sk, &timeo);
#ifdef T3_TRACE
T3_TRACE0(TIDTB(sk), "chelsio_recvmsg: DATA AWAITed");
#endif
}
continue;
found_ok_skb:
if (!skb->len) { /* ubuf dma is complete */
#ifdef T3_TRACE
T3_TRACE1(TIDTB(sk),
"chelsio_recvmsg: zero len skb flags 0x%x",
skb_ulp_ddp_flags(skb));
#endif
BUG_ON(!(skb_ulp_ddp_flags(skb) & DDP_BF_NOCOPY));
user_ddp_pending = 0;
tom_eat_skb(sk, skb, 0);
if (!copied && !timeo) {
copied = -EAGAIN;
break;
}
if (copied < target)
continue;
break;
}
offset = tp->copied_seq - ULP_SKB_CB(skb)->seq;
if (offset >= skb->len) {
#ifdef T3_TRACE
T3_TRACE3(TIDTB(sk),
"chelsio_recvmsg: BUG: OFFSET > LEN seq 0x%x skb->len %d flags 0x%x",
ULP_SKB_CB(skb)->seq, skb->len,
ULP_SKB_CB(skb)->flags);
#endif
printk(KERN_ERR "chelsio_recvmsg: BUG: OFFSET > LEN seq 0x%x "
"skb->len %d flags 0x%x\n",
ULP_SKB_CB(skb)->seq, skb->len,
ULP_SKB_CB(skb)->flags);
BUG_ON(1);
}
avail = skb->len - offset;
if (len < avail) {
if (is_ddp(skb) && (skb_ulp_ddp_flags(skb) & DDP_BF_NOCOPY)) {
#ifdef T3_TRACE
T3_TRACE5(TIDTB(sk),
"chelsio_recvmsg: BUG: len < avail"
" len %u skb->len %d offset %d"
" flags 0x%x avail %u",
len, skb->len, offset,
skb_ulp_ddp_flags(skb), avail);
printk(KERN_ERR "chelsio_recvmsg: BUG: tid %u state %d\n"
" len < avail skb->len %d offset %d\n"
" flags 0x%x avail %u len %u\n",
cplios->tid, sk->sk_state, skb->len,
offset, skb_ulp_ddp_flags(skb),
(unsigned int)avail, (unsigned int)len);
#endif
BUG_ON(1);
}
avail = len;
}
#ifdef T3_TRACE
T3_TRACE5(TIDTB(sk),
"chelsio_recvmsg: seq 0x%x skb->len %d offset %d"
" avail %d flags 0x%x",
ULP_SKB_CB(skb)->seq, skb->len, offset, avail,
ULP_SKB_CB(skb)->flags);
#endif
/*
* Check if the data we are preparing to copy contains urgent
* data. Either stop short of urgent data or skip it if it's
* first and we are not delivering urgent data inline.
*/
if (unlikely(tp->urg_data)) {
u32 urg_offset = tp->urg_seq - tp->copied_seq;
if (urg_offset < avail) {
if (urg_offset) {
/* stop short of the urgent data */
avail = urg_offset;
} else if (!sock_flag(sk, SOCK_URGINLINE)) {
/* First byte is urgent, skip */
tp->copied_seq++;
offset++;
avail--;
if (!avail)
goto skip_copy;
}
}
}
if (is_ddp_psh(skb) || offset) {
user_ddp_ok = 0;
#ifdef T3_TRACE
T3_TRACE0(TIDTB(sk), "chelsio_recvmsg: PSH");
#endif
}
if (p->ddp_setup && user_ddp_ok && !user_ddp_pending &&
iov->iov_len > p->kbuf[0]->length &&
p->ubuf_ddp_ready) {
user_ddp_pending =
!t3_overlay_ubuf(sk, iov, nonblock, flags, 1, 1);
if (user_ddp_pending) {
p->kbuf_posted++;
user_ddp_ok = 0;
}
#ifdef T3_TRACE
T3_TRACE3(TIDTB(sk),
"found_ok_skb: overlay_ubuf kbuf_posted %d"
" iov_len %d len %d",
p->kbuf_posted, iov->iov_len, request);
#endif
}
/*
* If MSG_TRUNC is specified the data is discarded.
*/
if (likely(!(flags & MSG_TRUNC)))
if (copy_data(skb, offset, iov, avail)) {
if (!copied)
copied = -EFAULT;
break;
}
tp->copied_seq += avail;
copied += avail;
len -= avail;
skip_copy:
if (tp->urg_data && after(tp->copied_seq, tp->urg_seq))
tp->urg_data = 0;
/*
* If the buffer is fully consumed free it. If it's a DDP
* buffer also handle any events it indicates.
*/
if (avail + offset >= skb->len) {
unsigned int fl = skb_ulp_ddp_flags(skb);
int exitnow, got_psh = 0, nomoredata = 0;
if (p->ddp_setup && is_ddp(skb) && (fl & 1)) {
if (is_ddp_psh(skb) && user_ddp_pending)
got_psh = 1;
if (fl & DDP_BF_NOCOPY)
user_ddp_pending = 0;
else if ((fl & DDP_BF_NODATA) && nonblock) {
p->kbuf_posted--;
nomoredata = 1;
} else {
p->kbuf_posted--;
p->ubuf_ddp_ready = 1;
}
}
tom_eat_skb(sk, skb, 0);
buffers_freed++;
exitnow = got_psh || nomoredata;
if (copied >= target && !skb_peek(&sk->sk_receive_queue) && exitnow)
break;
}
} while (len > 0);
/*
* If we can still receive decide what to do in preparation for the
* next receive. Note that RCV_SHUTDOWN is set if the connection
* transitioned to CLOSE but not if it was in that state to begin with.
*/
if (likely(!(sk->sk_shutdown & RCV_SHUTDOWN))) {
if (user_ddp_pending) {
user_ddp_ok = 0;
t3_cancel_ubuf(sk, &timeo);
p = DDP_STATE(sk);
if (skb_peek(&sk->sk_receive_queue)) {
if (copied < 0)
copied = 0;
goto again;
}
user_ddp_pending = 0;
}
}
/* Recheck SHUTDOWN conditions as t3_cancel_ubuf can release sock lock */
if (!(sk->sk_err || sk->sk_state == TCP_CLOSE ||
cplios_flag(sk, CPLIOS_ABORT_SHUTDOWN) ||
(sk->sk_shutdown & RCV_SHUTDOWN))) {
if (p->ddp_setup) {
if (!p->kbuf_posted) {
#ifdef T3_TRACE
T3_TRACE0(TIDTB(sk),
"chelsio_recvmsg: about to exit, repost kbuf");
#endif
if ((p->avg_request_len < 4096U) && (request < 4096U)) {
t3_enable_ddp(sk, 0);
t3_release_ddp_resources(sk);
t3_cleanup_ddp(sk);
} else {
t3_post_kbuf(sk, 1, nonblock);
p->kbuf_posted++;
}
#ifdef T3_TRACE
T3_TRACE4(TIDTB(sk),
"%s: kbuf_posted %d copied %d len %d",
__func__, p->kbuf_posted, copied, request);
#endif
}
p->avg_request_len = (p->avg_request_len + request) >> 1;
} else if (sk_should_ddp(sk, tp, copied)) {
if (!t3_enter_ddp(sk, TOM_TUNABLE(cplios->toedev,
ddp_copy_limit), 0, nonblock)) {
p = DDP_STATE(sk);
p->kbuf_posted = 1;
p->avg_request_len = (p->avg_request_len + request) >> 1;
#ifdef T3_TRACE
T3_TRACE4(TIDTB(sk),
"%s: enter ddp kbuf_posted %d"
" copied %d len %d",
__func__, p->kbuf_posted, copied,
request);
#endif
}
}
}
if (buffers_freed)
t3_cleanup_rbuf(sk, copied, request);
#ifdef T3_TRACE
T3_TRACE5(TIDTB(sk),
"chelsio_recvmsg <-: copied %d len %d buffers_freed %d"
" kbuf_posted %d user_ddp_pending %u",
copied, len, buffers_freed, p->ddp_setup ? p->kbuf_posted : -1,
user_ddp_pending);
#endif
release_sock(sk);
return copied;
}
/*
* A visitor-pattern based receive method that runs the supplied receive actor
* directly over the data in the receive queue.
*
* Caller must acquire the socket lock.
*/
int t3_read_sock(struct sock *sk, read_descriptor_t *desc,
sk_read_actor_t recv_actor)
{
u32 offset = 0;
int used, copied = 0;
struct sk_buff *skb;
struct tcp_sock *tp = tcp_sk(sk);
while ((skb = tcp_recv_skb(sk, tp->copied_seq, &offset)) != NULL) {
size_t len = skb->len - offset;
if (unlikely(tp->urg_data)) {
u32 urg_offset = tp->urg_seq - tp->copied_seq;
if (urg_offset < len)
len = urg_offset;
if (!len)
break;
}
used = recv_actor(desc, skb, offset, len);
if (unlikely(used < 0)) {
if (!copied)
return used;
break;
} else if (likely(used <= len)) {
tp->copied_seq += used;
copied += used;
offset += used;
}
if (offset != skb->len)
break;
tom_eat_skb(sk, skb, 0);
if (!desc->count)
break;
}
if (copied > 0)
t3_cleanup_rbuf(sk, copied, 0);
return copied;
}
#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,26)
/*
* Offload splice_read() implementation. We need our own because the original
* calls tcp_read_sock.
*/
#include <linux/splice.h>
struct tcp_splice_state {
struct pipe_inode_info *pipe;
size_t len;
unsigned int flags;
};
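/*
* read_sock() actor that splices received data into the destination pipe.
*/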
static int tcp_splice_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb,
unsigned int offset, size_t len)
{
struct tcp_splice_state *tss = rd_desc->arg.data;
return skb_splice_bits_pub(skb, offset, tss->pipe, tss->len,
tss->flags);
}
static ssize_t chelsio_splice_read(struct sock *sk, loff_t *ppos,
struct pipe_inode_info *pipe, size_t len,
unsigned int flags)
{
struct tcp_splice_state tss = {
.pipe = pipe,
.len = len,
.flags = flags,
};
int ret;
long timeo;
ssize_t spliced;
read_descriptor_t rd_desc;
/* We can't seek on a socket input */
if (unlikely(*ppos))
return -ESPIPE;
ret = spliced = 0;
rd_desc.arg.data = &tss;
lock_sock(sk);
timeo = sock_rcvtimeo(sk, flags & SPLICE_F_NONBLOCK);
while (tss.len) {
ret = t3_read_sock(sk, &rd_desc, tcp_splice_data_recv);
if (ret < 0)
break;
if (!ret) {
if (spliced)
break;
if (flags & SPLICE_F_NONBLOCK) {
ret = -EAGAIN;
break;
}
if (sock_flag(sk, SOCK_DONE))
break;
if (sk->sk_err) {
ret = sock_error(sk);
break;
}
if (sk->sk_shutdown & RCV_SHUTDOWN)
break;
if (sk->sk_state == TCP_CLOSE) {
/*
* This occurs when user tries to read
* from never connected socket.
*/
ret = -ENOTCONN;
break;
}
if (!timeo) {
ret = -EAGAIN;
break;
}
sk_wait_data(sk, &timeo);
if (signal_pending(current)) {
ret = sock_intr_errno(timeo);
break;
}
continue;
}
tss.len -= ret;
spliced += ret;
if (tss.len == 0)
break;
release_sock(sk);
lock_sock(sk);
if (sk->sk_err || sk->sk_state == TCP_CLOSE ||
(sk->sk_shutdown & RCV_SHUTDOWN) || !timeo ||
signal_pending(current))
break;
}
release_sock(sk);
return spliced ? spliced : ret;
}
#endif
/*
* Close a connection by sending a CPL_CLOSE_CON_REQ message. Cannot fail
* under any circumstances. We take the easy way out and always queue the
* message to the write_queue. We could optimize the case where the queue is
* already empty, though the optimization is probably not worth it.
*/
static void close_conn(struct sock *sk)
{
struct sk_buff *skb;
struct cpl_close_con_req *req;
unsigned int tid = CPL_IO_STATE(sk)->tid;
skb = alloc_skb_nofail(sizeof(struct cpl_close_con_req));
req = (struct cpl_close_con_req *)__skb_put(skb, sizeof(*req));
req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_CLOSE_CON));
req->wr.wr_lo = htonl(V_WR_TID(tid));
OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_CON_REQ, tid));
req->rsvd = htonl(tcp_sk(sk)->write_seq);
tcp_uncork(sk);
skb_entail(sk, skb, ULPCB_FLAG_NO_APPEND);
if (sk->sk_state != TCP_SYN_SENT)
t3_push_frames(sk, 1);
}
/*
* State transitions and actions for close. Note that if we are in SYN_SENT
* we remain in that state as we cannot control a connection while it's in
* SYN_SENT; such connections are allowed to establish and are then aborted.
*/
static unsigned char new_state[16] = {
/* current state: new state: action: */
/* (Invalid) */ TCP_CLOSE,
/* TCP_ESTABLISHED */ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
/* TCP_SYN_SENT */ TCP_SYN_SENT,
/* TCP_SYN_RECV */ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
/* TCP_FIN_WAIT1 */ TCP_FIN_WAIT1,
/* TCP_FIN_WAIT2 */ TCP_FIN_WAIT2,
/* TCP_TIME_WAIT */ TCP_CLOSE,
/* TCP_CLOSE */ TCP_CLOSE,
/* TCP_CLOSE_WAIT */ TCP_LAST_ACK | TCP_ACTION_FIN,
/* TCP_LAST_ACK */ TCP_LAST_ACK,
/* TCP_LISTEN */ TCP_CLOSE,
/* TCP_CLOSING */ TCP_CLOSING,
};
/*
* Perform a state transition during close and return the actions indicated
* for the transition. Do not make this function inline, the main reason
* it exists at all is to avoid multiple inlining of tcp_set_state.
*/
static int make_close_transition(struct sock *sk)
{
int next = (int)new_state[sk->sk_state];
tcp_set_state(sk, next & TCP_STATE_MASK);
return next & TCP_ACTION_FIN;
}
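/* States from which a send-side shutdown may initiate a FIN. */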
#define SHUTDOWN_ELIGIBLE_STATE (TCPF_ESTABLISHED | TCPF_SYN_RECV | TCPF_CLOSE_WAIT)
/*
* Shutdown the sending side of a connection. Much like close except
* that we don't receive shutdown or sock_set_flag(sk, SOCK_DEAD).
*
* Note: unlike tcp_shutdown() this does nothing for the SYN_SENT state,
* but this function is not really called for SYN_SENT because
* inet_shutdown() handles that state specially, so no harm is done.
*/
static void chelsio_shutdown(struct sock *sk, int how)
{
if ((how & SEND_SHUTDOWN) &&
sk_in_state(sk, SHUTDOWN_ELIGIBLE_STATE) &&
make_close_transition(sk))
close_conn(sk);
}
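/*
* Offload counterpart of tcp_close(). Flush the receive queue, abort the
* connection if unread data would be lost (or it is still in SYN_SENT),
* otherwise perform the usual FIN transition, then orphan the socket and
* finish the close with bottom halves disabled.
*/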
static void chelsio_close(struct sock *sk, long timeout)
{
struct cpl_io_state *cplios = CPL_IO_STATE(sk);
int data_lost, old_state;
struct sk_buff *skb;
lock_sock(sk);
sk->sk_shutdown |= SHUTDOWN_MASK;
/*
* We need to flush the receive buffers. We do this only on the
* descriptor close, not protocol-sourced closes, because the
* reader process may not have drained the data yet! Make a note
* of whether any received data will be lost so we can decide whether
* to FIN or RST.
*/
data_lost = skb_queue_len(&sk->sk_receive_queue);
while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
skb_gl_set(skb, NULL);
kfree_skb(skb);
}
/*
* If the connection is in DDP mode, disable DDP and have any
* outstanding data and FIN (!!!) delivered to the host since HW
* might fail an ABORT_REQ if a FIN is held.
*/
if (cplios->ulp_mode == ULP_MODE_TCPDDP)
t3_enable_ddp(sk, 0);
if (sk->sk_state == TCP_CLOSE) /* Nothing if we are already closed */
;
else if (data_lost || sk->sk_state == TCP_SYN_SENT) {
/* Unread data was tossed, zap the connection. */
T3_NET_INC_STATS_USER(sock_net(sk), LINUX_MIB_TCPABORTONCLOSE);
t3_send_reset(sk, CPL_ABORT_SEND_RST, NULL);
release_tcp_port(sk);
goto unlock;
} else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
/* Check zero linger _after_ checking for unread data. */
sk->sk_prot->disconnect(sk, 0);
T3_NET_INC_STATS_USER(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
} else if (make_close_transition(sk)) { /* Regular FIN-based close */
close_conn(sk);
}
if (timeout)
sk_stream_wait_close(sk, timeout);
unlock:
old_state = sk->sk_state;
sock_hold(sk); /* must last past the potential inet_csk_destroy_sock */
sock_orphan(sk);
INC_ORPHAN_COUNT(sk);
release_sock(sk); /* Final release_sock in connection's lifetime. */
/*
* There are no more user references at this point. Grab the socket
* spinlock and finish the close.
*/
local_bh_disable();
bh_lock_sock(sk);
/*
* Because the socket was orphaned before the bh_lock_sock,
* either the backlog or a BH may have already destroyed it.
* Bail out if so.
*/
if (old_state != TCP_CLOSE && sk->sk_state == TCP_CLOSE)
goto out;
if (sk->sk_state == TCP_FIN_WAIT2 && tcp_sk(sk)->linger2 < 0 &&
!cplios_flag(sk, CPLIOS_ABORT_SHUTDOWN)) {
struct sk_buff *skb;
skb = alloc_skb(sizeof(struct cpl_abort_req), GFP_ATOMIC);
if (skb) {
t3_send_reset(sk, CPL_ABORT_SEND_RST, skb);
T3_NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONLINGER);
}
}
#if 0
if (sk->sk_state != TCP_CLOSE) {
sk_stream_mem_reclaim(sk);
if (atomic_read(sk->sk_prot->orphan_count) > sysctl_tcp_max_orphans ||
(sk->sk_wmem_queued > SOCK_MIN_SNDBUF &&
atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2])) {
if (net_ratelimit())
printk(KERN_INFO
"TCP: too many orphaned sockets\n");
tcp_set_state(sk, TCP_CLOSE);
tcp_send_active_reset(sk, GFP_ATOMIC);
NET_INC_STATS_BH(LINUX_MIB_TCPABORTONMEMORY);
}
}
#endif
if (sk->sk_state == TCP_CLOSE)
inet_csk_destroy_sock(sk);
out:
bh_unlock_sock(sk);
local_bh_enable();
sock_put(sk);
}
/*
* Our analog of tcp_free_skb().
*/
static inline void chelsio_tcp_free_skb(struct sock *sk, struct sk_buff *skb)
{
sk->sk_wmem_queued -= skb->truesize;
#if defined(CONFIG_T3_ZCOPY_SENDMSG) || defined(CONFIG_T3_ZCOPY_SENDMSG_MODULE)
if (ULP_SKB_CB(skb)->flags & ULPCB_FLAG_ZCOPY_COW)
t3_zcopy_cleanup_skb(skb);
else
skb_vaddr_set(skb, 0);
#endif
__kfree_skb(skb);
}
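/*
* Free all skbs still sitting on the socket's write queue, adjusting the
* socket's queued-write accounting as we go.
*/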
void t3_purge_write_queue(struct sock *sk)
{
struct sk_buff *skb;
while ((skb = __skb_dequeue(&sk->sk_write_queue)))
chelsio_tcp_free_skb(sk, skb);
// tcp_mem_reclaim(sk);
}
/*
* Switch a socket to the SW TCP's protocol operations.
*/
void install_standard_ops(struct sock *sk)
{
/*
* Once we switch to the standard TCP operations our destructor
* (chelsio_destroy_sock) will not be called. That function normally
* cleans up socket DDP state so we need to do that here to avoid
* leaking DDP resources. Note that while the socket may live on for
* a long time, DDP isn't usable with the standard ops, so the DDP state
* can be released at this time.
*/
struct cpl_io_state *cplios = CPL_IO_STATE(sk);
t3_cleanup_ddp(sk);
cplios->ulp_mode = ULP_MODE_NONE;
sk->sk_prot = &tcp_prot;
sk->sk_backlog_rcv = sk->sk_prot->backlog_rcv;
restore_socket_ops(sk);
if (sk->sk_write_space == t3_write_space)
sk->sk_write_space = sk_stream_write_space;
#ifdef LINUX_2_4
if (likely(sk->filter)) {
sk_filter_release(sk, sk->filter);
sk->filter = NULL;
}
#else
if (likely(sk->sk_filter)) {
sk_filter_uncharge(sk, sk->sk_filter);
sk->sk_filter = NULL;
}
#endif /* LINUX_2_4 */
if (sk->sk_user_data)
restore_special_data_ready(sk);
sock_reset_flag(sk, SOCK_OFFLOADED);
cplios->flags = 0;
CPL_IO_STATE(sk) = NULL;
kfree(cplios);
}
/*
* Wait until a socket enters one of the given states.
*/
static void wait_for_states(struct sock *sk, unsigned int states)
{
#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,35)
wait_queue_head_t _sk_sleep;
#else
struct socket_wq _sk_wq;
#endif
struct task_struct *tsk = current;
DECLARE_WAITQUEUE(wait, tsk);
/*
* We want this to work even when there's no associated struct socket.
* In that case we provide a temporary on-stack wait queue
* (a wait_queue_head_t, or a struct socket_wq on 2.6.35 and later kernels).
*/
#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,35)
if (sk->sk_sleep == NULL) {
init_waitqueue_head(&_sk_sleep);
sk->sk_sleep = &_sk_sleep;
}
#else
if (sk->sk_wq == NULL) {
init_waitqueue_head(&_sk_wq.wait);
_sk_wq.fasync_list = NULL;
init_rcu_head_on_stack(&_sk_wq.rcu);
sk->sk_wq = &_sk_wq;
}
#endif
add_wait_queue(sk_sleep(sk), &wait);
while (!sk_in_state(sk, states)) {
set_task_state(tsk, TASK_UNINTERRUPTIBLE);
release_sock(sk);
if (!sk_in_state(sk, states))
schedule();
__set_task_state(tsk, TASK_RUNNING);
lock_sock(sk);
}
remove_wait_queue(sk_sleep(sk), &wait);
#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,35)
if (sk_sleep(sk) == &_sk_sleep)
sk->sk_sleep = NULL;
#else
if (sk->sk_wq == &_sk_wq)
sk->sk_wq = NULL;
#endif
}
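/*
* Offload version of disconnect. Abort the connection if it is still open
* and wait for it to reach CLOSE, then hand the socket back to SW TCP and
* let tcp_disconnect() finish the job.
*/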
static int chelsio_disconnect(struct sock *sk, int flags)
{
struct tcp_sock *tp = tcp_sk(sk);
__skb_queue_purge(&sk->sk_receive_queue);
t3_purge_write_queue(sk);
if (sk->sk_state != TCP_CLOSE) {
sk->sk_err = ECONNRESET;
t3_send_reset(sk, CPL_ABORT_SEND_RST, NULL);
wait_for_states(sk, TCPF_CLOSE);
}
__skb_queue_purge(&tp->out_of_order_queue);
/*
* We don't know the correct value for max_window but we know an
* upper limit.
*/
tp->max_window = 0xFFFF << SND_WSCALE(tp);
/*
* Now switch to Linux's TCP operations and let it finish the job.
*/
install_standard_ops(sk);
tcp_init_xmit_timers(sk);
return tcp_disconnect(sk, flags);
}
/*
* Our version of tcp_v4_destroy_sock(). We need our own because the
* tcp_writequeue_purge() used in the original doesn't quite match our
* needs. If we ever hook into the memory management of the SW stack we
* may be able to use tcp_v4_destroy_sock() directly.
*/
static t3_type_compat chelsio_destroy_sock(struct sock *sk)
{
struct cpl_io_state *cplios = CPL_IO_STATE(sk);
t3_cleanup_ddp(sk);
cplios->ulp_mode = ULP_MODE_NONE;
t3_purge_write_queue(sk);
CPL_IO_STATE(sk) = NULL;
kfree(cplios);
return tcp_prot.destroy(sk);
}
/* IP socket options we do not support on offloaded connections */
#define UNSUP_IP_SOCK_OPT ((1 << IP_OPTIONS))
/*
* Socket option code for IP. We do not allow certain options while a
* connection is offloaded. Some of the other options we handle specially,
* and the rest are directed to the SW IP for their usual processing.
*/
static int t3_ip_setsockopt(struct sock *sk, int level, int optname,
char __user *optval, int optlen, int call_compat)
{
struct cpl_io_state *cplios = CPL_IO_STATE(sk);
if (level != SOL_IP)
return -ENOPROTOOPT;
/* unsupported options */
if ((1 << optname) & UNSUP_IP_SOCK_OPT) {
printk(KERN_WARNING
"IP option %d ignored on offloaded TCP connection\n",
optname);
return -ENOPROTOOPT;
}
/* specially handled options */
if (optname == IP_TOS) {
struct inet_sock *inet = inet_sk(sk);
int val = 0, err = 0;
if (optlen >= sizeof(int)) {
if (get_user(val, (int __user *)optval))
return -EFAULT;
} else if (optlen >= sizeof(char)) {
unsigned char ucval;
if (get_user(ucval, (unsigned char __user *)optval))
return -EFAULT;
val = (int)ucval;
}
lock_sock(sk);
val &= ~3;
val |= inet->tos & 3;
if (IPTOS_PREC(val) >= IPTOS_PREC_CRITIC_ECP &&
!capable(CAP_NET_ADMIN))
err = -EPERM;
else if (inet->tos != val) {
inet->tos = val;
sk->sk_priority = rt_tos2priority(val);
/*
* Set the HW TOS only if it's not being used to
* determine the scheduling class and if the new
* TOS isn't special.
*/
if (cplios->sched_cls >= 8 && (val & 0xe0) != 0xc0)
t3_set_tos(sk);
}
release_sock(sk);
return err;
}
#ifdef TOM_CONFIG_COMPAT
if (call_compat && inet_csk(sk)->icsk_af_ops->compat_setsockopt)
return inet_csk(sk)->icsk_af_ops->compat_setsockopt(sk, level,
optname, optval, optlen);
#endif
return inet_csk(sk)->icsk_af_ops->setsockopt(sk, level, optname,
optval, optlen);
}
/*
* Socket option code for TCP. We override any option processing that needs to
* be handled specially for a TOE and leave the other options to SW TCP.
*/
static int do_t3_tcp_setsockopt(struct sock *sk, int level, int optname,
char __user *optval, socklen_t optlen)
{
struct tcp_sock *tp = tcp_sk(sk);
int val, err = 0;
if (optname == TCP_CONGESTION) {
char name[TCP_CA_NAME_MAX];
if (optlen < 1)
return -EINVAL;
val = strncpy_from_user(name, optval,
min((socklen_t)(TCP_CA_NAME_MAX - 1),
optlen));
if (val < 0)
return -EFAULT;
name[val] = 0;
return t3_set_cong_control(sk, name);
}
if (optlen < sizeof(int))
return -EINVAL;
if (get_user(val, (int __user *)optval))
return -EFAULT;
lock_sock(sk);
switch (optname) {
case TCP_NODELAY: {
int oldval = tp->nonagle;
if (val)
tp->nonagle |= TCP_NAGLE_OFF;
else
tp->nonagle &= ~TCP_NAGLE_OFF;
if (oldval != tp->nonagle)
t3_set_nagle(sk);
break;
}
case TCP_CORK:
if (val)
tp->nonagle |= TCP_NAGLE_CORK;
else
tcp_uncork(sk);
break;
case TCP_KEEPIDLE:
if (val < 1 || val > MAX_TCP_KEEPIDLE)
err = -EINVAL;
else {
tp->keepalive_time = val * HZ;
}
break;
case TCP_QUICKACK:
if (!val) {
inet_csk(sk)->icsk_ack.pingpong = 1;
} else {
inet_csk(sk)->icsk_ack.pingpong = 0;
}
break;
default:
release_sock(sk);
err = tcp_setsockopt(sk, level, optname,
optval, optlen);
goto out;
}
release_sock(sk);
out:
return err;
}
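/*
* setsockopt() entry point: SOL_TCP options go to our TCP handler, everything
* else to the IP handler above.
*/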
static int t3_tcp_setsockopt(struct sock *sk, int level, int optname,
char __user *optval, socklen_t optlen)
{
return level != SOL_TCP ?
t3_ip_setsockopt(sk, level, optname, optval, optlen, 0) :
do_t3_tcp_setsockopt(sk, level, optname, optval, optlen);
}
#ifdef TOM_CONFIG_COMPAT
static int t3_compat_tcp_setsockopt(struct sock *sk, int level, int optname,
char __user *optval, socklen_t optlen)
{
return level != SOL_TCP ?
t3_ip_setsockopt(sk, level, optname, optval, optlen, 1) :
do_t3_tcp_setsockopt(sk, level, optname, optval, optlen);
}
#endif
#if defined(CONFIG_TCP_OFFLOAD)
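/*
* Propagate SO_KEEPALIVE changes to the HW, but only if the connection isn't
* closed and the setting actually changes.
*/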
static void set_keepalive(struct sock *sk, int on_off)
{
int old = sock_flag(sk, SOCK_KEEPOPEN) != 0;
if (sk->sk_state != TCP_CLOSE && (on_off ^ old))
t3_set_keepalive(sk, on_off);
}
#endif
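/* Offload request_sock and protocol operation vectors, set up in t3_init_offload_ops(). */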
struct request_sock_ops t3_rsk_ops;
struct sk_ofld_proto t3_tcp_prot;
/*
* Set up the offload protocol operations vector. We start with TCP's and
* override some of the operations. Note that we do not override the backlog
* handler here.
*/
void __init t3_init_offload_ops(void)
{
t3_tcp_prot.proto = tcp_prot;
t3_init_rsk_ops(&t3_tcp_prot.proto, &t3_rsk_ops, &tcp_prot);
t3_tcp_prot.proto.close = chelsio_close;
t3_tcp_prot.proto.disconnect = chelsio_disconnect;
t3_tcp_prot.proto.destroy = chelsio_destroy_sock;
t3_tcp_prot.proto.shutdown = chelsio_shutdown;
t3_tcp_prot.proto.setsockopt = t3_tcp_setsockopt;
t3_tcp_prot.proto.sendmsg = chelsio_sendmsg;
t3_tcp_prot.proto.recvmsg = chelsio_recvmsg;
t3_tcp_prot.proto.sendpage = chelsio_sendpage;
#if defined(CONFIG_TCP_OFFLOAD)
t3_tcp_prot.proto.sendskb = t3_sendskb;
t3_tcp_prot.proto.read_sock = t3_read_sock;
t3_tcp_prot.proto.set_keepalive = set_keepalive;
#endif
#ifdef TOM_CONFIG_COMPAT
t3_tcp_prot.proto.compat_setsockopt = t3_compat_tcp_setsockopt;
#endif
t3_tcp_prot.read_sock = t3_read_sock;
#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,26)
t3_tcp_prot.splice_read = chelsio_splice_read;
#endif
}