blob: 956f5da328be01296d8a04daa9d884c05c22d16e [file] [log] [blame]
/*
* Pseudo-driver for the intermediate queue device.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
* Authors: Patrick McHardy, <kaber@trash.net>
*
* The first version was written by Martin Devera, <devik@cdi.cz>
*
* Credits: Jan Rafaj <imq2t@cedric.vabo.cz>
* - Update patch to 2.4.21
* Sebastian Strollo <sstrollo@nortelnetworks.com>
* - Fix "Dead-loop on netdevice imq"-issue
* Marcel Sebek <sebek64@post.cz>
* - Update to 2.6.2-rc1
*
* After some time of inactivity there is a group taking care
* of IMQ again: http://www.linuximq.net
*
*
* 2004/06/30 - New version of IMQ patch to kernels <=2.6.7
* including the following changes:
*
* - Correction of ipv6 support "+"s issue (Hasso Tepper)
* - Correction of imq_init_devs() issue that resulted in
* kernel OOPS unloading IMQ as module (Norbert Buchmuller)
* - Addition of functionality to choose number of IMQ devices
* during kernel config (Andre Correa)
* - Addition of functionality to choose how IMQ hooks on
* PRE and POSTROUTING (after or before NAT) (Andre Correa)
* - Cosmetic corrections (Norbert Buchmuller) (Andre Correa)
*
*
* 2005/12/16 - IMQ versions between 2.6.7 and 2.6.13 were
* released with almost no problems. 2.6.14-x was released
* with some important changes: nfcache was removed; After
* some weeks of trouble we figured out that some IMQ fields
* in skb were missing in skbuff.c - skb_clone and copy_skb_header.
* These functions are correctly patched by this new patch version.
*
* Thanks to all who helped to figure out all the problems with
* 2.6.14.x: Patrick McHardy, Rune Kock, VeNoMouS, Max CtRiX,
* Kevin Shanahan, Richard Lucassen, Valery Dachev (hopefully
* I didn't forget anybody). I apologize again for my lack of time.
*
*
* 2008/06/17 - 2.6.25 - Changed imq.c to use qdisc_run() instead
* of qdisc_restart() and moved qdisc_run() to tasklet to avoid
* recursive locking. New initialization routines to fix 'rmmod' not
* working anymore. Used code from ifb.c. (Jussi Kivilinna)
*
* 2008/08/06 - 2.6.26 - (JK)
* - Replaced tasklet with 'netif_schedule()'.
* - Cleaned up and added comments for imq_nf_queue().
*
* 2009/04/12
* - Add skb_save_cb/skb_restore_cb helper functions for backing up
* control buffer. This is needed because qdisc-layer on kernels
* 2.6.27 and newer overwrite control buffer. (Jussi Kivilinna)
* - Add better locking for IMQ device. Hopefully this will solve
* SMP issues. (Jussi Kivilinna)
* - Port to 2.6.27
* - Port to 2.6.28
* - Port to 2.6.29 + fix rmmod not working
*
* 2009/04/20 - (Jussi Kivilinna)
* - Use netdevice feature flags to avoid extra packet handling
* by core networking layer and possibly increase performance.
*
* 2009/09/26 - (Jussi Kivilinna)
* - Add imq_nf_reinject_lockless to fix deadlock with
* imq_nf_queue/imq_nf_reinject.
*
* Also, many thanks to Pablo Sebastian Greco for making the initial
* patch and to those who helped with the testing.
*
* More info at: http://www.linuximq.net/ (Andre Correa)
*/
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/moduleparam.h>
#include <linux/list.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/rtnetlink.h>
#include <linux/if_arp.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv4.h>
#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
#include <linux/netfilter_ipv6.h>
#endif
#include <linux/imq.h>
#include <net/pkt_sched.h>
#include <net/netfilter/nf_queue.h>
/* Hook callback shared by every IMQ netfilter hook point; defined below. */
static nf_hookfn imq_nf_hook;

/*
 * IPv4 PREROUTING hook.  CONFIG_IMQ_BEHAVIOR_BA/BB select priority
 * NF_IP_PRI_MANGLE + 1 (reported at load time as "before NAT"); any other
 * configuration selects NF_IP_PRI_NAT_DST + 1 ("after NAT").
 */
static struct nf_hook_ops imq_ingress_ipv4 = {
	.pf		= PF_INET,
	.hooknum	= NF_INET_PRE_ROUTING,
#if defined(CONFIG_IMQ_BEHAVIOR_BB) || defined(CONFIG_IMQ_BEHAVIOR_BA)
	.priority	= NF_IP_PRI_MANGLE + 1,
#else
	.priority	= NF_IP_PRI_NAT_DST + 1,
#endif
	.owner		= THIS_MODULE,
	.hook		= imq_nf_hook,
};
/*
 * IPv4 POSTROUTING hook.  CONFIG_IMQ_BEHAVIOR_AA/BA select priority
 * NF_IP_PRI_LAST; any other configuration selects NF_IP_PRI_NAT_SRC - 1.
 */
static struct nf_hook_ops imq_egress_ipv4 = {
	.pf		= PF_INET,
	.hooknum	= NF_INET_POST_ROUTING,
#if defined(CONFIG_IMQ_BEHAVIOR_BA) || defined(CONFIG_IMQ_BEHAVIOR_AA)
	.priority	= NF_IP_PRI_LAST,
#else
	.priority	= NF_IP_PRI_NAT_SRC - 1,
#endif
	.owner		= THIS_MODULE,
	.hook		= imq_nf_hook,
};
#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
/*
 * IPv6 PREROUTING hook; same priority selection as the IPv4 ingress hook,
 * expressed with the NF_IP6_PRI_* constants.
 */
static struct nf_hook_ops imq_ingress_ipv6 = {
	.pf		= PF_INET6,
	.hooknum	= NF_INET_PRE_ROUTING,
#if defined(CONFIG_IMQ_BEHAVIOR_BB) || defined(CONFIG_IMQ_BEHAVIOR_BA)
	.priority	= NF_IP6_PRI_MANGLE + 1,
#else
	.priority	= NF_IP6_PRI_NAT_DST + 1,
#endif
	.owner		= THIS_MODULE,
	.hook		= imq_nf_hook,
};

/*
 * IPv6 POSTROUTING hook; same priority selection as the IPv4 egress hook,
 * expressed with the NF_IP6_PRI_* constants.
 */
static struct nf_hook_ops imq_egress_ipv6 = {
	.pf		= PF_INET6,
	.hooknum	= NF_INET_POST_ROUTING,
#if defined(CONFIG_IMQ_BEHAVIOR_BA) || defined(CONFIG_IMQ_BEHAVIOR_AA)
	.priority	= NF_IP6_PRI_LAST,
#else
	.priority	= NF_IP6_PRI_NAT_SRC - 1,
#endif
	.owner		= THIS_MODULE,
	.hook		= imq_nf_hook,
};
#endif
/* Number of imq devices created at module init.  Defaults to the configured
 * CONFIG_IMQ_NUM_DEVS when available, otherwise to IMQ_MAX_DEVS; it is also
 * exposed as the "numdevs" module parameter. */
#if defined(CONFIG_IMQ_NUM_DEVS)
static unsigned int numdevs = CONFIG_IMQ_NUM_DEVS;
#else
static unsigned int numdevs = IMQ_MAX_DEVS;
#endif
/* Taken (BH-disabling) around nf_reinject() in imq_nf_reinject(). */
static DEFINE_SPINLOCK(imq_nf_queue_lock);
/* Per-index cache of imq net_device pointers.  Entries are filled lazily by
 * imq_nf_queue() and the whole array is zeroed whenever the link ops are
 * unregistered. */
static struct net_device *imq_devs_cache[IMQ_MAX_DEVS];
/* ndo_get_stats callback: hand back the counters embedded in the net_device. */
static struct net_device_stats *imq_get_stats(struct net_device *dev)
{
	struct net_device_stats *counters = &dev->stats;

	return counters;
}
/* called for packets kfree'd in qdiscs at places other than enqueue */
static void imq_skb_destructor(struct sk_buff *skb)
{
struct nf_queue_entry *entry = skb->nf_queue_entry;
if (entry) {
nf_queue_entry_release_refs(entry);
kfree(entry);
}
skb_restore_cb(skb); /* kfree backup */
}
/*
 * Reinject a queue entry without taking imq_nf_queue_lock; used when called
 * from imq_nf_queue(), where that locking is not needed.
 *
 * If the entry carries a chained output function it is handed to that
 * function; on failure (negative return) the entry's references, its skb and
 * the entry itself are released here.  Otherwise the entry goes straight
 * back to netfilter with the given verdict.
 */
static void imq_nf_reinject_lockless(struct nf_queue_entry *entry,
				     unsigned int verdict)
{
	if (entry->next_outfn) {
		int rc = entry->next_outfn(entry, entry->next_queuenum);

		if (rc >= 0)
			return;

		nf_queue_entry_release_refs(entry);
		kfree_skb(entry->skb);
		kfree(entry);
		return;
	}

	nf_reinject(entry, verdict);
}
/*
 * Reinject a queue entry from the device transmit path.
 *
 * Without a chained output function, nf_reinject() is called with
 * imq_nf_queue_lock held (bottom halves disabled).  With one, the chained
 * function is invoked inside an RCU read-side section with bottom halves
 * disabled; if it fails (negative return) the entry's references, its skb
 * and the entry itself are released before the RCU section ends.
 */
static void imq_nf_reinject(struct nf_queue_entry *entry, unsigned int verdict)
{
	int rc;

	if (entry->next_outfn) {
		rcu_read_lock();

		local_bh_disable();
		rc = entry->next_outfn(entry, entry->next_queuenum);
		local_bh_enable();

		if (rc < 0) {
			nf_queue_entry_release_refs(entry);
			kfree_skb(entry->skb);
			kfree(entry);
		}

		rcu_read_unlock();
		return;
	}

	spin_lock_bh(&imq_nf_queue_lock);
	nf_reinject(entry, verdict);
	spin_unlock_bh(&imq_nf_queue_lock);
}
static int imq_dev_xmit(struct sk_buff *skb, struct net_device *dev)
{
dev->stats.tx_bytes += skb->len;
dev->stats.tx_packets++;
skb->imq_flags = 0;
skb->destructor = NULL;
skb_restore_cb(skb); /* restore skb->cb */
dev->trans_start = jiffies;
imq_nf_reinject(skb->nf_queue_entry, NF_ACCEPT);
return 0;
}
/*
 * Netfilter queue handler (nfqh.outfn): push a queued packet through the
 * root qdisc of the imq device selected by skb->imq_flags.
 *
 * @entry:     netfilter queue entry holding the packet
 * @queue_num: not used by this handler
 *
 * Returns 0 when the packet was enqueued on the imq device, or when the
 * device is down and the packet was reinjected immediately; -EINVAL for an
 * out-of-range device index; -ENOMEM when cloning an owned skb fails; and
 * -1 when the qdisc did not take the packet (no enqueue op, or it dropped
 * the packet).
 */
static int imq_nf_queue(struct nf_queue_entry *entry, unsigned queue_num)
{
struct net_device *dev;
struct sk_buff *skb_orig, *skb, *skb_shared;
struct Qdisc *q;
struct netdev_queue *txq;
int users, index;
int retval = -EINVAL;
/* the target device number is carried in the low bits of imq_flags */
index = entry->skb->imq_flags & IMQ_F_IFMASK;
if (unlikely(index > numdevs - 1)) {
if (net_ratelimit())
printk(KERN_WARNING
"IMQ: invalid device specified, highest is %u\n",
numdevs - 1);
retval = -EINVAL;
goto out;
}
/* check for imq device by index from cache */
dev = imq_devs_cache[index];
if (unlikely(!dev)) {
char buf[8];
/* get device by name and cache result */
snprintf(buf, sizeof(buf), "imq%d", index);
dev = dev_get_by_name(&init_net, buf);
if (!dev) {
/* not found ?! -- BUG() fires here, so the two statements below
 * are only reached if BUG() returns */
BUG();
retval = -ENODEV;
goto out;
}
/* the reference taken by dev_get_by_name() is dropped right away, so
 * the cache itself holds no reference on the device */
imq_devs_cache[index] = dev;
dev_put(dev);
}
/* device is down: bypass queueing and let the packet continue */
if (unlikely(!(dev->flags & IFF_UP))) {
entry->skb->imq_flags = 0;
imq_nf_reinject_lockless(entry, NF_ACCEPT);
retval = 0;
goto out;
}
dev->last_rx = jiffies;
skb = entry->skb;
skb_orig = NULL;
/* skb has owner? => make clone, because the destructor slot is needed
 * for imq_skb_destructor further down */
if (unlikely(skb->destructor)) {
skb_orig = skb;
skb = skb_clone(skb, GFP_ATOMIC);
if (!skb) {
/* entry->skb still points at the original skb here */
retval = -ENOMEM;
goto out;
}
entry->skb = skb;
}
/* remember the entry on the skb so imq_dev_xmit() and
 * imq_skb_destructor() can find it again */
skb->nf_queue_entry = entry;
dev->stats.rx_bytes += skb->len;
dev->stats.rx_packets++;
/* pick the transmit queue and its qdisc */
txq = dev_pick_tx(dev, skb);
/* NOTE(review): no rcu_read_lock() is taken in this function; presumably
 * the caller already runs inside an RCU read-side section -- confirm */
q = rcu_dereference(txq->qdisc);
if (unlikely(!q->enqueue))
goto packet_not_eaten_by_imq_dev;
spin_lock_bh(qdisc_lock(q));
/* Take an extra reference before enqueueing and compare the reference
 * count afterwards: if the qdisc freed the skb during enqueue, the count
 * is back at 'users' instead of 'users + 1'. */
users = atomic_read(&skb->users);
skb_shared = skb_get(skb); /* increase reference count by one */
skb_save_cb(skb_shared); /* backup skb->cb, as qdisc layer will
overwrite it */
qdisc_enqueue_root(skb_shared, q); /* might kfree_skb */
if (likely(atomic_read(&skb_shared->users) == users + 1)) {
/* packet was accepted by the qdisc */
kfree_skb(skb_shared); /* decrease reference count by one */
skb->destructor = &imq_skb_destructor;
/* cloned? */
if (skb_orig)
kfree_skb(skb_orig); /* free original */
spin_unlock_bh(qdisc_lock(q));
/* schedule qdisc dequeue */
__netif_schedule(q);
retval = 0;
goto out;
} else {
skb_restore_cb(skb_shared); /* restore skb->cb */
/* The qdisc dropped the packet and already decreased the skb's
 * reference count, so it must not be released again here: a
 * second kfree_skb() would actually destroy the skb. */
spin_unlock_bh(qdisc_lock(q));
goto packet_not_eaten_by_imq_dev;
}
packet_not_eaten_by_imq_dev:
/* cloned? restore original: drop the clone and point the entry back at
 * the skb it was given */
if (skb_orig) {
kfree_skb(skb);
entry->skb = skb_orig;
}
retval = -1;
out:
return retval;
}
/* Queue handler registered with netfilter; NF_QUEUE'd packets reach imq_nf_queue(). */
static struct nf_queue_handler nfqh = {
	.outfn	= imq_nf_queue,
	.name	= "imq",
};
/*
 * Netfilter hook shared by all IMQ hook points.  Packets flagged with
 * IMQ_F_ENQUEUE get the NF_QUEUE verdict; everything else is accepted.
 * Only pskb is examined; the remaining arguments are unused.
 */
static unsigned int imq_nf_hook(unsigned int hook, struct sk_buff *pskb,
				const struct net_device *indev,
				const struct net_device *outdev,
				int (*okfn)(struct sk_buff *))
{
	return (pskb->imq_flags & IMQ_F_ENQUEUE) ? NF_QUEUE : NF_ACCEPT;
}
/* ndo_stop callback: stop the device's transmit queue.  Always returns 0. */
static int imq_close(struct net_device *dev)
{
	netif_stop_queue(dev);

	return 0;
}
/* ndo_open callback: start the device's transmit queue.  Always returns 0. */
static int imq_open(struct net_device *dev)
{
	netif_start_queue(dev);

	return 0;
}
/* net_device operations installed on every imq device by imq_setup(). */
static const struct net_device_ops imq_netdev_ops = {
	.ndo_start_xmit	= imq_dev_xmit,
	.ndo_get_stats	= imq_get_stats,
	.ndo_open	= imq_open,
	.ndo_stop	= imq_close,
};
/*
 * Setup callback used both by alloc_netdev() in imq_init_one() and by the
 * rtnl link ops: installs the IMQ netdev operations and the device defaults.
 */
static void imq_setup(struct net_device *dev)
{
	dev->netdev_ops = &imq_netdev_ops;

	/* link-level identity and sizing defaults */
	dev->flags = IFF_NOARP;
	dev->type = ARPHRD_VOID;
	dev->tx_queue_len = 11000;
	dev->mtu = 16000;

	/* NOTE(review): presumably cleared so the skb keeps its dst across the
	 * IMQ transmit path -- confirm */
	dev->priv_flags &= ~IFF_XMIT_DST_RELEASE;

	/* advertise offload features so the core stack passes packets untouched */
	dev->features = NETIF_F_HIGHDMA | NETIF_F_HW_CSUM | NETIF_F_GSO |
			NETIF_F_FRAGLIST | NETIF_F_SG;
}
/*
 * rtnl link-ops validate callback.  When an IFLA_ADDRESS attribute is
 * supplied it must be exactly ETH_ALEN bytes long (-EINVAL otherwise) and a
 * valid Ethernet address (-EADDRNOTAVAIL otherwise).  Failures are logged.
 * Returns 0 when the attribute is absent or acceptable; @data is unused.
 */
static int imq_validate(struct nlattr *tb[], struct nlattr *data[])
{
	struct nlattr *addr = tb[IFLA_ADDRESS];
	int status;

	if (!addr)
		return 0;

	if (nla_len(addr) != ETH_ALEN)
		status = -EINVAL;
	else if (!is_valid_ether_addr(nla_data(addr)))
		status = -EADDRNOTAVAIL;
	else
		return 0;

	printk(KERN_WARNING "IMQ: imq_validate failed (%d)\n", status);
	return status;
}
/* rtnetlink link operations for devices of kind "imq"; no private data. */
static struct rtnl_link_ops imq_link_ops __read_mostly = {
	.kind		= "imq",
	.setup		= imq_setup,
	.validate	= imq_validate,
	.priv_size	= 0,
};
/*
 * Register the IMQ queue handler and the netfilter hooks (IPv4 always, IPv6
 * when configured).  If any hook registration fails, everything registered
 * so far is undone in reverse order and the error is returned.
 * Returns 0 on success.
 */
static int __init imq_init_hooks(void)
{
	int rc;

	nf_register_queue_imq_handler(&nfqh);

	rc = nf_register_hook(&imq_ingress_ipv4);
	if (rc != 0)
		goto drop_queue_handler;

	rc = nf_register_hook(&imq_egress_ipv4);
	if (rc != 0)
		goto drop_ingress_v4;

#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
	rc = nf_register_hook(&imq_ingress_ipv6);
	if (rc != 0)
		goto drop_egress_v4;

	rc = nf_register_hook(&imq_egress_ipv6);
	if (rc != 0)
		goto drop_ingress_v6;
#endif

	return 0;

#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
drop_ingress_v6:
	nf_unregister_hook(&imq_ingress_ipv6);
drop_egress_v4:
	nf_unregister_hook(&imq_egress_ipv4);
#endif
drop_ingress_v4:
	nf_unregister_hook(&imq_ingress_ipv4);
drop_queue_handler:
	nf_unregister_queue_imq_handler();
	return rc;
}
/*
 * Allocate, name and register a single "imq%d" device.  Callers in this
 * file invoke it with the RTNL lock held.  @index is not used; the unit
 * number comes from dev_alloc_name().
 *
 * Returns 0 on success, -ENOMEM when allocation fails, or the negative error
 * from dev_alloc_name()/register_netdevice() (the device is freed then).
 */
static int __init imq_init_one(int index)
{
	struct net_device *dev = alloc_netdev(0, "imq%d", imq_setup);
	int rc;

	if (dev == NULL)
		return -ENOMEM;

	rc = dev_alloc_name(dev, dev->name);
	if (rc >= 0) {
		dev->rtnl_link_ops = &imq_link_ops;
		rc = register_netdevice(dev);
	}
	if (rc >= 0)
		return 0;

	free_netdev(dev);
	return rc;
}
/*
 * Register the "imq" link ops and create numdevs imq devices.
 *
 * Returns 0 on success, -EINVAL when numdevs is outside 1..IMQ_MAX_DEVS, or
 * the error from link-ops registration / device creation.  If creating a
 * device fails, the link ops are unregistered again and the device cache is
 * cleared.
 */
static int __init imq_init_devs(void)
{
	unsigned int i;
	int err;

	if (numdevs < 1 || numdevs > IMQ_MAX_DEVS) {
		printk(KERN_ERR "IMQ: numdevs has to be between 1 and %u\n",
		       IMQ_MAX_DEVS);
		return -EINVAL;
	}

	rtnl_lock();

	err = __rtnl_link_register(&imq_link_ops);
	if (err) {
		/*
		 * The ops were never linked in, so they must not be passed to
		 * __rtnl_link_unregister(); just report the failure.
		 */
		goto out;
	}

	for (i = 0; i < numdevs && !err; i++)
		err = imq_init_one(i);

	if (err) {
		__rtnl_link_unregister(&imq_link_ops);
		memset(imq_devs_cache, 0, sizeof(imq_devs_cache));
	}

out:
	rtnl_unlock();
	return err;
}
/*
 * Module entry point: create the imq devices, then register the netfilter
 * hooks.  If hook registration fails the devices are torn down again.
 * On success the chosen hook placement is logged.
 *
 * Returns 0 on success or the error from imq_init_devs()/imq_init_hooks().
 */
static int __init imq_init_module(void)
{
	int err;

#if defined(CONFIG_IMQ_NUM_DEVS)
	/* compile-time sanity checks on the configured device count */
	BUILD_BUG_ON(CONFIG_IMQ_NUM_DEVS > 16);
	BUILD_BUG_ON(CONFIG_IMQ_NUM_DEVS < 2);
	BUILD_BUG_ON(CONFIG_IMQ_NUM_DEVS - 1 > IMQ_F_IFMASK);
#endif

	err = imq_init_devs();
	if (err) {
		printk(KERN_ERR "IMQ: Error trying imq_init_devs(net)\n");
		return err;
	}

	err = imq_init_hooks();
	if (err) {
		printk(KERN_ERR "IMQ: Error trying imq_init_hooks()\n");
		rtnl_link_unregister(&imq_link_ops);
		memset(imq_devs_cache, 0, sizeof(imq_devs_cache));
		return err;
	}

	printk(KERN_INFO "IMQ driver loaded successfully.\n");

	/*
	 * The conditions below deliberately mirror the ones used to pick the
	 * hook priorities, so the log always describes the placement that
	 * was actually compiled in, even when no CONFIG_IMQ_BEHAVIOR_*
	 * option is defined.
	 */
#if defined(CONFIG_IMQ_BEHAVIOR_BA) || defined(CONFIG_IMQ_BEHAVIOR_BB)
	printk(KERN_INFO "\tHooking IMQ before NAT on PREROUTING.\n");
#else
	printk(KERN_INFO "\tHooking IMQ after NAT on PREROUTING.\n");
#endif
#if defined(CONFIG_IMQ_BEHAVIOR_AA) || defined(CONFIG_IMQ_BEHAVIOR_BA)
	printk(KERN_INFO "\tHooking IMQ after NAT on POSTROUTING.\n");
#else
	printk(KERN_INFO "\tHooking IMQ before NAT on POSTROUTING.\n");
#endif

	return 0;
}
/*
 * Remove every netfilter hook registered by imq_init_hooks() (IPv6 hooks
 * first when configured, then IPv4) and finally the IMQ queue handler.
 */
static void __exit imq_unhook(void)
{
#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
	/* IPv6 hook points */
	nf_unregister_hook(&imq_ingress_ipv6);
	nf_unregister_hook(&imq_egress_ipv6);
#endif

	/* IPv4 hook points */
	nf_unregister_hook(&imq_ingress_ipv4);
	nf_unregister_hook(&imq_egress_ipv4);

	nf_unregister_queue_imq_handler();
}
/*
 * Unregister the "imq" link ops and forget all cached device pointers so no
 * stale net_device pointer remains in imq_devs_cache.
 */
static void __exit imq_cleanup_devs(void)
{
	rtnl_link_unregister(&imq_link_ops);

	memset(imq_devs_cache, 0, sizeof(imq_devs_cache));
}
/*
 * Module exit point: detach from netfilter first, then remove the devices,
 * and log the unload.
 */
static void __exit imq_exit_module(void)
{
	imq_unhook();
	imq_cleanup_devs();

	printk(KERN_INFO "IMQ driver unloaded successfully.\n");
}
module_init(imq_init_module);
module_exit(imq_exit_module);
module_param(numdevs, int, 0);
MODULE_PARM_DESC(numdevs, "number of IMQ devices (how many imq* devices will "
"be created)");
MODULE_AUTHOR("http://www.linuximq.net");
MODULE_DESCRIPTION("Pseudo-driver for the intermediate queue device. See "
"http://www.linuximq.net/ for more information.");
MODULE_LICENSE("GPL");
MODULE_ALIAS_RTNL_LINK("imq");