add idl4k kernel firmware version 1.13.0.105

This commit is contained in:
Jaroslav Kysela
2015-03-26 17:22:37 +01:00
parent 5194d2792e
commit e9070cdc77
31064 changed files with 12769984 additions and 0 deletions

529
kernel/net/sched/Kconfig Normal file
View File

@@ -0,0 +1,529 @@
#
# Traffic control configuration.
#
menuconfig NET_SCHED
bool "QoS and/or fair queueing"
select NET_SCH_FIFO
---help---
When the kernel has several packets to send out over a network
device, it has to decide which ones to send first, which ones to
delay, and which ones to drop. This is the job of the queueing
disciplines; several different algorithms for how to do this
"fairly" have been proposed.
If you say N here, you will get the standard packet scheduler, which
is a FIFO (first come, first served). If you say Y here, you will be
able to choose from among several alternative algorithms which can
then be attached to different network devices. This is useful for
example if some of your network devices are real time devices that
need a certain minimum data flow rate, or if you need to limit the
maximum data flow rate for traffic which matches specified criteria.
This code is considered to be experimental.
To administer these schedulers, you'll need the user-level utilities
from the package iproute2+tc at <ftp://ftp.tux.org/pub/net/ip-routing/>.
That package also contains some documentation; for more, check out
<http://linux-net.osdl.org/index.php/Iproute2>.
This Quality of Service (QoS) support will enable you to use
Differentiated Services (diffserv) and Resource Reservation Protocol
(RSVP) on your Linux router if you also say Y to the corresponding
classifiers below. Documentation and software is at
<http://diffserv.sourceforge.net/>.
If you say Y here and to "/proc file system" below, you will be able
to read status information about packet schedulers from the file
/proc/net/psched.
The available schedulers are listed in the following questions; you
can say Y to as many as you like. If unsure, say N now.
if NET_SCHED
comment "Queueing/Scheduling"
config NET_SCH_CBQ
tristate "Class Based Queueing (CBQ)"
---help---
Say Y here if you want to use the Class-Based Queueing (CBQ) packet
scheduling algorithm. This algorithm classifies the waiting packets
into a tree-like hierarchy of classes; the leaves of this tree are
in turn scheduled by separate algorithms.
See the top of <file:net/sched/sch_cbq.c> for more details.
CBQ is a commonly used scheduler, so if you're unsure, you should
say Y here. Then say Y to all the queueing algorithms below that you
want to use as leaf disciplines.
To compile this code as a module, choose M here: the
module will be called sch_cbq.
config NET_SCH_HTB
tristate "Hierarchical Token Bucket (HTB)"
---help---
Say Y here if you want to use the Hierarchical Token Buckets (HTB)
packet scheduling algorithm. See
<http://luxik.cdi.cz/~devik/qos/htb/> for complete manual and
in-depth articles.
HTB is very similar to CBQ regarding its goals; however, it has
different properties and a different algorithm.
To compile this code as a module, choose M here: the
module will be called sch_htb.
config NET_SCH_HFSC
tristate "Hierarchical Fair Service Curve (HFSC)"
---help---
Say Y here if you want to use the Hierarchical Fair Service Curve
(HFSC) packet scheduling algorithm.
To compile this code as a module, choose M here: the
module will be called sch_hfsc.
config NET_SCH_ATM
tristate "ATM Virtual Circuits (ATM)"
depends on ATM
---help---
Say Y here if you want to use the ATM pseudo-scheduler. This
provides a framework for invoking classifiers, which in turn
select classes of this queuing discipline. Each class maps
the flow(s) it is handling to a given virtual circuit.
See the top of <file:net/sched/sch_atm.c> for more details.
To compile this code as a module, choose M here: the
module will be called sch_atm.
config NET_SCH_PRIO
tristate "Multi Band Priority Queueing (PRIO)"
---help---
Say Y here if you want to use an n-band priority queue packet
scheduler.
To compile this code as a module, choose M here: the
module will be called sch_prio.
config NET_SCH_MULTIQ
tristate "Hardware Multiqueue-aware Multi Band Queuing (MULTIQ)"
---help---
Say Y here if you want to use an n-band queue packet scheduler
to support devices that have multiple hardware transmit queues.
To compile this code as a module, choose M here: the
module will be called sch_multiq.
config NET_SCH_RED
tristate "Random Early Detection (RED)"
---help---
Say Y here if you want to use the Random Early Detection (RED)
packet scheduling algorithm.
See the top of <file:net/sched/sch_red.c> for more details.
To compile this code as a module, choose M here: the
module will be called sch_red.
config NET_SCH_SFQ
tristate "Stochastic Fairness Queueing (SFQ)"
---help---
Say Y here if you want to use the Stochastic Fairness Queueing (SFQ)
packet scheduling algorithm.
See the top of <file:net/sched/sch_sfq.c> for more details.
To compile this code as a module, choose M here: the
module will be called sch_sfq.
config NET_SCH_TEQL
tristate "True Link Equalizer (TEQL)"
---help---
Say Y here if you want to use the True Link Equalizer (TLE) packet
scheduling algorithm. This queueing discipline allows the combination
of several physical devices into one virtual device.
See the top of <file:net/sched/sch_teql.c> for more details.
To compile this code as a module, choose M here: the
module will be called sch_teql.
config NET_SCH_TBF
tristate "Token Bucket Filter (TBF)"
---help---
Say Y here if you want to use the Token Bucket Filter (TBF) packet
scheduling algorithm.
See the top of <file:net/sched/sch_tbf.c> for more details.
To compile this code as a module, choose M here: the
module will be called sch_tbf.
config NET_SCH_GRED
tristate "Generic Random Early Detection (GRED)"
---help---
Say Y here if you want to use the Generic Random Early Detection
(GRED) packet scheduling algorithm for some of your network devices
(see the top of <file:net/sched/sch_red.c> for details and
references about the algorithm).
To compile this code as a module, choose M here: the
module will be called sch_gred.
config NET_SCH_DSMARK
tristate "Differentiated Services marker (DSMARK)"
---help---
Say Y if you want to schedule packets according to the
Differentiated Services architecture proposed in RFC 2475.
Technical information on this method, with pointers to associated
RFCs, is available at <http://www.gta.ufrj.br/diffserv/>.
To compile this code as a module, choose M here: the
module will be called sch_dsmark.
config NET_SCH_NETEM
tristate "Network emulator (NETEM)"
---help---
Say Y if you want to emulate network delay, loss, and packet
re-ordering. This is often useful to simulate networks when
testing applications or protocols.
To compile this driver as a module, choose M here: the module
will be called sch_netem.
If unsure, say N.
config NET_SCH_DRR
tristate "Deficit Round Robin scheduler (DRR)"
help
Say Y here if you want to use the Deficit Round Robin (DRR) packet
scheduling algorithm.
To compile this driver as a module, choose M here: the module
will be called sch_drr.
If unsure, say N.
config NET_SCH_INGRESS
tristate "Ingress Qdisc"
depends on NET_CLS_ACT
---help---
Say Y here if you want to use classifiers for incoming packets.
If unsure, say Y.
To compile this code as a module, choose M here: the
module will be called sch_ingress.
comment "Classification"
config NET_CLS
boolean
config NET_CLS_BASIC
tristate "Elementary classification (BASIC)"
select NET_CLS
---help---
Say Y here if you want to be able to classify packets using
only extended matches and actions.
To compile this code as a module, choose M here: the
module will be called cls_basic.
config NET_CLS_TCINDEX
tristate "Traffic-Control Index (TCINDEX)"
select NET_CLS
---help---
Say Y here if you want to be able to classify packets based on
traffic control indices. You will want this feature if you want
to implement Differentiated Services together with DSMARK.
To compile this code as a module, choose M here: the
module will be called cls_tcindex.
config NET_CLS_ROUTE4
tristate "Routing decision (ROUTE)"
select NET_CLS_ROUTE
select NET_CLS
---help---
If you say Y here, you will be able to classify packets
according to the route table entry they matched.
To compile this code as a module, choose M here: the
module will be called cls_route.
config NET_CLS_ROUTE
bool
config NET_CLS_FW
tristate "Netfilter mark (FW)"
select NET_CLS
---help---
If you say Y here, you will be able to classify packets
according to netfilter/firewall marks.
To compile this code as a module, choose M here: the
module will be called cls_fw.
config NET_CLS_U32
tristate "Universal 32bit comparisons w/ hashing (U32)"
select NET_CLS
---help---
Say Y here to be able to classify packets using a universal
32bit pieces based comparison scheme.
To compile this code as a module, choose M here: the
module will be called cls_u32.
config CLS_U32_PERF
bool "Performance counters support"
depends on NET_CLS_U32
---help---
Say Y here to make u32 gather additional statistics useful for
fine tuning u32 classifiers.
config CLS_U32_MARK
bool "Netfilter marks support"
depends on NET_CLS_U32
---help---
Say Y here to be able to use netfilter marks as u32 key.
config NET_CLS_RSVP
tristate "IPv4 Resource Reservation Protocol (RSVP)"
select NET_CLS
---help---
The Resource Reservation Protocol (RSVP) permits end systems to
request a minimum and maximum data flow rate for a connection; this
is important for real time data such as streaming sound or video.
Say Y here if you want to be able to classify outgoing packets based
on their RSVP requests.
To compile this code as a module, choose M here: the
module will be called cls_rsvp.
config NET_CLS_RSVP6
tristate "IPv6 Resource Reservation Protocol (RSVP6)"
select NET_CLS
---help---
The Resource Reservation Protocol (RSVP) permits end systems to
request a minimum and maximum data flow rate for a connection; this
is important for real time data such as streaming sound or video.
Say Y here if you want to be able to classify outgoing packets based
on their RSVP requests and you are using the IPv6 protocol.
To compile this code as a module, choose M here: the
module will be called cls_rsvp6.
config NET_CLS_FLOW
tristate "Flow classifier"
select NET_CLS
---help---
If you say Y here, you will be able to classify packets based on
a configurable combination of packet keys. This is mostly useful
in combination with SFQ.
To compile this code as a module, choose M here: the
module will be called cls_flow.
config NET_CLS_CGROUP
bool "Control Group Classifier"
select NET_CLS
depends on CGROUPS
---help---
Say Y here if you want to classify packets based on the control
cgroup of their process.
config NET_EMATCH
bool "Extended Matches"
select NET_CLS
---help---
Say Y here if you want to use extended matches on top of classifiers
and select the extended matches below.
Extended matches are small classification helpers not worth writing
a separate classifier for.
A recent version of the iproute2 package is required to use
extended matches.
config NET_EMATCH_STACK
int "Stack size"
depends on NET_EMATCH
default "32"
---help---
Size of the local stack variable used while evaluating the tree of
ematches. Limits the depth of the tree, i.e. the number of
encapsulated precedences. Every level requires 4 bytes of additional
stack space.
config NET_EMATCH_CMP
tristate "Simple packet data comparison"
depends on NET_EMATCH
---help---
Say Y here if you want to be able to classify packets based on
simple packet data comparisons for 8, 16, and 32bit values.
To compile this code as a module, choose M here: the
module will be called em_cmp.
config NET_EMATCH_NBYTE
tristate "Multi byte comparison"
depends on NET_EMATCH
---help---
Say Y here if you want to be able to classify packets based on
multiple byte comparisons mainly useful for IPv6 address comparisons.
To compile this code as a module, choose M here: the
module will be called em_nbyte.
config NET_EMATCH_U32
tristate "U32 key"
depends on NET_EMATCH
---help---
Say Y here if you want to be able to classify packets using
the famous u32 key in combination with logic relations.
To compile this code as a module, choose M here: the
module will be called em_u32.
config NET_EMATCH_META
tristate "Metadata"
depends on NET_EMATCH
---help---
Say Y here if you want to be able to classify packets based on
metadata such as load average, netfilter attributes, socket
attributes and routing decisions.
To compile this code as a module, choose M here: the
module will be called em_meta.
config NET_EMATCH_TEXT
tristate "Textsearch"
depends on NET_EMATCH
select TEXTSEARCH
select TEXTSEARCH_KMP
select TEXTSEARCH_BM
select TEXTSEARCH_FSM
---help---
Say Y here if you want to be able to classify packets based on
textsearch comparisons.
To compile this code as a module, choose M here: the
module will be called em_text.
config NET_CLS_ACT
bool "Actions"
---help---
Say Y here if you want to use traffic control actions. Actions
get attached to classifiers and are invoked after a successful
classification. They are used to overwrite the classification
result, instantly drop or redirect packets, etc.
A recent version of the iproute2 package is required to use
actions.
config NET_ACT_POLICE
tristate "Traffic Policing"
depends on NET_CLS_ACT
---help---
Say Y here if you want to do traffic policing, i.e. strict
bandwidth limiting. This action replaces the existing policing
module.
To compile this code as a module, choose M here: the
module will be called police.
config NET_ACT_GACT
tristate "Generic actions"
depends on NET_CLS_ACT
---help---
Say Y here to take generic actions such as dropping and
accepting packets.
To compile this code as a module, choose M here: the
module will be called gact.
config GACT_PROB
bool "Probability support"
depends on NET_ACT_GACT
---help---
Say Y here to use the generic action randomly or deterministically.
config NET_ACT_MIRRED
tristate "Redirecting and Mirroring"
depends on NET_CLS_ACT
---help---
Say Y here to allow packets to be mirrored or redirected to
other devices.
To compile this code as a module, choose M here: the
module will be called mirred.
config NET_ACT_IPT
tristate "IPtables targets"
depends on NET_CLS_ACT && NETFILTER && IP_NF_IPTABLES
---help---
Say Y here to be able to invoke iptables targets after successful
classification.
To compile this code as a module, choose M here: the
module will be called ipt.
config NET_ACT_NAT
tristate "Stateless NAT"
depends on NET_CLS_ACT
---help---
Say Y here to do stateless NAT on IPv4 packets. You should use
netfilter for NAT unless you know what you are doing.
To compile this code as a module, choose M here: the
module will be called nat.
config NET_ACT_PEDIT
tristate "Packet Editing"
depends on NET_CLS_ACT
---help---
Say Y here if you want to mangle the content of packets.
To compile this code as a module, choose M here: the
module will be called pedit.
config NET_ACT_SIMP
tristate "Simple Example (Debug)"
depends on NET_CLS_ACT
---help---
Say Y here to add a simple action for demonstration purposes.
It is meant as an example and for debugging purposes. It will
print a configured policy string followed by the packet count
to the console for every packet that passes by.
If unsure, say N.
To compile this code as a module, choose M here: the
module will be called simple.
config NET_ACT_SKBEDIT
tristate "SKB Editing"
depends on NET_CLS_ACT
---help---
Say Y here to change skb priority or queue_mapping settings.
If unsure, say N.
To compile this code as a module, choose M here: the
module will be called skbedit.
config NET_CLS_IND
bool "Incoming device classification"
depends on NET_CLS_U32 || NET_CLS_FW
---help---
Say Y here to extend the u32 and fw classifier to support
classification based on the incoming device. This option is
likely to disappear in favour of the metadata ematch.
endif # NET_SCHED
config NET_SCH_FIFO
bool

48
kernel/net/sched/Makefile Normal file
View File

@@ -0,0 +1,48 @@
#
# Makefile for the Linux Traffic Control Unit.
#
# Objects that are always built.
obj-y := sch_generic.o sch_mq.o
# Scheduler, classifier and action API objects.
obj-$(CONFIG_NET_SCHED) += sch_api.o sch_blackhole.o
obj-$(CONFIG_NET_CLS) += cls_api.o
obj-$(CONFIG_NET_CLS_ACT) += act_api.o
# Actions (act_*).
obj-$(CONFIG_NET_ACT_POLICE) += act_police.o
obj-$(CONFIG_NET_ACT_GACT) += act_gact.o
obj-$(CONFIG_NET_ACT_MIRRED) += act_mirred.o
obj-$(CONFIG_NET_ACT_IPT) += act_ipt.o
obj-$(CONFIG_NET_ACT_NAT) += act_nat.o
obj-$(CONFIG_NET_ACT_PEDIT) += act_pedit.o
obj-$(CONFIG_NET_ACT_SIMP) += act_simple.o
obj-$(CONFIG_NET_ACT_SKBEDIT) += act_skbedit.o
# Queueing disciplines (sch_*).
obj-$(CONFIG_NET_SCH_FIFO) += sch_fifo.o
obj-$(CONFIG_NET_SCH_CBQ) += sch_cbq.o
obj-$(CONFIG_NET_SCH_HTB) += sch_htb.o
obj-$(CONFIG_NET_SCH_HFSC) += sch_hfsc.o
obj-$(CONFIG_NET_SCH_RED) += sch_red.o
obj-$(CONFIG_NET_SCH_GRED) += sch_gred.o
obj-$(CONFIG_NET_SCH_INGRESS) += sch_ingress.o
obj-$(CONFIG_NET_SCH_DSMARK) += sch_dsmark.o
obj-$(CONFIG_NET_SCH_SFQ) += sch_sfq.o
obj-$(CONFIG_NET_SCH_TBF) += sch_tbf.o
obj-$(CONFIG_NET_SCH_TEQL) += sch_teql.o
obj-$(CONFIG_NET_SCH_PRIO) += sch_prio.o
obj-$(CONFIG_NET_SCH_MULTIQ) += sch_multiq.o
obj-$(CONFIG_NET_SCH_ATM) += sch_atm.o
obj-$(CONFIG_NET_SCH_NETEM) += sch_netem.o
obj-$(CONFIG_NET_SCH_DRR) += sch_drr.o
# Classifiers (cls_*).
obj-$(CONFIG_NET_CLS_U32) += cls_u32.o
obj-$(CONFIG_NET_CLS_ROUTE4) += cls_route.o
obj-$(CONFIG_NET_CLS_FW) += cls_fw.o
obj-$(CONFIG_NET_CLS_RSVP) += cls_rsvp.o
obj-$(CONFIG_NET_CLS_TCINDEX) += cls_tcindex.o
obj-$(CONFIG_NET_CLS_RSVP6) += cls_rsvp6.o
obj-$(CONFIG_NET_CLS_BASIC) += cls_basic.o
obj-$(CONFIG_NET_CLS_FLOW) += cls_flow.o
obj-$(CONFIG_NET_CLS_CGROUP) += cls_cgroup.o
# Extended matches (em_*).
obj-$(CONFIG_NET_EMATCH) += ematch.o
obj-$(CONFIG_NET_EMATCH_CMP) += em_cmp.o
obj-$(CONFIG_NET_EMATCH_NBYTE) += em_nbyte.o
obj-$(CONFIG_NET_EMATCH_U32) += em_u32.o
obj-$(CONFIG_NET_EMATCH_META) += em_meta.o
obj-$(CONFIG_NET_EMATCH_TEXT) += em_text.o

1118
kernel/net/sched/act_api.c Normal file

File diff suppressed because it is too large Load Diff

221
kernel/net/sched/act_gact.c Normal file
View File

@@ -0,0 +1,221 @@
/*
* net/sched/gact.c Generic actions
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
* copyright Jamal Hadi Salim (2002-4)
*
*/
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/module.h>
#include <linux/init.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
#include <linux/tc_act/tc_gact.h>
#include <net/tc_act/tc_gact.h>
/* Stored as .hmask below; the table has GACT_TAB_MASK + 1 entries. */
#define GACT_TAB_MASK 15
/* Table of gact action instances, published through gact_hash_info.htab. */
static struct tcf_common *tcf_gact_ht[GACT_TAB_MASK + 1];
/* Index generator state handed to tcf_hash_create(). */
static u32 gact_idx_gen;
/* Lock published through gact_hash_info.lock. */
static DEFINE_RWLOCK(gact_lock);
/* Table, mask and lock bundle passed to the tcf_hash_* helpers in this file. */
static struct tcf_hashinfo gact_hash_info = {
.htab = tcf_gact_ht,
.hmask = GACT_TAB_MASK,
.lock = &gact_lock,
};
#ifdef CONFIG_GACT_PROB
/*
 * Random selector: pick the alternate action tcfg_paction when tcfg_pval is
 * non-zero and net_random() is an exact multiple of it; in every other case
 * fall back to the regular tcf_action.
 */
static int gact_net_rand(struct tcf_gact *gact)
{
	/* The pval test comes first so the modulo never divides by zero. */
	if (gact->tcfg_pval && (net_random() % gact->tcfg_pval) == 0)
		return gact->tcfg_paction;

	return gact->tcf_action;
}
/*
 * Deterministic selector: pick the alternate action tcfg_paction when
 * tcfg_pval is non-zero and the packet counter is an exact multiple of it;
 * otherwise return the regular tcf_action.
 */
static int gact_determ(struct tcf_gact *gact)
{
	/* The pval test comes first so the modulo never divides by zero. */
	if (gact->tcfg_pval && (gact->tcf_bstats.packets % gact->tcfg_pval) == 0)
		return gact->tcfg_paction;

	return gact->tcf_action;
}
/* Signature shared by the probabilistic action selectors above. */
typedef int (*g_rand)(struct tcf_gact *gact);
/* Selector table indexed by tcfg_ptype in tcf_gact(); slot 0 has no selector. */
static g_rand gact_rand[MAX_RAND]= { NULL, gact_net_rand, gact_determ };
#endif /* CONFIG_GACT_PROB */
/* Netlink policy handed to nla_parse_nested() in tcf_gact_init(). */
static const struct nla_policy gact_policy[TCA_GACT_MAX + 1] = {
[TCA_GACT_PARMS] = { .len = sizeof(struct tc_gact) },
[TCA_GACT_PROB] = { .len = sizeof(struct tc_gact_p) },
};
/*
 * tcf_gact_init - create a gact action or update an existing one
 * @nla:  nested netlink attribute carrying the TCA_GACT_* options
 * @est:  passed through to tcf_hash_create()
 * @a:    generic action handle passed to the tcf_hash_* helpers
 * @ovr:  non-zero if an already existing action may be overwritten
 * @bind: passed through to the tcf_hash_* helpers
 *
 * Returns ACT_P_CREATED when a new instance was created, 0 when an existing
 * instance was updated, or a negative errno:
 *   -EINVAL      missing/invalid attributes, or a probability type that is
 *                not a valid index into gact_rand[]
 *   -EOPNOTSUPP  TCA_GACT_PROB given but CONFIG_GACT_PROB is not enabled
 *   -EEXIST      the action exists and @ovr is zero
 */
static int tcf_gact_init(struct nlattr *nla, struct nlattr *est,
			 struct tc_action *a, int ovr, int bind)
{
	struct nlattr *tb[TCA_GACT_MAX + 1];
	struct tc_gact *parm;
	struct tcf_gact *gact;
	struct tcf_common *pc;
#ifdef CONFIG_GACT_PROB
	struct tc_gact_p *p_parm = NULL;
#endif
	int ret = 0;
	int err;

	if (nla == NULL)
		return -EINVAL;

	err = nla_parse_nested(tb, TCA_GACT_MAX, nla, gact_policy);
	if (err < 0)
		return err;

	if (tb[TCA_GACT_PARMS] == NULL)
		return -EINVAL;
	parm = nla_data(tb[TCA_GACT_PARMS]);

#ifndef CONFIG_GACT_PROB
	if (tb[TCA_GACT_PROB] != NULL)
		return -EOPNOTSUPP;
#else
	if (tb[TCA_GACT_PROB] != NULL) {
		p_parm = nla_data(tb[TCA_GACT_PROB]);
		/*
		 * ptype is later used by tcf_gact() as an index into
		 * gact_rand[MAX_RAND]; reject out-of-range values here,
		 * before any reference on an existing action is taken.
		 */
		if (p_parm->ptype >= MAX_RAND)
			return -EINVAL;
	}
#endif

	pc = tcf_hash_check(parm->index, a, bind, &gact_hash_info);
	if (!pc) {
		pc = tcf_hash_create(parm->index, est, a, sizeof(*gact),
				     bind, &gact_idx_gen, &gact_hash_info);
		if (IS_ERR(pc))
			return PTR_ERR(pc);
		ret = ACT_P_CREATED;
	} else {
		if (!ovr) {
			tcf_hash_release(pc, bind, &gact_hash_info);
			return -EEXIST;
		}
	}

	gact = to_gact(pc);

	/* Update all settings under the per-action lock taken by tcf_gact(). */
	spin_lock_bh(&gact->tcf_lock);
	gact->tcf_action = parm->action;
#ifdef CONFIG_GACT_PROB
	if (p_parm != NULL) {
		gact->tcfg_paction = p_parm->paction;
		gact->tcfg_pval    = p_parm->pval;
		gact->tcfg_ptype   = p_parm->ptype;
	}
#endif
	spin_unlock_bh(&gact->tcf_lock);

	/* A freshly created instance is inserted only once it is fully set up. */
	if (ret == ACT_P_CREATED)
		tcf_hash_insert(pc, &gact_hash_info);
	return ret;
}
/*
 * Release the gact instance attached to @a, if any, via tcf_hash_release().
 * Returns that helper's result, or 0 when no instance is attached.
 */
static int tcf_gact_cleanup(struct tc_action *a, int bind)
{
	struct tcf_gact *instance = a->priv;

	if (!instance)
		return 0;

	return tcf_hash_release(&instance->common, bind, &gact_hash_info);
}
/*
 * Per-packet handler: choose the verdict for @skb and update the action's
 * byte/packet/drop counters and last-use timestamp, all under tcf_lock.
 * The verdict is chosen before the packet counter is incremented.
 */
static int tcf_gact(struct sk_buff *skb, struct tc_action *a, struct tcf_result *res)
{
	struct tcf_gact *gact = a->priv;
	int verdict;

	spin_lock(&gact->tcf_lock);

	/* Default to the configured action; a selector may override it. */
	verdict = gact->tcf_action;
#ifdef CONFIG_GACT_PROB
	if (gact->tcfg_ptype && gact_rand[gact->tcfg_ptype] != NULL)
		verdict = gact_rand[gact->tcfg_ptype](gact);
#endif

	gact->tcf_bstats.bytes += qdisc_pkt_len(skb);
	gact->tcf_bstats.packets++;
	if (verdict == TC_ACT_SHOT)
		gact->tcf_qstats.drops++;
	gact->tcf_tm.lastuse = jiffies;

	spin_unlock(&gact->tcf_lock);

	return verdict;
}
static int tcf_gact_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref)
{
unsigned char *b = skb_tail_pointer(skb);
struct tcf_gact *gact = a->priv;
struct tc_gact opt = {
.index = gact->tcf_index,
.refcnt = gact->tcf_refcnt - ref,
.bindcnt = gact->tcf_bindcnt - bind,
.action = gact->tcf_action,
};
struct tcf_t t;
NLA_PUT(skb, TCA_GACT_PARMS, sizeof(opt), &opt);
#ifdef CONFIG_GACT_PROB
if (gact->tcfg_ptype) {
struct tc_gact_p p_opt = {
.paction = gact->tcfg_paction,
.pval = gact->tcfg_pval,
.ptype = gact->tcfg_ptype,
};
NLA_PUT(skb, TCA_GACT_PROB, sizeof(p_opt), &p_opt);
}
#endif
t.install = jiffies_to_clock_t(jiffies - gact->tcf_tm.install);
t.lastuse = jiffies_to_clock_t(jiffies - gact->tcf_tm.lastuse);
t.expires = jiffies_to_clock_t(gact->tcf_tm.expires);
NLA_PUT(skb, TCA_GACT_TM, sizeof(t), &t);
return skb->len;
nla_put_failure:
nlmsg_trim(skb, b);
return -1;
}
/*
 * Operations table for the "gact" action kind; registered by
 * gact_init_module() and unregistered by gact_cleanup_module().
 */
static struct tc_action_ops act_gact_ops = {
.kind = "gact",
.hinfo = &gact_hash_info,
.type = TCA_ACT_GACT,
.capab = TCA_CAP_NONE,
.owner = THIS_MODULE,
.act = tcf_gact,
.dump = tcf_gact_dump,
.cleanup = tcf_gact_cleanup,
.lookup = tcf_hash_search,
.init = tcf_gact_init,
.walk = tcf_generic_walker
};
MODULE_AUTHOR("Jamal Hadi Salim(2002-4)");
MODULE_DESCRIPTION("Generic Classifier actions");
MODULE_LICENSE("GPL");
/*
 * Module entry point: report whether probability support was compiled in,
 * then register the gact operations. Returns tcf_register_action()'s result.
 */
static int __init gact_init_module(void)
{
#ifndef CONFIG_GACT_PROB
	printk("GACT probability NOT on\n");
#else
	printk("GACT probability on\n");
#endif

	return tcf_register_action(&act_gact_ops);
}
/* Module exit point: unregister the gact operations. */
static void __exit gact_cleanup_module(void)
{
tcf_unregister_action(&act_gact_ops);
}
module_init(gact_init_module);
module_exit(gact_cleanup_module);

314
kernel/net/sched/act_ipt.c Normal file
View File

@@ -0,0 +1,314 @@
/*
* net/sched/ipt.c iptables target interface
*
*TODO: Add other tables. For now we only support the ipv4 table targets
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
* Copyright: Jamal Hadi Salim (2002-4)
*/
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/module.h>
#include <linux/init.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
#include <linux/tc_act/tc_ipt.h>
#include <net/tc_act/tc_ipt.h>
#include <linux/netfilter_ipv4/ip_tables.h>
/* Stored as .hmask below; the table has IPT_TAB_MASK + 1 entries. */
#define IPT_TAB_MASK 15
/* Table of ipt action instances, published through ipt_hash_info.htab. */
static struct tcf_common *tcf_ipt_ht[IPT_TAB_MASK + 1];
/* Index generator state handed to tcf_hash_create(). */
static u32 ipt_idx_gen;
/* Lock published through ipt_hash_info.lock. */
static DEFINE_RWLOCK(ipt_lock);
/* Table, mask and lock bundle passed to the tcf_hash_* helpers in this file. */
static struct tcf_hashinfo ipt_hash_info = {
.htab = tcf_ipt_ht,
.hmask = IPT_TAB_MASK,
.lock = &ipt_lock,
};
static int ipt_init_target(struct ipt_entry_target *t, char *table, unsigned int hook)
{
struct xt_tgchk_param par;
struct xt_target *target;
int ret = 0;
target = xt_request_find_target(AF_INET, t->u.user.name,
t->u.user.revision);
if (!target)
return -ENOENT;
t->u.kernel.target = target;
par.table = table;
par.entryinfo = NULL;
par.target = target;
par.targinfo = t->data;
par.hook_mask = hook;
par.family = NFPROTO_IPV4;
ret = xt_check_target(&par, t->u.target_size - sizeof(*t), 0, false);
if (ret < 0) {
module_put(t->u.kernel.target->me);
return ret;
}
return 0;
}
static void ipt_destroy_target(struct ipt_entry_target *t)
{
struct xt_tgdtor_param par = {
.target = t->u.kernel.target,
.targinfo = t->data,
};
if (par.target->destroy != NULL)
par.target->destroy(&par);
module_put(par.target->me);
}
/*
 * Drop one reference (and, if @bind is set, one binding) from @ipt. When
 * neither counter remains positive the target, table name and target copy
 * are released and the instance is destroyed.
 *
 * Returns ACT_P_DELETED if the instance was destroyed, 0 otherwise
 * (including when @ipt is NULL).
 */
static int tcf_ipt_release(struct tcf_ipt *ipt, int bind)
{
	if (!ipt)
		return 0;

	if (bind)
		ipt->tcf_bindcnt--;
	ipt->tcf_refcnt--;

	/* Still bound or referenced somewhere: keep the instance alive. */
	if (ipt->tcf_bindcnt > 0 || ipt->tcf_refcnt > 0)
		return 0;

	ipt_destroy_target(ipt->tcfi_t);
	kfree(ipt->tcfi_tname);
	kfree(ipt->tcfi_t);
	tcf_hash_destroy(&ipt->common, &ipt_hash_info);

	return ACT_P_DELETED;
}
/* Netlink policy handed to nla_parse_nested() in tcf_ipt_init(). */
static const struct nla_policy ipt_policy[TCA_IPT_MAX + 1] = {
[TCA_IPT_TABLE] = { .type = NLA_STRING, .len = IFNAMSIZ },
[TCA_IPT_HOOK] = { .type = NLA_U32 },
[TCA_IPT_INDEX] = { .type = NLA_U32 },
[TCA_IPT_TARG] = { .len = sizeof(struct ipt_entry_target) },
};
/*
 * tcf_ipt_init - create an ipt action or replace the target of an existing one
 * @nla:  nested netlink attribute carrying the TCA_IPT_* options
 * @est:  passed through to tcf_hash_create()
 * @a:    generic action handle passed to the tcf_hash_* helpers
 * @ovr:  non-zero if an already existing action may be overwritten
 * @bind: passed through to the tcf_hash_* helpers
 *
 * Returns ACT_P_CREATED when a new instance was created, 0 when an existing
 * instance was updated, or a negative errno (-EINVAL for missing or
 * malformed attributes, -EEXIST when the action exists and @ovr is zero,
 * -ENOMEM on allocation failure, or the error from ipt_init_target()).
 */
static int tcf_ipt_init(struct nlattr *nla, struct nlattr *est,
			struct tc_action *a, int ovr, int bind)
{
	struct nlattr *tb[TCA_IPT_MAX + 1];
	struct tcf_ipt *ipt;
	struct tcf_common *pc;
	struct ipt_entry_target *td, *t;
	char *tname;
	int ret = 0, err;
	u32 hook = 0;
	u32 index = 0;

	if (nla == NULL)
		return -EINVAL;

	err = nla_parse_nested(tb, TCA_IPT_MAX, nla, ipt_policy);
	if (err < 0)
		return err;

	if (tb[TCA_IPT_HOOK] == NULL)
		return -EINVAL;
	if (tb[TCA_IPT_TARG] == NULL)
		return -EINVAL;

	/* The attribute must hold at least as many bytes as the target claims. */
	td = (struct ipt_entry_target *)nla_data(tb[TCA_IPT_TARG]);
	if (nla_len(tb[TCA_IPT_TARG]) < td->u.target_size)
		return -EINVAL;

	if (tb[TCA_IPT_INDEX] != NULL)
		index = nla_get_u32(tb[TCA_IPT_INDEX]);

	pc = tcf_hash_check(index, a, bind, &ipt_hash_info);
	if (!pc) {
		pc = tcf_hash_create(index, est, a, sizeof(*ipt), bind,
				     &ipt_idx_gen, &ipt_hash_info);
		if (IS_ERR(pc))
			return PTR_ERR(pc);
		ret = ACT_P_CREATED;
	} else {
		if (!ovr) {
			tcf_ipt_release(to_ipt(pc), bind);
			return -EEXIST;
		}
	}
	ipt = to_ipt(pc);

	hook = nla_get_u32(tb[TCA_IPT_HOOK]);

	err = -ENOMEM;
	tname = kmalloc(IFNAMSIZ, GFP_KERNEL);
	if (unlikely(!tname))
		goto err1;
	/* Fall back to the "mangle" table when none is given or the name is too long. */
	if (tb[TCA_IPT_TABLE] == NULL ||
	    nla_strlcpy(tname, tb[TCA_IPT_TABLE], IFNAMSIZ) >= IFNAMSIZ)
		strcpy(tname, "mangle");

	t = kmemdup(td, td->u.target_size, GFP_KERNEL);
	if (unlikely(!t))
		goto err2;

	err = ipt_init_target(t, tname, hook);
	if (err < 0)
		goto err3;

	spin_lock_bh(&ipt->tcf_lock);
	if (ret != ACT_P_CREATED) {
		/* Overwriting: release the previously installed target first. */
		ipt_destroy_target(ipt->tcfi_t);
		kfree(ipt->tcfi_tname);
		kfree(ipt->tcfi_t);
	}
	ipt->tcfi_tname = tname;
	ipt->tcfi_t     = t;
	ipt->tcfi_hook  = hook;
	spin_unlock_bh(&ipt->tcf_lock);

	if (ret == ACT_P_CREATED)
		tcf_hash_insert(pc, &ipt_hash_info);
	return ret;

err3:
	kfree(t);
err2:
	kfree(tname);
err1:
	/*
	 * Only free the entry if this call allocated it. An entry returned
	 * by tcf_hash_check() already existed before this call and still
	 * owns its previous target, so it must not be freed here.
	 */
	if (ret == ACT_P_CREATED)
		kfree(pc);
	return err;
}
/*
 * Cleanup callback: hand the instance attached to @a to tcf_ipt_release()
 * and return its result.
 */
static int tcf_ipt_cleanup(struct tc_action *a, int bind)
{
	return tcf_ipt_release(a->priv, bind);
}
/*
 * Per-packet handler: run the configured xtables target on @skb and map
 * its netfilter verdict to a TC_ACT_* result. Counters and the last-use
 * timestamp are updated under tcf_lock.
 *
 * Returns TC_ACT_UNSPEC if a cloned skb cannot be given a private header
 * copy, otherwise the mapped verdict.
 */
static int tcf_ipt(struct sk_buff *skb, struct tc_action *a,
		   struct tcf_result *res)
{
	struct tcf_ipt *ipt = a->priv;
	struct xt_target_param par;
	int verdict = 0;
	int tc_result = 0;

	if (skb_cloned(skb) && pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
		return TC_ACT_UNSPEC;

	spin_lock(&ipt->tcf_lock);

	ipt->tcf_tm.lastuse = jiffies;
	ipt->tcf_bstats.bytes += qdisc_pkt_len(skb);
	ipt->tcf_bstats.packets++;

	/*
	 * Original author's note: both the in and out device should really
	 * be considered here; only the in device is supplied for now, and
	 * this API seems to have changed from earlier kernels.
	 */
	par.in       = skb->dev;
	par.out      = NULL;
	par.hooknum  = ipt->tcfi_hook;
	par.target   = ipt->tcfi_t->u.kernel.target;
	par.targinfo = ipt->tcfi_t->data;

	verdict = par.target->target(skb, &par);

	if (verdict == NF_ACCEPT) {
		tc_result = TC_ACT_OK;
	} else if (verdict == NF_DROP) {
		tc_result = TC_ACT_SHOT;
		ipt->tcf_qstats.drops++;
	} else if (verdict == IPT_CONTINUE) {
		tc_result = TC_ACT_PIPE;
	} else {
		/* Unknown verdict: log it (rate limited) and let the packet pass. */
		if (net_ratelimit())
			printk("Bogus netfilter code %d assume ACCEPT\n", verdict);
		tc_result = TC_POLICE_OK;
	}

	spin_unlock(&ipt->tcf_lock);

	return tc_result;
}
static int tcf_ipt_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref)
{
unsigned char *b = skb_tail_pointer(skb);
struct tcf_ipt *ipt = a->priv;
struct ipt_entry_target *t;
struct tcf_t tm;
struct tc_cnt c;
/* for simple targets kernel size == user size
** user name = target name
** for foolproof you need to not assume this
*/
t = kmemdup(ipt->tcfi_t, ipt->tcfi_t->u.user.target_size, GFP_ATOMIC);
if (unlikely(!t))
goto nla_put_failure;
c.bindcnt = ipt->tcf_bindcnt - bind;
c.refcnt = ipt->tcf_refcnt - ref;
strcpy(t->u.user.name, ipt->tcfi_t->u.kernel.target->name);
NLA_PUT(skb, TCA_IPT_TARG, ipt->tcfi_t->u.user.target_size, t);
NLA_PUT_U32(skb, TCA_IPT_INDEX, ipt->tcf_index);
NLA_PUT_U32(skb, TCA_IPT_HOOK, ipt->tcfi_hook);
NLA_PUT(skb, TCA_IPT_CNT, sizeof(struct tc_cnt), &c);
NLA_PUT_STRING(skb, TCA_IPT_TABLE, ipt->tcfi_tname);
tm.install = jiffies_to_clock_t(jiffies - ipt->tcf_tm.install);
tm.lastuse = jiffies_to_clock_t(jiffies - ipt->tcf_tm.lastuse);
tm.expires = jiffies_to_clock_t(ipt->tcf_tm.expires);
NLA_PUT(skb, TCA_IPT_TM, sizeof (tm), &tm);
kfree(t);
return skb->len;
nla_put_failure:
nlmsg_trim(skb, b);
kfree(t);
return -1;
}
/*
 * Operations table for the "ipt" action kind; registered by
 * ipt_init_module() and unregistered by ipt_cleanup_module().
 */
static struct tc_action_ops act_ipt_ops = {
.kind = "ipt",
.hinfo = &ipt_hash_info,
.type = TCA_ACT_IPT,
.capab = TCA_CAP_NONE,
.owner = THIS_MODULE,
.act = tcf_ipt,
.dump = tcf_ipt_dump,
.cleanup = tcf_ipt_cleanup,
.lookup = tcf_hash_search,
.init = tcf_ipt_init,
.walk = tcf_generic_walker
};
MODULE_AUTHOR("Jamal Hadi Salim(2002-4)");
MODULE_DESCRIPTION("Iptables target actions");
MODULE_LICENSE("GPL");
/* Module entry point: register the ipt operations and return the result. */
static int __init ipt_init_module(void)
{
return tcf_register_action(&act_ipt_ops);
}
/* Module exit point: unregister the ipt operations. */
static void __exit ipt_cleanup_module(void)
{
tcf_unregister_action(&act_ipt_ops);
}
module_init(ipt_init_module);
module_exit(ipt_cleanup_module);

View File

@@ -0,0 +1,260 @@
/*
* net/sched/mirred.c packet mirroring and redirect actions
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
* Authors: Jamal Hadi Salim (2002-4)
*
* TODO: Add ingress support (and socket redirect support)
*
*/
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/module.h>
#include <linux/init.h>
#include <net/net_namespace.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
#include <linux/tc_act/tc_mirred.h>
#include <net/tc_act/tc_mirred.h>
#include <linux/if_arp.h>
/* Stored as .hmask below; the table has MIRRED_TAB_MASK + 1 entries. */
#define MIRRED_TAB_MASK 7
/* Table of mirred action instances, published through mirred_hash_info.htab. */
static struct tcf_common *tcf_mirred_ht[MIRRED_TAB_MASK + 1];
/* Index generator state handed to tcf_hash_create(). */
static u32 mirred_idx_gen;
/* Lock published through mirred_hash_info.lock. */
static DEFINE_RWLOCK(mirred_lock);
/* Table, mask and lock bundle passed to the tcf_hash_* helpers in this file. */
static struct tcf_hashinfo mirred_hash_info = {
.htab = tcf_mirred_ht,
.hmask = MIRRED_TAB_MASK,
.lock = &mirred_lock,
};
/*
 * Drop one reference (and, if @bind is set, one binding) from @m. Once the
 * bind count is zero and the reference count is no longer positive, the
 * held device reference is dropped and the instance is destroyed.
 *
 * Returns 1 if the instance was destroyed, 0 otherwise (including when @m
 * is NULL).
 */
static inline int tcf_mirred_release(struct tcf_mirred *m, int bind)
{
	if (!m)
		return 0;

	if (bind)
		m->tcf_bindcnt--;
	m->tcf_refcnt--;

	/* Still bound or referenced somewhere: keep the instance alive. */
	if (m->tcf_bindcnt || m->tcf_refcnt > 0)
		return 0;

	dev_put(m->tcfm_dev);
	tcf_hash_destroy(&m->common, &mirred_hash_info);
	return 1;
}
/* Netlink policy: TCA_MIRRED_PARMS carries a struct tc_mirred. */
static const struct nla_policy mirred_policy[TCA_MIRRED_MAX + 1] = {
[TCA_MIRRED_PARMS] = { .len = sizeof(struct tc_mirred) },
};
/*
 * Create a mirred action instance or update an existing one from netlink.
 *
 * @nla:  nested attributes; TCA_MIRRED_PARMS (struct tc_mirred) is required
 * @est:  passed through to tcf_hash_create()
 * @a:    action being initialised
 * @ovr:  non-zero allows an existing instance to be overwritten
 * @bind: passed through to the hash helpers / release
 *
 * Returns ACT_P_CREATED when a new instance was inserted, 0 when an existing
 * one was updated, or a negative errno (-EINVAL, -ENODEV, -EEXIST, or the
 * error from parsing / tcf_hash_create()).
 */
static int tcf_mirred_init(struct nlattr *nla, struct nlattr *est,
struct tc_action *a, int ovr, int bind)
{
struct nlattr *tb[TCA_MIRRED_MAX + 1];
struct tc_mirred *parm;
struct tcf_mirred *m;
struct tcf_common *pc;
struct net_device *dev = NULL;
int ret = 0, err;
int ok_push = 0;
if (nla == NULL)
return -EINVAL;
err = nla_parse_nested(tb, TCA_MIRRED_MAX, nla, mirred_policy);
if (err < 0)
return err;
if (tb[TCA_MIRRED_PARMS] == NULL)
return -EINVAL;
parm = nla_data(tb[TCA_MIRRED_PARMS]);
/* A non-zero ifindex names the target device (looked up in init_net). */
if (parm->ifindex) {
dev = __dev_get_by_index(&init_net, parm->ifindex);
if (dev == NULL)
return -ENODEV;
/*
 * ok_push is consulted by tcf_mirred() to decide whether to push
 * hard_header_len bytes back onto the clone; it is disabled for the
 * device types listed here.
 */
switch (dev->type) {
case ARPHRD_TUNNEL:
case ARPHRD_TUNNEL6:
case ARPHRD_SIT:
case ARPHRD_IPGRE:
case ARPHRD_VOID:
case ARPHRD_NONE:
ok_push = 0;
break;
default:
ok_push = 1;
break;
}
}
pc = tcf_hash_check(parm->index, a, bind, &mirred_hash_info);
if (!pc) {
/* A brand new instance must name a target device. */
if (!parm->ifindex)
return -EINVAL;
pc = tcf_hash_create(parm->index, est, a, sizeof(*m), bind,
&mirred_idx_gen, &mirred_hash_info);
if (IS_ERR(pc))
return PTR_ERR(pc);
ret = ACT_P_CREATED;
} else {
/* Existing instance: only touch it when overwrite was requested. */
if (!ovr) {
tcf_mirred_release(to_mirred(pc), bind);
return -EEXIST;
}
}
m = to_mirred(pc);
spin_lock_bh(&m->tcf_lock);
m->tcf_action = parm->action;
m->tcfm_eaction = parm->eaction;
if (parm->ifindex) {
m->tcfm_ifindex = parm->ifindex;
/* On update, give back the reference held on the previous device. */
if (ret != ACT_P_CREATED)
dev_put(m->tcfm_dev);
m->tcfm_dev = dev;
dev_hold(dev);
m->tcfm_ok_push = ok_push;
}
spin_unlock_bh(&m->tcf_lock);
/* New instances are inserted only after they are fully configured. */
if (ret == ACT_P_CREATED)
tcf_hash_insert(pc, &mirred_hash_info);
return ret;
}
/*
 * Cleanup callback: release the instance attached to @a, if any.
 * Returns the result of tcf_mirred_release(), or 0 when nothing is attached.
 */
static int tcf_mirred_cleanup(struct tc_action *a, int bind)
{
	struct tcf_mirred *mirred = a->priv;

	return mirred ? tcf_mirred_release(mirred, bind) : 0;
}
/*
 * Datapath callback: clone @skb and transmit the clone on the configured
 * device.
 *
 * Returns the configured action on success. If the device is down, the
 * clone cannot be allocated, or the configured eaction is unknown, the
 * failure is accounted (overlimits plus byte/packet counters) and
 * TC_ACT_SHOT is returned.
 */
static int tcf_mirred(struct sk_buff *skb, struct tc_action *a,
		      struct tcf_result *res)
{
	struct tcf_mirred *m = a->priv;
	struct sk_buff *copy = NULL;
	struct net_device *dev;
	u32 at = G_TC_AT(skb->tc_verd);

	spin_lock(&m->tcf_lock);
	dev = m->tcfm_dev;
	m->tcf_tm.lastuse = jiffies;

	if (!(dev->flags&IFF_UP) ) {
		if (net_ratelimit())
			printk("mirred to Houston: device %s is gone!\n",
			       dev->name);
		goto fail;
	}

	copy = skb_act_clone(skb, GFP_ATOMIC);
	if (copy == NULL)
		goto fail;

	if (m->tcfm_eaction != TCA_EGRESS_MIRROR &&
	    m->tcfm_eaction != TCA_EGRESS_REDIR) {
		if (net_ratelimit())
			printk("tcf_mirred unknown action %d\n",
			       m->tcfm_eaction);
		goto fail;
	}

	m->tcf_bstats.bytes += qdisc_pkt_len(copy);
	m->tcf_bstats.packets++;

	/* Not seen at egress: optionally re-expose the link-layer header. */
	if (!(at & AT_EGRESS) && m->tcfm_ok_push)
		skb_push(copy, copy->dev->hard_header_len);

	/* mirror is always swallowed */
	if (m->tcfm_eaction != TCA_EGRESS_MIRROR)
		copy->tc_verd = SET_TC_FROM(copy->tc_verd, at);

	copy->dev = dev;
	copy->iif = skb->dev->ifindex;
	dev_queue_xmit(copy);
	spin_unlock(&m->tcf_lock);
	return m->tcf_action;

fail:
	if (copy != NULL)
		kfree_skb(copy);
	m->tcf_qstats.overlimits++;
	m->tcf_bstats.bytes += qdisc_pkt_len(skb);
	m->tcf_bstats.packets++;
	spin_unlock(&m->tcf_lock);
	/* should we be asking for packet to be dropped?
	 * may make sense for redirect case only
	 */
	return TC_ACT_SHOT;
}
static int tcf_mirred_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref)
{
unsigned char *b = skb_tail_pointer(skb);
struct tcf_mirred *m = a->priv;
struct tc_mirred opt = {
.index = m->tcf_index,
.action = m->tcf_action,
.refcnt = m->tcf_refcnt - ref,
.bindcnt = m->tcf_bindcnt - bind,
.eaction = m->tcfm_eaction,
.ifindex = m->tcfm_ifindex,
};
struct tcf_t t;
NLA_PUT(skb, TCA_MIRRED_PARMS, sizeof(opt), &opt);
t.install = jiffies_to_clock_t(jiffies - m->tcf_tm.install);
t.lastuse = jiffies_to_clock_t(jiffies - m->tcf_tm.lastuse);
t.expires = jiffies_to_clock_t(m->tcf_tm.expires);
NLA_PUT(skb, TCA_MIRRED_TM, sizeof(t), &t);
return skb->len;
nla_put_failure:
nlmsg_trim(skb, b);
return -1;
}
/* Operations table for the "mirred" action; registered at module load. */
static struct tc_action_ops act_mirred_ops = {
	/* identity */
	.kind		= "mirred",
	.type		= TCA_ACT_MIRRED,
	.capab		= TCA_CAP_NONE,
	.owner		= THIS_MODULE,
	/* instance table access */
	.hinfo		= &mirred_hash_info,
	.lookup		= tcf_hash_search,
	.walk		= tcf_generic_walker,
	/* per-instance callbacks */
	.init		= tcf_mirred_init,
	.cleanup	= tcf_mirred_cleanup,
	.act		= tcf_mirred,
	.dump		= tcf_mirred_dump
};
MODULE_AUTHOR("Jamal Hadi Salim(2002)");
MODULE_DESCRIPTION("Device Mirror/redirect actions");
MODULE_LICENSE("GPL");
/* Module load hook: announce the module, then register the action ops. */
static int __init mirred_init_module(void)
{
	int err;

	printk("Mirror/redirect action on\n");
	err = tcf_register_action(&act_mirred_ops);
	return err;
}

/* Module unload hook: withdraw the action ops. */
static void __exit mirred_cleanup_module(void)
{
	tcf_unregister_action(&act_mirred_ops);
}

module_init(mirred_init_module);
module_exit(mirred_cleanup_module);

318
kernel/net/sched/act_nat.c Normal file
View File

@@ -0,0 +1,318 @@
/*
* Stateless NAT actions
*
* Copyright (c) 2007 Herbert Xu <herbert@gondor.apana.org.au>
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License as published by the Free
* Software Foundation; either version 2 of the License, or (at your option)
* any later version.
*/
#include <linux/errno.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/netfilter.h>
#include <linux/rtnetlink.h>
#include <linux/skbuff.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/string.h>
#include <linux/tc_act/tc_nat.h>
#include <net/act_api.h>
#include <net/icmp.h>
#include <net/ip.h>
#include <net/netlink.h>
#include <net/tc_act/tc_nat.h>
#include <net/tcp.h>
#include <net/udp.h>
/* Bucket mask for the nat instance table (NAT_TAB_MASK + 1 buckets). */
#define NAT_TAB_MASK 15
static struct tcf_common *tcf_nat_ht[NAT_TAB_MASK + 1];
/* Handed to tcf_hash_create() as the index generator for new instances. */
static u32 nat_idx_gen;
static DEFINE_RWLOCK(nat_lock);
/* Table, mask and lock bundled for the tcf_hash_* helpers used below. */
static struct tcf_hashinfo nat_hash_info = {
.htab = tcf_nat_ht,
.hmask = NAT_TAB_MASK,
.lock = &nat_lock,
};
/* Netlink policy: TCA_NAT_PARMS carries a struct tc_nat. */
static const struct nla_policy nat_policy[TCA_NAT_MAX + 1] = {
[TCA_NAT_PARMS] = { .len = sizeof(struct tc_nat) },
};
/*
 * Create a nat action instance or overwrite an existing one from netlink.
 *
 * TCA_NAT_PARMS (struct tc_nat) is mandatory. An existing instance is only
 * modified when @ovr is set; otherwise it is released and -EEXIST returned.
 *
 * Returns ACT_P_CREATED for a new instance, 0 for an update, or a negative
 * errno.
 */
static int tcf_nat_init(struct nlattr *nla, struct nlattr *est,
			struct tc_action *a, int ovr, int bind)
{
	struct nlattr *tb[TCA_NAT_MAX + 1];
	struct tcf_common *pc;
	struct tc_nat *parm;
	struct tcf_nat *p;
	int created = 0;
	int err;

	if (nla == NULL)
		return -EINVAL;

	err = nla_parse_nested(tb, TCA_NAT_MAX, nla, nat_policy);
	if (err < 0)
		return err;

	if (tb[TCA_NAT_PARMS] == NULL)
		return -EINVAL;
	parm = nla_data(tb[TCA_NAT_PARMS]);

	pc = tcf_hash_check(parm->index, a, bind, &nat_hash_info);
	if (pc) {
		if (!ovr) {
			tcf_hash_release(pc, bind, &nat_hash_info);
			return -EEXIST;
		}
	} else {
		pc = tcf_hash_create(parm->index, est, a, sizeof(*p), bind,
				     &nat_idx_gen, &nat_hash_info);
		if (IS_ERR(pc))
			return PTR_ERR(pc);
		created = 1;
	}
	p = to_tcf_nat(pc);

	/* Publish the new translation parameters atomically w.r.t. tcf_nat(). */
	spin_lock_bh(&p->tcf_lock);
	p->old_addr = parm->old_addr;
	p->new_addr = parm->new_addr;
	p->mask = parm->mask;
	p->flags = parm->flags;
	p->tcf_action = parm->action;
	spin_unlock_bh(&p->tcf_lock);

	if (!created)
		return 0;

	tcf_hash_insert(pc, &nat_hash_info);
	return ACT_P_CREATED;
}
/* Cleanup callback: hand the instance attached to @a to tcf_hash_release(). */
static int tcf_nat_cleanup(struct tc_action *a, int bind)
{
	struct tcf_nat *nat = a->priv;
	struct tcf_common *common = &nat->common;

	return tcf_hash_release(common, bind, &nat_hash_info);
}
/*
 * Datapath callback: stateless rewrite of the IPv4 source (egress flag set)
 * or destination address when it matches old_addr/mask, with the IP and
 * transport checksums patched accordingly. For ICMP destination-unreachable,
 * time-exceeded and parameter-problem messages the address inside the
 * embedded IP header is rewritten as well.
 *
 * Returns the configured action, or TC_ACT_SHOT (and counts a drop) when the
 * packet is too short or cannot be made writable.
 *
 * Compared with the previous version:
 *  - when the outer address does not match, TCP/UDP (and fragments) are left
 *    completely alone instead of having their checksum "adjusted" for a
 *    rewrite that never happened; only ICMP still goes on to inspect the
 *    embedded header;
 *  - the embedded IP header is only required for the ICMP error types that
 *    actually carry one, so short ICMP messages of other types are no longer
 *    dropped;
 *  - every length/writability check is offset by skb_network_offset(), since
 *    those helpers measure from skb->data, which need not be the IP header.
 */
static int tcf_nat(struct sk_buff *skb, struct tc_action *a,
		   struct tcf_result *res)
{
	struct tcf_nat *p = a->priv;
	struct iphdr *iph;
	__be32 old_addr;
	__be32 new_addr;
	__be32 mask;
	__be32 addr;
	int egress;
	int action;
	int ihl;
	int noff;

	/* Snapshot the configuration so the packet work runs unlocked. */
	spin_lock(&p->tcf_lock);

	p->tcf_tm.lastuse = jiffies;
	old_addr = p->old_addr;
	new_addr = p->new_addr;
	mask = p->mask;
	egress = p->flags & TCA_NAT_FLAG_EGRESS;
	action = p->tcf_action;

	p->tcf_bstats.bytes += qdisc_pkt_len(skb);
	p->tcf_bstats.packets++;

	spin_unlock(&p->tcf_lock);

	if (unlikely(action == TC_ACT_SHOT))
		goto drop;

	noff = skb_network_offset(skb);
	if (!pskb_may_pull(skb, sizeof(*iph) + noff))
		goto drop;

	iph = ip_hdr(skb);

	if (egress)
		addr = iph->saddr;
	else
		addr = iph->daddr;

	if (!((old_addr ^ addr) & mask)) {
		if (skb_cloned(skb) &&
		    !skb_clone_writable(skb, sizeof(*iph) + noff) &&
		    pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
			goto drop;

		new_addr &= mask;
		new_addr |= addr & ~mask;

		/* Rewrite IP header */
		iph = ip_hdr(skb);
		if (egress)
			iph->saddr = new_addr;
		else
			iph->daddr = new_addr;

		csum_replace4(&iph->check, addr, new_addr);
	} else if ((iph->frag_off & htons(IP_OFFSET)) ||
		   iph->protocol != IPPROTO_ICMP) {
		/* Nothing was rewritten, so no transport checksum to fix. */
		goto out;
	}

	ihl = iph->ihl * 4;

	/* It would be nice to share code with stateful NAT. */
	switch (iph->frag_off & htons(IP_OFFSET) ? 0 : iph->protocol) {
	case IPPROTO_TCP:
	{
		struct tcphdr *tcph;

		if (!pskb_may_pull(skb, ihl + sizeof(*tcph) + noff) ||
		    (skb_cloned(skb) &&
		     !skb_clone_writable(skb, ihl + sizeof(*tcph) + noff) &&
		     pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
			goto drop;

		tcph = (void *)(skb_network_header(skb) + ihl);
		inet_proto_csum_replace4(&tcph->check, skb, addr, new_addr, 1);
		break;
	}
	case IPPROTO_UDP:
	{
		struct udphdr *udph;

		if (!pskb_may_pull(skb, ihl + sizeof(*udph) + noff) ||
		    (skb_cloned(skb) &&
		     !skb_clone_writable(skb, ihl + sizeof(*udph) + noff) &&
		     pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
			goto drop;

		udph = (void *)(skb_network_header(skb) + ihl);
		/* A zero UDP checksum means "none"; keep it that way. */
		if (udph->check || skb->ip_summed == CHECKSUM_PARTIAL) {
			inet_proto_csum_replace4(&udph->check, skb, addr,
						 new_addr, 1);
			if (!udph->check)
				udph->check = CSUM_MANGLED_0;
		}
		break;
	}
	case IPPROTO_ICMP:
	{
		struct icmphdr *icmph;

		/* Only the ICMP header itself is needed to read the type. */
		if (!pskb_may_pull(skb, ihl + sizeof(*icmph) + noff))
			goto drop;

		icmph = (void *)(skb_network_header(skb) + ihl);

		if ((icmph->type != ICMP_DEST_UNREACH) &&
		    (icmph->type != ICMP_TIME_EXCEEDED) &&
		    (icmph->type != ICMP_PARAMETERPROB))
			break;

		/* These error types embed the offending IP header. */
		if (!pskb_may_pull(skb, ihl + sizeof(*icmph) + sizeof(*iph) +
					noff))
			goto drop;

		icmph = (void *)(skb_network_header(skb) + ihl);
		iph = (void *)(icmph + 1);
		/* The embedded packet travelled the other way: swap roles. */
		if (egress)
			addr = iph->daddr;
		else
			addr = iph->saddr;

		if ((old_addr ^ addr) & mask)
			break;

		if (skb_cloned(skb) &&
		    !skb_clone_writable(skb, ihl + sizeof(*icmph) +
					     sizeof(*iph) + noff) &&
		    pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
			goto drop;

		/* Headers may have moved; recompute both pointers. */
		icmph = (void *)(skb_network_header(skb) + ihl);
		iph = (void *)(icmph + 1);

		new_addr &= mask;
		new_addr |= addr & ~mask;

		/* XXX Fix up the inner checksums. */
		if (egress)
			iph->daddr = new_addr;
		else
			iph->saddr = new_addr;

		inet_proto_csum_replace4(&icmph->checksum, skb, addr, new_addr,
					 0);
		break;
	}
	default:
		break;
	}

out:
	return action;

drop:
	spin_lock(&p->tcf_lock);
	p->tcf_qstats.drops++;
	spin_unlock(&p->tcf_lock);
	return TC_ACT_SHOT;
}
static int tcf_nat_dump(struct sk_buff *skb, struct tc_action *a,
int bind, int ref)
{
unsigned char *b = skb_tail_pointer(skb);
struct tcf_nat *p = a->priv;
struct tc_nat opt = {
.old_addr = p->old_addr,
.new_addr = p->new_addr,
.mask = p->mask,
.flags = p->flags,
.index = p->tcf_index,
.action = p->tcf_action,
.refcnt = p->tcf_refcnt - ref,
.bindcnt = p->tcf_bindcnt - bind,
};
struct tcf_t t;
NLA_PUT(skb, TCA_NAT_PARMS, sizeof(opt), &opt);
t.install = jiffies_to_clock_t(jiffies - p->tcf_tm.install);
t.lastuse = jiffies_to_clock_t(jiffies - p->tcf_tm.lastuse);
t.expires = jiffies_to_clock_t(p->tcf_tm.expires);
NLA_PUT(skb, TCA_NAT_TM, sizeof(t), &t);
return skb->len;
nla_put_failure:
nlmsg_trim(skb, b);
return -1;
}
/* Operations table for the "nat" action; registered at module load. */
static struct tc_action_ops act_nat_ops = {
	/* identity */
	.kind		= "nat",
	.type		= TCA_ACT_NAT,
	.capab		= TCA_CAP_NONE,
	.owner		= THIS_MODULE,
	/* instance table access */
	.hinfo		= &nat_hash_info,
	.lookup		= tcf_hash_search,
	.walk		= tcf_generic_walker,
	/* per-instance callbacks */
	.init		= tcf_nat_init,
	.cleanup	= tcf_nat_cleanup,
	.act		= tcf_nat,
	.dump		= tcf_nat_dump
};
MODULE_DESCRIPTION("Stateless NAT actions");
MODULE_LICENSE("GPL");
/* Module load hook: register the nat action ops and report the result. */
static int __init nat_init_module(void)
{
	int err;

	err = tcf_register_action(&act_nat_ops);
	return err;
}

/* Module unload hook: withdraw the nat action ops. */
static void __exit nat_cleanup_module(void)
{
	tcf_unregister_action(&act_nat_ops);
}

module_init(nat_init_module);
module_exit(nat_cleanup_module);

View File

@@ -0,0 +1,260 @@
/*
* net/sched/pedit.c Generic packet editor
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
* Authors: Jamal Hadi Salim (2002-4)
*/
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/module.h>
#include <linux/init.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
#include <linux/tc_act/tc_pedit.h>
#include <net/tc_act/tc_pedit.h>
/* Bucket mask for the pedit instance table (PEDIT_TAB_MASK + 1 buckets). */
#define PEDIT_TAB_MASK 15
static struct tcf_common *tcf_pedit_ht[PEDIT_TAB_MASK + 1];
/* Handed to tcf_hash_create() as the index generator for new instances. */
static u32 pedit_idx_gen;
static DEFINE_RWLOCK(pedit_lock);
/* Table, mask and lock bundled for the tcf_hash_* helpers used below. */
static struct tcf_hashinfo pedit_hash_info = {
.htab = tcf_pedit_ht,
.hmask = PEDIT_TAB_MASK,
.lock = &pedit_lock,
};
/*
 * Netlink policy: TCA_PEDIT_PARMS starts with a struct tc_pedit; the key
 * array that follows it is length-checked separately in tcf_pedit_init().
 */
static const struct nla_policy pedit_policy[TCA_PEDIT_MAX + 1] = {
[TCA_PEDIT_PARMS] = { .len = sizeof(struct tc_pedit) },
};
/*
 * Create a pedit action instance or overwrite an existing one from netlink.
 *
 * TCA_PEDIT_PARMS must hold a struct tc_pedit followed by parm->nkeys keys.
 * An existing instance is only modified when @ovr is set; otherwise it is
 * released and -EEXIST returned.
 *
 * Returns ACT_P_CREATED for a new instance, 0 for an update, or a negative
 * errno.
 *
 * Compared with the previous version:
 *  - a request with zero keys is rejected up front for replace as well as
 *    create. Previously a replace with nkeys == 0 installed a zero-size key
 *    buffer, and a later replace then skipped reallocation (the old count
 *    was 0) and copied keys into that buffer;
 *  - the key buffer is allocated before the action is created, so an
 *    allocation failure never leaves a half-built action (already handed
 *    @est by tcf_hash_create()) to be torn down with a bare kfree().
 */
static int tcf_pedit_init(struct nlattr *nla, struct nlattr *est,
			  struct tc_action *a, int ovr, int bind)
{
	struct nlattr *tb[TCA_PEDIT_MAX + 1];
	struct tc_pedit *parm;
	int ret = 0, err;
	struct tcf_pedit *p;
	struct tcf_common *pc;
	struct tc_pedit_key *keys = NULL;
	int ksize;

	if (nla == NULL)
		return -EINVAL;

	err = nla_parse_nested(tb, TCA_PEDIT_MAX, nla, pedit_policy);
	if (err < 0)
		return err;

	if (tb[TCA_PEDIT_PARMS] == NULL)
		return -EINVAL;
	parm = nla_data(tb[TCA_PEDIT_PARMS]);

	/* An editor without keys is meaningless for create and replace alike. */
	if (!parm->nkeys)
		return -EINVAL;

	/* The attribute must really contain all the keys it announces. */
	ksize = parm->nkeys * sizeof(struct tc_pedit_key);
	if (nla_len(tb[TCA_PEDIT_PARMS]) < sizeof(*parm) + ksize)
		return -EINVAL;

	pc = tcf_hash_check(parm->index, a, bind, &pedit_hash_info);
	if (!pc) {
		/* Allocate first so a failure needs no action teardown. */
		keys = kmalloc(ksize, GFP_KERNEL);
		if (keys == NULL)
			return -ENOMEM;

		pc = tcf_hash_create(parm->index, est, a, sizeof(*p), bind,
				     &pedit_idx_gen, &pedit_hash_info);
		if (IS_ERR(pc)) {
			kfree(keys);
			return PTR_ERR(pc);
		}
		p = to_pedit(pc);
		ret = ACT_P_CREATED;
	} else {
		p = to_pedit(pc);
		if (!ovr) {
			tcf_hash_release(pc, bind, &pedit_hash_info);
			return -EEXIST;
		}
		/* Reuse the current buffer only when the key count matches. */
		if (p->tcfp_nkeys != parm->nkeys) {
			keys = kmalloc(ksize, GFP_KERNEL);
			if (keys == NULL)
				return -ENOMEM;
		}
	}

	spin_lock_bh(&p->tcf_lock);
	p->tcfp_flags = parm->flags;
	p->tcf_action = parm->action;
	if (keys) {
		kfree(p->tcfp_keys);
		p->tcfp_keys = keys;
		p->tcfp_nkeys = parm->nkeys;
	}
	memcpy(p->tcfp_keys, parm->keys, ksize);
	spin_unlock_bh(&p->tcf_lock);

	if (ret == ACT_P_CREATED)
		tcf_hash_insert(pc, &pedit_hash_info);
	return ret;
}
/*
 * Cleanup callback: release the instance attached to @a and, when the
 * release reports that the instance is gone, free its key array too.
 *
 * Returns 1 if the instance was destroyed, 0 otherwise.
 */
static int tcf_pedit_cleanup(struct tc_action *a, int bind)
{
	struct tcf_pedit *p = a->priv;
	struct tc_pedit_key *saved_keys;

	if (!p)
		return 0;

	/* Read the pointer before the release call, as the original does. */
	saved_keys = p->tcfp_keys;
	if (!tcf_hash_release(&p->common, bind, &pedit_hash_info))
		return 0;

	kfree(saved_keys);
	return 1;
}
static int tcf_pedit(struct sk_buff *skb, struct tc_action *a,
struct tcf_result *res)
{
struct tcf_pedit *p = a->priv;
int i, munged = 0;
u8 *pptr;
if (!(skb->tc_verd & TC_OK2MUNGE)) {
/* should we set skb->cloned? */
if (pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) {
return p->tcf_action;
}
}
pptr = skb_network_header(skb);
spin_lock(&p->tcf_lock);
p->tcf_tm.lastuse = jiffies;
if (p->tcfp_nkeys > 0) {
struct tc_pedit_key *tkey = p->tcfp_keys;
for (i = p->tcfp_nkeys; i > 0; i--, tkey++) {
u32 *ptr;
int offset = tkey->off;
if (tkey->offmask) {
if (skb->len > tkey->at) {
char *j = pptr + tkey->at;
offset += ((*j & tkey->offmask) >>
tkey->shift);
} else {
goto bad;
}
}
if (offset % 4) {
printk("offset must be on 32 bit boundaries\n");
goto bad;
}
if (offset > 0 && offset > skb->len) {
printk("offset %d cant exceed pkt length %d\n",
offset, skb->len);
goto bad;
}
ptr = (u32 *)(pptr+offset);
/* just do it, baby */
*ptr = ((*ptr & tkey->mask) ^ tkey->val);
munged++;
}
if (munged)
skb->tc_verd = SET_TC_MUNGED(skb->tc_verd);
goto done;
} else {
printk("pedit BUG: index %d\n", p->tcf_index);
}
bad:
p->tcf_qstats.overlimits++;
done:
p->tcf_bstats.bytes += qdisc_pkt_len(skb);
p->tcf_bstats.packets++;
spin_unlock(&p->tcf_lock);
return p->tcf_action;
}
static int tcf_pedit_dump(struct sk_buff *skb, struct tc_action *a,
int bind, int ref)
{
unsigned char *b = skb_tail_pointer(skb);
struct tcf_pedit *p = a->priv;
struct tc_pedit *opt;
struct tcf_t t;
int s;
s = sizeof(*opt) + p->tcfp_nkeys * sizeof(struct tc_pedit_key);
/* netlink spinlocks held above us - must use ATOMIC */
opt = kzalloc(s, GFP_ATOMIC);
if (unlikely(!opt))
return -ENOBUFS;
memcpy(opt->keys, p->tcfp_keys,
p->tcfp_nkeys * sizeof(struct tc_pedit_key));
opt->index = p->tcf_index;
opt->nkeys = p->tcfp_nkeys;
opt->flags = p->tcfp_flags;
opt->action = p->tcf_action;
opt->refcnt = p->tcf_refcnt - ref;
opt->bindcnt = p->tcf_bindcnt - bind;
NLA_PUT(skb, TCA_PEDIT_PARMS, s, opt);
t.install = jiffies_to_clock_t(jiffies - p->tcf_tm.install);
t.lastuse = jiffies_to_clock_t(jiffies - p->tcf_tm.lastuse);
t.expires = jiffies_to_clock_t(p->tcf_tm.expires);
NLA_PUT(skb, TCA_PEDIT_TM, sizeof(t), &t);
kfree(opt);
return skb->len;
nla_put_failure:
nlmsg_trim(skb, b);
kfree(opt);
return -1;
}
/* Operations table for the "pedit" action; registered at module load. */
static struct tc_action_ops act_pedit_ops = {
	/* identity */
	.kind		= "pedit",
	.type		= TCA_ACT_PEDIT,
	.capab		= TCA_CAP_NONE,
	.owner		= THIS_MODULE,
	/* instance table access */
	.hinfo		= &pedit_hash_info,
	.lookup		= tcf_hash_search,
	.walk		= tcf_generic_walker,
	/* per-instance callbacks */
	.init		= tcf_pedit_init,
	.cleanup	= tcf_pedit_cleanup,
	.act		= tcf_pedit,
	.dump		= tcf_pedit_dump
};
MODULE_AUTHOR("Jamal Hadi Salim(2002-4)");
MODULE_DESCRIPTION("Generic Packet Editor actions");
MODULE_LICENSE("GPL");
/* Module load hook: register the pedit action ops and report the result. */
static int __init pedit_init_module(void)
{
	int err;

	err = tcf_register_action(&act_pedit_ops);
	return err;
}

/* Module unload hook: withdraw the pedit action ops. */
static void __exit pedit_cleanup_module(void)
{
	tcf_unregister_action(&act_pedit_ops);
}

module_init(pedit_init_module);
module_exit(pedit_cleanup_module);

View File

@@ -0,0 +1,399 @@
/*
* net/sched/police.c Input police filter.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
* J Hadi Salim (action changes)
*/
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/init.h>
#include <net/act_api.h>
#include <net/netlink.h>
/* Look up length L in the policer's rate table / peak-rate table via qdisc_l2t(). */
#define L2T(p,L) qdisc_l2t((p)->tcfp_R_tab, L)
#define L2T_P(p,L) qdisc_l2t((p)->tcfp_P_tab, L)
/* Bucket mask for the policer instance table (POL_TAB_MASK + 1 buckets). */
#define POL_TAB_MASK 15
static struct tcf_common *tcf_police_ht[POL_TAB_MASK + 1];
/* Handed to tcf_hash_new_index() when a policer is created without an index. */
static u32 police_idx_gen;
static DEFINE_RWLOCK(police_lock);
/* Table, mask and lock bundled for the tcf_hash_* helpers used below. */
static struct tcf_hashinfo police_hash_info = {
.htab = tcf_police_ht,
.hmask = POL_TAB_MASK,
.lock = &police_lock,
};
/*
 * old policer structure from before tc actions
 *
 * tcf_act_police_locate() accepts a TCA_POLICE_TBF payload of exactly this
 * size as an alternative to sizeof(struct tc_police).
 */
struct tc_police_compat
{
u32 index;
int action;
u32 limit;
u32 burst;
u32 mtu;
struct tc_ratespec rate;
struct tc_ratespec peakrate;
};
/* Each policer is serialized by its individual spinlock */
/*
 * Walk callback: dump policers into @skb, one nested attribute each,
 * resuming after the cb->args[0] entries that earlier calls already
 * emitted.
 *
 * @type: RTM_DELACTION selects tcf_action_dump_1(..., 0, 1); anything else
 *        selects tcf_action_dump_1(..., 0, 0).
 * @a:    scratch action; its priv and order are overwritten per entry.
 *
 * Returns the number of entries added by this call and advances
 * cb->args[0] by the same amount.
 */
static int tcf_act_police_walker(struct sk_buff *skb, struct netlink_callback *cb,
int type, struct tc_action *a)
{
struct tcf_common *p;
int err = 0, index = -1, i = 0, s_i = 0, n_i = 0;
struct nlattr *nest;
read_lock_bh(&police_lock);
/* s_i: how many entries to skip because they were already dumped. */
s_i = cb->args[0];
for (i = 0; i < (POL_TAB_MASK + 1); i++) {
p = tcf_police_ht[tcf_hash(i, POL_TAB_MASK)];
for (; p; p = p->tcfc_next) {
index++;
if (index < s_i)
continue;
a->priv = p;
a->order = index;
nest = nla_nest_start(skb, a->order);
if (nest == NULL)
goto nla_put_failure;
if (type == RTM_DELACTION)
err = tcf_action_dump_1(skb, a, 0, 1);
else
err = tcf_action_dump_1(skb, a, 0, 0);
if (err < 0) {
/* Entry did not fit: undo its nest and stop. */
index--;
nla_nest_cancel(skb, nest);
goto done;
}
nla_nest_end(skb, nest);
n_i++;
}
}
done:
read_unlock_bh(&police_lock);
if (n_i)
cb->args[0] += n_i;
return n_i;
nla_put_failure:
nla_nest_cancel(skb, nest);
goto done;
}
static void tcf_police_destroy(struct tcf_police *p)
{
unsigned int h = tcf_hash(p->tcf_index, POL_TAB_MASK);
struct tcf_common **p1p;
for (p1p = &tcf_police_ht[h]; *p1p; p1p = &(*p1p)->tcfc_next) {
if (*p1p == &p->common) {
write_lock_bh(&police_lock);
*p1p = p->tcf_next;
write_unlock_bh(&police_lock);
gen_kill_estimator(&p->tcf_bstats,
&p->tcf_rate_est);
if (p->tcfp_R_tab)
qdisc_put_rtab(p->tcfp_R_tab);
if (p->tcfp_P_tab)
qdisc_put_rtab(p->tcfp_P_tab);
kfree(p);
return;
}
}
WARN_ON(1);
}
/*
 * Netlink policy for policer attributes. TCA_POLICE_TBF is not listed here;
 * its size is checked by hand in tcf_act_police_locate().
 */
static const struct nla_policy police_policy[TCA_POLICE_MAX + 1] = {
[TCA_POLICE_RATE] = { .len = TC_RTAB_SIZE },
[TCA_POLICE_PEAKRATE] = { .len = TC_RTAB_SIZE },
[TCA_POLICE_AVRATE] = { .type = NLA_U32 },
[TCA_POLICE_RESULT] = { .type = NLA_U32 },
};
/*
 * Init callback: find an existing policer by index or create a new one,
 * then (for new or overridden policers) load its configuration.
 *
 * @nla:  nested attributes; TCA_POLICE_TBF is required
 * @est:  optional estimator configuration for gen_replace_estimator()
 * @a:    action whose priv is pointed at the policer
 * @ovr:  non-zero reconfigures an existing policer
 * @bind: non-zero takes a binding (bindcnt and refcnt) on the policer
 *
 * Returns ACT_P_CREATED for a new policer, 0 when an existing one was found
 * (whether or not it was reconfigured), or a negative errno.
 */
static int tcf_act_police_locate(struct nlattr *nla, struct nlattr *est,
struct tc_action *a, int ovr, int bind)
{
unsigned h;
int ret = 0, err;
struct nlattr *tb[TCA_POLICE_MAX + 1];
struct tc_police *parm;
struct tcf_police *police;
struct qdisc_rate_table *R_tab = NULL, *P_tab = NULL;
int size;
if (nla == NULL)
return -EINVAL;
err = nla_parse_nested(tb, TCA_POLICE_MAX, nla, police_policy);
if (err < 0)
return err;
if (tb[TCA_POLICE_TBF] == NULL)
return -EINVAL;
/* Accept both the current layout and the pre-actions compat layout. */
size = nla_len(tb[TCA_POLICE_TBF]);
if (size != sizeof(*parm) && size != sizeof(struct tc_police_compat))
return -EINVAL;
parm = nla_data(tb[TCA_POLICE_TBF]);
if (parm->index) {
struct tcf_common *pc;
pc = tcf_hash_lookup(parm->index, &police_hash_info);
if (pc != NULL) {
a->priv = pc;
police = to_police(pc);
if (bind) {
police->tcf_bindcnt += 1;
police->tcf_refcnt += 1;
}
if (ovr)
goto override;
return ret;
}
}
/* No existing policer: allocate a zeroed one holding one reference. */
police = kzalloc(sizeof(*police), GFP_KERNEL);
if (police == NULL)
return -ENOMEM;
ret = ACT_P_CREATED;
police->tcf_refcnt = 1;
spin_lock_init(&police->tcf_lock);
if (bind)
police->tcf_bindcnt = 1;
override:
/* Fetch the new rate tables before taking the policer lock. */
if (parm->rate.rate) {
err = -ENOMEM;
R_tab = qdisc_get_rtab(&parm->rate, tb[TCA_POLICE_RATE]);
if (R_tab == NULL)
goto failure;
if (parm->peakrate.rate) {
P_tab = qdisc_get_rtab(&parm->peakrate,
tb[TCA_POLICE_PEAKRATE]);
if (P_tab == NULL)
goto failure;
}
}
spin_lock_bh(&police->tcf_lock);
if (est) {
err = gen_replace_estimator(&police->tcf_bstats,
&police->tcf_rate_est,
&police->tcf_lock, est);
if (err)
goto failure_unlock;
} else if (tb[TCA_POLICE_AVRATE] &&
(ret == ACT_P_CREATED ||
!gen_estimator_active(&police->tcf_bstats,
&police->tcf_rate_est))) {
/* An average-rate limit was requested but no estimator is (or will be) running. */
err = -EINVAL;
goto failure_unlock;
}
/* No failure allowed after this point */
if (R_tab != NULL) {
qdisc_put_rtab(police->tcfp_R_tab);
police->tcfp_R_tab = R_tab;
}
if (P_tab != NULL) {
qdisc_put_rtab(police->tcfp_P_tab);
police->tcfp_P_tab = P_tab;
}
if (tb[TCA_POLICE_RESULT])
police->tcfp_result = nla_get_u32(tb[TCA_POLICE_RESULT]);
police->tcfp_toks = police->tcfp_burst = parm->burst;
police->tcfp_mtu = parm->mtu;
/* mtu 0 means "not specified": pick a default. */
if (police->tcfp_mtu == 0) {
police->tcfp_mtu = ~0;
if (police->tcfp_R_tab)
police->tcfp_mtu = 255<<police->tcfp_R_tab->rate.cell_log;
}
if (police->tcfp_P_tab)
police->tcfp_ptoks = L2T_P(police, police->tcfp_mtu);
police->tcf_action = parm->action;
if (tb[TCA_POLICE_AVRATE])
police->tcfp_ewma_rate = nla_get_u32(tb[TCA_POLICE_AVRATE]);
spin_unlock_bh(&police->tcf_lock);
/* An overridden policer is already in the table; nothing more to do. */
if (ret != ACT_P_CREATED)
return ret;
police->tcfp_t_c = psched_get_time();
police->tcf_index = parm->index ? parm->index :
tcf_hash_new_index(&police_idx_gen, &police_hash_info);
/* Link the new policer at the head of its hash bucket. */
h = tcf_hash(police->tcf_index, POL_TAB_MASK);
write_lock_bh(&police_lock);
police->tcf_next = tcf_police_ht[h];
tcf_police_ht[h] = &police->common;
write_unlock_bh(&police_lock);
a->priv = police;
return ret;
failure_unlock:
spin_unlock_bh(&police->tcf_lock);
failure:
/* Drop any tables fetched above; free the policer only if it was new. */
if (P_tab)
qdisc_put_rtab(P_tab);
if (R_tab)
qdisc_put_rtab(R_tab);
if (ret == ACT_P_CREATED)
kfree(police);
return err;
}
/*
 * Cleanup callback: drop one reference (and one binding when @bind is set)
 * on the policer attached to @a, destroying it once it is neither
 * referenced nor bound.
 *
 * Returns 1 if the policer was destroyed, 0 otherwise.
 */
static int tcf_act_police_cleanup(struct tc_action *a, int bind)
{
	struct tcf_police *police = a->priv;

	if (police == NULL)
		return 0;

	if (bind)
		police->tcf_bindcnt--;
	police->tcf_refcnt--;

	if (police->tcf_refcnt > 0 || police->tcf_bindcnt)
		return 0;

	tcf_police_destroy(police);
	return 1;
}
/*
 * Datapath callback: police @skb.
 *
 * A packet conforms when the estimated rate is below the configured average
 * rate (if one is set), it is no longer than the mtu, and, when a rate
 * table is present, both the rate bucket and the optional peak-rate bucket
 * still hold enough tokens. Conforming packets get tcfp_result;
 * everything else is counted as over limit and gets the configured action.
 */
static int tcf_act_police(struct sk_buff *skb, struct tc_action *a,
			  struct tcf_result *res)
{
	struct tcf_police *police = a->priv;
	psched_time_t now;
	long toks;
	long ptoks = 0;

	spin_lock(&police->tcf_lock);

	police->tcf_bstats.bytes += qdisc_pkt_len(skb);
	police->tcf_bstats.packets++;

	if (police->tcfp_ewma_rate &&
	    police->tcf_rate_est.bps >= police->tcfp_ewma_rate)
		goto exceeded;

	if (qdisc_pkt_len(skb) > police->tcfp_mtu)
		goto exceeded;

	/* No rate table: only the checks above apply. */
	if (police->tcfp_R_tab == NULL)
		goto conform;

	now = psched_get_time();
	toks = psched_tdiff_bounded(now, police->tcfp_t_c,
				    police->tcfp_burst);
	if (police->tcfp_P_tab) {
		ptoks = toks + police->tcfp_ptoks;
		if (ptoks > (long)L2T_P(police, police->tcfp_mtu))
			ptoks = (long)L2T_P(police, police->tcfp_mtu);
		ptoks -= L2T_P(police, qdisc_pkt_len(skb));
	}
	toks += police->tcfp_toks;
	if (toks > (long)police->tcfp_burst)
		toks = police->tcfp_burst;
	toks -= L2T(police, qdisc_pkt_len(skb));

	/* Either bucket going negative means the packet does not fit. */
	if ((toks|ptoks) < 0)
		goto exceeded;

	police->tcfp_t_c = now;
	police->tcfp_toks = toks;
	police->tcfp_ptoks = ptoks;

conform:
	spin_unlock(&police->tcf_lock);
	return police->tcfp_result;

exceeded:
	police->tcf_qstats.overlimits++;
	if (police->tcf_action == TC_ACT_SHOT)
		police->tcf_qstats.drops++;
	spin_unlock(&police->tcf_lock);
	return police->tcf_action;
}
/*
 * Dump callback: append TCA_POLICE_TBF and, when set, TCA_POLICE_RESULT and
 * TCA_POLICE_AVRATE to @skb. @bind and @ref are subtracted from the
 * reported bind/ref counts.
 *
 * Returns skb->len on success; on failure the message is trimmed back to
 * where it started and -1 is returned.
 */
static int
tcf_act_police_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref)
{
	unsigned char *mark = skb_tail_pointer(skb);
	struct tcf_police *police = a->priv;
	struct tc_police opt = {
		.index   = police->tcf_index,
		.action  = police->tcf_action,
		.burst   = police->tcfp_burst,
		.mtu     = police->tcfp_mtu,
		.bindcnt = police->tcf_bindcnt - bind,
		.refcnt  = police->tcf_refcnt - ref,
	};

	/* Rate specs are only reported for tables that are attached. */
	if (police->tcfp_R_tab)
		opt.rate = police->tcfp_R_tab->rate;
	if (police->tcfp_P_tab)
		opt.peakrate = police->tcfp_P_tab->rate;

	NLA_PUT(skb, TCA_POLICE_TBF, sizeof(opt), &opt);
	if (police->tcfp_result)
		NLA_PUT_U32(skb, TCA_POLICE_RESULT, police->tcfp_result);
	if (police->tcfp_ewma_rate)
		NLA_PUT_U32(skb, TCA_POLICE_AVRATE, police->tcfp_ewma_rate);

	return skb->len;

nla_put_failure:
	nlmsg_trim(skb, mark);
	return -1;
}
MODULE_AUTHOR("Alexey Kuznetsov");
MODULE_DESCRIPTION("Policing actions");
MODULE_LICENSE("GPL");
/* Operations table for the "police" action; registered at module load. */
static struct tc_action_ops act_police_ops = {
	/* identity */
	.kind		= "police",
	.type		= TCA_ID_POLICE,
	.capab		= TCA_CAP_NONE,
	.owner		= THIS_MODULE,
	/* instance table access (police supplies its own walker) */
	.hinfo		= &police_hash_info,
	.lookup		= tcf_hash_search,
	.walk		= tcf_act_police_walker,
	/* per-instance callbacks */
	.init		= tcf_act_police_locate,
	.cleanup	= tcf_act_police_cleanup,
	.act		= tcf_act_police,
	.dump		= tcf_act_police_dump
};
/* Module load hook: register the police action ops and report the result. */
static int __init
police_init_module(void)
{
	int err;

	err = tcf_register_action(&act_police_ops);
	return err;
}

/* Module unload hook: withdraw the police action ops. */
static void __exit
police_cleanup_module(void)
{
	tcf_unregister_action(&act_police_ops);
}

module_init(police_init_module);
module_exit(police_cleanup_module);

View File

@@ -0,0 +1,218 @@
/*
* net/sched/simp.c Simple example of an action
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
* Authors: Jamal Hadi Salim (2005-8)
*
*/
#include <linux/module.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
#define TCA_ACT_SIMP 22
#include <linux/tc_act/tc_defact.h>
#include <net/tc_act/tc_defact.h>
/* Bucket mask for the simple-action instance table (SIMP_TAB_MASK + 1 buckets). */
#define SIMP_TAB_MASK 7
static struct tcf_common *tcf_simp_ht[SIMP_TAB_MASK + 1];
/* Handed to tcf_hash_create() as the index generator for new instances. */
static u32 simp_idx_gen;
static DEFINE_RWLOCK(simp_lock);
/* Table, mask and lock bundled for the tcf_hash_* helpers used below. */
static struct tcf_hashinfo simp_hash_info = {
.htab = tcf_simp_ht,
.hmask = SIMP_TAB_MASK,
.lock = &simp_lock,
};
/* Size limit applied to the policy string (TCA_DEF_DATA) throughout this file. */
#define SIMP_MAX_DATA 32
/*
 * Datapath callback: account the packet, log the policy string together
 * with the running packet count, and return the configured action.
 */
static int tcf_simp(struct sk_buff *skb, struct tc_action *a, struct tcf_result *res)
{
	struct tcf_defact *simp = a->priv;

	spin_lock(&simp->tcf_lock);

	simp->tcf_tm.lastuse = jiffies;
	simp->tcf_bstats.packets++;
	simp->tcf_bstats.bytes += qdisc_pkt_len(skb);

	/* print policy string followed by _ then packet count
	 * Example if this was the 3rd packet and the string was "hello"
	 * then it would look like "hello_3" (without quotes)
	 **/
	printk("simple: %s_%d\n",
	       (char *)simp->tcfd_defdata, simp->tcf_bstats.packets);

	spin_unlock(&simp->tcf_lock);
	return simp->tcf_action;
}
/*
 * Drop one reference on @d, and one binding as well when @bind is set.
 * Once neither count is positive, the policy string is freed and the
 * instance destroyed.
 *
 * Returns 1 if the instance was destroyed, 0 otherwise (including d == NULL).
 */
static int tcf_simp_release(struct tcf_defact *d, int bind)
{
	if (!d)
		return 0;

	if (bind)
		d->tcf_bindcnt--;
	d->tcf_refcnt--;

	/* Still bound or still referenced: keep it alive. */
	if (d->tcf_bindcnt > 0 || d->tcf_refcnt > 0)
		return 0;

	kfree(d->tcfd_defdata);
	tcf_hash_destroy(&d->common, &simp_hash_info);
	return 1;
}
static int alloc_defdata(struct tcf_defact *d, char *defdata)
{
d->tcfd_defdata = kstrndup(defdata, SIMP_MAX_DATA, GFP_KERNEL);
if (unlikely(!d->tcfd_defdata))
return -ENOMEM;
return 0;
}
static void reset_policy(struct tcf_defact *d, char *defdata,
struct tc_defact *p)
{
spin_lock_bh(&d->tcf_lock);
d->tcf_action = p->action;
memset(d->tcfd_defdata, 0, SIMP_MAX_DATA);
strlcpy(d->tcfd_defdata, defdata, SIMP_MAX_DATA);
spin_unlock_bh(&d->tcf_lock);
}
/*
 * Netlink policy: TCA_DEF_PARMS carries a struct tc_defact, TCA_DEF_DATA a
 * string limited by SIMP_MAX_DATA.
 */
static const struct nla_policy simple_policy[TCA_DEF_MAX + 1] = {
[TCA_DEF_PARMS] = { .len = sizeof(struct tc_defact) },
[TCA_DEF_DATA] = { .type = NLA_STRING, .len = SIMP_MAX_DATA },
};
/*
 * Create a simple action instance or overwrite an existing one from netlink.
 *
 * Both TCA_DEF_PARMS (struct tc_defact) and TCA_DEF_DATA (policy string) are
 * mandatory. An existing instance is only modified when @ovr is set;
 * otherwise it is released and -EEXIST returned.
 *
 * Returns ACT_P_CREATED for a new instance, 0 for an update, or a negative
 * errno.
 */
static int tcf_simp_init(struct nlattr *nla, struct nlattr *est,
			 struct tc_action *a, int ovr, int bind)
{
	struct nlattr *tb[TCA_DEF_MAX + 1];
	struct tc_defact *parm;
	struct tcf_defact *d;
	struct tcf_common *pc;
	char *defdata;
	int ret = 0, err;

	if (nla == NULL)
		return -EINVAL;

	err = nla_parse_nested(tb, TCA_DEF_MAX, nla, simple_policy);
	if (err < 0)
		return err;

	if (tb[TCA_DEF_PARMS] == NULL)
		return -EINVAL;

	if (tb[TCA_DEF_DATA] == NULL)
		return -EINVAL;

	parm = nla_data(tb[TCA_DEF_PARMS]);
	defdata = nla_data(tb[TCA_DEF_DATA]);

	pc = tcf_hash_check(parm->index, a, bind, &simp_hash_info);
	if (!pc) {
		pc = tcf_hash_create(parm->index, est, a, sizeof(*d), bind,
				     &simp_idx_gen, &simp_hash_info);
		if (IS_ERR(pc))
			return PTR_ERR(pc);

		d = to_defact(pc);
		ret = alloc_defdata(d, defdata);
		if (ret < 0) {
			/*
			 * tcf_hash_create() was handed @est, so a rate
			 * estimator may already reference this action's
			 * counters; detach it before the memory goes away.
			 */
			if (est)
				gen_kill_estimator(&d->tcf_bstats,
						   &d->tcf_rate_est);
			kfree(pc);
			return ret;
		}
		d->tcf_action = parm->action;
		ret = ACT_P_CREATED;
	} else {
		d = to_defact(pc);
		if (!ovr) {
			tcf_simp_release(d, bind);
			return -EEXIST;
		}
		reset_policy(d, defdata, parm);
	}

	/* New instances are inserted only after they are fully configured. */
	if (ret == ACT_P_CREATED)
		tcf_hash_insert(pc, &simp_hash_info);
	return ret;
}
/*
 * Cleanup callback: release the instance attached to @a, if any.
 * Returns the result of tcf_simp_release(), or 0 when nothing is attached.
 */
static inline int tcf_simp_cleanup(struct tc_action *a, int bind)
{
	struct tcf_defact *simp = a->priv;

	return simp ? tcf_simp_release(simp, bind) : 0;
}
static inline int tcf_simp_dump(struct sk_buff *skb, struct tc_action *a,
int bind, int ref)
{
unsigned char *b = skb_tail_pointer(skb);
struct tcf_defact *d = a->priv;
struct tc_defact opt = {
.index = d->tcf_index,
.refcnt = d->tcf_refcnt - ref,
.bindcnt = d->tcf_bindcnt - bind,
.action = d->tcf_action,
};
struct tcf_t t;
NLA_PUT(skb, TCA_DEF_PARMS, sizeof(opt), &opt);
NLA_PUT_STRING(skb, TCA_DEF_DATA, d->tcfd_defdata);
t.install = jiffies_to_clock_t(jiffies - d->tcf_tm.install);
t.lastuse = jiffies_to_clock_t(jiffies - d->tcf_tm.lastuse);
t.expires = jiffies_to_clock_t(d->tcf_tm.expires);
NLA_PUT(skb, TCA_DEF_TM, sizeof(t), &t);
return skb->len;
nla_put_failure:
nlmsg_trim(skb, b);
return -1;
}
static struct tc_action_ops act_simp_ops = {
.kind = "simple",
.hinfo = &simp_hash_info,
.type = TCA_ACT_SIMP,
.capab = TCA_CAP_NONE,
.owner = THIS_MODULE,
.act = tcf_simp,
.dump = tcf_simp_dump,
.cleanup = tcf_simp_cleanup,
.init = tcf_simp_init,
.walk = tcf_generic_walker,
};
MODULE_AUTHOR("Jamal Hadi Salim(2005)");
MODULE_DESCRIPTION("Simple example action");
MODULE_LICENSE("GPL");
/*
 * Module load hook: register the simple action ops; announce the module
 * only when registration succeeded.
 */
static int __init simp_init_module(void)
{
	int err = tcf_register_action(&act_simp_ops);

	if (err)
		return err;

	printk("Simple TC action Loaded\n");
	return 0;
}

/* Module unload hook: withdraw the simple action ops. */
static void __exit simp_cleanup_module(void)
{
	tcf_unregister_action(&act_simp_ops);
}

module_init(simp_init_module);
module_exit(simp_cleanup_module);

View File

@@ -0,0 +1,204 @@
/*
* Copyright (c) 2008, Intel Corporation.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms and conditions of the GNU General Public License,
* version 2, as published by the Free Software Foundation.
*
* This program is distributed in the hope it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
* more details.
*
* You should have received a copy of the GNU General Public License along with
* this program; if not, write to the Free Software Foundation, Inc., 59 Temple
* Place - Suite 330, Boston, MA 02111-1307 USA.
*
* Author: Alexander Duyck <alexander.h.duyck@intel.com>
*/
#include <linux/module.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
#include <linux/tc_act/tc_skbedit.h>
#include <net/tc_act/tc_skbedit.h>
/* Index mask of the skbedit hash table; the table has mask + 1 buckets. */
#define SKBEDIT_TAB_MASK	15

static DEFINE_RWLOCK(skbedit_lock);
static u32 skbedit_idx_gen;
static struct tcf_common *tcf_skbedit_ht[SKBEDIT_TAB_MASK + 1];

/* Bundles the table, its mask and its lock for the tcf_hash_*() helpers. */
static struct tcf_hashinfo skbedit_hash_info = {
	.lock	= &skbedit_lock,
	.hmask	= SKBEDIT_TAB_MASK,
	.htab	= tcf_skbedit_ht,
};
/*
 * Per-packet handler of the skbedit action.
 *
 * Under the action's lock it refreshes the last-use timestamp, accounts
 * the packet in the byte/packet counters and then applies the configured
 * edits: the priority if SKBEDIT_F_PRIORITY is set, and the queue mapping
 * if SKBEDIT_F_QUEUE_MAPPING is set and the requested queue index is below
 * the device's real_num_tx_queues.
 *
 * Returns the action's configured control code (tcf_action).
 */
static int tcf_skbedit(struct sk_buff *skb, struct tc_action *a,
		       struct tcf_result *res)
{
	struct tcf_skbedit *edit = a->priv;

	spin_lock(&edit->tcf_lock);

	/* bookkeeping */
	edit->tcf_tm.lastuse = jiffies;
	edit->tcf_bstats.bytes += qdisc_pkt_len(skb);
	edit->tcf_bstats.packets++;

	if (edit->flags & SKBEDIT_F_PRIORITY) {
		skb->priority = edit->priority;
	}

	if (edit->flags & SKBEDIT_F_QUEUE_MAPPING) {
		/* only remap onto a queue the device really has */
		if (skb->dev->real_num_tx_queues > edit->queue_mapping)
			skb_set_queue_mapping(skb, edit->queue_mapping);
	}

	spin_unlock(&edit->tcf_lock);

	return edit->tcf_action;
}
/* Netlink policy: declared payload length of each skbedit attribute. */
static const struct nla_policy skbedit_policy[TCA_SKBEDIT_MAX + 1] = {
	[TCA_SKBEDIT_QUEUE_MAPPING]	= { .len = sizeof(u16) },
	[TCA_SKBEDIT_PRIORITY]		= { .len = sizeof(u32) },
	[TCA_SKBEDIT_PARMS]		= { .len = sizeof(struct tc_skbedit) },
};
/*
 * Create a new skbedit action or update an existing one from netlink
 * attributes.
 *
 * @nla:  nested attribute carrying the TCA_SKBEDIT_* options
 * @est:  passed through to tcf_hash_create()
 * @a:    generic action being initialised
 * @ovr:  non-zero if an existing action may be overwritten
 * @bind: passed through to the tcf_hash_*() helpers
 *
 * TCA_SKBEDIT_PARMS is mandatory and at least one of
 * TCA_SKBEDIT_PRIORITY / TCA_SKBEDIT_QUEUE_MAPPING must be supplied.
 *
 * Returns ACT_P_CREATED when a new action was created, 0 when an existing
 * one was updated, -EEXIST when it exists and @ovr is zero, or another
 * negative errno on failure.
 */
static int tcf_skbedit_init(struct nlattr *nla, struct nlattr *est,
			    struct tc_action *a, int ovr, int bind)
{
	struct nlattr *attrs[TCA_SKBEDIT_MAX + 1];
	struct tc_skbedit *parm;
	struct tcf_skbedit *d;
	struct tcf_common *pc;
	u32 *prio_val = NULL;
	u16 *qmap_val = NULL;
	u32 flags = 0;
	int created = 0;
	int err;

	if (!nla)
		return -EINVAL;

	err = nla_parse_nested(attrs, TCA_SKBEDIT_MAX, nla, skbedit_policy);
	if (err < 0)
		return err;

	if (!attrs[TCA_SKBEDIT_PARMS])
		return -EINVAL;

	/* collect the requested edits */
	if (attrs[TCA_SKBEDIT_PRIORITY]) {
		prio_val = nla_data(attrs[TCA_SKBEDIT_PRIORITY]);
		flags |= SKBEDIT_F_PRIORITY;
	}
	if (attrs[TCA_SKBEDIT_QUEUE_MAPPING]) {
		qmap_val = nla_data(attrs[TCA_SKBEDIT_QUEUE_MAPPING]);
		flags |= SKBEDIT_F_QUEUE_MAPPING;
	}
	if (flags == 0)
		return -EINVAL;

	parm = nla_data(attrs[TCA_SKBEDIT_PARMS]);

	pc = tcf_hash_check(parm->index, a, bind, &skbedit_hash_info);
	if (pc) {
		/* already present: only touch it when overriding is allowed */
		if (!ovr) {
			tcf_hash_release(pc, bind, &skbedit_hash_info);
			return -EEXIST;
		}
	} else {
		pc = tcf_hash_create(parm->index, est, a, sizeof(*d), bind,
				     &skbedit_idx_gen, &skbedit_hash_info);
		if (IS_ERR(pc))
			return PTR_ERR(pc);
		created = 1;
	}
	d = to_skbedit(pc);

	spin_lock_bh(&d->tcf_lock);
	d->flags = flags;
	if (flags & SKBEDIT_F_PRIORITY)
		d->priority = *prio_val;
	if (flags & SKBEDIT_F_QUEUE_MAPPING)
		d->queue_mapping = *qmap_val;
	d->tcf_action = parm->action;
	spin_unlock_bh(&d->tcf_lock);

	if (!created)
		return 0;

	/* a new action becomes visible only once fully configured */
	tcf_hash_insert(pc, &skbedit_hash_info);
	return ACT_P_CREATED;
}
/*
 * Release one reference to an skbedit action.
 *
 * Returns the result of tcf_hash_release(), or 0 if the generic action
 * carries no private data.
 */
static inline int tcf_skbedit_cleanup(struct tc_action *a, int bind)
{
	struct tcf_skbedit *edit = a->priv;

	if (!edit)
		return 0;

	return tcf_hash_release(&edit->common, bind, &skbedit_hash_info);
}
static inline int tcf_skbedit_dump(struct sk_buff *skb, struct tc_action *a,
int bind, int ref)
{
unsigned char *b = skb_tail_pointer(skb);
struct tcf_skbedit *d = a->priv;
struct tc_skbedit opt = {
.index = d->tcf_index,
.refcnt = d->tcf_refcnt - ref,
.bindcnt = d->tcf_bindcnt - bind,
.action = d->tcf_action,
};
struct tcf_t t;
NLA_PUT(skb, TCA_SKBEDIT_PARMS, sizeof(opt), &opt);
if (d->flags & SKBEDIT_F_PRIORITY)
NLA_PUT(skb, TCA_SKBEDIT_PRIORITY, sizeof(d->priority),
&d->priority);
if (d->flags & SKBEDIT_F_QUEUE_MAPPING)
NLA_PUT(skb, TCA_SKBEDIT_QUEUE_MAPPING,
sizeof(d->queue_mapping), &d->queue_mapping);
t.install = jiffies_to_clock_t(jiffies - d->tcf_tm.install);
t.lastuse = jiffies_to_clock_t(jiffies - d->tcf_tm.lastuse);
t.expires = jiffies_to_clock_t(d->tcf_tm.expires);
NLA_PUT(skb, TCA_SKBEDIT_TM, sizeof(t), &t);
return skb->len;
nla_put_failure:
nlmsg_trim(skb, b);
return -1;
}
/*
 * Operations table for the "skbedit" action, registered and unregistered
 * by the module init and cleanup routines.
 */
static struct tc_action_ops act_skbedit_ops = {
	/* identity */
	.kind		= "skbedit",
	.type		= TCA_ACT_SKBEDIT,
	.capab		= TCA_CAP_NONE,
	.owner		= THIS_MODULE,
	.hinfo		= &skbedit_hash_info,
	/* callbacks */
	.init		= tcf_skbedit_init,
	.act		= tcf_skbedit,
	.dump		= tcf_skbedit_dump,
	.cleanup	= tcf_skbedit_cleanup,
	.walk		= tcf_generic_walker,
};
/* Module metadata for the skbedit action module. */
MODULE_AUTHOR("Alexander Duyck, <alexander.h.duyck@intel.com>");
MODULE_DESCRIPTION("SKB Editing");
MODULE_LICENSE("GPL");
/*
 * Module entry point: register the skbedit action.
 * Returns the result of tcf_register_action().
 */
static int __init skbedit_init_module(void)
{
	return tcf_register_action(&act_skbedit_ops);
}
/* Module exit point: unregister the skbedit action. */
static void __exit skbedit_cleanup_module(void)
{
	tcf_unregister_action(&act_skbedit_ops);
}
/* Hook the init/cleanup routines into module load and unload. */
module_init(skbedit_init_module);
module_exit(skbedit_cleanup_module);

617
kernel/net/sched/cls_api.c Normal file
View File

@@ -0,0 +1,617 @@
/*
* net/sched/cls_api.c Packet classifier API.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
*
* Changes:
*
* Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
*
*/
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <linux/init.h>
#include <linux/kmod.h>
#include <linux/netlink.h>
#include <linux/err.h>
#include <net/net_namespace.h>
#include <net/sock.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
#include <net/pkt_cls.h>
/* The list of all installed classifier types, singly linked via ->next. */
static struct tcf_proto_ops *tcf_proto_base __read_mostly;
/*
 * Protects the list of registered classifier types: taken for reading by
 * tcf_proto_lookup_ops() and for writing by (un)registration.
 * It is a pure SMP lock.
 */
static DEFINE_RWLOCK(cls_mod_lock);
/*
 * Find a classifier type by the name carried in the @kind attribute.
 *
 * On a match a reference on the owning module is taken with
 * try_module_get(); if that fails the lookup is treated as a miss.
 * Returns the matching ops, or NULL if @kind is NULL or nothing matched.
 */
static struct tcf_proto_ops *tcf_proto_lookup_ops(struct nlattr *kind)
{
	struct tcf_proto_ops *ops;

	if (!kind)
		return NULL;

	read_lock(&cls_mod_lock);

	ops = tcf_proto_base;
	while (ops && nla_strcmp(kind, ops->kind) != 0)
		ops = ops->next;

	if (ops && !try_module_get(ops->owner))
		ops = NULL;

	read_unlock(&cls_mod_lock);

	return ops;
}
/*
 * Register a new classifier type.
 *
 * The type is appended to the end of the global list unless a type with
 * the same ->kind string is already present.
 * Returns 0 on success or -EEXIST on a name clash.
 */
int register_tcf_proto_ops(struct tcf_proto_ops *ops)
{
	struct tcf_proto_ops **link;
	int rc = 0;

	write_lock(&cls_mod_lock);

	link = &tcf_proto_base;
	while (*link) {
		if (strcmp(ops->kind, (*link)->kind) == 0) {
			rc = -EEXIST;
			break;
		}
		link = &(*link)->next;
	}

	if (rc == 0) {
		/* link now points at the terminating NULL slot */
		ops->next = NULL;
		*link = ops;
	}

	write_unlock(&cls_mod_lock);
	return rc;
}
EXPORT_SYMBOL(register_tcf_proto_ops);
/*
 * Remove a classifier type from the global list.
 *
 * Returns 0 if @ops was found and unlinked, -ENOENT if it was not
 * registered.
 */
int unregister_tcf_proto_ops(struct tcf_proto_ops *ops)
{
	struct tcf_proto_ops **link;
	int rc = -ENOENT;

	write_lock(&cls_mod_lock);

	for (link = &tcf_proto_base; *link; link = &(*link)->next) {
		if (*link != ops)
			continue;

		/* splice the entry out of the list */
		*link = ops->next;
		rc = 0;
		break;
	}

	write_unlock(&cls_mod_lock);
	return rc;
}
EXPORT_SYMBOL(unregister_tcf_proto_ops);
/* Forward declaration: used by tc_ctl_tfilter() before its definition. */
static int tfilter_notify(struct sk_buff *oskb, struct nlmsghdr *n,
struct tcf_proto *tp, unsigned long fh, int event);
/*
 * Select a new prio value from the range managed by the kernel.
 *
 * With no neighbouring filter (@tp is NULL) the range start
 * TC_H_MAKE(0xC0000000U, 0U) is used; otherwise the value just below
 * @tp's priority is returned.
 */
static inline u32 tcf_auto_prio(struct tcf_proto *tp)
{
	if (!tp)
		return TC_H_MAKE(0xC0000000U, 0U);

	return tp->prio - 1;
}
/*
 * Add/change/delete/get a filter node.
 *
 * rtnetlink handler for RTM_NEWTFILTER, RTM_DELTFILTER and RTM_GETTFILTER
 * (see tc_filter_init()).  The request's tcmsg selects the device, the
 * parent qdisc/class and, via tcm_info, the priority (major part) and
 * protocol (minor part) of the filter chain entry to operate on.
 *
 * Returns 0 on success or a negative errno.  When a classifier module had
 * to be loaded the internal -EAGAIN result causes the whole request to be
 * replayed from the `replay' label instead of being returned.
 */
static int tc_ctl_tfilter(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
{
struct net *net = sock_net(skb->sk);
struct nlattr *tca[TCA_MAX + 1];
spinlock_t *root_lock;
struct tcmsg *t;
u32 protocol;
u32 prio;
u32 nprio;
u32 parent;
struct net_device *dev;
struct Qdisc *q;
struct tcf_proto **back, **chain;
struct tcf_proto *tp;
struct tcf_proto_ops *tp_ops;
const struct Qdisc_class_ops *cops;
unsigned long cl;
unsigned long fh;
int err;
int tp_created = 0;
/* Only the initial network namespace is handled here. */
if (net != &init_net)
return -EINVAL;
replay:
t = NLMSG_DATA(n);
protocol = TC_H_MIN(t->tcm_info);
prio = TC_H_MAJ(t->tcm_info);
/* nprio keeps the priority as requested; prio may be replaced below. */
nprio = prio;
parent = t->tcm_parent;
cl = 0;
if (prio == 0) {
/* If no priority is given, the user wants us to allocate one. */
if (n->nlmsg_type != RTM_NEWTFILTER || !(n->nlmsg_flags&NLM_F_CREATE))
return -ENOENT;
prio = TC_H_MAKE(0x80000000U, 0U);
}
/* Find head of filter chain. */
/* Find link */
dev = __dev_get_by_index(&init_net, t->tcm_ifindex);
if (dev == NULL)
return -ENODEV;
err = nlmsg_parse(n, sizeof(*t), tca, TCA_MAX, NULL);
if (err < 0)
return err;
/* Find qdisc */
if (!parent) {
/* No parent given: use the device's qdisc and its handle. */
q = dev->qdisc;
parent = q->handle;
} else {
q = qdisc_lookup(dev, TC_H_MAJ(t->tcm_parent));
if (q == NULL)
return -EINVAL;
}
/* Is it classful? */
if ((cops = q->ops->cl_ops) == NULL)
return -EINVAL;
if (cops->tcf_chain == NULL)
return -EOPNOTSUPP;
/* Do we search for filter, attached to class? */
if (TC_H_MIN(parent)) {
cl = cops->get(q, parent);
if (cl == 0)
return -ENOENT;
}
/* From here on a held class (cl != 0) is released at errout. */
/* And the last stroke */
chain = cops->tcf_chain(q, cl);
err = -EINVAL;
if (chain == NULL)
goto errout;
/*
 * Check the chain for existence of proto-tcf with this priority.
 * The walk stops at the first entry whose prio is >= the wanted one;
 * `back' is left pointing at the link where a new entry would go.
 */
for (back = chain; (tp=*back) != NULL; back = &tp->next) {
if (tp->prio >= prio) {
if (tp->prio == prio) {
/* Same prio: reject auto-prio requests and protocol mismatches. */
if (!nprio || (tp->protocol != protocol && protocol))
goto errout;
} else
tp = NULL;
break;
}
}
root_lock = qdisc_root_sleeping_lock(q);
if (tp == NULL) {
/* Proto-tcf does not exist, create new one */
if (tca[TCA_KIND] == NULL || !protocol)
goto errout;
err = -ENOENT;
if (n->nlmsg_type != RTM_NEWTFILTER || !(n->nlmsg_flags&NLM_F_CREATE))
goto errout;
/* Create new proto tcf */
err = -ENOBUFS;
tp = kzalloc(sizeof(*tp), GFP_KERNEL);
if (tp == NULL)
goto errout;
err = -ENOENT;
tp_ops = tcf_proto_lookup_ops(tca[TCA_KIND]);
if (tp_ops == NULL) {
#ifdef CONFIG_MODULES
struct nlattr *kind = tca[TCA_KIND];
char name[IFNAMSIZ];
if (kind != NULL &&
nla_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) {
rtnl_unlock();
request_module("cls_%s", name);
rtnl_lock();
tp_ops = tcf_proto_lookup_ops(kind);
/* We dropped the RTNL semaphore in order to
 * perform the module load. So, even if we
 * succeeded in loading the module we have to
 * replay the request. We indicate this using
 * -EAGAIN.
 */
if (tp_ops != NULL) {
/* Drop the module reference the lookup took; the replay looks it up again. */
module_put(tp_ops->owner);
err = -EAGAIN;
}
}
#endif
kfree(tp);
goto errout;
}
tp->ops = tp_ops;
tp->protocol = protocol;
/* Use the requested prio, or derive one from the entry at the insertion point. */
tp->prio = nprio ? : TC_H_MAJ(tcf_auto_prio(*back));
tp->q = q;
tp->classify = tp_ops->classify;
tp->classid = parent;
err = tp_ops->init(tp);
if (err != 0) {
module_put(tp_ops->owner);
kfree(tp);
goto errout;
}
/* The new tp is not linked into the chain until ->change() succeeds. */
tp_created = 1;
} else if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], tp->ops->kind))
goto errout;
fh = tp->ops->get(tp, t->tcm_handle);
if (fh == 0) {
/* No such filter node; a delete with handle 0 removes the whole tp. */
if (n->nlmsg_type == RTM_DELTFILTER && t->tcm_handle == 0) {
spin_lock_bh(root_lock);
*back = tp->next;
spin_unlock_bh(root_lock);
tfilter_notify(skb, n, tp, fh, RTM_DELTFILTER);
tcf_destroy(tp);
err = 0;
goto errout;
}
err = -ENOENT;
if (n->nlmsg_type != RTM_NEWTFILTER ||
!(n->nlmsg_flags & NLM_F_CREATE))
goto errout;
} else {
/* The filter node exists: dispatch on the request type. */
switch (n->nlmsg_type) {
case RTM_NEWTFILTER:
err = -EEXIST;
if (n->nlmsg_flags & NLM_F_EXCL) {
if (tp_created)
tcf_destroy(tp);
goto errout;
}
break;
case RTM_DELTFILTER:
err = tp->ops->delete(tp, fh);
if (err == 0)
tfilter_notify(skb, n, tp, fh, RTM_DELTFILTER);
goto errout;
case RTM_GETTFILTER:
err = tfilter_notify(skb, n, tp, fh, RTM_NEWTFILTER);
goto errout;
default:
err = -EINVAL;
goto errout;
}
}
/* Create or modify the filter node itself. */
err = tp->ops->change(tp, cl, t->tcm_handle, tca, &fh);
if (err == 0) {
if (tp_created) {
/* Publish the new tp in the chain under the root lock. */
spin_lock_bh(root_lock);
tp->next = *back;
*back = tp;
spin_unlock_bh(root_lock);
}
tfilter_notify(skb, n, tp, fh, RTM_NEWTFILTER);
} else {
if (tp_created)
tcf_destroy(tp);
}
errout:
if (cl)
cops->put(q, cl);
if (err == -EAGAIN)
/* Replay the request. */
goto replay;
return err;
}
/*
 * Append one filter message (netlink header + tcmsg + attributes) to @skb.
 *
 * @tp:    filter chain entry being described
 * @fh:    filter node handle passed to the classifier's ->dump()
 * @event: message type; for RTM_DELTFILTER only the header, TCA_KIND and
 *         tcm_handle = @fh are emitted and ->dump() is not called
 *
 * Returns skb->len on success.  On failure the skb is trimmed back to its
 * length on entry and -1 is returned.
 */
static int tcf_fill_node(struct sk_buff *skb, struct tcf_proto *tp,
unsigned long fh, u32 pid, u32 seq, u16 flags, int event)
{
struct tcmsg *tcm;
struct nlmsghdr *nlh;
unsigned char *b = skb_tail_pointer(skb);
/* NOTE(review): NLMSG_NEW presumably jumps to nlmsg_failure when out of room. */
nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
tcm = NLMSG_DATA(nlh);
tcm->tcm_family = AF_UNSPEC;
tcm->tcm__pad1 = 0;
tcm->tcm__pad2 = 0;
tcm->tcm_ifindex = qdisc_dev(tp->q)->ifindex;
tcm->tcm_parent = tp->classid;
tcm->tcm_info = TC_H_MAKE(tp->prio, tp->protocol);
/* NOTE(review): NLA_PUT_STRING presumably jumps to nla_put_failure on error. */
NLA_PUT_STRING(skb, TCA_KIND, tp->ops->kind);
tcm->tcm_handle = fh;
if (RTM_DELTFILTER != event) {
/* Reset the handle; the classifier's ->dump() may fill it in. */
tcm->tcm_handle = 0;
if (tp->ops->dump && tp->ops->dump(tp, fh, skb, tcm) < 0)
goto nla_put_failure;
}
/* Fix up the message length now that all attributes are in place. */
nlh->nlmsg_len = skb_tail_pointer(skb) - b;
return skb->len;
nlmsg_failure:
nla_put_failure:
nlmsg_trim(skb, b);
return -1;
}
static int tfilter_notify(struct sk_buff *oskb, struct nlmsghdr *n,
struct tcf_proto *tp, unsigned long fh, int event)
{
struct sk_buff *skb;
u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;
skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
if (!skb)
return -ENOBUFS;
if (tcf_fill_node(skb, tp, fh, pid, n->nlmsg_seq, 0, event) <= 0) {
kfree_skb(skb);
return -EINVAL;
}
return rtnetlink_send(skb, &init_net, pid, RTNLGRP_TC,
n->nlmsg_flags & NLM_F_ECHO);
}
/*
 * Walker state used while dumping filters.  tcf_node_dump() casts the
 * struct tcf_walker pointer it receives back to this type, so `w' has to
 * stay the first member.
 */
struct tcf_dump_args {
struct tcf_walker w;
/* message buffer the nodes are dumped into */
struct sk_buff *skb;
/* netlink dump callback; supplies the requester's pid and sequence number */
struct netlink_callback *cb;
};
static int tcf_node_dump(struct tcf_proto *tp, unsigned long n,
struct tcf_walker *arg)
{
struct tcf_dump_args *a = (void *)arg;
return tcf_fill_node(a->skb, tp, n, NETLINK_CB(a->cb->skb).pid,
a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTFILTER);
}
/*
 * Netlink dump callback for RTM_GETTFILTER.
 *
 * Walks the filter chain selected by the request's tcmsg and appends one
 * message per chain entry plus one per filter node to @skb.  Resume state
 * is kept in cb->args:
 *   args[0] - index of the chain entry to continue with
 *   args[1] - 0 while the entry's own message is still to be emitted,
 *             otherwise 1 + the number of its nodes already walked
 *
 * Returns skb->len, or 0 for requests from outside the initial namespace.
 */
static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb)
{
struct net *net = sock_net(skb->sk);
int t;
int s_t;
struct net_device *dev;
struct Qdisc *q;
struct tcf_proto *tp, **chain;
struct tcmsg *tcm = (struct tcmsg *)NLMSG_DATA(cb->nlh);
unsigned long cl = 0;
const struct Qdisc_class_ops *cops;
struct tcf_dump_args arg;
if (net != &init_net)
return 0;
/* Request too short to carry a tcmsg: nothing to dump. */
if (cb->nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*tcm)))
return skb->len;
/* Device reference is dropped again at `out'. */
if ((dev = dev_get_by_index(&init_net, tcm->tcm_ifindex)) == NULL)
return skb->len;
if (!tcm->tcm_parent)
q = dev->qdisc;
else
q = qdisc_lookup(dev, TC_H_MAJ(tcm->tcm_parent));
if (!q)
goto out;
if ((cops = q->ops->cl_ops) == NULL)
goto errout;
if (cops->tcf_chain == NULL)
goto errout;
if (TC_H_MIN(tcm->tcm_parent)) {
cl = cops->get(q, tcm->tcm_parent);
if (cl == 0)
goto errout;
}
chain = cops->tcf_chain(q, cl);
if (chain == NULL)
goto errout;
s_t = cb->args[0];
for (tp=*chain, t=0; tp; tp = tp->next, t++) {
/* Skip entries already dumped by an earlier call. */
if (t < s_t) continue;
/* Optional filtering by priority and protocol from tcm_info. */
if (TC_H_MAJ(tcm->tcm_info) &&
TC_H_MAJ(tcm->tcm_info) != tp->prio)
continue;
if (TC_H_MIN(tcm->tcm_info) &&
TC_H_MIN(tcm->tcm_info) != tp->protocol)
continue;
/* Moving on to a new entry: clear the per-entry resume state. */
if (t > s_t)
memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
if (cb->args[1] == 0) {
/* Emit the message for the chain entry itself (fh == 0). */
if (tcf_fill_node(skb, tp, 0, NETLINK_CB(cb->skb).pid,
cb->nlh->nlmsg_seq, NLM_F_MULTI,
RTM_NEWTFILTER) <= 0)
break;
cb->args[1] = 1;
}
if (tp->ops->walk == NULL)
continue;
/* Walk the entry's nodes, skipping those dumped previously. */
arg.w.fn = tcf_node_dump;
arg.skb = skb;
arg.cb = cb;
arg.w.stop = 0;
arg.w.skip = cb->args[1]-1;
arg.w.count = 0;
tp->ops->walk(tp, &arg.w);
cb->args[1] = arg.w.count+1;
if (arg.w.stop)
break;
}
/* Remember where to continue on the next call. */
cb->args[0] = t;
errout:
if (cl)
cops->put(q, cl);
out:
dev_put(dev);
return skb->len;
}
/*
 * Release the actions attached to @exts, if any, and clear the pointer.
 * Does nothing when CONFIG_NET_CLS_ACT is not enabled.
 */
void tcf_exts_destroy(struct tcf_proto *tp, struct tcf_exts *exts)
{
#ifdef CONFIG_NET_CLS_ACT
	if (!exts->action)
		return;

	tcf_action_destroy(exts->action, TCA_ACT_UNBIND);
	exts->action = NULL;
#endif
}
EXPORT_SYMBOL(tcf_exts_destroy);
/*
 * Parse the action/police attributes of a filter into @exts.
 *
 * @tb:       parsed classifier attributes
 * @rate_tlv: passed through to the action init helpers
 * @exts:     zeroed first, then filled with the created action (if any)
 * @map:      attribute numbers the classifier uses for action and police
 *
 * A police attribute takes precedence over an action attribute; an action
 * created from it is tagged TCA_OLD_COMPAT.  Without CONFIG_NET_CLS_ACT
 * the presence of either attribute is rejected with -EOPNOTSUPP.
 *
 * Returns 0 on success or a negative errno.
 */
int tcf_exts_validate(struct tcf_proto *tp, struct nlattr **tb,
		      struct nlattr *rate_tlv, struct tcf_exts *exts,
		      const struct tcf_ext_map *map)
{
	memset(exts, 0, sizeof(*exts));

#ifdef CONFIG_NET_CLS_ACT
	{
		struct tc_action *act;

		if (map->police && tb[map->police]) {
			act = tcf_action_init_1(tb[map->police], rate_tlv,
						"police", TCA_ACT_NOREPLACE,
						TCA_ACT_BIND);
			if (IS_ERR(act))
				return PTR_ERR(act);

			act->type = TCA_OLD_COMPAT;
			exts->action = act;
			return 0;
		}

		if (map->action && tb[map->action]) {
			act = tcf_action_init(tb[map->action], rate_tlv, NULL,
					      TCA_ACT_NOREPLACE, TCA_ACT_BIND);
			if (IS_ERR(act))
				return PTR_ERR(act);

			exts->action = act;
		}
	}
	return 0;
#else
	if ((map->police && tb[map->police]) ||
	    (map->action && tb[map->action]))
		return -EOPNOTSUPP;

	return 0;
#endif
}
EXPORT_SYMBOL(tcf_exts_validate);
/*
 * Install the action from @src into @dst.
 *
 * The pointer swap is done under the tcf tree lock; the action previously
 * held by @dst, if any, is destroyed afterwards, outside the lock.
 * Nothing happens if @src carries no action or CONFIG_NET_CLS_ACT is off.
 */
void tcf_exts_change(struct tcf_proto *tp, struct tcf_exts *dst,
		     struct tcf_exts *src)
{
#ifdef CONFIG_NET_CLS_ACT
	struct tc_action *old;

	if (!src->action)
		return;

	tcf_tree_lock(tp);
	old = dst->action;
	dst->action = src->action;
	tcf_tree_unlock(tp);

	if (old)
		tcf_action_destroy(old, TCA_ACT_UNBIND);
#endif
}
EXPORT_SYMBOL(tcf_exts_change);
/*
 * Dump the action attached to @exts as a nested attribute.
 *
 * Actions not tagged TCA_OLD_COMPAT are nested under map->action; tagged
 * ones are nested under map->police (if the map has one) using the old
 * dump format.  Returns 0 on success (including "nothing to dump") and -1
 * if the message ran out of room.
 */
int tcf_exts_dump(struct sk_buff *skb, struct tcf_exts *exts,
		  const struct tcf_ext_map *map)
{
#ifdef CONFIG_NET_CLS_ACT
	struct nlattr *nest;

	if (!map->action || !exts->action)
		return 0;

	/*
	 * again for backward compatible mode - we want
	 * to work with both old and new modes of entering
	 * tc data even if iproute2 was newer - jhs
	 */
	if (exts->action->type != TCA_OLD_COMPAT) {
		nest = nla_nest_start(skb, map->action);
		if (nest == NULL)
			return -1;
		if (tcf_action_dump(skb, exts->action, 0, 0) < 0)
			return -1;
		nla_nest_end(skb, nest);
	} else if (map->police) {
		nest = nla_nest_start(skb, map->police);
		if (nest == NULL)
			return -1;
		if (tcf_action_dump_old(skb, exts->action, 0, 0) < 0)
			return -1;
		nla_nest_end(skb, nest);
	}
#endif
	return 0;
}
EXPORT_SYMBOL(tcf_exts_dump);
/*
 * Copy the statistics of the action attached to @exts into @skb.
 * Returns 0 on success or when there is no action, -1 if
 * tcf_action_copy_stats() fails.
 */
int tcf_exts_dump_stats(struct sk_buff *skb, struct tcf_exts *exts,
			const struct tcf_ext_map *map)
{
#ifdef CONFIG_NET_CLS_ACT
	if (exts->action && tcf_action_copy_stats(skb, exts->action, 1) < 0)
		return -1;
#endif
	return 0;
}
EXPORT_SYMBOL(tcf_exts_dump_stats);
/*
 * Register the rtnetlink handlers for the filter messages.  All three
 * message types are served by tc_ctl_tfilter(); only RTM_GETTFILTER also
 * has a dump callback.
 */
static int __init tc_filter_init(void)
{
	rtnl_register(PF_UNSPEC, RTM_NEWTFILTER, tc_ctl_tfilter, NULL);
	rtnl_register(PF_UNSPEC, RTM_DELTFILTER, tc_ctl_tfilter, NULL);
	rtnl_register(PF_UNSPEC, RTM_GETTFILTER, tc_ctl_tfilter,
		      tc_dump_tfilter);

	return 0;
}
subsys_initcall(tc_filter_init);

View File

@@ -0,0 +1,304 @@
/*
* net/sched/cls_basic.c Basic Packet Classifier.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
* Authors: Thomas Graf <tgraf@suug.ch>
*/
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/rtnetlink.h>
#include <linux/skbuff.h>
#include <net/netlink.h>
#include <net/act_api.h>
#include <net/pkt_cls.h>
/* Per-tcf_proto root of the basic classifier (stored in tp->root). */
struct basic_head
{
/* last handle produced by the automatic handle generator in basic_change() */
u32 hgenerator;
/* list of struct basic_filter, linked through their `link' member */
struct list_head flist;
};
/* One filter node of the basic classifier. */
struct basic_filter
{
/* handle identifying this filter within its basic_head */
u32 handle;
/* attached actions, executed on a match */
struct tcf_exts exts;
/* ematch tree a packet has to satisfy */
struct tcf_ematch_tree ematches;
/* classification result copied out on a match */
struct tcf_result res;
/* membership in basic_head.flist */
struct list_head link;
};
/* Attribute numbers the basic classifier uses for police and action. */
static const struct tcf_ext_map basic_ext_map = {
	.police	= TCA_BASIC_POLICE,
	.action	= TCA_BASIC_ACT,
};
/*
 * Classify @skb against the filters of @tp in list order.
 *
 * For the first filter whose ematch tree matches and whose actions return
 * a non-negative verdict, the filter's result is left in @res and that
 * verdict is returned.  Returns -1 if no filter accepted the packet.
 */
static int basic_classify(struct sk_buff *skb, struct tcf_proto *tp,
			  struct tcf_result *res)
{
	struct basic_head *head = (struct basic_head *) tp->root;
	struct basic_filter *f;
	int verdict;

	list_for_each_entry(f, &head->flist, link) {
		if (tcf_em_tree_match(skb, &f->ematches, NULL)) {
			*res = f->res;
			verdict = tcf_exts_exec(skb, &f->exts, res);
			if (verdict >= 0)
				return verdict;
		}
	}

	return -1;
}
/*
 * Look up the filter with @handle.
 *
 * Returns the filter as an opaque unsigned long, or 0 if @tp has no root
 * or no filter carries that handle.  The whole list is scanned, so the
 * last matching entry is the one reported.
 */
static unsigned long basic_get(struct tcf_proto *tp, u32 handle)
{
	struct basic_head *head = (struct basic_head *) tp->root;
	struct basic_filter *f;
	unsigned long found = 0UL;

	if (!head)
		return 0UL;

	list_for_each_entry(f, &head->flist, link) {
		if (f->handle != handle)
			continue;
		found = (unsigned long) f;
	}

	return found;
}
/* ->put() hook; intentionally empty for the basic classifier. */
static void basic_put(struct tcf_proto *tp, unsigned long f)
{
}
/*
 * Allocate the classifier root with an empty filter list and attach it to
 * @tp.  Returns 0 on success, -ENOBUFS if the allocation fails.
 */
static int basic_init(struct tcf_proto *tp)
{
	struct basic_head *head = kzalloc(sizeof(*head), GFP_KERNEL);

	if (!head)
		return -ENOBUFS;

	INIT_LIST_HEAD(&head->flist);
	tp->root = head;

	return 0;
}
/*
 * Tear down and free one filter.  The callers in this file unlink it from
 * the filter list before calling this.
 */
static inline void basic_delete_filter(struct tcf_proto *tp,
struct basic_filter *f)
{
/* drop the class binding, then the actions and ematches, then the memory */
tcf_unbind_filter(tp, &f->res);
tcf_exts_destroy(tp, &f->exts);
tcf_em_tree_destroy(tp, &f->ematches);
kfree(f);
}
/* Destroy every filter of @tp and free the classifier root. */
static void basic_destroy(struct tcf_proto *tp)
{
	struct basic_head *head = tp->root;
	struct basic_filter *cur, *next;

	/* _safe variant: entries are freed while iterating */
	list_for_each_entry_safe(cur, next, &head->flist, link) {
		list_del(&cur->link);
		basic_delete_filter(tp, cur);
	}

	kfree(head);
}
static int basic_delete(struct tcf_proto *tp, unsigned long arg)
{
struct basic_head *head = (struct basic_head *) tp->root;
struct basic_filter *t, *f = (struct basic_filter *) arg;
list_for_each_entry(t, &head->flist, link)
if (t == f) {
tcf_tree_lock(tp);
list_del(&t->link);
tcf_tree_unlock(tp);
basic_delete_filter(tp, t);
return 0;
}
return -ENOENT;
}
/* Netlink policy for the attributes of the basic classifier. */
static const struct nla_policy basic_policy[TCA_BASIC_MAX + 1] = {
	[TCA_BASIC_EMATCHES]	= { .type = NLA_NESTED },
	[TCA_BASIC_CLASSID]	= { .type = NLA_U32 },
};
static inline int basic_set_parms(struct tcf_proto *tp, struct basic_filter *f,
unsigned long base, struct nlattr **tb,
struct nlattr *est)
{
int err = -EINVAL;
struct tcf_exts e;
struct tcf_ematch_tree t;
err = tcf_exts_validate(tp, tb, est, &e, &basic_ext_map);
if (err < 0)
return err;
err = tcf_em_tree_validate(tp, tb[TCA_BASIC_EMATCHES], &t);
if (err < 0)
goto errout;
if (tb[TCA_BASIC_CLASSID]) {
f->res.classid = nla_get_u32(tb[TCA_BASIC_CLASSID]);
tcf_bind_filter(tp, &f->res, base);
}
tcf_exts_change(tp, &f->exts, &e);
tcf_em_tree_change(tp, &f->ematches, &t);
return 0;
errout:
tcf_exts_destroy(tp, &e);
return err;
}
/*
 * Create a new filter or update an existing one.
 *
 * @base:   passed on to basic_set_parms()
 * @handle: requested handle; 0 lets a new filter get a generated one
 * @tca:    top-level attributes; TCA_OPTIONS is mandatory
 * @arg:    in: existing filter (or 0); out: the newly created filter
 *
 * Returns 0 on success or a negative errno.
 */
static int basic_change(struct tcf_proto *tp, unsigned long base, u32 handle,
struct nlattr **tca, unsigned long *arg)
{
int err;
struct basic_head *head = (struct basic_head *) tp->root;
struct nlattr *tb[TCA_BASIC_MAX + 1];
struct basic_filter *f = (struct basic_filter *) *arg;
if (tca[TCA_OPTIONS] == NULL)
return -EINVAL;
err = nla_parse_nested(tb, TCA_BASIC_MAX, tca[TCA_OPTIONS],
basic_policy);
if (err < 0)
return err;
if (f != NULL) {
/* Updating an existing filter: a given handle has to match it. */
if (handle && f->handle != handle)
return -EINVAL;
return basic_set_parms(tp, f, base, tb, tca[TCA_RATE]);
}
err = -ENOBUFS;
f = kzalloc(sizeof(*f), GFP_KERNEL);
if (f == NULL)
goto errout;
err = -EINVAL;
if (handle)
f->handle = handle;
else {
/* Generate a handle: advance hgenerator until an unused value is found. */
unsigned int i = 0x80000000;
do {
if (++head->hgenerator == 0x7FFFFFFF)
head->hgenerator = 1;
} while (--i > 0 && basic_get(tp, head->hgenerator));
/* i is unsigned, so this is true only once the attempt budget hit zero. */
if (i <= 0) {
printk(KERN_ERR "Insufficient number of handles\n");
goto errout;
}
f->handle = head->hgenerator;
}
err = basic_set_parms(tp, f, base, tb, tca[TCA_RATE]);
if (err < 0)
goto errout;
/* Publish the fully set up filter. */
tcf_tree_lock(tp);
list_add(&f->link, &head->flist);
tcf_tree_unlock(tp);
*arg = (unsigned long) f;
return 0;
errout:
/* Only a filter allocated by this call (caller passed *arg == 0) is freed. */
if (*arg == 0UL && f)
kfree(f);
return err;
}
/*
 * Iterate over the filters of @tp for a walker.
 *
 * The first arg->skip filters are only counted; for the rest arg->fn() is
 * called.  A negative return from the callback sets arg->stop and ends the
 * walk without counting that filter.
 */
static void basic_walk(struct tcf_proto *tp, struct tcf_walker *arg)
{
	struct basic_head *head = (struct basic_head *) tp->root;
	struct basic_filter *f;

	list_for_each_entry(f, &head->flist, link) {
		if (arg->count >= arg->skip &&
		    arg->fn(tp, (unsigned long) f, arg) < 0) {
			arg->stop = 1;
			break;
		}
		arg->count++;
	}
}
/*
 * Dump filter @fh into @skb.
 *
 * Sets t->tcm_handle and emits a TCA_OPTIONS nest holding the class id
 * (if non-zero), the actions and the ematch tree.  With @fh == 0 nothing
 * is added.
 *
 * Returns skb->len on success; on failure the nest is cancelled and -1 is
 * returned.
 */
static int basic_dump(struct tcf_proto *tp, unsigned long fh,
		      struct sk_buff *skb, struct tcmsg *t)
{
	struct basic_filter *f = (struct basic_filter *) fh;
	struct nlattr *nest;

	if (!f)
		return skb->len;

	t->tcm_handle = f->handle;

	nest = nla_nest_start(skb, TCA_OPTIONS);
	if (!nest)
		goto nla_put_failure;

	if (f->res.classid) {
		NLA_PUT_U32(skb, TCA_BASIC_CLASSID, f->res.classid);
	}

	if (tcf_exts_dump(skb, &f->exts, &basic_ext_map) < 0)
		goto nla_put_failure;
	if (tcf_em_tree_dump(skb, &f->ematches, TCA_BASIC_EMATCHES) < 0)
		goto nla_put_failure;

	nla_nest_end(skb, nest);
	return skb->len;

nla_put_failure:
	nla_nest_cancel(skb, nest);
	return -1;
}
/* Classifier operations of the "basic" classifier. */
static struct tcf_proto_ops cls_basic_ops __read_mostly = {
	.kind		= "basic",
	.owner		= THIS_MODULE,
	/* lifetime */
	.init		= basic_init,
	.destroy	= basic_destroy,
	/* datapath */
	.classify	= basic_classify,
	/* configuration */
	.get		= basic_get,
	.put		= basic_put,
	.change		= basic_change,
	.delete		= basic_delete,
	/* dumping */
	.walk		= basic_walk,
	.dump		= basic_dump,
};
/*
 * Module entry point: register the basic classifier.
 * Returns the result of register_tcf_proto_ops().
 */
static int __init init_basic(void)
{
	return register_tcf_proto_ops(&cls_basic_ops);
}
/* Module exit point: unregister the basic classifier. */
static void __exit exit_basic(void)
{
	unregister_tcf_proto_ops(&cls_basic_ops);
}
/* Module load/unload hooks and license tag for the basic classifier. */
module_init(init_basic)
module_exit(exit_basic)
MODULE_LICENSE("GPL");

View File

@@ -0,0 +1,290 @@
/*
* net/sched/cls_cgroup.c Control Group Classifier
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
* Authors: Thomas Graf <tgraf@suug.ch>
*/
#include <linux/module.h>
#include <linux/types.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <linux/cgroup.h>
#include <net/rtnetlink.h>
#include <net/pkt_cls.h>
/* Per-cgroup state of the net_cls subsystem. */
struct cgroup_cls_state
{
/* embedded generic state; container_of() on it recovers this structure */
struct cgroup_subsys_state css;
/* class id reported by the classifier for tasks in this cgroup; 0 = none */
u32 classid;
};
/* Map a cgroup to its net_cls state. */
static inline struct cgroup_cls_state *cgrp_cls_state(struct cgroup *cgrp)
{
	struct cgroup_subsys_state *state;

	state = cgroup_subsys_state(cgrp, net_cls_subsys_id);
	return container_of(state, struct cgroup_cls_state, css);
}
/* Map a task to the net_cls state of the cgroup it belongs to. */
static inline struct cgroup_cls_state *task_cls_state(struct task_struct *p)
{
	struct cgroup_subsys_state *state;

	state = task_subsys_state(p, net_cls_subsys_id);
	return container_of(state, struct cgroup_cls_state, css);
}
/*
 * Allocate the net_cls state for a new cgroup.
 *
 * A cgroup with a parent starts out with the parent's class id; otherwise
 * the class id is 0.  Returns the embedded css, or ERR_PTR(-ENOMEM).
 */
static struct cgroup_subsys_state *cgrp_create(struct cgroup_subsys *ss,
					       struct cgroup *cgrp)
{
	struct cgroup_cls_state *cs;

	cs = kzalloc(sizeof(*cs), GFP_KERNEL);
	if (!cs)
		return ERR_PTR(-ENOMEM);

	if (cgrp->parent)
		cs->classid = cgrp_cls_state(cgrp->parent)->classid;

	return &cs->css;
}
/* Free the net_cls state of a cgroup that is going away. */
static void cgrp_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
{
	struct cgroup_cls_state *cs = cgrp_cls_state(cgrp);

	kfree(cs);
}
static u64 read_classid(struct cgroup *cgrp, struct cftype *cft)
{
return cgrp_cls_state(cgrp)->classid;
}
/*
 * write_u64 handler of the "classid" control file.  The value is truncated
 * to 32 bits.  Always returns 0.
 */
static int write_classid(struct cgroup *cgrp, struct cftype *cft, u64 value)
{
	struct cgroup_cls_state *cs = cgrp_cls_state(cgrp);

	cs->classid = (u32) value;
	return 0;
}
/* Control files added to every cgroup by cgrp_populate(). */
static struct cftype ss_files[] = {
	{
		.name		= "classid",
		.write_u64	= write_classid,
		.read_u64	= read_classid,
	},
};
/*
 * Add the net_cls control files to @cgrp.
 * Returns the result of cgroup_add_files().
 */
static int cgrp_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
{
	return cgroup_add_files(cgrp, ss, ss_files, ARRAY_SIZE(ss_files));
}
/* The "net_cls" cgroup subsystem descriptor. */
struct cgroup_subsys net_cls_subsys = {
	.name		= "net_cls",
	.subsys_id	= net_cls_subsys_id,
	.create		= cgrp_create,
	.populate	= cgrp_populate,
	.destroy	= cgrp_destroy,
};
/* The single filter instance of a cgroup classifier (stored in tp->root). */
struct cls_cgroup_head
{
/* handle given when the instance was created in cls_cgroup_change() */
u32 handle;
/* attached actions, executed on a match */
struct tcf_exts exts;
/* ematch tree a packet has to satisfy */
struct tcf_ematch_tree ematches;
};
/*
 * Classify @skb by the class id of the sending task's cgroup.
 *
 * Returns -1 (no match) when called while serving a softirq, when the
 * task's cgroup has no class id, or when the ematch tree rejects the
 * packet.  Otherwise the class id is stored in @res and the verdict of
 * the attached actions is returned.
 */
static int cls_cgroup_classify(struct sk_buff *skb, struct tcf_proto *tp,
			       struct tcf_result *res)
{
	struct cls_cgroup_head *head = tp->root;
	u32 classid;

	/*
	 * Due to the nature of the classifier, packets originating from
	 * softirq context have to be ignored: `current' does not identify
	 * the sender there, so using it would lead to false results.
	 */
	if (in_serving_softirq())
		return -1;

	rcu_read_lock();
	classid = task_cls_state(current)->classid;
	rcu_read_unlock();

	/* no class id configured, or the packet fails the ematch tree */
	if (classid == 0 || !tcf_em_tree_match(skb, &head->ematches, NULL))
		return -1;

	res->class = 0;
	res->classid = classid;

	return tcf_exts_exec(skb, &head->exts, res);
}
/* ->get() hook; the cgroup classifier never reports an existing node. */
static unsigned long cls_cgroup_get(struct tcf_proto *tp, u32 handle)
{
	return 0UL;
}
/* ->put() hook; intentionally empty for the cgroup classifier. */
static void cls_cgroup_put(struct tcf_proto *tp, unsigned long f)
{
}
/*
 * ->init() hook.  Nothing is set up here; the head is allocated by
 * cls_cgroup_change() when tp->root is still NULL.  Always returns 0.
 */
static int cls_cgroup_init(struct tcf_proto *tp)
{
	return 0;
}
/* Attribute numbers the cgroup classifier uses for police and action. */
static const struct tcf_ext_map cgroup_ext_map = {
	.police	= TCA_CGROUP_POLICE,
	.action	= TCA_CGROUP_ACT,
};
/* Netlink policy for the attributes of the cgroup classifier. */
static const struct nla_policy cgroup_policy[TCA_CGROUP_MAX + 1] = {
	[TCA_CGROUP_EMATCHES]	= { .type = NLA_NESTED },
};
/*
 * Create or update the single filter instance of a cgroup classifier.
 *
 * @tp:     classifier instance; tp->root holds the head once created
 * @base:   unused here
 * @handle: handle of the instance; must be non-zero on creation and must
 *          match the existing head afterwards
 * @tca:    top-level attributes; TCA_OPTIONS is mandatory
 * @arg:    unused here
 *
 * Actions and ematches are validated into temporaries and only installed
 * when both succeed.  If ematch validation fails, the already validated
 * actions are released again so that they are not leaked.
 *
 * Returns 0 on success or a negative errno.
 */
static int cls_cgroup_change(struct tcf_proto *tp, unsigned long base,
			     u32 handle, struct nlattr **tca,
			     unsigned long *arg)
{
	struct nlattr *tb[TCA_CGROUP_MAX + 1];
	struct cls_cgroup_head *head = tp->root;
	struct tcf_ematch_tree t;
	struct tcf_exts e;
	int err;

	if (!tca[TCA_OPTIONS])
		return -EINVAL;

	if (head == NULL) {
		/* First change on this tp: create the head. */
		if (!handle)
			return -EINVAL;

		head = kzalloc(sizeof(*head), GFP_KERNEL);
		if (head == NULL)
			return -ENOBUFS;

		head->handle = handle;

		tcf_tree_lock(tp);
		tp->root = head;
		tcf_tree_unlock(tp);
	}

	if (handle != head->handle)
		return -ENOENT;

	err = nla_parse_nested(tb, TCA_CGROUP_MAX, tca[TCA_OPTIONS],
			       cgroup_policy);
	if (err < 0)
		return err;

	err = tcf_exts_validate(tp, tb, tca[TCA_RATE], &e, &cgroup_ext_map);
	if (err < 0)
		return err;

	err = tcf_em_tree_validate(tp, tb[TCA_CGROUP_EMATCHES], &t);
	if (err < 0) {
		/* e may already hold actions; release them before bailing out */
		tcf_exts_destroy(tp, &e);
		return err;
	}

	tcf_exts_change(tp, &head->exts, &e);
	tcf_em_tree_change(tp, &head->ematches, &t);

	return 0;
}
/* Release actions, ematches and the head of @tp, if a head exists. */
static void cls_cgroup_destroy(struct tcf_proto *tp)
{
	struct cls_cgroup_head *head = tp->root;

	if (!head)
		return;

	tcf_exts_destroy(tp, &head->exts);
	tcf_em_tree_destroy(tp, &head->ematches);
	kfree(head);
}
/* ->delete() hook; deleting individual nodes is not supported. */
static int cls_cgroup_delete(struct tcf_proto *tp, unsigned long arg)
{
	return -EOPNOTSUPP;
}
/*
 * Walker hook: present the single head to the walker.
 *
 * If the head is not being skipped, arg->fn() is called on it; a negative
 * return sets arg->stop and leaves the count untouched.  Otherwise the
 * count is advanced by one.
 */
static void cls_cgroup_walk(struct tcf_proto *tp, struct tcf_walker *arg)
{
	struct cls_cgroup_head *head = tp->root;

	if (arg->count >= arg->skip &&
	    arg->fn(tp, (unsigned long) head, arg) < 0) {
		arg->stop = 1;
		return;
	}

	arg->count++;
}
/*
 * Dump the cgroup filter into @skb.
 *
 * Sets t->tcm_handle, emits a TCA_OPTIONS nest with the actions and the
 * ematch tree, and then the action statistics outside the nest.
 *
 * Returns skb->len on success; on failure the message is trimmed back to
 * its length on entry and -1 is returned.
 */
static int cls_cgroup_dump(struct tcf_proto *tp, unsigned long fh,
			   struct sk_buff *skb, struct tcmsg *t)
{
	struct cls_cgroup_head *head = tp->root;
	unsigned char *start = skb_tail_pointer(skb);
	struct nlattr *nest;

	t->tcm_handle = head->handle;

	nest = nla_nest_start(skb, TCA_OPTIONS);
	if (!nest)
		goto nla_put_failure;

	if (tcf_exts_dump(skb, &head->exts, &cgroup_ext_map) < 0)
		goto nla_put_failure;
	if (tcf_em_tree_dump(skb, &head->ematches, TCA_CGROUP_EMATCHES) < 0)
		goto nla_put_failure;

	nla_nest_end(skb, nest);

	if (tcf_exts_dump_stats(skb, &head->exts, &cgroup_ext_map) < 0)
		goto nla_put_failure;

	return skb->len;

nla_put_failure:
	nlmsg_trim(skb, start);
	return -1;
}
/* Classifier operations of the "cgroup" classifier. */
static struct tcf_proto_ops cls_cgroup_ops __read_mostly = {
	.kind		= "cgroup",
	.owner		= THIS_MODULE,
	/* lifetime */
	.init		= cls_cgroup_init,
	.destroy	= cls_cgroup_destroy,
	/* datapath */
	.classify	= cls_cgroup_classify,
	/* configuration */
	.get		= cls_cgroup_get,
	.put		= cls_cgroup_put,
	.change		= cls_cgroup_change,
	.delete		= cls_cgroup_delete,
	/* dumping */
	.walk		= cls_cgroup_walk,
	.dump		= cls_cgroup_dump,
};
/*
 * Module entry point: register the cgroup classifier.
 * Returns the result of register_tcf_proto_ops().
 */
static int __init init_cgroup_cls(void)
{
	return register_tcf_proto_ops(&cls_cgroup_ops);
}
/* Module exit point: unregister the cgroup classifier. */
static void __exit exit_cgroup_cls(void)
{
	unregister_tcf_proto_ops(&cls_cgroup_ops);
}
/* Module load/unload hooks and license tag for the cgroup classifier. */
module_init(init_cgroup_cls);
module_exit(exit_cgroup_cls);
MODULE_LICENSE("GPL");

707
kernel/net/sched/cls_flow.c Normal file
View File

@@ -0,0 +1,707 @@
/*
* net/sched/cls_flow.c Generic flow classifier
*
* Copyright (c) 2007, 2008 Patrick McHardy <kaber@trash.net>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version 2
* of the License, or (at your option) any later version.
*/
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/list.h>
#include <linux/jhash.h>
#include <linux/random.h>
#include <linux/pkt_cls.h>
#include <linux/skbuff.h>
#include <linux/in.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
#include <linux/if_vlan.h>
#include <net/pkt_cls.h>
#include <net/ip.h>
#include <net/route.h>
#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
#include <net/netfilter/nf_conntrack.h>
#endif
/* Root of a flow classifier instance. */
struct flow_head {
/* NOTE(review): presumably the list of struct flow_filter (via .list) - users are outside this chunk */
struct list_head filters;
};
/*
 * One filter of the flow classifier.
 * NOTE(review): the code using most of these fields lies outside this
 * chunk; the notes below are limited to what the declarations show.
 */
struct flow_filter {
/* list linkage */
struct list_head list;
/* attached actions */
struct tcf_exts exts;
/* ematch tree */
struct tcf_ematch_tree ematches;
/* timer and period; presumably for re-seeding hashrnd - TODO confirm */
struct timer_list perturb_timer;
u32 perturb_period;
u32 handle;
/* number of keys and bitmask of selected keys - presumably; verify against users */
u32 nkeys;
u32 keymask;
u32 mode;
/* parameters applied to the computed key value - semantics defined by the users */
u32 mask;
u32 xor;
u32 rshift;
u32 addend;
u32 divisor;
u32 baseclass;
u32 hashrnd;
};
/* Attribute numbers the flow classifier uses for police and action. */
static const struct tcf_ext_map flow_ext_map = {
	.police	= TCA_FLOW_POLICE,
	.action	= TCA_FLOW_ACT,
};
/*
 * Fold a pointer value into 32 bits: the low 32 bits XORed with the high
 * 32 bits when longs are wider than 32 bits, otherwise the value itself.
 */
static inline u32 addr_fold(void *addr)
{
	unsigned long a = (unsigned long)addr;
	unsigned long low = a & 0xFFFFFFFF;
	unsigned long high = BITS_PER_LONG > 32 ? a >> 32 : 0;

	return low ^ high;
}
/*
 * Source key of a packet: the IPv4 source address, the last 32 bits of
 * the IPv6 source address (both in host byte order), or for other
 * protocols the folded address of the owning socket.
 */
static u32 flow_get_src(const struct sk_buff *skb)
{
	if (skb->protocol == htons(ETH_P_IP))
		return ntohl(ip_hdr(skb)->saddr);

	if (skb->protocol == htons(ETH_P_IPV6))
		return ntohl(ipv6_hdr(skb)->saddr.s6_addr32[3]);

	return addr_fold(skb->sk);
}
/*
 * Destination key of a packet: the IPv4 destination address, the last 32
 * bits of the IPv6 destination address (both in host byte order), or for
 * other protocols the folded dst entry address XORed with the protocol.
 */
static u32 flow_get_dst(const struct sk_buff *skb)
{
	if (skb->protocol == htons(ETH_P_IP))
		return ntohl(ip_hdr(skb)->daddr);

	if (skb->protocol == htons(ETH_P_IPV6))
		return ntohl(ipv6_hdr(skb)->daddr.s6_addr32[3]);

	return addr_fold(skb_dst(skb)) ^ (__force u16)skb->protocol;
}
static u32 flow_get_proto(const struct sk_buff *skb)
{
switch (skb->protocol) {
case htons(ETH_P_IP):
return ip_hdr(skb)->protocol;
case htons(ETH_P_IPV6):
return ipv6_hdr(skb)->nexthdr;
default:
return 0;
}
}
/*
 * Return 1 when the given IP protocol number is one of those whose first
 * four header bytes are read as port keys by flow_get_proto_src() and
 * flow_get_proto_dst(), 0 otherwise.
 */
static int has_ports(u8 protocol)
{
	return protocol == IPPROTO_TCP ||
	       protocol == IPPROTO_UDP ||
	       protocol == IPPROTO_UDPLITE ||
	       protocol == IPPROTO_SCTP ||
	       protocol == IPPROTO_DCCP ||
	       protocol == IPPROTO_ESP;
}
static u32 flow_get_proto_src(const struct sk_buff *skb)
{
u32 res = 0;
switch (skb->protocol) {
case htons(ETH_P_IP): {
struct iphdr *iph = ip_hdr(skb);
if (!(iph->frag_off&htons(IP_MF|IP_OFFSET)) &&
has_ports(iph->protocol))
res = ntohs(*(__be16 *)((void *)iph + iph->ihl * 4));
break;
}
case htons(ETH_P_IPV6): {
struct ipv6hdr *iph = ipv6_hdr(skb);
if (has_ports(iph->nexthdr))
res = ntohs(*(__be16 *)&iph[1]);
break;
}
default:
res = addr_fold(skb->sk);
}
return res;
}
static u32 flow_get_proto_dst(const struct sk_buff *skb)
{
u32 res = 0;
switch (skb->protocol) {
case htons(ETH_P_IP): {
struct iphdr *iph = ip_hdr(skb);
if (!(iph->frag_off&htons(IP_MF|IP_OFFSET)) &&
has_ports(iph->protocol))
res = ntohs(*(__be16 *)((void *)iph + iph->ihl * 4 + 2));
break;
}
case htons(ETH_P_IPV6): {
struct ipv6hdr *iph = ipv6_hdr(skb);
if (has_ports(iph->nexthdr))
res = ntohs(*(__be16 *)((void *)&iph[1] + 2));
break;
}
default:
res = addr_fold(skb_dst(skb)) ^ (__force u16)skb->protocol;
}
return res;
}
/* FLOW_KEY_IIF: the skb's iif field. */
static u32 flow_get_iif(const struct sk_buff *skb)
{
	const u32 iif = skb->iif;

	return iif;
}
/* FLOW_KEY_PRIORITY: the skb's priority field. */
static u32 flow_get_priority(const struct sk_buff *skb)
{
	const u32 prio = skb->priority;

	return prio;
}
/* FLOW_KEY_MARK: the skb's mark field. */
static u32 flow_get_mark(const struct sk_buff *skb)
{
	const u32 mark = skb->mark;

	return mark;
}
/*
 * FLOW_KEY_NFCT: the folded skb->nfct pointer when conntrack is built
 * (built-in or modular), otherwise 0.
 */
static u32 flow_get_nfct(const struct sk_buff *skb)
{
	u32 folded = 0;

#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
	folded = addr_fold(skb->nfct);
#endif
	return folded;
}
/*
 * CTTUPLE(skb, member) - read one member of the conntrack tuple for skb.
 *
 * With conntrack built, the statement expression looks up the conntrack
 * entry via nf_ct_get() and evaluates to the requested member of the tuple
 * for the packet's direction.  When there is no entry, or when conntrack is
 * not built at all, it jumps to a label named "fallback", which every
 * caller must therefore provide.
 */
#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
#define CTTUPLE(skb, member) \
({ \
enum ip_conntrack_info ctinfo; \
struct nf_conn *ct = nf_ct_get(skb, &ctinfo); \
if (ct == NULL) \
goto fallback; \
ct->tuplehash[CTINFO2DIR(ctinfo)].tuple.member; \
})
#else
#define CTTUPLE(skb, member) \
({ \
goto fallback; \
0; \
})
#endif
/*
 * FLOW_KEY_NFCT_SRC: source address taken from the conntrack tuple (IPv4
 * address, or last 32 bits of the IPv6 address) in host byte order.
 * Falls back to flow_get_src() for other protocols or when CTTUPLE()
 * jumps to the fallback label.
 */
static u32 flow_get_nfct_src(const struct sk_buff *skb)
{
switch (skb->protocol) {
case htons(ETH_P_IP):
return ntohl(CTTUPLE(skb, src.u3.ip));
case htons(ETH_P_IPV6):
return ntohl(CTTUPLE(skb, src.u3.ip6[3]));
}
/* reached for non-IP protocols and via the goto hidden in CTTUPLE() */
fallback:
return flow_get_src(skb);
}
/*
 * FLOW_KEY_NFCT_DST: destination address taken from the conntrack tuple
 * (IPv4 address, or last 32 bits of the IPv6 address) in host byte order.
 * Falls back to flow_get_dst() for other protocols or when CTTUPLE()
 * jumps to the fallback label.
 */
static u32 flow_get_nfct_dst(const struct sk_buff *skb)
{
switch (skb->protocol) {
case htons(ETH_P_IP):
return ntohl(CTTUPLE(skb, dst.u3.ip));
case htons(ETH_P_IPV6):
return ntohl(CTTUPLE(skb, dst.u3.ip6[3]));
}
/* reached for non-IP protocols and via the goto hidden in CTTUPLE() */
fallback:
return flow_get_dst(skb);
}
/*
 * FLOW_KEY_NFCT_PROTO_SRC: the src.u.all member of the conntrack tuple in
 * host byte order; flow_get_proto_src() is used when CTTUPLE() jumps to
 * the fallback label.
 */
static u32 flow_get_nfct_proto_src(const struct sk_buff *skb)
{
return ntohs(CTTUPLE(skb, src.u.all));
/* only reachable through the goto hidden in CTTUPLE() */
fallback:
return flow_get_proto_src(skb);
}
/*
 * FLOW_KEY_NFCT_PROTO_DST: the dst.u.all member of the conntrack tuple in
 * host byte order; flow_get_proto_dst() is used when CTTUPLE() jumps to
 * the fallback label.
 */
static u32 flow_get_nfct_proto_dst(const struct sk_buff *skb)
{
return ntohs(CTTUPLE(skb, dst.u.all));
/* only reachable through the goto hidden in CTTUPLE() */
fallback:
return flow_get_proto_dst(skb);
}
/*
 * FLOW_KEY_RTCLASSID: tclassid of the skb's dst entry when
 * CONFIG_NET_CLS_ROUTE is enabled and a dst entry is attached; otherwise 0.
 */
static u32 flow_get_rtclassid(const struct sk_buff *skb)
{
#ifdef CONFIG_NET_CLS_ROUTE
	struct dst_entry *route = skb_dst(skb);

	if (route != NULL)
		return route->tclassid;
#endif
	return 0;
}
static u32 flow_get_skuid(const struct sk_buff *skb)
{
if (skb->sk && skb->sk->sk_socket && skb->sk->sk_socket->file)
return skb->sk->sk_socket->file->f_cred->fsuid;
return 0;
}
static u32 flow_get_skgid(const struct sk_buff *skb)
{
if (skb->sk && skb->sk->sk_socket && skb->sk->sk_socket->file)
return skb->sk->sk_socket->file->f_cred->fsgid;
return 0;
}
static u32 flow_get_vlan_tag(const struct sk_buff *skb)
{
u16 uninitialized_var(tag);
if (vlan_get_tag(skb, &tag) < 0)
return 0;
return tag & VLAN_VID_MASK;
}
static u32 flow_key_get(const struct sk_buff *skb, int key)
{
switch (key) {
case FLOW_KEY_SRC:
return flow_get_src(skb);
case FLOW_KEY_DST:
return flow_get_dst(skb);
case FLOW_KEY_PROTO:
return flow_get_proto(skb);
case FLOW_KEY_PROTO_SRC:
return flow_get_proto_src(skb);
case FLOW_KEY_PROTO_DST:
return flow_get_proto_dst(skb);
case FLOW_KEY_IIF:
return flow_get_iif(skb);
case FLOW_KEY_PRIORITY:
return flow_get_priority(skb);
case FLOW_KEY_MARK:
return flow_get_mark(skb);
case FLOW_KEY_NFCT:
return flow_get_nfct(skb);
case FLOW_KEY_NFCT_SRC:
return flow_get_nfct_src(skb);
case FLOW_KEY_NFCT_DST:
return flow_get_nfct_dst(skb);
case FLOW_KEY_NFCT_PROTO_SRC:
return flow_get_nfct_proto_src(skb);
case FLOW_KEY_NFCT_PROTO_DST:
return flow_get_nfct_proto_dst(skb);
case FLOW_KEY_RTCLASSID:
return flow_get_rtclassid(skb);
case FLOW_KEY_SKUID:
return flow_get_skuid(skb);
case FLOW_KEY_SKGID:
return flow_get_skgid(skb);
case FLOW_KEY_VLAN_TAG:
return flow_get_vlan_tag(skb);
default:
WARN_ON(1);
return 0;
}
}
/*
 * flow_classify - classify skb against the filters of this tcf_proto.
 *
 * Filters are tried in list order.  For the first filter whose ematch tree
 * matches, the keys selected by keymask are extracted (lowest bit first)
 * and turned into a class id:
 *   - FLOW_MODE_HASH: jhash2() over all keys, seeded with hashrnd;
 *   - otherwise:      ((keys[0] & mask) ^ xor) >> rshift) + addend.
 * The value is reduced modulo divisor when divisor is non-zero and then
 * added to baseclass.  The filter's extensions are executed; a negative
 * result moves on to the next filter.
 *
 * Returns the result of tcf_exts_exec() for the first accepting filter,
 * or -1 when no filter accepts the packet.
 */
static int flow_classify(struct sk_buff *skb, struct tcf_proto *tp,
			 struct tcf_result *res)
{
	struct flow_head *head = tp->root;
	struct flow_filter *f;
	u32 keymask;
	u32 classid;
	unsigned int n, key;
	int r;

	list_for_each_entry(f, &head->filters, list) {
		/*
		 * Fixed-size buffer instead of a variable-length array:
		 * flow_change() rejects key masks with bits above
		 * FLOW_KEY_MAX, so nkeys never exceeds FLOW_KEY_MAX + 1.
		 */
		u32 keys[FLOW_KEY_MAX + 1];

		if (!tcf_em_tree_match(skb, &f->ematches, NULL))
			continue;

		keymask = f->keymask;

		for (n = 0; n < f->nkeys; n++) {
			key = ffs(keymask) - 1;
			keymask &= ~(1 << key);
			keys[n] = flow_key_get(skb, key);
		}

		if (f->mode == FLOW_MODE_HASH)
			classid = jhash2(keys, f->nkeys, f->hashrnd);
		else {
			classid = keys[0];
			classid = (classid & f->mask) ^ f->xor;
			classid = (classid >> f->rshift) + f->addend;
		}

		if (f->divisor)
			classid %= f->divisor;

		res->class   = 0;
		res->classid = TC_H_MAKE(f->baseclass, f->baseclass + classid);

		r = tcf_exts_exec(skb, &f->exts, res);
		if (r < 0)
			continue;
		return r;
	}
	return -1;
}
/*
 * Timer callback: draw a new random hash seed for the filter passed in
 * arg and re-arm the timer while a perturbation period is configured.
 */
static void flow_perturbation(unsigned long arg)
{
	struct flow_filter *filter = (struct flow_filter *)arg;

	get_random_bytes(&filter->hashrnd, sizeof(filter->hashrnd));

	if (!filter->perturb_period)
		return;

	mod_timer(&filter->perturb_timer, jiffies + filter->perturb_period);
}
/*
 * Netlink attribute policy used by flow_change() when parsing the nested
 * TCA_OPTIONS attribute.
 */
static const struct nla_policy flow_policy[TCA_FLOW_MAX + 1] = {
[TCA_FLOW_KEYS] = { .type = NLA_U32 },
[TCA_FLOW_MODE] = { .type = NLA_U32 },
[TCA_FLOW_BASECLASS] = { .type = NLA_U32 },
[TCA_FLOW_RSHIFT] = { .type = NLA_U32 },
[TCA_FLOW_ADDEND] = { .type = NLA_U32 },
[TCA_FLOW_MASK] = { .type = NLA_U32 },
[TCA_FLOW_XOR] = { .type = NLA_U32 },
[TCA_FLOW_DIVISOR] = { .type = NLA_U32 },
[TCA_FLOW_ACT] = { .type = NLA_NESTED },
[TCA_FLOW_POLICE] = { .type = NLA_NESTED },
[TCA_FLOW_EMATCHES] = { .type = NLA_NESTED },
[TCA_FLOW_PERTURB] = { .type = NLA_U32 },
};
/*
 * flow_change - create a new flow filter or update an existing one.
 * @tp:     classifier instance
 * @base:   unused here
 * @handle: requested filter handle
 * @tca:    top-level netlink attributes; TCA_OPTIONS is mandatory
 * @arg:    in: existing filter (or 0 to create one); out: the filter
 *
 * New filters require a non-zero handle and TCA_FLOW_KEYS.  More than one
 * key is only accepted in FLOW_MODE_HASH, and TCA_FLOW_PERTURB is only
 * accepted in FLOW_MODE_HASH.  TCA_FLOW_RSHIFT must be below 32 because
 * flow_classify() shifts a 32-bit value by it.
 *
 * Returns 0 on success or a negative errno.
 */
static int flow_change(struct tcf_proto *tp, unsigned long base,
		       u32 handle, struct nlattr **tca,
		       unsigned long *arg)
{
	struct flow_head *head = tp->root;
	struct flow_filter *f;
	struct nlattr *opt = tca[TCA_OPTIONS];
	struct nlattr *tb[TCA_FLOW_MAX + 1];
	struct tcf_exts e;
	struct tcf_ematch_tree t;
	unsigned int nkeys = 0;
	unsigned int perturb_period = 0;
	u32 baseclass = 0;
	u32 keymask = 0;
	u32 mode;
	int err;

	if (opt == NULL)
		return -EINVAL;

	err = nla_parse_nested(tb, TCA_FLOW_MAX, opt, flow_policy);
	if (err < 0)
		return err;

	/*
	 * A shift count of 32 or more on a u32 is undefined; refuse it
	 * before anything has been allocated.
	 */
	if (tb[TCA_FLOW_RSHIFT] &&
	    nla_get_u32(tb[TCA_FLOW_RSHIFT]) >= 32)
		return -EINVAL;

	if (tb[TCA_FLOW_BASECLASS]) {
		baseclass = nla_get_u32(tb[TCA_FLOW_BASECLASS]);
		if (TC_H_MIN(baseclass) == 0)
			return -EINVAL;
	}

	if (tb[TCA_FLOW_KEYS]) {
		keymask = nla_get_u32(tb[TCA_FLOW_KEYS]);

		nkeys = hweight32(keymask);
		if (nkeys == 0)
			return -EINVAL;

		if (fls(keymask) - 1 > FLOW_KEY_MAX)
			return -EOPNOTSUPP;
	}

	err = tcf_exts_validate(tp, tb, tca[TCA_RATE], &e, &flow_ext_map);
	if (err < 0)
		return err;

	err = tcf_em_tree_validate(tp, tb[TCA_FLOW_EMATCHES], &t);
	if (err < 0)
		goto err1;

	f = (struct flow_filter *)*arg;
	if (f != NULL) {
		/* update of an existing filter */
		err = -EINVAL;
		if (f->handle != handle && handle)
			goto err2;

		mode = f->mode;
		if (tb[TCA_FLOW_MODE])
			mode = nla_get_u32(tb[TCA_FLOW_MODE]);
		if (mode != FLOW_MODE_HASH && nkeys > 1)
			goto err2;

		if (mode == FLOW_MODE_HASH)
			perturb_period = f->perturb_period;
		if (tb[TCA_FLOW_PERTURB]) {
			if (mode != FLOW_MODE_HASH)
				goto err2;
			perturb_period = nla_get_u32(tb[TCA_FLOW_PERTURB]) * HZ;
		}
	} else {
		/* creation of a new filter */
		err = -EINVAL;
		if (!handle)
			goto err2;
		if (!tb[TCA_FLOW_KEYS])
			goto err2;

		mode = FLOW_MODE_MAP;
		if (tb[TCA_FLOW_MODE])
			mode = nla_get_u32(tb[TCA_FLOW_MODE]);
		if (mode != FLOW_MODE_HASH && nkeys > 1)
			goto err2;

		if (tb[TCA_FLOW_PERTURB]) {
			if (mode != FLOW_MODE_HASH)
				goto err2;
			perturb_period = nla_get_u32(tb[TCA_FLOW_PERTURB]) * HZ;
		}

		/* default the major part to the qdisc handle, the minor to 1 */
		if (TC_H_MAJ(baseclass) == 0)
			baseclass = TC_H_MAKE(tp->q->handle, baseclass);
		if (TC_H_MIN(baseclass) == 0)
			baseclass = TC_H_MAKE(baseclass, 1);

		err = -ENOBUFS;
		f = kzalloc(sizeof(*f), GFP_KERNEL);
		if (f == NULL)
			goto err2;

		f->handle = handle;
		f->mask	  = ~0U;

		get_random_bytes(&f->hashrnd, 4);
		f->perturb_timer.function = flow_perturbation;
		f->perturb_timer.data = (unsigned long)f;
		init_timer_deferrable(&f->perturb_timer);
	}

	tcf_exts_change(tp, &f->exts, &e);
	tcf_em_tree_change(tp, &f->ematches, &t);

	tcf_tree_lock(tp);

	if (tb[TCA_FLOW_KEYS]) {
		f->keymask = keymask;
		f->nkeys   = nkeys;
	}

	f->mode = mode;

	if (tb[TCA_FLOW_MASK])
		f->mask = nla_get_u32(tb[TCA_FLOW_MASK]);
	if (tb[TCA_FLOW_XOR])
		f->xor = nla_get_u32(tb[TCA_FLOW_XOR]);
	if (tb[TCA_FLOW_RSHIFT])
		f->rshift = nla_get_u32(tb[TCA_FLOW_RSHIFT]);
	if (tb[TCA_FLOW_ADDEND])
		f->addend = nla_get_u32(tb[TCA_FLOW_ADDEND]);

	if (tb[TCA_FLOW_DIVISOR])
		f->divisor = nla_get_u32(tb[TCA_FLOW_DIVISOR]);
	if (baseclass)
		f->baseclass = baseclass;

	f->perturb_period = perturb_period;
	del_timer(&f->perturb_timer);
	if (perturb_period)
		mod_timer(&f->perturb_timer, jiffies + perturb_period);

	if (*arg == 0)
		list_add_tail(&f->list, &head->filters);

	tcf_tree_unlock(tp);

	*arg = (unsigned long)f;
	return 0;

err2:
	tcf_em_tree_destroy(tp, &t);
err1:
	tcf_exts_destroy(tp, &e);
	return err;
}
/*
 * Release one filter: stop its perturbation timer, destroy its extensions
 * and ematch tree, then free the filter itself.  The caller is expected
 * to have unlinked the filter from the list already (flow_delete() and
 * flow_destroy() both call list_del() first).
 */
static void flow_destroy_filter(struct tcf_proto *tp, struct flow_filter *f)
{
/* the timer callback dereferences f, so it is stopped before the free */
del_timer_sync(&f->perturb_timer);
tcf_exts_destroy(tp, &f->exts);
tcf_em_tree_destroy(tp, &f->ematches);
kfree(f);
}
/*
 * Delete the filter passed in arg: unlink it from the filter list under
 * the tree lock, then destroy it.  Always returns 0.
 */
static int flow_delete(struct tcf_proto *tp, unsigned long arg)
{
struct flow_filter *f = (struct flow_filter *)arg;
tcf_tree_lock(tp);
list_del(&f->list);
tcf_tree_unlock(tp);
flow_destroy_filter(tp, f);
return 0;
}
/*
 * Allocate the per-instance root object with an empty filter list and
 * store it in tp->root.  Returns 0, or -ENOBUFS when allocation fails.
 */
static int flow_init(struct tcf_proto *tp)
{
	struct flow_head *root = kzalloc(sizeof(*root), GFP_KERNEL);

	if (!root)
		return -ENOBUFS;

	INIT_LIST_HEAD(&root->filters);
	tp->root = root;
	return 0;
}
static void flow_destroy(struct tcf_proto *tp)
{
struct flow_head *head = tp->root;
struct flow_filter *f, *next;
list_for_each_entry_safe(f, next, &head->filters, list) {
list_del(&f->list);
flow_destroy_filter(tp, f);
}
kfree(head);
}
/*
 * Look up a filter by handle.  Returns the filter pointer cast to
 * unsigned long, or 0 when no filter carries that handle.
 */
static unsigned long flow_get(struct tcf_proto *tp, u32 handle)
{
	struct flow_head *root = tp->root;
	struct flow_filter *cur;
	unsigned long found = 0;

	list_for_each_entry(cur, &root->filters, list) {
		if (cur->handle == handle) {
			found = (unsigned long)cur;
			break;
		}
	}

	return found;
}
/* Counterpart of flow_get(); nothing needs to be released here. */
static void flow_put(struct tcf_proto *tp, unsigned long f)
{
}
/*
 * flow_dump - emit the configuration of one filter as netlink attributes.
 *
 * Fills t->tcm_handle and a nested TCA_OPTIONS attribute.  Keys and mode
 * are always dumped; mask and xor are dumped together when either differs
 * from its default; the remaining values are dumped only when non-zero.
 *
 * Returns skb->len on success (also when fh is 0), or -1 after trimming
 * the message back to the start of the nest on failure.
 */
static int flow_dump(struct tcf_proto *tp, unsigned long fh,
struct sk_buff *skb, struct tcmsg *t)
{
struct flow_filter *f = (struct flow_filter *)fh;
struct nlattr *nest;
if (f == NULL)
return skb->len;
t->tcm_handle = f->handle;
nest = nla_nest_start(skb, TCA_OPTIONS);
if (nest == NULL)
goto nla_put_failure;
/*
 * NOTE(review): the NLA_PUT_U32() macro is defined outside this file;
 * presumably it jumps to nla_put_failure when the attribute does not fit.
 */
NLA_PUT_U32(skb, TCA_FLOW_KEYS, f->keymask);
NLA_PUT_U32(skb, TCA_FLOW_MODE, f->mode);
if (f->mask != ~0 || f->xor != 0) {
NLA_PUT_U32(skb, TCA_FLOW_MASK, f->mask);
NLA_PUT_U32(skb, TCA_FLOW_XOR, f->xor);
}
if (f->rshift)
NLA_PUT_U32(skb, TCA_FLOW_RSHIFT, f->rshift);
if (f->addend)
NLA_PUT_U32(skb, TCA_FLOW_ADDEND, f->addend);
if (f->divisor)
NLA_PUT_U32(skb, TCA_FLOW_DIVISOR, f->divisor);
if (f->baseclass)
NLA_PUT_U32(skb, TCA_FLOW_BASECLASS, f->baseclass);
/* perturb_period is kept in jiffies; report it in seconds */
if (f->perturb_period)
NLA_PUT_U32(skb, TCA_FLOW_PERTURB, f->perturb_period / HZ);
if (tcf_exts_dump(skb, &f->exts, &flow_ext_map) < 0)
goto nla_put_failure;
#ifdef CONFIG_NET_EMATCH
if (f->ematches.hdr.nmatches &&
tcf_em_tree_dump(skb, &f->ematches, TCA_FLOW_EMATCHES) < 0)
goto nla_put_failure;
#endif
nla_nest_end(skb, nest);
if (tcf_exts_dump_stats(skb, &f->exts, &flow_ext_map) < 0)
goto nla_put_failure;
return skb->len;
nla_put_failure:
nlmsg_trim(skb, nest);
return -1;
}
/*
 * Iterate over all filters, invoking arg->fn on each one once arg->skip
 * entries have been passed.  arg->count is advanced for every filter that
 * is skipped or successfully visited; a negative return from arg->fn sets
 * arg->stop and ends the walk.
 */
static void flow_walk(struct tcf_proto *tp, struct tcf_walker *arg)
{
	struct flow_head *root = tp->root;
	struct flow_filter *cur;

	list_for_each_entry(cur, &root->filters, list) {
		if (arg->count >= arg->skip &&
		    arg->fn(tp, (unsigned long)cur, arg) < 0) {
			arg->stop = 1;
			break;
		}
		arg->count++;
	}
}
/* Classifier operations registered under the kind name "flow". */
static struct tcf_proto_ops cls_flow_ops __read_mostly = {
.kind = "flow",
.classify = flow_classify,
.init = flow_init,
.destroy = flow_destroy,
.change = flow_change,
.delete = flow_delete,
.get = flow_get,
.put = flow_put,
.dump = flow_dump,
.walk = flow_walk,
.owner = THIS_MODULE,
};
/* Module entry point: register the "flow" classifier operations. */
static int __init cls_flow_init(void)
{
	int err = register_tcf_proto_ops(&cls_flow_ops);

	return err;
}
/* Module exit point: unregister the "flow" classifier operations. */
static void __exit cls_flow_exit(void)
{
	unregister_tcf_proto_ops(&cls_flow_ops);
}
module_init(cls_flow_init);
module_exit(cls_flow_exit);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
MODULE_DESCRIPTION("TC flow classifier");

400
kernel/net/sched/cls_fw.c Normal file
View File

@@ -0,0 +1,400 @@
/*
* net/sched/cls_fw.c Classifier mapping ipchains' fwmark to traffic class.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
*
* Changes:
* Karlis Peisenieks <karlis@mt.lv> : 990415 : fw_walk off by one
* Karlis Peisenieks <karlis@mt.lv> : 990415 : fw_delete killed all the filter (and kernel).
* Alex <alex@pilotsoft.com> : 2004xxyy: Added Action extension
*
* JHS: We should remove the CONFIG_NET_CLS_IND from here
* eventually when the meta match extension is made available
*
*/
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <net/netlink.h>
#include <net/act_api.h>
#include <net/pkt_cls.h>
/* Number of hash buckets in fw_head: as many filter pointers as fit in one page. */
#define HTSIZE (PAGE_SIZE/sizeof(struct fw_filter *))
/*
 * Root object stored in tcf_proto->root once the first filter is created
 * (see fw_change()).
 */
struct fw_head
{
struct fw_filter *ht[HTSIZE]; /* chains of filters, indexed by fw_hash() */
u32 mask; /* ANDed with skb->mark before lookup in fw_classify() */
};
/* One fw filter, chained in a bucket of fw_head.ht. */
struct fw_filter
{
struct fw_filter *next; /* next filter in the same hash bucket */
u32 id; /* handle; compared with the masked skb->mark */
struct tcf_result res; /* classification result copied to the caller */
#ifdef CONFIG_NET_CLS_IND
char indev[IFNAMSIZ]; /* device name checked by tcf_match_indev() */
#endif /* CONFIG_NET_CLS_IND */
struct tcf_exts exts; /* extensions executed via tcf_exts_exec() */
};
/*
 * Netlink attribute ids used for this classifier's action and police
 * extensions; passed to tcf_exts_validate() and tcf_exts_dump*().
 */
static const struct tcf_ext_map fw_ext_map = {
.action = TCA_FW_ACT,
.police = TCA_FW_POLICE
};
/*
 * Map a 32-bit handle to a bucket index in fw_head.ht.  For the table
 * sizes 4096, 2048, 1024, 512 and 256 the handle is split into
 * bucket-index-sized pieces that are XORed together; any other size simply
 * masks the handle with HTSIZE - 1.
 */
static __inline__ int fw_hash(u32 handle)
{
	switch (HTSIZE) {
	case 4096:
		return ((handle >> 24) & 0xFFF) ^
		       ((handle >> 12) & 0xFFF) ^
		       (handle & 0xFFF);
	case 2048:
		return ((handle >> 22) & 0x7FF) ^
		       ((handle >> 11) & 0x7FF) ^
		       (handle & 0x7FF);
	case 1024:
		return ((handle >> 20) & 0x3FF) ^
		       ((handle >> 10) & 0x3FF) ^
		       (handle & 0x3FF);
	case 512:
		return (handle >> 27) ^
		       ((handle >> 18) & 0x1FF) ^
		       ((handle >> 9) & 0x1FF) ^
		       (handle & 0x1FF);
	case 256: {
		u8 *bytes = (u8 *) &handle;

		return bytes[0] ^ bytes[1] ^ bytes[2] ^ bytes[3];
	}
	default:
		return handle & (HTSIZE - 1);
	}
}
/*
 * fw_classify - classify skb by its mark.
 *
 * Without a root object ("old method") the mark itself is used as class
 * id, provided it is non-zero and its major part is either zero or equal
 * to the qdisc handle's major part.
 *
 * Otherwise the mark is masked with head->mask and the matching hash
 * chain is searched for filters with that id.  A candidate's result is
 * copied to *res, the optional input-device check is applied and the
 * extensions are executed; a negative extension result moves on to the
 * next candidate.
 *
 * Returns the extension result (or 0 for the old method) on a match,
 * -1 otherwise.
 */
static int fw_classify(struct sk_buff *skb, struct tcf_proto *tp,
		       struct tcf_result *res)
{
	struct fw_head *head = (struct fw_head*)tp->root;
	struct fw_filter *f;
	u32 id = skb->mark;
	int r;

	if (head == NULL) {
		/* old method */
		if (id == 0)
			return -1;
		if (TC_H_MAJ(id) != 0 && TC_H_MAJ(id^tp->q->handle))
			return -1;

		res->classid = id;
		res->class = 0;
		return 0;
	}

	id &= head->mask;

	for (f = head->ht[fw_hash(id)]; f; f = f->next) {
		if (f->id != id)
			continue;

		*res = f->res;
#ifdef CONFIG_NET_CLS_IND
		if (!tcf_match_indev(skb, f->indev))
			continue;
#endif /* CONFIG_NET_CLS_IND */
		r = tcf_exts_exec(skb, &f->exts, res);
		if (r >= 0)
			return r;
	}

	return -1;
}
/*
 * Look up a filter by handle.  Returns the filter pointer cast to
 * unsigned long, or 0 when there is no root object or no such filter.
 */
static unsigned long fw_get(struct tcf_proto *tp, u32 handle)
{
	struct fw_head *head = (struct fw_head*)tp->root;
	struct fw_filter *cur;

	if (head == NULL)
		return 0;

	cur = head->ht[fw_hash(handle)];
	while (cur != NULL && cur->id != handle)
		cur = cur->next;

	return (unsigned long)cur;
}
/* Counterpart of fw_get(); nothing needs to be released here. */
static void fw_put(struct tcf_proto *tp, unsigned long f)
{
	return;
}
/*
 * Instance initialisation: nothing to set up here, the root object is
 * allocated in fw_change() when the first filter is created.  Always
 * returns 0.
 */
static int fw_init(struct tcf_proto *tp)
{
	const int ok = 0;

	return ok;
}
/*
 * Release one filter: unbind its result, destroy its extensions and free
 * it.  The callers in this file unlink the filter from its hash chain
 * before calling this.
 */
static inline void
fw_delete_filter(struct tcf_proto *tp, struct fw_filter *f)
{
tcf_unbind_filter(tp, &f->res);
tcf_exts_destroy(tp, &f->exts);
kfree(f);
}
/*
 * Tear down the classifier instance: empty every hash chain, deleting the
 * filters one by one, then free the root object.  Does nothing when no
 * root object was ever allocated.
 */
static void fw_destroy(struct tcf_proto *tp)
{
	struct fw_head *head = tp->root;
	struct fw_filter *f, *next;
	int h;

	if (head == NULL)
		return;

	for (h=0; h<HTSIZE; h++) {
		for (f = head->ht[h]; f != NULL; f = next) {
			next = f->next;
			/* unlink from the chain head before freeing */
			head->ht[h] = next;
			fw_delete_filter(tp, f);
		}
	}

	kfree(head);
}
/*
 * Delete the filter passed in arg: find it in its hash chain, unlink it
 * under the tree lock and release it.  Returns 0 on success, -EINVAL when
 * there is no root object, no filter, or the filter is not on its chain.
 */
static int fw_delete(struct tcf_proto *tp, unsigned long arg)
{
	struct fw_head *head = (struct fw_head*)tp->root;
	struct fw_filter *f = (struct fw_filter*)arg;
	struct fw_filter **link;

	if (head == NULL || f == NULL)
		return -EINVAL;

	link = &head->ht[fw_hash(f->id)];
	while (*link != NULL) {
		if (*link != f) {
			link = &(*link)->next;
			continue;
		}

		tcf_tree_lock(tp);
		*link = f->next;
		tcf_tree_unlock(tp);
		fw_delete_filter(tp, f);
		return 0;
	}

	return -EINVAL;
}
/*
 * Netlink attribute policy used by fw_change() when parsing the nested
 * TCA_OPTIONS attribute.
 */
static const struct nla_policy fw_policy[TCA_FW_MAX + 1] = {
[TCA_FW_CLASSID] = { .type = NLA_U32 },
[TCA_FW_INDEV] = { .type = NLA_STRING, .len = IFNAMSIZ },
[TCA_FW_MASK] = { .type = NLA_U32 },
};
/*
 * fw_change_attrs - apply parsed attributes to filter f.
 *
 * Validates the extensions, then applies TCA_FW_CLASSID (binding the
 * filter), TCA_FW_INDEV (when CONFIG_NET_CLS_IND is set) and checks that
 * TCA_FW_MASK, when given, equals the mask already stored in the root
 * object; when no mask is given the stored mask must be 0xFFFFFFFF.
 *
 * Returns 0 on success or a negative errno.  Note that the class id is
 * bound before the later checks run, so it remains applied when one of
 * them fails.
 */
static int
fw_change_attrs(struct tcf_proto *tp, struct fw_filter *f,
struct nlattr **tb, struct nlattr **tca, unsigned long base)
{
struct fw_head *head = (struct fw_head *)tp->root;
struct tcf_exts e;
u32 mask;
int err;
err = tcf_exts_validate(tp, tb, tca[TCA_RATE], &e, &fw_ext_map);
if (err < 0)
return err;
err = -EINVAL;
if (tb[TCA_FW_CLASSID]) {
f->res.classid = nla_get_u32(tb[TCA_FW_CLASSID]);
tcf_bind_filter(tp, &f->res, base);
}
#ifdef CONFIG_NET_CLS_IND
if (tb[TCA_FW_INDEV]) {
err = tcf_change_indev(tp, f->indev, tb[TCA_FW_INDEV]);
if (err < 0)
goto errout;
}
#endif /* CONFIG_NET_CLS_IND */
/* the mask is per root object and cannot be changed per filter */
if (tb[TCA_FW_MASK]) {
mask = nla_get_u32(tb[TCA_FW_MASK]);
if (mask != head->mask)
goto errout;
} else if (head->mask != 0xFFFFFFFF)
goto errout;
tcf_exts_change(tp, &f->exts, &e);
return 0;
errout:
tcf_exts_destroy(tp, &e);
return err;
}
/*
 * fw_change - create a new fw filter or update an existing one.
 *
 * Without TCA_OPTIONS the call succeeds only for handle 0.  An existing
 * filter (*arg != 0) is updated in place through fw_change_attrs().  For
 * a new filter a non-zero handle is required; the root object is
 * allocated on first use, taking its mask from TCA_FW_MASK (default
 * 0xFFFFFFFF), and the new filter is inserted at the head of its hash
 * chain.
 *
 * Returns 0 on success (with *arg set to the new filter on creation) or a
 * negative errno.
 */
static int fw_change(struct tcf_proto *tp, unsigned long base,
u32 handle,
struct nlattr **tca,
unsigned long *arg)
{
struct fw_head *head = (struct fw_head*)tp->root;
struct fw_filter *f = (struct fw_filter *) *arg;
struct nlattr *opt = tca[TCA_OPTIONS];
struct nlattr *tb[TCA_FW_MAX + 1];
int err;
if (!opt)
return handle ? -EINVAL : 0;
err = nla_parse_nested(tb, TCA_FW_MAX, opt, fw_policy);
if (err < 0)
return err;
if (f != NULL) {
if (f->id != handle && handle)
return -EINVAL;
return fw_change_attrs(tp, f, tb, tca, base);
}
if (!handle)
return -EINVAL;
if (head == NULL) {
u32 mask = 0xFFFFFFFF;
if (tb[TCA_FW_MASK])
mask = nla_get_u32(tb[TCA_FW_MASK]);
head = kzalloc(sizeof(struct fw_head), GFP_KERNEL);
if (head == NULL)
return -ENOBUFS;
head->mask = mask;
tcf_tree_lock(tp);
tp->root = head;
tcf_tree_unlock(tp);
}
f = kzalloc(sizeof(struct fw_filter), GFP_KERNEL);
if (f == NULL)
return -ENOBUFS;
f->id = handle;
err = fw_change_attrs(tp, f, tb, tca, base);
if (err < 0)
goto errout;
/* link in front of the chain; only the head pointer store is locked */
f->next = head->ht[fw_hash(handle)];
tcf_tree_lock(tp);
head->ht[fw_hash(handle)] = f;
tcf_tree_unlock(tp);
*arg = (unsigned long)f;
return 0;
errout:
kfree(f);
return err;
}
/*
 * Iterate over all filters in bucket order, invoking arg->fn on each one
 * once arg->skip entries have been passed.  A missing root object sets
 * arg->stop; a negative return from arg->fn sets arg->stop and ends the
 * walk.  arg->count is advanced for every skipped or visited filter.
 */
static void fw_walk(struct tcf_proto *tp, struct tcf_walker *arg)
{
	struct fw_head *head = (struct fw_head*)tp->root;
	struct fw_filter *cur;
	int h;

	if (head == NULL)
		arg->stop = 1;

	if (arg->stop)
		return;

	for (h = 0; h < HTSIZE; h++) {
		for (cur = head->ht[h]; cur; cur = cur->next) {
			if (arg->count >= arg->skip &&
			    arg->fn(tp, (unsigned long)cur, arg) < 0) {
				arg->stop = 1;
				return;
			}
			arg->count++;
		}
	}
}
/*
 * fw_dump - emit the configuration of one filter as netlink attributes.
 *
 * Fills t->tcm_handle.  When the filter has neither a class id nor any
 * extensions nothing else is emitted.  Otherwise a nested TCA_OPTIONS
 * attribute carries the class id, the input device name (when set and
 * CONFIG_NET_CLS_IND is enabled), the mask (when it is not 0xFFFFFFFF)
 * and the extensions.
 *
 * Returns skb->len on success (also when fh is 0), or -1 after trimming
 * the message back to its length on entry.
 */
static int fw_dump(struct tcf_proto *tp, unsigned long fh,
struct sk_buff *skb, struct tcmsg *t)
{
struct fw_head *head = (struct fw_head *)tp->root;
struct fw_filter *f = (struct fw_filter*)fh;
/* remember the tail so a failed dump can be rolled back */
unsigned char *b = skb_tail_pointer(skb);
struct nlattr *nest;
if (f == NULL)
return skb->len;
t->tcm_handle = f->id;
if (!f->res.classid && !tcf_exts_is_available(&f->exts))
return skb->len;
nest = nla_nest_start(skb, TCA_OPTIONS);
if (nest == NULL)
goto nla_put_failure;
/*
 * NOTE(review): the NLA_PUT_*() macros are defined outside this file;
 * presumably they jump to nla_put_failure when the attribute does not fit.
 */
if (f->res.classid)
NLA_PUT_U32(skb, TCA_FW_CLASSID, f->res.classid);
#ifdef CONFIG_NET_CLS_IND
if (strlen(f->indev))
NLA_PUT_STRING(skb, TCA_FW_INDEV, f->indev);
#endif /* CONFIG_NET_CLS_IND */
if (head->mask != 0xFFFFFFFF)
NLA_PUT_U32(skb, TCA_FW_MASK, head->mask);
if (tcf_exts_dump(skb, &f->exts, &fw_ext_map) < 0)
goto nla_put_failure;
nla_nest_end(skb, nest);
if (tcf_exts_dump_stats(skb, &f->exts, &fw_ext_map) < 0)
goto nla_put_failure;
return skb->len;
nla_put_failure:
nlmsg_trim(skb, b);
return -1;
}
/* Classifier operations registered under the kind name "fw". */
static struct tcf_proto_ops cls_fw_ops __read_mostly = {
.kind = "fw",
.classify = fw_classify,
.init = fw_init,
.destroy = fw_destroy,
.get = fw_get,
.put = fw_put,
.change = fw_change,
.delete = fw_delete,
.walk = fw_walk,
.dump = fw_dump,
.owner = THIS_MODULE,
};
/* Module entry point: register the "fw" classifier operations. */
static int __init init_fw(void)
{
	int err = register_tcf_proto_ops(&cls_fw_ops);

	return err;
}
/* Module exit point: unregister the "fw" classifier operations. */
static void __exit exit_fw(void)
{
	unregister_tcf_proto_ops(&cls_fw_ops);
}
module_init(init_fw)
module_exit(exit_fw)
MODULE_LICENSE("GPL");

View File

@@ -0,0 +1,620 @@
/*
* net/sched/cls_route.c ROUTE4 classifier.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
*/
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <net/dst.h>
#include <net/route.h>
#include <net/netlink.h>
#include <net/act_api.h>
#include <net/pkt_cls.h>
/*
1. For now we assume that route tags are < 256.
This allows direct table lookups to be used instead of hash tables.
2. For now we assume that "from TAG" and "fromdev DEV" statements
are mutually exclusive.
3. "to TAG from ANY" has higher priority than "to ANY from XXX".
*/
/*
 * One slot of the lookup cache in route4_head.  Written by
 * route4_set_fastmap() and consulted at the top of route4_classify().
 */
struct route4_fastmap
{
struct route4_filter *filter; /* cached filter, ROUTE4_FAILURE, or NULL when unused */
u32 id; /* route class id the slot was filled for */
int iif; /* input interface the slot was filled for */
};
/*
 * Root object stored in tcf_proto->root once the first filter is created
 * (see route4_change()).
 */
struct route4_head
{
struct route4_fastmap fastmap[16]; /* lookup cache, indexed by route4_fastmap_hash() */
struct route4_bucket *table[256+1]; /* buckets indexed by to_hash()/route4_hash_to(); slot 256 is tried last by route4_classify() */
};
/*
 * Second-level table: filter chains indexed by from_hash() on insertion
 * and by route4_hash_from()/route4_hash_iif()/route4_hash_wild() on
 * lookup.
 */
struct route4_bucket
{
/* 16 FROM buckets + 16 IIF buckets + 1 wildcard bucket */
struct route4_filter *ht[16+16+1];
};
/* One route filter, chained in a slot of route4_bucket.ht. */
struct route4_filter
{
struct route4_filter *next; /* next filter in the same chain */
u32 id; /* "to" tag in the low bits, "from" tag shifted left by 16 */
int iif; /* input interface for "fromif" filters */
struct tcf_result res; /* classification result copied to the caller */
struct tcf_exts exts; /* extensions executed via tcf_exts_exec() */
u32 handle; /* handle computed in route4_set_parms() */
struct route4_bucket *bkt; /* bucket this filter belongs to */
};
/* Sentinel stored in a fastmap slot to cache a failed lookup. */
#define ROUTE4_FAILURE ((struct route4_filter*)(-1L))
/*
 * Netlink attribute ids used for this classifier's police and action
 * extensions; passed to tcf_exts_validate() and tcf_exts_dump*().
 */
static const struct tcf_ext_map route_ext_map = {
.police = TCA_ROUTE4_POLICE,
.action = TCA_ROUTE4_ACT
};
/*
 * Select the fastmap slot for a lookup: the low four bits of the route
 * class id.  The input interface does not take part in the hash.
 */
static __inline__ int route4_fastmap_hash(u32 id, int iif)
{
	const u32 slot = id & 0xF;

	return slot;
}
/*
 * Invalidate the whole lookup cache.  The clear is done with the qdisc
 * root lock held and bottom halves disabled.  The id argument is not
 * used: every slot is cleared regardless.
 */
static inline
void route4_reset_fastmap(struct Qdisc *q, struct route4_head *head, u32 id)
{
spinlock_t *root_lock = qdisc_root_sleeping_lock(q);
spin_lock_bh(root_lock);
memset(head->fastmap, 0, sizeof(head->fastmap));
spin_unlock_bh(root_lock);
}
/*
 * Record the outcome of a lookup for (id, iif) in the fastmap slot chosen
 * by route4_fastmap_hash().  f is either a filter or ROUTE4_FAILURE.
 */
static inline void
route4_set_fastmap(struct route4_head *head, u32 id, int iif,
		   struct route4_filter *f)
{
	struct route4_fastmap *slot;

	slot = &head->fastmap[route4_fastmap_hash(id, iif)];
	slot->id = id;
	slot->iif = iif;
	slot->filter = f;
}
/* First-level table index for a lookup: the low eight bits of the id. */
static __inline__ int route4_hash_to(u32 id)
{
	const u32 to_tag = id & 0xFF;

	return to_tag;
}
/*
 * Bucket slot for a "from" lookup: the low four bits of the upper half
 * of the id.
 */
static __inline__ int route4_hash_from(u32 id)
{
	const u32 from_tag = id >> 16;

	return from_tag & 0xF;
}
/*
 * Bucket slot for an input-interface lookup: one of the 16 IIF slots
 * (16..31), selected by the low four bits of the interface index.
 *
 * This must agree with from_hash(), which files "fromif" filters under
 * 16 + (iif & 0xF) when they are inserted.  The previous form hashed
 * (iif >> 16) & 0xF and therefore looked in slot 16 for almost every
 * interface index, missing filters stored in any other IIF slot.
 */
static __inline__ int route4_hash_iif(int iif)
{
	return 16 + (iif & 0xF);
}
/* Bucket slot of the wildcard chain: the last entry of route4_bucket.ht. */
static __inline__ int route4_hash_wild(void)
{
	enum { wildcard_slot = 32 };

	return wildcard_slot;
}
/*
 * ROUTE4_APPLY_RESULT() - accept filter f inside route4_classify().
 *
 * Relies on the caller's locals (res, f, skb, head, id, iif, dont_cache)
 * and must be used as the body of a loop over candidate filters:
 *   - copies f->res to *res;
 *   - with extensions: runs them and returns their result, or, if that is
 *     negative, disables caching and "continue"s with the next candidate;
 *   - without extensions: caches the filter in the fastmap (unless caching
 *     was disabled) and returns 0.
 */
#define ROUTE4_APPLY_RESULT() \
{ \
*res = f->res; \
if (tcf_exts_is_available(&f->exts)) { \
int r = tcf_exts_exec(skb, &f->exts, res); \
if (r < 0) { \
dont_cache = 1; \
continue; \
} \
return r; \
} else if (!dont_cache) \
route4_set_fastmap(head, id, iif, f); \
return 0; \
}
/*
 * route4_classify - classify skb by the tclassid of its dst entry.
 *
 * Packets without a dst entry are not classified.  Without a root object
 * ("old method") the tclassid itself is used as class id, provided it is
 * non-zero and its major part is zero or equal to the qdisc handle's.
 *
 * Otherwise the fastmap is consulted first.  On a miss the bucket for the
 * "to" tag is searched in the order FROM chain, IIF chain, wildcard
 * chain; if nothing accepts the packet the search is repeated once with
 * table slot 256 and the low 16 bits of id cleared.  A complete miss is
 * cached as ROUTE4_FAILURE unless caching was disabled along the way.
 *
 * Returns the classification result (>= 0) or -1 when nothing matched.
 */
static int route4_classify(struct sk_buff *skb, struct tcf_proto *tp,
struct tcf_result *res)
{
struct route4_head *head = (struct route4_head*)tp->root;
struct dst_entry *dst;
struct route4_bucket *b;
struct route4_filter *f;
u32 id, h;
int iif, dont_cache = 0;
if ((dst = skb_dst(skb)) == NULL)
goto failure;
id = dst->tclassid;
if (head == NULL)
goto old_method;
iif = ((struct rtable*)dst)->fl.iif;
/* fast path: a cached hit returns the filter's result without running extensions */
h = route4_fastmap_hash(id, iif);
if (id == head->fastmap[h].id &&
iif == head->fastmap[h].iif &&
(f = head->fastmap[h].filter) != NULL) {
if (f == ROUTE4_FAILURE)
goto failure;
*res = f->res;
return 0;
}
h = route4_hash_to(id);
restart:
if ((b = head->table[h]) != NULL) {
/* each ROUTE4_APPLY_RESULT() may return or continue the loop */
for (f = b->ht[route4_hash_from(id)]; f; f = f->next)
if (f->id == id)
ROUTE4_APPLY_RESULT();
for (f = b->ht[route4_hash_iif(iif)]; f; f = f->next)
if (f->iif == iif)
ROUTE4_APPLY_RESULT();
for (f = b->ht[route4_hash_wild()]; f; f = f->next)
ROUTE4_APPLY_RESULT();
}
/* second pass: table slot 256 with the low 16 bits of id cleared */
if (h < 256) {
h = 256;
id &= ~0xFFFF;
goto restart;
}
if (!dont_cache)
route4_set_fastmap(head, id, iif, ROUTE4_FAILURE);
failure:
return -1;
old_method:
if (id && (TC_H_MAJ(id) == 0 ||
!(TC_H_MAJ(id^tp->q->handle)))) {
res->classid = id;
res->class = 0;
return 0;
}
return -1;
}
/*
 * First-level table index derived from a filter handle: the low eight
 * bits, plus 256 when bit 0x8000 is set.
 */
static inline u32 to_hash(u32 id)
{
	const u32 offset = (id & 0x8000) ? 256 : 0;

	return (id & 0xFF) + offset;
}
/*
 * Bucket slot derived from the upper half of a filter handle (only the low
 * 16 bits of id are considered):
 *   0xFFFF              -> 32 (wildcard slot)
 *   bit 0x8000 set      -> 16 + low four bits (IIF slots)
 *   value above 255     -> 256 (out of range; rejected by route4_get())
 *   anything else       -> low four bits (FROM slots)
 */
static inline u32 from_hash(u32 id)
{
	const u32 key = id & 0xFFFF;

	if (key == 0xFFFF)
		return 32;

	if (key & 0x8000)
		return 16 + (key & 0xF);

	return key > 255 ? 256 : (key & 0xF);
}
/*
 * Look up a filter by handle.  Returns the filter pointer cast to
 * unsigned long, or 0 when there is no root object, the handle maps
 * outside the tables, or no filter carries that handle.
 */
static unsigned long route4_get(struct tcf_proto *tp, u32 handle)
{
	struct route4_head *head = (struct route4_head*)tp->root;
	struct route4_bucket *bucket;
	struct route4_filter *cur;
	unsigned row, slot;

	if (!head)
		return 0;

	row = to_hash(handle);
	if (row > 256)
		return 0;

	slot = from_hash(handle>>16);
	if (slot > 32)
		return 0;

	bucket = head->table[row];
	if (bucket == NULL)
		return 0;

	for (cur = bucket->ht[slot]; cur; cur = cur->next) {
		if (cur->handle == handle)
			return (unsigned long)cur;
	}

	return 0;
}
/* Counterpart of route4_get(); nothing needs to be released here. */
static void route4_put(struct tcf_proto *tp, unsigned long f)
{
	return;
}
/*
 * Instance initialisation: nothing to set up here, the root object is
 * allocated in route4_change() when the first filter is created.  Always
 * returns 0.
 */
static int route4_init(struct tcf_proto *tp)
{
	const int ok = 0;

	return ok;
}
/*
 * Release one filter: unbind its result, destroy its extensions and free
 * it.  The callers in this file unlink the filter from its chain before
 * calling this.
 */
static inline void
route4_delete_filter(struct tcf_proto *tp, struct route4_filter *f)
{
tcf_unbind_filter(tp, &f->res);
tcf_exts_destroy(tp, &f->exts);
kfree(f);
}
/*
 * Tear down the classifier instance: for every bucket, empty each filter
 * chain, free the bucket, and finally free the root object.  Does nothing
 * when no root object was ever allocated.
 */
static void route4_destroy(struct tcf_proto *tp)
{
	struct route4_head *head = tp->root;
	int h1, h2;

	if (head == NULL)
		return;

	for (h1=0; h1<=256; h1++) {
		struct route4_bucket *bucket = head->table[h1];

		if (bucket == NULL)
			continue;

		for (h2=0; h2<=32; h2++) {
			struct route4_filter *cur, *next;

			for (cur = bucket->ht[h2]; cur != NULL; cur = next) {
				next = cur->next;
				/* unlink from the chain head before freeing */
				bucket->ht[h2] = next;
				route4_delete_filter(tp, cur);
			}
		}
		kfree(bucket);
	}

	kfree(head);
}
/*
 * route4_delete - remove the filter passed in arg.
 *
 * The filter is unlinked from its chain under the tree lock, the fastmap
 * is reset and the filter is released.  If its bucket has no filters left
 * afterwards the bucket is removed from the table and freed as well.
 *
 * Returns -EINVAL when there is no root object or no filter, 0 otherwise
 * (including when the filter was not found on its chain).
 */
static int route4_delete(struct tcf_proto *tp, unsigned long arg)
{
struct route4_head *head = (struct route4_head*)tp->root;
struct route4_filter **fp, *f = (struct route4_filter*)arg;
unsigned h = 0;
struct route4_bucket *b;
int i;
if (!head || !f)
return -EINVAL;
/* keep handle and bucket in locals: f is freed before they are last used */
h = f->handle;
b = f->bkt;
for (fp = &b->ht[from_hash(h>>16)]; *fp; fp = &(*fp)->next) {
if (*fp == f) {
tcf_tree_lock(tp);
*fp = f->next;
tcf_tree_unlock(tp);
route4_reset_fastmap(tp->q, head, f->id);
route4_delete_filter(tp, f);
/* Strip tree */
for (i=0; i<=32; i++)
if (b->ht[i])
return 0;
/* OK, session has no flows */
tcf_tree_lock(tp);
head->table[to_hash(h)] = NULL;
tcf_tree_unlock(tp);
kfree(b);
return 0;
}
}
return 0;
}
/*
 * Netlink attribute policy used by route4_change() when parsing the
 * nested TCA_OPTIONS attribute.
 */
static const struct nla_policy route4_policy[TCA_ROUTE4_MAX + 1] = {
[TCA_ROUTE4_CLASSID] = { .type = NLA_U32 },
[TCA_ROUTE4_TO] = { .type = NLA_U32 },
[TCA_ROUTE4_FROM] = { .type = NLA_U32 },
[TCA_ROUTE4_IIF] = { .type = NLA_U32 },
};
/*
 * route4_set_parms - validate attributes and apply them to filter f.
 *
 * Builds the new handle from the attributes:
 *   low 16 bits:  the "to" tag (at most 0xFF), or 0x8000 when absent;
 *   high 16 bits: the "from" tag (at most 0xFF), or the input interface
 *                 (at most 0x7FFF) with bit 0x8000 set, or 0xFFFF when
 *                 neither is given.  "from" and "iif" are mutually
 *                 exclusive.
 * For a new filter with an explicit handle, the computed handle (plus
 * bits 0x7F00 of the requested one) must equal the requested handle.
 *
 * The target bucket is allocated when missing.  On success id, iif,
 * handle and bkt of f are updated, the class id is bound when given, and
 * the validated extensions are installed.
 *
 * Returns 0 on success or a negative errno.
 */
static int route4_set_parms(struct tcf_proto *tp, unsigned long base,
struct route4_filter *f, u32 handle, struct route4_head *head,
struct nlattr **tb, struct nlattr *est, int new)
{
int err;
u32 id = 0, to = 0, nhandle = 0x8000;
struct route4_filter *fp;
unsigned int h1;
struct route4_bucket *b;
struct tcf_exts e;
err = tcf_exts_validate(tp, tb, est, &e, &route_ext_map);
if (err < 0)
return err;
err = -EINVAL;
if (tb[TCA_ROUTE4_TO]) {
if (new && handle & 0x8000)
goto errout;
to = nla_get_u32(tb[TCA_ROUTE4_TO]);
if (to > 0xFF)
goto errout;
nhandle = to;
}
if (tb[TCA_ROUTE4_FROM]) {
if (tb[TCA_ROUTE4_IIF])
goto errout;
id = nla_get_u32(tb[TCA_ROUTE4_FROM]);
if (id > 0xFF)
goto errout;
nhandle |= id << 16;
} else if (tb[TCA_ROUTE4_IIF]) {
id = nla_get_u32(tb[TCA_ROUTE4_IIF]);
if (id > 0x7FFF)
goto errout;
nhandle |= (id | 0x8000) << 16;
} else
nhandle |= 0xFFFF << 16;
if (handle && new) {
nhandle |= handle & 0x7F00;
if (nhandle != handle)
goto errout;
}
h1 = to_hash(nhandle);
if ((b = head->table[h1]) == NULL) {
err = -ENOBUFS;
b = kzalloc(sizeof(struct route4_bucket), GFP_KERNEL);
if (b == NULL)
goto errout;
tcf_tree_lock(tp);
head->table[h1] = b;
tcf_tree_unlock(tp);
} else {
unsigned int h2 = from_hash(nhandle >> 16);
err = -EEXIST;
/*
 * NOTE(review): this compares against f->handle (the filter's current
 * handle, 0 for a freshly allocated filter) rather than nhandle -
 * confirm that this is the intended duplicate check.
 */
for (fp = b->ht[h2]; fp; fp = fp->next)
if (fp->handle == f->handle)
goto errout;
}
tcf_tree_lock(tp);
if (tb[TCA_ROUTE4_TO])
f->id = to;
if (tb[TCA_ROUTE4_FROM])
f->id = to | id<<16;
else if (tb[TCA_ROUTE4_IIF])
f->iif = id;
f->handle = nhandle;
f->bkt = b;
tcf_tree_unlock(tp);
if (tb[TCA_ROUTE4_CLASSID]) {
f->res.classid = nla_get_u32(tb[TCA_ROUTE4_CLASSID]);
tcf_bind_filter(tp, &f->res, base);
}
tcf_exts_change(tp, &f->exts, &e);
return 0;
errout:
tcf_exts_destroy(tp, &e);
return err;
}
/*
 * route4_change - create a new route filter or update an existing one.
 *
 * Without TCA_OPTIONS the call succeeds only for handle 0.  For an
 * existing filter (*arg != 0) the parameters are re-applied and the filter
 * is re-inserted; for a new one the root object is allocated on first use
 * and a fresh filter is set up.
 *
 * Insertion keeps each chain sorted by ascending handle.  When an existing
 * filter ends up with a different handle, it is afterwards searched for
 * and unlinked in the chain selected by its old handle.  The fastmap is
 * reset on success.
 *
 * Returns 0 on success (with *arg set to the filter) or a negative errno.
 */
static int route4_change(struct tcf_proto *tp, unsigned long base,
u32 handle,
struct nlattr **tca,
unsigned long *arg)
{
struct route4_head *head = tp->root;
struct route4_filter *f, *f1, **fp;
struct route4_bucket *b;
struct nlattr *opt = tca[TCA_OPTIONS];
struct nlattr *tb[TCA_ROUTE4_MAX + 1];
unsigned int h, th;
u32 old_handle = 0;
int err;
if (opt == NULL)
return handle ? -EINVAL : 0;
err = nla_parse_nested(tb, TCA_ROUTE4_MAX, opt, route4_policy);
if (err < 0)
return err;
if ((f = (struct route4_filter*)*arg) != NULL) {
if (f->handle != handle && handle)
return -EINVAL;
/* remember where the filter currently lives so it can be unlinked below */
if (f->bkt)
old_handle = f->handle;
err = route4_set_parms(tp, base, f, handle, head, tb,
tca[TCA_RATE], 0);
if (err < 0)
return err;
goto reinsert;
}
err = -ENOBUFS;
if (head == NULL) {
head = kzalloc(sizeof(struct route4_head), GFP_KERNEL);
if (head == NULL)
goto errout;
tcf_tree_lock(tp);
tp->root = head;
tcf_tree_unlock(tp);
}
f = kzalloc(sizeof(struct route4_filter), GFP_KERNEL);
if (f == NULL)
goto errout;
err = route4_set_parms(tp, base, f, handle, head, tb,
tca[TCA_RATE], 1);
if (err < 0)
goto errout;
reinsert:
h = from_hash(f->handle >> 16);
/* find the first entry with a larger handle; f is linked in front of it */
for (fp = &f->bkt->ht[h]; (f1=*fp) != NULL; fp = &f1->next)
if (f->handle < f1->handle)
break;
f->next = f1;
tcf_tree_lock(tp);
*fp = f;
if (old_handle && f->handle != old_handle) {
th = to_hash(old_handle);
h = from_hash(old_handle >> 16);
if ((b = head->table[th]) != NULL) {
for (fp = &b->ht[h]; *fp; fp = &(*fp)->next) {
if (*fp == f) {
*fp = f->next;
break;
}
}
}
}
tcf_tree_unlock(tp);
route4_reset_fastmap(tp->q, head, f->id);
*arg = (unsigned long)f;
return 0;
errout:
/* f is NULL here when the root allocation failed; kfree() accepts that */
kfree(f);
return err;
}
/*
 * Iterate over all filters in table/bucket order, invoking arg->fn on
 * each one once arg->skip entries have been passed.  A missing root
 * object sets arg->stop; a negative return from arg->fn sets arg->stop
 * and ends the walk.  arg->count is advanced for every skipped or visited
 * filter.
 */
static void route4_walk(struct tcf_proto *tp, struct tcf_walker *arg)
{
	struct route4_head *head = tp->root;
	unsigned row, slot;

	if (head == NULL)
		arg->stop = 1;

	if (arg->stop)
		return;

	for (row = 0; row <= 256; row++) {
		struct route4_bucket *bucket = head->table[row];
		struct route4_filter *cur;

		if (!bucket)
			continue;

		for (slot = 0; slot <= 32; slot++) {
			for (cur = bucket->ht[slot]; cur; cur = cur->next) {
				if (arg->count >= arg->skip &&
				    arg->fn(tp, (unsigned long)cur, arg) < 0) {
					arg->stop = 1;
					return;
				}
				arg->count++;
			}
		}
	}
}
/*
 * route4_dump - emit the configuration of one filter as netlink
 * attributes.
 *
 * Fills t->tcm_handle and a nested TCA_OPTIONS attribute.  The handle
 * bits decide what is reported: TCA_ROUTE4_TO unless bit 0x8000 is set;
 * TCA_ROUTE4_IIF when bit 0x80000000 is set and the upper half is not
 * 0xFFFF; TCA_ROUTE4_FROM when bit 0x80000000 is clear.  The class id is
 * reported when non-zero, followed by the extensions.
 *
 * Returns skb->len on success (also when fh is 0), or -1 after trimming
 * the message back to its length on entry.
 */
static int route4_dump(struct tcf_proto *tp, unsigned long fh,
struct sk_buff *skb, struct tcmsg *t)
{
struct route4_filter *f = (struct route4_filter*)fh;
/* remember the tail so a failed dump can be rolled back */
unsigned char *b = skb_tail_pointer(skb);
struct nlattr *nest;
u32 id;
if (f == NULL)
return skb->len;
t->tcm_handle = f->handle;
nest = nla_nest_start(skb, TCA_OPTIONS);
if (nest == NULL)
goto nla_put_failure;
/*
 * NOTE(review): the NLA_PUT_U32() macro is defined outside this file;
 * presumably it jumps to nla_put_failure when the attribute does not fit.
 */
if (!(f->handle&0x8000)) {
id = f->id&0xFF;
NLA_PUT_U32(skb, TCA_ROUTE4_TO, id);
}
if (f->handle&0x80000000) {
if ((f->handle>>16) != 0xFFFF)
NLA_PUT_U32(skb, TCA_ROUTE4_IIF, f->iif);
} else {
id = f->id>>16;
NLA_PUT_U32(skb, TCA_ROUTE4_FROM, id);
}
if (f->res.classid)
NLA_PUT_U32(skb, TCA_ROUTE4_CLASSID, f->res.classid);
if (tcf_exts_dump(skb, &f->exts, &route_ext_map) < 0)
goto nla_put_failure;
nla_nest_end(skb, nest);
if (tcf_exts_dump_stats(skb, &f->exts, &route_ext_map) < 0)
goto nla_put_failure;
return skb->len;
nla_put_failure:
nlmsg_trim(skb, b);
return -1;
}
/* Classifier operations registered under the kind name "route". */
static struct tcf_proto_ops cls_route4_ops __read_mostly = {
.kind = "route",
.classify = route4_classify,
.init = route4_init,
.destroy = route4_destroy,
.get = route4_get,
.put = route4_put,
.change = route4_change,
.delete = route4_delete,
.walk = route4_walk,
.dump = route4_dump,
.owner = THIS_MODULE,
};
/* Module entry point: register the "route" classifier operations. */
static int __init init_route4(void)
{
	int err = register_tcf_proto_ops(&cls_route4_ops);

	return err;
}
/* Module exit point: unregister the "route" classifier operations. */
static void __exit exit_route4(void)
{
unregister_tcf_proto_ops(&cls_route4_ops);
}
module_init(init_route4)
module_exit(exit_route4)
MODULE_LICENSE("GPL");

View File

@@ -0,0 +1,28 @@
/*
* net/sched/cls_rsvp.c Special RSVP packet classifier for IPv4.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
*/
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <net/ip.h>
#include <net/netlink.h>
#include <net/act_api.h>
#include <net/pkt_cls.h>
/*
 * Parameters consumed by the cls_rsvp.h template included below.
 */
/* Number of 32-bit words per address; the template takes its IPv6 code
 * paths only when this is 4, so 1 selects the IPv4 variant. */
#define RSVP_DST_LEN 1
/* Classifier kind string placed in the ops table's .kind field. */
#define RSVP_ID "rsvp"
/* Identifier of the struct tcf_proto_ops object the template defines. */
#define RSVP_OPS cls_rsvp_ops
#include "cls_rsvp.h"
MODULE_LICENSE("GPL");

660
kernel/net/sched/cls_rsvp.h Normal file
View File

@@ -0,0 +1,660 @@
/*
* net/sched/cls_rsvp.h Template file for RSVPv[46] classifiers.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
*/
/*
Compared to the general packet classification problem,
RSVP needs only several relatively simple rules:
* (dst, protocol) are always specified,
so that we are able to hash them.
* src may be exact, or may be wildcard, so that
we can keep a hash table plus one wildcard entry.
* source port (or flow label) is important only if src is given.
IMPLEMENTATION.
We use a two level hash table: The top level is keyed by
destination address and protocol ID, every bucket contains a list
of "rsvp sessions", identified by destination address, protocol and
DPI(="Destination Port ID"): triple (key, mask, offset).
Every bucket has a smaller hash table keyed by source address
(cf. RSVP flowspec) and one wildcard entry for wildcard reservations.
Every bucket is again a list of "RSVP flows", selected by
source address and SPI(="Source Port ID" here rather than
"security parameter index"): triple (key, mask, offset).
NOTE 1. All the packets with IPv6 extension headers (but AH and ESP)
and all fragmented packets go to the best-effort traffic class.
NOTE 2. Two "port id"'s seems to be redundant, rfc2207 requires
only one "Generalized Port Identifier". So that for classic
ah, esp (and udp,tcp) both *pi should coincide or one of them
should be wildcard.
At first sight, this redundancy is just a waste of CPU
resources. But DPI and SPI add the possibility to assign different
priorities to GPIs. Look also at note 4 about tunnels below.
NOTE 3. One complication is the case of tunneled packets.
We implement it as follows: if the first lookup
matches a special session with "tunnelhdr" value not zero,
flowid doesn't contain the true flow ID, but the tunnel ID (1...255).
In this case, we pull tunnelhdr bytes and restart lookup
with tunnel ID added to the list of keys. Simple and stupid 8)8)
It's enough for PIMREG and IPIP.
NOTE 4. Two GPIs make it possible to parse even GRE packets.
F.e. DPI can select ETH_P_IP (and necessary flags to make
tunnelhdr correct) in GRE protocol field and SPI matches
GRE key. Is it not nice? 8)8)
Well, as result, despite its simplicity, we get a pretty
powerful classification engine. */
/* Root state of one rsvp classifier instance (stored in tp->root). */
struct rsvp_head
{
/* Bitmap with one bit per 8-bit tunnel id; tested and set by tunnel_bts(). */
u32 tmap[256/32];
/* Counter advanced in steps of 0x10000 by gen_handle() to build handles. */
u32 hgenerator;
/* Rolling tunnel-id cursor used by gen_tunnel() and tunnel_bts(). */
u8 tgenerator;
/* Session hash table, indexed by hash_dst() (values 0..255). */
struct rsvp_session *ht[256];
};
/*
 * One RSVP session: the (destination, protocol, DPI, tunnel id) part of the
 * match, plus the per-source filter lists hanging off it.
 */
struct rsvp_session
{
/* Next session in the same rsvp_head.ht[] bucket. */
struct rsvp_session *next;
/* Destination address, RSVP_DST_LEN 32-bit words. */
__be32 dst[RSVP_DST_LEN];
/* Destination port id: (key, mask, offset) compared against the transport header. */
struct tc_rsvp_gpi dpi;
u8 protocol;
u8 tunnelid;
/* 16 (src,sport) hash slots, and one wildcard source slot */
struct rsvp_filter *ht[16+1];
};
/* One RSVP flow (filter) belonging to a session. */
struct rsvp_filter
{
/* Next filter in the same rsvp_session.ht[] slot. */
struct rsvp_filter *next;
/* Source address, RSVP_DST_LEN 32-bit words (unset for wildcard filters). */
__be32 src[RSVP_DST_LEN];
/* Source port id: (key, mask, offset) compared against the transport header. */
struct tc_rsvp_gpi spi;
/* Non-zero marks a tunnel filter; rsvp_classify() then restarts the
 * lookup with res.classid as tunnel id. */
u8 tunnelhdr;
struct tcf_result res;
struct tcf_exts exts;
/* Filter handle: low byte is the session bucket, next byte the source slot. */
u32 handle;
/* Back pointer to the owning session. */
struct rsvp_session *sess;
};
/*
 * Session bucket index (0..255): fold the last 32-bit word of the
 * destination address down to one byte and mix in protocol and tunnel id.
 */
static __inline__ unsigned hash_dst(__be32 *dst, u8 protocol, u8 tunnelid)
{
	unsigned folded = (__force __u32)dst[RSVP_DST_LEN-1];

	folded ^= folded >> 16;
	folded ^= folded >> 8;
	folded ^= protocol;
	folded ^= tunnelid;

	return folded & 0xFF;
}
/*
 * Source slot index (0..15): fold the last 32-bit word of the source
 * address down to four bits.
 */
static __inline__ unsigned hash_src(__be32 *src)
{
	unsigned folded = (__force __u32)src[RSVP_DST_LEN-1];

	folded ^= folded >> 16;
	folded ^= folded >> 8;
	folded ^= folded >> 4;

	return folded & 0xF;
}
/* Netlink attribute ids used for this classifier's police/action extensions. */
static struct tcf_ext_map rsvp_ext_map = {
.police = TCA_RSVP_POLICE,
.action = TCA_RSVP_ACT
};
/*
 * Run the extensions of the current filter `f` on `skb`.
 *
 * Must be expanded inside a loop over filters in a function that has `skb`,
 * `f` and `res` in scope: a negative result executes `continue` (try the
 * next filter), a positive result is returned from the enclosing function,
 * and zero falls through to the code after the macro.
 */
#define RSVP_APPLY_RESULT() \
{ \
int r = tcf_exts_exec(skb, &f->exts, res); \
if (r < 0) \
continue; \
else if (r > 0) \
return r; \
}
/*
 * Classify a packet against the RSVP session/filter tables.
 *
 * Returns 0 with *res filled in on a match, a positive value if the
 * filter's extensions returned one, and -1 when nothing matches (or, in the
 * IPv4 build, when the packet is a fragment).
 *
 * NOTE(review): the key comparisons read 32 bits at xprt + offset without
 * any visible length check against the skb — confirm callers guarantee this.
 */
static int rsvp_classify(struct sk_buff *skb, struct tcf_proto *tp,
struct tcf_result *res)
{
struct rsvp_session **sht = ((struct rsvp_head*)tp->root)->ht;
struct rsvp_session *s;
struct rsvp_filter *f;
unsigned h1, h2;
__be32 *dst, *src;
u8 protocol;
u8 tunnelid = 0;
u8 *xprt;
#if RSVP_DST_LEN == 4
struct ipv6hdr *nhptr = ipv6_hdr(skb);
#else
struct iphdr *nhptr = ip_hdr(skb);
#endif
/* Re-entered with a new nhptr/tunnelid after a tunnel filter matched. */
restart:
#if RSVP_DST_LEN == 4
src = &nhptr->saddr.s6_addr32[0];
dst = &nhptr->daddr.s6_addr32[0];
protocol = nhptr->nexthdr;
xprt = ((u8*)nhptr) + sizeof(struct ipv6hdr);
#else
src = &nhptr->saddr;
dst = &nhptr->daddr;
protocol = nhptr->protocol;
xprt = ((u8*)nhptr) + (nhptr->ihl<<2);
/* Fragments are never classified. */
if (nhptr->frag_off & htons(IP_MF|IP_OFFSET))
return -1;
#endif
h1 = hash_dst(dst, protocol, tunnelid);
h2 = hash_src(src);
for (s = sht[h1]; s; s = s->next) {
/* Session match: destination, protocol, masked DPI key and tunnel id. */
if (dst[RSVP_DST_LEN-1] == s->dst[RSVP_DST_LEN-1] &&
protocol == s->protocol &&
!(s->dpi.mask & (*(u32*)(xprt+s->dpi.offset)^s->dpi.key))
#if RSVP_DST_LEN == 4
&& dst[0] == s->dst[0]
&& dst[1] == s->dst[1]
&& dst[2] == s->dst[2]
#endif
&& tunnelid == s->tunnelid) {
/* Exact-source filters in the slot selected by hash_src(). */
for (f = s->ht[h2]; f; f = f->next) {
if (src[RSVP_DST_LEN-1] == f->src[RSVP_DST_LEN-1] &&
!(f->spi.mask & (*(u32*)(xprt+f->spi.offset)^f->spi.key))
#if RSVP_DST_LEN == 4
&& src[0] == f->src[0]
&& src[1] == f->src[1]
&& src[2] == f->src[2]
#endif
) {
*res = f->res;
/* May `continue` to the next filter or return a positive verdict. */
RSVP_APPLY_RESULT();
/* Also reached by `goto matched` from the wildcard loop below. */
matched:
if (f->tunnelhdr == 0)
return 0;
/* Tunnel filter: classid becomes the tunnel id and the lookup
 * restarts on the header derived from tunnelhdr. */
tunnelid = f->res.classid;
nhptr = (void*)(xprt + f->tunnelhdr - sizeof(*nhptr));
goto restart;
}
}
/* And wildcard bucket... */
for (f = s->ht[16]; f; f = f->next) {
*res = f->res;
RSVP_APPLY_RESULT();
goto matched;
}
/* The session matched but none of its filters did. */
return -1;
}
}
return -1;
}
/*
 * Look up a filter by handle.
 *
 * The low byte of the handle selects the session bucket and the next byte
 * the source slot (valid values 0..16).  Returns the filter pointer cast to
 * unsigned long, or 0 when no filter carries that handle.
 */
static unsigned long rsvp_get(struct tcf_proto *tp, u32 handle)
{
	struct rsvp_head *head = tp->root;
	unsigned dst_slot = handle & 0xFF;
	unsigned src_slot = (handle >> 8) & 0xFF;
	struct rsvp_session *sess;

	if (src_slot > 16)
		return 0;

	for (sess = head->ht[dst_slot]; sess != NULL; sess = sess->next) {
		struct rsvp_filter *filt = sess->ht[src_slot];

		while (filt != NULL) {
			if (filt->handle == handle)
				return (unsigned long)filt;
			filt = filt->next;
		}
	}

	return 0;
}
/* Counterpart of rsvp_get(); rsvp_get() takes no reference, so this is a no-op. */
static void rsvp_put(struct tcf_proto *tp, unsigned long f)
{
}
static int rsvp_init(struct tcf_proto *tp)
{
struct rsvp_head *data;
data = kzalloc(sizeof(struct rsvp_head), GFP_KERNEL);
if (data) {
tp->root = data;
return 0;
}
return -ENOBUFS;
}
/*
 * Release one filter: unbind its class, destroy its extensions and free it.
 * The filter is not unlinked here; callers remove it from its list first.
 */
static inline void
rsvp_delete_filter(struct tcf_proto *tp, struct rsvp_filter *f)
{
tcf_unbind_filter(tp, &f->res);
tcf_exts_destroy(tp, &f->exts);
kfree(f);
}
/*
 * Tear down a classifier instance: detach the root from tp, then free every
 * filter, every session and finally the root itself.  Does nothing if the
 * root was already NULL.
 */
static void rsvp_destroy(struct tcf_proto *tp)
{
	struct rsvp_head *head = xchg(&tp->root, NULL);
	int bucket;

	if (!head)
		return;

	for (bucket = 0; bucket < 256; bucket++) {
		/* Pop sessions off the bucket one at a time. */
		while (head->ht[bucket] != NULL) {
			struct rsvp_session *sess = head->ht[bucket];
			int slot;

			head->ht[bucket] = sess->next;

			for (slot = 0; slot <= 16; slot++) {
				while (sess->ht[slot] != NULL) {
					struct rsvp_filter *filt = sess->ht[slot];

					sess->ht[slot] = filt->next;
					rsvp_delete_filter(tp, filt);
				}
			}
			kfree(sess);
		}
	}

	kfree(head);
}
static int rsvp_delete(struct tcf_proto *tp, unsigned long arg)
{
struct rsvp_filter **fp, *f = (struct rsvp_filter*)arg;
unsigned h = f->handle;
struct rsvp_session **sp;
struct rsvp_session *s = f->sess;
int i;
for (fp = &s->ht[(h>>8)&0xFF]; *fp; fp = &(*fp)->next) {
if (*fp == f) {
tcf_tree_lock(tp);
*fp = f->next;
tcf_tree_unlock(tp);
rsvp_delete_filter(tp, f);
/* Strip tree */
for (i=0; i<=16; i++)
if (s->ht[i])
return 0;
/* OK, session has no flows */
for (sp = &((struct rsvp_head*)tp->root)->ht[h&0xFF];
*sp; sp = &(*sp)->next) {
if (*sp == s) {
tcf_tree_lock(tp);
*sp = s->next;
tcf_tree_unlock(tp);
kfree(s);
return 0;
}
}
return 0;
}
}
return 0;
}
/*
 * Produce an unused filter handle whose low 16 bits equal `salt`.
 *
 * The upper 16 bits come from head->hgenerator, which is advanced by
 * 0x10000 per attempt and never left at zero.  Up to 0xFFFF candidates are
 * tried; returns 0 if all of them are already in use.
 */
static unsigned gen_handle(struct tcf_proto *tp, unsigned salt)
{
	struct rsvp_head *head = tp->root;
	int tries;

	for (tries = 0xFFFF; tries > 0; tries--) {
		u32 candidate;

		head->hgenerator += 0x10000;
		if (head->hgenerator == 0)
			head->hgenerator = 0x10000;

		candidate = head->hgenerator | salt;
		if (rsvp_get(tp, candidate) == 0)
			return candidate;
	}

	return 0;
}
/*
 * Bit test-and-set on the tunnel id bitmap for the id currently held in
 * data->tgenerator.  Returns 1 if the bit was clear (and is now set),
 * 0 if it was already set.
 */
static int tunnel_bts(struct rsvp_head *data)
{
	u32 *word = &data->tmap[data->tgenerator >> 5];
	u32 bit = 1 << (data->tgenerator & 0x1F);

	if (*word & bit)
		return 0;

	*word |= bit;
	return 1;
}
/*
 * Rebuild the tunnel id bitmap from the filters that actually exist.
 *
 * Every filter with a non-zero tunnelhdr contributes the low 8 bits of its
 * classid as an in-use tunnel id.  The new bitmap is assembled in a local
 * copy and then installed over data->tmap in one memcpy.
 *
 * The in-use ids must be marked in the local copy, not in data->tmap:
 * marking data->tmap and then copying an all-zero local map over it would
 * leave every id looking free and let gen_tunnel() hand out ids that are
 * still referenced by existing filters.
 *
 * As before, data->tgenerator is left holding the last tunnel id seen.
 */
static void tunnel_recycle(struct rsvp_head *data)
{
	struct rsvp_session **sht = data->ht;
	u32 tmap[256/32];
	int h1, h2;

	memset(tmap, 0, sizeof(tmap));

	for (h1 = 0; h1 < 256; h1++) {
		struct rsvp_session *s;

		for (s = sht[h1]; s; s = s->next) {
			for (h2 = 0; h2 <= 16; h2++) {
				struct rsvp_filter *f;

				for (f = s->ht[h2]; f; f = f->next) {
					if (f->tunnelhdr == 0)
						continue;
					/* Truncates classid to the 8-bit id space. */
					data->tgenerator = f->res.classid;
					tmap[data->tgenerator >> 5] |=
						1 << (data->tgenerator & 0x1F);
				}
			}
		}
	}

	memcpy(data->tmap, tmap, sizeof(tmap));
}
/*
 * Allocate a tunnel id in 1..255.
 *
 * Makes two passes of 255 attempts each, stepping data->tgenerator (and
 * skipping 0) until tunnel_bts() claims a free bit.  After each exhausted
 * pass the bitmap is rebuilt by tunnel_recycle().  Returns 0 when no id
 * could be claimed.
 */
static u32 gen_tunnel(struct rsvp_head *data)
{
	int pass;

	for (pass = 0; pass < 2; pass++) {
		int tries = 255;

		while (tries-- > 0) {
			data->tgenerator++;
			if (data->tgenerator == 0)
				data->tgenerator = 1;

			if (tunnel_bts(data))
				return data->tgenerator;
		}
		tunnel_recycle(data);
	}

	return 0;
}
/*
 * Validation policy for the attributes nested inside TCA_OPTIONS.
 *
 * TCA_RSVP_DST and TCA_RSVP_SRC intentionally carry no .type: rsvp_change()
 * hashes and copies RSVP_DST_LEN 32-bit words out of them unconditionally,
 * so .len has to be enforced as a MINIMUM payload length (as it already is
 * for TCA_RSVP_PINFO).  Declaring them NLA_BINARY would turn .len into an
 * upper bound only and let a short attribute through, causing reads past
 * the end of the attribute payload.
 */
static const struct nla_policy rsvp_policy[TCA_RSVP_MAX + 1] = {
	[TCA_RSVP_CLASSID]	= { .type = NLA_U32 },
	[TCA_RSVP_DST]		= { .len = RSVP_DST_LEN * sizeof(u32) },
	[TCA_RSVP_SRC]		= { .len = RSVP_DST_LEN * sizeof(u32) },
	[TCA_RSVP_PINFO]	= { .len = sizeof(struct tc_rsvp_pinfo) },
};
/*
 * Create a new RSVP filter or update an existing one.
 *
 * @tp:     classifier instance; tp->root is the struct rsvp_head.
 * @base:   passed through to tcf_bind_filter().
 * @handle: handle requested by the caller; must be 0 when creating.
 * @tca:    top-level attribute table, indexed directly by attribute type.
 * @arg:    in: existing filter cookie (0 to create); out: the new filter.
 *
 * Returns 0 on success or a negative errno.  With no TCA_OPTIONS the call
 * succeeds only if @handle is 0.
 *
 * The attribute tables are indexed by the attribute type itself (no "-1"),
 * matching rsvp_policy and the attribute ids emitted by rsvp_dump().
 */
static int rsvp_change(struct tcf_proto *tp, unsigned long base,
		       u32 handle,
		       struct nlattr **tca,
		       unsigned long *arg)
{
	struct rsvp_head *data = tp->root;
	struct rsvp_filter *f, **fp;
	struct rsvp_session *s, **sp;
	struct tc_rsvp_pinfo *pinfo = NULL;
	struct nlattr *opt = tca[TCA_OPTIONS];
	struct nlattr *tb[TCA_RSVP_MAX + 1];
	struct tcf_exts e;
	unsigned h1, h2;
	__be32 *dst;
	int err;

	if (opt == NULL)
		return handle ? -EINVAL : 0;

	err = nla_parse_nested(tb, TCA_RSVP_MAX, opt, rsvp_policy);
	if (err < 0)
		return err;

	err = tcf_exts_validate(tp, tb, tca[TCA_RATE], &e, &rsvp_ext_map);
	if (err < 0)
		return err;

	if ((f = (struct rsvp_filter*)*arg) != NULL) {
		/* Node exists: adjust only classid */

		/* NOTE(review): err is still 0 here, so a handle mismatch
		 * is reported as success — confirm whether that is intended. */
		if (f->handle != handle && handle)
			goto errout2;
		if (tb[TCA_RSVP_CLASSID]) {
			f->res.classid = nla_get_u32(tb[TCA_RSVP_CLASSID]);
			tcf_bind_filter(tp, &f->res, base);
		}

		tcf_exts_change(tp, &f->exts, &e);
		return 0;
	}

	/* Now more serious part... */
	err = -EINVAL;
	if (handle)
		goto errout2;
	if (tb[TCA_RSVP_DST] == NULL)
		goto errout2;

	err = -ENOBUFS;
	f = kzalloc(sizeof(struct rsvp_filter), GFP_KERNEL);
	if (f == NULL)
		goto errout2;

	/* Slot 16 is the wildcard-source slot, used when no SRC is given. */
	h2 = 16;
	if (tb[TCA_RSVP_SRC]) {
		memcpy(f->src, nla_data(tb[TCA_RSVP_SRC]), sizeof(f->src));
		h2 = hash_src(f->src);
	}
	if (tb[TCA_RSVP_PINFO]) {
		pinfo = nla_data(tb[TCA_RSVP_PINFO]);
		f->spi = pinfo->spi;
		f->tunnelhdr = pinfo->tunnelhdr;
	}
	if (tb[TCA_RSVP_CLASSID])
		f->res.classid = nla_get_u32(tb[TCA_RSVP_CLASSID]);

	dst = nla_data(tb[TCA_RSVP_DST]);
	h1 = hash_dst(dst, pinfo ? pinfo->protocol : 0, pinfo ? pinfo->tunnelid : 0);

	err = -ENOMEM;
	if ((f->handle = gen_handle(tp, h1 | (h2<<8))) == 0)
		goto errout;

	if (f->tunnelhdr) {
		/* Tunnel filters use classid as an 8-bit tunnel id. */
		err = -EINVAL;
		if (f->res.classid > 255)
			goto errout;

		err = -ENOMEM;
		if (f->res.classid == 0 &&
		    (f->res.classid = gen_tunnel(data)) == 0)
			goto errout;
	}

	for (sp = &data->ht[h1]; (s=*sp) != NULL; sp = &s->next) {
		if (dst[RSVP_DST_LEN-1] == s->dst[RSVP_DST_LEN-1] &&
		    pinfo && pinfo->protocol == s->protocol &&
		    memcmp(&pinfo->dpi, &s->dpi, sizeof(s->dpi)) == 0
#if RSVP_DST_LEN == 4
		    && dst[0] == s->dst[0]
		    && dst[1] == s->dst[1]
		    && dst[2] == s->dst[2]
#endif
		    && pinfo->tunnelid == s->tunnelid) {

			/* Also entered from below once a new session exists. */
insert:
			/* OK, we found appropriate session */

			f->sess = s;
			if (f->tunnelhdr == 0)
				tcf_bind_filter(tp, &f->res, base);

			tcf_exts_change(tp, &f->exts, &e);

			/* Insert before the first entry whose spi mask does
			 * not cover every bit of the new filter's mask. */
			for (fp = &s->ht[h2]; *fp; fp = &(*fp)->next)
				if (((*fp)->spi.mask&f->spi.mask) != f->spi.mask)
					break;
			f->next = *fp;
			/* Publish f->next before the filter becomes reachable. */
			wmb();
			*fp = f;

			*arg = (unsigned long)f;
			return 0;
		}
	}

	/* No session found. Create new one. */

	err = -ENOBUFS;
	s = kzalloc(sizeof(struct rsvp_session), GFP_KERNEL);
	if (s == NULL)
		goto errout;
	memcpy(s->dst, dst, sizeof(s->dst));

	if (pinfo) {
		s->dpi = pinfo->dpi;
		s->protocol = pinfo->protocol;
		s->tunnelid = pinfo->tunnelid;
	}
	/* Same mask-ordered insertion as for filters, keyed on dpi.mask. */
	for (sp = &data->ht[h1]; *sp; sp = &(*sp)->next) {
		if (((*sp)->dpi.mask&s->dpi.mask) != s->dpi.mask)
			break;
	}
	s->next = *sp;
	wmb();
	*sp = s;

	goto insert;

errout:
	kfree(f);
errout2:
	tcf_exts_destroy(tp, &e);
	return err;
}
static void rsvp_walk(struct tcf_proto *tp, struct tcf_walker *arg)
{
struct rsvp_head *head = tp->root;
unsigned h, h1;
if (arg->stop)
return;
for (h = 0; h < 256; h++) {
struct rsvp_session *s;
for (s = head->ht[h]; s; s = s->next) {
for (h1 = 0; h1 <= 16; h1++) {
struct rsvp_filter *f;
for (f = s->ht[h1]; f; f = f->next) {
if (arg->count < arg->skip) {
arg->count++;
continue;
}
if (arg->fn(tp, (unsigned long)f, arg) < 0) {
arg->stop = 1;
return;
}
arg->count++;
}
}
}
}
}
/*
 * Serialise one RSVP filter (and the session fields it depends on) into a
 * netlink message.
 *
 * Returns skb->len on success (also when fh is 0, in which case nothing is
 * written) and -1 after trimming the skb back to its initial tail when an
 * attribute cannot be added.
 */
static int rsvp_dump(struct tcf_proto *tp, unsigned long fh,
		     struct sk_buff *skb, struct tcmsg *t)
{
	struct rsvp_filter *filt = (struct rsvp_filter *)fh;
	unsigned char *rollback = skb_tail_pointer(skb);
	struct rsvp_session *sess;
	struct tc_rsvp_pinfo pinfo;
	struct nlattr *opts;

	if (!filt)
		return skb->len;

	sess = filt->sess;
	t->tcm_handle = filt->handle;

	opts = nla_nest_start(skb, TCA_OPTIONS);
	if (!opts)
		goto nla_put_failure;

	NLA_PUT(skb, TCA_RSVP_DST, sizeof(sess->dst), &sess->dst);

	/* Session-level and filter-level fields are merged into one pinfo. */
	pinfo.protocol = sess->protocol;
	pinfo.tunnelid = sess->tunnelid;
	pinfo.tunnelhdr = filt->tunnelhdr;
	pinfo.pad = 0;
	pinfo.dpi = sess->dpi;
	pinfo.spi = filt->spi;
	NLA_PUT(skb, TCA_RSVP_PINFO, sizeof(pinfo), &pinfo);

	if (filt->res.classid)
		NLA_PUT_U32(skb, TCA_RSVP_CLASSID, filt->res.classid);

	/* Source slot 16 is the wildcard slot: such filters have no SRC. */
	if (((filt->handle >> 8) & 0xFF) != 16)
		NLA_PUT(skb, TCA_RSVP_SRC, sizeof(filt->src), filt->src);

	if (tcf_exts_dump(skb, &filt->exts, &rsvp_ext_map) < 0)
		goto nla_put_failure;

	nla_nest_end(skb, opts);

	if (tcf_exts_dump_stats(skb, &filt->exts, &rsvp_ext_map) < 0)
		goto nla_put_failure;

	return skb->len;

nla_put_failure:
	nlmsg_trim(skb, rollback);
	return -1;
}
/*
 * Operations table for this instantiation of the template.  RSVP_OPS and
 * RSVP_ID are macros supplied by the file that includes this header.
 */
static struct tcf_proto_ops RSVP_OPS = {
.next = NULL,
.kind = RSVP_ID,
.classify = rsvp_classify,
.init = rsvp_init,
.destroy = rsvp_destroy,
.get = rsvp_get,
.put = rsvp_put,
.change = rsvp_change,
.delete = rsvp_delete,
.walk = rsvp_walk,
.dump = rsvp_dump,
.owner = THIS_MODULE,
};
/* Module entry point: register this RSVP classifier's operations. */
static int __init init_rsvp(void)
{
	int err = register_tcf_proto_ops(&RSVP_OPS);

	return err;
}
/* Module exit point: unregister this RSVP classifier's operations. */
static void __exit exit_rsvp(void)
{
unregister_tcf_proto_ops(&RSVP_OPS);
}
module_init(init_rsvp)
module_exit(exit_rsvp)

View File

@@ -0,0 +1,28 @@
/*
* net/sched/cls_rsvp6.c Special RSVP packet classifier for IPv6.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
*/
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/ipv6.h>
#include <linux/skbuff.h>
#include <net/act_api.h>
#include <net/pkt_cls.h>
#include <net/netlink.h>
/*
 * Parameters consumed by the cls_rsvp.h template included below.
 */
/* Number of 32-bit words per address; 4 enables the template's
 * `#if RSVP_DST_LEN == 4` (IPv6 header) code paths. */
#define RSVP_DST_LEN 4
/* Classifier kind string placed in the ops table's .kind field. */
#define RSVP_ID "rsvp6"
/* Identifier of the struct tcf_proto_ops object the template defines. */
#define RSVP_OPS cls_rsvp6_ops
#include "cls_rsvp.h"
MODULE_LICENSE("GPL");

View File

@@ -0,0 +1,506 @@
/*
* net/sched/cls_tcindex.c Packet classifier for skb->tc_index
*
* Written 1998,1999 by Werner Almesberger, EPFL ICA
*/
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/skbuff.h>
#include <linux/errno.h>
#include <net/act_api.h>
#include <net/netlink.h>
#include <net/pkt_cls.h>
/*
* Passing parameters to the root seems to be done more awkwardly than really
* necessary. At least, u32 doesn't seem to use such dirty hacks. To be
* verified. FIXME.
*/
/* When no hash size is given and (mask >> shift) is below this value,
 * tcindex_set_parms() sizes the table to (mask >> shift) + 1. */
#define PERFECT_HASH_THRESHOLD 64 /* use perfect hash if not bigger */
/* Hash size used otherwise, and the initial value set by tcindex_init(). */
#define DEFAULT_HASH_SIZE 64 /* optimized for diffserv */
/* Private classifier state hanging off tp->root. */
#define PRIV(tp) ((struct tcindex_data *) (tp)->root)
/*
 * Outcome attached to one key: the extensions to execute and the class
 * result copied to the caller on a match.  An entry counts as "set" when it
 * has predicative extensions or a non-zero classid (tcindex_filter_is_set).
 */
struct tcindex_filter_result {
struct tcf_exts exts;
struct tcf_result res;
};
/* Chained entry of the imperfect hash (tcindex_data.h). */
struct tcindex_filter {
/* Key this entry matches; compared for equality in tcindex_lookup(). */
u16 key;
struct tcindex_filter_result result;
/* Next entry in the same hash bucket. */
struct tcindex_filter *next;
};
/*
 * Per-instance state of the tcindex classifier.  The lookup key is
 * (skb->tc_index & mask) >> shift.
 */
struct tcindex_data {
struct tcindex_filter_result *perfect; /* perfect hash; NULL if none */
struct tcindex_filter **h; /* imperfect hash; only used if !perfect;
NULL if unused */
u16 mask; /* AND key with mask */
int shift; /* shift ANDed key to the right */
int hash; /* hash table size; 0 if undefined */
int alloc_hash; /* allocated size */
int fall_through; /* 0: only classify if explicit match */
};
/* Netlink attribute ids used for this classifier's police/action extensions. */
static const struct tcf_ext_map tcindex_ext_map = {
.police = TCA_TCINDEX_POLICE,
.action = TCA_TCINDEX_ACT
};
/*
 * Tell whether a result slot is in use: it is when it carries predicative
 * extensions or a non-zero classid.  Returns 1 or 0.
 */
static inline int
tcindex_filter_is_set(struct tcindex_filter_result *r)
{
	if (tcf_exts_is_predicative(&r->exts))
		return 1;

	return r->res.classid != 0;
}
/*
 * Find the result for `key`.
 *
 * With a perfect hash the key indexes the table directly and the slot is
 * returned only if it is set.  Otherwise bucket (key % hash) of the
 * imperfect hash is searched for an exact key match.  Returns NULL when
 * nothing is found or no table exists.
 */
static struct tcindex_filter_result *
tcindex_lookup(struct tcindex_data *p, u16 key)
{
	if (p->perfect) {
		struct tcindex_filter_result *slot = p->perfect + key;

		return tcindex_filter_is_set(slot) ? slot : NULL;
	}

	if (p->h) {
		struct tcindex_filter *node = p->h[key % p->hash];

		while (node) {
			if (node->key == key)
				return &node->result;
			node = node->next;
		}
	}

	return NULL;
}
/*
 * Classify a packet by its tc_index.
 *
 * On a table hit the stored result is copied to *res and the verdict of the
 * entry's extensions is returned.  On a miss the packet is rejected (-1)
 * unless fall_through is set, in which case a classid is built from the
 * qdisc's major number and the key, and 0 is returned.
 */
static int tcindex_classify(struct sk_buff *skb, struct tcf_proto *tp,
			    struct tcf_result *res)
{
	struct tcindex_data *p = PRIV(tp);
	struct tcindex_filter_result *hit;
	int key = (skb->tc_index & p->mask) >> p->shift;

	pr_debug("tcindex_classify(skb %p,tp %p,res %p),p %p\n",
		 skb, tp, res, p);

	hit = tcindex_lookup(p, key);
	if (hit) {
		*res = hit->res;
		pr_debug("map 0x%x\n", res->classid);
		return tcf_exts_exec(skb, &hit->exts, res);
	}

	if (!p->fall_through)
		return -1;

	res->classid = TC_H_MAKE(TC_H_MAJ(tp->q->handle), key);
	res->class = 0;
	pr_debug("alg 0x%x\n", res->classid);
	return 0;
}
/*
 * Look up the result entry for `handle`.
 *
 * Returns the entry pointer cast to unsigned long, or 0 when the handle is
 * outside the allocated perfect hash, not present, or present but unset.
 */
static unsigned long tcindex_get(struct tcf_proto *tp, u32 handle)
{
	struct tcindex_data *p = PRIV(tp);
	struct tcindex_filter_result *entry;

	pr_debug("tcindex_get(tp %p,handle 0x%08x)\n", tp, handle);

	if (p->perfect && handle >= p->alloc_hash)
		return 0;

	entry = tcindex_lookup(p, handle);
	if (!entry || !tcindex_filter_is_set(entry))
		return 0UL;

	return (unsigned long) entry;
}
/* Counterpart of tcindex_get(); only emits a debug trace. */
static void tcindex_put(struct tcf_proto *tp, unsigned long f)
{
pr_debug("tcindex_put(tp %p,f 0x%lx)\n", tp, f);
}
static int tcindex_init(struct tcf_proto *tp)
{
struct tcindex_data *p;
pr_debug("tcindex_init(tp %p)\n", tp);
p = kzalloc(sizeof(struct tcindex_data), GFP_KERNEL);
if (!p)
return -ENOMEM;
p->mask = 0xffff;
p->hash = DEFAULT_HASH_SIZE;
p->fall_through = 1;
tp->root = p;
return 0;
}
/*
 * Remove the entry whose result is `arg`.
 *
 * Perfect hash: the slot stays in the table; only its binding and
 * extensions are released.  Fails with -ENOENT if the slot has no class.
 * Imperfect hash: the owning list node is located, unlinked (under the tree
 * lock when `lock` is non-zero) and freed.  Fails with -ENOENT if no node
 * owns the result.
 */
static int
__tcindex_delete(struct tcf_proto *tp, unsigned long arg, int lock)
{
	struct tcindex_data *p = PRIV(tp);
	struct tcindex_filter_result *r = (struct tcindex_filter_result *) arg;
	struct tcindex_filter *node = NULL;

	pr_debug("tcindex_delete(tp %p,arg 0x%lx),p %p,f %p\n", tp, arg, p, node);

	if (p->perfect) {
		if (!r->res.class)
			return -ENOENT;
	} else {
		struct tcindex_filter **link = NULL;
		int bucket;

		/* Search every bucket for the node embedding this result. */
		for (bucket = 0; bucket < p->hash; bucket++) {
			link = p->h + bucket;
			while (*link && &(*link)->result != r)
				link = &(*link)->next;
			if (*link)
				break;
		}
		if (bucket >= p->hash)
			return -ENOENT;

		node = *link;
		if (lock)
			tcf_tree_lock(tp);
		*link = node->next;
		if (lock)
			tcf_tree_unlock(tp);
	}

	tcf_unbind_filter(tp, &r->res);
	tcf_exts_destroy(tp, &r->exts);
	/* node is NULL in the perfect-hash case. */
	kfree(node);
	return 0;
}
/* .delete operation: remove one entry, taking the tree lock for the unlink. */
static int tcindex_delete(struct tcf_proto *tp, unsigned long arg)
{
return __tcindex_delete(tp, arg, 1);
}
/*
 * A perfect hash is usable when the table has a slot for every possible
 * key, i.e. the largest key (mask >> shift) is below the table size.
 */
static inline int
valid_perfect_hash(struct tcindex_data *p)
{
	return (p->mask >> p->shift) < p->hash;
}
/* Validation policy for the attributes nested inside TCA_OPTIONS. */
static const struct nla_policy tcindex_policy[TCA_TCINDEX_MAX + 1] = {
[TCA_TCINDEX_HASH] = { .type = NLA_U32 },
[TCA_TCINDEX_MASK] = { .type = NLA_U16 },
[TCA_TCINDEX_SHIFT] = { .type = NLA_U32 },
[TCA_TCINDEX_FALL_THROUGH] = { .type = NLA_U32 },
[TCA_TCINDEX_CLASSID] = { .type = NLA_U32 },
};
/*
 * Apply a parsed set of attributes to the classifier and to one entry.
 *
 * Works on copies (`cp` of the parameters, `cr` of the entry's result),
 * validates them, allocates tables and nodes as needed, and only then
 * commits everything under the tree lock.
 *
 * @p:  live classifier parameters.
 * @r:  existing result entry being changed, or NULL when creating.
 * @tb: nested attributes, indexed by TCA_TCINDEX_*.
 * @est: rate estimator attribute handed to tcf_exts_validate().
 *
 * Returns 0 on success, -EBUSY when the new parameters do not fit the
 * already allocated table, -EINVAL for an out-of-range handle, -ENOMEM on
 * allocation failure, or the error from tcf_exts_validate().
 */
static int
tcindex_set_parms(struct tcf_proto *tp, unsigned long base, u32 handle,
struct tcindex_data *p, struct tcindex_filter_result *r,
struct nlattr **tb, struct nlattr *est)
{
int err, balloc = 0;
struct tcindex_filter_result new_filter_result, *old_r = r;
struct tcindex_filter_result cr;
struct tcindex_data cp;
struct tcindex_filter *f = NULL; /* make gcc behave */
struct tcf_exts e;
err = tcf_exts_validate(tp, tb, est, &e, &tcindex_ext_map);
if (err < 0)
return err;
/* Start from the current parameters / current entry (or a blank one). */
memcpy(&cp, p, sizeof(cp));
memset(&new_filter_result, 0, sizeof(new_filter_result));
if (old_r)
memcpy(&cr, r, sizeof(cr));
else
memset(&cr, 0, sizeof(cr));
if (tb[TCA_TCINDEX_HASH])
cp.hash = nla_get_u32(tb[TCA_TCINDEX_HASH]);
if (tb[TCA_TCINDEX_MASK])
cp.mask = nla_get_u16(tb[TCA_TCINDEX_MASK]);
if (tb[TCA_TCINDEX_SHIFT])
cp.shift = nla_get_u32(tb[TCA_TCINDEX_SHIFT]);
err = -EBUSY;
/* Hash already allocated, make sure that we still meet the
* requirements for the allocated hash.
*/
if (cp.perfect) {
if (!valid_perfect_hash(&cp) ||
cp.hash > cp.alloc_hash)
goto errout;
} else if (cp.h && cp.hash != cp.alloc_hash)
goto errout;
err = -EINVAL;
if (tb[TCA_TCINDEX_FALL_THROUGH])
cp.fall_through = nla_get_u32(tb[TCA_TCINDEX_FALL_THROUGH]);
if (!cp.hash) {
/* Hash not specified, use perfect hash if the upper limit
* of the hashing index is below the threshold.
*/
if ((cp.mask >> cp.shift) < PERFECT_HASH_THRESHOLD)
cp.hash = (cp.mask >> cp.shift)+1;
else
cp.hash = DEFAULT_HASH_SIZE;
}
/* No table yet: the size about to be allocated is cp.hash. */
if (!cp.perfect && !cp.h)
cp.alloc_hash = cp.hash;
/* Note: this could be as restrictive as if (handle & ~(mask >> shift))
* but then, we'd fail handles that may become valid after some future
* mask change. While this is extremely unlikely to ever matter,
* the check below is safer (and also more backwards-compatible).
*/
if (cp.perfect || valid_perfect_hash(&cp))
if (handle >= cp.alloc_hash)
goto errout;
err = -ENOMEM;
/* First use: allocate either the perfect table or the bucket array.
 * balloc records which one, so the error path can free it. */
if (!cp.perfect && !cp.h) {
if (valid_perfect_hash(&cp)) {
cp.perfect = kcalloc(cp.hash, sizeof(*r), GFP_KERNEL);
if (!cp.perfect)
goto errout;
balloc = 1;
} else {
cp.h = kcalloc(cp.hash, sizeof(f), GFP_KERNEL);
if (!cp.h)
goto errout;
balloc = 2;
}
}
/* Pick the destination entry: the perfect slot, an existing hash node,
 * or the local placeholder meaning "a new node is needed". */
if (cp.perfect)
r = cp.perfect + handle;
else
r = tcindex_lookup(&cp, handle) ? : &new_filter_result;
if (r == &new_filter_result) {
f = kzalloc(sizeof(*f), GFP_KERNEL);
if (!f)
goto errout_alloc;
}
if (tb[TCA_TCINDEX_CLASSID]) {
cr.res.classid = nla_get_u32(tb[TCA_TCINDEX_CLASSID]);
tcf_bind_filter(tp, &cr.res, base);
}
tcf_exts_change(tp, &cr.exts, &e);
/* Commit parameters and entry under the tree lock. */
tcf_tree_lock(tp);
if (old_r && old_r != r)
memset(old_r, 0, sizeof(*old_r));
memcpy(p, &cp, sizeof(cp));
memcpy(r, &cr, sizeof(cr));
if (r == &new_filter_result) {
struct tcindex_filter **fp;
f->key = handle;
f->result = new_filter_result;
f->next = NULL;
/* Append the new node at the tail of its bucket. */
for (fp = p->h+(handle % p->hash); *fp; fp = &(*fp)->next)
/* nothing */;
*fp = f;
}
tcf_tree_unlock(tp);
return 0;
errout_alloc:
if (balloc == 1)
kfree(cp.perfect);
else if (balloc == 2)
kfree(cp.h);
errout:
tcf_exts_destroy(tp, &e);
return err;
}
/*
 * .change operation: parse the nested TCA_OPTIONS attributes and hand them
 * to tcindex_set_parms().  Returns 0 when no options are supplied, the
 * parse error if parsing fails, otherwise tcindex_set_parms()'s result.
 */
static int
tcindex_change(struct tcf_proto *tp, unsigned long base, u32 handle,
	       struct nlattr **tca, unsigned long *arg)
{
	struct nlattr *opt = tca[TCA_OPTIONS];
	struct nlattr *tb[TCA_TCINDEX_MAX + 1];
	struct tcindex_data *p = PRIV(tp);
	struct tcindex_filter_result *r = (struct tcindex_filter_result *) *arg;
	int ret;

	pr_debug("tcindex_change(tp %p,handle 0x%08x,tca %p,arg %p),opt %p,"
	    "p %p,r %p,*arg 0x%lx\n",
	    tp, handle, tca, arg, opt, p, r, arg ? *arg : 0L);

	if (opt == NULL)
		return 0;

	ret = nla_parse_nested(tb, TCA_TCINDEX_MAX, opt, tcindex_policy);
	if (ret >= 0)
		ret = tcindex_set_parms(tp, base, handle, p, r, tb,
					tca[TCA_RATE]);

	return ret;
}
static void tcindex_walk(struct tcf_proto *tp, struct tcf_walker *walker)
{
struct tcindex_data *p = PRIV(tp);
struct tcindex_filter *f, *next;
int i;
pr_debug("tcindex_walk(tp %p,walker %p),p %p\n", tp, walker, p);
if (p->perfect) {
for (i = 0; i < p->hash; i++) {
if (!p->perfect[i].res.class)
continue;
if (walker->count >= walker->skip) {
if (walker->fn(tp,
(unsigned long) (p->perfect+i), walker)
< 0) {
walker->stop = 1;
return;
}
}
walker->count++;
}
}
if (!p->h)
return;
for (i = 0; i < p->hash; i++) {
for (f = p->h[i]; f; f = next) {
next = f->next;
if (walker->count >= walker->skip) {
if (walker->fn(tp, (unsigned long) &f->result,
walker) < 0) {
walker->stop = 1;
return;
}
}
walker->count++;
}
}
}
/* Walker callback used by tcindex_destroy(): delete one entry without
 * taking the tree lock. */
static int tcindex_destroy_element(struct tcf_proto *tp,
unsigned long arg, struct tcf_walker *walker)
{
return __tcindex_delete(tp, arg, 0);
}
/*
 * .destroy operation: delete every entry via tcindex_walk(), then free both
 * tables and the private state and clear tp->root.
 */
static void tcindex_destroy(struct tcf_proto *tp)
{
struct tcindex_data *p = PRIV(tp);
struct tcf_walker walker;
pr_debug("tcindex_destroy(tp %p),p %p\n", tp, p);
/* walker.stop is left unset; tcindex_walk() only ever writes it. */
walker.count = 0;
walker.skip = 0;
walker.fn = &tcindex_destroy_element;
tcindex_walk(tp, &walker);
kfree(p->perfect);
kfree(p->h);
kfree(p);
tp->root = NULL;
}
/*
 * .dump operation.
 *
 * With fh == 0 the classifier-wide parameters (hash, mask, shift,
 * fall_through) are dumped.  Otherwise the entry `fh` is dumped: its handle
 * is recovered from its position in the perfect hash or by searching the
 * imperfect hash for the node that embeds it.
 *
 * Returns skb->len on success, or -1 after trimming the skb back to its
 * initial tail when an attribute cannot be added.
 */
static int tcindex_dump(struct tcf_proto *tp, unsigned long fh,
struct sk_buff *skb, struct tcmsg *t)
{
struct tcindex_data *p = PRIV(tp);
struct tcindex_filter_result *r = (struct tcindex_filter_result *) fh;
unsigned char *b = skb_tail_pointer(skb);
struct nlattr *nest;
pr_debug("tcindex_dump(tp %p,fh 0x%lx,skb %p,t %p),p %p,r %p,b %p\n",
tp, fh, skb, t, p, r, b);
pr_debug("p->perfect %p p->h %p\n", p->perfect, p->h);
nest = nla_nest_start(skb, TCA_OPTIONS);
if (nest == NULL)
goto nla_put_failure;
if (!fh) {
t->tcm_handle = ~0; /* whatever ... */
NLA_PUT_U32(skb, TCA_TCINDEX_HASH, p->hash);
NLA_PUT_U16(skb, TCA_TCINDEX_MASK, p->mask);
NLA_PUT_U32(skb, TCA_TCINDEX_SHIFT, p->shift);
NLA_PUT_U32(skb, TCA_TCINDEX_FALL_THROUGH, p->fall_through);
nla_nest_end(skb, nest);
} else {
if (p->perfect) {
/* Handle is the slot index within the perfect table. */
t->tcm_handle = r-p->perfect;
} else {
struct tcindex_filter *f;
int i;
t->tcm_handle = 0;
/* Both loops stop as soon as a non-zero handle is found; a node
 * whose key is 0 therefore does not end the search early. */
for (i = 0; !t->tcm_handle && i < p->hash; i++) {
for (f = p->h[i]; !t->tcm_handle && f;
f = f->next) {
if (&f->result == r)
t->tcm_handle = f->key;
}
}
}
pr_debug("handle = %d\n", t->tcm_handle);
if (r->res.class)
NLA_PUT_U32(skb, TCA_TCINDEX_CLASSID, r->res.classid);
if (tcf_exts_dump(skb, &r->exts, &tcindex_ext_map) < 0)
goto nla_put_failure;
nla_nest_end(skb, nest);
if (tcf_exts_dump_stats(skb, &r->exts, &tcindex_ext_map) < 0)
goto nla_put_failure;
}
return skb->len;
nla_put_failure:
nlmsg_trim(skb, b);
return -1;
}
/*
 * Operations table for the "tcindex" classifier; registered by
 * init_tcindex() and unregistered by exit_tcindex().
 */
static struct tcf_proto_ops cls_tcindex_ops __read_mostly = {
.kind = "tcindex",
.classify = tcindex_classify,
.init = tcindex_init,
.destroy = tcindex_destroy,
.get = tcindex_get,
.put = tcindex_put,
.change = tcindex_change,
.delete = tcindex_delete,
.walk = tcindex_walk,
.dump = tcindex_dump,
.owner = THIS_MODULE,
};
/* Module entry point: register the "tcindex" classifier operations. */
static int __init init_tcindex(void)
{
	int err = register_tcf_proto_ops(&cls_tcindex_ops);

	return err;
}
/* Module exit point: unregister the "tcindex" classifier operations. */
static void __exit exit_tcindex(void)
{
unregister_tcf_proto_ops(&cls_tcindex_ops);
}
module_init(init_tcindex)
module_exit(exit_tcindex)
MODULE_LICENSE("GPL");

790
kernel/net/sched/cls_u32.c Normal file
View File

@@ -0,0 +1,790 @@
/*
* net/sched/cls_u32.c Ugly (or Universal) 32bit key Packet Classifier.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
*
* The filters are packed to hash tables of key nodes
* with a set of 32bit key/mask pairs at every node.
* Nodes reference next level hash tables etc.
*
* This scheme is the best universal classifier I managed to
* invent; it is not super-fast, but it is not slow (provided you
* program it correctly), and general enough. And its relative
* speed grows as the number of rules becomes larger.
*
* It seems that it represents the best middle point between
* speed and manageability both by human and by machine.
*
* It is especially useful for link sharing combined with QoS;
* pure RSVP doesn't need such a general approach and can use
* much simpler (and faster) schemes, sort of cls_rsvp.c.
*
* JHS: We should remove the CONFIG_NET_CLS_IND from here
* eventually when the meta match extension is made available
*
* nfmark match added by Catalin(ux aka Dino) BOIE <catab at umbrella.ro>
*/
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/rtnetlink.h>
#include <linux/skbuff.h>
#include <net/netlink.h>
#include <net/act_api.h>
#include <net/pkt_cls.h>
/*
 * Key node: one filter rule, consisting of a selector (set of 32-bit
 * key/mask pairs) plus either a terminal result or a link to a lower-level
 * hash table.
 */
struct tc_u_knode
{
/* Next key node in the same hash bucket. */
struct tc_u_knode *next;
u32 handle;
/* Hash table this node is linked into. */
struct tc_u_hnode *ht_up;
struct tcf_exts exts;
#ifdef CONFIG_NET_CLS_IND
/* Device name checked with tcf_match_indev() in u32_classify(). */
char indev[IFNAMSIZ];
#endif
/* Right shift applied to the masked hash key in u32_hash_fold(). */
u8 fshift;
struct tcf_result res;
/* Lower-level hash table to descend into on a match, or NULL. */
struct tc_u_hnode *ht_down;
#ifdef CONFIG_CLS_U32_PERF
/* Per-node hit counters updated by u32_classify(). */
struct tc_u32_pcnt *pf;
#endif
#ifdef CONFIG_CLS_U32_MARK
/* skb->mark value/mask that must match before the keys are tried. */
struct tc_u32_mark mark;
#endif
struct tc_u32_sel sel;
};
/* Hash table node: a table of key-node lists. */
struct tc_u_hnode
{
/* Next hash table in tc_u_common.hlist. */
struct tc_u_hnode *next;
u32 handle;
u32 prio;
/* Shared per-qdisc state this table belongs to. */
struct tc_u_common *tp_c;
int refcnt;
/* Highest valid bucket index: buckets 0..divisor are used. */
unsigned divisor;
/* Bucket array; declared with one element, indexed up to `divisor`. */
struct tc_u_knode *ht[1];
};
/* State shared by all u32 classifier instances on one qdisc
 * (reachable through tp->q->u32_node and tp->data). */
struct tc_u_common
{
/* List of all hash tables, linked through tc_u_hnode.next. */
struct tc_u_hnode *hlist;
struct Qdisc *q;
int refcnt;
/* Rolling counter used by gen_new_htid() to build table ids. */
u32 hgenerator;
};
/* Netlink attribute ids used for this classifier's action/police extensions. */
static const struct tcf_ext_map u32_ext_map = {
.action = TCA_U32_ACT,
.police = TCA_U32_POLICE
};
/*
 * Derive a hash value from a packet word: mask it with the selector's
 * hmask, convert to host byte order and shift right by fshift.
 */
static __inline__ unsigned u32_hash_fold(__be32 key, struct tc_u32_sel *sel, u8 fshift)
{
	return ntohl(key & sel->hmask) >> fshift;
}
/*
 * Classify a packet by walking the tree of hash tables and key nodes.
 *
 * Starting at the root table, each key node in the selected bucket is
 * tested; a node matches when all of its key/mask pairs match the packet.
 * A matching node either terminates the search (TC_U32_TERMINAL) or links
 * to a lower table, in which case the current node is pushed on a small
 * stack and the walk descends.  When a bucket is exhausted the walk pops
 * back to the parent node.
 *
 * Returns the verdict of the matching node's extensions, or -1 when nothing
 * matches or the nesting exceeds TC_U32_MAXDEPTH.
 *
 * NOTE(review): packet words are read at computed offsets without a visible
 * bounds check on each access — confirm this is guaranteed elsewhere.
 */
static int u32_classify(struct sk_buff *skb, struct tcf_proto *tp, struct tcf_result *res)
{
/* Saved (node, packet pointer) pairs for the tables descended through. */
struct {
struct tc_u_knode *knode;
u8 *ptr;
} stack[TC_U32_MAXDEPTH];
struct tc_u_hnode *ht = (struct tc_u_hnode*)tp->root;
u8 *ptr = skb_network_header(skb);
struct tc_u_knode *n;
int sdepth = 0;
int off2 = 0;
int sel = 0;
#ifdef CONFIG_CLS_U32_PERF
int j;
#endif
int i, r;
/* Start scanning bucket `sel` of table `ht`. */
next_ht:
n = ht->ht[sel];
/* Test key node `n`, or pop if the bucket is exhausted. */
next_knode:
if (n) {
struct tc_u32_key *key = n->sel.keys;
#ifdef CONFIG_CLS_U32_PERF
n->pf->rcnt +=1;
j = 0;
#endif
#ifdef CONFIG_CLS_U32_MARK
if ((skb->mark & n->mark.mask) != n->mark.val) {
n = n->next;
goto next_knode;
} else {
n->mark.success++;
}
#endif
/* Every key must match; the first mismatch moves on to the next node. */
for (i = n->sel.nkeys; i>0; i--, key++) {
if ((*(__be32*)(ptr+key->off+(off2&key->offmask))^key->val)&key->mask) {
n = n->next;
goto next_knode;
}
#ifdef CONFIG_CLS_U32_PERF
n->pf->kcnts[j] +=1;
j++;
#endif
}
if (n->ht_down == NULL) {
/* Also entered after a POP, to test the parent node itself. */
check_terminal:
if (n->sel.flags&TC_U32_TERMINAL) {
*res = n->res;
#ifdef CONFIG_NET_CLS_IND
if (!tcf_match_indev(skb, n->indev)) {
n = n->next;
goto next_knode;
}
#endif
#ifdef CONFIG_CLS_U32_PERF
n->pf->rhit +=1;
#endif
r = tcf_exts_exec(skb, &n->exts, res);
/* A negative verdict means "keep looking". */
if (r < 0) {
n = n->next;
goto next_knode;
}
return r;
}
n = n->next;
goto next_knode;
}
/* PUSH */
if (sdepth >= TC_U32_MAXDEPTH)
goto deadloop;
stack[sdepth].knode = n;
stack[sdepth].ptr = ptr;
sdepth++;
ht = n->ht_down;
sel = 0;
/* Pick the bucket of the lower table from the packet word at hoff. */
if (ht->divisor)
sel = ht->divisor&u32_hash_fold(*(__be32*)(ptr+n->sel.hoff), &n->sel,n->fshift);
if (!(n->sel.flags&(TC_U32_VAROFFSET|TC_U32_OFFSET|TC_U32_EAT)))
goto next_ht;
if (n->sel.flags&(TC_U32_OFFSET|TC_U32_VAROFFSET)) {
/* Fixed offset plus optional packet-derived part, rounded up to a
 * multiple of 4. */
off2 = n->sel.off + 3;
if (n->sel.flags&TC_U32_VAROFFSET)
off2 += ntohs(n->sel.offmask & *(__be16*)(ptr+n->sel.offoff)) >>n->sel.offshift;
off2 &= ~3;
}
if (n->sel.flags&TC_U32_EAT) {
/* Consume the offset: advance the packet pointer permanently. */
ptr += off2;
off2 = 0;
}
if (ptr < skb_tail_pointer(skb))
goto next_ht;
}
/* POP */
if (sdepth--) {
n = stack[sdepth].knode;
ht = n->ht_up;
ptr = stack[sdepth].ptr;
goto check_terminal;
}
return -1;
deadloop:
if (net_ratelimit())
printk("cls_u32: dead loop\n");
return -1;
}
/*
 * Find the hash table with the given handle in the shared table list.
 * Returns NULL if there is none.
 */
static __inline__ struct tc_u_hnode *
u32_lookup_ht(struct tc_u_common *tp_c, u32 handle)
{
	struct tc_u_hnode *ht = tp_c->hlist;

	while (ht && ht->handle != handle)
		ht = ht->next;

	return ht;
}
/*
 * Find the key node with the given handle inside one hash table.  The
 * bucket is taken from the handle; returns NULL if the bucket index exceeds
 * the table's divisor or no node carries the handle.
 */
static __inline__ struct tc_u_knode *
u32_lookup_key(struct tc_u_hnode *ht, u32 handle)
{
	unsigned bucket = TC_U32_HASH(handle);
	struct tc_u_knode *node;

	if (bucket > ht->divisor)
		return NULL;

	for (node = ht->ht[bucket]; node; node = node->next) {
		if (node->handle == handle)
			return node;
	}

	return NULL;
}
/*
 * .get operation: resolve a handle to either a hash table (when the key
 * part of the handle is 0) or a key node.  The table part of the handle
 * selects this instance's root table or a table from the shared list.
 * Returns the object pointer cast to unsigned long, or 0 if not found.
 */
static unsigned long u32_get(struct tcf_proto *tp, u32 handle)
{
	struct tc_u_common *tp_c = tp->data;
	struct tc_u_hnode *ht;

	if (TC_U32_HTID(handle) != TC_U32_ROOT)
		ht = u32_lookup_ht(tp_c, TC_U32_HTID(handle));
	else
		ht = tp->root;

	if (ht == NULL)
		return 0;

	if (TC_U32_KEY(handle) != 0)
		return (unsigned long)u32_lookup_key(ht, handle);

	return (unsigned long)ht;
}
/*
 * Counterpart of u32_get(). u32_get() only looks objects up and does not
 * take a reference, so there is nothing to release here.
 */
static void u32_put(struct tcf_proto *tp, unsigned long f)
{
}
static u32 gen_new_htid(struct tc_u_common *tp_c)
{
int i = 0x800;
do {
if (++tp_c->hgenerator == 0x7FF)
tp_c->hgenerator = 1;
} while (--i>0 && u32_lookup_ht(tp_c, (tp_c->hgenerator|0x800)<<20));
return i > 0 ? (tp_c->hgenerator|0x800)<<20 : 0;
}
/*
 * Set up a new u32 filter instance.
 *
 * Allocates the root hash table of tp and attaches it to the tc_u_common
 * stored in the qdisc (tp->q->u32_node), creating that shared structure
 * when this is the first u32 filter on the qdisc.
 * Returns 0 on success, -ENOBUFS when an allocation fails.
 */
static int u32_init(struct tcf_proto *tp)
{
	struct tc_u_common *common = tp->q->u32_node;
	struct tc_u_hnode *root;

	root = kzalloc(sizeof(*root), GFP_KERNEL);
	if (!root)
		return -ENOBUFS;

	/* kzalloc() zeroed the table: divisor is 0 and refcnt counts up from 0 */
	root->divisor = 0;
	root->refcnt += 1;
	root->prio = tp->prio;

	if (common != NULL) {
		root->handle = gen_new_htid(common);
	} else {
		root->handle = 0x80000000;

		common = kzalloc(sizeof(*common), GFP_KERNEL);
		if (!common) {
			kfree(root);
			return -ENOBUFS;
		}
		common->q = tp->q;
		tp->q->u32_node = common;
	}

	common->refcnt += 1;

	/* push the root table onto the shared table list */
	root->next = common->hlist;
	common->hlist = root;
	root->tp_c = common;

	tp->root = root;
	tp->data = common;
	return 0;
}
static int u32_destroy_key(struct tcf_proto *tp, struct tc_u_knode *n)
{
tcf_unbind_filter(tp, &n->res);
tcf_exts_destroy(tp, &n->exts);
if (n->ht_down)
n->ht_down->refcnt--;
#ifdef CONFIG_CLS_U32_PERF
kfree(n->pf);
#endif
kfree(n);
return 0;
}
static int u32_delete_key(struct tcf_proto *tp, struct tc_u_knode* key)
{
struct tc_u_knode **kp;
struct tc_u_hnode *ht = key->ht_up;
if (ht) {
for (kp = &ht->ht[TC_U32_HASH(key->handle)]; *kp; kp = &(*kp)->next) {
if (*kp == key) {
tcf_tree_lock(tp);
*kp = key->next;
tcf_tree_unlock(tp);
u32_destroy_key(tp, key);
return 0;
}
}
}
WARN_ON(1);
return 0;
}
/*
 * Destroy every key node stored in hash table ht. The table itself is
 * kept; it has ht->divisor + 1 buckets.
 */
static void u32_clear_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht)
{
	unsigned bucket;

	for (bucket = 0; bucket <= ht->divisor; bucket++) {
		struct tc_u_knode **head = &ht->ht[bucket];
		struct tc_u_knode *victim;

		/* pop nodes off the head of the chain until it is empty */
		for (victim = *head; victim != NULL; victim = *head) {
			*head = victim->next;
			u32_destroy_key(tp, victim);
		}
	}
}
static int u32_destroy_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht)
{
struct tc_u_common *tp_c = tp->data;
struct tc_u_hnode **hn;
WARN_ON(ht->refcnt);
u32_clear_hnode(tp, ht);
for (hn = &tp_c->hlist; *hn; hn = &(*hn)->next) {
if (*hn == ht) {
*hn = ht->next;
kfree(ht);
return 0;
}
}
WARN_ON(1);
return -ENOENT;
}
/*
 * Tear down a u32 filter instance.
 *
 * Drops the reference on the root table and destroys it once unused.
 * When this was the last user of the shared tc_u_common, all remaining
 * tables are emptied and freed together with the shared structure.
 */
static void u32_destroy(struct tcf_proto *tp)
{
	struct tc_u_common *common = tp->data;
	struct tc_u_hnode *root = tp->root;

	WARN_ON(root == NULL);

	if (root != NULL) {
		root->refcnt--;
		if (root->refcnt == 0)
			u32_destroy_hnode(tp, root);
	}

	common->refcnt--;
	if (common->refcnt == 0) {
		struct tc_u_hnode *cur;

		tp->q->u32_node = NULL;

		/* first pass: drop each table's own reference and its keys */
		cur = common->hlist;
		while (cur != NULL) {
			cur->refcnt--;
			u32_clear_hnode(tp, cur);
			cur = cur->next;
		}

		/* second pass: every table must be unreferenced by now */
		for (cur = common->hlist; cur != NULL; cur = common->hlist) {
			common->hlist = cur->next;
			WARN_ON(cur->refcnt != 0);
			kfree(cur);
		}

		kfree(common);
	}

	tp->data = NULL;
}
/*
 * Delete the object referred to by arg, which is either a key node or a
 * hash table (told apart by the key part of the handle).
 *
 * Returns 0 on success or when arg is 0, -EINVAL for the root table of tp,
 * and -EBUSY when the table's reference count is not exactly one.
 */
static int u32_delete(struct tcf_proto *tp, unsigned long arg)
{
	struct tc_u_hnode *ht = (struct tc_u_hnode*)arg;

	if (!ht)
		return 0;

	/* key nodes start with the same fields, so the handle tells them apart */
	if (TC_U32_KEY(ht->handle))
		return u32_delete_key(tp, (struct tc_u_knode*)ht);

	if (tp->root == ht)
		return -EINVAL;

	if (ht->refcnt != 1)
		return -EBUSY;

	ht->refcnt--;
	u32_destroy_hnode(tp, ht);
	return 0;
}
/*
 * Build a handle for a new key node in the bucket selected by handle.
 * The node id is one above the highest id in the bucket, starting at
 * 0x800 and capped at 0xFFF.
 */
static u32 gen_new_kid(struct tc_u_hnode *ht, u32 handle)
{
	unsigned highest = 0x7FF;
	struct tc_u_knode *cur = ht->ht[TC_U32_HASH(handle)];

	while (cur != NULL) {
		if (TC_U32_NODE(cur->handle) > highest)
			highest = TC_U32_NODE(cur->handle);
		cur = cur->next;
	}

	highest++;
	if (highest > 0xFFF)
		highest = 0xFFF;

	return handle|highest;
}
/*
 * Netlink attribute policy handed to nla_parse_nested() in u32_change().
 * The TCA_U32_SEL entry is sized to the fixed part of struct tc_u32_sel
 * only; the keys that u32_change() copies after it are not described here.
 */
static const struct nla_policy u32_policy[TCA_U32_MAX + 1] = {
[TCA_U32_CLASSID] = { .type = NLA_U32 },
[TCA_U32_HASH] = { .type = NLA_U32 },
[TCA_U32_LINK] = { .type = NLA_U32 },
[TCA_U32_DIVISOR] = { .type = NLA_U32 },
[TCA_U32_SEL] = { .len = sizeof(struct tc_u32_sel) },
[TCA_U32_INDEV] = { .type = NLA_STRING, .len = IFNAMSIZ },
[TCA_U32_MARK] = { .len = sizeof(struct tc_u32_mark) },
};
/*
 * Apply the netlink attributes in tb to key node n.
 *
 * @tp:   owning filter instance
 * @base: base value passed on to tcf_bind_filter()
 * @ht:   hash table used to reach the shared table list (ht->tp_c)
 * @n:    key node being created or changed
 * @tb:   parsed TCA_U32_* attributes
 * @est:  rate estimator attribute passed to tcf_exts_validate()
 *
 * Every step that can fail (extension validation, link lookup, input
 * device change) runs before any reference count or class binding is
 * touched. Otherwise a late failure would return an error while leaving
 * the linked table's refcnt raised and the class bound, and a caller that
 * then frees a freshly allocated node would leak both.
 *
 * Returns 0 on success or a negative errno; on failure the link, class
 * binding and extensions of n are unchanged.
 */
static int u32_set_parms(struct tcf_proto *tp, unsigned long base,
			 struct tc_u_hnode *ht,
			 struct tc_u_knode *n, struct nlattr **tb,
			 struct nlattr *est)
{
	struct tc_u_hnode *ht_down = NULL;
	struct tcf_exts e;
	int err;

	err = tcf_exts_validate(tp, tb, est, &e, &u32_ext_map);
	if (err < 0)
		return err;

	err = -EINVAL;
	if (tb[TCA_U32_LINK]) {
		u32 handle = nla_get_u32(tb[TCA_U32_LINK]);

		/* a link must name a table, not a key inside one */
		if (TC_U32_KEY(handle))
			goto errout;

		/* handle 0 removes the link; anything else must exist */
		if (handle) {
			ht_down = u32_lookup_ht(ht->tp_c, handle);
			if (ht_down == NULL)
				goto errout;
		}
	}

#ifdef CONFIG_NET_CLS_IND
	if (tb[TCA_U32_INDEV]) {
		err = tcf_change_indev(tp, n->indev, tb[TCA_U32_INDEV]);
		if (err < 0)
			goto errout;
	}
#endif

	/* Nothing below can fail: commit the changes. */
	if (tb[TCA_U32_LINK]) {
		struct tc_u_hnode *ht_old;

		if (ht_down)
			ht_down->refcnt++;

		tcf_tree_lock(tp);
		ht_old = n->ht_down;
		n->ht_down = ht_down;
		tcf_tree_unlock(tp);

		if (ht_old)
			ht_old->refcnt--;
	}

	if (tb[TCA_U32_CLASSID]) {
		n->res.classid = nla_get_u32(tb[TCA_U32_CLASSID]);
		tcf_bind_filter(tp, &n->res, base);
	}

	tcf_exts_change(tp, &n->exts, &e);
	return 0;

errout:
	tcf_exts_destroy(tp, &e);
	return err;
}
/*
 * Create or change a u32 object from a netlink request.
 *
 * @tp:     filter instance
 * @base:   base value passed on to u32_set_parms()
 * @handle: requested handle, 0 to have one generated
 * @tca:    top-level attributes; TCA_OPTIONS carries the nested TCA_U32_*
 * @arg:    in: existing object to change (0 for none);
 *          out: the newly created hash table or key node
 *
 * Three cases are handled:
 *  - *arg names an existing key node: only its parameters are updated;
 *  - TCA_U32_DIVISOR is present: a new hash table is created;
 *  - otherwise a new key node is created from TCA_U32_SEL and inserted
 *    into the table selected by TCA_U32_HASH (or the root table).
 *
 * Returns 0 on success or a negative errno.
 */
static int u32_change(struct tcf_proto *tp, unsigned long base, u32 handle,
		      struct nlattr **tca,
		      unsigned long *arg)
{
	struct tc_u_common *tp_c = tp->data;
	struct tc_u_hnode *ht;
	struct tc_u_knode *n;
	struct tc_u32_sel *s;
	struct nlattr *opt = tca[TCA_OPTIONS];
	struct nlattr *tb[TCA_U32_MAX + 1];
	size_t sel_size;
	u32 htid;
	int err;

	if (opt == NULL)
		return handle ? -EINVAL : 0;

	err = nla_parse_nested(tb, TCA_U32_MAX, opt, u32_policy);
	if (err < 0)
		return err;

	/* Existing object: only key nodes can be changed. */
	if ((n = (struct tc_u_knode*)*arg) != NULL) {
		if (TC_U32_KEY(n->handle) == 0)
			return -EINVAL;

		return u32_set_parms(tp, base, n->ht_up, n, tb, tca[TCA_RATE]);
	}

	/* New hash table. */
	if (tb[TCA_U32_DIVISOR]) {
		unsigned divisor = nla_get_u32(tb[TCA_U32_DIVISOR]);

		/* stored as bucket count minus one; 0 wraps and is rejected */
		if (--divisor > 0x100)
			return -EINVAL;
		if (TC_U32_KEY(handle))
			return -EINVAL;
		if (handle == 0) {
			handle = gen_new_htid(tp->data);
			if (handle == 0)
				return -ENOMEM;
		}
		ht = kzalloc(sizeof(*ht) + divisor*sizeof(void*), GFP_KERNEL);
		if (ht == NULL)
			return -ENOBUFS;
		ht->tp_c = tp_c;
		ht->refcnt = 1;
		ht->divisor = divisor;
		ht->handle = handle;
		ht->prio = tp->prio;
		ht->next = tp_c->hlist;
		tp_c->hlist = ht;
		*arg = (unsigned long)ht;
		return 0;
	}

	/* New key node: find the table it goes into. */
	if (tb[TCA_U32_HASH]) {
		htid = nla_get_u32(tb[TCA_U32_HASH]);
		if (TC_U32_HTID(htid) == TC_U32_ROOT) {
			ht = tp->root;
			htid = ht->handle;
		} else {
			ht = u32_lookup_ht(tp->data, TC_U32_HTID(htid));
			if (ht == NULL)
				return -EINVAL;
		}
	} else {
		ht = tp->root;
		htid = ht->handle;
	}

	if (ht->divisor < TC_U32_HASH(htid))
		return -EINVAL;

	if (handle) {
		if (TC_U32_HTID(handle) && TC_U32_HTID(handle^htid))
			return -EINVAL;
		handle = htid | TC_U32_NODE(handle);
	} else
		handle = gen_new_kid(ht, htid);

	if (tb[TCA_U32_SEL] == NULL)
		return -EINVAL;

	s = nla_data(tb[TCA_U32_SEL]);

	/*
	 * The policy only guarantees the fixed part of the selector. nkeys
	 * comes from userspace, so make sure the attribute really carries
	 * that many keys before copying them.
	 */
	sel_size = sizeof(*s) + s->nkeys*sizeof(struct tc_u32_key);
	if (nla_len(tb[TCA_U32_SEL]) < sel_size)
		return -EINVAL;

	n = kzalloc(sizeof(*n) + s->nkeys*sizeof(struct tc_u32_key), GFP_KERNEL);
	if (n == NULL)
		return -ENOBUFS;

#ifdef CONFIG_CLS_U32_PERF
	n->pf = kzalloc(sizeof(struct tc_u32_pcnt) + s->nkeys*sizeof(u64), GFP_KERNEL);
	if (n->pf == NULL) {
		kfree(n);
		return -ENOBUFS;
	}
#endif

	memcpy(&n->sel, s, sel_size);
	n->ht_up = ht;
	n->handle = handle;
	n->fshift = s->hmask ? ffs(ntohl(s->hmask)) - 1 : 0;

#ifdef CONFIG_CLS_U32_MARK
	if (tb[TCA_U32_MARK]) {
		struct tc_u32_mark *mark;

		mark = nla_data(tb[TCA_U32_MARK]);
		memcpy(&n->mark, mark, sizeof(struct tc_u32_mark));
		n->mark.success = 0;
	}
#endif

	err = u32_set_parms(tp, base, ht, n, tb, tca[TCA_RATE]);
	if (err == 0) {
		struct tc_u_knode **ins;

		/* keep the bucket chain sorted by node id */
		for (ins = &ht->ht[TC_U32_HASH(handle)]; *ins; ins = &(*ins)->next)
			if (TC_U32_NODE(handle) < TC_U32_NODE((*ins)->handle))
				break;

		n->next = *ins;
		tcf_tree_lock(tp);
		*ins = n;
		tcf_tree_unlock(tp);

		*arg = (unsigned long)n;
		return 0;
	}
#ifdef CONFIG_CLS_U32_PERF
	kfree(n->pf);
#endif
	kfree(n);
	return err;
}
static void u32_walk(struct tcf_proto *tp, struct tcf_walker *arg)
{
struct tc_u_common *tp_c = tp->data;
struct tc_u_hnode *ht;
struct tc_u_knode *n;
unsigned h;
if (arg->stop)
return;
for (ht = tp_c->hlist; ht; ht = ht->next) {
if (ht->prio != tp->prio)
continue;
if (arg->count >= arg->skip) {
if (arg->fn(tp, (unsigned long)ht, arg) < 0) {
arg->stop = 1;
return;
}
}
arg->count++;
for (h = 0; h <= ht->divisor; h++) {
for (n = ht->ht[h]; n; n = n->next) {
if (arg->count < arg->skip) {
arg->count++;
continue;
}
if (arg->fn(tp, (unsigned long)n, arg) < 0) {
arg->stop = 1;
return;
}
arg->count++;
}
}
}
}
/*
 * Dump the object identified by fh (a hash table or a key node) into skb
 * as a nested TCA_OPTIONS attribute.
 *
 * A handle without a key part denotes a hash table, for which only the
 * divisor is reported. For a key node the selector, table id, class id,
 * link, mark, extensions, input device and performance counters are
 * emitted as far as they are configured and set.
 *
 * Returns skb->len on success (also when fh is 0), or -1 after cancelling
 * the nest when an attribute could not be added. The NLA_PUT*() macros
 * appear to bail out via the nla_put_failure label.
 */
static int u32_dump(struct tcf_proto *tp, unsigned long fh,
struct sk_buff *skb, struct tcmsg *t)
{
struct tc_u_knode *n = (struct tc_u_knode*)fh;
struct nlattr *nest;
if (n == NULL)
return skb->len;
t->tcm_handle = n->handle;
nest = nla_nest_start(skb, TCA_OPTIONS);
if (nest == NULL)
goto nla_put_failure;
if (TC_U32_KEY(n->handle) == 0) {
/* fh is really a hash table; divisor is stored minus one */
struct tc_u_hnode *ht = (struct tc_u_hnode*)fh;
u32 divisor = ht->divisor+1;
NLA_PUT_U32(skb, TCA_U32_DIVISOR, divisor);
} else {
/* selector header plus its nkeys trailing keys */
NLA_PUT(skb, TCA_U32_SEL,
sizeof(n->sel) + n->sel.nkeys*sizeof(struct tc_u32_key),
&n->sel);
if (n->ht_up) {
/* strip the node id, leaving table id and bucket */
u32 htid = n->handle & 0xFFFFF000;
NLA_PUT_U32(skb, TCA_U32_HASH, htid);
}
if (n->res.classid)
NLA_PUT_U32(skb, TCA_U32_CLASSID, n->res.classid);
if (n->ht_down)
NLA_PUT_U32(skb, TCA_U32_LINK, n->ht_down->handle);
#ifdef CONFIG_CLS_U32_MARK
if (n->mark.val || n->mark.mask)
NLA_PUT(skb, TCA_U32_MARK, sizeof(n->mark), &n->mark);
#endif
if (tcf_exts_dump(skb, &n->exts, &u32_ext_map) < 0)
goto nla_put_failure;
#ifdef CONFIG_NET_CLS_IND
if(strlen(n->indev))
NLA_PUT_STRING(skb, TCA_U32_INDEV, n->indev);
#endif
#ifdef CONFIG_CLS_U32_PERF
NLA_PUT(skb, TCA_U32_PCNT,
sizeof(struct tc_u32_pcnt) + n->sel.nkeys*sizeof(u64),
n->pf);
#endif
}
nla_nest_end(skb, nest);
/* extension statistics are emitted outside the nest, for key nodes only */
if (TC_U32_KEY(n->handle))
if (tcf_exts_dump_stats(skb, &n->exts, &u32_ext_map) < 0)
goto nla_put_failure;
return skb->len;
nla_put_failure:
nla_nest_cancel(skb, nest);
return -1;
}
/*
 * Operations table of the "u32" classifier; registered by init_u32() and
 * unregistered by exit_u32().
 */
static struct tcf_proto_ops cls_u32_ops __read_mostly = {
.kind = "u32",
.classify = u32_classify,
.init = u32_init,
.destroy = u32_destroy,
.get = u32_get,
.put = u32_put,
.change = u32_change,
.delete = u32_delete,
.walk = u32_walk,
.dump = u32_dump,
.owner = THIS_MODULE,
};
/*
 * Module init: print which optional features were compiled in and
 * register the classifier. Returns the result of
 * register_tcf_proto_ops().
 */
static int __init init_u32(void)
{
printk("u32 classifier\n");
#ifdef CONFIG_CLS_U32_PERF
printk(" Performance counters on\n");
#endif
#ifdef CONFIG_NET_CLS_IND
printk(" input device check on \n");
#endif
#ifdef CONFIG_NET_CLS_ACT
printk(" Actions configured \n");
#endif
return register_tcf_proto_ops(&cls_u32_ops);
}
/* Module exit: unregister the classifier registered by init_u32(). */
static void __exit exit_u32(void)
{
unregister_tcf_proto_ops(&cls_u32_ops);
}
/* Module entry points and license. */
module_init(init_u32)
module_exit(exit_u32)
MODULE_LICENSE("GPL");

98
kernel/net/sched/em_cmp.c Normal file
View File

@@ -0,0 +1,98 @@
/*
* net/sched/em_cmp.c Simple packet data comparison ematch
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
* Authors: Thomas Graf <tgraf@suug.ch>
*/
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/skbuff.h>
#include <linux/tc_ematch/tc_em_cmp.h>
#include <asm/unaligned.h>
#include <net/pkt_cls.h>
/*
 * Non-zero when the TCF_EM_CMP_TRANS flag is set on cmp, in which case
 * em_cmp_match() applies a byte order conversion to the value it read.
 * The flag is expected to be rare (unlikely()).
 */
static inline int cmp_needs_transformation(struct tcf_em_cmp *cmp)
{
return unlikely(cmp->flags & TCF_EM_CMP_TRANS);
}
/*
 * Simple comparison ematch.
 *
 * Reads an 8, 16 or 32 bit value at cmp->off relative to the base pointer
 * of layer cmp->layer, optionally transforms and masks it, and compares
 * it against cmp->val using the configured operand.
 *
 * Returns the result of the comparison, or 0 when the offset is not valid
 * for the packet or the alignment or operand is unknown.
 */
static int em_cmp_match(struct sk_buff *skb, struct tcf_ematch *em,
			struct tcf_pkt_info *info)
{
	struct tcf_em_cmp *cmp = (struct tcf_em_cmp *) em->data;
	unsigned char *ptr = tcf_get_base_ptr(skb, cmp->layer) + cmp->off;
	u32 val = 0;

	if (!tcf_valid_offset(skb, ptr, cmp->align))
		return 0;

	if (cmp->align == TCF_EM_ALIGN_U8) {
		val = *ptr;
	} else if (cmp->align == TCF_EM_ALIGN_U16) {
		val = get_unaligned_be16(ptr);

		if (cmp_needs_transformation(cmp))
			val = be16_to_cpu(val);
	} else if (cmp->align == TCF_EM_ALIGN_U32) {
		/* Worth checking boundaries? The branching seems
		 * to get worse. Visit again. */
		val = get_unaligned_be32(ptr);

		if (cmp_needs_transformation(cmp))
			val = be32_to_cpu(val);
	} else {
		return 0;
	}

	/* a zero mask means "no mask" */
	if (cmp->mask)
		val &= cmp->mask;

	if (cmp->opnd == TCF_EM_OPND_EQ)
		return val == cmp->val;
	if (cmp->opnd == TCF_EM_OPND_LT)
		return val < cmp->val;
	if (cmp->opnd == TCF_EM_OPND_GT)
		return val > cmp->val;

	return 0;
}
/*
 * Ematch operations for TCF_EM_CMP. Only .match is provided; the
 * configuration is described by .datalen as a plain struct tcf_em_cmp.
 */
static struct tcf_ematch_ops em_cmp_ops = {
.kind = TCF_EM_CMP,
.datalen = sizeof(struct tcf_em_cmp),
.match = em_cmp_match,
.owner = THIS_MODULE,
.link = LIST_HEAD_INIT(em_cmp_ops.link)
};
/* Module init: register the cmp ematch; returns tcf_em_register()'s result. */
static int __init init_em_cmp(void)
{
return tcf_em_register(&em_cmp_ops);
}
/* Module exit: unregister the cmp ematch. */
static void __exit exit_em_cmp(void)
{
tcf_em_unregister(&em_cmp_ops);
}
/* Module license, entry points and ematch alias. */
MODULE_LICENSE("GPL");
module_init(init_em_cmp);
module_exit(exit_em_cmp);
MODULE_ALIAS_TCF_EMATCH(TCF_EM_CMP);

877
kernel/net/sched/em_meta.c Normal file
View File

@@ -0,0 +1,877 @@
/*
* net/sched/em_meta.c Metadata ematch
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
* Authors: Thomas Graf <tgraf@suug.ch>
*
* ==========================================================================
*
* The metadata ematch compares two meta objects where each object
* represents either a meta value stored in the kernel or a static
* value provided by userspace. The objects are not provided by
* userspace itself but rather a definition providing the information
* to build them. Every object is of a certain type which must be
* equal to the object it is being compared to.
*
* The definition of an object consists of the type (meta type), an
* identifier (meta id) and additional type specific information.
* The meta id is either TCF_META_ID_VALUE for values provided by
* userspace or an index to the meta operations table consisting of
* function pointers to type specific meta data collectors returning
* the value of the requested meta value.
*
* lvalue rvalue
* +-----------+ +-----------+
* | type: INT | | type: INT |
* def | id: DEV | | id: VALUE |
* | data: | | data: 3 |
* +-----------+ +-----------+
* | |
* ---> meta_ops[INT][DEV](...) |
* | |
* ----------- |
* V V
* +-----------+ +-----------+
* | type: INT | | type: INT |
* obj | id: DEV | | id: VALUE |
* | data: 2 |<--data got filled out | data: 3 |
* +-----------+ +-----------+
* | |
* --------------> 2 equals 3 <--------------
*
* This is a simplified schema, the complexity varies depending
* on the meta type. Obviously, the length of the data must also
* be provided for non-numeric types.
*
* Additionally, type dependent modifiers such as shift operators
* or mask may be applied to extend the functionality. As of now,
* the variable length type supports shifting the byte string to
* the right, eating up any number of octets and thus supporting
* wildcard interface name comparisons such as "ppp%" matching
* ppp0..9.
*
* NOTE: Certain meta values depend on other subsystems and are
* only available if that subsystem is enabled in the kernel.
*/
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/string.h>
#include <linux/skbuff.h>
#include <linux/random.h>
#include <linux/if_vlan.h>
#include <linux/tc_ematch/tc_em_meta.h>
#include <net/dst.h>
#include <net/route.h>
#include <net/pkt_cls.h>
#include <net/sock.h>
/*
 * A collected meta value as used for one comparison.
 * For integer types value holds the number itself; for variable length
 * types it holds a pointer (cast to unsigned long) to len bytes.
 */
struct meta_obj
{
unsigned long value;
unsigned int len;
};
/*
 * Definition of one side of a comparison as configured by userspace.
 * hdr carries kind (type and id), shift and operand; val/len hold the
 * optional data attribute: a number for integer types, a pointer to a
 * kmemdup()ed copy for variable length types.
 */
struct meta_value
{
struct tcf_meta_val hdr;
unsigned long val;
unsigned int len;
};
/* Private data of one meta ematch: the left and right value definitions. */
struct meta_match
{
struct meta_value lvalue;
struct meta_value rvalue;
};
/* Extract the meta id encoded in the kind field of v. */
static inline int meta_id(struct meta_value *v)
{
return TCF_META_ID(v->hdr.kind);
}
/* Extract the meta type encoded in the kind field of v. */
static inline int meta_type(struct meta_value *v)
{
return TCF_META_TYPE(v->hdr.kind);
}
/*
 * Declares a collector function named meta_<FUNC> with the common
 * signature. A collector stores its result in dst and reports failure by
 * setting *err to a negative value.
 */
#define META_COLLECTOR(FUNC) static void meta_##FUNC(struct sk_buff *skb, \
struct tcf_pkt_info *info, struct meta_value *v, \
struct meta_obj *dst, int *err)
/**************************************************************************
* System status & misc
**************************************************************************/
/* Fill the whole value (sizeof(unsigned long) bytes) with random data. */
META_COLLECTOR(int_random)
{
get_random_bytes(&dst->value, sizeof(dst->value));
}
/*
 * Convert a fixed point load value (FSHIFT fractional bits) into the
 * load multiplied by 100, rounded by adding FIXED_1/200 first.
 */
static inline unsigned long fixed_loadavg(int load)
{
	const int rounded = load + (FIXED_1/200);
	const int whole = rounded >> FSHIFT;
	const int hundredths = ((rounded & (FIXED_1-1)) * 100) >> FSHIFT;

	return (whole * 100) + hundredths;
}
/*
 * Load average collectors: report avenrun[0], avenrun[1] and avenrun[2]
 * respectively, converted by fixed_loadavg().
 */
META_COLLECTOR(int_loadavg_0)
{
dst->value = fixed_loadavg(avenrun[0]);
}
META_COLLECTOR(int_loadavg_1)
{
dst->value = fixed_loadavg(avenrun[1]);
}
META_COLLECTOR(int_loadavg_2)
{
dst->value = fixed_loadavg(avenrun[2]);
}
/**************************************************************************
* Device names & indices
**************************************************************************/
/*
 * Store the interface index of dev in dst.
 * Returns 0 on success, -1 when dev is NULL.
 */
static inline int int_dev(struct net_device *dev, struct meta_obj *dst)
{
	if (unlikely(!dev))
		return -1;

	dst->value = dev->ifindex;

	return 0;
}
/*
 * Make dst refer to the name of dev: value points at dev->name (no copy
 * is made) and len is the string length.
 * Returns 0 on success, -1 when dev is NULL.
 */
static inline int var_dev(struct net_device *dev, struct meta_obj *dst)
{
	const char *name;

	if (unlikely(!dev))
		return -1;

	name = dev->name;
	dst->value = (unsigned long) name;
	dst->len = strlen(name);

	return 0;
}
/* Interface index of skb->dev; *err is -1 when the skb has no device. */
META_COLLECTOR(int_dev)
{
*err = int_dev(skb->dev, dst);
}
/* Interface name of skb->dev; *err is -1 when the skb has no device. */
META_COLLECTOR(var_dev)
{
*err = var_dev(skb->dev, dst);
}
/**************************************************************************
* vlan tag
**************************************************************************/
/*
 * VLAN tag of the packet. The tag from vlan_tx_tag_get() is used when it
 * is non-zero; otherwise __vlan_get_tag() is consulted and a failure
 * there is reported through *err.
 */
META_COLLECTOR(int_vlan_tag)
{
unsigned short tag;
tag = vlan_tx_tag_get(skb);
if (!tag && __vlan_get_tag(skb, &tag))
*err = -1;
else
dst->value = tag;
}
/**************************************************************************
* skb attributes
**************************************************************************/
/*
 * skb attribute collectors: each one copies the named sk_buff field into
 * dst->value unchanged. None of them can fail.
 */
META_COLLECTOR(int_priority)
{
dst->value = skb->priority;
}
META_COLLECTOR(int_protocol)
{
/* Let userspace take care of the byte ordering */
dst->value = skb->protocol;
}
META_COLLECTOR(int_pkttype)
{
dst->value = skb->pkt_type;
}
META_COLLECTOR(int_pktlen)
{
dst->value = skb->len;
}
META_COLLECTOR(int_datalen)
{
dst->value = skb->data_len;
}
META_COLLECTOR(int_maclen)
{
dst->value = skb->mac_len;
}
/**************************************************************************
* Netfilter
**************************************************************************/
/* Packet mark as stored in skb->mark. */
META_COLLECTOR(int_mark)
{
dst->value = skb->mark;
}
/**************************************************************************
* Traffic Control
**************************************************************************/
/* Traffic control index as stored in skb->tc_index. */
META_COLLECTOR(int_tcindex)
{
dst->value = skb->tc_index;
}
/**************************************************************************
* Routing
**************************************************************************/
/*
 * Routing class id of the packet's dst entry. Fails when the skb has no
 * dst entry; reports 0 when CONFIG_NET_CLS_ROUTE is not enabled.
 */
META_COLLECTOR(int_rtclassid)
{
if (unlikely(skb_dst(skb) == NULL))
*err = -1;
else
#ifdef CONFIG_NET_CLS_ROUTE
dst->value = skb_dst(skb)->tclassid;
#else
dst->value = 0;
#endif
}
/* Input interface (fl.iif) of the packet's route; fails without a route. */
META_COLLECTOR(int_rtiif)
{
if (unlikely(skb_rtable(skb) == NULL))
*err = -1;
else
dst->value = skb_rtable(skb)->fl.iif;
}
/**************************************************************************
* Socket Attributes
**************************************************************************/
/*
 * Guard for the socket collectors: when the skb has no socket attached,
 * set *err to -1 and return from the enclosing collector. Relies on a
 * variable named err being in scope, as provided by META_COLLECTOR().
 */
#define SKIP_NONLOCAL(skb) \
if (unlikely(skb->sk == NULL)) { \
*err = -1; \
return; \
}
/*
 * Socket attribute collectors. Each one fails via SKIP_NONLOCAL() when
 * the skb has no socket and otherwise copies one field of skb->sk.
 */
META_COLLECTOR(int_sk_family)
{
SKIP_NONLOCAL(skb);
dst->value = skb->sk->sk_family;
}
META_COLLECTOR(int_sk_state)
{
SKIP_NONLOCAL(skb);
dst->value = skb->sk->sk_state;
}
META_COLLECTOR(int_sk_reuse)
{
SKIP_NONLOCAL(skb);
dst->value = skb->sk->sk_reuse;
}
META_COLLECTOR(int_sk_bound_if)
{
SKIP_NONLOCAL(skb);
/* No error if bound_dev_if is 0, legal userspace check */
dst->value = skb->sk->sk_bound_dev_if;
}
/*
 * Name of the interface the socket is bound to, or the literal "any"
 * when it is not bound (sk_bound_dev_if == 0). The device is looked up
 * in init_net only; a failed lookup is reported through *err.
 *
 * NOTE(review): var_dev() stores a pointer to dev->name in dst, and the
 * device reference is dropped right after, before the value is compared.
 * Confirm that the name cannot go away in between.
 */
META_COLLECTOR(var_sk_bound_if)
{
SKIP_NONLOCAL(skb);
if (skb->sk->sk_bound_dev_if == 0) {
dst->value = (unsigned long) "any";
dst->len = 3;
} else {
struct net_device *dev;
dev = dev_get_by_index(&init_net, skb->sk->sk_bound_dev_if);
*err = var_dev(dev, dst);
if (dev)
dev_put(dev);
}
}
/*
 * Remaining socket attribute collectors. All follow the same pattern:
 * fail via SKIP_NONLOCAL() when the skb has no socket, otherwise copy one
 * field (or accessor result) of skb->sk into dst->value.
 */
META_COLLECTOR(int_sk_refcnt)
{
SKIP_NONLOCAL(skb);
dst->value = atomic_read(&skb->sk->sk_refcnt);
}
META_COLLECTOR(int_sk_rcvbuf)
{
SKIP_NONLOCAL(skb);
dst->value = skb->sk->sk_rcvbuf;
}
META_COLLECTOR(int_sk_shutdown)
{
SKIP_NONLOCAL(skb);
dst->value = skb->sk->sk_shutdown;
}
META_COLLECTOR(int_sk_proto)
{
SKIP_NONLOCAL(skb);
dst->value = skb->sk->sk_protocol;
}
META_COLLECTOR(int_sk_type)
{
SKIP_NONLOCAL(skb);
dst->value = skb->sk->sk_type;
}
META_COLLECTOR(int_sk_rmem_alloc)
{
SKIP_NONLOCAL(skb);
dst->value = sk_rmem_alloc_get(skb->sk);
}
META_COLLECTOR(int_sk_wmem_alloc)
{
SKIP_NONLOCAL(skb);
dst->value = sk_wmem_alloc_get(skb->sk);
}
META_COLLECTOR(int_sk_omem_alloc)
{
SKIP_NONLOCAL(skb);
dst->value = atomic_read(&skb->sk->sk_omem_alloc);
}
META_COLLECTOR(int_sk_rcv_qlen)
{
SKIP_NONLOCAL(skb);
dst->value = skb->sk->sk_receive_queue.qlen;
}
META_COLLECTOR(int_sk_snd_qlen)
{
SKIP_NONLOCAL(skb);
dst->value = skb->sk->sk_write_queue.qlen;
}
META_COLLECTOR(int_sk_wmem_queued)
{
SKIP_NONLOCAL(skb);
dst->value = skb->sk->sk_wmem_queued;
}
META_COLLECTOR(int_sk_fwd_alloc)
{
SKIP_NONLOCAL(skb);
dst->value = skb->sk->sk_forward_alloc;
}
META_COLLECTOR(int_sk_sndbuf)
{
SKIP_NONLOCAL(skb);
dst->value = skb->sk->sk_sndbuf;
}
META_COLLECTOR(int_sk_alloc)
{
SKIP_NONLOCAL(skb);
dst->value = skb->sk->sk_allocation;
}
META_COLLECTOR(int_sk_route_caps)
{
SKIP_NONLOCAL(skb);
dst->value = skb->sk->sk_route_caps;
}
META_COLLECTOR(int_sk_hash)
{
SKIP_NONLOCAL(skb);
dst->value = skb->sk->sk_hash;
}
/* The linger time and both timeouts below are reported divided by HZ. */
META_COLLECTOR(int_sk_lingertime)
{
SKIP_NONLOCAL(skb);
dst->value = skb->sk->sk_lingertime / HZ;
}
META_COLLECTOR(int_sk_err_qlen)
{
SKIP_NONLOCAL(skb);
dst->value = skb->sk->sk_error_queue.qlen;
}
META_COLLECTOR(int_sk_ack_bl)
{
SKIP_NONLOCAL(skb);
dst->value = skb->sk->sk_ack_backlog;
}
META_COLLECTOR(int_sk_max_ack_bl)
{
SKIP_NONLOCAL(skb);
dst->value = skb->sk->sk_max_ack_backlog;
}
META_COLLECTOR(int_sk_prio)
{
SKIP_NONLOCAL(skb);
dst->value = skb->sk->sk_priority;
}
META_COLLECTOR(int_sk_rcvlowat)
{
SKIP_NONLOCAL(skb);
dst->value = skb->sk->sk_rcvlowat;
}
META_COLLECTOR(int_sk_rcvtimeo)
{
SKIP_NONLOCAL(skb);
dst->value = skb->sk->sk_rcvtimeo / HZ;
}
META_COLLECTOR(int_sk_sndtimeo)
{
SKIP_NONLOCAL(skb);
dst->value = skb->sk->sk_sndtimeo / HZ;
}
META_COLLECTOR(int_sk_sendmsg_off)
{
SKIP_NONLOCAL(skb);
dst->value = skb->sk->sk_sndmsg_off;
}
META_COLLECTOR(int_sk_write_pend)
{
SKIP_NONLOCAL(skb);
dst->value = skb->sk->sk_write_pending;
}
/**************************************************************************
* Meta value collectors assignment table
**************************************************************************/
/*
 * One entry of the collector table: the function that fetches the meta
 * value. A NULL get pointer marks an unsupported type/id combination
 * (see meta_is_supported()).
 */
struct meta_ops
{
void (*get)(struct sk_buff *, struct tcf_pkt_info *,
struct meta_value *, struct meta_obj *, int *);
};
/* Shorthands for filling the collector table below. */
#define META_ID(name) TCF_META_ID_##name
#define META_FUNC(name) { .get = meta_##name }
/* Meta value operations table listing all meta value collectors and
 * assigning them to a type and meta id. Entries not listed here stay
 * zeroed, i.e. have no collector. Indexed as [meta type][meta id]. */
static struct meta_ops __meta_ops[TCF_META_TYPE_MAX+1][TCF_META_ID_MAX+1] = {
[TCF_META_TYPE_VAR] = {
[META_ID(DEV)] = META_FUNC(var_dev),
[META_ID(SK_BOUND_IF)] = META_FUNC(var_sk_bound_if),
},
[TCF_META_TYPE_INT] = {
[META_ID(RANDOM)] = META_FUNC(int_random),
[META_ID(LOADAVG_0)] = META_FUNC(int_loadavg_0),
[META_ID(LOADAVG_1)] = META_FUNC(int_loadavg_1),
[META_ID(LOADAVG_2)] = META_FUNC(int_loadavg_2),
[META_ID(DEV)] = META_FUNC(int_dev),
[META_ID(PRIORITY)] = META_FUNC(int_priority),
[META_ID(PROTOCOL)] = META_FUNC(int_protocol),
[META_ID(PKTTYPE)] = META_FUNC(int_pkttype),
[META_ID(PKTLEN)] = META_FUNC(int_pktlen),
[META_ID(DATALEN)] = META_FUNC(int_datalen),
[META_ID(MACLEN)] = META_FUNC(int_maclen),
[META_ID(NFMARK)] = META_FUNC(int_mark),
[META_ID(TCINDEX)] = META_FUNC(int_tcindex),
[META_ID(RTCLASSID)] = META_FUNC(int_rtclassid),
[META_ID(RTIIF)] = META_FUNC(int_rtiif),
[META_ID(SK_FAMILY)] = META_FUNC(int_sk_family),
[META_ID(SK_STATE)] = META_FUNC(int_sk_state),
[META_ID(SK_REUSE)] = META_FUNC(int_sk_reuse),
[META_ID(SK_BOUND_IF)] = META_FUNC(int_sk_bound_if),
[META_ID(SK_REFCNT)] = META_FUNC(int_sk_refcnt),
[META_ID(SK_RCVBUF)] = META_FUNC(int_sk_rcvbuf),
[META_ID(SK_SNDBUF)] = META_FUNC(int_sk_sndbuf),
[META_ID(SK_SHUTDOWN)] = META_FUNC(int_sk_shutdown),
[META_ID(SK_PROTO)] = META_FUNC(int_sk_proto),
[META_ID(SK_TYPE)] = META_FUNC(int_sk_type),
[META_ID(SK_RMEM_ALLOC)] = META_FUNC(int_sk_rmem_alloc),
[META_ID(SK_WMEM_ALLOC)] = META_FUNC(int_sk_wmem_alloc),
[META_ID(SK_OMEM_ALLOC)] = META_FUNC(int_sk_omem_alloc),
[META_ID(SK_WMEM_QUEUED)] = META_FUNC(int_sk_wmem_queued),
[META_ID(SK_RCV_QLEN)] = META_FUNC(int_sk_rcv_qlen),
[META_ID(SK_SND_QLEN)] = META_FUNC(int_sk_snd_qlen),
[META_ID(SK_ERR_QLEN)] = META_FUNC(int_sk_err_qlen),
[META_ID(SK_FORWARD_ALLOCS)] = META_FUNC(int_sk_fwd_alloc),
[META_ID(SK_ALLOCS)] = META_FUNC(int_sk_alloc),
[META_ID(SK_ROUTE_CAPS)] = META_FUNC(int_sk_route_caps),
[META_ID(SK_HASH)] = META_FUNC(int_sk_hash),
[META_ID(SK_LINGERTIME)] = META_FUNC(int_sk_lingertime),
[META_ID(SK_ACK_BACKLOG)] = META_FUNC(int_sk_ack_bl),
[META_ID(SK_MAX_ACK_BACKLOG)] = META_FUNC(int_sk_max_ack_bl),
[META_ID(SK_PRIO)] = META_FUNC(int_sk_prio),
[META_ID(SK_RCVLOWAT)] = META_FUNC(int_sk_rcvlowat),
[META_ID(SK_RCVTIMEO)] = META_FUNC(int_sk_rcvtimeo),
[META_ID(SK_SNDTIMEO)] = META_FUNC(int_sk_sndtimeo),
[META_ID(SK_SENDMSG_OFF)] = META_FUNC(int_sk_sendmsg_off),
[META_ID(SK_WRITE_PENDING)] = META_FUNC(int_sk_write_pend),
[META_ID(VLAN_TAG)] = META_FUNC(int_vlan_tag),
}
};
/*
 * Look up the collector table entry for the type and id of val. Performs
 * no range check itself; em_meta_change() validates both before storing
 * a definition.
 */
static inline struct meta_ops * meta_ops(struct meta_value *val)
{
return &__meta_ops[meta_type(val)][meta_id(val)];
}
/**************************************************************************
* Type specific operations for TCF_META_TYPE_VAR
**************************************************************************/
/*
 * Compare two variable length objects: first by length, and only for
 * equal lengths by content. Returns 0 when equal, otherwise the length
 * difference or the memcmp() result.
 */
static int meta_var_compare(struct meta_obj *a, struct meta_obj *b)
{
	int diff = a->len - b->len;

	if (diff != 0)
		return diff;

	return memcmp((void *) a->value, (void *) b->value, a->len);
}
/*
 * Store a private copy of the attribute payload in dst.
 * Returns 0 on success, -ENOMEM when the copy cannot be allocated (in
 * which case dst->val is left as 0).
 */
static int meta_var_change(struct meta_value *dst, struct nlattr *nla)
{
	int len = nla_len(nla);
	void *copy = kmemdup(nla_data(nla), len, GFP_KERNEL);

	dst->val = (unsigned long)copy;
	if (dst->val == 0UL)
		return -ENOMEM;

	dst->len = len;
	return 0;
}
/* Free the copy made by meta_var_change(); v->val may be 0. */
static void meta_var_destroy(struct meta_value *v)
{
kfree((void *) v->val);
}
/*
 * Apply the shift modifier to a collected variable length object by
 * shortening it by shift bytes. A shift that is zero or not smaller than
 * the current length leaves the object untouched.
 */
static void meta_var_apply_extras(struct meta_value *v,
struct meta_obj *dst)
{
int shift = v->hdr.shift;
if (shift && shift < dst->len)
dst->len -= shift;
}
/*
 * Dump the stored data of a variable length value as attribute tlv.
 * Nothing is emitted when no data is stored. Returns 0 on success, -1
 * when NLA_PUT() bails out to the nla_put_failure label.
 */
static int meta_var_dump(struct sk_buff *skb, struct meta_value *v, int tlv)
{
if (v->val && v->len)
NLA_PUT(skb, tlv, v->len, (void *) v->val);
return 0;
nla_put_failure:
return -1;
}
/**************************************************************************
* Type specific operations for TCF_META_TYPE_INT
**************************************************************************/
/*
 * Three-way comparison of two integer objects: returns 0 when equal,
 * -1 when a is smaller and 1 when a is larger. As in the unlikely()
 * hint this replaces, a mismatch is treated as the common case.
 */
static int meta_int_compare(struct meta_obj *a, struct meta_obj *b)
{
	if (likely(a->value != b->value))
		return (a->value < b->value) ? -1 : 1;

	return 0;
}
/*
 * Store an integer attribute in dst. Payloads of at least
 * sizeof(unsigned long) bytes are read as unsigned long, payloads of
 * exactly sizeof(u32) bytes as u32. Returns 0 on success, -EINVAL for
 * any other length.
 */
static int meta_int_change(struct meta_value *dst, struct nlattr *nla)
{
	if (nla_len(nla) >= sizeof(unsigned long)) {
		dst->val = *(unsigned long *) nla_data(nla);
		dst->len = sizeof(unsigned long);
		return 0;
	}

	if (nla_len(nla) == sizeof(u32)) {
		dst->val = nla_get_u32(nla);
		dst->len = sizeof(u32);
		return 0;
	}

	return -EINVAL;
}
/*
 * Apply the modifiers to a collected integer: shift it right by
 * hdr.shift, then mask it with the configured value v->val. A zero shift
 * or zero mask is skipped.
 */
static void meta_int_apply_extras(struct meta_value *v,
struct meta_obj *dst)
{
if (v->hdr.shift)
dst->value >>= v->hdr.shift;
if (v->val)
dst->value &= v->val;
}
/*
 * Dump the stored integer as attribute tlv, using the width recorded by
 * meta_int_change(). Nothing is emitted for any other length. Returns 0
 * on success, -1 when a NLA_PUT*() macro bails out to nla_put_failure.
 */
static int meta_int_dump(struct sk_buff *skb, struct meta_value *v, int tlv)
{
if (v->len == sizeof(unsigned long))
NLA_PUT(skb, tlv, sizeof(unsigned long), &v->val);
else if (v->len == sizeof(u32)) {
NLA_PUT_U32(skb, tlv, v->val);
}
return 0;
nla_put_failure:
return -1;
}
/**************************************************************************
* Type specific operations table
**************************************************************************/
/*
 * Operations that depend on the meta type (variable length or integer).
 * destroy and apply_extras are checked for NULL before being called;
 * compare, change and dump are called unconditionally.
 */
struct meta_type_ops
{
void (*destroy)(struct meta_value *);
int (*compare)(struct meta_obj *, struct meta_obj *);
int (*change)(struct meta_value *, struct nlattr *);
void (*apply_extras)(struct meta_value *, struct meta_obj *);
int (*dump)(struct sk_buff *, struct meta_value *, int);
};
/*
 * Type specific operations, indexed by meta type. The integer type has
 * no destroy hook because it stores its value inline.
 */
static struct meta_type_ops __meta_type_ops[TCF_META_TYPE_MAX+1] = {
[TCF_META_TYPE_VAR] = {
.destroy = meta_var_destroy,
.compare = meta_var_compare,
.change = meta_var_change,
.apply_extras = meta_var_apply_extras,
.dump = meta_var_dump
},
[TCF_META_TYPE_INT] = {
.compare = meta_int_compare,
.change = meta_int_change,
.apply_extras = meta_int_apply_extras,
.dump = meta_int_dump
}
};
/* Look up the type specific operations for the meta type of v. */
static inline struct meta_type_ops * meta_type_ops(struct meta_value *v)
{
return &__meta_type_ops[meta_type(v)];
}
/**************************************************************************
* Core
**************************************************************************/
static int meta_get(struct sk_buff *skb, struct tcf_pkt_info *info,
struct meta_value *v, struct meta_obj *dst)
{
int err = 0;
if (meta_id(v) == TCF_META_ID_VALUE) {
dst->value = v->val;
dst->len = v->len;
return 0;
}
meta_ops(v)->get(skb, info, v, dst, &err);
if (err < 0)
return err;
if (meta_type_ops(v)->apply_extras)
meta_type_ops(v)->apply_extras(v, dst);
return 0;
}
/*
 * Match callback: collect both sides, compare them with the comparison
 * function of the left value's type and evaluate the left value's
 * operand. Returns 0 (no match) when a value cannot be collected or the
 * operand is unknown.
 */
static int em_meta_match(struct sk_buff *skb, struct tcf_ematch *m,
			 struct tcf_pkt_info *info)
{
	struct meta_match *meta = (struct meta_match *) m->data;
	struct meta_obj l_value, r_value;
	int r;

	if (meta_get(skb, info, &meta->lvalue, &l_value) < 0)
		return 0;
	if (meta_get(skb, info, &meta->rvalue, &r_value) < 0)
		return 0;

	r = meta_type_ops(&meta->lvalue)->compare(&l_value, &r_value);

	if (meta->lvalue.hdr.op == TCF_EM_OPND_EQ)
		return !r;
	if (meta->lvalue.hdr.op == TCF_EM_OPND_LT)
		return r < 0;
	if (meta->lvalue.hdr.op == TCF_EM_OPND_GT)
		return r > 0;

	return 0;
}
static void meta_delete(struct meta_match *meta)
{
if (meta) {
struct meta_type_ops *ops = meta_type_ops(&meta->lvalue);
if (ops && ops->destroy) {
ops->destroy(&meta->lvalue);
ops->destroy(&meta->rvalue);
}
}
kfree(meta);
}
static inline int meta_change_data(struct meta_value *dst, struct nlattr *nla)
{
if (nla) {
if (nla_len(nla) == 0)
return -EINVAL;
return meta_type_ops(dst)->change(dst, nla);
}
return 0;
}
/*
 * A definition is supported when its id is 0 (which needs no collector)
 * or when the collector table has a get function for its type and id.
 */
static inline int meta_is_supported(struct meta_value *val)
{
return (!meta_id(val) || meta_ops(val)->get);
}
/*
 * Attribute policy for nla_parse() in em_meta_change(). Only the header
 * is described; the value attributes are checked by the type specific
 * change hooks.
 */
static const struct nla_policy meta_policy[TCA_EM_META_MAX + 1] = {
[TCA_EM_META_HDR] = { .len = sizeof(struct tcf_meta_hdr) },
};
static int em_meta_change(struct tcf_proto *tp, void *data, int len,
struct tcf_ematch *m)
{
int err;
struct nlattr *tb[TCA_EM_META_MAX + 1];
struct tcf_meta_hdr *hdr;
struct meta_match *meta = NULL;
err = nla_parse(tb, TCA_EM_META_MAX, data, len, meta_policy);
if (err < 0)
goto errout;
err = -EINVAL;
if (tb[TCA_EM_META_HDR] == NULL)
goto errout;
hdr = nla_data(tb[TCA_EM_META_HDR]);
if (TCF_META_TYPE(hdr->left.kind) != TCF_META_TYPE(hdr->right.kind) ||
TCF_META_TYPE(hdr->left.kind) > TCF_META_TYPE_MAX ||
TCF_META_ID(hdr->left.kind) > TCF_META_ID_MAX ||
TCF_META_ID(hdr->right.kind) > TCF_META_ID_MAX)
goto errout;
meta = kzalloc(sizeof(*meta), GFP_KERNEL);
if (meta == NULL)
goto errout;
memcpy(&meta->lvalue.hdr, &hdr->left, sizeof(hdr->left));
memcpy(&meta->rvalue.hdr, &hdr->right, sizeof(hdr->right));
if (!meta_is_supported(&meta->lvalue) ||
!meta_is_supported(&meta->rvalue)) {
err = -EOPNOTSUPP;
goto errout;
}
if (meta_change_data(&meta->lvalue, tb[TCA_EM_META_LVALUE]) < 0 ||
meta_change_data(&meta->rvalue, tb[TCA_EM_META_RVALUE]) < 0)
goto errout;
m->datalen = sizeof(*meta);
m->data = (unsigned long) meta;
err = 0;
errout:
if (err && meta)
meta_delete(meta);
return err;
}
/* Destroy callback: free the meta match stored in m->data, if m is set. */
static void em_meta_destroy(struct tcf_proto *tp, struct tcf_ematch *m)
{
if (m)
meta_delete((struct meta_match *) m->data);
}
/*
 * Dump callback: emit the header rebuilt from both value definitions,
 * followed by the left and right data through the dump hook of the left
 * value's type. Returns 0 on success, -1 when an attribute cannot be
 * added.
 */
static int em_meta_dump(struct sk_buff *skb, struct tcf_ematch *em)
{
struct meta_match *meta = (struct meta_match *) em->data;
struct tcf_meta_hdr hdr;
struct meta_type_ops *ops;
/* clear the header first so that no uninitialized bytes are dumped */
memset(&hdr, 0, sizeof(hdr));
memcpy(&hdr.left, &meta->lvalue.hdr, sizeof(hdr.left));
memcpy(&hdr.right, &meta->rvalue.hdr, sizeof(hdr.right));
NLA_PUT(skb, TCA_EM_META_HDR, sizeof(hdr), &hdr);
ops = meta_type_ops(&meta->lvalue);
if (ops->dump(skb, &meta->lvalue, TCA_EM_META_LVALUE) < 0 ||
ops->dump(skb, &meta->rvalue, TCA_EM_META_RVALUE) < 0)
goto nla_put_failure;
return 0;
nla_put_failure:
return -1;
}
/*
 * Operations table of the meta ematch, registered with the ematch core
 * in init_em_meta().
 */
static struct tcf_ematch_ops em_meta_ops = {
.kind = TCF_EM_META,
.change = em_meta_change,
.match = em_meta_match,
.destroy = em_meta_destroy,
.dump = em_meta_dump,
.owner = THIS_MODULE,
.link = LIST_HEAD_INIT(em_meta_ops.link)
};
/* Module init: register the meta ematch; returns tcf_em_register()'s result. */
static int __init init_em_meta(void)
{
return tcf_em_register(&em_meta_ops);
}
/* Module exit: remove the meta ematch from the ematch core. */
static void __exit exit_em_meta(void)
{
tcf_em_unregister(&em_meta_ops);
}
MODULE_LICENSE("GPL");
module_init(init_em_meta);
module_exit(exit_em_meta);
MODULE_ALIAS_TCF_EMATCH(TCF_EM_META);

View File

@@ -0,0 +1,80 @@
/*
* net/sched/em_nbyte.c N-Byte ematch
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
* Authors: Thomas Graf <tgraf@suug.ch>
*/
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/skbuff.h>
#include <linux/tc_ematch/tc_em_nbyte.h>
#include <net/pkt_cls.h>
/*
 * Private data of an nbyte ematch: the configuration header immediately
 * followed by the pattern bytes (hdr.len of them are compared in
 * em_nbyte_match()).
 */
struct nbyte_data
{
struct tcf_em_nbyte hdr;
char pattern[0];
};
static int em_nbyte_change(struct tcf_proto *tp, void *data, int data_len,
struct tcf_ematch *em)
{
struct tcf_em_nbyte *nbyte = data;
if (data_len < sizeof(*nbyte) ||
data_len < (sizeof(*nbyte) + nbyte->len))
return -EINVAL;
em->datalen = sizeof(*nbyte) + nbyte->len;
em->data = (unsigned long)kmemdup(data, em->datalen, GFP_KERNEL);
if (em->data == 0UL)
return -ENOBUFS;
return 0;
}
/*
 * Match callback of the nbyte ematch: compares hdr.len bytes of the
 * packet, taken at offset hdr.off from the base pointer of layer
 * hdr.layer, with the configured pattern.
 *
 * Returns 1 if the bytes are equal, 0 if they differ or the window is
 * not a valid offset within @skb.
 */
static int em_nbyte_match(struct sk_buff *skb, struct tcf_ematch *em,
			  struct tcf_pkt_info *info)
{
	struct nbyte_data *nbyte = (struct nbyte_data *) em->data;
	unsigned char *ptr = tcf_get_base_ptr(skb, nbyte->hdr.layer);

	ptr += nbyte->hdr.off;

	if (!tcf_valid_offset(skb, ptr, nbyte->hdr.len))
		return 0;

	/*
	 * ptr already includes hdr.off; compare exactly the window that was
	 * validated above instead of applying the offset a second time.
	 */
	return !memcmp(ptr, nbyte->pattern, nbyte->hdr.len);
}
/*
 * Operations table of the nbyte ematch.  No destroy or dump callback is
 * provided.
 */
static struct tcf_ematch_ops em_nbyte_ops = {
.kind = TCF_EM_NBYTE,
.change = em_nbyte_change,
.match = em_nbyte_match,
.owner = THIS_MODULE,
.link = LIST_HEAD_INIT(em_nbyte_ops.link)
};
/* Module init: register the nbyte ematch; returns tcf_em_register()'s result. */
static int __init init_em_nbyte(void)
{
return tcf_em_register(&em_nbyte_ops);
}
/* Module exit: remove the nbyte ematch from the ematch core. */
static void __exit exit_em_nbyte(void)
{
tcf_em_unregister(&em_nbyte_ops);
}
MODULE_LICENSE("GPL");
module_init(init_em_nbyte);
module_exit(exit_em_nbyte);
MODULE_ALIAS_TCF_EMATCH(TCF_EM_NBYTE);

157
kernel/net/sched/em_text.c Normal file
View File

@@ -0,0 +1,157 @@
/*
* net/sched/em_text.c Textsearch ematch
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
* Authors: Thomas Graf <tgraf@suug.ch>
*/
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/skbuff.h>
#include <linux/textsearch.h>
#include <linux/tc_ematch/tc_em_text.h>
#include <net/pkt_cls.h>
/*
 * Private data of a text ematch: the search window, expressed as a
 * (layer, offset) pair for its start and its end, plus the prepared
 * textsearch configuration.
 */
struct text_match
{
u16 from_offset;
u16 to_offset;
u8 from_layer;
u8 to_layer;
struct ts_config *config;
};
/* Access the struct text_match stored in an ematch's data field. */
#define EM_TEXT_PRIV(m) ((struct text_match *) (m)->data)
/*
 * Match callback of the text ematch: searches the configured pattern in
 * @skb between the configured start and end positions.
 *
 * Returns 1 if skb_find_text() reports a hit, 0 otherwise.
 */
static int em_text_match(struct sk_buff *skb, struct tcf_ematch *m,
			 struct tcf_pkt_info *info)
{
	struct text_match *tm = EM_TEXT_PRIV(m);
	struct ts_state state;
	int from, to;

	/* Turn each (layer, offset) pair into an offset relative to skb->data. */
	from = tcf_get_base_ptr(skb, tm->from_layer) - skb->data;
	to = tcf_get_base_ptr(skb, tm->to_layer) - skb->data;
	from += tm->from_offset;
	to += tm->to_offset;

	if (skb_find_text(skb, from, to, tm->config, &state) == UINT_MAX)
		return 0;

	return 1;
}
/*
 * Change callback of the text ematch: validates the configuration in
 * @data/@len, prepares a textsearch configuration for the requested
 * algorithm and pattern, and attaches a struct text_match to @m.
 *
 * Returns 0 on success, -EINVAL for a short or inconsistent
 * configuration, the textsearch_prepare() error if preparation fails,
 * -EAGAIN if preparation only succeeded on the TS_AUTOLOAD retry, and
 * -ENOBUFS if the private data cannot be allocated.
 */
static int em_text_change(struct tcf_proto *tp, void *data, int len,
struct tcf_ematch *m)
{
struct text_match *tm;
struct tcf_em_text *conf = data;
struct ts_config *ts_conf;
int flags = 0;
/* The pattern follows the fixed-size header inside @data. */
if (len < sizeof(*conf) || len < (sizeof(*conf) + conf->pattern_len))
return -EINVAL;
/* The start of the window must not lie behind its end. */
if (conf->from_layer > conf->to_layer)
return -EINVAL;
if (conf->from_layer == conf->to_layer &&
conf->from_offset > conf->to_offset)
return -EINVAL;
retry:
ts_conf = textsearch_prepare(conf->algo, (u8 *) conf + sizeof(*conf),
conf->pattern_len, GFP_KERNEL, flags);
/* On the retry pass RTNL was dropped below; take it back here. */
if (flags & TS_AUTOLOAD)
rtnl_lock();
if (IS_ERR(ts_conf)) {
/* First pass and algorithm unknown: drop RTNL and retry with
 * TS_AUTOLOAD set. */
if (PTR_ERR(ts_conf) == -ENOENT && !(flags & TS_AUTOLOAD)) {
rtnl_unlock();
flags |= TS_AUTOLOAD;
goto retry;
} else
return PTR_ERR(ts_conf);
} else if (flags & TS_AUTOLOAD) {
/* Success on the retry pass is not used directly: the config is
 * destroyed and -EAGAIN returned (presumably so the caller replays
 * the request, since RTNL was released in between). */
textsearch_destroy(ts_conf);
return -EAGAIN;
}
tm = kmalloc(sizeof(*tm), GFP_KERNEL);
if (tm == NULL) {
textsearch_destroy(ts_conf);
return -ENOBUFS;
}
tm->from_offset = conf->from_offset;
tm->to_offset = conf->to_offset;
tm->from_layer = conf->from_layer;
tm->to_layer = conf->to_layer;
tm->config = ts_conf;
m->datalen = sizeof(*tm);
m->data = (unsigned long) tm;
return 0;
}
static void em_text_destroy(struct tcf_proto *tp, struct tcf_ematch *m)
{
textsearch_destroy(EM_TEXT_PRIV(m)->config);
}
/*
 * Dump callback of the text ematch: writes a struct tcf_em_text
 * describing the match, followed by the pattern bytes, into @skb.
 *
 * Returns 0 on success, -1 if @skb has insufficient room.
 */
static int em_text_dump(struct sk_buff *skb, struct tcf_ematch *m)
{
	struct text_match *tm = EM_TEXT_PRIV(m);
	struct tcf_em_text conf;

	/*
	 * Start from a zeroed struct: strncpy() below only writes
	 * sizeof(conf.algo) - 1 bytes, so without this the final byte of
	 * conf.algo would be uninitialised stack data copied to userspace
	 * and the name would not be guaranteed to be NUL terminated.
	 */
	memset(&conf, 0, sizeof(conf));

	strncpy(conf.algo, tm->config->ops->name, sizeof(conf.algo) - 1);
	conf.from_offset = tm->from_offset;
	conf.to_offset = tm->to_offset;
	conf.from_layer = tm->from_layer;
	conf.to_layer = tm->to_layer;
	conf.pattern_len = textsearch_get_pattern_len(tm->config);
	conf.pad = 0;

	if (nla_put_nohdr(skb, sizeof(conf), &conf) < 0)
		goto nla_put_failure;
	if (nla_append(skb, conf.pattern_len,
		       textsearch_get_pattern(tm->config)) < 0)
		goto nla_put_failure;
	return 0;

nla_put_failure:
	return -1;
}
/*
 * Operations table of the text ematch, registered with the ematch core
 * in init_em_text().
 */
static struct tcf_ematch_ops em_text_ops = {
.kind = TCF_EM_TEXT,
.change = em_text_change,
.match = em_text_match,
.destroy = em_text_destroy,
.dump = em_text_dump,
.owner = THIS_MODULE,
.link = LIST_HEAD_INIT(em_text_ops.link)
};
/* Module init: register the text ematch; returns tcf_em_register()'s result. */
static int __init init_em_text(void)
{
return tcf_em_register(&em_text_ops);
}
/* Module exit: remove the text ematch from the ematch core. */
static void __exit exit_em_text(void)
{
tcf_em_unregister(&em_text_ops);
}
MODULE_LICENSE("GPL");
module_init(init_em_text);
module_exit(exit_em_text);
MODULE_ALIAS_TCF_EMATCH(TCF_EM_TEXT);

64
kernel/net/sched/em_u32.c Normal file
View File

@@ -0,0 +1,64 @@
/*
* net/sched/em_u32.c U32 Ematch
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
* Authors: Thomas Graf <tgraf@suug.ch>
* Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
*
* Based on net/sched/cls_u32.c
*/
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/skbuff.h>
#include <net/pkt_cls.h>
static int em_u32_match(struct sk_buff *skb, struct tcf_ematch *em,
struct tcf_pkt_info *info)
{
struct tc_u32_key *key = (struct tc_u32_key *) em->data;
const unsigned char *ptr = skb_network_header(skb);
if (info) {
if (info->ptr)
ptr = info->ptr;
ptr += (info->nexthdr & key->offmask);
}
ptr += key->off;
if (!tcf_valid_offset(skb, ptr, sizeof(u32)))
return 0;
return !(((*(__be32*) ptr) ^ key->val) & key->mask);
}
/*
 * Operations table of the u32 ematch.  There is no change callback;
 * .datalen declares the expected configuration size instead.
 */
static struct tcf_ematch_ops em_u32_ops = {
.kind = TCF_EM_U32,
.datalen = sizeof(struct tc_u32_key),
.match = em_u32_match,
.owner = THIS_MODULE,
.link = LIST_HEAD_INIT(em_u32_ops.link)
};
/* Module init: register the u32 ematch; returns tcf_em_register()'s result. */
static int __init init_em_u32(void)
{
return tcf_em_register(&em_u32_ops);
}
/* Module exit: remove the u32 ematch from the ematch core. */
static void __exit exit_em_u32(void)
{
tcf_em_unregister(&em_u32_ops);
}
MODULE_LICENSE("GPL");
module_init(init_em_u32);
module_exit(exit_em_u32);
MODULE_ALIAS_TCF_EMATCH(TCF_EM_U32);

532
kernel/net/sched/ematch.c Normal file
View File

@@ -0,0 +1,532 @@
/*
* net/sched/ematch.c Extended Match API
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
* Authors: Thomas Graf <tgraf@suug.ch>
*
* ==========================================================================
*
* An extended match (ematch) is a small classification tool not worth
* writing a full classifier for. Ematches can be interconnected to form
* a logic expression and get attached to classifiers to extend their
* functionality.
*
* The userspace part transforms the logic expressions into an array
* consisting of multiple sequences of interconnected ematches separated
* by markers. Precedence is implemented by a special ematch kind
* referencing a sequence beyond the marker of the current sequence
* causing the current position in the sequence to be pushed onto a stack
* to allow the current position to be overwritten by the position referenced
* in the special ematch. Matching continues in the new sequence until a
* marker is reached causing the position to be restored from the stack.
*
* Example:
* A AND (B1 OR B2) AND C AND D
*
* ------->-PUSH-------
* -->-- / -->-- \ -->--
* / \ / / \ \ / \
* +-------+-------+-------+-------+-------+--------+
* | A AND | B AND | C AND | D END | B1 OR | B2 END |
* +-------+-------+-------+-------+-------+--------+
* \ /
* --------<-POP---------
*
* where B is a virtual ematch referencing to sequence starting with B1.
*
* ==========================================================================
*
* How to write an ematch in 60 seconds
* ------------------------------------
*
* 1) Provide a matcher function:
* static int my_match(struct sk_buff *skb, struct tcf_ematch *m,
* struct tcf_pkt_info *info)
* {
* struct mydata *d = (struct mydata *) m->data;
*
* if (...matching goes here...)
* return 1;
* else
* return 0;
* }
*
* 2) Fill out a struct tcf_ematch_ops:
* static struct tcf_ematch_ops my_ops = {
* .kind = unique id,
* .datalen = sizeof(struct mydata),
* .match = my_match,
* .owner = THIS_MODULE,
* };
*
* 3) Register/Unregister your ematch:
* static int __init init_my_ematch(void)
* {
* return tcf_em_register(&my_ops);
* }
*
* static void __exit exit_my_ematch(void)
* {
* tcf_em_unregister(&my_ops);
* }
*
* module_init(init_my_ematch);
* module_exit(exit_my_ematch);
*
* 4) By now you should have two more seconds left, barely enough to
* open up a beer to watch the compilation going.
*/
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/rtnetlink.h>
#include <linux/skbuff.h>
#include <net/pkt_cls.h>
/* List of registered ematch operation tables, linked through ->link. */
static LIST_HEAD(ematch_ops);
/* Taken for reading by lookups and for writing by register/unregister. */
static DEFINE_RWLOCK(ematch_mod_lock);
/*
 * Find the registered ematch operations for @kind and take a reference
 * on the owning module.
 *
 * Returns the operations table, or NULL if no ematch of that kind is
 * registered or the module reference could not be obtained.
 */
static inline struct tcf_ematch_ops * tcf_em_lookup(u16 kind)
{
	struct tcf_ematch_ops *pos;
	struct tcf_ematch_ops *found = NULL;

	read_lock(&ematch_mod_lock);
	list_for_each_entry(pos, &ematch_ops, link) {
		if (pos->kind != kind)
			continue;

		/* Only the first entry of this kind is considered. */
		if (try_module_get(pos->owner))
			found = pos;
		break;
	}
	read_unlock(&ematch_mod_lock);

	return found;
}
/**
* tcf_em_register - register an extended match
*
* @ops: ematch operations lookup table
*
* This function must be called by ematches to announce their presence.
* The given @ops must have kind set to a unique identifier and the
* callback match() must be implemented. All other callbacks are optional
* and a fallback implementation is used instead.
*
* Returns -EEXISTS if an ematch of the same kind has already registered.
*/
int tcf_em_register(struct tcf_ematch_ops *ops)
{
int err = -EEXIST;
struct tcf_ematch_ops *e;
if (ops->match == NULL)
return -EINVAL;
write_lock(&ematch_mod_lock);
list_for_each_entry(e, &ematch_ops, link)
if (ops->kind == e->kind)
goto errout;
list_add_tail(&ops->link, &ematch_ops);
err = 0;
errout:
write_unlock(&ematch_mod_lock);
return err;
}
EXPORT_SYMBOL(tcf_em_register);
/**
 * tcf_em_unregister - unregister an extended match
 *
 * @ops: ematch operations lookup table
 *
 * This function must be called by ematches to announce their disappearance,
 * for example when the module gets unloaded. The @ops parameter must be
 * the same as the one used for registration.
 *
 * The function returns nothing; @ops is unlinked unconditionally, without
 * checking that it is currently registered.
 */
void tcf_em_unregister(struct tcf_ematch_ops *ops)
{
write_lock(&ematch_mod_lock);
list_del(&ops->link);
write_unlock(&ematch_mod_lock);
}
EXPORT_SYMBOL(tcf_em_unregister);
static inline struct tcf_ematch * tcf_em_get_match(struct tcf_ematch_tree *tree,
int index)
{
return &tree->matches[index];
}
static int tcf_em_validate(struct tcf_proto *tp,
struct tcf_ematch_tree_hdr *tree_hdr,
struct tcf_ematch *em, struct nlattr *nla, int idx)
{
int err = -EINVAL;
struct tcf_ematch_hdr *em_hdr = nla_data(nla);
int data_len = nla_len(nla) - sizeof(*em_hdr);
void *data = (void *) em_hdr + sizeof(*em_hdr);
if (!TCF_EM_REL_VALID(em_hdr->flags))
goto errout;
if (em_hdr->kind == TCF_EM_CONTAINER) {
/* Special ematch called "container", carries an index
* referencing an external ematch sequence. */
u32 ref;
if (data_len < sizeof(ref))
goto errout;
ref = *(u32 *) data;
if (ref >= tree_hdr->nmatches)
goto errout;
/* We do not allow backward jumps to avoid loops and jumps
* to our own position are of course illegal. */
if (ref <= idx)
goto errout;
em->data = ref;
} else {
/* Note: This lookup will increase the module refcnt
* of the ematch module referenced. In case of a failure,
* a destroy function is called by the underlying layer
* which automatically releases the reference again, therefore
* the module MUST not be given back under any circumstances
* here. Be aware, the destroy function assumes that the
* module is held if the ops field is non zero. */
em->ops = tcf_em_lookup(em_hdr->kind);
if (em->ops == NULL) {
err = -ENOENT;
#ifdef CONFIG_MODULES
__rtnl_unlock();
request_module("ematch-kind-%u", em_hdr->kind);
rtnl_lock();
em->ops = tcf_em_lookup(em_hdr->kind);
if (em->ops) {
/* We dropped the RTNL mutex in order to
* perform the module load. Tell the caller
* to replay the request. */
module_put(em->ops->owner);
err = -EAGAIN;
}
#endif
goto errout;
}
/* ematch module provides expected length of data, so we
* can do a basic sanity check. */
if (em->ops->datalen && data_len < em->ops->datalen)
goto errout;
if (em->ops->change) {
err = em->ops->change(tp, data, data_len, em);
if (err < 0)
goto errout;
} else if (data_len > 0) {
/* ematch module doesn't provide an own change
* procedure and expects us to allocate and copy
* the ematch data.
*
* TCF_EM_SIMPLE may be specified stating that the
* data only consists of a u32 integer and the module
* does not expected a memory reference but rather
* the value carried. */
if (em_hdr->flags & TCF_EM_SIMPLE) {
if (data_len < sizeof(u32))
goto errout;
em->data = *(u32 *) data;
} else {
void *v = kmemdup(data, data_len, GFP_KERNEL);
if (v == NULL) {
err = -ENOBUFS;
goto errout;
}
em->data = (unsigned long) v;
}
}
}
em->matchid = em_hdr->matchid;
em->flags = em_hdr->flags;
em->datalen = data_len;
err = 0;
errout:
return err;
}
/*
 * Netlink attribute policy used by tcf_em_tree_validate(): the tree
 * header carries a length of sizeof(struct tcf_ematch_tree_hdr), the
 * match list is a nested attribute.
 */
static const struct nla_policy em_policy[TCA_EMATCH_TREE_MAX + 1] = {
[TCA_EMATCH_TREE_HDR] = { .len = sizeof(struct tcf_ematch_tree_hdr) },
[TCA_EMATCH_TREE_LIST] = { .type = NLA_NESTED },
};
/**
 * tcf_em_tree_validate - validate ematch config TLV and build ematch tree
 *
 * @tp: classifier kind handle
 * @nla: ematch tree configuration TLV
 * @tree: destination ematch tree variable to store the resulting
 *        ematch tree.
 *
 * This function validates the given configuration TLV @nla and builds an
 * ematch tree in @tree. The resulting tree must later be copied into
 * the private classifier data using tcf_em_tree_change(). You MUST NOT
 * provide the ematch tree variable of the private classifier data directly,
 * the changes would not be locked properly.
 *
 * A NULL @nla yields an empty (zeroed) tree and success.
 *
 * Returns a negative error code if the configuration TLV contains errors.
 */
int tcf_em_tree_validate(struct tcf_proto *tp, struct nlattr *nla,
			 struct tcf_ematch_tree *tree)
{
	struct nlattr *tb[TCA_EMATCH_TREE_MAX + 1];
	struct nlattr *attr, *hdr_attr, *list_attr;
	struct tcf_ematch_tree_hdr *tree_hdr;
	int remaining, err;
	int idx = 0;

	memset(tree, 0, sizeof(*tree));
	if (nla == NULL)
		return 0;

	err = nla_parse_nested(tb, TCA_EMATCH_TREE_MAX, nla, em_policy);
	if (err < 0)
		return err;

	hdr_attr = tb[TCA_EMATCH_TREE_HDR];
	list_attr = tb[TCA_EMATCH_TREE_LIST];
	if (hdr_attr == NULL || list_attr == NULL)
		return -EINVAL;

	tree_hdr = nla_data(hdr_attr);
	memcpy(&tree->hdr, tree_hdr, sizeof(*tree_hdr));

	attr = nla_data(list_attr);
	remaining = nla_len(list_attr);

	tree->matches = kzalloc(tree_hdr->nmatches * sizeof(struct tcf_ematch),
				GFP_KERNEL);
	if (tree->matches == NULL)
		return -EINVAL;

	/* nla_parse_nested is not used for the list because the maximum
	 * number of attributes is unknown, which would require allocating
	 * a tb buffer for no benefit.
	 *
	 * The attributes are walked in the order given and their types
	 * must count up from 1 to n; any deviation fails the parse. */
	while (nla_ok(attr, remaining)) {
		err = -EINVAL;
		if (attr->nla_type != (idx + 1) ||
		    idx >= tree_hdr->nmatches ||
		    nla_len(attr) < sizeof(struct tcf_ematch_hdr))
			goto errout_abort;

		err = tcf_em_validate(tp, tree_hdr,
				      tcf_em_get_match(tree, idx), attr, idx);
		if (err < 0)
			goto errout_abort;

		attr = nla_next(attr, &remaining);
		idx++;
	}

	/* The announced number of matches was used to validate container
	 * references, so it has to agree with what was actually supplied;
	 * a mismatch could leave undefined references for the matcher. */
	if (idx != tree_hdr->nmatches) {
		err = -EINVAL;
		goto errout_abort;
	}

	return 0;

errout_abort:
	tcf_em_tree_destroy(tp, tree);
	return err;
}
EXPORT_SYMBOL(tcf_em_tree_validate);
/**
 * tcf_em_tree_destroy - destroy an ematch tree
 *
 * @tp: classifier kind handle
 * @tree: ematch tree to be deleted
 *
 * This function destroys an ematch tree previously created by
 * tcf_em_tree_validate()/tcf_em_tree_change(). You must ensure that
 * the ematch tree is not in use before calling this function.
 *
 * A tree without a matches array is left untouched.
 */
void tcf_em_tree_destroy(struct tcf_proto *tp, struct tcf_ematch_tree *tree)
{
	int pos;

	if (!tree->matches)
		return;

	for (pos = 0; pos < tree->hdr.nmatches; pos++) {
		struct tcf_ematch *em = tcf_em_get_match(tree, pos);

		/* Entries without ops hold no module reference or data. */
		if (em->ops == NULL)
			continue;

		if (em->ops->destroy != NULL)
			em->ops->destroy(tp, em);
		else if (!tcf_em_is_simple(em))
			kfree((void *) em->data);

		module_put(em->ops->owner);
	}

	tree->hdr.nmatches = 0;
	kfree(tree->matches);
	tree->matches = NULL;
}
EXPORT_SYMBOL(tcf_em_tree_destroy);
/**
* tcf_em_tree_dump - dump ematch tree into a rtnl message
*
* @skb: skb holding the rtnl message
* @t: ematch tree to be dumped
* @tlv: TLV type to be used to encapsulate the tree
*
* This function dumps a ematch tree into a rtnl message. It is valid to
* call this function while the ematch tree is in use.
*
* Returns -1 if the skb tailroom is insufficient.
*/
int tcf_em_tree_dump(struct sk_buff *skb, struct tcf_ematch_tree *tree, int tlv)
{
int i;
u8 *tail;
struct nlattr *top_start;
struct nlattr *list_start;
top_start = nla_nest_start(skb, tlv);
if (top_start == NULL)
goto nla_put_failure;
NLA_PUT(skb, TCA_EMATCH_TREE_HDR, sizeof(tree->hdr), &tree->hdr);
list_start = nla_nest_start(skb, TCA_EMATCH_TREE_LIST);
if (list_start == NULL)
goto nla_put_failure;
tail = skb_tail_pointer(skb);
for (i = 0; i < tree->hdr.nmatches; i++) {
struct nlattr *match_start = (struct nlattr *)tail;
struct tcf_ematch *em = tcf_em_get_match(tree, i);
struct tcf_ematch_hdr em_hdr = {
.kind = em->ops ? em->ops->kind : TCF_EM_CONTAINER,
.matchid = em->matchid,
.flags = em->flags
};
NLA_PUT(skb, i+1, sizeof(em_hdr), &em_hdr);
if (em->ops && em->ops->dump) {
if (em->ops->dump(skb, em) < 0)
goto nla_put_failure;
} else if (tcf_em_is_container(em) || tcf_em_is_simple(em)) {
u32 u = em->data;
nla_put_nohdr(skb, sizeof(u), &u);
} else if (em->datalen > 0)
nla_put_nohdr(skb, em->datalen, (void *) em->data);
tail = skb_tail_pointer(skb);
match_start->nla_len = tail - (u8 *)match_start;
}
nla_nest_end(skb, list_start);
nla_nest_end(skb, top_start);
return 0;
nla_put_failure:
return -1;
}
EXPORT_SYMBOL(tcf_em_tree_dump);
/*
 * Run the match() callback of a single ematch and apply inversion if the
 * ematch is flagged as inverted.
 */
static inline int tcf_em_match(struct sk_buff *skb, struct tcf_ematch *em,
			       struct tcf_pkt_info *info)
{
	int res = em->ops->match(skb, em, info);

	if (tcf_em_is_inverted(em))
		return !res;

	return res;
}
/* Do not use this function directly, use tcf_em_tree_match instead */
/*
 * Evaluate the ematch tree @tree against @skb.
 *
 * Matches are walked in array order.  A container match pushes the
 * current index on a local stack (at most CONFIG_NET_EMATCH_STACK
 * entries) and jumps to the index stored in its data field.  When a
 * sequence finishes, the saved index is popped and evaluation resumes
 * behind the container unless the container itself ends early for the
 * current result.
 *
 * Returns the result of the last evaluated match (0 if the tree has no
 * matches), or -1 if the local stack would overflow.
 */
int __tcf_em_tree_match(struct sk_buff *skb, struct tcf_ematch_tree *tree,
struct tcf_pkt_info *info)
{
int stackp = 0, match_idx = 0, res = 0;
struct tcf_ematch *cur_match;
int stack[CONFIG_NET_EMATCH_STACK];
proceed:
while (match_idx < tree->hdr.nmatches) {
cur_match = tcf_em_get_match(tree, match_idx);
if (tcf_em_is_container(cur_match)) {
/* Remember where we are and continue in the referenced sequence. */
if (unlikely(stackp >= CONFIG_NET_EMATCH_STACK))
goto stack_overflow;
stack[stackp++] = match_idx;
match_idx = cur_match->data;
goto proceed;
}
res = tcf_em_match(skb, cur_match, info);
/* Stop this sequence as soon as its outcome is decided. */
if (tcf_em_early_end(cur_match, res))
break;
match_idx++;
}
pop_stack:
if (stackp > 0) {
/* Return to the container that led here and treat res as its result. */
match_idx = stack[--stackp];
cur_match = tcf_em_get_match(tree, match_idx);
if (tcf_em_early_end(cur_match, res))
goto pop_stack;
else {
match_idx++;
goto proceed;
}
}
return res;
stack_overflow:
if (net_ratelimit())
printk("Local stack overflow, increase NET_EMATCH_STACK\n");
return -1;
}
EXPORT_SYMBOL(__tcf_em_tree_match);

1722
kernel/net/sched/sch_api.c Normal file

File diff suppressed because it is too large Load Diff

714
kernel/net/sched/sch_atm.c Normal file
View File

@@ -0,0 +1,714 @@
/* net/sched/sch_atm.c - ATM VC selection "queueing discipline" */
/* Written 1998-2000 by Werner Almesberger, EPFL ICA */
#include <linux/module.h>
#include <linux/init.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <linux/atmdev.h>
#include <linux/atmclip.h>
#include <linux/rtnetlink.h>
#include <linux/file.h> /* for fput */
#include <net/netlink.h>
#include <net/pkt_sched.h>
extern struct socket *sockfd_lookup(int fd, int *err); /* @@@ fix this */
/*
* The ATM queuing discipline provides a framework for invoking classifiers
* (aka "filters"), which in turn select classes of this queuing discipline.
* Each class maps the flow(s) it is handling to a given VC. Multiple classes
* may share the same VC.
*
* When creating a class, VCs are specified by passing the number of the open
* socket descriptor by which the calling process references the VC. The kernel
* keeps the VC open at least until all classes using it are removed.
*
* In this file, most functions are named atm_tc_* to avoid confusion with all
* the atm_* in net/atm. This naming convention differs from what's used in the
* rest of net/sched.
*
* Known bugs:
* - sometimes messes up the IP stack
* - any manipulations besides the few operations described in the README, are
* untested and likely to crash the system
* - should lock the flow while there is data in the queue (?)
*/
/* Fetch the flow that atm_tc_change() stored in a VCC's user_back field. */
#define VCC2FLOW(vcc) ((struct atm_flow_data *) ((vcc)->user_back))
/*
 * Per-class ("flow") state of the ATM qdisc.  Flows are chained through
 * ->next; the trailing hdr[] array holds hdr_len bytes of header data and
 * is allocated together with the structure.
 */
struct atm_flow_data {
struct Qdisc *q; /* FIFO, TBF, etc. */
struct tcf_proto *filter_list;
struct atm_vcc *vcc; /* VCC; NULL if VCC is closed */
void (*old_pop)(struct atm_vcc *vcc,
struct sk_buff *skb); /* chaining */
struct atm_qdisc_data *parent; /* parent qdisc */
struct socket *sock; /* for closing */
u32 classid; /* x:y type ID */
int ref; /* reference count */
struct gnet_stats_basic_packed bstats;
struct gnet_stats_queue qstats;
struct atm_flow_data *next;
struct atm_flow_data *excess; /* flow for excess traffic;
NULL to set CLP instead */
int hdr_len;
unsigned char hdr[0]; /* header data; MUST BE LAST */
};
/* Private data of the ATM qdisc, obtained through qdisc_priv(). */
struct atm_qdisc_data {
struct atm_flow_data link; /* unclassified skbs go here */
struct atm_flow_data *flows; /* NB: "link" is also on this
list */
struct tasklet_struct task; /* dequeue tasklet */
};
/* ------------------------- Class/flow operations ------------------------- */
static int find_flow(struct atm_qdisc_data *qdisc, struct atm_flow_data *flow)
{
struct atm_flow_data *walk;
pr_debug("find_flow(qdisc %p,flow %p)\n", qdisc, flow);
for (walk = qdisc->flows; walk; walk = walk->next)
if (walk == flow)
return 1;
pr_debug("find_flow: not found\n");
return 0;
}
/*
 * Find the flow of @sch whose class id equals @classid.
 * Returns the flow, or NULL if there is none.
 */
static inline struct atm_flow_data *lookup_flow(struct Qdisc *sch, u32 classid)
{
	struct atm_qdisc_data *p = qdisc_priv(sch);
	struct atm_flow_data *cur = p->flows;

	while (cur != NULL && cur->classid != classid)
		cur = cur->next;

	return cur;
}
static int atm_tc_graft(struct Qdisc *sch, unsigned long arg,
struct Qdisc *new, struct Qdisc **old)
{
struct atm_qdisc_data *p = qdisc_priv(sch);
struct atm_flow_data *flow = (struct atm_flow_data *)arg;
pr_debug("atm_tc_graft(sch %p,[qdisc %p],flow %p,new %p,old %p)\n",
sch, p, flow, new, old);
if (!find_flow(p, flow))
return -EINVAL;
if (!new)
new = &noop_qdisc;
*old = flow->q;
flow->q = new;
if (*old)
qdisc_reset(*old);
return 0;
}
/* Return the qdisc attached to the flow @cl, or NULL if @cl is 0. */
static struct Qdisc *atm_tc_leaf(struct Qdisc *sch, unsigned long cl)
{
	struct atm_flow_data *flow = (struct atm_flow_data *)cl;

	pr_debug("atm_tc_leaf(sch %p,flow %p)\n", sch, flow);

	if (flow == NULL)
		return NULL;

	return flow->q;
}
/*
 * Look up the flow with class id @classid and, if found, take a
 * reference on it.  Returns the flow as an unsigned long, 0 if no such
 * flow exists.
 */
static unsigned long atm_tc_get(struct Qdisc *sch, u32 classid)
{
	struct atm_qdisc_data *p __maybe_unused = qdisc_priv(sch);
	struct atm_flow_data *flow;

	pr_debug("atm_tc_get(sch %p,[qdisc %p],classid %x)\n", sch, p, classid);

	flow = lookup_flow(sch, classid);
	if (flow != NULL)
		flow->ref += 1;

	pr_debug("atm_tc_get: flow %p\n", flow);
	return (unsigned long)flow;
}
/*
 * Bind a filter to the class @classid.  This simply forwards to
 * atm_tc_get(), i.e. it takes a reference on the flow; @parent is unused.
 */
static unsigned long atm_tc_bind_filter(struct Qdisc *sch,
unsigned long parent, u32 classid)
{
return atm_tc_get(sch, classid);
}
/*
* atm_tc_put handles all destructions, including the ones that are explicitly
* requested (atm_tc_destroy, etc.). The assumption here is that we never drop
* anything that still seems to be in use.
*/
static void atm_tc_put(struct Qdisc *sch, unsigned long cl)
{
struct atm_qdisc_data *p = qdisc_priv(sch);
struct atm_flow_data *flow = (struct atm_flow_data *)cl;
struct atm_flow_data **prev;
pr_debug("atm_tc_put(sch %p,[qdisc %p],flow %p)\n", sch, p, flow);
if (--flow->ref)
return;
pr_debug("atm_tc_put: destroying\n");
for (prev = &p->flows; *prev; prev = &(*prev)->next)
if (*prev == flow)
break;
if (!*prev) {
printk(KERN_CRIT "atm_tc_put: class %p not found\n", flow);
return;
}
*prev = flow->next;
pr_debug("atm_tc_put: qdisc %p\n", flow->q);
qdisc_destroy(flow->q);
tcf_destroy_chain(&flow->filter_list);
if (flow->sock) {
pr_debug("atm_tc_put: f_count %ld\n",
file_count(flow->sock->file));
flow->vcc->pop = flow->old_pop;
sockfd_put(flow->sock);
}
if (flow->excess)
atm_tc_put(sch, (unsigned long)flow->excess);
if (flow != &p->link)
kfree(flow);
/*
* If flow == &p->link, the qdisc no longer works at this point and
* needs to be removed. (By the caller of atm_tc_put.)
*/
}
static void sch_atm_pop(struct atm_vcc *vcc, struct sk_buff *skb)
{
struct atm_qdisc_data *p = VCC2FLOW(vcc)->parent;
pr_debug("sch_atm_pop(vcc %p,skb %p,[qdisc %p])\n", vcc, skb, p);
VCC2FLOW(vcc)->old_pop(vcc, skb);
tasklet_schedule(&p->task);
}
/*
 * Default header copied into a new flow by atm_tc_change() when no
 * TCA_ATM_HDR attribute is supplied.
 */
static const u8 llc_oui_ip[] = {
0xaa, /* DSAP: non-ISO */
0xaa, /* SSAP: non-ISO */
0x03, /* Ctrl: Unnumbered Information Command PDU */
0x00, /* OUI: EtherType */
0x00, 0x00,
0x08, 0x00
}; /* Ethertype IP (0800) */
/*
 * Netlink attribute policy used by atm_tc_change(): the socket descriptor
 * and the excess class id are both u32 attributes.
 */
static const struct nla_policy atm_policy[TCA_ATM_MAX + 1] = {
[TCA_ATM_FD] = { .type = NLA_U32 },
[TCA_ATM_EXCESS] = { .type = NLA_U32 },
};
/*
 * Create a new ATM class.
 *
 * @sch:     the ATM qdisc
 * @classid: requested class id, or 0 to have one picked automatically
 * @parent:  must be 0, TC_H_ROOT or the handle of @sch
 * @tca:     attributes; TCA_OPTIONS must carry TCA_ATM_FD and may carry
 *           TCA_ATM_HDR and TCA_ATM_EXCESS
 * @arg:     in: existing class (must be 0); out: the new class
 *
 * Returns 0 on success or a negative error code.  On failure neither the
 * socket reference nor the reference on the excess flow is kept.
 */
static int atm_tc_change(struct Qdisc *sch, u32 classid, u32 parent,
			 struct nlattr **tca, unsigned long *arg)
{
	struct atm_qdisc_data *p = qdisc_priv(sch);
	struct atm_flow_data *flow = (struct atm_flow_data *)*arg;
	struct atm_flow_data *excess = NULL;
	struct nlattr *opt = tca[TCA_OPTIONS];
	struct nlattr *tb[TCA_ATM_MAX + 1];
	struct socket *sock;
	int fd, error, hdr_len;
	void *hdr;

	pr_debug("atm_tc_change(sch %p,[qdisc %p],classid %x,parent %x,"
		"flow %p,opt %p)\n", sch, p, classid, parent, flow, opt);
	/*
	 * The concept of parents doesn't apply for this qdisc.
	 */
	if (parent && parent != TC_H_ROOT && parent != sch->handle)
		return -EINVAL;
	/*
	 * ATM classes cannot be changed. In order to change properties of the
	 * ATM connection, that socket needs to be modified directly (via the
	 * native ATM API). In order to send a flow to a different VC, the old
	 * class needs to be removed and a new one added. (This may be changed
	 * later.)
	 */
	if (flow)
		return -EBUSY;
	if (opt == NULL)
		return -EINVAL;

	error = nla_parse_nested(tb, TCA_ATM_MAX, opt, atm_policy);
	if (error < 0)
		return error;

	if (!tb[TCA_ATM_FD])
		return -EINVAL;
	fd = nla_get_u32(tb[TCA_ATM_FD]);
	pr_debug("atm_tc_change: fd %d\n", fd);
	if (tb[TCA_ATM_HDR]) {
		hdr_len = nla_len(tb[TCA_ATM_HDR]);
		hdr = nla_data(tb[TCA_ATM_HDR]);
	} else {
		hdr_len = RFC1483LLC_LEN;
		hdr = NULL;	/* default LLC/SNAP for IP */
	}
	if (!tb[TCA_ATM_EXCESS])
		excess = NULL;
	else {
		/* Takes a reference on the excess flow; it is dropped again
		 * on every failure path below. */
		excess = (struct atm_flow_data *)
			atm_tc_get(sch, nla_get_u32(tb[TCA_ATM_EXCESS]));
		if (!excess)
			return -ENOENT;
	}
	pr_debug("atm_tc_change: type %d, payload %d, hdr_len %d\n",
		 opt->nla_type, nla_len(opt), hdr_len);
	sock = sockfd_lookup(fd, &error);
	if (!sock)
		goto err_put_excess;	/* no socket reference to drop */
	/* f_count++ */
	pr_debug("atm_tc_change: f_count %ld\n", file_count(sock->file));
	if (sock->ops->family != PF_ATMSVC && sock->ops->family != PF_ATMPVC) {
		error = -EPROTOTYPE;
		goto err_out;
	}
	/* @@@ should check if the socket is really operational or we'll crash
	   on vcc->send */
	if (classid) {
		if (TC_H_MAJ(classid ^ sch->handle)) {
			pr_debug("atm_tc_change: classid mismatch\n");
			error = -EINVAL;
			goto err_out;
		}
		/* NOTE(review): flow is always NULL at this point (a non-NULL
		 * flow returned -EBUSY above), so this check can only fire if
		 * find_flow() matches NULL - confirm whether a lookup by
		 * classid was intended. */
		if (find_flow(p, flow)) {
			error = -EEXIST;
			goto err_out;
		}
	} else {
		int i;
		unsigned long cl;

		/* Pick the first class id in 0x8001..0xffff not in use. */
		for (i = 1; i < 0x8000; i++) {
			classid = TC_H_MAKE(sch->handle, 0x8000 | i);
			cl = atm_tc_get(sch, classid);
			if (!cl)
				break;
			atm_tc_put(sch, cl);
		}
	}
	pr_debug("atm_tc_change: new id %x\n", classid);
	flow = kzalloc(sizeof(struct atm_flow_data) + hdr_len, GFP_KERNEL);
	pr_debug("atm_tc_change: flow %p\n", flow);
	if (!flow) {
		error = -ENOBUFS;
		goto err_out;
	}
	flow->filter_list = NULL;
	flow->q = qdisc_create_dflt(qdisc_dev(sch), sch->dev_queue,
				    &pfifo_qdisc_ops, classid);
	if (!flow->q)
		flow->q = &noop_qdisc;
	pr_debug("atm_tc_change: qdisc %p\n", flow->q);
	flow->sock = sock;
	flow->vcc = ATM_SD(sock);	/* speedup */
	flow->vcc->user_back = flow;
	pr_debug("atm_tc_change: vcc %p\n", flow->vcc);
	/* Hook the VCC's pop handler, remembering the old one for chaining. */
	flow->old_pop = flow->vcc->pop;
	flow->parent = p;
	flow->vcc->pop = sch_atm_pop;
	flow->classid = classid;
	flow->ref = 1;
	flow->excess = excess;
	/* Insert the new flow right behind the link flow. */
	flow->next = p->link.next;
	p->link.next = flow;
	flow->hdr_len = hdr_len;
	if (hdr)
		memcpy(flow->hdr, hdr, hdr_len);
	else
		memcpy(flow->hdr, llc_oui_ip, sizeof(llc_oui_ip));
	*arg = (unsigned long)flow;
	return 0;
err_out:
	sockfd_put(sock);
err_put_excess:
	if (excess)
		atm_tc_put(sch, (unsigned long)excess);
	return error;
}
static int atm_tc_delete(struct Qdisc *sch, unsigned long arg)
{
struct atm_qdisc_data *p = qdisc_priv(sch);
struct atm_flow_data *flow = (struct atm_flow_data *)arg;
pr_debug("atm_tc_delete(sch %p,[qdisc %p],flow %p)\n", sch, p, flow);
if (!find_flow(qdisc_priv(sch), flow))
return -EINVAL;
if (flow->filter_list || flow == &p->link)
return -EBUSY;
/*
* Reference count must be 2: one for "keepalive" (set at class
* creation), and one for the reference held when calling delete.
*/
if (flow->ref < 2) {
printk(KERN_ERR "atm_tc_delete: flow->ref == %d\n", flow->ref);
return -EINVAL;
}
if (flow->ref > 2)
return -EBUSY; /* catch references via excess, etc. */
atm_tc_put(sch, arg);
return 0;
}
static void atm_tc_walk(struct Qdisc *sch, struct qdisc_walker *walker)
{
struct atm_qdisc_data *p = qdisc_priv(sch);
struct atm_flow_data *flow;
pr_debug("atm_tc_walk(sch %p,[qdisc %p],walker %p)\n", sch, p, walker);
if (walker->stop)
return;
for (flow = p->flows; flow; flow = flow->next) {
if (walker->count >= walker->skip)
if (walker->fn(sch, (unsigned long)flow, walker) < 0) {
walker->stop = 1;
break;
}
walker->count++;
}
}
/*
 * Return the address of the filter list head for class @cl; a zero @cl
 * selects the filter list of the built-in link flow.
 */
static struct tcf_proto **atm_tc_find_tcf(struct Qdisc *sch, unsigned long cl)
{
	struct atm_qdisc_data *p = qdisc_priv(sch);
	struct atm_flow_data *flow = (struct atm_flow_data *)cl;

	pr_debug("atm_tc_find_tcf(sch %p,[qdisc %p],flow %p)\n", sch, p, flow);

	if (!flow)
		return &p->link.filter_list;
	return &flow->filter_list;
}
/* --------------------------- Qdisc operations ---------------------------- */
/*
 * Enqueue @skb on the ATM qdisc.
 *
 * The skb is mapped to a flow, either directly through skb->priority or by
 * running the filters attached to the flows, and then handed to that flow's
 * inner qdisc.  Packets that end up on the built-in link flow are counted in
 * sch->q.qlen and reported with 0; packets for any other flow schedule the
 * tasklet and are reported as NET_XMIT_SUCCESS | __NET_XMIT_BYPASS.
 */
static int atm_tc_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
struct atm_qdisc_data *p = qdisc_priv(sch);
struct atm_flow_data *flow = NULL; /* @@@ */
struct tcf_result res;
int result;
int ret = NET_XMIT_POLICED;
pr_debug("atm_tc_enqueue(skb %p,sch %p,[qdisc %p])\n", skb, sch, p);
result = TC_POLICE_OK; /* be nice to gcc */
/*
 * Use skb->priority as a class id when its major number matches this
 * qdisc and the class exists; otherwise run the first usable filter
 * chain found while walking the flows.
 */
if (TC_H_MAJ(skb->priority) != sch->handle ||
!(flow = (struct atm_flow_data *)atm_tc_get(sch, skb->priority)))
for (flow = p->flows; flow; flow = flow->next)
if (flow->filter_list) {
result = tc_classify_compat(skb,
flow->filter_list,
&res);
if (result < 0)
continue;
/* the loop variable is reused to hold the classification result */
flow = (struct atm_flow_data *)res.class;
if (!flow)
flow = lookup_flow(sch, res.classid);
break;
}
/* nothing matched: fall back to the built-in link flow */
if (!flow)
flow = &p->link;
else {
if (flow->vcc)
ATM_SKB(skb)->atm_options = flow->vcc->atm_options;
/*@@@ looks good ... but it's not supposed to work :-) */
#ifdef CONFIG_NET_CLS_ACT
switch (result) {
case TC_ACT_QUEUED:
case TC_ACT_STOLEN:
kfree_skb(skb);
return NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
case TC_ACT_SHOT:
/* ret still holds its initial NET_XMIT_POLICED here */
kfree_skb(skb);
goto drop;
case TC_POLICE_RECLASSIFY:
/* divert to the excess flow if there is one, else set the CLP option */
if (flow->excess)
flow = flow->excess;
else
ATM_SKB(skb)->atm_options |= ATM_ATMOPT_CLP;
break;
}
#endif
}
ret = qdisc_enqueue(skb, flow->q);
if (ret != 0) {
/* also reached by "goto drop" from the TC_ACT_SHOT case above */
drop: __maybe_unused
if (net_xmit_drop_count(ret)) {
sch->qstats.drops++;
if (flow)
flow->qstats.drops++;
}
return ret;
}
sch->bstats.bytes += qdisc_pkt_len(skb);
sch->bstats.packets++;
flow->bstats.bytes += qdisc_pkt_len(skb);
flow->bstats.packets++;
/*
 * Okay, this may seem weird. We pretend we've dropped the packet if
 * it goes via ATM. The reason for this is that the outer qdisc
 * expects to be able to q->dequeue the packet later on if we return
 * success at this place. Also, sch->q.qdisc needs to reflect whether
 * there is a packet eligible for dequeuing or not. Note that the
 * statistics of the outer qdisc are necessarily wrong because of all
 * this. There's currently no correct solution for this.
 */
if (flow == &p->link) {
sch->q.qlen++;
return 0;
}
tasklet_schedule(&p->task);
return NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
}
/*
 * Dequeue packets and send them over ATM. Note that we quite deliberately
 * avoid checking net_device's flow control here, simply because sch_atm
 * uses its own channels, which have nothing to do with any CLIP/LANE/or
 * non-ATM interfaces.
 *
 * Tasklet handler: @data is the struct Qdisc pointer cast to unsigned long.
 * The walk starts at p->link.next, so the link flow itself is not serviced
 * here.
 */
static void sch_atm_dequeue(unsigned long data)
{
struct Qdisc *sch = (struct Qdisc *)data;
struct atm_qdisc_data *p = qdisc_priv(sch);
struct atm_flow_data *flow;
struct sk_buff *skb;
pr_debug("sch_atm_dequeue(sch %p,[qdisc %p])\n", sch, p);
for (flow = p->link.next; flow; flow = flow->next)
/*
 * If traffic is properly shaped, this won't generate nasty
 * little bursts. Otherwise, it may ... (but that's okay)
 */
while ((skb = flow->q->ops->peek(flow->q))) {
/* peek first; only dequeue once atm_may_send() accepts the skb's truesize */
if (!atm_may_send(flow->vcc, skb->truesize))
break;
skb = qdisc_dequeue_peeked(flow->q);
if (unlikely(!skb))
break;
pr_debug("atm_tc_dequeue: sending on class %p\n", flow);
/* remove any LL header somebody else has attached */
skb_pull(skb, skb_network_offset(skb));
/* make room for the flow's own header in front of the payload */
if (skb_headroom(skb) < flow->hdr_len) {
struct sk_buff *new;
new = skb_realloc_headroom(skb, flow->hdr_len);
/* the original skb is released whether or not the reallocation worked */
dev_kfree_skb(skb);
if (!new)
continue;
skb = new;
}
pr_debug("sch_atm_dequeue: ip %p, data %p\n",
skb_network_header(skb), skb->data);
ATM_SKB(skb)->vcc = flow->vcc;
/* prepend the per-flow encapsulation header */
memcpy(skb_push(skb, flow->hdr_len), flow->hdr,
flow->hdr_len);
/* account the skb against the VCC socket's sk_wmem_alloc before sending */
atomic_add(skb->truesize,
&sk_atm(flow->vcc)->sk_wmem_alloc);
/* atm.atm_options are already set by atm_tc_enqueue */
flow->vcc->send(flow->vcc, skb);
}
}
static struct sk_buff *atm_tc_dequeue(struct Qdisc *sch)
{
struct atm_qdisc_data *p = qdisc_priv(sch);
struct sk_buff *skb;
pr_debug("atm_tc_dequeue(sch %p,[qdisc %p])\n", sch, p);
tasklet_schedule(&p->task);
skb = qdisc_dequeue_peeked(p->link.q);
if (skb)
sch->q.qlen--;
return skb;
}
static struct sk_buff *atm_tc_peek(struct Qdisc *sch)
{
struct atm_qdisc_data *p = qdisc_priv(sch);
pr_debug("atm_tc_peek(sch %p,[qdisc %p])\n", sch, p);
return p->link.q->ops->peek(p->link.q);
}
/*
 * Ask each flow's inner qdisc in turn to drop a packet and return the
 * first non-zero result; returns 0 when no inner qdisc dropped anything.
 */
static unsigned int atm_tc_drop(struct Qdisc *sch)
{
	struct atm_qdisc_data *p = qdisc_priv(sch);
	struct atm_flow_data *flow;

	pr_debug("atm_tc_drop(sch %p,[qdisc %p])\n", sch, p);

	for (flow = p->flows; flow; flow = flow->next) {
		unsigned int dropped;

		if (!flow->q->ops->drop)
			continue;
		dropped = flow->q->ops->drop(flow->q);
		if (dropped)
			return dropped;
	}
	return 0;
}
static int atm_tc_init(struct Qdisc *sch, struct nlattr *opt)
{
struct atm_qdisc_data *p = qdisc_priv(sch);
pr_debug("atm_tc_init(sch %p,[qdisc %p],opt %p)\n", sch, p, opt);
p->flows = &p->link;
p->link.q = qdisc_create_dflt(qdisc_dev(sch), sch->dev_queue,
&pfifo_qdisc_ops, sch->handle);
if (!p->link.q)
p->link.q = &noop_qdisc;
pr_debug("atm_tc_init: link (%p) qdisc %p\n", &p->link, p->link.q);
p->link.filter_list = NULL;
p->link.vcc = NULL;
p->link.sock = NULL;
p->link.classid = sch->handle;
p->link.ref = 1;
p->link.next = NULL;
tasklet_init(&p->task, sch_atm_dequeue, (unsigned long)sch);
return 0;
}
/* Reset the inner qdisc of every flow and clear the outer queue length. */
static void atm_tc_reset(struct Qdisc *sch)
{
	struct atm_qdisc_data *p = qdisc_priv(sch);
	struct atm_flow_data *cur;

	pr_debug("atm_tc_reset(sch %p,[qdisc %p])\n", sch, p);

	cur = p->flows;
	while (cur) {
		qdisc_reset(cur->q);
		cur = cur->next;
	}
	sch->q.qlen = 0;
}
/*
 * Tear down the qdisc: destroy the filter chain of every flow, then put
 * each flow until the list is empty, and finally kill the tasklet.
 */
static void atm_tc_destroy(struct Qdisc *sch)
{
struct atm_qdisc_data *p = qdisc_priv(sch);
struct atm_flow_data *flow;
pr_debug("atm_tc_destroy(sch %p,[qdisc %p])\n", sch, p);
/* first pass: detach all filters */
for (flow = p->flows; flow; flow = flow->next)
tcf_destroy_chain(&flow->filter_list);
/* races ? */
/* second pass: drop one reference on whatever flow heads the list */
while ((flow = p->flows)) {
if (flow->ref > 1)
printk(KERN_ERR "atm_destroy: %p->ref = %d\n", flow,
flow->ref);
atm_tc_put(sch, (unsigned long)flow);
/* the put is expected to remove the flow from the list head */
if (p->flows == flow) {
printk(KERN_ERR "atm_destroy: putting flow %p didn't "
"kill it\n", flow);
p->flows = flow->next; /* brute force */
break;
}
}
tasklet_kill(&p->task);
}
/*
 * Dump one flow (class) as netlink attributes nested inside TCA_OPTIONS.
 *
 * Fills tcm->tcm_handle / tcm->tcm_info and emits the encapsulation header,
 * the PVC address and state of the attached VCC (if any), and the excess
 * attribute.  Returns skb->len on success, -EINVAL if @cl is not a flow of
 * this qdisc, and -1 if the message ran out of room.
 */
static int atm_tc_dump_class(struct Qdisc *sch, unsigned long cl,
			     struct sk_buff *skb, struct tcmsg *tcm)
{
	struct atm_qdisc_data *p = qdisc_priv(sch);
	struct atm_flow_data *flow = (struct atm_flow_data *)cl;
	struct nlattr *nest;

	pr_debug("atm_tc_dump_class(sch %p,[qdisc %p],flow %p,skb %p,tcm %p)\n",
		 sch, p, flow, skb, tcm);
	if (!find_flow(p, flow))
		return -EINVAL;
	tcm->tcm_handle = flow->classid;
	tcm->tcm_info = flow->q->handle;

	nest = nla_nest_start(skb, TCA_OPTIONS);
	if (nest == NULL)
		goto nla_put_failure;

	NLA_PUT(skb, TCA_ATM_HDR, flow->hdr_len, flow->hdr);
	if (flow->vcc) {
		struct sockaddr_atmpvc pvc;
		int state;

		/*
		 * The whole structure is copied into the netlink message, so
		 * clear it first: otherwise padding and any member not set
		 * below would carry stale kernel stack contents to user space.
		 */
		memset(&pvc, 0, sizeof(pvc));
		pvc.sap_family = AF_ATMPVC;
		pvc.sap_addr.itf = flow->vcc->dev ? flow->vcc->dev->number : -1;
		pvc.sap_addr.vpi = flow->vcc->vpi;
		pvc.sap_addr.vci = flow->vcc->vci;
		NLA_PUT(skb, TCA_ATM_ADDR, sizeof(pvc), &pvc);
		state = ATM_VF2VS(flow->vcc->flags);
		NLA_PUT_U32(skb, TCA_ATM_STATE, state);
	}
	/*
	 * NOTE(review): with an excess flow configured this reports the
	 * flow's own classid, not flow->excess->classid - kept as is since
	 * user space may rely on it; confirm the intended value.
	 */
	if (flow->excess) {
		NLA_PUT_U32(skb, TCA_ATM_EXCESS, flow->classid);
	} else {
		NLA_PUT_U32(skb, TCA_ATM_EXCESS, 0);
	}

	nla_nest_end(skb, nest);
	return skb->len;

nla_put_failure:
	nla_nest_cancel(skb, nest);
	return -1;
}
/*
 * Copy the basic and queue statistics of a flow into the dump context,
 * after refreshing qstats.qlen from the inner qdisc.  Returns 0 on
 * success and -1 if either copy fails.
 */
static int
atm_tc_dump_class_stats(struct Qdisc *sch, unsigned long arg,
			struct gnet_dump *d)
{
	struct atm_flow_data *flow = (struct atm_flow_data *)arg;

	flow->qstats.qlen = flow->q->q.qlen;

	if (gnet_stats_copy_basic(d, &flow->bstats) < 0)
		return -1;
	if (gnet_stats_copy_queue(d, &flow->qstats) < 0)
		return -1;
	return 0;
}
/* Qdisc-level dump: adds no attributes to @skb and always returns 0. */
static int atm_tc_dump(struct Qdisc *sch, struct sk_buff *skb)
{
return 0;
}
/* Class (flow) level callbacks of the "atm" qdisc. */
static const struct Qdisc_class_ops atm_class_ops = {
	/* class lifetime and enumeration */
	.get		= atm_tc_get,
	.put		= atm_tc_put,
	.change		= atm_tc_change,
	.delete		= atm_tc_delete,
	.walk		= atm_tc_walk,
	/* inner qdisc access */
	.graft		= atm_tc_graft,
	.leaf		= atm_tc_leaf,
	/* filter handling; unbinding is a plain put */
	.tcf_chain	= atm_tc_find_tcf,
	.bind_tcf	= atm_tc_bind_filter,
	.unbind_tcf	= atm_tc_put,
	/* netlink dumps */
	.dump		= atm_tc_dump_class,
	.dump_stats	= atm_tc_dump_class_stats,
};
/* Qdisc level callbacks of the "atm" qdisc. */
static struct Qdisc_ops atm_qdisc_ops __read_mostly = {
	.id		= "atm",
	.owner		= THIS_MODULE,
	.priv_size	= sizeof(struct atm_qdisc_data),
	.cl_ops		= &atm_class_ops,
	/* setup and teardown */
	.init		= atm_tc_init,
	.reset		= atm_tc_reset,
	.destroy	= atm_tc_destroy,
	/* data path */
	.enqueue	= atm_tc_enqueue,
	.dequeue	= atm_tc_dequeue,
	.peek		= atm_tc_peek,
	.drop		= atm_tc_drop,
	/* netlink dump */
	.dump		= atm_tc_dump,
};
/* Module load: register the "atm" qdisc and pass on the result. */
static int __init atm_init(void)
{
	int err = register_qdisc(&atm_qdisc_ops);

	return err;
}

/* Module unload: unregister the "atm" qdisc again. */
static void __exit atm_exit(void)
{
	unregister_qdisc(&atm_qdisc_ops);
}

module_init(atm_init)
module_exit(atm_exit)
MODULE_LICENSE("GPL");

View File

@@ -0,0 +1,53 @@
/*
* net/sched/sch_blackhole.c Black hole queue
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
* Authors: Thomas Graf <tgraf@suug.ch>
*
* Note: Quantum tunneling is not supported.
*/
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/skbuff.h>
#include <net/pkt_sched.h>
static int blackhole_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
qdisc_drop(skb, sch);
return NET_XMIT_SUCCESS;
}
/* Always returns NULL; this function is also installed as the .peek operation. */
static struct sk_buff *blackhole_dequeue(struct Qdisc *sch)
{
return NULL;
}
/* Operations of the "blackhole" qdisc; it keeps no private state. */
static struct Qdisc_ops blackhole_qdisc_ops __read_mostly = {
	.id		= "blackhole",
	.owner		= THIS_MODULE,
	.priv_size	= 0,
	.enqueue	= blackhole_enqueue,
	/* dequeue and peek share one implementation */
	.dequeue	= blackhole_dequeue,
	.peek		= blackhole_dequeue,
};
/* Module load: register the "blackhole" qdisc and pass on the result. */
static int __init blackhole_module_init(void)
{
	int err = register_qdisc(&blackhole_qdisc_ops);

	return err;
}

/* Module unload: unregister the "blackhole" qdisc again. */
static void __exit blackhole_module_exit(void)
{
	unregister_qdisc(&blackhole_qdisc_ops);
}

module_init(blackhole_module_init)
module_exit(blackhole_module_exit)
MODULE_LICENSE("GPL");

2071
kernel/net/sched/sch_cbq.c Normal file

File diff suppressed because it is too large Load Diff

528
kernel/net/sched/sch_drr.c Normal file
View File

@@ -0,0 +1,528 @@
/*
* net/sched/sch_drr.c Deficit Round Robin scheduler
*
* Copyright (c) 2008 Patrick McHardy <kaber@trash.net>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* version 2 as published by the Free Software Foundation.
*/
#include <linux/module.h>
#include <linux/init.h>
#include <linux/errno.h>
#include <linux/netdevice.h>
#include <linux/pkt_sched.h>
#include <net/sch_generic.h>
#include <net/pkt_sched.h>
#include <net/pkt_cls.h>
/* Per-class state of the Deficit Round Robin scheduler. */
struct drr_class {
/* entry in the qdisc's class hash; common.classid is the class id */
struct Qdisc_class_common common;
/* references: 1 at creation, +1 per drr_get_class(), -1 per drr_put_class() */
unsigned int refcnt;
/* number of filters bound to this class (drr_bind_tcf/drr_unbind_tcf) */
unsigned int filter_cnt;
/* byte/packet counters updated on enqueue */
struct gnet_stats_basic_packed bstats;
/* queue statistics; drops are counted on failed enqueue */
struct gnet_stats_queue qstats;
/* rate estimator output, attached when TCA_RATE is supplied */
struct gnet_stats_rate_est rate_est;
/* link in drr_sched.active while the class has packets queued */
struct list_head alist;
/* inner qdisc holding this class's packets */
struct Qdisc *qdisc;
/* amount added to the deficit each time the class is moved to the tail */
u32 quantum;
/* remaining budget compared against packet length in drr_dequeue() */
u32 deficit;
};
/* Private data of a DRR qdisc instance. */
struct drr_sched {
/* classes that currently have packets queued, in service order */
struct list_head active;
/* filters used by drr_classify() */
struct tcf_proto *filter_list;
/* all classes, hashed by class id */
struct Qdisc_class_hash clhash;
};
/* Look up the class with id @classid in the class hash; NULL if absent. */
static struct drr_class *drr_find_class(struct Qdisc *sch, u32 classid)
{
	struct drr_sched *q = qdisc_priv(sch);
	struct Qdisc_class_common *found = qdisc_class_find(&q->clhash, classid);

	return found ? container_of(found, struct drr_class, common) : NULL;
}
static void drr_purge_queue(struct drr_class *cl)
{
unsigned int len = cl->qdisc->q.qlen;
qdisc_reset(cl->qdisc);
qdisc_tree_decrease_qlen(cl->qdisc, len);
}
/* Netlink validation policy for DRR class options: the quantum is a u32. */
static const struct nla_policy drr_policy[TCA_DRR_MAX + 1] = {
[TCA_DRR_QUANTUM] = { .type = NLA_U32 },
};
/*
 * Create a new class or change an existing one.
 *
 * *arg is the existing class (or 0 to create one).  A zero quantum is
 * rejected; without a quantum attribute the device MTU is used for new
 * classes and an existing class keeps its value.  On creation *arg is set
 * to the new class.  Returns 0 or a negative errno.
 */
static int drr_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
struct nlattr **tca, unsigned long *arg)
{
struct drr_sched *q = qdisc_priv(sch);
struct drr_class *cl = (struct drr_class *)*arg;
struct nlattr *opt = tca[TCA_OPTIONS];
struct nlattr *tb[TCA_DRR_MAX + 1];
u32 quantum;
int err;
if (!opt)
return -EINVAL;
err = nla_parse_nested(tb, TCA_DRR_MAX, opt, drr_policy);
if (err < 0)
return err;
if (tb[TCA_DRR_QUANTUM]) {
quantum = nla_get_u32(tb[TCA_DRR_QUANTUM]);
if (quantum == 0)
return -EINVAL;
} else
quantum = psched_mtu(qdisc_dev(sch));
/* existing class: optionally replace the estimator, then update the quantum */
if (cl != NULL) {
if (tca[TCA_RATE]) {
err = gen_replace_estimator(&cl->bstats, &cl->rate_est,
qdisc_root_sleeping_lock(sch),
tca[TCA_RATE]);
if (err)
return err;
}
sch_tree_lock(sch);
/* only an explicitly supplied quantum overrides the current one */
if (tb[TCA_DRR_QUANTUM])
cl->quantum = quantum;
sch_tree_unlock(sch);
return 0;
}
/* new class */
cl = kzalloc(sizeof(struct drr_class), GFP_KERNEL);
if (cl == NULL)
return -ENOBUFS;
cl->refcnt = 1;
cl->common.classid = classid;
cl->quantum = quantum;
cl->qdisc = qdisc_create_dflt(qdisc_dev(sch), sch->dev_queue,
&pfifo_qdisc_ops, classid);
/* fall back to noop_qdisc when no default pfifo could be created */
if (cl->qdisc == NULL)
cl->qdisc = &noop_qdisc;
if (tca[TCA_RATE]) {
err = gen_replace_estimator(&cl->bstats, &cl->rate_est,
qdisc_root_sleeping_lock(sch),
tca[TCA_RATE]);
if (err) {
/* undo the partially built class */
qdisc_destroy(cl->qdisc);
kfree(cl);
return err;
}
}
/* publish the class under the tree lock; the hash is grown after unlocking */
sch_tree_lock(sch);
qdisc_class_hash_insert(&q->clhash, &cl->common);
sch_tree_unlock(sch);
qdisc_class_hash_grow(sch, &q->clhash);
*arg = (unsigned long)cl;
return 0;
}
/* Release a class: stop its rate estimator, destroy its inner qdisc, free it. */
static void drr_destroy_class(struct Qdisc *sch, struct drr_class *cl)
{
	struct Qdisc *leaf = cl->qdisc;

	gen_kill_estimator(&cl->bstats, &cl->rate_est);
	qdisc_destroy(leaf);
	kfree(cl);
}
/*
 * Delete a class: purge its queue and unhash it.  Fails with -EBUSY while
 * filters are still bound to the class.  The memory is released later by
 * the final drr_put_class().
 */
static int drr_delete_class(struct Qdisc *sch, unsigned long arg)
{
	struct drr_class *cl = (struct drr_class *)arg;
	struct drr_sched *q;

	if (cl->filter_cnt != 0)
		return -EBUSY;

	q = qdisc_priv(sch);

	sch_tree_lock(sch);
	drr_purge_queue(cl);
	qdisc_class_hash_remove(&q->clhash, &cl->common);
	/*
	 * Dropping to zero here shouldn't happen: we "hold" one cops->get()
	 * when called from tc_ctl_tclass; the destroy method is done from
	 * cops->put().
	 */
	BUG_ON(--cl->refcnt == 0);
	sch_tree_unlock(sch);

	return 0;
}
/*
 * Look up a class by id and take a reference on it.  Returns the class
 * pointer as an unsigned long, or 0 when no such class exists.
 */
static unsigned long drr_get_class(struct Qdisc *sch, u32 classid)
{
	struct drr_class *cl = drr_find_class(sch, classid);

	if (cl == NULL)
		return 0;

	cl->refcnt++;
	return (unsigned long)cl;
}
/* Drop one reference on a class and destroy it when that was the last one. */
static void drr_put_class(struct Qdisc *sch, unsigned long arg)
{
	struct drr_class *cl = (struct drr_class *)arg;

	cl->refcnt--;
	if (cl->refcnt != 0)
		return;

	drr_destroy_class(sch, cl);
}
/*
 * Filters are attached to the qdisc only: return the qdisc's filter list
 * for a zero @cl and NULL for any class.
 */
static struct tcf_proto **drr_tcf_chain(struct Qdisc *sch, unsigned long cl)
{
	struct drr_sched *q = qdisc_priv(sch);

	return cl ? NULL : &q->filter_list;
}
/*
 * Bind a filter to the class with id @classid by bumping its filter
 * count.  Returns the class pointer as an unsigned long, or 0 when the
 * class does not exist.
 */
static unsigned long drr_bind_tcf(struct Qdisc *sch, unsigned long parent,
				  u32 classid)
{
	struct drr_class *cl = drr_find_class(sch, classid);

	if (cl == NULL)
		return 0;

	cl->filter_cnt++;
	return (unsigned long)cl;
}
/* Undo drr_bind_tcf(): one filter fewer refers to the class. */
static void drr_unbind_tcf(struct Qdisc *sch, unsigned long arg)
{
	((struct drr_class *)arg)->filter_cnt--;
}
/*
 * Replace the inner qdisc of a class.  A NULL @new selects a fresh default
 * pfifo (or noop_qdisc if that cannot be created).  The old queue is purged
 * and the previous qdisc is handed back through @old.  Always returns 0.
 */
static int drr_graft_class(struct Qdisc *sch, unsigned long arg,
			   struct Qdisc *new, struct Qdisc **old)
{
	struct drr_class *cl = (struct drr_class *)arg;

	if (!new)
		new = qdisc_create_dflt(qdisc_dev(sch), sch->dev_queue,
					&pfifo_qdisc_ops, cl->common.classid);
	if (!new)
		new = &noop_qdisc;

	sch_tree_lock(sch);
	drr_purge_queue(cl);
	*old = cl->qdisc;
	cl->qdisc = new;
	sch_tree_unlock(sch);

	return 0;
}
/* Return the inner qdisc of the class identified by @arg. */
static struct Qdisc *drr_class_leaf(struct Qdisc *sch, unsigned long arg)
{
	return ((struct drr_class *)arg)->qdisc;
}
/*
 * Queue length notification: once the class's inner qdisc is empty the
 * class is taken off the active list.
 */
static void drr_qlen_notify(struct Qdisc *csh, unsigned long arg)
{
	struct drr_class *cl = (struct drr_class *)arg;

	if (cl->qdisc->q.qlen != 0)
		return;

	list_del(&cl->alist);
}
/*
 * Dump one class: fill in the tcmsg header and emit the quantum nested
 * in TCA_OPTIONS.  Returns the result of nla_nest_end() on success and
 * -EMSGSIZE when the message has no room left.
 */
static int drr_dump_class(struct Qdisc *sch, unsigned long arg,
			  struct sk_buff *skb, struct tcmsg *tcm)
{
	struct drr_class *cl = (struct drr_class *)arg;
	struct nlattr *opts;

	tcm->tcm_parent	= TC_H_ROOT;
	tcm->tcm_handle	= cl->common.classid;
	tcm->tcm_info	= cl->qdisc->handle;

	opts = nla_nest_start(skb, TCA_OPTIONS);
	if (opts == NULL)
		goto nla_put_failure;
	NLA_PUT_U32(skb, TCA_DRR_QUANTUM, cl->quantum);
	return nla_nest_end(skb, opts);

nla_put_failure:
	nla_nest_cancel(skb, opts);
	return -EMSGSIZE;
}
static int drr_dump_class_stats(struct Qdisc *sch, unsigned long arg,
struct gnet_dump *d)
{
struct drr_class *cl = (struct drr_class *)arg;
struct tc_drr_stats xstats;
memset(&xstats, 0, sizeof(xstats));
if (cl->qdisc->q.qlen) {
xstats.deficit = cl->deficit;
cl->qdisc->qstats.qlen = cl->qdisc->q.qlen;
}
if (gnet_stats_copy_basic(d, &cl->bstats) < 0 ||
gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 ||
gnet_stats_copy_queue(d, &cl->qdisc->qstats) < 0)
return -1;
return gnet_stats_copy_app(d, &xstats, sizeof(xstats));
}
/*
 * Walk all classes in hash order.  The first arg->skip classes are only
 * counted; for the rest arg->fn is called, and a negative return sets
 * arg->stop and ends the walk without counting that class.
 */
static void drr_walk(struct Qdisc *sch, struct qdisc_walker *arg)
{
	struct drr_sched *q = qdisc_priv(sch);
	struct drr_class *cl;
	struct hlist_node *n;
	unsigned int bucket;

	if (arg->stop)
		return;

	for (bucket = 0; bucket < q->clhash.hashsize; bucket++) {
		hlist_for_each_entry(cl, n, &q->clhash.hash[bucket],
				     common.hnode) {
			if (arg->count >= arg->skip &&
			    arg->fn(sch, (unsigned long)cl, arg) < 0) {
				arg->stop = 1;
				return;
			}
			arg->count++;
		}
	}
}
/*
 * Map @skb to a class.
 *
 * skb->priority is tried first as a class id of this qdisc; otherwise the
 * qdisc's filters are consulted.  Returns the class, or NULL when the
 * packet has no class; in the NULL case *qerr holds the code the caller
 * should return.  *qerr is not written when the priority shortcut hits.
 */
static struct drr_class *drr_classify(struct sk_buff *skb, struct Qdisc *sch,
int *qerr)
{
struct drr_sched *q = qdisc_priv(sch);
struct drr_class *cl;
struct tcf_result res;
int result;
/* the major number of skb->priority matches this qdisc's handle */
if (TC_H_MAJ(skb->priority ^ sch->handle) == 0) {
cl = drr_find_class(sch, skb->priority);
if (cl != NULL)
return cl;
}
/* default verdict for an unclassified packet */
*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
result = tc_classify(skb, q->filter_list, &res);
if (result >= 0) {
#ifdef CONFIG_NET_CLS_ACT
switch (result) {
case TC_ACT_QUEUED:
case TC_ACT_STOLEN:
*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
/* fall through: stolen and shot packets both end without a class */
case TC_ACT_SHOT:
return NULL;
}
#endif
/* use the class from the result, or look it up by the returned class id */
cl = (struct drr_class *)res.class;
if (cl == NULL)
cl = drr_find_class(sch, res.classid);
return cl;
}
return NULL;
}
/*
 * Enqueue @skb on the class chosen by drr_classify().
 *
 * Unclassified packets are freed (and counted as a drop unless a filter
 * action took them).  When the packet is the first one in its class, the
 * class joins the tail of the active list with a fresh deficit of one
 * quantum.  Returns the NET_XMIT_* code of the operation.
 */
static int drr_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
	struct drr_sched *q = qdisc_priv(sch);
	struct drr_class *cl;
	unsigned int pkt_len;
	int rc;

	cl = drr_classify(skb, sch, &rc);
	if (cl == NULL) {
		if (rc & __NET_XMIT_BYPASS)
			sch->qstats.drops++;
		kfree_skb(skb);
		return rc;
	}

	/* read the length first: the skb belongs to the child after enqueue */
	pkt_len = qdisc_pkt_len(skb);

	rc = qdisc_enqueue(skb, cl->qdisc);
	if (unlikely(rc != NET_XMIT_SUCCESS)) {
		if (net_xmit_drop_count(rc)) {
			cl->qstats.drops++;
			sch->qstats.drops++;
		}
		return rc;
	}

	/* first packet of an idle class: activate it */
	if (cl->qdisc->q.qlen == 1) {
		list_add_tail(&cl->alist, &q->active);
		cl->deficit = cl->quantum;
	}

	cl->bstats.packets++;
	cl->bstats.bytes += pkt_len;
	sch->bstats.packets++;
	sch->bstats.bytes += pkt_len;
	sch->q.qlen++;

	return rc;
}
/*
 * Deficit round robin dequeue.
 *
 * The class at the head of the active list may send its head packet if
 * the packet fits into the class's deficit; otherwise the class receives
 * another quantum and moves to the tail.  Returns NULL when no class is
 * active or the head class has nothing to peek at.
 */
static struct sk_buff *drr_dequeue(struct Qdisc *sch)
{
	struct drr_sched *q = qdisc_priv(sch);

	if (list_empty(&q->active))
		return NULL;

	for (;;) {
		struct drr_class *cl = list_first_entry(&q->active,
							struct drr_class,
							alist);
		struct sk_buff *skb = cl->qdisc->ops->peek(cl->qdisc);
		unsigned int len;

		if (skb == NULL)
			return NULL;

		len = qdisc_pkt_len(skb);
		if (len > cl->deficit) {
			/* not enough credit yet: top up and try the next class */
			cl->deficit += cl->quantum;
			list_move_tail(&cl->alist, &q->active);
			continue;
		}

		cl->deficit -= len;
		skb = qdisc_dequeue_peeked(cl->qdisc);
		if (cl->qdisc->q.qlen == 0)
			list_del(&cl->alist);
		sch->q.qlen--;
		return skb;
	}
}
/*
 * Drop one packet from the first active class whose inner qdisc can drop.
 * Returns what that qdisc's drop operation returned, or 0 if nothing was
 * dropped.  A class left empty by the drop leaves the active list.
 */
static unsigned int drr_drop(struct Qdisc *sch)
{
	struct drr_sched *q = qdisc_priv(sch);
	struct drr_class *cl;

	list_for_each_entry(cl, &q->active, alist) {
		unsigned int dropped;

		if (!cl->qdisc->ops->drop)
			continue;

		dropped = cl->qdisc->ops->drop(cl->qdisc);
		if (dropped == 0)
			continue;

		sch->q.qlen--;
		if (cl->qdisc->q.qlen == 0)
			list_del(&cl->alist);
		/* returning at once keeps the list_del above safe */
		return dropped;
	}
	return 0;
}
/*
 * Initialise a DRR qdisc: set up the class hash and the empty active
 * list.  Returns 0 or the negative error from the hash initialisation.
 */
static int drr_init_qdisc(struct Qdisc *sch, struct nlattr *opt)
{
	struct drr_sched *q = qdisc_priv(sch);
	int rc = qdisc_class_hash_init(&q->clhash);

	if (rc < 0)
		return rc;

	INIT_LIST_HEAD(&q->active);
	return 0;
}
/*
 * Reset the qdisc: every class with queued packets leaves the active
 * list, all inner qdiscs are reset and the queue length is cleared.
 */
static void drr_reset_qdisc(struct Qdisc *sch)
{
	struct drr_sched *q = qdisc_priv(sch);
	struct drr_class *cl;
	struct hlist_node *n;
	unsigned int bucket;

	for (bucket = 0; bucket < q->clhash.hashsize; bucket++) {
		hlist_for_each_entry(cl, n, &q->clhash.hash[bucket],
				     common.hnode) {
			/* only classes with packets are on the active list */
			if (cl->qdisc->q.qlen)
				list_del(&cl->alist);
			qdisc_reset(cl->qdisc);
		}
	}

	sch->q.qlen = 0;
}
/*
 * Destroy the qdisc: remove the filter chain, destroy every class and
 * finally release the class hash.
 */
static void drr_destroy_qdisc(struct Qdisc *sch)
{
	struct drr_sched *q = qdisc_priv(sch);
	struct drr_class *cl;
	struct hlist_node *pos, *tmp;
	unsigned int bucket;

	tcf_destroy_chain(&q->filter_list);

	/* classes are freed while iterating, hence the _safe variant */
	for (bucket = 0; bucket < q->clhash.hashsize; bucket++) {
		hlist_for_each_entry_safe(cl, pos, tmp,
					  &q->clhash.hash[bucket],
					  common.hnode)
			drr_destroy_class(sch, cl);
	}

	qdisc_class_hash_destroy(&q->clhash);
}
/* Class level callbacks of the "drr" qdisc. */
static const struct Qdisc_class_ops drr_class_ops = {
	/* class lifetime and enumeration */
	.get		= drr_get_class,
	.put		= drr_put_class,
	.change		= drr_change_class,
	.delete		= drr_delete_class,
	.walk		= drr_walk,
	/* inner qdisc handling */
	.graft		= drr_graft_class,
	.leaf		= drr_class_leaf,
	.qlen_notify	= drr_qlen_notify,
	/* filters */
	.tcf_chain	= drr_tcf_chain,
	.bind_tcf	= drr_bind_tcf,
	.unbind_tcf	= drr_unbind_tcf,
	/* netlink dumps */
	.dump		= drr_dump_class,
	.dump_stats	= drr_dump_class_stats,
};
/* Qdisc level callbacks of the "drr" qdisc. */
static struct Qdisc_ops drr_qdisc_ops __read_mostly = {
	.id		= "drr",
	.owner		= THIS_MODULE,
	.priv_size	= sizeof(struct drr_sched),
	.cl_ops		= &drr_class_ops,
	/* setup and teardown */
	.init		= drr_init_qdisc,
	.reset		= drr_reset_qdisc,
	.destroy	= drr_destroy_qdisc,
	/* data path */
	.enqueue	= drr_enqueue,
	.dequeue	= drr_dequeue,
	.peek		= qdisc_peek_dequeued,
	.drop		= drr_drop,
};
/* Module load: register the "drr" qdisc and pass on the result. */
static int __init drr_init(void)
{
	int err = register_qdisc(&drr_qdisc_ops);

	return err;
}

/* Module unload: unregister the "drr" qdisc again. */
static void __exit drr_exit(void)
{
	unregister_qdisc(&drr_qdisc_ops);
}

module_init(drr_init);
module_exit(drr_exit);
MODULE_LICENSE("GPL");

View File

@@ -0,0 +1,512 @@
/* net/sched/sch_dsmark.c - Differentiated Services field marker */
/* Written 1998-2000 by Werner Almesberger, EPFL ICA */
#include <linux/module.h>
#include <linux/init.h>
#include <linux/types.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/bitops.h>
#include <net/pkt_sched.h>
#include <net/dsfield.h>
#include <net/inet_ecn.h>
#include <asm/byteorder.h>
/*
* classid class marking
* ------- ----- -------
* n/a 0 n/a
* x:0 1 use entry [0]
* ... ... ...
* x:y y>0 y+1 use entry [y]
* ... ... ...
* x:indices-1 indices use entry [indices-1]
* ... ... ...
* x:y y+1 use entry [y & (indices-1)]
* ... ... ...
* 0xffff 0x10000 use entry [indices-1]
*/
#define NO_DEFAULT_INDEX (1 << 16)
/* Private data of a dsmark qdisc instance. */
struct dsmark_qdisc_data {
/* inner qdisc that actually queues the packets */
struct Qdisc *q;
/* filters consulted on enqueue to choose skb->tc_index */
struct tcf_proto *filter_list;
u8 *mask; /* "owns" the array */
/* second half of the allocation that starts at mask */
u8 *value;
/* number of mask/value entries; dsmark_init() requires a power of two */
u16 indices;
u32 default_index; /* index range is 0...0xffff */
/* when set, enqueue derives tc_index from the packet's DS field */
int set_tc_index;
};
/*
 * Class handles are the table index plus one, so a valid handle lies in
 * 1..p->indices.  Returns non-zero when @index is valid.
 */
static inline int dsmark_valid_index(struct dsmark_qdisc_data *p, u16 index)
{
	if (index == 0)
		return 0;
	return index <= p->indices;
}
/* ------------------------- Class/flow operations ------------------------- */
/*
 * Replace the inner qdisc.  A NULL @new selects a fresh default pfifo (or
 * noop_qdisc if that cannot be created).  The previous qdisc is returned
 * through @old after its packets were accounted for and it was reset.
 * Always returns 0.
 */
static int dsmark_graft(struct Qdisc *sch, unsigned long arg,
			struct Qdisc *new, struct Qdisc **old)
{
	struct dsmark_qdisc_data *p = qdisc_priv(sch);
	struct Qdisc *prev;

	pr_debug("dsmark_graft(sch %p,[qdisc %p],new %p,old %p)\n",
		 sch, p, new, old);

	if (!new)
		new = qdisc_create_dflt(qdisc_dev(sch), sch->dev_queue,
					&pfifo_qdisc_ops, sch->handle);
	if (!new)
		new = &noop_qdisc;

	sch_tree_lock(sch);
	prev = p->q;
	*old = prev;
	p->q = new;
	qdisc_tree_decrease_qlen(prev, prev->q.qlen);
	qdisc_reset(prev);
	sch_tree_unlock(sch);

	return 0;
}
/* Every class shares the single inner qdisc; @arg is not looked at. */
static struct Qdisc *dsmark_leaf(struct Qdisc *sch, unsigned long arg)
{
	return ((struct dsmark_qdisc_data *)qdisc_priv(sch))->q;
}
/*
 * Translate a class id into a class handle: the minor number plus one,
 * so that a handle is never zero.
 */
static unsigned long dsmark_get(struct Qdisc *sch, u32 classid)
{
	unsigned long minor = TC_H_MIN(classid);

	pr_debug("dsmark_get(sch %p,[qdisc %p],classid %x)\n",
		 sch, qdisc_priv(sch), classid);

	return minor + 1;
}
/* Binding a filter is the same as getting the class; @parent is unused. */
static unsigned long dsmark_bind_filter(struct Qdisc *sch,
unsigned long parent, u32 classid)
{
return dsmark_get(sch, classid);
}
/* Nothing to release: dsmark_get() only computes a handle and takes no reference. */
static void dsmark_put(struct Qdisc *sch, unsigned long cl)
{
}
/* Netlink validation policy for dsmark qdisc and class attributes. */
static const struct nla_policy dsmark_policy[TCA_DSMARK_MAX + 1] = {
/* qdisc attributes, read by dsmark_init() */
[TCA_DSMARK_INDICES] = { .type = NLA_U16 },
[TCA_DSMARK_DEFAULT_INDEX] = { .type = NLA_U16 },
[TCA_DSMARK_SET_TC_INDEX] = { .type = NLA_FLAG },
/* class attributes, read by dsmark_change() */
[TCA_DSMARK_MASK] = { .type = NLA_U8 },
[TCA_DSMARK_VALUE] = { .type = NLA_U8 },
};
/*
 * Change the mask and/or value of the class whose handle is in *arg.
 *
 * Returns 0 on success, -ENOENT for an invalid handle, -EINVAL when no
 * options were supplied, or the error from parsing the nested options.
 */
static int dsmark_change(struct Qdisc *sch, u32 classid, u32 parent,
			 struct nlattr **tca, unsigned long *arg)
{
	struct dsmark_qdisc_data *p = qdisc_priv(sch);
	struct nlattr *opt = tca[TCA_OPTIONS];
	struct nlattr *tb[TCA_DSMARK_MAX + 1];
	int rc;

	pr_debug("dsmark_change(sch %p,[qdisc %p],classid %x,parent %x),"
		 "arg 0x%lx\n", sch, p, classid, parent, *arg);

	if (!dsmark_valid_index(p, *arg))
		return -ENOENT;
	if (!opt)
		return -EINVAL;

	rc = nla_parse_nested(tb, TCA_DSMARK_MAX, opt, dsmark_policy);
	if (rc < 0)
		return rc;

	/* handles are index + 1, hence the "- 1" below */
	if (tb[TCA_DSMARK_VALUE])
		p->value[*arg - 1] = nla_get_u8(tb[TCA_DSMARK_VALUE]);
	if (tb[TCA_DSMARK_MASK])
		p->mask[*arg - 1] = nla_get_u8(tb[TCA_DSMARK_MASK]);

	return 0;
}
/*
 * "Delete" a class by putting its table entry back to the neutral
 * setting (mask 0xff, value 0).  Returns -EINVAL for an invalid handle.
 */
static int dsmark_delete(struct Qdisc *sch, unsigned long arg)
{
	struct dsmark_qdisc_data *p = qdisc_priv(sch);
	unsigned long slot;

	if (!dsmark_valid_index(p, arg))
		return -EINVAL;

	slot = arg - 1;
	p->mask[slot] = 0xff;
	p->value[slot] = 0;

	return 0;
}
/*
 * Walk the table entries.  Entries in the neutral setting (mask 0xff,
 * value 0) are counted but not reported; for the others walker->fn is
 * called once walker->skip has been reached.  A negative return from the
 * callback sets walker->stop and ends the walk.
 */
static void dsmark_walk(struct Qdisc *sch, struct qdisc_walker *walker)
{
	struct dsmark_qdisc_data *p = qdisc_priv(sch);
	int i;

	pr_debug("dsmark_walk(sch %p,[qdisc %p],walker %p)\n", sch, p, walker);

	if (walker->stop)
		return;

	for (i = 0; i < p->indices; i++) {
		int neutral = (p->mask[i] == 0xff && !p->value[i]);

		if (!neutral && walker->count >= walker->skip &&
		    walker->fn(sch, i + 1, walker) < 0) {
			walker->stop = 1;
			break;
		}
		walker->count++;
	}
}
/* All classes share the qdisc's single filter list; @cl is not looked at. */
static inline struct tcf_proto **dsmark_find_tcf(struct Qdisc *sch,
						 unsigned long cl)
{
	return &((struct dsmark_qdisc_data *)qdisc_priv(sch))->filter_list;
}
/* --------------------------- Qdisc operations ---------------------------- */
/*
 * Enqueue @skb: choose skb->tc_index and pass the packet to the inner
 * qdisc.
 *
 * With set_tc_index enabled the index is first taken from the packet's DS
 * field (ECN bits masked out).  It is then overridden by skb->priority
 * when that names a class of this qdisc, or else by the filter result or
 * the configured default index.
 */
static int dsmark_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
struct dsmark_qdisc_data *p = qdisc_priv(sch);
int err;
pr_debug("dsmark_enqueue(skb %p,sch %p,[qdisc %p])\n", skb, sch, p);
if (p->set_tc_index) {
switch (skb->protocol) {
case htons(ETH_P_IP):
/* skb_cow_head() failure drops the packet */
if (skb_cow_head(skb, sizeof(struct iphdr)))
goto drop;
skb->tc_index = ipv4_get_dsfield(ip_hdr(skb))
& ~INET_ECN_MASK;
break;
case htons(ETH_P_IPV6):
if (skb_cow_head(skb, sizeof(struct ipv6hdr)))
goto drop;
skb->tc_index = ipv6_get_dsfield(ipv6_hdr(skb))
& ~INET_ECN_MASK;
break;
default:
/* other protocols get index 0 */
skb->tc_index = 0;
break;
}
}
/* a priority carrying this qdisc's major number selects the index directly */
if (TC_H_MAJ(skb->priority) == sch->handle)
skb->tc_index = TC_H_MIN(skb->priority);
else {
struct tcf_result res;
int result = tc_classify(skb, p->filter_list, &res);
pr_debug("result %d class 0x%04x\n", result, res.classid);
switch (result) {
#ifdef CONFIG_NET_CLS_ACT
case TC_ACT_QUEUED:
case TC_ACT_STOLEN:
kfree_skb(skb);
return NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
case TC_ACT_SHOT:
goto drop;
#endif
case TC_ACT_OK:
skb->tc_index = TC_H_MIN(res.classid);
break;
default:
/* any other result: apply the default index, if one is configured */
if (p->default_index != NO_DEFAULT_INDEX)
skb->tc_index = p->default_index;
break;
}
}
err = qdisc_enqueue(skb, p->q);
if (err != NET_XMIT_SUCCESS) {
if (net_xmit_drop_count(err))
sch->qstats.drops++;
return err;
}
sch->bstats.bytes += qdisc_pkt_len(skb);
sch->bstats.packets++;
sch->q.qlen++;
return NET_XMIT_SUCCESS;
/* free the packet, count the drop and report success + bypass */
drop:
kfree_skb(skb);
sch->qstats.drops++;
return NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
}
/*
 * Dequeue a packet from the inner qdisc and rewrite its DS field with the
 * mask/value pair selected by skb->tc_index.
 *
 * The packet is fetched with qdisc_dequeue_peeked() instead of calling the
 * child's ->dequeue() directly: dsmark_peek() forwards to the child's
 * ->peek(), and a child that implements peek by stashing a dequeued skb
 * would otherwise have that stashed skb bypassed on the next dequeue.
 *
 * Returns the packet, or NULL when the inner qdisc has nothing to send.
 */
static struct sk_buff *dsmark_dequeue(struct Qdisc *sch)
{
	struct dsmark_qdisc_data *p = qdisc_priv(sch);
	struct sk_buff *skb;
	u32 index;

	pr_debug("dsmark_dequeue(sch %p,[qdisc %p])\n", sch, p);

	skb = qdisc_dequeue_peeked(p->q);
	if (skb == NULL)
		return NULL;

	sch->q.qlen--;

	/* dsmark_init() only accepts a power-of-two table size, so this wraps */
	index = skb->tc_index & (p->indices - 1);
	pr_debug("index %d->%d\n", skb->tc_index, index);

	switch (skb->protocol) {
	case htons(ETH_P_IP):
		ipv4_change_dsfield(ip_hdr(skb), p->mask[index],
				    p->value[index]);
		break;
	case htons(ETH_P_IPV6):
		ipv6_change_dsfield(ipv6_hdr(skb), p->mask[index],
				    p->value[index]);
		break;
	default:
		/*
		 * Only complain if a change was actually attempted.
		 * This way, we can send non-IP traffic through dsmark
		 * and don't need yet another qdisc as a bypass.
		 */
		if (p->mask[index] != 0xff || p->value[index])
			printk(KERN_WARNING
			       "dsmark_dequeue: unsupported protocol %d\n",
			       ntohs(skb->protocol));
		break;
	}

	return skb;
}
static struct sk_buff *dsmark_peek(struct Qdisc *sch)
{
struct dsmark_qdisc_data *p = qdisc_priv(sch);
pr_debug("dsmark_peek(sch %p,[qdisc %p])\n", sch, p);
return p->q->ops->peek(p->q);
}
/*
 * Ask the inner qdisc to drop one packet.  Returns what the inner drop
 * operation returned, or 0 when the inner qdisc has no drop operation.
 * The outer queue length is decremented when something was dropped.
 */
static unsigned int dsmark_drop(struct Qdisc *sch)
{
	struct dsmark_qdisc_data *p = qdisc_priv(sch);
	unsigned int len;

	/* the trace used to be labelled "dsmark_reset", a copy/paste slip */
	pr_debug("dsmark_drop(sch %p,[qdisc %p])\n", sch, p);

	if (p->q->ops->drop == NULL)
		return 0;

	len = p->q->ops->drop(p->q);
	if (len)
		sch->q.qlen--;

	return len;
}
static int dsmark_init(struct Qdisc *sch, struct nlattr *opt)
{
struct dsmark_qdisc_data *p = qdisc_priv(sch);
struct nlattr *tb[TCA_DSMARK_MAX + 1];
int err = -EINVAL;
u32 default_index = NO_DEFAULT_INDEX;
u16 indices;
u8 *mask;
pr_debug("dsmark_init(sch %p,[qdisc %p],opt %p)\n", sch, p, opt);
if (!opt)
goto errout;
err = nla_parse_nested(tb, TCA_DSMARK_MAX, opt, dsmark_policy);
if (err < 0)
goto errout;
err = -EINVAL;
indices = nla_get_u16(tb[TCA_DSMARK_INDICES]);
if (hweight32(indices) != 1)
goto errout;
if (tb[TCA_DSMARK_DEFAULT_INDEX])
default_index = nla_get_u16(tb[TCA_DSMARK_DEFAULT_INDEX]);
mask = kmalloc(indices * 2, GFP_KERNEL);
if (mask == NULL) {
err = -ENOMEM;
goto errout;
}
p->mask = mask;
memset(p->mask, 0xff, indices);
p->value = p->mask + indices;
memset(p->value, 0, indices);
p->indices = indices;
p->default_index = default_index;
p->set_tc_index = nla_get_flag(tb[TCA_DSMARK_SET_TC_INDEX]);
p->q = qdisc_create_dflt(qdisc_dev(sch), sch->dev_queue,
&pfifo_qdisc_ops, sch->handle);
if (p->q == NULL)
p->q = &noop_qdisc;
pr_debug("dsmark_init: qdisc %p\n", p->q);
err = 0;
errout:
return err;
}
/* Reset the inner qdisc and clear the outer queue length. */
static void dsmark_reset(struct Qdisc *sch)
{
	struct dsmark_qdisc_data *priv = qdisc_priv(sch);

	pr_debug("dsmark_reset(sch %p,[qdisc %p])\n", sch, priv);

	qdisc_reset(priv->q);
	sch->q.qlen = 0;
}
/* Destroy the qdisc: filter chain first, then the inner qdisc, then the table. */
static void dsmark_destroy(struct Qdisc *sch)
{
struct dsmark_qdisc_data *p = qdisc_priv(sch);
pr_debug("dsmark_destroy(sch %p,[qdisc %p])\n", sch, p);
tcf_destroy_chain(&p->filter_list);
qdisc_destroy(p->q);
/* p->value points into the same allocation, so one kfree() releases both tables */
kfree(p->mask);
}
/*
 * Dump one class: fill in the tcmsg header and emit the class's mask and
 * value nested in TCA_OPTIONS.  Returns the result of nla_nest_end() on
 * success, -EINVAL for an invalid handle and -EMSGSIZE when the message
 * has no room left.
 */
static int dsmark_dump_class(struct Qdisc *sch, unsigned long cl,
			     struct sk_buff *skb, struct tcmsg *tcm)
{
	struct dsmark_qdisc_data *p = qdisc_priv(sch);
	struct nlattr *nest = NULL;

	pr_debug("dsmark_dump_class(sch %p,[qdisc %p],class %ld\n", sch, p, cl);

	if (!dsmark_valid_index(p, cl))
		return -EINVAL;

	/* handles are index + 1, hence the "- 1" below */
	tcm->tcm_handle	= TC_H_MAKE(TC_H_MAJ(sch->handle), cl - 1);
	tcm->tcm_info	= p->q->handle;

	nest = nla_nest_start(skb, TCA_OPTIONS);
	if (nest == NULL)
		goto nla_put_failure;
	NLA_PUT_U8(skb, TCA_DSMARK_MASK, p->mask[cl - 1]);
	NLA_PUT_U8(skb, TCA_DSMARK_VALUE, p->value[cl - 1]);
	return nla_nest_end(skb, nest);

nla_put_failure:
	nla_nest_cancel(skb, nest);
	return -EMSGSIZE;
}
/*
 * Dump the qdisc configuration nested in TCA_OPTIONS: the table size,
 * the default index (if one is set) and the set_tc_index flag (if set).
 * Returns the result of nla_nest_end() on success and -EMSGSIZE when the
 * message has no room left.
 */
static int dsmark_dump(struct Qdisc *sch, struct sk_buff *skb)
{
	struct dsmark_qdisc_data *p = qdisc_priv(sch);
	struct nlattr *nest = nla_nest_start(skb, TCA_OPTIONS);

	if (nest == NULL)
		goto nla_put_failure;

	NLA_PUT_U16(skb, TCA_DSMARK_INDICES, p->indices);

	if (p->default_index != NO_DEFAULT_INDEX) {
		NLA_PUT_U16(skb, TCA_DSMARK_DEFAULT_INDEX, p->default_index);
	}
	if (p->set_tc_index) {
		NLA_PUT_FLAG(skb, TCA_DSMARK_SET_TC_INDEX);
	}

	return nla_nest_end(skb, nest);

nla_put_failure:
	nla_nest_cancel(skb, nest);
	return -EMSGSIZE;
}
/* Class level callbacks of the "dsmark" qdisc. */
static const struct Qdisc_class_ops dsmark_class_ops = {
	/* class lifetime and enumeration */
	.get		= dsmark_get,
	.put		= dsmark_put,
	.change		= dsmark_change,
	.delete		= dsmark_delete,
	.walk		= dsmark_walk,
	/* inner qdisc access */
	.graft		= dsmark_graft,
	.leaf		= dsmark_leaf,
	/* filter handling; unbinding shares the (empty) put */
	.tcf_chain	= dsmark_find_tcf,
	.bind_tcf	= dsmark_bind_filter,
	.unbind_tcf	= dsmark_put,
	/* netlink dump */
	.dump		= dsmark_dump_class,
};
/* Qdisc level callbacks of the "dsmark" qdisc. */
static struct Qdisc_ops dsmark_qdisc_ops __read_mostly = {
	.next		= NULL,
	.id		= "dsmark",
	.owner		= THIS_MODULE,
	.priv_size	= sizeof(struct dsmark_qdisc_data),
	.cl_ops		= &dsmark_class_ops,
	/* setup and teardown; there is no change operation */
	.init		= dsmark_init,
	.reset		= dsmark_reset,
	.destroy	= dsmark_destroy,
	.change		= NULL,
	/* data path */
	.enqueue	= dsmark_enqueue,
	.dequeue	= dsmark_dequeue,
	.peek		= dsmark_peek,
	.drop		= dsmark_drop,
	/* netlink dump */
	.dump		= dsmark_dump,
};
/* Module load: register the "dsmark" qdisc and pass on the result. */
static int __init dsmark_module_init(void)
{
	int err = register_qdisc(&dsmark_qdisc_ops);

	return err;
}

/* Module unload: unregister the "dsmark" qdisc again. */
static void __exit dsmark_module_exit(void)
{
	unregister_qdisc(&dsmark_qdisc_ops);
}

module_init(dsmark_module_init)
module_exit(dsmark_module_exit)
MODULE_LICENSE("GPL");

152
kernel/net/sched/sch_fifo.c Normal file
View File

@@ -0,0 +1,152 @@
/*
* net/sched/sch_fifo.c The simplest FIFO queue.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
*/
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <net/pkt_sched.h>
/* Private data of the 1 band FIFO pseudo-"scheduler". */
struct fifo_sched_data {
	/* Queue bound: compared against bytes by bfifo, packets by pfifo. */
	u32 limit;
};
/*
 * Byte-limited FIFO enqueue: accept @skb at the tail while the current
 * backlog plus the packet length stays within the limit, otherwise hand
 * the packet to qdisc_reshape_fail().
 */
static int bfifo_enqueue(struct sk_buff *skb, struct Qdisc* sch)
{
	struct fifo_sched_data *fifo = qdisc_priv(sch);

	if (unlikely(sch->qstats.backlog + qdisc_pkt_len(skb) > fifo->limit))
		return qdisc_reshape_fail(skb, sch);

	return qdisc_enqueue_tail(skb, sch);
}
/*
 * Packet-limited FIFO enqueue: accept @skb at the tail while fewer than
 * `limit` packets are queued, otherwise hand the packet to
 * qdisc_reshape_fail().
 */
static int pfifo_enqueue(struct sk_buff *skb, struct Qdisc* sch)
{
	struct fifo_sched_data *fifo = qdisc_priv(sch);

	if (unlikely(skb_queue_len(&sch->q) >= fifo->limit))
		return qdisc_reshape_fail(skb, sch);

	return qdisc_enqueue_tail(skb, sch);
}
/*
 * Initialise (or change) the FIFO limit.
 *
 * With an attribute, the limit is taken from the struct tc_fifo_qopt
 * payload; -EINVAL is returned when the payload is too short.  Without
 * one, the limit defaults to the device tx_queue_len (or 1 when that is
 * zero), multiplied by psched_mtu() of the device for the byte FIFO.
 *
 * Returns 0 on success.
 */
static int fifo_init(struct Qdisc *sch, struct nlattr *opt)
{
	struct fifo_sched_data *fifo = qdisc_priv(sch);
	struct tc_fifo_qopt *ctl;
	u32 limit;

	if (opt != NULL) {
		ctl = nla_data(opt);

		if (nla_len(opt) < sizeof(*ctl))
			return -EINVAL;

		fifo->limit = ctl->limit;
		return 0;
	}

	limit = qdisc_dev(sch)->tx_queue_len ? : 1;
	if (sch->ops == &bfifo_qdisc_ops)
		limit *= psched_mtu(qdisc_dev(sch));

	fifo->limit = limit;
	return 0;
}
static int fifo_dump(struct Qdisc *sch, struct sk_buff *skb)
{
struct fifo_sched_data *q = qdisc_priv(sch);
struct tc_fifo_qopt opt = { .limit = q->limit };
NLA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);
return skb->len;
nla_put_failure:
return -1;
}
/* Packet-counting FIFO; fifo_init serves as both init and change handler. */
struct Qdisc_ops pfifo_qdisc_ops __read_mostly = {
	.id		= "pfifo",
	.priv_size	= sizeof(struct fifo_sched_data),
	.enqueue	= pfifo_enqueue,
	.dequeue	= qdisc_dequeue_head,
	.peek		= qdisc_peek_head,
	.drop		= qdisc_queue_drop,
	.init		= fifo_init,
	.reset		= qdisc_reset_queue,
	.change		= fifo_init,
	.dump		= fifo_dump,
	.owner		= THIS_MODULE,
};
EXPORT_SYMBOL(pfifo_qdisc_ops);
/* Byte-counting FIFO; fifo_init serves as both init and change handler. */
struct Qdisc_ops bfifo_qdisc_ops __read_mostly = {
	.id		= "bfifo",
	.priv_size	= sizeof(struct fifo_sched_data),
	.enqueue	= bfifo_enqueue,
	.dequeue	= qdisc_dequeue_head,
	.peek		= qdisc_peek_head,
	.drop		= qdisc_queue_drop,
	.init		= fifo_init,
	.reset		= qdisc_reset_queue,
	.change		= fifo_init,
	.dump		= fifo_dump,
	.owner		= THIS_MODULE,
};
EXPORT_SYMBOL(bfifo_qdisc_ops);
/*
 * Pass a size change message down to an embedded FIFO.
 *
 * A temporary netlink attribute carrying a struct tc_fifo_qopt with the
 * new @limit is built and fed to the qdisc's ->change() handler.
 *
 * Returns 0 without doing anything when @q is not a FIFO, -ENOMEM when
 * the attribute cannot be allocated, otherwise the ->change() result.
 */
int fifo_set_limit(struct Qdisc *q, unsigned int limit)
{
	struct tc_fifo_qopt *qopt;
	struct nlattr *nla;
	int ret;

	/* Hack to avoid sending change message to non-FIFO:
	 * match "?fifo" by skipping the first character of the id.
	 */
	if (strncmp(q->ops->id + 1, "fifo", 4) != 0)
		return 0;

	nla = kmalloc(nla_attr_size(sizeof(struct tc_fifo_qopt)), GFP_KERNEL);
	if (!nla)
		return -ENOMEM;

	nla->nla_type = RTM_NEWQDISC;
	nla->nla_len = nla_attr_size(sizeof(struct tc_fifo_qopt));

	qopt = nla_data(nla);
	qopt->limit = limit;

	ret = q->ops->change(q, nla);
	kfree(nla);

	return ret;
}
EXPORT_SYMBOL(fifo_set_limit);
/*
 * Create a default FIFO child of @sch using @ops and set its limit.
 *
 * The child gets the handle TC_H_MAKE(sch->handle, 1).  Returns the new
 * qdisc, ERR_PTR(-ENOMEM) when creation fails, or ERR_PTR(err) when
 * applying the limit fails (the child is destroyed in that case).
 */
struct Qdisc *fifo_create_dflt(struct Qdisc *sch, struct Qdisc_ops *ops,
			       unsigned int limit)
{
	struct Qdisc *child;
	int err;

	child = qdisc_create_dflt(qdisc_dev(sch), sch->dev_queue,
				  ops, TC_H_MAKE(sch->handle, 1));
	if (!child)
		return ERR_PTR(-ENOMEM);

	err = fifo_set_limit(child, limit);
	if (err < 0) {
		qdisc_destroy(child);
		return ERR_PTR(err);
	}

	return child;
}
EXPORT_SYMBOL(fifo_create_dflt);

View File

@@ -0,0 +1,860 @@
/*
* net/sched/sch_generic.c Generic packet scheduler routines.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
* Jamal Hadi Salim, <hadi@cyberus.ca> 990601
* - Ingress support
*/
#include <linux/bitops.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/init.h>
#include <linux/rcupdate.h>
#include <linux/list.h>
#include <net/pkt_sched.h>
/* Main transmission queue. */
/* Modifications to data participating in scheduling must be protected with
* qdisc_lock(qdisc) spinlock.
*
* The idea is the following:
* - enqueue, dequeue are serialized via qdisc root lock
* - ingress filtering is also serialized via qdisc root lock
* - updates to tree and tree walking are only done under the rtnl mutex.
*/
/*
 * Park @skb in q->gso_skb for a later transmit attempt, count the
 * requeue, and reschedule the qdisc.  Always returns 0.
 */
static inline int dev_requeue_skb(struct sk_buff *skb, struct Qdisc *q)
{
	q->gso_skb = skb;
	q->q.qlen++;		/* it's still part of the queue */
	q->qstats.requeues++;

	__netif_schedule(q);

	return 0;
}
/*
 * Fetch the next packet to transmit.
 *
 * A previously requeued packet (q->gso_skb) takes precedence, but is
 * only released when its tx queue is neither stopped nor frozen;
 * otherwise NULL is returned and the packet stays parked.  With nothing
 * parked, the qdisc's own ->dequeue() is used.
 */
static inline struct sk_buff *dequeue_skb(struct Qdisc *q)
{
	struct sk_buff *parked = q->gso_skb;
	struct netdev_queue *txq;

	if (likely(!parked))
		return q->dequeue(q);

	/* check the reason of requeuing without tx lock first */
	txq = netdev_get_tx_queue(qdisc_dev(q), skb_get_queue_mapping(parked));
	if (netif_tx_queue_stopped(txq) || netif_tx_queue_frozen(txq))
		return NULL;

	q->gso_skb = NULL;
	q->q.qlen--;

	return parked;
}
/*
 * Handle a failed driver trylock (NETDEV_TX_LOCKED).
 *
 * Returns the value of dev_requeue_skb() when another CPU owns the xmit
 * lock, or the current queue length after dropping the packet when this
 * CPU already owns it.
 */
static inline int handle_dev_cpu_collision(struct sk_buff *skb,
					   struct netdev_queue *dev_queue,
					   struct Qdisc *q)
{
	if (likely(dev_queue->xmit_lock_owner != smp_processor_id())) {
		/*
		 * Another cpu is holding lock, requeue & delay xmits for
		 * some time.
		 */
		__get_cpu_var(netdev_rx_stat).cpu_collision++;
		return dev_requeue_skb(skb, q);
	}

	/*
	 * Same CPU holding the lock. It may be a transient configuration
	 * error, when hard_start_xmit() recurses. We detect it by checking
	 * xmit owner and drop the packet when deadloop is detected.
	 * Return OK to try the next skb.
	 */
	kfree_skb(skb);
	if (net_ratelimit())
		printk(KERN_WARNING "Dead loop on netdevice %s, "
		       "fix it urgently!\n", dev_queue->dev->name);

	return qdisc_qlen(q);
}
/*
 * Transmit one skb, and handle the return status as required. Holding the
 * __QDISC_STATE_RUNNING bit guarantees that only one CPU can execute this
 * function.
 *
 * @root_lock is released before the driver is called and re-acquired
 * before the driver's return code is evaluated, so the function both
 * starts and ends with @root_lock held.
 *
 * Returns to the caller:
 * 0 - queue is empty or throttled.
 * >0 - queue is not empty.
 */
int sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q,
struct net_device *dev, struct netdev_queue *txq,
spinlock_t *root_lock)
{
/* Treated as "busy" (requeue) unless the driver is actually called. */
int ret = NETDEV_TX_BUSY;
/* And release qdisc */
spin_unlock(root_lock);
HARD_TX_LOCK(dev, txq, smp_processor_id());
/* Only hand the skb to the driver when the tx queue can take it. */
if (!netif_tx_queue_stopped(txq) &&
!netif_tx_queue_frozen(txq))
ret = dev_hard_start_xmit(skb, dev, txq);
HARD_TX_UNLOCK(dev, txq);
spin_lock(root_lock);
switch (ret) {
case NETDEV_TX_OK:
/* Driver sent out skb successfully */
ret = qdisc_qlen(q);
break;
case NETDEV_TX_LOCKED:
/* Driver try lock failed */
ret = handle_dev_cpu_collision(skb, txq, q);
break;
default:
/* Driver returned NETDEV_TX_BUSY - requeue skb */
/* Any other code is unexpected: warn (rate limited), then requeue. */
if (unlikely (ret != NETDEV_TX_BUSY && net_ratelimit()))
printk(KERN_WARNING "BUG %s code %d qlen %d\n",
dev->name, ret, q->q.qlen);
ret = dev_requeue_skb(skb, q);
break;
}
/* Report "throttled" (0) when the tx queue stopped or froze meanwhile. */
if (ret && (netif_tx_queue_stopped(txq) ||
netif_tx_queue_frozen(txq)))
ret = 0;
return ret;
}
/*
 * NOTE: Called under qdisc_lock(q) with locally disabled BH.
 *
 * __QDISC_STATE_RUNNING guarantees only one CPU can process
 * this qdisc at a time. qdisc_lock(q) serializes queue accesses for
 * this queue.
 *
 * netif_tx_lock serializes accesses to device driver.
 *
 * qdisc_lock(q) and netif_tx_lock are mutually exclusive,
 * if one is grabbed, another must be free.
 *
 * Note, that this procedure can be called by a watchdog timer
 *
 * Dequeues one packet and passes it to sch_direct_xmit().
 *
 * Returns to the caller:
 *   0  - queue is empty or throttled.
 *   >0 - queue is not empty.
 */
static inline int qdisc_restart(struct Qdisc *q)
{
	struct sk_buff *skb = dequeue_skb(q);
	struct netdev_queue *txq;
	struct net_device *dev;

	/* Nothing to send right now. */
	if (unlikely(!skb))
		return 0;

	dev = qdisc_dev(q);
	txq = netdev_get_tx_queue(dev, skb_get_queue_mapping(skb));

	return sch_direct_xmit(skb, q, dev, txq, qdisc_lock(q));
}
/*
 * Keep transmitting from @q until qdisc_restart() reports nothing more
 * to do, then clear __QDISC_STATE_RUNNING.
 */
void __qdisc_run(struct Qdisc *q)
{
	const unsigned long started = jiffies;

	for (;;) {
		if (!qdisc_restart(q))
			break;

		/*
		 * Postpone processing if
		 * 1. another process needs the CPU;
		 * 2. we've been doing it for too long.
		 */
		if (need_resched() || jiffies != started) {
			__netif_schedule(q);
			break;
		}
	}

	clear_bit(__QDISC_STATE_RUNNING, &q->state);
}
/*
 * Return the most recent transmit timestamp of @dev, considering both
 * dev->trans_start and every tx queue's non-zero trans_start.  The
 * result is also written back to dev->trans_start.
 */
unsigned long dev_trans_start(struct net_device *dev)
{
	unsigned long latest = dev->trans_start;
	unsigned int idx;

	for (idx = 0; idx < dev->num_tx_queues; idx++) {
		unsigned long stamp = netdev_get_tx_queue(dev, idx)->trans_start;

		if (stamp && time_after(stamp, latest))
			latest = stamp;
	}

	dev->trans_start = latest;
	return latest;
}
EXPORT_SYMBOL(dev_trans_start);
/*
 * Transmit watchdog timer callback; @arg is the struct net_device pointer
 * passed as an unsigned long.
 *
 * Under netif_tx_lock, and only for a present, running device with
 * carrier whose qdisc is not the noop one, every tx queue is checked: a
 * stopped queue whose last transmit is older than watchdog_timeo triggers
 * a one-time warning and the driver's ndo_tx_timeout() handler.  The
 * timer is then re-armed.
 */
static void dev_watchdog(unsigned long arg)
{
struct net_device *dev = (struct net_device *)arg;
netif_tx_lock(dev);
if (!qdisc_tx_is_noop(dev)) {
if (netif_device_present(dev) &&
netif_running(dev) &&
netif_carrier_ok(dev)) {
int some_queue_timedout = 0;
unsigned int i;
unsigned long trans_start;
for (i = 0; i < dev->num_tx_queues; i++) {
struct netdev_queue *txq;
txq = netdev_get_tx_queue(dev, i);
/*
 * old device drivers set dev->trans_start, so fall back to it
 * when the per-queue timestamp is zero
 */
trans_start = txq->trans_start ? : dev->trans_start;
if (netif_tx_queue_stopped(txq) &&
time_after(jiffies, (trans_start +
dev->watchdog_timeo))) {
some_queue_timedout = 1;
/* i keeps the index of the timed-out queue for the warning below */
break;
}
}
if (some_queue_timedout) {
char drivername[64];
WARN_ONCE(1, KERN_INFO "NETDEV WATCHDOG: %s (%s): transmit queue %u timed out\n",
dev->name, netdev_drivername(dev, drivername, 64), i);
dev->netdev_ops->ndo_tx_timeout(dev);
}
/* Re-arm; take a device reference when mod_timer() returns 0. */
if (!mod_timer(&dev->watchdog_timer,
round_jiffies(jiffies +
dev->watchdog_timeo)))
dev_hold(dev);
}
}
netif_tx_unlock(dev);
/* Drop one device reference for this timer run. */
dev_put(dev);
}
/*
 * Arm the transmit watchdog of @dev.  Does nothing for drivers without
 * an ndo_tx_timeout handler.  A non-positive watchdog_timeo is replaced
 * by 5*HZ.  A device reference is taken when mod_timer() returns 0.
 */
void __netdev_watchdog_up(struct net_device *dev)
{
	if (!dev->netdev_ops->ndo_tx_timeout)
		return;

	if (dev->watchdog_timeo <= 0)
		dev->watchdog_timeo = 5*HZ;

	if (!mod_timer(&dev->watchdog_timer,
		       round_jiffies(jiffies + dev->watchdog_timeo)))
		dev_hold(dev);
}
/* File-local wrapper around __netdev_watchdog_up(). */
static void dev_watchdog_up(struct net_device *dev)
{
	__netdev_watchdog_up(dev);
}
/*
 * Stop the transmit watchdog of @dev under netif_tx_lock_bh.  A device
 * reference is dropped when del_timer() reports a non-zero result.
 */
static void dev_watchdog_down(struct net_device *dev)
{
	netif_tx_lock_bh(dev);

	if (del_timer(&dev->watchdog_timer))
		dev_put(dev);

	netif_tx_unlock_bh(dev);
}
/**
 *	netif_carrier_on - set carrier
 *	@dev: network device
 *
 * Device has detected that carrier is present.  When the NOCARRIER bit
 * was set, it is cleared; for an initialized device a linkwatch event is
 * fired and, if the device is running, the tx watchdog is started.
 */
void netif_carrier_on(struct net_device *dev)
{
	/* Nothing to do when the carrier was already reported as on. */
	if (!test_and_clear_bit(__LINK_STATE_NOCARRIER, &dev->state))
		return;

	if (dev->reg_state == NETREG_UNINITIALIZED)
		return;

	linkwatch_fire_event(dev);

	if (netif_running(dev))
		__netdev_watchdog_up(dev);
}
EXPORT_SYMBOL(netif_carrier_on);
/**
 *	netif_carrier_off - clear carrier
 *	@dev: network device
 *
 * Device has detected loss of carrier.  When the NOCARRIER bit was not
 * yet set, it is set and, for an initialized device, a linkwatch event
 * is fired.
 */
void netif_carrier_off(struct net_device *dev)
{
	/* Nothing to do when the carrier was already reported as off. */
	if (test_and_set_bit(__LINK_STATE_NOCARRIER, &dev->state))
		return;

	if (dev->reg_state == NETREG_UNINITIALIZED)
		return;

	linkwatch_fire_event(dev);
}
EXPORT_SYMBOL(netif_carrier_off);
/**
 *	netif_notify_peers - notify network peers about existence of @dev
 *	@dev: network device
 *
 * Generate traffic such that interested network peers are aware of
 * @dev, such as by generating a gratuitous ARP. This may be used when
 * a device wants to inform the rest of the network about some sort of
 * reconfiguration such as a failover event or virtual machine
 * migration.
 *
 * Implemented by raising NETDEV_NOTIFY_PEERS on the netdevice notifier
 * chain while holding the rtnl lock.
 */
void netif_notify_peers(struct net_device *dev)
{
	rtnl_lock();

	call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);

	rtnl_unlock();
}
EXPORT_SYMBOL(netif_notify_peers);
/*
 * "NOOP" scheduler: the best scheduler, recommended for all interfaces
 * under all circumstances. It is difficult to invent anything faster or
 * cheaper.
 *
 * Enqueue simply frees the packet and reports NET_XMIT_CN.
 */
static int noop_enqueue(struct sk_buff *skb, struct Qdisc * qdisc)
{
	kfree_skb(skb);

	return NET_XMIT_CN;
}
/* The noop scheduler never holds packets, so there is never one to return. */
static struct sk_buff *noop_dequeue(struct Qdisc * qdisc)
{
	return NULL;
}
/* Operations of the "noop" qdisc; peek shares the dequeue handler. */
struct Qdisc_ops noop_qdisc_ops __read_mostly = {
	.id		= "noop",
	.priv_size	= 0,
	.enqueue	= noop_enqueue,
	.dequeue	= noop_dequeue,
	.peek		= noop_dequeue,
	.owner		= THIS_MODULE,
};
/* Placeholder tx queue that the built-in noop qdisc points back to. */
static struct netdev_queue noop_netdev_queue = {
	.qdisc			= &noop_qdisc,
	.qdisc_sleeping		= &noop_qdisc,
};
/* Statically allocated, built-in instance of the noop qdisc. */
struct Qdisc noop_qdisc = {
	.enqueue	= noop_enqueue,
	.dequeue	= noop_dequeue,
	.flags		= TCQ_F_BUILTIN,
	.ops		= &noop_qdisc_ops,
	.list		= LIST_HEAD_INIT(noop_qdisc.list),
	.q.lock		= __SPIN_LOCK_UNLOCKED(noop_qdisc.q.lock),
	.dev_queue	= &noop_netdev_queue,
};
EXPORT_SYMBOL(noop_qdisc);
/* Operations of the "noqueue" qdisc; handlers are shared with noop. */
static struct Qdisc_ops noqueue_qdisc_ops __read_mostly = {
	.id		= "noqueue",
	.priv_size	= 0,
	.enqueue	= noop_enqueue,
	.dequeue	= noop_dequeue,
	.peek		= noop_dequeue,
	.owner		= THIS_MODULE,
};
/* Tentative definition so the queue below can reference the qdisc. */
static struct Qdisc noqueue_qdisc;

/* Placeholder tx queue that the built-in noqueue qdisc points back to. */
static struct netdev_queue noqueue_netdev_queue = {
	.qdisc			= &noqueue_qdisc,
	.qdisc_sleeping		= &noqueue_qdisc,
};
/*
 * Statically allocated, built-in noqueue qdisc instance.  Unlike
 * noop_qdisc, its per-instance enqueue hook is left NULL.
 */
static struct Qdisc noqueue_qdisc = {
	.enqueue	= NULL,
	.dequeue	= noop_dequeue,
	.flags		= TCQ_F_BUILTIN,
	.ops		= &noqueue_qdisc_ops,
	.list		= LIST_HEAD_INIT(noqueue_qdisc.list),
	.q.lock		= __SPIN_LOCK_UNLOCKED(noqueue_qdisc.q.lock),
	.dev_queue	= &noqueue_netdev_queue,
};
/* Maps skb->priority (masked with TC_PRIO_MAX) to a pfifo_fast band. */
static const u8 prio2band[TC_PRIO_MAX+1] = {
	1, 2, 2, 2, 1, 2, 0, 0,
	1, 1, 1, 1, 1, 1, 1, 1
};
/*
 * 3-band FIFO queue: old style, but should be a bit faster than
 * generic prio+fifo combination.
 */
#define PFIFO_FAST_BANDS 3

/*
 * Private data for a pfifo_fast scheduler containing:
 *	- a bitmap indicating which of the bands contain skbs
 *	- queues for the three bands
 */
struct pfifo_fast_priv {
	u32 bitmap;				/* bit N set: band N non-empty */
	struct sk_buff_head q[PFIFO_FAST_BANDS];	/* one list per band */
};
/*
 * Convert a bitmap to the first band number where an skb is queued, where:
 *	bitmap=0 means there are no skbs on any band.
 *	bitmap=1 means there is an skb on band 0.
 *	bitmap=7 means there are skbs on all 3 bands, etc.
 *
 * The entry is the index of the lowest set bit, or -1 for an empty bitmap.
 */
static const int bitmap2band[] = {
	-1,	/* 000: nothing queued */
	 0,	/* 001 */
	 1,	/* 010 */
	 0,	/* 011 */
	 2,	/* 100 */
	 0,	/* 101 */
	 1,	/* 110 */
	 0,	/* 111 */
};
/* Return the skb list that backs @band of @priv. */
static inline struct sk_buff_head *band2list(struct pfifo_fast_priv *priv,
					     int band)
{
	return &priv->q[band];
}
/*
 * Enqueue @skb on the band selected by its priority.  The packet is
 * dropped through qdisc_drop() once the qdisc already holds
 * tx_queue_len packets.
 */
static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc* qdisc)
{
	struct pfifo_fast_priv *priv;
	int band;

	if (skb_queue_len(&qdisc->q) >= qdisc_dev(qdisc)->tx_queue_len)
		return qdisc_drop(skb, qdisc);

	band = prio2band[skb->priority & TC_PRIO_MAX];
	priv = qdisc_priv(qdisc);

	/* Mark the band as populated and account the packet. */
	priv->bitmap |= (1 << band);
	qdisc->q.qlen++;

	return __qdisc_enqueue_tail(skb, qdisc, band2list(priv, band));
}
/*
 * Dequeue from the lowest-numbered non-empty band, clearing the band's
 * bitmap bit once its list runs empty.  Returns NULL when the bitmap
 * says nothing is queued.
 */
static struct sk_buff *pfifo_fast_dequeue(struct Qdisc* qdisc)
{
	struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
	int band = bitmap2band[priv->bitmap];
	struct sk_buff_head *list;
	struct sk_buff *skb;

	if (unlikely(band < 0))
		return NULL;

	list = band2list(priv, band);
	skb = __qdisc_dequeue_head(qdisc, list);

	qdisc->q.qlen--;
	if (skb_queue_empty(list))
		priv->bitmap &= ~(1 << band);

	return skb;
}
/*
 * Look at the head of the lowest-numbered non-empty band without
 * removing it; NULL when the bitmap says nothing is queued.
 */
static struct sk_buff *pfifo_fast_peek(struct Qdisc* qdisc)
{
	struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
	int band = bitmap2band[priv->bitmap];

	if (band < 0)
		return NULL;

	return skb_peek(band2list(priv, band));
}
/* Flush every band and zero the bitmap, backlog and queue length. */
static void pfifo_fast_reset(struct Qdisc* qdisc)
{
	struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
	int band;

	for (band = 0; band < PFIFO_FAST_BANDS; band++)
		__qdisc_reset_queue(qdisc, band2list(priv, band));

	priv->bitmap = 0;
	qdisc->qstats.backlog = 0;
	qdisc->q.qlen = 0;
}
static int pfifo_fast_dump(struct Qdisc *qdisc, struct sk_buff *skb)
{
struct tc_prio_qopt opt = { .bands = PFIFO_FAST_BANDS };
memcpy(&opt.priomap, prio2band, TC_PRIO_MAX+1);
NLA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);
return skb->len;
nla_put_failure:
return -1;
}
/* Initialise the per-band skb lists; @opt is ignored.  Always returns 0. */
static int pfifo_fast_init(struct Qdisc *qdisc, struct nlattr *opt)
{
	struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
	int band;

	for (band = 0; band < PFIFO_FAST_BANDS; band++)
		skb_queue_head_init(band2list(priv, band));

	return 0;
}
/* Operations of the three-band "pfifo_fast" qdisc. */
struct Qdisc_ops pfifo_fast_ops __read_mostly = {
	.id		= "pfifo_fast",
	.priv_size	= sizeof(struct pfifo_fast_priv),
	.enqueue	= pfifo_fast_enqueue,
	.dequeue	= pfifo_fast_dequeue,
	.peek		= pfifo_fast_peek,
	.init		= pfifo_fast_init,
	.reset		= pfifo_fast_reset,
	.dump		= pfifo_fast_dump,
	.owner		= THIS_MODULE,
};
/*
 * Allocate and minimally initialise a qdisc for @dev_queue using @ops.
 *
 * The Qdisc and its private area share one zeroed allocation; the Qdisc
 * is placed at a QDISC_ALIGN()ed address inside it and the offset is
 * remembered in ->padded so that the original pointer can be recovered.
 * A reference on the device is taken and the refcount starts at 1.
 *
 * Returns the qdisc, or ERR_PTR(-ENOBUFS) when the allocation fails.
 */
struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue,
			  struct Qdisc_ops *ops)
{
	struct Qdisc *sch;
	unsigned int size;
	void *raw;

	/* ensure that the Qdisc and the private data are 32-byte aligned */
	size = QDISC_ALIGN(sizeof(*sch));
	size += ops->priv_size + (QDISC_ALIGNTO - 1);

	raw = kzalloc(size, GFP_KERNEL);
	if (!raw)
		return ERR_PTR(-ENOBUFS);

	sch = (struct Qdisc *) QDISC_ALIGN((unsigned long) raw);
	sch->padded = (char *) sch - (char *) raw;

	INIT_LIST_HEAD(&sch->list);
	skb_queue_head_init(&sch->q);

	sch->ops = ops;
	sch->enqueue = ops->enqueue;
	sch->dequeue = ops->dequeue;
	sch->dev_queue = dev_queue;

	dev_hold(qdisc_dev(sch));
	atomic_set(&sch->refcnt, 1);

	return sch;
}
/*
 * Allocate a qdisc with default parameters: ->init(), when present, is
 * called with a NULL option attribute.
 *
 * Returns the new qdisc, or NULL when allocation or ->init() fails (the
 * qdisc is destroyed in the latter case).
 */
struct Qdisc * qdisc_create_dflt(struct net_device *dev,
				 struct netdev_queue *dev_queue,
				 struct Qdisc_ops *ops,
				 unsigned int parentid)
{
	struct Qdisc *sch = qdisc_alloc(dev_queue, ops);

	if (IS_ERR(sch))
		return NULL;

	sch->parent = parentid;

	if (ops->init && ops->init(sch, NULL) != 0) {
		qdisc_destroy(sch);
		return NULL;
	}

	return sch;
}
EXPORT_SYMBOL(qdisc_create_dflt);
/*
 * Under qdisc_lock(qdisc) and BH!
 *
 * Run the qdisc's ->reset() hook, if any, then free a parked gso_skb
 * and zero the queue length when one was present.
 */
void qdisc_reset(struct Qdisc *qdisc)
{
	const struct Qdisc_ops *ops = qdisc->ops;

	if (ops->reset)
		ops->reset(qdisc);

	if (!qdisc->gso_skb)
		return;

	kfree_skb(qdisc->gso_skb);
	qdisc->gso_skb = NULL;
	qdisc->q.qlen = 0;
}
EXPORT_SYMBOL(qdisc_reset);
/*
 * Drop one reference on @qdisc and tear it down when it was the last.
 * Built-in qdiscs (TCQ_F_BUILTIN) are never destroyed.
 */
void qdisc_destroy(struct Qdisc *qdisc)
{
	const struct Qdisc_ops *ops = qdisc->ops;

	if (qdisc->flags & TCQ_F_BUILTIN)
		return;
	if (!atomic_dec_and_test(&qdisc->refcnt))
		return;

#ifdef CONFIG_NET_SCHED
	qdisc_list_del(qdisc);

	qdisc_put_stab(qdisc->stab);
#endif
	gen_kill_estimator(&qdisc->bstats, &qdisc->rate_est);

	if (ops->reset)
		ops->reset(qdisc);
	if (ops->destroy)
		ops->destroy(qdisc);

	module_put(ops->owner);
	dev_put(qdisc_dev(qdisc));

	kfree_skb(qdisc->gso_skb);
	/* Undo the alignment offset applied when the qdisc was allocated. */
	kfree((char *) qdisc - qdisc->padded);
}
EXPORT_SYMBOL(qdisc_destroy);
/* Attach toplevel qdisc to device queue. */
struct Qdisc *dev_graft_qdisc(struct netdev_queue *dev_queue,
struct Qdisc *qdisc)
{
struct Qdisc *oqdisc = dev_queue->qdisc_sleeping;
spinlock_t *root_lock;
root_lock = qdisc_lock(oqdisc);
spin_lock_bh(root_lock);
/* Prune old scheduler */
if (oqdisc && atomic_read(&oqdisc->refcnt) <= 1)
qdisc_reset(oqdisc);
/* ... and graft new one */
if (qdisc == NULL)
qdisc = &noop_qdisc;
dev_queue->qdisc_sleeping = qdisc;
rcu_assign_pointer(dev_queue->qdisc, &noop_qdisc);
spin_unlock_bh(root_lock);
return oqdisc;
}
/*
 * Per-queue callback: install the default sleeping qdisc on @dev_queue.
 * Devices with a non-zero tx_queue_len get a fresh pfifo_fast instance,
 * all others share noqueue_qdisc.  On allocation failure a message is
 * logged and the queue is left untouched.
 */
static void attach_one_default_qdisc(struct net_device *dev,
				     struct netdev_queue *dev_queue,
				     void *_unused)
{
	struct Qdisc *qdisc = &noqueue_qdisc;

	if (dev->tx_queue_len) {
		qdisc = qdisc_create_dflt(dev, dev_queue,
					  &pfifo_fast_ops, TC_H_ROOT);
		if (!qdisc) {
			printk(KERN_INFO "%s: activation failed\n", dev->name);
			return;
		}

		/* Can by-pass the queue discipline for default qdisc */
		qdisc->flags |= TCQ_F_CAN_BYPASS;
	}

	dev_queue->qdisc_sleeping = qdisc;
}
/*
 * Choose the default root qdisc of @dev.  Multiqueue devices with a
 * non-zero tx_queue_len get an mq root; everything else gets per-queue
 * defaults, with queue 0's sleeping qdisc doubling as dev->qdisc (and
 * gaining a reference for that).
 */
static void attach_default_qdiscs(struct net_device *dev)
{
	struct netdev_queue *txq = netdev_get_tx_queue(dev, 0);
	struct Qdisc *root;

	if (netif_is_multiqueue(dev) && dev->tx_queue_len != 0) {
		root = qdisc_create_dflt(dev, txq, &mq_qdisc_ops, TC_H_ROOT);
		if (root) {
			root->ops->attach(root);
			dev->qdisc = root;
		}
		return;
	}

	netdev_for_each_tx_queue(dev, attach_one_default_qdisc, NULL);
	dev->qdisc = txq->qdisc_sleeping;
	atomic_inc(&dev->qdisc->refcnt);
}
static void transition_one_qdisc(struct net_device *dev,
struct netdev_queue *dev_queue,
void *_need_watchdog)
{
struct Qdisc *new_qdisc = dev_queue->qdisc_sleeping;
int *need_watchdog_p = _need_watchdog;
if (!(new_qdisc->flags & TCQ_F_BUILTIN))
clear_bit(__QDISC_STATE_DEACTIVATED, &new_qdisc->state);
rcu_assign_pointer(dev_queue->qdisc, new_qdisc);
if (need_watchdog_p && new_qdisc != &noqueue_qdisc) {
dev_queue->trans_start = 0;
*need_watchdog_p = 1;
}
}
/*
 * Activate the qdiscs of @dev: attach defaults if none are configured,
 * then (provided the carrier is up) switch every queue to its sleeping
 * qdisc and start the tx watchdog when any queue needs it.
 */
void dev_activate(struct net_device *dev)
{
	int need_watchdog = 0;

	/* No queueing discipline is attached to device;
	 * create default one i.e. pfifo_fast for devices,
	 * which need queueing and noqueue_qdisc for
	 * virtual interfaces
	 */
	if (dev->qdisc == &noop_qdisc)
		attach_default_qdiscs(dev);

	/* Delay activation until next carrier-on event */
	if (!netif_carrier_ok(dev))
		return;

	netdev_for_each_tx_queue(dev, transition_one_qdisc, &need_watchdog);
	transition_one_qdisc(dev, &dev->rx_queue, NULL);

	if (need_watchdog) {
		dev->trans_start = jiffies;
		dev_watchdog_up(dev);
	}
}
/*
 * Per-queue callback: under the qdisc lock, mark the active qdisc as
 * deactivated (unless built-in), replace it with @_qdisc_default and
 * reset it.  Queues without an active qdisc are skipped.
 */
static void dev_deactivate_queue(struct net_device *dev,
				 struct netdev_queue *dev_queue,
				 void *_qdisc_default)
{
	struct Qdisc *qdisc_default = _qdisc_default;
	struct Qdisc *active = dev_queue->qdisc;

	if (!active)
		return;

	spin_lock_bh(qdisc_lock(active));

	if (!(active->flags & TCQ_F_BUILTIN))
		set_bit(__QDISC_STATE_DEACTIVATED, &active->state);

	rcu_assign_pointer(dev_queue->qdisc, qdisc_default);
	qdisc_reset(active);

	spin_unlock_bh(qdisc_lock(active));
}
static bool some_qdisc_is_busy(struct net_device *dev)
{
unsigned int i;
for (i = 0; i < dev->num_tx_queues; i++) {
struct netdev_queue *dev_queue;
spinlock_t *root_lock;
struct Qdisc *q;
int val;
dev_queue = netdev_get_tx_queue(dev, i);
q = dev_queue->qdisc_sleeping;
root_lock = qdisc_lock(q);
spin_lock_bh(root_lock);
val = (test_bit(__QDISC_STATE_RUNNING, &q->state) ||
test_bit(__QDISC_STATE_SCHED, &q->state));
spin_unlock_bh(root_lock);
if (val)
return true;
}
return false;
}
/*
 * Deactivate all qdiscs of @dev: swap every queue to noop_qdisc, stop
 * the tx watchdog and wait until no qdisc is busy any more.
 */
void dev_deactivate(struct net_device *dev)
{
	netdev_for_each_tx_queue(dev, dev_deactivate_queue, &noop_qdisc);
	dev_deactivate_queue(dev, &dev->rx_queue, &noop_qdisc);

	dev_watchdog_down(dev);

	/* Wait for outstanding qdisc-less dev_queue_xmit calls. */
	synchronize_rcu();

	/* Wait for outstanding qdisc_run calls. */
	while (some_qdisc_is_busy(dev))
		yield();
}
/* Per-queue callback: point both qdisc slots of @dev_queue at @_qdisc. */
static void dev_init_scheduler_queue(struct net_device *dev,
				     struct netdev_queue *dev_queue,
				     void *_qdisc)
{
	struct Qdisc *initial = _qdisc;

	dev_queue->qdisc = initial;
	dev_queue->qdisc_sleeping = initial;
}
/*
 * Put @dev into its initial scheduling state: noop_qdisc everywhere
 * (root, every tx queue, the rx queue) and a watchdog timer set up to
 * call dev_watchdog() with the device as argument.
 */
void dev_init_scheduler(struct net_device *dev)
{
	dev->qdisc = &noop_qdisc;

	netdev_for_each_tx_queue(dev, dev_init_scheduler_queue, &noop_qdisc);
	dev_init_scheduler_queue(dev, &dev->rx_queue, &noop_qdisc);

	setup_timer(&dev->watchdog_timer, dev_watchdog, (unsigned long)dev);
}
/*
 * Per-queue callback: replace both qdisc slots of @dev_queue with
 * @_qdisc_default and destroy the previous sleeping qdisc, if any.
 */
static void shutdown_scheduler_queue(struct net_device *dev,
				     struct netdev_queue *dev_queue,
				     void *_qdisc_default)
{
	struct Qdisc *qdisc_default = _qdisc_default;
	struct Qdisc *doomed = dev_queue->qdisc_sleeping;

	if (!doomed)
		return;

	rcu_assign_pointer(dev_queue->qdisc, qdisc_default);
	dev_queue->qdisc_sleeping = qdisc_default;

	qdisc_destroy(doomed);
}
/*
 * Tear down all qdiscs of @dev, leaving noop_qdisc in every slot, and
 * warn if the tx watchdog timer is still pending afterwards.
 */
void dev_shutdown(struct net_device *dev)
{
	netdev_for_each_tx_queue(dev, shutdown_scheduler_queue, &noop_qdisc);
	shutdown_scheduler_queue(dev, &dev->rx_queue, &noop_qdisc);

	qdisc_destroy(dev->qdisc);
	dev->qdisc = &noop_qdisc;

	WARN_ON(timer_pending(&dev->watchdog_timer));
}

608
kernel/net/sched/sch_gred.c Normal file
View File

@@ -0,0 +1,608 @@
/*
* net/sched/sch_gred.c Generic Random Early Detection queue.
*
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
* Authors: J Hadi Salim (hadi@cyberus.ca) 1998-2002
*
* 991129: - Bug fix with grio mode
* - a better sing. AvgQ mode with Grio(WRED)
* - A finer grained VQ dequeue based on suggestion
* from Ren Liu
* - More error checks
*
* For all the glorious comments look at include/net/red.h
*/
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/skbuff.h>
#include <net/pkt_sched.h>
#include <net/red.h>
/* Priority used for a virtual queue when none is configured. */
#define GRED_DEF_PRIO	(MAX_DPs / 2)
/* Mask applied to skb->tc_index to obtain the virtual queue number. */
#define GRED_VQ_MASK	(MAX_DPs - 1)

struct gred_sched_data;
struct gred_sched;
/* State of one GRED virtual queue (one drop precedence). */
struct gred_sched_data {
	u32		limit;		/* HARD maximal queue length	*/
	u32		DP;		/* the drop parameters		*/
	u32		bytesin;	/* bytes seen on virtualQ so far*/
	u32		packetsin;	/* packets seen on virtualQ so far*/
	u32		backlog;	/* bytes on the virtualQ	*/
	u8		prio;		/* the prio of this vq		*/

	struct red_parms parms;
	struct red_stats stats;
};
/* Bit numbers within gred_sched.flags selecting the operating mode. */
enum {
	GRED_WRED_MODE = 1,
	GRED_RIO_MODE = 2,
};
/* Per-qdisc GRED state: the table of virtual queues plus global settings. */
struct gred_sched {
	struct gred_sched_data *tab[MAX_DPs];	/* virtual queues, may be NULL */
	unsigned long	flags;		/* GRED_WRED_MODE / GRED_RIO_MODE bits */
	u32		red_flags;	/* TC_RED_* flags		*/
	u32		DPs;		/* number of DPs in use		*/
	u32		def;		/* default DP			*/
	struct red_parms wred_set;	/* shared averaging state (WRED mode) */
};
/* Non-zero when the WRED mode bit is set in @table. */
static inline int gred_wred_mode(struct gred_sched *table)
{
	return test_bit(GRED_WRED_MODE, &table->flags);
}
/* Set the WRED mode bit in @table (non-atomic bit operation). */
static inline void gred_enable_wred_mode(struct gred_sched *table)
{
	__set_bit(GRED_WRED_MODE, &table->flags);
}
/* Clear the WRED mode bit in @table (non-atomic bit operation). */
static inline void gred_disable_wred_mode(struct gred_sched *table)
{
	__clear_bit(GRED_WRED_MODE, &table->flags);
}
/* Non-zero when the RIO mode bit is set in @table. */
static inline int gred_rio_mode(struct gred_sched *table)
{
	return test_bit(GRED_RIO_MODE, &table->flags);
}
/* Set the RIO mode bit in @table (non-atomic bit operation). */
static inline void gred_enable_rio_mode(struct gred_sched *table)
{
	__set_bit(GRED_RIO_MODE, &table->flags);
}
/* Clear the RIO mode bit in @table (non-atomic bit operation). */
static inline void gred_disable_rio_mode(struct gred_sched *table)
{
	__clear_bit(GRED_RIO_MODE, &table->flags);
}
/*
 * Return 1 when at least two distinct configured virtual queues share
 * the same priority, 0 otherwise.
 */
static inline int gred_wred_mode_check(struct Qdisc *sch)
{
	struct gred_sched *table = qdisc_priv(sch);
	int outer, inner;

	/* Really ugly O(n^2) but shouldn't be necessary too frequent. */
	for (outer = 0; outer < table->DPs; outer++) {
		struct gred_sched_data *vq = table->tab[outer];

		if (!vq)
			continue;

		for (inner = 0; inner < table->DPs; inner++) {
			struct gred_sched_data *other = table->tab[inner];

			if (other && other != vq && other->prio == vq->prio)
				return 1;
		}
	}

	return 0;
}
/*
 * Backlog used for averaging: the whole qdisc's backlog in WRED mode,
 * otherwise the backlog of the virtual queue @q.
 */
static inline unsigned int gred_backlog(struct gred_sched *table,
					struct gred_sched_data *q,
					struct Qdisc *sch)
{
	return gred_wred_mode(table) ? sch->qstats.backlog : q->backlog;
}
/* Extract the virtual queue (DP) number from the low bits of tc_index. */
static inline u16 tc_index_to_dp(struct sk_buff *skb)
{
	return skb->tc_index & GRED_VQ_MASK;
}
static inline void gred_load_wred_set(struct gred_sched *table,
struct gred_sched_data *q)
{
q->parms.qavg = table->wred_set.qavg;
q->parms.qidlestart = table->wred_set.qidlestart;
}
/*
 * Copy the averaging state of @q back into the shared WRED set.
 *
 * Both fields that gred_load_wred_set() loads are written back.  If
 * qidlestart were left out, an idle period ended on @q would never be
 * reflected in the shared set, and every later enqueue would reload the
 * stale idle start and treat the queue as still idling.
 */
static inline void gred_store_wred_set(struct gred_sched *table,
				       struct gred_sched_data *q)
{
	table->wred_set.qavg = q->parms.qavg;
	table->wred_set.qidlestart = q->parms.qidlestart;
}
/* Non-zero when TC_RED_ECN is set in the qdisc's RED flags. */
static inline int gred_use_ecn(struct gred_sched *t)
{
	return t->red_flags & TC_RED_ECN;
}
/* Non-zero when TC_RED_HARDDROP is set in the qdisc's RED flags. */
static inline int gred_use_harddrop(struct gred_sched *t)
{
	return t->red_flags & TC_RED_HARDDROP;
}
/*
 * Enqueue @skb on the virtual queue selected by its tc_index.
 *
 * Packets whose DP is out of range or unconfigured fall back to the
 * default DP; without a configured default they bypass RED and are only
 * bounded by the device tx_queue_len.  Otherwise the average queue size
 * is updated and red_action() decides whether the packet is accepted,
 * ECN-marked or dropped; accepted packets must still fit the VQ limit.
 *
 * Returns the qdisc_enqueue_tail()/qdisc_drop() result, or NET_XMIT_CN
 * for a congestion drop.
 */
static int gred_enqueue(struct sk_buff *skb, struct Qdisc* sch)
{
struct gred_sched_data *q=NULL;
struct gred_sched *t= qdisc_priv(sch);
/* Extra average contributed by higher-priority VQs (RIO mode only). */
unsigned long qavg = 0;
u16 dp = tc_index_to_dp(skb);
if (dp >= t->DPs || (q = t->tab[dp]) == NULL) {
dp = t->def;
if ((q = t->tab[dp]) == NULL) {
/* Pass through packets not assigned to a DP
 * if no default DP has been configured. This
 * allows for DP flows to be left untouched.
 */
if (skb_queue_len(&sch->q) < qdisc_dev(sch)->tx_queue_len)
return qdisc_enqueue_tail(skb, sch);
else
goto drop;
}
/* fix tc_index? --could be controversial but needed for
   requeueing */
skb->tc_index = (skb->tc_index & ~GRED_VQ_MASK) | dp;
}
/* sum up all the qaves of prios < ours to get the new qave */
if (!gred_wred_mode(t) && gred_rio_mode(t)) {
int i;
for (i = 0; i < t->DPs; i++) {
if (t->tab[i] && t->tab[i]->prio < q->prio &&
!red_is_idling(&t->tab[i]->parms))
qavg +=t->tab[i]->parms.qavg;
}
}
q->packetsin++;
q->bytesin += qdisc_pkt_len(skb);
/* In WRED mode the averaging state is shared between all VQs. */
if (gred_wred_mode(t))
gred_load_wred_set(t, q);
q->parms.qavg = red_calc_qavg(&q->parms, gred_backlog(t, q, sch));
if (red_is_idling(&q->parms))
red_end_of_idle_period(&q->parms);
if (gred_wred_mode(t))
gred_store_wred_set(t, q);
switch (red_action(&q->parms, q->parms.qavg + qavg)) {
case RED_DONT_MARK:
break;
case RED_PROB_MARK:
sch->qstats.overlimits++;
/* Drop unless ECN is enabled and the packet could be CE-marked. */
if (!gred_use_ecn(t) || !INET_ECN_set_ce(skb)) {
q->stats.prob_drop++;
goto congestion_drop;
}
q->stats.prob_mark++;
break;
case RED_HARD_MARK:
sch->qstats.overlimits++;
/* Hard-drop mode always drops; otherwise same rule as above. */
if (gred_use_harddrop(t) || !gred_use_ecn(t) ||
!INET_ECN_set_ce(skb)) {
q->stats.forced_drop++;
goto congestion_drop;
}
q->stats.forced_mark++;
break;
}
/* Accepted by RED: enforce the hard per-VQ byte limit. */
if (q->backlog + qdisc_pkt_len(skb) <= q->limit) {
q->backlog += qdisc_pkt_len(skb);
return qdisc_enqueue_tail(skb, sch);
}
q->stats.pdrop++;
drop:
return qdisc_drop(skb, sch);
congestion_drop:
qdisc_drop(skb, sch);
return NET_XMIT_CN;
}
/*
 * Dequeue the head packet and charge it off its virtual queue's backlog.
 *
 * A VQ that becomes empty starts its idle period (outside WRED mode).
 * When the qdisc itself is empty in WRED mode, the shared set starts
 * idling unless it already is.  A packet whose VQ can no longer be found
 * is still returned, after a rate-limited warning.
 */
static struct sk_buff *gred_dequeue(struct Qdisc* sch)
{
	struct gred_sched *t = qdisc_priv(sch);
	struct sk_buff *skb = qdisc_dequeue_head(sch);
	struct gred_sched_data *q;
	u16 dp;

	if (!skb) {
		if (gred_wred_mode(t) && !red_is_idling(&t->wred_set))
			red_start_of_idle_period(&t->wred_set);
		return NULL;
	}

	dp = tc_index_to_dp(skb);
	if (dp >= t->DPs || (q = t->tab[dp]) == NULL) {
		if (net_ratelimit())
			printk(KERN_WARNING "GRED: Unable to relocate "
			       "VQ 0x%x after dequeue, screwing up "
			       "backlog.\n", tc_index_to_dp(skb));
		return skb;
	}

	q->backlog -= qdisc_pkt_len(skb);
	if (!q->backlog && !gred_wred_mode(t))
		red_start_of_idle_period(&q->parms);

	return skb;
}
/*
 * Drop the packet at the tail of the queue and return its length, or 0
 * when the queue is empty.
 *
 * The packet is charged off its virtual queue's backlog and counted in
 * stats.other; a VQ that becomes empty starts its idle period (outside
 * WRED mode).  With an empty qdisc in WRED mode, the shared set starts
 * idling unless it already is.
 */
static unsigned int gred_drop(struct Qdisc* sch)
{
	struct gred_sched *t = qdisc_priv(sch);
	struct sk_buff *skb = qdisc_dequeue_tail(sch);
	struct gred_sched_data *q;
	unsigned int len;
	u16 dp;

	if (!skb) {
		if (gred_wred_mode(t) && !red_is_idling(&t->wred_set))
			red_start_of_idle_period(&t->wred_set);
		return 0;
	}

	len = qdisc_pkt_len(skb);
	dp = tc_index_to_dp(skb);

	if (dp >= t->DPs || (q = t->tab[dp]) == NULL) {
		if (net_ratelimit())
			printk(KERN_WARNING "GRED: Unable to relocate "
			       "VQ 0x%x while dropping, screwing up "
			       "backlog.\n", tc_index_to_dp(skb));
	} else {
		q->backlog -= len;
		q->stats.other++;

		if (!q->backlog && !gred_wred_mode(t))
			red_start_of_idle_period(&q->parms);
	}

	qdisc_drop(skb, sch);
	return len;
}
/*
 * Flush the queue, then restart RED and zero the backlog of every
 * configured virtual queue.
 */
static void gred_reset(struct Qdisc* sch)
{
	struct gred_sched *t = qdisc_priv(sch);
	int dp;

	qdisc_reset_queue(sch);

	for (dp = 0; dp < t->DPs; dp++) {
		struct gred_sched_data *vq = t->tab[dp];

		if (vq) {
			red_restart(&vq->parms);
			vq->backlog = 0;
		}
	}
}
/* Release the memory of one virtual queue. */
static inline void gred_destroy_vq(struct gred_sched_data *q)
{
	kfree(q);
}
/*
 * Apply a table definition (struct tc_gred_sopt carried by @dps): the
 * number of DPs, the default DP, the RED flags and the RIO/WRED mode.
 * Virtual queues at or beyond the new DP count are destroyed.
 *
 * Returns 0 on success, -EINVAL when @dps is missing or describes an
 * invalid table (no DPs, too many DPs, or a default DP out of range).
 */
static inline int gred_change_table_def(struct Qdisc *sch, struct nlattr *dps)
{
	struct gred_sched *table = qdisc_priv(sch);
	struct tc_gred_sopt *sopt;
	int dp;

	if (!dps)
		return -EINVAL;

	sopt = nla_data(dps);
	if (sopt->DPs == 0 || sopt->DPs > MAX_DPs ||
	    sopt->def_DP >= sopt->DPs)
		return -EINVAL;

	sch_tree_lock(sch);
	table->DPs = sopt->DPs;
	table->def = sopt->def_DP;
	table->red_flags = sopt->flags;

	/*
	 * Every entry point to GRED is synchronized with the above code
	 * and the DP is checked against DPs, i.e. shadowed VQs can no
	 * longer be found so we can unlock right here.
	 */
	sch_tree_unlock(sch);

	if (sopt->grio) {
		gred_enable_rio_mode(table);
		gred_disable_wred_mode(table);
		/* Equal priorities among VQs switch RIO over to WRED. */
		if (gred_wred_mode_check(sch))
			gred_enable_wred_mode(table);
	} else {
		gred_disable_rio_mode(table);
		gred_disable_wred_mode(table);
	}

	for (dp = table->DPs; dp < MAX_DPs; dp++) {
		if (!table->tab[dp])
			continue;

		printk(KERN_WARNING "GRED: Warning: Destroying "
		       "shadowed VQ 0x%x\n", dp);
		gred_destroy_vq(table->tab[dp]);
		table->tab[dp] = NULL;
	}

	return 0;
}
/*
 * Create (if necessary) and configure the virtual queue @dp from @ctl,
 * using @prio as its priority and @stab as the RED lookup table.
 *
 * gred_change() calls this between sch_tree_lock() and
 * sch_tree_unlock(), so the allocation must not sleep: GFP_ATOMIC is
 * used instead of GFP_KERNEL.
 *
 * Returns 0 on success or -ENOMEM when a new VQ cannot be allocated.
 */
static inline int gred_change_vq(struct Qdisc *sch, int dp,
				 struct tc_gred_qopt *ctl, int prio, u8 *stab)
{
	struct gred_sched *table = qdisc_priv(sch);
	struct gred_sched_data *q;

	if (table->tab[dp] == NULL) {
		table->tab[dp] = kzalloc(sizeof(*q), GFP_ATOMIC);
		if (table->tab[dp] == NULL)
			return -ENOMEM;
	}

	q = table->tab[dp];
	q->DP = dp;
	q->prio = prio;
	q->limit = ctl->limit;

	/* An empty VQ must not be left marked as idling. */
	if (q->backlog == 0)
		red_end_of_idle_period(&q->parms);

	red_set_parms(&q->parms,
		      ctl->qth_min, ctl->qth_max, ctl->Wlog, ctl->Plog,
		      ctl->Scell_log, stab);

	return 0;
}
/* Netlink attribute policy for the options nested in TCA_OPTIONS. */
static const struct nla_policy gred_policy[TCA_GRED_MAX + 1] = {
	[TCA_GRED_PARMS]	= { .len = sizeof(struct tc_gred_qopt) },
	[TCA_GRED_STAB]		= { .len = 256 },
	[TCA_GRED_DPS]		= { .len = sizeof(struct tc_gred_sopt) },
};
static int gred_change(struct Qdisc *sch, struct nlattr *opt)
{
struct gred_sched *table = qdisc_priv(sch);
struct tc_gred_qopt *ctl;
struct nlattr *tb[TCA_GRED_MAX + 1];
int err, prio = GRED_DEF_PRIO;
u8 *stab;
if (opt == NULL)
return -EINVAL;
err = nla_parse_nested(tb, TCA_GRED_MAX, opt, gred_policy);
if (err < 0)
return err;
if (tb[TCA_GRED_PARMS] == NULL && tb[TCA_GRED_STAB] == NULL)
return gred_change_table_def(sch, opt);
if (tb[TCA_GRED_PARMS] == NULL ||
tb[TCA_GRED_STAB] == NULL)
return -EINVAL;
err = -EINVAL;
ctl = nla_data(tb[TCA_GRED_PARMS]);
stab = nla_data(tb[TCA_GRED_STAB]);
if (ctl->DP >= table->DPs)
goto errout;
if (gred_rio_mode(table)) {
if (ctl->prio == 0) {
int def_prio = GRED_DEF_PRIO;
if (table->tab[table->def])
def_prio = table->tab[table->def]->prio;
printk(KERN_DEBUG "GRED: DP %u does not have a prio "
"setting default to %d\n", ctl->DP, def_prio);
prio = def_prio;
} else
prio = ctl->prio;
}
sch_tree_lock(sch);
err = gred_change_vq(sch, ctl->DP, ctl, prio, stab);
if (err < 0)
goto errout_locked;
if (gred_rio_mode(table)) {
gred_disable_wred_mode(table);
if (gred_wred_mode_check(sch))
gred_enable_wred_mode(table);
}
err = 0;
errout_locked:
sch_tree_unlock(sch);
errout:
return err;
}
/*
 * gred_init - initialise a freshly created GRED qdisc.
 *
 * Only the table-wide setup (TCA_GRED_DPS) is accepted here; a request
 * carrying per-VQ attributes is rejected with -EINVAL.
 */
static int gred_init(struct Qdisc *sch, struct nlattr *opt)
{
	struct nlattr *attrs[TCA_GRED_MAX + 1];
	int rc;

	if (!opt)
		return -EINVAL;

	rc = nla_parse_nested(attrs, TCA_GRED_MAX, opt, gred_policy);
	if (rc < 0)
		return rc;

	if (attrs[TCA_GRED_PARMS] != NULL || attrs[TCA_GRED_STAB] != NULL)
		return -EINVAL;

	return gred_change_table_def(sch, attrs[TCA_GRED_DPS]);
}
static int gred_dump(struct Qdisc *sch, struct sk_buff *skb)
{
struct gred_sched *table = qdisc_priv(sch);
struct nlattr *parms, *opts = NULL;
int i;
struct tc_gred_sopt sopt = {
.DPs = table->DPs,
.def_DP = table->def,
.grio = gred_rio_mode(table),
.flags = table->red_flags,
};
opts = nla_nest_start(skb, TCA_OPTIONS);
if (opts == NULL)
goto nla_put_failure;
NLA_PUT(skb, TCA_GRED_DPS, sizeof(sopt), &sopt);
parms = nla_nest_start(skb, TCA_GRED_PARMS);
if (parms == NULL)
goto nla_put_failure;
for (i = 0; i < MAX_DPs; i++) {
struct gred_sched_data *q = table->tab[i];
struct tc_gred_qopt opt;
memset(&opt, 0, sizeof(opt));
if (!q) {
/* hack -- fix at some point with proper message
This is how we indicate to tc that there is no VQ
at this DP */
opt.DP = MAX_DPs + i;
goto append_opt;
}
opt.limit = q->limit;
opt.DP = q->DP;
opt.backlog = q->backlog;
opt.prio = q->prio;
opt.qth_min = q->parms.qth_min >> q->parms.Wlog;
opt.qth_max = q->parms.qth_max >> q->parms.Wlog;
opt.Wlog = q->parms.Wlog;
opt.Plog = q->parms.Plog;
opt.Scell_log = q->parms.Scell_log;
opt.other = q->stats.other;
opt.early = q->stats.prob_drop;
opt.forced = q->stats.forced_drop;
opt.pdrop = q->stats.pdrop;
opt.packets = q->packetsin;
opt.bytesin = q->bytesin;
if (gred_wred_mode(table)) {
q->parms.qidlestart =
table->tab[table->def]->parms.qidlestart;
q->parms.qavg = table->tab[table->def]->parms.qavg;
}
opt.qave = red_calc_qavg(&q->parms, q->parms.qavg);
append_opt:
if (nla_append(skb, sizeof(opt), &opt) < 0)
goto nla_put_failure;
}
nla_nest_end(skb, parms);
return nla_nest_end(skb, opts);
nla_put_failure:
nla_nest_cancel(skb, opts);
return -EMSGSIZE;
}
static void gred_destroy(struct Qdisc *sch)
{
struct gred_sched *table = qdisc_priv(sch);
int i;
for (i = 0; i < table->DPs; i++) {
if (table->tab[i])
gred_destroy_vq(table->tab[i]);
}
}
/*
 * Operations table for the "gred" queueing discipline; registered and
 * unregistered by gred_module_init() / gred_module_exit().
 */
static struct Qdisc_ops gred_qdisc_ops __read_mostly = {
.id = "gred",
.priv_size = sizeof(struct gred_sched),
.enqueue = gred_enqueue,
.dequeue = gred_dequeue,
.peek = qdisc_peek_head,
.drop = gred_drop,
.init = gred_init,
.reset = gred_reset,
.destroy = gred_destroy,
.change = gred_change,
.dump = gred_dump,
.owner = THIS_MODULE,
};
/* Module load: register the "gred" qdisc and return register_qdisc()'s result. */
static int __init gred_module_init(void)
{
return register_qdisc(&gred_qdisc_ops);
}
/* Module unload: unregister the "gred" qdisc. */
static void __exit gred_module_exit(void)
{
unregister_qdisc(&gred_qdisc_ops);
}
module_init(gred_module_init)
module_exit(gred_module_exit)
MODULE_LICENSE("GPL");

1746
kernel/net/sched/sch_hfsc.c Normal file

File diff suppressed because it is too large Load Diff

1579
kernel/net/sched/sch_htb.c Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,145 @@
/* net/sched/sch_ingress.c - Ingress qdisc
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
* Authors: Jamal Hadi Salim 1999
*/
#include <linux/module.h>
#include <linux/types.h>
#include <linux/list.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
/* Private data of the ingress qdisc. */
struct ingress_qdisc_data {
/* filter chain: classified in ingress_enqueue(), freed in ingress_destroy() */
struct tcf_proto *filter_list;
};
/* ------------------------- Class/flow operations ------------------------- */
/* The ingress qdisc has no child qdiscs: always reports NULL. */
static struct Qdisc *ingress_leaf(struct Qdisc *sch, unsigned long arg)
{
	struct Qdisc *none = NULL;

	return none;
}
/*
 * Map a classid to a class cookie: the minor number plus one, so the
 * result is never zero.
 */
static unsigned long ingress_get(struct Qdisc *sch, u32 classid)
{
	unsigned long minor = TC_H_MIN(classid);

	return minor + 1;
}
/* Binding a filter yields the same cookie as ingress_get() for @classid. */
static unsigned long ingress_bind_filter(struct Qdisc *sch,
					 unsigned long parent, u32 classid)
{
	unsigned long cl = ingress_get(sch, classid);

	return cl;
}
/* Class cookies carry no state, so there is nothing to release. */
static void ingress_put(struct Qdisc *sch, unsigned long cl)
{
	return;
}
/* Intentionally empty: the walker callback is never invoked. */
static void ingress_walk(struct Qdisc *sch, struct qdisc_walker *walker)
{
}
/* Return the address of the qdisc's single filter chain, whatever @cl is. */
static struct tcf_proto **ingress_find_tcf(struct Qdisc *sch, unsigned long cl)
{
	struct ingress_qdisc_data *priv = qdisc_priv(sch);
	struct tcf_proto **chain = &priv->filter_list;

	return chain;
}
/* --------------------------- Qdisc operations ---------------------------- */
/*
 * Classify an incoming packet and translate the classifier verdict:
 *   TC_ACT_SHOT                  -> TC_ACT_SHOT (counted as a drop)
 *   TC_ACT_STOLEN / TC_ACT_QUEUED -> TC_ACT_STOLEN
 *   TC_ACT_RECLASSIFY / TC_ACT_OK -> TC_ACT_OK, tc_index set from classid
 *   anything else                -> TC_ACT_OK
 * Byte and packet counters are updated for every packet seen.
 */
static int ingress_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
	struct ingress_qdisc_data *priv = qdisc_priv(sch);
	struct tcf_result res;
	int verdict;

	verdict = tc_classify(skb, priv->filter_list, &res);

	sch->bstats.packets++;
	sch->bstats.bytes += qdisc_pkt_len(skb);

	if (verdict == TC_ACT_SHOT) {
		sch->qstats.drops++;
		return TC_ACT_SHOT;
	}

	if (verdict == TC_ACT_STOLEN || verdict == TC_ACT_QUEUED)
		return TC_ACT_STOLEN;

	if (verdict == TC_ACT_RECLASSIFY || verdict == TC_ACT_OK)
		skb->tc_index = TC_H_MIN(res.classid);

	return TC_ACT_OK;
}
/* ------------------------------------------------------------- */
/* Tear down the filter chain attached to this qdisc. */
static void ingress_destroy(struct Qdisc *sch)
{
	struct ingress_qdisc_data *priv = qdisc_priv(sch);
	struct tcf_proto **chain = &priv->filter_list;

	tcf_destroy_chain(chain);
}
/*
 * Emit an empty TCA_OPTIONS nest.  Returns skb->len on success, -1 if the
 * nest could not be started.
 */
static int ingress_dump(struct Qdisc *sch, struct sk_buff *skb)
{
	struct nlattr *opts = nla_nest_start(skb, TCA_OPTIONS);

	if (opts != NULL) {
		nla_nest_end(skb, opts);
		return skb->len;
	}

	nla_nest_cancel(skb, opts);
	return -1;
}
/* Class operations of the ingress qdisc; referenced from ingress_qdisc_ops. */
static const struct Qdisc_class_ops ingress_class_ops = {
.leaf = ingress_leaf,
.get = ingress_get,
.put = ingress_put,
.walk = ingress_walk,
.tcf_chain = ingress_find_tcf,
.bind_tcf = ingress_bind_filter,
/* unbinding reuses the no-op put handler */
.unbind_tcf = ingress_put,
};
/*
 * Operations table for the "ingress" qdisc.  Only enqueue, destroy and
 * dump handlers are provided; no dequeue handler is set here.
 */
static struct Qdisc_ops ingress_qdisc_ops __read_mostly = {
.cl_ops = &ingress_class_ops,
.id = "ingress",
.priv_size = sizeof(struct ingress_qdisc_data),
.enqueue = ingress_enqueue,
.destroy = ingress_destroy,
.dump = ingress_dump,
.owner = THIS_MODULE,
};
/* Module load: register the "ingress" qdisc and return register_qdisc()'s result. */
static int __init ingress_module_init(void)
{
return register_qdisc(&ingress_qdisc_ops);
}
/* Module unload: unregister the "ingress" qdisc. */
static void __exit ingress_module_exit(void)
{
unregister_qdisc(&ingress_qdisc_ops);
}
module_init(ingress_module_init)
module_exit(ingress_module_exit)
MODULE_LICENSE("GPL");

241
kernel/net/sched/sch_mq.c Normal file
View File

@@ -0,0 +1,241 @@
/*
* net/sched/sch_mq.c Classful multiqueue dummy scheduler
*
* Copyright (c) 2009 Patrick McHardy <kaber@trash.net>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* version 2 as published by the Free Software Foundation.
*/
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
/* Private data of the mq qdisc. */
struct mq_sched {
/*
 * Per-TX-queue child qdiscs pre-allocated by mq_init(); handed over to
 * the device and set back to NULL by mq_attach().
 */
struct Qdisc **qdiscs;
};
static void mq_destroy(struct Qdisc *sch)
{
struct net_device *dev = qdisc_dev(sch);
struct mq_sched *priv = qdisc_priv(sch);
unsigned int ntx;
if (!priv->qdiscs)
return;
for (ntx = 0; ntx < dev->num_tx_queues && priv->qdiscs[ntx]; ntx++)
qdisc_destroy(priv->qdiscs[ntx]);
kfree(priv->qdiscs);
}
/*
 * mq_init - set up the mq root qdisc.
 *
 * Only valid as the root qdisc of a multiqueue device (-EOPNOTSUPP
 * otherwise).  One pfifo_fast child per TX queue is created up front so
 * that the later attach step cannot fail.  Returns 0 or -ENOMEM.
 */
static int mq_init(struct Qdisc *sch, struct nlattr *opt)
{
	struct net_device *dev = qdisc_dev(sch);
	struct mq_sched *priv = qdisc_priv(sch);
	unsigned int ntx;

	if (sch->parent != TC_H_ROOT || !netif_is_multiqueue(dev))
		return -EOPNOTSUPP;

	/* pre-allocate qdiscs, attachment can't fail */
	priv->qdiscs = kcalloc(dev->num_tx_queues, sizeof(priv->qdiscs[0]),
			       GFP_KERNEL);
	if (!priv->qdiscs)
		return -ENOMEM;

	for (ntx = 0; ntx < dev->num_tx_queues; ntx++) {
		struct netdev_queue *txq = netdev_get_tx_queue(dev, ntx);
		struct Qdisc *child;

		child = qdisc_create_dflt(dev, txq, &pfifo_fast_ops,
					  TC_H_MAKE(TC_H_MAJ(sch->handle),
						    TC_H_MIN(ntx + 1)));
		if (!child) {
			/* releases the children created so far and the array */
			mq_destroy(sch);
			return -ENOMEM;
		}

		child->flags |= TCQ_F_CAN_BYPASS;
		priv->qdiscs[ntx] = child;
	}

	sch->flags |= TCQ_F_MQROOT;
	return 0;
}
/*
 * Graft each pre-allocated child onto its TX queue, destroying whatever
 * qdisc was there before, then drop the now-unneeded array.
 */
static void mq_attach(struct Qdisc *sch)
{
	struct net_device *dev = qdisc_dev(sch);
	struct mq_sched *priv = qdisc_priv(sch);
	unsigned int ntx;

	for (ntx = 0; ntx < dev->num_tx_queues; ntx++) {
		struct Qdisc *fresh = priv->qdiscs[ntx];
		struct Qdisc *prev = dev_graft_qdisc(fresh->dev_queue, fresh);

		if (prev != NULL)
			qdisc_destroy(prev);
	}

	kfree(priv->qdiscs);
	priv->qdiscs = NULL;
}
/*
 * Recompute the root's counters as the sum over all per-queue qdiscs.
 * Each child is sampled under its own lock.  Always returns 0.
 */
static int mq_dump(struct Qdisc *sch, struct sk_buff *skb)
{
	struct net_device *dev = qdisc_dev(sch);
	unsigned int ntx = 0;

	sch->q.qlen = 0;
	memset(&sch->bstats, 0, sizeof(sch->bstats));
	memset(&sch->qstats, 0, sizeof(sch->qstats));

	while (ntx < dev->num_tx_queues) {
		struct Qdisc *child;

		child = netdev_get_tx_queue(dev, ntx)->qdisc_sleeping;

		spin_lock_bh(qdisc_lock(child));
		sch->q.qlen		+= child->q.qlen;
		sch->bstats.bytes	+= child->bstats.bytes;
		sch->bstats.packets	+= child->bstats.packets;
		sch->qstats.qlen	+= child->qstats.qlen;
		sch->qstats.backlog	+= child->qstats.backlog;
		sch->qstats.drops	+= child->qstats.drops;
		sch->qstats.requeues	+= child->qstats.requeues;
		sch->qstats.overlimits	+= child->qstats.overlimits;
		spin_unlock_bh(qdisc_lock(child));

		ntx++;
	}
	return 0;
}
/*
 * Translate a 1-based class id into its TX queue, or NULL when the id is
 * out of range (cl == 0 wraps to a huge index and is rejected as well).
 */
static struct netdev_queue *mq_queue_get(struct Qdisc *sch, unsigned long cl)
{
	struct net_device *dev = qdisc_dev(sch);
	unsigned long idx = cl - 1;

	return idx < dev->num_tx_queues ? netdev_get_tx_queue(dev, idx) : NULL;
}
/*
 * Pick the TX queue named by the minor number of tcm_parent, falling back
 * to queue 0 when that does not name a valid queue.
 */
static struct netdev_queue *mq_select_queue(struct Qdisc *sch,
					    struct tcmsg *tcm)
{
	unsigned int minor = TC_H_MIN(tcm->tcm_parent);
	struct netdev_queue *txq = mq_queue_get(sch, minor);

	if (txq)
		return txq;

	return netdev_get_tx_queue(qdisc_dev(sch), 0);
}
/*
 * Replace the qdisc on the TX queue of class @cl with @new and hand the
 * previous one back through @old.  A running device is deactivated around
 * the swap and activated again afterwards.  Always returns 0.
 */
static int mq_graft(struct Qdisc *sch, unsigned long cl, struct Qdisc *new,
		    struct Qdisc **old)
{
	struct net_device *dev = qdisc_dev(sch);
	struct netdev_queue *txq = mq_queue_get(sch, cl);

	if (dev->flags & IFF_UP)
		dev_deactivate(dev);

	*old = dev_graft_qdisc(txq, new);

	if (dev->flags & IFF_UP)
		dev_activate(dev);

	return 0;
}
static struct Qdisc *mq_leaf(struct Qdisc *sch, unsigned long cl)
{
struct netdev_queue *dev_queue = mq_queue_get(sch, cl);
return dev_queue->qdisc_sleeping;
}
/*
 * Look up a class by classid: the minor number is the class cookie when
 * it names an existing TX queue, otherwise 0.
 */
static unsigned long mq_get(struct Qdisc *sch, u32 classid)
{
	unsigned int minor = TC_H_MIN(classid);

	return mq_queue_get(sch, minor) ? minor : 0;
}
/* Class cookies carry no state, so there is nothing to release. */
static void mq_put(struct Qdisc *sch, unsigned long cl)
{
}
static int mq_dump_class(struct Qdisc *sch, unsigned long cl,
struct sk_buff *skb, struct tcmsg *tcm)
{
struct netdev_queue *dev_queue = mq_queue_get(sch, cl);
tcm->tcm_parent = TC_H_ROOT;
tcm->tcm_handle |= TC_H_MIN(cl);
tcm->tcm_info = dev_queue->qdisc_sleeping->handle;
return 0;
}
static int mq_dump_class_stats(struct Qdisc *sch, unsigned long cl,
struct gnet_dump *d)
{
struct netdev_queue *dev_queue = mq_queue_get(sch, cl);
sch = dev_queue->qdisc_sleeping;
sch->qstats.qlen = sch->q.qlen;
if (gnet_stats_copy_basic(d, &sch->bstats) < 0 ||
gnet_stats_copy_queue(d, &sch->qstats) < 0)
return -1;
return 0;
}
/*
 * Invoke the walker callback for every class (1-based TX queue index),
 * starting at arg->skip.  A negative callback result sets arg->stop and
 * ends the walk.
 */
static void mq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
{
	struct net_device *dev = qdisc_dev(sch);
	unsigned int ntx;

	if (arg->stop)
		return;

	arg->count = arg->skip;
	ntx = arg->skip;

	while (ntx < dev->num_tx_queues) {
		if (arg->fn(sch, ntx + 1, arg) < 0) {
			arg->stop = 1;
			return;
		}
		arg->count++;
		ntx++;
	}
}
/* Class operations of the mq qdisc; each class is one device TX queue. */
static const struct Qdisc_class_ops mq_class_ops = {
.select_queue = mq_select_queue,
.graft = mq_graft,
.leaf = mq_leaf,
.get = mq_get,
.put = mq_put,
.walk = mq_walk,
.dump = mq_dump_class,
.dump_stats = mq_dump_class_stats,
};
/*
 * Operations table for the "mq" qdisc.  Not static, so it is visible
 * outside this file.  No enqueue/dequeue handlers are set here.
 */
struct Qdisc_ops mq_qdisc_ops __read_mostly = {
.cl_ops = &mq_class_ops,
.id = "mq",
.priv_size = sizeof(struct mq_sched),
.init = mq_init,
.destroy = mq_destroy,
.attach = mq_attach,
.dump = mq_dump,
.owner = THIS_MODULE,
};

View File

@@ -0,0 +1,443 @@
/*
* Copyright (c) 2008, Intel Corporation.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms and conditions of the GNU General Public License,
* version 2, as published by the Free Software Foundation.
*
* This program is distributed in the hope it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
* more details.
*
* You should have received a copy of the GNU General Public License along with
* this program; if not, write to the Free Software Foundation, Inc., 59 Temple
* Place - Suite 330, Boston, MA 02111-1307 USA.
*
* Author: Alexander Duyck <alexander.h.duyck@intel.com>
*/
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
/* Private data of the multiq qdisc. */
struct multiq_sched_data {
/* bands in use; set from the device's real_num_tx_queues in multiq_tune() */
u16 bands;
/* size of queues[]; set from the device's num_tx_queues in multiq_init() */
u16 max_bands;
/* band most recently tried by multiq_dequeue() */
u16 curband;
/* classifier chain run by multiq_classify() */
struct tcf_proto *filter_list;
/* one child qdisc per band; unused slots point at noop_qdisc */
struct Qdisc **queues;
};
/*
 * Select the child qdisc for @skb.
 *
 * The band is the skb's queue mapping; an out-of-range mapping falls back
 * to band 0.  With CONFIG_NET_CLS_ACT, a STOLEN/QUEUED/SHOT classifier
 * verdict yields NULL, and *qerr tells the caller whether the packet was
 * stolen or should be counted as a drop.
 */
static struct Qdisc *
multiq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
{
	struct multiq_sched_data *q = qdisc_priv(sch);
	u32 band;
	struct tcf_result res;
	int err;

	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
	err = tc_classify(skb, q->filter_list, &res);
#ifdef CONFIG_NET_CLS_ACT
	if (err == TC_ACT_STOLEN || err == TC_ACT_QUEUED) {
		*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
		return NULL;
	}
	if (err == TC_ACT_SHOT)
		return NULL;
#endif
	band = skb_get_queue_mapping(skb);

	return q->queues[band < q->bands ? band : 0];
}
/*
 * Enqueue @skb on the band chosen by multiq_classify() and keep the
 * parent's counters in step with the child's verdict.
 */
static int
multiq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
	struct Qdisc *child;
	int ret;

	child = multiq_classify(skb, sch, &ret);
#ifdef CONFIG_NET_CLS_ACT
	if (child == NULL) {
		/* BYPASS still set means the classifier asked for a drop */
		if (ret & __NET_XMIT_BYPASS)
			sch->qstats.drops++;
		kfree_skb(skb);
		return ret;
	}
#endif

	ret = qdisc_enqueue(skb, child);
	if (ret != NET_XMIT_SUCCESS) {
		if (net_xmit_drop_count(ret))
			sch->qstats.drops++;
		return ret;
	}

	sch->bstats.bytes += qdisc_pkt_len(skb);
	sch->bstats.packets++;
	sch->q.qlen++;
	return NET_XMIT_SUCCESS;
}
/*
 * Round-robin dequeue: advance curband, skip bands whose device subqueue
 * is stopped, and return the first packet found.  At most one full cycle
 * over the bands is made; NULL if nothing could be dequeued.
 */
static struct sk_buff *multiq_dequeue(struct Qdisc *sch)
{
	struct multiq_sched_data *q = qdisc_priv(sch);
	int tries;

	for (tries = 0; tries < q->bands; tries++) {
		struct Qdisc *child;
		struct sk_buff *skb;

		/* cycle through bands to ensure fairness */
		q->curband++;
		if (q->curband >= q->bands)
			q->curband = 0;

		/* Check that target subqueue is available before
		 * pulling an skb to avoid head-of-line blocking.
		 */
		if (__netif_subqueue_stopped(qdisc_dev(sch), q->curband))
			continue;

		child = q->queues[q->curband];
		skb = child->dequeue(child);
		if (!skb)
			continue;

		sch->q.qlen--;
		return skb;
	}
	return NULL;
}
/*
 * Same band scan as multiq_dequeue(), but on a local copy of curband so
 * the round-robin position is left untouched, and using the child's peek.
 */
static struct sk_buff *multiq_peek(struct Qdisc *sch)
{
	struct multiq_sched_data *q = qdisc_priv(sch);
	unsigned int cursor = q->curband;
	int tries;

	for (tries = 0; tries < q->bands; tries++) {
		struct Qdisc *child;
		struct sk_buff *skb;

		/* cycle through bands to ensure fairness */
		cursor++;
		if (cursor >= q->bands)
			cursor = 0;

		/* Check that target subqueue is available before
		 * pulling an skb to avoid head-of-line blocking.
		 */
		if (__netif_subqueue_stopped(qdisc_dev(sch), cursor))
			continue;

		child = q->queues[cursor];
		skb = child->ops->peek(child);
		if (skb)
			return skb;
	}
	return NULL;
}
/*
 * Drop one packet, trying the highest-numbered band first.  Returns the
 * length reported by the child's drop handler, or 0 if no child dropped.
 */
static unsigned int multiq_drop(struct Qdisc *sch)
{
	struct multiq_sched_data *q = qdisc_priv(sch);
	int band = q->bands - 1;

	while (band >= 0) {
		struct Qdisc *child = q->queues[band];

		if (child->ops->drop) {
			unsigned int len = child->ops->drop(child);

			if (len != 0) {
				sch->q.qlen--;
				return len;
			}
		}
		band--;
	}
	return 0;
}
/* Reset every active band and rewind the round-robin position. */
static void
multiq_reset(struct Qdisc *sch)
{
	struct multiq_sched_data *q = qdisc_priv(sch);
	u16 band = 0;

	while (band < q->bands) {
		qdisc_reset(q->queues[band]);
		band++;
	}

	sch->q.qlen = 0;
	q->curband = 0;
}
/* Free the filter chain, every active band's qdisc and the band array. */
static void
multiq_destroy(struct Qdisc *sch)
{
	struct multiq_sched_data *q = qdisc_priv(sch);
	int band = 0;

	tcf_destroy_chain(&q->filter_list);

	while (band < q->bands) {
		qdisc_destroy(q->queues[band]);
		band++;
	}

	kfree(q->queues);
}
/*
 * multiq_tune - (re)size the set of active bands.
 *
 * The band count always follows the device's real_num_tx_queues; the value
 * in the request is overwritten.  Bands beyond the new count are torn down
 * under the tree lock; bands inside it that still point at noop_qdisc get
 * a fresh pfifo child (a failed allocation leaves the slot untouched).
 *
 * Returns 0, -EOPNOTSUPP on a non-multiqueue device, or -EINVAL when the
 * attribute is too short.
 */
static int multiq_tune(struct Qdisc *sch, struct nlattr *opt)
{
	struct multiq_sched_data *q = qdisc_priv(sch);
	struct tc_multiq_qopt *qopt;
	int i;

	if (!netif_is_multiqueue(qdisc_dev(sch)))
		return -EOPNOTSUPP;
	if (nla_len(opt) < sizeof(*qopt))
		return -EINVAL;

	qopt = nla_data(opt);
	qopt->bands = qdisc_dev(sch)->real_num_tx_queues;

	sch_tree_lock(sch);
	q->bands = qopt->bands;
	for (i = q->bands; i < q->max_bands; i++) {
		struct Qdisc *child = q->queues[i];

		if (child == &noop_qdisc)
			continue;

		q->queues[i] = &noop_qdisc;
		qdisc_tree_decrease_qlen(child, child->q.qlen);
		qdisc_destroy(child);
	}
	sch_tree_unlock(sch);

	for (i = 0; i < q->bands; i++) {
		struct Qdisc *child, *old;

		if (q->queues[i] != &noop_qdisc)
			continue;

		/* created outside the tree lock */
		child = qdisc_create_dflt(qdisc_dev(sch),
					  sch->dev_queue,
					  &pfifo_qdisc_ops,
					  TC_H_MAKE(sch->handle,
						    i + 1));
		if (!child)
			continue;

		sch_tree_lock(sch);
		old = q->queues[i];
		q->queues[i] = child;

		/* the slot may have been filled while we were unlocked */
		if (old != &noop_qdisc) {
			qdisc_tree_decrease_qlen(old,
						 old->q.qlen);
			qdisc_destroy(old);
		}
		sch_tree_unlock(sch);
	}
	return 0;
}
/*
 * multiq_init - allocate the band array (one slot per device TX queue,
 * all pointing at noop_qdisc) and let multiq_tune() populate it.
 *
 * Returns 0, -EINVAL without options, -ENOBUFS on allocation failure, or
 * the error from multiq_tune() (the array is freed in that case).
 */
static int multiq_init(struct Qdisc *sch, struct nlattr *opt)
{
	struct multiq_sched_data *q = qdisc_priv(sch);
	int band, err;

	q->queues = NULL;

	if (!opt)
		return -EINVAL;

	q->max_bands = qdisc_dev(sch)->num_tx_queues;

	q->queues = kcalloc(q->max_bands, sizeof(struct Qdisc *), GFP_KERNEL);
	if (q->queues == NULL)
		return -ENOBUFS;

	band = 0;
	while (band < q->max_bands) {
		q->queues[band] = &noop_qdisc;
		band++;
	}

	err = multiq_tune(sch, opt);
	if (err != 0)
		kfree(q->queues);

	return err;
}
/*
 * Report the active and maximum band counts as TCA_OPTIONS.  Returns
 * skb->len on success; on failure the message is trimmed back to where
 * it was on entry and -1 is returned.
 */
static int multiq_dump(struct Qdisc *sch, struct sk_buff *skb)
{
	struct multiq_sched_data *q = qdisc_priv(sch);
	unsigned char *mark = skb_tail_pointer(skb);
	struct tc_multiq_qopt opt;

	opt.bands = q->bands;
	opt.max_bands = q->max_bands;

	if (unlikely(nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt) < 0)) {
		nlmsg_trim(skb, mark);
		return -1;
	}

	return skb->len;
}
/*
 * Install @new (or noop_qdisc when @new is NULL) as the child of band
 * @arg - 1.  The previous child is reset, its queue length is removed from
 * the ancestors, and it is returned through @old.  Always returns 0.
 */
static int multiq_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
			struct Qdisc **old)
{
	struct multiq_sched_data *q = qdisc_priv(sch);
	unsigned long band = arg - 1;
	struct Qdisc *prev;

	if (!new)
		new = &noop_qdisc;

	sch_tree_lock(sch);
	prev = q->queues[band];
	*old = prev;
	q->queues[band] = new;
	qdisc_tree_decrease_qlen(prev, prev->q.qlen);
	qdisc_reset(prev);
	sch_tree_unlock(sch);

	return 0;
}
/* Return the child qdisc of the band behind class cookie @arg (1-based). */
static struct Qdisc *
multiq_leaf(struct Qdisc *sch, unsigned long arg)
{
	struct multiq_sched_data *q = qdisc_priv(sch);

	return q->queues[arg - 1];
}
/*
 * Look up a class by classid.  The minor number is the 1-based band; it is
 * returned as the cookie when it names an active band, otherwise 0 (a
 * minor of 0 wraps around in the unsigned comparison and is rejected).
 */
static unsigned long multiq_get(struct Qdisc *sch, u32 classid)
{
	struct multiq_sched_data *q = qdisc_priv(sch);
	unsigned long band = TC_H_MIN(classid);

	return (band - 1 < q->bands) ? band : 0;
}
/* Binding a filter resolves the class exactly like multiq_get(). */
static unsigned long multiq_bind(struct Qdisc *sch, unsigned long parent,
				 u32 classid)
{
	unsigned long cl = multiq_get(sch, classid);

	return cl;
}
/* Class cookies carry no state, so there is nothing to release. */
static void multiq_put(struct Qdisc *q, unsigned long cl)
{
}
static int multiq_dump_class(struct Qdisc *sch, unsigned long cl,
struct sk_buff *skb, struct tcmsg *tcm)
{
struct multiq_sched_data *q = qdisc_priv(sch);
tcm->tcm_handle |= TC_H_MIN(cl);
tcm->tcm_info = q->queues[cl-1]->handle;
return 0;
}
/*
 * Copy the basic and queue statistics of band @cl - 1 into the dump.
 * Returns 0 on success, -1 if either copy fails.
 */
static int multiq_dump_class_stats(struct Qdisc *sch, unsigned long cl,
				   struct gnet_dump *d)
{
	struct multiq_sched_data *q = qdisc_priv(sch);
	struct Qdisc *child = q->queues[cl - 1];

	/* refresh the reported queue length before copying */
	child->qstats.qlen = child->q.qlen;

	if (gnet_stats_copy_basic(d, &child->bstats) < 0)
		return -1;
	if (gnet_stats_copy_queue(d, &child->qstats) < 0)
		return -1;
	return 0;
}
/*
 * Invoke the walker callback for each active band (as 1-based class id),
 * honouring arg->skip.  A negative callback result sets arg->stop and ends
 * the walk without counting that band.
 */
static void multiq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
{
	struct multiq_sched_data *q = qdisc_priv(sch);
	int band;

	if (arg->stop)
		return;

	for (band = 0; band < q->bands; band++) {
		if (arg->count >= arg->skip &&
		    arg->fn(sch, band + 1, arg) < 0) {
			arg->stop = 1;
			break;
		}
		arg->count++;
	}
}
/*
 * Filters attach to the qdisc itself only: a non-zero class cookie gets
 * NULL, cookie 0 gets the address of the qdisc's filter chain.
 */
static struct tcf_proto **multiq_find_tcf(struct Qdisc *sch, unsigned long cl)
{
	struct multiq_sched_data *q = qdisc_priv(sch);

	return cl ? NULL : &q->filter_list;
}
/* Class operations of the multiq qdisc; each class is one band. */
static const struct Qdisc_class_ops multiq_class_ops = {
.graft = multiq_graft,
.leaf = multiq_leaf,
.get = multiq_get,
.put = multiq_put,
.walk = multiq_walk,
.tcf_chain = multiq_find_tcf,
.bind_tcf = multiq_bind,
/* unbinding reuses the no-op put handler */
.unbind_tcf = multiq_put,
.dump = multiq_dump_class,
.dump_stats = multiq_dump_class_stats,
};
/*
 * Operations table for the "multiq" qdisc; registered and unregistered by
 * multiq_module_init() / multiq_module_exit().
 */
static struct Qdisc_ops multiq_qdisc_ops __read_mostly = {
.next = NULL,
.cl_ops = &multiq_class_ops,
.id = "multiq",
.priv_size = sizeof(struct multiq_sched_data),
.enqueue = multiq_enqueue,
.dequeue = multiq_dequeue,
.peek = multiq_peek,
.drop = multiq_drop,
.init = multiq_init,
.reset = multiq_reset,
.destroy = multiq_destroy,
/* runtime changes go through the same routine init uses */
.change = multiq_tune,
.dump = multiq_dump,
.owner = THIS_MODULE,
};
/* Module load: register the "multiq" qdisc and return register_qdisc()'s result. */
static int __init multiq_module_init(void)
{
return register_qdisc(&multiq_qdisc_ops);
}
/* Module unload: unregister the "multiq" qdisc. */
static void __exit multiq_module_exit(void)
{
unregister_qdisc(&multiq_qdisc_ops);
}
module_init(multiq_module_init)
module_exit(multiq_module_exit)
MODULE_LICENSE("GPL");

View File

@@ -0,0 +1,632 @@
/*
* net/sched/sch_netem.c Network emulator
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License.
*
* Many of the algorithms and ideas for this came from
* NIST Net which is not copyrighted.
*
* Authors: Stephen Hemminger <shemminger@osdl.org>
* Catalin(ux aka Dino) BOIE <catab at umbrella dot ro>
*/
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
#define VERSION "1.2"
/* Network Emulation Queuing algorithm.
====================================
Sources: [1] Mark Carson, Darrin Santay, "NIST Net - A Linux-based
Network Emulation Tool"
[2] Luigi Rizzo, DummyNet for FreeBSD
----------------------------------------------------------------
This started out as a simple way to delay outgoing packets to
test TCP but has grown to include most of the functionality
of a full blown network emulator like NISTnet. It can delay
packets and add random jitter (and correlation). The random
distribution can be loaded from a table as well to provide
normal, Pareto, or experimental curves. Packet loss,
duplication, and reordering can also be emulated.
This qdisc does not do classification; that can be handled by
layering other disciplines. It does not need to do bandwidth
control either, since that can be handled by using a token
bucket or other rate control.
*/
/* Private data of the netem qdisc. */
struct netem_sched_data {
/* inner qdisc that actually holds the delayed packets */
struct Qdisc *qdisc;
/* timer armed by netem_dequeue() for the head packet's send time */
struct qdisc_watchdog watchdog;
/* mean delay and its spread, passed to tabledist() as mu and sigma */
psched_tdiff_t latency;
psched_tdiff_t jitter;
/* thresholds below are compared against get_crandom(); 0 disables */
u32 loss;
/* copied from the request's limit field; also given to fifo_set_limit() */
u32 limit;
/* packets enqueued normally since the last reordered one */
u32 counter;
/* reordering is only considered once counter reaches gap; 0 = off */
u32 gap;
u32 duplicate;
u32 reorder;
u32 corrupt;
/* state of one correlated random number stream */
struct crndstate {
/* previous output, blended into the next one */
u32 last;
/* correlation weight; 0 means uncorrelated */
u32 rho;
} delay_cor, loss_cor, dup_cor, reorder_cor, corrupt_cor;
/* optional delay distribution table; NULL selects the uniform default */
struct disttable {
/* number of entries in table[] */
u32 size;
s16 table[0];
} *delay_dist;
};
/*
 * Time stamp put into socket buffer control block.
 * Written by netem_enqueue(); read by netem_dequeue() and tfifo_enqueue().
 */
struct netem_skb_cb {
/* earliest time at which the packet may leave the qdisc */
psched_time_t time_to_send;
};
static inline struct netem_skb_cb *netem_skb_cb(struct sk_buff *skb)
{
BUILD_BUG_ON(sizeof(skb->cb) <
sizeof(struct qdisc_skb_cb) + sizeof(struct netem_skb_cb));
return (struct netem_skb_cb *)qdisc_skb_cb(skb)->data;
}
/* init_crandom - initialize correlated random number generator
* Use entropy source for initial seed.
*/
static void init_crandom(struct crndstate *state, unsigned long rho)
{
state->rho = rho;
state->last = net_random();
}
/* get_crandom - draw the next value of a correlated random stream
 *
 * With rho == 0 this is plain net_random().  Otherwise the new random
 * value and the previous output are blended in 32.32 fixed point, weighted
 * by (2^32 - rho - 1) and (rho + 1) respectively, and the result becomes
 * the new "previous output".
 */
static u32 get_crandom(struct crndstate *state)
{
	u64 fresh, weight;
	unsigned long blended;

	if (!state->rho)	/* no correlation */
		return net_random();

	fresh = net_random();
	weight = (u64)state->rho + 1;
	blended = (fresh * ((1ull<<32) - weight) + state->last * weight) >> 32;

	state->last = blended;
	return blended;
}
/* tabledist - pseudo-random value with mean @mu and spread @sigma
 * @mu:    mean
 * @sigma: spread; 0 returns @mu unchanged
 * @state: correlated random stream to draw from
 * @dist:  distribution table, or NULL for a uniform pick in
 *         [mu - sigma, mu + sigma)
 *
 * With a table, a random entry t is scaled as sigma * t / NETEM_DIST_SCALE.
 * sigma is split into its remainder and quotient by NETEM_DIST_SCALE, and
 * the remainder part is rounded to nearest before the division.
 */
static psched_tdiff_t tabledist(psched_tdiff_t mu, psched_tdiff_t sigma,
				struct crndstate *state,
				const struct disttable *dist)
{
	psched_tdiff_t frac;
	long t;
	u32 rnd;

	if (sigma == 0)
		return mu;

	rnd = get_crandom(state);

	/* default uniform distribution */
	if (!dist)
		return (rnd % (2*sigma)) - sigma + mu;

	t = dist->table[rnd % dist->size];

	frac = (sigma % NETEM_DIST_SCALE) * t;
	frac += (frac >= 0) ? NETEM_DIST_SCALE/2 : -(NETEM_DIST_SCALE/2);

	return frac / NETEM_DIST_SCALE + (sigma / NETEM_DIST_SCALE) * t + mu;
}
/*
 * Insert one skb into qdisc.
 * Note: parent depends on return value to account for queue length.
 *	NET_XMIT_DROP: queue length didn't change.
 *	NET_XMIT_SUCCESS: one skb was queued.
 *
 * Emulation steps, in order: random duplication, random loss, random bit
 * corruption, then either normal delayed queueing or reordering to the
 * front of the inner queue.
 */
static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	/* We don't fill cb now as skb_unshare() may invalidate it */
	struct netem_skb_cb *cb;
	struct sk_buff *skb2;
	int ret;
	int count = 1;

	pr_debug("netem_enqueue skb=%p\n", skb);

	/* Random duplication */
	if (q->duplicate && q->duplicate >= get_crandom(&q->dup_cor))
		++count;

	/* Random packet drop 0 => none, ~0 => all */
	if (q->loss && q->loss >= get_crandom(&q->loss_cor))
		--count;

	if (count == 0) {
		sch->qstats.drops++;
		kfree_skb(skb);
		return NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
	}

	skb_orphan(skb);

	/*
	 * If we need to duplicate packet, then re-insert at top of the
	 * qdisc tree, since parent queuer expects that only one
	 * skb will be queued.
	 */
	if (count > 1 && (skb2 = skb_clone(skb, GFP_ATOMIC)) != NULL) {
		struct Qdisc *rootq = qdisc_root(sch);
		u32 dupsave = q->duplicate; /* prevent duplicating a dup... */
		q->duplicate = 0;

		qdisc_enqueue_root(skb2, rootq);
		q->duplicate = dupsave;
	}

	/*
	 * Randomized packet corruption.
	 * Make copy if needed since we are modifying
	 * If packet is going to be hardware checksummed, then
	 * do it now in software before we mangle it.
	 */
	if (q->corrupt && q->corrupt >= get_crandom(&q->corrupt_cor)) {
		unsigned int headlen;

		skb = skb_unshare(skb, GFP_ATOMIC);
		if (!skb) {
			/*
			 * Nothing left to free here: per the skb_unshare()
			 * contract the original buffer is already released
			 * when the copy fails.
			 */
			sch->qstats.drops++;
			return NET_XMIT_DROP;
		}

		if (skb->ip_summed == CHECKSUM_PARTIAL &&
		    skb_checksum_help(skb)) {
			/* we still own the skb on this path: don't leak it */
			kfree_skb(skb);
			sch->qstats.drops++;
			return NET_XMIT_DROP;
		}

		/*
		 * Only the linear head is mangled.  With an empty head
		 * there is no byte to flip, and the modulo below would
		 * divide by zero.
		 */
		headlen = skb_headlen(skb);
		if (headlen)
			skb->data[net_random() % headlen] ^= 1<<(net_random() % 8);
	}

	cb = netem_skb_cb(skb);
	if (q->gap == 0 		/* not doing reordering */
	    || q->counter < q->gap 	/* inside last reordering gap */
	    || q->reorder < get_crandom(&q->reorder_cor)) {
		psched_time_t now;
		psched_tdiff_t delay;

		delay = tabledist(q->latency, q->jitter,
				  &q->delay_cor, q->delay_dist);

		now = psched_get_time();
		cb->time_to_send = now + delay;
		++q->counter;
		ret = qdisc_enqueue(skb, q->qdisc);
	} else {
		/*
		 * Do re-ordering by putting one out of N packets at the front
		 * of the queue.
		 */
		cb->time_to_send = psched_get_time();
		q->counter = 0;

		__skb_queue_head(&q->qdisc->q, skb);
		q->qdisc->qstats.backlog += qdisc_pkt_len(skb);
		q->qdisc->qstats.requeues++;
		ret = NET_XMIT_SUCCESS;
	}

	if (likely(ret == NET_XMIT_SUCCESS)) {
		sch->q.qlen++;
		sch->bstats.bytes += qdisc_pkt_len(skb);
		sch->bstats.packets++;
	} else if (net_xmit_drop_count(ret)) {
		sch->qstats.drops++;
	}

	pr_debug("netem: enqueue ret %d\n", ret);
	return ret;
}
/*
 * Ask the inner qdisc to drop one packet.  Returns the length it reports
 * (0 when it has no drop handler or dropped nothing) and accounts for a
 * successful drop on this qdisc.
 */
static unsigned int netem_drop(struct Qdisc* sch)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	unsigned int len;

	if (!q->qdisc->ops->drop)
		return 0;

	len = q->qdisc->ops->drop(q->qdisc);
	if (len != 0) {
		sch->q.qlen--;
		sch->qstats.drops++;
	}
	return len;
}
/*
 * Release the head packet of the inner qdisc once its send time has
 * arrived.  If the head is not due yet, arm the watchdog for its send time
 * and return NULL.  Nothing is released while the qdisc is throttled.
 */
static struct sk_buff *netem_dequeue(struct Qdisc *sch)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	const struct netem_skb_cb *cb;
	struct sk_buff *skb;
	psched_time_t now;

	if (sch->flags & TCQ_F_THROTTLED)
		return NULL;

	skb = q->qdisc->ops->peek(q->qdisc);
	if (!skb)
		return NULL;

	cb = netem_skb_cb(skb);
	now = psched_get_time();

	/* head packet not due yet: wait for it */
	if (cb->time_to_send > now) {
		qdisc_watchdog_schedule(&q->watchdog, cb->time_to_send);
		return NULL;
	}

	skb = qdisc_dequeue_peeked(q->qdisc);
	if (unlikely(!skb))
		return NULL;

#ifdef CONFIG_NET_CLS_ACT
	/*
	 * If it's at ingress let's pretend the delay is
	 * from the network (tstamp will be updated).
	 */
	if (G_TC_FROM(skb->tc_verd) & AT_INGRESS)
		skb->tstamp.tv64 = 0;
#endif
	pr_debug("netem_dequeue: return skb=%p\n", skb);
	sch->q.qlen--;
	return skb;
}
/* Flush the inner qdisc, zero our queue length and cancel the watchdog. */
static void netem_reset(struct Qdisc *sch)
{
	struct netem_sched_data *priv = qdisc_priv(sch);
	struct Qdisc *inner = priv->qdisc;

	qdisc_reset(inner);
	sch->q.qlen = 0;
	qdisc_watchdog_cancel(&priv->watchdog);
}
/*
 * Distribution data is a variable size payload containing
 * signed 16 bit values.
 *
 * Copies the table out of @attr and installs it as the qdisc's delay
 * distribution, freeing any previous table.  The swap is done under the
 * root lock.
 *
 * Returns 0 on success, -EINVAL for an empty or oversized table (more
 * than 65536 entries), -ENOMEM on allocation failure.
 */
static int get_dist_table(struct Qdisc *sch, const struct nlattr *attr)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	unsigned long n = nla_len(attr)/sizeof(__s16);
	const __s16 *data = nla_data(attr);
	spinlock_t *root_lock;
	struct disttable *d;
	unsigned long i;

	/*
	 * An empty table must be rejected: tabledist() computes
	 * rnd % dist->size and would divide by zero.
	 */
	if (n == 0 || n > 65536)
		return -EINVAL;

	d = kmalloc(sizeof(*d) + n*sizeof(d->table[0]), GFP_KERNEL);
	if (!d)
		return -ENOMEM;

	d->size = n;
	for (i = 0; i < n; i++)
		d->table[i] = data[i];

	root_lock = qdisc_root_sleeping_lock(sch);

	spin_lock_bh(root_lock);
	kfree(q->delay_dist);
	q->delay_dist = d;
	spin_unlock_bh(root_lock);
	return 0;
}
/*
 * Apply a TCA_NETEM_CORR attribute: re-seed the delay, loss and
 * duplication random streams with the given correlation values.
 */
static void get_correlation(struct Qdisc *sch, const struct nlattr *attr)
{
	const struct tc_netem_corr *corr = nla_data(attr);
	struct netem_sched_data *priv = qdisc_priv(sch);

	init_crandom(&priv->delay_cor, corr->delay_corr);
	init_crandom(&priv->loss_cor, corr->loss_corr);
	init_crandom(&priv->dup_cor, corr->dup_corr);
}
/*
 * Apply a TCA_NETEM_REORDER attribute: store the reorder probability and
 * re-seed its random stream with the given correlation.
 */
static void get_reorder(struct Qdisc *sch, const struct nlattr *attr)
{
	const struct tc_netem_reorder *req = nla_data(attr);
	struct netem_sched_data *priv = qdisc_priv(sch);

	priv->reorder = req->probability;
	init_crandom(&priv->reorder_cor, req->correlation);
}
/*
 * Apply a TCA_NETEM_CORRUPT attribute: store the corruption probability
 * and re-seed its random stream with the given correlation.
 */
static void get_corrupt(struct Qdisc *sch, const struct nlattr *attr)
{
	const struct tc_netem_corrupt *req = nla_data(attr);
	struct netem_sched_data *priv = qdisc_priv(sch);

	priv->corrupt = req->probability;
	init_crandom(&priv->corrupt_cor, req->correlation);
}
/*
 * Netlink attribute policy for the optional netem attributes; passed to
 * parse_attr() by netem_change().  TCA_NETEM_DELAY_DIST has no entry here;
 * get_dist_table() takes its size from the attribute length.
 */
static const struct nla_policy netem_policy[TCA_NETEM_MAX + 1] = {
[TCA_NETEM_CORR] = { .len = sizeof(struct tc_netem_corr) },
[TCA_NETEM_REORDER] = { .len = sizeof(struct tc_netem_reorder) },
[TCA_NETEM_CORRUPT] = { .len = sizeof(struct tc_netem_corrupt) },
};
/*
 * Parse the attributes that follow a fixed-size header of @len bytes
 * inside @nla.
 *
 * Returns -EINVAL if @nla is shorter than the aligned header.  If what
 * follows is too short to hold even an attribute header, @tb is cleared
 * and 0 is returned; otherwise the result of nla_parse() is returned.
 */
static int parse_attr(struct nlattr *tb[], int maxtype, struct nlattr *nla,
		      const struct nla_policy *policy, int len)
{
	int rest = nla_len(nla) - NLA_ALIGN(len);

	if (rest < 0)
		return -EINVAL;

	if (rest < nla_attr_size(0)) {
		memset(tb, 0, sizeof(struct nlattr *) * (maxtype + 1));
		return 0;
	}

	return nla_parse(tb, maxtype, nla_data(nla) + NLA_ALIGN(len),
			 rest, policy);
}
/*
 * Parse netlink message to set options.
 *
 * @opt carries a struct tc_netem_qopt followed by optional attributes.
 * The inner fifo limit is applied first; only if that succeeds are the
 * basic parameters copied and the optional attributes processed.
 * Returns 0 or a negative errno.
 */
static int netem_change(struct Qdisc *sch, struct nlattr *opt)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	struct nlattr *attrs[TCA_NETEM_MAX + 1];
	struct tc_netem_qopt *qopt;
	int err;

	if (!opt)
		return -EINVAL;

	qopt = nla_data(opt);
	err = parse_attr(attrs, TCA_NETEM_MAX, opt, netem_policy,
			 sizeof(*qopt));
	if (err < 0)
		return err;

	err = fifo_set_limit(q->qdisc, qopt->limit);
	if (err) {
		pr_debug("netem: can't set fifo limit\n");
		return err;
	}

	q->latency = qopt->latency;
	q->jitter = qopt->jitter;
	q->limit = qopt->limit;
	q->gap = qopt->gap;
	q->counter = 0;
	q->loss = qopt->loss;
	q->duplicate = qopt->duplicate;

	/* for compatibility with earlier versions.
	 * if gap is set, need to assume 100% probability
	 */
	if (q->gap != 0)
		q->reorder = ~0;

	if (attrs[TCA_NETEM_CORR] != NULL)
		get_correlation(sch, attrs[TCA_NETEM_CORR]);

	if (attrs[TCA_NETEM_DELAY_DIST] != NULL) {
		err = get_dist_table(sch, attrs[TCA_NETEM_DELAY_DIST]);
		if (err)
			return err;
	}

	if (attrs[TCA_NETEM_REORDER] != NULL)
		get_reorder(sch, attrs[TCA_NETEM_REORDER]);

	if (attrs[TCA_NETEM_CORRUPT] != NULL)
		get_corrupt(sch, attrs[TCA_NETEM_CORRUPT]);

	return 0;
}
/*
 * Special case version of FIFO queue for use by netem.
 * It queues in order based on timestamps in skb's
 */
struct fifo_sched_data {
/* maximum number of queued packets */
u32 limit;
/*
 * Send time of the packet most recently appended at the tail by
 * tfifo_enqueue(); a new packet at or after it takes the tail fast path.
 */
psched_time_t oldest;
};
/*
 * Queue @nskb in time_to_send order.
 *
 * A full queue hands the packet to qdisc_reshape_fail().  Otherwise the
 * packet is appended at the tail when the queue is empty or its send time is
 * not earlier than the last tail insertion; if it is earlier, the list is
 * scanned from the tail for the first packet that is not due later and
 * @nskb is linked right after it.
 */
static int tfifo_enqueue(struct sk_buff *nskb, struct Qdisc *sch)
{
	struct fifo_sched_data *priv = qdisc_priv(sch);
	struct sk_buff_head *queue = &sch->q;
	psched_time_t due = netem_skb_cb(nskb)->time_to_send;
	struct sk_buff *pos;

	if (unlikely(skb_queue_len(queue) >= priv->limit))
		return qdisc_reshape_fail(nskb, sch);

	/* Optimize for add at tail */
	if (likely(skb_queue_empty(queue) || due >= priv->oldest)) {
		priv->oldest = due;
		return qdisc_enqueue_tail(nskb, sch);
	}

	/* Walk backwards until a packet that must leave no later than ours. */
	skb_queue_reverse_walk(queue, pos) {
		const struct netem_skb_cb *pos_cb = netem_skb_cb(pos);

		if (due >= pos_cb->time_to_send)
			break;
	}

	__skb_queue_after(queue, pos, nskb);

	/* Manual insertion bypasses the helper, so account for it here. */
	sch->qstats.backlog += qdisc_pkt_len(nskb);
	sch->bstats.bytes += qdisc_pkt_len(nskb);
	sch->bstats.packets++;

	return NET_XMIT_SUCCESS;
}
/*
 * Initialise (or reconfigure: this is also the .change hook) a tfifo qdisc.
 *
 * With options, the limit comes from struct tc_fifo_qopt; a payload shorter
 * than that structure is rejected with -EINVAL.  Without options the limit
 * is the device tx_queue_len, but never less than 1.  The tail timestamp is
 * reset to PSCHED_PASTPERFECT in both cases.
 */
static int tfifo_init(struct Qdisc *sch, struct nlattr *opt)
{
	struct fifo_sched_data *priv = qdisc_priv(sch);

	if (!opt) {
		priv->limit = max_t(u32, qdisc_dev(sch)->tx_queue_len, 1);
	} else {
		struct tc_fifo_qopt *ctl = nla_data(opt);

		if (nla_len(opt) < sizeof(*ctl))
			return -EINVAL;

		priv->limit = ctl->limit;
	}

	priv->oldest = PSCHED_PASTPERFECT;
	return 0;
}
static int tfifo_dump(struct Qdisc *sch, struct sk_buff *skb)
{
struct fifo_sched_data *q = qdisc_priv(sch);
struct tc_fifo_qopt opt = { .limit = q->limit };
NLA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);
return skb->len;
nla_put_failure:
return -1;
}
/*
 * Operations table of the time-ordered FIFO that netem creates as its inner
 * qdisc (see netem_init).  Only enqueue, init/change and dump are specific
 * to tfifo; the remaining hooks are the generic qdisc_* helpers.  Note that
 * tfifo_init doubles as the .change handler.
 */
static struct Qdisc_ops tfifo_qdisc_ops __read_mostly = {
.id = "tfifo",
.priv_size = sizeof(struct fifo_sched_data),
.enqueue = tfifo_enqueue,
.dequeue = qdisc_dequeue_head,
.peek = qdisc_peek_head,
.drop = qdisc_queue_drop,
.init = tfifo_init,
.reset = qdisc_reset_queue,
.change = tfifo_init,
.dump = tfifo_dump,
};
static int netem_init(struct Qdisc *sch, struct nlattr *opt)
{
struct netem_sched_data *q = qdisc_priv(sch);
int ret;
if (!opt)
return -EINVAL;
qdisc_watchdog_init(&q->watchdog, sch);
q->qdisc = qdisc_create_dflt(qdisc_dev(sch), sch->dev_queue,
&tfifo_qdisc_ops,
TC_H_MAKE(sch->handle, 1));
if (!q->qdisc) {
pr_debug("netem: qdisc create failed\n");
return -ENOMEM;
}
ret = netem_change(sch, opt);
if (ret) {
pr_debug("netem: change failed\n");
qdisc_destroy(q->qdisc);
}
return ret;
}
/*
 * Tear down a netem qdisc: stop the watchdog, destroy the inner qdisc and
 * free the delay distribution table.
 */
static void netem_destroy(struct Qdisc *sch)
{
	struct netem_sched_data *priv = qdisc_priv(sch);

	qdisc_watchdog_cancel(&priv->watchdog);
	qdisc_destroy(priv->qdisc);
	kfree(priv->delay_dist);
}
/*
 * Dump the netem configuration into @skb.
 *
 * The TCA_OPTIONS attribute carries struct tc_netem_qopt, and the CORR,
 * REORDER and CORRUPT attributes are appended directly behind it.  The
 * length of the first attribute is then patched to span everything that was
 * written, which makes the later attributes part of its payload.
 * Returns skb->len, or -1 after trimming the partial output.
 */
static int netem_dump(struct Qdisc *sch, struct sk_buff *skb)
{
	const struct netem_sched_data *priv = qdisc_priv(sch);
	unsigned char *start = skb_tail_pointer(skb);
	struct nlattr *outer = (struct nlattr *) start;
	struct tc_netem_qopt qopt;
	struct tc_netem_corr cor;
	struct tc_netem_reorder reorder;
	struct tc_netem_corrupt corrupt;

	/* Snapshot every value first, then emit the attributes in order. */
	qopt.latency = priv->latency;
	qopt.jitter = priv->jitter;
	qopt.limit = priv->limit;
	qopt.loss = priv->loss;
	qopt.gap = priv->gap;
	qopt.duplicate = priv->duplicate;

	cor.delay_corr = priv->delay_cor.rho;
	cor.loss_corr = priv->loss_cor.rho;
	cor.dup_corr = priv->dup_cor.rho;

	reorder.probability = priv->reorder;
	reorder.correlation = priv->reorder_cor.rho;

	corrupt.probability = priv->corrupt;
	corrupt.correlation = priv->corrupt_cor.rho;

	NLA_PUT(skb, TCA_OPTIONS, sizeof(qopt), &qopt);
	NLA_PUT(skb, TCA_NETEM_CORR, sizeof(cor), &cor);
	NLA_PUT(skb, TCA_NETEM_REORDER, sizeof(reorder), &reorder);
	NLA_PUT(skb, TCA_NETEM_CORRUPT, sizeof(corrupt), &corrupt);

	/* Stretch the first attribute over all that followed it. */
	outer->nla_len = skb_tail_pointer(skb) - start;
	return skb->len;

nla_put_failure:
	nlmsg_trim(skb, start);
	return -1;
}
/*
 * Operations table for the "netem" qdisc, registered in netem_module_init().
 * No class operations (.cl_ops) are provided by this table.
 */
static struct Qdisc_ops netem_qdisc_ops __read_mostly = {
.id = "netem",
.priv_size = sizeof(struct netem_sched_data),
.enqueue = netem_enqueue,
.dequeue = netem_dequeue,
.peek = qdisc_peek_dequeued,
.drop = netem_drop,
.init = netem_init,
.reset = netem_reset,
.destroy = netem_destroy,
.change = netem_change,
.dump = netem_dump,
.owner = THIS_MODULE,
};
/* Module entry point: log the netem version and register the qdisc. */
static int __init netem_module_init(void)
{
	pr_info("netem: version " VERSION "\n");

	return register_qdisc(&netem_qdisc_ops);
}
/* Module exit point: remove the netem qdisc from the scheduler registry. */
static void __exit netem_module_exit(void)
{
	unregister_qdisc(&netem_qdisc_ops);
}
/* Hook the init/exit functions into the module loader and declare the license. */
module_init(netem_module_init)
module_exit(netem_module_exit)
MODULE_LICENSE("GPL");

406
kernel/net/sched/sch_prio.c Normal file
View File

@@ -0,0 +1,406 @@
/*
* net/sched/sch_prio.c Simple 3-band priority "scheduler".
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
* Fixes: 19990609: J Hadi Salim <hadi@nortelnetworks.com>:
* Init -- EINVAL when opt undefined
*/
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
/*
 * Private state of a prio qdisc.
 *   bands       - number of bands currently in use (set by prio_tune)
 *   filter_list - classifier chain consulted by prio_classify
 *   prio2band   - maps (skb->priority & TC_PRIO_MAX) to a band index
 *   queues      - one child qdisc per band; unused slots hold &noop_qdisc
 */
struct prio_sched_data
{
int bands;
struct tcf_proto *filter_list;
u8 prio2band[TC_PRIO_MAX+1];
struct Qdisc *queues[TCQ_PRIO_BANDS];
};
/*
 * Select the child qdisc (band) that @skb should be queued on.
 *
 * *qerr is preset to NET_XMIT_SUCCESS | __NET_XMIT_BYPASS and switched to
 * __NET_XMIT_STOLEN when a classifier action stole or queued the packet.
 * Returns NULL when an action consumed or shot the packet (only possible
 * with CONFIG_NET_CLS_ACT), otherwise the chosen child qdisc.
 */
static struct Qdisc *
prio_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
{
struct prio_sched_data *q = qdisc_priv(sch);
u32 band = skb->priority;
struct tcf_result res;
int err;
*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
/* Priority does not name a class of this qdisc: ask the filters. */
if (TC_H_MAJ(skb->priority) != sch->handle) {
err = tc_classify(skb, q->filter_list, &res);
#ifdef CONFIG_NET_CLS_ACT
switch (err) {
case TC_ACT_STOLEN:
case TC_ACT_QUEUED:
*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
/* fall through: stolen/queued packets also return NULL */
case TC_ACT_SHOT:
return NULL;
}
#endif
/*
 * No filters or no match: use the priomap.  A priority with a
 * (foreign) major number is treated as priority 0.
 */
if (!q->filter_list || err < 0) {
if (TC_H_MAJ(band))
band = 0;
return q->queues[q->prio2band[band&TC_PRIO_MAX]];
}
band = res.classid;
}
/* Class minors are 1-based; band is unsigned, so minor 0 wraps around. */
band = TC_H_MIN(band) - 1;
/* Out-of-range class: fall back to the band mapped for priority 0. */
if (band >= q->bands)
return q->queues[q->prio2band[0]];
return q->queues[band];
}
/*
 * Classify @skb and enqueue it on the selected band.
 *
 * Updates the parent's byte/packet counters and queue length on success and
 * its drop counter when the child (or the classifier) dropped the packet.
 * Returns the NET_XMIT_* code of the operation.
 */
static int
prio_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
	struct Qdisc *band_q;
	int rc;

	band_q = prio_classify(skb, sch, &rc);
#ifdef CONFIG_NET_CLS_ACT
	if (!band_q) {
		/* BYPASS still set means the packet was shot, not stolen. */
		if (rc & __NET_XMIT_BYPASS)
			sch->qstats.drops++;
		kfree_skb(skb);
		return rc;
	}
#endif

	rc = qdisc_enqueue(skb, band_q);
	if (rc != NET_XMIT_SUCCESS) {
		if (net_xmit_drop_count(rc))
			sch->qstats.drops++;
		return rc;
	}

	sch->bstats.bytes += qdisc_pkt_len(skb);
	sch->bstats.packets++;
	sch->q.qlen++;
	return NET_XMIT_SUCCESS;
}
/*
 * Return, without removing it, the packet that would be dequeued next: the
 * head of the first band (lowest index) whose peek yields a packet, or NULL.
 */
static struct sk_buff *prio_peek(struct Qdisc *sch)
{
	struct prio_sched_data *priv = qdisc_priv(sch);
	int band;

	for (band = 0; band < priv->bands; band++) {
		struct Qdisc *child = priv->queues[band];
		struct sk_buff *head = child->ops->peek(child);

		if (head != NULL)
			return head;
	}

	return NULL;
}
/*
 * Dequeue from the first band (lowest index) that has a packet to offer and
 * decrement the parent's queue length.  Returns NULL if no band yields one.
 */
static struct sk_buff *prio_dequeue(struct Qdisc* sch)
{
	struct prio_sched_data *priv = qdisc_priv(sch);
	int band;

	for (band = 0; band < priv->bands; band++) {
		struct Qdisc *child = priv->queues[band];
		struct sk_buff *pkt = child->dequeue(child);

		if (!pkt)
			continue;

		sch->q.qlen--;
		return pkt;
	}

	return NULL;
}
/*
 * Drop one packet, trying the bands from the highest index downwards.
 * Returns the length reported by the child's drop hook, or 0 if no child
 * dropped anything.
 */
static unsigned int prio_drop(struct Qdisc* sch)
{
	struct prio_sched_data *priv = qdisc_priv(sch);
	struct Qdisc *child;
	unsigned int dropped;
	int band;

	for (band = priv->bands - 1; band >= 0; band--) {
		child = priv->queues[band];

		/* Children without a drop hook cannot help. */
		if (!child->ops->drop)
			continue;

		dropped = child->ops->drop(child);
		if (dropped == 0)
			continue;

		sch->q.qlen--;
		return dropped;
	}

	return 0;
}
/* Reset every active band and zero the parent's queue length. */
static void
prio_reset(struct Qdisc* sch)
{
	struct prio_sched_data *priv = qdisc_priv(sch);
	int band;

	for (band = 0; band < priv->bands; band++)
		qdisc_reset(priv->queues[band]);

	sch->q.qlen = 0;
}
/* Release the filter chain, then destroy the child qdisc of each active band. */
static void
prio_destroy(struct Qdisc* sch)
{
	struct prio_sched_data *priv = qdisc_priv(sch);
	int band;

	tcf_destroy_chain(&priv->filter_list);

	for (band = 0; band < priv->bands; band++)
		qdisc_destroy(priv->queues[band]);
}
/*
 * Apply a struct tc_prio_qopt to the qdisc (used for both init and change).
 *
 * Validation: the payload must hold a full tc_prio_qopt, the band count must
 * lie in [2, TCQ_PRIO_BANDS] and every priomap entry must refer to an
 * existing band; otherwise -EINVAL is returned.
 *
 * Under the tree lock the band count and priomap are installed and children
 * of bands that are no longer in use are destroyed.  Afterwards every active
 * band that still holds &noop_qdisc gets a default pfifo child; a failed
 * allocation leaves that band on noop_qdisc and is not reported.
 */
static int prio_tune(struct Qdisc *sch, struct nlattr *opt)
{
	struct prio_sched_data *priv = qdisc_priv(sch);
	struct tc_prio_qopt *cfg;
	int band;

	if (nla_len(opt) < sizeof(*cfg))
		return -EINVAL;
	cfg = nla_data(opt);

	if (cfg->bands < 2 || cfg->bands > TCQ_PRIO_BANDS)
		return -EINVAL;

	for (band = 0; band <= TC_PRIO_MAX; band++)
		if (cfg->priomap[band] >= cfg->bands)
			return -EINVAL;

	sch_tree_lock(sch);
	priv->bands = cfg->bands;
	memcpy(priv->prio2band, cfg->priomap, TC_PRIO_MAX + 1);

	/* Retire children of bands beyond the new band count. */
	for (band = priv->bands; band < TCQ_PRIO_BANDS; band++) {
		struct Qdisc *stale = priv->queues[band];

		priv->queues[band] = &noop_qdisc;
		if (stale == &noop_qdisc)
			continue;

		qdisc_tree_decrease_qlen(stale, stale->q.qlen);
		qdisc_destroy(stale);
	}
	sch_tree_unlock(sch);

	/* Populate active bands that have no real child yet. */
	for (band = 0; band < priv->bands; band++) {
		struct Qdisc *fresh, *prev;

		if (priv->queues[band] != &noop_qdisc)
			continue;

		fresh = qdisc_create_dflt(qdisc_dev(sch), sch->dev_queue,
					  &pfifo_qdisc_ops,
					  TC_H_MAKE(sch->handle, band + 1));
		if (!fresh)
			continue;

		sch_tree_lock(sch);
		prev = priv->queues[band];
		priv->queues[band] = fresh;

		/* The slot may have changed while the lock was dropped. */
		if (prev != &noop_qdisc) {
			qdisc_tree_decrease_qlen(prev, prev->q.qlen);
			qdisc_destroy(prev);
		}
		sch_tree_unlock(sch);
	}

	return 0;
}
/*
 * Initialise a prio qdisc: point every band at &noop_qdisc, then configure
 * it through prio_tune().  Options are mandatory (-EINVAL without them).
 */
static int prio_init(struct Qdisc *sch, struct nlattr *opt)
{
	struct prio_sched_data *priv = qdisc_priv(sch);
	int band;

	for (band = 0; band < TCQ_PRIO_BANDS; band++)
		priv->queues[band] = &noop_qdisc;

	if (!opt)
		return -EINVAL;

	return prio_tune(sch, opt);
}
static int prio_dump(struct Qdisc *sch, struct sk_buff *skb)
{
struct prio_sched_data *q = qdisc_priv(sch);
unsigned char *b = skb_tail_pointer(skb);
struct tc_prio_qopt opt;
opt.bands = q->bands;
memcpy(&opt.priomap, q->prio2band, TC_PRIO_MAX+1);
NLA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);
return skb->len;
nla_put_failure:
nlmsg_trim(skb, b);
return -1;
}
/*
 * Replace the child qdisc of class @arg (1-based band number) with @new,
 * or with &noop_qdisc when @new is NULL.  The previous child is returned
 * through @old after its packets have been accounted away and it has been
 * reset.  Always returns 0.
 */
static int prio_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
		      struct Qdisc **old)
{
	struct prio_sched_data *priv = qdisc_priv(sch);
	unsigned long idx = arg - 1;

	if (!new)
		new = &noop_qdisc;

	sch_tree_lock(sch);
	*old = priv->queues[idx];
	priv->queues[idx] = new;
	qdisc_tree_decrease_qlen(*old, (*old)->q.qlen);
	qdisc_reset(*old);
	sch_tree_unlock(sch);

	return 0;
}
/* Return the child qdisc attached to class @arg (1-based band number). */
static struct Qdisc *
prio_leaf(struct Qdisc *sch, unsigned long arg)
{
	struct prio_sched_data *priv = qdisc_priv(sch);

	return priv->queues[arg - 1];
}
/*
 * Translate a class id into the internal class handle, which is simply the
 * 1-based band number.  Returns 0 when the minor number is 0 or exceeds the
 * configured number of bands.
 */
static unsigned long prio_get(struct Qdisc *sch, u32 classid)
{
	struct prio_sched_data *priv = qdisc_priv(sch);
	unsigned long minor = TC_H_MIN(classid);

	/* Unsigned arithmetic: minor 0 wraps and fails the range check. */
	return (minor - 1 < priv->bands) ? minor : 0;
}
/* Bind a filter to a class: identical to a lookup, @parent is ignored. */
static unsigned long prio_bind(struct Qdisc *sch, unsigned long parent, u32 classid)
{
	unsigned long cl = prio_get(sch, classid);

	return cl;
}
/* Release a class reference: deliberately a no-op for prio. */
static void prio_put(struct Qdisc *q, unsigned long cl)
{
}
static int prio_dump_class(struct Qdisc *sch, unsigned long cl, struct sk_buff *skb,
struct tcmsg *tcm)
{
struct prio_sched_data *q = qdisc_priv(sch);
tcm->tcm_handle |= TC_H_MIN(cl);
tcm->tcm_info = q->queues[cl-1]->handle;
return 0;
}
/*
 * Export the statistics of class @cl, which are those of its child qdisc.
 * The child's qstats.qlen is refreshed from its queue length first.
 * Returns 0 on success, -1 if either copy fails.
 */
static int prio_dump_class_stats(struct Qdisc *sch, unsigned long cl,
				 struct gnet_dump *d)
{
	struct prio_sched_data *priv = qdisc_priv(sch);
	struct Qdisc *child = priv->queues[cl - 1];

	child->qstats.qlen = child->q.qlen;

	if (gnet_stats_copy_basic(d, &child->bstats) < 0)
		return -1;
	if (gnet_stats_copy_queue(d, &child->qstats) < 0)
		return -1;

	return 0;
}
/*
 * Iterate over all classes (bands), honouring the walker's skip count.
 * The callback receives the 1-based band number; a negative return value
 * sets arg->stop and ends the walk.
 */
static void prio_walk(struct Qdisc *sch, struct qdisc_walker *arg)
{
	struct prio_sched_data *priv = qdisc_priv(sch);
	int band;

	if (arg->stop)
		return;

	for (band = 0; band < priv->bands; band++) {
		/* Entries below the skip mark are counted but not visited. */
		if (arg->count >= arg->skip &&
		    arg->fn(sch, band + 1, arg) < 0) {
			arg->stop = 1;
			break;
		}
		arg->count++;
	}
}
/*
 * Return the filter chain anchor.  Filters exist only at the qdisc level
 * (cl == 0); individual classes have none, so NULL is returned for them.
 */
static struct tcf_proto ** prio_find_tcf(struct Qdisc *sch, unsigned long cl)
{
	struct prio_sched_data *priv = qdisc_priv(sch);

	return cl ? NULL : &priv->filter_list;
}
/*
 * Class operations of the prio qdisc.  prio_put is a no-op and therefore
 * serves as both the .put and the .unbind_tcf handler.
 */
static const struct Qdisc_class_ops prio_class_ops = {
.graft = prio_graft,
.leaf = prio_leaf,
.get = prio_get,
.put = prio_put,
.walk = prio_walk,
.tcf_chain = prio_find_tcf,
.bind_tcf = prio_bind,
.unbind_tcf = prio_put,
.dump = prio_dump_class,
.dump_stats = prio_dump_class_stats,
};
/*
 * Operations table for the "prio" qdisc, registered in prio_module_init().
 * prio_tune handles runtime changes and is also called from prio_init.
 */
static struct Qdisc_ops prio_qdisc_ops __read_mostly = {
.next = NULL,
.cl_ops = &prio_class_ops,
.id = "prio",
.priv_size = sizeof(struct prio_sched_data),
.enqueue = prio_enqueue,
.dequeue = prio_dequeue,
.peek = prio_peek,
.drop = prio_drop,
.init = prio_init,
.reset = prio_reset,
.destroy = prio_destroy,
.change = prio_tune,
.dump = prio_dump,
.owner = THIS_MODULE,
};
/* Module entry point: register the prio qdisc with the packet scheduler. */
static int __init prio_module_init(void)
{
	int err = register_qdisc(&prio_qdisc_ops);

	return err;
}
/* Module exit point: unregister the prio qdisc. */
static void __exit prio_module_exit(void)
{
	unregister_qdisc(&prio_qdisc_ops);
}
/* Hook the init/exit functions into the module loader and declare the license. */
module_init(prio_module_init)
module_exit(prio_module_exit)
MODULE_LICENSE("GPL");

360
kernel/net/sched/sch_red.c Normal file
View File

@@ -0,0 +1,360 @@
/*
* net/sched/sch_red.c Random Early Detection queue.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
*
* Changes:
* J Hadi Salim 980914: computation fixes
* Alexey Makarenko <makar@phoenix.kharkov.ua> 990814: qave on idle link was calculated incorrectly.
* J Hadi Salim 980816: ECN support
*/
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/skbuff.h>
#include <net/pkt_sched.h>
#include <net/inet_ecn.h>
#include <net/red.h>
/* Parameters, settable by user:
-----------------------------
limit - bytes (must be > qth_max + burst)
Hard limit on queue length, should be chosen >qth_max
to allow packet bursts. This parameter does not
affect the algorithm's behaviour and can be chosen
arbitrarily high (well, less than ram size)
Really, this limit will never be reached
if RED works correctly.
*/
/*
 * Private state of a RED qdisc.
 *   flags - TC_RED_* option bits (tested by red_use_ecn / red_use_harddrop)
 *   parms - RED algorithm parameters and running average
 *   stats - mark/drop counters exported by red_dump_stats
 *   qdisc - child qdisc that actually holds the packets
 */
struct red_sched_data
{
u32 limit; /* HARD maximal queue length */
unsigned char flags;
struct red_parms parms;
struct red_stats stats;
struct Qdisc *qdisc;
};
/* Non-zero when the TC_RED_ECN flag is set, i.e. marking is preferred to dropping. */
static inline int red_use_ecn(struct red_sched_data *q)
{
	unsigned char flags = q->flags;

	return flags & TC_RED_ECN;
}
/* Non-zero when the TC_RED_HARDDROP flag is set (always drop on hard mark). */
static inline int red_use_harddrop(struct red_sched_data *q)
{
	unsigned char flags = q->flags;

	return flags & TC_RED_HARDDROP;
}
/*
 * Enqueue @skb, applying the RED decision first.
 *
 * The average queue size is recomputed from the child's backlog, and an
 * ongoing idle period is ended.  Depending on red_action():
 *   RED_DONT_MARK - packet is passed on unchanged
 *   RED_PROB_MARK - packet is ECN-marked if ECN is enabled and marking
 *                   succeeds, otherwise dropped (prob_drop)
 *   RED_HARD_MARK - as above, but harddrop mode forces a drop (forced_drop)
 * Packets that survive are handed to the child qdisc.
 *
 * Returns the child's enqueue result, or NET_XMIT_CN for a RED drop.
 */
static int red_enqueue(struct sk_buff *skb, struct Qdisc* sch)
{
struct red_sched_data *q = qdisc_priv(sch);
struct Qdisc *child = q->qdisc;
int ret;
q->parms.qavg = red_calc_qavg(&q->parms, child->qstats.backlog);
if (red_is_idling(&q->parms))
red_end_of_idle_period(&q->parms);
switch (red_action(&q->parms, q->parms.qavg)) {
case RED_DONT_MARK:
break;
case RED_PROB_MARK:
sch->qstats.overlimits++;
/* Drop unless ECN is on and the CE mark could be set. */
if (!red_use_ecn(q) || !INET_ECN_set_ce(skb)) {
q->stats.prob_drop++;
goto congestion_drop;
}
q->stats.prob_mark++;
break;
case RED_HARD_MARK:
sch->qstats.overlimits++;
/* Harddrop overrides ECN; otherwise same rule as above. */
if (red_use_harddrop(q) || !red_use_ecn(q) ||
!INET_ECN_set_ce(skb)) {
q->stats.forced_drop++;
goto congestion_drop;
}
q->stats.forced_mark++;
break;
}
ret = qdisc_enqueue(skb, child);
if (likely(ret == NET_XMIT_SUCCESS)) {
sch->bstats.bytes += qdisc_pkt_len(skb);
sch->bstats.packets++;
sch->q.qlen++;
} else if (net_xmit_drop_count(ret)) {
/* The child refused the packet. */
q->stats.pdrop++;
sch->qstats.drops++;
}
return ret;
congestion_drop:
qdisc_drop(skb, sch);
return NET_XMIT_CN;
}
/*
 * Dequeue from the child qdisc.  When the child has nothing to give and RED
 * is not already idling, the start of an idle period is recorded.
 */
static struct sk_buff * red_dequeue(struct Qdisc* sch)
{
	struct red_sched_data *priv = qdisc_priv(sch);
	struct Qdisc *child = priv->qdisc;
	struct sk_buff *pkt = child->dequeue(child);

	if (pkt) {
		sch->q.qlen--;
		return pkt;
	}

	if (!red_is_idling(&priv->parms))
		red_start_of_idle_period(&priv->parms);

	return pkt;
}
/* Peek at the next packet by delegating to the child qdisc's peek hook. */
static struct sk_buff * red_peek(struct Qdisc* sch)
{
	struct red_sched_data *priv = qdisc_priv(sch);

	return priv->qdisc->ops->peek(priv->qdisc);
}
/*
 * Ask the child qdisc to drop one packet.
 *
 * On success the "other" and drop counters are bumped, the queue length is
 * decremented and the dropped length is returned.  If nothing was dropped,
 * an idle period is started (unless already idling) and 0 is returned.
 */
static unsigned int red_drop(struct Qdisc* sch)
{
	struct red_sched_data *priv = qdisc_priv(sch);
	struct Qdisc *child = priv->qdisc;
	unsigned int dropped = 0;

	if (child->ops->drop)
		dropped = child->ops->drop(child);

	if (dropped > 0) {
		priv->stats.other++;
		sch->qstats.drops++;
		sch->q.qlen--;
		return dropped;
	}

	if (!red_is_idling(&priv->parms))
		red_start_of_idle_period(&priv->parms);

	return 0;
}
/* Reset the child qdisc, zero the queue length and restart the RED state. */
static void red_reset(struct Qdisc* sch)
{
	struct red_sched_data *priv = qdisc_priv(sch);

	qdisc_reset(priv->qdisc);
	sch->q.qlen = 0;
	red_restart(&priv->parms);
}
/* Destroy the child qdisc owned by this RED instance. */
static void red_destroy(struct Qdisc *sch)
{
	struct red_sched_data *priv = qdisc_priv(sch);

	qdisc_destroy(priv->qdisc);
}
/*
 * Netlink policy for the nested RED options parsed in red_change():
 * length constraints for the parameter block and the lookup table.
 */
static const struct nla_policy red_policy[TCA_RED_MAX + 1] = {
[TCA_RED_PARMS] = { .len = sizeof(struct tc_red_qopt) },
[TCA_RED_STAB] = { .len = RED_STAB_SIZE },
};
static int red_change(struct Qdisc *sch, struct nlattr *opt)
{
struct red_sched_data *q = qdisc_priv(sch);
struct nlattr *tb[TCA_RED_MAX + 1];
struct tc_red_qopt *ctl;
struct Qdisc *child = NULL;
int err;
if (opt == NULL)
return -EINVAL;
err = nla_parse_nested(tb, TCA_RED_MAX, opt, red_policy);
if (err < 0)
return err;
if (tb[TCA_RED_PARMS] == NULL ||
tb[TCA_RED_STAB] == NULL)
return -EINVAL;
ctl = nla_data(tb[TCA_RED_PARMS]);
if (ctl->limit > 0) {
child = fifo_create_dflt(sch, &bfifo_qdisc_ops, ctl->limit);
if (IS_ERR(child))
return PTR_ERR(child);
}
sch_tree_lock(sch);
q->flags = ctl->flags;
q->limit = ctl->limit;
if (child) {
qdisc_tree_decrease_qlen(q->qdisc, q->qdisc->q.qlen);
qdisc_destroy(q->qdisc);
q->qdisc = child;
}
red_set_parms(&q->parms, ctl->qth_min, ctl->qth_max, ctl->Wlog,
ctl->Plog, ctl->Scell_log,
nla_data(tb[TCA_RED_STAB]));
if (skb_queue_empty(&sch->q))
red_end_of_idle_period(&q->parms);
sch_tree_unlock(sch);
return 0;
}
/*
 * Initialise a RED qdisc: start with &noop_qdisc as the child, then let
 * red_change() apply the options (and create the real child).
 */
static int red_init(struct Qdisc* sch, struct nlattr *opt)
{
	struct red_sched_data *priv = qdisc_priv(sch);

	priv->qdisc = &noop_qdisc;

	return red_change(sch, opt);
}
static int red_dump(struct Qdisc *sch, struct sk_buff *skb)
{
struct red_sched_data *q = qdisc_priv(sch);
struct nlattr *opts = NULL;
struct tc_red_qopt opt = {
.limit = q->limit,
.flags = q->flags,
.qth_min = q->parms.qth_min >> q->parms.Wlog,
.qth_max = q->parms.qth_max >> q->parms.Wlog,
.Wlog = q->parms.Wlog,
.Plog = q->parms.Plog,
.Scell_log = q->parms.Scell_log,
};
opts = nla_nest_start(skb, TCA_OPTIONS);
if (opts == NULL)
goto nla_put_failure;
NLA_PUT(skb, TCA_RED_PARMS, sizeof(opt), &opt);
return nla_nest_end(skb, opts);
nla_put_failure:
nla_nest_cancel(skb, opts);
return -EMSGSIZE;
}
static int red_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
{
struct red_sched_data *q = qdisc_priv(sch);
struct tc_red_xstats st = {
.early = q->stats.prob_drop + q->stats.forced_drop,
.pdrop = q->stats.pdrop,
.other = q->stats.other,
.marked = q->stats.prob_mark + q->stats.forced_mark,
};
return gnet_stats_copy_app(d, &st, sizeof(st));
}
static int red_dump_class(struct Qdisc *sch, unsigned long cl,
struct sk_buff *skb, struct tcmsg *tcm)
{
struct red_sched_data *q = qdisc_priv(sch);
tcm->tcm_handle |= TC_H_MIN(1);
tcm->tcm_info = q->qdisc->handle;
return 0;
}
/*
 * Replace the child qdisc with @new (or &noop_qdisc when @new is NULL).
 * The previous child is handed back through @old after its packets have
 * been accounted away and it has been reset.  Always returns 0.
 */
static int red_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
		     struct Qdisc **old)
{
	struct red_sched_data *priv = qdisc_priv(sch);

	if (!new)
		new = &noop_qdisc;

	sch_tree_lock(sch);
	*old = priv->qdisc;
	priv->qdisc = new;
	qdisc_tree_decrease_qlen(*old, (*old)->q.qlen);
	qdisc_reset(*old);
	sch_tree_unlock(sch);

	return 0;
}
/* Return the child qdisc; @arg is ignored because RED has a single class. */
static struct Qdisc *red_leaf(struct Qdisc *sch, unsigned long arg)
{
	struct red_sched_data *priv = qdisc_priv(sch);
	struct Qdisc *child = priv->qdisc;

	return child;
}
/* Every class id resolves to the one and only class, handle 1. */
static unsigned long red_get(struct Qdisc *sch, u32 classid)
{
	return 1UL;
}
/* Release a class reference: nothing to do for RED. */
static void red_put(struct Qdisc *sch, unsigned long arg)
{
}
/*
 * Walk the single RED class (handle 1).  The callback runs only once the
 * walker's skip count has been reached; a negative result sets
 * walker->stop and leaves the count untouched.
 */
static void red_walk(struct Qdisc *sch, struct qdisc_walker *walker)
{
	if (walker->stop)
		return;

	if (walker->count >= walker->skip &&
	    walker->fn(sch, 1, walker) < 0) {
		walker->stop = 1;
		return;
	}

	walker->count++;
}
/*
 * Class operations of the RED qdisc, which exposes one class (handle 1)
 * wrapping its child qdisc.  No filter hooks are provided in this table.
 */
static const struct Qdisc_class_ops red_class_ops = {
.graft = red_graft,
.leaf = red_leaf,
.get = red_get,
.put = red_put,
.walk = red_walk,
.dump = red_dump_class,
};
/* Operations table for the "red" qdisc, registered in red_module_init(). */
static struct Qdisc_ops red_qdisc_ops __read_mostly = {
.id = "red",
.priv_size = sizeof(struct red_sched_data),
.cl_ops = &red_class_ops,
.enqueue = red_enqueue,
.dequeue = red_dequeue,
.peek = red_peek,
.drop = red_drop,
.init = red_init,
.reset = red_reset,
.destroy = red_destroy,
.change = red_change,
.dump = red_dump,
.dump_stats = red_dump_stats,
.owner = THIS_MODULE,
};
/* Module entry point: register the RED qdisc with the packet scheduler. */
static int __init red_module_init(void)
{
	int err = register_qdisc(&red_qdisc_ops);

	return err;
}
/* Module exit point: unregister the RED qdisc. */
static void __exit red_module_exit(void)
{
	unregister_qdisc(&red_qdisc_ops);
}
/* Hook the init/exit functions into the module loader and declare the license. */
module_init(red_module_init)
module_exit(red_module_exit)
MODULE_LICENSE("GPL");

589
kernel/net/sched/sch_sfq.c Normal file
View File

@@ -0,0 +1,589 @@
/*
* net/sched/sch_sfq.c Stochastic Fairness Queueing discipline.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
*/
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/jiffies.h>
#include <linux/string.h>
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/init.h>
#include <linux/ipv6.h>
#include <linux/skbuff.h>
#include <linux/jhash.h>
#include <net/ip.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
/* Stochastic Fairness Queuing algorithm.
=======================================
Source:
Paul E. McKenney "Stochastic Fairness Queuing",
IEEE INFOCOM '90 Proceedings, San Francisco, 1990.
Paul E. McKenney "Stochastic Fairness Queuing",
"Interworking: Research and Experience", v.2, 1991, p.113-131.
See also:
M. Shreedhar and George Varghese "Efficient Fair
Queuing using Deficit Round Robin", Proc. SIGCOMM 95.
This is not the thing that is usually called (W)FQ nowadays.
It does not use any timestamp mechanism, but instead
processes queues in round-robin order.
ADVANTAGE:
- It is very cheap. Both CPU and memory requirements are minimal.
DRAWBACKS:
- "Stochastic" -> It is not 100% fair.
When hash collisions occur, several flows are considered as one.
- "Round-robin" -> It introduces larger delays than virtual clock
based schemes, and should not be used for isolating interactive
traffic from non-interactive. It means, that this scheduler
should be used as leaf of CBQ or P3, which put interactive traffic
to higher priority band.
We still need true WFQ for top level CSZ, but using WFQ
for the best effort traffic is absolutely pointless:
SFQ is superior for this purpose.
IMPLEMENTATION:
This implementation limits maximal queue length to 128;
maximal mtu to 2^15-1; number of hash buckets to 1024.
The only goal of these restrictions was that all data
fit into one 4K page :-). Struct sfq_sched_data is
organized in anti-cache manner: all the data for a bucket
are scattered over different locations. This is not good,
but it allowed me to put it into 4K.
It is easy to increase these values, but not in flight. */
/*
 * SFQ_DEPTH is the number of flow slots (and the size of the per-slot
 * arrays); the value SFQ_DEPTH itself doubles as the "none/empty" marker in
 * ht[] and tail.  SFQ_HASH_DIVISOR is the number of hash buckets and must
 * be a power of two because sfq_fold_hash() masks with (divisor - 1).
 */
#define SFQ_DEPTH 128
#define SFQ_HASH_DIVISOR 1024
/* This type should contain at least SFQ_DEPTH*2 values */
typedef unsigned char sfq_index;
/*
 * Node of a doubly linked list whose links are indices into
 * sfq_sched_data.dep[] rather than pointers.
 */
struct sfq_head
{
sfq_index next;
sfq_index prev;
};
/*
 * Private state of an SFQ qdisc.
 *
 * ht[] maps a hash bucket to a slot index, with SFQ_DEPTH meaning "bucket
 * empty".  tail == SFQ_DEPTH means no slot is active.  In dep[], entries
 * 0..SFQ_DEPTH-1 are the slots themselves and entry SFQ_DEPTH + n is the
 * list head for all slots currently holding n packets (see sfq_link).
 */
struct sfq_sched_data
{
/* Parameters */
int perturb_period;
unsigned quantum; /* Allotment per round: MUST BE >= MTU */
int limit;
/* Variables */
struct tcf_proto *filter_list;
struct timer_list perturb_timer;
u32 perturbation;
sfq_index tail; /* Index of current slot in round */
sfq_index max_depth; /* Maximal depth */
sfq_index ht[SFQ_HASH_DIVISOR]; /* Hash table */
sfq_index next[SFQ_DEPTH]; /* Active slots link */
short allot[SFQ_DEPTH]; /* Current allotment per slot */
unsigned short hash[SFQ_DEPTH]; /* Hash value indexed by slots */
struct sk_buff_head qs[SFQ_DEPTH]; /* Slot queue */
struct sfq_head dep[SFQ_DEPTH*2]; /* Linked list of slots, indexed by depth */
};
/*
 * Mix two 32-bit keys with the current perturbation value and reduce the
 * result to a bucket index in [0, SFQ_HASH_DIVISOR).
 */
static __inline__ unsigned sfq_fold_hash(struct sfq_sched_data *q, u32 h, u32 h1)
{
	u32 mixed = jhash_2words(h, h1, q->perturbation);

	return mixed & (SFQ_HASH_DIVISOR - 1);
}
/*
 * Compute the hash bucket for @skb from its flow identity.
 *
 * IPv4: destination address, source address xor protocol, and - for
 *       unfragmented packets of the listed transport protocols - the first
 *       32-bit word after the IP header (presumably the port pair / SPI).
 * IPv6: the last 32 bits of each address, the next-header value, and the
 *       first word after the fixed header for the listed protocols.
 * Other protocols: dst entry pointer xor protocol, and the socket pointer.
 *
 * NOTE(review): no length validation of the headers is visible in this
 * function; confirm that callers guarantee the accessed bytes are present.
 */
static unsigned sfq_hash(struct sfq_sched_data *q, struct sk_buff *skb)
{
u32 h, h2;
switch (skb->protocol) {
case htons(ETH_P_IP):
{
const struct iphdr *iph = ip_hdr(skb);
h = iph->daddr;
h2 = iph->saddr ^ iph->protocol;
/* Only non-fragments carry a transport header worth hashing. */
if (!(iph->frag_off&htons(IP_MF|IP_OFFSET)) &&
(iph->protocol == IPPROTO_TCP ||
iph->protocol == IPPROTO_UDP ||
iph->protocol == IPPROTO_UDPLITE ||
iph->protocol == IPPROTO_SCTP ||
iph->protocol == IPPROTO_DCCP ||
iph->protocol == IPPROTO_ESP))
/* ihl counts 32-bit words, so this is the word right after the header. */
h2 ^= *(((u32*)iph) + iph->ihl);
break;
}
case htons(ETH_P_IPV6):
{
struct ipv6hdr *iph = ipv6_hdr(skb);
h = iph->daddr.s6_addr32[3];
h2 = iph->saddr.s6_addr32[3] ^ iph->nexthdr;
if (iph->nexthdr == IPPROTO_TCP ||
iph->nexthdr == IPPROTO_UDP ||
iph->nexthdr == IPPROTO_UDPLITE ||
iph->nexthdr == IPPROTO_SCTP ||
iph->nexthdr == IPPROTO_DCCP ||
iph->nexthdr == IPPROTO_ESP)
/* &iph[1] is the first byte after the fixed IPv6 header. */
h2 ^= *(u32*)&iph[1];
break;
}
default:
h = (unsigned long)skb_dst(skb) ^ skb->protocol;
h2 = (unsigned long)skb->sk;
}
return sfq_fold_hash(q, h, h2);
}
/*
 * Determine the hash bucket for @skb.
 *
 * Returns a 1-based bucket number (1..SFQ_HASH_DIVISOR), or 0 when the
 * packet must not be queued; in the latter case *qerr holds the NET_XMIT_*
 * code for the caller.  *qerr is only written on the filter path.
 */
static unsigned int sfq_classify(struct sk_buff *skb, struct Qdisc *sch,
int *qerr)
{
struct sfq_sched_data *q = qdisc_priv(sch);
struct tcf_result res;
int result;
/* skb->priority may select a bucket of this qdisc directly. */
if (TC_H_MAJ(skb->priority) == sch->handle &&
TC_H_MIN(skb->priority) > 0 &&
TC_H_MIN(skb->priority) <= SFQ_HASH_DIVISOR)
return TC_H_MIN(skb->priority);
/* No filters attached: fall back to the built-in flow hash. */
if (!q->filter_list)
return sfq_hash(q, skb) + 1;
*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
result = tc_classify(skb, q->filter_list, &res);
if (result >= 0) {
#ifdef CONFIG_NET_CLS_ACT
switch (result) {
case TC_ACT_STOLEN:
case TC_ACT_QUEUED:
*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
/* fall through: stolen/queued packets also return 0 */
case TC_ACT_SHOT:
return 0;
}
#endif
if (TC_H_MIN(res.classid) <= SFQ_HASH_DIVISOR)
return TC_H_MIN(res.classid);
}
return 0;
}
/*
 * Insert slot @x at the front of the depth list that matches its current
 * queue length.  dep[SFQ_DEPTH + len] is the list head for slots holding
 * len packets; @x is linked between that head and the head's old successor.
 */
static inline void sfq_link(struct sfq_sched_data *q, sfq_index x)
{
sfq_index p, n;
int d = q->qs[x].qlen + SFQ_DEPTH;
p = d;
n = q->dep[d].next;
q->dep[x].next = n;
q->dep[x].prev = p;
q->dep[p].next = q->dep[n].prev = x;
}
/*
 * Re-file slot @x after a packet has been removed from it (the callers
 * unlink the skb first, so qs[x].qlen is already the new length).
 * The slot is unlinked from its old depth list; if that list is now empty
 * and it was the deepest one, max_depth is lowered by one.
 */
static inline void sfq_dec(struct sfq_sched_data *q, sfq_index x)
{
sfq_index p, n;
n = q->dep[x].next;
p = q->dep[x].prev;
q->dep[p].next = n;
q->dep[n].prev = p;
/* n == p means only the list head is left, i.e. the old list is empty. */
if (n == p && q->max_depth == q->qs[x].qlen + 1)
q->max_depth--;
sfq_link(q, x);
}
/*
 * Re-file slot @x after a packet has been added to it (qs[x].qlen already
 * holds the new length): unlink it from its old depth list, raise max_depth
 * if this slot is now the deepest, and link it into the list for its new
 * length.
 */
static inline void sfq_inc(struct sfq_sched_data *q, sfq_index x)
{
sfq_index p, n;
int d;
n = q->dep[x].next;
p = q->dep[x].prev;
q->dep[p].next = n;
q->dep[n].prev = p;
d = q->qs[x].qlen;
if (q->max_depth < d)
q->max_depth = d;
sfq_link(q, x);
}
/*
 * Drop one packet from the longest flow.
 *
 * Returns the length of the dropped packet, or 0 when max_depth is 0.
 * Updates the qdisc queue length, drop counter and backlog.
 */
static unsigned int sfq_drop(struct Qdisc *sch)
{
struct sfq_sched_data *q = qdisc_priv(sch);
sfq_index d = q->max_depth;
struct sk_buff *skb;
unsigned int len;
/* Queue is full! Find the longest slot and
drop a packet from it */
if (d > 1) {
/* First slot on the list of maximum depth; drop its tail packet. */
sfq_index x = q->dep[d + SFQ_DEPTH].next;
skb = q->qs[x].prev;
len = qdisc_pkt_len(skb);
__skb_unlink(skb, &q->qs[x]);
kfree_skb(skb);
sfq_dec(q, x);
sch->q.qlen--;
sch->qstats.drops++;
sch->qstats.backlog -= len;
return len;
}
if (d == 1) {
/* It is difficult to believe, but ALL THE SLOTS HAVE LENGTH 1. */
/*
 * Take the slot that follows the tail in the round-robin ring, remove
 * it from the ring and give its successor a fresh quantum.
 */
d = q->next[q->tail];
q->next[q->tail] = q->next[d];
q->allot[q->next[d]] += q->quantum;
skb = q->qs[d].prev;
len = qdisc_pkt_len(skb);
__skb_unlink(skb, &q->qs[d]);
kfree_skb(skb);
sfq_dec(q, d);
sch->q.qlen--;
/* The slot is now empty: release its hash bucket. */
q->ht[q->hash[d]] = SFQ_DEPTH;
sch->qstats.drops++;
sch->qstats.backlog -= len;
return len;
}
return 0;
}
/*
 * Enqueue @skb on the slot that belongs to its hash bucket.
 *
 * A bucket without a slot gets the first slot from the depth-0 list.  A
 * flow that becomes active is inserted into the round-robin ring behind the
 * current tail and becomes the new tail.  If the total queue length then
 * exceeds the limit, one packet is dropped from the longest flow and
 * NET_XMIT_CN is returned.
 */
static int
sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
struct sfq_sched_data *q = qdisc_priv(sch);
unsigned int hash;
sfq_index x;
int uninitialized_var(ret);
hash = sfq_classify(skb, sch, &ret);
/* 0 means "do not queue"; ret was set by sfq_classify on that path. */
if (hash == 0) {
if (ret & __NET_XMIT_BYPASS)
sch->qstats.drops++;
kfree_skb(skb);
return ret;
}
hash--;
x = q->ht[hash];
if (x == SFQ_DEPTH) {
/* Bucket has no slot yet: take one from the list of empty slots. */
q->ht[hash] = x = q->dep[SFQ_DEPTH].next;
q->hash[x] = hash;
}
/* If selected queue has length q->limit, this means that
* all another queues are empty and that we do simple tail drop,
* i.e. drop _this_ packet.
*/
if (q->qs[x].qlen >= q->limit)
return qdisc_drop(skb, sch);
sch->qstats.backlog += qdisc_pkt_len(skb);
__skb_queue_tail(&q->qs[x], skb);
sfq_inc(q, x);
if (q->qs[x].qlen == 1) { /* The flow is new */
if (q->tail == SFQ_DEPTH) { /* It is the first flow */
q->tail = x;
q->next[x] = x;
q->allot[x] = q->quantum;
} else {
q->next[x] = q->next[q->tail];
q->next[q->tail] = x;
q->tail = x;
}
}
if (++sch->q.qlen <= q->limit) {
sch->bstats.bytes += qdisc_pkt_len(skb);
sch->bstats.packets++;
return 0;
}
/* Over the limit: make room by dropping from the longest flow. */
sfq_drop(sch);
return NET_XMIT_CN;
}
/*
 * Return, without dequeuing it, the head packet of the slot that follows
 * the round-robin tail, or NULL when no slot is active.
 */
static struct sk_buff *
sfq_peek(struct Qdisc *sch)
{
	struct sfq_sched_data *priv = qdisc_priv(sch);
	sfq_index slot;

	/* No active slots */
	if (priv->tail == SFQ_DEPTH)
		return NULL;

	slot = priv->next[priv->tail];
	return skb_peek(&priv->qs[slot]);
}
/*
 * Dequeue the next packet in round-robin order.
 *
 * The packet comes from the slot after the tail.  If that slot becomes
 * empty it leaves the ring (and its hash bucket is released); if its
 * allotment is used up it becomes the new tail.  In both cases the slot
 * that is next in line receives an additional quantum.
 * Returns NULL when no slot is active.
 */
static struct sk_buff *
sfq_dequeue(struct Qdisc *sch)
{
struct sfq_sched_data *q = qdisc_priv(sch);
struct sk_buff *skb;
sfq_index a, old_a;
/* No active slots */
if (q->tail == SFQ_DEPTH)
return NULL;
a = old_a = q->next[q->tail];
/* Grab packet */
skb = __skb_dequeue(&q->qs[a]);
sfq_dec(q, a);
sch->q.qlen--;
sch->qstats.backlog -= qdisc_pkt_len(skb);
/* Is the slot empty? */
if (q->qs[a].qlen == 0) {
q->ht[q->hash[a]] = SFQ_DEPTH;
a = q->next[a];
/* The slot pointed to itself: it was the last active one. */
if (a == old_a) {
q->tail = SFQ_DEPTH;
return skb;
}
q->next[q->tail] = a;
q->allot[a] += q->quantum;
} else if ((q->allot[a] -= qdisc_pkt_len(skb)) <= 0) {
/* Allotment exhausted: move on to the next slot in the ring. */
q->tail = a;
a = q->next[a];
q->allot[a] += q->quantum;
}
return skb;
}
/* Empty the qdisc by dequeuing and freeing every queued packet. */
static void
sfq_reset(struct Qdisc *sch)
{
	struct sk_buff *pkt;

	for (;;) {
		pkt = sfq_dequeue(sch);
		if (pkt == NULL)
			break;
		kfree_skb(pkt);
	}
}
/*
 * Timer callback: pick a new random hash perturbation and re-arm the timer
 * when a perturbation period is configured.  @arg is the qdisc pointer that
 * sfq_init() stored in the timer.
 */
static void sfq_perturbation(unsigned long arg)
{
	struct Qdisc *sch = (struct Qdisc *)arg;
	struct sfq_sched_data *priv = qdisc_priv(sch);

	priv->perturbation = net_random();

	if (priv->perturb_period)
		mod_timer(&priv->perturb_timer,
			  jiffies + priv->perturb_period);
}
/*
 * Apply a struct tc_sfq_qopt to the qdisc.
 *
 * A zero quantum selects the device MTU (psched_mtu), the perturbation
 * period is converted from seconds to jiffies, and a non-zero limit is
 * capped at SFQ_DEPTH - 1.  Packets beyond the new limit are dropped and
 * the perturbation timer is re-armed according to the new period.
 * Returns -EINVAL when the attribute is too short, otherwise 0.
 */
static int sfq_change(struct Qdisc *sch, struct nlattr *opt)
{
	struct sfq_sched_data *priv = qdisc_priv(sch);
	struct tc_sfq_qopt *cfg = nla_data(opt);
	unsigned int before;

	if (opt->nla_len < nla_attr_size(sizeof(*cfg)))
		return -EINVAL;

	sch_tree_lock(sch);

	if (cfg->quantum)
		priv->quantum = cfg->quantum;
	else
		priv->quantum = psched_mtu(qdisc_dev(sch));

	priv->perturb_period = cfg->perturb_period * HZ;

	if (cfg->limit)
		priv->limit = min_t(u32, cfg->limit, SFQ_DEPTH - 1);

	/* Shrink to the new limit and tell the ancestors how much went away. */
	before = sch->q.qlen;
	while (sch->q.qlen > priv->limit)
		sfq_drop(sch);
	qdisc_tree_decrease_qlen(sch, before - sch->q.qlen);

	del_timer(&priv->perturb_timer);
	if (priv->perturb_period) {
		mod_timer(&priv->perturb_timer,
			  jiffies + priv->perturb_period);
		priv->perturbation = net_random();
	}

	sch_tree_unlock(sch);
	return 0;
}
/*
 * Initialise an SFQ qdisc.
 *
 * Sets up the perturbation timer, marks every hash bucket empty, initialises
 * the slot queues and the depth list heads, applies defaults or the supplied
 * options, and finally links every slot into the depth list that matches
 * its queue length.  Returns 0 or the sfq_change() error.
 */
static int sfq_init(struct Qdisc *sch, struct nlattr *opt)
{
	struct sfq_sched_data *priv = qdisc_priv(sch);
	int bucket, slot;

	priv->perturb_timer.function = sfq_perturbation;
	priv->perturb_timer.data = (unsigned long)sch;
	init_timer_deferrable(&priv->perturb_timer);

	for (bucket = 0; bucket < SFQ_HASH_DIVISOR; bucket++)
		priv->ht[bucket] = SFQ_DEPTH;

	for (slot = 0; slot < SFQ_DEPTH; slot++) {
		int head = slot + SFQ_DEPTH;

		skb_queue_head_init(&priv->qs[slot]);
		/* An empty list head points at itself. */
		priv->dep[head].next = head;
		priv->dep[head].prev = head;
	}

	priv->limit = SFQ_DEPTH - 1;
	priv->max_depth = 0;
	priv->tail = SFQ_DEPTH;

	if (opt != NULL) {
		int err = sfq_change(sch, opt);

		if (err)
			return err;
	} else {
		priv->quantum = psched_mtu(qdisc_dev(sch));
		priv->perturb_period = 0;
		priv->perturbation = net_random();
	}

	for (slot = 0; slot < SFQ_DEPTH; slot++)
		sfq_link(priv, slot);

	return 0;
}
/*
 * Tear down an SFQ qdisc: release the filter chain and stop the
 * perturbation timer.  The period is cleared first so that a callback that
 * is still running does not re-arm the timer.
 */
static void sfq_destroy(struct Qdisc *sch)
{
	struct sfq_sched_data *priv = qdisc_priv(sch);

	tcf_destroy_chain(&priv->filter_list);
	priv->perturb_period = 0;
	del_timer_sync(&priv->perturb_timer);
}
static int sfq_dump(struct Qdisc *sch, struct sk_buff *skb)
{
struct sfq_sched_data *q = qdisc_priv(sch);
unsigned char *b = skb_tail_pointer(skb);
struct tc_sfq_qopt opt;
opt.quantum = q->quantum;
opt.perturb_period = q->perturb_period / HZ;
opt.limit = q->limit;
opt.divisor = SFQ_HASH_DIVISOR;
opt.flows = q->limit;
NLA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);
return skb->len;
nla_put_failure:
nlmsg_trim(skb, b);
return -1;
}
/* Class lookup: always returns 0, so no class id resolves to a handle. */
static unsigned long sfq_get(struct Qdisc *sch, u32 classid)
{
	return 0UL;
}
/*
 * Return the filter chain anchor.  Filters attach to the qdisc itself
 * (cl == 0) only; NULL is returned for any class handle.
 */
static struct tcf_proto **sfq_find_tcf(struct Qdisc *sch, unsigned long cl)
{
	struct sfq_sched_data *priv = qdisc_priv(sch);

	return cl ? NULL : &priv->filter_list;
}
static int sfq_dump_class(struct Qdisc *sch, unsigned long cl,
struct sk_buff *skb, struct tcmsg *tcm)
{
tcm->tcm_handle |= TC_H_MIN(cl);
return 0;
}
static int sfq_dump_class_stats(struct Qdisc *sch, unsigned long cl,
struct gnet_dump *d)
{
struct sfq_sched_data *q = qdisc_priv(sch);
sfq_index idx = q->ht[cl-1];
struct gnet_stats_queue qs = { .qlen = q->qs[idx].qlen };
struct tc_sfq_xstats xstats = { .allot = q->allot[idx] };
if (gnet_stats_copy_queue(d, &qs) < 0)
return -1;
return gnet_stats_copy_app(d, &xstats, sizeof(xstats));
}
/*
 * Walk all hash buckets and invoke the callback (with the 1-based bucket
 * number) for each bucket that owns a slot, once the walker's skip count
 * has been reached.  Empty buckets are counted as well.  A negative
 * callback result sets arg->stop and ends the walk.
 */
static void sfq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
{
	struct sfq_sched_data *priv = qdisc_priv(sch);
	unsigned int bucket;

	if (arg->stop)
		return;

	for (bucket = 0; bucket < SFQ_HASH_DIVISOR; bucket++) {
		int visit = priv->ht[bucket] != SFQ_DEPTH &&
			    arg->count >= arg->skip;

		if (visit && arg->fn(sch, bucket + 1, arg) < 0) {
			arg->stop = 1;
			break;
		}
		arg->count++;
	}
}
/* Class-level callbacks implemented by SFQ. */
static const struct Qdisc_class_ops sfq_class_ops = {
	.walk		= sfq_walk,
	.get		= sfq_get,
	.tcf_chain	= sfq_find_tcf,
	.dump_stats	= sfq_dump_class_stats,
	.dump		= sfq_dump_class,
};
/* Qdisc operations table registered under the id "sfq". */
static struct Qdisc_ops sfq_qdisc_ops __read_mostly = {
	.id		= "sfq",
	.owner		= THIS_MODULE,
	.priv_size	= sizeof(struct sfq_sched_data),
	.cl_ops		= &sfq_class_ops,

	/* data path */
	.enqueue	= sfq_enqueue,
	.dequeue	= sfq_dequeue,
	.peek		= sfq_peek,
	.drop		= sfq_drop,

	/* control path; no change handler is provided */
	.init		= sfq_init,
	.reset		= sfq_reset,
	.destroy	= sfq_destroy,
	.change		= NULL,
	.dump		= sfq_dump,
};
/* Module entry point: registers the SFQ ops and returns register_qdisc()'s result. */
static int __init sfq_module_init(void)
{
	int err = register_qdisc(&sfq_qdisc_ops);

	return err;
}
/* Module exit point: unregisters the SFQ ops. */
static void __exit sfq_module_exit(void)
{
	unregister_qdisc(&sfq_qdisc_ops);
}
/* Bind the init/exit handlers to module load/unload and declare the licence. */
module_init(sfq_module_init)
module_exit(sfq_module_exit)
MODULE_LICENSE("GPL");

460
kernel/net/sched/sch_tbf.c Normal file
View File

@@ -0,0 +1,460 @@
/*
* net/sched/sch_tbf.c Token Bucket Filter queue.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
* Dmitry Torokhov <dtor@mail.ru> - allow attaching inner qdiscs -
* original idea by Martin Devera
*
*/
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
/* Simple Token Bucket Filter.
=======================================
SOURCE.
-------
None.
Description.
------------
A data flow obeys TBF with rate R and depth B, if for any
time interval t_i...t_f the number of transmitted bits
does not exceed B + R*(t_f-t_i).
Packetized version of this definition:
The sequence of packets of sizes s_i served at moments t_i
obeys TBF, if for any i<=k:
s_i+....+s_k <= B + R*(t_k - t_i)
Algorithm.
----------
Let N(t_i) be B/R initially and N(t) grow continuously with time as:
N(t+delta) = min{B/R, N(t) + delta}
If the first packet in queue has length S, it may be
transmitted only at the time t_* when S/R <= N(t_*),
and in this case N(t) jumps:
N(t_* + 0) = N(t_* - 0) - S/R.
Actually, QoS requires two TBF to be applied to a data stream.
One of them controls steady state burst size, another
one with rate P (peak rate) and depth M (equal to link MTU)
limits bursts at a smaller time scale.
It is easy to see that P>R, and B>M. If P is infinity, this double
TBF is equivalent to a single one.
When TBF works in reshaping mode, latency is estimated as:
lat = max ((L-B)/R, (L-M)/P)
NOTES.
------
If TBF throttles, it starts a watchdog timer, which will wake it up
when it is ready to transmit.
Note that the minimal timer resolution is 1/HZ.
If no new packets arrive during this period,
or if the device is not awakened by EOI for some previous packet,
TBF can stop its activity for 1/HZ.
This means, that with depth B, the maximal rate is
R_crit = B*HZ
F.e. for 10Mbit ethernet and HZ=100 the minimal allowed B is ~10Kbytes.
Note that the peak rate TBF is much more tough: with MTU 1500
P_crit = 150Kbytes/sec. So, if you need greater peak
rates, use alpha with HZ=1000 :-)
With classful TBF, limit is just kept for backwards compatibility.
It is passed to the default bfifo qdisc - if the inner qdisc is
changed the limit is not effective anymore.
*/
/* Private state of one TBF qdisc instance (obtained via qdisc_priv()). */
struct tbf_sched_data
{
/* Parameters (written by tbf_change()) */
u32 limit; /* Maximal length of backlog: bytes */
u32 buffer; /* Token bucket depth/rate: MUST BE >= MTU/B */
u32 mtu; /* Upper bound applied to ptokens in tbf_dequeue() */
u32 max_size; /* Packets longer than this are rejected in tbf_enqueue() */
struct qdisc_rate_table *R_tab; /* Rate table, used through L2T() */
struct qdisc_rate_table *P_tab; /* Peak-rate table, used through L2T_P(); NULL if no peak rate */
/* Variables */
long tokens; /* Current number of B tokens */
long ptokens; /* Current number of P tokens */
psched_time_t t_c; /* Time check-point */
struct Qdisc *qdisc; /* Inner qdisc, default - bfifo queue */
struct qdisc_watchdog watchdog; /* Watchdog timer */
};
/*
 * Look up length L in the rate table (L2T) or the peak-rate table (L2T_P)
 * of TBF state q via qdisc_l2t(); tbf_dequeue() subtracts the result from
 * its token counts.
 */
#define L2T(q,L) qdisc_l2t((q)->R_tab,L)
#define L2T_P(q,L) qdisc_l2t((q)->P_tab,L)
/*
 * tbf_enqueue - hand a packet to the inner qdisc.
 *
 * Packets longer than max_size go to qdisc_reshape_fail().  Otherwise the
 * packet is enqueued on the inner qdisc; on success the local queue length
 * and byte/packet counters are bumped and 0 is returned.  On failure the
 * child's code is returned, and the drop counter is incremented when
 * net_xmit_drop_count() says the code counts as a drop.
 */
static int tbf_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
	struct tbf_sched_data *priv = qdisc_priv(sch);
	int status;

	if (qdisc_pkt_len(skb) > priv->max_size)
		return qdisc_reshape_fail(skb, sch);

	status = qdisc_enqueue(skb, priv->qdisc);
	if (status == 0) {
		sch->q.qlen++;
		sch->bstats.bytes += qdisc_pkt_len(skb);
		sch->bstats.packets++;
		return 0;
	}

	if (net_xmit_drop_count(status))
		sch->qstats.drops++;
	return status;
}
/*
 * tbf_drop - ask the inner qdisc to drop one packet.
 *
 * Returns whatever the child's drop operation returned, or 0 when the
 * child has no drop operation.  A non-zero result decrements the local
 * queue length and increments the drop counter.
 */
static unsigned int tbf_drop(struct Qdisc *sch)
{
	struct tbf_sched_data *priv = qdisc_priv(sch);
	unsigned int freed = 0;

	if (priv->qdisc->ops->drop)
		freed = priv->qdisc->ops->drop(priv->qdisc);

	if (freed != 0) {
		sch->q.qlen--;
		sch->qstats.drops++;
	}
	return freed;
}
/*
 * tbf_dequeue - release the head packet if both buckets can pay for it.
 *
 * Peeks at the inner qdisc.  The time elapsed since the last check-point
 * (bounded by q->buffer) is credited to the rate bucket, and to the peak
 * bucket when a peak table exists; the buckets are capped at buffer and
 * mtu respectively and then charged for the packet.  If neither balance
 * is negative the packet is dequeued and the new balances are stored.
 * Otherwise the watchdog is scheduled, the overlimit counter is bumped
 * and NULL is returned.
 */
static struct sk_buff *tbf_dequeue(struct Qdisc *sch)
{
	struct tbf_sched_data *priv = qdisc_priv(sch);
	struct sk_buff *skb;
	psched_time_t now;
	unsigned int len;
	long toks;
	long ptoks = 0;

	skb = priv->qdisc->ops->peek(priv->qdisc);
	if (!skb)
		return NULL;

	len = qdisc_pkt_len(skb);
	now = psched_get_time();
	toks = psched_tdiff_bounded(now, priv->t_c, priv->buffer);

	if (priv->P_tab) {
		ptoks = toks + priv->ptokens;
		if (ptoks > (long)priv->mtu)
			ptoks = priv->mtu;
		ptoks -= L2T_P(priv, len);
	}

	toks += priv->tokens;
	if (toks > (long)priv->buffer)
		toks = priv->buffer;
	toks -= L2T(priv, len);

	/* The OR of the two balances is negative iff either one is. */
	if ((toks|ptoks) < 0) {
		qdisc_watchdog_schedule(&priv->watchdog,
					now + max_t(long, -toks, -ptoks));

		/*
		 * A shorter packet further back in the queue might fit right
		 * now, but sending it would reorder the flow, which we MUST
		 * NOT do here.  Splitting the flow into independent subflows
		 * is the proper answer; that is the main idea of all FQ
		 * algorithms (cf. CSZ, HPFQ, HFSC).
		 */
		sch->qstats.overlimits++;
		return NULL;
	}

	skb = qdisc_dequeue_peeked(priv->qdisc);
	if (unlikely(!skb))
		return NULL;

	priv->t_c = now;
	priv->tokens = toks;
	priv->ptokens = ptoks;
	sch->q.qlen--;
	sch->flags &= ~TCQ_F_THROTTLED;
	return skb;
}
/*
 * tbf_reset - return the qdisc to its initial, empty state.
 *
 * Resets the inner qdisc, zeroes the queue length, restarts the time
 * check-point, refills both token counts to their maxima (buffer and mtu)
 * and cancels the watchdog.
 */
static void tbf_reset(struct Qdisc *sch)
{
	struct tbf_sched_data *priv = qdisc_priv(sch);

	qdisc_reset(priv->qdisc);
	sch->q.qlen = 0;

	priv->t_c = psched_get_time();
	priv->tokens = priv->buffer;
	priv->ptokens = priv->mtu;

	qdisc_watchdog_cancel(&priv->watchdog);
}
/* Netlink attribute policy applied by tbf_change() to the nested options. */
static const struct nla_policy tbf_policy[TCA_TBF_MAX + 1] = {
	[TCA_TBF_PARMS] = {
		.len	= sizeof(struct tc_tbf_qopt),
	},
	[TCA_TBF_RTAB] = {
		.type	= NLA_BINARY,
		.len	= TC_RTAB_SIZE,
	},
	[TCA_TBF_PTAB] = {
		.type	= NLA_BINARY,
		.len	= TC_RTAB_SIZE,
	},
};
/*
 * tbf_change - parse the netlink options and (re)configure the qdisc.
 *
 * Returns 0 on success.  Returns the nla_parse_nested() error if parsing
 * fails, the fifo_create_dflt() error if the child cannot be created, and
 * -EINVAL for a missing TCA_TBF_PARMS, an unavailable rate table, a
 * non-zero peak rate that is not above the rate (or whose table is
 * unavailable), or a computed max_size below zero.
 *
 * Nothing in *q is modified until every check has passed.
 */
static int tbf_change(struct Qdisc* sch, struct nlattr *opt)
{
int err;
struct tbf_sched_data *q = qdisc_priv(sch);
struct nlattr *tb[TCA_TBF_PTAB + 1];
struct tc_tbf_qopt *qopt;
struct qdisc_rate_table *rtab = NULL;
struct qdisc_rate_table *ptab = NULL;
struct Qdisc *child = NULL;
int max_size,n;
err = nla_parse_nested(tb, TCA_TBF_PTAB, opt, tbf_policy);
if (err < 0)
return err;
/* Every failure from here to the child creation reports -EINVAL. */
err = -EINVAL;
if (tb[TCA_TBF_PARMS] == NULL)
goto done;
qopt = nla_data(tb[TCA_TBF_PARMS]);
rtab = qdisc_get_rtab(&qopt->rate, tb[TCA_TBF_RTAB]);
if (rtab == NULL)
goto done;
if (qopt->peakrate.rate) {
/* ptab stays NULL when the peak rate is not above the rate, so that
 * case fails just like a table lookup failure. */
if (qopt->peakrate.rate > qopt->rate.rate)
ptab = qdisc_get_rtab(&qopt->peakrate, tb[TCA_TBF_PTAB]);
if (ptab == NULL)
goto done;
}
/* n = index of the first rate-table entry exceeding the buffer (256 if
 * none); max_size is derived from it by the cell_log shift. */
for (n = 0; n < 256; n++)
if (rtab->data[n] > qopt->buffer) break;
max_size = (n << qopt->rate.cell_log)-1;
if (ptab) {
int size;
/* Same computation against the peak table and mtu; keep the smaller. */
for (n = 0; n < 256; n++)
if (ptab->data[n] > qopt->mtu) break;
size = (n << qopt->peakrate.cell_log)-1;
if (size < max_size) max_size = size;
}
if (max_size < 0)
goto done;
/* A new bfifo child is created only for a non-zero limit; with limit 0
 * the current inner qdisc is left in place. */
if (qopt->limit > 0) {
child = fifo_create_dflt(sch, &bfifo_qdisc_ops, qopt->limit);
if (IS_ERR(child)) {
err = PTR_ERR(child);
goto done;
}
}
sch_tree_lock(sch);
if (child) {
/* Account for the packets held by the old child, then replace it. */
qdisc_tree_decrease_qlen(q->qdisc, q->qdisc->q.qlen);
qdisc_destroy(q->qdisc);
q->qdisc = child;
}
q->limit = qopt->limit;
q->mtu = qopt->mtu;
q->max_size = max_size;
q->buffer = qopt->buffer;
/* Both token counts start at their maxima. */
q->tokens = q->buffer;
q->ptokens = q->mtu;
/* After the swaps rtab/ptab hold the previous tables (possibly NULL),
 * which the common exit path below releases. */
swap(q->R_tab, rtab);
swap(q->P_tab, ptab);
sch_tree_unlock(sch);
err = 0;
done:
/* Releases the new tables on failure, the replaced ones on success. */
if (rtab)
qdisc_put_rtab(rtab);
if (ptab)
qdisc_put_rtab(ptab);
return err;
}
/*
 * tbf_init - set up a new TBF instance.
 *
 * Options are mandatory: a NULL @opt yields -EINVAL.  Otherwise the time
 * check-point and watchdog are initialised, the inner qdisc starts out as
 * noop_qdisc, and the remaining work (and the return value) comes from
 * tbf_change().
 */
static int tbf_init(struct Qdisc *sch, struct nlattr *opt)
{
	struct tbf_sched_data *priv;

	if (!opt)
		return -EINVAL;

	priv = qdisc_priv(sch);
	priv->t_c = psched_get_time();
	qdisc_watchdog_init(&priv->watchdog, sch);
	priv->qdisc = &noop_qdisc;

	return tbf_change(sch, opt);
}
static void tbf_destroy(struct Qdisc *sch)
{
struct tbf_sched_data *q = qdisc_priv(sch);
qdisc_watchdog_cancel(&q->watchdog);
if (q->P_tab)
qdisc_put_rtab(q->P_tab);
if (q->R_tab)
qdisc_put_rtab(q->R_tab);
qdisc_destroy(q->qdisc);
}
static int tbf_dump(struct Qdisc *sch, struct sk_buff *skb)
{
struct tbf_sched_data *q = qdisc_priv(sch);
struct nlattr *nest;
struct tc_tbf_qopt opt;
nest = nla_nest_start(skb, TCA_OPTIONS);
if (nest == NULL)
goto nla_put_failure;
opt.limit = q->limit;
opt.rate = q->R_tab->rate;
if (q->P_tab)
opt.peakrate = q->P_tab->rate;
else
memset(&opt.peakrate, 0, sizeof(opt.peakrate));
opt.mtu = q->mtu;
opt.buffer = q->buffer;
NLA_PUT(skb, TCA_TBF_PARMS, sizeof(opt), &opt);
nla_nest_end(skb, nest);
return skb->len;
nla_put_failure:
nla_nest_cancel(skb, nest);
return -1;
}
/*
 * tbf_dump_class - describe the single TBF class.
 *
 * Reports the inner qdisc's handle in tcm_info and ORs minor number 1
 * into the class handle.  Always returns 0.
 */
static int tbf_dump_class(struct Qdisc *sch, unsigned long cl,
			  struct sk_buff *skb, struct tcmsg *tcm)
{
	struct tbf_sched_data *priv = qdisc_priv(sch);

	tcm->tcm_info = priv->qdisc->handle;
	tcm->tcm_handle |= TC_H_MIN(1);
	return 0;
}
/*
 * tbf_graft - replace the inner qdisc.
 *
 * A NULL @new is replaced by noop_qdisc.  Under the tree lock the old
 * child is handed back through @old, the new one is installed, the old
 * child's queue length is subtracted up the tree, and the old child is
 * reset.  Always returns 0.
 */
static int tbf_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
		     struct Qdisc **old)
{
	struct tbf_sched_data *priv = qdisc_priv(sch);
	struct Qdisc *incoming = new ? new : &noop_qdisc;
	struct Qdisc *outgoing;

	sch_tree_lock(sch);
	outgoing = priv->qdisc;
	*old = outgoing;
	priv->qdisc = incoming;
	qdisc_tree_decrease_qlen(outgoing, outgoing->q.qlen);
	qdisc_reset(outgoing);
	sch_tree_unlock(sch);

	return 0;
}
/* Return the inner qdisc; @arg is not consulted. */
static struct Qdisc *tbf_leaf(struct Qdisc *sch, unsigned long arg)
{
	struct tbf_sched_data *priv = qdisc_priv(sch);
	struct Qdisc *inner = priv->qdisc;

	return inner;
}
/* Class lookup hook: every @classid resolves to class 1. */
static unsigned long tbf_get(struct Qdisc *sch, u32 classid)
{
	return 1;
}
/* Counterpart of tbf_get(); intentionally does nothing. */
static void tbf_put(struct Qdisc *sch, unsigned long arg)
{
}
/*
 * tbf_walk - visit the single class (number 1).
 *
 * Does nothing once the walker is stopped.  The callback runs only when
 * the skip window has been passed; a negative result stops the walker
 * without counting the class, otherwise the count is incremented.
 */
static void tbf_walk(struct Qdisc *sch, struct qdisc_walker *walker)
{
	if (walker->stop)
		return;

	if (walker->count >= walker->skip &&
	    walker->fn(sch, 1, walker) < 0) {
		walker->stop = 1;
		return;
	}

	walker->count++;
}
/* Class-level callbacks implemented by TBF. */
static const struct Qdisc_class_ops tbf_class_ops = {
	.get	= tbf_get,
	.put	= tbf_put,
	.leaf	= tbf_leaf,
	.graft	= tbf_graft,
	.dump	= tbf_dump_class,
	.walk	= tbf_walk,
};
/* Qdisc operations table registered under the id "tbf". */
static struct Qdisc_ops tbf_qdisc_ops __read_mostly = {
	.id		= "tbf",
	.owner		= THIS_MODULE,
	.next		= NULL,
	.priv_size	= sizeof(struct tbf_sched_data),
	.cl_ops		= &tbf_class_ops,

	/* data path */
	.enqueue	= tbf_enqueue,
	.dequeue	= tbf_dequeue,
	.peek		= qdisc_peek_dequeued,
	.drop		= tbf_drop,

	/* control path */
	.init		= tbf_init,
	.change		= tbf_change,
	.reset		= tbf_reset,
	.destroy	= tbf_destroy,
	.dump		= tbf_dump,
};
/* Module entry point: registers the TBF ops and returns register_qdisc()'s result. */
static int __init tbf_module_init(void)
{
	int err = register_qdisc(&tbf_qdisc_ops);

	return err;
}
/* Module exit point: unregisters the TBF ops. */
static void __exit tbf_module_exit(void)
{
	unregister_qdisc(&tbf_qdisc_ops);
}
/* Bind the init/exit handlers to module load/unload and declare the licence. */
module_init(tbf_module_init)
module_exit(tbf_module_exit)
MODULE_LICENSE("GPL");

509
kernel/net/sched/sch_teql.c Normal file
View File

@@ -0,0 +1,509 @@
/* net/sched/sch_teql.c "True" (or "trivial") link equalizer.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
*/
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/if_arp.h>
#include <linux/netdevice.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/moduleparam.h>
#include <net/dst.h>
#include <net/neighbour.h>
#include <net/pkt_sched.h>
/*
How to set it up.
----------------
After loading this module you will find a new device teqlN
and new qdisc with the same name. To join a slave to the equalizer
you should just set this qdisc on a device f.e.
# tc qdisc add dev eth0 root teql0
# tc qdisc add dev eth1 root teql0
That's all. Full PnP 8)
Applicability.
--------------
1. Slave devices MUST be active devices, i.e., they must raise the tbusy
signal and generate EOI events. If you want to equalize virtual devices
like tunnels, use a normal eql device.
2. This device puts no limitations on physical slave characteristics
f.e. it will equalize 9600baud line and 100Mb ethernet perfectly :-)
Certainly, large difference in link speeds will make the resulting
equalized link unusable, because of huge packet reordering.
I estimate an upper useful difference as ~10 times.
3. If the slave requires address resolution, only protocols using
neighbour cache (IPv4/IPv6) will work over the equalized link.
Other protocols are still allowed to use the slave device directly,
which will not break load balancing, though native slave
traffic will have the highest priority. */
/* Private data of one teqlN master device (netdev_priv() of that device). */
struct teql_master
{
/* Per-master qdisc ops.  teql_qdisc_init() casts sch->ops back to
 * struct teql_master *, so this must remain the first member. */
struct Qdisc_ops qops;
struct net_device *dev; /* The master net device itself */
/* Entry point into the circular list of slave qdiscs (linked through
 * NEXT_SLAVE); NULL when no slave is attached. */
struct Qdisc *slaves;
struct list_head master_list; /* Link in master_dev_list */
};
/* Private state of one slave qdisc (obtained via qdisc_priv()). */
struct teql_sched_data
{
struct Qdisc *next; /* Next slave in the master's circular list */
struct teql_master *m; /* Master this slave belongs to */
struct neighbour *ncache; /* Neighbour kept from the last successful resolve */
struct sk_buff_head q; /* Packets queued on this slave */
};
/* Successor of slave qdisc q in the circular slave list (usable as an lvalue). */
#define NEXT_SLAVE(q) (((struct teql_sched_data*)qdisc_priv(q))->next)
/* Device flags the master copies from its slaves. */
#define FMASK (IFF_BROADCAST|IFF_POINTOPOINT)
/* "teql*" qdisc routines */
/*
 * teql_enqueue - queue a packet on a slave.
 *
 * While the private queue is shorter than the slave device's
 * tx_queue_len the packet is appended and accounted, and 0 is returned.
 * Otherwise it is freed, counted as a drop, and NET_XMIT_DROP is returned.
 */
static int
teql_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
	struct net_device *slave = qdisc_dev(sch);
	struct teql_sched_data *priv = qdisc_priv(sch);

	if (priv->q.qlen >= slave->tx_queue_len) {
		kfree_skb(skb);
		sch->qstats.drops++;
		return NET_XMIT_DROP;
	}

	__skb_queue_tail(&priv->q, skb);
	sch->bstats.bytes += qdisc_pkt_len(skb);
	sch->bstats.packets++;
	return 0;
}
/*
 * teql_dequeue - take the next packet from a slave's private queue.
 *
 * When the queue is empty and the qdisc on the master's tx queue 0 has a
 * device, this slave becomes the master's current slave and that device's
 * queue is woken.  The reported queue length is always refreshed to the
 * private queue length plus that of the master's qdisc.
 */
static struct sk_buff *
teql_dequeue(struct Qdisc *sch)
{
	struct teql_sched_data *priv = qdisc_priv(sch);
	struct netdev_queue *master_txq;
	struct sk_buff *skb;

	skb = __skb_dequeue(&priv->q);
	master_txq = netdev_get_tx_queue(priv->m->dev, 0);

	if (!skb) {
		struct net_device *master_dev = qdisc_dev(master_txq->qdisc);

		if (master_dev) {
			priv->m->slaves = sch;
			netif_wake_queue(master_dev);
		}
	}

	sch->q.qlen = priv->q.qlen + master_txq->qdisc->q.qlen;
	return skb;
}
/*
 * teql_peek - peeking is not supported; always returns NULL.
 *
 * teql is meant to be used as a root qdisc.
 */
static struct sk_buff *
teql_peek(struct Qdisc *sch)
{
	return NULL;
}
static __inline__ void
teql_neigh_release(struct neighbour *n)
{
if (n)
neigh_release(n);
}
/*
 * teql_reset - empty a slave qdisc.
 *
 * Purges the private queue, zeroes the reported queue length, and
 * atomically detaches and releases the cached neighbour.
 */
static void
teql_reset(struct Qdisc *sch)
{
	struct teql_sched_data *priv = qdisc_priv(sch);
	struct neighbour *stale;

	skb_queue_purge(&priv->q);
	sch->q.qlen = 0;

	stale = xchg(&priv->ncache, NULL);
	teql_neigh_release(stale);
}
/*
 * teql_destroy - unlink a slave qdisc from its master's circular list.
 *
 * Walks the list looking for the element whose successor is @sch and
 * unlinks @sch there.  If @sch was the master's current slave, the master
 * is advanced to the successor; if that successor is @sch itself (it was
 * the only slave), the list becomes empty and the qdisc on the master's
 * tx queue 0 is reset under its root lock.  Finally the slave's queue is
 * purged and its cached neighbour released.  Nothing happens if @sch is
 * not found or the list is empty.
 */
static void
teql_destroy(struct Qdisc* sch)
{
struct Qdisc *q, *prev;
struct teql_sched_data *dat = qdisc_priv(sch);
struct teql_master *master = dat->m;
if ((prev = master->slaves) != NULL) {
do {
q = NEXT_SLAVE(prev);
if (q == sch) {
/* prev is the predecessor of sch: bypass sch. */
NEXT_SLAVE(prev) = NEXT_SLAVE(q);
if (q == master->slaves) {
master->slaves = NEXT_SLAVE(q);
/* Still pointing at sch => sch linked to itself, i.e. it was
 * the last slave. */
if (q == master->slaves) {
struct netdev_queue *txq;
spinlock_t *root_lock;
txq = netdev_get_tx_queue(master->dev, 0);
master->slaves = NULL;
root_lock = qdisc_root_sleeping_lock(txq->qdisc);
spin_lock_bh(root_lock);
qdisc_reset(txq->qdisc);
spin_unlock_bh(root_lock);
}
}
skb_queue_purge(&dat->q);
teql_neigh_release(xchg(&dat->ncache, NULL));
break;
}
/* Stop after one full lap around the list. */
} while ((prev = q) != master->slaves);
}
}
static int teql_qdisc_init(struct Qdisc *sch, struct nlattr *opt)
{
struct net_device *dev = qdisc_dev(sch);
struct teql_master *m = (struct teql_master*)sch->ops;
struct teql_sched_data *q = qdisc_priv(sch);
if (dev->hard_header_len > m->dev->hard_header_len)
return -EINVAL;
if (m->dev == dev)
return -ELOOP;
q->m = m;
skb_queue_head_init(&q->q);
if (m->slaves) {
if (m->dev->flags & IFF_UP) {
if ((m->dev->flags&IFF_POINTOPOINT && !(dev->flags&IFF_POINTOPOINT))
|| (m->dev->flags&IFF_BROADCAST && !(dev->flags&IFF_BROADCAST))
|| (m->dev->flags&IFF_MULTICAST && !(dev->flags&IFF_MULTICAST))
|| dev->mtu < m->dev->mtu)
return -EINVAL;
} else {
if (!(dev->flags&IFF_POINTOPOINT))
m->dev->flags &= ~IFF_POINTOPOINT;
if (!(dev->flags&IFF_BROADCAST))
m->dev->flags &= ~IFF_BROADCAST;
if (!(dev->flags&IFF_MULTICAST))
m->dev->flags &= ~IFF_MULTICAST;
if (dev->mtu < m->dev->mtu)
m->dev->mtu = dev->mtu;
}
q->next = NEXT_SLAVE(m->slaves);
NEXT_SLAVE(m->slaves) = sch;
} else {
q->next = sch;
m->slaves = sch;
m->dev->mtu = dev->mtu;
m->dev->flags = (m->dev->flags&~FMASK)|(dev->flags&FMASK);
}
return 0;
}
/*
 * __teql_resolve - build the link-layer header for @skb on slave @dev.
 *
 * Uses the neighbour cached in the slave's qdisc when it belongs to the
 * same table and has the same key as the skb's destination neighbour;
 * otherwise looks one up for @dev.
 *
 * Returns 0 when the header was built (the neighbour then replaces the
 * cached one), -EINVAL when the destination neighbour has no table or
 * dev_hard_header() fails, the lookup error when the lookup fails, and,
 * when neigh_event_send() returns non-zero, -EAGAIN if @skb_res is NULL
 * or 1 otherwise.
 */
static int
__teql_resolve(struct sk_buff *skb, struct sk_buff *skb_res, struct net_device *dev)
{
struct netdev_queue *dev_queue = netdev_get_tx_queue(dev, 0);
struct teql_sched_data *q = qdisc_priv(dev_queue->qdisc);
struct neighbour *mn = skb_dst(skb)->neighbour;
struct neighbour *n = q->ncache;
if (mn->tbl == NULL)
return -EINVAL;
if (n && n->tbl == mn->tbl &&
memcmp(n->primary_key, mn->primary_key, mn->tbl->key_len) == 0) {
/* Cache hit: take a reference of our own for use below. */
atomic_inc(&n->refcnt);
} else {
n = __neigh_lookup_errno(mn->tbl, mn->primary_key, dev);
if (IS_ERR(n))
return PTR_ERR(n);
}
/* From here on we hold one reference on n; every path below either
 * releases it or hands it to q->ncache. */
if (neigh_event_send(n, skb_res) == 0) {
int err;
/* n->ha is read under the neighbour's lock. */
read_lock(&n->lock);
err = dev_hard_header(skb, dev, ntohs(skb->protocol),
n->ha, NULL, skb->len);
read_unlock(&n->lock);
if (err < 0) {
neigh_release(n);
return -EINVAL;
}
/* Install n as the cached neighbour and release the previous one. */
teql_neigh_release(xchg(&q->ncache, n));
return 0;
}
neigh_release(n);
return (skb_res == NULL) ? -EAGAIN : 1;
}
/*
 * teql_resolve - decide whether @skb needs address resolution on @dev.
 *
 * Returns -ENODEV when the slave's tx queue 0 carries noop_qdisc, and 0
 * (nothing to resolve) when the device has no header_ops or the skb has
 * no destination entry or no neighbour.  Everything else is delegated to
 * __teql_resolve().
 */
static inline int teql_resolve(struct sk_buff *skb,
			       struct sk_buff *skb_res, struct net_device *dev)
{
	struct netdev_queue *slave_txq = netdev_get_tx_queue(dev, 0);

	if (slave_txq->qdisc == &noop_qdisc)
		return -ENODEV;

	if (!dev->header_ops || !skb_dst(skb) || !skb_dst(skb)->neighbour)
		return 0;

	return __teql_resolve(skb, skb_res, dev);
}
/*
 * teql_master_xmit - transmit a packet from the master through one slave.
 *
 * Walks the circular slave list starting at the master's current slave
 * and tries each usable slave in turn.  On a successful transmit the
 * master's current slave advances to the one after the slave used.
 *
 * The list is walked at most twice: if some slave could not resolve the
 * destination on the first lap, a second lap passes the skb itself as
 * skb_res to teql_resolve().
 *
 * Returns NETDEV_TX_OK when the packet was sent, handed over (resolve
 * result 1) or dropped, and NETDEV_TX_BUSY after stopping the master
 * queue when some slave was busy.
 */
static netdev_tx_t teql_master_xmit(struct sk_buff *skb, struct net_device *dev)
{
struct teql_master *master = netdev_priv(dev);
struct netdev_queue *txq = netdev_get_tx_queue(dev, 0);
struct Qdisc *start, *q;
int busy;
int nores;
int subq = skb_get_queue_mapping(skb);
struct sk_buff *skb_res = NULL;
start = master->slaves;
restart:
nores = 0;
busy = 0;
/* No slaves attached: drop. */
if ((q = start) == NULL)
goto drop;
do {
struct net_device *slave = qdisc_dev(q);
struct netdev_queue *slave_txq = netdev_get_tx_queue(slave, 0);
const struct net_device_ops *slave_ops = slave->netdev_ops;
/* Note: each "continue" below jumps to the loop condition and so
 * skips the __skb_pull() at the bottom of the body. */
if (slave_txq->qdisc_sleeping != q)
continue;
if (__netif_subqueue_stopped(slave, subq) ||
!netif_running(slave)) {
busy = 1;
continue;
}
switch (teql_resolve(skb, skb_res, slave)) {
case 0:
/* Header is ready: transmit if the slave's tx lock can be taken
 * and its queue is neither stopped nor frozen. */
if (__netif_tx_trylock(slave_txq)) {
unsigned int length = qdisc_pkt_len(skb);
if (!netif_tx_queue_stopped(slave_txq) &&
!netif_tx_queue_frozen(slave_txq) &&
slave_ops->ndo_start_xmit(skb, slave) == NETDEV_TX_OK) {
txq_trans_update(slave_txq);
__netif_tx_unlock(slave_txq);
master->slaves = NEXT_SLAVE(q);
netif_wake_queue(dev);
txq->tx_packets++;
txq->tx_bytes += length;
return NETDEV_TX_OK;
}
__netif_tx_unlock(slave_txq);
}
if (netif_queue_stopped(dev))
busy = 1;
break;
case 1:
/* Only possible on the second lap (skb_res != NULL).
 * NOTE(review): the skb is not freed here, presumably because the
 * neighbour layer now owns it - confirm. */
master->slaves = NEXT_SLAVE(q);
return NETDEV_TX_OK;
default:
nores = 1;
break;
}
/* NOTE(review): presumably strips the link-layer header pushed for
 * this slave before the next one is tried - confirm. */
__skb_pull(skb, skb_network_offset(skb));
} while ((q = NEXT_SLAVE(q)) != start);
/* First lap ended with an unresolved slave: retry once with skb_res set. */
if (nores && skb_res == NULL) {
skb_res = skb;
goto restart;
}
if (busy) {
netif_stop_queue(dev);
return NETDEV_TX_BUSY;
}
dev->stats.tx_errors++;
drop:
txq->tx_dropped++;
dev_kfree_skb(skb);
return NETDEV_TX_OK;
}
/*
 * teql_master_open - bring the master device up.
 *
 * Requires at least one slave (-EUNATCH otherwise, also when a slave
 * qdisc has no device).  The master's MTU becomes the smallest slave MTU
 * (capped at 0xFFFE), and an FMASK flag is kept only if every slave has
 * it.  A slave whose hard header exceeds LL_MAX_HEADER yields -EINVAL.
 */
static int teql_master_open(struct net_device *dev)
{
	struct teql_master *master = netdev_priv(dev);
	struct Qdisc *cur;
	unsigned flags;
	int mtu = 0xFFFE;

	if (master->slaves == NULL)
		return -EUNATCH;

	flags = FMASK;
	cur = master->slaves;
	do {
		struct net_device *slave = qdisc_dev(cur);

		if (slave == NULL)
			return -EUNATCH;

		if (slave->mtu < mtu)
			mtu = slave->mtu;

		if (slave->hard_header_len > LL_MAX_HEADER)
			return -EINVAL;

		/*
		 * All slaves BROADCAST -> master is BROADCAST.
		 * All slaves PtP       -> master is PtP.
		 * Anything else        -> master is NBMA.
		 */
		if (!(slave->flags&IFF_POINTOPOINT))
			flags &= ~IFF_POINTOPOINT;
		if (!(slave->flags&IFF_BROADCAST))
			flags &= ~IFF_BROADCAST;
		if (!(slave->flags&IFF_MULTICAST))
			flags &= ~IFF_MULTICAST;

		cur = NEXT_SLAVE(cur);
	} while (cur != master->slaves);

	master->dev->mtu = mtu;
	master->dev->flags = (master->dev->flags&~FMASK) | flags;
	netif_start_queue(master->dev);
	return 0;
}
/* Stop handler of the master device: halts its transmit queue; returns 0. */
static int teql_master_close(struct net_device *dev)
{
	netif_stop_queue(dev);

	return 0;
}
static int teql_master_mtu(struct net_device *dev, int new_mtu)
{
struct teql_master *m = netdev_priv(dev);
struct Qdisc *q;
if (new_mtu < 68)
return -EINVAL;
q = m->slaves;
if (q) {
do {
if (new_mtu > qdisc_dev(q)->mtu)
return -EINVAL;
} while ((q=NEXT_SLAVE(q)) != m->slaves);
}
dev->mtu = new_mtu;
return 0;
}
/* Net device callbacks of the teql master device. */
static const struct net_device_ops teql_netdev_ops = {
	.ndo_start_xmit	= teql_master_xmit,
	.ndo_change_mtu	= teql_master_mtu,
	.ndo_open	= teql_master_open,
	.ndo_stop	= teql_master_close,
};
/*
 * teql_master_setup - setup callback passed to alloc_netdev().
 *
 * Fills in the qdisc ops embedded in the master's private area and the
 * initial attributes of the master net device.
 */
static __init void teql_master_setup(struct net_device *dev)
{
	struct teql_master *master = netdev_priv(dev);
	struct Qdisc_ops *qops = &master->qops;

	master->dev = dev;

	/* Slave qdisc operations. */
	qops->owner	= THIS_MODULE;
	qops->priv_size	= sizeof(struct teql_sched_data);
	qops->init	= teql_qdisc_init;
	qops->reset	= teql_reset;
	qops->destroy	= teql_destroy;
	qops->enqueue	= teql_enqueue;
	qops->dequeue	= teql_dequeue;
	qops->peek	= teql_peek;

	/* Master device defaults. */
	dev->netdev_ops		= &teql_netdev_ops;
	dev->type		= ARPHRD_VOID;
	dev->flags		= IFF_NOARP;
	dev->mtu		= 1500;
	dev->tx_queue_len	= 100;
	dev->hard_header_len	= LL_MAX_HEADER;
}
/* All masters registered by teql_init(); walked and emptied by teql_exit(). */
static LIST_HEAD(master_dev_list);
/* Upper bound on the number of teqlN masters teql_init() tries to create
 * (module parameter, default 1). */
static int max_equalizers = 1;
module_param(max_equalizers, int, 0);
MODULE_PARM_DESC(max_equalizers, "Max number of link equalizers");
/*
 * teql_init - create up to max_equalizers master devices.
 *
 * For each master: allocate the net device, register it, name the
 * embedded qdisc ops after the device and register them, then add the
 * master to master_dev_list.  The loop stops at the first failure after
 * undoing that iteration's partial work.
 *
 * Returns 0 if at least one master was created, otherwise the error of
 * the first failure (-ENODEV if the loop never ran).
 */
static int __init teql_init(void)
{
	int err = -ENODEV;
	int i;

	for (i = 0; i < max_equalizers; i++) {
		struct teql_master *master;
		struct net_device *dev;

		dev = alloc_netdev(sizeof(struct teql_master),
				   "teql%d", teql_master_setup);
		if (dev == NULL) {
			err = -ENOMEM;
			break;
		}

		err = register_netdev(dev);
		if (err) {
			free_netdev(dev);
			break;
		}

		master = netdev_priv(dev);
		strlcpy(master->qops.id, dev->name, IFNAMSIZ);

		err = register_qdisc(&master->qops);
		if (err) {
			unregister_netdev(dev);
			free_netdev(dev);
			break;
		}

		list_add_tail(&master->master_list, &master_dev_list);
	}

	if (i)
		return 0;
	return err;
}
/*
 * teql_exit - tear down every master created by teql_init().
 *
 * Each master is unlinked from the list, its qdisc ops and net device are
 * unregistered, and the device is freed.
 */
static void __exit teql_exit(void)
{
	struct teql_master *cur, *tmp;

	/* The _safe iterator is needed because entries are removed while walking. */
	list_for_each_entry_safe(cur, tmp, &master_dev_list, master_list) {
		list_del(&cur->master_list);

		unregister_qdisc(&cur->qops);
		unregister_netdev(cur->dev);
		free_netdev(cur->dev);
	}
}
/* Bind the init/exit handlers to module load/unload and declare the licence. */
module_init(teql_init);
module_exit(teql_exit);
MODULE_LICENSE("GPL");