ip_nat_fn()


static unsigned int
ip_nat_fn(unsigned int hooknum,
	  struct sk_buff **pskb,
	  const struct net_device *in,
	  const struct net_device *out,
	  int (*okfn)(struct sk_buff *))
{
	struct ip_conntrack *ct;
	enum ip_conntrack_info ctinfo;
	struct ip_nat_info *info;
	/* maniptype == SRC for postrouting. */
	enum ip_nat_manip_type maniptype = HOOK2MANIP(hooknum);

/* We never see fragments: conntrack defrags on pre-routing and local-out, and ip_nat_out protects post-routing. */ IP_NF_ASSERT(!((*pskb)->nh.iph->frag_off & __constant_htons(IP_MF|IP_OFFSET)));

(*pskb)->nfcache |= NFC_UNKNOWN;

/* If we had a hardware checksum before, it's now invalid */ if ((*pskb)->ip_summed == CHECKSUM_HW) (*pskb)->ip_summed = CHECKSUM_NONE;

ct = ip_conntrack_get(*pskb, &ctinfo); /* Can't track? It's not due to stress, or conntrack would have dropped it. Hence it's the user's responsibilty to packet filter it out, or implement conntrack/NAT for that protocol. 8) --RR */ if (!ct) { /* Exception: ICMP redirect to new connection (not in hash table yet). We must not let this through, in case we're doing NAT to the same network. */ struct iphdr *iph = (*pskb)->nh.iph; struct icmphdr *hdr = (struct icmphdr *) ((u_int32_t *)iph + iph->ihl); if (iph->protocol == IPPROTO_ICMP && hdr->type == ICMP_REDIRECT) return NF_DROP; return NF_ACCEPT; }

switch (ctinfo) { case IP_CT_RELATED: case IP_CT_RELATED+IP_CT_IS_REPLY: if ((*pskb)->nh.iph->protocol == IPPROTO_ICMP) { return icmp_reply_translation(*pskb, ct, hooknum, CTINFO2DIR(ctinfo)); } /* Fall thru... (Only ICMPs can be IP_CT_IS_REPLY) */

In the case of IP_CT_NEW, code checks info->initialized flag with and-ed
operation. The checked bit is 1 << maniptype.
maniptype is decided at the first of this function as following

enum ip_nat_manip_type maniptype = HOOK2MANIP(hooknum);

This value is eiher of IPNAT_MANIP_SRC or IPNAT_MANIP_DST.
If it is not initialized, calls ip_nat_rule_file(), which calls
ipt_do_table() to check the necessity ofNAT translation.

	case IP_CT_NEW:
		info = &ct->nat.info;

WRITE_LOCK(&ip_nat_lock); /* Seen it before? This can happen for loopback, retrans, or local packets.. */ if (!(info->initialized & (1 << maniptype))) { int in_hashes = info->initialized; unsigned int ret;

ret = ip_nat_rule_find(pskb, hooknum, in, out, ct, info); if (ret != NF_ACCEPT) { WRITE_UNLOCK(&ip_nat_lock); return ret; }

If this conntrack is in bysource hash[], the code replace it with new
hash value made from new tuple.
Otherwise, put the ip_conntrack into bysource[] hash by place_in_hashes.

These placement or replacement is happened to relate the connection to
original or translated tuple (reply tuple: IP_CT_DIR_REPLY).

			if (in_hashes) {
				IP_NF_ASSERT(info->bysource.conntrack);
				replace_in_hashes(ct, info);
			} else {
				place_in_hashes(ct, info);
			}
		} else
			DEBUGP("Already setup manip %s for ct %p\n",
			       maniptype == IP_NAT_MANIP_SRC ? "SRC" : "DST",
			       ct);
		WRITE_UNLOCK(&ip_nat_lock);
		break;

default: /* ESTABLISHED */ IP_NF_ASSERT(ctinfo == IP_CT_ESTABLISHED || ctinfo == (IP_CT_ESTABLISHED+IP_CT_IS_REPLY)); info = &ct->nat.info; }

Once connection tracking starts andNAT table is established,
do_bindings() does theNAT translation jobs here.

	IP_NF_ASSERT(info);
	return do_bindings(ct, ctinfo, info, hooknum, pskb);
}

ip_nat_used_tuple()


ip_nat_used_tuple() is defined in ${linux src}/net/ipv4/netfilter/ip_nat_core.c.

ip_nat_used_tuple(const struct ip_conntrack_tuple *tuple,
                  const struct ip_conntrack *ignored_conntrack)
{
        /* Conntrack tracking doesn't keep track of outgoing tuples; only
           incoming ones.  NAT means they don't have a fixed mapping,
           so we invert the tuple and look for the incoming reply.

We could keep a separate hash if this proves too slow. */ struct ip_conntrack_tuple reply;

invert_tuplepr(&reply, tuple); return ip_conntrack_tuple_taken(&reply, ignored_conntrack); }

And ip_conntrack_tuple_taken is defined in
${linux src}/net/ipv4/netfilter/ip_conntrack_standalone.c

/* Return non-zero when @tuple is already present in the conntrack
 * table, ignoring @ignored_conntrack.  Holds the read lock only for
 * the duration of the lookup; the answer may be stale by the time the
 * caller uses it. */
int
ip_conntrack_tuple_taken(const struct ip_conntrack_tuple *tuple,
                         const struct ip_conntrack *ignored_conntrack)
{
	struct ip_conntrack_tuple_hash *found;

	READ_LOCK(&ip_conntrack_lock);
	found = __ip_conntrack_find(tuple, ignored_conntrack);
	READ_UNLOCK(&ip_conntrack_lock);

	return found != NULL;
}

init_conntrack()


/* Build a new, not-yet-confirmed conntrack entry for @tuple.
 * Returns the ORIGINAL-direction tuple hash on success; NULL when the
 * tuple cannot be inverted or the protocol's new() handler rejects the
 * packet; ERR_PTR(-ENOMEM) when the table is full or allocation fails.
 * The entry is NOT placed in the hash table here. */
static struct ip_conntrack_tuple_hash *
init_conntrack(const struct ip_conntrack_tuple *tuple,
	       struct ip_conntrack_protocol *protocol,
	       struct sk_buff *skb)
{
	struct ip_conntrack *conntrack;
	struct ip_conntrack_tuple repl_tuple;
	size_t hash, repl_hash;
	struct ip_conntrack_expect *expected;
	int i;
	/* Rotating starting chain for the early_drop eviction below.
	 * NOTE(review): incremented before ip_conntrack_lock is taken —
	 * presumably benign since it is only a scan starting point. */
	static unsigned int drop_next = 0;

hash = hash_conntrack(tuple);

/* Table full: try to reclaim room by evicting an entry, first from the
 * rotating chain, then from the chain this entry would hash into. */
if (ip_conntrack_max && atomic_read(&ip_conntrack_count) >= ip_conntrack_max) { /* Try dropping from random chain, or else from the chain about to put into (in case they're trying to bomb one hash chain). */ if (drop_next >= ip_conntrack_htable_size) drop_next = 0; if (!early_drop(&ip_conntrack_hash[drop_next++]) && !early_drop(&ip_conntrack_hash[hash])) { if (net_ratelimit()) printk(KERN_WARNING "ip_conntrack: table full, dropping" " packet.\n"); return ERR_PTR(-ENOMEM); } }

/* Compute the reply-direction tuple and its hash. */
if (!invert_tuple(&repl_tuple, tuple, protocol)) { DEBUGP("Can't invert tuple.\n"); return NULL; } repl_hash = hash_conntrack(&repl_tuple);

conntrack = kmem_cache_alloc(ip_conntrack_cachep, GFP_ATOMIC); if (!conntrack) { DEBUGP("Can't allocate conntrack.\n"); return ERR_PTR(-ENOMEM); }

/* Fill in both tuplehash directions and point every infos[] slot at
 * the shared ct_general refcount master. */
memset(conntrack, 0, sizeof(struct ip_conntrack)); atomic_set(&conntrack->ct_general.use, 1); conntrack->ct_general.destroy = destroy_conntrack; conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *tuple; conntrack->tuplehash[IP_CT_DIR_ORIGINAL].ctrack = conntrack; conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = repl_tuple; conntrack->tuplehash[IP_CT_DIR_REPLY].ctrack = conntrack; for (i=0; i < IP_CT_NUMBER; i++) conntrack->infos[i].master = &conntrack->ct_general;

/* Let the L4 protocol handler inspect the first packet; it may reject
 * it, in which case the fresh entry is freed. */
if (!protocol->new(conntrack, skb->nh.iph, skb->len)) { kmem_cache_free(ip_conntrack_cachep, conntrack); return NULL; } /* Don't set timer yet: wait for confirmation */ init_timer(&conntrack->timeout); conntrack->timeout.data = (unsigned long)conntrack; conntrack->timeout.function = death_by_timeout;

/* Mark clearly that it's not in the hash table. */ conntrack->tuplehash[IP_CT_DIR_ORIGINAL].list.next = NULL;

/* Under the write lock: attach a helper matching the reply tuple, and
 * consume a matching expectation — but only when the expecting master
 * conntrack was itself confirmed (see comment below for why). */
/* Write lock required for deletion of expected. Without this, a read-lock would do. */ WRITE_LOCK(&ip_conntrack_lock); conntrack->helper = LIST_FIND(&helpers, helper_cmp, struct ip_conntrack_helper *, &repl_tuple); /* Need finding and deleting of expected ONLY if we win race */ expected = LIST_FIND(&expect_list, expect_cmp, struct ip_conntrack_expect *, tuple); /* If master is not in hash table yet (ie. packet hasn't left this machine yet), how can other end know about expected? Hence these are not the droids you are looking for (if master ct never got confirmed, we'd hold a reference to it and weird things would happen to future packets). */ if (expected && is_confirmed(expected->expectant)) { /* Welcome, Mr. Bond. We've been expecting you... */ conntrack->status = IPS_EXPECTED; conntrack->master.master = &expected->expectant->ct_general; IP_NF_ASSERT(conntrack->master.master); LIST_DELETE(&expect_list, expected); expected->expectant = NULL; nf_conntrack_get(&conntrack->master); } atomic_inc(&ip_conntrack_count); WRITE_UNLOCK(&ip_conntrack_lock);

/* Run the expectation callback outside the lock. */
if (expected && expected->expectfn) expected->expectfn(conntrack); return &conntrack->tuplehash[IP_CT_DIR_ORIGINAL]; }

resolve_normal_ct()


static inline struct ip_conntrack *
resolve_normal_ct(struct sk_buff *skb,
		  struct ip_conntrack_protocol *proto,
		  int *set_reply,
		  unsigned int hooknum,
		  enum ip_conntrack_info *ctinfo)
{
	struct ip_conntrack_tuple tuple;
	struct ip_conntrack_tuple_hash *h;

IP_NF_ASSERT((skb->nh.iph->frag_off & htons(IP_OFFSET)) == 0);

if (!get_tuple(skb->nh.iph, skb->len, &tuple, proto)) return NULL;

/* look for tuple match */ h = ip_conntrack_find_get(&tuple, NULL); if (!h) { h = init_conntrack(&tuple, proto, skb); if (!h) return NULL; if (IS_ERR(h)) return (void *)h; }

/* It exists; we have (non-exclusive) reference. */ if (DIRECTION(h) == IP_CT_DIR_REPLY) { *ctinfo = IP_CT_ESTABLISHED + IP_CT_IS_REPLY; /* Please set reply bit if this packet OK */ *set_reply = 1; } else { /* Once we've had two way comms, always ESTABLISHED. */ if (h->ctrack->status & IPS_SEEN_REPLY) { DEBUGP("ip_conntrack_in: normal packet for %p\n", h->ctrack); *ctinfo = IP_CT_ESTABLISHED; } else if (h->ctrack->status & IPS_EXPECTED) { DEBUGP("ip_conntrack_in: related packet for %p\n", h->ctrack); *ctinfo = IP_CT_RELATED; } else { DEBUGP("ip_conntrack_in: new packet for %p\n", h->ctrack); *ctinfo = IP_CT_NEW; } *set_reply = 0; } skb->nfct = &h->ctrack->infos[*ctinfo]; return h->ctrack; }



ozen@pj4.harenet.ne.jp inserted by FC2 system