David Miller a écrit :
From: Eric Dumazet <[EMAIL PROTECTED]>
Date: Mon, 31 Dec 2007 09:54:32 +0100

Maybe I read the patch incorrectly, or we could add some new sysctl so that
we do not try to uncharge memory if a socket's 'forward_alloc' is beyond a given limit (say 2 pages), so that the number of atomic_inc/dec operations on udp_memory_allocated (or tcp_memory_allocated) is reduced.

This is what we should be striving for, using forward_alloc
as much as possible as a "cache" to avoid the atomics on
the global var as much as possible.

Thank you for this confirmation David, I understand now that TCP doesn't currently satisfy the contract.

For example, tcp_delack_timer() calls sk_mem_reclaim().

So on a machine with a lot of mostly idle sockets (but all sockets are doing some traffic, say one message per minute per socket), we can see:

$ grep TCP /proc/net/sockstat
TCP: inuse 1083667 orphan 8840 tw 6646 alloc 1083809 mem 262305
$ cat /proc/sys/net/ipv4/tcp_mem
2000000 3000000 4000000

So an average of 1/4 page is 'allocated' per socket :(

On this machine, we constantly change tcp_memory_allocated, even though we are always under the tcp_mem[0] limit.

Maybe we need to introduce some mechanism to let sk_forward_alloc stay between 0 and SK_MEM_QUANTUM (inclusive).

/*
 * Call __sk_mem_reclaim() only when sk->sk_forward_alloc exceeds
 * SK_MEM_QUANTUM; at or below that value the socket is left untouched.
 */
static inline void sk_mem_reclaim_overpage(struct sock *sk)
{
        if (sk->sk_forward_alloc <= SK_MEM_QUANTUM)
                return;

        __sk_mem_reclaim(sk);
}

and use sk_mem_reclaim_overpage() instead of sk_mem_reclaim() in tcp_delack_timer() ?

Thank you

Small program output :

$ gcc -o prog prog.c ; ./prog
TCP: inuse 1035 orphan 0 tw 271 alloc 1203 mem 16
TCP: inuse 1035 orphan 0 tw 271 alloc 1203 mem 4016
TCP: inuse 1034 orphan 0 tw 272 alloc 1202 mem 3015
TCP: inuse 1034 orphan 0 tw 272 alloc 1202 mem 3016
TCP: inuse 1034 orphan 0 tw 272 alloc 1202 mem 3516
TCP: inuse 1034 orphan 0 tw 272 alloc 1202 mem 14 <OOPS>

$ cat prog.c
#include <sys/socket.h>
#include <sys/ioctl.h>
#include <string.h>
#include <stdlib.h>
#include <limits.h>
#include <netinet/tcp.h>
#include <arpa/inet.h>
#include <netdb.h>
#include <unistd.h>
#include <errno.h>
#include <asm/ioctls.h>
#include <stdio.h>

/* Number of descriptor slots to use; main() overrides it from the -n option. */
int SOCK_COUNT = 1000;
/* Descriptor table; main() allocates room for SOCK_COUNT ints. */
int *sockets_fd_tab;
/* Number of descriptors currently stored in sockets_fd_tab. */
unsigned int count;

/*
 * Fill sockets_fd_tab with descriptor pairs until no further pair fits in
 * SOCK_COUNT entries or a socket can no longer be created.
 *
 * domain: AF_UNIX creates each pair with socketpair(); any other value
 *         creates AF_INET sockets.
 * type:   socket type handed to socket()/socketpair().  For AF_INET with
 *         SOCK_STREAM, every new socket is connected to a local listening
 *         socket and stored together with the accepted peer.
 *
 * Descriptors are appended two at a time and the global 'count' is advanced
 * accordingly.  If the listening socket cannot be set up, an error is
 * printed and nothing is stored.
 */
static void open_sockets(int domain, int type)
{
        int fdlisten = -1, on = 1;
        socklen_t addrlen;
        struct sockaddr_in host, peer;
        /* A non-positive SOCK_COUNT leaves no room in the table. */
        const unsigned int limit = SOCK_COUNT > 0 ? (unsigned int)SOCK_COUNT : 0;

        if (domain == AF_INET && type == SOCK_STREAM) {
                fdlisten = socket(AF_INET, type, 0);
                if (fdlisten == -1) {
                        perror("socket");
                        return;
                }
                setsockopt(fdlisten, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on));
                /*
                 * host is zeroed, so bind() is given port 0; getsockname()
                 * then reads back the address actually bound, which the
                 * connect() calls below use as their destination.
                 */
                memset(&host, 0, sizeof(host));
                host.sin_family = AF_INET;
                addrlen = sizeof(host);
                if (bind(fdlisten, (struct sockaddr *)&host, sizeof(host)) == -1 ||
                    getsockname(fdlisten, (struct sockaddr *)&host, &addrlen) == -1 ||
                    listen(fdlisten, 5) == -1) {
                        perror("listening socket setup");
                        close(fdlisten);
                        return;
                }
        }

        /*
         * Every iteration stores two descriptors, so only continue while two
         * free slots remain.  Comparing with '==' after the store would
         * write past the table for an odd, zero or negative SOCK_COUNT.
         */
        while (count < limit && limit - count >= 2) {
                int res, vec[2];

                if (domain == AF_UNIX) {
                        res = socketpair(AF_UNIX, type, 0, vec);
                        if (res == -1)
                                break;
                } else {
                        vec[0] = socket(AF_INET, type, 0);
                        if (vec[0] == -1)
                                break;
                        ioctl(vec[0], FIONBIO, &on);
                        if (type == SOCK_STREAM) {
                                /*
                                 * The socket is non-blocking, so connect() may
                                 * return before the connection completes; its
                                 * result is not checked here.
                                 */
                                connect(vec[0], (struct sockaddr *)&host, sizeof(host));
                                addrlen = sizeof(peer);
                                vec[1] = accept(fdlisten, (struct sockaddr *)&peer, &addrlen);
                                if (vec[1] == -1) {
                                        close(vec[0]);
                                        break;
                                }
                        } else {
                                /* No peer exists here; never store an indeterminate value. */
                                vec[1] = -1;
                        }
                }
                sockets_fd_tab[count++] = vec[0];
                sockets_fd_tab[count++] = vec[1];
        }
}

/* Payload for fill_sockets(), which sends its first 100 bytes; bytes after the string are zero. */
const char some_msg[1024] = "One dummy message";

/*
 * Send two 100-byte messages taken from some_msg on every descriptor in
 * sockets_fd_tab.  A full pass over the table finishes before the second
 * pass starts.  The results of send() are not checked.
 */
static void fill_sockets()
{
        int pass;

        for (pass = 0; pass < 2; pass++) {
                unsigned int idx = 0;

                while (idx < count) {
                        send(sockets_fd_tab[idx], some_msg, 100, 0);
                        idx++;
                }
        }
}

/*
 * Issue one recv() of up to 4096 bytes on every descriptor stored in
 * sockets_fd_tab.  The received data and the return values are discarded.
 */
static void empty_sockets()
{
        char scratch[4096];
        unsigned int idx = 0;

        while (idx < count) {
                recv(sockets_fd_tab[idx], scratch, sizeof(scratch), 0);
                idx++;
        }
}

/* Run a shell command that prints the TCP line(s) of /proc/net/sockstat. */
static void dump_infos()
{
        static const char cmd[] = "grep TCP /proc/net/sockstat";

        system(cmd);
}

/*
 * Open the sockets, then print the TCP socket statistics after each step of
 * two fill/empty rounds.
 *
 * Options:
 *   -n COUNT   number of descriptor slots to use (positive integer,
 *              default 1000)
 *
 * Returns 0 on success, 1 on an invalid -n value or allocation failure.
 */
int main(int argc, char *argv[])
{
        int c;

        /* POSIX specifies -1, not EOF, as getopt()'s end-of-options value. */
        while ((c = getopt(argc, argv, "n:")) != -1) {
                if (c == 'n') {
                        char *end;
                        long val;

                        /* strtol() instead of atoi(): detects garbage and overflow. */
                        errno = 0;
                        val = strtol(optarg, &end, 10);
                        if (end == optarg || *end != '\0' || errno == ERANGE ||
                            val <= 0 || val > INT_MAX) {
                                fprintf(stderr, "invalid socket count: %s\n", optarg);
                                return 1;
                        }
                        SOCK_COUNT = (int)val;
                }
        }

        sockets_fd_tab = malloc((size_t)SOCK_COUNT * sizeof(*sockets_fd_tab));
        if (sockets_fd_tab == NULL) {
                perror("malloc");
                return 1;
        }

        open_sockets(AF_INET, SOCK_STREAM);
        dump_infos();

        fill_sockets();
        dump_infos();
        sleep(1); /* to see effect of delayed acks */
        dump_infos();
        empty_sockets();
        dump_infos();

        fill_sockets();
        dump_infos();
        empty_sockets();
        sleep(1); /* to see effect of delayed acks */
        dump_infos();

        return 0;
}

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Reply via email to