This is a discussion on sendfile(2) of 03-18 within the mailing.openbsd.tech forums, part of the OpenBSD category; After a few days of digging in the VFS code, I've used the VOP* call that seems appropriate to ...
| |||||||
| FAQ | Members List | Calendar | Search | Today's Posts | Mark Forums Read |
| ||||
| After a few days of digging in the VFS code, I've used the VOP* call that seems appropriate to populate bufs, instead of the bread call I had before; I've added a quick'n'dirty readahead for performance reasons (or so I think...) and addressed a major oversight on my part: not paying attention to the offset. I'm still somewhat unsure how to deal with the error return from copyout versus the error return from sosendfile. Which is more important? Of course, this issue could be rendered entirely moot, as I'm starting to look towards emulating the Linux sendfile interface, as it is simpler and more consistent with the other send* functions in appearance and use. I'd much appreciate any feedback on the API issue from the community, especially those of you maintaining network software. - Bert Index: src/sys/kern/uipc_syscalls.c ================================================== ================= RCS file: /cvs/src/sys/kern/uipc_syscalls.c,v retrieving revision 1.66 diff -u -r1.66 uipc_syscalls.c --- src/sys/kern/uipc_syscalls.c 23 Oct 2006 07:13:56 -0000 1.66 +++ src/sys/kern/uipc_syscalls.c 19 Mar 2007 02:28:59 -0000 @@ -47,6 +47,7 @@ #include <sys/signalvar.h> #include <sys/unpcb.h> #include <sys/un.h> +#include <sys/vnode.h> #ifdef KTRACE #include <sys/ktrace.h> #endif @@ -1091,4 +1092,69 @@ FREF(fp); return (0); +} + +int +sys_sendfile(struct proc *p, void* v, register_t *retval) +{ + struct sys_sendfile_args /* { + syscallarg(int) fd; + syscallarg(int) s; + syscallarg(off_t) off; + syscallarg(size_t) nbytes; + syscallarg(off_t *) sbytes; + syscallarg(int) flags; + } */ *uap = v; + struct file *fp, *sp; + struct socket *so; + struct vnode *vp; + off_t *sbytes, sbtmp, off; + size_t nbytes; + int error, flags, fd, s; + + nbytes = SCARG(uap, nbytes); + sbytes = SCARG(uap, sbytes); + flags = SCARG(uap, flags); + off = SCARG(uap, off); + fd = SCARG(uap, fd); + s = SCARG(uap, s); + + /* + * Malicious or incorrect values + * may cause off + nbytes to wrap. 
+ */ + if (off < 0 || off + nbytes < off) { + error = EINVAL; + goto out; + } + + /* validate userspace pointer */ + if (sbytes != NULL) + if ((error = copyin(sbytes, &sbtmp, sizeof(off_t))) != 0) + goto out; + + if ((error = getvnode(p->p_fd, fd, &fp)) != 0) + goto out; + vp = fp->f_data; + + if ((error = getsock(p->p_fd, s, &sp)) != 0) + goto vfail; + so = sp->f_data; + + if (!(so->so_type & SOCK_STREAM)) { + error = EINVAL; + goto sfail; + } + + error = sosendfile(so, vp, nbytes, &sbtmp, p, off, flags); + + if (sbytes != NULL) + copyout(&sbtmp, sbytes, sizeof(off_t)); + +sfail: + FRELE(sp); +vfail: + FRELE(fp); +out: + return (error); } Index: src/sys/kern/uipc_socket.c ================================================== ================= RCS file: /cvs/src/sys/kern/uipc_socket.c,v retrieving revision 1.66 diff -u -r1.66 uipc_socket.c --- src/sys/kern/uipc_socket.c 26 Feb 2007 23:53:33 -0000 1.66 +++ src/sys/kern/uipc_socket.c 19 Mar 2007 02:29:11 -0000 @@ -47,6 +47,9 @@ #include <sys/signalvar.h> #include <sys/resourcevar.h> #include <sys/pool.h> +#include <sys/buf.h> +#include <sys/vnode.h> +#include <sys/mount.h> void filt_sordetach(struct knote *kn); int filt_soread(struct knote *kn, long hint); @@ -54,6 +57,8 @@ int filt_sowrite(struct knote *kn, long hint); int filt_solisten(struct knote *kn, long hint); +void sf_free_fbuf(caddr_t, u_int, void *); + struct filterops solisten_filtops = { 1, NULL, filt_sordetach, filt_solisten }; struct filterops soread_filtops = @@ -1295,4 +1300,184 @@ kn->kn_data = so->so_qlen; return (so->so_qlen != 0); +} + +/* + * Implement zero-copy file transmit over a socket. + * + * Socket MUST be SOCK_STREAM. + * + * File bufs are used as external data storage for mbufs, + * which are passed to the appropriate socket send routine. + * + * If a user has specified a number of bytes to send, transmit + * that amount of data; otherwise, send the entire file. Return + * the number of bytes sent in sbytes. 
+ * + * Flags: + * SF_NODISKIO: + * return EBUSY instead of waiting + * for disk I/O to complete + */ +#define SFRABLKS 2 /* better value is ??? */ +int +sosendfile(struct socket *so, struct vnode *vp, size_t nbytes, off_t *sbytes, + struct proc *p, off_t off, int flags) +{ + struct vattr va; + struct mbuf *m, *m0; + struct buf *bp, *bbp; + ssize_t space; + off_t resid, sent, sent0; + u_int sblk, eblk, len; + int error, i, j, s; + + sent = sent0 = 0; + m = NULL; + + s = splsoftnet(); + + if ((error = sblock(&so->so_snd, M_WAITOK)) != 0) + goto out; + + if ((vn_lock(vp, (LK_EXCLUSIVE | LK_RETRY), p) != 0) || + (VOP_GETATTR(vp, &va, p->p_ucred, p) != 0)) { + error = EIO; + goto release; + } + + /* + * off + nbytes wraparound validated in sys_sendfile() + */ + if (off + nbytes > va.va_size) { + error = EINVAL; + goto release; + } + + if (nbytes > 0) + resid = nbytes; + else + resid = va.va_size - off; + + eblk = (u_int)((off + resid) / va.va_blocksize); + sblk = (u_int)(off / va.va_blocksize); + off -= sblk * va.va_blocksize; + + so->so_state |= SS_ISSENDING; + + for (i = sblk; i <= eblk + + if (!(so->so_state & SS_ISCONNECTED)) + error = ENOTCONN; + else if (so->so_state & SS_CANTSENDMORE) + error = EPIPE; + else if (so->so_error) { + error = so->so_error; + so->so_error = 0; + } + if (error) + goto release; + +retry: + space = sbspace(&so->so_snd); + if (space < so->so_snd.sb_lowat && space < resid) { + if ((error = sbwait(&so->so_snd)) != 0) + goto release; + + goto retry; + } + if (space > resid) + space = resid; + + while (space > 0) { + + bp = getblk(vp, i, va.va_blocksize, 0, 0); + + /* + * If it wasn't found in the cache, then disk I/O + * is needed; is that what the caller asked for? 
+ */ + if (!(bp->b_flags & (B_DONE | B_DELWRI))) { + if (flags & SF_NODISKIO) { + error = EBUSY; + brelse(bp); + goto release; + } + bp->b_flags |= B_READ; + VOP_STRATEGY(bp); + p->p_stats->p_ru.ru_inblock++; + } + + /* async readahead */ + for (j = 1; j < min(eblk, i + SFRABLKS); j++) { + bbp = getblk(vp, i + j, va.va_blocksize, 0, 0); + if (!(bbp->b_flags & (B_DONE | B_DELWRI))) { + bbp->b_flags |= (B_READ | B_ASYNC); + VOP_STRATEGY(bbp); + p->p_stats->p_ru.ru_inblock++; + } else { + brelse(bbp); + } + } + + error = biowait(bp); + if (error) { + brelse(bp); + goto release; + } + + len = (u_int)bp->b_bcount - off; + if (len > resid) + len = (u_int)resid; + + if (m == NULL) { + MGETHDR(m0, M_WAITOK, MT_DATA); + m = m0; + m->m_pkthdr.rcvif = NULL; + m->m_pkthdr.len = 0; + } else { + MGET(m0, M_WAITOK, MT_DATA); + m_cat(m, m0); + } + MEXTADD(m0, bp->b_data + off, len, + M_FILE, sf_free_fbuf, bp); + m->m_pkthdr.len += len; + + sent0 += len; + resid -= len; + space -= len; + off = 0; + i++; + } + + error = (*so->so_proto->pr_usrreq)(so, PRU_SEND, m, NULL, NULL); + if (error) + goto release; + + m_freem(m); + + sent = sent0; + } + +release: + VOP_UNLOCK(vp, 0, p); + so->so_state &= ~SS_ISSENDING; + sbunlock(&so->so_snd); +out: + splx(s); + if (m != NULL) + m_freem(m); + *sbytes = sent; + return (error); +} + +/* + * Utility function to free file buffers + * as they are consumed in sosendfile. 
+ */ +/* ARGSUSED */ +void +sf_free_fbuf(caddr_t buf, u_int size, void *arg) +{ + brelse(arg); } Index: src/sys/kern/syscalls.master ================================================== ================= RCS file: /cvs/src/sys/kern/syscalls.master,v retrieving revision 1.86 diff -u -r1.86 syscalls.master --- src/sys/kern/syscalls.master 22 Sep 2006 17:35:41 -0000 1.86 +++ src/sys/kern/syscalls.master 19 Mar 2007 02:29:26 -0000 @@ -611,3 +611,5 @@ 304 STD { int sys___getcwd(char *buf, size_t len); } 305 STD { int sys_adjfreq(const int64_t *freq, \ int64_t *oldfreq); } +306 STD { int sys_sendfile(int fd, int s, off_t off, \ + size_t nbytes, off_t *sbytes, int flags); } Index: src/sys/sys/socket.h ================================================== ================= RCS file: /cvs/src/sys/sys/socket.h,v retrieving revision 1.53 diff -u -r1.53 socket.h --- src/sys/sys/socket.h 31 Mar 2006 17:30:39 -0000 1.53 +++ src/sys/sys/socket.h 19 Mar 2007 02:29:47 -0000 @@ -436,6 +436,11 @@ #define SA_LEN(x) ((x)->sa_len) +/* + * Sendfile flags. + */ +#define SF_NODISKIO 0x0001 + #ifndef _KERNEL #include <sys/cdefs.h> @@ -453,6 +458,7 @@ ssize_t recvfrom(int, void *, size_t, int, struct sockaddr *, socklen_t *); ssize_t recvmsg(int, struct msghdr *, int); ssize_t send(int, const void *, size_t, int); +int sendfile(int, int, off_t, size_t, off_t *, int); ssize_t sendto(int, const void *, size_t, int, const struct sockaddr *, socklen_t); ssize_t sendmsg(int, const struct msghdr *, int); Index: src/sys/sys/socketvar.h ================================================== ================= RCS file: /cvs/src/sys/sys/socketvar.h,v retrieving revision 1.39 diff -u -r1.39 socketvar.h --- src/sys/sys/socketvar.h 26 Feb 2007 23:53:33 -0000 1.39 +++ src/sys/sys/socketvar.h 19 Mar 2007 02:29:56 -0000 @@ -37,6 +37,8 @@ TAILQ_HEAD(soqhead, socket); +struct vnode; + /* * Kernel structure per socket. 
* Contains send and receive buffer queues, @@ -302,6 +304,8 @@ void sorflush(struct socket *so); int sosend(struct socket *so, struct mbuf *addr, struct uio *uio, struct mbuf *top, struct mbuf *control, int flags); +int sosendfile(struct socket *so, struct vnode *vp, size_t nbytes, + off_t *sbytes, struct proc *p, off_t off, int flags); int sosetopt(struct socket *so, int level, int optname, struct mbuf *m0); int soshutdown(struct socket *so, int how); Index: src/lib/libc/sys/Makefile.inc ================================================== ================= RCS file: /cvs/src/lib/libc/sys/Makefile.inc,v retrieving revision 1.80 diff -u -r1.80 Makefile.inc --- src/lib/libc/sys/Makefile.inc 24 Oct 2006 04:40:59 -0000 1.80 +++ src/lib/libc/sys/Makefile.inc 19 Mar 2007 02:30:52 -0000 @@ -50,7 +50,8 @@ munlock.o munlockall.o munmap.o nanosleep.o nfssvc.o \ open.o pathconf.o pipe.o poll.o profil.o quotactl.o \ read.o readlink.o readv.o reboot.o recvfrom.o recvmsg.o rename.o \ - revoke.o rmdir.o select.o semget.o semop.o sendmsg.o sendto.o \ + revoke.o rmdir.o select.o semget.o semop.o sendfile.o sendmsg.o \ + sendto.o \ setegid.o seteuid.o setgid.o setgroups.o setitimer.o setpgid.o \ setpriority.o setregid.o setreuid.o setresgid.o setresuid.o \ setrlimit.o setsid.o setsockopt.o settimeofday.o \ Index: src/libexec/tftpd/tftpd.c ================================================== ================= RCS file: /cvs/src/libexec/tftpd/tftpd.c,v retrieving revision 1.54 diff -u -r1.54 tftpd.c --- src/libexec/tftpd/tftpd.c 15 Dec 2006 05:52:06 -0000 1.54 +++ src/libexec/tftpd/tftpd.c 19 Mar 2007 02:31:28 -0000 @@ -83,7 +83,7 @@ __dead void usage(void); void tftp(struct tftphdr *, int); int validate_access(char *, int); -int sendfile(struct formats *); +int tftpd_sendfile(struct formats *); int recvfile(struct formats *); void nak(int); void oack(int); @@ -112,8 +112,8 @@ int (*f_recv)(struct formats *); int f_convert; } formats[] = { - { "netascii", validate_access, sendfile, 
recvfile, 1 }, - { "octet", validate_access, sendfile, recvfile, 0 }, + { "netascii", validate_access, tftpd_sendfile, recvfile, 1 }, + { "octet", validate_access, tftpd_sendfile, recvfile, 0 }, { NULL, NULL, NULL, NULL, 0 } }; @@ -603,7 +603,7 @@ * Send the requested file. */ int -sendfile(struct formats *pf) +tftpd_sendfile(struct formats *pf) { struct tftphdr *dp, *r_init(void); struct tftphdr *ap; /* ack packet */ Index: src/usr.bin/tftp/main.c ================================================== ================= RCS file: /cvs/src/usr.bin/tftp/main.c,v retrieving revision 1.28 diff -u -r1.28 main.c --- src/usr.bin/tftp/main.c 26 Jul 2006 22:43:53 -0000 1.28 +++ src/usr.bin/tftp/main.c 19 Mar 2007 02:32:32 -0000 @@ -381,7 +381,7 @@ printf("putting %s to %s:%s [%s]\n", cp, hostname, targ, mode); peeraddr.sin_port = port; - sendfile(fd, targ, mode); + tftp_sendfile(fd, targ, mode); return; } @@ -402,7 +402,7 @@ printf("putting %s to %s:%s [%s]\n", argv[n], hostname, cp, mode); peeraddr.sin_port = port; - sendfile(fd, cp, mode); + tftp_sendfile(fd, cp, mode); free(cp); } } Index: src/usr.bin/tftp/tftp.c ================================================== ================= RCS file: /cvs/src/usr.bin/tftp/tftp.c,v retrieving revision 1.20 diff -u -r1.20 tftp.c --- src/usr.bin/tftp/tftp.c 26 Jul 2006 09:10:03 -0000 1.20 +++ src/usr.bin/tftp/tftp.c 19 Mar 2007 02:34:41 -0000 @@ -129,7 +129,7 @@ * Send the requested file. 
*/ void -sendfile(int fd, char *name, char *mode) +tftp_sendfile(int fd, char *name, char *mode) { struct tftphdr *dp, *ap; /* data and ack packets */ struct sockaddr_in from; Index: src/usr.bin/tftp/extern.h ================================================== ================= RCS file: /cvs/src/usr.bin/tftp/extern.h,v retrieving revision 1.5 diff -u -r1.5 extern.h --- src/usr.bin/tftp/extern.h 26 Jul 2006 16:43:31 -0000 1.5 +++ src/usr.bin/tftp/extern.h 19 Mar 2007 02:35:58 -0000 @@ -33,7 +33,7 @@ */ void recvfile(int, char *, char *); -void sendfile(int, char *, char *); +void tftp_sendfile(int, char *, char *); #define TIMEOUT 5 /* packet rexmt timeout */ #define TIMEOUT_MIN 1 /* minimal packet rexmt timeout */ |
| Thread Tools | |
| Display Modes | |
|
|