epoll-over-aio (epio): deliver epoll events through the aio completion ring.

Adds sys_io_modify(ctx, 1 /* IO_SET_EPOLLFD */, epfd) to attach an epoll
file to an aio context.  While attached, ready epoll items are queued as
io_events (obj == -2, res == poll mask, res2 == fd) instead of going on
the epoll ready list, and aio_read_evt() consumes the ring with cmpxchg()
instead of taking ring_lock.  A cancelled epoll event is marked obj == -1
and skipped by aio_read_evt().

Review notes:
 - The operands of the bare "#include" lines below were lost before this
   copy of the patch was made; restore them before trying to apply it.
 - linstall2.sh and the mm/mmap.c printk hunk look like local debugging
   leftovers unrelated to the feature -- confirm before merging.
 - Nothing visible here stops io_deliver_epoll_event() from advancing the
   tail onto unread entries when the ring is full -- TODO confirm.

--- .pc/epio.patch/arch/i386/kernel/entry.S	2004-10-14 12:58:37 -07:00
+++ arch/i386/kernel/entry.S	2004-10-14 12:58:41 -07:00
@@ -911,5 +911,6 @@ ENTRY(sys_call_table)
 	.long sys_add_key
 	.long sys_request_key
 	.long sys_keyctl
+	.long sys_io_modify
 
 syscall_table_size=(.-sys_call_table)
--- .pc/epio.patch/fs/aio.c	2004-10-14 12:58:40 -07:00
+++ fs/aio.c	2004-10-14 12:58:41 -07:00
@@ -1047,6 +1047,7 @@ static int aio_read_evt(struct kioctx *i
 {
 	struct aio_ring_info *info = &ioctx->ring_info;
 	struct aio_ring *ring;
+	struct io_event *evp;
 	unsigned long head;
 	int ret = 0;
 
@@ -1055,22 +1056,21 @@ static int aio_read_evt(struct kioctx *i
 		 (unsigned long)ring->head, (unsigned long)ring->tail,
 		 (unsigned long)ring->nr);
 
-	if (ring->head == ring->tail)
-		goto out;
-
-	spin_lock(&info->ring_lock);
-
-	head = ring->head % info->nr;
-	if (head != ring->tail) {
-		struct io_event *evp = aio_ring_event(info, head, KM_USER1);
-		*ent = *evp;
-		head = (head + 1) % info->nr;
-		smp_mb(); /* finish reading the event before updatng the head */
-		ring->head = head;
-		ret = 1;
+	do {
+		/* forget an event copied by an attempt whose cmpxchg lost */
+		ret = 0;
+		head = ring->head;
+		if (head == ring->tail)
+			break;
+		/* wrap like the old code did so a bad ring->head cannot index past ring_pages[] */
+		evp = aio_ring_event(info, head % info->nr, KM_USER1);
+		if (evp->obj != -1) {	/* -1 marks a cancelled epoll event */
+			*ent = *evp;
+			ret = 1;
+			smp_mb(); /* finish reading the event before updating the head */
+		}
 		put_aio_ring_event(evp, KM_USER1);
-	}
-	spin_unlock(&info->ring_lock);
+	} while (head != cmpxchg(&ring->head, head, (head + 1) % info->nr)
+		 || ret == 0);
 
-out:
 	kunmap_atomic(ring, KM_USER0);
@@ -1302,6 +1302,67 @@ out:
 	return ret;
 }
 
+/* sys_io_modify:
+ *	Modify the aio_context specified by ctx.  The only command
+ *	implemented is 1 (IO_SET_EPOLLFD): arg names an epoll file
+ *	descriptor whose events are then delivered through this context's
+ *	event ring, or is -1 to detach the epoll file currently attached.
+ *	Returns -EINVAL if ctx is not a valid context or arg is not an
+ *	epoll descriptor, and -ENOSYS for any other command.
+ */
+asmlinkage long sys_io_modify(aio_context_t ctx, unsigned int command, int arg) {
+	long ret = 0;
+	struct file *file = NULL;
+	struct eventpoll *ep = NULL;
+	struct kioctx *ioctx = lookup_ioctx(ctx);
+
+	if (!ioctx) {
+		ret = -EINVAL; /* not a valid aio context */
+		goto out;
+	}
+
+	switch (command) {
+	case 1: /* IO_SET_EPOLLFD */
+		if (arg != -1) {
+			/* try to get the epoll descriptor */
+			file = fget(arg); /* reference is still held after we
+					     return */
+			if (file == NULL) {
+				ret = -EINVAL; /* cannot find the file */
+				goto out;
+			}
+			if (is_file_epoll(file)) {
+				ep = (struct eventpoll *)file->private_data;
+				BUG_ON(!ep);
+			} else {
+				fput(file);
+				ret = -EINVAL; /* not epoll file */
+				goto out;
+			}
+		}
+		if (ioctx->epoll_file) {
+			/* remove reference to the old epoll file */
+			struct eventpoll *oldep = (struct eventpoll *)ioctx->epoll_file->private_data;
+			BUG_ON(!oldep);
+			eventpoll_set_ioctx(oldep, NULL);
+			fput(ioctx->epoll_file);
+		}
+		ioctx->epoll_file = file;
+		if (ep)
+			eventpoll_set_ioctx(ep, ioctx);
+		break;
+
+	default:
+		ret = -ENOSYS;
+	}
+
+ out:
+	if (ioctx)
+		put_ioctx(ioctx);
+	return ret;
+}
+
+
 /* sys_io_destroy:
  *	Destroy the aio_context specified.  May cancel any outstanding 
  *	AIOs and block on completion.  Will fail with -ENOSYS if not
@@ -1311,6 +1372,13 @@ out:
 asmlinkage long sys_io_destroy(aio_context_t ctx)
 {
 	struct kioctx *ioctx = lookup_ioctx(ctx);
 	if (likely(NULL != ioctx)) {
+		/* detach the epoll file only once the lookup has succeeded */
+		if (ioctx->epoll_file) {
+			struct eventpoll *ep = (struct eventpoll *)ioctx->epoll_file->private_data;
+			eventpoll_set_ioctx(ep, NULL);
+			fput(ioctx->epoll_file);
+			ioctx->epoll_file = NULL;
+		}
 		io_destroy(ioctx);
 		return 0;
--- .pc/epio.patch/fs/eventpoll.c	2004-10-14 12:58:37 -07:00
+++ fs/eventpoll.c	2004-10-14 12:58:41 -07:00
@@ -41,6 +41,9 @@
 #include
 #include
+#include
+#include
+
 
 
 /*
  * LOCKING:
@@ -207,6 +210,9 @@ struct eventpoll {
 
 	/* RB-Tree root used to store monitored fd structs */
 	struct rb_root rbr;
+
+	/* for using aio event queue */
+	struct kioctx *ioctx;
 };
 
 /* Wait structure used by the poll hooks */
@@ -270,6 +276,10 @@ struct epitem {
 	 * to pin items empty events set.
 	 */
 	unsigned int revents;
+
+	/* Current event position in aio event queue */
+	int current_event_pos;
+
 };
 
 /* Wrapper struct used by poll queueing */
@@ -763,6 +773,7 @@ static int ep_file_init(struct file *fil
 	init_waitqueue_head(&ep->poll_wait);
 	INIT_LIST_HEAD(&ep->rdllist);
 	ep->rbr = RB_ROOT;
+	ep->ioctx = NULL;
 
 	file->private_data = ep;
 
@@ -875,6 +886,158 @@ static void ep_release_epitem(struct epi
 	EPI_MEM_FREE(epi);
 }
 
+#define AIO_EVENTS_PER_PAGE	(PAGE_SIZE / sizeof(struct io_event))
+#define AIO_EVENTS_FIRST_PAGE	((PAGE_SIZE - sizeof(struct aio_ring)) / sizeof(struct io_event))
+#define AIO_EVENTS_OFFSET	(AIO_EVENTS_PER_PAGE - AIO_EVENTS_FIRST_PAGE)
+
+#define aio_ring_event(info, nr)	({				\
+	unsigned pos = (nr) + AIO_EVENTS_OFFSET;			\
+	struct io_event *__event;					\
+	__event = kmap_atomic(						\
+			(info)->ring_pages[pos / AIO_EVENTS_PER_PAGE], KM_IRQ0); \
+	__event += pos % AIO_EVENTS_PER_PAGE;				\
+	__event;							\
+	})
+
+#define put_aio_ring_event(event) do {					\
+	struct io_event *__event = (event);				\
+	(void)__event;							\
+	kunmap_atomic((void *)((unsigned long)__event & PAGE_MASK), KM_IRQ0); \
+	} while(0)
+
+/*
+  Find the event with index NR for file descriptor FD in the event
+  ring.  If this function returns a non-NULL object, you must release it
+  by calling put_aio_ring_event(event);
+*/
+static inline struct io_event *io_find_queued_event(struct aio_ring_info *info,
+						    const struct aio_ring *ring,
+						    int nr, int fd)
+{
+	/* ring->head/tail/nr alone must not let nr index past the ring */
+	if ((unsigned)nr >= info->nr)
+		return NULL;
+	if ((ring->head <= nr && nr < ring->tail)
+	    || (ring->tail < ring->head
+		&& (nr < ring->tail
+		    || (ring->head <= nr && nr < ring->nr)))) {
+		struct io_event *event = aio_ring_event(info, nr);
+		if (event->obj == -2 && event->res2 == fd) {
+			return event;
+		}
+		put_aio_ring_event(event);
+	}
+	return NULL;
+}
+
+/*
+  Deliver an epoll event through the shared aio event queue.
+  Revents is the return value from f_op->poll().
+*/
+static void io_deliver_epoll_event(struct epitem *epi, unsigned revents) {
+	struct aio_ring_info *info;
+	struct aio_ring *ring;
+	struct io_event *event;
+	unsigned long flags;
+	unsigned oldhead, newhead;
+	struct kioctx *ctx = epi->ep->ioctx;
+	int pos = epi->current_event_pos;
+
+	info = &ctx->ring_info;
+
+ retry:
+	/* make sure no one else inserts new events */
+	spin_lock_irqsave(&ctx->ctx_lock, flags);
+
+	if (info->ring_pages == NULL) {
+		spin_unlock_irqrestore(&ctx->ctx_lock, flags);
+		return;
+	}
+
+	ring = kmap_atomic(info->ring_pages[0], KM_IRQ1);
+
+	oldhead = ring->head;
+	event = io_find_queued_event(info, ring, pos, epi->ffd.fd);
+
+	if (event) {
+		/* We have an update for the event before it gets to the user */
+		event->res = revents;
+		smp_mb();	/* write new revents before checking
+				   head */
+		/* Check whether the user has retrieved the event yet. In that
+		   case, we need to restart because the user could have got the
+		   old events. This is done by checking whether pos is between
+		   oldhead and ring->head. It is safe to read ring->head directly
+		   because updates to ring->head are done with locked cmpxchg().
+		   So we will always get the latest value.
+		*/
+		newhead = ring->head; /* get a stable value of ring->head */
+		if ((oldhead <= pos && pos < newhead) ||
+		    (oldhead > newhead && (pos < newhead || pos >= oldhead))) {
+			/* user has retrieved the event already */
+			put_aio_ring_event(event);
+			kunmap_atomic(ring, KM_IRQ1);
+			spin_unlock_irqrestore(&ctx->ctx_lock, flags);
+			goto retry;
+		}
+	} else {
+		/* We insert a new event into the queue */
+		unsigned long tail = info->tail;
+		event = aio_ring_event(info, tail);
+		epi->current_event_pos = tail;
+		tail = (tail + 1) % info->nr;
+		event->obj = -2; // IOCB_POLL
+		event->data = epi->event.data;
+		event->res = revents;
+		event->res2 = epi->ffd.fd;
+		/* after flagging the request as done, we must never even look at
+		 * it again
+		 */
+		smp_wmb();	/* make event visible before updating tail */
+
+		info->tail = tail;
+		ring->tail = tail;
+	}
+
+	put_aio_ring_event(event);
+	kunmap_atomic(ring, KM_IRQ1);
+	spin_unlock_irqrestore(&ctx->ctx_lock, flags);
+
+	if (waitqueue_active(&ctx->wait))
+		wake_up(&ctx->wait);
+}
+
+/*
+  Called when EPOLL_CTL_DEL is called for an epoll fd using aio event queue.
+*/
+static void io_cancel_epoll_event(struct epitem *epi)
+{
+	struct kioctx *ctx = epi->ep->ioctx;
+	struct aio_ring_info *info = &ctx->ring_info;
+	struct aio_ring *ring;
+	struct io_event *event;
+	unsigned long flags;
+
+	/*
+	 * NOTE(review): the patched aio_read_evt() no longer takes ring_lock
+	 * and io_deliver_epoll_event() uses ctx_lock, so confirm that this
+	 * lock still excludes the paths it is meant to.
+	 */
+	spin_lock_irqsave(&info->ring_lock, flags);
+
+	if (info->ring_pages != NULL) {
+		ring = kmap_atomic(info->ring_pages[0], KM_IRQ1);
+
+		event = io_find_queued_event(info, ring, epi->current_event_pos, epi->ffd.fd);
+		if (event) {
+			event->obj = -1; /* canceled */
+			put_aio_ring_event(event);
+		}
+		kunmap_atomic(ring, KM_IRQ1);
+	}
+	spin_unlock_irqrestore(&info->ring_lock, flags);
+}
+
 
 /*
  * This is the callback that is used to add our wait queue to the
@@ -943,6 +1106,7 @@ static int ep_insert(struct eventpoll *e
 	epi->event = *event;
 	atomic_set(&epi->usecnt, 1);
 	epi->nwait = 0;
+	epi->current_event_pos = INT_MAX;
 
 	/* Initialize the poll table using the queue callback */
 	epq.epi = epi;
@@ -975,7 +1139,10 @@ static int ep_insert(struct eventpoll *e
 	ep_rbtree_insert(ep, epi);
 
 	/* If the file is already "ready" we drop it inside the ready list */
-	if ((revents & event->events) && !EP_IS_LINKED(&epi->rdllink)) {
+	if ((revents & event->events)) {
+		if (ep->ioctx) {
+			io_deliver_epoll_event(epi, revents);
+		} else if (!EP_IS_LINKED(&epi->rdllink)) {
 		list_add_tail(&epi->rdllink, &ep->rdllist);
 
 		/* Notify waiting tasks that events are available */
@@ -983,6 +1150,7 @@ static int ep_insert(struct eventpoll *e
 			wake_up(&ep->wq);
 		if (waitqueue_active(&ep->poll_wait))
 			pwake++;
+		}
 	}
 
 	write_unlock_irqrestore(&ep->lock, flags);
@@ -1054,7 +1222,9 @@ static int ep_modify(struct eventpoll *e
 		 * registered inside the ready list, unlink it.
 		 */
 		if (revents & event->events) {
-			if (!EP_IS_LINKED(&epi->rdllink)) {
+			if (ep->ioctx) {
+				io_deliver_epoll_event(epi, revents);
+			} else if (!EP_IS_LINKED(&epi->rdllink)) {
 				list_add_tail(&epi->rdllink, &ep->rdllist);
 
 				/* Notify waiting tasks that events are available */
@@ -1136,7 +1306,10 @@ static int ep_unlink(struct eventpoll *e
 	 * If the item we are going to remove is inside the ready file descriptors
 	 * we want to remove it from this list to avoid stale events.
 	 */
+	/* the aio ring and rdllist are independent: clean up both */
+	if (ep->ioctx)
+		io_cancel_epoll_event(epi);
 	if (EP_IS_LINKED(&epi->rdllink))
 		EP_LIST_DEL(&epi->rdllink);
 
 	error = 0;
@@ -1224,11 +1397,19 @@ static int ep_poll_callback(wait_queue_t
 	if (!(epi->event.events & ~EP_PRIVATE_BITS))
 		goto is_disabled;
 
-	/* If this file is already in the ready list we exit soon */
-	if (EP_IS_LINKED(&epi->rdllink))
-		goto is_linked;
-
-	list_add_tail(&epi->rdllink, &ep->rdllist);
+	if (!ep->ioctx) {
+		/* If this file is already in the ready list we exit soon */
+		if (EP_IS_LINKED(&epi->rdllink))
+			goto is_linked;
+		list_add_tail(&epi->rdllink, &ep->rdllist);
+	} else {
+		/* we deliver the event through the aio event queue */
+		unsigned revents = epi->ffd.file->f_op->poll(epi->ffd.file, NULL);
+		if ((revents & epi->event.events))
+			io_deliver_epoll_event(epi, revents);
+		write_unlock_irqrestore(&ep->lock, flags);
+		return 0;
+	}
 
 is_linked:
 	/*
@@ -1578,6 +1759,15 @@ eventpollfs_get_sb(struct file_system_ty
 	return get_sb_pseudo(fs_type, "eventpoll:", NULL, EVENTPOLLFS_MAGIC);
 }
 
+/* Record ioctx (or NULL) as the aio context that receives epoll's events. */
+void eventpoll_set_ioctx(struct eventpoll *epoll, struct kioctx *ioctx) {
+	epoll->ioctx = ioctx;
+}
+
+/* Non-static wrapper around IS_FILE_EPOLL() so fs/aio.c can test a file. */
+int is_file_epoll(struct file *file) {
+	return IS_FILE_EPOLL(file);
+}
 
 static int __init eventpoll_init(void)
 {
--- .pc/epio.patch/include/asm-i386/unistd.h	2004-10-14 12:58:37 -07:00
+++ include/asm-i386/unistd.h	2004-10-14 12:58:41 -07:00
@@ -299,8 +299,9 @@
 #define __NR_add_key		291
 #define __NR_request_key	292
 #define __NR_keyctl		293
+#define __NR_io_modify		294
 
-#define NR_syscalls 294
+#define NR_syscalls 295
 
 /*
  * user-visible error numbers are in the range -1 - -128: see
--- .pc/epio.patch/include/linux/aio.h	2004-10-14 12:58:40 -07:00
+++ include/linux/aio.h	2004-10-14 12:58:41 -07:00
@@ -163,6 +163,9 @@ struct kioctx {
 	struct aio_ring_info	ring_info;
 
 	struct work_struct	wq;
+
+	/* epoll-over-aio extension */
+	struct file		*epoll_file;
 };
 
 /* prototypes */
@@ -202,6 +205,7 @@ do {									\
 #define is_retried_kiocb(iocb) ((iocb)->ki_retried > 1)
 
 #include
+#include
 
 static inline struct kiocb *list_kiocb(struct list_head *h)
 {
--- .pc/epio.patch/include/linux/eventpoll.h	2004-10-14 12:58:37 -07:00
+++ include/linux/eventpoll.h	2004-10-14 12:58:41 -07:00
@@ -15,6 +15,7 @@
 #define _LINUX_EVENTPOLL_H
 
 #include
+#include
 
 
 /* Valid opcodes to issue to sys_epoll_ctl() */
@@ -85,6 +86,10 @@ static inline void eventpoll_release(str
 	eventpoll_release_file(file);
 }
 
+struct eventpoll;
+struct kioctx;
+extern void eventpoll_set_ioctx(struct eventpoll *epoll, struct kioctx *ioctx);
+extern int is_file_epoll(struct file *);
 
 #else
 
--- /dev/null	2004-02-23 13:02:56 -08:00
+++ linstall2.sh	2004-10-14 12:58:41 -07:00
@@ -0,0 +1,4 @@
+cp arch/i386/boot/bzImage /boot/vmlinuz-2.6.8.1-aiov
+/sbin/mkinitrd -f /boot/initrd-2.6.8.1-aiov.img 2.6.8.1-aiov
+/sbin/grub-install /dev/hda
+
--- .pc/epio.patch/mm/mmap.c	2004-10-14 12:58:37 -07:00
+++ mm/mmap.c	2004-10-14 12:58:41 -07:00
@@ -1094,6 +1094,25 @@ full_search:
 				start_addr = addr = TASK_UNMAPPED_BASE;
 				goto full_search;
 			}
+			printk("MMAP: size=%lu cache=%lx, failure\n",
+			       len, mm->free_area_cache);
+			{
+				start_addr = addr = mm->free_area_cache;
+			restartxx:
+				for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
+					if (TASK_SIZE - len < addr) {
+						if (start_addr != TASK_UNMAPPED_BASE) {
+							start_addr = addr = TASK_UNMAPPED_BASE;
+							goto restartxx;
+						} else
+							break;
+					} else {
+						printk("MMAPLL %lx\n", addr);
+						addr = vma->vm_end;
+					}
+				}
+			}
+			printk("MMAP END\n");
 			return -ENOMEM;
 		}
 		if (!vma)