假设缓冲区是使用基于页的方案(page-based scheme)分配的。实现 mmap 的一种方法是使用 remap_pfn_range,但 LDD3 指出它不适用于常规内存(conventional memory)。看起来可以用 SetPageReserved 把这些页标记为保留页来绕过这个限制,这样它们就会被锁定在内存中。但是,内核内存不是本来就全部不可换出(即已经是保留的)吗?为什么还需要显式设置保留位?
这是否与从HIGH_MEM分配的页面有关?
在 mmap 方法中把一组内核页映射到用户空间,最简单的办法是使用缺页处理函数(fault handler)来映射这些页。基本上,你最终会得到类似下面的代码:
/*
 * mmap handler: maps nothing itself, it only installs my_vm_ops on the
 * VMA and reports success. Validation of the requested range, if any is
 * needed, belongs here.
 */
static int my_mmap(struct file *filp, struct vm_area_struct *vma)
{
	vma->vm_ops = &my_vm_ops;

	return 0;
}

/* File operations for the device; add whatever else the module needs. */
static const struct file_operations my_fops = {
	.owner  = THIS_MODULE,
	.open   = nonseekable_open,
	.llseek = no_llseek,
	.mmap   = my_mmap,
};
(其余的文件操作按你的模块的需要填写)。另外,在 my_mmap 中你需要做必要的范围检查等,以验证 mmap 的参数。
然后,vm_ops 大致如下:
static int my_fault(struct vm_area_struct *vma, struct vm_fault *vmf) { vmf->page = my_page_at_index(vmf->pgoff); get_page(vmf->page); return 0; } static const struct vm_operations_struct my_vm_ops = { .fault = my_fault }
其中,你只需要根据传给 fault 函数的 vma / vmf,确定应该把哪一页映射到用户空间。这取决于你的模块具体是如何工作的。例如,如果你这样分配了缓冲区:
my_buf = vmalloc_user(MY_BUF_SIZE);
那么要映射的页可以这样得到:
vmalloc_to_page(my_buf + (vmf->pgoff << PAGE_SHIFT));
不过你同样可以创建一个数组并为每个元素分配一页,或者使用 kmalloc,等等,怎么做都可以。
[刚注意到,my_fault 作为一个函数名还挺有意思的]
最小的可运行示例和用户级测试
内核模块 :
#include <linux/debugfs.h>
#include <linux/fs.h>
#include <linux/init.h>
#include <linux/kernel.h> /* min */
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/slab.h>
#include <linux/uaccess.h> /* copy_from_user, copy_to_user */

static const char *filename = "lkmc_mmap";

enum { BUFFER_SIZE = 4 };

/* Per-open state: one zeroed page shared by read/write and mmap. */
struct mmap_info {
	char *data;
};

/* After unmap. */
static void vm_close(struct vm_area_struct *vma)
{
	pr_info("vm_close\n");
}

/*
 * First page access.
 *
 * Hands the per-open data page to the VM with an extra reference. The page
 * offset of the fault is not consulted, so every page of a mapping resolves
 * to the same data page.
 *
 * Returns 0 on success, or VM_FAULT_SIGBUS if there is no data page.
 */
static int vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	struct mmap_info *info = vma->vm_private_data;
	struct page *page;

	pr_info("vm_fault\n");
	/* Reporting success without setting vmf->page would be a bug. */
	if (!info || !info->data)
		return VM_FAULT_SIGBUS;

	page = virt_to_page(info->data);
	get_page(page);
	vmf->page = page;
	return 0;
}

/*
 * After mmap; mmap() below calls this explicitly.
 * TODO: when can this run at a different time than mmap?
 */
static void vm_open(struct vm_area_struct *vma)
{
	pr_info("vm_open\n");
}

static const struct vm_operations_struct vm_ops = {
	.close = vm_close,
	.fault = vm_fault,
	.open = vm_open,
};

/* Install the VMA callbacks and pass the per-open state to vm_fault(). */
static int mmap(struct file *filp, struct vm_area_struct *vma)
{
	pr_info("mmap\n");
	vma->vm_ops = &vm_ops;
	vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP;
	vma->vm_private_data = filp->private_data;
	vm_open(vma);
	return 0;
}

/*
 * Allocate the per-open state and its data page, seeded with "asdf".
 *
 * Returns 0 on success or -ENOMEM if either allocation fails.
 */
static int open(struct inode *inode, struct file *filp)
{
	struct mmap_info *info;

	pr_info("open\n");
	info = kmalloc(sizeof(*info), GFP_KERNEL);
	if (!info)
		return -ENOMEM;
	pr_info("virt_to_phys = 0x%llx\n",
		(unsigned long long)virt_to_phys((void *)info));

	info->data = (char *)get_zeroed_page(GFP_KERNEL);
	if (!info->data) {
		/* Do not leak the descriptor when the page is unavailable. */
		kfree(info);
		return -ENOMEM;
	}
	memcpy(info->data, "asdf", BUFFER_SIZE);
	filp->private_data = info;
	return 0;
}

/*
 * Copy at most BUFFER_SIZE bytes from the start of the data page to
 * userland. The file offset is ignored.
 *
 * Returns the number of bytes copied, or -EFAULT on a bad user pointer.
 */
static ssize_t read(struct file *filp, char __user *buf, size_t len, loff_t *off)
{
	struct mmap_info *info = filp->private_data;
	size_t count = min(len, (size_t)BUFFER_SIZE);

	pr_info("read\n");
	if (copy_to_user(buf, info->data, count))
		return -EFAULT;
	return count;
}

/*
 * Copy at most BUFFER_SIZE bytes from userland to the start of the data
 * page. The file offset is ignored.
 *
 * Returns len (the whole request is reported as consumed, even when it was
 * truncated to BUFFER_SIZE), or -EFAULT on a bad user pointer.
 */
static ssize_t write(struct file *filp, const char __user *buf, size_t len, loff_t *off)
{
	struct mmap_info *info = filp->private_data;

	pr_info("write\n");
	if (copy_from_user(info->data, buf, min(len, (size_t)BUFFER_SIZE)))
		return -EFAULT;
	return len;
}

/* Free the data page and the per-open state allocated by open(). */
static int release(struct inode *inode, struct file *filp)
{
	struct mmap_info *info = filp->private_data;

	pr_info("release\n");
	free_page((unsigned long)info->data);
	kfree(info);
	filp->private_data = NULL;
	return 0;
}

static const struct file_operations fops = {
	.mmap = mmap,
	.open = open,
	.release = release,
	.read = read,
	.write = write,
};

/* Create the proc entry; fail module load if it cannot be created. */
static int myinit(void)
{
	if (!proc_create(filename, 0, NULL, &fops))
		return -ENOMEM;
	return 0;
}

static void myexit(void)
{
	remove_proc_entry(filename, NULL);
}

module_init(myinit);
module_exit(myexit);
MODULE_LICENSE("GPL");
Userland测试 :
#define _XOPEN_SOURCE 700 #include <assert.h> #include <fcntl.h> #include <stdio.h> #include <stdlib.h> #include <stdint.h> /* uintmax_t */ #include <string.h> #include <sys/mman.h> #include <unistd.h> /* sysconf */ #include "common.h" /* virt_to_phys_user */ enum { BUFFER_SIZE = 4 }; int main(int argc, char **argv) { int fd; long page_size; char *address1, *address2; char buf[BUFFER_SIZE]; uintptr_t paddr; if (argc < 2) { printf("Usage: %s <mmap_file>\n", argv[0]); return EXIT_FAILURE; } page_size = sysconf(_SC_PAGE_SIZE); printf("open pathname = %s\n", argv[1]); fd = open(argv[1], O_RDWR | O_SYNC); if (fd < 0) { perror("open"); assert(0); } printf("fd = %d\n", fd); /* mmap twice for double fun. */ puts("mmap 1"); address1 = mmap(NULL, page_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); if (address1 == MAP_FAILED) { perror("mmap"); assert(0); } puts("mmap 2"); address2 = mmap(NULL, page_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); if (address2 == MAP_FAILED) { perror("mmap"); return EXIT_FAILURE; } assert(address1 != address2); /* Read and modify memory. */ puts("access 1"); assert(!strcmp(address1, "asdf")); /* vm_fault */ puts("access 2"); assert(!strcmp(address2, "asdf")); /* vm_fault */ strcpy(address1, "qwer"); /* Also modified. So both virtual addresses point to the same physical address. */ assert(!strcmp(address2, "qwer")); /* Check that the physical addresses are the same. * They are, but TODO why virt_to_phys on kernel gives a different value? */ assert(!virt_to_phys_user(&paddr, getpid(), (uintptr_t)address1)); printf("paddr1 = 0x%jx\n", (uintmax_t)paddr); assert(!virt_to_phys_user(&paddr, getpid(), (uintptr_t)address2)); printf("paddr2 = 0x%jx\n", (uintmax_t)paddr); /* Check that modifications made from userland are also visible from the kernel. */ read(fd, buf, BUFFER_SIZE); assert(!memcmp(buf, "qwer", BUFFER_SIZE)); /* Modify the data from the kernel, and check that the change is visible from userland. 
*/ write(fd, "zxcv", 4); assert(!strcmp(address1, "zxcv")); assert(!strcmp(address2, "zxcv")); /* Cleanup. */ puts("munmap 1"); if (munmap(address1, page_size)) { perror("munmap"); assert(0); } puts("munmap 2"); if (munmap(address2, page_size)) { perror("munmap"); assert(0); } puts("close"); close(fd); return EXIT_SUCCESS; }
尽管这些页是由内核驱动程序保留的,但它们是通过用户空间访问的。因此,PTE(页表项)无法得知该 pfn 属于用户空间还是内核空间(即使这些页是由内核驱动程序分配的)。
这就是为什么要用 SetPageReserved 标记它们。