/*
* fs/dax.c - Direct Access filesystem code
* Copyright (c) 2013-2014 Intel Corporation
* Author: Matthew Wilcox <matthew.r.wilcox@intel.com>
* Author: Ross Zwisler <ross.zwisler@linux.intel.com>
*
* This program is free software; you can redistribute it and/or modify it
* under the terms and conditions of the GNU General Public License,
* version 2, as published by the Free Software Foundation.
*
* This program is distributed in the hope it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
* more details.
*/
#include <linux/atomic.h>
#include <linux/blkdev.h>
#include <linux/buffer_head.h>
#include <linux/dax.h>
#include <linux/fs.h>
#include <linux/genhd.h>
#include <linux/highmem.h>
#include <linux/memcontrol.h>
#include <linux/mm.h>
#include <linux/mutex.h>
#include <linux/pagevec.h>
#include <linux/pmem.h>
#include <linux/sched.h>
#include <linux/uio.h>
#include <linux/vmstat.h>
#include <linux/pfn_t.h>
#include <linux/sizes.h>
/*
 * Encoding of DAX entries stored in the radix tree: the low RADIX_DAX_SHIFT
 * bits (RADIX_DAX_MASK) carry the entry type (RADIX_DAX_PTE or RADIX_DAX_PMD,
 * each tagged with RADIX_TREE_EXCEPTIONAL_ENTRY) and the bits above them
 * carry the sector number.
 *
 * Every macro argument is parenthesized so that compound expressions such as
 * "a | b" are evaluated as a whole before the cast, mask or shift is applied.
 */
#define RADIX_DAX_MASK	0xf
#define RADIX_DAX_SHIFT	4
#define RADIX_DAX_PTE	(0x4 | RADIX_TREE_EXCEPTIONAL_ENTRY)
#define RADIX_DAX_PMD	(0x8 | RADIX_TREE_EXCEPTIONAL_ENTRY)
/* Extract the type bits of a radix tree entry. */
#define RADIX_DAX_TYPE(entry) ((unsigned long)(entry) & RADIX_DAX_MASK)
/* Extract the sector number of a radix tree entry. */
#define RADIX_DAX_SECTOR(entry) ((unsigned long)(entry) >> RADIX_DAX_SHIFT)
/* Build a radix tree entry from a sector; a non-zero @pmd selects the PMD type. */
#define RADIX_DAX_ENTRY(sector, pmd) \
	((void *)(((unsigned long)(sector) << RADIX_DAX_SHIFT) | \
		  ((pmd) ? RADIX_DAX_PMD : RADIX_DAX_PTE)))
/*
 * dax_map_atomic - enter the device's request queue and resolve @dax
 * @bdev: block device to map
 * @dax: mapping request; ->addr is written by this function
 *
 * On success the non-negative result of bdev_direct_access() is returned and
 * the queue reference taken via blk_queue_enter() is kept; it is dropped by
 * dax_unmap_atomic(). On failure a negative errno is returned, ->addr holds
 * the matching error pointer and no queue reference is held.
 */
static long dax_map_atomic(struct block_device *bdev, struct blk_dax_ctl *dax)
{
	struct request_queue *queue = bdev->bd_queue;
	long avail;

	/* Pre-load an error pointer so a failed map is recognizable later. */
	dax->addr = (void __pmem *) ERR_PTR(-EIO);

	if (blk_queue_enter(queue, true))
		return -EIO;

	avail = bdev_direct_access(bdev, dax);
	if (avail >= 0)
		return avail;

	/* Lookup failed: record the reason and give the queue reference back. */
	dax->addr = (void __pmem *) ERR_PTR(avail);
	blk_queue_exit(queue);
	return avail;
}
/*
 * dax_unmap_atomic - undo a dax_map_atomic()
 * @bdev: block device that was mapped
 * @dax: mapping request previously passed to dax_map_atomic()
 *
 * An error pointer in ->addr means dax_map_atomic() failed and holds no
 * queue reference, so there is nothing to release in that case.
 */
static void dax_unmap_atomic(struct block_device *bdev,
		const struct blk_dax_ctl *dax)
{
	if (!IS_ERR(dax->addr))
		blk_queue_exit(bdev->bd_queue);
}
/*
 * read_dax_sector - copy the page-sized region containing sector @n
 * @bdev: block device to read from
 * @n: sector of interest; rounded down to a PAGE_SIZE boundary
 *
 * Allocates one page, maps the device with dax_map_atomic() and copies
 * PAGE_SIZE bytes out of it.
 *
 * Returns the newly allocated page on success (the caller is responsible
 * for releasing it), ERR_PTR(-ENOMEM) if the page cannot be allocated, or
 * the error from dax_map_atomic() wrapped in ERR_PTR().
 */
struct page *read_dax_sector(struct block_device *bdev, sector_t n)
{
	struct page *page = alloc_pages(GFP_KERNEL, 0);
	struct blk_dax_ctl dax = {
		.size = PAGE_SIZE,
		/*
		 * Round down to the first 512-byte sector of the page. The
		 * mask is computed as a signed int so that its complement
		 * sign-extends when widened to sector_t.
		 */
		.sector = n & ~((((int) PAGE_SIZE) / 512) - 1),
	};
	long rc;

	if (!page)
		return ERR_PTR(-ENOMEM);

	rc = dax_map_atomic(bdev, &dax);
	if (rc < 0) {
		/* Do not leak the page allocated above when mapping fails. */
		__free_page(page);
		return ERR_PTR(rc);
	}

	memcpy_from_pmem(page_address(page), dax.addr, PAGE_SIZE);
	dax_unmap_atomic(bdev, &dax);
	return page;
}
static bool buffer_written(struct buffer_head *bh)
{
return buffer_mapped(bh) && !buffer_unwritten(bh);
}
/*
* When ext4 encounters a hole, it returns without modifying the buffer_head
* which means that we can't trust b_size. To cope with this, we set b_state
* to 0 before calling get_block and, if any bit is set, we know we can trust
* b_size. Unfortunate, really, since ext4 knows precisely how long a hole is
* and would save us time calling get_block repeatedly.
*/
static bool buffer_size_valid(struct buffer_head *bh)
{
return bh->b_state != 0;
}
/*
 * Convert the block number held in @bh into a sector number by shifting it
 * left by (i_blkbits - 9); 9 is log2(512). This assumes i_blkbits is log2 of
 * the block size in bytes and is at least 9.
 */
static sector_t to_sector(const struct buffer_head *bh,
		const struct inode *inode)
{
	return bh->b_blocknr << (inode->i_blkbits - 9);
}
static ssize_t dax_io(struct inode *inode, struct iov_iter *iter,
loff_t start, loff_t end, get_block_t get_block,
struct buffer_head *bh)
{
loff_t pos = start, max = start, bh_max = start;
bool hole = false, need_wmb = false;
struct block_device *bdev = NULL;
int rw = iov_iter_rw(iter), rc;
long map_len = 0;
struct blk_dax_ctl dax = {
.addr = (void __pmem *) ERR_PTR(-EIO),
};
unsigned blkbits = inode->i_blkbits;
sector_t file_blks = (i_size_read(inode) + (1 << blkbits) - 1)
>> blkbits;
if (rw == READ)
end = min(end, i_size_read(inode));
while (pos