Linux 3.8 Writeback机制源码分析
writeback机制源码分析writeback相关数据结构与writeback相关的数据结构主要有:1,backing_dev_info,该数据结构描述了backing_dev的所有信息,通常块设备的request queue中会包含backing_dev对象。2,bdi_writeback,该数据结构封装了writeback的内核线程以及需要操作的inode
·
writeback相关数据结构
与writeback相关的数据结构主要有:- backing_dev_info,该数据结构描述了backing_dev的所有信息,通常块设备的request queue中会包含backing_dev对象。
- bdi_writeback,该数据结构封装了writeback的内核线程以及需要操作的inode队列。
- wb_writeback_work,该数据结构封装了writeback的工作任务。
它们的结构体分别如下:
struct backing_dev_info {
struct list_head bdi_list;
unsigned long ra_pages; /* max readahead in PAGE_CACHE_SIZE units */
unsigned long state; /* Always use atomic bitops on this */
unsigned int capabilities; /* Device capabilities */
congested_fn *congested_fn; /* Function pointer if device is md/dm */
void *congested_data; /* Pointer to aux data for congested func */
char *name;
struct percpu_counter bdi_stat[NR_BDI_STAT_ITEMS];
unsigned long bw_time_stamp; /* last time write bw is updated */
unsigned long dirtied_stamp;
unsigned long written_stamp; /* pages written at bw_time_stamp */
unsigned long write_bandwidth; /* the estimated write bandwidth */
unsigned long avg_write_bandwidth; /* further smoothed write bw */
/*
* The base dirty throttle rate, re-calculated on every 200ms.
* All the bdi tasks' dirty rate will be curbed under it.
* @dirty_ratelimit tracks the estimated @balanced_dirty_ratelimit
* in small steps and is much more smooth/stable than the latter.
*/
unsigned long dirty_ratelimit;
unsigned long balanced_dirty_ratelimit;
struct fprop_local_percpu completions;
int dirty_exceeded;
unsigned int min_ratio;
unsigned int max_ratio, max_prop_frac;
struct bdi_writeback wb; /* default writeback info for this bdi */
spinlock_t wb_lock; /* protects work_list */
struct list_head work_list;
struct device *dev;
struct timer_list laptop_mode_wb_timer;
#ifdef CONFIG_DEBUG_FS
struct dentry *debug_dir;
struct dentry *debug_stats;
#endif
};
struct bdi_writeback {
struct backing_dev_info *bdi; /* our parent bdi */
unsigned int nr;
unsigned long last_old_flush; /* last old data flush */
unsigned long last_active; /* last time bdi thread was active */
struct task_struct *task; /* writeback thread */
struct timer_list wakeup_timer; /* used for delayed bdi thread wakeup */
struct list_head b_dirty; /* dirty inodes */
struct list_head b_io; /* parked for writeback */
struct list_head b_more_io; /* parked for more writeback */
spinlock_t list_lock; /* protects the b_* lists */
};
/*
* Passed into wb_writeback(), essentially a subset of writeback_control
*/
struct wb_writeback_work {
long nr_pages;
struct super_block *sb;
unsigned long *older_than_this;
enum writeback_sync_modes sync_mode;
unsigned int tagged_writepages:1;
unsigned int for_kupdate:1;
unsigned int range_cyclic:1;
unsigned int for_background:1;
enum wb_reason reason; /* why was writeback initiated? */
struct list_head list; /* pending work list */
struct completion *done; /* set if the caller waits */
};
- BDI数据结构是对块设备的一个描述。bdi对象在块设备添加的时候需要注册到系统的bdi队列中。对于ext3而言,在mount的时候需要将底层块设备的bdi对象联系到ext3 root_inode中。在bdi数据结构中有一条work_list,该队列维护了writeback内核线程需要处理的任务。如果该队列上没有work可以处理,那么writeback内核线程将会睡眠等待。
- writeback对象封装了内核线程task以及需要处理的inode队列。当page cache/buffer cache需要刷新radix tree上的inode时,可以将该inode挂载到writeback对象的b_dirty队列上,然后唤醒writeback线程。在处理过程中,inode会被移到b_io队列上进行处理。多条链表的方式可以降低多线程之间的资源共享。
- wb_writeback_work数据结构是对writeback任务的封装,不同的任务可以采用不同的刷新策略。writeback线程的处理对象就是writeback_work。如果writeback_work队列为空,那么内核线程就可以睡眠了。
writeback主要函数分析
writeback机制的主要函数包括如下两个方面:
- 管理bdi对象并且fork相应的writeback内核线程处理cache数据的刷新工作。
- writeback内核线程处理函数,实现dirty page的刷新操作
writeback线程管理
Linux中有一个内核守护线程,该线程用来管理系统bdi队列,并且负责为block device创建writeback thread。当bdi中有dirty page并且还没有为bdi分配内核线程的时候,bdi_forker_thread程序会为其分配线程资源;当一个writeback线程长时间(默认为5min)处于空闲状态时,bdi_forker_thread程序会释放该线程资源。static int bdi_forker_thread(void *ptr)
{
struct bdi_writeback *me = ptr;
current->flags |= PF_SWAPWRITE;
set_freezable();
/*
* Our parent may run at a different priority, just set us to normal
*/
set_user_nice(current, 0);
for (;;) {
struct task_struct *task = NULL;
struct backing_dev_info *bdi;
enum {
NO_ACTION, /* Nothing to do */
FORK_THREAD, /* Fork bdi thread */
KILL_THREAD, /* Kill inactive bdi thread */
} action = NO_ACTION;
/*
* Temporary measure, we want to make sure we don't see
* dirty data on the default backing_dev_info
*/
if (wb_has_dirty_io(me) || !list_empty(&me->bdi->work_list)) {
del_timer(&me->wakeup_timer);
wb_do_writeback(me, 0);
}
spin_lock_bh(&bdi_lock);
/*
* In the following loop we are going to check whether we have
* some work to do without any synchronization with tasks
* waking us up to do work for them. Set the task state here
* so that we don't miss wakeups after verifying conditions.
*/
set_current_state(TASK_INTERRUPTIBLE);
list_for_each_entry(bdi, &bdi_list, bdi_list) {
bool have_dirty_io;
if (!bdi_cap_writeback_dirty(bdi) ||
bdi_cap_flush_forker(bdi))
continue;
WARN(!test_bit(BDI_registered, &bdi->state),
"bdi %p/%s is not registered!\n", bdi, bdi->name);
have_dirty_io = !list_empty(&bdi->work_list) ||
wb_has_dirty_io(&bdi->wb);
/*
* If the bdi has work to do, but the thread does not
* exist - create it.
*/
if (!bdi->wb.task && have_dirty_io) {
/*
* Set the pending bit - if someone will try to
* unregister this bdi - it'll wait on this bit.
*/
set_bit(BDI_pending, &bdi->state);
action = FORK_THREAD;
break;
}
spin_lock(&bdi->wb_lock);
/*
* If there is no work to do and the bdi thread was
* inactive long enough - kill it. The wb_lock is taken
* to make sure no-one adds more work to this bdi and
* wakes the bdi thread up.
*/
if (bdi->wb.task && !have_dirty_io &&
time_after(jiffies, bdi->wb.last_active +
bdi_longest_inactive())) {
task = bdi->wb.task;
bdi->wb.task = NULL;
spin_unlock(&bdi->wb_lock);
set_bit(BDI_pending, &bdi->state);
action = KILL_THREAD;
break;
}
spin_unlock(&bdi->wb_lock);
}
spin_unlock_bh(&bdi_lock);
/* Keep working if default bdi still has things to do */
if (!list_empty(&me->bdi->work_list))
__set_current_state(TASK_RUNNING);
switch (action) {
case FORK_THREAD:
__set_current_state(TASK_RUNNING);
task = kthread_create(bdi_writeback_thread, &bdi->wb,
"flush-%s", dev_name(bdi->dev));
if (IS_ERR(task)) {
/*
* If thread creation fails, force writeout of
* the bdi from the thread. Hopefully 1024 is
* large enough for efficient IO.
*/
writeback_inodes_wb(&bdi->wb, 1024,
WB_REASON_FORKER_THREAD);
} else {
/*
* The spinlock makes sure we do not lose
* wake-ups when racing with 'bdi_queue_work()'.
* And as soon as the bdi thread is visible, we
* can start it.
*/
spin_lock_bh(&bdi->wb_lock);
bdi->wb.task = task;
spin_unlock_bh(&bdi->wb_lock);
wake_up_process(task);
}
bdi_clear_pending(bdi);
break;
case KILL_THREAD:
__set_current_state(TASK_RUNNING);
kthread_stop(task);
bdi_clear_pending(bdi);
break;
case NO_ACTION:
if (!wb_has_dirty_io(me) || !dirty_writeback_interval)
/*
* There are no dirty data. The only thing we
* should now care about is checking for
* inactive bdi threads and killing them. Thus,
* let's sleep for longer time, save energy and
* be friendly for battery-driven devices.
*/
schedule_timeout(bdi_longest_inactive());
else
schedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10));
try_to_freeze();
break;
}
}
return 0;
}
Writeback工作线程
writeback线程是bdi_forker_thread 创建的,该线程的任务就是处理等待的数据回刷任务。线程处理函数为bdi_writeback_thread,该函数的实现如下:
/*
* Handle writeback of dirty data for the device backed by this bdi. Also
* wakes up periodically and does kupdated style flushing.
*/
int bdi_writeback_thread(void *data)
{
struct bdi_writeback *wb = data;
struct backing_dev_info *bdi = wb->bdi;
long pages_written;
current->flags |= PF_SWAPWRITE;
set_freezable();
wb->last_active = jiffies;
/*
* Our parent may run at a different priority, just set us to normal
*/
set_user_nice(current, 0);
trace_writeback_thread_start(bdi);
while (!kthread_freezable_should_stop(NULL)) {
/*
* Remove own delayed wake-up timer, since we are already awake
* and we'll take care of the periodic write-back.
*/
del_timer(&wb->wakeup_timer);
pages_written = wb_do_writeback(wb, 0);
trace_writeback_pages_written(pages_written);
if (pages_written)
wb->last_active = jiffies;
set_current_state(TASK_INTERRUPTIBLE);
if (!list_empty(&bdi->work_list) || kthread_should_stop()) {
__set_current_state(TASK_RUNNING);
continue;
}
if (wb_has_dirty_io(wb) && dirty_writeback_interval)
schedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10));
else {
/*
* We have nothing to do, so can go sleep without any
* timeout and save power. When a work is queued or
* something is made dirty - we will be woken up.
*/
schedule();
}
}
/* Flush any work that raced with us exiting */
if (!list_empty(&bdi->work_list))
wb_do_writeback(wb, 1);
trace_writeback_thread_stop(bdi);
return 0;
}
bdi_writeback_thread函数主要是调用wb_do_writeback()函数。
/*
* Retrieve work items and do the writeback they describe
*/
long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
{
struct backing_dev_info *bdi = wb->bdi;
struct wb_writeback_work *work;
long wrote = 0;
set_bit(BDI_writeback_running, &wb->bdi->state);
while ((work = get_next_work_item(bdi)) != NULL) {
/*
* Override sync mode, in case we must wait for completion
* because this thread is exiting now.
*/
if (force_wait)
work->sync_mode = WB_SYNC_ALL;
trace_writeback_exec(bdi, work);
wrote += wb_writeback(wb, work);
/*
* Notify the caller of completion if this is a synchronous
* work item, otherwise just free it.
*/
if (work->done)
complete(work->done);
else
kfree(work);
}
/*
* Check for periodic writeback, kupdated() style
*/
wrote += wb_check_old_data_flush(wb);
wrote += wb_check_background_flush(wb);
clear_bit(BDI_writeback_running, &wb->bdi->state);
return wrote;
}
static long wb_check_old_data_flush(struct bdi_writeback *wb)
{
unsigned long expired;
long nr_pages;
/*
* When set to zero, disable periodic writeback
*/
if (!dirty_writeback_interval)
return 0;
expired = wb->last_old_flush +
msecs_to_jiffies(dirty_writeback_interval * 10);
if (time_before(jiffies, expired))
return 0;
wb->last_old_flush = jiffies;
nr_pages = get_nr_dirty_pages();
if (nr_pages) {
struct wb_writeback_work work = {
.nr_pages = nr_pages,
.sync_mode = WB_SYNC_NONE,
.for_kupdate = 1,
.range_cyclic = 1,
.reason = WB_REASON_PERIODIC,
};
return wb_writeback(wb, &work);
}
return 0;
}
static long wb_check_background_flush(struct bdi_writeback *wb)
{
if (over_bground_thresh(wb->bdi)) {
struct wb_writeback_work work = {
.nr_pages = LONG_MAX,
.sync_mode = WB_SYNC_NONE,
.for_background = 1,
.range_cyclic = 1,
.reason = WB_REASON_BACKGROUND,
};
return wb_writeback(wb, &work);
}
return 0;
}
wb_check_background_flush和wb_check_old_data_flush的函数只是设置wb_writeback_work的各项参数,然后执行wb_writeback函数,该函数是Writeback机制中真正执行写回的函数。Writeback机制中的写回磁盘操作都是通过wb_writeback函数实现的,wb_writeback调用与文件系统有关的write函数,执行协会磁盘的操作。
/*
* Explicit flushing or periodic writeback of "old" data.
*
* Define "old": the first time one of an inode's pages is dirtied, we mark the
* dirtying-time in the inode's address_space. So this periodic writeback code
* just walks the superblock inode list, writing back any inodes which are
* older than a specific point in time.
*
* Try to run once per dirty_writeback_interval. But if a writeback event
* takes longer than a dirty_writeback_interval interval, then leave a
* one-second gap.
*
* older_than_this takes precedence over nr_to_write. So we'll only write back
* all dirty pages if they are all attached to "old" mappings.
*/
static long wb_writeback(struct bdi_writeback *wb,
struct wb_writeback_work *work)
{
unsigned long wb_start = jiffies;
long nr_pages = work->nr_pages;
unsigned long oldest_jif;
struct inode *inode;
long progress;
oldest_jif = jiffies;
work->older_than_this = &oldest_jif;
spin_lock(&wb->list_lock);
for (;;) {
/*
* Stop writeback when nr_pages has been consumed
*/
if (work->nr_pages <= 0)
break;
/*
* Background writeout and kupdate-style writeback may
* run forever. Stop them if there is other work to do
* so that e.g. sync can proceed. They'll be restarted
* after the other works are all done.
*/
if ((work->for_background || work->for_kupdate) &&
!list_empty(&wb->bdi->work_list))
break;
/*
* For background writeout, stop when we are below the
* background dirty threshold
*/
if (work->for_background && !over_bground_thresh(wb->bdi))
break;
/*
* Kupdate and background works are special and we want to
* include all inodes that need writing. Livelock avoidance is
* handled by these works yielding to any other work so we are
* safe.
*/
if (work->for_kupdate) {
oldest_jif = jiffies -
msecs_to_jiffies(dirty_expire_interval * 10);
} else if (work->for_background)
oldest_jif = jiffies;
trace_writeback_start(wb->bdi, work);
if (list_empty(&wb->b_io))
queue_io(wb, work);
if (work->sb)
progress = writeback_sb_inodes(work->sb, wb, work);
else
progress = __writeback_inodes_wb(wb, work);
trace_writeback_written(wb->bdi, work);
wb_update_bandwidth(wb, wb_start);
/*
* Did we write something? Try for more
*
* Dirty inodes are moved to b_io for writeback in batches.
* The completion of the current batch does not necessarily
* mean the overall work is done. So we keep looping as long
* as made some progress on cleaning pages or inodes.
*/
if (progress)
continue;
/*
* No more inodes for IO, bail
*/
if (list_empty(&wb->b_more_io))
break;
/*
* Nothing written. Wait for some inode to
* become available for writeback. Otherwise
* we'll just busyloop.
*/
if (!list_empty(&wb->b_more_io)) {
trace_writeback_wait(wb->bdi, work);
inode = wb_inode(wb->b_more_io.prev);
spin_lock(&inode->i_lock);
spin_unlock(&wb->list_lock);
/* This function drops i_lock... */
inode_sleep_on_writeback(inode);
spin_lock(&wb->list_lock);
}
}
spin_unlock(&wb->list_lock);
return nr_pages - work->nr_pages;
}
总结
writeback机制是比较简单的,其核心是通过一个常驻内核线程为每个BDI对象分配writeback线程,实现对cache中dirty page的数据回刷。
参考:http://www.linuxidc.com/Linux/2013-01/77576.htm
更多推荐
已为社区贡献2条内容
所有评论(0)