add idl4k kernel firmware version 1.13.0.105

This commit is contained in:
Jaroslav Kysela
2015-03-26 17:22:37 +01:00
parent 5194d2792e
commit e9070cdc77
31064 changed files with 12769984 additions and 0 deletions

33
kernel/fs/jbd2/Kconfig Normal file
View File

@@ -0,0 +1,33 @@
# JBD2: the generic journaling layer. The symbol is a tristate declared
# without a prompt string, and enabling it also selects CRC32.
config JBD2
tristate
select CRC32
help
This is a generic journaling layer for block devices that support
both 32-bit and 64-bit block numbers. It is currently used by
the ext4 and OCFS2 filesystems, but it could also be used to add
journal support to other file systems or block devices such
as RAID or LVM.
If you are using ext4 or OCFS2, you need to say Y here.
If you are not using ext4 or OCFS2 then you will
probably want to say N.
To compile this device as a module, choose M here. The module will be
called jbd2. If you are compiling ext4 or OCFS2 into the kernel,
you cannot compile this code as a module.
# JBD2_DEBUG: boolean option that is only offered when both JBD2 and
# DEBUG_FS are enabled; the debug level itself is set at run time
# through the debugfs file named in the help text.
config JBD2_DEBUG
bool "JBD2 (ext4) debugging support"
depends on JBD2 && DEBUG_FS
help
If you are using the ext4 journaled file system (or
potentially any other filesystem/device using JBD2), this option
allows you to enable debugging output while the system is running,
in order to help track down any problems you are having.
By default, the debugging output will be turned off.
If you select Y here, then you will be able to turn on debugging
with "echo N > /sys/kernel/debug/jbd2/jbd2-debug", where N is a
number between 1 and 5. The higher the number, the more debugging
output is generated. To turn debugging off again, do
"echo 0 > /sys/kernel/debug/jbd2/jbd2-debug".

7
kernel/fs/jbd2/Makefile Normal file
View File

@@ -0,0 +1,7 @@
#
# Makefile for the Linux jbd2 journaling routines.
#
# jbd2.o is built according to CONFIG_JBD2 and is linked from the
# objects listed in jbd2-objs.
#
obj-$(CONFIG_JBD2) += jbd2.o
jbd2-objs := transaction.o commit.o recovery.o checkpoint.o revoke.o journal.o

781
kernel/fs/jbd2/checkpoint.c Normal file
View File

@@ -0,0 +1,781 @@
/*
* linux/fs/jbd2/checkpoint.c
*
* Written by Stephen C. Tweedie <sct@redhat.com>, 1999
*
* Copyright 1999 Red Hat Software --- All Rights Reserved
*
* This file is part of the Linux kernel and is made available under
* the terms of the GNU General Public License, version 2, or at your
* option, any later version, incorporated herein by reference.
*
* Checkpoint routines for the generic filesystem journaling code.
* Part of the ext2fs journaling system.
*
* Checkpointing is the process of ensuring that a section of the log is
* committed fully to disk, so that that portion of the log can be
* reused.
*/
#include <linux/time.h>
#include <linux/fs.h>
#include <linux/jbd2.h>
#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <trace/events/jbd2.h>
/*
 * Detach a journal head from the circular checkpoint ring it sits on and,
 * if it was the head of its transaction's t_checkpoint_list, move that
 * head on (or clear it when jh was the only entry).
 *
 * Only t_checkpoint_list is adjusted here; t_checkpoint_io_list is left
 * for the caller to fix up.
 *
 * Called with j_list_lock held.
 */
static inline void __buffer_unlink_first(struct journal_head *jh)
{
	transaction_t *owner = jh->b_cp_transaction;
	struct journal_head *before = jh->b_cpprev;
	struct journal_head *after = jh->b_cpnext;

	/* Splice the neighbours together, skipping jh. */
	after->b_cpprev = before;
	before->b_cpnext = after;

	if (owner->t_checkpoint_list != jh)
		return;

	/* jh was the list head: a self-linked jh means the ring is now empty. */
	owner->t_checkpoint_list = (after == jh) ? NULL : after;
}
/*
 * Detach a journal head from whichever checkpoint ring it is on, fixing
 * up both t_checkpoint_list (via __buffer_unlink_first()) and
 * t_checkpoint_io_list of the owning transaction.
 *
 * Called with j_list_lock held.
 */
static inline void __buffer_unlink(struct journal_head *jh)
{
	transaction_t *owner = jh->b_cp_transaction;

	__buffer_unlink_first(jh);

	if (owner->t_checkpoint_io_list != jh)
		return;

	/* jh headed the io list: advance the head, or empty the list. */
	owner->t_checkpoint_io_list =
		(jh->b_cpnext == jh) ? NULL : jh->b_cpnext;
}
/*
 * Take a journal head off the transaction's checkpoint list and insert it
 * at the front of the same transaction's checkpoint io list.
 *
 * Called with j_list_lock held
 */
static inline void __buffer_relink_io(struct journal_head *jh)
{
	transaction_t *owner = jh->b_cp_transaction;
	struct journal_head *io_head;

	__buffer_unlink_first(jh);

	io_head = owner->t_checkpoint_io_list;
	if (io_head) {
		/* Link jh in just before the current head of the ring. */
		jh->b_cpnext = io_head;
		jh->b_cpprev = io_head->b_cpprev;
		jh->b_cpprev->b_cpnext = jh;
		jh->b_cpnext->b_cpprev = jh;
	} else {
		/* Empty io list: jh forms a one-element ring. */
		jh->b_cpnext = jh;
		jh->b_cpprev = jh;
	}
	owner->t_checkpoint_io_list = jh;
}
/*
 * Attempt to drop a checkpointed buffer from its transaction.
 *
 * The buffer is only released when it is on no journal list (BJ_None),
 * is not locked, is not dirty and carries no write I/O error.
 *
 * Returns 0 if nothing was released, 1 if the buffer was released and 2
 * if releasing it also freed the whole transaction.
 *
 * Requires j_list_lock
 * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it
 */
static int __try_to_free_cp_buf(struct journal_head *jh)
{
	struct buffer_head *bh = jh2bh(jh);
	int outcome;

	/* Not eligible: just give the bh state lock back. */
	if (jh->b_jlist != BJ_None || buffer_locked(bh) ||
	    buffer_dirty(bh) || buffer_write_io_error(bh)) {
		jbd_unlock_bh_state(bh);
		return 0;
	}

	JBUFFER_TRACE(jh, "remove from checkpoint list");
	/* remove_checkpoint yields 1 when the transaction went away too. */
	outcome = __jbd2_journal_remove_checkpoint(jh) + 1;
	jbd_unlock_bh_state(bh);
	jbd2_journal_remove_journal_head(bh);
	BUFFER_TRACE(bh, "release");
	__brelse(bh);

	return outcome;
}
/*
* __jbd2_log_wait_for_space: wait until there is space in the journal.
*
* Loops until __jbd2_log_space_left() reports at least jbd_space_needed()
* blocks, or until the journal is found to be aborted (JBD2_ABORT).
*
* Called under j_state_lock *only*. It will be unlocked if we have to wait
* for a checkpoint to free up some space in the log, but it is held again
* whenever this function returns.
*/
void __jbd2_log_wait_for_space(journal_t *journal)
{
int nblocks, space_left;
assert_spin_locked(&journal->j_state_lock);
nblocks = jbd_space_needed(journal);
while (__jbd2_log_space_left(journal) < nblocks) {
/* An aborted journal will never gain space: give up. */
if (journal->j_flags & JBD2_ABORT)
return;
/* Drop the spinlock before taking the checkpoint mutex. */
spin_unlock(&journal->j_state_lock);
mutex_lock(&journal->j_checkpoint_mutex);
/*
* Test again, another process may have checkpointed while we
* were waiting for the checkpoint lock. If there are no
* transactions ready to be checkpointed, try to recover
* journal space by calling cleanup_journal_tail(), and if
* that doesn't work, by waiting for the currently committing
* transaction to complete. If there is absolutely no way
* to make progress, this is either a BUG or corrupted
* filesystem, so abort the journal and leave a stack
* trace for forensic evidence.
*/
spin_lock(&journal->j_state_lock);
spin_lock(&journal->j_list_lock);
nblocks = jbd_space_needed(journal);
space_left = __jbd2_log_space_left(journal);
if (space_left < nblocks) {
/*
* Snapshot what we need while both locks are held; the
* decisions below are made after the locks are dropped.
*/
int chkpt = journal->j_checkpoint_transactions != NULL;
tid_t tid = 0;
if (journal->j_committing_transaction)
tid = journal->j_committing_transaction->t_tid;
spin_unlock(&journal->j_list_lock);
spin_unlock(&journal->j_state_lock);
if (chkpt) {
jbd2_log_do_checkpoint(journal);
} else if (jbd2_cleanup_journal_tail(journal) == 0) {
/* We were able to recover space; yay! */
;
} else if (tid) {
jbd2_log_wait_commit(journal, tid);
} else {
printk(KERN_ERR "%s: needed %d blocks and "
"only had %d space available\n",
__func__, nblocks, space_left);
printk(KERN_ERR "%s: no way to get more "
"journal space in %s\n", __func__,
journal->j_devname);
WARN_ON(1);
jbd2_journal_abort(journal, 0);
}
/* Re-take j_state_lock for the loop condition / return. */
spin_lock(&journal->j_state_lock);
} else {
/* Enough space after all: keep j_state_lock, drop the list lock. */
spin_unlock(&journal->j_list_lock);
}
mutex_unlock(&journal->j_checkpoint_mutex);
}
}
/*
* We were unable to perform jbd_trylock_bh_state() inside j_list_lock.
* The caller must restart a list walk. Wait for someone else to run
* jbd_unlock_bh_state().
*
* Entered with j_list_lock held; returns with j_list_lock released, so
* the caller has to take it again before touching the lists.
*/
static void jbd_sync_bh(journal_t *journal, struct buffer_head *bh)
__releases(journal->j_list_lock)
{
/* Hold a reference across the window in which no lock is held. */
get_bh(bh);
spin_unlock(&journal->j_list_lock);
/* Lock and immediately unlock: we only want to wait for the holder. */
jbd_lock_bh_state(bh);
jbd_unlock_bh_state(bh);
put_bh(bh);
}
/*
 * Drain a transaction's checkpoint io list: wait for outstanding writes
 * on each buffer and then drop the buffer from the list.  Buffers are
 * taken from the head of t_checkpoint_io_list, i.e. in the opposite order
 * from the one in which they were submitted for IO.
 *
 * Returns 0 on success, or -EIO if at least one buffer carries a write
 * I/O error.
 *
 * Called with j_list_lock held.  The lock may be dropped and re-taken
 * while waiting; it is held again on return.
 */
static int __wait_cp_io(journal_t *journal, transaction_t *transaction)
{
	const tid_t expected_tid = transaction->t_tid;
	int txn_freed = 0;
	int status = 0;

restart:
	/* Did somebody clean up the transaction in the meanwhile? */
	if (journal->j_checkpoint_transactions != transaction ||
	    transaction->t_tid != expected_tid)
		return status;

	while (!txn_freed && transaction->t_checkpoint_io_list) {
		struct journal_head *jh = transaction->t_checkpoint_io_list;
		struct buffer_head *bh = jh2bh(jh);

		if (!jbd_trylock_bh_state(bh)) {
			/* jbd_sync_bh() drops j_list_lock for us. */
			jbd_sync_bh(journal, bh);
			spin_lock(&journal->j_list_lock);
			goto restart;
		}

		if (buffer_locked(bh)) {
			/* Write still in flight: pin the bh and wait unlocked. */
			atomic_inc(&bh->b_count);
			spin_unlock(&journal->j_list_lock);
			jbd_unlock_bh_state(bh);
			wait_on_buffer(bh);
			/* the journal_head may have gone by now */
			BUFFER_TRACE(bh, "brelse");
			__brelse(bh);
			spin_lock(&journal->j_list_lock);
			goto restart;
		}

		if (unlikely(buffer_write_io_error(bh)))
			status = -EIO;

		/*
		 * Whatever state the buffer is in now, its write has
		 * finished, so it can leave the checkpoint list.
		 */
		txn_freed = __jbd2_journal_remove_checkpoint(jh);
		jbd_unlock_bh_state(bh);
		jbd2_journal_remove_journal_head(bh);
		__brelse(bh);
	}

	return status;
}
/*
 * Submit the buffers collected in journal->j_chkpt_bhs for writing, then
 * clear the jwrite flag on each one, drop the reference held on it and
 * reset *batch_count to zero.
 */
static void
__flush_batch(journal_t *journal, int *batch_count)
{
	int idx = 0;

	ll_rw_block(SWRITE, *batch_count, journal->j_chkpt_bhs);

	while (idx < *batch_count) {
		struct buffer_head *bh = journal->j_chkpt_bhs[idx++];

		clear_buffer_jwrite(bh);
		BUFFER_TRACE(bh, "brelse");
		__brelse(bh);
	}

	*batch_count = 0;
}
/*
 * Try to flush one buffer from the checkpoint list to disk.
 *
 * Returns 0 when the buffer was merely queued in the batch and the scan
 * may continue, 1 when something happened that requires the caller to
 * abort the current scan of the checkpoint list, and -EIO when the
 * buffer turned out to have failed to be written out.
 *
 * Called with j_list_lock held.  The lock is dropped whenever a non-zero
 * value (1 or -EIO) is returned.
 * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it
 */
static int __process_buffer(journal_t *journal, struct journal_head *jh,
			    int *batch_count, transaction_t *transaction)
{
	struct buffer_head *bh = jh2bh(jh);

	/* Case 1: IO already in flight -- wait for it without our locks. */
	if (buffer_locked(bh)) {
		atomic_inc(&bh->b_count);
		spin_unlock(&journal->j_list_lock);
		jbd_unlock_bh_state(bh);
		wait_on_buffer(bh);
		/* the journal_head may have gone by now */
		BUFFER_TRACE(bh, "brelse");
		__brelse(bh);
		return 1;
	}

	/* Case 2: buffer belongs to a live transaction -- force its commit. */
	if (jh->b_transaction != NULL) {
		tid_t tid = jh->b_transaction->t_tid;

		transaction->t_chp_stats.cs_forced_to_close++;
		spin_unlock(&journal->j_list_lock);
		jbd_unlock_bh_state(bh);
		jbd2_log_start_commit(journal, tid);
		jbd2_log_wait_commit(journal, tid);
		return 1;
	}

	/* Case 3: already clean -- just take it off the checkpoint list. */
	if (!buffer_dirty(bh)) {
		int status = unlikely(buffer_write_io_error(bh)) ? -EIO : 1;

		J_ASSERT_JH(jh, !buffer_jbddirty(bh));
		BUFFER_TRACE(bh, "remove from checkpoint");
		__jbd2_journal_remove_checkpoint(jh);
		spin_unlock(&journal->j_list_lock);
		jbd_unlock_bh_state(bh);
		jbd2_journal_remove_journal_head(bh);
		__brelse(bh);
		return status;
	}

	/*
	 * Case 4: dirty buffer that we will write ourselves.
	 *
	 * Important: we are about to write the buffer, and possibly block,
	 * while still holding the journal lock.  We cannot afford to let
	 * the transaction logic start messing around with this buffer
	 * before we write it to disk, as that would break recoverability.
	 */
	BUFFER_TRACE(bh, "queue");
	get_bh(bh);
	J_ASSERT_BH(bh, !buffer_jwrite(bh));
	set_buffer_jwrite(bh);
	journal->j_chkpt_bhs[*batch_count] = bh;
	__buffer_relink_io(jh);
	jbd_unlock_bh_state(bh);
	transaction->t_chp_stats.cs_written++;
	(*batch_count)++;

	/* Room left in the batch: keep scanning with the lock held. */
	if (*batch_count != JBD2_NR_BATCH)
		return 0;

	spin_unlock(&journal->j_list_lock);
	__flush_batch(journal, batch_count);
	return 1;
}
/*
* Perform an actual checkpoint. We take the first transaction on the
* list of transactions to be checkpointed and send all its buffers
* to disk. We submit larger chunks of data at once.
*
* Returns 0 on success and a negative error code on failure; on a
* write-out failure the journal is aborted with that error.
*
* The journal should be locked before calling this function.
* Called with j_checkpoint_mutex held.
*/
int jbd2_log_do_checkpoint(journal_t *journal)
{
transaction_t *transaction;
tid_t this_tid;
int result;
jbd_debug(1, "Start checkpoint\n");
/*
* First thing: if there are any transactions in the log which
* don't need checkpointing, just eliminate them from the
* journal straight away.
*/
result = jbd2_cleanup_journal_tail(journal);
trace_jbd2_checkpoint(journal, result);
jbd_debug(1, "cleanup_journal_tail returned %d\n", result);
/* <0 is an error, 0 means the tail was advanced: either way we are done. */
if (result <= 0)
return result;
/*
* OK, we need to start writing disk blocks. Take one transaction
* and write it.
*/
result = 0;
spin_lock(&journal->j_list_lock);
if (!journal->j_checkpoint_transactions)
goto out;
transaction = journal->j_checkpoint_transactions;
/* Record when checkpointing of this transaction first started. */
if (transaction->t_chp_stats.cs_chp_time == 0)
transaction->t_chp_stats.cs_chp_time = jiffies;
this_tid = transaction->t_tid;
restart:
/*
* If someone cleaned up this transaction while we slept, we're
* done (maybe it's a new transaction, but it fell at the same
* address).
*/
if (journal->j_checkpoint_transactions == transaction &&
transaction->t_tid == this_tid) {
int batch_count = 0;
struct journal_head *jh;
int retry = 0, err;
while (!retry && transaction->t_checkpoint_list) {
struct buffer_head *bh;
jh = transaction->t_checkpoint_list;
bh = jh2bh(jh);
if (!jbd_trylock_bh_state(bh)) {
/* jbd_sync_bh() returns with j_list_lock dropped. */
jbd_sync_bh(journal, bh);
retry = 1;
break;
}
/* Non-zero return means j_list_lock has been dropped. */
retry = __process_buffer(journal, jh, &batch_count,
transaction);
/* Remember only the first write-out error. */
if (retry < 0 && !result)
result = retry;
if (!retry && (need_resched() ||
spin_needbreak(&journal->j_list_lock))) {
spin_unlock(&journal->j_list_lock);
retry = 1;
break;
}
}
if (batch_count) {
/* The batch is flushed without j_list_lock held. */
if (!retry) {
spin_unlock(&journal->j_list_lock);
retry = 1;
}
__flush_batch(journal, &batch_count);
}
if (retry) {
/* Every retry path above left the lock dropped: re-take it. */
spin_lock(&journal->j_list_lock);
goto restart;
}
/*
* Now we have cleaned up the first transaction's checkpoint
* list. Let's clean up the second one
*/
err = __wait_cp_io(journal, transaction);
if (!result)
result = err;
}
out:
spin_unlock(&journal->j_list_lock);
if (result < 0)
jbd2_journal_abort(journal, result);
else
result = jbd2_cleanup_journal_tail(journal);
/* A positive "nothing to clean up" result is reported as success. */
return (result < 0) ? result : 0;
}
/*
* Check the list of checkpoint transactions for the journal to see if
* we have already got rid of any since the last update of the log tail
* in the journal superblock. If so, we can instantly roll the
* superblock forward to remove those transactions from the log.
*
* Return <0 on error, 0 on success, 1 if there was nothing to clean up.
* An aborted journal is also reported as 1.
*
* Called with the journal lock held.
*
* This is the only part of the journaling code which really needs to be
* aware of transaction aborts. Checkpointing involves writing to the
* main filesystem area rather than to the journal, so it can proceed
* even in abort state, but we must not update the super block if
* checkpointing may have failed. Otherwise, we would lose some metadata
* buffers which should be written-back to the filesystem.
*/
int jbd2_cleanup_journal_tail(journal_t *journal)
{
transaction_t * transaction;
tid_t first_tid;
unsigned long blocknr, freed;
if (is_journal_aborted(journal))
return 1;
/* OK, work out the oldest transaction remaining in the log, and
* the log block it starts at.
*
* If the log is now empty, we need to work out which is the
* next transaction ID we will write, and where it will
* start. */
spin_lock(&journal->j_state_lock);
spin_lock(&journal->j_list_lock);
/*
* Preference order: oldest checkpoint transaction, then the
* committing one, then the running one, then "log is empty".
*/
transaction = journal->j_checkpoint_transactions;
if (transaction) {
first_tid = transaction->t_tid;
blocknr = transaction->t_log_start;
} else if ((transaction = journal->j_committing_transaction) != NULL) {
first_tid = transaction->t_tid;
blocknr = transaction->t_log_start;
} else if ((transaction = journal->j_running_transaction) != NULL) {
first_tid = transaction->t_tid;
blocknr = journal->j_head;
} else {
first_tid = journal->j_transaction_sequence;
blocknr = journal->j_head;
}
spin_unlock(&journal->j_list_lock);
J_ASSERT(blocknr != 0);
/* If the oldest pinned transaction is at the tail of the log
already then there's not much we can do right now. */
if (journal->j_tail_sequence == first_tid) {
spin_unlock(&journal->j_state_lock);
return 1;
}
/* OK, update the superblock to recover the freed space.
* Physical blocks come first: have we wrapped beyond the end of
* the log? */
freed = blocknr - journal->j_tail;
/* New tail is numerically below the old one: add the log length back. */
if (blocknr < journal->j_tail)
freed = freed + journal->j_last - journal->j_first;
jbd_debug(1,
"Cleaning journal tail from %d to %d (offset %lu), "
"freeing %lu\n",
journal->j_tail_sequence, first_tid, blocknr, freed);
journal->j_free += freed;
journal->j_tail_sequence = first_tid;
journal->j_tail = blocknr;
spin_unlock(&journal->j_state_lock);
/*
* If there is an external journal, we need to make sure that
* any data blocks that were recently written out --- perhaps
* by jbd2_log_do_checkpoint() --- are flushed out before we
* drop the transactions from the external journal. It's
* unlikely this will be necessary, especially with an
* appropriately sized journal, but we need this to guarantee
* correctness. Fortunately jbd2_cleanup_journal_tail()
* doesn't get called all that often.
*/
if ((journal->j_fs_dev != journal->j_dev) &&
(journal->j_flags & JBD2_BARRIER))
blkdev_issue_flush(journal->j_fs_dev, NULL);
/* Re-check the abort flag: never write the superblock once aborted. */
if (!(journal->j_flags & JBD2_ABORT))
jbd2_journal_update_superblock(journal, 1);
return 0;
}
/* Checkpoint list management */
/*
 * journal_clean_one_cp_list
 *
 * Walk one circular checkpoint list starting at @jh and release every
 * buffer that has already been written back.
 *
 * *released is set to 1 when releasing a buffer also freed the owning
 * transaction (the walk stops at that point), and to 0 otherwise.
 *
 * Called with the journal locked.
 * Called with j_list_lock held.
 * Returns number of buffers reaped (for debug)
 */
static int journal_clean_one_cp_list(struct journal_head *jh, int *released)
{
	struct journal_head *tail;
	struct journal_head *cursor;
	struct journal_head *upcoming = jh;
	int reaped = 0;

	*released = 0;
	if (!jh)
		return 0;

	tail = jh->b_cpprev;
	do {
		int status = 0;

		/* Fetch the successor first: cursor may be freed below. */
		cursor = upcoming;
		upcoming = cursor->b_cpnext;

		/* Use trylock because of the ranking */
		if (jbd_trylock_bh_state(jh2bh(cursor)))
			status = __try_to_free_cp_buf(cursor);

		if (status) {
			reaped++;
			if (status == 2) {
				/* The whole transaction is gone: stop here. */
				*released = 1;
				return reaped;
			}
		}

		/*
		 * This is only opportunistic memory reclaim, so there is
		 * no obligation to finish the walk.  Bail out if
		 * preemption is requested.
		 */
		if (need_resched())
			return reaped;
	} while (cursor != tail);

	return reaped;
}
/*
 * journal_clean_checkpoint_list
 *
 * Visit every transaction on the journal's checkpoint ring and release
 * the written-back buffers found on its checkpoint list and, if the
 * transaction survived that, on its checkpoint io list.
 *
 * Called with the journal locked.
 * Called with j_list_lock held.
 * Returns number of buffers reaped (for debug)
 */
int __jbd2_journal_clean_checkpoint_list(journal_t *journal)
{
	transaction_t *current_txn, *final_txn, *upcoming_txn;
	int reaped = 0;
	int txn_gone;

	upcoming_txn = journal->j_checkpoint_transactions;
	if (!upcoming_txn)
		return reaped;

	final_txn = upcoming_txn->t_cpprev;
	do {
		/* Fetch the successor first: current_txn may be freed below. */
		current_txn = upcoming_txn;
		upcoming_txn = current_txn->t_cpnext;

		reaped += journal_clean_one_cp_list(
				current_txn->t_checkpoint_list, &txn_gone);
		/*
		 * This is only opportunistic memory reclaim, so there is
		 * no obligation to finish.  Bail out if preemption is
		 * requested.
		 */
		if (need_resched())
			break;

		if (!txn_gone) {
			/*
			 * The io list needs the same care as
			 * t_checkpoint_list when removing buffers, since
			 * it can still hold buffers that have not been
			 * submitted yet.
			 */
			reaped += journal_clean_one_cp_list(
					current_txn->t_checkpoint_io_list,
					&txn_gone);
			if (need_resched())
				break;
		}
	} while (current_txn != final_txn);

	return reaped;
}
/*
 * journal_remove_checkpoint: called after a buffer has been committed
 * to disk (either by being write-back flushed to disk, or being
 * committed to the log).
 *
 * A transaction cannot be cleaned out of the log until every buffer
 * update it committed has been stored safely elsewhere on disk.  Buffers
 * therefore stay on the transaction's checkpoint lists until they have
 * been rewritten, and this function is what takes them off again.  When
 * the last buffer of a finished transaction is removed, the transaction
 * itself is dropped and freed.
 *
 * The function returns 1 if it frees the transaction, 0 otherwise.
 *
 * This function is called with the journal locked.
 * This function is called with j_list_lock held.
 * This function is called with jbd_lock_bh_state(jh2bh(jh))
 */
int __jbd2_journal_remove_checkpoint(struct journal_head *jh)
{
	struct transaction_chp_stats_s *chp_stats;
	transaction_t *cp_txn;
	journal_t *jrnl;
	int txn_freed = 0;

	JBUFFER_TRACE(jh, "entry");

	cp_txn = jh->b_cp_transaction;
	if (cp_txn == NULL) {
		JBUFFER_TRACE(jh, "not on transaction");
		goto done;
	}

	jrnl = cp_txn->t_journal;
	__buffer_unlink(jh);
	jh->b_cp_transaction = NULL;

	if (cp_txn->t_checkpoint_list == NULL &&
	    cp_txn->t_checkpoint_io_list == NULL) {
		JBUFFER_TRACE(jh, "transaction has no more buffers");

		/*
		 * Special case: the buffer may have come off the
		 * checkpoint list of a running or committing transaction.
		 * Such a transaction obviously cannot be dropped even
		 * though its checkpoint lists are empty.
		 *
		 * The locking here around t_state is a bit sleazy.
		 * See the comment at the end of
		 * jbd2_journal_commit_transaction().
		 */
		if (cp_txn->t_state != T_FINISHED) {
			JBUFFER_TRACE(jh, "belongs to running/committing transaction");
		} else {
			/*
			 * That was the last buffer of the transaction, so
			 * it can now safely be removed from the log.
			 */
			chp_stats = &cp_txn->t_chp_stats;
			if (chp_stats->cs_chp_time)
				chp_stats->cs_chp_time =
					jbd2_time_diff(chp_stats->cs_chp_time,
						       jiffies);
			trace_jbd2_checkpoint_stats(jrnl->j_fs_dev->bd_dev,
						    cp_txn->t_tid, chp_stats);

			__jbd2_journal_drop_transaction(jrnl, cp_txn);
			kfree(cp_txn);

			/*
			 * Just in case anybody was waiting for more
			 * transactions to be checkpointed...
			 */
			wake_up(&jrnl->j_wait_logspace);
			txn_freed = 1;
		}
	}

done:
	JBUFFER_TRACE(jh, "exit");
	return txn_freed;
}
/*
 * journal_insert_checkpoint: put a committed buffer at the head of the
 * given transaction's checkpoint list, so that we know when it is safe
 * to clean the transaction out of the log.
 *
 * The buffer must be dirty (or jbddirty) and must not already be on a
 * checkpoint list.
 *
 * Called with the journal locked.
 * Called with j_list_lock held.
 */
void __jbd2_journal_insert_checkpoint(struct journal_head *jh,
				      transaction_t *transaction)
{
	struct journal_head *head;

	JBUFFER_TRACE(jh, "entry");
	J_ASSERT_JH(jh, buffer_dirty(jh2bh(jh)) || buffer_jbddirty(jh2bh(jh)));
	J_ASSERT_JH(jh, jh->b_cp_transaction == NULL);

	jh->b_cp_transaction = transaction;

	head = transaction->t_checkpoint_list;
	if (head) {
		/* Link jh in just before the current head of the ring. */
		jh->b_cpnext = head;
		jh->b_cpprev = head->b_cpprev;
		jh->b_cpprev->b_cpnext = jh;
		jh->b_cpnext->b_cpprev = jh;
	} else {
		/* First buffer: jh forms a one-element ring. */
		jh->b_cpnext = jh;
		jh->b_cpprev = jh;
	}
	transaction->t_checkpoint_list = jh;
}
/*
 * We've finished with this transaction structure: adios...
 *
 * Unlinks the transaction from the journal's ring of checkpoint
 * transactions (if it is on it) and asserts that nothing else still
 * refers to it.  The structure itself is not freed here.
 *
 * The transaction must have no links except for the checkpoint by this
 * point.
 *
 * Called with the journal locked.
 * Called with j_list_lock held.
 */
void __jbd2_journal_drop_transaction(journal_t *journal, transaction_t *transaction)
{
	assert_spin_locked(&journal->j_list_lock);

	if (transaction->t_cpnext) {
		transaction_t *after = transaction->t_cpnext;
		transaction_t *before = transaction->t_cpprev;

		/* Splice the neighbours together, skipping this transaction. */
		after->t_cpprev = before;
		before->t_cpnext = after;

		/*
		 * If we were the ring head, advance it; a self-linked
		 * transaction means the ring is now empty.
		 */
		if (journal->j_checkpoint_transactions == transaction)
			journal->j_checkpoint_transactions =
				(after == transaction) ? NULL : after;
	}

	J_ASSERT(transaction->t_state == T_FINISHED);
	J_ASSERT(transaction->t_buffers == NULL);
	J_ASSERT(transaction->t_forget == NULL);
	J_ASSERT(transaction->t_iobuf_list == NULL);
	J_ASSERT(transaction->t_shadow_list == NULL);
	J_ASSERT(transaction->t_log_list == NULL);
	J_ASSERT(transaction->t_checkpoint_list == NULL);
	J_ASSERT(transaction->t_checkpoint_io_list == NULL);
	J_ASSERT(transaction->t_updates == 0);
	J_ASSERT(journal->j_committing_transaction != transaction);
	J_ASSERT(journal->j_running_transaction != transaction);

	jbd_debug(1, "Dropping transaction %d, all done\n", transaction->t_tid);
}

1073
kernel/fs/jbd2/commit.c Normal file

File diff suppressed because it is too large Load Diff

2306
kernel/fs/jbd2/journal.c Normal file

File diff suppressed because it is too large Load Diff

745
kernel/fs/jbd2/recovery.c Normal file
View File

@@ -0,0 +1,745 @@
/*
* linux/fs/jbd2/recovery.c
*
* Written by Stephen C. Tweedie <sct@redhat.com>, 1999
*
* Copyright 1999-2000 Red Hat Software --- All Rights Reserved
*
* This file is part of the Linux kernel and is made available under
* the terms of the GNU General Public License, version 2, or at your
* option, any later version, incorporated herein by reference.
*
* Journal recovery routines for the generic filesystem journaling code;
* part of the ext2fs journaling system.
*/
#ifndef __KERNEL__
#include "jfs_user.h"
#else
#include <linux/time.h>
#include <linux/fs.h>
#include <linux/jbd2.h>
#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/crc32.h>
#endif
/*
* Maintain information about the progress of the recovery job, so that
* the different passes can carry information between them.
*/
struct recovery_info
{
/* First transaction ID expected in the log; recorded by PASS_SCAN. */
tid_t start_transaction;
/* Transaction ID at which the non-scan passes stop walking the log. */
tid_t end_transaction;
/* Counters reported in the debug summary once recovery has finished. */
int nr_replays;
int nr_revokes;
int nr_revoke_hits;
};
/*
* The recovery passes, in the order jbd2_journal_recover() runs them:
* find the end of the log, collect the revoke records, replay the
* un-revoked blocks.
*/
enum passtype {PASS_SCAN, PASS_REVOKE, PASS_REPLAY};
/* Forward declarations of helpers used before their definitions. */
static int do_one_pass(journal_t *journal,
struct recovery_info *info, enum passtype pass);
static int scan_revoke_records(journal_t *, struct buffer_head *,
tid_t, struct recovery_info *);
#ifdef __KERNEL__
/*
 * Release the first @n buffer heads of the readahead array @b, walking
 * from the last entry down to the first.
 */
static void journal_brelse_array(struct buffer_head *b[], int n)
{
	int idx;

	for (idx = n - 1; idx >= 0; idx--)
		brelse(b[idx]);
}
/*
 * Journal reads go straight through the block device layer, so nobody
 * does readahead on our behalf.  Recovery is essentially one long
 * sequential read, so we issue our own readahead in reasonably large
 * chunks.
 *
 * There is no need to be clever about the window size: 128K is an
 * arbitrary, good-enough fixed value.  Reads are handed to the block
 * layer MAXBUF buffer heads at a time.
 *
 * Returns 0 on success, the jbd2_journal_bmap() error if a block could
 * not be mapped, or -ENOMEM if a buffer head could not be obtained.
 */
#define MAXBUF 8
static int do_readahead(journal_t *journal, unsigned int start)
{
	struct buffer_head *batch[MAXBUF];
	struct buffer_head *bh;
	unsigned long long blocknr;
	unsigned int limit, pos;
	unsigned int queued = 0;
	int err;

	/* Do up to 128K of readahead, clamped to the journal length. */
	limit = start + (128 * 1024 / journal->j_blocksize);
	if (limit > journal->j_maxlen)
		limit = journal->j_maxlen;

	for (pos = start; pos < limit; pos++) {
		err = jbd2_journal_bmap(journal, pos, &blocknr);
		if (err) {
			printk (KERN_ERR "JBD: bad block at offset %u\n",
				pos);
			goto failed;
		}

		bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
		if (!bh) {
			err = -ENOMEM;
			goto failed;
		}

		/* Already valid or already being read: nothing to submit. */
		if (buffer_uptodate(bh) || buffer_locked(bh)) {
			brelse(bh);
			continue;
		}

		batch[queued++] = bh;
		if (queued == MAXBUF) {
			ll_rw_block(READ, queued, batch);
			journal_brelse_array(batch, queued);
			queued = 0;
		}
	}

	/* Submit whatever is left in a partially filled batch. */
	if (queued)
		ll_rw_block(READ, queued, batch);
	err = 0;

failed:
	if (queued)
		journal_brelse_array(batch, queued);
	return err;
}
#endif /* __KERNEL__ */
/*
 * Read one block of the journal.
 *
 * @bhp:     receives the buffer head on success, NULL on failure
 * @journal: journal to read from
 * @offset:  block offset within the journal
 *
 * Returns 0 on success, -EIO if @offset lies outside the journal or the
 * read failed, -ENOMEM if no buffer head could be obtained, or the error
 * from jbd2_journal_bmap().
 */
static int jread(struct buffer_head **bhp, journal_t *journal,
		 unsigned int offset)
{
	unsigned long long phys_block;
	struct buffer_head *buf;
	int rc;

	*bhp = NULL;

	if (offset >= journal->j_maxlen) {
		printk(KERN_ERR "JBD: corrupted journal superblock\n");
		return -EIO;
	}

	rc = jbd2_journal_bmap(journal, offset, &phys_block);
	if (rc) {
		printk (KERN_ERR "JBD: bad block at offset %u\n",
			offset);
		return rc;
	}

	buf = __getblk(journal->j_dev, phys_block, journal->j_blocksize);
	if (!buf)
		return -ENOMEM;

	if (!buffer_uptodate(buf)) {
		/*
		 * A buffer nobody has requested yet is brand new, so kick
		 * off readahead; otherwise assume a read is already under
		 * way and simply wait for it.
		 */
		if (!buffer_req(buf))
			do_readahead(journal, offset);
		wait_on_buffer(buf);
	}

	if (!buffer_uptodate(buf)) {
		printk (KERN_ERR "JBD: Failed to read block at offset %u\n",
			offset);
		brelse(buf);
		return -EIO;
	}

	*bhp = buf;
	return 0;
}
/*
 * Count the number of in-use tags in a journal descriptor block.
 *
 * Tags start right after the journal header.  Each tag occupies
 * journal_tag_bytes() bytes, followed by 16 further bytes unless the tag
 * has JBD2_FLAG_SAME_UUID set.  Counting stops at the tag flagged
 * JBD2_FLAG_LAST_TAG or when the next tag would not fit in the block.
 */
static int count_tags(journal_t *journal, struct buffer_head *bh)
{
	int count = 0, size = journal->j_blocksize;
	int tag_bytes = journal_tag_bytes(journal);
	char *cursor = &bh->b_data[sizeof(journal_header_t)];

	for (;;) {
		journal_block_tag_t *tag;

		/* Stop once a whole tag no longer fits in the block. */
		if ((cursor - bh->b_data + tag_bytes) > size)
			break;

		tag = (journal_block_tag_t *) cursor;
		count++;

		cursor += tag_bytes;
		if (!(tag->t_flags & cpu_to_be32(JBD2_FLAG_SAME_UUID)))
			cursor += 16;

		if (tag->t_flags & cpu_to_be32(JBD2_FLAG_LAST_TAG))
			break;
	}

	return count;
}
/*
 * Make sure we wrap around the log correctly!
 *
 * If the log block number @var has reached or passed (journal)->j_last,
 * pull it back by the length of the log (j_last - j_first).  @var must
 * be a modifiable lvalue and is evaluated more than once, so it must not
 * have side effects.  It is parenthesized in the expansion so that any
 * lvalue expression can be passed safely.
 */
#define wrap(journal, var)						\
do {									\
	if ((var) >= (journal)->j_last)					\
		(var) -= ((journal)->j_last - (journal)->j_first);	\
} while (0)
/**
 * jbd2_journal_recover - recovers a on-disk journal
 * @journal: the journal to recover
 *
 * The primary function for recovering the log contents when mounting a
 * journaled device.
 *
 * Recovery runs up to three passes over the log, each one only if the
 * previous pass succeeded: the first looks for the end of the log, the
 * second assembles the list of revoke blocks, and the third replays any
 * un-revoked blocks in the log.
 *
 * Returns 0 on success, otherwise the first error seen from a pass or,
 * failing that, from syncing the filesystem device.
 */
int jbd2_journal_recover(journal_t *journal)
{
	struct recovery_info info;
	journal_superblock_t *sb = journal->j_superblock;
	int err, err2;

	memset(&info, 0, sizeof(info));

	/*
	 * The journal superblock's s_start field (the current log head)
	 * is always zero if, and only if, the journal was cleanly
	 * unmounted.
	 */
	if (!sb->s_start) {
		jbd_debug(1, "No recovery required, last transaction %d\n",
			  be32_to_cpu(sb->s_sequence));
		journal->j_transaction_sequence = be32_to_cpu(sb->s_sequence) + 1;
		return 0;
	}

	err = do_one_pass(journal, &info, PASS_SCAN);
	if (err == 0) {
		err = do_one_pass(journal, &info, PASS_REVOKE);
		if (err == 0)
			err = do_one_pass(journal, &info, PASS_REPLAY);
	}

	jbd_debug(1, "JBD: recovery, exit status %d, "
		  "recovered transactions %u to %u\n",
		  err, info.start_transaction, info.end_transaction);
	jbd_debug(1, "JBD: Replayed %d and revoked %d/%d blocks\n",
		  info.nr_replays, info.nr_revoke_hits, info.nr_revokes);

	/*
	 * Restart the log at the next transaction ID, thus invalidating
	 * any existing commit records in the log.
	 */
	info.end_transaction++;
	journal->j_transaction_sequence = info.end_transaction;

	jbd2_journal_clear_revoke(journal);

	/* A pass error takes precedence over a sync error. */
	err2 = sync_blockdev(journal->j_fs_dev);
	return err ? err : err2;
}
/**
 * jbd2_journal_skip_recovery - Start journal and wipe existing records
 * @journal: journal to startup
 *
 * Locate any valid recovery information from the journal and set up the
 * journal structures in memory to ignore it (presumably because the
 * caller has evidence that it is out of date).
 * This function doesn't appear to be exported.
 *
 * We perform one pass over the journal to allow us to tell the user how
 * much recovery information is being erased, and to let us initialise
 * the journal transaction sequence numbers to the next unused ID.
 *
 * Returns 0 on success or the error reported by the scan pass.  In both
 * cases the journal tail is reset to 0.
 */
int jbd2_journal_skip_recovery(journal_t *journal)
{
	int err;
	struct recovery_info info;

	memset(&info, 0, sizeof(info));

	err = do_one_pass(journal, &info, PASS_SCAN);
	if (err) {
		printk(KERN_ERR "JBD: error %d scanning journal\n", err);
		++journal->j_transaction_sequence;
	} else {
		/*
		 * The number of dropped transactions is only needed for
		 * the debug message, so compute and report it entirely
		 * inside the debug-only section.
		 */
#ifdef CONFIG_JBD2_DEBUG
		int dropped = info.end_transaction -
			be32_to_cpu(journal->j_superblock->s_sequence);
		jbd_debug(1,
			  "JBD: ignoring %d transaction%s from the journal.\n",
			  dropped, (dropped == 1) ? "" : "s");
#endif
		/* Continue numbering after the last transaction found. */
		journal->j_transaction_sequence = ++info.end_transaction;
	}

	journal->j_tail = 0;
	return err;
}
/*
 * Extract the block number stored in a journal block tag.
 *
 * The low 32 bits always come from t_blocknr.  When the tag is larger
 * than JBD2_TAG_SIZE32, t_blocknr_high supplies the upper 32 bits.
 */
static inline unsigned long long read_tag_block(int tag_bytes, journal_block_tag_t *tag)
{
	unsigned long long low = be32_to_cpu(tag->t_blocknr);

	if (tag_bytes <= JBD2_TAG_SIZE32)
		return low;

	return low | ((u64)be32_to_cpu(tag->t_blocknr_high) << 32);
}
/*
 * calc_chksums folds the descriptor block @bh, and then every log block
 * it describes, into the running CRC32 in *crc32_sum.
 *
 * *next_log_block is advanced (with wrap-around) past each block that is
 * consumed.
 *
 * Returns 0 on success and 1 if one of the described blocks could not be
 * read.
 */
static int calc_chksums(journal_t *journal, struct buffer_head *bh,
			unsigned long *next_log_block, __u32 *crc32_sum)
{
	struct buffer_head *data_bh;
	unsigned long log_blk;
	int remaining, rc;

	remaining = count_tags(journal, bh);

	/* Calculate checksum of the descriptor block. */
	*crc32_sum = crc32_be(*crc32_sum, (void *)bh->b_data, bh->b_size);

	while (remaining-- > 0) {
		log_blk = (*next_log_block)++;
		wrap(journal, *next_log_block);

		rc = jread(&data_bh, journal, log_blk);
		if (rc) {
			printk(KERN_ERR "JBD: IO error %d recovering block "
				"%lu in log\n", rc, log_blk);
			return 1;
		}

		*crc32_sum = crc32_be(*crc32_sum, (void *)data_bh->b_data,
				      data_bh->b_size);
		put_bh(data_bh);
	}

	return 0;
}
/*
 * do_one_pass - walk the on-disk log once, doing the work for one pass.
 *
 * The walk starts at the block and sequence number recorded in the journal
 * superblock (s_start / s_sequence) and follows the log block by block:
 *
 *   PASS_SCAN   records info->start_transaction, optionally accumulates and
 *               verifies the per-transaction CRC32 when the CHECKSUM compat
 *               feature is set, and finally sets info->end_transaction.
 *   PASS_REVOKE hands every revoke block to scan_revoke_records().
 *   PASS_REPLAY copies each logged block that is not revoked into a buffer
 *               obtained from __getblk() and marks it dirty.
 *
 * Returns 0 on success.  Errors from jread() on a header block, from
 * scan_revoke_records(), or -ENOMEM from a failed __getblk() are returned
 * through the 'failed' label.  Otherwise 'success' is returned, which holds
 * the last data-block read error seen during replay, or -EIO if a non-scan
 * pass stopped at a transaction other than info->end_transaction.
 */
static int do_one_pass(journal_t *journal,
struct recovery_info *info, enum passtype pass)
{
unsigned int first_commit_ID, next_commit_ID;
unsigned long next_log_block;
int err, success = 0;
journal_superblock_t * sb;
journal_header_t * tmp;
struct buffer_head * bh;
unsigned int sequence;
int blocktype;
int tag_bytes = journal_tag_bytes(journal);
__u32 crc32_sum = ~0; /* Transactional Checksums */
/* Precompute the maximum metadata descriptors in a descriptor block */
/* NOTE(review): MAX_BLOCKS_PER_DESC is computed here but not referenced
 * again anywhere in this function. */
int MAX_BLOCKS_PER_DESC;
MAX_BLOCKS_PER_DESC = ((journal->j_blocksize-sizeof(journal_header_t))
/ tag_bytes);
/*
* First thing is to establish what we expect to find in the log
* (in terms of transaction IDs), and where (in terms of log
* block offsets): query the superblock.
*/
sb = journal->j_superblock;
next_commit_ID = be32_to_cpu(sb->s_sequence);
next_log_block = be32_to_cpu(sb->s_start);
first_commit_ID = next_commit_ID;
if (pass == PASS_SCAN)
info->start_transaction = first_commit_ID;
jbd_debug(1, "Starting recovery pass %d\n", pass);
/*
* Now we walk through the log, transaction by transaction,
* making sure that each transaction has a commit block in the
* expected place. Each complete transaction gets replayed back
* into the main filesystem.
*/
while (1) {
int flags;
char * tagp;
journal_block_tag_t * tag;
struct buffer_head * obh;
struct buffer_head * nbh;
cond_resched();
/* If we already know where to stop the log traversal,
* check right now that we haven't gone past the end of
* the log. */
if (pass != PASS_SCAN)
if (tid_geq(next_commit_ID, info->end_transaction))
break;
jbd_debug(2, "Scanning for sequence ID %u at %lu/%lu\n",
next_commit_ID, next_log_block, journal->j_last);
/* Skip over each chunk of the transaction looking
* either the next descriptor block or the final commit
* record. */
jbd_debug(3, "JBD: checking block %ld\n", next_log_block);
err = jread(&bh, journal, next_log_block);
if (err)
goto failed;
next_log_block++;
wrap(journal, next_log_block);
/* What kind of buffer is it?
*
* If it is a descriptor block, check that it has the
* expected sequence number. Otherwise, we're all done
* here. */
tmp = (journal_header_t *)bh->b_data;
/* No journal magic: this block ends the log walk. */
if (tmp->h_magic != cpu_to_be32(JBD2_MAGIC_NUMBER)) {
brelse(bh);
break;
}
blocktype = be32_to_cpu(tmp->h_blocktype);
sequence = be32_to_cpu(tmp->h_sequence);
jbd_debug(3, "Found magic %d, sequence %d\n",
blocktype, sequence);
/* A block from a different transaction also ends the walk. */
if (sequence != next_commit_ID) {
brelse(bh);
break;
}
/* OK, we have a valid descriptor block which matches
* all of the sequence number checks. What are we going
* to do with it? That depends on the pass... */
switch(blocktype) {
case JBD2_DESCRIPTOR_BLOCK:
/* If it is a valid descriptor block, replay it
* in pass REPLAY; if journal_checksums enabled, then
* calculate checksums in PASS_SCAN, otherwise,
* just skip over the blocks it describes. */
if (pass != PASS_REPLAY) {
if (pass == PASS_SCAN &&
JBD2_HAS_COMPAT_FEATURE(journal,
JBD2_FEATURE_COMPAT_CHECKSUM) &&
!info->end_transaction) {
if (calc_chksums(journal, bh,
&next_log_block,
&crc32_sum)) {
put_bh(bh);
/* NOTE(review): this break leaves the switch, not the
 * while loop, so the walk resumes at the next log
 * block -- confirm that is intended. */
break;
}
put_bh(bh);
continue;
}
next_log_block += count_tags(journal, bh);
wrap(journal, next_log_block);
put_bh(bh);
continue;
}
/* A descriptor block: we can now write all of
* the data blocks. Yay, useful work is finally
* getting done here! */
tagp = &bh->b_data[sizeof(journal_header_t)];
/* Visit each tag that still fits inside the descriptor block. */
while ((tagp - bh->b_data + tag_bytes)
<= journal->j_blocksize) {
unsigned long io_block;
tag = (journal_block_tag_t *) tagp;
flags = be32_to_cpu(tag->t_flags);
io_block = next_log_block++;
wrap(journal, next_log_block);
err = jread(&obh, journal, io_block);
if (err) {
/* Recover what we can, but
* report failure at the end. */
success = err;
printk (KERN_ERR
"JBD: IO error %d recovering "
"block %ld in log\n",
err, io_block);
} else {
unsigned long long blocknr;
J_ASSERT(obh != NULL);
blocknr = read_tag_block(tag_bytes,
tag);
/* If the block has been
* revoked, then we're all done
* here. */
if (jbd2_journal_test_revoke
(journal, blocknr,
next_commit_ID)) {
brelse(obh);
++info->nr_revoke_hits;
goto skip_write;
}
/* Find a buffer for the new
* data being restored */
nbh = __getblk(journal->j_fs_dev,
blocknr,
journal->j_blocksize);
if (nbh == NULL) {
printk(KERN_ERR
"JBD: Out of memory "
"during recovery.\n");
err = -ENOMEM;
brelse(bh);
brelse(obh);
goto failed;
}
lock_buffer(nbh);
memcpy(nbh->b_data, obh->b_data,
journal->j_blocksize);
/* An escaped block has its first four bytes set back to the
 * journal magic number after the copy. */
if (flags & JBD2_FLAG_ESCAPE) {
*((__be32 *)nbh->b_data) =
cpu_to_be32(JBD2_MAGIC_NUMBER);
}
BUFFER_TRACE(nbh, "marking dirty");
set_buffer_uptodate(nbh);
mark_buffer_dirty(nbh);
BUFFER_TRACE(nbh, "marking uptodate");
++info->nr_replays;
/* ll_rw_block(WRITE, 1, &nbh); */
unlock_buffer(nbh);
brelse(obh);
brelse(nbh);
}
skip_write:
/* Step to the next tag; 16 extra bytes are skipped unless the
 * SAME_UUID flag is set. */
tagp += tag_bytes;
if (!(flags & JBD2_FLAG_SAME_UUID))
tagp += 16;
if (flags & JBD2_FLAG_LAST_TAG)
break;
}
brelse(bh);
continue;
case JBD2_COMMIT_BLOCK:
/* How to differentiate between interrupted commit
* and journal corruption ?
*
* {nth transaction}
* Checksum Verification Failed
* |
* ____________________
* | |
* async_commit sync_commit
* | |
* | GO TO NEXT "Journal Corruption"
* | TRANSACTION
* |
* {(n+1)th transanction}
* |
* _______|______________
* | |
* Commit block found Commit block not found
* | |
* "Journal Corruption" |
* _____________|_________
* | |
* nth trans corrupt OR nth trans
* and (n+1)th interrupted interrupted
* before commit block
* could reach the disk.
* (Cannot find the difference in above
* mentioned conditions. Hence assume
* "Interrupted Commit".)
*/
/* Found an expected commit block: if checksums
* are present verify them in PASS_SCAN; else not
* much to do other than move on to the next sequence
* number. */
if (pass == PASS_SCAN &&
JBD2_HAS_COMPAT_FEATURE(journal,
JBD2_FEATURE_COMPAT_CHECKSUM)) {
int chksum_err, chksum_seen;
struct commit_header *cbh =
(struct commit_header *)bh->b_data;
unsigned found_chksum =
be32_to_cpu(cbh->h_chksum[0]);
chksum_err = chksum_seen = 0;
/* end_transaction is already set, i.e. an earlier commit in this
 * scan failed its checksum: record it as the failed commit. */
if (info->end_transaction) {
journal->j_failed_commit =
info->end_transaction;
brelse(bh);
/* NOTE(review): this break exits the switch only; the
 * enclosing while loop keeps running. */
break;
}
if (crc32_sum == found_chksum &&
cbh->h_chksum_type == JBD2_CRC32_CHKSUM &&
cbh->h_chksum_size ==
JBD2_CRC32_CHKSUM_SIZE)
chksum_seen = 1;
else if (!(cbh->h_chksum_type == 0 &&
cbh->h_chksum_size == 0 &&
found_chksum == 0 &&
!chksum_seen))
/*
* If fs is mounted using an old kernel and then
* kernel with journal_chksum is used then we
* get a situation where the journal flag has
* checksum flag set but checksums are not
* present i.e chksum = 0, in the individual
* commit blocks.
* Hence to avoid checksum failures, in this
* situation, this extra check is added.
*/
chksum_err = 1;
if (chksum_err) {
info->end_transaction = next_commit_ID;
if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)){
journal->j_failed_commit =
next_commit_ID;
brelse(bh);
/* NOTE(review): exits the switch only, as above. */
break;
}
}
/* Restart the running checksum for the next transaction. */
crc32_sum = ~0;
}
brelse(bh);
next_commit_ID++;
continue;
case JBD2_REVOKE_BLOCK:
/* If we aren't in the REVOKE pass, then we can
* just skip over this block. */
if (pass != PASS_REVOKE) {
brelse(bh);
continue;
}
err = scan_revoke_records(journal, bh,
next_commit_ID, info);
brelse(bh);
if (err)
goto failed;
continue;
default:
jbd_debug(3, "Unrecognised magic %d, end of scan.\n",
blocktype);
brelse(bh);
goto done;
}
}
done:
/*
* We broke out of the log scan loop: either we came to the
* known end of the log or we found an unexpected block in the
* log. If the latter happened, then we know that the "current"
* transaction marks the end of the valid log.
*/
if (pass == PASS_SCAN) {
if (!info->end_transaction)
info->end_transaction = next_commit_ID;
} else {
/* It's really bad news if different passes end up at
* different places (but possible due to IO errors). */
if (info->end_transaction != next_commit_ID) {
printk (KERN_ERR "JBD: recovery pass %d ended at "
"transaction %u, expected %u\n",
pass, next_commit_ID, info->end_transaction);
if (!success)
success = -EIO;
}
}
return success;
failed:
return err;
}
/*
 * Scan a revoke record, marking all blocks mentioned as revoked.
 *
 * @journal:  journal being recovered
 * @bh:       buffer holding the revoke block read from the log
 * @sequence: transaction ID the revoke block belongs to
 * @info:     recovery statistics; nr_revokes is bumped per record
 *
 * Each record is 4 bytes, or 8 bytes when the 64BIT incompat feature is set.
 * The header's r_count gives the number of bytes in use in the block,
 * header included.
 *
 * Returns 0 on success, -EINVAL if r_count claims more bytes than a journal
 * block can hold, or the error from jbd2_journal_set_revoke().
 */
static int scan_revoke_records(journal_t *journal, struct buffer_head *bh,
			       tid_t sequence, struct recovery_info *info)
{
	jbd2_journal_revoke_header_t *header;
	unsigned int offset, max;
	unsigned int record_len = 4;

	header = (jbd2_journal_revoke_header_t *) bh->b_data;
	offset = sizeof(jbd2_journal_revoke_header_t);
	max = be32_to_cpu(header->r_count);

	/*
	 * r_count is read straight from disk.  Without this bound a corrupt
	 * value would make the loop below read past the end of bh->b_data.
	 */
	if (max > (unsigned int)journal->j_blocksize)
		return -EINVAL;

	if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT))
		record_len = 8;

	while (offset + record_len <= max) {
		unsigned long long blocknr;
		int err;

		if (record_len == 4)
			blocknr = be32_to_cpu(* ((__be32 *) (bh->b_data+offset)));
		else
			blocknr = be64_to_cpu(* ((__be64 *) (bh->b_data+offset)));
		offset += record_len;

		err = jbd2_journal_set_revoke(journal, blocknr, sequence);
		if (err)
			return err;
		++info->nr_revokes;
	}
	return 0;
}

714
kernel/fs/jbd2/revoke.c Normal file
View File

@@ -0,0 +1,714 @@
/*
* linux/fs/jbd2/revoke.c
*
* Written by Stephen C. Tweedie <sct@redhat.com>, 2000
*
* Copyright 2000 Red Hat corp --- All Rights Reserved
*
* This file is part of the Linux kernel and is made available under
* the terms of the GNU General Public License, version 2, or at your
* option, any later version, incorporated herein by reference.
*
* Journal revoke routines for the generic filesystem journaling code;
* part of the ext2fs journaling system.
*
* Revoke is the mechanism used to prevent old log records for deleted
* metadata from being replayed on top of newer data using the same
* blocks. The revoke mechanism is used in two separate places:
*
* + Commit: during commit we write the entire list of the current
* transaction's revoked blocks to the journal
*
* + Recovery: during recovery we record the transaction ID of all
* revoked blocks. If there are multiple revoke records in the log
* for a single block, only the last one counts, and if there is a log
* entry for a block beyond the last revoke, then that log entry still
* gets replayed.
*
* We can get interactions between revokes and new log data within a
* single transaction:
*
* Block is revoked and then journaled:
* The desired end result is the journaling of the new block, so we
* cancel the revoke before the transaction commits.
*
* Block is journaled and then revoked:
* The revoke must take precedence over the write of the block, so we
* need either to cancel the journal entry or to write the revoke
* later in the log than the log block. In this case, we choose the
* latter: journaling a block cancels any revoke record for that block
* in the current transaction, so any revoke for that block in the
* transaction must have happened after the block was journaled and so
* the revoke must take precedence.
*
* Block is revoked and then written as data:
* The data write is allowed to succeed, but the revoke is _not_
* cancelled. We still need to prevent old log records from
* overwriting the new data. We don't even need to clear the revoke
* bit here.
*
* Revoke information on buffers is a tri-state value:
*
* RevokeValid clear: no cached revoke status, need to look it up
* RevokeValid set, Revoked clear:
* buffer has not been revoked, and cancel_revoke
* need do nothing.
* RevokeValid set, Revoked set:
* buffer has been revoked.
*
* Locking rules:
* We keep two hash tables of revoke records. One hashtable belongs to the
* running transaction (is pointed to by journal->j_revoke), the other one
* belongs to the committing transaction. Accesses to the second hash table
* happen only from the kjournald and no other thread touches this table. Also
* journal_switch_revoke_table() which switches which hashtable belongs to the
* running and which to the committing transaction is called only from
* kjournald. Therefore we need no locks when accessing the hashtable belonging
* to the committing transaction.
*
* All users operating on the hash table belonging to the running transaction
* have a handle to the transaction. Therefore they are safe from kjournald
* switching hash tables under them. For operations on the lists of entries in
* the hash table j_revoke_lock is used.
*
* Finally, also replay code uses the hash tables but at this moment no one else
* can touch them (filesystem isn't mounted yet) and hence no locking is
* needed.
*/
#ifndef __KERNEL__
#include "jfs_user.h"
#else
#include <linux/time.h>
#include <linux/fs.h>
#include <linux/jbd2.h>
#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/list.h>
#include <linux/init.h>
#include <linux/bio.h>
#endif
#include <linux/log2.h>
/*
 * Slab caches backing struct jbd2_revoke_record_s and
 * struct jbd2_revoke_table_s.  Both are created by
 * jbd2_journal_init_revoke_caches() and torn down (and reset to NULL) by
 * jbd2_journal_destroy_revoke_caches().
 */
static struct kmem_cache *jbd2_revoke_record_cache;
static struct kmem_cache *jbd2_revoke_table_cache;
/* Each revoke record represents one single revoked block. During
journal replay, this involves recording the transaction ID of the
last transaction to revoke this block.

NOTE: 'hash' has to stay the first member.  Code in this file casts a
bucket's list_head pointer (e.g. hash_list->next) directly to a
struct jbd2_revoke_record_s pointer instead of using list_entry(). */
struct jbd2_revoke_record_s
{
struct list_head hash; /* Link in the hash bucket list */
tid_t sequence; /* Used for recovery only */
unsigned long long blocknr; /* Block number this record revokes */
};
/* The revoke table is just a simple hash table of revoke records. */
struct jbd2_revoke_table_s
{
/* It is conceivable that we might want a larger hash table
* for recovery. Must be a power of two. */
int hash_size; /* Number of buckets in hash_table */
int hash_shift; /* log2(hash_size), computed when the table is set up */
struct list_head *hash_table; /* kmalloc'ed array of hash_size list heads */
};
#ifdef __KERNEL__
/* Forward declarations for the commit-time helpers defined further down;
 * they are only built when __KERNEL__ is defined. */
static void write_one_revoke_record(journal_t *, transaction_t *,
struct journal_head **, int *,
struct jbd2_revoke_record_s *, int);
static void flush_descriptor(journal_t *, struct journal_head *, int, int);
#endif
/* Utility functions to maintain the revoke table */
/*
 * Map a block number to a bucket index in the journal's current revoke
 * table (journal->j_revoke).
 *
 * The mixing is derived from the old buffer.c block hash, but it is done in
 * unsigned arithmetic and never shifts by a negative count.  The original
 * expression evaluated "hash << (hash_shift - 12)" (and "- 6"), which is
 * undefined behaviour whenever hash_shift is smaller than 12 (or 6).  Where
 * the shift amount would be negative we shift right by the difference
 * instead.
 *
 * The result is always in [0, hash_size - 1].  The table only lives in
 * memory (it is kmalloc'ed when the journal is set up), so the exact
 * bucket a block lands in does not need to match the old function.
 */
static inline int hash(journal_t *journal, unsigned long long block)
{
	struct jbd2_revoke_table_s *table = journal->j_revoke;
	int hash_shift = table->hash_shift;
	/* Fold the 64-bit block number into 32 bits. */
	unsigned int folded = (unsigned int)block ^ (unsigned int)(block >> 32);
	unsigned int mixed = folded >> 13;

	if (hash_shift >= 6)
		mixed ^= folded << (hash_shift - 6);
	else
		mixed ^= folded >> (6 - hash_shift);

	if (hash_shift >= 12)
		mixed ^= folded << (hash_shift - 12);
	else
		mixed ^= folded >> (12 - hash_shift);

	/* hash_size is a power of two, so this mask selects a valid bucket. */
	return (int)(mixed & (unsigned int)(table->hash_size - 1));
}
/*
 * Allocate a revoke record for (blocknr, seq) and add it to the matching
 * bucket of the journal's current revoke table.
 *
 * Returns 0 on success.  If the allocation fails and journal_oom_retry is
 * false, returns -ENOMEM; otherwise the allocation is retried after a
 * yield() until it succeeds.
 */
static int insert_revoke_hash(journal_t *journal, unsigned long long blocknr,
			      tid_t seq)
{
	struct jbd2_revoke_record_s *rec;
	struct list_head *bucket;

	for (;;) {
		rec = kmem_cache_alloc(jbd2_revoke_record_cache, GFP_NOFS);
		if (rec)
			break;
		if (!journal_oom_retry)
			return -ENOMEM;
		jbd_debug(1, "ENOMEM in %s, retrying\n", __func__);
		yield();
	}

	rec->sequence = seq;
	rec->blocknr = blocknr;
	bucket = &journal->j_revoke->hash_table[hash(journal, blocknr)];

	/* Only the list manipulation itself is done under j_revoke_lock. */
	spin_lock(&journal->j_revoke_lock);
	list_add(&rec->hash, bucket);
	spin_unlock(&journal->j_revoke_lock);
	return 0;
}
/*
 * Look up the revoke record for blocknr in the journal's current revoke
 * table.  The bucket is walked under j_revoke_lock; the lock is dropped
 * again before returning.
 *
 * Returns the first matching record, or NULL if the bucket has none.
 */
static struct jbd2_revoke_record_s *find_revoke_record(journal_t *journal,
						      unsigned long long blocknr)
{
	struct jbd2_revoke_record_s *found = NULL;
	struct list_head *bucket;
	struct list_head *pos;

	bucket = &journal->j_revoke->hash_table[hash(journal, blocknr)];

	spin_lock(&journal->j_revoke_lock);
	for (pos = bucket->next; pos != bucket; pos = pos->next) {
		/* 'hash' is the first member, so the node is the record. */
		struct jbd2_revoke_record_s *rec =
			(struct jbd2_revoke_record_s *) pos;

		if (rec->blocknr == blocknr) {
			found = rec;
			break;
		}
	}
	spin_unlock(&journal->j_revoke_lock);

	return found;
}
/*
 * Destroy whichever of the two revoke slab caches currently exist and reset
 * their pointers to NULL, record cache first, then table cache.
 */
void jbd2_journal_destroy_revoke_caches(void)
{
	struct kmem_cache **slots[2] = {
		&jbd2_revoke_record_cache,
		&jbd2_revoke_table_cache,
	};
	int idx;

	for (idx = 0; idx < 2; idx++) {
		if (!*slots[idx])
			continue;
		kmem_cache_destroy(*slots[idx]);
		*slots[idx] = NULL;
	}
}
/*
 * Create the slab caches for revoke records and revoke tables.  Neither
 * cache may exist yet.
 *
 * Returns 0 on success or -ENOMEM.  If the second cache cannot be created,
 * the first one is destroyed again before returning.
 */
int __init jbd2_journal_init_revoke_caches(void)
{
	J_ASSERT(!jbd2_revoke_record_cache);
	J_ASSERT(!jbd2_revoke_table_cache);

	jbd2_revoke_record_cache =
		kmem_cache_create("jbd2_revoke_record",
				  sizeof(struct jbd2_revoke_record_s),
				  0,
				  SLAB_HWCACHE_ALIGN|SLAB_TEMPORARY,
				  NULL);
	if (!jbd2_revoke_record_cache)
		return -ENOMEM;

	jbd2_revoke_table_cache =
		kmem_cache_create("jbd2_revoke_table",
				  sizeof(struct jbd2_revoke_table_s),
				  0, SLAB_TEMPORARY, NULL);
	if (jbd2_revoke_table_cache)
		return 0;

	/* Second cache failed: undo the first before reporting failure. */
	jbd2_journal_destroy_revoke_caches();
	return -ENOMEM;
}
/*
 * Allocate one revoke table with hash_size buckets, every bucket starting
 * out as an empty list.  hash_shift is set to the number of times
 * hash_size can be halved before reaching zero, minus one.
 *
 * Returns the new table, or NULL if either allocation fails.
 */
static struct jbd2_revoke_table_s *jbd2_journal_init_revoke_table(int hash_size)
{
	struct jbd2_revoke_table_s *tbl;
	int log2_size = 0;
	int n;

	tbl = kmem_cache_alloc(jbd2_revoke_table_cache, GFP_KERNEL);
	if (!tbl)
		return NULL;

	for (n = hash_size >> 1; n != 0; n >>= 1)
		log2_size++;

	tbl->hash_size = hash_size;
	tbl->hash_shift = log2_size;
	tbl->hash_table =
		kmalloc(hash_size * sizeof(struct list_head), GFP_KERNEL);
	if (!tbl->hash_table) {
		kmem_cache_free(jbd2_revoke_table_cache, tbl);
		return NULL;
	}

	for (n = 0; n < hash_size; n++)
		INIT_LIST_HEAD(&tbl->hash_table[n]);

	return tbl;
}
/*
 * Free a revoke table and its bucket array.  Every bucket is asserted to be
 * empty first.
 */
static void jbd2_journal_destroy_revoke_table(struct jbd2_revoke_table_s *table)
{
	int idx = 0;

	while (idx < table->hash_size) {
		J_ASSERT(list_empty(&table->hash_table[idx]));
		idx++;
	}

	kfree(table->hash_table);
	kmem_cache_free(jbd2_revoke_table_cache, table);
}
/*
 * Initialise the revoke table for a given journal to a given size.
 *
 * Two tables of hash_size buckets are allocated; j_revoke initially points
 * at the second one.  hash_size must be a power of two, and the journal
 * must not already have a first revoke table.
 *
 * Returns 0 on success or -ENOMEM.  On failure no table is left allocated,
 * and j_revoke_table[0] is reset to NULL so that a later
 * jbd2_journal_destroy_revoke() cannot free it a second time.
 */
int jbd2_journal_init_revoke(journal_t *journal, int hash_size)
{
	J_ASSERT(journal->j_revoke_table[0] == NULL);
	J_ASSERT(is_power_of_2(hash_size));

	journal->j_revoke_table[0] = jbd2_journal_init_revoke_table(hash_size);
	if (!journal->j_revoke_table[0])
		goto fail0;

	journal->j_revoke_table[1] = jbd2_journal_init_revoke_table(hash_size);
	if (!journal->j_revoke_table[1])
		goto fail1;

	journal->j_revoke = journal->j_revoke_table[1];

	spin_lock_init(&journal->j_revoke_lock);

	return 0;

fail1:
	jbd2_journal_destroy_revoke_table(journal->j_revoke_table[0]);
	/* Do not leave a dangling pointer to the table we just freed. */
	journal->j_revoke_table[0] = NULL;
fail0:
	return -ENOMEM;
}
/*
 * Destroy a journal's revoke tables.  The tables must already be empty!
 * j_revoke is cleared first, then each table that exists is freed.
 */
void jbd2_journal_destroy_revoke(journal_t *journal)
{
	int idx;

	journal->j_revoke = NULL;

	for (idx = 0; idx < 2; idx++) {
		struct jbd2_revoke_table_s *tbl = journal->j_revoke_table[idx];

		if (tbl)
			jbd2_journal_destroy_revoke_table(tbl);
	}
}
#ifdef __KERNEL__
/*
* jbd2_journal_revoke: revoke a given buffer_head from the journal. This
* prevents the block from being replayed during recovery if we take a
* crash after this current transaction commits. Any subsequent
* metadata writes of the buffer in this transaction cancel the
* revoke.
*
* Note that this call may block --- it is up to the caller to make
* sure that there are no further calls to journal_write_metadata
* before the revoke is complete. In ext3, this implies calling the
* revoke before clearing the block bitmap when we are deleting
* metadata.
*
* Revoke performs a jbd2_journal_forget on any buffer_head passed in as a
* parameter, but does _not_ forget the buffer_head if the bh was only
* found implicitly.
*
* bh_in may not be a journalled buffer - it may have come off
* the hash tables without an attached journal_head.
*
* If bh_in is non-zero, jbd2_journal_revoke() will decrement its b_count
* by one.
*
* Returns -EINVAL if the REVOKE incompat feature cannot be set on the
* journal, -EIO if the buffer is already marked revoked, and otherwise the
* result of insert_revoke_hash() (0 or -ENOMEM).
*/
int jbd2_journal_revoke(handle_t *handle, unsigned long long blocknr,
struct buffer_head *bh_in)
{
struct buffer_head *bh = NULL;
journal_t *journal;
struct block_device *bdev;
int err;
might_sleep();
if (bh_in)
BUFFER_TRACE(bh_in, "enter");
journal = handle->h_transaction->t_journal;
if (!jbd2_journal_set_features(journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)){
J_ASSERT (!"Cannot set revoke feature!");
return -EINVAL;
}
bdev = journal->j_fs_dev;
bh = bh_in;
/* No buffer supplied by the caller: look for one that is already cached. */
if (!bh) {
bh = __find_get_block(bdev, blocknr, journal->j_blocksize);
if (bh)
BUFFER_TRACE(bh, "found on hash");
}
/* The else below pairs with the "if (!bh)" above and is only compiled in
 * when JBD2_EXPENSIVE_CHECKING is defined. */
#ifdef JBD2_EXPENSIVE_CHECKING
else {
struct buffer_head *bh2;
/* If there is a different buffer_head lying around in
* memory anywhere... */
bh2 = __find_get_block(bdev, blocknr, journal->j_blocksize);
if (bh2) {
/* ... and it has RevokeValid status... */
if (bh2 != bh && buffer_revokevalid(bh2))
/* ...then it better be revoked too,
* since it's illegal to create a revoke
* record against a buffer_head which is
* not marked revoked --- that would
* risk missing a subsequent revoke
* cancel. */
J_ASSERT_BH(bh2, buffer_revoked(bh2));
put_bh(bh2);
}
}
#endif
/* We really ought not ever to revoke twice in a row without
first having the revoke cancelled: it's illegal to free a
block twice without allocating it in between! */
if (bh) {
if (!J_EXPECT_BH(bh, !buffer_revoked(bh),
"inconsistent data on disk")) {
/* Only drop the reference we took ourselves via
 * __find_get_block(); a caller-supplied bh_in is left alone
 * on this error path. */
if (!bh_in)
brelse(bh);
return -EIO;
}
set_buffer_revoked(bh);
set_buffer_revokevalid(bh);
if (bh_in) {
BUFFER_TRACE(bh_in, "call jbd2_journal_forget");
jbd2_journal_forget(handle, bh_in);
} else {
BUFFER_TRACE(bh, "call brelse");
__brelse(bh);
}
}
jbd_debug(2, "insert revoke for block %llu, bh_in=%p\n",blocknr, bh_in);
err = insert_revoke_hash(journal, blocknr,
handle->h_transaction->t_tid);
/* NOTE(review): bh_in may be NULL here; presumably BUFFER_TRACE tolerates
 * that -- confirm against its definition. */
BUFFER_TRACE(bh_in, "exit");
return err;
}
/*
* Cancel an outstanding revoke. For use only internally by the
* journaling code (called from jbd2_journal_get_write_access).
*
* We trust buffer_revoked() on the buffer if the buffer is already
* being journaled: if there is no revoke pending on the buffer, then we
* don't do anything here.
*
* This would break if it were possible for a buffer to be revoked and
* discarded, and then reallocated within the same transaction. In such
* a case we would have lost the revoked bit, but when we arrived here
* the second time we would still have a pending revoke to cancel. So,
* do not trust the Revoked bit on buffers unless RevokeValid is also
* set.
*
* Returns 1 if a revoke record for the buffer's block was found and
* removed, 0 otherwise.
*/
int jbd2_journal_cancel_revoke(handle_t *handle, struct journal_head *jh)
{
struct jbd2_revoke_record_s *record;
journal_t *journal = handle->h_transaction->t_journal;
int need_cancel;
int did_revoke = 0; /* akpm: debug */
struct buffer_head *bh = jh2bh(jh);
jbd_debug(4, "journal_head %p, cancelling revoke\n", jh);
/* Is the existing Revoke bit valid? If so, we trust it, and
* only perform the full cancel if the revoke bit is set. If
* not, we can't trust the revoke bit, and we need to do the
* full search for a revoke record. */
if (test_set_buffer_revokevalid(bh)) {
need_cancel = test_clear_buffer_revoked(bh);
} else {
need_cancel = 1;
clear_buffer_revoked(bh);
}
if (need_cancel) {
record = find_revoke_record(journal, bh->b_blocknr);
if (record) {
jbd_debug(4, "cancelled existing revoke on "
"blocknr %llu\n", (unsigned long long)bh->b_blocknr);
/* find_revoke_record() has already released j_revoke_lock, so it
 * is taken again here just for the unlink. */
spin_lock(&journal->j_revoke_lock);
list_del(&record->hash);
spin_unlock(&journal->j_revoke_lock);
kmem_cache_free(jbd2_revoke_record_cache, record);
did_revoke = 1;
}
}
#ifdef JBD2_EXPENSIVE_CHECKING
/* There better not be one left behind by now! */
record = find_revoke_record(journal, bh->b_blocknr);
J_ASSERT_JH(jh, record == NULL);
#endif
/* Finally, have we just cleared revoke on an unhashed
* buffer_head? If so, we'd better make sure we clear the
* revoked status on any hashed alias too, otherwise the revoke
* state machine will get very upset later on. */
if (need_cancel) {
struct buffer_head *bh2;
bh2 = __find_get_block(bh->b_bdev, bh->b_blocknr, bh->b_size);
if (bh2) {
if (bh2 != bh)
clear_buffer_revoked(bh2);
__brelse(bh2);
}
}
return did_revoke;
}
/*
 * jbd2_journal_switch_revoke_table: make j_revoke point at the other of the
 * two revoke tables, for use by the next transaction, so that processing
 * need not be suspended until all revokes are written -bzzz
 *
 * Every bucket of the newly selected table is re-initialised to an empty
 * list.
 */
void jbd2_journal_switch_revoke_table(journal_t *journal)
{
	struct jbd2_revoke_table_s *next;
	int idx;

	next = (journal->j_revoke == journal->j_revoke_table[0]) ?
		journal->j_revoke_table[1] : journal->j_revoke_table[0];
	journal->j_revoke = next;

	for (idx = 0; idx < next->hash_size; idx++)
		INIT_LIST_HEAD(&next->hash_table[idx]);
}
/*
 * Emit a revoke record to the journal for every entry in the committing
 * transaction's revoke table, freeing each entry once it has been handed
 * to write_one_revoke_record().  A partially filled descriptor left over
 * at the end is flushed.
 */
void jbd2_journal_write_revoke_records(journal_t *journal,
				       transaction_t *transaction,
				       int write_op)
{
	struct jbd2_revoke_table_s *committing;
	struct journal_head *desc = NULL;
	struct list_head *bucket;
	int pos = 0;
	int written = 0;
	int idx;

	/* The committing table is the one j_revoke does NOT point at. */
	if (journal->j_revoke == journal->j_revoke_table[0])
		committing = journal->j_revoke_table[1];
	else
		committing = journal->j_revoke_table[0];

	for (idx = 0; idx < committing->hash_size; idx++) {
		bucket = &committing->hash_table[idx];

		/* Drain the bucket from the front until it is empty. */
		while (!list_empty(bucket)) {
			struct jbd2_revoke_record_s *rec =
				(struct jbd2_revoke_record_s *) bucket->next;

			write_one_revoke_record(journal, transaction,
						&desc, &pos,
						rec, write_op);
			written++;
			list_del(&rec->hash);
			kmem_cache_free(jbd2_revoke_record_cache, rec);
		}
	}

	if (desc)
		flush_descriptor(journal, desc, pos, write_op);
	jbd_debug(1, "Wrote %d revoke records\n", written);
}
/*
 * Append one revoke record to the current revoke descriptor block.
 *
 * *descriptorp / *offsetp carry the descriptor in use and the write offset
 * inside it between calls.  A fresh descriptor is obtained when there is
 * none yet, or after the current one has been flushed because the offset
 * reached the journal block size.  The record is stored as a big-endian
 * 64-bit value when the 64BIT incompat feature is set, otherwise as a
 * big-endian 32-bit value.
 */
static void write_one_revoke_record(journal_t *journal,
				    transaction_t *transaction,
				    struct journal_head **descriptorp,
				    int *offsetp,
				    struct jbd2_revoke_record_s *record,
				    int write_op)
{
	struct journal_head *desc;
	journal_header_t *hdr;
	char *slot;
	int pos;

	/*
	 * On an aborted journal nothing is written.  The caller still loops
	 * over every record so that they all get freed; only the journal IO
	 * is skipped.
	 */
	if (is_journal_aborted(journal))
		return;

	desc = *descriptorp;
	pos = *offsetp;

	/* Current descriptor is full: send it out and start a new one. */
	if (desc && pos == journal->j_blocksize) {
		flush_descriptor(journal, desc, pos, write_op);
		desc = NULL;
	}

	if (!desc) {
		desc = jbd2_journal_get_descriptor_buffer(journal);
		if (!desc)
			return;

		hdr = (journal_header_t *) &jh2bh(desc)->b_data[0];
		hdr->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
		hdr->h_blocktype = cpu_to_be32(JBD2_REVOKE_BLOCK);
		hdr->h_sequence = cpu_to_be32(transaction->t_tid);

		/* File it so the commit code can wait for the IO later. */
		JBUFFER_TRACE(desc, "file as BJ_LogCtl");
		jbd2_journal_file_buffer(desc, transaction, BJ_LogCtl);

		/* Records start right after the revoke header. */
		pos = sizeof(jbd2_journal_revoke_header_t);
		*descriptorp = desc;
	}

	slot = &jh2bh(desc)->b_data[pos];
	if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT)) {
		*((__be64 *)slot) = cpu_to_be64(record->blocknr);
		pos += 8;
	} else {
		*((__be32 *)slot) = cpu_to_be32(record->blocknr);
		pos += 4;
	}

	*offsetp = pos;
}
/*
 * Send a revoke descriptor block to the journal.
 *
 * On an aborted journal the buffer reference is simply dropped.  Otherwise
 * the number of bytes used is stored in the header's r_count, the buffer is
 * flagged jwrite and dirty, and it is submitted with ll_rw_block() -- as
 * SWRITE when write_op is WRITE, as SWRITE_SYNC_PLUG for anything else.
 */
static void flush_descriptor(journal_t *journal,
			     struct journal_head *descriptor,
			     int offset, int write_op)
{
	struct buffer_head *bh = jh2bh(descriptor);
	jbd2_journal_revoke_header_t *rh;
	int rw;

	if (is_journal_aborted(journal)) {
		put_bh(bh);
		return;
	}

	rh = (jbd2_journal_revoke_header_t *) bh->b_data;
	rh->r_count = cpu_to_be32(offset);

	set_buffer_jwrite(bh);
	BUFFER_TRACE(bh, "write");
	set_buffer_dirty(bh);

	rw = (write_op == WRITE) ? SWRITE : SWRITE_SYNC_PLUG;
	ll_rw_block(rw, 1, &bh);
}
#endif
/*
* Revoke support for recovery.
*
* Recovery needs to be able to:
*
* record all revoke records, including the tid of the latest instance
* of each revoke in the journal
*
* check whether a given block in a given transaction should be replayed
* (ie. has not been revoked by a revoke record in that or a subsequent
* transaction)
*
* empty the revoke table after recovery.
*/
/*
 * Setting revoke records during the recovery scan.  The first revoke seen
 * for a block creates a record; further revokes for the same block only
 * raise the stored sequence number when theirs is later.
 *
 * Returns 0, or the error from insert_revoke_hash() when a new record had
 * to be created.
 */
int jbd2_journal_set_revoke(journal_t *journal,
			    unsigned long long blocknr,
			    tid_t sequence)
{
	struct jbd2_revoke_record_s *existing;

	existing = find_revoke_record(journal, blocknr);
	if (!existing)
		return insert_revoke_hash(journal, blocknr, sequence);

	/* Keep only the latest sequence number in the hashed record. */
	if (tid_gt(sequence, existing->sequence))
		existing->sequence = sequence;
	return 0;
}
/*
 * Test revoke records.  For a block referenced in the log by the
 * transaction with the given sequence number, has that block been revoked?
 * A revoke record covers its own transaction and all earlier ones; blocks
 * logged by later transactions still need to be replayed.
 *
 * Returns 1 if the block is revoked for this transaction, 0 otherwise.
 */
int jbd2_journal_test_revoke(journal_t *journal,
			     unsigned long long blocknr,
			     tid_t sequence)
{
	struct jbd2_revoke_record_s *rec;

	rec = find_revoke_record(journal, blocknr);

	return (rec && !tid_gt(sequence, rec->sequence)) ? 1 : 0;
}
/*
 * Finally, once recovery is over, empty the journal's current revoke table
 * so that it can be reused by the running filesystem.  Every record in
 * every bucket is unlinked and returned to the record cache.
 */
void jbd2_journal_clear_revoke(journal_t *journal)
{
	struct jbd2_revoke_table_s *tbl = journal->j_revoke;
	int idx;

	for (idx = 0; idx < tbl->hash_size; idx++) {
		struct list_head *bucket = &tbl->hash_table[idx];

		/* Pop from the front until the bucket points at itself. */
		while (bucket->next != bucket) {
			struct jbd2_revoke_record_s *rec =
				(struct jbd2_revoke_record_s *) bucket->next;

			list_del(&rec->hash);
			kmem_cache_free(jbd2_revoke_record_cache, rec);
		}
	}
}

2134
kernel/fs/jbd2/transaction.c Normal file

File diff suppressed because it is too large Load Diff