add idl4k kernel firmware version 1.13.0.105

Jaroslav Kysela
2015-03-26 17:22:37 +01:00
parent 5194d2792e
commit e9070cdc77
31064 changed files with 12769984 additions and 0 deletions


@@ -0,0 +1,8 @@
EXTRA_CFLAGS += -Ifs/ocfs2
obj-$(CONFIG_OCFS2_FS_O2CB) += ocfs2_dlm.o ocfs2_dlmfs.o
ocfs2_dlm-objs := dlmdomain.o dlmdebug.o dlmthread.o dlmrecovery.o \
dlmmaster.o dlmast.o dlmconvert.o dlmlock.o dlmunlock.o dlmver.o
ocfs2_dlmfs-objs := userdlm.o dlmfs.o dlmfsver.o


@@ -0,0 +1,220 @@
/* -*- mode: c; c-basic-offset: 8; -*-
* vim: noexpandtab sw=8 ts=8 sts=0:
*
* dlmapi.h
*
* externally exported dlm interfaces
*
* Copyright (C) 2004 Oracle. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 021110-1307, USA.
*
*/
#ifndef DLMAPI_H
#define DLMAPI_H
struct dlm_lock;
struct dlm_ctxt;
/* NOTE: changes made to this enum should be reflected in dlmdebug.c */
enum dlm_status {
DLM_NORMAL = 0, /* 0: request in progress */
DLM_GRANTED, /* 1: request granted */
DLM_DENIED, /* 2: request denied */
DLM_DENIED_NOLOCKS, /* 3: request denied, out of system resources */
DLM_WORKING, /* 4: async request in progress */
DLM_BLOCKED, /* 5: lock request blocked */
DLM_BLOCKED_ORPHAN, /* 6: lock request blocked by an orphan lock */
DLM_DENIED_GRACE_PERIOD, /* 7: topological change in progress */
DLM_SYSERR, /* 8: system error */
DLM_NOSUPPORT, /* 9: unsupported */
DLM_CANCELGRANT, /* 10: can't cancel convert: already granted */
DLM_IVLOCKID, /* 11: bad lockid */
DLM_SYNC, /* 12: synchronous request granted */
DLM_BADTYPE, /* 13: bad resource type */
DLM_BADRESOURCE, /* 14: bad resource handle */
DLM_MAXHANDLES, /* 15: no more resource handles */
DLM_NOCLINFO, /* 16: can't contact cluster manager */
DLM_NOLOCKMGR, /* 17: can't contact lock manager */
DLM_NOPURGED, /* 18: can't contact purge daemon */
DLM_BADARGS, /* 19: bad api args */
DLM_VOID, /* 20: no status */
DLM_NOTQUEUED, /* 21: NOQUEUE was specified and request failed */
DLM_IVBUFLEN, /* 22: invalid resource name length */
DLM_CVTUNGRANT, /* 23: attempted to convert ungranted lock */
DLM_BADPARAM, /* 24: invalid lock mode specified */
DLM_VALNOTVALID, /* 25: value block has been invalidated */
DLM_REJECTED, /* 26: request rejected, unrecognized client */
DLM_ABORT, /* 27: blocked lock request cancelled */
DLM_CANCEL, /* 28: conversion request cancelled */
DLM_IVRESHANDLE, /* 29: invalid resource handle */
DLM_DEADLOCK, /* 30: deadlock recovery refused this request */
DLM_DENIED_NOASTS, /* 31: failed to allocate AST */
DLM_FORWARD, /* 32: request must wait for primary's response */
DLM_TIMEOUT, /* 33: timeout value for lock has expired */
DLM_IVGROUPID, /* 34: invalid group specification */
DLM_VERS_CONFLICT, /* 35: version conflicts prevent request handling */
DLM_BAD_DEVICE_PATH, /* 36: Locks device does not exist or path wrong */
DLM_NO_DEVICE_PERMISSION, /* 37: Client has insufficient perms for device */
DLM_NO_CONTROL_DEVICE, /* 38: Cannot set options on opened device */
DLM_RECOVERING, /* 39: extension, allows caller to fail a lock
request if it is being recovered */
DLM_MIGRATING, /* 40: extension, allows caller to fail a lock
request if it is being migrated */
DLM_MAXSTATS, /* 41: upper limit for return code validation */
};
/* for pretty-printing dlm_status error messages */
const char *dlm_errmsg(enum dlm_status err);
/* for pretty-printing dlm_status error names */
const char *dlm_errname(enum dlm_status err);
/* Eventually the DLM will use standard errno values, but in the
* meantime this lets us track dlm errors as they bubble up. When we
* bring its error reporting into line with the rest of the stack,
* these can just be replaced with calls to mlog_errno. */
#define dlm_error(st) do { \
if ((st) != DLM_RECOVERING && \
(st) != DLM_MIGRATING && \
(st) != DLM_FORWARD) \
mlog(ML_ERROR, "dlm status = %s\n", dlm_errname((st))); \
} while (0)
#define DLM_LKSB_UNUSED1 0x01
#define DLM_LKSB_PUT_LVB 0x02
#define DLM_LKSB_GET_LVB 0x04
#define DLM_LKSB_UNUSED2 0x08
#define DLM_LKSB_UNUSED3 0x10
#define DLM_LKSB_UNUSED4 0x20
#define DLM_LKSB_UNUSED5 0x40
#define DLM_LKSB_UNUSED6 0x80
#define DLM_LVB_LEN 64
/* Callers are only allowed access to the lvb and status members of
* this struct. */
struct dlm_lockstatus {
enum dlm_status status;
u32 flags;
struct dlm_lock *lockid;
char lvb[DLM_LVB_LEN];
};
/* Valid lock modes. */
#define LKM_IVMODE (-1) /* invalid mode */
#define LKM_NLMODE 0 /* null lock */
#define LKM_CRMODE 1 /* concurrent read (unsupported) */
#define LKM_CWMODE 2 /* concurrent write (unsupported) */
#define LKM_PRMODE 3 /* protected read */
#define LKM_PWMODE 4 /* protected write (unsupported) */
#define LKM_EXMODE 5 /* exclusive */
#define LKM_MAXMODE 5
#define LKM_MODEMASK 0xff
/* Flags passed to dlmlock and dlmunlock:
* reserved: flags used by the "real" dlm
* only a few are supported by this dlm
* (U) = unsupported by ocfs2 dlm */
#define LKM_ORPHAN 0x00000010 /* this lock is orphanable (U) */
#define LKM_PARENTABLE 0x00000020 /* this lock was orphaned (U) */
#define LKM_BLOCK 0x00000040 /* blocking lock request (U) */
#define LKM_LOCAL 0x00000080 /* local lock request */
#define LKM_VALBLK 0x00000100 /* lock value block request */
#define LKM_NOQUEUE 0x00000200 /* non blocking request */
#define LKM_CONVERT 0x00000400 /* conversion request */
#define LKM_NODLCKWT 0x00000800 /* this lock won't deadlock (U) */
#define LKM_UNLOCK 0x00001000 /* deallocate this lock */
#define LKM_CANCEL 0x00002000 /* cancel conversion request */
#define LKM_DEQALL 0x00004000 /* remove all locks held by proc (U) */
#define LKM_INVVALBLK 0x00008000 /* invalidate lock value block */
#define LKM_SYNCSTS 0x00010000 /* return synchronous status if poss (U) */
#define LKM_TIMEOUT 0x00020000 /* lock request contains timeout (U) */
#define LKM_SNGLDLCK 0x00040000 /* request can self-deadlock (U) */
#define LKM_FINDLOCAL 0x00080000 /* find local lock request (U) */
#define LKM_PROC_OWNED 0x00100000 /* owned by process, not group (U) */
#define LKM_XID 0x00200000 /* use transaction id for deadlock (U) */
#define LKM_XID_CONFLICT 0x00400000 /* do not allow lock inheritance (U) */
#define LKM_FORCE 0x00800000 /* force unlock flag */
#define LKM_REVVALBLK 0x01000000 /* temporary solution: re-validate
lock value block (U) */
/* unused */
#define LKM_UNUSED1 0x00000001 /* unused */
#define LKM_UNUSED2 0x00000002 /* unused */
#define LKM_UNUSED3 0x00000004 /* unused */
#define LKM_UNUSED4 0x00000008 /* unused */
#define LKM_UNUSED5 0x02000000 /* unused */
#define LKM_UNUSED6 0x04000000 /* unused */
#define LKM_UNUSED7 0x08000000 /* unused */
/* ocfs2 extensions: internal only
* should never be used by caller */
#define LKM_MIGRATION 0x10000000 /* extension: lockres is to be migrated
to another node */
#define LKM_PUT_LVB 0x20000000 /* extension: lvb is being passed
should be applied to lockres */
#define LKM_GET_LVB 0x40000000 /* extension: lvb should be copied
from lockres when lock is granted */
#define LKM_RECOVERY 0x80000000 /* extension: flag for recovery lock
used to avoid recovery rwsem */
typedef void (dlm_astlockfunc_t)(void *);
typedef void (dlm_bastlockfunc_t)(void *, int);
typedef void (dlm_astunlockfunc_t)(void *, enum dlm_status);
enum dlm_status dlmlock(struct dlm_ctxt *dlm,
int mode,
struct dlm_lockstatus *lksb,
int flags,
const char *name,
int namelen,
dlm_astlockfunc_t *ast,
void *data,
dlm_bastlockfunc_t *bast);
enum dlm_status dlmunlock(struct dlm_ctxt *dlm,
struct dlm_lockstatus *lksb,
int flags,
dlm_astunlockfunc_t *unlockast,
void *data);
struct dlm_protocol_version {
u8 pv_major;
u8 pv_minor;
};
struct dlm_ctxt * dlm_register_domain(const char *domain, u32 key,
struct dlm_protocol_version *fs_proto);
void dlm_unregister_domain(struct dlm_ctxt *dlm);
void dlm_print_one_lock(struct dlm_lock *lockid);
typedef void (dlm_eviction_func)(int, void *);
struct dlm_eviction_cb {
struct list_head ec_item;
dlm_eviction_func *ec_func;
void *ec_data;
};
void dlm_setup_eviction_cb(struct dlm_eviction_cb *cb,
dlm_eviction_func *f,
void *data);
void dlm_register_eviction_cb(struct dlm_ctxt *dlm,
struct dlm_eviction_cb *cb);
void dlm_unregister_eviction_cb(struct dlm_eviction_cb *cb);
#endif /* DLMAPI_H */
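
A minimal, hypothetical sketch of how an in-kernel caller might use the interface declared above: register a domain, request an exclusive lock with AST/BAST callbacks, and drop it again. The domain name, key, protocol version and callback bodies are illustrative assumptions and are not part of this commit.

/* Hypothetical usage sketch of dlmapi.h; all names and values here are
 * illustrative assumptions, not part of this commit. */
#include <linux/err.h>
#include <linux/string.h>
#include "dlmapi.h"

static void example_ast(void *astdata)
{
	/* called when the lock (or a conversion) is granted */
}

static void example_bast(void *astdata, int blocked_type)
{
	/* another node wants a conflicting lock of blocked_type; the
	 * holder should downconvert or unlock soon */
}

static void example_unlock_ast(void *astdata, enum dlm_status st)
{
	/* unlock/cancel completed with status st */
}

static int example_use_dlm(void)
{
	struct dlm_protocol_version proto = { .pv_major = 1, .pv_minor = 0 };
	struct dlm_lockstatus lksb = {0};
	struct dlm_ctxt *dlm;
	enum dlm_status st;
	const char *name = "example-resource";

	dlm = dlm_register_domain("example-domain", 0x12345678, &proto);
	if (IS_ERR(dlm))
		return PTR_ERR(dlm);

	/* LKM_NOQUEUE: fail with DLM_NOTQUEUED instead of waiting */
	st = dlmlock(dlm, LKM_EXMODE, &lksb, LKM_NOQUEUE, name, strlen(name),
		     example_ast, &lksb, example_bast);
	if (st == DLM_NORMAL) {
		/* a real caller waits for example_ast() before touching
		 * lksb.lvb (DLM_LVB_LEN bytes), then drops the lock */
		dlmunlock(dlm, &lksb, 0, example_unlock_ast, &lksb);
	}

	dlm_unregister_domain(dlm);
	return 0;
}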


@@ -0,0 +1,474 @@
/* -*- mode: c; c-basic-offset: 8; -*-
* vim: noexpandtab sw=8 ts=8 sts=0:
*
* dlmast.c
*
* AST and BAST functionality for local and remote nodes
*
* Copyright (C) 2004 Oracle. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 021110-1307, USA.
*
*/
#include <linux/module.h>
#include <linux/fs.h>
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/highmem.h>
#include <linux/init.h>
#include <linux/sysctl.h>
#include <linux/random.h>
#include <linux/blkdev.h>
#include <linux/socket.h>
#include <linux/inet.h>
#include <linux/spinlock.h>
#include "cluster/heartbeat.h"
#include "cluster/nodemanager.h"
#include "cluster/tcp.h"
#include "dlmapi.h"
#include "dlmcommon.h"
#define MLOG_MASK_PREFIX ML_DLM
#include "cluster/masklog.h"
static void dlm_update_lvb(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
struct dlm_lock *lock);
static int dlm_should_cancel_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock);
/* Should be called as an ast gets queued to see if the new
* lock level will obsolete a pending bast.
* For example, if dlm_thread queued a bast for an EX lock that
* was blocking another EX, but before sending the bast the
* lock owner downconverted to NL, the bast is now obsolete.
* Only the ast should be sent.
* This is needed because the lock and convert paths can queue
* asts out-of-band (not waiting for dlm_thread) in order to
* allow for LKM_NOQUEUE to get immediate responses. */
static int dlm_should_cancel_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
{
assert_spin_locked(&dlm->ast_lock);
assert_spin_locked(&lock->spinlock);
if (lock->ml.highest_blocked == LKM_IVMODE)
return 0;
BUG_ON(lock->ml.highest_blocked == LKM_NLMODE);
if (lock->bast_pending &&
list_empty(&lock->bast_list))
/* old bast already sent, ok */
return 0;
if (lock->ml.type == LKM_EXMODE)
/* EX blocks anything left, any bast still valid */
return 0;
else if (lock->ml.type == LKM_NLMODE)
/* NL blocks nothing, no reason to send any bast, cancel it */
return 1;
else if (lock->ml.highest_blocked != LKM_EXMODE)
/* PR only blocks EX */
return 1;
return 0;
}
static void __dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
{
mlog_entry_void();
BUG_ON(!dlm);
BUG_ON(!lock);
assert_spin_locked(&dlm->ast_lock);
if (!list_empty(&lock->ast_list)) {
mlog(ML_ERROR, "ast list not empty!! pending=%d, newlevel=%d\n",
lock->ast_pending, lock->ml.type);
BUG();
}
if (lock->ast_pending)
mlog(0, "lock has an ast getting flushed right now\n");
/* putting lock on list, add a ref */
dlm_lock_get(lock);
spin_lock(&lock->spinlock);
/* check to see if this ast obsoletes the bast */
if (dlm_should_cancel_bast(dlm, lock)) {
struct dlm_lock_resource *res = lock->lockres;
mlog(0, "%s: cancelling bast for %.*s\n",
dlm->name, res->lockname.len, res->lockname.name);
lock->bast_pending = 0;
list_del_init(&lock->bast_list);
lock->ml.highest_blocked = LKM_IVMODE;
/* removing lock from list, remove a ref. guaranteed
* this won't be the last ref because of the get above,
* so res->spinlock will not be taken here */
dlm_lock_put(lock);
/* free up the reserved bast that we are cancelling.
* guaranteed that this will not be the last reserved
* ast because *both* an ast and a bast were reserved
* to get to this point. the res->spinlock will not be
* taken here */
dlm_lockres_release_ast(dlm, res);
}
list_add_tail(&lock->ast_list, &dlm->pending_asts);
lock->ast_pending = 1;
spin_unlock(&lock->spinlock);
}
void dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
{
mlog_entry_void();
BUG_ON(!dlm);
BUG_ON(!lock);
spin_lock(&dlm->ast_lock);
__dlm_queue_ast(dlm, lock);
spin_unlock(&dlm->ast_lock);
}
static void __dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
{
mlog_entry_void();
BUG_ON(!dlm);
BUG_ON(!lock);
assert_spin_locked(&dlm->ast_lock);
BUG_ON(!list_empty(&lock->bast_list));
if (lock->bast_pending)
mlog(0, "lock has a bast getting flushed right now\n");
/* putting lock on list, add a ref */
dlm_lock_get(lock);
spin_lock(&lock->spinlock);
list_add_tail(&lock->bast_list, &dlm->pending_basts);
lock->bast_pending = 1;
spin_unlock(&lock->spinlock);
}
void dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
{
mlog_entry_void();
BUG_ON(!dlm);
BUG_ON(!lock);
spin_lock(&dlm->ast_lock);
__dlm_queue_bast(dlm, lock);
spin_unlock(&dlm->ast_lock);
}
static void dlm_update_lvb(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
struct dlm_lock *lock)
{
struct dlm_lockstatus *lksb = lock->lksb;
BUG_ON(!lksb);
/* only updates if this node masters the lockres */
if (res->owner == dlm->node_num) {
spin_lock(&res->spinlock);
/* check the lksb flags for the direction */
if (lksb->flags & DLM_LKSB_GET_LVB) {
mlog(0, "getting lvb from lockres for %s node\n",
lock->ml.node == dlm->node_num ? "master" :
"remote");
memcpy(lksb->lvb, res->lvb, DLM_LVB_LEN);
}
/* Do nothing for lvb put requests - they should be done in
* place when the lock is downconverted - otherwise we risk
* racing gets and puts which could result in old lvb data
* being propagated. We leave the put flag set and clear it
* here. In the future we might want to clear it at the time
* the put is actually done.
*/
spin_unlock(&res->spinlock);
}
/* reset any lvb flags on the lksb */
lksb->flags &= ~(DLM_LKSB_PUT_LVB|DLM_LKSB_GET_LVB);
}
void dlm_do_local_ast(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
struct dlm_lock *lock)
{
dlm_astlockfunc_t *fn;
struct dlm_lockstatus *lksb;
mlog_entry_void();
lksb = lock->lksb;
fn = lock->ast;
BUG_ON(lock->ml.node != dlm->node_num);
dlm_update_lvb(dlm, res, lock);
(*fn)(lock->astdata);
}
int dlm_do_remote_ast(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
struct dlm_lock *lock)
{
int ret;
struct dlm_lockstatus *lksb;
int lksbflags;
mlog_entry_void();
lksb = lock->lksb;
BUG_ON(lock->ml.node == dlm->node_num);
lksbflags = lksb->flags;
dlm_update_lvb(dlm, res, lock);
/* lock request came from another node
* go do the ast over there */
ret = dlm_send_proxy_ast(dlm, res, lock, lksbflags);
return ret;
}
void dlm_do_local_bast(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
struct dlm_lock *lock, int blocked_type)
{
dlm_bastlockfunc_t *fn = lock->bast;
mlog_entry_void();
BUG_ON(lock->ml.node != dlm->node_num);
(*fn)(lock->astdata, blocked_type);
}
int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data,
void **ret_data)
{
int ret;
unsigned int locklen;
struct dlm_ctxt *dlm = data;
struct dlm_lock_resource *res = NULL;
struct dlm_lock *lock = NULL;
struct dlm_proxy_ast *past = (struct dlm_proxy_ast *) msg->buf;
char *name;
struct list_head *iter, *head=NULL;
u64 cookie;
u32 flags;
u8 node;
if (!dlm_grab(dlm)) {
dlm_error(DLM_REJECTED);
return DLM_REJECTED;
}
mlog_bug_on_msg(!dlm_domain_fully_joined(dlm),
"Domain %s not fully joined!\n", dlm->name);
name = past->name;
locklen = past->namelen;
cookie = past->cookie;
flags = be32_to_cpu(past->flags);
node = past->node_idx;
if (locklen > DLM_LOCKID_NAME_MAX) {
ret = DLM_IVBUFLEN;
mlog(ML_ERROR, "Invalid name length (%d) in proxy ast "
"handler!\n", locklen);
goto leave;
}
if ((flags & (LKM_PUT_LVB|LKM_GET_LVB)) ==
(LKM_PUT_LVB|LKM_GET_LVB)) {
mlog(ML_ERROR, "Both PUT and GET lvb specified, (0x%x)\n",
flags);
ret = DLM_BADARGS;
goto leave;
}
mlog(0, "lvb: %s\n", flags & LKM_PUT_LVB ? "put lvb" :
(flags & LKM_GET_LVB ? "get lvb" : "none"));
mlog(0, "type=%d, blocked_type=%d\n", past->type, past->blocked_type);
if (past->type != DLM_AST &&
past->type != DLM_BAST) {
mlog(ML_ERROR, "Unknown ast type! %d, cookie=%u:%llu"
"name=%.*s, node=%u\n", past->type,
dlm_get_lock_cookie_node(be64_to_cpu(cookie)),
dlm_get_lock_cookie_seq(be64_to_cpu(cookie)),
locklen, name, node);
ret = DLM_IVLOCKID;
goto leave;
}
res = dlm_lookup_lockres(dlm, name, locklen);
if (!res) {
mlog(0, "Got %sast for unknown lockres! cookie=%u:%llu, "
"name=%.*s, node=%u\n", (past->type == DLM_AST ? "" : "b"),
dlm_get_lock_cookie_node(be64_to_cpu(cookie)),
dlm_get_lock_cookie_seq(be64_to_cpu(cookie)),
locklen, name, node);
ret = DLM_IVLOCKID;
goto leave;
}
/* cannot get a proxy ast message if this node owns it */
BUG_ON(res->owner == dlm->node_num);
mlog(0, "lockres %.*s\n", res->lockname.len, res->lockname.name);
spin_lock(&res->spinlock);
if (res->state & DLM_LOCK_RES_RECOVERING) {
mlog(0, "Responding with DLM_RECOVERING!\n");
ret = DLM_RECOVERING;
goto unlock_out;
}
if (res->state & DLM_LOCK_RES_MIGRATING) {
mlog(0, "Responding with DLM_MIGRATING!\n");
ret = DLM_MIGRATING;
goto unlock_out;
}
/* try convert queue for both ast/bast */
head = &res->converting;
lock = NULL;
list_for_each(iter, head) {
lock = list_entry (iter, struct dlm_lock, list);
if (lock->ml.cookie == cookie)
goto do_ast;
}
/* if not on convert, try blocked for ast, granted for bast */
if (past->type == DLM_AST)
head = &res->blocked;
else
head = &res->granted;
list_for_each(iter, head) {
lock = list_entry (iter, struct dlm_lock, list);
if (lock->ml.cookie == cookie)
goto do_ast;
}
mlog(0, "Got %sast for unknown lock! cookie=%u:%llu, name=%.*s, "
"node=%u\n", past->type == DLM_AST ? "" : "b",
dlm_get_lock_cookie_node(be64_to_cpu(cookie)),
dlm_get_lock_cookie_seq(be64_to_cpu(cookie)),
locklen, name, node);
ret = DLM_NORMAL;
unlock_out:
spin_unlock(&res->spinlock);
goto leave;
do_ast:
ret = DLM_NORMAL;
if (past->type == DLM_AST) {
/* do not alter lock refcount. switching lists. */
list_move_tail(&lock->list, &res->granted);
mlog(0, "ast: Adding to granted list... type=%d, "
"convert_type=%d\n", lock->ml.type, lock->ml.convert_type);
if (lock->ml.convert_type != LKM_IVMODE) {
lock->ml.type = lock->ml.convert_type;
lock->ml.convert_type = LKM_IVMODE;
} else {
// should already be there....
}
lock->lksb->status = DLM_NORMAL;
/* if we requested the lvb, fetch it into our lksb now */
if (flags & LKM_GET_LVB) {
BUG_ON(!(lock->lksb->flags & DLM_LKSB_GET_LVB));
memcpy(lock->lksb->lvb, past->lvb, DLM_LVB_LEN);
}
}
spin_unlock(&res->spinlock);
if (past->type == DLM_AST)
dlm_do_local_ast(dlm, res, lock);
else
dlm_do_local_bast(dlm, res, lock, past->blocked_type);
leave:
if (res)
dlm_lockres_put(res);
dlm_put(dlm);
return ret;
}
int dlm_send_proxy_ast_msg(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
struct dlm_lock *lock, int msg_type,
int blocked_type, int flags)
{
int ret = 0;
struct dlm_proxy_ast past;
struct kvec vec[2];
size_t veclen = 1;
int status;
mlog_entry("res %.*s, to=%u, type=%d, blocked_type=%d\n",
res->lockname.len, res->lockname.name, lock->ml.node,
msg_type, blocked_type);
memset(&past, 0, sizeof(struct dlm_proxy_ast));
past.node_idx = dlm->node_num;
past.type = msg_type;
past.blocked_type = blocked_type;
past.namelen = res->lockname.len;
memcpy(past.name, res->lockname.name, past.namelen);
past.cookie = lock->ml.cookie;
vec[0].iov_len = sizeof(struct dlm_proxy_ast);
vec[0].iov_base = &past;
if (flags & DLM_LKSB_GET_LVB) {
mlog(0, "returning requested LVB data\n");
be32_add_cpu(&past.flags, LKM_GET_LVB);
vec[1].iov_len = DLM_LVB_LEN;
vec[1].iov_base = lock->lksb->lvb;
veclen++;
}
ret = o2net_send_message_vec(DLM_PROXY_AST_MSG, dlm->key, vec, veclen,
lock->ml.node, &status);
if (ret < 0)
mlog_errno(ret);
else {
if (status == DLM_RECOVERING) {
mlog(ML_ERROR, "sent AST to node %u, it thinks this "
"node is dead!\n", lock->ml.node);
BUG();
} else if (status == DLM_MIGRATING) {
mlog(ML_ERROR, "sent AST to node %u, it returned "
"DLM_MIGRATING!\n", lock->ml.node);
BUG();
} else if (status != DLM_NORMAL && status != DLM_IVLOCKID) {
mlog(ML_ERROR, "AST to node %u returned %d!\n",
lock->ml.node, status);
/* ignore it */
}
ret = 0;
}
return ret;
}
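
The BAST delivered by dlm_do_local_bast() above lands in the caller's dlm_bastlockfunc_t. Below is a hypothetical sketch of such a callback reacting in the usual way, by noting a downconvert to perform later; the structure and field names are assumptions, not part of this commit.

/* Hypothetical BAST callback for the delivery path above; everything
 * here is illustrative and not part of this commit. */
#include "dlmapi.h"

struct example_lock {
	struct dlm_lockstatus lksb;
	int downconvert_to;	/* LKM_IVMODE when nothing is pending */
};

static void example_bast(void *astdata, int blocked_type)
{
	struct example_lock *el = astdata;

	/* An EX request blocks every mode, so drop all the way to NL;
	 * a PR request only conflicts with EX, so PR is still fine. */
	if (blocked_type == LKM_EXMODE)
		el->downconvert_to = LKM_NLMODE;
	else
		el->downconvert_to = LKM_PRMODE;

	/* the downconvert itself would be issued later from process
	 * context as a dlmlock() call carrying LKM_CONVERT */
}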

File diff suppressed because it is too large


@@ -0,0 +1,547 @@
/* -*- mode: c; c-basic-offset: 8; -*-
* vim: noexpandtab sw=8 ts=8 sts=0:
*
* dlmconvert.c
*
* underlying calls for lock conversion
*
* Copyright (C) 2004 Oracle. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 021110-1307, USA.
*
*/
#include <linux/module.h>
#include <linux/fs.h>
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/highmem.h>
#include <linux/init.h>
#include <linux/sysctl.h>
#include <linux/random.h>
#include <linux/blkdev.h>
#include <linux/socket.h>
#include <linux/inet.h>
#include <linux/spinlock.h>
#include "cluster/heartbeat.h"
#include "cluster/nodemanager.h"
#include "cluster/tcp.h"
#include "dlmapi.h"
#include "dlmcommon.h"
#include "dlmconvert.h"
#define MLOG_MASK_PREFIX ML_DLM
#include "cluster/masklog.h"
/* NOTE: __dlmconvert_master is the only function in here that
* needs a spinlock held on entry (res->spinlock) and it is the
* only one that holds a lock on exit (res->spinlock).
* All other functions in here need no locks and drop all of
* the locks that they acquire. */
static enum dlm_status __dlmconvert_master(struct dlm_ctxt *dlm,
struct dlm_lock_resource *res,
struct dlm_lock *lock, int flags,
int type, int *call_ast,
int *kick_thread);
static enum dlm_status dlm_send_remote_convert_request(struct dlm_ctxt *dlm,
struct dlm_lock_resource *res,
struct dlm_lock *lock, int flags, int type);
/*
* this is only called directly by dlmlock(), and only when the
* local node is the owner of the lockres
* locking:
* caller needs: none
* taken: takes and drops res->spinlock
* held on exit: none
* returns: see __dlmconvert_master
*/
enum dlm_status dlmconvert_master(struct dlm_ctxt *dlm,
struct dlm_lock_resource *res,
struct dlm_lock *lock, int flags, int type)
{
int call_ast = 0, kick_thread = 0;
enum dlm_status status;
spin_lock(&res->spinlock);
/* we are not in a network handler, this is fine */
__dlm_wait_on_lockres(res);
__dlm_lockres_reserve_ast(res);
res->state |= DLM_LOCK_RES_IN_PROGRESS;
status = __dlmconvert_master(dlm, res, lock, flags, type,
&call_ast, &kick_thread);
res->state &= ~DLM_LOCK_RES_IN_PROGRESS;
spin_unlock(&res->spinlock);
wake_up(&res->wq);
if (status != DLM_NORMAL && status != DLM_NOTQUEUED)
dlm_error(status);
/* either queue the ast or release it */
if (call_ast)
dlm_queue_ast(dlm, lock);
else
dlm_lockres_release_ast(dlm, res);
if (kick_thread)
dlm_kick_thread(dlm, res);
return status;
}
/* performs lock conversion at the lockres master site
* locking:
* caller needs: res->spinlock
* taken: takes and drops lock->spinlock
* held on exit: res->spinlock
* returns: DLM_NORMAL, DLM_NOTQUEUED, DLM_DENIED
* call_ast: whether ast should be called for this lock
* kick_thread: whether dlm_kick_thread should be called
*/
static enum dlm_status __dlmconvert_master(struct dlm_ctxt *dlm,
struct dlm_lock_resource *res,
struct dlm_lock *lock, int flags,
int type, int *call_ast,
int *kick_thread)
{
enum dlm_status status = DLM_NORMAL;
struct list_head *iter;
struct dlm_lock *tmplock=NULL;
assert_spin_locked(&res->spinlock);
mlog_entry("type=%d, convert_type=%d, new convert_type=%d\n",
lock->ml.type, lock->ml.convert_type, type);
spin_lock(&lock->spinlock);
/* already converting? */
if (lock->ml.convert_type != LKM_IVMODE) {
mlog(ML_ERROR, "attempted to convert a lock with a lock "
"conversion pending\n");
status = DLM_DENIED;
goto unlock_exit;
}
/* must be on grant queue to convert */
if (!dlm_lock_on_list(&res->granted, lock)) {
mlog(ML_ERROR, "attempted to convert a lock not on grant "
"queue\n");
status = DLM_DENIED;
goto unlock_exit;
}
if (flags & LKM_VALBLK) {
switch (lock->ml.type) {
case LKM_EXMODE:
/* EX + LKM_VALBLK + convert == set lvb */
mlog(0, "will set lvb: converting %s->%s\n",
dlm_lock_mode_name(lock->ml.type),
dlm_lock_mode_name(type));
lock->lksb->flags |= DLM_LKSB_PUT_LVB;
break;
case LKM_PRMODE:
case LKM_NLMODE:
/* refetch if new level is not NL */
if (type > LKM_NLMODE) {
mlog(0, "will fetch new value into "
"lvb: converting %s->%s\n",
dlm_lock_mode_name(lock->ml.type),
dlm_lock_mode_name(type));
lock->lksb->flags |= DLM_LKSB_GET_LVB;
} else {
mlog(0, "will NOT fetch new value "
"into lvb: converting %s->%s\n",
dlm_lock_mode_name(lock->ml.type),
dlm_lock_mode_name(type));
flags &= ~(LKM_VALBLK);
}
break;
}
}
/* in-place downconvert? */
if (type <= lock->ml.type)
goto grant;
/* upconvert from here on */
status = DLM_NORMAL;
list_for_each(iter, &res->granted) {
tmplock = list_entry(iter, struct dlm_lock, list);
if (tmplock == lock)
continue;
if (!dlm_lock_compatible(tmplock->ml.type, type))
goto switch_queues;
}
list_for_each(iter, &res->converting) {
tmplock = list_entry(iter, struct dlm_lock, list);
if (!dlm_lock_compatible(tmplock->ml.type, type))
goto switch_queues;
/* existing conversion requests take precedence */
if (!dlm_lock_compatible(tmplock->ml.convert_type, type))
goto switch_queues;
}
/* fall thru to grant */
grant:
mlog(0, "res %.*s, granting %s lock\n", res->lockname.len,
res->lockname.name, dlm_lock_mode_name(type));
/* immediately grant the new lock type */
lock->lksb->status = DLM_NORMAL;
if (lock->ml.node == dlm->node_num)
mlog(0, "doing in-place convert for nonlocal lock\n");
lock->ml.type = type;
if (lock->lksb->flags & DLM_LKSB_PUT_LVB)
memcpy(res->lvb, lock->lksb->lvb, DLM_LVB_LEN);
status = DLM_NORMAL;
*call_ast = 1;
goto unlock_exit;
switch_queues:
if (flags & LKM_NOQUEUE) {
mlog(0, "failed to convert NOQUEUE lock %.*s from "
"%d to %d...\n", res->lockname.len, res->lockname.name,
lock->ml.type, type);
status = DLM_NOTQUEUED;
goto unlock_exit;
}
mlog(0, "res %.*s, queueing...\n", res->lockname.len,
res->lockname.name);
lock->ml.convert_type = type;
/* do not alter lock refcount. switching lists. */
list_move_tail(&lock->list, &res->converting);
unlock_exit:
spin_unlock(&lock->spinlock);
if (status == DLM_DENIED) {
__dlm_print_one_lock_resource(res);
}
if (status == DLM_NORMAL)
*kick_thread = 1;
return status;
}
void dlm_revert_pending_convert(struct dlm_lock_resource *res,
struct dlm_lock *lock)
{
/* do not alter lock refcount. switching lists. */
list_move_tail(&lock->list, &res->granted);
lock->ml.convert_type = LKM_IVMODE;
lock->lksb->flags &= ~(DLM_LKSB_GET_LVB|DLM_LKSB_PUT_LVB);
}
/* messages the master site to do lock conversion
* locking:
* caller needs: none
* taken: takes and drops res->spinlock, uses DLM_LOCK_RES_IN_PROGRESS
* held on exit: none
* returns: DLM_NORMAL, DLM_RECOVERING, status from remote node
*/
enum dlm_status dlmconvert_remote(struct dlm_ctxt *dlm,
struct dlm_lock_resource *res,
struct dlm_lock *lock, int flags, int type)
{
enum dlm_status status;
mlog(0, "type=%d, convert_type=%d, busy=%d\n", lock->ml.type,
lock->ml.convert_type, res->state & DLM_LOCK_RES_IN_PROGRESS);
spin_lock(&res->spinlock);
if (res->state & DLM_LOCK_RES_RECOVERING) {
mlog(0, "bailing out early since res is RECOVERING "
"on secondary queue\n");
/* __dlm_print_one_lock_resource(res); */
status = DLM_RECOVERING;
goto bail;
}
/* will exit this call with spinlock held */
__dlm_wait_on_lockres(res);
if (lock->ml.convert_type != LKM_IVMODE) {
__dlm_print_one_lock_resource(res);
mlog(ML_ERROR, "converting a remote lock that is already "
"converting! (cookie=%u:%llu, conv=%d)\n",
dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)),
lock->ml.convert_type);
status = DLM_DENIED;
goto bail;
}
res->state |= DLM_LOCK_RES_IN_PROGRESS;
/* move lock to local convert queue */
/* do not alter lock refcount. switching lists. */
list_move_tail(&lock->list, &res->converting);
lock->convert_pending = 1;
lock->ml.convert_type = type;
if (flags & LKM_VALBLK) {
if (lock->ml.type == LKM_EXMODE) {
flags |= LKM_PUT_LVB;
lock->lksb->flags |= DLM_LKSB_PUT_LVB;
} else {
if (lock->ml.convert_type == LKM_NLMODE)
flags &= ~LKM_VALBLK;
else {
flags |= LKM_GET_LVB;
lock->lksb->flags |= DLM_LKSB_GET_LVB;
}
}
}
spin_unlock(&res->spinlock);
/* no locks held here.
* need to wait for a reply as to whether it got queued or not. */
status = dlm_send_remote_convert_request(dlm, res, lock, flags, type);
spin_lock(&res->spinlock);
res->state &= ~DLM_LOCK_RES_IN_PROGRESS;
lock->convert_pending = 0;
/* if it failed, move it back to granted queue */
if (status != DLM_NORMAL) {
if (status != DLM_NOTQUEUED)
dlm_error(status);
dlm_revert_pending_convert(res, lock);
}
bail:
spin_unlock(&res->spinlock);
/* TODO: should this be a wake_one? */
/* wake up any IN_PROGRESS waiters */
wake_up(&res->wq);
return status;
}
/* sends DLM_CONVERT_LOCK_MSG to master site
* locking:
* caller needs: none
* taken: none
* held on exit: none
* returns: DLM_NOLOCKMGR, status from remote node
*/
static enum dlm_status dlm_send_remote_convert_request(struct dlm_ctxt *dlm,
struct dlm_lock_resource *res,
struct dlm_lock *lock, int flags, int type)
{
struct dlm_convert_lock convert;
int tmpret;
enum dlm_status ret;
int status = 0;
struct kvec vec[2];
size_t veclen = 1;
mlog_entry("%.*s\n", res->lockname.len, res->lockname.name);
memset(&convert, 0, sizeof(struct dlm_convert_lock));
convert.node_idx = dlm->node_num;
convert.requested_type = type;
convert.cookie = lock->ml.cookie;
convert.namelen = res->lockname.len;
convert.flags = cpu_to_be32(flags);
memcpy(convert.name, res->lockname.name, convert.namelen);
vec[0].iov_len = sizeof(struct dlm_convert_lock);
vec[0].iov_base = &convert;
if (flags & LKM_PUT_LVB) {
/* extra data to send if we are updating lvb */
vec[1].iov_len = DLM_LVB_LEN;
vec[1].iov_base = lock->lksb->lvb;
veclen++;
}
tmpret = o2net_send_message_vec(DLM_CONVERT_LOCK_MSG, dlm->key,
vec, veclen, res->owner, &status);
if (tmpret >= 0) {
// successfully sent and received
ret = status; // this is already a dlm_status
if (ret == DLM_RECOVERING) {
mlog(0, "node %u returned DLM_RECOVERING from convert "
"message!\n", res->owner);
} else if (ret == DLM_MIGRATING) {
mlog(0, "node %u returned DLM_MIGRATING from convert "
"message!\n", res->owner);
} else if (ret == DLM_FORWARD) {
mlog(0, "node %u returned DLM_FORWARD from convert "
"message!\n", res->owner);
} else if (ret != DLM_NORMAL && ret != DLM_NOTQUEUED)
dlm_error(ret);
} else {
mlog_errno(tmpret);
if (dlm_is_host_down(tmpret)) {
/* instead of logging the same network error over
* and over, sleep here and wait for the heartbeat
* to notice the node is dead. times out after 5s. */
dlm_wait_for_node_death(dlm, res->owner,
DLM_NODE_DEATH_WAIT_MAX);
ret = DLM_RECOVERING;
mlog(0, "node %u died so returning DLM_RECOVERING "
"from convert message!\n", res->owner);
} else {
ret = dlm_err_to_dlm_status(tmpret);
}
}
return ret;
}
/* handler for DLM_CONVERT_LOCK_MSG on master site
* locking:
* caller needs: none
* taken: takes and drop res->spinlock
* held on exit: none
* returns: DLM_NORMAL, DLM_IVLOCKID, DLM_BADARGS,
* status from __dlmconvert_master
*/
int dlm_convert_lock_handler(struct o2net_msg *msg, u32 len, void *data,
void **ret_data)
{
struct dlm_ctxt *dlm = data;
struct dlm_convert_lock *cnv = (struct dlm_convert_lock *)msg->buf;
struct dlm_lock_resource *res = NULL;
struct list_head *iter;
struct dlm_lock *lock = NULL;
struct dlm_lockstatus *lksb;
enum dlm_status status = DLM_NORMAL;
u32 flags;
int call_ast = 0, kick_thread = 0, ast_reserved = 0, wake = 0;
if (!dlm_grab(dlm)) {
dlm_error(DLM_REJECTED);
return DLM_REJECTED;
}
mlog_bug_on_msg(!dlm_domain_fully_joined(dlm),
"Domain %s not fully joined!\n", dlm->name);
if (cnv->namelen > DLM_LOCKID_NAME_MAX) {
status = DLM_IVBUFLEN;
dlm_error(status);
goto leave;
}
flags = be32_to_cpu(cnv->flags);
if ((flags & (LKM_PUT_LVB|LKM_GET_LVB)) ==
(LKM_PUT_LVB|LKM_GET_LVB)) {
mlog(ML_ERROR, "both PUT and GET lvb specified\n");
status = DLM_BADARGS;
goto leave;
}
mlog(0, "lvb: %s\n", flags & LKM_PUT_LVB ? "put lvb" :
(flags & LKM_GET_LVB ? "get lvb" : "none"));
status = DLM_IVLOCKID;
res = dlm_lookup_lockres(dlm, cnv->name, cnv->namelen);
if (!res) {
dlm_error(status);
goto leave;
}
spin_lock(&res->spinlock);
status = __dlm_lockres_state_to_status(res);
if (status != DLM_NORMAL) {
spin_unlock(&res->spinlock);
dlm_error(status);
goto leave;
}
list_for_each(iter, &res->granted) {
lock = list_entry(iter, struct dlm_lock, list);
if (lock->ml.cookie == cnv->cookie &&
lock->ml.node == cnv->node_idx) {
dlm_lock_get(lock);
break;
}
lock = NULL;
}
spin_unlock(&res->spinlock);
if (!lock) {
status = DLM_IVLOCKID;
mlog(ML_ERROR, "did not find lock to convert on grant queue! "
"cookie=%u:%llu\n",
dlm_get_lock_cookie_node(be64_to_cpu(cnv->cookie)),
dlm_get_lock_cookie_seq(be64_to_cpu(cnv->cookie)));
dlm_print_one_lock_resource(res);
goto leave;
}
/* found the lock */
lksb = lock->lksb;
/* see if caller needed to get/put lvb */
if (flags & LKM_PUT_LVB) {
BUG_ON(lksb->flags & (DLM_LKSB_PUT_LVB|DLM_LKSB_GET_LVB));
lksb->flags |= DLM_LKSB_PUT_LVB;
memcpy(&lksb->lvb[0], &cnv->lvb[0], DLM_LVB_LEN);
} else if (flags & LKM_GET_LVB) {
BUG_ON(lksb->flags & (DLM_LKSB_PUT_LVB|DLM_LKSB_GET_LVB));
lksb->flags |= DLM_LKSB_GET_LVB;
}
spin_lock(&res->spinlock);
status = __dlm_lockres_state_to_status(res);
if (status == DLM_NORMAL) {
__dlm_lockres_reserve_ast(res);
ast_reserved = 1;
res->state |= DLM_LOCK_RES_IN_PROGRESS;
status = __dlmconvert_master(dlm, res, lock, flags,
cnv->requested_type,
&call_ast, &kick_thread);
res->state &= ~DLM_LOCK_RES_IN_PROGRESS;
wake = 1;
}
spin_unlock(&res->spinlock);
if (wake)
wake_up(&res->wq);
if (status != DLM_NORMAL) {
if (status != DLM_NOTQUEUED)
dlm_error(status);
lksb->flags &= ~(DLM_LKSB_GET_LVB|DLM_LKSB_PUT_LVB);
}
leave:
if (lock)
dlm_lock_put(lock);
/* either queue the ast or release it, if reserved */
if (call_ast)
dlm_queue_ast(dlm, lock);
else if (ast_reserved)
dlm_lockres_release_ast(dlm, res);
if (kick_thread)
dlm_kick_thread(dlm, res);
if (res)
dlm_lockres_put(res);
dlm_put(dlm);
return status;
}
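
From the caller's point of view, the conversion paths above are reached through dlmlock() with LKM_CONVERT set on the lksb of an already granted lock. The helper below is a hypothetical sketch of such a call; the exact flag combination is an assumption for illustration, not part of this commit.

/* Hypothetical caller-side upconvert; dlmlock() routes this to
 * dlmconvert_master() or dlmconvert_remote() depending on which node
 * owns the lockres.  Illustrative only, not part of this commit. */
static enum dlm_status example_upconvert(struct dlm_ctxt *dlm,
					 struct dlm_lockstatus *lksb,
					 const char *name, int namelen,
					 dlm_astlockfunc_t *ast,
					 dlm_bastlockfunc_t *bast,
					 void *astdata)
{
	/* PR -> EX; LKM_VALBLK asks for the LVB to be refreshed on grant,
	 * LKM_NOQUEUE turns a blocked convert into DLM_NOTQUEUED */
	return dlmlock(dlm, LKM_EXMODE, lksb,
		       LKM_CONVERT | LKM_VALBLK | LKM_NOQUEUE,
		       name, namelen, ast, astdata, bast);
}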


@@ -0,0 +1,35 @@
/* -*- mode: c; c-basic-offset: 8; -*-
* vim: noexpandtab sw=8 ts=8 sts=0:
*
* dlmconvert.h
*
* Copyright (C) 2004 Oracle. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 021110-1307, USA.
*
*/
#ifndef DLMCONVERT_H
#define DLMCONVERT_H
enum dlm_status dlmconvert_master(struct dlm_ctxt *dlm,
struct dlm_lock_resource *res,
struct dlm_lock *lock, int flags, int type);
enum dlm_status dlmconvert_remote(struct dlm_ctxt *dlm,
struct dlm_lock_resource *res,
struct dlm_lock *lock, int flags, int type);
#endif

File diff suppressed because it is too large


@@ -0,0 +1,86 @@
/* -*- mode: c; c-basic-offset: 8; -*-
* vim: noexpandtab sw=8 ts=8 sts=0:
*
* dlmdebug.h
*
* Copyright (C) 2008 Oracle. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 021110-1307, USA.
*
*/
#ifndef DLMDEBUG_H
#define DLMDEBUG_H
void dlm_print_one_mle(struct dlm_master_list_entry *mle);
#ifdef CONFIG_DEBUG_FS
struct dlm_debug_ctxt {
struct kref debug_refcnt;
struct dentry *debug_state_dentry;
struct dentry *debug_lockres_dentry;
struct dentry *debug_mle_dentry;
struct dentry *debug_purgelist_dentry;
};
struct debug_buffer {
int len;
char *buf;
};
struct debug_lockres {
int dl_len;
char *dl_buf;
struct dlm_ctxt *dl_ctxt;
struct dlm_lock_resource *dl_res;
};
int dlm_debug_init(struct dlm_ctxt *dlm);
void dlm_debug_shutdown(struct dlm_ctxt *dlm);
int dlm_create_debugfs_subroot(struct dlm_ctxt *dlm);
void dlm_destroy_debugfs_subroot(struct dlm_ctxt *dlm);
int dlm_create_debugfs_root(void);
void dlm_destroy_debugfs_root(void);
#else
static inline int dlm_debug_init(struct dlm_ctxt *dlm)
{
return 0;
}
static inline void dlm_debug_shutdown(struct dlm_ctxt *dlm)
{
}
static inline int dlm_create_debugfs_subroot(struct dlm_ctxt *dlm)
{
return 0;
}
static inline void dlm_destroy_debugfs_subroot(struct dlm_ctxt *dlm)
{
}
static inline int dlm_create_debugfs_root(void)
{
return 0;
}
static inline void dlm_destroy_debugfs_root(void)
{
}
#endif /* CONFIG_DEBUG_FS */
#endif /* DLMDEBUG_H */

File diff suppressed because it is too large


@@ -0,0 +1,36 @@
/* -*- mode: c; c-basic-offset: 8; -*-
* vim: noexpandtab sw=8 ts=8 sts=0:
*
* dlmdomain.h
*
* Copyright (C) 2004 Oracle. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 021110-1307, USA.
*
*/
#ifndef DLMDOMAIN_H
#define DLMDOMAIN_H
extern spinlock_t dlm_domain_lock;
extern struct list_head dlm_domains;
int dlm_joined(struct dlm_ctxt *dlm);
int dlm_shutting_down(struct dlm_ctxt *dlm);
void dlm_fire_domain_eviction_callbacks(struct dlm_ctxt *dlm,
int node_num);
#endif

kernel/fs/ocfs2/dlm/dlmfs.c (new file, 652 lines)

@@ -0,0 +1,652 @@
/* -*- mode: c; c-basic-offset: 8; -*-
* vim: noexpandtab sw=8 ts=8 sts=0:
*
* dlmfs.c
*
* Code which implements the kernel side of a minimal userspace
* interface to our DLM. This file handles the virtual file system
* used for communication with userspace. Credit should go to ramfs,
* which was a template for the fs side of this module.
*
* Copyright (C) 2003, 2004 Oracle. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 021110-1307, USA.
*/
/* Simple VFS hooks based on: */
/*
* Resizable simple ram filesystem for Linux.
*
* Copyright (C) 2000 Linus Torvalds.
* 2000 Transmeta Corp.
*/
#include <linux/module.h>
#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/highmem.h>
#include <linux/init.h>
#include <linux/string.h>
#include <linux/backing-dev.h>
#include <asm/uaccess.h>
#include "cluster/nodemanager.h"
#include "cluster/heartbeat.h"
#include "cluster/tcp.h"
#include "dlmapi.h"
#include "userdlm.h"
#include "dlmfsver.h"
#define MLOG_MASK_PREFIX ML_DLMFS
#include "cluster/masklog.h"
#include "ocfs2_lockingver.h"
static const struct super_operations dlmfs_ops;
static const struct file_operations dlmfs_file_operations;
static const struct inode_operations dlmfs_dir_inode_operations;
static const struct inode_operations dlmfs_root_inode_operations;
static const struct inode_operations dlmfs_file_inode_operations;
static struct kmem_cache *dlmfs_inode_cache;
struct workqueue_struct *user_dlm_worker;
/*
* This is the userdlmfs locking protocol version.
*
* See fs/ocfs2/dlmglue.c for more details on locking versions.
*/
static const struct dlm_protocol_version user_locking_protocol = {
.pv_major = OCFS2_LOCKING_PROTOCOL_MAJOR,
.pv_minor = OCFS2_LOCKING_PROTOCOL_MINOR,
};
/*
* decodes a set of open flags into a valid lock level and a set of flags.
* returns < 0 if we have invalid flags
* flags which mean something to us:
* O_RDONLY -> PRMODE level
* O_WRONLY -> EXMODE level
*
* O_NONBLOCK -> LKM_NOQUEUE
*/
static int dlmfs_decode_open_flags(int open_flags,
int *level,
int *flags)
{
if (open_flags & (O_WRONLY|O_RDWR))
*level = LKM_EXMODE;
else
*level = LKM_PRMODE;
*flags = 0;
if (open_flags & O_NONBLOCK)
*flags |= LKM_NOQUEUE;
return 0;
}
static int dlmfs_file_open(struct inode *inode,
struct file *file)
{
int status, level, flags;
struct dlmfs_filp_private *fp = NULL;
struct dlmfs_inode_private *ip;
if (S_ISDIR(inode->i_mode))
BUG();
mlog(0, "open called on inode %lu, flags 0x%x\n", inode->i_ino,
file->f_flags);
status = dlmfs_decode_open_flags(file->f_flags, &level, &flags);
if (status < 0)
goto bail;
/* We don't want to honor O_APPEND at read/write time as it
* doesn't make sense for LVB writes. */
file->f_flags &= ~O_APPEND;
fp = kmalloc(sizeof(*fp), GFP_NOFS);
if (!fp) {
status = -ENOMEM;
goto bail;
}
fp->fp_lock_level = level;
ip = DLMFS_I(inode);
status = user_dlm_cluster_lock(&ip->ip_lockres, level, flags);
if (status < 0) {
/* this is a strange error to return here but I want
* userspace to be able to distinguish a valid lock
* request from one that simply couldn't be granted. */
if (flags & LKM_NOQUEUE && status == -EAGAIN)
status = -ETXTBSY;
kfree(fp);
goto bail;
}
file->private_data = fp;
bail:
return status;
}
static int dlmfs_file_release(struct inode *inode,
struct file *file)
{
int level, status;
struct dlmfs_inode_private *ip = DLMFS_I(inode);
struct dlmfs_filp_private *fp =
(struct dlmfs_filp_private *) file->private_data;
if (S_ISDIR(inode->i_mode))
BUG();
mlog(0, "close called on inode %lu\n", inode->i_ino);
status = 0;
if (fp) {
level = fp->fp_lock_level;
if (level != LKM_IVMODE)
user_dlm_cluster_unlock(&ip->ip_lockres, level);
kfree(fp);
file->private_data = NULL;
}
return 0;
}
static ssize_t dlmfs_file_read(struct file *filp,
char __user *buf,
size_t count,
loff_t *ppos)
{
int bytes_left;
ssize_t readlen;
char *lvb_buf;
struct inode *inode = filp->f_path.dentry->d_inode;
mlog(0, "inode %lu, count = %zu, *ppos = %llu\n",
inode->i_ino, count, *ppos);
if (*ppos >= i_size_read(inode))
return 0;
if (!count)
return 0;
if (!access_ok(VERIFY_WRITE, buf, count))
return -EFAULT;
/* don't read past the lvb */
if ((count + *ppos) > i_size_read(inode))
readlen = i_size_read(inode) - *ppos;
else
readlen = count;
lvb_buf = kmalloc(readlen, GFP_NOFS);
if (!lvb_buf)
return -ENOMEM;
user_dlm_read_lvb(inode, lvb_buf, readlen);
bytes_left = __copy_to_user(buf, lvb_buf, readlen);
readlen -= bytes_left;
kfree(lvb_buf);
*ppos = *ppos + readlen;
mlog(0, "read %zd bytes\n", readlen);
return readlen;
}
static ssize_t dlmfs_file_write(struct file *filp,
const char __user *buf,
size_t count,
loff_t *ppos)
{
int bytes_left;
ssize_t writelen;
char *lvb_buf;
struct inode *inode = filp->f_path.dentry->d_inode;
mlog(0, "inode %lu, count = %zu, *ppos = %llu\n",
inode->i_ino, count, *ppos);
if (*ppos >= i_size_read(inode))
return -ENOSPC;
if (!count)
return 0;
if (!access_ok(VERIFY_READ, buf, count))
return -EFAULT;
/* don't write past the lvb */
if ((count + *ppos) > i_size_read(inode))
writelen = i_size_read(inode) - *ppos;
else
writelen = count - *ppos;
lvb_buf = kmalloc(writelen, GFP_NOFS);
if (!lvb_buf)
return -ENOMEM;
bytes_left = copy_from_user(lvb_buf, buf, writelen);
writelen -= bytes_left;
if (writelen)
user_dlm_write_lvb(inode, lvb_buf, writelen);
kfree(lvb_buf);
*ppos = *ppos + writelen;
mlog(0, "wrote %zd bytes\n", writelen);
return writelen;
}
static void dlmfs_init_once(void *foo)
{
struct dlmfs_inode_private *ip =
(struct dlmfs_inode_private *) foo;
ip->ip_dlm = NULL;
ip->ip_parent = NULL;
inode_init_once(&ip->ip_vfs_inode);
}
static struct inode *dlmfs_alloc_inode(struct super_block *sb)
{
struct dlmfs_inode_private *ip;
ip = kmem_cache_alloc(dlmfs_inode_cache, GFP_NOFS);
if (!ip)
return NULL;
return &ip->ip_vfs_inode;
}
static void dlmfs_destroy_inode(struct inode *inode)
{
kmem_cache_free(dlmfs_inode_cache, DLMFS_I(inode));
}
static void dlmfs_clear_inode(struct inode *inode)
{
int status;
struct dlmfs_inode_private *ip;
if (!inode)
return;
mlog(0, "inode %lu\n", inode->i_ino);
ip = DLMFS_I(inode);
if (S_ISREG(inode->i_mode)) {
status = user_dlm_destroy_lock(&ip->ip_lockres);
if (status < 0)
mlog_errno(status);
iput(ip->ip_parent);
goto clear_fields;
}
mlog(0, "we're a directory, ip->ip_dlm = 0x%p\n", ip->ip_dlm);
/* we must be a directory. If required, let's unregister the
* dlm context now. */
if (ip->ip_dlm)
user_dlm_unregister_context(ip->ip_dlm);
clear_fields:
ip->ip_parent = NULL;
ip->ip_dlm = NULL;
}
static struct backing_dev_info dlmfs_backing_dev_info = {
.name = "ocfs2-dlmfs",
.ra_pages = 0, /* No readahead */
.capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
};
static struct inode *dlmfs_get_root_inode(struct super_block *sb)
{
struct inode *inode = new_inode(sb);
int mode = S_IFDIR | 0755;
struct dlmfs_inode_private *ip;
if (inode) {
ip = DLMFS_I(inode);
inode->i_mode = mode;
inode->i_uid = current_fsuid();
inode->i_gid = current_fsgid();
inode->i_mapping->backing_dev_info = &dlmfs_backing_dev_info;
inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
inc_nlink(inode);
inode->i_fop = &simple_dir_operations;
inode->i_op = &dlmfs_root_inode_operations;
}
return inode;
}
static struct inode *dlmfs_get_inode(struct inode *parent,
struct dentry *dentry,
int mode)
{
struct super_block *sb = parent->i_sb;
struct inode * inode = new_inode(sb);
struct dlmfs_inode_private *ip;
if (!inode)
return NULL;
inode->i_mode = mode;
inode->i_uid = current_fsuid();
inode->i_gid = current_fsgid();
inode->i_mapping->backing_dev_info = &dlmfs_backing_dev_info;
inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
ip = DLMFS_I(inode);
ip->ip_dlm = DLMFS_I(parent)->ip_dlm;
switch (mode & S_IFMT) {
default:
/* for now we don't support anything other than
* directories and regular files. */
BUG();
break;
case S_IFREG:
inode->i_op = &dlmfs_file_inode_operations;
inode->i_fop = &dlmfs_file_operations;
i_size_write(inode, DLM_LVB_LEN);
user_dlm_lock_res_init(&ip->ip_lockres, dentry);
/* released at clear_inode time, this ensures that we
* get to drop the dlm reference on each lock *before*
* we call the unregister code for releasing parent
* directories. */
ip->ip_parent = igrab(parent);
BUG_ON(!ip->ip_parent);
break;
case S_IFDIR:
inode->i_op = &dlmfs_dir_inode_operations;
inode->i_fop = &simple_dir_operations;
/* directory inodes start off with i_nlink ==
* 2 (for "." entry) */
inc_nlink(inode);
break;
}
if (parent->i_mode & S_ISGID) {
inode->i_gid = parent->i_gid;
if (S_ISDIR(mode))
inode->i_mode |= S_ISGID;
}
return inode;
}
/*
* File creation. Allocate an inode, and we're done..
*/
/* SMP-safe */
static int dlmfs_mkdir(struct inode * dir,
struct dentry * dentry,
int mode)
{
int status;
struct inode *inode = NULL;
struct qstr *domain = &dentry->d_name;
struct dlmfs_inode_private *ip;
struct dlm_ctxt *dlm;
struct dlm_protocol_version proto = user_locking_protocol;
mlog(0, "mkdir %.*s\n", domain->len, domain->name);
/* verify that we have a proper domain */
if (domain->len >= O2NM_MAX_NAME_LEN) {
status = -EINVAL;
mlog(ML_ERROR, "invalid domain name for directory.\n");
goto bail;
}
inode = dlmfs_get_inode(dir, dentry, mode | S_IFDIR);
if (!inode) {
status = -ENOMEM;
mlog_errno(status);
goto bail;
}
ip = DLMFS_I(inode);
dlm = user_dlm_register_context(domain, &proto);
if (IS_ERR(dlm)) {
status = PTR_ERR(dlm);
mlog(ML_ERROR, "Error %d could not register domain \"%.*s\"\n",
status, domain->len, domain->name);
goto bail;
}
ip->ip_dlm = dlm;
inc_nlink(dir);
d_instantiate(dentry, inode);
dget(dentry); /* Extra count - pin the dentry in core */
status = 0;
bail:
if (status < 0)
iput(inode);
return status;
}
static int dlmfs_create(struct inode *dir,
struct dentry *dentry,
int mode,
struct nameidata *nd)
{
int status = 0;
struct inode *inode;
struct qstr *name = &dentry->d_name;
mlog(0, "create %.*s\n", name->len, name->name);
/* verify name is valid and doesn't contain any dlm reserved
* characters */
if (name->len >= USER_DLM_LOCK_ID_MAX_LEN ||
name->name[0] == '$') {
status = -EINVAL;
mlog(ML_ERROR, "invalid lock name, %.*s\n", name->len,
name->name);
goto bail;
}
inode = dlmfs_get_inode(dir, dentry, mode | S_IFREG);
if (!inode) {
status = -ENOMEM;
mlog_errno(status);
goto bail;
}
d_instantiate(dentry, inode);
dget(dentry); /* Extra count - pin the dentry in core */
bail:
return status;
}
static int dlmfs_unlink(struct inode *dir,
struct dentry *dentry)
{
int status;
struct inode *inode = dentry->d_inode;
mlog(0, "unlink inode %lu\n", inode->i_ino);
/* if there are no current holders, or none that are waiting
* to acquire a lock, this basically destroys our lockres. */
status = user_dlm_destroy_lock(&DLMFS_I(inode)->ip_lockres);
if (status < 0) {
mlog(ML_ERROR, "unlink %.*s, error %d from destroy\n",
dentry->d_name.len, dentry->d_name.name, status);
goto bail;
}
status = simple_unlink(dir, dentry);
bail:
return status;
}
static int dlmfs_fill_super(struct super_block * sb,
void * data,
int silent)
{
struct inode * inode;
struct dentry * root;
sb->s_maxbytes = MAX_LFS_FILESIZE;
sb->s_blocksize = PAGE_CACHE_SIZE;
sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
sb->s_magic = DLMFS_MAGIC;
sb->s_op = &dlmfs_ops;
inode = dlmfs_get_root_inode(sb);
if (!inode)
return -ENOMEM;
root = d_alloc_root(inode);
if (!root) {
iput(inode);
return -ENOMEM;
}
sb->s_root = root;
return 0;
}
static const struct file_operations dlmfs_file_operations = {
.open = dlmfs_file_open,
.release = dlmfs_file_release,
.read = dlmfs_file_read,
.write = dlmfs_file_write,
};
static const struct inode_operations dlmfs_dir_inode_operations = {
.create = dlmfs_create,
.lookup = simple_lookup,
.unlink = dlmfs_unlink,
};
/* this way we can restrict mkdir to only the toplevel of the fs. */
static const struct inode_operations dlmfs_root_inode_operations = {
.lookup = simple_lookup,
.mkdir = dlmfs_mkdir,
.rmdir = simple_rmdir,
};
static const struct super_operations dlmfs_ops = {
.statfs = simple_statfs,
.alloc_inode = dlmfs_alloc_inode,
.destroy_inode = dlmfs_destroy_inode,
.clear_inode = dlmfs_clear_inode,
.drop_inode = generic_delete_inode,
};
static const struct inode_operations dlmfs_file_inode_operations = {
.getattr = simple_getattr,
};
static int dlmfs_get_sb(struct file_system_type *fs_type,
int flags, const char *dev_name, void *data, struct vfsmount *mnt)
{
return get_sb_nodev(fs_type, flags, data, dlmfs_fill_super, mnt);
}
static struct file_system_type dlmfs_fs_type = {
.owner = THIS_MODULE,
.name = "ocfs2_dlmfs",
.get_sb = dlmfs_get_sb,
.kill_sb = kill_litter_super,
};
static int __init init_dlmfs_fs(void)
{
int status;
int cleanup_inode = 0, cleanup_worker = 0;
dlmfs_print_version();
status = bdi_init(&dlmfs_backing_dev_info);
if (status)
return status;
dlmfs_inode_cache = kmem_cache_create("dlmfs_inode_cache",
sizeof(struct dlmfs_inode_private),
0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
SLAB_MEM_SPREAD),
dlmfs_init_once);
if (!dlmfs_inode_cache) {
status = -ENOMEM;
goto bail;
}
cleanup_inode = 1;
user_dlm_worker = create_singlethread_workqueue("user_dlm");
if (!user_dlm_worker) {
status = -ENOMEM;
goto bail;
}
cleanup_worker = 1;
status = register_filesystem(&dlmfs_fs_type);
bail:
if (status) {
if (cleanup_inode)
kmem_cache_destroy(dlmfs_inode_cache);
if (cleanup_worker)
destroy_workqueue(user_dlm_worker);
bdi_destroy(&dlmfs_backing_dev_info);
} else
printk("OCFS2 User DLM kernel interface loaded\n");
return status;
}
static void __exit exit_dlmfs_fs(void)
{
unregister_filesystem(&dlmfs_fs_type);
flush_workqueue(user_dlm_worker);
destroy_workqueue(user_dlm_worker);
kmem_cache_destroy(dlmfs_inode_cache);
bdi_destroy(&dlmfs_backing_dev_info);
}
MODULE_AUTHOR("Oracle");
MODULE_LICENSE("GPL");
module_init(init_dlmfs_fs)
module_exit(exit_dlmfs_fs)
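
dlmfs exposes the DLM to userspace through the VFS: mkdir registers a domain, open takes a cluster lock (O_RDONLY maps to PRMODE, O_WRONLY/O_RDWR to EXMODE, O_NONBLOCK to LKM_NOQUEUE), the file body is the 64-byte LVB, and close drops the lock. The fragment below is a hypothetical userspace client; the /dlm mount point and the domain/lock names are assumptions, not part of this commit.

/* Hypothetical userspace client of ocfs2_dlmfs; the /dlm mount point and
 * all names are assumptions, not part of this commit. */
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <sys/stat.h>
#include <unistd.h>

int main(void)
{
	char lvb[64];
	int fd;

	/* mkdir in the fs root registers (or joins) a DLM domain */
	if (mkdir("/dlm/exampledomain", 0755) && errno != EEXIST)
		return 1;

	/* O_RDWR -> EXMODE, O_NONBLOCK -> LKM_NOQUEUE; a trylock that
	 * cannot be granted immediately fails with ETXTBSY */
	fd = open("/dlm/exampledomain/examplelock",
		  O_RDWR | O_CREAT | O_NONBLOCK, 0600);
	if (fd < 0)
		return 1;

	/* the file contents are the 64-byte lock value block */
	if (read(fd, lvb, sizeof(lvb)) < 0)
		perror("read lvb");
	snprintf(lvb, sizeof(lvb), "held by pid %ld", (long)getpid());
	lseek(fd, 0, SEEK_SET);
	if (write(fd, lvb, sizeof(lvb)) < 0)
		perror("write lvb");

	close(fd);	/* drops the cluster lock */
	return 0;
}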


@@ -0,0 +1,42 @@
/* -*- mode: c; c-basic-offset: 8; -*-
* vim: noexpandtab sw=8 ts=8 sts=0:
*
* dlmfsver.c
*
* version string
*
* Copyright (C) 2002, 2005 Oracle. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 021110-1307, USA.
*/
#include <linux/module.h>
#include <linux/kernel.h>
#include "dlmfsver.h"
#define DLM_BUILD_VERSION "1.5.0"
#define VERSION_STR "OCFS2 DLMFS " DLM_BUILD_VERSION
void dlmfs_print_version(void)
{
printk(KERN_INFO "%s\n", VERSION_STR);
}
MODULE_DESCRIPTION(VERSION_STR);
MODULE_VERSION(DLM_BUILD_VERSION);


@@ -0,0 +1,31 @@
/* -*- mode: c; c-basic-offset: 8; -*-
* vim: noexpandtab sw=8 ts=8 sts=0:
*
* dlmver.h
*
* Function prototypes
*
* Copyright (C) 2005 Oracle. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 021110-1307, USA.
*/
#ifndef DLMFS_VER_H
#define DLMFS_VER_H
void dlmfs_print_version(void);
#endif /* DLMFS_VER_H */


@@ -0,0 +1,766 @@
/* -*- mode: c; c-basic-offset: 8; -*-
* vim: noexpandtab sw=8 ts=8 sts=0:
*
* dlmlock.c
*
* underlying calls for lock creation
*
* Copyright (C) 2004 Oracle. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 021110-1307, USA.
*
*/
#include <linux/module.h>
#include <linux/fs.h>
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/highmem.h>
#include <linux/init.h>
#include <linux/sysctl.h>
#include <linux/random.h>
#include <linux/blkdev.h>
#include <linux/socket.h>
#include <linux/inet.h>
#include <linux/spinlock.h>
#include <linux/delay.h>
#include "cluster/heartbeat.h"
#include "cluster/nodemanager.h"
#include "cluster/tcp.h"
#include "dlmapi.h"
#include "dlmcommon.h"
#include "dlmconvert.h"
#define MLOG_MASK_PREFIX ML_DLM
#include "cluster/masklog.h"
static struct kmem_cache *dlm_lock_cache = NULL;
static DEFINE_SPINLOCK(dlm_cookie_lock);
static u64 dlm_next_cookie = 1;
static enum dlm_status dlm_send_remote_lock_request(struct dlm_ctxt *dlm,
struct dlm_lock_resource *res,
struct dlm_lock *lock, int flags);
static void dlm_init_lock(struct dlm_lock *newlock, int type,
u8 node, u64 cookie);
static void dlm_lock_release(struct kref *kref);
static void dlm_lock_detach_lockres(struct dlm_lock *lock);
int dlm_init_lock_cache(void)
{
dlm_lock_cache = kmem_cache_create("o2dlm_lock",
sizeof(struct dlm_lock),
0, SLAB_HWCACHE_ALIGN, NULL);
if (dlm_lock_cache == NULL)
return -ENOMEM;
return 0;
}
void dlm_destroy_lock_cache(void)
{
if (dlm_lock_cache)
kmem_cache_destroy(dlm_lock_cache);
}
/* Tell us whether we can grant a new lock request.
* locking:
* caller needs: res->spinlock
* taken: none
* held on exit: none
* returns: 1 if the lock can be granted, 0 otherwise.
*/
static int dlm_can_grant_new_lock(struct dlm_lock_resource *res,
struct dlm_lock *lock)
{
struct list_head *iter;
struct dlm_lock *tmplock;
list_for_each(iter, &res->granted) {
tmplock = list_entry(iter, struct dlm_lock, list);
if (!dlm_lock_compatible(tmplock->ml.type, lock->ml.type))
return 0;
}
list_for_each(iter, &res->converting) {
tmplock = list_entry(iter, struct dlm_lock, list);
if (!dlm_lock_compatible(tmplock->ml.type, lock->ml.type))
return 0;
}
return 1;
}
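/*
 * Note on the compatibility check above: dlmlock() only accepts NL, PR and
 * EX modes (see the mode check in dlmlock() below), so dlm_lock_compatible()
 * effectively reduces to the classic matrix (1 = compatible):
 *
 *                existing
 *               NL  PR  EX
 *   request NL   1   1   1
 *           PR   1   1   0
 *           EX   1   0   0
 *
 * A new lock is granted only if it is compatible with every lock already
 * sitting on the granted and converting queues.
 */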
/* performs lock creation at the lockres master site
* locking:
* caller needs: none
* taken: takes and drops res->spinlock
* held on exit: none
* returns: DLM_NORMAL, DLM_NOTQUEUED
*/
static enum dlm_status dlmlock_master(struct dlm_ctxt *dlm,
struct dlm_lock_resource *res,
struct dlm_lock *lock, int flags)
{
int call_ast = 0, kick_thread = 0;
enum dlm_status status = DLM_NORMAL;
mlog_entry("type=%d\n", lock->ml.type);
spin_lock(&res->spinlock);
/* if called from dlm_create_lock_handler, need to
* ensure it will not sleep in dlm_wait_on_lockres */
status = __dlm_lockres_state_to_status(res);
if (status != DLM_NORMAL &&
lock->ml.node != dlm->node_num) {
/* erf. state changed after lock was dropped. */
spin_unlock(&res->spinlock);
dlm_error(status);
return status;
}
__dlm_wait_on_lockres(res);
__dlm_lockres_reserve_ast(res);
if (dlm_can_grant_new_lock(res, lock)) {
mlog(0, "I can grant this lock right away\n");
/* got it right away */
lock->lksb->status = DLM_NORMAL;
status = DLM_NORMAL;
dlm_lock_get(lock);
list_add_tail(&lock->list, &res->granted);
/* for the recovery lock, we can't allow the ast
* to be queued since the dlmthread is already
* frozen. but the recovery lock is always locked
* with LKM_NOQUEUE so we do not need the ast in
* this special case */
if (!dlm_is_recovery_lock(res->lockname.name,
res->lockname.len)) {
kick_thread = 1;
call_ast = 1;
} else {
mlog(0, "%s: returning DLM_NORMAL to "
"node %u for reco lock\n", dlm->name,
lock->ml.node);
}
} else {
/* for NOQUEUE request, unless we get the
* lock right away, return DLM_NOTQUEUED */
if (flags & LKM_NOQUEUE) {
status = DLM_NOTQUEUED;
if (dlm_is_recovery_lock(res->lockname.name,
res->lockname.len)) {
mlog(0, "%s: returning NOTQUEUED to "
"node %u for reco lock\n", dlm->name,
lock->ml.node);
}
} else {
dlm_lock_get(lock);
list_add_tail(&lock->list, &res->blocked);
kick_thread = 1;
}
}
/* reduce the inflight count, this may result in the lockres
* being purged below during calc_usage */
if (lock->ml.node == dlm->node_num)
dlm_lockres_drop_inflight_ref(dlm, res);
spin_unlock(&res->spinlock);
wake_up(&res->wq);
/* either queue the ast or release it */
if (call_ast)
dlm_queue_ast(dlm, lock);
else
dlm_lockres_release_ast(dlm, res);
dlm_lockres_calc_usage(dlm, res);
if (kick_thread)
dlm_kick_thread(dlm, res);
return status;
}
void dlm_revert_pending_lock(struct dlm_lock_resource *res,
struct dlm_lock *lock)
{
/* remove from local queue if it failed */
list_del_init(&lock->list);
lock->lksb->flags &= ~DLM_LKSB_GET_LVB;
}
/*
* locking:
* caller needs: none
* taken: takes and drops res->spinlock
* held on exit: none
* returns: DLM_DENIED, DLM_RECOVERING, or net status
*/
static enum dlm_status dlmlock_remote(struct dlm_ctxt *dlm,
struct dlm_lock_resource *res,
struct dlm_lock *lock, int flags)
{
enum dlm_status status = DLM_DENIED;
int lockres_changed = 1;
mlog_entry("type=%d\n", lock->ml.type);
mlog(0, "lockres %.*s, flags = 0x%x\n", res->lockname.len,
res->lockname.name, flags);
spin_lock(&res->spinlock);
/* will exit this call with spinlock held */
__dlm_wait_on_lockres(res);
res->state |= DLM_LOCK_RES_IN_PROGRESS;
/* add lock to local (secondary) queue */
dlm_lock_get(lock);
list_add_tail(&lock->list, &res->blocked);
lock->lock_pending = 1;
spin_unlock(&res->spinlock);
/* spec seems to say that you will get DLM_NORMAL when the lock
* has been queued, meaning we need to wait for a reply here. */
status = dlm_send_remote_lock_request(dlm, res, lock, flags);
spin_lock(&res->spinlock);
res->state &= ~DLM_LOCK_RES_IN_PROGRESS;
lock->lock_pending = 0;
if (status != DLM_NORMAL) {
if (status == DLM_RECOVERING &&
dlm_is_recovery_lock(res->lockname.name,
res->lockname.len)) {
/* recovery lock was mastered by dead node.
* we need to have calc_usage shoot down this
* lockres and completely remaster it. */
mlog(0, "%s: recovery lock was owned by "
"dead node %u, remaster it now.\n",
dlm->name, res->owner);
} else if (status != DLM_NOTQUEUED) {
/*
* DO NOT call calc_usage, as this would unhash
* the remote lockres before we ever get to use
* it. treat as if we never made any change to
* the lockres.
*/
lockres_changed = 0;
dlm_error(status);
}
dlm_revert_pending_lock(res, lock);
dlm_lock_put(lock);
} else if (dlm_is_recovery_lock(res->lockname.name,
res->lockname.len)) {
/* special case for the $RECOVERY lock.
* there will never be an AST delivered to put
* this lock on the proper secondary queue
* (granted), so do it manually. */
mlog(0, "%s: $RECOVERY lock for this node (%u) is "
"mastered by %u; got lock, manually granting (no ast)\n",
dlm->name, dlm->node_num, res->owner);
list_move_tail(&lock->list, &res->granted);
}
spin_unlock(&res->spinlock);
if (lockres_changed)
dlm_lockres_calc_usage(dlm, res);
wake_up(&res->wq);
return status;
}
/* for remote lock creation.
* locking:
* caller needs: none, but need res->state & DLM_LOCK_RES_IN_PROGRESS
* taken: none
* held on exit: none
* returns: DLM_NOLOCKMGR, or net status
*/
static enum dlm_status dlm_send_remote_lock_request(struct dlm_ctxt *dlm,
struct dlm_lock_resource *res,
struct dlm_lock *lock, int flags)
{
struct dlm_create_lock create;
int tmpret, status = 0;
enum dlm_status ret;
mlog_entry_void();
memset(&create, 0, sizeof(create));
create.node_idx = dlm->node_num;
create.requested_type = lock->ml.type;
create.cookie = lock->ml.cookie;
create.namelen = res->lockname.len;
create.flags = cpu_to_be32(flags);
memcpy(create.name, res->lockname.name, create.namelen);
tmpret = o2net_send_message(DLM_CREATE_LOCK_MSG, dlm->key, &create,
sizeof(create), res->owner, &status);
if (tmpret >= 0) {
// successfully sent and received
ret = status; // this is already a dlm_status
if (ret == DLM_REJECTED) {
mlog(ML_ERROR, "%s:%.*s: BUG. this is a stale lockres "
"no longer owned by %u. that node is coming back "
"up currently.\n", dlm->name, create.namelen,
create.name, res->owner);
dlm_print_one_lock_resource(res);
BUG();
}
} else {
mlog_errno(tmpret);
if (dlm_is_host_down(tmpret)) {
ret = DLM_RECOVERING;
mlog(0, "node %u died so returning DLM_RECOVERING "
"from lock message!\n", res->owner);
} else {
ret = dlm_err_to_dlm_status(tmpret);
}
}
return ret;
}
void dlm_lock_get(struct dlm_lock *lock)
{
kref_get(&lock->lock_refs);
}
void dlm_lock_put(struct dlm_lock *lock)
{
kref_put(&lock->lock_refs, dlm_lock_release);
}
static void dlm_lock_release(struct kref *kref)
{
struct dlm_lock *lock;
lock = container_of(kref, struct dlm_lock, lock_refs);
BUG_ON(!list_empty(&lock->list));
BUG_ON(!list_empty(&lock->ast_list));
BUG_ON(!list_empty(&lock->bast_list));
BUG_ON(lock->ast_pending);
BUG_ON(lock->bast_pending);
dlm_lock_detach_lockres(lock);
if (lock->lksb_kernel_allocated) {
mlog(0, "freeing kernel-allocated lksb\n");
kfree(lock->lksb);
}
kmem_cache_free(dlm_lock_cache, lock);
}
/* associate a lock with its lockres, getting a ref on the lockres */
void dlm_lock_attach_lockres(struct dlm_lock *lock,
struct dlm_lock_resource *res)
{
dlm_lockres_get(res);
lock->lockres = res;
}
/* drop ref on lockres, if there is still one associated with lock */
static void dlm_lock_detach_lockres(struct dlm_lock *lock)
{
struct dlm_lock_resource *res;
res = lock->lockres;
if (res) {
lock->lockres = NULL;
mlog(0, "removing lock's lockres reference\n");
dlm_lockres_put(res);
}
}
static void dlm_init_lock(struct dlm_lock *newlock, int type,
u8 node, u64 cookie)
{
INIT_LIST_HEAD(&newlock->list);
INIT_LIST_HEAD(&newlock->ast_list);
INIT_LIST_HEAD(&newlock->bast_list);
spin_lock_init(&newlock->spinlock);
newlock->ml.type = type;
newlock->ml.convert_type = LKM_IVMODE;
newlock->ml.highest_blocked = LKM_IVMODE;
newlock->ml.node = node;
newlock->ml.pad1 = 0;
newlock->ml.list = 0;
newlock->ml.flags = 0;
newlock->ast = NULL;
newlock->bast = NULL;
newlock->astdata = NULL;
newlock->ml.cookie = cpu_to_be64(cookie);
newlock->ast_pending = 0;
newlock->bast_pending = 0;
newlock->convert_pending = 0;
newlock->lock_pending = 0;
newlock->unlock_pending = 0;
newlock->cancel_pending = 0;
newlock->lksb_kernel_allocated = 0;
kref_init(&newlock->lock_refs);
}
struct dlm_lock * dlm_new_lock(int type, u8 node, u64 cookie,
struct dlm_lockstatus *lksb)
{
struct dlm_lock *lock;
int kernel_allocated = 0;
lock = (struct dlm_lock *) kmem_cache_zalloc(dlm_lock_cache, GFP_NOFS);
if (!lock)
return NULL;
if (!lksb) {
/* zero memory only if kernel-allocated */
lksb = kzalloc(sizeof(*lksb), GFP_NOFS);
if (!lksb) {
kfree(lock);
return NULL;
}
kernel_allocated = 1;
}
dlm_init_lock(lock, type, node, cookie);
if (kernel_allocated)
lock->lksb_kernel_allocated = 1;
lock->lksb = lksb;
lksb->lockid = lock;
return lock;
}
/* handler for lock creation net message
* locking:
* caller needs: none
* taken: takes and drops res->spinlock
* held on exit: none
* returns: DLM_NORMAL, DLM_SYSERR, DLM_IVLOCKID, DLM_NOTQUEUED
*/
int dlm_create_lock_handler(struct o2net_msg *msg, u32 len, void *data,
void **ret_data)
{
struct dlm_ctxt *dlm = data;
struct dlm_create_lock *create = (struct dlm_create_lock *)msg->buf;
struct dlm_lock_resource *res = NULL;
struct dlm_lock *newlock = NULL;
struct dlm_lockstatus *lksb = NULL;
enum dlm_status status = DLM_NORMAL;
char *name;
unsigned int namelen;
BUG_ON(!dlm);
mlog_entry_void();
if (!dlm_grab(dlm))
return DLM_REJECTED;
name = create->name;
namelen = create->namelen;
status = DLM_REJECTED;
if (!dlm_domain_fully_joined(dlm)) {
mlog(ML_ERROR, "Domain %s not fully joined, but node %u is "
"sending a create_lock message for lock %.*s!\n",
dlm->name, create->node_idx, namelen, name);
dlm_error(status);
goto leave;
}
status = DLM_IVBUFLEN;
if (namelen > DLM_LOCKID_NAME_MAX) {
dlm_error(status);
goto leave;
}
status = DLM_SYSERR;
newlock = dlm_new_lock(create->requested_type,
create->node_idx,
be64_to_cpu(create->cookie), NULL);
if (!newlock) {
dlm_error(status);
goto leave;
}
lksb = newlock->lksb;
if (be32_to_cpu(create->flags) & LKM_GET_LVB) {
lksb->flags |= DLM_LKSB_GET_LVB;
mlog(0, "set DLM_LKSB_GET_LVB flag\n");
}
status = DLM_IVLOCKID;
res = dlm_lookup_lockres(dlm, name, namelen);
if (!res) {
dlm_error(status);
goto leave;
}
spin_lock(&res->spinlock);
status = __dlm_lockres_state_to_status(res);
spin_unlock(&res->spinlock);
if (status != DLM_NORMAL) {
mlog(0, "lockres recovering/migrating/in-progress\n");
goto leave;
}
dlm_lock_attach_lockres(newlock, res);
status = dlmlock_master(dlm, res, newlock, be32_to_cpu(create->flags));
leave:
if (status != DLM_NORMAL)
if (newlock)
dlm_lock_put(newlock);
if (res)
dlm_lockres_put(res);
dlm_put(dlm);
return status;
}
/* fetch next node-local (u8 nodenum + u56 cookie) into u64 */
static inline void dlm_get_next_cookie(u8 node_num, u64 *cookie)
{
u64 tmpnode = node_num;
/* shift single byte of node num into top 8 bits */
tmpnode <<= 56;
spin_lock(&dlm_cookie_lock);
*cookie = (dlm_next_cookie | tmpnode);
if (++dlm_next_cookie & 0xff00000000000000ull) {
mlog(0, "This node's cookie will now wrap!\n");
dlm_next_cookie = 1;
}
spin_unlock(&dlm_cookie_lock);
}
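/*
 * Worked example of the cookie layout (values chosen for illustration):
 * with node_num = 5 and dlm_next_cookie = 1, tmpnode is shifted to
 * 0x0500000000000000, so *cookie becomes 0x0500000000000001.  The top
 * byte always carries the node number and the low 56 bits carry the
 * per-node sequence, which is reset to 1 if it ever spills into the
 * top byte.
 */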
enum dlm_status dlmlock(struct dlm_ctxt *dlm, int mode,
struct dlm_lockstatus *lksb, int flags,
const char *name, int namelen, dlm_astlockfunc_t *ast,
void *data, dlm_bastlockfunc_t *bast)
{
enum dlm_status status;
struct dlm_lock_resource *res = NULL;
struct dlm_lock *lock = NULL;
int convert = 0, recovery = 0;
/* yes this function is a mess.
* TODO: clean this up. lots of common code in the
* lock and convert paths, especially in the retry blocks */
if (!lksb) {
dlm_error(DLM_BADARGS);
return DLM_BADARGS;
}
status = DLM_BADPARAM;
if (mode != LKM_EXMODE && mode != LKM_PRMODE && mode != LKM_NLMODE) {
dlm_error(status);
goto error;
}
if (flags & ~LKM_VALID_FLAGS) {
dlm_error(status);
goto error;
}
convert = (flags & LKM_CONVERT);
recovery = (flags & LKM_RECOVERY);
if (recovery &&
(!dlm_is_recovery_lock(name, namelen) || convert) ) {
dlm_error(status);
goto error;
}
if (convert && (flags & LKM_LOCAL)) {
mlog(ML_ERROR, "strange LOCAL convert request!\n");
goto error;
}
if (convert) {
/* CONVERT request */
/* if converting, must pass in a valid dlm_lock */
lock = lksb->lockid;
if (!lock) {
mlog(ML_ERROR, "NULL lock pointer in convert "
"request\n");
goto error;
}
res = lock->lockres;
if (!res) {
mlog(ML_ERROR, "NULL lockres pointer in convert "
"request\n");
goto error;
}
dlm_lockres_get(res);
/* XXX: for ocfs2 purposes, the ast/bast/astdata/lksb are
* static after the original lock call. convert requests will
* ensure that everything is the same, or return DLM_BADARGS.
* this means that DLM_DENIED_NOASTS will never be returned.
*/
if (lock->lksb != lksb || lock->ast != ast ||
lock->bast != bast || lock->astdata != data) {
status = DLM_BADARGS;
mlog(ML_ERROR, "new args: lksb=%p, ast=%p, bast=%p, "
"astdata=%p\n", lksb, ast, bast, data);
mlog(ML_ERROR, "orig args: lksb=%p, ast=%p, bast=%p, "
"astdata=%p\n", lock->lksb, lock->ast,
lock->bast, lock->astdata);
goto error;
}
retry_convert:
dlm_wait_for_recovery(dlm);
if (res->owner == dlm->node_num)
status = dlmconvert_master(dlm, res, lock, flags, mode);
else
status = dlmconvert_remote(dlm, res, lock, flags, mode);
if (status == DLM_RECOVERING || status == DLM_MIGRATING ||
status == DLM_FORWARD) {
/* for now, see how this works without sleeping
* and just retry right away. I suspect the reco
* or migration will complete fast enough that
* no waiting will be necessary */
mlog(0, "retrying convert with migration/recovery/"
"in-progress\n");
msleep(100);
goto retry_convert;
}
} else {
u64 tmpcookie;
/* LOCK request */
status = DLM_BADARGS;
if (!name) {
dlm_error(status);
goto error;
}
status = DLM_IVBUFLEN;
if (namelen > DLM_LOCKID_NAME_MAX || namelen < 1) {
dlm_error(status);
goto error;
}
dlm_get_next_cookie(dlm->node_num, &tmpcookie);
lock = dlm_new_lock(mode, dlm->node_num, tmpcookie, lksb);
if (!lock) {
dlm_error(status);
goto error;
}
if (!recovery)
dlm_wait_for_recovery(dlm);
/* find or create the lock resource */
res = dlm_get_lock_resource(dlm, name, namelen, flags);
if (!res) {
status = DLM_IVLOCKID;
dlm_error(status);
goto error;
}
mlog(0, "type=%d, flags = 0x%x\n", mode, flags);
mlog(0, "creating lock: lock=%p res=%p\n", lock, res);
dlm_lock_attach_lockres(lock, res);
lock->ast = ast;
lock->bast = bast;
lock->astdata = data;
retry_lock:
if (flags & LKM_VALBLK) {
mlog(0, "LKM_VALBLK passed by caller\n");
/* LVB requests for non PR, PW or EX locks are
* ignored. */
if (mode < LKM_PRMODE)
flags &= ~LKM_VALBLK;
else {
flags |= LKM_GET_LVB;
lock->lksb->flags |= DLM_LKSB_GET_LVB;
}
}
if (res->owner == dlm->node_num)
status = dlmlock_master(dlm, res, lock, flags);
else
status = dlmlock_remote(dlm, res, lock, flags);
if (status == DLM_RECOVERING || status == DLM_MIGRATING ||
status == DLM_FORWARD) {
mlog(0, "retrying lock with migration/"
"recovery/in progress\n");
msleep(100);
/* no waiting for dlm_reco_thread */
if (recovery) {
if (status != DLM_RECOVERING)
goto retry_lock;
mlog(0, "%s: got RECOVERING "
"for $RECOVERY lock, master "
"was %u\n", dlm->name,
res->owner);
/* wait to see the node go down, then
* drop down and allow the lockres to
* get cleaned up. need to remaster. */
dlm_wait_for_node_death(dlm, res->owner,
DLM_NODE_DEATH_WAIT_MAX);
} else {
dlm_wait_for_recovery(dlm);
goto retry_lock;
}
}
if (status != DLM_NORMAL) {
lock->lksb->flags &= ~DLM_LKSB_GET_LVB;
if (status != DLM_NOTQUEUED)
dlm_error(status);
goto error;
}
}
error:
if (status != DLM_NORMAL) {
if (lock && !convert)
dlm_lock_put(lock);
// this is kind of unnecessary
lksb->status = status;
}
/* put lockres ref from the convert path
* or from dlm_get_lock_resource */
if (res)
dlm_lockres_put(res);
return status;
}
EXPORT_SYMBOL_GPL(dlmlock);
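/*
 * Minimal usage sketch for dlmlock(); my_ast, my_bast and my_data are
 * placeholders for the caller's callbacks, not symbols defined here:
 *
 *	struct dlm_lockstatus lksb;
 *	enum dlm_status st;
 *
 *	memset(&lksb, 0, sizeof(lksb));
 *	st = dlmlock(dlm, LKM_EXMODE, &lksb, LKM_NOQUEUE,
 *		     "mylock", 6, my_ast, my_data, my_bast);
 *
 * A later convert of the same lock passes LKM_CONVERT and must reuse the
 * original lksb, ast, bast and astdata, per the checks in dlmlock() above.
 */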

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@@ -0,0 +1,755 @@
/* -*- mode: c; c-basic-offset: 8; -*-
* vim: noexpandtab sw=8 ts=8 sts=0:
*
* dlmthread.c
*
* standalone DLM module
*
* Copyright (C) 2004 Oracle. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 021110-1307, USA.
*
*/
#include <linux/module.h>
#include <linux/fs.h>
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/highmem.h>
#include <linux/init.h>
#include <linux/sysctl.h>
#include <linux/random.h>
#include <linux/blkdev.h>
#include <linux/socket.h>
#include <linux/inet.h>
#include <linux/timer.h>
#include <linux/kthread.h>
#include <linux/delay.h>
#include "cluster/heartbeat.h"
#include "cluster/nodemanager.h"
#include "cluster/tcp.h"
#include "dlmapi.h"
#include "dlmcommon.h"
#include "dlmdomain.h"
#define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_THREAD)
#include "cluster/masklog.h"
static int dlm_thread(void *data);
static void dlm_flush_asts(struct dlm_ctxt *dlm);
#define dlm_lock_is_remote(dlm, lock) ((lock)->ml.node != (dlm)->node_num)
/* will exit holding res->spinlock, but may drop in function */
/* waits until flags are cleared on res->state */
void __dlm_wait_on_lockres_flags(struct dlm_lock_resource *res, int flags)
{
DECLARE_WAITQUEUE(wait, current);
assert_spin_locked(&res->spinlock);
add_wait_queue(&res->wq, &wait);
repeat:
set_current_state(TASK_UNINTERRUPTIBLE);
if (res->state & flags) {
spin_unlock(&res->spinlock);
schedule();
spin_lock(&res->spinlock);
goto repeat;
}
remove_wait_queue(&res->wq, &wait);
__set_current_state(TASK_RUNNING);
}
int __dlm_lockres_has_locks(struct dlm_lock_resource *res)
{
if (list_empty(&res->granted) &&
list_empty(&res->converting) &&
list_empty(&res->blocked))
return 0;
return 1;
}
/* "unused": the lockres has no locks, is not on the dirty list,
* has no inflight locks (in the gap between mastery and acquiring
* the first lock), and has no bits in its refmap.
* truly ready to be freed. */
int __dlm_lockres_unused(struct dlm_lock_resource *res)
{
int bit;
if (__dlm_lockres_has_locks(res))
return 0;
if (!list_empty(&res->dirty) || res->state & DLM_LOCK_RES_DIRTY)
return 0;
if (res->state & DLM_LOCK_RES_RECOVERING)
return 0;
bit = find_next_bit(res->refmap, O2NM_MAX_NODES, 0);
if (bit < O2NM_MAX_NODES)
return 0;
/*
* since the bit for dlm->node_num is not set, inflight_locks better
* be zero
*/
BUG_ON(res->inflight_locks != 0);
return 1;
}
/* Call whenever you may have added or deleted something from one of
* the lockres queue's. This will figure out whether it belongs on the
* unused list or not and does the appropriate thing. */
void __dlm_lockres_calc_usage(struct dlm_ctxt *dlm,
struct dlm_lock_resource *res)
{
mlog_entry("%.*s\n", res->lockname.len, res->lockname.name);
assert_spin_locked(&dlm->spinlock);
assert_spin_locked(&res->spinlock);
if (__dlm_lockres_unused(res)){
if (list_empty(&res->purge)) {
mlog(0, "putting lockres %.*s:%p onto purge list\n",
res->lockname.len, res->lockname.name, res);
res->last_used = jiffies;
dlm_lockres_get(res);
list_add_tail(&res->purge, &dlm->purge_list);
dlm->purge_count++;
}
} else if (!list_empty(&res->purge)) {
mlog(0, "removing lockres %.*s:%p from purge list, owner=%u\n",
res->lockname.len, res->lockname.name, res, res->owner);
list_del_init(&res->purge);
dlm_lockres_put(res);
dlm->purge_count--;
}
}
void dlm_lockres_calc_usage(struct dlm_ctxt *dlm,
struct dlm_lock_resource *res)
{
mlog_entry("%.*s\n", res->lockname.len, res->lockname.name);
spin_lock(&dlm->spinlock);
spin_lock(&res->spinlock);
__dlm_lockres_calc_usage(dlm, res);
spin_unlock(&res->spinlock);
spin_unlock(&dlm->spinlock);
}
static void dlm_purge_lockres(struct dlm_ctxt *dlm,
struct dlm_lock_resource *res)
{
int master;
int ret = 0;
assert_spin_locked(&dlm->spinlock);
assert_spin_locked(&res->spinlock);
master = (res->owner == dlm->node_num);
mlog(0, "purging lockres %.*s, master = %d\n", res->lockname.len,
res->lockname.name, master);
if (!master) {
res->state |= DLM_LOCK_RES_DROPPING_REF;
/* drop spinlock... retake below */
spin_unlock(&res->spinlock);
spin_unlock(&dlm->spinlock);
spin_lock(&res->spinlock);
/* This ensures that clear refmap is sent after the set */
__dlm_wait_on_lockres_flags(res, DLM_LOCK_RES_SETREF_INPROG);
spin_unlock(&res->spinlock);
/* clear our bit from the master's refmap, ignore errors */
ret = dlm_drop_lockres_ref(dlm, res);
if (ret < 0) {
mlog_errno(ret);
if (!dlm_is_host_down(ret))
BUG();
}
mlog(0, "%s:%.*s: dlm_deref_lockres returned %d\n",
dlm->name, res->lockname.len, res->lockname.name, ret);
spin_lock(&dlm->spinlock);
spin_lock(&res->spinlock);
}
if (!list_empty(&res->purge)) {
mlog(0, "removing lockres %.*s:%p from purgelist, "
"master = %d\n", res->lockname.len, res->lockname.name,
res, master);
list_del_init(&res->purge);
dlm_lockres_put(res);
dlm->purge_count--;
}
if (!__dlm_lockres_unused(res)) {
mlog(ML_ERROR, "found lockres %s:%.*s: in use after deref\n",
dlm->name, res->lockname.len, res->lockname.name);
__dlm_print_one_lock_resource(res);
BUG();
}
__dlm_unhash_lockres(res);
/* lockres is not in the hash now. drop the flag and wake up
* any processes waiting in dlm_get_lock_resource. */
if (!master) {
res->state &= ~DLM_LOCK_RES_DROPPING_REF;
spin_unlock(&res->spinlock);
wake_up(&res->wq);
} else
spin_unlock(&res->spinlock);
}
static void dlm_run_purge_list(struct dlm_ctxt *dlm,
int purge_now)
{
unsigned int run_max, unused;
unsigned long purge_jiffies;
struct dlm_lock_resource *lockres;
spin_lock(&dlm->spinlock);
run_max = dlm->purge_count;
while(run_max && !list_empty(&dlm->purge_list)) {
run_max--;
lockres = list_entry(dlm->purge_list.next,
struct dlm_lock_resource, purge);
spin_lock(&lockres->spinlock);
purge_jiffies = lockres->last_used +
msecs_to_jiffies(DLM_PURGE_INTERVAL_MS);
/* Make sure that we want to be processing this guy at
* this time. */
if (!purge_now && time_after(purge_jiffies, jiffies)) {
/* Since resources are added to the purge list
* in tail order, we can stop at the first
* unpurgeable resource -- anyone added after
* him will have a greater last_used value */
spin_unlock(&lockres->spinlock);
break;
}
/* Status of the lockres *might* change so double
* check. If the lockres is unused, holding the dlm
* spinlock will prevent people from getting any more
* refs on it. */
unused = __dlm_lockres_unused(lockres);
if (!unused ||
(lockres->state & DLM_LOCK_RES_MIGRATING)) {
mlog(0, "lockres %s:%.*s: is in use or "
"being remastered, used %d, state %d\n",
dlm->name, lockres->lockname.len,
lockres->lockname.name, !unused, lockres->state);
list_move_tail(&dlm->purge_list, &lockres->purge);
spin_unlock(&lockres->spinlock);
continue;
}
dlm_lockres_get(lockres);
dlm_purge_lockres(dlm, lockres);
dlm_lockres_put(lockres);
/* Avoid adding any scheduling latencies */
cond_resched_lock(&dlm->spinlock);
}
spin_unlock(&dlm->spinlock);
}
static void dlm_shuffle_lists(struct dlm_ctxt *dlm,
struct dlm_lock_resource *res)
{
struct dlm_lock *lock, *target;
struct list_head *iter;
struct list_head *head;
int can_grant = 1;
//mlog(0, "res->lockname.len=%d\n", res->lockname.len);
//mlog(0, "res->lockname.name=%p\n", res->lockname.name);
//mlog(0, "shuffle res %.*s\n", res->lockname.len,
// res->lockname.name);
/* because this function is called with the lockres
* spinlock, and because we know that it is not migrating/
* recovering/in-progress, it is fine to reserve asts and
* basts right before queueing them all throughout */
assert_spin_locked(&res->spinlock);
BUG_ON((res->state & (DLM_LOCK_RES_MIGRATING|
DLM_LOCK_RES_RECOVERING|
DLM_LOCK_RES_IN_PROGRESS)));
converting:
if (list_empty(&res->converting))
goto blocked;
mlog(0, "res %.*s has locks on a convert queue\n", res->lockname.len,
res->lockname.name);
target = list_entry(res->converting.next, struct dlm_lock, list);
if (target->ml.convert_type == LKM_IVMODE) {
mlog(ML_ERROR, "%.*s: converting a lock with no "
"convert_type!\n", res->lockname.len, res->lockname.name);
BUG();
}
head = &res->granted;
list_for_each(iter, head) {
lock = list_entry(iter, struct dlm_lock, list);
if (lock==target)
continue;
if (!dlm_lock_compatible(lock->ml.type,
target->ml.convert_type)) {
can_grant = 0;
/* queue the BAST if not already */
if (lock->ml.highest_blocked == LKM_IVMODE) {
__dlm_lockres_reserve_ast(res);
dlm_queue_bast(dlm, lock);
}
/* update the highest_blocked if needed */
if (lock->ml.highest_blocked < target->ml.convert_type)
lock->ml.highest_blocked =
target->ml.convert_type;
}
}
head = &res->converting;
list_for_each(iter, head) {
lock = list_entry(iter, struct dlm_lock, list);
if (lock==target)
continue;
if (!dlm_lock_compatible(lock->ml.type,
target->ml.convert_type)) {
can_grant = 0;
if (lock->ml.highest_blocked == LKM_IVMODE) {
__dlm_lockres_reserve_ast(res);
dlm_queue_bast(dlm, lock);
}
if (lock->ml.highest_blocked < target->ml.convert_type)
lock->ml.highest_blocked =
target->ml.convert_type;
}
}
/* we can convert the lock */
if (can_grant) {
spin_lock(&target->spinlock);
BUG_ON(target->ml.highest_blocked != LKM_IVMODE);
mlog(0, "calling ast for converting lock: %.*s, have: %d, "
"granting: %d, node: %u\n", res->lockname.len,
res->lockname.name, target->ml.type,
target->ml.convert_type, target->ml.node);
target->ml.type = target->ml.convert_type;
target->ml.convert_type = LKM_IVMODE;
list_move_tail(&target->list, &res->granted);
BUG_ON(!target->lksb);
target->lksb->status = DLM_NORMAL;
spin_unlock(&target->spinlock);
__dlm_lockres_reserve_ast(res);
dlm_queue_ast(dlm, target);
/* go back and check for more */
goto converting;
}
blocked:
if (list_empty(&res->blocked))
goto leave;
target = list_entry(res->blocked.next, struct dlm_lock, list);
head = &res->granted;
list_for_each(iter, head) {
lock = list_entry(iter, struct dlm_lock, list);
if (lock==target)
continue;
if (!dlm_lock_compatible(lock->ml.type, target->ml.type)) {
can_grant = 0;
if (lock->ml.highest_blocked == LKM_IVMODE) {
__dlm_lockres_reserve_ast(res);
dlm_queue_bast(dlm, lock);
}
if (lock->ml.highest_blocked < target->ml.type)
lock->ml.highest_blocked = target->ml.type;
}
}
head = &res->converting;
list_for_each(iter, head) {
lock = list_entry(iter, struct dlm_lock, list);
if (lock==target)
continue;
if (!dlm_lock_compatible(lock->ml.type, target->ml.type)) {
can_grant = 0;
if (lock->ml.highest_blocked == LKM_IVMODE) {
__dlm_lockres_reserve_ast(res);
dlm_queue_bast(dlm, lock);
}
if (lock->ml.highest_blocked < target->ml.type)
lock->ml.highest_blocked = target->ml.type;
}
}
/* we can grant the blocked lock (only
* possible if converting list empty) */
if (can_grant) {
spin_lock(&target->spinlock);
BUG_ON(target->ml.highest_blocked != LKM_IVMODE);
mlog(0, "calling ast for blocked lock: %.*s, granting: %d, "
"node: %u\n", res->lockname.len, res->lockname.name,
target->ml.type, target->ml.node);
// target->ml.type is already correct
list_move_tail(&target->list, &res->granted);
BUG_ON(!target->lksb);
target->lksb->status = DLM_NORMAL;
spin_unlock(&target->spinlock);
__dlm_lockres_reserve_ast(res);
dlm_queue_ast(dlm, target);
/* go back and check for more */
goto converting;
}
leave:
return;
}
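/*
 * Summary of the shuffle above: each lockres keeps three queues -- granted,
 * converting and blocked.  The head of the converting queue is considered
 * first, then the head of the blocked queue.  A lock moves to the granted
 * queue (and has its AST queued) only when its requested mode is compatible
 * with everything already granted or converting; otherwise BASTs are queued
 * against the incompatible holders and ml.highest_blocked is raised so they
 * know a downconvert is wanted.
 */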
/* must have NO locks when calling this with res != NULL */
void dlm_kick_thread(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
{
mlog_entry("dlm=%p, res=%p\n", dlm, res);
if (res) {
spin_lock(&dlm->spinlock);
spin_lock(&res->spinlock);
__dlm_dirty_lockres(dlm, res);
spin_unlock(&res->spinlock);
spin_unlock(&dlm->spinlock);
}
wake_up(&dlm->dlm_thread_wq);
}
void __dlm_dirty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
{
mlog_entry("dlm=%p, res=%p\n", dlm, res);
assert_spin_locked(&dlm->spinlock);
assert_spin_locked(&res->spinlock);
/* don't shuffle secondary queues */
if ((res->owner == dlm->node_num)) {
if (res->state & (DLM_LOCK_RES_MIGRATING |
DLM_LOCK_RES_BLOCK_DIRTY))
return;
if (list_empty(&res->dirty)) {
/* ref for dirty_list */
dlm_lockres_get(res);
list_add_tail(&res->dirty, &dlm->dirty_list);
res->state |= DLM_LOCK_RES_DIRTY;
}
}
}
/* Launch the NM thread for the mounted volume */
int dlm_launch_thread(struct dlm_ctxt *dlm)
{
mlog(0, "starting dlm thread...\n");
dlm->dlm_thread_task = kthread_run(dlm_thread, dlm, "dlm_thread");
if (IS_ERR(dlm->dlm_thread_task)) {
mlog_errno(PTR_ERR(dlm->dlm_thread_task));
dlm->dlm_thread_task = NULL;
return -EINVAL;
}
return 0;
}
void dlm_complete_thread(struct dlm_ctxt *dlm)
{
if (dlm->dlm_thread_task) {
mlog(ML_KTHREAD, "waiting for dlm thread to exit\n");
kthread_stop(dlm->dlm_thread_task);
dlm->dlm_thread_task = NULL;
}
}
static int dlm_dirty_list_empty(struct dlm_ctxt *dlm)
{
int empty;
spin_lock(&dlm->spinlock);
empty = list_empty(&dlm->dirty_list);
spin_unlock(&dlm->spinlock);
return empty;
}
static void dlm_flush_asts(struct dlm_ctxt *dlm)
{
int ret;
struct dlm_lock *lock;
struct dlm_lock_resource *res;
u8 hi;
spin_lock(&dlm->ast_lock);
while (!list_empty(&dlm->pending_asts)) {
lock = list_entry(dlm->pending_asts.next,
struct dlm_lock, ast_list);
/* get an extra ref on lock */
dlm_lock_get(lock);
res = lock->lockres;
mlog(0, "delivering an ast for this lockres\n");
BUG_ON(!lock->ast_pending);
/* remove from list (including ref) */
list_del_init(&lock->ast_list);
dlm_lock_put(lock);
spin_unlock(&dlm->ast_lock);
if (lock->ml.node != dlm->node_num) {
ret = dlm_do_remote_ast(dlm, res, lock);
if (ret < 0)
mlog_errno(ret);
} else
dlm_do_local_ast(dlm, res, lock);
spin_lock(&dlm->ast_lock);
/* possible that another ast was queued while
* we were delivering the last one */
if (!list_empty(&lock->ast_list)) {
mlog(0, "aha another ast got queued while "
"we were finishing the last one. will "
"keep the ast_pending flag set.\n");
} else
lock->ast_pending = 0;
/* drop the extra ref.
* this may drop it completely. */
dlm_lock_put(lock);
dlm_lockres_release_ast(dlm, res);
}
while (!list_empty(&dlm->pending_basts)) {
lock = list_entry(dlm->pending_basts.next,
struct dlm_lock, bast_list);
/* get an extra ref on lock */
dlm_lock_get(lock);
res = lock->lockres;
BUG_ON(!lock->bast_pending);
/* get the highest blocked lock, and reset */
spin_lock(&lock->spinlock);
BUG_ON(lock->ml.highest_blocked <= LKM_IVMODE);
hi = lock->ml.highest_blocked;
lock->ml.highest_blocked = LKM_IVMODE;
spin_unlock(&lock->spinlock);
/* remove from list (including ref) */
list_del_init(&lock->bast_list);
dlm_lock_put(lock);
spin_unlock(&dlm->ast_lock);
mlog(0, "delivering a bast for this lockres "
"(blocked = %d\n", hi);
if (lock->ml.node != dlm->node_num) {
ret = dlm_send_proxy_bast(dlm, res, lock, hi);
if (ret < 0)
mlog_errno(ret);
} else
dlm_do_local_bast(dlm, res, lock, hi);
spin_lock(&dlm->ast_lock);
/* possible that another bast was queued while
* we were delivering the last one */
if (!list_empty(&lock->bast_list)) {
mlog(0, "aha another bast got queued while "
"we were finishing the last one. will "
"keep the bast_pending flag set.\n");
} else
lock->bast_pending = 0;
/* drop the extra ref.
* this may drop it completely. */
dlm_lock_put(lock);
dlm_lockres_release_ast(dlm, res);
}
wake_up(&dlm->ast_wq);
spin_unlock(&dlm->ast_lock);
}
#define DLM_THREAD_TIMEOUT_MS (4 * 1000)
#define DLM_THREAD_MAX_DIRTY 100
#define DLM_THREAD_MAX_ASTS 10
static int dlm_thread(void *data)
{
struct dlm_lock_resource *res;
struct dlm_ctxt *dlm = data;
unsigned long timeout = msecs_to_jiffies(DLM_THREAD_TIMEOUT_MS);
mlog(0, "dlm thread running for %s...\n", dlm->name);
while (!kthread_should_stop()) {
int n = DLM_THREAD_MAX_DIRTY;
/* dlm_shutting_down is very point-in-time, but that
* doesn't matter as we'll just loop back around if we
* get false on the leading edge of a state
* transition. */
dlm_run_purge_list(dlm, dlm_shutting_down(dlm));
/* We really don't want to hold dlm->spinlock while
* calling dlm_shuffle_lists on each lockres that
* needs to have its queues adjusted and AST/BASTs
* run. So let's pull each entry off the dirty_list
* and drop dlm->spinlock ASAP. Once off the list,
* res->spinlock needs to be taken again to protect
* the queues while calling dlm_shuffle_lists. */
spin_lock(&dlm->spinlock);
while (!list_empty(&dlm->dirty_list)) {
int delay = 0;
res = list_entry(dlm->dirty_list.next,
struct dlm_lock_resource, dirty);
/* peel a lockres off, remove it from the list,
* unset the dirty flag and drop the dlm lock */
BUG_ON(!res);
dlm_lockres_get(res);
spin_lock(&res->spinlock);
/* We clear the DLM_LOCK_RES_DIRTY state once we shuffle lists below */
list_del_init(&res->dirty);
spin_unlock(&res->spinlock);
spin_unlock(&dlm->spinlock);
/* Drop dirty_list ref */
dlm_lockres_put(res);
/* lockres can be re-dirtied/re-added to the
* dirty_list in this gap, but that is ok */
spin_lock(&res->spinlock);
if (res->owner != dlm->node_num) {
__dlm_print_one_lock_resource(res);
mlog(ML_ERROR, "inprog:%s, mig:%s, reco:%s, dirty:%s\n",
res->state & DLM_LOCK_RES_IN_PROGRESS ? "yes" : "no",
res->state & DLM_LOCK_RES_MIGRATING ? "yes" : "no",
res->state & DLM_LOCK_RES_RECOVERING ? "yes" : "no",
res->state & DLM_LOCK_RES_DIRTY ? "yes" : "no");
}
BUG_ON(res->owner != dlm->node_num);
/* it is now ok to move lockreses in these states
* to the dirty list, assuming that they will only be
* dirty for a short while. */
BUG_ON(res->state & DLM_LOCK_RES_MIGRATING);
if (res->state & (DLM_LOCK_RES_IN_PROGRESS |
DLM_LOCK_RES_RECOVERING)) {
/* move it to the tail and keep going */
res->state &= ~DLM_LOCK_RES_DIRTY;
spin_unlock(&res->spinlock);
mlog(0, "delaying list shuffling for in-"
"progress lockres %.*s, state=%d\n",
res->lockname.len, res->lockname.name,
res->state);
delay = 1;
goto in_progress;
}
/* at this point the lockres is not migrating/
* recovering/in-progress. we have the lockres
* spinlock and do NOT have the dlm lock.
* safe to reserve/queue asts and run the lists. */
mlog(0, "calling dlm_shuffle_lists with dlm=%s, "
"res=%.*s\n", dlm->name,
res->lockname.len, res->lockname.name);
/* called while holding lockres lock */
dlm_shuffle_lists(dlm, res);
res->state &= ~DLM_LOCK_RES_DIRTY;
spin_unlock(&res->spinlock);
dlm_lockres_calc_usage(dlm, res);
in_progress:
spin_lock(&dlm->spinlock);
/* if the lock was in-progress, stick
* it on the back of the list */
if (delay) {
spin_lock(&res->spinlock);
__dlm_dirty_lockres(dlm, res);
spin_unlock(&res->spinlock);
}
dlm_lockres_put(res);
/* unlikely, but we may need to give time to
* other tasks */
if (!--n) {
mlog(0, "throttling dlm_thread\n");
break;
}
}
spin_unlock(&dlm->spinlock);
dlm_flush_asts(dlm);
/* yield and continue right away if there is more work to do */
if (!n) {
cond_resched();
continue;
}
wait_event_interruptible_timeout(dlm->dlm_thread_wq,
!dlm_dirty_list_empty(dlm) ||
kthread_should_stop(),
timeout);
}
mlog(0, "quitting DLM thread\n");
return 0;
}

View File

@@ -0,0 +1,694 @@
/* -*- mode: c; c-basic-offset: 8; -*-
* vim: noexpandtab sw=8 ts=8 sts=0:
*
* dlmunlock.c
*
* underlying calls for unlocking locks
*
* Copyright (C) 2004 Oracle. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 021110-1307, USA.
*
*/
#include <linux/module.h>
#include <linux/fs.h>
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/highmem.h>
#include <linux/init.h>
#include <linux/sysctl.h>
#include <linux/random.h>
#include <linux/blkdev.h>
#include <linux/socket.h>
#include <linux/inet.h>
#include <linux/spinlock.h>
#include <linux/delay.h>
#include "cluster/heartbeat.h"
#include "cluster/nodemanager.h"
#include "cluster/tcp.h"
#include "dlmapi.h"
#include "dlmcommon.h"
#define MLOG_MASK_PREFIX ML_DLM
#include "cluster/masklog.h"
#define DLM_UNLOCK_FREE_LOCK 0x00000001
#define DLM_UNLOCK_CALL_AST 0x00000002
#define DLM_UNLOCK_REMOVE_LOCK 0x00000004
#define DLM_UNLOCK_REGRANT_LOCK 0x00000008
#define DLM_UNLOCK_CLEAR_CONVERT_TYPE 0x00000010
static enum dlm_status dlm_get_cancel_actions(struct dlm_ctxt *dlm,
struct dlm_lock_resource *res,
struct dlm_lock *lock,
struct dlm_lockstatus *lksb,
int *actions);
static enum dlm_status dlm_get_unlock_actions(struct dlm_ctxt *dlm,
struct dlm_lock_resource *res,
struct dlm_lock *lock,
struct dlm_lockstatus *lksb,
int *actions);
static enum dlm_status dlm_send_remote_unlock_request(struct dlm_ctxt *dlm,
struct dlm_lock_resource *res,
struct dlm_lock *lock,
struct dlm_lockstatus *lksb,
int flags,
u8 owner);
/*
* according to the spec:
* http://opendlm.sourceforge.net/cvsmirror/opendlm/docs/dlmbook_final.pdf
*
* flags & LKM_CANCEL != 0: must be converting or blocked
* flags & LKM_CANCEL == 0: must be granted
*
* So to unlock a converting lock, you must first cancel the
* convert (passing LKM_CANCEL in flags), then call the unlock
* again (with no LKM_CANCEL in flags).
*/
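/*
 * Sketch of the resulting call sequence for backing out an in-flight
 * convert (unlockast and data stand in for the caller's own callback
 * and argument):
 *
 *	status = dlmunlock(dlm, lksb, LKM_CANCEL, unlockast, data);
 *	... once the cancel has completed ...
 *	status = dlmunlock(dlm, lksb, 0, unlockast, data);
 */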
/*
* locking:
* caller needs: none
* taken: res->spinlock and lock->spinlock taken and dropped
* held on exit: none
* returns: DLM_NORMAL, DLM_NOLOCKMGR, status from network
* all callers should have taken an extra ref on lock coming in
*/
static enum dlm_status dlmunlock_common(struct dlm_ctxt *dlm,
struct dlm_lock_resource *res,
struct dlm_lock *lock,
struct dlm_lockstatus *lksb,
int flags, int *call_ast,
int master_node)
{
enum dlm_status status;
int actions = 0;
int in_use;
u8 owner;
mlog(0, "master_node = %d, valblk = %d\n", master_node,
flags & LKM_VALBLK);
if (master_node)
BUG_ON(res->owner != dlm->node_num);
else
BUG_ON(res->owner == dlm->node_num);
spin_lock(&dlm->ast_lock);
/* We want to be sure that we're not freeing a lock
* that still has AST's pending... */
in_use = !list_empty(&lock->ast_list);
spin_unlock(&dlm->ast_lock);
if (in_use && !(flags & LKM_CANCEL)) {
mlog(ML_ERROR, "lockres %.*s: Someone is calling dlmunlock "
"while waiting for an ast!", res->lockname.len,
res->lockname.name);
return DLM_BADPARAM;
}
spin_lock(&res->spinlock);
if (res->state & DLM_LOCK_RES_IN_PROGRESS) {
if (master_node && !(flags & LKM_CANCEL)) {
mlog(ML_ERROR, "lockres in progress!\n");
spin_unlock(&res->spinlock);
return DLM_FORWARD;
}
/* ok for this to sleep if not in a network handler */
__dlm_wait_on_lockres(res);
res->state |= DLM_LOCK_RES_IN_PROGRESS;
}
spin_lock(&lock->spinlock);
if (res->state & DLM_LOCK_RES_RECOVERING) {
status = DLM_RECOVERING;
goto leave;
}
if (res->state & DLM_LOCK_RES_MIGRATING) {
status = DLM_MIGRATING;
goto leave;
}
/* see above for what the spec says about
* LKM_CANCEL and the lock queue state */
if (flags & LKM_CANCEL)
status = dlm_get_cancel_actions(dlm, res, lock, lksb, &actions);
else
status = dlm_get_unlock_actions(dlm, res, lock, lksb, &actions);
if (status != DLM_NORMAL && (status != DLM_CANCELGRANT || !master_node))
goto leave;
/* By now this has been masked out of cancel requests. */
if (flags & LKM_VALBLK) {
/* make the final update to the lvb */
if (master_node)
memcpy(res->lvb, lksb->lvb, DLM_LVB_LEN);
else
flags |= LKM_PUT_LVB; /* let the send function
* handle it. */
}
if (!master_node) {
owner = res->owner;
/* drop locks and send message */
if (flags & LKM_CANCEL)
lock->cancel_pending = 1;
else
lock->unlock_pending = 1;
spin_unlock(&lock->spinlock);
spin_unlock(&res->spinlock);
status = dlm_send_remote_unlock_request(dlm, res, lock, lksb,
flags, owner);
spin_lock(&res->spinlock);
spin_lock(&lock->spinlock);
/* if the master told us the lock was already granted,
* let the ast handle all of these actions */
if (status == DLM_CANCELGRANT) {
actions &= ~(DLM_UNLOCK_REMOVE_LOCK|
DLM_UNLOCK_REGRANT_LOCK|
DLM_UNLOCK_CLEAR_CONVERT_TYPE);
} else if (status == DLM_RECOVERING ||
status == DLM_MIGRATING ||
status == DLM_FORWARD) {
/* must clear the actions because this unlock
* is about to be retried. cannot free or do
* any list manipulation. */
mlog(0, "%s:%.*s: clearing actions, %s\n",
dlm->name, res->lockname.len,
res->lockname.name,
status==DLM_RECOVERING?"recovering":
(status==DLM_MIGRATING?"migrating":
"forward"));
actions = 0;
}
if (flags & LKM_CANCEL)
lock->cancel_pending = 0;
else
lock->unlock_pending = 0;
}
/* get an extra ref on lock. if we are just switching
* lists here, we dont want the lock to go away. */
dlm_lock_get(lock);
if (actions & DLM_UNLOCK_REMOVE_LOCK) {
list_del_init(&lock->list);
dlm_lock_put(lock);
}
if (actions & DLM_UNLOCK_REGRANT_LOCK) {
dlm_lock_get(lock);
list_add_tail(&lock->list, &res->granted);
}
if (actions & DLM_UNLOCK_CLEAR_CONVERT_TYPE) {
mlog(0, "clearing convert_type at %smaster node\n",
master_node ? "" : "non-");
lock->ml.convert_type = LKM_IVMODE;
}
/* remove the extra ref on lock */
dlm_lock_put(lock);
leave:
res->state &= ~DLM_LOCK_RES_IN_PROGRESS;
if (!dlm_lock_on_list(&res->converting, lock))
BUG_ON(lock->ml.convert_type != LKM_IVMODE);
else
BUG_ON(lock->ml.convert_type == LKM_IVMODE);
spin_unlock(&lock->spinlock);
spin_unlock(&res->spinlock);
wake_up(&res->wq);
/* let the caller's final dlm_lock_put handle the actual kfree */
if (actions & DLM_UNLOCK_FREE_LOCK) {
/* this should always be coupled with list removal */
BUG_ON(!(actions & DLM_UNLOCK_REMOVE_LOCK));
mlog(0, "lock %u:%llu should be gone now! refs=%d\n",
dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)),
atomic_read(&lock->lock_refs.refcount)-1);
dlm_lock_put(lock);
}
if (actions & DLM_UNLOCK_CALL_AST)
*call_ast = 1;
/* if cancel or unlock succeeded, lvb work is done */
if (status == DLM_NORMAL)
lksb->flags &= ~(DLM_LKSB_PUT_LVB|DLM_LKSB_GET_LVB);
return status;
}
void dlm_commit_pending_unlock(struct dlm_lock_resource *res,
struct dlm_lock *lock)
{
/* leave DLM_LKSB_PUT_LVB on the lksb so any final
* update of the lvb will be sent to the new master */
list_del_init(&lock->list);
}
void dlm_commit_pending_cancel(struct dlm_lock_resource *res,
struct dlm_lock *lock)
{
list_move_tail(&lock->list, &res->granted);
lock->ml.convert_type = LKM_IVMODE;
}
static inline enum dlm_status dlmunlock_master(struct dlm_ctxt *dlm,
struct dlm_lock_resource *res,
struct dlm_lock *lock,
struct dlm_lockstatus *lksb,
int flags,
int *call_ast)
{
return dlmunlock_common(dlm, res, lock, lksb, flags, call_ast, 1);
}
static inline enum dlm_status dlmunlock_remote(struct dlm_ctxt *dlm,
struct dlm_lock_resource *res,
struct dlm_lock *lock,
struct dlm_lockstatus *lksb,
int flags, int *call_ast)
{
return dlmunlock_common(dlm, res, lock, lksb, flags, call_ast, 0);
}
/*
* locking:
* caller needs: none
* taken: none
* held on exit: none
* returns: DLM_NORMAL, DLM_NOLOCKMGR, status from network
*/
static enum dlm_status dlm_send_remote_unlock_request(struct dlm_ctxt *dlm,
struct dlm_lock_resource *res,
struct dlm_lock *lock,
struct dlm_lockstatus *lksb,
int flags,
u8 owner)
{
struct dlm_unlock_lock unlock;
int tmpret;
enum dlm_status ret;
int status = 0;
struct kvec vec[2];
size_t veclen = 1;
mlog_entry("%.*s\n", res->lockname.len, res->lockname.name);
if (owner == dlm->node_num) {
/* ended up trying to contact ourself. this means
* that the lockres had been remote but became local
* via a migration. just retry it, now as local */
mlog(0, "%s:%.*s: this node became the master due to a "
"migration, re-evaluate now\n", dlm->name,
res->lockname.len, res->lockname.name);
return DLM_FORWARD;
}
memset(&unlock, 0, sizeof(unlock));
unlock.node_idx = dlm->node_num;
unlock.flags = cpu_to_be32(flags);
unlock.cookie = lock->ml.cookie;
unlock.namelen = res->lockname.len;
memcpy(unlock.name, res->lockname.name, unlock.namelen);
vec[0].iov_len = sizeof(struct dlm_unlock_lock);
vec[0].iov_base = &unlock;
if (flags & LKM_PUT_LVB) {
/* extra data to send if we are updating lvb */
vec[1].iov_len = DLM_LVB_LEN;
vec[1].iov_base = lock->lksb->lvb;
veclen++;
}
tmpret = o2net_send_message_vec(DLM_UNLOCK_LOCK_MSG, dlm->key,
vec, veclen, owner, &status);
if (tmpret >= 0) {
// successfully sent and received
if (status == DLM_FORWARD)
mlog(0, "master was in-progress. retry\n");
ret = status;
} else {
mlog_errno(tmpret);
if (dlm_is_host_down(tmpret)) {
/* NOTE: this seems strange, but it is what we want.
* when the master goes down during a cancel or
* unlock, the recovery code completes the operation
* as if the master had not died, then passes the
* updated state to the recovery master. this thread
* just needs to finish out the operation and call
* the unlockast. */
ret = DLM_NORMAL;
} else {
/* something bad. this will BUG in ocfs2 */
ret = dlm_err_to_dlm_status(tmpret);
}
}
return ret;
}
/*
* locking:
* caller needs: none
* taken: takes and drops res->spinlock
* held on exit: none
* returns: DLM_NORMAL, DLM_BADARGS, DLM_IVLOCKID,
* return value from dlmunlock_master
*/
int dlm_unlock_lock_handler(struct o2net_msg *msg, u32 len, void *data,
void **ret_data)
{
struct dlm_ctxt *dlm = data;
struct dlm_unlock_lock *unlock = (struct dlm_unlock_lock *)msg->buf;
struct dlm_lock_resource *res = NULL;
struct list_head *iter;
struct dlm_lock *lock = NULL;
enum dlm_status status = DLM_NORMAL;
int found = 0, i;
struct dlm_lockstatus *lksb = NULL;
int ignore;
u32 flags;
struct list_head *queue;
flags = be32_to_cpu(unlock->flags);
if (flags & LKM_GET_LVB) {
mlog(ML_ERROR, "bad args! GET_LVB specified on unlock!\n");
return DLM_BADARGS;
}
if ((flags & (LKM_PUT_LVB|LKM_CANCEL)) == (LKM_PUT_LVB|LKM_CANCEL)) {
mlog(ML_ERROR, "bad args! cannot modify lvb on a CANCEL "
"request!\n");
return DLM_BADARGS;
}
if (unlock->namelen > DLM_LOCKID_NAME_MAX) {
mlog(ML_ERROR, "Invalid name length in unlock handler!\n");
return DLM_IVBUFLEN;
}
if (!dlm_grab(dlm))
return DLM_REJECTED;
mlog_bug_on_msg(!dlm_domain_fully_joined(dlm),
"Domain %s not fully joined!\n", dlm->name);
mlog(0, "lvb: %s\n", flags & LKM_PUT_LVB ? "put lvb" : "none");
res = dlm_lookup_lockres(dlm, unlock->name, unlock->namelen);
if (!res) {
/* We assume here that a missing lock resource simply means
* it was migrated away and destroyed before the other
* node could detect it. */
mlog(0, "returning DLM_FORWARD -- res no longer exists\n");
status = DLM_FORWARD;
goto not_found;
}
queue=&res->granted;
found = 0;
spin_lock(&res->spinlock);
if (res->state & DLM_LOCK_RES_RECOVERING) {
spin_unlock(&res->spinlock);
mlog(0, "returning DLM_RECOVERING\n");
status = DLM_RECOVERING;
goto leave;
}
if (res->state & DLM_LOCK_RES_MIGRATING) {
spin_unlock(&res->spinlock);
mlog(0, "returning DLM_MIGRATING\n");
status = DLM_MIGRATING;
goto leave;
}
if (res->owner != dlm->node_num) {
spin_unlock(&res->spinlock);
mlog(0, "returning DLM_FORWARD -- not master\n");
status = DLM_FORWARD;
goto leave;
}
for (i=0; i<3; i++) {
list_for_each(iter, queue) {
lock = list_entry(iter, struct dlm_lock, list);
if (lock->ml.cookie == unlock->cookie &&
lock->ml.node == unlock->node_idx) {
dlm_lock_get(lock);
found = 1;
break;
}
}
if (found)
break;
/* scan granted -> converting -> blocked queues */
queue++;
}
spin_unlock(&res->spinlock);
if (!found) {
status = DLM_IVLOCKID;
goto not_found;
}
/* lock was found on queue */
lksb = lock->lksb;
if (flags & (LKM_VALBLK|LKM_PUT_LVB) &&
lock->ml.type != LKM_EXMODE)
flags &= ~(LKM_VALBLK|LKM_PUT_LVB);
/* unlockast only called on originating node */
if (flags & LKM_PUT_LVB) {
lksb->flags |= DLM_LKSB_PUT_LVB;
memcpy(&lksb->lvb[0], &unlock->lvb[0], DLM_LVB_LEN);
}
/* if this is in-progress, propagate the DLM_FORWARD
* all the way back out */
status = dlmunlock_master(dlm, res, lock, lksb, flags, &ignore);
if (status == DLM_FORWARD)
mlog(0, "lockres is in progress\n");
if (flags & LKM_PUT_LVB)
lksb->flags &= ~DLM_LKSB_PUT_LVB;
dlm_lockres_calc_usage(dlm, res);
dlm_kick_thread(dlm, res);
not_found:
if (!found)
mlog(ML_ERROR, "failed to find lock to unlock! "
"cookie=%u:%llu\n",
dlm_get_lock_cookie_node(be64_to_cpu(unlock->cookie)),
dlm_get_lock_cookie_seq(be64_to_cpu(unlock->cookie)));
else
dlm_lock_put(lock);
leave:
if (res)
dlm_lockres_put(res);
dlm_put(dlm);
return status;
}
static enum dlm_status dlm_get_cancel_actions(struct dlm_ctxt *dlm,
struct dlm_lock_resource *res,
struct dlm_lock *lock,
struct dlm_lockstatus *lksb,
int *actions)
{
enum dlm_status status;
if (dlm_lock_on_list(&res->blocked, lock)) {
/* cancel this outright */
status = DLM_NORMAL;
*actions = (DLM_UNLOCK_CALL_AST |
DLM_UNLOCK_REMOVE_LOCK);
} else if (dlm_lock_on_list(&res->converting, lock)) {
/* cancel the request, put back on granted */
status = DLM_NORMAL;
*actions = (DLM_UNLOCK_CALL_AST |
DLM_UNLOCK_REMOVE_LOCK |
DLM_UNLOCK_REGRANT_LOCK |
DLM_UNLOCK_CLEAR_CONVERT_TYPE);
} else if (dlm_lock_on_list(&res->granted, lock)) {
/* too late, already granted. */
status = DLM_CANCELGRANT;
*actions = DLM_UNLOCK_CALL_AST;
} else {
mlog(ML_ERROR, "lock to cancel is not on any list!\n");
status = DLM_IVLOCKID;
*actions = 0;
}
return status;
}
static enum dlm_status dlm_get_unlock_actions(struct dlm_ctxt *dlm,
struct dlm_lock_resource *res,
struct dlm_lock *lock,
struct dlm_lockstatus *lksb,
int *actions)
{
enum dlm_status status;
/* unlock request */
if (!dlm_lock_on_list(&res->granted, lock)) {
status = DLM_DENIED;
dlm_error(status);
*actions = 0;
} else {
/* unlock granted lock */
status = DLM_NORMAL;
*actions = (DLM_UNLOCK_FREE_LOCK |
DLM_UNLOCK_CALL_AST |
DLM_UNLOCK_REMOVE_LOCK);
}
return status;
}
/* there seems to be no point in doing this async
* since (even for the remote case) there is really
* no work to queue up... so just do it and fire the
* unlockast by hand when done... */
enum dlm_status dlmunlock(struct dlm_ctxt *dlm, struct dlm_lockstatus *lksb,
int flags, dlm_astunlockfunc_t *unlockast, void *data)
{
enum dlm_status status;
struct dlm_lock_resource *res;
struct dlm_lock *lock = NULL;
int call_ast, is_master;
mlog_entry_void();
if (!lksb) {
dlm_error(DLM_BADARGS);
return DLM_BADARGS;
}
if (flags & ~(LKM_CANCEL | LKM_VALBLK | LKM_INVVALBLK)) {
dlm_error(DLM_BADPARAM);
return DLM_BADPARAM;
}
if ((flags & (LKM_VALBLK | LKM_CANCEL)) == (LKM_VALBLK | LKM_CANCEL)) {
mlog(0, "VALBLK given with CANCEL: ignoring VALBLK\n");
flags &= ~LKM_VALBLK;
}
if (!lksb->lockid || !lksb->lockid->lockres) {
dlm_error(DLM_BADPARAM);
return DLM_BADPARAM;
}
lock = lksb->lockid;
BUG_ON(!lock);
dlm_lock_get(lock);
res = lock->lockres;
BUG_ON(!res);
dlm_lockres_get(res);
retry:
call_ast = 0;
/* need to retry up here because owner may have changed */
mlog(0, "lock=%p res=%p\n", lock, res);
spin_lock(&res->spinlock);
is_master = (res->owner == dlm->node_num);
if (flags & LKM_VALBLK && lock->ml.type != LKM_EXMODE)
flags &= ~LKM_VALBLK;
spin_unlock(&res->spinlock);
if (is_master) {
status = dlmunlock_master(dlm, res, lock, lksb, flags,
&call_ast);
mlog(0, "done calling dlmunlock_master: returned %d, "
"call_ast is %d\n", status, call_ast);
} else {
status = dlmunlock_remote(dlm, res, lock, lksb, flags,
&call_ast);
mlog(0, "done calling dlmunlock_remote: returned %d, "
"call_ast is %d\n", status, call_ast);
}
if (status == DLM_RECOVERING ||
status == DLM_MIGRATING ||
status == DLM_FORWARD) {
/* We want to go away for a tiny bit to allow recovery
* / migration to complete on this resource. I don't
* know of any wait queue we could sleep on as this
* may be happening on another node. Perhaps the
* proper solution is to queue up requests on the
* other end? */
/* do we want to yield(); ?? */
msleep(50);
mlog(0, "retrying unlock due to pending recovery/"
"migration/in-progress\n");
goto retry;
}
if (call_ast) {
mlog(0, "calling unlockast(%p, %d)\n", data, status);
if (is_master) {
/* it is possible that there is one last bast
* pending. make sure it is flushed, then
* call the unlockast.
* not an issue if this is mastered remotely,
* since this lock has been removed from the
* lockres queues and cannot be found. */
dlm_kick_thread(dlm, NULL);
wait_event(dlm->ast_wq,
dlm_lock_basts_flushed(dlm, lock));
}
(*unlockast)(data, status);
}
if (status == DLM_CANCELGRANT)
status = DLM_NORMAL;
if (status == DLM_NORMAL) {
mlog(0, "kicking the thread\n");
dlm_kick_thread(dlm, res);
} else
dlm_error(status);
dlm_lockres_calc_usage(dlm, res);
dlm_lockres_put(res);
dlm_lock_put(lock);
mlog(0, "returning status=%d!\n", status);
return status;
}
EXPORT_SYMBOL_GPL(dlmunlock);

View File

@@ -0,0 +1,42 @@
/* -*- mode: c; c-basic-offset: 8; -*-
* vim: noexpandtab sw=8 ts=8 sts=0:
*
* dlmver.c
*
* version string
*
* Copyright (C) 2002, 2005 Oracle. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 021110-1307, USA.
*/
#include <linux/module.h>
#include <linux/kernel.h>
#include "dlmver.h"
#define DLM_BUILD_VERSION "1.5.0"
#define VERSION_STR "OCFS2 DLM " DLM_BUILD_VERSION
void dlm_print_version(void)
{
printk(KERN_INFO "%s\n", VERSION_STR);
}
MODULE_DESCRIPTION(VERSION_STR);
MODULE_VERSION(DLM_BUILD_VERSION);

View File

@@ -0,0 +1,31 @@
/* -*- mode: c; c-basic-offset: 8; -*-
* vim: noexpandtab sw=8 ts=8 sts=0:
*
* dlmver.h
*
* Function prototypes
*
* Copyright (C) 2005 Oracle. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 021110-1307, USA.
*/
#ifndef DLM_VER_H
#define DLM_VER_H
void dlm_print_version(void);
#endif /* DLM_VER_H */

View File

@@ -0,0 +1,676 @@
/* -*- mode: c; c-basic-offset: 8; -*-
* vim: noexpandtab sw=8 ts=8 sts=0:
*
* userdlm.c
*
* Code which implements the kernel side of a minimal userspace
* interface to our DLM.
*
* Many of the functions here are pared down versions of dlmglue.c
* functions.
*
* Copyright (C) 2003, 2004 Oracle. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 021110-1307, USA.
*/
#include <linux/signal.h>
#include <linux/module.h>
#include <linux/fs.h>
#include <linux/types.h>
#include <linux/crc32.h>
#include "cluster/nodemanager.h"
#include "cluster/heartbeat.h"
#include "cluster/tcp.h"
#include "dlmapi.h"
#include "userdlm.h"
#define MLOG_MASK_PREFIX ML_DLMFS
#include "cluster/masklog.h"
static inline int user_check_wait_flag(struct user_lock_res *lockres,
int flag)
{
int ret;
spin_lock(&lockres->l_lock);
ret = lockres->l_flags & flag;
spin_unlock(&lockres->l_lock);
return ret;
}
static inline void user_wait_on_busy_lock(struct user_lock_res *lockres)
{
wait_event(lockres->l_event,
!user_check_wait_flag(lockres, USER_LOCK_BUSY));
}
static inline void user_wait_on_blocked_lock(struct user_lock_res *lockres)
{
wait_event(lockres->l_event,
!user_check_wait_flag(lockres, USER_LOCK_BLOCKED));
}
/* I heart container_of... */
static inline struct dlm_ctxt *
dlm_ctxt_from_user_lockres(struct user_lock_res *lockres)
{
struct dlmfs_inode_private *ip;
ip = container_of(lockres,
struct dlmfs_inode_private,
ip_lockres);
return ip->ip_dlm;
}
static struct inode *
user_dlm_inode_from_user_lockres(struct user_lock_res *lockres)
{
struct dlmfs_inode_private *ip;
ip = container_of(lockres,
struct dlmfs_inode_private,
ip_lockres);
return &ip->ip_vfs_inode;
}
static inline void user_recover_from_dlm_error(struct user_lock_res *lockres)
{
spin_lock(&lockres->l_lock);
lockres->l_flags &= ~USER_LOCK_BUSY;
spin_unlock(&lockres->l_lock);
}
#define user_log_dlm_error(_func, _stat, _lockres) do { \
mlog(ML_ERROR, "Dlm error \"%s\" while calling %s on " \
"resource %.*s: %s\n", dlm_errname(_stat), _func, \
_lockres->l_namelen, _lockres->l_name, dlm_errmsg(_stat)); \
} while (0)
/* WARNING: This function lives in a world where the only three lock
* levels are EX, PR, and NL. It *will* have to be adjusted when more
* lock types are added. */
static inline int user_highest_compat_lock_level(int level)
{
int new_level = LKM_EXMODE;
if (level == LKM_EXMODE)
new_level = LKM_NLMODE;
else if (level == LKM_PRMODE)
new_level = LKM_PRMODE;
return new_level;
}
static void user_ast(void *opaque)
{
struct user_lock_res *lockres = opaque;
struct dlm_lockstatus *lksb;
mlog(0, "AST fired for lockres %.*s\n", lockres->l_namelen,
lockres->l_name);
spin_lock(&lockres->l_lock);
lksb = &(lockres->l_lksb);
if (lksb->status != DLM_NORMAL) {
mlog(ML_ERROR, "lksb status value of %u on lockres %.*s\n",
lksb->status, lockres->l_namelen, lockres->l_name);
spin_unlock(&lockres->l_lock);
return;
}
mlog_bug_on_msg(lockres->l_requested == LKM_IVMODE,
"Lockres %.*s, requested ivmode. flags 0x%x\n",
lockres->l_namelen, lockres->l_name, lockres->l_flags);
/* we're downconverting. */
if (lockres->l_requested < lockres->l_level) {
if (lockres->l_requested <=
user_highest_compat_lock_level(lockres->l_blocking)) {
lockres->l_blocking = LKM_NLMODE;
lockres->l_flags &= ~USER_LOCK_BLOCKED;
}
}
lockres->l_level = lockres->l_requested;
lockres->l_requested = LKM_IVMODE;
lockres->l_flags |= USER_LOCK_ATTACHED;
lockres->l_flags &= ~USER_LOCK_BUSY;
spin_unlock(&lockres->l_lock);
wake_up(&lockres->l_event);
}
static inline void user_dlm_grab_inode_ref(struct user_lock_res *lockres)
{
struct inode *inode;
inode = user_dlm_inode_from_user_lockres(lockres);
if (!igrab(inode))
BUG();
}
static void user_dlm_unblock_lock(struct work_struct *work);
static void __user_dlm_queue_lockres(struct user_lock_res *lockres)
{
if (!(lockres->l_flags & USER_LOCK_QUEUED)) {
user_dlm_grab_inode_ref(lockres);
INIT_WORK(&lockres->l_work, user_dlm_unblock_lock);
queue_work(user_dlm_worker, &lockres->l_work);
lockres->l_flags |= USER_LOCK_QUEUED;
}
}
static void __user_dlm_cond_queue_lockres(struct user_lock_res *lockres)
{
int queue = 0;
if (!(lockres->l_flags & USER_LOCK_BLOCKED))
return;
switch (lockres->l_blocking) {
case LKM_EXMODE:
if (!lockres->l_ex_holders && !lockres->l_ro_holders)
queue = 1;
break;
case LKM_PRMODE:
if (!lockres->l_ex_holders)
queue = 1;
break;
default:
BUG();
}
if (queue)
__user_dlm_queue_lockres(lockres);
}
static void user_bast(void *opaque, int level)
{
struct user_lock_res *lockres = opaque;
mlog(0, "Blocking AST fired for lockres %.*s. Blocking level %d\n",
lockres->l_namelen, lockres->l_name, level);
spin_lock(&lockres->l_lock);
lockres->l_flags |= USER_LOCK_BLOCKED;
if (level > lockres->l_blocking)
lockres->l_blocking = level;
__user_dlm_queue_lockres(lockres);
spin_unlock(&lockres->l_lock);
wake_up(&lockres->l_event);
}
static void user_unlock_ast(void *opaque, enum dlm_status status)
{
struct user_lock_res *lockres = opaque;
mlog(0, "UNLOCK AST called on lock %.*s\n", lockres->l_namelen,
lockres->l_name);
if (status != DLM_NORMAL && status != DLM_CANCELGRANT)
mlog(ML_ERROR, "Dlm returns status %d\n", status);
spin_lock(&lockres->l_lock);
/* The teardown flag gets set early during the unlock process,
* so test the cancel flag to make sure that this ast isn't
* for a concurrent cancel. */
if (lockres->l_flags & USER_LOCK_IN_TEARDOWN
&& !(lockres->l_flags & USER_LOCK_IN_CANCEL)) {
lockres->l_level = LKM_IVMODE;
} else if (status == DLM_CANCELGRANT) {
/* We tried to cancel a convert request, but it was
* already granted. Don't clear the busy flag - the
* ast should've done this already. */
BUG_ON(!(lockres->l_flags & USER_LOCK_IN_CANCEL));
lockres->l_flags &= ~USER_LOCK_IN_CANCEL;
goto out_noclear;
} else {
BUG_ON(!(lockres->l_flags & USER_LOCK_IN_CANCEL));
/* Cancel succeeded, we want to re-queue */
lockres->l_requested = LKM_IVMODE; /* cancel an
* upconvert
* request. */
lockres->l_flags &= ~USER_LOCK_IN_CANCEL;
/* we want the unblock thread to look at it again
* now. */
if (lockres->l_flags & USER_LOCK_BLOCKED)
__user_dlm_queue_lockres(lockres);
}
lockres->l_flags &= ~USER_LOCK_BUSY;
out_noclear:
spin_unlock(&lockres->l_lock);
wake_up(&lockres->l_event);
}
static inline void user_dlm_drop_inode_ref(struct user_lock_res *lockres)
{
struct inode *inode;
inode = user_dlm_inode_from_user_lockres(lockres);
iput(inode);
}
static void user_dlm_unblock_lock(struct work_struct *work)
{
int new_level, status;
struct user_lock_res *lockres =
container_of(work, struct user_lock_res, l_work);
struct dlm_ctxt *dlm = dlm_ctxt_from_user_lockres(lockres);
mlog(0, "processing lockres %.*s\n", lockres->l_namelen,
lockres->l_name);
spin_lock(&lockres->l_lock);
mlog_bug_on_msg(!(lockres->l_flags & USER_LOCK_QUEUED),
"Lockres %.*s, flags 0x%x\n",
lockres->l_namelen, lockres->l_name, lockres->l_flags);
/* notice that we don't clear USER_LOCK_BLOCKED here. If it's
	 * set, we want user_ast to clear it. */
lockres->l_flags &= ~USER_LOCK_QUEUED;
/* It's valid to get here and no longer be blocked - if we get
* several basts in a row, we might be queued by the first
* one, the unblock thread might run and clear the queued
* flag, and finally we might get another bast which re-queues
* us before our ast for the downconvert is called. */
if (!(lockres->l_flags & USER_LOCK_BLOCKED)) {
spin_unlock(&lockres->l_lock);
goto drop_ref;
}
if (lockres->l_flags & USER_LOCK_IN_TEARDOWN) {
spin_unlock(&lockres->l_lock);
goto drop_ref;
}
if (lockres->l_flags & USER_LOCK_BUSY) {
if (lockres->l_flags & USER_LOCK_IN_CANCEL) {
spin_unlock(&lockres->l_lock);
goto drop_ref;
}
lockres->l_flags |= USER_LOCK_IN_CANCEL;
spin_unlock(&lockres->l_lock);
status = dlmunlock(dlm,
&lockres->l_lksb,
LKM_CANCEL,
user_unlock_ast,
lockres);
if (status != DLM_NORMAL)
user_log_dlm_error("dlmunlock", status, lockres);
goto drop_ref;
}
/* If there are still incompat holders, we can exit safely
* without worrying about re-queueing this lock as that will
	 * happen on the last call to user_dlm_cluster_unlock. */
if ((lockres->l_blocking == LKM_EXMODE)
&& (lockres->l_ex_holders || lockres->l_ro_holders)) {
spin_unlock(&lockres->l_lock);
mlog(0, "can't downconvert for ex: ro = %u, ex = %u\n",
lockres->l_ro_holders, lockres->l_ex_holders);
goto drop_ref;
}
if ((lockres->l_blocking == LKM_PRMODE)
&& lockres->l_ex_holders) {
spin_unlock(&lockres->l_lock);
mlog(0, "can't downconvert for pr: ex = %u\n",
lockres->l_ex_holders);
goto drop_ref;
}
/* yay, we can downconvert now. */
new_level = user_highest_compat_lock_level(lockres->l_blocking);
lockres->l_requested = new_level;
lockres->l_flags |= USER_LOCK_BUSY;
mlog(0, "Downconvert lock from %d to %d\n",
lockres->l_level, new_level);
spin_unlock(&lockres->l_lock);
/* need lock downconvert request now... */
status = dlmlock(dlm,
new_level,
&lockres->l_lksb,
LKM_CONVERT|LKM_VALBLK,
lockres->l_name,
lockres->l_namelen,
user_ast,
lockres,
user_bast);
if (status != DLM_NORMAL) {
user_log_dlm_error("dlmlock", status, lockres);
user_recover_from_dlm_error(lockres);
}
drop_ref:
user_dlm_drop_inode_ref(lockres);
}
static inline void user_dlm_inc_holders(struct user_lock_res *lockres,
int level)
{
switch(level) {
case LKM_EXMODE:
lockres->l_ex_holders++;
break;
case LKM_PRMODE:
lockres->l_ro_holders++;
break;
default:
BUG();
}
}
/* predict what lock level we'll be dropping down to on behalf
* of another node, and return true if the currently wanted
* level will be compatible with it. */
static inline int
user_may_continue_on_blocked_lock(struct user_lock_res *lockres,
int wanted)
{
BUG_ON(!(lockres->l_flags & USER_LOCK_BLOCKED));
return wanted <= user_highest_compat_lock_level(lockres->l_blocking);
}
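/*
 * Editor's worked example (illustrative, assuming the usual o2dlm mode
 * ordering NL < PR < EX): if another node blocks us wanting PR
 * (l_blocking == LKM_PRMODE), user_highest_compat_lock_level() yields
 * LKM_PRMODE, so a local PR request may proceed while an EX request
 * must wait; if the blocker wants EX, only NL remains compatible and
 * every local PR/EX request waits for the downconvert.
 */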
int user_dlm_cluster_lock(struct user_lock_res *lockres,
int level,
int lkm_flags)
{
int status, local_flags;
struct dlm_ctxt *dlm = dlm_ctxt_from_user_lockres(lockres);
if (level != LKM_EXMODE &&
level != LKM_PRMODE) {
mlog(ML_ERROR, "lockres %.*s: invalid request!\n",
lockres->l_namelen, lockres->l_name);
status = -EINVAL;
goto bail;
}
mlog(0, "lockres %.*s: asking for %s lock, passed flags = 0x%x\n",
lockres->l_namelen, lockres->l_name,
(level == LKM_EXMODE) ? "LKM_EXMODE" : "LKM_PRMODE",
lkm_flags);
again:
if (signal_pending(current)) {
status = -ERESTARTSYS;
goto bail;
}
spin_lock(&lockres->l_lock);
/* We only compare against the currently granted level
* here. If the lock is blocked waiting on a downconvert,
* we'll get caught below. */
if ((lockres->l_flags & USER_LOCK_BUSY) &&
(level > lockres->l_level)) {
/* is someone sitting in dlm_lock? If so, wait on
* them. */
spin_unlock(&lockres->l_lock);
user_wait_on_busy_lock(lockres);
goto again;
}
if ((lockres->l_flags & USER_LOCK_BLOCKED) &&
(!user_may_continue_on_blocked_lock(lockres, level))) {
		/* is the lock currently blocked on behalf of
		 * another node? */
spin_unlock(&lockres->l_lock);
user_wait_on_blocked_lock(lockres);
goto again;
}
if (level > lockres->l_level) {
local_flags = lkm_flags | LKM_VALBLK;
if (lockres->l_level != LKM_IVMODE)
local_flags |= LKM_CONVERT;
lockres->l_requested = level;
lockres->l_flags |= USER_LOCK_BUSY;
spin_unlock(&lockres->l_lock);
BUG_ON(level == LKM_IVMODE);
BUG_ON(level == LKM_NLMODE);
		/* call dlmlock() to upgrade the lock now */
status = dlmlock(dlm,
level,
&lockres->l_lksb,
local_flags,
lockres->l_name,
lockres->l_namelen,
user_ast,
lockres,
user_bast);
if (status != DLM_NORMAL) {
if ((lkm_flags & LKM_NOQUEUE) &&
(status == DLM_NOTQUEUED))
status = -EAGAIN;
else {
user_log_dlm_error("dlmlock", status, lockres);
status = -EINVAL;
}
user_recover_from_dlm_error(lockres);
goto bail;
}
user_wait_on_busy_lock(lockres);
goto again;
}
user_dlm_inc_holders(lockres, level);
spin_unlock(&lockres->l_lock);
status = 0;
bail:
return status;
}
static inline void user_dlm_dec_holders(struct user_lock_res *lockres,
int level)
{
switch(level) {
case LKM_EXMODE:
BUG_ON(!lockres->l_ex_holders);
lockres->l_ex_holders--;
break;
case LKM_PRMODE:
BUG_ON(!lockres->l_ro_holders);
lockres->l_ro_holders--;
break;
default:
BUG();
}
}
void user_dlm_cluster_unlock(struct user_lock_res *lockres,
int level)
{
if (level != LKM_EXMODE &&
level != LKM_PRMODE) {
mlog(ML_ERROR, "lockres %.*s: invalid request!\n",
lockres->l_namelen, lockres->l_name);
return;
}
spin_lock(&lockres->l_lock);
user_dlm_dec_holders(lockres, level);
__user_dlm_cond_queue_lockres(lockres);
spin_unlock(&lockres->l_lock);
}
void user_dlm_write_lvb(struct inode *inode,
const char *val,
unsigned int len)
{
struct user_lock_res *lockres = &DLMFS_I(inode)->ip_lockres;
char *lvb = lockres->l_lksb.lvb;
BUG_ON(len > DLM_LVB_LEN);
spin_lock(&lockres->l_lock);
BUG_ON(lockres->l_level < LKM_EXMODE);
memcpy(lvb, val, len);
spin_unlock(&lockres->l_lock);
}
void user_dlm_read_lvb(struct inode *inode,
char *val,
unsigned int len)
{
struct user_lock_res *lockres = &DLMFS_I(inode)->ip_lockres;
char *lvb = lockres->l_lksb.lvb;
BUG_ON(len > DLM_LVB_LEN);
spin_lock(&lockres->l_lock);
BUG_ON(lockres->l_level < LKM_PRMODE);
memcpy(val, lvb, len);
spin_unlock(&lockres->l_lock);
}
void user_dlm_lock_res_init(struct user_lock_res *lockres,
struct dentry *dentry)
{
memset(lockres, 0, sizeof(*lockres));
spin_lock_init(&lockres->l_lock);
init_waitqueue_head(&lockres->l_event);
lockres->l_level = LKM_IVMODE;
lockres->l_requested = LKM_IVMODE;
lockres->l_blocking = LKM_IVMODE;
/* should have been checked before getting here. */
BUG_ON(dentry->d_name.len >= USER_DLM_LOCK_ID_MAX_LEN);
memcpy(lockres->l_name,
dentry->d_name.name,
dentry->d_name.len);
lockres->l_namelen = dentry->d_name.len;
}
int user_dlm_destroy_lock(struct user_lock_res *lockres)
{
int status = -EBUSY;
struct dlm_ctxt *dlm = dlm_ctxt_from_user_lockres(lockres);
mlog(0, "asked to destroy %.*s\n", lockres->l_namelen, lockres->l_name);
spin_lock(&lockres->l_lock);
if (lockres->l_flags & USER_LOCK_IN_TEARDOWN) {
spin_unlock(&lockres->l_lock);
return 0;
}
lockres->l_flags |= USER_LOCK_IN_TEARDOWN;
while (lockres->l_flags & USER_LOCK_BUSY) {
spin_unlock(&lockres->l_lock);
user_wait_on_busy_lock(lockres);
spin_lock(&lockres->l_lock);
}
if (lockres->l_ro_holders || lockres->l_ex_holders) {
spin_unlock(&lockres->l_lock);
goto bail;
}
status = 0;
if (!(lockres->l_flags & USER_LOCK_ATTACHED)) {
spin_unlock(&lockres->l_lock);
goto bail;
}
lockres->l_flags &= ~USER_LOCK_ATTACHED;
lockres->l_flags |= USER_LOCK_BUSY;
spin_unlock(&lockres->l_lock);
status = dlmunlock(dlm,
&lockres->l_lksb,
LKM_VALBLK,
user_unlock_ast,
lockres);
if (status != DLM_NORMAL) {
user_log_dlm_error("dlmunlock", status, lockres);
status = -EINVAL;
goto bail;
}
user_wait_on_busy_lock(lockres);
status = 0;
bail:
return status;
}
struct dlm_ctxt *user_dlm_register_context(struct qstr *name,
struct dlm_protocol_version *proto)
{
struct dlm_ctxt *dlm;
u32 dlm_key;
char *domain;
domain = kmalloc(name->len + 1, GFP_NOFS);
if (!domain) {
mlog_errno(-ENOMEM);
return ERR_PTR(-ENOMEM);
}
dlm_key = crc32_le(0, name->name, name->len);
snprintf(domain, name->len + 1, "%.*s", name->len, name->name);
dlm = dlm_register_domain(domain, dlm_key, proto);
if (IS_ERR(dlm))
mlog_errno(PTR_ERR(dlm));
kfree(domain);
return dlm;
}
void user_dlm_unregister_context(struct dlm_ctxt *dlm)
{
dlm_unregister_domain(dlm);
}
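
To make the shape of this API concrete, here is a compressed, hypothetical sketch of the lifecycle a dlmfs-style caller is expected to drive: register a domain, initialize the per-inode lockres, take and drop a cluster lock, then tear everything down. example_use_userdlm() and its parameters are invented for the sketch, error handling is abbreviated, and the usual kernel includes plus userdlm.h (for DLMFS_I()) are assumed.

/* Illustrative sketch -- not part of the commit. */
static int example_use_userdlm(struct qstr *domain, struct dentry *dentry,
			       struct inode *inode,
			       struct dlm_protocol_version *proto)
{
	struct user_lock_res *lockres = &DLMFS_I(inode)->ip_lockres;
	struct dlm_ctxt *dlm;
	int status;

	/* one DLM domain, keyed by a crc32 of its name */
	dlm = user_dlm_register_context(domain, proto);
	if (IS_ERR(dlm))
		return PTR_ERR(dlm);
	DLMFS_I(inode)->ip_dlm = dlm;	/* dlm_ctxt_from_user_lockres() needs this */

	user_dlm_lock_res_init(lockres, dentry);

	/* take an exclusive lock, publish data through the LVB, drop it */
	status = user_dlm_cluster_lock(lockres, LKM_EXMODE, 0);
	if (status < 0)
		goto out;
	user_dlm_write_lvb(inode, "hello", 6);
	user_dlm_cluster_unlock(lockres, LKM_EXMODE);

	status = user_dlm_destroy_lock(lockres);
out:
	user_dlm_unregister_context(dlm);
	return status;
}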

View File

@@ -0,0 +1,113 @@
/* -*- mode: c; c-basic-offset: 8; -*-
* vim: noexpandtab sw=8 ts=8 sts=0:
*
* userdlm.h
*
* Userspace dlm defines
*
* Copyright (C) 2002, 2004 Oracle. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 021110-1307, USA.
*/
#ifndef USERDLM_H
#define USERDLM_H
#include <linux/module.h>
#include <linux/fs.h>
#include <linux/types.h>
#include <linux/workqueue.h>
/* user_lock_res->l_flags flags. */
#define USER_LOCK_ATTACHED (0x00000001) /* we have initialized
* the lvb */
#define USER_LOCK_BUSY (0x00000002) /* we are currently in
* dlm_lock */
#define USER_LOCK_BLOCKED (0x00000004) /* blocked waiting to
* downconvert*/
#define USER_LOCK_IN_TEARDOWN (0x00000008) /* we're currently
* destroying this
* lock. */
#define USER_LOCK_QUEUED (0x00000010) /* lock is on the
* workqueue */
#define USER_LOCK_IN_CANCEL (0x00000020)
struct user_lock_res {
spinlock_t l_lock;
int l_flags;
#define USER_DLM_LOCK_ID_MAX_LEN 32
char l_name[USER_DLM_LOCK_ID_MAX_LEN];
int l_namelen;
int l_level;
unsigned int l_ro_holders;
unsigned int l_ex_holders;
struct dlm_lockstatus l_lksb;
int l_requested;
int l_blocking;
wait_queue_head_t l_event;
struct work_struct l_work;
};
extern struct workqueue_struct *user_dlm_worker;
void user_dlm_lock_res_init(struct user_lock_res *lockres,
struct dentry *dentry);
int user_dlm_destroy_lock(struct user_lock_res *lockres);
int user_dlm_cluster_lock(struct user_lock_res *lockres,
int level,
int lkm_flags);
void user_dlm_cluster_unlock(struct user_lock_res *lockres,
int level);
void user_dlm_write_lvb(struct inode *inode,
const char *val,
unsigned int len);
void user_dlm_read_lvb(struct inode *inode,
char *val,
unsigned int len);
struct dlm_ctxt *user_dlm_register_context(struct qstr *name,
struct dlm_protocol_version *proto);
void user_dlm_unregister_context(struct dlm_ctxt *dlm);
struct dlmfs_inode_private {
struct dlm_ctxt *ip_dlm;
struct user_lock_res ip_lockres; /* unused for directories. */
struct inode *ip_parent;
struct inode ip_vfs_inode;
};
static inline struct dlmfs_inode_private *
DLMFS_I(struct inode *inode)
{
return container_of(inode,
struct dlmfs_inode_private,
ip_vfs_inode);
}
struct dlmfs_filp_private {
int fp_lock_level;
};
#define DLMFS_MAGIC 0x76a9f425
#endif /* USERDLM_H */
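
The dlmfs_inode_private / DLMFS_I() pair above is the usual embedded-VFS-inode pattern. The sketch below is again illustrative: example_inode_cache, example_alloc_inode() and example_lockres() are invented names standing in for the filesystem's hooks (the real implementation lives in dlmfs.c). The filesystem hands the VFS only the embedded ip_vfs_inode, and container_of() recovers the wrapper later.

/* Illustrative sketch -- not part of the commit. */
static struct kmem_cache *example_inode_cache;

static struct inode *example_alloc_inode(struct super_block *sb)
{
	struct dlmfs_inode_private *ip;

	ip = kmem_cache_alloc(example_inode_cache, GFP_NOFS);
	if (!ip)
		return NULL;

	ip->ip_dlm = NULL;
	ip->ip_parent = NULL;
	/* a slab constructor is assumed to have run inode_init_once()
	 * on &ip->ip_vfs_inode, as is conventional for this pattern */
	return &ip->ip_vfs_inode;	/* the VFS only ever sees this member */
}

/* from any inode-based callback, recover the private part again: */
static struct user_lock_res *example_lockres(struct inode *inode)
{
	return &DLMFS_I(inode)->ip_lockres;
}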