blob: 61e30ed2fc5e99ab99feb35f361366ab2febb9e4 [file] [log] [blame]
Theodore Ts'o3eed36b2008-02-17 06:59:21 -05001/*
2URL: svn://svnanon.samba.org/samba/branches/SAMBA_4_0/source/lib/tdb/common
3Rev: 23590
4Last Changed Date: 2007-06-22 13:36:10 -0400 (Fri, 22 Jun 2007)
Theodore Ts'o106ad962007-04-04 21:26:37 -04005*/
Theodore Ts'oefc6f622008-08-27 23:07:54 -04006 /*
Theodore Ts'o106ad962007-04-04 21:26:37 -04007 trivial database library - standalone version
8
9 Copyright (C) Andrew Tridgell 1999-2005
10 Copyright (C) Jeremy Allison 2000-2006
11 Copyright (C) Paul `Rusty' Russell 2000
Theodore Ts'oefc6f622008-08-27 23:07:54 -040012
Theodore Ts'o106ad962007-04-04 21:26:37 -040013 ** NOTE! The following LGPL license applies to the tdb
14 ** library. This does NOT imply that all of Samba is released
15 ** under the LGPL
Theodore Ts'oefc6f622008-08-27 23:07:54 -040016
Theodore Ts'o106ad962007-04-04 21:26:37 -040017 This library is free software; you can redistribute it and/or
18 modify it under the terms of the GNU Lesser General Public
19 License as published by the Free Software Foundation; either
20 version 2 of the License, or (at your option) any later version.
21
22 This library is distributed in the hope that it will be useful,
23 but WITHOUT ANY WARRANTY; without even the implied warranty of
24 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
25 Lesser General Public License for more details.
26
27 You should have received a copy of the GNU Lesser General Public
28 License along with this library; if not, write to the Free Software
29 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
30*/
31
32#ifdef CONFIG_STAND_ALONE
33#define HAVE_MMAP
34#define HAVE_STRDUP
35#define HAVE_SYS_MMAN_H
36#define HAVE_UTIME_H
37#define HAVE_UTIME
38#endif
Theodore Ts'oebabf2a2008-07-13 15:32:37 -040039#define _XOPEN_SOURCE 600
Theodore Ts'o106ad962007-04-04 21:26:37 -040040
Theodore Ts'od1154eb2011-09-18 17:34:37 -040041#include "config.h"
Theodore Ts'o106ad962007-04-04 21:26:37 -040042#include <unistd.h>
43#include <stdio.h>
44#include <stdlib.h>
45#include <stdarg.h>
46#include <stddef.h>
47#include <errno.h>
48#include <string.h>
Christophe GRENIERe7cc6f72008-02-16 12:10:56 +010049#ifdef HAVE_SYS_SELECT_H
Theodore Ts'o106ad962007-04-04 21:26:37 -040050#include <sys/select.h>
Christophe GRENIERe7cc6f72008-02-16 12:10:56 +010051#endif
Theodore Ts'o106ad962007-04-04 21:26:37 -040052#include <sys/time.h>
53#include <sys/types.h>
54#include <time.h>
55#ifdef HAVE_UTIME_H
56#include <utime.h>
57#endif
58#include <sys/stat.h>
59#include <sys/file.h>
60#include <fcntl.h>
61
62#ifdef HAVE_SYS_MMAN_H
63#include <sys/mman.h>
64#endif
65
66#ifndef MAP_FILE
67#define MAP_FILE 0
68#endif
69
70#ifndef MAP_FAILED
71#define MAP_FAILED ((void *)-1)
72#endif
73
74#ifndef HAVE_STRDUP
75#define strdup rep_strdup
/*
 * Fallback strdup() for platforms that lack one.
 *
 * Returns a malloc()ed, NUL-terminated copy of s, or NULL when s is NULL
 * or the allocation fails.  The caller owns the result and must free() it.
 */
static char *rep_strdup(const char *s)
{
	char *ret;
	size_t length;

	if (!s)
		return NULL;

	/* Always measure the string: the length must never be read before
	 * it has been assigned. */
	length = strlen(s);

	ret = malloc(length + 1);
	if (ret) {
		memcpy(ret, s, length);
		ret[length] = '\0';
	}
	return ret;
}
93#endif
94
#ifndef PRINTF_ATTRIBUTE
/*
 * Enable the format attribute for gcc 3.1 and anything newer.  The major
 * and minor numbers must be compared as a pair: testing the minor number
 * on its own would switch the attribute off for every x.0 release.
 */
#if defined(__GNUC__) && \
	((__GNUC__ > 3) || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1))
/** Use gcc attribute to check printf fns.  a1 is the 1-based index of
 * the parameter containing the format, and a2 the index of the first
 * argument. Note that some gcc 2.x versions don't handle this
 * properly **/
#define PRINTF_ATTRIBUTE(a1, a2) __attribute__ ((format (__printf__, a1, a2)))
#else
#define PRINTF_ATTRIBUTE(a1, a2)
#endif
#endif
106
Theodore Ts'o3eed36b2008-02-17 06:59:21 -0500107typedef int bool;
108
Theodore Ts'o106ad962007-04-04 21:26:37 -0400109#include "tdb.h"
110
Theodore Ts'od1b75fd2011-11-05 14:51:51 -0400111static TDB_DATA tdb_null;
112
Theodore Ts'o106ad962007-04-04 21:26:37 -0400113#ifndef u32
114#define u32 unsigned
115#endif
116
Theodore Ts'o106ad962007-04-04 21:26:37 -0400117typedef u32 tdb_len_t;
118typedef u32 tdb_off_t;
119
120#ifndef offsetof
121#define offsetof(t,f) ((unsigned int)&((t *)0)->f)
122#endif
123
/* File identification and record magic values. */
#define TDB_MAGIC_FOOD "TDB file\n"
#define TDB_VERSION (0x26011967 + 6)
#define TDB_MAGIC (0x26011999U)
#define TDB_FREE_MAGIC (~TDB_MAGIC)
#define TDB_DEAD_MAGIC (0xFEE1DEAD)
#define TDB_RECOVERY_MAGIC (0xf53bc0e7U)

/* Layout constants. */
#define TDB_ALIGNMENT 4
#define MIN_REC_SIZE (2*sizeof(struct list_struct) + TDB_ALIGNMENT)
#define DEFAULT_HASH_SIZE 131
/* The free-list head sits directly after the header. */
#define FREELIST_TOP (sizeof(struct tdb_header))

/* Round x up to a multiple of a; the mask arithmetic requires a to be a
 * power of two. */
#define TDB_ALIGN(x,a) (((x) + (a)-1) & ~((a)-1))
/* Reverse the byte order of a 32-bit value. */
#define TDB_BYTEREV(x) (((((x)&0xff)<<24)|((x)&0xFF00)<<8)|(((x)>>8)&0xFF00)|((x)>>24))

/* Record-state predicates; r points to a struct list_struct. */
#define TDB_DEAD(r) ((r)->magic == TDB_DEAD_MAGIC)
#define TDB_BAD_MAGIC(r) ((r)->magic != TDB_MAGIC && !TDB_DEAD(r))

/* Offset of the chain head for a hash value (uses the local "tdb"). */
#define TDB_HASH_TOP(hash) (FREELIST_TOP + (BUCKET(hash)+1)*sizeof(tdb_off_t))
/* Size in bytes of the free-list head plus all hash chain heads. */
#define TDB_HASHTABLE_SIZE(tdb) (((tdb)->header.hash_size+1)*sizeof(tdb_off_t))
#define TDB_DATA_START(hash_size) TDB_HASH_TOP((hash_size)-1)

/* Offsets of individual header fields. */
#define TDB_RECOVERY_HEAD offsetof(struct tdb_header, recovery_start)
#define TDB_SEQNUM_OFS    offsetof(struct tdb_header, sequence_number)

#define TDB_PAD_BYTE 0x42
#define TDB_PAD_U32  0x42424242

/* NB assumes there is a local variable called "tdb" that is the
 * current context, also takes doubly-parenthesized print-style
 * argument. */
#define TDB_LOG(x) tdb->log.log_fn x

/* lock offsets */
#define GLOBAL_LOCK      0
#define ACTIVE_LOCK      4
#define TRANSACTION_LOCK 8

/* free memory if the pointer is valid and zero the pointer */
#ifndef SAFE_FREE
#define SAFE_FREE(x) do { if ((x) != NULL) {free(x); (x)=NULL;} } while(0)
#endif

/* Map a full hash value onto a chain index (uses the local "tdb"). */
#define BUCKET(hash) ((hash) % tdb->header.hash_size)

/* Byte-swap x in place when the database is flagged TDB_CONVERT; either
 * way the expression yields the address of x. */
#define DOCONV() (tdb->flags & TDB_CONVERT)
#define CONVERT(x) (DOCONV() ? tdb_convert(&(x), sizeof(x)) : &(x))
165
166
167/* the body of the database is made of one list_struct for the free space
168 plus a separate data list for each hash value */
169struct list_struct {
170 tdb_off_t next; /* offset of the next record in the list */
171 tdb_len_t rec_len; /* total byte length of record */
172 tdb_len_t key_len; /* byte length of key */
173 tdb_len_t data_len; /* byte length of data */
174 u32 full_hash; /* the full 32 bit hash of the key */
175 u32 magic; /* try to catch errors */
176 /* the following union is implied:
177 union {
178 char record[rec_len];
179 struct {
180 char key[key_len];
181 char data[data_len];
182 }
183 u32 totalsize; (tailer)
184 }
185 */
186};
187
188
189/* this is stored at the front of every database */
190struct tdb_header {
191 char magic_food[32]; /* for /etc/magic */
192 u32 version; /* version of the code */
193 u32 hash_size; /* number of hash entries */
194 tdb_off_t rwlocks; /* obsolete - kept to detect old formats */
195 tdb_off_t recovery_start; /* offset of transaction recovery region */
196 tdb_off_t sequence_number; /* used when TDB_SEQNUM is set */
197 tdb_off_t reserved[29];
198};
199
200struct tdb_lock_type {
201 int list;
202 u32 count;
203 u32 ltype;
204};
205
206struct tdb_traverse_lock {
207 struct tdb_traverse_lock *next;
208 u32 off;
209 u32 hash;
210 int lock_rw;
211};
212
213
214struct tdb_methods {
215 int (*tdb_read)(struct tdb_context *, tdb_off_t , void *, tdb_len_t , int );
216 int (*tdb_write)(struct tdb_context *, tdb_off_t, const void *, tdb_len_t);
217 void (*next_hash_chain)(struct tdb_context *, u32 *);
218 int (*tdb_oob)(struct tdb_context *, tdb_off_t , int );
219 int (*tdb_expand_file)(struct tdb_context *, tdb_off_t , tdb_off_t );
220 int (*tdb_brlock)(struct tdb_context *, tdb_off_t , int, int, int, size_t);
221};
222
223struct tdb_context {
224 char *name; /* the name of the database */
225 void *map_ptr; /* where it is currently mapped */
226 int fd; /* open file descriptor for the database */
227 tdb_len_t map_size; /* how much space has been mapped */
228 int read_only; /* opened read-only */
229 int traverse_read; /* read-only traversal */
230 struct tdb_lock_type global_lock;
231 int num_lockrecs;
232 struct tdb_lock_type *lockrecs; /* only real locks, all with count>0 */
233 enum TDB_ERROR ecode; /* error code for last tdb error */
234 struct tdb_header header; /* a cached copy of the header */
235 u32 flags; /* the flags passed to tdb_open */
236 struct tdb_traverse_lock travlocks; /* current traversal locks */
237 struct tdb_context *next; /* all tdbs to avoid multiple opens */
238 dev_t device; /* uniquely identifies this tdb */
239 ino_t inode; /* uniquely identifies this tdb */
240 struct tdb_logging_context log;
241 unsigned int (*hash_fn)(TDB_DATA *key);
242 int open_flags; /* flags used in the open - needed by reopen */
243 unsigned int num_locks; /* number of chain locks held */
244 const struct tdb_methods *methods;
245 struct tdb_transaction *transaction;
246 int page_size;
247 int max_dead_records;
Theodore Ts'o3eed36b2008-02-17 06:59:21 -0500248 bool have_transaction_lock;
Theodore Ts'o106ad962007-04-04 21:26:37 -0400249};
250
251
252/*
253 internal prototypes
254*/
255static int tdb_munmap(struct tdb_context *tdb);
256static void tdb_mmap(struct tdb_context *tdb);
257static int tdb_lock(struct tdb_context *tdb, int list, int ltype);
258static int tdb_unlock(struct tdb_context *tdb, int list, int ltype);
259static int tdb_brlock(struct tdb_context *tdb, tdb_off_t offset, int rw_type, int lck_type, int probe, size_t len);
Theodore Ts'o3eed36b2008-02-17 06:59:21 -0500260static int tdb_transaction_lock(struct tdb_context *tdb, int ltype);
261static int tdb_transaction_unlock(struct tdb_context *tdb);
Theodore Ts'o106ad962007-04-04 21:26:37 -0400262static int tdb_brlock_upgrade(struct tdb_context *tdb, tdb_off_t offset, size_t len);
263static int tdb_write_lock_record(struct tdb_context *tdb, tdb_off_t off);
264static int tdb_write_unlock_record(struct tdb_context *tdb, tdb_off_t off);
265static int tdb_ofs_read(struct tdb_context *tdb, tdb_off_t offset, tdb_off_t *d);
266static int tdb_ofs_write(struct tdb_context *tdb, tdb_off_t offset, tdb_off_t *d);
267static void *tdb_convert(void *buf, u32 size);
268static int tdb_free(struct tdb_context *tdb, tdb_off_t offset, struct list_struct *rec);
269static tdb_off_t tdb_allocate(struct tdb_context *tdb, tdb_len_t length, struct list_struct *rec);
270static int tdb_ofs_read(struct tdb_context *tdb, tdb_off_t offset, tdb_off_t *d);
271static int tdb_ofs_write(struct tdb_context *tdb, tdb_off_t offset, tdb_off_t *d);
272static int tdb_lock_record(struct tdb_context *tdb, tdb_off_t off);
273static int tdb_unlock_record(struct tdb_context *tdb, tdb_off_t off);
274static int tdb_rec_read(struct tdb_context *tdb, tdb_off_t offset, struct list_struct *rec);
275static int tdb_rec_write(struct tdb_context *tdb, tdb_off_t offset, struct list_struct *rec);
276static int tdb_do_delete(struct tdb_context *tdb, tdb_off_t rec_ptr, struct list_struct *rec);
277static unsigned char *tdb_alloc_read(struct tdb_context *tdb, tdb_off_t offset, tdb_len_t len);
278static int tdb_parse_data(struct tdb_context *tdb, TDB_DATA key,
279 tdb_off_t offset, tdb_len_t len,
280 int (*parser)(TDB_DATA key, TDB_DATA data,
281 void *private_data),
282 void *private_data);
283static tdb_off_t tdb_find_lock_hash(struct tdb_context *tdb, TDB_DATA key, u32 hash, int locktype,
284 struct list_struct *rec);
285static void tdb_io_init(struct tdb_context *tdb);
286static int tdb_expand(struct tdb_context *tdb, tdb_off_t size);
Theodore Ts'o3eed36b2008-02-17 06:59:21 -0500287static int tdb_rec_free_read(struct tdb_context *tdb, tdb_off_t off,
288 struct list_struct *rec);
Theodore Ts'o106ad962007-04-04 21:26:37 -0400289
290
291/* file: error.c */
292
293enum TDB_ERROR tdb_error(struct tdb_context *tdb)
294{
295 return tdb->ecode;
296}
297
298static struct tdb_errname {
299 enum TDB_ERROR ecode; const char *estring;
300} emap[] = { {TDB_SUCCESS, "Success"},
301 {TDB_ERR_CORRUPT, "Corrupt database"},
302 {TDB_ERR_IO, "IO Error"},
303 {TDB_ERR_LOCK, "Locking error"},
304 {TDB_ERR_OOM, "Out of memory"},
305 {TDB_ERR_EXISTS, "Record exists"},
306 {TDB_ERR_NOLOCK, "Lock exists on other keys"},
307 {TDB_ERR_EINVAL, "Invalid parameter"},
308 {TDB_ERR_NOEXIST, "Record does not exist"},
309 {TDB_ERR_RDONLY, "write not permitted"} };
310
311/* Error string for the last tdb error */
312const char *tdb_errorstr(struct tdb_context *tdb)
313{
314 u32 i;
315 for (i = 0; i < sizeof(emap) / sizeof(struct tdb_errname); i++)
316 if (tdb->ecode == emap[i].ecode)
317 return emap[i].estring;
318 return "Invalid error code";
319}
320
321/* file: lock.c */
322
/* Flag bit OR-ed into a lock type: update the in-memory lock records
 * only, without issuing the fcntl call. */
#define TDB_MARK_LOCK 0x80000000U
324
Theodore Ts'o106ad962007-04-04 21:26:37 -0400325/* a byte range locking function - return 0 on success
326 this functions locks/unlocks 1 byte at the specified offset.
327
328 On error, errno is also set so that errors are passed back properly
Theodore Ts'oefc6f622008-08-27 23:07:54 -0400329 through tdb_open().
Theodore Ts'o106ad962007-04-04 21:26:37 -0400330
331 note that a len of zero means lock to end of file
332*/
Theodore Ts'oefc6f622008-08-27 23:07:54 -0400333int tdb_brlock(struct tdb_context *tdb, tdb_off_t offset,
Theodore Ts'o106ad962007-04-04 21:26:37 -0400334 int rw_type, int lck_type, int probe, size_t len)
335{
336 struct flock fl;
337 int ret;
338
339 if (tdb->flags & TDB_NOLOCK) {
340 return 0;
341 }
342
343 if ((rw_type == F_WRLCK) && (tdb->read_only || tdb->traverse_read)) {
344 tdb->ecode = TDB_ERR_RDONLY;
345 return -1;
346 }
347
348 fl.l_type = rw_type;
349 fl.l_whence = SEEK_SET;
350 fl.l_start = offset;
351 fl.l_len = len;
352 fl.l_pid = 0;
353
354 do {
355 ret = fcntl(tdb->fd,lck_type,&fl);
356 } while (ret == -1 && errno == EINTR);
357
358 if (ret == -1) {
359 /* Generic lock error. errno set by fcntl.
360 * EAGAIN is an expected return from non-blocking
361 * locks. */
362 if (!probe && lck_type != F_SETLK) {
363 /* Ensure error code is set for log fun to examine. */
364 tdb->ecode = TDB_ERR_LOCK;
Theodore Ts'oefc6f622008-08-27 23:07:54 -0400365 TDB_LOG((tdb, TDB_DEBUG_TRACE,"tdb_brlock failed (fd=%d) at offset %d rw_type=%d lck_type=%d len=%d\n",
Theodore Ts'o106ad962007-04-04 21:26:37 -0400366 tdb->fd, offset, rw_type, lck_type, (int)len));
367 }
368 return TDB_ERRCODE(TDB_ERR_LOCK, -1);
369 }
370 return 0;
371}
372
373
374/*
375 upgrade a read lock to a write lock. This needs to be handled in a
376 special way as some OSes (such as solaris) have too conservative
377 deadlock detection and claim a deadlock when progress can be
Theodore Ts'oefc6f622008-08-27 23:07:54 -0400378 made. For those OSes we may loop for a while.
Theodore Ts'o106ad962007-04-04 21:26:37 -0400379*/
380int tdb_brlock_upgrade(struct tdb_context *tdb, tdb_off_t offset, size_t len)
381{
382 int count = 1000;
383 while (count--) {
384 struct timeval tv;
385 if (tdb_brlock(tdb, offset, F_WRLCK, F_SETLKW, 1, len) == 0) {
386 return 0;
387 }
388 if (errno != EDEADLK) {
389 break;
390 }
391 /* sleep for as short a time as we can - more portable than usleep() */
392 tv.tv_sec = 0;
393 tv.tv_usec = 1;
394 select(0, NULL, NULL, NULL, &tv);
395 }
396 TDB_LOG((tdb, TDB_DEBUG_TRACE,"tdb_brlock_upgrade failed at offset %d\n", offset));
397 return -1;
398}
399
400
401/* lock a list in the database. list -1 is the alloc list */
Theodore Ts'o3eed36b2008-02-17 06:59:21 -0500402static int _tdb_lock(struct tdb_context *tdb, int list, int ltype, int op)
Theodore Ts'o106ad962007-04-04 21:26:37 -0400403{
404 struct tdb_lock_type *new_lck;
405 int i;
Theodore Ts'o3eed36b2008-02-17 06:59:21 -0500406 bool mark_lock = ((ltype & TDB_MARK_LOCK) == TDB_MARK_LOCK);
407
408 ltype &= ~TDB_MARK_LOCK;
Theodore Ts'o106ad962007-04-04 21:26:37 -0400409
410 /* a global lock allows us to avoid per chain locks */
Theodore Ts'oefc6f622008-08-27 23:07:54 -0400411 if (tdb->global_lock.count &&
Theodore Ts'o106ad962007-04-04 21:26:37 -0400412 (ltype == tdb->global_lock.ltype || ltype == F_RDLCK)) {
413 return 0;
414 }
415
416 if (tdb->global_lock.count) {
417 return TDB_ERRCODE(TDB_ERR_LOCK, -1);
418 }
419
420 if (list < -1 || list >= (int)tdb->header.hash_size) {
Theodore Ts'oefc6f622008-08-27 23:07:54 -0400421 TDB_LOG((tdb, TDB_DEBUG_ERROR,"tdb_lock: invalid list %d for ltype=%d\n",
Theodore Ts'o106ad962007-04-04 21:26:37 -0400422 list, ltype));
423 return -1;
424 }
425 if (tdb->flags & TDB_NOLOCK)
426 return 0;
427
428 for (i=0; i<tdb->num_lockrecs; i++) {
429 if (tdb->lockrecs[i].list == list) {
430 if (tdb->lockrecs[i].count == 0) {
431 /*
432 * Can't happen, see tdb_unlock(). It should
433 * be an assert.
434 */
435 TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_lock: "
436 "lck->count == 0 for list %d", list));
437 }
438 /*
439 * Just increment the in-memory struct, posix locks
440 * don't stack.
441 */
442 tdb->lockrecs[i].count++;
443 return 0;
444 }
445 }
446
447 new_lck = (struct tdb_lock_type *)realloc(
448 tdb->lockrecs,
449 sizeof(*tdb->lockrecs) * (tdb->num_lockrecs+1));
450 if (new_lck == NULL) {
451 errno = ENOMEM;
452 return -1;
453 }
454 tdb->lockrecs = new_lck;
455
456 /* Since fcntl locks don't nest, we do a lock for the first one,
457 and simply bump the count for future ones */
Theodore Ts'o3eed36b2008-02-17 06:59:21 -0500458 if (!mark_lock &&
459 tdb->methods->tdb_brlock(tdb,FREELIST_TOP+4*list, ltype, op,
Theodore Ts'o106ad962007-04-04 21:26:37 -0400460 0, 1)) {
Theodore Ts'o106ad962007-04-04 21:26:37 -0400461 return -1;
462 }
463
464 tdb->num_locks++;
465
466 tdb->lockrecs[tdb->num_lockrecs].list = list;
467 tdb->lockrecs[tdb->num_lockrecs].count = 1;
468 tdb->lockrecs[tdb->num_lockrecs].ltype = ltype;
469 tdb->num_lockrecs += 1;
470
471 return 0;
472}
473
Theodore Ts'o3eed36b2008-02-17 06:59:21 -0500474/* lock a list in the database. list -1 is the alloc list */
475int tdb_lock(struct tdb_context *tdb, int list, int ltype)
476{
477 int ret;
478 ret = _tdb_lock(tdb, list, ltype, F_SETLKW);
479 if (ret) {
480 TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_lock failed on list %d "
481 "ltype=%d (%s)\n", list, ltype, strerror(errno)));
482 }
483 return ret;
484}
485
/*
 * Non-blocking variant of tdb_lock(): lock one list (list -1 is the
 * alloc list) with F_SETLK.  Failures are not logged here.
 */
int tdb_lock_nonblock(struct tdb_context *tdb, int list, int ltype)
{
	const int op = F_SETLK;

	return _tdb_lock(tdb, list, ltype, op);
}
491
492
Theodore Ts'o106ad962007-04-04 21:26:37 -0400493/* unlock the database: returns void because it's too late for errors. */
494 /* changed to return int it may be interesting to know there
495 has been an error --simo */
496int tdb_unlock(struct tdb_context *tdb, int list, int ltype)
497{
498 int ret = -1;
499 int i;
500 struct tdb_lock_type *lck = NULL;
Theodore Ts'o3eed36b2008-02-17 06:59:21 -0500501 bool mark_lock = ((ltype & TDB_MARK_LOCK) == TDB_MARK_LOCK);
502
503 ltype &= ~TDB_MARK_LOCK;
Theodore Ts'o106ad962007-04-04 21:26:37 -0400504
505 /* a global lock allows us to avoid per chain locks */
Theodore Ts'oefc6f622008-08-27 23:07:54 -0400506 if (tdb->global_lock.count &&
Theodore Ts'o106ad962007-04-04 21:26:37 -0400507 (ltype == tdb->global_lock.ltype || ltype == F_RDLCK)) {
508 return 0;
509 }
510
511 if (tdb->global_lock.count) {
512 return TDB_ERRCODE(TDB_ERR_LOCK, -1);
513 }
514
515 if (tdb->flags & TDB_NOLOCK)
516 return 0;
517
518 /* Sanity checks */
519 if (list < -1 || list >= (int)tdb->header.hash_size) {
520 TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_unlock: list %d invalid (%d)\n", list, tdb->header.hash_size));
521 return ret;
522 }
523
524 for (i=0; i<tdb->num_lockrecs; i++) {
525 if (tdb->lockrecs[i].list == list) {
526 lck = &tdb->lockrecs[i];
527 break;
528 }
529 }
530
531 if ((lck == NULL) || (lck->count == 0)) {
532 TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_unlock: count is 0\n"));
533 return -1;
534 }
535
536 if (lck->count > 1) {
537 lck->count--;
538 return 0;
539 }
540
541 /*
542 * This lock has count==1 left, so we need to unlock it in the
543 * kernel. We don't bother with decrementing the in-memory array
544 * element, we're about to overwrite it with the last array element
545 * anyway.
546 */
547
Theodore Ts'o3eed36b2008-02-17 06:59:21 -0500548 if (mark_lock) {
549 ret = 0;
550 } else {
551 ret = tdb->methods->tdb_brlock(tdb, FREELIST_TOP+4*list, F_UNLCK,
552 F_SETLKW, 0, 1);
553 }
Theodore Ts'o106ad962007-04-04 21:26:37 -0400554 tdb->num_locks--;
555
556 /*
557 * Shrink the array by overwriting the element just unlocked with the
558 * last array element.
559 */
560
561 if (tdb->num_lockrecs > 1) {
562 *lck = tdb->lockrecs[tdb->num_lockrecs-1];
563 }
564 tdb->num_lockrecs -= 1;
565
566 /*
567 * We don't bother with realloc when the array shrinks, but if we have
568 * a completely idle tdb we should get rid of the locked array.
569 */
570
571 if (tdb->num_lockrecs == 0) {
572 SAFE_FREE(tdb->lockrecs);
573 }
574
575 if (ret)
Theodore Ts'oefc6f622008-08-27 23:07:54 -0400576 TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_unlock: An error occurred unlocking!\n"));
Theodore Ts'o106ad962007-04-04 21:26:37 -0400577 return ret;
578}
579
Theodore Ts'o3eed36b2008-02-17 06:59:21 -0500580/*
581 get the transaction lock
582 */
583int tdb_transaction_lock(struct tdb_context *tdb, int ltype)
584{
585 if (tdb->have_transaction_lock || tdb->global_lock.count) {
586 return 0;
587 }
Theodore Ts'oefc6f622008-08-27 23:07:54 -0400588 if (tdb->methods->tdb_brlock(tdb, TRANSACTION_LOCK, ltype,
Theodore Ts'o3eed36b2008-02-17 06:59:21 -0500589 F_SETLKW, 0, 1) == -1) {
590 TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_lock: failed to get transaction lock\n"));
591 tdb->ecode = TDB_ERR_LOCK;
592 return -1;
593 }
594 tdb->have_transaction_lock = 1;
595 return 0;
596}
597
598/*
599 release the transaction lock
600 */
601int tdb_transaction_unlock(struct tdb_context *tdb)
602{
603 int ret;
604 if (!tdb->have_transaction_lock) {
605 return 0;
606 }
607 ret = tdb->methods->tdb_brlock(tdb, TRANSACTION_LOCK, F_UNLCK, F_SETLKW, 0, 1);
608 if (ret == 0) {
609 tdb->have_transaction_lock = 0;
610 }
611 return ret;
612}
613
614
Theodore Ts'o106ad962007-04-04 21:26:37 -0400615
616
617/* lock/unlock entire database */
Theodore Ts'o3eed36b2008-02-17 06:59:21 -0500618static int _tdb_lockall(struct tdb_context *tdb, int ltype, int op)
Theodore Ts'o106ad962007-04-04 21:26:37 -0400619{
Theodore Ts'o3eed36b2008-02-17 06:59:21 -0500620 bool mark_lock = ((ltype & TDB_MARK_LOCK) == TDB_MARK_LOCK);
621
622 ltype &= ~TDB_MARK_LOCK;
623
Theodore Ts'o106ad962007-04-04 21:26:37 -0400624 /* There are no locks on read-only dbs */
625 if (tdb->read_only || tdb->traverse_read)
626 return TDB_ERRCODE(TDB_ERR_LOCK, -1);
627
628 if (tdb->global_lock.count && tdb->global_lock.ltype == ltype) {
629 tdb->global_lock.count++;
630 return 0;
631 }
632
633 if (tdb->global_lock.count) {
634 /* a global lock of a different type exists */
635 return TDB_ERRCODE(TDB_ERR_LOCK, -1);
636 }
Theodore Ts'oefc6f622008-08-27 23:07:54 -0400637
Theodore Ts'o106ad962007-04-04 21:26:37 -0400638 if (tdb->num_locks != 0) {
639 /* can't combine global and chain locks */
640 return TDB_ERRCODE(TDB_ERR_LOCK, -1);
641 }
642
Theodore Ts'o3eed36b2008-02-17 06:59:21 -0500643 if (!mark_lock &&
644 tdb->methods->tdb_brlock(tdb, FREELIST_TOP, ltype, op,
Theodore Ts'o106ad962007-04-04 21:26:37 -0400645 0, 4*tdb->header.hash_size)) {
Theodore Ts'o3eed36b2008-02-17 06:59:21 -0500646 if (op == F_SETLKW) {
647 TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_lockall failed (%s)\n", strerror(errno)));
648 }
Theodore Ts'o106ad962007-04-04 21:26:37 -0400649 return -1;
650 }
651
652 tdb->global_lock.count = 1;
653 tdb->global_lock.ltype = ltype;
654
655 return 0;
656}
657
Theodore Ts'o3eed36b2008-02-17 06:59:21 -0500658
659
Theodore Ts'o106ad962007-04-04 21:26:37 -0400660/* unlock entire db */
661static int _tdb_unlockall(struct tdb_context *tdb, int ltype)
662{
Theodore Ts'o3eed36b2008-02-17 06:59:21 -0500663 bool mark_lock = ((ltype & TDB_MARK_LOCK) == TDB_MARK_LOCK);
664
665 ltype &= ~TDB_MARK_LOCK;
666
Theodore Ts'o106ad962007-04-04 21:26:37 -0400667 /* There are no locks on read-only dbs */
668 if (tdb->read_only || tdb->traverse_read) {
669 return TDB_ERRCODE(TDB_ERR_LOCK, -1);
670 }
671
672 if (tdb->global_lock.ltype != ltype || tdb->global_lock.count == 0) {
673 return TDB_ERRCODE(TDB_ERR_LOCK, -1);
674 }
675
676 if (tdb->global_lock.count > 1) {
677 tdb->global_lock.count--;
678 return 0;
679 }
680
Theodore Ts'o3eed36b2008-02-17 06:59:21 -0500681 if (!mark_lock &&
Theodore Ts'oefc6f622008-08-27 23:07:54 -0400682 tdb->methods->tdb_brlock(tdb, FREELIST_TOP, F_UNLCK, F_SETLKW,
Theodore Ts'o106ad962007-04-04 21:26:37 -0400683 0, 4*tdb->header.hash_size)) {
684 TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_unlockall failed (%s)\n", strerror(errno)));
685 return -1;
686 }
687
688 tdb->global_lock.count = 0;
689 tdb->global_lock.ltype = 0;
690
691 return 0;
692}
693
/* Take a blocking write lock on the entire database. */
int tdb_lockall(struct tdb_context *tdb)
{
	const int ltype = F_WRLCK;

	return _tdb_lockall(tdb, ltype, F_SETLKW);
}
699
700/* lock entire database with write lock - mark only */
701int tdb_lockall_mark(struct tdb_context *tdb)
702{
703 return _tdb_lockall(tdb, F_WRLCK | TDB_MARK_LOCK, F_SETLKW);
704}
705
706/* unlock entire database with write lock - unmark only */
707int tdb_lockall_unmark(struct tdb_context *tdb)
708{
709 return _tdb_unlockall(tdb, F_WRLCK | TDB_MARK_LOCK);
710}
711
/* Try to write-lock the entire database - non-blocking variant. */
int tdb_lockall_nonblock(struct tdb_context *tdb)
{
	const int ltype = F_WRLCK;

	return _tdb_lockall(tdb, ltype, F_SETLK);
}
717
/* Release a whole-database write lock. */
int tdb_unlockall(struct tdb_context *tdb)
{
	const int ltype = F_WRLCK;

	return _tdb_unlockall(tdb, ltype);
}
723
/* Take a blocking read lock on the entire database. */
int tdb_lockall_read(struct tdb_context *tdb)
{
	const int ltype = F_RDLCK;

	return _tdb_lockall(tdb, ltype, F_SETLKW);
}
729
/* Try to read-lock the entire database - non-blocking variant. */
int tdb_lockall_read_nonblock(struct tdb_context *tdb)
{
	const int ltype = F_RDLCK;

	return _tdb_lockall(tdb, ltype, F_SETLK);
}
735
/* Release a whole-database read lock. */
int tdb_unlockall_read(struct tdb_context *tdb)
{
	const int ltype = F_RDLCK;

	return _tdb_unlockall(tdb, ltype);
}
741
742/* lock/unlock one hash chain. This is meant to be used to reduce
743 contention - it cannot guarantee how many records will be locked */
744int tdb_chainlock(struct tdb_context *tdb, TDB_DATA key)
745{
746 return tdb_lock(tdb, BUCKET(tdb->hash_fn(&key)), F_WRLCK);
747}
748
Theodore Ts'o3eed36b2008-02-17 06:59:21 -0500749/* lock/unlock one hash chain, non-blocking. This is meant to be used
750 to reduce contention - it cannot guarantee how many records will be
751 locked */
752int tdb_chainlock_nonblock(struct tdb_context *tdb, TDB_DATA key)
753{
754 return tdb_lock_nonblock(tdb, BUCKET(tdb->hash_fn(&key)), F_WRLCK);
755}
756
757/* mark a chain as locked without actually locking it. Warning! use with great caution! */
758int tdb_chainlock_mark(struct tdb_context *tdb, TDB_DATA key)
759{
760 return tdb_lock(tdb, BUCKET(tdb->hash_fn(&key)), F_WRLCK | TDB_MARK_LOCK);
761}
762
763/* unmark a chain as locked without actually locking it. Warning! use with great caution! */
764int tdb_chainlock_unmark(struct tdb_context *tdb, TDB_DATA key)
765{
766 return tdb_unlock(tdb, BUCKET(tdb->hash_fn(&key)), F_WRLCK | TDB_MARK_LOCK);
767}
768
Theodore Ts'o106ad962007-04-04 21:26:37 -0400769int tdb_chainunlock(struct tdb_context *tdb, TDB_DATA key)
770{
771 return tdb_unlock(tdb, BUCKET(tdb->hash_fn(&key)), F_WRLCK);
772}
773
774int tdb_chainlock_read(struct tdb_context *tdb, TDB_DATA key)
775{
776 return tdb_lock(tdb, BUCKET(tdb->hash_fn(&key)), F_RDLCK);
777}
778
779int tdb_chainunlock_read(struct tdb_context *tdb, TDB_DATA key)
780{
781 return tdb_unlock(tdb, BUCKET(tdb->hash_fn(&key)), F_RDLCK);
782}
783
784
785
786/* record lock stops delete underneath */
787int tdb_lock_record(struct tdb_context *tdb, tdb_off_t off)
788{
789 return off ? tdb->methods->tdb_brlock(tdb, off, F_RDLCK, F_SETLKW, 0, 1) : 0;
790}
791
792/*
793 Write locks override our own fcntl readlocks, so check it here.
794 Note this is meant to be F_SETLK, *not* F_SETLKW, as it's not
795 an error to fail to get the lock here.
796*/
797int tdb_write_lock_record(struct tdb_context *tdb, tdb_off_t off)
798{
799 struct tdb_traverse_lock *i;
800 for (i = &tdb->travlocks; i; i = i->next)
801 if (i->off == off)
802 return -1;
803 return tdb->methods->tdb_brlock(tdb, off, F_WRLCK, F_SETLK, 1, 1);
804}
805
806/*
807 Note this is meant to be F_SETLK, *not* F_SETLKW, as it's not
808 an error to fail to get the lock here.
809*/
810int tdb_write_unlock_record(struct tdb_context *tdb, tdb_off_t off)
811{
812 return tdb->methods->tdb_brlock(tdb, off, F_UNLCK, F_SETLK, 0, 1);
813}
814
815/* fcntl locks don't stack: avoid unlocking someone else's */
816int tdb_unlock_record(struct tdb_context *tdb, tdb_off_t off)
817{
818 struct tdb_traverse_lock *i;
819 u32 count = 0;
820
821 if (off == 0)
822 return 0;
823 for (i = &tdb->travlocks; i; i = i->next)
824 if (i->off == off)
825 count++;
826 return (count == 1 ? tdb->methods->tdb_brlock(tdb, off, F_UNLCK, F_SETLKW, 0, 1) : 0);
827}
828
829/* file: io.c */
830
831/* check for an out of bounds access - if it is out of bounds then
832 see if the database has been expanded by someone else and expand
Theodore Ts'oefc6f622008-08-27 23:07:54 -0400833 if necessary
Theodore Ts'o106ad962007-04-04 21:26:37 -0400834 note that "len" is the minimum length needed for the db
835*/
836static int tdb_oob(struct tdb_context *tdb, tdb_off_t len, int probe)
837{
838 struct stat st;
839 if (len <= tdb->map_size)
840 return 0;
841 if (tdb->flags & TDB_INTERNAL) {
842 if (!probe) {
843 /* Ensure ecode is set for log fn. */
844 tdb->ecode = TDB_ERR_IO;
845 TDB_LOG((tdb, TDB_DEBUG_FATAL,"tdb_oob len %d beyond internal malloc size %d\n",
846 (int)len, (int)tdb->map_size));
847 }
848 return TDB_ERRCODE(TDB_ERR_IO, -1);
849 }
850
851 if (fstat(tdb->fd, &st) == -1) {
852 return TDB_ERRCODE(TDB_ERR_IO, -1);
853 }
854
855 if (st.st_size < (size_t)len) {
856 if (!probe) {
857 /* Ensure ecode is set for log fn. */
858 tdb->ecode = TDB_ERR_IO;
859 TDB_LOG((tdb, TDB_DEBUG_FATAL,"tdb_oob len %d beyond eof at %d\n",
860 (int)len, (int)st.st_size));
861 }
862 return TDB_ERRCODE(TDB_ERR_IO, -1);
863 }
864
865 /* Unmap, update size, remap */
866 if (tdb_munmap(tdb) == -1)
867 return TDB_ERRCODE(TDB_ERR_IO, -1);
868 tdb->map_size = st.st_size;
869 tdb_mmap(tdb);
870 return 0;
871}
872
873/* write a lump of data at a specified offset */
Theodore Ts'oefc6f622008-08-27 23:07:54 -0400874static int tdb_write(struct tdb_context *tdb, tdb_off_t off,
Theodore Ts'o106ad962007-04-04 21:26:37 -0400875 const void *buf, tdb_len_t len)
876{
877 if (len == 0) {
878 return 0;
879 }
880
881 if (tdb->read_only || tdb->traverse_read) {
882 tdb->ecode = TDB_ERR_RDONLY;
883 return -1;
884 }
885
886 if (tdb->methods->tdb_oob(tdb, off + len, 0) != 0)
887 return -1;
888
889 if (tdb->map_ptr) {
890 memcpy(off + (char *)tdb->map_ptr, buf, len);
891 } else if (pwrite(tdb->fd, buf, len, off) != (ssize_t)len) {
892 /* Ensure ecode is set for log fn. */
893 tdb->ecode = TDB_ERR_IO;
894 TDB_LOG((tdb, TDB_DEBUG_FATAL,"tdb_write failed at %d len=%d (%s)\n",
895 off, len, strerror(errno)));
896 return TDB_ERRCODE(TDB_ERR_IO, -1);
897 }
898 return 0;
899}
900
901/* Endian conversion: we only ever deal with 4 byte quantities */
902void *tdb_convert(void *buf, u32 size)
903{
904 u32 i, *p = (u32 *)buf;
905 for (i = 0; i < size / 4; i++)
906 p[i] = TDB_BYTEREV(p[i]);
907 return buf;
908}
909
910
911/* read a lump of data at a specified offset, maybe convert */
Theodore Ts'oefc6f622008-08-27 23:07:54 -0400912static int tdb_read(struct tdb_context *tdb, tdb_off_t off, void *buf,
Theodore Ts'o106ad962007-04-04 21:26:37 -0400913 tdb_len_t len, int cv)
914{
915 if (tdb->methods->tdb_oob(tdb, off + len, 0) != 0) {
916 return -1;
917 }
918
919 if (tdb->map_ptr) {
920 memcpy(buf, off + (char *)tdb->map_ptr, len);
921 } else {
922 ssize_t ret = pread(tdb->fd, buf, len, off);
923 if (ret != (ssize_t)len) {
924 /* Ensure ecode is set for log fn. */
925 tdb->ecode = TDB_ERR_IO;
926 TDB_LOG((tdb, TDB_DEBUG_FATAL,"tdb_read failed at %d "
927 "len=%d ret=%d (%s) map_size=%d\n",
928 (int)off, (int)len, (int)ret, strerror(errno),
929 (int)tdb->map_size));
930 return TDB_ERRCODE(TDB_ERR_IO, -1);
931 }
932 }
933 if (cv) {
934 tdb_convert(buf, len);
935 }
936 return 0;
937}
938
939
940
941/*
942 do an unlocked scan of the hash table heads to find the next non-zero head. The value
943 will then be confirmed with the lock held
Theodore Ts'oefc6f622008-08-27 23:07:54 -0400944*/
Theodore Ts'o106ad962007-04-04 21:26:37 -0400945static void tdb_next_hash_chain(struct tdb_context *tdb, u32 *chain)
946{
947 u32 h = *chain;
948 if (tdb->map_ptr) {
949 for (;h < tdb->header.hash_size;h++) {
950 if (0 != *(u32 *)(TDB_HASH_TOP(h) + (unsigned char *)tdb->map_ptr)) {
951 break;
952 }
953 }
954 } else {
955 u32 off=0;
956 for (;h < tdb->header.hash_size;h++) {
957 if (tdb_ofs_read(tdb, TDB_HASH_TOP(h), &off) != 0 || off != 0) {
958 break;
959 }
960 }
961 }
962 (*chain) = h;
963}
964
965
966int tdb_munmap(struct tdb_context *tdb)
967{
968 if (tdb->flags & TDB_INTERNAL)
969 return 0;
970
971#ifdef HAVE_MMAP
972 if (tdb->map_ptr) {
973 int ret = munmap(tdb->map_ptr, tdb->map_size);
974 if (ret != 0)
975 return ret;
976 }
977#endif
978 tdb->map_ptr = NULL;
979 return 0;
980}
981
982void tdb_mmap(struct tdb_context *tdb)
983{
984 if (tdb->flags & TDB_INTERNAL)
985 return;
986
987#ifdef HAVE_MMAP
988 if (!(tdb->flags & TDB_NOMMAP)) {
Theodore Ts'oefc6f622008-08-27 23:07:54 -0400989 tdb->map_ptr = mmap(NULL, tdb->map_size,
990 PROT_READ|(tdb->read_only? 0:PROT_WRITE),
Theodore Ts'o106ad962007-04-04 21:26:37 -0400991 MAP_SHARED|MAP_FILE, tdb->fd, 0);
992
993 /*
994 * NB. When mmap fails it returns MAP_FAILED *NOT* NULL !!!!
995 */
996
997 if (tdb->map_ptr == MAP_FAILED) {
998 tdb->map_ptr = NULL;
Theodore Ts'oefc6f622008-08-27 23:07:54 -0400999 TDB_LOG((tdb, TDB_DEBUG_WARNING, "tdb_mmap failed for size %d (%s)\n",
Theodore Ts'o106ad962007-04-04 21:26:37 -04001000 tdb->map_size, strerror(errno)));
1001 }
1002 } else {
1003 tdb->map_ptr = NULL;
1004 }
1005#else
1006 tdb->map_ptr = NULL;
1007#endif
1008}
1009
1010/* expand a file. we prefer to use ftruncate, as that is what posix
1011 says to use for mmap expansion */
1012static int tdb_expand_file(struct tdb_context *tdb, tdb_off_t size, tdb_off_t addition)
1013{
1014 char buf[1024];
1015
1016 if (tdb->read_only || tdb->traverse_read) {
1017 tdb->ecode = TDB_ERR_RDONLY;
1018 return -1;
1019 }
1020
1021 if (ftruncate(tdb->fd, size+addition) == -1) {
1022 char b = 0;
1023 if (pwrite(tdb->fd, &b, 1, (size+addition) - 1) != 1) {
Theodore Ts'oefc6f622008-08-27 23:07:54 -04001024 TDB_LOG((tdb, TDB_DEBUG_FATAL, "expand_file to %d failed (%s)\n",
Theodore Ts'o106ad962007-04-04 21:26:37 -04001025 size+addition, strerror(errno)));
1026 return -1;
1027 }
1028 }
1029
1030 /* now fill the file with something. This ensures that the
1031 file isn't sparse, which would be very bad if we ran out of
1032 disk. This must be done with write, not via mmap */
1033 memset(buf, TDB_PAD_BYTE, sizeof(buf));
1034 while (addition) {
1035 int n = addition>sizeof(buf)?sizeof(buf):addition;
1036 int ret = pwrite(tdb->fd, buf, n, size);
1037 if (ret != n) {
Theodore Ts'oefc6f622008-08-27 23:07:54 -04001038 TDB_LOG((tdb, TDB_DEBUG_FATAL, "expand_file write of %d failed (%s)\n",
Theodore Ts'o106ad962007-04-04 21:26:37 -04001039 n, strerror(errno)));
1040 return -1;
1041 }
1042 addition -= n;
1043 size += n;
1044 }
1045 return 0;
1046}
1047
1048
1049/* expand the database at least size bytes by expanding the underlying
1050 file and doing the mmap again if necessary */
1051int tdb_expand(struct tdb_context *tdb, tdb_off_t size)
1052{
1053 struct list_struct rec;
1054 tdb_off_t offset;
1055
1056 if (tdb_lock(tdb, -1, F_WRLCK) == -1) {
1057 TDB_LOG((tdb, TDB_DEBUG_ERROR, "lock failed in tdb_expand\n"));
1058 return -1;
1059 }
1060
1061 /* must know about any previous expansions by another process */
1062 tdb->methods->tdb_oob(tdb, tdb->map_size + 1, 1);
1063
1064 /* always make room for at least 10 more records, and round
1065 the database up to a multiple of the page size */
1066 size = TDB_ALIGN(tdb->map_size + size*10, tdb->page_size) - tdb->map_size;
1067
1068 if (!(tdb->flags & TDB_INTERNAL))
1069 tdb_munmap(tdb);
1070
1071 /*
1072 * We must ensure the file is unmapped before doing this
1073 * to ensure consistency with systems like OpenBSD where
1074 * writes and mmaps are not consistent.
1075 */
1076
1077 /* expand the file itself */
1078 if (!(tdb->flags & TDB_INTERNAL)) {
1079 if (tdb->methods->tdb_expand_file(tdb, tdb->map_size, size) != 0)
1080 goto fail;
1081 }
1082
1083 tdb->map_size += size;
1084
1085 if (tdb->flags & TDB_INTERNAL) {
1086 char *new_map_ptr = (char *)realloc(tdb->map_ptr,
1087 tdb->map_size);
1088 if (!new_map_ptr) {
1089 tdb->map_size -= size;
1090 goto fail;
1091 }
1092 tdb->map_ptr = new_map_ptr;
1093 } else {
1094 /*
1095 * We must ensure the file is remapped before adding the space
1096 * to ensure consistency with systems like OpenBSD where
1097 * writes and mmaps are not consistent.
1098 */
1099
1100 /* We're ok if the mmap fails as we'll fallback to read/write */
1101 tdb_mmap(tdb);
1102 }
1103
1104 /* form a new freelist record */
1105 memset(&rec,'\0',sizeof(rec));
1106 rec.rec_len = size - sizeof(rec);
1107
1108 /* link it into the free list */
1109 offset = tdb->map_size - size;
1110 if (tdb_free(tdb, offset, &rec) == -1)
1111 goto fail;
1112
1113 tdb_unlock(tdb, -1, F_WRLCK);
1114 return 0;
1115 fail:
1116 tdb_unlock(tdb, -1, F_WRLCK);
1117 return -1;
1118}
1119
1120/* read/write a tdb_off_t */
1121int tdb_ofs_read(struct tdb_context *tdb, tdb_off_t offset, tdb_off_t *d)
1122{
1123 return tdb->methods->tdb_read(tdb, offset, (char*)d, sizeof(*d), DOCONV());
1124}
1125
1126int tdb_ofs_write(struct tdb_context *tdb, tdb_off_t offset, tdb_off_t *d)
1127{
1128 tdb_off_t off = *d;
1129 return tdb->methods->tdb_write(tdb, offset, CONVERT(off), sizeof(*d));
1130}
1131
1132
1133/* read a lump of data, allocating the space for it */
1134unsigned char *tdb_alloc_read(struct tdb_context *tdb, tdb_off_t offset, tdb_len_t len)
1135{
1136 unsigned char *buf;
1137
1138 /* some systems don't like zero length malloc */
1139 if (len == 0) {
1140 len = 1;
1141 }
1142
1143 if (!(buf = (unsigned char *)malloc(len))) {
1144 /* Ensure ecode is set for log fn. */
1145 tdb->ecode = TDB_ERR_OOM;
1146 TDB_LOG((tdb, TDB_DEBUG_ERROR,"tdb_alloc_read malloc failed len=%d (%s)\n",
1147 len, strerror(errno)));
1148 return TDB_ERRCODE(TDB_ERR_OOM, buf);
1149 }
1150 if (tdb->methods->tdb_read(tdb, offset, buf, len, 0) == -1) {
1151 SAFE_FREE(buf);
1152 return NULL;
1153 }
1154 return buf;
1155}
1156
1157/* Give a piece of tdb data to a parser */
1158
1159int tdb_parse_data(struct tdb_context *tdb, TDB_DATA key,
1160 tdb_off_t offset, tdb_len_t len,
1161 int (*parser)(TDB_DATA key, TDB_DATA data,
1162 void *private_data),
1163 void *private_data)
1164{
1165 TDB_DATA data;
1166 int result;
1167
1168 data.dsize = len;
1169
1170 if ((tdb->transaction == NULL) && (tdb->map_ptr != NULL)) {
1171 /*
1172 * Optimize by avoiding the malloc/memcpy/free, point the
1173 * parser directly at the mmap area.
1174 */
1175 if (tdb->methods->tdb_oob(tdb, offset+len, 0) != 0) {
1176 return -1;
1177 }
1178 data.dptr = offset + (unsigned char *)tdb->map_ptr;
1179 return parser(key, data, private_data);
1180 }
1181
1182 if (!(data.dptr = tdb_alloc_read(tdb, offset, len))) {
1183 return -1;
1184 }
1185
1186 result = parser(key, data, private_data);
1187 free(data.dptr);
1188 return result;
1189}
1190
1191/* read/write a record */
1192int tdb_rec_read(struct tdb_context *tdb, tdb_off_t offset, struct list_struct *rec)
1193{
1194 if (tdb->methods->tdb_read(tdb, offset, rec, sizeof(*rec),DOCONV()) == -1)
1195 return -1;
1196 if (TDB_BAD_MAGIC(rec)) {
1197 /* Ensure ecode is set for log fn. */
1198 tdb->ecode = TDB_ERR_CORRUPT;
1199 TDB_LOG((tdb, TDB_DEBUG_FATAL,"tdb_rec_read bad magic 0x%x at offset=%d\n", rec->magic, offset));
1200 return TDB_ERRCODE(TDB_ERR_CORRUPT, -1);
1201 }
1202 return tdb->methods->tdb_oob(tdb, rec->next+sizeof(*rec), 0);
1203}
1204
1205int tdb_rec_write(struct tdb_context *tdb, tdb_off_t offset, struct list_struct *rec)
1206{
1207 struct list_struct r = *rec;
1208 return tdb->methods->tdb_write(tdb, offset, CONVERT(r), sizeof(r));
1209}
1210
/*
  Default io method table: the routines that operate directly on the
  database file / mmap area. Installed by tdb_io_init() and kept by a
  running transaction as its "real" io methods.

  The initializer is positional, so the order of the entries has to
  follow the member order of struct tdb_methods (declared outside this
  part of the file).
*/
1211static const struct tdb_methods io_methods = {
1212 tdb_read,
1213 tdb_write,
1214 tdb_next_hash_chain,
1215 tdb_oob,
1216 tdb_expand_file,
1217 tdb_brlock
1218};
1219
1220/*
1221 initialise the default methods table
1222*/
1223void tdb_io_init(struct tdb_context *tdb)
1224{
1225 tdb->methods = &io_methods;
1226}
1227
1228/* file: transaction.c */
1229
1230/*
1231 transaction design:
1232
1233 - only allow a single transaction at a time per database. This makes
1234 using the transaction API simpler, as otherwise the caller would
1235 have to cope with temporary failures in transactions that conflict
1236 with other current transactions
1237
1238 - keep the transaction recovery information in the same file as the
1239 database, using a special 'transaction recovery' record pointed at
1240 by the header. This removes the need for extra journal files as
1241 used by some other databases
1242
1243 - dynamically allocate the transaction recovery record, re-using it
1244 for subsequent transactions. If a larger record is needed then
1245 tdb_free() the old record to place it on the normal tdb freelist
1246 before allocating the new record
1247
1248 - during transactions, keep a linked list of all writes that have
1249 been performed by intercepting all tdb_write() calls. The hooked
1250 transaction versions of tdb_read() and tdb_write() check this
1251 linked list and try to use the elements of the list in preference
1252 to the real database.
1253
1254 - don't allow any locks to be held when a transaction starts,
1255 otherwise we can end up with deadlock (plus lack of lock nesting
1256 in posix locks would mean the lock is lost)
1257
1258 - if the caller gains a lock during the transaction but doesn't
1259 release it then fail the commit
1260
1261 - allow for nested calls to tdb_transaction_start(), re-using the
1262 existing transaction record. If the inner transaction is cancelled
1263 then a subsequent commit will fail
Theodore Ts'oefc6f622008-08-27 23:07:54 -04001264
Theodore Ts'o106ad962007-04-04 21:26:37 -04001265 - keep a mirrored copy of the tdb hash chain heads to allow for the
1266 fast hash heads scan on traverse, updating the mirrored copy in
1267 the transaction version of tdb_write
1268
1269 - allow callers to mix transaction and non-transaction use of tdb,
1270 although once a transaction is started then an exclusive lock is
1271 gained until the transaction is committed or cancelled
1272
1273 - the commit strategy involves first saving away all modified data
1274 into a linearised buffer in the transaction recovery area, then
1275 marking the transaction recovery area with a magic value to
1276 indicate a valid recovery record. In total 4 fsync/msync calls are
1277 needed per commit to prevent race conditions. It might be possible
1278 to reduce this to 3 or even 2 with some more work.
1279
1280 - check for a valid recovery record on open of the tdb, while the
1281 global lock is held. Automatically recover from the transaction
1282 recovery area if needed, then continue with the open as
1283 usual. This allows for smooth crash recovery with no administrator
1284 intervention.
1285
1286 - if TDB_NOSYNC is passed to flags in tdb_open then transactions are
1287 still available, but no transaction recovery area is used and no
1288 fsync/msync calls are made.
1289
1290*/
1291
1292struct tdb_transaction_el {
1293 struct tdb_transaction_el *next, *prev;
1294 tdb_off_t offset;
1295 tdb_len_t length;
1296 unsigned char *data;
1297};
1298
1299/*
1300 hold the context of any current transaction
1301*/
1302struct tdb_transaction {
1303 /* we keep a mirrored copy of the tdb hash heads here so
1304 tdb_next_hash_chain() can operate efficiently */
1305 u32 *hash_heads;
1306
1307 /* the original io methods - used to do IOs to the real db */
1308 const struct tdb_methods *io_methods;
1309
1310 /* the list of transaction elements. We use a doubly linked
1311 list with a last pointer to allow us to keep the list
1312 ordered, with first element at the front of the list. It
1313 needs to be doubly linked as the read/write traversals need
1314 to be backwards, while the commit needs to be forwards */
1315 struct tdb_transaction_el *elements, *elements_last;
1316
1317 /* non-zero when an internal transaction error has
1318 occurred. All write operations will then fail until the
1319 transaction is ended */
1320 int transaction_error;
1321
1322 /* when inside a transaction we need to keep track of any
1323 nested tdb_transaction_start() calls, as these are allowed,
1324 but don't create a new transaction */
1325 int nesting;
1326
1327 /* old file size before transaction */
1328 tdb_len_t old_map_size;
1329};
1330
1331
1332/*
1333 read while in a transaction. We need to check first if the data is in our list
1334 of transaction elements, then if not do a real read
1335*/
Theodore Ts'oefc6f622008-08-27 23:07:54 -04001336static int transaction_read(struct tdb_context *tdb, tdb_off_t off, void *buf,
Theodore Ts'o106ad962007-04-04 21:26:37 -04001337 tdb_len_t len, int cv)
1338{
1339 struct tdb_transaction_el *el;
1340
1341 /* we need to walk the list backwards to get the most recent data */
1342 for (el=tdb->transaction->elements_last;el;el=el->prev) {
1343 tdb_len_t partial;
1344
1345 if (off+len <= el->offset) {
1346 continue;
1347 }
1348 if (off >= el->offset + el->length) {
1349 continue;
1350 }
1351
1352 /* an overlapping read - needs to be split into up to
1353 2 reads and a memcpy */
1354 if (off < el->offset) {
1355 partial = el->offset - off;
1356 if (transaction_read(tdb, off, buf, partial, cv) != 0) {
1357 goto fail;
1358 }
1359 len -= partial;
1360 off += partial;
1361 buf = (void *)(partial + (char *)buf);
1362 }
1363 if (off + len <= el->offset + el->length) {
1364 partial = len;
1365 } else {
1366 partial = el->offset + el->length - off;
1367 }
1368 memcpy(buf, el->data + (off - el->offset), partial);
1369 if (cv) {
1370 tdb_convert(buf, len);
1371 }
1372 len -= partial;
1373 off += partial;
1374 buf = (void *)(partial + (char *)buf);
Theodore Ts'oefc6f622008-08-27 23:07:54 -04001375
Theodore Ts'o106ad962007-04-04 21:26:37 -04001376 if (len != 0 && transaction_read(tdb, off, buf, len, cv) != 0) {
1377 goto fail;
1378 }
1379
1380 return 0;
1381 }
1382
1383 /* its not in the transaction elements - do a real read */
1384 return tdb->transaction->io_methods->tdb_read(tdb, off, buf, len, cv);
1385
1386fail:
1387 TDB_LOG((tdb, TDB_DEBUG_FATAL, "transaction_read: failed at off=%d len=%d\n", off, len));
1388 tdb->ecode = TDB_ERR_IO;
1389 tdb->transaction->transaction_error = 1;
1390 return -1;
1391}
1392
1393
1394/*
1395 write while in a transaction
1396*/
Theodore Ts'oefc6f622008-08-27 23:07:54 -04001397static int transaction_write(struct tdb_context *tdb, tdb_off_t off,
Theodore Ts'o106ad962007-04-04 21:26:37 -04001398 const void *buf, tdb_len_t len)
1399{
1400 struct tdb_transaction_el *el, *best_el=NULL;
1401
1402 if (len == 0) {
1403 return 0;
1404 }
Theodore Ts'oefc6f622008-08-27 23:07:54 -04001405
Theodore Ts'o106ad962007-04-04 21:26:37 -04001406 /* if the write is to a hash head, then update the transaction
1407 hash heads */
1408 if (len == sizeof(tdb_off_t) && off >= FREELIST_TOP &&
1409 off < FREELIST_TOP+TDB_HASHTABLE_SIZE(tdb)) {
1410 u32 chain = (off-FREELIST_TOP) / sizeof(tdb_off_t);
1411 memcpy(&tdb->transaction->hash_heads[chain], buf, len);
1412 }
1413
1414 /* first see if we can replace an existing entry */
1415 for (el=tdb->transaction->elements_last;el;el=el->prev) {
1416 tdb_len_t partial;
1417
1418 if (best_el == NULL && off == el->offset+el->length) {
1419 best_el = el;
1420 }
1421
1422 if (off+len <= el->offset) {
1423 continue;
1424 }
1425 if (off >= el->offset + el->length) {
1426 continue;
1427 }
1428
1429 /* an overlapping write - needs to be split into up to
1430 2 writes and a memcpy */
1431 if (off < el->offset) {
1432 partial = el->offset - off;
1433 if (transaction_write(tdb, off, buf, partial) != 0) {
1434 goto fail;
1435 }
1436 len -= partial;
1437 off += partial;
1438 buf = (const void *)(partial + (const char *)buf);
1439 }
1440 if (off + len <= el->offset + el->length) {
1441 partial = len;
1442 } else {
1443 partial = el->offset + el->length - off;
1444 }
1445 memcpy(el->data + (off - el->offset), buf, partial);
1446 len -= partial;
1447 off += partial;
1448 buf = (const void *)(partial + (const char *)buf);
Theodore Ts'oefc6f622008-08-27 23:07:54 -04001449
Theodore Ts'o106ad962007-04-04 21:26:37 -04001450 if (len != 0 && transaction_write(tdb, off, buf, len) != 0) {
1451 goto fail;
1452 }
1453
1454 return 0;
1455 }
1456
1457 /* see if we can append the new entry to an existing entry */
Theodore Ts'oefc6f622008-08-27 23:07:54 -04001458 if (best_el && best_el->offset + best_el->length == off &&
Theodore Ts'o106ad962007-04-04 21:26:37 -04001459 (off+len < tdb->transaction->old_map_size ||
1460 off > tdb->transaction->old_map_size)) {
1461 unsigned char *data = best_el->data;
1462 el = best_el;
1463 el->data = (unsigned char *)realloc(el->data,
1464 el->length + len);
1465 if (el->data == NULL) {
1466 tdb->ecode = TDB_ERR_OOM;
1467 tdb->transaction->transaction_error = 1;
1468 el->data = data;
1469 return -1;
1470 }
1471 if (buf) {
1472 memcpy(el->data + el->length, buf, len);
1473 } else {
1474 memset(el->data + el->length, TDB_PAD_BYTE, len);
1475 }
1476 el->length += len;
1477 return 0;
1478 }
1479
1480 /* add a new entry at the end of the list */
1481 el = (struct tdb_transaction_el *)malloc(sizeof(*el));
1482 if (el == NULL) {
1483 tdb->ecode = TDB_ERR_OOM;
Theodore Ts'oefc6f622008-08-27 23:07:54 -04001484 tdb->transaction->transaction_error = 1;
Theodore Ts'o106ad962007-04-04 21:26:37 -04001485 return -1;
1486 }
1487 el->next = NULL;
1488 el->prev = tdb->transaction->elements_last;
1489 el->offset = off;
1490 el->length = len;
1491 el->data = (unsigned char *)malloc(len);
1492 if (el->data == NULL) {
1493 free(el);
1494 tdb->ecode = TDB_ERR_OOM;
Theodore Ts'oefc6f622008-08-27 23:07:54 -04001495 tdb->transaction->transaction_error = 1;
Theodore Ts'o106ad962007-04-04 21:26:37 -04001496 return -1;
1497 }
1498 if (buf) {
1499 memcpy(el->data, buf, len);
1500 } else {
1501 memset(el->data, TDB_PAD_BYTE, len);
1502 }
1503 if (el->prev) {
1504 el->prev->next = el;
1505 } else {
1506 tdb->transaction->elements = el;
1507 }
1508 tdb->transaction->elements_last = el;
1509 return 0;
1510
1511fail:
1512 TDB_LOG((tdb, TDB_DEBUG_FATAL, "transaction_write: failed at off=%d len=%d\n", off, len));
1513 tdb->ecode = TDB_ERR_IO;
1514 tdb->transaction->transaction_error = 1;
1515 return -1;
1516}
1517
1518/*
1519 accelerated hash chain head search, using the cached hash heads
1520*/
1521static void transaction_next_hash_chain(struct tdb_context *tdb, u32 *chain)
1522{
1523 u32 h = *chain;
1524 for (;h < tdb->header.hash_size;h++) {
1525 /* the +1 takes account of the freelist */
1526 if (0 != tdb->transaction->hash_heads[h+1]) {
1527 break;
1528 }
1529 }
1530 (*chain) = h;
1531}
1532
1533/*
1534 out of bounds check during a transaction
1535*/
1536static int transaction_oob(struct tdb_context *tdb, tdb_off_t len, int probe)
1537{
1538 if (len <= tdb->map_size) {
1539 return 0;
1540 }
1541 return TDB_ERRCODE(TDB_ERR_IO, -1);
1542}
1543
1544/*
1545 transaction version of tdb_expand().
1546*/
Theodore Ts'oefc6f622008-08-27 23:07:54 -04001547static int transaction_expand_file(struct tdb_context *tdb, tdb_off_t size,
Theodore Ts'o106ad962007-04-04 21:26:37 -04001548 tdb_off_t addition)
1549{
1550 /* add a write to the transaction elements, so subsequent
1551 reads see the zero data */
1552 if (transaction_write(tdb, size, NULL, addition) != 0) {
1553 return -1;
1554 }
1555
1556 return 0;
1557}
1558
1559/*
1560 brlock during a transaction - ignore them
1561*/
Theodore Ts'oefc6f622008-08-27 23:07:54 -04001562static int transaction_brlock(struct tdb_context *tdb, tdb_off_t offset,
Theodore Ts'o106ad962007-04-04 21:26:37 -04001563 int rw_type, int lck_type, int probe, size_t len)
1564{
1565 return 0;
1566}
1567
/*
  io method table that tdb_transaction_start() installs for the duration
  of a transaction; the table it replaces is kept in
  tdb->transaction->io_methods.

  The initializer is positional, so the order of the entries has to
  follow the member order of struct tdb_methods (declared outside this
  part of the file).
*/
1568static const struct tdb_methods transaction_methods = {
1569 transaction_read,
1570 transaction_write,
1571 transaction_next_hash_chain,
1572 transaction_oob,
1573 transaction_expand_file,
1574 transaction_brlock
1575};
1576
1577
1578/*
1579 start a tdb transaction. No token is returned, as only a single
1580 transaction is allowed to be pending per tdb_context
1581*/
1582int tdb_transaction_start(struct tdb_context *tdb)
1583{
1584 /* some sanity checks */
1585 if (tdb->read_only || (tdb->flags & TDB_INTERNAL) || tdb->traverse_read) {
1586 TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_start: cannot start a transaction on a read-only or internal db\n"));
1587 tdb->ecode = TDB_ERR_EINVAL;
1588 return -1;
1589 }
1590
1591 /* cope with nested tdb_transaction_start() calls */
1592 if (tdb->transaction != NULL) {
1593 tdb->transaction->nesting++;
Theodore Ts'oefc6f622008-08-27 23:07:54 -04001594 TDB_LOG((tdb, TDB_DEBUG_TRACE, "tdb_transaction_start: nesting %d\n",
Theodore Ts'o106ad962007-04-04 21:26:37 -04001595 tdb->transaction->nesting));
1596 return 0;
1597 }
1598
1599 if (tdb->num_locks != 0 || tdb->global_lock.count) {
1600 /* the caller must not have any locks when starting a
1601 transaction as otherwise we'll be screwed by lack
1602 of nested locks in posix */
1603 TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_start: cannot start a transaction with locks held\n"));
1604 tdb->ecode = TDB_ERR_LOCK;
1605 return -1;
1606 }
1607
1608 if (tdb->travlocks.next != NULL) {
1609 /* you cannot use transactions inside a traverse (although you can use
1610 traverse inside a transaction) as otherwise you can end up with
1611 deadlock */
1612 TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_start: cannot start a transaction within a traverse\n"));
1613 tdb->ecode = TDB_ERR_LOCK;
1614 return -1;
1615 }
1616
1617 tdb->transaction = (struct tdb_transaction *)
1618 calloc(sizeof(struct tdb_transaction), 1);
1619 if (tdb->transaction == NULL) {
1620 tdb->ecode = TDB_ERR_OOM;
1621 return -1;
1622 }
1623
1624 /* get the transaction write lock. This is a blocking lock. As
1625 discussed with Volker, there are a number of ways we could
1626 make this async, which we will probably do in the future */
Theodore Ts'o3eed36b2008-02-17 06:59:21 -05001627 if (tdb_transaction_lock(tdb, F_WRLCK) == -1) {
Theodore Ts'o106ad962007-04-04 21:26:37 -04001628 SAFE_FREE(tdb->transaction);
1629 return -1;
1630 }
Theodore Ts'oefc6f622008-08-27 23:07:54 -04001631
Theodore Ts'o106ad962007-04-04 21:26:37 -04001632 /* get a read lock from the freelist to the end of file. This
1633 is upgraded to a write lock during the commit */
1634 if (tdb_brlock(tdb, FREELIST_TOP, F_RDLCK, F_SETLKW, 0, 0) == -1) {
1635 TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_start: failed to get hash locks\n"));
1636 tdb->ecode = TDB_ERR_LOCK;
1637 goto fail;
1638 }
1639
1640 /* setup a copy of the hash table heads so the hash scan in
1641 traverse can be fast */
1642 tdb->transaction->hash_heads = (u32 *)
1643 calloc(tdb->header.hash_size+1, sizeof(u32));
1644 if (tdb->transaction->hash_heads == NULL) {
1645 tdb->ecode = TDB_ERR_OOM;
1646 goto fail;
1647 }
1648 if (tdb->methods->tdb_read(tdb, FREELIST_TOP, tdb->transaction->hash_heads,
1649 TDB_HASHTABLE_SIZE(tdb), 0) != 0) {
1650 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_start: failed to read hash heads\n"));
1651 tdb->ecode = TDB_ERR_IO;
1652 goto fail;
1653 }
1654
1655 /* make sure we know about any file expansions already done by
1656 anyone else */
1657 tdb->methods->tdb_oob(tdb, tdb->map_size + 1, 1);
1658 tdb->transaction->old_map_size = tdb->map_size;
1659
1660 /* finally hook the io methods, replacing them with
1661 transaction specific methods */
1662 tdb->transaction->io_methods = tdb->methods;
1663 tdb->methods = &transaction_methods;
1664
1665 /* by calling this transaction write here, we ensure that we don't grow the
1666 transaction linked list due to hash table updates */
Theodore Ts'oefc6f622008-08-27 23:07:54 -04001667 if (transaction_write(tdb, FREELIST_TOP, tdb->transaction->hash_heads,
Theodore Ts'o106ad962007-04-04 21:26:37 -04001668 TDB_HASHTABLE_SIZE(tdb)) != 0) {
1669 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_start: failed to prime hash table\n"));
1670 tdb->ecode = TDB_ERR_IO;
Theodore Ts'o3eed36b2008-02-17 06:59:21 -05001671 tdb->methods = tdb->transaction->io_methods;
Theodore Ts'o106ad962007-04-04 21:26:37 -04001672 goto fail;
1673 }
1674
1675 return 0;
Theodore Ts'oefc6f622008-08-27 23:07:54 -04001676
Theodore Ts'o106ad962007-04-04 21:26:37 -04001677fail:
1678 tdb_brlock(tdb, FREELIST_TOP, F_UNLCK, F_SETLKW, 0, 0);
Theodore Ts'o3eed36b2008-02-17 06:59:21 -05001679 tdb_transaction_unlock(tdb);
Theodore Ts'o106ad962007-04-04 21:26:37 -04001680 SAFE_FREE(tdb->transaction->hash_heads);
1681 SAFE_FREE(tdb->transaction);
1682 return -1;
1683}
1684
1685
1686/*
1687 cancel the current transaction
1688*/
1689int tdb_transaction_cancel(struct tdb_context *tdb)
Theodore Ts'oefc6f622008-08-27 23:07:54 -04001690{
Theodore Ts'o106ad962007-04-04 21:26:37 -04001691 if (tdb->transaction == NULL) {
1692 TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_cancel: no transaction\n"));
1693 return -1;
1694 }
1695
1696 if (tdb->transaction->nesting != 0) {
1697 tdb->transaction->transaction_error = 1;
1698 tdb->transaction->nesting--;
1699 return 0;
Theodore Ts'oefc6f622008-08-27 23:07:54 -04001700 }
Theodore Ts'o106ad962007-04-04 21:26:37 -04001701
1702 tdb->map_size = tdb->transaction->old_map_size;
1703
1704 /* free all the transaction elements */
1705 while (tdb->transaction->elements) {
1706 struct tdb_transaction_el *el = tdb->transaction->elements;
1707 tdb->transaction->elements = el->next;
1708 free(el->data);
1709 free(el);
1710 }
1711
1712 /* remove any global lock created during the transaction */
1713 if (tdb->global_lock.count != 0) {
1714 tdb_brlock(tdb, FREELIST_TOP, F_UNLCK, F_SETLKW, 0, 4*tdb->header.hash_size);
1715 tdb->global_lock.count = 0;
1716 }
1717
1718 /* remove any locks created during the transaction */
1719 if (tdb->num_locks != 0) {
1720 int i;
1721 for (i=0;i<tdb->num_lockrecs;i++) {
1722 tdb_brlock(tdb,FREELIST_TOP+4*tdb->lockrecs[i].list,
1723 F_UNLCK,F_SETLKW, 0, 1);
1724 }
1725 tdb->num_locks = 0;
Theodore Ts'o3eed36b2008-02-17 06:59:21 -05001726 tdb->num_lockrecs = 0;
1727 SAFE_FREE(tdb->lockrecs);
Theodore Ts'o106ad962007-04-04 21:26:37 -04001728 }
1729
1730 /* restore the normal io methods */
1731 tdb->methods = tdb->transaction->io_methods;
1732
1733 tdb_brlock(tdb, FREELIST_TOP, F_UNLCK, F_SETLKW, 0, 0);
Theodore Ts'o3eed36b2008-02-17 06:59:21 -05001734 tdb_transaction_unlock(tdb);
Theodore Ts'o106ad962007-04-04 21:26:37 -04001735 SAFE_FREE(tdb->transaction->hash_heads);
1736 SAFE_FREE(tdb->transaction);
Theodore Ts'oefc6f622008-08-27 23:07:54 -04001737
Theodore Ts'o106ad962007-04-04 21:26:37 -04001738 return 0;
1739}
1740
1741/*
1742 sync to disk
1743*/
1744static int transaction_sync(struct tdb_context *tdb, tdb_off_t offset, tdb_len_t length)
Theodore Ts'oefc6f622008-08-27 23:07:54 -04001745{
Theodore Ts'o106ad962007-04-04 21:26:37 -04001746 if (fsync(tdb->fd) != 0) {
1747 tdb->ecode = TDB_ERR_IO;
1748 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction: fsync failed\n"));
1749 return -1;
1750 }
Theodore Ts'odb40c202011-11-28 12:31:29 -05001751#if defined(HAVE_MSYNC) && defined(MS_SYNC)
Theodore Ts'o106ad962007-04-04 21:26:37 -04001752 if (tdb->map_ptr) {
1753 tdb_off_t moffset = offset & ~(tdb->page_size-1);
Theodore Ts'oefc6f622008-08-27 23:07:54 -04001754 if (msync(moffset + (char *)tdb->map_ptr,
Theodore Ts'o106ad962007-04-04 21:26:37 -04001755 length + (offset - moffset), MS_SYNC) != 0) {
1756 tdb->ecode = TDB_ERR_IO;
1757 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction: msync failed - %s\n",
1758 strerror(errno)));
1759 return -1;
1760 }
1761 }
1762#endif
1763 return 0;
1764}
1765
1766
1767/*
1768 work out how much space the linearised recovery data will consume
1769*/
1770static tdb_len_t tdb_recovery_size(struct tdb_context *tdb)
1771{
1772 struct tdb_transaction_el *el;
1773 tdb_len_t recovery_size = 0;
1774
1775 recovery_size = sizeof(u32);
1776 for (el=tdb->transaction->elements;el;el=el->next) {
1777 if (el->offset >= tdb->transaction->old_map_size) {
1778 continue;
1779 }
1780 recovery_size += 2*sizeof(tdb_off_t) + el->length;
1781 }
1782
1783 return recovery_size;
1784}
1785
1786/*
1787 allocate the recovery area, or use an existing recovery area if it is
1788 large enough
1789*/
Theodore Ts'oefc6f622008-08-27 23:07:54 -04001790static int tdb_recovery_allocate(struct tdb_context *tdb,
Theodore Ts'o106ad962007-04-04 21:26:37 -04001791 tdb_len_t *recovery_size,
1792 tdb_off_t *recovery_offset,
1793 tdb_len_t *recovery_max_size)
1794{
1795 struct list_struct rec;
1796 const struct tdb_methods *methods = tdb->transaction->io_methods;
1797 tdb_off_t recovery_head;
1798
1799 if (tdb_ofs_read(tdb, TDB_RECOVERY_HEAD, &recovery_head) == -1) {
1800 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_recovery_allocate: failed to read recovery head\n"));
1801 return -1;
1802 }
1803
1804 rec.rec_len = 0;
1805
Theodore Ts'oefc6f622008-08-27 23:07:54 -04001806 if (recovery_head != 0 &&
Theodore Ts'o106ad962007-04-04 21:26:37 -04001807 methods->tdb_read(tdb, recovery_head, &rec, sizeof(rec), DOCONV()) == -1) {
1808 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_recovery_allocate: failed to read recovery record\n"));
1809 return -1;
1810 }
1811
1812 *recovery_size = tdb_recovery_size(tdb);
1813
1814 if (recovery_head != 0 && *recovery_size <= rec.rec_len) {
1815 /* it fits in the existing area */
1816 *recovery_max_size = rec.rec_len;
1817 *recovery_offset = recovery_head;
1818 return 0;
1819 }
1820
1821 /* we need to free up the old recovery area, then allocate a
1822 new one at the end of the file. Note that we cannot use
1823 tdb_allocate() to allocate the new one as that might return
1824 us an area that is being currently used (as of the start of
1825 the transaction) */
1826 if (recovery_head != 0) {
1827 if (tdb_free(tdb, recovery_head, &rec) == -1) {
1828 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_recovery_allocate: failed to free previous recovery area\n"));
1829 return -1;
1830 }
1831 }
1832
1833 /* the tdb_free() call might have increased the recovery size */
1834 *recovery_size = tdb_recovery_size(tdb);
1835
1836 /* round up to a multiple of page size */
1837 *recovery_max_size = TDB_ALIGN(sizeof(rec) + *recovery_size, tdb->page_size) - sizeof(rec);
1838 *recovery_offset = tdb->map_size;
1839 recovery_head = *recovery_offset;
1840
Theodore Ts'oefc6f622008-08-27 23:07:54 -04001841 if (methods->tdb_expand_file(tdb, tdb->transaction->old_map_size,
Theodore Ts'o106ad962007-04-04 21:26:37 -04001842 (tdb->map_size - tdb->transaction->old_map_size) +
1843 sizeof(rec) + *recovery_max_size) == -1) {
1844 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_recovery_allocate: failed to create recovery area\n"));
1845 return -1;
1846 }
1847
1848 /* remap the file (if using mmap) */
1849 methods->tdb_oob(tdb, tdb->map_size + 1, 1);
1850
1851 /* we have to reset the old map size so that we don't try to expand the file
1852 again in the transaction commit, which would destroy the recovery area */
1853 tdb->transaction->old_map_size = tdb->map_size;
1854
1855 /* write the recovery header offset and sync - we can sync without a race here
1856 as the magic ptr in the recovery record has not been set */
1857 CONVERT(recovery_head);
Theodore Ts'oefc6f622008-08-27 23:07:54 -04001858 if (methods->tdb_write(tdb, TDB_RECOVERY_HEAD,
Theodore Ts'o106ad962007-04-04 21:26:37 -04001859 &recovery_head, sizeof(tdb_off_t)) == -1) {
1860 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_recovery_allocate: failed to write recovery head\n"));
1861 return -1;
1862 }
1863
1864 return 0;
1865}
1866
1867
1868/*
1869 setup the recovery data that will be used on a crash during commit
1870*/
Theodore Ts'oefc6f622008-08-27 23:07:54 -04001871static int transaction_setup_recovery(struct tdb_context *tdb,
Theodore Ts'o106ad962007-04-04 21:26:37 -04001872 tdb_off_t *magic_offset)
1873{
1874 struct tdb_transaction_el *el;
1875 tdb_len_t recovery_size;
1876 unsigned char *data, *p;
1877 const struct tdb_methods *methods = tdb->transaction->io_methods;
1878 struct list_struct *rec;
1879 tdb_off_t recovery_offset, recovery_max_size;
1880 tdb_off_t old_map_size = tdb->transaction->old_map_size;
1881 u32 magic, tailer;
1882
1883 /*
1884 check that the recovery area has enough space
1885 */
Theodore Ts'oefc6f622008-08-27 23:07:54 -04001886 if (tdb_recovery_allocate(tdb, &recovery_size,
Theodore Ts'o106ad962007-04-04 21:26:37 -04001887 &recovery_offset, &recovery_max_size) == -1) {
1888 return -1;
1889 }
1890
1891 data = (unsigned char *)malloc(recovery_size + sizeof(*rec));
1892 if (data == NULL) {
1893 tdb->ecode = TDB_ERR_OOM;
1894 return -1;
1895 }
1896
1897 rec = (struct list_struct *)data;
1898 memset(rec, 0, sizeof(*rec));
1899
1900 rec->magic = 0;
1901 rec->data_len = recovery_size;
1902 rec->rec_len = recovery_max_size;
1903 rec->key_len = old_map_size;
1904 CONVERT(rec);
1905
1906 /* build the recovery data into a single blob to allow us to do a single
1907 large write, which should be more efficient */
1908 p = data + sizeof(*rec);
1909 for (el=tdb->transaction->elements;el;el=el->next) {
1910 if (el->offset >= old_map_size) {
1911 continue;
1912 }
1913 if (el->offset + el->length > tdb->transaction->old_map_size) {
1914 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_setup_recovery: transaction data over new region boundary\n"));
1915 free(data);
1916 tdb->ecode = TDB_ERR_CORRUPT;
1917 return -1;
1918 }
1919 memcpy(p, &el->offset, 4);
1920 memcpy(p+4, &el->length, 4);
1921 if (DOCONV()) {
1922 tdb_convert(p, 8);
1923 }
1924 /* the recovery area contains the old data, not the
1925 new data, so we have to call the original tdb_read
1926 method to get it */
1927 if (methods->tdb_read(tdb, el->offset, p + 8, el->length, 0) != 0) {
1928 free(data);
1929 tdb->ecode = TDB_ERR_IO;
1930 return -1;
1931 }
1932 p += 8 + el->length;
1933 }
1934
1935 /* and the tailer */
1936 tailer = sizeof(*rec) + recovery_max_size;
1937 memcpy(p, &tailer, 4);
1938 CONVERT(p);
1939
1940 /* write the recovery data to the recovery area */
1941 if (methods->tdb_write(tdb, recovery_offset, data, sizeof(*rec) + recovery_size) == -1) {
1942 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_setup_recovery: failed to write recovery data\n"));
1943 free(data);
1944 tdb->ecode = TDB_ERR_IO;
1945 return -1;
1946 }
1947
1948 /* as we don't have ordered writes, we have to sync the recovery
1949 data before we update the magic to indicate that the recovery
1950 data is present */
1951 if (transaction_sync(tdb, recovery_offset, sizeof(*rec) + recovery_size) == -1) {
1952 free(data);
1953 return -1;
1954 }
1955
1956 free(data);
1957
1958 magic = TDB_RECOVERY_MAGIC;
1959 CONVERT(magic);
1960
1961 *magic_offset = recovery_offset + offsetof(struct list_struct, magic);
1962
1963 if (methods->tdb_write(tdb, *magic_offset, &magic, sizeof(magic)) == -1) {
1964 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_setup_recovery: failed to write recovery magic\n"));
1965 tdb->ecode = TDB_ERR_IO;
1966 return -1;
1967 }
1968
1969 /* ensure the recovery magic marker is on disk */
1970 if (transaction_sync(tdb, *magic_offset, sizeof(magic)) == -1) {
1971 return -1;
1972 }
1973
1974 return 0;
1975}
1976
1977/*
1978 commit the current transaction
1979*/
1980int tdb_transaction_commit(struct tdb_context *tdb)
Theodore Ts'oefc6f622008-08-27 23:07:54 -04001981{
Theodore Ts'o106ad962007-04-04 21:26:37 -04001982 const struct tdb_methods *methods;
1983 tdb_off_t magic_offset = 0;
1984 u32 zero = 0;
1985
1986 if (tdb->transaction == NULL) {
1987 TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_commit: no transaction\n"));
1988 return -1;
1989 }
1990
1991 if (tdb->transaction->transaction_error) {
1992 tdb->ecode = TDB_ERR_IO;
1993 tdb_transaction_cancel(tdb);
1994 TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_commit: transaction error pending\n"));
1995 return -1;
1996 }
1997
1998 if (tdb->transaction->nesting != 0) {
1999 tdb->transaction->nesting--;
2000 return 0;
Theodore Ts'oefc6f622008-08-27 23:07:54 -04002001 }
Theodore Ts'o106ad962007-04-04 21:26:37 -04002002
2003 /* check for a null transaction */
2004 if (tdb->transaction->elements == NULL) {
2005 tdb_transaction_cancel(tdb);
2006 return 0;
2007 }
2008
2009 methods = tdb->transaction->io_methods;
Theodore Ts'oefc6f622008-08-27 23:07:54 -04002010
Theodore Ts'o106ad962007-04-04 21:26:37 -04002011 /* if there are any locks pending then the caller has not
2012 nested their locks properly, so fail the transaction */
2013 if (tdb->num_locks || tdb->global_lock.count) {
2014 tdb->ecode = TDB_ERR_LOCK;
2015 TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_commit: locks pending on commit\n"));
2016 tdb_transaction_cancel(tdb);
2017 return -1;
2018 }
2019
2020 /* upgrade the main transaction lock region to a write lock */
2021 if (tdb_brlock_upgrade(tdb, FREELIST_TOP, 0) == -1) {
2022 TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_start: failed to upgrade hash locks\n"));
2023 tdb->ecode = TDB_ERR_LOCK;
2024 tdb_transaction_cancel(tdb);
2025 return -1;
2026 }
2027
2028 /* get the global lock - this prevents new users attaching to the database
2029 during the commit */
2030 if (tdb_brlock(tdb, GLOBAL_LOCK, F_WRLCK, F_SETLKW, 0, 1) == -1) {
2031 TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_commit: failed to get global lock\n"));
2032 tdb->ecode = TDB_ERR_LOCK;
2033 tdb_transaction_cancel(tdb);
2034 return -1;
2035 }
2036
2037 if (!(tdb->flags & TDB_NOSYNC)) {
2038 /* write the recovery data to the end of the file */
2039 if (transaction_setup_recovery(tdb, &magic_offset) == -1) {
2040 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_commit: failed to setup recovery data\n"));
2041 tdb_brlock(tdb, GLOBAL_LOCK, F_UNLCK, F_SETLKW, 0, 1);
2042 tdb_transaction_cancel(tdb);
2043 return -1;
2044 }
2045 }
2046
2047 /* expand the file to the new size if needed */
2048 if (tdb->map_size != tdb->transaction->old_map_size) {
Theodore Ts'oefc6f622008-08-27 23:07:54 -04002049 if (methods->tdb_expand_file(tdb, tdb->transaction->old_map_size,
2050 tdb->map_size -
Theodore Ts'o106ad962007-04-04 21:26:37 -04002051 tdb->transaction->old_map_size) == -1) {
2052 tdb->ecode = TDB_ERR_IO;
2053 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_commit: expansion failed\n"));
2054 tdb_brlock(tdb, GLOBAL_LOCK, F_UNLCK, F_SETLKW, 0, 1);
2055 tdb_transaction_cancel(tdb);
2056 return -1;
2057 }
2058 tdb->map_size = tdb->transaction->old_map_size;
2059 methods->tdb_oob(tdb, tdb->map_size + 1, 1);
2060 }
2061
2062 /* perform all the writes */
2063 while (tdb->transaction->elements) {
2064 struct tdb_transaction_el *el = tdb->transaction->elements;
2065
2066 if (methods->tdb_write(tdb, el->offset, el->data, el->length) == -1) {
2067 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_commit: write failed during commit\n"));
Theodore Ts'oefc6f622008-08-27 23:07:54 -04002068
Theodore Ts'o106ad962007-04-04 21:26:37 -04002069 /* we've overwritten part of the data and
2070 possibly expanded the file, so we need to
2071 run the crash recovery code */
2072 tdb->methods = methods;
Theodore Ts'oefc6f622008-08-27 23:07:54 -04002073 tdb_transaction_recover(tdb);
Theodore Ts'o106ad962007-04-04 21:26:37 -04002074
2075 tdb_transaction_cancel(tdb);
2076 tdb_brlock(tdb, GLOBAL_LOCK, F_UNLCK, F_SETLKW, 0, 1);
2077
2078 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_commit: write failed\n"));
2079 return -1;
2080 }
2081 tdb->transaction->elements = el->next;
Theodore Ts'oefc6f622008-08-27 23:07:54 -04002082 free(el->data);
Theodore Ts'o106ad962007-04-04 21:26:37 -04002083 free(el);
Theodore Ts'oefc6f622008-08-27 23:07:54 -04002084 }
Theodore Ts'o106ad962007-04-04 21:26:37 -04002085
2086 if (!(tdb->flags & TDB_NOSYNC)) {
2087 /* ensure the new data is on disk */
2088 if (transaction_sync(tdb, 0, tdb->map_size) == -1) {
2089 return -1;
2090 }
2091
2092 /* remove the recovery marker */
2093 if (methods->tdb_write(tdb, magic_offset, &zero, 4) == -1) {
2094 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_commit: failed to remove recovery magic\n"));
2095 return -1;
2096 }
2097
2098 /* ensure the recovery marker has been removed on disk */
2099 if (transaction_sync(tdb, magic_offset, 4) == -1) {
2100 return -1;
2101 }
2102 }
2103
2104 tdb_brlock(tdb, GLOBAL_LOCK, F_UNLCK, F_SETLKW, 0, 1);
2105
2106 /*
2107 TODO: maybe write to some dummy hdr field, or write to magic
2108 offset without mmap, before the last sync, instead of the
2109 utime() call
2110 */
2111
2112 /* on some systems (like Linux 2.6.x) changes via mmap/msync
2113 don't change the mtime of the file, this means the file may
2114 not be backed up (as tdb rounding to block sizes means that
2115 file size changes are quite rare too). The following forces
2116 mtime changes when a transaction completes */
2117#ifdef HAVE_UTIME
2118 utime(tdb->name, NULL);
2119#endif
2120
2121 /* use a transaction cancel to free memory and remove the
2122 transaction locks */
2123 tdb_transaction_cancel(tdb);
2124 return 0;
2125}
2126
2127
2128/*
2129 recover from an aborted transaction. Must be called with exclusive
2130 database write access already established (including the global
2131 lock to prevent new processes attaching)
2132*/
2133int tdb_transaction_recover(struct tdb_context *tdb)
2134{
2135 tdb_off_t recovery_head, recovery_eof;
2136 unsigned char *data, *p;
2137 u32 zero = 0;
2138 struct list_struct rec;
2139
2140 /* find the recovery area */
2141 if (tdb_ofs_read(tdb, TDB_RECOVERY_HEAD, &recovery_head) == -1) {
2142 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_recover: failed to read recovery head\n"));
2143 tdb->ecode = TDB_ERR_IO;
2144 return -1;
2145 }
2146
2147 if (recovery_head == 0) {
2148 /* we have never allocated a recovery record */
2149 return 0;
2150 }
2151
2152 /* read the recovery record */
Theodore Ts'oefc6f622008-08-27 23:07:54 -04002153 if (tdb->methods->tdb_read(tdb, recovery_head, &rec,
Theodore Ts'o106ad962007-04-04 21:26:37 -04002154 sizeof(rec), DOCONV()) == -1) {
Theodore Ts'oefc6f622008-08-27 23:07:54 -04002155 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_recover: failed to read recovery record\n"));
Theodore Ts'o106ad962007-04-04 21:26:37 -04002156 tdb->ecode = TDB_ERR_IO;
2157 return -1;
2158 }
2159
2160 if (rec.magic != TDB_RECOVERY_MAGIC) {
2161 /* there is no valid recovery data */
2162 return 0;
2163 }
2164
2165 if (tdb->read_only) {
2166 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_recover: attempt to recover read only database\n"));
2167 tdb->ecode = TDB_ERR_CORRUPT;
2168 return -1;
2169 }
2170
2171 recovery_eof = rec.key_len;
2172
2173 data = (unsigned char *)malloc(rec.data_len);
2174 if (data == NULL) {
Theodore Ts'oefc6f622008-08-27 23:07:54 -04002175 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_recover: failed to allocate recovery data\n"));
Theodore Ts'o106ad962007-04-04 21:26:37 -04002176 tdb->ecode = TDB_ERR_OOM;
2177 return -1;
2178 }
2179
2180 /* read the full recovery data */
2181 if (tdb->methods->tdb_read(tdb, recovery_head + sizeof(rec), data,
2182 rec.data_len, 0) == -1) {
Theodore Ts'oefc6f622008-08-27 23:07:54 -04002183 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_recover: failed to read recovery data\n"));
Theodore Ts'o106ad962007-04-04 21:26:37 -04002184 tdb->ecode = TDB_ERR_IO;
2185 return -1;
2186 }
2187
2188 /* recover the file data */
2189 p = data;
2190 while (p+8 < data + rec.data_len) {
2191 u32 ofs, len;
2192 if (DOCONV()) {
2193 tdb_convert(p, 8);
2194 }
2195 memcpy(&ofs, p, 4);
2196 memcpy(&len, p+4, 4);
2197
2198 if (tdb->methods->tdb_write(tdb, ofs, p+8, len) == -1) {
2199 free(data);
2200 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_recover: failed to recover %d bytes at offset %d\n", len, ofs));
2201 tdb->ecode = TDB_ERR_IO;
2202 return -1;
2203 }
2204 p += 8 + len;
2205 }
2206
2207 free(data);
2208
2209 if (transaction_sync(tdb, 0, tdb->map_size) == -1) {
2210 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_recover: failed to sync recovery\n"));
2211 tdb->ecode = TDB_ERR_IO;
2212 return -1;
2213 }
2214
2215 /* if the recovery area is after the recovered eof then remove it */
2216 if (recovery_eof <= recovery_head) {
2217 if (tdb_ofs_write(tdb, TDB_RECOVERY_HEAD, &zero) == -1) {
2218 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_recover: failed to remove recovery head\n"));
2219 tdb->ecode = TDB_ERR_IO;
Theodore Ts'oefc6f622008-08-27 23:07:54 -04002220 return -1;
Theodore Ts'o106ad962007-04-04 21:26:37 -04002221 }
2222 }
2223
2224 /* remove the recovery magic */
Theodore Ts'oefc6f622008-08-27 23:07:54 -04002225 if (tdb_ofs_write(tdb, recovery_head + offsetof(struct list_struct, magic),
Theodore Ts'o106ad962007-04-04 21:26:37 -04002226 &zero) == -1) {
2227 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_recover: failed to remove recovery magic\n"));
2228 tdb->ecode = TDB_ERR_IO;
Theodore Ts'oefc6f622008-08-27 23:07:54 -04002229 return -1;
Theodore Ts'o106ad962007-04-04 21:26:37 -04002230 }
Theodore Ts'oefc6f622008-08-27 23:07:54 -04002231
Theodore Ts'o106ad962007-04-04 21:26:37 -04002232 /* reduce the file size to the old size */
2233 tdb_munmap(tdb);
2234 if (ftruncate(tdb->fd, recovery_eof) != 0) {
2235 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_recover: failed to reduce to recovery size\n"));
2236 tdb->ecode = TDB_ERR_IO;
Theodore Ts'oefc6f622008-08-27 23:07:54 -04002237 return -1;
Theodore Ts'o106ad962007-04-04 21:26:37 -04002238 }
2239 tdb->map_size = recovery_eof;
2240 tdb_mmap(tdb);
2241
2242 if (transaction_sync(tdb, 0, recovery_eof) == -1) {
2243 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_recover: failed to sync2 recovery\n"));
2244 tdb->ecode = TDB_ERR_IO;
2245 return -1;
2246 }
2247
Theodore Ts'oefc6f622008-08-27 23:07:54 -04002248 TDB_LOG((tdb, TDB_DEBUG_TRACE, "tdb_transaction_recover: recovered %d byte database\n",
Theodore Ts'o106ad962007-04-04 21:26:37 -04002249 recovery_eof));
2250
2251 /* all done */
2252 return 0;
2253}
2254
2255/* file: freelist.c */
2256
2257/* read a freelist record and check for simple errors */
Theodore Ts'o3eed36b2008-02-17 06:59:21 -05002258static int tdb_rec_free_read(struct tdb_context *tdb, tdb_off_t off, struct list_struct *rec)
Theodore Ts'o106ad962007-04-04 21:26:37 -04002259{
2260 if (tdb->methods->tdb_read(tdb, off, rec, sizeof(*rec),DOCONV()) == -1)
2261 return -1;
2262
2263 if (rec->magic == TDB_MAGIC) {
2264 /* this happens when a app is showdown while deleting a record - we should
2265 not completely fail when this happens */
Theodore Ts'oefc6f622008-08-27 23:07:54 -04002266 TDB_LOG((tdb, TDB_DEBUG_WARNING, "tdb_rec_free_read non-free magic 0x%x at offset=%d - fixing\n",
Theodore Ts'o106ad962007-04-04 21:26:37 -04002267 rec->magic, off));
2268 rec->magic = TDB_FREE_MAGIC;
2269 if (tdb->methods->tdb_write(tdb, off, rec, sizeof(*rec)) == -1)
2270 return -1;
2271 }
2272
2273 if (rec->magic != TDB_FREE_MAGIC) {
2274 /* Ensure ecode is set for log fn. */
2275 tdb->ecode = TDB_ERR_CORRUPT;
Theodore Ts'oefc6f622008-08-27 23:07:54 -04002276 TDB_LOG((tdb, TDB_DEBUG_WARNING, "tdb_rec_free_read bad magic 0x%x at offset=%d\n",
Theodore Ts'o106ad962007-04-04 21:26:37 -04002277 rec->magic, off));
2278 return TDB_ERRCODE(TDB_ERR_CORRUPT, -1);
2279 }
2280 if (tdb->methods->tdb_oob(tdb, rec->next+sizeof(*rec), 0) != 0)
2281 return -1;
2282 return 0;
2283}
2284
2285
2286
2287/* Remove an element from the freelist. Must have alloc lock. */
2288static int remove_from_freelist(struct tdb_context *tdb, tdb_off_t off, tdb_off_t next)
2289{
2290 tdb_off_t last_ptr, i;
2291
2292 /* read in the freelist top */
2293 last_ptr = FREELIST_TOP;
2294 while (tdb_ofs_read(tdb, last_ptr, &i) != -1 && i != 0) {
2295 if (i == off) {
2296 /* We've found it! */
2297 return tdb_ofs_write(tdb, last_ptr, &next);
2298 }
2299 /* Follow chain (next offset is at start of record) */
2300 last_ptr = i;
2301 }
2302 TDB_LOG((tdb, TDB_DEBUG_FATAL,"remove_from_freelist: not on list at off=%d\n", off));
2303 return TDB_ERRCODE(TDB_ERR_CORRUPT, -1);
2304}
2305
2306
2307/* update a record tailer (must hold allocation lock) */
2308static int update_tailer(struct tdb_context *tdb, tdb_off_t offset,
2309 const struct list_struct *rec)
2310{
2311 tdb_off_t totalsize;
2312
2313 /* Offset of tailer from record header */
2314 totalsize = sizeof(*rec) + rec->rec_len;
2315 return tdb_ofs_write(tdb, offset + totalsize - sizeof(tdb_off_t),
2316 &totalsize);
2317}
2318
2319/* Add an element into the freelist. Merge adjacent records if
2320 neccessary. */
2321int tdb_free(struct tdb_context *tdb, tdb_off_t offset, struct list_struct *rec)
2322{
2323 tdb_off_t right, left;
2324
2325 /* Allocation and tailer lock */
2326 if (tdb_lock(tdb, -1, F_WRLCK) != 0)
2327 return -1;
2328
2329 /* set an initial tailer, so if we fail we don't leave a bogus record */
2330 if (update_tailer(tdb, offset, rec) != 0) {
2331 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_free: update_tailer failed!\n"));
2332 goto fail;
2333 }
2334
2335 /* Look right first (I'm an Australian, dammit) */
2336 right = offset + sizeof(*rec) + rec->rec_len;
2337 if (right + sizeof(*rec) <= tdb->map_size) {
2338 struct list_struct r;
2339
2340 if (tdb->methods->tdb_read(tdb, right, &r, sizeof(r), DOCONV()) == -1) {
2341 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_free: right read failed at %u\n", right));
2342 goto left;
2343 }
2344
2345 /* If it's free, expand to include it. */
2346 if (r.magic == TDB_FREE_MAGIC) {
2347 if (remove_from_freelist(tdb, right, r.next) == -1) {
2348 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_free: right free failed at %u\n", right));
2349 goto left;
2350 }
2351 rec->rec_len += sizeof(r) + r.rec_len;
2352 }
2353 }
2354
2355left:
2356 /* Look left */
2357 left = offset - sizeof(tdb_off_t);
2358 if (left > TDB_DATA_START(tdb->header.hash_size)) {
2359 struct list_struct l;
2360 tdb_off_t leftsize;
Theodore Ts'oefc6f622008-08-27 23:07:54 -04002361
Theodore Ts'o106ad962007-04-04 21:26:37 -04002362 /* Read in tailer and jump back to header */
2363 if (tdb_ofs_read(tdb, left, &leftsize) == -1) {
2364 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_free: left offset read failed at %u\n", left));
2365 goto update;
2366 }
2367
2368 /* it could be uninitialised data */
2369 if (leftsize == 0 || leftsize == TDB_PAD_U32) {
2370 goto update;
2371 }
2372
2373 left = offset - leftsize;
2374
2375 /* Now read in record */
2376 if (tdb->methods->tdb_read(tdb, left, &l, sizeof(l), DOCONV()) == -1) {
2377 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_free: left read failed at %u (%u)\n", left, leftsize));
2378 goto update;
2379 }
2380
2381 /* If it's free, expand to include it. */
2382 if (l.magic == TDB_FREE_MAGIC) {
2383 if (remove_from_freelist(tdb, left, l.next) == -1) {
2384 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_free: left free failed at %u\n", left));
2385 goto update;
2386 } else {
2387 offset = left;
2388 rec->rec_len += leftsize;
2389 }
2390 }
2391 }
2392
2393update:
2394 if (update_tailer(tdb, offset, rec) == -1) {
2395 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_free: update_tailer failed at %u\n", offset));
2396 goto fail;
2397 }
2398
2399 /* Now, prepend to free list */
2400 rec->magic = TDB_FREE_MAGIC;
2401
2402 if (tdb_ofs_read(tdb, FREELIST_TOP, &rec->next) == -1 ||
2403 tdb_rec_write(tdb, offset, rec) == -1 ||
2404 tdb_ofs_write(tdb, FREELIST_TOP, &offset) == -1) {
2405 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_free record write failed at offset=%d\n", offset));
2406 goto fail;
2407 }
2408
2409 /* And we're done. */
2410 tdb_unlock(tdb, -1, F_WRLCK);
2411 return 0;
2412
2413 fail:
2414 tdb_unlock(tdb, -1, F_WRLCK);
2415 return -1;
2416}
2417
2418
Theodore Ts'oefc6f622008-08-27 23:07:54 -04002419/*
Theodore Ts'o106ad962007-04-04 21:26:37 -04002420 the core of tdb_allocate - called when we have decided which
2421 free list entry to use
2422 */
2423static tdb_off_t tdb_allocate_ofs(struct tdb_context *tdb, tdb_len_t length, tdb_off_t rec_ptr,
2424 struct list_struct *rec, tdb_off_t last_ptr)
2425{
2426 struct list_struct newrec;
2427 tdb_off_t newrec_ptr;
2428
2429 memset(&newrec, '\0', sizeof(newrec));
2430
2431 /* found it - now possibly split it up */
2432 if (rec->rec_len > length + MIN_REC_SIZE) {
2433 /* Length of left piece */
2434 length = TDB_ALIGN(length, TDB_ALIGNMENT);
Theodore Ts'oefc6f622008-08-27 23:07:54 -04002435
Theodore Ts'o106ad962007-04-04 21:26:37 -04002436 /* Right piece to go on free list */
2437 newrec.rec_len = rec->rec_len - (sizeof(*rec) + length);
2438 newrec_ptr = rec_ptr + sizeof(*rec) + length;
Theodore Ts'oefc6f622008-08-27 23:07:54 -04002439
Theodore Ts'o106ad962007-04-04 21:26:37 -04002440 /* And left record is shortened */
2441 rec->rec_len = length;
2442 } else {
2443 newrec_ptr = 0;
2444 }
Theodore Ts'oefc6f622008-08-27 23:07:54 -04002445
Theodore Ts'o106ad962007-04-04 21:26:37 -04002446 /* Remove allocated record from the free list */
2447 if (tdb_ofs_write(tdb, last_ptr, &rec->next) == -1) {
2448 return 0;
2449 }
Theodore Ts'oefc6f622008-08-27 23:07:54 -04002450
Theodore Ts'o106ad962007-04-04 21:26:37 -04002451 /* Update header: do this before we drop alloc
2452 lock, otherwise tdb_free() might try to
2453 merge with us, thinking we're free.
2454 (Thanks Jeremy Allison). */
2455 rec->magic = TDB_MAGIC;
2456 if (tdb_rec_write(tdb, rec_ptr, rec) == -1) {
2457 return 0;
2458 }
Theodore Ts'oefc6f622008-08-27 23:07:54 -04002459
Theodore Ts'o106ad962007-04-04 21:26:37 -04002460 /* Did we create new block? */
2461 if (newrec_ptr) {
2462 /* Update allocated record tailer (we
2463 shortened it). */
2464 if (update_tailer(tdb, rec_ptr, rec) == -1) {
2465 return 0;
2466 }
Theodore Ts'oefc6f622008-08-27 23:07:54 -04002467
Theodore Ts'o106ad962007-04-04 21:26:37 -04002468 /* Free new record */
2469 if (tdb_free(tdb, newrec_ptr, &newrec) == -1) {
2470 return 0;
2471 }
2472 }
Theodore Ts'oefc6f622008-08-27 23:07:54 -04002473
Theodore Ts'o106ad962007-04-04 21:26:37 -04002474 /* all done - return the new record offset */
2475 return rec_ptr;
2476}
2477
2478/* allocate some space from the free list. The offset returned points
2479 to a unconnected list_struct within the database with room for at
2480 least length bytes of total data
2481
2482 0 is returned if the space could not be allocated
2483 */
2484tdb_off_t tdb_allocate(struct tdb_context *tdb, tdb_len_t length, struct list_struct *rec)
2485{
2486 tdb_off_t rec_ptr, last_ptr, newrec_ptr;
2487 struct {
2488 tdb_off_t rec_ptr, last_ptr;
2489 tdb_len_t rec_len;
2490 } bestfit;
2491
2492 if (tdb_lock(tdb, -1, F_WRLCK) == -1)
2493 return 0;
2494
2495 /* Extra bytes required for tailer */
2496 length += sizeof(tdb_off_t);
2497
2498 again:
2499 last_ptr = FREELIST_TOP;
2500
2501 /* read in the freelist top */
2502 if (tdb_ofs_read(tdb, FREELIST_TOP, &rec_ptr) == -1)
2503 goto fail;
2504
2505 bestfit.rec_ptr = 0;
2506 bestfit.last_ptr = 0;
2507 bestfit.rec_len = 0;
2508
Theodore Ts'oefc6f622008-08-27 23:07:54 -04002509 /*
Theodore Ts'o106ad962007-04-04 21:26:37 -04002510 this is a best fit allocation strategy. Originally we used
2511 a first fit strategy, but it suffered from massive fragmentation
2512 issues when faced with a slowly increasing record size.
2513 */
2514 while (rec_ptr) {
Theodore Ts'o3eed36b2008-02-17 06:59:21 -05002515 if (tdb_rec_free_read(tdb, rec_ptr, rec) == -1) {
Theodore Ts'o106ad962007-04-04 21:26:37 -04002516 goto fail;
2517 }
2518
2519 if (rec->rec_len >= length) {
2520 if (bestfit.rec_ptr == 0 ||
2521 rec->rec_len < bestfit.rec_len) {
2522 bestfit.rec_len = rec->rec_len;
2523 bestfit.rec_ptr = rec_ptr;
2524 bestfit.last_ptr = last_ptr;
2525 /* consider a fit to be good enough if
2526 we aren't wasting more than half
2527 the space */
2528 if (bestfit.rec_len < 2*length) {
2529 break;
2530 }
2531 }
2532 }
2533
2534 /* move to the next record */
2535 last_ptr = rec_ptr;
2536 rec_ptr = rec->next;
2537 }
2538
2539 if (bestfit.rec_ptr != 0) {
Theodore Ts'o3eed36b2008-02-17 06:59:21 -05002540 if (tdb_rec_free_read(tdb, bestfit.rec_ptr, rec) == -1) {
Theodore Ts'o106ad962007-04-04 21:26:37 -04002541 goto fail;
2542 }
2543
2544 newrec_ptr = tdb_allocate_ofs(tdb, length, bestfit.rec_ptr, rec, bestfit.last_ptr);
2545 tdb_unlock(tdb, -1, F_WRLCK);
2546 return newrec_ptr;
2547 }
2548
2549 /* we didn't find enough space. See if we can expand the
2550 database and if we can then try again */
2551 if (tdb_expand(tdb, length + sizeof(*rec)) == 0)
2552 goto again;
2553 fail:
2554 tdb_unlock(tdb, -1, F_WRLCK);
2555 return 0;
2556}
2557
2558/* file: freelistcheck.c */
2559
2560/* Check the freelist is good and contains no loops.
2561 Very memory intensive - only do this as a consistency
2562 checker. Heh heh - uses an in memory tdb as the storage
2563 for the "seen" record list. For some reason this strikes
2564 me as extremely clever as I don't have to write another tree
2565 data structure implementation :-).
2566 */
2567
2568static int seen_insert(struct tdb_context *mem_tdb, tdb_off_t rec_ptr)
2569{
2570 TDB_DATA key, data;
2571
2572 memset(&data, '\0', sizeof(data));
2573 key.dptr = (unsigned char *)&rec_ptr;
2574 key.dsize = sizeof(rec_ptr);
2575 return tdb_store(mem_tdb, key, data, TDB_INSERT);
2576}
2577
2578int tdb_validate_freelist(struct tdb_context *tdb, int *pnum_entries)
2579{
2580 struct tdb_context *mem_tdb = NULL;
2581 struct list_struct rec;
2582 tdb_off_t rec_ptr, last_ptr;
2583 int ret = -1;
2584
2585 *pnum_entries = 0;
2586
2587 mem_tdb = tdb_open("flval", tdb->header.hash_size,
2588 TDB_INTERNAL, O_RDWR, 0600);
2589 if (!mem_tdb) {
2590 return -1;
2591 }
2592
2593 if (tdb_lock(tdb, -1, F_WRLCK) == -1) {
2594 tdb_close(mem_tdb);
2595 return 0;
2596 }
2597
2598 last_ptr = FREELIST_TOP;
2599
2600 /* Store the FREELIST_TOP record. */
2601 if (seen_insert(mem_tdb, last_ptr) == -1) {
2602 ret = TDB_ERRCODE(TDB_ERR_CORRUPT, -1);
2603 goto fail;
2604 }
2605
2606 /* read in the freelist top */
2607 if (tdb_ofs_read(tdb, FREELIST_TOP, &rec_ptr) == -1) {
2608 goto fail;
2609 }
2610
2611 while (rec_ptr) {
2612
2613 /* If we can't store this record (we've seen it
2614 before) then the free list has a loop and must
2615 be corrupt. */
2616
2617 if (seen_insert(mem_tdb, rec_ptr)) {
2618 ret = TDB_ERRCODE(TDB_ERR_CORRUPT, -1);
2619 goto fail;
2620 }
2621
Theodore Ts'o3eed36b2008-02-17 06:59:21 -05002622 if (tdb_rec_free_read(tdb, rec_ptr, &rec) == -1) {
Theodore Ts'o106ad962007-04-04 21:26:37 -04002623 goto fail;
2624 }
2625
2626 /* move to the next record */
2627 last_ptr = rec_ptr;
2628 rec_ptr = rec.next;
2629 *pnum_entries += 1;
2630 }
2631
2632 ret = 0;
2633
2634 fail:
2635
2636 tdb_close(mem_tdb);
2637 tdb_unlock(tdb, -1, F_WRLCK);
2638 return ret;
2639}
2640
2641/* file: traverse.c */
2642
2643/* Uses traverse lock: 0 = finish, -1 = error, other = record offset */
2644static int tdb_next_lock(struct tdb_context *tdb, struct tdb_traverse_lock *tlock,
2645 struct list_struct *rec)
2646{
2647 int want_next = (tlock->off != 0);
2648
2649 /* Lock each chain from the start one. */
2650 for (; tlock->hash < tdb->header.hash_size; tlock->hash++) {
2651 if (!tlock->off && tlock->hash != 0) {
2652 /* this is an optimisation for the common case where
2653 the hash chain is empty, which is particularly
2654 common for the use of tdb with ldb, where large
2655 hashes are used. In that case we spend most of our
2656 time in tdb_brlock(), locking empty hash chains.
Theodore Ts'oefc6f622008-08-27 23:07:54 -04002657
Theodore Ts'o106ad962007-04-04 21:26:37 -04002658 To avoid this, we do an unlocked pre-check to see
2659 if the hash chain is empty before starting to look
2660 inside it. If it is empty then we can avoid that
2661 hash chain. If it isn't empty then we can't believe
2662 the value we get back, as we read it without a
2663 lock, so instead we get the lock and re-fetch the
2664 value below.
Theodore Ts'oefc6f622008-08-27 23:07:54 -04002665
Theodore Ts'o106ad962007-04-04 21:26:37 -04002666 Notice that not doing this optimisation on the
2667 first hash chain is critical. We must guarantee
2668 that we have done at least one fcntl lock at the
2669 start of a search to guarantee that memory is
2670 coherent on SMP systems. If records are added by
2671 others during the search then thats OK, and we
2672 could possibly miss those with this trick, but we
2673 could miss them anyway without this trick, so the
2674 semantics don't change.
Theodore Ts'oefc6f622008-08-27 23:07:54 -04002675
Theodore Ts'o106ad962007-04-04 21:26:37 -04002676 With a non-indexed ldb search this trick gains us a
2677 factor of around 80 in speed on a linux 2.6.x
2678 system (testing using ldbtest).
2679 */
2680 tdb->methods->next_hash_chain(tdb, &tlock->hash);
2681 if (tlock->hash == tdb->header.hash_size) {
2682 continue;
2683 }
2684 }
2685
2686 if (tdb_lock(tdb, tlock->hash, tlock->lock_rw) == -1)
2687 return -1;
2688
2689 /* No previous record? Start at top of chain. */
2690 if (!tlock->off) {
2691 if (tdb_ofs_read(tdb, TDB_HASH_TOP(tlock->hash),
2692 &tlock->off) == -1)
2693 goto fail;
2694 } else {
2695 /* Otherwise unlock the previous record. */
2696 if (tdb_unlock_record(tdb, tlock->off) != 0)
2697 goto fail;
2698 }
2699
2700 if (want_next) {
2701 /* We have offset of old record: grab next */
2702 if (tdb_rec_read(tdb, tlock->off, rec) == -1)
2703 goto fail;
2704 tlock->off = rec->next;
2705 }
2706
2707 /* Iterate through chain */
2708 while( tlock->off) {
2709 tdb_off_t current;
2710 if (tdb_rec_read(tdb, tlock->off, rec) == -1)
2711 goto fail;
2712
2713 /* Detect infinite loops. From "Shlomi Yaakobovich" <Shlomi@exanet.com>. */
2714 if (tlock->off == rec->next) {
2715 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_next_lock: loop detected.\n"));
2716 goto fail;
2717 }
2718
2719 if (!TDB_DEAD(rec)) {
2720 /* Woohoo: we found one! */
2721 if (tdb_lock_record(tdb, tlock->off) != 0)
2722 goto fail;
2723 return tlock->off;
2724 }
2725
2726 /* Try to clean dead ones from old traverses */
2727 current = tlock->off;
2728 tlock->off = rec->next;
Theodore Ts'oefc6f622008-08-27 23:07:54 -04002729 if (!(tdb->read_only || tdb->traverse_read) &&
Theodore Ts'o106ad962007-04-04 21:26:37 -04002730 tdb_do_delete(tdb, current, rec) != 0)
2731 goto fail;
2732 }
2733 tdb_unlock(tdb, tlock->hash, tlock->lock_rw);
2734 want_next = 0;
2735 }
2736 /* We finished iteration without finding anything */
2737 return TDB_ERRCODE(TDB_SUCCESS, 0);
2738
2739 fail:
2740 tlock->off = 0;
2741 if (tdb_unlock(tdb, tlock->hash, tlock->lock_rw) != 0)
2742 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_next_lock: On error unlock failed!\n"));
2743 return -1;
2744}
2745
2746/* traverse the entire database - calling fn(tdb, key, data) on each element.
2747 return -1 on error or the record count traversed
2748 if fn is NULL then it is not called
2749 a non-zero return value from fn() indicates that the traversal should stop
2750 */
Theodore Ts'oefc6f622008-08-27 23:07:54 -04002751static int tdb_traverse_internal(struct tdb_context *tdb,
Theodore Ts'o106ad962007-04-04 21:26:37 -04002752 tdb_traverse_func fn, void *private_data,
2753 struct tdb_traverse_lock *tl)
2754{
2755 TDB_DATA key, dbuf;
2756 struct list_struct rec;
2757 int ret, count = 0;
2758
2759 /* This was in the initializaton, above, but the IRIX compiler
2760 * did not like it. crh
2761 */
2762 tl->next = tdb->travlocks.next;
2763
2764 /* fcntl locks don't stack: beware traverse inside traverse */
2765 tdb->travlocks.next = tl;
2766
2767 /* tdb_next_lock places locks on the record returned, and its chain */
2768 while ((ret = tdb_next_lock(tdb, tl, &rec)) > 0) {
2769 count++;
2770 /* now read the full record */
Theodore Ts'oefc6f622008-08-27 23:07:54 -04002771 key.dptr = tdb_alloc_read(tdb, tl->off + sizeof(rec),
Theodore Ts'o106ad962007-04-04 21:26:37 -04002772 rec.key_len + rec.data_len);
2773 if (!key.dptr) {
2774 ret = -1;
2775 if (tdb_unlock(tdb, tl->hash, tl->lock_rw) != 0)
2776 goto out;
2777 if (tdb_unlock_record(tdb, tl->off) != 0)
2778 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_traverse: key.dptr == NULL and unlock_record failed!\n"));
2779 goto out;
2780 }
2781 key.dsize = rec.key_len;
2782 dbuf.dptr = key.dptr + rec.key_len;
2783 dbuf.dsize = rec.data_len;
2784
2785 /* Drop chain lock, call out */
2786 if (tdb_unlock(tdb, tl->hash, tl->lock_rw) != 0) {
2787 ret = -1;
2788 SAFE_FREE(key.dptr);
2789 goto out;
2790 }
2791 if (fn && fn(tdb, key, dbuf, private_data)) {
2792 /* They want us to terminate traversal */
2793 ret = count;
2794 if (tdb_unlock_record(tdb, tl->off) != 0) {
2795 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_traverse: unlock_record failed!\n"));;
2796 ret = -1;
2797 }
2798 SAFE_FREE(key.dptr);
2799 goto out;
2800 }
2801 SAFE_FREE(key.dptr);
2802 }
2803out:
2804 tdb->travlocks.next = tl->next;
2805 if (ret < 0)
2806 return -1;
2807 else
2808 return count;
2809}
2810
2811
2812/*
2813 a write style traverse - temporarily marks the db read only
2814*/
Theodore Ts'oefc6f622008-08-27 23:07:54 -04002815int tdb_traverse_read(struct tdb_context *tdb,
Theodore Ts'o106ad962007-04-04 21:26:37 -04002816 tdb_traverse_func fn, void *private_data)
2817{
2818 struct tdb_traverse_lock tl = { NULL, 0, 0, F_RDLCK };
2819 int ret;
Theodore Ts'o3eed36b2008-02-17 06:59:21 -05002820
Theodore Ts'o106ad962007-04-04 21:26:37 -04002821 /* we need to get a read lock on the transaction lock here to
2822 cope with the lock ordering semantics of solaris10 */
Theodore Ts'o3eed36b2008-02-17 06:59:21 -05002823 if (tdb_transaction_lock(tdb, F_RDLCK)) {
Theodore Ts'o106ad962007-04-04 21:26:37 -04002824 return -1;
2825 }
2826
2827 tdb->traverse_read++;
2828 ret = tdb_traverse_internal(tdb, fn, private_data, &tl);
2829 tdb->traverse_read--;
2830
Theodore Ts'o3eed36b2008-02-17 06:59:21 -05002831 tdb_transaction_unlock(tdb);
Theodore Ts'o106ad962007-04-04 21:26:37 -04002832
2833 return ret;
2834}
2835
2836/*
2837 a write style traverse - needs to get the transaction lock to
2838 prevent deadlocks
2839*/
Theodore Ts'oefc6f622008-08-27 23:07:54 -04002840int tdb_traverse(struct tdb_context *tdb,
Theodore Ts'o106ad962007-04-04 21:26:37 -04002841 tdb_traverse_func fn, void *private_data)
2842{
2843 struct tdb_traverse_lock tl = { NULL, 0, 0, F_WRLCK };
2844 int ret;
2845
2846 if (tdb->read_only || tdb->traverse_read) {
2847 return tdb_traverse_read(tdb, fn, private_data);
2848 }
Theodore Ts'oefc6f622008-08-27 23:07:54 -04002849
Theodore Ts'o3eed36b2008-02-17 06:59:21 -05002850 if (tdb_transaction_lock(tdb, F_WRLCK)) {
Theodore Ts'o106ad962007-04-04 21:26:37 -04002851 return -1;
2852 }
2853
2854 ret = tdb_traverse_internal(tdb, fn, private_data, &tl);
2855
Theodore Ts'o3eed36b2008-02-17 06:59:21 -05002856 tdb_transaction_unlock(tdb);
Theodore Ts'o106ad962007-04-04 21:26:37 -04002857
2858 return ret;
2859}
2860
2861
2862/* find the first entry in the database and return its key */
2863TDB_DATA tdb_firstkey(struct tdb_context *tdb)
2864{
2865 TDB_DATA key;
2866 struct list_struct rec;
2867
2868 /* release any old lock */
2869 if (tdb_unlock_record(tdb, tdb->travlocks.off) != 0)
2870 return tdb_null;
2871 tdb->travlocks.off = tdb->travlocks.hash = 0;
2872 tdb->travlocks.lock_rw = F_RDLCK;
2873
Theodore Ts'o3eed36b2008-02-17 06:59:21 -05002874 /* Grab first record: locks chain and returned record. */
Theodore Ts'o106ad962007-04-04 21:26:37 -04002875 if (tdb_next_lock(tdb, &tdb->travlocks, &rec) <= 0)
2876 return tdb_null;
2877 /* now read the key */
2878 key.dsize = rec.key_len;
2879 key.dptr =tdb_alloc_read(tdb,tdb->travlocks.off+sizeof(rec),key.dsize);
Theodore Ts'o3eed36b2008-02-17 06:59:21 -05002880
2881 /* Unlock the hash chain of the record we just read. */
2882 if (tdb_unlock(tdb, tdb->travlocks.hash, tdb->travlocks.lock_rw) != 0)
Theodore Ts'o106ad962007-04-04 21:26:37 -04002883 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_firstkey: error occurred while tdb_unlocking!\n"));
2884 return key;
2885}
2886
2887/* find the next entry in the database, returning its key */
2888TDB_DATA tdb_nextkey(struct tdb_context *tdb, TDB_DATA oldkey)
2889{
2890 u32 oldhash;
2891 TDB_DATA key = tdb_null;
2892 struct list_struct rec;
2893 unsigned char *k = NULL;
2894
2895 /* Is locked key the old key? If so, traverse will be reliable. */
2896 if (tdb->travlocks.off) {
Theodore Ts'o3eed36b2008-02-17 06:59:21 -05002897 if (tdb_lock(tdb,tdb->travlocks.hash,tdb->travlocks.lock_rw))
Theodore Ts'o106ad962007-04-04 21:26:37 -04002898 return tdb_null;
2899 if (tdb_rec_read(tdb, tdb->travlocks.off, &rec) == -1
2900 || !(k = tdb_alloc_read(tdb,tdb->travlocks.off+sizeof(rec),
2901 rec.key_len))
2902 || memcmp(k, oldkey.dptr, oldkey.dsize) != 0) {
2903 /* No, it wasn't: unlock it and start from scratch */
2904 if (tdb_unlock_record(tdb, tdb->travlocks.off) != 0) {
2905 SAFE_FREE(k);
2906 return tdb_null;
2907 }
Theodore Ts'o3eed36b2008-02-17 06:59:21 -05002908 if (tdb_unlock(tdb, tdb->travlocks.hash, tdb->travlocks.lock_rw) != 0) {
Theodore Ts'o106ad962007-04-04 21:26:37 -04002909 SAFE_FREE(k);
2910 return tdb_null;
2911 }
2912 tdb->travlocks.off = 0;
2913 }
2914
2915 SAFE_FREE(k);
2916 }
2917
2918 if (!tdb->travlocks.off) {
2919 /* No previous element: do normal find, and lock record */
Theodore Ts'o3eed36b2008-02-17 06:59:21 -05002920 tdb->travlocks.off = tdb_find_lock_hash(tdb, oldkey, tdb->hash_fn(&oldkey), tdb->travlocks.lock_rw, &rec);
Theodore Ts'o106ad962007-04-04 21:26:37 -04002921 if (!tdb->travlocks.off)
2922 return tdb_null;
2923 tdb->travlocks.hash = BUCKET(rec.full_hash);
2924 if (tdb_lock_record(tdb, tdb->travlocks.off) != 0) {
2925 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_nextkey: lock_record failed (%s)!\n", strerror(errno)));
2926 return tdb_null;
2927 }
2928 }
2929 oldhash = tdb->travlocks.hash;
2930
2931 /* Grab next record: locks chain and returned record,
2932 unlocks old record */
2933 if (tdb_next_lock(tdb, &tdb->travlocks, &rec) > 0) {
2934 key.dsize = rec.key_len;
2935 key.dptr = tdb_alloc_read(tdb, tdb->travlocks.off+sizeof(rec),
2936 key.dsize);
2937 /* Unlock the chain of this new record */
Theodore Ts'o3eed36b2008-02-17 06:59:21 -05002938 if (tdb_unlock(tdb, tdb->travlocks.hash, tdb->travlocks.lock_rw) != 0)
Theodore Ts'o106ad962007-04-04 21:26:37 -04002939 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_nextkey: WARNING tdb_unlock failed!\n"));
2940 }
2941 /* Unlock the chain of old record */
Theodore Ts'o3eed36b2008-02-17 06:59:21 -05002942 if (tdb_unlock(tdb, BUCKET(oldhash), tdb->travlocks.lock_rw) != 0)
Theodore Ts'o106ad962007-04-04 21:26:37 -04002943 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_nextkey: WARNING tdb_unlock failed!\n"));
2944 return key;
2945}
2946
2947/* file: dump.c */
2948
Theodore Ts'o3eed36b2008-02-17 06:59:21 -05002949static tdb_off_t tdb_dump_record(struct tdb_context *tdb, int hash,
2950 tdb_off_t offset)
Theodore Ts'o106ad962007-04-04 21:26:37 -04002951{
2952 struct list_struct rec;
2953 tdb_off_t tailer_ofs, tailer;
2954
Theodore Ts'oefc6f622008-08-27 23:07:54 -04002955 if (tdb->methods->tdb_read(tdb, offset, (char *)&rec,
Theodore Ts'o106ad962007-04-04 21:26:37 -04002956 sizeof(rec), DOCONV()) == -1) {
2957 printf("ERROR: failed to read record at %u\n", offset);
2958 return 0;
2959 }
2960
Theodore Ts'o3eed36b2008-02-17 06:59:21 -05002961 printf(" rec: hash=%d offset=0x%08x next=0x%08x rec_len=%d "
2962 "key_len=%d data_len=%d full_hash=0x%x magic=0x%x\n",
2963 hash, offset, rec.next, rec.rec_len, rec.key_len, rec.data_len,
2964 rec.full_hash, rec.magic);
Theodore Ts'o106ad962007-04-04 21:26:37 -04002965
2966 tailer_ofs = offset + sizeof(rec) + rec.rec_len - sizeof(tdb_off_t);
2967
2968 if (tdb_ofs_read(tdb, tailer_ofs, &tailer) == -1) {
2969 printf("ERROR: failed to read tailer at %u\n", tailer_ofs);
2970 return rec.next;
2971 }
2972
2973 if (tailer != rec.rec_len + sizeof(rec)) {
2974 printf("ERROR: tailer does not match record! tailer=%u totalsize=%u\n",
2975 (unsigned int)tailer, (unsigned int)(rec.rec_len + sizeof(rec)));
2976 }
2977 return rec.next;
2978}
2979
2980static int tdb_dump_chain(struct tdb_context *tdb, int i)
2981{
2982 tdb_off_t rec_ptr, top;
2983
2984 top = TDB_HASH_TOP(i);
2985
2986 if (tdb_lock(tdb, i, F_WRLCK) != 0)
2987 return -1;
2988
2989 if (tdb_ofs_read(tdb, top, &rec_ptr) == -1)
2990 return tdb_unlock(tdb, i, F_WRLCK);
2991
2992 if (rec_ptr)
2993 printf("hash=%d\n", i);
2994
2995 while (rec_ptr) {
Theodore Ts'o3eed36b2008-02-17 06:59:21 -05002996 rec_ptr = tdb_dump_record(tdb, i, rec_ptr);
Theodore Ts'o106ad962007-04-04 21:26:37 -04002997 }
2998
2999 return tdb_unlock(tdb, i, F_WRLCK);
3000}
3001
3002void tdb_dump_all(struct tdb_context *tdb)
3003{
3004 int i;
3005 for (i=0;i<tdb->header.hash_size;i++) {
3006 tdb_dump_chain(tdb, i);
3007 }
3008 printf("freelist:\n");
3009 tdb_dump_chain(tdb, -1);
3010}
3011
3012int tdb_printfreelist(struct tdb_context *tdb)
3013{
3014 int ret;
3015 long total_free = 0;
3016 tdb_off_t offset, rec_ptr;
3017 struct list_struct rec;
3018
3019 if ((ret = tdb_lock(tdb, -1, F_WRLCK)) != 0)
3020 return ret;
3021
3022 offset = FREELIST_TOP;
3023
3024 /* read in the freelist top */
3025 if (tdb_ofs_read(tdb, offset, &rec_ptr) == -1) {
3026 tdb_unlock(tdb, -1, F_WRLCK);
3027 return 0;
3028 }
3029
3030 printf("freelist top=[0x%08x]\n", rec_ptr );
3031 while (rec_ptr) {
Theodore Ts'oefc6f622008-08-27 23:07:54 -04003032 if (tdb->methods->tdb_read(tdb, rec_ptr, (char *)&rec,
Theodore Ts'o106ad962007-04-04 21:26:37 -04003033 sizeof(rec), DOCONV()) == -1) {
3034 tdb_unlock(tdb, -1, F_WRLCK);
3035 return -1;
3036 }
3037
3038 if (rec.magic != TDB_FREE_MAGIC) {
3039 printf("bad magic 0x%08x in free list\n", rec.magic);
3040 tdb_unlock(tdb, -1, F_WRLCK);
3041 return -1;
3042 }
3043
Theodore Ts'oefc6f622008-08-27 23:07:54 -04003044 printf("entry offset=[0x%08x], rec.rec_len = [0x%08x (%d)] (end = 0x%08x)\n",
Theodore Ts'o106ad962007-04-04 21:26:37 -04003045 rec_ptr, rec.rec_len, rec.rec_len, rec_ptr + rec.rec_len);
3046 total_free += rec.rec_len;
3047
3048 /* move to the next record */
3049 rec_ptr = rec.next;
3050 }
Theodore Ts'oefc6f622008-08-27 23:07:54 -04003051 printf("total rec_len = [0x%08x (%d)]\n", (int)total_free,
Theodore Ts'o106ad962007-04-04 21:26:37 -04003052 (int)total_free);
3053
3054 return tdb_unlock(tdb, -1, F_WRLCK);
3055}
3056
3057/* file: tdb.c */
3058
Theodore Ts'o106ad962007-04-04 21:26:37 -04003059/*
Theodore Ts'o3eed36b2008-02-17 06:59:21 -05003060 non-blocking increment of the tdb sequence number if the tdb has been opened using
Theodore Ts'o106ad962007-04-04 21:26:37 -04003061 the TDB_SEQNUM flag
3062*/
Theodore Ts'o3eed36b2008-02-17 06:59:21 -05003063void tdb_increment_seqnum_nonblock(struct tdb_context *tdb)
Theodore Ts'o106ad962007-04-04 21:26:37 -04003064{
3065 tdb_off_t seqnum=0;
Theodore Ts'oefc6f622008-08-27 23:07:54 -04003066
Theodore Ts'o106ad962007-04-04 21:26:37 -04003067 if (!(tdb->flags & TDB_SEQNUM)) {
3068 return;
3069 }
3070
Theodore Ts'o106ad962007-04-04 21:26:37 -04003071 /* we ignore errors from this, as we have no sane way of
3072 dealing with them.
3073 */
3074 tdb_ofs_read(tdb, TDB_SEQNUM_OFS, &seqnum);
3075 seqnum++;
3076 tdb_ofs_write(tdb, TDB_SEQNUM_OFS, &seqnum);
Theodore Ts'o3eed36b2008-02-17 06:59:21 -05003077}
3078
3079/*
3080 increment the tdb sequence number if the tdb has been opened using
3081 the TDB_SEQNUM flag
3082*/
3083static void tdb_increment_seqnum(struct tdb_context *tdb)
3084{
3085 if (!(tdb->flags & TDB_SEQNUM)) {
3086 return;
3087 }
3088
3089 if (tdb_brlock(tdb, TDB_SEQNUM_OFS, F_WRLCK, F_SETLKW, 1, 1) != 0) {
3090 return;
3091 }
3092
3093 tdb_increment_seqnum_nonblock(tdb);
Theodore Ts'o106ad962007-04-04 21:26:37 -04003094
3095 tdb_brlock(tdb, TDB_SEQNUM_OFS, F_UNLCK, F_SETLKW, 1, 1);
3096}
3097
3098static int tdb_key_compare(TDB_DATA key, TDB_DATA data, void *private_data)
3099{
3100 return memcmp(data.dptr, key.dptr, data.dsize);
3101}
3102
3103/* Returns 0 on fail. On success, return offset of record, and fills
3104 in rec */
3105static tdb_off_t tdb_find(struct tdb_context *tdb, TDB_DATA key, u32 hash,
3106 struct list_struct *r)
3107{
3108 tdb_off_t rec_ptr;
Theodore Ts'oefc6f622008-08-27 23:07:54 -04003109
Theodore Ts'o106ad962007-04-04 21:26:37 -04003110 /* read in the hash top */
3111 if (tdb_ofs_read(tdb, TDB_HASH_TOP(hash), &rec_ptr) == -1)
3112 return 0;
3113
3114 /* keep looking until we find the right record */
3115 while (rec_ptr) {
3116 if (tdb_rec_read(tdb, rec_ptr, r) == -1)
3117 return 0;
3118
3119 if (!TDB_DEAD(r) && hash==r->full_hash
3120 && key.dsize==r->key_len
3121 && tdb_parse_data(tdb, key, rec_ptr + sizeof(*r),
3122 r->key_len, tdb_key_compare,
3123 NULL) == 0) {
3124 return rec_ptr;
3125 }
3126 rec_ptr = r->next;
3127 }
3128 return TDB_ERRCODE(TDB_ERR_NOEXIST, 0);
3129}
3130
3131/* As tdb_find, but if you succeed, keep the lock */
3132tdb_off_t tdb_find_lock_hash(struct tdb_context *tdb, TDB_DATA key, u32 hash, int locktype,
3133 struct list_struct *rec)
3134{
3135 u32 rec_ptr;
3136
3137 if (tdb_lock(tdb, BUCKET(hash), locktype) == -1)
3138 return 0;
3139 if (!(rec_ptr = tdb_find(tdb, key, hash, rec)))
3140 tdb_unlock(tdb, BUCKET(hash), locktype);
3141 return rec_ptr;
3142}
3143
3144
3145/* update an entry in place - this only works if the new data size
3146 is <= the old data size and the key exists.
3147 on failure return -1.
3148*/
3149static int tdb_update_hash(struct tdb_context *tdb, TDB_DATA key, u32 hash, TDB_DATA dbuf)
3150{
3151 struct list_struct rec;
3152 tdb_off_t rec_ptr;
3153
3154 /* find entry */
3155 if (!(rec_ptr = tdb_find(tdb, key, hash, &rec)))
3156 return -1;
3157
3158 /* must be long enough key, data and tailer */
3159 if (rec.rec_len < key.dsize + dbuf.dsize + sizeof(tdb_off_t)) {
3160 tdb->ecode = TDB_SUCCESS; /* Not really an error */
3161 return -1;
3162 }
3163
3164 if (tdb->methods->tdb_write(tdb, rec_ptr + sizeof(rec) + rec.key_len,
3165 dbuf.dptr, dbuf.dsize) == -1)
3166 return -1;
3167
3168 if (dbuf.dsize != rec.data_len) {
3169 /* update size */
3170 rec.data_len = dbuf.dsize;
3171 return tdb_rec_write(tdb, rec_ptr, &rec);
3172 }
Theodore Ts'oefc6f622008-08-27 23:07:54 -04003173
Theodore Ts'o106ad962007-04-04 21:26:37 -04003174 return 0;
3175}
3176
3177/* find an entry in the database given a key */
3178/* If an entry doesn't exist tdb_err will be set to
3179 * TDB_ERR_NOEXIST. If a key has no data attached
3180 * then the TDB_DATA will have zero length but
3181 * a non-zero pointer
3182 */
3183TDB_DATA tdb_fetch(struct tdb_context *tdb, TDB_DATA key)
3184{
3185 tdb_off_t rec_ptr;
3186 struct list_struct rec;
3187 TDB_DATA ret;
3188 u32 hash;
3189
3190 /* find which hash bucket it is in */
3191 hash = tdb->hash_fn(&key);
3192 if (!(rec_ptr = tdb_find_lock_hash(tdb,key,hash,F_RDLCK,&rec)))
3193 return tdb_null;
3194
3195 ret.dptr = tdb_alloc_read(tdb, rec_ptr + sizeof(rec) + rec.key_len,
3196 rec.data_len);
3197 ret.dsize = rec.data_len;
3198 tdb_unlock(tdb, BUCKET(rec.full_hash), F_RDLCK);
3199 return ret;
3200}
3201
3202/*
3203 * Find an entry in the database and hand the record's data to a parsing
3204 * function. The parsing function is executed under the chain read lock, so it
3205 * should be fast and should not block on other syscalls.
3206 *
3207 * DONT CALL OTHER TDB CALLS FROM THE PARSER, THIS MIGHT LEAD TO SEGFAULTS.
3208 *
3209 * For mmapped tdb's that do not have a transaction open it points the parsing
3210 * function directly at the mmap area, it avoids the malloc/memcpy in this
3211 * case. If a transaction is open or no mmap is available, it has to do
3212 * malloc/read/parse/free.
3213 *
3214 * This is interesting for all readers of potentially large data structures in
3215 * the tdb records, ldb indexes being one example.
3216 */
3217
3218int tdb_parse_record(struct tdb_context *tdb, TDB_DATA key,
3219 int (*parser)(TDB_DATA key, TDB_DATA data,
3220 void *private_data),
3221 void *private_data)
3222{
3223 tdb_off_t rec_ptr;
3224 struct list_struct rec;
3225 int ret;
3226 u32 hash;
3227
3228 /* find which hash bucket it is in */
3229 hash = tdb->hash_fn(&key);
3230
3231 if (!(rec_ptr = tdb_find_lock_hash(tdb,key,hash,F_RDLCK,&rec))) {
3232 return TDB_ERRCODE(TDB_ERR_NOEXIST, 0);
3233 }
3234
3235 ret = tdb_parse_data(tdb, key, rec_ptr + sizeof(rec) + rec.key_len,
3236 rec.data_len, parser, private_data);
3237
3238 tdb_unlock(tdb, BUCKET(rec.full_hash), F_RDLCK);
3239
3240 return ret;
3241}
3242
Theodore Ts'oefc6f622008-08-27 23:07:54 -04003243/* check if an entry in the database exists
Theodore Ts'o106ad962007-04-04 21:26:37 -04003244
3245 note that 1 is returned if the key is found and 0 is returned if not found
3246 this doesn't match the conventions in the rest of this module, but is
3247 compatible with gdbm
3248*/
3249static int tdb_exists_hash(struct tdb_context *tdb, TDB_DATA key, u32 hash)
3250{
3251 struct list_struct rec;
Theodore Ts'oefc6f622008-08-27 23:07:54 -04003252
Theodore Ts'o106ad962007-04-04 21:26:37 -04003253 if (tdb_find_lock_hash(tdb, key, hash, F_RDLCK, &rec) == 0)
3254 return 0;
3255 tdb_unlock(tdb, BUCKET(rec.full_hash), F_RDLCK);
3256 return 1;
3257}
3258
3259int tdb_exists(struct tdb_context *tdb, TDB_DATA key)
3260{
3261 u32 hash = tdb->hash_fn(&key);
3262 return tdb_exists_hash(tdb, key, hash);
3263}
3264
3265/* actually delete an entry in the database given the offset */
3266int tdb_do_delete(struct tdb_context *tdb, tdb_off_t rec_ptr, struct list_struct*rec)
3267{
3268 tdb_off_t last_ptr, i;
3269 struct list_struct lastrec;
3270
3271 if (tdb->read_only || tdb->traverse_read) return -1;
3272
3273 if (tdb_write_lock_record(tdb, rec_ptr) == -1) {
3274 /* Someone traversing here: mark it as dead */
3275 rec->magic = TDB_DEAD_MAGIC;
3276 return tdb_rec_write(tdb, rec_ptr, rec);
3277 }
3278 if (tdb_write_unlock_record(tdb, rec_ptr) != 0)
3279 return -1;
3280
3281 /* find previous record in hash chain */
3282 if (tdb_ofs_read(tdb, TDB_HASH_TOP(rec->full_hash), &i) == -1)
3283 return -1;
3284 for (last_ptr = 0; i != rec_ptr; last_ptr = i, i = lastrec.next)
3285 if (tdb_rec_read(tdb, i, &lastrec) == -1)
3286 return -1;
3287
3288 /* unlink it: next ptr is at start of record. */
3289 if (last_ptr == 0)
3290 last_ptr = TDB_HASH_TOP(rec->full_hash);
3291 if (tdb_ofs_write(tdb, last_ptr, &rec->next) == -1)
3292 return -1;
3293
3294 /* recover the space */
3295 if (tdb_free(tdb, rec_ptr, rec) == -1)
3296 return -1;
3297 return 0;
3298}
3299
3300static int tdb_count_dead(struct tdb_context *tdb, u32 hash)
3301{
3302 int res = 0;
3303 tdb_off_t rec_ptr;
3304 struct list_struct rec;
Theodore Ts'oefc6f622008-08-27 23:07:54 -04003305
Theodore Ts'o106ad962007-04-04 21:26:37 -04003306 /* read in the hash top */
3307 if (tdb_ofs_read(tdb, TDB_HASH_TOP(hash), &rec_ptr) == -1)
3308 return 0;
3309
3310 while (rec_ptr) {
3311 if (tdb_rec_read(tdb, rec_ptr, &rec) == -1)
3312 return 0;
3313
3314 if (rec.magic == TDB_DEAD_MAGIC) {
3315 res += 1;
3316 }
3317 rec_ptr = rec.next;
3318 }
3319 return res;
3320}
3321
3322/*
3323 * Purge all DEAD records from a hash chain
3324 */
3325static int tdb_purge_dead(struct tdb_context *tdb, u32 hash)
3326{
3327 int res = -1;
3328 struct list_struct rec;
3329 tdb_off_t rec_ptr;
3330
3331 if (tdb_lock(tdb, -1, F_WRLCK) == -1) {
3332 return -1;
3333 }
Theodore Ts'oefc6f622008-08-27 23:07:54 -04003334
Theodore Ts'o106ad962007-04-04 21:26:37 -04003335 /* read in the hash top */
3336 if (tdb_ofs_read(tdb, TDB_HASH_TOP(hash), &rec_ptr) == -1)
3337 goto fail;
3338
3339 while (rec_ptr) {
3340 tdb_off_t next;
3341
3342 if (tdb_rec_read(tdb, rec_ptr, &rec) == -1) {
3343 goto fail;
3344 }
3345
3346 next = rec.next;
3347
3348 if (rec.magic == TDB_DEAD_MAGIC
3349 && tdb_do_delete(tdb, rec_ptr, &rec) == -1) {
3350 goto fail;
3351 }
3352 rec_ptr = next;
3353 }
3354 res = 0;
3355 fail:
3356 tdb_unlock(tdb, -1, F_WRLCK);
3357 return res;
3358}
3359
3360/* delete an entry in the database given a key */
3361static int tdb_delete_hash(struct tdb_context *tdb, TDB_DATA key, u32 hash)
3362{
3363 tdb_off_t rec_ptr;
3364 struct list_struct rec;
3365 int ret;
3366
3367 if (tdb->max_dead_records != 0) {
3368
3369 /*
3370 * Allow for some dead records per hash chain, mainly for
3371 * tdb's with a very high create/delete rate like locking.tdb.
3372 */
3373
3374 if (tdb_lock(tdb, BUCKET(hash), F_WRLCK) == -1)
3375 return -1;
3376
3377 if (tdb_count_dead(tdb, hash) >= tdb->max_dead_records) {
3378 /*
3379 * Don't let the per-chain freelist grow too large,
3380 * delete all existing dead records
3381 */
3382 tdb_purge_dead(tdb, hash);
3383 }
3384
3385 if (!(rec_ptr = tdb_find(tdb, key, hash, &rec))) {
3386 tdb_unlock(tdb, BUCKET(hash), F_WRLCK);
3387 return -1;
3388 }
3389
3390 /*
3391 * Just mark the record as dead.
3392 */
3393 rec.magic = TDB_DEAD_MAGIC;
3394 ret = tdb_rec_write(tdb, rec_ptr, &rec);
3395 }
3396 else {
3397 if (!(rec_ptr = tdb_find_lock_hash(tdb, key, hash, F_WRLCK,
3398 &rec)))
3399 return -1;
3400
3401 ret = tdb_do_delete(tdb, rec_ptr, &rec);
3402 }
3403
3404 if (ret == 0) {
3405 tdb_increment_seqnum(tdb);
3406 }
3407
3408 if (tdb_unlock(tdb, BUCKET(rec.full_hash), F_WRLCK) != 0)
3409 TDB_LOG((tdb, TDB_DEBUG_WARNING, "tdb_delete: WARNING tdb_unlock failed!\n"));
3410 return ret;
3411}
3412
3413int tdb_delete(struct tdb_context *tdb, TDB_DATA key)
3414{
3415 u32 hash = tdb->hash_fn(&key);
3416 return tdb_delete_hash(tdb, key, hash);
3417}
3418
3419/*
3420 * See if we have a dead record around with enough space
3421 */
3422static tdb_off_t tdb_find_dead(struct tdb_context *tdb, u32 hash,
3423 struct list_struct *r, tdb_len_t length)
3424{
3425 tdb_off_t rec_ptr;
Theodore Ts'oefc6f622008-08-27 23:07:54 -04003426
Theodore Ts'o106ad962007-04-04 21:26:37 -04003427 /* read in the hash top */
3428 if (tdb_ofs_read(tdb, TDB_HASH_TOP(hash), &rec_ptr) == -1)
3429 return 0;
3430
3431 /* keep looking until we find the right record */
3432 while (rec_ptr) {
3433 if (tdb_rec_read(tdb, rec_ptr, r) == -1)
3434 return 0;
3435
3436 if (TDB_DEAD(r) && r->rec_len >= length) {
3437 /*
3438 * First fit for simple coding, TODO: change to best
3439 * fit
3440 */
3441 return rec_ptr;
3442 }
3443 rec_ptr = r->next;
3444 }
3445 return 0;
3446}
3447
3448/* store an element in the database, replacing any existing element
Theodore Ts'oefc6f622008-08-27 23:07:54 -04003449 with the same key
Theodore Ts'o106ad962007-04-04 21:26:37 -04003450
3451 return 0 on success, -1 on failure
3452*/
3453int tdb_store(struct tdb_context *tdb, TDB_DATA key, TDB_DATA dbuf, int flag)
3454{
3455 struct list_struct rec;
3456 u32 hash;
3457 tdb_off_t rec_ptr;
3458 char *p = NULL;
3459 int ret = -1;
3460
3461 if (tdb->read_only || tdb->traverse_read) {
3462 tdb->ecode = TDB_ERR_RDONLY;
3463 return -1;
3464 }
3465
3466 /* find which hash bucket it is in */
3467 hash = tdb->hash_fn(&key);
3468 if (tdb_lock(tdb, BUCKET(hash), F_WRLCK) == -1)
3469 return -1;
3470
3471 /* check for it existing, on insert. */
3472 if (flag == TDB_INSERT) {
3473 if (tdb_exists_hash(tdb, key, hash)) {
3474 tdb->ecode = TDB_ERR_EXISTS;
3475 goto fail;
3476 }
3477 } else {
3478 /* first try in-place update, on modify or replace. */
3479 if (tdb_update_hash(tdb, key, hash, dbuf) == 0) {
3480 goto done;
3481 }
3482 if (tdb->ecode == TDB_ERR_NOEXIST &&
3483 flag == TDB_MODIFY) {
3484 /* if the record doesn't exist and we are in TDB_MODIFY mode then
3485 we should fail the store */
3486 goto fail;
3487 }
3488 }
3489 /* reset the error code potentially set by the tdb_update() */
3490 tdb->ecode = TDB_SUCCESS;
3491
3492 /* delete any existing record - if it doesn't exist we don't
3493 care. Doing this first reduces fragmentation, and avoids
3494 coalescing with `allocated' block before it's updated. */
3495 if (flag != TDB_INSERT)
3496 tdb_delete_hash(tdb, key, hash);
3497
3498 /* Copy key+value *before* allocating free space in case malloc
3499 fails and we are left with a dead spot in the tdb. */
3500
3501 if (!(p = (char *)malloc(key.dsize + dbuf.dsize))) {
3502 tdb->ecode = TDB_ERR_OOM;
3503 goto fail;
3504 }
3505
3506 memcpy(p, key.dptr, key.dsize);
3507 if (dbuf.dsize)
3508 memcpy(p+key.dsize, dbuf.dptr, dbuf.dsize);
3509
3510 if (tdb->max_dead_records != 0) {
3511 /*
3512 * Allow for some dead records per hash chain, look if we can
3513 * find one that can hold the new record. We need enough space
3514 * for key, data and tailer. If we find one, we don't have to
3515 * consult the central freelist.
3516 */
3517 rec_ptr = tdb_find_dead(
3518 tdb, hash, &rec,
3519 key.dsize + dbuf.dsize + sizeof(tdb_off_t));
3520
3521 if (rec_ptr != 0) {
3522 rec.key_len = key.dsize;
3523 rec.data_len = dbuf.dsize;
3524 rec.full_hash = hash;
3525 rec.magic = TDB_MAGIC;
3526 if (tdb_rec_write(tdb, rec_ptr, &rec) == -1
3527 || tdb->methods->tdb_write(
3528 tdb, rec_ptr + sizeof(rec),
3529 p, key.dsize + dbuf.dsize) == -1) {
3530 goto fail;
3531 }
3532 goto done;
3533 }
3534 }
3535
3536 /*
3537 * We have to allocate some space from the freelist, so this means we
3538 * have to lock it. Use the chance to purge all the DEAD records from
3539 * the hash chain under the freelist lock.
3540 */
3541
3542 if (tdb_lock(tdb, -1, F_WRLCK) == -1) {
3543 goto fail;
3544 }
3545
3546 if ((tdb->max_dead_records != 0)
3547 && (tdb_purge_dead(tdb, hash) == -1)) {
3548 tdb_unlock(tdb, -1, F_WRLCK);
3549 goto fail;
3550 }
3551
3552 /* we have to allocate some space */
3553 rec_ptr = tdb_allocate(tdb, key.dsize + dbuf.dsize, &rec);
3554
3555 tdb_unlock(tdb, -1, F_WRLCK);
3556
3557 if (rec_ptr == 0) {
3558 goto fail;
3559 }
3560
3561 /* Read hash top into next ptr */
3562 if (tdb_ofs_read(tdb, TDB_HASH_TOP(hash), &rec.next) == -1)
3563 goto fail;
3564
3565 rec.key_len = key.dsize;
3566 rec.data_len = dbuf.dsize;
3567 rec.full_hash = hash;
3568 rec.magic = TDB_MAGIC;
3569
3570 /* write out and point the top of the hash chain at it */
3571 if (tdb_rec_write(tdb, rec_ptr, &rec) == -1
3572 || tdb->methods->tdb_write(tdb, rec_ptr+sizeof(rec), p, key.dsize+dbuf.dsize)==-1
3573 || tdb_ofs_write(tdb, TDB_HASH_TOP(hash), &rec_ptr) == -1) {
3574 /* Need to tdb_unallocate() here */
3575 goto fail;
3576 }
3577
3578 done:
3579 ret = 0;
3580 fail:
3581 if (ret == 0) {
3582 tdb_increment_seqnum(tdb);
3583 }
3584
Theodore Ts'oefc6f622008-08-27 23:07:54 -04003585 SAFE_FREE(p);
Theodore Ts'o106ad962007-04-04 21:26:37 -04003586 tdb_unlock(tdb, BUCKET(hash), F_WRLCK);
3587 return ret;
3588}
3589
3590
3591/* Append to an entry. Create if not exist. */
3592int tdb_append(struct tdb_context *tdb, TDB_DATA key, TDB_DATA new_dbuf)
3593{
3594 u32 hash;
3595 TDB_DATA dbuf;
3596 int ret = -1;
3597
3598 /* find which hash bucket it is in */
3599 hash = tdb->hash_fn(&key);
3600 if (tdb_lock(tdb, BUCKET(hash), F_WRLCK) == -1)
3601 return -1;
3602
3603 dbuf = tdb_fetch(tdb, key);
3604
3605 if (dbuf.dptr == NULL) {
3606 dbuf.dptr = (unsigned char *)malloc(new_dbuf.dsize);
3607 } else {
Theodore Ts'o3eed36b2008-02-17 06:59:21 -05003608 unsigned char *new_dptr = (unsigned char *)realloc(dbuf.dptr,
Theodore Ts'o106ad962007-04-04 21:26:37 -04003609 dbuf.dsize + new_dbuf.dsize);
Theodore Ts'o3eed36b2008-02-17 06:59:21 -05003610 if (new_dptr == NULL) {
3611 free(dbuf.dptr);
3612 }
3613 dbuf.dptr = new_dptr;
Theodore Ts'o106ad962007-04-04 21:26:37 -04003614 }
3615
3616 if (dbuf.dptr == NULL) {
3617 tdb->ecode = TDB_ERR_OOM;
3618 goto failed;
3619 }
3620
3621 memcpy(dbuf.dptr + dbuf.dsize, new_dbuf.dptr, new_dbuf.dsize);
3622 dbuf.dsize += new_dbuf.dsize;
3623
3624 ret = tdb_store(tdb, key, dbuf, 0);
Theodore Ts'oefc6f622008-08-27 23:07:54 -04003625
Theodore Ts'o106ad962007-04-04 21:26:37 -04003626failed:
3627 tdb_unlock(tdb, BUCKET(hash), F_WRLCK);
3628 SAFE_FREE(dbuf.dptr);
3629 return ret;
3630}
3631
3632
3633/*
3634 return the name of the current tdb file
3635 useful for external logging functions
3636*/
3637const char *tdb_name(struct tdb_context *tdb)
3638{
3639 return tdb->name;
3640}
3641
3642/*
3643 return the underlying file descriptor being used by tdb, or -1
3644 useful for external routines that want to check the device/inode
3645 of the fd
3646*/
3647int tdb_fd(struct tdb_context *tdb)
3648{
3649 return tdb->fd;
3650}
3651
3652/*
3653 return the current logging function
3654 useful for external tdb routines that wish to log tdb errors
3655*/
3656tdb_log_func tdb_log_fn(struct tdb_context *tdb)
3657{
3658 return tdb->log.log_fn;
3659}
3660
3661
3662/*
3663 get the tdb sequence number. Only makes sense if the writers opened
3664 with TDB_SEQNUM set. Note that this sequence number will wrap quite
3665 quickly, so it should only be used for a 'has something changed'
3666 test, not for code that relies on the count of the number of changes
3667 made. If you want a counter then use a tdb record.
3668
3669 The aim of this sequence number is to allow for a very lightweight
3670 test of a possible tdb change.
3671*/
3672int tdb_get_seqnum(struct tdb_context *tdb)
3673{
3674 tdb_off_t seqnum=0;
3675
3676 tdb_ofs_read(tdb, TDB_SEQNUM_OFS, &seqnum);
3677 return seqnum;
3678}
3679
3680int tdb_hash_size(struct tdb_context *tdb)
3681{
3682 return tdb->header.hash_size;
3683}
3684
3685size_t tdb_map_size(struct tdb_context *tdb)
3686{
3687 return tdb->map_size;
3688}
3689
3690int tdb_get_flags(struct tdb_context *tdb)
3691{
3692 return tdb->flags;
3693}
3694
Theodore Ts'o3eed36b2008-02-17 06:59:21 -05003695
3696/*
3697 enable sequence number handling on an open tdb
3698*/
3699void tdb_enable_seqnum(struct tdb_context *tdb)
3700{
3701 tdb->flags |= TDB_SEQNUM;
3702}
3703
Theodore Ts'o106ad962007-04-04 21:26:37 -04003704/* file: open.c */
3705
/* Head of the list of all open contexts, linked through ->next; it is
 * kept to ensure no double-opens (fcntl locks don't nest!).  Static
 * storage starts out NULL, i.e. an empty list. */
static struct tdb_context *tdbs;
3708
3709
Theodore Ts'o4e523bb2011-11-29 11:24:52 -05003710/* This is from a hash algorithm suggested by Rogier Wolff */
Theodore Ts'o106ad962007-04-04 21:26:37 -04003711static unsigned int default_tdb_hash(TDB_DATA *key)
3712{
3713 u32 value; /* Used to compute the hash value. */
3714 u32 i; /* Used to cycle through random values. */
3715
3716 /* Set the initial value from the key size. */
Theodore Ts'o4e523bb2011-11-29 11:24:52 -05003717 for (value = 0, i=0; i < key->dsize; i++)
3718 value = value * 256 + key->dptr[i] + (value >> 24) * 241;
Theodore Ts'o106ad962007-04-04 21:26:37 -04003719
Theodore Ts'o4e523bb2011-11-29 11:24:52 -05003720 return value;
Theodore Ts'o106ad962007-04-04 21:26:37 -04003721}
3722
3723
3724/* initialise a new database with a specified hash size */
3725static int tdb_new_database(struct tdb_context *tdb, int hash_size)
3726{
3727 struct tdb_header *newdb;
3728 int size, ret = -1;
3729
3730 /* We make it up in memory, then write it out if not internal */
3731 size = sizeof(struct tdb_header) + (hash_size+1)*sizeof(tdb_off_t);
3732 if (!(newdb = (struct tdb_header *)calloc(size, 1)))
3733 return TDB_ERRCODE(TDB_ERR_OOM, -1);
3734
3735 /* Fill in the header */
3736 newdb->version = TDB_VERSION;
3737 newdb->hash_size = hash_size;
3738 if (tdb->flags & TDB_INTERNAL) {
3739 tdb->map_size = size;
3740 tdb->map_ptr = (char *)newdb;
3741 memcpy(&tdb->header, newdb, sizeof(tdb->header));
3742 /* Convert the `ondisk' version if asked. */
3743 CONVERT(*newdb);
3744 return 0;
3745 }
3746 if (lseek(tdb->fd, 0, SEEK_SET) == -1)
3747 goto fail;
3748
3749 if (ftruncate(tdb->fd, 0) == -1)
3750 goto fail;
3751
3752 /* This creates an endian-converted header, as if read from disk */
3753 CONVERT(*newdb);
3754 memcpy(&tdb->header, newdb, sizeof(tdb->header));
3755 /* Don't endian-convert the magic food! */
3756 memcpy(newdb->magic_food, TDB_MAGIC_FOOD, strlen(TDB_MAGIC_FOOD)+1);
3757 if (write(tdb->fd, newdb, size) != size) {
3758 ret = -1;
3759 } else {
3760 ret = 0;
3761 }
3762
3763 fail:
3764 SAFE_FREE(newdb);
3765 return ret;
3766}
3767
3768
3769
3770static int tdb_already_open(dev_t device,
3771 ino_t ino)
3772{
3773 struct tdb_context *i;
Theodore Ts'oefc6f622008-08-27 23:07:54 -04003774
Theodore Ts'o106ad962007-04-04 21:26:37 -04003775 for (i = tdbs; i; i = i->next) {
3776 if (i->device == device && i->inode == ino) {
3777 return 1;
3778 }
3779 }
3780
3781 return 0;
3782}
3783
/* Open the database, creating it if necessary.

   This is tdb_open_ex() called with no logging context and no custom
   hash function.

   open_flags and mode are handed straight to the open call on the
   database file; an open_flags value of O_WRONLY is invalid.  The hash
   size is advisory; pass zero to get the default.

   Returns NULL on error, in which case errno is also set.  Don't try to
   call tdb_error or tdb_errname on failure, just use strerror(errno).

   @param name may be NULL for internal databases. */
struct tdb_context *tdb_open(const char *name, int hash_size, int tdb_flags,
			     int open_flags, mode_t mode)
{
	struct tdb_context *db;

	db = tdb_open_ex(name, hash_size, tdb_flags, open_flags, mode,
			 NULL, NULL);
	return db;
}
3799
3800/* a default logging function */
3801static void null_log_fn(struct tdb_context *tdb, enum tdb_debug_level level, const char *fmt, ...) PRINTF_ATTRIBUTE(3, 4);
3802static void null_log_fn(struct tdb_context *tdb, enum tdb_debug_level level, const char *fmt, ...)
3803{
3804}
3805
3806
3807struct tdb_context *tdb_open_ex(const char *name, int hash_size, int tdb_flags,
3808 int open_flags, mode_t mode,
3809 const struct tdb_logging_context *log_ctx,
3810 tdb_hash_func hash_fn)
3811{
3812 struct tdb_context *tdb;
3813 struct stat st;
3814 int rev = 0, locked = 0;
3815 unsigned char *vp;
3816 u32 vertest;
3817
3818 if (!(tdb = (struct tdb_context *)calloc(1, sizeof *tdb))) {
3819 /* Can't log this */
3820 errno = ENOMEM;
3821 goto fail;
3822 }
3823 tdb_io_init(tdb);
3824 tdb->fd = -1;
3825 tdb->name = NULL;
3826 tdb->map_ptr = NULL;
3827 tdb->flags = tdb_flags;
3828 tdb->open_flags = open_flags;
3829 if (log_ctx) {
3830 tdb->log = *log_ctx;
3831 } else {
3832 tdb->log.log_fn = null_log_fn;
3833 tdb->log.log_private = NULL;
3834 }
3835 tdb->hash_fn = hash_fn ? hash_fn : default_tdb_hash;
3836
3837 /* cache the page size */
Mike Frysinger24d364f2012-01-09 21:19:48 -05003838 tdb->page_size = sysconf(_SC_PAGESIZE);
Theodore Ts'o106ad962007-04-04 21:26:37 -04003839 if (tdb->page_size <= 0) {
3840 tdb->page_size = 0x2000;
3841 }
3842
3843 if ((open_flags & O_ACCMODE) == O_WRONLY) {
3844 TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_open_ex: can't open tdb %s write-only\n",
3845 name));
3846 errno = EINVAL;
3847 goto fail;
3848 }
Theodore Ts'oefc6f622008-08-27 23:07:54 -04003849
Theodore Ts'o106ad962007-04-04 21:26:37 -04003850 if (hash_size == 0)
3851 hash_size = DEFAULT_HASH_SIZE;
3852 if ((open_flags & O_ACCMODE) == O_RDONLY) {
3853 tdb->read_only = 1;
3854 /* read only databases don't do locking or clear if first */
3855 tdb->flags |= TDB_NOLOCK;
3856 tdb->flags &= ~TDB_CLEAR_IF_FIRST;
3857 }
3858
3859 /* internal databases don't mmap or lock, and start off cleared */
3860 if (tdb->flags & TDB_INTERNAL) {
3861 tdb->flags |= (TDB_NOLOCK | TDB_NOMMAP);
3862 tdb->flags &= ~TDB_CLEAR_IF_FIRST;
3863 if (tdb_new_database(tdb, hash_size) != 0) {
3864 TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_open_ex: tdb_new_database failed!"));
3865 goto fail;
3866 }
3867 goto internal;
3868 }
3869
3870 if ((tdb->fd = open(name, open_flags, mode)) == -1) {
3871 TDB_LOG((tdb, TDB_DEBUG_WARNING, "tdb_open_ex: could not open file %s: %s\n",
3872 name, strerror(errno)));
3873 goto fail; /* errno set by open(2) */
3874 }
3875
3876 /* ensure there is only one process initialising at once */
3877 if (tdb->methods->tdb_brlock(tdb, GLOBAL_LOCK, F_WRLCK, F_SETLKW, 0, 1) == -1) {
3878 TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_open_ex: failed to get global lock on %s: %s\n",
3879 name, strerror(errno)));
3880 goto fail; /* errno set by tdb_brlock */
3881 }
3882
3883 /* we need to zero database if we are the only one with it open */
3884 if ((tdb_flags & TDB_CLEAR_IF_FIRST) &&
3885 (locked = (tdb->methods->tdb_brlock(tdb, ACTIVE_LOCK, F_WRLCK, F_SETLK, 0, 1) == 0))) {
3886 open_flags |= O_CREAT;
3887 if (ftruncate(tdb->fd, 0) == -1) {
3888 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_open_ex: "
3889 "failed to truncate %s: %s\n",
3890 name, strerror(errno)));
3891 goto fail; /* errno set by ftruncate */
3892 }
3893 }
3894
3895 if (read(tdb->fd, &tdb->header, sizeof(tdb->header)) != sizeof(tdb->header)
3896 || strcmp(tdb->header.magic_food, TDB_MAGIC_FOOD) != 0
3897 || (tdb->header.version != TDB_VERSION
3898 && !(rev = (tdb->header.version==TDB_BYTEREV(TDB_VERSION))))) {
3899 /* its not a valid database - possibly initialise it */
3900 if (!(open_flags & O_CREAT) || tdb_new_database(tdb, hash_size) == -1) {
3901 errno = EIO; /* ie bad format or something */
3902 goto fail;
3903 }
3904 rev = (tdb->flags & TDB_CONVERT);
3905 }
3906 vp = (unsigned char *)&tdb->header.version;
3907 vertest = (((u32)vp[0]) << 24) | (((u32)vp[1]) << 16) |
3908 (((u32)vp[2]) << 8) | (u32)vp[3];
3909 tdb->flags |= (vertest==TDB_VERSION) ? TDB_BIGENDIAN : 0;
3910 if (!rev)
3911 tdb->flags &= ~TDB_CONVERT;
3912 else {
3913 tdb->flags |= TDB_CONVERT;
3914 tdb_convert(&tdb->header, sizeof(tdb->header));
3915 }
3916 if (fstat(tdb->fd, &st) == -1)
3917 goto fail;
3918
3919 if (tdb->header.rwlocks != 0) {
3920 TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_open_ex: spinlocks no longer supported\n"));
3921 goto fail;
3922 }
3923
3924 /* Is it already in the open list? If so, fail. */
3925 if (tdb_already_open(st.st_dev, st.st_ino)) {
3926 TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_open_ex: "
3927 "%s (%d,%d) is already open in this process\n",
3928 name, (int)st.st_dev, (int)st.st_ino));
3929 errno = EBUSY;
3930 goto fail;
3931 }
3932
3933 if (!(tdb->name = (char *)strdup(name))) {
3934 errno = ENOMEM;
3935 goto fail;
3936 }
3937
3938 tdb->map_size = st.st_size;
3939 tdb->device = st.st_dev;
3940 tdb->inode = st.st_ino;
3941 tdb->max_dead_records = 0;
3942 tdb_mmap(tdb);
3943 if (locked) {
3944 if (tdb->methods->tdb_brlock(tdb, ACTIVE_LOCK, F_UNLCK, F_SETLK, 0, 1) == -1) {
3945 TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_open_ex: "
3946 "failed to take ACTIVE_LOCK on %s: %s\n",
3947 name, strerror(errno)));
3948 goto fail;
3949 }
3950
3951 }
3952
3953 /* We always need to do this if the CLEAR_IF_FIRST flag is set, even if
3954 we didn't get the initial exclusive lock as we need to let all other
3955 users know we're using it. */
3956
3957 if (tdb_flags & TDB_CLEAR_IF_FIRST) {
3958 /* leave this lock in place to indicate it's in use */
3959 if (tdb->methods->tdb_brlock(tdb, ACTIVE_LOCK, F_RDLCK, F_SETLKW, 0, 1) == -1)
3960 goto fail;
3961 }
3962
3963 /* if needed, run recovery */
3964 if (tdb_transaction_recover(tdb) == -1) {
3965 goto fail;
3966 }
3967
3968 internal:
3969 /* Internal (memory-only) databases skip all the code above to
3970 * do with disk files, and resume here by releasing their
3971 * global lock and hooking into the active list. */
3972 if (tdb->methods->tdb_brlock(tdb, GLOBAL_LOCK, F_UNLCK, F_SETLKW, 0, 1) == -1)
3973 goto fail;
3974 tdb->next = tdbs;
3975 tdbs = tdb;
3976 return tdb;
3977
3978 fail:
3979 { int save_errno = errno;
3980
3981 if (!tdb)
3982 return NULL;
Theodore Ts'oefc6f622008-08-27 23:07:54 -04003983
Theodore Ts'o106ad962007-04-04 21:26:37 -04003984 if (tdb->map_ptr) {
3985 if (tdb->flags & TDB_INTERNAL)
3986 SAFE_FREE(tdb->map_ptr);
3987 else
3988 tdb_munmap(tdb);
3989 }
3990 SAFE_FREE(tdb->name);
3991 if (tdb->fd != -1)
3992 if (close(tdb->fd) != 0)
3993 TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_open_ex: failed to close tdb->fd on error!\n"));
3994 SAFE_FREE(tdb);
3995 errno = save_errno;
3996 return NULL;
3997 }
3998}
3999
4000/*
4001 * Set the maximum number of dead records per hash chain
4002 */
4003
4004void tdb_set_max_dead(struct tdb_context *tdb, int max_dead)
4005{
4006 tdb->max_dead_records = max_dead;
4007}
4008
4009/**
4010 * Close a database.
4011 *
4012 * @returns -1 for error; 0 for success.
4013 **/
4014int tdb_close(struct tdb_context *tdb)
4015{
4016 struct tdb_context **i;
4017 int ret = 0;
4018
4019 if (tdb->transaction) {
4020 tdb_transaction_cancel(tdb);
4021 }
4022
4023 if (tdb->map_ptr) {
4024 if (tdb->flags & TDB_INTERNAL)
4025 SAFE_FREE(tdb->map_ptr);
4026 else
4027 tdb_munmap(tdb);
4028 }
4029 SAFE_FREE(tdb->name);
4030 if (tdb->fd != -1)
4031 ret = close(tdb->fd);
4032 SAFE_FREE(tdb->lockrecs);
4033
4034 /* Remove from contexts list */
4035 for (i = &tdbs; *i; i = &(*i)->next) {
4036 if (*i == tdb) {
4037 *i = tdb->next;
4038 break;
4039 }
4040 }
4041
4042 memset(tdb, 0, sizeof(*tdb));
4043 SAFE_FREE(tdb);
4044
4045 return ret;
4046}
4047
4048/* register a loging function */
4049void tdb_set_logging_function(struct tdb_context *tdb,
4050 const struct tdb_logging_context *log_ctx)
4051{
4052 tdb->log = *log_ctx;
4053}
4054
4055void *tdb_get_logging_private(struct tdb_context *tdb)
4056{
4057 return tdb->log.log_private;
4058}
4059
4060/* reopen a tdb - this can be used after a fork to ensure that we have an independent
4061 seek pointer from our parent and to re-establish locks */
4062int tdb_reopen(struct tdb_context *tdb)
4063{
4064 struct stat st;
4065
4066 if (tdb->flags & TDB_INTERNAL) {
4067 return 0; /* Nothing to do. */
4068 }
4069
4070 if (tdb->num_locks != 0 || tdb->global_lock.count) {
4071 TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_reopen: reopen not allowed with locks held\n"));
4072 goto fail;
4073 }
4074
4075 if (tdb->transaction != 0) {
4076 TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_reopen: reopen not allowed inside a transaction\n"));
4077 goto fail;
4078 }
4079
4080 if (tdb_munmap(tdb) != 0) {
4081 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_reopen: munmap failed (%s)\n", strerror(errno)));
4082 goto fail;
4083 }
4084 if (close(tdb->fd) != 0)
4085 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_reopen: WARNING closing tdb->fd failed!\n"));
4086 tdb->fd = open(tdb->name, tdb->open_flags & ~(O_CREAT|O_TRUNC), 0);
4087 if (tdb->fd == -1) {
4088 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_reopen: open failed (%s)\n", strerror(errno)));
4089 goto fail;
4090 }
Theodore Ts'oefc6f622008-08-27 23:07:54 -04004091 if ((tdb->flags & TDB_CLEAR_IF_FIRST) &&
Theodore Ts'o106ad962007-04-04 21:26:37 -04004092 (tdb->methods->tdb_brlock(tdb, ACTIVE_LOCK, F_RDLCK, F_SETLKW, 0, 1) == -1)) {
4093 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_reopen: failed to obtain active lock\n"));
4094 goto fail;
4095 }
4096 if (fstat(tdb->fd, &st) != 0) {
4097 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_reopen: fstat failed (%s)\n", strerror(errno)));
4098 goto fail;
4099 }
4100 if (st.st_ino != tdb->inode || st.st_dev != tdb->device) {
4101 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_reopen: file dev/inode has changed!\n"));
4102 goto fail;
4103 }
4104 tdb_mmap(tdb);
4105
4106 return 0;
4107
4108fail:
4109 tdb_close(tdb);
4110 return -1;
4111}
4112
4113/* reopen all tdb's */
4114int tdb_reopen_all(int parent_longlived)
4115{
4116 struct tdb_context *tdb;
4117
4118 for (tdb=tdbs; tdb; tdb = tdb->next) {
4119 /*
4120 * If the parent is longlived (ie. a
4121 * parent daemon architecture), we know
4122 * it will keep it's active lock on a
4123 * tdb opened with CLEAR_IF_FIRST. Thus
4124 * for child processes we don't have to
4125 * add an active lock. This is essential
4126 * to improve performance on systems that
4127 * keep POSIX locks as a non-scalable data
4128 * structure in the kernel.
4129 */
4130 if (parent_longlived) {
4131 /* Ensure no clear-if-first. */
4132 tdb->flags &= ~TDB_CLEAR_IF_FIRST;
4133 }
4134
4135 if (tdb_reopen(tdb) != 0)
4136 return -1;
4137 }
4138
4139 return 0;
4140}