-
Notifications
You must be signed in to change notification settings - Fork 225
/
Copy pathbdb_api.h
2503 lines (2072 loc) · 109 KB
/
bdb_api.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
/*
Copyright 2015 Bloomberg Finance L.P.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
/*
* bdb layer public api
*
* $Id$
*/
#ifndef __bdb_api_h__
#define __bdb_api_h__
#include <stdio.h>
#include <stdarg.h>
#include <sbuf2.h>
#include <net.h>
#include <bb_stdint.h>
#include <inttypes.h>
#include <limits.h>
#include "fwd_types.h"
#include "bdb_net.h"
#include <sqlglue.h>
#include <bdbglue.h>
#include <assert.h>
#include <compile_time_assert.h>
#include <sqlresponse.pb-c.h>
#define DEFAULT_DBA_USER "dba"
#define DEFAULT_DBA_PASSWORD ""
#define SIZEOF_SEQNUM (10 * sizeof(int))
struct seqnum_t;
typedef struct seqnum_t seqnum_type;
struct filepage_t;
typedef struct filepage_t filepage_type;
struct bdb_state_tag;
struct bdb_callback_tag;
typedef struct bdb_callback_tag bdb_callback_type;
struct tran_tag;
typedef struct tran_tag tran_type;
struct table_version_cache;
typedef struct table_version_cache table_version_cache;
struct bdb_attr_tag;
typedef struct bdb_attr_tag bdb_attr_type;
struct bdb_temp_hash;
typedef struct bdb_temp_hash bdb_temp_hash;
struct bulk_dump;
typedef struct bulk_dump bulk_dump;
struct dtadump;
typedef struct dtadump dtadump;
/* Fixed-size (64 byte) buffer whose contents are opaque to callers.
 * NOTE(review): presumably holds a serialized cursor position, going by the
 * type name and bdb_cursor_ser_invalidate() - confirm. */
typedef struct bdb_cursor_ser {
    uint8_t opaque[64];
} bdb_cursor_ser_t;
struct txn_properties;
void bdb_cursor_ser_invalidate(bdb_cursor_ser_t *cur_ser);
/* Callback identifiers; values are consecutive, starting at zero.
 * NOTE(review): presumably the callback_type values accepted by
 * bdb_callback_set() - confirm. */
enum {
    BDB_CALLBACK_NODEUP = 0,
    BDB_CALLBACK_WHOISMASTER = 1,
    BDB_CALLBACK_REPFAIL = 2,
    BDB_CALLBACK_APPSOCK = 3,
    BDB_CALLBACK_PRINT = 4,
    BDB_CALLBACK_ELECTSETTINGS = 5,
    BDB_CALLBACK_GETROOM = 6,
    BDB_CALLBACK_CATCHUP = 7,
    BDB_CALLBACK_THREADDUMP = 8,
    BDB_CALLBACK_SENDNOTCOHERENT = 9,
    BDB_CALLBACK_GETLWM = 10,
    BDB_CALLBACK_SETLWM = 11,
    BDB_CALLBACK_SCDONE = 12,
    BDB_CALLBACK_SCABORT = 13,
    BDB_CALLBACK_UNDOSHADOW = 14,
    BDB_CALLBACK_NODE_IS_DOWN = 15,
    BDB_CALLBACK_SERIALCHECK = 16,
    BDB_CALLBACK_ADMIN_APPSOCK = 17,
    BDB_CALLBACK_SYNCMODE = 18,
    BDB_CALLBACK_NODEUP_DRTEST = 19
};
/* Failure reasons handed to the replication-failure callback (REPFAILFP). */
enum {
    BDB_REPFAIL_NET = 0,
    BDB_REPFAIL_TIMEOUT = 1,
    BDB_REPFAIL_RMTBDB = 2
};
/* Operation codes; numbering starts at 1, so 0 is not assigned to any
 * operation. */
enum {
    BDB_OP_ADD = 1,
    BDB_OP_DEL = 2
};
/* debug options: each option is a single bit, SQL_DBG_ALL sets every bit of a
 * non-negative int */
enum {
    /* no debug, default */
    SQL_DBG_NONE = 0,
    /* track the osqltrn bdblog operations (osqltrn has the flag) */
    SQL_DBG_BDBLOG = 1 << 0,
    /* track all bdblog operations (no osqltrn here, osqllog_repo has the
       flag, pushed to osqllog) */
    SQL_DBG_BDBALLLOG = 1 << 1,
    /* track the osqltrn transaction registration operations (osqltrn has
       the flag) */
    SQL_DBG_BDBTRN = 1 << 2,
    /* track the osqltrn registration operations (osqltrn repo has the
       flag, track all osqltrn) */
    SQL_DBG_BDBALLTRN = 1 << 3,
    /* track inserts to shadow files */
    SQL_DBG_SHADOW = 1 << 4,
    /* enable all debug options */
    SQL_DBG_ALL = INT_MAX
};
/* Describes one index operation: an operation code, the index number it
 * applies to, and a pointer to the index data.
 * NOTE(review): opcode presumably takes BDB_OP_ADD / BDB_OP_DEL - confirm
 * against the users of this type. */
typedef struct {
int opcode;
int ixnum;
void *ixdta;
} bdb_update_op_type;
/* Queue stats: a flat set of unsigned event counters. */
struct bdb_queue_stats {
    unsigned int n_new_way_frags_consumed;
    unsigned int n_new_way_frags_aborted;
    unsigned int n_new_way_geese_consumed;
    unsigned int n_old_way_frags_consumed;
    unsigned int n_consume_deadlocks;
    unsigned int n_add_deadlocks;
    unsigned int n_get_deadlocks;
    unsigned int n_get_not_founds;
    unsigned int n_logical_gets;
    unsigned int n_physical_gets;
};
/* Forward declare, this is defined in thread_stats.h */
struct berkdb_thread_stats;
/*
 * Microsecond <-> millisecond conversion helpers.
 *
 * Multiplication usually takes fewer CPU cycles than division. Therefore
 * when comparing a usec and a msec, it is preferable to use:
 *     usec <comparison operator> M2U(msec)
 *
 * U2M(usec) divides by 1000 and converts the result to int.
 * M2U(msec) multiplies by 1000ULL, so for integer arguments the result is an
 * unsigned long long.
 *
 * Both expansions are wrapped in an outer pair of parentheses so that they
 * behave as a single operand in any context (e.g. `sizeof U2M(x)`).
 */
#ifndef U2M
#define U2M(usec) ((int)((usec) / 1000))
#endif
#ifndef M2U
#define M2U(msec) ((msec) * 1000ULL)
#endif
/* these are the values that "bdberr" can be */
enum {
    BDBERR_NOERROR            = 0,
    /* there is no error code 1 */
    BDBERR_MISC               = 2,
    BDBERR_MANIPULATE_FREEREC = 3,
    BDBERR_ADD_DTA            = 4,
    BDBERR_ADD_IX             = 5,
    BDBERR_ADD_RRN            = 6,
    BDBERR_DEADLOCK           = 7,
    BDBERR_BUFSMALL           = 8,
    BDBERR_ADD_DUPE           = 9,
    BDBERR_DEL_DTA            = 10,
    BDBERR_DEL_IX             = 11,
    BDBERR_DEL_RRN            = 12,
    BDBERR_DELNOTFOUND        = 13,
    BDBERR_BADARGS            = 14,
    BDBERR_FETCH_DTA          = 15,
    BDBERR_FETCH_IX           = 16,
    BDBERR_RRN_NOTFOUND       = 17,
    BDBERR_DTA_MISMATCH       = 18,
    BDBERR_DBEMPTY            = 19,
    /* internal use only */
    BDBERR_RESERVED_1         = 20,
    BDBERR_READONLY           = 21,
    BDBERR_TRANTOOCOMPLEX     = 22,
    BDBERR_CALLBACK           = 23,
    BDBERR_MALLOC             = 24,
    /* io error e.g. in fastdump */
    BDBERR_IO                 = 25,
    /* e.g. fast dump socket timeout */
    BDBERR_TIMEOUT            = 26,
    /* unable to unpack ODH */
    BDBERR_UNPACK             = 27,
    /* unable to pack ODH */
    BDBERR_PACK               = 28,
    BDBERR_INCOHERENT         = 29,
    BDBERR_BUG_KILLME         = 30,
    BDBERR_ROW_DEADLOCK       = 31,
    BDBERR_INVALID_LSN        = 32,
    BDBERR_DEADLOCK_ON_LAST   = 33,
    BDBERR_TRAN_CANCELLED     = 34,
    BDBERR_NO_LOG             = 35,
    BDBERR_DEADLOCK_ROWLOCK   = 36,
    BDBERR_NEED_REPOSITION    = 37,
    BDBERR_LOCK_DESIRED       = 38,
    BDBERR_NOT_DURABLE        = 39,
    BDBERR_MAX_SEQUENCE       = 40,
    BDBERR_EXCEEDED_INDEXES   = 41,
    BDBERR_EXCEEDED_BLOBS     = 42
};
/* Special values for BDB_ATTR_LOGDELETEAGE.  Positive values indicate an
 * absolute unix epoch time. */
enum {
    LOGDELETEAGE_NEVER = -1,
    LOGDELETEAGE_NOW = 0
};
/* Attribute type tags; values are consecutive, starting at zero. */
enum {
    BDB_ATTRTYPE_SECS = 0,
    BDB_ATTRTYPE_MSECS = 1,
    BDB_ATTRTYPE_USECS = 2,
    BDB_ATTRTYPE_BYTES = 3,
    BDB_ATTRTYPE_KBYTES = 4,
    BDB_ATTRTYPE_MBYTES = 5,
    BDB_ATTRTYPE_BOOLEAN = 6,
    BDB_ATTRTYPE_QUANTITY = 7,
    BDB_ATTRTYPE_PERCENT = 8
};
/* See attr.h for attribute definitions */
/* One BDB_ATTR_<NAME> enumerator is generated for every DEF_ATTR / DEF_ATTR_2
 * entry in attr.h: both macros are defined here to expand to just the
 * enumerator name (every other argument is ignored) and are undefined again
 * right after the include.  BDB_ATTR_MAX comes after the generated entries;
 * NOTE(review): it equals the attribute count provided attr.h contains only
 * DEF_ATTR / DEF_ATTR_2 entries - confirm. */
enum {
#define DEF_ATTR(NAME, name, type, dflt, desc) BDB_ATTR_##NAME,
#define DEF_ATTR_2(NAME, name, type, dflt, desc, flags, verify_fn, update_fn) \
BDB_ATTR_##NAME,
#include "attr.h"
#undef DEF_ATTR
#undef DEF_ATTR_2
BDB_ATTR_MAX
};
/*
 * table types supported by bdblib
 */
typedef enum {
    BDBTYPE_NONE = 0,
    /* environment */
    BDBTYPE_ENV = 1,
    /* normal table */
    BDBTYPE_TABLE = 2,
    /* single .dta file, one index, no .ix files */
    BDBTYPE_LITE = 3,
    /* mainly in bdbqueue.c */
    BDBTYPE_QUEUE = 4,
    /* like BDBTYPE_QUEUE, but use btree */
    BDBTYPE_QUEUEDB = 5
} bdbtype_t;
enum {
    /*
     * max size of a fixed record ON DISK.
     * comdb2 exposes 16384. add 1 byte for an amazing 4k of columns (we
     * dont support that) and get to 20480.
     * note, this would be HORRIBLE as the fixed max page size in berk is
     * 65536 and they need to be able to fit 4 records on a page (or else
     * use overflow) including some berk overhead. so that means the max
     * bdb record size should realistically be some number like 15384 if we
     * want to perform. oh well. even better, with 4k pages, we shouldnt be
     * using record sizes more than 900k or so.
     */
    BDB_RECORD_MAX = 20480,

    /*
     * how many files could a QueueDB have? for now, this value should
     * always be two. the first file should always be the one being
     * consumed from and the second file should always be the one being
     * added to. there may be no second file if the first one has not
     * exceeded its size limit.
     */
    BDB_QUEUEDB_MAX_FILES = 2
};
/* Compression algorithm identifiers.
 * NOTE(review): presumably the codes translated by bdb_compr2algo() and
 * bdb_algo2compr() - confirm. */
enum COMPRESS {
    BDB_COMPRESS_NONE = 0,
    BDB_COMPRESS_ZLIB = 1,
    BDB_COMPRESS_RLE8 = 2,
    BDB_COMPRESS_CRLE = 3,
    BDB_COMPRESS_LZ4  = 4
};
/* NOTE: For "uint32_t flags" arg to "bdb_open_*()".  Every flag occupies its
 * own bit, so flags can be OR-ed together. */
enum OPENFLAGS {
    BDB_OPEN_NONE = 0,
    BDB_OPEN_ADD_QDB_FILE = 1 << 0,
    BDB_OPEN_DEL_QDB_FILE = 1 << 1,
    BDB_OPEN_SKIP_SCHEMA_LK = 1 << 2
};
int bdb_compr2algo(const char *a);
const char *bdb_algo2compr(int a);
int bdb_bless_btree(char *input_file, char *output_file);
/* retrieve the user pointer associated with a bdb_handle */
void *bdb_get_usr_ptr(bdb_state_type *bdb_handle);
/* CALLBACK ROUTINES */
/*
provide a "printf()" compatible call for bdb trace to be output with.
in the absence of this callback, all bdb trace will go to stderr.
*/
typedef int (*PRINTFP)(const char *format, va_list ap);
/*
provide a routine that returns 0 or 1 when given the id of a node.
return 0 if the node is marked "down"
return 1 if the node is marked "up"
this routine will be used to determine if nodes should be pulled
from the election pool when errors are occurring talking to this
node.
*/
typedef int (*NODEUPFP)(bdb_state_type *bdb_handle, const char *host);
/*
provide a callback that gets called when a node disconnects.
this is needed by higher levels that need to stop waiting
for that node.
*/
typedef int (*NODEDOWNFP)(char *host);
/*
provide a callback that gets called during serializable
transaction read-set validation.
This is needed by higher levels that need to abort non-serializable
transactions.
*/
typedef int (*SERIALCHECK)(char *tbname, int idxnum, void *key, int keylen,
void *ranges);
/*
provide a routine that returns an integer specifying the "room"
of a given node. this is used to keep coherency within a room.
*/
typedef int (*GETROOMFP)(bdb_state_type *bdb_handle, const char *host);
/*
pass in a routine that will be called to tell you that someone
has become the master. take whatever action necessary to get
updates directed to you if you are now the master or to have your
updates directed elsewhere if you learned of a new master.
do NOT call back into the bdb library from this routine.
*/
typedef int (*WHOISMASTERFP)(bdb_state_type *bdb_handle, char *host,
int assert_sc_clear);
/*
pass in a routine that will be called when the replication
subsystem has an error communicating with a node. reason
for failure will be passed into this routine. reasons can
be BDB_REPFAIL_NET, BDB_REPFAIL_TIMEOUT, BDB_REPFAIL_RMTBDB.
BDB_REPFAIL_NET indicates a network level error communicating
with a node. BDB_REPFAIL_TIMEOUT indicates a timeout (as specified
by BDB_ATTR_REPTIMEOUT) communicating with a node.
BDB_REPFAIL_RMTBDB indicates that the bdb library on the remote
node returned failure.
if user requests are still being directed to a node that is
generating repfail events, the coherency of the data on that node
is at risk of being out of date with respect to the actual data
as it exists on the master copy of the database.
*/
typedef int (*REPFAILFP)(bdb_state_type *bdb_handle, char *host, int reason);
/*
pass in a routine that will handle a newly created socket.
this routine must return immediately, and should create its own
thread if extended processing is needed.
*/
typedef int (*BDBAPPSOCKFP)(bdb_state_type *bdb_handle, SBUF2 *sb);
/*
pass in a routine that will return the current election preferences.
for now the only thing it can change is the election timeout value,
which is specified in seconds. it should always return 0.
*/
typedef int (*BDBELECTSETTINGSFP)(bdb_state_type *bdb_handle,
int *elect_time_secs);
/*
pass in a routine that will be called when election has succeeded
and the database starts catching up to the master
*/
typedef int (*BDBCATCHUPFP)(bdb_state_type *bdb_handle,
unsigned long long ourlsn,
unsigned long long masterlsn);
typedef void (*BDBTHREADDUMPFP)(void);
typedef void (*BDBSENDNOTCOHERENTFP)(char *node, int notcoherent, int file,
int offset, int *rc);
typedef int (*BDBGETFILELWMFP)(int *);
typedef int (*BDBSETFILELWMFP)(int *);
/* retrieve all snapshot/serializable sql sessions and update their shadow
tables to account for committed deletes and hide adds */
struct bdb_osql_log;
typedef void (*UNDOSHADOWFP)(struct bdb_osql_log *);
/* Callback to return sync type */
typedef int (*SYNCMODE)(bdb_state_type *);
/* Callback to dr-test aware rtcpu.
 * Takes the bdb handle, a host name, and an out-parameter `isdrtest`.
 * NOTE(review): presumably returns the up/down state of `host` (like
 * NODEUPFP) and reports through *isdrtest whether it is a DR-test node -
 * confirm against the registered implementation. */
typedef int (*NODEUP_DRTEST)(bdb_state_type *, const char *host, int *isdrtest);
/* Generic callback pointer type taken by bdb_callback_set().  The empty
 * parameter list leaves the argument types unspecified (pre-C23 semantics).
 * NOTE(review): presumably this is what allows the differently-typed callbacks
 * declared in this header to be passed through one type - confirm.
 * NOTE(review): in C23 `()` means `(void)`; revisit this typedef before moving
 * to a newer language standard. */
typedef int (*BDB_CALLBACK_FP)();
bdb_callback_type *bdb_callback_create(void);
void bdb_callback_set(bdb_callback_type *bdb_callback, int callback_type,
BDB_CALLBACK_FP callback_rtn);
/*
create a bdb attribute object. all attributes always have default
values. defaults are documented with the BDB_ATTR definitions (see attr.h).
*/
void *bdb_attr_create(void);
/*
set an attribute in a bdb attribute object to the specified value.
*/
void bdb_attr_set(bdb_attr_type *bdb_attr, int attr, int value);
int bdb_attr_set_by_name(bdb_state_type *bdb_handle, bdb_attr_type *bdb_attr,
const char *attrname, int value);
int bdb_attr_get(bdb_attr_type *bdb_attr, int attr);
/* Shorthand for bdb_attr_get(): the BDB_ATTR_ prefix is pasted onto `attr`,
 * so BDB_ATTR_GET(a, FOO) expands to bdb_attr_get(a, BDB_ATTR_FOO). */
#define BDB_ATTR_GET(bdb_attr, attr) (bdb_attr_get((bdb_attr), BDB_ATTR_##attr))
void bdb_attr_dump(FILE *fh, const bdb_attr_type *bdb_attr);
/* Get the type of this bdb-state object as a BDBTYPE_ constant */
bdbtype_t bdb_get_type(bdb_state_type *bdb_state);
int bdb_get_qdb_adds(bdb_state_type *bdb_state);
int bdb_get_qdb_cons(bdb_state_type *bdb_state);
bdb_state_type *bdb_clone_handle_with_other_data_files(
const bdb_state_type *clone_bdb_state,
const bdb_state_type *data_files_bdb_state);
void bdb_free_cloned_handle_with_other_data_files(bdb_state_type *bdb_state);
/*
bdb_open_more() : "open" a new database. associate this db transactionally
with the parent_bdb_handle from a prior open call.
the returned bdb_handle will be associated with the
passed in parent_bdb_handle. cache will be shared.
calls to bdb_tran_begin()/bdb_tran_commit()/bdb_tran_abort() on
any associated bdb_handle operate on all associated
bdb_handles as a unit.
INPUT: name : "comdb name"
dir : location of all files (/bb/data or
something)
lrl : NOTE(review): not described here; presumably the
record length - confirm
numix : number of indexes on database
ixlen : numix long array specifying
size of each index in bytes
ixdups : array, 1 == ix allows dups,
0 == ix does not allow dups
ixrecnum : array, 1 == turn on recnums on btrees.
0 == don't turn on recnums on btrees.
ixdta, ixdtalen, ixcollattr, ixnulls, numdtafiles :
NOTE(review): not described here - confirm against
the implementation
parent_bdb_handle : a valid bdb_handle obtained from a prior open
call (presumably bdb_open_env() - confirm).
OUTPUT: RETURN : success : pointer to bdb handle
failure : NULL
bdberr : will be set to provide additional
info on failure. possibilities are:
catastrophic:
BDBERR_MISC : some problem occurred
*/
/* open an existing table */
bdb_state_type *
bdb_open_more(const char name[], const char dir[], int lrl, short numix,
const short ixlen[], const signed char ixdups[],
const signed char ixrecnum[], const signed char ixdta[], const int ixdtalen[],
const signed char ixcollattr[], const signed char ixnulls[],
int numdtafiles, bdb_state_type *parent_bdb_handle, int *bdberr);
/* same, but using a transaction */
bdb_state_type *
bdb_open_more_tran(const char name[], const char dir[], int lrl, short numix,
const short ixlen[], const signed char ixdups[],
const signed char ixrecnum[], const signed char ixdta[], const int ixdtalen[],
const signed char ixcollattr[], const signed char ixnulls[],
int numdtafiles, bdb_state_type *parent_bdb_handle,
tran_type *tran, uint32_t flags, int *bdberr);
/* open an existing lite table */
bdb_state_type *bdb_open_more_lite(const char name[], const char dir[], int lrl,
int ixlen, int pagesize,
bdb_state_type *parent_bdb_handle,
tran_type *tran, uint32_t flags,
int *bdberr);
/* open an existing queue */
bdb_state_type *bdb_open_more_queue(const char name[], const char dir[],
int item_size, int pagesize,
bdb_state_type *parent_bdb_state,
int isqueuedb, tran_type *, int *bdberr);
/* create a new queue */
bdb_state_type *bdb_create_queue(const char name[], const char dir[],
int item_size, int pagesize,
bdb_state_type *parent_bdb_state,
int isqueuedb, int *bdberr);
bdb_state_type *bdb_create_queue_tran(tran_type *, const char name[],
const char dir[], int item_size,
int pagesize,
bdb_state_type *parent_bdb_state,
int isqueuedb, int *bdberr);
/* create a lite table */
bdb_state_type *bdb_create_more_lite(const char name[], const char dir[],
int lrl, int ixlen, int pagesize,
bdb_state_type *parent_bdb_handle,
int *bdberr);
/* create and open a new table */
bdb_state_type *
bdb_create(const char name[], const char dir[], int lrl, short numix,
const short ixlen[], const signed char ixdups[],
const signed char ixrecnum[], const signed char ixdta[], const int ixdtalen[],
const signed char ixcollattr[], const signed char ixnulls[],
int numdtafiles, bdb_state_type *parent_bdb_handle, int temp,
int *bdberr);
bdb_state_type *
bdb_create_tran(const char name[], const char dir[], int lrl, short numix,
const short ixlen[], const signed char ixdups[],
const signed char ixrecnum[], const signed char ixdta[], const int ixdtalen[],
const signed char ixcollattr[], const signed char ixnulls[],
int numdtafiles, bdb_state_type *parent_bdb_handle, int temp,
int *bdberr, tran_type *);
/* open a database environment. no actual db files are created. */
bdb_state_type *bdb_open_env(const char name[], const char dir[],
bdb_attr_type *bdb_attr,
bdb_callback_type *bdb_callback, void *usr_ptr,
netinfo_type *netinfo, char *recoverlsn,
int *bdberr);
int bdb_set_all_contexts(bdb_state_type *bdb_state, int *bdberr);
int bdb_handle_reset(bdb_state_type *);
int bdb_handle_reset_tran(bdb_state_type *, tran_type *, tran_type *);
int bdb_handle_dbp_add_hash(bdb_state_type *bdb_state, int szkb);
int bdb_handle_dbp_drop_hash(bdb_state_type *bdb_state);
int bdb_handle_dbp_hash_stat(bdb_state_type *bdb_state);
int bdb_handle_dbp_hash_stat_reset(bdb_state_type *bdb_state);
int bdb_close_temp_state(bdb_state_type *bdb_state, int *bdberr);
/* get file sizes for indexes and data files */
uint64_t bdb_data_size(bdb_state_type *bdb_state, int dtanum);
uint64_t bdb_index_size(bdb_state_type *bdb_state, int dtanum);
uint64_t bdb_queue_size(bdb_state_type *bdb_state, unsigned *num_extents);
uint64_t bdb_logs_size(bdb_state_type *bdb_state, unsigned *num_logs);
uint64_t bdb_tmp_size(bdb_state_type *bdb_state, uint64_t *ptmptbls, uint64_t *psqlsorters, uint64_t *pblkseqs,
uint64_t *pothers);
/*
bdb_close(): destroy a bdb_handle.
*/
int bdb_close(bdb_state_type *bdb_handle);
/* see if a handle is open or not */
int bdb_isopen(bdb_state_type *bdb_handle);
/* you need to call this if you created the parent with bdb_open_env */
int bdb_close_env(bdb_state_type *bdb_handle);
/* get a context that can be used in fetches to ensure that we don't fetch
* records added after the context of the original find. */
unsigned long long bdb_get_cmp_context(bdb_state_type *bdb_state);
unsigned long long bdb_get_cmp_context_local(bdb_state_type *bdb_state);
/* Check that the given genid is older than the compare context being given.
* Returns: 1 genid is older, 0 genid is newer */
int bdb_check_genid_is_older(bdb_state_type *bdb_state,
unsigned long long genid,
unsigned long long context);
/* Compare two genids to determine which one would have been allocated first.
* Return codes:
* -1 a < b
* 0 a == b
* 1 a > b
*/
int bdb_cmp_genids(unsigned long long a, unsigned long long b);
/* Mask-out the inplace update-id for each genid and then compare them to
* see which one would have been allocated first:
* Return codes:
* -1 a < b
* 0 a == b
* 1 a > b
*/
int bdb_inplace_cmp_genids(bdb_state_type *bdb_state, unsigned long long g1,
unsigned long long g2);
/* using the bdb_state object, return the updateid for this genid */
int get_updateid_from_genid(bdb_state_type *bdb_state,
unsigned long long genid);
/* Retrieve the participant stripe id which is encoded in the genid.
* Return codes:
* -1 there are no bits allocated for participant stripe id
* otherwise, the stripe-id associated with the genid
*/
int bdb_get_participant_stripe_from_genid(bdb_state_type *bdb_state,
unsigned long long genid);
/* Mask a genid so that it can be used in an ondisk file. With ODH
* turned on this masks out the updateid field. We use this in live schema
* change since some old databases may have genids that have values in this
* field, so we have to change those genids when we do this conversion. */
unsigned long long bdb_mask_updateid(bdb_state_type *bdb_state,
unsigned long long genid);
/* Normalize a genid so that we can find and delete it during a live
* schema-change. This is necessary if schema-change is removing ondisk
* headers for a database table which had in-place updates enabled. It's
* used to locate and delete a record in the new, odh-less table. */
unsigned long long bdb_normalise_genid(bdb_state_type *bdb_state,
unsigned long long genid);
/* Single-bit flag values.
 * NOTE(review): presumably for the `flags` argument of bdb_tran_begin_flags()
 * - confirm. */
#define BDB_TRAN_RECOVERY 0x00000001
#define BDB_TRAN_NOLOG 0x00000002
/* return a new tran handle, begin a transaction */
tran_type *bdb_tran_begin_flags(bdb_state_type *bdb_handle,
tran_type *parent_tran, int *bdberr,
uint32_t flags);
tran_type *bdb_tran_begin_internal(bdb_state_type *bdb_handle, tran_type *parent_tran,
int *bdberr, const char *func, int line);
/* Begin a transaction, passing the caller's function name and line number to
 * bdb_tran_begin_internal().  Evaluates to the tran_type * that call returns.
 *
 * Written as a plain call expression: no GNU statement expression is needed,
 * and no macro-local temporary exists, so an argument that happens to be
 * named `retval` at the call site is passed through unchanged. */
#define bdb_tran_begin(A, B, C) bdb_tran_begin_internal((A), (B), (C), __func__, __LINE__)
tran_type *bdb_tran_begin_mvcc(bdb_state_type *bdb_handle,
tran_type *parent_tran, int *bdberr);
tran_type *bdb_tran_begin_stable(bdb_state_type *bdb_handle,
tran_type *parent_tran, int *bdberr);
tran_type *bdb_tran_begin_dirty(bdb_state_type *bdb_handle,
tran_type *parent_tran, int *bdberr);
tran_type *bdb_tran_begin_logical(bdb_state_type *bdb_state, int trak,
int *bdberr);
void bdb_set_tran_lockerid(tran_type *tran, uint32_t lockerid);
void bdb_get_tran_lockerid(tran_type *tran, uint32_t *lockerid);
void *bdb_get_physical_tran(tran_type *ltran);
void bdb_reset_physical_tran(tran_type *ltran);
void *bdb_get_sc_parent_tran(tran_type *ltran);
void bdb_ltran_get_schema_lock(tran_type *ltran);
void bdb_ltran_put_schema_lock(tran_type *ltran);
tran_type *bdb_tran_begin_socksql(bdb_state_type *, int trak, int *bdberr);
tran_type *bdb_tran_begin_readcommitted(bdb_state_type *, int trak,
int *bdberr);
tran_type *bdb_tran_begin_modsnap(bdb_state_type *, int trak,
int *bdberr);
tran_type *bdb_tran_begin_serializable(bdb_state_type *bdb_state, int trak,
int *bdberr, int epoch, int file,
int offset, int is_ha_retry);
tran_type *bdb_tran_begin_snapisol(bdb_state_type *bdb_state, int trak,
int *bdberr, int epoch, int file, int offset,
int is_ha_retry);
/* return log bytes written so far for this transaction */
uint64_t bdb_tran_logbytes(tran_type *tran);
/* Write a prepare record */
int bdb_tran_prepare(bdb_state_type *bdb_state, tran_type *tran, const char *dist_txnid, const char *coordinator_name,
const char *coordinator_tier, uint32_t coordinator_gen, void *blkseq_key, int blkseq_key_len,
int *bdberr);
/* commit the transaction referenced by the tran handle */
int bdb_tran_commit(bdb_state_type *bdb_handle, tran_type *tran, int *bdberr);
int bdb_tran_commit_logical_with_seqnum_size(bdb_state_type *bdb_state,
tran_type *tran, void *blkseq,
int blklen, void *blkkey,
int blkkeylen, seqnum_type *seqnum,
uint64_t *out_txnsize,
int *bdberr);
int bdb_tran_get_start_file_offset(bdb_state_type *bdb_state, tran_type *tran,
int *file, int *offset);
/* commit the transaction referenced by the tran handle. return a
seqnum that is guaranteed to be greater or equal to the seqnum
needed to have this commit reflected in your database
also return an estimate of the transaction size in unspecified
units */
int bdb_tran_commit_with_seqnum_size(bdb_state_type *bdb_state, tran_type *tran,
seqnum_type *seqnum, uint64_t *out_txnsize,
int *bdberr);
/* abort the transaction referenced by the tran handle */
int bdb_tran_abort(bdb_state_type *bdb_handle, tran_type *tran, int *bdberr);
int bdb_tran_abort_priority(bdb_state_type *bdb_handle, tran_type *tran, int *bdberr, int *priority, int discard);
/* english doesn't have curses vile enough */
int bdb_tran_abort_logical(bdb_state_type *bdb_handle, tran_type *tran,
int *bdberr, void *blkseq, int blklen, void *blkkey,
int blkkeylen, seqnum_type *seqnum);
/* prim operations all require a valid tran to be held */
int bdb_prim_allocdta_genid(bdb_state_type *bdb_handle, tran_type *tran,
void *dtaptr, int dtalen, unsigned long long *genid,
int updateid, int *bdberr);
int bdb_prim_adddta_n_genid(bdb_state_type *bdb_state, tran_type *tran,
int dtanum, void *dtaptr, size_t dtalen, int rrn,
unsigned long long genid, int *bdberr,
int odhready);
int bdb_prim_deallocdta_genid(bdb_state_type *bdb_handle, tran_type *tran,
int rrn, unsigned long long genid, int *bdberr);
int bdb_prim_deallocdta_n_genid(bdb_state_type *bdb_state, tran_type *tran,
int rrn, unsigned long long genid, int dtanum,
int *bdberr);
int bdb_prim_updvrfy_genid(bdb_state_type *bdb_state, tran_type *tran,
void *olddta, int oldlen, void *newdta, int newdtaln,
int rrn, unsigned long long oldgenid,
unsigned long long *newgenid, int verifydta,
int participantstripeid, int use_new_genid,
int *bdberr);
int bdb_prim_add_upd_genid(bdb_state_type *bdb_state, tran_type *tran,
int dtanum, void *newdta, int newdtaln, int rrn,
unsigned long long oldgenid,
unsigned long long newgenid, int participantstripeid,
int *bdberr, int odhready);
int bdb_prim_no_upd(bdb_state_type *bdb_state, tran_type *tran, int rrn,
unsigned long long oldgenid, unsigned long long newgenid,
int blobmap, int *bdberr);
/* NOTE(review): repeat declaration of bdb_prim_updvrfy_genid().  The same
 * prototype, differing only in the name of the first parameter, already
 * appears earlier in this header.  The two are compatible, so this is
 * harmless to the compiler, but one of them can be dropped. */
int bdb_prim_updvrfy_genid(bdb_state_type *bdb_handle, tran_type *tran,
void *olddta, int oldlen, void *newdta, int newdtaln,
int rrn, unsigned long long oldgenid,
unsigned long long *newgenid, int verifydta,
int participantstripeid, int use_new_genid,
int *bdberr);
int bdb_upd_genid(bdb_state_type *bdb_state, tran_type *tran, int dtanum,
int rrn, unsigned long long oldgenid,
unsigned long long newgenid, int has_blob_opt, int *bdberr);
int bdb_prim_addkey_genid(bdb_state_type *bdb_handle, tran_type *tran,
void *ixdta, int ixnum, int rrn,
unsigned long long genid, void *dta, int dtalen,
int isnull, int *bdberr);
int bdb_prim_delkey_genid(bdb_state_type *bdb_handle, tran_type *tran,
void *ixdta, int ixnum, int rrn,
unsigned long long genid, int isnull, int *bdberr);
int bdb_prim_updkey_genid(bdb_state_type *bdb_state, tran_type *tran, void *key,
int keylen, int ixnum, unsigned long long oldgenid,
unsigned long long genid, void *dta, int dtalen,
int isnull, int *bdberr);
int bdb_prim_upgrade(bdb_state_type *bdb_state, tran_type *tran, void *newdta,
int newdtaln, unsigned long long oldgenid, int *bdberr);
/* Callbacks for use with range delete. */
/* Form the requested key for a given record.
*
* The arguments are:
* void *record - the record to form a key for. tag routines
* tend to modify this in place, so not const
* size_t record_len - length of the record data
* void *index - buffer in which to form the index
* size_t index_len - length of the index
* int index_num - which index to form
* void *userptr - user supplied data
*
* The callback implementation should return:
* 0 - key formed successfully, go ahead and delete this record.
* -1 - halt the range delete operation with an error.
* -2 - do not delete this record, continue to look at other records.
* -3 - deadlock
*/
typedef int (*bdb_formkey_callback_t)(void *record, size_t record_len,
                                      void *index, size_t index_len,
                                      int index_num, void *userptr);
/* Called before a record is deleted; can be used to cancel the record
* deletion.
*
* Arguments:
* void *record - the record to form a key for. tag routines
* tend to modify this in place, so not const
* size_t record_len - length of the record data
* int rrn - rrn number of record
* unsigned long long genid - genid of record
* void *userptr - user supplied data
*
* The callback implementation should return:
* 0 - go ahead and delete this record.
* -1 - halt the range delete operation with an error.
* -2 - do not delete this record, continue to look at other records.
* -3 - deadlock
*/
typedef int (*bdb_pre_delete_callback_t)(void *record, size_t record_len,
                                         int rrn, unsigned long long genid,
                                         void *userptr);
/* Called after a record has been deleted.
*
* Arguments:
* void *record - the record to form a key for. tag routines
* tend to modify this in place, so not const
* size_t record_len - length of the record data
* int rrn - rrn number of record
* unsigned long long genid - genid of record
* void *userptr - user supplied data
*
* The callback implementation should return:
* 0 - go ahead and delete this record.
* -1 - halt the range delete operation with an error.
* -3 - deadlock
*/
typedef int (*bdb_post_delete_callback_t)(void *record, size_t record_len,
                                          int rrn, unsigned long long genid,
                                          void *userptr);
/* Range delete, built directly into bdblib for the best possible
 * performance.  It zooms through the given index and deletes all keys
 * between the given start and end points (start_key / end_key).
 *
 * formkey_callback is mandatory: it forms keys from the given data record so
 * that they can be deleted.  pre_callback and post_callback are optional and
 * can be NULL.
 *
 * Returns the number of records deleted, or -1 on error (check *bdberr).
 *
 * NOTE(review): count_deleted_ptr, count_not_deleted_ptr, max_records and
 * max_time_ms are not described here - presumably output tallies and
 * per-call limits.  Confirm against the implementation, including whether
 * the return value is still the deleted-record count.
 */
int bdb_prim_range_delete(
    bdb_state_type *bdb_handle, tran_type *tran, size_t dtalen, int index,
    const void *start_key, const void *end_key, size_t keylength,
    bdb_formkey_callback_t formkey_callback,
    bdb_pre_delete_callback_t pre_callback,
    bdb_post_delete_callback_t post_callback, void *userptr,
    int *count_deleted_ptr, int *count_not_deleted_ptr, int max_records,
    int max_time_ms, int *bdberr);
/* "Lite" operations give you direct access to bdb tables with minimum
 * overhead.
 *
 * Add / delete entry points.  bdb_lite_full_add and bdb_lite_delete take an
 * explicit keylen; bdb_lite_add and bdb_lite_exact_del do not. */
int bdb_lite_add(
    bdb_state_type *bdb_handle, tran_type *tran, void *dtaptr, int dtalen,
    void *key, int *bdberr);
int bdb_lite_full_add(
    bdb_state_type *bdb_handle, tran_type *tran, void *dtaptr, int dtalen,
    void *key, int keylen, int *bdberr);
int bdb_lite_exact_del(
    bdb_state_type *bdb_handle, tran_type *tran, void *key, int *bdberr);
int bdb_lite_delete(
    bdb_state_type *bdb_handle, tran_type *tran, void *key, int keylen,
    int *bdberr);
/* Lite exact-key fetches.
 *  - bdb_lite_exact_fetch and its _tran form take a caller-supplied buffer
 *    (fnddta, maxlen) plus an int *fndlen.
 *  - the _alloc and _var forms take void **fnddta instead; presumably the
 *    found data is handed back through it - TODO confirm who frees it.
 *  - every *_tran form additionally takes a tran_type *tran. */
int bdb_lite_exact_fetch(
    bdb_state_type *bdb_handle, void *key, void *fnddta, int maxlen,
    int *fndlen, int *bdberr);
int bdb_lite_exact_fetch_alloc(
    bdb_state_type *bdb_handle, void *key, void **fnddta, int *fndlen,
    int *bdberr);
int bdb_lite_exact_fetch_alloc_tran(
    bdb_state_type *bdb_handle, tran_type *tran, void *key, void **fnddta,
    int *fndlen, int *bdberr);
int bdb_lite_exact_fetch_tran(
    bdb_state_type *bdb_state, tran_type *tran, void *key, void *fnddta,
    int maxlen, int *fndlen, int *bdberr);
int bdb_lite_exact_var_fetch(
    bdb_state_type *bdb_handle, void *key, void **fnddta, int *fndlen,
    int *bdberr);
int bdb_lite_exact_var_fetch_tran(
    bdb_state_type *bdb_state, tran_type *tran, void *key, void **fnddta,
    int *fndlen, int *bdberr);
/* Lite key scans, forwards (_fwd) and backwards (_bwd), each with a *_tran
 * form that additionally takes a tran_type *tran.
 *
 * NOTE(review): presumably up to maxfnd keys, starting from firstkey, are
 * written to fndkeys and the count is stored in *numfnd - confirm against
 * the implementation. */
int bdb_lite_fetch_keys_fwd(
    bdb_state_type *bdb_state, void *firstkey, void *fndkeys, int maxfnd,
    int *numfnd, int *bdberr);
int bdb_lite_fetch_keys_fwd_tran(
    bdb_state_type *bdb_state, tran_type *tran, void *firstkey,
    void *fndkeys, int maxfnd, int *numfnd, int *bdberr);
int bdb_lite_fetch_keys_bwd(
    bdb_state_type *bdb_state, void *firstkey, void *fndkeys, int maxfnd,
    int *numfnd, int *bdberr);
int bdb_lite_fetch_keys_bwd_tran(
    bdb_state_type *bdb_state, tran_type *tran, void *firstkey,
    void *fndkeys, int maxfnd, int *numfnd, int *bdberr);
/* Lite fetches that take an input key with an explicit length
 * (key_in, klen_in) and an output area key_out; *fnd is an output flag or
 * count (NOTE(review): exact meaning not visible here - confirm).
 * bdb_lite_exact_fetch_full_tran additionally takes maxlen. */
int bdb_lite_fetch_partial(
    bdb_state_type *bdb_state, void *key_in, int klen_in, void *key_out,
    int *fnd, int *bdberr);
int bdb_lite_fetch_partial_tran(
    bdb_state_type *bdb_state, tran_type *tran, void *key_in, int klen_in,
    void *key_out, int *fnd, int *bdberr);
int bdb_lite_exact_fetch_full_tran(
    bdb_state_type *bdb_state, tran_type *tran, void *key_in, int klen_in,
    void *key_out, int maxlen, int *fnd, int *bdberr);
/* Queue operations are for queue tables - fifos with multiple consumers. */
enum {
    /* consumer limit, per its name */
    BDBQUEUE_MAX_CONSUMERS = 32
};
/* Pointer to an item in an ondisk queue; 16 bytes in total. */
struct bdb_queue_cursor {
    /* genid of item */
    uint64_t genid;
    /* recno of first fragment of item */
    uint32_t recno;
    /* must be zero */
    uint32_t reserved;
};
/* Guard, via the project's BB_COMPILE_TIME_ASSERT macro, that
 * struct bdb_queue_cursor stays exactly 16 bytes. */
BB_COMPILE_TIME_ASSERT(queue_cursor_size, sizeof(struct bdb_queue_cursor) == 16);
/* Mark a consumer as active or inactive.  This grabs the bdb write lock. */
int bdb_queue_consumer(
    bdb_state_type *bdb_state, int consumer, int active, int *bdberr);
/* Add an item (dta, dtalen) to the end of the queue.
 * NOTE(review): out_genid presumably receives the genid of the new item -
 * confirm against the implementation. */
int bdb_queue_add(
    bdb_state_type *bdb_state, tran_type *tran, const void *dta,
    size_t dtalen, int *bdberr, unsigned long long *out_genid);
/* Add/consume dummy ("goose") records to aid extent reclamation.  Winner of
 * the May 2006 "Most Absurd Hack" award. */
int bdb_queue_check_goose(
    bdb_state_type *bdb_state, tran_type *tran, int *bdberr);
int bdb_queue_add_goose(
    bdb_state_type *bdb_state, tran_type *tran, int *bdberr);
int bdb_queue_consume_goose(
    bdb_state_type *bdb_state, tran_type *tran, int *bdberr);
/* Get the first item unconsumed by this consumer number, AFTER the
 * previously found result (passed in through prevcursor).  On a successful
 * find *fnd will be set to point to memory that the caller must free.  The
 * actual item data will be at ((const char *)*fnd) + *fnddtaoff. */
struct bdb_queue_found;
int bdb_queue_get(
    bdb_state_type *bdb_state, tran_type *tran, int consumer,
    const struct bdb_queue_cursor *prevcursor, struct bdb_queue_found **fnd,
    size_t *fnddtalen, size_t *fnddtaoff, struct bdb_queue_cursor *fndcursor,
    long long *seq, int *bdberr);
/* Return the genid of a queue item that was retrieved by bdb_queue_get(). */
unsigned long long bdb_queue_item_genid(
    const struct bdb_queue_found *dta);
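/* Queue walking (see bdb_queue_walk): call a callback function for each item
 * on the queue.  The parameters to the callback (bdb_queue_walk_callback_t)
 * are: consumer number, item length, epoch time it was added, userptr.  The
 * constants that follow are the callback return codes and the walk flags. */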
/* queue walk callback return codes */
enum {
    BDB_QUEUE_WALK_CONTINUE = 0,
    BDB_QUEUE_WALK_STOP = 1,
    BDB_QUEUE_WALK_STOP_CONSUMER = 2
};
/* flags to affect the behaviour of the walkback function; each flag is a
 * distinct bit */
enum {
    BDB_QUEUE_WALK_KNOWN_CONSUMERS_ONLY = 1 << 0,
    BDB_QUEUE_WALK_FIRST_ONLY = 1 << 1,
    BDB_QUEUE_WALK_RESTART = 1 << 2
};
/* Callback type accepted by bdb_queuedb_stats.  Same parameters as
 * bdb_queue_walk_callback_t plus an extra depth argument. */
typedef int (*bdb_queue_stats_callback_t)(
    int consumern, size_t item_length, unsigned int epoch, unsigned int depth,
    void *userptr);
/* Queue statistics entry point driven by a bdb_queue_stats_callback_t;
 * userptr is the user pointer for that callback.
 * NOTE(review): how often the callback fires is not visible from this
 * header - confirm against the implementation. */
int bdb_queuedb_stats(
    bdb_state_type *bdb_state, bdb_queue_stats_callback_t callback,
    tran_type *tran, void *userptr, int *bdberr);
/* Callback type accepted by bdb_queue_walk: consumer number, item length,
 * epoch and the user pointer. */
typedef int (*bdb_queue_walk_callback_t)(
    int consumern, size_t item_length, unsigned int epoch, void *userptr);
/* Walk the queue, handing items to callback (a bdb_queue_walk_callback_t)
 * together with userptr.
 * NOTE(review): flags presumably takes the BDB_QUEUE_WALK_* flag values and
 * lastitem presumably carries the walk position between calls - confirm
 * against the implementation. */
int bdb_queue_walk(
    bdb_state_type *bdb_state, int flags, bbuint32_t *lastitem,
    bdb_queue_walk_callback_t callback, tran_type *tran, void *userptr,
    int *bdberr);
/* Debug aid: dump the entire queue to the stdio stream out. */
int bdb_queue_dump(
    bdb_state_type *bdb_state, FILE *out, int *bdberr);
/* Consume a queue item previously found by bdb_queue_get (passed in as
 * prevfnd). */
struct bdb_queue_found;
int bdb_queue_consume(
    bdb_state_type *bdb_state, tran_type *tran, int consumer,
    const struct bdb_queue_found *prevfnd, int *bdberr);
/* Work out the best page size to use for the given average item size
 * (avg_item_sz). */
int bdb_queue_best_pagesize(
    int avg_item_sz);
/* Get info about a previously found item: its data offset and data length
 * are reported through dtaoff and dtalen.  Note that fnd is declared
 * const void * here, not const struct bdb_queue_found *. */
void bdb_queue_get_found_info(
    const void *fnd, size_t *dtaoff, size_t *dtalen);
/* Get queue stats; the result is a pointer to a const
 * struct bdb_queue_stats. */
const struct bdb_queue_stats *bdb_queue_get_stats(
    bdb_state_type *bdb_state);
/* Dump the dta contents of bdb_handle to the stream sb. */
int bdb_dumpdta(
    bdb_state_type *bdb_handle, SBUF2 *sb, int *bdberr);
/* Debug dump routines. */
void bdb_dumpit(
    bdb_state_type *bdb_state);
void bdb_bulkdumpit(
    bdb_state_type *bdb_state);
/* Compare seqnum1 with seqnum2.
 *
 * Returns:
 *   0 if seqnum1 == seqnum2
 *   1 if seqnum1 >  seqnum2
 *  -1 if seqnum1 <  seqnum2
 *
 * inbdb_state is declared void * here rather than bdb_state_type *.
 */
int bdb_seqnum_compare(
    void *inbdb_state, seqnum_type *seqnum1, seqnum_type *seqnum2);