| /* |
| * Index, mapping scores to log positions. |
| * |
| * The index is made up of some number of index sections, each of |
| * which is typically stored on a different disk. The blocks in all the |
| * index sections are logically numbered, with each index section |
| * responsible for a range of blocks. Blocks are typically 8kB. |
| * |
| * Index Version 1: |
| * |
| * The N index blocks are treated as a giant hash table. The top 32 bits |
| * of score are used as the key for a lookup. Each index block holds |
| * one hash bucket, which is responsible for ceil(2^32 / N) of the key space. |
| * |
| * The index is sized so that a particular bucket is extraordinarily |
| * unlikely to overflow: assuming compressed data blocks are 4kB |
| * on disk, and assuming each block has a 40 byte index entry, |
| * the index data will be 1% of the total data. Since scores are essentially |
| * random, all buckets should be about the same fullness. |
| * A factor of 5 gives us a wide comfort boundary to account for |
| * random variation. So the index disk space should be 5% of the arena disk space. |
| * |
| * Problems with Index Version 1: |
| * |
| * Because the index size is chosen to handle the worst case load, |
| * the index is very sparse, especially when the Venti server is mostly empty. |
| * This has a few bad properties. |
| * |
| * Loading an index block (which typically requires a random disk seek) |
| * is a very expensive operation, yet it yields only a couple index entries. |
| * We're not making efficient use of the disk arm. |
| * |
| * Writing a block requires first checking to see if the block already |
| * exists on the server, which in turn requires an index lookup. When |
| * writing fresh data, these lookups will fail. The index entry cache |
| * cannot serve these, so they must go to disk, which is expensive. |
| * |
| * Thus both the reading and the writing of blocks are held back by |
| * the expense of accessing the index. |
| * |
| * Index Version 2: |
| * |
| * The index is sized to be exactly 2^M blocks. The index blocks are |
| * logically arranged in a (not exactly balanced) binary tree of depth at |
| * most M. The nodes are named by bit strings describing the path from |
| * the root to the node. The root is . (dot). The left child of the root is .0, |
| * the right child .1. The node you get to by starting at the root and going |
| * left right right left is .0110. At the beginning, there is only the root block. |
| * When a block with name .xxx fills, it splits into .xxx0 and .xxx1. |
| * All the index data is kept in the leaves of the tree. |
| * |
| * Index leaf blocks are laid out on disk by interpreting the bitstring as a |
| * binary fraction and multiplying by 2^M -- .0 is the first block, .1 is |
| * the block halfway into the index, .0110 is at position 6/16, and |
| * .xxx and .xxx0 map to the same block (but only one can be a leaf |
| * node at any given time, so this is okay!). A cheap implementation of |
| * this is to append zeros to the bit string to make it M bits long. That's |
| * the integer index block number. |
| * |
| * To find the leaf block that should hold a score, use the bits of the |
| * score one at a time to walk down the tree to a leaf node. If the tree |
| * has leaf nodes .0, .10, and .11, then score 0110101... ends up in bucket |
| * .0 while score 101110101... ends up in bucket .10. There is no leaf node |
| * .1 because it has split into .10 and .11. |
| * |
| * If we know which disk blocks are in use, we can reconstruct the interior |
| * of the tree: if .xxx1 is in use, then .xxx has been split. We keep an in-use |
| * bitmap of all index disk blocks to aid in reconstructing the interior of the |
| * tree. At one bit per index block, the bitmap is small enough to be kept |
| * in memory even on the largest of Venti servers. |
| * |
| * After the root block splits, the index blocks being used will always be |
| * at least half full (averaged across the entire index). So unlike Version 1, |
| * Index Version 2 is quite dense, alleviating the two problems above. |
| * Index block reads now return many index entries. More importantly, |
| * small servers can keep most of the index in the disk cache, making them |
| * more likely to handle negative lookups without going to disk. |
| * |
| * As the index becomes more full, Index Version 2's performance |
| * degrades gracefully into Index Version 1. V2 is really an optimization |
| * for little servers. |
| * |
| * Write Ordering for Index Version 2: |
| * |
| * Unlike Index Version 1, Version 2 must worry about write ordering |
| * within the index. What happens if the in-use bitmap is out of sync |
| * with the actual leaf nodes? What happens if .xxx splits into .xxx0 and |
| * .xxx1 but only one of the new blocks gets written to disk? |
| * |
| * We arrange that when .xxx splits, the .xxx1 block is written first, |
| * then the .xxx0 block, and finally the in-use bitmap entry for .xxx1. |
| * The split is committed by the writing of .xxx0. This ordering leaves |
| * two possible partial disk writes: |
| * |
| * (a) If .xxx1 is written but .xxx0 and the bitmap are not, then it's as if |
| * the split never happened -- we won't think .xxx1 is in use, and we |
| * won't go looking for it. |
| * |
| * (b) If .xxx1 and .xxx0 are written but the bitmap is not, then the first |
| * time we try to load .xxx, we'll get .xxx0 instead, realize the bitmap is |
| * out of date, and update the bitmap. |
| * |
 * Backwards Compatibility:
| * |
| * Because there are large Venti servers in use with Index V1, this code |
| * will read either index version, but when asked to create a new index, |
| * will only create V2. |
| */ |
| |
| #include "stdinc.h" |
| #include "dat.h" |
| #include "fns.h" |
| |
| static int bucklook(u8int *score, int type, u8int *data, int n); |
| static int writebucket(ISect *is, u32int buck, IBucket *ib, DBlock *b); |
| static int okibucket(IBucket *ib, ISect *is); |
| static int initindex1(Index*); |
| static ISect *initisect1(ISect *is); |
| static int splitiblock(Index *ix, DBlock *b, ISect *is, u32int buck, IBucket *ib); |
| |
| #define KEY(k,d) ((d) ? (k)>>(32-(d)) : 0) |
| |
| //static QLock indexlock; //ZZZ |
| |
| static char IndexMagic[] = "venti index configuration"; |
| |
| Index* |
| initindex(char *name, ISect **sects, int n) |
| { |
| IFile f; |
| Index *ix; |
| ISect *is; |
| u32int last, blocksize, tabsize; |
| int i; |
| |
| if(n <= 0){ |
| seterr(EOk, "no index sections to initialize index"); |
| return nil; |
| } |
| ix = MKZ(Index); |
| if(ix == nil){ |
| seterr(EOk, "can't initialize index: out of memory"); |
| freeindex(ix); |
| return nil; |
| } |
| |
| tabsize = sects[0]->tabsize; |
| if(partifile(&f, sects[0]->part, sects[0]->tabbase, tabsize) < 0) |
| return nil; |
| if(parseindex(&f, ix) < 0){ |
| freeifile(&f); |
| freeindex(ix); |
| return nil; |
| } |
| freeifile(&f); |
| if(namecmp(ix->name, name) != 0){ |
| seterr(ECorrupt, "mismatched index name: found %s expected %s", ix->name, name); |
| return nil; |
| } |
| if(ix->nsects != n){ |
| seterr(ECorrupt, "mismatched number index sections: found %d expected %d", n, ix->nsects); |
| freeindex(ix); |
| return nil; |
| } |
| ix->sects = sects; |
| last = 0; |
| blocksize = ix->blocksize; |
| for(i = 0; i < ix->nsects; i++){ |
| is = sects[i]; |
| if(namecmp(ix->name, is->index) != 0 |
| || is->blocksize != blocksize |
| || is->tabsize != tabsize |
| || namecmp(is->name, ix->smap[i].name) != 0 |
| || is->start != ix->smap[i].start |
| || is->stop != ix->smap[i].stop |
| || last != is->start |
| || is->start > is->stop){ |
| seterr(ECorrupt, "inconsistent index sections in %s", ix->name); |
| freeindex(ix); |
| return nil; |
| } |
| last = is->stop; |
| } |
| ix->tabsize = tabsize; |
| ix->buckets = last; |
| |
| if(initindex1(ix) < 0){ |
| freeindex(ix); |
| return nil; |
| } |
| |
| ix->arenas = MKNZ(Arena*, ix->narenas); |
| if(maparenas(ix->amap, ix->arenas, ix->narenas, ix->name) < 0){ |
| freeindex(ix); |
| return nil; |
| } |
| |
| return ix; |
| } |
| |
| static int |
| initindex1(Index *ix) |
| { |
| u32int buckets; |
| |
| switch(ix->version){ |
| case IndexVersion1: |
| ix->div = (((u64int)1 << 32) + ix->buckets - 1) / ix->buckets; |
| buckets = (((u64int)1 << 32) - 1) / ix->div + 1; |
| if(buckets != ix->buckets){ |
| seterr(ECorrupt, "inconsistent math for divisor and buckets in %s", ix->name); |
| return -1; |
| } |
| break; |
| |
| case IndexVersion2: |
| buckets = ix->buckets - ix->bitblocks; |
| if(ix->buckets < ix->bitblocks || (buckets&(buckets-1))) |
| seterr(ECorrupt, "bucket count not a power of two in %s", ix->name); |
| ix->maxdepth = u64log2(buckets); |
| ix->bitkeylog = u64log2(ix->blocksize*8); |
| ix->bitkeymask = (1<<ix->bitkeylog)-1; |
| break; |
| } |
| return 0; |
| } |
| |
| int |
| wbindex(Index *ix) |
| { |
| Fmt f; |
| ZBlock *b; |
| int i; |
| |
| if(ix->nsects == 0){ |
| seterr(EOk, "no sections in index %s", ix->name); |
| return -1; |
| } |
| b = alloczblock(ix->tabsize, 1); |
| if(b == nil){ |
| seterr(EOk, "can't write index configuration: out of memory"); |
| return -1; |
| } |
| fmtzbinit(&f, b); |
| if(outputindex(&f, ix) < 0){ |
| seterr(EOk, "can't make index configuration: table storage too small %d", ix->tabsize); |
| freezblock(b); |
| return -1; |
| } |
| for(i = 0; i < ix->nsects; i++){ |
| if(writepart(ix->sects[i]->part, ix->sects[i]->tabbase, b->data, ix->tabsize) < 0){ |
| seterr(EOk, "can't write index: %r"); |
| freezblock(b); |
| return -1; |
| } |
| } |
| freezblock(b); |
| |
| for(i = 0; i < ix->nsects; i++) |
| if(wbisect(ix->sects[i]) < 0) |
| return -1; |
| |
| return 0; |
| } |
| |
| /* |
| * index: IndexMagic '\n' version '\n' name '\n' blocksize '\n' [V2: bitblocks '\n'] sections arenas |
| * version, blocksize: u32int |
| * name: max. ANameSize string |
| * sections, arenas: AMap |
| */ |
| int |
| outputindex(Fmt *f, Index *ix) |
| { |
| if(fmtprint(f, "%s\n%ud\n%s\n%ud\n", IndexMagic, ix->version, ix->name, ix->blocksize) < 0 |
| || (ix->version==IndexVersion2 && fmtprint(f, "%ud\n", ix->bitblocks) < 0) |
| || outputamap(f, ix->smap, ix->nsects) < 0 |
| || outputamap(f, ix->amap, ix->narenas) < 0) |
| return -1; |
| return 0; |
| } |
| |
| int |
| parseindex(IFile *f, Index *ix) |
| { |
| AMapN amn; |
| u32int v; |
| char *s; |
| |
| /* |
| * magic |
| */ |
| s = ifileline(f); |
| if(s == nil || strcmp(s, IndexMagic) != 0){ |
| seterr(ECorrupt, "bad index magic for %s", f->name); |
| return -1; |
| } |
| |
| /* |
| * version |
| */ |
| if(ifileu32int(f, &v) < 0){ |
| seterr(ECorrupt, "syntax error: bad version number in %s", f->name); |
| return -1; |
| } |
| ix->version = v; |
| if(ix->version != IndexVersion1 && ix->version != IndexVersion2){ |
| seterr(ECorrupt, "bad version number in %s", f->name); |
| return -1; |
| } |
| |
| /* |
| * name |
| */ |
| if(ifilename(f, ix->name) < 0){ |
| seterr(ECorrupt, "syntax error: bad index name in %s", f->name); |
| return -1; |
| } |
| |
| /* |
| * block size |
| */ |
| if(ifileu32int(f, &v) < 0){ |
| seterr(ECorrupt, "syntax error: bad block size number in %s", f->name); |
| return -1; |
| } |
| ix->blocksize = v; |
| |
| if(ix->version == IndexVersion2){ |
| /* |
| * free bitmap size |
| */ |
| if(ifileu32int(f, &v) < 0){ |
| seterr(ECorrupt, "syntax error: bad bitmap size in %s", f->name); |
| return -1; |
| } |
| ix->bitblocks = v; |
| } |
| |
| if(parseamap(f, &amn) < 0) |
| return -1; |
| ix->nsects = amn.n; |
| ix->smap = amn.map; |
| |
| if(parseamap(f, &amn) < 0) |
| return -1; |
| ix->narenas = amn.n; |
| ix->amap = amn.map; |
| |
| return 0; |
| } |
| |
| /* |
| * initialize an entirely new index |
| */ |
| Index * |
| newindex(char *name, ISect **sects, int n) |
| { |
| Index *ix; |
| AMap *smap; |
| u64int nb; |
| u32int div, ub, xb, fb, start, stop, blocksize, tabsize; |
| int i, j, version; |
| |
| version = IndexVersion2; |
| |
| if(n < 1){ |
| seterr(EOk, "creating index with no index sections"); |
| return nil; |
| } |
| |
| /* |
| * compute the total buckets available in the index, |
| * and the total buckets which are used. |
| */ |
| nb = 0; |
| blocksize = sects[0]->blocksize; |
| tabsize = sects[0]->tabsize; |
| for(i = 0; i < n; i++){ |
| if(sects[i]->start != 0 || sects[i]->stop != 0 |
| || sects[i]->index[0] != '\0'){ |
| seterr(EOk, "creating new index using non-empty section %s", sects[i]->name); |
| return nil; |
| } |
| if(blocksize != sects[i]->blocksize){ |
| seterr(EOk, "mismatched block sizes in index sections"); |
| return nil; |
| } |
| if(tabsize != sects[i]->tabsize){ |
| seterr(EOk, "mismatched config table sizes in index sections"); |
| return nil; |
| } |
| nb += sects[i]->blocks; |
| } |
| |
| /* |
| * check for duplicate names |
| */ |
| for(i = 0; i < n; i++){ |
| for(j = i + 1; j < n; j++){ |
| if(namecmp(sects[i]->name, sects[j]->name) == 0){ |
| seterr(EOk, "duplicate section name %s for index %s", sects[i]->name, name); |
| return nil; |
| } |
| } |
| } |
| |
| if(nb >= ((u64int)1 << 32)){ |
| seterr(EBug, "index too large"); |
| return nil; |
| } |
| |
| div = 0; |
| fb = 0; |
| if(version == IndexVersion1){ |
| div = (((u64int)1 << 32) + nb - 1) / nb; |
| ub = (((u64int)1 << 32) - 1) / div + 1; |
| if(div < 100){ |
| seterr(EBug, "index divisor too coarse"); |
| return nil; |
| } |
| }else{ |
| fb = (nb+blocksize*8-1)/(blocksize*8); |
| for(ub=1; ub<=((nb-fb)>>1); ub<<=1) |
| ; |
| ub += fb; |
| } |
| if(ub > nb){ |
| seterr(EBug, "index initialization math wrong"); |
| return nil; |
| } |
| xb = nb - ub; |
| |
| /* |
| * initialize each of the index sections |
| * and the section map table |
| */ |
| smap = MKNZ(AMap, n); |
| if(smap == nil){ |
| seterr(EOk, "can't create new index: out of memory"); |
| return nil; |
| } |
| start = 0; |
| for(i = 0; i < n; i++){ |
| stop = start + sects[i]->blocks - xb / n; |
| if(i == n - 1) |
| stop = ub; |
| sects[i]->start = start; |
| sects[i]->stop = stop; |
| namecp(sects[i]->index, name); |
| |
| smap[i].start = start; |
| smap[i].stop = stop; |
| namecp(smap[i].name, sects[i]->name); |
| start = stop; |
| } |
| |
| /* |
| * initialize the index itself |
| */ |
| ix = MKZ(Index); |
| if(ix == nil){ |
| seterr(EOk, "can't create new index: out of memory"); |
| free(smap); |
| return nil; |
| } |
| ix->version = version; |
| namecp(ix->name, name); |
| ix->sects = sects; |
| ix->smap = smap; |
| ix->nsects = n; |
| ix->blocksize = blocksize; |
| ix->buckets = ub; |
| ix->tabsize = tabsize; |
| ix->div = div; |
| ix->bitblocks = fb; |
| |
| if(initindex1(ix) < 0){ |
| free(smap); |
| return nil; |
| } |
| |
| return ix; |
| } |
| |
| ISect* |
| initisect(Part *part) |
| { |
| ISect *is; |
| ZBlock *b; |
| int ok; |
| |
| b = alloczblock(HeadSize, 0); |
| if(b == nil || readpart(part, PartBlank, b->data, HeadSize) < 0){ |
| seterr(EAdmin, "can't read index section header: %r"); |
| return nil; |
| } |
| |
| is = MKZ(ISect); |
| if(is == nil){ |
| freezblock(b); |
| return nil; |
| } |
| is->part = part; |
| ok = unpackisect(is, b->data); |
| freezblock(b); |
| if(ok < 0){ |
| seterr(ECorrupt, "corrupted index section header: %r"); |
| freeisect(is); |
| return nil; |
| } |
| |
| if(is->version != ISectVersion){ |
| seterr(EAdmin, "unknown index section version %d", is->version); |
| freeisect(is); |
| return nil; |
| } |
| |
| return initisect1(is); |
| } |
| |
| ISect* |
| newisect(Part *part, char *name, u32int blocksize, u32int tabsize) |
| { |
| ISect *is; |
| u32int tabbase; |
| |
| is = MKZ(ISect); |
| if(is == nil) |
| return nil; |
| |
| namecp(is->name, name); |
| is->version = ISectVersion; |
| is->part = part; |
| is->blocksize = blocksize; |
| is->start = 0; |
| is->stop = 0; |
| tabbase = (PartBlank + HeadSize + blocksize - 1) & ~(blocksize - 1); |
| is->blockbase = (tabbase + tabsize + blocksize - 1) & ~(blocksize - 1); |
| is->blocks = is->part->size / blocksize - is->blockbase / blocksize; |
| |
| is = initisect1(is); |
| if(is == nil) |
| return nil; |
| |
| return is; |
| } |
| |
| /* |
| * initialize the computed parameters for an index |
| */ |
| static ISect* |
| initisect1(ISect *is) |
| { |
| u64int v; |
| |
| is->buckmax = (is->blocksize - IBucketSize) / IEntrySize; |
| is->blocklog = u64log2(is->blocksize); |
| if(is->blocksize != (1 << is->blocklog)){ |
| seterr(ECorrupt, "illegal non-power-of-2 bucket size %d\n", is->blocksize); |
| freeisect(is); |
| return nil; |
| } |
| partblocksize(is->part, is->blocksize); |
| is->tabbase = (PartBlank + HeadSize + is->blocksize - 1) & ~(is->blocksize - 1); |
| if(is->tabbase >= is->blockbase){ |
| seterr(ECorrupt, "index section config table overlaps bucket storage"); |
| freeisect(is); |
| return nil; |
| } |
| is->tabsize = is->blockbase - is->tabbase; |
| v = is->part->size & ~(u64int)(is->blocksize - 1); |
| if(is->blockbase + (u64int)is->blocks * is->blocksize != v){ |
| seterr(ECorrupt, "invalid blocks in index section %s", is->name); |
| //ZZZZZZZZZ |
| // freeisect(is); |
| // return nil; |
| } |
| |
| if(is->stop - is->start > is->blocks){ |
| seterr(ECorrupt, "index section overflows available space"); |
| freeisect(is); |
| return nil; |
| } |
| if(is->start > is->stop){ |
| seterr(ECorrupt, "invalid index section range"); |
| freeisect(is); |
| return nil; |
| } |
| |
| return is; |
| } |
| |
| int |
| wbisect(ISect *is) |
| { |
| ZBlock *b; |
| |
| b = alloczblock(HeadSize, 1); |
| if(b == nil) |
| //ZZZ set error? |
| return -1; |
| |
| if(packisect(is, b->data) < 0){ |
| seterr(ECorrupt, "can't make index section header: %r"); |
| freezblock(b); |
| return -1; |
| } |
| if(writepart(is->part, PartBlank, b->data, HeadSize) < 0){ |
| seterr(EAdmin, "can't write index section header: %r"); |
| freezblock(b); |
| return -1; |
| } |
| freezblock(b); |
| |
| return 0; |
| } |
| |
| void |
| freeisect(ISect *is) |
| { |
| if(is == nil) |
| return; |
| free(is); |
| } |
| |
| void |
| freeindex(Index *ix) |
| { |
| int i; |
| |
| if(ix == nil) |
| return; |
| free(ix->amap); |
| free(ix->arenas); |
| if(ix->sects) |
| for(i = 0; i < ix->nsects; i++) |
| freeisect(ix->sects[i]); |
| free(ix->sects); |
| free(ix->smap); |
| free(ix); |
| } |
| |
| /* |
| * write a clump to an available arena in the index |
| * and return the address of the clump within the index. |
| ZZZ question: should this distinguish between an arena |
| filling up and real errors writing the clump? |
| */ |
| u64int |
| writeiclump(Index *ix, Clump *c, u8int *clbuf) |
| { |
| u64int a; |
| int i; |
| |
| for(i = ix->mapalloc; i < ix->narenas; i++){ |
| a = writeaclump(ix->arenas[i], c, clbuf); |
| if(a != TWID64) |
| return a + ix->amap[i].start; |
| } |
| |
| seterr(EAdmin, "no space left in arenas"); |
| return TWID64; |
| } |
| |
| /* |
| * convert an arena index to an relative arena address |
| */ |
| Arena* |
| amapitoa(Index *ix, u64int a, u64int *aa) |
| { |
| int i, r, l, m; |
| |
| l = 1; |
| r = ix->narenas - 1; |
| while(l <= r){ |
| m = (r + l) / 2; |
| if(ix->amap[m].start <= a) |
| l = m + 1; |
| else |
| r = m - 1; |
| } |
| l--; |
| |
| if(a > ix->amap[l].stop){ |
| for(i=0; i<ix->narenas; i++) |
| print("arena %d: %llux - %llux\n", i, ix->amap[i].start, ix->amap[i].stop); |
| print("want arena %d for %llux\n", l, a); |
| seterr(ECrash, "unmapped address passed to amapitoa"); |
| return nil; |
| } |
| |
| if(ix->arenas[l] == nil){ |
| seterr(ECrash, "unmapped arena selected in amapitoa"); |
| return nil; |
| } |
| *aa = a - ix->amap[l].start; |
| return ix->arenas[l]; |
| } |
| |
| int |
| iaddrcmp(IAddr *ia1, IAddr *ia2) |
| { |
| return ia1->type != ia2->type |
| || ia1->size != ia2->size |
| || ia1->blocks != ia2->blocks |
| || ia1->addr != ia2->addr; |
| } |
| |
| /* |
| * lookup the score in the partition |
| * |
| * nothing needs to be explicitly locked: |
| * only static parts of ix are used, and |
| * the bucket is locked by the DBlock lock. |
| */ |
| int |
| loadientry(Index *ix, u8int *score, int type, IEntry *ie) |
| { |
| ISect *is; |
| DBlock *b; |
| IBucket ib; |
| u32int buck; |
| int h, ok; |
| |
| ok = -1; |
| |
| qlock(&stats.lock); |
| stats.indexreads++; |
| qunlock(&stats.lock); |
| b = loadibucket(ix, score, &is, &buck, &ib); |
| if(b == nil) |
| return -1; |
| |
| if(okibucket(&ib, is) < 0) |
| goto out; |
| |
| h = bucklook(score, type, ib.data, ib.n); |
| if(h & 1){ |
| h ^= 1; |
| unpackientry(ie, &ib.data[h]); |
| ok = 0; |
| goto out; |
| } |
| |
| out: |
| putdblock(b); |
| return ok; |
| } |
| |
| /* |
| * insert or update an index entry into the appropriate bucket |
| */ |
| int |
| storeientry(Index *ix, IEntry *ie) |
| { |
| ISect *is; |
| DBlock *b; |
| IBucket ib; |
| u32int buck; |
| int h, ok; |
| |
| ok = 0; |
| |
| qlock(&stats.lock); |
| stats.indexwreads++; |
| qunlock(&stats.lock); |
| |
| top: |
| b = loadibucket(ix, ie->score, &is, &buck, &ib); |
| if(b == nil) |
| return -1; |
| |
| if(okibucket(&ib, is) < 0) |
| goto out; |
| |
| h = bucklook(ie->score, ie->ia.type, ib.data, ib.n); |
| if(h & 1){ |
| h ^= 1; |
| dirtydblock(b, DirtyIndex); |
| packientry(ie, &ib.data[h]); |
| ok = writebucket(is, buck, &ib, b); |
| goto out; |
| } |
| |
| if(ib.n < is->buckmax){ |
| dirtydblock(b, DirtyIndex); |
| memmove(&ib.data[h + IEntrySize], &ib.data[h], ib.n * IEntrySize - h); |
| ib.n++; |
| |
| packientry(ie, &ib.data[h]); |
| ok = writebucket(is, buck, &ib, b); |
| goto out; |
| } |
| |
| /* block is full -- not supposed to happen in V1 */ |
| if(ix->version == IndexVersion1){ |
| seterr(EAdmin, "index bucket %ud on %s is full\n", buck, is->part->name); |
| ok = -1; |
| goto out; |
| } |
| |
| /* in V2, split the block and try again; splitiblock puts b */ |
| if(splitiblock(ix, b, is, buck, &ib) < 0) |
| return -1; |
| goto top; |
| |
| out: |
| putdblock(b); |
| return ok; |
| } |
| |
/*
 * pack a bucket back into its (already dirty) cache block.
 * the dirty-block cache layer is responsible for writing it to
 * disk, which is why the direct writepart call is commented out.
 */
static int
writebucket(ISect *is, u32int buck, IBucket *ib, DBlock *b)
{
	/* caller must have marked the block dirty before calling */
	assert(b->dirty == DirtyIndex || b->dirty == DirtyIndexSplit);

	if(buck >= is->blocks){
		seterr(EAdmin, "index write out of bounds: %d >= %d\n",
				buck, is->blocks);
		return -1;
	}
	qlock(&stats.lock);
	stats.indexwrites++;
	qunlock(&stats.lock);
	packibucket(ib, b->data);
//	return writepart(is->part, is->blockbase + ((u64int)buck << is->blocklog), b->data, is->blocksize);
	return 0;
}
| |
| static int |
| okibucket(IBucket *ib, ISect *is) |
| { |
| if(ib->n <= is->buckmax) |
| return 0; |
| |
| seterr(EICorrupt, "corrupted disk index bucket: n=%ud max=%ud, depth=%lud range=[%lud,%lud)", |
| ib->n, is->buckmax, ib->depth, is->start, is->stop); |
| return -1; |
| } |
| |
| /* |
| * look for score within data; |
| * return 1 | byte index of matching index, |
| * or 0 | index of least element > score |
| */ |
| static int |
| bucklook(u8int *score, int otype, u8int *data, int n) |
| { |
| int i, r, l, m, h, c, cc, type; |
| |
| type = vttodisktype(otype); |
| l = 0; |
| r = n - 1; |
| while(l <= r){ |
| m = (r + l) >> 1; |
| h = m * IEntrySize; |
| for(i = 0; i < VtScoreSize; i++){ |
| c = score[i]; |
| cc = data[h + i]; |
| if(c != cc){ |
| if(c > cc) |
| l = m + 1; |
| else |
| r = m - 1; |
| goto cont; |
| } |
| } |
| cc = data[h + IEntryTypeOff]; |
| if(type != cc){ |
| if(type > cc) |
| l = m + 1; |
| else |
| r = m - 1; |
| goto cont; |
| } |
| return h | 1; |
| cont:; |
| } |
| |
| return l * IEntrySize; |
| } |
| |
| /* |
| * compare two IEntries; consistent with bucklook |
| */ |
| int |
| ientrycmp(const void *vie1, const void *vie2) |
| { |
| u8int *ie1, *ie2; |
| int i, v1, v2; |
| |
| ie1 = (u8int*)vie1; |
| ie2 = (u8int*)vie2; |
| for(i = 0; i < VtScoreSize; i++){ |
| v1 = ie1[i]; |
| v2 = ie2[i]; |
| if(v1 != v2){ |
| if(v1 < v2) |
| return -1; |
| return 0; |
| } |
| } |
| v1 = ie1[IEntryTypeOff]; |
| v2 = ie2[IEntryTypeOff]; |
| if(v1 != v2){ |
| if(v1 < v2) |
| return -1; |
| return 0; |
| } |
| return -1; |
| } |
| |
| /* |
| * find the number of the index section holding bucket #buck |
| */ |
| static int |
| indexsect0(Index *ix, u32int buck) |
| { |
| int r, l, m; |
| |
| l = 1; |
| r = ix->nsects - 1; |
| while(l <= r){ |
| m = (r + l) >> 1; |
| if(ix->sects[m]->start <= buck) |
| l = m + 1; |
| else |
| r = m - 1; |
| } |
| return l - 1; |
| } |
| |
| /* |
| * load the index block at bucket #buck |
| */ |
| static DBlock* |
| loadibucket0(Index *ix, u32int buck, ISect **pis, u32int *pbuck, IBucket *ib, int read) |
| { |
| ISect *is; |
| DBlock *b; |
| |
| is = ix->sects[indexsect0(ix, buck)]; |
| if(buck < is->start || is->stop <= buck){ |
| seterr(EAdmin, "index lookup out of range: %ud not found in index\n", buck); |
| return nil; |
| } |
| |
| buck -= is->start; |
| if((b = getdblock(is->part, is->blockbase + ((u64int)buck << is->blocklog), read)) == nil) |
| return nil; |
| |
| if(pis) |
| *pis = is; |
| if(pbuck) |
| *pbuck = buck; |
| if(ib) |
| unpackibucket(ib, b->data); |
| return b; |
| } |
| |
| /* |
| * find the number of the index section holding score |
| */ |
| static int |
| indexsect1(Index *ix, u8int *score) |
| { |
| return indexsect0(ix, hashbits(score, 32) / ix->div); |
| } |
| |
| /* |
| * load the index block responsible for score. |
| */ |
| static DBlock* |
| loadibucket1(Index *ix, u8int *score, ISect **pis, u32int *pbuck, IBucket *ib) |
| { |
| return loadibucket0(ix, hashbits(score, 32)/ix->div, pis, pbuck, ib, 1); |
| } |
| |
| static u32int |
| keytobuck(Index *ix, u32int key, int d) |
| { |
| /* clear all but top d bits; can't depend on boundary case shifts */ |
| if(d == 0) |
| key = 0; |
| else if(d != 32) |
| key &= ~((1<<(32-d))-1); |
| |
| /* truncate to maxdepth bits */ |
| if(ix->maxdepth != 32) |
| key >>= 32 - ix->maxdepth; |
| |
| return ix->bitblocks + key; |
| } |
| |
| /* |
| * to check whether .xxx has split, check whether .xxx1 is in use. |
| * it might be worth caching the block for future lookups, but for now |
| * let's assume the dcache is good enough. |
| */ |
| static int |
| bitmapop(Index *ix, u32int key, int d, int set) |
| { |
| DBlock *b; |
| int inuse; |
| u32int key1, buck1; |
| |
| if(d >= ix->maxdepth) |
| return 0; |
| |
| |
| /* construct .xxx1 in bucket number format */ |
| key1 = key | (1<<(32-d-1)); |
| buck1 = keytobuck(ix, key1, d+1); |
| |
| if(0) fprint(2, "key %d/%0*ub key1 %d/%0*ub buck1 %08ux\n", |
| d, d, KEY(key, d), d+1, d+1, KEY(key1, d+1), buck1); |
| |
| /* check whether buck1 is in use */ |
| |
| if((b = loadibucket0(ix, buck1 >> ix->bitkeylog, nil, nil, nil, 1)) == nil){ |
| seterr(ECorrupt, "cannot load in-use bitmap block"); |
| fprint(2, "loadibucket: %r\n"); |
| return -1; |
| } |
| if(0) fprint(2, "buck1 %08ux bitkeymask %08ux bitkeylog %d\n", buck1, ix->bitkeymask, ix->bitkeylog); |
| buck1 &= ix->bitkeymask; |
| inuse = ((u32int*)b->data)[buck1>>5] & (1<<(buck1&31)); |
| if(set && !inuse){ |
| dirtydblock(b, DirtyIndexBitmap); |
| ((u32int*)b->data)[buck1>>5] |= (1<<(buck1&31)); |
| } |
| putdblock(b); |
| return inuse; |
| } |
| |
/* has the node named by the top d bits of key split?  -1 on error */
static int
issplit(Index *ix, u32int key, int d)
{
	return bitmapop(ix, key, d, 0);
}
| |
/* record in the bitmap that the node named by the top d bits of key has split */
static int
marksplit(Index *ix, u32int key, int d)
{
	return bitmapop(ix, key, d, 1);
}
| |
| /* |
| * find the number of the index section holding score. |
| * it's not terrible to be wrong once in a while, so we just |
| * do what the bitmap tells us and don't worry about the |
| * bitmap being out of date. |
| */ |
| static int |
| indexsect2(Index *ix, u8int *score) |
| { |
| u32int key; |
| int d, is; |
| |
| key = hashbits(score, 32); |
| for(d=0; d<=ix->maxdepth; d++){ |
| is = issplit(ix, key, d); |
| if(is == -1) |
| return 0; /* must return something valid! */ |
| if(!is) |
| break; |
| } |
| |
| if(d > ix->maxdepth){ |
| seterr(EBug, "index bitmap inconsistent with maxdepth"); |
| return 0; /* must return something valid! */ |
| } |
| |
| return indexsect0(ix, keytobuck(ix, key, d)); |
| } |
| |
| /* |
| * load the index block responsible for score. |
| * (unlike indexsect2, must be correct always.) |
| */ |
| static DBlock* |
| loadibucket2(Index *ix, u8int *score, ISect **pis, u32int *pbuck, IBucket *ib) |
| { |
| u32int key; |
| int d, try, is; |
| DBlock *b; |
| |
| for(try=0; try<32; try++){ |
| key = hashbits(score, 32); |
| for(d=0; d<=ix->maxdepth; d++){ |
| is = issplit(ix, key, d); |
| if(is == -1) |
| return nil; |
| if(!is) |
| break; |
| } |
| if(d > ix->maxdepth){ |
| seterr(EBug, "index bitmap inconsistent with maxdepth"); |
| return nil; |
| } |
| |
| if((b = loadibucket0(ix, keytobuck(ix, key, d), pis, pbuck, ib, 1)) == nil) |
| return nil; |
| |
| if(ib->depth == d) |
| return b; |
| |
| if(ib->depth < d){ |
| fprint(2, "loaded block %ud for %d/%0*ub got %d/%0*ub\n", |
| *pbuck, d, d, KEY(key,d), ib->depth, ib->depth, KEY(key, ib->depth)); |
| seterr(EBug, "index block has smaller depth than expected -- cannot happen"); |
| putdblock(b); |
| return nil; |
| } |
| |
| /* |
| * ib->depth > d, meaning the bitmap was out of date. |
| * fix the bitmap and try again. if we actually updated |
| * the bitmap in splitiblock, this would only happen if |
| * venti crashed at an inopportune moment. but this way |
| * the code gets tested more. |
| */ |
| if(0) fprint(2, "update bitmap: %d/%0*ub is split\n", d, d, KEY(key,d)); |
| putdblock(b); |
| if(marksplit(ix, key, d) < 0) |
| return nil; |
| } |
| seterr(EBug, "loadibucket2 failed to sync bitmap with disk!"); |
| return nil; |
| } |
| |
/*
 * split the full leaf block b (bucket #buck in section is, with
 * unpacked header *ib) into two blocks of depth+1, following the
 * write-ordering rules described at the top of this file.
 * consumes the reference to b on every path.  returns 0 on
 * success (the caller retries the insert), -1 on error.
 */
static int
splitiblock(Index *ix, DBlock *b, ISect *is, u32int buck, IBucket *ib)
{
	int i, d;
	u8int *score;
	u32int buck0, buck1, key0, key1, key, dmask;
	DBlock *b0, *b1;
	IBucket ib0, ib1;

	/* a node at the bottom of the tree cannot split further */
	if(ib->depth == ix->maxdepth){
		if(0) fprint(2, "depth=%d == maxdepth\n", ib->depth);
		seterr(EAdmin, "index bucket %ud on %s is full\n", buck, is->part->name);
		putdblock(b);
		return -1;
	}

	/* convert section-relative bucket to tree block number */
	buck = is->start+buck - ix->bitblocks;
	d = ib->depth+1;
	/* .xxx keeps block buck0 (as .xxx0); .xxx1 gets block buck1 */
	buck0 = buck;
	buck1 = buck0 | (1<<(ix->maxdepth-d));
	/* node names as 32-bit keys: tree block number in the top maxdepth bits */
	if(ix->maxdepth == 32){
		key0 = buck0;
		key1 = buck1;
	}else{
		key0 = buck0 << (32-ix->maxdepth);
		key1 = buck1 << (32-ix->maxdepth);
	}
	/* back to absolute bucket numbers */
	buck0 += ix->bitblocks;
	buck1 += ix->bitblocks;
	USED(buck0);
	USED(key1);

	/* mask selecting the top d bits of a key */
	if(d == 32)
		dmask = TWID32;
	else
		dmask = TWID32 ^ ((1<<(32-d))-1);

	/*
	 * Since we hold the lock for b, the bitmap
	 * thinks buck1 doesn't exist, and the bit
	 * for buck1 can't be updated without first
	 * locking and splitting b, it's safe to try to
	 * acquire the lock on buck1 without dropping b.
	 * No one else will go after it too.
	 *
	 * Also, none of the rest of the code ever locks
	 * more than one block at a time, so it's okay if
	 * we do.
	 */
	if((b1 = loadibucket0(ix, buck1, nil, nil, &ib1, 0)) == nil){
		putdblock(b);
		return -1;
	}
	b0 = b;
	ib0 = *ib;

	/*
	 * Funny locking going on here -- see dirtydblock.
	 * We must putdblock(b1) before putdblock(b0).
	 */
	dirtydblock(b0, DirtyIndex);
	dirtydblock(b1, DirtyIndexSplit);

	/*
	 * Split the block contents.
	 * The block is already sorted, so it's pretty easy:
	 * the first part stays, the second part goes to b1.
	 */
	ib0.n = 0;
	ib0.depth = d;
	ib1.n = 0;
	ib1.depth = d;
	/* find the first entry whose top d key bits no longer match .xxx0 */
	for(i=0; i<ib->n; i++){
		score = ib->data+i*IEntrySize;
		key = hashbits(score, 32);
		if((key&dmask) != key0)
			break;
	}
	ib0.n = i;
	ib1.n = ib->n - ib0.n;
	memmove(ib1.data, ib0.data+ib0.n*IEntrySize, ib1.n*IEntrySize);
	if(0) fprint(2, "splitiblock %d in %d/%0*ub => %d in %d/%0*ub + %d in %d/%0*ub\n",
		ib->n, d-1, d-1, key0>>(32-d+1),
		ib0.n, d, d, key0>>(32-d),
		ib1.n, d, d, key1>>(32-d));

	packibucket(&ib0, b0->data);
	packibucket(&ib1, b1->data);

	/* order matters! see comment above. */
	putdblock(b1);
	putdblock(b0);

	/*
	 * let the recovery code take care of updating the bitmap.
	 */

	qlock(&stats.lock);
	stats.indexsplits++;
	qunlock(&stats.lock);

	return 0;
}
| |
| int |
| indexsect(Index *ix, u8int *score) |
| { |
| if(ix->version == IndexVersion1) |
| return indexsect1(ix, score); |
| return indexsect2(ix, score); |
| } |
| |
| DBlock* |
| loadibucket(Index *ix, u8int *score, ISect **pis, u32int *pbuck, IBucket *ib) |
| { |
| if(ix->version == IndexVersion1) |
| return loadibucket1(ix, score, pis, pbuck, ib); |
| return loadibucket2(ix, score, pis, pbuck, ib); |
| } |
| |
| |