blob: 07bf81c8643ab754b9f88ad430933b1c686ddb07 [file] [log] [blame]
/*
* Index, mapping scores to log positions.
*
* The index is made up of some number of index sections, each of
* which is typically stored on a different disk. The blocks in all the
* index sections are logically numbered, with each index section
* responsible for a range of blocks. Blocks are typically 8kB.
*
* The N index blocks are treated as a giant hash table. The top 32 bits
* of score are used as the key for a lookup. Each index block holds
* one hash bucket, which is responsible for ceil(2^32 / N) of the key space.
*
* The index is sized so that a particular bucket is extraordinarily
* unlikely to overflow: assuming compressed data blocks are 4kB
* on disk, and assuming each block has a 40 byte index entry,
* the index data will be 1% of the total data. Since scores are essentially
* random, all buckets should be about the same fullness.
* A factor of 5 gives us a wide comfort boundary to account for
* random variation. So the index disk space should be 5% of the arena disk space.
*/
#include "stdinc.h"
#include "dat.h"
#include "fns.h"
static int initindex1(Index*);
static ISect *initisect1(ISect *is);
#define KEY(k,d) ((d) ? (k)>>(32-(d)) : 0)
static char IndexMagic[] = "venti index configuration";
Index*
initindex(char *name, ISect **sects, int n)
{
IFile f;
Index *ix;
ISect *is;
u32int last, blocksize, tabsize;
int i;
if(n <= 0){
fprint(2, "bad n\n");
seterr(EOk, "no index sections to initialize index");
return nil;
}
ix = MKZ(Index);
if(ix == nil){
fprint(2, "no mem\n");
seterr(EOk, "can't initialize index: out of memory");
freeindex(ix);
return nil;
}
tabsize = sects[0]->tabsize;
if(partifile(&f, sects[0]->part, sects[0]->tabbase, tabsize) < 0)
return nil;
if(parseindex(&f, ix) < 0){
freeifile(&f);
freeindex(ix);
return nil;
}
freeifile(&f);
if(namecmp(ix->name, name) != 0){
seterr(ECorrupt, "mismatched index name: found %s expected %s", ix->name, name);
return nil;
}
if(ix->nsects != n){
seterr(ECorrupt, "mismatched number index sections: found %d expected %d", n, ix->nsects);
freeindex(ix);
return nil;
}
ix->sects = sects;
last = 0;
blocksize = ix->blocksize;
for(i = 0; i < ix->nsects; i++){
is = sects[i];
if(namecmp(is->index, ix->name) != 0) {
seterr(ECorrupt, "%s: index name is %s, not %s",
sects[i]->part->name, is->index, ix->name);
bad:
freeindex(ix);
return nil;
}
if(is->blocksize != blocksize) {
seterr(ECorrupt, "%s: blocksize is %d, not %d",
sects[i]->part->name, (int)is->blocksize, (int)blocksize);
goto bad;
}
if(is->tabsize != tabsize) {
seterr(ECorrupt, "%s: tabsize is %d, not %d",
sects[i]->part->name, (int)is->tabsize, (int)tabsize);
goto bad;
}
if(namecmp(is->name, ix->smap[i].name) != 0) {
seterr(ECorrupt, "%s: name is %s, not %s",
sects[i]->part->name, is->name, ix->smap[i].name);
goto bad;
}
if(is->start != ix->smap[i].start || is->stop != ix->smap[i].stop) {
seterr(ECorrupt, "%s: range is %lld,%lld, not %lld,%lld",
sects[i]->part->name, is->start, is->stop,
ix->smap[i].start, ix->smap[i].stop);
goto bad;
}
if(is->start > is->stop) {
seterr(ECorrupt, "%s: invalid range %lld,%lld",
sects[i]->part->name, is->start, is->stop);
goto bad;
}
if(is->start != last || is->start > is->stop) {
seterr(ECorrupt, "%s: range %lld-%lld, but last section ended at %lld",
sects[i]->part->name, is->start, is->stop, last);
goto bad;
}
last = is->stop;
}
ix->tabsize = tabsize;
ix->buckets = last;
if(initindex1(ix) < 0){
freeindex(ix);
return nil;
}
ix->arenas = MKNZ(Arena*, ix->narenas);
if(maparenas(ix->amap, ix->arenas, ix->narenas, ix->name) < 0){
freeindex(ix);
return nil;
}
return ix;
}
static int
initindex1(Index *ix)
{
u32int buckets;
ix->div = (((u64int)1 << 32) + ix->buckets - 1) / ix->buckets;
buckets = (((u64int)1 << 32) - 1) / ix->div + 1;
if(buckets != ix->buckets){
seterr(ECorrupt, "inconsistent math for divisor and buckets in %s", ix->name);
return -1;
}
return 0;
}
int
wbindex(Index *ix)
{
Fmt f;
ZBlock *b;
int i;
if(ix->nsects == 0){
seterr(EOk, "no sections in index %s", ix->name);
return -1;
}
b = alloczblock(ix->tabsize, 1, ix->blocksize);
if(b == nil){
seterr(EOk, "can't write index configuration: out of memory");
return -1;
}
fmtzbinit(&f, b);
if(outputindex(&f, ix) < 0){
seterr(EOk, "can't make index configuration: table storage too small %d", ix->tabsize);
freezblock(b);
return -1;
}
for(i = 0; i < ix->nsects; i++){
if(writepart(ix->sects[i]->part, ix->sects[i]->tabbase, b->data, ix->tabsize) < 0
|| flushpart(ix->sects[i]->part) < 0){
seterr(EOk, "can't write index: %r");
freezblock(b);
return -1;
}
}
freezblock(b);
for(i = 0; i < ix->nsects; i++)
if(wbisect(ix->sects[i]) < 0)
return -1;
return 0;
}
/*
* index: IndexMagic '\n' version '\n' name '\n' blocksize '\n' [V2: bitblocks '\n'] sections arenas
* version, blocksize: u32int
* name: max. ANameSize string
* sections, arenas: AMap
*/
int
outputindex(Fmt *f, Index *ix)
{
if(fmtprint(f, "%s\n%ud\n%s\n%ud\n", IndexMagic, ix->version, ix->name, ix->blocksize) < 0
|| outputamap(f, ix->smap, ix->nsects) < 0
|| outputamap(f, ix->amap, ix->narenas) < 0)
return -1;
return 0;
}
int
parseindex(IFile *f, Index *ix)
{
AMapN amn;
u32int v;
char *s;
/*
* magic
*/
s = ifileline(f);
if(s == nil || strcmp(s, IndexMagic) != 0){
seterr(ECorrupt, "bad index magic for %s", f->name);
return -1;
}
/*
* version
*/
if(ifileu32int(f, &v) < 0){
seterr(ECorrupt, "syntax error: bad version number in %s", f->name);
return -1;
}
ix->version = v;
if(ix->version != IndexVersion){
seterr(ECorrupt, "bad version number in %s", f->name);
return -1;
}
/*
* name
*/
if(ifilename(f, ix->name) < 0){
seterr(ECorrupt, "syntax error: bad index name in %s", f->name);
return -1;
}
/*
* block size
*/
if(ifileu32int(f, &v) < 0){
seterr(ECorrupt, "syntax error: bad block size number in %s", f->name);
return -1;
}
ix->blocksize = v;
if(parseamap(f, &amn) < 0)
return -1;
ix->nsects = amn.n;
ix->smap = amn.map;
if(parseamap(f, &amn) < 0)
return -1;
ix->narenas = amn.n;
ix->amap = amn.map;
return 0;
}
/*
* initialize an entirely new index
*/
Index *
newindex(char *name, ISect **sects, int n)
{
Index *ix;
AMap *smap;
u64int nb;
u32int div, ub, xb, start, stop, blocksize, tabsize;
int i, j;
if(n < 1){
seterr(EOk, "creating index with no index sections");
return nil;
}
/*
* compute the total buckets available in the index,
* and the total buckets which are used.
*/
nb = 0;
blocksize = sects[0]->blocksize;
tabsize = sects[0]->tabsize;
for(i = 0; i < n; i++){
/*
* allow index, start, and stop to be set if index is correct
* and start and stop are what we would have picked.
* this allows calling fmtindex to reformat the index after
* replacing a bad index section with a freshly formatted one.
* start and stop are checked below.
*/
if(sects[i]->index[0] != '\0' && strcmp(sects[i]->index, name) != 0){
seterr(EOk, "creating new index using non-empty section %s", sects[i]->name);
return nil;
}
if(blocksize != sects[i]->blocksize){
seterr(EOk, "%s has block size %d, but %s has %d",
sects[0]->part->name, (int)blocksize,
sects[i]->part->name, (int)sects[i]->blocksize);
return nil;
}
if(tabsize != sects[i]->tabsize){
seterr(EOk, "%s has table size %d, but %s has %d",
sects[0]->part->name, (int)tabsize,
sects[i]->part->name, (int)sects[i]->tabsize);
return nil;
}
nb += sects[i]->blocks;
}
/*
* check for duplicate names
*/
for(i = 0; i < n; i++){
for(j = i + 1; j < n; j++){
if(namecmp(sects[i]->name, sects[j]->name) == 0){
seterr(EOk, "%s and %s both have section name %s",
sects[i]->part->name,
sects[j]->part->name,
sects[i]->name);
return nil;
}
}
}
if(nb >= ((u64int)1 << 32)){
fprint(2, "%s: index is 2^32 blocks or more; ignoring some of it\n",
argv0);
nb = ((u64int)1 << 32) - 1;
}
div = (((u64int)1 << 32) + nb - 1) / nb;
if(div < 100){
fprint(2, "%s: index divisor %d too coarse; "
"index larger than needed, ignoring some of it\n",
argv0, div);
div = 100;
nb = (((u64int)1 << 32) - 1) / (100 - 1);
}
ub = (((u64int)1 << 32) - 1) / div + 1;
if(ub > nb){
seterr(EBug, "index initialization math wrong");
return nil;
}
xb = nb - ub;
/*
* initialize each of the index sections
* and the section map table
*/
smap = MKNZ(AMap, n);
if(smap == nil){
seterr(EOk, "can't create new index: out of memory");
return nil;
}
start = 0;
for(i = 0; i < n; i++){
stop = start + sects[i]->blocks - xb / n;
if(i == n - 1)
stop = ub;
if(sects[i]->start != 0 || sects[i]->stop != 0)
if(sects[i]->start != start || sects[i]->stop != stop){
seterr(EOk, "creating new index using non-empty section %s", sects[i]->name);
return nil;
}
sects[i]->start = start;
sects[i]->stop = stop;
namecp(sects[i]->index, name);
smap[i].start = start;
smap[i].stop = stop;
namecp(smap[i].name, sects[i]->name);
start = stop;
}
/*
* initialize the index itself
*/
ix = MKZ(Index);
if(ix == nil){
seterr(EOk, "can't create new index: out of memory");
free(smap);
return nil;
}
ix->version = IndexVersion;
namecp(ix->name, name);
ix->sects = sects;
ix->smap = smap;
ix->nsects = n;
ix->blocksize = blocksize;
ix->buckets = ub;
ix->tabsize = tabsize;
ix->div = div;
if(initindex1(ix) < 0){
free(smap);
return nil;
}
return ix;
}
ISect*
initisect(Part *part)
{
ISect *is;
ZBlock *b;
int ok;
b = alloczblock(HeadSize, 0, 0);
if(b == nil || readpart(part, PartBlank, b->data, HeadSize) < 0){
seterr(EAdmin, "can't read index section header: %r");
return nil;
}
is = MKZ(ISect);
if(is == nil){
freezblock(b);
return nil;
}
is->part = part;
ok = unpackisect(is, b->data);
freezblock(b);
if(ok < 0){
seterr(ECorrupt, "corrupted index section header: %r");
freeisect(is);
return nil;
}
if(is->version != ISectVersion1 && is->version != ISectVersion2){
seterr(EAdmin, "unknown index section version %d", is->version);
freeisect(is);
return nil;
}
return initisect1(is);
}
ISect*
newisect(Part *part, u32int vers, char *name, u32int blocksize, u32int tabsize)
{
ISect *is;
u32int tabbase;
is = MKZ(ISect);
if(is == nil)
return nil;
namecp(is->name, name);
is->version = vers;
is->part = part;
is->blocksize = blocksize;
is->start = 0;
is->stop = 0;
tabbase = (PartBlank + HeadSize + blocksize - 1) & ~(blocksize - 1);
is->blockbase = (tabbase + tabsize + blocksize - 1) & ~(blocksize - 1);
is->blocks = is->part->size / blocksize - is->blockbase / blocksize;
is->bucketmagic = 0;
if(is->version == ISectVersion2){
do{
is->bucketmagic = fastrand();
}while(is->bucketmagic==0);
}
is = initisect1(is);
if(is == nil)
return nil;
return is;
}
/*
* initialize the computed parameters for an index
*/
static ISect*
initisect1(ISect *is)
{
u64int v;
is->buckmax = (is->blocksize - IBucketSize) / IEntrySize;
is->blocklog = u64log2(is->blocksize);
if(is->blocksize != (1 << is->blocklog)){
seterr(ECorrupt, "illegal non-power-of-2 bucket size %d\n", is->blocksize);
freeisect(is);
return nil;
}
partblocksize(is->part, is->blocksize);
is->tabbase = (PartBlank + HeadSize + is->blocksize - 1) & ~(is->blocksize - 1);
if(is->tabbase >= is->blockbase){
seterr(ECorrupt, "index section config table overlaps bucket storage");
freeisect(is);
return nil;
}
is->tabsize = is->blockbase - is->tabbase;
v = is->part->size & ~(u64int)(is->blocksize - 1);
if(is->blockbase + (u64int)is->blocks * is->blocksize != v){
seterr(ECorrupt, "invalid blocks in index section %s", is->name);
/* ZZZ what to do?
freeisect(is);
return nil;
*/
}
if(is->stop - is->start > is->blocks){
seterr(ECorrupt, "index section overflows available space");
freeisect(is);
return nil;
}
if(is->start > is->stop){
seterr(ECorrupt, "invalid index section range");
freeisect(is);
return nil;
}
return is;
}
int
wbisect(ISect *is)
{
ZBlock *b;
b = alloczblock(HeadSize, 1, 0);
if(b == nil){
/* ZZZ set error? */
return -1;
}
if(packisect(is, b->data) < 0){
seterr(ECorrupt, "can't make index section header: %r");
freezblock(b);
return -1;
}
if(writepart(is->part, PartBlank, b->data, HeadSize) < 0 || flushpart(is->part) < 0){
seterr(EAdmin, "can't write index section header: %r");
freezblock(b);
return -1;
}
freezblock(b);
return 0;
}
void
freeisect(ISect *is)
{
if(is == nil)
return;
free(is);
}
void
freeindex(Index *ix)
{
int i;
if(ix == nil)
return;
free(ix->amap);
free(ix->arenas);
if(ix->sects)
for(i = 0; i < ix->nsects; i++)
freeisect(ix->sects[i]);
free(ix->sects);
free(ix->smap);
free(ix);
}
/*
* write a clump to an available arena in the index
* and return the address of the clump within the index.
ZZZ question: should this distinguish between an arena
filling up and real errors writing the clump?
*/
u64int
writeiclump(Index *ix, Clump *c, u8int *clbuf)
{
u64int a;
int i;
IAddr ia;
AState as;
trace(TraceLump, "writeiclump enter");
qlock(&ix->writing);
for(i = ix->mapalloc; i < ix->narenas; i++){
a = writeaclump(ix->arenas[i], c, clbuf);
if(a != TWID64){
ix->mapalloc = i;
ia.addr = ix->amap[i].start + a;
ia.type = c->info.type;
ia.size = c->info.uncsize;
ia.blocks = (c->info.size + ClumpSize + (1<<ABlockLog) - 1) >> ABlockLog;
as.arena = ix->arenas[i];
as.aa = ia.addr;
as.stats = as.arena->memstats;
insertscore(c->info.score, &ia, IEDirty, &as);
qunlock(&ix->writing);
trace(TraceLump, "writeiclump exit");
return ia.addr;
}
}
qunlock(&ix->writing);
seterr(EAdmin, "no space left in arenas");
trace(TraceLump, "writeiclump failed");
return TWID64;
}
/*
* convert an arena index to an relative arena address
*/
Arena*
amapitoa(Index *ix, u64int a, u64int *aa)
{
int i, r, l, m;
l = 1;
r = ix->narenas - 1;
while(l <= r){
m = (r + l) / 2;
if(ix->amap[m].start <= a)
l = m + 1;
else
r = m - 1;
}
l--;
if(a > ix->amap[l].stop){
for(i=0; i<ix->narenas; i++)
print("arena %d: %llux - %llux\n", i, ix->amap[i].start, ix->amap[i].stop);
print("want arena %d for %llux\n", l, a);
seterr(ECrash, "unmapped address passed to amapitoa");
return nil;
}
if(ix->arenas[l] == nil){
seterr(ECrash, "unmapped arena selected in amapitoa");
return nil;
}
*aa = a - ix->amap[l].start;
return ix->arenas[l];
}
/*
* convert an arena index to the bounds of the containing arena group.
*/
Arena*
amapitoag(Index *ix, u64int a, u64int *gstart, u64int *glimit, int *g)
{
u64int aa;
Arena *arena;
arena = amapitoa(ix, a, &aa);
if(arena == nil)
return nil;
if(arenatog(arena, aa, gstart, glimit, g) < 0)
return nil;
*gstart += a - aa;
*glimit += a - aa;
return arena;
}
int
iaddrcmp(IAddr *ia1, IAddr *ia2)
{
return ia1->type != ia2->type
|| ia1->size != ia2->size
|| ia1->blocks != ia2->blocks
|| ia1->addr != ia2->addr;
}
/*
* lookup the score in the partition
*
* nothing needs to be explicitly locked:
* only static parts of ix are used, and
* the bucket is locked by the DBlock lock.
*/
int
loadientry(Index *ix, u8int *score, int type, IEntry *ie)
{
ISect *is;
DBlock *b;
IBucket ib;
u32int buck;
int h, ok;
ok = -1;
trace(TraceLump, "loadientry enter");
/*
qlock(&stats.lock);
stats.indexreads++;
qunlock(&stats.lock);
*/
if(!inbloomfilter(mainindex->bloom, score)){
trace(TraceLump, "loadientry bloomhit");
return -1;
}
trace(TraceLump, "loadientry loadibucket");
b = loadibucket(ix, score, &is, &buck, &ib);
trace(TraceLump, "loadientry loadedibucket");
if(b == nil)
return -1;
if(okibucket(&ib, is) < 0){
trace(TraceLump, "loadientry badbucket");
goto out;
}
h = bucklook(score, type, ib.data, ib.n);
if(h & 1){
h ^= 1;
trace(TraceLump, "loadientry found");
unpackientry(ie, &ib.data[h]);
ok = 0;
goto out;
}
trace(TraceLump, "loadientry notfound");
addstat(StatBloomFalseMiss, 1);
out:
putdblock(b);
trace(TraceLump, "loadientry exit");
return ok;
}
int
okibucket(IBucket *ib, ISect *is)
{
if(ib->n <= is->buckmax)
return 0;
seterr(EICorrupt, "corrupted disk index bucket: n=%ud max=%ud, range=[%lud,%lud)",
ib->n, is->buckmax, is->start, is->stop);
return -1;
}
/*
* look for score within data;
* return 1 | byte index of matching index,
* or 0 | index of least element > score
*/
int
bucklook(u8int *score, int otype, u8int *data, int n)
{
int i, r, l, m, h, c, cc, type;
if(otype == -1)
type = -1;
else
type = vttodisktype(otype);
l = 0;
r = n - 1;
while(l <= r){
m = (r + l) >> 1;
h = m * IEntrySize;
for(i = 0; i < VtScoreSize; i++){
c = score[i];
cc = data[h + i];
if(c != cc){
if(c > cc)
l = m + 1;
else
r = m - 1;
goto cont;
}
}
cc = data[h + IEntryTypeOff];
if(type != cc && type != -1){
if(type > cc)
l = m + 1;
else
r = m - 1;
goto cont;
}
return h | 1;
cont:;
}
return l * IEntrySize;
}
/*
* compare two IEntries; consistent with bucklook
*/
int
ientrycmp(const void *vie1, const void *vie2)
{
u8int *ie1, *ie2;
int i, v1, v2;
ie1 = (u8int*)vie1;
ie2 = (u8int*)vie2;
for(i = 0; i < VtScoreSize; i++){
v1 = ie1[i];
v2 = ie2[i];
if(v1 != v2){
if(v1 < v2)
return -1;
return 1;
}
}
v1 = ie1[IEntryTypeOff];
v2 = ie2[IEntryTypeOff];
if(v1 != v2){
if(v1 < v2)
return -1;
return 1;
}
return 0;
}
/*
* find the number of the index section holding bucket #buck
*/
int
indexsect0(Index *ix, u32int buck)
{
int r, l, m;
l = 1;
r = ix->nsects - 1;
while(l <= r){
m = (r + l) >> 1;
if(ix->sects[m]->start <= buck)
l = m + 1;
else
r = m - 1;
}
return l - 1;
}
/*
* load the index block at bucket #buck
*/
static DBlock*
loadibucket0(Index *ix, u32int buck, ISect **pis, u32int *pbuck, IBucket *ib, int mode)
{
ISect *is;
DBlock *b;
is = ix->sects[indexsect0(ix, buck)];
if(buck < is->start || is->stop <= buck){
seterr(EAdmin, "index lookup out of range: %ud not found in index\n", buck);
return nil;
}
buck -= is->start;
if((b = getdblock(is->part, is->blockbase + ((u64int)buck << is->blocklog), mode)) == nil)
return nil;
if(pis)
*pis = is;
if(pbuck)
*pbuck = buck;
if(ib)
unpackibucket(ib, b->data, is->bucketmagic);
return b;
}
/*
* find the number of the index section holding score
*/
int
indexsect1(Index *ix, u8int *score)
{
return indexsect0(ix, hashbits(score, 32) / ix->div);
}
/*
* load the index block responsible for score.
*/
static DBlock*
loadibucket1(Index *ix, u8int *score, ISect **pis, u32int *pbuck, IBucket *ib)
{
return loadibucket0(ix, hashbits(score, 32)/ix->div, pis, pbuck, ib, OREAD);
}
int
indexsect(Index *ix, u8int *score)
{
return indexsect1(ix, score);
}
DBlock*
loadibucket(Index *ix, u8int *score, ISect **pis, u32int *pbuck, IBucket *ib)
{
return loadibucket1(ix, score, pis, pbuck, ib);
}