blob: a50ef0c5c2bbc4b71eae3778d0e25c71316f430f [file] [log] [blame]
/*
* Disk cache.
*
* Caches raw disk blocks. Getdblock() gets a block, putdblock puts it back.
* Getdblock has a mode parameter that determines i/o and access to a block:
* if mode is OREAD or ORDWR, it is read from disk if not already in memory.
* If mode is ORDWR or OWRITE, it is locked for exclusive use before being returned.
* It is *not* marked dirty -- once changes have been made, they should be noted
* by using dirtydblock() before putdblock().
*
* There is a global cache lock as well as a lock on each block.
* Within a thread, the cache lock can be acquired while holding a block lock,
* but not vice versa; and a block cannot be locked if you already hold the lock
* on another block.
*
* The flush proc writes out dirty blocks in batches, one batch per dirty tag.
* For example, the DirtyArena blocks are all written to disk before any of the
* DirtyArenaCib blocks.
*
* This code used to be in charge of flushing the dirty index blocks out to
* disk, but updating the index turned out to benefit from extra care.
* Now cached index blocks are never marked dirty. The index.c code takes
* care of updating them behind our back, and uses _getdblock to update any
* cached copies of the blocks as it changes them on disk.
*/
#include "stdinc.h"
#include "dat.h"
#include "fns.h"
typedef struct DCache DCache;
enum
{
HashLog = 9,
HashSize = 1<<HashLog,
HashMask = HashSize - 1,
};
struct DCache
{
QLock lock;
RWLock dirtylock; /* must be held to inspect or set b->dirty */
Rendez full;
Round round;
DBlock *free; /* list of available lumps */
u32int now; /* ticks for usage timestamps */
int size; /* max. size of any block; allocated to each block */
DBlock **heads; /* hash table for finding address */
int nheap; /* number of available victims */
DBlock **heap; /* heap for locating victims */
int nblocks; /* number of blocks allocated */
DBlock *blocks; /* array of block descriptors */
DBlock **write; /* array of block pointers to be written */
u8int *mem; /* memory for all block descriptors */
int ndirty; /* number of dirty blocks */
int maxdirty; /* max. number of dirty blocks */
};
typedef struct Ra Ra;
struct Ra
{
Part *part;
u64int addr;
};
static DCache dcache;
static int downheap(int i, DBlock *b);
static int upheap(int i, DBlock *b);
static DBlock *bumpdblock(void);
static void delheap(DBlock *db);
static void fixheap(int i, DBlock *b);
static void flushproc(void*);
static void writeproc(void*);
void
initdcache(u32int mem)
{
DBlock *b, *last;
u32int nblocks, blocksize;
int i;
u8int *p;
if(mem < maxblocksize * 2)
sysfatal("need at least %d bytes for the disk cache", maxblocksize * 2);
if(maxblocksize == 0)
sysfatal("no max. block size given for disk cache");
blocksize = maxblocksize;
nblocks = mem / blocksize;
dcache.full.l = &dcache.lock;
dcache.nblocks = nblocks;
dcache.maxdirty = (nblocks * 2) / 3;
trace(TraceProc, "initialize disk cache with %d blocks of %d bytes, maximum %d dirty blocks\n",
nblocks, blocksize, dcache.maxdirty);
dcache.size = blocksize;
dcache.heads = MKNZ(DBlock*, HashSize);
dcache.heap = MKNZ(DBlock*, nblocks);
dcache.blocks = MKNZ(DBlock, nblocks);
dcache.write = MKNZ(DBlock*, nblocks);
dcache.mem = MKNZ(u8int, (nblocks+1+128) * blocksize);
last = nil;
p = (u8int*)(((uintptr)dcache.mem+blocksize-1)&~(uintptr)(blocksize-1));
for(i = 0; i < nblocks; i++){
b = &dcache.blocks[i];
b->data = &p[i * blocksize];
b->heap = TWID32;
b->writedonechan = chancreate(sizeof(void*), 1);
b->next = last;
last = b;
}
dcache.free = last;
dcache.nheap = 0;
setstat(StatDcacheSize, nblocks);
initround(&dcache.round, "dcache", 120*1000);
vtproc(flushproc, nil);
vtproc(delaykickroundproc, &dcache.round);
}
static u32int
pbhash(u64int addr)
{
u32int h;
#define hashit(c) ((((c) * 0x6b43a9b5) >> (32 - HashLog)) & HashMask)
h = (addr >> 32) ^ addr;
return hashit(h);
}
DBlock*
getdblock(Part *part, u64int addr, int mode)
{
DBlock *b;
b = _getdblock(part, addr, mode, 1);
if(mode == OREAD || mode == ORDWR)
addstat(StatDcacheRead, 1);
if(mode == OWRITE || mode == ORDWR)
addstat(StatDcacheWrite, 1);
return b;
}
DBlock*
_getdblock(Part *part, u64int addr, int mode, int load)
{
DBlock *b;
u32int h, size, ms;
ms = 0;
trace(TraceBlock, "getdblock enter %s 0x%llux", part->name, addr);
size = part->blocksize;
if(size > dcache.size){
seterr(EAdmin, "block size %d too big for cache with size %d", size, dcache.size);
if(load)
addstat(StatDcacheLookup, 1);
return nil;
}
h = pbhash(addr);
/*
* look for the block in the cache
*/
qlock(&dcache.lock);
again:
for(b = dcache.heads[h]; b != nil; b = b->next){
if(b->part == part && b->addr == addr){
if(load)
addstat2(StatDcacheHit, 1, StatDcacheLookup, 1);
goto found;
}
}
/*
* missed: locate the block with the oldest second to last use.
* remove it from the heap, and fix up the heap.
*/
if(!load){
qunlock(&dcache.lock);
return nil;
}
/*
* Only start timer here, on cache miss - calling msec() on plain cache hits
* makes cache hits system-call bound.
*/
ms = msec();
addstat2(StatDcacheLookup, 1, StatDcacheMiss, 1);
b = bumpdblock();
if(b == nil){
trace(TraceBlock, "all disk cache blocks in use");
addstat(StatDcacheStall, 1);
rsleep(&dcache.full);
addstat(StatDcacheStall, -1);
goto again;
}
assert(!b->dirty);
/*
* the new block has no last use, so assume it happens sometime in the middle
ZZZ this is not reasonable
*/
b->used = (b->used2 + dcache.now) / 2;
/*
* rechain the block on the correct hash chain
*/
b->next = dcache.heads[h];
dcache.heads[h] = b;
if(b->next != nil)
b->next->prev = b;
b->prev = nil;
b->addr = addr;
b->part = part;
b->size = 0;
found:
b->ref++;
b->used2 = b->used;
b->used = dcache.now++;
if(b->heap != TWID32)
fixheap(b->heap, b);
if((mode == ORDWR || mode == OWRITE) && part->writechan == nil){
trace(TraceBlock, "getdblock allocwriteproc %s", part->name);
part->writechan = chancreate(sizeof(DBlock*), dcache.nblocks);
vtproc(writeproc, part);
}
qunlock(&dcache.lock);
trace(TraceBlock, "getdblock lock");
addstat(StatDblockStall, 1);
if(mode == OREAD)
rlock(&b->lock);
else
wlock(&b->lock);
addstat(StatDblockStall, -1);
trace(TraceBlock, "getdblock locked");
if(b->size != size){
if(mode == OREAD){
addstat(StatDblockStall, 1);
runlock(&b->lock);
wlock(&b->lock);
addstat(StatDblockStall, -1);
}
if(b->size < size){
if(mode == OWRITE)
memset(&b->data[b->size], 0, size - b->size);
else{
trace(TraceBlock, "getdblock readpart %s 0x%llux", part->name, addr);
diskaccess(0);
if(readpart(part, addr + b->size, &b->data[b->size], size - b->size) < 0){
b->mode = ORDWR; /* so putdblock wunlocks */
putdblock(b);
return nil;
}
trace(TraceBlock, "getdblock readpartdone");
addstat(StatApartRead, 1);
addstat(StatApartReadBytes, size-b->size);
}
}
b->size = size;
if(mode == OREAD){
addstat(StatDblockStall, 1);
wunlock(&b->lock);
rlock(&b->lock);
addstat(StatDblockStall, -1);
}
}
b->mode = mode;
trace(TraceBlock, "getdblock exit");
if(ms)
addstat(StatDcacheLookupTime, msec() - ms);
return b;
}
void
putdblock(DBlock *b)
{
if(b == nil)
return;
trace(TraceBlock, "putdblock %s 0x%llux", b->part->name, b->addr);
if(b->mode == OREAD)
runlock(&b->lock);
else
wunlock(&b->lock);
qlock(&dcache.lock);
if(--b->ref == 0 && !b->dirty){
if(b->heap == TWID32)
upheap(dcache.nheap++, b);
rwakeupall(&dcache.full);
}
qunlock(&dcache.lock);
}
void
dirtydblock(DBlock *b, int dirty)
{
int odirty;
trace(TraceBlock, "dirtydblock enter %s 0x%llux %d from 0x%lux",
b->part->name, b->addr, dirty, getcallerpc(&b));
assert(b->ref != 0);
assert(b->mode==ORDWR || b->mode==OWRITE);
odirty = b->dirty;
if(b->dirty)
assert(b->dirty == dirty);
else
b->dirty = dirty;
qlock(&dcache.lock);
if(!odirty){
dcache.ndirty++;
setstat(StatDcacheDirty, dcache.ndirty);
if(dcache.ndirty >= dcache.maxdirty)
kickround(&dcache.round, 0);
else
delaykickround(&dcache.round);
}
qunlock(&dcache.lock);
}
static void
unchain(DBlock *b)
{
ulong h;
/*
* unchain the block
*/
if(b->prev == nil){
h = pbhash(b->addr);
if(dcache.heads[h] != b)
sysfatal("bad hash chains in disk cache");
dcache.heads[h] = b->next;
}else
b->prev->next = b->next;
if(b->next != nil)
b->next->prev = b->prev;
}
/*
* remove some block from use and update the free list and counters
*/
static DBlock*
bumpdblock(void)
{
DBlock *b;
trace(TraceBlock, "bumpdblock enter");
b = dcache.free;
if(b != nil){
dcache.free = b->next;
return b;
}
if(dcache.ndirty >= dcache.maxdirty)
kickdcache();
/*
* remove blocks until we find one that is unused
* referenced blocks are left in the heap even though
* they can't be scavenged; this is simple a speed optimization
*/
for(;;){
if(dcache.nheap == 0){
kickdcache();
trace(TraceBlock, "bumpdblock gotnothing");
return nil;
}
b = dcache.heap[0];
delheap(b);
if(!b->ref && !b->dirty)
break;
}
trace(TraceBlock, "bumpdblock bumping %s 0x%llux", b->part->name, b->addr);
unchain(b);
return b;
}
void
emptydcache(void)
{
DBlock *b;
qlock(&dcache.lock);
while(dcache.nheap > 0){
b = dcache.heap[0];
delheap(b);
if(!b->ref && !b->dirty){
unchain(b);
b->next = dcache.free;
dcache.free = b;
}
}
qunlock(&dcache.lock);
}
/*
* delete an arbitrary block from the heap
*/
static void
delheap(DBlock *db)
{
if(db->heap == TWID32)
return;
fixheap(db->heap, dcache.heap[--dcache.nheap]);
db->heap = TWID32;
}
/*
* push an element up or down to it's correct new location
*/
static void
fixheap(int i, DBlock *b)
{
if(upheap(i, b) == i)
downheap(i, b);
}
static int
upheap(int i, DBlock *b)
{
DBlock *bb;
u32int now;
int p;
now = dcache.now;
for(; i != 0; i = p){
p = (i - 1) >> 1;
bb = dcache.heap[p];
if(b->used2 - now >= bb->used2 - now)
break;
dcache.heap[i] = bb;
bb->heap = i;
}
dcache.heap[i] = b;
b->heap = i;
return i;
}
static int
downheap(int i, DBlock *b)
{
DBlock *bb;
u32int now;
int k;
now = dcache.now;
for(; ; i = k){
k = (i << 1) + 1;
if(k >= dcache.nheap)
break;
if(k + 1 < dcache.nheap && dcache.heap[k]->used2 - now > dcache.heap[k + 1]->used2 - now)
k++;
bb = dcache.heap[k];
if(b->used2 - now <= bb->used2 - now)
break;
dcache.heap[i] = bb;
bb->heap = i;
}
dcache.heap[i] = b;
b->heap = i;
return i;
}
static void
findblock(DBlock *bb)
{
DBlock *b, *last;
int h;
last = nil;
h = pbhash(bb->addr);
for(b = dcache.heads[h]; b != nil; b = b->next){
if(last != b->prev)
sysfatal("bad prev link");
if(b == bb)
return;
last = b;
}
sysfatal("block missing from hash table");
}
void
checkdcache(void)
{
DBlock *b;
u32int size, now;
int i, k, refed, nfree;
qlock(&dcache.lock);
size = dcache.size;
now = dcache.now;
for(i = 0; i < dcache.nheap; i++){
if(dcache.heap[i]->heap != i)
sysfatal("dc: mis-heaped at %d: %d", i, dcache.heap[i]->heap);
if(i > 0 && dcache.heap[(i - 1) >> 1]->used2 - now > dcache.heap[i]->used2 - now)
sysfatal("dc: bad heap ordering");
k = (i << 1) + 1;
if(k < dcache.nheap && dcache.heap[i]->used2 - now > dcache.heap[k]->used2 - now)
sysfatal("dc: bad heap ordering");
k++;
if(k < dcache.nheap && dcache.heap[i]->used2 - now > dcache.heap[k]->used2 - now)
sysfatal("dc: bad heap ordering");
}
refed = 0;
for(i = 0; i < dcache.nblocks; i++){
b = &dcache.blocks[i];
if(b->data != &dcache.mem[i * size])
sysfatal("dc: mis-blocked at %d", i);
if(b->ref && b->heap == TWID32)
refed++;
if(b->addr)
findblock(b);
if(b->heap != TWID32
&& dcache.heap[b->heap] != b)
sysfatal("dc: spurious heap value");
}
nfree = 0;
for(b = dcache.free; b != nil; b = b->next){
if(b->addr != 0 || b->heap != TWID32)
sysfatal("dc: bad free list");
nfree++;
}
if(dcache.nheap + nfree + refed != dcache.nblocks)
sysfatal("dc: missing blocks: %d %d %d", dcache.nheap, refed, dcache.nblocks);
qunlock(&dcache.lock);
}
void
flushdcache(void)
{
trace(TraceProc, "flushdcache enter");
kickround(&dcache.round, 1);
trace(TraceProc, "flushdcache exit");
}
void
kickdcache(void)
{
kickround(&dcache.round, 0);
}
static int
parallelwrites(DBlock **b, DBlock **eb, int dirty)
{
DBlock **p, **q;
Part *part;
for(p=b; p<eb && (*p)->dirty == dirty; p++){
assert(b<=p && p<eb);
sendp((*p)->part->writechan, *p);
}
q = p;
for(p=b; p<q; p++){
assert(b<=p && p<eb);
recvp((*p)->writedonechan);
}
/*
* Flush the partitions that have been written to.
*/
part = nil;
for(p=b; p<q; p++){
if(part != (*p)->part){
part = (*p)->part;
flushpart(part); /* what if it fails? */
}
}
return p-b;
}
/*
* Sort first by dirty flag, then by partition, then by address in partition.
*/
static int
writeblockcmp(const void *va, const void *vb)
{
DBlock *a, *b;
a = *(DBlock**)va;
b = *(DBlock**)vb;
if(a->dirty != b->dirty)
return a->dirty - b->dirty;
if(a->part != b->part){
if(a->part < b->part)
return -1;
if(a->part > b->part)
return 1;
}
if(a->addr < b->addr)
return -1;
return 1;
}
static void
flushproc(void *v)
{
int i, j, n;
ulong t0;
DBlock *b, **write;
USED(v);
threadsetname("flushproc");
for(;;){
waitforkick(&dcache.round);
trace(TraceWork, "start");
t0 = nsec()/1000;
trace(TraceProc, "build t=%lud", (ulong)(nsec()/1000)-t0);
write = dcache.write;
n = 0;
for(i=0; i<dcache.nblocks; i++){
b = &dcache.blocks[i];
if(b->dirty)
write[n++] = b;
}
qsort(write, n, sizeof(write[0]), writeblockcmp);
/* Write each stage of blocks out. */
trace(TraceProc, "writeblocks t=%lud", (ulong)(nsec()/1000)-t0);
i = 0;
for(j=1; j<DirtyMax; j++){
trace(TraceProc, "writeblocks.%d t=%lud",
j, (ulong)(nsec()/1000)-t0);
i += parallelwrites(write+i, write+n, j);
}
if(i != n){
fprint(2, "in flushproc i=%d n=%d\n", i, n);
for(i=0; i<n; i++)
fprint(2, "\tblock %d: dirty=%d\n",
i, write[i]->dirty);
abort();
}
/*
* b->dirty is protected by b->lock while ndirty is protected
* by dcache.lock, so the --ndirty below is the delayed one
* from clearing b->dirty in the write proc. It may happen
* that some other proc has come along and redirtied b since
* the write. That's okay, it just means that ndirty may be
* one too high until we catch up and do the decrement.
*/
trace(TraceProc, "undirty.%d t=%lud", j, (ulong)(nsec()/1000)-t0);
qlock(&dcache.lock);
for(i=0; i<n; i++){
b = write[i];
--dcache.ndirty;
if(b->ref == 0 && b->heap == TWID32){
upheap(dcache.nheap++, b);
rwakeupall(&dcache.full);
}
}
setstat(StatDcacheDirty, dcache.ndirty);
qunlock(&dcache.lock);
addstat(StatDcacheFlush, 1);
trace(TraceWork, "finish");
}
}
static void
writeproc(void *v)
{
DBlock *b;
Part *p;
p = v;
threadsetname("writeproc:%s", p->name);
for(;;){
b = recvp(p->writechan);
trace(TraceWork, "start");
assert(b->part == p);
trace(TraceProc, "wlock %s 0x%llux", p->name, b->addr);
wlock(&b->lock);
trace(TraceProc, "writepart %s 0x%llux", p->name, b->addr);
diskaccess(0);
if(writepart(p, b->addr, b->data, b->size) < 0)
fprint(2, "%s: writeproc: part %s addr 0x%llux: write error: %r\n",
argv0, p->name, b->addr);
addstat(StatApartWrite, 1);
addstat(StatApartWriteBytes, b->size);
b->dirty = 0;
wunlock(&b->lock);
trace(TraceProc, "finish %s 0x%llux", p->name, b->addr);
trace(TraceWork, "finish");
sendp(b->writedonechan, b);
}
}