blob: d7ae41d39d43e5e306c88f3f20174ed80064918b [file] [log] [blame]
/*
* Check and fix an arena partition.
*
* This is a lot grittier than the rest of Venti because
* it can't just give up if a byte here or there is wrong.
*
* The rule here (hopefully followed!) is that block corruption
* only ever has a local effect -- there are no blocks that you
* can wipe out that will cause large portions of
* uncorrupted data blocks to be useless.
*/
#include "stdinc.h"
#include "dat.h"
#include "fns.h"
#include "whack.h"
/*
 * Round x up to the next multiple of n.
 * The mask arithmetic is only correct when n is a power of two.
 */
#define ROUNDUP(x,n) (((x)+(n)-1)&~((n)-1))
/*
 * Argument types for the custom print verbs used in this file:
 * %z takes a vlong/uvlong (see zfmt), %t takes a uint (see tfmt).
 */
#pragma varargck type "z" uvlong
#pragma varargck type "z" vlong
#pragma varargck type "t" uint
/* Byte-size units, plus the size used for small metadata page-ins. */
enum
{
K = 1024,
M = 1024*1024,
G = 1024*1024*1024,
Block = 4096,
};
int debugsha1;	/* not referenced in this part of the file */
int verbose;	/* verbosity level; tested as verbose and verbose > 1 */
Part *part;	/* partition handle passed to readpart/writepart */
char *file;	/* not referenced in this part of the file */
char *basename;	/* if non-nil, overrides the arena base name (loadarenabasics) */
char *dumpbase;	/* if non-nil, checkarena dumps each arena to "dumpbase.N" */
int fix;	/* nonzero: pageout writes modified page buffers back to disk */
int badreads;	/* number of sector-sized reads that failed in readdisk */
int unseal;	/* nonzero: guessarena may unseal a sealed arena with >64K unused */
uchar zero[MaxDiskBlock];	/* all-zero block used as a comparison reference */
Arena lastarena;	/* not referenced in this part of the file */
ArenaPart ap;	/* arena partition geometry, guessed or user-specified */
uvlong arenasize;	/* arena spacing; 0 asks guessgeometry to infer it */
int nbadread;	/* not referenced in this part of the file */
int nbad;	/* bad-region / bad-directory-entry counter, reset in guessarena */
uvlong partend;	/* end of partition; reads at or past it are faked with 0xFB */
/* Forward declaration: checkarenas calls checkarena, which is defined later. */
void checkarena(vlong, int);
void
usage(void)
{
fprint(2, "usage: fixarenas [-fv] [-a arenasize] [-b blocksize] file [ranges]\n");
threadexitsall(0);
}
/*
 * %z: print a vlong byte count using the largest of the G, M, K
 * suffixes that divides it exactly, or as a plain decimal number
 * otherwise.  The output is meant to be acceptable to unittoull.
 */
static int
zfmt(Fmt *fmt)
{
	vlong val;

	val = va_arg(fmt->args, vlong);
	if(val != 0){
		if(val%G == 0)
			return fmtprint(fmt, "%lldG", val/G);
		else if(val%M == 0)
			return fmtprint(fmt, "%lldM", val/M);
		else if(val%K == 0)
			return fmtprint(fmt, "%lldK", val/K);
		return fmtprint(fmt, "%lld", val);
	}
	return fmtstrcpy(fmt, "0");
}
/*
 * %t: print a uint time stamp the way ctime does, but cut the
 * string at index 28 so that the trailing newline is dropped.
 */
static int
tfmt(Fmt *fmt)
{
	char stamp[30];
	uint secs;

	secs = va_arg(fmt->args, uint);
	strcpy(stamp, ctime(secs));
	stamp[28] = 0;
	return fmtstrcpy(fmt, stamp);
}
/*
 * Coalesce messages about unreadable sectors into larger ranges.
 *
 * Each call reports the range [o, o+len).  A range that starts where
 * the previous one ended and carries the same message just extends
 * the pending range; anything else prints the pending range (if it
 * is non-empty) and starts a new one.  A nil msg means "same message
 * as last time", so bad(nil, 0, 0) acts as a flush.  Passing o == -1
 * discards the pending state without printing.
 */
static void
bad(char *msg, vlong o, int len)
{
	static vlong start, end;
	static char *lastmsg;
	int extends;

	if(msg == nil)
		msg = lastmsg;
	if(o == -1){
		start = 0;
		end = 0;
		lastmsg = nil;
		return;
	}

	/* does this range continue the pending one? */
	extends = (end == o);
	if(extends && msg && lastmsg && strcmp(msg, lastmsg) != 0)
		extends = 0;

	if(!extends){
		if(start != end)
			print("%s %#llux+%#llux (%,lld+%,lld)\n",
				lastmsg, start, end-start, start, end-start);
		start = o;
	}
	lastmsg = msg;
	end = o+len;
}
/*
 * Read len bytes at disk offset `offset' into buf and return buf.
 *
 * Bytes at or beyond partend are not read; they are filled with 0xFB.
 * If the read fails, the buffer is filled with 0xFD and the read is
 * retried in 64K pieces, then 4K pieces, then 512-byte sectors, so
 * that only the sectors that really cannot be read keep the 0xFD
 * filler.  Each failed sector is counted in badreads and reported
 * (coalesced into ranges by bad) at its absolute disk offset.
 */
static uchar*
readdisk(uchar *buf, vlong offset, int len)
{
	int i, j, k, n;

	if(offset >= partend){
		memset(buf, 0xFB, len);
		return buf;
	}
	if(offset+len > partend){
		memset(buf, 0xFB, len);
		len = partend - offset;
	}
	if(readpart(part, offset, buf, len) >= 0)
		return buf;

	/*
	 * The read failed.  Clear the buffer to nonsense, and
	 * then try reading in smaller pieces.  If that fails,
	 * read in even smaller pieces.  And so on down to sectors.
	 */
	memset(buf, 0xFD, len);
	for(i=0; i<len; i+=64*K){
		n = 64*K;
		if(i+n > len)
			n = len-i;
		if(readpart(part, offset+i, buf+i, n) >= 0)
			continue;
		for(j=i; j<len && j<i+64*K; j+=4*K){
			n = 4*K;
			if(j+n > len)
				n = len-j;
			if(readpart(part, offset+j, buf+j, n) >= 0)
				continue;
			for(k=j; k<len && k<j+4*K; k+=512){
				/* never read past the requested length */
				n = 512;
				if(k+n > len)
					n = len-k;
				if(readpart(part, offset+k, buf+k, n) >= 0)
					continue;
				/* report the position on disk, not within buf */
				bad("disk read failed at", offset+k, n);
				badreads++;
			}
		}
	}
	bad(nil, 0, 0);	/* flush the pending range */
	return buf;
}
/*
 * Buffer to support running SHA1 hash of the disk.
 * Data is fed in disk order through sbupdate; with rollback set,
 * the hash state is checkpointed every 4M so that sbrollback can
 * rewind it.
 */
typedef struct Shabuf Shabuf;
struct Shabuf
{
int fd;	/* if > 0, hashed bytes are also written here (see sbdebug) */
vlong offset;	/* disk offset of the next byte to be hashed */
DigestState state;	/* running SHA1 state */
int rollback;	/* nonzero: keep checkpoints in hist */
vlong r0;	/* disk offset where hashing started; base for hist and fd offsets */
DigestState *hist;	/* hist[i] is the state after i*4M bytes past r0 */
int nhist;	/* number of entries allocated in hist */
};
/*
 * Attach a debugging dump file to sb: every byte subsequently hashed
 * by sbupdate is also written to `file'.  Any previously attached
 * file is closed first.  Because sb->fd == 0 means "no file", a
 * descriptor that comes back as 0 is duplicated to another number.
 * If the file cannot be created, sb is left without a dump file.
 */
void
sbdebug(Shabuf *sb, char *file)
{
	int nfd;

	if(sb->fd > 0){
		close(sb->fd);
		sb->fd = 0;
	}

	nfd = create(file, OWRITE, 0666);
	if(nfd < 0)
		return;
	if(nfd == 0){
		nfd = dup(0, -1);
		close(0);
	}
	sb->fd = nfd;
}
/*
 * Feed the len bytes at p, which correspond to disk offset `offset',
 * into the running hash.  Only the part of the range starting at
 * sb->offset is hashed: bytes before it are skipped, and a range that
 * does not contain sb->offset is ignored entirely.  In rollback mode
 * the data is hashed in pieces so that the state can be saved at
 * every 4M boundary (measured from sb->r0).
 */
void
sbupdate(Shabuf *sb, uchar *p, vlong offset, int len)
{
int n, x;
vlong o;
/* first call in rollback mode: start the checkpoint history here */
if(sb->rollback && !sb->hist){
sb->r0 = offset;
sb->nhist = 1;
sb->hist = vtmalloc(sb->nhist*sizeof *sb->hist);
memset(sb->hist, 0, sizeof sb->hist[0]);
}
if(sb->r0 == 0)
sb->r0 = offset;
/* nothing to do unless the next byte to hash lies inside [offset, offset+len) */
if(sb->offset < offset || sb->offset >= offset+len){
if(0) print("sbupdate %p %#llux+%d but offset=%#llux\n",
p, offset, len, sb->offset);
return;
}
/* skip the prefix that has already been hashed */
x = sb->offset - offset;
if(0) print("sbupdate %p %#llux+%d skip %d\n",
sb, offset, len, x);
if(x){
p += x;
offset += x;
len -= x;
}
assert(sb->offset == offset);
/* mirror the hashed bytes into the debug file, if one is attached */
if(sb->fd > 0)
pwrite(sb->fd, p, len, offset - sb->r0);
if(!sb->rollback){
sha1(p, len, nil, &sb->state);
sb->offset += len;
return;
}
/* save state every 4M so we can roll back quickly */
o = offset - sb->r0;
while(len > 0){
/* hash up to the next 4M boundary, or to the end of the data */
n = 4*M - o%(4*M);
if(n > len)
n = len;
sha1(p, n, nil, &sb->state);
sb->offset += n;
o += n;
p += n;
len -= n;
if(o%(4*M) == 0){
/* reached a boundary: record the state, growing hist as needed */
x = o/(4*M);
if(x >= sb->nhist){
if(x != sb->nhist)
print("oops! x=%d nhist=%d\n", x, sb->nhist);
sb->nhist += 32;
sb->hist = vtrealloc(sb->hist, sb->nhist*sizeof *sb->hist);
}
sb->hist[x] = sb->state;
}
}
}
/*
 * Advance the hash by reading straight from disk, in pieces of at
 * most 4M, until sb->offset reaches eoffset.
 */
void
sbdiskhash(Shabuf *sb, vlong eoffset)
{
	static uchar chunk[4*M];
	vlong left;
	int n;

	for(;;){
		left = eoffset - sb->offset;
		if(left <= 0)
			break;
		n = sizeof chunk;
		if(left < n)
			n = left;
		readdisk(chunk, sb->offset, n);
		sbupdate(sb, chunk, sb->offset, n);
	}
}
/*
 * Rewind the hash so that it covers data only up to the last 4M
 * checkpoint at or before disk offset `offset'.  Does nothing if the
 * hash has not yet advanced past offset.  Requires rollback mode and
 * a recorded starting offset; otherwise (or if no checkpoint exists
 * for that position) a message is printed and the hash is unchanged.
 * If a debug file is attached, it is truncated to match.
 */
void
sbrollback(Shabuf *sb, vlong offset)
{
	int x;
	vlong o;
	Dir d;

	if(!sb->rollback || !sb->r0){
		print("cannot rollback sha\n");
		return;
	}
	if(offset >= sb->offset)
		return;
	o = offset - sb->r0;
	x = o/(4*M);
	if(x >= sb->nhist){
		print("cannot rollback sha\n");
		return;
	}
	sb->state = sb->hist[x];
	/* widen before multiplying: x*4*M overflows int at 2GB and beyond */
	sb->offset = sb->r0 + (vlong)x*4*M;
	assert(sb->offset <= offset);

	if(sb->fd > 0){
		nulldir(&d);
		d.length = sb->offset - sb->r0;
		dirfwstat(sb->fd, &d);
	}
}
/*
 * Finish the hash and store the digest in score.
 * The checkpoint history is released; it is of no use afterwards.
 */
void
sbscore(Shabuf *sb, uchar *score)
{
	/* free(nil) is a no-op, so no guard is needed */
	free(sb->hist);
	sb->hist = nil;
	sha1(nil, 0, score, &sb->state);
}
/*
 * If we're fixing arenas, then editing this memory edits the disk!
 * It will be written back out as new data is paged in.
 *
 * buf holds the current page and sbuf a pristine copy of what was
 * read, so pageout can tell whether buf was modified.  bufoffset and
 * buflen give the disk range the page covers; buflen == 0 means no
 * page is loaded.
 */
uchar buf[4*M];
uchar sbuf[4*M];
vlong bufoffset;
int buflen;
/* pagein calls pageout, which is defined after it. */
static void pageout(void);
/*
 * Make buf hold the len bytes at disk offset `offset', after first
 * writing back the current page if needed (pageout).  Always returns
 * buf.  A request that reaches past partend is truncated, with the
 * whole buffer pre-filled with 0xFB; a request that starts at or past
 * partend loads nothing and leaves no page recorded.
 */
static uchar*
pagein(vlong offset, int len)
{
	vlong avail;

	pageout();

	avail = partend - offset;
	if(avail <= 0){
		memset(buf, 0xFB, sizeof buf);
		return buf;
	}
	if(len > avail){
		memset(buf, 0xFB, sizeof buf);
		len = avail;
	}

	bufoffset = offset;
	buflen = len;
	readdisk(buf, offset, len);
	/* keep a clean copy so pageout can detect edits */
	memmove(sbuf, buf, len);
	return buf;
}
/*
 * Retire the current page.  It is written to disk only when a page
 * is loaded, fix is set, and buf differs from the copy saved at
 * pagein time.  A failed write is reported but otherwise ignored.
 */
static void
pageout(void)
{
	int dirty;

	dirty = buflen != 0 && fix && memcmp(buf, sbuf, buflen) != 0;
	if(dirty && writepart(part, bufoffset, buf, buflen) < 0)
		print("disk write failed at %#llux+%#ux (%,lld+%,d)\n",
			bufoffset, buflen, bufoffset, buflen);
	buflen = 0;
}
/*
 * Zero the len bytes at disk offset `offset' by paging the affected
 * blocks in, clearing them in buf, and letting pagein/pageout write
 * them back (which only happens when fix is set).  The page that was
 * loaded on entry is paged back in before returning, so the caller's
 * view of buf/bufoffset/buflen is restored.
 *
 * Page-ins start on a MinBlock boundary and are at most MaxBlock long.
 */
static void
zerorange(vlong offset, int len)
{
	int i;
	vlong ooff;
	int olen;
	enum { MinBlock = 4*K, MaxBlock = 8*K };

	ooff = bufoffset;
	olen = buflen;

	i = offset%MinBlock;	/* distance from the preceding block boundary */
	if(i+len < MaxBlock){
		/*
		 * The page must cover the i leading bytes as well as the
		 * range itself; rounding up len alone could leave the tail
		 * of the zeroed range outside the page, and so unwritten.
		 */
		pagein(offset-i, (i+len+MinBlock-1)&~(MinBlock-1));
		memset(buf+i, 0, len);
	}else{
		/* leading partial page */
		pagein(offset-i, MaxBlock);
		memset(buf+i, 0, MaxBlock-i);
		offset += MaxBlock-i;
		len -= MaxBlock-i;
		/* whole pages; offset is block-aligned from here on */
		while(len >= MaxBlock){
			pagein(offset, MaxBlock);
			memset(buf, 0, MaxBlock);
			offset += MaxBlock;
			len -= MaxBlock;
		}
		/* trailing partial page */
		pagein(offset, (len+MinBlock-1)&~(MinBlock-1));
		memset(buf, 0, len);
	}
	/* flush the last zeroed page and restore the caller's page */
	pagein(ooff, olen);
}
/*
* read/write integers
*
static void
p16(uchar *p, u16int u)
{
p[0] = (u>>8) & 0xFF;
p[1] = u & 0xFF;
}
*/
/* Decode a big-endian 16-bit integer from p[0..1]. */
static u16int
u16(uchar *p)
{
	int hi, lo;

	hi = p[0];
	lo = p[1];
	return (hi<<8) | lo;
}
/* Store u at p[0..3] as a big-endian 32-bit integer. */
static void
p32(uchar *p, u32int u)
{
	int i;

	/* fill from the least significant byte backwards */
	for(i=3; i>=0; i--){
		p[i] = u & 0xFF;
		u >>= 8;
	}
}
/*
 * Decode a big-endian 32-bit integer from p[0..3].
 * Each byte is widened to u32int before shifting: a uchar promotes
 * to int, and shifting a value >= 0x80 left by 24 would overflow
 * into the sign bit, which is undefined behaviour.
 */
static u32int
u32(uchar *p)
{
	return ((u32int)p[0]<<24) | ((u32int)p[1]<<16) | ((u32int)p[2]<<8) | (u32int)p[3];
}
/*
static void
p64(uchar *p, u64int u)
{
p32(p, u>>32);
p32(p, u);
}
*/
/* Decode a big-endian 64-bit integer from p[0..7]. */
static u64int
u64(uchar *p)
{
	u64int hi, lo;

	hi = u32(p);
	lo = u32(p+4);
	return (hi<<32) | lo;
}
static int
vlongcmp(const void *va, const void *vb)
{
vlong a, b;
a = *(vlong*)va;
b = *(vlong*)vb;
if(a < b)
return -1;
if(b > a)
return 1;
return 0;
}
/* D and S are in draw.h */
#define D VD
#define S VS
/*
 * Display flags OR'ed into Info.len above the byte count.
 * showdiffs switches on them to decide how to print a field:
 * D as a decimal number, Z as a size with %z, S as a string,
 * T as a time with %t; unflagged 1- and 4-byte fields print as a
 * small integer and hex respectively.  N masks out the byte count.
 */
enum
{
D = 0x10000,
Z = 0x20000,
S = 0x30000,
T = 0x40000,
N = 0xFFFF
};
/*
 * One field of an on-disk structure, for showdiffs.
 * A table of these is terminated by an entry whose len is 0.
 */
typedef struct Info Info;
struct Info
{
int len;	/* byte count in the low 16 bits, display flag above (D, Z, S, T) */
char *name;	/* field name used in messages */
};
/* Field layout handed to showdiffs for the arena partition superblock (checkarenas). */
Info partinfo[] = {
4, "magic",
D|4, "version",
Z|4, "blocksize",
4, "arenabase",
0
};
/* Field layout of an arena header block for ArenaVersion4 (selected in checkarena). */
Info headinfo4[] = {
4, "magic",
D|4, "version",
S|ANameSize, "name",
Z|4, "blocksize",
Z|8, "size",
0
};
/*
 * Field layout of an arena header block for versions other than
 * ArenaVersion4 (selected in checkarena); same as headinfo4 plus
 * the trailing clumpmagic.
 */
Info headinfo5[] = {
4, "magic",
D|4, "version",
S|ANameSize, "name",
Z|4, "blocksize",
Z|8, "size",
4, "clumpmagic",
0
};
/* Field layout of an arena tail block for ArenaVersion4 (selected in checkarena). */
Info tailinfo4[] = {
4, "magic",
D|4, "version",
S|ANameSize, "name",
D|4, "clumps",
D|4, "cclumps",
T|4, "ctime",
T|4, "wtime",
D|8, "used",
D|8, "uncsize",
1, "sealed",
0
};
/*
 * ArenaVersion4 tail block followed by the extension byte and the
 * in-memory statistics (selected in checkarena alongside tailinfo4).
 */
Info tailinfo4a[] = {
/* tailinfo 4 */
4, "magic",
D|4, "version",
S|ANameSize, "name",
D|4, "clumps",
D|4, "cclumps",
T|4, "ctime",
T|4, "wtime",
D|8, "used",
D|8, "uncsize",
1, "sealed",
/* mem stats */
1, "extension",
D|4, "mem.clumps",
D|4, "mem.cclumps",
D|8, "mem.used",
D|8, "mem.uncsize",
1, "mem.sealed",
0
};
/*
 * Field layout of an arena tail block for versions other than
 * ArenaVersion4 (selected in checkarena); differs from tailinfo4
 * by the clumpmagic field after wtime.
 */
Info tailinfo5[] = {
4, "magic",
D|4, "version",
S|ANameSize, "name",
D|4, "clumps",
D|4, "cclumps",
T|4, "ctime",
T|4, "wtime",
4, "clumpmagic",
D|8, "used",
D|8, "uncsize",
1, "sealed",
0
};
/*
 * tailinfo5 layout followed by the extension byte and the
 * in-memory statistics (selected in checkarena alongside tailinfo5).
 */
Info tailinfo5a[] = {
/* tailinfo 5 */
4, "magic",
D|4, "version",
S|ANameSize, "name",
D|4, "clumps",
D|4, "cclumps",
T|4, "ctime",
T|4, "wtime",
4, "clumpmagic",
D|8, "used",
D|8, "uncsize",
1, "sealed",
/* mem stats */
1, "extension",
D|4, "mem.clumps",
D|4, "mem.cclumps",
D|8, "mem.used",
D|8, "mem.uncsize",
1, "mem.sealed",
0
};
/*
 * Print the fields in which the on-disk structure `have' differs
 * from the expected structure `want'.  Both are len bytes long and
 * laid out as described by the zero-terminated table info; each
 * differing field is printed in the style selected by its flag.
 * Differences in the bytes following the last described field are
 * reported too: as a bug if `want' is non-zero there, otherwise as
 * stray data on disk.
 */
void
showdiffs(uchar *want, uchar *have, int len, Info *info)
{
	int width;

	for(; len > 0 && (width = info->len&N) > 0;
	    have += width, want += width, len -= width, info++){
		if(memcmp(have, want, width) == 0)
			continue;
		switch(info->len){
		case 1:
			print("\t%s: correct=%d disk=%d\n",
				info->name, *want, *have);
			break;
		case 4:
			print("\t%s: correct=%#ux disk=%#ux\n",
				info->name, u32(want), u32(have));
			break;
		case D|4:
			print("\t%s: correct=%,ud disk=%,ud\n",
				info->name, u32(want), u32(have));
			break;
		case T|4:
			print("\t%s: correct=%t\n\t\tdisk=%t\n",
				info->name, u32(want), u32(have));
			break;
		case Z|4:
			print("\t%s: correct=%z disk=%z\n",
				info->name, (uvlong)u32(want), (uvlong)u32(have));
			break;
		case D|8:
			print("\t%s: correct=%,lld disk=%,lld\n",
				info->name, u64(want), u64(have));
			break;
		case Z|8:
			print("\t%s: correct=%z disk=%z\n",
				info->name, u64(want), u64(have));
			break;
		case S|ANameSize:
			print("\t%s: correct=%s disk=%.*s\n",
				info->name, (char*)want,
				utfnlen((char*)have, ANameSize-1),
				(char*)have);
			break;
		default:
			print("\t%s: correct=%.*H disk=%.*H\n",
				info->name, width, want, width, have);
			break;
		}
	}

	/* anything left over lies beyond the described fields */
	if(len <= 0 || memcmp(have, want, len) == 0)
		return;
	if(memcmp(want, zero, len) != 0)
		print("!!\textra want data in showdiffs (bug in fixarenas)\n");
	else
		print("\tnon-zero data on disk after structure\n");
	if(verbose > 1){
		print("want: %.*H\n", len, want);
		print("have: %.*H\n", len, have);
	}
}
/*
* Does part begin with an arena?
*/
int
isonearena(void)
{
return u32(pagein(0, Block)) == ArenaHeadMagic;
}
/* Arena table sizes that guessgeometry tries when looking for the first arena. */
static int tabsizes[] = { 16*1024, 64*1024, 512*1024, 768*1024, };
/*
 * Poke around on the disk to guess what the ArenaPart numbers are.
 *
 * Fills in ap.version, ap.blocksize, ap.arenabase, ap.tabbase and
 * ap.tabsize, and sets arenasize.  If either arenasize or
 * ap.blocksize is still zero, the disk is scanned for arena head and
 * tail magic numbers and the most common spacings are taken as the
 * geometry; a nonzero user-specified value wins over the guess.
 * Calls sysfatal if too few arenas are found or the block size is
 * unusable.
 */
void
guessgeometry(void)
{
int i, j, n, bestn, ndiff, nhead, ntail;
uchar *p, *ep, *sp;
u64int diff[100], head[20], tail[20];
u64int offset, bestdiff;
ap.version = ArenaPartVersion;
if(arenasize == 0 || ap.blocksize == 0){
/*
* The ArenaPart block at offset PartBlank may be corrupt or just wrong.
* Instead, look for the individual arena headers and tails, which there
* are many of, and once we've seen enough, infer the spacing.
*
* Of course, nothing in the file format requires that arenas be evenly
* spaced, but fmtarenas always does that for us.
*/
nhead = 0;
ntail = 0;
/* scan 4M at a time, testing every K-aligned position for a magic number */
for(offset=PartBlank; offset<partend; offset+=4*M){
p = pagein(offset, 4*M);
for(sp=p, ep=p+4*M; p<ep; p+=K){
if(u32(p) == ArenaHeadMagic && nhead < nelem(head)){
if(verbose)
print("arena head at %#llx\n", offset+(p-sp));
head[nhead++] = offset+(p-sp);
}
if(u32(p) == ArenaMagic && ntail < nelem(tail)){
tail[ntail++] = offset+(p-sp);
if(verbose)
print("arena tail at %#llx\n", offset+(p-sp));
}
}
/* stop once both tables are full */
if(nhead == nelem(head) && ntail == nelem(tail))
break;
}
if(nhead < 3 && ntail < 3)
sysfatal("too few intact arenas: %d heads, %d tails", nhead, ntail);
/*
* Arena size is likely the most common
* inter-head or inter-tail spacing.
*/
ndiff = 0;
for(i=1; i<nhead; i++)
diff[ndiff++] = head[i] - head[i-1];
for(i=1; i<ntail; i++)
diff[ndiff++] = tail[i] - tail[i-1];
qsort(diff, ndiff, sizeof diff[0], vlongcmp);
/* find the longest run of equal values in the sorted array; n is the current run length */
bestn = 0;
bestdiff = 0;
for(i=1, n=1; i<=ndiff; i++, n++){
if(i==ndiff || diff[i] != diff[i-1]){
if(n > bestn){
bestn = n;
bestdiff = diff[i-1];
}
n = 0;
}
}
print("arena size likely %z (%d of %d)\n", bestdiff, bestn, ndiff);
if(arenasize != 0 && arenasize != bestdiff)
print("using user-specified size %z instead\n", arenasize);
else
arenasize = bestdiff;
/*
* The arena tail for an arena is arenasize-blocksize from the head.
*/
ndiff = 0;
/* walk heads and tails in step, pairing each tail with the head of the arena containing it */
for(i=j=0; i<nhead && j<ntail; ){
if(tail[j] < head[i]){
j++;
continue;
}
if(tail[j] < head[i]+arenasize){
diff[ndiff++] = head[i]+arenasize - tail[j];
j++;
continue;
}
i++;
}
if(ndiff < 3)
sysfatal("too few intact arenas: %d head, tail pairs", ndiff);
qsort(diff, ndiff, sizeof diff[0], vlongcmp);
/* most common head+arenasize-tail distance is the block size */
bestn = 0;
bestdiff = 0;
for(i=1, n=1; i<=ndiff; i++, n++){
if(i==ndiff || diff[i] != diff[i-1]){
if(n > bestn){
bestn = n;
bestdiff = diff[i-1];
}
n = 0;
}
}
print("block size likely %z (%d of %d)\n", bestdiff, bestn, ndiff);
if(ap.blocksize != 0 && ap.blocksize != bestdiff)
print("using user-specified size %z instead\n", (vlong)ap.blocksize);
else
ap.blocksize = bestdiff;
if(ap.blocksize == 0 || ap.blocksize&(ap.blocksize-1))
sysfatal("block size not a power of two");
if(ap.blocksize > MaxDiskBlock)
sysfatal("block size too big (max=%d)", MaxDiskBlock);
/*
* Use head/tail information to deduce arena base.
*/
ndiff = 0;
/* every arena start is congruent to the base modulo arenasize */
for(i=0; i<nhead; i++)
diff[ndiff++] = head[i]%arenasize;
for(i=0; i<ntail; i++)
diff[ndiff++] = (tail[i]+ap.blocksize)%arenasize;
qsort(diff, ndiff, sizeof diff[0], vlongcmp);
bestn = 0;
bestdiff = 0;
for(i=1, n=1; i<=ndiff; i++, n++){
if(i==ndiff || diff[i] != diff[i-1]){
if(n > bestn){
bestn = n;
bestdiff = diff[i-1];
}
n = 0;
}
}
ap.arenabase = bestdiff;
}
ap.tabbase = ROUNDUP(PartBlank+HeadSize, ap.blocksize);
/*
* XXX pick up table, check arenabase.
* XXX pick up table, record base name.
*/
/*
* Somewhat standard computation.
* Fmtarenas used to use 64k tab, now uses 512k tab.
*/
if(ap.arenabase == 0){
print("trying standard arena bases...\n");
/* take the first candidate with an arena head; if none matches, the last candidate is kept */
for(i=0; i<nelem(tabsizes); i++){
ap.arenabase = ROUNDUP(PartBlank+HeadSize+tabsizes[i], ap.blocksize);
p = pagein(ap.arenabase, Block);
if(u32(p) == ArenaHeadMagic)
break;
}
}
p = pagein(ap.arenabase, Block);
print("arena base likely %z%s\n", (vlong)ap.arenabase,
u32(p)!=ArenaHeadMagic ? " (but no arena head there)" : "");
ap.tabsize = ap.arenabase - ap.tabbase;
}
/*
 * Check the arena partition blocks and then the arenas listed in range.
 *
 * range is nil (check every arena), "none" (check only the partition
 * superblock), or a comma-separated list such as "-4,8-9,10-", where
 * an open lower bound means arena 0 and an open upper bound means the
 * last arena.  The range string is modified in place while parsing.
 */
void
checkarenas(char *range)
{
char *s, *t;
int i, lo, hi, narena;
uchar dbuf[HeadSize];
uchar *p;
guessgeometry();
/* ignore any partial block at the end of the partition */
partend -= partend%ap.blocksize;
/* build the superblock we expect and compare it with the one on disk */
memset(dbuf, 0, sizeof dbuf);
packarenapart(&ap, dbuf);
p = pagein(PartBlank, Block);
if(memcmp(p, dbuf, HeadSize) != 0){
print("on-disk arena part superblock incorrect\n");
showdiffs(dbuf, p, HeadSize, partinfo);
}
/* p points into the page buffer, so this edit is written back by pageout when fix is set */
memmove(p, dbuf, HeadSize);
narena = (partend-ap.arenabase + arenasize-1)/arenasize;
if(range == nil){
for(i=0; i<narena; i++)
checkarena(ap.arenabase+(vlong)i*arenasize, i);
}else if(strcmp(range, "none") == 0){
/* nothing */
}else{
/* parse, e.g., -4,8-9,10- */
for(s=range; *s; s=t){
/* cut off the next comma-separated element; t is where the following one starts */
t = strchr(s, ',');
if(t)
*t++ = 0;
else
t = s+strlen(s);
if(*s == '-')
lo = 0;
else
lo = strtol(s, &s, 0);
hi = lo;
if(*s == '-'){
s++;
if(*s == 0)
hi = narena-1;
else
hi = strtol(s, &s, 0);
}
/* anything left unparsed makes the element invalid */
if(*s != 0){
print("bad arena range: %s\n", s);
continue;
}
/* out-of-range arena numbers are rejected by checkarena's own bounds check */
for(i=lo; i<=hi; i++)
checkarena(ap.arenabase+(vlong)i*arenasize, i);
}
}
}
/*
 * Is there a clump here at p?
 *
 * Parses a clump header at p into cl and verifies the data that
 * follows it by recomputing its score (after decompressing, for
 * compressed clumps).  On success returns the total number of bytes
 * the clump occupies (header plus stored data) and stores its magic
 * number in *pmagic.  Returns 0 if p does not hold a valid clump; cl
 * may then be partially filled in and *pmagic is left untouched.
 * The caller must make sure enough bytes are readable at p.
 */
static int
isclump(uchar *p, Clump *cl, u32int *pmagic)
{
int n;
u32int magic;
uchar score[VtScoreSize], *bp;
Unwhack uw;
uchar ubuf[70*1024];
bp = p;
/* header fields, in on-disk order: magic, type, size, uncsize, score, encoding, creator, time */
magic = u32(p);
if(magic == 0)
return 0;
p += U32Size;
cl->info.type = vtfromdisktype(*p);
if(cl->info.type == 0xFF)
return 0;
p++;
cl->info.size = u16(p);
p += U16Size;
cl->info.uncsize = u16(p);
/* stored data can never be larger than the uncompressed data */
if(cl->info.size > cl->info.uncsize)
return 0;
p += U16Size;
scorecp(cl->info.score, p);
p += VtScoreSize;
cl->encoding = *p;
p++;
cl->creator = u32(p);
p += U32Size;
cl->time = u32(p);
p += U32Size;
/* p now points at the data; its score must match the one in the header */
switch(cl->encoding){
case ClumpENone:
if(cl->info.size != cl->info.uncsize)
return 0;
scoremem(score, p, cl->info.size);
if(scorecmp(score, cl->info.score) != 0)
return 0;
break;
case ClumpECompress:
if(cl->info.size >= cl->info.uncsize)
return 0;
unwhackinit(&uw);
n = unwhack(&uw, ubuf, cl->info.uncsize, p, cl->info.size);
if(n != cl->info.uncsize)
return 0;
scoremem(score, ubuf, cl->info.uncsize);
if(scorecmp(score, cl->info.score) != 0)
return 0;
break;
default:
return 0;
}
p += cl->info.size;
/* it all worked out in the end */
*pmagic = magic;
return p - bp;
}
/*
 * All ClumpInfos seen in this arena.
 * Kept in binary tree so we can look up by score.
 *
 * The entries live in the array cibuf in the order they were added;
 * the tree is threaded through the array using indices, with -1
 * meaning "no child".  Entries describing corrupt regions are in the
 * array but not in the tree (see addcibuf).
 */
typedef struct Cit Cit;
struct Cit
{
int left;	/* index of left child in cibuf, or -1 */
int right;	/* index of right child in cibuf, or -1 */
vlong corrupt;	/* nonzero: length in bytes of a corrupt region; ci is unused */
ClumpInfo ci;
};
Cit *cibuf;	/* the entries */
int ciroot;	/* index of the tree root in cibuf, or -1 if the tree is empty */
int ncibuf, mcibuf;	/* entries in use, entries allocated */
/*
 * Forget all recorded clump infos: empty the tree and reuse the
 * array from the start.  The array's storage is kept.
 */
void
resetcibuf(void)
{
	ciroot = -1;
	ncibuf = 0;
}
/*
 * Search the tree whose root link is *p for score.  Returns a pointer
 * to the link that refers to the matching entry, or, if there is no
 * match, to the empty (-1) link where such an entry would be attached.
 */
int*
ltreewalk(int *p, uchar *score)
{
	int cmp;

	while(*p != -1){
		cmp = scorecmp(cibuf[*p].ci.score, score);
		if(cmp == 0)
			break;
		if(cmp < 0)
			p = &cibuf[*p].right;
		else
			p = &cibuf[*p].left;
	}
	return p;
}
/*
 * Append a clump info to cibuf, growing the array as needed.
 * If corrupt is nonzero the entry stands for a corrupt region of
 * that many bytes and is not entered in the score tree.  Otherwise
 * the entry is linked into the tree, unless an entry with the same
 * score is already there: replacing the existing node with the new,
 * childless one would cut its subtrees out of the tree.
 */
void
addcibuf(ClumpInfo *ci, vlong corrupt)
{
	Cit *cit;
	int *link;

	if(ncibuf == mcibuf){
		mcibuf += 131072;
		cibuf = vtrealloc(cibuf, mcibuf*sizeof cibuf[0]);
	}
	cit = &cibuf[ncibuf];
	cit->ci = *ci;
	cit->left = -1;
	cit->right = -1;
	cit->corrupt = corrupt;
	if(!corrupt){
		link = ltreewalk(&ciroot, ci->score);
		if(*link == -1)
			*link = ncibuf;
	}
	ncibuf++;
}
/*
 * Record a corrupt region of len bytes as an all-zero clump info
 * entry marked corrupt.
 */
void
addcicorrupt(vlong len)
{
	ClumpInfo blank;

	memset(&blank, 0, sizeof blank);
	addcibuf(&blank, len);
}
/*
 * Report whether a clump with this score has been entered in the
 * tree: returns 1 if so, 0 if not.
 */
int
haveclump(uchar *score)
{
	int cmp, node;

	node = ciroot;
	while(node != -1){
		cmp = scorecmp(cibuf[node].ci.score, score);
		if(cmp == 0)
			return 1;
		node = cmp < 0 ? cibuf[node].right : cibuf[node].left;
	}
	return 0;
}
/*
 * Does the packed record at p (type[1] size[2] uncsize[2] score)
 * describe the same clump as ci?  Returns 1 if every field matches,
 * 0 otherwise.
 */
int
matchci(ClumpInfo *ci, uchar *p)
{
	return ci->type == vtfromdisktype(p[0])
		&& ci->size == u16(p+1)
		&& ci->uncsize == u16(p+3)
		&& scorecmp(ci->score, p+5) == 0;
}
/*
 * Decide whether the arena tail block at p (blocksize bytes) looks
 * like that of a properly sealed arena: a known version, the sealed
 * byte set, nothing but zeros between the tail structure and the
 * score, and a non-zero score in the last VtScoreSize bytes.
 * Returns 1 if so; otherwise returns 0, with a message saying why
 * unless the version itself is unknown.
 */
int
sealedarena(uchar *p, int blocksize)
{
	int vers, tailsize;

	vers = u32(p+4);
	if(vers == ArenaVersion4)
		tailsize = ArenaSize4;
	else if(vers == ArenaVersion5)
		tailsize = ArenaSize5;
	else
		return 0;

	/* the sealed flag is the last byte of the tail structure */
	if(p[tailsize-1] != 1){
		print("arena tail says not sealed\n");
		return 0;
	}
	if(memcmp(p+tailsize, zero, blocksize-VtScoreSize-tailsize) != 0){
		print("arena tail followed by non-zero data\n");
		return 0;
	}
	if(memcmp(p+blocksize-VtScoreSize, zero, VtScoreSize) == 0){
		print("arena score zero\n");
		return 0;
	}
	return 1;
}
/*
 * Is name plausible for arena number n?  It must pass nameok and
 * end in the decimal representation of n.  For n == 0 no particular
 * suffix is required.  Returns 1 if acceptable, 0 if not.
 */
int
okayname(char *name, int n)
{
	char suffix[20];
	int namelen, suflen;

	if(nameok(name) < 0)
		return 0;

	if(n == 0)
		suffix[0] = 0;
	else
		sprint(suffix, "%d", n);

	namelen = strlen(name);
	suflen = strlen(suffix);
	if(namelen < suflen)
		return 0;
	return strcmp(name+namelen-suflen, suffix) == 0;
}
/*
 * Compare two clump infos field by field: type, then size, then
 * uncsize, then score.  Returns zero if they are identical and a
 * nonzero value taken from the first differing field otherwise.
 */
int
clumpinfocmp(ClumpInfo *a, ClumpInfo *b)
{
	int d;

	d = a->type - b->type;
	if(d != 0)
		return d;
	d = a->size - b->size;
	if(d != 0)
		return d;
	d = a->uncsize - b->uncsize;
	if(d != 0)
		return d;
	return scorecmp(a->score, b->score);
}
/*
 * Read the first nci entries of the clump directory of the arena
 * that starts at offset.  The directory grows backwards from the
 * arena's tail block: its first block is the one just before the
 * tail, the next one is before that, and so on, with entries in
 * forward order inside each block.  Returns a newly allocated array
 * of nci ClumpInfos, which the caller must free.
 */
ClumpInfo*
loadci(vlong offset, Arena *arena, int nci)
{
	ClumpInfo *tab, *out;
	uchar *blk, *base;
	int done, k, perblock;

	perblock = arena->blocksize/ClumpInfoSize;
	tab = vtmalloc(nci*sizeof tab[0]);
	out = tab;

	/* start at the tail block and work downwards */
	offset += arena->size - arena->blocksize;
	blk = base = nil;
	done = 0;
	while(done < nci){
		if(blk == base){
			/* page exhausted (or first time): load the 4M below */
			base = pagein(offset-4*M, 4*M);
			blk = base+4*M;
		}
		blk -= arena->blocksize;
		offset -= arena->blocksize;
		for(k=0; k<perblock && done+k<nci; k++)
			unpackclumpinfo(out++, blk+k*ClumpInfoSize);
		done += perblock;
	}
	return tab;
}
/*
 * Write the nci clump infos in ci as the clump directory of the
 * arena that starts at offset, using the same backwards block layout
 * that loadci reads.  Each directory block is zeroed before entries
 * are packed into it.  The blocks are edited in the page buffer and
 * flushed with pageout, so they reach the disk only when fix is set.
 * Returns the disk offset of the lowest directory block written.
 */
vlong
writeci(vlong offset, Arena *arena, ClumpInfo *ci, int nci)
{
	uchar *blk, *base;
	int done, k, perblock;

	perblock = arena->blocksize/ClumpInfoSize;

	/* start at the tail block and work downwards */
	offset += arena->size - arena->blocksize;
	blk = base = nil;
	done = 0;
	while(done < nci){
		if(blk == base){
			/* page exhausted (or first time): load the 4M below */
			base = pagein(offset-4*M, 4*M);
			blk = base+4*M;
		}
		blk -= arena->blocksize;
		offset -= arena->blocksize;
		memset(blk, 0, arena->blocksize);
		for(k=0; k<perblock && done+k<nci; k++)
			packclumpinfo(ci++, blk+k*ClumpInfoSize);
		done += perblock;
	}
	pageout();
	return offset;
}
/*
 * Fill in the basic fields of head and arena for arena number anum,
 * which starts at disk offset offset0: size, block size, version,
 * clump magic and name.  Size and block size come from the guessed
 * geometry; version, clump magic and name are taken from the on-disk
 * head and tail blocks when they unpack successfully.  The sealed
 * flag and clump count from the on-disk tail are carried over too.
 * Calls sysfatal if no name can be determined.
 */
void
loadarenabasics(vlong offset0, int anum, ArenaHead *head, Arena *arena)
{
char dname[ANameSize];
/* base name (arena name minus its number) carried over from the previous call */
static char lastbase[ANameSize];
uchar *p;
Arena oarena;
ArenaHead ohead;
/*
* Fmtarenas makes all arenas the same size
* except the last, which may be smaller.
* It uses the same block size for arenas as for
* the arena partition blocks.
*/
arena->size = arenasize;
if(offset0+arena->size > partend)
arena->size = partend - offset0;
head->size = arena->size;
arena->blocksize = ap.blocksize;
head->blocksize = arena->blocksize;
/*
* Look for clump magic and name in head/tail blocks.
* All the other info we will reconstruct just in case.
*/
p = pagein(offset0, arena->blocksize);
memset(&ohead, 0, sizeof ohead);
if(unpackarenahead(&ohead, p) >= 0){
head->version = ohead.version;
head->clumpmagic = ohead.clumpmagic;
if(okayname(ohead.name, anum))
strcpy(head->name, ohead.name);
}
p = pagein(offset0+arena->size-arena->blocksize,
arena->blocksize);
memset(&oarena, 0, sizeof oarena);
if(unpackarena(&oarena, p) >= 0){
arena->version = oarena.version;
arena->clumpmagic = oarena.clumpmagic;
if(okayname(oarena.name, anum))
strcpy(arena->name, oarena.name);
arena->diskstats.clumps = oarena.diskstats.clumps;
print("old arena: sealed=%d\n", oarena.diskstats.sealed);
arena->diskstats.sealed = oarena.diskstats.sealed;
}
/* Head trumps arena. */
if(head->version){
arena->version = head->version;
arena->clumpmagic = head->clumpmagic;
}
if(arena->version == 0)
arena->version = ArenaVersion5;
/*
 * Choose the name, in order of preference: the user-supplied base
 * name, the base name remembered from the previous arena, the name
 * in the head block, the name in the tail block.
 */
if(basename){
if(anum == -1)
snprint(arena->name, ANameSize, "%s", basename);
else
snprint(arena->name, ANameSize, "%s%d", basename, anum);
}else if(lastbase[0])
snprint(arena->name, ANameSize, "%s%d", lastbase, anum);
else if(head->name[0])
strcpy(arena->name, head->name);
else if(arena->name[0] == 0)
sysfatal("cannot determine base name for arena; use -n");
/* remember the name with the arena number chopped off, for the next arena */
strcpy(lastbase, arena->name);
sprint(dname, "%d", anum);
/*
 * NOTE(review): this assumes the name ends in the digits of anum.
 * With anum == -1 and basename set, the name has no numeric suffix
 * and the last two characters of the base name are cut instead; a
 * name shorter than dname would index before the array.  Confirm
 * whether those cases can occur.
 */
lastbase[strlen(lastbase)-strlen(dname)] = 0;
/* Was working in arena, now copy to head. */
head->version = arena->version;
memmove(head->name, arena->name, sizeof head->name);
head->blocksize = arena->blocksize;
head->size = arena->size;
}
/*
 * Start the hash sb at disk offset offset0 with the arena head block
 * as it should be, i.e. packed from head rather than read from disk.
 */
void
shahead(Shabuf *sb, vlong offset0, ArenaHead *head)
{
	uchar blk[MaxDiskBlock];

	memset(blk, 0, sizeof blk);
	packarenahead(head, blk);

	sb->offset = offset0;
	sbupdate(sb, blk, offset0, head->blocksize);
}
/*
 * Pick a clump magic number for a new arena of the given version.
 * ArenaVersion4 always uses _ClumpMagic; other versions get a random
 * value that is neither zero nor _ClumpMagic.
 */
u32int
newclumpmagic(int version)
{
	u32int m;

	if(version == ArenaVersion4)
		return _ClumpMagic;
	for(;;){
		m = fastrand();
		if(m != 0 && m != _ClumpMagic)
			return m;
	}
}
/*
 * Poke around in the arena to find the clump data
 * and compute the relevant statistics.
 *
 * offset0 is the disk offset of the arena and anum its number.
 * On return head and arena describe the arena as it should be,
 * oldscore holds the SHA1 of the arena as found on disk (only
 * computed if the on-disk tail says the arena is sealed) and score
 * the SHA1 of the arena after repair (only computed when the arena
 * ends up sealed and fix is set); both are zero otherwise.
 * With fix set, corrupt clump data is zeroed, clump magic numbers
 * are made uniform and the clump directory is rewritten on disk.
 */
void
guessarena(vlong offset0, int anum, ArenaHead *head, Arena *arena,
uchar *oldscore, uchar *score)
{
uchar dbuf[MaxDiskBlock];
int needtozero, clumps, nb1, nb2, minclumps;
int inbad, n, ncib, printed, sealing, smart;
u32int magic;
uchar *sp, *ep, *p;
vlong boffset, eoffset, lastclumpend, leaked;
vlong offset, toffset, totalcorrupt, v;
Clump cl;
ClumpInfo *bci, *ci, *eci, *xci;
Cit *bcit, *cit, *ecit;
Shabuf oldsha, newsha;
/*
* We expect to find an arena, with data, between offset
* and offset+arenasize. With any luck, the data starts at
* offset+ap.blocksize. The blocks have variable size and
* aren't padded at all, which doesn't give us any alignment
* constraints. The blocks are compressed or high entropy,
* but the headers are pretty low entropy (except the score):
*
* type[1] (range 0 thru 9, 13)
* size[2]
* uncsize[2] (>= size; isclump rejects size > uncsize)
*
* so we can look for these. We check the scores as we go,
* so we can't make any wrong turns. If we find ourselves
* in a dead end, scan forward looking for a new start.
*/
resetcibuf();
memset(head, 0, sizeof *head);
memset(arena, 0, sizeof *arena);
memset(oldscore, 0, VtScoreSize);
memset(score, 0, VtScoreSize);
memset(&oldsha, 0, sizeof oldsha);
memset(&newsha, 0, sizeof newsha);
newsha.rollback = 1;
if(0){
sbdebug(&oldsha, "old.sha");
sbdebug(&newsha, "new.sha");
}
loadarenabasics(offset0, anum, head, arena);
/* start the clump hunt */
clumps = 0;
totalcorrupt = 0;
sealing = 1;
/*
 * boffset: start of clump data.  eoffset: current end of the space
 * available for clump data; it moves down as directory blocks are
 * accounted for.  toffset: offset of the tail block.
 */
boffset = offset0 + arena->blocksize;
offset = boffset;
eoffset = offset0+arena->size - arena->blocksize;
toffset = eoffset;
sp = pagein(offset0, 4*M);
if(arena->diskstats.sealed){
oldsha.offset = offset0;
sbupdate(&oldsha, sp, offset0, 4*M);
}
ep = sp+4*M;
p = sp + (boffset - offset0);
ncib = arena->blocksize / ClumpInfoSize; /* ci per block in index */
lastclumpend = offset;
nbad = 0;
inbad = 0;
needtozero = 0;
minclumps = 0;
while(offset < eoffset){
/*
* Shift buffer if we're running out of room.
*/
if(p+70*K >= ep){
/*
* Start the post SHA1 buffer. By now we should know the
* clumpmagic and arena version, so we can create a
* correct head block to get things going.
*/
if(sealing && fix && newsha.offset == 0){
newsha.offset = offset0;
if(arena->clumpmagic == 0){
if(arena->version == 0)
arena->version = ArenaVersion5;
arena->clumpmagic = newclumpmagic(arena->version);
}
head->clumpmagic = arena->clumpmagic;
shahead(&newsha, offset0, head);
}
/* slide the window forward, keeping the last 256K of the old page */
n = 4*M-256*K;
if(sealing && fix){
sbdiskhash(&newsha, bufoffset);
sbupdate(&newsha, buf, bufoffset, 4*M-256*K);
}
pagein(bufoffset+n, 4*M);
p -= n;
if(arena->diskstats.sealed)
sbupdate(&oldsha, buf, bufoffset, 4*M);
}
/*
* Check for a clump at p, which is at offset in the disk.
* Duplicate clumps happen in corrupted disks
* (the same pattern gets written many times in a row)
* and should never happen during regular use.
*/
magic = 0;
if((n = isclump(p, &cl, &magic)) > 0){
/*
* If we were in the middle of some corrupted data,
* flush a warning about it and then add any clump
* info blocks as necessary.
*/
if(inbad){
inbad = 0;
v = offset-lastclumpend;
if(needtozero){
zerorange(lastclumpend, v);
sbrollback(&newsha, lastclumpend);
print("corrupt clump data - %#llux+%#llux (%,llud bytes)\n",
lastclumpend, v, v);
}
addcicorrupt(v);
totalcorrupt += v;
/* reserve directory room for the entries needed to describe the corrupt region */
nb1 = (minclumps+ncib-1)/ncib;
minclumps += (v+ClumpSize+VtMaxLumpSize-1)/(ClumpSize+VtMaxLumpSize);
nb2 = (minclumps+ncib-1)/ncib;
eoffset -= (nb2-nb1)*arena->blocksize;
}
if(haveclump(cl.info.score))
print("warning: duplicate clump %d %V at %#llux+%#d\n", cl.info.type, cl.info.score, offset, n);
/*
* If clumps use different magic numbers, we don't care.
* We'll just use the first one we find and make the others
* follow suit.
*/
if(arena->clumpmagic == 0){
print("clump type %d size %d score %V magic %x\n",
cl.info.type, cl.info.size, cl.info.score, magic);
arena->clumpmagic = magic;
if(magic == _ClumpMagic)
arena->version = ArenaVersion4;
else
arena->version = ArenaVersion5;
}
/* p points into the page buffer, so this rewrites the magic on disk when fix is set */
if(magic != arena->clumpmagic)
p32(p, arena->clumpmagic);
if(clumps == 0)
arena->ctime = cl.time;
/*
* Record the clump, update arena stats,
* grow clump info blocks if needed.
*/
if(verbose > 1)
print("\tclump %d: %d %V at %#llux+%#ux (%d)\n",
clumps, cl.info.type, cl.info.score, offset, n, n);
addcibuf(&cl.info, 0);
/* each new directory block takes one block away from the data area */
if(minclumps%ncib == 0)
eoffset -= arena->blocksize;
minclumps++;
clumps++;
if(cl.encoding != ClumpENone)
arena->diskstats.cclumps++;
arena->diskstats.uncsize += cl.info.uncsize;
arena->wtime = cl.time;
/*
* Move to next clump.
*/
offset += n;
p += n;
lastclumpend = offset;
}else{
/*
* Overwrite malformed clump data with zeros later.
* For now, just record whether it needs to be overwritten.
* Bad regions must be of size at least ClumpSize.
* Postponing the overwriting keeps us from writing past
* the end of the arena data (which might be directory data)
* with zeros.
*/
if(!inbad){
inbad = 1;
needtozero = 0;
if(memcmp(p, zero, ClumpSize) != 0)
needtozero = 1;
p += ClumpSize;
offset += ClumpSize;
nbad++;
}else{
if(*p != 0)
needtozero = 1;
p++;
offset++;
}
}
}
pageout();
if(verbose)
print("readable clumps: %d; min. directory entries: %d\n",
clumps, minclumps);
arena->diskstats.used = lastclumpend - boffset;
leaked = eoffset - lastclumpend;
if(verbose)
print("used from %#llux to %#llux = %,lld (%,lld unused)\n",
boffset, lastclumpend, arena->diskstats.used, leaked);
/*
* Finish the SHA1 of the old data.
*/
if(arena->diskstats.sealed){
sbdiskhash(&oldsha, toffset);
readdisk(dbuf, toffset, arena->blocksize);
/* the score field itself is hashed as zeros */
scorecp(dbuf+arena->blocksize-VtScoreSize, zero);
sbupdate(&oldsha, dbuf, toffset, arena->blocksize);
sbscore(&oldsha, oldscore);
}
/*
* If we still don't know the clump magic, the arena
* must be empty. It still needs a value, so make
* something up.
*/
if(arena->version == 0)
arena->version = ArenaVersion5;
if(arena->clumpmagic == 0){
if(arena->version == ArenaVersion4)
arena->clumpmagic = _ClumpMagic;
else{
do
arena->clumpmagic = fastrand();
while(arena->clumpmagic==_ClumpMagic
||arena->clumpmagic==0);
}
head->clumpmagic = arena->clumpmagic;
}
/*
* Guess at number of clumpinfo blocks to load.
* If we guess high, it's no big deal. If we guess low,
* we'll be forced into rewriting the whole directory.
* Still not such a big deal.
*/
if(clumps == 0 || arena->diskstats.used == totalcorrupt)
goto Nocib;
if(clumps < arena->diskstats.clumps)
clumps = arena->diskstats.clumps;
if(clumps < ncibuf)
clumps = ncibuf;
/*
 * NOTE(review): the divisor below is the average clump size; it is
 * zero when used-totalcorrupt < clumps, which looks possible if
 * diskstats.clumps (read from disk) is very large.  Confirm.
 */
clumps += totalcorrupt/
((arena->diskstats.used - totalcorrupt)/clumps);
clumps += totalcorrupt/2000;
if(clumps < minclumps)
clumps = minclumps;
/* round up to a whole number of directory blocks */
clumps += ncib-1;
clumps -= clumps%ncib;
/*
* Can't write into the actual data.
*/
v = offset0 + arena->size - arena->blocksize;
v -= (clumps+ncib-1)/ncib * arena->blocksize;
if(v < lastclumpend){
v = offset0 + arena->size - arena->blocksize;
clumps = (v-lastclumpend)/arena->blocksize * ncib;
}
if(clumps < minclumps)
print("cannot happen?\n");
/*
* Check clumpinfo blocks against directory we created.
* The tricky part is handling the corrupt sections of arena.
* If possible, we remark just the affected directory entries
* rather than slide everything down.
*
* Allocate clumps+1 blocks and check that we don't need
* the last one at the end.
*/
bci = loadci(offset0, arena, clumps+1);
eci = bci+clumps+1;
bcit = cibuf;
ecit = cibuf+ncibuf;
smart = 0; /* Somehow the smart code doesn't do corrupt clumps right. */
Again:
nbad = 0;
ci = bci;
/* walk the clumps we found (cit) alongside the on-disk directory (ci) */
for(cit=bcit; cit<ecit && ci<eci; cit++){
if(cit->corrupt){
vlong n, m;
if(smart){
/*
* If we can, just mark existing entries as corrupt.
*/
n = cit->corrupt;
for(xci=ci; n>0 && xci<eci; xci++)
n -= ClumpSize+xci->size;
if(n > 0 || xci >= eci)
goto Dumb;
printed = 0;
for(; ci<xci; ci++){
if(verbose && ci->type != VtCorruptType){
if(!printed){
print("marking directory %d-%d as corrupt\n",
(int)(ci-bci), (int)(xci-bci));
printed = 1;
}
print("\ttype=%d size=%d uncsize=%d score=%V\n",
ci->type, ci->size, ci->uncsize, ci->score);
}
ci->type = VtCorruptType;
}
}else{
Dumb:
print("\trewriting clump directory\n");
/*
* Otherwise, blaze a new trail.
*/
/* cover the corrupt region with corrupt-type entries of at most VtMaxLumpSize+ClumpSize bytes each */
n = cit->corrupt;
while(n > 0 && ci < eci){
if(n < ClumpSize)
sysfatal("bad math in clump corrupt");
if(n <= VtMaxLumpSize+ClumpSize)
m = n;
else{
m = VtMaxLumpSize+ClumpSize;
/* don't leave a remainder too small to be described */
if(n-m < ClumpSize)
m -= ClumpSize;
}
ci->type = VtCorruptType;
ci->size = m-ClumpSize;
ci->uncsize = m-ClumpSize;
memset(ci->score, 0, VtScoreSize);
ci++;
n -= m;
}
}
continue;
}
if(clumpinfocmp(&cit->ci, ci) != 0){
if(verbose && (smart || verbose>1)){
print("clumpinfo %d\n", (int)(ci-bci));
print("\twant: %d %d %d %V\n",
cit->ci.type, cit->ci.size,
cit->ci.uncsize, cit->ci.score);
print("\thave: %d %d %d %V\n",
ci->type, ci->size,
ci->uncsize, ci->score);
}
*ci = cit->ci;
nbad++;
}
ci++;
}
if(ci >= eci || cit < ecit){
print("ran out of space editing existing directory; rewriting\n");
print("# eci %ld ci %ld ecit %ld cit %ld\n", eci-bci, ci-bci, ecit-bcit, cit-bcit);
/* NOTE(review): smart is initialized to 0 above, so this assert fails whenever this branch is reached. */
assert(smart); /* can't happen second time thru */
smart = 0;
goto Again;
}
assert(ci <= eci);
arena->diskstats.clumps = ci-bci;
eoffset = writeci(offset0, arena, bci, ci-bci);
if(sealing && fix)
sbrollback(&newsha, v);
print("eoffset=%lld lastclumpend=%lld diff=%lld unseal=%d\n", eoffset, lastclumpend, eoffset-lastclumpend, unseal);
if(lastclumpend > eoffset)
print("arena directory overwrote blocks! cannot happen!\n");
free(bci);
if(smart && nbad)
print("arena directory has %d bad or missing entries\n", nbad);
Nocib:
/* with more than 64K still unused, leave the arena unsealed (a sealed one only if -unseal was requested) */
if(eoffset - lastclumpend > 64*1024 && (!arena->diskstats.sealed || unseal)){
if(arena->diskstats.sealed)
print("unsealing arena\n");
sealing = 0;
memset(oldscore, 0, VtScoreSize);
}
/*
* Finish the SHA1 of the new data - only meaningful
* if we've been writing to disk (`fix').
*/
arena->diskstats.sealed = sealing;
arena->memstats = arena->diskstats;
if(sealing && fix){
uchar tbuf[MaxDiskBlock];
sbdiskhash(&newsha, toffset);
memset(tbuf, 0, sizeof tbuf);
packarena(arena, tbuf);
sbupdate(&newsha, tbuf, toffset, arena->blocksize);
sbscore(&newsha, score);
}
}
/*
 * Copy the arena that starts at disk offset `offset' into the file
 * "dumpbase.anum", 4M at a time.  Errors creating or writing the
 * file are reported on standard error and end the dump early.
 * The file is closed on every path once it has been created.
 */
void
dumparena(vlong offset, int anum, Arena *arena)
{
	char buf[1000];
	vlong o, e;
	int fd, n;

	snprint(buf, sizeof buf, "%s.%d", dumpbase, anum);
	if((fd = create(buf, OWRITE, 0666)) < 0){
		fprint(2, "create %s: %r\n", buf);
		return;
	}
	e = offset+arena->size;
	for(o=offset; o<e; o+=n){
		n = 4*M;
		if(o+n > e)
			n = e-o;
		if(pwrite(fd, pagein(o, n), n, o-offset) != n){
			/* report before close, which could change the error string */
			fprint(2, "write %s at %#llux: %r\n", buf, o-offset);
			break;
		}
	}
	close(fd);
}
/*
 * Check (and stage repairs for) the arena that starts at byte
 * `offset' of the partition.  anum is the arena's number; it is
 * printed and passed on to guessarena and dumparena.
 *
 * The header and tail blocks are rebuilt from what guessarena
 * reports, compared with what is on disk, and any differences are
 * printed.  The rebuilt blocks are then copied over the paged-in
 * copies and pageout is called.
 * NOTE(review): whether pageout actually writes presumably depends
 * on the `fix' flag -- confirm in pagein/pageout.
 */
void
checkarena(vlong offset, int anum)
{
uchar dbuf[MaxDiskBlock];
uchar *p, oldscore[VtScoreSize], score[VtScoreSize];
Arena arena, oarena;
ArenaHead head;
Info *fmt, *fmta;
int sz;
print("# arena %d: offset %#llux\n", anum, offset);
/* nothing to examine if the arena would start past the end of the partition */
if(offset >= partend){
print("arena offset out of bounds\n");
return;
}
/*
 * guessarena is handed head, arena, oldscore and score to fill in;
 * everything below treats them as the "correct" values.
 */
guessarena(offset, anum, &head, &arena, oldscore, score);
if(verbose){
print("#\tversion=%d name=%s blocksize=%d size=%z",
head.version, head.name, head.blocksize, head.size);
if(head.clumpmagic)
print(" clumpmagic=%#.8ux", head.clumpmagic);
print("\n#\tclumps=%d cclumps=%d used=%,lld uncsize=%,lld\n",
arena.diskstats.clumps, arena.diskstats.cclumps,
arena.diskstats.used, arena.diskstats.uncsize);
print("#\tctime=%t\n", arena.ctime);
print("#\twtime=%t\n", arena.wtime);
if(arena.diskstats.sealed)
print("#\tsealed score=%V\n", score);
}
/* when a dump base name was given, only dump the arena; skip the checks */
if(dumpbase){
dumparena(offset, anum, &arena);
return;
}
/*
 * Header: pack the expected header into a zeroed block, compare it
 * with the first block of the arena, and overwrite the paged-in
 * copy with the expected bytes.
 */
memset(dbuf, 0, sizeof dbuf);
packarenahead(&head, dbuf);
p = pagein(offset, arena.blocksize);
if(memcmp(dbuf, p, arena.blocksize) != 0){
print("on-disk arena header incorrect\n");
showdiffs(dbuf, p, arena.blocksize,
arena.version==ArenaVersion4 ? headinfo4 : headinfo5);
}
memmove(p, dbuf, arena.blocksize);
/*
 * Tail: pack the expected trailer; for a sealed arena the score
 * occupies the last VtScoreSize bytes of the block.
 */
memset(dbuf, 0, sizeof dbuf);
packarena(&arena, dbuf);
if(arena.diskstats.sealed)
scorecp(dbuf+arena.blocksize-VtScoreSize, score);
/* the tail is the last block of the arena; oarena is what is on disk now */
p = pagein(offset+arena.size-arena.blocksize, arena.blocksize);
memset(&oarena, 0, sizeof oarena);
unpackarena(&oarena, p);
/* pick the trailer size and diff descriptions that match the arena version */
if(arena.version == ArenaVersion4){
sz = ArenaSize4;
fmt = tailinfo4;
fmta = tailinfo4a;
}else{
sz = ArenaSize5;
fmt = tailinfo5;
fmta = tailinfo5a;
}
/*
 * A 1 in the byte just past the fixed-size trailer is taken to mean
 * the on-disk tail carries the extension, so diffs are shown with
 * the extended layout (fmta).
 */
if(p[sz] == 1){
fmt = fmta;
if(oarena.diskstats.sealed){
/*
* some arenas were sealed with the extension
* before we adopted the convention that if it didn't
* add new information it gets dropped.
*/
/* NOTE(review): third argument presumably forces the extension to be packed -- confirm in _packarena */
_packarena(&arena, dbuf, 1);
}
}
/* compare everything except the trailing score, which is checked separately below */
if(memcmp(dbuf, p, arena.blocksize-VtScoreSize) != 0){
print("on-disk arena tail incorrect\n");
showdiffs(dbuf, p, arena.blocksize-VtScoreSize, fmt);
}
if(arena.diskstats.sealed){
/* an already sealed arena should carry oldscore on disk */
if(oarena.diskstats.sealed)
if(scorecmp(p+arena.blocksize-VtScoreSize, oldscore) != 0){
print("on-disk arena seal score incorrect\n");
print("\tcorrect=%V\n", oldscore);
print("\t disk=%V\n", p+arena.blocksize-VtScoreSize);
}
/* when fixing, announce that the stored score is being replaced by score */
if(fix && scorecmp(p+arena.blocksize-VtScoreSize, score) != 0){
print("%ssealing arena%s: %V\n",
oarena.diskstats.sealed ? "re" : "",
scorecmp(oldscore, score) == 0 ?
"" : " after changes", score);
}
}
/* install the rebuilt tail in the page and hand the page back */
memmove(p, dbuf, arena.blocksize);
pageout();
}
AMapN*
buildamap(void)
{
uchar *p;
vlong o;
ArenaHead h;
AMapN *an;
AMap *m;
an = vtmallocz(sizeof *an);
for(o=ap.arenabase; o<partend; o+=arenasize){
p = pagein(o, Block);
if(unpackarenahead(&h, p) >= 0){
an->map = vtrealloc(an->map, (an->n+1)*sizeof an->map[0]);
m = &an->map[an->n++];
m->start = o;
m->stop = o+h.size;
strcpy(m->name, h.name);
}
}
return an;
}
void
checkmap(void)
{
char *s;
uchar *p;
int i, len;
AMapN *an;
Fmt fmt;
an = buildamap();
fmtstrinit(&fmt);
fmtprint(&fmt, "%ud\n", an->n);
for(i=0; i<an->n; i++)
fmtprint(&fmt, "%s\t%lld\t%lld\n",
an->map[i].name, an->map[i].start, an->map[i].stop);
s = fmtstrflush(&fmt);
len = strlen(s);
if(len > ap.tabsize){
print("arena partition map too long: need %z bytes have %z\n",
(vlong)len, (vlong)ap.tabsize);
len = ap.tabsize;
}
if(ap.tabsize >= 4*M){ /* can't happen - max arenas is 2000 */
print("arena partition map *way* too long\n");
return;
}
p = pagein(ap.tabbase, ap.tabsize);
if(memcmp(p, s, len) != 0){
print("arena partition map incorrect; rewriting.\n");
memmove(p, s, len);
}
pageout();
}
/*
 * Stack size, in bytes, requested for the main thread; the thread
 * library reads this variable (see thread(3)).  Raised to 512K
 * here; note that checkarena keeps a MaxDiskBlock-sized buffer
 * on its stack.
 */
int mainstacksize = 512*1024;
/*
 * Program entry point.
 *
 * Options:
 *	-U		set unseal
 *	-a size		set arenasize (parsed by unittoull)
 *	-b size		set ap.blocksize (parsed by unittoull)
 *	-f		fix: open the partition read-write
 *	-n name		set basename
 *	-v		increase verbosity (may be repeated)
 *	-x base		set dumpbase
 *
 * Exactly one or two arguments must remain: the partition file and
 * an optional second argument that is passed to checkarenas.
 * If isonearena reports true, only checkarena(0, -1) is run;
 * otherwise checkarenas and then checkmap are run.
 */
void
threadmain(int argc, char **argv)
{
	int mode;
	char *ranges;

	/* read-only unless -f is given */
	readonly = 1;
	mode = OREAD;

	ARGBEGIN{
	case 'a':
		arenasize = unittoull(EARGF(usage()));
		break;
	case 'b':
		ap.blocksize = unittoull(EARGF(usage()));
		break;
	case 'f':
		readonly = 0;
		mode = ORDWR;
		fix = 1;
		break;
	case 'n':
		basename = EARGF(usage());
		break;
	case 'v':
		++verbose;
		break;
	case 'x':
		dumpbase = EARGF(usage());
		break;
	case 'U':
		unseal = 1;
		break;
	default:
		usage();
	}ARGEND

	if(argc < 1 || argc > 2)
		usage();
	file = argv[0];

	/* install the print verbs used by the diagnostics */
	ventifmtinstall();
	fmtinstall('z', zfmt);
	fmtinstall('t', tfmt);
	quotefmtinstall();

	part = initpart(file, mode|ODIRECT);
	if(part == nil)
		sysfatal("can't open %s: %r", file);
	partend = part->size;

	if(!isonearena()){
		ranges = nil;
		if(argc > 1)
			ranges = argv[1];
		checkarenas(ranges);
		checkmap();
	}else
		checkarena(0, -1);
	threadexitsall(nil);
}