| /* From libregexp but leaks extra classes when it runs out */ | 
 |  | 
 |  | 
 | #include <u.h> | 
 | #include <libc.h> | 
 | #include "regexp.h" | 
 | #include "/sys/src/libregexp/regcomp.h" | 
 |  | 
 | #define	TRUE	1 | 
 | #define	FALSE	0 | 
 |  | 
 | /* | 
 |  * Parser Information | 
 |  */ | 
 | typedef | 
 | struct Node | 
 | { | 
 | 	Reinst*	first; | 
 | 	Reinst*	last; | 
 | }Node; | 
 |  | 
 | #define	NSTACK	20 | 
 | static	Node	andstack[NSTACK]; | 
 | static	Node	*andp; | 
 | static	int	atorstack[NSTACK]; | 
 | static	int*	atorp; | 
 | static	int	cursubid;		/* id of current subexpression */ | 
 | static	int	subidstack[NSTACK];	/* parallel to atorstack */ | 
 | static	int*	subidp; | 
 | static	int	lastwasand;	/* Last token was operand */ | 
 | static	int	nbra; | 
 | static	char*	exprp;		/* pointer to next character in source expression */ | 
 | static	int	lexdone; | 
 | static	int	nclass; | 
 | static	Reclass*classp; | 
 | static	Reinst*	freep; | 
 | static	int	errors; | 
 | static	Rune	yyrune;		/* last lex'd rune */ | 
 | static	Reclass*yyclassp;	/* last lex'd class */ | 
 |  | 
 | /* predeclared crap */ | 
 | static	void	operator(int); | 
 | static	void	pushand(Reinst*, Reinst*); | 
 | static	void	pushator(int); | 
 | static	void	evaluntil(int); | 
 | static	int	bldcclass(void); | 
 |  | 
 | static jmp_buf regkaboom; | 
 |  | 
 | static	void | 
 | rcerror(char *s) | 
 | { | 
 | 	errors++; | 
 | 	regerror(s); | 
 | 	longjmp(regkaboom, 1); | 
 | } | 
 |  | 
 | static	Reinst* | 
 | newinst(int t) | 
 | { | 
 | 	freep->type = t; | 
 | 	freep->left = 0; | 
 | 	freep->right = 0; | 
 | 	return freep++; | 
 | } | 
 |  | 
 | static	void | 
 | operand(int t) | 
 | { | 
 | 	Reinst *i; | 
 |  | 
 | 	if(lastwasand) | 
 | 		operator(CAT);	/* catenate is implicit */ | 
 | 	i = newinst(t); | 
 |  | 
 | 	if(t == CCLASS || t == NCCLASS) | 
 | 		i->cp = yyclassp; | 
 | 	if(t == RUNE) | 
 | 		i->r = yyrune; | 
 |  | 
 | 	pushand(i, i); | 
 | 	lastwasand = TRUE; | 
 | } | 
 |  | 
 | static	void | 
 | operator(int t) | 
 | { | 
 | 	if(t==RBRA && --nbra<0) | 
 | 		rcerror("unmatched right paren"); | 
 | 	if(t==LBRA){ | 
 | 		if(++cursubid >= NSUBEXP) | 
 | 			rcerror ("too many subexpressions"); | 
 | 		nbra++; | 
 | 		if(lastwasand) | 
 | 			operator(CAT); | 
 | 	} else | 
 | 		evaluntil(t); | 
 | 	if(t != RBRA) | 
 | 		pushator(t); | 
 | 	lastwasand = FALSE; | 
 | 	if(t==STAR || t==QUEST || t==PLUS || t==RBRA) | 
 | 		lastwasand = TRUE;	/* these look like operands */ | 
 | } | 
 |  | 
 | static	void | 
 | regerr2(char *s, int c) | 
 | { | 
 | 	char buf[100]; | 
 | 	char *cp = buf; | 
 | 	while(*s) | 
 | 		*cp++ = *s++; | 
 | 	*cp++ = c; | 
 | 	*cp = '\0';  | 
 | 	rcerror(buf); | 
 | } | 
 |  | 
 | static	void | 
 | cant(char *s) | 
 | { | 
 | 	char buf[100]; | 
 | 	strcpy(buf, "can't happen: "); | 
 | 	strcat(buf, s); | 
 | 	rcerror(buf); | 
 | } | 
 |  | 
 | static	void | 
 | pushand(Reinst *f, Reinst *l) | 
 | { | 
 | 	if(andp >= &andstack[NSTACK]) | 
 | 		cant("operand stack overflow"); | 
 | 	andp->first = f; | 
 | 	andp->last = l; | 
 | 	andp++; | 
 | } | 
 |  | 
 | static	void | 
 | pushator(int t) | 
 | { | 
 | 	if(atorp >= &atorstack[NSTACK]) | 
 | 		cant("operator stack overflow"); | 
 | 	*atorp++ = t; | 
 | 	*subidp++ = cursubid; | 
 | } | 
 |  | 
 | static	Node* | 
 | popand(int op) | 
 | { | 
 | 	Reinst *inst; | 
 |  | 
 | 	if(andp <= &andstack[0]){ | 
 | 		regerr2("missing operand for ", op); | 
 | 		inst = newinst(NOP); | 
 | 		pushand(inst,inst); | 
 | 	} | 
 | 	return --andp; | 
 | } | 
 |  | 
 | static	int | 
 | popator(void) | 
 | { | 
 | 	if(atorp <= &atorstack[0]) | 
 | 		cant("operator stack underflow"); | 
 | 	--subidp; | 
 | 	return *--atorp; | 
 | } | 
 |  | 
 | static	void | 
 | evaluntil(int pri) | 
 | { | 
 | 	Node *op1, *op2; | 
 | 	Reinst *inst1, *inst2; | 
 |  | 
 | 	while(pri==RBRA || atorp[-1]>=pri){ | 
 | 		switch(popator()){ | 
 | 		default: | 
 | 			rcerror("unknown operator in evaluntil"); | 
 | 			break; | 
 | 		case LBRA:		/* must have been RBRA */ | 
 | 			op1 = popand('('); | 
 | 			inst2 = newinst(RBRA); | 
 | 			inst2->subid = *subidp; | 
 | 			op1->last->next = inst2; | 
 | 			inst1 = newinst(LBRA); | 
 | 			inst1->subid = *subidp; | 
 | 			inst1->next = op1->first; | 
 | 			pushand(inst1, inst2); | 
 | 			return; | 
 | 		case OR: | 
 | 			op2 = popand('|'); | 
 | 			op1 = popand('|'); | 
 | 			inst2 = newinst(NOP); | 
 | 			op2->last->next = inst2; | 
 | 			op1->last->next = inst2; | 
 | 			inst1 = newinst(OR); | 
 | 			inst1->right = op1->first; | 
 | 			inst1->left = op2->first; | 
 | 			pushand(inst1, inst2); | 
 | 			break; | 
 | 		case CAT: | 
 | 			op2 = popand(0); | 
 | 			op1 = popand(0); | 
 | 			op1->last->next = op2->first; | 
 | 			pushand(op1->first, op2->last); | 
 | 			break; | 
 | 		case STAR: | 
 | 			op2 = popand('*'); | 
 | 			inst1 = newinst(OR); | 
 | 			op2->last->next = inst1; | 
 | 			inst1->right = op2->first; | 
 | 			pushand(inst1, inst1); | 
 | 			break; | 
 | 		case PLUS: | 
 | 			op2 = popand('+'); | 
 | 			inst1 = newinst(OR); | 
 | 			op2->last->next = inst1; | 
 | 			inst1->right = op2->first; | 
 | 			pushand(op2->first, inst1); | 
 | 			break; | 
 | 		case QUEST: | 
 | 			op2 = popand('?'); | 
 | 			inst1 = newinst(OR); | 
 | 			inst2 = newinst(NOP); | 
 | 			inst1->left = inst2; | 
 | 			inst1->right = op2->first; | 
 | 			op2->last->next = inst2; | 
 | 			pushand(inst1, inst2); | 
 | 			break; | 
 | 		} | 
 | 	} | 
 | } | 
 |  | 
 | static	Reprog* | 
 | optimize(Reprog *pp) | 
 | { | 
 | 	Reinst *inst, *target; | 
 | 	int size; | 
 | 	Reprog *npp; | 
 | 	Reclass *cl; | 
 | 	int diff; | 
 |  | 
 | 	/* | 
 | 	 *  get rid of NOOP chains | 
 | 	 */ | 
 | 	for(inst=pp->firstinst; inst->type!=END; inst++){ | 
 | 		target = inst->next; | 
 | 		while(target->type == NOP) | 
 | 			target = target->next; | 
 | 		inst->next = target; | 
 | 	} | 
 |  | 
 | 	/* | 
 | 	 *  The original allocation is for an area larger than | 
 | 	 *  necessary.  Reallocate to the actual space used | 
 | 	 *  and then relocate the code. | 
 | 	 */ | 
 | 	size = sizeof(Reprog) + (freep - pp->firstinst)*sizeof(Reinst); | 
 | 	npp = realloc(pp, size); | 
 | 	if(npp==0 || npp==pp) | 
 | 		return pp; | 
 | 	diff = (char *)npp - (char *)pp; | 
 | 	freep = (Reinst *)((char *)freep + diff); | 
 | 	for(inst=npp->firstinst; inst<freep; inst++){ | 
 | 		switch(inst->type){ | 
 | 		case OR: | 
 | 		case STAR: | 
 | 		case PLUS: | 
 | 		case QUEST: | 
 | 			*(char **)&inst->right += diff; | 
 | 			break; | 
 | 		case CCLASS: | 
 | 		case NCCLASS: | 
 | 			*(char **)&inst->right += diff; | 
 | 			cl = inst->cp; | 
 | 			*(char **)&cl->end += diff; | 
 | 			break; | 
 | 		} | 
 | 		*(char **)&inst->left += diff; | 
 | 	} | 
 | 	*(char **)&npp->startinst += diff; | 
 | 	return npp; | 
 | } | 
 |  | 
 | #ifdef	DEBUG | 
 | static	void | 
 | dumpstack(void){ | 
 | 	Node *stk; | 
 | 	int *ip; | 
 |  | 
 | 	print("operators\n"); | 
 | 	for(ip=atorstack; ip<atorp; ip++) | 
 | 		print("0%o\n", *ip); | 
 | 	print("operands\n"); | 
 | 	for(stk=andstack; stk<andp; stk++) | 
 | 		print("0%o\t0%o\n", stk->first->type, stk->last->type); | 
 | } | 
 |  | 
 | static	void | 
 | dump(Reprog *pp) | 
 | { | 
 | 	Reinst *l; | 
 | 	Rune *p; | 
 |  | 
 | 	l = pp->firstinst; | 
 | 	do{ | 
 | 		print("%d:\t0%o\t%d\t%d", l-pp->firstinst, l->type, | 
 | 			l->left-pp->firstinst, l->right-pp->firstinst); | 
 | 		if(l->type == RUNE) | 
 | 			print("\t%C\n", l->r); | 
 | 		else if(l->type == CCLASS || l->type == NCCLASS){ | 
 | 			print("\t["); | 
 | 			if(l->type == NCCLASS) | 
 | 				print("^"); | 
 | 			for(p = l->cp->spans; p < l->cp->end; p += 2) | 
 | 				if(p[0] == p[1]) | 
 | 					print("%C", p[0]); | 
 | 				else | 
 | 					print("%C-%C", p[0], p[1]); | 
 | 			print("]\n"); | 
 | 		} else | 
 | 			print("\n"); | 
 | 	}while(l++->type); | 
 | } | 
 | #endif | 
 |  | 
 | static	Reclass* | 
 | newclass(void) | 
 | { | 
 | 	if(nclass <= 0){ | 
 | 		classp = mallocz(128*sizeof(Reclass), 1); | 
 | 		if(classp == nil) | 
 | 			regerror("out of memory"); | 
 | 		nclass = 128; | 
 | 	} | 
 | 	return &classp[--nclass]; | 
 | } | 
 |  | 
 | static	int | 
 | nextc(Rune *rp) | 
 | { | 
 | 	if(lexdone){ | 
 | 		*rp = 0; | 
 | 		return 1; | 
 | 	} | 
 | 	exprp += chartorune(rp, exprp); | 
 | 	if(*rp == L'\\'){ | 
 | 		exprp += chartorune(rp, exprp); | 
 | 		return 1; | 
 | 	} | 
 | 	if(*rp == 0) | 
 | 		lexdone = 1; | 
 | 	return 0; | 
 | } | 
 |  | 
 | static	int | 
 | lex(int literal, int dot_type) | 
 | { | 
 | 	int quoted; | 
 |  | 
 | 	quoted = nextc(&yyrune); | 
 | 	if(literal || quoted){ | 
 | 		if(yyrune == 0) | 
 | 			return END; | 
 | 		return RUNE; | 
 | 	} | 
 |  | 
 | 	switch(yyrune){ | 
 | 	case 0: | 
 | 		return END; | 
 | 	case L'*': | 
 | 		return STAR; | 
 | 	case L'?': | 
 | 		return QUEST; | 
 | 	case L'+': | 
 | 		return PLUS; | 
 | 	case L'|': | 
 | 		return OR; | 
 | 	case L'.': | 
 | 		return dot_type; | 
 | 	case L'(': | 
 | 		return LBRA; | 
 | 	case L')': | 
 | 		return RBRA; | 
 | 	case L'^': | 
 | 		return BOL; | 
 | 	case L'$': | 
 | 		return EOL; | 
 | 	case L'[': | 
 | 		return bldcclass(); | 
 | 	} | 
 | 	return RUNE; | 
 | } | 
 |  | 
 | static int | 
 | bldcclass(void) | 
 | { | 
 | 	int type; | 
 | 	Rune r[NCCRUNE]; | 
 | 	Rune *p, *ep, *np; | 
 | 	Rune rune; | 
 | 	int quoted; | 
 |  | 
 | 	/* we have already seen the '[' */ | 
 | 	type = CCLASS; | 
 | 	yyclassp = newclass(); | 
 |  | 
 | 	/* look ahead for negation */ | 
 | 	/* SPECIAL CASE!!! negated classes don't match \n */ | 
 | 	ep = r; | 
 | 	quoted = nextc(&rune); | 
 | 	if(!quoted && rune == L'^'){ | 
 | 		type = NCCLASS; | 
 | 		quoted = nextc(&rune); | 
 | 		*ep++ = L'\n'; | 
 | 		*ep++ = L'\n'; | 
 | 	} | 
 |  | 
 | 	/* parse class into a set of spans */ | 
 | 	for(; ep<&r[NCCRUNE];){ | 
 | 		if(rune == 0){ | 
 | 			rcerror("malformed '[]'"); | 
 | 			return 0; | 
 | 		} | 
 | 		if(!quoted && rune == L']') | 
 | 			break; | 
 | 		if(!quoted && rune == L'-'){ | 
 | 			if(ep == r){ | 
 | 				rcerror("malformed '[]'"); | 
 | 				return 0; | 
 | 			} | 
 | 			quoted = nextc(&rune); | 
 | 			if((!quoted && rune == L']') || rune == 0){ | 
 | 				rcerror("malformed '[]'"); | 
 | 				return 0; | 
 | 			} | 
 | 			*(ep-1) = rune; | 
 | 		} else { | 
 | 			*ep++ = rune; | 
 | 			*ep++ = rune; | 
 | 		} | 
 | 		quoted = nextc(&rune); | 
 | 	} | 
 |  | 
 | 	/* sort on span start */ | 
 | 	for(p = r; p < ep; p += 2){ | 
 | 		for(np = p; np < ep; np += 2) | 
 | 			if(*np < *p){ | 
 | 				rune = np[0]; | 
 | 				np[0] = p[0]; | 
 | 				p[0] = rune; | 
 | 				rune = np[1]; | 
 | 				np[1] = p[1]; | 
 | 				p[1] = rune; | 
 | 			} | 
 | 	} | 
 |  | 
 | 	/* merge spans */ | 
 | 	np = yyclassp->spans; | 
 | 	p = r; | 
 | 	if(r == ep) | 
 | 		yyclassp->end = np; | 
 | 	else { | 
 | 		np[0] = *p++; | 
 | 		np[1] = *p++; | 
 | 		for(; p < ep; p += 2) | 
 | 			if(p[0] <= np[1]){ | 
 | 				if(p[1] > np[1]) | 
 | 					np[1] = p[1]; | 
 | 			} else { | 
 | 				np += 2; | 
 | 				np[0] = p[0]; | 
 | 				np[1] = p[1]; | 
 | 			} | 
 | 		yyclassp->end = np+2; | 
 | 	} | 
 |  | 
 | 	return type; | 
 | } | 
 |  | 
 | static	Reprog* | 
 | regcomp1(char *s, int literal, int dot_type) | 
 | { | 
 | 	int token; | 
 | 	Reprog *pp; | 
 |  | 
 | 	/* get memory for the program */ | 
 | 	pp = malloc(sizeof(Reprog) + 6*sizeof(Reinst)*strlen(s)); | 
 | 	if(pp == 0){ | 
 | 		regerror("out of memory"); | 
 | 		return 0; | 
 | 	} | 
 | 	freep = pp->firstinst; | 
 | 	classp = pp->class; | 
 | 	errors = 0; | 
 |  | 
 | 	if(setjmp(regkaboom)) | 
 | 		goto out; | 
 |  | 
 | 	/* go compile the sucker */ | 
 | 	lexdone = 0; | 
 | 	exprp = s; | 
 | 	nclass = NCLASS; | 
 | 	nbra = 0; | 
 | 	atorp = atorstack; | 
 | 	andp = andstack; | 
 | 	subidp = subidstack; | 
 | 	lastwasand = FALSE; | 
 | 	cursubid = 0; | 
 |  | 
 | 	/* Start with a low priority operator to prime parser */ | 
 | 	pushator(START-1); | 
 | 	while((token = lex(literal, dot_type)) != END){ | 
 | 		if((token&0300) == OPERATOR) | 
 | 			operator(token); | 
 | 		else | 
 | 			operand(token); | 
 | 	} | 
 |  | 
 | 	/* Close with a low priority operator */ | 
 | 	evaluntil(START); | 
 |  | 
 | 	/* Force END */ | 
 | 	operand(END); | 
 | 	evaluntil(START); | 
 | #ifdef DEBUG | 
 | 	dumpstack(); | 
 | #endif | 
 | 	if(nbra) | 
 | 		rcerror("unmatched left paren"); | 
 | 	--andp;	/* points to first and only operand */ | 
 | 	pp->startinst = andp->first; | 
 | #ifdef DEBUG | 
 | 	dump(pp); | 
 | #endif | 
 | 	pp = optimize(pp); | 
 | #ifdef DEBUG | 
 | 	print("start: %d\n", andp->first-pp->firstinst); | 
 | 	dump(pp); | 
 | #endif | 
 | out: | 
 | 	if(errors){ | 
 | 		free(pp); | 
 | 		pp = 0; | 
 | 	} | 
 | 	return pp; | 
 | } | 
 |  | 
 | extern	Reprog* | 
 | regcomp(char *s) | 
 | { | 
 | 	return regcomp1(s, 0, ANY); | 
 | } | 
 |  | 
 | extern	Reprog* | 
 | regcomplit(char *s) | 
 | { | 
 | 	return regcomp1(s, 1, ANY); | 
 | } | 
 |  | 
 | extern	Reprog* | 
 | regcompnl(char *s) | 
 | { | 
 | 	return regcomp1(s, 0, ANYNL); | 
 | } |