Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log

regcomp.c @ 567

Last change on this file since 567 was 444, checked in by satin@…, 6 years ago
add newlib,libalmos-mkh, restructure shared_syscalls.h and mini-libc
File size: 47.8 KB

Line
1	/*-
2	* Copyright (c) 1992, 1993, 1994 Henry Spencer.
3	* Copyright (c) 1992, 1993, 1994
4	* The Regents of the University of California. All rights reserved.
5	*
6	* This code is derived from software contributed to Berkeley by
7	* Henry Spencer.
8	*
9	* Redistribution and use in source and binary forms, with or without
10	* modification, are permitted provided that the following conditions
11	* are met:
12	* 1. Redistributions of source code must retain the above copyright
13	* notice, this list of conditions and the following disclaimer.
14	* 2. Redistributions in binary form must reproduce the above copyright
15	* notice, this list of conditions and the following disclaimer in the
16	* documentation and/or other materials provided with the distribution.
17	* 4. Neither the name of the University nor the names of its contributors
18	* may be used to endorse or promote products derived from this software
19	* without specific prior written permission.
20	*
21	* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
22	* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24	* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
25	* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26	* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
27	* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28	* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29	* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30	* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31	* SUCH DAMAGE.
32	*
33	* @(#)regcomp.c 8.5 (Berkeley) 3/20/94
34	*/
35
36	#ifndef _NO_REGEX
37
38	#if defined(LIBC_SCCS) && !defined(lint)
39	static char sccsid[] = "@(#)regcomp.c 8.5 (Berkeley) 3/20/94";
40	#endif /* LIBC_SCCS and not lint */
41	#include <sys/cdefs.h>
42
43	#include <sys/types.h>
44	#include <stdio.h>
45	#include <string.h>
46	#include <ctype.h>
47	#include <limits.h>
48	#include <stdlib.h>
49	#include <regex.h>
50
51	#include "collate.h"
52
53	#include "utils.h"
54	#include "regex2.h"
55
56	#include "cclass.h"
57	#include "cname.h"
58
59	/*
60	* parse structure, passed up and down to avoid global variables and
61	* other clumsinesses
62	*/
63	struct parse {
64	char next; / next character in RE */
65	char end; / end of string (-> NUL normally) */
66	int error; /* has an error been seen? */
67	sop strip; / malloced strip */
68	sopno ssize; /* malloced strip size (allocated) */
69	sopno slen; /* malloced strip length (used) */
70	int ncsalloc; /* number of csets allocated */
71	struct re_guts *g;
72	# define NPAREN 10 /* we need to remember () 1-9 for back refs */
73	sopno pbegin[NPAREN]; /* -> ( ([0] unused) */
74	sopno pend[NPAREN]; /* -> ) ([0] unused) */
75	};
76
77	/* ========= begin header generated by ./mkh ========= */
78	#ifdef __cplusplus
79	extern "C" {
80	#endif
81
82	/* === regcomp.c === */
83	static void p_ere(struct parse *p, int stop);
84	static void p_ere_exp(struct parse *p);
85	static void p_str(struct parse *p);
86	static void p_bre(struct parse *p, int end1, int end2);
87	static int p_simp_re(struct parse *p, int starordinary);
88	static int p_count(struct parse *p);
89	static void p_bracket(struct parse *p);
90	static void p_b_term(struct parse p, cset cs);
91	static void p_b_cclass(struct parse p, cset cs);
92	static void p_b_eclass(struct parse p, cset cs);
93	static char p_b_symbol(struct parse *p);
94	static char p_b_coll_elem(struct parse *p, int endc);
95	static char othercase(int ch);
96	static void bothcases(struct parse *p, int ch);
97	static void ordinary(struct parse *p, int ch);
98	static void nonnewline(struct parse *p);
99	static void repeat(struct parse *p, sopno start, int from, int to);
100	static int seterr(struct parse *p, int e);
101	static cset allocset(struct parse p);
102	static void freeset(struct parse p, cset cs);
103	static int freezeset(struct parse p, cset cs);
104	static int firstch(struct parse p, cset cs);
105	static int nch(struct parse p, cset cs);
106	#if used
107	static void mcadd(struct parse p, cset cs, char *cp);
108	static void mcsub(cset cs, char cp);
109	static int mcin(cset cs, char cp);
110	static char mcfind(cset cs, char *cp);
111	#endif
112	static void mcinvert(struct parse p, cset cs);
113	static void mccase(struct parse p, cset cs);
114	static int isinsets(struct re_guts *g, int c);
115	static int samesets(struct re_guts *g, int c1, int c2);
116	static void categorize(struct parse p, struct re_guts g);
117	static sopno dupl(struct parse *p, sopno start, sopno finish);
118	static void doemit(struct parse *p, sop op, size_t opnd);
119	static void doinsert(struct parse *p, sop op, size_t opnd, sopno pos);
120	static void dofwd(struct parse *p, sopno pos, sop value);
121	static void enlarge(struct parse *p, sopno size);
122	static void stripsnug(struct parse p, struct re_guts g);
123	static void findmust(struct parse p, struct re_guts g);
124	static int altoffset(sop *scan, int offset, int mccs);
125	static void computejumps(struct parse p, struct re_guts g);
126	static void computematchjumps(struct parse p, struct re_guts g);
127	static sopno pluscount(struct parse p, struct re_guts g);
128
129	#ifdef __cplusplus
130	}
131	#endif
132	/* ========= end header generated by ./mkh ========= */
133
134	static char nuls[10]; /* place to point scanner in event of error */
135
136	/*
137	* macros for use with parse structure
138	* BEWARE: these know that the parse structure is named `p' !!!
139	*/
140	#define PEEK() (*p->next)
141	#define PEEK2() (*(p->next+1))
142	#define MORE() (p->next < p->end)
143	#define MORE2() (p->next+1 < p->end)
144	#define SEE(c) (MORE() && PEEK() == (c))
145	#define SEETWO(a, b) (MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b))
146	#define EAT(c) ((SEE(c)) ? (NEXT(), 1) : 0)
147	#define EATTWO(a, b) ((SEETWO(a, b)) ? (NEXT2(), 1) : 0)
148	#define NEXT() (p->next++)
149	#define NEXT2() (p->next += 2)
150	#define NEXTn(n) (p->next += (n))
151	#define GETNEXT() (*p->next++)
152	#define SETERROR(e) seterr(p, (e))
153	#define REQUIRE(co, e) ((co) \|\| SETERROR(e))
154	#define MUSTSEE(c, e) (REQUIRE(MORE() && PEEK() == (c), e))
155	#define MUSTEAT(c, e) (REQUIRE(MORE() && GETNEXT() == (c), e))
156	#define MUSTNOTSEE(c, e) (REQUIRE(!MORE() \|\| PEEK() != (c), e))
157	#define EMIT(op, sopnd) doemit(p, (sop)(op), (size_t)(sopnd))
158	#define INSERT(op, pos) doinsert(p, (sop)(op), HERE()-(pos)+1, pos)
159	#define AHEAD(pos) dofwd(p, pos, HERE()-(pos))
160	#define ASTERN(sop, pos) EMIT(sop, HERE()-pos)
161	#define HERE() (p->slen)
162	#define THERE() (p->slen - 1)
163	#define THERETHERE() (p->slen - 2)
164	#define DROP(n) (p->slen -= (n))
165
166	#ifndef NDEBUG
167	static int never = 0; /* for use in asserts; shuts lint up */
168	#else
169	#define never 0 /* some <assert.h>s have bugs too */
170	#endif
171
172	/* Macro used by computejump()/computematchjump() */
173	#define MIN(a,b) ((a)<(b)?(a):(b))
174
175	/*
176	- regcomp - interface for parser and compilation
177	= extern int regcomp(regex_t __restrict, const char __restrict, int);
178	= #define REG_BASIC 0000
179	= #define REG_EXTENDED 0001
180	= #define REG_ICASE 0002
181	= #define REG_NOSUB 0004
182	= #define REG_NEWLINE 0010
183	= #define REG_NOSPEC 0020
184	= #define REG_PEND 0040
185	= #define REG_DUMP 0200
186	*/
187	int /* 0 success, otherwise REG_something */
188	regcomp(preg, pattern, cflags)
189	regex_t *__restrict preg;
190	const char *__restrict pattern;
191	int cflags;
192	{
193	struct parse pa;
194	struct re_guts *g;
195	struct parse *p = &pa;
196	int i;
197	size_t len;
198	#ifdef REDEBUG
199	# define GOODFLAGS(f) (f)
200	#else
201	# define GOODFLAGS(f) ((f)&~REG_DUMP)
202	#endif
203
204	cflags = GOODFLAGS(cflags);
205	if ((cflags&REG_EXTENDED) && (cflags&REG_NOSPEC))
206	return(REG_INVARG);
207
208	if (cflags&REG_PEND) {
209	if (preg->re_endp < pattern)
210	return(REG_INVARG);
211	len = preg->re_endp - pattern;
212	} else
213	len = strlen((char *)pattern);
214
215	/* do the mallocs early so failure handling is easy */
216	g = (struct re_guts *)malloc(sizeof(struct re_guts) +
217	(NC-1)*sizeof(cat_t));
218	if (g == NULL)
219	return(REG_ESPACE);
220	p->ssize = len/(size_t)2(size_t)3 + (size_t)1; / ugh */
221	p->strip = (sop )malloc(p->ssize sizeof(sop));
222	p->slen = 0;
223	if (p->strip == NULL) {
224	free((char *)g);
225	return(REG_ESPACE);
226	}
227
228	/* set things up */
229	p->g = g;
230	p->next = (char )pattern; / convenience; we do not modify it */
231	p->end = p->next + len;
232	p->error = 0;
233	p->ncsalloc = 0;
234	for (i = 0; i < NPAREN; i++) {
235	p->pbegin[i] = 0;
236	p->pend[i] = 0;
237	}
238	g->csetsize = NC;
239	g->sets = NULL;
240	g->setbits = NULL;
241	g->ncsets = 0;
242	g->cflags = cflags;
243	g->iflags = 0;
244	g->nbol = 0;
245	g->neol = 0;
246	g->must = NULL;
247	g->moffset = -1;
248	g->charjump = NULL;
249	g->matchjump = NULL;
250	g->mlen = 0;
251	g->nsub = 0;
252	g->ncategories = 1; /* category 0 is "everything else" */
253	g->categories = &g->catspace[-(CHAR_MIN)];
254	(void) memset((char )g->catspace, 0, NCsizeof(cat_t));
255	g->backrefs = 0;
256
257	/* do it */
258	EMIT(OEND, 0);
259	g->firststate = THERE();
260	if (cflags&REG_EXTENDED)
261	p_ere(p, OUT);
262	else if (cflags&REG_NOSPEC)
263	p_str(p);
264	else
265	p_bre(p, OUT, OUT);
266	EMIT(OEND, 0);
267	g->laststate = THERE();
268
269	/* tidy up loose ends and fill things in */
270	categorize(p, g);
271	stripsnug(p, g);
272	findmust(p, g);
273	/* only use Boyer-Moore algorithm if the pattern is bigger
274	* than three characters
275	*/
276	if(g->mlen > 3) {
277	computejumps(p, g);
278	computematchjumps(p, g);
279	if(g->matchjump == NULL && g->charjump != NULL) {
280	free(g->charjump);
281	g->charjump = NULL;
282	}
283	}
284	g->nplus = pluscount(p, g);
285	g->magic = MAGIC2;
286	preg->re_nsub = g->nsub;
287	preg->re_g = g;
288	preg->re_magic = MAGIC1;
289	#ifndef REDEBUG
290	/* not debugging, so can't rely on the assert() in regexec() */
291	if (g->iflags&BAD)
292	SETERROR(REG_ASSERT);
293	#endif
294
295	/* win or lose, we're done */
296	if (p->error != 0) /* lose */
297	regfree(preg);
298	return(p->error);
299	}
300
301	/*
302	- p_ere - ERE parser top level, concatenation and alternation
303	== static void p_ere(struct parse *p, int stop);
304	*/
305	static void
306	p_ere(p, stop)
307	struct parse *p;
308	int stop; /* character this ERE should end at */
309	{
310	char c;
311	sopno prevback = 0;
312	sopno prevfwd = 0;
313	sopno conc;
314	int first = 1; /* is this the first alternative? */
315
316	for (;;) {
317	/* do a bunch of concatenated expressions */
318	conc = HERE();
319	while (MORE() && (c = PEEK()) != '\|' && c != stop)
320	p_ere_exp(p);
321	(void)REQUIRE(HERE() != conc, REG_EMPTY); /* require nonempty */
322
323	if (!EAT('\|'))
324	break; /* NOTE BREAK OUT */
325
326	if (first) {
327	INSERT(OCH_, conc); /* offset is wrong */
328	prevfwd = conc;
329	prevback = conc;
330	first = 0;
331	}
332	ASTERN(OOR1, prevback);
333	prevback = THERE();
334	AHEAD(prevfwd); /* fix previous offset */
335	prevfwd = HERE();
336	EMIT(OOR2, 0); /* offset is very wrong */
337	}
338
339	if (!first) { /* tail-end fixups */
340	AHEAD(prevfwd);
341	ASTERN(O_CH, prevback);
342	}
343
344	assert(!MORE() \|\| SEE(stop));
345	}
346
347	/*
348	- p_ere_exp - parse one subERE, an atom possibly followed by a repetition op
349	== static void p_ere_exp(struct parse *p);
350	*/
351	static void
352	p_ere_exp(p)
353	struct parse *p;
354	{
355	char c;
356	sopno pos;
357	int count;
358	int count2;
359	sopno subno;
360	int wascaret = 0;
361
362	assert(MORE()); /* caller should have ensured this */
363	c = GETNEXT();
364
365	pos = HERE();
366	switch (c) {
367	case '(':
368	(void)REQUIRE(MORE(), REG_EPAREN);
369	p->g->nsub++;
370	subno = p->g->nsub;
371	if (subno < NPAREN)
372	p->pbegin[subno] = HERE();
373	EMIT(OLPAREN, subno);
374	if (!SEE(')'))
375	p_ere(p, ')');
376	if (subno < NPAREN) {
377	p->pend[subno] = HERE();
378	assert(p->pend[subno] != 0);
379	}
380	EMIT(ORPAREN, subno);
381	(void)MUSTEAT(')', REG_EPAREN);
382	break;
383	#ifndef POSIX_MISTAKE
384	case ')': /* happens only if no current unmatched ( */
385	/*
386	* You may ask, why the ifndef? Because I didn't notice
387	* this until slightly too late for 1003.2, and none of the
388	* other 1003.2 regular-expression reviewers noticed it at
389	* all. So an unmatched ) is legal POSIX, at least until
390	* we can get it fixed.
391	*/
392	SETERROR(REG_EPAREN);
393	break;
394	#endif
395	case '^':
396	EMIT(OBOL, 0);
397	p->g->iflags \|= USEBOL;
398	p->g->nbol++;
399	wascaret = 1;
400	break;
401	case '$':
402	EMIT(OEOL, 0);
403	p->g->iflags \|= USEEOL;
404	p->g->neol++;
405	break;
406	case '\|':
407	SETERROR(REG_EMPTY);
408	break;
409	case '*':
410	case '+':
411	case '?':
412	SETERROR(REG_BADRPT);
413	break;
414	case '.':
415	if (p->g->cflags&REG_NEWLINE)
416	nonnewline(p);
417	else
418	EMIT(OANY, 0);
419	break;
420	case '[':
421	p_bracket(p);
422	break;
423	case '\\':
424	(void)REQUIRE(MORE(), REG_EESCAPE);
425	c = GETNEXT();
426	ordinary(p, c);
427	break;
428	case '{': /* okay as ordinary except if digit follows */
429	(void)REQUIRE(!MORE() \|\| !isdigit((uch)PEEK()), REG_BADRPT);
430	/* FALLTHROUGH */
431	default:
432	ordinary(p, c);
433	break;
434	}
435
436	if (!MORE())
437	return;
438	c = PEEK();
439	/* we call { a repetition if followed by a digit */
440	if (!( c == '*' \|\| c == '+' \|\| c == '?' \|\|
441	(c == '{' && MORE2() && isdigit((uch)PEEK2())) ))
442	return; /* no repetition, we're done */
443	NEXT();
444
445	(void)REQUIRE(!wascaret, REG_BADRPT);
446	switch (c) {
447	case '': / implemented as +? */
448	/* this case does not require the (y\|) trick, noKLUDGE */
449	INSERT(OPLUS_, pos);
450	ASTERN(O_PLUS, pos);
451	INSERT(OQUEST_, pos);
452	ASTERN(O_QUEST, pos);
453	break;
454	case '+':
455	INSERT(OPLUS_, pos);
456	ASTERN(O_PLUS, pos);
457	break;
458	case '?':
459	/* KLUDGE: emit y? as (y\|) until subtle bug gets fixed */
460	INSERT(OCH_, pos); /* offset slightly wrong */
461	ASTERN(OOR1, pos); /* this one's right */
462	AHEAD(pos); /* fix the OCH_ */
463	EMIT(OOR2, 0); /* offset very wrong... */
464	AHEAD(THERE()); /* ...so fix it */
465	ASTERN(O_CH, THERETHERE());
466	break;
467	case '{':
468	count = p_count(p);
469	if (EAT(',')) {
470	if (isdigit((uch)PEEK())) {
471	count2 = p_count(p);
472	(void)REQUIRE(count <= count2, REG_BADBR);
473	} else /* single number with comma */
474	count2 = INFINITY;
475	} else /* just a single number */
476	count2 = count;
477	repeat(p, pos, count, count2);
478	if (!EAT('}')) { /* error heuristics */
479	while (MORE() && PEEK() != '}')
480	NEXT();
481	(void)REQUIRE(MORE(), REG_EBRACE);
482	SETERROR(REG_BADBR);
483	}
484	break;
485	}
486
487	if (!MORE())
488	return;
489	c = PEEK();
490	if (!( c == '*' \|\| c == '+' \|\| c == '?' \|\|
491	(c == '{' && MORE2() && isdigit((uch)PEEK2())) ) )
492	return;
493	SETERROR(REG_BADRPT);
494	}
495
496	/*
497	- p_str - string (no metacharacters) "parser"
498	== static void p_str(struct parse *p);
499	*/
500	static void
501	p_str(p)
502	struct parse *p;
503	{
504	(void)REQUIRE(MORE(), REG_EMPTY);
505	while (MORE())
506	ordinary(p, GETNEXT());
507	}
508
509	/*
510	- p_bre - BRE parser top level, anchoring and concatenation
511	== static void p_bre(struct parse *p, int end1, \
512	== int end2);
513	* Giving end1 as OUT essentially eliminates the end1/end2 check.
514	*
515	* This implementation is a bit of a kludge, in that a trailing $ is first
516	* taken as an ordinary character and then revised to be an anchor. The
517	* only undesirable side effect is that '$' gets included as a character
518	* category in such cases. This is fairly harmless; not worth fixing.
519	* The amount of lookahead needed to avoid this kludge is excessive.
520	*/
521	static void
522	p_bre(p, end1, end2)
523	struct parse *p;
524	int end1; /* first terminating character */
525	int end2; /* second terminating character */
526	{
527	sopno start = HERE();
528	int first = 1; /* first subexpression? */
529	int wasdollar = 0;
530
531	if (EAT('^')) {
532	EMIT(OBOL, 0);
533	p->g->iflags \|= USEBOL;
534	p->g->nbol++;
535	}
536	while (MORE() && !SEETWO(end1, end2)) {
537	wasdollar = p_simp_re(p, first);
538	first = 0;
539	}
540	if (wasdollar) { /* oops, that was a trailing anchor */
541	DROP(1);
542	EMIT(OEOL, 0);
543	p->g->iflags \|= USEEOL;
544	p->g->neol++;
545	}
546
547	(void)REQUIRE(HERE() != start, REG_EMPTY); /* require nonempty */
548	}
549
550	/*
551	- p_simp_re - parse a simple RE, an atom possibly followed by a repetition
552	== static int p_simp_re(struct parse *p, int starordinary);
553	*/
554	static int /* was the simple RE an unbackslashed $? */
555	p_simp_re(p, starordinary)
556	struct parse *p;
557	int starordinary; /* is a leading * an ordinary character? */
558	{
559	int c;
560	int count;
561	int count2;
562	sopno pos;
563	int i;
564	sopno subno;
565	# define BACKSL (1<<CHAR_BIT)
566
567	pos = HERE(); /* repetion op, if any, covers from here */
568
569	assert(MORE()); /* caller should have ensured this */
570	c = GETNEXT();
571	if (c == '\\') {
572	(void)REQUIRE(MORE(), REG_EESCAPE);
573	c = BACKSL \| GETNEXT();
574	}
575	switch (c) {
576	case '.':
577	if (p->g->cflags&REG_NEWLINE)
578	nonnewline(p);
579	else
580	EMIT(OANY, 0);
581	break;
582	case '[':
583	p_bracket(p);
584	break;
585	case BACKSL\|'{':
586	SETERROR(REG_BADRPT);
587	break;
588	case BACKSL\|'(':
589	p->g->nsub++;
590	subno = p->g->nsub;
591	if (subno < NPAREN)
592	p->pbegin[subno] = HERE();
593	EMIT(OLPAREN, subno);
594	/* the MORE here is an error heuristic */
595	if (MORE() && !SEETWO('\\', ')'))
596	p_bre(p, '\\', ')');
597	if (subno < NPAREN) {
598	p->pend[subno] = HERE();
599	assert(p->pend[subno] != 0);
600	}
601	EMIT(ORPAREN, subno);
602	(void)REQUIRE(EATTWO('\\', ')'), REG_EPAREN);
603	break;
604	case BACKSL\|')': /* should not get here -- must be user */
605	case BACKSL\|'}':
606	SETERROR(REG_EPAREN);
607	break;
608	case BACKSL\|'1':
609	case BACKSL\|'2':
610	case BACKSL\|'3':
611	case BACKSL\|'4':
612	case BACKSL\|'5':
613	case BACKSL\|'6':
614	case BACKSL\|'7':
615	case BACKSL\|'8':
616	case BACKSL\|'9':
617	i = (c&~BACKSL) - '0';
618	assert(i < NPAREN);
619	if (p->pend[i] != 0) {
620	assert(i <= p->g->nsub);
621	EMIT(OBACK_, i);
622	assert(p->pbegin[i] != 0);
623	assert(OP(p->strip[p->pbegin[i]]) == OLPAREN);
624	assert(OP(p->strip[p->pend[i]]) == ORPAREN);
625	(void) dupl(p, p->pbegin[i]+1, p->pend[i]);
626	EMIT(O_BACK, i);
627	} else
628	SETERROR(REG_ESUBREG);
629	p->g->backrefs = 1;
630	break;
631	case '*':
632	(void)REQUIRE(starordinary, REG_BADRPT);
633	/* FALLTHROUGH */
634	default:
635	ordinary(p, (char)c);
636	break;
637	}
638
639	if (EAT('')) { / implemented as +? */
640	/* this case does not require the (y\|) trick, noKLUDGE */
641	INSERT(OPLUS_, pos);
642	ASTERN(O_PLUS, pos);
643	INSERT(OQUEST_, pos);
644	ASTERN(O_QUEST, pos);
645	} else if (EATTWO('\\', '{')) {
646	count = p_count(p);
647	if (EAT(',')) {
648	if (MORE() && isdigit((uch)PEEK())) {
649	count2 = p_count(p);
650	(void)REQUIRE(count <= count2, REG_BADBR);
651	} else /* single number with comma */
652	count2 = INFINITY;
653	} else /* just a single number */
654	count2 = count;
655	repeat(p, pos, count, count2);
656	if (!EATTWO('\\', '}')) { /* error heuristics */
657	while (MORE() && !SEETWO('\\', '}'))
658	NEXT();
659	(void)REQUIRE(MORE(), REG_EBRACE);
660	SETERROR(REG_BADBR);
661	}
662	} else if (c == '$') /* $ (but not \$) ends it */
663	return(1);
664
665	return(0);
666	}
667
668	/*
669	- p_count - parse a repetition count
670	== static int p_count(struct parse *p);
671	*/
672	static int /* the value */
673	p_count(p)
674	struct parse *p;
675	{
676	int count = 0;
677	int ndigits = 0;
678
679	while (MORE() && isdigit((uch)PEEK()) && count <= DUPMAX) {
680	count = count*10 + (GETNEXT() - '0');
681	ndigits++;
682	}
683
684	(void)REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR);
685	return(count);
686	}
687
688	/*
689	- p_bracket - parse a bracketed character list
690	== static void p_bracket(struct parse *p);
691	*
692	* Note a significant property of this code: if the allocset() did SETERROR,
693	* no set operations are done.
694	*/
695	static void
696	p_bracket(p)
697	struct parse *p;
698	{
699	cset *cs = allocset(p);
700	int invert = 0;
701
702	/* Dept of Truly Sickening Special-Case Kludges */
703	if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]", 6) == 0) {
704	EMIT(OBOW, 0);
705	NEXTn(6);
706	return;
707	}
708	if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]", 6) == 0) {
709	EMIT(OEOW, 0);
710	NEXTn(6);
711	return;
712	}
713
714	if (EAT('^'))
715	invert++; /* make note to invert set at end */
716	if (EAT(']'))
717	CHadd(cs, ']');
718	else if (EAT('-'))
719	CHadd(cs, '-');
720	while (MORE() && PEEK() != ']' && !SEETWO('-', ']'))
721	p_b_term(p, cs);
722	if (EAT('-'))
723	CHadd(cs, '-');
724	(void)MUSTEAT(']', REG_EBRACK);
725
726	if (p->error != 0) /* don't mess things up further */
727	return;
728
729	if (p->g->cflags&REG_ICASE) {
730	int i;
731	int ci;
732
733	for (i = p->g->csetsize - 1; i >= 0; i--)
734	if (CHIN(cs, i) && isalpha(i)) {
735	ci = othercase(i);
736	if (ci != i)
737	CHadd(cs, ci);
738	}
739	if (cs->multis != NULL)
740	mccase(p, cs);
741	}
742	if (invert) {
743	int i;
744
745	for (i = p->g->csetsize - 1; i >= 0; i--)
746	if (CHIN(cs, i))
747	CHsub(cs, i);
748	else
749	CHadd(cs, i);
750	if (p->g->cflags&REG_NEWLINE)
751	CHsub(cs, '\n');
752	if (cs->multis != NULL)
753	mcinvert(p, cs);
754	}
755
756	assert(cs->multis == NULL); /* xxx */
757
758	if (nch(p, cs) == 1) { /* optimize singleton sets */
759	ordinary(p, firstch(p, cs));
760	freeset(p, cs);
761	} else
762	EMIT(OANYOF, freezeset(p, cs));
763	}
764
765	/*
766	- p_b_term - parse one term of a bracketed character list
767	== static void p_b_term(struct parse p, cset cs);
768	*/
769	static void
770	p_b_term(p, cs)
771	struct parse *p;
772	cset *cs;
773	{
774	char c;
775	char start, finish;
776	int i;
777
778	/* classify what we've got */
779	switch ((MORE()) ? PEEK() : '\0') {
780	case '[':
781	c = (MORE2()) ? PEEK2() : '\0';
782	break;
783	case '-':
784	SETERROR(REG_ERANGE);
785	return; /* NOTE RETURN */
786	break;
787	default:
788	c = '\0';
789	break;
790	}
791
792	switch (c) {
793	case ':': /* character class */
794	NEXT2();
795	(void)REQUIRE(MORE(), REG_EBRACK);
796	c = PEEK();
797	(void)REQUIRE(c != '-' && c != ']', REG_ECTYPE);
798	p_b_cclass(p, cs);
799	(void)REQUIRE(MORE(), REG_EBRACK);
800	(void)REQUIRE(EATTWO(':', ']'), REG_ECTYPE);
801	break;
802	case '=': /* equivalence class */
803	NEXT2();
804	(void)REQUIRE(MORE(), REG_EBRACK);
805	c = PEEK();
806	(void)REQUIRE(c != '-' && c != ']', REG_ECOLLATE);
807	p_b_eclass(p, cs);
808	(void)REQUIRE(MORE(), REG_EBRACK);
809	(void)REQUIRE(EATTWO('=', ']'), REG_ECOLLATE);
810	break;
811	default: /* symbol, ordinary character, or range */
812	/* xxx revision needed for multichar stuff */
813	start = p_b_symbol(p);
814	if (SEE('-') && MORE2() && PEEK2() != ']') {
815	/* range */
816	NEXT();
817	if (EAT('-'))
818	finish = '-';
819	else
820	finish = p_b_symbol(p);
821	} else
822	finish = start;
823	if (start == finish)
824	CHadd(cs, start);
825	else {
826	if (__collate_load_error) {
827	(void)REQUIRE((uch)start <= (uch)finish, REG_ERANGE);
828	for (i = (uch)start; i <= (uch)finish; i++)
829	CHadd(cs, i);
830	} else {
831	(void)REQUIRE(__collate_range_cmp(start, finish) <= 0, REG_ERANGE);
832	for (i = CHAR_MIN; i <= CHAR_MAX; i++) {
833	if ( __collate_range_cmp(start, i) <= 0
834	&& __collate_range_cmp(i, finish) <= 0
835	)
836	CHadd(cs, i);
837	}
838	}
839	}
840	break;
841	}
842	}
843
844	/*
845	- p_b_cclass - parse a character-class name and deal with it
846	== static void p_b_cclass(struct parse p, cset cs);
847	*/
848	static void
849	p_b_cclass(p, cs)
850	struct parse *p;
851	cset *cs;
852	{
853	int c;
854	char *sp = p->next;
855	struct cclass *cp;
856	size_t len;
857
858	while (MORE() && isalpha((uch)PEEK()))
859	NEXT();
860	len = p->next - sp;
861	for (cp = cclasses; cp->name != NULL; cp++)
862	if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0')
863	break;
864	if (cp->name == NULL) {
865	/* oops, didn't find it */
866	SETERROR(REG_ECTYPE);
867	return;
868	}
869
870	switch (cp->fidx) {
871	case CALNUM:
872	for (c = CHAR_MIN; c <= CHAR_MAX; c++)
873	if (isalnum((uch)c))
874	CHadd(cs, c);
875	break;
876	case CALPHA:
877	for (c = CHAR_MIN; c <= CHAR_MAX; c++)
878	if (isalpha((uch)c))
879	CHadd(cs, c);
880	break;
881	case CBLANK:
882	for (c = CHAR_MIN; c <= CHAR_MAX; c++)
883	if (isblank((uch)c))
884	CHadd(cs, c);
885	break;
886	case CCNTRL:
887	for (c = CHAR_MIN; c <= CHAR_MAX; c++)
888	if (iscntrl((uch)c))
889	CHadd(cs, c);
890	break;
891	case CDIGIT:
892	for (c = CHAR_MIN; c <= CHAR_MAX; c++)
893	if (isdigit((uch)c))
894	CHadd(cs, c);
895	break;
896	case CGRAPH:
897	for (c = CHAR_MIN; c <= CHAR_MAX; c++)
898	if (isgraph((uch)c))
899	CHadd(cs, c);
900	break;
901	case CLOWER:
902	for (c = CHAR_MIN; c <= CHAR_MAX; c++)
903	if (islower((uch)c))
904	CHadd(cs, c);
905	break;
906	case CPRINT:
907	for (c = CHAR_MIN; c <= CHAR_MAX; c++)
908	if (isprint((uch)c))
909	CHadd(cs, c);
910	break;
911	case CPUNCT:
912	for (c = CHAR_MIN; c <= CHAR_MAX; c++)
913	if (ispunct((uch)c))
914	CHadd(cs, c);
915	break;
916	case CSPACE:
917	for (c = CHAR_MIN; c <= CHAR_MAX; c++)
918	if (isspace((uch)c))
919	CHadd(cs, c);
920	break;
921	case CUPPER:
922	for (c = CHAR_MIN; c <= CHAR_MAX; c++)
923	if (isupper((uch)c))
924	CHadd(cs, c);
925	break;
926	case CXDIGIT:
927	for (c = CHAR_MIN; c <= CHAR_MAX; c++)
928	if (isxdigit((uch)c))
929	CHadd(cs, c);
930	break;
931	}
932	#if 0
933	for (u = cp->multis; *u != '\0'; u += strlen(u) + 1)
934	MCadd(p, cs, u);
935	#endif
936	}
937
938	/*
939	- p_b_eclass - parse an equivalence-class name and deal with it
940	== static void p_b_eclass(struct parse p, cset cs);
941	*
942	* This implementation is incomplete. xxx
943	*/
944	static void
945	p_b_eclass(p, cs)
946	struct parse *p;
947	cset *cs;
948	{
949	char c;
950
951	c = p_b_coll_elem(p, '=');
952	CHadd(cs, c);
953	}
954
955	/*
956	- p_b_symbol - parse a character or [..]ed multicharacter collating symbol
957	== static char p_b_symbol(struct parse *p);
958	*/
959	static char /* value of symbol */
960	p_b_symbol(p)
961	struct parse *p;
962	{
963	char value;
964
965	(void)REQUIRE(MORE(), REG_EBRACK);
966	if (!EATTWO('[', '.'))
967	return(GETNEXT());
968
969	/* collating symbol */
970	value = p_b_coll_elem(p, '.');
971	(void)REQUIRE(EATTWO('.', ']'), REG_ECOLLATE);
972	return(value);
973	}
974
975	/*
976	- p_b_coll_elem - parse a collating-element name and look it up
977	== static char p_b_coll_elem(struct parse *p, int endc);
978	*/
979	static char /* value of collating element */
980	p_b_coll_elem(p, endc)
981	struct parse *p;
982	int endc; /* name ended by endc,']' */
983	{
984	char *sp = p->next;
985	struct cname *cp;
986	int len;
987
988	while (MORE() && !SEETWO(endc, ']'))
989	NEXT();
990	if (!MORE()) {
991	SETERROR(REG_EBRACK);
992	return(0);
993	}
994	len = p->next - sp;
995	for (cp = cnames; cp->name != NULL; cp++)
996	if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0')
997	return(cp->code); /* known name */
998	if (len == 1)
999	return(sp); / single character */
1000	SETERROR(REG_ECOLLATE); /* neither */
1001	return(0);
1002	}
1003
1004	/*
1005	- othercase - return the case counterpart of an alphabetic
1006	== static char othercase(int ch);
1007	*/
1008	static char /* if no counterpart, return ch */
1009	othercase(ch)
1010	int ch;
1011	{
1012	ch = (uch)ch;
1013	assert(isalpha(ch));
1014	if (isupper(ch))
1015	return(tolower(ch));
1016	else if (islower(ch))
1017	return(toupper(ch));
1018	else /* peculiar, but could happen */
1019	return(ch);
1020	}
1021
1022	/*
1023	- bothcases - emit a dualcase version of a two-case character
1024	== static void bothcases(struct parse *p, int ch);
1025	*
1026	* Boy, is this implementation ever a kludge...
1027	*/
1028	static void
1029	bothcases(p, ch)
1030	struct parse *p;
1031	int ch;
1032	{
1033	char *oldnext = p->next;
1034	char *oldend = p->end;
1035	char bracket[3];
1036
1037	ch = (uch)ch;
1038	assert(othercase(ch) != ch); /* p_bracket() would recurse */
1039	p->next = bracket;
1040	p->end = bracket+2;
1041	bracket[0] = ch;
1042	bracket[1] = ']';
1043	bracket[2] = '\0';
1044	p_bracket(p);
1045	assert(p->next == bracket+2);
1046	p->next = oldnext;
1047	p->end = oldend;
1048	}
1049
1050	/*
1051	- ordinary - emit an ordinary character
1052	== static void ordinary(struct parse *p, int ch);
1053	*/
1054	static void
1055	ordinary(p, ch)
1056	struct parse *p;
1057	int ch;
1058	{
1059	cat_t *cap = p->g->categories;
1060
1061	if ((p->g->cflags&REG_ICASE) && isalpha((uch)ch) && othercase(ch) != ch)
1062	bothcases(p, ch);
1063	else {
1064	EMIT(OCHAR, (uch)ch);
1065	if (cap[ch] == 0)
1066	cap[ch] = p->g->ncategories++;
1067	}
1068	}
1069
1070	/*
1071	- nonnewline - emit REG_NEWLINE version of OANY
1072	== static void nonnewline(struct parse *p);
1073	*
1074	* Boy, is this implementation ever a kludge...
1075	*/
1076	static void
1077	nonnewline(p)
1078	struct parse *p;
1079	{
1080	char *oldnext = p->next;
1081	char *oldend = p->end;
1082	char bracket[4];
1083
1084	p->next = bracket;
1085	p->end = bracket+3;
1086	bracket[0] = '^';
1087	bracket[1] = '\n';
1088	bracket[2] = ']';
1089	bracket[3] = '\0';
1090	p_bracket(p);
1091	assert(p->next == bracket+3);
1092	p->next = oldnext;
1093	p->end = oldend;
1094	}
1095
1096	/*
1097	- repeat - generate code for a bounded repetition, recursively if needed
1098	== static void repeat(struct parse *p, sopno start, int from, int to);
1099	*/
1100	static void
1101	repeat(p, start, from, to)
1102	struct parse *p;
1103	sopno start; /* operand from here to end of strip */
1104	int from; /* repeated from this number */
1105	int to; /* to this number of times (maybe INFINITY) */
1106	{
1107	sopno finish = HERE();
1108	# define N 2
1109	# define INF 3
1110	# define REP(f, t) ((f)*8 + (t))
1111	# define MAP(n) (((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N)
1112	sopno copy;
1113
1114	if (p->error != 0) /* head off possible runaway recursion */
1115	return;
1116
1117	assert(from <= to);
1118
1119	switch (REP(MAP(from), MAP(to))) {
1120	case REP(0, 0): /* must be user doing this */
1121	DROP(finish-start); /* drop the operand */
1122	break;
1123	case REP(0, 1): /* as x{1,1}? */
1124	case REP(0, N): /* as x{1,n}? */
1125	case REP(0, INF): /* as x{1,}? */
1126	/* KLUDGE: emit y? as (y\|) until subtle bug gets fixed */
1127	INSERT(OCH_, start); /* offset is wrong... */
1128	repeat(p, start+1, 1, to);
1129	ASTERN(OOR1, start);
1130	AHEAD(start); /* ... fix it */
1131	EMIT(OOR2, 0);
1132	AHEAD(THERE());
1133	ASTERN(O_CH, THERETHERE());
1134	break;
1135	case REP(1, 1): /* trivial case */
1136	/* done */
1137	break;
1138	case REP(1, N): /* as x?x{1,n-1} */
1139	/* KLUDGE: emit y? as (y\|) until subtle bug gets fixed */
1140	INSERT(OCH_, start);
1141	ASTERN(OOR1, start);
1142	AHEAD(start);
1143	EMIT(OOR2, 0); /* offset very wrong... */
1144	AHEAD(THERE()); /* ...so fix it */
1145	ASTERN(O_CH, THERETHERE());
1146	copy = dupl(p, start+1, finish+1);
1147	assert(copy == finish+4);
1148	repeat(p, copy, 1, to-1);
1149	break;
1150	case REP(1, INF): /* as x+ */
1151	INSERT(OPLUS_, start);
1152	ASTERN(O_PLUS, start);
1153	break;
1154	case REP(N, N): /* as xx{m-1,n-1} */
1155	copy = dupl(p, start, finish);
1156	repeat(p, copy, from-1, to-1);
1157	break;
1158	case REP(N, INF): /* as xx{n-1,INF} */
1159	copy = dupl(p, start, finish);
1160	repeat(p, copy, from-1, to);
1161	break;
1162	default: /* "can't happen" */
1163	SETERROR(REG_ASSERT); /* just in case */
1164	break;
1165	}
1166	}
1167
1168	/*
1169	- seterr - set an error condition
1170	== static int seterr(struct parse *p, int e);
1171	*/
1172	static int /* useless but makes type checking happy */
1173	seterr(p, e)
1174	struct parse *p;
1175	int e;
1176	{
1177	if (p->error == 0) /* keep earliest error condition */
1178	p->error = e;
1179	p->next = nuls; /* try to bring things to a halt */
1180	p->end = nuls;
1181	return(0); /* make the return value well-defined */
1182	}
1183
1184	/*
1185	- allocset - allocate a set of characters for []
1186	== static cset allocset(struct parse p);
1187	*/
1188	static cset *
1189	allocset(p)
1190	struct parse *p;
1191	{
1192	int no = p->g->ncsets++;
1193	size_t nc;
1194	size_t nbytes;
1195	cset *cs;
1196	size_t css = (size_t)p->g->csetsize;
1197	int i;
1198
1199	if (no >= p->ncsalloc) { /* need another column of space */
1200	p->ncsalloc += CHAR_BIT;
1201	nc = p->ncsalloc;
1202	assert(nc % CHAR_BIT == 0);
1203	nbytes = nc / CHAR_BIT * css;
1204	if (p->g->sets == NULL)
1205	p->g->sets = (cset )malloc(nc sizeof(cset));
1206	else
1207	p->g->sets = (cset )reallocf((char )p->g->sets,
1208	nc * sizeof(cset));
1209	if (p->g->setbits == NULL)
1210	p->g->setbits = (uch *)malloc(nbytes);
1211	else {
1212	p->g->setbits = (uch )reallocf((char )p->g->setbits,
1213	nbytes);
1214	/* xxx this isn't right if setbits is now NULL */
1215	for (i = 0; i < no; i++)
1216	p->g->sets[i].ptr = p->g->setbits + css*(i/CHAR_BIT);
1217	}
1218	if (p->g->sets != NULL && p->g->setbits != NULL)
1219	(void) memset((char *)p->g->setbits + (nbytes - css),
1220	0, css);
1221	else {
1222	no = 0;
1223	SETERROR(REG_ESPACE);
1224	/* caller's responsibility not to do set ops */
1225	}
1226	}
1227
1228	assert(p->g->sets != NULL); /* xxx */
1229	cs = &p->g->sets[no];
1230	cs->ptr = p->g->setbits + css*((no)/CHAR_BIT);
1231	cs->mask = 1 << ((no) % CHAR_BIT);
1232	cs->hash = 0;
1233	cs->smultis = 0;
1234	cs->multis = NULL;
1235
1236	return(cs);
1237	}
1238
1239	/*
1240	- freeset - free a now-unused set
1241	== static void freeset(struct parse p, cset cs);
1242	*/
1243	static void
1244	freeset(p, cs)
1245	struct parse *p;
1246	cset *cs;
1247	{
1248	int i;
1249	cset *top = &p->g->sets[p->g->ncsets];
1250	size_t css = (size_t)p->g->csetsize;
1251
1252	for (i = 0; i < css; i++)
1253	CHsub(cs, i);
1254	if (cs == top-1) /* recover only the easy case */
1255	p->g->ncsets--;
1256	}
1257
1258	/*
1259	- freezeset - final processing on a set of characters
1260	== static int freezeset(struct parse p, cset cs);
1261	*
1262	* The main task here is merging identical sets. This is usually a waste
1263	* of time (although the hash code minimizes the overhead), but can win
1264	* big if REG_ICASE is being used. REG_ICASE, by the way, is why the hash
1265	* is done using addition rather than xor -- all ASCII [aA] sets xor to
1266	* the same value!
1267	*/
1268	static int /* set number */
1269	freezeset(p, cs)
1270	struct parse *p;
1271	cset *cs;
1272	{
1273	short h = cs->hash;
1274	int i;
1275	cset *top = &p->g->sets[p->g->ncsets];
1276	cset *cs2;
1277	size_t css = (size_t)p->g->csetsize;
1278
1279	/* look for an earlier one which is the same */
1280	for (cs2 = &p->g->sets[0]; cs2 < top; cs2++)
1281	if (cs2->hash == h && cs2 != cs) {
1282	/* maybe */
1283	for (i = 0; i < css; i++)
1284	if (!!CHIN(cs2, i) != !!CHIN(cs, i))
1285	break; /* no */
1286	if (i == css)
1287	break; /* yes */
1288	}
1289
1290	if (cs2 < top) { /* found one */
1291	freeset(p, cs);
1292	cs = cs2;
1293	}
1294
1295	return((int)(cs - p->g->sets));
1296	}
1297
1298	/*
1299	- firstch - return first character in a set (which must have at least one)
1300	== static int firstch(struct parse p, cset cs);
1301	*/
1302	static int /* character; there is no "none" value */
1303	firstch(p, cs)
1304	struct parse *p;
1305	cset *cs;
1306	{
1307	int i;
1308	size_t css = (size_t)p->g->csetsize;
1309
1310	for (i = 0; i < css; i++)
1311	if (CHIN(cs, i))
1312	return((char)i);
1313	assert(never);
1314	return(0); /* arbitrary */
1315	}
1316
1317	/*
1318	- nch - number of characters in a set
1319	== static int nch(struct parse p, cset cs);
1320	*/
1321	static int
1322	nch(p, cs)
1323	struct parse *p;
1324	cset *cs;
1325	{
1326	int i;
1327	size_t css = (size_t)p->g->csetsize;
1328	int n = 0;
1329
1330	for (i = 0; i < css; i++)
1331	if (CHIN(cs, i))
1332	n++;
1333	return(n);
1334	}
1335
1336	#if used
1337	/*
1338	- mcadd - add a collating element to a cset
1339	== static void mcadd(struct parse p, cset cs, \
1340	== char *cp);
1341	*/
1342	static void
1343	mcadd(p, cs, cp)
1344	struct parse *p;
1345	cset *cs;
1346	char *cp;
1347	{
1348	size_t oldend = cs->smultis;
1349
1350	cs->smultis += strlen(cp) + 1;
1351	if (cs->multis == NULL)
1352	cs->multis = malloc(cs->smultis);
1353	else
1354	cs->multis = reallocf(cs->multis, cs->smultis);
1355	if (cs->multis == NULL) {
1356	SETERROR(REG_ESPACE);
1357	return;
1358	}
1359
1360	(void) strcpy(cs->multis + oldend - 1, cp);
1361	cs->multis[cs->smultis - 1] = '\0';
1362	}
1363
1364	/*
1365	- mcsub - subtract a collating element from a cset
1366	== static void mcsub(cset cs, char cp);
1367	*/
1368	static void
1369	mcsub(cs, cp)
1370	cset *cs;
1371	char *cp;
1372	{
1373	char *fp = mcfind(cs, cp);
1374	size_t len = strlen(fp);
1375
1376	assert(fp != NULL);
1377	(void) memmove(fp, fp + len + 1,
1378	cs->smultis - (fp + len + 1 - cs->multis));
1379	cs->smultis -= len;
1380
1381	if (cs->smultis == 0) {
1382	free(cs->multis);
1383	cs->multis = NULL;
1384	return;
1385	}
1386
1387	cs->multis = reallocf(cs->multis, cs->smultis);
1388	assert(cs->multis != NULL);
1389	}
1390
1391	/*
1392	- mcin - is a collating element in a cset?
1393	== static int mcin(cset cs, char cp);
1394	*/
1395	static int
1396	mcin(cs, cp)
1397	cset *cs;
1398	char *cp;
1399	{
1400	return(mcfind(cs, cp) != NULL);
1401	}
1402
1403	/*
1404	- mcfind - find a collating element in a cset
1405	== static char mcfind(cset cs, char *cp);
1406	*/
1407	static char *
1408	mcfind(cs, cp)
1409	cset *cs;
1410	char *cp;
1411	{
1412	char *p;
1413
1414	if (cs->multis == NULL)
1415	return(NULL);
1416	for (p = cs->multis; *p != '\0'; p += strlen(p) + 1)
1417	if (strcmp(cp, p) == 0)
1418	return(p);
1419	return(NULL);
1420	}
1421	#endif
1422
1423	/*
1424	- mcinvert - invert the list of collating elements in a cset
1425	== static void mcinvert(struct parse p, cset cs);
1426	*
1427	* This would have to know the set of possibilities. Implementation
1428	* is deferred.
1429	*/
1430	static void
1431	mcinvert(p, cs)
1432	struct parse *p;
1433	cset *cs;
1434	{
1435	assert(cs->multis == NULL); /* xxx */
1436	}
1437
1438	/*
1439	- mccase - add case counterparts of the list of collating elements in a cset
1440	== static void mccase(struct parse p, cset cs);
1441	*
1442	* This would have to know the set of possibilities. Implementation
1443	* is deferred.
1444	*/
1445	static void
1446	mccase(p, cs)
1447	struct parse *p;
1448	cset *cs;
1449	{
1450	assert(cs->multis == NULL); /* xxx */
1451	}
1452
1453	/*
1454	- isinsets - is this character in any sets?
1455	== static int isinsets(struct re_guts *g, int c);
1456	*/
1457	static int /* predicate */
1458	isinsets(g, c)
1459	struct re_guts *g;
1460	int c;
1461	{
1462	uch *col;
1463	int i;
1464	int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT;
1465	unsigned uc = (uch)c;
1466
1467	for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize)
1468	if (col[uc] != 0)
1469	return(1);
1470	return(0);
1471	}
1472
1473	/*
1474	- samesets - are these two characters in exactly the same sets?
1475	== static int samesets(struct re_guts *g, int c1, int c2);
1476	*/
1477	static int /* predicate */
1478	samesets(g, c1, c2)
1479	struct re_guts *g;
1480	int c1;
1481	int c2;
1482	{
1483	uch *col;
1484	int i;
1485	int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT;
1486	unsigned uc1 = (uch)c1;
1487	unsigned uc2 = (uch)c2;
1488
1489	for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize)
1490	if (col[uc1] != col[uc2])
1491	return(0);
1492	return(1);
1493	}
1494
1495	/*
1496	- categorize - sort out character categories
1497	== static void categorize(struct parse p, struct re_guts g);
1498	*/
1499	static void
1500	categorize(p, g)
1501	struct parse *p;
1502	struct re_guts *g;
1503	{
1504	cat_t *cats = g->categories;
1505	int c;
1506	int c2;
1507	cat_t cat;
1508
1509	/* avoid making error situations worse */
1510	if (p->error != 0)
1511	return;
1512
1513	for (c = CHAR_MIN; c <= CHAR_MAX; c++)
1514	if (cats[c] == 0 && isinsets(g, c)) {
1515	cat = g->ncategories++;
1516	cats[c] = cat;
1517	for (c2 = c+1; c2 <= CHAR_MAX; c2++)
1518	if (cats[c2] == 0 && samesets(g, c, c2))
1519	cats[c2] = cat;
1520	}
1521	}
1522
1523	/*
1524	- dupl - emit a duplicate of a bunch of sops
1525	== static sopno dupl(struct parse *p, sopno start, sopno finish);
1526	*/
1527	static sopno /* start of duplicate */
1528	dupl(p, start, finish)
1529	struct parse *p;
1530	sopno start; /* from here */
1531	sopno finish; /* to this less one */
1532	{
1533	sopno ret = HERE();
1534	sopno len = finish - start;
1535
1536	assert(finish >= start);
1537	if (len == 0)
1538	return(ret);
1539	enlarge(p, p->ssize + len); /* this many unexpected additions */
1540	assert(p->ssize >= p->slen + len);
1541	(void) memcpy((char *)(p->strip + p->slen),
1542	(char )(p->strip + start), (size_t)lensizeof(sop));
1543	p->slen += len;
1544	return(ret);
1545	}
1546
1547	/*
1548	- doemit - emit a strip operator
1549	== static void doemit(struct parse *p, sop op, size_t opnd);
1550	*
1551	* It might seem better to implement this as a macro with a function as
1552	* hard-case backup, but it's just too big and messy unless there are
1553	* some changes to the data structures. Maybe later.
1554	*/
1555	static void
1556	doemit(p, op, opnd)
1557	struct parse *p;
1558	sop op;
1559	size_t opnd;
1560	{
1561	/* avoid making error situations worse */
1562	if (p->error != 0)
1563	return;
1564
1565	/* deal with oversize operands ("can't happen", more or less) */
1566	assert(opnd < 1<<OPSHIFT);
1567
1568	/* deal with undersized strip */
1569	if (p->slen >= p->ssize)
1570	enlarge(p, (p->ssize+1) / 2 * 3); /* +50% */
1571	assert(p->slen < p->ssize);
1572
1573	/* finally, it's all reduced to the easy case */
1574	p->strip[p->slen++] = SOP(op, opnd);
1575	}
1576
1577	/*
1578	- doinsert - insert a sop into the strip
1579	== static void doinsert(struct parse *p, sop op, size_t opnd, sopno pos);
1580	*/
1581	static void
1582	doinsert(p, op, opnd, pos)
1583	struct parse *p;
1584	sop op;
1585	size_t opnd;
1586	sopno pos;
1587	{
1588	sopno sn;
1589	sop s;
1590	int i;
1591
1592	/* avoid making error situations worse */
1593	if (p->error != 0)
1594	return;
1595
1596	sn = HERE();
1597	EMIT(op, opnd); /* do checks, ensure space */
1598	assert(HERE() == sn+1);
1599	s = p->strip[sn];
1600
1601	/* adjust paren pointers */
1602	assert(pos > 0);
1603	for (i = 1; i < NPAREN; i++) {
1604	if (p->pbegin[i] >= pos) {
1605	p->pbegin[i]++;
1606	}
1607	if (p->pend[i] >= pos) {
1608	p->pend[i]++;
1609	}
1610	}
1611
1612	memmove((char )&p->strip[pos+1], (char )&p->strip[pos],
1613	(HERE()-pos-1)*sizeof(sop));
1614	p->strip[pos] = s;
1615	}
1616
1617	/*
1618	- dofwd - complete a forward reference
1619	== static void dofwd(struct parse *p, sopno pos, sop value);
1620	*/
1621	static void
1622	dofwd(p, pos, value)
1623	struct parse *p;
1624	sopno pos;
1625	sop value;
1626	{
1627	/* avoid making error situations worse */
1628	if (p->error != 0)
1629	return;
1630
1631	assert(value < 1<<OPSHIFT);
1632	p->strip[pos] = OP(p->strip[pos]) \| value;
1633	}
1634
1635	/*
1636	- enlarge - enlarge the strip
1637	== static void enlarge(struct parse *p, sopno size);
1638	*/
1639	static void
1640	enlarge(p, size)
1641	struct parse *p;
1642	sopno size;
1643	{
1644	sop *sp;
1645
1646	if (p->ssize >= size)
1647	return;
1648
1649	sp = (sop )realloc(p->strip, sizesizeof(sop));
1650	if (sp == NULL) {
1651	SETERROR(REG_ESPACE);
1652	return;
1653	}
1654	p->strip = sp;
1655	p->ssize = size;
1656	}
1657
1658	/*
1659	- stripsnug - compact the strip
1660	== static void stripsnug(struct parse p, struct re_guts g);
1661	*/
1662	static void
1663	stripsnug(p, g)
1664	struct parse *p;
1665	struct re_guts *g;
1666	{
1667	g->nstates = p->slen;
1668	g->strip = (sop )realloc((char )p->strip, p->slen * sizeof(sop));
1669	if (g->strip == NULL) {
1670	SETERROR(REG_ESPACE);
1671	g->strip = p->strip;
1672	}
1673	}
1674
1675	/*
1676	- findmust - fill in must and mlen with longest mandatory literal string
1677	== static void findmust(struct parse p, struct re_guts g);
1678	*
1679	* This algorithm could do fancy things like analyzing the operands of \|
1680	* for common subsequences. Someday. This code is simple and finds most
1681	* of the interesting cases.
1682	*
1683	* Note that must and mlen got initialized during setup.
1684	*/
1685	static void
1686	findmust(p, g)
1687	struct parse *p;
1688	struct re_guts *g;
1689	{
1690	sop *scan;
1691	sop *start = NULL;
1692	sop *newstart = NULL;
1693	sopno newlen;
1694	sop s;
1695	char *cp;
1696	sopno i;
1697	int offset;
1698	int cs, mccs;
1699
1700	/* avoid making error situations worse */
1701	if (p->error != 0)
1702	return;
1703
1704	/* Find out if we can handle OANYOF or not */
1705	mccs = 0;
1706	for (cs = 0; cs < g->ncsets; cs++)
1707	if (g->sets[cs].multis != NULL)
1708	mccs = 1;
1709
1710	/* find the longest OCHAR sequence in strip */
1711	newlen = 0;
1712	offset = 0;
1713	g->moffset = 0;
1714	scan = g->strip + 1;
1715	do {
1716	s = *scan++;
1717	switch (OP(s)) {
1718	case OCHAR: /* sequence member */
1719	if (newlen == 0) /* new sequence */
1720	newstart = scan - 1;
1721	newlen++;
1722	break;
1723	case OPLUS_: /* things that don't break one */
1724	case OLPAREN:
1725	case ORPAREN:
1726	break;
1727	case OQUEST_: /* things that must be skipped */
1728	case OCH_:
1729	offset = altoffset(scan, offset, mccs);
1730	scan--;
1731	do {
1732	scan += OPND(s);
1733	s = *scan;
1734	/* assert() interferes w debug printouts */
1735	if (OP(s) != O_QUEST && OP(s) != O_CH &&
1736	OP(s) != OOR2) {
1737	g->iflags \|= BAD;
1738	return;
1739	}
1740	} while (OP(s) != O_QUEST && OP(s) != O_CH);
1741	/* fallthrough */
1742	case OBOW: /* things that break a sequence */
1743	case OEOW:
1744	case OBOL:
1745	case OEOL:
1746	case O_QUEST:
1747	case O_CH:
1748	case OEND:
1749	if (newlen > g->mlen) { /* ends one */
1750	start = newstart;
1751	g->mlen = newlen;
1752	if (offset > -1) {
1753	g->moffset += offset;
1754	offset = newlen;
1755	} else
1756	g->moffset = offset;
1757	} else {
1758	if (offset > -1)
1759	offset += newlen;
1760	}
1761	newlen = 0;
1762	break;
1763	case OANY:
1764	if (newlen > g->mlen) { /* ends one */
1765	start = newstart;
1766	g->mlen = newlen;
1767	if (offset > -1) {
1768	g->moffset += offset;
1769	offset = newlen;
1770	} else
1771	g->moffset = offset;
1772	} else {
1773	if (offset > -1)
1774	offset += newlen;
1775	}
1776	if (offset > -1)
1777	offset++;
1778	newlen = 0;
1779	break;
1780	case OANYOF: /* may or may not invalidate offset */
1781	/* First, everything as OANY */
1782	if (newlen > g->mlen) { /* ends one */
1783	start = newstart;
1784	g->mlen = newlen;
1785	if (offset > -1) {
1786	g->moffset += offset;
1787	offset = newlen;
1788	} else
1789	g->moffset = offset;
1790	} else {
1791	if (offset > -1)
1792	offset += newlen;
1793	}
1794	if (offset > -1)
1795	offset++;
1796	newlen = 0;
1797	/* And, now, if we found out we can't deal with
1798	* it, make offset = -1.
1799	*/
1800	if (mccs)
1801	offset = -1;
1802	break;
1803	default:
1804	/* Anything here makes it impossible or too hard
1805	* to calculate the offset -- so we give up;
1806	* save the last known good offset, in case the
1807	* must sequence doesn't occur later.
1808	*/
1809	if (newlen > g->mlen) { /* ends one */
1810	start = newstart;
1811	g->mlen = newlen;
1812	if (offset > -1)
1813	g->moffset += offset;
1814	else
1815	g->moffset = offset;
1816	}
1817	offset = -1;
1818	newlen = 0;
1819	break;
1820	}
1821	} while (OP(s) != OEND);
1822
1823	if (g->mlen == 0) { /* there isn't one */
1824	g->moffset = -1;
1825	return;
1826	}
1827
1828	/* turn it into a character string */
1829	g->must = malloc((size_t)g->mlen + 1);
1830	if (g->must == NULL) { /* argh; just forget it */
1831	g->mlen = 0;
1832	g->moffset = -1;
1833	return;
1834	}
1835	cp = g->must;
1836	scan = start;
1837	for (i = g->mlen; i > 0; i--) {
1838	while (OP(s = *scan++) != OCHAR)
1839	continue;
1840	assert(cp < g->must + g->mlen);
1841	*cp++ = (char)OPND(s);
1842	}
1843	assert(cp == g->must + g->mlen);
1844	cp++ = '\0'; / just on general principles */
1845	}
1846
1847	/*
1848	- altoffset - choose biggest offset among multiple choices
1849	== static int altoffset(sop *scan, int offset, int mccs);
1850	*
1851	* Compute, recursively if necessary, the largest offset among multiple
1852	* re paths.
1853	*/
1854	static int
1855	altoffset(scan, offset, mccs)
1856	sop *scan;
1857	int offset;
1858	int mccs;
1859	{
1860	int largest;
1861	int try;
1862	sop s;
1863
1864	/* If we gave up already on offsets, return */
1865	if (offset == -1)
1866	return -1;
1867
1868	largest = 0;
1869	try = 0;
1870	s = *scan++;
1871	while (OP(s) != O_QUEST && OP(s) != O_CH) {
1872	switch (OP(s)) {
1873	case OOR1:
1874	if (try > largest)
1875	largest = try;
1876	try = 0;
1877	break;
1878	case OQUEST_:
1879	case OCH_:
1880	try = altoffset(scan, try, mccs);
1881	if (try == -1)
1882	return -1;
1883	scan--;
1884	do {
1885	scan += OPND(s);
1886	s = *scan;
1887	if (OP(s) != O_QUEST && OP(s) != O_CH &&
1888	OP(s) != OOR2)
1889	return -1;
1890	} while (OP(s) != O_QUEST && OP(s) != O_CH);
1891	/* We must skip to the next position, or we'll
1892	* leave altoffset() too early.
1893	*/
1894	scan++;
1895	break;
1896	case OANYOF:
1897	if (mccs)
1898	return -1;
1899	case OCHAR:
1900	case OANY:
1901	try++;
1902	case OBOW:
1903	case OEOW:
1904	case OLPAREN:
1905	case ORPAREN:
1906	case OOR2:
1907	break;
1908	default:
1909	try = -1;
1910	break;
1911	}
1912	if (try == -1)
1913	return -1;
1914	s = *scan++;
1915	}
1916
1917	if (try > largest)
1918	largest = try;
1919
1920	return largest+offset;
1921	}
1922
1923	/*
1924	- computejumps - compute char jumps for BM scan
1925	== static void computejumps(struct parse p, struct re_guts g);
1926	*
1927	* This algorithm assumes g->must exists and is has size greater than
1928	* zero. It's based on the algorithm found on Computer Algorithms by
1929	* Sara Baase.
1930	*
1931	* A char jump is the number of characters one needs to jump based on
1932	* the value of the character from the text that was mismatched.
1933	*/
1934	static void
1935	computejumps(p, g)
1936	struct parse *p;
1937	struct re_guts *g;
1938	{
1939	int ch;
1940	int mindex;
1941
1942	/* Avoid making errors worse */
1943	if (p->error != 0)
1944	return;
1945
1946	g->charjump = (int) malloc((NC + 1) sizeof(int));
1947	if (g->charjump == NULL) /* Not a fatal error */
1948	return;
1949	/* Adjust for signed chars, if necessary */
1950	g->charjump = &g->charjump[-(CHAR_MIN)];
1951
1952	/* If the character does not exist in the pattern, the jump
1953	* is equal to the number of characters in the pattern.
1954	*/
1955	for (ch = CHAR_MIN; ch < (CHAR_MAX + 1); ch++)
1956	g->charjump[ch] = g->mlen;
1957
1958	/* If the character does exist, compute the jump that would
1959	* take us to the last character in the pattern equal to it
1960	* (notice that we match right to left, so that last character
1961	* is the first one that would be matched).
1962	*/
1963	for (mindex = 0; mindex < g->mlen; mindex++)
1964	g->charjump[(unsigned char) g->must[mindex]] = g->mlen - mindex - 1;
1965	}
1966
1967	/*
1968	- computematchjumps - compute match jumps for BM scan
1969	== static void computematchjumps(struct parse p, struct re_guts g);
1970	*
1971	* This algorithm assumes g->must exists and is has size greater than
1972	* zero. It's based on the algorithm found on Computer Algorithms by
1973	* Sara Baase.
1974	*
1975	* A match jump is the number of characters one needs to advance based
1976	* on the already-matched suffix.
1977	* Notice that all values here are minus (g->mlen-1), because of the way
1978	* the search algorithm works.
1979	*/
1980	static void
1981	computematchjumps(p, g)
1982	struct parse *p;
1983	struct re_guts *g;
1984	{
1985	int mindex; /* General "must" iterator */
1986	int suffix; /* Keeps track of matching suffix */
1987	int ssuffix; /* Keeps track of suffixes' suffix */
1988	int* pmatches; /* pmatches[k] points to the next i
1989	* such that i+1...mlen is a substring
1990	* of k+1...k+mlen-i-1
1991	*/
1992
1993	/* Avoid making errors worse */
1994	if (p->error != 0)
1995	return;
1996
1997	pmatches = (int) malloc(g->mlen sizeof(unsigned int));
1998	if (pmatches == NULL) {
1999	g->matchjump = NULL;
2000	return;
2001	}
2002
2003	g->matchjump = (int) malloc(g->mlen sizeof(unsigned int));
2004	if (g->matchjump == NULL) /* Not a fatal error */
2005	return;
2006
2007	/* Set maximum possible jump for each character in the pattern */
2008	for (mindex = 0; mindex < g->mlen; mindex++)
2009	g->matchjump[mindex] = 2*g->mlen - mindex - 1;
2010
2011	/* Compute pmatches[] */
2012	for (mindex = g->mlen - 1, suffix = g->mlen; mindex >= 0;
2013	mindex--, suffix--) {
2014	pmatches[mindex] = suffix;
2015
2016	/* If a mismatch is found, interrupting the substring,
2017	* compute the matchjump for that position. If no
2018	* mismatch is found, then a text substring mismatched
2019	* against the suffix will also mismatch against the
2020	* substring.
2021	*/
2022	while (suffix < g->mlen
2023	&& g->must[mindex] != g->must[suffix]) {
2024	g->matchjump[suffix] = MIN(g->matchjump[suffix],
2025	g->mlen - mindex - 1);
2026	suffix = pmatches[suffix];
2027	}
2028	}
2029
2030	/* Compute the matchjump up to the last substring found to jump
2031	* to the beginning of the largest must pattern prefix matching
2032	* it's own suffix.
2033	*/
2034	for (mindex = 0; mindex <= suffix; mindex++)
2035	g->matchjump[mindex] = MIN(g->matchjump[mindex],
2036	g->mlen + suffix - mindex);
2037
2038	ssuffix = pmatches[suffix];
2039	while (suffix < g->mlen) {
2040	while (suffix <= ssuffix && suffix < g->mlen) {
2041	g->matchjump[suffix] = MIN(g->matchjump[suffix],
2042	g->mlen + ssuffix - suffix);
2043	suffix++;
2044	}
2045	if (suffix < g->mlen)
2046	ssuffix = pmatches[ssuffix];
2047	}
2048
2049	free(pmatches);
2050	}
2051
2052	/*
2053	- pluscount - count + nesting
2054	== static sopno pluscount(struct parse p, struct re_guts g);
2055	*/
2056	static sopno /* nesting depth */
2057	pluscount(p, g)
2058	struct parse *p;
2059	struct re_guts *g;
2060	{
2061	sop *scan;
2062	sop s;
2063	sopno plusnest = 0;
2064	sopno maxnest = 0;
2065
2066	if (p->error != 0)
2067	return(0); /* there may not be an OEND */
2068
2069	scan = g->strip + 1;
2070	do {
2071	s = *scan++;
2072	switch (OP(s)) {
2073	case OPLUS_:
2074	plusnest++;
2075	break;
2076	case O_PLUS:
2077	if (plusnest > maxnest)
2078	maxnest = plusnest;
2079	plusnest--;
2080	break;
2081	}
2082	} while (OP(s) != OEND);
2083	if (plusnest != 0)
2084	g->iflags \|= BAD;
2085	return(maxnest);
2086	}
2087
2088	#endif /* !_NO_REGEX */

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: trunk/libs/newlib/src/newlib/libc/posix/regcomp.c @ 567

Download in other formats: