automaton.cpp
Go to the documentation of this file.
1 /*----------------------------------------------------------------------------*/
2 /* */
3 /* Copyright (c) 1995, 2004 IBM Corporation. All rights reserved. */
4 /* Copyright (c) 2005-2017 Rexx Language Association. All rights reserved. */
5 /* */
6 /* This program and the accompanying materials are made available under */
7 /* the terms of the Common Public License v1.0 which accompanies this */
8 /* distribution. A copy is also available at the following address: */
9 /* https://www.oorexx.org/license.html */
10 /* */
11 /* Redistribution and use in source and binary forms, with or */
12 /* without modification, are permitted provided that the following */
13 /* conditions are met: */
14 /* */
15 /* Redistributions of source code must retain the above copyright */
16 /* notice, this list of conditions and the following disclaimer. */
17 /* Redistributions in binary form must reproduce the above copyright */
18 /* notice, this list of conditions and the following disclaimer in */
19 /* the documentation and/or other materials provided with the distribution. */
20 /* */
21 /* Neither the name of Rexx Language Association nor the names */
22 /* of its contributors may be used to endorse or promote products */
23 /* derived from this software without specific prior written permission. */
24 /* */
25 /* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS */
26 /* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT */
27 /* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS */
28 /* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT */
29 /* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, */
30 /* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED */
31 /* TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, */
32 /* OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY */
33 /* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING */
34 /* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS */
35 /* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
36 /* */
37 /*----------------------------------------------------------------------------*/
38 /******************************************************************************/
39 /* Object REXX Support automaton.cpp */
40 /* */
41 /* Regular Expression Utility functions */
42 /* */
43 /******************************************************************************/
44 #include <stdlib.h>
45 #include <stdio.h>
46 #include <string.h>
47 #include <ctype.h>
48 #include "automaton.hpp"
49 #include "regexp.hpp"
50 //#define MYDEBUG
51 
52 // constructor: initialize automaton
53 automaton::automaton() : ch(NULL), next1(NULL), next2(NULL), final(-1), regexp(NULL),
54  setArray(NULL), setSize(0), size(16), freeState(1), currentPos(0), minimal(false)
55 {
56  int bytes = sizeof(int)*size;
57 
58  ch = (int*) malloc(bytes);
59  next1 = (int*) malloc(bytes);
60  next2 = (int*) malloc(bytes);
61 }
62 
63 // destructor: free memory
65 {
66  if (size)
67  {
68  free(ch);
69  free(next1);
70  free(next2);
71  }
72  if (setSize)
73  {
74  for (int i=0;i<setSize;i++)
75  {
76  free(setArray[i]);
77  }
78  free(setArray);
79  }
80 
81 }
82 
83 
84 /*************************************************************/
85 /* automaton::setMinimal */
86 /* */
87 /* make the automaton match minimal or maximal */
88 /* this influences the state machine. */
89 /*************************************************************/
91 {
92  if (f != this->minimal)
93  {
94  if (this->final != -1)
95  {
96  if (f == false)
97  {
98  setState(this->final, 0x00, this->final+1, this->final+1);
99  }
100  else
101  {
102  setState(this->final, EPSILON, EOP, EOP);
103  }
104  }
105  this->minimal = f;
106  }
107 }
108 
109 
110 /*************************************************************/
111 /* automaton::parse */
112 /* */
113 /* creates the automaton from the regular expression. if an */
114 /* error occurs during parsing, the error number is returned */
115 /* and the automaton will match any pattern. */
116 /* parsing is recursive. */
117 /*************************************************************/
118 
119 /*
120 We parse a regular expression pattern to create a finite-state
121 machine (FSM) that later match() can use to efficiently determine
122 whether a string matches the pattern or not.
123 
124 The FSM's main part are three arrays, where the array index is the
125 state number.
126  ch[] holds one of the following:
127  a specific character which this state must consume
128  EPSILON, indicating that this state doesn't consume a character
129  ANY, indicating that this state consumes any character
130  SET, indicating that this state consumes a character from a set
131  next1[] the state to transition to for the next step
132  next2[] if next2[] is not equal to next1[], it specifies an alternative
133  state to transition to
134 
135  freeState keeps track of the first yet unused state.
136  setState() is used to set a new or modify an existing state.
137 
138 
139 Parsing is done in a single pass.
140 
141 Here's an example. After parsing the pattern "ABC", the FSM will look
142 like this:
143 state ch next1 next2
144 0 eps 1
145 1 A 2
146 2 B 3
147 3 C 4
148 
149 Starting in state 0, we transition to state 1 without consuming anything.
150 In state 1 we consume "A" and transition to state 2.
151 In state 2 we consume "B" and transition to state 3.
152 In state 3 we consume the final "C", and transition to the terminal state 4.
153 
154 Now let's see what happens if we change the pattern to e. g. "ABC*".
155 Having parsed "ABC", the FSM is as shown above.
156 When the parser sees the asterisk, it will modify the FSM as follows
157 (see factor() function, block regexp[currentPos] == '*'):
158 
159 (1) setState(freeState, EPSILON, freeState+1, t2);
160 (2) next1[t1-1] = freeState;
161 
162 (1) adds a new epsilon-state that allows to either transition to the new
163 terminal state (freeState+1), or to the start of the last expression, which
164 in our example is state 3, which consumes "C".
165 (2) modifies the existing state t1-1, the state before our last expression,
166 in our case state 2, consuming "B", to alternatively transition to state 4.
167 state ch next1 next2
168 0 eps 1
169 1 A 2
170 2 B 4 3
171 3 C 4
172 4 eps 5 3
173 
174 Following the state transitions, we find that this will - as expected -
175 allow AB, ABC, ABCC, ABCCC.. to be successfully matched.
176 
177 
178 Parsing is done (roughly) according to this regular expression BNF:
179 
180  expression ::= term | term "|" expression
181 
182  term ::= factor | factor term
183 
184  factor ::= atom | atom metacharacter
185 
186  atom ::= "?" | "[" set "]" | "(" expression ")" | character
187 
188  metacharacter ::= "*" | "+" | "{" times "}"
189 
190 */
191 int automaton::parse(const char *regexp)
192 {
193  int temp;
194  this->regexp = regexp;
195  currentPos = 0;
196  freeState = 1;
197  memset(ch, 0x00, sizeof(int)*size);
198  memset(next1, 0x00, sizeof(int)*size);
199  memset(next2, 0x00, sizeof(int)*size);
200  if (setSize)
201  {
202  for (int i=0;i<setSize;i++)
203  {
204  free(setArray[i]);
205  }
206  free(setArray);
207  setSize = 0;
208  setArray = NULL;
209  }
210 
211  try
212  {
213  // parse expression recursively by splitting it up
214  temp = expression();
215  // using a temporary place to store the start state
216  // is needed since next1 may change during reallocation
217  next1[0] = temp;
218  }
219  catch (RE_ERROR err)
220  {
221  this->regexp = NULL;
222  // an error occured!
223  setState(0, EPSILON, 0, 0); // make automaton match anything
224  return(int) err;
225  }
226  // set start state
227  setState(0, EPSILON, next1[0], next1[0]);
228  this->final = freeState;
229  if (minimal == false)
230  {
231  // zero-terminate the expression...
232  setState(freeState, 0x00, freeState+1, freeState+1);
233  freeState++;
234  }
235  else
236  {
237  // this ends the pattern, but we need a dummy
238  // show the regexp be switched between min and max matching
240  freeState++;
241  }
242  // ...and set epsilon transition to end state
244 
245 // in debug mode, print out the automaton
246 #ifdef MYDEBUG
247  printf("state\tch\tnext1\tnext2\n");
248  for (int i=0;i<size && next1[i] != EOP;i++)
249  {
250  if ((unsigned int) next1[i] < 32000)
251  {
252  printf("%d\t",i);
253  switch (ch[i] & SCAN)
254  {
255  case ANY:
256  printf("any\t");
257  break;
258  case SET:
259  printf("set %d\t",(ch[i] & 0x0fff0000) >> 16);
260  break;
261  case SET|NOT:
262  printf("nset %d\t",(ch[i] & 0x0fff0000) >> 16);
263  break;
264  case EPSILON:
265  printf("eps\t");
266  break;
267  default:
268  printf("%c\t",ch[i]);
269  break;
270  }
271  if (next1[i] != next2[i])
272  {
273  printf("%d\t%d\n",next1[i],next2[i]);
274  }
275  else
276  {
277  printf("%d\n",next1[i]);
278  }
279  }
280  }
281 #endif
282 
283  this->regexp = NULL; // contents only guaranteed during
284  // runtime of this method
285  return 0;
286 }
287 
288 /***********************************************************/
289 /* automaton::letter */
290 /* */
291 /* check if the given character is a letter or a operator. */
292 /***********************************************************/
294 {
295  int r;
296 
297  switch (c) {
298  case '(':
299  case ')':
300  case '{':
301  case '}':
302  case '|':
303  case '*':
304  case '+':
305  case '[':
306  case ']':
307  //case '?':
308  case 0x00:
309  r=0;
310  break;
311  default:
312  r=1;
313  }
314  return r;
315 }
316 
317 /**************************************************************/
318 /* automaton::expression */
319 /* */
320 /* expression ::= term | term "|" expression */
321 /* */
322 /* dissect expression into smaller parts (expression => term) */
323 /* and insert fitting states into the automaton. */
324 /**************************************************************/
326 {
327  int t1, t2;
328  int r;
329 
330  t1 = term();
331  r = t1;
332  // is the current character an 'or' operator?
333  if (regexp[currentPos] == '|')
334  {
335  currentPos++;
336  freeState++;
337  r = t2 = freeState;
338  freeState++;
339  // set epsilon transitions, evaluate right expression
340  // (left one has already been processed at this point)
341  setState(t2, EPSILON, expression(), t1);
343  setState(t1-1, ch[t1-1], t2, next2[t1-1]);
344  }
345  return r;
346 }
347 
348 /*********************************/
349 /* automaton::term */
350 /* */
351 /* term ::= factor | factor term */
352 /* */
353 /* parse a term. */
354 /*********************************/
356 {
357  int r;
358 
359  r = factor();
360 
361  // another term following?
362  if ( (regexp[currentPos] == '(') ||
363  (regexp[currentPos] == '[') ||
365  {
366  term();
367  }
368 
369  return r;
370 }
371 
372 /***************************************************************/
373 /* automaton::factor */
374 /* */
375 /* factor ::= atom | atom metacharacter */
376 /* */
377 /* atom ::= "?" | "[" set "]" | "(" expression ")" | character */
378 /* */
379 /* metacharacter ::= "*" | "+" | "{" times "}" */
380 /* */
381 /* create new states for the automaton according to meaning of */
382 /* single characters. */
383 /* known operators are '?', '\', '[', '(', '{', '*', '+' */
384 /* */
385 /* returns the start state for this factor */
386 /***************************************************************/
388 {
389  int t1, t2;
390  int r;
391 
392  t1 = freeState;
393  switch (regexp[currentPos])
394  {
395  // match any single character
396  case '?':
397  // consume any character and transition to the following state
399  r = t2 = freeState;
400  freeState++;
401  currentPos++;
402  break;
403  // escape operator, use next character literally
404  case '\\':
405  currentPos++;
406  if (regexp[currentPos] == 0x00) throw E_UNEXPECTED_EOP;
407  // consume the current character and transition to the following state
409  t2 = freeState;
410  freeState++;
411  currentPos++;
412  break;
413  // set definition
414  case '[':
415  currentPos++;
416  t2 = set();
417  if (regexp[currentPos] == ']') currentPos++;
418  else throw E_ILLEGAL_SET;
419  break;
420  // an expression in parenthesis
421  case '(':
422  currentPos++; // step over "("
423 
424  // parenthesis need an extra leading epsilon transition so that | works correctly
425  freeState++; // reserve space for leading epsilon transition
426  t2 = expression();
427  setState(t1, EPSILON, t2, t2);
428 
429  // we also require an extra closing epsilon transition to make "(A|B)x*" work
431  freeState++;
432 
433  if (regexp[currentPos] == ')') currentPos++;
434  else throw E_MISSING_PAREN_CLOSE;
435  break;
436  default:
437  // only a letter is valid at this position
438  if (letter(regexp[currentPos]))
439  {
440  // consume the letter and transition to the following state
442  t2 = freeState;
443  freeState++;
444  currentPos++;
445  }
446  else
447  {
448  throw E_UNEXPECTED_SYMBOL;
449  }
450  }
451 
452  // the next statements are left out of the switch-block because
453  // they are all postfix operators.
454  // if they are used without operands, E_UNEXPECTED_SYMBOL is
455  // caused in the switch-block
456 
457  // match previous expression zero or more times
458  if (regexp[currentPos] == '*')
459  {
461  r = freeState;
462  next1[t1-1] = freeState;
463  freeState++;
464  currentPos++;
465  }
466  // match previous expression one or more times
467  else if (regexp[currentPos] == '+')
468  {
469  // consume the letter and transition to the following state
471  r = t1;
472  freeState++;
473  currentPos++;
474  }
475  // match previous expression exactly n times
476  else if (regexp[currentPos] == '{')
477  {
478  char stringBuffer[64];
479  int tempPos = currentPos+1;
480  int i = 0;
481  int j,k, size;
482  bool orExpression = t1!=t2;
483 
484  r = t2;
485 
486  // get n
487  while (regexp[tempPos] && regexp[tempPos] != '}')
488  {
489  stringBuffer[i] = regexp[tempPos];
490  tempPos++;
491  if (i<62) i++;
492  }
493 
494  if (regexp[tempPos] == 0x00)
495  {
496  throw E_UNEXPECTED_EOP;
497  }
498  stringBuffer[i] = 0x00;
499  i = atoi(stringBuffer) - 1;
500  if (i <= 0)
501  {
502  throw E_ILLEGAL_NUMBER;
503  }
504 
505  // the previous expression will be copied and insert
506  // n times into the automaton
507  size = freeState - t1;
508  if (orExpression)
509  {
510  size++;
511  }
512  while (i--)
513  {
514  k = freeState;
515  if (orExpression)
516  {
517  t2 += size;
518  setState(freeState++, EPSILON, t2, t2);
519  }
520  for (j=t1;j<k;j++,freeState++)
521  {
522  setState(freeState, ch[j], next1[j]+size, next2[j]+size);
523  }
524  t1 += size;
525  }
526 
527  currentPos = tempPos + 1;
528  }
529  else
530  {
531  r = t2;
532  }
533  return r;
534 }
535 
536 /**********************************************************/
537 /* automaton::set */
538 /* */
539 /* process a set definition. */
540 /* this parses the the definition and creates a character */
541 /* array that contains all specified characters. */
542 /* if this is an exclusive match, the NOT bit of the SET */
543 /* transition will be set. */
544 /**********************************************************/
546 {
547  int i=0;
548  int transition = SET;
549  int length = 257; // pre-allocation length is 257 bytes
550  // if more are need, reallocation is used
551  const char *ptr = regexp+currentPos;
552  char *range = (char*) malloc(sizeof(char)*length);
553 #ifdef WIN32
554  int (__cdecl *func)(int) = NULL; // function pointer for symbolic names
555 #else
556  int (*func)(int) = NULL;
557 #endif
558  // is this an exclusive definition?
559  if (*ptr == '^') {
560  transition |= NOT;
561  ptr++;
562  }
563 
564  // parse the set definition
565  while (*ptr && *ptr != ']') {
566  // '-' operator: go from last specified character
567  // to next specified character (fill in all in-between)
568  // (a '-' at the beginning is interpreted as a simple character)
569  if (*ptr == '-' && i) {
570  char b=ptr[-1], e=ptr[1];
571  if (e == 0x00 || e == ']') throw E_ILLEGAL_SET;
572  ptr+=2;
573  // start character > end character? then swap them
574  if (b>e) {
575  char t=b;
576  b=e; e=t;
577  }
578  // fill up the range
579  for (char c=b+1;c<=e;c++) {
580  if (checkRange(range,i,c)) range[i++] = c;
581  if (i == length) {
582  range = (char*) realloc(range,length*2);
583  length *= 2;
584  }
585  }
586  }
587  // escape operator
588  else if (*ptr == '\\') {
589  ptr++;
590  if (*ptr == 0x00) throw E_ILLEGAL_SET;
591  if (checkRange(range,i,*ptr)) range[i++] = *ptr;
592  if (i == length) {
593  range = (char*) realloc(range,length*2);
594  length *= 2;
595  }
596  ptr++;
597  }
598  // symbolic name?
599  else if (*ptr == ':') {
600  int len=0;
601  char buffer[16];
602  ptr++;
603  while (len<15 && *ptr && *ptr != ':')
604  buffer[len++] = *ptr++;
605  buffer[len] = 0x00;
606  if (*ptr != ':') throw E_ILLEGAL_SYMBOLIC_NAME;
607  ptr++;
608  // make symbolic name uppercase
609  while (len>=0) {
610  buffer[len] = toupper((int) buffer[len]);
611  len--;
612  }
613  // keyword ALPHA
614  if (!strcmp(buffer,"ALPHA"))
615  func = isalpha;
616  // keyword LOWER
617  else if (!strcmp(buffer,"LOWER"))
618  func = islower;
619  // keyword UPPER
620  else if (!strcmp(buffer,"UPPER"))
621  func = isupper;
622  // keyword ALNUM
623  else if (!strcmp(buffer,"ALNUM"))
624  func = isalnum;
625  // keyword DIGIT
626  else if (!strcmp(buffer,"DIGIT"))
627  func = isdigit;
628  // keyword XDIGIT
629  else if (!strcmp(buffer,"XDIGIT"))
630  func = isxdigit;
631  // keyword BLANK
632  else if (!strcmp(buffer,"BLANK")) {
633  // whitespace...
634  if (checkRange(range,i,' ')) range[i++] = ' ';
635  if (i == length) {
636  range = (char*) realloc(range,length*2);
637  length *= 2;
638  }
639  // ...or tab
640  if (checkRange(range,i,0x09)) range[i++] = 0x09;
641  if (i == length) {
642  range = (char*) realloc(range,length*2);
643  length *= 2;
644  }
645  }
646  // keyword SPACE
647  else if (!strcmp(buffer,"SPACE"))
648  func = isspace;
649  // keyword CNTRL
650  else if (!strcmp(buffer,"CNTRL"))
651  func = iscntrl;
652  // keyword PRINT
653  else if (!strcmp(buffer,"PRINT"))
654  func = isprint;
655  // keyword PUNCT
656  else if (!strcmp(buffer,"PUNCT"))
657  func = ispunct;
658  // keyword GRAPH
659  else if (!strcmp(buffer,"GRAPH"))
660  func = isgraph;
661  // unknown symbolic name?
662  else
664 
665  if (func) {
666  int j;
667  // go over all ASCII characters and use the specified
668  // function...
669  for (j=0;j<256;j++)
670  if ( (*func)(j) ) {
671  if (checkRange(range,i,j)) range[i++] = j;
672  if (i == length) {
673  range = (char*) realloc(range,length*2);
674  length *= 2;
675  }
676  }
677  }
678 
679  }
680  // simple character
681  else {
682  if (checkRange(range,i,*ptr)) range[i++] = *ptr;
683  ptr++;
684  if (i == length) {
685  range = (char*) realloc(range,length*2);
686  length *= 2;
687  }
688  }
689  }
690 
691  // empty sets are not allowed
692  if (i == 0) throw E_ILLEGAL_SET;
693  else currentPos+=(int)(ptr-(regexp+currentPos));
694 
695  // set SET transition (bits 16 to 27 contain the set number)
696  // that is created by insertSet
697  // consume a character from/not from the set and transition to the following state
698  setState(freeState, transition | (insertSet(range, i)<<16), freeState+1, freeState+1);
699  i = freeState;
700  freeState++;
701 
702  free(range);
703 
704  return i;
705 }
706 
707 /******************************************************************/
708 /* automaton::setState */
709 /* */
710 /* insert a state into the automaton. */
711 /* if the given position is too large for the currently allocated */
712 /* arrays, reallocation will occur. */
713 /******************************************************************/
714 void automaton::setState(int position, int transition, int state1, int state2)
715 {
716  // insert an element not yet settable?
717  // yes, then get enough memory for that
718  while (size <= position) {
719  size<<=1;
720 
721  ch = (int*) realloc(ch,size*sizeof(int));
722  next1 = (int*) realloc(next1,size*sizeof(int));
723  next2 = (int*) realloc(next2,size*sizeof(int));
724  }
725  ch[position] = transition;
726  next1[position] = state1;
727  next2[position] = state2;
728 }
729 
730 /************************************************************/
731 /* automaton::checkRange */
732 /* */
733 /* small helper function to check if a character is already */
734 /* in the range (avoid multiple entries). */
735 /* returns 1 if character not in range, zero otherwise. */
736 /************************************************************/
737 int automaton::checkRange(char *range, int length, char c)
738 {
739  int rc = 1;
740  for (int i=0;i<length;i++)
741  if (range[i] == c) {
742  rc = 0;
743  break;
744  }
745  return rc;
746 }
747 
748 /***************************************************************/
749 /* automaton::insertSet */
750 /* */
751 /* create a set and return the set number. */
752 /* the set is an array of characters, the first element of the */
753 /* set specifies the number of elements in the array. */
754 /* the setArray containing pointers to the sets will be re- */
755 /* allocated on each call. it is assumed that performance is */
756 /* secondary because parsing takes place only once and usually */
757 /* there are only a few set definitions at all. */
758 /***************************************************************/
759 int automaton::insertSet(char *range, int rangeSize)
760 {
761  setSize++;
762  // enlarge setArray
763  setArray = (int**) realloc(setArray,setSize*sizeof(int*));
764  // get memory for set array
765  setArray[setSize-1] = (int*) malloc((rangeSize+1)*sizeof(int));
766 
767  // fill in elements
768  for ( int i=0; i<rangeSize; i++)
769  setArray[setSize-1][i+1] = (int) range[i];
770 
771  setArray[setSize-1][0] = rangeSize; // set length
772  return setSize-1;
773 }
774 
775 /*************************************************/
776 /* automaton::match */
777 /* */
778 /* try to match a string with the automaton. */
779 /* returns 1 on success and 0 on failure. */
780 /* matching is non-recursively done with a queue */
781 /* that has a push, put and pop method. */
782 /*************************************************/
783 int automaton::match(const char *a, int N) // string length passed in
784  // instead of strlen
785 {
786  int n1, n2;
787  int j = 0;
788  int state = next1[0]; // get start state
789  doubleQueue dq(64); // create a double queue
790  // one SCAN symbol will be put into the queue
791  int i;
792  int set;
793  int len;
794  bool found;
795 
796  // terminates when end state (==0) is reached
797  while (state) {
798 #ifdef MYDEBUG
799  printf(" ");
800  dq.dump();
801 #endif
802  // go to next character
803  if (state == SCAN) {
804  if (minimal == true && j == N) break; // for minimal match
805 #ifdef MYDEBUG
806  printf("consume %2d %c\n",j,a[j]);
807 #endif
808  j++;
809  dq.put(SCAN);
810  }
811  else
812  switch (ch[state] & SCAN) {
813  case EPSILON: // epsilon transition
814  n1 = next1[state];
815  n2 = next2[state];
816  dq.push(n1);
817  if (n1 != n2) dq.push(n2);
818  break;
819  case SET: // inclusive set
820  case SET|NOT: // exclusive set
821  set = (ch[state] & 0x0fff0000)>>16; // get set number
822  len = setArray[set][0]; // get number of elements in set
823  found = (ch[state]&NOT)?true:false; // set default value
824 
825  for (i=1; i<=len; i++) {
826  if (setArray[set][i] == (int) a[j]) {
827  found = !found;
828  break;
829  }
830  }
831  if (found) {
832  dq.put(next1[state]);
833  }
834  break;
835  case ANY: // just match any character
836  dq.put(next1[state]);
837  break;
838  default:
839  if (j < N) { // freeState: we can't read past end of string
840  // normal character?
841  if (ch[state] == (int) a[j]) {
842  dq.put(next1[state]);
843  }
844  } else if (j == N) { // simulate zero-terminated string in any case
845  if (ch[state] == (int) 0) {
846  dq.put(next1[state]);
847  }
848  }
849  break;
850  }
851 #ifdef MYDEBUG
852  printf("%s",dq.isEmpty()?" empty ":"NOT empty ");
853  dq.dump();
854 #endif
855  // break out if we've reached the end of the string
856  // or no more states are available
857  if (dq.isEmpty() || (j>(N+1)) ) break;
858  state = dq.pop();
859  }
860 
861  currentPos = j;
862 
863  if (currentPos > N) currentPos = N;
864 
865  // return 1 if end state (EOP) has been reached
866  return state==EOP?1:0;
867 }
int set()
Definition: automaton.cpp:545
int checkRange(char *, int, char)
Definition: automaton.cpp:737
int match(const char *, int)
Definition: automaton.cpp:783
int expression()
Definition: automaton.cpp:325
const char * regexp
Definition: automaton.hpp:87
int currentPos
Definition: automaton.hpp:94
int setSize
Definition: automaton.hpp:90
int insertSet(char *, int)
Definition: automaton.cpp:759
void setState(int, int, int, int)
Definition: automaton.cpp:714
bool minimal
Definition: automaton.hpp:95
int * ch
Definition: automaton.hpp:81
int letter(int)
Definition: automaton.cpp:293
int freeState
Definition: automaton.hpp:93
int * next1
Definition: automaton.hpp:82
int parse(const char *)
Definition: automaton.cpp:191
int term()
Definition: automaton.cpp:355
int * next2
Definition: automaton.hpp:83
int factor()
Definition: automaton.cpp:387
int ** setArray
Definition: automaton.hpp:89
void setMinimal(bool)
Definition: automaton.cpp:90
void put(int value)
Definition: dblqueue.cpp:74
bool isEmpty()
Definition: dblqueue.hpp:52
int pop()
Definition: dblqueue.cpp:87
void push(int value)
Definition: dblqueue.cpp:62
#define EPSILON
Definition: regexp.hpp:45
RE_ERROR
Definition: regexp.hpp:52
@ E_UNEXPECTED_EOP
Definition: regexp.hpp:53
@ E_ILLEGAL_SYMBOLIC_NAME
Definition: regexp.hpp:53
@ E_UNEXPECTED_SYMBOL
Definition: regexp.hpp:52
@ E_MISSING_PAREN_CLOSE
Definition: regexp.hpp:52
@ E_ILLEGAL_SET
Definition: regexp.hpp:52
@ E_ILLEGAL_NUMBER
Definition: regexp.hpp:53
#define SET
Definition: regexp.hpp:48
#define EOP
Definition: regexp.hpp:49
#define ANY
Definition: regexp.hpp:47
#define SCAN
Definition: regexp.hpp:44
#define NOT
Definition: regexp.hpp:46