automaton.cpp
Go to the documentation of this file.
1 /*----------------------------------------------------------------------------*/
2 /* */
3 /* Copyright (c) 1995, 2004 IBM Corporation. All rights reserved. */
4 /* Copyright (c) 2005-2009 Rexx Language Association. All rights reserved. */
5 /* */
6 /* This program and the accompanying materials are made available under */
7 /* the terms of the Common Public License v1.0 which accompanies this */
8 /* distribution. A copy is also available at the following address: */
9 /* http://www.oorexx.org/license.html */
10 /* */
11 /* Redistribution and use in source and binary forms, with or */
12 /* without modification, are permitted provided that the following */
13 /* conditions are met: */
14 /* */
15 /* Redistributions of source code must retain the above copyright */
16 /* notice, this list of conditions and the following disclaimer. */
17 /* Redistributions in binary form must reproduce the above copyright */
18 /* notice, this list of conditions and the following disclaimer in */
19 /* the documentation and/or other materials provided with the distribution. */
20 /* */
21 /* Neither the name of Rexx Language Association nor the names */
22 /* of its contributors may be used to endorse or promote products */
23 /* derived from this software without specific prior written permission. */
24 /* */
25 /* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS */
26 /* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT */
27 /* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS */
28 /* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT */
29 /* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, */
30 /* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED */
31 /* TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, */
32 /* OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY */
33 /* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING */
34 /* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS */
35 /* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
36 /* */
37 /*----------------------------------------------------------------------------*/
38 /******************************************************************************/
39 /* Object REXX Support automaton.cpp */
40 /* */
41 /* Regular Expression Utility functions */
42 /* */
43 /******************************************************************************/
44 #include <stdlib.h>
45 #include <stdio.h>
46 #include <string.h>
47 #include <ctype.h>
48 #include "automaton.hpp"
49 #include "regexp.hpp"
50 
51 // constructor: initialize automaton
52 automaton::automaton() : ch(NULL), next1(NULL), next2(NULL), final(-1), regexp(NULL),
53  setArray(NULL), setSize(0), size(16), freeState(1), currentPos(0), minimal(false)
54 {
55  int bytes = sizeof(int)*size;
56 
57  ch = (int*) malloc(bytes);
58  next1 = (int*) malloc(bytes);
59  next2 = (int*) malloc(bytes);
60 }
61 
62 // destructor: free memory
64 {
65  if (size)
66  {
67  free(ch);
68  free(next1);
69  free(next2);
70  }
71  if (setSize)
72  {
73  for (int i=0;i<setSize;i++)
74  {
75  free(setArray[i]);
76  }
77  free(setArray);
78  }
79 
80 }
81 
82 
83 /*************************************************************/
84 /* automaton::setMinimal */
85 /* */
86 /* make the automaton match minimal or maximal */
87 /* this influences the state machine. */
88 /*************************************************************/
90 {
91  if (f != this->minimal)
92  {
93  if (this->final != -1)
94  {
95  if (f == false)
96  {
97  setState(this->final, 0x00, this->final+1, this->final+1);
98  }
99  else
100  {
101  setState(this->final, EPSILON, EOP, EOP);
102  }
103  }
104  this->minimal = f;
105  }
106 }
107 
108 
109 /*************************************************************/
110 /* automaton::parse */
111 /* */
112 /* creates the automaton from the regular expression. if an */
113 /* error occurs during parsing, the error number is returned */
114 /* and the automaton will match any pattern. */
115 /* parsing is recursive. */
116 /*************************************************************/
117 int automaton::parse(const char *regexp)
118 {
119  int temp;
120  this->regexp = regexp;
121  currentPos = 0;
122  freeState = 1;
123  memset(ch, 0x00, sizeof(int)*size);
124  memset(next1, 0x00, sizeof(int)*size);
125  memset(next2, 0x00, sizeof(int)*size);
126  if (setSize)
127  {
128  for (int i=0;i<setSize;i++)
129  {
130  free(setArray[i]);
131  }
132  free(setArray);
133  setSize = 0;
134  setArray = NULL;
135  }
136 
137  try
138  {
139  // parse expression recursively by splitting it up
140  temp = expression();
141  // using a temporary place to store the start state
142  // is needed since next1 may change during reallocation
143  next1[0] = temp;
144  }
145  catch (RE_ERROR err)
146  {
147  this->regexp = NULL;
148  // an error occured!
149  setState(0, EPSILON, 0, 0); // make automaton match anything
150  return(int) err;
151  }
152  // set start state
153  setState(0, EPSILON, next1[0], next1[0]);
154  this->final = freeState;
155  if (minimal == false)
156  {
157  // zero-terminate the expression...
158  setState(freeState, 0x00, freeState+1, freeState+1);
159  freeState++;
160  }
161  else
162  {
163  // this ends the pattern, but we need a dummy
164  // show the regexp be switched between min and max matching
166  freeState++;
167  }
168  // ...and set epsilon transition to end state
170 
171 // in debug mode, print out the automaton
172 #ifdef MYDEBUG
173  printf("S\tT\t1\t2\n");
174  for (int i=0;i<size && next1[i] != EOP;i++)
175  {
176  if ((unsigned int) next1[i] < 32000)
177  {
178  printf("%d\t",i);
179  switch (ch[i] & SCAN)
180  {
181  case ANY:
182  printf("any\t");
183  break;
184  case SET:
185  printf("set %d\t",(ch[i] & 0x0fff0000) >> 16);
186  break;
187  case SET|NOT:
188  printf("nset %d\t",(ch[i] & 0x0fff0000) >> 16);
189  break;
190  case EPSILON:
191  printf("eps\t");
192  break;
193  default:
194  printf("%c\t",ch[i]);
195  break;
196  }
197  if ( (ch[i] & SCAN) == EPSILON )
198  {
199  if (next1[i] != next2[i])
200  {
201  printf("%d\t%d\n",next1[i],next2[i]);
202  }
203  }
204  else
205  {
206  printf("%d\n",next1[i]);
207  }
208  }
209  }
210 #endif
211 
212  this->regexp = NULL; // contents only guaranteed during
213  // runtime of this method
214  return 0;
215 }
216 
217 /***********************************************************/
218 /* automaton::letter */
219 /* */
220 /* check if the given character is a letter or a operator. */
221 /***********************************************************/
223 {
224  int r;
225 
226  switch (c) {
227  case '(':
228  case ')':
229  case '{':
230  case '}':
231  case '|':
232  case '*':
233  case '+':
234  case '[':
235  case ']':
236  //case '?':
237  case 0x00:
238  r=0;
239  break;
240  default:
241  r=1;
242  }
243  return r;
244 }
245 
246 /**************************************************************/
247 /* automaton::expression */
248 /* */
249 /* dissect expression into smaller parts (expression => term) */
250 /* and insert fitting states into the automaton. */
251 /**************************************************************/
253 {
254  int t1, t2;
255  int r;
256 
257  t1 = term();
258  r = t1;
259  // is the current character an 'or' operator?
260  if (regexp[currentPos] == '|')
261  {
262  currentPos++;
263  freeState++;
264  r = t2 = freeState;
265  freeState++;
266  // set epsilon transitions, evaluate right expression
267  // (left one has already been processed at this point)
268  setState(t2, EPSILON, expression(), t1);
270  setState(t1-1, ch[t1-1], t2, next2[t1-1]);
271  }
272  return r;
273 }
274 
275 /*******************/
276 /* automaton::term */
277 /* */
278 /* parse a term. */
279 /*******************/
281 {
282  int r;
283 
284  r = factor();
285 
286  // do we need to concatenate parenthesis?
287  // parenthesis need an extra epsilon transition so that | works correctly
288  if ( (regexp[currentPos] == '(') )
289  {
290  int t1 = freeState, t2;
291  freeState++;
292  t2 = term();
293  setState(t1, EPSILON, t2, t2);
294  }
295  else if ((regexp[currentPos] == '[') ||
297  {
298  term(); // simple concat
299  }
300 
301  return r;
302 }
303 
304 /***************************************************************/
305 /* automaton::factor */
306 /* */
307 /* create new states for the automaton according to meaning of */
308 /* single characters. */
309 /* known operators are '?', '\', '[', '(', '{', '*', '+' */
310 /***************************************************************/
312 {
313  int t1, t2;
314  int r;
315 
316  t1 = freeState;
317  switch (regexp[currentPos])
318  {
319  // match any single character
320  case '?':
322  r = t2 = freeState;
323  freeState++;
324  currentPos++;
325  break;
326  // escape operator, use next character literally
327  case '\\':
328  currentPos++;
329  if (regexp[currentPos] == 0x00) throw E_UNEXPECTED_EOP;
331  t2 = freeState;
332  freeState++;
333  currentPos++;
334  break;
335  // set definition
336  case '[':
337  currentPos++;
338  t2 = set();
339  if (regexp[currentPos] == ']') currentPos++;
340  else throw E_ILLEGAL_SET;
341  break;
342  // an expression in parenthesis
343  case '(':
344  currentPos++;
345  t2 = expression();
346  if (regexp[currentPos] == ')') currentPos++;
347  else throw E_MISSING_PAREN_CLOSE;
348  break;
349  default:
350  // only a letter is valid at this position
351  if (letter(regexp[currentPos]))
352  {
354  t2 = freeState;
355  freeState++;
356  currentPos++;
357  }
358  else
359  {
360  throw E_UNEXPECTED_SYMBOL;
361  }
362  }
363 
364  // the next statements are left out of the switch-block because
365  // they are all postfix operators.
366  // if they are used without operands, E_UNEXPECTED_SYMBOL is
367  // caused in the switch-block
368 
369  // match previous expression zero or more times
370  if (regexp[currentPos] == '*')
371  {
373  r = freeState;
374  next1[t1-1] = freeState;
375  freeState++;
376  currentPos++;
377  }
378  // match previous expression one or more times
379  else if (regexp[currentPos] == '+')
380  {
382  r = t1;
383  freeState++;
384  currentPos++;
385  }
386  // match previous expression exactly n times
387  else if (regexp[currentPos] == '{')
388  {
389  char stringBuffer[64];
390  int tempPos = currentPos+1;
391  int i = 0;
392  int j,k, size;
393  bool orExpression = t1!=t2;
394 
395  r = t2;
396 
397  // get n
398  while (regexp[tempPos] && regexp[tempPos] != '}')
399  {
400  stringBuffer[i] = regexp[tempPos];
401  tempPos++;
402  if (i<62) i++;
403  }
404 
405  if (regexp[tempPos] == 0x00)
406  {
407  throw E_UNEXPECTED_EOP;
408  }
409  stringBuffer[i] = 0x00;
410  i = atoi(stringBuffer) - 1;
411  if (i <= 0)
412  {
413  throw E_ILLEGAL_NUMBER;
414  }
415 
416  // the previous expression will be copied and insert
417  // n times into the automaton
418  size = freeState - t1;
419  if (orExpression)
420  {
421  size++;
422  }
423  while (i--)
424  {
425  k = freeState;
426  if (orExpression)
427  {
428  t2 += size;
429  setState(freeState++, EPSILON, t2, t2);
430  }
431  for (j=t1;j<k;j++,freeState++)
432  {
433  setState(freeState, ch[j], next1[j]+size, next2[j]+size);
434  }
435  t1 += size;
436  }
437 
438  currentPos = tempPos + 1;
439  }
440  else
441  {
442  r = t2;
443  }
444  return r;
445 }
446 
447 /**********************************************************/
448 /* automaton::set */
449 /* */
450 /* process a set definition. */
451 /* this parses the the definition and creates a character */
452 /* array that contains all specified characters. */
453 /* if this is an exclusive match, the NOT bit of the SET */
454 /* transition will be set. */
455 /**********************************************************/
457 {
458  int i=0;
459  int transition = SET;
460  int length = 256; // pre-allocation length is 256 bytes
461  // if more are need, reallocation is used
462  const char *ptr = regexp+currentPos;
463  char *range = (char*) malloc(sizeof(char)*length);
464 #ifdef WIN32
465  int (__cdecl *func)(int) = NULL; // function pointer for symbolic names
466 #else
467  int (*func)(int) = NULL;
468 #endif
469  // is this an exclusive definition?
470  if (*ptr == '^') {
471  transition |= NOT;
472  ptr++;
473  }
474 
475  // parse the set definition
476  while (*ptr && *ptr != ']') {
477  // '-' operator: go from last specified character
478  // to next specified character (fill in all in-between)
479  // (a '-' at the beginning is interpreted as a simple character)
480  if (*ptr == '-' && i) {
481  char b=ptr[-1], e=ptr[1];
482  if (e == 0x00 || e == ']') throw E_ILLEGAL_SET;
483  ptr+=2;
484  // start character > end character? then swap them
485  if (b>e) {
486  char t=b;
487  b=e; e=t;
488  }
489  // fill up the range
490  for (char c=b+1;c<=e;c++) {
491  if (checkRange(range,i,c)) range[i++] = c;
492  if (i == length) {
493  range = (char*) realloc(range,length*2);
494  length *= 2;
495  }
496  }
497  }
498  // escape operator
499  else if (*ptr == '\\') {
500  ptr++;
501  if (*ptr == 0x00) throw E_ILLEGAL_SET;
502  if (checkRange(range,i,*ptr)) range[i++] = *ptr;
503  if (i == length) {
504  range = (char*) realloc(range,length*2);
505  length *= 2;
506  }
507  ptr++;
508  }
509  // symbolic name?
510  else if (*ptr == ':') {
511  int len=0;
512  char buffer[16];
513  ptr++;
514  while (len<15 && *ptr && *ptr != ':')
515  buffer[len++] = *ptr++;
516  buffer[len] = 0x00;
517  if (*ptr != ':') throw E_ILLEGAL_SYMBOLIC_NAME;
518  ptr++;
519  // make symbolic name uppercase
520  while (len>=0) {
521  buffer[len] = toupper((int) buffer[len]);
522  len--;
523  }
524  // keyword ALPHA
525  if (!strcmp(buffer,"ALPHA"))
526  func = isalpha;
527  // keyword LOWER
528  else if (!strcmp(buffer,"LOWER"))
529  func = islower;
530  // keyword UPPER
531  else if (!strcmp(buffer,"UPPER"))
532  func = isupper;
533  // keyword ALNUM
534  else if (!strcmp(buffer,"ALNUM"))
535  func = isalnum;
536  // keyword DIGIT
537  else if (!strcmp(buffer,"DIGIT"))
538  func = isdigit;
539  // keyword XDIGIT
540  else if (!strcmp(buffer,"XDIGIT"))
541  func = isxdigit;
542  // keyword BLANK
543  else if (!strcmp(buffer,"BLANK")) {
544  // whitespace...
545  if (checkRange(range,i,' ')) range[i++] = ' ';
546  if (i == length) {
547  range = (char*) realloc(range,length*2);
548  length *= 2;
549  }
550  // ...or tab
551  if (checkRange(range,i,0x09)) range[i++] = 0x09;
552  if (i == length) {
553  range = (char*) realloc(range,length*2);
554  length *= 2;
555  }
556  }
557  // keyword SPACE
558  else if (!strcmp(buffer,"SPACE"))
559  func = isspace;
560  // keyword CNTRL
561  else if (!strcmp(buffer,"CNTRL"))
562  func = iscntrl;
563  // keyword PRINT
564  else if (!strcmp(buffer,"PRINT"))
565  func = isprint;
566  // keyword PUNCT
567  else if (!strcmp(buffer,"PUNCT"))
568  func = ispunct;
569  // keyword GRAPH
570  else if (!strcmp(buffer,"GRAPH"))
571  func = isgraph;
572  // unknown symbolic name?
573  else
575 
576  if (func) {
577  int j;
578  // go over all ASCII characters and use the specified
579  // function...
580  for (j=0;j<256;j++)
581  if ( (*func)(j) ) {
582  if (checkRange(range,i,j)) range[i++] = j;
583  if (i == length) {
584  range = (char*) realloc(range,length*2);
585  length *= 2;
586  }
587  }
588  }
589 
590  }
591  // simple character
592  else {
593  if (checkRange(range,i,*ptr)) range[i++] = *ptr;
594  ptr++;
595  if (i == length) {
596  range = (char*) realloc(range,length*2);
597  length *= 2;
598  }
599  }
600  }
601  range[i]=0x00; // null-terminate string
602 
603  // empty sets are not allowed
604  if (i == 0) throw E_ILLEGAL_SET;
605  else currentPos+=(int)(ptr-(regexp+currentPos));
606 
607  // set SET transition (bits 16 to 27 contain the set number)
608  // that is created by insertSet
609  setState(freeState, transition | (insertSet(range)<<16), freeState+1, freeState+1);
610  i = freeState;
611  freeState++;
612 
613  free(range);
614 
615  return i;
616 }
617 
618 /******************************************************************/
619 /* automaton::setState */
620 /* */
621 /* insert a state into the automaton. */
622 /* if the given position is too large for the currently allocated */
623 /* arrays, reallocation will occur. */
624 /******************************************************************/
625 void automaton::setState(int position, int transition, int state1, int state2)
626 {
627  // insert an element not yet settable?
628  // yes, then get enough memory for that
629  while (size <= position) {
630  size<<=1;
631 
632  ch = (int*) realloc(ch,size*sizeof(int));
633  next1 = (int*) realloc(next1,size*sizeof(int));
634  next2 = (int*) realloc(next2,size*sizeof(int));
635  }
636  ch[position] = transition;
637  next1[position] = state1;
638  next2[position] = state2;
639 }
640 
641 /************************************************************/
642 /* automaton::checkRange */
643 /* */
644 /* small helper function to check if a character is already */
645 /* in the range (avoid multiple entries). */
646 /* returns 1 if character not in range, zero otherwise. */
647 /************************************************************/
648 int automaton::checkRange(char *range, int length, char c)
649 {
650  int rc = 1;
651  for (int i=0;i<length;i++)
652  if (range[i] == c) {
653  rc = 0;
654  break;
655  }
656  return rc;
657 }
658 
659 /***************************************************************/
660 /* automaton::insertSet */
661 /* */
662 /* create a set and return the set number. */
663 /* the set is an array of characters, the first element of the */
664 /* set specifies the number of elements in the array. */
665 /* the setArray containing pointers to the sets will be re- */
666 /* allocated on each call. it is assumed that performance is */
667 /* secondary because parsing takes place only once and usually */
668 /* there are only a few set definitions at all. */
669 /***************************************************************/
670 int automaton::insertSet(char *range)
671 {
672  unsigned int i;
673 
674  setSize++;
675  // enlarge setArray
676  setArray = (int**) realloc(setArray,setSize*sizeof(int*));
677  // get memory for set array
678  setArray[setSize-1] = (int*) malloc((strlen(range)+1)*sizeof(int));
679 
680  // fill in elements
681  for ( i=0; i<strlen(range); i++)
682  setArray[setSize-1][i+1] = (int) range[i];
683 
684  setArray[setSize-1][0] = i; // set length
685  return setSize-1;
686 }
687 
688 /*************************************************/
689 /* automaton::match */
690 /* */
691 /* try to match a string with the automaton. */
692 /* returns 1 on success and 0 on failure. */
693 /* matching is non-recursively done with a queue */
694 /* that has a push, put and pop method. */
695 /*************************************************/
696 int automaton::match(const char *a, int N) // string length passed in
697  // instead of strlen
698 {
699  int n1, n2;
700  int j = 0;
701  int state = next1[0]; // get start state
702  doubleQueue dq(64); // create a double queue
703  // one SCAN symbol will be put into the queue
704  int i;
705  int set;
706  int len;
707  bool found;
708 
709  // terminates when end state (==0) is reached
710  while (state) {
711 #ifdef MYDEBUG
712  printf("double queue dump:\n");
713  dq.dump();
714 #endif
715  // go to next character
716  if (state == SCAN) {
717  if (minimal == true && j == N) break; // for minimal match
718 #ifdef MYDEBUG
719  printf("consume %2d %c\n",j,a[j]);
720 #endif
721  j++;
722  dq.put(SCAN);
723  }
724  else
725  switch (ch[state] & SCAN) {
726  case EPSILON: // epsilon transition
727  n1 = next1[state];
728  n2 = next2[state];
729  dq.push(n1);
730  if (n1 != n2) dq.push(n2);
731  break;
732  case SET: // inclusive set
733  case SET|NOT: // exclusive set
734  set = (ch[state] & 0x0fff0000)>>16; // get set number
735  len = setArray[set][0]; // get number of elements in set
736  found = (ch[state]&NOT)?true:false; // set default value
737 
738  for (i=1; i<=len; i++) {
739  if (setArray[set][i] == (int) a[j]) {
740  found = !found;
741  break;
742  }
743  }
744  if (found) {
745  dq.put(next1[state]);
746  }
747  break;
748  case ANY: // just match any character
749  dq.put(next1[state]);
750  break;
751  default:
752  if (j < N) { // freeState: we can't read past end of string
753  // normal character?
754  if (ch[state] == (int) a[j]) {
755  dq.put(next1[state]);
756  }
757  } else if (j == N) { // simulate zero-terminated string in any case
758  if (ch[state] == (int) 0) {
759  dq.put(next1[state]);
760  }
761  }
762  break;
763  }
764 #ifdef MYDEBUG
765  printf("dq is %s\n",dq.isEmpty()?"empty":"NOT empty");
766  dq.dump();
767 #endif
768  // break out if we've reached the end of the string
769  // or no more states are available
770  if (dq.isEmpty() || (j>(N+1)) ) break;
771  state = dq.pop();
772  }
773 
774  currentPos = j;
775 
776  if (currentPos > N) currentPos = N;
777 
778  // return 1 if end state (EOP) has been reached
779  return state==EOP?1:0;
780 }
int set()
Definition: automaton.cpp:456
int checkRange(char *, int, char)
Definition: automaton.cpp:648
int match(const char *, int)
Definition: automaton.cpp:696
int expression()
Definition: automaton.cpp:252
const char * regexp
Definition: automaton.hpp:87
int currentPos
Definition: automaton.hpp:94
int setSize
Definition: automaton.hpp:90
int insertSet(char *)
Definition: automaton.cpp:670
void setState(int, int, int, int)
Definition: automaton.cpp:625
bool minimal
Definition: automaton.hpp:95
int * ch
Definition: automaton.hpp:81
int letter(int)
Definition: automaton.cpp:222
int freeState
Definition: automaton.hpp:93
int * next1
Definition: automaton.hpp:82
int parse(const char *)
Definition: automaton.cpp:117
int term()
Definition: automaton.cpp:280
int * next2
Definition: automaton.hpp:83
int factor()
Definition: automaton.cpp:311
int ** setArray
Definition: automaton.hpp:89
void setMinimal(bool)
Definition: automaton.cpp:89
void put(int value)
Definition: dblqueue.cpp:74
bool isEmpty()
Definition: dblqueue.hpp:52
int pop()
Definition: dblqueue.cpp:87
void push(int value)
Definition: dblqueue.cpp:62
#define EPSILON
Definition: regexp.hpp:45
RE_ERROR
Definition: regexp.hpp:52
@ E_UNEXPECTED_EOP
Definition: regexp.hpp:53
@ E_ILLEGAL_SYMBOLIC_NAME
Definition: regexp.hpp:53
@ E_UNEXPECTED_SYMBOL
Definition: regexp.hpp:52
@ E_MISSING_PAREN_CLOSE
Definition: regexp.hpp:52
@ E_ILLEGAL_SET
Definition: regexp.hpp:52
@ E_ILLEGAL_NUMBER
Definition: regexp.hpp:53
#define SET
Definition: regexp.hpp:48
#define EOP
Definition: regexp.hpp:49
#define ANY
Definition: regexp.hpp:47
#define SCAN
Definition: regexp.hpp:44
#define NOT
Definition: regexp.hpp:46