TextClass.cpp
Go to the documentation of this file.
1 /*----------------------------------------------------------------------------*/
2 /* */
3 /* Copyright (c) 1995, 2004 IBM Corporation. All rights reserved. */
4 /* Copyright (c) 2005-2021 Rexx Language Association. All rights reserved. */
5 /* */
6 /* This program and the accompanying materials are made available under */
7 /* the terms of the Common Public License v1.0 which accompanies this */
8 /* distribution. A copy is also available at the following address: */
9 /* http://www.oorexx.org/license.html */
10 /* */
11 /* Redistribution and use in source and binary forms, with or */
12 /* without modification, are permitted provided that the following */
13 /* conditions are met: */
14 /* */
15 /* Redistributions of source code must retain the above copyright */
16 /* notice, this list of conditions and the following disclaimer. */
17 /* Redistributions in binary form must reproduce the above copyright */
18 /* notice, this list of conditions and the following disclaimer in */
19 /* the documentation and/or other materials provided with the distribution. */
20 /* */
21 /* Neither the name of Rexx Language Association nor the names */
22 /* of its contributors may be used to endorse or promote products */
23 /* derived from this software without specific prior written permission. */
24 /* */
25 /* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS */
26 /* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT */
27 /* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS */
28 /* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT */
29 /* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, */
30 /* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED */
31 /* TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, */
32 /* OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY */
33 /* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING */
34 /* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS */
35 /* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
36 /* */
37 /*----------------------------------------------------------------------------*/
38 
39 #include "RexxCore.h"
40 #include "ProtectedObject.hpp"
41 #include "TextClass.hpp"
42 
43 
44 /******************************************************************************/
45 /* */
46 /* RexxText Class */
47 /* */
48 /******************************************************************************/
49 
50 // singleton class instance
51 RexxTextClass *RexxText::classInstance = OREF_NULL; // TheRexxTextClass
52 
53 RexxText *RexxText::nullText = OREF_NULL; // Initialized by RexxMemory::restore (RexxMemory.cpp) and RexxMemory::createImage (Setup.cpp)
54 
55 
57 {
58  CLASS_CREATE(RexxText, "RexxText", RexxTextClass);
59 }
60 
62 {
63  RexxObject *newObj = OREF_NULL;
64 
65  // This method is called from RexxMemory::createImage and RexxMemory::restore
66  // At that moment, there is no current activity, and ProtectedObject can't be used.
67  // Use GlobalProtectedObject?
69  {
70  // Normal situation
71  newObj = new RexxText();
72  ProtectedObject p(newObj);
73  newObj->setBehaviour(TheRexxTextClass->getInstanceBehaviour());
74  if (TheRexxTextClass->hasUninitDefined())
75  {
76  newObj->hasUninit();
77  }
78  ProtectedObject p_result;
79  RexxObject *arguments[1];
80  arguments[0] = s;
81  bool messageUnderstood = newObj->messageSend(OREF_INIT, arguments, 1, 0, p_result, false);
82  }
83  else
84  {
85  // Special situation
86  newObj = new RexxText();
87  // ProtectedObject p(newObj);
88  newObj->setBehaviour(TheRexxTextClass->getInstanceBehaviour());
89  if (TheRexxTextClass->hasUninitDefined())
90  {
91  newObj->hasUninit();
92  }
93 #if 0
94  // Can't send "INIT" because the extension is maybe not loaded
95  // Anyway, can't use messageSend because a ProtectedObject is needed
96  // TODO manage a native init which doesn't depend on extensions
97  ProtectedObject p_result;
98  RexxObject *arguments[1];
99  arguments[0] = s;
100  bool messageUnderstood = newObj->messageSend(OREF_INIT, arguments, 1, 0, p_result, false);
101 #endif
102  }
103  return (RexxText *)newObj;
104 }
105 
// Build a RexxText from a raw character buffer.
//
// The first blength bytes of s are wrapped in a RexxString via new_string(),
// and the result is handed to the newText(RexxString *) overload.
//
// Both branches below make the same newText(string) call; the only difference
// is whether the intermediate string is registered with a ProtectedObject.
//
// NOTE(review): listing line 112 (the `if` condition that selects between the
// two branches) is missing from this extract. Presumably it tests whether a
// current activity exists - confirm against the real source.
106 /*static*/ RexxText *RexxText::newText(const char *s, size_t blength)
107 {
108  RexxString *string = new_string(s, blength);
109 
110  // This method is called from RexxMemory::createImage and RexxMemory::restore
111  // At that moment, there is no current activity, and ProtectedObject can't be used.
113  {
114  // Normal situation
115  ProtectedObject p_string(string);
116  return newText(string);
117  }
118  else
119  {
120  // Special situation
     // The string is deliberately left unprotected here (see the comment above).
121  // ProtectedObject p_string(string);
122  return newText(string);
123  }
124 }
125 
126 RexxObject *RexxText::newRexx(RexxObject **init_args, size_t argCount, size_t named_argCount)
127 {
128  RexxObject *newObj = new RexxText();
129  ProtectedObject p(newObj);
130  newObj->setBehaviour(((RexxClass *)this)->getInstanceBehaviour());
131  if (((RexxClass *)this)->hasUninitDefined())
132  {
133  newObj->hasUninit();
134  }
135  newObj->sendMessage(OREF_INIT, init_args, argCount, named_argCount);
136  return newObj;
137 }
138 
139 void *RexxText::operator new(size_t size)
140 {
141  return new_object(size, T_RexxText);
142 }
143 
144 void RexxText::live(size_t liveMark)
145 {
146  memory_mark(this->objectVariables);
147 }
148 
149 void RexxText::liveGeneral(int reason)
150 {
151  memory_mark_general(this->objectVariables);
152 }
153 
155 {
157  flatten_reference(newThis->objectVariables, envelope);
159 }
160 
162 {
163  return (RexxString *)this->sendMessage(OREF_REQUEST, OREF_STRINGSYM);
164 }
165 
166 
167 // Handle a REQUEST('TEXT') request for a REXX text object
169 {
170  return this;
171 }
172 
173 
175 {
176  return (RexxString *)this->sendMessage(OREF_REQUEST, OREF_STRINGSYM);
177 }
178 
179 // Handle a REQUEST('TEXT') request for a REXX text object
181 {
182  if (this->isBaseClass())
183  {
184  return this;
185  }
186  else
187  {
188  // return new_text(this->getString(), this->getEncoding()); // TODO
189  return this;
190  }
191 }
192 
193 
194 // Return the primitive text value of this object
196 {
197  if (isOfClass(RexxText, this))
198  {
199  return this;
200  }
201  else
202  {
203  // return new_text(this->getString(), this->getEncoding()); // TODO
204  return this;
205  }
206 }
207 
208 
// Marking pass for the RexxText class object.
//
// Delegates to RexxClass::live() for the class-level references first; the
// class-specific static objects are meant to be marked afterwards.
//
// NOTE(review): listing line 214 (the statement that actually marks the static
// objects) is missing from this extract. Presumably it marks RexxText::nullText
// - confirm against the real source.
209 void RexxTextClass::live(size_t liveMark)
210 {
211  this->RexxClass::live(liveMark); // do RexxClass level marking
212 
213  // mark the static objects
215 }
216 
218 {
219  this->RexxClass::liveGeneral(reason);// do RexxClass level marking
220 
221  // mark the static objects
223 }
224 
225 
226 /******************************************************************************/
227 /* */
228 /* Helpers */
229 /* */
230 /******************************************************************************/
231 
232 /* All possible oorexx user-defined errors:
233  Error_Program_unreadable_user_defined
234  Error_Program_interrupted_user_defined
235  Error_System_resources_user_defined
236  Error_Unmatched_quote_user_defined
237  Error_Control_stack_user_defined
238  Error_Invalid_character_user_defined
239  Error_Symbol_or_string_user_defined
240  Error_Symbol_expected_user_defined
241  Error_Invalid_data_user_defined
242  Error_Invalid_character_string_user_defined
243  Error_Invalid_data_string_user_defined
244  Error_Invalid_subkeyword_string_user_defined
245  Error_Invalid_whole_number_user_defined
246  Error_Name_too_long_user_defined
247  Error_Invalid_variable_user_defined
248  Error_Expression_user_defined
249  Error_Logical_value_user_defined
250  Error_Invalid_expression_user_defined
251  Error_Unmatched_parenthesis_user_defined
252  Error_Unexpected_comma_user_defined
253  Error_Invalid_template_user_defined
254  Error_Incorrect_call_user_defined
255  Error_Conversion_user_defined
256  Error_Overflow_user_defined
257  Error_Routine_not_found_user_defined
258  Error_Function_no_data_user_defined
259  Error_Variable_reference_user_defined
260  Error_System_service_user_defined
261  Error_Interpretation_user_defined
262  Error_Invalid_argument_user_defined
263  Error_External_name_not_found_user_defined
264  Error_No_result_object_user_defined
265  Error_OLE_Error_user_defined
266  Error_Incorrect_method_user_defined
267  Error_No_method_user_defined
268  Error_Execution_user_defined
269  Error_Translation_user_defined
270 */
271 
272 
// Convert a Rexx object to an integer and check it against an inclusive range.
//
// obj          : object to convert; OREF_NULL goes straight to the error path.
// min, max     : inclusive bounds accepted for the converted value.
// error        : error code passed to reportException() on failure.
// errorMessage : message passed to reportException() on failure.
//
// Returns the converted value when it lies in [min, max]. In every other case
// (null object, conversion yielding TheNilObject, value out of range)
// reportException(error, errorMessage) is called.
//
// NOTE(review): listing line 277, which declares `integer`, is missing from
// this extract. Presumably `RexxInteger *integer = REQUEST_INTEGER(obj);`
// - confirm against the real source.
273 ssize_t integerRange(RexxObject *obj, ssize_t min, ssize_t max, wholenumber_t error, const char *errorMessage)
274 {
275  if (obj != OREF_NULL)
276  {
278  if (integer != TheNilObject)
279  {
280  wholenumber_t value = integer->getValue();
     // both bounds are inclusive
281  if (value >= min && value <= max) return value;
282  }
283  }
284  reportException(error, errorMessage);
285  return 0; // To avoid warning, must return something (should never reach this line)
286 }
287 
288 
// Convert a Rexx object to an integer, with no range check.
//
// obj          : object to convert; OREF_NULL skips the conversion.
// errorMessage : message intended for the error report on failure.
//
// Returns integer->getValue() when the conversion does not yield TheNilObject.
//
// NOTE(review): listing lines 293 (declaration of `integer`) and 296 (the
// statement before the final return) are missing from this extract.
// Presumably these are a REQUEST_INTEGER(obj) conversion and a
// reportException(...) call using errorMessage - confirm against the real
// source.
289 ssize_t integer(RexxObject *obj, const char *errorMessage)
290 {
291  if (obj != OREF_NULL)
292  {
294  if (integer != TheNilObject) return integer->getValue();
295  }
297  return 0; // To avoid warning, must return something (should never reach this line)
298 }
299 
300 
302 {
303  int64_t v64 = 1;
304  return *((int8_t*)&v64) == 1;
305 }
306 
307 
308 /******************************************************************************/
309 /* */
310 /* Unicode Class */
311 /* */
312 /******************************************************************************/
313 
314 // singleton class instance
316 
317 
319 {
320  CLASS_CREATE(Unicode, "Unicode", RexxClass);
321 }
322 
// NEW handler for the Unicode class. The class is not meant to be instantiated:
// none of the arguments are used, and the visible code returns TheNilObject.
//
// NOTE(review): listing line 326 is missing from this extract. Presumably it
// reports an "unsupported new method" error before the return - confirm against
// the real source.
323 RexxObject *Unicode::newRexx(RexxObject **init_args, size_t argCount, size_t named_argCount)
324 {
325  // This class has no instance...
327  return TheNilObject;
328 }
329 
331 {
332  // This class cannot be copied because it holds a large amount of information about the Unicode characters...
334  return TheNilObject;
335 }
336 
337 void *Unicode::operator new(size_t size)
338 {
339  return new_object(size, T_Unicode);
340 }
341 
342 void Unicode::live(size_t liveMark)
343 {
344  memory_mark(this->objectVariables);
345 }
346 
347 void Unicode::liveGeneral(int reason)
348 {
349  memory_mark_general(this->objectVariables);
350 }
351 
353 {
355  flatten_reference(newThis->objectVariables, envelope);
357 }
358 
360 {
361  int64_t v64 = 1;
363 }
364 
365 /******************************************************************************/
366 /* */
367 /* Unicode Class - utf8proc */
368 /* */
369 /******************************************************************************/
370 
371 #include "m17n/utf8proc/utf8proc.h"
372 
373 
// Turn a utf8proc error code into an interpreter error.
//
// errcode : negative status returned by a utf8proc function.
//
// The textual description is fetched with utf8proc_errmsg(); the switch then
// groups the codes into four families: NOMEM/OVERFLOW,
// INVALIDUTF8/NOTASSIGNED, INVALIDOPTS, and everything else.
//
// NOTE(review): the action lines of each case group (listing lines 381, 384,
// 386 and 388) are missing from this extract. Presumably each one calls
// reportException() with a different *_user_defined error code and errmsg
// - confirm against the real source.
374 void raiseError(utf8proc_ssize_t errcode)
375 {
376  const char *errmsg = utf8proc_errmsg(errcode);
377  switch (errcode)
378  {
379  case UTF8PROC_ERROR_NOMEM:
380  case UTF8PROC_ERROR_OVERFLOW:
382  case UTF8PROC_ERROR_INVALIDUTF8:
383  case UTF8PROC_ERROR_NOTASSIGNED:
385  case UTF8PROC_ERROR_INVALIDOPTS:
387  default:
389  }
390 }
391 
392 
394 {
395  return new_string(utf8proc_unicode_version());
396 }
397 
398 
399 /**
400  * Given a pair of consecutive codepoints, return whether a grapheme break is
401  * permitted between them.
402  *
403  * @param array An array of 3 items:
404  * codepoint1 [IN] The first codepoint.
405  * codepoint2 [IN] The second codepoint.
406  * state [IN OUT] Initial value must be 0.
407  *
408  * @return .true if a grapheme break is permitted, .false otherwise.
409  */
411 {
412  array = arrayArgument(array, OREF_positional, ARG_ONE);
413  ProtectedObject p(array);
414  utf8proc_int32_t codepoint1 = (utf8proc_int32_t)integerRange(array->get(1), 0, SSIZE_MAX, Error_Invalid_argument_user_defined, "GraphemeBreak: The first codepoint must be a non negative integer");
415  utf8proc_int32_t codepoint2 = (utf8proc_int32_t)integerRange(array->get(2), 0, SSIZE_MAX, Error_Invalid_argument_user_defined, "GraphemeBreak: The second codepoint must be a non negative integer");
416  utf8proc_int32_t state = (utf8proc_int32_t)integerRange(array->get(3), 0, SSIZE_MAX, Error_Invalid_argument_user_defined, "GraphemeBreak:The state must be a non negative integer");
417  utf8proc_bool graphemeBreak = utf8proc_grapheme_break_stateful(codepoint1, codepoint2, &state);
418  array->put(new_integer(state), 3); // Output argument
419  return graphemeBreak ? TheTrueObject : TheFalseObject;
420 }
421 
423 {
424  utf8proc_int32_t codepoint = (utf8proc_int32_t)integer(rexxCodepoint, "CodepointCategory: codepoint must be an integer");
425  const utf8proc_property_t *property = utf8proc_get_property(codepoint);
426  return new_integer(property->category); // see utf8proc_category_t
427 }
428 #if 0
429 /** Unicode categories. */
430 typedef enum {
431  UTF8PROC_CATEGORY_CN = 0, /**< Other, not assigned */
432  UTF8PROC_CATEGORY_LU = 1, /**< Letter, uppercase */
433  UTF8PROC_CATEGORY_LL = 2, /**< Letter, lowercase */
434  UTF8PROC_CATEGORY_LT = 3, /**< Letter, titlecase */
435  UTF8PROC_CATEGORY_LM = 4, /**< Letter, modifier */
436  UTF8PROC_CATEGORY_LO = 5, /**< Letter, other */
437  UTF8PROC_CATEGORY_MN = 6, /**< Mark, nonspacing */
438  UTF8PROC_CATEGORY_MC = 7, /**< Mark, spacing combining */
439  UTF8PROC_CATEGORY_ME = 8, /**< Mark, enclosing */
440  UTF8PROC_CATEGORY_ND = 9, /**< Number, decimal digit */
441  UTF8PROC_CATEGORY_NL = 10, /**< Number, letter */
442  UTF8PROC_CATEGORY_NO = 11, /**< Number, other */
443  UTF8PROC_CATEGORY_PC = 12, /**< Punctuation, connector */
444  UTF8PROC_CATEGORY_PD = 13, /**< Punctuation, dash */
445  UTF8PROC_CATEGORY_PS = 14, /**< Punctuation, open */
446  UTF8PROC_CATEGORY_PE = 15, /**< Punctuation, close */
447  UTF8PROC_CATEGORY_PI = 16, /**< Punctuation, initial quote */
448  UTF8PROC_CATEGORY_PF = 17, /**< Punctuation, final quote */
449  UTF8PROC_CATEGORY_PO = 18, /**< Punctuation, other */
450  UTF8PROC_CATEGORY_SM = 19, /**< Symbol, math */
451  UTF8PROC_CATEGORY_SC = 20, /**< Symbol, currency */
452  UTF8PROC_CATEGORY_SK = 21, /**< Symbol, modifier */
453  UTF8PROC_CATEGORY_SO = 22, /**< Symbol, other */
454  UTF8PROC_CATEGORY_ZS = 23, /**< Separator, space */
455  UTF8PROC_CATEGORY_ZL = 24, /**< Separator, line */
456  UTF8PROC_CATEGORY_ZP = 25, /**< Separator, paragraph */
457  UTF8PROC_CATEGORY_CC = 26, /**< Other, control */
458  UTF8PROC_CATEGORY_CF = 27, /**< Other, format */
459  UTF8PROC_CATEGORY_CS = 28, /**< Other, surrogate */
460  UTF8PROC_CATEGORY_CO = 29, /**< Other, private use */
461 } utf8proc_category_t;
462 #endif
463 
464 
466 {
467  utf8proc_int32_t codepoint = (utf8proc_int32_t)integer(rexxCodepoint, "CodepointCombiningClass: codepoint must be an integer");
468  const utf8proc_property_t *property = utf8proc_get_property(codepoint);
469  return new_integer(property->combining_class); // see utf8proc_category_t
470 }
471 
472 
474 {
475  utf8proc_int32_t codepoint = (utf8proc_int32_t)integer(rexxCodepoint, "CodepointBidiClass: codepoint must be an integer");
476  const utf8proc_property_t *property = utf8proc_get_property(codepoint);
477  return new_integer(property->bidi_class); // see utf8proc_bidi_class_t
478 }
479 #if 0
480 /** Bidirectional character classes. */
481 typedef enum {
482  UTF8PROC_BIDI_CLASS_L = 1, /**< Left-to-Right */
483  UTF8PROC_BIDI_CLASS_LRE = 2, /**< Left-to-Right Embedding */
484  UTF8PROC_BIDI_CLASS_LRO = 3, /**< Left-to-Right Override */
485  UTF8PROC_BIDI_CLASS_R = 4, /**< Right-to-Left */
486  UTF8PROC_BIDI_CLASS_AL = 5, /**< Right-to-Left Arabic */
487  UTF8PROC_BIDI_CLASS_RLE = 6, /**< Right-to-Left Embedding */
488  UTF8PROC_BIDI_CLASS_RLO = 7, /**< Right-to-Left Override */
489  UTF8PROC_BIDI_CLASS_PDF = 8, /**< Pop Directional Format */
490  UTF8PROC_BIDI_CLASS_EN = 9, /**< European Number */
491  UTF8PROC_BIDI_CLASS_ES = 10, /**< European Separator */
492  UTF8PROC_BIDI_CLASS_ET = 11, /**< European Number Terminator */
493  UTF8PROC_BIDI_CLASS_AN = 12, /**< Arabic Number */
494  UTF8PROC_BIDI_CLASS_CS = 13, /**< Common Number Separator */
495  UTF8PROC_BIDI_CLASS_NSM = 14, /**< Nonspacing Mark */
496  UTF8PROC_BIDI_CLASS_BN = 15, /**< Boundary Neutral */
497  UTF8PROC_BIDI_CLASS_B = 16, /**< Paragraph Separator */
498  UTF8PROC_BIDI_CLASS_S = 17, /**< Segment Separator */
499  UTF8PROC_BIDI_CLASS_WS = 18, /**< Whitespace */
500  UTF8PROC_BIDI_CLASS_ON = 19, /**< Other Neutrals */
501  UTF8PROC_BIDI_CLASS_LRI = 20, /**< Left-to-Right Isolate */
502  UTF8PROC_BIDI_CLASS_RLI = 21, /**< Right-to-Left Isolate */
503  UTF8PROC_BIDI_CLASS_FSI = 22, /**< First Strong Isolate */
504  UTF8PROC_BIDI_CLASS_PDI = 23, /**< Pop Directional Isolate */
505 } utf8proc_bidi_class_t;
506 #endif
507 
508 
510 {
511  utf8proc_int32_t codepoint = (utf8proc_int32_t)integer(rexxCodepoint, "CodepointBidiMirrored: codepoint must be an integer");
512  const utf8proc_property_t *property = utf8proc_get_property(codepoint);
513  return property->bidi_mirrored ? TheTrueObject : TheFalseObject;
514 }
515 
516 
518 {
519  utf8proc_int32_t codepoint = (utf8proc_int32_t)integer(rexxCodepoint, "CodepointDecompositionType: codepoint must be an integer");
520  const utf8proc_property_t *property = utf8proc_get_property(codepoint);
521  return new_integer(property->decomp_type); // see utf8proc_decomp_type_t
522 
523  /* not returned, internal use
524  utf8proc_uint16_t decomp_seqindex;
525  utf8proc_uint16_t casefold_seqindex;
526  utf8proc_uint16_t uppercase_seqindex;
527  utf8proc_uint16_t lowercase_seqindex;
528  utf8proc_uint16_t titlecase_seqindex;
529  utf8proc_uint16_t comb_index;
530  unsigned bidi_mirrored:1;
531  unsigned comp_exclusion:1;
532  */
533 }
534 #if 0
535 /** Decomposition type. */
536 typedef enum {
537  UTF8PROC_DECOMP_TYPE_FONT = 1, /**< Font */
538  UTF8PROC_DECOMP_TYPE_NOBREAK = 2, /**< Nobreak */
539  UTF8PROC_DECOMP_TYPE_INITIAL = 3, /**< Initial */
540  UTF8PROC_DECOMP_TYPE_MEDIAL = 4, /**< Medial */
541  UTF8PROC_DECOMP_TYPE_FINAL = 5, /**< Final */
542  UTF8PROC_DECOMP_TYPE_ISOLATED = 6, /**< Isolated */
543  UTF8PROC_DECOMP_TYPE_CIRCLE = 7, /**< Circle */
544  UTF8PROC_DECOMP_TYPE_SUPER = 8, /**< Super */
545  UTF8PROC_DECOMP_TYPE_SUB = 9, /**< Sub */
546  UTF8PROC_DECOMP_TYPE_VERTICAL = 10, /**< Vertical */
547  UTF8PROC_DECOMP_TYPE_WIDE = 11, /**< Wide */
548  UTF8PROC_DECOMP_TYPE_NARROW = 12, /**< Narrow */
549  UTF8PROC_DECOMP_TYPE_SMALL = 13, /**< Small */
550  UTF8PROC_DECOMP_TYPE_SQUARE = 14, /**< Square */
551  UTF8PROC_DECOMP_TYPE_FRACTION = 15, /**< Fraction */
552  UTF8PROC_DECOMP_TYPE_COMPAT = 16, /**< Compat */
553 } utf8proc_decomp_type_t;
554 #endif
555 
556 
558 {
559  utf8proc_int32_t codepoint = (utf8proc_int32_t)integer(rexxCodepoint, "CodepointIgnorable: codepoint must be an integer");
560  const utf8proc_property_t *property = utf8proc_get_property(codepoint);
561  return property->ignorable ? TheTrueObject : TheFalseObject;
562 }
563 
564 
566 {
567  utf8proc_int32_t codepoint = (utf8proc_int32_t)integer(rexxCodepoint, "CodepointControlBoundary: codepoint must be an integer");
568  const utf8proc_property_t *property = utf8proc_get_property(codepoint);
569  return property->control_boundary ? TheTrueObject : TheFalseObject;
570 }
571 
572 
574 {
575  utf8proc_int32_t codepoint = (utf8proc_int32_t)integer(rexxCodepoint, "CodepointCharWidth: codepoint must be an integer");
576  const utf8proc_property_t *property = utf8proc_get_property(codepoint);
577  return new_integer(property->charwidth);
578 
579  /* not returned, not used?
580  unsigned pad:2;
581  */
582 }
583 
584 
586 {
587  utf8proc_int32_t codepoint = (utf8proc_int32_t)integer(rexxCodepoint, "CodepointBoundClass: codepoint must be an integer");
588  const utf8proc_property_t *property = utf8proc_get_property(codepoint);
589  return new_integer(property->boundclass); // see utf8proc_boundclass_t
590 }
591 #if 0
592 /** Boundclass property. (TR29) */
593 typedef enum {
594  UTF8PROC_BOUNDCLASS_START = 0, /**< Start */
595  UTF8PROC_BOUNDCLASS_OTHER = 1, /**< Other */
596  UTF8PROC_BOUNDCLASS_CR = 2, /**< Cr */
597  UTF8PROC_BOUNDCLASS_LF = 3, /**< Lf */
598  UTF8PROC_BOUNDCLASS_CONTROL = 4, /**< Control */
599  UTF8PROC_BOUNDCLASS_EXTEND = 5, /**< Extend */
600  UTF8PROC_BOUNDCLASS_L = 6, /**< L */
601  UTF8PROC_BOUNDCLASS_V = 7, /**< V */
602  UTF8PROC_BOUNDCLASS_T = 8, /**< T */
603  UTF8PROC_BOUNDCLASS_LV = 9, /**< Lv */
604  UTF8PROC_BOUNDCLASS_LVT = 10, /**< Lvt */
605  UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR = 11, /**< Regional indicator */
606  UTF8PROC_BOUNDCLASS_SPACINGMARK = 12, /**< Spacingmark */
607  UTF8PROC_BOUNDCLASS_PREPEND = 13, /**< Prepend */
608  UTF8PROC_BOUNDCLASS_ZWJ = 14, /**< Zero Width Joiner */
609 
610  /* the following are no longer used in Unicode 11, but we keep
611  the constants here for backward compatibility */
612  UTF8PROC_BOUNDCLASS_E_BASE = 15, /**< Emoji Base */
613  UTF8PROC_BOUNDCLASS_E_MODIFIER = 16, /**< Emoji Modifier */
614  UTF8PROC_BOUNDCLASS_GLUE_AFTER_ZWJ = 17, /**< Glue_After_ZWJ */
615  UTF8PROC_BOUNDCLASS_E_BASE_GAZ = 18, /**< E_BASE + GLUE_AFTER_ZJW */
616 
617  /* the Extended_Pictographic property is used in the Unicode 11
618  grapheme-boundary rules, so we store it in the boundclass field */
619  UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC = 19,
620  UTF8PROC_BOUNDCLASS_E_ZWG = 20, /* UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC + ZWJ */
621 } utf8proc_boundclass_t;
622 #endif
623 
624 
626 {
627  utf8proc_int32_t codepoint = (utf8proc_int32_t)integer(rexxCodepoint, "CodepointToLower: codepoint must be an integer");
628  return new_integer(utf8proc_tolower(codepoint));
629 }
630 
631 
633 {
634  utf8proc_int32_t codepoint = (utf8proc_int32_t)integer(rexxCodepoint, "CodepointToUpper: codepoint must be an integer");
635  return new_integer(utf8proc_toupper(codepoint));
636 }
637 
638 
640 {
641  utf8proc_int32_t codepoint = (utf8proc_int32_t)integer(rexxCodepoint, "CodepointToTitle: codepoint must be an integer");
642  return new_integer(utf8proc_totitle(codepoint));
643 }
644 
645 
647 {
648  utf8proc_int32_t codepoint = (utf8proc_int32_t)integer(rexxCodepoint, "CodepointIsLower: codepoint must be an integer");
649  return utf8proc_islower(codepoint) ? TheTrueObject : TheFalseObject;
650 
651 }
652 
653 
655 {
656  utf8proc_int32_t codepoint = (utf8proc_int32_t)integer(rexxCodepoint, "CodepointIsUpper: codepoint must be an integer");
657  return utf8proc_isupper(codepoint) ? TheTrueObject : TheFalseObject;
658 
659 }
660 
661 
662 // utf8proc helper
663 RexxString *normalize(RexxString *string, utf8proc_option_t options)
664 {
665  utf8proc_uint8_t *retval;
666  string = stringArgument(string, OREF_positional, ARG_ONE);
667  const utf8proc_uint8_t *str = (const utf8proc_uint8_t *)string->getStringData();
668  utf8proc_ssize_t strlength = (utf8proc_ssize_t)string->getLength();
669  utf8proc_ssize_t reslength = utf8proc_map(str, strlength, &retval, options);
670  if (reslength < 0) raiseError(reslength); // here, reslength is an error code
671  // Not so easy to optimize memory allocation...
672  // utf8proc_map allocates a buffer of 32-bit codepoints
673  // and then reuse this same buffer to convert to utf-8
674  // In the end, the buffer is reallocated to shrink it.
675  RexxString *result = new_string((const char *)retval, reslength);
676  free(retval);
677  return result;
678 }
679 
680 
681 RexxObject *Unicode::utf8proc_transform(RexxString *string, RexxObject **named_arglist, size_t named_argcount)
682 {
683  string = stringArgument(string, OREF_positional, ARG_ONE);
684 
685  // use strict named arg casefold = .false, lump= .false, nlf = 0, normalization = 0, stripCC = .false, stripIgnorable= .false, stripMark = .false, stripNA = .false
686  NamedArguments expectedNamedArguments(8); // 8 named arguments
687  expectedNamedArguments[0] = NamedArgument("CASEFOLD", TheFalseObject); // default value = .false
688  expectedNamedArguments[1] = NamedArgument("LUMP", TheFalseObject); // default value = .false
689  expectedNamedArguments[2] = NamedArgument("NLF", IntegerZero); // default value = 0 (0=none, 1=NLF2LF, 2=NLF2LS, 3=NLF2PS)
690  expectedNamedArguments[3] = NamedArgument("NORMALIZATION", IntegerZero); // default value = 0 (0=none, 1=NFC, 2=NFD, 3=NFKC, 4=NFKD)
691  expectedNamedArguments[4] = NamedArgument("STRIPCC", TheFalseObject); // default value = .false
692  expectedNamedArguments[5] = NamedArgument("STRIPIGNORABLE",TheFalseObject); // default value = .false
693  expectedNamedArguments[6] = NamedArgument("STRIPMARK", TheFalseObject); // default value = .false
694  expectedNamedArguments[7] = NamedArgument("STRIPNA", TheFalseObject); // default value = .false
695  expectedNamedArguments.match(named_arglist, named_argcount, /*strict*/ true, /*extraAllowed*/ false);
696  ssize_t casefold = integerRange(expectedNamedArguments[0].value, 0, 1, Error_Logical_value_user_defined, "Transform: value of named argument \"casefold\" must be 0 or 1");
697  ssize_t lump = integerRange(expectedNamedArguments[1].value, 0, 1, Error_Logical_value_user_defined, "Transform: value of named argument \"lump\" must be 0 or 1");
698  ssize_t nlf = integerRange(expectedNamedArguments[2].value, 0, 3, Error_Invalid_argument_user_defined, "Transform: value of named argument \"nlf\" must be 0..3");
699  ssize_t normalization = integerRange(expectedNamedArguments[3].value, 0, 4, Error_Invalid_argument_user_defined, "Transform: value of named argument \"normalization\" must be 0..4");
700  ssize_t stripCC = integerRange(expectedNamedArguments[4].value, 0, 1, Error_Logical_value_user_defined, "Transform: value of named argument \"stripCC\" must be 0 or 1");
701  ssize_t stripIgnorable= integerRange(expectedNamedArguments[5].value, 0, 1, Error_Logical_value_user_defined, "Transform: value of named argument \"stripIgnorable\" must be 0 or 1");
702  ssize_t stripMark = integerRange(expectedNamedArguments[6].value, 0, 1, Error_Logical_value_user_defined, "Transform: value of named argument \"stripMark\" must be 0 or 1");
703  ssize_t stripNA = integerRange(expectedNamedArguments[7].value, 0, 1, Error_Logical_value_user_defined, "Transform: value of named argument \"stripNA\" must be 0 or 1");
704 
705  int options = 0;
706  if (casefold) options |= UTF8PROC_CASEFOLD;
707  if (lump) options |= UTF8PROC_LUMP;
708  if (nlf == 1) options |= UTF8PROC_NLF2LF;
709  if (nlf == 2) options |= UTF8PROC_NLF2LS;
710  if (nlf == 3) options |= UTF8PROC_NLF2PS;
711  if (normalization == 1) options |= UTF8PROC_STABLE | UTF8PROC_COMPOSE; // NFC
712  if (normalization == 2) options |= UTF8PROC_STABLE | UTF8PROC_DECOMPOSE; // NFD
713  if (normalization == 3) options |= UTF8PROC_STABLE | UTF8PROC_COMPOSE | UTF8PROC_COMPAT; // NFKC
714  if (normalization == 4) options |= UTF8PROC_STABLE | UTF8PROC_DECOMPOSE | UTF8PROC_COMPAT; // NFKD
715  if (stripCC) options |= UTF8PROC_STRIPCC;
716  if (stripIgnorable) options |= UTF8PROC_IGNORE;
717  if (stripMark) options |= UTF8PROC_STRIPMARK;
718  if (stripNA) options |= UTF8PROC_STRIPNA;
719 
720  return normalize(string, utf8proc_option_t(options));
721 }
722 
723 #if 0 // options that can be passed from executor
724 
725  /** Strip "default ignorable characters" such as SOFT-HYPHEN or ZERO-WIDTH-SPACE. */
726  UTF8PROC_IGNORE = (1<<5),
727 
728  /**
729  * Indicating that NLF-sequences (LF, CRLF, CR, NEL) are representing a
730  * line break, and should be converted to the codepoint for line
731  * separation (LS).
732  */
733  // convert LF, CRLF, CR and NEL into LS
734  UTF8PROC_NLF2LS = (1<<7),
735 
736  /**
737  * Indicating that NLF-sequences are representing a paragraph break, and
738  * should be converted to the codepoint for paragraph separation
739  * (PS).
740  */
741  // convert LF, CRLF, CR and NEL into PS
742  UTF8PROC_NLF2PS = (1<<8),
743 
744  /** Indicating that the meaning of NLF-sequences is unknown. */
745  // convert LF, CRLF, CR and NEL into LF
746  UTF8PROC_NLF2LF = (UTF8PROC_NLF2LS | UTF8PROC_NLF2PS),
747 
748  /** Strips and/or converts control characters.
749  *
750  * NLF-sequences are transformed into space, except if one of the
751  * NLF2LS/PS/LF options is given. HorizontalTab (HT) and FormFeed (FF)
752  * are treated as a NLF-sequence in this case. All other control
753  * characters are simply removed.
754  */
755  UTF8PROC_STRIPCC = (1<<9),
756 
757  /**
758  * Performs unicode case folding, to be able to do a case-insensitive
759  * string comparison.
760  */
761  UTF8PROC_CASEFOLD = (1<<10),
762 
763  /** Lumps certain characters together.
764  *
765  * E.g. HYPHEN U+2010 and MINUS U+2212 to ASCII "-". See lump.md for details.
766  *
767  * If NLF2LF is set, this includes a transformation of paragraph and
768  * line separators to ASCII line-feed (LF).
769  */
770  UTF8PROC_LUMP = (1<<12),
771 
772  /** Strips all character markings.
773  *
774  * This includes non-spacing, spacing and enclosing (i.e. accents).
775  * @note This option works only with @ref UTF8PROC_COMPOSE or
776  * @ref UTF8PROC_DECOMPOSE
777  */
778  UTF8PROC_STRIPMARK = (1<<13),
779 
780  /**
781  * Strip unassigned codepoints.
782  */
783  UTF8PROC_STRIPNA = (1<<14),
784 
785 #endif
786 
787 
788 /******************************************************************************/
789 /* */
790 /* Unicode Class - uni-algo */
791 /* */
792 /******************************************************************************/
793 
794 #include <sstream>
795 #include <string>
796 
797 #include "m17n/uni-algo/include/uni_algo/version.h"
798 
799 
801 {
802  std::stringstream version;
803  version << una::version::library.major() << "." << una::version::library.minor() << "." << una::version::library.patch();
804  // Don't use version.str().c_str() directly: str() returns a temporary std::string, so the pointer from c_str() would dangle
805  // See https://stackoverflow.com/questions/1374468/stringstream-string-and-char-conversion-confusion
806  const std::string& version_str = version.str();
807  const char* version_cstr = version_str.c_str();
808  return new_string(version_cstr);
809 }
void reportException(wholenumber_t error)
#define min(a, b)
Definition: ArrayClass.cpp:82
@ T_RexxText
@ T_Unicode
RexxInteger * new_integer(wholenumber_t v)
#define OREF_NULL
Definition: RexxCore.h:61
RexxString * stringArgument(RexxObject *object, RexxString *kind, size_t position)
Definition: RexxCore.h:315
#define TheRexxTextClass
Definition: RexxCore.h:180
#define TheTrueObject
Definition: RexxCore.h:196
#define isOfClass(t, r)
Definition: RexxCore.h:224
#define TheNilObject
Definition: RexxCore.h:191
#define TheFalseObject
Definition: RexxCore.h:195
const int ARG_ONE
Definition: RexxCore.h:83
RexxArray * arrayArgument(RexxObject *object, RexxString *kind, size_t position)
Definition: RexxCore.h:395
#define IntegerZero
Definition: RexxCore.h:199
RexxInteger * REQUEST_INTEGER(RexxObject *obj)
Definition: RexxCore.h:460
#define Error_Unsupported_copy_method
#define Error_System_resources_user_defined
#define Error_Logical_value_user_defined
#define Error_Invalid_argument_user_defined
#define Error_System_service_user_defined
#define Error_Invalid_character_string_user_defined
#define Error_Unsupported_new_method
#define memory_mark(oref)
Definition: RexxMemory.hpp:450
RexxObject * new_object(size_t s)
Definition: RexxMemory.hpp:436
#define flatten_reference(oref, envel)
Definition: RexxMemory.hpp:498
#define CLASS_CREATE(name, id, className)
Definition: RexxMemory.hpp:503
#define memory_mark_general(oref)
Definition: RexxMemory.hpp:451
#define cleanUpFlatten
Definition: RexxMemory.hpp:484
#define setUpFlatten(type)
Definition: RexxMemory.hpp:478
RexxString * new_string(const char *s, stringsize_t l)
ssize_t integerRange(RexxObject *obj, ssize_t min, ssize_t max, wholenumber_t error, const char *errorMessage)
Definition: TextClass.cpp:273
bool isLittleEndian()
Definition: TextClass.cpp:301
RexxString * normalize(RexxString *string, utf8proc_option_t options)
Definition: TextClass.cpp:663
void raiseError(utf8proc_ssize_t errcode)
Definition: TextClass.cpp:374
ssize_t integer(RexxObject *obj, const char *errorMessage)
Definition: TextClass.cpp:289
static RexxActivity *volatile currentActivity
void match(RexxObject **namedArglist, size_t namedArgCount, bool strict, bool extraAllowed, size_t minimumRequired=0)
void put(RexxObject *eref, size_t pos)
Definition: ArrayClass.cpp:208
RexxObject * get(size_t pos)
Definition: ArrayClass.hpp:203
void liveGeneral(int reason)
Definition: ClassClass.cpp:87
void live(size_t)
Definition: ClassClass.cpp:67
void setBehaviour(RexxBehaviour *b)
void sendMessage(RexxString *, RexxArray *, RexxDirectory *, ProtectedObject &)
bool messageSend(RexxString *, RexxObject **, size_t, size_t, ProtectedObject &, bool processUnknown=true, bool dynamicTarget=true)
size_t getLength()
const char * getStringData()
void liveGeneral(int reason)
Definition: TextClass.cpp:217
void live(size_t)
Definition: TextClass.cpp:209
void live(size_t)
Definition: TextClass.cpp:144
RexxText * makeText()
Definition: TextClass.cpp:180
void liveGeneral(int reason)
Definition: TextClass.cpp:149
static RexxText * newText(RexxString *s)
Definition: TextClass.cpp:61
RexxText * textValue()
Definition: TextClass.cpp:195
static RexxTextClass * classInstance
Definition: TextClass.hpp:97
void flatten(RexxEnvelope *)
Definition: TextClass.cpp:154
RexxString * makeString()
Definition: TextClass.cpp:174
RexxText * primitiveMakeText()
Definition: TextClass.cpp:168
static void createInstance()
Definition: TextClass.cpp:56
RexxString * primitiveMakeString()
Definition: TextClass.cpp:161
static RexxText * nullText
Definition: TextClass.hpp:99
RexxObject * newRexx(RexxObject **, size_t, size_t)
Definition: TextClass.cpp:126
RexxObject * copyRexx()
Definition: TextClass.cpp:330
RexxObject * utf8proc_transform(RexxString *str, RexxObject **named_arglist, size_t named_argcount)
Definition: TextClass.cpp:681
RexxInteger * utf8proc_graphemeBreak(RexxArray *)
Definition: TextClass.cpp:410
static RexxClass * classInstance
Definition: TextClass.hpp:146
RexxInteger * utf8proc_codepointCharWidth(RexxObject *rexxCodepoint)
Definition: TextClass.cpp:573
RexxInteger * utf8proc_codepointCategory(RexxObject *rexxCodepoint)
Definition: TextClass.cpp:422
RexxInteger * utf8proc_codepointBidiClass(RexxObject *rexxCodepoint)
Definition: TextClass.cpp:473
RexxInteger * utf8proc_codepointToTitleSimple(RexxObject *rexxCodepoint)
Definition: TextClass.cpp:639
static void createInstance()
Definition: TextClass.cpp:318
RexxString * utf8proc_version()
Definition: TextClass.cpp:393
RexxInteger * utf8proc_codepointControlBoundary(RexxObject *rexxCodepoint)
Definition: TextClass.cpp:565
void live(size_t)
Definition: TextClass.cpp:342
RexxInteger * utf8proc_codepointToUpperSimple(RexxObject *rexxCodepoint)
Definition: TextClass.cpp:632
RexxInteger * utf8proc_codepointIsLower(RexxObject *rexxCodepoint)
Definition: TextClass.cpp:646
RexxString * unialgo_version()
Definition: TextClass.cpp:800
RexxInteger * utf8proc_codepointToLowerSimple(RexxObject *rexxCodepoint)
Definition: TextClass.cpp:625
RexxInteger * utf8proc_codepointDecompositionType(RexxObject *rexxCodepoint)
Definition: TextClass.cpp:517
RexxInteger * systemIsLittleEndian()
Definition: TextClass.cpp:359
void liveGeneral(int reason)
Definition: TextClass.cpp:347
RexxInteger * utf8proc_codepointBoundClass(RexxObject *rexxCodepoint)
Definition: TextClass.cpp:585
void flatten(RexxEnvelope *)
Definition: TextClass.cpp:352
RexxInteger * utf8proc_codepointIsUpper(RexxObject *rexxCodepoint)
Definition: TextClass.cpp:654
RexxInteger * utf8proc_codepointIgnorable(RexxObject *rexxCodepoint)
Definition: TextClass.cpp:557
RexxInteger * utf8proc_codepointBidiMirrored(RexxObject *rexxCodepoint)
Definition: TextClass.cpp:509
RexxObject * newRexx(RexxObject **, size_t, size_t)
Definition: TextClass.cpp:323
RexxInteger * utf8proc_codepointCombiningClass(RexxObject *rexxCodepoint)
Definition: TextClass.cpp:465
ssize_t wholenumber_t
Definition: rexx.h:230
#define SSIZE_MAX
signed __int64 int64_t
SSIZE_T ssize_t
char int8_t