loadPackage OK for extension/stringChunk.cls loadPackage OK for utilities/indentedStream.cls loadPackage OK for extension/extensions.cls loadLibrary OK for rxunixsys loadPackage OK for ncurses.cls loadPackage OK for csvStream.cls loadLibrary OK for hostemu loadPackage OK for json.cls loadPackage OK for mime.cls loadPackage OK for rxftp.cls loadLibrary OK for rxmath loadPackage OK for rxregexp.cls loadPackage OK for regex/regex.cls loadPackage OK for smtp.cls loadPackage OK for socket.cls loadPackage OK for streamsocket.cls loadPackage OK for pipeline/pipe.cls loadPackage OK for rgf_util2/rgf_util2.rex loadPackage OK for BSF.CLS loadPackage OK for oorexxshell_queries.cls loadPackage OK for pipeline/pipe_extension.cls loadPackage OK for rgf_util2/rgf_util2_wrappers.rex REXX-ooRexx_4.3.0(MT)_64-bit 6.04 22 Jun 2024 Input queue name: Sacc7Q600001c5ebe0 -------------------- -- Unicode libraries -------------------- /* The prototype is currently using 3 libraries: - utf8proc https://github.com/JuliaStrings/utf8proc - uni-algo https://github.com/uni-algo/uni-algo Current usage: - utf8proc is used for grapheme segmentation, character properties, normalization, simple upper/lower/title. - uni-algo is used for full upper/lower/title. */ --------------------- -- Unicode characters --------------------- -- Unicode version ooRexx[bash]> .unicode~version= -- 15.1.0 '15.1.0' -- Unicode character names are not loaded by default ooRexx[bash]> call loadUnicodeCharacterNames Load the Unicode character names 15.1.0 ............................................ Total loaded character names: 149813 Total character name aliases: 473 Unicode character intervals not expanded, execute: call expandUnicodeCharacterIntervals ooRexx[bash]> .unicode~characters= (an UnicodeCharacterSupplier count=44185 size=918000) /* Unicode character names defined by interval are not loaded by default. The following method gives information about these intervals. 
*/ ooRexx[bash]> .Unicode~informations= a Directory (13 items) 'characterIntervals' : (an UnicodeCharacterIntervalSupplier count=17 notExpanded:17 intervals, 105693 characters) 'characters' : (an UnicodeCharacterSupplier count=44185 size=918000) 'dataDirectory' : (/local/rexx/oorexx/executor/sandbox/jlf/packages/encoding/Unicode-15.1.0) 'maxCodepoint' : 1114111 'memorizeTranscodings' : 0 'memorizeTransformations' : 0 'systemIsLittleEndian' : 1 'totalCharacterNameAliases' : 473 'totalCharactersLoaded' : 149813 'totalIntervalCharacters' : 105693 'totalIntervalCharactersNotExpanded' : 105693 'unckeckedConversionToString' : 1 'version' : '15.1.0' -- Select the characters whose category is Cc (Control) ooRexx[bash]> .unicode~characters~select{item~categoryName=="Cc"}== an Array (shape [65], 65 items) 1 : ( "" U+0000 Cc 0 "", "NULL", "NUL" ) 2 : ( "" U+0001 Cc 0 "", "START OF HEADING", "SOH" ) 3 : ( "" U+0002 Cc 0 "", "START OF TEXT", "STX" ) 4 : ( "" U+0003 Cc 0 "", "END OF TEXT", "ETX" ) 5 : ( "" U+0004 Cc 0 "", "END OF TRANSMISSION", "EOT" ) 6 : ( "" U+0005 Cc 0 "", "ENQUIRY", "ENQ" ) 7 : ( "" U+0006 Cc 0 "", "ACKNOWLEDGE", "ACK" ) 8 : ( "" U+0007 Cc 0 "", "ALERT", "BEL" ) 9 : ( "" U+0008 Cc 0 "", "BACKSPACE", "BS" ) 10 : ( "" U+0009 Cc 0 "", "CHARACTER TABULATION", "HORIZONTAL TABULATION", "HT", "TAB" ) 11 : ( "" U+000A Cc 0 "", "LINE FEED", "NEW LINE", "END OF LINE", "LF", "NL", "EOL" ) 12 : ( "" U+000B Cc 0 "", "LINE TABULATION", "VERTICAL TABULATION", "VT" ) 13 : ( "" U+000C Cc 0 "", "FORM FEED", "FF" ) 14 : ( "" U+000D Cc 0 "", "CARRIAGE RETURN", "CR" ) 15 : ( "" U+000E Cc 0 "", "SHIFT OUT", "LOCKING-SHIFT ONE", "SO" ) 16 : ( "" U+000F Cc 0 "", "SHIFT IN", "LOCKING-SHIFT ZERO", "SI" ) 17 : ( "" U+0010 Cc 0 "", "DATA LINK ESCAPE", "DLE" ) 18 : ( "" U+0011 Cc 0 "", "DEVICE CONTROL ONE", "DC1" ) 19 : ( "" U+0012 Cc 0 "", "DEVICE CONTROL TWO", "DC2" ) 20 : ( "" U+0013 Cc 0 "", "DEVICE CONTROL THREE", "DC3" ) 21 : ( "" U+0014 Cc 0 "", "DEVICE CONTROL FOUR", "DC4" ) 22 
: ( "" U+0015 Cc 0 "", "NEGATIVE ACKNOWLEDGE", "NAK" ) 23 : ( "" U+0016 Cc 0 "", "SYNCHRONOUS IDLE", "SYN" ) 24 : ( "" U+0017 Cc 0 "", "END OF TRANSMISSION BLOCK", "ETB" ) 25 : ( "" U+0018 Cc 0 "", "CANCEL", "CAN" ) 26 : ( "" U+0019 Cc 0 "", "END OF MEDIUM", "EOM", "EM" ) 27 : ( "" U+001A Cc 0 "", "SUBSTITUTE", "SUB" ) 28 : ( "" U+001B Cc 0 "", "ESCAPE", "ESC" ) 29 : ( "" U+001C Cc 0 "", "INFORMATION SEPARATOR FOUR", "FILE SEPARATOR", "FS" ) 30 : ( "" U+001D Cc 0 "", "INFORMATION SEPARATOR THREE", "GROUP SEPARATOR", "GS" ) 31 : ( "" U+001E Cc 0 "", "INFORMATION SEPARATOR TWO", "RECORD SEPARATOR", "RS" ) 32 : ( "" U+001F Cc 0 "", "INFORMATION SEPARATOR ONE", "UNIT SEPARATOR", "US" ) 33 : ( "" U+007F Cc 0 "", "DELETE", "DEL" ) 34 : ( "" U+0080 Cc 0 "", "PADDING CHARACTER", "PAD" ) 35 : ( "" U+0081 Cc 0 "", "HIGH OCTET PRESET", "HOP" ) 36 : ( "" U+0082 Cc 0 "", "BREAK PERMITTED HERE", "BPH" ) 37 : ( "" U+0083 Cc 0 "", "NO BREAK HERE", "NBH" ) 38 : ( "" U+0084 Cc 0 "", "INDEX", "IND" ) 39 : ( " " U+0085 Cc 0 "", "NEXT LINE", "NEL" ) 40 : ( "" U+0086 Cc 0 "", "START OF SELECTED AREA", "SSA" ) 41 : ( "" U+0087 Cc 0 "", "END OF SELECTED AREA", "ESA" ) 42 : ( "" U+0088 Cc 0 "", "CHARACTER TABULATION SET", "HORIZONTAL TABULATION SET", "HTS" ) 43 : ( "" U+0089 Cc 0 "", "CHARACTER TABULATION WITH JUSTIFICATION", "HORIZONTAL TABULATION WITH JUSTIFICATION", "HTJ" ) 44 : ( "" U+008A Cc 0 "", "LINE TABULATION SET", "VERTICAL TABULATION SET", "VTS" ) 45 : ( "" U+008B Cc 0 "", "PARTIAL LINE FORWARD", "PARTIAL LINE DOWN", "PLD" ) 46 : ( "" U+008C Cc 0 "", "PARTIAL LINE BACKWARD", "PARTIAL LINE UP", "PLU" ) 47 : ( "" U+008D Cc 0 "", "REVERSE LINE FEED", "REVERSE INDEX", "RI" ) 48 : ( "" U+008E Cc 0 "", "SINGLE SHIFT TWO", "SINGLE-SHIFT-2", "SS2" ) 49 : ( "" U+008F Cc 0 "", "SINGLE SHIFT THREE", "SINGLE-SHIFT-3", "SS3" ) 50 : ( "" U+0090 Cc 0 "", "DEVICE CONTROL STRING", "DCS" ) 51 : ( "" U+0091 Cc 0 "", "PRIVATE USE ONE", "PRIVATE USE-1", "PU1" ) 52 : ( "" U+0092 Cc 0 "", "PRIVATE 
USE TWO", "PRIVATE USE-2", "PU2" ) 53 : ( "" U+0093 Cc 0 "", "SET TRANSMIT STATE", "STS" ) 54 : ( "" U+0094 Cc 0 "", "CANCEL CHARACTER", "CCH" ) 55 : ( "" U+0095 Cc 0 "", "MESSAGE WAITING", "MW" ) 56 : ( "" U+0096 Cc 0 "", "START OF GUARDED AREA", "START OF PROTECTED AREA", "SPA" ) 57 : ( "" U+0097 Cc 0 "", "END OF GUARDED AREA", "END OF PROTECTED AREA", "EPA" ) 58 : ( "" U+0098 Cc 0 "", "START OF STRING", "SOS" ) 59 : ( "" U+0099 Cc 0 "", "SINGLE GRAPHIC CHARACTER INTRODUCER", "SGC" ) 60 : ( "" U+009A Cc 0 "", "SINGLE CHARACTER INTRODUCER", "SCI" ) 61 : ( "" U+009B Cc 0 "", "CONTROL SEQUENCE INTRODUCER", "CSI" ) 62 : ( "" U+009C Cc 0 "", "STRING TERMINATOR", "ST" ) 63 : ( "" U+009D Cc 0 "", "OPERATING SYSTEM COMMAND", "OSC" ) 64 : ( "" U+009E Cc 0 "", "PRIVACY MESSAGE", "PM" ) 65 : ( "" U+009F Cc 0 "", "APPLICATION PROGRAM COMMAND", "APC" ) -- Select the characters whose name contains "rex" -- Loose matching rule UAX44-LM2, see https://unicode.org/reports/tr44/#Matching_Names -- Spaces are ignored (among others), that's why "LETTER EXTRA" is matching "*rex*" ooRexx[bash]> .unicode~characters("*rex*")== an Array (shape [15], 15 items) 1 : ( "꜌" U+A70C Sk 1 "MODIFIER LETTER EXTRA-LOW DOTTED TONE BAR" ) 2 : ( "˩" U+02E9 Sk 1 "MODIFIER LETTER EXTRA-LOW TONE BAR" ) 3 : ( "꜍" U+A70D Sk 1 "MODIFIER LETTER EXTRA-HIGH DOTTED LEFT-STEM TONE BAR" ) 4 : ( "꜑" U+A711 Sk 1 "MODIFIER LETTER EXTRA-LOW DOTTED LEFT-STEM TONE BAR" ) 5 : ( "˥" U+02E5 Sk 1 "MODIFIER LETTER EXTRA-HIGH TONE BAR" ) 6 : ( "꜈" U+A708 Sk 1 "MODIFIER LETTER EXTRA-HIGH DOTTED TONE BAR" ) 7 : ( "🖕" U+1F595 So 2 "REVERSED HAND WITH MIDDLE FINGER EXTENDED" ) 8 : ( "ꎅ" U+A385 Lo 2 "YI SYLLABLE RREX" ) 9 : ( "꜒" U+A712 Sk 1 "MODIFIER LETTER EXTRA-HIGH LEFT-STEM TONE BAR" ) 10 : ( "ꏑ" U+A3D1 Lo 2 "YI SYLLABLE REX" ) 11 : ( "ꎜ" U+A39C Lo 2 "YI SYLLABLE NREX" ) 12 : ( "꜖" U+A716 Sk 1 "MODIFIER LETTER EXTRA-LOW LEFT-STEM TONE BAR" ) 13 : ( "𖩿" U+16A7F Lo 1 "TANGSA LETTER EX" ) 14 : ( "𝍊" U+1D34A So 1 "TETRAGRAM FOR 
EXHAUSTION" ) 15 : ( "🦖" U+1F996 So 2 "T-REX" ) /* Regular expressions are supported: .unicode~characters("/^math.*psi$")== returns all the characters whose name starts with "math" and ends with "psi". */ ooRexx[bash]> .unicode~characters("/^math.*psi$")== an Array (shape [10], 10 items) 1 : ( "𝚿" U+1D6BF Lu 1 "MATHEMATICAL BOLD CAPITAL PSI" ) 2 : ( "𝛙" U+1D6D9 Ll 1 "MATHEMATICAL BOLD SMALL PSI" ) 3 : ( "𝛹" U+1D6F9 Lu 1 "MATHEMATICAL ITALIC CAPITAL PSI" ) 4 : ( "𝜓" U+1D713 Ll 1 "MATHEMATICAL ITALIC SMALL PSI" ) 5 : ( "𝜳" U+1D733 Lu 1 "MATHEMATICAL BOLD ITALIC CAPITAL PSI" ) 6 : ( "𝝍" U+1D74D Ll 1 "MATHEMATICAL BOLD ITALIC SMALL PSI" ) 7 : ( "𝝭" U+1D76D Lu 1 "MATHEMATICAL SANS-SERIF BOLD CAPITAL PSI" ) 8 : ( "𝞇" U+1D787 Ll 1 "MATHEMATICAL SANS-SERIF BOLD SMALL PSI" ) 9 : ( "𝞧" U+1D7A7 Lu 1 "MATHEMATICAL SANS-SERIF BOLD ITALIC CAPITAL PSI" ) 10 : ( "𝟁" U+1D7C1 Ll 1 "MATHEMATICAL SANS-SERIF BOLD ITALIC SMALL PSI" ) -- longest name ooRexx[bash]> .unicode~characters~reduce{if accu~name~length < item~name~length then item }= ( "🮨" U+1FBA8 So 1 "BOX DRAWINGS LIGHT DIAGONAL UPPER CENTRE TO MIDDLE LEFT AND MIDDLE RIGHT TO LOWER CENTRE" ) ooRexx[bash]> result~name~length= 88 /* Escape characters can be used in string literals, they are unescaped at run-time. Note: should be supported at parse-time, it's just that this prototype doesn't modify the lexer, for the moment. \N{Unicode name} Character name in the Unicode database or codepoint Label \u{Unicode name} Character name in the Unicode database or codepoint Label \U{Unicode name} same as \u{Unicode name} \u{X..X} Unicode character denoted by 1-8 hex digits. 
The first character must be a digit 0..9 \U{X..X} same as \u{X..X} \uXXXX Unicode character denoted by 4 hex digits ('u' lowercase) \UXXXXXXXX Unicode character denoted by 8 hex digits ('U' uppercase) \xXX 1 byte denoted by 2 hex digits ('x' lowercase) \XXXXX 2 bytes denoted by 4 hex digits ('X' uppercase) */ -- Character by name ooRexx[bash]> "hello\u{space}John\n"~text~unescape= -- T'hello John[0A]' T'hello John[0A]' ooRexx[bash]> "The \u{t-rex} shows his \u{flexed biceps}!"~text~unescape= -- T'The 🦖 shows his 💪!' T'The 🦖 shows his 💪!' -- Character by codepoint ooRexx[bash]> "hello\u{20}John\n"~text~unescape= -- T'hello John[0A]' T'hello John[0A]' ooRexx[bash]> "hello\u0020John\n"~text~unescape= -- T'hello John[0A]' T'hello John[0A]' ooRexx[bash]> "hello\U00000020John\n"~text~unescape= -- T'hello John[0A]' T'hello John[0A]' -- Name versus codepoint ooRexx[bash]> "\u{bed} is different from \u{0bed}"~text~unescape= -- T'🛏 is different from ௭' T'🛏 is different from ௭' ooRexx[bash]> .unicode~character("bed")= -- ( "🛏" U+1F6CF So 1 "BED" ) ( "🛏" U+1F6CF So 1 "BED" ) ooRexx[bash]> .unicode~character("bed", hexadecimal:)= -- ( "௭" U+0BED Nd 1 "TAMIL DIGIT SEVEN" ) ( "௭" U+0BED Nd 1 "TAMIL DIGIT SEVEN" ) ooRexx[bash]> .unicode~character("U+0bed")= -- ( "௭" U+0BED Nd 1 "TAMIL DIGIT SEVEN" ) ( "௭" U+0BED Nd 1 "TAMIL DIGIT SEVEN" ) -- High surrogate followed by low surrogate is invalid UTF-8 ooRexx[bash]> "\uD83D\uDE3F"~text~unescape= -- T'??????' T'������' -- High surrogate followed by low surrogate is valid WTF-8 ooRexx[bash]> "\uD83D\uDE3F"~text("wtf8")~unescape= -- T'😿' T'😿' -- \u is not supported for Byte encoding, you can use \x{X..X} ooRexx[bash]> "hello\u{20}John\n"~text("byte")~unescape= -- Byte encoding: \u not supported. Byte encoding: \u not supported. Error code= 23.900 ooRexx[bash]> "hello\x{20}John\n"~text("byte")~unescape= -- T'hello John[0A]' T'hello John[0A]' -- The method unescape is available only for Byte, UTF-8 and WTF-8. 
-- No implementation for UTF-16, WTF-16, UTF-32. ooRexx[bash]> "hello\u{U+20}John\n"~text~utf16~unescape= -- Method UNESCAPE is ABSTRACT and cannot be directly invoked. Method UNESCAPE is ABSTRACT and cannot be directly invoked. Error code= 93.965 -------------------------------- -- Unicode characters properties -------------------------------- -- Category -- http://www.unicode.org/reports/tr44/#General_Category_Values ooRexx[bash]> .unicode~codepointCategoryNames= ['Cn','Lu','Ll','Lt','Lm','Lo','Mn','Mc','Me','Nd','Nl','No','Pc','Pd','Ps','Pe','Pi','Pf','Po','Sm','Sc','Sk','So','Zs','Zl','Zp','Cc','Cf','Cs','Co'] -- First character of each category ooRexx[bash]> seen = .directory~new; .unicode~characters~each{expose seen; v = item~categoryName; if seen[v] == .nil then seen[v] = item}; seen= a Directory (28 items) 'Cc' : ( "" U+0000 Cc 0 "", "NULL", "NUL" ) 'Cf' : ( "" U+00AD Cf 1 "SOFT HYPHEN", "SHY" ) 'Cs' : ( "���" U+D83D Cs 0 "" ) 'Ll' : ( "a" U+0061 Ll 1 "LATIN SMALL LETTER A" ) 'Lm' : ( "ʰ" U+02B0 Lm 1 "MODIFIER LETTER SMALL H" ) 'Lo' : ( "ª" U+00AA Lo 1 "FEMININE ORDINAL INDICATOR" ) 'Lt' : ( "Dž" U+01C5 Lt 1 "LATIN CAPITAL LETTER D WITH SMALL LETTER Z WITH CARON" ) 'Lu' : ( "A" U+0041 Lu 1 "LATIN CAPITAL LETTER A" ) 'Mc' : ( "ः" U+0903 Mc 0 "DEVANAGARI SIGN VISARGA" ) 'Me' : ( "҈" U+0488 Me 0 "COMBINING CYRILLIC HUNDRED THOUSANDS SIGN" ) 'Mn' : ( "̀" U+0300 Mn 0 "COMBINING GRAVE ACCENT" ) 'Nd' : ( "0" U+0030 Nd 1 "DIGIT ZERO" ) 'Nl' : ( "ᛮ" U+16EE Nl 1 "RUNIC ARLAUG SYMBOL" ) 'No' : ( "²" U+00B2 No 1 "SUPERSCRIPT TWO" ) 'Pc' : ( "_" U+005F Pc 1 "LOW LINE" ) 'Pd' : ( "-" U+002D Pd 1 "HYPHEN-MINUS" ) 'Pe' : ( ")" U+0029 Pe 1 "RIGHT PARENTHESIS" ) 'Pf' : ( "»" U+00BB Pf 1 "RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK" ) 'Pi' : ( "«" U+00AB Pi 1 "LEFT-POINTING DOUBLE ANGLE QUOTATION MARK" ) 'Po' : ( "!" 
U+0021 Po 1 "EXCLAMATION MARK" ) 'Ps' : ( "(" U+0028 Ps 1 "LEFT PARENTHESIS" ) 'Sc' : ( "$" U+0024 Sc 1 "DOLLAR SIGN" ) 'Sk' : ( "^" U+005E Sk 1 "CIRCUMFLEX ACCENT" ) 'Sm' : ( "+" U+002B Sm 1 "PLUS SIGN" ) 'So' : ( "¦" U+00A6 So 1 "BROKEN BAR" ) 'Zl' : ( " " U+2028 Zl 0 "LINE SEPARATOR" ) 'Zp' : ( " " U+2029 Zp 0 "PARAGRAPH SEPARATOR" ) 'Zs' : ( " " U+0020 Zs 1 "SPACE", "SP" ) -- Combining class -- http://www.unicode.org/reports/tr44/#Canonical_Combining_Class_Values -- Canonical combining classes are defined in the Unicode Standard as integers in the range 0...254. -- For convenience, the standard assigns symbolic names to a subset of these combining classes. -- First character of each combining class ooRexx[bash]> seen = .directory~new; .unicode~characters~each{expose seen; v = item~combiningClass; if seen[cv] == .nil then seen[v] = item}; seen= a Directory (56 items) 0 : ( "󠇯" U+E01EF Mn 0 "VARIATION SELECTOR-256", "VS256" ) 1 : ( "𝅩" U+1D169 Mn 0 "MUSICAL SYMBOL COMBINING TREMOLO-3" ) 6 : ( "𖿱" U+16FF1 Mc 2 "VIETNAMESE ALTERNATE READING MARK NHAY" ) 7 : ( "𞥊" U+1E94A Mn 0 "ADLAM NUKTA" ) 8 : ( "゚" U+309A Mn 0 "COMBINING KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK" ) 9 : ( "𑽂" U+11F42 Mn 0 "KAWI CONJOINER" ) 10 : ( "ְ" U+05B0 Mn 0 "HEBREW POINT SHEVA" ) 11 : ( "ֱ" U+05B1 Mn 0 "HEBREW POINT HATAF SEGOL" ) 12 : ( "ֲ" U+05B2 Mn 0 "HEBREW POINT HATAF PATAH" ) 13 : ( "ֳ" U+05B3 Mn 0 "HEBREW POINT HATAF QAMATS" ) 14 : ( "ִ" U+05B4 Mn 0 "HEBREW POINT HIRIQ" ) 15 : ( "ֵ" U+05B5 Mn 0 "HEBREW POINT TSERE" ) 16 : ( "ֶ" U+05B6 Mn 0 "HEBREW POINT SEGOL" ) 17 : ( "ַ" U+05B7 Mn 0 "HEBREW POINT PATAH" ) 18 : ( "ׇ" U+05C7 Mn 0 "HEBREW POINT QAMATS QATAN" ) 19 : ( "ֺ" U+05BA Mn 0 "HEBREW POINT HOLAM HASER FOR VAV" ) 20 : ( "ֻ" U+05BB Mn 0 "HEBREW POINT QUBUTS" ) 21 : ( "ּ" U+05BC Mn 0 "HEBREW POINT DAGESH OR MAPIQ" ) 22 : ( "ֽ" U+05BD Mn 0 "HEBREW POINT METEG" ) 23 : ( "ֿ" U+05BF Mn 0 "HEBREW POINT RAFE" ) 24 : ( "ׁ" U+05C1 Mn 0 "HEBREW POINT SHIN DOT" ) 25 : ( "ׂ" U+05C2 Mn 0 
"HEBREW POINT SIN DOT" ) 26 : ( "ﬞ" U+FB1E Mn 0 "HEBREW POINT JUDEO-SPANISH VARIKA" ) 27 : ( "ࣰ" U+08F0 Mn 0 "ARABIC OPEN FATHATAN" ) 28 : ( "ࣱ" U+08F1 Mn 0 "ARABIC OPEN DAMMATAN" ) 29 : ( "ࣲ" U+08F2 Mn 0 "ARABIC OPEN KASRATAN" ) 30 : ( "َ" U+064E Mn 0 "ARABIC FATHA" ) 31 : ( "ُ" U+064F Mn 0 "ARABIC DAMMA" ) 32 : ( "ِ" U+0650 Mn 0 "ARABIC KASRA" ) 33 : ( "ّ" U+0651 Mn 0 "ARABIC SHADDA" ) 34 : ( "ْ" U+0652 Mn 0 "ARABIC SUKUN" ) 35 : ( "ٰ" U+0670 Mn 0 "ARABIC LETTER SUPERSCRIPT ALEF" ) 36 : ( "ܑ" U+0711 Mn 0 "SYRIAC LETTER SUPERSCRIPT ALAPH" ) 84 : ( "ౕ" U+0C55 Mn 0 "TELUGU LENGTH MARK" ) 91 : ( "ౖ" U+0C56 Mn 0 "TELUGU AI LENGTH MARK" ) 103 : ( "ู" U+0E39 Mn 0 "THAI CHARACTER SARA UU" ) 107 : ( "๋" U+0E4B Mn 0 "THAI CHARACTER MAI CHATTAWA" ) 118 : ( "ູ" U+0EB9 Mn 0 "LAO VOWEL SIGN UU" ) 122 : ( "໋" U+0ECB Mn 0 "LAO TONE MAI CATAWA" ) 129 : ( "ཱ" U+0F71 Mn 0 "TIBETAN VOWEL SIGN AA" ) 130 : ( "ྀ" U+0F80 Mn 0 "TIBETAN VOWEL SIGN REVERSED I" ) 132 : ( "ུ" U+0F74 Mn 0 "TIBETAN VOWEL SIGN U" ) 202 : ( "᷐" U+1DD0 Mn 0 "COMBINING IS BELOW" ) 214 : ( "᷎" U+1DCE Mn 0 "COMBINING OGONEK ABOVE" ) 216 : ( "𝅲" U+1D172 Mc 0 "MUSICAL SYMBOL COMBINING FLAG-5" ) 218 : ( "〪" U+302A Mn 0 "IDEOGRAPHIC LEVEL TONE MARK" ) 220 : ( "𞣖" U+1E8D6 Mn 0 "MENDE KIKAKUI COMBINING NUMBER MILLIONS" ) 222 : ( "〭" U+302D Mn 0 "IDEOGRAPHIC ENTERING TONE MARK" ) 224 : ( "〯" U+302F Mc 2 "HANGUL DOUBLE DOT TONE MARK" ) 226 : ( "𝅭" U+1D16D Mc 0 "MUSICAL SYMBOL COMBINING AUGMENTATION DOT" ) 228 : ( "〫" U+302B Mn 0 "IDEOGRAPHIC RISING TONE MARK" ) 230 : ( "𞥉" U+1E949 Mn 0 "ADLAM GEMINATE CONSONANT MODIFIER" ) 232 : ( "𞓭" U+1E4ED Mn 0 "NAG MUNDARI SIGN TOYOR" ) 233 : ( "᷼" U+1DFC Mn 0 "COMBINING DOUBLE INVERTED BREVE BELOW" ) 234 : ( "᷍" U+1DCD Mn 0 "COMBINING DOUBLE CIRCUMFLEX ABOVE" ) 240 : ( "ͅ" U+0345 Mn 0 "COMBINING GREEK YPOGEGRAMMENI" ) -- Bidirectionnal class -- http://www.unicode.org/reports/tr44/#Bidi_Class_Values ooRexx[bash]> .unicode~codepointBidiClassNames= 
['L','LRE','LRO','R','AL','RLE','RLO','PDF','EN','ES','ET','AN','CS','NSM','BN','B','S','WS','ON','LRI','RLI','FSI','PDI'] -- First character of each bidirectionnal class ooRexx[bash]> seen = .directory~new; .unicode~characters~each{expose seen; v = item~bidiClassName; if seen[v] == .nil then seen[v] = item}; seen= a Directory (23 items) 'AL' : ( "؈" U+0608 Sm 1 "ARABIC RAY" ) 'AN' : ( "" U+0600 Cf 0 "ARABIC NUMBER SIGN" ) 'B' : ( "" U+000A Cc 0 "", "LINE FEED", "NEW LINE", "END OF LINE", "LF", "NL", "EOL" ) 'BN' : ( "" U+0000 Cc 0 "", "NULL", "NUL" ) 'CS' : ( "," U+002C Po 1 "COMMA" ) 'EN' : ( "0" U+0030 Nd 1 "DIGIT ZERO" ) 'ES' : ( "+" U+002B Sm 1 "PLUS SIGN" ) 'ET' : ( "#" U+0023 Po 1 "NUMBER SIGN" ) 'FSI' : ( "" U+2068 Cf 0 "FIRST STRONG ISOLATE", "FSI" ) 'L' : ( "A" U+0041 Lu 1 "LATIN CAPITAL LETTER A" ) 'LRE' : ( "" U+202A Cf 0 "LEFT-TO-RIGHT EMBEDDING", "LRE" ) 'LRI' : ( "" U+2066 Cf 0 "LEFT-TO-RIGHT ISOLATE", "LRI" ) 'LRO' : ( "" U+202D Cf 0 "LEFT-TO-RIGHT OVERRIDE", "LRO" ) 'NSM' : ( "̀" U+0300 Mn 0 "COMBINING GRAVE ACCENT" ) 'ON' : ( "!" 
U+0021 Po 1 "EXCLAMATION MARK" ) 'PDF' : ( "" U+202C Cf 0 "POP DIRECTIONAL FORMATTING", "PDF" ) 'PDI' : ( "" U+2069 Cf 0 "POP DIRECTIONAL ISOLATE", "PDI" ) 'R' : ( "־" U+05BE Pd 1 "HEBREW PUNCTUATION MAQAF" ) 'RLE' : ( "" U+202B Cf 0 "RIGHT-TO-LEFT EMBEDDING", "RLE" ) 'RLI' : ( "" U+2067 Cf 0 "RIGHT-TO-LEFT ISOLATE", "RLI" ) 'RLO' : ( "" U+202E Cf 0 "RIGHT-TO-LEFT OVERRIDE", "RLO" ) 'S' : ( "" U+0009 Cc 0 "", "CHARACTER TABULATION", "HORIZONTAL TABULATION", "HT", "TAB" ) 'WS' : ( "" U+000C Cc 0 "", "FORM FEED", "FF" ) -- Bidi mirrored (boolean) -- https://unicode.org/reports/tr9/ -- First 10 characters such as bidiMirrored == .true ooRexx[bash]> .unicode~characters~pipe(.select {item~bidiMirrored} | .take 10 | .console) 40 : ( "(" U+0028 Ps 1 "LEFT PARENTHESIS" ) 41 : ( ")" U+0029 Pe 1 "RIGHT PARENTHESIS" ) 60 : ( "<" U+003C Sm 1 "LESS-THAN SIGN" ) 62 : ( ">" U+003E Sm 1 "GREATER-THAN SIGN" ) 91 : ( "[" U+005B Ps 1 "LEFT SQUARE BRACKET" ) 93 : ( "]" U+005D Pe 1 "RIGHT SQUARE BRACKET" ) 123 : ( "{" U+007B Ps 1 "LEFT CURLY BRACKET" ) 125 : ( "}" U+007D Pe 1 "RIGHT CURLY BRACKET" ) 171 : ( "«" U+00AB Pi 1 "LEFT-POINTING DOUBLE ANGLE QUOTATION MARK" ) 187 : ( "»" U+00BB Pf 1 "RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK" ) -- Decomposition type -- https://unicode.org/reports/tr15/ ooRexx[bash]> .unicode~codepointDecompositionTypeNames= ['None','Font','Nobreak','Initial','Medial','Final','Isolated','Circle','Super','Sub','Vertical','Wide','Narrow','Small','Square','Fraction','Compat'] -- First character of each decomposition type ooRexx[bash]> seen = .directory~new; .unicode~characters~each{expose seen; v = item~decompositionTypeName; if seen[v] == .nil then seen[v] = item}; seen= a Directory (17 items) 'Circle' : ( "①" U+2460 No 1 "CIRCLED DIGIT ONE" ) 'Compat' : ( "¨" U+00A8 Sk 1 "DIAERESIS" ) 'Final' : ( "ﭑ" U+FB51 Lo 1 "ARABIC LETTER ALEF WASLA FINAL FORM" ) 'Font' : ( "ℂ" U+2102 Lu 1 "DOUBLE-STRUCK CAPITAL C" ) 'Fraction' : ( "¼" U+00BC No 1 "VULGAR FRACTION ONE 
QUARTER" ) 'Initial' : ( "ﭔ" U+FB54 Lo 1 "ARABIC LETTER BEEH INITIAL FORM" ) 'Isolated' : ( "ﭐ" U+FB50 Lo 1 "ARABIC LETTER ALEF WASLA ISOLATED FORM" ) 'Medial' : ( "ﭕ" U+FB55 Lo 1 "ARABIC LETTER BEEH MEDIAL FORM" ) 'Narrow' : ( "。" U+FF61 Po 1 "HALFWIDTH IDEOGRAPHIC FULL STOP" ) 'Nobreak' : ( " " U+00A0 Zs 1 "NO-BREAK SPACE", "NBSP" ) 'None' : ( "" U+0000 Cc 0 "", "NULL", "NUL" ) 'Small' : ( "﹐" U+FE50 Po 2 "SMALL COMMA" ) 'Square' : ( "㉐" U+3250 So 2 "PARTNERSHIP SIGN" ) 'Sub' : ( "ᵢ" U+1D62 Lm 1 "LATIN SUBSCRIPT SMALL LETTER I" ) 'Super' : ( "ª" U+00AA Lo 1 "FEMININE ORDINAL INDICATOR" ) 'Vertical' : ( "ゟ" U+309F Lo 2 "HIRAGANA DIGRAPH YORI" ) 'Wide' : ( " " U+3000 Zs 2 "IDEOGRAPHIC SPACE" ) -- Ignorable (boolean) -- http://www.unicode.org/review/pr-5.html -- http://unicode.org/L2/L2002/02368-default-ignorable.html -- First 10 characters such as ignorable == .true ooRexx[bash]> .unicode~characters~pipe(.select {item~ignorable} | .take 10 | .console) 173 : ( "" U+00AD Cf 1 "SOFT HYPHEN", "SHY" ) 847 : ( "͏" U+034F Mn 0 "COMBINING GRAPHEME JOINER", "CGJ" ) 1564 : ( "" U+061C Cf 0 "ARABIC LETTER MARK", "ALM" ) 4447 : ( "ᅟ" U+115F Lo 2 "HANGUL CHOSEONG FILLER" ) 4448 : ( "ᅠ" U+1160 Lo 1 "HANGUL JUNGSEONG FILLER" ) 6068 : ( "឴" U+17B4 Mn 0 "KHMER VOWEL INHERENT AQ" ) 6069 : ( "឵" U+17B5 Mn 0 "KHMER VOWEL INHERENT AA" ) 6155 : ( "᠋" U+180B Mn 0 "MONGOLIAN FREE VARIATION SELECTOR ONE", "FVS1" ) 6156 : ( "᠌" U+180C Mn 0 "MONGOLIAN FREE VARIATION SELECTOR TWO", "FVS2" ) 6157 : ( "᠍" U+180D Mn 0 "MONGOLIAN FREE VARIATION SELECTOR THREE", "FVS3" ) -- Boundary (boolean) -- http://unicode.org/reports/tr29/tr29-6.html -- First 10 characters such as controlBoundary == .true ooRexx[bash]> .unicode~characters~pipe(.select {item~controlBoundary} | .take 10 | .console) 0 : ( "" U+0000 Cc 0 "", "NULL", "NUL" ) 1 : ( "" U+0001 Cc 0 "", "START OF HEADING", "SOH" ) 2 : ( "" U+0002 Cc 0 "", "START OF TEXT", "STX" ) 3 : ( "" U+0003 Cc 0 "", "END OF TEXT", "ETX" ) 4 : ( "" U+0004 Cc 0 "", 
"END OF TRANSMISSION", "EOT" ) 5 : ( "" U+0005 Cc 0 "", "ENQUIRY", "ENQ" ) 6 : ( "" U+0006 Cc 0 "", "ACKNOWLEDGE", "ACK" ) 7 : ( "" U+0007 Cc 0 "", "ALERT", "BEL" ) 8 : ( "" U+0008 Cc 0 "", "BACKSPACE", "BS" ) 9 : ( "" U+0009 Cc 0 "", "CHARACTER TABULATION", "HORIZONTAL TABULATION", "HT", "TAB" ) -- Char width -- First character of each width value ooRexx[bash]> seen = .directory~new; .unicode~characters~each{expose seen; v = item~charWidth; if seen[v] == .nil then seen[v] = item}; seen= a Directory (3 items) 0 : ( "" U+0000 Cc 0 "", "NULL", "NUL" ) 1 : ( " " U+0020 Zs 1 "SPACE", "SP" ) 2 : ( "ᄀ" U+1100 Lo 2 "HANGUL CHOSEONG KIYEOK" ) -- Bound class -- https://unicode.org/reports/tr29/ ooRexx[bash]> .unicode~codepointBoundClassNames= ['START','XX','CR','LF','CN','EX','L','V','T','LV','LVT','RI','SM','PP','ZWJ','EB','EM','GAZ','EBG','EXTENDED_PICTOGRAPHIC','E_ZWG'] -- First character of each bound class ooRexx[bash]> seen = .directory~new; .unicode~characters~each{expose seen; v = item~boundClassName; if seen[v] == .nil then seen[v] = item}; seen= a Directory (15 items) 'CN' : ( "" U+0000 Cc 0 "", "NULL", "NUL" ) 'CR' : ( "" U+000D Cc 0 "", "CARRIAGE RETURN", "CR" ) 'EX' : ( "̀" U+0300 Mn 0 "COMBINING GRAVE ACCENT" ) 'EXTENDED_PICTOGRAPHIC' : ( "©" U+00A9 So 1 "COPYRIGHT SIGN" ) 'L' : ( "ᄀ" U+1100 Lo 2 "HANGUL CHOSEONG KIYEOK" ) 'LF' : ( "" U+000A Cc 0 "", "LINE FEED", "NEW LINE", "END OF LINE", "LF", "NL", "EOL" ) 'LV' : ( "가" U+AC00 Lo 2 "HANGUL SYLLABLE GA" ) 'LVT' : ( "각" U+AC01 Lo 2 "HANGUL SYLLABLE GAG" ) 'PP' : ( "" U+0600 Cf 0 "ARABIC NUMBER SIGN" ) 'RI' : ( "🇦" U+1F1E6 So 1 "REGIONAL INDICATOR SYMBOL LETTER A" ) 'SM' : ( "ः" U+0903 Mc 0 "DEVANAGARI SIGN VISARGA" ) 'T' : ( "ᆨ" U+11A8 Lo 1 "HANGUL JONGSEONG KIYEOK" ) 'V' : ( "ᅠ" U+1160 Lo 1 "HANGUL JUNGSEONG FILLER" ) 'XX' : ( " " U+0020 Zs 1 "SPACE", "SP" ) 'ZWJ' : ( "" U+200D Cf 0 "ZERO WIDTH JOINER", "ZWJ" ) -- isLower -- Forty-first to fifthieth characters such as isLower == .true ooRexx[bash]> 
.unicode~characters~pipe(.select {item~isLower} | .take 50 | .take "last" 10 | .console) 234 : ( "ê" U+00EA Ll 1 "LATIN SMALL LETTER E WITH CIRCUMFLEX" ) 235 : ( "ë" U+00EB Ll 1 "LATIN SMALL LETTER E WITH DIAERESIS" ) 236 : ( "ì" U+00EC Ll 1 "LATIN SMALL LETTER I WITH GRAVE" ) 237 : ( "í" U+00ED Ll 1 "LATIN SMALL LETTER I WITH ACUTE" ) 238 : ( "î" U+00EE Ll 1 "LATIN SMALL LETTER I WITH CIRCUMFLEX" ) 239 : ( "ï" U+00EF Ll 1 "LATIN SMALL LETTER I WITH DIAERESIS" ) 240 : ( "ð" U+00F0 Ll 1 "LATIN SMALL LETTER ETH" ) 241 : ( "ñ" U+00F1 Ll 1 "LATIN SMALL LETTER N WITH TILDE" ) 242 : ( "ò" U+00F2 Ll 1 "LATIN SMALL LETTER O WITH GRAVE" ) 243 : ( "ó" U+00F3 Ll 1 "LATIN SMALL LETTER O WITH ACUTE" ) -- isUpper -- Forty-first to fiftieth characters such as isUpper == .true ooRexx[bash]> .unicode~characters~pipe(.select {item~isUpper} | .take 50 | .take "last" 10 | .console) 206 : ( "Î" U+00CE Lu 1 "LATIN CAPITAL LETTER I WITH CIRCUMFLEX" ) 207 : ( "Ï" U+00CF Lu 1 "LATIN CAPITAL LETTER I WITH DIAERESIS" ) 208 : ( "Ð" U+00D0 Lu 1 "LATIN CAPITAL LETTER ETH" ) 209 : ( "Ñ" U+00D1 Lu 1 "LATIN CAPITAL LETTER N WITH TILDE" ) 210 : ( "Ò" U+00D2 Lu 1 "LATIN CAPITAL LETTER O WITH GRAVE" ) 211 : ( "Ó" U+00D3 Lu 1 "LATIN CAPITAL LETTER O WITH ACUTE" ) 212 : ( "Ô" U+00D4 Lu 1 "LATIN CAPITAL LETTER O WITH CIRCUMFLEX" ) 213 : ( "Õ" U+00D5 Lu 1 "LATIN CAPITAL LETTER O WITH TILDE" ) 214 : ( "Ö" U+00D6 Lu 1 "LATIN CAPITAL LETTER O WITH DIAERESIS" ) 216 : ( "Ø" U+00D8 Lu 1 "LATIN CAPITAL LETTER O WITH STROKE" ) ----------------------- -- Unicode case folding ----------------------- /* See https://www.w3.org/TR/charmod-norm/ Case folding is the process of making two texts which differ only in case identical for comparison purposes. */ ooRexx[bash]> ooRexx[bash]> ("sTrasse", "straße", "STRASSE")~each{item~text~casefold}== an Array (shape [3], 3 items) 1 : T'strasse' 2 : T'strasse' 3 : T'strasse' /* utf8proc doesn't support language-sensitive case-folding. 
The Julia developers, who use utf8proc, have decided to remain locale-independent. See https://github.com/JuliaLang/julia/issues/7848 Example: The name of the second largest city in Turkey is "Diyarbakır", which contains both the dotted and dotless letters i. */ ooRexx[bash]> ooRexx[bash]> "DİYARBAKIR"~text~casefold= -- T'di̇yarbakir' should be diyarbakır T'di̇yarbakir' ---------------------- -- Unicode upper lower ---------------------- ooRexx[bash]> smallSharpS = .unicode~character("LATIN SMALL LETTER SHARP S") -- ( "ß" U+00DF Ll 1 "LATIN SMALL LETTER SHARP S" ) ooRexx[bash]> smallSharpS~toUpperSimple= -- 7838 (codepoint in decimal) 7838 ooRexx[bash]> smallSharpS~toUpperFull= -- TODO: 83 83 (codepoints in decimal) 7838 ooRexx[bash]> .Unicode~character(smallSharpS~toUpperSimple)= -- ( "ẞ" U+1E9E Lu 1 "LATIN CAPITAL LETTER SHARP S" ) ( "ẞ" U+1E9E Lu 1 "LATIN CAPITAL LETTER SHARP S" ) ooRexx[bash]> "ß"~text~upper= -- TODO: T'SS' T'ẞ' /* Unicode standard 15 section 5.18 Case Mappings: Default casing Tailored casing (small sharp) ß <--- ẞ (capital sharp) (small sharp) ß <--> ẞ (capital sharp) (small sharp) ß ---> SS ss <--> SS ss <--> SS When using the default Unicode casing operations, capital sharp s will lowercase to small sharp s, but not vice versa: small sharp s uppercases to “SS”. A tailored casing operation is needed in circumstances requiring small sharp s to uppercase to capital sharp s. */ /* Another example of wrong result for upper/lower */ ooRexx[bash]> "Diyarbakır"~text~upper= -- T'DIYARBAKIR' should be DİYARBAKIR T'DIYARBAKIR' ooRexx[bash]> "DİYARBAKIR"~text~lower= -- T'diyarbakir' T'diyarbakir' -------------------------- -- Unicode transformations -------------------------- /* Method .Unicode~utf8proc_transform The purpose of this method is to support all the transformations provided by utf8proc. Takes a byte string as input (UTF-8 encoded), returns a new transformed byte string as output (UTF-8). 
TODO: Add support for UTF-16, UTF-32 */ ooRexx[bash]> string = "\u{BEL}Le\u{IDEOGRAPHIC SPACE}\u{OGHAM SPACE MARK}\u{ZERO-WIDTH-SPACE}Père\t\u{HYPHEN}\u{SOFT-HYPHEN}\u{EN DASH}\u{EM DASH}Noël\x{EFB790}\r\n" ooRexx[bash]> text = string~text~unescape ooRexx[bash]> text~UnicodeCharacters== an Array (shape [22], 22 items) 1 : ( "" U+0007 Cc 0 "", "ALERT", "BEL" ) 2 : ( "L" U+004C Lu 1 "LATIN CAPITAL LETTER L" ) 3 : ( "e" U+0065 Ll 1 "LATIN SMALL LETTER E" ) 4 : ( " " U+3000 Zs 2 "IDEOGRAPHIC SPACE" ) 5 : ( " " U+1680 Zs 1 "OGHAM SPACE MARK" ) 6 : ( "" U+200B Cf 0 "ZERO WIDTH SPACE", "ZWSP" ) 7 : ( "P" U+0050 Lu 1 "LATIN CAPITAL LETTER P" ) 8 : ( "è" U+00E8 Ll 1 "LATIN SMALL LETTER E WITH GRAVE" ) 9 : ( "r" U+0072 Ll 1 "LATIN SMALL LETTER R" ) 10 : ( "e" U+0065 Ll 1 "LATIN SMALL LETTER E" ) 11 : ( "" U+0009 Cc 0 "", "CHARACTER TABULATION", "HORIZONTAL TABULATION", "HT", "TAB" ) 12 : ( "‐" U+2010 Pd 1 "HYPHEN" ) 13 : ( "" U+00AD Cf 1 "SOFT HYPHEN", "SHY" ) 14 : ( "–" U+2013 Pd 1 "EN DASH" ) 15 : ( "—" U+2014 Pd 1 "EM DASH" ) 16 : ( "N" U+004E Lu 1 "LATIN CAPITAL LETTER N" ) 17 : ( "o" U+006F Ll 1 "LATIN SMALL LETTER O" ) 18 : ( "ë" U+00EB Ll 1 "LATIN SMALL LETTER E WITH DIAERESIS" ) 19 : ( "l" U+006C Ll 1 "LATIN SMALL LETTER L" ) 20 : ( "" U+FDD0 Cn 1 "" ) 21 : ( "" U+000D Cc 0 "", "CARRIAGE RETURN", "CR" ) 22 : ( "" U+000A Cc 0 "", "LINE FEED", "NEW LINE", "END OF LINE", "LF", "NL", "EOL" ) /* Possible transformations: 1 : ( "" U+0007 Cc 0 "", "ALERT", "BEL" ) <-- removable with STRIPCC: 2 : ( "L" U+004C Lu 1 "LATIN CAPITAL LETTER L" ) 3 : ( "e" U+0065 Ll 1 "LATIN SMALL LETTER E" ) 4 : ( " " U+3000 Zs 2 "IDEOGRAPHIC SPACE" ) <-- replaceable by " " with LUMP: 5 : ( " " U+1680 Zs 1 "OGHAM SPACE MARK" ) <-- replaceable by " " with LUMP: 6 : ( "" U+200B Cf 0 "ZERO WIDTH SPACE", "ZWSP" ) <-- removable by STRIPIGNORABLE: 7 : ( "P" U+0050 Lu 1 "LATIN CAPITAL LETTER P" ) 8 : ( "è" U+00E8 Ll 1 "LATIN SMALL LETTER E WITH GRAVE" ) <-- replaceable by "e" with normalization + 
STRIPMARK: 9 : ( "r" U+0072 Ll 1 "LATIN SMALL LETTER R" ) 10 : ( "e" U+0065 Ll 1 "LATIN SMALL LETTER E" ) 11 : ( "" U+0009 Cc 0 "", "CHARACTER TABULATION" ) <-- replaceable by " " with STRIPCC: 12 : ( "‐" U+2010 Pd 1 "HYPHEN" ) <-- replaceable by "-" with LUMP: 13 : ( "" U+00AD Cf 1 "SOFT HYPHEN", "SHY" ) <-- removable by STRIPIGNORABLE: 14 : ( "–" U+2013 Pd 1 "EN DASH" ) <-- replaceable by "-" with LUMP: 15 : ( "—" U+2014 Pd 1 "EM DASH" ) <-- replaceable by "-" with LUMP: 16 : ( "N" U+004E Lu 1 "LATIN CAPITAL LETTER N" ) 17 : ( "o" U+006F Ll 1 "LATIN SMALL LETTER O" ) 18 : ( "ë" U+00EB Ll 1 "LATIN SMALL LETTER E WITH DIAERESIS" ) <-- replaceable by "e" with normalization + STRIPMARK: 19 : ( "l" U+006C Ll 1 "LATIN SMALL LETTER L" ) 20 : ( "" U+FDD0 Cn 1 "" ) <-- removeable with STRIPNA: 21 : ( "" U+000D Cc 0 "", "CARRIAGE RETURN", "CR" ) 22 : ( "" U+000A Cc 0 "", "LINE FEED" ) <-- CR+LF replaceable by " " with STRIPCC: */ ooRexx[bash]> text= -- T'[07]Le Père[09]‐–—Noël[0D0A]' T'[07]Le Père[09]‐–—Noël[0D0A]' -- Performs unicode case folding, to be able to do a case-insensitive string comparison. ooRexx[bash]> .Unicode~utf8proc_transform(text~string, casefold:)= -- '[07]le père[09]‐–—noël[0D0A]' '[07]le père[09]‐–—noël[0D0A]' -- Strip "default ignorable characters" such as SOFT-HYPHEN or ZERO-WIDTH-SPACE ooRexx[bash]> .Unicode~utf8proc_transform(text~string, stripIgnorable:)= -- '[07]Le Père[09]‐–—Noël[0D0A]' '[07]Le Père[09]‐–—Noël[0D0A]' -- Lumps certain characters together. See lump.md for details: -- https://github.com/JuliaStrings/utf8proc/blob/master/lump.md -- E.g. HYPHEN U+2010 and MINUS U+2212 to ASCII "-" -- jlf: I was expecting to have only one space and one "-" but that's not the case -- Seems working as designed... -- All the concerned characters become the same character, but still remain distinct characters. 
ooRexx[bash]> .Unicode~utf8proc_transform(text~string, lump:)= -- '[07]Le Père[09]---Noël[0D0A]' '[07]Le Père[09]---Noël[0D0A]' -- NLF2LF: Convert LF, CRLF, CR and NEL into LF ooRexx[bash]> .Unicode~utf8proc_transform(text~string, NLF:1)= -- '[07]Le Père[09]‐–—Noël[0A]' '[07]Le Père[09]‐–—Noël[0A]' -- NLF2LS: Convert LF, CRLF, CR and NEL into LS (U+2028 Zl 0 "LINE SEPARATOR") ooRexx[bash]> .Unicode~utf8proc_transform(text~string, NLF:2)= -- '[07]Le Père[09]‐–—Noël' '[07]Le Père[09]‐–—Noël ' -- NLF2PS: convert LF, CRLF, CR and NEL into PS (U+2029 Zp 0 "PARAGRAPH SEPARATOR") ooRexx[bash]> .Unicode~utf8proc_transform(text~string, NLF:3)= -- '[07]Le Père[09]‐–—Noël ' '[07]Le Père[09]‐–—Noël ' -- Strips and/or converts control characters. ooRexx[bash]> .Unicode~utf8proc_transform(text~string, stripCC:)= -- 'Le Père ‐–—Noël ' 'Le Père ‐–—Noël ' -- Strips all character markings. -- This includes non-spacing, spacing and enclosing (i.e. accents). -- This option works only with normalization. ooRexx[bash]> .Unicode~utf8proc_transform(text~string, stripMark:, normalization:1)= -- '[07]Le Pere[09]‐–—Noel[0D0A]' '[07]Le Pere[09]‐–—Noel[0D0A]' -- Strips unassigned codepoints. 
ooRexx[bash]> .Unicode~utf8proc_transform(text~string, stripNA:)= -- '[07]Le Père[09]‐–—Noël[0D0A]' '[07]Le Père[09]‐–—Noël[0D0A]' -- Application of several options ooRexx[bash]> .Unicode~utf8proc_transform(text~string, casefold:, lump:, normalization:1, stripIgnorable:, stripCC:, stripMark:, stripNA:)= -- 'le pere ---noel ' 'le pere ---noel ' /* Some comments about the transformations: 1 : ( "l" U+006C Ll 1 "LATIN SMALL LETTER L" ) 2 : ( "e" U+0065 Ll 1 "LATIN SMALL LETTER E" ) 3 : ( " " U+0020 Zs 1 "SPACE", "SP" ) <-- LUMP (was IDEOGRAPHIC SPACE) 4 : ( " " U+0020 Zs 1 "SPACE", "SP" ) <-- LUMP (was OGHAM SPACE MARK) 5 : ( "p" U+0070 Ll 1 "LATIN SMALL LETTER P" ) 6 : ( "e" U+0065 Ll 1 "LATIN SMALL LETTER E" ) 7 : ( "r" U+0072 Ll 1 "LATIN SMALL LETTER R" ) 8 : ( "e" U+0065 Ll 1 "LATIN SMALL LETTER E" ) 9 : ( " " U+0020 Zs 1 "SPACE", "SP" ) <-- STRIPCC (was TAB) 10 : ( "-" U+002D Pd 1 "HYPHEN-MINUS" ) <-- LUMP (was HYPHEN) 11 : ( "-" U+002D Pd 1 "HYPHEN-MINUS" ) <-- LUMP (was EN DASH) 12 : ( "-" U+002D Pd 1 "HYPHEN-MINUS" ) <-- LUMP (was EM DASH) 13 : ( "n" U+006E Ll 1 "LATIN SMALL LETTER N" ) 14 : ( "o" U+006F Ll 1 "LATIN SMALL LETTER O" ) 15 : ( "e" U+0065 Ll 1 "LATIN SMALL LETTER E" ) 16 : ( "l" U+006C Ll 1 "LATIN SMALL LETTER L" ) 17 : ( " " U+0020 Zs 1 "SPACE", "SP" ) <-- STRIPCC (was CR+LF) */ ------------------------- -- Unicode normalizations ------------------------- /* Normalization NFC, NFD, NFKC, NFKD. http://unicode.org/faq/normalization.html Implemented with utf8proc. Methods on RexxText: ~NFC ~isNFC ~NFD ~isNFD ~NFKC ~isNFKC ~NFKD ~isNFKD Possible values for isNFxx: -1 unknown 0 no 1 yes A same text can be in several normalization forms. Text exclusively containing ASCII characters (U+0000..U+007F) is left unaffected by all of the Normalization Forms: The 4 indicators isNFxx are 1. 
The methods NFxx sets the corresponding indicator isNFxx - on the source text : 0 or 1 (test if both strings are equal) - on the result text : 1 */ ooRexx[bash]> "only ASCII"~text~isNFC= -- 1 1 ooRexx[bash]> "only ASCII"~text~isNFD= -- 1 1 ooRexx[bash]> "only ASCII"~text~isNFKC= -- 1 1 ooRexx[bash]> "only ASCII"~text~isNFKD= -- 1 1 ooRexx[bash]> text = "Noël"~text ooRexx[bash]> text~isNFC= -- 1 1 ooRexx[bash]> text~isNFD= -- -1 -1 ooRexx[bash]> textNFC = text~NFC ooRexx[bash]> textNFD = text~NFD ooRexx[bash]> text~isNFC= -- 1 1 ooRexx[bash]> text~isNFD= -- 0 0 ooRexx[bash]> textNFC~isNFC= -- 1 1 ooRexx[bash]> textNFD~isNFD= -- 1 1 /* The normalized text can be memorized on the original text: text = "père Noël"~text textNFD = text~nfd(memorize:) From now, the returned NFD is always the memorized text. */ ooRexx[bash]> text = xrange("0", "FF"x)~text("cp1252")~utf8 ooRexx[bash]> text= T'0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~€‚ƒ„…†‡ˆ‰Š‹ŒŽ‘’“”•–—˜™š›œžŸ ¡¢£¤¥¦§¨©ª«¬®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ' ooRexx[bash]> text~isNFD= -1 ooRexx[bash]> textNFD = text~NFD(memorize:) text~nfd~"==":.object( textNFD)= -- 1 (this is really the same object) 1 /* Some remarks about the string used in the next demo: - the first "äöü" is NFC, the second "äöü" is NFD - "x̂" is two codepoints in any normalization. - "ϔ" normalization forms are all different. 
- "ﷺ" is one of the worst cases regarding the expansion factor in NFKC/NFKD: 18x - "baffle"~text~subchar(3)= -- T'ffl' "baffle"~text~upper= -- T'BAfflE', not BAFFLE The ligature disappears in NFK[CD] but not in NF[CD] */ ooRexx[bash]> "äöü äöü x̂ ϔ ﷺ baffle"~text~UnicodeCharacters== an Array (shape [22], 22 items) 1 : ( "ä" U+00E4 Ll 1 "LATIN SMALL LETTER A WITH DIAERESIS" ) 2 : ( "ö" U+00F6 Ll 1 "LATIN SMALL LETTER O WITH DIAERESIS" ) 3 : ( "ü" U+00FC Ll 1 "LATIN SMALL LETTER U WITH DIAERESIS" ) 4 : ( " " U+0020 Zs 1 "SPACE", "SP" ) 5 : ( "a" U+0061 Ll 1 "LATIN SMALL LETTER A" ) 6 : ( "̈" U+0308 Mn 0 "COMBINING DIAERESIS" ) 7 : ( "o" U+006F Ll 1 "LATIN SMALL LETTER O" ) 8 : ( "̈" U+0308 Mn 0 "COMBINING DIAERESIS" ) 9 : ( "u" U+0075 Ll 1 "LATIN SMALL LETTER U" ) 10 : ( "̈" U+0308 Mn 0 "COMBINING DIAERESIS" ) 11 : ( " " U+0020 Zs 1 "SPACE", "SP" ) 12 : ( "x" U+0078 Ll 1 "LATIN SMALL LETTER X" ) 13 : ( "̂" U+0302 Mn 0 "COMBINING CIRCUMFLEX ACCENT" ) 14 : ( " " U+0020 Zs 1 "SPACE", "SP" ) 15 : ( "ϔ" U+03D4 Lu 1 "GREEK UPSILON WITH DIAERESIS AND HOOK SYMBOL" ) 16 : ( " " U+0020 Zs 1 "SPACE", "SP" ) 17 : ( "ﷺ" U+FDFA Lo 1 "ARABIC LIGATURE SALLALLAHOU ALAYHE WASALLAM" ) 18 : ( " " U+0020 Zs 1 "SPACE", "SP" ) 19 : ( "b" U+0062 Ll 1 "LATIN SMALL LETTER B" ) 20 : ( "a" U+0061 Ll 1 "LATIN SMALL LETTER A" ) 21 : ( "ffl" U+FB04 Ll 1 "LATIN SMALL LIGATURE FFL" ) 22 : ( "e" U+0065 Ll 1 "LATIN SMALL LETTER E" ) ooRexx[bash]> "äöü äöü x̂ ϔ ﷺ baffle"~text~description= -- 'UTF-8 not-ASCII (18 characters, 22 codepoints, 34 bytes, 0 error)' 'UTF-8 not-ASCII (18 characters, 22 codepoints, 34 bytes, 0 error)' ooRexx[bash]> "äöü äöü x̂ ϔ ﷺ baffle"~text~upper= -- T'ÄÖÜ ÄÖÜ X̂ ϔ ﷺ BAfflE' T'ÄÖÜ ÄÖÜ X̂ ϔ ﷺ BAfflE' /* NFD Normalization Form D Canonical Decomposition Characters are decomposed by canonical equivalence, and multiple combining characters are arranged in a specific order. 
*/ ooRexx[bash]> "äöü äöü x̂ ϔ ﷺ baffle"~text~nfd~UnicodeCharacters== an Array (shape [26], 26 items) 1 : ( "a" U+0061 Ll 1 "LATIN SMALL LETTER A" ) 2 : ( "̈" U+0308 Mn 0 "COMBINING DIAERESIS" ) 3 : ( "o" U+006F Ll 1 "LATIN SMALL LETTER O" ) 4 : ( "̈" U+0308 Mn 0 "COMBINING DIAERESIS" ) 5 : ( "u" U+0075 Ll 1 "LATIN SMALL LETTER U" ) 6 : ( "̈" U+0308 Mn 0 "COMBINING DIAERESIS" ) 7 : ( " " U+0020 Zs 1 "SPACE", "SP" ) 8 : ( "a" U+0061 Ll 1 "LATIN SMALL LETTER A" ) 9 : ( "̈" U+0308 Mn 0 "COMBINING DIAERESIS" ) 10 : ( "o" U+006F Ll 1 "LATIN SMALL LETTER O" ) 11 : ( "̈" U+0308 Mn 0 "COMBINING DIAERESIS" ) 12 : ( "u" U+0075 Ll 1 "LATIN SMALL LETTER U" ) 13 : ( "̈" U+0308 Mn 0 "COMBINING DIAERESIS" ) 14 : ( " " U+0020 Zs 1 "SPACE", "SP" ) 15 : ( "x" U+0078 Ll 1 "LATIN SMALL LETTER X" ) 16 : ( "̂" U+0302 Mn 0 "COMBINING CIRCUMFLEX ACCENT" ) 17 : ( " " U+0020 Zs 1 "SPACE", "SP" ) 18 : ( "ϒ" U+03D2 Lu 1 "GREEK UPSILON WITH HOOK SYMBOL" ) 19 : ( "̈" U+0308 Mn 0 "COMBINING DIAERESIS" ) 20 : ( " " U+0020 Zs 1 "SPACE", "SP" ) 21 : ( "ﷺ" U+FDFA Lo 1 "ARABIC LIGATURE SALLALLAHOU ALAYHE WASALLAM" ) 22 : ( " " U+0020 Zs 1 "SPACE", "SP" ) 23 : ( "b" U+0062 Ll 1 "LATIN SMALL LETTER B" ) 24 : ( "a" U+0061 Ll 1 "LATIN SMALL LETTER A" ) 25 : ( "ffl" U+FB04 Ll 1 "LATIN SMALL LIGATURE FFL" ) 26 : ( "e" U+0065 Ll 1 "LATIN SMALL LETTER E" ) ooRexx[bash]> "äöü äöü x̂ ϔ ﷺ baffle"~text~nfd~description= -- 'UTF-8 not-ASCII (18 characters, 26 codepoints, 39 bytes, 0 error)' 'UTF-8 not-ASCII (18 characters, 26 codepoints, 39 bytes, 0 error)' ooRexx[bash]> "äöü äöü x̂ ϔ ﷺ baffle"~text~nfd~upper= -- T'ÄÖÜ ÄÖÜ X̂ ϔ ﷺ BAfflE' T'ÄÖÜ ÄÖÜ X̂ ϔ ﷺ BAfflE' /* NFC Normalization Form C Canonical Decomposition, followed by Canonical Composition Characters are decomposed and then recomposed by canonical equivalence. 
*/ ooRexx[bash]> "äöü äöü x̂ ϔ ﷺ baffle"~text~nfc~UnicodeCharacters== an Array (shape [19], 19 items) 1 : ( "ä" U+00E4 Ll 1 "LATIN SMALL LETTER A WITH DIAERESIS" ) 2 : ( "ö" U+00F6 Ll 1 "LATIN SMALL LETTER O WITH DIAERESIS" ) 3 : ( "ü" U+00FC Ll 1 "LATIN SMALL LETTER U WITH DIAERESIS" ) 4 : ( " " U+0020 Zs 1 "SPACE", "SP" ) 5 : ( "ä" U+00E4 Ll 1 "LATIN SMALL LETTER A WITH DIAERESIS" ) 6 : ( "ö" U+00F6 Ll 1 "LATIN SMALL LETTER O WITH DIAERESIS" ) 7 : ( "ü" U+00FC Ll 1 "LATIN SMALL LETTER U WITH DIAERESIS" ) 8 : ( " " U+0020 Zs 1 "SPACE", "SP" ) 9 : ( "x" U+0078 Ll 1 "LATIN SMALL LETTER X" ) 10 : ( "̂" U+0302 Mn 0 "COMBINING CIRCUMFLEX ACCENT" ) 11 : ( " " U+0020 Zs 1 "SPACE", "SP" ) 12 : ( "ϔ" U+03D4 Lu 1 "GREEK UPSILON WITH DIAERESIS AND HOOK SYMBOL" ) 13 : ( " " U+0020 Zs 1 "SPACE", "SP" ) 14 : ( "ﷺ" U+FDFA Lo 1 "ARABIC LIGATURE SALLALLAHOU ALAYHE WASALLAM" ) 15 : ( " " U+0020 Zs 1 "SPACE", "SP" ) 16 : ( "b" U+0062 Ll 1 "LATIN SMALL LETTER B" ) 17 : ( "a" U+0061 Ll 1 "LATIN SMALL LETTER A" ) 18 : ( "ffl" U+FB04 Ll 1 "LATIN SMALL LIGATURE FFL" ) 19 : ( "e" U+0065 Ll 1 "LATIN SMALL LETTER E" ) ooRexx[bash]> "äöü äöü x̂ ϔ ﷺ baffle"~text~nfc~description= -- 'UTF-8 not-ASCII (18 characters, 19 codepoints, 31 bytes, 0 error)' 'UTF-8 not-ASCII (18 characters, 19 codepoints, 31 bytes, 0 error)' ooRexx[bash]> "äöü äöü x̂ ϔ ﷺ baffle"~text~nfc~upper= -- T'ÄÖÜ ÄÖÜ X̂ ϔ ﷺ BAfflE' T'ÄÖÜ ÄÖÜ X̂ ϔ ﷺ BAfflE' /* NFKD Normalization Form KD Compatibility Decomposition (K is used to stand for compatibility to avoid confusion with the C standing for composition) Characters are decomposed by compatibility, and multiple combining characters are arranged in a specific order. 
*/ ooRexx[bash]> "äöü äöü x̂ ϔ ﷺ baffle"~text~nfkd~UnicodeCharacters== an Array (shape [45], 45 items) 1 : ( "a" U+0061 Ll 1 "LATIN SMALL LETTER A" ) 2 : ( "̈" U+0308 Mn 0 "COMBINING DIAERESIS" ) 3 : ( "o" U+006F Ll 1 "LATIN SMALL LETTER O" ) 4 : ( "̈" U+0308 Mn 0 "COMBINING DIAERESIS" ) 5 : ( "u" U+0075 Ll 1 "LATIN SMALL LETTER U" ) 6 : ( "̈" U+0308 Mn 0 "COMBINING DIAERESIS" ) 7 : ( " " U+0020 Zs 1 "SPACE", "SP" ) 8 : ( "a" U+0061 Ll 1 "LATIN SMALL LETTER A" ) 9 : ( "̈" U+0308 Mn 0 "COMBINING DIAERESIS" ) 10 : ( "o" U+006F Ll 1 "LATIN SMALL LETTER O" ) 11 : ( "̈" U+0308 Mn 0 "COMBINING DIAERESIS" ) 12 : ( "u" U+0075 Ll 1 "LATIN SMALL LETTER U" ) 13 : ( "̈" U+0308 Mn 0 "COMBINING DIAERESIS" ) 14 : ( " " U+0020 Zs 1 "SPACE", "SP" ) 15 : ( "x" U+0078 Ll 1 "LATIN SMALL LETTER X" ) 16 : ( "̂" U+0302 Mn 0 "COMBINING CIRCUMFLEX ACCENT" ) 17 : ( " " U+0020 Zs 1 "SPACE", "SP" ) 18 : ( "Υ" U+03A5 Lu 1 "GREEK CAPITAL LETTER UPSILON" ) 19 : ( "̈" U+0308 Mn 0 "COMBINING DIAERESIS" ) 20 : ( " " U+0020 Zs 1 "SPACE", "SP" ) 21 : ( "ص" U+0635 Lo 1 "ARABIC LETTER SAD" ) 22 : ( "ل" U+0644 Lo 1 "ARABIC LETTER LAM" ) 23 : ( "ى" U+0649 Lo 1 "ARABIC LETTER ALEF MAKSURA" ) 24 : ( " " U+0020 Zs 1 "SPACE", "SP" ) 25 : ( "ا" U+0627 Lo 1 "ARABIC LETTER ALEF" ) 26 : ( "ل" U+0644 Lo 1 "ARABIC LETTER LAM" ) 27 : ( "ل" U+0644 Lo 1 "ARABIC LETTER LAM" ) 28 : ( "ه" U+0647 Lo 1 "ARABIC LETTER HEH" ) 29 : ( " " U+0020 Zs 1 "SPACE", "SP" ) 30 : ( "ع" U+0639 Lo 1 "ARABIC LETTER AIN" ) 31 : ( "ل" U+0644 Lo 1 "ARABIC LETTER LAM" ) 32 : ( "ي" U+064A Lo 1 "ARABIC LETTER YEH" ) 33 : ( "ه" U+0647 Lo 1 "ARABIC LETTER HEH" ) 34 : ( " " U+0020 Zs 1 "SPACE", "SP" ) 35 : ( "و" U+0648 Lo 1 "ARABIC LETTER WAW" ) 36 : ( "س" U+0633 Lo 1 "ARABIC LETTER SEEN" ) 37 : ( "ل" U+0644 Lo 1 "ARABIC LETTER LAM" ) 38 : ( "م" U+0645 Lo 1 "ARABIC LETTER MEEM" ) 39 : ( " " U+0020 Zs 1 "SPACE", "SP" ) 40 : ( "b" U+0062 Ll 1 "LATIN SMALL LETTER B" ) 41 : ( "a" U+0061 Ll 1 "LATIN SMALL LETTER A" ) 42 : ( "f" U+0066 Ll 1 "LATIN 
SMALL LETTER F" ) 43 : ( "f" U+0066 Ll 1 "LATIN SMALL LETTER F" ) 44 : ( "l" U+006C Ll 1 "LATIN SMALL LETTER L" ) 45 : ( "e" U+0065 Ll 1 "LATIN SMALL LETTER E" ) ooRexx[bash]> "äöü äöü x̂ ϔ ﷺ baffle"~text~nfkd~description= -- 'UTF-8 not-ASCII (37 characters, 45 codepoints, 69 bytes, 0 error)' 'UTF-8 not-ASCII (37 characters, 45 codepoints, 69 bytes, 0 error)' ooRexx[bash]> "äöü äöü x̂ ϔ ﷺ baffle"~text~nfkd~upper= -- T'ÄÖÜ ÄÖÜ X̂ Ϋ صلى الله عليه وسلم BAFFLE' T'ÄÖÜ ÄÖÜ X̂ Ϋ صلى الله عليه وسلم BAFFLE' /* NFKC Normalization Form KC Compatibility Decomposition, followed by Canonical Composition Characters are decomposed by compatibility, then recomposed by canonical equivalence. */ ooRexx[bash]> "äöü äöü x̂ ϔ ﷺ baffle"~text~nfkc~UnicodeCharacters== an Array (shape [38], 38 items) 1 : ( "ä" U+00E4 Ll 1 "LATIN SMALL LETTER A WITH DIAERESIS" ) 2 : ( "ö" U+00F6 Ll 1 "LATIN SMALL LETTER O WITH DIAERESIS" ) 3 : ( "ü" U+00FC Ll 1 "LATIN SMALL LETTER U WITH DIAERESIS" ) 4 : ( " " U+0020 Zs 1 "SPACE", "SP" ) 5 : ( "ä" U+00E4 Ll 1 "LATIN SMALL LETTER A WITH DIAERESIS" ) 6 : ( "ö" U+00F6 Ll 1 "LATIN SMALL LETTER O WITH DIAERESIS" ) 7 : ( "ü" U+00FC Ll 1 "LATIN SMALL LETTER U WITH DIAERESIS" ) 8 : ( " " U+0020 Zs 1 "SPACE", "SP" ) 9 : ( "x" U+0078 Ll 1 "LATIN SMALL LETTER X" ) 10 : ( "̂" U+0302 Mn 0 "COMBINING CIRCUMFLEX ACCENT" ) 11 : ( " " U+0020 Zs 1 "SPACE", "SP" ) 12 : ( "Ϋ" U+03AB Lu 1 "GREEK CAPITAL LETTER UPSILON WITH DIALYTIKA" ) 13 : ( " " U+0020 Zs 1 "SPACE", "SP" ) 14 : ( "ص" U+0635 Lo 1 "ARABIC LETTER SAD" ) 15 : ( "ل" U+0644 Lo 1 "ARABIC LETTER LAM" ) 16 : ( "ى" U+0649 Lo 1 "ARABIC LETTER ALEF MAKSURA" ) 17 : ( " " U+0020 Zs 1 "SPACE", "SP" ) 18 : ( "ا" U+0627 Lo 1 "ARABIC LETTER ALEF" ) 19 : ( "ل" U+0644 Lo 1 "ARABIC LETTER LAM" ) 20 : ( "ل" U+0644 Lo 1 "ARABIC LETTER LAM" ) 21 : ( "ه" U+0647 Lo 1 "ARABIC LETTER HEH" ) 22 : ( " " U+0020 Zs 1 "SPACE", "SP" ) 23 : ( "ع" U+0639 Lo 1 "ARABIC LETTER AIN" ) 24 : ( "ل" U+0644 Lo 1 "ARABIC LETTER LAM" ) 25 : ( "ي" U+064A Lo 1 
"ARABIC LETTER YEH" ) 26 : ( "ه" U+0647 Lo 1 "ARABIC LETTER HEH" ) 27 : ( " " U+0020 Zs 1 "SPACE", "SP" ) 28 : ( "و" U+0648 Lo 1 "ARABIC LETTER WAW" ) 29 : ( "س" U+0633 Lo 1 "ARABIC LETTER SEEN" ) 30 : ( "ل" U+0644 Lo 1 "ARABIC LETTER LAM" ) 31 : ( "م" U+0645 Lo 1 "ARABIC LETTER MEEM" ) 32 : ( " " U+0020 Zs 1 "SPACE", "SP" ) 33 : ( "b" U+0062 Ll 1 "LATIN SMALL LETTER B" ) 34 : ( "a" U+0061 Ll 1 "LATIN SMALL LETTER A" ) 35 : ( "f" U+0066 Ll 1 "LATIN SMALL LETTER F" ) 36 : ( "f" U+0066 Ll 1 "LATIN SMALL LETTER F" ) 37 : ( "l" U+006C Ll 1 "LATIN SMALL LETTER L" ) 38 : ( "e" U+0065 Ll 1 "LATIN SMALL LETTER E" ) ooRexx[bash]> "äöü äöü x̂ ϔ ﷺ baffle"~text~nfkc~description= -- 'UTF-8 not-ASCII (37 characters, 38 codepoints, 61 bytes, 0 error)' 'UTF-8 not-ASCII (37 characters, 38 codepoints, 61 bytes, 0 error)' ooRexx[bash]> "äöü äöü x̂ ϔ ﷺ baffle"~text~nfkc~upper= -- T'ÄÖÜ ÄÖÜ X̂ Ϋ صلى الله عليه وسلم BAFFLE' T'ÄÖÜ ÄÖÜ X̂ Ϋ صلى الله عليه وسلم BAFFLE' -- The normalization forms are implemented only for UTF-8 and WTF-8. ooRexx[bash]> "D800 DC01"x~text("utf16")~nfd~UnicodeCharacters== -- Method TRANSFORM is ABSTRACT and cannot be directly invoked. Method TRANSFORM is ABSTRACT and cannot be directly invoked. Error code= 93.965 ooRexx[bash]> "D800 DC01"x~text("utf16")~utf8~nfd~UnicodeCharacters== an Array (shape [1], 1 items) 1 : ( "𐀁" U+10001 Lo 1 "LINEAR B SYLLABLE B038 E" ) ooRexx[bash]> "\uD800\uDC01"~text("wtf8")~unescape~nfd~UnicodeCharacters== an Array (shape [1], 1 items) 1 : ( "𐀁" U+10001 Lo 1 "LINEAR B SYLLABLE B038 E" ) /* End of demonstration. */