loadPackage OK for extension/stringChunk.cls
loadPackage OK for utilities/indentedStream.cls
loadPackage OK for extension/extensions.cls
loadLibrary OK for rxunixsys
loadPackage OK for ncurses.cls
loadPackage OK for csvStream.cls
loadLibrary OK for hostemu
loadPackage OK for json.cls
loadPackage OK for mime.cls
loadPackage OK for rxftp.cls
loadLibrary OK for rxmath
loadPackage OK for rxregexp.cls
loadPackage OK for regex/regex.cls
loadPackage OK for smtp.cls
loadPackage OK for socket.cls
loadPackage OK for streamsocket.cls
loadPackage OK for pipeline/pipe.cls
loadPackage OK for rgf_util2/rgf_util2.rex
loadPackage OK for BSF.CLS
loadPackage OK for oorexxshell_queries.cls
loadPackage OK for pipeline/pipe_extension.cls
loadPackage OK for rgf_util2/rgf_util2_wrappers.rex

REXX-ooRexx_4.3.0(MT)_64-bit 6.04 22 Apr 2024
Input queue name: S12c66Q600003f428a0

----------------
-- Text encoding
----------------

/*
Start working on a prototype for encoded strings.

Main ideas explored with this prototype :
- The existing String class is kept unchanged, but its semantics become "byte-oriented".
- The prototype adds a layer of services working at grapheme level, provided by the RexxText class.
- The RexxText class works on the bytes managed by the String class.
- String instances are immutable, the same for RexxText instances.
- No automatic conversion to Unicode by the interpreter.
- The strings crossing the I/O barriers are kept unchanged.
- Supported encodings : byte, UTF-8, UTF-16, UTF-32.
*/

/*
On my Mac, where locale returns :
LANG=""
LC_COLLATE="en_US.UTF-8"
LC_CTYPE="en_US.UTF-8"
LC_MESSAGES="en_US.UTF-8"
LC_MONETARY="en_US.UTF-8"
LC_NUMERIC="en_US.UTF-8"
LC_TIME="en_US.UTF-8"
LC_ALL="en_US.UTF-8"

I get that under ooRexxShell :
*/
ooRexx[sh]> s1 = "é"
ooRexx[sh]> s1=                                 -- 'é'
T'é'
ooRexx[sh]> s1~length=                          -- 1    (was 2 before automatic conversion of string literals to text)
 1
ooRexx[sh]> s1~c2x=                             -- C3 A9
'C3A9'
ooRexx[sh]> combining_acute_accent = "cc81"x
ooRexx[sh]> s2 = "e" || combining_acute_accent
ooRexx[sh]> s2=                                 -- T'é'
T'é'
ooRexx[sh]> s2~length=                          -- 1
 1
ooRexx[sh]> s2~c2x=                             -- 65 CC81
'65 CC81'

/*
My goal :
s1~text~length=                     -- 1 grapheme
s1~text~codepoints~count=           -- 1 codepoint
s1~text~string~length=              -- 2 bytes

s2~text~length=                     -- 1 grapheme
s2~text~codepoints~count=           -- 2 codepoints
s2~text~string~length=              -- 3 bytes
*/
ooRexx[sh]> .encoding~defaultEncoding = "utf8"
ooRexx[sh]> s1~text~length=                     -- 1 grapheme
 1
ooRexx[sh]> s1~text~codepoints~count=           -- 1 codepoint
 1
ooRexx[sh]> s1~text~string~length=              -- 2 bytes
 2

ooRexx[sh]> s2~text~length=                     -- 1 grapheme
 1
ooRexx[sh]> s2~text~codepoints~count=           -- 2 codepoints
 2
ooRexx[sh]> s2~text~string~length=              -- 3 bytes
 3

/*
A String is linked to a RexxText, which itself is linked to this String:

    a String
     ▲  text --------> a RexxText
     │                     indexer (anEncoding)
     │                          codepoints (sequential access)
     │                          characters (direct access to graphemes)
     +-<---------------------<- string

The ooRexx programmer has the choice :
- working with String at byte level
- working with RexxText at grapheme level.
- the same instance of String is used in both cases.
*/
ooRexx[sh]> myText = "où as tu e" || .Unicode~character("combining acute accent")~utf8 || "té ?"
ooRexx[sh]> myText=                         -- T'où as tu été ?'
T'où as tu été ?'
ooRexx[sh]> myString = myText~string
ooRexx[sh]> myString=                       -- 'où as tu été ?'
'où as tu été ?'
ooRexx[sh]> myString~length=                -- 18
 18
ooRexx[sh]> myText~length=                  -- 14
 14

/*
                                -- 1  2  3  4  5  6  7  8  9  10 11 12 13 14 15 16 17 18
myString                        -- 6F C3 B9 20 61 73 20 74 75 20 65 CC 81 74 C3 A9 20 3F
                                -- o. ù....  . a. s.  . t. u.  . e. acute t. é....  . ?.
*/
ooRexx[sh]> myString~eachC("c2x")=
['6F','C3','B9', 20, 61, 73, 20, 74, 75, 20, 65,'CC', 81, 74,'C3','A9', 20,'3F']

/*
                                -- 1  2     3  4  5  6  7  8  9  10       11 12    13 14
myText                          -- 6F C3B9  20 61 73 20 74 75 20 65 CC81  74 C3A9  20 3F
                                -- o. ù...   . a. s.  . t. u.  . e. acut  t. é...   . ?.
*/
ooRexx[sh]> myText~graphemes~each("c2x")=
['6F','C3B9', 20, 61, 73, 20, 74, 75, 20,'65 CC81', 74,'C3A9', 20,'3F']

/*
CR+LF is a grapheme made of 2 codepoints.
LF+CR are 2 graphemes.
*/
ooRexx[sh]> "0D0A"x~text~description=
'UTF-8 ASCII (1 character, 2 codepoints, 2 bytes, 0 error)'
ooRexx[sh]> "0A0D"x~text~description=
'UTF-8 ASCII (2 characters, 2 codepoints, 2 bytes, 0 error)'

/*
More examples of encoded strings
*/
ooRexx[sh]> "( ͡° ͜ʖ ͡°)"~text~description=
'UTF-8 not-ASCII (9 characters, 12 codepoints, 20 bytes, 0 error)'
ooRexx[sh]> "( ͡° ͜ʖ ͡°)"~text~characters~each("c2x")=
[ 28,'20 CDA1','C2B0','20 CD9C','CA96','EFBBBF','20 CDA1','C2B0', 29]
ooRexx[sh]> "( ͡° ͜ʖ ͡°)"~text~codepoints~each{"U+"item~d2x}=
['U+28','U+20','U+361','U+B0','U+20','U+35C','U+296','U+FEFF','U+20','U+361','U+B0','U+29']
ooRexx[sh]> "(ノಠ益ಠ)ノ彡"~text~description=
'UTF-8 not-ASCII (8 characters, 8 codepoints, 20 bytes, 0 error)'
ooRexx[sh]> "(ノಠ益ಠ)ノ彡"~text~characters~each("c2x")=
[ 28,'E3838E','E0B2A0','E79B8A','E0B2A0', 29,'E3838E','E5BDA1']
ooRexx[sh]> "(ノಠ益ಠ)ノ彡"~text~codepoints~each{"U+"item~d2x}=
['U+28','U+30CE','U+CA0','U+76CA','U+CA0','U+29','U+30CE','U+5F61']

/*
U+FE0E VARIATION SELECTOR-15 (UTF-8: EF B8 8E)
https://codepoints.net/U+fe0e
This codepoint may change the appearance of the preceding character.
If that is a symbol, dingbat or emoji, U+FE0E forces it to be rendered in a
textual fashion as compared to a colorful image.

U+FE0F VARIATION SELECTOR-16 (UTF-8: EF B8 8F)
https://codepoints.net/U+fe0f
This codepoint may change the appearance of the preceding character.
If that is a symbol, dingbat or emoji, U+FE0F forces it to be rendered as a
colorful image as compared to a monochrome text variant.
In theory ❤ and ❄ (and many other emoji) should display as text style by default
without VS16, but many applications ignore that.
*/
ooRexx[sh]> emoji_bag = .bag~of('❤', '❤️', '❄', '❄︎', '❄️', '⚪', '⚪️', '⚫', '⚫️')
ooRexx[sh]> emoji_table = emoji_bag~table~map("text")
ooRexx[sh]> emoji_table~map("c2x")==
a Table (9 items)
T'⚪'    : 'E29AAA'
T'⚪️' : 'E29AAA EFB88F'
T'⚫'    : 'E29AAB'
T'⚫️' : 'E29AAB EFB88F'
T'❄'    : 'E29D84'
T'❄︎' : 'E29D84 EFB88E'
T'❄️' : 'E29D84 EFB88F'
T'❤'    : 'E29DA4'
T'❤️' : 'E29DA4 EFB88F'
ooRexx[sh]> emoji_table~map("c2u")==
a Table (9 items)
T'⚪'    : 'U+26AA'
T'⚪️' : 'U+26AA U+FE0F'
T'⚫'    : 'U+26AB'
T'⚫️' : 'U+26AB U+FE0F'
T'❄'    : 'U+2744'
T'❄︎' : 'U+2744 U+FE0E'
T'❄️' : 'U+2744 U+FE0F'
T'❤'    : 'U+2764'
T'❤️' : 'U+2764 U+FE0F'
/*
The tables above are not well aligned
because the alignment is based on the length of the indexes,
which is (for the moment) a count of bytes, not a count of graphemes...
*/

/*
https://www.reddit.com/r/cpp/comments/aqzu7i
👩‍👨‍👩‍👧‍👦‍👧‍👧‍👦
is one grapheme, made up of 15 codepoints
*/
ooRexx[sh]> family = "👩‍👨‍👩‍👧‍👦‍👧‍👧‍👦"
ooRexx[sh]> family=
T'👩‍👨‍👩‍👧‍👦‍👧‍👧‍👦'
ooRexx[sh]> family~text~description=                              -- 1 character, 15 codepoints, 53 bytes
'UTF-8 not-ASCII (1 character, 15 codepoints, 53 bytes, 0 error)'
ooRexx[sh]> family~text~c2x=
'F09F91A9 E2808D F09F91A8 E2808D F09F91A9 E2808D F09F91A7 E2808D F09F91A6 E2808D F09F91A7 E2808D F09F91A7 E2808D F09F91A6'
ooRexx[sh]> family~text~c2u=
'U+1F469 U+200D U+1F468 U+200D U+1F469 U+200D U+1F467 U+200D U+1F466 U+200D U+1F467 U+200D U+1F467 U+200D U+1F466'
ooRexx[sh]> family~text~c2g=
'F09F91A9E2808DF09F91A8E2808DF09F91A9E2808DF09F91A7E2808DF09F91A6E2808DF09F91A7E2808DF09F91A7E2808DF09F91A6'

/*
https://onlineunicodetools.com/generate-zalgo-unicode
Uses Unicode combining characters to create symbol noise.
"hello" zalgoified:
h̵᷊̟͉͔̟̲͆e̷͇̼͉̲̾l̸̨͓̭̗᷿︣︠ͦl̶̯̻̑̈ͮ͌︡̕o̵̝̬̯᷊̭̯̦᷃ͪ̆́᷈́͜͢͞
*/
ooRexx[sh]> helloZalgo = "h̵᷊̟͉͔̟̲͆e̷͇̼͉̲̾l̸̨͓̭̗᷿︣︠ͦl̶̯̻̑̈ͮ͌︡̕o̵̝̬̯᷊̭̯̦᷃ͪ̆́᷈́͜͢͞"
ooRexx[sh]> helloZalgo =
T'h̵᷊̟͉͔̟̲͆e̷͇̼͉̲̾l̸̨͓̭̗᷿︣︠ͦl̶̯̻̑̈ͮ͌︡̕o̵̝̬̯᷊̭̯̦᷃ͪ̆́᷈́͜͢͞'
ooRexx[sh]> helloZalgo~text~description=                          -- 5 characters, 54 codepoints, 111 bytes
'UTF-8 not-ASCII (5 characters, 54 codepoints, 111 bytes, 0 error)'
ooRexx[sh]> helloZalgo~text~c2x=
'68 CD86 CCB5 E1B78A CC9F CD89 CD94 CC9F CCB2 65 CCBE CCB7 CD87 CCBC CD89 CCB2 6C EFB8A3 EFB8A0 CDA6 CCB8 CD93 CCAD CC97 E1B7BF CCA8 6C CC91 CC88 CDAE CD8C EFB8A1 CC95 CCB6 CCAF CCBB 6F E1B783 CDAA CC86 CC81 E1B788 CD81 CD9E CCB5 CC9D CD9C CCAC CCAF E1B78A CDA2 CCAD CCAF CCA6'
ooRexx[sh]> helloZalgo~text~c2u=
'U+0068 U+0346 U+0335 U+1DCA U+031F U+0349 U+0354 U+031F U+0332 U+0065 U+033E U+0337 U+0347 U+033C U+0349 U+0332 U+006C U+FE23 U+FE20 U+0366 U+0338 U+0353 U+032D U+0317 U+1DFF U+0328 U+006C U+0311 U+0308 U+036E U+034C U+FE21 U+0315 U+0336 U+032F U+033B U+006F U+1DC3 U+036A U+0306 U+0301 U+1DC8 U+0341 U+035E U+0335 U+031D U+035C U+032C U+032F U+1DCA U+0362 U+032D U+032F U+0326'
ooRexx[sh]> helloZalgo~text~c2g=
'68CD86CCB5E1B78ACC9FCD89CD94CC9FCCB2 65CCBECCB7CD87CCBCCD89CCB2 6CEFB8A3EFB8A0CDA6CCB8CD93CCADCC97E1B7BFCCA8 6CCC91CC88CDAECD8CEFB8A1CC95CCB6CCAFCCBB 6FE1B783CDAACC86CC81E1B788CD81CD9ECCB5CC9DCD9CCCACCCAFE1B78ACDA2CCADCCAFCCA6'

/*
Supported encoding conversions:
Byte to UTF-8, UTF-16, UTF-32
UTF-8 to UTF-16, UTF-32
UTF-16 to UTF-8, UTF-32
UTF-32 to UTF-8, UTF-16
*/

/*
The Byte_Encoding can be specialized to add support for specific encoding conversions.
*/
ooRexx[sh]> .Encoding~list~table==
a Table (45 items)
 437               : (The IBM437_Encoding class)
 819               : (The ISO88591_Encoding class)
 8859              : (The ISO88591_Encoding class)
 28591             : (The ISO88591_Encoding class)
 88591             : (The ISO88591_Encoding class)
'ASCII8BIT'        : (The Byte_Encoding class)
'BINARY'           : (The Byte_Encoding class)
'BYTE'             : (The Byte_Encoding class)
'BYTES'            : (The Byte_Encoding class)
'CP1252'           : (The WINDOWS1252_Encoding class)
'CP437'            : (The IBM437_Encoding class)
'CP5348'           : (The WINDOWS1252_Encoding class)
'CP65001'          : (The UTF8_Encoding class)
'CP819'            : (The ISO88591_Encoding class)
'CSISOLATIN1'      : (The ISO88591_Encoding class)
'CSPC8CODEPAGE437' : (The IBM437_Encoding class)
'CSWINDOWS1252'    : (The WINDOWS1252_Encoding class)
'IBM1252'          : (The IBM1252_Encoding class)
'IBM437'           : (The IBM437_Encoding class)
'IBM5348'          : (The WINDOWS1252_Encoding class)
'IBM819'           : (The ISO88591_Encoding class)
'ISO88591'         : (The ISO88591_Encoding class)
'ISO885911987'     : (The ISO88591_Encoding class)
'ISOIR100'         : (The ISO88591_Encoding class)
'L1'               : (The ISO88591_Encoding class)
'LATIN'            : (The ISO88591_Encoding class)
'LATIN1'           : (The ISO88591_Encoding class)
'UNICODE16'        : (The Unicode16_Encoding class)
'UNICODE32'        : (The Unicode32_Encoding class)
'UNICODE8'         : (The Unicode8_Encoding class)
'UTF16'            : (The UTF16BE_Encoding class)
'UTF16BE'          : (The UTF16BE_Encoding class)
'UTF16LE'          : (The UTF16LE_Encoding class)
'UTF32'            : (The UTF32BE_Encoding class)
'UTF32BE'          : (The UTF32BE_Encoding class)
'UTF32LE'          : (The UTF32LE_Encoding class)
'UTF8'             : (The UTF8_Encoding class)
'WE8ISO8859P1'     : (The ISO88591_Encoding class)
'WINDOWS1252'      : (The WINDOWS1252_Encoding class)
'WINDOWS28591'     : (The ISO88591_Encoding class)
'WINDOWS437'       : (The IBM437_Encoding class)
'WTF16'            : (The WTF16BE_Encoding class)
'WTF16BE'          : (The WTF16BE_Encoding class)
'WTF16LE'          : (The WTF16LE_Encoding class)
'WTF8'             : (The WTF8_Encoding class)

/*
Example: CP1252 to UTF-8 (where CP1252 is an alias of Windows-1252)
"Un œuf de chez MaPoule™ coûte ±0.40€"
*/
ooRexx[sh]> str_cp1252 = "Un " || "9C"x || "uf de chez MaPoule" || "99"x || " co" || "FB"x || "te " || "B1"x || "0.40" || "80"x
ooRexx[sh]> txt_cp1252 = str_cp1252~text("cp1252")
ooRexx[sh]> txt_cp1252~description=
'windows-1252 not-ASCII (36 characters, 36 codepoints, 36 bytes, 0 error)'
ooRexx[sh]> txt_cp1252~c2x=
'55 6E 20 9C 75 66 20 64 65 20 63 68 65 7A 20 4D 61 50 6F 75 6C 65 99 20 63 6F FB 74 65 20 B1 30 2E 34 30 80'
ooRexx[sh]> txt_utf8 = txt_cp1252~utf8
ooRexx[sh]> txt_utf8=
T'Un œuf de chez MaPoule™ coûte ±0.40€'
ooRexx[sh]> txt_utf8~description=
'UTF-8 not-ASCII (36 characters, 36 codepoints, 43 bytes, 0 error)'
ooRexx[sh]> txt_utf8~c2x=
'55 6E 20 C593 75 66 20 64 65 20 63 68 65 7A 20 4D 61 50 6F 75 6C 65 E284A2 20 63 6F C3BB 74 65 20 C2B1 30 2E 34 30 E282AC'

/*
Strings of codepoints encoded as native integers.
3 representations:
    Unicode8_Encoding
    Unicode16_Encoding
    Unicode32_Encoding.
The method ~unicode returns one of these encodings, depending on the character
with the largest Unicode codepoint (1, 2, or 4 bytes) in the source string.
Unlike the flexible representation of Python, the 3 representations are first-class.
No BOM, the endianness is that of the CPU.
Unicode32_Encoding can be used with utf8proc for the functions taking a buffer of 32-bit integers.
*/

-- Just an interpretative layer put above the string
ooRexx[sh]> "côté"~text("unicode8")~pipe{item~description(short:1) ":" item~c2x}=
'Unicode8 not-ASCII : 63 C3 B4 74 C3 A9'

-- UTF-8 converted to Unicode8
ooRexx[sh]> "côté"~text~unicode~pipe{item~description(short:1) ":" item~c2x}=
'Unicode8 not-ASCII : 63 F4 74 E9'
ooRexx[sh]> "noël‍👨‍👩‍👧"~text~maximumCodepoint~pipe{"U+"item~d2x}=   -- U+1F469 is the maximum codepoint
'U+1F469'
ooRexx[sh]> "noël‍👨‍👩‍👧"~text~unicode~description(technical:1)=      -- For this maximum codepoint, we need Unicode32
'Unicode32 (5 characters (1 index from index 5), 10 codepoints (0 index), 40 bytes, 0 error)'

-- The endianness of the UnicodeN_Encoding is the one of the machine.
-- With an Intel CPU, it's little-endian.
ooRexx[sh]> "noël‍👨‍👩‍👧"~text~unicode~c2x=
'6E000000 6F000000 EB000000 6C000000 0D200000 68F40100 0D200000 69F40100 0D200000 67F40100'

-- The default endianness for UTF32 is big-endian.
ooRexx[sh]> "noël‍👨‍👩‍👧"~text~utf32~c2x=
'0000006E 0000006F 000000EB 0000006C 0000200D 0001F468 0000200D 0001F469 0000200D 0001F467'

/*
Comparing the size of UTF-8 vs UTF-16 vs UTF-32 for various strings.
These strings are initially UTF-8 encoded.
The first step is to get a wrapper RexxText (default encoding is UTF-8).
The second step is to convert to UTF-16 or UTF-32.
The description includes technical information (technical:1) about the internal indexing tables.
*/
ooRexx[sh]> howMuchOfStorage = "how much of storage?"
ooRexx[sh]> howMuchOfStorage~text~description(technical:1)=         -- UTF-8:     20 characters, 20 codepoints,  20 bytes
'UTF-8 ASCII (20 characters (0 index), 20 codepoints (0 index), 20 bytes, 0 error)'
ooRexx[sh]> howMuchOfStorage~text~utf16~description(technical:1)=   -- UTF-16:    20 characters, 20 codepoints,  40 bytes
'UTF-16BE (20 characters (0 index), 20 codepoints (0 index), 40 bytes, 0 error)'
ooRexx[sh]> howMuchOfStorage~text~utf32~description(technical:1)=   -- UTF-32:    20 characters, 20 codepoints,  80 bytes
'UTF-32BE (20 characters (0 index), 20 codepoints (0 index), 80 bytes, 0 error)'
ooRexx[sh]> howMuchOfStorage~text~unicode~description(technical:1)= -- Unicode8:  20 characters, 20 codepoints,  20 bytes
'Unicode8 ASCII (20 characters (0 index), 20 codepoints (0 index), 20 bytes, 0 error)'
ooRexx[sh]> rexCharacters = "'rex' in their name: ꎅ ꎜ ꏑ 🦖"
ooRexx[sh]> rexCharacters~text~description(technical:1)=            -- UTF-8:     28 characters, 28 codepoints,  37 bytes
'UTF-8 not-ASCII (28 characters (6 indexes from index 23), 28 codepoints (6 indexes from index 23), 37 bytes, 0 error)'
ooRexx[sh]> rexCharacters~text~utf16~description(technical:1)=      -- UTF-16:    28 characters, 28 codepoints,  58 bytes
'UTF-16BE (28 characters (0 index), 28 codepoints (0 index), 58 bytes, 0 error)'
ooRexx[sh]> rexCharacters~text~utf32~description(technical:1)=      -- UTF-32:    28 characters, 28 codepoints, 112 bytes
'UTF-32BE (28 characters (0 index), 28 codepoints (0 index), 112 bytes, 0 error)'
ooRexx[sh]> rexCharacters~text~unicode~description(technical:1)=    -- Unicode32: 28 characters, 28 codepoints, 112 bytes
'Unicode32 (28 characters (0 index), 28 codepoints (0 index), 112 bytes, 0 error)'
ooRexx[sh]> family = "👩‍👨‍👩‍👧‍👦‍👧‍👧‍👦"
ooRexx[sh]> family~text~description(technical:1)=                   -- UTF-8:      1 character,  15 codepoints,  53 bytes
'UTF-8 not-ASCII (1 character (0 index), 15 codepoints (14 indexes from index 2), 53 bytes, 0 error)'
ooRexx[sh]> family~text~utf16~description(technical:1)=             -- UTF-16:     1 character,  15 codepoints,  46 bytes
'UTF-16BE (1 character (0 index), 15 codepoints (14 indexes from index 2), 46 bytes, 0 error)'
ooRexx[sh]> family~text~utf32~description(technical:1)=             -- UTF-32:     1 character,  15 codepoints,  60 bytes
'UTF-32BE (1 character (0 index), 15 codepoints (0 index), 60 bytes, 0 error)'
ooRexx[sh]> family~text~unicode~description(technical:1)=           -- Unicode32:  1 character,  15 codepoints,  60 bytes
'Unicode32 (1 character (0 index), 15 codepoints (0 index), 60 bytes, 0 error)'
ooRexx[sh]> helloZalgo = "h̵᷊̟͉͔̟̲͆e̷͇̼͉̲̾l̸̨͓̭̗᷿︣︠ͦl̶̯̻̑̈ͮ͌︡̕o̵̝̬̯᷊̭̯̦᷃ͪ̆́᷈́͜͢͞"
ooRexx[sh]> helloZalgo~text~description(technical:1)=               -- UTF-8:      5 characters, 54 codepoints, 111 bytes
'UTF-8 not-ASCII (5 characters (4 indexes from index 2), 54 codepoints (52 indexes from index 3), 111 bytes, 0 error)'
ooRexx[sh]> helloZalgo~text~utf16~description(technical:1)=         -- UTF-16:     5 characters, 54 codepoints, 108 bytes
'UTF-16BE (5 characters (4 indexes from index 2), 54 codepoints (0 index), 108 bytes, 0 error)'
ooRexx[sh]> helloZalgo~text~utf32~description(technical:1)=         -- UTF-32:     5 characters, 54 codepoints, 216 bytes
'UTF-32BE (5 characters (4 indexes from index 2), 54 codepoints (0 index), 216 bytes, 0 error)'
ooRexx[sh]> helloZalgo~text~unicode~description(technical:1)=       -- Unicode16:  5 characters, 54 codepoints, 108 bytes
'Unicode16 (5 characters (4 indexes from index 2), 54 codepoints (0 index), 108 bytes, 0 error)'


/*
It's possible to set/get an encoding on a String or a MutableBuffer
without having an associated RexxText.
It's just an annotation, there is no indexing.
*/

ooRexx[sh]> s = "nonsense"
ooRexx[sh]> s~encoding =                    -- returns the default encoding: (The UTF8_Encoding class)
(The UTF8_Encoding class)
ooRexx[sh]> s~hasText =                     -- 0
 0
ooRexx[sh]> s~encoding = .UTF16BE_Encoding  -- tag the string: encoded UTF-16BE
ooRexx[sh]> s~encoding =                    -- (The UTF16BE_Encoding class)
(The UTF16BE_Encoding class)
ooRexx[sh]> s~hasText =                     -- still no associated RexxText: 0
 0
ooRexx[sh]> t = s~text                      -- associates a RexxText to the string
ooRexx[sh]> s~hasText =                     -- the string has an associated text: 1
 1
ooRexx[sh]> t~encoding =                    -- the encoding of the text is the one of the string: (The UTF16BE_Encoding class)
(The UTF16BE_Encoding class)

-- Changing the encoding of a String/RexxText has no impact on the byte sequence.
ooRexx[sh]> s=                              -- "nonsense"
'nonsense'
ooRexx[sh]> t~string=                       -- "nonsense"
'nonsense'

-- The impact of the encoding annotation is on the decoding of the byte sequence.
-- The English word "nonsense" in UTF-8 becomes the Chinese word "soup" when interpreted as UTF-16BE and converted to UTF-8.
ooRexx[sh]> t~utf8 =                        -- T'湯湳敮獥'      soup
T'湯湳敮獥'

-- Setting/getting the encoding of the string will set/get the encoding of the associated RexxText
ooRexx[sh]> s~encoding = .UTF16LE_Encoding
ooRexx[sh]> t~encoding =                    -- the encoding of the text has been changed: (The UTF16LE_Encoding class)
(The UTF16LE_Encoding class)
ooRexx[sh]> 
-- The English word "nonsense" in UTF-8 becomes the Chinese word "tide" when interpreted as UTF-16LE and converted to UTF-8.
ooRexx[sh]> t~utf8 =                        -- T'潮獮湥敳'      tide
T'潮獮湥敳'


/*
End of demonstration.
*/