loadPackage OK for extension/stringChunk.cls
loadPackage OK for utilities/indentedStream.cls
loadPackage OK for extension/extensions.cls
loadLibrary OK for rxunixsys
loadPackage OK for ncurses.cls
loadPackage OK for csvStream.cls
loadLibrary OK for hostemu
loadPackage OK for json.cls
loadPackage OK for mime.cls
loadPackage OK for rxftp.cls
loadLibrary OK for rxmath
loadPackage OK for rxregexp.cls
loadPackage OK for regex/regex.cls
loadPackage OK for smtp.cls
loadPackage OK for socket.cls
loadPackage OK for streamsocket.cls
loadPackage OK for pipeline/pipe.cls
loadPackage OK for rgf_util2/rgf_util2.rex
loadPackage OK for BSF.CLS
loadPackage OK for oorexxshell_queries.cls
loadPackage OK for pipeline/pipe_extension.cls
loadPackage OK for rgf_util2/rgf_util2_wrappers.rex

REXX-ooRexx_4.3.0(MT)_64-bit 6.04 22 Apr 2024
Input queue name: ScbeQ600003f06560

ooRexx> call loadUnicodeCharacterNames
Load the Unicode character names 15.1.0 
............................................
Total loaded character names: 149813
Total character name aliases: 473
Unicode character intervals not expanded, execute: call expandUnicodeCharacterIntervals

ooRexx> .Unicode~memorizeTranscodings = .false
ooRexx> .Unicode~memorizeTransformations = .false


-- ===============================================================================
-- 2024 Apr 24

/*
Rework the support of encoding for RexxBlock.
The definition doesn't change:
A RexxBlock has the same encoding as its definition package.
New methods:
    encoding
    encoding=
    hasEncoding
    setEncoding
Examples:
*/
ooRexx> block = {say .context~package~encoding; s1 = "Père Noël"; say s1~class s1~encoding; s2 = "Père" "Noël"; say s~class s~encoding}
ooRexx> block~hasEncoding=                                  -- 1
 1
ooRexx> block~encoding=                                     -- (The UTF8_Encoding class)
(The UTF8_Encoding class)
ooRexx> block~()
The UTF8_Encoding class
The RexxText class The UTF8_Encoding class
The String class The UTF8_Encoding class
/*
    The UTF8_Encoding class                         -- encoding of the definition package
    The RexxText class The UTF8_Encoding class      -- encoding of the definition package (string literal)
    The String class The UTF8_Encoding class        -- encoding of the calculated string
*/

-- Changing the block encoding
ooRexx> block = {say .context~package~encoding; s1 = "Père Noël"; say s1~class s1~encoding; s2 = "Père" "Noël"; say s~class s~encoding}
ooRexx> oldEncoding = block~setEncoding("byte")
ooRexx> oldEncoding=                                        -- (The UTF8_Encoding class)
(The UTF8_Encoding class)
ooRexx> block~hasEncoding=                                  -- 1
 1
ooRexx> block~encoding=                                     -- (The Byte_Encoding class)
(The Byte_Encoding class)
ooRexx> block~()
The Byte_Encoding class
The String class The Byte_Encoding class
The String class The UTF8_Encoding class
/*
    The Byte_Encoding class                         -- encoding of the definition package
    The String class The Byte_Encoding class        -- encoding of the definition package (string literal)
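    The String class The UTF8_Encoding class        -- Calculated string. TODO should be The Byte_Encoding
    NOTE(review): the block assigns s2 but prints s~class s~encoding, so this line may describe the variable s rather than the calculated string s2 -- to be confirmed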
*/
ooRexx> block~setEncoding(oldEncoding)
ooRexx> block~hasEncoding=                                  -- 1
 1
ooRexx> block~encoding=                                     -- (The UTF8_Encoding class)
(The UTF8_Encoding class)
ooRexx> block~()
The UTF8_Encoding class
The String class The Byte_Encoding class
The String class The UTF8_Encoding class
/*
    The UTF8_Encoding class
    The String class The Byte_Encoding class        -- Once a string literal has a stored encoding, it doesn't change
    The String class The UTF8_Encoding class
*/


-- ===============================================================================
-- 2024 Apr 22

/*
The encoding of a string literal is the encoding of its definition package.
It is set when the string literal is first evaluated.
Once a string literal has received its encoding, it does not change even if the
package encoding is changed later. Only string literals not yet evaluated will
be impacted. It is possible to explicitly change the encoding using the
~setEncoding method or using the ~encoding = new_encoding assignment.
The same goes for the default encoding. Once a calculated string has received
its encoding, it does not change even if the default encoding is changed later.
Examples:
*/
ooRexx> system rexx string_literal_encoding/package_main.rex
prolog of package_byte.cls     The Byte_Encoding class
prolog of package_cp1252.cls   The WINDOWS1252_Encoding class
prolog of package_utf8.cls     The UTF8_Encoding class
prolog of package_utf16be.cls  The UTF16BE_Encoding class
prolog of package_utf32be.cls  The UTF32BE_Encoding class

.package_byte~m_name           package_byte                   String                         The Byte_Encoding class
.package_byte~m_noel           Noël                          String                         The Byte_Encoding class
.package_byte~m_noel_x         Noël                          String                         The Byte_Encoding class
.package_byte~m_noel_x2c       Noël                          String                         The UTF8_Encoding class

.package_cp1252~m_name         package_cp1252                 String                         The WINDOWS1252_Encoding class
.package_cp1252~m_noel         Noël                          String                         The WINDOWS1252_Encoding class
.package_cp1252~m_noel_x       Noël                          String                         The WINDOWS1252_Encoding class
.package_cp1252~m_noel_x2c     Noël                          String                         The UTF8_Encoding class

.package_utf8~m_name           package_utf8                   String                         The UTF8_Encoding class
.package_utf8~m_noel           Noël                           RexxText                       The UTF8_Encoding class
.package_utf8~m_noel_x         Noël                           RexxText                       The UTF8_Encoding class
.package_utf8~m_noel_x2c       Noël                          String                         The UTF8_Encoding class

.package_utf16be~m_name        package_utf16be                String                         The UTF16BE_Encoding class
.package_utf16be~m_noel        乯쎫氀                          � RexxText                       The UTF16BE_Encoding class
.package_utf16be~m_noel_x      Noël                           RexxText                       The UTF16BE_Encoding class
.package_utf16be~m_noel_x2c    No�l                       String                         The UTF8_Encoding class

.package_utf32be~m_name        package_utf32be                String                         The UTF32BE_Encoding class
.package_utf32be~m_noel        ������������������������������ RexxText                       The UTF32BE_Encoding class
.package_utf32be~m_noel_x      Noël                           RexxText                       The UTF32BE_Encoding class
.package_utf32be~m_noel_x2c    No�l               String                         The UTF8_Encoding class


/*
As a consequence of the previous rule, hexadecimal and binary strings are no longer
declared Byte encoded. Their encoding is now given by their definition package.
The same goes for the BIFs/BIMs D2C and X2C: their results are no longer declared
Byte encoded. Since these results have no assigned encoding, their encoding depends
on the default encoding.
Examples:
*/
ooRexx> "41"x=                      -- 'A'
'A'
ooRexx> "41"x~hasEncoding=          -- 1    The encoding is stored
 1
ooRexx> "41"x~encoding=             -- (The UTF8_Encoding class)    This is the encoding of the definition package
(The UTF8_Encoding class)
ooRexx> "41"~x2c=                   -- 'A'
'A'
ooRexx> "41"~x2c~hasEncoding=       -- 0    No stored encoding
 0
ooRexx> "41"~x2c~encoding=          -- (The UTF8_Encoding class)    This is the default encoding
(The UTF8_Encoding class)


/*
For the proper management of the encoding of string literals, the globalStrings
directory is no longer used by the parser when building an image.
Now, each source (package) manages its own directory, even when building an image.
For the moment, all the packages that are included in rexx.img are byte encoded,
so this change is not strictly needed yet. But in the future, rexx.img may include
packages with different encodings.
*/


/*
It's now possible to reset the encoding of a string, mutable buffer or package
by passing .nil when using
    target~encoding = .nil
    target~setEncoding(.nil)
After reset, the encoding is no longer stored and the default encoding is returned.
A RexxText always has an encoding, so an error is raised when passing .nil.
The same error is raised when the target is a string linked to a RexxText.
Examples:
*/
ooRexx> s = "Noel"
ooRexx> s~description=                          -- 'UTF-8 ASCII (4 bytes)'
'UTF-8 ASCII (4 bytes)'
ooRexx> oldEncoding = s~setEncoding(.nil)
ooRexx> oldEncoding=                            -- (The UTF8_Encoding class)
(The UTF8_Encoding class)
ooRexx> s~description=                          -- 'UTF-8 ASCII by default (4 bytes)'
'UTF-8 ASCII by default (4 bytes)'
ooRexx> s~setEncoding(oldEncoding)
ooRexx> s~description=                          -- 'UTF-8 ASCII (4 bytes)'
'UTF-8 ASCII (4 bytes)'

ooRexx> t = "Noël"
ooRexx> t~description=                          -- 'UTF-8 not-ASCII (4 characters, 4 codepoints, 5 bytes, 0 error)'
'UTF-8 not-ASCII (4 characters, 4 codepoints, 5 bytes, 0 error)'
ooRexx> t~setEncoding(.nil)                     -- Encoding: 'The NIL object' is not supported
Encoding: 'The NIL object' is not supported.
Error code= 93.900
ooRexx> s = t~string
ooRexx> s~description=                          -- 'UTF-8 not-ASCII (4 characters, 4 codepoints, 5 bytes, 0 error)'
'UTF-8 not-ASCII (4 characters, 4 codepoints, 5 bytes, 0 error)'
ooRexx> s~setEncoding(.nil)                     -- Encoding: 'The NIL object' is not supported
Encoding: 'The NIL object' is not supported.
Error code= 93.900


/*
The method ~setEncoding returns .nil when the target has no stored encoding.
That makes it possible to properly reset the encoding when restoring the previous value.
Note: the method ~encoding never returns .nil. It returns the default encoding
when no encoding is stored.
Examples:
*/
ooRexx> .context~package~hasEncoding=                       -- 0                            The encoding is not stored
 0
ooRexx> .context~package~encoding=                          -- (The UTF8_Encoding class)    It's the default encoding
(The UTF8_Encoding class)
ooRexx> oldEncoding = .context~package~setEncoding("byte")
ooRexx> oldEncoding=                                        -- (The NIL object)
(The NIL object)
ooRexx> .context~package~hasEncoding=                       -- 1                            The encoding is stored
 1
ooRexx> .context~package~encoding=                          -- (The Byte_Encoding class)
(The Byte_Encoding class)
ooRexx> .context~package~setEncoding(oldEncoding)=          -- (The Byte_Encoding class)    Previous encoding
(The Byte_Encoding class)
ooRexx> .context~package~hasEncoding=                       -- 0                            Return to non-stored encoding
 0
ooRexx> .context~package~encoding=                          -- (The UTF8_Encoding class)    It's the default encoding
(The UTF8_Encoding class)


/*
New methods:
    .String~detach
    .RexxText~detach
The string is detached from its text counterpart.
The text becomes an empty text "".
Useful when working with big strings, to reclaim memory.
No need to call ~detach on both targets. There is a forward to the counterpart.
Examples:
*/
ooRexx> s = "Noel"
ooRexx> t = s~text
ooRexx> t=              -- T'Noel'
T'Noel'
ooRexx> s~hasText=      -- 1
 1
ooRexx> s~detach
ooRexx> s~hasText=      -- 0
 0
ooRexx> t=              -- T''
T''

ooRexx> t = "Noël"
ooRexx> s = t~string
ooRexx> t=              -- T'Noël'
T'Noël'
ooRexx> s~hasText=      -- 1
 1
ooRexx> t~detach
ooRexx> s~hasText=      -- 0
 0
ooRexx> t=              -- T''
T''


/*
New methods:
    .String~byte
    .RexxText~byte
Returns a copy of the string or text, with encoding = The Byte_Encoding.
The Byte_Encoding is a raw encoding with few constraints. It's often used for
diagnostics or repair. It can always be absorbed when doing a concatenation or a
comparison. BUT it's impossible to transcode from/to it without errors if the
string contains not-ASCII characters. Here, no transcoding is done: it's an as-is
copy whose encoding is The Byte_Encoding.
Examples:
*/
ooRexx> "50C3"x~description=                    -- 'UTF-8 not-ASCII (2 characters, 2 codepoints, 2 bytes, 1 error)'
'UTF-8 not-ASCII (2 characters, 2 codepoints, 2 bytes, 1 error)'
ooRexx> "Père"~text~startsWith("50C3"x)=        -- Invalid UTF-8 string (raised by utf8proc)
Invalid UTF-8 string.
Error code= 22.900
ooRexx> "50C3"x~byte~description=               -- 'Byte not-ASCII (2 characters, 2 codepoints, 2 bytes, 0 error)'
'Byte not-ASCII (2 characters, 2 codepoints, 2 bytes, 0 error)'
ooRexx> "Père"~text~startsWith("50C3"x~byte)=   -- 0 (not aligned)
 0


/*
New methods:
    .String~bytes
    .RexxText~bytes
Returns a ByteSupplier which provides each byte in decimal.
Examples:
*/
ooRexx> "Noel"~bytes==
a ByteSupplier 
 1 :  78
 2 :  111
 3 :  101
 4 :  108
ooRexx> "Noël"~bytes==
a ByteSupplier 
 1 :  78
 2 :  111
 3 :  195
 4 :  171
 5 :  108


-- ===============================================================================
-- 2024 Apr 12

/*
[interpreter]

Add support for dynamic target when sending messages.
The target is calculated based on the initial target and the arguments
values/types of the message. It's still a single-dispatch.
The ~~ form of message is not impacted: it returns the object that received the
message (the initial target), not the calculated target.


New method .Object~dynamicTarget which returns the target as a function of the arguments:
    RexxObject *RexxObject::dynamicTargetRexx(RexxObject **arguments, size_t argCount, size_t named_argCount)
    {
        return this->dynamicTarget(arguments, argCount, named_argCount);
    }
By default, the dynamic target is the receiver object.
Native classes can override the virtual method dynamicTarget.
For the moment, it's not possible to override this method with an ooRexx method.
Examples:
*/
ooRexx> (1,2)~dynamicTarget=                       -- initial target: [ 1, 2]
[ 1, 2]
ooRexx> (1,2)~dynamicTarget("string")=             -- initial target: [ 1, 2]
[ 1, 2]
ooRexx> (1,2)~dynamicTarget("string", "teẌt")=     -- initial target: [ 1, 2]
[ 1, 2]


/*
The forward instruction does not depend on the dynamic target calculation.
If you need to forward using the dynamic target then do:
    forward message "DYNAMICTARGET" continue
    forward to (result)
*/


/*
[Encoded strings]

+---------------------------------------------------------------+
|                   3rd important milestone                     |
| The String messages become polymorphic on RexxString/RexxText |
+---------------------------------------------------------------+
If at least one positional argument is a RexxText then the String message is
sent to the RexxText counterpart of the String instance, otherwise the String
message is sent to the String instance.

The RexxString class overrides the virtual method dynamicTarget:
    RexxObject *RexxString::dynamicTarget(RexxObject **arguments, size_t count, size_t named_count)
    {
        if (hasRexxTextArguments(arguments, count, named_count))
        {
            RexxText *text = this->requestText();
            return text;
        }
        return this;
    }
Examples:
*/
ooRexx> "Noel"~dynamicTarget=                       -- initial target: 'Noel'
'Noel'
ooRexx> "Noel"~dynamicTarget("string")=             -- initial target: 'Noel'
'Noel'
ooRexx> "Noel"~dynamicTarget("string", "teẌt")=     -- text counterpart of the initial target: T'Noel'  because "teẌt" is a RexxText
T'Noel'


/*
Examples of dynamic target with ~center:
*/
ooRexx> "é"~c2x=; "é"~class=                        -- 'C3A9'   (The RexxText class)
'C3A9'
(The RexxText class)
ooRexx> "test"~center(10, "é")=                     -- T'ééétestééé'
T'ééétestééé'

ooRexx> "C3A9"x=; result~description=               -- T'é'     'UTF-8 not-ASCII (1 character, 1 codepoint, 2 bytes, 0 error)'
T'é'
'UTF-8 not-ASCII (1 character, 1 codepoint, 2 bytes, 0 error)'
ooRexx> "test"~center(10, "C3A9"x)=                 -- T'ééétestééé'
T'ééétestééé'

ooRexx> x2c("C3A9")=; result~description=           -- 'é'      'UTF-8 not-ASCII by default (2 bytes)'
'é'
'UTF-8 not-ASCII by default (2 bytes)'
-- next error is ok: the pad is a string made of 2 bytes
ooRexx> "test"~center(10, x2c("C3A9"))=             -- Incorrect pad or character argument specified; found "é"
Incorrect pad or character argument specified; found "é"
Error code= 93.922


/*
Examples of dynamic target with ~left:
*/
ooRexx> "test"~left(10)=                            -- 'test      '
'test      '
ooRexx> "test"~left(10, ".")=                       -- 'test......'
'test......'
ooRexx> "test"~left(10, "🦖")=                     -- T'test🦖🦖🦖🦖🦖🦖'
T'test🦖🦖🦖🦖🦖🦖'


/*
The ~~ form of message is not impacted: it always returns the initial target
*/
ooRexx> "test"~right(10, "é")~left(20, "è")=        -- T'éééééétestèèèèèèèèèè'
T'éééééétestèèèèèèèèèè'
ooRexx> "test"~~right(10, "é")~left(20, "è")=       -- T'testèèèèèèèèèèèèèèèè'
T'testèèèèèèèèèèèèèèèè'
ooRexx> "test"~right(10, "é")~~left(20, "è")=       -- T'éééééétest'
T'éééééétest'
ooRexx> "test"~~right(10, "é")~~left(20, "è")=      -- 'test'
'test'


/*
[doers]

RexxText inherits from TextDoer.
Examples:
*/
ooRexx> "c2x"~text~do("a")=                     -- 61  (was Object "c2x" does not understand message "DO")
 61
ooRexx> "ça va ?"~characters~each=              -- [T'ç',T'a',T' ',T'v',T'a',T' ',T'?']
[T'ç',T'a',T' ',T'v',T'a',T' ',T'?']
ooRexx> "ça va ?"~characters~each("c2x")=       -- ['C3A7', 61, 20, 76, 61, 20,'3F']
['C3A7', 61, 20, 76, 61, 20,'3F']


/*
A RexxBlock has the same encoding as its definition package.
Examples:
*/
ooRexx> {.context~package~encoding}~()=         -- (The UTF8_Encoding class)
(The UTF8_Encoding class)

ooRexx> oldEncoding = .context~package~setEncoding("byte")
ooRexx> {.context~package~encoding}~()=         -- (The Byte_Encoding class)
(The Byte_Encoding class)
ooRexx> .context~package~setEncoding(oldEncoding)

-- was: Incorrect pad or character argument specified; found "é"
-- because the package encoding of the block was The Byte_Encoding (default)
-- and the string literal "é" was not converted to a RexxText instance.
-- Now the package encoding of the block is The UTF8_Encoding and it works:
ooRexx> ("un", "deux")~each{item~right(10, "é")}==
an Array (shape [2], 2 items)
 1 : T'ééééééééun'
 2 : T'éééééédeux'


-- ===============================================================================
-- 2024 Apr 10

/*
+-----------------------------------------------------------+
|                  2nd important milestone                  |
| The string BIFs become polymorphic on RexxString/RexxText |
+-----------------------------------------------------------+
If at least one positional argument is a RexxText then the string BIFs forward
to RexxText, otherwise the string BIFs forward to RexxString.
Enhanced BIFs:
    ABBREV
    CENTER      implemented on RexxText
    CENTRE      implemented on RexxText
    CHANGESTR
    COMPARE     implemented on RexxText
    COPIES      implemented on RexxText
    COUNTSTR
    D2C         implemented on RexxText
    DELSTR
    DELWORD
    INSERT
    LASTPOS
    LEFT        implemented on RexxText
    LENGTH      implemented on RexxText
    LOWER       implemented on RexxText
    OVERLAY
    POS         implemented on RexxText
    REVERSE     implemented on RexxText
    RIGHT       implemented on RexxText
    SPACE
    STRIP       implemented on RexxText
    SUBSTR      implemented on RexxText
    SUBWORD
    UPPER       implemented on RexxText
    VERIFY
    WORD
    WORDINDEX
    WORDLENGTH
    WORDPOS
    WORDS
    X2C         implemented on RexxText
Examples:
*/
-- CENTER
ooRexx> CENTER("Noel", 10, "*")=                        -- '***Noel***'
'***Noel***'
ooRexx> CENTER("Noel", 10, "🤶")=                       -- T'🤶🤶🤶Noel🤶🤶🤶'  because "🤶" is a RexxText
T'🤶🤶🤶Noel🤶🤶🤶'
ooRexx> CENTER("Noël", 10, "*")=                        -- T'***Noël***'            because "Noël" is a RexxText
T'***Noël***'
ooRexx> CENTER("Noël"~string, 10, "*")=                 --  '**Noël***'
'**Noël***'
ooRexx> CENTER("Noël", 10, "🤶")=                       -- T'🤶🤶🤶Noël🤶🤶🤶'
T'🤶🤶🤶Noël🤶🤶🤶'
ooRexx> CENTER("Noël"~string, 10, "🤶")=                -- T'🤶🤶🤶Noël🤶🤶🤶'  because "🤶" is a RexxText
T'🤶🤶🤶Noël🤶🤶🤶'
ooRexx> CENTER("Noël", 10, "🤶"~string)=                -- T'🤶🤶🤶Noël🤶🤶🤶'  because "Noël" is a RexxText
T'🤶🤶🤶Noël🤶🤶🤶'
ooRexx> CENTER("Noel", 10, "🤶"~string)=                -- CENTER positional argument 3 must be a single character; found "🤶"
CENTER positional argument 3 must be a single character; found "🤶".
Error code= 40.23
ooRexx> CENTER("Noël"~string, 10, "🤶"~string)=         -- CENTER positional argument 3 must be a single character; found "🤶"
CENTER positional argument 3 must be a single character; found "🤶".
Error code= 40.23

-- Other BIFs
ooRexx> ABBREV("Printer","Pri")=                        --  1
 1
ooRexx> ABBREV("Printer 🖨","Pri")=                     -- Object "Printer 🖨" does not understand message "ABBREV"
Object "Printer 🖨" does not understand message "ABBREV".
Error code= 97.1
ooRexx> CHANGESTR("p", "mpNoelpp", "m", 2)=             -- 'mmNoelmp'
'mmNoelmp'
ooRexx> CHANGESTR("🎅", "🤶🎅Noël🎅🎅", "🤶", 2)=   -- Object "🤶🎅Noël🎅🎅" does not understand message "CHANGESTR"
Object "🤶🎅Noël🎅🎅" does not understand message "CHANGESTR".
Error code= 97.1
ooRexx> COMPARE("straSssSSssse", "stra", "S")=          -- 6
 6
ooRexx> COMPARE("straßssßßssse", "stra", "ß")=          -- 6
 6
ooRexx> COPIES("🤶", 4)=                                -- T'🤶🤶🤶🤶'
T'🤶🤶🤶🤶'
ooRexx> COUNTSTR("m", "mpmp")=                          --  2
 2
ooRexx> COUNTSTR("🤶", "🤶🎅🤶🎅")=                 -- Object "🤶🎅🤶🎅" does not understand message "COUNTSTR"
Object "🤶🎅🤶🎅" does not understand message "COUNTSTR".
Error code= 97.1
ooRexx> D2C(65)=                                        -- 'A'
'A'
ooRexx> D2C(65~text)=                                   -- T'A'
T'A'
ooRexx> DELSTR("Noel", 3, 2)=                           -- 'No'
'No'
ooRexx> DELSTR("Noël", 3, 2)=                           -- Object "Noël" does not understand message "DELSTR"
Object "Noël" does not understand message "DELSTR".
Error code= 97.1
ooRexx> DELWORD("Pere Noel p", 2, 2)=                   -- 'Pere '
'Pere '
ooRexx> DELWORD("Père Noël 🎅", 2, 2)=                  -- Object "Père Noël 🎅" does not understand message "DELWORD"
Object "Père Noël 🎅" does not understand message "DELWORD".
Error code= 97.1
ooRexx> INSERT("123", "abc", 5, 6, "+")=                -- 'abc++123+++'
'abc++123+++'
ooRexx> INSERT("123", "abc", 5, 6, "🎅")=               -- Object "abc" does not understand message "INSERT"
Object "abc" does not understand message "INSERT".
Error code= 97.1
ooRexx> LASTPOS("m", "mMere Noelm")=                    -- 11
 11
ooRexx> LASTPOS("🤶", "🤶Mère Noël🤶")=                 -- Object "🤶Mère Noël🤶" does not understand message "LASTPOS"
Object "🤶Mère Noël🤶" does not understand message "LASTPOS".
Error code= 97.1
ooRexx> LEFT("abc d",8,".")=                            -- 'abc d...'
'abc d...'
ooRexx> LEFT("abc d",8,"🤶")=                           -- T'abc d🤶🤶🤶'
T'abc d🤶🤶🤶'
ooRexx> LENGTH("Père Noël 🎅"~string)=                  -- 16
 16
ooRexx> LENGTH("Père Noël 🎅")=                         -- 11
 11
ooRexx> LOWER("PÈRE NOËL")=                             -- T'père noël'
T'père noël'
ooRexx> OVERLAY("123","abc",5,6,"+")=                   -- 'abc+123+++'
'abc+123+++'
ooRexx> OVERLAY("123","abc",5,6,"🤶")=                  -- Object "abc" does not understand message "OVERLAY"
Object "abc" does not understand message "OVERLAY".
Error code= 97.1
ooRexx> POS("Frei", "Bundesstraße im Freiland")=        -- 17
 17
ooRexx> REVERSE("Noël")=                                -- T'lëoN'
T'lëoN'
ooRexx> RIGHT("12",5,"0")=                              --  00012
 00012
ooRexx> RIGHT("12",5,"𝟶")=                             -- T'𝟶𝟶𝟶12'
T'𝟶𝟶𝟶12'
ooRexx> SPACE("abc  def  ",2,"+")=                      -- 'abc++def'
'abc++def'
ooRexx> SPACE("abc  def  ",2,"⊕")=                      -- Object "abc  def  " does not understand message "SPACE"
Object "abc  def  " does not understand message "SPACE".
Error code= 97.1
ooRexx> STRIP("12.0000", "T", '.0')=                    --  12
 12
ooRexx> STRIP("12.øøøø", "T", '.ø')=                   -- T'12'    where 'ø'~c2x='C3B8'.
T'12'
ooRexx> STRIP(("12.øø" || "C3"x || "øø")~string, "T", '.ø'~string)=    --  12  Every byte of the last parameter is searched and removed
 12
ooRexx> STRIP("12.øø" || "C3"x || "øø", "T", '.ø')=                    -- Invalid UTF-8 string (raised by utf8proc)
Invalid UTF-8 string.
Error code= 22.900
ooRexx> STRIP(("12.øø" || "C3"x || "øø")~transcodeTo("ISO-8859-1", replacementCharacter:"#"), "T", '.ø'~transcodeTo("ISO-8859-1"))=   -- T'12.??#'
T'12.��#'
ooRexx> SUBSTR("abc",2,6,".")=                          -- 'bc....'
'bc....'
ooRexx> SUBSTR("abc",2,6,"🤶")=                         -- T'bc🤶🤶🤶🤶'
T'bc🤶🤶🤶🤶'
ooRexx> SUBWORD("Now is   the time",2,2)=               -- 'is   the'
'is   the'
ooRexx> SUBWORD("Now is   the 🕑",2,2)=                 -- Object "Now is   the 🕑" does not understand message "SUBWORD"
Object "Now is   the 🕑" does not understand message "SUBWORD".
Error code= 97.1
ooRexx> UPPER("père noël")=                             -- T'PÈRE NOËL'
T'PÈRE NOËL'
ooRexx> VERIFY("ABCDEF","ABC","N",2,3)=                 --  4
 4
ooRexx> VERIFY("ABCDEF","ABC","N"~text,2,3)=            -- Object "ABCDEF" does not understand message "VERIFY" (yes! ANY parameter is tested, including the option)
Object "ABCDEF" does not understand message "VERIFY".
Error code= 97.1
ooRexx> WORD("Now is the time",3)=                      -- 'the'
'the'
ooRexx> WORD("Now is the 🕑",3)=                        -- Object "Now is the 🕑" does not understand message "WORD"
Object "Now is the 🕑" does not understand message "WORD".
Error code= 97.1
ooRexx> WORDINDEX("Now is the time",3)=                 --  8
 8
ooRexx> WORDINDEX("Now is the 🕑",3)=                   -- Object "Now is the 🕑" does not understand message "WORDINDEX"
Object "Now is the 🕑" does not understand message "WORDINDEX".
Error code= 97.1
ooRexx> WORDLENGTH("Now is the time",4)=                --  4
 4
ooRexx> WORDLENGTH("Now is the 🕑",4)=                  -- Object "Now is the 🕑" does not understand message "WORDLENGTH"
Object "Now is the 🕑" does not understand message "WORDLENGTH".
Error code= 97.1
ooRexx> WORDPOS("the","Now is the time")=               --  3
 3
ooRexx> WORDPOS("the","Now is the 🕑")=                 -- Object "Now is the 🕑" does not understand message "WORDPOS"
Object "Now is the 🕑" does not understand message "WORDPOS".
Error code= 97.1
ooRexx> WORDS("Now is the time")=                       --  4
 4
ooRexx> WORDS("Now is the 🕑")=                         -- Object "Now is the 🕑" does not understand message "WORDS"
Object "Now is the 🕑" does not understand message "WORDS".
Error code= 97.1
ooRexx> X2C(41)=                                        -- 'A'
'A'
ooRexx> X2C(41~text)=                                   -- T'A'
T'A'


/*
Still not sure:
When the target is a String, should the BIFs d2c and x2c return a RexxText when
the result is not-ASCII and the evaluation context encoding is not Byte?
That would be consistent with the rules for string literal (R1, R2).
Currently, assuming the package encoding is UTF-8:
"FF"x is a RexxText but x2c("FF") is a String.
And what about "FF"~x2c? currently it's a String.
Examples:
*/
ooRexx> "FF"x=;result~description=                      -- T'[FF]'      'UTF-8 not-ASCII (1 character, 1 codepoint, 1 byte, 1 error)'
T'[FF]'
'UTF-8 not-ASCII (1 character, 1 codepoint, 1 byte, 1 error)'
ooRexx> x2c("FF")=;result~description=                  -- '[FF]'       'UTF-8 not-ASCII by default (1 byte)'
'[FF]'
'UTF-8 not-ASCII by default (1 byte)'
ooRexx> "FF"~x2c=;result~description=                   -- '[FF]'       'UTF-8 not-ASCII by default (1 byte)'
'[FF]'
'UTF-8 not-ASCII by default (1 byte)'
ooRexx> "FF"~text~x2c=;result~description=              -- T'[FF]'      'UTF-8 not-ASCII (1 character, 1 codepoint, 1 byte, 1 error)'
T'[FF]'
'UTF-8 not-ASCII (1 character, 1 codepoint, 1 byte, 1 error)'
ooRexx> "FF"~text("cp1252")~x2c=;result~description=    -- T'[FF]'      'windows-1252 not-ASCII (1 character, 1 codepoint, 1 byte, 0 error)'
T'[FF]'
'windows-1252 not-ASCII (1 character, 1 codepoint, 1 byte, 0 error)'
---
ooRexx> "41"x=;result~description=                      -- 'A'          'UTF-8 ASCII (1 byte)'
'A'
'UTF-8 ASCII (1 byte)'
ooRexx> x2c("41")=;result~description=                  -- 'A'          'UTF-8 ASCII by default (1 byte)'
'A'
'UTF-8 ASCII by default (1 byte)'
ooRexx> "41"~x2c=;result~description=                   -- 'A'          'UTF-8 ASCII by default (1 byte)'
'A'
'UTF-8 ASCII by default (1 byte)'
ooRexx> "41"~text~x2c=;result~description=              -- T'A'         'UTF-8 ASCII (1 character, 1 codepoint, 1 byte, 0 error)'
T'A'
'UTF-8 ASCII (1 character, 1 codepoint, 1 byte, 0 error)'
ooRexx> "41"~text("cp1252")~x2c=;result~description=    -- T'A'         'windows-1252 ASCII (1 character, 1 codepoint, 1 byte, 0 error)'
T'A'
'windows-1252 ASCII (1 character, 1 codepoint, 1 byte, 0 error)'


-- ===============================================================================
-- 2024 Apr 03

/*
No longer apply the rule R3 during the automatic conversion of String literals
to RexxText instances. If the package encoding is not a byte encoding then any
not-ASCII String literal is converted to a RexxText, whatever its encoding.
Reason: inconsistency between
    "noel" "FF"x~~setEncoding("cp1252")=        -- 'noel [FF]' because concatenation of 2 String instances
    "noël" "FF"x~~setEncoding("cp1252")=        -- Encoding: cannot append... because concatenation of a RexxText with a String
Now:
*/
ooRexx>     "FF"x=                                      -- T'[FF]'  (was a String thanks to R3)
T'[FF]'
ooRexx>     "noel" "FF"x~~setEncoding("cp1252")=        -- Encoding: cannot append windows-1252 not-ASCII '[FF]' to UTF-8 ASCII 'noel'   (was 'noel [FF]')
Encoding: cannot append windows-1252 not-ASCII '[FF]' to UTF-8 ASCII 'noel'.
Error code= 23.900
ooRexx>                                                 -- Note: no longer "by default" in "UTF-8 ASCII 'noel'" because the string literal now has a stored encoding
/*
Unchanged:
*/
ooRexx>     "noël" "FF"x=                               -- T'noël [FF]'     no error because the Byte_Encoding is always absorbed
T'noël [FF]'
ooRexx>     "FF"x "noël"=                               -- T'[FF] noël'     idem
T'[FF] noël'
ooRexx>     "noël" "FF"x~~setEncoding("cp1252")=        -- Encoding: cannot append windows-1252 not-ASCII '[FF]' to UTF-8 not-ASCII 'noël'
Encoding: cannot append windows-1252 not-ASCII '[FF]' to UTF-8 not-ASCII 'noël'.
Error code= 23.900
ooRexx>     "FF"x~~setEncoding("cp1252") "noël"=        -- Encoding: cannot append UTF-8 not-ASCII 'noël' to windows-1252 not-ASCII '[FF]'
Encoding: cannot append UTF-8 not-ASCII 'noël' to windows-1252 not-ASCII '[FF]'.
Error code= 23.900


-- ===============================================================================
-- 2024 Apr 01

/*
A package has an encoding:
.Package
    encoding
    encoding=
    hasEncoding
Rules for the calculation of a package default encoding:
Case 1: package not requesting "text.cls", directly or indirectly.
    Most of the legacy packages don't support an automatic conversion to text.
    The package's default encoding is Byte (not .Encoding~defaultEncoding).
Case 2: package requesting "text.cls", directly or indirectly.
    We assume that the requester supports an automatic conversion to text.
    The package's default encoding is .Encoding~defaultEncoding.
*/


/*
New method setEncoding on String, MutableBuffer, Package and RexxText, to change
the current encoding and return the previous encoding.
The bytes are not impacted, it's just an update of the encoding annotation.
Example, assuming the default encoding is UTF-8:
*/
ooRexx> "Noel"~setEncoding("windows-1252")=     -- (The UTF8_Encoding class)    (previous encoding)
(The UTF8_Encoding class)
ooRexx> "Noël"~setEncoding("byte")=             -- (The UTF8_Encoding class)    (previous encoding)
(The UTF8_Encoding class)
/*
Example, when the default encoding is Byte:
*/
ooRexx> oldEncoding = .encoding~setDefaultEncoding("byte")
ooRexx> "Noel"~setEncoding("windows-1252")=     -- (The Byte_Encoding class)    (previous encoding)
(The Byte_Encoding class)
ooRexx> "Noël"~setEncoding("byte")=             -- (The Byte_Encoding class)    (previous encoding)
(The Byte_Encoding class)
ooRexx> .encoding~setDefaultEncoding(oldEncoding)


/*
New methods on the class Encoding, to change the current encoding and return
the previous encoding:
    setDefaultEncoding
    setDefaultInputEncoding
    setDefaultOutputEncoding
*/


/*
Relax the constraints for the Byte_Encoding in the methods compatibleEncoding
and asEncodingFor: The Byte_Encoding can always be absorbed.
Reason: The Byte_Encoding is often used for diagnostic or repair.
Examples:
*/
ooRexx> "Père"~c2g=                             -- '50 C3A8 72 65'
'50 C3A8 72 65'
ooRexx> "Père"~text~startsWith("50C3"x~byte)=   -- false (not aligned) (was Encoding: cannot compare Byte not-ASCII 'P\C3' with UTF-8 not-ASCII 'Père')
 0
ooRexx> "Père"~text~startsWith("50C3A8"x~byte)= -- true (was Encoding: cannot compare Byte not-ASCII 'Pè' with UTF-8 not-ASCII 'Père')
 1


/*
+-------------------------------------------+
|          1st important milestone          |
| Activation of the automatic conversion    |
| of String literals to RexxText instances  |
+-------------------------------------------+
This is managed in RexxString::evaluate
Rules:
if string~isASCII then value = string                               -- R1 don't convert to RexxText if the string literal is ASCII (here, NO test of encoding, just testing the bytes)
else if .context~package~encoding~isByte then value = string        -- R2 don't convert to RexxText if the encoding of its definition package is the Byte_Encoding or a subclass of it (legacy package).
-- else if string~isCompatibleWithByteString then value = string    -- R3 (no longer applied) don't convert to RexxText if the string literal is compatible with a Byte string.
else value = string~text                                            -- R4 convert to RexxText
Examples, assuming the package encoding is UTF-8:
*/
ooRexx> "Noel"~class=                                       -- (The String class)       R1
(The String class)

ooRexx> oldEncoding = .context~package~setEncoding("byte")
ooRexx> "Noël"~class=                                       -- (The String class)       R2
(The String class)
ooRexx> .context~package~setEncoding(oldEncoding)

-- The rule R3 is no longer applied
-- The only way to test it is to use an hexadecimal (or binary) string literal.
-- [later] The hexadecimal string literals are no longer Byte encoded, so this test is no longer a good test
ooRexx> "Noël"~c2x=                                         -- '4E 6F C3AB 6C'
'4E 6F C3AB 6C'
ooRexx> '4E 6F C3AB 6C'x~encoding=                          -- (The UTF8_Encoding class) (was (The Byte_Encoding class) so R3 could apply, but we no longer apply it)
(The UTF8_Encoding class)
ooRexx> '4E 6F C3AB 6C'x~class=                             -- (The RexxText class)     R4
(The RexxText class)

ooRexx> "Noël"~class=                                       -- (The RexxText class)     R4
(The RexxText class)
ooRexx> "Noël"~string~class=                                -- (The String class)       R4 The string literal is a RexxText, the method ~string returns a String with encoding UTF-8
(The String class)
ooRexx> "Noël"~~setEncoding("byte")~class=                  -- (The RexxText class)     R4 The string literal is a RexxText, its encoding is changed from UTF-8 to Byte
(The RexxText class)
ooRexx> "Noël"~~setEncoding("byte")~string~class=           -- (The String class)       R4 The string literal is a RexxText, its encoding is changed from UTF-8 to Byte, the method ~string returns a String with encoding Byte
(The String class)


/*
Deactivate (again) the constraint "self~isCompatibleWithByteString" when converting
a RexxText to a String (.Unicode~unckeckedConversionToString = .true).
Reason: after activation of the automatic conversion to RexxText, I get these
errors if I keep the constraint "self~isCompatibleWithByteString".
    say "Noël"              -- raise an error "UTF-8 not-ASCII 'Noël' cannot be converted to a String instance"
    xrange("00"x,"ff"x)     -- raise an error "UTF-8 not-ASCII '[FF]' cannot be converted to a String instance"
The constraint "self~isCompatibleWithByteString" was put in place to detect when
a RexxText instance is "lost" during conversion to string. Now that we have a
common interface on String and RexxText, plus an automatic conversion to RexxText,
this "loss" should occur less often, but it still occurs.
Example, assuming the default encoding and the package encoding are UTF-8:
*/
ooRexx> "Noël"~length=          -- 4
 4
ooRexx> "Noël"~text~length=     -- 4
 4
ooRexx> "Noël"~string~length=   -- 5
 5
ooRexx> length("Noël")=         -- 4    (was 5, should be 4    (with the constraint, would raise UTF-8 not-ASCII 'Noël' cannot be converted to a String instance))
 4
ooRexx> length("Noël"~string)=  -- 5
 5


/*
----------
ABANDONED
(incompatible with the decision to assign the encoding of the definition package
to the string literals)
----------
The strings created by D2C, X2C are declared Byte encoded.
It's because it's not unusual to create ill-formed encoded strings with these BIF/BIM.
The Byte_Encoding is a raw encoding with few constraints, BUT it's impossible
to transcode from/to it without errors if the string contains not-ASCII characters.
That's why, often, a more specialized byte encoding is applied on the byte string,
to interpret the bytes differently.
Implementation notes:
    D2C: RexxNumberString::d2xD2c calls StringUtil::packHex
    X2C: StringUtil::packHex
Examples:
*/
ooRexx> "é"~encoding=                                   -- (The UTF8_Encoding class)
(The UTF8_Encoding class)

-- D2C
ooRexx> "é"~c2d=                                       -- 50089
 50089
ooRexx> d2c(50089)=                                     -- 'é'
'é'
ooRexx> 50089~d2c=                                      -- 'é'
'é'
ooRexx> d2c(50089)~encoding=                            -- (The UTF8_Encoding class)    (was (The Byte_Encoding class))
(The UTF8_Encoding class)
ooRexx> 50089~d2c~encoding=                             -- (The UTF8_Encoding class)    (was (The Byte_Encoding class))
(The UTF8_Encoding class)

-- X2C
ooRexx> "é"~c2x=                                        -- 'C3A9'
'C3A9'
ooRexx> x2c("C3A9")=                                    -- 'é'
'é'
ooRexx> "C3A9"~x2c=                                     -- 'é'
'é'
ooRexx> x2c("C3A9")~encoding=                           -- (The UTF8_Encoding class)    (was (The Byte_Encoding class))
(The UTF8_Encoding class)
ooRexx> "C3A9"~x2c~encoding=                            -- (The UTF8_Encoding class)    (was (The Byte_Encoding class))
(The UTF8_Encoding class)

-- Valid Byte string, but invalid UTF-8 string
ooRexx> "C3"~x2c~class=                                 -- (The String class)
(The String class)
ooRexx> "C3"~x2c~encoding=                              -- (The UTF8_Encoding class)    (was (The Byte_Encoding class))
(The UTF8_Encoding class)
-- Apply an UTF-8 view through the String interface
ooRexx> "C3"~x2c~~setEncoding("utf8")~description=      -- 'UTF-8 not-ASCII (1 byte)'
'UTF-8 not-ASCII (1 byte)'
ooRexx> "C3"~x2c~~setEncoding("utf8")~errors=           -- 'UTF-8 encoding: byte sequence at byte-position 1 is truncated, expected 2 bytes.'
['UTF-8 encoding: byte sequence at byte-position 1 is truncated, expected 2 bytes.']
-- Apply an UTF-8 view through the RexxText interface
ooRexx> "C3"~x2c~text("utf8")~description=              -- 'UTF-8 not-ASCII (1 character, 1 codepoint, 1 byte, 1 error)'
'UTF-8 not-ASCII (1 character, 1 codepoint, 1 byte, 1 error)'
ooRexx> "C3"~x2c~text("utf8")~errors=                   -- 'UTF-8 encoding: byte sequence at byte-position 1 is truncated, expected 2 bytes.'
['UTF-8 encoding: byte sequence at byte-position 1 is truncated, expected 2 bytes.']


/*
----------
ABANDONED
(incompatible with the decision to assign the encoding of the definition package
to the string literals)
----------
The hexadecimal and binary strings are declared Byte encoded, for the same reasons
as D2C, X2C.
Implementation notes:
    RexxSource::packLiteral (Scanner.cpp)
Examples:
*/
-- The encoding of a string literal is the encoding of its definition package.
ooRexx> "é"~encoding=                                   -- (The UTF8_Encoding class)
(The UTF8_Encoding class)

-- The encoding of an hexadecimal string is the Byte encoding.
ooRexx> "é"~c2x=                                        -- 'C3A9'
'C3A9'
ooRexx> "C3A9"x=                                        -- T'é'
T'é'
ooRexx> "C3A9"x~encoding=                               -- (The UTF8_Encoding class)    (was (The Byte_Encoding class))
(The UTF8_Encoding class)

-- The encoding of a binary string is the Byte encoding.
ooRexx> "é"~c2x~x2b=                                    -- 1100001110101001
 1100001110101001
ooRexx> "11000011 10101001"b=                           -- T'é'
T'é'
ooRexx> "11000011 10101001"b~encoding=                  -- (The UTF8_Encoding class)    (was (The Byte_Encoding class))
(The UTF8_Encoding class)


/*
Implementation of Strip:
*/
ooRexx> "Noël"~strip=                       -- T'Noël'
T'Noël'
ooRexx> "\tNoël "~unescape~strip=           -- T'Noël'
T'Noël'
ooRexx> "Noël"~strip("b", "ë")=             -- T'Noël'
T'Noël'
ooRexx> "Noë"~strip("b", "ë")=              -- T'No'
T'No'
ooRexx> "🤶Noël🎅"~strip("b", "lë🎅🤶")=  -- T'No'
T'No'
ooRexx> "\u{NBSP}\u{EN SPACE}\u{EM SPACE}\u{HAIR SPACE}\u{FIGURE SPACE}\u{THIN SPACE}"~unescape~strip=          -- T'      '
T'      '
ooRexx> "\u{NBSP}\u{EN SPACE}\u{EM SPACE}\u{HAIR SPACE}\u{FIGURE SPACE}\u{THIN SPACE}"~unescape~strip(lump:)=   -- T''
T''


/*
New methods on String for compatibility with RexxText (inherit StringRexxTextInterface).
Most of these methods forward to string~text.
*/
ooRexx> "a"~errors=                                 -- (The NIL object)
(The NIL object)
ooRexx> "a"~isCompatibleWithASCII=                  -- 1
 1
ooRexx> "a"~isCompatibleWithByteString=             -- 1
 1
ooRexx> "a"~isUpper=                                -- 0
 0
ooRexx> "A"~isUpper=                                -- 1
 1
ooRexx> "a"~isLower=                                -- 1
 1
ooRexx> "A"~isLower=                                -- 0
 0
ooRexx> "a"~codepoints=                             -- (a CodePointSupplier)
(a CodePointSupplier)
ooRexx> "a"~maximumCodepoint=                       -- 97
 97
ooRexx> "a"~maximumUnicodeCodepoint=                -- 97
 97
ooRexx> "a"~UnicodeCharacters=                      -- [( "a"   U+0061 Ll 1 "LATIN SMALL LETTER A" )]
[( "a"   U+0061 Ll 1 "LATIN SMALL LETTER A" )]
ooRexx> "a"~characters=                             -- ['a']
['a']
ooRexx> "a"~character(1)=                           -- 'a'
'a'
ooRexx> buffer = .MutableBuffer~new; "a"~character(1, :buffer)=     -- M'a'
M'a'
ooRexx> "a"~transcodeTo("utf16")=                   -- T'[00]a'
T'[00]a'
ooRexx> "a"~utf8=                                   -- T'a'
T'a'
ooRexx> "a"~wtf8=                                   -- T'a'
T'a'
ooRexx> "a"~utf16=                                  -- T'[00]a'
T'[00]a'
ooRexx> "a"~utf16be=                                -- T'[00]a'
T'[00]a'
ooRexx> "a"~utf16le=                                -- T'a[00]'
T'a[00]'
ooRexx> "a"~wtf16=                                  -- T'[00]a'
T'[00]a'
ooRexx> "a"~wtf16be=                                -- T'[00]a'
T'[00]a'
ooRexx> "a"~wtf16le=                                -- T'a[00]'
T'a[00]'
ooRexx> "a"~utf32=                                  -- T'[000000]a'
T'[000000]a'
ooRexx> "a"~utf32be=                                -- T'[000000]a'
T'[000000]a'
ooRexx> "a"~utf32le=                                -- T'a[000000]'
T'a[000000]'
ooRexx> "a"~unicode~c2x=                            -- 61
 61
ooRexx> "a"~unicodeN~c2x=                           -- 61
 61
ooRexx> "a"~unicode8~c2x=                           -- 61
 61
ooRexx> "a"~unicode16~c2x=                          -- 6100
 6100
ooRexx> "a"~unicode32~c2x=                          -- 61000000
 61000000
ooRexx> "a"~c2u=                                    -- 'U+0061'
'U+0061'
ooRexx> 'U+0061'~u2c=                               -- T'a[000000]'
T'a[000000]'
ooRexx> 'U+0061'~u2c~c2x=                           -- 61000000
 61000000
ooRexx> 'U+0061'~u2c~utf8=                          -- T'a'
T'a'
ooRexx> "ab"~c2g=                                   -- '61 62'
'61 62'
ooRexx> "z"~checkHexadecimalValueCompatibility=     -- [no result] (good, no error raised)
[no result]
ooRexx> "z"~checkNumericValueCompatibility=         -- [no result] (good, no error raised)
[no result]
ooRexx> "z"~checkLogicalValueCompatibility=         -- [no result] (good, no error raised)
[no result]
ooRexx> "\u{FLAG IN HOLE}"~unescape=                -- T'⛳'
T'⛳'
ooRexx> "a"~transform=                              -- T'a'
T'a'
ooRexx> "a"~transformer=                            -- (a RexxTextTransformer)
(a RexxTextTransformer)
ooRexx> "abc def"~title=                            -- T'Abc Def'
T'Abc Def'
ooRexx> "a"~isNFC=                                  -- 1
 1
ooRexx> "a"~NFC=                                    -- T'a'
T'a'
ooRexx> "a"~isNFD=                                  -- 1
 1
ooRexx> "a"~NFD=                                    -- T'a'
T'a'
ooRexx> "a"~isNFKC=                                 -- 1
 1
ooRexx> "a"~NFKC=                                   -- T'a'
T'a'
ooRexx> "a"~isNFKD=                                 -- 1
 1
ooRexx> "a"~NFKD=                                   -- T'a'
T'a'
ooRexx> "a"~isCasefold=                             -- -1
-1
ooRexx> "A"~isCasefold=                             -- -1
-1
ooRexx> "a"~transform(casefold:)~isCasefold=        -- 1
 1
ooRexx> "A"~transform(casefold:)~isCasefold=        -- 1
 1
ooRexx> "a"~casefold=                               -- T'a'
T'a'
ooRexx> "A"~casefold=                               -- T'a'
T'a'
ooRexx> "a"~isMarkStripped=                         -- -1
-1
ooRexx> "a"~transform(stripMark:)~isMarkStripped=   -- 1
 1
ooRexx> "a"~isIgnorableStripped=                    -- -1
-1
ooRexx> "a"~transform(stripIgnorable:)~isIgnorableStripped=     -- 1
 1
ooRexx> "a"~isCCStripped=                           -- -1
-1
ooRexx> "a"~transform(stripCC:)~isCCStripped=       -- 1
 1
ooRexx> "a"~isNAStripped=                           -- -1
-1
ooRexx> "a"~transform(stripNA:)~isNAStripped=       -- 1
 1
ooRexx> "ab"~graphemes=                             -- ['a','b']
['a','b']
ooRexx> "ab"~grapheme(1)=                           -- 'a'
'a'


/*
Implementation of the abstract method 'transform' for Byte_Encoding and its subclasses.
Parameters:
    normalization = 0           Ignored, there is no normalization for byte strings.
    casefold = .false           if .true then apply ~lower
    lump= .false                Ignored
    stripMark = .false          if .true then replace the accented letters by their base letter
    stripIgnorable= .false      Ignored
    stripCC = .false            if .true then remove the codepoints < 20x
    stripNA = .false            if .true then remove the unassigned codepoints
Examples:
*/
-- casefold
ooRexx> "Père Noël"~transcodeTo("windows-1252")=                                                     -- T'P?re No?l'
T'P�re No�l'
ooRexx> "Père Noël"~transcodeTo("windows-1252")~c2x=                                                 -- '50 E8 72 65 20 4E 6F EB 6C'
'50 E8 72 65 20 4E 6F EB 6C'
ooRexx> '50 E8 72 65 20 4E 6F EB 6C'x~byte~transform(casefold:)=                                     -- T'p?re no?l'
T'p�re no�l'
ooRexx> '50 E8 72 65 20 4E 6F EB 6C'x~byte~transform(casefold:)~encoding=                            -- (The Byte_Encoding class)
(The Byte_Encoding class)
ooRexx> '50 E8 72 65 20 4E 6F EB 6C'x~byte~transform(casefold:)~utf8=                                -- Cannot convert Byte not-ASCII character 232 (E8) at byte-position 2 to UTF-8
Cannot convert Byte not-ASCII character 232 (E8) at byte-position 2 to UTF-8.
Error code= 23.900
ooRexx> '50 E8 72 65 20 4E 6F EB 6C'x~byte~transform(casefold:)~~setEncoding("windows-1252")~utf8=   -- T'père noël'
T'père noël'

-- stripMark depends on the encoding
ooRexx> "80 81 82 83 84 85 86 87 88 89 8A 8B 8C 8D 8E 8F 90 93 94 95 96 97 98 99 9A 9F A0 A1 A2 A3 A4 A5"x~text("ibm-437")~utf8=                           -- T'ÇüéâäàåçêëèïîìÄÅÉôöòûùÿÖ܃áíóúñÑ'
T'ÇüéâäàåçêëèïîìÄÅÉôöòûùÿÖ܃áíóúñÑ'
ooRexx> "80 81 82 83 84 85 86 87 88 89 8A 8B 8C 8D 8E 8F 90 93 94 95 96 97 98 99 9A 9F A0 A1 A2 A3 A4 A5"x~text("ibm-437")~transform(stripMark:)~utf8=     -- T'CueaaaaceeeiiiAAEooouuyOUfaiounN'
T'CueaaaaceeeiiiAAEooouuyOUfaiounN'
ooRexx> "83 8A 9A 9F C0 C1 C2 C3 C4 C5 C7 C8 C9 CA CB CC CD CE CF D1 D2 D3 D4 D5 D6 D8 D9 DA DB DC DD E0 E1 E2 E3 E4 E5 E7 E8 E9 EA EB EC ED EE EF F1 F2 F3 F4 F5 F6 F8 F9 FA FB FC FD FF"x~text("ibm-1252")~utf8=                        -- T'ƒŠšŸÀÁÂÃÄÅÇÈÉÊËÌÍÎÏÑÒÓÔÕÖØÙÚÛÜÝàáâãäåçèéêëìíîïñòóôõöøùúûüýÿ'
T'ƒŠšŸÀÁÂÃÄÅÇÈÉÊËÌÍÎÏÑÒÓÔÕÖØÙÚÛÜÝàáâãäåçèéêëìíîïñòóôõöøùúûüýÿ'
ooRexx> "83 8A 9A 9F C0 C1 C2 C3 C4 C5 C7 C8 C9 CA CB CC CD CE CF D1 D2 D3 D4 D5 D6 D8 D9 DA DB DC DD E0 E1 E2 E3 E4 E5 E7 E8 E9 EA EB EC ED EE EF F1 F2 F3 F4 F5 F6 F8 F9 FA FB FC FD FF"x~text("ibm-1252")~transform(stripMark:)~utf8=  -- T'fSsYAAAAAACEEEEIIIINOOOOOOUUUUYaaaaaaceeeeiiiinoooooouuuuyy'
T'fSsYAAAAAACEEEEIIIINOOOOOOUUUUYaaaaaaceeeeiiiinoooooouuuuyy'
ooRexx> "C0 C1 C2 C3 C4 C5 C7 C8 C9 CA CB CC CD CE CF D1 D2 D3 D4 D5 D6 D8 D9 DA DB DC DD E0 E1 E2 E3 E4 E5 E7 E8 E9 EA EB EC ED EE EF F1 F2 F3 F4 F5 F6 F8 F9 FA FB FC FD FF"x~text("iso-8859-1")~utf8=                            -- T'ÀÁÂÃÄÅÇÈÉÊËÌÍÎÏÑÒÓÔÕÖØÙÚÛÜÝàáâãäåçèéêëìíîïñòóôõöøùúûüýÿ'
T'ÀÁÂÃÄÅÇÈÉÊËÌÍÎÏÑÒÓÔÕÖØÙÚÛÜÝàáâãäåçèéêëìíîïñòóôõöøùúûüýÿ'
ooRexx> "C0 C1 C2 C3 C4 C5 C7 C8 C9 CA CB CC CD CE CF D1 D2 D3 D4 D5 D6 D8 D9 DA DB DC DD E0 E1 E2 E3 E4 E5 E7 E8 E9 EA EB EC ED EE EF F1 F2 F3 F4 F5 F6 F8 F9 FA FB FC FD FF"x~text("iso-8859-1")~transform(stripMark:)~utf8=      -- T'AAAAAACEEEEIIIINOOOOOOUUUUYaaaaaaceeeeiiiinoooooouuuuyy'
T'AAAAAACEEEEIIIINOOOOOOUUUUYaaaaaaceeeeiiiinoooooouuuuyy'
ooRexx> "83 8A 8E 9A 9E 9F C0 C1 C2 C3 C4 C5 C7 C8 C9 CA CB CC CD CE CF D1 D2 D3 D4 D5 D6 D8 D9 DA DB DC DD E0 E1 E2 E3 E4 E5 E7 E8 E9 EA EB EC ED EE EF F1 F2 F3 F4 F5 F6 F8 F9 FA FB FC FD FF"x~text("windows-1252")~utf8=                            -- T'ƒŠŽšžŸÀÁÂÃÄÅÇÈÉÊËÌÍÎÏÑÒÓÔÕÖØÙÚÛÜÝàáâãäåçèéêëìíîïñòóôõöøùúûüýÿ'
T'ƒŠŽšžŸÀÁÂÃÄÅÇÈÉÊËÌÍÎÏÑÒÓÔÕÖØÙÚÛÜÝàáâãäåçèéêëìíîïñòóôõöøùúûüýÿ'
ooRexx> "83 8A 8E 9A 9E 9F C0 C1 C2 C3 C4 C5 C7 C8 C9 CA CB CC CD CE CF D1 D2 D3 D4 D5 D6 D8 D9 DA DB DC DD E0 E1 E2 E3 E4 E5 E7 E8 E9 EA EB EC ED EE EF F1 F2 F3 F4 F5 F6 F8 F9 FA FB FC FD FF"x~text("windows-1252")~transform(stripMark:)~utf8=      -- T'fSZszYAAAAAACEEEEIIIINOOOOOOUUUUYaaaaaaceeeeiiiinoooooouuuuyy'
T'fSZszYAAAAAACEEEEIIIINOOOOOOUUUUYaaaaaaceeeeiiiinoooooouuuuyy'

-- several transformations
ooRexx> "Père Noël"~transcodeTo("windows-1252")~transform(casefold:, stripMark:)~utf8=                      -- T'pere noel'
T'pere noel'
ooRexx> '50 E8 72 65 20 4E 6F EB 6C'x~~setEncoding("windows-1252")~transform(casefold:, stripMark:)~utf8=   -- T'pere noel'
T'pere noel'
-- next: the transform is done on Byte string, which has no rule for stripMark.
-- the accents are not removed.
ooRexx> '50 E8 72 65 20 4E 6F EB 6C'x~byte~transform(casefold:, stripMark:)~~setEncoding("windows-1252")~utf8=   -- T'père noël'
T'père noël'


-- ===============================================================================
-- 2024 Mar 17

/*
For consistency with other methods, add the optional named argument 'buffer' to
    []
    c2g
    c2x
    x2b
    x2d
Examples:
*/
ooRexx> buffer = .MutableBuffer~new
ooRexx> "Tête à tête"~text[2, 5, :buffer]=  -- M'ête à'
M'ête à'
ooRexx> "A"~text~c2g(:buffer)=              -- M'ête à41'
M'ête à41'
ooRexx> "A"~text~c2x(:buffer)=              -- M'ête à4141'
M'ête à4141'
ooRexx> "41"~text~x2b(:buffer)=             -- M'ête à414101000001'
M'ête à414101000001'
ooRexx> "41"~text~x2d(:buffer)=             -- M'ête à41410100000165'
M'ête à41410100000165'


/*
For compatibility with Python, add support for \N{Unicode name}.
Example:
*/
ooRexx> "\N{for all} x \N{there exists} y such that x+y=0"~text~unescape=       -- T'∀ x ∃ y such that x+y=0'
T'∀ x ∃ y such that x+y=0'


/*
Add support for code point labels.
Examples:
*/
ooRexx> .unicode~character("<control-000A>")=           -- ( ""    U+000A Cc 0 "", "LINE FEED", "NEW LINE", "END OF LINE", "LF", "NL", "EOL" )
( ""    U+000A Cc 0 "", "LINE FEED", "NEW LINE", "END OF LINE", "LF", "NL", "EOL" )
ooRexx> "hello\N{<control-000A>}bye"~text~unescape=     -- T'hello[0A]bye'
T'hello[0A]bye'
ooRexx> "hello\U{<control-000A>}bye"~text~unescape=     -- T'hello[0A]bye'
T'hello[0A]bye'


/*
Modify the display of UnicodeCharacter properties to show the codepoint values
in U+ and 0x notation.
*/
ooRexx> .Unicode["🤶"]~properties=
a Directory (30 items)
'aliases'               : (an Array no shape, 0 items)
'bidiClass'             :  19
'bidiClassName'         : 'ON'
'bidiMirrored'          :  0
'boundClass'            :  19
'boundClassName'        : 'EXTENDED_PICTOGRAPHIC'
'category'              :  22
'categoryName'          : 'So'
'charWidth'             :  2
'codepoint'             : 'U+1F936'
'combiningClass'        :  0
'controlBoundary'       :  0
'decompositionType'     :  0
'decompositionTypeName' : 'None'
'ignorable'             :  0
'isLower'               :  0
'isUpper'               :  0
'name'                  : 'MOTHER CHRISTMAS'
'toLowerFull'           : 'U+1F936'
'toLowerSimple'         : 'U+1F936'
'toTitleFull'           : 'U+1F936'
'toTitleSimple'         : 'U+1F936'
'toUpperFull'           : 'U+1F936'
'toUpperSimple'         : 'U+1F936'
'Unicode'               : '0x36F90100'
'UTF16BE'               : '0xD83EDD36'
'UTF16LE'               : '0x3ED836DD'
'UTF32BE'               : '0x0001F936'
'UTF32LE'               : '0x36F90100'
'UTF8'                  : '0xF09FA4B6'


/*
Modification of the rule for buffer encoding neutrality.
    old: If left is a        buffer with no encoding then use the right encoding.
    new: If left is an empty buffer with no encoding then use the right encoding.
Impacted methods:
    .Encoding~compatibleEncoding
    .StringIndexer~asEncodingFor
Examples:
*/
ooRexx> buffer = .MutableBuffer~new
-- This is an empty buffer with no explicit encoding:
-- The rule for encoding neutrality will apply.
ooRexx> buffer~description=                                                     -- 'UTF-8 ASCII by default (0 byte)'
'UTF-8 ASCII by default (0 byte)'
ooRexx> "Test"~text~utf16~left(2, :buffer)=                                     -- M'[00]T[00]e'
M'[00]T[00]e'
-- The buffer encoding is now UTF-16BE.
ooRexx> buffer~description=                                                     -- 'UTF-16BE (4 bytes)'
'UTF-16BE (4 bytes)'

ooRexx> buffer = .MutableBuffer~new("not empty")
ooRexx> buffer~description=                                                     -- 'UTF-8 ASCII (9 bytes)'
'UTF-8 ASCII (9 bytes)'
ooRexx>                                                                         -- Note: no longer "UTF-8 ASCII by default" because the string literal has now a stored encoding
-- Here, the rule for encoding neutrality does not apply.
ooRexx> "Test"~text~utf16~left(2, :buffer)=                                     -- Encoding: cannot append UTF-16BE to UTF-8 ASCII 'not empty'
Encoding: cannot append UTF-16BE to UTF-8 ASCII 'not empty'.
Error code= 23.900
ooRexx>                                                                         -- Note: no longer "UTF-8 ASCII by default" because the string literal has now a stored encoding


/*
New method ~u2c on String and RexxText.
Create a Unicode32 text from a sequence of U+xxxx.
The U+ string/text must be compatible with a byte encoding (Byte or subclass,
UTF-8 ASCII, WTF-8 ASCII).
In other words, will not support a sequence of U+xxxx encoded in UTF-16 or UTF-32.
Examples:
*/
-- U+ string
ooRexx> "U+004E u+006F U+00EB U+006C U+1F936 U+1F385"~u2c~description=          -- 'Unicode32 (6 characters, 6 codepoints, 24 bytes, 0 error)'
'Unicode32 (6 characters, 6 codepoints, 24 bytes, 0 error)'
ooRexx> "U+004E u+006F U+00EB U+006C U+1F936 U+1F385"~u2c~c2x=                  -- '4E000000 6F000000 EB000000 6C000000 36F90100 85F30100'
'4E000000 6F000000 EB000000 6C000000 36F90100 85F30100'
ooRexx> "U+004E u+006F U+00EB U+006C U+1F936 U+1F385"~u2c~utf8=                 -- T'Noël🤶🎅'
T'Noël🤶🎅'

-- U+ text
ooRexx> "U+004E u+006F U+00EB U+006C U+1F936 U+1F385"~text~u2c~description=     -- 'Unicode32 (6 characters, 6 codepoints, 24 bytes, 0 error)'
'Unicode32 (6 characters, 6 codepoints, 24 bytes, 0 error)'
ooRexx> "U+004E u+006F U+00EB U+006C U+1F936 U+1F385"~text~u2c~c2x=             -- '4E000000 6F000000 EB000000 6C000000 36F90100 85F30100'
'4E000000 6F000000 EB000000 6C000000 36F90100 85F30100'
ooRexx> "U+004E u+006F U+00EB U+006C U+1F936 U+1F385"~text~u2c~utf8=            -- T'Noël🤶🎅'
T'Noël🤶🎅'

ooRexx> buffer = .MutableBuffer~new
ooRexx> "U+0031 U+0032"~text~u2c(:buffer)=                                      -- M'1[000000]2[000000]'
M'1[000000]2[000000]'
-- The buffer encoding is now Unicode32.
ooRexx> buffer~description=                                                     -- 'Unicode32 (8 bytes)'
'Unicode32 (8 bytes)'

-- Examples of invalid U+ string/text
ooRexx> "U+004E u+006F U+00EB U+006C U+1F936 U+1F385"~text~utf16~u2c=           -- UTF-16BE '[00]U[00]+[00]0[00]0[00]4[00]E[00] ...' is not compatible with an U+ string.
UTF-16BE '[00]U[00]+[00]0[00]0[00]4[00]E[00] ...' is not compatible with an U+ string.
Error code= 23.900
ooRexx> "A+004E"~u2c=                                                           -- Expecting U+ or u+ followed by 4..6 hex digits, got 'A+004E'
Expecting U+ or u+ followed by 4..6 hex digits, got 'A+004E'.
Error code= 93.900
ooRexx> "u+4E"~u2c=                                                             -- Expecting U+ or u+ followed by 4..6 hex digits, got 'u+4E'
Expecting U+ or u+ followed by 4..6 hex digits, got 'u+4E'.
Error code= 93.900
ooRexx> "u+000004E"~u2c=                                                        -- Expecting U+ or u+ followed by 4..6 hex digits, got 'u+000004E'
Expecting U+ or u+ followed by 4..6 hex digits, got 'u+000004E'.
Error code= 93.900


/*
New supported methods on RexxText:
- d2c       forward to String, return a Text or a MutableBuffer
- d2x       forward to String, return a String or a MutableBuffer
Examples:
*/
ooRexx> "65"~text~d2c=              -- T'A'
T'A'
ooRexx> "65"~text~d2x=              -- 41
 41
ooRexx> buffer = .MutableBuffer~new
ooRexx> "65"~text~d2c(:buffer)=     -- M'A'
M'A'
ooRexx> "65"~text~d2x(:buffer)=     -- M'A41'
M'A41'
ooRexx> buffer~encoding = "utf16"
ooRexx> "65"~text~d2c(:buffer)=     -- Encoding: cannot append UTF-8 ASCII by default 'A' to UTF-16BE 'A41'   (was Byte ASCII 'A')
Encoding: cannot append UTF-8 ASCII by default 'A' to UTF-16BE 'A41'.
Error code= 23.900


/*
Partial implementation of translate (ASCII string only):
Examples:
*/
ooRexx> "hello"~text~translate=              -- 'HELLO'
'HELLO'
ooRexx> "hello"~text~translate(,,"x")=       -- 'xxxxx'
'xxxxx'
ooRexx> "hello"~text~translate(,"el","x")=   -- 'hxxxo'
'hxxxo'


-- ===============================================================================
-- 2023 Dec 04

/*
Reworked the implementation of caselessMatchChar, matchChar.
*/

ooRexx> "Bundesschnellstraße"~text~caselessMatchChar(18, "s")=           -- now 0: "ß" casefolded to "ss" doesn't match "s"
 0
ooRexx> "BAFFLE"~text~caselessMatchChar(5, "ffl")=                        -- now 0: "L" casefolded to "l" doesn't match "ffl" casefolded to "ffl" (no more iteration on each character of "ffl")
 0
ooRexx> "baffle"~text~matchChar(3, "f", normalization:.Unicode~NFKD)=     -- now 0: "ffl" transformed to "ffl" doesn't match "f"
 0

/*
After rework, I have these other differences:
*/

-- Case 1 sounds good (no more iteration on each character of "ffl")
ooRexx> "BAFFLE"~text~caselessMatchChar(3, "ffl")=        -- 0    was 1 "ffl" becomes "ffl" (3 graphemes), there is a match on "f" at 3
 0
ooRexx> "BAFFLE"~text~caselessPos("ffl", aslist:, aligned:0)=
a List (1 items)
 0 : [+3.3,+6.6]
/*
    a List (1 items)
     0 : [+3.3,+6.6]
*/
-- I get the same result as before by explicitely decomposing the ligature "ffl" to "ffl" BEFORE :
ooRexx> "BAFFLE"~text~caselessMatchChar(3, "ffl"~text~transform(normalization:.Unicode~NFKD))=    -- 1
 1
-- here, it's ok because the match is on several characters
ooRexx> "BAFFLE"~text~caselessMatch(3, "ffl")=            -- 1
 1


-- Case 2 sounds good (no more iteration on each character of "ffl")
ooRexx> "BAFFLE"~text~caselessMatchChar(5, "ffl")=        -- 0    was 1 "ffl" becomes "ffl" (3 graphemes), there is a match on "l" at 5
 0
ooRexx> "BAFFLE"~text~caselessMatch(5, "ffl")=            -- 0
 0


-- Case 3 sounds good (no more iteration on each character of "ffl")
ooRexx> "baffle"~text~caselessMatchChar(3, "F")=          -- 0    was 1 "ffl" at 3 (1 grapheme) becomes "ffl" (3 graphemes), there is a match on "f"
 0


-- Case 4 sound good (hum... did I really think that the character "ffl" at pos 3 can match an "l"?)
ooRexx> "baffle"~text~caselessMatchChar(3, "L")=          -- 0    was 1 "ffl" at 3 (1 grapheme) becomes "ffl" (3 graphemes), there is a match on "l"
 0


-- ===============================================================================
-- 2023 Nov 28

/*
https://github.com/unicode-org/icu4x/issues/4365
Segmenter does not work correctly in some languages
        let text = "as `নমস্কাৰ, আপোনাৰ কি খবৰ?`
    hi `हैलो, क्या हाल हैं?`
    mai `नमस्ते अहाँ केना छथि?`
    mr `नमस्कार, कसे आहात?`
    ne `नमस्ते, कस्तो हुनुहुन्छ?`
    or `ନମସ୍କାର ତୁମେ କେମିତି ଅଛ?`
    sa `हे त्वं किदं असि?`
    te `హాయ్, ఎలా ఉన్నారు?`";
icu4c: 151
rust: 161
---
ICU4X and ICU4C are just using different definitions of EGCs; ICU4C has had a
tailoring for years which has just been incorporated into Unicode 15.1, whereas
ICU4X implements the 15.0 version without that tailoring.
The difference is the handling of aksaras in some indic scripts:
in Unicode 15.1 (and in any recent ICU4C) क्या is one EGC, but it is two EGCs
(क्, या) in untailored Unicode 15.0 (and in ICU4X).
---
executor: 151
*/
ooRexx> s="as `নমস্কাৰ, আপোনাৰ কি খবৰ?`"'0D'x"hi `हैलो, क्या हाल हैं?`"'0D'x"mai `नमस्ते अहाँ केना छथि?`"'0D'x"mr `नमस्कार, कसे आहात?`"'0D'x"ne `नमस्ते, कस्तो हुनुहुन्छ?`"'0D'x"or `ନମସ୍କାର ତୁମେ କେମିତି ଅଛ?`"'0D'x"sa `हे त्वं किदं असि?`"'0D'x"te `హాయ్, ఎలా ఉన్నారు?`"
ooRexx> s~text~length=  -- 151
 151


/*
https://boyter.org/posts/unicode-support-what-does-that-actually-mean/
According wikipedia the character ſ is a long s. Which means if you want to
support unicode you need to ensure that if someone does a case insensitive
comparison then the following examples are all string equivalent.
ſecret == secret == Secret
ſatisfaction == satisfaction == ſatiſfaction == Satiſfaction == SatiSfaction === ſatiSfaction
*/
ooRexx> "ſ"~text~casefold=                                      -- T's'
T's'
ooRexx> "ſecret"~text~caselessEquals("secret")=                 -- 1
 1
ooRexx> "ſecret"~text~caselessEquals("Secret")=                 -- 1
 1
ooRexx> "ſatisfaction"~text~caselessEquals("satisfaction")=     -- 1
 1
ooRexx> "satisfaction"~text~caselessEquals("ſatiſfaction")=     -- 1
 1
ooRexx> "ſatiſfaction"~text~caselessEquals("Satiſfaction")=     -- 1
 1
ooRexx> "Satiſfaction"~text~caselessEquals("SatiSfaction")=     -- 1
 1
ooRexx> "SatiSfaction"~text~caselessEquals("ſatiSfaction")=     -- 1
 1


-- ===============================================================================
-- 2023 Nov 21

/*
To rework? matchChar sometimes returns .true whereas pos returns 0.
Examples in demoTextCompatibility:

KO? 2023.12.04: yes
*/
ooRexx> "Bundesschnellstraße"~text~caselessMatchChar(18, "s")=      -- now 0, was 1 before 2023.12.04
 0
ooRexx> "Bundesschnellstraße"~text~caselessPos("s", aslist:, aligned:0)=
a List (5 items)
 0 : [+6.6,+7.7]
 1 : [+7.7,+8.8]
 2 : [+14.14,+15.15]
 3 : [+18.18,-18.19]
 4 : [-18.19,+19.20]
/*
    a List (5 items)
     0 : [+6.6,+7.7]
     1 : [+7.7,+8.8]
     2 : [+14.14,+15.15]
     3 : [+18.18,-18.19]
     4 : [-18.19,+19.20]
*/

/*
KO? 2023.12.04: yes
*/
ooRexx> "BAFFLE"~text~caselessMatchChar(5, "ffl")=                    -- now 0, was 1 before 2023.12.04
 0
ooRexx> "BAFFLE"~text~caselessPos("ffl", aslist:, aligned:0)=
a List (1 items)
 0 : [+3.3,+6.6]
/*
    a List (1 items)
     0 : [+3.3,+6.6]
*/

/*
KO? 2023.12.04: yes
*/
ooRexx> "baffle"~text~matchChar(3, "f", normalization:.Unicode~NFKD)=     -- now 0, was 1 before 2023.12.04
 0
ooRexx> "baffle"~text~pos("f", normalization:.Unicode~NFKD, aslist:, aligned:0)=
a List (2 items)
 0 : [+3.3,-3.4]
 1 : [-3.4,-3.5]
/*
    a List (2 items)
     0 : [+3.3,-3.4]
     1 : [-3.4,-3.5]
*/


-- ===============================================================================
-- 2023 Nov 17

/*
Rework the implementation of caselessCompare, to get the right answer here:
*/
ooRexx> "sss"~text~caselessCompare("", "ß")=                --  3 (not  4 because the 3rd  's' matches only half of the casefolded pad "ß" which is "ss")
 3
ooRexx> "straßssßßssse"~text~caselessCompare("stra", "ß")=  -- 12 (not 13 because the last 's' matches only half of the casefolded pad "ß" which is "ss")
 12

/*
Analysis using Unicode scalars:

-----------------------------------------
CASE 1 : aligned in self, aligned in arg1
-----------------------------------------
*/

ooRexx> "straßssßßssse"~text~compare("stra", "ß")=          --  6
 6
/*
    "straßssßßssse"~text~unicode~c2g=
         1  2  3  4  5  6  7  8  9  0  1  2  3      -- (external character indexes)
         s  t  r  a  ß  s  s  ß  ß  s  s  s  e
         73 74 72 61 DF 73 73 DF DF 73 73 73 65     -- (unicode scalars)
    -------------------------------------------
    "straßßßßßßßßß"~text~unicode~c2g=
         1  2  3  4  5  6  7  8  9  0  1  2  3      -- (external character indexes)
         s  t  r  a  ß  ß  ß  ß  ß  ß  ß  ß  ß
         73 74 72 61 DF DF DF DF DF DF DF DF DF     -- (unicode scalars)
                        |
                        first different unicode scalar
*/

/*
Debug output: the indexer supports the named parameter debug
"straßssßßssse"~text~indexer~compare("stra", "ß", debug:.true)=
    selfTextTransformer~iSubtext~string = straßssßßssse
    selfTextTransformer~iSubtext~c2g = 73 74 72 61 C39F 73 73 C39F C39F 73 73 73 65
    selfTextTransformedString~length = 16
    textTextTransformer~iSubtext~string = straßßßßßßßßß
    textTextTransformer~iSubtext~c2g = 73 74 72 61 C39F C39F C39F C39F C39F C39F C39F C39F C39F
    textTextTransformedString~length = 22
    posB1 = 7
    posC1 = +6.7
    posB2 = 7
    posC2 = +6.7
     6
*/


/*
---------------------------------------------
CASE 2 : aligned in self, not aligned in arg1
---------------------------------------------
*/

ooRexx> "straßssßßssse"~text~caselessCompare("stra", "ß")=                              -- 12
 12
/*
    "straßssßßssse"~text~unicode~c2g=
         1  2  3  4  5     6  7  8     9     0  1  2  3                         -- (external character indexes)
         s  t  r  a  ß     s  s  ß     ß     s  s  s  e
         73 74 72 61 DF    73 73 DF    DF    73 73 73 65                        -- (unicode scalars)
    "straßssßßssse"~text~casefold~unicode~c2g=
         1  2  3  4  5  6  7  8  9  0  1  2  3  4  5  6                         -- (internal byte indexes)
         s  t  r  a  s  s  s  s  s  s  s  s  s  s  s  e
         73 74 72 61 73 73 73 73 73 73 73 73 73 73 73 65                        -- (unicode scalars)
    ----------------------------------------------------
    "straßßßßßßßßß"~text~unicode~c2g=
         1  2  3  4  5     6     7     8     9     0     1     2     3          -- (external character indexes)
         s  t  r  a  ß     ß     ß     ß     ß     ß     ß     ß     ß
         73 74 72 61 DF    DF    DF    DF    DF    DF    DF    DF    DF         -- (unicode scalars)
    "straßßßßßßßßß"~text~casefold~unicode~c2g=
         1  2  3  4  5  6  7  8  9  0  1  2  3  4  5  6  7  8  9  0  1  2       -- (internal byte indexes)
         s  t  r  a  ß     ß     ß     ß     ß     ß     ß     ß     ß
         73 74 72 61 73 73 73 73 73 73 73 73 73 73 73 73 73 73 73 73 73 73      -- (unicode scalars)
                                                   |  |
                                                   |  +-- 65 at (13,16) <> 73 at (-10,+16) but can't be 13 because would match only the first 73 of ß at (10,15)
                                                   +-- yes, 12.
*/

/*
Debug output: the indexer supports the named parameter debug
"straßssßßssse"~text~indexer~caselessCompare("stra", "ß", debug:.true)=
    selfTextTransformer~iSubtext~string = strassssssssssse
    selfTextTransformer~iSubtext~c2g = 73 74 72 61 73 73 73 73 73 73 73 73 73 73 73 65
    selfTextTransformedString~length = 16
    textTextTransformer~iSubtext~string = strassssssssssssssssss
    textTextTransformer~iSubtext~c2g = 73 74 72 61 73 73 73 73 73 73 73 73 73 73 73 73 73 73 73 73 73 73
    textTextTransformedString~length = 22
    posB1 = 16
    posC1 = +13.16
    posB2 = 16
    posC2 = -10.16
     12
*/

-- Another way to test: at which moment the growing padded string on the right will no longer be found at pos 1
--   1234567890123
ooRexx>     "straßssßßssse"~text~caselessPos("straß")=          -- 1
 1
--   straß
ooRexx>     "straßssßßssse"~text~caselessPos("straßß")=         -- 1
 1
--   straßß
ooRexx>     "straßssßßssse"~text~caselessPos("straßßß")=        -- 1
 1
--   straßß ß
ooRexx>     "straßssßßssse"~text~caselessPos("straßßßß")=       -- 1
 1
--   straßß ßß
ooRexx>     "straßssßßssse"~text~caselessPos("straßßßßß")=      -- 1
 1
--   straßß ßßß
ooRexx>     "straßssßßssse"~text~caselessPos("straßßßßßß")=     -- 0    The last ß doesn't match "se" at 12
 0
--   straßß ßßß ß


/*
---------------------------------------------
CASE 3 : not aligned in self, aligned in arg1
---------------------------------------------
*/

ooRexx> "stra"~text~caselessCompare("straßssßßssse", "ß")=  -- 9
 9
/*
    1  2  3  4  5     6     7     8     9     0     1     2     3               -- (external character indexes)
    s  t  r  a  ß     ß     ß     ß     ß     ß     ß     ß     ß
    1  2  3  4  5  6  7  8  9  0  1  2  3  4  5  6  7  8  9  0  1  2            -- (internal byte indexes)
    73 74 72 61 73 73 73 73 73 73 73 73 73 73 73 73 73 73 73 73 73 73           -- (unicode scalars of the casefolded string)
    -----------------------------------------------------------------
    1  2  3  4  5     6  7  8     9     0  1  2  3                              -- (external character indexes)
    s  t  r  a  ß     s  s  ß     ß     s  s  s  e
    1  2  3  4  5  6  7  8  9  0  1  2  3  4  5  6                              -- (internal byte indexes)
    73 74 72 61 73 73 73 73 73 73 73 73 73 73 73 65                             -- (unicode scalars of the casefolded string)
                                        |        |
                                        |        + 73 at (-10,16) <> 65 at (13,16)
                                        +-- yes, 9.
*/

/*
Debug output: the indexer supports the named parameter debug
"stra"~text~indexer~caselessCompare("straßssßßssse", "ß", debug:.true)=
    selfTextTransformer~iSubtext~string = strassssssssssssssssss
    selfTextTransformer~iSubtext~c2g = 73 74 72 61 73 73 73 73 73 73 73 73 73 73 73 73 73 73 73 73 73 73
    selfTextTransformedString~length = 22
    textTextTransformer~iSubtext~string = strassssssssssse
    textTextTransformer~iSubtext~c2g = 73 74 72 61 73 73 73 73 73 73 73 73 73 73 73 65
    textTextTransformedString~length = 16
    posB1 = 16
    posC1 = -10.16
    posB2 = 16
    posC2 = +13.16
     9
*/

ooRexx> "straß"        ~text~caselessCompare("straßssßßssse", "ß")=  -- 9
 9
ooRexx> "straßß"       ~text~caselessCompare("straßssßßssse", "ß")=  -- 9
 9
ooRexx> "straßßß"      ~text~caselessCompare("straßssßßssse", "ß")=  -- 9
 9
ooRexx> "straßßßß"     ~text~caselessCompare("straßssßßssse", "ß")=  -- 9
 9
ooRexx> "straßßßßß"    ~text~caselessCompare("straßssßßssse", "ß")=  -- 9
 9
ooRexx> "straßßßßßß"   ~text~caselessCompare("straßssßßssse", "ß")=  -- 9
 9
ooRexx> "straßßßßßßß"  ~text~caselessCompare("straßssßßssse", "ß")=  -- 9
 9
ooRexx> "straßßßßßßßß" ~text~caselessCompare("straßssßßssse", "ß")=  -- 9
 9

ooRexx> "straß"        ~text~caselessCompareTo("straßssßßssse")=  -- -1
-1
ooRexx> "straßß"       ~text~caselessCompareTo("straßssßßssse")=  -- -1
-1
ooRexx> "straßßß"      ~text~caselessCompareTo("straßssßßssse")=  -- -1
-1
ooRexx> "straßßßß"     ~text~caselessCompareTo("straßssßßssse")=  -- -1
-1
ooRexx> "straßßßßß"    ~text~caselessCompareTo("straßssßßssse")=  -- -1     up to 9 characters, it's lesser
-1
ooRexx> "straßßßßßß"   ~text~caselessCompareTo("straßssßßssse")=  -- 1      from 10 characters, it's greater
 1
ooRexx> "straßßßßßßß"  ~text~caselessCompareTo("straßssßßssse")=  -- 1
 1
ooRexx> "straßßßßßßßß" ~text~caselessCompareTo("straßssßßssse")=  -- 1
 1

ooRexx> "stra"     ~caselessCompare("strasssssse", "s")=    -- 11
 11
ooRexx> "stra"~text~caselessCompare("strasssssse", "s")=    -- 11
 11
ooRexx> "strasssssse"     ~caselessCompare("stra", "s")=    -- 11
 11
ooRexx> "strasssssse"~text~caselessCompare("stra", "s")=    -- 11
 11

ooRexx> "strà"     ~caselessCompare("stràsssssse", "s")=    -- 11 (was 12 before automatic conversion of string literals to text)
 11
ooRexx> "strà"~text~caselessCompare("stràsssssse", "s")=    -- 11
 11
ooRexx> "stràsssssse"     ~caselessCompare("strà", "s")=    -- 11 (was 12 before automatic conversion of string literals to text)
 11
ooRexx> "stràsssssse"~text~caselessCompare("strà", "s")=    -- 11
 11


/*
-------------------------------------------------
CASE 4 : not aligned in self, not aligned in arg1
-------------------------------------------------
*/

ooRexx> iota_dt = "\u{GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS}"~text~unescape
ooRexx> ("a" iota_dt~casefold "b")~compare("a" iota_dt, normalization: 0)=  -- 3
 3

/*
Debug output: the indexer supports the named parameter debug
("a" iota_dt~casefold "b")~indexer~compare("a" iota_dt, normalization: 0, debug:.true)=
    selfTextTransformer~iSubtext~string = a ΐ b
    selfTextTransformer~iSubtext~c2g = 61 20 CEB9CC88CC81 20 62
    selfTextTransformedString~length = 10
    textTextTransformer~iSubtext~string = a ΐ
    textTextTransformer~iSubtext~c2g = 61 20 CE90 20 20
    textTextTransformedString~length = 6
    posB1 = 4
    posC1 = -3.4
    posB2 = 4
    posC2 = -3.4
     3
*/


-- ===============================================================================
-- 2023 Oct 04

/*
Reactivate the constraint "self~isCompatibleWithByteString" when converting a
RexxText to a String. It can be disabled by setting
    .Unicode~unckeckedConversionToString = .true
Currently, the only case where this constraint is disabled is when testing
the regular expressions in diary_examples.rex.


Some checks of encoding compatibility were missing.
Added in:
- compareText: caselessCompare, compare
- compareToText: caselessCompareTo, compareTo
- endsWithText: caselessEndsWith, endsWith
- matchCharText: caselessMatchChar, matchChar
- matchText: caselessMatch, match
- posText: caselessPos, pos


New supported methods:
- abs       forward to String, return a String
- b2x       forward to String, return a String
- bitAnd    forward to String, return a String
- bitOr     forward to String, return a String
- bitXor    forward to String, return a String
Examples:
*/
ooRexx>     (-1)~text~abs=          -- 1
 1
ooRexx>     ("-x")~text~abs=        -- ABS method target must be a number; found "-x".
ABS method target must be a number; found "-x".
Error code= 93.943
ooRexx>     ("-é")~text~abs=        -- UTF-8 not-ASCII '-é' is not compatible with a Rexx numeric value.
UTF-8 not-ASCII '-é' is not compatible with a Rexx numeric value.
Error code= 23.900

ooRexx>     100~text~b2x=           -- 4
 4
ooRexx>     "x"~text~b2x=           -- Only 0, 1, and whitespace characters are valid in a binary string; character found "x".
Only 0, 1, and whitespace characters are valid in a binary string; character found "x".
Error code= 93.934
ooRexx>     "é"~text~b2x=           -- UTF-8 not-ASCII 'é' is not compatible with a Rexx numeric value.
UTF-8 not-ASCII 'é' is not compatible with a Rexx numeric value.
Error code= 23.900

ooRexx>     "12"x~text~bitAnd=                                  -- '[12]'   ("12"x)
'[12]'
ooRexx>     "73"x~text~bitAnd("27"x~text)=                      -- '#'      ("23"x)
'#'
ooRexx>     "13"x~text~bitAnd("5555"x~text)=                    -- '[11]U'  ("1155"x)
'[11]U'
ooRexx>     "13"x~text~bitAnd("5555"x~text,"74"x~text)=         -- '[11]T'  ("1154"x)
'[11]T'
ooRexx>     "pQrS"~text~bitAnd(,"DF"x~text("byte"))=            -- "PQRS"
'PQRS'

ooRexx>     "12"x~text~bitOr=                                   -- '[12]'       ("12"x)
'[12]'
ooRexx>     "15"x~text~bitOr("24"x~text)=                       -- 5            ("35"x)
 5
ooRexx>     "15"x~text~bitOr("2456"x~text)=                     -- '5V'         ("3556"x)
'5V'
ooRexx>     "15"x~text~bitOr("2456"x~text,"F0"x~text("byte"))=  -- '5?'         ("35F6"x)
'5�'
ooRexx>     "1111"x~text~bitOr(,"4D"x~text)=                    -- ']]'         ("5D5D"x)
']]'
ooRexx>     "pQrS"~text~bitOr(,"20"x~text)=                     -- "pqrs"
'pqrs'

ooRexx>     "12"x~text~bitXor=                                  -- '[12]'       ("12"x)
'[12]'
ooRexx>     "12"x~text~bitXor("22"x~text)=                      -- 0            ("30"x)
 0
ooRexx>     "1211"x~text~bitXor("22"x~text)=                    -- '0[11]'      ("3011"x)
'0[11]'
ooRexx>     "1111"x~text~bitXor("444444"x~text)=                -- 'UUD'        ("555544"x)
'UUD'
ooRexx>     "1111"x~text~bitXor("444444"x~text,"40"x~text)=     -- 'UU[04]'     ("555504"x)
'UU[04]'
ooRexx>     "1111"x~text~bitXor(,"4D"x~text)=                   -- '\\'         ("5C5C"x)
'\\'
ooRexx>     "C711"x~text~bitXor("222222"x~text," "~text)=       -- '?3[02]'     ("E53302"x)
'�3[02]'


/*
Implementation of caselessStartsWith, startsWith:
(forwards to caselessPos or pos, and returns .true if result == 1)
(was already implemented, waiting for 'pos' implementation)
Examples:
*/
ooRexx>     "Père"~text~c2g=                                -- '50 C3A8 72 65'
'50 C3A8 72 65'
ooRexx>     "Père"~text~startsWith("50"x)=                  -- true
 1
ooRexx>     "Père"~text~startsWith("50C3"x)=                -- was Invalid UTF-8 string     (utf8proc error because "50C3"x is an invalid UTF-8 encoding)
Invalid UTF-8 string.
Error code= 22.900
ooRexx>     "Père"~text~startsWith("50C3"x~text("byte"))=   -- false (not aligned)    (was Encoding: cannot compare Byte not-ASCII 'P?' with UTF-8 not-ASCII 'Père')
 0
ooRexx>     "Père"~text~startsWith("50C3A8"x)=              -- true
 1

ooRexx>     "éßffl"~text~c2g=                                 -- 'C3A9 C39F EFAC84'
'C3A9 C39F EFAC84'
ooRexx>     "éßffl"~text~casefold~c2g=                        -- 'C3A9 73 73 66 66 6C'
'C3A9 73 73 66 66 6C'
ooRexx>     "éßffl"~text~caselessStartsWith("É")=             -- true
 1
ooRexx>     "éßffl"~text~caselessStartsWith("És")=            -- false
 0
ooRexx>     "éßffl"~text~caselessStartsWith("Éss")=           -- true
 1
ooRexx>     "éßffl"~text~caselessStartsWith("Éssf")=          -- false
 0
ooRexx>     "éßffl"~text~caselessStartsWith("Éssff")=         -- false
 0
ooRexx>     "éßffl"~text~caselessStartsWith("Éssffl")=        -- true
 1

ooRexx>     "noël👩‍👨‍👩‍👧🎅"~text~startsWith("noël👩")=                       -- false
 0
ooRexx>     "noël👩‍👨‍👩‍👧🎅"~text~startsWith("noël👩", aligned:.false)=       -- true
 1
ooRexx>     "noël👩‍👨‍👩‍👧🎅"~text~startsWith("noël👩‍👨‍👩‍")=                  -- false
 0
ooRexx>     "noël👩‍👨‍👩‍👧🎅"~text~startsWith("noël👩‍👨‍👩‍", aligned:.false)=  -- true
 1
ooRexx>     "noël👩‍👨‍👩‍👧🎅"~text~startsWith("noël👩‍👨‍👩‍👧")=                 -- true
 1


-- ===============================================================================
-- 2023 Oct 03

/*
Move the routine createCharacterTranscodingTable from byte_common.cls to
byte_encoding.cls. It's used only by Byte_Encoding and its subclasses.


The 'text' method of UnicodeCharacter has been replaced by 'transcodeTo'.
Reason 1: the byte encodings were not supported correctly.
Reason 2: the fact a transcoding is needed is against the definition of the
'text' method (apply a view on the bytes without modifying them).


Finalize the support of replacement character during transcoding.
A replacement character can be .nil or "" or a character.
When a character, it can be a String or a RexxText made of one codepoint or a UnicodeCharacter.
In all cases, the corresponding codepoint is used. This codepoint is transcoded to the target encoding.

Behavior when a source codepoint does not have a matching target codepoint:
- When the replacement character is .nil, an error is raised.
- When the replacement character is "", the source codepoint is ignored (not transcoded)
- Otherwise the source codepoint is replaced by the replacement character.

Reminder: if the 'strict' named argument is false (default) then the fallback
codepoint transcodings are used, if any. So when 'strict' is false, potentially
more source codepoints can be transcoded.

Examples:
*/
ooRexx>     -- The Windows-1252 encoding has some fallback codepoint transcodings.
ooRexx>     -- HOP is one of them: 81x --> U+0081 only when strict:.false
ooRexx>     ("No" || "EB"x || "l" || "81"x)~text("windows-1252")~utf8(strict:.false)=   -- T'Noël (strict:.false is the default)
T'Noël'
ooRexx>     ("No" || "EB"x || "l" || "81"x)~text("windows-1252")~utf8(strict:.false)~unicodecharacters==
an Array (shape [5], 5 items)
 1 : ( "N"   U+004E Lu 1 "LATIN CAPITAL LETTER N" )
 2 : ( "o"   U+006F Ll 1 "LATIN SMALL LETTER O" )
 3 : ( "ë"   U+00EB Ll 1 "LATIN SMALL LETTER E WITH DIAERESIS" )
 4 : ( "l"   U+006C Ll 1 "LATIN SMALL LETTER L" )
 5 : ( ""    U+0081 Cc 0 "", "HIGH OCTET PRESET", "HOP" )
ooRexx>     ("No" || "EB"x || "l" || "81"x)~text("windows-1252")~utf8(strict:.true)=    -- Cannot convert windows-1252 not-ASCII character 129 (81) at byte-position 5 to UTF-8.
Cannot convert windows-1252 not-ASCII character 129 (81) at byte-position 5 to UTF-8.
Error code= 23.900
ooRexx>     ("No" || "EB"x || "l" || "81"x)~text("windows-1252")~utf8(strict:.true, replacementCharacter:"")=       -- T'Noël'
T'Noël'
ooRexx>     ("No" || "EB"x || "l" || "81"x)~text("windows-1252")~utf8(strict:.true, replacementCharacter:"#")=      -- T'Noël#'
T'Noël#'
ooRexx>     ("No" || "EB"x || "l" || "81"x)~text("windows-1252")~utf8(strict:.true, replacementCharacter:"🎅")=     -- T'Noël🎅'
T'Noël🎅'

ooRexx>     "Noël\u{HOP}"~text("utf8")~unescape~transcodeTo("byte")=                    -- Cannot convert UTF-8 not-ASCII codepoint 235 (EB) at position 3 to Byte.
Cannot convert UTF-8 not-ASCII codepoint 235 (EB) at position 3 to Byte.
Error code= 23.900
ooRexx>     "Noël\u{HOP}"~text("utf8")~unescape~transcodeTo("windows-1252")=            -- T'No?l?'
T'No�l�'
ooRexx>     "Noël\u{HOP}"~text("utf8")~unescape~transcodeTo("windows-1252")~c2x=        -- '4E 6F EB 6C 81'
'4E 6F EB 6C 81'
ooRexx>     "Noël\u{HOP}"~text("utf8")~unescape~transcodeTo("windows-1252", strict:)=   -- Cannot convert UTF-8 not-ASCII codepoint 129 (81) at position 5 to windows-1252.
Cannot convert UTF-8 not-ASCII codepoint 129 (81) at position 5 to windows-1252.
Error code= 23.900

ooRexx>     -- "byte" encoding: only 00..7F can be transcoded
ooRexx>     ("No" || "EB"x || "l" || "81"x)~text("byte")~utf8=                                                      -- Cannot convert Byte not-ASCII character 235 (EB) at byte-position 3 to UTF-8.
Cannot convert Byte not-ASCII character 235 (EB) at byte-position 3 to UTF-8.
Error code= 23.900
ooRexx>     ("No" || "EB"x || "l" || "81"x)~text("byte")~utf8(replacementCharacter:"")=                             -- T'Nol'
T'Nol'
ooRexx>     ("No" || "EB"x || "l" || "81"x)~text("byte")~utf8(replacementCharacter:"#")=                            -- T'No#l#'      1 replacement character for ë because "ë" is 'EB'x
T'No#l#'
ooRexx>     ("No" || "EB"x || "l" || "81"x)~text("byte")~utf8(replacementCharacter:"🎅")=                           -- T'No🎅l🎅'
T'No🎅l🎅'
ooRexx>     ("No" || "EB"x || "l" || "81"x)~text("byte")~utf8(replacementCharacter:"🎅"~text)=                      -- T'No🎅l🎅'
T'No🎅l🎅'
ooRexx>     ("No" || "EB"x || "l" || "81"x)~text("byte")~utf8(replacementCharacter:.unicode["Father Christmas"])=   -- T'No🎅l🎅'
T'No🎅l🎅'

ooRexx>     "Noël"~text("byte")~utf8(replacementCharacter:"")=                          -- T'Nol'
T'Nol'
ooRexx>     "Noël"~text("byte")~utf8(replacementCharacter:"#")=                         -- T'No##l'         2 replacement characters for ë because "ë" is 'C3 AB'x
T'No##l'
ooRexx>     "Noël"~text("byte")~utf8(replacementCharacter:"🎅")=                        -- T'No🎅🎅l'
T'No🎅🎅l'
ooRexx>     "Noël"~text("byte")~utf8(replacementCharacter:"🎅🎅")=                     -- The transcoded replacement character must have at most one codepoint, got UTF-8 not-ASCII (2 characters, 2 codepoints, 8 bytes, 0 error) '🎅🎅'.
The transcoded replacement character must have at most one codepoint, got UTF-8 not-ASCII (2 characters, 2 codepoints, 8 bytes, 0 error) '🎅🎅'.
Error code= 23.900
ooRexx>     "Noël"~text("byte")~utf8(replacementCharacter:"🎅🎅"~text)=                -- The transcoded replacement character must have at most one codepoint, got UTF-8 not-ASCII (2 characters, 2 codepoints, 8 bytes, 0 error) '🎅🎅'.
The transcoded replacement character must have at most one codepoint, got UTF-8 not-ASCII (2 characters, 2 codepoints, 8 bytes, 0 error) '🎅🎅'.
Error code= 23.900

ooRexx>     "Noël"~text("utf8")~transcodeTo("byte")=                                    -- Cannot convert UTF-8 not-ASCII codepoint 235 (EB) at position 3 to Byte.
Cannot convert UTF-8 not-ASCII codepoint 235 (EB) at position 3 to Byte.
Error code= 23.900
ooRexx>     "Noël"~text("utf8")~transcodeTo("byte", replacementCharacter:"")=           -- T'Nol'
T'Nol'
ooRexx>     "Noël"~text("utf8")~transcodeTo("byte", replacementCharacter:"#")=          -- T'No#l'
T'No#l'
ooRexx>     "Noël"~text("utf8")~transcodeTo("byte", replacementCharacter:"🎅")=         -- The replacement character UTF-8 not-ASCII '🎅' cannot be transcoded to Byte.
The replacement character UTF-8 not-ASCII '🎅' cannot be transcoded to Byte.
Error code= 93.900

ooRexx>     "Noël🤶"~text("utf8")~transcodeTo("unicode", replacementCharacter:"🎅")=        -- T'N[000000]o[000000]?[000000]l[000000]??[0100]'
T'N[000000]o[000000]�[000000]l[000000]6�[0100]'
ooRexx>     "Noël🤶"~text("utf8")~transcodeTo("unicode", replacementCharacter:"🎅")~c2x=    -- '4E000000 6F000000 EB000000 6C000000 36F90100'
'4E000000 6F000000 EB000000 6C000000 36F90100'
ooRexx>     "Noël🤶"~text("utf8")~transcodeTo("unicode", replacementCharacter:"🎅")~c2u=    -- 'U+004E U+006F U+00EB U+006C U+1F936'
'U+004E U+006F U+00EB U+006C U+1F936'

ooRexx>     "Noël🤶"~text("utf8")~transcodeTo("unicode8", replacementCharacter:"")=          -- T'No?l'
T'No�l'
ooRexx>     "Noël🤶"~text("utf8")~transcodeTo("unicode8", replacementCharacter:"")~c2x=      -- '4E 6F EB 6C'
'4E 6F EB 6C'
ooRexx>     "Noël🤶"~text("utf8")~transcodeTo("unicode8", replacementCharacter:"#")=         -- T'No?l#'
T'No�l#'
ooRexx>     "Noël🤶"~text("utf8")~transcodeTo("unicode8", replacementCharacter:"#")~c2x=     -- '4E 6F EB 6C 23'
'4E 6F EB 6C 23'
ooRexx>     "Noël🤶"~text("utf8")~transcodeTo("unicode8", replacementCharacter:"🎅")=       -- The replacement character UTF-8 not-ASCII '🎅' cannot be transcoded to Unicode8.
The replacement character UTF-8 not-ASCII '🎅' cannot be transcoded to Unicode8.
Error code= 93.900

ooRexx>     "Noël🤶"~text("utf8")~transcodeTo("unicode16", replacementCharacter:"")=         -- T'N[00]o[00]?[00]l[00]'
T'N[00]o[00]�[00]l[00]'
ooRexx>     "Noël🤶"~text("utf8")~transcodeTo("unicode16", replacementCharacter:"#")=        -- T'N[00]o[00]?[00]l[00]#[00]'
T'N[00]o[00]�[00]l[00]#[00]'
ooRexx>     "Noël🤶"~text("utf8")~transcodeTo("unicode16", replacementCharacter:"#")~c2x=    -- '4E00 6F00 EB00 6C00 2300'
'4E00 6F00 EB00 6C00 2300'
ooRexx>     "Noël🤶"~text("utf8")~transcodeTo("unicode16", replacementCharacter:"#")~c2u=    -- 'U+004E U+006F U+00EB U+006C U+0023'
'U+004E U+006F U+00EB U+006C U+0023'
ooRexx>     "Noël🤶"~text("utf8")~transcodeTo("unicode16", replacementCharacter:"🎅")=      -- The replacement character UTF-8 not-ASCII '🎅' cannot be transcoded to Unicode16.
The replacement character UTF-8 not-ASCII '🎅' cannot be transcoded to Unicode16.
Error code= 93.900

ooRexx>     "Noël🤶"~text("utf8")~transcodeTo("unicode32")=         -- T'N[000000]o[000000]?[000000]l[000000]6?[0100]'
T'N[000000]o[000000]�[000000]l[000000]6�[0100]'
ooRexx>     "Noël🤶"~text("utf8")~transcodeTo("unicode32")~c2x=     -- '4E000000 6F000000 EB000000 6C000000 36F90100'
'4E000000 6F000000 EB000000 6C000000 36F90100'
ooRexx>     "Noël🤶"~text("utf8")~transcodeTo("unicode32")~c2u=     -- 'U+004E U+006F U+00EB U+006C U+1F936'
'U+004E U+006F U+00EB U+006C U+1F936'


/*
The method c2u is no longer abstract for the byte encodings.
Now, a byte encoding is converted on the fly to UnicodeN in non strict mode,
replacing any unsupported character by .Unicode~replacementCharacter.
Idem for the method unicodeCharacters.
Examples:
*/
ooRexx>     "FF FE FD FC"x~text("byte")~c2x=                                               -- 'FF FE FD FC'
'FF FE FD FC'
ooRexx>     "FF FE FD FC"x~text("byte")~c2g=                                               -- 'FF FE FD FC'
'FF FE FD FC'
ooRexx>     "FF FE FD FC"x~text("byte")~codepoints==
a CodePointSupplier 
 1 :  255
 2 :  254
 3 :  253
 4 :  252
ooRexx>     "FF FE FD FC"x~text("byte")~c2u=                                               -- 'U+FFFD U+FFFD U+FFFD U+FFFD'
'U+FFFD U+FFFD U+FFFD U+FFFD'
ooRexx>     "FF FE FD FC"x~text("byte")~unicodeCharacters==
an Array (shape [4], 4 items)
 1 : ( "�"   U+FFFD So 1 "REPLACEMENT CHARACTER" )
 2 : ( "�"   U+FFFD So 1 "REPLACEMENT CHARACTER" )
 3 : ( "�"   U+FFFD So 1 "REPLACEMENT CHARACTER" )
 4 : ( "�"   U+FFFD So 1 "REPLACEMENT CHARACTER" )

ooRexx>     "FF FE FD FC"x~text("utf8")~c2x=                                               -- 'FF FE FD FC'
'FF FE FD FC'
ooRexx>     "FF FE FD FC"x~text("utf8")~c2g=                                               -- 'FF FE FD FC'
'FF FE FD FC'
ooRexx>     "FF FE FD FC"x~text("utf8")~c2u=                                               -- 'U+FFFD U+FFFD U+FFFD U+FFFD'
'U+FFFD U+FFFD U+FFFD U+FFFD'
ooRexx>     "FF FE FD FC"x~text("utf8")~codepoints==
a CodePointSupplier 
 1 :  65533
 2 :  65533
 3 :  65533
 4 :  65533

ooRexx>     "FF FE FD FC"x~text("unicode8")~c2x=                                           -- 'FF FE FD FC'
'FF FE FD FC'
ooRexx>     "FF FE FD FC"x~text("unicode8")~c2g=                                           -- 'FF FE FD FC'
'FF FE FD FC'
ooRexx>     "FF FE FD FC"x~text("unicode8")~codepoints==
a CodePointSupplier 
 1 :  255
 2 :  254
 3 :  253
 4 :  252
ooRexx>     "FF FE FD FC"x~text("unicode8")~c2u=                                           -- 'U+00FF U+00FE U+00FD U+00FC'
'U+00FF U+00FE U+00FD U+00FC'
ooRexx>     "FF FE FD FC"x~text("unicode8")~unicodecharacters==
an Array (shape [4], 4 items)
 1 : ( "ÿ"   U+00FF Ll 1 "LATIN SMALL LETTER Y WITH DIAERESIS" )
 2 : ( "þ"   U+00FE Ll 1 "LATIN SMALL LETTER THORN" )
 3 : ( "ý"   U+00FD Ll 1 "LATIN SMALL LETTER Y WITH ACUTE" )
 4 : ( "ü"   U+00FC Ll 1 "LATIN SMALL LETTER U WITH DIAERESIS" )

ooRexx>     ("No" || "EB"x || "l" || "81"x)~text("byte")~c2x=                           -- '4E 6F EB 6C 81'
'4E 6F EB 6C 81'
ooRexx>     ("No" || "EB"x || "l" || "81"x)~text("byte")~c2g=                           -- '4E 6F EB 6C 81'
'4E 6F EB 6C 81'
ooRexx>     ("No" || "EB"x || "l" || "81"x)~text("byte")~c2u=                           -- 'U+004E U+006F U+FFFD U+006C U+FFFD'
'U+004E U+006F U+FFFD U+006C U+FFFD'
ooRexx>     ("No" || "EB"x || "l" || "81"x)~text("byte")~unicodecharacters==
an Array (shape [5], 5 items)
 1 : ( "N"   U+004E Lu 1 "LATIN CAPITAL LETTER N" )
 2 : ( "o"   U+006F Ll 1 "LATIN SMALL LETTER O" )
 3 : ( "�"   U+FFFD So 1 "REPLACEMENT CHARACTER" )
 4 : ( "l"   U+006C Ll 1 "LATIN SMALL LETTER L" )
 5 : ( "�"   U+FFFD So 1 "REPLACEMENT CHARACTER" )

ooRexx>     ("No" || "EB"x || "l" || "81"x)~text("windows-1252")~c2x=                   -- '4E 6F EB 6C 81'
'4E 6F EB 6C 81'
ooRexx>     ("No" || "EB"x || "l" || "81"x)~text("windows-1252")~c2g=                   -- '4E 6F EB 6C 81'
'4E 6F EB 6C 81'
ooRexx>     ("No" || "EB"x || "l" || "81"x)~text("windows-1252")~c2u=                   -- 'U+004E U+006F U+00EB U+006C U+0081'
'U+004E U+006F U+00EB U+006C U+0081'
ooRexx>     ("No" || "EB"x || "l" || "81"x)~text("windows-1252")~unicodecharacters==
an Array (shape [5], 5 items)
 1 : ( "N"   U+004E Lu 1 "LATIN CAPITAL LETTER N" )
 2 : ( "o"   U+006F Ll 1 "LATIN SMALL LETTER O" )
 3 : ( "ë"   U+00EB Ll 1 "LATIN SMALL LETTER E WITH DIAERESIS" )
 4 : ( "l"   U+006C Ll 1 "LATIN SMALL LETTER L" )
 5 : ( ""    U+0081 Cc 0 "", "HIGH OCTET PRESET", "HOP" )


-- ===============================================================================
-- 2023 Sep 27

/*
Add the named parameters 'stripCC' and 'stripNA' to all the methods supporting
the named parameter 'normalization'. This is utf8proc specific.
- stripCC: remove control characters (see utf8proc doc for more information:
  HorizontalTab (HT) and FormFeed (FF) are transformed into space)
- stripNA: remove unassigned codepoints
Example:
*/
ooRexx> .unicode["ESA"]=        -- ( "‡"    U+0087 Cc 0 "", "END OF SELECTED AREA", "ESA"
( "‡"    U+0087 Cc 0 "", "END OF SELECTED AREA", "ESA" )
ooRexx> .unicode["NBSP"]=       -- ( " "   U+00A0 Zs 1 "NO-BREAK SPACE", "NBSP" )
( " "   U+00A0 Zs 1 "NO-BREAK SPACE", "NBSP" )
ooRexx> .unicode["SSA"]=        -- ( "†"    U+0086 Cc 0 "", "START OF SELECTED AREA", "SSA"
( "†"    U+0086 Cc 0 "", "START OF SELECTED AREA", "SSA" )
ooRexx> .unicode["U+0378"]=     -- ( "͸"   U+0378 Cn 1 "" )     unassigned
( "͸"   U+0378 Cn 1 "" )

ooRexx> "\U{SSA}Mrs.\U{NBSP}🤶 a\u0378nd Mr.\U{NBSP}🎅\U{ESA}"~text~unescape=                                         -- T'†Mrs. 🤶 a͸nd Mr. 🎅
T'†Mrs. 🤶 a͸nd Mr. 🎅‡'
ooRexx> "\U{SSA}Mrs.\U{NBSP}🤶 a\u0378nd Mr.\U{NBSP}🎅\U{ESA}"~text~unescape~c2g=                                     -- 'C286 4D 72 73 2E C2A0 F09FA4B6 20 61 CDB8 6E 64 20 4D 72 2E C2A0 F09F8E85 C287'
'C286 4D 72 73 2E C2A0 F09FA4B6 20 61 CDB8 6E 64 20 4D 72 2E C2A0 F09F8E85 C287'
ooRexx> "\U{SSA}Mrs.\U{NBSP}🤶 a\u0378nd Mr.\U{NBSP}🎅\U{ESA}"~text~unescape~transform(stripNA:)~c2g=                 -- 'C286 4D 72 73 2E C2A0 F09FA4B6 20 61      6E 64 20 4D 72 2E C2A0 F09F8E85 C287'
'C286 4D 72 73 2E C2A0 F09FA4B6 20 61 6E 64 20 4D 72 2E C2A0 F09F8E85 C287'
ooRexx> "\U{SSA}Mrs.\U{NBSP}🤶 a\u0378nd Mr.\U{NBSP}🎅\U{ESA}"~text~unescape~transform(stripNA:, stripCC:)~c2g=       -- '     4D 72 73 2E C2A0 F09FA4B6 20 61      6E 64 20 4D 72 2E C2A0 F09F8E85     '
'4D 72 73 2E C2A0 F09FA4B6 20 61 6E 64 20 4D 72 2E C2A0 F09F8E85'

ooRexx> "\U{SSA}Mrs.\U{NBSP}🤶 a\u0378nd Mr.\U{NBSP}🎅\U{ESA}"~text~unescape~pos("and")=                              -- 0
 0
ooRexx> "\U{SSA}Mrs.\U{NBSP}🤶 a\u0378nd Mr.\U{NBSP}🎅\U{ESA}"~text~unescape~pos("and", stripNA:)=                    -- 9
 9
ooRexx> "\U{SSA}Mrs.\U{NBSP}🤶 a\u0378nd Mr.\U{NBSP}🎅\U{ESA}"~text~unescape~pos("and", stripNA:, stripCC:)=          -- 9    yes! 9, not 8 because it's the EXTERNAL position
 9

ooRexx> "\U{SSA}Mrs.\U{NBSP}🤶 a\u0378nd Mr.\U{NBSP}🎅\U{ESA}"~text~unescape~caselessPos("mr.")=                      -- 14
 14
ooRexx> "\U{SSA}Mrs.\U{NBSP}🤶 a\u0378nd Mr.\U{NBSP}🎅\U{ESA}"~text~unescape~caselessPos("mr.", stripNA:)=            -- 14   yes! 14, not 13
 14
ooRexx> "\U{SSA}Mrs.\U{NBSP}🤶 a\u0378nd Mr.\U{NBSP}🎅\U{ESA}"~text~unescape~caselessPos("mr.", stripNA:, stripCC:)=  -- 14   yes! 14, not 12
 14

ooRexx> "\U{SSA}Mrs.\U{NBSP}🤶 a\u0378nd Mr.\U{NBSP}🎅\U{ESA}"~text~unescape~caselessPos("\U{SSA}"~text~unescape)=              -- 1
 1
ooRexx> "\U{SSA}Mrs.\U{NBSP}🤶 a\u0378nd Mr.\U{NBSP}🎅\U{ESA}"~text~unescape~caselessPos("\U{SSA}"~text~unescape, stripCC:)=    -- 0
 0

ooRexx> "\U{SSA}Mrs.\U{NBSP}🤶 a\u0378nd Mr.\U{NBSP}🎅\U{ESA}"~text~unescape~caselessPos("a\u0378nd"~text~unescape)=                        -- 9
 9
ooRexx> "\U{SSA}Mrs.\U{NBSP}🤶 a\u0378nd Mr.\U{NBSP}🎅\U{ESA}"~text~unescape~caselessPos("a\u0378nd"~text~unescape, stripCC:)=              -- 9
 9
ooRexx> "\U{SSA}Mrs.\U{NBSP}🤶 a\u0378nd Mr.\U{NBSP}🎅\U{ESA}"~text~unescape~caselessPos("a\u0378nd"~text~unescape, stripNA:)=              -- 9    yes! 9, not 0 because \u0378 is removed both in the needle and in the haystack
 9
ooRexx> "\U{SSA}Mrs.\U{NBSP}🤶 a\u0378nd Mr.\U{NBSP}🎅\U{ESA}"~text~unescape~caselessPos("a\u0378nd"~text~unescape, stripNA:, stripCC:)=    -- 9    yes! 9, not 8
 9


/*
caselessEndsWith, endsWith: return false if the start of the 'other' string is
not aligned with a character.
Examples:
*/
ooRexx> "#éßffl#…"~text~endsWith("…")=                      -- true
 1
ooRexx> "#éßffl#…"~text~caselessEndsWith("…")=              -- true
 1

ooRexx> "#éßffl#…"~text~endsWith("fl#…")=                   -- false, ffl remains ffl
 0
ooRexx> "#éßffl#…"~text~caselessEndsWith("FL#…")=           -- false, ffl becomes ffl but FL is not aligned with ffl
 0

ooRexx> "#éßffl#…"~text~endsWith("ffl#…")=                   -- true
 1
ooRexx> "#éßffl#…"~text~caselessEndsWith("ffl#…")=           -- true
 1

ooRexx> "#éßffl#…"~text~endsWith("ffl#…")=                  -- false, ffl remains ffl
 0
ooRexx> "#éßffl#…"~text~caselessEndsWith("FFL#…")=          -- true,  ffl becomes ffl and FFL is aligned with ffl
 1

ooRexx> "#éßffl#…"~text~endsWith("sffl#…")=                  -- false, ß remains ß
 0
ooRexx> "#éßffl#…"~text~caselessEndsWith("Sffl#…")=          -- false, ß becomes ss but s is not aligned with ss
 0

ooRexx> "#éßffl#…"~text~endsWith("ßffl#…")=                  -- true
 1
ooRexx> "#éßffl#…"~text~caselessEndsWith("ßffl#…")=          -- true
 1

ooRexx> "#éßffl#…"~text~endsWith("ssffl#…")=                 -- false, ß remains ß
 0
ooRexx> "#éßffl#…"~text~caselessEndsWith("SSffl#…")=         -- true,  ß becomes ss
 1

ooRexx> "#éßffl#…"~text~endsWith("éßffl#…")=                 -- true
 1
ooRexx> "#éßffl#…"~text~caselessEndsWith("ÉSSFFL#…")=       -- true
 1

ooRexx> "#éßffl#…"~text~endsWith("#éßffl#…")=                -- true
 1
ooRexx> "#éßffl#…"~text~caselessEndsWith("#ÉSSFFL#…")=      -- true
 1

ooRexx> "#e\U{COMBINING ACUTE ACCENT}ßffl#…"~text~unescape~c2g=                                                                   -- '23 65CC81 C39F EFAC84 23 E280A6'
'23 65CC81 C39F EFAC84 23 E280A6'
ooRexx>   "\U{COMBINING ACUTE ACCENT}ßffl#…"~text~unescape~c2g=                                                                   -- '     CC81 C39F EFAC84 23 E280A6'
'CC81 C39F EFAC84 23 E280A6'
ooRexx> "#e\U{COMBINING ACUTE ACCENT}ßffl#…"~text~unescape~endsWith("\U{COMBINING ACUTE ACCENT}ßffl#…"~text~unescape)=             -- false, not aligned with e\U{COMBINING ACUTE ACCENT}
 0

ooRexx> "#e\U{COMBINING ACUTE ACCENT}ßffl#…"~text~unescape~casefold~c2g=                                                          -- '23 65CC81 73 73 66 66 6C 23 E280A6'
'23 65CC81 73 73 66 66 6C 23 E280A6'
ooRexx>   "\U{COMBINING ACUTE ACCENT}SSFFL#…"~text~unescape~casefold~c2g=                                                        -- '     CC81 73 73 66 66 6C 23 E280A6'
'CC81 73 73 66 66 6C 23 E280A6'
ooRexx> "#e\U{COMBINING ACUTE ACCENT}ßffl#…"~text~unescape~caselessEndsWith("\U{COMBINING ACUTE ACCENT}SSFFL#…"~text~unescape)=   -- false, not aligned with e\U{COMBINING ACUTE ACCENT}
 0


/*
New 'RexxTextTransformer' class:
    - Converts positions in a transformed string to positions in the corresponding
      untransformed string. This is used for the caselessXXX methods which take
      or return positions.
    - Supports inflating and deflating transformations.
      jlf 2023 Sep 28: better names are expansion and contraction.
    - The transformation can be made on a part of the string (from startC, for
      lengthC characters).
    - The methods for the transformation are the same as for RexxText:
      NFC, NFD, NFKC, NFKD, casefold, transform. The result is the instance of
      RexxTextTransformer, not the transformed text.
    - Only one call to a transformation method can be done. This is because the
      parameters of the transformation are memorized to re-apply internally the
      transformation character by character, when moving the cursors.
    - The 'transformer' method lets you create an instance of RexxTextTransformer
      from a text.

    Example:
        - full text        = original text (untransformed)
        - external subtext = part of the full text to transform
        - internal subtext = transformed part of the full text

        The method ib2xc converts an internal byte (ib) position in the internal
        subtext (iSubtext) to an external character (xc) position in the external
        full text.
        ib2xc supports only growing positions. The only way to go backward is to
        use backupPos/restorePos or resetPos.

                                     --                          Transformed part of the full text
                                     --                       +-------------------------------------+               -- GLOBAL INDEXES (offsetC=3, offsetB=7)
                                     --  01   | 02   | 03     | 04 | 05     | 06    | 07       | 08 | 09            -- (external character indexes) <--------+
                                     --  1 2  | 3 4  | 5 6 7  | 8  | 9 0    | 1 2   | 3 4 5    | 6  | 7 8 9         -- (external byte indexes)               |
            "éßffl#éßffl#…"~text~c2g   --  C3A9 | C39F | EFAC84 | 23 | C3A9   | C39F  | EFAC84   | 23 | E280A6        -- (external bytes)                      |
                                     --  é    | ß    | ffl     | #  | é      | ß     | ffl       | #  | …             -- (full text)                           ^
                                     --  1 2  | 3 4  | 5 6 7  | 8  | 9 0 1  | 2  3  | 4  5  6  | 7  | 8 9 0         -- (internal byte indexes, offset=7)     |
                                     --  C3A9 | C39F | EFAC84 | 23 | 65CC81 | 73 73 | 66 66 6C | 23 | E280A6        -- (internal bytes)                      |
                                                              +-------------------------------------+                                                        |
                                                                                                                    -- RELATIVE INDEXES                      |
                                                            --  01 | 02     | 03    | 04       | 05                 -- (external character indexes) <--------+
                                                            --  1  | 2 3    | 4 5   | 6 7 8    | 9                  -- (external byte indexes)               |
            "#éßffl#"~text~c2g=                              --  23 | C3A9   | C39F  | EFAC84   | 23                 -- (external bytes)                      |
                                                            --  #  | é      | ß     | ffl       | #                  -- (external subtext)                    ^
                                                                                                                                                             |
                                                                                                                    -- RELATIVE INDEXES                      |
                                                            --  01 | 02     | 03 04 | 05 06 07 | 08                 -- (internal character indexes)          |
                                                            --  1  | 2 3 4  | 5  6  | 7  8  9  | 0                  -- (internal byte indexes) ------>-------+
            "#éßffl#"~text~NFD(casefold:)~c2g=               --  23 | 65CC81 | 73 73 | 66 66 6C | 23                 -- (internal bytes)
                                                            --  #  | é      | s  s  | f  f  l  | #                  -- (internal subtext)
*/
ooRexx> transformer = "éßffl#éßffl#…"~text~transformer(4, 5)~NFD(casefold:)
ooRexx> transformer~fulltext=       -- T'éßffl#éßffl#…'
T'éßffl#éßffl#…'
ooRexx> transformer~xSubtext=       -- T'#éßffl#'
T'#éßffl#'
ooRexx> transformer~iSubtext=       -- T'#éssffl#'
T'#éssffl#'

-- ib2xc supports only growing positions
ooRexx> transformer~ib2xc(1)=       -- 4    the internal byte position 1 in the internal subtext corresponds to the 4th external character in the full text
 4
ooRexx> transformer~ib2xc(7)=       -- 7
 7
ooRexx> transformer~ib2xc(2)=       -- Error RexxTextTransformer: You specified a byte position (2) lower than the previous one (7).
RexxTextTransformer: You specified a byte position (2) lower than the previous one (7).
Error code= 93.900

-- The previous error is avoided by backing up/restoring the current position
ooRexx> transformer~resetPos        -- reset to allow iteration again from internal byte position 1
ooRexx> transformer~ib2xc(1)=       -- 4
 4
ooRexx> transformer~backupPos
ooRexx> transformer~ib2xc(7)=       -- 7
 7
ooRexx> transformer~restorePos
ooRexx> transformer~ib2xc(2)=       -- 5
 5

ooRexx> transformer~resetPos
ooRexx> do i=1 to transformer~iSubtext~string~length; say "byte pos" i~right(2) "    character pos=" transformer~ib2xc(i)~string~left(20) transformer~ib2xc(i, aligned:.false); end
byte pos  1     character pos= 4                    +4.8
byte pos  2     character pos= 5                    +5.9
byte pos  3     character pos= -5                   -5.10
byte pos  4     character pos= -5                   -5.11
byte pos  5     character pos= 6                    +6.12
byte pos  6     character pos= -6                   -6.13
byte pos  7     character pos= 7                    +7.14
byte pos  8     character pos= -7                   -7.15
byte pos  9     character pos= -7                   -7.16
byte pos 10     character pos= 8                    +8.17
/*
    byte pos  1     character pos= 4                    +4.8    -- the 8th internal byte is aligned with the 4th external character
    byte pos  2     character pos= 5                    +5.9
    byte pos  3     character pos= -5                   -5.10   -- the 10th internal byte is part of the 5th external character, but is not aligned with it (negative position).
    byte pos  4     character pos= -5                   -5.11
    byte pos  5     character pos= 6                    +6.12
    byte pos  6     character pos= -6                   -6.13
    byte pos  7     character pos= 7                    +7.14
    byte pos  8     character pos= -7                   -7.15
    byte pos  9     character pos= -7                   -7.16
    byte pos 10     character pos= 8                    +8.17
*/
/*
    More details on positions mappings.
    transformer~iSubtext is the transformed part of the full text.
    The internal relative byte position 1 becomes the internal global byte position 8:
        There are 7 bytes (offsetB=7) before the part to transform: 1 + 7 = 8.
        It's the same offsetB=7 for external and internal bytes, because this part is not transformed.
        Remember:
        It doesn't make sense to return the external byte position, because some internal byte positions
        have no corresponding external byte position. For example the internal global byte position 11.
        For diagnostics and analysis, only internal byte positions are relevant.
    The external relative character position 1 becomes the external global character position 4:
        There are 3 characters (offsetC=3) before the part to transform: 1 + 3 = 4.
        It's the same offsetC=3 for external and internal characters, because this part is not transformed.
        Remember:
        The user works only with external global character positions.
        It wouldn't make sense to return internal character positions.
    Example of alignment:
        The internal relative byte position 1 becomes the internal global byte position 8,
        is part of the 4th external character and is aligned with it.
    Example of non-alignment:
        The internal relative byte position 3 becomes the internal global byte position 10,
        is part of the 5th external character and is not aligned with it.
*/


-- ===============================================================================
-- 2023 Sep 16

/*
Relax the constraint "self~isCompatibleWithByteString" when converting a RexxText
to a String.
This makes it possible to go further in the regular expression tests.
*/
ooRexx> unckeckedConversionToString = .Unicode~unckeckedConversionToString -- backup
ooRexx> .Unicode~unckeckedConversionToString = .true


-- bug in regex.cls
ooRexx> p = .Pattern~compile("(.)*foo")
ooRexx> p~matches("xfooxxxxxxfooXXXX")=         -- Invalid position argument specified; found "0".
Invalid position argument specified; found "0".
Error code= 93.924


-- False success in text mode
-- "à" is 2 bytes 'C3A0', "🎅" is 4 bytes 'F09F8E85'
-- When compiling a String, each of the bytes of "à" or "🎅" becomes a candidate for matching
-- When compiling a RexxText, only the sequence of all the bytes of "à" or "🎅" should match... But that's not the case.
ooRexx> pB = .Pattern~compile("[àb🎅]")
ooRexx> pT = .Pattern~compile("[àb🎅]"~text)
ooRexx> pB~startsWith('àXXXX')=                             -- 1
 1
ooRexx> pT~startsWith('àXXXX'~text)=                        -- 1 but matched only C3
 1
ooRexx> pB~startsWith('bXXXX')=                             -- 1
 1
ooRexx> pT~startsWith('bXXXX'~text)=                        -- 1
 1
ooRexx> pB~startsWith('🎅XXXX')=                            -- 1
 1
ooRexx> pT~startsWith('🎅XXXX'~text)=                       -- 1
 1
ooRexx> pB~startsWith('F0'x || 'XXXX')=                     -- Invalid UTF-8 string (raised by utf8proc) (was 1 before automatic conversion of string literals to text)
Invalid UTF-8 string.
Error code= 22.900
ooRexx> pT~startsWith('F0'x || 'XXXX'~text)=                -- Invalid UTF-8 string (raised by utf8proc)
Invalid UTF-8 string.
Error code= 22.900
ooRexx> pT~startsWith('F0'x || 'XXXX')=                     -- Invalid UTF-8 string (raised by utf8proc) (was 1 (not good) before automatic conversion of string literals to text)
Invalid UTF-8 string.
Error code= 22.900
ooRexx> pB~startsWith('9F'x || 'XXXX')=                     -- Invalid UTF-8 string (raised by utf8proc) (was 1 before automatic conversion of string literals to text)
Invalid UTF-8 string.
Error code= 22.900
ooRexx> pT~startsWith('9F'x || 'XXXX'~text)=                -- Invalid UTF-8 string (raised by utf8proc)
Invalid UTF-8 string.
Error code= 22.900
ooRexx> pT~startsWith('9F'x || 'XXXX')=                     -- Invalid UTF-8 string (raised by utf8proc) (was 1 (not good) before automatic conversion of string literals to text)
Invalid UTF-8 string.
Error code= 22.900


-- greedy pattern
ooRexx> pB = .Pattern~compile("(.)*fô🎅")
ooRexx> pT = .Pattern~compile("(.)*fô🎅"~text)
ooRexx> pB~matches("xfooxxxxxxfô🎅")=                        -- 1
 1
ooRexx> pT~matches("xfooxxxxxxfô🎅"~text)=                   -- 1
 1
ooRexx> pB~startsWith("xfooxxxxxxfô🎅")=                     -- 1
 1
ooRexx> pT~startsWith("xfooxxxxxxfô🎅"~text)=                -- 1
 1


-- zero or one occurrences of "a"
ooRexx> pB = .Pattern~compile("a?")
ooRexx> pT = .Pattern~compile("a?"~text)
ooRexx> pB~matches("")=                                     -- 1
 1
ooRexx> pT~matches(""~text)=                                -- 1
 1
ooRexx> pB~matches("a")=                                    -- 1
 1
ooRexx> pT~matches("a"~text)=                               -- 1
 1
ooRexx> pB~matches("aa")=                                   -- 0
 0
ooRexx> pT~matches("aa"~text)=                              -- 0
 0


-- zero or one occurrences of "🎅"
ooRexx> pB = .Pattern~compile("🎅?")
ooRexx> pT = .Pattern~compile("🎅?"~text)
ooRexx> pB~matches("")=                                     -- 1 (was 0 (KO) before automatic conversion of string literals to text)
 1
ooRexx> pT~matches(""~text)=                                -- 1
 1
ooRexx> pB~matches("🎅")=                                   -- 1
 1
ooRexx> pT~matches("🎅"~text)=                              -- 1
 1
ooRexx> pB~matches("🎅🎅")=                                 -- 0
 0
ooRexx> pT~matches("🎅🎅"~text)=                            -- 0
 0


-- exactly 3 occurrences of "a"
ooRexx> pB = .Pattern~compile("a{3}")
ooRexx> pT = .Pattern~compile("a{3}"~text)
ooRexx> pB~matches("aa")=                                   -- 0
 0
ooRexx> pT~matches("aa"~text)=                              -- 0
 0
ooRexx> pB~matches("aaa")=                                  -- 1
 1
ooRexx> pT~matches("aaa"~text)=                             -- 1
 1
ooRexx> pB~matches("aaaa")=                                 -- 0
 0
ooRexx> pT~matches("aaaa"~text)=                            -- 0
 0


-- exactly 3 occurrences of "🎅"
ooRexx> pB = .Pattern~compile("🎅{3}")
ooRexx> pT = .Pattern~compile("🎅{3}"~text)
ooRexx> pB~matches("🎅🎅")=                                 -- 0
 0
ooRexx> pT~matches("🎅🎅"~text)=                            -- 0
 0
ooRexx> pB~matches("🎅🎅🎅")=                               -- 1 (was 0    KO before automatic conversion of string literals to text)
 1
ooRexx> pT~matches("🎅🎅🎅"~text)=                          -- 1
 1
ooRexx> pB~matches("🎅🎅🎅🎅")=                             -- 0
 0
ooRexx> pT~matches("🎅🎅🎅🎅"~text)=                        -- 0
 0


-- repetitive "b" in the middle
ooRexx> pB = .Pattern~compile("ab{2}c")
ooRexx> pT = .Pattern~compile("ab{2}c"~text)
ooRexx> pB~matches("ac")=                                   -- 0
 0
ooRexx> pT~matches("ac"~text)=                              -- 0
 0
ooRexx> pB~matches("abc")=                                  -- 0
 0
ooRexx> pT~matches("abc"~text)=                             -- 0
 0
ooRexx> pB~matches("abbc")=                                 -- 1
 1
ooRexx> pT~matches("abbc"~text)=                            -- 1
 1
ooRexx> pB~matches("abbbc")=                                -- 0
 0
ooRexx> pT~matches("abbbc"~text)=                           -- 0
 0


-- repetitive "🎅" in the middle
ooRexx> pB = .Pattern~compile("a🎅{2}c")
ooRexx> pT = .Pattern~compile("a🎅{2}c"~text)
ooRexx> pB~matches("ac")=                                   -- 0
 0
ooRexx> pT~matches("ac"~text)=                              -- 0
 0
ooRexx> pB~matches("a🎅c")=                                 -- 0
 0
ooRexx> pT~matches("a🎅c"~text)=                            -- 0
 0
ooRexx> pB~matches("a🎅🎅c")=                               -- 1 (was 0 (KO) before automatic conversion of string literals to text)
 1
ooRexx> pT~matches("a🎅🎅c"~text)=                          -- 1
 1
ooRexx> pB~matches("a🎅🎅🎅c")=                             -- 0
 0
ooRexx> pT~matches("a🎅🎅🎅c"~text)=                        -- 0
 0


-- "a" or "b"
ooRexx> pB = .Pattern~compile("a|b")
ooRexx> pT = .Pattern~compile("a|b"~text)
ooRexx> pB~matches("a")=                                    -- 1
 1
ooRexx> pT~matches("a"~text)=                               -- 1
 1
ooRexx> pB~matches("b")=                                    -- 1
 1
ooRexx> pT~matches("b"~text)=                               -- 1
 1
ooRexx> pB~matches("c")=                                    -- 0
 0
ooRexx> pT~matches("c"~text)=                               -- 0
 0
ooRexx> pB~startsWith("abc")=                               -- 1
 1
ooRexx> pT~startsWith("abc"~text)=                          -- 1
 1
ooRexx> pB~startsWith("bac")=                               -- 1
 1
ooRexx> pT~startsWith("bac"~text)=                          -- 1
 1
ooRexx> r = pB~find("xxxabcxxx")
ooRexx> r~matched=; r~start=; r~end=; r~text=; r~length=
 1
 4
 5
'a'
 1
ooRexx> r = pT~find("xxxabcxxx"~text)
ooRexx> r~matched=; r~start=; r~end=; r~text=; r~length=
 1
 4
 5
T'a'
 1
ooRexx> r = pB~find("xxxbacxxx")
ooRexx> r~matched=; r~start=; r~end=; r~text=; r~length=
 1
 4
 5
'b'
 1
ooRexx> r = pT~find("xxxbacxxx"~text)
ooRexx> r~matched=; r~start=; r~end=; r~text=; r~length=
 1
 4
 5
T'b'
 1


-- "🤶" or "🎅"
ooRexx> pB = .Pattern~compile("🤶|🎅")
ooRexx> pT = .Pattern~compile("🤶|🎅"~text)
ooRexx> pB~matches("🤶")=                                   -- 1
 1
ooRexx> pT~matches("🤶"~text)=                              -- 1
 1
ooRexx> pB~matches("🎅")=                                   -- 1
 1
ooRexx> pT~matches("🎅"~text)=                              -- 1
 1
ooRexx> pB~matches("c")=                                    -- 0
 0
ooRexx> pT~matches("c"~text)=                               -- 0
 0
ooRexx> pB~startsWith("🤶🎅c")=                             -- 1
 1
ooRexx> pT~startsWith("🤶🎅c"~text)=                        -- 1
 1
ooRexx> pB~startsWith("🎅🤶c")=                             -- 1
 1
ooRexx> pT~startsWith("🎅🤶c"~text)=                        -- 1
 1
ooRexx> r = pB~find("xxx🤶🎅cxxx")
ooRexx> r~matched=; r~start=; r~end=; r~text=; r~length=    -- now ok (r~end was 8 and r~length was 4 before automatic conversion of string literals to text)
 1
 4
 5
T'🤶'
 1
ooRexx> r = pT~find("xxx🤶🎅cxxx"~text)
ooRexx> r~matched=; r~start=; r~end=; r~text=; r~length=
 1
 4
 5
T'🤶'
 1
ooRexx> r = pB~find("xxx🎅🤶cxxx")
ooRexx> r~matched=; r~start=; r~end=; r~text=; r~length=    -- now ok (r~end was 8 and r~length was 4 before automatic conversion of string literals to text)
 1
 4
 5
T'🎅'
 1
ooRexx> r = pT~find("xxx🎅🤶cxxx"~text)
ooRexx> r~matched=; r~start=; r~end=; r~text=; r~length=
 1
 4
 5
T'🎅'
 1


ooRexx> .Unicode~unckeckedConversionToString = unckeckedConversionToString -- restore


-- ===============================================================================
-- 2023 Sep 14

/*
Fix implementation of caselessPos, pos for ligatures.
The results were not good for some byte indexes when using aligned:.false
*/

--------------
-- test case 1
--------------
-- pos with ligature "ffl" in strict mode (default)

ooRexx> "bâfflé"~text~c2u=                            -- 'U+0062 U+00E2 U+FB04 U+00E9'
'U+0062 U+00E2 U+FB04 U+00E9'

/*
                                             --  01 | 02   | 03     | 04     (external grapheme indexes)
                                             --  1  | 2 3  | 4 5 6  | 7 8    (external byte indexes)
"bâfflé"~text~c2g=                            -- '62 | C3A2 | EFAC84 | C3A9'
                                             --  b  | â    | ffl     | é
*/

ooRexx> "bâfflé"~text~pos("é")=                       -- 4
 4
ooRexx> "bâfflé"~text~pos("e")=                       -- 0
 0
ooRexx> "bâfflé"~text~pos("e", stripMark:)=           -- 4
 4
ooRexx> "bâfflé"~text~pos("f")=                       -- 0 because in strict mode, "ffl" remains U+FB04
 0
ooRexx> "bâfflé"~text~pos("f", asList:, overlap:, aligned:.false)=  -- a List (0 items)
a List (0 items)

--------------
-- test case 2
--------------
-- caselessPos with ligature "ffl" in strict mode (default)
-- (apply casefold internally but returns external indexes)
-- The ligature is decomposed by casefold.

/*
                                             --  01 | 02   | 03       | 04     (external grapheme indexes)
                                             --  1  | 2 3  | 4 5 6    | 7 8    (external byte indexes)
"bâfflé"~text~c2g=                            -- '62 | C3A2 | EFAC84   | C3A9'
                                             --  b  | â    | ffl       | é

                                             --  01 | 02   | 03 04 05 | 06     (internal grapheme indexes)
                                             --  1  | 2 3  | 4  5  6  | 7 8    (internal byte indexes)
"bâfflé"~text~casefold~c2g=                   -- '62 | C3A2 | 66 66 6C | C3A9'
                                             --  b  | â    | f  f  l  | é
*/

ooRexx> "bâfflé"~text~caselessPos("É")=               -- 4
 4
ooRexx> "bâfflé"~text~caselessPos("E")=               -- 0
 0
ooRexx> "bâfflé"~text~caselessPos("E", stripMark:)=   -- 4
 4
ooRexx> "bâfflé"~text~caselessPos("F")=               -- 0 because "F" matches only a subset of "ffl"-->"ffl"
 0
ooRexx> "bâfflé"~text~caselessPos("FF")=              -- 0 because "FF" matches only a subset of "ffl"-->"ffl"
 0
ooRexx> "bâfflé"~text~caselessPos("FL")=              -- 0 because "FL" matches only a subset of "ffl"-->"ffl"
 0
ooRexx> "bâfflé"~text~caselessPos("FFL")=             -- 3 because "FFL" matches all of "ffl"-->"ffl"
 3
ooRexx> "bâfflé"~text~caselessPos("F", asList:, overlap:, aligned:.false)=
a List (2 items)
 0 : [+3.4,-3.5]
 1 : [-3.5,-3.6]
ooRexx> "bâfflfflé"~text~caselessPos("É")=              -- 5
 5
ooRexx> "bâfflfflé"~text~caselessPos("FFL", asList:, overlap:, aligned:.false)=
a List (2 items)
 0 : [+3.4,+4.7]
 1 : [+4.7,+5.10]
ooRexx> "bâfflfflé"~text~caselessPos("F", asList:, overlap:, aligned:.false)=
a List (4 items)
 0 : [+3.4,-3.5]
 1 : [-3.5,-3.6]
 2 : [+4.7,-4.8]
 3 : [-4.8,-4.9]
ooRexx> "bâfflfflé"~text~caselessPos("FLFF")=                   -- 0
 0
ooRexx> "bâfflfflé"~text~caselessPos("FLFF", aligned:.false)=   -- [-3.5,-4.9]
[-3.5,-4.9]
ooRexx> "bâfflfflé"~text~caselessPos("FFLFFL")=                 -- 3
 3

--------------
-- test case 3
--------------
-- pos with ligature "ffl" in non-strict mode
-- (in non-strict mode, the normalization is NFKD, but returns external indexes)
-- The ligature is decomposed by NFKD

/*
                                             --  01 | 02     | 03       | 04     (external grapheme indexes)
                                             --  1  | 2 3    | 4 5 6    | 7 8    (external byte indexes)
"bâfflé"~text~c2g=                            -- '62 | C3A2   | EFAC84   | C3A9'
                                             --  b  | â      | ffl       | é

                                             --  01 | 02     | 03 04 05 | 06     (internal grapheme indexes)
                                             --  1  | 2 3 4  | 5  6  7  | 8 9 0  (internal byte indexes)
"bâfflé"~text~NFKD~c2g=                       -- '62 | 61CC82 | 66 66 6C | 65CC81'
                                             --  b  | a ^    | f  f  l  | e ´
*/

ooRexx> "bâfflé"~text~pos("é", strict:.false)=                -- 4
 4
ooRexx> "bâfflé"~text~pos("e", strict:.false)=                -- 0
 0
ooRexx> "bâfflé"~text~pos("e", strict:.false, stripMark:)=    -- 4
 4
ooRexx> "bâfflé"~text~pos("f", strict:.false)=                -- 0 because "f" matches only a subset of "ffl"-->"ffl"
 0
ooRexx> "bâfflé"~text~pos("ff", strict:.false)=               -- 0 because "ff" matches only a subset of "ffl"-->"ffl"
 0
ooRexx> "bâfflé"~text~pos("ffl", strict:.false)=              -- 3 because "ffl" matches all of "ffl"-->"ffl"
 3
ooRexx> "bâfflé"~text~pos("f", strict:.false, asList:, overlap:, aligned:.false)=
a List (2 items)
 0 : [+3.5,-3.6]
 1 : [-3.6,-3.7]
ooRexx> "bâfflfflé"~text~pos("é", strict:.false)=               -- 5
 5
ooRexx> "bâfflfflé"~text~pos("ffl", strict:.false, asList:, overlap:, aligned:.false)=
a List (2 items)
 0 : [+3.5,+4.8]
 1 : [+4.8,+5.11]
ooRexx> "bâfflfflé"~text~pos("f", strict:.false, asList:, overlap:, aligned:.false)=
a List (4 items)
 0 : [+3.5,-3.6]
 1 : [-3.6,-3.7]
 2 : [+4.8,-4.9]
 3 : [-4.9,-4.10]
ooRexx> "bâfflfflé"~text~pos("flff", strict:.false)=                    -- 0
 0
ooRexx> "bâfflfflé"~text~pos("flff", strict:.false, aligned:.false)=    -- [-3.6,-4.10]
[-3.6,-4.10]
ooRexx> "bâfflfflé"~text~pos("fflffl", strict:.false)=                  -- 3
 3

--------------
-- test case 4
--------------
-- caselessPos with ligature "ffl" in non-strict mode
-- (apply casefold internally but returns external indexes)
-- (in non-strict mode, the normalization is NFKD, but returns external indexes)
-- The ligature is decomposed both by casefold and by NFKD.

/*
                                             --  01 | 02     | 03       | 04     (external grapheme indexes)
                                             --  1  | 2 3    | 4 5 6    | 7 8    (external byte indexes)
"bâfflé"~text~c2g=                            -- '62 | C3A2   | EFAC84   | C3A9'
                                             --  b  | â      | ffl       | é

                                             --  01 | 02     | 03 04 05 | 06     (internal grapheme indexes)
                                             --  1  | 2 3 4  | 5  6  7  | 8 9 0  (internal byte indexes)
"bâfflé"~text~NFKD~c2g=                       -- '62 | 61CC82 | 66 66 6C | 65CC81'
                                             --  b  | a ^    | f  f  l  | e ´
*/

ooRexx> "bâfflé"~text~caselessPos("É", strict:.false)=               -- 4
 4
ooRexx> "bâfflé"~text~caselessPos("E", strict:.false)=               -- 0
 0
ooRexx> "bâfflé"~text~caselessPos("E", strict:.false, stripMark:)=   -- 4
 4
ooRexx> "bâfflé"~text~caselessPos("F", strict:.false)=               -- 0 because "F" matches only a subset of "ffl"-->"ffl"
 0
ooRexx> "bâfflé"~text~caselessPos("FF", strict:.false)=              -- 0 because "FF" matches only a subset of "ffl"-->"ffl"
 0
ooRexx> "bâfflé"~text~caselessPos("FFL", strict:.false)=             -- 3 because "FFL" matches all of "ffl"-->"ffl"
 3
ooRexx> "bâfflé"~text~caselessPos("F", strict:.false, asList:, overlap:, aligned:.false)=
a List (2 items)
 0 : [+3.5,-3.6]
 1 : [-3.6,-3.7]
ooRexx> "bâfflfflé"~text~caselessPos("É", strict:.false)=              -- 5
 5
ooRexx> "bâfflfflé"~text~caselessPos("FFL", strict:.false, asList:, overlap:, aligned:.false)=
a List (2 items)
 0 : [+3.5,+4.8]
 1 : [+4.8,+5.11]
ooRexx> "bâfflfflé"~text~caselessPos("F", strict:.false, asList:, overlap:, aligned:.false)=
a List (4 items)
 0 : [+3.5,-3.6]
 1 : [-3.6,-3.7]
 2 : [+4.8,-4.9]
 3 : [-4.9,-4.10]
ooRexx> "bâfflfflé"~text~caselessPos("FLFF", strict:.false)=                    -- 0
 0
ooRexx> "bâfflfflé"~text~caselessPos("FLFF", strict:.false, aligned:.false)=    -- [-3.6,-4.10]
[-3.6,-4.10]
ooRexx> "bâfflfflé"~text~caselessPos("FFLFFL", strict:.false)=                  -- 3
 3


-- ===============================================================================
-- 2023 Sep 11

/*
casefold now supports the option stripMark.

Rework the implementation of caselessPos, pos.
- Thanks to Raku and Chrome, I realized that a match should be successful only
  if all the bytes of a grapheme are matched.
- New named argument 'asList', to return a list of positions
  (similar to Raku's method .indices).
- New named argument overlap: (same as Raku)
  If the optional named argument 'overlap' is specified, the search continues
  from the position directly following the start of the previous match (so
  matches may overlap), otherwise the search continues after the end of the
  previous match.
*/

/*
Remember:
aligned=.false is intended for analysis of matchings and [non-]regression tests.
Otherwise, I don't see any use.
When aligned:.false, a returned position has the form +/-posC.posB where posB is
the position of the matched byte in the transformed haystack, and posC is the
corresponding grapheme position in the untransformed haystack.
Don't use trunc(abs(position)) because you may need up to numeric digits 40:
    position max can be +/-(2**64-1)||"."||(2**64-1)
Use instead:
    if position~matchChar(1, "+-") then parse var position 2 posC "." posB
*/

/*
Additional test cases to cover corner cases for caselessPos, pos.
*/

--------------
-- test case 1
--------------
-- case no overlap versus overlap

/*
                                --  01   | 02   | 03   | 04   | 05   | 06
                                --  1 2  | 3 4  | 5 6  | 7 8  | 9 0  | 1 2
"àààààà"~text~c2g=              -- 'C3A0 | C3A0 | C3A0 | C3A0 | C3A0 | C3A0'
                                --  à    | à    | à    | à    | à    | à

                                --  01   | 02   | 03   | 04   | 05   | 06
                                --  1 2  | 3 4  | 5 6  | 7 8  | 9 0  | 1 2
"àààààà"~text~casefold~c2g=     -- 'C3A0 | C3A0 | C3A0 | C3A0 | C3A0 | C3A0'
                                --  à    | à    | à    | à    | à    | à
*/

ooRexx> "àààààà"~text~caselessPos("aa", stripMark:)=                                    -- 1
 1
ooRexx> "àààààà"~text~caselessPos("aa", stripMark:, asList:)~allItems=                  -- [ 1, 3, 5]
[ 1, 3, 5]
ooRexx> "àààààà"~text~caselessPos("aa", stripMark:, asList:, overlap:)~allItems=        -- [ 1, 2, 3, 4, 5]
[ 1, 2, 3, 4, 5]
ooRexx> "àààààà"~text~caselessPos("aa", stripMark:, asList:, aligned:.false)=
a List (3 items)
 0 : [+1.1,+3.3]
 1 : [+3.3,+5.5]
 2 : [+5.5,+7.7]
ooRexx> "àààààà"~text~caselessPos("aa", stripMark:, asList:, overlap:, aligned:.false)=
a List (5 items)
 0 : [+1.1,+3.3]
 1 : [+2.2,+4.4]
 2 : [+3.3,+5.5]
 3 : [+4.4,+6.6]
 4 : [+5.5,+7.7]

--------------
-- test case 2
--------------
-- case where the end of the matching is inside the untransformed grapheme

/*
                            --  01
                            --  1 2
"ß"~text~c2g=               -- 'C39F'
                            --  ß

                            --  01 02
                            --  1  2
"ß"~text~casefold~c2g=      -- '73 73'
                            --  s  s
*/

ooRexx> "ß"~text~caselessPos("s")=                                  -- 0, not 1 because 1 would match only the first byte of "ß"-->"ss"
 0
ooRexx> "ß"~text~caselessPos("s", asList:)=                         -- a List (0 items)
a List (0 items)
ooRexx> "ß"~text~caselessPos("s", asList:, overlap:)=               -- a List (0 items)
a List (0 items)
ooRexx> "ß"~text~caselessPos("s", asList:, aligned:.false)=
a List (2 items)
 0 : [+1.1,-1.2]
 1 : [-1.2,+2.3]
ooRexx> "ß"~text~caselessPos("s", asList:, overlap:, aligned:.false)=
a List (2 items)
 0 : [+1.1,-1.2]
 1 : [-1.2,+2.3]

/*
                            --  01 | 02
                            --  1  | 2 3
"sß"~text~c2g=              -- '73 | C39F'
                            --  s  | ß

                            --  01 | 02 03
                            --  1  | 2  3
"sß"~text~casefold~c2g=     -- '73 | 73 73'
                            --  s  | s  s
*/

ooRexx> "sß"~text~caselessPos("ss")=                                -- 2, not 1 because 1 would match only the first byte of "ß"-->"ss"
 2
ooRexx> "sß"~text~caselessPos("ss", asList:)~allItems=              -- [ 2]
[ 2]
ooRexx> "sß"~text~caselessPos("ss", asList:, overlap:)~allItems=    -- [ 2]
[ 2]
ooRexx> "sß"~text~caselessPos("ss", asList:, aligned:.false)=
a List (1 items)
 0 : [+1.1,-2.3]
ooRexx> "sß"~text~caselessPos("ss", asList:, overlap:, aligned:.false)=
a List (2 items)
 0 : [+1.1,-2.3]
 1 : [+2.2,+3.4]

/*
                            --  01 | 02    | 03
                            --  1  | 2 3   | 4
"sßs"~text~c2g=             -- '73 | C39F  | 73'
                            --  s  | ß     | s

                            --  01 | 02 03 | 04
                            --  1  | 2  3  | 4
"sßs"~text~casefold~c2g=    -- '73 | 73 73 | 73'
                            --  s  | s  s  | s
*/

ooRexx> "sßs"~text~caselessPos("s", 2)=                             -- 3, not 2 because 2 would match only the first byte of "ß"-->"ss"
 3
ooRexx> "sßs"~text~caselessPos("s", 2, asList:)~allItems=           -- [ 3]
[ 3]
ooRexx> "sßs"~text~caselessPos("s", 2, asList:, overlap:)~allItems= -- [ 3]
[ 3]
ooRexx> "sßs"~text~caselessPos("s", 2, asList:, aligned:.false)=
a List (3 items)
 0 : [+2.2,-2.3]
 1 : [-2.3,+3.4]
 2 : [+3.4,+4.5]
ooRexx> "sßs"~text~caselessPos("s", 2, asList:, overlap:, aligned:.false)=
a List (3 items)
 0 : [+2.2,-2.3]
 1 : [-2.3,+3.4]
 2 : [+3.4,+4.5]

ooRexx> "sßs"~text~caselessPos("ss")=                               -- 2, not 1 because 1 would match only the first byte of "ß"-->"ss"
 2
ooRexx> "sßs"~text~caselessPos("ss", asList:)~allItems=             -- [ 2]
[ 2]
ooRexx> "sßs"~text~caselessPos("ss", asList:, overlap:)~allItems=   -- [ 2]
[ 2]
ooRexx> "sßs"~text~caselessPos("ss", asList:, aligned:.false)=
a List (2 items)
 0 : [+1.1,-2.3]
 1 : [-2.3,+4.5]
ooRexx> "sßs"~text~caselessPos("ss", asList:, overlap:, aligned:.false)=
a List (3 items)
 0 : [+1.1,-2.3]
 1 : [+2.2,+3.4]
 2 : [-2.3,+4.5]

--------------
-- test case 3
--------------
-- caselessPos (apply casefold internally but returns external indexes)
-- search 1 character, no overlap when searching a single character.

/*
                                                        --  01 | 02 | 03 | 04 | 05 | 06 | 07 | 08 | 09 | 10 | 11    | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19    | 20 | 21 | 22 | 23
                                                        --  1  | 2  | 3  | 4  | 5  | 6  | 7  | 8  | 9  | 0  | 1 2   | 3  | 4  | 5  | 6  | 7  | 8  | 9  | 0 1   | 2  | 3  | 4  | 5
"Bundesstraße sss sßs ss"~text~c2g=                     -- '42 | 75 | 6E | 64 | 65 | 73 | 73 | 74 | 72 | 61 | C39F  | 65 | 20 | 73 | 73 | 73 | 20 | 73 | C39F  | 73 | 20 | 73 | 73'
                                                        --  B  | u  | n  | d  | e  | s  | s  | t  | r  | a  | ß     | e  | _  | s  | s  | s  | _  | s  | ß     | s  | _  | s  | s
                                                        --                           ^    ^                   ^                 ^    ^    ^         ^    ^       ^         ^    ^

                                                        --  01 | 02 | 03 | 04 | 05 | 06 | 07 | 08 | 09 | 10 | 11    | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19    | 20 | 21 | 22 | 23
                                                        --  1  | 2  | 3  | 4  | 5  | 6  | 7  | 8  | 9  | 0  | 1  2  | 3  | 4  | 5  | 6  | 7  | 8  | 9  | 0  1  | 2  | 3  | 4  | 5
"Bundesstraße sss sßs ss"~text~casefold~c2g=            -- '62 | 75 | 6E | 64 | 65 | 73 | 73 | 74 | 72 | 61 | 73 73 | 65 | 20 | 73 | 73 | 73 | 20 | 73 | 73 73 | 73 | 20 | 73 | 73'
                                                        --  B  | u  | n  | d  | e  | s  | s  | t  | r  | a  | ß     | e  | _  | s  | s  | s  | _  | s  | ß     | s  | _  | s  | s
*/

ooRexx> "Bundesstraße sss sßs ss"~text~caselessPos("s")=        -- 6
 6
ooRexx> "Bundesstraße sss sßs ss"~text~caselessPos("s", 7)=     -- 7
 7
ooRexx> "Bundesstraße sss sßs ss"~text~caselessPos("s", 8)=     -- 14
 14
ooRexx> "Bundesstraße sss sßs ss"~text~caselessPos("s", 15)=    -- 15
 15
ooRexx> "Bundesstraße sss sßs ss"~text~caselessPos("s", 16)=    -- 16
 16
ooRexx> "Bundesstraße sss sßs ss"~text~caselessPos("s", 17)=    -- 18
 18
ooRexx> "Bundesstraße sss sßs ss"~text~caselessPos("s", 19)=    -- 20
 20
ooRexx> "Bundesstraße sss sßs ss"~text~caselessPos("s", 21)=    -- 22
 22
ooRexx> "Bundesstraße sss sßs ss"~text~caselessPos("s", 23)=    -- 23
 23
ooRexx> "Bundesstraße sss sßs ss"~text~caselessPos("s", 24)=    -- 0
 0
ooRexx> "Bundesstraße sss sßs ss"~text~caselessPos("s", asList:)~allItems=              -- [ 6, 7, 14, 15, 16, 18, 20, 22, 23]
[ 6, 7, 14, 15, 16, 18, 20, 22, 23]
ooRexx> "Bundesstraße sss sßs ss"~text~caselessPos("s", asList:, overlap:)~allItems=    -- [ 6, 7, 14, 15, 16, 18, 20, 22, 23]
[ 6, 7, 14, 15, 16, 18, 20, 22, 23]
ooRexx> "Bundesstraße sss sßs ss"~text~caselessPos("s", asList:, aligned:.false)=
a List (13 items)
 0  : [+6.6,+7.7]
 1  : [+7.7,+8.8]
 2  : [+11.11,-11.12]
 3  : [-11.12,+12.13]
 4  : [+14.15,+15.16]
 5  : [+15.16,+16.17]
 6  : [+16.17,+17.18]
 7  : [+18.19,+19.20]
 8  : [+19.20,-19.21]
 9  : [-19.21,+20.22]
 10 : [+20.22,+21.23]
 11 : [+22.24,+23.25]
 12 : [+23.25,+24.26]
ooRexx> "Bundesstraße sss sßs ss"~text~caselessPos("s", asList:, overlap:, aligned:.false)=
a List (13 items)
 0  : [+6.6,+7.7]
 1  : [+7.7,+8.8]
 2  : [+11.11,-11.12]
 3  : [-11.12,+12.13]
 4  : [+14.15,+15.16]
 5  : [+15.16,+16.17]
 6  : [+16.17,+17.18]
 7  : [+18.19,+19.20]
 8  : [+19.20,-19.21]
 9  : [-19.21,+20.22]
 10 : [+20.22,+21.23]
 11 : [+22.24,+23.25]
 12 : [+23.25,+24.26]

--------------
-- test case 4
--------------
-- caselessPos (apply casefold internally but returns external indexes)
-- search 3 characters

/*
                                                        --  01 02 03 04 05 06 07 08 09 10 11   12 13 14 15 16 17 18 19   20 21 22 23
"Bundesstraße sss sßs ss"~text~c2g=                     -- '42 75 6E 64 65 73 73 74 72 61 C39F 65 20 73 73 73 20 73 C39F 73 20 73 73'
                                                        --  B  u  n  d  e  s  s  t  r  a  ß    e  _  s  s  s  _  s  ß    s  _  s  s
                                                        --                                           |           |  |
*/

ooRexx>                                                                                 --                  Raku                Chrome
ooRexx> "Bundesstraße sss sßs ss"~text~caselessPos("sSs")=                              -- 14               13                  y
 14
ooRexx> "Bundesstraße sss sßs ss"~text~caselessPos("sSs", 15)=                          -- 18               17                  y
 18
ooRexx> "Bundesstraße sss sßs ss"~text~caselessPos("sSs", 19)=                          -- 19   (overlap)   18 (if overlap)     y
 19
ooRexx> "Bundesstraße sss sßs ss"~text~caselessPos("sSs", 20)=                          -- 0
 0
ooRexx> "Bundesstraße sss sßs ss"~text~caselessPos("sSs", asList:)~allItems=            -- [ 14, 18]
[ 14, 18]
ooRexx> "Bundesstraße sss sßs ss"~text~caselessPos("sSs", asList:, overlap:)~allItems=  -- [ 14, 18, 19]
[ 14, 18, 19]
ooRexx> "Bundesstraße sss sßs ss"~text~caselessPos("sSs", asList:, aligned:.false)=
a List (2 items)
 0 : [+14.15,+17.18]
 1 : [+18.19,+20.22]
ooRexx> "Bundesstraße sss sßs ss"~text~caselessPos("sSs", asList:, overlap:, aligned:.false)=
a List (3 items)
 0 : [+14.15,+17.18]
 1 : [+18.19,+20.22]
 2 : [+19.20,+21.23]

--------------
-- test case 5
--------------
-- caselessPos (apply casefold internally but returns external indexes)
-- search 4 characters

/*
                                                        --  01 02 03 04 05 06 07 08 09 10 11   12 13 14 15 16 17 18 19   20 21 22 23
"Bundesstraße sss sßs ss"~text~c2g=                     -- '42 75 6E 64 65 73 73 74 72 61 C39F 65 20 73 73 73 20 73 C39F 73 20 73 73'
                                                        --  B  u  n  d  e  s  s  t  r  a  ß    e  _  s  s  s  _  s  ß    s  _  s  s
                                                        --                                                       |
*/

ooRexx> "Bundesstraße sss sßs ss"~text~caselessPos("sSsS")=                             -- 18 (good, same result as Raku and Chrome)
 18
ooRexx> "Bundesstraße sss sßs ss"~text~caselessPos("sSsS", asList:)~allItems=           -- [ 18]
[ 18]
ooRexx> "Bundesstraße sss sßs ss"~text~caselessPos("sSsS", asList:, overlap:)~allItems= -- [ 18]
[ 18]
ooRexx> "Bundesstraße sss sßs ss"~text~caselessPos("sSsS", asList:, aligned:.false)=
a List (1 items)
 0 : [+18.19,+21.23]
ooRexx> "Bundesstraße sss sßs ss"~text~caselessPos("sSsS", asList:, overlap:, aligned:.false)=
a List (1 items)
 0 : [+18.19,+21.23]

--------------
-- test case 6
--------------
-- caselessPos (apply casefold internally but returns external indexes)
-- search 2 characters in a long sequence

/*
                                                        --  01 02 03 04 05   06 07 08   09   10 11 12 13
"straßssßßssse"~text~c2g=                               -- '73 74 72 61 C39F 73 73 C39F C39F 73 73 73 65'
                                                        --  s  t  r  a  ß    s  s  ß    ß    s  s  s  e
                                                        --              |    |  |  |    |    |  |
*/

ooRexx>                                                         --                  Raku                Chrome
ooRexx> "straßssßßssse"~text~caselessPos("Ss")=                 -- 5                4                   y
 5
ooRexx> "straßssßßssse"~text~caselessPos("Ss", 6)=              -- 6                5 (if overlap)      y       why Raku needs overlap?
 6
ooRexx> "straßssßßssse"~text~caselessPos("Ss", 7)=              -- 8                7                   y
 8
ooRexx> "straßssßßssse"~text~caselessPos("Ss", 9)=              -- 9                8 (if overlap)      y       why Raku needs overlap?
 9
ooRexx> "straßssßßssse"~text~caselessPos("Ss", 10)=             -- 10               9                   y
 10
ooRexx> "straßssßßssse"~text~caselessPos("Ss", 11)=             -- 11   (overlap)   10 (if overlap)     y
 11
ooRexx> "straßssßßssse"~text~caselessPos("Ss", 12)=             -- 0
 0
ooRexx> "straßssßßssse"~text~caselessPos("Ss", asList:)~allItems=           -- [ 5, 6, 8, 9, 10]
[ 5, 6, 8, 9, 10]
ooRexx> "straßssßßssse"~text~caselessPos("Ss", asList:, overlap:)~allItems= -- [ 5, 6, 8, 9, 10, 11]
[ 5, 6, 8, 9, 10, 11]
ooRexx> "straßssßßssse"~text~caselessPos("Ss", asList:, aligned:.false)=
a List (5 items)
 0 : [+5.5,+6.7]
 1 : [+6.7,+8.9]
 2 : [+8.9,+9.11]
 3 : [+9.11,+10.13]
 4 : [+10.13,+12.15]
ooRexx> "straßssßßssse"~text~caselessPos("Ss", asList:, overlap:, aligned:.false)=
a List (10 items)
 0 : [+5.5,+6.7]
 1 : [-5.6,+7.8]
 2 : [+6.7,+8.9]
 3 : [+7.8,-8.10]
 4 : [+8.9,+9.11]
 5 : [-8.10,-9.12]
 6 : [+9.11,+10.13]
 7 : [-9.12,+11.14]
 8 : [+10.13,+12.15]
 9 : [+11.14,+13.16]

--------------
-- test case 7
--------------
-- pos, caselessPos

/*
                                                    --  01 02 03 04 05   06 07 08 09 10   11 12                                                 13
                                                    --  0                         1                      2                   3                    4
                                                    --  1  2  3  4  5 6  7  8  9  0  1 2  3  4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8  9 0 1 2
"straße noël👩‍👨‍👩‍👧🎅"~text~c2g=                         -- '73 74 72 61 C39F 65 20 6E 6F C3AB 6C F09F91A9E2808DF09F91A8E2808DF09F91A9E2808DF09F91A7 F09F8E85'
                                                    --                                                                                 |
*/

ooRexx> "👧🎅"~text~c2g=                                   -- 'F09F91A7 F09F8E85'
'F09F91A7 F09F8E85'
ooRexx> "👧🎅"~text~casefold~c2g=                          -- 'F09F91A7 F09F8E85'
'F09F91A7 F09F8E85'

ooRexx> "straße noël👩‍👨‍👩‍👧🎅"~text~pos("👧🎅", 1, aligned:.false)=         -- [-12.35,+14.43]
[-12.35,+14.43]
ooRexx> "straße noël👩‍👨‍👩‍👧🎅"~text~pos("👧🎅", 12, aligned:.false)=        -- [-12.35,+14.43]
[-12.35,+14.43]
ooRexx> "straße noël👩‍👨‍👩‍👧🎅"~text~pos("👧🎅", 13, aligned:.false)=        -- 0
 0
ooRexx> "straße noël👩‍👨‍👩‍👧🎅"~text~pos("👧🎅", 13, asList:)=               -- a List (0 items)
a List (0 items)
ooRexx> "straße noël👩‍👨‍👩‍👧🎅"~text~pos("👧🎅", 13, asList:, overlap:)=     -- a List (0 items)
a List (0 items)
ooRexx> "straße noël👩‍👨‍👩‍👧🎅"~text~pos("👧🎅", asList:, aligned:.false)=
a List (1 items)
 0 : [-12.35,+14.43]
ooRexx> "straße noël👩‍👨‍👩‍👧🎅"~text~pos("👧🎅", asList:, overlap:, aligned:.false)=
a List (1 items)
 0 : [-12.35,+14.43]

ooRexx> "straße noël👩‍👨‍👩‍👧🎅"~text~caselessPos("👧🎅", 1, aligned:.false)=     -- [-12.35,+14.43]
[-12.35,+14.43]
ooRexx> "straße noël👩‍👨‍👩‍👧🎅"~text~caselessPos("👧🎅", 12, aligned:.false)=    -- [-12.35,+14.43]
[-12.35,+14.43]
ooRexx> "straße noël👩‍👨‍👩‍👧🎅"~text~caselessPos("👧🎅", 13, aligned:.false)=    -- 0
 0
ooRexx> "straße noël👩‍👨‍👩‍👧🎅"~text~caselessPos("👧🎅", asList:, aligned:.false)=
a List (1 items)
 0 : [-12.35,+14.43]
ooRexx> "straße noël👩‍👨‍👩‍👧🎅"~text~caselessPos("👧🎅", asList:, overlap:, aligned:.false)=
a List (1 items)
 0 : [-12.35,+14.43]

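-- yes, 12.35, not 12.34 even if "ë" (2 bytes) becomes internally "e" (1 byte)
-- because the indexes are external (relative to the target string, not related to the internal transformed string)
-- NOTE: the outputs recorded below show 12.34 (not 12.35), except when the search starts at 12: to be investigated.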
ooRexx> "straße noël👩‍👨‍👩‍👧🎅"~text~caselessPos("👧🎅", 1, aligned:.false, stripMark:)=     -- [-12.35,+14.43]
[-12.34,+14.42]
ooRexx> "straße noël👩‍👨‍👩‍👧🎅"~text~caselessPos("👧🎅", 12, aligned:.false, stripMark:)=    -- [-12.35,+14.43]
[-12.35,+14.43]
ooRexx> "straße noël👩‍👨‍👩‍👧🎅"~text~caselessPos("👧🎅", 13, aligned:.false, stripMark:)=    -- 0
 0
ooRexx> "straße noël👩‍👨‍👩‍👧🎅"~text~caselessPos("👧🎅", asList:, aligned:.false, stripMark:)=
a List (1 items)
 0 : [-12.34,+14.42]
ooRexx> "straße noël👩‍👨‍👩‍👧🎅"~text~caselessPos("👧🎅", asList:, overlap:, aligned:.false, stripMark:)=
a List (1 items)
 0 : [-12.34,+14.42]

--------------
-- test case 8
--------------
-- casefold

/*
                                                    --  01 02 03 04 05 06 07 08 09 10 11   12 13                                                 14
                                                    --  0                          1                      2                   3                    4
                                                    --  1  2  3  4  5  6  7  8  9  0  1 2  3  4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8  9 0 1 2
"straße noël👩‍👨‍👩‍👧🎅"~text~casefold~c2g=                -- '73 74 72 61 73 73 65 20 6E 6F C3AB 6C F09F91A9E2808DF09F91A8E2808DF09F91A9E2808DF09F91A7 F09F8E85'
                                                    --                                                                                  |
*/

-- here we get 13 because "ß" is replaced by "ss" before calling pos
-- the byte position .35 is unchanged because "ß" is 2 bytes, as is "ss".
ooRexx> "straße noël👩‍👨‍👩‍👧🎅"~text~casefold~pos("👧🎅", 1, aligned:.false)=                -- [-13.35,+15.43]
[-13.35,+15.43]
ooRexx> "straße noël👩‍👨‍👩‍👧🎅"~text~casefold~pos("👧🎅", asList:, aligned:.false)=
a List (1 items)
 0 : [-13.35,+15.43]
ooRexx> "straße noël👩‍👨‍👩‍👧🎅"~text~casefold~pos("👧🎅", asList:, overlap:, aligned:.false)=
a List (1 items)
 0 : [-13.35,+15.43]

-- stripMark has no impact on the byte position because it's an internal transformation
-- NOTE: the outputs recorded below show 13.34 (not the expected 13.35): to be investigated.
ooRexx> "straße noël👩‍👨‍👩‍👧🎅"~text~casefold~pos("👧🎅", 1, aligned:.false, stripMark:)=    -- [-13.35,+15.43]
[-13.34,+15.42]
ooRexx> "straße noël👩‍👨‍👩‍👧🎅"~text~casefold~pos("👧🎅", asList:, aligned:.false, stripMark:)=
a List (1 items)
 0 : [-13.34,+15.42]
ooRexx> "straße noël👩‍👨‍👩‍👧🎅"~text~casefold~pos("👧🎅", asList:, overlap:, aligned:.false, stripMark:)=
a List (1 items)
 0 : [-13.34,+15.42]

-- here we get 13.34 because stripMark has an impact on the byte position:
-- "ë" (2 bytes) becomes "e" (1 byte) before calling pos.
ooRexx> "straße noël👩‍👨‍👩‍👧🎅"~text~casefold(stripMark:)~pos("👧🎅", 1, aligned:.false)=    -- [-13.34,+15.42]
[-13.34,+15.42]
ooRexx> "straße noël👩‍👨‍👩‍👧🎅"~text~casefold(stripMark:)~pos("👧🎅", asList:, aligned:.false)=
a List (1 items)
 0 : [-13.34,+15.42]
ooRexx> "straße noël👩‍👨‍👩‍👧🎅"~text~casefold(stripMark:)~pos("👧🎅", asList:, overlap:, aligned:.false)=
a List (1 items)
 0 : [-13.34,+15.42]

--------------
-- test case 9
--------------
-- pos with a needle inside a grapheme of the haystack
-- Raku considers that there is no match.

ooRexx> "👨‍👩"~text~c2g=                                  -- 'F09F91A8E2808DF09F91A9'
'F09F91A8E2808DF09F91A9'

ooRexx> "straße noël👩‍👨‍👩‍👧🎅"~text~pos("👨‍👩")=                   -- 0
 0
ooRexx> "straße noël👩‍👨‍👩‍👧🎅"~text~pos("👨‍👩", aligned:.false)=   -- [-12.21,-12.32]
[-12.21,-12.32]
ooRexx> "straße noël👩‍👨‍👩‍👧🎅"~text~pos("👨‍👩", asList:, aligned:.false)=
a List (1 items)
 0 : [-12.21,-12.32]
ooRexx> "straße noël👩‍👨‍👩‍👧🎅"~text~pos("👨‍👩", asList:, overlap:, aligned:.false)=
a List (1 items)
 0 : [-12.21,-12.32]

---------------
-- test case 10
---------------
-- pos with ignorable (no internal transformation)
-- TAG SPACE is ignorable

/*
                                                                                --  01 02   03         04 05 06 07 08 09 10 11   12 13 14 15 16 17         18   19 20
"TÊt\u{TAG SPACE}e sss sßs ss t\u{TAG SPACE}êTE"~text~unescape~c2g=             -- '54 C38A 74F3A080A0 65 20 73 73 73 20 73 C39F 73 20 73 73 20 74F3A080A0 C3AA 54 45'
                                                                                --  T  Ê    t TAG SPAC e  _  s  s  s  _  s  ß    s  _  s  s  _  t TAG SPAC ê    T  E
                                                                                --                           |  |                      |
*/

ooRexx> "TÊt\u{TAG SPACE}e sss sßs ss t\u{TAG SPACE}êTE"~text~unescape~pos("ss", asList:)~allItems=             -- [ 6, 14]
[ 6, 14]
ooRexx> "TÊt\u{TAG SPACE}e sss sßs ss t\u{TAG SPACE}êTE"~text~unescape~pos("ss", asList:, overlap:)~allItems=   -- [ 6, 7, 14]
[ 6, 7, 14]
ooRexx> "TÊt\u{TAG SPACE}e sss sßs ss t\u{TAG SPACE}êTE"~text~unescape~pos("ss", asList:, aligned:.false)=
a List (2 items)
 0 : [+6.11,+8.13]
 1 : [+14.20,+16.22]
ooRexx> "TÊt\u{TAG SPACE}e sss sßs ss t\u{TAG SPACE}êTE"~text~unescape~pos("ss", asList:, overlap:, aligned:.false)=
a List (3 items)
 0 : [+6.11,+8.13]
 1 : [+7.12,+9.14]
 2 : [+14.20,+16.22]

--------------
-- test case 11
--------------
-- caselessPos with ignorable (apply casefold internally but returns external indexes)
-- TAG SPACE is ignorable

/*
                                                                                --  01 02   03         04 05 06 07 08 09 10 11   12 13 14 15 16 17         18   19 20
"TÊt\u{TAG SPACE}e sss sßs ss t\u{TAG SPACE}êTE"~text~unescape~c2g=             -- '54 C38A 74F3A080A0 65 20 73 73 73 20 73 C39F 73 20 73 73 20 74F3A080A0 C3AA 54 45'
                                                                                --  T  Ê    t TAG SPAC e  _  s  s  s  _  s  ß    s  _  s  s  _  t TAG SPAC ê    T  E
                                                                                --                           |  |           |          |
*/

ooRexx> "TÊt\u{TAG SPACE}e sss sßs ss t\u{TAG SPACE}êTE"~text~unescape~caselessPos("ss", asList:)~allItems=             -- [ 6, 11, 14]
[ 6, 11, 14]
ooRexx> "TÊt\u{TAG SPACE}e sss sßs ss t\u{TAG SPACE}êTE"~text~unescape~caselessPos("ss", asList:, overlap:)~allItems=   -- [ 6, 7, 11, 14]
[ 6, 7, 11, 14]
ooRexx> "TÊt\u{TAG SPACE}e sss sßs ss t\u{TAG SPACE}êTE"~text~unescape~caselessPos("ss", asList:, aligned:.false)=
a List (4 items)
 0 : [+6.11,+8.13]
 1 : [+10.15,-11.17]
 2 : [-11.17,+13.19]
 3 : [+14.20,+16.22]
ooRexx> "TÊt\u{TAG SPACE}e sss sßs ss t\u{TAG SPACE}êTE"~text~unescape~caselessPos("ss", asList:, overlap:, aligned:.false)=
a List (6 items)
 0 : [+6.11,+8.13]
 1 : [+7.12,+9.14]
 2 : [+10.15,-11.17]
 3 : [+11.16,+12.18]
 4 : [-11.17,+13.19]
 5 : [+14.20,+16.22]


ooRexx> "TÊt\u{TAG SPACE}e sss sßs ss t\u{TAG SPACE}êTE"~text~unescape~caselessPos("te", asList:)~allItems=             -- [ 19]
[ 19]
ooRexx> "TÊt\u{TAG SPACE}e sss sßs ss t\u{TAG SPACE}êTE"~text~unescape~caselessPos("te", asList:, overlap:)~allItems=   -- [ 19]
[ 19]
ooRexx> "TÊt\u{TAG SPACE}e sss sßs ss t\u{TAG SPACE}êTE"~text~unescape~caselessPos("te", asList:, aligned:.false)=
a List (1 items)
 0 : [+19.30,+21.32]
ooRexx> "TÊt\u{TAG SPACE}e sss sßs ss t\u{TAG SPACE}êTE"~text~unescape~caselessPos("te", asList:, overlap:, aligned:.false)=
a List (1 items)
 0 : [+19.30,+21.32]

---------------
-- test case 12
---------------
-- pos with ignorable (apply stripMark internally, no casefold, but returns external indexes)
-- TAG SPACE is ignorable

ooRexx> "TÊt\u{TAG SPACE}e sss sßs ss t\u{TAG SPACE}êTE"~text~unescape~pos("te", stripMark:, asList:)=              -- a List (0 items)
a List (0 items)
ooRexx> "TÊt\u{TAG SPACE}e sss sßs ss t\u{TAG SPACE}êTE"~text~unescape~pos("te", stripMark:, asList:, overlap:)=    -- a List (0 items)
a List (0 items)

---------------
-- test case 13
---------------
-- caselessPos with ignorable (apply casefold + stripMark internally but returns external indexes)
-- TAG SPACE is ignorable

ooRexx> "TÊt\u{TAG SPACE}e sss sßs ss t\u{TAG SPACE}êTE"~text~unescape~caselessPos("te", stripMark:, asList:)~allItems=             -- [ 1, 19]
[ 1, 19]
ooRexx> "TÊt\u{TAG SPACE}e sss sßs ss t\u{TAG SPACE}êTE"~text~unescape~caselessPos("te", stripMark:, asList:, overlap:)~allItems=   -- [ 1, 19]
[ 1, 19]
ooRexx> "TÊt\u{TAG SPACE}e sss sßs ss t\u{TAG SPACE}êTE"~text~unescape~caselessPos("te", stripMark:, asList:, aligned:.false)=
a List (2 items)
 0 : [+1.1,+3.3]
 1 : [+19.28,+21.30]
ooRexx> "TÊt\u{TAG SPACE}e sss sßs ss t\u{TAG SPACE}êTE"~text~unescape~caselessPos("te", stripMark:, asList:, overlap:, aligned:.false)=
a List (2 items)
 0 : [+1.1,+3.3]
 1 : [+19.28,+21.30]

---------------
-- test case 14
---------------
-- caselessPos with ignorable (apply casefold + stripMark + stripIgnorable internally but returns external indexes)
-- TAG SPACE is ignorable

/*
                                                                                --  01 02   03         04 05 06 07 08 09 10 11   12 13 14 15 16 17         18   19 20
"TÊt\u{TAG SPACE}e sss sßs ss t\u{TAG SPACE}êTE"~text~unescape~c2g=             -- '54 C38A 74F3A080A0 65 20 73 73 73 20 73 C39F 73 20 73 73 20 74F3A080A0 C3AA 54 45'
                                                                                --  T  Ê    t TAG SPAC e  _  s  s  s  _  s  ß    s  _  s  s  _  t TAG SPAC ê    T  E
                                                                                --  |       |                                                   |               |
*/

ooRexx> "TÊt\u{TAG SPACE}e sss sßs ss t\u{TAG SPACE}êTE"~text~unescape~caselessPos("te", stripMark:, stripIgnorable:, asList:)~allItems=            -- [ 1, 3, 17, 19]
[ 1, 3, 17, 19]
ooRexx> "TÊt\u{TAG SPACE}e sss sßs ss t\u{TAG SPACE}êTE"~text~unescape~caselessPos("te", stripMark:, stripIgnorable:, asList:, overlap:)~allItems=  -- [ 1, 3, 17, 19]
[ 1, 3, 17, 19]
ooRexx> "TÊt\u{TAG SPACE}e sss sßs ss t\u{TAG SPACE}êTE"~text~unescape~caselessPos("te", stripMark:, stripIgnorable:, asList:, aligned:.false)=
a List (4 items)
 0 : [+1.1,+3.3]
 1 : [+3.3,+5.5]
 2 : [+17.18,+19.20]
 3 : [+19.20,+21.22]
ooRexx> "TÊt\u{TAG SPACE}e sss sßs ss t\u{TAG SPACE}êTE"~text~unescape~caselessPos("te", stripMark:, stripIgnorable:, asList:, overlap:, aligned:.false)=
a List (4 items)
 0 : [+1.1,+3.3]
 1 : [+3.3,+5.5]
 2 : [+17.18,+19.20]
 3 : [+19.20,+21.22]


-- ===============================================================================
-- 2023 Sep 06

/*
Fix the implementation of caselessPos, pos.
Was not returning the right position when the length of the string changed
internally. Now the results are identical to Raku's (with a few exceptions).
*/

ooRexx> "Bundesstraße im Freiland"~text~pos("Freiland")=                -- 17
 17
ooRexx> "Bundesstraße im Freiland"~text~caselessPos("freiland")=        -- 17
 17

--------------
-- test case 1
--------------
-- pos (no internal transformation)

/*
                                                        --  01 02 03 04 05 06 07 08 09 10 11   12 13 14 15 16 17 18 19   20 21 22 23
"Bundesstraße sss sßs ss"~text~c2g=                     -- '42 75 6E 64 65 73 73 74 72 61 C39F 65 20 73 73 73 20 73 C39F 73 20 73 73'
                                                        --  B  u  n  d  e  s  s  t  r  a  ß    e  _  s  s  s  _  s  ß    s  _  s  s
                                                        --                 |                         |                         |        no overlap
                                                        --                 |                         |  |                      |        with overlap
*/

ooRexx> "Bundesstraße sss sßs ss"~text~pos("ss")=               -- 6
 6
ooRexx> "Bundesstraße sss sßs ss"~text~pos("ss", 7)=            -- 14
 14
ooRexx> "Bundesstraße sss sßs ss"~text~pos("ss", 15)=           -- 15 (overlap)
 15
ooRexx> "Bundesstraße sss sßs ss"~text~pos("ss", 16)=           -- 22
 22
ooRexx> "Bundesstraße sss sßs ss"~text~pos("ss", 23)=           -- 0
 0

--------------
-- test case 2
--------------
-- caselessPos (apply casefold internally but returns external indexes)

/*
                                                        --  01 02 03 04 05 06 07 08 09 10 11   12 13 14 15 16 17 18 19   20 21 22 23
"Bundesstraße sss sßs ss"~text~c2g=                     -- '42 75 6E 64 65 73 73 74 72 61 C39F 65 20 73 73 73 20 73 C39F 73 20 73 73'
                                                        --  B  u  n  d  e  s  s  t  r  a  ß    e  _  s  s  s  _  s  ß    s  _  s  s
                                                        --                 |              |          |           |             |        no overlap
                                                        --                 |              |          |  |        |  |          |        with overlap
*/

ooRexx> "Bundesstraße sss sßs ss"~text~caselessPos("ss")=       -- 6
 6
ooRexx> "Bundesstraße sss sßs ss"~text~caselessPos("ss", 7)=    -- 11
 11
ooRexx> "Bundesstraße sss sßs ss"~text~caselessPos("ss", 12)=   -- 14
 14
ooRexx> "Bundesstraße sss sßs ss"~text~caselessPos("ss", 15)=   -- 15 (overlap)
 15
ooRexx> "Bundesstraße sss sßs ss"~text~caselessPos("ss", 16)=   -- 19           (Raku doesn't return this index, am I wrong? sounds good to me...)
 19
ooRexx> "Bundesstraße sss sßs ss"~text~caselessPos("ss", 20)=   -- 22
 22
ooRexx> "Bundesstraße sss sßs ss"~text~caselessPos("ss", 23)=   -- 0
 0

--------------
-- test case 3
--------------
-- casefold~pos (the returned indexes are different from caselessPos because the string is transformed before calling ~pos)
-- Use "ü" instead of "u" to have a non-ASCII string.
-- Without "ü", the 'pos' method would forward to String.

/*
                                                        --  01 02   03 04 05 06 07 08 09 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25
"Bündesstraße sss sßs ss"~text~casefold~c2g=            -- '62 C3BC 6E 64 65 73 73 74 72 61 73 73 65 20 73 73 73 20 73 73 73 73 20 73 73'
                                                        --  b  ü    n  d  e  s  s  t  r  a  s  s  e  _  s  s  s  _  s  s  s  s  _  s  s
                                                        --                   |              |           |           |     |        |    no overlap
                                                        --                   |              |           |  |        |  |  |        |    with overlap
*/

ooRexx> "Bündesstraße sss sßs ss"~text~casefold~pos("ss")=      -- 6
 6
ooRexx> "Bündesstraße sss sßs ss"~text~casefold~pos("ss", 7)=   -- 11
 11
ooRexx> "Bündesstraße sss sßs ss"~text~casefold~pos("ss", 12)=  -- 15
 15
ooRexx> "Bündesstraße sss sßs ss"~text~casefold~pos("ss", 16)=  -- 16 (overlap)
 16
ooRexx> "Bündesstraße sss sßs ss"~text~casefold~pos("ss", 17)=  -- 19
 19
ooRexx> "Bündesstraße sss sßs ss"~text~casefold~pos("ss", 20)=  -- 20 (overlap)
 20
ooRexx> "Bündesstraße sss sßs ss"~text~casefold~pos("ss", 21)=  -- 21 (overlap)
 21
ooRexx> "Bündesstraße sss sßs ss"~text~casefold~pos("ss", 22)=  -- 24
 24
ooRexx> "Bündesstraße sss sßs ss"~text~casefold~pos("ss", 25)=  -- 0
 0


--------------
-- test case 4
--------------
-- TAG SPACE is ignorable
ooRexx> "TÊt\u{TAG SPACE}e"~text~unescape~length=                                       -- 4
 4
ooRexx> "TÊt\u{TAG SPACE}e"~text~unescape~c2g=                                          -- '54 C38A 74F3A080A0 65'
'54 C38A 74F3A080A0 65'
ooRexx> "TÊt\u{TAG SPACE}e"~text~unescape~transform(stripIgnorable:)~c2g=               -- '54 C38A 74 65'
'54 C38A 74 65'

--------------
-- test case 5
--------------
-- pos with ignorable (no internal transformation)

/*
                                                                                --  01 02   03         04 05 06 07 08 09 10 11   12 13 14 15 16 17         18   19 20
"TÊt\u{TAG SPACE}e sss sßs ss t\u{TAG SPACE}êTE"~text~unescape~c2g=             -- '54 C38A 74F3A080A0 65 20 73 73 73 20 73 C39F 73 20 73 73 20 74F3A080A0 C3AA 54 45'
                                                                                --  T  Ê    t TAG SPAC e  _  s  s  s  _  s  ß    s  _  s  s  _  t TAG SPAC ê    T  E
                                                                                --                           |  |                      |
*/

ooRexx> "TÊt\u{TAG SPACE}e sss sßs ss t\u{TAG SPACE}êTE"~text~unescape~pos("ss")=       -- 6
 6
ooRexx> "TÊt\u{TAG SPACE}e sss sßs ss t\u{TAG SPACE}êTE"~text~unescape~pos("ss", 7)=    -- 7
 7
ooRexx> "TÊt\u{TAG SPACE}e sss sßs ss t\u{TAG SPACE}êTE"~text~unescape~pos("ss", 8)=    -- 14
 14
ooRexx> "TÊt\u{TAG SPACE}e sss sßs ss t\u{TAG SPACE}êTE"~text~unescape~pos("ss", 15)=   -- 0
 0

ooRexx> "TÊt\u{TAG SPACE}e sss sßs ss t\u{TAG SPACE}êTE"~text~unescape~pos("te")=       -- 0
 0

--------------
-- test case 6
--------------
-- caselessPos with ignorable (apply casefold internally but returns external indexes)

/*
                                                                                --  01 02   03         04 05 06 07 08 09 10 11   12 13 14 15 16 17         18   19 20
"TÊt\u{TAG SPACE}e sss sßs ss t\u{TAG SPACE}êTE"~text~unescape~c2g=             -- '54 C38A 74F3A080A0 65 20 73 73 73 20 73 C39F 73 20 73 73 20 74F3A080A0 C3AA 54 45'
                                                                                --  T  Ê    t TAG SPAC e  _  s  s  s  _  s  ß    s  _  s  s  _  t TAG SPAC ê    T  E
                                                                                --                           |  |        |  |          |
*/

ooRexx> "TÊt\u{TAG SPACE}e sss sßs ss t\u{TAG SPACE}êTE"~text~unescape~caselessPos("ss")=       -- 6
 6
ooRexx> "TÊt\u{TAG SPACE}e sss sßs ss t\u{TAG SPACE}êTE"~text~unescape~caselessPos("ss", 7)=    -- 7 (overlap)
 7
ooRexx> "TÊt\u{TAG SPACE}e sss sßs ss t\u{TAG SPACE}êTE"~text~unescape~caselessPos("ss", 8)=    -- 11
 11
ooRexx> "TÊt\u{TAG SPACE}e sss sßs ss t\u{TAG SPACE}êTE"~text~unescape~caselessPos("ss", 12)=   -- 14
 14
ooRexx> "TÊt\u{TAG SPACE}e sss sßs ss t\u{TAG SPACE}êTE"~text~unescape~caselessPos("ss", 15)=   -- 0
 0

ooRexx> "TÊt\u{TAG SPACE}e sss sßs ss t\u{TAG SPACE}êTE"~text~unescape~caselessPos("te")=       -- 19
 19
ooRexx> "TÊt\u{TAG SPACE}e sss sßs ss t\u{TAG SPACE}êTE"~text~unescape~caselessPos("te", 20)=   -- 0
 0

--------------
-- test case 7
--------------
-- pos with ignorable (apply stripMark internally, no casefold, but returns external indexes)
ooRexx> "TÊt\u{TAG SPACE}e sss sßs ss t\u{TAG SPACE}êTE"~text~unescape~pos("te", stripMark:)=   -- 0
 0

--------------
-- test case 8
--------------
-- caselessPos with ignorable (apply casefold + stripMark internally but returns external indexes)
ooRexx> "TÊt\u{TAG SPACE}e sss sßs ss t\u{TAG SPACE}êTE"~text~unescape~caselessPos("te", stripMark:)=       -- 1
 1
ooRexx> "TÊt\u{TAG SPACE}e sss sßs ss t\u{TAG SPACE}êTE"~text~unescape~caselessPos("te", 2, stripMark:)=    -- 19
 19
ooRexx> "TÊt\u{TAG SPACE}e sss sßs ss t\u{TAG SPACE}êTE"~text~unescape~caselessPos("te", 20, stripMark:)=   -- 0
 0

--------------
-- test case 9
--------------
-- caselessPos with ignorable (apply casefold + stripMark + stripIgnorable internally but returns external indexes)

/*
                                                                                --  01 02   03         04 05 06 07 08 09 10 11   12 13 14 15 16 17         18   19 20
"TÊt\u{TAG SPACE}e sss sßs ss t\u{TAG SPACE}êTE"~text~unescape~c2g=             -- '54 C38A 74F3A080A0 65 20 73 73 73 20 73 C39F 73 20 73 73 20 74F3A080A0 C3AA 54 45'
                                                                                --  T  Ê    t TAG SPAC e  _  s  s  s  _  s  ß    s  _  s  s  _  t TAG SPAC ê    T  E
                                                                                --  |       |                                                   |               |
*/

ooRexx> "TÊt\u{TAG SPACE}e sss sßs ss t\u{TAG SPACE}êTE"~text~unescape~caselessPos("te", stripMark:, stripIgnorable:)=      -- 1
 1
ooRexx> "TÊt\u{TAG SPACE}e sss sßs ss t\u{TAG SPACE}êTE"~text~unescape~caselessPos("te", 2, stripMark:, stripIgnorable:)=   -- 3
 3
ooRexx> "TÊt\u{TAG SPACE}e sss sßs ss t\u{TAG SPACE}êTE"~text~unescape~caselessPos("te", 4, stripMark:, stripIgnorable:)=   -- 17
 17
ooRexx> "TÊt\u{TAG SPACE}e sss sßs ss t\u{TAG SPACE}êTE"~text~unescape~caselessPos("te", 18, stripMark:, stripIgnorable:)=  -- 19
 19
ooRexx> "TÊt\u{TAG SPACE}e sss sßs ss t\u{TAG SPACE}êTE"~text~unescape~caselessPos("te", 20, stripMark:, stripIgnorable:)=  -- 0
 0


-- ===============================================================================
-- 2023 Aug 29

/*
Implementation of caselessContains, contains:
(forwards to caselessPos or pos, and returns .true if result <> 0)
(was already implemented, waiting for 'pos' implementation)
Examples:
*/
ooRexx>     "Père Noël Père Noël"~text~contains("oë")=                   -- .true
 1
ooRexx>     "Père Noël Père Noël"~text~contains("oë", , 7)=              -- .false
 0
ooRexx>     "Père Noël Père Noël"~text~contains("oë", , 8)=              -- .true
 1
ooRexx>     "Père Noël Père Noël"~text~contains("oë", 8)=                -- .true
 1
ooRexx>     "Père Noël Père Noël"~text~contains("oë", 8, 10)=            -- .false
 0
ooRexx>     "Père Noël Père Noël"~text~contains("oë", 8, 11)=            -- .true
 1
ooRexx>     "Père Noël Père Noël"~text~caselessContains("OË", 8, 11)=    -- .true
 1

ooRexx>     "noël👩‍👨‍👩‍👧🎅"~text~contains("👧🎅")=                            -- .false
 0
ooRexx>     "noël👩‍👨‍👩‍👧🎅"~text~contains("👧🎅", aligned:.false)=            -- .true
 1
ooRexx>     "noël👩‍👨‍👩‍👧🎅"~text~contains("👩‍👨‍👩‍👧🎅", aligned:.false)=            -- .true
 1


-- ===============================================================================
-- 2023 Aug 28

/*
Add a named argument 'aligned' to caselessPos, pos:
- If aligned=.true (default) then return the first character position in the
  untransformed haystack such that all the bytes of the transformed needle are
  matched with corresponding bytes in the transformed haystack AND the first
  and last byte positions are aligned with character positions.
  If no match then return 0.
- If aligned=.false then return a pair (array) of numbers +/-posC.posB where
  posB is the position of the matched byte in the transformed haystack, and posC
  is the corresponding grapheme position in the untransformed haystack.
  A number is negative if the byte position is not aligned with the corresponding
  character position.
  The first number is the start of the matching.
  The second number is the end of the matching + 1.

aligned=.false is intended for analysis of matchings and [non-]regression tests.
Otherwise, I don't see any use.

Example:
*/
ooRexx>     "noël👩‍👨‍👩‍👧🎅"~text~pos("👧🎅")=                           -- 0
 0
ooRexx>     "noël👩‍👨‍👩‍👧🎅"~text~pos("👧🎅", aligned:.false)=           -- [-5.27,+7.35]
[-5.27,+7.35]
ooRexx>     "noël👩‍👨‍👩‍👧🎅"~text~pos("👩‍👨‍👩‍👧🎅", aligned:.false)=           -- [+5.6,+7.35]
[+5.6,+7.35]


/*
Comparison operators:
Take into account the default normalization managed by the .Unicode class
*/
ooRexx> .Unicode~normalizationName(.Unicode~defaultNormalization(strict:.true))=    -- NFC when strict
'NFC'
ooRexx> .Unicode~normalizationName(.Unicode~defaultNormalization(strict:.false))=   -- NFKD when not strict
'NFKD'
/*
Example:
*/
ooRexx>     ("baffle"~text == "baﬄe"~text) =    -- false
 0
ooRexx>     ("baffle"~text = "baﬄe"~text) =     -- true
 1
/*
Reminder: the non-strict mode supports all the Unicode spaces, not just U+0020.
*/
ooRexx>     string1 = " Le\u{IDEOGRAPHIC SPACE}Pè\u{ZERO-WIDTH-SPACE}re\u{HYPHEN}Noël"~text~unescape
ooRexx>     string2 = "Le\u{OGHAM SPACE MARK}Père\u{EN DASH}No\u{ZERO-WIDTH-SPACE}ël "~text~unescape
ooRexx>     (string1 == string2) =              -- false
 0
ooRexx>     (string1 = string2) =               -- true
 1


-- ===============================================================================
-- 2023 Aug 26

ooRexx> t = "noël👩‍👨‍👩‍👧🎅"~text; t~c2g=    -- '6E 6F C3AB 6C F09F91A9E2808DF09F91A8E2808DF09F91A9E2808DF09F91A7 F09F8E85'
'6E 6F C3AB 6C F09F91A9E2808DF09F91A8E2808DF09F91A9E2808DF09F91A7 F09F8E85'
ooRexx> t = "noël👩‍👨‍👩‍👧🎅"~text; do indexB=1 to t~string~length + 2; indexC = t~indexer~characterIndexC(indexB); character = t~character(abs(indexC)); say "indexB" indexB~right(3) "--> indexC" indexC~right(4) "    " character~c2x; end
indexB   1 --> indexC    1      6E
indexB   2 --> indexC    2      6F
indexB   3 --> indexC    3      C3AB
indexB   4 --> indexC   -3      C3AB
indexB   5 --> indexC    4      6C
indexB   6 --> indexC    5      F09F91A9 E2808D F09F91A8 E2808D F09F91A9 E2808D F09F91A7
indexB   7 --> indexC   -5      F09F91A9 E2808D F09F91A8 E2808D F09F91A9 E2808D F09F91A7
indexB   8 --> indexC   -5      F09F91A9 E2808D F09F91A8 E2808D F09F91A9 E2808D F09F91A7
indexB   9 --> indexC   -5      F09F91A9 E2808D F09F91A8 E2808D F09F91A9 E2808D F09F91A7
indexB  10 --> indexC   -5      F09F91A9 E2808D F09F91A8 E2808D F09F91A9 E2808D F09F91A7
indexB  11 --> indexC   -5      F09F91A9 E2808D F09F91A8 E2808D F09F91A9 E2808D F09F91A7
indexB  12 --> indexC   -5      F09F91A9 E2808D F09F91A8 E2808D F09F91A9 E2808D F09F91A7
indexB  13 --> indexC   -5      F09F91A9 E2808D F09F91A8 E2808D F09F91A9 E2808D F09F91A7
indexB  14 --> indexC   -5      F09F91A9 E2808D F09F91A8 E2808D F09F91A9 E2808D F09F91A7
indexB  15 --> indexC   -5      F09F91A9 E2808D F09F91A8 E2808D F09F91A9 E2808D F09F91A7
indexB  16 --> indexC   -5      F09F91A9 E2808D F09F91A8 E2808D F09F91A9 E2808D F09F91A7
indexB  17 --> indexC   -5      F09F91A9 E2808D F09F91A8 E2808D F09F91A9 E2808D F09F91A7
indexB  18 --> indexC   -5      F09F91A9 E2808D F09F91A8 E2808D F09F91A9 E2808D F09F91A7
indexB  19 --> indexC   -5      F09F91A9 E2808D F09F91A8 E2808D F09F91A9 E2808D F09F91A7
indexB  20 --> indexC   -5      F09F91A9 E2808D F09F91A8 E2808D F09F91A9 E2808D F09F91A7
indexB  21 --> indexC   -5      F09F91A9 E2808D F09F91A8 E2808D F09F91A9 E2808D F09F91A7
indexB  22 --> indexC   -5      F09F91A9 E2808D F09F91A8 E2808D F09F91A9 E2808D F09F91A7
indexB  23 --> indexC   -5      F09F91A9 E2808D F09F91A8 E2808D F09F91A9 E2808D F09F91A7
indexB  24 --> indexC   -5      F09F91A9 E2808D F09F91A8 E2808D F09F91A9 E2808D F09F91A7
indexB  25 --> indexC   -5      F09F91A9 E2808D F09F91A8 E2808D F09F91A9 E2808D F09F91A7
indexB  26 --> indexC   -5      F09F91A9 E2808D F09F91A8 E2808D F09F91A9 E2808D F09F91A7
indexB  27 --> indexC   -5      F09F91A9 E2808D F09F91A8 E2808D F09F91A9 E2808D F09F91A7
indexB  28 --> indexC   -5      F09F91A9 E2808D F09F91A8 E2808D F09F91A9 E2808D F09F91A7
indexB  29 --> indexC   -5      F09F91A9 E2808D F09F91A8 E2808D F09F91A9 E2808D F09F91A7
indexB  30 --> indexC   -5      F09F91A9 E2808D F09F91A8 E2808D F09F91A9 E2808D F09F91A7
indexB  31 --> indexC    6      F09F8E85
indexB  32 --> indexC   -6      F09F8E85
indexB  33 --> indexC   -6      F09F8E85
indexB  34 --> indexC   -6      F09F8E85
indexB  35 --> indexC    7      
indexB  36 --> indexC    7      


-- Implementation of caselessCompare, compare
-- ------------------------------------------
ooRexx>     "hello"~text~compare("hello")=                          -- 0
 0
ooRexx>     "hello"~text~compare("helloo")=                         -- 6
 6
ooRexx>     "hello"~text~compare("hellô")=                          -- 5
 5
ooRexx>     "hello"~text~caselessCompare("hellô",stripMark:)=       -- 0
 0
ooRexx>     "hellÔ"~text~caselessCompare("hellô")=                  -- 0
 0
ooRexx>     "hellÔ"~text~caselessCompare("")=                       -- 1
 1
ooRexx>     "hellÔ"~text~caselessCompare("", "h")=                  -- 2
 2
ooRexx>     zwsp = "\u{ZERO WIDTH SPACE}"~text~unescape             -- ignorable
ooRexx>     ("he"zwsp"llo")~compare("hellô")=                       -- 3 (ok)
 3
ooRexx>     ("he"zwsp"llo")~compare("hellô", stripIgnorable:)=      -- 6 (ok? not 5 because the ignorable character counts as a character)
 6


-- casefold 2 characters: "ß" becomes "ss"
ooRexx>     "Bundesstraße im Freiland"~text~caselessCompare("Bundesstraße")=        -- 14 (good)
 14
ooRexx>     "Bundesstraße im Freiland"~text~caselessCompare("Bundesstraße", "_")=   -- 13 (good)
 13
ooRexx>     "Bundesstraße im Freiland"~text~caselessCompare("bundesstrasse")=       -- 14 (good)
 14
ooRexx>     "Bundesstrasse im Freiland"~text~caselessCompare("bundesstraße")=       -- 15 (good)
 15
ooRexx>     "straßssßßssse"~text~compare("stra", "ß")=                              --  6 (good)
 6
ooRexx>     "straßssßßssse"~text~caselessCompare("stra", "ß")=                      -- 12 (not 13 because the last 's' matches half of the pad 'ss')
 12

/*
This test case is a little bit strange because:
- the case-folded character looks identical to the original character.
- the normalization and the casefold have the same effect.
*/
-- casefold 3 characters: "ΐ" 'U+0390' becomes "ΐ" 'U+03B9 U+0308 U+0301'
ooRexx>     iota_dt = "\u{GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS}"~text~unescape
ooRexx>     iota_dt~casefold~UnicodeCharacters==
an Array (shape [3], 3 items)
 1 : ( "ι"   U+03B9 Ll 1 "GREEK SMALL LETTER IOTA" )
 2 : ( "̈"    U+0308 Mn 0 "COMBINING DIAERESIS" )
 3 : ( "́"    U+0301 Mn 0 "COMBINING ACUTE ACCENT" )
ooRexx>     ("a" iota_dt "b")~compare("a")=                         -- 3
 3
ooRexx>     ("a" iota_dt "b")~compare("a" iota_dt)=                 -- 5
 5
ooRexx>     ("a" iota_dt~casefold "b")~compare("a" iota_dt)=                                -- 5 (yes! not 3 because the default NFC transforms iota_dt~casefold 'U+03B9 U+0308 U+0301' into 'U+0390')
 5
ooRexx>     ("a" iota_dt~casefold "b")~compare("a" iota_dt, normalization: .Unicode~NFD)=   -- 5 (yes! not 3 because NFD transforms iota_dt 'U+0390' into 'U+03B9 U+0308 U+0301')
 5
ooRexx>     ("a" iota_dt~casefold "b")~compare("a" iota_dt, normalization: 0)=              -- 3 because normalization deactivated
 3
ooRexx>     ("a" iota_dt "b")~caselessCompare("a")=                 -- 3
 3
ooRexx>     ("a" iota_dt "b")~caselessCompare("a" iota_dt)=         -- 5
 5
ooRexx>     ("a" iota_dt "b")~caselessCompare("a ", iota_dt)=       -- 4
 4


-- Implementation of caselessEndsWith, endsWith
-- --------------------------------------------
ooRexx>     "hello"~text~endsWith("")=                              -- false
 0
ooRexx>     "hello"~text~endsWith("o")=                             -- true
 1
ooRexx>     "hello"~text~endsWith("ô")=                             -- false
 0
ooRexx>     "hello"~text~endsWith("ô", stripMark:)=                 -- true
 1
ooRexx>     "hello"~text~endsWith("O")=                             -- false
 0
ooRexx>     "hello"~text~caselessEndsWith("O")=                     -- true
 1


-- Rework implementation of caselessMatchChar, matchChar
-- -----------------------------------------------------
ooRexx>     "BAFFLE"~text~caselessMatchChar(3, "ﬄ")=               -- 0, was 1 before 2023.12.04      "ﬄ" becomes "ffl" (3 graphemes), there is a match on "f" at 3
 0
ooRexx>     "BAFFLE"~text~caselessMatchChar(5, "ﬄ")=               -- 0, was 1 before 2023.12.04      "ﬄ" becomes "ffl" (3 graphemes), there is a match on "l" at 5
 0
ooRexx>     "baffle"~text~caselessMatchChar(5, "L")=               -- 1      there is a match on "l" at 5 (forward to string)
 1
ooRexx>     "baﬄe"~text~caselessMatchChar(3, "ﬄ")=                 -- 1      "ﬄ" at 3 (1 grapheme) becomes "ffl" (3 graphemes), there is a match on "l"
 1
ooRexx>     "baﬄe"~text~caselessMatchChar(3, "F")=                 -- 0, was 1 before 2023.12.04      "ﬄ" at 3 (1 grapheme) becomes "ffl" (3 graphemes), there is a match on "f"
 0
ooRexx>     "baﬄe"~text~caselessMatchChar(3, "L")=                 -- 0, was 1 before 2023.12.04      "ﬄ" at 3 (1 grapheme) becomes "ffl" (3 graphemes), there is a match on "l"
 0
ooRexx>     "baﬄe"~text~caselessMatchChar(4, "E")=                 -- 1      the grapheme at 4 is "e", not "f". There is a match with "e"
 1


-- Rework implementation of caselessCompareTo, compareTo
-- -----------------------------------------------------
ooRexx>     "Père Noël"~text~nfc~compareTo("Père Noël"~text~nfc)=                       -- 0 (equal)
 0
ooRexx>     "Père Noël"~text~nfc~compareTo("Père Noël"~text~nfd)=                       -- 0 (equal)
 0
ooRexx>     "Père Noël"~text~nfd~compareTo("Père Noël"~text~nfc)=                       -- 0 (equal)
 0
ooRexx>     "Père Noël"~text~nfd~compareTo("Père Noël"~text~nfd)=                       -- 0 (equal)
 0
ooRexx>     ---
ooRexx>     "Pere Noël"~text~nfc~compareTo("Père Noel"~text~nfc, stripMark:)=           -- 0 (equal)
 0
ooRexx>     "Pere Noël"~text~nfc~compareTo("Père Noel"~text~nfd, stripMark:)=           -- 0 (equal)
 0
ooRexx>     "Pere Noël"~text~nfd~compareTo("Père Noel"~text~nfc, stripMark:)=           -- 0 (equal)
 0
ooRexx>     "Pere Noël"~text~nfd~compareTo("Père Noel"~text~nfd, stripMark:)=           -- 0 (equal)
 0
ooRexx>     ---
ooRexx>     "1st Père Noël"~text~nfc~compareTo("2nd Père Noël"~text~nfc)=               -- -1 (lesser)
-1
ooRexx>     "1st Père Noël"~text~nfc~compareTo("2nd Père Noël"~text~nfd)=               -- -1 (lesser)
-1
ooRexx>     "1st Père Noël"~text~nfd~compareTo("2nd Père Noël"~text~nfc)=               -- -1 (lesser)
-1
ooRexx>     "1st Père Noël"~text~nfd~compareTo("2nd Père Noël"~text~nfd)=               -- -1 (lesser)
-1
ooRexx>     ---
ooRexx>     "Père Noël 2nd"~text~nfc~compareTo("Père Noël 1st"~text~nfc)=               -- 1 (greater)
 1
ooRexx>     "Père Noël 2nd"~text~nfc~compareTo("Père Noël 1st"~text~nfd)=               -- 1 (greater)
 1
ooRexx>     "Père Noël 2nd"~text~nfd~compareTo("Père Noël 1st"~text~nfc)=               -- 1 (greater)
 1
ooRexx>     "Père Noël 2nd"~text~nfd~compareTo("Père Noël 1st"~text~nfd)=               -- 1 (greater)
 1
ooRexx>     ---
ooRexx>     "Pere Noël"~text~nfc~compareTo("Père Noel"~text~nfc, 3, 4)=                 -- 0 (equal)
 0
ooRexx>     "Pere Noël"~text~nfc~compareTo("Père Noel"~text~nfd, 3, 4)=                 -- 0 (equal)
 0
ooRexx>     "Pere Noël"~text~nfd~compareTo("Père Noel"~text~nfc, 3, 4)=                 -- 0 (equal)
 0
ooRexx>     "Pere Noël"~text~nfd~compareTo("Père Noel"~text~nfd, 3, 4)=                 -- 0 (equal)
 0
ooRexx>     ---
ooRexx>     "PÈRE NOËL"~text~nfc~compareTo("Père Noël"~text~nfc)=                       -- -1 (lesser)
-1
ooRexx>     "PÈRE NOËL"~text~nfc~compareTo("Père Noël"~text~nfd)=                       -- -1 (lesser)
-1
ooRexx>     "PÈRE NOËL"~text~nfd~compareTo("Père Noël"~text~nfc)=                       -- -1 (lesser)
-1
ooRexx>     "PÈRE NOËL"~text~nfd~compareTo("Père Noël"~text~nfd)=                       -- -1 (lesser)
-1
ooRexx>     ---
ooRexx>     "PÈRE NOËL"~text~nfc~caselessCompareTo("Père Noël"~text~nfc)=               -- 0 (equal)
 0
ooRexx>     "PÈRE NOËL"~text~nfc~caselessCompareTo("Père Noël"~text~nfd)=               -- 0 (equal)
 0
ooRexx>     "PÈRE NOËL"~text~nfd~caselessCompareTo("Père Noël"~text~nfc)=               -- 0 (equal)
 0
ooRexx>     "PÈRE NOËL"~text~nfd~caselessCompareTo("Père Noël"~text~nfd)=               -- 0 (equal)
 0
ooRexx>     ---
ooRexx>     "PERE NOËL"~text~nfc~caselessCompareTo("Père Noel"~text~nfc, 3, 4)=         -- 0 (equal)
 0
ooRexx>     "PERE NOËL"~text~nfc~caselessCompareTo("Père Noel"~text~nfd, 3, 4)=         -- 0 (equal)
 0
ooRexx>     "PERE NOËL"~text~nfd~caselessCompareTo("Père Noel"~text~nfc, 3, 4)=         -- 0 (equal)
 0
ooRexx>     "PERE NOËL"~text~nfd~caselessCompareTo("Père Noel"~text~nfd, 3, 4)=         -- 0 (equal)
 0


-- Implementation of caselessPos, pos
-- ----------------------------------

/*
    --       P  è       r  e  _  N  o  ë       l
    --       1  2       3  4  5  6  7  8       9
    -- NFC  '50 C3A8    72 65 20 4E 6F C3AB    6C'
    --       1  2 3     4  5  6  7  8  9 10    11
    -- NFD  '50 65 CC80 72 65 20 4E 6F 65 CC88 6C'
    --       1  2  3 4  5  6  7  8  9  10 1112 13
*/
ooRexx>                                                             --      self needle
ooRexx>     "Père Noël Père Noël"~text~pos("l")=                    -- 9    NFC, NFC
 9
ooRexx>     "Père Noël Père Noël"     ~pos("l")=                    -- 9    NFC, NFC    (was 11 before automatic conversion of string literals to text)
 9

ooRexx>     "Père Noël Père Noël"~text~pos("l", , 8)=               -- 0    NFC, NFC
 0
ooRexx>     "Père Noël Père Noël"     ~pos("l", , 10)=              -- 9    NFC, NFC    (was 0 before automatic conversion of string literals to text)
 9

ooRexx>     "Père Noël Père Noël"~text~pos("l", , 9)=               -- 9    NFC, NFC
 9
ooRexx>     "Père Noël Père Noël"     ~pos("l", , 11)=              -- 9    NFC, NFC    (was 11 before automatic conversion of string literals to text)
 9

ooRexx>     "Père Noël Père Noël"~text~pos("l", 10)=                -- 19   NFC, NFC
 19
ooRexx>     "Père Noël Père Noël"     ~pos("l", 12)=                -- 19   NFC, NFC    (was 23 before automatic conversion of string literals to text)
 19

ooRexx>     "Père Noël Père Noël"~text~pos("l", 10, 9)=             -- 0    NFC, NFC
 0
ooRexx>     "Père Noël Père Noël"     ~pos("l", 12, 11)=            -- 19   NFC, NFC    (was 0 before automatic conversion of string literals to text)
 19

ooRexx>     "Père Noël Père Noël"~text~pos("l", 10, 10)=            -- 19   NFC, NFC
 19
ooRexx>     "Père Noël Père Noël"     ~pos("l", 12, 12)=            -- 19   NFC, NFC    (was 23 before automatic conversion of string literals to text)
 19

ooRexx>     ---

ooRexx>     "Père Noël Père Noël"~text~pos("l")=                    -- 9    NFD, NFC
 9
ooRexx>     "Père Noël Père Noël"     ~pos("l")=                    -- 9    NFD, NFC    (was 13 before automatic conversion of string literals to text)
 9

ooRexx>     "Père Noël Père Noël"~text~pos("l", , 8)=               -- 0    NFD, NFC
 0
ooRexx>     "Père Noël Père Noël"     ~pos("l", , 12)=              -- 9    NFD, NFC    (was 0 before automatic conversion of string literals to text)
 9

ooRexx>     "Père Noël Père Noël"~text~pos("l", , 9)=               -- 9    NFD, NFC
 9
ooRexx>     "Père Noël Père Noël"     ~pos("l", , 13)=              -- 9    NFD, NFC    (was 13 before automatic conversion of string literals to text)
 9

ooRexx>     "Père Noël Père Noël"~text~pos("l", 10)=                -- 19   NFD, NFC
 19
ooRexx>     "Père Noël Père Noël"     ~pos("l", 14)=                -- 19   NFD, NFC    (was 27 before automatic conversion of string literals to text)
 19

ooRexx>     "Père Noël Père Noël"~text~pos("l", 10, 9)=             -- 0    NFD, NFC
 0
ooRexx>     "Père Noël Père Noël"     ~pos("l", 14, 13)=            -- 19   NFD, NFC    (was 0 before automatic conversion of string literals to text)
 19

ooRexx>     "Père Noël Père Noël"~text~pos("l", 10, 10)=            -- 19   NFD, NFC
 19
ooRexx>     "Père Noël Père Noël"     ~pos("l", 14, 14)=            -- 19   NFD, NFC    (was 27 before automatic conversion of string literals to text)
 19

ooRexx>     ---

ooRexx>     "Père Noël Père Noël"~text~pos("oë")=                   -- 7    NFC, NFC
 7
ooRexx>     "Père Noël Père Noël"     ~pos("oë")=                   -- 7    NFC, NFC    (was 8 before automatic conversion of string literals to text)
 7

ooRexx>     "Père Noël Père Noël"~text~pos("oë", , 7)=              -- 0    NFC, NFC
 0
ooRexx>     "Père Noël Père Noël"     ~pos("oë", , 9)=              -- 7    NFC, NFC    (was 0 before automatic conversion of string literals to text)
 7

ooRexx>     "Père Noël Père Noël"~text~pos("oë", , 8)=              -- 7    NFC, NFC
 7
ooRexx>     "Père Noël Père Noël"     ~pos("oë", , 10)=             -- 7    NFC, NFC    (was 8 before automatic conversion of string literals to text)
 7

ooRexx>     "Père Noël Père Noël"~text~pos("oë", 8)=                -- 17   NFC, NFC
 17
ooRexx>     "Père Noël Père Noël"     ~pos("oë", 9)=                -- 17   NFC, NFC    (was 20 before automatic conversion of string literals to text)
 17

ooRexx>     "Père Noël Père Noël"~text~pos("oë", 8, 10)=            -- 0    NFC, NFC
 0
ooRexx>     "Père Noël Père Noël"     ~pos("oë", 9, 13)=            -- 17   NFC, NFC    (was 0 before automatic conversion of string literals to text)
 17

ooRexx>     "Père Noël Père Noël"~text~pos("oë", 8, 11)=            -- 17   NFC, NFC
 17
ooRexx>     "Père Noël Père Noël"     ~pos("oë", 9, 14)=            -- 17   NFC, NFC    (was 20 before automatic conversion of string literals to text)
 17

ooRexx>     ---

ooRexx>     "Père Noël Père Noël"~text~pos("oë")=                   -- 7    NFD, NFC
 7
ooRexx>     "Père Noël Père Noël"     ~pos("oë")=                   -- 7    NFD, NFC    (was "always 0, no need to test all the combinations" before automatic conversion of string literals to text)
 7

ooRexx>     "Père Noël Père Noël"~text~pos("oë", , 7)=              -- 0    NFD, NFC
 0

ooRexx>     "Père Noël Père Noël"~text~pos("oë", , 8)=              -- 7    NFD, NFC
 7

ooRexx>     "Père Noël Père Noël"~text~pos("oë", 8)=                -- 17   NFD, NFC
 17

ooRexx>     "Père Noël Père Noël"~text~pos("oë", 8, 10)=            -- 0    NFD, NFC
 0

ooRexx>     "Père Noël Père Noël"~text~pos("oë", 8, 11)=            -- 17   NFD, NFC
 17

ooRexx>     ---

ooRexx>     "Père Noël Père Noël"~text~pos("oë")=                   -- 7    NFC, NFD
 7
ooRexx>     "Père Noël Père Noël"     ~pos("oë")=                   -- 7    NFC, NFD    (was "always 0, no need to test all the combinations" before automatic conversion of string literals to text)
 7

ooRexx>     "Père Noël Père Noël"~text~pos("oë", , 7)=              -- 0    NFC, NFD
 0

ooRexx>     "Père Noël Père Noël"~text~pos("oë", , 8)=              -- 7    NFC, NFD
 7

ooRexx>     "Père Noël Père Noël"~text~pos("oë", 8)=                -- 17   NFC, NFD
 17

ooRexx>     "Père Noël Père Noël"~text~pos("oë", 8, 10)=            -- 0    NFC, NFD
 0

ooRexx>     "Père Noël Père Noël"~text~pos("oë", 8, 11)=            -- 17   NFC, NFD
 17

ooRexx>     ---

ooRexx>     "Père Noël Père Noël"~text~pos("oë")=                   -- 7    NFD, NFD
 7
ooRexx>     "Père Noël Père Noël"     ~pos("oë")=                   -- 7    NFD, NFD    (was 9 before automatic conversion of string literals to text)
 7

ooRexx>     "Père Noël Père Noël"~text~pos("oë", , 7)=              -- 0    NFD, NFD
 0
ooRexx>     "Père Noël Père Noël"     ~pos("oë", , 11)=             -- 7    NFD, NFD    (was 0 before automatic conversion of string literals to text)
 7

ooRexx>     "Père Noël Père Noël"~text~pos("oë", , 8)=              -- 7    NFD, NFD
 7
ooRexx>     "Père Noël Père Noël"     ~pos("oë", , 12)=             -- 7    NFD, NFD    (was 9 before automatic conversion of string literals to text)
 7

ooRexx>     "Père Noël Père Noël"~text~pos("oë", 8)=                -- 17   NFD, NFD
 17
ooRexx>     "Père Noël Père Noël"     ~pos("oë", 10)=               -- 17   NFD, NFD    (was 23 before automatic conversion of string literals to text)
 17

ooRexx>     "Père Noël Père Noël"~text~pos("oë", 8, 10)=            -- 0    NFD, NFD
 0
ooRexx>     "Père Noël Père Noël"     ~pos("oë", 10, 16)=           -- 17   NFD, NFD    (was 0 before automatic conversion of string literals to text)
 17

ooRexx>     "Père Noël Père Noël"~text~pos("oë", 8, 11)=            -- 17   NFD, NFD
 17
ooRexx>     "Père Noël Père Noël"     ~pos("oë", 10, 17)=           -- 17   NFD, NFD    (was 23 before automatic conversion of string literals to text)
 17

ooRexx>     ---

ooRexx>     "Père Noël Père Noël"~text~pos("oe")=                   -- 0    NFC, NFC    always 0, no need to test all the combinations
 0
ooRexx>     "Père Noël Père Noël"~text~pos("oe", stripMark:)=       -- 7    NFC, NFC
 7

ooRexx>     "Père Noël Père Noël"~text~pos("oe", , 7, stripMark:)=  -- 0    NFC, NFC
 0

ooRexx>     "Père Noël Père Noël"~text~pos("oe", , 8, stripMark:)=  -- 7    NFC, NFC
 7

ooRexx>     "Père Noël Père Noël"~text~pos("oe", 8, stripMark:)=    -- 17   NFC, NFC
 17

ooRexx>     "Père Noël Père Noël"~text~pos("oe", 8, 10, stripMark:)=-- 0    NFC, NFC
 0

ooRexx>     "Père Noël Père Noël"~text~pos("oe", 8, 11, stripMark:)=-- 17   NFC, NFC
 17

ooRexx>     ---
ooRexx>     -- caseless tests not in the diary:
ooRexx>     ---

ooRexx>     "Père Noël Père Noël"~text~caselessPos("L")=                    -- 9    NFC, NFC
 9
ooRexx>     "Père Noël Père Noël"     ~caselessPos("L")=                    -- 9    NFC, NFC    (was 11 before automatic conversion of string literals to text)
 9

ooRexx>     "Père Noël Père Noël"~text~caselessPos("L", , 8)=               -- 0    NFC, NFC
 0
ooRexx>     "Père Noël Père Noël"     ~caselessPos("L", , 10)=              -- 9    NFC, NFC    (was 0 before automatic conversion of string literals to text)
 9

ooRexx>     "Père Noël Père Noël"~text~caselessPos("L", , 9)=               -- 9    NFC, NFC
 9
ooRexx>     "Père Noël Père Noël"     ~caselessPos("L", , 11)=              -- 9    NFC, NFC    (was 11 before automatic conversion of string literals to text)
 9

ooRexx>     "Père Noël Père Noël"~text~caselessPos("L", 10)=                -- 19   NFC, NFC
 19
ooRexx>     "Père Noël Père Noël"     ~caselessPos("L", 12)=                -- 19   NFC, NFC    (was 23 before automatic conversion of string literals to text)
 19

ooRexx>     "Père Noël Père Noël"~text~caselessPos("L", 10, 9)=             -- 0    NFC, NFC
 0
ooRexx>     "Père Noël Père Noël"     ~caselessPos("L", 12, 11)=            -- 19   NFC, NFC    (was 0 before automatic conversion of string literals to text)
 19

ooRexx>     "Père Noël Père Noël"~text~caselessPos("L", 10, 10)=            -- 19   NFC, NFC
 19
ooRexx>     "Père Noël Père Noël"     ~caselessPos("L", 12, 12)=            -- 19   NFC, NFC    (was 23 before automatic conversion of string literals to text)
 19

ooRexx>     ---

ooRexx>     "Père Noël Père Noël"~text~caselessPos("L")=                    -- 9    NFD, NFC
 9
ooRexx>     "Père Noël Père Noël"     ~caselessPos("L")=                    -- 9    NFD, NFC    (was 13 before automatic conversion of string literals to text)
 9

ooRexx>     "Père Noël Père Noël"~text~caselessPos("L", , 8)=               -- 0    NFD, NFC
 0
ooRexx>     "Père Noël Père Noël"     ~caselessPos("L", , 12)=              -- 9    NFD, NFC    (was 0 before automatic conversion of string literals to text)
 9

ooRexx>     "Père Noël Père Noël"~text~caselessPos("L", , 9)=               -- 9    NFD, NFC
 9
ooRexx>     "Père Noël Père Noël"     ~caselessPos("L", , 13)=              -- 9    NFD, NFC    (was 13 before automatic conversion of string literals to text)
 9

ooRexx>     "Père Noël Père Noël"~text~caselessPos("L", 10)=                -- 19   NFD, NFC
 19
ooRexx>     "Père Noël Père Noël"     ~caselessPos("L", 14)=                -- 19   NFD, NFC    (was 27 before automatic conversion of string literals to text)
 19

ooRexx>     "Père Noël Père Noël"~text~caselessPos("L", 10, 9)=             -- 0    NFD, NFC
 0
ooRexx>     "Père Noël Père Noël"     ~caselessPos("L", 14, 13)=            -- 19   NFD, NFC    (was 0 before automatic conversion of string literals to text)
 19

ooRexx>     "Père Noël Père Noël"~text~caselessPos("L", 10, 10)=            -- 19   NFD, NFC
 19
ooRexx>     "Père Noël Père Noël"     ~caselessPos("L", 14, 14)=            -- 19   NFD, NFC    (was 27 before automatic conversion of string literals to text)
 19

ooRexx>     ---

ooRexx>     "Père Noël Père Noël"~text~caselessPos("OË")=                   -- 7    NFC, NFC
 7
ooRexx>     "Père Noël Père Noël"     ~caselessPos("OË")=                   -- 7    NFC, NFC    (was "yes, 0, not 8 because "OË"~lower=='oË'" before automatic conversion of string literals to text)
 7

ooRexx>     "Père Noël Père Noël"~text~caselessPos("OË", , 7)=              -- 0    NFC, NFC
 0
ooRexx>     "Père Noël Père Noël"     ~caselessPos("OË", , 9)=              -- 7    NFC, NFC    (was 0 before automatic conversion of string literals to text)
 7

ooRexx>     "Père Noël Père Noël"~text~caselessPos("OË", , 8)=              -- 7    NFC, NFC
 7
ooRexx>     "Père Noël Père Noël"     ~caselessPos("OË", , 10)=             -- 7    NFC, NFC    (was "yes, 0, not 8 because "OË"~lower=='oË'" before automatic conversion of string literals to text)
 7

ooRexx>     "Père Noël Père Noël"~text~caselessPos("OË", 8)=                -- 17   NFC, NFC
 17
ooRexx>     "Père Noël Père Noël"     ~caselessPos("OË", 9)=                -- 17   NFC, NFC    (was "yes, 0, not 20 because "OË"~lower=='oË'" before automatic conversion of string literals to text)
 17

ooRexx>     "Père Noël Père Noël"~text~caselessPos("OË", 8, 10)=            -- 0    NFC, NFC
 0
ooRexx>     "Père Noël Père Noël"     ~caselessPos("OË", 9, 13)=            -- 17   NFC, NFC    (was 0 before automatic conversion of string literals to text)
 17

ooRexx>     "Père Noël Père Noël"~text~caselessPos("OË", 8, 11)=            -- 17   NFC, NFC
 17
ooRexx>     "Père Noël Père Noël"     ~caselessPos("OË", 9, 14)=            -- 17   NFC, NFC    (was "yes, 0, not 20 because "OË"~lower=='oË'" before automatic conversion of string literals to text)
 17

ooRexx>     ---

ooRexx>     "Père Noël Père Noël"~text~caselessPos("OË")=                   -- 7    NFD, NFC
 7
ooRexx>     "Père Noël Père Noël"     ~caselessPos("OË")=                   -- 7    NFD, NFC    (was "always 0, no need to test all the combinations" before automatic conversion of string literals to text)
 7

ooRexx>     "Père Noël Père Noël"~text~caselessPos("OË", , 7)=              -- 0    NFD, NFC
 0

ooRexx>     "Père Noël Père Noël"~text~caselessPos("OË", , 8)=              -- 7    NFD, NFC
 7

ooRexx>     "Père Noël Père Noël"~text~caselessPos("OË", 8)=                -- 17   NFD, NFC
 17

ooRexx>     "Père Noël Père Noël"~text~caselessPos("OË", 8, 10)=            -- 0    NFD, NFC
 0

ooRexx>     "Père Noël Père Noël"~text~caselessPos("OË", 8, 11)=            -- 17   NFD, NFC
 17

ooRexx>     ---

ooRexx>     "Père Noël Père Noël"~text~caselessPos("OË")=                   -- 7    NFC, NFD
 7
ooRexx>     "Père Noël Père Noël"     ~caselessPos("OË")=                   -- 7    NFC, NFD   (was "always 0, no need to test all the combinations" before automatic conversion of string literals to text)
 7

ooRexx>     "Père Noël Père Noël"~text~caselessPos("OË", , 7)=              -- 0    NFC, NFD
 0

ooRexx>     "Père Noël Père Noël"~text~caselessPos("OË", , 8)=              -- 7    NFC, NFD
 7

ooRexx>     "Père Noël Père Noël"~text~caselessPos("OË", 8)=                -- 17   NFC, NFD
 17

ooRexx>     "Père Noël Père Noël"~text~caselessPos("OË", 8, 10)=            -- 0    NFC, NFD
 0

ooRexx>     "Père Noël Père Noël"~text~caselessPos("OË", 8, 11)=            -- 17   NFC, NFD
 17

ooRexx>     ---

ooRexx>     "Père Noël Père Noël"~text~caselessPos("OË")=                   -- 7    NFD, NFD
 7
ooRexx>     "Père Noël Père Noël"     ~caselessPos("OË")=                   -- 7    NFD, NFD   (was "yes, 9 (it works...) because the NFD representation isolate the accent: "oë"~c2x=='6F65CC88',  "OË"~lower~c2x=='6F65CC88'" before automatic conversion of string literals to text)
 7

ooRexx>     "Père Noël Père Noël"~text~caselessPos("OË", , 7)=              -- 0    NFD, NFD
 0
ooRexx>     "Père Noël Père Noël"     ~caselessPos("OË", , 11)=             -- 7    NFD, NFD   (was 0 before automatic conversion of string literals to text)
 7

ooRexx>     "Père Noël Père Noël"~text~caselessPos("OË", , 8)=              -- 7    NFD, NFD
 7
ooRexx>     "Père Noël Père Noël"     ~caselessPos("OË", , 12)=             -- 7    NFD, NFD   (was "yes, 9 (it works thanks to the NFD), see previous comment" before automatic conversion of string literals to text)
 7

ooRexx>     "Père Noël Père Noël"~text~caselessPos("OË", 8)=                -- 17   NFD, NFD
 17
ooRexx>     "Père Noël Père Noël"     ~caselessPos("OË", 10)=               -- 17   NFD, NFD   (was "yes, 23 (it works thanks to the NFD), see previous comment" before automatic conversion of string literals to text)
 17

ooRexx>     "Père Noël Père Noël"~text~caselessPos("OË", 8, 10)=            -- 0    NFD, NFD
 0
ooRexx>     "Père Noël Père Noël"     ~caselessPos("OË", 10, 16)=           -- 17   NFD, NFD   (was 0 before automatic conversion of string literals to text)
 17

ooRexx>     "Père Noël Père Noël"~text~caselessPos("OË", 8, 11)=            -- 17   NFD, NFD
 17
ooRexx>     "Père Noël Père Noël"     ~caselessPos("OË", 10, 17)=           -- 17   NFD, NFD   (was "yes, 23 (it works thanks to the NFD), see previous comment" before automatic conversion of string literals to text)
 17

ooRexx>     ---

ooRexx>     "Père Noël Père Noël"~text~caselessPos("OE")=                   -- 0    NFC, NFC    always 0, no need to test all the combinations
 0
ooRexx>     "Père Noël Père Noël"~text~caselessPos("OE", stripMark:)=       -- 7    NFC, NFC
 7

ooRexx>     "Père Noël Père Noël"~text~caselessPos("OE", , 7, stripMark:)=  -- 0    NFC, NFC
 0

ooRexx>     "Père Noël Père Noël"~text~caselessPos("OE", , 8, stripMark:)=  -- 7    NFC, NFC
 7

ooRexx>     "Père Noël Père Noël"~text~caselessPos("OE", 8, stripMark:)=    -- 17   NFC, NFC
 17

ooRexx>     "Père Noël Père Noël"~text~caselessPos("OE", 8, 10, stripMark:)=-- 0    NFC, NFC
 0

ooRexx>     "Père Noël Père Noël"~text~caselessPos("OE", 8, 11, stripMark:)=-- 17   NFC, NFC
 17


-- ===============================================================================
-- 2023 Aug 07

-- Add conversion from a Unicode encoding to a Byte encoding.
ooRexx> "Père Noël"~text~transcodeTo("cp437")~c2x=                                  -- '50 8A 72 65 20 4E 6F 89 6C'
'50 8A 72 65 20 4E 6F 89 6C'
ooRexx> '50 8A 72 65 20 4E 6F 89 6C'x~text("cp437")~utf8~c2x=                       -- '50 C3A8 72 65 20 4E 6F C3AB 6C'
'50 C3A8 72 65 20 4E 6F C3AB 6C'
ooRexx> '50 8A 72 65 20 4E 6F 89 6C'x~text("cp437")~transcodeTo("utf8")~c2x=        -- '50 C3A8 72 65 20 4E 6F C3AB 6C'
'50 C3A8 72 65 20 4E 6F C3AB 6C'

-- The replacementCharacter "FF"x is interpreted as a UTF-8 string (default encoding). "FF"x~text~c2u= -- 'U+FFFD'
-- Was: Hence the error "The replacement character UTF-8 not-ASCII '[FF]' cannot be transcoded to ISO-8859-1."
-- Now: Invalid UTF-8 string (since automatic conversion of string literals to text)
-- Now: Direct transcoding from 'Byte' to 'ISO-8859-1' is not supported (since the systematic absorption of The Byte_Encoding)
-- TODO: test case to get the previous error message '...cannot be transcoded...'
ooRexx> text = "Père Noël 🎅 10€"~text; do encoding over .Byte_Encoding~subclasses~~append(.Byte_Encoding); say encoding~name~left(13)":" text~transcodeTo(encoding, replacementCharacter:"FF"x~byte)~c2x; end
Direct transcoding from 'Byte' to 'ISO-8859-1' is not supported.
Error code= 23.900

-- Here, the replacementCharacter is interpreted as a byte string encoded in the target encoding
ooRexx> text = "Père Noël 🎅 10€"~text; do encoding over .Byte_Encoding~subclasses~~append(.Byte_Encoding); say encoding~name~left(13)":" text~transcodeTo(encoding, replacementCharacter:"FF"x~text(encoding))~c2x; end
ISO-8859-1   : 50 E8 72 65 20 4E 6F EB 6C 20 FF 20 31 30 FF
ibm-1252     : 50 E8 72 65 20 4E 6F EB 6C 20 FF 20 31 30 FF
windows-1252 : 50 E8 72 65 20 4E 6F EB 6C 20 FF 20 31 30 80
IBM437       : 50 8A 72 65 20 4E 6F 89 6C 20 FF 20 31 30 FF
Byte         : 50 FF 72 65 20 4E 6F FF 6C 20 FF 20 31 30 FF


-- ===============================================================================
-- 2023 Aug 04


--- Following expressions return the same result correctly tagged 'ISO-8859-1'
ooRexx> b = .MutableBuffer~new; "Pere"~text("windows-1252")~append(" "~text("windows-1252"), buffer:b)~appendEncoded("Noël"~text("iso-8859-1"), buffer:b)=; result~description=
M'Pere Noël'
'ISO-8859-1 not-ASCII (10 bytes)'
ooRexx> b = .MutableBuffer~new; "Pere"~text("windows-1252")~appendEncoded(" "~text("windows-1252"), buffer:b)~appendEncoded("Noël"~text("iso-8859-1"), buffer:b)=; result~description=
M'Pere Noël'
'ISO-8859-1 not-ASCII (10 bytes)'
ooRexx> b = .MutableBuffer~new; b~appendEncoded("Pere"~text("windows-1252"), " "~text("windows-1252"), "Noël"~text("iso-8859-1"))=; result~description=
M'Pere Noël'
'ISO-8859-1 not-ASCII (10 bytes)'

-- Following expressions (not using 'appendEncoded') return the same result as above, but wrongly tagged 'windows-1252' or 'UTF-8'
ooRexx> b = .MutableBuffer~new; "Pere"~text("windows-1252")~append(" "~text("windows-1252"), buffer:b)~append("Noël"~text("iso-8859-1"), buffer:b)=; result~description=
M'Pere Noël'
'windows-1252 not-ASCII (10 bytes)'
ooRexx> b = .MutableBuffer~new; b~append("Pere"~text("windows-1252"), " "~text("windows-1252"), "Noël"~text("iso-8859-1"))=; result~description=
M'Pere Noël'
'UTF-8 not-ASCII by default (10 bytes)'


-- ===============================================================================
-- 2023 Jun 28

-- Bitkey is now 2 bytes (4 hex digits) always.

-- For debugging, temporarily give access to the flags stored on an indexer.
ooRexx> "Père Noël"~text~nfc(casefold:, stripMark:)~indexer~flags=
a Directory (10 items)
'FLAG_CASEFOLD'        :  1
'FLAG_LUMP'            : -1
'FLAG_NFC'             :  1
'FLAG_NFD'             : -1
'FLAG_NFKC'            : -1
'FLAG_NFKD'            : -1
'FLAG_STRIP_CC'        : -1
'FLAG_STRIP_IGNORABLE' : -1
'FLAG_STRIP_MARK'      :  1
'FLAG_STRIP_NA'        : -1


-- ===============================================================================
-- 2023 May 31

-- Add support for functional methods to RexxText.

-- Example inspired by https://elixir-lang.org/
-- Frequency of each character, ignoring the accents:
ooRexx> "Notre père Noël 🎅"~text~transform(stripMark:)~reduce(by: "characters", initial: .stem~new~~put(0)){accu[item~string] += 1}=
a Stem (9 items)
'🎅' :  1
' '    :  3
'e'    :  4
'l'    :  1
'N'    :  2
'o'    :  2
'p'    :  1
'r'    :  2
't'    :  1

-- Add support for generator methods to RexxText.

ooRexx> g="Noël 🎅"~text~generateC
ooRexx> g~()=       -- T'N'
T'N'
ooRexx> g~()=       -- T'o'
T'o'
ooRexx> g~()=       -- T'ë'
T'ë'
ooRexx> g~()=       -- T'l'
T'l'
ooRexx> g~()=       -- T' '
T' '
ooRexx> g~()=       -- T'🎅'
T'🎅'
ooRexx> g~()=       -- [no result]
[no result]


-- ===============================================================================
-- 2023 May 29

-- For convenience, an additional way to search for a character:
-- with a routine
ooRexx> .UnicodeCharacter("bed")=                   -- ( "🛏"   U+1F6CF So 1 "BED" )
( "🛏"   U+1F6CF So 1 "BED" )
ooRexx> .UnicodeCharacter("bed", hexadecimal:)=     -- ( "௭"   U+0BED Nd 1 "TAMIL DIGIT SEVEN" )
( "௭"   U+0BED Nd 1 "TAMIL DIGIT SEVEN" )
-- with the operator []
ooRexx> .UnicodeCharacter["bed"]=                   -- ( "🛏"   U+1F6CF So 1 "BED" )
( "🛏"   U+1F6CF So 1 "BED" )
ooRexx> .UnicodeCharacter["bed", hexadecimal:]=     -- ( "௭"   U+0BED Nd 1 "TAMIL DIGIT SEVEN" )
( "௭"   U+0BED Nd 1 "TAMIL DIGIT SEVEN" )

-- This comes in complement of:
ooRexx> .Unicode["bed", hexadecimal:]=              -- ( "௭"   U+0BED Nd 1 "TAMIL DIGIT SEVEN" )
( "௭"   U+0BED Nd 1 "TAMIL DIGIT SEVEN" )
ooRexx> .Unicode~character("bed", hexadecimal:)=    -- ( "௭"   U+0BED Nd 1 "TAMIL DIGIT SEVEN" )
( "௭"   U+0BED Nd 1 "TAMIL DIGIT SEVEN" )


-- New method UnicodeCharacter~properties at class level: return a list of property names.
ooRexx> .UnicodeCharacter~properties=
['aliases','bidiClass','bidiClassName','bidiMirrored','boundClass','boundClassName','category','categoryName','charWidth','codepoint','combiningClass','controlBoundary','decompositionTypeName','decompositionType','ignorable','isLower','isUpper','name','toLowerFull','toLowerSimple','toTitleFull','toTitleSimple','toUpperFull','toUpperSimple','Unicode','UTF16BE','UTF16LE','UTF32BE','UTF32LE','UTF8']


-- ===============================================================================
-- 2023 May 24

-- For convenience, it's now possible to directly search for a character, provided it's made of only one codepoint:
ooRexx> .Unicode~character("a")=    -- ( "a"   U+0061 Ll 1 "LATIN SMALL LETTER A" )
( "a"   U+0061 Ll 1 "LATIN SMALL LETTER A" )
ooRexx> .Unicode~character("à")=    -- ( "à"   U+00E0 Ll 1 "LATIN SMALL LETTER A WITH GRAVE" )
( "à"   U+00E0 Ll 1 "LATIN SMALL LETTER A WITH GRAVE" )
ooRexx> .Unicode~character("à")=    -- Error: The character 'à' is made of several codepoints: U+0061 U+0300
The character 'à' is made of several codepoints: U+0061 U+0300.
Error code= 93.900

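-- In the last example, the character is in decomposed (NFD) form: it's made of 2 codepoints, hence the error.
-- For such a character, you can get an array of characters: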
ooRexx> "à"~text~UnicodeCharacters==
an Array (shape [2], 2 items)
 1 : ( "a"   U+0061 Ll 1 "LATIN SMALL LETTER A" )
 2 : ( "̀"    U+0300 Mn 0 "COMBINING GRAVE ACCENT" )


-- New method UnicodeCharacter~properties at instance level: Return a directory of properties.
ooRexx> .Unicode~character("U+000D")~properties=
a Directory (30 items)
'aliases'               : [(CARRIAGE RETURN),(CR)]
'bidiClass'             :  16
'bidiClassName'         : 'B'
'bidiMirrored'          :  0
'boundClass'            :  2
'boundClassName'        : 'CR'
'category'              :  26
'categoryName'          : 'Cc'
'charWidth'             :  0
'codepoint'             : 'U+000D'
'combiningClass'        :  0
'controlBoundary'       :  1
'decompositionType'     :  0
'decompositionTypeName' : 'None'
'ignorable'             :  0
'isLower'               :  0
'isUpper'               :  0
'name'                  : ''
'toLowerFull'           : 'U+000D'
'toLowerSimple'         : 'U+000D'
'toTitleFull'           : 'U+000D'
'toTitleSimple'         : 'U+000D'
'toUpperFull'           : 'U+000D'
'toUpperSimple'         : 'U+000D'
'Unicode'               : '0x0D'
'UTF16BE'               : '0x000D'
'UTF16LE'               : '0x0D00'
'UTF32BE'               : '0x0000000D'
'UTF32LE'               : '0x0D000000'
'UTF8'                  : '0x0D'


-- ===============================================================================
-- 2023 March 20

-- Rework the implementation of caselessMatch to correctly support the characters whose case folding changes the number of graphemes (here "ß" becomes "ss")
ooRexx> "Bundesstraße im Freiland"~text~caselessMatch(14, "im")=    -- .true
 1


-- ===============================================================================
-- 2023 March 08

-- Implementation of caselessMatchChar, matchChar
ooRexx> "Noëlle"~text~matchChar(2, "aeiouy")=                       -- 1
 1
ooRexx> "Noëlle"~text~matchChar(3, "aeiouy")=                       -- 0
 0
ooRexx> "Noëlle"~text~matchChar(3, "aeëiouy")=                      -- 1    include the accents in the list of accepted characters
 1
ooRexx> "Noëlle"~text~matchChar(3, "aeiouy", stripMark:)=           -- 1    or remove the accents from the tested string
 1
ooRexx> "Noëlle"~text~matchChar(6, "aeiouy")=                       -- 1
 1

ooRexx> "Bundesschnellstraße"~text~matchChar(14, "s")=              -- 1
 1
ooRexx> "Bundesschnellstraße"~text~matchChar(18, "s")=              -- 0
 0
ooRexx> "Bundesschnellstraße"~text~matchChar(18, "sß")=             -- 1
 1
ooRexx> "Bundesschnellstraße"~text~caselessMatchChar(18, "s")=      -- 0, was 1 before 2023.12.04    "ß" becomes "ss" which is 2 graphemes. The first grapheme at 18 matches "s"
 0
ooRexx> "Bundesschnellstraße"~text~caselessMatchChar(19, "s")=      -- 0    "ß" becomes "ss" which is 2 graphemes. The grapheme at 19 is "e", not the second "s"
 0
ooRexx> "Bundesschnellstraße"~text~caselessMatchChar(19, "e")=      -- 1    "ß" becomes "ss" which is 2 graphemes. The grapheme at 19 is "e", not the second "s"
 1

-- The ligature disappears in NFK[CD] but not in NF[CD]
ooRexx> "baﬄe"~text~NFKC=                                            -- T'baffle'
T'baffle'
ooRexx> "baﬄe"~text~NFKD=                                            -- T'baffle'
T'baffle'
ooRexx> "baﬄe"~text~matchChar(3, "f")=                               -- 0     "ﬄ" is ONE grapheme because NFC
 0
ooRexx> "baﬄe"~text~matchChar(3, "ﬄ")=                               -- 1     "ﬄ" is ONE grapheme because NFC
 1
ooRexx> "baﬄe"~text~matchChar(3, "ﬄ", normalization:.Unicode~NFKD)=  -- 1     "ﬄ" becomes "ffl" (3 graphemes). There is a match because the first grapheme is "f"
 1
ooRexx> "baﬄe"~text~matchChar(3, "f", normalization:.Unicode~NFKD)=  -- 0, was 1 before 2023.12.04     "ﬄ" becomes "ffl" (3 graphemes). There is a match because the first grapheme is "f"
 0
ooRexx> "baﬄe"~text~matchChar(4, "f", normalization:.Unicode~NFKD)=  -- 0     "ﬄ" becomes "ffl" (3 graphemes). The grapheme at 4 is "e", not the second "f"
 0
ooRexx> "baﬄe"~text~matchChar(4, "e", normalization:.Unicode~NFKD)=  -- 1     "ﬄ" becomes "ffl" (3 graphemes). The grapheme at 4 is "e", not the second "f"
 1
 1

-- The ligature disappears when casefolded
ooRexx> "baﬄe"~text~casefold=                                        -- T'baffle'
T'baffle'
ooRexx> "BAFFLE"~text~caselessMatchChar(3, "ﬄ")=                     -- 0, was 1 before 2023.12.04     "ﬄ" becomes "ffl" (3 graphemes), there is a match on "f" at 3
 0
ooRexx> "BAFFLE"~text~caselessMatchChar(5, "ﬄ")=                     -- 0, was 1 before 2023.12.04     "ﬄ" becomes "ffl" (3 graphemes), there is a match on "l" at 5
 0
ooRexx> "BAFFLE"~text~caselessMatchChar(5, "L")=                     -- 1      there is a match on "l" at 5 (forward to String)
 1


-- Implementation of caselessEquals, equals
ooRexx> "ŒUF"~text~caselessEquals("œuf")=           -- 1
 1
ooRexx> "œuf"~text~caselessEquals("ŒUF")=           -- 1
 1
ooRexx> "Straße"~text~caselessEquals("strasse")=    -- 1
 1
ooRexx> "strasse"~text~caselessEquals("Straße")=    -- 1
 1


-- Some ligatures are not decomposed by NFKC.
ooRexx> "ŒUF"~text~caselessEquals("oeuf")=                                  -- 0
 0
ooRexx> "ŒUF"~text~caselessEquals("oeuf", normalization:.Unicode~NFKC)=     -- 0
 0


-- ===============================================================================
-- 2022 November 20

/*
For consistency, all the conversion methods accept the named argument 'strict',
even if it's not needed for the Unicode encodings.
Previously, it was supported only for the byte encodings.
The default value of 'strict' is now .false.

The conversion methods accept the named argument 'memorize(3)'.
Its default value is given by .unicode~memorizeTranscodings (was memorizeConversions) which is .false by default.
Example:
    s = "hello"
    t = s~text
    utf16 = t~utf16(memorize:)
    utf32 = t~utf32(memorize:)
    t~utf16~"==":.object(utf16)=         -- 1
    t~utf32~"==":.object(utf32)=         -- 1
*/

/*
CP1252 to UTF-8, UTF-16, UTF-32
"Un œuf de chez MaPoule™ coûte ±0.40€"
*/
ooRexx> str_cp1252 = "Un " || "9C"x || "uf de chez MaPoule" || "99"x || " co" || "FB"x || "te " || "B1"x || "0.40" || "80"x
ooRexx> txt_cp1252 = str_cp1252~text("cp1252")
ooRexx> utf8  = txt_cp1252~utf8(memorize:)
ooRexx> utf16 = txt_cp1252~utf16(memorize:)
ooRexx> utf32 = txt_cp1252~utf32(memorize:)
ooRexx> txt_cp1252~utf8 ~"==":.object(utf8) =         -- 1
 1
ooRexx> txt_cp1252~utf16~"==":.object(utf16)=         -- 1
 1
ooRexx> txt_cp1252~utf32~"==":.object(utf32)=         -- 1
 1

/*
When an optional buffer is passed, must check that its encoding is compatible.
Done for the conversion methods.
Example:
*/
ooRexx> b = .mutablebuffer~new            -- No encoding yet
ooRexx> "hello"~text~utf16(buffer:b)      -- now the buffer's encoding is UTF-16
ooRexx> "bye"~text~utf8(buffer:b)         -- Encoding: cannot append UTF-8 to UTF-16BE '[00]h[00]e[00]l[00]l[00]o'.
Encoding: cannot append UTF-8 to UTF-16BE '[00]h[00]e[00]l[00]l[00]o'.
Error code= 23.900


-- ===============================================================================
-- 2022 November 08

/*
Additional arguments are supported by NFC, NFD, NFKC, NFKD, Casefold:
    lump
        Lumps certain different codepoints together.
        All the concerned characters become the same character, but still remain distinct characters.
        E.g. HYPHEN U+2010 and MINUS U+2212 to ASCII "-"
             all space characters (general category Zs) to U+0020
    stripIgnorable
        Strips the characters whose property Default_Ignorable_Code_Point = true
        such as SOFT-HYPHEN or ZERO-WIDTH-SPACE
    stripCC
        Strips and/or converts control characters:
        characters 00-1F and 7F-9F, except 09 which is replaced by 20.
    stripMark
        Strips all character markings:
        characters whose category is Mc Me Mn (i.e. accents)
            Mc Spacing Mark
            Me Enclosing Mark
            Mn Nonspacing Mark
        This option works only with normalization.
    stripNA
        Strips the characters whose category is Cn Unassigned
        Note that the value gc=Cn does not actually occur in UnicodeData.txt,
        because that data file does not list unassigned code points.

Remark: the normalization NFKC_Casefold (short alias NFKC_CF) is done with
    ~NFKC(Casefold: .true, stripIgnorable: .true)
*/

/*
Two RexxText values are considered equal if their extended grapheme clusters are canonically equivalent.
This is the definition of Swift.
Q&A https://lists.isocpp.org/sg16/2018/08/0121.php
TODO: confirm that it's NFC, and only that.
The definition of canonical equivalence by the Unicode standard seems not limited to NFC.
https://unicode.org/notes/tn5/

The strict comparison operators now use the NFC normalization (update: use .Unicode~defaultNormalization(strict:.true)).
After normalization, they delegate to the String's strict comparison operators.

The non-strict comparison operators now use the NFC normalization (update: use .Unicode~defaultNormalization(strict:.false))
plus
    stripIgnorable:.true
    lump:.true
After normalization + transformations, they delegate to the String's non-strict comparison operators.
Thanks to the lump transformation, all the Unicode spaces are supported.

Examples:
*/

ooRexx> textNFC = "Noël"~text~NFC
ooRexx> textNFC~UnicodeCharacters==
an Array (shape [4], 4 items)
 1 : ( "N"   U+004E Lu 1 "LATIN CAPITAL LETTER N" )
 2 : ( "o"   U+006F Ll 1 "LATIN SMALL LETTER O" )
 3 : ( "ë"   U+00EB Ll 1 "LATIN SMALL LETTER E WITH DIAERESIS" )
 4 : ( "l"   U+006C Ll 1 "LATIN SMALL LETTER L" )
ooRexx> textNFD="Noël"~text~NFD
ooRexx> textNFD~UnicodeCharacters==
an Array (shape [5], 5 items)
 1 : ( "N"   U+004E Lu 1 "LATIN CAPITAL LETTER N" )
 2 : ( "o"   U+006F Ll 1 "LATIN SMALL LETTER O" )
 3 : ( "e"   U+0065 Ll 1 "LATIN SMALL LETTER E" )
 4 : ( "̈"    U+0308 Mn 0 "COMBINING DIAERESIS" )
 5 : ( "l"   U+006C Ll 1 "LATIN SMALL LETTER L" )
ooRexx> (textNFC == textNFD)=                                               -- 1
 1
ooRexx> (textNFC = textNFD)=                                                -- 1
 1
ooRexx> (" "textNFC == textNFD" ")=                                         -- 0 because strict
 0
ooRexx> (" "textNFC = textNFD" ")=                                          -- 1
 1
ooRexx> (" "textNFC = (textNFD"\u{NBSP}")~unescape)=                        -- 1
 1
ooRexx> (" "textNFC = (textNFD"\u{ZWSP}")~unescape)=                        -- 1
 1
ooRexx> ("-"textNFC = ("\u{OBLIQUE HYPHEN}"textNFD"\u{ZWSP}")~unescape)=    -- 1
 1

ooRexx> "pere noel"~text~caselessCompareTo("Père Noël")=                    -- -1 (lesser)
-1
ooRexx> "pere noel"~text~caselessCompareTo("Père Noël", stripMark:.true)=   --  0 (equal because the accents are ignored)
 0

-- Add support for ISO-8859-1 encoding (alias Latin1).
-- Example:
-- all the supported characters: ranges 20-7E and A0-FF
ooRexx> text = xrange("20"x, "7E"x, "A0"x, "FF"x)~text("ISO-8859-1")

-- The ? are just ISO-8859-1 encoded characters that can't be displayed as-is in a console UTF-8 (copy-paste of the console output)
-- After conversion to UTF-8, all is good.
ooRexx> text=       -- T' !"#$%&''()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~???????????????????????????????????????????????????????????????????????????????????????????????[FF]'
T' !"#$%&''()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~�����������������������������������������������������������������������������������������������[FF]'
ooRexx> text~utf8=  -- T' !"#$%&''()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~ ¡¢£¤¥¦§¨©ª«¬­®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ'
T' !"#$%&''()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~ ¡¢£¤¥¦§¨©ª«¬­®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ'

-- ranges 00-1F and 7F-9F are undefined
-- Was: an error was triggered even with the option strict: .false, because there was no fallback mapping
-- Now: the conversion succeeds (see the result below)
ooRexx> text = xrange("20"x, "FF"x)~text("ISO-8859-1")
ooRexx> text~utf8(strict: .false)=                      -- was: Error ISO-8859-1 encoding: cannot convert ISO-8859-1 not-ASCII character 127 (7F) at byte-position 96 to UTF-8.
T' !"#$%&''()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~€‚ƒ„…†‡ˆ‰Š‹ŒŽ‘’“”•–—˜™š›œžŸ ¡¢£¤¥¦§¨©ª«¬­®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ'


-- ===============================================================================
-- 2022 November 06

/*
Refactoring
    Prefix the native methods by the library name (utf8proc_, ziglyph_ or icu4x_).
    That will make it easier to compare similar services.

    Remove the native methods 'NFC', 'NFD', 'NFKC', 'NFKD' and 'NFKC_Casefold':
    all replaced by 'utf8proc_transform'.

    ~Casefold is now limited to case fold.
    Previously, NFKC + case fold was applied (because the method NFKC_Casefold of utf8proc was called).

    NFC, NFD, NFKC and NFKD now support the named argument 'casefold' (default = .false).

Examples
*/
ooRexx> "Père Noël ß ㎒"~text~casefold=                      -- T'père noël ss ㎒'
T'père noël ss ㎒'
ooRexx> "Père Noël ß ㎒"~text~NFKC=                          -- T'Père Noël ß MHz'
T'Père Noël ß MHz'
ooRexx> "Père Noël ß ㎒"~text~NFKC(casefold:.true)=          -- T'père noël ss mhz'
T'père noël ss mhz'

/*
Performance
    NFC, NFD, NFKC, NFKD and Casefold now support the named argument 'returnString'.
    - When true, the returned value is a String.
    - When false (default), the returned value is a RexxText.
    Maybe this optimization will be replaced by a more general optimization: RexxText indexation on need.

    2 cached values are managed in case of memorization:
    - one for the main transformation,
    - one for the main transformation + case fold.
    That makes 9 possible cached values and 5 indicators per indexer (so per string).
        isCasefold                  CasefoldString
        isNFC       NFCString       NFCCasefoldString
        isNFD       NFDString       NFDCasefoldString
        isNFKC      NFKCString      NFKCCasefoldString
        isNFKD      NFKDString      NFKDCasefoldString

    The memorization can be activated globally:
    .Unicode~memorizeTransformations = .true

Examples
*/
-- Direct access to utf8proc, returns a string
ooRexx> s = "Père Noël ß ㎒"; do 10000; .Unicode~utf8proc_transform(s, normalization:3, casefold:.true); end -- Duration:   0.05
---
ooRexx> t = "Père Noël ß ㎒"~text; do 10000; t~NFKC(casefold:.true); end                                     -- Duration:   7.70
ooRexx> t = "Père Noël ß ㎒"~text; do 10000; t~NFKC(casefold:.true, returnString:.true); end                 -- Duration:   0.33
ooRexx> t = "Père Noël ß ㎒"~text; do 10000; t~NFKC(casefold:.true, returnString:.true, memorize:.true); end -- Duration:   0.11
-- The cache for NFKC  + casefold is different from the cache for NFKC only:
ooRexx> t = "Père Noël ß ㎒"~text; do 10000; t~NFKC; end                                                     -- Duration:   6.50
ooRexx> t = "Père Noël ß ㎒"~text; do 10000; t~NFKC(returnString:.true); end                                 -- Duration:   0.30
ooRexx> t = "Père Noël ß ㎒"~text; do 10000; t~NFKC(returnString:.true, memorize:.true); end                 -- Duration:   0.10


-- ===============================================================================
-- 2022 November 05

/*
New methods on RexxText
    caselessContains        (not ready: posText)
    caselessCompareTo
    caselessMatch
    caselessMatchChar       (not ready: matchCharText)
    caselessEndsWith        (not ready: endsWithText)
    caselessPos             (not ready: posText)
    caselessStartsWith      (not ready: posText)
    compareTo
    contains                (not ready: posText)
    endsWith                (not ready: endsWithText)
    match
    matchChar               (not ready: matchCharText)
    pos                     (not ready: posText)
    startsWith              (not ready: posText)

For caseless, apply NFC Casefold to all the text/string arguments.
Compared to the ooRexx methods, the purpose of these methods is to convert the grapheme indexes to/from byte indexes.
The real work is done by the ooRexx methods, called with the right byte indexes.
From a byte index returned by an ooRexx method, a grapheme index is derived.


Examples:
*/
ooRexx>                                                 --  1  2    3  4  5  6  7  8    9  (grapheme indexes)
ooRexx>                                                 --  1  2 3  4  5  6  7  8  9 10 11 (byte indexes)
ooRexx>     "père Noël"~text~c2x=                       -- '70 C3A8 72 65 20 4E 6F C3AB 6C'
'70 C3A8 72 65 20 4E 6F C3AB 6C'
ooRexx>                                                 --  p  è    r  e     N  o  ë    l
ooRexx>     "père Noël"~match(1, "Noël")=               -- .false (byte indexes)
 0
ooRexx>     "père Noël"~text~match(1, "Noël")=          -- .false (grapheme indexes)
 0
ooRexx>     "père Noël"~match(7, "Noël")=               -- .false (was ".true (byte indexes)" before automatic conversion of string literals to text)
 0
ooRexx>     "père Noël"~text~match(6, "Noël")=          -- .true (grapheme indexes)
 1
ooRexx>     "père Noël"~match(11, "Noël", 5)=           -- Invalid position argument specified; found "11" (was ".true (byte indexes)" before automatic conversion of string literals to text)
Invalid position argument specified; found "11".
Error code= 93.924
ooRexx>     "père Noël"~text~match(9, "Noël", 4)=       -- .true (grapheme indexes)
 1

ooRexx>     "père Noël"~text~caselessMatch(1, "NOËL")=  -- .false
 0
ooRexx>     "père Noël"~text~caselessMatch(6, "NOËL")=  -- .true
 1

ooRexx>     -- the first "äXü" is NFC, the second "äẌü" is NFD
ooRexx>     nfcString = "äXü"
ooRexx>         nfcText = nfcString~text
ooRexx>         nfcText~c2x=                            -- 'C3A4 58 C3BC'
'C3A4 58 C3BC'
ooRexx>         nfcText~UnicodeCharacters==
an Array (shape [3], 3 items)
 1 : ( "ä"   U+00E4 Ll 1 "LATIN SMALL LETTER A WITH DIAERESIS" )
 2 : ( "X"   U+0058 Lu 1 "LATIN CAPITAL LETTER X" )
 3 : ( "ü"   U+00FC Ll 1 "LATIN SMALL LETTER U WITH DIAERESIS" )
ooRexx>     nfdString = "äXü"
ooRexx>         nfdText = nfdString~text
ooRexx>         nfdText~c2x=                            -- '61 CC88 58 75 CC88'
'61 CC88 58 75 CC88'
ooRexx>         nfdText~UnicodeCharacters==
an Array (shape [5], 5 items)
 1 : ( "a"   U+0061 Ll 1 "LATIN SMALL LETTER A" )
 2 : ( "̈"    U+0308 Mn 0 "COMBINING DIAERESIS" )
 3 : ( "X"   U+0058 Lu 1 "LATIN CAPITAL LETTER X" )
 4 : ( "u"   U+0075 Ll 1 "LATIN SMALL LETTER U" )
 5 : ( "̈"    U+0308 Mn 0 "COMBINING DIAERESIS" )

ooRexx>     nfcString~match(1, nfdString)=              -- 1    (was "0 (because binary representation is different)" before automatic conversion of string literals to text)
 1
ooRexx>     nfcText  ~match(1, nfdText)=                -- 1
 1
ooRexx>     nfdText  ~match(1, nfcText)=                -- 1
 1

ooRexx>     -- match with "X"

ooRexx>     nfcString~match(3, nfdString, 4, 1)=        -- Invalid position argument specified; found "4"   (was "1 (byte indexes)" before automatic conversion of string literals to text)
Invalid position argument specified; found "4".
Error code= 93.924
ooRexx>     nfcText  ~match(2, nfdText,   2, 1)=        -- 1 (grapheme indexes)
 1

ooRexx>     nfdString~match(4, nfcString, 3, 1)=        -- Invalid position argument specified; found "4"   (was "1 (byte indexes)" before automatic conversion of string literals to text)
Invalid position argument specified; found "4".
Error code= 93.924
ooRexx>     nfdText  ~match(2, nfcText,   2, 2)=        -- 1 (grapheme indexes)
 1

-- ===============================================================================
-- 2022 October 15

/*
New native method .Unicode~utf8proc_transform
Mainly for internal use, will replace the current native methods NFC, NFD, NFKC, NFKD.
The purpose of this method is to support additional transformations provided by utf8proc.
Takes a byte string as input (UTF-8 encoded), returns a new transformed byte string as output (UTF-8).

Examples:
*/
ooRexx>     string = "\u{BEL}Le\u{IDEOGRAPHIC SPACE}\u{OGHAM SPACE MARK}\u{ZERO-WIDTH-SPACE}Père\t\u{HYPHEN}\u{SOFT-HYPHEN}\u{EN DASH}\u{EM DASH}Noël\x{EFB790}\r\n"
ooRexx>     text = string~text~unescape
ooRexx>     text~UnicodeCharacters==
an Array (shape [22], 22 items)
 1  : ( ""    U+0007 Cc 0 "", "ALERT", "BEL" )
 2  : ( "L"   U+004C Lu 1 "LATIN CAPITAL LETTER L" )
 3  : ( "e"   U+0065 Ll 1 "LATIN SMALL LETTER E" )
 4  : ( " "  U+3000 Zs 2 "IDEOGRAPHIC SPACE" )
 5  : ( " "   U+1680 Zs 1 "OGHAM SPACE MARK" )
 6  : ( "​"    U+200B Cf 0 "ZERO WIDTH SPACE", "ZWSP" )
 7  : ( "P"   U+0050 Lu 1 "LATIN CAPITAL LETTER P" )
 8  : ( "è"   U+00E8 Ll 1 "LATIN SMALL LETTER E WITH GRAVE" )
 9  : ( "r"   U+0072 Ll 1 "LATIN SMALL LETTER R" )
 10 : ( "e"   U+0065 Ll 1 "LATIN SMALL LETTER E" )
 11 : ( ""    U+0009 Cc 0 "", "CHARACTER TABULATION", "HORIZONTAL TABULATION", "HT", "TAB" )
 12 : ( "‐"   U+2010 Pd 1 "HYPHEN" )
 13 : ( "­"   U+00AD Cf 1 "SOFT HYPHEN", "SHY" )
 14 : ( "–"   U+2013 Pd 1 "EN DASH" )
 15 : ( "—"   U+2014 Pd 1 "EM DASH" )
 16 : ( "N"   U+004E Lu 1 "LATIN CAPITAL LETTER N" )
 17 : ( "o"   U+006F Ll 1 "LATIN SMALL LETTER O" )
 18 : ( "ë"   U+00EB Ll 1 "LATIN SMALL LETTER E WITH DIAERESIS" )
 19 : ( "l"   U+006C Ll 1 "LATIN SMALL LETTER L" )
 20 : ( "﷐"   U+FDD0 Cn 1 "" )
 21 : ( ""    U+000D Cc 0 "", "CARRIAGE RETURN", "CR" )
 22 : ( ""    U+000A Cc 0 "", "LINE FEED", "NEW LINE", "END OF LINE", "LF", "NL", "EOL" )

ooRexx>     text=                                                               -- T'[07]Le  ​Père[09]‐­–—Noël﷐[0D0A]'
T'[07]Le  ​Père[09]‐­–—Noël﷐[0D0A]'

ooRexx>     -- Performs unicode case folding, to be able to do a case-insensitive string comparison.
ooRexx>     .Unicode~utf8proc_transform(text~string, casefold:.true)=           --  '[07]le  ​père[09]‐­–—noël﷐[0D0A]'
'[07]le  ​père[09]‐­–—noël﷐[0D0A]'

ooRexx>     -- Strip "default ignorable characters" such as SOFT-HYPHEN or ZERO-WIDTH-SPACE
ooRexx>     .Unicode~utf8proc_transform(text~string, stripIgnorable:.true)=     --  '[07]Le  Père[09]‐–—Noël﷐[0D0A]'
'[07]Le  Père[09]‐–—Noël﷐[0D0A]'

ooRexx>     -- Lumps certain characters together. See lump.md for details:
ooRexx>     -- https://github.com/JuliaStrings/utf8proc/blob/master/lump.md
ooRexx>     -- E.g. HYPHEN U+2010 and MINUS U+2212 to ASCII "-"
ooRexx>     -- jlf: I was expecting to have only one space and one "-" but that's not the case
ooRexx>     -- Seems to be working as designed... All the characters concerned are replaced by the same character, but adjacent occurrences are not collapsed into one.
ooRexx>     .Unicode~utf8proc_transform(text~string, lump:.true)=               --  '[07]Le  ​Père[09]-­--Noël﷐[0D0A]'
'[07]Le  ​Père[09]-­--Noël﷐[0D0A]'

ooRexx>     -- NLF2LF: Convert LF, CRLF, CR and NEL into LF
ooRexx>     .Unicode~utf8proc_transform(text~string, NLF:1)=                    --  '[07]Le  ​Père[09]‐­–—Noël﷐[0A]'
'[07]Le  ​Père[09]‐­–—Noël﷐[0A]'

ooRexx>     -- NLF2LS: Convert LF, CRLF, CR and NEL into LS (U+2028 Zl 0 "LINE SEPARATOR")
ooRexx>     .Unicode~utf8proc_transform(text~string, NLF:2)=                    --  '[07]Le  ​Père[09]‐­–—Noël﷐'
'[07]Le  ​Père[09]‐­–—Noël﷐
'

ooRexx>     -- NLF2PS: convert LF, CRLF, CR and NEL into PS (U+2029 Zp 0 "PARAGRAPH SEPARATOR")
ooRexx>     .Unicode~utf8proc_transform(text~string, NLF:3)=                    --  '[07]Le  ​Père[09]‐­–—Noël﷐
'
'[07]Le  ​Père[09]‐­–—Noël﷐
'

ooRexx>     -- Strips and/or converts control characters.
ooRexx>     .Unicode~utf8proc_transform(text~string, stripCC:.true)=            --  'Le  ​Père ‐­–—Noël﷐ '
'Le  ​Père ‐­–—Noël﷐ '

ooRexx>     -- Strips all character markings.
ooRexx>     -- This includes non-spacing, spacing and enclosing (i.e. accents).
ooRexx>     -- This option works only with normalization.
ooRexx>     .Unicode~utf8proc_transform(text~string, stripMark:.true, normalization:1)=  --  '[07]Le  ​Pere[09]‐­–—Noel﷐[0D0A]'
'[07]Le  ​Pere[09]‐­–—Noel﷐[0D0A]'

ooRexx>     -- Strips unassigned codepoints.
ooRexx>     .Unicode~utf8proc_transform(text~string, stripNA:.true)=            --  '[07]Le  ​Père[09]‐­–—Noël[0D0A]'
'[07]Le  ​Père[09]‐­–—Noël[0D0A]'

ooRexx>     -- Application of several options
ooRexx>     .Unicode~utf8proc_transform(text~string, casefold:.true, lump:.true, normalization:1, stripIgnorable:.true, stripCC:.true, stripMark:.true, stripNA:.true)= --  'le  pere ---noel '
'le  pere ---noel '


-- ===============================================================================
-- 2022 September 14

/*
New methods on RexxText
    center
    centre
Examples:
*/
ooRexx>     "noël👩‍👨‍👩‍👧🎅"~text~description=                  -- 'UTF-8 not-ASCII (6 graphemes, 12 codepoints, 34 bytes, 0 error)'
'UTF-8 not-ASCII (6 characters, 12 codepoints, 34 bytes, 0 error)'
ooRexx>     "noël👩‍👨‍👩‍👧🎅"~text~center(10)=                   -- T'  noël👩‍👨‍👩‍👧🎅  '
T'  noël👩‍👨‍👩‍👧🎅  '
ooRexx>     "noël👩‍👨‍👩‍👧🎅"~text~center(10)~description=       -- 'UTF-8 not-ASCII (10 graphemes, 16 codepoints, 38 bytes, 0 error)'
'UTF-8 not-ASCII (10 characters, 16 codepoints, 38 bytes, 0 error)'
ooRexx>     pad = "═"
ooRexx>     pad~description=                                          -- 'UTF-8 not-ASCII (1 character, 1 codepoint, 3 bytes, 0 error)' (was 'UTF-8 not-ASCII (1 grapheme, 1 codepoint, 3 bytes, 0 error)' before automatic conversion of string literals to text)
'UTF-8 not-ASCII (1 character, 1 codepoint, 3 bytes, 0 error)'
ooRexx>     pad~c2x=                                                  -- 'E29590'
'E29590'
ooRexx>     "noël👩‍👨‍👩‍👧🎅"~text~center(10, pad)=              -- T'══noël👩‍👨‍👩‍👧🎅══'
T'══noël👩‍👨‍👩‍👧🎅══'
ooRexx>     "noël👩‍👨‍👩‍👧🎅"~text~center(10, pad)~description=  -- 'UTF-8 not-ASCII (10 graphemes, 16 codepoints, 46 bytes, 0 error)'
'UTF-8 not-ASCII (10 characters, 16 codepoints, 46 bytes, 0 error)'


-- ===============================================================================
-- 2022 September 09


-- Start working on encoding~previousCodepointIndexB:
ooRexx>     "🎅noël"~text~c2x=  -- 'F09F8E85 6E 6F C3AB 6C'
'F09F8E85 6E 6F C3AB 6C'
ooRexx>     .utf8_encoding~previousCodepointIndexB("🎅noël", 0)=   -- 0
 0
ooRexx>     .utf8_encoding~previousCodepointIndexB("🎅noël", 1)=   -- 1
 1
ooRexx>     .utf8_encoding~previousCodepointIndexB("🎅noël", 2)=   -- 1
 1
ooRexx>     .utf8_encoding~previousCodepointIndexB("🎅noël", 3)=   -- 1
 1
ooRexx>     .utf8_encoding~previousCodepointIndexB("🎅noël", 4)=   -- 1
 1
ooRexx>     .utf8_encoding~previousCodepointIndexB("🎅noël", 5)=   -- 1
 1
ooRexx>     .utf8_encoding~previousCodepointIndexB("🎅noël", 6)=   -- 5
 5
ooRexx>     .utf8_encoding~previousCodepointIndexB("🎅noël", 7)=   -- 6
 6
-- Currently, only Byte_encoding and UTF8_encoding support this new method.
-- Still a lot of work to do to detect the same errors as nextCodepointIndex.


-- ===============================================================================
-- 2022 September 08

-- Set/get an encoding on a string without having an associated RexxText
-- (similar to MutableBuffer)
ooRexx> s = "nonsense"
ooRexx> s~encoding =                      -- returns the default encoding: (The UTF8_Encoding class)
(The UTF8_Encoding class)
ooRexx> s~hasText =                       -- 0
 0
ooRexx> s~encoding = .UTF16BE_Encoding    -- tag the string: encoded UTF16BE
ooRexx> s~encoding =                      -- (The UTF16BE_Encoding class)
(The UTF16BE_Encoding class)
ooRexx> s~hasText =                       -- still no associated RexxText: 0
 0
ooRexx> t = s~text                        -- associates a RexxText to the string
ooRexx> s~hasText =                       -- the string has an associated text: 1
 1
ooRexx> t~encoding =                      -- the encoding of the text is the one of the string: (The UTF16BE_Encoding class)
(The UTF16BE_Encoding class)
ooRexx> t~utf8 =                          -- T'湯湳敮獥'      Soup
T'湯湳敮獥'
-- Setting/getting the encoding of the string will set/get the encoding of the associated RexxText
ooRexx> s~encoding = .UTF16LE_Encoding
ooRexx> t~encoding =                      -- the encoding of the text has been changed: (The UTF16LE_Encoding class)
(The UTF16LE_Encoding class)
ooRexx> t~utf8 =                          -- T'潮獮湥敳'      tide
T'潮獮湥敳'


-- ===============================================================================
-- 2022 September 07

/*
Add method MutableBuffer~isASCII
The implementation is more complex than for String, because the buffer is mutable.
It tries to avoid rescanning the whole buffer whenever possible.
The native methods that modify the buffer never scan it; they
only update the boolean indicators is_ASCII_checked and is_ASCII.
Only the Rexx method ~isASCII scans the whole buffer, if needed.
Impacted methods:
    append
    caselessChangeStr
    changeStr
    delete
    delWord
    insert
    overlay
    replaceAt
    setBufferSize
    space
    translate
*/

ooRexx> b = .MutableBuffer~new("pere")
ooRexx> b~isASCII =                             -- 1
 1
ooRexx> b~insert("noël", 5)=                    -- M'pere noël'
M'pere noël'
ooRexx> b~isASCII =                             -- 0
 0
ooRexx> b~setBufferSize(7)=                     -- M'pere no'
M'pere no'
ooRexx> b~isASCII=                              -- 1
 1
ooRexx> b~append("ë", "l")=                     -- M'pere noël'
M'pere noël'
ooRexx> b~isASCII=                              -- 0
 0
ooRexx> b~replaceAt("e", 8, 2)=                 -- M'pere noel'
M'pere noel'
ooRexx> b~isASCII=                              -- 1 expected ('pere noel' is pure ASCII), but the actual result shown below is 0: to investigate
 0
ooRexx> b~changeStr("noel", "noël")=            -- M'pere noël'
M'pere noël'
ooRexx> b~isASCII=                              -- 0
 0
ooRexx> b~delete(8,2)=                          -- M'pere nol'
M'pere nol'
ooRexx> b~isASCII=                              -- 1
 1
ooRexx> b~overlay("ël", 8)=                     -- M'pere noël'
M'pere noël'
ooRexx> b~isASCII=                              -- 0
 0
ooRexx> b~delWord(2)=                           -- M'pere '
M'pere '
ooRexx> b~isASCII=                              -- 1
 1
ooRexx> b~translate("è" || "91"x, "er ")=       -- M'pèÑ'    ("è" is "C3A8"x so "e"-->"C3"x, "r"-->"A8"x and " "-->"91"x)
M'pèÑ'
ooRexx> b~isASCII=                              -- 0
 0


-- ===============================================================================
-- 2022 August 18

/*
Added Unicode case folding.
See https://www.w3.org/TR/charmod-norm/
Case folding is the process of making two texts which differ only in case identical for comparison purposes.
Implemented with utf8proc, which applies an NFKC normalization on the case-folded string.

Methods on RexxText:
    ~Casefold   ~isCasefold
*/
ooRexx> "ß"~text~casefold=               -- T'ss'
T'ss'
ooRexx> "㎒"~text~casefold=              -- T'mhz'   (jlf Nov 8, 2022: now unchanged because no longer NFKC)
T'㎒'

ooRexx> ("sTrasse", "straße", "STRASSE")~each{item~text~casefold}==
an Array (shape [3], 3 items)
 1 : T'strasse'
 2 : T'strasse'
 3 : T'strasse'

-- utf8proc doesn't support language-sensitive case-folding.
-- Example:
-- The name of the second largest city in Turkey is "Diyarbakır", which contains both the dotted and dotless letters i.
ooRexx> "Diyarbakır"~text~upper=        -- T'DIYARBAKIR'   should be DİYARBAKIR
T'DIYARBAKIR'
ooRexx> "DİYARBAKIR"~text~casefold=     -- T'di̇yarbakir'   should be diyarbakır
T'di̇yarbakir'

-- The Julia developers, who use utf8proc, have decided to remain locale-independent.
-- See https://github.com/JuliaLang/julia/issues/7848


-- ===============================================================================
-- 2022 August 07

/*
Added normalization NFC, NFD, NFKC, NFKD.
http://unicode.org/faq/normalization.html
Implemented with utf8proc.

Methods on RexxText:
    ~NFC    ~isNFC
    ~NFD    ~isNFD
    ~NFKC   ~isNFKC
    ~NFKD   ~isNFKD

Possible values for isNFxx:
    -1  unknown
     0  no
     1  yes
The same text can be in several normalization forms at once.
Text exclusively containing ASCII characters (U+0000..U+007F) is left unaffected
by all of the Normalization Forms: The 4 indicators isNFxx are 1.

Each NFxx method sets the corresponding indicator isNFxx
- on the source text : 0 or 1 (test if both strings are equal)
- on the result text : 1
*/

-- The normalized text can be memorized on the original text:
ooRexx>     text = "père Noël"~text
ooRexx>     textNFD = text~nfd(memorize:.true)
-- From now, the returned NFD is always the memorized text:
ooRexx>     text~nfd == textNFD=                    -- .true
 1


/*
    Some remarks about the string used in this demo:
    - the first "äöü" is NFC, the second "äöü" is NFD
    - "x̂" is two codepoints in any normalization.
    - "ϔ" normalization forms are all different.
    - "ﷺ" is one of the worst cases regarding the expansion factor in NFKC/NFKD: 18x
    - "baffle"~text~subchar(3)=     -- T'ffl'
      "baffle"~text~upper=          -- T'BAfflE', should be BAFFLE (to rework: utf8proc supports only simple uppercase)
      The ligature disappears in NFK[CD] but not in NF[CD]
*/
ooRexx> "äöü äöü x̂ ϔ ﷺ baffle"~text~UnicodeCharacters==
an Array (shape [22], 22 items)
 1  : ( "ä"   U+00E4 Ll 1 "LATIN SMALL LETTER A WITH DIAERESIS" )
 2  : ( "ö"   U+00F6 Ll 1 "LATIN SMALL LETTER O WITH DIAERESIS" )
 3  : ( "ü"   U+00FC Ll 1 "LATIN SMALL LETTER U WITH DIAERESIS" )
 4  : ( " "   U+0020 Zs 1 "SPACE", "SP" )
 5  : ( "a"   U+0061 Ll 1 "LATIN SMALL LETTER A" )
 6  : ( "̈"    U+0308 Mn 0 "COMBINING DIAERESIS" )
 7  : ( "o"   U+006F Ll 1 "LATIN SMALL LETTER O" )
 8  : ( "̈"    U+0308 Mn 0 "COMBINING DIAERESIS" )
 9  : ( "u"   U+0075 Ll 1 "LATIN SMALL LETTER U" )
 10 : ( "̈"    U+0308 Mn 0 "COMBINING DIAERESIS" )
 11 : ( " "   U+0020 Zs 1 "SPACE", "SP" )
 12 : ( "x"   U+0078 Ll 1 "LATIN SMALL LETTER X" )
 13 : ( "̂"    U+0302 Mn 0 "COMBINING CIRCUMFLEX ACCENT" )
 14 : ( " "   U+0020 Zs 1 "SPACE", "SP" )
 15 : ( "ϔ"   U+03D4 Lu 1 "GREEK UPSILON WITH DIAERESIS AND HOOK SYMBOL" )
 16 : ( " "   U+0020 Zs 1 "SPACE", "SP" )
 17 : ( "ﷺ"   U+FDFA Lo 1 "ARABIC LIGATURE SALLALLAHOU ALAYHE WASALLAM" )
 18 : ( " "   U+0020 Zs 1 "SPACE", "SP" )
 19 : ( "b"   U+0062 Ll 1 "LATIN SMALL LETTER B" )
 20 : ( "a"   U+0061 Ll 1 "LATIN SMALL LETTER A" )
 21 : ( "ffl"   U+FB04 Ll 1 "LATIN SMALL LIGATURE FFL" )
 22 : ( "e"   U+0065 Ll 1 "LATIN SMALL LETTER E" )
ooRexx> "äöü äöü x̂ ϔ ﷺ baffle"~text~description=      -- 'UTF-8 not-ASCII (18 graphemes, 22 codepoints, 34 bytes, 0 error)'
'UTF-8 not-ASCII (18 characters, 22 codepoints, 34 bytes, 0 error)'
ooRexx> "äöü äöü x̂ ϔ ﷺ baffle"~text~upper=            -- T'ÄÖÜ ÄÖÜ X̂ ϔ ﷺ BAfflE
T'ÄÖÜ ÄÖÜ X̂ ϔ ﷺ BAfflE'

/*
    NFD
    Normalization Form D
    Canonical Decomposition
    Characters are decomposed by canonical equivalence, and multiple combining characters are arranged in a specific order.
*/
ooRexx> "äöü äöü x̂ ϔ ﷺ baffle"~text~nfd~UnicodeCharacters==
an Array (shape [26], 26 items)
 1  : ( "a"   U+0061 Ll 1 "LATIN SMALL LETTER A" )
 2  : ( "̈"    U+0308 Mn 0 "COMBINING DIAERESIS" )
 3  : ( "o"   U+006F Ll 1 "LATIN SMALL LETTER O" )
 4  : ( "̈"    U+0308 Mn 0 "COMBINING DIAERESIS" )
 5  : ( "u"   U+0075 Ll 1 "LATIN SMALL LETTER U" )
 6  : ( "̈"    U+0308 Mn 0 "COMBINING DIAERESIS" )
 7  : ( " "   U+0020 Zs 1 "SPACE", "SP" )
 8  : ( "a"   U+0061 Ll 1 "LATIN SMALL LETTER A" )
 9  : ( "̈"    U+0308 Mn 0 "COMBINING DIAERESIS" )
 10 : ( "o"   U+006F Ll 1 "LATIN SMALL LETTER O" )
 11 : ( "̈"    U+0308 Mn 0 "COMBINING DIAERESIS" )
 12 : ( "u"   U+0075 Ll 1 "LATIN SMALL LETTER U" )
 13 : ( "̈"    U+0308 Mn 0 "COMBINING DIAERESIS" )
 14 : ( " "   U+0020 Zs 1 "SPACE", "SP" )
 15 : ( "x"   U+0078 Ll 1 "LATIN SMALL LETTER X" )
 16 : ( "̂"    U+0302 Mn 0 "COMBINING CIRCUMFLEX ACCENT" )
 17 : ( " "   U+0020 Zs 1 "SPACE", "SP" )
 18 : ( "ϒ"   U+03D2 Lu 1 "GREEK UPSILON WITH HOOK SYMBOL" )
 19 : ( "̈"    U+0308 Mn 0 "COMBINING DIAERESIS" )
 20 : ( " "   U+0020 Zs 1 "SPACE", "SP" )
 21 : ( "ﷺ"   U+FDFA Lo 1 "ARABIC LIGATURE SALLALLAHOU ALAYHE WASALLAM" )
 22 : ( " "   U+0020 Zs 1 "SPACE", "SP" )
 23 : ( "b"   U+0062 Ll 1 "LATIN SMALL LETTER B" )
 24 : ( "a"   U+0061 Ll 1 "LATIN SMALL LETTER A" )
 25 : ( "ffl"   U+FB04 Ll 1 "LATIN SMALL LIGATURE FFL" )
 26 : ( "e"   U+0065 Ll 1 "LATIN SMALL LETTER E" )
ooRexx> "äöü äöü x̂ ϔ ﷺ baffle"~text~nfd~description=  -- 'UTF-8 not-ASCII (18 graphemes, 26 codepoints, 39 bytes, 0 error)'
'UTF-8 not-ASCII (18 characters, 26 codepoints, 39 bytes, 0 error)'
ooRexx> "äöü äöü x̂ ϔ ﷺ baffle"~text~nfd~upper=        -- T'ÄÖÜ ÄÖÜ X̂ ϔ ﷺ BAfflE'
T'ÄÖÜ ÄÖÜ X̂ ϔ ﷺ BAfflE'

/*
    NFC
    Normalization Form C
    Canonical Decomposition, followed by Canonical Composition
    Characters are decomposed and then recomposed by canonical equivalence.
*/
ooRexx> "äöü äöü x̂ ϔ ﷺ baffle"~text~nfc~UnicodeCharacters==
an Array (shape [19], 19 items)
 1  : ( "ä"   U+00E4 Ll 1 "LATIN SMALL LETTER A WITH DIAERESIS" )
 2  : ( "ö"   U+00F6 Ll 1 "LATIN SMALL LETTER O WITH DIAERESIS" )
 3  : ( "ü"   U+00FC Ll 1 "LATIN SMALL LETTER U WITH DIAERESIS" )
 4  : ( " "   U+0020 Zs 1 "SPACE", "SP" )
 5  : ( "ä"   U+00E4 Ll 1 "LATIN SMALL LETTER A WITH DIAERESIS" )
 6  : ( "ö"   U+00F6 Ll 1 "LATIN SMALL LETTER O WITH DIAERESIS" )
 7  : ( "ü"   U+00FC Ll 1 "LATIN SMALL LETTER U WITH DIAERESIS" )
 8  : ( " "   U+0020 Zs 1 "SPACE", "SP" )
 9  : ( "x"   U+0078 Ll 1 "LATIN SMALL LETTER X" )
 10 : ( "̂"    U+0302 Mn 0 "COMBINING CIRCUMFLEX ACCENT" )
 11 : ( " "   U+0020 Zs 1 "SPACE", "SP" )
 12 : ( "ϔ"   U+03D4 Lu 1 "GREEK UPSILON WITH DIAERESIS AND HOOK SYMBOL" )
 13 : ( " "   U+0020 Zs 1 "SPACE", "SP" )
 14 : ( "ﷺ"   U+FDFA Lo 1 "ARABIC LIGATURE SALLALLAHOU ALAYHE WASALLAM" )
 15 : ( " "   U+0020 Zs 1 "SPACE", "SP" )
 16 : ( "b"   U+0062 Ll 1 "LATIN SMALL LETTER B" )
 17 : ( "a"   U+0061 Ll 1 "LATIN SMALL LETTER A" )
 18 : ( "ffl"   U+FB04 Ll 1 "LATIN SMALL LIGATURE FFL" )
 19 : ( "e"   U+0065 Ll 1 "LATIN SMALL LETTER E" )
ooRexx> "äöü äöü x̂ ϔ ﷺ baffle"~text~nfc~description=  -- 'UTF-8 not-ASCII (18 graphemes, 19 codepoints, 31 bytes, 0 error)'
'UTF-8 not-ASCII (18 characters, 19 codepoints, 31 bytes, 0 error)'
ooRexx> "äöü äöü x̂ ϔ ﷺ baffle"~text~nfc~upper=        -- T'ÄÖÜ ÄÖÜ X̂ ϔ ﷺ BAfflE'
T'ÄÖÜ ÄÖÜ X̂ ϔ ﷺ BAfflE'

/*
    NFKD
    Normalization Form KD
    Compatibility Decomposition (K is used to stand for compatibility to avoid confusion with the C standing for composition)
    Characters are decomposed by compatibility, and multiple combining characters are arranged in a specific order.
*/
ooRexx> "äöü äöü x̂ ϔ ﷺ baffle"~text~nfkd~UnicodeCharacters==
an Array (shape [45], 45 items)
 1  : ( "a"   U+0061 Ll 1 "LATIN SMALL LETTER A" )
 2  : ( "̈"    U+0308 Mn 0 "COMBINING DIAERESIS" )
 3  : ( "o"   U+006F Ll 1 "LATIN SMALL LETTER O" )
 4  : ( "̈"    U+0308 Mn 0 "COMBINING DIAERESIS" )
 5  : ( "u"   U+0075 Ll 1 "LATIN SMALL LETTER U" )
 6  : ( "̈"    U+0308 Mn 0 "COMBINING DIAERESIS" )
 7  : ( " "   U+0020 Zs 1 "SPACE", "SP" )
 8  : ( "a"   U+0061 Ll 1 "LATIN SMALL LETTER A" )
 9  : ( "̈"    U+0308 Mn 0 "COMBINING DIAERESIS" )
 10 : ( "o"   U+006F Ll 1 "LATIN SMALL LETTER O" )
 11 : ( "̈"    U+0308 Mn 0 "COMBINING DIAERESIS" )
 12 : ( "u"   U+0075 Ll 1 "LATIN SMALL LETTER U" )
 13 : ( "̈"    U+0308 Mn 0 "COMBINING DIAERESIS" )
 14 : ( " "   U+0020 Zs 1 "SPACE", "SP" )
 15 : ( "x"   U+0078 Ll 1 "LATIN SMALL LETTER X" )
 16 : ( "̂"    U+0302 Mn 0 "COMBINING CIRCUMFLEX ACCENT" )
 17 : ( " "   U+0020 Zs 1 "SPACE", "SP" )
 18 : ( "Υ"   U+03A5 Lu 1 "GREEK CAPITAL LETTER UPSILON" )
 19 : ( "̈"    U+0308 Mn 0 "COMBINING DIAERESIS" )
 20 : ( " "   U+0020 Zs 1 "SPACE", "SP" )
 21 : ( "ص"   U+0635 Lo 1 "ARABIC LETTER SAD" )
 22 : ( "ل"   U+0644 Lo 1 "ARABIC LETTER LAM" )
 23 : ( "ى"   U+0649 Lo 1 "ARABIC LETTER ALEF MAKSURA" )
 24 : ( " "   U+0020 Zs 1 "SPACE", "SP" )
 25 : ( "ا"   U+0627 Lo 1 "ARABIC LETTER ALEF" )
 26 : ( "ل"   U+0644 Lo 1 "ARABIC LETTER LAM" )
 27 : ( "ل"   U+0644 Lo 1 "ARABIC LETTER LAM" )
 28 : ( "ه"   U+0647 Lo 1 "ARABIC LETTER HEH" )
 29 : ( " "   U+0020 Zs 1 "SPACE", "SP" )
 30 : ( "ع"   U+0639 Lo 1 "ARABIC LETTER AIN" )
 31 : ( "ل"   U+0644 Lo 1 "ARABIC LETTER LAM" )
 32 : ( "ي"   U+064A Lo 1 "ARABIC LETTER YEH" )
 33 : ( "ه"   U+0647 Lo 1 "ARABIC LETTER HEH" )
 34 : ( " "   U+0020 Zs 1 "SPACE", "SP" )
 35 : ( "و"   U+0648 Lo 1 "ARABIC LETTER WAW" )
 36 : ( "س"   U+0633 Lo 1 "ARABIC LETTER SEEN" )
 37 : ( "ل"   U+0644 Lo 1 "ARABIC LETTER LAM" )
 38 : ( "م"   U+0645 Lo 1 "ARABIC LETTER MEEM" )
 39 : ( " "   U+0020 Zs 1 "SPACE", "SP" )
 40 : ( "b"   U+0062 Ll 1 "LATIN SMALL LETTER B" )
 41 : ( "a"   U+0061 Ll 1 "LATIN SMALL LETTER A" )
 42 : ( "f"   U+0066 Ll 1 "LATIN SMALL LETTER F" )
 43 : ( "f"   U+0066 Ll 1 "LATIN SMALL LETTER F" )
 44 : ( "l"   U+006C Ll 1 "LATIN SMALL LETTER L" )
 45 : ( "e"   U+0065 Ll 1 "LATIN SMALL LETTER E" )
ooRexx> "äöü äöü x̂ ϔ ﷺ baffle"~text~nfkd~description= -- 'UTF-8 not-ASCII (37 graphemes, 45 codepoints, 69 bytes, 0 error)'
'UTF-8 not-ASCII (37 characters, 45 codepoints, 69 bytes, 0 error)'
ooRexx> "äöü äöü x̂ ϔ ﷺ baffle"~text~nfkd~upper=       -- T'ÄÖÜ ÄÖÜ X̂ Ϋ صلى الله عليه وسلم BAFFLE
T'ÄÖÜ ÄÖÜ X̂ Ϋ صلى الله عليه وسلم BAFFLE'

/*
    NFKC
    Normalization Form KC
    Compatibility Decomposition, followed by Canonical Composition
    Characters are decomposed by compatibility, then recomposed by canonical equivalence.
*/
ooRexx> "äöü äöü x̂ ϔ ﷺ baffle"~text~nfkc~UnicodeCharacters==
an Array (shape [38], 38 items)
 1  : ( "ä"   U+00E4 Ll 1 "LATIN SMALL LETTER A WITH DIAERESIS" )
 2  : ( "ö"   U+00F6 Ll 1 "LATIN SMALL LETTER O WITH DIAERESIS" )
 3  : ( "ü"   U+00FC Ll 1 "LATIN SMALL LETTER U WITH DIAERESIS" )
 4  : ( " "   U+0020 Zs 1 "SPACE", "SP" )
 5  : ( "ä"   U+00E4 Ll 1 "LATIN SMALL LETTER A WITH DIAERESIS" )
 6  : ( "ö"   U+00F6 Ll 1 "LATIN SMALL LETTER O WITH DIAERESIS" )
 7  : ( "ü"   U+00FC Ll 1 "LATIN SMALL LETTER U WITH DIAERESIS" )
 8  : ( " "   U+0020 Zs 1 "SPACE", "SP" )
 9  : ( "x"   U+0078 Ll 1 "LATIN SMALL LETTER X" )
 10 : ( "̂"    U+0302 Mn 0 "COMBINING CIRCUMFLEX ACCENT" )
 11 : ( " "   U+0020 Zs 1 "SPACE", "SP" )
 12 : ( "Ϋ"   U+03AB Lu 1 "GREEK CAPITAL LETTER UPSILON WITH DIALYTIKA" )
 13 : ( " "   U+0020 Zs 1 "SPACE", "SP" )
 14 : ( "ص"   U+0635 Lo 1 "ARABIC LETTER SAD" )
 15 : ( "ل"   U+0644 Lo 1 "ARABIC LETTER LAM" )
 16 : ( "ى"   U+0649 Lo 1 "ARABIC LETTER ALEF MAKSURA" )
 17 : ( " "   U+0020 Zs 1 "SPACE", "SP" )
 18 : ( "ا"   U+0627 Lo 1 "ARABIC LETTER ALEF" )
 19 : ( "ل"   U+0644 Lo 1 "ARABIC LETTER LAM" )
 20 : ( "ل"   U+0644 Lo 1 "ARABIC LETTER LAM" )
 21 : ( "ه"   U+0647 Lo 1 "ARABIC LETTER HEH" )
 22 : ( " "   U+0020 Zs 1 "SPACE", "SP" )
 23 : ( "ع"   U+0639 Lo 1 "ARABIC LETTER AIN" )
 24 : ( "ل"   U+0644 Lo 1 "ARABIC LETTER LAM" )
 25 : ( "ي"   U+064A Lo 1 "ARABIC LETTER YEH" )
 26 : ( "ه"   U+0647 Lo 1 "ARABIC LETTER HEH" )
 27 : ( " "   U+0020 Zs 1 "SPACE", "SP" )
 28 : ( "و"   U+0648 Lo 1 "ARABIC LETTER WAW" )
 29 : ( "س"   U+0633 Lo 1 "ARABIC LETTER SEEN" )
 30 : ( "ل"   U+0644 Lo 1 "ARABIC LETTER LAM" )
 31 : ( "م"   U+0645 Lo 1 "ARABIC LETTER MEEM" )
 32 : ( " "   U+0020 Zs 1 "SPACE", "SP" )
 33 : ( "b"   U+0062 Ll 1 "LATIN SMALL LETTER B" )
 34 : ( "a"   U+0061 Ll 1 "LATIN SMALL LETTER A" )
 35 : ( "f"   U+0066 Ll 1 "LATIN SMALL LETTER F" )
 36 : ( "f"   U+0066 Ll 1 "LATIN SMALL LETTER F" )
 37 : ( "l"   U+006C Ll 1 "LATIN SMALL LETTER L" )
 38 : ( "e"   U+0065 Ll 1 "LATIN SMALL LETTER E" )
ooRexx> "äöü äöü x̂ ϔ ﷺ baffle"~text~nfkc~description= -- 'UTF-8 not-ASCII (37 graphemes, 38 codepoints, 61 bytes, 0 error)'
'UTF-8 not-ASCII (37 characters, 38 codepoints, 61 bytes, 0 error)'
ooRexx> "äöü äöü x̂ ϔ ﷺ baffle"~text~nfkc~upper=       -- T'ÄÖÜ ÄÖÜ X̂ Ϋ صلى الله عليه وسلم BAFFLE'
T'ÄÖÜ ÄÖÜ X̂ Ϋ صلى الله عليه وسلم BAFFLE'


-- The normalization forms are implemented only for UTF-8 and WTF-8.
ooRexx> "D800 DC01"x~text("utf16")~nfd~UnicodeCharacters==  -- Method TRANSFORM is ABSTRACT and cannot be directly invoked.
Method TRANSFORM is ABSTRACT and cannot be directly invoked.
Error code= 93.965
ooRexx> "D800 DC01"x~text("utf16")~utf8~nfd~UnicodeCharacters==
an Array (shape [1], 1 items)
 1 : ( "𐀁"   U+10001 Lo 1 "LINEAR B SYLLABLE B038 E" )
ooRexx> "\uD800\uDC01"~text("wtf8")~unescape~nfd~UnicodeCharacters==
an Array (shape [1], 1 items)
 1 : ( "𐀁"   U+10001 Lo 1 "LINEAR B SYLLABLE B038 E" )

-- If the WTF-8 string is not a valid UTF-8 string then an error is raised by utf8proc
ooRexx> "D800"x     ~text("wtf16")~wtf8~nfd~UnicodeCharacters==    -- Invalid UTF-8 string
Invalid UTF-8 string.
Error code= 22.900
ooRexx> "\uD800"~text("wtf8")~unescape~nfd~UnicodeCharacters==     -- Invalid UTF-8 string
Invalid UTF-8 string.
Error code= 22.900


-- ===============================================================================
-- 2022 August 03

/*
https://discourse.julialang.org/t/stupid-question-on-unicode/27674/10
    Should I support this when unescaping?
    (High surrogate followed by low surrogate)
    Surrogate pairs are a UTF-16-specific construct.
    However, string escapes aren’t byte sequences of a particular encoding.
    They are somewhat arbitrary substitutions / macros.
*/

ooRexx>         "\uD83D\uDE3F"~text~unescape~errors==
an Array (shape [6], 6 items)
 1 : 'UTF-8 encoding: byte sequence at byte-position 1 has an invalid continuation byte 160 (A0x) at byte-position 2 (high surrogate, use WTF-8).'
 2 : 'UTF-8 encoding: byte sequence at byte-position 2 has an invalid start byte 160 (A0x) (non-shortest form).'
 3 : 'UTF-8 encoding: byte sequence at byte-position 3 has an invalid start byte 189 (BDx) (non-shortest form).'
 4 : 'UTF-8 encoding: byte sequence at byte-position 4 has an invalid continuation byte 184 (B8x) at byte-position 5 (low surrogate, use WTF-8).'
 5 : 'UTF-8 encoding: byte sequence at byte-position 5 has an invalid start byte 184 (B8x) (non-shortest form).'
 6 : 'UTF-8 encoding: byte sequence at byte-position 6 has an invalid start byte 191 (BFx) (non-shortest form).'

ooRexx>         "\uD83D\uDE3F"~text~wtf8~unescape~errors==
(The NIL object)

--    Yes, I should support it when the encoding is WTF-8, because concatenation already handles this case correctly:
ooRexx>         ("\uD83D"~text~wtf8~unescape || "\uDE3F"~text~wtf8~unescape)~UnicodeCharacters==
an Array (shape [1], 1 items)
 1 : ( "😿"  U+1F63F So 2 "CRYING CAT FACE" )

ooRexx>         ("\uD83D"~text~wtf8~unescape || "\uDE3F"~text~wtf8~unescape)~description=
'WTF-8 not-ASCII (1 character, 1 codepoint, 4 bytes, 0 error)'

--    Done, now "\uD83D\uDE3F"~text~wtf8~unescape=    -- "😿"


-- ===============================================================================
-- 2022 July 20

/*
I realize that I can pass options when filtering the unicode characters.
Same options as when sending the message "matcher" to a string.

-- Options: not wholestring, trace with prefix ">"
*/
ooRexx> .unicode~characters("father", wholeString:0, trace:1, prefix:">")
>description: stringChunkPattern="father" wholeString=0 caseless=1
>stringPattern="father"
>matcher: expose description stringPattern; use strict arg string; return string~caselessPos(stringPattern) <> 0

-- Same options with a regular expression.
-- "/father" is faster than "/.*father.*" but still very slow compared to "father"
ooRexx> .unicode~characters("/father", wholeString:0, trace:1, prefix:"> ")
> description: stringChunkPattern="/father" wholeString=0 caseless=1
> stringPattern="father"
> pattern = .Pattern~compile(stringPattern, .RegexCompiler~new(.RegexCompiler~caseless))
> matcher: expose description pattern; use strict arg string; return pattern~find(string)~matched

-- Note that "/.*father.*" in mode not wholestring is just unusable: 419 sec under MBP 2010 Intel Core 2 Duo
-- [2022 Dec 22] Still unusable under MBP 2021 M1 Pro: 78s (only 5.37 times faster)


-- ===============================================================================
-- 2022 July 17


-- For convenience, add an optional parameter 'filter' to the method .unicode~characters
ooRexx>     .unicode~characters("*rex*")==
an Array (shape [15], 15 items)
 1  : ( "꜌"   U+A70C Sk 1 "MODIFIER LETTER EXTRA-LOW DOTTED TONE BAR" )
 2  : ( "˩"   U+02E9 Sk 1 "MODIFIER LETTER EXTRA-LOW TONE BAR" )
 3  : ( "꜍"   U+A70D Sk 1 "MODIFIER LETTER EXTRA-HIGH DOTTED LEFT-STEM TONE BAR" )
 4  : ( "꜑"   U+A711 Sk 1 "MODIFIER LETTER EXTRA-LOW DOTTED LEFT-STEM TONE BAR" )
 5  : ( "˥"   U+02E5 Sk 1 "MODIFIER LETTER EXTRA-HIGH TONE BAR" )
 6  : ( "꜈"   U+A708 Sk 1 "MODIFIER LETTER EXTRA-HIGH DOTTED TONE BAR" )
 7  : ( "🖕"  U+1F595 So 2 "REVERSED HAND WITH MIDDLE FINGER EXTENDED" )
 8  : ( "ꎅ"  U+A385 Lo 2 "YI SYLLABLE RREX" )
 9  : ( "꜒"   U+A712 Sk 1 "MODIFIER LETTER EXTRA-HIGH LEFT-STEM TONE BAR" )
 10 : ( "ꏑ"  U+A3D1 Lo 2 "YI SYLLABLE REX" )
 11 : ( "ꎜ"  U+A39C Lo 2 "YI SYLLABLE NREX" )
 12 : ( "꜖"   U+A716 Sk 1 "MODIFIER LETTER EXTRA-LOW LEFT-STEM TONE BAR" )
 13 : ( "𖩿"   U+16A7F Lo 1 "TANGSA LETTER EX" )
 14 : ( "𝍊"   U+1D34A So 1 "TETRAGRAM FOR EXHAUSTION" )
 15 : ( "🦖"  U+1F996 So 2 "T-REX" )
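-- is roughly equivalent to the following (not strictly: the explicit matcher below returns
-- only 4 of the 15 items above; the extra items all match "rex" across a space, as in "LETTER EXTRA")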
ooRexx>     matcher = "*rex*"~matcher; .unicode~characters~select{expose matcher; matcher~(item~name)}==
an Array (shape [4], 4 items)
 1 : ( "ꎅ"  U+A385 Lo 2 "YI SYLLABLE RREX" )
 2 : ( "ꎜ"  U+A39C Lo 2 "YI SYLLABLE NREX" )
 3 : ( "ꏑ"  U+A3D1 Lo 2 "YI SYLLABLE REX" )
 4 : ( "🦖"  U+1F996 So 2 "T-REX" )

-- Regular expressions are supported:
-- returns all the characters whose name starts with "math" and ends with "psi"
ooRexx>     .unicode~characters("/^math.*psi$")==
an Array (shape [10], 10 items)
 1  : ( "𝚿"   U+1D6BF Lu 1 "MATHEMATICAL BOLD CAPITAL PSI" )
 2  : ( "𝛙"   U+1D6D9 Ll 1 "MATHEMATICAL BOLD SMALL PSI" )
 3  : ( "𝛹"   U+1D6F9 Lu 1 "MATHEMATICAL ITALIC CAPITAL PSI" )
 4  : ( "𝜓"   U+1D713 Ll 1 "MATHEMATICAL ITALIC SMALL PSI" )
 5  : ( "𝜳"   U+1D733 Lu 1 "MATHEMATICAL BOLD ITALIC CAPITAL PSI" )
 6  : ( "𝝍"   U+1D74D Ll 1 "MATHEMATICAL BOLD ITALIC SMALL PSI" )
 7  : ( "𝝭"   U+1D76D Lu 1 "MATHEMATICAL SANS-SERIF BOLD CAPITAL PSI" )
 8  : ( "𝞇"   U+1D787 Ll 1 "MATHEMATICAL SANS-SERIF BOLD SMALL PSI" )
 9  : ( "𝞧"   U+1D7A7 Lu 1 "MATHEMATICAL SANS-SERIF BOLD ITALIC CAPITAL PSI" )
 10 : ( "𝟁"   U+1D7C1 Ll 1 "MATHEMATICAL SANS-SERIF BOLD ITALIC SMALL PSI" )

/*
The regular expressions are implemented with 100% ooRexx code, and as such
can be particularly inefficient...
When applied to a collection of 43885 Unicode characters, we have:
    .unicode~characters("/.*father.*")    -- 30.5 sec
The same filter without regular expression:
    .unicode~characters("*father*")       -- 0.9 sec

Something to clarify:
Why such a difference of duration for the following pieces of code?
In the end, it's the same code in both cases:
matcher = "/.*father.*"~matcher; supplier = .unicode~characters; collectedItems = .Array~new; do while supplier~available; item = supplier~item; if matcher~(item~name) then collectedItems~append(item); supplier~next; end; collectedItems==
64 sec
matcher = "/.*father.*"~matcher; .unicode~characters~select{expose matcher; matcher~(item~name)}==
31 sec
*/

-- ===============================================================================
-- 2022 July 13

/*
Rework ~unescape to be closer to other languages:
\u{...} and \U{...} are equivalent
\u{X..X} is now interpreted as a hexadecimal codepoint, no longer as a decimal one. The first character must be a digit 0..9.
\uXXXX is now supported
\UXXXXXXXX is now supported

Ex:
*/
ooRexx> "\u{bed} is different from \u{0bed}"~text~unescape=                         -- T'🛏 is different from ௭'
T'🛏 is different from ௭'
ooRexx> .unicode~character("bed")=                                                  -- ( "🛏"   U+1F6CF So 1 "BED" )
( "🛏"   U+1F6CF So 1 "BED" )
ooRexx> .unicode~character("bed", hexadecimal:.true)=                               -- ( "௭"   U+0BED Nd 1 "TAMIL DIGIT SEVEN" )
( "௭"   U+0BED Nd 1 "TAMIL DIGIT SEVEN" )
ooRexx> .unicode~character("U+0bed")=                                               -- ( "௭"   U+0BED Nd 1 "TAMIL DIGIT SEVEN" )
( "௭"   U+0BED Nd 1 "TAMIL DIGIT SEVEN" )
ooRexx> "The \u{t-rex} shows his \u{flexed biceps}!"~text~unescape=                 -- T'The 🦖 shows his 💪!'
T'The 🦖 shows his 💪!'
ooRexx> "\u0031 + \u0032\u0033 = \u0032\u0034"~text~unescape=                       -- T'1 + 23 = 24'
T'1 + 23 = 24'
ooRexx> "\U00000031 + \U00000032\U00000033 = \U00000032\U00000034"~text~unescape=   -- T'1 + 23 = 24'
T'1 + 23 = 24'

-- ===============================================================================
-- 2022 February 13

/*
New method unescape, available only for Byte, UTF-8 and WTF-8.
    \b                  backspace (BS)
    \t                  horizontal tab (HT)
    \n                  linefeed (LF)
    \f                  form feed (FF)
    \r                  carriage return (CR)
    \u{Unicode name}    Character name in the Unicode database
    \u{N..N}            Unicode character denoted by 1-8 hex digits. The first character must be a digit 0..9.
    \u{U+X..X}          Unicode character denoted by U+ or u+ followed by 4..6 hex digits
    \x{X..X}            sequence of 1..n hexadecimal digits
Examples:
*/
ooRexx>     "hello\u{space}John\n"~text~unescape=           -- T'hello John[0A]'
T'hello John[0A]'
ooRexx>     "hello\u{20}John\n"~text~unescape=
T'hello John[0A]'
ooRexx>     "hello\u{U+20}John\n"~text~unescape=
Expected U+ or u+ followed by 4..6 hex digits, got '20'.
Error code= 93.900

ooRexx>     -- \u is not supported for Byte encoding, you can use \x
ooRexx>     "hello\u{U+20}John\n"~text("byte")~unescape=    -- Byte encoding: \u not supported.
Byte encoding: \u not supported.
Error code= 23.900
ooRexx>     "hello\x{20}John\n"~text("byte")~unescape       -- T'hello John[0A]'

ooRexx>     -- No implementation for UTF-16, WTF-16, UTF-32.
ooRexx>     "hello\u{U+20}John\n"~text~utf16~unescape=      -- Method UNESCAPE is ABSTRACT and cannot be directly invoked.
Method UNESCAPE is ABSTRACT and cannot be directly invoked.
Error code= 93.965


-- ===============================================================================
-- 2021 September 30

/*
New methods:
.String
    join (was concatenateSeparated)

.MutableBuffer
    join (was concatenateSeparated)

.Unicode
    []  (equivalent to .Unicode~character)

.UnicodeCharacter
    makeRexxText
    text
    wtf8
    wtf16
    wtf16be
    wtf16le

.RexxText
    join
    left
    right
    x2d

Examples:
*/

-- https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries
-- No break before ZWJ (GB9), but break after it unless it is part of an emoji modifier sequence or an emoji ZWJ sequence (GB11)
ooRexx> .unicode["zwj"]~utf8~join("ab", "cd", .unicode["woman"]~utf8, .unicode["father christmas"]~utf8)~c2g=  -- '61 62E2808D 63 64E2808D F09F91A9E2808DF09F8E85'
'61 62E2808D 63 64E2808D F09F91A9E2808DF09F8E85'
ooRexx> .unicode["zwj"]~utf8~join("ab", "cd", .unicode["woman"]~utf8, .unicode["father christmas"]~utf8)~graphemes==
a CharacterSupplier 
 1 : T'a'
 2 : T'b‍'
 3 : T'c'
 4 : T'd‍'
 5 : T'👩‍🎅'

ooRexx> "noël👩‍👨‍👩‍👧🎅"~text~UnicodeCharacters==
an Array (shape [12], 12 items)
 1  : ( "n"   U+006E Ll 1 "LATIN SMALL LETTER N" )
 2  : ( "o"   U+006F Ll 1 "LATIN SMALL LETTER O" )
 3  : ( "ë"   U+00EB Ll 1 "LATIN SMALL LETTER E WITH DIAERESIS" )
 4  : ( "l"   U+006C Ll 1 "LATIN SMALL LETTER L" )
 5  : ( "👩"  U+1F469 So 2 "WOMAN" )
 6  : ( "‍"    U+200D Cf 0 "ZERO WIDTH JOINER", "ZWJ" )
 7  : ( "👨"  U+1F468 So 2 "MAN" )
 8  : ( "‍"    U+200D Cf 0 "ZERO WIDTH JOINER", "ZWJ" )
 9  : ( "👩"  U+1F469 So 2 "WOMAN" )
 10 : ( "‍"    U+200D Cf 0 "ZERO WIDTH JOINER", "ZWJ" )
 11 : ( "👧"  U+1F467 So 2 "GIRL" )
 12 : ( "🎅"  U+1F385 So 2 "FATHER CHRISTMAS" )

-- https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries
-- Do not break within emoji modifier sequences or emoji zwj sequences (GB11).
ooRexx> "noël👩‍👨‍👩‍👧🎅"~text~graphemes==
a CharacterSupplier 
 1 : T'n'
 2 : T'o'
 3 : T'ë'
 4 : T'l'
 5 : T'👩‍👨‍👩‍👧'
 6 : T'🎅'

ooRexx> do i=0 to 9; "left("i") = " || "noël👩‍👨‍👩‍👧🎅"~text~left(i)=; end
T'left(0) = '
T'left(1) = n'
T'left(2) = no'
T'left(3) = noë'
T'left(4) = noël'
T'left(5) = noël👩‍👨‍👩‍👧'
T'left(6) = noël👩‍👨‍👩‍👧🎅'
T'left(7) = noël👩‍👨‍👩‍👧🎅 '
T'left(8) = noël👩‍👨‍👩‍👧🎅  '
T'left(9) = noël👩‍👨‍👩‍👧🎅   '

ooRexx> do i=0 to 9; "right("i") = " || "noël👩‍👨‍👩‍👧🎅"~text~right(i)=; end
T'right(0) = '
T'right(1) = 🎅'
T'right(2) = 👩‍👨‍👩‍👧🎅'
T'right(3) = l👩‍👨‍👩‍👧🎅'
T'right(4) = ël👩‍👨‍👩‍👧🎅'
T'right(5) = oël👩‍👨‍👩‍👧🎅'
T'right(6) = noël👩‍👨‍👩‍👧🎅'
T'right(7) =  noël👩‍👨‍👩‍👧🎅'
T'right(8) =   noël👩‍👨‍👩‍👧🎅'
T'right(9) =    noël👩‍👨‍👩‍👧🎅'


-- ===============================================================================
-- 2021 September 28

/*
New methods:
.RexxText
    reverse

Examples:
*/

-- Correct reverse
ooRexx> "noël"~text~c2x=            -- '6E 6F C3AB 6C'
'6E 6F C3AB 6C'
ooRexx> "noël"~text~reverse~c2x=    -- '6C C3AB 6F 6E'
'6C C3AB 6F 6E'
ooRexx> "noël"~text~reverse=        -- T'lëon'
T'lëon'

-- Correct reverse (it was wrong before string literals were automatically converted to text)
ooRexx> "noël"~c2x=             -- '6E 6F C3AB 6C'
'6E 6F C3AB 6C'
ooRexx> "noël"~reverse~c2x=     -- '6C C3AB 6F 6E'
'6C C3AB 6F 6E'
ooRexx> "noël"~reverse=         -- T'lëon'
T'lëon'


-- ===============================================================================
-- 2021 September 27

/*
New native methods:
.Unicode
    codepointToLower
    codepointToUpper
    codepointToTitle
    codepointIsLower
    codepointIsUpper

New methods:
.RexxText
    lower
    upper
    isLower
    isUpper
    characters

Examples:
*/

ooRexx> "aàâäeéèêëiîïoôöuûü"~text~isUpper=              -- .false
 0
ooRexx> "aàâäeéèêëiîïoôöuûü"~text~isLower=              -- .true
 1
ooRexx> "AÀÂÄEÉÈÊËIÎÏOÔÖUÛÜ"~text~isUpper=              -- .true
 1
ooRexx> "AÀÂÄEÉÈÊËIÎÏOÔÖUÛÜ"~text~isLower=              -- .false
 0
ooRexx> "Le père Noël est fatigué..."~text~upper=       -- T'LE PÈRE NOËL EST FATIGUÉ...'
T'LE PÈRE NOËL EST FATIGUÉ...'
ooRexx> "LE PÈRE NOËL EST FATIGUÉ..."~text~lower=       -- T'le père noël est fatigué...'
T'le père noël est fatigué...'

/*
utf8proc supports only the basic cases (those in UnicodeData.txt).
The cases described in SpecialCasing.txt are not supported by utf8proc.
Examples:
*/
-- # The German es-zed is special--the normal mapping is to SS.
-- # Note: the titlecase should never occur in practice. It is equal to titlecase(uppercase(<es-zed>))
-- # <code>; <lower>; <title>; <upper>; (<condition_list>;)? # <comment>
-- 00DF; 00DF; 0053 0073; 0053 0053; # LATIN SMALL LETTER SHARP S

/*
TODO: full casing not yet implemented
    .Unicode~codepointToLowerFull
    .Unicode~codepointToUpperFull
    .Unicode~codepointToTitleFull
The rest of the framework is ready for full casing.
*/

ooRexx> .unicode~character("LATIN SMALL LETTER SHARP S")~utf8=          -- T'ß'
T'ß'
ooRexx> .unicode~character("LATIN SMALL LETTER SHARP S")~toUpperSimple= -- 7838, which is the codepoint of (U+1E9E Lu "LATIN CAPITAL LETTER SHARP S")
 7838
ooRexx> .unicode~character(7838)~utf8=                                  -- T'ẞ'
T'ẞ'
-- T'ß' to uppercase should be T'SS':
ooRexx> "0053 0053"x~text("utf16")~UnicodeCharacters==
an Array (shape [2], 2 items)
 1 : ( "S"   U+0053 Lu 1 "LATIN CAPITAL LETTER S" )
 2 : ( "S"   U+0053 Lu 1 "LATIN CAPITAL LETTER S" )

-- # Preserve canonical equivalence for I with dot. Turkic is handled below.
-- 0130; 0069 0307; 0130; 0130; # LATIN CAPITAL LETTER I WITH DOT ABOVE
ooRexx> .unicode~character("LATIN CAPITAL LETTER I WITH DOT ABOVE")~utf8=           -- T'İ'
T'İ'
ooRexx> .unicode~character("LATIN CAPITAL LETTER I WITH DOT ABOVE")~toLowerSimple=  -- 105, which is the codepoint of (U+0069 Ll "LATIN SMALL LETTER I")
 105
ooRexx> .unicode~character(105)~utf8=                                               -- T'i'
T'i'
-- T'İ' to lowercase should be T'i̇̇':
ooRexx> "0069 0307"x~text("utf16")~UnicodeCharacters==
an Array (shape [2], 2 items)
 1 : ( "i"   U+0069 Ll 1 "LATIN SMALL LETTER I" )
 2 : ( "̇"    U+0307 Mn 0 "COMBINING DOT ABOVE" )

-- # Turkish and Azeri
-- # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri
-- # The following rules handle those cases.
-- 0130; 0069; 0130; 0130; tr; # LATIN CAPITAL LETTER I WITH DOT ABOVE
-- 0130; 0069; 0130; 0130; az; # LATIN CAPITAL LETTER I WITH DOT ABOVE

-- # Note: the following case is already in the UnicodeData.txt file.
-- # 0131; 0131; 0049; 0049; tr; # LATIN SMALL LETTER DOTLESS I
ooRexx> .unicode~character("LATIN SMALL LETTER DOTLESS I")~utf8=            -- T'ı'
T'ı'
ooRexx> .unicode~character("LATIN SMALL LETTER DOTLESS I")~toUpperSimple=   -- 73, which is the codepoint of (U+0049 Lu "LATIN CAPITAL LETTER I")
 73
ooRexx> .unicode~character(73)~utf8=                                        -- T'I'
T'I'


-- Which characters have their title character different from their upper character?
ooRexx> .unicode~characters~select{item~toTitleSimple <> item~toUpperSimple}~each{.Unicode[item~toTitleSimple]~utf8 .Unicode[item~ToUpperSimple]~utf8 item~utf8 item}==
an Array (shape [58], 58 items)
 1  : T'ǅ Ǆ Ǆ  "Ǆ"   U+01C4 Lu 1 "LATIN CAPITAL LETTER DZ WITH CARON" '
 2  : T'ǅ Ǆ ǅ  "ǅ"   U+01C5 Lt 1 "LATIN CAPITAL LETTER D WITH SMALL LETTER Z WITH CARON" '
 3  : T'ǅ Ǆ ǆ  "ǆ"   U+01C6 Ll 1 "LATIN SMALL LETTER DZ WITH CARON" '
 4  : T'ǈ Ǉ Ǉ  "Ǉ"   U+01C7 Lu 1 "LATIN CAPITAL LETTER LJ" '
 5  : T'ǈ Ǉ ǈ  "ǈ"   U+01C8 Lt 1 "LATIN CAPITAL LETTER L WITH SMALL LETTER J" '
 6  : T'ǈ Ǉ ǉ  "ǉ"   U+01C9 Ll 1 "LATIN SMALL LETTER LJ" '
 7  : T'ǋ Ǌ Ǌ  "Ǌ"   U+01CA Lu 1 "LATIN CAPITAL LETTER NJ" '
 8  : T'ǋ Ǌ ǋ  "ǋ"   U+01CB Lt 1 "LATIN CAPITAL LETTER N WITH SMALL LETTER J" '
 9  : T'ǋ Ǌ ǌ  "ǌ"   U+01CC Ll 1 "LATIN SMALL LETTER NJ" '
 10 : T'ǲ Ǳ Ǳ  "Ǳ"   U+01F1 Lu 1 "LATIN CAPITAL LETTER DZ" '
 11 : T'ǲ Ǳ ǲ  "ǲ"   U+01F2 Lt 1 "LATIN CAPITAL LETTER D WITH SMALL LETTER Z" '
 12 : T'ǲ Ǳ ǳ  "ǳ"   U+01F3 Ll 1 "LATIN SMALL LETTER DZ" '
 13 : T'ა Ა ა  "ა"   U+10D0 Ll 1 "GEORGIAN LETTER AN" '
 14 : T'ბ Ბ ბ  "ბ"   U+10D1 Ll 1 "GEORGIAN LETTER BAN" '
 15 : T'გ Გ გ  "გ"   U+10D2 Ll 1 "GEORGIAN LETTER GAN" '
 16 : T'დ Დ დ  "დ"   U+10D3 Ll 1 "GEORGIAN LETTER DON" '
 17 : T'ე Ე ე  "ე"   U+10D4 Ll 1 "GEORGIAN LETTER EN" '
 18 : T'ვ Ვ ვ  "ვ"   U+10D5 Ll 1 "GEORGIAN LETTER VIN" '
 19 : T'ზ Ზ ზ  "ზ"   U+10D6 Ll 1 "GEORGIAN LETTER ZEN" '
 20 : T'თ Თ თ  "თ"   U+10D7 Ll 1 "GEORGIAN LETTER TAN" '
 21 : T'ი Ი ი  "ი"   U+10D8 Ll 1 "GEORGIAN LETTER IN" '
 22 : T'კ Კ კ  "კ"   U+10D9 Ll 1 "GEORGIAN LETTER KAN" '
 23 : T'ლ Ლ ლ  "ლ"   U+10DA Ll 1 "GEORGIAN LETTER LAS" '
 24 : T'მ Მ მ  "მ"   U+10DB Ll 1 "GEORGIAN LETTER MAN" '
 25 : T'ნ Ნ ნ  "ნ"   U+10DC Ll 1 "GEORGIAN LETTER NAR" '
 26 : T'ო Ო ო  "ო"   U+10DD Ll 1 "GEORGIAN LETTER ON" '
 27 : T'პ Პ პ  "პ"   U+10DE Ll 1 "GEORGIAN LETTER PAR" '
 28 : T'ჟ Ჟ ჟ  "ჟ"   U+10DF Ll 1 "GEORGIAN LETTER ZHAR" '
 29 : T'რ Რ რ  "რ"   U+10E0 Ll 1 "GEORGIAN LETTER RAE" '
 30 : T'ს Ს ს  "ს"   U+10E1 Ll 1 "GEORGIAN LETTER SAN" '
 31 : T'ტ Ტ ტ  "ტ"   U+10E2 Ll 1 "GEORGIAN LETTER TAR" '
 32 : T'უ Უ უ  "უ"   U+10E3 Ll 1 "GEORGIAN LETTER UN" '
 33 : T'ფ Ფ ფ  "ფ"   U+10E4 Ll 1 "GEORGIAN LETTER PHAR" '
 34 : T'ქ Ქ ქ  "ქ"   U+10E5 Ll 1 "GEORGIAN LETTER KHAR" '
 35 : T'ღ Ღ ღ  "ღ"   U+10E6 Ll 1 "GEORGIAN LETTER GHAN" '
 36 : T'ყ Ყ ყ  "ყ"   U+10E7 Ll 1 "GEORGIAN LETTER QAR" '
 37 : T'შ Შ შ  "შ"   U+10E8 Ll 1 "GEORGIAN LETTER SHIN" '
 38 : T'ჩ Ჩ ჩ  "ჩ"   U+10E9 Ll 1 "GEORGIAN LETTER CHIN" '
 39 : T'ც Ც ც  "ც"   U+10EA Ll 1 "GEORGIAN LETTER CAN" '
 40 : T'ძ Ძ ძ  "ძ"   U+10EB Ll 1 "GEORGIAN LETTER JIL" '
 41 : T'წ Წ წ  "წ"   U+10EC Ll 1 "GEORGIAN LETTER CIL" '
 42 : T'ჭ Ჭ ჭ  "ჭ"   U+10ED Ll 1 "GEORGIAN LETTER CHAR" '
 43 : T'ხ Ხ ხ  "ხ"   U+10EE Ll 1 "GEORGIAN LETTER XAN" '
 44 : T'ჯ Ჯ ჯ  "ჯ"   U+10EF Ll 1 "GEORGIAN LETTER JHAN" '
 45 : T'ჰ Ჰ ჰ  "ჰ"   U+10F0 Ll 1 "GEORGIAN LETTER HAE" '
 46 : T'ჱ Ჱ ჱ  "ჱ"   U+10F1 Ll 1 "GEORGIAN LETTER HE" '
 47 : T'ჲ Ჲ ჲ  "ჲ"   U+10F2 Ll 1 "GEORGIAN LETTER HIE" '
 48 : T'ჳ Ჳ ჳ  "ჳ"   U+10F3 Ll 1 "GEORGIAN LETTER WE" '
 49 : T'ჴ Ჴ ჴ  "ჴ"   U+10F4 Ll 1 "GEORGIAN LETTER HAR" '
 50 : T'ჵ Ჵ ჵ  "ჵ"   U+10F5 Ll 1 "GEORGIAN LETTER HOE" '
 51 : T'ჶ Ჶ ჶ  "ჶ"   U+10F6 Ll 1 "GEORGIAN LETTER FI" '
 52 : T'ჷ Ჷ ჷ  "ჷ"   U+10F7 Ll 1 "GEORGIAN LETTER YN" '
 53 : T'ჸ Ჸ ჸ  "ჸ"   U+10F8 Ll 1 "GEORGIAN LETTER ELIFI" '
 54 : T'ჹ Ჹ ჹ  "ჹ"   U+10F9 Ll 1 "GEORGIAN LETTER TURNED GAN" '
 55 : T'ჺ Ჺ ჺ  "ჺ"   U+10FA Ll 1 "GEORGIAN LETTER AIN" '
 56 : T'ჽ Ჽ ჽ  "ჽ"   U+10FD Ll 1 "GEORGIAN LETTER AEN" '
 57 : T'ჾ Ჾ ჾ  "ჾ"   U+10FE Ll 1 "GEORGIAN LETTER HARD SIGN" '
 58 : T'ჿ Ჿ ჿ  "ჿ"   U+10FF Ll 1 "GEORGIAN LETTER LABIAL SIGN" '


-- ===============================================================================
-- 2021 September 22

/*
New native methods:
.Unicode
    codepointBidiMirrored
    codepointDecompositionType


Add character aliases.
.unicode~characters now returns a supplier, instead of the internal array of characters.
The indexes of the characters supplier are the codepoints, not the indexes of the
internal array, which are codepoint+2.
*/
ooRexx> .unicode~characters==
an UnicodeCharacterSupplier 
 0    : ( ""    U+0000 Cc 0 "", "NULL", "NUL" )
 1    : ( ""    U+0001 Cc 0 "", "START OF HEADING", "SOH" )
 2    : ( ""    U+0002 Cc 0 "", "START OF TEXT", "STX" )
 3    : ( ""    U+0003 Cc 0 "", "END OF TEXT", "ETX" )
 4    : ( ""    U+0004 Cc 0 "", "END OF TRANSMISSION", "EOT" )
 5    : ( ""    U+0005 Cc 0 "", "ENQUIRY", "ENQ" )
 6    : ( ""    U+0006 Cc 0 "", "ACKNOWLEDGE", "ACK" )
 7    : ( ""    U+0007 Cc 0 "", "ALERT", "BEL" )
 8    : ( ""    U+0008 Cc 0 "", "BACKSPACE", "BS" )
 9    : ( ""    U+0009 Cc 0 "", "CHARACTER TABULATION", "HORIZONTAL TABULATION", "HT", "TAB" )
 10   : ( ""    U+000A Cc 0 "", "LINE FEED", "NEW LINE", "END OF LINE", "LF", "NL", "EOL" )
 11   : ( ""    U+000B Cc 0 "", "LINE TABULATION", "VERTICAL TABULATION", "VT" )
 12   : ( ""    U+000C Cc 0 "", "FORM FEED", "FF" )
 13   : ( ""    U+000D Cc 0 "", "CARRIAGE RETURN", "CR" )
 14   : ( ""    U+000E Cc 0 "", "SHIFT OUT", "LOCKING-SHIFT ONE", "SO" )
 15   : ( ""    U+000F Cc 0 "", "SHIFT IN", "LOCKING-SHIFT ZERO", "SI" )
 16   : ( ""    U+0010 Cc 0 "", "DATA LINK ESCAPE", "DLE" )
 17   : ( ""    U+0011 Cc 0 "", "DEVICE CONTROL ONE", "DC1" )
 18   : ( ""    U+0012 Cc 0 "", "DEVICE CONTROL TWO", "DC2" )
 19   : ( ""    U+0013 Cc 0 "", "DEVICE CONTROL THREE", "DC3" )
 20   : ( ""    U+0014 Cc 0 "", "DEVICE CONTROL FOUR", "DC4" )
 21   : ( ""    U+0015 Cc 0 "", "NEGATIVE ACKNOWLEDGE", "NAK" )
 22   : ( ""    U+0016 Cc 0 "", "SYNCHRONOUS IDLE", "SYN" )
 23   : ( ""    U+0017 Cc 0 "", "END OF TRANSMISSION BLOCK", "ETB" )
 24   : ( ""    U+0018 Cc 0 "", "CANCEL", "CAN" )
 25   : ( ""    U+0019 Cc 0 "", "END OF MEDIUM", "EOM", "EM" )
 26   : ( ""    U+001A Cc 0 "", "SUBSTITUTE", "SUB" )
 27   : ( ""    U+001B Cc 0 "", "ESCAPE", "ESC" )
 28   : ( ""    U+001C Cc 0 "", "INFORMATION SEPARATOR FOUR", "FILE SEPARATOR", "FS" )
 29   : ( ""    U+001D Cc 0 "", "INFORMATION SEPARATOR THREE", "GROUP SEPARATOR", "GS" )
 30   : ( ""    U+001E Cc 0 "", "INFORMATION SEPARATOR TWO", "RECORD SEPARATOR", "RS" )
 31   : ( ""    U+001F Cc 0 "", "INFORMATION SEPARATOR ONE", "UNIT SEPARATOR", "US" )
 32   : ( " "   U+0020 Zs 1 "SPACE", "SP" )
 33   : ( "!"   U+0021 Po 1 "EXCLAMATION MARK" )
 34   : ( """   U+0022 Po 1 "QUOTATION MARK" )
 35   : ( "#"   U+0023 Po 1 "NUMBER SIGN" )
 36   : ( "$"   U+0024 Sc 1 "DOLLAR SIGN" )
 37   : ( "%"   U+0025 Po 1 "PERCENT SIGN" )
 38   : ( "&"   U+0026 Po 1 "AMPERSAND" )
 39   : ( "'"   U+0027 Po 1 "APOSTROPHE" )
 40   : ( "("   U+0028 Ps 1 "LEFT PARENTHESIS" )
 41   : ( ")"   U+0029 Pe 1 "RIGHT PARENTHESIS" )
 42   : ( "*"   U+002A Po 1 "ASTERISK" )
 43   : ( "+"   U+002B Sm 1 "PLUS SIGN" )
 44   : ( ","   U+002C Po 1 "COMMA" )
 45   : ( "-"   U+002D Pd 1 "HYPHEN-MINUS" )
 46   : ( "."   U+002E Po 1 "FULL STOP" )
 47   : ( "/"   U+002F Po 1 "SOLIDUS" )
 48   : ( "0"   U+0030 Nd 1 "DIGIT ZERO" )
 49   : ( "1"   U+0031 Nd 1 "DIGIT ONE" )
 50   : ( "2"   U+0032 Nd 1 "DIGIT TWO" )
 51   : ( "3"   U+0033 Nd 1 "DIGIT THREE" )
 52   : ( "4"   U+0034 Nd 1 "DIGIT FOUR" )
 53   : ( "5"   U+0035 Nd 1 "DIGIT FIVE" )
 54   : ( "6"   U+0036 Nd 1 "DIGIT SIX" )
 55   : ( "7"   U+0037 Nd 1 "DIGIT SEVEN" )
 56   : ( "8"   U+0038 Nd 1 "DIGIT EIGHT" )
 57   : ( "9"   U+0039 Nd 1 "DIGIT NINE" )
 58   : ( ":"   U+003A Po 1 "COLON" )
 59   : ( ";"   U+003B Po 1 "SEMICOLON" )
 60   : ( "<"   U+003C Sm 1 "LESS-THAN SIGN" )
 61   : ( "="   U+003D Sm 1 "EQUALS SIGN" )
 62   : ( ">"   U+003E Sm 1 "GREATER-THAN SIGN" )
 63   : ( "?"   U+003F Po 1 "QUESTION MARK" )
 64   : ( "@"   U+0040 Po 1 "COMMERCIAL AT" )
 65   : ( "A"   U+0041 Lu 1 "LATIN CAPITAL LETTER A" )
 66   : ( "B"   U+0042 Lu 1 "LATIN CAPITAL LETTER B" )
 67   : ( "C"   U+0043 Lu 1 "LATIN CAPITAL LETTER C" )
 68   : ( "D"   U+0044 Lu 1 "LATIN CAPITAL LETTER D" )
 69   : ( "E"   U+0045 Lu 1 "LATIN CAPITAL LETTER E" )
 70   : ( "F"   U+0046 Lu 1 "LATIN CAPITAL LETTER F" )
 71   : ( "G"   U+0047 Lu 1 "LATIN CAPITAL LETTER G" )
 72   : ( "H"   U+0048 Lu 1 "LATIN CAPITAL LETTER H" )
 73   : ( "I"   U+0049 Lu 1 "LATIN CAPITAL LETTER I" )
 74   : ( "J"   U+004A Lu 1 "LATIN CAPITAL LETTER J" )
 75   : ( "K"   U+004B Lu 1 "LATIN CAPITAL LETTER K" )
 76   : ( "L"   U+004C Lu 1 "LATIN CAPITAL LETTER L" )
 77   : ( "M"   U+004D Lu 1 "LATIN CAPITAL LETTER M" )
 78   : ( "N"   U+004E Lu 1 "LATIN CAPITAL LETTER N" )
 79   : ( "O"   U+004F Lu 1 "LATIN CAPITAL LETTER O" )
 80   : ( "P"   U+0050 Lu 1 "LATIN CAPITAL LETTER P" )
 81   : ( "Q"   U+0051 Lu 1 "LATIN CAPITAL LETTER Q" )
 82   : ( "R"   U+0052 Lu 1 "LATIN CAPITAL LETTER R" )
 83   : ( "S"   U+0053 Lu 1 "LATIN CAPITAL LETTER S" )
 84   : ( "T"   U+0054 Lu 1 "LATIN CAPITAL LETTER T" )
 85   : ( "U"   U+0055 Lu 1 "LATIN CAPITAL LETTER U" )
 86   : ( "V"   U+0056 Lu 1 "LATIN CAPITAL LETTER V" )
 87   : ( "W"   U+0057 Lu 1 "LATIN CAPITAL LETTER W" )
 88   : ( "X"   U+0058 Lu 1 "LATIN CAPITAL LETTER X" )
 89   : ( "Y"   U+0059 Lu 1 "LATIN CAPITAL LETTER Y" )
 90   : ( "Z"   U+005A Lu 1 "LATIN CAPITAL LETTER Z" )
 91   : ( "["   U+005B Ps 1 "LEFT SQUARE BRACKET" )
 92   : ( "\"   U+005C Po 1 "REVERSE SOLIDUS" )
 93   : ( "]"   U+005D Pe 1 "RIGHT SQUARE BRACKET" )
 94   : ( "^"   U+005E Sk 1 "CIRCUMFLEX ACCENT" )
 95   : ( "_"   U+005F Pc 1 "LOW LINE" )
 96   : ( "`"   U+0060 Sk 1 "GRAVE ACCENT" )
 97   : ( "a"   U+0061 Ll 1 "LATIN SMALL LETTER A" )
 98   : ( "b"   U+0062 Ll 1 "LATIN SMALL LETTER B" )
 99   : ( "c"   U+0063 Ll 1 "LATIN SMALL LETTER C" )
 100  : ( "d"   U+0064 Ll 1 "LATIN SMALL LETTER D" )
 101  : ( "e"   U+0065 Ll 1 "LATIN SMALL LETTER E" )
 102  : ( "f"   U+0066 Ll 1 "LATIN SMALL LETTER F" )
 103  : ( "g"   U+0067 Ll 1 "LATIN SMALL LETTER G" )
 104  : ( "h"   U+0068 Ll 1 "LATIN SMALL LETTER H" )
 105  : ( "i"   U+0069 Ll 1 "LATIN SMALL LETTER I" )
 106  : ( "j"   U+006A Ll 1 "LATIN SMALL LETTER J" )
 107  : ( "k"   U+006B Ll 1 "LATIN SMALL LETTER K" )
 108  : ( "l"   U+006C Ll 1 "LATIN SMALL LETTER L" )
 109  : ( "m"   U+006D Ll 1 "LATIN SMALL LETTER M" )
 110  : ( "n"   U+006E Ll 1 "LATIN SMALL LETTER N" )
 111  : ( "o"   U+006F Ll 1 "LATIN SMALL LETTER O" )
 112  : ( "p"   U+0070 Ll 1 "LATIN SMALL LETTER P" )
 113  : ( "q"   U+0071 Ll 1 "LATIN SMALL LETTER Q" )
 114  : ( "r"   U+0072 Ll 1 "LATIN SMALL LETTER R" )
 115  : ( "s"   U+0073 Ll 1 "LATIN SMALL LETTER S" )
 116  : ( "t"   U+0074 Ll 1 "LATIN SMALL LETTER T" )
 117  : ( "u"   U+0075 Ll 1 "LATIN SMALL LETTER U" )
 118  : ( "v"   U+0076 Ll 1 "LATIN SMALL LETTER V" )
 119  : ( "w"   U+0077 Ll 1 "LATIN SMALL LETTER W" )
 120  : ( "x"   U+0078 Ll 1 "LATIN SMALL LETTER X" )
 121  : ( "y"   U+0079 Ll 1 "LATIN SMALL LETTER Y" )
 122  : ( "z"   U+007A Ll 1 "LATIN SMALL LETTER Z" )
 123  : ( "{"   U+007B Ps 1 "LEFT CURLY BRACKET" )
 124  : ( "|"   U+007C Sm 1 "VERTICAL LINE" )
 125  : ( "}"   U+007D Pe 1 "RIGHT CURLY BRACKET" )
 126  : ( "~"   U+007E Sm 1 "TILDE" )
 127  : ( ""    U+007F Cc 0 "", "DELETE", "DEL" )
 128  : ( "€"    U+0080 Cc 0 "", "PADDING CHARACTER", "PAD" )
 129  : ( ""    U+0081 Cc 0 "", "HIGH OCTET PRESET", "HOP" )
 130  : ( "‚"    U+0082 Cc 0 "", "BREAK PERMITTED HERE", "BPH" )
 131  : ( "ƒ"    U+0083 Cc 0 "", "NO BREAK HERE", "NBH" )
 132  : ( "„"    U+0084 Cc 0 "", "INDEX", "IND" )
 133  : ( "…"    U+0085 Cc 0 "", "NEXT LINE", "NEL" )
 134  : ( "†"    U+0086 Cc 0 "", "START OF SELECTED AREA", "SSA" )
 135  : ( "‡"    U+0087 Cc 0 "", "END OF SELECTED AREA", "ESA" )
 136  : ( "ˆ"    U+0088 Cc 0 "", "CHARACTER TABULATION SET", "HORIZONTAL TABULATION SET", "HTS" )
 137  : ( "‰"    U+0089 Cc 0 "", "CHARACTER TABULATION WITH JUSTIFICATION", "HORIZONTAL TABULATION WITH JUSTIFICATION", "HTJ" )
 138  : ( "Š"    U+008A Cc 0 "", "LINE TABULATION SET", "VERTICAL TABULATION SET", "VTS" )
 139  : ( "‹"    U+008B Cc 0 "", "PARTIAL LINE FORWARD", "PARTIAL LINE DOWN", "PLD" )
 140  : ( "Œ"    U+008C Cc 0 "", "PARTIAL LINE BACKWARD", "PARTIAL LINE UP", "PLU" )
 141  : ( ""    U+008D Cc 0 "", "REVERSE LINE FEED", "REVERSE INDEX", "RI" )
 142  : ( "Ž"    U+008E Cc 0 "", "SINGLE SHIFT TWO", "SINGLE-SHIFT-2", "SS2" )
 143  : ( ""    U+008F Cc 0 "", "SINGLE SHIFT THREE", "SINGLE-SHIFT-3", "SS3" )
 144  : ( ""    U+0090 Cc 0 "", "DEVICE CONTROL STRING", "DCS" )
 145  : ( "‘"    U+0091 Cc 0 "", "PRIVATE USE ONE", "PRIVATE USE-1", "PU1" )
 146  : ( "’"    U+0092 Cc 0 "", "PRIVATE USE TWO", "PRIVATE USE-2", "PU2" )
 147  : ( "“"    U+0093 Cc 0 "", "SET TRANSMIT STATE", "STS" )
 148  : ( "”"    U+0094 Cc 0 "", "CANCEL CHARACTER", "CCH" )
 149  : ( "•"    U+0095 Cc 0 "", "MESSAGE WAITING", "MW" )
 150  : ( "–"    U+0096 Cc 0 "", "START OF GUARDED AREA", "START OF PROTECTED AREA", "SPA" )
 151  : ( "—"    U+0097 Cc 0 "", "END OF GUARDED AREA", "END OF PROTECTED AREA", "EPA" )
 152  : ( "˜"    U+0098 Cc 0 "", "START OF STRING", "SOS" )
 153  : ( "™"    U+0099 Cc 0 "", "SINGLE GRAPHIC CHARACTER INTRODUCER", "SGC" )
 154  : ( "š"    U+009A Cc 0 "", "SINGLE CHARACTER INTRODUCER", "SCI" )
 155  : ( "›"    U+009B Cc 0 "", "CONTROL SEQUENCE INTRODUCER", "CSI" )
 156  : ( "œ"    U+009C Cc 0 "", "STRING TERMINATOR", "ST" )
 157  : ( ""    U+009D Cc 0 "", "OPERATING SYSTEM COMMAND", "OSC" )
 158  : ( "ž"    U+009E Cc 0 "", "PRIVACY MESSAGE", "PM" )
 159  : ( "Ÿ"    U+009F Cc 0 "", "APPLICATION PROGRAM COMMAND", "APC" )
 160  : ( " "   U+00A0 Zs 1 "NO-BREAK SPACE", "NBSP" )
 161  : ( "¡"   U+00A1 Po 1 "INVERTED EXCLAMATION MARK" )
 162  : ( "¢"   U+00A2 Sc 1 "CENT SIGN" )
 163  : ( "£"   U+00A3 Sc 1 "POUND SIGN" )
 164  : ( "¤"   U+00A4 Sc 1 "CURRENCY SIGN" )
 165  : ( "¥"   U+00A5 Sc 1 "YEN SIGN" )
 166  : ( "¦"   U+00A6 So 1 "BROKEN BAR" )
 167  : ( "§"   U+00A7 Po 1 "SECTION SIGN" )
 168  : ( "¨"   U+00A8 Sk 1 "DIAERESIS" )
 169  : ( "©"   U+00A9 So 1 "COPYRIGHT SIGN" )
 170  : ( "ª"   U+00AA Lo 1 "FEMININE ORDINAL INDICATOR" )
 171  : ( "«"   U+00AB Pi 1 "LEFT-POINTING DOUBLE ANGLE QUOTATION MARK" )
 172  : ( "¬"   U+00AC Sm 1 "NOT SIGN" )
 173  : ( "­"   U+00AD Cf 1 "SOFT HYPHEN", "SHY" )
 174  : ( "®"   U+00AE So 1 "REGISTERED SIGN" )
 175  : ( "¯"   U+00AF Sk 1 "MACRON" )
 176  : ( "°"   U+00B0 So 1 "DEGREE SIGN" )
 177  : ( "±"   U+00B1 Sm 1 "PLUS-MINUS SIGN" )
 178  : ( "²"   U+00B2 No 1 "SUPERSCRIPT TWO" )
 179  : ( "³"   U+00B3 No 1 "SUPERSCRIPT THREE" )
 180  : ( "´"   U+00B4 Sk 1 "ACUTE ACCENT" )
 181  : ( "µ"   U+00B5 Ll 1 "MICRO SIGN" )
 182  : ( "¶"   U+00B6 Po 1 "PILCROW SIGN" )
 183  : ( "·"   U+00B7 Po 1 "MIDDLE DOT" )
 184  : ( "¸"   U+00B8 Sk 1 "CEDILLA" )
 185  : ( "¹"   U+00B9 No 1 "SUPERSCRIPT ONE" )
 186  : ( "º"   U+00BA Lo 1 "MASCULINE ORDINAL INDICATOR" )
 187  : ( "»"   U+00BB Pf 1 "RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK" )
 188  : ( "¼"   U+00BC No 1 "VULGAR FRACTION ONE QUARTER" )
 189  : ( "½"   U+00BD No 1 "VULGAR FRACTION ONE HALF" )
 190  : ( "¾"   U+00BE No 1 "VULGAR FRACTION THREE QUARTERS" )
 191  : ( "¿"   U+00BF Po 1 "INVERTED QUESTION MARK" )
 192  : ( "À"   U+00C0 Lu 1 "LATIN CAPITAL LETTER A WITH GRAVE" )
 193  : ( "Á"   U+00C1 Lu 1 "LATIN CAPITAL LETTER A WITH ACUTE" )
 194  : ( "Â"   U+00C2 Lu 1 "LATIN CAPITAL LETTER A WITH CIRCUMFLEX" )
 195  : ( "Ã"   U+00C3 Lu 1 "LATIN CAPITAL LETTER A WITH TILDE" )
 196  : ( "Ä"   U+00C4 Lu 1 "LATIN CAPITAL LETTER A WITH DIAERESIS" )
 197  : ( "Å"   U+00C5 Lu 1 "LATIN CAPITAL LETTER A WITH RING ABOVE" )
 198  : ( "Æ"   U+00C6 Lu 1 "LATIN CAPITAL LETTER AE" )
 199  : ( "Ç"   U+00C7 Lu 1 "LATIN CAPITAL LETTER C WITH CEDILLA" )
 200  : ( "È"   U+00C8 Lu 1 "LATIN CAPITAL LETTER E WITH GRAVE" )
 201  : ( "É"   U+00C9 Lu 1 "LATIN CAPITAL LETTER E WITH ACUTE" )
 202  : ( "Ê"   U+00CA Lu 1 "LATIN CAPITAL LETTER E WITH CIRCUMFLEX" )
 203  : ( "Ë"   U+00CB Lu 1 "LATIN CAPITAL LETTER E WITH DIAERESIS" )
 204  : ( "Ì"   U+00CC Lu 1 "LATIN CAPITAL LETTER I WITH GRAVE" )
 205  : ( "Í"   U+00CD Lu 1 "LATIN CAPITAL LETTER I WITH ACUTE" )
 206  : ( "Î"   U+00CE Lu 1 "LATIN CAPITAL LETTER I WITH CIRCUMFLEX" )
 207  : ( "Ï"   U+00CF Lu 1 "LATIN CAPITAL LETTER I WITH DIAERESIS" )
 208  : ( "Ð"   U+00D0 Lu 1 "LATIN CAPITAL LETTER ETH" )
 209  : ( "Ñ"   U+00D1 Lu 1 "LATIN CAPITAL LETTER N WITH TILDE" )
 210  : ( "Ò"   U+00D2 Lu 1 "LATIN CAPITAL LETTER O WITH GRAVE" )
 211  : ( "Ó"   U+00D3 Lu 1 "LATIN CAPITAL LETTER O WITH ACUTE" )
 212  : ( "Ô"   U+00D4 Lu 1 "LATIN CAPITAL LETTER O WITH CIRCUMFLEX" )
 213  : ( "Õ"   U+00D5 Lu 1 "LATIN CAPITAL LETTER O WITH TILDE" )
 214  : ( "Ö"   U+00D6 Lu 1 "LATIN CAPITAL LETTER O WITH DIAERESIS" )
 215  : ( "×"   U+00D7 Sm 1 "MULTIPLICATION SIGN" )
 216  : ( "Ø"   U+00D8 Lu 1 "LATIN CAPITAL LETTER O WITH STROKE" )
 217  : ( "Ù"   U+00D9 Lu 1 "LATIN CAPITAL LETTER U WITH GRAVE" )
 218  : ( "Ú"   U+00DA Lu 1 "LATIN CAPITAL LETTER U WITH ACUTE" )
 219  : ( "Û"   U+00DB Lu 1 "LATIN CAPITAL LETTER U WITH CIRCUMFLEX" )
 220  : ( "Ü"   U+00DC Lu 1 "LATIN CAPITAL LETTER U WITH DIAERESIS" )
 221  : ( "Ý"   U+00DD Lu 1 "LATIN CAPITAL LETTER Y WITH ACUTE" )
 222  : ( "Þ"   U+00DE Lu 1 "LATIN CAPITAL LETTER THORN" )
 223  : ( "ß"   U+00DF Ll 1 "LATIN SMALL LETTER SHARP S" )
 224  : ( "à"   U+00E0 Ll 1 "LATIN SMALL LETTER A WITH GRAVE" )
 225  : ( "á"   U+00E1 Ll 1 "LATIN SMALL LETTER A WITH ACUTE" )
 226  : ( "â"   U+00E2 Ll 1 "LATIN SMALL LETTER A WITH CIRCUMFLEX" )
 227  : ( "ã"   U+00E3 Ll 1 "LATIN SMALL LETTER A WITH TILDE" )
 228  : ( "ä"   U+00E4 Ll 1 "LATIN SMALL LETTER A WITH DIAERESIS" )
 229  : ( "å"   U+00E5 Ll 1 "LATIN SMALL LETTER A WITH RING ABOVE" )
 230  : ( "æ"   U+00E6 Ll 1 "LATIN SMALL LETTER AE" )
 231  : ( "ç"   U+00E7 Ll 1 "LATIN SMALL LETTER C WITH CEDILLA" )
 232  : ( "è"   U+00E8 Ll 1 "LATIN SMALL LETTER E WITH GRAVE" )
 233  : ( "é"   U+00E9 Ll 1 "LATIN SMALL LETTER E WITH ACUTE" )
 234  : ( "ê"   U+00EA Ll 1 "LATIN SMALL LETTER E WITH CIRCUMFLEX" )
 235  : ( "ë"   U+00EB Ll 1 "LATIN SMALL LETTER E WITH DIAERESIS" )
 236  : ( "ì"   U+00EC Ll 1 "LATIN SMALL LETTER I WITH GRAVE" )
 237  : ( "í"   U+00ED Ll 1 "LATIN SMALL LETTER I WITH ACUTE" )
 238  : ( "î"   U+00EE Ll 1 "LATIN SMALL LETTER I WITH CIRCUMFLEX" )
 239  : ( "ï"   U+00EF Ll 1 "LATIN SMALL LETTER I WITH DIAERESIS" )
 240  : ( "ð"   U+00F0 Ll 1 "LATIN SMALL LETTER ETH" )
 241  : ( "ñ"   U+00F1 Ll 1 "LATIN SMALL LETTER N WITH TILDE" )
 242  : ( "ò"   U+00F2 Ll 1 "LATIN SMALL LETTER O WITH GRAVE" )
 243  : ( "ó"   U+00F3 Ll 1 "LATIN SMALL LETTER O WITH ACUTE" )
 244  : ( "ô"   U+00F4 Ll 1 "LATIN SMALL LETTER O WITH CIRCUMFLEX" )
 245  : ( "õ"   U+00F5 Ll 1 "LATIN SMALL LETTER O WITH TILDE" )
 246  : ( "ö"   U+00F6 Ll 1 "LATIN SMALL LETTER O WITH DIAERESIS" )
 247  : ( "÷"   U+00F7 Sm 1 "DIVISION SIGN" )
 248  : ( "ø"   U+00F8 Ll 1 "LATIN SMALL LETTER O WITH STROKE" )
 249  : ( "ù"   U+00F9 Ll 1 "LATIN SMALL LETTER U WITH GRAVE" )
 250  : ( "ú"   U+00FA Ll 1 "LATIN SMALL LETTER U WITH ACUTE" )
 251  : ( "û"   U+00FB Ll 1 "LATIN SMALL LETTER U WITH CIRCUMFLEX" )
 252  : ( "ü"   U+00FC Ll 1 "LATIN SMALL LETTER U WITH DIAERESIS" )
 253  : ( "ý"   U+00FD Ll 1 "LATIN SMALL LETTER Y WITH ACUTE" )
 254  : ( "þ"   U+00FE Ll 1 "LATIN SMALL LETTER THORN" )
 255  : ( "ÿ"   U+00FF Ll 1 "LATIN SMALL LETTER Y WITH DIAERESIS" )
 256  : ( "Ā"   U+0100 Lu 1 "LATIN CAPITAL LETTER A WITH MACRON" )
 257  : ( "ā"   U+0101 Ll 1 "LATIN SMALL LETTER A WITH MACRON" )
 258  : ( "Ă"   U+0102 Lu 1 "LATIN CAPITAL LETTER A WITH BREVE" )
 259  : ( "ă"   U+0103 Ll 1 "LATIN SMALL LETTER A WITH BREVE" )
 260  : ( "Ą"   U+0104 Lu 1 "LATIN CAPITAL LETTER A WITH OGONEK" )
 261  : ( "ą"   U+0105 Ll 1 "LATIN SMALL LETTER A WITH OGONEK" )
 262  : ( "Ć"   U+0106 Lu 1 "LATIN CAPITAL LETTER C WITH ACUTE" )
 263  : ( "ć"   U+0107 Ll 1 "LATIN SMALL LETTER C WITH ACUTE" )
 264  : ( "Ĉ"   U+0108 Lu 1 "LATIN CAPITAL LETTER C WITH CIRCUMFLEX" )
 265  : ( "ĉ"   U+0109 Ll 1 "LATIN SMALL LETTER C WITH CIRCUMFLEX" )
 266  : ( "Ċ"   U+010A Lu 1 "LATIN CAPITAL LETTER C WITH DOT ABOVE" )
 267  : ( "ċ"   U+010B Ll 1 "LATIN SMALL LETTER C WITH DOT ABOVE" )
 268  : ( "Č"   U+010C Lu 1 "LATIN CAPITAL LETTER C WITH CARON" )
 269  : ( "č"   U+010D Ll 1 "LATIN SMALL LETTER C WITH CARON" )
 270  : ( "Ď"   U+010E Lu 1 "LATIN CAPITAL LETTER D WITH CARON" )
 271  : ( "ď"   U+010F Ll 1 "LATIN SMALL LETTER D WITH CARON" )
 272  : ( "Đ"   U+0110 Lu 1 "LATIN CAPITAL LETTER D WITH STROKE" )
 273  : ( "đ"   U+0111 Ll 1 "LATIN SMALL LETTER D WITH STROKE" )
 274  : ( "Ē"   U+0112 Lu 1 "LATIN CAPITAL LETTER E WITH MACRON" )
 275  : ( "ē"   U+0113 Ll 1 "LATIN SMALL LETTER E WITH MACRON" )
 276  : ( "Ĕ"   U+0114 Lu 1 "LATIN CAPITAL LETTER E WITH BREVE" )
 277  : ( "ĕ"   U+0115 Ll 1 "LATIN SMALL LETTER E WITH BREVE" )
 278  : ( "Ė"   U+0116 Lu 1 "LATIN CAPITAL LETTER E WITH DOT ABOVE" )
 279  : ( "ė"   U+0117 Ll 1 "LATIN SMALL LETTER E WITH DOT ABOVE" )
 280  : ( "Ę"   U+0118 Lu 1 "LATIN CAPITAL LETTER E WITH OGONEK" )
 281  : ( "ę"   U+0119 Ll 1 "LATIN SMALL LETTER E WITH OGONEK" )
 282  : ( "Ě"   U+011A Lu 1 "LATIN CAPITAL LETTER E WITH CARON" )
 283  : ( "ě"   U+011B Ll 1 "LATIN SMALL LETTER E WITH CARON" )
 284  : ( "Ĝ"   U+011C Lu 1 "LATIN CAPITAL LETTER G WITH CIRCUMFLEX" )
 285  : ( "ĝ"   U+011D Ll 1 "LATIN SMALL LETTER G WITH CIRCUMFLEX" )
 286  : ( "Ğ"   U+011E Lu 1 "LATIN CAPITAL LETTER G WITH BREVE" )
 287  : ( "ğ"   U+011F Ll 1 "LATIN SMALL LETTER G WITH BREVE" )
 288  : ( "Ġ"   U+0120 Lu 1 "LATIN CAPITAL LETTER G WITH DOT ABOVE" )
 289  : ( "ġ"   U+0121 Ll 1 "LATIN SMALL LETTER G WITH DOT ABOVE" )
 290  : ( "Ģ"   U+0122 Lu 1 "LATIN CAPITAL LETTER G WITH CEDILLA" )
 291  : ( "ģ"   U+0123 Ll 1 "LATIN SMALL LETTER G WITH CEDILLA" )
 292  : ( "Ĥ"   U+0124 Lu 1 "LATIN CAPITAL LETTER H WITH CIRCUMFLEX" )
 293  : ( "ĥ"   U+0125 Ll 1 "LATIN SMALL LETTER H WITH CIRCUMFLEX" )
 294  : ( "Ħ"   U+0126 Lu 1 "LATIN CAPITAL LETTER H WITH STROKE" )
 295  : ( "ħ"   U+0127 Ll 1 "LATIN SMALL LETTER H WITH STROKE" )
 296  : ( "Ĩ"   U+0128 Lu 1 "LATIN CAPITAL LETTER I WITH TILDE" )
 297  : ( "ĩ"   U+0129 Ll 1 "LATIN SMALL LETTER I WITH TILDE" )
 298  : ( "Ī"   U+012A Lu 1 "LATIN CAPITAL LETTER I WITH MACRON" )
 299  : ( "ī"   U+012B Ll 1 "LATIN SMALL LETTER I WITH MACRON" )
 300  : ( "Ĭ"   U+012C Lu 1 "LATIN CAPITAL LETTER I WITH BREVE" )
 301  : ( "ĭ"   U+012D Ll 1 "LATIN SMALL LETTER I WITH BREVE" )
 302  : ( "Į"   U+012E Lu 1 "LATIN CAPITAL LETTER I WITH OGONEK" )
 303  : ( "į"   U+012F Ll 1 "LATIN SMALL LETTER I WITH OGONEK" )
 304  : ( "İ"   U+0130 Lu 1 "LATIN CAPITAL LETTER I WITH DOT ABOVE" )
 305  : ( "ı"   U+0131 Ll 1 "LATIN SMALL LETTER DOTLESS I" )
 306  : ( "IJ"   U+0132 Lu 1 "LATIN CAPITAL LIGATURE IJ" )
 307  : ( "ij"   U+0133 Ll 1 "LATIN SMALL LIGATURE IJ" )
 308  : ( "Ĵ"   U+0134 Lu 1 "LATIN CAPITAL LETTER J WITH CIRCUMFLEX" )
 309  : ( "ĵ"   U+0135 Ll 1 "LATIN SMALL LETTER J WITH CIRCUMFLEX" )
 310  : ( "Ķ"   U+0136 Lu 1 "LATIN CAPITAL LETTER K WITH CEDILLA" )
 311  : ( "ķ"   U+0137 Ll 1 "LATIN SMALL LETTER K WITH CEDILLA" )
 312  : ( "ĸ"   U+0138 Ll 1 "LATIN SMALL LETTER KRA" )
 313  : ( "Ĺ"   U+0139 Lu 1 "LATIN CAPITAL LETTER L WITH ACUTE" )
 314  : ( "ĺ"   U+013A Ll 1 "LATIN SMALL LETTER L WITH ACUTE" )
 315  : ( "Ļ"   U+013B Lu 1 "LATIN CAPITAL LETTER L WITH CEDILLA" )
 316  : ( "ļ"   U+013C Ll 1 "LATIN SMALL LETTER L WITH CEDILLA" )
 317  : ( "Ľ"   U+013D Lu 1 "LATIN CAPITAL LETTER L WITH CARON" )
 318  : ( "ľ"   U+013E Ll 1 "LATIN SMALL LETTER L WITH CARON" )
 319  : ( "Ŀ"   U+013F Lu 1 "LATIN CAPITAL LETTER L WITH MIDDLE DOT" )
 320  : ( "ŀ"   U+0140 Ll 1 "LATIN SMALL LETTER L WITH MIDDLE DOT" )
 321  : ( "Ł"   U+0141 Lu 1 "LATIN CAPITAL LETTER L WITH STROKE" )
 322  : ( "ł"   U+0142 Ll 1 "LATIN SMALL LETTER L WITH STROKE" )
 323  : ( "Ń"   U+0143 Lu 1 "LATIN CAPITAL LETTER N WITH ACUTE" )
 324  : ( "ń"   U+0144 Ll 1 "LATIN SMALL LETTER N WITH ACUTE" )
 325  : ( "Ņ"   U+0145 Lu 1 "LATIN CAPITAL LETTER N WITH CEDILLA" )
 326  : ( "ņ"   U+0146 Ll 1 "LATIN SMALL LETTER N WITH CEDILLA" )
 327  : ( "Ň"   U+0147 Lu 1 "LATIN CAPITAL LETTER N WITH CARON" )
 328  : ( "ň"   U+0148 Ll 1 "LATIN SMALL LETTER N WITH CARON" )
 329  : ( "ʼn"   U+0149 Ll 1 "LATIN SMALL LETTER N PRECEDED BY APOSTROPHE" )
 330  : ( "Ŋ"   U+014A Lu 1 "LATIN CAPITAL LETTER ENG" )
 331  : ( "ŋ"   U+014B Ll 1 "LATIN SMALL LETTER ENG" )
 332  : ( "Ō"   U+014C Lu 1 "LATIN CAPITAL LETTER O WITH MACRON" )
 333  : ( "ō"   U+014D Ll 1 "LATIN SMALL LETTER O WITH MACRON" )
 334  : ( "Ŏ"   U+014E Lu 1 "LATIN CAPITAL LETTER O WITH BREVE" )
 335  : ( "ŏ"   U+014F Ll 1 "LATIN SMALL LETTER O WITH BREVE" )
 336  : ( "Ő"   U+0150 Lu 1 "LATIN CAPITAL LETTER O WITH DOUBLE ACUTE" )
 337  : ( "ő"   U+0151 Ll 1 "LATIN SMALL LETTER O WITH DOUBLE ACUTE" )
 338  : ( "Œ"   U+0152 Lu 1 "LATIN CAPITAL LIGATURE OE" )
 339  : ( "œ"   U+0153 Ll 1 "LATIN SMALL LIGATURE OE" )
 340  : ( "Ŕ"   U+0154 Lu 1 "LATIN CAPITAL LETTER R WITH ACUTE" )
 341  : ( "ŕ"   U+0155 Ll 1 "LATIN SMALL LETTER R WITH ACUTE" )
 342  : ( "Ŗ"   U+0156 Lu 1 "LATIN CAPITAL LETTER R WITH CEDILLA" )
 343  : ( "ŗ"   U+0157 Ll 1 "LATIN SMALL LETTER R WITH CEDILLA" )
 344  : ( "Ř"   U+0158 Lu 1 "LATIN CAPITAL LETTER R WITH CARON" )
 345  : ( "ř"   U+0159 Ll 1 "LATIN SMALL LETTER R WITH CARON" )
 346  : ( "Ś"   U+015A Lu 1 "LATIN CAPITAL LETTER S WITH ACUTE" )
 347  : ( "ś"   U+015B Ll 1 "LATIN SMALL LETTER S WITH ACUTE" )
 348  : ( "Ŝ"   U+015C Lu 1 "LATIN CAPITAL LETTER S WITH CIRCUMFLEX" )
 349  : ( "ŝ"   U+015D Ll 1 "LATIN SMALL LETTER S WITH CIRCUMFLEX" )
 350  : ( "Ş"   U+015E Lu 1 "LATIN CAPITAL LETTER S WITH CEDILLA" )
 351  : ( "ş"   U+015F Ll 1 "LATIN SMALL LETTER S WITH CEDILLA" )
 352  : ( "Š"   U+0160 Lu 1 "LATIN CAPITAL LETTER S WITH CARON" )
 353  : ( "š"   U+0161 Ll 1 "LATIN SMALL LETTER S WITH CARON" )
 354  : ( "Ţ"   U+0162 Lu 1 "LATIN CAPITAL LETTER T WITH CEDILLA" )
 355  : ( "ţ"   U+0163 Ll 1 "LATIN SMALL LETTER T WITH CEDILLA" )
 356  : ( "Ť"   U+0164 Lu 1 "LATIN CAPITAL LETTER T WITH CARON" )
 357  : ( "ť"   U+0165 Ll 1 "LATIN SMALL LETTER T WITH CARON" )
 358  : ( "Ŧ"   U+0166 Lu 1 "LATIN CAPITAL LETTER T WITH STROKE" )
 359  : ( "ŧ"   U+0167 Ll 1 "LATIN SMALL LETTER T WITH STROKE" )
 360  : ( "Ũ"   U+0168 Lu 1 "LATIN CAPITAL LETTER U WITH TILDE" )
 361  : ( "ũ"   U+0169 Ll 1 "LATIN SMALL LETTER U WITH TILDE" )
 362  : ( "Ū"   U+016A Lu 1 "LATIN CAPITAL LETTER U WITH MACRON" )
 363  : ( "ū"   U+016B Ll 1 "LATIN SMALL LETTER U WITH MACRON" )
 364  : ( "Ŭ"   U+016C Lu 1 "LATIN CAPITAL LETTER U WITH BREVE" )
 365  : ( "ŭ"   U+016D Ll 1 "LATIN SMALL LETTER U WITH BREVE" )
 366  : ( "Ů"   U+016E Lu 1 "LATIN CAPITAL LETTER U WITH RING ABOVE" )
 367  : ( "ů"   U+016F Ll 1 "LATIN SMALL LETTER U WITH RING ABOVE" )
 368  : ( "Ű"   U+0170 Lu 1 "LATIN CAPITAL LETTER U WITH DOUBLE ACUTE" )
 369  : ( "ű"   U+0171 Ll 1 "LATIN SMALL LETTER U WITH DOUBLE ACUTE" )
 370  : ( "Ų"   U+0172 Lu 1 "LATIN CAPITAL LETTER U WITH OGONEK" )
 371  : ( "ų"   U+0173 Ll 1 "LATIN SMALL LETTER U WITH OGONEK" )
 372  : ( "Ŵ"   U+0174 Lu 1 "LATIN CAPITAL LETTER W WITH CIRCUMFLEX" )
 373  : ( "ŵ"   U+0175 Ll 1 "LATIN SMALL LETTER W WITH CIRCUMFLEX" )
 374  : ( "Ŷ"   U+0176 Lu 1 "LATIN CAPITAL LETTER Y WITH CIRCUMFLEX" )
 375  : ( "ŷ"   U+0177 Ll 1 "LATIN SMALL LETTER Y WITH CIRCUMFLEX" )
 376  : ( "Ÿ"   U+0178 Lu 1 "LATIN CAPITAL LETTER Y WITH DIAERESIS" )
 377  : ( "Ź"   U+0179 Lu 1 "LATIN CAPITAL LETTER Z WITH ACUTE" )
 378  : ( "ź"   U+017A Ll 1 "LATIN SMALL LETTER Z WITH ACUTE" )
 379  : ( "Ż"   U+017B Lu 1 "LATIN CAPITAL LETTER Z WITH DOT ABOVE" )
 380  : ( "ż"   U+017C Ll 1 "LATIN SMALL LETTER Z WITH DOT ABOVE" )
 381  : ( "Ž"   U+017D Lu 1 "LATIN CAPITAL LETTER Z WITH CARON" )
 382  : ( "ž"   U+017E Ll 1 "LATIN SMALL LETTER Z WITH CARON" )
 383  : ( "ſ"   U+017F Ll 1 "LATIN SMALL LETTER LONG S" )
 384  : ( "ƀ"   U+0180 Ll 1 "LATIN SMALL LETTER B WITH STROKE" )
 385  : ( "Ɓ"   U+0181 Lu 1 "LATIN CAPITAL LETTER B WITH HOOK" )
 386  : ( "Ƃ"   U+0182 Lu 1 "LATIN CAPITAL LETTER B WITH TOPBAR" )
 387  : ( "ƃ"   U+0183 Ll 1 "LATIN SMALL LETTER B WITH TOPBAR" )
 388  : ( "Ƅ"   U+0184 Lu 1 "LATIN CAPITAL LETTER TONE SIX" )
 389  : ( "ƅ"   U+0185 Ll 1 "LATIN SMALL LETTER TONE SIX" )
 390  : ( "Ɔ"   U+0186 Lu 1 "LATIN CAPITAL LETTER OPEN O" )
 391  : ( "Ƈ"   U+0187 Lu 1 "LATIN CAPITAL LETTER C WITH HOOK" )
 392  : ( "ƈ"   U+0188 Ll 1 "LATIN SMALL LETTER C WITH HOOK" )
 393  : ( "Ɖ"   U+0189 Lu 1 "LATIN CAPITAL LETTER AFRICAN D" )
 394  : ( "Ɗ"   U+018A Lu 1 "LATIN CAPITAL LETTER D WITH HOOK" )
 395  : ( "Ƌ"   U+018B Lu 1 "LATIN CAPITAL LETTER D WITH TOPBAR" )
 396  : ( "ƌ"   U+018C Ll 1 "LATIN SMALL LETTER D WITH TOPBAR" )
 397  : ( "ƍ"   U+018D Ll 1 "LATIN SMALL LETTER TURNED DELTA" )
 398  : ( "Ǝ"   U+018E Lu 1 "LATIN CAPITAL LETTER REVERSED E" )
 399  : ( "Ə"   U+018F Lu 1 "LATIN CAPITAL LETTER SCHWA" )
 400  : ( "Ɛ"   U+0190 Lu 1 "LATIN CAPITAL LETTER OPEN E" )
 401  : ( "Ƒ"   U+0191 Lu 1 "LATIN CAPITAL LETTER F WITH HOOK" )
 402  : ( "ƒ"   U+0192 Ll 1 "LATIN SMALL LETTER F WITH HOOK" )
 403  : ( "Ɠ"   U+0193 Lu 1 "LATIN CAPITAL LETTER G WITH HOOK" )
 404  : ( "Ɣ"   U+0194 Lu 1 "LATIN CAPITAL LETTER GAMMA" )
 405  : ( "ƕ"   U+0195 Ll 1 "LATIN SMALL LETTER HV" )
 406  : ( "Ɩ"   U+0196 Lu 1 "LATIN CAPITAL LETTER IOTA" )
 407  : ( "Ɨ"   U+0197 Lu 1 "LATIN CAPITAL LETTER I WITH STROKE" )
 408  : ( "Ƙ"   U+0198 Lu 1 "LATIN CAPITAL LETTER K WITH HOOK" )
 409  : ( "ƙ"   U+0199 Ll 1 "LATIN SMALL LETTER K WITH HOOK" )
 410  : ( "ƚ"   U+019A Ll 1 "LATIN SMALL LETTER L WITH BAR" )
 411  : ( "ƛ"   U+019B Ll 1 "LATIN SMALL LETTER LAMBDA WITH STROKE" )
 412  : ( "Ɯ"   U+019C Lu 1 "LATIN CAPITAL LETTER TURNED M" )
 413  : ( "Ɲ"   U+019D Lu 1 "LATIN CAPITAL LETTER N WITH LEFT HOOK" )
 414  : ( "ƞ"   U+019E Ll 1 "LATIN SMALL LETTER N WITH LONG RIGHT LEG" )
 415  : ( "Ɵ"   U+019F Lu 1 "LATIN CAPITAL LETTER O WITH MIDDLE TILDE" )
 416  : ( "Ơ"   U+01A0 Lu 1 "LATIN CAPITAL LETTER O WITH HORN" )
 417  : ( "ơ"   U+01A1 Ll 1 "LATIN SMALL LETTER O WITH HORN" )
 418  : ( "Ƣ"   U+01A2 Lu 1 "LATIN CAPITAL LETTER OI", "LATIN CAPITAL LETTER GHA" )
 419  : ( "ƣ"   U+01A3 Ll 1 "LATIN SMALL LETTER OI", "LATIN SMALL LETTER GHA" )
 420  : ( "Ƥ"   U+01A4 Lu 1 "LATIN CAPITAL LETTER P WITH HOOK" )
 421  : ( "ƥ"   U+01A5 Ll 1 "LATIN SMALL LETTER P WITH HOOK" )
 422  : ( "Ʀ"   U+01A6 Lu 1 "LATIN LETTER YR" )
 423  : ( "Ƨ"   U+01A7 Lu 1 "LATIN CAPITAL LETTER TONE TWO" )
 424  : ( "ƨ"   U+01A8 Ll 1 "LATIN SMALL LETTER TONE TWO" )
 425  : ( "Ʃ"   U+01A9 Lu 1 "LATIN CAPITAL LETTER ESH" )
 426  : ( "ƪ"   U+01AA Ll 1 "LATIN LETTER REVERSED ESH LOOP" )
 427  : ( "ƫ"   U+01AB Ll 1 "LATIN SMALL LETTER T WITH PALATAL HOOK" )
 428  : ( "Ƭ"   U+01AC Lu 1 "LATIN CAPITAL LETTER T WITH HOOK" )
 429  : ( "ƭ"   U+01AD Ll 1 "LATIN SMALL LETTER T WITH HOOK" )
 430  : ( "Ʈ"   U+01AE Lu 1 "LATIN CAPITAL LETTER T WITH RETROFLEX HOOK" )
 431  : ( "Ư"   U+01AF Lu 1 "LATIN CAPITAL LETTER U WITH HORN" )
 432  : ( "ư"   U+01B0 Ll 1 "LATIN SMALL LETTER U WITH HORN" )
 433  : ( "Ʊ"   U+01B1 Lu 1 "LATIN CAPITAL LETTER UPSILON" )
 434  : ( "Ʋ"   U+01B2 Lu 1 "LATIN CAPITAL LETTER V WITH HOOK" )
 435  : ( "Ƴ"   U+01B3 Lu 1 "LATIN CAPITAL LETTER Y WITH HOOK" )
 436  : ( "ƴ"   U+01B4 Ll 1 "LATIN SMALL LETTER Y WITH HOOK" )
 437  : ( "Ƶ"   U+01B5 Lu 1 "LATIN CAPITAL LETTER Z WITH STROKE" )
 438  : ( "ƶ"   U+01B6 Ll 1 "LATIN SMALL LETTER Z WITH STROKE" )
 439  : ( "Ʒ"   U+01B7 Lu 1 "LATIN CAPITAL LETTER EZH" )
 440  : ( "Ƹ"   U+01B8 Lu 1 "LATIN CAPITAL LETTER EZH REVERSED" )
 441  : ( "ƹ"   U+01B9 Ll 1 "LATIN SMALL LETTER EZH REVERSED" )
 442  : ( "ƺ"   U+01BA Ll 1 "LATIN SMALL LETTER EZH WITH TAIL" )
 443  : ( "ƻ"   U+01BB Lo 1 "LATIN LETTER TWO WITH STROKE" )
 444  : ( "Ƽ"   U+01BC Lu 1 "LATIN CAPITAL LETTER TONE FIVE" )
 445  : ( "ƽ"   U+01BD Ll 1 "LATIN SMALL LETTER TONE FIVE" )
 446  : ( "ƾ"   U+01BE Ll 1 "LATIN LETTER INVERTED GLOTTAL STOP WITH STROKE" )
 447  : ( "ƿ"   U+01BF Ll 1 "LATIN LETTER WYNN" )
 448  : ( "ǀ"   U+01C0 Lo 1 "LATIN LETTER DENTAL CLICK" )
 449  : ( "ǁ"   U+01C1 Lo 1 "LATIN LETTER LATERAL CLICK" )
 450  : ( "ǂ"   U+01C2 Lo 1 "LATIN LETTER ALVEOLAR CLICK" )
 451  : ( "ǃ"   U+01C3 Lo 1 "LATIN LETTER RETROFLEX CLICK" )
 452  : ( "DŽ"   U+01C4 Lu 1 "LATIN CAPITAL LETTER DZ WITH CARON" )
 453  : ( "Dž"   U+01C5 Lt 1 "LATIN CAPITAL LETTER D WITH SMALL LETTER Z WITH CARON" )
 454  : ( "dž"   U+01C6 Ll 1 "LATIN SMALL LETTER DZ WITH CARON" )
 455  : ( "LJ"   U+01C7 Lu 1 "LATIN CAPITAL LETTER LJ" )
 456  : ( "Lj"   U+01C8 Lt 1 "LATIN CAPITAL LETTER L WITH SMALL LETTER J" )
 457  : ( "lj"   U+01C9 Ll 1 "LATIN SMALL LETTER LJ" )
 458  : ( "NJ"   U+01CA Lu 1 "LATIN CAPITAL LETTER NJ" )
 459  : ( "Nj"   U+01CB Lt 1 "LATIN CAPITAL LETTER N WITH SMALL LETTER J" )
 460  : ( "nj"   U+01CC Ll 1 "LATIN SMALL LETTER NJ" )
 461  : ( "Ǎ"   U+01CD Lu 1 "LATIN CAPITAL LETTER A WITH CARON" )
 462  : ( "ǎ"   U+01CE Ll 1 "LATIN SMALL LETTER A WITH CARON" )
 463  : ( "Ǐ"   U+01CF Lu 1 "LATIN CAPITAL LETTER I WITH CARON" )
 464  : ( "ǐ"   U+01D0 Ll 1 "LATIN SMALL LETTER I WITH CARON" )
 465  : ( "Ǒ"   U+01D1 Lu 1 "LATIN CAPITAL LETTER O WITH CARON" )
 466  : ( "ǒ"   U+01D2 Ll 1 "LATIN SMALL LETTER O WITH CARON" )
 467  : ( "Ǔ"   U+01D3 Lu 1 "LATIN CAPITAL LETTER U WITH CARON" )
 468  : ( "ǔ"   U+01D4 Ll 1 "LATIN SMALL LETTER U WITH CARON" )
 469  : ( "Ǖ"   U+01D5 Lu 1 "LATIN CAPITAL LETTER U WITH DIAERESIS AND MACRON" )
 470  : ( "ǖ"   U+01D6 Ll 1 "LATIN SMALL LETTER U WITH DIAERESIS AND MACRON" )
 471  : ( "Ǘ"   U+01D7 Lu 1 "LATIN CAPITAL LETTER U WITH DIAERESIS AND ACUTE" )
 472  : ( "ǘ"   U+01D8 Ll 1 "LATIN SMALL LETTER U WITH DIAERESIS AND ACUTE" )
 473  : ( "Ǚ"   U+01D9 Lu 1 "LATIN CAPITAL LETTER U WITH DIAERESIS AND CARON" )
 474  : ( "ǚ"   U+01DA Ll 1 "LATIN SMALL LETTER U WITH DIAERESIS AND CARON" )
 475  : ( "Ǜ"   U+01DB Lu 1 "LATIN CAPITAL LETTER U WITH DIAERESIS AND GRAVE" )
 476  : ( "ǜ"   U+01DC Ll 1 "LATIN SMALL LETTER U WITH DIAERESIS AND GRAVE" )
 477  : ( "ǝ"   U+01DD Ll 1 "LATIN SMALL LETTER TURNED E" )
 478  : ( "Ǟ"   U+01DE Lu 1 "LATIN CAPITAL LETTER A WITH DIAERESIS AND MACRON" )
 479  : ( "ǟ"   U+01DF Ll 1 "LATIN SMALL LETTER A WITH DIAERESIS AND MACRON" )
 480  : ( "Ǡ"   U+01E0 Lu 1 "LATIN CAPITAL LETTER A WITH DOT ABOVE AND MACRON" )
 481  : ( "ǡ"   U+01E1 Ll 1 "LATIN SMALL LETTER A WITH DOT ABOVE AND MACRON" )
 482  : ( "Ǣ"   U+01E2 Lu 1 "LATIN CAPITAL LETTER AE WITH MACRON" )
 483  : ( "ǣ"   U+01E3 Ll 1 "LATIN SMALL LETTER AE WITH MACRON" )
 484  : ( "Ǥ"   U+01E4 Lu 1 "LATIN CAPITAL LETTER G WITH STROKE" )
 485  : ( "ǥ"   U+01E5 Ll 1 "LATIN SMALL LETTER G WITH STROKE" )
 486  : ( "Ǧ"   U+01E6 Lu 1 "LATIN CAPITAL LETTER G WITH CARON" )
 487  : ( "ǧ"   U+01E7 Ll 1 "LATIN SMALL LETTER G WITH CARON" )
 488  : ( "Ǩ"   U+01E8 Lu 1 "LATIN CAPITAL LETTER K WITH CARON" )
 489  : ( "ǩ"   U+01E9 Ll 1 "LATIN SMALL LETTER K WITH CARON" )
 490  : ( "Ǫ"   U+01EA Lu 1 "LATIN CAPITAL LETTER O WITH OGONEK" )
 491  : ( "ǫ"   U+01EB Ll 1 "LATIN SMALL LETTER O WITH OGONEK" )
 492  : ( "Ǭ"   U+01EC Lu 1 "LATIN CAPITAL LETTER O WITH OGONEK AND MACRON" )
 493  : ( "ǭ"   U+01ED Ll 1 "LATIN SMALL LETTER O WITH OGONEK AND MACRON" )
 494  : ( "Ǯ"   U+01EE Lu 1 "LATIN CAPITAL LETTER EZH WITH CARON" )
 495  : ( "ǯ"   U+01EF Ll 1 "LATIN SMALL LETTER EZH WITH CARON" )
 496  : ( "ǰ"   U+01F0 Ll 1 "LATIN SMALL LETTER J WITH CARON" )
 497  : ( "DZ"   U+01F1 Lu 1 "LATIN CAPITAL LETTER DZ" )
 498  : ( "Dz"   U+01F2 Lt 1 "LATIN CAPITAL LETTER D WITH SMALL LETTER Z" )
 499  : ( "dz"   U+01F3 Ll 1 "LATIN SMALL LETTER DZ" )
 500  : ( "Ǵ"   U+01F4 Lu 1 "LATIN CAPITAL LETTER G WITH ACUTE" )
 501  : ( "ǵ"   U+01F5 Ll 1 "LATIN SMALL LETTER G WITH ACUTE" )
 502  : ( "Ƕ"   U+01F6 Lu 1 "LATIN CAPITAL LETTER HWAIR" )
 503  : ( "Ƿ"   U+01F7 Lu 1 "LATIN CAPITAL LETTER WYNN" )
 504  : ( "Ǹ"   U+01F8 Lu 1 "LATIN CAPITAL LETTER N WITH GRAVE" )
 505  : ( "ǹ"   U+01F9 Ll 1 "LATIN SMALL LETTER N WITH GRAVE" )
 506  : ( "Ǻ"   U+01FA Lu 1 "LATIN CAPITAL LETTER A WITH RING ABOVE AND ACUTE" )
 507  : ( "ǻ"   U+01FB Ll 1 "LATIN SMALL LETTER A WITH RING ABOVE AND ACUTE" )
 508  : ( "Ǽ"   U+01FC Lu 1 "LATIN CAPITAL LETTER AE WITH ACUTE" )
 509  : ( "ǽ"   U+01FD Ll 1 "LATIN SMALL LETTER AE WITH ACUTE" )
 510  : ( "Ǿ"   U+01FE Lu 1 "LATIN CAPITAL LETTER O WITH STROKE AND ACUTE" )
 511  : ( "ǿ"   U+01FF Ll 1 "LATIN SMALL LETTER O WITH STROKE AND ACUTE" )
 512  : ( "Ȁ"   U+0200 Lu 1 "LATIN CAPITAL LETTER A WITH DOUBLE GRAVE" )
 513  : ( "ȁ"   U+0201 Ll 1 "LATIN SMALL LETTER A WITH DOUBLE GRAVE" )
 514  : ( "Ȃ"   U+0202 Lu 1 "LATIN CAPITAL LETTER A WITH INVERTED BREVE" )
 515  : ( "ȃ"   U+0203 Ll 1 "LATIN SMALL LETTER A WITH INVERTED BREVE" )
 516  : ( "Ȅ"   U+0204 Lu 1 "LATIN CAPITAL LETTER E WITH DOUBLE GRAVE" )
 517  : ( "ȅ"   U+0205 Ll 1 "LATIN SMALL LETTER E WITH DOUBLE GRAVE" )
 518  : ( "Ȇ"   U+0206 Lu 1 "LATIN CAPITAL LETTER E WITH INVERTED BREVE" )
 519  : ( "ȇ"   U+0207 Ll 1 "LATIN SMALL LETTER E WITH INVERTED BREVE" )
 520  : ( "Ȉ"   U+0208 Lu 1 "LATIN CAPITAL LETTER I WITH DOUBLE GRAVE" )
 521  : ( "ȉ"   U+0209 Ll 1 "LATIN SMALL LETTER I WITH DOUBLE GRAVE" )
 522  : ( "Ȋ"   U+020A Lu 1 "LATIN CAPITAL LETTER I WITH INVERTED BREVE" )
 523  : ( "ȋ"   U+020B Ll 1 "LATIN SMALL LETTER I WITH INVERTED BREVE" )
 524  : ( "Ȍ"   U+020C Lu 1 "LATIN CAPITAL LETTER O WITH DOUBLE GRAVE" )
 525  : ( "ȍ"   U+020D Ll 1 "LATIN SMALL LETTER O WITH DOUBLE GRAVE" )
 526  : ( "Ȏ"   U+020E Lu 1 "LATIN CAPITAL LETTER O WITH INVERTED BREVE" )
 527  : ( "ȏ"   U+020F Ll 1 "LATIN SMALL LETTER O WITH INVERTED BREVE" )
 528  : ( "Ȑ"   U+0210 Lu 1 "LATIN CAPITAL LETTER R WITH DOUBLE GRAVE" )
 529  : ( "ȑ"   U+0211 Ll 1 "LATIN SMALL LETTER R WITH DOUBLE GRAVE" )
 530  : ( "Ȓ"   U+0212 Lu 1 "LATIN CAPITAL LETTER R WITH INVERTED BREVE" )
 531  : ( "ȓ"   U+0213 Ll 1 "LATIN SMALL LETTER R WITH INVERTED BREVE" )
 532  : ( "Ȕ"   U+0214 Lu 1 "LATIN CAPITAL LETTER U WITH DOUBLE GRAVE" )
 533  : ( "ȕ"   U+0215 Ll 1 "LATIN SMALL LETTER U WITH DOUBLE GRAVE" )
 534  : ( "Ȗ"   U+0216 Lu 1 "LATIN CAPITAL LETTER U WITH INVERTED BREVE" )
 535  : ( "ȗ"   U+0217 Ll 1 "LATIN SMALL LETTER U WITH INVERTED BREVE" )
 536  : ( "Ș"   U+0218 Lu 1 "LATIN CAPITAL LETTER S WITH COMMA BELOW" )
 537  : ( "ș"   U+0219 Ll 1 "LATIN SMALL LETTER S WITH COMMA BELOW" )
 538  : ( "Ț"   U+021A Lu 1 "LATIN CAPITAL LETTER T WITH COMMA BELOW" )
 539  : ( "ț"   U+021B Ll 1 "LATIN SMALL LETTER T WITH COMMA BELOW" )
 540  : ( "Ȝ"   U+021C Lu 1 "LATIN CAPITAL LETTER YOGH" )
 541  : ( "ȝ"   U+021D Ll 1 "LATIN SMALL LETTER YOGH" )
 542  : ( "Ȟ"   U+021E Lu 1 "LATIN CAPITAL LETTER H WITH CARON" )
 543  : ( "ȟ"   U+021F Ll 1 "LATIN SMALL LETTER H WITH CARON" )
 544  : ( "Ƞ"   U+0220 Lu 1 "LATIN CAPITAL LETTER N WITH LONG RIGHT LEG" )
 545  : ( "ȡ"   U+0221 Ll 1 "LATIN SMALL LETTER D WITH CURL" )
 546  : ( "Ȣ"   U+0222 Lu 1 "LATIN CAPITAL LETTER OU" )
 547  : ( "ȣ"   U+0223 Ll 1 "LATIN SMALL LETTER OU" )
 548  : ( "Ȥ"   U+0224 Lu 1 "LATIN CAPITAL LETTER Z WITH HOOK" )
 549  : ( "ȥ"   U+0225 Ll 1 "LATIN SMALL LETTER Z WITH HOOK" )
 550  : ( "Ȧ"   U+0226 Lu 1 "LATIN CAPITAL LETTER A WITH DOT ABOVE" )
 551  : ( "ȧ"   U+0227 Ll 1 "LATIN SMALL LETTER A WITH DOT ABOVE" )
 552  : ( "Ȩ"   U+0228 Lu 1 "LATIN CAPITAL LETTER E WITH CEDILLA" )
 553  : ( "ȩ"   U+0229 Ll 1 "LATIN SMALL LETTER E WITH CEDILLA" )
 554  : ( "Ȫ"   U+022A Lu 1 "LATIN CAPITAL LETTER O WITH DIAERESIS AND MACRON" )
 555  : ( "ȫ"   U+022B Ll 1 "LATIN SMALL LETTER O WITH DIAERESIS AND MACRON" )
 556  : ( "Ȭ"   U+022C Lu 1 "LATIN CAPITAL LETTER O WITH TILDE AND MACRON" )
 557  : ( "ȭ"   U+022D Ll 1 "LATIN SMALL LETTER O WITH TILDE AND MACRON" )
 558  : ( "Ȯ"   U+022E Lu 1 "LATIN CAPITAL LETTER O WITH DOT ABOVE" )
 559  : ( "ȯ"   U+022F Ll 1 "LATIN SMALL LETTER O WITH DOT ABOVE" )
 560  : ( "Ȱ"   U+0230 Lu 1 "LATIN CAPITAL LETTER O WITH DOT ABOVE AND MACRON" )
 561  : ( "ȱ"   U+0231 Ll 1 "LATIN SMALL LETTER O WITH DOT ABOVE AND MACRON" )
 562  : ( "Ȳ"   U+0232 Lu 1 "LATIN CAPITAL LETTER Y WITH MACRON" )
 563  : ( "ȳ"   U+0233 Ll 1 "LATIN SMALL LETTER Y WITH MACRON" )
 564  : ( "ȴ"   U+0234 Ll 1 "LATIN SMALL LETTER L WITH CURL" )
 565  : ( "ȵ"   U+0235 Ll 1 "LATIN SMALL LETTER N WITH CURL" )
 566  : ( "ȶ"   U+0236 Ll 1 "LATIN SMALL LETTER T WITH CURL" )
 567  : ( "ȷ"   U+0237 Ll 1 "LATIN SMALL LETTER DOTLESS J" )
 568  : ( "ȸ"   U+0238 Ll 1 "LATIN SMALL LETTER DB DIGRAPH" )
 569  : ( "ȹ"   U+0239 Ll 1 "LATIN SMALL LETTER QP DIGRAPH" )
 570  : ( "Ⱥ"   U+023A Lu 1 "LATIN CAPITAL LETTER A WITH STROKE" )
 571  : ( "Ȼ"   U+023B Lu 1 "LATIN CAPITAL LETTER C WITH STROKE" )
 572  : ( "ȼ"   U+023C Ll 1 "LATIN SMALL LETTER C WITH STROKE" )
 573  : ( "Ƚ"   U+023D Lu 1 "LATIN CAPITAL LETTER L WITH BAR" )
 574  : ( "Ⱦ"   U+023E Lu 1 "LATIN CAPITAL LETTER T WITH DIAGONAL STROKE" )
 575  : ( "ȿ"   U+023F Ll 1 "LATIN SMALL LETTER S WITH SWASH TAIL" )
 576  : ( "ɀ"   U+0240 Ll 1 "LATIN SMALL LETTER Z WITH SWASH TAIL" )
 577  : ( "Ɂ"   U+0241 Lu 1 "LATIN CAPITAL LETTER GLOTTAL STOP" )
 578  : ( "ɂ"   U+0242 Ll 1 "LATIN SMALL LETTER GLOTTAL STOP" )
 579  : ( "Ƀ"   U+0243 Lu 1 "LATIN CAPITAL LETTER B WITH STROKE" )
 580  : ( "Ʉ"   U+0244 Lu 1 "LATIN CAPITAL LETTER U BAR" )
 581  : ( "Ʌ"   U+0245 Lu 1 "LATIN CAPITAL LETTER TURNED V" )
 582  : ( "Ɇ"   U+0246 Lu 1 "LATIN CAPITAL LETTER E WITH STROKE" )
 583  : ( "ɇ"   U+0247 Ll 1 "LATIN SMALL LETTER E WITH STROKE" )
 584  : ( "Ɉ"   U+0248 Lu 1 "LATIN CAPITAL LETTER J WITH STROKE" )
 585  : ( "ɉ"   U+0249 Ll 1 "LATIN SMALL LETTER J WITH STROKE" )
 586  : ( "Ɋ"   U+024A Lu 1 "LATIN CAPITAL LETTER SMALL Q WITH HOOK TAIL" )
 587  : ( "ɋ"   U+024B Ll 1 "LATIN SMALL LETTER Q WITH HOOK TAIL" )
 588  : ( "Ɍ"   U+024C Lu 1 "LATIN CAPITAL LETTER R WITH STROKE" )
 589  : ( "ɍ"   U+024D Ll 1 "LATIN SMALL LETTER R WITH STROKE" )
 590  : ( "Ɏ"   U+024E Lu 1 "LATIN CAPITAL LETTER Y WITH STROKE" )
 591  : ( "ɏ"   U+024F Ll 1 "LATIN SMALL LETTER Y WITH STROKE" )
 592  : ( "ɐ"   U+0250 Ll 1 "LATIN SMALL LETTER TURNED A" )
 593  : ( "ɑ"   U+0251 Ll 1 "LATIN SMALL LETTER ALPHA" )
 594  : ( "ɒ"   U+0252 Ll 1 "LATIN SMALL LETTER TURNED ALPHA" )
 595  : ( "ɓ"   U+0253 Ll 1 "LATIN SMALL LETTER B WITH HOOK" )
 596  : ( "ɔ"   U+0254 Ll 1 "LATIN SMALL LETTER OPEN O" )
 597  : ( "ɕ"   U+0255 Ll 1 "LATIN SMALL LETTER C WITH CURL" )
 598  : ( "ɖ"   U+0256 Ll 1 "LATIN SMALL LETTER D WITH TAIL" )
 599  : ( "ɗ"   U+0257 Ll 1 "LATIN SMALL LETTER D WITH HOOK" )
 600  : ( "ɘ"   U+0258 Ll 1 "LATIN SMALL LETTER REVERSED E" )
 601  : ( "ə"   U+0259 Ll 1 "LATIN SMALL LETTER SCHWA" )
 602  : ( "ɚ"   U+025A Ll 1 "LATIN SMALL LETTER SCHWA WITH HOOK" )
 603  : ( "ɛ"   U+025B Ll 1 "LATIN SMALL LETTER OPEN E" )
 604  : ( "ɜ"   U+025C Ll 1 "LATIN SMALL LETTER REVERSED OPEN E" )
 605  : ( "ɝ"   U+025D Ll 1 "LATIN SMALL LETTER REVERSED OPEN E WITH HOOK" )
 606  : ( "ɞ"   U+025E Ll 1 "LATIN SMALL LETTER CLOSED REVERSED OPEN E" )
 607  : ( "ɟ"   U+025F Ll 1 "LATIN SMALL LETTER DOTLESS J WITH STROKE" )
 608  : ( "ɠ"   U+0260 Ll 1 "LATIN SMALL LETTER G WITH HOOK" )
 609  : ( "ɡ"   U+0261 Ll 1 "LATIN SMALL LETTER SCRIPT G" )
 610  : ( "ɢ"   U+0262 Ll 1 "LATIN LETTER SMALL CAPITAL G" )
 611  : ( "ɣ"   U+0263 Ll 1 "LATIN SMALL LETTER GAMMA" )
 612  : ( "ɤ"   U+0264 Ll 1 "LATIN SMALL LETTER RAMS HORN" )
 613  : ( "ɥ"   U+0265 Ll 1 "LATIN SMALL LETTER TURNED H" )
 614  : ( "ɦ"   U+0266 Ll 1 "LATIN SMALL LETTER H WITH HOOK" )
 615  : ( "ɧ"   U+0267 Ll 1 "LATIN SMALL LETTER HENG WITH HOOK" )
 616  : ( "ɨ"   U+0268 Ll 1 "LATIN SMALL LETTER I WITH STROKE" )
 617  : ( "ɩ"   U+0269 Ll 1 "LATIN SMALL LETTER IOTA" )
 618  : ( "ɪ"   U+026A Ll 1 "LATIN LETTER SMALL CAPITAL I" )
 619  : ( "ɫ"   U+026B Ll 1 "LATIN SMALL LETTER L WITH MIDDLE TILDE" )
 620  : ( "ɬ"   U+026C Ll 1 "LATIN SMALL LETTER L WITH BELT" )
 621  : ( "ɭ"   U+026D Ll 1 "LATIN SMALL LETTER L WITH RETROFLEX HOOK" )
 622  : ( "ɮ"   U+026E Ll 1 "LATIN SMALL LETTER LEZH" )
 623  : ( "ɯ"   U+026F Ll 1 "LATIN SMALL LETTER TURNED M" )
 624  : ( "ɰ"   U+0270 Ll 1 "LATIN SMALL LETTER TURNED M WITH LONG LEG" )
 625  : ( "ɱ"   U+0271 Ll 1 "LATIN SMALL LETTER M WITH HOOK" )
 626  : ( "ɲ"   U+0272 Ll 1 "LATIN SMALL LETTER N WITH LEFT HOOK" )
 627  : ( "ɳ"   U+0273 Ll 1 "LATIN SMALL LETTER N WITH RETROFLEX HOOK" )
 628  : ( "ɴ"   U+0274 Ll 1 "LATIN LETTER SMALL CAPITAL N" )
 629  : ( "ɵ"   U+0275 Ll 1 "LATIN SMALL LETTER BARRED O" )
 630  : ( "ɶ"   U+0276 Ll 1 "LATIN LETTER SMALL CAPITAL OE" )
 631  : ( "ɷ"   U+0277 Ll 1 "LATIN SMALL LETTER CLOSED OMEGA" )
 632  : ( "ɸ"   U+0278 Ll 1 "LATIN SMALL LETTER PHI" )
 633  : ( "ɹ"   U+0279 Ll 1 "LATIN SMALL LETTER TURNED R" )
 634  : ( "ɺ"   U+027A Ll 1 "LATIN SMALL LETTER TURNED R WITH LONG LEG" )
 635  : ( "ɻ"   U+027B Ll 1 "LATIN SMALL LETTER TURNED R WITH HOOK" )
 636  : ( "ɼ"   U+027C Ll 1 "LATIN SMALL LETTER R WITH LONG LEG" )
 637  : ( "ɽ"   U+027D Ll 1 "LATIN SMALL LETTER R WITH TAIL" )
 638  : ( "ɾ"   U+027E Ll 1 "LATIN SMALL LETTER R WITH FISHHOOK" )
 639  : ( "ɿ"   U+027F Ll 1 "LATIN SMALL LETTER REVERSED R WITH FISHHOOK" )
 640  : ( "ʀ"   U+0280 Ll 1 "LATIN LETTER SMALL CAPITAL R" )
 641  : ( "ʁ"   U+0281 Ll 1 "LATIN LETTER SMALL CAPITAL INVERTED R" )
 642  : ( "ʂ"   U+0282 Ll 1 "LATIN SMALL LETTER S WITH HOOK" )
 643  : ( "ʃ"   U+0283 Ll 1 "LATIN SMALL LETTER ESH" )
 644  : ( "ʄ"   U+0284 Ll 1 "LATIN SMALL LETTER DOTLESS J WITH STROKE AND HOOK" )
 645  : ( "ʅ"   U+0285 Ll 1 "LATIN SMALL LETTER SQUAT REVERSED ESH" )
 646  : ( "ʆ"   U+0286 Ll 1 "LATIN SMALL LETTER ESH WITH CURL" )
 647  : ( "ʇ"   U+0287 Ll 1 "LATIN SMALL LETTER TURNED T" )
 648  : ( "ʈ"   U+0288 Ll 1 "LATIN SMALL LETTER T WITH RETROFLEX HOOK" )
 649  : ( "ʉ"   U+0289 Ll 1 "LATIN SMALL LETTER U BAR" )
 650  : ( "ʊ"   U+028A Ll 1 "LATIN SMALL LETTER UPSILON" )
 651  : ( "ʋ"   U+028B Ll 1 "LATIN SMALL LETTER V WITH HOOK" )
 652  : ( "ʌ"   U+028C Ll 1 "LATIN SMALL LETTER TURNED V" )
 653  : ( "ʍ"   U+028D Ll 1 "LATIN SMALL LETTER TURNED W" )
 654  : ( "ʎ"   U+028E Ll 1 "LATIN SMALL LETTER TURNED Y" )
 655  : ( "ʏ"   U+028F Ll 1 "LATIN LETTER SMALL CAPITAL Y" )
 656  : ( "ʐ"   U+0290 Ll 1 "LATIN SMALL LETTER Z WITH RETROFLEX HOOK" )
 657  : ( "ʑ"   U+0291 Ll 1 "LATIN SMALL LETTER Z WITH CURL" )
 658  : ( "ʒ"   U+0292 Ll 1 "LATIN SMALL LETTER EZH" )
 659  : ( "ʓ"   U+0293 Ll 1 "LATIN SMALL LETTER EZH WITH CURL" )
 660  : ( "ʔ"   U+0294 Lo 1 "LATIN LETTER GLOTTAL STOP" )
 661  : ( "ʕ"   U+0295 Ll 1 "LATIN LETTER PHARYNGEAL VOICED FRICATIVE" )
 662  : ( "ʖ"   U+0296 Ll 1 "LATIN LETTER INVERTED GLOTTAL STOP" )
 663  : ( "ʗ"   U+0297 Ll 1 "LATIN LETTER STRETCHED C" )
 664  : ( "ʘ"   U+0298 Ll 1 "LATIN LETTER BILABIAL CLICK" )
 665  : ( "ʙ"   U+0299 Ll 1 "LATIN LETTER SMALL CAPITAL B" )
 666  : ( "ʚ"   U+029A Ll 1 "LATIN SMALL LETTER CLOSED OPEN E" )
 667  : ( "ʛ"   U+029B Ll 1 "LATIN LETTER SMALL CAPITAL G WITH HOOK" )
 668  : ( "ʜ"   U+029C Ll 1 "LATIN LETTER SMALL CAPITAL H" )
 669  : ( "ʝ"   U+029D Ll 1 "LATIN SMALL LETTER J WITH CROSSED-TAIL" )
 670  : ( "ʞ"   U+029E Ll 1 "LATIN SMALL LETTER TURNED K" )
 671  : ( "ʟ"   U+029F Ll 1 "LATIN LETTER SMALL CAPITAL L" )
 672  : ( "ʠ"   U+02A0 Ll 1 "LATIN SMALL LETTER Q WITH HOOK" )
 673  : ( "ʡ"   U+02A1 Ll 1 "LATIN LETTER GLOTTAL STOP WITH STROKE" )
 674  : ( "ʢ"   U+02A2 Ll 1 "LATIN LETTER REVERSED GLOTTAL STOP WITH STROKE" )
 675  : ( "ʣ"   U+02A3 Ll 1 "LATIN SMALL LETTER DZ DIGRAPH" )
 676  : ( "ʤ"   U+02A4 Ll 1 "LATIN SMALL LETTER DEZH DIGRAPH" )
 677  : ( "ʥ"   U+02A5 Ll 1 "LATIN SMALL LETTER DZ DIGRAPH WITH CURL" )
 678  : ( "ʦ"   U+02A6 Ll 1 "LATIN SMALL LETTER TS DIGRAPH" )
 679  : ( "ʧ"   U+02A7 Ll 1 "LATIN SMALL LETTER TESH DIGRAPH" )
 680  : ( "ʨ"   U+02A8 Ll 1 "LATIN SMALL LETTER TC DIGRAPH WITH CURL" )
 681  : ( "ʩ"   U+02A9 Ll 1 "LATIN SMALL LETTER FENG DIGRAPH" )
 682  : ( "ʪ"   U+02AA Ll 1 "LATIN SMALL LETTER LS DIGRAPH" )
 683  : ( "ʫ"   U+02AB Ll 1 "LATIN SMALL LETTER LZ DIGRAPH" )
 684  : ( "ʬ"   U+02AC Ll 1 "LATIN LETTER BILABIAL PERCUSSIVE" )
 685  : ( "ʭ"   U+02AD Ll 1 "LATIN LETTER BIDENTAL PERCUSSIVE" )
 686  : ( "ʮ"   U+02AE Ll 1 "LATIN SMALL LETTER TURNED H WITH FISHHOOK" )
 687  : ( "ʯ"   U+02AF Ll 1 "LATIN SMALL LETTER TURNED H WITH FISHHOOK AND TAIL" )
 688  : ( "ʰ"   U+02B0 Lm 1 "MODIFIER LETTER SMALL H" )
 689  : ( "ʱ"   U+02B1 Lm 1 "MODIFIER LETTER SMALL H WITH HOOK" )
 690  : ( "ʲ"   U+02B2 Lm 1 "MODIFIER LETTER SMALL J" )
 691  : ( "ʳ"   U+02B3 Lm 1 "MODIFIER LETTER SMALL R" )
 692  : ( "ʴ"   U+02B4 Lm 1 "MODIFIER LETTER SMALL TURNED R" )
 693  : ( "ʵ"   U+02B5 Lm 1 "MODIFIER LETTER SMALL TURNED R WITH HOOK" )
 694  : ( "ʶ"   U+02B6 Lm 1 "MODIFIER LETTER SMALL CAPITAL INVERTED R" )
 695  : ( "ʷ"   U+02B7 Lm 1 "MODIFIER LETTER SMALL W" )
 696  : ( "ʸ"   U+02B8 Lm 1 "MODIFIER LETTER SMALL Y" )
 697  : ( "ʹ"   U+02B9 Lm 1 "MODIFIER LETTER PRIME" )
 698  : ( "ʺ"   U+02BA Lm 1 "MODIFIER LETTER DOUBLE PRIME" )
 699  : ( "ʻ"   U+02BB Lm 1 "MODIFIER LETTER TURNED COMMA" )
 700  : ( "ʼ"   U+02BC Lm 1 "MODIFIER LETTER APOSTROPHE" )
 701  : ( "ʽ"   U+02BD Lm 1 "MODIFIER LETTER REVERSED COMMA" )
 702  : ( "ʾ"   U+02BE Lm 1 "MODIFIER LETTER RIGHT HALF RING" )
 703  : ( "ʿ"   U+02BF Lm 1 "MODIFIER LETTER LEFT HALF RING" )
 704  : ( "ˀ"   U+02C0 Lm 1 "MODIFIER LETTER GLOTTAL STOP" )
 705  : ( "ˁ"   U+02C1 Lm 1 "MODIFIER LETTER REVERSED GLOTTAL STOP" )
 706  : ( "˂"   U+02C2 Sk 1 "MODIFIER LETTER LEFT ARROWHEAD" )
 707  : ( "˃"   U+02C3 Sk 1 "MODIFIER LETTER RIGHT ARROWHEAD" )
 708  : ( "˄"   U+02C4 Sk 1 "MODIFIER LETTER UP ARROWHEAD" )
 709  : ( "˅"   U+02C5 Sk 1 "MODIFIER LETTER DOWN ARROWHEAD" )
 710  : ( "ˆ"   U+02C6 Lm 1 "MODIFIER LETTER CIRCUMFLEX ACCENT" )
 711  : ( "ˇ"   U+02C7 Lm 1 "CARON" )
 712  : ( "ˈ"   U+02C8 Lm 1 "MODIFIER LETTER VERTICAL LINE" )
 713  : ( "ˉ"   U+02C9 Lm 1 "MODIFIER LETTER MACRON" )
 714  : ( "ˊ"   U+02CA Lm 1 "MODIFIER LETTER ACUTE ACCENT" )
 715  : ( "ˋ"   U+02CB Lm 1 "MODIFIER LETTER GRAVE ACCENT" )
 716  : ( "ˌ"   U+02CC Lm 1 "MODIFIER LETTER LOW VERTICAL LINE" )
 717  : ( "ˍ"   U+02CD Lm 1 "MODIFIER LETTER LOW MACRON" )
 718  : ( "ˎ"   U+02CE Lm 1 "MODIFIER LETTER LOW GRAVE ACCENT" )
 719  : ( "ˏ"   U+02CF Lm 1 "MODIFIER LETTER LOW ACUTE ACCENT" )
 720  : ( "ː"   U+02D0 Lm 1 "MODIFIER LETTER TRIANGULAR COLON" )
 721  : ( "ˑ"   U+02D1 Lm 1 "MODIFIER LETTER HALF TRIANGULAR COLON" )
 722  : ( "˒"   U+02D2 Sk 1 "MODIFIER LETTER CENTRED RIGHT HALF RING" )
 723  : ( "˓"   U+02D3 Sk 1 "MODIFIER LETTER CENTRED LEFT HALF RING" )
 724  : ( "˔"   U+02D4 Sk 1 "MODIFIER LETTER UP TACK" )
 725  : ( "˕"   U+02D5 Sk 1 "MODIFIER LETTER DOWN TACK" )
 726  : ( "˖"   U+02D6 Sk 1 "MODIFIER LETTER PLUS SIGN" )
 727  : ( "˗"   U+02D7 Sk 1 "MODIFIER LETTER MINUS SIGN" )
 728  : ( "˘"   U+02D8 Sk 1 "BREVE" )
 729  : ( "˙"   U+02D9 Sk 1 "DOT ABOVE" )
 730  : ( "˚"   U+02DA Sk 1 "RING ABOVE" )
 731  : ( "˛"   U+02DB Sk 1 "OGONEK" )
 732  : ( "˜"   U+02DC Sk 1 "SMALL TILDE" )
 733  : ( "˝"   U+02DD Sk 1 "DOUBLE ACUTE ACCENT" )
 734  : ( "˞"   U+02DE Sk 1 "MODIFIER LETTER RHOTIC HOOK" )
 735  : ( "˟"   U+02DF Sk 1 "MODIFIER LETTER CROSS ACCENT" )
 736  : ( "ˠ"   U+02E0 Lm 1 "MODIFIER LETTER SMALL GAMMA" )
 737  : ( "ˡ"   U+02E1 Lm 1 "MODIFIER LETTER SMALL L" )
 738  : ( "ˢ"   U+02E2 Lm 1 "MODIFIER LETTER SMALL S" )
 739  : ( "ˣ"   U+02E3 Lm 1 "MODIFIER LETTER SMALL X" )
 740  : ( "ˤ"   U+02E4 Lm 1 "MODIFIER LETTER SMALL REVERSED GLOTTAL STOP" )
 741  : ( "˥"   U+02E5 Sk 1 "MODIFIER LETTER EXTRA-HIGH TONE BAR" )
 742  : ( "˦"   U+02E6 Sk 1 "MODIFIER LETTER HIGH TONE BAR" )
 743  : ( "˧"   U+02E7 Sk 1 "MODIFIER LETTER MID TONE BAR" )
 744  : ( "˨"   U+02E8 Sk 1 "MODIFIER LETTER LOW TONE BAR" )
 745  : ( "˩"   U+02E9 Sk 1 "MODIFIER LETTER EXTRA-LOW TONE BAR" )
 746  : ( "˪"   U+02EA Sk 1 "MODIFIER LETTER YIN DEPARTING TONE MARK" )
 747  : ( "˫"   U+02EB Sk 1 "MODIFIER LETTER YANG DEPARTING TONE MARK" )
 748  : ( "ˬ"   U+02EC Lm 1 "MODIFIER LETTER VOICING" )
 749  : ( "˭"   U+02ED Sk 1 "MODIFIER LETTER UNASPIRATED" )
 750  : ( "ˮ"   U+02EE Lm 1 "MODIFIER LETTER DOUBLE APOSTROPHE" )
 751  : ( "˯"   U+02EF Sk 1 "MODIFIER LETTER LOW DOWN ARROWHEAD" )
 752  : ( "˰"   U+02F0 Sk 1 "MODIFIER LETTER LOW UP ARROWHEAD" )
 753  : ( "˱"   U+02F1 Sk 1 "MODIFIER LETTER LOW LEFT ARROWHEAD" )
 754  : ( "˲"   U+02F2 Sk 1 "MODIFIER LETTER LOW RIGHT ARROWHEAD" )
 755  : ( "˳"   U+02F3 Sk 1 "MODIFIER LETTER LOW RING" )
 756  : ( "˴"   U+02F4 Sk 1 "MODIFIER LETTER MIDDLE GRAVE ACCENT" )
 757  : ( "˵"   U+02F5 Sk 1 "MODIFIER LETTER MIDDLE DOUBLE GRAVE ACCENT" )
 758  : ( "˶"   U+02F6 Sk 1 "MODIFIER LETTER MIDDLE DOUBLE ACUTE ACCENT" )
 759  : ( "˷"   U+02F7 Sk 1 "MODIFIER LETTER LOW TILDE" )
 760  : ( "˸"   U+02F8 Sk 1 "MODIFIER LETTER RAISED COLON" )
 761  : ( "˹"   U+02F9 Sk 1 "MODIFIER LETTER BEGIN HIGH TONE" )
 762  : ( "˺"   U+02FA Sk 1 "MODIFIER LETTER END HIGH TONE" )
 763  : ( "˻"   U+02FB Sk 1 "MODIFIER LETTER BEGIN LOW TONE" )
 764  : ( "˼"   U+02FC Sk 1 "MODIFIER LETTER END LOW TONE" )
 765  : ( "˽"   U+02FD Sk 1 "MODIFIER LETTER SHELF" )
 766  : ( "˾"   U+02FE Sk 1 "MODIFIER LETTER OPEN SHELF" )
 767  : ( "˿"   U+02FF Sk 1 "MODIFIER LETTER LOW LEFT ARROW" )
 768  : ( "̀"    U+0300 Mn 0 "COMBINING GRAVE ACCENT" )
 769  : ( "́"    U+0301 Mn 0 "COMBINING ACUTE ACCENT" )
 770  : ( "̂"    U+0302 Mn 0 "COMBINING CIRCUMFLEX ACCENT" )
 771  : ( "̃"    U+0303 Mn 0 "COMBINING TILDE" )
 772  : ( "̄"    U+0304 Mn 0 "COMBINING MACRON" )
 773  : ( "̅"    U+0305 Mn 0 "COMBINING OVERLINE" )
 774  : ( "̆"    U+0306 Mn 0 "COMBINING BREVE" )
 775  : ( "̇"    U+0307 Mn 0 "COMBINING DOT ABOVE" )
 776  : ( "̈"    U+0308 Mn 0 "COMBINING DIAERESIS" )
 777  : ( "̉"    U+0309 Mn 0 "COMBINING HOOK ABOVE" )
 778  : ( "̊"    U+030A Mn 0 "COMBINING RING ABOVE" )
 779  : ( "̋"    U+030B Mn 0 "COMBINING DOUBLE ACUTE ACCENT" )
 780  : ( "̌"    U+030C Mn 0 "COMBINING CARON" )
 781  : ( "̍"    U+030D Mn 0 "COMBINING VERTICAL LINE ABOVE" )
 782  : ( "̎"    U+030E Mn 0 "COMBINING DOUBLE VERTICAL LINE ABOVE" )
 783  : ( "̏"    U+030F Mn 0 "COMBINING DOUBLE GRAVE ACCENT" )
 784  : ( "̐"    U+0310 Mn 0 "COMBINING CANDRABINDU" )
 785  : ( "̑"    U+0311 Mn 0 "COMBINING INVERTED BREVE" )
 786  : ( "̒"    U+0312 Mn 0 "COMBINING TURNED COMMA ABOVE" )
 787  : ( "̓"    U+0313 Mn 0 "COMBINING COMMA ABOVE" )
 788  : ( "̔"    U+0314 Mn 0 "COMBINING REVERSED COMMA ABOVE" )
 789  : ( "̕"    U+0315 Mn 0 "COMBINING COMMA ABOVE RIGHT" )
 790  : ( "̖"    U+0316 Mn 0 "COMBINING GRAVE ACCENT BELOW" )
 791  : ( "̗"    U+0317 Mn 0 "COMBINING ACUTE ACCENT BELOW" )
 792  : ( "̘"    U+0318 Mn 0 "COMBINING LEFT TACK BELOW" )
 793  : ( "̙"    U+0319 Mn 0 "COMBINING RIGHT TACK BELOW" )
 794  : ( "̚"    U+031A Mn 0 "COMBINING LEFT ANGLE ABOVE" )
 795  : ( "̛"    U+031B Mn 0 "COMBINING HORN" )
 796  : ( "̜"    U+031C Mn 0 "COMBINING LEFT HALF RING BELOW" )
 797  : ( "̝"    U+031D Mn 0 "COMBINING UP TACK BELOW" )
 798  : ( "̞"    U+031E Mn 0 "COMBINING DOWN TACK BELOW" )
 799  : ( "̟"    U+031F Mn 0 "COMBINING PLUS SIGN BELOW" )
 800  : ( "̠"    U+0320 Mn 0 "COMBINING MINUS SIGN BELOW" )
 801  : ( "̡"    U+0321 Mn 0 "COMBINING PALATALIZED HOOK BELOW" )
 802  : ( "̢"    U+0322 Mn 0 "COMBINING RETROFLEX HOOK BELOW" )
 803  : ( "̣"    U+0323 Mn 0 "COMBINING DOT BELOW" )
 804  : ( "̤"    U+0324 Mn 0 "COMBINING DIAERESIS BELOW" )
 805  : ( "̥"    U+0325 Mn 0 "COMBINING RING BELOW" )
 806  : ( "̦"    U+0326 Mn 0 "COMBINING COMMA BELOW" )
 807  : ( "̧"    U+0327 Mn 0 "COMBINING CEDILLA" )
 808  : ( "̨"    U+0328 Mn 0 "COMBINING OGONEK" )
 809  : ( "̩"    U+0329 Mn 0 "COMBINING VERTICAL LINE BELOW" )
 810  : ( "̪"    U+032A Mn 0 "COMBINING BRIDGE BELOW" )
 811  : ( "̫"    U+032B Mn 0 "COMBINING INVERTED DOUBLE ARCH BELOW" )
 812  : ( "̬"    U+032C Mn 0 "COMBINING CARON BELOW" )
 813  : ( "̭"    U+032D Mn 0 "COMBINING CIRCUMFLEX ACCENT BELOW" )
 814  : ( "̮"    U+032E Mn 0 "COMBINING BREVE BELOW" )
 815  : ( "̯"    U+032F Mn 0 "COMBINING INVERTED BREVE BELOW" )
 816  : ( "̰"    U+0330 Mn 0 "COMBINING TILDE BELOW" )
 817  : ( "̱"    U+0331 Mn 0 "COMBINING MACRON BELOW" )
 818  : ( "̲"    U+0332 Mn 0 "COMBINING LOW LINE" )
 819  : ( "̳"    U+0333 Mn 0 "COMBINING DOUBLE LOW LINE" )
 820  : ( "̴"    U+0334 Mn 0 "COMBINING TILDE OVERLAY" )
 821  : ( "̵"    U+0335 Mn 0 "COMBINING SHORT STROKE OVERLAY" )
 822  : ( "̶"    U+0336 Mn 0 "COMBINING LONG STROKE OVERLAY" )
 823  : ( "̷"    U+0337 Mn 0 "COMBINING SHORT SOLIDUS OVERLAY" )
 824  : ( "̸"    U+0338 Mn 0 "COMBINING LONG SOLIDUS OVERLAY" )
 825  : ( "̹"    U+0339 Mn 0 "COMBINING RIGHT HALF RING BELOW" )
 826  : ( "̺"    U+033A Mn 0 "COMBINING INVERTED BRIDGE BELOW" )
 827  : ( "̻"    U+033B Mn 0 "COMBINING SQUARE BELOW" )
 828  : ( "̼"    U+033C Mn 0 "COMBINING SEAGULL BELOW" )
 829  : ( "̽"    U+033D Mn 0 "COMBINING X ABOVE" )
 830  : ( "̾"    U+033E Mn 0 "COMBINING VERTICAL TILDE" )
 831  : ( "̿"    U+033F Mn 0 "COMBINING DOUBLE OVERLINE" )
 832  : ( "̀"    U+0340 Mn 0 "COMBINING GRAVE TONE MARK" )
 833  : ( "́"    U+0341 Mn 0 "COMBINING ACUTE TONE MARK" )
 834  : ( "͂"    U+0342 Mn 0 "COMBINING GREEK PERISPOMENI" )
 835  : ( "̓"    U+0343 Mn 0 "COMBINING GREEK KORONIS" )
 836  : ( "̈́"    U+0344 Mn 0 "COMBINING GREEK DIALYTIKA TONOS" )
 837  : ( "ͅ"    U+0345 Mn 0 "COMBINING GREEK YPOGEGRAMMENI" )
 838  : ( "͆"    U+0346 Mn 0 "COMBINING BRIDGE ABOVE" )
 839  : ( "͇"    U+0347 Mn 0 "COMBINING EQUALS SIGN BELOW" )
 840  : ( "͈"    U+0348 Mn 0 "COMBINING DOUBLE VERTICAL LINE BELOW" )
 841  : ( "͉"    U+0349 Mn 0 "COMBINING LEFT ANGLE BELOW" )
 842  : ( "͊"    U+034A Mn 0 "COMBINING NOT TILDE ABOVE" )
 843  : ( "͋"    U+034B Mn 0 "COMBINING HOMOTHETIC ABOVE" )
 844  : ( "͌"    U+034C Mn 0 "COMBINING ALMOST EQUAL TO ABOVE" )
 845  : ( "͍"    U+034D Mn 0 "COMBINING LEFT RIGHT ARROW BELOW" )
 846  : ( "͎"    U+034E Mn 0 "COMBINING UPWARDS ARROW BELOW" )
 847  : ( "͏"    U+034F Mn 0 "COMBINING GRAPHEME JOINER", "CGJ" )
 848  : ( "͐"    U+0350 Mn 0 "COMBINING RIGHT ARROWHEAD ABOVE" )
 849  : ( "͑"    U+0351 Mn 0 "COMBINING LEFT HALF RING ABOVE" )
 850  : ( "͒"    U+0352 Mn 0 "COMBINING FERMATA" )
 851  : ( "͓"    U+0353 Mn 0 "COMBINING X BELOW" )
 852  : ( "͔"    U+0354 Mn 0 "COMBINING LEFT ARROWHEAD BELOW" )
 853  : ( "͕"    U+0355 Mn 0 "COMBINING RIGHT ARROWHEAD BELOW" )
 854  : ( "͖"    U+0356 Mn 0 "COMBINING RIGHT ARROWHEAD AND UP ARROWHEAD BELOW" )
 855  : ( "͗"    U+0357 Mn 0 "COMBINING RIGHT HALF RING ABOVE" )
 856  : ( "͘"    U+0358 Mn 0 "COMBINING DOT ABOVE RIGHT" )
 857  : ( "͙"    U+0359 Mn 0 "COMBINING ASTERISK BELOW" )
 858  : ( "͚"    U+035A Mn 0 "COMBINING DOUBLE RING BELOW" )
 859  : ( "͛"    U+035B Mn 0 "COMBINING ZIGZAG ABOVE" )
 860  : ( "͜"    U+035C Mn 0 "COMBINING DOUBLE BREVE BELOW" )
 861  : ( "͝"    U+035D Mn 0 "COMBINING DOUBLE BREVE" )
 862  : ( "͞"    U+035E Mn 0 "COMBINING DOUBLE MACRON" )
 863  : ( "͟"    U+035F Mn 0 "COMBINING DOUBLE MACRON BELOW" )
 864  : ( "͠"    U+0360 Mn 0 "COMBINING DOUBLE TILDE" )
 865  : ( "͡"    U+0361 Mn 0 "COMBINING DOUBLE INVERTED BREVE" )
 866  : ( "͢"    U+0362 Mn 0 "COMBINING DOUBLE RIGHTWARDS ARROW BELOW" )
 867  : ( "ͣ"    U+0363 Mn 0 "COMBINING LATIN SMALL LETTER A" )
 868  : ( "ͤ"    U+0364 Mn 0 "COMBINING LATIN SMALL LETTER E" )
 869  : ( "ͥ"    U+0365 Mn 0 "COMBINING LATIN SMALL LETTER I" )
 870  : ( "ͦ"    U+0366 Mn 0 "COMBINING LATIN SMALL LETTER O" )
 871  : ( "ͧ"    U+0367 Mn 0 "COMBINING LATIN SMALL LETTER U" )
 872  : ( "ͨ"    U+0368 Mn 0 "COMBINING LATIN SMALL LETTER C" )
 873  : ( "ͩ"    U+0369 Mn 0 "COMBINING LATIN SMALL LETTER D" )
 874  : ( "ͪ"    U+036A Mn 0 "COMBINING LATIN SMALL LETTER H" )
 875  : ( "ͫ"    U+036B Mn 0 "COMBINING LATIN SMALL LETTER M" )
 876  : ( "ͬ"    U+036C Mn 0 "COMBINING LATIN SMALL LETTER R" )
 877  : ( "ͭ"    U+036D Mn 0 "COMBINING LATIN SMALL LETTER T" )
 878  : ( "ͮ"    U+036E Mn 0 "COMBINING LATIN SMALL LETTER V" )
 879  : ( "ͯ"    U+036F Mn 0 "COMBINING LATIN SMALL LETTER X" )
 880  : ( "Ͱ"   U+0370 Lu 1 "GREEK CAPITAL LETTER HETA" )
 881  : ( "ͱ"   U+0371 Ll 1 "GREEK SMALL LETTER HETA" )
 882  : ( "Ͳ"   U+0372 Lu 1 "GREEK CAPITAL LETTER ARCHAIC SAMPI" )
 883  : ( "ͳ"   U+0373 Ll 1 "GREEK SMALL LETTER ARCHAIC SAMPI" )
 884  : ( "ʹ"   U+0374 Lm 1 "GREEK NUMERAL SIGN" )
 885  : ( "͵"   U+0375 Sk 1 "GREEK LOWER NUMERAL SIGN" )
 886  : ( "Ͷ"   U+0376 Lu 1 "GREEK CAPITAL LETTER PAMPHYLIAN DIGAMMA" )
 887  : ( "ͷ"   U+0377 Ll 1 "GREEK SMALL LETTER PAMPHYLIAN DIGAMMA" )
 890  : ( "ͺ"   U+037A Lm 1 "GREEK YPOGEGRAMMENI" )
 891  : ( "ͻ"   U+037B Ll 1 "GREEK SMALL REVERSED LUNATE SIGMA SYMBOL" )
 892  : ( "ͼ"   U+037C Ll 1 "GREEK SMALL DOTTED LUNATE SIGMA SYMBOL" )
 893  : ( "ͽ"   U+037D Ll 1 "GREEK SMALL REVERSED DOTTED LUNATE SIGMA SYMBOL" )
 894  : ( ";"   U+037E Po 1 "GREEK QUESTION MARK" )
 895  : ( "Ϳ"   U+037F Lu 1 "GREEK CAPITAL LETTER YOT" )
 900  : ( "΄"   U+0384 Sk 1 "GREEK TONOS" )
 901  : ( "΅"   U+0385 Sk 1 "GREEK DIALYTIKA TONOS" )
 902  : ( "Ά"   U+0386 Lu 1 "GREEK CAPITAL LETTER ALPHA WITH TONOS" )
 903  : ( "·"   U+0387 Po 1 "GREEK ANO TELEIA" )
 904  : ( "Έ"   U+0388 Lu 1 "GREEK CAPITAL LETTER EPSILON WITH TONOS" )
 905  : ( "Ή"   U+0389 Lu 1 "GREEK CAPITAL LETTER ETA WITH TONOS" )
 906  : ( "Ί"   U+038A Lu 1 "GREEK CAPITAL LETTER IOTA WITH TONOS" )
 908  : ( "Ό"   U+038C Lu 1 "GREEK CAPITAL LETTER OMICRON WITH TONOS" )
 910  : ( "Ύ"   U+038E Lu 1 "GREEK CAPITAL LETTER UPSILON WITH TONOS" )
 911  : ( "Ώ"   U+038F Lu 1 "GREEK CAPITAL LETTER OMEGA WITH TONOS" )
 912  : ( "ΐ"   U+0390 Ll 1 "GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS" )
 913  : ( "Α"   U+0391 Lu 1 "GREEK CAPITAL LETTER ALPHA" )
 914  : ( "Β"   U+0392 Lu 1 "GREEK CAPITAL LETTER BETA" )
 915  : ( "Γ"   U+0393 Lu 1 "GREEK CAPITAL LETTER GAMMA" )
 916  : ( "Δ"   U+0394 Lu 1 "GREEK CAPITAL LETTER DELTA" )
 917  : ( "Ε"   U+0395 Lu 1 "GREEK CAPITAL LETTER EPSILON" )
 918  : ( "Ζ"   U+0396 Lu 1 "GREEK CAPITAL LETTER ZETA" )
 919  : ( "Η"   U+0397 Lu 1 "GREEK CAPITAL LETTER ETA" )
 920  : ( "Θ"   U+0398 Lu 1 "GREEK CAPITAL LETTER THETA" )
 921  : ( "Ι"   U+0399 Lu 1 "GREEK CAPITAL LETTER IOTA" )
 922  : ( "Κ"   U+039A Lu 1 "GREEK CAPITAL LETTER KAPPA" )
 923  : ( "Λ"   U+039B Lu 1 "GREEK CAPITAL LETTER LAMDA" )
 924  : ( "Μ"   U+039C Lu 1 "GREEK CAPITAL LETTER MU" )
 925  : ( "Ν"   U+039D Lu 1 "GREEK CAPITAL LETTER NU" )
 926  : ( "Ξ"   U+039E Lu 1 "GREEK CAPITAL LETTER XI" )
 927  : ( "Ο"   U+039F Lu 1 "GREEK CAPITAL LETTER OMICRON" )
 928  : ( "Π"   U+03A0 Lu 1 "GREEK CAPITAL LETTER PI" )
 929  : ( "Ρ"   U+03A1 Lu 1 "GREEK CAPITAL LETTER RHO" )
 931  : ( "Σ"   U+03A3 Lu 1 "GREEK CAPITAL LETTER SIGMA" )
 932  : ( "Τ"   U+03A4 Lu 1 "GREEK CAPITAL LETTER TAU" )
 933  : ( "Υ"   U+03A5 Lu 1 "GREEK CAPITAL LETTER UPSILON" )
 934  : ( "Φ"   U+03A6 Lu 1 "GREEK CAPITAL LETTER PHI" )
 935  : ( "Χ"   U+03A7 Lu 1 "GREEK CAPITAL LETTER CHI" )
 936  : ( "Ψ"   U+03A8 Lu 1 "GREEK CAPITAL LETTER PSI" )
 937  : ( "Ω"   U+03A9 Lu 1 "GREEK CAPITAL LETTER OMEGA" )
 938  : ( "Ϊ"   U+03AA Lu 1 "GREEK CAPITAL LETTER IOTA WITH DIALYTIKA" )
 939  : ( "Ϋ"   U+03AB Lu 1 "GREEK CAPITAL LETTER UPSILON WITH DIALYTIKA" )
 940  : ( "ά"   U+03AC Ll 1 "GREEK SMALL LETTER ALPHA WITH TONOS" )
 941  : ( "έ"   U+03AD Ll 1 "GREEK SMALL LETTER EPSILON WITH TONOS" )
 942  : ( "ή"   U+03AE Ll 1 "GREEK SMALL LETTER ETA WITH TONOS" )
 943  : ( "ί"   U+03AF Ll 1 "GREEK SMALL LETTER IOTA WITH TONOS" )
 944  : ( "ΰ"   U+03B0 Ll 1 "GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS" )
 945  : ( "α"   U+03B1 Ll 1 "GREEK SMALL LETTER ALPHA" )
 946  : ( "β"   U+03B2 Ll 1 "GREEK SMALL LETTER BETA" )
 947  : ( "γ"   U+03B3 Ll 1 "GREEK SMALL LETTER GAMMA" )
 948  : ( "δ"   U+03B4 Ll 1 "GREEK SMALL LETTER DELTA" )
 949  : ( "ε"   U+03B5 Ll 1 "GREEK SMALL LETTER EPSILON" )
 950  : ( "ζ"   U+03B6 Ll 1 "GREEK SMALL LETTER ZETA" )
 951  : ( "η"   U+03B7 Ll 1 "GREEK SMALL LETTER ETA" )
 952  : ( "θ"   U+03B8 Ll 1 "GREEK SMALL LETTER THETA" )
 953  : ( "ι"   U+03B9 Ll 1 "GREEK SMALL LETTER IOTA" )
 954  : ( "κ"   U+03BA Ll 1 "GREEK SMALL LETTER KAPPA" )
 955  : ( "λ"   U+03BB Ll 1 "GREEK SMALL LETTER LAMDA" )
 956  : ( "μ"   U+03BC Ll 1 "GREEK SMALL LETTER MU" )
 957  : ( "ν"   U+03BD Ll 1 "GREEK SMALL LETTER NU" )
 958  : ( "ξ"   U+03BE Ll 1 "GREEK SMALL LETTER XI" )
 959  : ( "ο"   U+03BF Ll 1 "GREEK SMALL LETTER OMICRON" )
 960  : ( "π"   U+03C0 Ll 1 "GREEK SMALL LETTER PI" )
 961  : ( "ρ"   U+03C1 Ll 1 "GREEK SMALL LETTER RHO" )
 962  : ( "ς"   U+03C2 Ll 1 "GREEK SMALL LETTER FINAL SIGMA" )
 963  : ( "σ"   U+03C3 Ll 1 "GREEK SMALL LETTER SIGMA" )
 964  : ( "τ"   U+03C4 Ll 1 "GREEK SMALL LETTER TAU" )
 965  : ( "υ"   U+03C5 Ll 1 "GREEK SMALL LETTER UPSILON" )
 966  : ( "φ"   U+03C6 Ll 1 "GREEK SMALL LETTER PHI" )
 967  : ( "χ"   U+03C7 Ll 1 "GREEK SMALL LETTER CHI" )
 968  : ( "ψ"   U+03C8 Ll 1 "GREEK SMALL LETTER PSI" )
 969  : ( "ω"   U+03C9 Ll 1 "GREEK SMALL LETTER OMEGA" )
 970  : ( "ϊ"   U+03CA Ll 1 "GREEK SMALL LETTER IOTA WITH DIALYTIKA" )
 971  : ( "ϋ"   U+03CB Ll 1 "GREEK SMALL LETTER UPSILON WITH DIALYTIKA" )
 972  : ( "ό"   U+03CC Ll 1 "GREEK SMALL LETTER OMICRON WITH TONOS" )
 973  : ( "ύ"   U+03CD Ll 1 "GREEK SMALL LETTER UPSILON WITH TONOS" )
 974  : ( "ώ"   U+03CE Ll 1 "GREEK SMALL LETTER OMEGA WITH TONOS" )
 975  : ( "Ϗ"   U+03CF Lu 1 "GREEK CAPITAL KAI SYMBOL" )
 976  : ( "ϐ"   U+03D0 Ll 1 "GREEK BETA SYMBOL" )
 977  : ( "ϑ"   U+03D1 Ll 1 "GREEK THETA SYMBOL" )
 978  : ( "ϒ"   U+03D2 Lu 1 "GREEK UPSILON WITH HOOK SYMBOL" )
 979  : ( "ϓ"   U+03D3 Lu 1 "GREEK UPSILON WITH ACUTE AND HOOK SYMBOL" )
 980  : ( "ϔ"   U+03D4 Lu 1 "GREEK UPSILON WITH DIAERESIS AND HOOK SYMBOL" )
 981  : ( "ϕ"   U+03D5 Ll 1 "GREEK PHI SYMBOL" )
 982  : ( "ϖ"   U+03D6 Ll 1 "GREEK PI SYMBOL" )
 983  : ( "ϗ"   U+03D7 Ll 1 "GREEK KAI SYMBOL" )
 984  : ( "Ϙ"   U+03D8 Lu 1 "GREEK LETTER ARCHAIC KOPPA" )
 985  : ( "ϙ"   U+03D9 Ll 1 "GREEK SMALL LETTER ARCHAIC KOPPA" )
 986  : ( "Ϛ"   U+03DA Lu 1 "GREEK LETTER STIGMA" )
 987  : ( "ϛ"   U+03DB Ll 1 "GREEK SMALL LETTER STIGMA" )
 988  : ( "Ϝ"   U+03DC Lu 1 "GREEK LETTER DIGAMMA" )
 989  : ( "ϝ"   U+03DD Ll 1 "GREEK SMALL LETTER DIGAMMA" )
 990  : ( "Ϟ"   U+03DE Lu 1 "GREEK LETTER KOPPA" )
 991  : ( "ϟ"   U+03DF Ll 1 "GREEK SMALL LETTER KOPPA" )
 992  : ( "Ϡ"   U+03E0 Lu 1 "GREEK LETTER SAMPI" )
 993  : ( "ϡ"   U+03E1 Ll 1 "GREEK SMALL LETTER SAMPI" )
 994  : ( "Ϣ"   U+03E2 Lu 1 "COPTIC CAPITAL LETTER SHEI" )
 995  : ( "ϣ"   U+03E3 Ll 1 "COPTIC SMALL LETTER SHEI" )
 996  : ( "Ϥ"   U+03E4 Lu 1 "COPTIC CAPITAL LETTER FEI" )
 997  : ( "ϥ"   U+03E5 Ll 1 "COPTIC SMALL LETTER FEI" )
 998  : ( "Ϧ"   U+03E6 Lu 1 "COPTIC CAPITAL LETTER KHEI" )
 999  : ( "ϧ"   U+03E7 Ll 1 "COPTIC SMALL LETTER KHEI" )
 1000 : ( "Ϩ"   U+03E8 Lu 1 "COPTIC CAPITAL LETTER HORI" )
 1001 : ( "ϩ"   U+03E9 Ll 1 "COPTIC SMALL LETTER HORI" )
 1002 : ( "Ϫ"   U+03EA Lu 1 "COPTIC CAPITAL LETTER GANGIA" )
 1003 : ( "ϫ"   U+03EB Ll 1 "COPTIC SMALL LETTER GANGIA" )
 1004 : ( "Ϭ"   U+03EC Lu 1 "COPTIC CAPITAL LETTER SHIMA" )
 1005 : ( "ϭ"   U+03ED Ll 1 "COPTIC SMALL LETTER SHIMA" )
 1006 : ( "Ϯ"   U+03EE Lu 1 "COPTIC CAPITAL LETTER DEI" )
 1007 : ( "ϯ"   U+03EF Ll 1 "COPTIC SMALL LETTER DEI" )
 1008 : ( "ϰ"   U+03F0 Ll 1 "GREEK KAPPA SYMBOL" )
...

/*
Add character intervals.
.UnicodeCharacterInterval
    codepointFrom
    codepointTo
    name
    isExpanded
*/
ooRexx> .unicode~characterIntervals==
an UnicodeCharacterIntervalSupplier 
 1  : (U+3400..U+4DBF "CJK UNIFIED IDEOGRAPH-*" 6591 characters)
 2  : (U+4E00..U+9FFF "CJK UNIFIED IDEOGRAPH-*" 20991 characters)
 3  : (U+F900..U+FA6D "CJK COMPATIBILITY IDEOGRAPH-*" 365 characters)
 4  : (U+FA70..U+FAD9 "CJK COMPATIBILITY IDEOGRAPH-*" 105 characters)
 5  : (U+17000..U+187F7 "TANGUT IDEOGRAPH-*" 6135 characters)
 6  : (U+18B00..U+18CD5 "KHITAN SMALL SCRIPT CHARACTER-*" 469 characters)
 7  : (U+18D00..U+18D08 "TANGUT IDEOGRAPH-*" 8 characters)
 8  : (U+1B170..U+1B2FB "NUSHU CHARACTER-*" 395 characters)
 9  : (U+20000..U+2A6DF "CJK UNIFIED IDEOGRAPH-*" 42719 characters)
 10 : (U+2A700..U+2B739 "CJK UNIFIED IDEOGRAPH-*" 4153 characters)
 11 : (U+2B740..U+2B81D "CJK UNIFIED IDEOGRAPH-*" 221 characters)
 12 : (U+2B820..U+2CEA1 "CJK UNIFIED IDEOGRAPH-*" 5761 characters)
 13 : (U+2CEB0..U+2EBE0 "CJK UNIFIED IDEOGRAPH-*" 7472 characters)
 14 : (U+2EBF0..U+2EE5D "CJK UNIFIED IDEOGRAPH-*" 621 characters)
 15 : (U+2F800..U+2FA1D "CJK COMPATIBILITY IDEOGRAPH-*" 541 characters)
 16 : (U+30000..U+3134A "CJK UNIFIED IDEOGRAPH-*" 4938 characters)
 17 : (U+31350..U+323AF "CJK UNIFIED IDEOGRAPH-*" 4191 characters)


-- Information about Unicode:
-- Remove dataDirectory because its value differs between Windows and macOS/Linux
ooRexx> .Unicode~informations~~remove("dataDirectory")=
a Directory (12 items)
'characterIntervals'                 : (an UnicodeCharacterIntervalSupplier count=17 notExpanded:17 intervals, 105693 characters)
'characters'                         : (an UnicodeCharacterSupplier count=44189 size=918000)
'maxCodepoint'                       :  1114111
'memorizeTranscodings'               :  0
'memorizeTransformations'            :  0
'systemIsLittleEndian'               :  1
'totalCharacterNameAliases'          :  473
'totalCharactersLoaded'              :  149813
'totalIntervalCharacters'            :  105693
'totalIntervalCharactersNotExpanded' :  105693
'unckeckedConversionToString'        :  1
'version'                            : '15.1.0'


-- ===============================================================================
-- 2021 September 13, updated September 22

/*
Add character information.

The loading of the character names is optional.
By default, they are not loaded.
From ooRexxShell, execute: call loadUnicodeCharacterNames
By default, the character intervals are not expanded.
From ooRexxShell, execute: call expandUnicodeCharacterIntervals

The other character properties are always loaded (provided by utf8proc)

.Unicode
    characters          --> supplier of UnicodeCharacter
    character(index)    --> UnicodeCharacter (index can be a loose matching name (UAX44-LM2) or a codepoint)
    characterIntervals  --> supplier of UnicodeCharacterInterval

.UnicodeCharacter
    codepoint       --> integer -1..1114111
    name            --> string
    aliases         --> array of .UnicodeAlias

    bidiClass       --> enum 1, 2, 3, ...
    bidiClassName   --> enum 'L', 'LRE', 'LRO', ...
    boundClass      --> enum 0, 1, 2, ...
    boundClassName  --> enum 'START', 'OTHER', 'CR', ...
    category        --> enum 0, 1, 2, ...
    categoryName    --> enum 'Cn', 'Lu', 'Ll', ...
    charWidth       --> integer
    combiningClass  --> integer 0..254
    controlBoundary --> boolean
    decompType      --> enum 0, 1, 2, ...
    decompTypeName  --> enum '<none>', '<font>', '<nobreak>', ...
    ignorable       --> boolean

Examples:
*/
-- All the Unicode characters (sparse array).
ooRexx> .unicode~characters==
an UnicodeCharacterSupplier 
 0    : ( ""    U+0000 Cc 0 "", "NULL", "NUL" )
 1    : ( ""    U+0001 Cc 0 "", "START OF HEADING", "SOH" )
 2    : ( ""    U+0002 Cc 0 "", "START OF TEXT", "STX" )
 3    : ( ""    U+0003 Cc 0 "", "END OF TEXT", "ETX" )
 4    : ( ""    U+0004 Cc 0 "", "END OF TRANSMISSION", "EOT" )
 5    : ( ""    U+0005 Cc 0 "", "ENQUIRY", "ENQ" )
 6    : ( ""    U+0006 Cc 0 "", "ACKNOWLEDGE", "ACK" )
 7    : ( ""    U+0007 Cc 0 "", "ALERT", "BEL" )
 8    : ( ""    U+0008 Cc 0 "", "BACKSPACE", "BS" )
 9    : ( ""    U+0009 Cc 0 "", "CHARACTER TABULATION", "HORIZONTAL TABULATION", "HT", "TAB" )
 10   : ( ""    U+000A Cc 0 "", "LINE FEED", "NEW LINE", "END OF LINE", "LF", "NL", "EOL" )
 11   : ( ""    U+000B Cc 0 "", "LINE TABULATION", "VERTICAL TABULATION", "VT" )
 12   : ( ""    U+000C Cc 0 "", "FORM FEED", "FF" )
 13   : ( ""    U+000D Cc 0 "", "CARRIAGE RETURN", "CR" )
 14   : ( ""    U+000E Cc 0 "", "SHIFT OUT", "LOCKING-SHIFT ONE", "SO" )
 15   : ( ""    U+000F Cc 0 "", "SHIFT IN", "LOCKING-SHIFT ZERO", "SI" )
 16   : ( ""    U+0010 Cc 0 "", "DATA LINK ESCAPE", "DLE" )
 17   : ( ""    U+0011 Cc 0 "", "DEVICE CONTROL ONE", "DC1" )
 18   : ( ""    U+0012 Cc 0 "", "DEVICE CONTROL TWO", "DC2" )
 19   : ( ""    U+0013 Cc 0 "", "DEVICE CONTROL THREE", "DC3" )
 20   : ( ""    U+0014 Cc 0 "", "DEVICE CONTROL FOUR", "DC4" )
 21   : ( ""    U+0015 Cc 0 "", "NEGATIVE ACKNOWLEDGE", "NAK" )
 22   : ( ""    U+0016 Cc 0 "", "SYNCHRONOUS IDLE", "SYN" )
 23   : ( ""    U+0017 Cc 0 "", "END OF TRANSMISSION BLOCK", "ETB" )
 24   : ( ""    U+0018 Cc 0 "", "CANCEL", "CAN" )
 25   : ( ""    U+0019 Cc 0 "", "END OF MEDIUM", "EOM", "EM" )
 26   : ( ""    U+001A Cc 0 "", "SUBSTITUTE", "SUB" )
 27   : ( ""    U+001B Cc 0 "", "ESCAPE", "ESC" )
 28   : ( ""    U+001C Cc 0 "", "INFORMATION SEPARATOR FOUR", "FILE SEPARATOR", "FS" )
 29   : ( ""    U+001D Cc 0 "", "INFORMATION SEPARATOR THREE", "GROUP SEPARATOR", "GS" )
 30   : ( ""    U+001E Cc 0 "", "INFORMATION SEPARATOR TWO", "RECORD SEPARATOR", "RS" )
 31   : ( ""    U+001F Cc 0 "", "INFORMATION SEPARATOR ONE", "UNIT SEPARATOR", "US" )
 32   : ( " "   U+0020 Zs 1 "SPACE", "SP" )
 33   : ( "!"   U+0021 Po 1 "EXCLAMATION MARK" )
 34   : ( """   U+0022 Po 1 "QUOTATION MARK" )
 35   : ( "#"   U+0023 Po 1 "NUMBER SIGN" )
 36   : ( "$"   U+0024 Sc 1 "DOLLAR SIGN" )
 37   : ( "%"   U+0025 Po 1 "PERCENT SIGN" )
 38   : ( "&"   U+0026 Po 1 "AMPERSAND" )
 39   : ( "'"   U+0027 Po 1 "APOSTROPHE" )
 40   : ( "("   U+0028 Ps 1 "LEFT PARENTHESIS" )
 41   : ( ")"   U+0029 Pe 1 "RIGHT PARENTHESIS" )
 42   : ( "*"   U+002A Po 1 "ASTERISK" )
 43   : ( "+"   U+002B Sm 1 "PLUS SIGN" )
 44   : ( ","   U+002C Po 1 "COMMA" )
 45   : ( "-"   U+002D Pd 1 "HYPHEN-MINUS" )
 46   : ( "."   U+002E Po 1 "FULL STOP" )
 47   : ( "/"   U+002F Po 1 "SOLIDUS" )
 48   : ( "0"   U+0030 Nd 1 "DIGIT ZERO" )
 49   : ( "1"   U+0031 Nd 1 "DIGIT ONE" )
 50   : ( "2"   U+0032 Nd 1 "DIGIT TWO" )
 51   : ( "3"   U+0033 Nd 1 "DIGIT THREE" )
 52   : ( "4"   U+0034 Nd 1 "DIGIT FOUR" )
 53   : ( "5"   U+0035 Nd 1 "DIGIT FIVE" )
 54   : ( "6"   U+0036 Nd 1 "DIGIT SIX" )
 55   : ( "7"   U+0037 Nd 1 "DIGIT SEVEN" )
 56   : ( "8"   U+0038 Nd 1 "DIGIT EIGHT" )
 57   : ( "9"   U+0039 Nd 1 "DIGIT NINE" )
 58   : ( ":"   U+003A Po 1 "COLON" )
 59   : ( ";"   U+003B Po 1 "SEMICOLON" )
 60   : ( "<"   U+003C Sm 1 "LESS-THAN SIGN" )
 61   : ( "="   U+003D Sm 1 "EQUALS SIGN" )
 62   : ( ">"   U+003E Sm 1 "GREATER-THAN SIGN" )
 63   : ( "?"   U+003F Po 1 "QUESTION MARK" )
 64   : ( "@"   U+0040 Po 1 "COMMERCIAL AT" )
 65   : ( "A"   U+0041 Lu 1 "LATIN CAPITAL LETTER A" )
 66   : ( "B"   U+0042 Lu 1 "LATIN CAPITAL LETTER B" )
 67   : ( "C"   U+0043 Lu 1 "LATIN CAPITAL LETTER C" )
 68   : ( "D"   U+0044 Lu 1 "LATIN CAPITAL LETTER D" )
 69   : ( "E"   U+0045 Lu 1 "LATIN CAPITAL LETTER E" )
 70   : ( "F"   U+0046 Lu 1 "LATIN CAPITAL LETTER F" )
 71   : ( "G"   U+0047 Lu 1 "LATIN CAPITAL LETTER G" )
 72   : ( "H"   U+0048 Lu 1 "LATIN CAPITAL LETTER H" )
 73   : ( "I"   U+0049 Lu 1 "LATIN CAPITAL LETTER I" )
 74   : ( "J"   U+004A Lu 1 "LATIN CAPITAL LETTER J" )
 75   : ( "K"   U+004B Lu 1 "LATIN CAPITAL LETTER K" )
 76   : ( "L"   U+004C Lu 1 "LATIN CAPITAL LETTER L" )
 77   : ( "M"   U+004D Lu 1 "LATIN CAPITAL LETTER M" )
 78   : ( "N"   U+004E Lu 1 "LATIN CAPITAL LETTER N" )
 79   : ( "O"   U+004F Lu 1 "LATIN CAPITAL LETTER O" )
 80   : ( "P"   U+0050 Lu 1 "LATIN CAPITAL LETTER P" )
 81   : ( "Q"   U+0051 Lu 1 "LATIN CAPITAL LETTER Q" )
 82   : ( "R"   U+0052 Lu 1 "LATIN CAPITAL LETTER R" )
 83   : ( "S"   U+0053 Lu 1 "LATIN CAPITAL LETTER S" )
 84   : ( "T"   U+0054 Lu 1 "LATIN CAPITAL LETTER T" )
 85   : ( "U"   U+0055 Lu 1 "LATIN CAPITAL LETTER U" )
 86   : ( "V"   U+0056 Lu 1 "LATIN CAPITAL LETTER V" )
 87   : ( "W"   U+0057 Lu 1 "LATIN CAPITAL LETTER W" )
 88   : ( "X"   U+0058 Lu 1 "LATIN CAPITAL LETTER X" )
 89   : ( "Y"   U+0059 Lu 1 "LATIN CAPITAL LETTER Y" )
 90   : ( "Z"   U+005A Lu 1 "LATIN CAPITAL LETTER Z" )
 91   : ( "["   U+005B Ps 1 "LEFT SQUARE BRACKET" )
 92   : ( "\"   U+005C Po 1 "REVERSE SOLIDUS" )
 93   : ( "]"   U+005D Pe 1 "RIGHT SQUARE BRACKET" )
 94   : ( "^"   U+005E Sk 1 "CIRCUMFLEX ACCENT" )
 95   : ( "_"   U+005F Pc 1 "LOW LINE" )
 96   : ( "`"   U+0060 Sk 1 "GRAVE ACCENT" )
 97   : ( "a"   U+0061 Ll 1 "LATIN SMALL LETTER A" )
 98   : ( "b"   U+0062 Ll 1 "LATIN SMALL LETTER B" )
 99   : ( "c"   U+0063 Ll 1 "LATIN SMALL LETTER C" )
 100  : ( "d"   U+0064 Ll 1 "LATIN SMALL LETTER D" )
 101  : ( "e"   U+0065 Ll 1 "LATIN SMALL LETTER E" )
 102  : ( "f"   U+0066 Ll 1 "LATIN SMALL LETTER F" )
 103  : ( "g"   U+0067 Ll 1 "LATIN SMALL LETTER G" )
 104  : ( "h"   U+0068 Ll 1 "LATIN SMALL LETTER H" )
 105  : ( "i"   U+0069 Ll 1 "LATIN SMALL LETTER I" )
 106  : ( "j"   U+006A Ll 1 "LATIN SMALL LETTER J" )
 107  : ( "k"   U+006B Ll 1 "LATIN SMALL LETTER K" )
 108  : ( "l"   U+006C Ll 1 "LATIN SMALL LETTER L" )
 109  : ( "m"   U+006D Ll 1 "LATIN SMALL LETTER M" )
 110  : ( "n"   U+006E Ll 1 "LATIN SMALL LETTER N" )
 111  : ( "o"   U+006F Ll 1 "LATIN SMALL LETTER O" )
 112  : ( "p"   U+0070 Ll 1 "LATIN SMALL LETTER P" )
 113  : ( "q"   U+0071 Ll 1 "LATIN SMALL LETTER Q" )
 114  : ( "r"   U+0072 Ll 1 "LATIN SMALL LETTER R" )
 115  : ( "s"   U+0073 Ll 1 "LATIN SMALL LETTER S" )
 116  : ( "t"   U+0074 Ll 1 "LATIN SMALL LETTER T" )
 117  : ( "u"   U+0075 Ll 1 "LATIN SMALL LETTER U" )
 118  : ( "v"   U+0076 Ll 1 "LATIN SMALL LETTER V" )
 119  : ( "w"   U+0077 Ll 1 "LATIN SMALL LETTER W" )
 120  : ( "x"   U+0078 Ll 1 "LATIN SMALL LETTER X" )
 121  : ( "y"   U+0079 Ll 1 "LATIN SMALL LETTER Y" )
 122  : ( "z"   U+007A Ll 1 "LATIN SMALL LETTER Z" )
 123  : ( "{"   U+007B Ps 1 "LEFT CURLY BRACKET" )
 124  : ( "|"   U+007C Sm 1 "VERTICAL LINE" )
 125  : ( "}"   U+007D Pe 1 "RIGHT CURLY BRACKET" )
 126  : ( "~"   U+007E Sm 1 "TILDE" )
 127  : ( ""    U+007F Cc 0 "", "DELETE", "DEL" )
 128  : ( "€"    U+0080 Cc 0 "", "PADDING CHARACTER", "PAD" )
 129  : ( ""    U+0081 Cc 0 "", "HIGH OCTET PRESET", "HOP" )
 130  : ( "‚"    U+0082 Cc 0 "", "BREAK PERMITTED HERE", "BPH" )
 131  : ( "ƒ"    U+0083 Cc 0 "", "NO BREAK HERE", "NBH" )
 132  : ( "„"    U+0084 Cc 0 "", "INDEX", "IND" )
 133  : ( "…"    U+0085 Cc 0 "", "NEXT LINE", "NEL" )
 134  : ( "†"    U+0086 Cc 0 "", "START OF SELECTED AREA", "SSA" )
 135  : ( "‡"    U+0087 Cc 0 "", "END OF SELECTED AREA", "ESA" )
 136  : ( "ˆ"    U+0088 Cc 0 "", "CHARACTER TABULATION SET", "HORIZONTAL TABULATION SET", "HTS" )
 137  : ( "‰"    U+0089 Cc 0 "", "CHARACTER TABULATION WITH JUSTIFICATION", "HORIZONTAL TABULATION WITH JUSTIFICATION", "HTJ" )
 138  : ( "Š"    U+008A Cc 0 "", "LINE TABULATION SET", "VERTICAL TABULATION SET", "VTS" )
 139  : ( "‹"    U+008B Cc 0 "", "PARTIAL LINE FORWARD", "PARTIAL LINE DOWN", "PLD" )
 140  : ( "Œ"    U+008C Cc 0 "", "PARTIAL LINE BACKWARD", "PARTIAL LINE UP", "PLU" )
 141  : ( ""    U+008D Cc 0 "", "REVERSE LINE FEED", "REVERSE INDEX", "RI" )
 142  : ( "Ž"    U+008E Cc 0 "", "SINGLE SHIFT TWO", "SINGLE-SHIFT-2", "SS2" )
 143  : ( ""    U+008F Cc 0 "", "SINGLE SHIFT THREE", "SINGLE-SHIFT-3", "SS3" )
 144  : ( ""    U+0090 Cc 0 "", "DEVICE CONTROL STRING", "DCS" )
 145  : ( "‘"    U+0091 Cc 0 "", "PRIVATE USE ONE", "PRIVATE USE-1", "PU1" )
 146  : ( "’"    U+0092 Cc 0 "", "PRIVATE USE TWO", "PRIVATE USE-2", "PU2" )
 147  : ( "“"    U+0093 Cc 0 "", "SET TRANSMIT STATE", "STS" )
 148  : ( "”"    U+0094 Cc 0 "", "CANCEL CHARACTER", "CCH" )
 149  : ( "•"    U+0095 Cc 0 "", "MESSAGE WAITING", "MW" )
 150  : ( "–"    U+0096 Cc 0 "", "START OF GUARDED AREA", "START OF PROTECTED AREA", "SPA" )
 151  : ( "—"    U+0097 Cc 0 "", "END OF GUARDED AREA", "END OF PROTECTED AREA", "EPA" )
 152  : ( "˜"    U+0098 Cc 0 "", "START OF STRING", "SOS" )
 153  : ( "™"    U+0099 Cc 0 "", "SINGLE GRAPHIC CHARACTER INTRODUCER", "SGC" )
 154  : ( "š"    U+009A Cc 0 "", "SINGLE CHARACTER INTRODUCER", "SCI" )
 155  : ( "›"    U+009B Cc 0 "", "CONTROL SEQUENCE INTRODUCER", "CSI" )
 156  : ( "œ"    U+009C Cc 0 "", "STRING TERMINATOR", "ST" )
 157  : ( ""    U+009D Cc 0 "", "OPERATING SYSTEM COMMAND", "OSC" )
 158  : ( "ž"    U+009E Cc 0 "", "PRIVACY MESSAGE", "PM" )
 159  : ( "Ÿ"    U+009F Cc 0 "", "APPLICATION PROGRAM COMMAND", "APC" )
 160  : ( " "   U+00A0 Zs 1 "NO-BREAK SPACE", "NBSP" )
 161  : ( "¡"   U+00A1 Po 1 "INVERTED EXCLAMATION MARK" )
 162  : ( "¢"   U+00A2 Sc 1 "CENT SIGN" )
 163  : ( "£"   U+00A3 Sc 1 "POUND SIGN" )
 164  : ( "¤"   U+00A4 Sc 1 "CURRENCY SIGN" )
 165  : ( "¥"   U+00A5 Sc 1 "YEN SIGN" )
 166  : ( "¦"   U+00A6 So 1 "BROKEN BAR" )
 167  : ( "§"   U+00A7 Po 1 "SECTION SIGN" )
 168  : ( "¨"   U+00A8 Sk 1 "DIAERESIS" )
 169  : ( "©"   U+00A9 So 1 "COPYRIGHT SIGN" )
 170  : ( "ª"   U+00AA Lo 1 "FEMININE ORDINAL INDICATOR" )
 171  : ( "«"   U+00AB Pi 1 "LEFT-POINTING DOUBLE ANGLE QUOTATION MARK" )
 172  : ( "¬"   U+00AC Sm 1 "NOT SIGN" )
 173  : ( "­"   U+00AD Cf 1 "SOFT HYPHEN", "SHY" )
 174  : ( "®"   U+00AE So 1 "REGISTERED SIGN" )
 175  : ( "¯"   U+00AF Sk 1 "MACRON" )
 176  : ( "°"   U+00B0 So 1 "DEGREE SIGN" )
 177  : ( "±"   U+00B1 Sm 1 "PLUS-MINUS SIGN" )
 178  : ( "²"   U+00B2 No 1 "SUPERSCRIPT TWO" )
 179  : ( "³"   U+00B3 No 1 "SUPERSCRIPT THREE" )
 180  : ( "´"   U+00B4 Sk 1 "ACUTE ACCENT" )
 181  : ( "µ"   U+00B5 Ll 1 "MICRO SIGN" )
 182  : ( "¶"   U+00B6 Po 1 "PILCROW SIGN" )
 183  : ( "·"   U+00B7 Po 1 "MIDDLE DOT" )
 184  : ( "¸"   U+00B8 Sk 1 "CEDILLA" )
 185  : ( "¹"   U+00B9 No 1 "SUPERSCRIPT ONE" )
 186  : ( "º"   U+00BA Lo 1 "MASCULINE ORDINAL INDICATOR" )
 187  : ( "»"   U+00BB Pf 1 "RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK" )
 188  : ( "¼"   U+00BC No 1 "VULGAR FRACTION ONE QUARTER" )
 189  : ( "½"   U+00BD No 1 "VULGAR FRACTION ONE HALF" )
 190  : ( "¾"   U+00BE No 1 "VULGAR FRACTION THREE QUARTERS" )
 191  : ( "¿"   U+00BF Po 1 "INVERTED QUESTION MARK" )
 192  : ( "À"   U+00C0 Lu 1 "LATIN CAPITAL LETTER A WITH GRAVE" )
 193  : ( "Á"   U+00C1 Lu 1 "LATIN CAPITAL LETTER A WITH ACUTE" )
 194  : ( "Â"   U+00C2 Lu 1 "LATIN CAPITAL LETTER A WITH CIRCUMFLEX" )
 195  : ( "Ã"   U+00C3 Lu 1 "LATIN CAPITAL LETTER A WITH TILDE" )
 196  : ( "Ä"   U+00C4 Lu 1 "LATIN CAPITAL LETTER A WITH DIAERESIS" )
 197  : ( "Å"   U+00C5 Lu 1 "LATIN CAPITAL LETTER A WITH RING ABOVE" )
 198  : ( "Æ"   U+00C6 Lu 1 "LATIN CAPITAL LETTER AE" )
 199  : ( "Ç"   U+00C7 Lu 1 "LATIN CAPITAL LETTER C WITH CEDILLA" )
 200  : ( "È"   U+00C8 Lu 1 "LATIN CAPITAL LETTER E WITH GRAVE" )
 201  : ( "É"   U+00C9 Lu 1 "LATIN CAPITAL LETTER E WITH ACUTE" )
 202  : ( "Ê"   U+00CA Lu 1 "LATIN CAPITAL LETTER E WITH CIRCUMFLEX" )
 203  : ( "Ë"   U+00CB Lu 1 "LATIN CAPITAL LETTER E WITH DIAERESIS" )
 204  : ( "Ì"   U+00CC Lu 1 "LATIN CAPITAL LETTER I WITH GRAVE" )
 205  : ( "Í"   U+00CD Lu 1 "LATIN CAPITAL LETTER I WITH ACUTE" )
 206  : ( "Î"   U+00CE Lu 1 "LATIN CAPITAL LETTER I WITH CIRCUMFLEX" )
 207  : ( "Ï"   U+00CF Lu 1 "LATIN CAPITAL LETTER I WITH DIAERESIS" )
 208  : ( "Ð"   U+00D0 Lu 1 "LATIN CAPITAL LETTER ETH" )
 209  : ( "Ñ"   U+00D1 Lu 1 "LATIN CAPITAL LETTER N WITH TILDE" )
 210  : ( "Ò"   U+00D2 Lu 1 "LATIN CAPITAL LETTER O WITH GRAVE" )
 211  : ( "Ó"   U+00D3 Lu 1 "LATIN CAPITAL LETTER O WITH ACUTE" )
 212  : ( "Ô"   U+00D4 Lu 1 "LATIN CAPITAL LETTER O WITH CIRCUMFLEX" )
 213  : ( "Õ"   U+00D5 Lu 1 "LATIN CAPITAL LETTER O WITH TILDE" )
 214  : ( "Ö"   U+00D6 Lu 1 "LATIN CAPITAL LETTER O WITH DIAERESIS" )
 215  : ( "×"   U+00D7 Sm 1 "MULTIPLICATION SIGN" )
 216  : ( "Ø"   U+00D8 Lu 1 "LATIN CAPITAL LETTER O WITH STROKE" )
 217  : ( "Ù"   U+00D9 Lu 1 "LATIN CAPITAL LETTER U WITH GRAVE" )
 218  : ( "Ú"   U+00DA Lu 1 "LATIN CAPITAL LETTER U WITH ACUTE" )
 219  : ( "Û"   U+00DB Lu 1 "LATIN CAPITAL LETTER U WITH CIRCUMFLEX" )
 220  : ( "Ü"   U+00DC Lu 1 "LATIN CAPITAL LETTER U WITH DIAERESIS" )
 221  : ( "Ý"   U+00DD Lu 1 "LATIN CAPITAL LETTER Y WITH ACUTE" )
 222  : ( "Þ"   U+00DE Lu 1 "LATIN CAPITAL LETTER THORN" )
 223  : ( "ß"   U+00DF Ll 1 "LATIN SMALL LETTER SHARP S" )
 224  : ( "à"   U+00E0 Ll 1 "LATIN SMALL LETTER A WITH GRAVE" )
 225  : ( "á"   U+00E1 Ll 1 "LATIN SMALL LETTER A WITH ACUTE" )
 226  : ( "â"   U+00E2 Ll 1 "LATIN SMALL LETTER A WITH CIRCUMFLEX" )
 227  : ( "ã"   U+00E3 Ll 1 "LATIN SMALL LETTER A WITH TILDE" )
 228  : ( "ä"   U+00E4 Ll 1 "LATIN SMALL LETTER A WITH DIAERESIS" )
 229  : ( "å"   U+00E5 Ll 1 "LATIN SMALL LETTER A WITH RING ABOVE" )
 230  : ( "æ"   U+00E6 Ll 1 "LATIN SMALL LETTER AE" )
 231  : ( "ç"   U+00E7 Ll 1 "LATIN SMALL LETTER C WITH CEDILLA" )
 232  : ( "è"   U+00E8 Ll 1 "LATIN SMALL LETTER E WITH GRAVE" )
 233  : ( "é"   U+00E9 Ll 1 "LATIN SMALL LETTER E WITH ACUTE" )
 234  : ( "ê"   U+00EA Ll 1 "LATIN SMALL LETTER E WITH CIRCUMFLEX" )
 235  : ( "ë"   U+00EB Ll 1 "LATIN SMALL LETTER E WITH DIAERESIS" )
 236  : ( "ì"   U+00EC Ll 1 "LATIN SMALL LETTER I WITH GRAVE" )
 237  : ( "í"   U+00ED Ll 1 "LATIN SMALL LETTER I WITH ACUTE" )
 238  : ( "î"   U+00EE Ll 1 "LATIN SMALL LETTER I WITH CIRCUMFLEX" )
 239  : ( "ï"   U+00EF Ll 1 "LATIN SMALL LETTER I WITH DIAERESIS" )
 240  : ( "ð"   U+00F0 Ll 1 "LATIN SMALL LETTER ETH" )
 241  : ( "ñ"   U+00F1 Ll 1 "LATIN SMALL LETTER N WITH TILDE" )
 242  : ( "ò"   U+00F2 Ll 1 "LATIN SMALL LETTER O WITH GRAVE" )
 243  : ( "ó"   U+00F3 Ll 1 "LATIN SMALL LETTER O WITH ACUTE" )
 244  : ( "ô"   U+00F4 Ll 1 "LATIN SMALL LETTER O WITH CIRCUMFLEX" )
 245  : ( "õ"   U+00F5 Ll 1 "LATIN SMALL LETTER O WITH TILDE" )
 246  : ( "ö"   U+00F6 Ll 1 "LATIN SMALL LETTER O WITH DIAERESIS" )
 247  : ( "÷"   U+00F7 Sm 1 "DIVISION SIGN" )
 248  : ( "ø"   U+00F8 Ll 1 "LATIN SMALL LETTER O WITH STROKE" )
 249  : ( "ù"   U+00F9 Ll 1 "LATIN SMALL LETTER U WITH GRAVE" )
 250  : ( "ú"   U+00FA Ll 1 "LATIN SMALL LETTER U WITH ACUTE" )
 251  : ( "û"   U+00FB Ll 1 "LATIN SMALL LETTER U WITH CIRCUMFLEX" )
 252  : ( "ü"   U+00FC Ll 1 "LATIN SMALL LETTER U WITH DIAERESIS" )
 253  : ( "ý"   U+00FD Ll 1 "LATIN SMALL LETTER Y WITH ACUTE" )
 254  : ( "þ"   U+00FE Ll 1 "LATIN SMALL LETTER THORN" )
 255  : ( "ÿ"   U+00FF Ll 1 "LATIN SMALL LETTER Y WITH DIAERESIS" )
 256  : ( "Ā"   U+0100 Lu 1 "LATIN CAPITAL LETTER A WITH MACRON" )
 257  : ( "ā"   U+0101 Ll 1 "LATIN SMALL LETTER A WITH MACRON" )
 258  : ( "Ă"   U+0102 Lu 1 "LATIN CAPITAL LETTER A WITH BREVE" )
 259  : ( "ă"   U+0103 Ll 1 "LATIN SMALL LETTER A WITH BREVE" )
 260  : ( "Ą"   U+0104 Lu 1 "LATIN CAPITAL LETTER A WITH OGONEK" )
 261  : ( "ą"   U+0105 Ll 1 "LATIN SMALL LETTER A WITH OGONEK" )
 262  : ( "Ć"   U+0106 Lu 1 "LATIN CAPITAL LETTER C WITH ACUTE" )
 263  : ( "ć"   U+0107 Ll 1 "LATIN SMALL LETTER C WITH ACUTE" )
 264  : ( "Ĉ"   U+0108 Lu 1 "LATIN CAPITAL LETTER C WITH CIRCUMFLEX" )
 265  : ( "ĉ"   U+0109 Ll 1 "LATIN SMALL LETTER C WITH CIRCUMFLEX" )
 266  : ( "Ċ"   U+010A Lu 1 "LATIN CAPITAL LETTER C WITH DOT ABOVE" )
 267  : ( "ċ"   U+010B Ll 1 "LATIN SMALL LETTER C WITH DOT ABOVE" )
 268  : ( "Č"   U+010C Lu 1 "LATIN CAPITAL LETTER C WITH CARON" )
 269  : ( "č"   U+010D Ll 1 "LATIN SMALL LETTER C WITH CARON" )
 270  : ( "Ď"   U+010E Lu 1 "LATIN CAPITAL LETTER D WITH CARON" )
 271  : ( "ď"   U+010F Ll 1 "LATIN SMALL LETTER D WITH CARON" )
 272  : ( "Đ"   U+0110 Lu 1 "LATIN CAPITAL LETTER D WITH STROKE" )
 273  : ( "đ"   U+0111 Ll 1 "LATIN SMALL LETTER D WITH STROKE" )
 274  : ( "Ē"   U+0112 Lu 1 "LATIN CAPITAL LETTER E WITH MACRON" )
 275  : ( "ē"   U+0113 Ll 1 "LATIN SMALL LETTER E WITH MACRON" )
 276  : ( "Ĕ"   U+0114 Lu 1 "LATIN CAPITAL LETTER E WITH BREVE" )
 277  : ( "ĕ"   U+0115 Ll 1 "LATIN SMALL LETTER E WITH BREVE" )
 278  : ( "Ė"   U+0116 Lu 1 "LATIN CAPITAL LETTER E WITH DOT ABOVE" )
 279  : ( "ė"   U+0117 Ll 1 "LATIN SMALL LETTER E WITH DOT ABOVE" )
 280  : ( "Ę"   U+0118 Lu 1 "LATIN CAPITAL LETTER E WITH OGONEK" )
 281  : ( "ę"   U+0119 Ll 1 "LATIN SMALL LETTER E WITH OGONEK" )
 282  : ( "Ě"   U+011A Lu 1 "LATIN CAPITAL LETTER E WITH CARON" )
 283  : ( "ě"   U+011B Ll 1 "LATIN SMALL LETTER E WITH CARON" )
 284  : ( "Ĝ"   U+011C Lu 1 "LATIN CAPITAL LETTER G WITH CIRCUMFLEX" )
 285  : ( "ĝ"   U+011D Ll 1 "LATIN SMALL LETTER G WITH CIRCUMFLEX" )
 286  : ( "Ğ"   U+011E Lu 1 "LATIN CAPITAL LETTER G WITH BREVE" )
 287  : ( "ğ"   U+011F Ll 1 "LATIN SMALL LETTER G WITH BREVE" )
 288  : ( "Ġ"   U+0120 Lu 1 "LATIN CAPITAL LETTER G WITH DOT ABOVE" )
 289  : ( "ġ"   U+0121 Ll 1 "LATIN SMALL LETTER G WITH DOT ABOVE" )
 290  : ( "Ģ"   U+0122 Lu 1 "LATIN CAPITAL LETTER G WITH CEDILLA" )
 291  : ( "ģ"   U+0123 Ll 1 "LATIN SMALL LETTER G WITH CEDILLA" )
 292  : ( "Ĥ"   U+0124 Lu 1 "LATIN CAPITAL LETTER H WITH CIRCUMFLEX" )
 293  : ( "ĥ"   U+0125 Ll 1 "LATIN SMALL LETTER H WITH CIRCUMFLEX" )
 294  : ( "Ħ"   U+0126 Lu 1 "LATIN CAPITAL LETTER H WITH STROKE" )
 295  : ( "ħ"   U+0127 Ll 1 "LATIN SMALL LETTER H WITH STROKE" )
 296  : ( "Ĩ"   U+0128 Lu 1 "LATIN CAPITAL LETTER I WITH TILDE" )
 297  : ( "ĩ"   U+0129 Ll 1 "LATIN SMALL LETTER I WITH TILDE" )
 298  : ( "Ī"   U+012A Lu 1 "LATIN CAPITAL LETTER I WITH MACRON" )
 299  : ( "ī"   U+012B Ll 1 "LATIN SMALL LETTER I WITH MACRON" )
 300  : ( "Ĭ"   U+012C Lu 1 "LATIN CAPITAL LETTER I WITH BREVE" )
 301  : ( "ĭ"   U+012D Ll 1 "LATIN SMALL LETTER I WITH BREVE" )
 302  : ( "Į"   U+012E Lu 1 "LATIN CAPITAL LETTER I WITH OGONEK" )
 303  : ( "į"   U+012F Ll 1 "LATIN SMALL LETTER I WITH OGONEK" )
 304  : ( "İ"   U+0130 Lu 1 "LATIN CAPITAL LETTER I WITH DOT ABOVE" )
 305  : ( "ı"   U+0131 Ll 1 "LATIN SMALL LETTER DOTLESS I" )
 306  : ( "IJ"   U+0132 Lu 1 "LATIN CAPITAL LIGATURE IJ" )
 307  : ( "ij"   U+0133 Ll 1 "LATIN SMALL LIGATURE IJ" )
 308  : ( "Ĵ"   U+0134 Lu 1 "LATIN CAPITAL LETTER J WITH CIRCUMFLEX" )
 309  : ( "ĵ"   U+0135 Ll 1 "LATIN SMALL LETTER J WITH CIRCUMFLEX" )
 310  : ( "Ķ"   U+0136 Lu 1 "LATIN CAPITAL LETTER K WITH CEDILLA" )
 311  : ( "ķ"   U+0137 Ll 1 "LATIN SMALL LETTER K WITH CEDILLA" )
 312  : ( "ĸ"   U+0138 Ll 1 "LATIN SMALL LETTER KRA" )
 313  : ( "Ĺ"   U+0139 Lu 1 "LATIN CAPITAL LETTER L WITH ACUTE" )
 314  : ( "ĺ"   U+013A Ll 1 "LATIN SMALL LETTER L WITH ACUTE" )
 315  : ( "Ļ"   U+013B Lu 1 "LATIN CAPITAL LETTER L WITH CEDILLA" )
 316  : ( "ļ"   U+013C Ll 1 "LATIN SMALL LETTER L WITH CEDILLA" )
 317  : ( "Ľ"   U+013D Lu 1 "LATIN CAPITAL LETTER L WITH CARON" )
 318  : ( "ľ"   U+013E Ll 1 "LATIN SMALL LETTER L WITH CARON" )
 319  : ( "Ŀ"   U+013F Lu 1 "LATIN CAPITAL LETTER L WITH MIDDLE DOT" )
 320  : ( "ŀ"   U+0140 Ll 1 "LATIN SMALL LETTER L WITH MIDDLE DOT" )
 321  : ( "Ł"   U+0141 Lu 1 "LATIN CAPITAL LETTER L WITH STROKE" )
 322  : ( "ł"   U+0142 Ll 1 "LATIN SMALL LETTER L WITH STROKE" )
 323  : ( "Ń"   U+0143 Lu 1 "LATIN CAPITAL LETTER N WITH ACUTE" )
 324  : ( "ń"   U+0144 Ll 1 "LATIN SMALL LETTER N WITH ACUTE" )
 325  : ( "Ņ"   U+0145 Lu 1 "LATIN CAPITAL LETTER N WITH CEDILLA" )
 326  : ( "ņ"   U+0146 Ll 1 "LATIN SMALL LETTER N WITH CEDILLA" )
 327  : ( "Ň"   U+0147 Lu 1 "LATIN CAPITAL LETTER N WITH CARON" )
 328  : ( "ň"   U+0148 Ll 1 "LATIN SMALL LETTER N WITH CARON" )
 329  : ( "ʼn"   U+0149 Ll 1 "LATIN SMALL LETTER N PRECEDED BY APOSTROPHE" )
 330  : ( "Ŋ"   U+014A Lu 1 "LATIN CAPITAL LETTER ENG" )
 331  : ( "ŋ"   U+014B Ll 1 "LATIN SMALL LETTER ENG" )
 332  : ( "Ō"   U+014C Lu 1 "LATIN CAPITAL LETTER O WITH MACRON" )
 333  : ( "ō"   U+014D Ll 1 "LATIN SMALL LETTER O WITH MACRON" )
 334  : ( "Ŏ"   U+014E Lu 1 "LATIN CAPITAL LETTER O WITH BREVE" )
 335  : ( "ŏ"   U+014F Ll 1 "LATIN SMALL LETTER O WITH BREVE" )
 336  : ( "Ő"   U+0150 Lu 1 "LATIN CAPITAL LETTER O WITH DOUBLE ACUTE" )
 337  : ( "ő"   U+0151 Ll 1 "LATIN SMALL LETTER O WITH DOUBLE ACUTE" )
 338  : ( "Œ"   U+0152 Lu 1 "LATIN CAPITAL LIGATURE OE" )
 339  : ( "œ"   U+0153 Ll 1 "LATIN SMALL LIGATURE OE" )
 340  : ( "Ŕ"   U+0154 Lu 1 "LATIN CAPITAL LETTER R WITH ACUTE" )
 341  : ( "ŕ"   U+0155 Ll 1 "LATIN SMALL LETTER R WITH ACUTE" )
 342  : ( "Ŗ"   U+0156 Lu 1 "LATIN CAPITAL LETTER R WITH CEDILLA" )
 343  : ( "ŗ"   U+0157 Ll 1 "LATIN SMALL LETTER R WITH CEDILLA" )
 344  : ( "Ř"   U+0158 Lu 1 "LATIN CAPITAL LETTER R WITH CARON" )
 345  : ( "ř"   U+0159 Ll 1 "LATIN SMALL LETTER R WITH CARON" )
 346  : ( "Ś"   U+015A Lu 1 "LATIN CAPITAL LETTER S WITH ACUTE" )
 347  : ( "ś"   U+015B Ll 1 "LATIN SMALL LETTER S WITH ACUTE" )
 348  : ( "Ŝ"   U+015C Lu 1 "LATIN CAPITAL LETTER S WITH CIRCUMFLEX" )
 349  : ( "ŝ"   U+015D Ll 1 "LATIN SMALL LETTER S WITH CIRCUMFLEX" )
 350  : ( "Ş"   U+015E Lu 1 "LATIN CAPITAL LETTER S WITH CEDILLA" )
 351  : ( "ş"   U+015F Ll 1 "LATIN SMALL LETTER S WITH CEDILLA" )
 352  : ( "Š"   U+0160 Lu 1 "LATIN CAPITAL LETTER S WITH CARON" )
 353  : ( "š"   U+0161 Ll 1 "LATIN SMALL LETTER S WITH CARON" )
 354  : ( "Ţ"   U+0162 Lu 1 "LATIN CAPITAL LETTER T WITH CEDILLA" )
 355  : ( "ţ"   U+0163 Ll 1 "LATIN SMALL LETTER T WITH CEDILLA" )
 356  : ( "Ť"   U+0164 Lu 1 "LATIN CAPITAL LETTER T WITH CARON" )
 357  : ( "ť"   U+0165 Ll 1 "LATIN SMALL LETTER T WITH CARON" )
 358  : ( "Ŧ"   U+0166 Lu 1 "LATIN CAPITAL LETTER T WITH STROKE" )
 359  : ( "ŧ"   U+0167 Ll 1 "LATIN SMALL LETTER T WITH STROKE" )
 360  : ( "Ũ"   U+0168 Lu 1 "LATIN CAPITAL LETTER U WITH TILDE" )
 361  : ( "ũ"   U+0169 Ll 1 "LATIN SMALL LETTER U WITH TILDE" )
 362  : ( "Ū"   U+016A Lu 1 "LATIN CAPITAL LETTER U WITH MACRON" )
 363  : ( "ū"   U+016B Ll 1 "LATIN SMALL LETTER U WITH MACRON" )
 364  : ( "Ŭ"   U+016C Lu 1 "LATIN CAPITAL LETTER U WITH BREVE" )
 365  : ( "ŭ"   U+016D Ll 1 "LATIN SMALL LETTER U WITH BREVE" )
 366  : ( "Ů"   U+016E Lu 1 "LATIN CAPITAL LETTER U WITH RING ABOVE" )
 367  : ( "ů"   U+016F Ll 1 "LATIN SMALL LETTER U WITH RING ABOVE" )
 368  : ( "Ű"   U+0170 Lu 1 "LATIN CAPITAL LETTER U WITH DOUBLE ACUTE" )
 369  : ( "ű"   U+0171 Ll 1 "LATIN SMALL LETTER U WITH DOUBLE ACUTE" )
 370  : ( "Ų"   U+0172 Lu 1 "LATIN CAPITAL LETTER U WITH OGONEK" )
 371  : ( "ų"   U+0173 Ll 1 "LATIN SMALL LETTER U WITH OGONEK" )
 372  : ( "Ŵ"   U+0174 Lu 1 "LATIN CAPITAL LETTER W WITH CIRCUMFLEX" )
 373  : ( "ŵ"   U+0175 Ll 1 "LATIN SMALL LETTER W WITH CIRCUMFLEX" )
 374  : ( "Ŷ"   U+0176 Lu 1 "LATIN CAPITAL LETTER Y WITH CIRCUMFLEX" )
 375  : ( "ŷ"   U+0177 Ll 1 "LATIN SMALL LETTER Y WITH CIRCUMFLEX" )
 376  : ( "Ÿ"   U+0178 Lu 1 "LATIN CAPITAL LETTER Y WITH DIAERESIS" )
 377  : ( "Ź"   U+0179 Lu 1 "LATIN CAPITAL LETTER Z WITH ACUTE" )
 378  : ( "ź"   U+017A Ll 1 "LATIN SMALL LETTER Z WITH ACUTE" )
 379  : ( "Ż"   U+017B Lu 1 "LATIN CAPITAL LETTER Z WITH DOT ABOVE" )
 380  : ( "ż"   U+017C Ll 1 "LATIN SMALL LETTER Z WITH DOT ABOVE" )
 381  : ( "Ž"   U+017D Lu 1 "LATIN CAPITAL LETTER Z WITH CARON" )
 382  : ( "ž"   U+017E Ll 1 "LATIN SMALL LETTER Z WITH CARON" )
 383  : ( "ſ"   U+017F Ll 1 "LATIN SMALL LETTER LONG S" )
 384  : ( "ƀ"   U+0180 Ll 1 "LATIN SMALL LETTER B WITH STROKE" )
 385  : ( "Ɓ"   U+0181 Lu 1 "LATIN CAPITAL LETTER B WITH HOOK" )
 386  : ( "Ƃ"   U+0182 Lu 1 "LATIN CAPITAL LETTER B WITH TOPBAR" )
 387  : ( "ƃ"   U+0183 Ll 1 "LATIN SMALL LETTER B WITH TOPBAR" )
 388  : ( "Ƅ"   U+0184 Lu 1 "LATIN CAPITAL LETTER TONE SIX" )
 389  : ( "ƅ"   U+0185 Ll 1 "LATIN SMALL LETTER TONE SIX" )
 390  : ( "Ɔ"   U+0186 Lu 1 "LATIN CAPITAL LETTER OPEN O" )
 391  : ( "Ƈ"   U+0187 Lu 1 "LATIN CAPITAL LETTER C WITH HOOK" )
 392  : ( "ƈ"   U+0188 Ll 1 "LATIN SMALL LETTER C WITH HOOK" )
 393  : ( "Ɖ"   U+0189 Lu 1 "LATIN CAPITAL LETTER AFRICAN D" )
 394  : ( "Ɗ"   U+018A Lu 1 "LATIN CAPITAL LETTER D WITH HOOK" )
 395  : ( "Ƌ"   U+018B Lu 1 "LATIN CAPITAL LETTER D WITH TOPBAR" )
 396  : ( "ƌ"   U+018C Ll 1 "LATIN SMALL LETTER D WITH TOPBAR" )
 397  : ( "ƍ"   U+018D Ll 1 "LATIN SMALL LETTER TURNED DELTA" )
 398  : ( "Ǝ"   U+018E Lu 1 "LATIN CAPITAL LETTER REVERSED E" )
 399  : ( "Ə"   U+018F Lu 1 "LATIN CAPITAL LETTER SCHWA" )
 400  : ( "Ɛ"   U+0190 Lu 1 "LATIN CAPITAL LETTER OPEN E" )
 401  : ( "Ƒ"   U+0191 Lu 1 "LATIN CAPITAL LETTER F WITH HOOK" )
 402  : ( "ƒ"   U+0192 Ll 1 "LATIN SMALL LETTER F WITH HOOK" )
 403  : ( "Ɠ"   U+0193 Lu 1 "LATIN CAPITAL LETTER G WITH HOOK" )
 404  : ( "Ɣ"   U+0194 Lu 1 "LATIN CAPITAL LETTER GAMMA" )
 405  : ( "ƕ"   U+0195 Ll 1 "LATIN SMALL LETTER HV" )
 406  : ( "Ɩ"   U+0196 Lu 1 "LATIN CAPITAL LETTER IOTA" )
 407  : ( "Ɨ"   U+0197 Lu 1 "LATIN CAPITAL LETTER I WITH STROKE" )
 408  : ( "Ƙ"   U+0198 Lu 1 "LATIN CAPITAL LETTER K WITH HOOK" )
 409  : ( "ƙ"   U+0199 Ll 1 "LATIN SMALL LETTER K WITH HOOK" )
 410  : ( "ƚ"   U+019A Ll 1 "LATIN SMALL LETTER L WITH BAR" )
 411  : ( "ƛ"   U+019B Ll 1 "LATIN SMALL LETTER LAMBDA WITH STROKE" )
 412  : ( "Ɯ"   U+019C Lu 1 "LATIN CAPITAL LETTER TURNED M" )
 413  : ( "Ɲ"   U+019D Lu 1 "LATIN CAPITAL LETTER N WITH LEFT HOOK" )
 414  : ( "ƞ"   U+019E Ll 1 "LATIN SMALL LETTER N WITH LONG RIGHT LEG" )
 415  : ( "Ɵ"   U+019F Lu 1 "LATIN CAPITAL LETTER O WITH MIDDLE TILDE" )
 416  : ( "Ơ"   U+01A0 Lu 1 "LATIN CAPITAL LETTER O WITH HORN" )
 417  : ( "ơ"   U+01A1 Ll 1 "LATIN SMALL LETTER O WITH HORN" )
 418  : ( "Ƣ"   U+01A2 Lu 1 "LATIN CAPITAL LETTER OI", "LATIN CAPITAL LETTER GHA" )
 419  : ( "ƣ"   U+01A3 Ll 1 "LATIN SMALL LETTER OI", "LATIN SMALL LETTER GHA" )
 420  : ( "Ƥ"   U+01A4 Lu 1 "LATIN CAPITAL LETTER P WITH HOOK" )
 421  : ( "ƥ"   U+01A5 Ll 1 "LATIN SMALL LETTER P WITH HOOK" )
 422  : ( "Ʀ"   U+01A6 Lu 1 "LATIN LETTER YR" )
 423  : ( "Ƨ"   U+01A7 Lu 1 "LATIN CAPITAL LETTER TONE TWO" )
 424  : ( "ƨ"   U+01A8 Ll 1 "LATIN SMALL LETTER TONE TWO" )
 425  : ( "Ʃ"   U+01A9 Lu 1 "LATIN CAPITAL LETTER ESH" )
 426  : ( "ƪ"   U+01AA Ll 1 "LATIN LETTER REVERSED ESH LOOP" )
 427  : ( "ƫ"   U+01AB Ll 1 "LATIN SMALL LETTER T WITH PALATAL HOOK" )
 428  : ( "Ƭ"   U+01AC Lu 1 "LATIN CAPITAL LETTER T WITH HOOK" )
 429  : ( "ƭ"   U+01AD Ll 1 "LATIN SMALL LETTER T WITH HOOK" )
 430  : ( "Ʈ"   U+01AE Lu 1 "LATIN CAPITAL LETTER T WITH RETROFLEX HOOK" )
 431  : ( "Ư"   U+01AF Lu 1 "LATIN CAPITAL LETTER U WITH HORN" )
 432  : ( "ư"   U+01B0 Ll 1 "LATIN SMALL LETTER U WITH HORN" )
 433  : ( "Ʊ"   U+01B1 Lu 1 "LATIN CAPITAL LETTER UPSILON" )
 434  : ( "Ʋ"   U+01B2 Lu 1 "LATIN CAPITAL LETTER V WITH HOOK" )
 435  : ( "Ƴ"   U+01B3 Lu 1 "LATIN CAPITAL LETTER Y WITH HOOK" )
 436  : ( "ƴ"   U+01B4 Ll 1 "LATIN SMALL LETTER Y WITH HOOK" )
 437  : ( "Ƶ"   U+01B5 Lu 1 "LATIN CAPITAL LETTER Z WITH STROKE" )
 438  : ( "ƶ"   U+01B6 Ll 1 "LATIN SMALL LETTER Z WITH STROKE" )
 439  : ( "Ʒ"   U+01B7 Lu 1 "LATIN CAPITAL LETTER EZH" )
 440  : ( "Ƹ"   U+01B8 Lu 1 "LATIN CAPITAL LETTER EZH REVERSED" )
 441  : ( "ƹ"   U+01B9 Ll 1 "LATIN SMALL LETTER EZH REVERSED" )
 442  : ( "ƺ"   U+01BA Ll 1 "LATIN SMALL LETTER EZH WITH TAIL" )
 443  : ( "ƻ"   U+01BB Lo 1 "LATIN LETTER TWO WITH STROKE" )
 444  : ( "Ƽ"   U+01BC Lu 1 "LATIN CAPITAL LETTER TONE FIVE" )
 445  : ( "ƽ"   U+01BD Ll 1 "LATIN SMALL LETTER TONE FIVE" )
 446  : ( "ƾ"   U+01BE Ll 1 "LATIN LETTER INVERTED GLOTTAL STOP WITH STROKE" )
 447  : ( "ƿ"   U+01BF Ll 1 "LATIN LETTER WYNN" )
 448  : ( "ǀ"   U+01C0 Lo 1 "LATIN LETTER DENTAL CLICK" )
 449  : ( "ǁ"   U+01C1 Lo 1 "LATIN LETTER LATERAL CLICK" )
 450  : ( "ǂ"   U+01C2 Lo 1 "LATIN LETTER ALVEOLAR CLICK" )
 451  : ( "ǃ"   U+01C3 Lo 1 "LATIN LETTER RETROFLEX CLICK" )
 452  : ( "DŽ"   U+01C4 Lu 1 "LATIN CAPITAL LETTER DZ WITH CARON" )
 453  : ( "Dž"   U+01C5 Lt 1 "LATIN CAPITAL LETTER D WITH SMALL LETTER Z WITH CARON" )
 454  : ( "dž"   U+01C6 Ll 1 "LATIN SMALL LETTER DZ WITH CARON" )
 455  : ( "LJ"   U+01C7 Lu 1 "LATIN CAPITAL LETTER LJ" )
 456  : ( "Lj"   U+01C8 Lt 1 "LATIN CAPITAL LETTER L WITH SMALL LETTER J" )
 457  : ( "lj"   U+01C9 Ll 1 "LATIN SMALL LETTER LJ" )
 458  : ( "NJ"   U+01CA Lu 1 "LATIN CAPITAL LETTER NJ" )
 459  : ( "Nj"   U+01CB Lt 1 "LATIN CAPITAL LETTER N WITH SMALL LETTER J" )
 460  : ( "nj"   U+01CC Ll 1 "LATIN SMALL LETTER NJ" )
 461  : ( "Ǎ"   U+01CD Lu 1 "LATIN CAPITAL LETTER A WITH CARON" )
 462  : ( "ǎ"   U+01CE Ll 1 "LATIN SMALL LETTER A WITH CARON" )
 463  : ( "Ǐ"   U+01CF Lu 1 "LATIN CAPITAL LETTER I WITH CARON" )
 464  : ( "ǐ"   U+01D0 Ll 1 "LATIN SMALL LETTER I WITH CARON" )
 465  : ( "Ǒ"   U+01D1 Lu 1 "LATIN CAPITAL LETTER O WITH CARON" )
 466  : ( "ǒ"   U+01D2 Ll 1 "LATIN SMALL LETTER O WITH CARON" )
 467  : ( "Ǔ"   U+01D3 Lu 1 "LATIN CAPITAL LETTER U WITH CARON" )
 468  : ( "ǔ"   U+01D4 Ll 1 "LATIN SMALL LETTER U WITH CARON" )
 469  : ( "Ǖ"   U+01D5 Lu 1 "LATIN CAPITAL LETTER U WITH DIAERESIS AND MACRON" )
 470  : ( "ǖ"   U+01D6 Ll 1 "LATIN SMALL LETTER U WITH DIAERESIS AND MACRON" )
 471  : ( "Ǘ"   U+01D7 Lu 1 "LATIN CAPITAL LETTER U WITH DIAERESIS AND ACUTE" )
 472  : ( "ǘ"   U+01D8 Ll 1 "LATIN SMALL LETTER U WITH DIAERESIS AND ACUTE" )
 473  : ( "Ǚ"   U+01D9 Lu 1 "LATIN CAPITAL LETTER U WITH DIAERESIS AND CARON" )
 474  : ( "ǚ"   U+01DA Ll 1 "LATIN SMALL LETTER U WITH DIAERESIS AND CARON" )
 475  : ( "Ǜ"   U+01DB Lu 1 "LATIN CAPITAL LETTER U WITH DIAERESIS AND GRAVE" )
 476  : ( "ǜ"   U+01DC Ll 1 "LATIN SMALL LETTER U WITH DIAERESIS AND GRAVE" )
 477  : ( "ǝ"   U+01DD Ll 1 "LATIN SMALL LETTER TURNED E" )
 478  : ( "Ǟ"   U+01DE Lu 1 "LATIN CAPITAL LETTER A WITH DIAERESIS AND MACRON" )
 479  : ( "ǟ"   U+01DF Ll 1 "LATIN SMALL LETTER A WITH DIAERESIS AND MACRON" )
 480  : ( "Ǡ"   U+01E0 Lu 1 "LATIN CAPITAL LETTER A WITH DOT ABOVE AND MACRON" )
 481  : ( "ǡ"   U+01E1 Ll 1 "LATIN SMALL LETTER A WITH DOT ABOVE AND MACRON" )
 482  : ( "Ǣ"   U+01E2 Lu 1 "LATIN CAPITAL LETTER AE WITH MACRON" )
 483  : ( "ǣ"   U+01E3 Ll 1 "LATIN SMALL LETTER AE WITH MACRON" )
 484  : ( "Ǥ"   U+01E4 Lu 1 "LATIN CAPITAL LETTER G WITH STROKE" )
 485  : ( "ǥ"   U+01E5 Ll 1 "LATIN SMALL LETTER G WITH STROKE" )
 486  : ( "Ǧ"   U+01E6 Lu 1 "LATIN CAPITAL LETTER G WITH CARON" )
 487  : ( "ǧ"   U+01E7 Ll 1 "LATIN SMALL LETTER G WITH CARON" )
 488  : ( "Ǩ"   U+01E8 Lu 1 "LATIN CAPITAL LETTER K WITH CARON" )
 489  : ( "ǩ"   U+01E9 Ll 1 "LATIN SMALL LETTER K WITH CARON" )
 490  : ( "Ǫ"   U+01EA Lu 1 "LATIN CAPITAL LETTER O WITH OGONEK" )
 491  : ( "ǫ"   U+01EB Ll 1 "LATIN SMALL LETTER O WITH OGONEK" )
 492  : ( "Ǭ"   U+01EC Lu 1 "LATIN CAPITAL LETTER O WITH OGONEK AND MACRON" )
 493  : ( "ǭ"   U+01ED Ll 1 "LATIN SMALL LETTER O WITH OGONEK AND MACRON" )
 494  : ( "Ǯ"   U+01EE Lu 1 "LATIN CAPITAL LETTER EZH WITH CARON" )
 495  : ( "ǯ"   U+01EF Ll 1 "LATIN SMALL LETTER EZH WITH CARON" )
 496  : ( "ǰ"   U+01F0 Ll 1 "LATIN SMALL LETTER J WITH CARON" )
 497  : ( "DZ"   U+01F1 Lu 1 "LATIN CAPITAL LETTER DZ" )
 498  : ( "Dz"   U+01F2 Lt 1 "LATIN CAPITAL LETTER D WITH SMALL LETTER Z" )
 499  : ( "dz"   U+01F3 Ll 1 "LATIN SMALL LETTER DZ" )
 500  : ( "Ǵ"   U+01F4 Lu 1 "LATIN CAPITAL LETTER G WITH ACUTE" )
 501  : ( "ǵ"   U+01F5 Ll 1 "LATIN SMALL LETTER G WITH ACUTE" )
 502  : ( "Ƕ"   U+01F6 Lu 1 "LATIN CAPITAL LETTER HWAIR" )
 503  : ( "Ƿ"   U+01F7 Lu 1 "LATIN CAPITAL LETTER WYNN" )
 504  : ( "Ǹ"   U+01F8 Lu 1 "LATIN CAPITAL LETTER N WITH GRAVE" )
 505  : ( "ǹ"   U+01F9 Ll 1 "LATIN SMALL LETTER N WITH GRAVE" )
 506  : ( "Ǻ"   U+01FA Lu 1 "LATIN CAPITAL LETTER A WITH RING ABOVE AND ACUTE" )
 507  : ( "ǻ"   U+01FB Ll 1 "LATIN SMALL LETTER A WITH RING ABOVE AND ACUTE" )
 508  : ( "Ǽ"   U+01FC Lu 1 "LATIN CAPITAL LETTER AE WITH ACUTE" )
 509  : ( "ǽ"   U+01FD Ll 1 "LATIN SMALL LETTER AE WITH ACUTE" )
 510  : ( "Ǿ"   U+01FE Lu 1 "LATIN CAPITAL LETTER O WITH STROKE AND ACUTE" )
 511  : ( "ǿ"   U+01FF Ll 1 "LATIN SMALL LETTER O WITH STROKE AND ACUTE" )
 512  : ( "Ȁ"   U+0200 Lu 1 "LATIN CAPITAL LETTER A WITH DOUBLE GRAVE" )
 513  : ( "ȁ"   U+0201 Ll 1 "LATIN SMALL LETTER A WITH DOUBLE GRAVE" )
 514  : ( "Ȃ"   U+0202 Lu 1 "LATIN CAPITAL LETTER A WITH INVERTED BREVE" )
 515  : ( "ȃ"   U+0203 Ll 1 "LATIN SMALL LETTER A WITH INVERTED BREVE" )
 516  : ( "Ȅ"   U+0204 Lu 1 "LATIN CAPITAL LETTER E WITH DOUBLE GRAVE" )
 517  : ( "ȅ"   U+0205 Ll 1 "LATIN SMALL LETTER E WITH DOUBLE GRAVE" )
 518  : ( "Ȇ"   U+0206 Lu 1 "LATIN CAPITAL LETTER E WITH INVERTED BREVE" )
 519  : ( "ȇ"   U+0207 Ll 1 "LATIN SMALL LETTER E WITH INVERTED BREVE" )
 520  : ( "Ȉ"   U+0208 Lu 1 "LATIN CAPITAL LETTER I WITH DOUBLE GRAVE" )
 521  : ( "ȉ"   U+0209 Ll 1 "LATIN SMALL LETTER I WITH DOUBLE GRAVE" )
 522  : ( "Ȋ"   U+020A Lu 1 "LATIN CAPITAL LETTER I WITH INVERTED BREVE" )
 523  : ( "ȋ"   U+020B Ll 1 "LATIN SMALL LETTER I WITH INVERTED BREVE" )
 524  : ( "Ȍ"   U+020C Lu 1 "LATIN CAPITAL LETTER O WITH DOUBLE GRAVE" )
 525  : ( "ȍ"   U+020D Ll 1 "LATIN SMALL LETTER O WITH DOUBLE GRAVE" )
 526  : ( "Ȏ"   U+020E Lu 1 "LATIN CAPITAL LETTER O WITH INVERTED BREVE" )
 527  : ( "ȏ"   U+020F Ll 1 "LATIN SMALL LETTER O WITH INVERTED BREVE" )
 528  : ( "Ȑ"   U+0210 Lu 1 "LATIN CAPITAL LETTER R WITH DOUBLE GRAVE" )
 529  : ( "ȑ"   U+0211 Ll 1 "LATIN SMALL LETTER R WITH DOUBLE GRAVE" )
 530  : ( "Ȓ"   U+0212 Lu 1 "LATIN CAPITAL LETTER R WITH INVERTED BREVE" )
 531  : ( "ȓ"   U+0213 Ll 1 "LATIN SMALL LETTER R WITH INVERTED BREVE" )
 532  : ( "Ȕ"   U+0214 Lu 1 "LATIN CAPITAL LETTER U WITH DOUBLE GRAVE" )
 533  : ( "ȕ"   U+0215 Ll 1 "LATIN SMALL LETTER U WITH DOUBLE GRAVE" )
 534  : ( "Ȗ"   U+0216 Lu 1 "LATIN CAPITAL LETTER U WITH INVERTED BREVE" )
 535  : ( "ȗ"   U+0217 Ll 1 "LATIN SMALL LETTER U WITH INVERTED BREVE" )
 536  : ( "Ș"   U+0218 Lu 1 "LATIN CAPITAL LETTER S WITH COMMA BELOW" )
 537  : ( "ș"   U+0219 Ll 1 "LATIN SMALL LETTER S WITH COMMA BELOW" )
 538  : ( "Ț"   U+021A Lu 1 "LATIN CAPITAL LETTER T WITH COMMA BELOW" )
 539  : ( "ț"   U+021B Ll 1 "LATIN SMALL LETTER T WITH COMMA BELOW" )
 540  : ( "Ȝ"   U+021C Lu 1 "LATIN CAPITAL LETTER YOGH" )
 541  : ( "ȝ"   U+021D Ll 1 "LATIN SMALL LETTER YOGH" )
 542  : ( "Ȟ"   U+021E Lu 1 "LATIN CAPITAL LETTER H WITH CARON" )
 543  : ( "ȟ"   U+021F Ll 1 "LATIN SMALL LETTER H WITH CARON" )
 544  : ( "Ƞ"   U+0220 Lu 1 "LATIN CAPITAL LETTER N WITH LONG RIGHT LEG" )
 545  : ( "ȡ"   U+0221 Ll 1 "LATIN SMALL LETTER D WITH CURL" )
 546  : ( "Ȣ"   U+0222 Lu 1 "LATIN CAPITAL LETTER OU" )
 547  : ( "ȣ"   U+0223 Ll 1 "LATIN SMALL LETTER OU" )
 548  : ( "Ȥ"   U+0224 Lu 1 "LATIN CAPITAL LETTER Z WITH HOOK" )
 549  : ( "ȥ"   U+0225 Ll 1 "LATIN SMALL LETTER Z WITH HOOK" )
 550  : ( "Ȧ"   U+0226 Lu 1 "LATIN CAPITAL LETTER A WITH DOT ABOVE" )
 551  : ( "ȧ"   U+0227 Ll 1 "LATIN SMALL LETTER A WITH DOT ABOVE" )
 552  : ( "Ȩ"   U+0228 Lu 1 "LATIN CAPITAL LETTER E WITH CEDILLA" )
 553  : ( "ȩ"   U+0229 Ll 1 "LATIN SMALL LETTER E WITH CEDILLA" )
 554  : ( "Ȫ"   U+022A Lu 1 "LATIN CAPITAL LETTER O WITH DIAERESIS AND MACRON" )
 555  : ( "ȫ"   U+022B Ll 1 "LATIN SMALL LETTER O WITH DIAERESIS AND MACRON" )
 556  : ( "Ȭ"   U+022C Lu 1 "LATIN CAPITAL LETTER O WITH TILDE AND MACRON" )
 557  : ( "ȭ"   U+022D Ll 1 "LATIN SMALL LETTER O WITH TILDE AND MACRON" )
 558  : ( "Ȯ"   U+022E Lu 1 "LATIN CAPITAL LETTER O WITH DOT ABOVE" )
 559  : ( "ȯ"   U+022F Ll 1 "LATIN SMALL LETTER O WITH DOT ABOVE" )
 560  : ( "Ȱ"   U+0230 Lu 1 "LATIN CAPITAL LETTER O WITH DOT ABOVE AND MACRON" )
 561  : ( "ȱ"   U+0231 Ll 1 "LATIN SMALL LETTER O WITH DOT ABOVE AND MACRON" )
 562  : ( "Ȳ"   U+0232 Lu 1 "LATIN CAPITAL LETTER Y WITH MACRON" )
 563  : ( "ȳ"   U+0233 Ll 1 "LATIN SMALL LETTER Y WITH MACRON" )
 564  : ( "ȴ"   U+0234 Ll 1 "LATIN SMALL LETTER L WITH CURL" )
 565  : ( "ȵ"   U+0235 Ll 1 "LATIN SMALL LETTER N WITH CURL" )
 566  : ( "ȶ"   U+0236 Ll 1 "LATIN SMALL LETTER T WITH CURL" )
 567  : ( "ȷ"   U+0237 Ll 1 "LATIN SMALL LETTER DOTLESS J" )
 568  : ( "ȸ"   U+0238 Ll 1 "LATIN SMALL LETTER DB DIGRAPH" )
 569  : ( "ȹ"   U+0239 Ll 1 "LATIN SMALL LETTER QP DIGRAPH" )
 570  : ( "Ⱥ"   U+023A Lu 1 "LATIN CAPITAL LETTER A WITH STROKE" )
 571  : ( "Ȼ"   U+023B Lu 1 "LATIN CAPITAL LETTER C WITH STROKE" )
 572  : ( "ȼ"   U+023C Ll 1 "LATIN SMALL LETTER C WITH STROKE" )
 573  : ( "Ƚ"   U+023D Lu 1 "LATIN CAPITAL LETTER L WITH BAR" )
 574  : ( "Ⱦ"   U+023E Lu 1 "LATIN CAPITAL LETTER T WITH DIAGONAL STROKE" )
 575  : ( "ȿ"   U+023F Ll 1 "LATIN SMALL LETTER S WITH SWASH TAIL" )
 576  : ( "ɀ"   U+0240 Ll 1 "LATIN SMALL LETTER Z WITH SWASH TAIL" )
 577  : ( "Ɂ"   U+0241 Lu 1 "LATIN CAPITAL LETTER GLOTTAL STOP" )
 578  : ( "ɂ"   U+0242 Ll 1 "LATIN SMALL LETTER GLOTTAL STOP" )
 579  : ( "Ƀ"   U+0243 Lu 1 "LATIN CAPITAL LETTER B WITH STROKE" )
 580  : ( "Ʉ"   U+0244 Lu 1 "LATIN CAPITAL LETTER U BAR" )
 581  : ( "Ʌ"   U+0245 Lu 1 "LATIN CAPITAL LETTER TURNED V" )
 582  : ( "Ɇ"   U+0246 Lu 1 "LATIN CAPITAL LETTER E WITH STROKE" )
 583  : ( "ɇ"   U+0247 Ll 1 "LATIN SMALL LETTER E WITH STROKE" )
 584  : ( "Ɉ"   U+0248 Lu 1 "LATIN CAPITAL LETTER J WITH STROKE" )
 585  : ( "ɉ"   U+0249 Ll 1 "LATIN SMALL LETTER J WITH STROKE" )
 586  : ( "Ɋ"   U+024A Lu 1 "LATIN CAPITAL LETTER SMALL Q WITH HOOK TAIL" )
 587  : ( "ɋ"   U+024B Ll 1 "LATIN SMALL LETTER Q WITH HOOK TAIL" )
 588  : ( "Ɍ"   U+024C Lu 1 "LATIN CAPITAL LETTER R WITH STROKE" )
 589  : ( "ɍ"   U+024D Ll 1 "LATIN SMALL LETTER R WITH STROKE" )
 590  : ( "Ɏ"   U+024E Lu 1 "LATIN CAPITAL LETTER Y WITH STROKE" )
 591  : ( "ɏ"   U+024F Ll 1 "LATIN SMALL LETTER Y WITH STROKE" )
 592  : ( "ɐ"   U+0250 Ll 1 "LATIN SMALL LETTER TURNED A" )
 593  : ( "ɑ"   U+0251 Ll 1 "LATIN SMALL LETTER ALPHA" )
 594  : ( "ɒ"   U+0252 Ll 1 "LATIN SMALL LETTER TURNED ALPHA" )
 595  : ( "ɓ"   U+0253 Ll 1 "LATIN SMALL LETTER B WITH HOOK" )
 596  : ( "ɔ"   U+0254 Ll 1 "LATIN SMALL LETTER OPEN O" )
 597  : ( "ɕ"   U+0255 Ll 1 "LATIN SMALL LETTER C WITH CURL" )
 598  : ( "ɖ"   U+0256 Ll 1 "LATIN SMALL LETTER D WITH TAIL" )
 599  : ( "ɗ"   U+0257 Ll 1 "LATIN SMALL LETTER D WITH HOOK" )
 600  : ( "ɘ"   U+0258 Ll 1 "LATIN SMALL LETTER REVERSED E" )
 601  : ( "ə"   U+0259 Ll 1 "LATIN SMALL LETTER SCHWA" )
 602  : ( "ɚ"   U+025A Ll 1 "LATIN SMALL LETTER SCHWA WITH HOOK" )
 603  : ( "ɛ"   U+025B Ll 1 "LATIN SMALL LETTER OPEN E" )
 604  : ( "ɜ"   U+025C Ll 1 "LATIN SMALL LETTER REVERSED OPEN E" )
 605  : ( "ɝ"   U+025D Ll 1 "LATIN SMALL LETTER REVERSED OPEN E WITH HOOK" )
 606  : ( "ɞ"   U+025E Ll 1 "LATIN SMALL LETTER CLOSED REVERSED OPEN E" )
 607  : ( "ɟ"   U+025F Ll 1 "LATIN SMALL LETTER DOTLESS J WITH STROKE" )
 608  : ( "ɠ"   U+0260 Ll 1 "LATIN SMALL LETTER G WITH HOOK" )
 609  : ( "ɡ"   U+0261 Ll 1 "LATIN SMALL LETTER SCRIPT G" )
 610  : ( "ɢ"   U+0262 Ll 1 "LATIN LETTER SMALL CAPITAL G" )
 611  : ( "ɣ"   U+0263 Ll 1 "LATIN SMALL LETTER GAMMA" )
 612  : ( "ɤ"   U+0264 Ll 1 "LATIN SMALL LETTER RAMS HORN" )
 613  : ( "ɥ"   U+0265 Ll 1 "LATIN SMALL LETTER TURNED H" )
 614  : ( "ɦ"   U+0266 Ll 1 "LATIN SMALL LETTER H WITH HOOK" )
 615  : ( "ɧ"   U+0267 Ll 1 "LATIN SMALL LETTER HENG WITH HOOK" )
 616  : ( "ɨ"   U+0268 Ll 1 "LATIN SMALL LETTER I WITH STROKE" )
 617  : ( "ɩ"   U+0269 Ll 1 "LATIN SMALL LETTER IOTA" )
 618  : ( "ɪ"   U+026A Ll 1 "LATIN LETTER SMALL CAPITAL I" )
 619  : ( "ɫ"   U+026B Ll 1 "LATIN SMALL LETTER L WITH MIDDLE TILDE" )
 620  : ( "ɬ"   U+026C Ll 1 "LATIN SMALL LETTER L WITH BELT" )
 621  : ( "ɭ"   U+026D Ll 1 "LATIN SMALL LETTER L WITH RETROFLEX HOOK" )
 622  : ( "ɮ"   U+026E Ll 1 "LATIN SMALL LETTER LEZH" )
 623  : ( "ɯ"   U+026F Ll 1 "LATIN SMALL LETTER TURNED M" )
 624  : ( "ɰ"   U+0270 Ll 1 "LATIN SMALL LETTER TURNED M WITH LONG LEG" )
 625  : ( "ɱ"   U+0271 Ll 1 "LATIN SMALL LETTER M WITH HOOK" )
 626  : ( "ɲ"   U+0272 Ll 1 "LATIN SMALL LETTER N WITH LEFT HOOK" )
 627  : ( "ɳ"   U+0273 Ll 1 "LATIN SMALL LETTER N WITH RETROFLEX HOOK" )
 628  : ( "ɴ"   U+0274 Ll 1 "LATIN LETTER SMALL CAPITAL N" )
 629  : ( "ɵ"   U+0275 Ll 1 "LATIN SMALL LETTER BARRED O" )
 630  : ( "ɶ"   U+0276 Ll 1 "LATIN LETTER SMALL CAPITAL OE" )
 631  : ( "ɷ"   U+0277 Ll 1 "LATIN SMALL LETTER CLOSED OMEGA" )
 632  : ( "ɸ"   U+0278 Ll 1 "LATIN SMALL LETTER PHI" )
 633  : ( "ɹ"   U+0279 Ll 1 "LATIN SMALL LETTER TURNED R" )
 634  : ( "ɺ"   U+027A Ll 1 "LATIN SMALL LETTER TURNED R WITH LONG LEG" )
 635  : ( "ɻ"   U+027B Ll 1 "LATIN SMALL LETTER TURNED R WITH HOOK" )
 636  : ( "ɼ"   U+027C Ll 1 "LATIN SMALL LETTER R WITH LONG LEG" )
 637  : ( "ɽ"   U+027D Ll 1 "LATIN SMALL LETTER R WITH TAIL" )
 638  : ( "ɾ"   U+027E Ll 1 "LATIN SMALL LETTER R WITH FISHHOOK" )
 639  : ( "ɿ"   U+027F Ll 1 "LATIN SMALL LETTER REVERSED R WITH FISHHOOK" )
 640  : ( "ʀ"   U+0280 Ll 1 "LATIN LETTER SMALL CAPITAL R" )
 641  : ( "ʁ"   U+0281 Ll 1 "LATIN LETTER SMALL CAPITAL INVERTED R" )
 642  : ( "ʂ"   U+0282 Ll 1 "LATIN SMALL LETTER S WITH HOOK" )
 643  : ( "ʃ"   U+0283 Ll 1 "LATIN SMALL LETTER ESH" )
 644  : ( "ʄ"   U+0284 Ll 1 "LATIN SMALL LETTER DOTLESS J WITH STROKE AND HOOK" )
 645  : ( "ʅ"   U+0285 Ll 1 "LATIN SMALL LETTER SQUAT REVERSED ESH" )
 646  : ( "ʆ"   U+0286 Ll 1 "LATIN SMALL LETTER ESH WITH CURL" )
 647  : ( "ʇ"   U+0287 Ll 1 "LATIN SMALL LETTER TURNED T" )
 648  : ( "ʈ"   U+0288 Ll 1 "LATIN SMALL LETTER T WITH RETROFLEX HOOK" )
 649  : ( "ʉ"   U+0289 Ll 1 "LATIN SMALL LETTER U BAR" )
 650  : ( "ʊ"   U+028A Ll 1 "LATIN SMALL LETTER UPSILON" )
 651  : ( "ʋ"   U+028B Ll 1 "LATIN SMALL LETTER V WITH HOOK" )
 652  : ( "ʌ"   U+028C Ll 1 "LATIN SMALL LETTER TURNED V" )
 653  : ( "ʍ"   U+028D Ll 1 "LATIN SMALL LETTER TURNED W" )
 654  : ( "ʎ"   U+028E Ll 1 "LATIN SMALL LETTER TURNED Y" )
 655  : ( "ʏ"   U+028F Ll 1 "LATIN LETTER SMALL CAPITAL Y" )
 656  : ( "ʐ"   U+0290 Ll 1 "LATIN SMALL LETTER Z WITH RETROFLEX HOOK" )
 657  : ( "ʑ"   U+0291 Ll 1 "LATIN SMALL LETTER Z WITH CURL" )
 658  : ( "ʒ"   U+0292 Ll 1 "LATIN SMALL LETTER EZH" )
 659  : ( "ʓ"   U+0293 Ll 1 "LATIN SMALL LETTER EZH WITH CURL" )
 660  : ( "ʔ"   U+0294 Lo 1 "LATIN LETTER GLOTTAL STOP" )
 661  : ( "ʕ"   U+0295 Ll 1 "LATIN LETTER PHARYNGEAL VOICED FRICATIVE" )
 662  : ( "ʖ"   U+0296 Ll 1 "LATIN LETTER INVERTED GLOTTAL STOP" )
 663  : ( "ʗ"   U+0297 Ll 1 "LATIN LETTER STRETCHED C" )
 664  : ( "ʘ"   U+0298 Ll 1 "LATIN LETTER BILABIAL CLICK" )
 665  : ( "ʙ"   U+0299 Ll 1 "LATIN LETTER SMALL CAPITAL B" )
 666  : ( "ʚ"   U+029A Ll 1 "LATIN SMALL LETTER CLOSED OPEN E" )
 667  : ( "ʛ"   U+029B Ll 1 "LATIN LETTER SMALL CAPITAL G WITH HOOK" )
 668  : ( "ʜ"   U+029C Ll 1 "LATIN LETTER SMALL CAPITAL H" )
 669  : ( "ʝ"   U+029D Ll 1 "LATIN SMALL LETTER J WITH CROSSED-TAIL" )
 670  : ( "ʞ"   U+029E Ll 1 "LATIN SMALL LETTER TURNED K" )
 671  : ( "ʟ"   U+029F Ll 1 "LATIN LETTER SMALL CAPITAL L" )
 672  : ( "ʠ"   U+02A0 Ll 1 "LATIN SMALL LETTER Q WITH HOOK" )
 673  : ( "ʡ"   U+02A1 Ll 1 "LATIN LETTER GLOTTAL STOP WITH STROKE" )
 674  : ( "ʢ"   U+02A2 Ll 1 "LATIN LETTER REVERSED GLOTTAL STOP WITH STROKE" )
 675  : ( "ʣ"   U+02A3 Ll 1 "LATIN SMALL LETTER DZ DIGRAPH" )
 676  : ( "ʤ"   U+02A4 Ll 1 "LATIN SMALL LETTER DEZH DIGRAPH" )
 677  : ( "ʥ"   U+02A5 Ll 1 "LATIN SMALL LETTER DZ DIGRAPH WITH CURL" )
 678  : ( "ʦ"   U+02A6 Ll 1 "LATIN SMALL LETTER TS DIGRAPH" )
 679  : ( "ʧ"   U+02A7 Ll 1 "LATIN SMALL LETTER TESH DIGRAPH" )
 680  : ( "ʨ"   U+02A8 Ll 1 "LATIN SMALL LETTER TC DIGRAPH WITH CURL" )
 681  : ( "ʩ"   U+02A9 Ll 1 "LATIN SMALL LETTER FENG DIGRAPH" )
 682  : ( "ʪ"   U+02AA Ll 1 "LATIN SMALL LETTER LS DIGRAPH" )
 683  : ( "ʫ"   U+02AB Ll 1 "LATIN SMALL LETTER LZ DIGRAPH" )
 684  : ( "ʬ"   U+02AC Ll 1 "LATIN LETTER BILABIAL PERCUSSIVE" )
 685  : ( "ʭ"   U+02AD Ll 1 "LATIN LETTER BIDENTAL PERCUSSIVE" )
 686  : ( "ʮ"   U+02AE Ll 1 "LATIN SMALL LETTER TURNED H WITH FISHHOOK" )
 687  : ( "ʯ"   U+02AF Ll 1 "LATIN SMALL LETTER TURNED H WITH FISHHOOK AND TAIL" )
 688  : ( "ʰ"   U+02B0 Lm 1 "MODIFIER LETTER SMALL H" )
 689  : ( "ʱ"   U+02B1 Lm 1 "MODIFIER LETTER SMALL H WITH HOOK" )
 690  : ( "ʲ"   U+02B2 Lm 1 "MODIFIER LETTER SMALL J" )
 691  : ( "ʳ"   U+02B3 Lm 1 "MODIFIER LETTER SMALL R" )
 692  : ( "ʴ"   U+02B4 Lm 1 "MODIFIER LETTER SMALL TURNED R" )
 693  : ( "ʵ"   U+02B5 Lm 1 "MODIFIER LETTER SMALL TURNED R WITH HOOK" )
 694  : ( "ʶ"   U+02B6 Lm 1 "MODIFIER LETTER SMALL CAPITAL INVERTED R" )
 695  : ( "ʷ"   U+02B7 Lm 1 "MODIFIER LETTER SMALL W" )
 696  : ( "ʸ"   U+02B8 Lm 1 "MODIFIER LETTER SMALL Y" )
 697  : ( "ʹ"   U+02B9 Lm 1 "MODIFIER LETTER PRIME" )
 698  : ( "ʺ"   U+02BA Lm 1 "MODIFIER LETTER DOUBLE PRIME" )
 699  : ( "ʻ"   U+02BB Lm 1 "MODIFIER LETTER TURNED COMMA" )
 700  : ( "ʼ"   U+02BC Lm 1 "MODIFIER LETTER APOSTROPHE" )
 701  : ( "ʽ"   U+02BD Lm 1 "MODIFIER LETTER REVERSED COMMA" )
 702  : ( "ʾ"   U+02BE Lm 1 "MODIFIER LETTER RIGHT HALF RING" )
 703  : ( "ʿ"   U+02BF Lm 1 "MODIFIER LETTER LEFT HALF RING" )
 704  : ( "ˀ"   U+02C0 Lm 1 "MODIFIER LETTER GLOTTAL STOP" )
 705  : ( "ˁ"   U+02C1 Lm 1 "MODIFIER LETTER REVERSED GLOTTAL STOP" )
 706  : ( "˂"   U+02C2 Sk 1 "MODIFIER LETTER LEFT ARROWHEAD" )
 707  : ( "˃"   U+02C3 Sk 1 "MODIFIER LETTER RIGHT ARROWHEAD" )
 708  : ( "˄"   U+02C4 Sk 1 "MODIFIER LETTER UP ARROWHEAD" )
 709  : ( "˅"   U+02C5 Sk 1 "MODIFIER LETTER DOWN ARROWHEAD" )
 710  : ( "ˆ"   U+02C6 Lm 1 "MODIFIER LETTER CIRCUMFLEX ACCENT" )
 711  : ( "ˇ"   U+02C7 Lm 1 "CARON" )
 712  : ( "ˈ"   U+02C8 Lm 1 "MODIFIER LETTER VERTICAL LINE" )
 713  : ( "ˉ"   U+02C9 Lm 1 "MODIFIER LETTER MACRON" )
 714  : ( "ˊ"   U+02CA Lm 1 "MODIFIER LETTER ACUTE ACCENT" )
 715  : ( "ˋ"   U+02CB Lm 1 "MODIFIER LETTER GRAVE ACCENT" )
 716  : ( "ˌ"   U+02CC Lm 1 "MODIFIER LETTER LOW VERTICAL LINE" )
 717  : ( "ˍ"   U+02CD Lm 1 "MODIFIER LETTER LOW MACRON" )
 718  : ( "ˎ"   U+02CE Lm 1 "MODIFIER LETTER LOW GRAVE ACCENT" )
 719  : ( "ˏ"   U+02CF Lm 1 "MODIFIER LETTER LOW ACUTE ACCENT" )
 720  : ( "ː"   U+02D0 Lm 1 "MODIFIER LETTER TRIANGULAR COLON" )
 721  : ( "ˑ"   U+02D1 Lm 1 "MODIFIER LETTER HALF TRIANGULAR COLON" )
 722  : ( "˒"   U+02D2 Sk 1 "MODIFIER LETTER CENTRED RIGHT HALF RING" )
 723  : ( "˓"   U+02D3 Sk 1 "MODIFIER LETTER CENTRED LEFT HALF RING" )
 724  : ( "˔"   U+02D4 Sk 1 "MODIFIER LETTER UP TACK" )
 725  : ( "˕"   U+02D5 Sk 1 "MODIFIER LETTER DOWN TACK" )
 726  : ( "˖"   U+02D6 Sk 1 "MODIFIER LETTER PLUS SIGN" )
 727  : ( "˗"   U+02D7 Sk 1 "MODIFIER LETTER MINUS SIGN" )
 728  : ( "˘"   U+02D8 Sk 1 "BREVE" )
 729  : ( "˙"   U+02D9 Sk 1 "DOT ABOVE" )
 730  : ( "˚"   U+02DA Sk 1 "RING ABOVE" )
 731  : ( "˛"   U+02DB Sk 1 "OGONEK" )
 732  : ( "˜"   U+02DC Sk 1 "SMALL TILDE" )
 733  : ( "˝"   U+02DD Sk 1 "DOUBLE ACUTE ACCENT" )
 734  : ( "˞"   U+02DE Sk 1 "MODIFIER LETTER RHOTIC HOOK" )
 735  : ( "˟"   U+02DF Sk 1 "MODIFIER LETTER CROSS ACCENT" )
 736  : ( "ˠ"   U+02E0 Lm 1 "MODIFIER LETTER SMALL GAMMA" )
 737  : ( "ˡ"   U+02E1 Lm 1 "MODIFIER LETTER SMALL L" )
 738  : ( "ˢ"   U+02E2 Lm 1 "MODIFIER LETTER SMALL S" )
 739  : ( "ˣ"   U+02E3 Lm 1 "MODIFIER LETTER SMALL X" )
 740  : ( "ˤ"   U+02E4 Lm 1 "MODIFIER LETTER SMALL REVERSED GLOTTAL STOP" )
 741  : ( "˥"   U+02E5 Sk 1 "MODIFIER LETTER EXTRA-HIGH TONE BAR" )
 742  : ( "˦"   U+02E6 Sk 1 "MODIFIER LETTER HIGH TONE BAR" )
 743  : ( "˧"   U+02E7 Sk 1 "MODIFIER LETTER MID TONE BAR" )
 744  : ( "˨"   U+02E8 Sk 1 "MODIFIER LETTER LOW TONE BAR" )
 745  : ( "˩"   U+02E9 Sk 1 "MODIFIER LETTER EXTRA-LOW TONE BAR" )
 746  : ( "˪"   U+02EA Sk 1 "MODIFIER LETTER YIN DEPARTING TONE MARK" )
 747  : ( "˫"   U+02EB Sk 1 "MODIFIER LETTER YANG DEPARTING TONE MARK" )
 748  : ( "ˬ"   U+02EC Lm 1 "MODIFIER LETTER VOICING" )
 749  : ( "˭"   U+02ED Sk 1 "MODIFIER LETTER UNASPIRATED" )
 750  : ( "ˮ"   U+02EE Lm 1 "MODIFIER LETTER DOUBLE APOSTROPHE" )
 751  : ( "˯"   U+02EF Sk 1 "MODIFIER LETTER LOW DOWN ARROWHEAD" )
 752  : ( "˰"   U+02F0 Sk 1 "MODIFIER LETTER LOW UP ARROWHEAD" )
 753  : ( "˱"   U+02F1 Sk 1 "MODIFIER LETTER LOW LEFT ARROWHEAD" )
 754  : ( "˲"   U+02F2 Sk 1 "MODIFIER LETTER LOW RIGHT ARROWHEAD" )
 755  : ( "˳"   U+02F3 Sk 1 "MODIFIER LETTER LOW RING" )
 756  : ( "˴"   U+02F4 Sk 1 "MODIFIER LETTER MIDDLE GRAVE ACCENT" )
 757  : ( "˵"   U+02F5 Sk 1 "MODIFIER LETTER MIDDLE DOUBLE GRAVE ACCENT" )
 758  : ( "˶"   U+02F6 Sk 1 "MODIFIER LETTER MIDDLE DOUBLE ACUTE ACCENT" )
 759  : ( "˷"   U+02F7 Sk 1 "MODIFIER LETTER LOW TILDE" )
 760  : ( "˸"   U+02F8 Sk 1 "MODIFIER LETTER RAISED COLON" )
 761  : ( "˹"   U+02F9 Sk 1 "MODIFIER LETTER BEGIN HIGH TONE" )
 762  : ( "˺"   U+02FA Sk 1 "MODIFIER LETTER END HIGH TONE" )
 763  : ( "˻"   U+02FB Sk 1 "MODIFIER LETTER BEGIN LOW TONE" )
 764  : ( "˼"   U+02FC Sk 1 "MODIFIER LETTER END LOW TONE" )
 765  : ( "˽"   U+02FD Sk 1 "MODIFIER LETTER SHELF" )
 766  : ( "˾"   U+02FE Sk 1 "MODIFIER LETTER OPEN SHELF" )
 767  : ( "˿"   U+02FF Sk 1 "MODIFIER LETTER LOW LEFT ARROW" )
 768  : ( "̀"    U+0300 Mn 0 "COMBINING GRAVE ACCENT" )
 769  : ( "́"    U+0301 Mn 0 "COMBINING ACUTE ACCENT" )
 770  : ( "̂"    U+0302 Mn 0 "COMBINING CIRCUMFLEX ACCENT" )
 771  : ( "̃"    U+0303 Mn 0 "COMBINING TILDE" )
 772  : ( "̄"    U+0304 Mn 0 "COMBINING MACRON" )
 773  : ( "̅"    U+0305 Mn 0 "COMBINING OVERLINE" )
 774  : ( "̆"    U+0306 Mn 0 "COMBINING BREVE" )
 775  : ( "̇"    U+0307 Mn 0 "COMBINING DOT ABOVE" )
 776  : ( "̈"    U+0308 Mn 0 "COMBINING DIAERESIS" )
 777  : ( "̉"    U+0309 Mn 0 "COMBINING HOOK ABOVE" )
 778  : ( "̊"    U+030A Mn 0 "COMBINING RING ABOVE" )
 779  : ( "̋"    U+030B Mn 0 "COMBINING DOUBLE ACUTE ACCENT" )
 780  : ( "̌"    U+030C Mn 0 "COMBINING CARON" )
 781  : ( "̍"    U+030D Mn 0 "COMBINING VERTICAL LINE ABOVE" )
 782  : ( "̎"    U+030E Mn 0 "COMBINING DOUBLE VERTICAL LINE ABOVE" )
 783  : ( "̏"    U+030F Mn 0 "COMBINING DOUBLE GRAVE ACCENT" )
 784  : ( "̐"    U+0310 Mn 0 "COMBINING CANDRABINDU" )
 785  : ( "̑"    U+0311 Mn 0 "COMBINING INVERTED BREVE" )
 786  : ( "̒"    U+0312 Mn 0 "COMBINING TURNED COMMA ABOVE" )
 787  : ( "̓"    U+0313 Mn 0 "COMBINING COMMA ABOVE" )
 788  : ( "̔"    U+0314 Mn 0 "COMBINING REVERSED COMMA ABOVE" )
 789  : ( "̕"    U+0315 Mn 0 "COMBINING COMMA ABOVE RIGHT" )
 790  : ( "̖"    U+0316 Mn 0 "COMBINING GRAVE ACCENT BELOW" )
 791  : ( "̗"    U+0317 Mn 0 "COMBINING ACUTE ACCENT BELOW" )
 792  : ( "̘"    U+0318 Mn 0 "COMBINING LEFT TACK BELOW" )
 793  : ( "̙"    U+0319 Mn 0 "COMBINING RIGHT TACK BELOW" )
 794  : ( "̚"    U+031A Mn 0 "COMBINING LEFT ANGLE ABOVE" )
 795  : ( "̛"    U+031B Mn 0 "COMBINING HORN" )
 796  : ( "̜"    U+031C Mn 0 "COMBINING LEFT HALF RING BELOW" )
 797  : ( "̝"    U+031D Mn 0 "COMBINING UP TACK BELOW" )
 798  : ( "̞"    U+031E Mn 0 "COMBINING DOWN TACK BELOW" )
 799  : ( "̟"    U+031F Mn 0 "COMBINING PLUS SIGN BELOW" )
 800  : ( "̠"    U+0320 Mn 0 "COMBINING MINUS SIGN BELOW" )
 801  : ( "̡"    U+0321 Mn 0 "COMBINING PALATALIZED HOOK BELOW" )
 802  : ( "̢"    U+0322 Mn 0 "COMBINING RETROFLEX HOOK BELOW" )
 803  : ( "̣"    U+0323 Mn 0 "COMBINING DOT BELOW" )
 804  : ( "̤"    U+0324 Mn 0 "COMBINING DIAERESIS BELOW" )
 805  : ( "̥"    U+0325 Mn 0 "COMBINING RING BELOW" )
 806  : ( "̦"    U+0326 Mn 0 "COMBINING COMMA BELOW" )
 807  : ( "̧"    U+0327 Mn 0 "COMBINING CEDILLA" )
 808  : ( "̨"    U+0328 Mn 0 "COMBINING OGONEK" )
 809  : ( "̩"    U+0329 Mn 0 "COMBINING VERTICAL LINE BELOW" )
 810  : ( "̪"    U+032A Mn 0 "COMBINING BRIDGE BELOW" )
 811  : ( "̫"    U+032B Mn 0 "COMBINING INVERTED DOUBLE ARCH BELOW" )
 812  : ( "̬"    U+032C Mn 0 "COMBINING CARON BELOW" )
 813  : ( "̭"    U+032D Mn 0 "COMBINING CIRCUMFLEX ACCENT BELOW" )
 814  : ( "̮"    U+032E Mn 0 "COMBINING BREVE BELOW" )
 815  : ( "̯"    U+032F Mn 0 "COMBINING INVERTED BREVE BELOW" )
 816  : ( "̰"    U+0330 Mn 0 "COMBINING TILDE BELOW" )
 817  : ( "̱"    U+0331 Mn 0 "COMBINING MACRON BELOW" )
 818  : ( "̲"    U+0332 Mn 0 "COMBINING LOW LINE" )
 819  : ( "̳"    U+0333 Mn 0 "COMBINING DOUBLE LOW LINE" )
 820  : ( "̴"    U+0334 Mn 0 "COMBINING TILDE OVERLAY" )
 821  : ( "̵"    U+0335 Mn 0 "COMBINING SHORT STROKE OVERLAY" )
 822  : ( "̶"    U+0336 Mn 0 "COMBINING LONG STROKE OVERLAY" )
 823  : ( "̷"    U+0337 Mn 0 "COMBINING SHORT SOLIDUS OVERLAY" )
 824  : ( "̸"    U+0338 Mn 0 "COMBINING LONG SOLIDUS OVERLAY" )
 825  : ( "̹"    U+0339 Mn 0 "COMBINING RIGHT HALF RING BELOW" )
 826  : ( "̺"    U+033A Mn 0 "COMBINING INVERTED BRIDGE BELOW" )
 827  : ( "̻"    U+033B Mn 0 "COMBINING SQUARE BELOW" )
 828  : ( "̼"    U+033C Mn 0 "COMBINING SEAGULL BELOW" )
 829  : ( "̽"    U+033D Mn 0 "COMBINING X ABOVE" )
 830  : ( "̾"    U+033E Mn 0 "COMBINING VERTICAL TILDE" )
 831  : ( "̿"    U+033F Mn 0 "COMBINING DOUBLE OVERLINE" )
 832  : ( "̀"    U+0340 Mn 0 "COMBINING GRAVE TONE MARK" )
 833  : ( "́"    U+0341 Mn 0 "COMBINING ACUTE TONE MARK" )
 834  : ( "͂"    U+0342 Mn 0 "COMBINING GREEK PERISPOMENI" )
 835  : ( "̓"    U+0343 Mn 0 "COMBINING GREEK KORONIS" )
 836  : ( "̈́"    U+0344 Mn 0 "COMBINING GREEK DIALYTIKA TONOS" )
 837  : ( "ͅ"    U+0345 Mn 0 "COMBINING GREEK YPOGEGRAMMENI" )
 838  : ( "͆"    U+0346 Mn 0 "COMBINING BRIDGE ABOVE" )
 839  : ( "͇"    U+0347 Mn 0 "COMBINING EQUALS SIGN BELOW" )
 840  : ( "͈"    U+0348 Mn 0 "COMBINING DOUBLE VERTICAL LINE BELOW" )
 841  : ( "͉"    U+0349 Mn 0 "COMBINING LEFT ANGLE BELOW" )
 842  : ( "͊"    U+034A Mn 0 "COMBINING NOT TILDE ABOVE" )
 843  : ( "͋"    U+034B Mn 0 "COMBINING HOMOTHETIC ABOVE" )
 844  : ( "͌"    U+034C Mn 0 "COMBINING ALMOST EQUAL TO ABOVE" )
 845  : ( "͍"    U+034D Mn 0 "COMBINING LEFT RIGHT ARROW BELOW" )
 846  : ( "͎"    U+034E Mn 0 "COMBINING UPWARDS ARROW BELOW" )
 847  : ( "͏"    U+034F Mn 0 "COMBINING GRAPHEME JOINER", "CGJ" )
 848  : ( "͐"    U+0350 Mn 0 "COMBINING RIGHT ARROWHEAD ABOVE" )
 849  : ( "͑"    U+0351 Mn 0 "COMBINING LEFT HALF RING ABOVE" )
 850  : ( "͒"    U+0352 Mn 0 "COMBINING FERMATA" )
 851  : ( "͓"    U+0353 Mn 0 "COMBINING X BELOW" )
 852  : ( "͔"    U+0354 Mn 0 "COMBINING LEFT ARROWHEAD BELOW" )
 853  : ( "͕"    U+0355 Mn 0 "COMBINING RIGHT ARROWHEAD BELOW" )
 854  : ( "͖"    U+0356 Mn 0 "COMBINING RIGHT ARROWHEAD AND UP ARROWHEAD BELOW" )
 855  : ( "͗"    U+0357 Mn 0 "COMBINING RIGHT HALF RING ABOVE" )
 856  : ( "͘"    U+0358 Mn 0 "COMBINING DOT ABOVE RIGHT" )
 857  : ( "͙"    U+0359 Mn 0 "COMBINING ASTERISK BELOW" )
 858  : ( "͚"    U+035A Mn 0 "COMBINING DOUBLE RING BELOW" )
 859  : ( "͛"    U+035B Mn 0 "COMBINING ZIGZAG ABOVE" )
 860  : ( "͜"    U+035C Mn 0 "COMBINING DOUBLE BREVE BELOW" )
 861  : ( "͝"    U+035D Mn 0 "COMBINING DOUBLE BREVE" )
 862  : ( "͞"    U+035E Mn 0 "COMBINING DOUBLE MACRON" )
 863  : ( "͟"    U+035F Mn 0 "COMBINING DOUBLE MACRON BELOW" )
 864  : ( "͠"    U+0360 Mn 0 "COMBINING DOUBLE TILDE" )
 865  : ( "͡"    U+0361 Mn 0 "COMBINING DOUBLE INVERTED BREVE" )
 866  : ( "͢"    U+0362 Mn 0 "COMBINING DOUBLE RIGHTWARDS ARROW BELOW" )
 867  : ( "ͣ"    U+0363 Mn 0 "COMBINING LATIN SMALL LETTER A" )
 868  : ( "ͤ"    U+0364 Mn 0 "COMBINING LATIN SMALL LETTER E" )
 869  : ( "ͥ"    U+0365 Mn 0 "COMBINING LATIN SMALL LETTER I" )
 870  : ( "ͦ"    U+0366 Mn 0 "COMBINING LATIN SMALL LETTER O" )
 871  : ( "ͧ"    U+0367 Mn 0 "COMBINING LATIN SMALL LETTER U" )
 872  : ( "ͨ"    U+0368 Mn 0 "COMBINING LATIN SMALL LETTER C" )
 873  : ( "ͩ"    U+0369 Mn 0 "COMBINING LATIN SMALL LETTER D" )
 874  : ( "ͪ"    U+036A Mn 0 "COMBINING LATIN SMALL LETTER H" )
 875  : ( "ͫ"    U+036B Mn 0 "COMBINING LATIN SMALL LETTER M" )
 876  : ( "ͬ"    U+036C Mn 0 "COMBINING LATIN SMALL LETTER R" )
 877  : ( "ͭ"    U+036D Mn 0 "COMBINING LATIN SMALL LETTER T" )
 878  : ( "ͮ"    U+036E Mn 0 "COMBINING LATIN SMALL LETTER V" )
 879  : ( "ͯ"    U+036F Mn 0 "COMBINING LATIN SMALL LETTER X" )
 880  : ( "Ͱ"   U+0370 Lu 1 "GREEK CAPITAL LETTER HETA" )
 881  : ( "ͱ"   U+0371 Ll 1 "GREEK SMALL LETTER HETA" )
 882  : ( "Ͳ"   U+0372 Lu 1 "GREEK CAPITAL LETTER ARCHAIC SAMPI" )
 883  : ( "ͳ"   U+0373 Ll 1 "GREEK SMALL LETTER ARCHAIC SAMPI" )
 884  : ( "ʹ"   U+0374 Lm 1 "GREEK NUMERAL SIGN" )
 885  : ( "͵"   U+0375 Sk 1 "GREEK LOWER NUMERAL SIGN" )
 886  : ( "Ͷ"   U+0376 Lu 1 "GREEK CAPITAL LETTER PAMPHYLIAN DIGAMMA" )
 887  : ( "ͷ"   U+0377 Ll 1 "GREEK SMALL LETTER PAMPHYLIAN DIGAMMA" )
 890  : ( "ͺ"   U+037A Lm 1 "GREEK YPOGEGRAMMENI" )
 891  : ( "ͻ"   U+037B Ll 1 "GREEK SMALL REVERSED LUNATE SIGMA SYMBOL" )
 892  : ( "ͼ"   U+037C Ll 1 "GREEK SMALL DOTTED LUNATE SIGMA SYMBOL" )
 893  : ( "ͽ"   U+037D Ll 1 "GREEK SMALL REVERSED DOTTED LUNATE SIGMA SYMBOL" )
 894  : ( ";"   U+037E Po 1 "GREEK QUESTION MARK" )
 895  : ( "Ϳ"   U+037F Lu 1 "GREEK CAPITAL LETTER YOT" )
 900  : ( "΄"   U+0384 Sk 1 "GREEK TONOS" )
 901  : ( "΅"   U+0385 Sk 1 "GREEK DIALYTIKA TONOS" )
 902  : ( "Ά"   U+0386 Lu 1 "GREEK CAPITAL LETTER ALPHA WITH TONOS" )
 903  : ( "·"   U+0387 Po 1 "GREEK ANO TELEIA" )
 904  : ( "Έ"   U+0388 Lu 1 "GREEK CAPITAL LETTER EPSILON WITH TONOS" )
 905  : ( "Ή"   U+0389 Lu 1 "GREEK CAPITAL LETTER ETA WITH TONOS" )
 906  : ( "Ί"   U+038A Lu 1 "GREEK CAPITAL LETTER IOTA WITH TONOS" )
 908  : ( "Ό"   U+038C Lu 1 "GREEK CAPITAL LETTER OMICRON WITH TONOS" )
 910  : ( "Ύ"   U+038E Lu 1 "GREEK CAPITAL LETTER UPSILON WITH TONOS" )
 911  : ( "Ώ"   U+038F Lu 1 "GREEK CAPITAL LETTER OMEGA WITH TONOS" )
 912  : ( "ΐ"   U+0390 Ll 1 "GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS" )
 913  : ( "Α"   U+0391 Lu 1 "GREEK CAPITAL LETTER ALPHA" )
 914  : ( "Β"   U+0392 Lu 1 "GREEK CAPITAL LETTER BETA" )
 915  : ( "Γ"   U+0393 Lu 1 "GREEK CAPITAL LETTER GAMMA" )
 916  : ( "Δ"   U+0394 Lu 1 "GREEK CAPITAL LETTER DELTA" )
 917  : ( "Ε"   U+0395 Lu 1 "GREEK CAPITAL LETTER EPSILON" )
 918  : ( "Ζ"   U+0396 Lu 1 "GREEK CAPITAL LETTER ZETA" )
 919  : ( "Η"   U+0397 Lu 1 "GREEK CAPITAL LETTER ETA" )
 920  : ( "Θ"   U+0398 Lu 1 "GREEK CAPITAL LETTER THETA" )
 921  : ( "Ι"   U+0399 Lu 1 "GREEK CAPITAL LETTER IOTA" )
 922  : ( "Κ"   U+039A Lu 1 "GREEK CAPITAL LETTER KAPPA" )
 923  : ( "Λ"   U+039B Lu 1 "GREEK CAPITAL LETTER LAMDA" )
 924  : ( "Μ"   U+039C Lu 1 "GREEK CAPITAL LETTER MU" )
 925  : ( "Ν"   U+039D Lu 1 "GREEK CAPITAL LETTER NU" )
 926  : ( "Ξ"   U+039E Lu 1 "GREEK CAPITAL LETTER XI" )
 927  : ( "Ο"   U+039F Lu 1 "GREEK CAPITAL LETTER OMICRON" )
 928  : ( "Π"   U+03A0 Lu 1 "GREEK CAPITAL LETTER PI" )
 929  : ( "Ρ"   U+03A1 Lu 1 "GREEK CAPITAL LETTER RHO" )
 931  : ( "Σ"   U+03A3 Lu 1 "GREEK CAPITAL LETTER SIGMA" )
 932  : ( "Τ"   U+03A4 Lu 1 "GREEK CAPITAL LETTER TAU" )
 933  : ( "Υ"   U+03A5 Lu 1 "GREEK CAPITAL LETTER UPSILON" )
 934  : ( "Φ"   U+03A6 Lu 1 "GREEK CAPITAL LETTER PHI" )
 935  : ( "Χ"   U+03A7 Lu 1 "GREEK CAPITAL LETTER CHI" )
 936  : ( "Ψ"   U+03A8 Lu 1 "GREEK CAPITAL LETTER PSI" )
 937  : ( "Ω"   U+03A9 Lu 1 "GREEK CAPITAL LETTER OMEGA" )
 938  : ( "Ϊ"   U+03AA Lu 1 "GREEK CAPITAL LETTER IOTA WITH DIALYTIKA" )
 939  : ( "Ϋ"   U+03AB Lu 1 "GREEK CAPITAL LETTER UPSILON WITH DIALYTIKA" )
 940  : ( "ά"   U+03AC Ll 1 "GREEK SMALL LETTER ALPHA WITH TONOS" )
 941  : ( "έ"   U+03AD Ll 1 "GREEK SMALL LETTER EPSILON WITH TONOS" )
 942  : ( "ή"   U+03AE Ll 1 "GREEK SMALL LETTER ETA WITH TONOS" )
 943  : ( "ί"   U+03AF Ll 1 "GREEK SMALL LETTER IOTA WITH TONOS" )
 944  : ( "ΰ"   U+03B0 Ll 1 "GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS" )
 945  : ( "α"   U+03B1 Ll 1 "GREEK SMALL LETTER ALPHA" )
 946  : ( "β"   U+03B2 Ll 1 "GREEK SMALL LETTER BETA" )
 947  : ( "γ"   U+03B3 Ll 1 "GREEK SMALL LETTER GAMMA" )
 948  : ( "δ"   U+03B4 Ll 1 "GREEK SMALL LETTER DELTA" )
 949  : ( "ε"   U+03B5 Ll 1 "GREEK SMALL LETTER EPSILON" )
 950  : ( "ζ"   U+03B6 Ll 1 "GREEK SMALL LETTER ZETA" )
 951  : ( "η"   U+03B7 Ll 1 "GREEK SMALL LETTER ETA" )
 952  : ( "θ"   U+03B8 Ll 1 "GREEK SMALL LETTER THETA" )
 953  : ( "ι"   U+03B9 Ll 1 "GREEK SMALL LETTER IOTA" )
 954  : ( "κ"   U+03BA Ll 1 "GREEK SMALL LETTER KAPPA" )
 955  : ( "λ"   U+03BB Ll 1 "GREEK SMALL LETTER LAMDA" )
 956  : ( "μ"   U+03BC Ll 1 "GREEK SMALL LETTER MU" )
 957  : ( "ν"   U+03BD Ll 1 "GREEK SMALL LETTER NU" )
 958  : ( "ξ"   U+03BE Ll 1 "GREEK SMALL LETTER XI" )
 959  : ( "ο"   U+03BF Ll 1 "GREEK SMALL LETTER OMICRON" )
 960  : ( "π"   U+03C0 Ll 1 "GREEK SMALL LETTER PI" )
 961  : ( "ρ"   U+03C1 Ll 1 "GREEK SMALL LETTER RHO" )
 962  : ( "ς"   U+03C2 Ll 1 "GREEK SMALL LETTER FINAL SIGMA" )
 963  : ( "σ"   U+03C3 Ll 1 "GREEK SMALL LETTER SIGMA" )
 964  : ( "τ"   U+03C4 Ll 1 "GREEK SMALL LETTER TAU" )
 965  : ( "υ"   U+03C5 Ll 1 "GREEK SMALL LETTER UPSILON" )
 966  : ( "φ"   U+03C6 Ll 1 "GREEK SMALL LETTER PHI" )
 967  : ( "χ"   U+03C7 Ll 1 "GREEK SMALL LETTER CHI" )
 968  : ( "ψ"   U+03C8 Ll 1 "GREEK SMALL LETTER PSI" )
 969  : ( "ω"   U+03C9 Ll 1 "GREEK SMALL LETTER OMEGA" )
 970  : ( "ϊ"   U+03CA Ll 1 "GREEK SMALL LETTER IOTA WITH DIALYTIKA" )
 971  : ( "ϋ"   U+03CB Ll 1 "GREEK SMALL LETTER UPSILON WITH DIALYTIKA" )
 972  : ( "ό"   U+03CC Ll 1 "GREEK SMALL LETTER OMICRON WITH TONOS" )
 973  : ( "ύ"   U+03CD Ll 1 "GREEK SMALL LETTER UPSILON WITH TONOS" )
 974  : ( "ώ"   U+03CE Ll 1 "GREEK SMALL LETTER OMEGA WITH TONOS" )
 975  : ( "Ϗ"   U+03CF Lu 1 "GREEK CAPITAL KAI SYMBOL" )
 976  : ( "ϐ"   U+03D0 Ll 1 "GREEK BETA SYMBOL" )
 977  : ( "ϑ"   U+03D1 Ll 1 "GREEK THETA SYMBOL" )
 978  : ( "ϒ"   U+03D2 Lu 1 "GREEK UPSILON WITH HOOK SYMBOL" )
 979  : ( "ϓ"   U+03D3 Lu 1 "GREEK UPSILON WITH ACUTE AND HOOK SYMBOL" )
 980  : ( "ϔ"   U+03D4 Lu 1 "GREEK UPSILON WITH DIAERESIS AND HOOK SYMBOL" )
 981  : ( "ϕ"   U+03D5 Ll 1 "GREEK PHI SYMBOL" )
 982  : ( "ϖ"   U+03D6 Ll 1 "GREEK PI SYMBOL" )
 983  : ( "ϗ"   U+03D7 Ll 1 "GREEK KAI SYMBOL" )
 984  : ( "Ϙ"   U+03D8 Lu 1 "GREEK LETTER ARCHAIC KOPPA" )
 985  : ( "ϙ"   U+03D9 Ll 1 "GREEK SMALL LETTER ARCHAIC KOPPA" )
 986  : ( "Ϛ"   U+03DA Lu 1 "GREEK LETTER STIGMA" )
 987  : ( "ϛ"   U+03DB Ll 1 "GREEK SMALL LETTER STIGMA" )
 988  : ( "Ϝ"   U+03DC Lu 1 "GREEK LETTER DIGAMMA" )
 989  : ( "ϝ"   U+03DD Ll 1 "GREEK SMALL LETTER DIGAMMA" )
 990  : ( "Ϟ"   U+03DE Lu 1 "GREEK LETTER KOPPA" )
 991  : ( "ϟ"   U+03DF Ll 1 "GREEK SMALL LETTER KOPPA" )
 992  : ( "Ϡ"   U+03E0 Lu 1 "GREEK LETTER SAMPI" )
 993  : ( "ϡ"   U+03E1 Ll 1 "GREEK SMALL LETTER SAMPI" )
 994  : ( "Ϣ"   U+03E2 Lu 1 "COPTIC CAPITAL LETTER SHEI" )
 995  : ( "ϣ"   U+03E3 Ll 1 "COPTIC SMALL LETTER SHEI" )
 996  : ( "Ϥ"   U+03E4 Lu 1 "COPTIC CAPITAL LETTER FEI" )
 997  : ( "ϥ"   U+03E5 Ll 1 "COPTIC SMALL LETTER FEI" )
 998  : ( "Ϧ"   U+03E6 Lu 1 "COPTIC CAPITAL LETTER KHEI" )
 999  : ( "ϧ"   U+03E7 Ll 1 "COPTIC SMALL LETTER KHEI" )
 1000 : ( "Ϩ"   U+03E8 Lu 1 "COPTIC CAPITAL LETTER HORI" )
 1001 : ( "ϩ"   U+03E9 Ll 1 "COPTIC SMALL LETTER HORI" )
 1002 : ( "Ϫ"   U+03EA Lu 1 "COPTIC CAPITAL LETTER GANGIA" )
 1003 : ( "ϫ"   U+03EB Ll 1 "COPTIC SMALL LETTER GANGIA" )
 1004 : ( "Ϭ"   U+03EC Lu 1 "COPTIC CAPITAL LETTER SHIMA" )
 1005 : ( "ϭ"   U+03ED Ll 1 "COPTIC SMALL LETTER SHIMA" )
 1006 : ( "Ϯ"   U+03EE Lu 1 "COPTIC CAPITAL LETTER DEI" )
 1007 : ( "ϯ"   U+03EF Ll 1 "COPTIC SMALL LETTER DEI" )
 1008 : ( "ϰ"   U+03F0 Ll 1 "GREEK KAPPA SYMBOL" )
...

-- The last 10 characters
ooRexx> .unicode~characters~pipe(.take "last" 10 | .console)
917990 : ( "󠇦"    U+E01E6 Mn 0 "VARIATION SELECTOR-247", "VS247" )
917991 : ( "󠇧"    U+E01E7 Mn 0 "VARIATION SELECTOR-248", "VS248" )
917992 : ( "󠇨"    U+E01E8 Mn 0 "VARIATION SELECTOR-249", "VS249" )
917993 : ( "󠇩"    U+E01E9 Mn 0 "VARIATION SELECTOR-250", "VS250" )
917994 : ( "󠇪"    U+E01EA Mn 0 "VARIATION SELECTOR-251", "VS251" )
917995 : ( "󠇫"    U+E01EB Mn 0 "VARIATION SELECTOR-252", "VS252" )
917996 : ( "󠇬"    U+E01EC Mn 0 "VARIATION SELECTOR-253", "VS253" )
917997 : ( "󠇭"    U+E01ED Mn 0 "VARIATION SELECTOR-254", "VS254" )
917998 : ( "󠇮"    U+E01EE Mn 0 "VARIATION SELECTOR-255", "VS255" )
917999 : ( "󠇯"    U+E01EF Mn 0 "VARIATION SELECTOR-256", "VS256" )

-- get a character by codepoint
ooRexx> .unicode~character(8203)=                   -- (U+200B Cf "ZERO WIDTH SPACE")
( "​"    U+200B Cf 0 "ZERO WIDTH SPACE", "ZWSP" )
ooRexx> .unicode~character("U+200B")=               -- (U+200B Cf "ZERO WIDTH SPACE")
( "​"    U+200B Cf 0 "ZERO WIDTH SPACE", "ZWSP" )
ooRexx> .unicode~character("u+200b")=               -- (U+200B Cf "ZERO WIDTH SPACE")
( "​"    U+200B Cf 0 "ZERO WIDTH SPACE", "ZWSP" )

-- get a character by name.
-- loose matching name. See https://unicode.org/reports/tr44/#UAX44-LM2
ooRexx> .unicode~character("ZERO WIDTH SPACE")=     -- (U+200B Cf "ZERO WIDTH SPACE")
( "​"    U+200B Cf 0 "ZERO WIDTH SPACE", "ZWSP" )
ooRexx> .unicode~character("ZERO_WIDTH-SPACE")=     -- (U+200B Cf "ZERO WIDTH SPACE")
( "​"    U+200B Cf 0 "ZERO WIDTH SPACE", "ZWSP" )
ooRexx> .unicode~character("ZEROWIDTHSPACE")=       -- (U+200B Cf "ZERO WIDTH SPACE")
( "​"    U+200B Cf 0 "ZERO WIDTH SPACE", "ZWSP" )
ooRexx> .unicode~character("zerowidthspace")=       -- (U+200B Cf "ZERO WIDTH SPACE")
( "​"    U+200B Cf 0 "ZERO WIDTH SPACE", "ZWSP" )

-- select characters using a matcher
-- remember: it's better to initialize the matcher outside the iteration.
ooRexx> matcher = "*chris*"~matcher; .unicode~characters~select{expose matcher; matcher~(item~name)}==
an Array (shape [3], 3 items)
 1 : ( "🎄"  U+1F384 So 2 "CHRISTMAS TREE" )
 2 : ( "🎅"  U+1F385 So 2 "FATHER CHRISTMAS" )
 3 : ( "🤶"  U+1F936 So 2 "MOTHER CHRISTMAS" )

-- string character names
ooRexx> "noël👩‍👨‍👩‍👧🎅"~text~codepoints~each{uchar = .unicode~character(item); uchar~charWidth uchar~categoryName uchar~name}==
an Array (shape [12], 12 items)
 1  : '1 Ll LATIN SMALL LETTER N'
 2  : '1 Ll LATIN SMALL LETTER O'
 3  : '1 Ll LATIN SMALL LETTER E WITH DIAERESIS'
 4  : '1 Ll LATIN SMALL LETTER L'
 5  : '2 So WOMAN'
 6  : '0 Cf ZERO WIDTH JOINER'
 7  : '2 So MAN'
 8  : '0 Cf ZERO WIDTH JOINER'
 9  : '2 So WOMAN'
 10 : '0 Cf ZERO WIDTH JOINER'
 11 : '2 So GIRL'
 12 : '2 So FATHER CHRISTMAS'

-- shortest name:
ooRexx> .unicode~characters~reduce{if accu~name~length > item~name~length, item~name~length <> 0 then item }=
( ""    U+0000 Cc 0 "", "NULL", "NUL" )

-- longest name:
ooRexx> .unicode~characters~reduce{if accu~name~length < item~name~length then item }=
( "🮨"   U+1FBA8 So 1 "BOX DRAWINGS LIGHT DIAGONAL UPPER CENTRE TO MIDDLE LEFT AND MIDDLE RIGHT TO LOWER CENTRE" )


-- ===============================================================================
-- 2021 September 12

/*
[String chunks]

The functionality of splitting text by quoted/unquoted chunks is moved from
ooRexxShell to a dedicated package:
extension/stringChunk.cls               (compatible with official ooRexx)

The initial need was to parse a command line and split it the same way as a cmd
or bash shell. Also used to parse the queries in ooRexxShell.
The quotes are removed, but each character is associated to a 'quote flag' to
remember if the character was inside a quoted section.
These flags are typically used by the matchers of type string pattern, to decide
if a character can be special or not.

Description:
    routine stringChunks
    use strict arg string, withInfos=.false, breakTokens="", splitLevel=1

    Converts a string to an array of String or to an array of stringChunk.
    The type of result is indicated by the argument withInfos:
    - If withInfos == .false (default) then the result is an array of String.
    - If withInfos == .true then the result is an array of StringChunk.

    A StringChunk is a substring which references the start and end character
    in its container. It's associated to a string of booleans (quotedFlags)
    which indicate for each character if it was inside a quoted section.

    A quote is either " or '.

    An unquoted section is split into StringChunks delimited by whitespace
    (anything <= 32) and break tokens.

    A quoted section is not split:
    - whitespaces are kept,
    - single occurrences of quotes are removed,
    - double occurrences of quotes are replaced by a single embedded quote,
    - break tokens and escape characters are ignored.

    An escape character is any character passed in the argument escapeCharacters.
    An escape character sets the quote flag of the next character to 1.
    Escape characters are removed, even if they are not followed by another
    character (truncated string).
    Example with 'a' declared escape character:
    - "a" --> ""
    - "aa" --> "a"
    - "aaa" --> "a"
    - "aaaa" --> "aa"

    If a quote is declared escape character, there is no impact: a quote is
    already an escape mechanism.

    If a space is declared escape character, there is an impact when splitLevel=0:
    the quote flag of a character following an unquoted space is set to 1, and the
    unquoted spaces are removed.
    Example:
        'one two "three four" five six' --> onetwothree fourfivesix
                                            00010011111111111000100

    Break tokens are passed in the argument breakTokens.
    A break token cannot contain spaces.
    The break tokens can be case sensitive (default) or case insensitive.
    Each break token can be prefixed by:
    - cs:  case sensitive
    - ci:  case insensitive
    - cl:  caseless (synonym of case insensitive)
    Any other prefix is not an error. It's just not a case prefix.

    If a quote is declared break token then it's no longer recognized as a quote.
    If an escape character is declared break token then it's no longer recognized
    as an escape character.

    The split process is controlled by the argument splitLevel:
    - If splitLevel == 0 then the string is not split but the quotes and
      escape characters are managed, quotedFlags is set.
        'xx aa"b b"cc"d d"ee yy' is 1 StringChunk.
    - If splitLevel == 1 (default) then adjacent quoted/unquoted sections are kept glued.
        'xx aa"b b"cc"d d"ee yy' is 3 StringChunk: xx "aab bccd dee" yy
    - If splitLevel == 2 then adjacent quoted/unquoted sections are separated.
        'xx aa"b b"cc"d d"ee yy' is split into 7 StringChunk: xx aa "b b" cc "d d" ee yy

    Illustration with splitLevel=1:
     11111111111111111111111111 222222222222222 333333333333333333333
    '"hello "John" how are you" good" bye "John "my name is ""BOND"""'
     0000000001111111111222222222233333333334444444444555555555566666
     1234567890123456789012345678901234567890123456789012345678901234
    arg1 = |hello John how are you|      containerStart = 01      containerEnd = 26      quotedFlags = 1111110000111111111111
    arg2 = |good bye John|               containerStart = 28      containerEnd = 42      quotedFlags = 0000111110000
    arg3 = |my name is "BOND"|           containerStart = 44      containerEnd = 64      quotedFlags = 11111111111111111

Extensions available in Executor only:
    .String~chunk           withInfos is true, splitLevel is 0 --> always returns ONE StringChunk
    .String~chunks          withInfos is true by default, splitLevel is 1 by default

Examples:
*/
ooRexx>     -- splitLevel = 0: no split
ooRexx>     'aa"b\ b"cc"d\ d"ee\* ff'~chunks(splitLevel:0)~each{item~sayDescription(25, index, 2)}
1  |aab\ bccd\ dee\* ff|       01 23 |aa"b\ b"cc"d\ d"ee\* ff|  
1  |0011110011110000000|      
/*
        1  |aab\ bccd\ dee\* ff|       01 23 |aa"b\ b"cc"d\ d"ee\* ff|
        1  |0011110011110000000|
*/

ooRexx>     -- splitLevel = 1: Adjacent quoted/unquoted sections are kept glued
ooRexx>     'aa"b\ b"cc"d\ d"ee\* ff'~chunks(splitLevel:1)~each{item~sayDescription(25, index, 2)}
1  |aab\ bccd\ dee\*|          01 20 |aa"b\ b"cc"d\ d"ee\*|     
1  |0011110011110000|         
2  |ff|                        22 23 |ff|                       
2  |00|                       
/*
        1  |aab\ bccd\ dee\*|          01 20 |aa"b\ b"cc"d\ d"ee\*|
        1  |0011110011110000|
        2  |ff|                        22 23 |ff|
        2  |00|
*/

ooRexx>     -- splitLevel = 2: Adjacent quoted/unquoted sections are separated
ooRexx>     'aa"b\ b"cc"d\ d"ee\* ff'~chunks(splitLevel:2)~each{item~sayDescription(25, index, 2)}
1  |aa|                        01 02 |aa|                       
1  |00|                       
2  |b\ b|                      03 08 |"b\ b"|                   
2  |1111|                     
3  |cc|                        09 10 |cc|                       
3  |00|                       
4  |d\ d|                      11 16 |"d\ d"|                   
4  |1111|                     
5  |ee\*|                      17 20 |ee\*|                     
5  |0000|                     
6  |ff|                        22 23 |ff|                       
6  |00|                       
/*
        1  |aa|                        01 02 |aa|
        1  |00|
        2  |b\ b|                      03 08 |"b\ b"|
        2  |1111|
        3  |cc|                        09 10 |cc|
        3  |00|
        4  |d\ d|                      11 16 |"d\ d"|
        4  |1111|
        5  |ee\*|                      17 20 |ee\*|
        5  |0000|
        6  |ff|                        22 23 |ff|
        6  |00|
*/

ooRexx>     -- Default splitLevel (1)
ooRexx>     -- The quote is declared break token, there are no more quoted sections, and the quote itself is returned
ooRexx>     'aa"b\ b"cc"d\ d"ee\* ff'~chunks(breakTokens: '"')~each{item~sayDescription(25, index, 2)}
1  |aa|                        01 02 |aa|                       
1  |00|                       
2  |"|                         03 03 |"|                        
2  |0|                        
3  |b\|                        04 05 |b\|                       
3  |00|                       
4  |b|                         07 07 |b|                        
4  |0|                        
5  |"|                         08 08 |"|                        
5  |0|                        
6  |cc|                        09 10 |cc|                       
6  |00|                       
7  |"|                         11 11 |"|                        
7  |0|                        
8  |d\|                        12 13 |d\|                       
8  |00|                       
9  |d|                         15 15 |d|                        
9  |0|                        
10 |"|                         16 16 |"|                        
10 |0|                        
11 |ee\*|                      17 20 |ee\*|                     
11 |0000|                     
12 |ff|                        22 23 |ff|                       
12 |00|                       
/*
        1  |aa|                        01 02 |aa|
        1  |00|
        2  |"|                         03 03 |"|
        2  |0|
        3  |b\|                        04 05 |b\|
        3  |00|
        4  |b|                         07 07 |b|
        4  |0|
        5  |"|                         08 08 |"|
        5  |0|
        6  |cc|                        09 10 |cc|
        6  |00|
        7  |"|                         11 11 |"|
        7  |0|
        8  |d\|                        12 13 |d\|
        8  |00|
        9  |d|                         15 15 |d|
        9  |0|
        10 |"|                         16 16 |"|
        10 |0|
        11 |ee\*|                      17 20 |ee\*|
        11 |0000|
        12 |ff|                        22 23 |ff|
        12 |00|
*/

ooRexx>     -- Same as previous, plus \ which is declared escape character
ooRexx>     'aa"b\ b"cc"d\ d"ee\* ff'~chunks(breakTokens: '"', escapeCharacters:"\")~each{item~sayDescription(25, index, 2)}
1  |aa|                        01 02 |aa|                       
1  |00|                       
2  |"|                         03 03 |"|                        
2  |0|                        
3  |b b|                       04 07 |b\ b|                     
3  |010|                      
4  |"|                         08 08 |"|                        
4  |0|                        
5  |cc|                        09 10 |cc|                       
5  |00|                       
6  |"|                         11 11 |"|                        
6  |0|                        
7  |d d|                       12 15 |d\ d|                     
7  |010|                      
8  |"|                         16 16 |"|                        
8  |0|                        
9  |ee*|                       17 20 |ee\*|                     
9  |001|                      
10 |ff|                        22 23 |ff|                       
10 |00|                       
/*
        1  |aa|                        01 02 |aa|
        1  |00|
        2  |"|                         03 03 |"|
        2  |0|
        3  |b b|                       04 07 |b\ b|
        3  |010|
        4  |"|                         08 08 |"|
        4  |0|
        5  |cc|                        09 10 |cc|
        5  |00|
        6  |"|                         11 11 |"|
        6  |0|
        7  |d d|                       12 15 |d\ d|
        7  |010|
        8  |"|                         16 16 |"|
        8  |0|
        9  |ee*|                       17 20 |ee\*|
        9  |001|
        10 |ff|                        22 23 |ff|
        10 |00|
*/

ooRexx>     -- A break token can be made of several characters, and can contain a quote
ooRexx>     'aa"b\ b"cc"d\ d"ee\* ff'~chunks(breakTokens: ' a"b ')~each{item~sayDescription(25, index, 2)}
1  |a|                         01 01 |a|                        
1  |0|                        
2  |a"b|                       02 04 |a"b|                      
2  |000|                      
3  |\|                         05 05 |\|                        
3  |0|                        
4  |bccd\|                     07 13 |b"cc"d\|                  
4  |01100|                    
5  |dee\* ff|                  15 23 |d"ee\* ff|                
5  |01111111|                 
/*
        1  |a|                         01 01 |a|
        1  |0|
        2  |a"b|                       02 04 |a"b|
        2  |000|
        3  |\|                         05 05 |\|
        3  |0|
        4  |bccd\|                     07 13 |b"cc"d\|
        4  |01100|
        5  |dee\* ff|                  15 23 |d"ee\* ff|
        5  |01111111|
*/

ooRexx>     -- If an escape character is also declared break token then it's no longer an escape character
ooRexx>     'aa"b\ b"cc"d\ d"ee\* ff'~chunks(breakTokens:"\", escapeCharacters:"\")~each{item~sayDescription(25, index, 2)}==
1  |aab\ bccd\ dee|            01 18 |aa"b\ b"cc"d\ d"ee|       
1  |00111100111100|           
2  |\|                         19 19 |\|                        
2  |0|                        
3  |*|                         20 20 |*|                        
3  |0|                        
4  |ff|                        22 23 |ff|                       
4  |00|                       
an Array (no shape, 0 items)
/*
        1  |aab\ bccd\ dee|            01 18 |aa"b\ b"cc"d\ d"ee|
        1  |00111100111100|
        2  |\|                         19 19 |\|
        2  |0|
        3  |*|                         20 20 |*|
        3  |0|
        4  |ff|                        22 23 |ff|
        4  |00|
*/

ooRexx>     -- A break token can contain characters that are declared escape character
ooRexx>     'aa"b\ b"cc"d\ d"ee\* ff'~chunks(breakTokens:"e\*", escapeCharacters:"\*")~each{item~sayDescription(25, index, 2)}==
1  |aab\ bccd\ de|             01 17 |aa"b\ b"cc"d\ d"e|        
1  |0011110011110|            
2  |e\*|                       18 20 |e\*|                      
2  |000|                      
3  |ff|                        22 23 |ff|                       
3  |00|                       
an Array (no shape, 0 items)
/*
        1  |aab\ bccd\ de|             01 17 |aa"b\ b"cc"d\ d"e|
        1  |0011110011110|
        2  |e\*|                       18 20 |e\*|
        2  |000|
        3  |ff|                        22 23 |ff|
        3  |00|
*/

ooRexx>     -- A break token can be case insensitive (prefix ci: or cl:)
ooRexx>     '1Plus2'~chunks(breakTokens:"ci:plus")~each{item~sayDescription(25, index, 2)}
1  |1|                         1 1 |1|                        
1  |0|                        
2  |Plus|                      2 5 |Plus|                     
2  |0000|                     
3  |2|                         6 6 |2|                        
3  |0|                        
/*
        1  |1|                         1 1 |1|
        1  |0|
        2  |Plus|                      2 5 |Plus|
        2  |0000|
        3  |2|                         6 6 |2|
        3  |0|
*/

/*
[String patterns]

The functionality of selecting text using patterns is moved from ooRexxShell
to a dedicated package:
extension/stringChunkExtended.cls       (not compatible with official ooRexx)

Description
    .StringChunk~matcher
    use strict named arg wholeString(1)=.true, caseless(1)=.true,-
                         trace(1)=.false, displayer(1)=.traceOutput, prefix(1)=""

    Pattern matching by equality (whole) or by inclusion (not whole), caseless or not.

    If the package regex.cls is loaded, then the pattern (a StringChunk) can be
    a regular expression prefixed by "/".

    When whole, and the pattern is not a regular expression, then the character
    "*" is recognized as a generic character when first or last character.

    When not whole, and the pattern is not a regular expression, then the character
    "^" is recognized as the metacharacter 'beginning of string' when first character.

    When not whole, and the pattern is not a regular expression, then the character
    "$" is recognized as the metacharacter 'end of string' when last character.

    The returned result is a closure (matcher) which implements the pattern matching,
    or .nil if error.

    The pattern matching is tested when the closure is evaluated with a string passed
    as argument.

    Examples:

        '*' or '**'      : matches everything
        '"*"' or '"**"'  : matches exactly "*" or "**", see case stringPattern
        '***'            : matches all names containing "*", see case *stringPattern*
        '*"*"*'          : matches all names containing "*", see case *stringPattern*
        '*"**"*'         : matches all names containing "**", see case *stringPattern*
        '*stringPattern' : string~right(stringPattern~length)~caselessEquals(stringPattern)
        'stringPattern*' : string~left(stringPattern~length)~caselessEquals(stringPattern)
        '*stringPattern*': string~caselessPos(stringPattern) <> 0
        'stringPattern'  : string~caselessEquals(stringPattern)
*/

ooRexx>         -- caseless equality
ooRexx>         matcher = "object"~matcher
ooRexx>         say matcher~("ObjeCt") -- true
1
ooRexx>         say matcher~("my ObjeCt") -- false
0

ooRexx>         -- caseless equality with generic character
ooRexx>         matcher = "*object"~matcher
ooRexx>         say matcher~("ObjeCt") -- true
1
ooRexx>         say matcher~("my ObjeCt") -- true
1

ooRexx>         -- caseless inclusion
ooRexx>         matcher = "object"~matcher(wholeString:.false)
ooRexx>         say matcher~("ObjeCt") -- true
1
ooRexx>         say matcher~("my ObjeCt") -- true
1

ooRexx>         -- caseless inclusion, regular expression: "object" at the beginning or at the end.
ooRexx>         matcher = "/^object|object$"~matcher(wholeString:.false)
ooRexx>         say matcher~("ObjeCt") -- true
1
ooRexx>         say matcher~("my ObjeCt") -- true
1
ooRexx>         say matcher~("my ObjeCts") -- false
0

ooRexx>         -- trace
ooRexx>         "*stringPattern"~matcher(trace:.true)
description: stringChunkPattern="*stringPattern" wholeString=1 caseless=1
stringPattern="stringPattern"
matcher: expose description stringPattern; use strict arg string; return string~right(stringPattern~length)~caselessEquals(stringPattern)
/*
        output:
            description: stringChunkPattern="*stringPattern" wholeString=1 caseless=1
            stringPattern="stringPattern"
            matcher: expose description stringPattern; use strict arg string; return string~right(stringPattern~length)~caselessEquals(stringPattern)
*/

ooRexx>         -- trace when regular expression
ooRexx>         "/.*stringPattern"~matcher(trace:.true)
description: stringChunkPattern="/.*stringPattern" wholeString=1 caseless=1
stringPattern=".*stringPattern"
pattern = .Pattern~compile(stringPattern, .RegexCompiler~new(.RegexCompiler~caseless))
matcher: expose description pattern; use strict arg string; return pattern~matches(string)
/*
        output:
            description: stringChunkPattern="/.*stringPattern" wholeString=1 caseless=1
            stringPattern=".*stringPattern"
            pattern = .Pattern~compile(stringPattern, .RegexCompiler~new(.RegexCompiler~caseless))
            matcher: expose description pattern; use strict arg string; return pattern~matches(string)
*/

-- ===============================================================================
-- 2021 August 11

/*
Added support for strings of codepoints encoded as native integers.
3 representations:
    Unicode8_Encoding
    Unicode16_Encoding
    Unicode32_Encoding.
The method ~unicode returns one of these encodings, depending on the character
with the largest Unicode codepoint (1, 2, or 4 bytes) in the source string.
Unlike the flexible representation of Python, the 3 representations are first-class.
No BOM, the endianness is the CPU one. This is for internal use only.
Unicode32_Encoding can be used with utf8proc for the functions taking a buffer of 32-bit integers.
*/

ooRexx> "côté"~text("unicode8")=    -- T'côté' Just an interpretative layer put above the string
T'côté'
ooRexx> "côté"~text("unicode8")~pipe{item~description(short:1) ":" item~c2x}=
'Unicode8 not-ASCII : 63 C3 B4 74 C3 A9'
--    'Unicode8 not-ASCII : 63 C3 B4 74 C3 A9'

ooRexx> "côté"~text~unicode=        -- T'c?t?' UTF-8 converted to Unicode8
T'c�t�'
ooRexx> "côté"~text~unicode~pipe{item~description(short:1) ":" item~c2x}=
'Unicode8 not-ASCII : 63 F4 74 E9'
--    'Unicode8 not-ASCII : 63 F4 74 E9'

ooRexx> "noël‍👨‍👩‍👧"~text~maximumCodepoint~pipe{"U+"item~d2x}=   -- U+1F469 is the maximum codepoint
'U+1F469'
ooRexx> "noël‍👨‍👩‍👧"~text~unicode~description(technical:1)=      -- For this maximum codepoint, we need Unicode32
'Unicode32 (5 characters (1 index from index 5), 10 codepoints (0 index), 40 bytes, 0 error)'
--    'Unicode32 not-ASCII (5 graphemes (1 index from index 5), 10 codepoints (0 index), 40 bytes, 0 error)'

-- The endianness of the UnicodeXX_Encoding is the one of the machine.
-- With an Intel CPU, it's little-endian.
ooRexx> "noël‍👨‍👩‍👧"~text~unicode~c2x=
'6E000000 6F000000 EB000000 6C000000 0D200000 68F40100 0D200000 69F40100 0D200000 67F40100'
--    '6E000000 6F000000 EB000000 6C000000 0D200000 68F40100 0D200000 69F40100 0D200000 67F40100'

-- The default endianness for UTF32 is big-endian.
ooRexx> "noël‍👨‍👩‍👧"~text~utf32~c2x=
'0000006E 0000006F 000000EB 0000006C 0000200D 0001F468 0000200D 0001F469 0000200D 0001F467'
--    '0000006E 0000006F 000000EB 0000006C 0000200D 0001F468 0000200D 0001F469 0000200D 0001F467'


-- ===============================================================================
-- 2021 may 31

/*
Encoded strings.
The ooRexx programmer has the choice:
- working with String at byte level
- working with RexxText at grapheme level.
- the same instance of String is used in both cases.

    aString
     ▲  text --------> aRexxText
     │                     indexer (anEncoding)
     │                          codepoints (sequential access)
     │                          graphemes  (direct access)
     +-----------------------<- string
*/

-- First binding of utf8proc, for the detection of grapheme cluster break.
ooRexx> "( ͡° ͜ʖ ͡°)"~text~description=                    -- 'UTF-8 not-ASCII ( 9 graphemes, 12 codepoints, 20 bytes )'
'UTF-8 not-ASCII (9 characters, 12 codepoints, 20 bytes, 0 error)'
ooRexx> "( ͡° ͜ʖ ͡°)"~text~graphemes~each{item~c2x}=       -- [ 28,'20CDA1','C2B0','20CD9C','CA96','EFBBBF','20CDA1','C2B0', 29]
[ 28,'20 CDA1','C2B0','20 CD9C','CA96','EFBBBF','20 CDA1','C2B0', 29]

-- Classes in relation with Unicode and encoded strings:
ooRexx> ?c *encoding* *encoded* *indexer* *codepoint* *grapheme* *RexxText* *Unicode*
P.         'Byte_Encoding'                    : (byte_encoding.cls)
P.         'CodePointSupplier'                : (stringIndexer.cls)
.M         'EncodedMutableBuffer'             : (text.cls)
.M         'EncodedPackage'                   : (text.cls)
.M         'EncodedRexxBlock'                 : (text.cls)
.M         'EncodedString'                    : (text.cls)
P.         'Encoding'                         : (encoding.cls)
P.         'IBM1252_Encoding'                 : (ibm-1252_encoding.cls)
P.         'IBM437_Encoding'                  : (ibm-437_encoding.cls)
P.         'ISO88591_Encoding'                : (iso-8859-1_encoding.cls)
PM         'IndexerHelpers'                   : (stringInterface.cls)
PM         'IndexerStringInterface'           : (stringInterface.cls)
P.         'RexxText'                         : (REXX)
.M         'RexxTextContents'                 : (text.cls)
.M         'RexxTextInitializer'              : (text.cls)
PM         'RexxTextMapper'                   : (functionals.cls)
.M         'RexxTextOperators'                : (text.cls)
.M         'RexxTextPrettyPrinter'            : (notrace.cls)
.M         'RexxTextStringInterface'          : (text.cls)
P.         'RexxTextTransformer'              : (stringIndexer.cls)
PM         'StringIndexer'                    : (stringIndexer.cls)
.M         'StringRexxTextInterface'          : (text.cls)
P.         'UTF16BE_Encoding'                 : (utf16_encoding.cls)
P.         'UTF16LE_Encoding'                 : (utf16_encoding.cls)
P.         'UTF32BE_Encoding'                 : (utf32_encoding.cls)
P.         'UTF32LE_Encoding'                 : (utf32_encoding.cls)
P.         'UTF8_Encoding'                    : (utf8_encoding.cls)
P.         'Unicode'                          : (REXX)
P.         'Unicode16_Encoding'               : (unicode16_encoding.cls)
P.         'Unicode32_Encoding'               : (unicode32_encoding.cls)
P.         'Unicode8_Encoding'                : (unicode8_encoding.cls)
P.         'UnicodeCharacter'                 : (unicode.cls)
P.         'UnicodeCharacterAlias'            : (unicode.cls)
P.         'UnicodeCharacterInterval'         : (unicode.cls)
P.         'UnicodeCharacterIntervalSupplier' : (unicode.cls)
P.         'UnicodeCharacterSupplier'         : (unicode.cls)
PM         'UnicodeN_Encoding'                : (unicodeN_encoding.cls)
PM         'Unicode_CommonServices'           : (unicode_common.cls)
P.         'WINDOWS1252_Encoding'             : (windows-1252_encoding.cls)
P.         'WTF16BE_Encoding'                 : (wtf16_encoding.cls)
P.         'WTF16LE_Encoding'                 : (wtf16_encoding.cls)
P.         'WTF8_Encoding'                    : (wtf8_encoding.cls)
[Info] 42 lines displayed

-- ===============================================================================
-- 2021 mar 24

/*
Optimization of String~isASCII:
The old implementation checks from start to end.
The new implementation checks from start ascending, from middle descending, from middle ascending, from end descending.
That divides the number of iterations by 4, while increasing the chance of finding a non-ASCII character sooner.
Strangely, the new implementation is also faster when all the characters are ASCII.

Benchmark using a version where the flag isASCII is not stored:
*/
-- MUST declare the byte encoding as default encoding, otherwise "é" is converted to text and the concatenation is catastrophically long!
ooRexx> previousEncoding = .encoding~setDefaultEncoding("byte") -- backup and change to Byte
ooRexx> big10m = "0123456789"~copies(1e6)
ooRexx> s = big10m                              -- 10 millions of ASCII characters, must check all of them
-- do 1000; s~isASCIIold; end              -- 9.3s
ooRexx> do 1000; s~isASCII; end                 -- 6.2s
ooRexx> s = "é" || big10m                       -- 1 non-ASCII character followed by 10 millions of ASCII characters
-- do 1000; s~isASCIIold; end              -- 0.001s
ooRexx> do 1000; s~isASCII; end                 -- 0.001s
ooRexx> s = big10m || "é"                       -- 10 millions of ASCII characters followed by 1 non-ASCII character
-- do 1000; s~isASCIIold; end              -- 9.3s
ooRexx> do 1000; s~isASCII; end                 -- 0.001s
ooRexx> big5m = "01234"~copies(1e6)
ooRexx> s = big5m || "é" || big5m               -- 1 non-ASCII character in the middle of 10 millions of ASCII characters
-- do 1000; s~isASCIIold; end              -- 4.7s
ooRexx> do 1000; s~isASCII; end                 -- 0.001s
ooRexx> .encoding~setDefaultEncoding(previousEncoding) -- restore


-- ===============================================================================
-- 2021 mar 15

/*
Encoded strings (prototype).
Added support for UTF-8.
Added suppliers for codepoints and graphemes.
*/

ooRexx> s = "ça va ?"
ooRexx> s~length=                           -- 7 (was 8 before automatic conversion of string literals to text)
 7
ooRexx> s~eachC{item~c2x" "}=               -- ['C3A7 ', 61 , 20 , 76 , 61 , 20 ,'3F ']     (was ['C3 ','A7 ', 61 , 20 , 76 , 61 , 20 ,'3F '] before automatic conversion of string literals to text)
['C3A7 ', 61 , 20 , 76 , 61 , 20 ,'3F ']
ooRexx> s~text~encoding=                    -- (The UTF8_Encoding class)
(The UTF8_Encoding class)
ooRexx> s~text~length=                      -- 7
 7
ooRexx> s~text("utf8")~length==             -- 7
 7
ooRexx> s~text~codepoints~each=             -- [ 231, 97, 32, 118, 97, 32, 63]
[ 231, 97, 32, 118, 97, 32, 63]
ooRexx> s~text~graphemes~each("c2x")=       -- ['C3A7', 61, 20, 76, 61, 20,'3F']
['C3A7', 61, 20, 76, 61, 20,'3F']