loadPackage OK for extension/stringChunk.cls loadPackage OK for utilities/indentedStream.cls loadPackage OK for extension/extensions.cls loadLibrary OK for rxunixsys loadPackage OK for ncurses.cls loadPackage OK for csvStream.cls loadLibrary OK for hostemu loadPackage OK for json.cls loadPackage OK for mime.cls loadPackage OK for rxftp.cls loadLibrary OK for rxmath loadPackage OK for rxregexp.cls loadPackage OK for regex/regex.cls loadPackage OK for smtp.cls loadPackage OK for socket.cls loadPackage OK for streamsocket.cls loadPackage OK for pipeline/pipe.cls loadPackage OK for rgf_util2/rgf_util2.rex loadPackage OK for BSF.CLS loadPackage OK for oorexxshell_queries.cls loadPackage OK for pipeline/pipe_extension.cls loadPackage OK for rgf_util2/rgf_util2_wrappers.rex REXX-ooRexx_4.3.0(MT)_64-bit 6.04 2 Jul 2024 Input queue name: S1cb9Q600001d700d0 ooRexx> call loadUnicodeCharacterNames Load the Unicode character names 15.1.0 ............................................ Total loaded character names: 149813 Total character name aliases: 473 Unicode character intervals not expanded, execute: call expandUnicodeCharacterIntervals ooRexx> .Unicode~memorizeTranscodings = .false ooRexx> .Unicode~memorizeTransformations = .false -- =============================================================================== -- 2024 Jul 02 /* Align the result of the C2X BIF with the result of the C2X BIM */ ooRexx> c2x("Noël")= -- '4E 6F C3AB 6C' (was '4E6FC3AB6C') '4E 6F C3AB 6C' ooRexx> "Noël"~c2x= -- '4E 6F C3AB 6C' '4E 6F C3AB 6C' ooRexx> c2x("Noel")= -- '4E6F656C' (unchanged) '4E6F656C' ooRexx> "Noel"~c2x= -- '4E6F656C' (unchanged) '4E6F656C' -- =============================================================================== -- 2024 June 22 /* Only ASCII compatible encodings can be used as the default encoding. 
*/ ooRexx> .stringIndexer~pipe(.subclasses "recursive" | .select {item~canBeDefaultEncoding} | .console) 1 : (The Byte_Encoding class) 1 : (The ISO88591_Encoding class) 2 : (The IBM1252_Encoding class) 3 : (The WINDOWS1252_Encoding class) 4 : (The IBM437_Encoding class) 4 : (The WTF8_Encoding class) 9 : (The UTF8_Encoding class) 12 : (The Unicode8_Encoding class) /* Reason: The interpreter can return ASCII strings with no stored encoding (per design). In this case their encoding is the default encoding. This encoding must be compatible with ASCII. Examples: */ ooRexx> oldEncoding = .encoding~setDefaultEncoding("byte") -- ok ooRexx> .encoding~defaultEncoding = "cp1252" -- ok ooRexx> .encoding~defaultEncoding = "utf8" -- ok ooRexx> .encoding~defaultEncoding = "wtf8" -- ok ooRexx> .encoding~defaultEncoding = "unicode8" -- ok ooRexx> .encoding~defaultEncoding = "utf16" -- Encoding: 'utf16' is not supported as default encoding. Must be compatible with ASCII Encoding: 'utf16' is not supported as default encoding. Must be compatible with ASCII. Error code= 93.900 ooRexx> .encoding~defaultEncoding = "utf32" -- Encoding: 'utf32' is not supported as default encoding. Must be compatible with ASCII Encoding: 'utf32' is not supported as default encoding. Must be compatible with ASCII. Error code= 93.900 ooRexx> .encoding~setDefaultEncoding(oldEncoding)= -- (The Unicode8_Encoding class) last default encoding successfully assigned (The Unicode8_Encoding class) ooRexx> system sh> executor --encoding byte -e "say .encoding~defaultEncoding" -- The Byte_Encoding class The Byte_Encoding class sh> executor --encoding utf-8 -e "say .encoding~defaultEncoding" -- The UTF8_Encoding class The UTF8_Encoding class sh> executor --encoding utf-16 -e "say .encoding~defaultEncoding" -- "utf-16" is an invalid default encoding "utf-16" is an invalid default encoding. 
Supported default encodings: 28591 : The ISO88591_Encoding class 437 : The IBM437_Encoding class 819 : The ISO88591_Encoding class 8859 : The ISO88591_Encoding class 88591 : The ISO88591_Encoding class ASCII8BIT : The Byte_Encoding class BINARY : The Byte_Encoding class BYTE : The Byte_Encoding class BYTES : The Byte_Encoding class CP1252 : The WINDOWS1252_Encoding class CP437 : The IBM437_Encoding class CP5348 : The WINDOWS1252_Encoding class CP65001 : The UTF8_Encoding class CP819 : The ISO88591_Encoding class CSISOLATIN1 : The ISO88591_Encoding class CSPC8CODEPAGE437 : The IBM437_Encoding class CSWINDOWS1252 : The WINDOWS1252_Encoding class IBM1252 : The IBM1252_Encoding class IBM437 : The IBM437_Encoding class IBM5348 : The WINDOWS1252_Encoding class IBM819 : The ISO88591_Encoding class ISO88591 : The ISO88591_Encoding class ISO885911987 : The ISO88591_Encoding class ISOIR100 : The ISO88591_Encoding class L1 : The ISO88591_Encoding class LATIN : The ISO88591_Encoding class LATIN1 : The ISO88591_Encoding class UNICODE8 : The Unicode8_Encoding class UTF8 : The UTF8_Encoding class WE8ISO8859P1 : The ISO88591_Encoding class WINDOWS1252 : The WINDOWS1252_Encoding class WINDOWS28591 : The ISO88591_Encoding class WINDOWS437 : The IBM437_Encoding class WTF8 : The WTF8_Encoding class RC= 255 sh> oorexx /* The method setEncoding now has a default value for its 'encoding' argument, which is the receiver's current encoding. This is useful when the encoding is not stored, and we want to make it stored. The "setEncoding" message without arguments will be sent from native methods, without having to send the "encoding" message first.
Examples: */ ooRexx> s = "a" "string" ooRexx> s~hasEncoding= -- 0 0 ooRexx> s~encoding= -- (The UTF8_Encoding class) (The UTF8_Encoding class) ooRexx> s~setEncoding -- make the current encoding persistent if not already stored ooRexx> s~hasEncoding= -- 1 1 ooRexx> s~encoding= -- (The UTF8_Encoding class) (The UTF8_Encoding class) ooRexx> t = "Père" "Noël" ooRexx> t~hasEncoding= -- 1 true because a RexxText has always an encoding 1 ooRexx> t~string~hasEncoding= -- 1 true because linked to a RexxText 1 -- =============================================================================== -- 2024 Apr 24 /* Rework the support of encoding for RexxBlock. The definition doesn't change: A RexxBlock has the same encoding as its definition package. New methods: encoding encoding= hasEncoding setEncoding Examples: */ ooRexx> block = {say .context~package~encoding; s1 = "Père Noël"; say s1~class s1~encoding; s2 = "Père" "Noël"; say s~class s~encoding} ooRexx> block~hasEncoding= -- 1 1 ooRexx> block~encoding= -- (The UTF8_Encoding class) (The UTF8_Encoding class) ooRexx> block~() The UTF8_Encoding class The RexxText class The UTF8_Encoding class The String class The UTF8_Encoding class /* The UTF8_Encoding class -- encoding of the definition package The RexxText class The UTF8_Encoding class -- encoding of the definition package (string literal) The String class The UTF8_Encoding class -- encoding of the calculated string */ -- Changing the block encoding ooRexx> block = {say .context~package~encoding; s1 = "Père Noël"; say s1~class s1~encoding; s2 = "Père" "Noël"; say s~class s~encoding} ooRexx> oldEncoding = block~setEncoding("byte") ooRexx> oldEncoding= -- (The UTF8_Encoding class) (The UTF8_Encoding class) ooRexx> block~hasEncoding= -- 1 1 ooRexx> block~encoding= -- (The Byte_Encoding class) (The Byte_Encoding class) ooRexx> block~() The Byte_Encoding class The String class The Byte_Encoding class The String class The UTF8_Encoding class /* The Byte_Encoding class -- encoding 
of the definition package The String class The Byte_Encoding class -- encoding of the definition package (string literal) The String class The UTF8_Encoding class -- Calculated string. TODO should be The Byte_Encoding */ ooRexx> block~setEncoding(oldEncoding) ooRexx> block~hasEncoding= -- 1 1 ooRexx> block~encoding= -- (The UTF8_Encoding class) (The UTF8_Encoding class) ooRexx> block~() The UTF8_Encoding class The String class The Byte_Encoding class The String class The UTF8_Encoding class /* The UTF8_Encoding class The String class The Byte_Encoding class -- Once a string literal has a stored encoding, it doesn't change The String class The UTF8_Encoding class */ -- =============================================================================== -- 2024 Apr 22 /* The encoding of a string literal is the encoding of its definition package. It is set when the string literal is first evaluated. Once a string literal has received its encoding, it does not change even if the package encoding is changed later. Only string literals not yet evaluated will be impacted. It is possible to explicitly change the encoding using the ~setEncoding method or using the ~encoding = new_encoding assignment. The same goes for the default encoding. Once a calculated string has received its encoding, it does not change even if the default encoding is changed later. 
Examples: */ ooRexx> system rexx string_literal_encoding/package_main.rex prolog of package_byte.cls The Byte_Encoding class prolog of package_cp1252.cls The WINDOWS1252_Encoding class prolog of package_utf8.cls The UTF8_Encoding class prolog of package_utf16be.cls The UTF16BE_Encoding class prolog of package_utf32be.cls The UTF32BE_Encoding class .package_byte~m_name package_byte String The Byte_Encoding class .package_byte~m_noel Noël String The Byte_Encoding class .package_byte~m_noel_x Noël String The Byte_Encoding class .package_byte~m_noel_x2c Noël String The UTF8_Encoding class .package_cp1252~m_name package_cp1252 String The WINDOWS1252_Encoding class .package_cp1252~m_noel Noël String The WINDOWS1252_Encoding class .package_cp1252~m_noel_x Noël String The WINDOWS1252_Encoding class .package_cp1252~m_noel_x2c Noël String The UTF8_Encoding class .package_utf8~m_name package_utf8 String The UTF8_Encoding class .package_utf8~m_noel Noël RexxText The UTF8_Encoding class .package_utf8~m_noel_x Noël RexxText The UTF8_Encoding class .package_utf8~m_noel_x2c Noël String The UTF8_Encoding class .package_utf16be~m_name package_utf16be String The UTF16BE_Encoding class .package_utf16be~m_noel 乯쎫氀 � RexxText The UTF16BE_Encoding class .package_utf16be~m_noel_x Noël RexxText The UTF16BE_Encoding class .package_utf16be~m_noel_x2c N o � l String The UTF8_Encoding class .package_utf32be~m_name package_utf32be String The UTF32BE_Encoding class .package_utf32be~m_noel ������������������������������ RexxText The UTF32BE_Encoding class .package_utf32be~m_noel_x Noël RexxText The UTF32BE_Encoding class .package_utf32be~m_noel_x2c N o � l String The UTF8_Encoding class /* Consequence of the previous rule, the hexadecimal and binary strings are no longer declared Byte encoded. Now their encoding is given by their definition package. Idem for the BIFs/BIMs D2C and X2C, their results are no longer declared Byte encoded. 
Since they have no assigned encoding, the encoding of their results depends on the default encoding. Examples: */ ooRexx> "41"x= -- 'A' 'A' ooRexx> "41"x~hasEncoding= -- 1 The encoding is stored 1 ooRexx> "41"x~encoding= -- (The UTF8_Encoding class) This is the encoding of the definition package (The UTF8_Encoding class) ooRexx> "41"~x2c= -- 'A' 'A' ooRexx> "41"~x2c~hasEncoding= -- 0 No stored encoding 0 ooRexx> "41"~x2c~encoding= -- (The UTF8_Encoding class) This is the default encoding (The UTF8_Encoding class) /* For the proper management of the encoding of string literals, the globalStrings directory is no longer used by the parser when building an image. Now, each source (package) manages its own directory, even when building an image. For the moment, all the packages that are included in rexx.img are byte encoded, so this change is not needed. But maybe in the future, I may have packages with different encodings in rexx.img. */ /* It's now possible to reset the encoding of a string, mutable buffer or package by passing .nil when using target~encoding = .nil target~setEncoding(.nil) After reset, the encoding is no longer stored and the default encoding is returned. A RexxText always has an encoding, so an error is raised when passing .nil. The same error is raised when the target is a string linked to a RexxText.
Examples: */ ooRexx> s = "Noel" ooRexx> s~description= -- 'UTF-8 ASCII (4 bytes)' 'UTF-8 ASCII (4 bytes)' ooRexx> oldEncoding = s~setEncoding(.nil) ooRexx> oldEncoding= -- (The UTF8_Encoding class) (The UTF8_Encoding class) ooRexx> s~description= -- 'UTF-8 ASCII by default (4 bytes)' 'UTF-8 ASCII by default (4 bytes)' ooRexx> s~setEncoding(oldEncoding) ooRexx> s~description= -- 'UTF-8 ASCII (4 bytes)' 'UTF-8 ASCII (4 bytes)' ooRexx> t = "Noël" ooRexx> t~description= -- 'UTF-8 not-ASCII (4 characters, 4 codepoints, 5 bytes, 0 error)' 'UTF-8 not-ASCII (4 characters, 4 codepoints, 5 bytes, 0 error)' ooRexx> t~setEncoding(.nil) -- Encoding: 'The NIL object' is not supported Encoding: 'The NIL object' is not supported. Error code= 93.900 ooRexx> s = t~string ooRexx> s~description= -- 'UTF-8 not-ASCII (4 characters, 4 codepoints, 5 bytes, 0 error)' 'UTF-8 not-ASCII (4 characters, 4 codepoints, 5 bytes, 0 error)' ooRexx> s~setEncoding(.nil) -- Encoding: 'The NIL object' is not supported Encoding: 'The NIL object' is not supported. Error code= 93.900 /* The method ~setEncoding returns .nil when the target has no stored encoding. That allows the encoding to be reset properly when restoring the previous value. Note: the method ~encoding never returns .nil. It returns the default encoding when no encoding is stored.
Examples: */ ooRexx> package = {}~rawExecutable~package -- Use a package different from ooRexxShell ooRexx> package~setEncoding(.nil) -- reset the package encoding, no stored encoding ooRexx> package~hasEncoding= -- 0 The encoding is not stored 0 ooRexx> package~encoding= -- (The Byte_Encoding class) It's the default encoding for a package not requiring text.cls (The Byte_Encoding class) ooRexx> oldEncoding = package~setEncoding("cp1252") ooRexx> oldEncoding= -- (The NIL object) (The NIL object) ooRexx> package~hasEncoding= -- 1 The encoding is stored 1 ooRexx> package~encoding= -- (The WINDOWS1252_Encoding class) (The WINDOWS1252_Encoding class) ooRexx> package~setEncoding(oldEncoding)= -- (The WINDOWS1252_Encoding class) Previous encoding (The WINDOWS1252_Encoding class) ooRexx> package~hasEncoding= -- 0 Return to non-stored encoding 0 ooRexx> package~encoding= -- (The Byte_Encoding class) It's the default encoding for a package not requiring text.cls (The Byte_Encoding class) /* New methods: .String~detach .RexxText~detach The string is detached from its text counterpart. The text becomes an empty text "". Useful when working with big strings, to reclaim memory. No need to call ~detach on both targets. There is a forward to the counterpart. Examples: */ ooRexx> s = "Noel" ooRexx> t = s~text ooRexx> t= -- T'Noel' T'Noel' ooRexx> s~hasText= -- 1 1 ooRexx> s~detach ooRexx> s~hasText= -- 0 0 ooRexx> t= -- T'' T'' ooRexx> t = "Noël" ooRexx> s = t~string ooRexx> t= -- T'Noël' T'Noël' ooRexx> s~hasText= -- 1 1 ooRexx> t~detach ooRexx> s~hasText= -- 0 0 ooRexx> t= -- T'' T'' /* New methods: .String~byte .RexxText~byte Returns a copy of the string or text, with encoding = The Byte_Encoding. The Byte_Encoding is a raw encoding with few constraints. It's often used for diagnostic or repair. It can be always absorbed when doing a concatenation or a comparison. BUT it's impossible to transcode from/to it without errors if the string contains not-ASCII characters. 
Here, no transcoding, it's a copy as-is whose encoding is The Byte_Encoding. Examples: */ ooRexx> "50C3"x~description= -- 'UTF-8 not-ASCII (2 characters, 2 codepoints, 2 bytes, 1 error)' 'UTF-8 not-ASCII (2 characters, 2 codepoints, 2 bytes, 1 error)' ooRexx> "Père"~text~startsWith("50C3"x)= -- Invalid UTF-8 string (raised by utf8proc) Invalid UTF-8 string. Error code= 22.900 ooRexx> "50C3"x~byte~description= -- 'Byte not-ASCII (2 characters, 2 codepoints, 2 bytes, 0 error)' 'Byte not-ASCII (2 characters, 2 codepoints, 2 bytes, 0 error)' ooRexx> "Père"~text~startsWith("50C3"x~byte)= -- 0 (not aligned) 0 /* New methods: .String~bytes .RexxText~bytes Returns a ByteSupplier which provides each byte in decimal. Examples: */ ooRexx> "Noel"~bytes== a ByteSupplier 1 : 78 2 : 111 3 : 101 4 : 108 ooRexx> "Noël"~bytes== a ByteSupplier 1 : 78 2 : 111 3 : 195 4 : 171 5 : 108 -- =============================================================================== -- 2024 Apr 12 /* [interpreter] Add support for dynamic target when sending messages. The target is calculated based on the initial target and the values/types of the message's arguments. It's still a single-dispatch. The ~~ form of message is not impacted: it returns the object that received the message (the initial target), not the calculated target. New method .Object~dynamicTarget which returns the target as a function of the arguments: RexxObject *RexxObject::dynamicTargetRexx(RexxObject **arguments, size_t argCount, size_t named_argCount) { return this->dynamicTarget(arguments, argCount, named_argCount); } By default, the dynamic target is the receiver object. Native classes can override the virtual method dynamicTarget. For the moment, it's not possible to override this method with an ooRexx method.
Examples: */ ooRexx> (1,2)~dynamicTarget= -- initial target: [ 1, 2] [ 1, 2] ooRexx> (1,2)~dynamicTarget("string")= -- initial target: [ 1, 2] [ 1, 2] ooRexx> (1,2)~dynamicTarget("string", "teẌt")= -- initial target: [ 1, 2] [ 1, 2] /* The forward instruction does not depend on the dynamic target calculation. If you need to forward using the dynamic target then do: forward message "DYNAMICTARGET" continue forward to (result) */ /* [Encoded strings] +---------------------------------------------------------------+ | 3rd important milestone | | The String messages become polymorphic on RexxString/RexxText | +---------------------------------------------------------------+ If at least one positional argument is a RexxText then the String message is sent to the RexxText counterpart of the String instance, otherwise the String message is sent to the String instance. The RexxString class overrides the virtual method dynamicTarget: RexxObject *RexxString::dynamicTarget(RexxObject **arguments, size_t count, size_t named_count) { if (hasRexxTextArguments(arguments, count, named_count)) { RexxText *text = this->requestText(); return text; } return this; } Examples: */ ooRexx> "Noel"~dynamicTarget= -- initial target: 'Noel' 'Noel' ooRexx> "Noel"~dynamicTarget("string")= -- initial target: 'Noel' 'Noel' ooRexx> "Noel"~dynamicTarget("string", "teẌt")= -- text counterpart of the initial target: T'Noel' because "teẌt" is a RexxText T'Noel' /* Examples of dynamic target with ~center: */ ooRexx> "é"~c2x=; "é"~class= -- 'C3A9' (The RexxText class) 'C3A9' (The RexxText class) ooRexx> "test"~center(10, "é")= -- T'ééétestééé' T'ééétestééé' ooRexx> "C3A9"x=; result~description= -- T'é' 'UTF-8 not-ASCII (1 character, 1 codepoint, 2 bytes, 0 error)' T'é' 'UTF-8 not-ASCII (1 character, 1 codepoint, 2 bytes, 0 error)' ooRexx> "test"~center(10, "C3A9"x)= -- T'ééétestééé' T'ééétestééé' ooRexx> x2c("C3A9")=; result~description= -- 'é' 'UTF-8 not-ASCII by default (2 bytes)' 'é' 'UTF-8 not-ASCII 
by default (2 bytes)' -- next error is ok: the pad is a string made of 2 bytes ooRexx> "test"~center(10, x2c("C3A9"))= -- Incorrect pad or character argument specified; found "é" Incorrect pad or character argument specified; found "é" Error code= 93.922 /* Examples of dynamic target with ~left: */ ooRexx> "test"~left(10)= -- 'test ' 'test ' ooRexx> "test"~left(10, ".")= -- 'test......' 'test......' ooRexx> "test"~left(10, "🦖")= -- T'test🦖🦖🦖🦖🦖🦖' T'test🦖🦖🦖🦖🦖🦖' /* The ~~ form of message is not impacted: it always returns the initial target */ ooRexx> "test"~right(10, "é")~left(20, "è")= -- T'éééééétestèèèèèèèèèè' T'éééééétestèèèèèèèèèè' ooRexx> "test"~~right(10, "é")~left(20, "è")= -- T'testèèèèèèèèèèèèèèèè' T'testèèèèèèèèèèèèèèèè' ooRexx> "test"~right(10, "é")~~left(20, "è")= -- T'éééééétest' T'éééééétest' ooRexx> "test"~~right(10, "é")~~left(20, "è")= -- 'test' 'test' /* [doers] RexxText inherit from TextDoer. Examples: */ ooRexx> "c2x"~text~do("a")= -- 61 (was Object "c2x" does not understand message "DO") 61 ooRexx> "ça va ?"~characters~each= -- [T'ç',T'a',T' ',T'v',T'a',T' ',T'?'] [T'ç',T'a',T' ',T'v',T'a',T' ',T'?'] ooRexx> "ça va ?"~characters~each("c2x")= -- ['C3A7', 61, 20, 76, 61, 20,'3F'] ['C3A7', 61, 20, 76, 61, 20,'3F'] /* A RexxBlock has the same encoding as its definition package. Examples: */ ooRexx> {.context~package~encoding}~()= -- (The UTF8_Encoding class) (The UTF8_Encoding class) ooRexx> oldEncoding = .context~package~setEncoding("byte") ooRexx> {.context~package~encoding}~()= -- (The Byte_Encoding class) (The Byte_Encoding class) ooRexx> .context~package~setEncoding(oldEncoding) -- was: Incorrect pad or character argument specified; found "é" -- because the package encoding of the block was The Byte_Encoding (default) -- and the string literal "é" was not converted to a RexxText instance. 
-- Now the package encoding of the block is The UTF8_Encoding and it works: ooRexx> ("un", "deux")~each{item~right(10, "é")}== an Array (shape [2], 2 items) 1 : T'ééééééééun' 2 : T'éééééédeux' -- =============================================================================== -- 2024 Apr 10 /* +-----------------------------------------------------------+ | 2nd important milestone | | The string BIFs become polymorphic on RexxString/RexxText | +-----------------------------------------------------------+ If at least one positional argument is a RexxText then the string BIFs forward to RexxText, otherwise the string BIFs forward to RexxString. Enhanced BIFs: ABBREV CENTER implemented on RexxText CENTRE implemented on RexxText CHANGESTR COMPARE implemented on RexxText COPIES implemented on RexxText COUNTSTR D2C implemented on RexxText DELSTR DELWORD INSERT LASTPOS LEFT implemented on RexxText LENGTH implemented on RexxText LOWER implemented on RexxText OVERLAY POS implemented on RexxText REVERSE implemented on RexxText RIGHT implemented on RexxText SPACE STRIP implemented on RexxText SUBSTR implemented on RexxText SUBWORD UPPER implemented on RexxText VERIFY WORD WORDINDEX WORDLENGTH WORDPOS WORDS X2C implemented on RexxText Examples: */ -- CENTER ooRexx> CENTER("Noel", 10, "*")= -- '***Noel***' '***Noel***' ooRexx> CENTER("Noel", 10, "🤶")= -- T'🤶🤶🤶Noel🤶🤶🤶' because "🤶" is a RexxText T'🤶🤶🤶Noel🤶🤶🤶' ooRexx> CENTER("Noël", 10, "*")= -- T'***Noël***' because "Noël" is a RexxText T'***Noël***' ooRexx> CENTER("Noël"~string, 10, "*")= -- '**Noël***' '**Noël***' ooRexx> CENTER("Noël", 10, "🤶")= -- T'🤶🤶🤶Noël🤶🤶🤶' T'🤶🤶🤶Noël🤶🤶🤶' ooRexx> CENTER("Noël"~string, 10, "🤶")= -- T'🤶🤶🤶Noël🤶🤶🤶' because "🤶" is a RexxText T'🤶🤶🤶Noël🤶🤶🤶' ooRexx> CENTER("Noël", 10, "🤶"~string)= -- T'🤶🤶🤶Noël🤶🤶🤶' because "Noël" is a RexxText T'🤶🤶🤶Noël🤶🤶🤶' ooRexx> CENTER("Noel", 10, "🤶"~string)= -- CENTER positional argument 3 must be a single character; found "🤶" CENTER positional argument 3 must be a single 
character; found "🤶". Error code= 40.23 ooRexx> CENTER("Noël"~string, 10, "🤶"~string)= -- CENTER positional argument 3 must be a single character; found "🤶" CENTER positional argument 3 must be a single character; found "🤶". Error code= 40.23 -- Other BIFs ooRexx> ABBREV("Printer","Pri")= -- 1 1 ooRexx> ABBREV("Printer 🖨","Pri")= -- Object "Printer 🖨" does not understand message "ABBREV" Object "Printer 🖨" does not understand message "ABBREV". Error code= 97.1 ooRexx> CHANGESTR("p", "mpNoelpp", "m", 2)= -- 'mmNoelmp' 'mmNoelmp' ooRexx> CHANGESTR("🎅", "🤶🎅Noël🎅🎅", "🤶", 2)= -- Object "🤶🎅Noël🎅🎅" does not understand message "CHANGESTR" Object "🤶🎅Noël🎅🎅" does not understand message "CHANGESTR". Error code= 97.1 ooRexx> COMPARE("straSssSSssse", "stra", "S")= -- 6 6 ooRexx> COMPARE("straßssßßssse", "stra", "ß")= -- 6 6 ooRexx> COPIES("🤶", 4)= -- T'🤶🤶🤶🤶' T'🤶🤶🤶🤶' ooRexx> COUNTSTR("m", "mpmp")= -- 2 2 ooRexx> COUNTSTR("🤶", "🤶🎅🤶🎅")= -- Object "🤶🎅🤶🎅" does not understand message "COUNTSTR" Object "🤶🎅🤶🎅" does not understand message "COUNTSTR". Error code= 97.1 ooRexx> D2C(65)= -- 'A' 'A' ooRexx> D2C(65~text)= -- T'A' T'A' ooRexx> DELSTR("Noel", 3, 2)= -- 'No' 'No' ooRexx> DELSTR("Noël", 3, 2)= -- Object "Noël" does not understand message "DELSTR" Object "Noël" does not understand message "DELSTR". Error code= 97.1 ooRexx> DELWORD("Pere Noel p", 2, 2)= -- 'Pere ' 'Pere ' ooRexx> DELWORD("Père Noël 🎅", 2, 2)= -- Object "Père Noël 🎅" does not understand message "DELWORD" Object "Père Noël 🎅" does not understand message "DELWORD". Error code= 97.1 ooRexx> INSERT("123", "abc", 5, 6, "+")= -- 'abc++123+++' 'abc++123+++' ooRexx> INSERT("123", "abc", 5, 6, "🎅")= -- Object "abc" does not understand message "INSERT" Object "abc" does not understand message "INSERT". Error code= 97.1 ooRexx> LASTPOS("m", "mMere Noelm")= -- 11 11 ooRexx> LASTPOS("🤶", "🤶Mère Noël🤶")= -- Object "🤶Mère Noël🤶" does not understand message "LASTPOS" Object "🤶Mère Noël🤶" does not understand message "LASTPOS". 
Error code= 97.1 ooRexx> LEFT("abc d",8,".")= -- 'abc d...' 'abc d...' ooRexx> LEFT("abc d",8,"🤶")= -- T'abc d🤶🤶🤶' T'abc d🤶🤶🤶' ooRexx> LENGTH("Père Noël 🎅"~string)= -- 16 16 ooRexx> LENGTH("Père Noël 🎅")= -- 11 11 ooRexx> LOWER("PÈRE NOËL")= -- T'père noël' T'père noël' ooRexx> OVERLAY("123","abc",5,6,"+")= -- 'abc+123+++' 'abc+123+++' ooRexx> OVERLAY("123","abc",5,6,"🤶")= -- Object "abc" does not understand message "OVERLAY" Object "abc" does not understand message "OVERLAY". Error code= 97.1 ooRexx> POS("Frei", "Bundesstraße im Freiland")= -- 17 17 ooRexx> REVERSE("Noël")= -- T'lëoN' T'lëoN' ooRexx> RIGHT("12",5,"0")= -- 00012 00012 ooRexx> RIGHT("12",5,"𝟶")= -- T'𝟶𝟶𝟶12' T'𝟶𝟶𝟶12' ooRexx> SPACE("abc def ",2,"+")= -- 'abc++def' 'abc++def' ooRexx> SPACE("abc def ",2,"⊕")= -- Object "abc def " does not understand message "SPACE" Object "abc def " does not understand message "SPACE". Error code= 97.1 ooRexx> STRIP("12.0000", "T", '.0')= -- 12 12 ooRexx> STRIP("12.øøøø", "T", '.ø')= -- T'12' where 'ø'~c2x='C3B8'. T'12' ooRexx> STRIP(("12.øø" || "C3"x || "øø")~string, "T", '.ø'~string)= -- 12 Every byte of the last parameter is searched and removed 12 ooRexx> STRIP("12.øø" || "C3"x || "øø", "T", '.ø')= -- Invalid UTF-8 string (raised by utf8proc) Invalid UTF-8 string. Error code= 22.900 ooRexx> STRIP(("12.øø" || "C3"x || "øø")~transcodeTo("ISO-8859-1", replacementCharacter:"#"), "T", '.ø'~transcodeTo("ISO-8859-1"))= -- T'12.??#' T'12.��#' ooRexx> SUBSTR("abc",2,6,".")= -- 'bc....' 'bc....' ooRexx> SUBSTR("abc",2,6,"🤶")= -- T'bc🤶🤶🤶🤶' T'bc🤶🤶🤶🤶' ooRexx> SUBWORD("Now is the time",2,2)= -- 'is the' 'is the' ooRexx> SUBWORD("Now is the 🕑",2,2)= -- Object "Now is the 🕑" does not understand message "SUBWORD" Object "Now is the 🕑" does not understand message "SUBWORD". 
Error code= 97.1 ooRexx> UPPER("père noël")= -- T'PÈRE NOËL' T'PÈRE NOËL' ooRexx> VERIFY("ABCDEF","ABC","N",2,3)= -- 4 4 ooRexx> VERIFY("ABCDEF","ABC","N"~text,2,3)= -- Object "ABCDEF" does not understand message "VERIFY" (yes! ANY parameter is tested, including the option) Object "ABCDEF" does not understand message "VERIFY". Error code= 97.1 ooRexx> WORD("Now is the time",3)= -- 'the' 'the' ooRexx> WORD("Now is the 🕑",3)= -- Object "Now is the 🕑" does not understand message "WORD" Object "Now is the 🕑" does not understand message "WORD". Error code= 97.1 ooRexx> WORDINDEX("Now is the time",3)= -- 8 8 ooRexx> WORDINDEX("Now is the 🕑",3)= -- Object "Now is the 🕑" does not understand message "WORDINDEX" Object "Now is the 🕑" does not understand message "WORDINDEX". Error code= 97.1 ooRexx> WORDLENGTH("Now is the time",4)= -- 4 4 ooRexx> WORDLENGTH("Now is the 🕑",4)= -- Object "Now is the 🕑" does not understand message "WORDLENGTH" Object "Now is the 🕑" does not understand message "WORDLENGTH". Error code= 97.1 ooRexx> WORDPOS("the","Now is the time")= -- 3 3 ooRexx> WORDPOS("the","Now is the 🕑")= -- Object "Now is the 🕑" does not understand message "WORDPOS" Object "Now is the 🕑" does not understand message "WORDPOS". Error code= 97.1 ooRexx> WORDS("Now is the time")= -- 4 4 ooRexx> WORDS("Now is the 🕑")= -- Object "Now is the 🕑" does not understand message "WORDS" Object "Now is the 🕑" does not understand message "WORDS". Error code= 97.1 ooRexx> X2C(41)= -- 'A' 'A' ooRexx> X2C(41~text)= -- T'A' T'A' /* Still not sure: When the target is a String, should the BIF d2c and x2c return a RexxText when the result is not-ASCII and the evaluation context encoding is not Byte? That would be consistent with the rules for string literal (R1, R2). Currently, assuming the package encoding is UTF-8: "FF"x is a RexxText but x2c("FF") is a String. And what about "FF"~x2c? currently it's a String. 
Examples: */ ooRexx> "FF"x=;result~description= -- T'[FF]' 'UTF-8 not-ASCII (1 character, 1 codepoint, 1 byte, 1 error)' T'[FF]' 'UTF-8 not-ASCII (1 character, 1 codepoint, 1 byte, 1 error)' ooRexx> x2c("FF")=;result~description= -- '[FF]' 'UTF-8 not-ASCII by default (1 byte)' '[FF]' 'UTF-8 not-ASCII by default (1 byte)' ooRexx> "FF"~x2c=;result~description= -- '[FF]' 'UTF-8 not-ASCII by default (1 byte)' '[FF]' 'UTF-8 not-ASCII by default (1 byte)' ooRexx> "FF"~text~x2c=;result~description= -- T'[FF]' 'UTF-8 not-ASCII (1 character, 1 codepoint, 1 byte, 1 error)' T'[FF]' 'UTF-8 not-ASCII (1 character, 1 codepoint, 1 byte, 1 error)' ooRexx> "FF"~text("cp1252")~x2c=;result~description= -- T'[FF]' 'windows-1252 not-ASCII (1 character, 1 codepoint, 1 byte, 0 error)' T'[FF]' 'windows-1252 not-ASCII (1 character, 1 codepoint, 1 byte, 0 error)' --- ooRexx> "41"x=;result~description= -- 'A' 'UTF-8 ASCII (1 byte)' 'A' 'UTF-8 ASCII (1 byte)' ooRexx> x2c("41")=;result~description= -- 'A' 'UTF-8 ASCII by default (1 byte)' 'A' 'UTF-8 ASCII by default (1 byte)' ooRexx> "41"~x2c=;result~description= -- 'A' 'UTF-8 ASCII by default (1 byte)' 'A' 'UTF-8 ASCII by default (1 byte)' ooRexx> "41"~text~x2c=;result~description= -- T'A' 'UTF-8 ASCII (1 character, 1 codepoint, 1 byte, 0 error)' T'A' 'UTF-8 ASCII (1 character, 1 codepoint, 1 byte, 0 error)' ooRexx> "41"~text("cp1252")~x2c=;result~description= -- T'A' 'windows-1252 ASCII (1 character, 1 codepoint, 1 byte, 0 error)' T'A' 'windows-1252 ASCII (1 character, 1 codepoint, 1 byte, 0 error)' -- =============================================================================== -- 2024 Apr 03 /* No longer apply the rule R3 during the automatic conversion of String literals to RexxText instances. If the package encoding is not a byte encoding then any not-ASCII String literal is converted to a RexxText, whatever its encoding. 
Reason: inconsistency between "noel" "FF"x~~setEncoding("cp1252")= -- 'noel [FF]' because concatenation of 2 String instances "noël" "FF"x~~setEncoding("cp1252")= -- Encoding: cannot append... because concatenation of a RexxText with a String Now: */ ooRexx> "FF"x= -- T'[FF]' (was a String thanks to R3) T'[FF]' ooRexx> "noel" "FF"x~~setEncoding("cp1252")= -- Encoding: cannot append windows-1252 not-ASCII '[FF]' to UTF-8 ASCII 'noel' (was 'noel [FF]') Encoding: cannot append windows-1252 not-ASCII '[FF]' to UTF-8 ASCII 'noel'. Error code= 23.900 ooRexx> -- Note: no longer "by default" in "UTF-8 ASCII 'noel'" because the string literal has now a stored encoding /* Unchanged: */ ooRexx> "noël" "FF"x= -- T'noël [FF]' no error because the Byte_Encoding is always absorbed T'noël [FF]' ooRexx> "FF"x "noël"= -- T'[FF] noël' idem T'[FF] noël' ooRexx> "noël" "FF"x~~setEncoding("cp1252")= -- Encoding: cannot append windows-1252 not-ASCII '[FF]' to UTF-8 not-ASCII 'noël' Encoding: cannot append windows-1252 not-ASCII '[FF]' to UTF-8 not-ASCII 'noël'. Error code= 23.900 ooRexx> "FF"x~~setEncoding("cp1252") "noël"= -- Encoding: cannot append UTF-8 not-ASCII 'noël' to windows-1252 not-ASCII '[FF]' Encoding: cannot append UTF-8 not-ASCII 'noël' to windows-1252 not-ASCII '[FF]'. Error code= 23.900 -- =============================================================================== -- 2024 Apr 01 /* A package has an encoding: .Package encoding encoding= hasEncoding Rules for the calculation of a package default encoding: Case 1: package not requesting "text.cls", directly or indirectly. Most of the legacy packages don't support an automatic conversion to text. The package's default encoding is Byte (not .Encoding~defaultEncoding). Case 2: package requesting "text.cls", directly or indirectly. We assume that the requester supports an automatic conversion to text. The package's default encoding is .Encoding~defaultEncoding. 
*/ /* New method setEncoding on String, MutableBuffer, Package and RexxText, to change the current encoding and return the previous encoding. The bytes are not impacted, it's just an update of the encoding annotation. Example, assuming the default encoding is UTF-8: */ ooRexx> "Noel"~setEncoding("windows-1252")= -- (The UTF8_Encoding class) (previous encoding) (The UTF8_Encoding class) ooRexx> "Noël"~setEncoding("byte")= -- (The UTF8_Encoding class) (previous encoding) (The UTF8_Encoding class) /* Example, when the package encoding is Byte: */ ooRexx> oldEncoding = .context~package~setEncoding("byte") ooRexx> "Noel"~setEncoding("windows-1252")= -- (The Byte_Encoding class) (previous encoding) (The Byte_Encoding class) ooRexx> "Noël"~setEncoding("utf8")= -- (The Byte_Encoding class) (previous encoding) (The Byte_Encoding class) ooRexx> .context~package~setEncoding(oldEncoding) /* New methods on the class Encoding, to change the current encoding and return the previous encoding: setDefaultEncoding setDefaultInputEncoding setDefaultOutputEncoding */ /* Relax the constraints for the Byte_Encoding in the methods compatibleEncoding and asEncodingFor: The Byte_Encoding can always be absorbed. Reason: The Byte_Encoding is often used for diagnostic or repair. 
Examples: */ ooRexx> "Père"~c2g= -- '50 C3A8 72 65' '50 C3A8 72 65' ooRexx> "Père"~text~startsWith("50C3"x~byte)= -- false (not aligned) (was Encoding: cannot compare Byte not-ASCII 'P\C3' with UTF-8 not-ASCII 'Père') 0 ooRexx> "Père"~text~startsWith("50C3A8"x~byte)= -- true (was Encoding: cannot compare Byte not-ASCII 'Pè' with UTF-8 not-ASCII 'Père') 1 /* +-------------------------------------------+ | 1st important milestone | | Activation of the automatic conversion | | of String literals to RexxText instances | +-------------------------------------------+ This is managed in RexxString::evaluate Rules: if string~isASCII then value = string -- R1 don't convert to RexxText if the string literal is ASCII (here, NO test of encoding, just testing the bytes) else if .context~package~encoding~isByte then value = string -- R2 don't convert to RexxText if the encoding of its definition package is the Byte_Encoding or a subclass of it (legacy package). -- else if string~isCompatibleWithByteString then value = string -- R3 (no longer applied) don't convert to RexxText if the string literal is compatible with a Byte string. else value = string~text -- R4 convert to RexxText Examples, assuming the package encoding is UTF-8: */ ooRexx> "Noel"~class= -- (The String class) R1 (The String class) ooRexx> oldEncoding = .context~package~setEncoding("byte") ooRexx> "Noël"~class= -- (The String class) R2 (The String class) ooRexx> .context~package~setEncoding(oldEncoding) -- The rule R3 is no longer applied -- The only way to test it is to use an hexadecimal (or binary) string literal. 
-- [later] The hexadecimal string literals are no longer Byte encoded, so this test is no longer a good test ooRexx> "Noël"~c2x= -- '4E 6F C3AB 6C' '4E 6F C3AB 6C' ooRexx> '4E 6F C3AB 6C'x~encoding= -- (The UTF8_Encoding class) (was (The Byte_Encoding class) so R3 could apply, but we no longer apply it) (The UTF8_Encoding class) ooRexx> '4E 6F C3AB 6C'x~class= -- (The RexxText class) R4 (The RexxText class) ooRexx> "Noël"~class= -- (The RexxText class) R4 (The RexxText class) ooRexx> "Noël"~string~class= -- (The String class) R4 The string literal is a RexxText, the method ~string returns a String with encoding UTF-8 (The String class) ooRexx> "Noël"~~setEncoding("byte")~class= -- (The RexxText class) R4 The string literal is a RexxText, its encoding is changed from UTF-8 to Byte (The RexxText class) ooRexx> "Noël"~~setEncoding("byte")~string~class= -- (The String class) R4 The string literal is a RexxText, its encoding is changed from UTF-8 to Byte, the method ~string returns a String with encoding Byte (The String class) /* Deactivate (again) the constraint "self~isCompatibleWithByteString" when converting a RexxText to a String (.Unicode~unckeckedConversionToString = .true). Reason: after activation of the automatic conversion to RexxText, I get these errors if I keep the constraint "self~isCompatibleWithByteString". say "Noël" -- raise an error "UTF-8 not-ASCII 'Noël' cannot be converted to a String instance" xrange("00"x,"ff"x) -- raise an error "UTF-8 not-ASCII '[FF]' cannot be converted to a String instance" The constraint "self~isCompatibleWithByteString" was put in place to detect when a RexxText instance is "lost" during conversion to string. Now that we have a common interface on String and RexxText, plus an automatic conversion to RexxText, this "loss" should occur less often. But still occurs. 
Example, assuming the default encoding and the package encoding are UTF-8: */ ooRexx> "Noël"~length= -- 4 4 ooRexx> "Noël"~text~length= -- 4 4 ooRexx> "Noël"~string~length= -- 5 5 ooRexx> length("Noël")= -- 4 (was 5, should be 4 (with the constraint, would raise UTF-8 not-ASCII 'Noël' cannot be converted to a String instance)) 4 ooRexx> length("Noël"~string)= -- 5 5 /* ---------- ABANDONED (incompatible with the decision to assign the encoding of the definition package to the string literals) ---------- The strings created by D2C, X2C are declared Byte encoded. It's because it's not unusual to create ill-formed encoded strings with these BIF/BIM. The Byte_Encoding is a raw encoding with few constraints, BUT it's impossible to transcode from/to it without errors if the string contains not-ASCII characters. That's why, often, a more specialized byte encoding is applied on the byte string, to interpret the bytes differently. Implementation notes: D2C: RexxNumberString::d2xD2c calls StringUtil::packHex X2C: StringUtil::packHex Examples: */ ooRexx> "é"~encoding= -- (The UTF8_Encoding class) (The UTF8_Encoding class) -- D2C ooRexx> "é"~c2d= -- 50089 50089 ooRexx> d2c(50089)= -- 'é' 'é' ooRexx> 50089~d2c= -- 'é' 'é' ooRexx> d2c(50089)~encoding= -- (The UTF8_Encoding class) (was (The Byte_Encoding class)) (The UTF8_Encoding class) ooRexx> 50089~d2c~encoding= -- (The UTF8_Encoding class) (was (The Byte_Encoding class)) (The UTF8_Encoding class) -- X2C ooRexx> "é"~c2x= -- 'C3A9' 'C3A9' ooRexx> x2c("C3A9")= -- 'é' 'é' ooRexx> "C3A9"~x2c= -- 'é' 'é' ooRexx> x2c("C3A9")~encoding= -- (The UTF8_Encoding class) (was (The Byte_Encoding class)) (The UTF8_Encoding class) ooRexx> "C3A9"~x2c~encoding= -- (The UTF8_Encoding class) (was (The Byte_Encoding class)) (The UTF8_Encoding class) -- Valid Byte string, but invalid UTF-8 string ooRexx> "C3"~x2c~class= -- (The String class) (The String class) ooRexx> "C3"~x2c~encoding= -- (The UTF8_Encoding class) (was (The Byte_Encoding class)) 
(The UTF8_Encoding class) -- Apply an UTF-8 view through the String interface ooRexx> "C3"~x2c~~setEncoding("utf8")~description= -- 'UTF-8 not-ASCII (1 byte)' 'UTF-8 not-ASCII (1 byte)' ooRexx> "C3"~x2c~~setEncoding("utf8")~errors= -- 'UTF-8 encoding: byte sequence at byte-position 1 is truncated, expected 2 bytes.' ['UTF-8 encoding: byte sequence at byte-position 1 is truncated, expected 2 bytes.'] -- Apply an UTF-8 view through the RexxText interface ooRexx> "C3"~x2c~text("utf8")~description= -- 'UTF-8 not-ASCII (1 character, 1 codepoint, 1 byte, 1 error)' 'UTF-8 not-ASCII (1 character, 1 codepoint, 1 byte, 1 error)' ooRexx> "C3"~x2c~text("utf8")~errors= -- 'UTF-8 encoding: byte sequence at byte-position 1 is truncated, expected 2 bytes.' ['UTF-8 encoding: byte sequence at byte-position 1 is truncated, expected 2 bytes.'] /* ---------- ABANDONED (incompatible with the decision to assign the encoding of the definition package to the string literals) ---------- The hexadecimal and binary strings are declared Byte encoded, for the same reasons as D2C, X2C. Implementation notes: RexxSource::packLiteral (Scanner.cpp) Examples: */ -- The encoding of a string literal is the encoding of its definition package. ooRexx> "é"~encoding= -- (The UTF8_Encoding class) (The UTF8_Encoding class) -- The encoding of an hexadecimal string is the Byte encoding. ooRexx> "é"~c2x= -- 'C3A9' 'C3A9' ooRexx> "C3A9"x= -- T'é' T'é' ooRexx> "C3A9"x~encoding= -- (The UTF8_Encoding class) (was (The Byte_Encoding class)) (The UTF8_Encoding class) -- The encoding of a binary string is the Byte encoding. 
ooRexx> "é"~c2x~x2b= -- 1100001110101001 1100001110101001 ooRexx> "11000011 10101001"b= -- T'é' T'é' ooRexx> "11000011 10101001"b~encoding= -- (The UTF8_Encoding class) (was (The Byte_Encoding class)) (The UTF8_Encoding class) /* Implementation of Strip: */ ooRexx> "Noël"~strip= -- T'Noël' T'Noël' ooRexx> "\tNoël "~unescape~strip= -- T'Noël' T'Noël' ooRexx> "Noël"~strip("b", "ë")= -- T'Noël' T'Noël' ooRexx> "Noë"~strip("b", "ë")= -- T'No' T'No' ooRexx> "🤶Noël🎅"~strip("b", "lë🎅🤶")= -- T'No' T'No' ooRexx> "\u{NBSP}\u{EN SPACE}\u{EM SPACE}\u{HAIR SPACE}\u{FIGURE SPACE}\u{THIN SPACE}"~unescape~strip= -- T' ' T' ' ooRexx> "\u{NBSP}\u{EN SPACE}\u{EM SPACE}\u{HAIR SPACE}\u{FIGURE SPACE}\u{THIN SPACE}"~unescape~strip(lump:)= -- T'' T'' /* New methods on String for compatibility with RexxText (inherit StringRexxTextInterface). Most of these methods forward to string~text. */ ooRexx> "a"~errors= -- (The NIL object) (The NIL object) ooRexx> "a"~isCompatibleWithASCII= -- 1 1 ooRexx> "a"~isCompatibleWithByteString= -- 1 1 ooRexx> "a"~isUpper= -- 0 0 ooRexx> "A"~isUpper= -- 1 1 ooRexx> "a"~isLower= -- 1 1 ooRexx> "A"~isLower= -- 0 0 ooRexx> "a"~codepoints= -- (a CodePointSupplier) (a CodePointSupplier) ooRexx> "a"~maximumCodepoint= -- 97 97 ooRexx> "a"~maximumUnicodeCodepoint= -- 97 97 ooRexx> "a"~UnicodeCharacters= -- [( "a" U+0061 Ll 1 "LATIN SMALL LETTER A" )] [( "a" U+0061 Ll 1 "LATIN SMALL LETTER A" )] ooRexx> "a"~characters= -- ['a'] ['a'] ooRexx> "a"~character(1)= -- 'a' 'a' ooRexx> buffer = .MutableBuffer~new; "a"~character(1, :buffer)= -- M'a' M'a' ooRexx> "a"~transcodeTo("utf16")= -- T'[00]a' T'[00]a' ooRexx> "a"~utf8= -- T'a' T'a' ooRexx> "a"~wtf8= -- T'a' T'a' ooRexx> "a"~utf16= -- T'[00]a' T'[00]a' ooRexx> "a"~utf16be= -- T'[00]a' T'[00]a' ooRexx> "a"~utf16le= -- T'a[00]' T'a[00]' ooRexx> "a"~wtf16= -- T'[00]a' T'[00]a' ooRexx> "a"~wtf16be= -- T'[00]a' T'[00]a' ooRexx> "a"~wtf16le= -- T'a[00] T'a[00]' ooRexx> "a"~utf32= -- T'[000000]a' T'[000000]a' ooRexx> 
"a"~utf32be= -- T'[000000]a' T'[000000]a' ooRexx> "a"~utf32le= -- T'a[000000]' T'a[000000]' ooRexx> "a"~unicode~c2x= -- 61 61 ooRexx> "a"~unicodeN~c2x= -- 61 61 ooRexx> "a"~unicode8~c2x= -- 61 61 ooRexx> "a"~unicode16~c2x= -- 6100 6100 ooRexx> "a"~unicode32~c2x= -- 61000000 61000000 ooRexx> "a"~c2u= -- 'U+0061' 'U+0061' ooRexx> 'U+0061'~u2c= -- T'a[000000]' T'a[000000]' ooRexx> 'U+0061'~u2c~c2x= -- 61000000 61000000 ooRexx> 'U+0061'~u2c~utf8= -- T'a' T'a' ooRexx> "ab"~c2g= -- '61 62' '61 62' ooRexx> "z"~checkHexadecimalValueCompatibility= -- [no result] (good, no error raised) [no result] ooRexx> "z"~checkNumericValueCompatibility= -- [no result] (good, no error raised) [no result] ooRexx> "z"~checkLogicalValueCompatibility= -- [no result] (good, no error raised) [no result] ooRexx> "\u{FLAG IN HOLE}"~unescape= -- T'⛳' T'⛳' ooRexx> "a"~transform= -- T'a' T'a' ooRexx> "a"~transformer= -- (a RexxTextTransformer) (a RexxTextTransformer) ooRexx> "abc def"~title= -- T'Abc Def' T'Abc Def' ooRexx> "a"~isNFC= -- 1 1 ooRexx> "a"~NFC= -- T'a' T'a' ooRexx> "a"~isNFD= -- 1 1 ooRexx> "a"~NFD= -- T'a' T'a' ooRexx> "a"~isNFKC= -- 1 1 ooRexx> "a"~NFKC= -- T'a' T'a' ooRexx> "a"~isNFKD= -- 1 1 ooRexx> "a"~NFKD= -- T'a' T'a' ooRexx> "a"~isCasefold= -- -1 -1 ooRexx> "A"~isCasefold= -- -1 -1 ooRexx> "a"~transform(casefold:)~isCasefold= -- 1 1 ooRexx> "A"~transform(casefold:)~isCasefold= -- 1 1 ooRexx> "a"~casefold= -- T'a' T'a' ooRexx> "A"~casefold= -- T'a' T'a' ooRexx> "a"~isMarkStripped= -- -1 -1 ooRexx> "a"~transform(stripMark:)~isMarkStripped= -- 1 1 ooRexx> "a"~isIgnorableStripped= -- -1 -1 ooRexx> "a"~transform(stripIgnorable:)~isIgnorableStripped= -- 1 1 ooRexx> "a"~isCCStripped= -- -1 -1 ooRexx> "a"~transform(stripCC:)~isCCStripped= -- 1 1 ooRexx> "a"~isNAStripped= -- -1 -1 ooRexx> "a"~transform(stripNA:)~isNAStripped= -- 1 1 ooRexx> "ab"~graphemes= -- ['a','b'] ['a','b'] ooRexx> "ab"~grapheme(1)= -- 'a' 'a' /* Implementation of the abstract method 'transform' for Byte_Encoding 
and its subclasses. Parameters: normalization = 0 Ignored, there is no normalization for byte strings. casefold = .false if .true then apply ~lower lump= .false Ignored stripMark = .false if .true then replace the accented letters by their base letter stripIgnorable= .false Ignored stripCC = .false if .true then remove the codepoints < 20x stripNA = .false if .true then remove the unassigned codepoints Examples: */ -- casefold ooRexx> "Père Noël"~transcodeTo("windows-1252")= -- T'P?re No?l' T'P�re No�l' ooRexx> "Père Noël"~transcodeTo("windows-1252")~c2x= -- '50 E8 72 65 20 4E 6F EB 6C' '50 E8 72 65 20 4E 6F EB 6C' ooRexx> '50 E8 72 65 20 4E 6F EB 6C'x~byte~transform(casefold:)= -- T'p?re no?l' T'p�re no�l' ooRexx> '50 E8 72 65 20 4E 6F EB 6C'x~byte~transform(casefold:)~encoding= -- (The Byte_Encoding class) (The Byte_Encoding class) ooRexx> '50 E8 72 65 20 4E 6F EB 6C'x~byte~transform(casefold:)~utf8= -- Cannot convert Byte not-ASCII character 232 (E8) at byte-position 2 to UTF-8 Cannot convert Byte not-ASCII character 232 (E8) at byte-position 2 to UTF-8. 
Error code= 23.900 ooRexx> '50 E8 72 65 20 4E 6F EB 6C'x~byte~transform(casefold:)~~setEncoding("windows-1252")~utf8= -- T'père noël' T'père noël' -- stripMark depends on the encoding ooRexx> "80 81 82 83 84 85 86 87 88 89 8A 8B 8C 8D 8E 8F 90 93 94 95 96 97 98 99 9A 9F A0 A1 A2 A3 A4 A5"x~text("ibm-437")~utf8= -- T'ÇüéâäàåçêëèïîìÄÅÉôöòûùÿÖ܃áíóúñÑ' T'ÇüéâäàåçêëèïîìÄÅÉôöòûùÿÖ܃áíóúñÑ' ooRexx> "80 81 82 83 84 85 86 87 88 89 8A 8B 8C 8D 8E 8F 90 93 94 95 96 97 98 99 9A 9F A0 A1 A2 A3 A4 A5"x~text("ibm-437")~transform(stripMark:)~utf8= -- T'CueaaaaceeeiiiAAEooouuyOUfaiounN' T'CueaaaaceeeiiiAAEooouuyOUfaiounN' ooRexx> "83 8A 9A 9F C0 C1 C2 C3 C4 C5 C7 C8 C9 CA CB CC CD CE CF D1 D2 D3 D4 D5 D6 D8 D9 DA DB DC DD E0 E1 E2 E3 E4 E5 E7 E8 E9 EA EB EC ED EE EF F1 F2 F3 F4 F5 F6 F8 F9 FA FB FC FD FF"x~text("ibm-1252")~utf8= -- T'ƒŠšŸÀÁÂÃÄÅÇÈÉÊËÌÍÎÏÑÒÓÔÕÖØÙÚÛÜÝàáâãäåçèéêëìíîïñòóôõöøùúûüýÿ' T'ƒŠšŸÀÁÂÃÄÅÇÈÉÊËÌÍÎÏÑÒÓÔÕÖØÙÚÛÜÝàáâãäåçèéêëìíîïñòóôõöøùúûüýÿ' ooRexx> "83 8A 9A 9F C0 C1 C2 C3 C4 C5 C7 C8 C9 CA CB CC CD CE CF D1 D2 D3 D4 D5 D6 D8 D9 DA DB DC DD E0 E1 E2 E3 E4 E5 E7 E8 E9 EA EB EC ED EE EF F1 F2 F3 F4 F5 F6 F8 F9 FA FB FC FD FF"x~text("ibm-1252")~transform(stripMark:)~utf8= -- T'fSsYAAAAAACEEEEIIIINOOOOOOUUUUYaaaaaaceeeeiiiinoooooouuuuyy' T'fSsYAAAAAACEEEEIIIINOOOOOOUUUUYaaaaaaceeeeiiiinoooooouuuuyy' ooRexx> "C0 C1 C2 C3 C4 C5 C7 C8 C9 CA CB CC CD CE CF D1 D2 D3 D4 D5 D6 D8 D9 DA DB DC DD E0 E1 E2 E3 E4 E5 E7 E8 E9 EA EB EC ED EE EF F1 F2 F3 F4 F5 F6 F8 F9 FA FB FC FD FF"x~text("iso-8859-1")~utf8= -- T'ÀÁÂÃÄÅÇÈÉÊËÌÍÎÏÑÒÓÔÕÖØÙÚÛÜÝàáâãäåçèéêëìíîïñòóôõöøùúûüýÿ' T'ÀÁÂÃÄÅÇÈÉÊËÌÍÎÏÑÒÓÔÕÖØÙÚÛÜÝàáâãäåçèéêëìíîïñòóôõöøùúûüýÿ' ooRexx> "C0 C1 C2 C3 C4 C5 C7 C8 C9 CA CB CC CD CE CF D1 D2 D3 D4 D5 D6 D8 D9 DA DB DC DD E0 E1 E2 E3 E4 E5 E7 E8 E9 EA EB EC ED EE EF F1 F2 F3 F4 F5 F6 F8 F9 FA FB FC FD FF"x~text("iso-8859-1")~transform(stripMark:)~utf8= -- T'AAAAAACEEEEIIIINOOOOOOUUUUYaaaaaaceeeeiiiinoooooouuuuyy' T'AAAAAACEEEEIIIINOOOOOOUUUUYaaaaaaceeeeiiiinoooooouuuuyy' ooRexx> "83 8A 8E 
9A 9E 9F C0 C1 C2 C3 C4 C5 C7 C8 C9 CA CB CC CD CE CF D1 D2 D3 D4 D5 D6 D8 D9 DA DB DC DD E0 E1 E2 E3 E4 E5 E7 E8 E9 EA EB EC ED EE EF F1 F2 F3 F4 F5 F6 F8 F9 FA FB FC FD FF"x~text("windows-1252")~utf8= -- T'ƒŠŽšžŸÀÁÂÃÄÅÇÈÉÊËÌÍÎÏÑÒÓÔÕÖØÙÚÛÜÝàáâãäåçèéêëìíîïñòóôõöøùúûüýÿ' T'ƒŠŽšžŸÀÁÂÃÄÅÇÈÉÊËÌÍÎÏÑÒÓÔÕÖØÙÚÛÜÝàáâãäåçèéêëìíîïñòóôõöøùúûüýÿ' ooRexx> "83 8A 8E 9A 9E 9F C0 C1 C2 C3 C4 C5 C7 C8 C9 CA CB CC CD CE CF D1 D2 D3 D4 D5 D6 D8 D9 DA DB DC DD E0 E1 E2 E3 E4 E5 E7 E8 E9 EA EB EC ED EE EF F1 F2 F3 F4 F5 F6 F8 F9 FA FB FC FD FF"x~text("windows-1252")~transform(stripMark:)~utf8= -- T'fSZszYAAAAAACEEEEIIIINOOOOOOUUUUYaaaaaaceeeeiiiinoooooouuuuyy' T'fSZszYAAAAAACEEEEIIIINOOOOOOUUUUYaaaaaaceeeeiiiinoooooouuuuyy' -- several transformations ooRexx> "Père Noël"~transcodeTo("windows-1252")~transform(casefold:, stripMark:)~utf8= -- T'pere noel' T'pere noel' ooRexx> '50 E8 72 65 20 4E 6F EB 6C'x~~setEncoding("windows-1252")~transform(casefold:, stripMark:)~utf8= -- T'pere noel' T'pere noel' -- next: the transform is done on Byte string, which has no rule for stripMark. -- the accents are not removed. ooRexx> '50 E8 72 65 20 4E 6F EB 6C'x~byte~transform(casefold:, stripMark:)~~setEncoding("windows-1252")~utf8= -- T'père noël' T'père noël' -- =============================================================================== -- 2024 Mar 17 /* For consistency with other methods, add the optional named argument 'buffer' to [] c2g c2x x2b x2d Examples: */ ooRexx> buffer = .MutableBuffer~new ooRexx> "Tête à tête"~text[2, 5, :buffer]= -- M'ête à' M'ête à' ooRexx> "A"~text~c2g(:buffer)= -- M'ête à41' M'ête à41' ooRexx> "A"~text~c2x(:buffer)= -- M'ête à4141' M'ête à4141' ooRexx> "41"~text~x2b(:buffer)= -- M'ête à414101000001' M'ête à414101000001' ooRexx> "41"~text~x2d(:buffer)= -- M'ête à41410100000165' M'ête à41410100000165' /* For compatibility with Python, add support for \N{Unicode name}. 
Example: */ ooRexx> "\N{for all} x \N{there exists} y such that x+y=0"~text~unescape= -- T'∀ x ∃ y such that x+y=0' T'∀ x ∃ y such that x+y=0' /* Add support for code point labels. Examples: */ ooRexx> .unicode~character("<control-000A>")= -- ( "" U+000A Cc 0 "", "LINE FEED", "NEW LINE", "END OF LINE", "LF", "NL", "EOL" ) ( "" U+000A Cc 0 "", "LINE FEED", "NEW LINE", "END OF LINE", "LF", "NL", "EOL" ) ooRexx> "hello\N{<control-000A>}bye"~text~unescape= -- T'hello[0A]bye' T'hello[0A]bye' ooRexx> "hello\U{<control-000A>}bye"~text~unescape= -- T'hello[0A]bye' T'hello[0A]bye' /* Modify the display of UnicodeCharacter properties to show the codepoint values in U+ and 0x notation. */ ooRexx> .Unicode["🤶"]~properties= a Directory (30 items) 'aliases' : (an Array no shape, 0 items) 'bidiClass' : 19 'bidiClassName' : 'ON' 'bidiMirrored' : 0 'boundClass' : 19 'boundClassName' : 'EXTENDED_PICTOGRAPHIC' 'category' : 22 'categoryName' : 'So' 'charWidth' : 2 'codepoint' : 'U+1F936' 'combiningClass' : 0 'controlBoundary' : 0 'decompositionType' : 0 'decompositionTypeName' : 'None' 'ignorable' : 0 'isLower' : 0 'isUpper' : 0 'name' : 'MOTHER CHRISTMAS' 'toLowerFull' : 'U+1F936' 'toLowerSimple' : 'U+1F936' 'toTitleFull' : 'U+1F936' 'toTitleSimple' : 'U+1F936' 'toUpperFull' : 'U+1F936' 'toUpperSimple' : 'U+1F936' 'Unicode' : '0x36F90100' 'UTF16BE' : '0xD83EDD36' 'UTF16LE' : '0x3ED836DD' 'UTF32BE' : '0x0001F936' 'UTF32LE' : '0x36F90100' 'UTF8' : '0xF09FA4B6' /* Modification of the rule for buffer encoding neutrality. old: If left is a buffer with no encoding then use the right encoding. new: If left is an empty buffer with no encoding then use the right encoding. Impacted methods: .Encoding~compatibleEncoding .StringIndexer~asEncodingFor Examples: */ ooRexx> buffer = .MutableBuffer~new -- This is an empty buffer with no explicit encoding: -- The rule for encoding neutrality will apply. 
ooRexx> buffer~description= -- 'UTF-8 ASCII by default (0 byte)' 'UTF-8 ASCII by default (0 byte)' ooRexx> "Test"~text~utf16~left(2, :buffer)= -- M'[00]T[00]e' M'[00]T[00]e' -- The buffer encoding is now UTF-16BE. ooRexx> buffer~description= -- 'UTF-16BE (4 bytes)' 'UTF-16BE (4 bytes)' ooRexx> buffer = .MutableBuffer~new("not empty") ooRexx> buffer~description= -- 'UTF-8 ASCII (9 bytes)' 'UTF-8 ASCII (9 bytes)' ooRexx> -- Note: no longer "UTF-8 ASCII by default" because the string literal has now a stored encoding -- Here, the rule for encoding neutrality does not apply. ooRexx> "Test"~text~utf16~left(2, :buffer)= -- Encoding: cannot append UTF-16BE to UTF-8 ASCII 'not empty' Encoding: cannot append UTF-16BE to UTF-8 ASCII 'not empty'. Error code= 23.900 ooRexx> -- Note: no longer "UTF-8 ASCII by default" because the string literal has now a stored encoding /* New method ~u2c on String and RexxText. Create a Unicode32 text from a sequence of U+xxxx. The U+ string/text must be compatible with a byte encoding (Byte or subclass, UTF-8 ASCII, WTF-8 ASCII). In other words, will not support a sequence of U+xxxx encoded in UTF-16 or UTF-32. 
Examples: */ -- U+ string ooRexx> "U+004E u+006F U+00EB U+006C U+1F936 U+1F385"~u2c~description= -- 'Unicode32 (6 characters, 6 codepoints, 24 bytes, 0 error)' 'Unicode32 (6 characters, 6 codepoints, 24 bytes, 0 error)' ooRexx> "U+004E u+006F U+00EB U+006C U+1F936 U+1F385"~u2c~c2x= -- '4E000000 6F000000 EB000000 6C000000 36F90100 85F30100' '4E000000 6F000000 EB000000 6C000000 36F90100 85F30100' ooRexx> "U+004E u+006F U+00EB U+006C U+1F936 U+1F385"~u2c~utf8= -- T'Noël🤶🎅' T'Noël🤶🎅' -- U+ text ooRexx> "U+004E u+006F U+00EB U+006C U+1F936 U+1F385"~text~u2c~description= -- 'Unicode32 (6 characters, 6 codepoints, 24 bytes, 0 error)' 'Unicode32 (6 characters, 6 codepoints, 24 bytes, 0 error)' ooRexx> "U+004E u+006F U+00EB U+006C U+1F936 U+1F385"~text~u2c~c2x= -- '4E000000 6F000000 EB000000 6C000000 36F90100 85F30100' '4E000000 6F000000 EB000000 6C000000 36F90100 85F30100' ooRexx> "U+004E u+006F U+00EB U+006C U+1F936 U+1F385"~text~u2c~utf8= -- T'Noël🤶🎅' T'Noël🤶🎅' ooRexx> buffer = .MutableBuffer~new ooRexx> "U+0031 U+0032"~text~u2c(:buffer)= -- M'1[000000]2[000000]' M'1[000000]2[000000]' -- The buffer encoding is now Unicode32. ooRexx> buffer~description= -- 'Unicode32 (8 bytes)' 'Unicode32 (8 bytes)' -- Examples of invalid U+ string/text ooRexx> "U+004E u+006F U+00EB U+006C U+1F936 U+1F385"~text~utf16~u2c= -- UTF-16BE '[00]U[00]+[00]0[00]0[00]4[00]E[00] ...' is not compatible with an U+ string. UTF-16BE '[00]U[00]+[00]0[00]0[00]4[00]E[00] ...' is not compatible with an U+ string. Error code= 23.900 ooRexx> "A+004E"~u2c= -- Expecting U+ or u+ followed by 4..6 hex digits, got 'A+004E' Expecting U+ or u+ followed by 4..6 hex digits, got 'A+004E'. Error code= 93.900 ooRexx> "u+4E"~u2c= -- Expecting U+ or u+ followed by 4..6 hex digits, got 'u+4E' Expecting U+ or u+ followed by 4..6 hex digits, got 'u+4E'. 
Error code= 93.900 ooRexx> "u+000004E"~u2c= -- Expecting U+ or u+ followed by 4..6 hex digits, got 'u+000004E' Expecting U+ or u+ followed by 4..6 hex digits, got 'u+000004E'. Error code= 93.900 /* New supported methods on RexxText: - d2c forward to String, return a Text or a MutableBuffer - d2x forward to String, return a String or a MutableBuffer Examples: */ ooRexx> "65"~text~d2c= -- T'A' T'A' ooRexx> "65"~text~d2x= -- 41 41 ooRexx> buffer = .MutableBuffer~new ooRexx> "65"~text~d2c(:buffer)= -- M'A' M'A' ooRexx> "65"~text~d2x(:buffer)= -- M'A41' M'A41' ooRexx> buffer~encoding = "utf16" ooRexx> "65"~text~d2c(:buffer)= -- Encoding: cannot append UTF-8 ASCII by default 'A' to UTF-16BE 'A41' (was Byte ASCII 'A') Encoding: cannot append UTF-8 ASCII by default 'A' to UTF-16BE 'A41'. Error code= 23.900 /* Partial implementation of translate (ASCII string only): Examples: */ ooRexx> "hello"~text~translate= -- 'HELLO' 'HELLO' ooRexx> "hello"~text~translate(,,"x")= -- 'xxxxx' 'xxxxx' ooRexx> "hello"~text~translate(,"el","x")= -- 'hxxxo' 'hxxxo' -- =============================================================================== -- 2023 Dec 04 /* Reworked the implementation of caselessMatchChar, matchChar. 
*/ ooRexx> "Bundesschnellstraße"~text~caselessMatchChar(18, "s")= -- now 0: "ß" casefolded to "ss" doesn't match "s" 0 ooRexx> "BAFFLE"~text~caselessMatchChar(5, "ffl")= -- now 0: "L" casefolded to "l" doesn't match "ffl" casefolded to "ffl" (no more iteration on each character of "ffl") 0 ooRexx> "baffle"~text~matchChar(3, "f", normalization:.Unicode~NFKD)= -- now 0: "ffl" transformed to "ffl" doesn't match "f" 0 /* After rework, I have these other differences: */ -- Case 1 sounds good (no more iteration on each character of "ffl") ooRexx> "BAFFLE"~text~caselessMatchChar(3, "ffl")= -- 0 was 1 "ffl" becomes "ffl" (3 graphemes), there is a match on "f" at 3 0 ooRexx> "BAFFLE"~text~caselessPos("ffl", aslist:, aligned:0)= a List (1 items) 0 : [+3.3,+6.6] /* a List (1 items) 0 : [+3.3,+6.6] */ -- I get the same result as before by explicitely decomposing the ligature "ffl" to "ffl" BEFORE : ooRexx> "BAFFLE"~text~caselessMatchChar(3, "ffl"~text~transform(normalization:.Unicode~NFKD))= -- 1 1 -- here, it's ok because the match is on several characters ooRexx> "BAFFLE"~text~caselessMatch(3, "ffl")= -- 1 1 -- Case 2 sounds good (no more iteration on each character of "ffl") ooRexx> "BAFFLE"~text~caselessMatchChar(5, "ffl")= -- 0 was 1 "ffl" becomes "ffl" (3 graphemes), there is a match on "l" at 5 0 ooRexx> "BAFFLE"~text~caselessMatch(5, "ffl")= -- 0 0 -- Case 3 sounds good (no more iteration on each character of "ffl") ooRexx> "baffle"~text~caselessMatchChar(3, "F")= -- 0 was 1 "ffl" at 3 (1 grapheme) becomes "ffl" (3 graphemes), there is a match on "f" 0 -- Case 4 sound good (hum... did I really think that the character "ffl" at pos 3 can match an "l"?) 
ooRexx> "baffle"~text~caselessMatchChar(3, "L")= -- 0 was 1 "ffl" at 3 (1 grapheme) becomes "ffl" (3 graphemes), there is a match on "l" 0 -- =============================================================================== -- 2023 Nov 28 /* https://github.com/unicode-org/icu4x/issues/4365 Segmenter does not work correctly in some languages let text = "as `নমস্কাৰ, আপোনাৰ কি খবৰ?` hi `हैलो, क्या हाल हैं?` mai `नमस्ते अहाँ केना छथि?` mr `नमस्कार, कसे आहात?` ne `नमस्ते, कस्तो हुनुहुन्छ?` or `ନମସ୍କାର ତୁମେ କେମିତି ଅଛ?` sa `हे त्वं किदं असि?` te `హాయ్, ఎలా ఉన్నారు?`"; icu4c: 151 rust: 161 --- ICU4X and ICU4C are just using different definitions of EGCs; ICU4C has had a tailoring for years which has just been incorporated into Unicode 15.1, whereas ICU4X implements the 15.0 version without that tailoring. The difference is the handling of aksaras in some indic scripts: in Unicode 15.1 (and in any recent ICU4C) क्या is one EGC, but it is two EGCs (क्, या) in untailored Unicode 15.0 (and in ICU4X). --- executor: 151 */ ooRexx> s="as `নমস্কাৰ, আপোনাৰ কি খবৰ?`"'0D'x"hi `हैलो, क्या हाल हैं?`"'0D'x"mai `नमस्ते अहाँ केना छथि?`"'0D'x"mr `नमस्कार, कसे आहात?`"'0D'x"ne `नमस्ते, कस्तो हुनुहुन्छ?`"'0D'x"or `ନମସ୍କାର ତୁମେ କେମିତି ଅଛ?`"'0D'x"sa `हे त्वं किदं असि?`"'0D'x"te `హాయ్, ఎలా ఉన్నారు?`" ooRexx> s~text~length= -- 151 151 /* https://boyter.org/posts/unicode-support-what-does-that-actually-mean/ According wikipedia the character ſ is a long s. Which means if you want to support unicode you need to ensure that if someone does a case insensitive comparison then the following examples are all string equivalent. 
ſecret == secret == Secret ſatisfaction == satisfaction == ſatiſfaction == Satiſfaction == SatiSfaction === ſatiSfaction */ ooRexx> "ſ"~text~casefold= -- "s" T's' ooRexx> "ſecret"~text~caselessEquals("secret")= -- 1 1 ooRexx> "ſecret"~text~caselessEquals("Secret")= -- 1 1 ooRexx> "ſatisfaction"~text~caselessEquals("satisfaction")= -- 1 1 ooRexx> "satisfaction"~text~caselessEquals("ſatiſfaction")= -- 1 1 ooRexx> "ſatiſfaction"~text~caselessEquals("Satiſfaction")= -- 1 1 ooRexx> "Satiſfaction"~text~caselessEquals("SatiSfaction")= -- 1 1 ooRexx> "SatiSfaction"~text~caselessEquals("ſatiSfaction")= -- 1 1 -- =============================================================================== -- 2023 Nov 21 /* To rework? matchChar sometimes returns .true whereas pos returns 0. Examples in demoTextCompatibility: KO? 2023.12.04: yes */ ooRexx> "Bundesschnellstraße"~text~caselessMatchChar(18, "s")= -- now 0, was 1 before 2023.12.04 0 ooRexx> "Bundesschnellstraße"~text~caselessPos("s", aslist:, aligned:0)= a List (5 items) 0 : [+6.6,+7.7] 1 : [+7.7,+8.8] 2 : [+14.14,+15.15] 3 : [+18.18,-18.19] 4 : [-18.19,+19.20] /* a List (5 items) 0 : [+6.6,+7.7] 1 : [+7.7,+8.8] 2 : [+14.14,+15.15] 3 : [+18.18,-18.19] 4 : [-18.19,+19.20] */ /* KO? 2023.12.04: yes */ ooRexx> "BAFFLE"~text~caselessMatchChar(5, "ffl")= -- now 0, was 1 before 2023.12.04 0 ooRexx> "BAFFLE"~text~caselessPos("ffl", aslist:, aligned:0)= a List (1 items) 0 : [+3.3,+6.6] /* a List (1 items) 0 : [+3.3,+6.6] */ /* KO? 
2023.12.04: yes */ ooRexx> "baffle"~text~matchChar(3, "f", normalization:.Unicode~NFKD)= -- now 0, was 1 before 2023.12.04 0 ooRexx> "baffle"~text~pos("f", normalization:.Unicode~NFKD, aslist:, aligned:0)= a List (2 items) 0 : [+3.3,-3.4] 1 : [-3.4,-3.5] /* a List (2 items) 0 : [+3.3,-3.4] 1 : [-3.4,-3.5] */ -- =============================================================================== -- 2023 Nov 17 /* Rework the implementation of caselessCompare, to get the right answer here: */ ooRexx> "sss"~text~caselessCompare("", "ß")= -- 3 (not 4 because the 3rd 's' matches only half of the casefolded pad "ß" which is "ss") 3 ooRexx> "straßssßßssse"~text~caselessCompare("stra", "ß")= -- 12 (not 13 because the last 's' matches only half of the casefolded pad "ß" which is "ss") 12 /* Analysis using Unicode scalars: ----------------------------------------- CASE 1 : aligned in self, aligned in arg1 ----------------------------------------- */ ooRexx> "straßssßßssse"~text~compare("stra", "ß")= -- 6 6 /* "straßssßßssse"~text~unicode~c2g= 1 2 3 4 5 6 7 8 9 0 1 2 3 -- (external character indexes) s t r a ß s s ß ß s s s e 73 74 72 61 DF 73 73 DF DF 73 73 73 65 -- (unicode scalars) ------------------------------------------- "straßßßßßßßßß"~text~unicode~c2g= 1 2 3 4 5 6 7 8 9 0 1 2 3 -- (external character indexes) s t r a ß ß ß ß ß ß ß ß ß 73 74 72 61 DF DF DF DF DF DF DF DF DF -- (unicode scalars) | first different unicode scalar */ /* Debug output: the indexer supports the named parameter debug "straßssßßssse"~text~indexer~compare("stra", "ß", debug:.true)= selfTextTransformer~iSubtext~string = straßssßßssse selfTextTransformer~iSubtext~c2g = 73 74 72 61 C39F 73 73 C39F C39F 73 73 73 65 selfTextTransformedString~length = 16 textTextTransformer~iSubtext~string = straßßßßßßßßß textTextTransformer~iSubtext~c2g = 73 74 72 61 C39F C39F C39F C39F C39F C39F C39F C39F C39F textTextTransformedString~length = 22 posB1 = 7 posC1 = +6.7 posB2 = 7 posC2 = +6.7 6 */ /* 
--------------------------------------------- CASE 2 : aligned in self, not aligned in arg1 --------------------------------------------- */ ooRexx> "straßssßßssse"~text~caselessCompare("stra", "ß")= -- 12 12 /* "straßssßßssse"~text~unicode~c2g= 1 2 3 4 5 6 7 8 9 0 1 2 3 -- (external character indexes) s t r a ß s s ß ß s s s e 73 74 72 61 DF 73 73 DF DF 73 73 73 65 -- (unicode scalars) "straßssßßssse"~text~casefold~unicode~c2g= 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 -- (internal byte indexes) s t r a s s s s s s s s s s s e 73 74 72 61 73 73 73 73 73 73 73 73 73 73 73 65 -- (unicode scalars) ---------------------------------------------------- "straßßßßßßßßß"~text~unicode~c2g= 1 2 3 4 5 6 7 8 9 0 1 2 3 -- (external character indexes) s t r a ß ß ß ß ß ß ß ß ß 73 74 72 61 DF DF DF DF DF DF DF DF DF -- (unicode scalars) "straßßßßßßßßß"~text~casefold~unicode~c2g= 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 -- (internal byte indexes) s t r a ß ß ß ß ß ß ß ß ß 73 74 72 61 73 73 73 73 73 73 73 73 73 73 73 73 73 73 73 73 73 73 -- (unicode scalars) | | | +-- 65 at (13,16) <> 73 at (-10,+16) but can't be 13 because would match only the first 73 of ß at (10,15) +-- yes, 12. 
*/ /* Debug output: the indexer supports the named parameter debug "straßssßßssse"~text~indexer~caselessCompare("stra", "ß", debug:.true)= selfTextTransformer~iSubtext~string = strassssssssssse selfTextTransformer~iSubtext~c2g = 73 74 72 61 73 73 73 73 73 73 73 73 73 73 73 65 selfTextTransformedString~length = 16 textTextTransformer~iSubtext~string = strassssssssssssssssss textTextTransformer~iSubtext~c2g = 73 74 72 61 73 73 73 73 73 73 73 73 73 73 73 73 73 73 73 73 73 73 textTextTransformedString~length = 22 posB1 = 16 posC1 = +13.16 posB2 = 16 posC2 = -10.16 12 */ -- Another way to test: at which moment the growing padded string on the right will no longer be found at pos 1 -- 1234567890123 ooRexx> "straßssßßssse"~text~caselessPos("straß")= -- 1 1 -- straß ooRexx> "straßssßßssse"~text~caselessPos("straßß")= -- 1 1 -- straßß ooRexx> "straßssßßssse"~text~caselessPos("straßßß")= -- 1 1 -- straßß ß ooRexx> "straßssßßssse"~text~caselessPos("straßßßß")= -- 1 1 -- straßß ßß ooRexx> "straßssßßssse"~text~caselessPos("straßßßßß")= -- 1 1 -- straßß ßßß ooRexx> "straßssßßssse"~text~caselessPos("straßßßßßß")= -- 0 The last ß doesn't match "se" at 12 0 -- straßß ßßß ß /* --------------------------------------------- CASE 3 : not aligned in self, aligned in arg1 --------------------------------------------- */ ooRexx> "stra"~text~caselessCompare("straßssßßssse", "ß")= -- 9 9 /* 1 2 3 4 5 6 7 8 9 0 1 2 3 -- (external character indexes) s t r a ß ß ß ß ß ß ß ß ß 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 -- (internal byte indexes) 73 74 72 61 73 73 73 73 73 73 73 73 73 73 73 73 73 73 73 73 73 73 -- (unicode scalars of the casefolded string) ----------------------------------------------------------------- 1 2 3 4 5 6 7 8 9 0 1 2 3 -- (external character indexes) s t r a ß s s ß ß s s s e 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 -- (internal byte indexes) 73 74 72 61 73 73 73 73 73 73 73 73 73 73 73 65 -- (unicode scalars of the casefolded string) | | | + 73 at (-10,16) <> 65 at (13,16) 
+-- yes, 9. */ /* Debug output: the indexer supports the named parameter debug "stra"~text~indexer~caselessCompare("straßssßßssse", "ß", debug:.true)= selfTextTransformer~iSubtext~string = strassssssssssssssssss selfTextTransformer~iSubtext~c2g = 73 74 72 61 73 73 73 73 73 73 73 73 73 73 73 73 73 73 73 73 73 73 selfTextTransformedString~length = 22 textTextTransformer~iSubtext~string = strassssssssssse textTextTransformer~iSubtext~c2g = 73 74 72 61 73 73 73 73 73 73 73 73 73 73 73 65 textTextTransformedString~length = 16 posB1 = 16 posC1 = -10.16 posB2 = 16 posC2 = +13.16 9 */ ooRexx> "straß" ~text~caselessCompare("straßssßßssse", "ß")= -- 9 9 ooRexx> "straßß" ~text~caselessCompare("straßssßßssse", "ß")= -- 9 9 ooRexx> "straßßß" ~text~caselessCompare("straßssßßssse", "ß")= -- 9 9 ooRexx> "straßßßß" ~text~caselessCompare("straßssßßssse", "ß")= -- 9 9 ooRexx> "straßßßßß" ~text~caselessCompare("straßssßßssse", "ß")= -- 9 9 ooRexx> "straßßßßßß" ~text~caselessCompare("straßssßßssse", "ß")= -- 9 9 ooRexx> "straßßßßßßß" ~text~caselessCompare("straßssßßssse", "ß")= -- 9 9 ooRexx> "straßßßßßßßß" ~text~caselessCompare("straßssßßssse", "ß")= -- 9 9 ooRexx> "straß" ~text~caselessCompareTo("straßssßßssse")= -- -1 -1 ooRexx> "straßß" ~text~caselessCompareTo("straßssßßssse")= -- -1 -1 ooRexx> "straßßß" ~text~caselessCompareTo("straßssßßssse")= -- -1 -1 ooRexx> "straßßßß" ~text~caselessCompareTo("straßssßßssse")= -- -1 -1 ooRexx> "straßßßßß" ~text~caselessCompareTo("straßssßßssse")= -- -1 up to 9 characters, it's lesser -1 ooRexx> "straßßßßßß" ~text~caselessCompareTo("straßssßßssse")= -- 1 from 10 characters, it's greater 1 ooRexx> "straßßßßßßß" ~text~caselessCompareTo("straßssßßssse")= -- 1 1 ooRexx> "straßßßßßßßß" ~text~caselessCompareTo("straßssßßssse")= -- 1 1 ooRexx> "stra" ~caselessCompare("strasssssse", "s")= -- 11 11 ooRexx> "stra"~text~caselessCompare("strasssssse", "s")= -- 11 11 ooRexx> "strasssssse" ~caselessCompare("stra", "s")= -- 11 11 ooRexx> 
"strasssssse"~text~caselessCompare("stra", "s")= -- 11 11 ooRexx> "strà" ~caselessCompare("stràsssssse", "s")= -- 11 (was 12 before automatic conversion of string literals to text) 11 ooRexx> "strà"~text~caselessCompare("stràsssssse", "s")= -- 11 11 ooRexx> "stràsssssse" ~caselessCompare("strà", "s")= -- 11 (was 12 before automatic conversion of string literals to text) 11 ooRexx> "stràsssssse"~text~caselessCompare("strà", "s")= -- 11 11 /* --------------------------------------------- CASE 4 : not aligned in self, aligned in arg1 --------------------------------------------- */ ooRexx> iota_dt = "\u{GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS}"~text~unescape ooRexx> ("a" iota_dt~casefold "b")~compare("a" iota_dt, normalization: 0)= -- 3 3 /* Debug output: the indexer supports the named parameter debug ("a" iota_dt~casefold "b")~indexer~compare("a" iota_dt, normalization: 0, debug:.true)= selfTextTransformer~iSubtext~string = a ΐ b selfTextTransformer~iSubtext~c2g = 61 20 CEB9CC88CC81 20 62 selfTextTransformedString~length = 10 textTextTransformer~iSubtext~string = a ΐ textTextTransformer~iSubtext~c2g = 61 20 CE90 20 20 textTextTransformedString~length = 6 posB1 = 4 posC1 = -3.4 posB2 = 4 posC2 = -3.4 3 */ -- =============================================================================== -- 2023 Oct 04 /* Reactivate the constraint "self~isCompatibleWithByteString" when converting a RexxText to a String. It can be disabled by setting .Unicode~unckeckedConversionToString = .true Currently, the only case where this constraint is disabled is when testing the regular expressions in diary_examples.rex. Some checks of encoding compatibiliy were missing. 
Added in: - compareText: caselessCompare, compare - compareToText: caselessCompareTo, compareTo - endsWithText: caselessEndsWith, endsWith - matchCharText: caselessMatchChar, matchChar - matchText: caselessMatch, match - posText: caselessPos, pos New supported methods: - abs forward to String, return a String - b2x forward to String, return a String - bitAnd forward to String, return a String - bitOr forward to String, return a String - bitXor forward to String, return a String Examples: */ ooRexx> (-1)~text~abs= -- 1 1 ooRexx> ("-x")~text~abs= -- ABS method target must be a number; found "-x". ABS method target must be a number; found "-x". Error code= 93.943 ooRexx> ("-é")~text~abs= -- UTF-8 not-ASCII '-é' is not compatible with a Rexx numeric value. UTF-8 not-ASCII '-é' is not compatible with a Rexx numeric value. Error code= 23.900 ooRexx> 100~text~b2x= -- 4 4 ooRexx> "x"~text~b2x= -- Only 0, 1, and whitespace characters are valid in a binary string; character found "x". Only 0, 1, and whitespace characters are valid in a binary string; character found "x". Error code= 93.934 ooRexx> "é"~text~b2x= -- UTF-8 not-ASCII 'é' is not compatible with a Rexx numeric value. UTF-8 not-ASCII 'é' is not compatible with a Rexx numeric value. Error code= 23.900 ooRexx> "12"x~text~bitAnd= -- '[12]' ("12"x) '[12]' ooRexx> "73"x~text~bitAnd("27"x~text)= -- '#' ("23"x) '#' ooRexx> "13"x~text~bitAnd("5555"x~text)= -- '[11]U' ("1155"x) '[11]U' ooRexx> "13"x~text~bitAnd("5555"x~text,"74"x~text)= -- '[11]T' ("1154"x) '[11]T' ooRexx> "pQrS"~text~bitAnd(,"DF"x~text("byte"))= -- "PQRS" 'PQRS' ooRexx> "12"x~text~bitOr= -- '[12]' ("12"x) '[12]' ooRexx> "15"x~text~bitOr("24"x~text)= -- 5 ("35"x) 5 ooRexx> "15"x~text~bitOr("2456"x~text)= -- '5V' ("3556"x) '5V' ooRexx> "15"x~text~bitOr("2456"x~text,"F0"x~text("byte"))= -- '5?' 
("35F6"x) '5�' ooRexx> "1111"x~text~bitOr(,"4D"x~text)= -- ']]' ("5D5D"x) ']]' ooRexx> "pQrS"~text~bitOr(,"20"x~text)= -- "pqrs" 'pqrs' ooRexx> "12"x~text~bitXor= -- '[12]' ("12"x) '[12]' ooRexx> "12"x~text~bitXor("22"x~text)= -- 0 ("30"x) 0 ooRexx> "1211"x~text~bitXor("22"x~text)= -- '0[11]' ("3011"x) '0[11]' ooRexx> "1111"x~text~bitXor("444444"x~text)= -- 'UUD' ("555544"x) 'UUD' ooRexx> "1111"x~text~bitXor("444444"x~text,"40"x~text)= -- 'UU[04]' ("555504"x) 'UU[04]' ooRexx> "1111"x~text~bitXor(,"4D"x~text)= -- '\\' ("5C5C"x) '\\' ooRexx> "C711"x~text~bitXor("222222"x~text," "~text)= -- '?3[02]' ("E53302"x) '�3[02]' /* Implementation of caselessStartsWith, startsWith: (forwards to caselessPos or pos, and returns .true if result == 1) (was already implemented, waiting for 'pos' implementation) Examples: */ ooRexx> "Père"~text~c2g= -- '50 C3A8 72 65' '50 C3A8 72 65' ooRexx> "Père"~text~startsWith("50"x)= -- true 1 ooRexx> "Père"~text~startsWith("50C3"x)= -- was Invalid UTF-8 string (utf8proc error because "50C3"x is an invalid UTF-8 encoding) Invalid UTF-8 string. Error code= 22.900 ooRexx> "Père"~text~startsWith("50C3"x~text("byte"))= -- false (not aligned) (was Encoding: cannot compare Byte not-ASCII 'P?' 
with UTF-8 not-ASCII 'Père') 0 ooRexx> "Père"~text~startsWith("50C3A8"x)= -- true 1 ooRexx> "éßffl"~text~c2g= -- 'C3A9 C39F EFAC84' 'C3A9 C39F EFAC84' ooRexx> "éßffl"~text~casefold~c2g= -- 'C3A9 73 73 66 66 6C' 'C3A9 73 73 66 66 6C' ooRexx> "éßffl"~text~caselessStartsWith("É")= -- true 1 ooRexx> "éßffl"~text~caselessStartsWith("És")= -- false 0 ooRexx> "éßffl"~text~caselessStartsWith("Éss")= -- true 1 ooRexx> "éßffl"~text~caselessStartsWith("Éssf")= -- false 0 ooRexx> "éßffl"~text~caselessStartsWith("Éssff")= -- false 0 ooRexx> "éßffl"~text~caselessStartsWith("Éssffl")= -- true 1 ooRexx> "noël👩👨👩👧🎅"~text~startsWith("noël👩")= -- false 0 ooRexx> "noël👩👨👩👧🎅"~text~startsWith("noël👩", aligned:.false)= -- true 1 ooRexx> "noël👩👨👩👧🎅"~text~startsWith("noël👩👨👩")= -- false 0 ooRexx> "noël👩👨👩👧🎅"~text~startsWith("noël👩👨👩", aligned:.false)= -- true 1 ooRexx> "noël👩👨👩👧🎅"~text~startsWith("noël👩👨👩👧")= -- true 1 -- =============================================================================== -- 2023 Oct 03 /* Move the routine createCharacterTranscodingTable from byte_common.cls to byte_encoding.cls. It's used only by Byte_Encoding and its subclasses. The 'text' method of UnicodeCharacter has been replaced by 'transcodeTo'. Reason 1: the byte encodings were not supported correctly. Reason 2: the fact a transcoding is needed is against the definition of the 'text' method (apply a view on the bytes without modifying them). Finalize the support of replacement character during transcoding. A replacement character can be .nil or "" or a character. When a character, it can be a String or a RexxText made of one codepoint or a UnicodeCharacter. In all cases, the corresponding codepoint is used. This codepoint is transcoded to the target encoding. Behavior when a source codepoint does not have a matching target codepoint: - When the replacement character is .nil, an error is raised. 
- When the replacement character is "", the source codepoint is ignored (not transcoded) - Otherwise the source codepoint is replaced by the replacement character. Reminder: if the 'strict' named argument is false (default) then the fallback codepoint transcodings are used, if any. So when 'strict' is false, potentially more source could be transcoded. Examples: */ ooRexx> -- The Windows-1252 encoding has some fallback codepoint transcodings. ooRexx> -- HOP is one of them: 81x --> +U0081 only when strict:.false ooRexx> ("No" || "EB"x || "l" || "81"x)~text("windows-1252")~utf8(strict:.false)= -- T'Noël (strict:.false is the default) T'Noël' ooRexx> ("No" || "EB"x || "l" || "81"x)~text("windows-1252")~utf8(strict:.false)~unicodecharacters== an Array (shape [5], 5 items) 1 : ( "N" U+004E Lu 1 "LATIN CAPITAL LETTER N" ) 2 : ( "o" U+006F Ll 1 "LATIN SMALL LETTER O" ) 3 : ( "ë" U+00EB Ll 1 "LATIN SMALL LETTER E WITH DIAERESIS" ) 4 : ( "l" U+006C Ll 1 "LATIN SMALL LETTER L" ) 5 : ( "" U+0081 Cc 0 "", "HIGH OCTET PRESET", "HOP" ) ooRexx> ("No" || "EB"x || "l" || "81"x)~text("windows-1252")~utf8(strict:.true)= -- Cannot convert windows-1252 not-ASCII character 129 (81) at byte-position 5 to UTF-8. Cannot convert windows-1252 not-ASCII character 129 (81) at byte-position 5 to UTF-8. Error code= 23.900 ooRexx> ("No" || "EB"x || "l" || "81"x)~text("windows-1252")~utf8(strict:.true, replacementCharacter:"")= -- T'Noël' T'Noël' ooRexx> ("No" || "EB"x || "l" || "81"x)~text("windows-1252")~utf8(strict:.true, replacementCharacter:"#")= -- T'Noël#' T'Noël#' ooRexx> ("No" || "EB"x || "l" || "81"x)~text("windows-1252")~utf8(strict:.true, replacementCharacter:"🎅")= -- T'Noël🎅' T'Noël🎅' ooRexx> "Noël\u{HOP}"~text("utf8")~unescape~transcodeTo("byte")= -- Cannot convert UTF-8 not-ASCII codepoint 235 (EB) at position 3 to Byte. Cannot convert UTF-8 not-ASCII codepoint 235 (EB) at position 3 to Byte. 
Error code= 23.900 ooRexx> "Noël\u{HOP}"~text("utf8")~unescape~transcodeTo("windows-1252")= -- T'No?l?' T'No�l�' ooRexx> "Noël\u{HOP}"~text("utf8")~unescape~transcodeTo("windows-1252")~c2x= -- '4E 6F EB 6C 81' '4E 6F EB 6C 81' ooRexx> "Noël\u{HOP}"~text("utf8")~unescape~transcodeTo("windows-1252", strict:)= -- Cannot convert UTF-8 not-ASCII codepoint 129 (81) at position 5 to windows-1252. Cannot convert UTF-8 not-ASCII codepoint 129 (81) at position 5 to windows-1252. Error code= 23.900 ooRexx> -- "byte" encoding: only 00..7F can be transcoded ooRexx> ("No" || "EB"x || "l" || "81"x)~text("byte")~utf8= -- Cannot convert Byte not-ASCII character 235 (EB) at byte-position 3 to UTF-8. Cannot convert Byte not-ASCII character 235 (EB) at byte-position 3 to UTF-8. Error code= 23.900 ooRexx> ("No" || "EB"x || "l" || "81"x)~text("byte")~utf8(replacementCharacter:"")= -- T'Nol' T'Nol' ooRexx> ("No" || "EB"x || "l" || "81"x)~text("byte")~utf8(replacementCharacter:"#")= -- T'No#l#' 1 replacement character for ë because "ë" is 'EB'x T'No#l#' ooRexx> ("No" || "EB"x || "l" || "81"x)~text("byte")~utf8(replacementCharacter:"🎅")= -- T'No🎅l🎅' T'No🎅l🎅' ooRexx> ("No" || "EB"x || "l" || "81"x)~text("byte")~utf8(replacementCharacter:"🎅"~text)= -- T'No🎅l🎅' T'No🎅l🎅' ooRexx> ("No" || "EB"x || "l" || "81"x)~text("byte")~utf8(replacementCharacter:.unicode["Father Christmas"])= -- T'No🎅l🎅' T'No🎅l🎅' ooRexx> "Noël"~text("byte")~utf8(replacementCharacter:"")= -- T'Nol' T'Nol' ooRexx> "Noël"~text("byte")~utf8(replacementCharacter:"#")= -- T'No##l' 2 replacement characters for ë because "ë" is 'C3 AB'x T'No##l' ooRexx> "Noël"~text("byte")~utf8(replacementCharacter:"🎅")= -- T'No🎅🎅l' T'No🎅🎅l' ooRexx> "Noël"~text("byte")~utf8(replacementCharacter:"🎅🎅")= -- The transcoded replacement character must have at most one codepoint, got UTF-8 not-ASCII (2 characters, 2 codepoints, 8 bytes, 0 error) '🎅🎅'. 
The transcoded replacement character must have at most one codepoint, got UTF-8 not-ASCII (2 characters, 2 codepoints, 8 bytes, 0 error) '🎅🎅'. Error code= 23.900 ooRexx> "Noël"~text("byte")~utf8(replacementCharacter:"🎅🎅"~text)= -- The transcoded replacement character must have at most one codepoint, got UTF-8 not-ASCII (2 characters, 2 codepoints, 8 bytes, 0 error) '🎅🎅'. The transcoded replacement character must have at most one codepoint, got UTF-8 not-ASCII (2 characters, 2 codepoints, 8 bytes, 0 error) '🎅🎅'. Error code= 23.900 ooRexx> "Noël"~text("utf8")~transcodeTo("byte")= -- Cannot convert UTF-8 not-ASCII codepoint 235 (EB) at position 3 to Byte. Cannot convert UTF-8 not-ASCII codepoint 235 (EB) at position 3 to Byte. Error code= 23.900 ooRexx> "Noël"~text("utf8")~transcodeTo("byte", replacementCharacter:"")= -- T'Nol' T'Nol' ooRexx> "Noël"~text("utf8")~transcodeTo("byte", replacementCharacter:"#")= -- T'No#l' T'No#l' ooRexx> "Noël"~text("utf8")~transcodeTo("byte", replacementCharacter:"🎅")= -- The replacement character UTF-8 not-ASCII '🎅' cannot be transcoded to Byte. The replacement character UTF-8 not-ASCII '🎅' cannot be transcoded to Byte. 
Error code= 93.900 ooRexx> "Noël🤶"~text("utf8")~transcodeTo("unicode", replacementCharacter:"🎅")= -- T'N[000000]o[000000]?[000000]l[000000]??[0100]' T'N[000000]o[000000]�[000000]l[000000]6�[0100]' ooRexx> "Noël🤶"~text("utf8")~transcodeTo("unicode", replacementCharacter:"🎅")~c2x= -- '4E000000 6F000000 EB000000 6C000000 36F90100' '4E000000 6F000000 EB000000 6C000000 36F90100' ooRexx> "Noël🤶"~text("utf8")~transcodeTo("unicode", replacementCharacter:"🎅")~c2u= -- 'U+004E U+006F U+00EB U+006C U+1F936' 'U+004E U+006F U+00EB U+006C U+1F936' ooRexx> "Noël🤶"~text("utf8")~transcodeTo("unicode8", replacementCharacter:"")= -- T'No?l' T'No�l' ooRexx> "Noël🤶"~text("utf8")~transcodeTo("unicode8", replacementCharacter:"")~c2x= -- '4E 6F EB 6C' '4E 6F EB 6C' ooRexx> "Noël🤶"~text("utf8")~transcodeTo("unicode8", replacementCharacter:"#")= -- T'No?l#' T'No�l#' ooRexx> "Noël🤶"~text("utf8")~transcodeTo("unicode8", replacementCharacter:"#")~c2x= -- '4E 6F EB 6C 23' '4E 6F EB 6C 23' ooRexx> "Noël🤶"~text("utf8")~transcodeTo("unicode8", replacementCharacter:"🎅")= -- The replacement character UTF-8 not-ASCII '🎅' cannot be transcoded to Unicode8. The replacement character UTF-8 not-ASCII '🎅' cannot be transcoded to Unicode8. Error code= 93.900 ooRexx> "Noël🤶"~text("utf8")~transcodeTo("unicode16", replacementCharacter:"")= -- T'N[00]o[00]?[00]l[00]' T'N[00]o[00]�[00]l[00]' ooRexx> "Noël🤶"~text("utf8")~transcodeTo("unicode16", replacementCharacter:"#")= -- T'N[00]o[00]?[00]l[00]#[00]' T'N[00]o[00]�[00]l[00]#[00]' ooRexx> "Noël🤶"~text("utf8")~transcodeTo("unicode16", replacementCharacter:"#")~c2x= -- '4E00 6F00 EB00 6C00 2300' '4E00 6F00 EB00 6C00 2300' ooRexx> "Noël🤶"~text("utf8")~transcodeTo("unicode16", replacementCharacter:"#")~c2u= -- 'U+004E U+006F U+00EB U+006C U+0023' 'U+004E U+006F U+00EB U+006C U+0023' ooRexx> "Noël🤶"~text("utf8")~transcodeTo("unicode16", replacementCharacter:"🎅")= -- The replacement character UTF-8 not-ASCII '🎅' cannot be transcoded to Unicode16. 
The replacement character UTF-8 not-ASCII '🎅' cannot be transcoded to Unicode16. Error code= 93.900 ooRexx> "Noël🤶"~text("utf8")~transcodeTo("unicode32")= -- T'N[000000]o[000000]?[000000]l[000000]6?[0100]' T'N[000000]o[000000]�[000000]l[000000]6�[0100]' ooRexx> "Noël🤶"~text("utf8")~transcodeTo("unicode32")~c2x= -- '4E000000 6F000000 EB000000 6C000000 36F90100' '4E000000 6F000000 EB000000 6C000000 36F90100' ooRexx> "Noël🤶"~text("utf8")~transcodeTo("unicode32")~c2u= -- 'U+004E U+006F U+00EB U+006C U+1F936' 'U+004E U+006F U+00EB U+006C U+1F936' /* The method c2u is no longer abstract for the byte encodings. Now, a byte encoding is converted on the fly to UnicodeN in non strict mode, replacing any unsupported character by .Unicode~replacementCharacter. Idem for the method unicodeCharacters. Examples: */ ooRexx> "FF FE FD FC"x~text("byte")~c2x= -- 'FF FE FD FC' 'FF FE FD FC' ooRexx> "FF FE FD FC"x~text("byte")~c2g= -- 'FF FE FD FC' 'FF FE FD FC' ooRexx> "FF FE FD FC"x~text("byte")~codepoints== a CodePointSupplier 1 : 255 2 : 254 3 : 253 4 : 252 ooRexx> "FF FE FD FC"x~text("byte")~c2u= -- 'U+FFFD U+FFFD U+FFFD U+FFFD' 'U+FFFD U+FFFD U+FFFD U+FFFD' ooRexx> "FF FE FD FC"x~text("byte")~unicodeCharacters== an Array (shape [4], 4 items) 1 : ( "�" U+FFFD So 1 "REPLACEMENT CHARACTER" ) 2 : ( "�" U+FFFD So 1 "REPLACEMENT CHARACTER" ) 3 : ( "�" U+FFFD So 1 "REPLACEMENT CHARACTER" ) 4 : ( "�" U+FFFD So 1 "REPLACEMENT CHARACTER" ) ooRexx> "FF FE FD FC"x~text("utf8")~c2x= -- 'FF FE FD FC' 'FF FE FD FC' ooRexx> "FF FE FD FC"x~text("utf8")~c2g= -- 'FF FE FD FC' 'FF FE FD FC' ooRexx> "FF FE FD FC"x~text("utf8")~c2u= -- 'U+FFFD U+FFFD U+FFFD U+FFFD' 'U+FFFD U+FFFD U+FFFD U+FFFD' ooRexx> "FF FE FD FC"x~text("utf8")~codepoints== a CodePointSupplier 1 : 65533 2 : 65533 3 : 65533 4 : 65533 ooRexx> "FF FE FD FC"x~text("unicode8")~c2x= -- 'FF FE FD FC' 'FF FE FD FC' ooRexx> "FF FE FD FC"x~text("unicode8")~c2g= -- 'FF FE FD FC' 'FF FE FD FC' ooRexx> "FF FE FD FC"x~text("unicode8")~codepoints== 
a CodePointSupplier 1 : 255 2 : 254 3 : 253 4 : 252 ooRexx> "FF FE FD FC"x~text("unicode8")~c2u= -- 'U+00FF U+00FE U+00FD U+00FC' 'U+00FF U+00FE U+00FD U+00FC' ooRexx> "FF FE FD FC"x~text("unicode8")~unicodecharacters== an Array (shape [4], 4 items) 1 : ( "ÿ" U+00FF Ll 1 "LATIN SMALL LETTER Y WITH DIAERESIS" ) 2 : ( "þ" U+00FE Ll 1 "LATIN SMALL LETTER THORN" ) 3 : ( "ý" U+00FD Ll 1 "LATIN SMALL LETTER Y WITH ACUTE" ) 4 : ( "ü" U+00FC Ll 1 "LATIN SMALL LETTER U WITH DIAERESIS" ) ooRexx> ("No" || "EB"x || "l" || "81"x)~text("byte")~c2x= -- '4E 6F EB 6C 81' '4E 6F EB 6C 81' ooRexx> ("No" || "EB"x || "l" || "81"x)~text("byte")~c2g= -- '4E 6F EB 6C 81' '4E 6F EB 6C 81' ooRexx> ("No" || "EB"x || "l" || "81"x)~text("byte")~c2u= -- 'U+004E U+006F U+FFFD U+006C U+FFFD' 'U+004E U+006F U+FFFD U+006C U+FFFD' ooRexx> ("No" || "EB"x || "l" || "81"x)~text("byte")~unicodecharacters== an Array (shape [5], 5 items) 1 : ( "N" U+004E Lu 1 "LATIN CAPITAL LETTER N" ) 2 : ( "o" U+006F Ll 1 "LATIN SMALL LETTER O" ) 3 : ( "�" U+FFFD So 1 "REPLACEMENT CHARACTER" ) 4 : ( "l" U+006C Ll 1 "LATIN SMALL LETTER L" ) 5 : ( "�" U+FFFD So 1 "REPLACEMENT CHARACTER" ) ooRexx> ("No" || "EB"x || "l" || "81"x)~text("windows-1252")~c2x= -- '4E 6F EB 6C 81' '4E 6F EB 6C 81' ooRexx> ("No" || "EB"x || "l" || "81"x)~text("windows-1252")~c2g= -- '4E 6F EB 6C 81' '4E 6F EB 6C 81' ooRexx> ("No" || "EB"x || "l" || "81"x)~text("windows-1252")~c2u= -- 'U+004E U+006F U+00EB U+006C U+0081' 'U+004E U+006F U+00EB U+006C U+0081' ooRexx> ("No" || "EB"x || "l" || "81"x)~text("windows-1252")~unicodecharacters== an Array (shape [5], 5 items) 1 : ( "N" U+004E Lu 1 "LATIN CAPITAL LETTER N" ) 2 : ( "o" U+006F Ll 1 "LATIN SMALL LETTER O" ) 3 : ( "ë" U+00EB Ll 1 "LATIN SMALL LETTER E WITH DIAERESIS" ) 4 : ( "l" U+006C Ll 1 "LATIN SMALL LETTER L" ) 5 : ( "" U+0081 Cc 0 "", "HIGH OCTET PRESET", "HOP" ) -- =============================================================================== -- 2023 Sep 27 /* Add the named parameters 
'stripCC' and 'stripNA' to all the methods supporting the named parameter 'normalization'. This is utf8proc specific. - stripCC: remove control characters (see utf8proc doc for more information: HorizontalTab (HT) and FormFeed (FF) are transformed into space) - stripNA: remove unassigned codepoints Example: */ ooRexx> .unicode["ESA"]= -- ( "" U+0087 Cc 0 "", "END OF SELECTED AREA", "ESA" ( "" U+0087 Cc 0 "", "END OF SELECTED AREA", "ESA" ) ooRexx> .unicode["NBSP"]= -- ( " " U+00A0 Zs 1 "NO-BREAK SPACE", "NBSP" ) ( " " U+00A0 Zs 1 "NO-BREAK SPACE", "NBSP" ) ooRexx> .unicode["SSA"]= -- ( "" U+0086 Cc 0 "", "START OF SELECTED AREA", "SSA" ( "" U+0086 Cc 0 "", "START OF SELECTED AREA", "SSA" ) ooRexx> .unicode["U+0378"]= -- ( "" U+0378 Cn 1 "" ) unassigned ( "" U+0378 Cn 1 "" ) ooRexx> "\U{SSA}Mrs.\U{NBSP}🤶 a\u0378nd Mr.\U{NBSP}🎅\U{ESA}"~text~unescape= -- T'Mrs. 🤶 and Mr. 🎅 T'Mrs. 🤶 and Mr. 🎅' ooRexx> "\U{SSA}Mrs.\U{NBSP}🤶 a\u0378nd Mr.\U{NBSP}🎅\U{ESA}"~text~unescape~c2g= -- 'C286 4D 72 73 2E C2A0 F09FA4B6 20 61 CDB8 6E 64 20 4D 72 2E C2A0 F09F8E85 C287' 'C286 4D 72 73 2E C2A0 F09FA4B6 20 61 CDB8 6E 64 20 4D 72 2E C2A0 F09F8E85 C287' ooRexx> "\U{SSA}Mrs.\U{NBSP}🤶 a\u0378nd Mr.\U{NBSP}🎅\U{ESA}"~text~unescape~transform(stripNA:)~c2g= -- 'C286 4D 72 73 2E C2A0 F09FA4B6 20 61 6E 64 20 4D 72 2E C2A0 F09F8E85 C287' 'C286 4D 72 73 2E C2A0 F09FA4B6 20 61 6E 64 20 4D 72 2E C2A0 F09F8E85 C287' ooRexx> "\U{SSA}Mrs.\U{NBSP}🤶 a\u0378nd Mr.\U{NBSP}🎅\U{ESA}"~text~unescape~transform(stripNA:, stripCC:)~c2g= -- ' 4D 72 73 2E C2A0 F09FA4B6 20 61 6E 64 20 4D 72 2E C2A0 F09F8E85 ' '4D 72 73 2E C2A0 F09FA4B6 20 61 6E 64 20 4D 72 2E C2A0 F09F8E85' ooRexx> "\U{SSA}Mrs.\U{NBSP}🤶 a\u0378nd Mr.\U{NBSP}🎅\U{ESA}"~text~unescape~pos("and")= -- 0 0 ooRexx> "\U{SSA}Mrs.\U{NBSP}🤶 a\u0378nd Mr.\U{NBSP}🎅\U{ESA}"~text~unescape~pos("and", stripNA:)= -- 9 9 ooRexx> "\U{SSA}Mrs.\U{NBSP}🤶 a\u0378nd Mr.\U{NBSP}🎅\U{ESA}"~text~unescape~pos("and", stripNA:, stripCC:)= -- 9 yes! 
9, not 8 because it's the EXTERNAL position 9 ooRexx> "\U{SSA}Mrs.\U{NBSP}🤶 a\u0378nd Mr.\U{NBSP}🎅\U{ESA}"~text~unescape~caselessPos("mr.")= -- 14 14 ooRexx> "\U{SSA}Mrs.\U{NBSP}🤶 a\u0378nd Mr.\U{NBSP}🎅\U{ESA}"~text~unescape~caselessPos("mr.", stripNA:)= -- 14 yes! 14, not 13 14 ooRexx> "\U{SSA}Mrs.\U{NBSP}🤶 a\u0378nd Mr.\U{NBSP}🎅\U{ESA}"~text~unescape~caselessPos("mr.", stripNA:, stripCC:)= -- 14 yes! 14, not 12 14 ooRexx> "\U{SSA}Mrs.\U{NBSP}🤶 a\u0378nd Mr.\U{NBSP}🎅\U{ESA}"~text~unescape~caselessPos("\U{SSA}"~text~unescape)= -- 1 1 ooRexx> "\U{SSA}Mrs.\U{NBSP}🤶 a\u0378nd Mr.\U{NBSP}🎅\U{ESA}"~text~unescape~caselessPos("\U{SSA}"~text~unescape, stripCC:)= -- 0 0 ooRexx> "\U{SSA}Mrs.\U{NBSP}🤶 a\u0378nd Mr.\U{NBSP}🎅\U{ESA}"~text~unescape~caselessPos("a\u0378nd"~text~unescape)= -- 9 9 ooRexx> "\U{SSA}Mrs.\U{NBSP}🤶 a\u0378nd Mr.\U{NBSP}🎅\U{ESA}"~text~unescape~caselessPos("a\u0378nd"~text~unescape, stripCC:)= -- 9 9 ooRexx> "\U{SSA}Mrs.\U{NBSP}🤶 a\u0378nd Mr.\U{NBSP}🎅\U{ESA}"~text~unescape~caselessPos("a\u0378nd"~text~unescape, stripNA:)= -- 9 yes! 9, not 0 because \u0378 is removed both in the needle and in the haystack 9 ooRexx> "\U{SSA}Mrs.\U{NBSP}🤶 a\u0378nd Mr.\U{NBSP}🎅\U{ESA}"~text~unescape~caselessPos("a\u0378nd"~text~unescape, stripNA:, stripCC:)= -- 9 yes! 9, not 8 9 /* caselessEndsWith, endsWith: returns false if the start of the 'other' string is not aligned with a character. 
Examples */ ooRexx> "#éßffl#…"~text~endsWith("…")= -- true 1 ooRexx> "#éßffl#…"~text~caselessEndsWith("…")= -- true 1 ooRexx> "#éßffl#…"~text~endsWith("fl#…")= -- false, ffl remains ffl 0 ooRexx> "#éßffl#…"~text~caselessEndsWith("FL#…")= -- false, ffl becomes ffl but FL is not aligned with ffl 0 ooRexx> "#éßffl#…"~text~endsWith("ffl#…")= -- true 1 ooRexx> "#éßffl#…"~text~caselessEndsWith("ffl#…")= -- true 1 ooRexx> "#éßffl#…"~text~endsWith("ffl#…")= -- false, ffl remains ffl 0 ooRexx> "#éßffl#…"~text~caselessEndsWith("FFL#…")= -- true, ffl becomes ffl and FFL is aligned with ffl 1 ooRexx> "#éßffl#…"~text~endsWith("sffl#…")= -- false, ß remains ß 0 ooRexx> "#éßffl#…"~text~caselessEndsWith("Sffl#…")= -- false, ß becomes ss but s is not aligned with ss 0 ooRexx> "#éßffl#…"~text~endsWith("ßffl#…")= -- true 1 ooRexx> "#éßffl#…"~text~caselessEndsWith("ßffl#…")= -- true 1 ooRexx> "#éßffl#…"~text~endsWith("ssffl#…")= -- false, ß remains ß 0 ooRexx> "#éßffl#…"~text~caselessEndsWith("SSffl#…")= -- true, ß becomes ss 1 ooRexx> "#éßffl#…"~text~endsWith("éßffl#…")= -- true 1 ooRexx> "#éßffl#…"~text~caselessEndsWith("ÉSSFFL#…")= -- true 1 ooRexx> "#éßffl#…"~text~endsWith("#éßffl#…")= -- true 1 ooRexx> "#éßffl#…"~text~caselessEndsWith("#ÉSSFFL#…")= -- true 1 ooRexx> "#e\U{COMBINING ACUTE ACCENT}ßffl#…"~text~unescape~c2g= -- '23 65CC81 C39F EFAC84 23 E280A6' '23 65CC81 C39F EFAC84 23 E280A6' ooRexx> "\U{COMBINING ACUTE ACCENT}ßffl#…"~text~unescape~c2g= -- ' CC81 C39F EFAC84 23 E280A6' 'CC81 C39F EFAC84 23 E280A6' ooRexx> "#e\U{COMBINING ACUTE ACCENT}ßffl#…"~text~unescape~endsWith("\U{COMBINING ACUTE ACCENT}ßffl#…"~text~unescape)= -- false, not aligned with e\U{COMBINING ACUTE ACCENT} 0 ooRexx> "#e\U{COMBINING ACUTE ACCENT}ßffl#…"~text~unescape~casefold~c2g= -- '23 65CC81 73 73 66 66 6C 23 E280A6' '23 65CC81 73 73 66 66 6C 23 E280A6' ooRexx> "\U{COMBINING ACUTE ACCENT}SSFFL#…"~text~unescape~casefold~c2g= -- ' CC81 73 73 66 66 6C 23 E280A6' 'CC81 73 73 66 66 6C 23 E280A6' ooRexx> 
"#e\U{COMBINING ACUTE ACCENT}ßffl#…"~text~unescape~caselessEndsWith("\U{COMBINING ACUTE ACCENT}SSFFL#…"~text~unescape)= -- false, not aligned with e\U{COMBINING ACUTE ACCENT} 0 /* New 'RexxTextTransformer' class: - Converts positions in a transformed string to positions in the corresponding untransformed string. This is used for the caselessXXX methods which takes or return positions. - Supports inflating and deflating transformations. jlf 2023 Sep 28: better names are expansion and contraction. - The transformation can be made on a part of the string (from startC, for lengthC characters). - The methods for the transformation are the same as for RexxText: NFC, NFD, NFKC, NFKD, casefold, transform. The result is the instance of RexxTextTransformer, not the transformed text. - Only one call to a transformation method can be done. This is because the parameters of the transformation are memorized to re-apply internally the transformation character by character, when moving the cursors. - The 'transformer' method lets create an instance of RexxTextTransformer from a text. Example: - full text = original text (untransformed) - external subtext = part of the full text to transform - internal subtext = transformed part of the full text The method ib2xc converts an internal byte (ib) position in the internal subtext (iSubtext) to an external character (xc) position in the external full text. ib2xc supports only growing positions. The only way to go backward is to use backupPos/restorePos or resetPos. 
-- Transformed part of the full text -- +-------------------------------------+ -- GLOBAL INDEXES (offsetC=3, offsetB=7) -- 01 | 02 | 03 | 04 | 05 | 06 | 07 | 08 | 09 -- (external character indexes) <--------+ -- 1 2 | 3 4 | 5 6 7 | 8 | 9 0 | 1 2 | 3 4 5 | 6 | 7 8 9 -- (external byte indexes) | "éßffl#éßffl#…"~text~c2g -- C3A9 | C39F | EFAC84 | 23 | C3A9 | C39F | EFAC84 | 23 | E280A6 -- (external bytes) | -- é | ß | ffl | # | é | ß | ffl | # | … -- (full text) ^ -- 1 2 | 3 4 | 5 6 7 | 8 | 9 0 1 | 2 3 | 4 5 6 | 7 | 8 9 0 -- (internal byte indexes, offset=7) | -- C3A9 | C39F | EFAC84 | 23 | 65CC81 | 73 73 | 66 66 6C | 23 | E280A6 -- (internal bytes) | +-------------------------------------+ | -- RELATIVE INDEXES | -- 01 | 02 | 03 | 04 | 05 -- (external character indexes) <--------+ -- 1 | 2 3 | 4 5 | 6 7 8 | 9 -- (external byte indexes) | "#éßffl#"~text~c2g= -- 23 | C3A9 | C39F | EFAC84 | 23 -- (external bytes) | -- # | é | ß | ffl | # -- (external subtext) ^ | -- RELATIVE INDEXES | -- 01 | 02 | 03 04 | 05 06 07 | 08 -- (internal character indexes) | -- 1 | 2 3 4 | 5 6 | 7 8 9 | 0 -- (internal byte indexes) ------>-------+ "#éßffl#"~text~NFD(casefold:)~c2g= -- 23 | 65CC81 | 73 73 | 66 66 6C | 23 -- (internal bytes) -- # | é | s s | f f l | # -- (internal subtext) */ ooRexx> transformer = "éßffl#éßffl#…"~text~transformer(4, 5)~NFD(casefold:) ooRexx> transformer~fulltext= -- T'éßffl#éßffl#…' T'éßffl#éßffl#…' ooRexx> transformer~xSubtext= -- T'#éßffl#' T'#éßffl#' ooRexx> transformer~iSubtext= -- T'#éssffl#' T'#éssffl#' -- ib2xc supports only growing positions ooRexx> transformer~ib2xc(1)= -- 4 the internal byte position 1 in the internal subtext corresponds to the 4th external character in the full text 4 ooRexx> transformer~ib2xc(7)= -- 7 7 ooRexx> transformer~ib2xc(2)= -- Error RexxTextTransformer: You specified a byte position (2) lower than the previous one (7). RexxTextTransformer: You specified a byte position (2) lower than the previous one (7). 
Error code= 93.900 -- The previous error is avoided by backing up/restoring the current position ooRexx> transformer~resetPos -- reset to allow iteration again from internal byte position 1 ooRexx> transformer~ib2xc(1)= -- 4 4 ooRexx> transformer~backupPos ooRexx> transformer~ib2xc(7)= -- 7 7 ooRexx> transformer~restorePos ooRexx> transformer~ib2xc(2)= -- 5 5 ooRexx> transformer~resetPos ooRexx> do i=1 to transformer~iSubtext~string~length; say "byte pos" i~right(2) " character pos=" transformer~ib2xc(i)~string~left(20) transformer~ib2xc(i, aligned:.false); end byte pos 1 character pos= 4 +4.8 byte pos 2 character pos= 5 +5.9 byte pos 3 character pos= -5 -5.10 byte pos 4 character pos= -5 -5.11 byte pos 5 character pos= 6 +6.12 byte pos 6 character pos= -6 -6.13 byte pos 7 character pos= 7 +7.14 byte pos 8 character pos= -7 -7.15 byte pos 9 character pos= -7 -7.16 byte pos 10 character pos= 8 +8.17 /* byte pos 1 character pos= 4 +4.8 -- the 8th internal byte is aligned with the 4th external character byte pos 2 character pos= 5 +5.9 byte pos 3 character pos= The NIL object -5.10 -- the 10th internal byte is part of the 5th external character, but is not aligned with it. byte pos 4 character pos= The NIL object -5.11 byte pos 5 character pos= 6 +6.12 byte pos 6 character pos= The NIL object -6.13 byte pos 7 character pos= 7 +7.14 byte pos 8 character pos= The NIL object -7.15 byte pos 9 character pos= The NIL object -7.16 byte pos 10 character pos= 8 +8.17 */ /* More details on position mappings. transformer~iSubtext is the transformed part of the full text. The internal relative byte position 1 becomes the internal global byte position 8: There are 7 bytes (offsetB=7) before the part to transform: 1 + 7 = 8. It's the same offsetB=7 for external and internal bytes, because this part is not transformed. Remember: It doesn't make sense to return the external byte position, because some internal byte positions have no corresponding external byte position. 
For example the internal global byte position 11. For diagnostics and analysis, only internal byte positions are relevant. The external relative character position 1 becomes the external global character position 4: There are 3 characters (offsetC=3) before the part to transform: 1 + 3 = 4. It's the same offsetC=3 for external and internal characters, because this part is not transformed. Remember: The user works only with external global character positions. It wouldn't make sense to return internal character positions. Example of alignment: The internal relative byte position 1 becomes the internal global byte position 8, is part of the 4th external character and is aligned with it. Example of non-alignment: The internal relative byte position 3 becomes the internal global byte position 10, is part of the 5th external character and is not aligned with it. */ -- =============================================================================== -- 2023 Sep 16 /* Relax the constraint "self~isCompatibleWithByteString" when converting a RexxText to a String. That allows going further in the tests of regular expressions. */ ooRexx> unckeckedConversionToString = .Unicode~unckeckedConversionToString -- backup ooRexx> .Unicode~unckeckedConversionToString = .true -- bug in regex.cls ooRexx> p = .Pattern~compile("(.)*foo") ooRexx> p~matches("xfooxxxxxxfooXXXX")= -- Invalid position argument specified; found "0". Invalid position argument specified; found "0". Error code= 93.924 -- False success in text mode -- "à" is 2 bytes 'C3A0', "🎅" is 4 bytes 'F09F8E85' -- When compiling a String then each of the bytes of "à" or "🎅" becomes a candidate for matching -- When compiling a RexxText then only the sequence of all the bytes of "à" or "🎅" should match... But that's not the case. 
ooRexx> pB = .Pattern~compile("[àb🎅]") ooRexx> pT = .Pattern~compile("[àb🎅]"~text) ooRexx> pB~startsWith('àXXXX')= -- 1 1 ooRexx> pT~startsWith('àXXXX'~text)= -- 1 but matched only C3 1 ooRexx> pB~startsWith('bXXXX')= -- 1 1 ooRexx> pT~startsWith('bXXXX'~text)= -- 1 1 ooRexx> pB~startsWith('🎅XXXX')= -- 1 1 ooRexx> pT~startsWith('🎅XXXX'~text)= -- 1 1 ooRexx> pB~startsWith('F0'x || 'XXXX')= -- Invalid UTF-8 string (raised by utf8proc) (was 1 before automatic conversion of string literals to text) Invalid UTF-8 string. Error code= 22.900 ooRexx> pT~startsWith('F0'x || 'XXXX'~text)= -- Invalid UTF-8 string (raised by utf8proc) Invalid UTF-8 string. Error code= 22.900 ooRexx> pT~startsWith('F0'x || 'XXXX')= -- Invalid UTF-8 string (raised by utf8proc) (was 1 (not good) before automatic conversion of string literals to text) Invalid UTF-8 string. Error code= 22.900 ooRexx> pB~startsWith('9F'x || 'XXXX')= -- Invalid UTF-8 string (raised by utf8proc) (was 1 before automatic conversion of string literals to text) Invalid UTF-8 string. Error code= 22.900 ooRexx> pT~startsWith('9F'x || 'XXXX'~text)= -- Invalid UTF-8 string (raised by utf8proc) Invalid UTF-8 string. Error code= 22.900 ooRexx> pT~startsWith('9F'x || 'XXXX')= -- Invalid UTF-8 string (raised by utf8proc) (was 1 (not good) before automatic conversion of string literals to text) Invalid UTF-8 string. 
Error code= 22.900 -- greedy pattern ooRexx> pB = .Pattern~compile("(.)*fô🎅") ooRexx> pT = .Pattern~compile("(.)*fô🎅"~text) ooRexx> pB~matches("xfooxxxxxxfô🎅")= -- 1 1 ooRexx> pT~matches("xfooxxxxxxfô🎅"~text)= -- 1 1 ooRexx> pB~startsWith("xfooxxxxxxfô🎅")= -- 1 1 ooRexx> pT~startsWith("xfooxxxxxxfô🎅"~text)= -- 1 1 -- zero or one occurrences of "a" ooRexx> pB = .Pattern~compile("a?") ooRexx> pT = .Pattern~compile("a?"~text) ooRexx> pB~matches("")= -- 1 1 ooRexx> pT~matches(""~text)= -- 1 1 ooRexx> pB~matches("a")= -- 1 1 ooRexx> pT~matches("a"~text)= -- 1 1 ooRexx> pB~matches("aa")= -- 0 0 ooRexx> pT~matches("aa"~text)= -- 0 0 -- zero or one occurrences of "🎅" ooRexx> pB = .Pattern~compile("🎅?") ooRexx> pT = .Pattern~compile("🎅?"~text) ooRexx> pB~matches("")= -- 1 (was 0 (KO) before automatic conversion of string literals to text) 1 ooRexx> pT~matches(""~text)= -- 1 1 ooRexx> pB~matches("🎅")= -- 1 1 ooRexx> pT~matches("🎅"~text)= -- 1 1 ooRexx> pB~matches("🎅🎅")= -- 0 0 ooRexx> pT~matches("🎅🎅"~text)= -- 0 0 -- exactly 3 occurrences of "a" ooRexx> pB = .Pattern~compile("a{3}") ooRexx> pT = .Pattern~compile("a{3}"~text) ooRexx> pB~matches("aa")= -- 0 0 ooRexx> pT~matches("aa"~text)= -- 0 0 ooRexx> pB~matches("aaa")= -- 1 1 ooRexx> pT~matches("aaa"~text)= -- 1 1 ooRexx> pB~matches("aaaa")= -- 0 0 ooRexx> pT~matches("aaaa"~text)= -- 0 0 -- exactly 3 occurrences of "🎅" ooRexx> pB = .Pattern~compile("🎅{3}") ooRexx> pT = .Pattern~compile("🎅{3}"~text) ooRexx> pB~matches("🎅🎅")= -- 0 0 ooRexx> pT~matches("🎅🎅"~text)= -- 0 0 ooRexx> pB~matches("🎅🎅🎅")= -- 1 (was 0 KO before automatic conversion of string literals to text) 1 ooRexx> pT~matches("🎅🎅🎅"~text)= -- 1 1 ooRexx> pB~matches("🎅🎅🎅🎅")= -- 0 0 ooRexx> pT~matches("🎅🎅🎅🎅"~text)= -- 0 0 -- repetitive "b" in the middle ooRexx> pB = .Pattern~compile("ab{2}c") ooRexx> pT = .Pattern~compile("ab{2}c"~text) ooRexx> pB~matches("ac")= -- 0 0 ooRexx> pT~matches("ac"~text)= -- 0 0 ooRexx> pB~matches("abc")= -- 0 0 ooRexx> 
pT~matches("abc"~text)= -- 0 0 ooRexx> pB~matches("abbc")= -- 1 1 ooRexx> pT~matches("abbc"~text)= -- 1 1 ooRexx> pB~matches("abbbc")= -- 0 0 ooRexx> pT~matches("abbbc"~text)= -- 0 0 -- repetitive "🎅" in the middle ooRexx> pB = .Pattern~compile("a🎅{2}c") ooRexx> pT = .Pattern~compile("a🎅{2}c"~text) ooRexx> pB~matches("ac")= -- 0 0 ooRexx> pT~matches("ac"~text)= -- 0 0 ooRexx> pB~matches("a🎅c")= -- 0 0 ooRexx> pT~matches("a🎅c"~text)= -- 0 0 ooRexx> pB~matches("a🎅🎅c")= -- 1 (was 0 (KO) before automatic conversion of string literals to text) 1 ooRexx> pT~matches("a🎅🎅c"~text)= -- 1 1 ooRexx> pB~matches("a🎅🎅🎅c")= -- 0 0 ooRexx> pT~matches("a🎅🎅🎅c"~text)= -- 0 0 -- "a" or "b" ooRexx> pB = .Pattern~compile("a|b") ooRexx> pT = .Pattern~compile("a|b"~text) ooRexx> pB~matches("a")= -- 1 1 ooRexx> pT~matches("a"~text)= -- 1 1 ooRexx> pB~matches("b")= -- 1 1 ooRexx> pT~matches("b"~text)= -- 1 1 ooRexx> pB~matches("c")= -- 0 0 ooRexx> pT~matches("c"~text)= -- 0 0 ooRexx> pB~startsWith("abc")= -- 1 1 ooRexx> pT~startsWith("abc"~text)= -- 1 1 ooRexx> pB~startsWith("bac")= -- 1 1 ooRexx> pT~startsWith("bac"~text)= -- 1 1 ooRexx> r = pB~find("xxxabcxxx") ooRexx> r~matched=; r~start=; r~end=; r~text=; r~length= 1 4 5 'a' 1 ooRexx> r = pT~find("xxxabcxxx"~text) ooRexx> r~matched=; r~start=; r~end=; r~text=; r~length= 1 4 5 T'a' 1 ooRexx> r = pB~find("xxxbacxxx") ooRexx> r~matched=; r~start=; r~end=; r~text=; r~length= 1 4 5 'b' 1 ooRexx> r = pT~find("xxxbacxxx"~text) ooRexx> r~matched=; r~start=; r~end=; r~text=; r~length= 1 4 5 T'b' 1 -- "🤶" or "🎅" ooRexx> pB = .Pattern~compile("🤶|🎅") ooRexx> pT = .Pattern~compile("🤶|🎅"~text) ooRexx> pB~matches("🤶")= -- 1 1 ooRexx> pT~matches("🤶"~text)= -- 1 1 ooRexx> pB~matches("🎅")= -- 1 1 ooRexx> pT~matches("🎅"~text)= -- 1 1 ooRexx> pB~matches("c")= -- 0 0 ooRexx> pT~matches("c"~text)= -- 0 0 ooRexx> pB~startsWith("🤶🎅c")= -- 1 1 ooRexx> pT~startsWith("🤶🎅c"~text)= -- 1 1 ooRexx> pB~startsWith("🎅🤶c")= -- 1 1 ooRexx> pT~startsWith("🎅🤶c"~text)= -- 1 1 
ooRexx> r = pB~find("xxx🤶🎅cxxx") ooRexx> r~matched=; r~start=; r~end=; r~text=; r~length= -- now ok (r~end was 8 and r~length was 4 before automatic conversion of string literals to text) 1 4 5 T'🤶' 1 ooRexx> r = pT~find("xxx🤶🎅cxxx"~text) ooRexx> r~matched=; r~start=; r~end=; r~text=; r~length= 1 4 5 T'🤶' 1 ooRexx> r = pB~find("xxx🎅🤶cxxx") ooRexx> r~matched=; r~start=; r~end=; r~text=; r~length= -- now ok (r~end was 8 and r~length was 4 before automatic conversion of string literals to text) 1 4 5 T'🎅' 1 ooRexx> r = pT~find("xxx🎅🤶cxxx"~text) ooRexx> r~matched=; r~start=; r~end=; r~text=; r~length= 1 4 5 T'🎅' 1 ooRexx> .Unicode~unckeckedConversionToString = unckeckedConversionToString -- restore -- =============================================================================== -- 2023 Sep 14 /* Fix implementation of caselessPos, pos for ligatures. The results were not good for some byte indexes when using aligned:.false */ -------------- -- test case 1 -------------- -- pos with ligature "ffl" in strict mode (default) ooRexx> "bâfflé"~text~c2u= -- 'U+0062 U+00E2 U+FB04 U+00E9' 'U+0062 U+00E2 U+FB04 U+00E9' /* -- 01 | 02 | 03 | 04 (external grapheme indexes) -- 1 | 2 3 | 4 5 6 | 7 8 (external byte indexes) "bâfflé"~text~c2g= -- '62 | C3A2 | EFAC84 | C3A9' -- b | â | ffl | é */ ooRexx> "bâfflé"~text~pos("é")= -- 4 4 ooRexx> "bâfflé"~text~pos("e")= -- 0 0 ooRexx> "bâfflé"~text~pos("e", stripMark:)= -- 4 4 ooRexx> "bâfflé"~text~pos("f")= -- 0 because in strict mode, "ffl" remains U+FB04 0 ooRexx> "bâfflé"~text~pos("f", asList:, overlap:, aligned:.false)= -- a List (0 items) a List (0 items) -------------- -- test case 2 -------------- -- caselessPos with ligature "ffl" in strict mode (default) -- (apply casefold internally but returns external indexes) -- The ligature is decomposed by casefold. 
/* -- 01 | 02 | 03 | 04 (external grapheme indexes) -- 1 | 2 3 | 4 5 6 | 7 8 (external byte indexes) "bâfflé"~text~c2g= -- '62 | C3A2 | EFAC84 | C3A9' -- b | â | ffl | é -- 01 | 02 | 03 04 05 | 06 (internal grapheme indexes) -- 1 | 2 3 | 4 5 6 | 7 8 (internal byte indexes) "bâfflé"~text~casefold~c2g= -- '62 | C3A2 | 66 66 6C | C3A9' -- b | â | f f l | é */ ooRexx> "bâfflé"~text~caselessPos("É")= -- 4 4 ooRexx> "bâfflé"~text~caselessPos("E")= -- 0 0 ooRexx> "bâfflé"~text~caselessPos("E", stripMark:)= -- 4 4 ooRexx> "bâfflé"~text~caselessPos("F")= -- 0 because "F" matches only a subset of "ffl"-->"ffl" 0 ooRexx> "bâfflé"~text~caselessPos("FF")= -- 0 because "FF" matches only a subset of "ffl"-->"ffl" 0 ooRexx> "bâfflé"~text~caselessPos("FL")= -- 0 because "FL" matches only a subset of "ffl"-->"ffl" 0 ooRexx> "bâfflé"~text~caselessPos("FFL")= -- 3 because "FFL" matches all of "ffl"-->"ffl" 3 ooRexx> "bâfflé"~text~caselessPos("F", asList:, overlap:, aligned:.false)= a List (2 items) 0 : [+3.4,-3.5] 1 : [-3.5,-3.6] ooRexx> "bâfflfflé"~text~caselessPos("É")= -- 5 5 ooRexx> "bâfflfflé"~text~caselessPos("FFL", asList:, overlap:, aligned:.false)= a List (2 items) 0 : [+3.4,+4.7] 1 : [+4.7,+5.10] ooRexx> "bâfflfflé"~text~caselessPos("F", asList:, overlap:, aligned:.false)= a List (4 items) 0 : [+3.4,-3.5] 1 : [-3.5,-3.6] 2 : [+4.7,-4.8] 3 : [-4.8,-4.9] ooRexx> "bâfflfflé"~text~caselessPos("FLFF")= -- 0 0 ooRexx> "bâfflfflé"~text~caselessPos("FLFF", aligned:.false)= -- [-3.5,-4.9] [-3.5,-4.9] ooRexx> "bâfflfflé"~text~caselessPos("FFLFFL")= -- 3 3 -------------- -- test case 3 -------------- -- pos with ligature "ffl" in non-strict mode -- (in non-strict mode, the normalization is NFKD, but returns external indexes) -- The ligature is decomposed by NFKD /* -- 01 | 02 | 03 | 04 (external grapheme indexes) -- 1 | 2 3 | 4 5 6 | 7 8 (external byte indexes) "bâfflé"~text~c2g= -- '62 | C3A2 | EFAC84 | C3A9' -- b | â | ffl | é -- 01 | 02 | 03 04 05 | 06 (internal grapheme indexes) -- 
1 | 2 3 4 | 5 6 7 | 8 9 0 (internal byte indexes) "bâfflé"~text~NFKD~c2g= -- '62 | 61CC82 | 66 66 6C | 65CC81' -- b | a ^ | f f l | e ´ */ ooRexx> "bâfflé"~text~pos("é", strict:.false)= -- 4 4 ooRexx> "bâfflé"~text~pos("e", strict:.false)= -- 0 0 ooRexx> "bâfflé"~text~pos("e", strict:.false, stripMark:)= -- 4 4 ooRexx> "bâfflé"~text~pos("f", strict:.false)= -- 0 because "f" matches only a subset of "ffl"-->"ffl" 0 ooRexx> "bâfflé"~text~pos("ff", strict:.false)= -- 0 because "ff" matches only a subset of "ffl"-->"ffl" 0 ooRexx> "bâfflé"~text~pos("ffl", strict:.false)= -- 3 because "ffl" matches all of "ffl"-->"ffl" 3 ooRexx> "bâfflé"~text~pos("f", strict:.false, asList:, overlap:, aligned:.false)= a List (2 items) 0 : [+3.5,-3.6] 1 : [-3.6,-3.7] ooRexx> "bâfflfflé"~text~pos("é", strict:.false)= -- 5 5 ooRexx> "bâfflfflé"~text~pos("ffl", strict:.false, asList:, overlap:, aligned:.false)= a List (2 items) 0 : [+3.5,+4.8] 1 : [+4.8,+5.11] ooRexx> "bâfflfflé"~text~pos("f", strict:.false, asList:, overlap:, aligned:.false)= a List (4 items) 0 : [+3.5,-3.6] 1 : [-3.6,-3.7] 2 : [+4.8,-4.9] 3 : [-4.9,-4.10] ooRexx> "bâfflfflé"~text~pos("flff", strict:.false)= -- 0 0 ooRexx> "bâfflfflé"~text~pos("flff", strict:.false, aligned:.false)= -- [-3.6,-4.10] [-3.6,-4.10] ooRexx> "bâfflfflé"~text~pos("fflffl", strict:.false)= -- 3 3 -------------- -- test case 4 -------------- -- caselessPos with ligature "ffl" in non-strict mode -- (apply casefold internally but returns external indexes) -- (in non-strict mode, the normalization is NFKD, but returns external indexes) -- The ligature is decomposed both by casefold and by NFKD. 
/* -- 01 | 02 | 03 | 04 (external grapheme indexes) -- 1 | 2 3 | 4 5 6 | 7 8 (external byte indexes) "bâfflé"~text~c2g= -- '62 | C3A2 | EFAC84 | C3A9' -- b | â | ffl | é -- 01 | 02 | 03 04 05 | 06 (internal grapheme indexes) -- 1 | 2 3 4 | 5 6 7 | 8 9 0 (internal byte indexes) "bâfflé"~text~NFKD~c2g= -- '62 | 61CC82 | 66 66 6C | 65CC81' -- b | a ^ | f f l | e ´ */ ooRexx> "bâfflé"~text~caselessPos("É", strict:.false)= -- 4 4 ooRexx> "bâfflé"~text~caselessPos("E", strict:.false)= -- 0 0 ooRexx> "bâfflé"~text~caselessPos("E", strict:.false, stripMark:)= -- 4 4 ooRexx> "bâfflé"~text~caselessPos("F", strict:.false)= -- 0 because "F" matches only a subset of "ffl"-->"ffl" 0 ooRexx> "bâfflé"~text~caselessPos("FF", strict:.false)= -- 0 because "FF" matches only a subset of "ffl"-->"ffl" 0 ooRexx> "bâfflé"~text~caselessPos("FFL", strict:.false)= -- 3 because "FFL" matches all of "ffl"-->"ffl" 3 ooRexx> "bâfflé"~text~caselessPos("F", strict:.false, asList:, overlap:, aligned:.false)= a List (2 items) 0 : [+3.5,-3.6] 1 : [-3.6,-3.7] ooRexx> "bâfflfflé"~text~caselessPos("É", strict:.false)= -- 5 5 ooRexx> "bâfflfflé"~text~caselessPos("FFL", strict:.false, asList:, overlap:, aligned:.false)= a List (2 items) 0 : [+3.5,+4.8] 1 : [+4.8,+5.11] ooRexx> "bâfflfflé"~text~caselessPos("F", strict:.false, asList:, overlap:, aligned:.false)= a List (4 items) 0 : [+3.5,-3.6] 1 : [-3.6,-3.7] 2 : [+4.8,-4.9] 3 : [-4.9,-4.10] ooRexx> "bâfflfflé"~text~caselessPos("FLFF", strict:.false)= -- 0 0 ooRexx> "bâfflfflé"~text~caselessPos("FLFF", strict:.false, aligned:.false)= -- [-3.6,-4.10] [-3.6,-4.10] ooRexx> "bâfflfflé"~text~caselessPos("FFLFFL", strict:.false)= -- 3 3 -- =============================================================================== -- 2023 Sep 11 /* casefold now supports the option stripMark. Rework the implementation of caselessPos, pos. - Thanks to Raku and Chrome, I realize that a matching should be successful only if all the bytes of a grapheme are matched. 
- New named argument 'asList', to return a list of positions (similar to Raku's method .indices). - New named argument overlap: (same as Raku) If the optional named argument 'overlap' is specified, the search continues from the position directly following the previous match, otherwise the search will continue after the previous match. */ /* Remember: aligned=.false is intended for analysis of matchings and [non-]regression tests. Otherwise, I don't see any use. When aligned:.false, a returned position has the form +/-posC.posB where posB is the position of the matched byte in the transformed haystack, and posC is the corresponding grapheme position in the untransformed haystack. Don't use trunc(abs(position)) because you may need up to numeric digits 40: position max can be +/-(2**64-1)||"."||(2**64-1) Use instead: if position~matchChar(1, "+-") then parse var position 2 posC "." posB */ /* Additional test cases to cover corner cases for caselessPos, pos. */ -------------- -- test case 1 -------------- -- case no overlap versus overlap /* -- 01 | 02 | 03 | 04 | 05 | 06 -- 1 2 | 3 4 | 5 6 | 7 8 | 9 0 | 1 2 "àààààà"~text~c2g= -- 'C3A0 | C3A0 | C3A0 | C3A0 | C3A0 | C3A0' -- à | à | à | à | à | à -- 01 | 02 | 03 | 04 | 05 | 06 -- 1 2 | 3 4 | 5 6 | 7 8 | 9 0 | 1 2 "àààààà"~text~casefold~c2g= -- 'C3A0 | C3A0 | C3A0 | C3A0 | C3A0 | C3A0' -- à | à | à | à | à | à */ ooRexx> "àààààà"~text~caselessPos("aa", stripMark:)= -- 1 1 ooRexx> "àààààà"~text~caselessPos("aa", stripMark:, asList:)~allItems= -- [ 1, 3, 5] [ 1, 3, 5] ooRexx> "àààààà"~text~caselessPos("aa", stripMark:, asList:, overlap:)~allItems= -- [ 1, 2, 3, 4, 5] [ 1, 2, 3, 4, 5] ooRexx> "àààààà"~text~caselessPos("aa", stripMark:, asList:, aligned:.false)= a List (3 items) 0 : [+1.1,+3.3] 1 : [+3.3,+5.5] 2 : [+5.5,+7.7] ooRexx> "àààààà"~text~caselessPos("aa", stripMark:, asList:, overlap:, aligned:.false)= a List (5 items) 0 : [+1.1,+3.3] 1 : [+2.2,+4.4] 2 : [+3.3,+5.5] 3 : [+4.4,+6.6] 4 : [+5.5,+7.7] -------------- 
-- test case 2 -------------- -- case where the end of the matching is inside the untransformed grapheme /* -- 01 -- 1 2 "ß"~text~c2g= -- 'C39F' -- ß -- 01 02 -- 1 2 "ß"~text~casefold~c2g= -- '73 73' -- s s */ ooRexx> "ß"~text~caselessPos("s")= -- 0, not 1 because 1 would match only the first byte of "ß"-->"ss" 0 ooRexx> "ß"~text~caselessPos("s", asList:)= -- a List (0 items) a List (0 items) ooRexx> "ß"~text~caselessPos("s", asList:, overlap:)= -- a List (0 items) a List (0 items) ooRexx> "ß"~text~caselessPos("s", asList:, aligned:.false)= a List (2 items) 0 : [+1.1,-1.2] 1 : [-1.2,+2.3] ooRexx> "ß"~text~caselessPos("s", asList:, overlap:, aligned:.false)= a List (2 items) 0 : [+1.1,-1.2] 1 : [-1.2,+2.3] /* -- 01 | 02 -- 1 | 2 3 "sß"~text~c2g= -- '73 | C39F' -- s | ß -- 01 | 02 03 -- 1 | 2 3 "sß"~text~casefold~c2g= -- '73 | 73 73' -- s | s s */ ooRexx> "sß"~text~caselessPos("ss")= -- 2, not 1 because 1 would match only the first byte of "ß"-->"ss" 2 ooRexx> "sß"~text~caselessPos("ss", asList:)~allItems= -- [ 2] [ 2] ooRexx> "sß"~text~caselessPos("ss", asList:, overlap:)~allItems= -- [ 2] [ 2] ooRexx> "sß"~text~caselessPos("ss", asList:, aligned:.false)= a List (1 items) 0 : [+1.1,-2.3] ooRexx> "sß"~text~caselessPos("ss", asList:, overlap:, aligned:.false)= a List (2 items) 0 : [+1.1,-2.3] 1 : [+2.2,+3.4] /* -- 01 | 02 | 03 -- 1 | 2 3 | 4 "sßs"~text~c2g= -- '73 | C39F | 73' -- s | ß | s -- 01 | 02 03 | 04 -- 1 | 2 3 | 4 "sßs"~text~casefold~c2g= -- '73 | 73 73 | 73' -- s | s s | s */ ooRexx> "sßs"~text~caselessPos("s", 2)= -- 3, not 2 because 2 would match only the first byte of "ß"-->"ss" 3 ooRexx> "sßs"~text~caselessPos("s", 2, asList:)~allItems= -- [ 3] [ 3] ooRexx> "sßs"~text~caselessPos("s", 2, asList:, overlap:)~allItems= -- [ 3] [ 3] ooRexx> "sßs"~text~caselessPos("s", 2, asList:, aligned:.false)= a List (3 items) 0 : [+2.2,-2.3] 1 : [-2.3,+3.4] 2 : [+3.4,+4.5] ooRexx> "sßs"~text~caselessPos("s", 2, asList:, overlap:, aligned:.false)= a List (3 items) 0 : 
[+2.2,-2.3] 1 : [-2.3,+3.4] 2 : [+3.4,+4.5] ooRexx> "sßs"~text~caselessPos("ss")= -- 2, not 1 because 1 would match only the first byte of "ß"-->"ss" 2 ooRexx> "sßs"~text~caselessPos("ss", asList:)~allItems= -- [ 2] [ 2] ooRexx> "sßs"~text~caselessPos("ss", asList:, overlap:)~allItems= -- [ 2] [ 2] ooRexx> "sßs"~text~caselessPos("ss", asList:, aligned:.false)= a List (2 items) 0 : [+1.1,-2.3] 1 : [-2.3,+4.5] ooRexx> "sßs"~text~caselessPos("ss", asList:, overlap:, aligned:.false)= a List (3 items) 0 : [+1.1,-2.3] 1 : [+2.2,+3.4] 2 : [-2.3,+4.5] -------------- -- test case 3 -------------- -- caselessPos (apply casefold internally but returns external indexes) -- search 1 character, no overlap when searching a single character. /* -- 01 | 02 | 03 | 04 | 05 | 06 | 07 | 08 | 09 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 -- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 0 | 1 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 0 1 | 2 | 3 | 4 | 5 "Bundesstraße sss sßs ss"~text~c2g= -- '42 | 75 | 6E | 64 | 65 | 73 | 73 | 74 | 72 | 61 | C39F | 65 | 20 | 73 | 73 | 73 | 20 | 73 | C39F | 73 | 20 | 73 | 73' -- B | u | n | d | e | s | s | t | r | a | ß | e | _ | s | s | s | _ | s | ß | s | _ | s | s -- ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ -- 01 | 02 | 03 | 04 | 05 | 06 | 07 | 08 | 09 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 -- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 0 | 1 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 0 1 | 2 | 3 | 4 | 5 "Bundesstraße sss sßs ss"~text~casefold~c2g= -- '62 | 75 | 6E | 64 | 65 | 73 | 73 | 74 | 72 | 61 | 73 73 | 65 | 20 | 73 | 73 | 73 | 20 | 73 | 73 73 | 73 | 20 | 73 | 73' -- B | u | n | d | e | s | s | t | r | a | ß | e | _ | s | s | s | _ | s | ß | s | _ | s | s */ ooRexx> "Bundesstraße sss sßs ss"~text~caselessPos("s")= -- 6 6 ooRexx> "Bundesstraße sss sßs ss"~text~caselessPos("s", 7)= -- 7 7 ooRexx> "Bundesstraße sss sßs ss"~text~caselessPos("s", 8)= -- 14 14 ooRexx> "Bundesstraße sss sßs ss"~text~caselessPos("s", 15)= -- 15 15 ooRexx> "Bundesstraße 
sss sßs ss"~text~caselessPos("s", 16)= -- 16 16 ooRexx> "Bundesstraße sss sßs ss"~text~caselessPos("s", 17)= -- 18 18 ooRexx> "Bundesstraße sss sßs ss"~text~caselessPos("s", 19)= -- 20 20 ooRexx> "Bundesstraße sss sßs ss"~text~caselessPos("s", 21)= -- 22 22 ooRexx> "Bundesstraße sss sßs ss"~text~caselessPos("s", 23)= -- 23 23 ooRexx> "Bundesstraße sss sßs ss"~text~caselessPos("s", 24)= -- 0 0 ooRexx> "Bundesstraße sss sßs ss"~text~caselessPos("s", asList:)~allItems= -- [ 6, 7, 14, 15, 16, 18, 20, 22, 23] [ 6, 7, 14, 15, 16, 18, 20, 22, 23] ooRexx> "Bundesstraße sss sßs ss"~text~caselessPos("s", asList:, overlap:)~allItems= -- [ 6, 7, 14, 15, 16, 18, 20, 22, 23] [ 6, 7, 14, 15, 16, 18, 20, 22, 23] ooRexx> "Bundesstraße sss sßs ss"~text~caselessPos("s", asList:, aligned:.false)= a List (13 items) 0 : [+6.6,+7.7] 1 : [+7.7,+8.8] 2 : [+11.11,-11.12] 3 : [-11.12,+12.13] 4 : [+14.15,+15.16] 5 : [+15.16,+16.17] 6 : [+16.17,+17.18] 7 : [+18.19,+19.20] 8 : [+19.20,-19.21] 9 : [-19.21,+20.22] 10 : [+20.22,+21.23] 11 : [+22.24,+23.25] 12 : [+23.25,+24.26] ooRexx> "Bundesstraße sss sßs ss"~text~caselessPos("s", asList:, overlap:, aligned:.false)= a List (13 items) 0 : [+6.6,+7.7] 1 : [+7.7,+8.8] 2 : [+11.11,-11.12] 3 : [-11.12,+12.13] 4 : [+14.15,+15.16] 5 : [+15.16,+16.17] 6 : [+16.17,+17.18] 7 : [+18.19,+19.20] 8 : [+19.20,-19.21] 9 : [-19.21,+20.22] 10 : [+20.22,+21.23] 11 : [+22.24,+23.25] 12 : [+23.25,+24.26] -------------- -- test case 4 -------------- -- caselessPos (apply casefold internally but returns external indexes) -- search 3 characters /* -- 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 16 17 18 19 20 21 22 23 "Bundesstraße sss sßs ss"~text~c2g= -- '42 75 6E 64 65 73 73 74 72 61 C39F 65 20 73 73 73 20 73 C39F 73 20 73 73' -- B u n d e s s t r a ß e _ s s s _ s ß s _ s s -- | | | */ ooRexx> -- Raku Chrome ooRexx> "Bundesstraße sss sßs ss"~text~caselessPos("sSs")= -- 14 13 y 14 ooRexx> "Bundesstraße sss sßs ss"~text~caselessPos("sSs", 15)= -- 18 17 y 18 ooRexx> 
"Bundesstraße sss sßs ss"~text~caselessPos("sSs", 19)= -- 19 (overlap) 18 (if overlap) y 19 ooRexx> "Bundesstraße sss sßs ss"~text~caselessPos("sSs", 20)= -- 0 0 ooRexx> "Bundesstraße sss sßs ss"~text~caselessPos("sSs", asList:)~allItems= -- [ 14, 18] [ 14, 18] ooRexx> "Bundesstraße sss sßs ss"~text~caselessPos("sSs", asList:, overlap:)~allItems= -- [ 14, 18, 19] [ 14, 18, 19] ooRexx> "Bundesstraße sss sßs ss"~text~caselessPos("sSs", asList:, aligned:.false)= a List (2 items) 0 : [+14.15,+17.18] 1 : [+18.19,+20.22] ooRexx> "Bundesstraße sss sßs ss"~text~caselessPos("sSs", asList:, overlap:, aligned:.false)= a List (3 items) 0 : [+14.15,+17.18] 1 : [+18.19,+20.22] 2 : [+19.20,+21.23] -------------- -- test case 5 -------------- -- caselessPos (apply casefold internally but returns external indexes) -- search 4 characters /* -- 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 16 17 18 19 20 21 22 23 "Bundesstraße sss sßs ss"~text~c2g= -- '42 75 6E 64 65 73 73 74 72 61 C39F 65 20 73 73 73 20 73 C39F 73 20 73 73' -- B u n d e s s t r a ß e _ s s s _ s ß s _ s s -- | */ ooRexx> "Bundesstraße sss sßs ss"~text~caselessPos("sSsS")= -- 18 (good, same result as Raku and Chrome) 18 ooRexx> "Bundesstraße sss sßs ss"~text~caselessPos("sSsS", asList:)~allItems= -- [ 18] [ 18] ooRexx> "Bundesstraße sss sßs ss"~text~caselessPos("sSsS", asList:, overlap:)~allItems= -- [ 18] [ 18] ooRexx> "Bundesstraße sss sßs ss"~text~caselessPos("sSsS", asList:, aligned:.false)= a List (1 items) 0 : [+18.19,+21.23] ooRexx> "Bundesstraße sss sßs ss"~text~caselessPos("sSsS", asList:, overlap:, aligned:.false)= a List (1 items) 0 : [+18.19,+21.23] -------------- -- test case 6 -------------- -- caselessPos (apply casefold internally but returns external indexes) -- search 2 characters in a long sequence /* -- 01 02 03 04 05 06 07 08 09 10 11 12 13 "straßssßßssse"~text~c2g= -- '73 74 72 61 C39F 73 73 C39F C39F 73 73 73 65' -- s t r a ß s s ß ß s s s e -- | | | | | | | */ ooRexx> -- Raku Chome ooRexx> 
"straßssßßssse"~text~caselessPos("Ss")= -- 5 4 y 5 ooRexx> "straßssßßssse"~text~caselessPos("Ss", 6)= -- 6 5 (if overlap) y why Raku needs overlap? 6 ooRexx> "straßssßßssse"~text~caselessPos("Ss", 7)= -- 8 7 y 8 ooRexx> "straßssßßssse"~text~caselessPos("Ss", 9)= -- 9 8 (if overlap) y why Raku needs overlap? 9 ooRexx> "straßssßßssse"~text~caselessPos("Ss", 10)= -- 10 9 y 10 ooRexx> "straßssßßssse"~text~caselessPos("Ss", 11)= -- 11 (overlap) 10 (if overlap) y 11 ooRexx> "straßssßßssse"~text~caselessPos("Ss", 12)= -- 0 0 ooRexx> "straßssßßssse"~text~caselessPos("Ss", asList:)~allItems= -- [ 5, 6, 8, 9, 10] [ 5, 6, 8, 9, 10] ooRexx> "straßssßßssse"~text~caselessPos("Ss", asList:, overlap:)~allItems= -- [ 5, 6, 8, 9, 10, 11] [ 5, 6, 8, 9, 10, 11] ooRexx> "straßssßßssse"~text~caselessPos("Ss", asList:, aligned:.false)= a List (5 items) 0 : [+5.5,+6.7] 1 : [+6.7,+8.9] 2 : [+8.9,+9.11] 3 : [+9.11,+10.13] 4 : [+10.13,+12.15] ooRexx> "straßssßßssse"~text~caselessPos("Ss", asList:, overlap:, aligned:.false)= a List (10 items) 0 : [+5.5,+6.7] 1 : [-5.6,+7.8] 2 : [+6.7,+8.9] 3 : [+7.8,-8.10] 4 : [+8.9,+9.11] 5 : [-8.10,-9.12] 6 : [+9.11,+10.13] 7 : [-9.12,+11.14] 8 : [+10.13,+12.15] 9 : [+11.14,+13.16] -------------- -- test case 7 -------------- -- pos, caselessPos /* -- 01 02 03 04 05 06 07 08 09 10 11 12 13 -- 0 1 2 3 4 -- 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 "straße noël👩👨👩👧🎅"~text~c2g= -- '73 74 72 61 C39F 65 20 6E 6F C3AB 6C F09F91A9E2808DF09F91A8E2808DF09F91A9E2808DF09F91A7 F09F8E85' -- | */ ooRexx> "👧🎅"~text~c2g= -- 'F09F91A7 F09F8E85' 'F09F91A7 F09F8E85' ooRexx> "👧🎅"~text~casefold~c2g= -- 'F09F91A7 F09F8E85' 'F09F91A7 F09F8E85' ooRexx> "straße noël👩👨👩👧🎅"~text~pos("👧🎅", 1, aligned:.false)= -- [-12.35,+14.43] [-12.35,+14.43] ooRexx> "straße noël👩👨👩👧🎅"~text~pos("👧🎅", 12, aligned:.false)= -- [-12.35,+14.43] [-12.35,+14.43] ooRexx> "straße noël👩👨👩👧🎅"~text~pos("👧🎅", 13, aligned:.false)= -- 0 0 ooRexx> "straße 
noël👩👨👩👧🎅"~text~pos("👧🎅", 13, asList:)= -- a List (0 items) a List (0 items) ooRexx> "straße noël👩👨👩👧🎅"~text~pos("👧🎅", 13, asList:, overlap:)= -- a List (0 items) a List (0 items) ooRexx> "straße noël👩👨👩👧🎅"~text~pos("👧🎅", asList:, aligned:.false)= a List (1 items) 0 : [-12.35,+14.43] ooRexx> "straße noël👩👨👩👧🎅"~text~pos("👧🎅", asList:, overlap:, aligned:.false)= a List (1 items) 0 : [-12.35,+14.43] ooRexx> "straße noël👩👨👩👧🎅"~text~caselessPos("👧🎅", 1, aligned:.false)= -- [-12.35,+14.43] [-12.35,+14.43] ooRexx> "straße noël👩👨👩👧🎅"~text~caselessPos("👧🎅", 12, aligned:.false)= -- [-12.35,+14.43] [-12.35,+14.43] ooRexx> "straße noël👩👨👩👧🎅"~text~caselessPos("👧🎅", 13, aligned:.false)= -- 0 0 ooRexx> "straße noël👩👨👩👧🎅"~text~caselessPos("👧🎅", asList:, aligned:.false)= a List (1 items) 0 : [-12.35,+14.43] ooRexx> "straße noël👩👨👩👧🎅"~text~caselessPos("👧🎅", asList:, overlap:, aligned:.false)= a List (1 items) 0 : [-12.35,+14.43] -- yes, 12.35, not 12.34 even if "ë" (2 bytes) becomes internally "e" (1 byte) -- because the indexes are external (relative to the target string, not related to the internal transformed string) ooRexx> "straße noël👩👨👩👧🎅"~text~caselessPos("👧🎅", 1, aligned:.false, stripMark:)= -- [-12.35,+14.43] [-12.34,+14.42] ooRexx> "straße noël👩👨👩👧🎅"~text~caselessPos("👧🎅", 12, aligned:.false, stripMark:)= -- [-12.35,+14.43] [-12.35,+14.43] ooRexx> "straße noël👩👨👩👧🎅"~text~caselessPos("👧🎅", 13, aligned:.false, stripMark:)= -- 0 0 ooRexx> "straße noël👩👨👩👧🎅"~text~caselessPos("👧🎅", asList:, aligned:.false, stripMark:)= a List (1 items) 0 : [-12.34,+14.42] ooRexx> "straße noël👩👨👩👧🎅"~text~caselessPos("👧🎅", asList:, overlap:, aligned:.false, stripMark:)= a List (1 items) 0 : [-12.34,+14.42] -------------- -- test case 8 -------------- -- casefold /* -- 01 02 03 04 05 06 07 08 09 10 11 12 13 14 -- 0 1 2 3 4 -- 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 "straße noël👩👨👩👧🎅"~text~casefold~c2g= -- '73 74 72 61 73 73 65 20 6E 6F C3AB 6C 
F09F91A9E2808DF09F91A8E2808DF09F91A9E2808DF09F91A7 F09F8E85' -- | */ -- here we get 13 because "ß" is replaced by "ss" before calling pos -- the byte position .35 is unchanged because "ß" is 2 bytes, as is "ss". ooRexx> "straße noël👩👨👩👧🎅"~text~casefold~pos("👧🎅", 1, aligned:.false)= -- [-13.35,+15.43] [-13.35,+15.43] ooRexx> "straße noël👩👨👩👧🎅"~text~casefold~pos("👧🎅", asList:, aligned:.false)= a List (1 items) 0 : [-13.35,+15.43] ooRexx> "straße noël👩👨👩👧🎅"~text~casefold~pos("👧🎅", asList:, overlap:, aligned:.false)= a List (1 items) 0 : [-13.35,+15.43] -- stripMark has no impact on the byte position because it's an internal transformation ooRexx> "straße noël👩👨👩👧🎅"~text~casefold~pos("👧🎅", 1, aligned:.false, stripMark:)= -- [-13.35,+15.43] [-13.34,+15.42] ooRexx> "straße noël👩👨👩👧🎅"~text~casefold~pos("👧🎅", asList:, aligned:.false, stripMark:)= a List (1 items) 0 : [-13.34,+15.42] ooRexx> "straße noël👩👨👩👧🎅"~text~casefold~pos("👧🎅", asList:, overlap:, aligned:.false, stripMark:)= a List (1 items) 0 : [-13.34,+15.42] -- here we get 13.34 because stripMark has an impact on the byte position: -- "ë" (2 bytes" becomes "e" (1 byte) before calling pos. ooRexx> "straße noël👩👨👩👧🎅"~text~casefold(stripMark:)~pos("👧🎅", 1, aligned:.false)= -- [-13.34,+15.42] [-13.34,+15.42] ooRexx> "straße noël👩👨👩👧🎅"~text~casefold(stripMark:)~pos("👧🎅", asList:, aligned:.false)= a List (1 items) 0 : [-13.34,+15.42] ooRexx> "straße noël👩👨👩👧🎅"~text~casefold(stripMark:)~pos("👧🎅", asList:, overlap:, aligned:.false)= a List (1 items) 0 : [-13.34,+15.42] -------------- -- test case 9 -------------- -- pos with a needle inside a grapheme of the haystack -- Raku consider there is no matching. 
ooRexx> "👨👩"~text~c2g= -- 'F09F91A8E2808DF09F91A9' 'F09F91A8E2808DF09F91A9' ooRexx> "straße noël👩👨👩👧🎅"~text~pos("👨👩")= -- 0 0 ooRexx> "straße noël👩👨👩👧🎅"~text~pos("👨👩", aligned:.false)= -- [-12.21,-12.32] [-12.21,-12.32] ooRexx> "straße noël👩👨👩👧🎅"~text~pos("👨👩", asList:, aligned:.false)= a List (1 items) 0 : [-12.21,-12.32] ooRexx> "straße noël👩👨👩👧🎅"~text~pos("👨👩", asList:, overlap:, aligned:.false)= a List (1 items) 0 : [-12.21,-12.32] --------------- -- test case 10 --------------- -- pos with ignorable (no internal transformation) -- TAG SPACE is ignorable /* -- 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 16 17 18 19 20 "TÊt\u{TAG SPACE}e sss sßs ss t\u{TAG SPACE}êTE"~text~unescape~c2g= -- '54 C38A 74F3A080A0 65 20 73 73 73 20 73 C39F 73 20 73 73 20 74F3A080A0 C3AA 54 45' -- T Ê t TAG SPAC e _ s s s _ s ß s _ s s _ t TAG SPAC ê T E -- | | | */ ooRexx> "TÊt\u{TAG SPACE}e sss sßs ss t\u{TAG SPACE}êTE"~text~unescape~pos("ss", asList:)~allItems= -- [ 6, 14] [ 6, 14] ooRexx> "TÊt\u{TAG SPACE}e sss sßs ss t\u{TAG SPACE}êTE"~text~unescape~pos("ss", asList:, overlap:)~allItems= -- [ 6, 7, 14] [ 6, 7, 14] ooRexx> "TÊt\u{TAG SPACE}e sss sßs ss t\u{TAG SPACE}êTE"~text~unescape~pos("ss", asList:, aligned:.false)= a List (2 items) 0 : [+6.11,+8.13] 1 : [+14.20,+16.22] ooRexx> "TÊt\u{TAG SPACE}e sss sßs ss t\u{TAG SPACE}êTE"~text~unescape~pos("ss", asList:, overlap:, aligned:.false)= a List (3 items) 0 : [+6.11,+8.13] 1 : [+7.12,+9.14] 2 : [+14.20,+16.22] -------------- -- test case 11 -------------- -- caselessPos with ignorable (apply casefold internally but returns external indexes) -- TAG SPACE is ignorable /* -- 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 16 17 18 19 20 "TÊt\u{TAG SPACE}e sss sßs ss t\u{TAG SPACE}êTE"~text~unescape~c2g= -- '54 C38A 74F3A080A0 65 20 73 73 73 20 73 C39F 73 20 73 73 20 74F3A080A0 C3AA 54 45' -- T Ê t TAG SPAC e _ s s s _ s ß s _ s s _ t TAG SPAC ê T E -- | | | | */ ooRexx> "TÊt\u{TAG SPACE}e sss sßs ss t\u{TAG 
SPACE}êTE"~text~unescape~caselessPos("ss", asList:)~allItems= -- [ 6, 11, 14] [ 6, 11, 14] ooRexx> "TÊt\u{TAG SPACE}e sss sßs ss t\u{TAG SPACE}êTE"~text~unescape~caselessPos("ss", asList:, overlap:)~allItems= -- [ 6, 7, 11, 14] [ 6, 7, 11, 14] ooRexx> "TÊt\u{TAG SPACE}e sss sßs ss t\u{TAG SPACE}êTE"~text~unescape~caselessPos("ss", asList:, aligned:.false)= a List (4 items) 0 : [+6.11,+8.13] 1 : [+10.15,-11.17] 2 : [-11.17,+13.19] 3 : [+14.20,+16.22] ooRexx> "TÊt\u{TAG SPACE}e sss sßs ss t\u{TAG SPACE}êTE"~text~unescape~caselessPos("ss", asList:, overlap:, aligned:.false)= a List (6 items) 0 : [+6.11,+8.13] 1 : [+7.12,+9.14] 2 : [+10.15,-11.17] 3 : [+11.16,+12.18] 4 : [-11.17,+13.19] 5 : [+14.20,+16.22] ooRexx> "TÊt\u{TAG SPACE}e sss sßs ss t\u{TAG SPACE}êTE"~text~unescape~caselessPos("te", asList:)~allItems= -- [ 19] [ 19] ooRexx> "TÊt\u{TAG SPACE}e sss sßs ss t\u{TAG SPACE}êTE"~text~unescape~caselessPos("te", asList:, overlap:)~allItems= -- [ 19] [ 19] ooRexx> "TÊt\u{TAG SPACE}e sss sßs ss t\u{TAG SPACE}êTE"~text~unescape~caselessPos("te", asList:, aligned:.false)= a List (1 items) 0 : [+19.30,+21.32] ooRexx> "TÊt\u{TAG SPACE}e sss sßs ss t\u{TAG SPACE}êTE"~text~unescape~caselessPos("te", asList:, overlap:, aligned:.false)= a List (1 items) 0 : [+19.30,+21.32] --------------- -- test case 12 --------------- -- pos with ignorable (apply casefold + stripMark internally but returns external indexes) -- TAG SPACE is ignorable ooRexx> "TÊt\u{TAG SPACE}e sss sßs ss t\u{TAG SPACE}êTE"~text~unescape~pos("te", stripMark:, asList:)= -- a List (0 items) a List (0 items) ooRexx> "TÊt\u{TAG SPACE}e sss sßs ss t\u{TAG SPACE}êTE"~text~unescape~pos("te", stripMark:, asList:, overlap:)= -- a List (0 items) a List (0 items) --------------- -- test case 13 --------------- -- caselessPos with ignorable (apply casefold + stripMark internally but returns external indexes) -- TAG SPACE is ignorable ooRexx> "TÊt\u{TAG SPACE}e sss sßs ss t\u{TAG SPACE}êTE"~text~unescape~caselessPos("te", 
stripMark:, asList:)~allItems= -- [ 1, 19] [ 1, 19] ooRexx> "TÊt\u{TAG SPACE}e sss sßs ss t\u{TAG SPACE}êTE"~text~unescape~caselessPos("te", stripMark:, asList:, overlap:)~allItems= -- [ 1, 19] [ 1, 19] ooRexx> "TÊt\u{TAG SPACE}e sss sßs ss t\u{TAG SPACE}êTE"~text~unescape~caselessPos("te", stripMark:, asList:, aligned:.false)= a List (2 items) 0 : [+1.1,+3.3] 1 : [+19.28,+21.30] ooRexx> "TÊt\u{TAG SPACE}e sss sßs ss t\u{TAG SPACE}êTE"~text~unescape~caselessPos("te", stripMark:, asList:, overlap:, aligned:.false)= a List (2 items) 0 : [+1.1,+3.3] 1 : [+19.28,+21.30] --------------- -- test case 14 --------------- -- caselessPos with ignorable (apply casefold + stripMark + stripIgnorable internally but returns external indexes) -- TAG SPACE is ignorable /* -- 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 16 17 18 19 20 "TÊt\u{TAG SPACE}e sss sßs ss t\u{TAG SPACE}êTE"~text~unescape~c2g= -- '54 C38A 74F3A080A0 65 20 73 73 73 20 73 C39F 73 20 73 73 20 74F3A080A0 C3AA 54 45' -- T Ê t TAG SPAC e _ s s s _ s ß s _ s s _ t TAG SPAC ê T E -- | | | | */ ooRexx> "TÊt\u{TAG SPACE}e sss sßs ss t\u{TAG SPACE}êTE"~text~unescape~caselessPos("te", stripMark:, stripIgnorable:, asList:)~allItems= -- [ 1, 3, 17, 19] [ 1, 3, 17, 19] ooRexx> "TÊt\u{TAG SPACE}e sss sßs ss t\u{TAG SPACE}êTE"~text~unescape~caselessPos("te", stripMark:, stripIgnorable:, asList:, overlap:)~allItems= -- [ 1, 3, 17, 19] [ 1, 3, 17, 19] ooRexx> "TÊt\u{TAG SPACE}e sss sßs ss t\u{TAG SPACE}êTE"~text~unescape~caselessPos("te", stripMark:, stripIgnorable:, asList:, aligned:.false)= a List (4 items) 0 : [+1.1,+3.3] 1 : [+3.3,+5.5] 2 : [+17.18,+19.20] 3 : [+19.20,+21.22] ooRexx> "TÊt\u{TAG SPACE}e sss sßs ss t\u{TAG SPACE}êTE"~text~unescape~caselessPos("te", stripMark:, stripIgnorable:, asList:, overlap:, aligned:.false)= a List (4 items) 0 : [+1.1,+3.3] 1 : [+3.3,+5.5] 2 : [+17.18,+19.20] 3 : [+19.20,+21.22] -- =============================================================================== -- 2023 Sep 06 /* Fix the 
implementation of caselessPos, pos. Was not returning the right position when the length of the string changed internally. Now the results are identical to Raku's (with a few exceptions). */ ooRexx> "Bundesstraße im Freiland"~text~pos("Freiland")= -- 17 17 ooRexx> "Bundesstraße im Freiland"~text~caselessPos("freiland")= -- 17 17 -------------- -- test case 1 -------------- -- pos (no internal transformation) /* -- 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 16 17 18 19 20 21 22 23 "Bundesstraße sss sßs ss"~text~c2g= -- '42 75 6E 64 65 73 73 74 72 61 C39F 65 20 73 73 73 20 73 C39F 73 20 73 73' -- B u n d e s s t r a ß e _ s s s _ s ß s _ s s -- | | | no overlap -- | | | | with overlap */ ooRexx> "Bundesstraße sss sßs ss"~text~pos("ss")= -- 6 6 ooRexx> "Bundesstraße sss sßs ss"~text~pos("ss", 7)= -- 14 14 ooRexx> "Bundesstraße sss sßs ss"~text~pos("ss", 15)= -- 15 (overlap) 15 ooRexx> "Bundesstraße sss sßs ss"~text~pos("ss", 16)= -- 22 22 ooRexx> "Bundesstraße sss sßs ss"~text~pos("ss", 23)= -- 0 0 -------------- -- test case 2 -------------- -- caselessPos (apply casefold internally but returns external indexes) /* -- 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 16 17 18 19 20 21 22 23 "Bundesstraße sss sßs ss"~text~c2g= -- '42 75 6E 64 65 73 73 74 72 61 C39F 65 20 73 73 73 20 73 C39F 73 20 73 73' -- B u n d e s s t r a ß e _ s s s _ s ß s _ s s -- | | | | | no overlap -- | | | | | | | with overlap */ ooRexx> "Bundesstraße sss sßs ss"~text~caselessPos("ss")= -- 6 6 ooRexx> "Bundesstraße sss sßs ss"~text~caselessPos("ss", 7)= -- 11 11 ooRexx> "Bundesstraße sss sßs ss"~text~caselessPos("ss", 12)= -- 14 14 ooRexx> "Bundesstraße sss sßs ss"~text~caselessPos("ss", 15)= -- 15 (overlap) 15 ooRexx> "Bundesstraße sss sßs ss"~text~caselessPos("ss", 16)= -- 19 (Raku doesn't return this index, am I wrong? sounds good to me...) 
19 ooRexx> "Bundesstraße sss sßs ss"~text~caselessPos("ss", 20)= -- 22 22 ooRexx> "Bundesstraße sss sßs ss"~text~caselessPos("ss", 23)= -- 0 0 -------------- -- test case 3 -------------- -- casefold~pos (the returned indexes are different from caselessPos because the string is transformed before calling ~pos) -- Use "ü" instead of "u" to have a non-ASCII string. -- Without "ü", the 'pos' method would forward to String. /* -- 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 "Bündesstraße sss sßs ss"~text~casefold~c2g= -- '62 C3BC 6E 64 65 73 73 74 72 61 73 73 65 20 73 73 73 20 73 73 73 73 20 73 73' -- b ü n d e s s t r a s s e _ s s s _ s s s s _ s s -- | | | | | | no overlap -- | | | | | | | | with overlap */ ooRexx> "Bündesstraße sss sßs ss"~text~casefold~pos("ss")= -- 6 6 ooRexx> "Bündesstraße sss sßs ss"~text~casefold~pos("ss", 7)= -- 11 11 ooRexx> "Bündesstraße sss sßs ss"~text~casefold~pos("ss", 12)= -- 15 15 ooRexx> "Bündesstraße sss sßs ss"~text~casefold~pos("ss", 16)= -- 16 (overlap) 16 ooRexx> "Bündesstraße sss sßs ss"~text~casefold~pos("ss", 17)= -- 19 19 ooRexx> "Bündesstraße sss sßs ss"~text~casefold~pos("ss", 20)= -- 20 (overlap) 20 ooRexx> "Bündesstraße sss sßs ss"~text~casefold~pos("ss", 21)= -- 21 (overlap) 21 ooRexx> "Bündesstraße sss sßs ss"~text~casefold~pos("ss", 22)= -- 24 24 ooRexx> "Bündesstraße sss sßs ss"~text~casefold~pos("ss", 25)= -- 0 0 -------------- -- test case 4 -------------- -- TAG SPACE is ignorable ooRexx> "TÊt\u{TAG SPACE}e"~text~unescape~length= -- 4 4 ooRexx> "TÊt\u{TAG SPACE}e"~text~unescape~c2g= -- '54 C38A 74F3A080A0 65' '54 C38A 74F3A080A0 65' ooRexx> "TÊt\u{TAG SPACE}e"~text~unescape~transform(stripIgnorable:)~c2g= -- '54 C38A 74 65' '54 C38A 74 65' -------------- -- test case 5 -------------- -- pos with ignorable (no internal transformation) /* -- 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 16 17 18 19 20 "TÊt\u{TAG SPACE}e sss sßs ss t\u{TAG SPACE}êTE"~text~unescape~c2g= -- '54 C38A 
74F3A080A0 65 20 73 73 73 20 73 C39F 73 20 73 73 20 74F3A080A0 C3AA 54 45' -- T Ê t TAG SPAC e _ s s s _ s ß s _ s s _ t TAG SPAC ê T E -- | | | */ ooRexx> "TÊt\u{TAG SPACE}e sss sßs ss t\u{TAG SPACE}êTE"~text~unescape~pos("ss")= -- 6 6 ooRexx> "TÊt\u{TAG SPACE}e sss sßs ss t\u{TAG SPACE}êTE"~text~unescape~pos("ss", 7)= -- 7 7 ooRexx> "TÊt\u{TAG SPACE}e sss sßs ss t\u{TAG SPACE}êTE"~text~unescape~pos("ss", 8)= -- 14 14 ooRexx> "TÊt\u{TAG SPACE}e sss sßs ss t\u{TAG SPACE}êTE"~text~unescape~pos("ss", 15)= -- 0 0 ooRexx> "TÊt\u{TAG SPACE}e sss sßs ss t\u{TAG SPACE}êTE"~text~unescape~pos("te")= -- 0 0 -------------- -- test case 6 -------------- -- caselessPos with ignorable (apply casefold internally but returns external indexes) /* -- 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 16 17 18 19 20 "TÊt\u{TAG SPACE}e sss sßs ss t\u{TAG SPACE}êTE"~text~unescape~c2g= -- '54 C38A 74F3A080A0 65 20 73 73 73 20 73 C39F 73 20 73 73 20 74F3A080A0 C3AA 54 45' -- T Ê t TAG SPAC e _ s s s _ s ß s _ s s _ t TAG SPAC ê T E -- | | | | | */ ooRexx> "TÊt\u{TAG SPACE}e sss sßs ss t\u{TAG SPACE}êTE"~text~unescape~caselessPos("ss")= -- 6 6 ooRexx> "TÊt\u{TAG SPACE}e sss sßs ss t\u{TAG SPACE}êTE"~text~unescape~caselessPos("ss", 7)= -- 7 (overlap) 7 ooRexx> "TÊt\u{TAG SPACE}e sss sßs ss t\u{TAG SPACE}êTE"~text~unescape~caselessPos("ss", 8)= -- 11 11 ooRexx> "TÊt\u{TAG SPACE}e sss sßs ss t\u{TAG SPACE}êTE"~text~unescape~caselessPos("ss", 12)= -- 14 14 ooRexx> "TÊt\u{TAG SPACE}e sss sßs ss t\u{TAG SPACE}êTE"~text~unescape~caselessPos("ss", 15)= -- 0 0 ooRexx> "TÊt\u{TAG SPACE}e sss sßs ss t\u{TAG SPACE}êTE"~text~unescape~caselessPos("te")= -- 19 19 ooRexx> "TÊt\u{TAG SPACE}e sss sßs ss t\u{TAG SPACE}êTE"~text~unescape~caselessPos("te", 20)= -- 0 0 -------------- -- test case 7 -------------- -- pos with ignorable (apply stripMark internally but returns external indexes) ooRexx> "TÊt\u{TAG SPACE}e sss sßs ss t\u{TAG SPACE}êTE"~text~unescape~pos("te", stripMark:)= -- 0 0 -------------- 
-- test case 8 -------------- -- caselessPos with ignorable (apply casefold + stripMark internally but returns external indexes) ooRexx> "TÊt\u{TAG SPACE}e sss sßs ss t\u{TAG SPACE}êTE"~text~unescape~caselessPos("te", stripMark:)= -- 1 1 ooRexx> "TÊt\u{TAG SPACE}e sss sßs ss t\u{TAG SPACE}êTE"~text~unescape~caselessPos("te", 2, stripMark:)= -- 19 19 ooRexx> "TÊt\u{TAG SPACE}e sss sßs ss t\u{TAG SPACE}êTE"~text~unescape~caselessPos("te", 20, stripMark:)= -- 0 0 -------------- -- test case 9 -------------- -- caselessPos with ignorable (apply casefold + stripMark + stripIgnorable internally but returns external indexes) /* -- 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 16 17 18 19 20 "TÊt\u{TAG SPACE}e sss sßs ss t\u{TAG SPACE}êTE"~text~unescape~c2g= -- '54 C38A 74F3A080A0 65 20 73 73 73 20 73 C39F 73 20 73 73 20 74F3A080A0 C3AA 54 45' -- T Ê t TAG SPAC e _ s s s _ s ß s _ s s _ t TAG SPAC ê T E -- | | | | */ ooRexx> "TÊt\u{TAG SPACE}e sss sßs ss t\u{TAG SPACE}êTE"~text~unescape~caselessPos("te", stripMark:, stripIgnorable:)= -- 1 1 ooRexx> "TÊt\u{TAG SPACE}e sss sßs ss t\u{TAG SPACE}êTE"~text~unescape~caselessPos("te", 2, stripMark:, stripIgnorable:)= -- 3 3 ooRexx> "TÊt\u{TAG SPACE}e sss sßs ss t\u{TAG SPACE}êTE"~text~unescape~caselessPos("te", 4, stripMark:, stripIgnorable:)= -- 17 17 ooRexx> "TÊt\u{TAG SPACE}e sss sßs ss t\u{TAG SPACE}êTE"~text~unescape~caselessPos("te", 18, stripMark:, stripIgnorable:)= -- 19 19 ooRexx> "TÊt\u{TAG SPACE}e sss sßs ss t\u{TAG SPACE}êTE"~text~unescape~caselessPos("te", 20, stripMark:, stripIgnorable:)= -- 0 0 -- =============================================================================== -- 2023 Aug 29 /* Implementation of caselessContains, contains: (forwards to caselessPos or pos, and returns .true if result <> 0) (was already implemented, waiting for 'pos' implementation) Examples: */ ooRexx> "Père Noël Père Noël"~text~contains("oë")= -- .true 1 ooRexx> "Père Noël Père Noël"~text~contains("oë", , 7)= -- .false 0 ooRexx> "Père Noël Père 
Noël"~text~contains("oë", , 8)= -- .true 1 ooRexx> "Père Noël Père Noël"~text~contains("oë", 8)= -- .true 1 ooRexx> "Père Noël Père Noël"~text~contains("oë", 8, 10)= -- .false 0 ooRexx> "Père Noël Père Noël"~text~contains("oë", 8, 11)= -- .true 1 ooRexx> "Père Noël Père Noël"~text~caselessContains("OË", 8, 11)= -- .true 1 ooRexx> "noël👩👨👩👧🎅"~text~contains("👧🎅")= -- .false 0 ooRexx> "noël👩👨👩👧🎅"~text~contains("👧🎅", aligned:.false)= -- .true 1 ooRexx> "noël👩👨👩👧🎅"~text~contains("👩👨👩👧🎅", aligned:.false)= -- .true 1 -- =============================================================================== -- 2023 Aug 28 /* Add a named argument 'aligned' to caselessPos, pos: - If aligned=.true (default) then return the first character position in the untransformed haystack such as all the bytes of the transformed needle are matched with corresponding bytes in the transformed haystack AND the first and last byte positions are aligned with character positions. If no match then return 0. - If aligned=.false then return a couple (array) of numbers +/-posC.posB where posB is the position of the matched byte in the transformed haystack, and posC is the corresponding grapheme position in the untransformed haystack. A number is negative if the byte position is not aligned with the corresponding character position. The first number is the start of the matching. The second number is the end of the matching + 1. aligned=.false is intended for analysis of matchings and [non-]regression tests. Otherwise, I don't see any use. 
Example: */ ooRexx> "noël👩‍👨‍👩‍👧🎅"~text~pos("👧🎅")= -- 0 0 ooRexx> "noël👩‍👨‍👩‍👧🎅"~text~pos("👧🎅", aligned:.false)= -- [-5.27,+7.35] [-5.27,+7.35] ooRexx> "noël👩‍👨‍👩‍👧🎅"~text~pos("👩‍👨‍👩‍👧🎅", aligned:.false)= -- [+5.6,+7.35] [+5.6,+7.35] /* Comparison operators: Take into account the default normalization managed by the .Unicode class */ ooRexx> .Unicode~normalizationName(.Unicode~defaultNormalization(strict:.true))= -- NFC when strict 'NFC' ooRexx> .Unicode~normalizationName(.Unicode~defaultNormalization(strict:.false))= -- NFKD when not strict 'NFKD' /* Example: */ ooRexx> ("baffle"~text == "baffle"~text) = -- false 0 ooRexx> ("baffle"~text = "baffle"~text) = -- true 1 /* Reminder: the non-strict mode supports all the Unicode spaces, not just U+0020. */ ooRexx> string1 = " Le\u{IDEOGRAPHIC SPACE}Pè\u{ZERO-WIDTH-SPACE}re\u{HYPHEN}Noël"~text~unescape ooRexx> string2 = "Le\u{OGHAM SPACE MARK}Père\u{EN DASH}No\u{ZERO-WIDTH-SPACE}ël "~text~unescape ooRexx> (string1 == string2) = -- false 0 ooRexx> (string1 = string2) = -- true 1 -- =============================================================================== -- 2023 Aug 26 ooRexx> t = "noël👩‍👨‍👩‍👧🎅"~text; t~c2g= -- '6E 6F C3AB 6C F09F91A9E2808DF09F91A8E2808DF09F91A9E2808DF09F91A7 F09F8E85' '6E 6F C3AB 6C F09F91A9E2808DF09F91A8E2808DF09F91A9E2808DF09F91A7 F09F8E85' ooRexx> t = "noël👩‍👨‍👩‍👧🎅"~text; do indexB=1 to t~string~length + 2; indexC = t~indexer~characterIndexC(indexB); character = t~character(abs(indexC)); say "indexB" indexB~right(3) "--> indexC" indexC~right(4) " " character~c2x; end indexB 1 --> indexC 1 6E indexB 2 --> indexC 2 6F indexB 3 --> indexC 3 C3AB indexB 4 --> indexC -3 C3AB indexB 5 --> indexC 4 6C indexB 6 --> indexC 5 F09F91A9 E2808D F09F91A8 E2808D F09F91A9 E2808D F09F91A7 indexB 7 --> indexC -5 F09F91A9 E2808D F09F91A8 E2808D F09F91A9 E2808D F09F91A7 indexB 8 --> indexC -5 F09F91A9 E2808D F09F91A8 E2808D F09F91A9 E2808D F09F91A7 indexB 9 --> indexC -5 F09F91A9 E2808D F09F91A8 E2808D F09F91A9 E2808D F09F91A7 
indexB 10 --> indexC -5 F09F91A9 E2808D F09F91A8 E2808D F09F91A9 E2808D F09F91A7 indexB 11 --> indexC -5 F09F91A9 E2808D F09F91A8 E2808D F09F91A9 E2808D F09F91A7 indexB 12 --> indexC -5 F09F91A9 E2808D F09F91A8 E2808D F09F91A9 E2808D F09F91A7 indexB 13 --> indexC -5 F09F91A9 E2808D F09F91A8 E2808D F09F91A9 E2808D F09F91A7 indexB 14 --> indexC -5 F09F91A9 E2808D F09F91A8 E2808D F09F91A9 E2808D F09F91A7 indexB 15 --> indexC -5 F09F91A9 E2808D F09F91A8 E2808D F09F91A9 E2808D F09F91A7 indexB 16 --> indexC -5 F09F91A9 E2808D F09F91A8 E2808D F09F91A9 E2808D F09F91A7 indexB 17 --> indexC -5 F09F91A9 E2808D F09F91A8 E2808D F09F91A9 E2808D F09F91A7 indexB 18 --> indexC -5 F09F91A9 E2808D F09F91A8 E2808D F09F91A9 E2808D F09F91A7 indexB 19 --> indexC -5 F09F91A9 E2808D F09F91A8 E2808D F09F91A9 E2808D F09F91A7 indexB 20 --> indexC -5 F09F91A9 E2808D F09F91A8 E2808D F09F91A9 E2808D F09F91A7 indexB 21 --> indexC -5 F09F91A9 E2808D F09F91A8 E2808D F09F91A9 E2808D F09F91A7 indexB 22 --> indexC -5 F09F91A9 E2808D F09F91A8 E2808D F09F91A9 E2808D F09F91A7 indexB 23 --> indexC -5 F09F91A9 E2808D F09F91A8 E2808D F09F91A9 E2808D F09F91A7 indexB 24 --> indexC -5 F09F91A9 E2808D F09F91A8 E2808D F09F91A9 E2808D F09F91A7 indexB 25 --> indexC -5 F09F91A9 E2808D F09F91A8 E2808D F09F91A9 E2808D F09F91A7 indexB 26 --> indexC -5 F09F91A9 E2808D F09F91A8 E2808D F09F91A9 E2808D F09F91A7 indexB 27 --> indexC -5 F09F91A9 E2808D F09F91A8 E2808D F09F91A9 E2808D F09F91A7 indexB 28 --> indexC -5 F09F91A9 E2808D F09F91A8 E2808D F09F91A9 E2808D F09F91A7 indexB 29 --> indexC -5 F09F91A9 E2808D F09F91A8 E2808D F09F91A9 E2808D F09F91A7 indexB 30 --> indexC -5 F09F91A9 E2808D F09F91A8 E2808D F09F91A9 E2808D F09F91A7 indexB 31 --> indexC 6 F09F8E85 indexB 32 --> indexC -6 F09F8E85 indexB 33 --> indexC -6 F09F8E85 indexB 34 --> indexC -6 F09F8E85 indexB 35 --> indexC 7 indexB 36 --> indexC 7 -- Implementation of caselessCompare, compare -- ------------------------------------------ ooRexx> 
"hello"~text~compare("hello")= -- 0 0 ooRexx> "hello"~text~compare("helloo")= -- 6 6 ooRexx> "hello"~text~compare("hellô")= -- 5 5 ooRexx> "hello"~text~caselessCompare("hellô",stripMark:)= -- 0 0 ooRexx> "hellÔ"~text~caselessCompare("hellô")= -- 0 0 ooRexx> "hellÔ"~text~caselessCompare("")= -- 1 1 ooRexx> "hellÔ"~text~caselessCompare("", "h")= -- 2 2 ooRexx> zwsp = "\u{ZERO WIDTH SPACE}"~text~unescape -- ignorable ooRexx> ("he"zwsp"llo")~compare("hellô")= -- 3 (ok) 3 ooRexx> ("he"zwsp"llo")~compare("hellô", stripIgnorable:)= -- 6 (ok? not 5 because the ignorable character count as a character) 6 -- casefold 2 characters: "ß" becomes "ss" ooRexx> "Bundesstraße im Freiland"~text~caselessCompare("Bundesstraße")= -- 14 (good) 14 ooRexx> "Bundesstraße im Freiland"~text~caselessCompare("Bundesstraße", "_")= -- 13 (good) 13 ooRexx> "Bundesstraße im Freiland"~text~caselessCompare("bundesstrasse")= -- 14 (good) 14 ooRexx> "Bundesstrasse im Freiland"~text~caselessCompare("bundesstraße")= -- 15 (good) 15 ooRexx> "straßssßßssse"~text~compare("stra", "ß")= -- 6 (good) 6 ooRexx> "straßssßßssse"~text~caselessCompare("stra", "ß")= -- 12 (not 13 because the last 's' match half of the pad 'ss') 12 /* This test case is a little bit strange because: - the case-folded character looks identical to the original character. - the normalization and the casefold have the same effect. */ -- casefold 3 characters: "ΐ" 'U+0390' becomes "ΐ" 'U+03B9 U+0308 U+0301' ooRexx> iota_dt = "\u{GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS}"~text~unescape ooRexx> iota_dt~casefold~UnicodeCharacters== an Array (shape [3], 3 items) 1 : ( "ι" U+03B9 Ll 1 "GREEK SMALL LETTER IOTA" ) 2 : ( "̈" U+0308 Mn 0 "COMBINING DIAERESIS" ) 3 : ( "́" U+0301 Mn 0 "COMBINING ACUTE ACCENT" ) ooRexx> ("a" iota_dt "b")~compare("a")= -- 3 3 ooRexx> ("a" iota_dt "b")~compare("a" iota_dt)= -- 5 5 ooRexx> ("a" iota_dt~casefold "b")~compare("a" iota_dt)= -- 5 (yes! 
not 3 because the default NFC transforms iota_dt~casefold 'U+03B9 U+0308 U+0301' into 'U+0390') 5 ooRexx> ("a" iota_dt~casefold "b")~compare("a" iota_dt, normalization: .Unicode~NFD)= -- 5 (yes! not 3 because NFD transforms iota_dt 'U+0390' into 'U+03B9 U+0308 U+0301' 5 ooRexx> ("a" iota_dt~casefold "b")~compare("a" iota_dt, normalization: 0)= -- 3 because normalization deactivated 3 ooRexx> ("a" iota_dt "b")~caselessCompare("a")= -- 3 3 ooRexx> ("a" iota_dt "b")~caselessCompare("a" iota_dt)= -- 5 5 ooRexx> ("a" iota_dt "b")~caselessCompare("a ", iota_dt)= -- 4 4 -- Implementation of caselessEndsWith, endsWith -- -------------------------------------------- ooRexx> "hello"~text~endsWith("")= -- false 0 ooRexx> "hello"~text~endsWith("o")= -- true 1 ooRexx> "hello"~text~endsWith("ô")= -- false 0 ooRexx> "hello"~text~endsWith("ô", stripMark:)= -- true 1 ooRexx> "hello"~text~endsWith("O")= -- false 0 ooRexx> "hello"~text~caselessEndsWith("O")= -- true 1 -- Rework implementation of caselessMatchChar, matchChar -- ----------------------------------------------------- ooRexx> "BAFFLE"~text~caselessMatchChar(3, "ffl")= -- 0, was 1 before 2023.12.04 "ffl" becomes "ffl" (3 graphemes), there is a match on "f" at 3 0 ooRexx> "BAFFLE"~text~caselessMatchChar(5, "ffl")= -- 0, was 1 before 2023.12.04 "ffl" becomes "ffl" (3 graphemes), there is a match on "l" at 5 0 ooRexx> "baffle"~text~caselessMatchChar(5, "L")= -- 1 there is a match on "l" at 5 (forward to string) 1 ooRexx> "baffle"~text~caselessMatchChar(3, "ffl")= -- 1 "ffl" at 3 (1 grapheme) becomes "ffl" (3 graphemes), there is a match on "l" 1 ooRexx> "baffle"~text~caselessMatchChar(3, "F")= -- 0, was 1 before 2023.12.04 "ffl" at 3 (1 grapheme) becomes "ffl" (3 graphemes), there is a match on "f" 0 ooRexx> "baffle"~text~caselessMatchChar(3, "L")= -- 0, was 1 before 2023.12.04 "ffl" at 3 (1 grapheme) becomes "ffl" (3 graphemes), there is a match on "l" 0 ooRexx> "baffle"~text~caselessMatchChar(4, "E")= -- 1 the grapheme at 4 
is "e", not "f". There is a match with "e" 1 -- Rework implementation of caselessCompareTo, compareTo -- ----------------------------------------------------- ooRexx> "Père Noël"~text~nfc~compareTo("Père Noël"~text~nfc)= -- 0 (equal) 0 ooRexx> "Père Noël"~text~nfc~compareTo("Père Noël"~text~nfd)= -- 0 (equal) 0 ooRexx> "Père Noël"~text~nfd~compareTo("Père Noël"~text~nfc)= -- 0 (equal) 0 ooRexx> "Père Noël"~text~nfd~compareTo("Père Noël"~text~nfd)= -- 0 (equal) 0 ooRexx> --- ooRexx> "Pere Noël"~text~nfc~compareTo("Père Noel"~text~nfc, stripMark:)= -- 0 (equal) 0 ooRexx> "Pere Noël"~text~nfc~compareTo("Père Noel"~text~nfd, stripMark:)= -- 0 (equal) 0 ooRexx> "Pere Noël"~text~nfd~compareTo("Père Noel"~text~nfc, stripMark:)= -- 0 (equal) 0 ooRexx> "Pere Noël"~text~nfd~compareTo("Père Noel"~text~nfd, stripMark:)= -- 0 (equal) 0 ooRexx> --- ooRexx> "1st Père Noël"~text~nfc~compareTo("2nd Père Noël"~text~nfc)= -- -1 (lesser) -1 ooRexx> "1st Père Noël"~text~nfc~compareTo("2nd Père Noël"~text~nfd)= -- -1 (lesser) -1 ooRexx> "1st Père Noël"~text~nfd~compareTo("2nd Père Noël"~text~nfc)= -- -1 (lesser) -1 ooRexx> "1st Père Noël"~text~nfd~compareTo("2nd Père Noël"~text~nfd)= -- -1 (lesser) -1 ooRexx> --- ooRexx> "Père Noël 2nd"~text~nfc~compareTo("Père Noël 1st"~text~nfc)= -- 1 (greater) 1 ooRexx> "Père Noël 2nd"~text~nfc~compareTo("Père Noël 1st"~text~nfd)= -- 1 (greater) 1 ooRexx> "Père Noël 2nd"~text~nfd~compareTo("Père Noël 1st"~text~nfc)= -- 1 (greater) 1 ooRexx> "Père Noël 2nd"~text~nfd~compareTo("Père Noël 1st"~text~nfd)= -- 1 (greater) 1 ooRexx> --- ooRexx> "Pere Noël"~text~nfc~compareTo("Père Noel"~text~nfc, 3, 4)= -- 0 (equal) 0 ooRexx> "Pere Noël"~text~nfc~compareTo("Père Noel"~text~nfd, 3, 4)= -- 0 (equal) 0 ooRexx> "Pere Noël"~text~nfd~compareTo("Père Noel"~text~nfc, 3, 4)= -- 0 (equal) 0 ooRexx> "Pere Noël"~text~nfd~compareTo("Père Noel"~text~nfd, 3, 4)= -- 0 (equal) 0 ooRexx> --- ooRexx> "PÈRE NOËL"~text~nfc~compareTo("Père Noël"~text~nfc)= -- -1 (lesser) -1 
ooRexx> "PÈRE NOËL"~text~nfc~compareTo("Père Noël"~text~nfd)= -- -1 (lesser) -1 ooRexx> "PÈRE NOËL"~text~nfd~compareTo("Père Noël"~text~nfc)= -- -1 (lesser) -1 ooRexx> "PÈRE NOËL"~text~nfd~compareTo("Père Noël"~text~nfd)= -- -1 (lesser) -1 ooRexx> --- ooRexx> "PÈRE NOËL"~text~nfc~caselessCompareTo("Père Noël"~text~nfc)= -- 0 (equal) 0 ooRexx> "PÈRE NOËL"~text~nfc~caselessCompareTo("Père Noël"~text~nfd)= -- 0 (equal) 0 ooRexx> "PÈRE NOËL"~text~nfd~caselessCompareTo("Père Noël"~text~nfc)= -- 0 (equal) 0 ooRexx> "PÈRE NOËL"~text~nfd~caselessCompareTo("Père Noël"~text~nfd)= -- 0 (equal) 0 ooRexx> --- ooRexx> "PERE NOËL"~text~nfc~caselessCompareTo("Père Noel"~text~nfc, 3, 4)= -- 0 (equal) 0 ooRexx> "PERE NOËL"~text~nfc~caselessCompareTo("Père Noel"~text~nfd, 3, 4)= -- 0 (equal) 0 ooRexx> "PERE NOËL"~text~nfd~caselessCompareTo("Père Noel"~text~nfc, 3, 4)= -- 0 (equal) 0 ooRexx> "PERE NOËL"~text~nfd~caselessCompareTo("Père Noel"~text~nfd, 3, 4)= -- 0 (equal) 0 -- Implementation of caselessPos, pos -- ---------------------------------- /* -- P è r e _ N o ë l -- 1 2 3 4 5 6 7 8 9 -- NFC '50 C3A8 72 65 20 4E 6F C3AB 6C' -- 1 2 3 4 5 6 7 8 9 10 11 -- NFD '50 65 CC80 72 65 20 4E 6F 65 CC88 6C' -- 1 2 3 4 5 6 7 8 9 19 1112 13 */ ooRexx> -- self needle ooRexx> "Père Noël Père Noël"~text~pos("l")= -- 9 NFC, NFC 9 ooRexx> "Père Noël Père Noël" ~pos("l")= -- 9 NFC, NFC (was 11 before automatic conversion of string literals to text) 9 ooRexx> "Père Noël Père Noël"~text~pos("l", , 8)= -- 0 NFC, NFC 0 ooRexx> "Père Noël Père Noël" ~pos("l", , 10)= -- 9 NFC, NFC (was 0 before automatic conversion of string literals to text) 9 ooRexx> "Père Noël Père Noël"~text~pos("l", , 9)= -- 9 NFC, NFC 9 ooRexx> "Père Noël Père Noël" ~pos("l", , 11)= -- 9 NFC, NFC (was 11 before automatic conversion of string literals to text) 9 ooRexx> "Père Noël Père Noël"~text~pos("l", 10)= -- 19 NFC, NFC 19 ooRexx> "Père Noël Père Noël" ~pos("l", 12)= -- 19 NFC, NFC (was 23 before automatic conversion of string 
literals to text) 19 ooRexx> "Père Noël Père Noël"~text~pos("l", 10, 9)= -- 0 NFC, NFC 0 ooRexx> "Père Noël Père Noël" ~pos("l", 12, 11)= -- 19 NFC, NFC (was 0 before automatic conversion of string literals to text) 19 ooRexx> "Père Noël Père Noël"~text~pos("l", 10, 10)= -- 19 NFC, NFC 19 ooRexx> "Père Noël Père Noël" ~pos("l", 12, 12)= -- 19 NFC, NFC (was 23 before automatic conversion of string literals to text) 19 ooRexx> --- ooRexx> "Père Noël Père Noël"~text~pos("l")= -- 9 NFD, NFC 9 ooRexx> "Père Noël Père Noël" ~pos("l")= -- 9 NFD, NFC (was 13 before automatic conversion of string literals to text) 9 ooRexx> "Père Noël Père Noël"~text~pos("l", , 8)= -- 0 NFD, NFC 0 ooRexx> "Père Noël Père Noël" ~pos("l", , 12)= -- 9 NFD, NFC (was 0 before automatic conversion of string literals to text) 9 ooRexx> "Père Noël Père Noël"~text~pos("l", , 9)= -- 9 NFD, NFC 9 ooRexx> "Père Noël Père Noël" ~pos("l", , 13)= -- 9 NFD, NFC (was 13 before automatic conversion of string literals to text) 9 ooRexx> "Père Noël Père Noël"~text~pos("l", 10)= -- 19 NFD, NFC 19 ooRexx> "Père Noël Père Noël" ~pos("l", 14)= -- 19 NFD, NFC (was 27 before automatic conversion of string literals to text) 19 ooRexx> "Père Noël Père Noël"~text~pos("l", 10, 9)= -- 0 NFD, NFC 0 ooRexx> "Père Noël Père Noël" ~pos("l", 14, 13)= -- 19 NFD, NFC (was 0 before automatic conversion of string literals to text) 19 ooRexx> "Père Noël Père Noël"~text~pos("l", 10, 10)= -- 19 NFD, NFC 19 ooRexx> "Père Noël Père Noël" ~pos("l", 14, 14)= -- 19 NFD, NFC (was 27 before automatic conversion of string literals to text) 19 ooRexx> --- ooRexx> "Père Noël Père Noël"~text~pos("oë")= -- 7 NFC, NFC 7 ooRexx> "Père Noël Père Noël" ~pos("oë")= -- 7 NFC, NFC (was 8 before automatic conversion of string literals to text) 7 ooRexx> "Père Noël Père Noël"~text~pos("oë", , 7)= -- 0 NFC, NFC 0 ooRexx> "Père Noël Père Noël" ~pos("oë", , 9)= -- 7 NFC, NFC (was 0 before automatic conversion of string literals to text) 7 ooRexx> "Père 
Noël Père Noël"~text~pos("oë", , 8)= -- 7 NFC, NFC 7 ooRexx> "Père Noël Père Noël" ~pos("oë", , 10)= -- 7 NFC, NFC (was 8 before automatic conversion of string literals to text) 7 ooRexx> "Père Noël Père Noël"~text~pos("oë", 8)= -- 17 NFC, NFC 17 ooRexx> "Père Noël Père Noël" ~pos("oë", 9)= -- 17 NFC, NFC (was 20 before automatic conversion of string literals to text) 17 ooRexx> "Père Noël Père Noël"~text~pos("oë", 8, 10)= -- 0 NFC, NFC 0 ooRexx> "Père Noël Père Noël" ~pos("oë", 9, 13)= -- 17 NFC, NFC (was 0 before automatic conversion of string literals to text) 17 ooRexx> "Père Noël Père Noël"~text~pos("oë", 8, 11)= -- 17 NFC, NFC 17 ooRexx> "Père Noël Père Noël" ~pos("oë", 9, 14)= -- 17 NFC, NFC (was 20 before automatic conversion of string literals to text) 17 ooRexx> --- ooRexx> "Père Noël Père Noël"~text~pos("oë")= -- 7 NFD, NFC 7 ooRexx> "Père Noël Père Noël" ~pos("oë")= -- 7 NFD, NFC (was "always 0, no need to test all the combinations" before automatic conversion of string literals to text) 7 ooRexx> "Père Noël Père Noël"~text~pos("oë", , 7)= -- 0 NFD, NFC 0 ooRexx> "Père Noël Père Noël"~text~pos("oë", , 8)= -- 7 NFD, NFC 7 ooRexx> "Père Noël Père Noël"~text~pos("oë", 8)= -- 17 NFD, NFC 17 ooRexx> "Père Noël Père Noël"~text~pos("oë", 8, 10)= -- 0 NFD, NFC 0 ooRexx> "Père Noël Père Noël"~text~pos("oë", 8, 11)= -- 17 NFD, NFC 17 ooRexx> --- ooRexx> "Père Noël Père Noël"~text~pos("oë")= -- 7 NFC, NFD 7 ooRexx> "Père Noël Père Noël" ~pos("oë")= -- 7 NFC, NFD (was "always 0, no need to test all the combinations" before automatic conversion of string literals to text) 7 ooRexx> "Père Noël Père Noël"~text~pos("oë", , 7)= -- 0 NFC, NFD 0 ooRexx> "Père Noël Père Noël"~text~pos("oë", , 8)= -- 7 NFC, NFD 7 ooRexx> "Père Noël Père Noël"~text~pos("oë", 8)= -- 17 NFC, NFD 17 ooRexx> "Père Noël Père Noël"~text~pos("oë", 8, 10)= -- 0 NFC, NFD 0 ooRexx> "Père Noël Père Noël"~text~pos("oë", 8, 11)= -- 17 NFC, NFD 17 ooRexx> --- ooRexx> "Père Noël Père Noël"~text~pos("oë")= 
-- 7 NFD, NFD 7 ooRexx> "Père Noël Père Noël" ~pos("oë")= -- 7 NFD, NFD (was 9 before automatic conversion of string literals to text) 7 ooRexx> "Père Noël Père Noël"~text~pos("oë", , 7)= -- 0 NFD, NFD 0 ooRexx> "Père Noël Père Noël" ~pos("oë", , 11)= -- 7 NFD, NFD (was 0 before automatic conversion of string literals to text) 7 ooRexx> "Père Noël Père Noël"~text~pos("oë", , 8)= -- 7 NFD, NFD 7 ooRexx> "Père Noël Père Noël" ~pos("oë", , 12)= -- 7 NFD, NFD (was 9 before automatic conversion of string literals to text) 7 ooRexx> "Père Noël Père Noël"~text~pos("oë", 8)= -- 17 NFD, NFD 17 ooRexx> "Père Noël Père Noël" ~pos("oë", 10)= -- 17 NFD, NFD (was 23 before automatic conversion of string literals to text) 17 ooRexx> "Père Noël Père Noël"~text~pos("oë", 8, 10)= -- 0 NFD, NFD 0 ooRexx> "Père Noël Père Noël" ~pos("oë", 10, 16)= -- 17 NFD, NFD (was 0 before automatic conversion of string literals to text) 17 ooRexx> "Père Noël Père Noël"~text~pos("oë", 8, 11)= -- 17 NFD, NFD 17 ooRexx> "Père Noël Père Noël" ~pos("oë", 10, 17)= -- 17 NFD, NFD (was 23 before automatic conversion of string literals to text) 17 ooRexx> --- ooRexx> "Père Noël Père Noël"~text~pos("oe")= -- 0 NFC, NFC always 0, no need to test all the combinations 0 ooRexx> "Père Noël Père Noël"~text~pos("oe", stripMark:)= -- 7 NFC, NFC 7 ooRexx> "Père Noël Père Noël"~text~pos("oe", , 7, stripMark:)= -- 0 NFC, NFC 0 ooRexx> "Père Noël Père Noël"~text~pos("oe", , 8, stripMark:)= -- 7 NFC, NFC 7 ooRexx> "Père Noël Père Noël"~text~pos("oe", 8, stripMark:)= -- 17 NFC, NFC 17 ooRexx> "Père Noël Père Noël"~text~pos("oe", 8, 10, stripMark:)=-- 0 NFC, NFC 0 ooRexx> "Père Noël Père Noël"~text~pos("oe", 8, 11, stripMark:)=-- 17 NFC, NFC 17 ooRexx> --- ooRexx> -- caseless tests not in the diary: ooRexx> --- ooRexx> "Père Noël Père Noël"~text~caselessPos("L")= -- 9 NFC, NFC 9 ooRexx> "Père Noël Père Noël" ~caselessPos("L")= -- 9 NFC, NFC (was 11 before automatic conversion of string literals to text) 9 ooRexx> "Père 
Noël Père Noël"~text~caselessPos("L", , 8)= -- 0 NFC, NFC 0 ooRexx> "Père Noël Père Noël" ~caselessPos("L", , 10)= -- 9 NFC, NFC (was 0 before automatic conversion of string literals to text) 9 ooRexx> "Père Noël Père Noël"~text~caselessPos("L", , 9)= -- 9 NFC, NFC 9 ooRexx> "Père Noël Père Noël" ~caselessPos("L", , 11)= -- 9 NFC, NFC (was 11 before automatic conversion of string literals to text) 9 ooRexx> "Père Noël Père Noël"~text~caselessPos("L", 10)= -- 19 NFC, NFC 19 ooRexx> "Père Noël Père Noël" ~caselessPos("L", 12)= -- 19 NFC, NFC (was 23 before automatic conversion of string literals to text) 19 ooRexx> "Père Noël Père Noël"~text~caselessPos("L", 10, 9)= -- 0 NFC, NFC 0 ooRexx> "Père Noël Père Noël" ~caselessPos("L", 12, 11)= -- 19 NFC, NFC (was 0 before automatic conversion of string literals to text) 19 ooRexx> "Père Noël Père Noël"~text~caselessPos("L", 10, 10)= -- 19 NFC, NFC 19 ooRexx> "Père Noël Père Noël" ~caselessPos("L", 12, 12)= -- 19 NFC, NFC (was 23 before automatic conversion of string literals to text) 19 ooRexx> --- ooRexx> "Père Noël Père Noël"~text~caselessPos("L")= -- 9 NFD, NFC 9 ooRexx> "Père Noël Père Noël" ~caselessPos("L")= -- 9 NFD, NFC (was 13 before automatic conversion of string literals to text) 9 ooRexx> "Père Noël Père Noël"~text~caselessPos("L", , 8)= -- 0 NFD, NFC 0 ooRexx> "Père Noël Père Noël" ~caselessPos("L", , 12)= -- 9 NFD, NFC (was 0 before automatic conversion of string literals to text) 9 ooRexx> "Père Noël Père Noël"~text~caselessPos("L", , 9)= -- 9 NFD, NFC 9 ooRexx> "Père Noël Père Noël" ~caselessPos("L", , 13)= -- 9 NFD, NFC (was 13 before automatic conversion of string literals to text) 9 ooRexx> "Père Noël Père Noël"~text~caselessPos("L", 10)= -- 19 NFD, NFC 19 ooRexx> "Père Noël Père Noël" ~caselessPos("L", 14)= -- 19 NFD, NFC (was 27 before automatic conversion of string literals to text) 19 ooRexx> "Père Noël Père Noël"~text~caselessPos("L", 10, 9)= -- 0 NFD, NFC 0 ooRexx> "Père Noël Père Noël" 
~caselessPos("L", 14, 13)= -- 19 NFD, NFC (was 0 before automatic conversion of string literals to text) 19 ooRexx> "Père Noël Père Noël"~text~caselessPos("L", 10, 10)= -- 19 NFD, NFC 19 ooRexx> "Père Noël Père Noël" ~caselessPos("L", 14, 14)= -- 19 NFD, NFC (was 27 before automatic conversion of string literals to text) 19 ooRexx> --- ooRexx> "Père Noël Père Noël"~text~caselessPos("OË")= -- 7 NFC, NFC 7 ooRexx> "Père Noël Père Noël" ~caselessPos("OË")= -- 7 NFC, NFC (was "yes, 0, not 8 because "OË"~lower=='oË'" before automatic conversion of string literals to text) 7 ooRexx> "Père Noël Père Noël"~text~caselessPos("OË", , 7)= -- 0 NFC, NFC 0 ooRexx> "Père Noël Père Noël" ~caselessPos("OË", , 9)= -- 7 NFC, NFC (was 0 before automatic conversion of string literals to text) 7 ooRexx> "Père Noël Père Noël"~text~caselessPos("OË", , 8)= -- 7 NFC, NFC 7 ooRexx> "Père Noël Père Noël" ~caselessPos("OË", , 10)= -- 7 NFC, NFC (was "yes, 0, not 8 because "OË"~lower=='oË'" before automatic conversion of string literals to text) 7 ooRexx> "Père Noël Père Noël"~text~caselessPos("OË", 8)= -- 17 NFC, NFC 17 ooRexx> "Père Noël Père Noël" ~caselessPos("OË", 9)= -- 17 NFC, NFC (was "yes, 0, not 20 because "OË"~lower=='oË'" before automatic conversion of string literals to text) 17 ooRexx> "Père Noël Père Noël"~text~caselessPos("OË", 8, 10)= -- 0 NFC, NFC 0 ooRexx> "Père Noël Père Noël" ~caselessPos("OË", 9, 13)= -- 17 NFC, NFC (was 0 before automatic conversion of string literals to text) 17 ooRexx> "Père Noël Père Noël"~text~caselessPos("OË", 8, 11)= -- 17 NFC, NFC 17 ooRexx> "Père Noël Père Noël" ~caselessPos("OË", 9, 14)= -- 17 NFC, NFC (was "yes, 0, not 20 because "OË"~lower=='oË'" before automatic conversion of string literals to text) 17 ooRexx> --- ooRexx> "Père Noël Père Noël"~text~caselessPos("OË")= -- 7 NFD, NFC 7 ooRexx> "Père Noël Père Noël" ~caselessPos("OË")= -- 7 NFD, NFC (was "always 0, no need to test all the combinations" before automatic conversion of string 
literals to text) 7 ooRexx> "Père Noël Père Noël"~text~caselessPos("OË", , 7)= -- 0 NFD, NFC 0 ooRexx> "Père Noël Père Noël"~text~caselessPos("OË", , 8)= -- 7 NFD, NFC 7 ooRexx> "Père Noël Père Noël"~text~caselessPos("OË", 8)= -- 17 NFD, NFC 17 ooRexx> "Père Noël Père Noël"~text~caselessPos("OË", 8, 10)= -- 0 NFD, NFC 0 ooRexx> "Père Noël Père Noël"~text~caselessPos("OË", 8, 11)= -- 17 NFD, NFC 17 ooRexx> --- ooRexx> "Père Noël Père Noël"~text~caselessPos("OË")= -- 7 NFC, NFD 7 ooRexx> "Père Noël Père Noël" ~caselessPos("OË")= -- 7 NFC, NFD (was "always 0, no need to test all the combinations" before automatic conversion of string literals to text) 7 ooRexx> "Père Noël Père Noël"~text~caselessPos("OË", , 7)= -- 0 NFC, NFD 0 ooRexx> "Père Noël Père Noël"~text~caselessPos("OË", , 8)= -- 7 NFC, NFD 7 ooRexx> "Père Noël Père Noël"~text~caselessPos("OË", 8)= -- 17 NFC, NFD 17 ooRexx> "Père Noël Père Noël"~text~caselessPos("OË", 8, 10)= -- 0 NFC, NFD 0 ooRexx> "Père Noël Père Noël"~text~caselessPos("OË", 8, 11)= -- 17 NFC, NFD 17 ooRexx> --- ooRexx> "Père Noël Père Noël"~text~caselessPos("OË")= -- 7 NFD, NFD 7 ooRexx> "Père Noël Père Noël" ~caselessPos("OË")= -- 7 NFD, NFD (was "yes, 9 (it works...) 
because the NFD representation isolate the accent: "oë"~c2x=='6F65CC88', "OË"~lower~c2x=='6F65CC88'" before automatic conversion of string literals to text) 7 ooRexx> "Père Noël Père Noël"~text~caselessPos("OË", , 7)= -- 0 NFD, NFD 0 ooRexx> "Père Noël Père Noël" ~caselessPos("OË", , 11)= -- 7 NFD, NFD (was 0 before automatic conversion of string literals to text) 7 ooRexx> "Père Noël Père Noël"~text~caselessPos("OË", , 8)= -- 7 NFD, NFD 7 ooRexx> "Père Noël Père Noël" ~caselessPos("OË", , 12)= -- 7 NFD, NFD (was "yes, 9 (it works thanks to the NFD), see previous comment" before automatic conversion of string literals to text) 7 ooRexx> "Père Noël Père Noël"~text~caselessPos("OË", 8)= -- 17 NFD, NFD 17 ooRexx> "Père Noël Père Noël" ~caselessPos("OË", 10)= -- 17 NFD, NFD (was "yes, 23 (it works thanks to the NFD), see previous comment" before automatic conversion of string literals to text) 17 ooRexx> "Père Noël Père Noël"~text~caselessPos("OË", 8, 10)= -- 0 NFD, NFD 0 ooRexx> "Père Noël Père Noël" ~caselessPos("OË", 10, 16)= -- 17 NFD, NFD (was 0 before automatic conversion of string literals to text) 17 ooRexx> "Père Noël Père Noël"~text~caselessPos("OË", 8, 11)= -- 17 NFD, NFD 17 ooRexx> "Père Noël Père Noël" ~caselessPos("OË", 10, 17)= -- 17 NFD, NFD (was "yes, 23 (it works thanks to the NFD), see previous comment" before automatic conversion of string literals to text) 17 ooRexx> --- ooRexx> "Père Noël Père Noël"~text~caselessPos("OE")= -- 0 NFC, NFC always 0, no need to test all the combinations 0 ooRexx> "Père Noël Père Noël"~text~caselessPos("OE", stripMark:)= -- 7 NFC, NFC 7 ooRexx> "Père Noël Père Noël"~text~caselessPos("OE", , 7, stripMark:)= -- 0 NFC, NFC 0 ooRexx> "Père Noël Père Noël"~text~caselessPos("OE", , 8, stripMark:)= -- 7 NFC, NFC 7 ooRexx> "Père Noël Père Noël"~text~caselessPos("OE", 8, stripMark:)= -- 17 NFC, NFC 17 ooRexx> "Père Noël Père Noël"~text~caselessPos("OE", 8, 10, stripMark:)=-- 0 NFC, NFC 0 ooRexx> "Père Noël Père 
Noël"~text~caselessPos("OE", 8, 11, stripMark:)=-- 17 NFC, NFC 17 -- =============================================================================== -- 2023 Aug 07 -- Add conversion from a Unicode encoding to a Byte encoding. ooRexx> "Père Noël"~text~transcodeTo("cp437")~c2x= -- '50 8A 72 65 20 4E 6F 89 6C' '50 8A 72 65 20 4E 6F 89 6C' ooRexx> '50 8A 72 65 20 4E 6F 89 6C'x~text("cp437")~utf8~c2x= -- '50 C3A8 72 65 20 4E 6F C3AB 6C' '50 C3A8 72 65 20 4E 6F C3AB 6C' ooRexx> '50 8A 72 65 20 4E 6F 89 6C'x~text("cp437")~transcodeTo("utf8")~c2x= -- '50 C3A8 72 65 20 4E 6F C3AB 6C' '50 C3A8 72 65 20 4E 6F C3AB 6C' -- The replacementCharacter "FF"x is interpreted as a UTF-8 string (default encoding). "FF"x~text~c2u= -- 'U+FFFD' -- Was: Hence the error "The replacement character UTF-8 not-ASCII '[FF]' cannot be transcoded to ISO-8859-1." -- Now: Invalid UTF-8 string (since automatic conversion of string literals to text) -- Now: Direct transcoding from 'Byte' to 'ISO-8859-1' is not supported (since the systematic absorption of The Byte_Encoding) -- TODO: test case to get the previous error message '...cannot be transcoded...' ooRexx> text = "Père Noël 🎅 10€"~text; do encoding over .Byte_Encoding~subclasses~~append(.Byte_Encoding); say encoding~name~left(13)":" text~transcodeTo(encoding, replacementCharacter:"FF"x~byte)~c2x; end Direct transcoding from 'Byte' to 'ISO-8859-1' is not supported. 
Error code= 23.900 -- Here, the replacementCharacter is interpreted as a byte string encoded in the target encoding ooRexx> text = "Père Noël 🎅 10€"~text; do encoding over .Byte_Encoding~subclasses~~append(.Byte_Encoding); say encoding~name~left(13)":" text~transcodeTo(encoding, replacementCharacter:"FF"x~text(encoding))~c2x; end ISO-8859-1 : 50 E8 72 65 20 4E 6F EB 6C 20 FF 20 31 30 FF ibm-1252 : 50 E8 72 65 20 4E 6F EB 6C 20 FF 20 31 30 FF windows-1252 : 50 E8 72 65 20 4E 6F EB 6C 20 FF 20 31 30 80 IBM437 : 50 8A 72 65 20 4E 6F 89 6C 20 FF 20 31 30 FF Byte : 50 FF 72 65 20 4E 6F FF 6C 20 FF 20 31 30 FF -- =============================================================================== -- 2023 Aug 04 --- Following expressions return the same result correctly tagged 'ISO-8859-1' ooRexx> b = .MutableBuffer~new; "Pere"~text("windows-1252")~append(" "~text("windows-1252"), buffer:b)~appendEncoded("Noël"~text("iso-8859-1"), buffer:b)=; result~description= M'Pere Noël' 'ISO-8859-1 not-ASCII (10 bytes)' ooRexx> b = .MutableBuffer~new; "Pere"~text("windows-1252")~appendEncoded(" "~text("windows-1252"), buffer:b)~appendEncoded("Noël"~text("iso-8859-1"), buffer:b)=; result~description= M'Pere Noël' 'ISO-8859-1 not-ASCII (10 bytes)' ooRexx> b = .MutableBuffer~new; b~appendEncoded("Pere"~text("windows-1252"), " "~text("windows-1252"), "Noël"~text("iso-8859-1"))=; result~description= M'Pere Noël' 'ISO-8859-1 not-ASCII (10 bytes)' -- Following expressions (not using 'appendEncoded') return the same result as above, but wrongly tagged 'windows-1252' or 'UTF-8' ooRexx> b = .MutableBuffer~new; "Pere"~text("windows-1252")~append(" "~text("windows-1252"), buffer:b)~append("Noël"~text("iso-8859-1"), buffer:b)=; result~description= M'Pere Noël' 'windows-1252 not-ASCII (10 bytes)' ooRexx> b = .MutableBuffer~new; b~append("Pere"~text("windows-1252"), " "~text("windows-1252"), "Noël"~text("iso-8859-1"))=; result~description= M'Pere Noël' 'UTF-8 not-ASCII by default (10 bytes)' -- 
=============================================================================== -- 2023 Jun 28 -- Bitkey is now 2 bytes (4 hex digits) always. -- For debug, give temporarily access to the flags stored on an indexer. ooRexx> "Père Noël"~text~nfc(casefold:, stripMark:)~indexer~flags= a Directory (10 items) 'FLAG_CASEFOLD' : 1 'FLAG_LUMP' : -1 'FLAG_NFC' : 1 'FLAG_NFD' : -1 'FLAG_NFKC' : -1 'FLAG_NFKD' : -1 'FLAG_STRIP_CC' : -1 'FLAG_STRIP_IGNORABLE' : -1 'FLAG_STRIP_MARK' : 1 'FLAG_STRIP_NA' : -1 -- =============================================================================== -- 2023 May 31 -- Add support for functional methods to RexxText. -- Example inspired by https://elixir-lang.org/ -- Frequency of each character, ignoring the accents: ooRexx> "Notre père Noël 🎅"~text~transform(stripMark:)~reduce(by: "characters", initial: .stem~new~~put(0)){accu[item~string] += 1}= a Stem (9 items) '🎅' : 1 ' ' : 3 'e' : 4 'l' : 1 'N' : 2 'o' : 2 'p' : 1 'r' : 2 't' : 1 -- Add support for generator methods to RexxText. 
ooRexx> g="Noël 🎅"~text~generateC ooRexx> g~()= -- T'N' T'N' ooRexx> g~()= -- T'o' T'o' ooRexx> g~()= -- T'ë' T'ë' ooRexx> g~()= -- T'l' T'l' ooRexx> g~()= -- T' ' T' ' ooRexx> g~()= -- T'🎅' T'🎅' ooRexx> g~()= -- [no result] [no result] -- =============================================================================== -- 2023 May 29 -- For convenience, additional way to search a character: -- with a routine ooRexx> .UnicodeCharacter("bed")= -- ( "🛏" U+1F6CF So 1 "BED" ) ( "🛏" U+1F6CF So 1 "BED" ) ooRexx> .UnicodeCharacter("bed", hexadecimal:)= -- ( "௭" U+0BED Nd 1 "TAMIL DIGIT SEVEN" ) ( "௭" U+0BED Nd 1 "TAMIL DIGIT SEVEN" ) -- with the operator [] ooRexx> .UnicodeCharacter["bed"]= -- ( "🛏" U+1F6CF So 1 "BED" ) ( "🛏" U+1F6CF So 1 "BED" ) ooRexx> .UnicodeCharacter["bed", hexadecimal:]= -- ( "௭" U+0BED Nd 1 "TAMIL DIGIT SEVEN" ) ( "௭" U+0BED Nd 1 "TAMIL DIGIT SEVEN" ) -- This comes in complement of: ooRexx> .Unicode["bed", hexadecimal:]= -- ( "௭" U+0BED Nd 1 "TAMIL DIGIT SEVEN" ) ( "௭" U+0BED Nd 1 "TAMIL DIGIT SEVEN" ) ooRexx> .Unicode~character("bed", hexadecimal:)= -- ( "௭" U+0BED Nd 1 "TAMIL DIGIT SEVEN" ) ( "௭" U+0BED Nd 1 "TAMIL DIGIT SEVEN" ) -- New method UnicodeCharacter~properties at class level: return a list of property names. 
ooRexx> .UnicodeCharacter~properties= ['aliases','bidiClass','bidiClassName','bidiMirrored','boundClass','boundClassName','category','categoryName','charWidth','codepoint','combiningClass','controlBoundary','decompositionTypeName','decompositionType','ignorable','isLower','isUpper','name','toLowerFull','toLowerSimple','toTitleFull','toTitleSimple','toUpperFull','toUpperSimple','Unicode','UTF16BE','UTF16LE','UTF32BE','UTF32LE','UTF8'] -- =============================================================================== -- 2023 May 24 -- For convenience, it's now possible to search directly a character if it's made of one codepoint only: ooRexx> .Unicode~character("a")= -- ( "a" U+0061 Ll 1 "LATIN SMALL LETTER A" ) ( "a" U+0061 Ll 1 "LATIN SMALL LETTER A" ) ooRexx> .Unicode~character("à")= -- ( "à" U+00E0 Ll 1 "LATIN SMALL LETTER A WITH GRAVE" ) ( "à" U+00E0 Ll 1 "LATIN SMALL LETTER A WITH GRAVE" ) ooRexx> .Unicode~character("à")= -- Error: The character 'à' is made of several codepoints: U+0061 U+0300 The character 'à' is made of several codepoints: U+0061 U+0300. Error code= 93.900 -- For the last example, you can get an array of characters: ooRexx> "à"~text~UnicodeCharacters== an Array (shape [2], 2 items) 1 : ( "a" U+0061 Ll 1 "LATIN SMALL LETTER A" ) 2 : ( "̀" U+0300 Mn 0 "COMBINING GRAVE ACCENT" ) -- New method UnicodeCharacter~properties at instance level: Return a directory of properties. 
ooRexx> .Unicode~character("U+000D")~properties= a Directory (30 items) 'aliases' : [(CARRIAGE RETURN),(CR)] 'bidiClass' : 16 'bidiClassName' : 'B' 'bidiMirrored' : 0 'boundClass' : 2 'boundClassName' : 'CR' 'category' : 26 'categoryName' : 'Cc' 'charWidth' : 0 'codepoint' : 'U+000D' 'combiningClass' : 0 'controlBoundary' : 1 'decompositionType' : 0 'decompositionTypeName' : 'None' 'ignorable' : 0 'isLower' : 0 'isUpper' : 0 'name' : '' 'toLowerFull' : 'U+000D' 'toLowerSimple' : 'U+000D' 'toTitleFull' : 'U+000D' 'toTitleSimple' : 'U+000D' 'toUpperFull' : 'U+000D' 'toUpperSimple' : 'U+000D' 'Unicode' : '0x0D' 'UTF16BE' : '0x000D' 'UTF16LE' : '0x0D00' 'UTF32BE' : '0x0000000D' 'UTF32LE' : '0x0D000000' 'UTF8' : '0x0D' -- =============================================================================== -- 2023 March 20 -- Rework implementation of caselessMatch to support correctly ooRexx> "Bundesstraße im Freiland"~text~caselessMatch(14, "im")= -- .true 1 -- =============================================================================== -- 2023 March 08 -- Implementation of caselessMatchChar, matchChar ooRexx> "Noëlle"~text~matchChar(2, "aeiouy")= -- 1 1 ooRexx> "Noëlle"~text~matchChar(3, "aeiouy")= -- 0 0 ooRexx> "Noëlle"~text~matchChar(3, "aeëiouy")= -- 1 include the accents in the list of accepted characters 1 ooRexx> "Noëlle"~text~matchChar(3, "aeiouy", stripMark:)= -- 1 or remove the accents from the tested string 1 ooRexx> "Noëlle"~text~matchChar(6, "aeiouy")= -- 1 1 ooRexx> "Bundesschnellstraße"~text~matchChar(14, "s")= -- 1 1 ooRexx> "Bundesschnellstraße"~text~matchChar(18, "s")= -- 0 0 ooRexx> "Bundesschnellstraße"~text~matchChar(18, "sß")= -- 1 1 ooRexx> "Bundesschnellstraße"~text~caselessMatchChar(18, "s")= -- 0, was 1 before 2023.12.04 "ß" becomes "ss" which is 2 graphemes. The first grapheme at 18 matches "s" 0 ooRexx> "Bundesschnellstraße"~text~caselessMatchChar(19, "s")= -- 0 "ß" becomes "ss" which is 2 graphemes. 
The grapheme at 19 is "e", not the second "s" 0 ooRexx> "Bundesschnellstraße"~text~caselessMatchChar(19, "e")= -- 1 "ß" becomes "ss" which is 2 graphemes. The grapheme at 19 is "e", not the second "s" 1 -- The ligature disappears in NFK[CD] but not in NF[CD] ooRexx> "baﬄe"~text~NFKC= -- T'baffle' T'baffle' ooRexx> "baﬄe"~text~NFKD= -- T'baffle' T'baffle' ooRexx> "baﬄe"~text~matchChar(3, "f")= -- 0 "ﬄ" is ONE grapheme because NFC 0 ooRexx> "baﬄe"~text~matchChar(3, "ﬄ")= -- 1 "ﬄ" is ONE grapheme because NFC 1 ooRexx> "baﬄe"~text~matchChar(3, "ﬄ", normalization:.Unicode~NFKD)= -- 1 "ﬄ" becomes "ffl" (3 graphemes). There is a match because the first grapheme is "f" 1 ooRexx> "baﬄe"~text~matchChar(3, "f", normalization:.Unicode~NFKD)= -- 0, was 1 before 2023.12.04 "ﬄ" becomes "ffl" (3 graphemes). There is a match because the first grapheme is "f" 0 ooRexx> "baﬄe"~text~matchChar(4, "f", normalization:.Unicode~NFKD)= -- 0 "ﬄ" becomes "ffl" (3 graphemes). The grapheme at 4 is "e", not the second "f" 0 ooRexx> "baﬄe"~text~matchChar(4, "e", normalization:.Unicode~NFKD)= -- 1 "ﬄ" becomes "ffl" (3 graphemes). The grapheme at 4 is "e", not the second "f" 1 -- The ligature disappears when casefolded ooRexx> "baﬄe"~text~casefold= -- T'baffle' T'baffle' ooRexx> "BAFFLE"~text~caselessMatchChar(3, "ﬄ")= -- 0, was 1 before 2023.12.04 "ﬄ" becomes "ffl" (3 graphemes), there is a match on "f" at 3 0 ooRexx> "BAFFLE"~text~caselessMatchChar(5, "ﬄ")= -- 0, was 1 before 2023.12.04 "ﬄ" becomes "ffl" (3 graphemes), there is a match on "l" at 5 0 ooRexx> "BAFFLE"~text~caselessMatchChar(5, "L")= -- 1 there is a match on "l" at 5 (forward to String) 1 -- Implementation of caselessEquals, equals ooRexx> "ŒUF"~text~caselessEquals("œuf")= -- 1 1 ooRexx> "œuf"~text~caselessEquals("ŒUF")= -- 1 1 ooRexx> "Straße"~text~caselessEquals("strasse")= -- 1 1 ooRexx> "strasse"~text~caselessEquals("Straße")= -- 1 1 -- Some ligatures are not decomposed by NFKC.
ooRexx> "ŒUF"~text~caselessEquals("oeuf")= -- 0 0 ooRexx> "ŒUF"~text~caselessEquals("oeuf", normalization:.Unicode~NFKC)= -- 0 0 -- =============================================================================== -- 2022 November 20 /* For consistency, all the conversion methods accept the named argument 'strict', even if it's not needed for the unicode encodings. Previously, was supported only for the byte encodings. The default value of 'strict' is now .false. The conversion methods accept the named argument 'memorize(3)'. Its default value is given by .unicode~memorizeTranscodings (was memorizeConversions) which is .false by default. Example: s = "hello" t = s~text utf16 = t~utf16(memorize:) utf32 = t~utf32(memorize:) t~utf16~"==":.object(utf16)= -- 1 t~utf32~"==":.object(utf32)= -- 1 */ /* CP1252 to UTF-8, UTF-16, UTF-32 "Un œuf de chez MaPoule™ coûte ±0.40€" */ ooRexx> str_cp1252 = "Un " || "9C"x || "uf de chez MaPoule" || "99"x || " co" || "FB"x || "te " || "B1"x || "0.40" || "80"x ooRexx> txt_cp1252 = str_cp1252~text("cp1252") ooRexx> utf8 = txt_cp1252~utf8(memorize:) ooRexx> utf16 = txt_cp1252~utf16(memorize:) ooRexx> utf32 = txt_cp1252~utf32(memorize:) ooRexx> txt_cp1252~utf8 ~"==":.object(utf8) = -- 1 1 ooRexx> txt_cp1252~utf16~"==":.object(utf16)= -- 1 1 ooRexx> txt_cp1252~utf32~"==":.object(utf32)= -- 1 1 /* When an optional buffer is passed, must check that its encoding is compatible. Done for the conversion methods. Example: */ ooRexx> b = .mutablebuffer~new -- No encoding yet ooRexx> "hello"~text~utf16(buffer:b) -- now the buffer's encoding is UTF-16 ooRexx> "bye"~text~utf8(buffer:b) -- Encoding: cannot append UTF-8 to UTF-16BE '[00]h[00]e[00]l[00]l[00]o'. Encoding: cannot append UTF-8 to UTF-16BE '[00]h[00]e[00]l[00]l[00]o'. 
Error code= 23.900 -- =============================================================================== -- 2022 November 08 /* Additional arguments are supported by NFC, NFD, NFKC, NFKD, Casefold: lump Lumps certain different codepoints together. All the concerned characters become the same character, but still remain distinct characters. E.g. HYPHEN U+2010 and MINUS U+2212 to ASCII "-" all space characters (general category Zs) to U+0020 stripIgnorable Strips the characters whose property Default_Ignorable_Code_Point = true such as SOFT-HYPHEN or ZERO-WIDTH-SPACE stripCC Strips and/or converts control characters: characters 00-1F and 7F-9F, except 09 which is replaced by 20. stripMark Strips all character markings: characters whose category is Mc Me Mn (i.e. accents) Mc Spacing Mark Me Enclosing Mark Mn Nonspacing Mark This option works only with normalization. stripNA Strips the characters whose category is Cn Unassigned Note that the value gc=Cn does not actually occur in UnicodeData.txt, because that data file does not list unassigned code points. Remark: the normalization NFKC_Casefold (short alias NFKC_CF) is done with ~NFKC(Casefold: .true, stripIgnorable: .true) */ /* Two RexxText values are considered equal if their extended grapheme clusters are canonically equivalent. This is the definition of Swift. Q&A https://lists.isocpp.org/sg16/2018/08/0121.php TODO: confirm that it's NFC, and only that. The definition of canonical equivalence by the Unicode standard seems not limited to NFC. https://unicode.org/notes/tn5/ The strict comparison operators now use the NFC normalization (update: use .Unicode~defaultNormalization(strict:.true)). After normalization, they delegate to the String's strict comparison operators. 
The non-strict comparison operators now use the NFC normalization (update: use .Unicode~defaultNormalization(strict:.false)) plus stripIgnorable:.true lump:.true After normalization + transformations, they delegate to the String's non-strict comparison operators. Thanks to the lump transformation, all the Unicode spaces are supported. Examples: */ ooRexx> textNFC = "Noël"~text~NFC ooRexx> textNFC~UnicodeCharacters== an Array (shape [4], 4 items) 1 : ( "N" U+004E Lu 1 "LATIN CAPITAL LETTER N" ) 2 : ( "o" U+006F Ll 1 "LATIN SMALL LETTER O" ) 3 : ( "ë" U+00EB Ll 1 "LATIN SMALL LETTER E WITH DIAERESIS" ) 4 : ( "l" U+006C Ll 1 "LATIN SMALL LETTER L" ) ooRexx> textNFD="Noël"~text~NFD ooRexx> textNFD~UnicodeCharacters== an Array (shape [5], 5 items) 1 : ( "N" U+004E Lu 1 "LATIN CAPITAL LETTER N" ) 2 : ( "o" U+006F Ll 1 "LATIN SMALL LETTER O" ) 3 : ( "e" U+0065 Ll 1 "LATIN SMALL LETTER E" ) 4 : ( "̈" U+0308 Mn 0 "COMBINING DIAERESIS" ) 5 : ( "l" U+006C Ll 1 "LATIN SMALL LETTER L" ) ooRexx> (textNFC == textNFD)= -- 1 1 ooRexx> (textNFC = textNFD)= -- 1 1 ooRexx> (" "textNFC == textNFD" ")= -- 0 because strict 0 ooRexx> (" "textNFC = textNFD" ")= -- 1 1 ooRexx> (" "textNFC = (textNFD"\u{NBSP}")~unescape)= -- 1 1 ooRexx> (" "textNFC = (textNFD"\u{ZWSP}")~unescape)= -- 1 1 ooRexx> ("-"textNFC = ("\u{OBLIQUE HYPHEN}"textNFD"\u{ZWSP}")~unescape)= -- 1 1 ooRexx> "pere noel"~text~caselessCompareTo("Père Noël")= -- -1 (lesser) -1 ooRexx> "pere noel"~text~caselessCompareTo("Père Noël", stripMark:.true)= -- 0 (equal because the accents are ignored) 0 -- Add support for ISO-8859-1 encoding (alias Latin1). -- Example: -- all the supported characters: ranges 20-7E and A0-FF ooRexx> text = xrange("20"x, "7E"x, "A0"x, "FF"x)~text("ISO-8859-1") -- The ? are just ISO-8859-1 encoded characters that can't be displayed as-is in a console UTF-8 (copy-paste of the console output) -- After conversion to UTF-8, all is good. 
ooRexx> text= -- T' !"#$%&''()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~???????????????????????????????????????????????????????????????????????????????????????????????[FF]' T' !"#$%&''()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~�����������������������������������������������������������������������������������������������[FF]' ooRexx> text~utf8= -- T' !"#$%&''()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~ ¡¢£¤¥¦§¨©ª«¬®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ' T' !"#$%&''()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~ ¡¢£¤¥¦§¨©ª«¬®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ' -- ranges 00-1F and 7F-9F are undefined -- an error is triggered even with the option strict: .false, because there is no fallback mapping ooRexx> text = xrange("20"x, "FF"x)~text("ISO-8859-1") ooRexx> text~utf8(strict: .false)= -- Error ISO-8859-1 encoding: cannot convert ISO-8859-1 not-ASCII character 127 (7F) at byte-position 96 to UTF-8. T' !"#$%&''()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~ ¡¢£¤¥¦§¨©ª«¬®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ' -- =============================================================================== -- 2022 November 06 /* Refactoring Prefix the native methods by the library name (utf8proc_, ziglyph_ or icu4x_). That will make more easy the comparison of similar services. Remove the native methods 'NFC', 'NFD', 'NFKC', 'NFKD' and 'NFKC_Casefold': all replaced by 'utf8proc_transform'. ~Casefold is now limited to case fold. Previously, NKFC + case fold was applied (because the method NFKC_Casefold of utf8proc was called). NFC, NFD, NFKC and NFKD now supports the named argument 'casefold' (default = .false). 
Examples */ ooRexx> "Père Noël ß ㎒"~text~casefold= -- T'père noël ss ㎒' T'père noël ss ㎒' ooRexx> "Père Noël ß ㎒"~text~NFKC= -- T'Père Noël ß MHz' T'Père Noël ß MHz' ooRexx> "Père Noël ß ㎒"~text~NFKC(casefold:.true)= -- T'père noël ss mhz' T'père noël ss mhz' /* Performance NFC, NFD, NFKC, NFKD and Casefold now support the named argument 'returnString'. - When true, the returned value is a String. - When false (default), the returned value is a RexxText. Maybe this optimization will be replaced by a more general optimization: RexxText indexing on demand. 2 cached values are managed in case of memorization: - one for the main transformation, - one for the main transformation + case fold. That makes 9 possible cached values and 5 indicators per indexer (so per string). isCasefold CasefoldString isNFC NFCString NFCCasefoldString isNFD NFDString NFDCasefoldString isNFKC NFKCString NFKCCasefoldString isNFKD NFKDString NFKDCasefoldString The memorization can be activated globally: .Unicode~memorizeTransformations = .true Examples */ -- Direct access to utf8proc, returns a string ooRexx> s = "Père Noël ß ㎒"; do 10000; .Unicode~utf8proc_transform(s, normalization:3, casefold:.true); end -- Duration: 0.05 --- ooRexx> t = "Père Noël ß ㎒"~text; do 10000; t~NFKC(casefold:.true); end -- Duration: 7.70 ooRexx> t = "Père Noël ß ㎒"~text; do 10000; t~NFKC(casefold:.true, returnString:.true); end -- Duration: 0.33 ooRexx> t = "Père Noël ß ㎒"~text; do 10000; t~NFKC(casefold:.true, returnString:.true, memorize:.true); end -- Duration: 0.11 -- The cache for NFKC + casefold is different from the cache for NFKC only: ooRexx> t = "Père Noël ß ㎒"~text; do 10000; t~NFKC; end -- Duration: 6.50 ooRexx> t = "Père Noël ß ㎒"~text; do 10000; t~NFKC(returnString:.true); end -- Duration: 0.30 ooRexx> t = "Père Noël ß ㎒"~text; do 10000; t~NFKC(returnString:.true, memorize:.true); end -- Duration: 0.10 -- =============================================================================== -- 2022 November
05 /* New methods on RexxText caselessContains (not ready: posText) caselessCompareTo caselessMatch caselessMatchChar (not ready: matchCharText) caselessEndsWith (not ready: endsWithText) caselessPos (not ready: posText) caselessStartsWith (not ready: posText) compareTo contains (not ready: posText) endsWith (not ready: endsWithText) match matchChar (not ready: matchCharText) pos (not ready: posText) startsWith (not ready: posText) For caseless, apply NFC Casefold to all the text/string arguments. Compared to the ooRexx methods, the purpose of these methods is to convert the grapheme indexes to/from byte indexes. The real work is done by the ooRexx methods, called with the right byte indexes. From a byte index returned by an ooRexx method, a grapheme index is derived. Examples: */ ooRexx> -- 1 2 3 4 5 6 7 8 9 10 (grapheme indexes) ooRexx> -- 1 2 3 4 5 6 7 8 9 10 11 (byte indexes) ooRexx> "père Noël"~text~c2x= -- '70 C3A8 72 65 20 4E 6F C3AB 6C' '70 C3A8 72 65 20 4E 6F C3AB 6C' ooRexx> -- p è r e N o ë l ooRexx> "père Noël"~match(1, "Noël")= -- .false (byte indexes) 0 ooRexx> "père Noël"~text~match(1, "Noël")= -- .false (grapheme indexes) 0 ooRexx> "père Noël"~match(7, "Noël")= -- .false (was ".true (byte indexes)" before automatic conversion of string literals to text) 0 ooRexx> "père Noël"~text~match(6, "Noël")= -- .true (grapheme indexes) 1 ooRexx> "père Noël"~match(11, "Noël", 5)= -- Invalid position argument specified; found "11" (was ".true (byte indexes)" before automatic conversion of string literals to text) Invalid position argument specified; found "11". 
Error code= 93.924 ooRexx> "père Noël"~text~match(9, "Noël", 4)= -- .true (grapheme indexes) 1 ooRexx> "père Noël"~text~caselessMatch(1, "NOËL")= -- .false 0 ooRexx> "père Noël"~text~caselessMatch(6, "NOËL")= -- .true 1 ooRexx> -- the first "äXü" is NFC, the second "äẌü" is NFD ooRexx> nfcString = "äXü" ooRexx> nfcText = nfcString~text ooRexx> nfcText~c2x= -- 'C3A4 58 C3BC' 'C3A4 58 C3BC' ooRexx> nfcText~UnicodeCharacters== an Array (shape [3], 3 items) 1 : ( "ä" U+00E4 Ll 1 "LATIN SMALL LETTER A WITH DIAERESIS" ) 2 : ( "X" U+0058 Lu 1 "LATIN CAPITAL LETTER X" ) 3 : ( "ü" U+00FC Ll 1 "LATIN SMALL LETTER U WITH DIAERESIS" ) ooRexx> nfdString = "äXü" ooRexx> nfdText = nfdString~text ooRexx> nfdText~c2x= -- '61 CC88 58 75 CC88' '61 CC88 58 75 CC88' ooRexx> nfdText~UnicodeCharacters== an Array (shape [5], 5 items) 1 : ( "a" U+0061 Ll 1 "LATIN SMALL LETTER A" ) 2 : ( "̈" U+0308 Mn 0 "COMBINING DIAERESIS" ) 3 : ( "X" U+0058 Lu 1 "LATIN CAPITAL LETTER X" ) 4 : ( "u" U+0075 Ll 1 "LATIN SMALL LETTER U" ) 5 : ( "̈" U+0308 Mn 0 "COMBINING DIAERESIS" ) ooRexx> nfcString~match(1, nfdString)= -- 1 (was "0 (because binary representation is different)" before automatic conversion of string literals to text) 1 ooRexx> nfcText ~match(1, nfdText)= -- 1 1 ooRexx> nfdText ~match(1, nfcText)= -- 1 1 ooRexx> -- match with "X" ooRexx> nfcString~match(3, nfdString, 4, 1)= -- Invalid position argument specified; found "4" (was "1 (byte indexes)" before automatic conversion of string literals to text) Invalid position argument specified; found "4". Error code= 93.924 ooRexx> nfcText ~match(2, nfdText, 2, 1)= -- 1 (grapheme indexes) 1 ooRexx> nfdString~match(4, nfcString, 3, 1)= -- Invalid position argument specified; found "4" (was "1 (byte indexes)" before automatic conversion of string literals to text) Invalid position argument specified; found "4". 
Error code= 93.924 ooRexx> nfdText ~match(2, nfcText, 2, 2)= -- 1 (grapheme indexes) 1 -- =============================================================================== -- 2022 October 15 /* New native method .Unicode~transform Mainly for internal use, will replace the current native methods NFC, NFD, NFKC, NFKD. The purpose of this method is to support additional transformations provided by utf8proc. Takes a byte string as input (UTF-8 encoded), returns a new transformed byte string as output (UTF-8). Examples: */ ooRexx> string = "\u{BEL}Le\u{IDEOGRAPHIC SPACE}\u{OGHAM SPACE MARK}\u{ZERO-WIDTH-SPACE}Père\t\u{HYPHEN}\u{SOFT-HYPHEN}\u{EN DASH}\u{EM DASH}Noël\x{EFB790}\r\n" ooRexx> text = string~text~unescape ooRexx> text~UnicodeCharacters== an Array (shape [22], 22 items) 1 : ( "" U+0007 Cc 0 "", "ALERT", "BEL" ) 2 : ( "L" U+004C Lu 1 "LATIN CAPITAL LETTER L" ) 3 : ( "e" U+0065 Ll 1 "LATIN SMALL LETTER E" ) 4 : ( " " U+3000 Zs 2 "IDEOGRAPHIC SPACE" ) 5 : ( " " U+1680 Zs 1 "OGHAM SPACE MARK" ) 6 : ( "" U+200B Cf 0 "ZERO WIDTH SPACE", "ZWSP" ) 7 : ( "P" U+0050 Lu 1 "LATIN CAPITAL LETTER P" ) 8 : ( "è" U+00E8 Ll 1 "LATIN SMALL LETTER E WITH GRAVE" ) 9 : ( "r" U+0072 Ll 1 "LATIN SMALL LETTER R" ) 10 : ( "e" U+0065 Ll 1 "LATIN SMALL LETTER E" ) 11 : ( "" U+0009 Cc 0 "", "CHARACTER TABULATION", "HORIZONTAL TABULATION", "HT", "TAB" ) 12 : ( "‐" U+2010 Pd 1 "HYPHEN" ) 13 : ( "" U+00AD Cf 1 "SOFT HYPHEN", "SHY" ) 14 : ( "–" U+2013 Pd 1 "EN DASH" ) 15 : ( "—" U+2014 Pd 1 "EM DASH" ) 16 : ( "N" U+004E Lu 1 "LATIN CAPITAL LETTER N" ) 17 : ( "o" U+006F Ll 1 "LATIN SMALL LETTER O" ) 18 : ( "ë" U+00EB Ll 1 "LATIN SMALL LETTER E WITH DIAERESIS" ) 19 : ( "l" U+006C Ll 1 "LATIN SMALL LETTER L" ) 20 : ( "" U+FDD0 Cn 1 "" ) 21 : ( "" U+000D Cc 0 "", "CARRIAGE RETURN", "CR" ) 22 : ( "" U+000A Cc 0 "", "LINE FEED", "NEW LINE", "END OF LINE", "LF", "NL", "EOL" ) ooRexx> text= -- T'[07]Le Père[09]‐–—Noël[0D0A]' T'[07]Le Père[09]‐–—Noël[0D0A]' ooRexx> -- Performs unicode case folding, to 
be able to do a case-insensitive string comparison. ooRexx> .Unicode~utf8proc_transform(text~string, casefold:.true)= -- '[07]le père[09]‐–—noël[0D0A]' '[07]le père[09]‐–—noël[0D0A]' ooRexx> -- Strip "default ignorable characters" such as SOFT-HYPHEN or ZERO-WIDTH-SPACE ooRexx> .Unicode~utf8proc_transform(text~string, stripIgnorable:.true)= -- '[07]Le Père[09]‐–—Noël[0D0A]' '[07]Le Père[09]‐–—Noël[0D0A]' ooRexx> -- Lumps certain characters together. See lump.md for details: ooRexx> -- https://github.com/JuliaStrings/utf8proc/blob/master/lump.md ooRexx> -- E.g. HYPHEN U+2010 and MINUS U+2212 to ASCII "-" ooRexx> -- jlf: I was expecting to have only one space and one "-" but that's not the case ooRexx> -- Seems working as designed... All the concerned characters become the same character, but still remain distinct characters. ooRexx> .Unicode~utf8proc_transform(text~string, lump:.true)= -- '[07]Le Père[09]---Noël[0D0A]' '[07]Le Père[09]---Noël[0D0A]' ooRexx> -- NLF2LF: Convert LF, CRLF, CR and NEL into LF ooRexx> .Unicode~utf8proc_transform(text~string, NLF:1)= -- '[07]Le Père[09]‐–—Noël[0A]' '[07]Le Père[09]‐–—Noël[0A]' ooRexx> -- NLF2LS: Convert LF, CRLF, CR and NEL into LS (U+2028 Zl 0 "LINE SEPARATOR") ooRexx> .Unicode~utf8proc_transform(text~string, NLF:2)= -- '[07]Le Père[09]‐–—Noël' '[07]Le Père[09]‐–—Noël ' ooRexx> -- NLF2PS: convert LF, CRLF, CR and NEL into PS (U+2029 Zp 0 "PARAGRAPH SEPARATOR") ooRexx> .Unicode~utf8proc_transform(text~string, NLF:3)= -- '[07]Le Père[09]‐–—Noël ' '[07]Le Père[09]‐–—Noël ' ooRexx> -- Strips and/or converts control characters. ooRexx> .Unicode~utf8proc_transform(text~string, stripCC:.true)= -- 'Le Père ‐–—Noël ' 'Le Père ‐–—Noël ' ooRexx> -- Strips all character markings. ooRexx> -- This includes non-spacing, spacing and enclosing (i.e. accents). ooRexx> -- This option works only with normalization. 
ooRexx> .Unicode~utf8proc_transform(text~string, stripMark:.true, normalization:1)= -- '[07]Le Pere[09]‐–—Noel[0D0A]' '[07]Le Pere[09]‐–—Noel[0D0A]' ooRexx> -- Strips unassigned codepoints. ooRexx> .Unicode~utf8proc_transform(text~string, stripNA:.true)= -- '[07]Le Père[09]‐–—Noël[0D0A]' '[07]Le Père[09]‐–—Noël[0D0A]' ooRexx> -- Application of several options ooRexx> .Unicode~utf8proc_transform(text~string, casefold:.true, lump:.true, normalization:1, stripIgnorable:.true, stripCC:.true, stripMark:.true, stripNA:.true)= -- 'le pere ---noel ' 'le pere ---noel ' -- =============================================================================== -- 2022 September 14 /* New methods on RexxText center centre Examples: */ ooRexx> "noël👩👨👩👧🎅"~text~description= -- 'UTF-8 not-ASCII (6 graphemes, 12 codepoints, 34 bytes, 0 error)' 'UTF-8 not-ASCII (6 characters, 12 codepoints, 34 bytes, 0 error)' ooRexx> "noël👩👨👩👧🎅"~text~center(10)= -- T' noël👩👨👩👧🎅 ' T' noël👩👨👩👧🎅 ' ooRexx> "noël👩👨👩👧🎅"~text~center(10)~description= -- 'UTF-8 not-ASCII (10 graphemes, 16 codepoints, 38 bytes, 0 error)' 'UTF-8 not-ASCII (10 characters, 16 codepoints, 38 bytes, 0 error)' ooRexx> pad = "═" ooRexx> pad~description= -- 'UTF-8 not-ASCII (1 character, 1 codepoint, 3 bytes, 0 error)' (was 'UTF-8 not-ASCII (1 grapheme, 1 codepoint, 3 bytes, 0 error)' before automatic conversion of string literals to text) 'UTF-8 not-ASCII (1 character, 1 codepoint, 3 bytes, 0 error)' ooRexx> pad~c2x= -- 'E29590' 'E29590' ooRexx> "noël👩👨👩👧🎅"~text~center(10, pad)= -- T'══noël👩👨👩👧🎅══' T'══noël👩👨👩👧🎅══' ooRexx> "noël👩👨👩👧🎅"~text~center(10, pad)~description= -- 'UTF-8 not-ASCII (10 graphemes, 16 codepoints, 46 bytes, 0 error)' 'UTF-8 not-ASCII (10 characters, 16 codepoints, 46 bytes, 0 error)' -- =============================================================================== -- 2022 September 09 -- Start working on encoding~previousCodepointIndexB: ooRexx> "🎅noël"~text~c2x= -- 'F09F8E85 6E 6F C3AB 6C' 'F09F8E85 6E 6F C3AB 6C' 
ooRexx> .utf8_encoding~previousCodepointIndexB("🎅noël", 0)= -- 0 0 ooRexx> .utf8_encoding~previousCodepointIndexB("🎅noël", 1)= -- 1 1 ooRexx> .utf8_encoding~previousCodepointIndexB("🎅noël", 2)= -- 1 1 ooRexx> .utf8_encoding~previousCodepointIndexB("🎅noël", 3)= -- 1 1 ooRexx> .utf8_encoding~previousCodepointIndexB("🎅noël", 4)= -- 1 1 ooRexx> .utf8_encoding~previousCodepointIndexB("🎅noël", 5)= -- 1 1 ooRexx> .utf8_encoding~previousCodepointIndexB("🎅noël", 6)= -- 5 5 ooRexx> .utf8_encoding~previousCodepointIndexB("🎅noël", 7)= -- 6 6 -- Currently, only Byte_encoding and UTF8_encoding support this new method. -- Still a lot of work to detect the same errors as nextCodepointIndex. -- =============================================================================== -- 2022 September 08 -- Set/get an encoding on a string without having an associated RexxText -- (similar to MutableBuffer) ooRexx> s = "nonsense" ooRexx> s~encoding = -- returns the default encoding: (The UTF8_Encoding class) (The UTF8_Encoding class) ooRexx> s~hasText = -- 0 0 ooRexx> s~encoding = .UTF16BE_Encoding -- tag the string: encoded UTF16BE ooRexx> s~encoding = -- (The UTF16BE_Encoding class) (The UTF16BE_Encoding class) ooRexx> s~hasText = -- still no associated RexxText: 0 0 ooRexx> t = s~text -- associates a RexxText to the string ooRexx> s~hasText = -- the string has an associated text: 1 1 ooRexx> t~encoding = -- the encoding of the text is the one of the string: (The UTF16BE_Encoding class) (The UTF16BE_Encoding class) ooRexx> t~utf8 = -- T'湯湳敮獥' Soup T'湯湳敮獥' -- Setting/getting the encoding of the string will set/get the encoding of the associated RexxText ooRexx> s~encoding = .UTF16LE_Encoding ooRexx> t~encoding = -- the encoding of the text has been changed: (The UTF16LE_Encoding class) (The UTF16LE_Encoding class) ooRexx> t~utf8 = -- T'潮獮湥敳' tide T'潮獮湥敳' -- =============================================================================== -- 2022 September 07 /* Add method MutableBuffer~isASCII
Implementation more complex than for String, because mutable. Try to avoid rescanning the whole buffer, when possible. The native methods that modify the buffer never scan the buffer; they just set the boolean indicators is_ASCII_checked and is_ASCII. It's only the Rexx method ~isASCII which scans the whole buffer, if needed. Impacted methods: append caselessChangeStr changeStr delete delWord insert overlay replaceAt setBufferSize space translate */ ooRexx> b = .MutableBuffer~new("pere") ooRexx> b~isASCII = -- 1 1 ooRexx> b~insert("noël", 5)= -- M'pere noël' M'pere noël' ooRexx> b~isASCII = -- 0 0 ooRexx> b~setBufferSize(7)= -- M'pere no' M'pere no' ooRexx> b~isASCII= -- 1 1 ooRexx> b~append("ë", "l")= -- M'pere noël' M'pere noël' ooRexx> b~isASCII= -- 0 0 ooRexx> b~replaceAt("e", 8, 2)= -- M'pere noel' M'pere noel' ooRexx> b~isASCII= -- 1 0 ooRexx> b~changeStr("noel", "noël")= -- M'pere noël' M'pere noël' ooRexx> b~isASCII= -- 0 0 ooRexx> b~delete(8,2)= -- M'pere nol' M'pere nol' ooRexx> b~isASCII= -- 1 1 ooRexx> b~overlay("ël", 8)= -- M'pere noël' M'pere noël' ooRexx> b~isASCII= -- 0 0 ooRexx> b~delWord(2)= -- M'pere ' M'pere ' ooRexx> b~isASCII= -- 1 1 ooRexx> b~translate("è" || "91"x, "er ")= -- M'pèÑ' ("è" is "C3A8"x so "e"-->"C3"x, "r"-->"A8"x and " "-->"91"x) M'pèÑ' ooRexx> b~isASCII= -- 0 0 -- =============================================================================== -- 2022 August 18 /* Added Unicode case folding. See https://www.w3.org/TR/charmod-norm/ Case folding is the process of making two texts which differ only in case identical for comparison purposes. Implemented with utf8proc, which applies an NFKC normalization on the case-folded string.
Methods on RexxText: ~Casefold ~isCasefold */ ooRexx> "ß"~text~casefold= -- T'ss' T'ss' ooRexx> "㎒"~text~casefold= -- T'mhz' (jlf Nov 8, 2022: now unchanged because no longer NFKC) T'㎒' ooRexx> ("sTrasse", "straße", "STRASSE")~each{item~text~casefold}== an Array (shape [3], 3 items) 1 : T'strasse' 2 : T'strasse' 3 : T'strasse' -- utf8proc doesn't support language-sensitive case-folding. -- Example: -- The name of the second largest city in Turkey is "Diyarbakır", which contains both the dotted and dotless letters i. ooRexx> "Diyarbakır"~text~upper= -- T'DIYARBAKIR' should be DİYARBAKIR T'DIYARBAKIR' ooRexx> "DİYARBAKIR"~text~casefold= -- T'di̇yarbakir' should be diyarbakır T'di̇yarbakir' -- The Julia developers, who uses utf8proc, have decided to remain locale-independent. -- See https://github.com/JuliaLang/julia/issues/7848 -- =============================================================================== -- 2022 August 07 /* Added normalization NFC, NFD, NFKC, NFKD. http://unicode.org/faq/normalization.html Implemented with utf8proc. Methods on RexxText: ~NFC ~isNFC ~NFD ~isNFD ~NFKC ~isNFKC ~NFKD ~isNFKD Possible values for isNFxx: -1 unknown 0 no 1 yes A same text can be in several normalization forms. Text exclusively containing ASCII characters (U+0000..U+007F) is left unaffected by all of the Normalization Forms: The 4 indicators isNFxx are 1. The methods NFxx sets the corresponding indicator isNFxx - on the source text : 0 or 1 (test if both strings are equal) - on the result text : 1 */ -- The normalized text can be memorized on the original text: ooRexx> text = "père Noël"~text ooRexx> textNFD = text~nfd(memorize:.true) -- From now, the returned NFD is always the memorized text: ooRexx> text~nfd == textNFD= -- .true 1 /* Some remarks about the string used in this demo: - the first "äöü" is NFC, the second "äöü" is NFD - "x̂" is two codepoints in any normalization. - "ϔ" normalization forms are all different. 
- "ﷺ" is one of the worst cases regarding the expansion factor in NFKS/NFKS: 18x - "baffle"~text~subchar(3)= -- T'ffl' "baffle"~text~upper= -- T'BAfflE', should be BAFFLE (to rework: utf8proc supports only simple uppercase) The ligature disappears in NFK[CD] but not in NF[CD] */ ooRexx> "äöü äöü x̂ ϔ ﷺ baffle"~text~UnicodeCharacters== an Array (shape [22], 22 items) 1 : ( "ä" U+00E4 Ll 1 "LATIN SMALL LETTER A WITH DIAERESIS" ) 2 : ( "ö" U+00F6 Ll 1 "LATIN SMALL LETTER O WITH DIAERESIS" ) 3 : ( "ü" U+00FC Ll 1 "LATIN SMALL LETTER U WITH DIAERESIS" ) 4 : ( " " U+0020 Zs 1 "SPACE", "SP" ) 5 : ( "a" U+0061 Ll 1 "LATIN SMALL LETTER A" ) 6 : ( "̈" U+0308 Mn 0 "COMBINING DIAERESIS" ) 7 : ( "o" U+006F Ll 1 "LATIN SMALL LETTER O" ) 8 : ( "̈" U+0308 Mn 0 "COMBINING DIAERESIS" ) 9 : ( "u" U+0075 Ll 1 "LATIN SMALL LETTER U" ) 10 : ( "̈" U+0308 Mn 0 "COMBINING DIAERESIS" ) 11 : ( " " U+0020 Zs 1 "SPACE", "SP" ) 12 : ( "x" U+0078 Ll 1 "LATIN SMALL LETTER X" ) 13 : ( "̂" U+0302 Mn 0 "COMBINING CIRCUMFLEX ACCENT" ) 14 : ( " " U+0020 Zs 1 "SPACE", "SP" ) 15 : ( "ϔ" U+03D4 Lu 1 "GREEK UPSILON WITH DIAERESIS AND HOOK SYMBOL" ) 16 : ( " " U+0020 Zs 1 "SPACE", "SP" ) 17 : ( "ﷺ" U+FDFA Lo 1 "ARABIC LIGATURE SALLALLAHOU ALAYHE WASALLAM" ) 18 : ( " " U+0020 Zs 1 "SPACE", "SP" ) 19 : ( "b" U+0062 Ll 1 "LATIN SMALL LETTER B" ) 20 : ( "a" U+0061 Ll 1 "LATIN SMALL LETTER A" ) 21 : ( "ffl" U+FB04 Ll 1 "LATIN SMALL LIGATURE FFL" ) 22 : ( "e" U+0065 Ll 1 "LATIN SMALL LETTER E" ) ooRexx> "äöü äöü x̂ ϔ ﷺ baffle"~text~description= -- 'UTF-8 not-ASCII (18 graphemes, 22 codepoints, 34 bytes, 0 error)' 'UTF-8 not-ASCII (18 characters, 22 codepoints, 34 bytes, 0 error)' ooRexx> "äöü äöü x̂ ϔ ﷺ baffle"~text~upper= -- T'ÄÖÜ ÄÖÜ X̂ ϔ ﷺ BAfflE T'ÄÖÜ ÄÖÜ X̂ ϔ ﷺ BAfflE' /* NFD Normalization Form D Canonical Decomposition Characters are decomposed by canonical equivalence, and multiple combining characters are arranged in a specific order. 
*/ ooRexx> "äöü äöü x̂ ϔ ﷺ baffle"~text~nfd~UnicodeCharacters== an Array (shape [26], 26 items) 1 : ( "a" U+0061 Ll 1 "LATIN SMALL LETTER A" ) 2 : ( "̈" U+0308 Mn 0 "COMBINING DIAERESIS" ) 3 : ( "o" U+006F Ll 1 "LATIN SMALL LETTER O" ) 4 : ( "̈" U+0308 Mn 0 "COMBINING DIAERESIS" ) 5 : ( "u" U+0075 Ll 1 "LATIN SMALL LETTER U" ) 6 : ( "̈" U+0308 Mn 0 "COMBINING DIAERESIS" ) 7 : ( " " U+0020 Zs 1 "SPACE", "SP" ) 8 : ( "a" U+0061 Ll 1 "LATIN SMALL LETTER A" ) 9 : ( "̈" U+0308 Mn 0 "COMBINING DIAERESIS" ) 10 : ( "o" U+006F Ll 1 "LATIN SMALL LETTER O" ) 11 : ( "̈" U+0308 Mn 0 "COMBINING DIAERESIS" ) 12 : ( "u" U+0075 Ll 1 "LATIN SMALL LETTER U" ) 13 : ( "̈" U+0308 Mn 0 "COMBINING DIAERESIS" ) 14 : ( " " U+0020 Zs 1 "SPACE", "SP" ) 15 : ( "x" U+0078 Ll 1 "LATIN SMALL LETTER X" ) 16 : ( "̂" U+0302 Mn 0 "COMBINING CIRCUMFLEX ACCENT" ) 17 : ( " " U+0020 Zs 1 "SPACE", "SP" ) 18 : ( "ϒ" U+03D2 Lu 1 "GREEK UPSILON WITH HOOK SYMBOL" ) 19 : ( "̈" U+0308 Mn 0 "COMBINING DIAERESIS" ) 20 : ( " " U+0020 Zs 1 "SPACE", "SP" ) 21 : ( "ﷺ" U+FDFA Lo 1 "ARABIC LIGATURE SALLALLAHOU ALAYHE WASALLAM" ) 22 : ( " " U+0020 Zs 1 "SPACE", "SP" ) 23 : ( "b" U+0062 Ll 1 "LATIN SMALL LETTER B" ) 24 : ( "a" U+0061 Ll 1 "LATIN SMALL LETTER A" ) 25 : ( "ffl" U+FB04 Ll 1 "LATIN SMALL LIGATURE FFL" ) 26 : ( "e" U+0065 Ll 1 "LATIN SMALL LETTER E" ) ooRexx> "äöü äöü x̂ ϔ ﷺ baffle"~text~nfd~description= -- 'UTF-8 not-ASCII (18 graphemes, 26 codepoints, 39 bytes, 0 error)' 'UTF-8 not-ASCII (18 characters, 26 codepoints, 39 bytes, 0 error)' ooRexx> "äöü äöü x̂ ϔ ﷺ baffle"~text~nfd~upper= -- T'ÄÖÜ ÄÖÜ X̂ ϔ ﷺ BAfflE' T'ÄÖÜ ÄÖÜ X̂ ϔ ﷺ BAfflE' /* NFC Normalization Form C Canonical Decomposition, followed by Canonical Composition Characters are decomposed and then recomposed by canonical equivalence. 
*/ ooRexx> "äöü äöü x̂ ϔ ﷺ baffle"~text~nfc~UnicodeCharacters== an Array (shape [19], 19 items) 1 : ( "ä" U+00E4 Ll 1 "LATIN SMALL LETTER A WITH DIAERESIS" ) 2 : ( "ö" U+00F6 Ll 1 "LATIN SMALL LETTER O WITH DIAERESIS" ) 3 : ( "ü" U+00FC Ll 1 "LATIN SMALL LETTER U WITH DIAERESIS" ) 4 : ( " " U+0020 Zs 1 "SPACE", "SP" ) 5 : ( "ä" U+00E4 Ll 1 "LATIN SMALL LETTER A WITH DIAERESIS" ) 6 : ( "ö" U+00F6 Ll 1 "LATIN SMALL LETTER O WITH DIAERESIS" ) 7 : ( "ü" U+00FC Ll 1 "LATIN SMALL LETTER U WITH DIAERESIS" ) 8 : ( " " U+0020 Zs 1 "SPACE", "SP" ) 9 : ( "x" U+0078 Ll 1 "LATIN SMALL LETTER X" ) 10 : ( "̂" U+0302 Mn 0 "COMBINING CIRCUMFLEX ACCENT" ) 11 : ( " " U+0020 Zs 1 "SPACE", "SP" ) 12 : ( "ϔ" U+03D4 Lu 1 "GREEK UPSILON WITH DIAERESIS AND HOOK SYMBOL" ) 13 : ( " " U+0020 Zs 1 "SPACE", "SP" ) 14 : ( "ﷺ" U+FDFA Lo 1 "ARABIC LIGATURE SALLALLAHOU ALAYHE WASALLAM" ) 15 : ( " " U+0020 Zs 1 "SPACE", "SP" ) 16 : ( "b" U+0062 Ll 1 "LATIN SMALL LETTER B" ) 17 : ( "a" U+0061 Ll 1 "LATIN SMALL LETTER A" ) 18 : ( "ffl" U+FB04 Ll 1 "LATIN SMALL LIGATURE FFL" ) 19 : ( "e" U+0065 Ll 1 "LATIN SMALL LETTER E" ) ooRexx> "äöü äöü x̂ ϔ ﷺ baffle"~text~nfc~description= -- 'UTF-8 not-ASCII (18 graphemes, 19 codepoints, 31 bytes, 0 error)' 'UTF-8 not-ASCII (18 characters, 19 codepoints, 31 bytes, 0 error)' ooRexx> "äöü äöü x̂ ϔ ﷺ baffle"~text~nfc~upper= -- T'ÄÖÜ ÄÖÜ X̂ ϔ ﷺ BAfflE' T'ÄÖÜ ÄÖÜ X̂ ϔ ﷺ BAfflE' /* NFKD Normalization Form KD Compatibility Decomposition (K is used to stand for compatibility to avoid confusion with the C standing for composition) Characters are decomposed by compatibility, and multiple combining characters are arranged in a specific order. 
*/ ooRexx> "äöü äöü x̂ ϔ ﷺ baffle"~text~nfkd~UnicodeCharacters== an Array (shape [45], 45 items) 1 : ( "a" U+0061 Ll 1 "LATIN SMALL LETTER A" ) 2 : ( "̈" U+0308 Mn 0 "COMBINING DIAERESIS" ) 3 : ( "o" U+006F Ll 1 "LATIN SMALL LETTER O" ) 4 : ( "̈" U+0308 Mn 0 "COMBINING DIAERESIS" ) 5 : ( "u" U+0075 Ll 1 "LATIN SMALL LETTER U" ) 6 : ( "̈" U+0308 Mn 0 "COMBINING DIAERESIS" ) 7 : ( " " U+0020 Zs 1 "SPACE", "SP" ) 8 : ( "a" U+0061 Ll 1 "LATIN SMALL LETTER A" ) 9 : ( "̈" U+0308 Mn 0 "COMBINING DIAERESIS" ) 10 : ( "o" U+006F Ll 1 "LATIN SMALL LETTER O" ) 11 : ( "̈" U+0308 Mn 0 "COMBINING DIAERESIS" ) 12 : ( "u" U+0075 Ll 1 "LATIN SMALL LETTER U" ) 13 : ( "̈" U+0308 Mn 0 "COMBINING DIAERESIS" ) 14 : ( " " U+0020 Zs 1 "SPACE", "SP" ) 15 : ( "x" U+0078 Ll 1 "LATIN SMALL LETTER X" ) 16 : ( "̂" U+0302 Mn 0 "COMBINING CIRCUMFLEX ACCENT" ) 17 : ( " " U+0020 Zs 1 "SPACE", "SP" ) 18 : ( "Υ" U+03A5 Lu 1 "GREEK CAPITAL LETTER UPSILON" ) 19 : ( "̈" U+0308 Mn 0 "COMBINING DIAERESIS" ) 20 : ( " " U+0020 Zs 1 "SPACE", "SP" ) 21 : ( "ص" U+0635 Lo 1 "ARABIC LETTER SAD" ) 22 : ( "ل" U+0644 Lo 1 "ARABIC LETTER LAM" ) 23 : ( "ى" U+0649 Lo 1 "ARABIC LETTER ALEF MAKSURA" ) 24 : ( " " U+0020 Zs 1 "SPACE", "SP" ) 25 : ( "ا" U+0627 Lo 1 "ARABIC LETTER ALEF" ) 26 : ( "ل" U+0644 Lo 1 "ARABIC LETTER LAM" ) 27 : ( "ل" U+0644 Lo 1 "ARABIC LETTER LAM" ) 28 : ( "ه" U+0647 Lo 1 "ARABIC LETTER HEH" ) 29 : ( " " U+0020 Zs 1 "SPACE", "SP" ) 30 : ( "ع" U+0639 Lo 1 "ARABIC LETTER AIN" ) 31 : ( "ل" U+0644 Lo 1 "ARABIC LETTER LAM" ) 32 : ( "ي" U+064A Lo 1 "ARABIC LETTER YEH" ) 33 : ( "ه" U+0647 Lo 1 "ARABIC LETTER HEH" ) 34 : ( " " U+0020 Zs 1 "SPACE", "SP" ) 35 : ( "و" U+0648 Lo 1 "ARABIC LETTER WAW" ) 36 : ( "س" U+0633 Lo 1 "ARABIC LETTER SEEN" ) 37 : ( "ل" U+0644 Lo 1 "ARABIC LETTER LAM" ) 38 : ( "م" U+0645 Lo 1 "ARABIC LETTER MEEM" ) 39 : ( " " U+0020 Zs 1 "SPACE", "SP" ) 40 : ( "b" U+0062 Ll 1 "LATIN SMALL LETTER B" ) 41 : ( "a" U+0061 Ll 1 "LATIN SMALL LETTER A" ) 42 : ( "f" U+0066 Ll 1 "LATIN SMALL 
LETTER F" ) 43 : ( "f" U+0066 Ll 1 "LATIN SMALL LETTER F" ) 44 : ( "l" U+006C Ll 1 "LATIN SMALL LETTER L" ) 45 : ( "e" U+0065 Ll 1 "LATIN SMALL LETTER E" ) ooRexx> "äöü äöü x̂ ϔ ﷺ baffle"~text~nfkd~description= -- 'UTF-8 not-ASCII (37 graphemes, 45 codepoints, 69 bytes, 0 error)' 'UTF-8 not-ASCII (37 characters, 45 codepoints, 69 bytes, 0 error)' ooRexx> "äöü äöü x̂ ϔ ﷺ baffle"~text~nfkd~upper= -- T'ÄÖÜ ÄÖÜ X̂ Ϋ صلى الله عليه وسلم BAFFLE T'ÄÖÜ ÄÖÜ X̂ Ϋ صلى الله عليه وسلم BAFFLE' /* NFKC Normalization Form KC Compatibility Decomposition, followed by Canonical Composition Characters are decomposed by compatibility, then recomposed by canonical equivalence. */ ooRexx> "äöü äöü x̂ ϔ ﷺ baffle"~text~nfkc~UnicodeCharacters== an Array (shape [38], 38 items) 1 : ( "ä" U+00E4 Ll 1 "LATIN SMALL LETTER A WITH DIAERESIS" ) 2 : ( "ö" U+00F6 Ll 1 "LATIN SMALL LETTER O WITH DIAERESIS" ) 3 : ( "ü" U+00FC Ll 1 "LATIN SMALL LETTER U WITH DIAERESIS" ) 4 : ( " " U+0020 Zs 1 "SPACE", "SP" ) 5 : ( "ä" U+00E4 Ll 1 "LATIN SMALL LETTER A WITH DIAERESIS" ) 6 : ( "ö" U+00F6 Ll 1 "LATIN SMALL LETTER O WITH DIAERESIS" ) 7 : ( "ü" U+00FC Ll 1 "LATIN SMALL LETTER U WITH DIAERESIS" ) 8 : ( " " U+0020 Zs 1 "SPACE", "SP" ) 9 : ( "x" U+0078 Ll 1 "LATIN SMALL LETTER X" ) 10 : ( "̂" U+0302 Mn 0 "COMBINING CIRCUMFLEX ACCENT" ) 11 : ( " " U+0020 Zs 1 "SPACE", "SP" ) 12 : ( "Ϋ" U+03AB Lu 1 "GREEK CAPITAL LETTER UPSILON WITH DIALYTIKA" ) 13 : ( " " U+0020 Zs 1 "SPACE", "SP" ) 14 : ( "ص" U+0635 Lo 1 "ARABIC LETTER SAD" ) 15 : ( "ل" U+0644 Lo 1 "ARABIC LETTER LAM" ) 16 : ( "ى" U+0649 Lo 1 "ARABIC LETTER ALEF MAKSURA" ) 17 : ( " " U+0020 Zs 1 "SPACE", "SP" ) 18 : ( "ا" U+0627 Lo 1 "ARABIC LETTER ALEF" ) 19 : ( "ل" U+0644 Lo 1 "ARABIC LETTER LAM" ) 20 : ( "ل" U+0644 Lo 1 "ARABIC LETTER LAM" ) 21 : ( "ه" U+0647 Lo 1 "ARABIC LETTER HEH" ) 22 : ( " " U+0020 Zs 1 "SPACE", "SP" ) 23 : ( "ع" U+0639 Lo 1 "ARABIC LETTER AIN" ) 24 : ( "ل" U+0644 Lo 1 "ARABIC LETTER LAM" ) 25 : ( "ي" U+064A Lo 1 "ARABIC LETTER YEH" ) 26 
: ( "ه" U+0647 Lo 1 "ARABIC LETTER HEH" ) 27 : ( " " U+0020 Zs 1 "SPACE", "SP" ) 28 : ( "و" U+0648 Lo 1 "ARABIC LETTER WAW" ) 29 : ( "س" U+0633 Lo 1 "ARABIC LETTER SEEN" ) 30 : ( "ل" U+0644 Lo 1 "ARABIC LETTER LAM" ) 31 : ( "م" U+0645 Lo 1 "ARABIC LETTER MEEM" ) 32 : ( " " U+0020 Zs 1 "SPACE", "SP" ) 33 : ( "b" U+0062 Ll 1 "LATIN SMALL LETTER B" ) 34 : ( "a" U+0061 Ll 1 "LATIN SMALL LETTER A" ) 35 : ( "f" U+0066 Ll 1 "LATIN SMALL LETTER F" ) 36 : ( "f" U+0066 Ll 1 "LATIN SMALL LETTER F" ) 37 : ( "l" U+006C Ll 1 "LATIN SMALL LETTER L" ) 38 : ( "e" U+0065 Ll 1 "LATIN SMALL LETTER E" ) ooRexx> "äöü äöü x̂ ϔ ﷺ baffle"~text~nfkc~description= -- 'UTF-8 not-ASCII (37 graphemes, 38 codepoints, 61 bytes, 0 error)' 'UTF-8 not-ASCII (37 characters, 38 codepoints, 61 bytes, 0 error)' ooRexx> "äöü äöü x̂ ϔ ﷺ baffle"~text~nfkc~upper= -- T'ÄÖÜ ÄÖÜ X̂ Ϋ صلى الله عليه وسلم BAFFLE' T'ÄÖÜ ÄÖÜ X̂ Ϋ صلى الله عليه وسلم BAFFLE' -- The normalization forms are implemented only for UTF-8 and WTF-8. ooRexx> "D800 DC01"x~text("utf16")~nfd~UnicodeCharacters== -- Method TRANSFORM is ABSTRACT and cannot be directly invoked. Method TRANSFORM is ABSTRACT and cannot be directly invoked. Error code= 93.965 ooRexx> "D800 DC01"x~text("utf16")~utf8~nfd~UnicodeCharacters== an Array (shape [1], 1 items) 1 : ( "𐀁" U+10001 Lo 1 "LINEAR B SYLLABLE B038 E" ) ooRexx> "\uD800\uDC01"~text("wtf8")~unescape~nfd~UnicodeCharacters== an Array (shape [1], 1 items) 1 : ( "𐀁" U+10001 Lo 1 "LINEAR B SYLLABLE B038 E" ) -- If the WTF-8 string is not a valid UTF-8 string then an error is raised by utf8proc ooRexx> "D800"x ~text("wtf16")~wtf8~nfd~UnicodeCharacters== -- Invalid UTF-8 string Invalid UTF-8 string. Error code= 22.900 ooRexx> "\uD800"~text("wtf8")~unescape~nfd~UnicodeCharacters== -- Invalid UTF-8 string Invalid UTF-8 string. 
Error code= 22.900 -- =============================================================================== -- 2022 August 03 /* https://discourse.julialang.org/t/stupid-question-on-unicode/27674/10 Should I support this when unescaping? (High surrogate followed by low surrogate) Surrogate pairs are a UTF-16-specific construct. However, string escapes aren’t byte sequences of a particular encoding. They are somewhat arbitrary substitutions / macros. */ ooRexx> "\uD83D\uDE3F"~text~unescape~errors== an Array (shape [6], 6 items) 1 : 'UTF-8 encoding: byte sequence at byte-position 1 has an invalid continuation byte 160 (A0x) at byte-position 2 (high surrogate, use WTF-8).' 2 : 'UTF-8 encoding: byte sequence at byte-position 2 has an invalid start byte 160 (A0x) (non-shortest form).' 3 : 'UTF-8 encoding: byte sequence at byte-position 3 has an invalid start byte 189 (BDx) (non-shortest form).' 4 : 'UTF-8 encoding: byte sequence at byte-position 4 has an invalid continuation byte 184 (B8x) at byte-position 5 (low surrogate, use WTF-8).' 5 : 'UTF-8 encoding: byte sequence at byte-position 5 has an invalid start byte 184 (B8x) (non-shortest form).' 6 : 'UTF-8 encoding: byte sequence at byte-position 6 has an invalid start byte 191 (BFx) (non-shortest form).' ooRexx> "\uD83D\uDE3F"~text~wtf8~unescape~errors== (The NIL object) -- Yes, I should support it when the encoding is WTF-8, because concatenation already handles this case correctly: ooRexx> ("\uD83D"~text~wtf8~unescape || "\uDE3F"~text~wtf8~unescape)~UnicodeCharacters== an Array (shape [1], 1 items) 1 : ( "😿" U+1F63F So 2 "CRYING CAT FACE" ) ooRexx> ("\uD83D"~text~wtf8~unescape || "\uDE3F"~text~wtf8~unescape)~description= 'WTF-8 not-ASCII (1 character, 1 codepoint, 4 bytes, 0 error)' -- Done, now "\uD83D\uDE3F"~text~wtf8~unescape= -- "😿" -- =============================================================================== -- 2022 July 20 /* I realize that I can pass options when filtering the unicode characters.
Same options as when sending the message "matcher" to a string. -- Options: not wholestring, trace with prefix "> " */ ooRexx> .unicode~characters("father", wholeString:0, trace:1, prefix:">") >description: stringChunkPattern="father" wholeString=0 caseless=1 >stringPattern="father" >matcher: expose description stringPattern; use strict arg string; return string~caselessPos(stringPattern) <> 0 -- Same options with a regular expression. -- "/father" is faster than "/.*father.*" but still very slow compared to "father" ooRexx> .unicode~characters("/father", wholeString:0, trace:1, prefix:"> ") > description: stringChunkPattern="/father" wholeString=0 caseless=1 > stringPattern="father" > pattern = .Pattern~compile(stringPattern, .RegexCompiler~new(.RegexCompiler~caseless)) > matcher: expose description pattern; use strict arg string; return pattern~find(string)~matched -- Note that "/.*father.*" in mode not wholestring is just unusable: 419 sec under MBP 2010 Intel Core 2 Duo -- [2022 Dec 22] Still unusable under MBP 2021 M1 Pro: 78s (only 5.37 faster) -- =============================================================================== -- 2022 July 17 -- For convenience, add an optional parameter 'filter' to the method .unicode~characters ooRexx> .unicode~characters("*rex*")== an Array (shape [15], 15 items) 1 : ( "꜌" U+A70C Sk 1 "MODIFIER LETTER EXTRA-LOW DOTTED TONE BAR" ) 2 : ( "˩" U+02E9 Sk 1 "MODIFIER LETTER EXTRA-LOW TONE BAR" ) 3 : ( "꜍" U+A70D Sk 1 "MODIFIER LETTER EXTRA-HIGH DOTTED LEFT-STEM TONE BAR" ) 4 : ( "꜑" U+A711 Sk 1 "MODIFIER LETTER EXTRA-LOW DOTTED LEFT-STEM TONE BAR" ) 5 : ( "˥" U+02E5 Sk 1 "MODIFIER LETTER EXTRA-HIGH TONE BAR" ) 6 : ( "꜈" U+A708 Sk 1 "MODIFIER LETTER EXTRA-HIGH DOTTED TONE BAR" ) 7 : ( "🖕" U+1F595 So 2 "REVERSED HAND WITH MIDDLE FINGER EXTENDED" ) 8 : ( "ꎅ" U+A385 Lo 2 "YI SYLLABLE RREX" ) 9 : ( "꜒" U+A712 Sk 1 "MODIFIER LETTER EXTRA-HIGH LEFT-STEM TONE BAR" ) 10 : ( "ꏑ" U+A3D1 Lo 2 "YI SYLLABLE REX" ) 11 : ( "ꎜ" U+A39C Lo 2 "YI 
SYLLABLE NREX" ) 12 : ( "꜖" U+A716 Sk 1 "MODIFIER LETTER EXTRA-LOW LEFT-STEM TONE BAR" ) 13 : ( "𖩿" U+16A7F Lo 1 "TANGSA LETTER EX" ) 14 : ( "𝍊" U+1D34A So 1 "TETRAGRAM FOR EXHAUSTION" ) 15 : ( "🦖" U+1F996 So 2 "T-REX" ) -- is equivalent to ooRexx> matcher = "*rex*"~matcher; .unicode~characters~select{expose matcher; matcher~(item~name)}== an Array (shape [4], 4 items) 1 : ( "ꎅ" U+A385 Lo 2 "YI SYLLABLE RREX" ) 2 : ( "ꎜ" U+A39C Lo 2 "YI SYLLABLE NREX" ) 3 : ( "ꏑ" U+A3D1 Lo 2 "YI SYLLABLE REX" ) 4 : ( "🦖" U+1F996 So 2 "T-REX" ) -- Regular expressions are supported: -- returns all the characters whose name starts with "math" and ends with "psi" ooRexx> .unicode~characters("/^math.*psi$")== an Array (shape [10], 10 items) 1 : ( "𝚿" U+1D6BF Lu 1 "MATHEMATICAL BOLD CAPITAL PSI" ) 2 : ( "𝛙" U+1D6D9 Ll 1 "MATHEMATICAL BOLD SMALL PSI" ) 3 : ( "𝛹" U+1D6F9 Lu 1 "MATHEMATICAL ITALIC CAPITAL PSI" ) 4 : ( "𝜓" U+1D713 Ll 1 "MATHEMATICAL ITALIC SMALL PSI" ) 5 : ( "𝜳" U+1D733 Lu 1 "MATHEMATICAL BOLD ITALIC CAPITAL PSI" ) 6 : ( "𝝍" U+1D74D Ll 1 "MATHEMATICAL BOLD ITALIC SMALL PSI" ) 7 : ( "𝝭" U+1D76D Lu 1 "MATHEMATICAL SANS-SERIF BOLD CAPITAL PSI" ) 8 : ( "𝞇" U+1D787 Ll 1 "MATHEMATICAL SANS-SERIF BOLD SMALL PSI" ) 9 : ( "𝞧" U+1D7A7 Lu 1 "MATHEMATICAL SANS-SERIF BOLD ITALIC CAPITAL PSI" ) 10 : ( "𝟁" U+1D7C1 Ll 1 "MATHEMATICAL SANS-SERIF BOLD ITALIC SMALL PSI" ) /* The regular expressions are implemented with 100% ooRexx code, and as such can be particularly inefficient... When applied to a collection of 43885 Unicode characters, we have: .unicode~characters("/.*father.*") -- 30.5 sec The same filter without regular expression: .unicode~characters("*father*") -- 0.9 sec Something to clarify: Why such a difference of duration for the following pieces of code? 
In the end, it's the same code in both cases: matcher = "/.*father.*"~matcher; supplier = .unicode~characters; collectedItems = .Array~new; do while supplier~available; item = supplier~item; if matcher~(item~name) then collectedItems~append(item); supplier~next; end; collectedItems== 64 sec matcher = "/.*father.*"~matcher; .unicode~characters~select{expose matcher; matcher~(item~name)}== 31 sec */ -- =============================================================================== -- 2022 July 13 /* Rework ~unescape to be closer to other languages: \u{...} and \U{...} are equivalent \u{X..X} is now hexadecimal, no more decimal codepoint. The first character must be 0..9. \uXXXX is now supported \UXXXXXXXX is now supported Ex: */ ooRexx> "\u{bed} is different from \u{0bed}"~text~unescape= -- T'🛏 is different from ௭' T'🛏 is different from ௭' ooRexx> .unicode~character("bed")= -- ( "🛏" U+1F6CF So 1 "BED" ) ( "🛏" U+1F6CF So 1 "BED" ) ooRexx> .unicode~character("bed", hexadecimal:.true)= -- ( "௭" U+0BED Nd 1 "TAMIL DIGIT SEVEN" ) ( "௭" U+0BED Nd 1 "TAMIL DIGIT SEVEN" ) ooRexx> .unicode~character("U+0bed")= -- ( "௭" U+0BED Nd 1 "TAMIL DIGIT SEVEN" ) ( "௭" U+0BED Nd 1 "TAMIL DIGIT SEVEN" ) ooRexx> "The \u{t-rex} shows his \u{flexed biceps}!"~text~unescape= -- T'The 🦖 shows his 💪!' T'The 🦖 shows his 💪!' ooRexx> "\u0031 + \u0032\u0033 = \u0032\u0034"~text~unescape= -- T'1 + 23 = 24' T'1 + 23 = 24' ooRexx> "\U00000031 + \U00000032\U00000033 = \U00000032\U00000034"~text~unescape= -- T'1 + 23 = 24' T'1 + 23 = 24' -- =============================================================================== -- 2022 February 13 /* New method unescape, available only for Byte, UTF-8 and WTF-8. \b backspace (BS) \t horizontal tab (HT) \n linefeed (LF) \f form feed (FF) \r carriage return (CR) \u{Unicode name} Character name in the Unicode database \u{N..N} Unicode character denoted by 1-8 hex digits. The first character must be a digit 0..9. 
\u{U+X..X} Unicode character denoted by 1-n hex digits \x{X..X} sequence of 1..n hexadecimal digits Examples: */ ooRexx> "hello\u{space}John\n"~text~unescape= -- T'hello John[0A]' T'hello John[0A]' ooRexx> "hello\u{20}John\n"~text~unescape= T'hello John[0A]' ooRexx> "hello\u{U+20}John\n"~text~unescape= Expected U+ or u+ followed by 4..6 hex digits, got '20'. Error code= 93.900 ooRexx> -- \u is not supported for Byte encoding, you can use \x ooRexx> "hello\u{U+20}John\n"~text("byte")~unescape= -- Byte encoding: \u not supported. Byte encoding: \u not supported. Error code= 23.900 ooRexx> "hello\x{20}John\n"~text("byte")~unescape -- T'hello John[0A]' ooRexx> -- No implementation for UTF-16, WTF-16, UTF-32. ooRexx> "hello\u{U+20}John\n"~text~utf16~unescape= -- Method UNESCAPE is ABSTRACT and cannot be directly invoked. Method UNESCAPE is ABSTRACT and cannot be directly invoked. Error code= 93.965 -- =============================================================================== -- 2021 September 30 /* New methods: .String join (was concatenateSeparated) .MutableBuffer join (was concatenateSeparated) .Unicode [] (equivalent to .Unicode~character) .UnicodeCharacter makeRexxText text wtf8 wtf16 wtf16be wtf16le .RexxText join left right x2d Examples: */ -- https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries -- no break before ZWJ (GB9), but break after if not emoji modifier sequence or emoji zwj sequence (GB11) ooRexx> .unicode["zwj"]~utf8~join("ab", "cd", .unicode["woman"]~utf8, .unicode["father christmas"]~utf8)~c2g= -- '61 62E2808D 63 64E2808D F09F91A9E2808DF09F8E85' '61 62E2808D 63 64E2808D F09F91A9E2808DF09F8E85' ooRexx> .unicode["zwj"]~utf8~join("ab", "cd", .unicode["woman"]~utf8, .unicode["father christmas"]~utf8)~graphemes== a CharacterSupplier 1 : T'a' 2 : T'b' 3 : T'c' 4 : T'd' 5 : T'👩🎅' ooRexx> "noël👩👨👩👧🎅"~text~UnicodeCharacters== an Array (shape [12], 12 items) 1 : ( "n" U+006E Ll 1 "LATIN SMALL LETTER N" ) 2 : ( "o" U+006F Ll 1 "LATIN SMALL 
LETTER O" ) 3 : ( "ë" U+00EB Ll 1 "LATIN SMALL LETTER E WITH DIAERESIS" ) 4 : ( "l" U+006C Ll 1 "LATIN SMALL LETTER L" ) 5 : ( "👩" U+1F469 So 2 "WOMAN" ) 6 : ( "" U+200D Cf 0 "ZERO WIDTH JOINER", "ZWJ" ) 7 : ( "👨" U+1F468 So 2 "MAN" ) 8 : ( "" U+200D Cf 0 "ZERO WIDTH JOINER", "ZWJ" ) 9 : ( "👩" U+1F469 So 2 "WOMAN" ) 10 : ( "" U+200D Cf 0 "ZERO WIDTH JOINER", "ZWJ" ) 11 : ( "👧" U+1F467 So 2 "GIRL" ) 12 : ( "🎅" U+1F385 So 2 "FATHER CHRISTMAS" ) -- https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries -- Do not break within emoji modifier sequences or emoji zwj sequences (GB11). ooRexx> "noël👩👨👩👧🎅"~text~graphemes== a CharacterSupplier 1 : T'n' 2 : T'o' 3 : T'ë' 4 : T'l' 5 : T'👩👨👩👧' 6 : T'🎅' ooRexx> do i=0 to 9; "left("i") = " || "noël👩👨👩👧🎅"~text~left(i)=; end T'left(0) = ' T'left(1) = n' T'left(2) = no' T'left(3) = noë' T'left(4) = noël' T'left(5) = noël👩👨👩👧' T'left(6) = noël👩👨👩👧🎅' T'left(7) = noël👩👨👩👧🎅 ' T'left(8) = noël👩👨👩👧🎅 ' T'left(9) = noël👩👨👩👧🎅 ' ooRexx> do i=0 to 9; "right("i") = " || "noël👩👨👩👧🎅"~text~right(i)=; end T'right(0) = ' T'right(1) = 🎅' T'right(2) = 👩👨👩👧🎅' T'right(3) = l👩👨👩👧🎅' T'right(4) = ël👩👨👩👧🎅' T'right(5) = oël👩👨👩👧🎅' T'right(6) = noël👩👨👩👧🎅' T'right(7) = noël👩👨👩👧🎅' T'right(8) = noël👩👨👩👧🎅' T'right(9) = noël👩👨👩👧🎅' -- =============================================================================== -- 2021 September 28 /* New methods: .RexxText reverse Examples: */ -- Correct reverse ooRexx> "noël"~text~c2x= -- '6E 6F C3AB 6C' '6E 6F C3AB 6C' ooRexx> "noël"~text~reverse~c2x= -- '6C C3AB 6F 6E' '6C C3AB 6F 6E' ooRexx> "noël"~text~reverse= -- T'lëon' T'lëon' -- Correct reverse (was Wrong reverse before automatic conversion of string literals to text) ooRexx> "noël"~c2x= -- '6E 6F C3AB 6C' '6E 6F C3AB 6C' ooRexx> "noël"~reverse~c2x= -- '6C C3AB 6F 6E' '6C C3AB 6F 6E' ooRexx> "noël"~reverse= -- T'lëon' T'lëon' -- =============================================================================== -- 2021 September 27 /* New native methods: .Unicode 
codepointToLower codepointToUpper codepointToTitle codepointIsLower codepointIsUpper New methods: .RexxText lower upper isLower isUpper characters Examples: */ ooRexx> "aàâäeéèêëiîïoôöuûü"~text~isUpper= -- .false 0 ooRexx> "aàâäeéèêëiîïoôöuûü"~text~isLower= -- .true 1 ooRexx> "AÀÂÄEÉÈÊËIÎÏOÔÖUÛÜ"~text~isUpper= -- .true 1 ooRexx> "AÀÂÄEÉÈÊËIÎÏOÔÖUÛÜ"~text~isLower= -- .false 0 ooRexx> "Le père Noël est fatigué..."~text~upper= -- T'LE PÈRE NOËL EST FATIGUÉ...' T'LE PÈRE NOËL EST FATIGUÉ...' ooRexx> "LE PÈRE NOËL EST FATIGUÉ..."~text~lower= -- T'le père noël est fatigué...' T'le père noël est fatigué...' /* utf8proc supports only the basic cases (those in UnicodeData.txt). The cases described in SpecialCasing.txt are not supported by utf8proc. Examples: */ -- # The German es-zed is special--the normal mapping is to SS. -- # Note: the titlecase should never occur in practice. It is equal to titlecase(uppercase(<es-zed>)) -- # <code>; <lower>; <title>; <upper>; (<condition_list>;)? # <comment> -- 00DF; 00DF; 0053 0073; 0053 0053; # LATIN SMALL LETTER SHARP S /* TODO: full casing not yet implemented .Unicode~codepointToLowerFull .Unicode~codepointToUpperFull .Unicode~codepointToTitleFull The rest of the framework is ready for full casing. */ ooRexx> .unicode~character("LATIN SMALL LETTER SHARP S")~utf8= -- T'ß' T'ß' ooRexx> .unicode~character("LATIN SMALL LETTER SHARP S")~toUpperSimple= -- 7838, which is the codepoint of (U+1E9E Lu "LATIN CAPITAL LETTER SHARP S") 7838 ooRexx> .unicode~character(7838)~utf8= -- T'ẞ' T'ẞ' -- T'ß' to uppercase should be T'SS': ooRexx> "0053 0053"x~text("utf16")~UnicodeCharacters== an Array (shape [2], 2 items) 1 : ( "S" U+0053 Lu 1 "LATIN CAPITAL LETTER S" ) 2 : ( "S" U+0053 Lu 1 "LATIN CAPITAL LETTER S" ) -- # Preserve canonical equivalence for I with dot. Turkic is handled below. 
-- 0130; 0069 0307; 0130; 0130; # LATIN CAPITAL LETTER I WITH DOT ABOVE ooRexx> .unicode~character("LATIN CAPITAL LETTER I WITH DOT ABOVE")~utf8= -- T'İ' T'İ' ooRexx> .unicode~character("LATIN CAPITAL LETTER I WITH DOT ABOVE")~toLowerSimple= -- 105, which is the codepoint of (U+0069 Ll "LATIN SMALL LETTER I") 105 ooRexx> .unicode~character(105)~utf8= -- T'i' T'i' -- T'İ' to lowercase should be T'i̇̇': ooRexx> "0069 0307"x~text("utf16")~UnicodeCharacters== an Array (shape [2], 2 items) 1 : ( "i" U+0069 Ll 1 "LATIN SMALL LETTER I" ) 2 : ( "̇" U+0307 Mn 0 "COMBINING DOT ABOVE" ) -- # Turkish and Azeri -- # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri -- # The following rules handle those cases. -- 0130; 0069; 0130; 0130; tr; # LATIN CAPITAL LETTER I WITH DOT ABOVE -- 0130; 0069; 0130; 0130; az; # LATIN CAPITAL LETTER I WITH DOT ABOVE -- # Note: the following case is already in the UnicodeData.txt file. -- # 0131; 0131; 0049; 0049; tr; # LATIN SMALL LETTER DOTLESS I ooRexx> .unicode~character("LATIN SMALL LETTER DOTLESS I")~utf8= -- T'ı' T'ı' ooRexx> .unicode~character("LATIN SMALL LETTER DOTLESS I")~toUpperSimple= -- 73, which is the codepoint of (U+0049 Lu "LATIN CAPITAL LETTER I") 73 ooRexx> .unicode~character(73)~utf8= -- T'I' T'I' -- Which characters have their title character different from their upper character? 
ooRexx> .unicode~characters~select{item~toTitleSimple <> item~toUpperSimple}~each{.Unicode[item~toTitleSimple]~utf8 .Unicode[item~ToUpperSimple]~utf8 item~utf8 item}== an Array (shape [58], 58 items) 1 : T'Dž DŽ DŽ "DŽ" U+01C4 Lu 1 "LATIN CAPITAL LETTER DZ WITH CARON" ' 2 : T'Dž DŽ Dž "Dž" U+01C5 Lt 1 "LATIN CAPITAL LETTER D WITH SMALL LETTER Z WITH CARON" ' 3 : T'Dž DŽ dž "dž" U+01C6 Ll 1 "LATIN SMALL LETTER DZ WITH CARON" ' 4 : T'Lj LJ LJ "LJ" U+01C7 Lu 1 "LATIN CAPITAL LETTER LJ" ' 5 : T'Lj LJ Lj "Lj" U+01C8 Lt 1 "LATIN CAPITAL LETTER L WITH SMALL LETTER J" ' 6 : T'Lj LJ lj "lj" U+01C9 Ll 1 "LATIN SMALL LETTER LJ" ' 7 : T'Nj NJ NJ "NJ" U+01CA Lu 1 "LATIN CAPITAL LETTER NJ" ' 8 : T'Nj NJ Nj "Nj" U+01CB Lt 1 "LATIN CAPITAL LETTER N WITH SMALL LETTER J" ' 9 : T'Nj NJ nj "nj" U+01CC Ll 1 "LATIN SMALL LETTER NJ" ' 10 : T'Dz DZ DZ "DZ" U+01F1 Lu 1 "LATIN CAPITAL LETTER DZ" ' 11 : T'Dz DZ Dz "Dz" U+01F2 Lt 1 "LATIN CAPITAL LETTER D WITH SMALL LETTER Z" ' 12 : T'Dz DZ dz "dz" U+01F3 Ll 1 "LATIN SMALL LETTER DZ" ' 13 : T'ა Ა ა "ა" U+10D0 Ll 1 "GEORGIAN LETTER AN" ' 14 : T'ბ Ბ ბ "ბ" U+10D1 Ll 1 "GEORGIAN LETTER BAN" ' 15 : T'გ Გ გ "გ" U+10D2 Ll 1 "GEORGIAN LETTER GAN" ' 16 : T'დ Დ დ "დ" U+10D3 Ll 1 "GEORGIAN LETTER DON" ' 17 : T'ე Ე ე "ე" U+10D4 Ll 1 "GEORGIAN LETTER EN" ' 18 : T'ვ Ვ ვ "ვ" U+10D5 Ll 1 "GEORGIAN LETTER VIN" ' 19 : T'ზ Ზ ზ "ზ" U+10D6 Ll 1 "GEORGIAN LETTER ZEN" ' 20 : T'თ Თ თ "თ" U+10D7 Ll 1 "GEORGIAN LETTER TAN" ' 21 : T'ი Ი ი "ი" U+10D8 Ll 1 "GEORGIAN LETTER IN" ' 22 : T'კ Კ კ "კ" U+10D9 Ll 1 "GEORGIAN LETTER KAN" ' 23 : T'ლ Ლ ლ "ლ" U+10DA Ll 1 "GEORGIAN LETTER LAS" ' 24 : T'მ Მ მ "მ" U+10DB Ll 1 "GEORGIAN LETTER MAN" ' 25 : T'ნ Ნ ნ "ნ" U+10DC Ll 1 "GEORGIAN LETTER NAR" ' 26 : T'ო Ო ო "ო" U+10DD Ll 1 "GEORGIAN LETTER ON" ' 27 : T'პ Პ პ "პ" U+10DE Ll 1 "GEORGIAN LETTER PAR" ' 28 : T'ჟ Ჟ ჟ "ჟ" U+10DF Ll 1 "GEORGIAN LETTER ZHAR" ' 29 : T'რ Რ რ "რ" U+10E0 Ll 1 "GEORGIAN LETTER RAE" ' 30 : T'ს Ს ს "ს" U+10E1 Ll 1 "GEORGIAN LETTER SAN" ' 31 : T'ტ Ტ ტ "ტ" U+10E2 
Ll 1 "GEORGIAN LETTER TAR" ' 32 : T'უ Უ უ "უ" U+10E3 Ll 1 "GEORGIAN LETTER UN" ' 33 : T'ფ Ფ ფ "ფ" U+10E4 Ll 1 "GEORGIAN LETTER PHAR" ' 34 : T'ქ Ქ ქ "ქ" U+10E5 Ll 1 "GEORGIAN LETTER KHAR" ' 35 : T'ღ Ღ ღ "ღ" U+10E6 Ll 1 "GEORGIAN LETTER GHAN" ' 36 : T'ყ Ყ ყ "ყ" U+10E7 Ll 1 "GEORGIAN LETTER QAR" ' 37 : T'შ Შ შ "შ" U+10E8 Ll 1 "GEORGIAN LETTER SHIN" ' 38 : T'ჩ Ჩ ჩ "ჩ" U+10E9 Ll 1 "GEORGIAN LETTER CHIN" ' 39 : T'ც Ც ც "ც" U+10EA Ll 1 "GEORGIAN LETTER CAN" ' 40 : T'ძ Ძ ძ "ძ" U+10EB Ll 1 "GEORGIAN LETTER JIL" ' 41 : T'წ Წ წ "წ" U+10EC Ll 1 "GEORGIAN LETTER CIL" ' 42 : T'ჭ Ჭ ჭ "ჭ" U+10ED Ll 1 "GEORGIAN LETTER CHAR" ' 43 : T'ხ Ხ ხ "ხ" U+10EE Ll 1 "GEORGIAN LETTER XAN" ' 44 : T'ჯ Ჯ ჯ "ჯ" U+10EF Ll 1 "GEORGIAN LETTER JHAN" ' 45 : T'ჰ Ჰ ჰ "ჰ" U+10F0 Ll 1 "GEORGIAN LETTER HAE" ' 46 : T'ჱ Ჱ ჱ "ჱ" U+10F1 Ll 1 "GEORGIAN LETTER HE" ' 47 : T'ჲ Ჲ ჲ "ჲ" U+10F2 Ll 1 "GEORGIAN LETTER HIE" ' 48 : T'ჳ Ჳ ჳ "ჳ" U+10F3 Ll 1 "GEORGIAN LETTER WE" ' 49 : T'ჴ Ჴ ჴ "ჴ" U+10F4 Ll 1 "GEORGIAN LETTER HAR" ' 50 : T'ჵ Ჵ ჵ "ჵ" U+10F5 Ll 1 "GEORGIAN LETTER HOE" ' 51 : T'ჶ Ჶ ჶ "ჶ" U+10F6 Ll 1 "GEORGIAN LETTER FI" ' 52 : T'ჷ Ჷ ჷ "ჷ" U+10F7 Ll 1 "GEORGIAN LETTER YN" ' 53 : T'ჸ Ჸ ჸ "ჸ" U+10F8 Ll 1 "GEORGIAN LETTER ELIFI" ' 54 : T'ჹ Ჹ ჹ "ჹ" U+10F9 Ll 1 "GEORGIAN LETTER TURNED GAN" ' 55 : T'ჺ Ჺ ჺ "ჺ" U+10FA Ll 1 "GEORGIAN LETTER AIN" ' 56 : T'ჽ Ჽ ჽ "ჽ" U+10FD Ll 1 "GEORGIAN LETTER AEN" ' 57 : T'ჾ Ჾ ჾ "ჾ" U+10FE Ll 1 "GEORGIAN LETTER HARD SIGN" ' 58 : T'ჿ Ჿ ჿ "ჿ" U+10FF Ll 1 "GEORGIAN LETTER LABIAL SIGN" ' -- =============================================================================== -- 2021 September 22 /* New native methods: .Unicode codepointBidiMirrored, codepointDecompositionType. Add character aliases. .unicode~characters now returns a supplier, instead of the internal array of characters. The indexes of the characters supplier are the codepoints, not the indexes of the internal array, which are codepoint+2. 
*/ ooRexx> .unicode~characters== an UnicodeCharacterSupplier 0 : ( "" U+0000 Cc 0 "", "NULL", "NUL" ) 1 : ( "" U+0001 Cc 0 "", "START OF HEADING", "SOH" ) 2 : ( "" U+0002 Cc 0 "", "START OF TEXT", "STX" ) 3 : ( "" U+0003 Cc 0 "", "END OF TEXT", "ETX" ) 4 : ( "" U+0004 Cc 0 "", "END OF TRANSMISSION", "EOT" ) 5 : ( "" U+0005 Cc 0 "", "ENQUIRY", "ENQ" ) 6 : ( "" U+0006 Cc 0 "", "ACKNOWLEDGE", "ACK" ) 7 : ( "" U+0007 Cc 0 "", "ALERT", "BEL" ) 8 : ( "" U+0008 Cc 0 "", "BACKSPACE", "BS" ) 9 : ( "" U+0009 Cc 0 "", "CHARACTER TABULATION", "HORIZONTAL TABULATION", "HT", "TAB" ) 10 : ( "" U+000A Cc 0 "", "LINE FEED", "NEW LINE", "END OF LINE", "LF", "NL", "EOL" ) 11 : ( "" U+000B Cc 0 "", "LINE TABULATION", "VERTICAL TABULATION", "VT" ) 12 : ( "" U+000C Cc 0 "", "FORM FEED", "FF" ) 13 : ( "" U+000D Cc 0 "", "CARRIAGE RETURN", "CR" ) 14 : ( "" U+000E Cc 0 "", "SHIFT OUT", "LOCKING-SHIFT ONE", "SO" ) 15 : ( "" U+000F Cc 0 "", "SHIFT IN", "LOCKING-SHIFT ZERO", "SI" ) 16 : ( "" U+0010 Cc 0 "", "DATA LINK ESCAPE", "DLE" ) 17 : ( "" U+0011 Cc 0 "", "DEVICE CONTROL ONE", "DC1" ) 18 : ( "" U+0012 Cc 0 "", "DEVICE CONTROL TWO", "DC2" ) 19 : ( "" U+0013 Cc 0 "", "DEVICE CONTROL THREE", "DC3" ) 20 : ( "" U+0014 Cc 0 "", "DEVICE CONTROL FOUR", "DC4" ) 21 : ( "" U+0015 Cc 0 "", "NEGATIVE ACKNOWLEDGE", "NAK" ) 22 : ( "" U+0016 Cc 0 "", "SYNCHRONOUS IDLE", "SYN" ) 23 : ( "" U+0017 Cc 0 "", "END OF TRANSMISSION BLOCK", "ETB" ) 24 : ( "" U+0018 Cc 0 "", "CANCEL", "CAN" ) 25 : ( "" U+0019 Cc 0 "", "END OF MEDIUM", "EOM", "EM" ) 26 : ( "" U+001A Cc 0 "", "SUBSTITUTE", "SUB" ) 27 : ( "" U+001B Cc 0 "", "ESCAPE", "ESC" ) 28 : ( "" U+001C Cc 0 "", "INFORMATION SEPARATOR FOUR", "FILE SEPARATOR", "FS" ) 29 : ( "" U+001D Cc 0 "", "INFORMATION SEPARATOR THREE", "GROUP SEPARATOR", "GS" ) 30 : ( "" U+001E Cc 0 "", "INFORMATION SEPARATOR TWO", "RECORD SEPARATOR", "RS" ) 31 : ( "" U+001F Cc 0 "", "INFORMATION SEPARATOR ONE", "UNIT SEPARATOR", "US" ) 32 : ( " " U+0020 Zs 1 "SPACE", "SP" ) 33 : ( "!" 
U+0021 Po 1 "EXCLAMATION MARK" ) 34 : ( """ U+0022 Po 1 "QUOTATION MARK" ) 35 : ( "#" U+0023 Po 1 "NUMBER SIGN" ) 36 : ( "$" U+0024 Sc 1 "DOLLAR SIGN" ) 37 : ( "%" U+0025 Po 1 "PERCENT SIGN" ) 38 : ( "&" U+0026 Po 1 "AMPERSAND" ) 39 : ( "'" U+0027 Po 1 "APOSTROPHE" ) 40 : ( "(" U+0028 Ps 1 "LEFT PARENTHESIS" ) 41 : ( ")" U+0029 Pe 1 "RIGHT PARENTHESIS" ) 42 : ( "*" U+002A Po 1 "ASTERISK" ) 43 : ( "+" U+002B Sm 1 "PLUS SIGN" ) 44 : ( "," U+002C Po 1 "COMMA" ) 45 : ( "-" U+002D Pd 1 "HYPHEN-MINUS" ) 46 : ( "." U+002E Po 1 "FULL STOP" ) 47 : ( "/" U+002F Po 1 "SOLIDUS" ) 48 : ( "0" U+0030 Nd 1 "DIGIT ZERO" ) 49 : ( "1" U+0031 Nd 1 "DIGIT ONE" ) 50 : ( "2" U+0032 Nd 1 "DIGIT TWO" ) 51 : ( "3" U+0033 Nd 1 "DIGIT THREE" ) 52 : ( "4" U+0034 Nd 1 "DIGIT FOUR" ) 53 : ( "5" U+0035 Nd 1 "DIGIT FIVE" ) 54 : ( "6" U+0036 Nd 1 "DIGIT SIX" ) 55 : ( "7" U+0037 Nd 1 "DIGIT SEVEN" ) 56 : ( "8" U+0038 Nd 1 "DIGIT EIGHT" ) 57 : ( "9" U+0039 Nd 1 "DIGIT NINE" ) 58 : ( ":" U+003A Po 1 "COLON" ) 59 : ( ";" U+003B Po 1 "SEMICOLON" ) 60 : ( "<" U+003C Sm 1 "LESS-THAN SIGN" ) 61 : ( "=" U+003D Sm 1 "EQUALS SIGN" ) 62 : ( ">" U+003E Sm 1 "GREATER-THAN SIGN" ) 63 : ( "?" 
U+003F Po 1 "QUESTION MARK" ) 64 : ( "@" U+0040 Po 1 "COMMERCIAL AT" ) 65 : ( "A" U+0041 Lu 1 "LATIN CAPITAL LETTER A" ) 66 : ( "B" U+0042 Lu 1 "LATIN CAPITAL LETTER B" ) 67 : ( "C" U+0043 Lu 1 "LATIN CAPITAL LETTER C" ) 68 : ( "D" U+0044 Lu 1 "LATIN CAPITAL LETTER D" ) 69 : ( "E" U+0045 Lu 1 "LATIN CAPITAL LETTER E" ) 70 : ( "F" U+0046 Lu 1 "LATIN CAPITAL LETTER F" ) 71 : ( "G" U+0047 Lu 1 "LATIN CAPITAL LETTER G" ) 72 : ( "H" U+0048 Lu 1 "LATIN CAPITAL LETTER H" ) 73 : ( "I" U+0049 Lu 1 "LATIN CAPITAL LETTER I" ) 74 : ( "J" U+004A Lu 1 "LATIN CAPITAL LETTER J" ) 75 : ( "K" U+004B Lu 1 "LATIN CAPITAL LETTER K" ) 76 : ( "L" U+004C Lu 1 "LATIN CAPITAL LETTER L" ) 77 : ( "M" U+004D Lu 1 "LATIN CAPITAL LETTER M" ) 78 : ( "N" U+004E Lu 1 "LATIN CAPITAL LETTER N" ) 79 : ( "O" U+004F Lu 1 "LATIN CAPITAL LETTER O" ) 80 : ( "P" U+0050 Lu 1 "LATIN CAPITAL LETTER P" ) 81 : ( "Q" U+0051 Lu 1 "LATIN CAPITAL LETTER Q" ) 82 : ( "R" U+0052 Lu 1 "LATIN CAPITAL LETTER R" ) 83 : ( "S" U+0053 Lu 1 "LATIN CAPITAL LETTER S" ) 84 : ( "T" U+0054 Lu 1 "LATIN CAPITAL LETTER T" ) 85 : ( "U" U+0055 Lu 1 "LATIN CAPITAL LETTER U" ) 86 : ( "V" U+0056 Lu 1 "LATIN CAPITAL LETTER V" ) 87 : ( "W" U+0057 Lu 1 "LATIN CAPITAL LETTER W" ) 88 : ( "X" U+0058 Lu 1 "LATIN CAPITAL LETTER X" ) 89 : ( "Y" U+0059 Lu 1 "LATIN CAPITAL LETTER Y" ) 90 : ( "Z" U+005A Lu 1 "LATIN CAPITAL LETTER Z" ) 91 : ( "[" U+005B Ps 1 "LEFT SQUARE BRACKET" ) 92 : ( "\" U+005C Po 1 "REVERSE SOLIDUS" ) 93 : ( "]" U+005D Pe 1 "RIGHT SQUARE BRACKET" ) 94 : ( "^" U+005E Sk 1 "CIRCUMFLEX ACCENT" ) 95 : ( "_" U+005F Pc 1 "LOW LINE" ) 96 : ( "`" U+0060 Sk 1 "GRAVE ACCENT" ) 97 : ( "a" U+0061 Ll 1 "LATIN SMALL LETTER A" ) 98 : ( "b" U+0062 Ll 1 "LATIN SMALL LETTER B" ) 99 : ( "c" U+0063 Ll 1 "LATIN SMALL LETTER C" ) 100 : ( "d" U+0064 Ll 1 "LATIN SMALL LETTER D" ) 101 : ( "e" U+0065 Ll 1 "LATIN SMALL LETTER E" ) 102 : ( "f" U+0066 Ll 1 "LATIN SMALL LETTER F" ) 103 : ( "g" U+0067 Ll 1 "LATIN SMALL LETTER G" ) 104 : ( "h" U+0068 Ll 1 
"LATIN SMALL LETTER H" ) 105 : ( "i" U+0069 Ll 1 "LATIN SMALL LETTER I" ) 106 : ( "j" U+006A Ll 1 "LATIN SMALL LETTER J" ) 107 : ( "k" U+006B Ll 1 "LATIN SMALL LETTER K" ) 108 : ( "l" U+006C Ll 1 "LATIN SMALL LETTER L" ) 109 : ( "m" U+006D Ll 1 "LATIN SMALL LETTER M" ) 110 : ( "n" U+006E Ll 1 "LATIN SMALL LETTER N" ) 111 : ( "o" U+006F Ll 1 "LATIN SMALL LETTER O" ) 112 : ( "p" U+0070 Ll 1 "LATIN SMALL LETTER P" ) 113 : ( "q" U+0071 Ll 1 "LATIN SMALL LETTER Q" ) 114 : ( "r" U+0072 Ll 1 "LATIN SMALL LETTER R" ) 115 : ( "s" U+0073 Ll 1 "LATIN SMALL LETTER S" ) 116 : ( "t" U+0074 Ll 1 "LATIN SMALL LETTER T" ) 117 : ( "u" U+0075 Ll 1 "LATIN SMALL LETTER U" ) 118 : ( "v" U+0076 Ll 1 "LATIN SMALL LETTER V" ) 119 : ( "w" U+0077 Ll 1 "LATIN SMALL LETTER W" ) 120 : ( "x" U+0078 Ll 1 "LATIN SMALL LETTER X" ) 121 : ( "y" U+0079 Ll 1 "LATIN SMALL LETTER Y" ) 122 : ( "z" U+007A Ll 1 "LATIN SMALL LETTER Z" ) 123 : ( "{" U+007B Ps 1 "LEFT CURLY BRACKET" ) 124 : ( "|" U+007C Sm 1 "VERTICAL LINE" ) 125 : ( "}" U+007D Pe 1 "RIGHT CURLY BRACKET" ) 126 : ( "~" U+007E Sm 1 "TILDE" ) 127 : ( "" U+007F Cc 0 "", "DELETE", "DEL" ) 128 : ( "" U+0080 Cc 0 "", "PADDING CHARACTER", "PAD" ) 129 : ( "" U+0081 Cc 0 "", "HIGH OCTET PRESET", "HOP" ) 130 : ( "" U+0082 Cc 0 "", "BREAK PERMITTED HERE", "BPH" ) 131 : ( "" U+0083 Cc 0 "", "NO BREAK HERE", "NBH" ) 132 : ( "" U+0084 Cc 0 "", "INDEX", "IND" ) 133 : ( " " U+0085 Cc 0 "", "NEXT LINE", "NEL" ) 134 : ( "" U+0086 Cc 0 "", "START OF SELECTED AREA", "SSA" ) 135 : ( "" U+0087 Cc 0 "", "END OF SELECTED AREA", "ESA" ) 136 : ( "" U+0088 Cc 0 "", "CHARACTER TABULATION SET", "HORIZONTAL TABULATION SET", "HTS" ) 137 : ( "" U+0089 Cc 0 "", "CHARACTER TABULATION WITH JUSTIFICATION", "HORIZONTAL TABULATION WITH JUSTIFICATION", "HTJ" ) 138 : ( "" U+008A Cc 0 "", "LINE TABULATION SET", "VERTICAL TABULATION SET", "VTS" ) 139 : ( "" U+008B Cc 0 "", "PARTIAL LINE FORWARD", "PARTIAL LINE DOWN", "PLD" ) 140 : ( "" U+008C Cc 0 "", "PARTIAL LINE BACKWARD", "PARTIAL 
LINE UP", "PLU" ) 141 : ( "" U+008D Cc 0 "", "REVERSE LINE FEED", "REVERSE INDEX", "RI" ) 142 : ( "" U+008E Cc 0 "", "SINGLE SHIFT TWO", "SINGLE-SHIFT-2", "SS2" ) 143 : ( "" U+008F Cc 0 "", "SINGLE SHIFT THREE", "SINGLE-SHIFT-3", "SS3" ) 144 : ( "" U+0090 Cc 0 "", "DEVICE CONTROL STRING", "DCS" ) 145 : ( "" U+0091 Cc 0 "", "PRIVATE USE ONE", "PRIVATE USE-1", "PU1" ) 146 : ( "" U+0092 Cc 0 "", "PRIVATE USE TWO", "PRIVATE USE-2", "PU2" ) 147 : ( "" U+0093 Cc 0 "", "SET TRANSMIT STATE", "STS" ) 148 : ( "" U+0094 Cc 0 "", "CANCEL CHARACTER", "CCH" ) 149 : ( "" U+0095 Cc 0 "", "MESSAGE WAITING", "MW" ) 150 : ( "" U+0096 Cc 0 "", "START OF GUARDED AREA", "START OF PROTECTED AREA", "SPA" ) 151 : ( "" U+0097 Cc 0 "", "END OF GUARDED AREA", "END OF PROTECTED AREA", "EPA" ) 152 : ( "" U+0098 Cc 0 "", "START OF STRING", "SOS" ) 153 : ( "" U+0099 Cc 0 "", "SINGLE GRAPHIC CHARACTER INTRODUCER", "SGC" ) 154 : ( "" U+009A Cc 0 "", "SINGLE CHARACTER INTRODUCER", "SCI" ) 155 : ( "" U+009B Cc 0 "", "CONTROL SEQUENCE INTRODUCER", "CSI" ) 156 : ( "" U+009C Cc 0 "", "STRING TERMINATOR", "ST" ) 157 : ( "" U+009D Cc 0 "", "OPERATING SYSTEM COMMAND", "OSC" ) 158 : ( "" U+009E Cc 0 "", "PRIVACY MESSAGE", "PM" ) 159 : ( "" U+009F Cc 0 "", "APPLICATION PROGRAM COMMAND", "APC" ) 160 : ( " " U+00A0 Zs 1 "NO-BREAK SPACE", "NBSP" ) 161 : ( "¡" U+00A1 Po 1 "INVERTED EXCLAMATION MARK" ) 162 : ( "¢" U+00A2 Sc 1 "CENT SIGN" ) 163 : ( "£" U+00A3 Sc 1 "POUND SIGN" ) 164 : ( "¤" U+00A4 Sc 1 "CURRENCY SIGN" ) 165 : ( "¥" U+00A5 Sc 1 "YEN SIGN" ) 166 : ( "¦" U+00A6 So 1 "BROKEN BAR" ) 167 : ( "§" U+00A7 Po 1 "SECTION SIGN" ) 168 : ( "¨" U+00A8 Sk 1 "DIAERESIS" ) 169 : ( "©" U+00A9 So 1 "COPYRIGHT SIGN" ) 170 : ( "ª" U+00AA Lo 1 "FEMININE ORDINAL INDICATOR" ) 171 : ( "«" U+00AB Pi 1 "LEFT-POINTING DOUBLE ANGLE QUOTATION MARK" ) 172 : ( "¬" U+00AC Sm 1 "NOT SIGN" ) 173 : ( "" U+00AD Cf 1 "SOFT HYPHEN", "SHY" ) 174 : ( "®" U+00AE So 1 "REGISTERED SIGN" ) 175 : ( "¯" U+00AF Sk 1 "MACRON" ) 176 : ( "°" U+00B0 
So 1 "DEGREE SIGN" ) 177 : ( "±" U+00B1 Sm 1 "PLUS-MINUS SIGN" ) 178 : ( "²" U+00B2 No 1 "SUPERSCRIPT TWO" ) 179 : ( "³" U+00B3 No 1 "SUPERSCRIPT THREE" ) 180 : ( "´" U+00B4 Sk 1 "ACUTE ACCENT" ) 181 : ( "µ" U+00B5 Ll 1 "MICRO SIGN" ) 182 : ( "¶" U+00B6 Po 1 "PILCROW SIGN" ) 183 : ( "·" U+00B7 Po 1 "MIDDLE DOT" ) 184 : ( "¸" U+00B8 Sk 1 "CEDILLA" ) 185 : ( "¹" U+00B9 No 1 "SUPERSCRIPT ONE" ) 186 : ( "º" U+00BA Lo 1 "MASCULINE ORDINAL INDICATOR" ) 187 : ( "»" U+00BB Pf 1 "RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK" ) 188 : ( "¼" U+00BC No 1 "VULGAR FRACTION ONE QUARTER" ) 189 : ( "½" U+00BD No 1 "VULGAR FRACTION ONE HALF" ) 190 : ( "¾" U+00BE No 1 "VULGAR FRACTION THREE QUARTERS" ) 191 : ( "¿" U+00BF Po 1 "INVERTED QUESTION MARK" ) 192 : ( "À" U+00C0 Lu 1 "LATIN CAPITAL LETTER A WITH GRAVE" ) 193 : ( "Á" U+00C1 Lu 1 "LATIN CAPITAL LETTER A WITH ACUTE" ) 194 : ( "Â" U+00C2 Lu 1 "LATIN CAPITAL LETTER A WITH CIRCUMFLEX" ) 195 : ( "Ã" U+00C3 Lu 1 "LATIN CAPITAL LETTER A WITH TILDE" ) 196 : ( "Ä" U+00C4 Lu 1 "LATIN CAPITAL LETTER A WITH DIAERESIS" ) 197 : ( "Å" U+00C5 Lu 1 "LATIN CAPITAL LETTER A WITH RING ABOVE" ) 198 : ( "Æ" U+00C6 Lu 1 "LATIN CAPITAL LETTER AE" ) 199 : ( "Ç" U+00C7 Lu 1 "LATIN CAPITAL LETTER C WITH CEDILLA" ) 200 : ( "È" U+00C8 Lu 1 "LATIN CAPITAL LETTER E WITH GRAVE" ) 201 : ( "É" U+00C9 Lu 1 "LATIN CAPITAL LETTER E WITH ACUTE" ) 202 : ( "Ê" U+00CA Lu 1 "LATIN CAPITAL LETTER E WITH CIRCUMFLEX" ) 203 : ( "Ë" U+00CB Lu 1 "LATIN CAPITAL LETTER E WITH DIAERESIS" ) 204 : ( "Ì" U+00CC Lu 1 "LATIN CAPITAL LETTER I WITH GRAVE" ) 205 : ( "Í" U+00CD Lu 1 "LATIN CAPITAL LETTER I WITH ACUTE" ) 206 : ( "Î" U+00CE Lu 1 "LATIN CAPITAL LETTER I WITH CIRCUMFLEX" ) 207 : ( "Ï" U+00CF Lu 1 "LATIN CAPITAL LETTER I WITH DIAERESIS" ) 208 : ( "Ð" U+00D0 Lu 1 "LATIN CAPITAL LETTER ETH" ) 209 : ( "Ñ" U+00D1 Lu 1 "LATIN CAPITAL LETTER N WITH TILDE" ) 210 : ( "Ò" U+00D2 Lu 1 "LATIN CAPITAL LETTER O WITH GRAVE" ) 211 : ( "Ó" U+00D3 Lu 1 "LATIN CAPITAL LETTER O WITH ACUTE" ) 212 
: ( "Ô" U+00D4 Lu 1 "LATIN CAPITAL LETTER O WITH CIRCUMFLEX" ) 213 : ( "Õ" U+00D5 Lu 1 "LATIN CAPITAL LETTER O WITH TILDE" ) 214 : ( "Ö" U+00D6 Lu 1 "LATIN CAPITAL LETTER O WITH DIAERESIS" ) 215 : ( "×" U+00D7 Sm 1 "MULTIPLICATION SIGN" ) 216 : ( "Ø" U+00D8 Lu 1 "LATIN CAPITAL LETTER O WITH STROKE" ) 217 : ( "Ù" U+00D9 Lu 1 "LATIN CAPITAL LETTER U WITH GRAVE" ) 218 : ( "Ú" U+00DA Lu 1 "LATIN CAPITAL LETTER U WITH ACUTE" ) 219 : ( "Û" U+00DB Lu 1 "LATIN CAPITAL LETTER U WITH CIRCUMFLEX" ) 220 : ( "Ü" U+00DC Lu 1 "LATIN CAPITAL LETTER U WITH DIAERESIS" ) 221 : ( "Ý" U+00DD Lu 1 "LATIN CAPITAL LETTER Y WITH ACUTE" ) 222 : ( "Þ" U+00DE Lu 1 "LATIN CAPITAL LETTER THORN" ) 223 : ( "ß" U+00DF Ll 1 "LATIN SMALL LETTER SHARP S" ) 224 : ( "à" U+00E0 Ll 1 "LATIN SMALL LETTER A WITH GRAVE" ) 225 : ( "á" U+00E1 Ll 1 "LATIN SMALL LETTER A WITH ACUTE" ) 226 : ( "â" U+00E2 Ll 1 "LATIN SMALL LETTER A WITH CIRCUMFLEX" ) 227 : ( "ã" U+00E3 Ll 1 "LATIN SMALL LETTER A WITH TILDE" ) 228 : ( "ä" U+00E4 Ll 1 "LATIN SMALL LETTER A WITH DIAERESIS" ) 229 : ( "å" U+00E5 Ll 1 "LATIN SMALL LETTER A WITH RING ABOVE" ) 230 : ( "æ" U+00E6 Ll 1 "LATIN SMALL LETTER AE" ) 231 : ( "ç" U+00E7 Ll 1 "LATIN SMALL LETTER C WITH CEDILLA" ) 232 : ( "è" U+00E8 Ll 1 "LATIN SMALL LETTER E WITH GRAVE" ) 233 : ( "é" U+00E9 Ll 1 "LATIN SMALL LETTER E WITH ACUTE" ) 234 : ( "ê" U+00EA Ll 1 "LATIN SMALL LETTER E WITH CIRCUMFLEX" ) 235 : ( "ë" U+00EB Ll 1 "LATIN SMALL LETTER E WITH DIAERESIS" ) 236 : ( "ì" U+00EC Ll 1 "LATIN SMALL LETTER I WITH GRAVE" ) 237 : ( "í" U+00ED Ll 1 "LATIN SMALL LETTER I WITH ACUTE" ) 238 : ( "î" U+00EE Ll 1 "LATIN SMALL LETTER I WITH CIRCUMFLEX" ) 239 : ( "ï" U+00EF Ll 1 "LATIN SMALL LETTER I WITH DIAERESIS" ) 240 : ( "ð" U+00F0 Ll 1 "LATIN SMALL LETTER ETH" ) 241 : ( "ñ" U+00F1 Ll 1 "LATIN SMALL LETTER N WITH TILDE" ) 242 : ( "ò" U+00F2 Ll 1 "LATIN SMALL LETTER O WITH GRAVE" ) 243 : ( "ó" U+00F3 Ll 1 "LATIN SMALL LETTER O WITH ACUTE" ) 244 : ( "ô" U+00F4 Ll 1 "LATIN SMALL LETTER O WITH 
CIRCUMFLEX" ) 245 : ( "õ" U+00F5 Ll 1 "LATIN SMALL LETTER O WITH TILDE" ) 246 : ( "ö" U+00F6 Ll 1 "LATIN SMALL LETTER O WITH DIAERESIS" ) 247 : ( "÷" U+00F7 Sm 1 "DIVISION SIGN" ) 248 : ( "ø" U+00F8 Ll 1 "LATIN SMALL LETTER O WITH STROKE" ) 249 : ( "ù" U+00F9 Ll 1 "LATIN SMALL LETTER U WITH GRAVE" ) 250 : ( "ú" U+00FA Ll 1 "LATIN SMALL LETTER U WITH ACUTE" ) 251 : ( "û" U+00FB Ll 1 "LATIN SMALL LETTER U WITH CIRCUMFLEX" ) 252 : ( "ü" U+00FC Ll 1 "LATIN SMALL LETTER U WITH DIAERESIS" ) 253 : ( "ý" U+00FD Ll 1 "LATIN SMALL LETTER Y WITH ACUTE" ) 254 : ( "þ" U+00FE Ll 1 "LATIN SMALL LETTER THORN" ) 255 : ( "ÿ" U+00FF Ll 1 "LATIN SMALL LETTER Y WITH DIAERESIS" ) 256 : ( "Ā" U+0100 Lu 1 "LATIN CAPITAL LETTER A WITH MACRON" ) 257 : ( "ā" U+0101 Ll 1 "LATIN SMALL LETTER A WITH MACRON" ) 258 : ( "Ă" U+0102 Lu 1 "LATIN CAPITAL LETTER A WITH BREVE" ) 259 : ( "ă" U+0103 Ll 1 "LATIN SMALL LETTER A WITH BREVE" ) 260 : ( "Ą" U+0104 Lu 1 "LATIN CAPITAL LETTER A WITH OGONEK" ) 261 : ( "ą" U+0105 Ll 1 "LATIN SMALL LETTER A WITH OGONEK" ) 262 : ( "Ć" U+0106 Lu 1 "LATIN CAPITAL LETTER C WITH ACUTE" ) 263 : ( "ć" U+0107 Ll 1 "LATIN SMALL LETTER C WITH ACUTE" ) 264 : ( "Ĉ" U+0108 Lu 1 "LATIN CAPITAL LETTER C WITH CIRCUMFLEX" ) 265 : ( "ĉ" U+0109 Ll 1 "LATIN SMALL LETTER C WITH CIRCUMFLEX" ) 266 : ( "Ċ" U+010A Lu 1 "LATIN CAPITAL LETTER C WITH DOT ABOVE" ) 267 : ( "ċ" U+010B Ll 1 "LATIN SMALL LETTER C WITH DOT ABOVE" ) 268 : ( "Č" U+010C Lu 1 "LATIN CAPITAL LETTER C WITH CARON" ) 269 : ( "č" U+010D Ll 1 "LATIN SMALL LETTER C WITH CARON" ) 270 : ( "Ď" U+010E Lu 1 "LATIN CAPITAL LETTER D WITH CARON" ) 271 : ( "ď" U+010F Ll 1 "LATIN SMALL LETTER D WITH CARON" ) 272 : ( "Đ" U+0110 Lu 1 "LATIN CAPITAL LETTER D WITH STROKE" ) 273 : ( "đ" U+0111 Ll 1 "LATIN SMALL LETTER D WITH STROKE" ) 274 : ( "Ē" U+0112 Lu 1 "LATIN CAPITAL LETTER E WITH MACRON" ) 275 : ( "ē" U+0113 Ll 1 "LATIN SMALL LETTER E WITH MACRON" ) 276 : ( "Ĕ" U+0114 Lu 1 "LATIN CAPITAL LETTER E WITH BREVE" ) 277 : ( "ĕ" U+0115 Ll 1 
"LATIN SMALL LETTER E WITH BREVE" ) 278 : ( "Ė" U+0116 Lu 1 "LATIN CAPITAL LETTER E WITH DOT ABOVE" ) 279 : ( "ė" U+0117 Ll 1 "LATIN SMALL LETTER E WITH DOT ABOVE" ) 280 : ( "Ę" U+0118 Lu 1 "LATIN CAPITAL LETTER E WITH OGONEK" ) 281 : ( "ę" U+0119 Ll 1 "LATIN SMALL LETTER E WITH OGONEK" ) 282 : ( "Ě" U+011A Lu 1 "LATIN CAPITAL LETTER E WITH CARON" ) 283 : ( "ě" U+011B Ll 1 "LATIN SMALL LETTER E WITH CARON" ) 284 : ( "Ĝ" U+011C Lu 1 "LATIN CAPITAL LETTER G WITH CIRCUMFLEX" ) 285 : ( "ĝ" U+011D Ll 1 "LATIN SMALL LETTER G WITH CIRCUMFLEX" ) 286 : ( "Ğ" U+011E Lu 1 "LATIN CAPITAL LETTER G WITH BREVE" ) 287 : ( "ğ" U+011F Ll 1 "LATIN SMALL LETTER G WITH BREVE" ) 288 : ( "Ġ" U+0120 Lu 1 "LATIN CAPITAL LETTER G WITH DOT ABOVE" ) 289 : ( "ġ" U+0121 Ll 1 "LATIN SMALL LETTER G WITH DOT ABOVE" ) 290 : ( "Ģ" U+0122 Lu 1 "LATIN CAPITAL LETTER G WITH CEDILLA" ) 291 : ( "ģ" U+0123 Ll 1 "LATIN SMALL LETTER G WITH CEDILLA" ) 292 : ( "Ĥ" U+0124 Lu 1 "LATIN CAPITAL LETTER H WITH CIRCUMFLEX" ) 293 : ( "ĥ" U+0125 Ll 1 "LATIN SMALL LETTER H WITH CIRCUMFLEX" ) 294 : ( "Ħ" U+0126 Lu 1 "LATIN CAPITAL LETTER H WITH STROKE" ) 295 : ( "ħ" U+0127 Ll 1 "LATIN SMALL LETTER H WITH STROKE" ) 296 : ( "Ĩ" U+0128 Lu 1 "LATIN CAPITAL LETTER I WITH TILDE" ) 297 : ( "ĩ" U+0129 Ll 1 "LATIN SMALL LETTER I WITH TILDE" ) 298 : ( "Ī" U+012A Lu 1 "LATIN CAPITAL LETTER I WITH MACRON" ) 299 : ( "ī" U+012B Ll 1 "LATIN SMALL LETTER I WITH MACRON" ) 300 : ( "Ĭ" U+012C Lu 1 "LATIN CAPITAL LETTER I WITH BREVE" ) 301 : ( "ĭ" U+012D Ll 1 "LATIN SMALL LETTER I WITH BREVE" ) 302 : ( "Į" U+012E Lu 1 "LATIN CAPITAL LETTER I WITH OGONEK" ) 303 : ( "į" U+012F Ll 1 "LATIN SMALL LETTER I WITH OGONEK" ) 304 : ( "İ" U+0130 Lu 1 "LATIN CAPITAL LETTER I WITH DOT ABOVE" ) 305 : ( "ı" U+0131 Ll 1 "LATIN SMALL LETTER DOTLESS I" ) 306 : ( "IJ" U+0132 Lu 1 "LATIN CAPITAL LIGATURE IJ" ) 307 : ( "ij" U+0133 Ll 1 "LATIN SMALL LIGATURE IJ" ) 308 : ( "Ĵ" U+0134 Lu 1 "LATIN CAPITAL LETTER J WITH CIRCUMFLEX" ) 309 : ( "ĵ" U+0135 Ll 1 "LATIN 
SMALL LETTER J WITH CIRCUMFLEX" ) 310 : ( "Ķ" U+0136 Lu 1 "LATIN CAPITAL LETTER K WITH CEDILLA" ) 311 : ( "ķ" U+0137 Ll 1 "LATIN SMALL LETTER K WITH CEDILLA" ) 312 : ( "ĸ" U+0138 Ll 1 "LATIN SMALL LETTER KRA" ) 313 : ( "Ĺ" U+0139 Lu 1 "LATIN CAPITAL LETTER L WITH ACUTE" ) 314 : ( "ĺ" U+013A Ll 1 "LATIN SMALL LETTER L WITH ACUTE" ) 315 : ( "Ļ" U+013B Lu 1 "LATIN CAPITAL LETTER L WITH CEDILLA" ) 316 : ( "ļ" U+013C Ll 1 "LATIN SMALL LETTER L WITH CEDILLA" ) 317 : ( "Ľ" U+013D Lu 1 "LATIN CAPITAL LETTER L WITH CARON" ) 318 : ( "ľ" U+013E Ll 1 "LATIN SMALL LETTER L WITH CARON" ) 319 : ( "Ŀ" U+013F Lu 1 "LATIN CAPITAL LETTER L WITH MIDDLE DOT" ) 320 : ( "ŀ" U+0140 Ll 1 "LATIN SMALL LETTER L WITH MIDDLE DOT" ) 321 : ( "Ł" U+0141 Lu 1 "LATIN CAPITAL LETTER L WITH STROKE" ) 322 : ( "ł" U+0142 Ll 1 "LATIN SMALL LETTER L WITH STROKE" ) 323 : ( "Ń" U+0143 Lu 1 "LATIN CAPITAL LETTER N WITH ACUTE" ) 324 : ( "ń" U+0144 Ll 1 "LATIN SMALL LETTER N WITH ACUTE" ) 325 : ( "Ņ" U+0145 Lu 1 "LATIN CAPITAL LETTER N WITH CEDILLA" ) 326 : ( "ņ" U+0146 Ll 1 "LATIN SMALL LETTER N WITH CEDILLA" ) 327 : ( "Ň" U+0147 Lu 1 "LATIN CAPITAL LETTER N WITH CARON" ) 328 : ( "ň" U+0148 Ll 1 "LATIN SMALL LETTER N WITH CARON" ) 329 : ( "ʼn" U+0149 Ll 1 "LATIN SMALL LETTER N PRECEDED BY APOSTROPHE" ) 330 : ( "Ŋ" U+014A Lu 1 "LATIN CAPITAL LETTER ENG" ) 331 : ( "ŋ" U+014B Ll 1 "LATIN SMALL LETTER ENG" ) 332 : ( "Ō" U+014C Lu 1 "LATIN CAPITAL LETTER O WITH MACRON" ) 333 : ( "ō" U+014D Ll 1 "LATIN SMALL LETTER O WITH MACRON" ) 334 : ( "Ŏ" U+014E Lu 1 "LATIN CAPITAL LETTER O WITH BREVE" ) 335 : ( "ŏ" U+014F Ll 1 "LATIN SMALL LETTER O WITH BREVE" ) 336 : ( "Ő" U+0150 Lu 1 "LATIN CAPITAL LETTER O WITH DOUBLE ACUTE" ) 337 : ( "ő" U+0151 Ll 1 "LATIN SMALL LETTER O WITH DOUBLE ACUTE" ) 338 : ( "Œ" U+0152 Lu 1 "LATIN CAPITAL LIGATURE OE" ) 339 : ( "œ" U+0153 Ll 1 "LATIN SMALL LIGATURE OE" ) 340 : ( "Ŕ" U+0154 Lu 1 "LATIN CAPITAL LETTER R WITH ACUTE" ) 341 : ( "ŕ" U+0155 Ll 1 "LATIN SMALL LETTER R WITH ACUTE" ) 342 : 
( "Ŗ" U+0156 Lu 1 "LATIN CAPITAL LETTER R WITH CEDILLA" ) 343 : ( "ŗ" U+0157 Ll 1 "LATIN SMALL LETTER R WITH CEDILLA" ) 344 : ( "Ř" U+0158 Lu 1 "LATIN CAPITAL LETTER R WITH CARON" ) 345 : ( "ř" U+0159 Ll 1 "LATIN SMALL LETTER R WITH CARON" ) 346 : ( "Ś" U+015A Lu 1 "LATIN CAPITAL LETTER S WITH ACUTE" ) 347 : ( "ś" U+015B Ll 1 "LATIN SMALL LETTER S WITH ACUTE" ) 348 : ( "Ŝ" U+015C Lu 1 "LATIN CAPITAL LETTER S WITH CIRCUMFLEX" ) 349 : ( "ŝ" U+015D Ll 1 "LATIN SMALL LETTER S WITH CIRCUMFLEX" ) 350 : ( "Ş" U+015E Lu 1 "LATIN CAPITAL LETTER S WITH CEDILLA" ) 351 : ( "ş" U+015F Ll 1 "LATIN SMALL LETTER S WITH CEDILLA" ) 352 : ( "Š" U+0160 Lu 1 "LATIN CAPITAL LETTER S WITH CARON" ) 353 : ( "š" U+0161 Ll 1 "LATIN SMALL LETTER S WITH CARON" ) 354 : ( "Ţ" U+0162 Lu 1 "LATIN CAPITAL LETTER T WITH CEDILLA" ) 355 : ( "ţ" U+0163 Ll 1 "LATIN SMALL LETTER T WITH CEDILLA" ) 356 : ( "Ť" U+0164 Lu 1 "LATIN CAPITAL LETTER T WITH CARON" ) 357 : ( "ť" U+0165 Ll 1 "LATIN SMALL LETTER T WITH CARON" ) 358 : ( "Ŧ" U+0166 Lu 1 "LATIN CAPITAL LETTER T WITH STROKE" ) 359 : ( "ŧ" U+0167 Ll 1 "LATIN SMALL LETTER T WITH STROKE" ) 360 : ( "Ũ" U+0168 Lu 1 "LATIN CAPITAL LETTER U WITH TILDE" ) 361 : ( "ũ" U+0169 Ll 1 "LATIN SMALL LETTER U WITH TILDE" ) 362 : ( "Ū" U+016A Lu 1 "LATIN CAPITAL LETTER U WITH MACRON" ) 363 : ( "ū" U+016B Ll 1 "LATIN SMALL LETTER U WITH MACRON" ) 364 : ( "Ŭ" U+016C Lu 1 "LATIN CAPITAL LETTER U WITH BREVE" ) 365 : ( "ŭ" U+016D Ll 1 "LATIN SMALL LETTER U WITH BREVE" ) 366 : ( "Ů" U+016E Lu 1 "LATIN CAPITAL LETTER U WITH RING ABOVE" ) 367 : ( "ů" U+016F Ll 1 "LATIN SMALL LETTER U WITH RING ABOVE" ) 368 : ( "Ű" U+0170 Lu 1 "LATIN CAPITAL LETTER U WITH DOUBLE ACUTE" ) 369 : ( "ű" U+0171 Ll 1 "LATIN SMALL LETTER U WITH DOUBLE ACUTE" ) 370 : ( "Ų" U+0172 Lu 1 "LATIN CAPITAL LETTER U WITH OGONEK" ) 371 : ( "ų" U+0173 Ll 1 "LATIN SMALL LETTER U WITH OGONEK" ) 372 : ( "Ŵ" U+0174 Lu 1 "LATIN CAPITAL LETTER W WITH CIRCUMFLEX" ) 373 : ( "ŵ" U+0175 Ll 1 "LATIN SMALL LETTER W WITH 
CIRCUMFLEX" ) 374 : ( "Ŷ" U+0176 Lu 1 "LATIN CAPITAL LETTER Y WITH CIRCUMFLEX" ) 375 : ( "ŷ" U+0177 Ll 1 "LATIN SMALL LETTER Y WITH CIRCUMFLEX" ) 376 : ( "Ÿ" U+0178 Lu 1 "LATIN CAPITAL LETTER Y WITH DIAERESIS" ) 377 : ( "Ź" U+0179 Lu 1 "LATIN CAPITAL LETTER Z WITH ACUTE" ) 378 : ( "ź" U+017A Ll 1 "LATIN SMALL LETTER Z WITH ACUTE" ) 379 : ( "Ż" U+017B Lu 1 "LATIN CAPITAL LETTER Z WITH DOT ABOVE" ) 380 : ( "ż" U+017C Ll 1 "LATIN SMALL LETTER Z WITH DOT ABOVE" ) 381 : ( "Ž" U+017D Lu 1 "LATIN CAPITAL LETTER Z WITH CARON" ) 382 : ( "ž" U+017E Ll 1 "LATIN SMALL LETTER Z WITH CARON" ) 383 : ( "ſ" U+017F Ll 1 "LATIN SMALL LETTER LONG S" ) 384 : ( "ƀ" U+0180 Ll 1 "LATIN SMALL LETTER B WITH STROKE" ) 385 : ( "Ɓ" U+0181 Lu 1 "LATIN CAPITAL LETTER B WITH HOOK" ) 386 : ( "Ƃ" U+0182 Lu 1 "LATIN CAPITAL LETTER B WITH TOPBAR" ) 387 : ( "ƃ" U+0183 Ll 1 "LATIN SMALL LETTER B WITH TOPBAR" ) 388 : ( "Ƅ" U+0184 Lu 1 "LATIN CAPITAL LETTER TONE SIX" ) 389 : ( "ƅ" U+0185 Ll 1 "LATIN SMALL LETTER TONE SIX" ) 390 : ( "Ɔ" U+0186 Lu 1 "LATIN CAPITAL LETTER OPEN O" ) 391 : ( "Ƈ" U+0187 Lu 1 "LATIN CAPITAL LETTER C WITH HOOK" ) 392 : ( "ƈ" U+0188 Ll 1 "LATIN SMALL LETTER C WITH HOOK" ) 393 : ( "Ɖ" U+0189 Lu 1 "LATIN CAPITAL LETTER AFRICAN D" ) 394 : ( "Ɗ" U+018A Lu 1 "LATIN CAPITAL LETTER D WITH HOOK" ) 395 : ( "Ƌ" U+018B Lu 1 "LATIN CAPITAL LETTER D WITH TOPBAR" ) 396 : ( "ƌ" U+018C Ll 1 "LATIN SMALL LETTER D WITH TOPBAR" ) 397 : ( "ƍ" U+018D Ll 1 "LATIN SMALL LETTER TURNED DELTA" ) 398 : ( "Ǝ" U+018E Lu 1 "LATIN CAPITAL LETTER REVERSED E" ) 399 : ( "Ə" U+018F Lu 1 "LATIN CAPITAL LETTER SCHWA" ) 400 : ( "Ɛ" U+0190 Lu 1 "LATIN CAPITAL LETTER OPEN E" ) 401 : ( "Ƒ" U+0191 Lu 1 "LATIN CAPITAL LETTER F WITH HOOK" ) 402 : ( "ƒ" U+0192 Ll 1 "LATIN SMALL LETTER F WITH HOOK" ) 403 : ( "Ɠ" U+0193 Lu 1 "LATIN CAPITAL LETTER G WITH HOOK" ) 404 : ( "Ɣ" U+0194 Lu 1 "LATIN CAPITAL LETTER GAMMA" ) 405 : ( "ƕ" U+0195 Ll 1 "LATIN SMALL LETTER HV" ) 406 : ( "Ɩ" U+0196 Lu 1 "LATIN CAPITAL LETTER IOTA" ) 407 : ( 
"Ɨ" U+0197 Lu 1 "LATIN CAPITAL LETTER I WITH STROKE" ) 408 : ( "Ƙ" U+0198 Lu 1 "LATIN CAPITAL LETTER K WITH HOOK" ) 409 : ( "ƙ" U+0199 Ll 1 "LATIN SMALL LETTER K WITH HOOK" ) 410 : ( "ƚ" U+019A Ll 1 "LATIN SMALL LETTER L WITH BAR" ) 411 : ( "ƛ" U+019B Ll 1 "LATIN SMALL LETTER LAMBDA WITH STROKE" ) 412 : ( "Ɯ" U+019C Lu 1 "LATIN CAPITAL LETTER TURNED M" ) 413 : ( "Ɲ" U+019D Lu 1 "LATIN CAPITAL LETTER N WITH LEFT HOOK" ) 414 : ( "ƞ" U+019E Ll 1 "LATIN SMALL LETTER N WITH LONG RIGHT LEG" ) 415 : ( "Ɵ" U+019F Lu 1 "LATIN CAPITAL LETTER O WITH MIDDLE TILDE" ) 416 : ( "Ơ" U+01A0 Lu 1 "LATIN CAPITAL LETTER O WITH HORN" ) 417 : ( "ơ" U+01A1 Ll 1 "LATIN SMALL LETTER O WITH HORN" ) 418 : ( "Ƣ" U+01A2 Lu 1 "LATIN CAPITAL LETTER OI", "LATIN CAPITAL LETTER GHA" ) 419 : ( "ƣ" U+01A3 Ll 1 "LATIN SMALL LETTER OI", "LATIN SMALL LETTER GHA" ) 420 : ( "Ƥ" U+01A4 Lu 1 "LATIN CAPITAL LETTER P WITH HOOK" ) 421 : ( "ƥ" U+01A5 Ll 1 "LATIN SMALL LETTER P WITH HOOK" ) 422 : ( "Ʀ" U+01A6 Lu 1 "LATIN LETTER YR" ) 423 : ( "Ƨ" U+01A7 Lu 1 "LATIN CAPITAL LETTER TONE TWO" ) 424 : ( "ƨ" U+01A8 Ll 1 "LATIN SMALL LETTER TONE TWO" ) 425 : ( "Ʃ" U+01A9 Lu 1 "LATIN CAPITAL LETTER ESH" ) 426 : ( "ƪ" U+01AA Ll 1 "LATIN LETTER REVERSED ESH LOOP" ) 427 : ( "ƫ" U+01AB Ll 1 "LATIN SMALL LETTER T WITH PALATAL HOOK" ) 428 : ( "Ƭ" U+01AC Lu 1 "LATIN CAPITAL LETTER T WITH HOOK" ) 429 : ( "ƭ" U+01AD Ll 1 "LATIN SMALL LETTER T WITH HOOK" ) 430 : ( "Ʈ" U+01AE Lu 1 "LATIN CAPITAL LETTER T WITH RETROFLEX HOOK" ) 431 : ( "Ư" U+01AF Lu 1 "LATIN CAPITAL LETTER U WITH HORN" ) 432 : ( "ư" U+01B0 Ll 1 "LATIN SMALL LETTER U WITH HORN" ) 433 : ( "Ʊ" U+01B1 Lu 1 "LATIN CAPITAL LETTER UPSILON" ) 434 : ( "Ʋ" U+01B2 Lu 1 "LATIN CAPITAL LETTER V WITH HOOK" ) 435 : ( "Ƴ" U+01B3 Lu 1 "LATIN CAPITAL LETTER Y WITH HOOK" ) 436 : ( "ƴ" U+01B4 Ll 1 "LATIN SMALL LETTER Y WITH HOOK" ) 437 : ( "Ƶ" U+01B5 Lu 1 "LATIN CAPITAL LETTER Z WITH STROKE" ) 438 : ( "ƶ" U+01B6 Ll 1 "LATIN SMALL LETTER Z WITH STROKE" ) 439 : ( "Ʒ" U+01B7 Lu 1 "LATIN 
CAPITAL LETTER EZH" ) 440 : ( "Ƹ" U+01B8 Lu 1 "LATIN CAPITAL LETTER EZH REVERSED" ) 441 : ( "ƹ" U+01B9 Ll 1 "LATIN SMALL LETTER EZH REVERSED" ) 442 : ( "ƺ" U+01BA Ll 1 "LATIN SMALL LETTER EZH WITH TAIL" ) 443 : ( "ƻ" U+01BB Lo 1 "LATIN LETTER TWO WITH STROKE" ) 444 : ( "Ƽ" U+01BC Lu 1 "LATIN CAPITAL LETTER TONE FIVE" ) 445 : ( "ƽ" U+01BD Ll 1 "LATIN SMALL LETTER TONE FIVE" ) 446 : ( "ƾ" U+01BE Ll 1 "LATIN LETTER INVERTED GLOTTAL STOP WITH STROKE" ) 447 : ( "ƿ" U+01BF Ll 1 "LATIN LETTER WYNN" ) 448 : ( "ǀ" U+01C0 Lo 1 "LATIN LETTER DENTAL CLICK" ) 449 : ( "ǁ" U+01C1 Lo 1 "LATIN LETTER LATERAL CLICK" ) 450 : ( "ǂ" U+01C2 Lo 1 "LATIN LETTER ALVEOLAR CLICK" ) 451 : ( "ǃ" U+01C3 Lo 1 "LATIN LETTER RETROFLEX CLICK" ) 452 : ( "DŽ" U+01C4 Lu 1 "LATIN CAPITAL LETTER DZ WITH CARON" ) 453 : ( "Dž" U+01C5 Lt 1 "LATIN CAPITAL LETTER D WITH SMALL LETTER Z WITH CARON" ) 454 : ( "dž" U+01C6 Ll 1 "LATIN SMALL LETTER DZ WITH CARON" ) 455 : ( "LJ" U+01C7 Lu 1 "LATIN CAPITAL LETTER LJ" ) 456 : ( "Lj" U+01C8 Lt 1 "LATIN CAPITAL LETTER L WITH SMALL LETTER J" ) 457 : ( "lj" U+01C9 Ll 1 "LATIN SMALL LETTER LJ" ) 458 : ( "NJ" U+01CA Lu 1 "LATIN CAPITAL LETTER NJ" ) 459 : ( "Nj" U+01CB Lt 1 "LATIN CAPITAL LETTER N WITH SMALL LETTER J" ) 460 : ( "nj" U+01CC Ll 1 "LATIN SMALL LETTER NJ" ) 461 : ( "Ǎ" U+01CD Lu 1 "LATIN CAPITAL LETTER A WITH CARON" ) 462 : ( "ǎ" U+01CE Ll 1 "LATIN SMALL LETTER A WITH CARON" ) 463 : ( "Ǐ" U+01CF Lu 1 "LATIN CAPITAL LETTER I WITH CARON" ) 464 : ( "ǐ" U+01D0 Ll 1 "LATIN SMALL LETTER I WITH CARON" ) 465 : ( "Ǒ" U+01D1 Lu 1 "LATIN CAPITAL LETTER O WITH CARON" ) 466 : ( "ǒ" U+01D2 Ll 1 "LATIN SMALL LETTER O WITH CARON" ) 467 : ( "Ǔ" U+01D3 Lu 1 "LATIN CAPITAL LETTER U WITH CARON" ) 468 : ( "ǔ" U+01D4 Ll 1 "LATIN SMALL LETTER U WITH CARON" ) 469 : ( "Ǖ" U+01D5 Lu 1 "LATIN CAPITAL LETTER U WITH DIAERESIS AND MACRON" ) 470 : ( "ǖ" U+01D6 Ll 1 "LATIN SMALL LETTER U WITH DIAERESIS AND MACRON" ) 471 : ( "Ǘ" U+01D7 Lu 1 "LATIN CAPITAL LETTER U WITH DIAERESIS AND ACUTE" ) 
472 : ( "ǘ" U+01D8 Ll 1 "LATIN SMALL LETTER U WITH DIAERESIS AND ACUTE" ) 473 : ( "Ǚ" U+01D9 Lu 1 "LATIN CAPITAL LETTER U WITH DIAERESIS AND CARON" ) 474 : ( "ǚ" U+01DA Ll 1 "LATIN SMALL LETTER U WITH DIAERESIS AND CARON" ) 475 : ( "Ǜ" U+01DB Lu 1 "LATIN CAPITAL LETTER U WITH DIAERESIS AND GRAVE" ) 476 : ( "ǜ" U+01DC Ll 1 "LATIN SMALL LETTER U WITH DIAERESIS AND GRAVE" ) 477 : ( "ǝ" U+01DD Ll 1 "LATIN SMALL LETTER TURNED E" ) 478 : ( "Ǟ" U+01DE Lu 1 "LATIN CAPITAL LETTER A WITH DIAERESIS AND MACRON" ) 479 : ( "ǟ" U+01DF Ll 1 "LATIN SMALL LETTER A WITH DIAERESIS AND MACRON" ) 480 : ( "Ǡ" U+01E0 Lu 1 "LATIN CAPITAL LETTER A WITH DOT ABOVE AND MACRON" ) 481 : ( "ǡ" U+01E1 Ll 1 "LATIN SMALL LETTER A WITH DOT ABOVE AND MACRON" ) 482 : ( "Ǣ" U+01E2 Lu 1 "LATIN CAPITAL LETTER AE WITH MACRON" ) 483 : ( "ǣ" U+01E3 Ll 1 "LATIN SMALL LETTER AE WITH MACRON" ) 484 : ( "Ǥ" U+01E4 Lu 1 "LATIN CAPITAL LETTER G WITH STROKE" ) 485 : ( "ǥ" U+01E5 Ll 1 "LATIN SMALL LETTER G WITH STROKE" ) 486 : ( "Ǧ" U+01E6 Lu 1 "LATIN CAPITAL LETTER G WITH CARON" ) 487 : ( "ǧ" U+01E7 Ll 1 "LATIN SMALL LETTER G WITH CARON" ) 488 : ( "Ǩ" U+01E8 Lu 1 "LATIN CAPITAL LETTER K WITH CARON" ) 489 : ( "ǩ" U+01E9 Ll 1 "LATIN SMALL LETTER K WITH CARON" ) 490 : ( "Ǫ" U+01EA Lu 1 "LATIN CAPITAL LETTER O WITH OGONEK" ) 491 : ( "ǫ" U+01EB Ll 1 "LATIN SMALL LETTER O WITH OGONEK" ) 492 : ( "Ǭ" U+01EC Lu 1 "LATIN CAPITAL LETTER O WITH OGONEK AND MACRON" ) 493 : ( "ǭ" U+01ED Ll 1 "LATIN SMALL LETTER O WITH OGONEK AND MACRON" ) 494 : ( "Ǯ" U+01EE Lu 1 "LATIN CAPITAL LETTER EZH WITH CARON" ) 495 : ( "ǯ" U+01EF Ll 1 "LATIN SMALL LETTER EZH WITH CARON" ) 496 : ( "ǰ" U+01F0 Ll 1 "LATIN SMALL LETTER J WITH CARON" ) 497 : ( "DZ" U+01F1 Lu 1 "LATIN CAPITAL LETTER DZ" ) 498 : ( "Dz" U+01F2 Lt 1 "LATIN CAPITAL LETTER D WITH SMALL LETTER Z" ) 499 : ( "dz" U+01F3 Ll 1 "LATIN SMALL LETTER DZ" ) 500 : ( "Ǵ" U+01F4 Lu 1 "LATIN CAPITAL LETTER G WITH ACUTE" ) 501 : ( "ǵ" U+01F5 Ll 1 "LATIN SMALL LETTER G WITH ACUTE" ) 502 : ( "Ƕ" 
U+01F6 Lu 1 "LATIN CAPITAL LETTER HWAIR" ) 503 : ( "Ƿ" U+01F7 Lu 1 "LATIN CAPITAL LETTER WYNN" ) 504 : ( "Ǹ" U+01F8 Lu 1 "LATIN CAPITAL LETTER N WITH GRAVE" ) 505 : ( "ǹ" U+01F9 Ll 1 "LATIN SMALL LETTER N WITH GRAVE" ) 506 : ( "Ǻ" U+01FA Lu 1 "LATIN CAPITAL LETTER A WITH RING ABOVE AND ACUTE" ) 507 : ( "ǻ" U+01FB Ll 1 "LATIN SMALL LETTER A WITH RING ABOVE AND ACUTE" ) 508 : ( "Ǽ" U+01FC Lu 1 "LATIN CAPITAL LETTER AE WITH ACUTE" ) 509 : ( "ǽ" U+01FD Ll 1 "LATIN SMALL LETTER AE WITH ACUTE" ) 510 : ( "Ǿ" U+01FE Lu 1 "LATIN CAPITAL LETTER O WITH STROKE AND ACUTE" ) 511 : ( "ǿ" U+01FF Ll 1 "LATIN SMALL LETTER O WITH STROKE AND ACUTE" ) 512 : ( "Ȁ" U+0200 Lu 1 "LATIN CAPITAL LETTER A WITH DOUBLE GRAVE" ) 513 : ( "ȁ" U+0201 Ll 1 "LATIN SMALL LETTER A WITH DOUBLE GRAVE" ) 514 : ( "Ȃ" U+0202 Lu 1 "LATIN CAPITAL LETTER A WITH INVERTED BREVE" ) 515 : ( "ȃ" U+0203 Ll 1 "LATIN SMALL LETTER A WITH INVERTED BREVE" ) 516 : ( "Ȅ" U+0204 Lu 1 "LATIN CAPITAL LETTER E WITH DOUBLE GRAVE" ) 517 : ( "ȅ" U+0205 Ll 1 "LATIN SMALL LETTER E WITH DOUBLE GRAVE" ) 518 : ( "Ȇ" U+0206 Lu 1 "LATIN CAPITAL LETTER E WITH INVERTED BREVE" ) 519 : ( "ȇ" U+0207 Ll 1 "LATIN SMALL LETTER E WITH INVERTED BREVE" ) 520 : ( "Ȉ" U+0208 Lu 1 "LATIN CAPITAL LETTER I WITH DOUBLE GRAVE" ) 521 : ( "ȉ" U+0209 Ll 1 "LATIN SMALL LETTER I WITH DOUBLE GRAVE" ) 522 : ( "Ȋ" U+020A Lu 1 "LATIN CAPITAL LETTER I WITH INVERTED BREVE" ) 523 : ( "ȋ" U+020B Ll 1 "LATIN SMALL LETTER I WITH INVERTED BREVE" ) 524 : ( "Ȍ" U+020C Lu 1 "LATIN CAPITAL LETTER O WITH DOUBLE GRAVE" ) 525 : ( "ȍ" U+020D Ll 1 "LATIN SMALL LETTER O WITH DOUBLE GRAVE" ) 526 : ( "Ȏ" U+020E Lu 1 "LATIN CAPITAL LETTER O WITH INVERTED BREVE" ) 527 : ( "ȏ" U+020F Ll 1 "LATIN SMALL LETTER O WITH INVERTED BREVE" ) 528 : ( "Ȑ" U+0210 Lu 1 "LATIN CAPITAL LETTER R WITH DOUBLE GRAVE" ) 529 : ( "ȑ" U+0211 Ll 1 "LATIN SMALL LETTER R WITH DOUBLE GRAVE" ) 530 : ( "Ȓ" U+0212 Lu 1 "LATIN CAPITAL LETTER R WITH INVERTED BREVE" ) 531 : ( "ȓ" U+0213 Ll 1 "LATIN SMALL LETTER R 
WITH INVERTED BREVE" ) 532 : ( "Ȕ" U+0214 Lu 1 "LATIN CAPITAL LETTER U WITH DOUBLE GRAVE" ) 533 : ( "ȕ" U+0215 Ll 1 "LATIN SMALL LETTER U WITH DOUBLE GRAVE" ) 534 : ( "Ȗ" U+0216 Lu 1 "LATIN CAPITAL LETTER U WITH INVERTED BREVE" ) 535 : ( "ȗ" U+0217 Ll 1 "LATIN SMALL LETTER U WITH INVERTED BREVE" ) 536 : ( "Ș" U+0218 Lu 1 "LATIN CAPITAL LETTER S WITH COMMA BELOW" ) 537 : ( "ș" U+0219 Ll 1 "LATIN SMALL LETTER S WITH COMMA BELOW" ) 538 : ( "Ț" U+021A Lu 1 "LATIN CAPITAL LETTER T WITH COMMA BELOW" ) 539 : ( "ț" U+021B Ll 1 "LATIN SMALL LETTER T WITH COMMA BELOW" ) 540 : ( "Ȝ" U+021C Lu 1 "LATIN CAPITAL LETTER YOGH" ) 541 : ( "ȝ" U+021D Ll 1 "LATIN SMALL LETTER YOGH" ) 542 : ( "Ȟ" U+021E Lu 1 "LATIN CAPITAL LETTER H WITH CARON" ) 543 : ( "ȟ" U+021F Ll 1 "LATIN SMALL LETTER H WITH CARON" ) 544 : ( "Ƞ" U+0220 Lu 1 "LATIN CAPITAL LETTER N WITH LONG RIGHT LEG" ) 545 : ( "ȡ" U+0221 Ll 1 "LATIN SMALL LETTER D WITH CURL" ) 546 : ( "Ȣ" U+0222 Lu 1 "LATIN CAPITAL LETTER OU" ) 547 : ( "ȣ" U+0223 Ll 1 "LATIN SMALL LETTER OU" ) 548 : ( "Ȥ" U+0224 Lu 1 "LATIN CAPITAL LETTER Z WITH HOOK" ) 549 : ( "ȥ" U+0225 Ll 1 "LATIN SMALL LETTER Z WITH HOOK" ) 550 : ( "Ȧ" U+0226 Lu 1 "LATIN CAPITAL LETTER A WITH DOT ABOVE" ) 551 : ( "ȧ" U+0227 Ll 1 "LATIN SMALL LETTER A WITH DOT ABOVE" ) 552 : ( "Ȩ" U+0228 Lu 1 "LATIN CAPITAL LETTER E WITH CEDILLA" ) 553 : ( "ȩ" U+0229 Ll 1 "LATIN SMALL LETTER E WITH CEDILLA" ) 554 : ( "Ȫ" U+022A Lu 1 "LATIN CAPITAL LETTER O WITH DIAERESIS AND MACRON" ) 555 : ( "ȫ" U+022B Ll 1 "LATIN SMALL LETTER O WITH DIAERESIS AND MACRON" ) 556 : ( "Ȭ" U+022C Lu 1 "LATIN CAPITAL LETTER O WITH TILDE AND MACRON" ) 557 : ( "ȭ" U+022D Ll 1 "LATIN SMALL LETTER O WITH TILDE AND MACRON" ) 558 : ( "Ȯ" U+022E Lu 1 "LATIN CAPITAL LETTER O WITH DOT ABOVE" ) 559 : ( "ȯ" U+022F Ll 1 "LATIN SMALL LETTER O WITH DOT ABOVE" ) 560 : ( "Ȱ" U+0230 Lu 1 "LATIN CAPITAL LETTER O WITH DOT ABOVE AND MACRON" ) 561 : ( "ȱ" U+0231 Ll 1 "LATIN SMALL LETTER O WITH DOT ABOVE AND MACRON" ) 562 : ( "Ȳ" U+0232 
Lu 1 "LATIN CAPITAL LETTER Y WITH MACRON" ) 563 : ( "ȳ" U+0233 Ll 1 "LATIN SMALL LETTER Y WITH MACRON" ) 564 : ( "ȴ" U+0234 Ll 1 "LATIN SMALL LETTER L WITH CURL" ) 565 : ( "ȵ" U+0235 Ll 1 "LATIN SMALL LETTER N WITH CURL" ) 566 : ( "ȶ" U+0236 Ll 1 "LATIN SMALL LETTER T WITH CURL" ) 567 : ( "ȷ" U+0237 Ll 1 "LATIN SMALL LETTER DOTLESS J" ) 568 : ( "ȸ" U+0238 Ll 1 "LATIN SMALL LETTER DB DIGRAPH" ) 569 : ( "ȹ" U+0239 Ll 1 "LATIN SMALL LETTER QP DIGRAPH" ) 570 : ( "Ⱥ" U+023A Lu 1 "LATIN CAPITAL LETTER A WITH STROKE" ) 571 : ( "Ȼ" U+023B Lu 1 "LATIN CAPITAL LETTER C WITH STROKE" ) 572 : ( "ȼ" U+023C Ll 1 "LATIN SMALL LETTER C WITH STROKE" ) 573 : ( "Ƚ" U+023D Lu 1 "LATIN CAPITAL LETTER L WITH BAR" ) 574 : ( "Ⱦ" U+023E Lu 1 "LATIN CAPITAL LETTER T WITH DIAGONAL STROKE" ) 575 : ( "ȿ" U+023F Ll 1 "LATIN SMALL LETTER S WITH SWASH TAIL" ) 576 : ( "ɀ" U+0240 Ll 1 "LATIN SMALL LETTER Z WITH SWASH TAIL" ) 577 : ( "Ɂ" U+0241 Lu 1 "LATIN CAPITAL LETTER GLOTTAL STOP" ) 578 : ( "ɂ" U+0242 Ll 1 "LATIN SMALL LETTER GLOTTAL STOP" ) 579 : ( "Ƀ" U+0243 Lu 1 "LATIN CAPITAL LETTER B WITH STROKE" ) 580 : ( "Ʉ" U+0244 Lu 1 "LATIN CAPITAL LETTER U BAR" ) 581 : ( "Ʌ" U+0245 Lu 1 "LATIN CAPITAL LETTER TURNED V" ) 582 : ( "Ɇ" U+0246 Lu 1 "LATIN CAPITAL LETTER E WITH STROKE" ) 583 : ( "ɇ" U+0247 Ll 1 "LATIN SMALL LETTER E WITH STROKE" ) 584 : ( "Ɉ" U+0248 Lu 1 "LATIN CAPITAL LETTER J WITH STROKE" ) 585 : ( "ɉ" U+0249 Ll 1 "LATIN SMALL LETTER J WITH STROKE" ) 586 : ( "Ɋ" U+024A Lu 1 "LATIN CAPITAL LETTER SMALL Q WITH HOOK TAIL" ) 587 : ( "ɋ" U+024B Ll 1 "LATIN SMALL LETTER Q WITH HOOK TAIL" ) 588 : ( "Ɍ" U+024C Lu 1 "LATIN CAPITAL LETTER R WITH STROKE" ) 589 : ( "ɍ" U+024D Ll 1 "LATIN SMALL LETTER R WITH STROKE" ) 590 : ( "Ɏ" U+024E Lu 1 "LATIN CAPITAL LETTER Y WITH STROKE" ) 591 : ( "ɏ" U+024F Ll 1 "LATIN SMALL LETTER Y WITH STROKE" ) 592 : ( "ɐ" U+0250 Ll 1 "LATIN SMALL LETTER TURNED A" ) 593 : ( "ɑ" U+0251 Ll 1 "LATIN SMALL LETTER ALPHA" ) 594 : ( "ɒ" U+0252 Ll 1 "LATIN SMALL LETTER TURNED 
ALPHA" ) 595 : ( "ɓ" U+0253 Ll 1 "LATIN SMALL LETTER B WITH HOOK" ) 596 : ( "ɔ" U+0254 Ll 1 "LATIN SMALL LETTER OPEN O" ) 597 : ( "ɕ" U+0255 Ll 1 "LATIN SMALL LETTER C WITH CURL" ) 598 : ( "ɖ" U+0256 Ll 1 "LATIN SMALL LETTER D WITH TAIL" ) 599 : ( "ɗ" U+0257 Ll 1 "LATIN SMALL LETTER D WITH HOOK" ) 600 : ( "ɘ" U+0258 Ll 1 "LATIN SMALL LETTER REVERSED E" ) 601 : ( "ə" U+0259 Ll 1 "LATIN SMALL LETTER SCHWA" ) 602 : ( "ɚ" U+025A Ll 1 "LATIN SMALL LETTER SCHWA WITH HOOK" ) 603 : ( "ɛ" U+025B Ll 1 "LATIN SMALL LETTER OPEN E" ) 604 : ( "ɜ" U+025C Ll 1 "LATIN SMALL LETTER REVERSED OPEN E" ) 605 : ( "ɝ" U+025D Ll 1 "LATIN SMALL LETTER REVERSED OPEN E WITH HOOK" ) 606 : ( "ɞ" U+025E Ll 1 "LATIN SMALL LETTER CLOSED REVERSED OPEN E" ) 607 : ( "ɟ" U+025F Ll 1 "LATIN SMALL LETTER DOTLESS J WITH STROKE" ) 608 : ( "ɠ" U+0260 Ll 1 "LATIN SMALL LETTER G WITH HOOK" ) 609 : ( "ɡ" U+0261 Ll 1 "LATIN SMALL LETTER SCRIPT G" ) 610 : ( "ɢ" U+0262 Ll 1 "LATIN LETTER SMALL CAPITAL G" ) 611 : ( "ɣ" U+0263 Ll 1 "LATIN SMALL LETTER GAMMA" ) 612 : ( "ɤ" U+0264 Ll 1 "LATIN SMALL LETTER RAMS HORN" ) 613 : ( "ɥ" U+0265 Ll 1 "LATIN SMALL LETTER TURNED H" ) 614 : ( "ɦ" U+0266 Ll 1 "LATIN SMALL LETTER H WITH HOOK" ) 615 : ( "ɧ" U+0267 Ll 1 "LATIN SMALL LETTER HENG WITH HOOK" ) 616 : ( "ɨ" U+0268 Ll 1 "LATIN SMALL LETTER I WITH STROKE" ) 617 : ( "ɩ" U+0269 Ll 1 "LATIN SMALL LETTER IOTA" ) 618 : ( "ɪ" U+026A Ll 1 "LATIN LETTER SMALL CAPITAL I" ) 619 : ( "ɫ" U+026B Ll 1 "LATIN SMALL LETTER L WITH MIDDLE TILDE" ) 620 : ( "ɬ" U+026C Ll 1 "LATIN SMALL LETTER L WITH BELT" ) 621 : ( "ɭ" U+026D Ll 1 "LATIN SMALL LETTER L WITH RETROFLEX HOOK" ) 622 : ( "ɮ" U+026E Ll 1 "LATIN SMALL LETTER LEZH" ) 623 : ( "ɯ" U+026F Ll 1 "LATIN SMALL LETTER TURNED M" ) 624 : ( "ɰ" U+0270 Ll 1 "LATIN SMALL LETTER TURNED M WITH LONG LEG" ) 625 : ( "ɱ" U+0271 Ll 1 "LATIN SMALL LETTER M WITH HOOK" ) 626 : ( "ɲ" U+0272 Ll 1 "LATIN SMALL LETTER N WITH LEFT HOOK" ) 627 : ( "ɳ" U+0273 Ll 1 "LATIN SMALL LETTER N WITH RETROFLEX HOOK" ) 628 
: ( "ɴ" U+0274 Ll 1 "LATIN LETTER SMALL CAPITAL N" ) 629 : ( "ɵ" U+0275 Ll 1 "LATIN SMALL LETTER BARRED O" ) 630 : ( "ɶ" U+0276 Ll 1 "LATIN LETTER SMALL CAPITAL OE" ) 631 : ( "ɷ" U+0277 Ll 1 "LATIN SMALL LETTER CLOSED OMEGA" ) 632 : ( "ɸ" U+0278 Ll 1 "LATIN SMALL LETTER PHI" ) 633 : ( "ɹ" U+0279 Ll 1 "LATIN SMALL LETTER TURNED R" ) 634 : ( "ɺ" U+027A Ll 1 "LATIN SMALL LETTER TURNED R WITH LONG LEG" ) 635 : ( "ɻ" U+027B Ll 1 "LATIN SMALL LETTER TURNED R WITH HOOK" ) 636 : ( "ɼ" U+027C Ll 1 "LATIN SMALL LETTER R WITH LONG LEG" ) 637 : ( "ɽ" U+027D Ll 1 "LATIN SMALL LETTER R WITH TAIL" ) 638 : ( "ɾ" U+027E Ll 1 "LATIN SMALL LETTER R WITH FISHHOOK" ) 639 : ( "ɿ" U+027F Ll 1 "LATIN SMALL LETTER REVERSED R WITH FISHHOOK" ) 640 : ( "ʀ" U+0280 Ll 1 "LATIN LETTER SMALL CAPITAL R" ) 641 : ( "ʁ" U+0281 Ll 1 "LATIN LETTER SMALL CAPITAL INVERTED R" ) 642 : ( "ʂ" U+0282 Ll 1 "LATIN SMALL LETTER S WITH HOOK" ) 643 : ( "ʃ" U+0283 Ll 1 "LATIN SMALL LETTER ESH" ) 644 : ( "ʄ" U+0284 Ll 1 "LATIN SMALL LETTER DOTLESS J WITH STROKE AND HOOK" ) 645 : ( "ʅ" U+0285 Ll 1 "LATIN SMALL LETTER SQUAT REVERSED ESH" ) 646 : ( "ʆ" U+0286 Ll 1 "LATIN SMALL LETTER ESH WITH CURL" ) 647 : ( "ʇ" U+0287 Ll 1 "LATIN SMALL LETTER TURNED T" ) 648 : ( "ʈ" U+0288 Ll 1 "LATIN SMALL LETTER T WITH RETROFLEX HOOK" ) 649 : ( "ʉ" U+0289 Ll 1 "LATIN SMALL LETTER U BAR" ) 650 : ( "ʊ" U+028A Ll 1 "LATIN SMALL LETTER UPSILON" ) 651 : ( "ʋ" U+028B Ll 1 "LATIN SMALL LETTER V WITH HOOK" ) 652 : ( "ʌ" U+028C Ll 1 "LATIN SMALL LETTER TURNED V" ) 653 : ( "ʍ" U+028D Ll 1 "LATIN SMALL LETTER TURNED W" ) 654 : ( "ʎ" U+028E Ll 1 "LATIN SMALL LETTER TURNED Y" ) 655 : ( "ʏ" U+028F Ll 1 "LATIN LETTER SMALL CAPITAL Y" ) 656 : ( "ʐ" U+0290 Ll 1 "LATIN SMALL LETTER Z WITH RETROFLEX HOOK" ) 657 : ( "ʑ" U+0291 Ll 1 "LATIN SMALL LETTER Z WITH CURL" ) 658 : ( "ʒ" U+0292 Ll 1 "LATIN SMALL LETTER EZH" ) 659 : ( "ʓ" U+0293 Ll 1 "LATIN SMALL LETTER EZH WITH CURL" ) 660 : ( "ʔ" U+0294 Lo 1 "LATIN LETTER GLOTTAL STOP" ) 661 : ( "ʕ" U+0295 Ll 1 
"LATIN LETTER PHARYNGEAL VOICED FRICATIVE" ) 662 : ( "ʖ" U+0296 Ll 1 "LATIN LETTER INVERTED GLOTTAL STOP" ) 663 : ( "ʗ" U+0297 Ll 1 "LATIN LETTER STRETCHED C" ) 664 : ( "ʘ" U+0298 Ll 1 "LATIN LETTER BILABIAL CLICK" ) 665 : ( "ʙ" U+0299 Ll 1 "LATIN LETTER SMALL CAPITAL B" ) 666 : ( "ʚ" U+029A Ll 1 "LATIN SMALL LETTER CLOSED OPEN E" ) 667 : ( "ʛ" U+029B Ll 1 "LATIN LETTER SMALL CAPITAL G WITH HOOK" ) 668 : ( "ʜ" U+029C Ll 1 "LATIN LETTER SMALL CAPITAL H" ) 669 : ( "ʝ" U+029D Ll 1 "LATIN SMALL LETTER J WITH CROSSED-TAIL" ) 670 : ( "ʞ" U+029E Ll 1 "LATIN SMALL LETTER TURNED K" ) 671 : ( "ʟ" U+029F Ll 1 "LATIN LETTER SMALL CAPITAL L" ) 672 : ( "ʠ" U+02A0 Ll 1 "LATIN SMALL LETTER Q WITH HOOK" ) 673 : ( "ʡ" U+02A1 Ll 1 "LATIN LETTER GLOTTAL STOP WITH STROKE" ) 674 : ( "ʢ" U+02A2 Ll 1 "LATIN LETTER REVERSED GLOTTAL STOP WITH STROKE" ) 675 : ( "ʣ" U+02A3 Ll 1 "LATIN SMALL LETTER DZ DIGRAPH" ) 676 : ( "ʤ" U+02A4 Ll 1 "LATIN SMALL LETTER DEZH DIGRAPH" ) 677 : ( "ʥ" U+02A5 Ll 1 "LATIN SMALL LETTER DZ DIGRAPH WITH CURL" ) 678 : ( "ʦ" U+02A6 Ll 1 "LATIN SMALL LETTER TS DIGRAPH" ) 679 : ( "ʧ" U+02A7 Ll 1 "LATIN SMALL LETTER TESH DIGRAPH" ) 680 : ( "ʨ" U+02A8 Ll 1 "LATIN SMALL LETTER TC DIGRAPH WITH CURL" ) 681 : ( "ʩ" U+02A9 Ll 1 "LATIN SMALL LETTER FENG DIGRAPH" ) 682 : ( "ʪ" U+02AA Ll 1 "LATIN SMALL LETTER LS DIGRAPH" ) 683 : ( "ʫ" U+02AB Ll 1 "LATIN SMALL LETTER LZ DIGRAPH" ) 684 : ( "ʬ" U+02AC Ll 1 "LATIN LETTER BILABIAL PERCUSSIVE" ) 685 : ( "ʭ" U+02AD Ll 1 "LATIN LETTER BIDENTAL PERCUSSIVE" ) 686 : ( "ʮ" U+02AE Ll 1 "LATIN SMALL LETTER TURNED H WITH FISHHOOK" ) 687 : ( "ʯ" U+02AF Ll 1 "LATIN SMALL LETTER TURNED H WITH FISHHOOK AND TAIL" ) 688 : ( "ʰ" U+02B0 Lm 1 "MODIFIER LETTER SMALL H" ) 689 : ( "ʱ" U+02B1 Lm 1 "MODIFIER LETTER SMALL H WITH HOOK" ) 690 : ( "ʲ" U+02B2 Lm 1 "MODIFIER LETTER SMALL J" ) 691 : ( "ʳ" U+02B3 Lm 1 "MODIFIER LETTER SMALL R" ) 692 : ( "ʴ" U+02B4 Lm 1 "MODIFIER LETTER SMALL TURNED R" ) 693 : ( "ʵ" U+02B5 Lm 1 "MODIFIER LETTER SMALL TURNED R WITH 
HOOK" ) 694 : ( "ʶ" U+02B6 Lm 1 "MODIFIER LETTER SMALL CAPITAL INVERTED R" ) 695 : ( "ʷ" U+02B7 Lm 1 "MODIFIER LETTER SMALL W" ) 696 : ( "ʸ" U+02B8 Lm 1 "MODIFIER LETTER SMALL Y" ) 697 : ( "ʹ" U+02B9 Lm 1 "MODIFIER LETTER PRIME" ) 698 : ( "ʺ" U+02BA Lm 1 "MODIFIER LETTER DOUBLE PRIME" ) 699 : ( "ʻ" U+02BB Lm 1 "MODIFIER LETTER TURNED COMMA" ) 700 : ( "ʼ" U+02BC Lm 1 "MODIFIER LETTER APOSTROPHE" ) 701 : ( "ʽ" U+02BD Lm 1 "MODIFIER LETTER REVERSED COMMA" ) 702 : ( "ʾ" U+02BE Lm 1 "MODIFIER LETTER RIGHT HALF RING" ) 703 : ( "ʿ" U+02BF Lm 1 "MODIFIER LETTER LEFT HALF RING" ) 704 : ( "ˀ" U+02C0 Lm 1 "MODIFIER LETTER GLOTTAL STOP" ) 705 : ( "ˁ" U+02C1 Lm 1 "MODIFIER LETTER REVERSED GLOTTAL STOP" ) 706 : ( "˂" U+02C2 Sk 1 "MODIFIER LETTER LEFT ARROWHEAD" ) 707 : ( "˃" U+02C3 Sk 1 "MODIFIER LETTER RIGHT ARROWHEAD" ) 708 : ( "˄" U+02C4 Sk 1 "MODIFIER LETTER UP ARROWHEAD" ) 709 : ( "˅" U+02C5 Sk 1 "MODIFIER LETTER DOWN ARROWHEAD" ) 710 : ( "ˆ" U+02C6 Lm 1 "MODIFIER LETTER CIRCUMFLEX ACCENT" ) 711 : ( "ˇ" U+02C7 Lm 1 "CARON" ) 712 : ( "ˈ" U+02C8 Lm 1 "MODIFIER LETTER VERTICAL LINE" ) 713 : ( "ˉ" U+02C9 Lm 1 "MODIFIER LETTER MACRON" ) 714 : ( "ˊ" U+02CA Lm 1 "MODIFIER LETTER ACUTE ACCENT" ) 715 : ( "ˋ" U+02CB Lm 1 "MODIFIER LETTER GRAVE ACCENT" ) 716 : ( "ˌ" U+02CC Lm 1 "MODIFIER LETTER LOW VERTICAL LINE" ) 717 : ( "ˍ" U+02CD Lm 1 "MODIFIER LETTER LOW MACRON" ) 718 : ( "ˎ" U+02CE Lm 1 "MODIFIER LETTER LOW GRAVE ACCENT" ) 719 : ( "ˏ" U+02CF Lm 1 "MODIFIER LETTER LOW ACUTE ACCENT" ) 720 : ( "ː" U+02D0 Lm 1 "MODIFIER LETTER TRIANGULAR COLON" ) 721 : ( "ˑ" U+02D1 Lm 1 "MODIFIER LETTER HALF TRIANGULAR COLON" ) 722 : ( "˒" U+02D2 Sk 1 "MODIFIER LETTER CENTRED RIGHT HALF RING" ) 723 : ( "˓" U+02D3 Sk 1 "MODIFIER LETTER CENTRED LEFT HALF RING" ) 724 : ( "˔" U+02D4 Sk 1 "MODIFIER LETTER UP TACK" ) 725 : ( "˕" U+02D5 Sk 1 "MODIFIER LETTER DOWN TACK" ) 726 : ( "˖" U+02D6 Sk 1 "MODIFIER LETTER PLUS SIGN" ) 727 : ( "˗" U+02D7 Sk 1 "MODIFIER LETTER MINUS SIGN" ) 728 : ( "˘" U+02D8 Sk 1 
"BREVE" ) 729 : ( "˙" U+02D9 Sk 1 "DOT ABOVE" ) 730 : ( "˚" U+02DA Sk 1 "RING ABOVE" ) 731 : ( "˛" U+02DB Sk 1 "OGONEK" ) 732 : ( "˜" U+02DC Sk 1 "SMALL TILDE" ) 733 : ( "˝" U+02DD Sk 1 "DOUBLE ACUTE ACCENT" ) 734 : ( "˞" U+02DE Sk 1 "MODIFIER LETTER RHOTIC HOOK" ) 735 : ( "˟" U+02DF Sk 1 "MODIFIER LETTER CROSS ACCENT" ) 736 : ( "ˠ" U+02E0 Lm 1 "MODIFIER LETTER SMALL GAMMA" ) 737 : ( "ˡ" U+02E1 Lm 1 "MODIFIER LETTER SMALL L" ) 738 : ( "ˢ" U+02E2 Lm 1 "MODIFIER LETTER SMALL S" ) 739 : ( "ˣ" U+02E3 Lm 1 "MODIFIER LETTER SMALL X" ) 740 : ( "ˤ" U+02E4 Lm 1 "MODIFIER LETTER SMALL REVERSED GLOTTAL STOP" ) 741 : ( "˥" U+02E5 Sk 1 "MODIFIER LETTER EXTRA-HIGH TONE BAR" ) 742 : ( "˦" U+02E6 Sk 1 "MODIFIER LETTER HIGH TONE BAR" ) 743 : ( "˧" U+02E7 Sk 1 "MODIFIER LETTER MID TONE BAR" ) 744 : ( "˨" U+02E8 Sk 1 "MODIFIER LETTER LOW TONE BAR" ) 745 : ( "˩" U+02E9 Sk 1 "MODIFIER LETTER EXTRA-LOW TONE BAR" ) 746 : ( "˪" U+02EA Sk 1 "MODIFIER LETTER YIN DEPARTING TONE MARK" ) 747 : ( "˫" U+02EB Sk 1 "MODIFIER LETTER YANG DEPARTING TONE MARK" ) 748 : ( "ˬ" U+02EC Lm 1 "MODIFIER LETTER VOICING" ) 749 : ( "˭" U+02ED Sk 1 "MODIFIER LETTER UNASPIRATED" ) 750 : ( "ˮ" U+02EE Lm 1 "MODIFIER LETTER DOUBLE APOSTROPHE" ) 751 : ( "˯" U+02EF Sk 1 "MODIFIER LETTER LOW DOWN ARROWHEAD" ) 752 : ( "˰" U+02F0 Sk 1 "MODIFIER LETTER LOW UP ARROWHEAD" ) 753 : ( "˱" U+02F1 Sk 1 "MODIFIER LETTER LOW LEFT ARROWHEAD" ) 754 : ( "˲" U+02F2 Sk 1 "MODIFIER LETTER LOW RIGHT ARROWHEAD" ) 755 : ( "˳" U+02F3 Sk 1 "MODIFIER LETTER LOW RING" ) 756 : ( "˴" U+02F4 Sk 1 "MODIFIER LETTER MIDDLE GRAVE ACCENT" ) 757 : ( "˵" U+02F5 Sk 1 "MODIFIER LETTER MIDDLE DOUBLE GRAVE ACCENT" ) 758 : ( "˶" U+02F6 Sk 1 "MODIFIER LETTER MIDDLE DOUBLE ACUTE ACCENT" ) 759 : ( "˷" U+02F7 Sk 1 "MODIFIER LETTER LOW TILDE" ) 760 : ( "˸" U+02F8 Sk 1 "MODIFIER LETTER RAISED COLON" ) 761 : ( "˹" U+02F9 Sk 1 "MODIFIER LETTER BEGIN HIGH TONE" ) 762 : ( "˺" U+02FA Sk 1 "MODIFIER LETTER END HIGH TONE" ) 763 : ( "˻" U+02FB Sk 1 "MODIFIER LETTER BEGIN 
LOW TONE" ) 764 : ( "˼" U+02FC Sk 1 "MODIFIER LETTER END LOW TONE" ) 765 : ( "˽" U+02FD Sk 1 "MODIFIER LETTER SHELF" ) 766 : ( "˾" U+02FE Sk 1 "MODIFIER LETTER OPEN SHELF" ) 767 : ( "˿" U+02FF Sk 1 "MODIFIER LETTER LOW LEFT ARROW" ) 768 : ( "̀" U+0300 Mn 0 "COMBINING GRAVE ACCENT" ) 769 : ( "́" U+0301 Mn 0 "COMBINING ACUTE ACCENT" ) 770 : ( "̂" U+0302 Mn 0 "COMBINING CIRCUMFLEX ACCENT" ) 771 : ( "̃" U+0303 Mn 0 "COMBINING TILDE" ) 772 : ( "̄" U+0304 Mn 0 "COMBINING MACRON" ) 773 : ( "̅" U+0305 Mn 0 "COMBINING OVERLINE" ) 774 : ( "̆" U+0306 Mn 0 "COMBINING BREVE" ) 775 : ( "̇" U+0307 Mn 0 "COMBINING DOT ABOVE" ) 776 : ( "̈" U+0308 Mn 0 "COMBINING DIAERESIS" ) 777 : ( "̉" U+0309 Mn 0 "COMBINING HOOK ABOVE" ) 778 : ( "̊" U+030A Mn 0 "COMBINING RING ABOVE" ) 779 : ( "̋" U+030B Mn 0 "COMBINING DOUBLE ACUTE ACCENT" ) 780 : ( "̌" U+030C Mn 0 "COMBINING CARON" ) 781 : ( "̍" U+030D Mn 0 "COMBINING VERTICAL LINE ABOVE" ) 782 : ( "̎" U+030E Mn 0 "COMBINING DOUBLE VERTICAL LINE ABOVE" ) 783 : ( "̏" U+030F Mn 0 "COMBINING DOUBLE GRAVE ACCENT" ) 784 : ( "̐" U+0310 Mn 0 "COMBINING CANDRABINDU" ) 785 : ( "̑" U+0311 Mn 0 "COMBINING INVERTED BREVE" ) 786 : ( "̒" U+0312 Mn 0 "COMBINING TURNED COMMA ABOVE" ) 787 : ( "̓" U+0313 Mn 0 "COMBINING COMMA ABOVE" ) 788 : ( "̔" U+0314 Mn 0 "COMBINING REVERSED COMMA ABOVE" ) 789 : ( "̕" U+0315 Mn 0 "COMBINING COMMA ABOVE RIGHT" ) 790 : ( "̖" U+0316 Mn 0 "COMBINING GRAVE ACCENT BELOW" ) 791 : ( "̗" U+0317 Mn 0 "COMBINING ACUTE ACCENT BELOW" ) 792 : ( "̘" U+0318 Mn 0 "COMBINING LEFT TACK BELOW" ) 793 : ( "̙" U+0319 Mn 0 "COMBINING RIGHT TACK BELOW" ) 794 : ( "̚" U+031A Mn 0 "COMBINING LEFT ANGLE ABOVE" ) 795 : ( "̛" U+031B Mn 0 "COMBINING HORN" ) 796 : ( "̜" U+031C Mn 0 "COMBINING LEFT HALF RING BELOW" ) 797 : ( "̝" U+031D Mn 0 "COMBINING UP TACK BELOW" ) 798 : ( "̞" U+031E Mn 0 "COMBINING DOWN TACK BELOW" ) 799 : ( "̟" U+031F Mn 0 "COMBINING PLUS SIGN BELOW" ) 800 : ( "̠" U+0320 Mn 0 "COMBINING MINUS SIGN BELOW" ) 801 : ( "̡" U+0321 Mn 0 
"COMBINING PALATALIZED HOOK BELOW" ) 802 : ( "̢" U+0322 Mn 0 "COMBINING RETROFLEX HOOK BELOW" ) 803 : ( "̣" U+0323 Mn 0 "COMBINING DOT BELOW" ) 804 : ( "̤" U+0324 Mn 0 "COMBINING DIAERESIS BELOW" ) 805 : ( "̥" U+0325 Mn 0 "COMBINING RING BELOW" ) 806 : ( "̦" U+0326 Mn 0 "COMBINING COMMA BELOW" ) 807 : ( "̧" U+0327 Mn 0 "COMBINING CEDILLA" ) 808 : ( "̨" U+0328 Mn 0 "COMBINING OGONEK" ) 809 : ( "̩" U+0329 Mn 0 "COMBINING VERTICAL LINE BELOW" ) 810 : ( "̪" U+032A Mn 0 "COMBINING BRIDGE BELOW" ) 811 : ( "̫" U+032B Mn 0 "COMBINING INVERTED DOUBLE ARCH BELOW" ) 812 : ( "̬" U+032C Mn 0 "COMBINING CARON BELOW" ) 813 : ( "̭" U+032D Mn 0 "COMBINING CIRCUMFLEX ACCENT BELOW" ) 814 : ( "̮" U+032E Mn 0 "COMBINING BREVE BELOW" ) 815 : ( "̯" U+032F Mn 0 "COMBINING INVERTED BREVE BELOW" ) 816 : ( "̰" U+0330 Mn 0 "COMBINING TILDE BELOW" ) 817 : ( "̱" U+0331 Mn 0 "COMBINING MACRON BELOW" ) 818 : ( "̲" U+0332 Mn 0 "COMBINING LOW LINE" ) 819 : ( "̳" U+0333 Mn 0 "COMBINING DOUBLE LOW LINE" ) 820 : ( "̴" U+0334 Mn 0 "COMBINING TILDE OVERLAY" ) 821 : ( "̵" U+0335 Mn 0 "COMBINING SHORT STROKE OVERLAY" ) 822 : ( "̶" U+0336 Mn 0 "COMBINING LONG STROKE OVERLAY" ) 823 : ( "̷" U+0337 Mn 0 "COMBINING SHORT SOLIDUS OVERLAY" ) 824 : ( "̸" U+0338 Mn 0 "COMBINING LONG SOLIDUS OVERLAY" ) 825 : ( "̹" U+0339 Mn 0 "COMBINING RIGHT HALF RING BELOW" ) 826 : ( "̺" U+033A Mn 0 "COMBINING INVERTED BRIDGE BELOW" ) 827 : ( "̻" U+033B Mn 0 "COMBINING SQUARE BELOW" ) 828 : ( "̼" U+033C Mn 0 "COMBINING SEAGULL BELOW" ) 829 : ( "̽" U+033D Mn 0 "COMBINING X ABOVE" ) 830 : ( "̾" U+033E Mn 0 "COMBINING VERTICAL TILDE" ) 831 : ( "̿" U+033F Mn 0 "COMBINING DOUBLE OVERLINE" ) 832 : ( "̀" U+0340 Mn 0 "COMBINING GRAVE TONE MARK" ) 833 : ( "́" U+0341 Mn 0 "COMBINING ACUTE TONE MARK" ) 834 : ( "͂" U+0342 Mn 0 "COMBINING GREEK PERISPOMENI" ) 835 : ( "̓" U+0343 Mn 0 "COMBINING GREEK KORONIS" ) 836 : ( "̈́" U+0344 Mn 0 "COMBINING GREEK DIALYTIKA TONOS" ) 837 : ( "ͅ" U+0345 Mn 0 "COMBINING GREEK YPOGEGRAMMENI" ) 838 : ( "͆" 
U+0346 Mn 0 "COMBINING BRIDGE ABOVE" ) 839 : ( "͇" U+0347 Mn 0 "COMBINING EQUALS SIGN BELOW" ) 840 : ( "͈" U+0348 Mn 0 "COMBINING DOUBLE VERTICAL LINE BELOW" ) 841 : ( "͉" U+0349 Mn 0 "COMBINING LEFT ANGLE BELOW" ) 842 : ( "͊" U+034A Mn 0 "COMBINING NOT TILDE ABOVE" ) 843 : ( "͋" U+034B Mn 0 "COMBINING HOMOTHETIC ABOVE" ) 844 : ( "͌" U+034C Mn 0 "COMBINING ALMOST EQUAL TO ABOVE" ) 845 : ( "͍" U+034D Mn 0 "COMBINING LEFT RIGHT ARROW BELOW" ) 846 : ( "͎" U+034E Mn 0 "COMBINING UPWARDS ARROW BELOW" ) 847 : ( "͏" U+034F Mn 0 "COMBINING GRAPHEME JOINER", "CGJ" ) 848 : ( "͐" U+0350 Mn 0 "COMBINING RIGHT ARROWHEAD ABOVE" ) 849 : ( "͑" U+0351 Mn 0 "COMBINING LEFT HALF RING ABOVE" ) 850 : ( "͒" U+0352 Mn 0 "COMBINING FERMATA" ) 851 : ( "͓" U+0353 Mn 0 "COMBINING X BELOW" ) 852 : ( "͔" U+0354 Mn 0 "COMBINING LEFT ARROWHEAD BELOW" ) 853 : ( "͕" U+0355 Mn 0 "COMBINING RIGHT ARROWHEAD BELOW" ) 854 : ( "͖" U+0356 Mn 0 "COMBINING RIGHT ARROWHEAD AND UP ARROWHEAD BELOW" ) 855 : ( "͗" U+0357 Mn 0 "COMBINING RIGHT HALF RING ABOVE" ) 856 : ( "͘" U+0358 Mn 0 "COMBINING DOT ABOVE RIGHT" ) 857 : ( "͙" U+0359 Mn 0 "COMBINING ASTERISK BELOW" ) 858 : ( "͚" U+035A Mn 0 "COMBINING DOUBLE RING BELOW" ) 859 : ( "͛" U+035B Mn 0 "COMBINING ZIGZAG ABOVE" ) 860 : ( "͜" U+035C Mn 0 "COMBINING DOUBLE BREVE BELOW" ) 861 : ( "͝" U+035D Mn 0 "COMBINING DOUBLE BREVE" ) 862 : ( "͞" U+035E Mn 0 "COMBINING DOUBLE MACRON" ) 863 : ( "͟" U+035F Mn 0 "COMBINING DOUBLE MACRON BELOW" ) 864 : ( "͠" U+0360 Mn 0 "COMBINING DOUBLE TILDE" ) 865 : ( "͡" U+0361 Mn 0 "COMBINING DOUBLE INVERTED BREVE" ) 866 : ( "͢" U+0362 Mn 0 "COMBINING DOUBLE RIGHTWARDS ARROW BELOW" ) 867 : ( "ͣ" U+0363 Mn 0 "COMBINING LATIN SMALL LETTER A" ) 868 : ( "ͤ" U+0364 Mn 0 "COMBINING LATIN SMALL LETTER E" ) 869 : ( "ͥ" U+0365 Mn 0 "COMBINING LATIN SMALL LETTER I" ) 870 : ( "ͦ" U+0366 Mn 0 "COMBINING LATIN SMALL LETTER O" ) 871 : ( "ͧ" U+0367 Mn 0 "COMBINING LATIN SMALL LETTER U" ) 872 : ( "ͨ" U+0368 Mn 0 "COMBINING LATIN SMALL LETTER C" ) 873 
: ( "ͩ" U+0369 Mn 0 "COMBINING LATIN SMALL LETTER D" ) 874 : ( "ͪ" U+036A Mn 0 "COMBINING LATIN SMALL LETTER H" ) 875 : ( "ͫ" U+036B Mn 0 "COMBINING LATIN SMALL LETTER M" ) 876 : ( "ͬ" U+036C Mn 0 "COMBINING LATIN SMALL LETTER R" ) 877 : ( "ͭ" U+036D Mn 0 "COMBINING LATIN SMALL LETTER T" ) 878 : ( "ͮ" U+036E Mn 0 "COMBINING LATIN SMALL LETTER V" ) 879 : ( "ͯ" U+036F Mn 0 "COMBINING LATIN SMALL LETTER X" ) 880 : ( "Ͱ" U+0370 Lu 1 "GREEK CAPITAL LETTER HETA" ) 881 : ( "ͱ" U+0371 Ll 1 "GREEK SMALL LETTER HETA" ) 882 : ( "Ͳ" U+0372 Lu 1 "GREEK CAPITAL LETTER ARCHAIC SAMPI" ) 883 : ( "ͳ" U+0373 Ll 1 "GREEK SMALL LETTER ARCHAIC SAMPI" ) 884 : ( "ʹ" U+0374 Lm 1 "GREEK NUMERAL SIGN" ) 885 : ( "͵" U+0375 Sk 1 "GREEK LOWER NUMERAL SIGN" ) 886 : ( "Ͷ" U+0376 Lu 1 "GREEK CAPITAL LETTER PAMPHYLIAN DIGAMMA" ) 887 : ( "ͷ" U+0377 Ll 1 "GREEK SMALL LETTER PAMPHYLIAN DIGAMMA" ) 890 : ( "ͺ" U+037A Lm 1 "GREEK YPOGEGRAMMENI" ) 891 : ( "ͻ" U+037B Ll 1 "GREEK SMALL REVERSED LUNATE SIGMA SYMBOL" ) 892 : ( "ͼ" U+037C Ll 1 "GREEK SMALL DOTTED LUNATE SIGMA SYMBOL" ) 893 : ( "ͽ" U+037D Ll 1 "GREEK SMALL REVERSED DOTTED LUNATE SIGMA SYMBOL" ) 894 : ( ";" U+037E Po 1 "GREEK QUESTION MARK" ) 895 : ( "Ϳ" U+037F Lu 1 "GREEK CAPITAL LETTER YOT" ) 900 : ( "΄" U+0384 Sk 1 "GREEK TONOS" ) 901 : ( "΅" U+0385 Sk 1 "GREEK DIALYTIKA TONOS" ) 902 : ( "Ά" U+0386 Lu 1 "GREEK CAPITAL LETTER ALPHA WITH TONOS" ) 903 : ( "·" U+0387 Po 1 "GREEK ANO TELEIA" ) 904 : ( "Έ" U+0388 Lu 1 "GREEK CAPITAL LETTER EPSILON WITH TONOS" ) 905 : ( "Ή" U+0389 Lu 1 "GREEK CAPITAL LETTER ETA WITH TONOS" ) 906 : ( "Ί" U+038A Lu 1 "GREEK CAPITAL LETTER IOTA WITH TONOS" ) 908 : ( "Ό" U+038C Lu 1 "GREEK CAPITAL LETTER OMICRON WITH TONOS" ) 910 : ( "Ύ" U+038E Lu 1 "GREEK CAPITAL LETTER UPSILON WITH TONOS" ) 911 : ( "Ώ" U+038F Lu 1 "GREEK CAPITAL LETTER OMEGA WITH TONOS" ) 912 : ( "ΐ" U+0390 Ll 1 "GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS" ) 913 : ( "Α" U+0391 Lu 1 "GREEK CAPITAL LETTER ALPHA" ) 914 : ( "Β" U+0392 Lu 1 "GREEK 
CAPITAL LETTER BETA" ) 915 : ( "Γ" U+0393 Lu 1 "GREEK CAPITAL LETTER GAMMA" ) 916 : ( "Δ" U+0394 Lu 1 "GREEK CAPITAL LETTER DELTA" ) 917 : ( "Ε" U+0395 Lu 1 "GREEK CAPITAL LETTER EPSILON" ) 918 : ( "Ζ" U+0396 Lu 1 "GREEK CAPITAL LETTER ZETA" ) 919 : ( "Η" U+0397 Lu 1 "GREEK CAPITAL LETTER ETA" ) 920 : ( "Θ" U+0398 Lu 1 "GREEK CAPITAL LETTER THETA" ) 921 : ( "Ι" U+0399 Lu 1 "GREEK CAPITAL LETTER IOTA" ) 922 : ( "Κ" U+039A Lu 1 "GREEK CAPITAL LETTER KAPPA" ) 923 : ( "Λ" U+039B Lu 1 "GREEK CAPITAL LETTER LAMDA" ) 924 : ( "Μ" U+039C Lu 1 "GREEK CAPITAL LETTER MU" ) 925 : ( "Ν" U+039D Lu 1 "GREEK CAPITAL LETTER NU" ) 926 : ( "Ξ" U+039E Lu 1 "GREEK CAPITAL LETTER XI" ) 927 : ( "Ο" U+039F Lu 1 "GREEK CAPITAL LETTER OMICRON" ) 928 : ( "Π" U+03A0 Lu 1 "GREEK CAPITAL LETTER PI" ) 929 : ( "Ρ" U+03A1 Lu 1 "GREEK CAPITAL LETTER RHO" ) 931 : ( "Σ" U+03A3 Lu 1 "GREEK CAPITAL LETTER SIGMA" ) 932 : ( "Τ" U+03A4 Lu 1 "GREEK CAPITAL LETTER TAU" ) 933 : ( "Υ" U+03A5 Lu 1 "GREEK CAPITAL LETTER UPSILON" ) 934 : ( "Φ" U+03A6 Lu 1 "GREEK CAPITAL LETTER PHI" ) 935 : ( "Χ" U+03A7 Lu 1 "GREEK CAPITAL LETTER CHI" ) 936 : ( "Ψ" U+03A8 Lu 1 "GREEK CAPITAL LETTER PSI" ) 937 : ( "Ω" U+03A9 Lu 1 "GREEK CAPITAL LETTER OMEGA" ) 938 : ( "Ϊ" U+03AA Lu 1 "GREEK CAPITAL LETTER IOTA WITH DIALYTIKA" ) 939 : ( "Ϋ" U+03AB Lu 1 "GREEK CAPITAL LETTER UPSILON WITH DIALYTIKA" ) 940 : ( "ά" U+03AC Ll 1 "GREEK SMALL LETTER ALPHA WITH TONOS" ) 941 : ( "έ" U+03AD Ll 1 "GREEK SMALL LETTER EPSILON WITH TONOS" ) 942 : ( "ή" U+03AE Ll 1 "GREEK SMALL LETTER ETA WITH TONOS" ) 943 : ( "ί" U+03AF Ll 1 "GREEK SMALL LETTER IOTA WITH TONOS" ) 944 : ( "ΰ" U+03B0 Ll 1 "GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS" ) 945 : ( "α" U+03B1 Ll 1 "GREEK SMALL LETTER ALPHA" ) 946 : ( "β" U+03B2 Ll 1 "GREEK SMALL LETTER BETA" ) 947 : ( "γ" U+03B3 Ll 1 "GREEK SMALL LETTER GAMMA" ) 948 : ( "δ" U+03B4 Ll 1 "GREEK SMALL LETTER DELTA" ) 949 : ( "ε" U+03B5 Ll 1 "GREEK SMALL LETTER EPSILON" ) 950 : ( "ζ" U+03B6 Ll 1 "GREEK SMALL LETTER 
ZETA" ) 951 : ( "η" U+03B7 Ll 1 "GREEK SMALL LETTER ETA" ) 952 : ( "θ" U+03B8 Ll 1 "GREEK SMALL LETTER THETA" ) 953 : ( "ι" U+03B9 Ll 1 "GREEK SMALL LETTER IOTA" ) 954 : ( "κ" U+03BA Ll 1 "GREEK SMALL LETTER KAPPA" ) 955 : ( "λ" U+03BB Ll 1 "GREEK SMALL LETTER LAMDA" ) 956 : ( "μ" U+03BC Ll 1 "GREEK SMALL LETTER MU" ) 957 : ( "ν" U+03BD Ll 1 "GREEK SMALL LETTER NU" ) 958 : ( "ξ" U+03BE Ll 1 "GREEK SMALL LETTER XI" ) 959 : ( "ο" U+03BF Ll 1 "GREEK SMALL LETTER OMICRON" ) 960 : ( "π" U+03C0 Ll 1 "GREEK SMALL LETTER PI" ) 961 : ( "ρ" U+03C1 Ll 1 "GREEK SMALL LETTER RHO" ) 962 : ( "ς" U+03C2 Ll 1 "GREEK SMALL LETTER FINAL SIGMA" ) 963 : ( "σ" U+03C3 Ll 1 "GREEK SMALL LETTER SIGMA" ) 964 : ( "τ" U+03C4 Ll 1 "GREEK SMALL LETTER TAU" ) 965 : ( "υ" U+03C5 Ll 1 "GREEK SMALL LETTER UPSILON" ) 966 : ( "φ" U+03C6 Ll 1 "GREEK SMALL LETTER PHI" ) 967 : ( "χ" U+03C7 Ll 1 "GREEK SMALL LETTER CHI" ) 968 : ( "ψ" U+03C8 Ll 1 "GREEK SMALL LETTER PSI" ) 969 : ( "ω" U+03C9 Ll 1 "GREEK SMALL LETTER OMEGA" ) 970 : ( "ϊ" U+03CA Ll 1 "GREEK SMALL LETTER IOTA WITH DIALYTIKA" ) 971 : ( "ϋ" U+03CB Ll 1 "GREEK SMALL LETTER UPSILON WITH DIALYTIKA" ) 972 : ( "ό" U+03CC Ll 1 "GREEK SMALL LETTER OMICRON WITH TONOS" ) 973 : ( "ύ" U+03CD Ll 1 "GREEK SMALL LETTER UPSILON WITH TONOS" ) 974 : ( "ώ" U+03CE Ll 1 "GREEK SMALL LETTER OMEGA WITH TONOS" ) 975 : ( "Ϗ" U+03CF Lu 1 "GREEK CAPITAL KAI SYMBOL" ) 976 : ( "ϐ" U+03D0 Ll 1 "GREEK BETA SYMBOL" ) 977 : ( "ϑ" U+03D1 Ll 1 "GREEK THETA SYMBOL" ) 978 : ( "ϒ" U+03D2 Lu 1 "GREEK UPSILON WITH HOOK SYMBOL" ) 979 : ( "ϓ" U+03D3 Lu 1 "GREEK UPSILON WITH ACUTE AND HOOK SYMBOL" ) 980 : ( "ϔ" U+03D4 Lu 1 "GREEK UPSILON WITH DIAERESIS AND HOOK SYMBOL" ) 981 : ( "ϕ" U+03D5 Ll 1 "GREEK PHI SYMBOL" ) 982 : ( "ϖ" U+03D6 Ll 1 "GREEK PI SYMBOL" ) 983 : ( "ϗ" U+03D7 Ll 1 "GREEK KAI SYMBOL" ) 984 : ( "Ϙ" U+03D8 Lu 1 "GREEK LETTER ARCHAIC KOPPA" ) 985 : ( "ϙ" U+03D9 Ll 1 "GREEK SMALL LETTER ARCHAIC KOPPA" ) 986 : ( "Ϛ" U+03DA Lu 1 "GREEK LETTER STIGMA" ) 987 : ( "ϛ" U+03DB Ll 
1 "GREEK SMALL LETTER STIGMA" ) 988 : ( "Ϝ" U+03DC Lu 1 "GREEK LETTER DIGAMMA" ) 989 : ( "ϝ" U+03DD Ll 1 "GREEK SMALL LETTER DIGAMMA" ) 990 : ( "Ϟ" U+03DE Lu 1 "GREEK LETTER KOPPA" ) 991 : ( "ϟ" U+03DF Ll 1 "GREEK SMALL LETTER KOPPA" ) 992 : ( "Ϡ" U+03E0 Lu 1 "GREEK LETTER SAMPI" ) 993 : ( "ϡ" U+03E1 Ll 1 "GREEK SMALL LETTER SAMPI" ) 994 : ( "Ϣ" U+03E2 Lu 1 "COPTIC CAPITAL LETTER SHEI" ) 995 : ( "ϣ" U+03E3 Ll 1 "COPTIC SMALL LETTER SHEI" ) 996 : ( "Ϥ" U+03E4 Lu 1 "COPTIC CAPITAL LETTER FEI" ) 997 : ( "ϥ" U+03E5 Ll 1 "COPTIC SMALL LETTER FEI" ) 998 : ( "Ϧ" U+03E6 Lu 1 "COPTIC CAPITAL LETTER KHEI" ) 999 : ( "ϧ" U+03E7 Ll 1 "COPTIC SMALL LETTER KHEI" ) 1000 : ( "Ϩ" U+03E8 Lu 1 "COPTIC CAPITAL LETTER HORI" ) 1001 : ( "ϩ" U+03E9 Ll 1 "COPTIC SMALL LETTER HORI" ) 1002 : ( "Ϫ" U+03EA Lu 1 "COPTIC CAPITAL LETTER GANGIA" ) 1003 : ( "ϫ" U+03EB Ll 1 "COPTIC SMALL LETTER GANGIA" ) 1004 : ( "Ϭ" U+03EC Lu 1 "COPTIC CAPITAL LETTER SHIMA" ) 1005 : ( "ϭ" U+03ED Ll 1 "COPTIC SMALL LETTER SHIMA" ) 1006 : ( "Ϯ" U+03EE Lu 1 "COPTIC CAPITAL LETTER DEI" ) 1007 : ( "ϯ" U+03EF Ll 1 "COPTIC SMALL LETTER DEI" ) 1008 : ( "ϰ" U+03F0 Ll 1 "GREEK KAPPA SYMBOL" ) ... /* Add character intervals. 
.UnicodeCharacterInterval codepointFrom codepointTo name isExpanded */ ooRexx> .unicode~characterIntervals== an UnicodeCharacterIntervalSupplier 1 : (U+3400..U+4DBF "CJK UNIFIED IDEOGRAPH-*" 6591 characters) 2 : (U+4E00..U+9FFF "CJK UNIFIED IDEOGRAPH-*" 20991 characters) 3 : (U+F900..U+FA6D "CJK COMPATIBILITY IDEOGRAPH-*" 365 characters) 4 : (U+FA70..U+FAD9 "CJK COMPATIBILITY IDEOGRAPH-*" 105 characters) 5 : (U+17000..U+187F7 "TANGUT IDEOGRAPH-*" 6135 characters) 6 : (U+18B00..U+18CD5 "KHITAN SMALL SCRIPT CHARACTER-*" 469 characters) 7 : (U+18D00..U+18D08 "TANGUT IDEOGRAPH-*" 8 characters) 8 : (U+1B170..U+1B2FB "NUSHU CHARACTER-*" 395 characters) 9 : (U+20000..U+2A6DF "CJK UNIFIED IDEOGRAPH-*" 42719 characters) 10 : (U+2A700..U+2B739 "CJK UNIFIED IDEOGRAPH-*" 4153 characters) 11 : (U+2B740..U+2B81D "CJK UNIFIED IDEOGRAPH-*" 221 characters) 12 : (U+2B820..U+2CEA1 "CJK UNIFIED IDEOGRAPH-*" 5761 characters) 13 : (U+2CEB0..U+2EBE0 "CJK UNIFIED IDEOGRAPH-*" 7472 characters) 14 : (U+2EBF0..U+2EE5D "CJK UNIFIED IDEOGRAPH-*" 621 characters) 15 : (U+2F800..U+2FA1D "CJK COMPATIBILITY IDEOGRAPH-*" 541 characters) 16 : (U+30000..U+3134A "CJK UNIFIED IDEOGRAPH-*" 4938 characters) 17 : (U+31350..U+323AF "CJK UNIFIED IDEOGRAPH-*" 4191 characters) -- Information about Unicode: -- Remove dataDirectory because the value is different between Windows and macOS/Linux ooRexx> .Unicode~informations~~remove("dataDirectory")= a Directory (12 items) 'characterIntervals' : (an UnicodeCharacterIntervalSupplier count=17 notExpanded:17 intervals, 105693 characters) 'characters' : (an UnicodeCharacterSupplier count=44189 size=918000) 'maxCodepoint' : 1114111 'memorizeTranscodings' : 0 'memorizeTransformations' : 0 'systemIsLittleEndian' : 1 'totalCharacterNameAliases' : 473 'totalCharactersLoaded' : 149813 'totalIntervalCharacters' : 105693 'totalIntervalCharactersNotExpanded' : 105693 'unckeckedConversionToString' : 1 'version' : '15.1.0' --
=============================================================================== -- 2021 September 13, updated September 22 /* Add character information. The loading of the character names is optional. By default, they are not loaded. From ooRexxShell, execute: call loadUnicodeCharacterNames By default, the character intervals are not expanded. From ooRexxShell, execute: call expandUnicodeCharacterIntervals The other character properties are always loaded (provided by utf8proc) .Unicode characters --> supplier of UnicodeCharacter character(index) --> UnicodeCharacter (index can be a loose matching name (UAX44-LM2) or a codepoint) characterIntervals --> supplier of UnicodeCharacterInterval .UnicodeCharacter codepoint --> integer -1..1114111 name --> string aliases --> array of .UnicodeAlias bidiClass --> enum 1, 2, 3, ... bidiClassName --> enum 'L', 'LRE', 'LRO', ... boundClass --> enum 0, 1, 2, ... boundClassName --> enum 'START', 'OTHER', 'CR', ... category --> enum 0, 1, 2, ... categoryName --> enum 'Cn', 'Lu', 'Ll', ... charWidth --> integer combiningClass --> integer 0..254 controlBoundary --> boolean decompType --> enum 0, 1, 2, ... decompTypeName --> enum '<none>', '<font>', '<nobreak>', ... ignorable --> boolean Examples: */ -- All the Unicode characters (sparse array).
ooRexx> .unicode~characters== an UnicodeCharacterSupplier 0 : ( "" U+0000 Cc 0 "", "NULL", "NUL" ) 1 : ( "" U+0001 Cc 0 "", "START OF HEADING", "SOH" ) 2 : ( "" U+0002 Cc 0 "", "START OF TEXT", "STX" ) 3 : ( "" U+0003 Cc 0 "", "END OF TEXT", "ETX" ) 4 : ( "" U+0004 Cc 0 "", "END OF TRANSMISSION", "EOT" ) 5 : ( "" U+0005 Cc 0 "", "ENQUIRY", "ENQ" ) 6 : ( "" U+0006 Cc 0 "", "ACKNOWLEDGE", "ACK" ) 7 : ( "" U+0007 Cc 0 "", "ALERT", "BEL" ) 8 : ( "" U+0008 Cc 0 "", "BACKSPACE", "BS" ) 9 : ( "" U+0009 Cc 0 "", "CHARACTER TABULATION", "HORIZONTAL TABULATION", "HT", "TAB" ) 10 : ( "" U+000A Cc 0 "", "LINE FEED", "NEW LINE", "END OF LINE", "LF", "NL", "EOL" ) 11 : ( "" U+000B Cc 0 "", "LINE TABULATION", "VERTICAL TABULATION", "VT" ) 12 : ( "" U+000C Cc 0 "", "FORM FEED", "FF" ) 13 : ( "" U+000D Cc 0 "", "CARRIAGE RETURN", "CR" ) 14 : ( "" U+000E Cc 0 "", "SHIFT OUT", "LOCKING-SHIFT ONE", "SO" ) 15 : ( "" U+000F Cc 0 "", "SHIFT IN", "LOCKING-SHIFT ZERO", "SI" ) 16 : ( "" U+0010 Cc 0 "", "DATA LINK ESCAPE", "DLE" ) 17 : ( "" U+0011 Cc 0 "", "DEVICE CONTROL ONE", "DC1" ) 18 : ( "" U+0012 Cc 0 "", "DEVICE CONTROL TWO", "DC2" ) 19 : ( "" U+0013 Cc 0 "", "DEVICE CONTROL THREE", "DC3" ) 20 : ( "" U+0014 Cc 0 "", "DEVICE CONTROL FOUR", "DC4" ) 21 : ( "" U+0015 Cc 0 "", "NEGATIVE ACKNOWLEDGE", "NAK" ) 22 : ( "" U+0016 Cc 0 "", "SYNCHRONOUS IDLE", "SYN" ) 23 : ( "" U+0017 Cc 0 "", "END OF TRANSMISSION BLOCK", "ETB" ) 24 : ( "" U+0018 Cc 0 "", "CANCEL", "CAN" ) 25 : ( "" U+0019 Cc 0 "", "END OF MEDIUM", "EOM", "EM" ) 26 : ( "" U+001A Cc 0 "", "SUBSTITUTE", "SUB" ) 27 : ( "" U+001B Cc 0 "", "ESCAPE", "ESC" ) 28 : ( "" U+001C Cc 0 "", "INFORMATION SEPARATOR FOUR", "FILE SEPARATOR", "FS" ) 29 : ( "" U+001D Cc 0 "", "INFORMATION SEPARATOR THREE", "GROUP SEPARATOR", "GS" ) 30 : ( "" U+001E Cc 0 "", "INFORMATION SEPARATOR TWO", "RECORD SEPARATOR", "RS" ) 31 : ( "" U+001F Cc 0 "", "INFORMATION SEPARATOR ONE", "UNIT SEPARATOR", "US" ) 32 : ( " " U+0020 Zs 1 "SPACE", "SP" ) 33 : ( "!" 
U+0021 Po 1 "EXCLAMATION MARK" ) 34 : ( """ U+0022 Po 1 "QUOTATION MARK" ) 35 : ( "#" U+0023 Po 1 "NUMBER SIGN" ) 36 : ( "$" U+0024 Sc 1 "DOLLAR SIGN" ) 37 : ( "%" U+0025 Po 1 "PERCENT SIGN" ) 38 : ( "&" U+0026 Po 1 "AMPERSAND" ) 39 : ( "'" U+0027 Po 1 "APOSTROPHE" ) 40 : ( "(" U+0028 Ps 1 "LEFT PARENTHESIS" ) 41 : ( ")" U+0029 Pe 1 "RIGHT PARENTHESIS" ) 42 : ( "*" U+002A Po 1 "ASTERISK" ) 43 : ( "+" U+002B Sm 1 "PLUS SIGN" ) 44 : ( "," U+002C Po 1 "COMMA" ) 45 : ( "-" U+002D Pd 1 "HYPHEN-MINUS" ) 46 : ( "." U+002E Po 1 "FULL STOP" ) 47 : ( "/" U+002F Po 1 "SOLIDUS" ) 48 : ( "0" U+0030 Nd 1 "DIGIT ZERO" ) 49 : ( "1" U+0031 Nd 1 "DIGIT ONE" ) 50 : ( "2" U+0032 Nd 1 "DIGIT TWO" ) 51 : ( "3" U+0033 Nd 1 "DIGIT THREE" ) 52 : ( "4" U+0034 Nd 1 "DIGIT FOUR" ) 53 : ( "5" U+0035 Nd 1 "DIGIT FIVE" ) 54 : ( "6" U+0036 Nd 1 "DIGIT SIX" ) 55 : ( "7" U+0037 Nd 1 "DIGIT SEVEN" ) 56 : ( "8" U+0038 Nd 1 "DIGIT EIGHT" ) 57 : ( "9" U+0039 Nd 1 "DIGIT NINE" ) 58 : ( ":" U+003A Po 1 "COLON" ) 59 : ( ";" U+003B Po 1 "SEMICOLON" ) 60 : ( "<" U+003C Sm 1 "LESS-THAN SIGN" ) 61 : ( "=" U+003D Sm 1 "EQUALS SIGN" ) 62 : ( ">" U+003E Sm 1 "GREATER-THAN SIGN" ) 63 : ( "?" 
U+003F Po 1 "QUESTION MARK" ) 64 : ( "@" U+0040 Po 1 "COMMERCIAL AT" ) 65 : ( "A" U+0041 Lu 1 "LATIN CAPITAL LETTER A" ) 66 : ( "B" U+0042 Lu 1 "LATIN CAPITAL LETTER B" ) 67 : ( "C" U+0043 Lu 1 "LATIN CAPITAL LETTER C" ) 68 : ( "D" U+0044 Lu 1 "LATIN CAPITAL LETTER D" ) 69 : ( "E" U+0045 Lu 1 "LATIN CAPITAL LETTER E" ) 70 : ( "F" U+0046 Lu 1 "LATIN CAPITAL LETTER F" ) 71 : ( "G" U+0047 Lu 1 "LATIN CAPITAL LETTER G" ) 72 : ( "H" U+0048 Lu 1 "LATIN CAPITAL LETTER H" ) 73 : ( "I" U+0049 Lu 1 "LATIN CAPITAL LETTER I" ) 74 : ( "J" U+004A Lu 1 "LATIN CAPITAL LETTER J" ) 75 : ( "K" U+004B Lu 1 "LATIN CAPITAL LETTER K" ) 76 : ( "L" U+004C Lu 1 "LATIN CAPITAL LETTER L" ) 77 : ( "M" U+004D Lu 1 "LATIN CAPITAL LETTER M" ) 78 : ( "N" U+004E Lu 1 "LATIN CAPITAL LETTER N" ) 79 : ( "O" U+004F Lu 1 "LATIN CAPITAL LETTER O" ) 80 : ( "P" U+0050 Lu 1 "LATIN CAPITAL LETTER P" ) 81 : ( "Q" U+0051 Lu 1 "LATIN CAPITAL LETTER Q" ) 82 : ( "R" U+0052 Lu 1 "LATIN CAPITAL LETTER R" ) 83 : ( "S" U+0053 Lu 1 "LATIN CAPITAL LETTER S" ) 84 : ( "T" U+0054 Lu 1 "LATIN CAPITAL LETTER T" ) 85 : ( "U" U+0055 Lu 1 "LATIN CAPITAL LETTER U" ) 86 : ( "V" U+0056 Lu 1 "LATIN CAPITAL LETTER V" ) 87 : ( "W" U+0057 Lu 1 "LATIN CAPITAL LETTER W" ) 88 : ( "X" U+0058 Lu 1 "LATIN CAPITAL LETTER X" ) 89 : ( "Y" U+0059 Lu 1 "LATIN CAPITAL LETTER Y" ) 90 : ( "Z" U+005A Lu 1 "LATIN CAPITAL LETTER Z" ) 91 : ( "[" U+005B Ps 1 "LEFT SQUARE BRACKET" ) 92 : ( "\" U+005C Po 1 "REVERSE SOLIDUS" ) 93 : ( "]" U+005D Pe 1 "RIGHT SQUARE BRACKET" ) 94 : ( "^" U+005E Sk 1 "CIRCUMFLEX ACCENT" ) 95 : ( "_" U+005F Pc 1 "LOW LINE" ) 96 : ( "`" U+0060 Sk 1 "GRAVE ACCENT" ) 97 : ( "a" U+0061 Ll 1 "LATIN SMALL LETTER A" ) 98 : ( "b" U+0062 Ll 1 "LATIN SMALL LETTER B" ) 99 : ( "c" U+0063 Ll 1 "LATIN SMALL LETTER C" ) 100 : ( "d" U+0064 Ll 1 "LATIN SMALL LETTER D" ) 101 : ( "e" U+0065 Ll 1 "LATIN SMALL LETTER E" ) 102 : ( "f" U+0066 Ll 1 "LATIN SMALL LETTER F" ) 103 : ( "g" U+0067 Ll 1 "LATIN SMALL LETTER G" ) 104 : ( "h" U+0068 Ll 1 
"LATIN SMALL LETTER H" ) 105 : ( "i" U+0069 Ll 1 "LATIN SMALL LETTER I" ) 106 : ( "j" U+006A Ll 1 "LATIN SMALL LETTER J" ) 107 : ( "k" U+006B Ll 1 "LATIN SMALL LETTER K" ) 108 : ( "l" U+006C Ll 1 "LATIN SMALL LETTER L" ) 109 : ( "m" U+006D Ll 1 "LATIN SMALL LETTER M" ) 110 : ( "n" U+006E Ll 1 "LATIN SMALL LETTER N" ) 111 : ( "o" U+006F Ll 1 "LATIN SMALL LETTER O" ) 112 : ( "p" U+0070 Ll 1 "LATIN SMALL LETTER P" ) 113 : ( "q" U+0071 Ll 1 "LATIN SMALL LETTER Q" ) 114 : ( "r" U+0072 Ll 1 "LATIN SMALL LETTER R" ) 115 : ( "s" U+0073 Ll 1 "LATIN SMALL LETTER S" ) 116 : ( "t" U+0074 Ll 1 "LATIN SMALL LETTER T" ) 117 : ( "u" U+0075 Ll 1 "LATIN SMALL LETTER U" ) 118 : ( "v" U+0076 Ll 1 "LATIN SMALL LETTER V" ) 119 : ( "w" U+0077 Ll 1 "LATIN SMALL LETTER W" ) 120 : ( "x" U+0078 Ll 1 "LATIN SMALL LETTER X" ) 121 : ( "y" U+0079 Ll 1 "LATIN SMALL LETTER Y" ) 122 : ( "z" U+007A Ll 1 "LATIN SMALL LETTER Z" ) 123 : ( "{" U+007B Ps 1 "LEFT CURLY BRACKET" ) 124 : ( "|" U+007C Sm 1 "VERTICAL LINE" ) 125 : ( "}" U+007D Pe 1 "RIGHT CURLY BRACKET" ) 126 : ( "~" U+007E Sm 1 "TILDE" ) 127 : ( "" U+007F Cc 0 "", "DELETE", "DEL" ) 128 : ( "" U+0080 Cc 0 "", "PADDING CHARACTER", "PAD" ) 129 : ( "" U+0081 Cc 0 "", "HIGH OCTET PRESET", "HOP" ) 130 : ( "" U+0082 Cc 0 "", "BREAK PERMITTED HERE", "BPH" ) 131 : ( "" U+0083 Cc 0 "", "NO BREAK HERE", "NBH" ) 132 : ( "" U+0084 Cc 0 "", "INDEX", "IND" ) 133 : ( " " U+0085 Cc 0 "", "NEXT LINE", "NEL" ) 134 : ( "" U+0086 Cc 0 "", "START OF SELECTED AREA", "SSA" ) 135 : ( "" U+0087 Cc 0 "", "END OF SELECTED AREA", "ESA" ) 136 : ( "" U+0088 Cc 0 "", "CHARACTER TABULATION SET", "HORIZONTAL TABULATION SET", "HTS" ) 137 : ( "" U+0089 Cc 0 "", "CHARACTER TABULATION WITH JUSTIFICATION", "HORIZONTAL TABULATION WITH JUSTIFICATION", "HTJ" ) 138 : ( "" U+008A Cc 0 "", "LINE TABULATION SET", "VERTICAL TABULATION SET", "VTS" ) 139 : ( "" U+008B Cc 0 "", "PARTIAL LINE FORWARD", "PARTIAL LINE DOWN", "PLD" ) 140 : ( "" U+008C Cc 0 "", "PARTIAL LINE BACKWARD", "PARTIAL 
LINE UP", "PLU" ) 141 : ( "" U+008D Cc 0 "", "REVERSE LINE FEED", "REVERSE INDEX", "RI" ) 142 : ( "" U+008E Cc 0 "", "SINGLE SHIFT TWO", "SINGLE-SHIFT-2", "SS2" ) 143 : ( "" U+008F Cc 0 "", "SINGLE SHIFT THREE", "SINGLE-SHIFT-3", "SS3" ) 144 : ( "" U+0090 Cc 0 "", "DEVICE CONTROL STRING", "DCS" ) 145 : ( "" U+0091 Cc 0 "", "PRIVATE USE ONE", "PRIVATE USE-1", "PU1" ) 146 : ( "" U+0092 Cc 0 "", "PRIVATE USE TWO", "PRIVATE USE-2", "PU2" ) 147 : ( "" U+0093 Cc 0 "", "SET TRANSMIT STATE", "STS" ) 148 : ( "" U+0094 Cc 0 "", "CANCEL CHARACTER", "CCH" ) 149 : ( "" U+0095 Cc 0 "", "MESSAGE WAITING", "MW" ) 150 : ( "" U+0096 Cc 0 "", "START OF GUARDED AREA", "START OF PROTECTED AREA", "SPA" ) 151 : ( "" U+0097 Cc 0 "", "END OF GUARDED AREA", "END OF PROTECTED AREA", "EPA" ) 152 : ( "" U+0098 Cc 0 "", "START OF STRING", "SOS" ) 153 : ( "" U+0099 Cc 0 "", "SINGLE GRAPHIC CHARACTER INTRODUCER", "SGC" ) 154 : ( "" U+009A Cc 0 "", "SINGLE CHARACTER INTRODUCER", "SCI" ) 155 : ( "" U+009B Cc 0 "", "CONTROL SEQUENCE INTRODUCER", "CSI" ) 156 : ( "" U+009C Cc 0 "", "STRING TERMINATOR", "ST" ) 157 : ( "" U+009D Cc 0 "", "OPERATING SYSTEM COMMAND", "OSC" ) 158 : ( "" U+009E Cc 0 "", "PRIVACY MESSAGE", "PM" ) 159 : ( "" U+009F Cc 0 "", "APPLICATION PROGRAM COMMAND", "APC" ) 160 : ( " " U+00A0 Zs 1 "NO-BREAK SPACE", "NBSP" ) 161 : ( "¡" U+00A1 Po 1 "INVERTED EXCLAMATION MARK" ) 162 : ( "¢" U+00A2 Sc 1 "CENT SIGN" ) 163 : ( "£" U+00A3 Sc 1 "POUND SIGN" ) 164 : ( "¤" U+00A4 Sc 1 "CURRENCY SIGN" ) 165 : ( "¥" U+00A5 Sc 1 "YEN SIGN" ) 166 : ( "¦" U+00A6 So 1 "BROKEN BAR" ) 167 : ( "§" U+00A7 Po 1 "SECTION SIGN" ) 168 : ( "¨" U+00A8 Sk 1 "DIAERESIS" ) 169 : ( "©" U+00A9 So 1 "COPYRIGHT SIGN" ) 170 : ( "ª" U+00AA Lo 1 "FEMININE ORDINAL INDICATOR" ) 171 : ( "«" U+00AB Pi 1 "LEFT-POINTING DOUBLE ANGLE QUOTATION MARK" ) 172 : ( "¬" U+00AC Sm 1 "NOT SIGN" ) 173 : ( "" U+00AD Cf 1 "SOFT HYPHEN", "SHY" ) 174 : ( "®" U+00AE So 1 "REGISTERED SIGN" ) 175 : ( "¯" U+00AF Sk 1 "MACRON" ) 176 : ( "°" U+00B0 
So 1 "DEGREE SIGN" ) 177 : ( "±" U+00B1 Sm 1 "PLUS-MINUS SIGN" ) 178 : ( "²" U+00B2 No 1 "SUPERSCRIPT TWO" ) 179 : ( "³" U+00B3 No 1 "SUPERSCRIPT THREE" ) 180 : ( "´" U+00B4 Sk 1 "ACUTE ACCENT" ) 181 : ( "µ" U+00B5 Ll 1 "MICRO SIGN" ) 182 : ( "¶" U+00B6 Po 1 "PILCROW SIGN" ) 183 : ( "·" U+00B7 Po 1 "MIDDLE DOT" ) 184 : ( "¸" U+00B8 Sk 1 "CEDILLA" ) 185 : ( "¹" U+00B9 No 1 "SUPERSCRIPT ONE" ) 186 : ( "º" U+00BA Lo 1 "MASCULINE ORDINAL INDICATOR" ) 187 : ( "»" U+00BB Pf 1 "RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK" ) 188 : ( "¼" U+00BC No 1 "VULGAR FRACTION ONE QUARTER" ) 189 : ( "½" U+00BD No 1 "VULGAR FRACTION ONE HALF" ) 190 : ( "¾" U+00BE No 1 "VULGAR FRACTION THREE QUARTERS" ) 191 : ( "¿" U+00BF Po 1 "INVERTED QUESTION MARK" ) 192 : ( "À" U+00C0 Lu 1 "LATIN CAPITAL LETTER A WITH GRAVE" ) 193 : ( "Á" U+00C1 Lu 1 "LATIN CAPITAL LETTER A WITH ACUTE" ) 194 : ( "Â" U+00C2 Lu 1 "LATIN CAPITAL LETTER A WITH CIRCUMFLEX" ) 195 : ( "Ã" U+00C3 Lu 1 "LATIN CAPITAL LETTER A WITH TILDE" ) 196 : ( "Ä" U+00C4 Lu 1 "LATIN CAPITAL LETTER A WITH DIAERESIS" ) 197 : ( "Å" U+00C5 Lu 1 "LATIN CAPITAL LETTER A WITH RING ABOVE" ) 198 : ( "Æ" U+00C6 Lu 1 "LATIN CAPITAL LETTER AE" ) 199 : ( "Ç" U+00C7 Lu 1 "LATIN CAPITAL LETTER C WITH CEDILLA" ) 200 : ( "È" U+00C8 Lu 1 "LATIN CAPITAL LETTER E WITH GRAVE" ) 201 : ( "É" U+00C9 Lu 1 "LATIN CAPITAL LETTER E WITH ACUTE" ) 202 : ( "Ê" U+00CA Lu 1 "LATIN CAPITAL LETTER E WITH CIRCUMFLEX" ) 203 : ( "Ë" U+00CB Lu 1 "LATIN CAPITAL LETTER E WITH DIAERESIS" ) 204 : ( "Ì" U+00CC Lu 1 "LATIN CAPITAL LETTER I WITH GRAVE" ) 205 : ( "Í" U+00CD Lu 1 "LATIN CAPITAL LETTER I WITH ACUTE" ) 206 : ( "Î" U+00CE Lu 1 "LATIN CAPITAL LETTER I WITH CIRCUMFLEX" ) 207 : ( "Ï" U+00CF Lu 1 "LATIN CAPITAL LETTER I WITH DIAERESIS" ) 208 : ( "Ð" U+00D0 Lu 1 "LATIN CAPITAL LETTER ETH" ) 209 : ( "Ñ" U+00D1 Lu 1 "LATIN CAPITAL LETTER N WITH TILDE" ) 210 : ( "Ò" U+00D2 Lu 1 "LATIN CAPITAL LETTER O WITH GRAVE" ) 211 : ( "Ó" U+00D3 Lu 1 "LATIN CAPITAL LETTER O WITH ACUTE" ) 212 
: ( "Ô" U+00D4 Lu 1 "LATIN CAPITAL LETTER O WITH CIRCUMFLEX" ) 213 : ( "Õ" U+00D5 Lu 1 "LATIN CAPITAL LETTER O WITH TILDE" ) 214 : ( "Ö" U+00D6 Lu 1 "LATIN CAPITAL LETTER O WITH DIAERESIS" ) 215 : ( "×" U+00D7 Sm 1 "MULTIPLICATION SIGN" ) 216 : ( "Ø" U+00D8 Lu 1 "LATIN CAPITAL LETTER O WITH STROKE" ) 217 : ( "Ù" U+00D9 Lu 1 "LATIN CAPITAL LETTER U WITH GRAVE" ) 218 : ( "Ú" U+00DA Lu 1 "LATIN CAPITAL LETTER U WITH ACUTE" ) 219 : ( "Û" U+00DB Lu 1 "LATIN CAPITAL LETTER U WITH CIRCUMFLEX" ) 220 : ( "Ü" U+00DC Lu 1 "LATIN CAPITAL LETTER U WITH DIAERESIS" ) 221 : ( "Ý" U+00DD Lu 1 "LATIN CAPITAL LETTER Y WITH ACUTE" ) 222 : ( "Þ" U+00DE Lu 1 "LATIN CAPITAL LETTER THORN" ) 223 : ( "ß" U+00DF Ll 1 "LATIN SMALL LETTER SHARP S" ) 224 : ( "à" U+00E0 Ll 1 "LATIN SMALL LETTER A WITH GRAVE" ) 225 : ( "á" U+00E1 Ll 1 "LATIN SMALL LETTER A WITH ACUTE" ) 226 : ( "â" U+00E2 Ll 1 "LATIN SMALL LETTER A WITH CIRCUMFLEX" ) 227 : ( "ã" U+00E3 Ll 1 "LATIN SMALL LETTER A WITH TILDE" ) 228 : ( "ä" U+00E4 Ll 1 "LATIN SMALL LETTER A WITH DIAERESIS" ) 229 : ( "å" U+00E5 Ll 1 "LATIN SMALL LETTER A WITH RING ABOVE" ) 230 : ( "æ" U+00E6 Ll 1 "LATIN SMALL LETTER AE" ) 231 : ( "ç" U+00E7 Ll 1 "LATIN SMALL LETTER C WITH CEDILLA" ) 232 : ( "è" U+00E8 Ll 1 "LATIN SMALL LETTER E WITH GRAVE" ) 233 : ( "é" U+00E9 Ll 1 "LATIN SMALL LETTER E WITH ACUTE" ) 234 : ( "ê" U+00EA Ll 1 "LATIN SMALL LETTER E WITH CIRCUMFLEX" ) 235 : ( "ë" U+00EB Ll 1 "LATIN SMALL LETTER E WITH DIAERESIS" ) 236 : ( "ì" U+00EC Ll 1 "LATIN SMALL LETTER I WITH GRAVE" ) 237 : ( "í" U+00ED Ll 1 "LATIN SMALL LETTER I WITH ACUTE" ) 238 : ( "î" U+00EE Ll 1 "LATIN SMALL LETTER I WITH CIRCUMFLEX" ) 239 : ( "ï" U+00EF Ll 1 "LATIN SMALL LETTER I WITH DIAERESIS" ) 240 : ( "ð" U+00F0 Ll 1 "LATIN SMALL LETTER ETH" ) 241 : ( "ñ" U+00F1 Ll 1 "LATIN SMALL LETTER N WITH TILDE" ) 242 : ( "ò" U+00F2 Ll 1 "LATIN SMALL LETTER O WITH GRAVE" ) 243 : ( "ó" U+00F3 Ll 1 "LATIN SMALL LETTER O WITH ACUTE" ) 244 : ( "ô" U+00F4 Ll 1 "LATIN SMALL LETTER O WITH 
CIRCUMFLEX" ) 245 : ( "õ" U+00F5 Ll 1 "LATIN SMALL LETTER O WITH TILDE" ) 246 : ( "ö" U+00F6 Ll 1 "LATIN SMALL LETTER O WITH DIAERESIS" ) 247 : ( "÷" U+00F7 Sm 1 "DIVISION SIGN" ) 248 : ( "ø" U+00F8 Ll 1 "LATIN SMALL LETTER O WITH STROKE" ) 249 : ( "ù" U+00F9 Ll 1 "LATIN SMALL LETTER U WITH GRAVE" ) 250 : ( "ú" U+00FA Ll 1 "LATIN SMALL LETTER U WITH ACUTE" ) 251 : ( "û" U+00FB Ll 1 "LATIN SMALL LETTER U WITH CIRCUMFLEX" ) 252 : ( "ü" U+00FC Ll 1 "LATIN SMALL LETTER U WITH DIAERESIS" ) 253 : ( "ý" U+00FD Ll 1 "LATIN SMALL LETTER Y WITH ACUTE" ) 254 : ( "þ" U+00FE Ll 1 "LATIN SMALL LETTER THORN" ) 255 : ( "ÿ" U+00FF Ll 1 "LATIN SMALL LETTER Y WITH DIAERESIS" ) 256 : ( "Ā" U+0100 Lu 1 "LATIN CAPITAL LETTER A WITH MACRON" ) 257 : ( "ā" U+0101 Ll 1 "LATIN SMALL LETTER A WITH MACRON" ) 258 : ( "Ă" U+0102 Lu 1 "LATIN CAPITAL LETTER A WITH BREVE" ) 259 : ( "ă" U+0103 Ll 1 "LATIN SMALL LETTER A WITH BREVE" ) 260 : ( "Ą" U+0104 Lu 1 "LATIN CAPITAL LETTER A WITH OGONEK" ) 261 : ( "ą" U+0105 Ll 1 "LATIN SMALL LETTER A WITH OGONEK" ) 262 : ( "Ć" U+0106 Lu 1 "LATIN CAPITAL LETTER C WITH ACUTE" ) 263 : ( "ć" U+0107 Ll 1 "LATIN SMALL LETTER C WITH ACUTE" ) 264 : ( "Ĉ" U+0108 Lu 1 "LATIN CAPITAL LETTER C WITH CIRCUMFLEX" ) 265 : ( "ĉ" U+0109 Ll 1 "LATIN SMALL LETTER C WITH CIRCUMFLEX" ) 266 : ( "Ċ" U+010A Lu 1 "LATIN CAPITAL LETTER C WITH DOT ABOVE" ) 267 : ( "ċ" U+010B Ll 1 "LATIN SMALL LETTER C WITH DOT ABOVE" ) 268 : ( "Č" U+010C Lu 1 "LATIN CAPITAL LETTER C WITH CARON" ) 269 : ( "č" U+010D Ll 1 "LATIN SMALL LETTER C WITH CARON" ) 270 : ( "Ď" U+010E Lu 1 "LATIN CAPITAL LETTER D WITH CARON" ) 271 : ( "ď" U+010F Ll 1 "LATIN SMALL LETTER D WITH CARON" ) 272 : ( "Đ" U+0110 Lu 1 "LATIN CAPITAL LETTER D WITH STROKE" ) 273 : ( "đ" U+0111 Ll 1 "LATIN SMALL LETTER D WITH STROKE" ) 274 : ( "Ē" U+0112 Lu 1 "LATIN CAPITAL LETTER E WITH MACRON" ) 275 : ( "ē" U+0113 Ll 1 "LATIN SMALL LETTER E WITH MACRON" ) 276 : ( "Ĕ" U+0114 Lu 1 "LATIN CAPITAL LETTER E WITH BREVE" ) 277 : ( "ĕ" U+0115 Ll 1 
"LATIN SMALL LETTER E WITH BREVE" ) 278 : ( "Ė" U+0116 Lu 1 "LATIN CAPITAL LETTER E WITH DOT ABOVE" ) 279 : ( "ė" U+0117 Ll 1 "LATIN SMALL LETTER E WITH DOT ABOVE" ) 280 : ( "Ę" U+0118 Lu 1 "LATIN CAPITAL LETTER E WITH OGONEK" ) 281 : ( "ę" U+0119 Ll 1 "LATIN SMALL LETTER E WITH OGONEK" ) 282 : ( "Ě" U+011A Lu 1 "LATIN CAPITAL LETTER E WITH CARON" ) 283 : ( "ě" U+011B Ll 1 "LATIN SMALL LETTER E WITH CARON" ) 284 : ( "Ĝ" U+011C Lu 1 "LATIN CAPITAL LETTER G WITH CIRCUMFLEX" ) 285 : ( "ĝ" U+011D Ll 1 "LATIN SMALL LETTER G WITH CIRCUMFLEX" ) 286 : ( "Ğ" U+011E Lu 1 "LATIN CAPITAL LETTER G WITH BREVE" ) 287 : ( "ğ" U+011F Ll 1 "LATIN SMALL LETTER G WITH BREVE" ) 288 : ( "Ġ" U+0120 Lu 1 "LATIN CAPITAL LETTER G WITH DOT ABOVE" ) 289 : ( "ġ" U+0121 Ll 1 "LATIN SMALL LETTER G WITH DOT ABOVE" ) 290 : ( "Ģ" U+0122 Lu 1 "LATIN CAPITAL LETTER G WITH CEDILLA" ) 291 : ( "ģ" U+0123 Ll 1 "LATIN SMALL LETTER G WITH CEDILLA" ) 292 : ( "Ĥ" U+0124 Lu 1 "LATIN CAPITAL LETTER H WITH CIRCUMFLEX" ) 293 : ( "ĥ" U+0125 Ll 1 "LATIN SMALL LETTER H WITH CIRCUMFLEX" ) 294 : ( "Ħ" U+0126 Lu 1 "LATIN CAPITAL LETTER H WITH STROKE" ) 295 : ( "ħ" U+0127 Ll 1 "LATIN SMALL LETTER H WITH STROKE" ) 296 : ( "Ĩ" U+0128 Lu 1 "LATIN CAPITAL LETTER I WITH TILDE" ) 297 : ( "ĩ" U+0129 Ll 1 "LATIN SMALL LETTER I WITH TILDE" ) 298 : ( "Ī" U+012A Lu 1 "LATIN CAPITAL LETTER I WITH MACRON" ) 299 : ( "ī" U+012B Ll 1 "LATIN SMALL LETTER I WITH MACRON" ) 300 : ( "Ĭ" U+012C Lu 1 "LATIN CAPITAL LETTER I WITH BREVE" ) 301 : ( "ĭ" U+012D Ll 1 "LATIN SMALL LETTER I WITH BREVE" ) 302 : ( "Į" U+012E Lu 1 "LATIN CAPITAL LETTER I WITH OGONEK" ) 303 : ( "į" U+012F Ll 1 "LATIN SMALL LETTER I WITH OGONEK" ) 304 : ( "İ" U+0130 Lu 1 "LATIN CAPITAL LETTER I WITH DOT ABOVE" ) 305 : ( "ı" U+0131 Ll 1 "LATIN SMALL LETTER DOTLESS I" ) 306 : ( "IJ" U+0132 Lu 1 "LATIN CAPITAL LIGATURE IJ" ) 307 : ( "ij" U+0133 Ll 1 "LATIN SMALL LIGATURE IJ" ) 308 : ( "Ĵ" U+0134 Lu 1 "LATIN CAPITAL LETTER J WITH CIRCUMFLEX" ) 309 : ( "ĵ" U+0135 Ll 1 "LATIN 
SMALL LETTER J WITH CIRCUMFLEX" ) 310 : ( "Ķ" U+0136 Lu 1 "LATIN CAPITAL LETTER K WITH CEDILLA" ) 311 : ( "ķ" U+0137 Ll 1 "LATIN SMALL LETTER K WITH CEDILLA" ) 312 : ( "ĸ" U+0138 Ll 1 "LATIN SMALL LETTER KRA" ) 313 : ( "Ĺ" U+0139 Lu 1 "LATIN CAPITAL LETTER L WITH ACUTE" ) 314 : ( "ĺ" U+013A Ll 1 "LATIN SMALL LETTER L WITH ACUTE" ) 315 : ( "Ļ" U+013B Lu 1 "LATIN CAPITAL LETTER L WITH CEDILLA" ) 316 : ( "ļ" U+013C Ll 1 "LATIN SMALL LETTER L WITH CEDILLA" ) 317 : ( "Ľ" U+013D Lu 1 "LATIN CAPITAL LETTER L WITH CARON" ) 318 : ( "ľ" U+013E Ll 1 "LATIN SMALL LETTER L WITH CARON" ) 319 : ( "Ŀ" U+013F Lu 1 "LATIN CAPITAL LETTER L WITH MIDDLE DOT" ) 320 : ( "ŀ" U+0140 Ll 1 "LATIN SMALL LETTER L WITH MIDDLE DOT" ) 321 : ( "Ł" U+0141 Lu 1 "LATIN CAPITAL LETTER L WITH STROKE" ) 322 : ( "ł" U+0142 Ll 1 "LATIN SMALL LETTER L WITH STROKE" ) 323 : ( "Ń" U+0143 Lu 1 "LATIN CAPITAL LETTER N WITH ACUTE" ) 324 : ( "ń" U+0144 Ll 1 "LATIN SMALL LETTER N WITH ACUTE" ) 325 : ( "Ņ" U+0145 Lu 1 "LATIN CAPITAL LETTER N WITH CEDILLA" ) 326 : ( "ņ" U+0146 Ll 1 "LATIN SMALL LETTER N WITH CEDILLA" ) 327 : ( "Ň" U+0147 Lu 1 "LATIN CAPITAL LETTER N WITH CARON" ) 328 : ( "ň" U+0148 Ll 1 "LATIN SMALL LETTER N WITH CARON" ) 329 : ( "ʼn" U+0149 Ll 1 "LATIN SMALL LETTER N PRECEDED BY APOSTROPHE" ) 330 : ( "Ŋ" U+014A Lu 1 "LATIN CAPITAL LETTER ENG" ) 331 : ( "ŋ" U+014B Ll 1 "LATIN SMALL LETTER ENG" ) 332 : ( "Ō" U+014C Lu 1 "LATIN CAPITAL LETTER O WITH MACRON" ) 333 : ( "ō" U+014D Ll 1 "LATIN SMALL LETTER O WITH MACRON" ) 334 : ( "Ŏ" U+014E Lu 1 "LATIN CAPITAL LETTER O WITH BREVE" ) 335 : ( "ŏ" U+014F Ll 1 "LATIN SMALL LETTER O WITH BREVE" ) 336 : ( "Ő" U+0150 Lu 1 "LATIN CAPITAL LETTER O WITH DOUBLE ACUTE" ) 337 : ( "ő" U+0151 Ll 1 "LATIN SMALL LETTER O WITH DOUBLE ACUTE" ) 338 : ( "Œ" U+0152 Lu 1 "LATIN CAPITAL LIGATURE OE" ) 339 : ( "œ" U+0153 Ll 1 "LATIN SMALL LIGATURE OE" ) 340 : ( "Ŕ" U+0154 Lu 1 "LATIN CAPITAL LETTER R WITH ACUTE" ) 341 : ( "ŕ" U+0155 Ll 1 "LATIN SMALL LETTER R WITH ACUTE" ) 342 : 
( "Ŗ" U+0156 Lu 1 "LATIN CAPITAL LETTER R WITH CEDILLA" ) 343 : ( "ŗ" U+0157 Ll 1 "LATIN SMALL LETTER R WITH CEDILLA" ) 344 : ( "Ř" U+0158 Lu 1 "LATIN CAPITAL LETTER R WITH CARON" ) 345 : ( "ř" U+0159 Ll 1 "LATIN SMALL LETTER R WITH CARON" ) 346 : ( "Ś" U+015A Lu 1 "LATIN CAPITAL LETTER S WITH ACUTE" ) 347 : ( "ś" U+015B Ll 1 "LATIN SMALL LETTER S WITH ACUTE" ) 348 : ( "Ŝ" U+015C Lu 1 "LATIN CAPITAL LETTER S WITH CIRCUMFLEX" ) 349 : ( "ŝ" U+015D Ll 1 "LATIN SMALL LETTER S WITH CIRCUMFLEX" ) 350 : ( "Ş" U+015E Lu 1 "LATIN CAPITAL LETTER S WITH CEDILLA" ) 351 : ( "ş" U+015F Ll 1 "LATIN SMALL LETTER S WITH CEDILLA" ) 352 : ( "Š" U+0160 Lu 1 "LATIN CAPITAL LETTER S WITH CARON" ) 353 : ( "š" U+0161 Ll 1 "LATIN SMALL LETTER S WITH CARON" ) 354 : ( "Ţ" U+0162 Lu 1 "LATIN CAPITAL LETTER T WITH CEDILLA" ) 355 : ( "ţ" U+0163 Ll 1 "LATIN SMALL LETTER T WITH CEDILLA" ) 356 : ( "Ť" U+0164 Lu 1 "LATIN CAPITAL LETTER T WITH CARON" ) 357 : ( "ť" U+0165 Ll 1 "LATIN SMALL LETTER T WITH CARON" ) 358 : ( "Ŧ" U+0166 Lu 1 "LATIN CAPITAL LETTER T WITH STROKE" ) 359 : ( "ŧ" U+0167 Ll 1 "LATIN SMALL LETTER T WITH STROKE" ) 360 : ( "Ũ" U+0168 Lu 1 "LATIN CAPITAL LETTER U WITH TILDE" ) 361 : ( "ũ" U+0169 Ll 1 "LATIN SMALL LETTER U WITH TILDE" ) 362 : ( "Ū" U+016A Lu 1 "LATIN CAPITAL LETTER U WITH MACRON" ) 363 : ( "ū" U+016B Ll 1 "LATIN SMALL LETTER U WITH MACRON" ) 364 : ( "Ŭ" U+016C Lu 1 "LATIN CAPITAL LETTER U WITH BREVE" ) 365 : ( "ŭ" U+016D Ll 1 "LATIN SMALL LETTER U WITH BREVE" ) 366 : ( "Ů" U+016E Lu 1 "LATIN CAPITAL LETTER U WITH RING ABOVE" ) 367 : ( "ů" U+016F Ll 1 "LATIN SMALL LETTER U WITH RING ABOVE" ) 368 : ( "Ű" U+0170 Lu 1 "LATIN CAPITAL LETTER U WITH DOUBLE ACUTE" ) 369 : ( "ű" U+0171 Ll 1 "LATIN SMALL LETTER U WITH DOUBLE ACUTE" ) 370 : ( "Ų" U+0172 Lu 1 "LATIN CAPITAL LETTER U WITH OGONEK" ) 371 : ( "ų" U+0173 Ll 1 "LATIN SMALL LETTER U WITH OGONEK" ) 372 : ( "Ŵ" U+0174 Lu 1 "LATIN CAPITAL LETTER W WITH CIRCUMFLEX" ) 373 : ( "ŵ" U+0175 Ll 1 "LATIN SMALL LETTER W WITH 
CIRCUMFLEX" ) 374 : ( "Ŷ" U+0176 Lu 1 "LATIN CAPITAL LETTER Y WITH CIRCUMFLEX" ) 375 : ( "ŷ" U+0177 Ll 1 "LATIN SMALL LETTER Y WITH CIRCUMFLEX" ) 376 : ( "Ÿ" U+0178 Lu 1 "LATIN CAPITAL LETTER Y WITH DIAERESIS" ) 377 : ( "Ź" U+0179 Lu 1 "LATIN CAPITAL LETTER Z WITH ACUTE" ) 378 : ( "ź" U+017A Ll 1 "LATIN SMALL LETTER Z WITH ACUTE" ) 379 : ( "Ż" U+017B Lu 1 "LATIN CAPITAL LETTER Z WITH DOT ABOVE" ) 380 : ( "ż" U+017C Ll 1 "LATIN SMALL LETTER Z WITH DOT ABOVE" ) 381 : ( "Ž" U+017D Lu 1 "LATIN CAPITAL LETTER Z WITH CARON" ) 382 : ( "ž" U+017E Ll 1 "LATIN SMALL LETTER Z WITH CARON" ) 383 : ( "ſ" U+017F Ll 1 "LATIN SMALL LETTER LONG S" ) 384 : ( "ƀ" U+0180 Ll 1 "LATIN SMALL LETTER B WITH STROKE" ) 385 : ( "Ɓ" U+0181 Lu 1 "LATIN CAPITAL LETTER B WITH HOOK" ) 386 : ( "Ƃ" U+0182 Lu 1 "LATIN CAPITAL LETTER B WITH TOPBAR" ) 387 : ( "ƃ" U+0183 Ll 1 "LATIN SMALL LETTER B WITH TOPBAR" ) 388 : ( "Ƅ" U+0184 Lu 1 "LATIN CAPITAL LETTER TONE SIX" ) 389 : ( "ƅ" U+0185 Ll 1 "LATIN SMALL LETTER TONE SIX" ) 390 : ( "Ɔ" U+0186 Lu 1 "LATIN CAPITAL LETTER OPEN O" ) 391 : ( "Ƈ" U+0187 Lu 1 "LATIN CAPITAL LETTER C WITH HOOK" ) 392 : ( "ƈ" U+0188 Ll 1 "LATIN SMALL LETTER C WITH HOOK" ) 393 : ( "Ɖ" U+0189 Lu 1 "LATIN CAPITAL LETTER AFRICAN D" ) 394 : ( "Ɗ" U+018A Lu 1 "LATIN CAPITAL LETTER D WITH HOOK" ) 395 : ( "Ƌ" U+018B Lu 1 "LATIN CAPITAL LETTER D WITH TOPBAR" ) 396 : ( "ƌ" U+018C Ll 1 "LATIN SMALL LETTER D WITH TOPBAR" ) 397 : ( "ƍ" U+018D Ll 1 "LATIN SMALL LETTER TURNED DELTA" ) 398 : ( "Ǝ" U+018E Lu 1 "LATIN CAPITAL LETTER REVERSED E" ) 399 : ( "Ə" U+018F Lu 1 "LATIN CAPITAL LETTER SCHWA" ) 400 : ( "Ɛ" U+0190 Lu 1 "LATIN CAPITAL LETTER OPEN E" ) 401 : ( "Ƒ" U+0191 Lu 1 "LATIN CAPITAL LETTER F WITH HOOK" ) 402 : ( "ƒ" U+0192 Ll 1 "LATIN SMALL LETTER F WITH HOOK" ) 403 : ( "Ɠ" U+0193 Lu 1 "LATIN CAPITAL LETTER G WITH HOOK" ) 404 : ( "Ɣ" U+0194 Lu 1 "LATIN CAPITAL LETTER GAMMA" ) 405 : ( "ƕ" U+0195 Ll 1 "LATIN SMALL LETTER HV" ) 406 : ( "Ɩ" U+0196 Lu 1 "LATIN CAPITAL LETTER IOTA" ) 407 : ( 
"Ɨ" U+0197 Lu 1 "LATIN CAPITAL LETTER I WITH STROKE" ) 408 : ( "Ƙ" U+0198 Lu 1 "LATIN CAPITAL LETTER K WITH HOOK" ) 409 : ( "ƙ" U+0199 Ll 1 "LATIN SMALL LETTER K WITH HOOK" ) 410 : ( "ƚ" U+019A Ll 1 "LATIN SMALL LETTER L WITH BAR" ) 411 : ( "ƛ" U+019B Ll 1 "LATIN SMALL LETTER LAMBDA WITH STROKE" ) 412 : ( "Ɯ" U+019C Lu 1 "LATIN CAPITAL LETTER TURNED M" ) 413 : ( "Ɲ" U+019D Lu 1 "LATIN CAPITAL LETTER N WITH LEFT HOOK" ) 414 : ( "ƞ" U+019E Ll 1 "LATIN SMALL LETTER N WITH LONG RIGHT LEG" ) 415 : ( "Ɵ" U+019F Lu 1 "LATIN CAPITAL LETTER O WITH MIDDLE TILDE" ) 416 : ( "Ơ" U+01A0 Lu 1 "LATIN CAPITAL LETTER O WITH HORN" ) 417 : ( "ơ" U+01A1 Ll 1 "LATIN SMALL LETTER O WITH HORN" ) 418 : ( "Ƣ" U+01A2 Lu 1 "LATIN CAPITAL LETTER OI", "LATIN CAPITAL LETTER GHA" ) 419 : ( "ƣ" U+01A3 Ll 1 "LATIN SMALL LETTER OI", "LATIN SMALL LETTER GHA" ) 420 : ( "Ƥ" U+01A4 Lu 1 "LATIN CAPITAL LETTER P WITH HOOK" ) 421 : ( "ƥ" U+01A5 Ll 1 "LATIN SMALL LETTER P WITH HOOK" ) 422 : ( "Ʀ" U+01A6 Lu 1 "LATIN LETTER YR" ) 423 : ( "Ƨ" U+01A7 Lu 1 "LATIN CAPITAL LETTER TONE TWO" ) 424 : ( "ƨ" U+01A8 Ll 1 "LATIN SMALL LETTER TONE TWO" ) 425 : ( "Ʃ" U+01A9 Lu 1 "LATIN CAPITAL LETTER ESH" ) 426 : ( "ƪ" U+01AA Ll 1 "LATIN LETTER REVERSED ESH LOOP" ) 427 : ( "ƫ" U+01AB Ll 1 "LATIN SMALL LETTER T WITH PALATAL HOOK" ) 428 : ( "Ƭ" U+01AC Lu 1 "LATIN CAPITAL LETTER T WITH HOOK" ) 429 : ( "ƭ" U+01AD Ll 1 "LATIN SMALL LETTER T WITH HOOK" ) 430 : ( "Ʈ" U+01AE Lu 1 "LATIN CAPITAL LETTER T WITH RETROFLEX HOOK" ) 431 : ( "Ư" U+01AF Lu 1 "LATIN CAPITAL LETTER U WITH HORN" ) 432 : ( "ư" U+01B0 Ll 1 "LATIN SMALL LETTER U WITH HORN" ) 433 : ( "Ʊ" U+01B1 Lu 1 "LATIN CAPITAL LETTER UPSILON" ) 434 : ( "Ʋ" U+01B2 Lu 1 "LATIN CAPITAL LETTER V WITH HOOK" ) 435 : ( "Ƴ" U+01B3 Lu 1 "LATIN CAPITAL LETTER Y WITH HOOK" ) 436 : ( "ƴ" U+01B4 Ll 1 "LATIN SMALL LETTER Y WITH HOOK" ) 437 : ( "Ƶ" U+01B5 Lu 1 "LATIN CAPITAL LETTER Z WITH STROKE" ) 438 : ( "ƶ" U+01B6 Ll 1 "LATIN SMALL LETTER Z WITH STROKE" ) 439 : ( "Ʒ" U+01B7 Lu 1 "LATIN 
CAPITAL LETTER EZH" ) 440 : ( "Ƹ" U+01B8 Lu 1 "LATIN CAPITAL LETTER EZH REVERSED" ) 441 : ( "ƹ" U+01B9 Ll 1 "LATIN SMALL LETTER EZH REVERSED" ) 442 : ( "ƺ" U+01BA Ll 1 "LATIN SMALL LETTER EZH WITH TAIL" ) 443 : ( "ƻ" U+01BB Lo 1 "LATIN LETTER TWO WITH STROKE" ) 444 : ( "Ƽ" U+01BC Lu 1 "LATIN CAPITAL LETTER TONE FIVE" ) 445 : ( "ƽ" U+01BD Ll 1 "LATIN SMALL LETTER TONE FIVE" ) 446 : ( "ƾ" U+01BE Ll 1 "LATIN LETTER INVERTED GLOTTAL STOP WITH STROKE" ) 447 : ( "ƿ" U+01BF Ll 1 "LATIN LETTER WYNN" ) 448 : ( "ǀ" U+01C0 Lo 1 "LATIN LETTER DENTAL CLICK" ) 449 : ( "ǁ" U+01C1 Lo 1 "LATIN LETTER LATERAL CLICK" ) 450 : ( "ǂ" U+01C2 Lo 1 "LATIN LETTER ALVEOLAR CLICK" ) 451 : ( "ǃ" U+01C3 Lo 1 "LATIN LETTER RETROFLEX CLICK" ) 452 : ( "DŽ" U+01C4 Lu 1 "LATIN CAPITAL LETTER DZ WITH CARON" ) 453 : ( "Dž" U+01C5 Lt 1 "LATIN CAPITAL LETTER D WITH SMALL LETTER Z WITH CARON" ) 454 : ( "dž" U+01C6 Ll 1 "LATIN SMALL LETTER DZ WITH CARON" ) 455 : ( "LJ" U+01C7 Lu 1 "LATIN CAPITAL LETTER LJ" ) 456 : ( "Lj" U+01C8 Lt 1 "LATIN CAPITAL LETTER L WITH SMALL LETTER J" ) 457 : ( "lj" U+01C9 Ll 1 "LATIN SMALL LETTER LJ" ) 458 : ( "NJ" U+01CA Lu 1 "LATIN CAPITAL LETTER NJ" ) 459 : ( "Nj" U+01CB Lt 1 "LATIN CAPITAL LETTER N WITH SMALL LETTER J" ) 460 : ( "nj" U+01CC Ll 1 "LATIN SMALL LETTER NJ" ) 461 : ( "Ǎ" U+01CD Lu 1 "LATIN CAPITAL LETTER A WITH CARON" ) 462 : ( "ǎ" U+01CE Ll 1 "LATIN SMALL LETTER A WITH CARON" ) 463 : ( "Ǐ" U+01CF Lu 1 "LATIN CAPITAL LETTER I WITH CARON" ) 464 : ( "ǐ" U+01D0 Ll 1 "LATIN SMALL LETTER I WITH CARON" ) 465 : ( "Ǒ" U+01D1 Lu 1 "LATIN CAPITAL LETTER O WITH CARON" ) 466 : ( "ǒ" U+01D2 Ll 1 "LATIN SMALL LETTER O WITH CARON" ) 467 : ( "Ǔ" U+01D3 Lu 1 "LATIN CAPITAL LETTER U WITH CARON" ) 468 : ( "ǔ" U+01D4 Ll 1 "LATIN SMALL LETTER U WITH CARON" ) 469 : ( "Ǖ" U+01D5 Lu 1 "LATIN CAPITAL LETTER U WITH DIAERESIS AND MACRON" ) 470 : ( "ǖ" U+01D6 Ll 1 "LATIN SMALL LETTER U WITH DIAERESIS AND MACRON" ) 471 : ( "Ǘ" U+01D7 Lu 1 "LATIN CAPITAL LETTER U WITH DIAERESIS AND ACUTE" ) 
472 : ( "ǘ" U+01D8 Ll 1 "LATIN SMALL LETTER U WITH DIAERESIS AND ACUTE" ) 473 : ( "Ǚ" U+01D9 Lu 1 "LATIN CAPITAL LETTER U WITH DIAERESIS AND CARON" ) 474 : ( "ǚ" U+01DA Ll 1 "LATIN SMALL LETTER U WITH DIAERESIS AND CARON" ) 475 : ( "Ǜ" U+01DB Lu 1 "LATIN CAPITAL LETTER U WITH DIAERESIS AND GRAVE" ) 476 : ( "ǜ" U+01DC Ll 1 "LATIN SMALL LETTER U WITH DIAERESIS AND GRAVE" ) 477 : ( "ǝ" U+01DD Ll 1 "LATIN SMALL LETTER TURNED E" ) 478 : ( "Ǟ" U+01DE Lu 1 "LATIN CAPITAL LETTER A WITH DIAERESIS AND MACRON" ) 479 : ( "ǟ" U+01DF Ll 1 "LATIN SMALL LETTER A WITH DIAERESIS AND MACRON" ) 480 : ( "Ǡ" U+01E0 Lu 1 "LATIN CAPITAL LETTER A WITH DOT ABOVE AND MACRON" ) 481 : ( "ǡ" U+01E1 Ll 1 "LATIN SMALL LETTER A WITH DOT ABOVE AND MACRON" ) 482 : ( "Ǣ" U+01E2 Lu 1 "LATIN CAPITAL LETTER AE WITH MACRON" ) 483 : ( "ǣ" U+01E3 Ll 1 "LATIN SMALL LETTER AE WITH MACRON" ) 484 : ( "Ǥ" U+01E4 Lu 1 "LATIN CAPITAL LETTER G WITH STROKE" ) 485 : ( "ǥ" U+01E5 Ll 1 "LATIN SMALL LETTER G WITH STROKE" ) 486 : ( "Ǧ" U+01E6 Lu 1 "LATIN CAPITAL LETTER G WITH CARON" ) 487 : ( "ǧ" U+01E7 Ll 1 "LATIN SMALL LETTER G WITH CARON" ) 488 : ( "Ǩ" U+01E8 Lu 1 "LATIN CAPITAL LETTER K WITH CARON" ) 489 : ( "ǩ" U+01E9 Ll 1 "LATIN SMALL LETTER K WITH CARON" ) 490 : ( "Ǫ" U+01EA Lu 1 "LATIN CAPITAL LETTER O WITH OGONEK" ) 491 : ( "ǫ" U+01EB Ll 1 "LATIN SMALL LETTER O WITH OGONEK" ) 492 : ( "Ǭ" U+01EC Lu 1 "LATIN CAPITAL LETTER O WITH OGONEK AND MACRON" ) 493 : ( "ǭ" U+01ED Ll 1 "LATIN SMALL LETTER O WITH OGONEK AND MACRON" ) 494 : ( "Ǯ" U+01EE Lu 1 "LATIN CAPITAL LETTER EZH WITH CARON" ) 495 : ( "ǯ" U+01EF Ll 1 "LATIN SMALL LETTER EZH WITH CARON" ) 496 : ( "ǰ" U+01F0 Ll 1 "LATIN SMALL LETTER J WITH CARON" ) 497 : ( "DZ" U+01F1 Lu 1 "LATIN CAPITAL LETTER DZ" ) 498 : ( "Dz" U+01F2 Lt 1 "LATIN CAPITAL LETTER D WITH SMALL LETTER Z" ) 499 : ( "dz" U+01F3 Ll 1 "LATIN SMALL LETTER DZ" ) 500 : ( "Ǵ" U+01F4 Lu 1 "LATIN CAPITAL LETTER G WITH ACUTE" ) 501 : ( "ǵ" U+01F5 Ll 1 "LATIN SMALL LETTER G WITH ACUTE" ) 502 : ( "Ƕ" 
U+01F6 Lu 1 "LATIN CAPITAL LETTER HWAIR" ) 503 : ( "Ƿ" U+01F7 Lu 1 "LATIN CAPITAL LETTER WYNN" ) 504 : ( "Ǹ" U+01F8 Lu 1 "LATIN CAPITAL LETTER N WITH GRAVE" ) 505 : ( "ǹ" U+01F9 Ll 1 "LATIN SMALL LETTER N WITH GRAVE" ) 506 : ( "Ǻ" U+01FA Lu 1 "LATIN CAPITAL LETTER A WITH RING ABOVE AND ACUTE" ) 507 : ( "ǻ" U+01FB Ll 1 "LATIN SMALL LETTER A WITH RING ABOVE AND ACUTE" ) 508 : ( "Ǽ" U+01FC Lu 1 "LATIN CAPITAL LETTER AE WITH ACUTE" ) 509 : ( "ǽ" U+01FD Ll 1 "LATIN SMALL LETTER AE WITH ACUTE" ) 510 : ( "Ǿ" U+01FE Lu 1 "LATIN CAPITAL LETTER O WITH STROKE AND ACUTE" ) 511 : ( "ǿ" U+01FF Ll 1 "LATIN SMALL LETTER O WITH STROKE AND ACUTE" ) 512 : ( "Ȁ" U+0200 Lu 1 "LATIN CAPITAL LETTER A WITH DOUBLE GRAVE" ) 513 : ( "ȁ" U+0201 Ll 1 "LATIN SMALL LETTER A WITH DOUBLE GRAVE" ) 514 : ( "Ȃ" U+0202 Lu 1 "LATIN CAPITAL LETTER A WITH INVERTED BREVE" ) 515 : ( "ȃ" U+0203 Ll 1 "LATIN SMALL LETTER A WITH INVERTED BREVE" ) 516 : ( "Ȅ" U+0204 Lu 1 "LATIN CAPITAL LETTER E WITH DOUBLE GRAVE" ) 517 : ( "ȅ" U+0205 Ll 1 "LATIN SMALL LETTER E WITH DOUBLE GRAVE" ) 518 : ( "Ȇ" U+0206 Lu 1 "LATIN CAPITAL LETTER E WITH INVERTED BREVE" ) 519 : ( "ȇ" U+0207 Ll 1 "LATIN SMALL LETTER E WITH INVERTED BREVE" ) 520 : ( "Ȉ" U+0208 Lu 1 "LATIN CAPITAL LETTER I WITH DOUBLE GRAVE" ) 521 : ( "ȉ" U+0209 Ll 1 "LATIN SMALL LETTER I WITH DOUBLE GRAVE" ) 522 : ( "Ȋ" U+020A Lu 1 "LATIN CAPITAL LETTER I WITH INVERTED BREVE" ) 523 : ( "ȋ" U+020B Ll 1 "LATIN SMALL LETTER I WITH INVERTED BREVE" ) 524 : ( "Ȍ" U+020C Lu 1 "LATIN CAPITAL LETTER O WITH DOUBLE GRAVE" ) 525 : ( "ȍ" U+020D Ll 1 "LATIN SMALL LETTER O WITH DOUBLE GRAVE" ) 526 : ( "Ȏ" U+020E Lu 1 "LATIN CAPITAL LETTER O WITH INVERTED BREVE" ) 527 : ( "ȏ" U+020F Ll 1 "LATIN SMALL LETTER O WITH INVERTED BREVE" ) 528 : ( "Ȑ" U+0210 Lu 1 "LATIN CAPITAL LETTER R WITH DOUBLE GRAVE" ) 529 : ( "ȑ" U+0211 Ll 1 "LATIN SMALL LETTER R WITH DOUBLE GRAVE" ) 530 : ( "Ȓ" U+0212 Lu 1 "LATIN CAPITAL LETTER R WITH INVERTED BREVE" ) 531 : ( "ȓ" U+0213 Ll 1 "LATIN SMALL LETTER R 
WITH INVERTED BREVE" ) 532 : ( "Ȕ" U+0214 Lu 1 "LATIN CAPITAL LETTER U WITH DOUBLE GRAVE" ) 533 : ( "ȕ" U+0215 Ll 1 "LATIN SMALL LETTER U WITH DOUBLE GRAVE" ) 534 : ( "Ȗ" U+0216 Lu 1 "LATIN CAPITAL LETTER U WITH INVERTED BREVE" ) 535 : ( "ȗ" U+0217 Ll 1 "LATIN SMALL LETTER U WITH INVERTED BREVE" ) 536 : ( "Ș" U+0218 Lu 1 "LATIN CAPITAL LETTER S WITH COMMA BELOW" ) 537 : ( "ș" U+0219 Ll 1 "LATIN SMALL LETTER S WITH COMMA BELOW" ) 538 : ( "Ț" U+021A Lu 1 "LATIN CAPITAL LETTER T WITH COMMA BELOW" ) 539 : ( "ț" U+021B Ll 1 "LATIN SMALL LETTER T WITH COMMA BELOW" ) 540 : ( "Ȝ" U+021C Lu 1 "LATIN CAPITAL LETTER YOGH" ) 541 : ( "ȝ" U+021D Ll 1 "LATIN SMALL LETTER YOGH" ) 542 : ( "Ȟ" U+021E Lu 1 "LATIN CAPITAL LETTER H WITH CARON" ) 543 : ( "ȟ" U+021F Ll 1 "LATIN SMALL LETTER H WITH CARON" ) 544 : ( "Ƞ" U+0220 Lu 1 "LATIN CAPITAL LETTER N WITH LONG RIGHT LEG" ) 545 : ( "ȡ" U+0221 Ll 1 "LATIN SMALL LETTER D WITH CURL" ) 546 : ( "Ȣ" U+0222 Lu 1 "LATIN CAPITAL LETTER OU" ) 547 : ( "ȣ" U+0223 Ll 1 "LATIN SMALL LETTER OU" ) 548 : ( "Ȥ" U+0224 Lu 1 "LATIN CAPITAL LETTER Z WITH HOOK" ) 549 : ( "ȥ" U+0225 Ll 1 "LATIN SMALL LETTER Z WITH HOOK" ) 550 : ( "Ȧ" U+0226 Lu 1 "LATIN CAPITAL LETTER A WITH DOT ABOVE" ) 551 : ( "ȧ" U+0227 Ll 1 "LATIN SMALL LETTER A WITH DOT ABOVE" ) 552 : ( "Ȩ" U+0228 Lu 1 "LATIN CAPITAL LETTER E WITH CEDILLA" ) 553 : ( "ȩ" U+0229 Ll 1 "LATIN SMALL LETTER E WITH CEDILLA" ) 554 : ( "Ȫ" U+022A Lu 1 "LATIN CAPITAL LETTER O WITH DIAERESIS AND MACRON" ) 555 : ( "ȫ" U+022B Ll 1 "LATIN SMALL LETTER O WITH DIAERESIS AND MACRON" ) 556 : ( "Ȭ" U+022C Lu 1 "LATIN CAPITAL LETTER O WITH TILDE AND MACRON" ) 557 : ( "ȭ" U+022D Ll 1 "LATIN SMALL LETTER O WITH TILDE AND MACRON" ) 558 : ( "Ȯ" U+022E Lu 1 "LATIN CAPITAL LETTER O WITH DOT ABOVE" ) 559 : ( "ȯ" U+022F Ll 1 "LATIN SMALL LETTER O WITH DOT ABOVE" ) 560 : ( "Ȱ" U+0230 Lu 1 "LATIN CAPITAL LETTER O WITH DOT ABOVE AND MACRON" ) 561 : ( "ȱ" U+0231 Ll 1 "LATIN SMALL LETTER O WITH DOT ABOVE AND MACRON" ) 562 : ( "Ȳ" U+0232 
Lu 1 "LATIN CAPITAL LETTER Y WITH MACRON" ) 563 : ( "ȳ" U+0233 Ll 1 "LATIN SMALL LETTER Y WITH MACRON" ) 564 : ( "ȴ" U+0234 Ll 1 "LATIN SMALL LETTER L WITH CURL" ) 565 : ( "ȵ" U+0235 Ll 1 "LATIN SMALL LETTER N WITH CURL" ) 566 : ( "ȶ" U+0236 Ll 1 "LATIN SMALL LETTER T WITH CURL" ) 567 : ( "ȷ" U+0237 Ll 1 "LATIN SMALL LETTER DOTLESS J" ) 568 : ( "ȸ" U+0238 Ll 1 "LATIN SMALL LETTER DB DIGRAPH" ) 569 : ( "ȹ" U+0239 Ll 1 "LATIN SMALL LETTER QP DIGRAPH" ) 570 : ( "Ⱥ" U+023A Lu 1 "LATIN CAPITAL LETTER A WITH STROKE" ) 571 : ( "Ȼ" U+023B Lu 1 "LATIN CAPITAL LETTER C WITH STROKE" ) 572 : ( "ȼ" U+023C Ll 1 "LATIN SMALL LETTER C WITH STROKE" ) 573 : ( "Ƚ" U+023D Lu 1 "LATIN CAPITAL LETTER L WITH BAR" ) 574 : ( "Ⱦ" U+023E Lu 1 "LATIN CAPITAL LETTER T WITH DIAGONAL STROKE" ) 575 : ( "ȿ" U+023F Ll 1 "LATIN SMALL LETTER S WITH SWASH TAIL" ) 576 : ( "ɀ" U+0240 Ll 1 "LATIN SMALL LETTER Z WITH SWASH TAIL" ) 577 : ( "Ɂ" U+0241 Lu 1 "LATIN CAPITAL LETTER GLOTTAL STOP" ) 578 : ( "ɂ" U+0242 Ll 1 "LATIN SMALL LETTER GLOTTAL STOP" ) 579 : ( "Ƀ" U+0243 Lu 1 "LATIN CAPITAL LETTER B WITH STROKE" ) 580 : ( "Ʉ" U+0244 Lu 1 "LATIN CAPITAL LETTER U BAR" ) 581 : ( "Ʌ" U+0245 Lu 1 "LATIN CAPITAL LETTER TURNED V" ) 582 : ( "Ɇ" U+0246 Lu 1 "LATIN CAPITAL LETTER E WITH STROKE" ) 583 : ( "ɇ" U+0247 Ll 1 "LATIN SMALL LETTER E WITH STROKE" ) 584 : ( "Ɉ" U+0248 Lu 1 "LATIN CAPITAL LETTER J WITH STROKE" ) 585 : ( "ɉ" U+0249 Ll 1 "LATIN SMALL LETTER J WITH STROKE" ) 586 : ( "Ɋ" U+024A Lu 1 "LATIN CAPITAL LETTER SMALL Q WITH HOOK TAIL" ) 587 : ( "ɋ" U+024B Ll 1 "LATIN SMALL LETTER Q WITH HOOK TAIL" ) 588 : ( "Ɍ" U+024C Lu 1 "LATIN CAPITAL LETTER R WITH STROKE" ) 589 : ( "ɍ" U+024D Ll 1 "LATIN SMALL LETTER R WITH STROKE" ) 590 : ( "Ɏ" U+024E Lu 1 "LATIN CAPITAL LETTER Y WITH STROKE" ) 591 : ( "ɏ" U+024F Ll 1 "LATIN SMALL LETTER Y WITH STROKE" ) 592 : ( "ɐ" U+0250 Ll 1 "LATIN SMALL LETTER TURNED A" ) 593 : ( "ɑ" U+0251 Ll 1 "LATIN SMALL LETTER ALPHA" ) 594 : ( "ɒ" U+0252 Ll 1 "LATIN SMALL LETTER TURNED 
ALPHA" ) 595 : ( "ɓ" U+0253 Ll 1 "LATIN SMALL LETTER B WITH HOOK" ) 596 : ( "ɔ" U+0254 Ll 1 "LATIN SMALL LETTER OPEN O" ) 597 : ( "ɕ" U+0255 Ll 1 "LATIN SMALL LETTER C WITH CURL" ) 598 : ( "ɖ" U+0256 Ll 1 "LATIN SMALL LETTER D WITH TAIL" ) 599 : ( "ɗ" U+0257 Ll 1 "LATIN SMALL LETTER D WITH HOOK" ) 600 : ( "ɘ" U+0258 Ll 1 "LATIN SMALL LETTER REVERSED E" ) 601 : ( "ə" U+0259 Ll 1 "LATIN SMALL LETTER SCHWA" ) 602 : ( "ɚ" U+025A Ll 1 "LATIN SMALL LETTER SCHWA WITH HOOK" ) 603 : ( "ɛ" U+025B Ll 1 "LATIN SMALL LETTER OPEN E" ) 604 : ( "ɜ" U+025C Ll 1 "LATIN SMALL LETTER REVERSED OPEN E" ) 605 : ( "ɝ" U+025D Ll 1 "LATIN SMALL LETTER REVERSED OPEN E WITH HOOK" ) 606 : ( "ɞ" U+025E Ll 1 "LATIN SMALL LETTER CLOSED REVERSED OPEN E" ) 607 : ( "ɟ" U+025F Ll 1 "LATIN SMALL LETTER DOTLESS J WITH STROKE" ) 608 : ( "ɠ" U+0260 Ll 1 "LATIN SMALL LETTER G WITH HOOK" ) 609 : ( "ɡ" U+0261 Ll 1 "LATIN SMALL LETTER SCRIPT G" ) 610 : ( "ɢ" U+0262 Ll 1 "LATIN LETTER SMALL CAPITAL G" ) 611 : ( "ɣ" U+0263 Ll 1 "LATIN SMALL LETTER GAMMA" ) 612 : ( "ɤ" U+0264 Ll 1 "LATIN SMALL LETTER RAMS HORN" ) 613 : ( "ɥ" U+0265 Ll 1 "LATIN SMALL LETTER TURNED H" ) 614 : ( "ɦ" U+0266 Ll 1 "LATIN SMALL LETTER H WITH HOOK" ) 615 : ( "ɧ" U+0267 Ll 1 "LATIN SMALL LETTER HENG WITH HOOK" ) 616 : ( "ɨ" U+0268 Ll 1 "LATIN SMALL LETTER I WITH STROKE" ) 617 : ( "ɩ" U+0269 Ll 1 "LATIN SMALL LETTER IOTA" ) 618 : ( "ɪ" U+026A Ll 1 "LATIN LETTER SMALL CAPITAL I" ) 619 : ( "ɫ" U+026B Ll 1 "LATIN SMALL LETTER L WITH MIDDLE TILDE" ) 620 : ( "ɬ" U+026C Ll 1 "LATIN SMALL LETTER L WITH BELT" ) 621 : ( "ɭ" U+026D Ll 1 "LATIN SMALL LETTER L WITH RETROFLEX HOOK" ) 622 : ( "ɮ" U+026E Ll 1 "LATIN SMALL LETTER LEZH" ) 623 : ( "ɯ" U+026F Ll 1 "LATIN SMALL LETTER TURNED M" ) 624 : ( "ɰ" U+0270 Ll 1 "LATIN SMALL LETTER TURNED M WITH LONG LEG" ) 625 : ( "ɱ" U+0271 Ll 1 "LATIN SMALL LETTER M WITH HOOK" ) 626 : ( "ɲ" U+0272 Ll 1 "LATIN SMALL LETTER N WITH LEFT HOOK" ) 627 : ( "ɳ" U+0273 Ll 1 "LATIN SMALL LETTER N WITH RETROFLEX HOOK" ) 628 
: ( "ɴ" U+0274 Ll 1 "LATIN LETTER SMALL CAPITAL N" ) 629 : ( "ɵ" U+0275 Ll 1 "LATIN SMALL LETTER BARRED O" ) 630 : ( "ɶ" U+0276 Ll 1 "LATIN LETTER SMALL CAPITAL OE" ) 631 : ( "ɷ" U+0277 Ll 1 "LATIN SMALL LETTER CLOSED OMEGA" ) 632 : ( "ɸ" U+0278 Ll 1 "LATIN SMALL LETTER PHI" ) 633 : ( "ɹ" U+0279 Ll 1 "LATIN SMALL LETTER TURNED R" ) 634 : ( "ɺ" U+027A Ll 1 "LATIN SMALL LETTER TURNED R WITH LONG LEG" ) 635 : ( "ɻ" U+027B Ll 1 "LATIN SMALL LETTER TURNED R WITH HOOK" ) 636 : ( "ɼ" U+027C Ll 1 "LATIN SMALL LETTER R WITH LONG LEG" ) 637 : ( "ɽ" U+027D Ll 1 "LATIN SMALL LETTER R WITH TAIL" ) 638 : ( "ɾ" U+027E Ll 1 "LATIN SMALL LETTER R WITH FISHHOOK" ) 639 : ( "ɿ" U+027F Ll 1 "LATIN SMALL LETTER REVERSED R WITH FISHHOOK" ) 640 : ( "ʀ" U+0280 Ll 1 "LATIN LETTER SMALL CAPITAL R" ) 641 : ( "ʁ" U+0281 Ll 1 "LATIN LETTER SMALL CAPITAL INVERTED R" ) 642 : ( "ʂ" U+0282 Ll 1 "LATIN SMALL LETTER S WITH HOOK" ) 643 : ( "ʃ" U+0283 Ll 1 "LATIN SMALL LETTER ESH" ) 644 : ( "ʄ" U+0284 Ll 1 "LATIN SMALL LETTER DOTLESS J WITH STROKE AND HOOK" ) 645 : ( "ʅ" U+0285 Ll 1 "LATIN SMALL LETTER SQUAT REVERSED ESH" ) 646 : ( "ʆ" U+0286 Ll 1 "LATIN SMALL LETTER ESH WITH CURL" ) 647 : ( "ʇ" U+0287 Ll 1 "LATIN SMALL LETTER TURNED T" ) 648 : ( "ʈ" U+0288 Ll 1 "LATIN SMALL LETTER T WITH RETROFLEX HOOK" ) 649 : ( "ʉ" U+0289 Ll 1 "LATIN SMALL LETTER U BAR" ) 650 : ( "ʊ" U+028A Ll 1 "LATIN SMALL LETTER UPSILON" ) 651 : ( "ʋ" U+028B Ll 1 "LATIN SMALL LETTER V WITH HOOK" ) 652 : ( "ʌ" U+028C Ll 1 "LATIN SMALL LETTER TURNED V" ) 653 : ( "ʍ" U+028D Ll 1 "LATIN SMALL LETTER TURNED W" ) 654 : ( "ʎ" U+028E Ll 1 "LATIN SMALL LETTER TURNED Y" ) 655 : ( "ʏ" U+028F Ll 1 "LATIN LETTER SMALL CAPITAL Y" ) 656 : ( "ʐ" U+0290 Ll 1 "LATIN SMALL LETTER Z WITH RETROFLEX HOOK" ) 657 : ( "ʑ" U+0291 Ll 1 "LATIN SMALL LETTER Z WITH CURL" ) 658 : ( "ʒ" U+0292 Ll 1 "LATIN SMALL LETTER EZH" ) 659 : ( "ʓ" U+0293 Ll 1 "LATIN SMALL LETTER EZH WITH CURL" ) 660 : ( "ʔ" U+0294 Lo 1 "LATIN LETTER GLOTTAL STOP" ) 661 : ( "ʕ" U+0295 Ll 1 
"LATIN LETTER PHARYNGEAL VOICED FRICATIVE" ) 662 : ( "ʖ" U+0296 Ll 1 "LATIN LETTER INVERTED GLOTTAL STOP" ) 663 : ( "ʗ" U+0297 Ll 1 "LATIN LETTER STRETCHED C" ) 664 : ( "ʘ" U+0298 Ll 1 "LATIN LETTER BILABIAL CLICK" ) 665 : ( "ʙ" U+0299 Ll 1 "LATIN LETTER SMALL CAPITAL B" ) 666 : ( "ʚ" U+029A Ll 1 "LATIN SMALL LETTER CLOSED OPEN E" ) 667 : ( "ʛ" U+029B Ll 1 "LATIN LETTER SMALL CAPITAL G WITH HOOK" ) 668 : ( "ʜ" U+029C Ll 1 "LATIN LETTER SMALL CAPITAL H" ) 669 : ( "ʝ" U+029D Ll 1 "LATIN SMALL LETTER J WITH CROSSED-TAIL" ) 670 : ( "ʞ" U+029E Ll 1 "LATIN SMALL LETTER TURNED K" ) 671 : ( "ʟ" U+029F Ll 1 "LATIN LETTER SMALL CAPITAL L" ) 672 : ( "ʠ" U+02A0 Ll 1 "LATIN SMALL LETTER Q WITH HOOK" ) 673 : ( "ʡ" U+02A1 Ll 1 "LATIN LETTER GLOTTAL STOP WITH STROKE" ) 674 : ( "ʢ" U+02A2 Ll 1 "LATIN LETTER REVERSED GLOTTAL STOP WITH STROKE" ) 675 : ( "ʣ" U+02A3 Ll 1 "LATIN SMALL LETTER DZ DIGRAPH" ) 676 : ( "ʤ" U+02A4 Ll 1 "LATIN SMALL LETTER DEZH DIGRAPH" ) 677 : ( "ʥ" U+02A5 Ll 1 "LATIN SMALL LETTER DZ DIGRAPH WITH CURL" ) 678 : ( "ʦ" U+02A6 Ll 1 "LATIN SMALL LETTER TS DIGRAPH" ) 679 : ( "ʧ" U+02A7 Ll 1 "LATIN SMALL LETTER TESH DIGRAPH" ) 680 : ( "ʨ" U+02A8 Ll 1 "LATIN SMALL LETTER TC DIGRAPH WITH CURL" ) 681 : ( "ʩ" U+02A9 Ll 1 "LATIN SMALL LETTER FENG DIGRAPH" ) 682 : ( "ʪ" U+02AA Ll 1 "LATIN SMALL LETTER LS DIGRAPH" ) 683 : ( "ʫ" U+02AB Ll 1 "LATIN SMALL LETTER LZ DIGRAPH" ) 684 : ( "ʬ" U+02AC Ll 1 "LATIN LETTER BILABIAL PERCUSSIVE" ) 685 : ( "ʭ" U+02AD Ll 1 "LATIN LETTER BIDENTAL PERCUSSIVE" ) 686 : ( "ʮ" U+02AE Ll 1 "LATIN SMALL LETTER TURNED H WITH FISHHOOK" ) 687 : ( "ʯ" U+02AF Ll 1 "LATIN SMALL LETTER TURNED H WITH FISHHOOK AND TAIL" ) 688 : ( "ʰ" U+02B0 Lm 1 "MODIFIER LETTER SMALL H" ) 689 : ( "ʱ" U+02B1 Lm 1 "MODIFIER LETTER SMALL H WITH HOOK" ) 690 : ( "ʲ" U+02B2 Lm 1 "MODIFIER LETTER SMALL J" ) 691 : ( "ʳ" U+02B3 Lm 1 "MODIFIER LETTER SMALL R" ) 692 : ( "ʴ" U+02B4 Lm 1 "MODIFIER LETTER SMALL TURNED R" ) 693 : ( "ʵ" U+02B5 Lm 1 "MODIFIER LETTER SMALL TURNED R WITH 
HOOK" ) 694 : ( "ʶ" U+02B6 Lm 1 "MODIFIER LETTER SMALL CAPITAL INVERTED R" ) 695 : ( "ʷ" U+02B7 Lm 1 "MODIFIER LETTER SMALL W" ) 696 : ( "ʸ" U+02B8 Lm 1 "MODIFIER LETTER SMALL Y" ) 697 : ( "ʹ" U+02B9 Lm 1 "MODIFIER LETTER PRIME" ) 698 : ( "ʺ" U+02BA Lm 1 "MODIFIER LETTER DOUBLE PRIME" ) 699 : ( "ʻ" U+02BB Lm 1 "MODIFIER LETTER TURNED COMMA" ) 700 : ( "ʼ" U+02BC Lm 1 "MODIFIER LETTER APOSTROPHE" ) 701 : ( "ʽ" U+02BD Lm 1 "MODIFIER LETTER REVERSED COMMA" ) 702 : ( "ʾ" U+02BE Lm 1 "MODIFIER LETTER RIGHT HALF RING" ) 703 : ( "ʿ" U+02BF Lm 1 "MODIFIER LETTER LEFT HALF RING" ) 704 : ( "ˀ" U+02C0 Lm 1 "MODIFIER LETTER GLOTTAL STOP" ) 705 : ( "ˁ" U+02C1 Lm 1 "MODIFIER LETTER REVERSED GLOTTAL STOP" ) 706 : ( "˂" U+02C2 Sk 1 "MODIFIER LETTER LEFT ARROWHEAD" ) 707 : ( "˃" U+02C3 Sk 1 "MODIFIER LETTER RIGHT ARROWHEAD" ) 708 : ( "˄" U+02C4 Sk 1 "MODIFIER LETTER UP ARROWHEAD" ) 709 : ( "˅" U+02C5 Sk 1 "MODIFIER LETTER DOWN ARROWHEAD" ) 710 : ( "ˆ" U+02C6 Lm 1 "MODIFIER LETTER CIRCUMFLEX ACCENT" ) 711 : ( "ˇ" U+02C7 Lm 1 "CARON" ) 712 : ( "ˈ" U+02C8 Lm 1 "MODIFIER LETTER VERTICAL LINE" ) 713 : ( "ˉ" U+02C9 Lm 1 "MODIFIER LETTER MACRON" ) 714 : ( "ˊ" U+02CA Lm 1 "MODIFIER LETTER ACUTE ACCENT" ) 715 : ( "ˋ" U+02CB Lm 1 "MODIFIER LETTER GRAVE ACCENT" ) 716 : ( "ˌ" U+02CC Lm 1 "MODIFIER LETTER LOW VERTICAL LINE" ) 717 : ( "ˍ" U+02CD Lm 1 "MODIFIER LETTER LOW MACRON" ) 718 : ( "ˎ" U+02CE Lm 1 "MODIFIER LETTER LOW GRAVE ACCENT" ) 719 : ( "ˏ" U+02CF Lm 1 "MODIFIER LETTER LOW ACUTE ACCENT" ) 720 : ( "ː" U+02D0 Lm 1 "MODIFIER LETTER TRIANGULAR COLON" ) 721 : ( "ˑ" U+02D1 Lm 1 "MODIFIER LETTER HALF TRIANGULAR COLON" ) 722 : ( "˒" U+02D2 Sk 1 "MODIFIER LETTER CENTRED RIGHT HALF RING" ) 723 : ( "˓" U+02D3 Sk 1 "MODIFIER LETTER CENTRED LEFT HALF RING" ) 724 : ( "˔" U+02D4 Sk 1 "MODIFIER LETTER UP TACK" ) 725 : ( "˕" U+02D5 Sk 1 "MODIFIER LETTER DOWN TACK" ) 726 : ( "˖" U+02D6 Sk 1 "MODIFIER LETTER PLUS SIGN" ) 727 : ( "˗" U+02D7 Sk 1 "MODIFIER LETTER MINUS SIGN" ) 728 : ( "˘" U+02D8 Sk 1 
"BREVE" ) 729 : ( "˙" U+02D9 Sk 1 "DOT ABOVE" ) 730 : ( "˚" U+02DA Sk 1 "RING ABOVE" ) 731 : ( "˛" U+02DB Sk 1 "OGONEK" ) 732 : ( "˜" U+02DC Sk 1 "SMALL TILDE" ) 733 : ( "˝" U+02DD Sk 1 "DOUBLE ACUTE ACCENT" ) 734 : ( "˞" U+02DE Sk 1 "MODIFIER LETTER RHOTIC HOOK" ) 735 : ( "˟" U+02DF Sk 1 "MODIFIER LETTER CROSS ACCENT" ) 736 : ( "ˠ" U+02E0 Lm 1 "MODIFIER LETTER SMALL GAMMA" ) 737 : ( "ˡ" U+02E1 Lm 1 "MODIFIER LETTER SMALL L" ) 738 : ( "ˢ" U+02E2 Lm 1 "MODIFIER LETTER SMALL S" ) 739 : ( "ˣ" U+02E3 Lm 1 "MODIFIER LETTER SMALL X" ) 740 : ( "ˤ" U+02E4 Lm 1 "MODIFIER LETTER SMALL REVERSED GLOTTAL STOP" ) 741 : ( "˥" U+02E5 Sk 1 "MODIFIER LETTER EXTRA-HIGH TONE BAR" ) 742 : ( "˦" U+02E6 Sk 1 "MODIFIER LETTER HIGH TONE BAR" ) 743 : ( "˧" U+02E7 Sk 1 "MODIFIER LETTER MID TONE BAR" ) 744 : ( "˨" U+02E8 Sk 1 "MODIFIER LETTER LOW TONE BAR" ) 745 : ( "˩" U+02E9 Sk 1 "MODIFIER LETTER EXTRA-LOW TONE BAR" ) 746 : ( "˪" U+02EA Sk 1 "MODIFIER LETTER YIN DEPARTING TONE MARK" ) 747 : ( "˫" U+02EB Sk 1 "MODIFIER LETTER YANG DEPARTING TONE MARK" ) 748 : ( "ˬ" U+02EC Lm 1 "MODIFIER LETTER VOICING" ) 749 : ( "˭" U+02ED Sk 1 "MODIFIER LETTER UNASPIRATED" ) 750 : ( "ˮ" U+02EE Lm 1 "MODIFIER LETTER DOUBLE APOSTROPHE" ) 751 : ( "˯" U+02EF Sk 1 "MODIFIER LETTER LOW DOWN ARROWHEAD" ) 752 : ( "˰" U+02F0 Sk 1 "MODIFIER LETTER LOW UP ARROWHEAD" ) 753 : ( "˱" U+02F1 Sk 1 "MODIFIER LETTER LOW LEFT ARROWHEAD" ) 754 : ( "˲" U+02F2 Sk 1 "MODIFIER LETTER LOW RIGHT ARROWHEAD" ) 755 : ( "˳" U+02F3 Sk 1 "MODIFIER LETTER LOW RING" ) 756 : ( "˴" U+02F4 Sk 1 "MODIFIER LETTER MIDDLE GRAVE ACCENT" ) 757 : ( "˵" U+02F5 Sk 1 "MODIFIER LETTER MIDDLE DOUBLE GRAVE ACCENT" ) 758 : ( "˶" U+02F6 Sk 1 "MODIFIER LETTER MIDDLE DOUBLE ACUTE ACCENT" ) 759 : ( "˷" U+02F7 Sk 1 "MODIFIER LETTER LOW TILDE" ) 760 : ( "˸" U+02F8 Sk 1 "MODIFIER LETTER RAISED COLON" ) 761 : ( "˹" U+02F9 Sk 1 "MODIFIER LETTER BEGIN HIGH TONE" ) 762 : ( "˺" U+02FA Sk 1 "MODIFIER LETTER END HIGH TONE" ) 763 : ( "˻" U+02FB Sk 1 "MODIFIER LETTER BEGIN 
LOW TONE" ) 764 : ( "˼" U+02FC Sk 1 "MODIFIER LETTER END LOW TONE" ) 765 : ( "˽" U+02FD Sk 1 "MODIFIER LETTER SHELF" ) 766 : ( "˾" U+02FE Sk 1 "MODIFIER LETTER OPEN SHELF" ) 767 : ( "˿" U+02FF Sk 1 "MODIFIER LETTER LOW LEFT ARROW" ) 768 : ( "̀" U+0300 Mn 0 "COMBINING GRAVE ACCENT" ) 769 : ( "́" U+0301 Mn 0 "COMBINING ACUTE ACCENT" ) 770 : ( "̂" U+0302 Mn 0 "COMBINING CIRCUMFLEX ACCENT" ) 771 : ( "̃" U+0303 Mn 0 "COMBINING TILDE" ) 772 : ( "̄" U+0304 Mn 0 "COMBINING MACRON" ) 773 : ( "̅" U+0305 Mn 0 "COMBINING OVERLINE" ) 774 : ( "̆" U+0306 Mn 0 "COMBINING BREVE" ) 775 : ( "̇" U+0307 Mn 0 "COMBINING DOT ABOVE" ) 776 : ( "̈" U+0308 Mn 0 "COMBINING DIAERESIS" ) 777 : ( "̉" U+0309 Mn 0 "COMBINING HOOK ABOVE" ) 778 : ( "̊" U+030A Mn 0 "COMBINING RING ABOVE" ) 779 : ( "̋" U+030B Mn 0 "COMBINING DOUBLE ACUTE ACCENT" ) 780 : ( "̌" U+030C Mn 0 "COMBINING CARON" ) 781 : ( "̍" U+030D Mn 0 "COMBINING VERTICAL LINE ABOVE" ) 782 : ( "̎" U+030E Mn 0 "COMBINING DOUBLE VERTICAL LINE ABOVE" ) 783 : ( "̏" U+030F Mn 0 "COMBINING DOUBLE GRAVE ACCENT" ) 784 : ( "̐" U+0310 Mn 0 "COMBINING CANDRABINDU" ) 785 : ( "̑" U+0311 Mn 0 "COMBINING INVERTED BREVE" ) 786 : ( "̒" U+0312 Mn 0 "COMBINING TURNED COMMA ABOVE" ) 787 : ( "̓" U+0313 Mn 0 "COMBINING COMMA ABOVE" ) 788 : ( "̔" U+0314 Mn 0 "COMBINING REVERSED COMMA ABOVE" ) 789 : ( "̕" U+0315 Mn 0 "COMBINING COMMA ABOVE RIGHT" ) 790 : ( "̖" U+0316 Mn 0 "COMBINING GRAVE ACCENT BELOW" ) 791 : ( "̗" U+0317 Mn 0 "COMBINING ACUTE ACCENT BELOW" ) 792 : ( "̘" U+0318 Mn 0 "COMBINING LEFT TACK BELOW" ) 793 : ( "̙" U+0319 Mn 0 "COMBINING RIGHT TACK BELOW" ) 794 : ( "̚" U+031A Mn 0 "COMBINING LEFT ANGLE ABOVE" ) 795 : ( "̛" U+031B Mn 0 "COMBINING HORN" ) 796 : ( "̜" U+031C Mn 0 "COMBINING LEFT HALF RING BELOW" ) 797 : ( "̝" U+031D Mn 0 "COMBINING UP TACK BELOW" ) 798 : ( "̞" U+031E Mn 0 "COMBINING DOWN TACK BELOW" ) 799 : ( "̟" U+031F Mn 0 "COMBINING PLUS SIGN BELOW" ) 800 : ( "̠" U+0320 Mn 0 "COMBINING MINUS SIGN BELOW" ) 801 : ( "̡" U+0321 Mn 0 
"COMBINING PALATALIZED HOOK BELOW" ) 802 : ( "̢" U+0322 Mn 0 "COMBINING RETROFLEX HOOK BELOW" ) 803 : ( "̣" U+0323 Mn 0 "COMBINING DOT BELOW" ) 804 : ( "̤" U+0324 Mn 0 "COMBINING DIAERESIS BELOW" ) 805 : ( "̥" U+0325 Mn 0 "COMBINING RING BELOW" ) 806 : ( "̦" U+0326 Mn 0 "COMBINING COMMA BELOW" ) 807 : ( "̧" U+0327 Mn 0 "COMBINING CEDILLA" ) 808 : ( "̨" U+0328 Mn 0 "COMBINING OGONEK" ) 809 : ( "̩" U+0329 Mn 0 "COMBINING VERTICAL LINE BELOW" ) 810 : ( "̪" U+032A Mn 0 "COMBINING BRIDGE BELOW" ) 811 : ( "̫" U+032B Mn 0 "COMBINING INVERTED DOUBLE ARCH BELOW" ) 812 : ( "̬" U+032C Mn 0 "COMBINING CARON BELOW" ) 813 : ( "̭" U+032D Mn 0 "COMBINING CIRCUMFLEX ACCENT BELOW" ) 814 : ( "̮" U+032E Mn 0 "COMBINING BREVE BELOW" ) 815 : ( "̯" U+032F Mn 0 "COMBINING INVERTED BREVE BELOW" ) 816 : ( "̰" U+0330 Mn 0 "COMBINING TILDE BELOW" ) 817 : ( "̱" U+0331 Mn 0 "COMBINING MACRON BELOW" ) 818 : ( "̲" U+0332 Mn 0 "COMBINING LOW LINE" ) 819 : ( "̳" U+0333 Mn 0 "COMBINING DOUBLE LOW LINE" ) 820 : ( "̴" U+0334 Mn 0 "COMBINING TILDE OVERLAY" ) 821 : ( "̵" U+0335 Mn 0 "COMBINING SHORT STROKE OVERLAY" ) 822 : ( "̶" U+0336 Mn 0 "COMBINING LONG STROKE OVERLAY" ) 823 : ( "̷" U+0337 Mn 0 "COMBINING SHORT SOLIDUS OVERLAY" ) 824 : ( "̸" U+0338 Mn 0 "COMBINING LONG SOLIDUS OVERLAY" ) 825 : ( "̹" U+0339 Mn 0 "COMBINING RIGHT HALF RING BELOW" ) 826 : ( "̺" U+033A Mn 0 "COMBINING INVERTED BRIDGE BELOW" ) 827 : ( "̻" U+033B Mn 0 "COMBINING SQUARE BELOW" ) 828 : ( "̼" U+033C Mn 0 "COMBINING SEAGULL BELOW" ) 829 : ( "̽" U+033D Mn 0 "COMBINING X ABOVE" ) 830 : ( "̾" U+033E Mn 0 "COMBINING VERTICAL TILDE" ) 831 : ( "̿" U+033F Mn 0 "COMBINING DOUBLE OVERLINE" ) 832 : ( "̀" U+0340 Mn 0 "COMBINING GRAVE TONE MARK" ) 833 : ( "́" U+0341 Mn 0 "COMBINING ACUTE TONE MARK" ) 834 : ( "͂" U+0342 Mn 0 "COMBINING GREEK PERISPOMENI" ) 835 : ( "̓" U+0343 Mn 0 "COMBINING GREEK KORONIS" ) 836 : ( "̈́" U+0344 Mn 0 "COMBINING GREEK DIALYTIKA TONOS" ) 837 : ( "ͅ" U+0345 Mn 0 "COMBINING GREEK YPOGEGRAMMENI" ) 838 : ( "͆" 
U+0346 Mn 0 "COMBINING BRIDGE ABOVE" ) 839 : ( "͇" U+0347 Mn 0 "COMBINING EQUALS SIGN BELOW" ) 840 : ( "͈" U+0348 Mn 0 "COMBINING DOUBLE VERTICAL LINE BELOW" ) 841 : ( "͉" U+0349 Mn 0 "COMBINING LEFT ANGLE BELOW" ) 842 : ( "͊" U+034A Mn 0 "COMBINING NOT TILDE ABOVE" ) 843 : ( "͋" U+034B Mn 0 "COMBINING HOMOTHETIC ABOVE" ) 844 : ( "͌" U+034C Mn 0 "COMBINING ALMOST EQUAL TO ABOVE" ) 845 : ( "͍" U+034D Mn 0 "COMBINING LEFT RIGHT ARROW BELOW" ) 846 : ( "͎" U+034E Mn 0 "COMBINING UPWARDS ARROW BELOW" ) 847 : ( "͏" U+034F Mn 0 "COMBINING GRAPHEME JOINER", "CGJ" ) 848 : ( "͐" U+0350 Mn 0 "COMBINING RIGHT ARROWHEAD ABOVE" ) 849 : ( "͑" U+0351 Mn 0 "COMBINING LEFT HALF RING ABOVE" ) 850 : ( "͒" U+0352 Mn 0 "COMBINING FERMATA" ) 851 : ( "͓" U+0353 Mn 0 "COMBINING X BELOW" ) 852 : ( "͔" U+0354 Mn 0 "COMBINING LEFT ARROWHEAD BELOW" ) 853 : ( "͕" U+0355 Mn 0 "COMBINING RIGHT ARROWHEAD BELOW" ) 854 : ( "͖" U+0356 Mn 0 "COMBINING RIGHT ARROWHEAD AND UP ARROWHEAD BELOW" ) 855 : ( "͗" U+0357 Mn 0 "COMBINING RIGHT HALF RING ABOVE" ) 856 : ( "͘" U+0358 Mn 0 "COMBINING DOT ABOVE RIGHT" ) 857 : ( "͙" U+0359 Mn 0 "COMBINING ASTERISK BELOW" ) 858 : ( "͚" U+035A Mn 0 "COMBINING DOUBLE RING BELOW" ) 859 : ( "͛" U+035B Mn 0 "COMBINING ZIGZAG ABOVE" ) 860 : ( "͜" U+035C Mn 0 "COMBINING DOUBLE BREVE BELOW" ) 861 : ( "͝" U+035D Mn 0 "COMBINING DOUBLE BREVE" ) 862 : ( "͞" U+035E Mn 0 "COMBINING DOUBLE MACRON" ) 863 : ( "͟" U+035F Mn 0 "COMBINING DOUBLE MACRON BELOW" ) 864 : ( "͠" U+0360 Mn 0 "COMBINING DOUBLE TILDE" ) 865 : ( "͡" U+0361 Mn 0 "COMBINING DOUBLE INVERTED BREVE" ) 866 : ( "͢" U+0362 Mn 0 "COMBINING DOUBLE RIGHTWARDS ARROW BELOW" ) 867 : ( "ͣ" U+0363 Mn 0 "COMBINING LATIN SMALL LETTER A" ) 868 : ( "ͤ" U+0364 Mn 0 "COMBINING LATIN SMALL LETTER E" ) 869 : ( "ͥ" U+0365 Mn 0 "COMBINING LATIN SMALL LETTER I" ) 870 : ( "ͦ" U+0366 Mn 0 "COMBINING LATIN SMALL LETTER O" ) 871 : ( "ͧ" U+0367 Mn 0 "COMBINING LATIN SMALL LETTER U" ) 872 : ( "ͨ" U+0368 Mn 0 "COMBINING LATIN SMALL LETTER C" ) 873 
: ( "ͩ" U+0369 Mn 0 "COMBINING LATIN SMALL LETTER D" ) 874 : ( "ͪ" U+036A Mn 0 "COMBINING LATIN SMALL LETTER H" ) 875 : ( "ͫ" U+036B Mn 0 "COMBINING LATIN SMALL LETTER M" ) 876 : ( "ͬ" U+036C Mn 0 "COMBINING LATIN SMALL LETTER R" ) 877 : ( "ͭ" U+036D Mn 0 "COMBINING LATIN SMALL LETTER T" ) 878 : ( "ͮ" U+036E Mn 0 "COMBINING LATIN SMALL LETTER V" ) 879 : ( "ͯ" U+036F Mn 0 "COMBINING LATIN SMALL LETTER X" ) 880 : ( "Ͱ" U+0370 Lu 1 "GREEK CAPITAL LETTER HETA" ) 881 : ( "ͱ" U+0371 Ll 1 "GREEK SMALL LETTER HETA" ) 882 : ( "Ͳ" U+0372 Lu 1 "GREEK CAPITAL LETTER ARCHAIC SAMPI" ) 883 : ( "ͳ" U+0373 Ll 1 "GREEK SMALL LETTER ARCHAIC SAMPI" ) 884 : ( "ʹ" U+0374 Lm 1 "GREEK NUMERAL SIGN" ) 885 : ( "͵" U+0375 Sk 1 "GREEK LOWER NUMERAL SIGN" ) 886 : ( "Ͷ" U+0376 Lu 1 "GREEK CAPITAL LETTER PAMPHYLIAN DIGAMMA" ) 887 : ( "ͷ" U+0377 Ll 1 "GREEK SMALL LETTER PAMPHYLIAN DIGAMMA" ) 890 : ( "ͺ" U+037A Lm 1 "GREEK YPOGEGRAMMENI" ) 891 : ( "ͻ" U+037B Ll 1 "GREEK SMALL REVERSED LUNATE SIGMA SYMBOL" ) 892 : ( "ͼ" U+037C Ll 1 "GREEK SMALL DOTTED LUNATE SIGMA SYMBOL" ) 893 : ( "ͽ" U+037D Ll 1 "GREEK SMALL REVERSED DOTTED LUNATE SIGMA SYMBOL" ) 894 : ( ";" U+037E Po 1 "GREEK QUESTION MARK" ) 895 : ( "Ϳ" U+037F Lu 1 "GREEK CAPITAL LETTER YOT" ) 900 : ( "΄" U+0384 Sk 1 "GREEK TONOS" ) 901 : ( "΅" U+0385 Sk 1 "GREEK DIALYTIKA TONOS" ) 902 : ( "Ά" U+0386 Lu 1 "GREEK CAPITAL LETTER ALPHA WITH TONOS" ) 903 : ( "·" U+0387 Po 1 "GREEK ANO TELEIA" ) 904 : ( "Έ" U+0388 Lu 1 "GREEK CAPITAL LETTER EPSILON WITH TONOS" ) 905 : ( "Ή" U+0389 Lu 1 "GREEK CAPITAL LETTER ETA WITH TONOS" ) 906 : ( "Ί" U+038A Lu 1 "GREEK CAPITAL LETTER IOTA WITH TONOS" ) 908 : ( "Ό" U+038C Lu 1 "GREEK CAPITAL LETTER OMICRON WITH TONOS" ) 910 : ( "Ύ" U+038E Lu 1 "GREEK CAPITAL LETTER UPSILON WITH TONOS" ) 911 : ( "Ώ" U+038F Lu 1 "GREEK CAPITAL LETTER OMEGA WITH TONOS" ) 912 : ( "ΐ" U+0390 Ll 1 "GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS" ) 913 : ( "Α" U+0391 Lu 1 "GREEK CAPITAL LETTER ALPHA" ) 914 : ( "Β" U+0392 Lu 1 "GREEK 
CAPITAL LETTER BETA" ) 915 : ( "Γ" U+0393 Lu 1 "GREEK CAPITAL LETTER GAMMA" ) 916 : ( "Δ" U+0394 Lu 1 "GREEK CAPITAL LETTER DELTA" ) 917 : ( "Ε" U+0395 Lu 1 "GREEK CAPITAL LETTER EPSILON" ) 918 : ( "Ζ" U+0396 Lu 1 "GREEK CAPITAL LETTER ZETA" ) 919 : ( "Η" U+0397 Lu 1 "GREEK CAPITAL LETTER ETA" ) 920 : ( "Θ" U+0398 Lu 1 "GREEK CAPITAL LETTER THETA" ) 921 : ( "Ι" U+0399 Lu 1 "GREEK CAPITAL LETTER IOTA" ) 922 : ( "Κ" U+039A Lu 1 "GREEK CAPITAL LETTER KAPPA" ) 923 : ( "Λ" U+039B Lu 1 "GREEK CAPITAL LETTER LAMDA" ) 924 : ( "Μ" U+039C Lu 1 "GREEK CAPITAL LETTER MU" ) 925 : ( "Ν" U+039D Lu 1 "GREEK CAPITAL LETTER NU" ) 926 : ( "Ξ" U+039E Lu 1 "GREEK CAPITAL LETTER XI" ) 927 : ( "Ο" U+039F Lu 1 "GREEK CAPITAL LETTER OMICRON" ) 928 : ( "Π" U+03A0 Lu 1 "GREEK CAPITAL LETTER PI" ) 929 : ( "Ρ" U+03A1 Lu 1 "GREEK CAPITAL LETTER RHO" ) 931 : ( "Σ" U+03A3 Lu 1 "GREEK CAPITAL LETTER SIGMA" ) 932 : ( "Τ" U+03A4 Lu 1 "GREEK CAPITAL LETTER TAU" ) 933 : ( "Υ" U+03A5 Lu 1 "GREEK CAPITAL LETTER UPSILON" ) 934 : ( "Φ" U+03A6 Lu 1 "GREEK CAPITAL LETTER PHI" ) 935 : ( "Χ" U+03A7 Lu 1 "GREEK CAPITAL LETTER CHI" ) 936 : ( "Ψ" U+03A8 Lu 1 "GREEK CAPITAL LETTER PSI" ) 937 : ( "Ω" U+03A9 Lu 1 "GREEK CAPITAL LETTER OMEGA" ) 938 : ( "Ϊ" U+03AA Lu 1 "GREEK CAPITAL LETTER IOTA WITH DIALYTIKA" ) 939 : ( "Ϋ" U+03AB Lu 1 "GREEK CAPITAL LETTER UPSILON WITH DIALYTIKA" ) 940 : ( "ά" U+03AC Ll 1 "GREEK SMALL LETTER ALPHA WITH TONOS" ) 941 : ( "έ" U+03AD Ll 1 "GREEK SMALL LETTER EPSILON WITH TONOS" ) 942 : ( "ή" U+03AE Ll 1 "GREEK SMALL LETTER ETA WITH TONOS" ) 943 : ( "ί" U+03AF Ll 1 "GREEK SMALL LETTER IOTA WITH TONOS" ) 944 : ( "ΰ" U+03B0 Ll 1 "GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS" ) 945 : ( "α" U+03B1 Ll 1 "GREEK SMALL LETTER ALPHA" ) 946 : ( "β" U+03B2 Ll 1 "GREEK SMALL LETTER BETA" ) 947 : ( "γ" U+03B3 Ll 1 "GREEK SMALL LETTER GAMMA" ) 948 : ( "δ" U+03B4 Ll 1 "GREEK SMALL LETTER DELTA" ) 949 : ( "ε" U+03B5 Ll 1 "GREEK SMALL LETTER EPSILON" ) 950 : ( "ζ" U+03B6 Ll 1 "GREEK SMALL LETTER 
ZETA" ) 951 : ( "η" U+03B7 Ll 1 "GREEK SMALL LETTER ETA" ) 952 : ( "θ" U+03B8 Ll 1 "GREEK SMALL LETTER THETA" ) 953 : ( "ι" U+03B9 Ll 1 "GREEK SMALL LETTER IOTA" ) 954 : ( "κ" U+03BA Ll 1 "GREEK SMALL LETTER KAPPA" ) 955 : ( "λ" U+03BB Ll 1 "GREEK SMALL LETTER LAMDA" ) 956 : ( "μ" U+03BC Ll 1 "GREEK SMALL LETTER MU" ) 957 : ( "ν" U+03BD Ll 1 "GREEK SMALL LETTER NU" ) 958 : ( "ξ" U+03BE Ll 1 "GREEK SMALL LETTER XI" ) 959 : ( "ο" U+03BF Ll 1 "GREEK SMALL LETTER OMICRON" ) 960 : ( "π" U+03C0 Ll 1 "GREEK SMALL LETTER PI" ) 961 : ( "ρ" U+03C1 Ll 1 "GREEK SMALL LETTER RHO" ) 962 : ( "ς" U+03C2 Ll 1 "GREEK SMALL LETTER FINAL SIGMA" ) 963 : ( "σ" U+03C3 Ll 1 "GREEK SMALL LETTER SIGMA" ) 964 : ( "τ" U+03C4 Ll 1 "GREEK SMALL LETTER TAU" ) 965 : ( "υ" U+03C5 Ll 1 "GREEK SMALL LETTER UPSILON" ) 966 : ( "φ" U+03C6 Ll 1 "GREEK SMALL LETTER PHI" ) 967 : ( "χ" U+03C7 Ll 1 "GREEK SMALL LETTER CHI" ) 968 : ( "ψ" U+03C8 Ll 1 "GREEK SMALL LETTER PSI" ) 969 : ( "ω" U+03C9 Ll 1 "GREEK SMALL LETTER OMEGA" ) 970 : ( "ϊ" U+03CA Ll 1 "GREEK SMALL LETTER IOTA WITH DIALYTIKA" ) 971 : ( "ϋ" U+03CB Ll 1 "GREEK SMALL LETTER UPSILON WITH DIALYTIKA" ) 972 : ( "ό" U+03CC Ll 1 "GREEK SMALL LETTER OMICRON WITH TONOS" ) 973 : ( "ύ" U+03CD Ll 1 "GREEK SMALL LETTER UPSILON WITH TONOS" ) 974 : ( "ώ" U+03CE Ll 1 "GREEK SMALL LETTER OMEGA WITH TONOS" ) 975 : ( "Ϗ" U+03CF Lu 1 "GREEK CAPITAL KAI SYMBOL" ) 976 : ( "ϐ" U+03D0 Ll 1 "GREEK BETA SYMBOL" ) 977 : ( "ϑ" U+03D1 Ll 1 "GREEK THETA SYMBOL" ) 978 : ( "ϒ" U+03D2 Lu 1 "GREEK UPSILON WITH HOOK SYMBOL" ) 979 : ( "ϓ" U+03D3 Lu 1 "GREEK UPSILON WITH ACUTE AND HOOK SYMBOL" ) 980 : ( "ϔ" U+03D4 Lu 1 "GREEK UPSILON WITH DIAERESIS AND HOOK SYMBOL" ) 981 : ( "ϕ" U+03D5 Ll 1 "GREEK PHI SYMBOL" ) 982 : ( "ϖ" U+03D6 Ll 1 "GREEK PI SYMBOL" ) 983 : ( "ϗ" U+03D7 Ll 1 "GREEK KAI SYMBOL" ) 984 : ( "Ϙ" U+03D8 Lu 1 "GREEK LETTER ARCHAIC KOPPA" ) 985 : ( "ϙ" U+03D9 Ll 1 "GREEK SMALL LETTER ARCHAIC KOPPA" ) 986 : ( "Ϛ" U+03DA Lu 1 "GREEK LETTER STIGMA" ) 987 : ( "ϛ" U+03DB Ll 
1 "GREEK SMALL LETTER STIGMA" ) 988 : ( "Ϝ" U+03DC Lu 1 "GREEK LETTER DIGAMMA" ) 989 : ( "ϝ" U+03DD Ll 1 "GREEK SMALL LETTER DIGAMMA" ) 990 : ( "Ϟ" U+03DE Lu 1 "GREEK LETTER KOPPA" ) 991 : ( "ϟ" U+03DF Ll 1 "GREEK SMALL LETTER KOPPA" ) 992 : ( "Ϡ" U+03E0 Lu 1 "GREEK LETTER SAMPI" ) 993 : ( "ϡ" U+03E1 Ll 1 "GREEK SMALL LETTER SAMPI" ) 994 : ( "Ϣ" U+03E2 Lu 1 "COPTIC CAPITAL LETTER SHEI" ) 995 : ( "ϣ" U+03E3 Ll 1 "COPTIC SMALL LETTER SHEI" ) 996 : ( "Ϥ" U+03E4 Lu 1 "COPTIC CAPITAL LETTER FEI" ) 997 : ( "ϥ" U+03E5 Ll 1 "COPTIC SMALL LETTER FEI" ) 998 : ( "Ϧ" U+03E6 Lu 1 "COPTIC CAPITAL LETTER KHEI" ) 999 : ( "ϧ" U+03E7 Ll 1 "COPTIC SMALL LETTER KHEI" ) 1000 : ( "Ϩ" U+03E8 Lu 1 "COPTIC CAPITAL LETTER HORI" ) 1001 : ( "ϩ" U+03E9 Ll 1 "COPTIC SMALL LETTER HORI" ) 1002 : ( "Ϫ" U+03EA Lu 1 "COPTIC CAPITAL LETTER GANGIA" ) 1003 : ( "ϫ" U+03EB Ll 1 "COPTIC SMALL LETTER GANGIA" ) 1004 : ( "Ϭ" U+03EC Lu 1 "COPTIC CAPITAL LETTER SHIMA" ) 1005 : ( "ϭ" U+03ED Ll 1 "COPTIC SMALL LETTER SHIMA" ) 1006 : ( "Ϯ" U+03EE Lu 1 "COPTIC CAPITAL LETTER DEI" ) 1007 : ( "ϯ" U+03EF Ll 1 "COPTIC SMALL LETTER DEI" ) 1008 : ( "ϰ" U+03F0 Ll 1 "GREEK KAPPA SYMBOL" ) ... 
-- The last 10 characters ooRexx> .unicode~characters~pipe(.take "last" 10 | .console) 917990 : ( "󠇦" U+E01E6 Mn 0 "VARIATION SELECTOR-247", "VS247" ) 917991 : ( "󠇧" U+E01E7 Mn 0 "VARIATION SELECTOR-248", "VS248" ) 917992 : ( "󠇨" U+E01E8 Mn 0 "VARIATION SELECTOR-249", "VS249" ) 917993 : ( "󠇩" U+E01E9 Mn 0 "VARIATION SELECTOR-250", "VS250" ) 917994 : ( "󠇪" U+E01EA Mn 0 "VARIATION SELECTOR-251", "VS251" ) 917995 : ( "󠇫" U+E01EB Mn 0 "VARIATION SELECTOR-252", "VS252" ) 917996 : ( "󠇬" U+E01EC Mn 0 "VARIATION SELECTOR-253", "VS253" ) 917997 : ( "󠇭" U+E01ED Mn 0 "VARIATION SELECTOR-254", "VS254" ) 917998 : ( "󠇮" U+E01EE Mn 0 "VARIATION SELECTOR-255", "VS255" ) 917999 : ( "󠇯" U+E01EF Mn 0 "VARIATION SELECTOR-256", "VS256" ) -- get a character by codepoint ooRexx> .unicode~character(8203)= -- (U+200B Cf "ZERO WIDTH SPACE") ( "" U+200B Cf 0 "ZERO WIDTH SPACE", "ZWSP" ) ooRexx> .unicode~character("U+200B")= -- (U+200B Cf "ZERO WIDTH SPACE") ( "" U+200B Cf 0 "ZERO WIDTH SPACE", "ZWSP" ) ooRexx> .unicode~character("u+200b")= -- (U+200B Cf "ZERO WIDTH SPACE") ( "" U+200B Cf 0 "ZERO WIDTH SPACE", "ZWSP" ) -- get a character by name. -- loose matching name. See https://unicode.org/reports/tr44/#UAX44-LM2 ooRexx> .unicode~character("ZERO WIDTH SPACE")= -- (U+200B Cf "ZERO WIDTH SPACE") ( "" U+200B Cf 0 "ZERO WIDTH SPACE", "ZWSP" ) ooRexx> .unicode~character("ZERO_WIDTH-SPACE")= -- (U+200B Cf "ZERO WIDTH SPACE") ( "" U+200B Cf 0 "ZERO WIDTH SPACE", "ZWSP" ) ooRexx> .unicode~character("ZEROWIDTHSPACE")= -- (U+200B Cf "ZERO WIDTH SPACE") ( "" U+200B Cf 0 "ZERO WIDTH SPACE", "ZWSP" ) ooRexx> .unicode~character("zerowidthspace")= -- (U+200B Cf "ZERO WIDTH SPACE") ( "" U+200B Cf 0 "ZERO WIDTH SPACE", "ZWSP" ) -- select characters using a matcher -- remember: it's better to initialize the matcher outside the iteration. 
ooRexx> matcher = "*chris*"~matcher; .unicode~characters~select{expose matcher; matcher~(item~name)}== an Array (shape [3], 3 items) 1 : ( "🎄" U+1F384 So 2 "CHRISTMAS TREE" ) 2 : ( "🎅" U+1F385 So 2 "FATHER CHRISTMAS" ) 3 : ( "🤶" U+1F936 So 2 "MOTHER CHRISTMAS" ) -- string character names ooRexx> "noël👩👨👩👧🎅"~text~codepoints~each{uchar = .unicode~character(item); uchar~charWidth uchar~categoryName uchar~name}== an Array (shape [12], 12 items) 1 : '1 Ll LATIN SMALL LETTER N' 2 : '1 Ll LATIN SMALL LETTER O' 3 : '1 Ll LATIN SMALL LETTER E WITH DIAERESIS' 4 : '1 Ll LATIN SMALL LETTER L' 5 : '2 So WOMAN' 6 : '0 Cf ZERO WIDTH JOINER' 7 : '2 So MAN' 8 : '0 Cf ZERO WIDTH JOINER' 9 : '2 So WOMAN' 10 : '0 Cf ZERO WIDTH JOINER' 11 : '2 So GIRL' 12 : '2 So FATHER CHRISTMAS' -- shortest name: ooRexx> .unicode~characters~reduce{if accu~name~length > item~name~length, item~name~length <> 0 then item }= ( "" U+0000 Cc 0 "", "NULL", "NUL" ) -- longest name: ooRexx> .unicode~characters~reduce{if accu~name~length < item~name~length then item }= ( "🮨" U+1FBA8 So 1 "BOX DRAWINGS LIGHT DIAGONAL UPPER CENTRE TO MIDDLE LEFT AND MIDDLE RIGHT TO LOWER CENTRE" ) -- =============================================================================== -- 2021 September 12 /* [String chunks] The functionality of splitting text by quoted/unquoted chunks is moved from ooRexxShell to a dedicated package: extension/stringChunk.cls (compatible with official ooRexx) The initial need was to parse a command line and split it the same way as a cmd or bash shell. Also used to parse the queries in ooRexxShell. The quotes are removed, but each character is associated to a 'quote flag' to remember if the character was inside a quoted section. These flags are typically used by the matchers of type string pattern, to decide if a character can be special or not. 
Description: routine stringChunks use strict arg string, withInfos=.false, breakTokens="", splitLevel=1 Converts a string to an array of String or to an array of StringChunk. The type of result is indicated by the argument withInfos: - If withInfos == .false (default) then the result is an array of String. - If withInfos == .true then the result is an array of StringChunk. A StringChunk is a substring which references the start and end character in its container. It's associated with a string of booleans (quotedFlags) which indicate for each character if it was inside a quoted section. A quote is either " or '. An unquoted section is split into StringChunks delimited by whitespaces (anything <= 32) and break tokens. A quoted section is not split: - Whitespaces are kept, - single occurrences of quotes are removed, - double occurrences of quotes are replaced by a single embedded quote, - break tokens and escape characters are ignored. An escape character is any character passed in the argument escapeCharacters. An escape character sets the quote flag of the next character to 1. Escape characters are removed, even if they are not followed by another character (truncated string). Example with 'a' declared escape character: - "a" --> "" - "aa" --> "a" - "aaa" --> "a" - "aaaa" --> "aa" If a quote is declared escape character, there is no impact: a quote is already an escape mechanism. If a space is declared escape character, there is an impact when splitLevel=0: the quote flag of a character following an unquoted space is set to 1, the unquoted spaces are removed. Example: 'one two "three four" five six' --> onetwothree fourfivesix 00010011111111111000100 Break tokens are passed in the argument breakTokens. A break token cannot contain spaces. The break tokens can be case sensitive (default) or case insensitive. Each break token can be prefixed by: - cs: case sensitive - ci: case insensitive - cl: caseless (synonym of case insensitive) Any other prefix is not an error. 
It's just not a case prefix. If a quote is declared break token then it's no longer recognized as a quote. If an escape character is declared break token then it's no longer recognized as an escape character. The split process is controlled by the argument splitLevel: - If splitLevel == 0 then the string is not split but the quotes and escape characters are managed, quotedFlags is set. 'xx aa"b b"cc"d d"ee yy' is 1 StringChunk. - If splitLevel == 1 (default) then adjacent quoted/unquoted sections are kept glued. 'xx aa"b b"cc"d d"ee yy' is 3 StringChunk: xx "aab bccd dee" yy - If splitLevel == 2 then adjacent quoted/unquoted sections are separated. 'xx aa"b b"cc"d d"ee yy' is split into 7 StringChunk: xx aa "b b" cc "d d" ee yy Illustration with splitLevel=1: 11111111111111111111111111 222222222222222 333333333333333333333 '"hello "John" how are you" good" bye "John "my name is ""BOND"""' 0000000001111111111222222222233333333334444444444555555555566666 1234567890123456789012345678901234567890123456789012345678901234 arg1 = |hello John how are you| containerStart = 01 containerEnd = 26 quotedFlags = 1111110000111111111111 arg2 = |good bye John| containerStart = 28 containerEnd = 42 quotedFlags = 0000111110000 arg3 = |my name is "BOND"| containerStart = 44 containerEnd = 64 quotedFlags = 11111111111111111 Extensions available in Executor only: .String~chunk withInfos is true, splitLevel is 0 --> always returns ONE StringChunk .String~chunks withInfos is true by default, splitLevel is 1 by default Examples: */ ooRexx> -- splitLevel = 0: no split ooRexx> 'aa"b\ b"cc"d\ d"ee\* ff'~chunks(splitLevel:0)~each{item~sayDescription(25, index, 2)} 1 |aab\ bccd\ dee\* ff| 01 23 |aa"b\ b"cc"d\ d"ee\* ff| 1 |0011110011110000000| /* 1 |aab\ bccd\ dee\* ff| 01 23 |aa"b\ b"cc"d\ d"ee\* ff| 1 |0011110011110000000| */ ooRexx> -- splitLevel = 1: Adjacent quoted/unquoted sections are kept glued ooRexx> 'aa"b\ b"cc"d\ d"ee\* ff'~chunks(splitLevel:1)~each{item~sayDescription(25, 
index, 2)} 1 |aab\ bccd\ dee\*| 01 20 |aa"b\ b"cc"d\ d"ee\*| 1 |0011110011110000| 2 |ff| 22 23 |ff| 2 |00| /* 1 |aab\ bccd\ dee\*| 01 20 |aa"b\ b"cc"d\ d"ee\*| 1 |0011110011110000| 2 |ff| 22 23 |ff| 2 |00| */ ooRexx> -- splitLevel = 2: Adjacent quoted/unquoted sections are separated ooRexx> 'aa"b\ b"cc"d\ d"ee\* ff'~chunks(splitLevel:2)~each{item~sayDescription(25, index, 2)} 1 |aa| 01 02 |aa| 1 |00| 2 |b\ b| 03 08 |"b\ b"| 2 |1111| 3 |cc| 09 10 |cc| 3 |00| 4 |d\ d| 11 16 |"d\ d"| 4 |1111| 5 |ee\*| 17 20 |ee\*| 5 |0000| 6 |ff| 22 23 |ff| 6 |00| /* 1 |aa| 01 02 |aa| 1 |00| 2 |b\ b| 03 08 |"b\ b"| 2 |1111| 3 |cc| 09 10 |cc| 3 |00| 4 |d\ d| 11 16 |"d\ d"| 4 |1111| 5 |ee\*| 17 20 |ee\*| 5 |0000| 6 |ff| 22 23 |ff| 6 |00| */ ooRexx> -- Default splitLevel (1) ooRexx> -- The quote is declared break token, there is no more quoted sections, and the quote itself is returned ooRexx> 'aa"b\ b"cc"d\ d"ee\* ff'~chunks(breakTokens: '"')~each{item~sayDescription(25, index, 2)} 1 |aa| 01 02 |aa| 1 |00| 2 |"| 03 03 |"| 2 |0| 3 |b\| 04 05 |b\| 3 |00| 4 |b| 07 07 |b| 4 |0| 5 |"| 08 08 |"| 5 |0| 6 |cc| 09 10 |cc| 6 |00| 7 |"| 11 11 |"| 7 |0| 8 |d\| 12 13 |d\| 8 |00| 9 |d| 15 15 |d| 9 |0| 10 |"| 16 16 |"| 10 |0| 11 |ee\*| 17 20 |ee\*| 11 |0000| 12 |ff| 22 23 |ff| 12 |00| /* 1 |aa| 01 02 |aa| 1 |00| 2 |"| 03 03 |"| 2 |0| 3 |b\| 04 05 |b\| 3 |00| 4 |b| 07 07 |b| 4 |0| 5 |"| 08 08 |"| 5 |0| 6 |cc| 09 10 |cc| 6 |00| 7 |"| 11 11 |"| 7 |0| 8 |d\| 12 13 |d\| 8 |00| 9 |d| 15 15 |d| 9 |0| 10 |"| 16 16 |"| 10 |0| 11 |ee\*| 17 20 |ee\*| 11 |0000| 12 |ff| 22 23 |ff| 12 |00| */ ooRexx> -- Same as previous, plus \ which is declared escape character ooRexx> 'aa"b\ b"cc"d\ d"ee\* ff'~chunks(breakTokens: '"', escapeCharacters:"\")~each{item~sayDescription(25, index, 2)} 1 |aa| 01 02 |aa| 1 |00| 2 |"| 03 03 |"| 2 |0| 3 |b b| 04 07 |b\ b| 3 |010| 4 |"| 08 08 |"| 4 |0| 5 |cc| 09 10 |cc| 5 |00| 6 |"| 11 11 |"| 6 |0| 7 |d d| 12 15 |d\ d| 7 |010| 8 |"| 16 16 |"| 8 |0| 9 |ee*| 17 20 |ee\*| 9 |001| 10 |ff| 22 23 
|ff| 10 |00| /* 1 |aa| 01 02 |aa| 1 |00| 2 |"| 03 03 |"| 2 |0| 3 |b b| 04 07 |b\ b| 3 |010| 4 |"| 08 08 |"| 4 |0| 5 |cc| 09 10 |cc| 5 |00| 6 |"| 11 11 |"| 6 |0| 7 |d d| 12 15 |d\ d| 7 |010| 8 |"| 16 16 |"| 8 |0| 9 |ee*| 17 20 |ee\*| 9 |001| 10 |ff| 22 23 |ff| 10 |00| */ ooRexx> -- A break token can be made of several characters, and can contain a quote ooRexx> 'aa"b\ b"cc"d\ d"ee\* ff'~chunks(breakTokens: ' a"b ')~each{item~sayDescription(25, index, 2)} 1 |a| 01 01 |a| 1 |0| 2 |a"b| 02 04 |a"b| 2 |000| 3 |\| 05 05 |\| 3 |0| 4 |bccd\| 07 13 |b"cc"d\| 4 |01100| 5 |dee\* ff| 15 23 |d"ee\* ff| 5 |01111111| /* 1 |a| 01 01 |a| 1 |0| 2 |a"b| 02 04 |a"b| 2 |000| 3 |\| 05 05 |\| 3 |0| 4 |bccd\| 07 13 |b"cc"d\| 4 |01100| 5 |dee\* ff| 15 23 |d"ee\* ff| 5 |01111111| */ ooRexx> -- If an escape character is also declared break token then it's no longer an escape character ooRexx> 'aa"b\ b"cc"d\ d"ee\* ff'~chunks(breakTokens:"\", escapeCharacters:"\")~each{item~sayDescription(25, index, 2)}== 1 |aab\ bccd\ dee| 01 18 |aa"b\ b"cc"d\ d"ee| 1 |00111100111100| 2 |\| 19 19 |\| 2 |0| 3 |*| 20 20 |*| 3 |0| 4 |ff| 22 23 |ff| 4 |00| an Array (no shape, 0 items) /* 1 |aab\ bccd\ dee| 01 18 |aa"b\ b"cc"d\ d"ee| 1 |00111100111100| 2 |\| 19 19 |\| 2 |0| 3 |*| 20 20 |*| 3 |0| 4 |ff| 22 23 |ff| 4 |00| */ ooRexx> -- A break token can contain characters that are declared escape character ooRexx> 'aa"b\ b"cc"d\ d"ee\* ff'~chunks(breakTokens:"e\*", escapeCharacters:"\*")~each{item~sayDescription(25, index, 2)}== 1 |aab\ bccd\ de| 01 17 |aa"b\ b"cc"d\ d"e| 1 |0011110011110| 2 |e\*| 18 20 |e\*| 2 |000| 3 |ff| 22 23 |ff| 3 |00| an Array (no shape, 0 items) /* 1 |aab\ bccd\ de| 01 17 |aa"b\ b"cc"d\ d"e| 1 |0011110011110| 2 |e\*| 18 20 |e\*| 2 |000| 3 |ff| 22 23 |ff| 3 |00| */ ooRexx> -- A break token can be case insensitive (prefix ci: or cl:) ooRexx> '1Plus2'~chunks(breakTokens:"ci:plus")~each{item~sayDescription(25, index, 2)} 1 |1| 1 1 |1| 1 |0| 2 |Plus| 2 5 |Plus| 2 |0000| 3 |2| 6 6 |2| 3 |0| /* 1 
|1| 1 1 |1| 1 |0| 2 |Plus| 2 5 |Plus| 2 |0000| 3 |2| 6 6 |2| 3 |0| */ /* [String patterns] The functionality of selecting text using patterns is moved from ooRexxShell to a dedicated package: extension/stringChunkExtended.cls (not compatible with official ooRexx) Description .StringChunk~matcher use strict named arg wholeString(1)=.true, caseless(1)=.true,- trace(1)=.false, displayer(1)=.traceOutput, prefix(1)="" Pattern matching by equality (whole) or by inclusion (not whole), caseless or not. If the package regex.cls is loaded, then the pattern (a StringChunk) can be a regular expression prefixed by "/". When whole, and the pattern is not a regular expression, then the character "*" is recognized as a generic character when first or last character. When not whole, and the pattern is not a regular expression, then the character "^" is recognized as the metacharacter 'beginning of string' when first character. When not whole, and the pattern is not a regular expression, then the character "$" is recognized as the metacharacter 'end of string' when last character. The returned result is a closure (matcher) which implements the pattern matching, or .nil if error. The pattern matching is tested when the closure is evaluated with a string passed as argument.
Examples: '*' or '**' : matches everything '"*"' or '"**"' : matches exactly "*" or "**", see case stringPattern '***' : matches all names containing "*", see case *stringPattern* '*"*"*' : matches all names containing "*", see case *stringPattern* '*"**"*' : matches all names containing "**", see case *stringPattern* '*stringPattern' : string~right(stringPattern~length)~caselessEquals(stringPattern) 'stringPattern*' : string~left(stringPattern~length)~caselessEquals(stringPattern) '*stringPattern*': string~caselessPos(stringPattern) <> 0 'stringPattern' : string~caselessEquals(stringPattern) */ ooRexx> -- caseless equality ooRexx> matcher = "object"~matcher ooRexx> say matcher~("ObjeCt") -- true 1 ooRexx> say matcher~("my ObjeCt") -- false 0 ooRexx> -- caseless equality with generic character ooRexx> matcher = "*object"~matcher ooRexx> say matcher~("ObjeCt") -- true 1 ooRexx> say matcher~("my ObjeCt") -- true 1 ooRexx> -- caseless inclusion ooRexx> matcher = "object"~matcher(wholeString:.false) ooRexx> say matcher~("ObjeCt") -- true 1 ooRexx> say matcher~("my ObjeCt") -- true 1 ooRexx> -- caseless inclusion, regular expression: "object" at the beginning or at the end.
ooRexx> matcher = "/^object|object$"~matcher(wholeString:.false) ooRexx> say matcher~("ObjeCt") -- true 1 ooRexx> say matcher~("my ObjeCt") -- true 1 ooRexx> say matcher~("my ObjeCts") -- false 0 ooRexx> -- trace ooRexx> "*stringPattern"~matcher(trace:.true) description: stringChunkPattern="*stringPattern" wholeString=1 caseless=1 stringPattern="stringPattern" matcher: expose description stringPattern; use strict arg string; return string~right(stringPattern~length)~caselessEquals(stringPattern) /* output: description: stringChunkPattern="*stringPattern" wholeString=1 caseless=1 stringPattern="stringPattern" matcher: expose description stringPattern; use strict arg string; return string~right(stringPattern~length)~caselessEquals(stringPattern) */ ooRexx> -- trace when regular expression ooRexx> "/.*stringPattern"~matcher(trace:.true) description: stringChunkPattern="/.*stringPattern" wholeString=1 caseless=1 stringPattern=".*stringPattern" pattern = .Pattern~compile(stringPattern, .RegexCompiler~new(.RegexCompiler~caseless)) matcher: expose description pattern; use strict arg string; return pattern~matches(string) /* output: description: stringChunkPattern="/.*stringPattern" wholeString=1 caseless=1 stringPattern=".*stringPattern" pattern = .Pattern~compile(stringPattern, .RegexCompiler~new(.RegexCompiler~caseless)) matcher: expose description pattern; use strict arg string; return pattern~matches(string) */ -- =============================================================================== -- 2021 August 11 /* Added support for strings of codepoints encoded as native integers. 3 representations: Unicode8_Encoding Unicode16_Encoding Unicode32_Encoding. The method ~unicode returns one of these encodings, depending on the character with the largest Unicode codepoint (1, 2, or 4 bytes) in the source string. Unlike the flexible representation of Python, the 3 representations are first-class. No BOM, the endianness is the CPU one. This is for internal use only.
Unicode32_Encoding can be used with utf8proc for the functions taking a buffer of 32-bit integers. */ ooRexx> "côté"~text("unicode8")= -- T'côté Just an interpretative layer put above the string T'côté' ooRexx> "côté"~text("unicode8")~pipe{item~description(short:1) ":" item~c2x}= 'Unicode8 not-ASCII : 63 C3 B4 74 C3 A9' -- 'Unicode8 not-ASCII : 63 C3 B4 74 C3 A9 ooRexx> "côté"~text~unicode= -- T'c?t?' UTF-8 converted to Unicode8 T'c�t�' ooRexx> "côté"~text~unicode~pipe{item~description(short:1) ":" item~c2x}= 'Unicode8 not-ASCII : 63 F4 74 E9' -- 'Unicode8 not-ASCII : 63 F4 74 E9 ooRexx> "noël👨👩👧"~text~maximumCodepoint~pipe{"U+"item~d2x}= -- U+1F469 is the maximum codepoint 'U+1F469' ooRexx> "noël👨👩👧"~text~unicode~description(technical:1)= -- For this maximum codepoint, we need Unicode32 'Unicode32 (5 characters (1 index from index 5), 10 codepoints (0 index), 40 bytes, 0 error)' -- 'Unicode32 not-ASCII (5 graphemes (1 index from index 5), 10 codepoints (0 index), 40 bytes, 0 error)' -- The endianness of the UnicodeXX_Encoding is the one of the machine. -- With an Intel CPU, it's little-endian. ooRexx> "noël👨👩👧"~text~unicode~c2x= '6E000000 6F000000 EB000000 6C000000 0D200000 68F40100 0D200000 69F40100 0D200000 67F40100' -- '6E000000 6F000000 EB000000 6C000000 0D200000 68F40100 0D200000 69F40100 0D200000 67F40100' -- The default endianness for UTF32 is big-endian. ooRexx> "noël👨👩👧"~text~utf32~c2x= '0000006E 0000006F 000000EB 0000006C 0000200D 0001F468 0000200D 0001F469 0000200D 0001F467' -- '0000006E 0000006F 000000EB 0000006C 0000200D 0001F468 0000200D 0001F469 0000200D 0001F467' -- =============================================================================== -- 2021 may 31 /* Encodeded strings. The ooRexx programmer has the choice: - working with String at byte level - working with RexxText at grapheme level. - the same instance of String is used in both cases. 
aString ▲ text --------> aRexxText │ indexer (anEncoding) │ codepoints (sequential access) │ graphemes (direct access) +-----------------------<- string */ -- First binding of utf8proc, for the detection of grapheme cluster break. ooRexx> "( ͡° ͜ʖ ͡°)"~text~description= -- 'UTF-8 not-ASCII ( 9 graphemes, 12 codepoints, 20 bytes )' 'UTF-8 not-ASCII (9 characters, 12 codepoints, 20 bytes, 0 error)' ooRexx> "( ͡° ͜ʖ ͡°)"~text~graphemes~each{item~c2x}= -- [ 28,'20CDA1','C2B0','20CD9C','CA96','EFBBBF','20CDA1','C2B0', 29] [ 28,'20 CDA1','C2B0','20 CD9C','CA96','EFBBBF','20 CDA1','C2B0', 29] -- Classes in relation with Unicode and encoded strings: ooRexx> ?c *encoding* *encoded* *indexer* *codepoint* *grapheme* *RexxText* *Unicode* P. 'Byte_Encoding' : (byte_encoding.cls) P. 'CodePointSupplier' : (stringIndexer.cls) .M 'EncodedMutableBuffer' : (text.cls) .M 'EncodedPackage' : (text.cls) .M 'EncodedRexxBlock' : (text.cls) .M 'EncodedString' : (text.cls) P. 'Encoding' : (encoding.cls) P. 'IBM1252_Encoding' : (ibm-1252_encoding.cls) P. 'IBM437_Encoding' : (ibm-437_encoding.cls) P. 'ISO88591_Encoding' : (iso-8859-1_encoding.cls) PM 'IndexerHelpers' : (stringInterface.cls) PM 'IndexerStringInterface' : (stringInterface.cls) P. 'RexxText' : (REXX) .M 'RexxTextContents' : (text.cls) .M 'RexxTextInitializer' : (text.cls) PM 'RexxTextMapper' : (functionals.cls) .M 'RexxTextOperators' : (text.cls) .M 'RexxTextPrettyPrinter' : (notrace.cls) .M 'RexxTextStringInterface' : (text.cls) P. 'RexxTextTransformer' : (stringIndexer.cls) PM 'StringIndexer' : (stringIndexer.cls) .M 'StringRexxTextInterface' : (text.cls) P. 'UTF16BE_Encoding' : (utf16_encoding.cls) P. 'UTF16LE_Encoding' : (utf16_encoding.cls) P. 'UTF32BE_Encoding' : (utf32_encoding.cls) P. 'UTF32LE_Encoding' : (utf32_encoding.cls) P. 'UTF8_Encoding' : (utf8_encoding.cls) P. 'Unicode' : (REXX) P. 'Unicode16_Encoding' : (unicode16_encoding.cls) P. 'Unicode32_Encoding' : (unicode32_encoding.cls) P. 
'Unicode8_Encoding' : (unicode8_encoding.cls) P. 'UnicodeCharacter' : (unicode.cls) P. 'UnicodeCharacterAlias' : (unicode.cls) P. 'UnicodeCharacterInterval' : (unicode.cls) P. 'UnicodeCharacterIntervalSupplier' : (unicode.cls) P. 'UnicodeCharacterSupplier' : (unicode.cls) PM 'UnicodeN_Encoding' : (unicodeN_encoding.cls) PM 'Unicode_CommonServices' : (unicode_common.cls) P. 'WINDOWS1252_Encoding' : (windows-1252_encoding.cls) P. 'WTF16BE_Encoding' : (wtf16_encoding.cls) P. 'WTF16LE_Encoding' : (wtf16_encoding.cls) P. 'WTF8_Encoding' : (wtf8_encoding.cls) [Info] 42 lines displayed -- =============================================================================== -- 2021 mar 24 /* Optimization of String~isASCII: The old implementation checks from start to end. The new implementation checks from start ascending, from middle descending, from middle ascending, from end descending. That will divide by 4 the number of iterations, while increasing the chance to find a not-ascii character faster. Strangely, the new implementation is also faster when all the characters are ASCII. Benchmark using a version where the flag isASCII is not stored: */ -- The package encoding MUST be the byte encoding, otherwise "é" is converted to text and the concatenation is catastrophically long! 
ooRexx> previousEncoding = .context~package~setEncoding("byte") -- backup and change to Byte ooRexx> big10m = "0123456789"~copies(1e6) ooRexx> s = big10m -- 10 millions of ASCII characters, must check all of them -- do 1000; s~isASCIIold; end -- 9.3s ooRexx> do 1000; s~isASCII; end -- 6.2s ooRexx> s = "é" || big10m -- 1 non-ASCII character followed by 10 millions of ASCII characters -- do 1000; s~isASCIIold; end -- 0.001s ooRexx> do 1000; s~isASCII; end -- 0.001s ooRexx> s = big10m || "é" -- 10 millions of ASCII characters followed by 1 non-ASCII character -- do 1000; s~isASCIIold; end -- 9.3s ooRexx> do 1000; s~isASCII; end -- 0.001s ooRexx> big5m = "01234"~copies(1e6) ooRexx> s = big5m || "é" || big5m -- 1 non-ASCII character in the middle of 10 millions of ASCII characters -- do 1000; s~isASCIIold; end -- 4.7s ooRexx> do 1000; s~isASCII; end -- 0.001s ooRexx> .context~package~setEncoding(previousEncoding) -- restore -- =============================================================================== -- 2021 mar 15 /* Encoded strings (prototype). Added support for UTF-8. Added suppliers for codepoints and graphemes. */ ooRexx> s = "ça va ?" ooRexx> s~length= -- 7 (was 8 before automatic conversion of string literals to text) 7 ooRexx> s~eachC{item~c2x" "}= -- ['C3A7 ', 61 , 20 , 76 , 61 , 20 ,'3F '] (was ['C3 ','A7 ', 61 , 20 , 76 , 61 , 20 ,'3F '] before automatic conversion of string literals to text) ['C3A7 ', 61 , 20 , 76 , 61 , 20 ,'3F '] ooRexx> s~text~encoding= -- (The UTF8_Encoding class) (The UTF8_Encoding class) ooRexx> s~text~length= -- 7 7 ooRexx> s~text("utf8")~length== -- 7 7 ooRexx> s~text~codepoints~each= -- [ 231, 97, 32, 118, 97, 32, 63] [ 231, 97, 32, 118, 97, 32, 63] ooRexx> s~text~graphemes~each("c2x")= -- ['C3A7', 61, 20, 76, 61, 20,'3F'] ['C3A7', 61, 20, 76, 61, 20,'3F']