loadPackage OK for extension/stringChunk.cls loadPackage OK for utilities/indentedStream.cls loadPackage OK for extension/extensions.cls loadLibrary OK for rxunixsys loadPackage OK for ncurses.cls loadPackage OK for csvStream.cls loadLibrary OK for hostemu loadPackage OK for json.cls loadPackage OK for mime.cls loadPackage OK for rxftp.cls loadLibrary OK for rxmath loadPackage OK for rxregexp.cls loadPackage OK for regex/regex.cls loadPackage OK for smtp.cls loadPackage OK for socket.cls loadPackage OK for streamsocket.cls loadPackage OK for pipeline/pipe.cls loadPackage OK for rgf_util2/rgf_util2.rex loadPackage OK for BSF.CLS loadPackage OK for oorexxshell_queries.cls loadPackage OK for pipeline/pipe_extension.cls loadPackage OK for rgf_util2/rgf_util2_wrappers.rex REXX-ooRexx_4.3.0(MT)_64-bit 6.04 22 Jun 2024 Input queue name: Sae6eQ600001c5ed80 ---------------------------------- -- Text encoding - Internal checks ---------------------------------- /* Creation of a RexxText */ ooRexx[bash]> /* Next lines are no longer executed, for 2 reasons: - The method "TEXT=" should be declared package-scope but this is not supported by Executor. Not intended to be used by a rexx programmer. - The dynamic target changes the string target to a text target, because the argument is a text. So the case "assignment of a text to a string not yet linked to a text" can no longer occur, because the dynamic target converts the string to text, which becomes linked to the text. 
The error raised is Object "hello" does not understand message "TEXT=" */ /* s = "hello" s~text = .RexxText~new("hello") -- The counterpart must be a RexxText linked to this String s~text = .RexxText~new(s) -- ok, the RexxText is linked to s, now can be assigned to s~text */ ooRexx[bash]> "é"~text("byte") || "è"~text("utf8")= -- T'éè' (was Cannot concatenate Byte with UTF-8) T'éè' ooRexx[bash]> "é"~text("byte") || "é"~text("utf8")= -- T'éé' encoded UTF-8 T'éé' /* No error because the 2 occurences of the string "é" are the same interned string. So both texts are in fact the same instance of RexxText. The last encoding selection wins, the result is UTF-8. */ /* Byte encoding */ ooRexx[bash]> ooRexx[bash]> s = "63 C3 B4 74 65 CC 81 F0 9F 91 8D"x -- An UTF-8 string. The Byte encoding works at byte level, as the String class ooRexx[bash]> s= -- T'côté👍' T'côté👍' ooRexx[bash]> .byte_encoding~decode(s, 1)= -- 99 (63) 99 ooRexx[bash]> .byte_encoding~decode(s, 2)= -- 195 (C3) 195 ooRexx[bash]> .byte_encoding~decode(s, 12)= -- .nil (end of string) (The NIL object) ooRexx[bash]> .byte_encoding~decode(s, 1, 2)= -- 2 is an invalid codepoint size Byte encoding: 2 is an invalid size of byte sequence. Error code= 23.900 ooRexx[bash]> .byte_encoding~encode(65)= -- T'A' T'A' ooRexx[bash]> .byte_encoding~encode(256)= -- 256 is an invalid codepoint (range 0..255) Byte encoding: invalid codepoint 256 (0100x). Allowed range is 0..255. 
Error code= 23.900 /* UTF-8 encoding */ -- If you don't pass a size to ~decode then the method calculates it, while checking the validity of the encoding ooRexx[bash]> s = "63 C3 B4 74 65 CC 81 F0 9F 91 8D"x -- An UTF-8 string: 'côté👍' ooRexx[bash]> .utf8_encoding~decode(s, 1)= -- 99 99 ooRexx[bash]> .utf8_encoding~nextCodepointIndexB(s, 1)= -- 2 2 ooRexx[bash]> .utf8_encoding~decode(s, 2)= -- 244 244 ooRexx[bash]> .utf8_encoding~nextCodepointIndexB(s, 2)= -- 4 4 ooRexx[bash]> .utf8_encoding~decode(s, 8)= -- 128077 128077 ooRexx[bash]> .utf8_encoding~nextCodepointIndexB(s, 8)= -- 12 12 ooRexx[bash]> .utf8_encoding~decode(s, 12)= -- .nil (end of string) (The NIL object) -- Example of check ooRexx[bash]> "63 C3 B4 74 65 CC 81 F0 9F 91 8D"x~text("utf8")= -- T'côté👍' T'côté👍' ooRexx[bash]> "B4 74 65 CC 81 F0 9F 91 8D"x~text("utf8")~errors= -- Invalid start byte ['UTF-8 encoding: byte sequence at byte-position 1 has an invalid start byte 180 (B4x) (non-shortest form).'] ooRexx[bash]> "63 C3 74 65 CC 81 F0 9F 91 8D"x~text("utf8")~errors= -- Invalid continuation byte ['UTF-8 encoding: byte sequence at byte-position 2 has an invalid continuation byte 116 (74x) at byte-position 3.'] ooRexx[bash]> "63 C3 B4 74 65 CC 81 F0 9F 91"x~text("utf8")~errors= -- UTF-8 character is truncated ['UTF-8 encoding: byte sequence at byte-position 8 is truncated, expected 4 bytes.'] -- If you pass a size to ~decode, the method assumes you know what you do, there is no check ooRexx[bash]> .utf8_encoding~decode(s, 2, 1)= -- 67 invalid size, the result is wrong 67 ooRexx[bash]> .utf8_encoding~decode(s, 2, 2)= -- 244 correct 244 ooRexx[bash]> .utf8_encoding~decode(s, 2, 3)= -- 15668 invalid size, the result is wrong 15668 ooRexx[bash]> .utf8_encoding~decode(s, 2, 4)= -- 1002789 invalid size, the result is wrong 1002789 ooRexx[bash]> .utf8_encoding~decode(s, 2, 5)= -- UTF-8 encoding: 5 is an invalid codepoint size UTF-8 encoding: 5 is an invalid size of byte sequence. 
Error code= 23.900 -- Encoding ooRexx[bash]> .utf8_encoding~encode(65)= -- T'A' T'A' ooRexx[bash]> .utf8_encoding~encode(650)= -- T'ʊ' T'ʊ' ooRexx[bash]> .utf8_encoding~encode(6500)= -- T'ᥤ' T'ᥤ' ooRexx[bash]> .utf8_encoding~encode(65000)= -- T'' T'' ooRexx[bash]> .utf8_encoding~encode(650000)= -- T'' T'' ooRexx[bash]> .utf8_encoding~encode(6500000)= -- 6500000 is an invalid codepoint (range 0..1114111) UTF-8 encoding: invalid codepoint 6500000 (U+632EA0). Allowed range is 0..1114111. Error code= 23.900 ooRexx[bash]> .utf8_encoding~encode(55296)~errors== -- 55296 is an invalid codepoint (high surrogate) an Array (shape [3], 3 items) 1 : 'UTF-8 encoding: byte sequence at byte-position 1 has an invalid continuation byte 160 (A0x) at byte-position 2 (high surrogate, use WTF-8).' 2 : 'UTF-8 encoding: byte sequence at byte-position 2 has an invalid start byte 160 (A0x) (non-shortest form).' 3 : 'UTF-8 encoding: byte sequence at byte-position 3 has an invalid start byte 128 (80x) (non-shortest form).' ooRexx[bash]> .utf8_encoding~encode(56320)~errors== -- 56320 is an invalid codepoint (low surrogate) an Array (shape [3], 3 items) 1 : 'UTF-8 encoding: byte sequence at byte-position 1 has an invalid continuation byte 176 (B0x) at byte-position 2 (low surrogate, use WTF-8).' 2 : 'UTF-8 encoding: byte sequence at byte-position 2 has an invalid start byte 176 (B0x) (non-shortest form).' 3 : 'UTF-8 encoding: byte sequence at byte-position 3 has an invalid start byte 128 (80x) (non-shortest form).' 
/* UTF-16 encoding */ ooRexx[bash]> ooRexx[bash]> s = "63 C3 B4 74 65 CC 81 F0 9F 91 8D"x -- An UTF-8 string: 'côté👍' ooRexx[bash]> t16 = s~text("utf8")~utf16 -- interpret the bytes as UTF-8 and convert them to UTF-16 ooRexx[bash]> s16 = t16~string -- internal UTF-16 bytes ooRexx[bash]> t16~string~c2x= -- view raw internal bytes '006300F4007400650301D83DDC4D' ooRexx[bash]> t16~c2x= -- view encoded codepoints '0063 00F4 0074 0065 0301 D83DDC4D' ooRexx[bash]> t16~c2u= -- view decoded codepoints 'U+0063 U+00F4 U+0074 U+0065 U+0301 U+1F44D' ooRexx[bash]> t16~c2g= -- view encoded graphemes '0063 00F4 0074 00650301 D83DDC4D' -- If you don't pass a size to ~decode then the method calculates it, while checking the validity of the encoding ooRexx[bash]> .utf16be_encoding~decode(s16, 1)= -- 99 99 ooRexx[bash]> .utf16be_encoding~nextCodepointIndexB(s16, 1)= -- 3 3 ooRexx[bash]> .utf16be_encoding~decode(s16, 3)= -- 244 244 ooRexx[bash]> .utf16be_encoding~nextCodepointIndexB(s16, 3)= -- 5 5 ooRexx[bash]> .utf16be_encoding~decode(s16, 11)= -- 128077 128077 ooRexx[bash]> .utf16be_encoding~nextCodepointIndexB(s16, 11)= -- 15 15 ooRexx[bash]> .utf16be_encoding~decode(s16, 15)= -- .nil (end of string) (The NIL object) -- Example of checks ooRexx[bash]> "D800 DC00 D801 DC01"x~text("utf16")~utf8= -- T'𐀀𐐁' T'𐀀𐐁' ooRexx[bash]> "DC00 D801 DC01"x~text("utf16")~errors= -- Unpaired low surrogate ['UTF-16BE encoding: unpaired low surrogate 56320 (U+DC00) at byte-position 1, use WTF-16.'] ooRexx[bash]> "DC00 D801 DC01"x~text("wtf16")~errors= -- acceptable when WTF-16 (The NIL object) ooRexx[bash]> "DC00 D801 DC01"x~text("wtf16")~utf8~errors== -- but cannot be converted to UTF-8: invalid codepoint DC00x an Array (shape [3], 3 items) 1 : 'UTF-8 encoding: byte sequence at byte-position 1 has an invalid continuation byte 176 (B0x) at byte-position 2 (low surrogate, use WTF-8).' 2 : 'UTF-8 encoding: byte sequence at byte-position 2 has an invalid start byte 176 (B0x) (non-shortest form).' 
3 : 'UTF-8 encoding: byte sequence at byte-position 3 has an invalid start byte 128 (80x) (non-shortest form).' ooRexx[bash]> "D800 1000 D801 DC01"x~text("utf16")~errors= -- Invalid low surrogate ['UTF-16BE encoding: invalid low surrogate 4096 (U+1000) at byte-position 3, use WTF-16.'] ooRexx[bash]> "D800 1000 D801 DC01"x~text("wtf16")~errors= -- acceptable when WTF-16 (The NIL object) ooRexx[bash]> "D800 DC00 D801 DC"x~text("utf16")~errors== -- character is truncated. 2 errors reported because try with 4 bytes and then 2 bytes an Array (shape [2], 2 items) 1 : 'UTF-16BE encoding: byte sequence at byte-position 5 is truncated, expected 4 bytes.' 2 : 'UTF-16BE encoding: byte sequence at byte-position 7 is truncated, expected 2 bytes.' ooRexx[bash]> "D800 DC00 D801 DC"x~text("wtf16")~errors== -- this is also an error when WTF-16 an Array (shape [2], 2 items) 1 : 'WTF-16BE encoding: byte sequence at byte-position 5 is truncated, expected 4 bytes.' 2 : 'WTF-16BE encoding: byte sequence at byte-position 7 is truncated, expected 2 bytes.' -- If you pass a size to ~decode, the method assumes you know what you do, there is no check ooRexx[bash]> .utf16be_encoding~decode(s16, 3, 1)= -- 1 is an invalid codepoint size UTF-16BE encoding: 1 is an invalid size of byte sequence. Error code= 23.900 ooRexx[bash]> .utf16be_encoding~decode(s16, 3, 2)= -- 244 correct 244 ooRexx[bash]> .utf16be_encoding~decode(s16, 3, 3)= -- 3 is an invalid codepoint size UTF-16BE encoding: 3 is an invalid size of byte sequence. Error code= 23.900 ooRexx[bash]> .utf16be_encoding~decode(s16, 3, 4)= -- invalid size, the result is wrong -56363916 ooRexx[bash]> .utf16be_encoding~decode(s16, 3, 5)= -- 5 is an invalid codepoint size UTF-16BE encoding: 5 is an invalid size of byte sequence. 
Error code= 23.900 -- Encoding ooRexx[bash]> .utf16be_encoding~encode(65)~c2x= -- 0041 0041 ooRexx[bash]> .utf16be_encoding~encode(650)~c2x= -- 028A '028A' ooRexx[bash]> .utf16be_encoding~encode(6500)~c2x= -- 1964 1964 ooRexx[bash]> .utf16be_encoding~encode(65000)~c2x= -- FDE8 'FDE8' ooRexx[bash]> .utf16be_encoding~encode(650000)~c2x= -- DA3ADF10 'DA3ADF10' ooRexx[bash]> .utf16be_encoding~encode(6500000)~c2x= -- 6500000 is an invalid codepoint UTF-16BE encoding: invalid codepoint 6500000 (U+632EA0). Allowed range is 0..1114111. Error code= 23.900 ooRexx[bash]> .utf16be_encoding~encode(55296)~errors= -- 55296 is an invalid codepoint (high surrogate) ['UTF-16BE encoding: unpaired high surrogate 55296 (U+D800) at byte-position 1, use WTF-16.'] ooRexx[bash]> .wtf16be_encoding~encode(55296)~errors= -- acceptable when WTF-16 (The NIL object) ooRexx[bash]> .utf16be_encoding~encode(56320)~errors= -- 56320 is an invalid codepoint (low surrogate) ['UTF-16BE encoding: unpaired low surrogate 56320 (U+DC00) at byte-position 1, use WTF-16.'] ooRexx[bash]> .wtf16be_encoding~encode(56320)~errors= -- acceptable when WTF-16 (The NIL object) /* To concatenate two WTF-8 strings: if the earlier one ends with a lead surrogate and the latter one starts with a trail surrogate, both surrogates need to be removed and replaced with a 4-byte sequence. */ ooRexx[bash]> "D800 DC01"x~text("utf16")~utf8~string~c2x= -- UTF-8 encoding of character U+10001 from its UTF-16 encoding 'F0908081' ooRexx[bash]> t1 = "D800"x~text("wtf16") -- An invalid UTF16 encoding, but a valid WTF-16 encoding (high surrogate) ooRexx[bash]> t2 = "DC01"x~text("wtf16") -- An invalid UTF16 encoding, but a valid WTF-16 encoding (low surrogate) ooRexx[bash]> (t1 || t2)~utf16~c2x= -- The concatenation is valid UTF16... 'D800DC01' ooRexx[bash]> (t1 || t2)~utf16~c2u= -- ... 
whose codepoint is U+10001 'U+10001' ooRexx[bash]> t1~wtf8~string~c2x= -- WTF-8 encoding of U+D800 (high surrogate) 'EDA080' ooRexx[bash]> t2~wtf8~string~c2x= -- WTF-8 encoding of U+DC01 (low surrogate) 'EDB081' ooRexx[bash]> (t1~wtf8~string || t2~wtf8~string)~c2x= -- The concatenation of both WTF8 can't be this (2 codepoints) 'EDA080EDB081' ooRexx[bash]> (t1~wtf8 || t2~wtf8)~c2x= -- The correct concatenation is 1 codepoint... 'F0908081' ooRexx[bash]> (t1~wtf8 || t2~wtf8)~c2u= -- ... equal to U+10001 'U+10001' -- WTF-8 view of this invalid concatenation ooRexx[bash]> (t1~wtf8~string || t2~wtf8~string)~text("wtf8")~errors== -- High surrogate 55296 (D800x) followed by low surrogate 56321 (DC01x) is not allowed an Array (shape [3], 3 items) 1 : 'WTF-8 encoding: high surrogate 55296 (U+D800) at byte-position 1 followed by low surrogate 56321 (U+DC01) at byte-position 4 is not allowed.' 2 : 'WTF-8 encoding: byte sequence at byte-position 2 has an invalid start byte 160 (A0x) (non-shortest form).' 3 : 'WTF-8 encoding: byte sequence at byte-position 3 has an invalid start byte 128 (80x) (non-shortest form).' ooRexx[bash]> (t1~wtf8~string || t2~wtf8~string)~text("wtf8")~c2x= -- EDA080 EDB081 'ED A0 80 EDB081' ooRexx[bash]> (t1~wtf8~string || t2~wtf8~string)~text("wtf8")~c2u= -- U+FFFD U+FFFD U+FFFD U+DC01 (yes... not 6 replacement chars because the low surrogate alone is valid) 'U+FFFD U+FFFD U+FFFD U+DC01' -- UTF-8 view of this invalid concatenation ooRexx[bash]> (t1~wtf8~string || t2~wtf8~string)~text("utf8")~errors== -- High surrogate is not allowed, Low surrogate is not allowed an Array (shape [6], 6 items) 1 : 'UTF-8 encoding: byte sequence at byte-position 1 has an invalid continuation byte 160 (A0x) at byte-position 2 (high surrogate, use WTF-8).' 2 : 'UTF-8 encoding: byte sequence at byte-position 2 has an invalid start byte 160 (A0x) (non-shortest form).' 
3 : 'UTF-8 encoding: byte sequence at byte-position 3 has an invalid start byte 128 (80x) (non-shortest form).' 4 : 'UTF-8 encoding: byte sequence at byte-position 4 has an invalid continuation byte 176 (B0x) at byte-position 5 (low surrogate, use WTF-8).' 5 : 'UTF-8 encoding: byte sequence at byte-position 5 has an invalid start byte 176 (B0x) (non-shortest form).' 6 : 'UTF-8 encoding: byte sequence at byte-position 6 has an invalid start byte 129 (81x) (non-shortest form).' ooRexx[bash]> (t1~wtf8~string || t2~wtf8~string)~text("utf8")~c2x= -- EDA080 EDB081 'ED A0 80 ED B0 81' ooRexx[bash]> (t1~wtf8~string || t2~wtf8~string)~text("utf8")~c2u= -- U+FFFD U+FFFD U+FFFD U+FFFD U+FFFD U+FFFD 'U+FFFD U+FFFD U+FFFD U+FFFD U+FFFD U+FFFD' /* Noncharacters are supported without error. http://www.unicode.org/faq/private_use.html#noncharacters A "noncharacter" is a code point that is permanently reserved in the Unicode Standard for internal use. They are considered unassigned to any abstract character, and they share the General_Category value Cn (Unassigned) with unassigned reserved code points in the standard. Unicode has exactly 66 noncharacters. (In this table, "#" stands for either the hex digit "E" or "F".) UTF-32 UTF-16 UTF-8 0000FDD0 FDD0 EF B7 90 ... 0000FDEF FDEF EF B7 AF 0000FFF# FFF# EF BF B# 0001FFF# D83F DFF# F0 9F BF B# 0002FFF# D87F DFF# F0 AF BF B# 0003FFF# D8BF DFF# F0 BF BF B# 0004FFF# D8FF DFF# F1 8F BF B# ... 000FFFF# DBBF DFF# F3 BF BF B# 0010FFF# DBFF DFF# F4 8F BF B# */ ooRexx[bash]> .utf16be_encoding~encode("FDD0"~x2d)~c2x= -- FDD0 'FDD0' ooRexx[bash]> .utf8_encoding~encode("FDD0"~x2d)~c2x= -- EFB790 'EFB790' /* Create the file ill_formed_utf8.txt and check its encoding. 
*/ ooRexx[bash]> file = .stream~new("ill_formed_utf8.txt") ooRexx[bash]> file~open("write replace") ooRexx[bash]> file~say("This file contains ill-formed UTF-8 characters.") ooRexx[bash]> file~say( "B4 74 65 CC 81 F0 9F 91 8D"x~text("utf8")~string "Invalid start byte") ooRexx[bash]> file~say("63 C3 74 65 CC 81 F0 9F 91 8D"x~text("utf8")~string "Invalid continuation byte") ooRexx[bash]> file~say("Next line, UTF-8 character is truncated") ooRexx[bash]> file~say("63 C3 B4 74 65 CC 81 F0 9F 91"x~text("utf8")~string) -- Don't add text, the purpose is to raise a 'truncated' error ooRexx[bash]> t1 = "D800"x~text("wtf16") -- An invalid UTF16 encoding, but a valid WTF-16 encoding (high surrogate) ooRexx[bash]> t2 = "DC01"x~text("wtf16") -- An invalid UTF16 encoding, but a valid WTF-16 encoding (low surrogate) ooRexx[bash]> file~say(t1~wtf8~string "invalid UTF-8 but valid WTF8 (high surrogate)") ooRexx[bash]> file~say(t2~wtf8~string "invalid UTF-8 but valid WTF8 (low surrogate)") ooRexx[bash]> file~say( (t1~wtf8~string || t2~wtf8~string)~string "The concatenation of both WTF8 can't be this (2 codepoints)") ooRexx[bash]> file~say( (t1~wtf8 || t2~wtf8)~string "The correct concatenation is 1 codepoint") ooRexx[bash]> file~close ooRexx[bash]> system rexx unicode/scripts/check_encoding utf8 ill_formed_utf8.txt Checking file "ill_formed_utf8.txt" line 2: UTF-8 encoding: byte sequence at byte-position 1 has an invalid start byte 180 (B4x) (non-shortest form). line 3: UTF-8 encoding: byte sequence at byte-position 2 has an invalid continuation byte 116 (74x) at byte-position 3. line 5: UTF-8 encoding: byte sequence at byte-position 8 is truncated, expected 4 bytes. line 6: UTF-8 encoding: byte sequence at byte-position 1 has an invalid continuation byte 160 (A0x) at byte-position 2 (high surrogate, use WTF-8). line 6: UTF-8 encoding: byte sequence at byte-position 2 has an invalid start byte 160 (A0x) (non-shortest form). 
line 6: UTF-8 encoding: byte sequence at byte-position 3 has an invalid start byte 128 (80x) (non-shortest form). line 7: UTF-8 encoding: byte sequence at byte-position 1 has an invalid continuation byte 176 (B0x) at byte-position 2 (low surrogate, use WTF-8). line 7: UTF-8 encoding: byte sequence at byte-position 2 has an invalid start byte 176 (B0x) (non-shortest form). line 7: UTF-8 encoding: byte sequence at byte-position 3 has an invalid start byte 129 (81x) (non-shortest form). line 8: UTF-8 encoding: byte sequence at byte-position 1 has an invalid continuation byte 160 (A0x) at byte-position 2 (high surrogate, use WTF-8). line 8: UTF-8 encoding: byte sequence at byte-position 2 has an invalid start byte 160 (A0x) (non-shortest form). line 8: UTF-8 encoding: byte sequence at byte-position 3 has an invalid start byte 128 (80x) (non-shortest form). line 8: UTF-8 encoding: byte sequence at byte-position 4 has an invalid continuation byte 176 (B0x) at byte-position 5 (low surrogate, use WTF-8). line 8: UTF-8 encoding: byte sequence at byte-position 5 has an invalid start byte 176 (B0x) (non-shortest form). line 8: UTF-8 encoding: byte sequence at byte-position 6 has an invalid start byte 129 (81x) (non-shortest form). 15 errors Total: 15 errors RC= 1 ooRexx[bash]> system rexx unicode/scripts/check_encoding wtf8 ill_formed_utf8.txt Checking file "ill_formed_utf8.txt" line 2: WTF-8 encoding: byte sequence at byte-position 1 has an invalid start byte 180 (B4x) (non-shortest form). line 3: WTF-8 encoding: byte sequence at byte-position 2 has an invalid continuation byte 116 (74x) at byte-position 3. line 5: WTF-8 encoding: byte sequence at byte-position 8 is truncated, expected 4 bytes. line 8: WTF-8 encoding: high surrogate 55296 (U+D800) at byte-position 1 followed by low surrogate 56321 (U+DC01) at byte-position 4 is not allowed. line 8: WTF-8 encoding: byte sequence at byte-position 2 has an invalid start byte 160 (A0x) (non-shortest form). 
line 8: WTF-8 encoding: byte sequence at byte-position 3 has an invalid start byte 128 (80x) (non-shortest form). 6 errors Total: 6 errors RC= 1 /* */ ooRexx[bash]> file = .stream~new("ill_formed_utf16.txt") ooRexx[bash]> file~open("write replace") ooRexx[bash]> file~say("This file contains ill-formed UTF-16 characters.") ooRexx[bash]> file~say( "DC00 D801 DC01"x~text("utf16")~string "Unpaired low surrogate (acceptable when WTF-16)") ooRexx[bash]> file~say("D800 1000 D801 DC01"x~text("utf16")~string "Invalid low surrogate (acceptable when WTF-16)") ooRexx[bash]> file~say("Next line, character is truncated. 2 errors reported because try with 4 bytes and then 2 bytes (this is also an error when WTF-16)") ooRexx[bash]> file~say("D800 DC00 D801 DC"x~text("utf16")~string) -- Don't add text, the purpose is to raise a 'truncated' error ooRexx[bash]> file~close ooRexx[bash]> system rexx unicode/scripts/check_encoding utf16 ill_formed_utf16.txt Checking file "ill_formed_utf16.txt" line 2: UTF-16BE encoding: unpaired low surrogate 56320 (U+DC00) at byte-position 1, use WTF-16. line 3: UTF-16BE encoding: invalid low surrogate 4096 (U+1000) at byte-position 3, use WTF-16. line 3: UTF-16BE encoding: byte sequence at byte-position 55 is truncated, expected 2 bytes. line 5: UTF-16BE encoding: byte sequence at byte-position 5 is truncated, expected 4 bytes. line 5: UTF-16BE encoding: byte sequence at byte-position 7 is truncated, expected 2 bytes. 5 errors Total: 5 errors RC= 1 ooRexx[bash]> system rexx unicode/scripts/check_encoding wtf16 ill_formed_utf16.txt Checking file "ill_formed_utf16.txt" line 3: WTF-16BE encoding: byte sequence at byte-position 55 is truncated, expected 2 bytes. line 5: WTF-16BE encoding: byte sequence at byte-position 5 is truncated, expected 4 bytes. line 5: WTF-16BE encoding: byte sequence at byte-position 7 is truncated, expected 2 bytes. 3 errors Total: 3 errors RC= 1 /* End of demonstration. */