From 8406bea5636d1fd715684f1309b03fd388ee5ce2 Mon Sep 17 00:00:00 2001 From: Alex Barney Date: Thu, 24 Jun 2021 11:45:25 -0700 Subject: [PATCH] Implement CharacterEncoding --- src/LibHac/Util/CharacterEncoding.cs | 1111 +++++++++++++++++ tests/LibHac.Tests/SpanEqualAsserts.cs | 21 + .../Util/CharacterEncodingTests.cs | 664 ++++++++++ 3 files changed, 1796 insertions(+) create mode 100644 src/LibHac/Util/CharacterEncoding.cs create mode 100644 tests/LibHac.Tests/SpanEqualAsserts.cs create mode 100644 tests/LibHac.Tests/Util/CharacterEncodingTests.cs diff --git a/src/LibHac/Util/CharacterEncoding.cs b/src/LibHac/Util/CharacterEncoding.cs new file mode 100644 index 00000000..49f40d41 --- /dev/null +++ b/src/LibHac/Util/CharacterEncoding.cs @@ -0,0 +1,1111 @@ +using System; +using System.Runtime.InteropServices; +using LibHac.Common; +using LibHac.Diag; + +namespace LibHac.Util +{ + public enum CharacterEncodingResult + { + Success = 0, + InsufficientLength = 1, + InvalidFormat = 2 + } + + public static class CharacterEncoding + { + private static ReadOnlySpan Utf8NBytesInnerTable => new sbyte[] + { + -1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 7, 8 + }; + + private static ReadOnlySpan Utf8NBytesTable => 
Utf8NBytesInnerTable.Slice(1); + + private static CharacterEncodingResult ConvertStringUtf8ToUtf16Impl(out int codeUnitsWritten, + out int codeUnitsRead, Span destination, ReadOnlySpan source) + { + if (source.Length == 0) + { + codeUnitsWritten = 0; + codeUnitsRead = 0; + return CharacterEncodingResult.Success; + } + + ReadOnlySpan src = source; + Span dst = destination; + + while (src.Length > 0) + { + int codePointBytes = Utf8NBytesTable[src[0]]; + + if (src.Length < codePointBytes) + goto ReturnInvalidFormat; + + if (dst.Length == 0) + goto ReturnInsufficientLength; + + uint codePoint; + + switch (codePointBytes) + { + case 1: + dst[0] = src[0]; + src = src.Slice(1); + dst = dst.Slice(1); + break; + + case 2: + // Check if the encoding is overlong + if ((src[0] & 0x1E) == 0) + goto ReturnInvalidFormat; + + if ((src[1] & 0xC0) != 0x80) + goto ReturnInvalidFormat; + + codePoint = ((src[0] & 0x1Fu) << 6) | + ((src[1] & 0x3Fu) << 0); + + dst[0] = (ushort)codePoint; + src = src.Slice(2); + dst = dst.Slice(1); + break; + + case 3: + if ((src[1] & 0xC0) != 0x80) + goto ReturnInvalidFormat; + + if ((src[2] & 0xC0) != 0x80) + goto ReturnInvalidFormat; + + codePoint = ((src[0] & 0xFu) << 12) | + ((src[1] & 0x3Fu) << 6) | + ((src[2] & 0x3Fu) << 0); + + // Check if the encoding is overlong + if ((codePoint & 0xF800) == 0) + goto ReturnInvalidFormat; + + // Check if the code point is in the range reserved for UTF-16 surrogates + if ((codePoint & 0xF800) == 0xD800) + goto ReturnInvalidFormat; + + dst[0] = (ushort)codePoint; + src = src.Slice(3); + dst = dst.Slice(1); + break; + + case 4: + if ((src[1] & 0xC0) != 0x80) + goto ReturnInvalidFormat; + + if ((src[2] & 0xC0) != 0x80) + goto ReturnInvalidFormat; + + if ((src[3] & 0xC0) != 0x80) + goto ReturnInvalidFormat; + + codePoint = ((src[0] & 7u) << 18) | + ((src[1] & 0x3Fu) << 12) | + ((src[2] & 0x3Fu) << 6) | + ((src[3] & 0x3Fu) << 0); + + // Check if the code point is outside the range of valid code points + if (codePoint 
< 0x10000 || codePoint >= 0x110000) + goto ReturnInvalidFormat; + + // Make sure we have enough space left in the destination + if (dst.Length == 1) + goto ReturnInsufficientLength; + + ushort highSurrogate = (ushort)((codePoint - 0x10000) / 0x400 + 0xD800); + ushort lowSurrogate = (ushort)((codePoint - 0x10000) % 0x400 + 0xDC00); + + dst[0] = highSurrogate; + dst[1] = lowSurrogate; + src = src.Slice(4); + dst = dst.Slice(2); + break; + + default: + goto ReturnInvalidFormat; + } + } + + codeUnitsWritten = destination.Length - dst.Length; + codeUnitsRead = source.Length - src.Length; + return CharacterEncodingResult.Success; + + ReturnInvalidFormat: + codeUnitsWritten = destination.Length - dst.Length; + codeUnitsRead = source.Length - src.Length; + return CharacterEncodingResult.InvalidFormat; + + ReturnInsufficientLength: + codeUnitsWritten = destination.Length - dst.Length; + codeUnitsRead = source.Length - src.Length; + return CharacterEncodingResult.InsufficientLength; + } + + private static CharacterEncodingResult ConvertStringUtf16ToUtf8Impl(out int codeUnitsWritten, + out int codeUnitsRead, Span destination, ReadOnlySpan source) + { + if (source.Length == 0) + { + codeUnitsWritten = 0; + codeUnitsRead = 0; + return CharacterEncodingResult.Success; + } + + ReadOnlySpan src = source; + Span dst = destination; + + while (src.Length > 0) + { + ushort codeUnit1 = src[0]; + + if (codeUnit1 < 0x80) + { + if (dst.Length < 1) + goto ReturnInsufficientLength; + + dst[0] = (byte)codeUnit1; + src = src.Slice(1); + dst = dst.Slice(1); + } + else if ((codeUnit1 & 0xF800) == 0) + { + if (dst.Length < 2) + goto ReturnInsufficientLength; + + dst[0] = (byte)(0xC0 | (codeUnit1 >> 6) & 0x1F); + dst[1] = (byte)(0x80 | codeUnit1 & 0x3F); + src = src.Slice(1); + dst = dst.Slice(2); + } + else if (codeUnit1 < 0xD800 || codeUnit1 >= 0xE000) + { + if (dst.Length < 3) + goto ReturnInsufficientLength; + + dst[0] = (byte)(0xE0 | (codeUnit1 >> 12) & 0xF); + dst[1] = (byte)(0x80 | 
(codeUnit1 >> 6) & 0x3F); + dst[2] = (byte)(0x80 | codeUnit1 & 0x3F); + src = src.Slice(1); + dst = dst.Slice(3); + } + else + { + uint utf32; + // Test the remaining input (src), not the original source span: src[1] below is only valid when at least two code units are left. + if (src.Length == 1) + { + // If the code unit is a high surrogate + if ((codeUnit1 & 0xF800) == 0xD800 && (codeUnit1 & 0x400) == 0) + { + if (dst.Length < 1) + goto ReturnInsufficientLength; + + // We have the first half of a surrogate pair. Get the code point as if the low surrogate + // were 0xDC00, effectively ignoring it. The first byte of the UTF-8-encoded code point does not + // ever depend on the low surrogate, so we can write what the first byte would be. + // The second byte doesn't ever depend on the low surrogate either, so I don't know why Nintendo + // doesn't write that one too. I'll admit I'm not even sure why they write the first byte. This + // reasoning is simply my best guess. + const int codeUnit2 = 0xDC00; + utf32 = ((codeUnit1 - 0xD800u) << 10) + codeUnit2 + 0x2400; + + dst[0] = (byte)(0xF0 | (utf32 >> 18)); + dst = dst.Slice(1); + } + + goto ReturnInvalidFormat; + } + + int codeUnitsUsed = ConvertCharacterUtf16ToUtf32(out utf32, codeUnit1, src[1]); + + if (codeUnitsUsed < 0) + { + if (codeUnitsUsed == -2 && dst.Length > 0) + { + // We have an unpaired surrogate. Output the first UTF-8 code unit of the code point + // ConvertCharacterUtf16ToUtf32 gave us. Nintendo's reason for doing this is unclear. 
+ dst[0] = (byte)(0xF0 | (utf32 >> 18)); + dst = dst.Slice(1); + } + + goto ReturnInvalidFormat; + } + + if (dst.Length < 4) + goto ReturnInsufficientLength; + + dst[0] = (byte)(0xF0 | (utf32 >> 18)); + dst[1] = (byte)(0x80 | (utf32 >> 12) & 0x3F); + dst[2] = (byte)(0x80 | (utf32 >> 6) & 0x3F); + dst[3] = (byte)(0x80 | (utf32 >> 0) & 0x3F); + src = src.Slice(2); + dst = dst.Slice(4); + } + } + + codeUnitsWritten = destination.Length - dst.Length; + codeUnitsRead = source.Length - src.Length; + return CharacterEncodingResult.Success; + + ReturnInvalidFormat: + codeUnitsWritten = destination.Length - dst.Length; + codeUnitsRead = source.Length - src.Length; + return CharacterEncodingResult.InvalidFormat; + + ReturnInsufficientLength: + codeUnitsWritten = destination.Length - dst.Length; + codeUnitsRead = source.Length - src.Length; + return CharacterEncodingResult.InsufficientLength; + } + + private static CharacterEncodingResult ConvertStringUtf8ToUtf32Impl(out int codeUnitsWritten, + out int codeUnitsRead, Span destination, ReadOnlySpan source) + { + if (source.Length == 0) + { + codeUnitsWritten = 0; + codeUnitsRead = 0; + return CharacterEncodingResult.Success; + } + + ReadOnlySpan src = source; + Span dst = destination; + + while (src.Length > 0) + { + int codePointBytes = Utf8NBytesTable[src[0]]; + + if (src.Length < codePointBytes) + goto ReturnInvalidFormat; + + if (dst.Length == 0) + goto ReturnInsufficientLength; + + uint codePoint; + + switch (codePointBytes) + { + case 1: + dst[0] = src[0]; + src = src.Slice(1); + dst = dst.Slice(1); + break; + + case 2: + // Check if the encoding is overlong + if ((src[0] & 0x1E) == 0) + goto ReturnInvalidFormat; + + if (Utf8NBytesTable[src[1]] != 0) + goto ReturnInvalidFormat; + + codePoint = ((src[0] & 0x1Fu) << 6) | + ((src[1] & 0x3Fu) << 0); + + dst[0] = codePoint; + src = src.Slice(2); + dst = dst.Slice(1); + break; + + case 3: + if (Utf8NBytesTable[src[1]] != 0) + goto ReturnInvalidFormat; + + if 
(Utf8NBytesTable[src[2]] != 0) + goto ReturnInvalidFormat; + + codePoint = ((src[0] & 0xFu) << 12) | + ((src[1] & 0x3Fu) << 6) | + ((src[2] & 0x3Fu) << 0); + + // Check if the encoding is overlong + if ((codePoint & 0xF800) == 0) + goto ReturnInvalidFormat; + + // Check if the code point is in the range reserved for UTF-16 surrogates + if ((codePoint & 0xF800) == 0xD800) + goto ReturnInvalidFormat; + + dst[0] = codePoint; + src = src.Slice(3); + dst = dst.Slice(1); + break; + + case 4: + if (Utf8NBytesTable[src[1]] != 0) + goto ReturnInvalidFormat; + + if (Utf8NBytesTable[src[2]] != 0) + goto ReturnInvalidFormat; + + if (Utf8NBytesTable[src[3]] != 0) + goto ReturnInvalidFormat; + + codePoint = ((src[0] & 7u) << 18) | + ((src[1] & 0x3Fu) << 12) | + ((src[2] & 0x3Fu) << 6) | + ((src[3] & 0x3Fu) << 0); + + // Check if the code point is outside the range of valid code points + if (codePoint < 0x10000 || codePoint >= 0x110000) + goto ReturnInvalidFormat; + + dst[0] = codePoint; + src = src.Slice(4); + dst = dst.Slice(1); + break; + + default: + goto ReturnInvalidFormat; + } + } + + codeUnitsWritten = destination.Length - dst.Length; + codeUnitsRead = source.Length - src.Length; + return CharacterEncodingResult.Success; + + ReturnInvalidFormat: + codeUnitsWritten = destination.Length - dst.Length; + codeUnitsRead = source.Length - src.Length; + return CharacterEncodingResult.InvalidFormat; + + ReturnInsufficientLength: + codeUnitsWritten = destination.Length - dst.Length; + codeUnitsRead = source.Length - src.Length; + return CharacterEncodingResult.InsufficientLength; + } + + private static CharacterEncodingResult ConvertStringUtf32ToUtf8Impl(out int codeUnitsWritten, + out int codeUnitsRead, Span destination, ReadOnlySpan source) + { + if (source.Length == 0) + { + codeUnitsWritten = 0; + codeUnitsRead = 0; + return CharacterEncodingResult.Success; + } + + ReadOnlySpan src = source; + Span dst = destination; + + while ((uint)src.Length > 0) + { + uint codePoint = 
src[0]; + + if (codePoint < 0x80) + { + if (dst.Length < 1) + goto ReturnInsufficientLength; + + dst[0] = (byte)codePoint; + dst = dst.Slice(1); + } + else if (codePoint < 0x800) + { + if (dst.Length < 2) + goto ReturnInsufficientLength; + + dst[0] = (byte)(0xC0 | codePoint >> 6); + dst[1] = (byte)(0x80 | codePoint & 0x3F); + dst = dst.Slice(2); + } + else if (codePoint < 0x10000) + { + if (codePoint >= 0xD800 && codePoint <= 0xDFFF) + goto ReturnInvalidFormat; + + if (dst.Length < 3) + goto ReturnInsufficientLength; + + dst[0] = (byte)(0xE0 | (codePoint >> 12) & 0xF); + dst[1] = (byte)(0x80 | (codePoint >> 6) & 0x3F); + dst[2] = (byte)(0x80 | (codePoint >> 0) & 0x3F); + dst = dst.Slice(3); + } + else if (codePoint < 0x110000) + { + if (dst.Length < 4) + goto ReturnInsufficientLength; + + dst[0] = (byte)(0xF0 | codePoint >> 18); + dst[1] = (byte)(0x80 | (codePoint >> 12) & 0x3F); + dst[2] = (byte)(0x80 | (codePoint >> 6) & 0x3F); + dst[3] = (byte)(0x80 | (codePoint >> 0) & 0x3F); + dst = dst.Slice(4); + } + else + { + goto ReturnInvalidFormat; + } + + src = src.Slice(1); + } + + codeUnitsWritten = destination.Length - dst.Length; + codeUnitsRead = source.Length - src.Length; + return CharacterEncodingResult.Success; + + ReturnInvalidFormat: + codeUnitsWritten = destination.Length - dst.Length; + codeUnitsRead = source.Length - src.Length; + return CharacterEncodingResult.InvalidFormat; + + ReturnInsufficientLength: + codeUnitsWritten = destination.Length - dst.Length; + codeUnitsRead = source.Length - src.Length; + return CharacterEncodingResult.InsufficientLength; + } + + private static int ConvertCharacterUtf16ToUtf32(out uint outUtf32, ushort codeUnit1, ushort codeUnit2) + { + UnsafeHelpers.SkipParamInit(out outUtf32); + + // If the first code unit isn't a surrogate, simply copy it to the output + if ((codeUnit1 & 0xF800) != 0xD800) + { + outUtf32 = codeUnit1; + return 1; + } + + // Make sure the high surrogate isn't in the range of low surrogate values + if 
((codeUnit1 & 0x400) != 0) + return -1; + + // We still output a code point value if we have an unpaired high surrogate. + // Nintendo's reason for doing this is unclear. + outUtf32 = ((codeUnit1 - 0xD800u) << 10) + codeUnit2 + 0x2400; + + // Make sure the low surrogate is in the range of low surrogate values + if ((codeUnit2 & 0xFC00) != 0xDC00) + return -2; + + return 2; + } + + private static int GetLengthOfUtf16(ReadOnlySpan source) + { + for (int i = 0; i < source.Length; i++) + { + if (source[i] == 0) + return i; + } + + return source.Length; + } + + private static int GetLengthOfUtf32(ReadOnlySpan source) + { + for (int i = 0; i < source.Length; i++) + { + if (source[i] == 0) + return i; + } + + return source.Length; + } + + public static CharacterEncodingResult ConvertStringUtf8ToUtf16Native(Span destination, + ReadOnlySpan source, int sourceLength) + { + Assert.SdkRequires(0 <= sourceLength, $"{nameof(sourceLength)} must not be negative."); + Assert.SdkRequires(sourceLength <= source.Length); + + return ConvertStringUtf8ToUtf16Impl(out _, out _, destination, source.Slice(0, sourceLength)); + } + + public static CharacterEncodingResult ConvertStringUtf8ToUtf16Native(Span destination, + ReadOnlySpan source, int sourceLength) + { + return ConvertStringUtf8ToUtf16Native(MemoryMarshal.Cast(destination), source, sourceLength); + } + + public static CharacterEncodingResult ConvertStringUtf8ToUtf16Native(Span destination, + ReadOnlySpan source) + { + int length = StringUtils.GetLength(source); + + Assert.SdkAssert(0 <= length); + + CharacterEncodingResult result = ConvertStringUtf8ToUtf16Impl(out int writtenCount, out _, + destination.Slice(0, destination.Length - 1), source.Slice(0, length)); + + if (result == CharacterEncodingResult.Success) + destination[writtenCount] = 0; + + return result; + } + + public static CharacterEncodingResult ConvertStringUtf8ToUtf16Native(Span destination, + ReadOnlySpan source) + { + return 
ConvertStringUtf8ToUtf16Native(MemoryMarshal.Cast(destination), source); + } + + public static CharacterEncodingResult ConvertStringUtf16NativeToUtf8(Span destination, + ReadOnlySpan source, int sourceLength) + { + Assert.SdkRequires(0 <= sourceLength, $"{nameof(sourceLength)} must not be negative."); + Assert.SdkRequires(sourceLength <= source.Length); + + return ConvertStringUtf16ToUtf8Impl(out _, out _, destination, source.Slice(0, sourceLength)); + } + + public static CharacterEncodingResult ConvertStringUtf16NativeToUtf8(Span destination, + ReadOnlySpan source, int sourceLength) + { + return ConvertStringUtf16NativeToUtf8(destination, MemoryMarshal.Cast(source), sourceLength); + } + + public static CharacterEncodingResult ConvertStringUtf16NativeToUtf8(Span destination, + ReadOnlySpan source) + { + int length = GetLengthOfUtf16(source); + + Assert.SdkAssert(0 <= length); + + CharacterEncodingResult result = ConvertStringUtf16ToUtf8Impl(out int writtenCount, out _, + destination.Slice(0, destination.Length - 1), source.Slice(0, length)); + + if (result == CharacterEncodingResult.Success) + destination[writtenCount] = 0; + + return result; + } + + public static CharacterEncodingResult ConvertStringUtf16NativeToUtf8(Span destination, + ReadOnlySpan source) + { + return ConvertStringUtf16NativeToUtf8(destination, MemoryMarshal.Cast(source)); + } + + public static CharacterEncodingResult GetLengthOfConvertedStringUtf8ToUtf16Native(out int length, + ReadOnlySpan source, int sourceLength) + { + UnsafeHelpers.SkipParamInit(out length); + Span buffer = stackalloc ushort[0x20]; + + Assert.SdkRequires(0 <= sourceLength, $"{nameof(sourceLength)} must not be negative."); + Assert.SdkRequires(sourceLength <= source.Length); + + int totalLength = 0; + source = source.Slice(0, sourceLength); + + while (source.Length > 0) + { + CharacterEncodingResult result = + ConvertStringUtf8ToUtf16Impl(out int writtenCount, out int readCount, buffer, source); + + if (result == 
CharacterEncodingResult.InvalidFormat) + return CharacterEncodingResult.InvalidFormat; + + totalLength += writtenCount; + source = source.Slice(readCount); + } + + Assert.SdkAssert(0 <= totalLength); + + length = totalLength; + return CharacterEncodingResult.Success; + } + + public static CharacterEncodingResult GetLengthOfConvertedStringUtf8ToUtf16Native(out int length, + ReadOnlySpan source) + { + int sourceLength = StringUtils.GetLength(source); + + Assert.SdkAssert(0 <= sourceLength); + + return GetLengthOfConvertedStringUtf8ToUtf16Native(out length, source, sourceLength); + } + + public static CharacterEncodingResult GetLengthOfConvertedStringUtf16NativeToUtf8(out int length, + ReadOnlySpan source, int sourceLength) + { + UnsafeHelpers.SkipParamInit(out length); + Span buffer = stackalloc byte[0x20]; + + Assert.SdkRequires(0 <= sourceLength, $"{nameof(sourceLength)} must not be negative."); + Assert.SdkRequires(sourceLength <= source.Length); + + int totalLength = 0; + source = source.Slice(0, sourceLength); + + while (source.Length > 0) + { + CharacterEncodingResult result = + ConvertStringUtf16ToUtf8Impl(out int writtenCount, out int readCount, buffer, source); + + if (result == CharacterEncodingResult.InvalidFormat) + return CharacterEncodingResult.InvalidFormat; + + totalLength += writtenCount; + source = source.Slice(readCount); + } + + Assert.SdkAssert(0 <= totalLength); + + length = totalLength; + return CharacterEncodingResult.Success; + } + + public static CharacterEncodingResult GetLengthOfConvertedStringUtf16NativeToUtf8(out int length, + ReadOnlySpan source, int sourceLength) + { + return GetLengthOfConvertedStringUtf16NativeToUtf8(out length, MemoryMarshal.Cast(source), + sourceLength); + } + + public static CharacterEncodingResult GetLengthOfConvertedStringUtf16NativeToUtf8(out int length, + ReadOnlySpan source) + { + int sourceLength = GetLengthOfUtf16(source); + + Assert.SdkAssert(0 <= sourceLength); + + return 
GetLengthOfConvertedStringUtf16NativeToUtf8(out length, source, sourceLength); + } + + public static CharacterEncodingResult GetLengthOfConvertedStringUtf16NativeToUtf8(out int length, + ReadOnlySpan source) + { + return GetLengthOfConvertedStringUtf16NativeToUtf8(out length, MemoryMarshal.Cast(source)); + } + + public static CharacterEncodingResult ConvertStringUtf8ToUtf32(Span destination, + ReadOnlySpan source, int sourceLength) + { + Assert.SdkRequires(0 <= sourceLength, $"{nameof(sourceLength)} must not be negative."); + Assert.SdkRequires(sourceLength <= source.Length); + + return ConvertStringUtf8ToUtf32Impl(out _, out _, destination, source.Slice(0, sourceLength)); + } + + public static CharacterEncodingResult ConvertStringUtf8ToUtf32(Span destination, + ReadOnlySpan source) + { + int sourceLength = StringUtils.GetLength(source); + + Assert.SdkAssert(0 <= sourceLength); + + CharacterEncodingResult result = ConvertStringUtf8ToUtf32Impl(out int writtenCount, out _, + destination.Slice(0, destination.Length - 1), source.Slice(0, sourceLength)); + + if (result == CharacterEncodingResult.Success) + destination[writtenCount] = 0; + + return result; + } + + public static CharacterEncodingResult ConvertStringUtf32ToUtf8(Span destination, + ReadOnlySpan source, int sourceLength) + { + Assert.SdkRequires(0 <= sourceLength, $"{nameof(sourceLength)} must not be negative."); + Assert.SdkRequires(sourceLength <= source.Length); + + return ConvertStringUtf32ToUtf8Impl(out _, out _, destination, source.Slice(0, sourceLength)); + } + + public static CharacterEncodingResult ConvertStringUtf32ToUtf8(Span destination, + ReadOnlySpan source) + { + int sourceLength = GetLengthOfUtf32(source); + + Assert.SdkAssert(0 <= sourceLength); + + CharacterEncodingResult result = ConvertStringUtf32ToUtf8Impl(out int writtenCount, out _, + destination.Slice(0, destination.Length - 1), source.Slice(0, sourceLength)); + + if (result == CharacterEncodingResult.Success) + 
destination[writtenCount] = 0; + + return result; + } + + public static CharacterEncodingResult GetLengthOfConvertedStringUtf8ToUtf32(out int length, + ReadOnlySpan source, int sourceLength) + { + UnsafeHelpers.SkipParamInit(out length); + Span buffer = stackalloc uint[0x20]; + + Assert.SdkRequires(0 <= sourceLength, $"{nameof(sourceLength)} must not be negative."); + Assert.SdkRequires(sourceLength <= source.Length); + + int totalLength = 0; + source = source.Slice(0, sourceLength); + + while (source.Length > 0) + { + CharacterEncodingResult result = + ConvertStringUtf8ToUtf32Impl(out int writtenCount, out int readCount, buffer, source); + + if (result == CharacterEncodingResult.InvalidFormat) + return CharacterEncodingResult.InvalidFormat; + + totalLength += writtenCount; + source = source.Slice(readCount); + } + + Assert.SdkAssert(0 <= totalLength); + + length = totalLength; + return CharacterEncodingResult.Success; + } + + public static CharacterEncodingResult GetLengthOfConvertedStringUtf8ToUtf32(out int length, + ReadOnlySpan source) + { + int sourceLength = StringUtils.GetLength(source); + + Assert.SdkAssert(0 <= sourceLength); + + return GetLengthOfConvertedStringUtf8ToUtf32(out length, source, sourceLength); + } + + public static CharacterEncodingResult GetLengthOfConvertedStringUtf32ToUtf8(out int length, + ReadOnlySpan source, int sourceLength) + { + UnsafeHelpers.SkipParamInit(out length); + Span buffer = stackalloc byte[0x20]; + + Assert.SdkRequires(0 <= sourceLength, $"{nameof(sourceLength)} must not be negative."); + Assert.SdkRequires(sourceLength <= source.Length); + + int totalLength = 0; + source = source.Slice(0, sourceLength); + + while (source.Length > 0) + { + CharacterEncodingResult result = + ConvertStringUtf32ToUtf8Impl(out int writtenCount, out int readCount, buffer, source); + + if (result == CharacterEncodingResult.InvalidFormat) + return CharacterEncodingResult.InvalidFormat; + + totalLength += writtenCount; + source = 
source.Slice(readCount); + } + + Assert.SdkAssert(0 <= totalLength); + + length = totalLength; + return CharacterEncodingResult.Success; + } + + public static CharacterEncodingResult GetLengthOfConvertedStringUtf32ToUtf8(out int length, + ReadOnlySpan source) + { + int sourceLength = GetLengthOfUtf32(source); + + Assert.SdkAssert(0 <= sourceLength); + + return GetLengthOfConvertedStringUtf32ToUtf8(out length, source, sourceLength); + } + + public static CharacterEncodingResult ConvertCharacterUtf8ToUtf16Native(Span destination, + ReadOnlySpan source) + { + if (destination.Length < 2) + return CharacterEncodingResult.InsufficientLength; + + if (source.Length < 1) + return CharacterEncodingResult.InvalidFormat; + + Span bufferSrc = stackalloc byte[5]; + Span bufferDst = stackalloc ushort[3]; + + bufferSrc[0] = source[0]; + bufferSrc[1] = 0; + bufferSrc[2] = 0; + bufferSrc[3] = 0; + bufferSrc[4] = 0; + + // Read more code units if needed + if (source[0] >= 0xC2 && source[0] < 0xE0) + { + if (source.Length < 2) + return CharacterEncodingResult.InvalidFormat; + + bufferSrc[1] = source[1]; + } + else if (source[0] >= 0xE0 && source[0] < 0xF0) + { + if (source.Length < 3) + return CharacterEncodingResult.InvalidFormat; + + bufferSrc[1] = source[1]; + bufferSrc[2] = source[2]; + + } + else if (source[0] >= 0xF0 && source[0] < 0xF8) + { + if (source.Length < 4) + return CharacterEncodingResult.InvalidFormat; + + bufferSrc[1] = source[1]; + bufferSrc[2] = source[2]; + bufferSrc[3] = source[3]; + } + + bufferDst.Clear(); + + CharacterEncodingResult result = ConvertStringUtf8ToUtf16Native(bufferDst, bufferSrc); + destination[0] = bufferDst[0]; + destination[1] = bufferDst[1]; + + return result; + } + + public static CharacterEncodingResult ConvertCharacterUtf8ToUtf16Native(Span destination, + ReadOnlySpan source) + { + return ConvertCharacterUtf8ToUtf16Native(MemoryMarshal.Cast(destination), source); + } + + public static CharacterEncodingResult 
ConvertCharacterUtf16NativeToUtf8(Span destination, + ReadOnlySpan source) + { + if (destination.Length < 4) + return CharacterEncodingResult.InsufficientLength; + + if (source.Length < 1) + return CharacterEncodingResult.InvalidFormat; + + Span bufferSrc = stackalloc ushort[3]; + Span bufferDst = stackalloc byte[5]; + + bufferSrc[0] = source[0]; + bufferSrc[1] = 0; + bufferSrc[2] = 0; + + // Read more code units if needed + if (source[0] >= 0xD800 && source[0] < 0xE000) + { + if (source.Length < 2) + return CharacterEncodingResult.InvalidFormat; + + bufferSrc[1] = source[1]; + } + + bufferDst.Clear(); + + CharacterEncodingResult result = ConvertStringUtf16NativeToUtf8(bufferDst, bufferSrc); + destination[0] = bufferDst[0]; + destination[1] = bufferDst[1]; + destination[2] = bufferDst[2]; + destination[3] = bufferDst[3]; + + return result; + } + + public static CharacterEncodingResult ConvertCharacterUtf16NativeToUtf8(Span destination, + ReadOnlySpan source) + { + return ConvertCharacterUtf16NativeToUtf8(destination, MemoryMarshal.Cast(source)); + } + + public static CharacterEncodingResult ConvertCharacterUtf8ToUtf32(out uint destination, + ReadOnlySpan source) + { + UnsafeHelpers.SkipParamInit(out destination); + + if (source.Length < 1) + return CharacterEncodingResult.InvalidFormat; + + switch (Utf8NBytesTable[source[0]]) + { + case 1: + destination = source[0]; + return CharacterEncodingResult.Success; + + case 2: + if (source.Length < 2) break; + if ((source[0] & 0x1E) == 0) break; + if (Utf8NBytesTable[source[1]] != 0) break; + + destination = ((source[0] & 0x1Fu) << 6) | ((source[1] & 0x3Fu) << 0); + return CharacterEncodingResult.Success; + + case 3: + if (source.Length < 3) break; + if (Utf8NBytesTable[source[1]] != 0 || Utf8NBytesTable[source[2]] != 0) break; + + uint codePoint3 = ((source[0] & 0xFu) << 12) | ((source[1] & 0x3Fu) << 6) | ((source[2] & 0x3Fu) << 0); + + if ((codePoint3 & 0xF800) == 0 || (codePoint3 & 0xF800) == 0xD800) + break; + + 
destination = codePoint3; + return CharacterEncodingResult.Success; + + case 4: + if (source.Length < 4) break; + if (Utf8NBytesTable[source[1]] != 0 || Utf8NBytesTable[source[2]] != 0 || Utf8NBytesTable[source[3]] != 0) break; + + uint codePoint4 = ((source[0] & 7u) << 18) | ((source[1] & 0x3Fu) << 12) | ((source[2] & 0x3Fu) << 6) | ((source[3] & 0x3Fu) << 0); + + if (codePoint4 < 0x10000 || codePoint4 >= 0x110000) + break; + + destination = codePoint4; + return CharacterEncodingResult.Success; + } + + return CharacterEncodingResult.InvalidFormat; + } + + public static CharacterEncodingResult ConvertCharacterUtf32ToUtf8(Span destination, uint source) + { + if (destination.Length < 4) + return CharacterEncodingResult.InsufficientLength; + + destination[0] = 0; + destination[1] = 0; + destination[2] = 0; + destination[3] = 0; + + if (source < 0x80) + { + destination[0] = (byte)source; + } + else if (source < 0x800) + { + destination[0] = (byte)(0xC0 | source >> 6); + destination[1] = (byte)(0x80 | (source & 0x3F)); + } + else if (source < 0x10000) + { + if (source >= 0xD800 && source <= 0xDFFF) + return CharacterEncodingResult.InvalidFormat; + + destination[0] = (byte)(0xE0 | (source >> 12) & 0xF); + destination[1] = (byte)(0x80 | (source >> 6) & 0x3F); + destination[2] = (byte)(0x80 | (source >> 0) & 0x3F); + + } + else if (source < 0x110000) + { + destination[0] = (byte)(0xF0 | (source >> 18)); + destination[1] = (byte)(0x80 | (source >> 12) & 0x3F); + destination[2] = (byte)(0x80 | (source >> 6) & 0x3F); + destination[3] = (byte)(0x80 | (source >> 0) & 0x3F); + } + else + { + return CharacterEncodingResult.InvalidFormat; + } + + return CharacterEncodingResult.Success; + } + + public static CharacterEncodingResult PickOutCharacterFromUtf8String(Span destinationChar, + ref ReadOnlySpan source) + { + Assert.SdkRequires(destinationChar.Length >= 4); + Assert.SdkRequires(source.Length >= 1); + Assert.SdkRequires(source[0] != 0); + + ReadOnlySpan str = source; + + if 
(destinationChar.Length < 4) + return CharacterEncodingResult.InsufficientLength; + + if (str.Length < 1) + return CharacterEncodingResult.InvalidFormat; + + destinationChar[0] = 0; + destinationChar[1] = 0; + destinationChar[2] = 0; + destinationChar[3] = 0; + + uint codePoint = str[0]; + + switch (Utf8NBytesTable[(int)codePoint]) + { + case 1: + destinationChar[0] = str[0]; + source = str.Slice(1); + break; + + case 2: + if (str.Length < 2) + return CharacterEncodingResult.InvalidFormat; + + if ((str[0] & 0x1E) == 0 || Utf8NBytesTable[str[1]] != 0) + return CharacterEncodingResult.InvalidFormat; + + destinationChar[0] = str[0]; + destinationChar[1] = str[1]; + source = str.Slice(2); + break; + + case 3: + if (str.Length < 3) + return CharacterEncodingResult.InvalidFormat; + + if (Utf8NBytesTable[str[1]] != 0 || Utf8NBytesTable[str[2]] != 0) + return CharacterEncodingResult.InvalidFormat; + + codePoint = ((str[0] & 0xFu) << 12) | + ((str[1] & 0x3Fu) << 6) | + ((str[2] & 0x3Fu) << 0); + + if ((codePoint & 0xF800) == 0 || (codePoint & 0xF800) == 0xD800) + return CharacterEncodingResult.InvalidFormat; + + destinationChar[0] = str[0]; + destinationChar[1] = str[1]; + destinationChar[2] = str[2]; + source = str.Slice(3); + break; + + case 4: + if (str.Length < 4) + return CharacterEncodingResult.InvalidFormat; + + if (Utf8NBytesTable[str[1]] != 0 || Utf8NBytesTable[str[2]] != 0 || Utf8NBytesTable[str[3]] != 0) + return CharacterEncodingResult.InvalidFormat; + + codePoint = ((str[0] & 7u) << 18) | + ((str[1] & 0x3Fu) << 12) | + ((str[2] & 0x3Fu) << 6) | + ((str[3] & 0x3Fu) << 0); + + if (codePoint < 0x10000 || codePoint >= 0x110000) + return CharacterEncodingResult.InvalidFormat; + + destinationChar[0] = str[0]; + destinationChar[1] = str[1]; + destinationChar[2] = str[2]; + destinationChar[3] = str[3]; + source = str.Slice(4); + break; + + default: + return CharacterEncodingResult.InvalidFormat; + } + + return CharacterEncodingResult.Success; + } + } +} diff --git 
a/tests/LibHac.Tests/SpanEqualAsserts.cs b/tests/LibHac.Tests/SpanEqualAsserts.cs new file mode 100644 index 00000000..8bdc74ed --- /dev/null +++ b/tests/LibHac.Tests/SpanEqualAsserts.cs @@ -0,0 +1,21 @@ +using System; +using Xunit.Sdk; + +namespace Xunit +{ + public partial class Assert + { + /// + /// Verifies that two spans are equal, using a default comparer. + /// + /// The type of the objects to be compared + /// The expected value + /// The value to be compared against + /// Thrown when the objects are not equal + public static void Equal(ReadOnlySpan expected, ReadOnlySpan actual) where T : unmanaged, IEquatable + { + if(!expected.SequenceEqual(actual)) + throw new EqualException(expected.ToArray(), actual.ToArray()); + } + } +} diff --git a/tests/LibHac.Tests/Util/CharacterEncodingTests.cs b/tests/LibHac.Tests/Util/CharacterEncodingTests.cs new file mode 100644 index 00000000..c01f949e --- /dev/null +++ b/tests/LibHac.Tests/Util/CharacterEncodingTests.cs @@ -0,0 +1,664 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; +using System.Text.RegularExpressions; +using LibHac.Util; +using Xunit; + +namespace LibHac.Tests.Util +{ + public class CharacterEncodingTests + { + // Most of these tests are stolen from .NET's UTF-8 tests. Some of the comments in this file may + // mention code paths and functions being tested in the .NET runtime that don't apply here as a result. 
+
+        // ReSharper disable InconsistentNaming UnusedMember.Local
+        private const string X_UTF8 = "58"; // U+0058 LATIN CAPITAL LETTER X, 1 byte
+        private const string X_UTF16 = "X";
+
+        private const string Y_UTF8 = "59"; // U+0059 LATIN CAPITAL LETTER Y, 1 byte
+        private const string Y_UTF16 = "Y";
+
+        private const string Z_UTF8 = "5A"; // U+005A LATIN CAPITAL LETTER Z, 1 byte
+        private const string Z_UTF16 = "Z";
+
+        private const string E_ACUTE_UTF8 = "C3A9"; // U+00E9 LATIN SMALL LETTER E WITH ACUTE, 2 bytes
+        private const string E_ACUTE_UTF16 = "\u00E9";
+
+        private const string EURO_SYMBOL_UTF8 = "E282AC"; // U+20AC EURO SIGN, 3 bytes
+        private const string EURO_SYMBOL_UTF16 = "\u20AC";
+
+        private const string REPLACEMENT_CHAR_UTF8 = "EFBFBD"; // U+FFFD REPLACEMENT CHAR, 3 bytes
+        private const string REPLACEMENT_CHAR_UTF16 = "\uFFFD";
+
+        private const string GRINNING_FACE_UTF8 = "F09F9880"; // U+1F600 GRINNING FACE, 4 bytes
+        private const string GRINNING_FACE_UTF16 = "\U0001F600";
+
+        private const string WOMAN_CARTWHEELING_MEDSKIN_UTF16 = "\U0001F938\U0001F3FD\u200D\u2640\uFE0F"; // U+1F938 U+1F3FD U+200D U+2640 U+FE0F WOMAN CARTWHEELING: MEDIUM SKIN TONE
+
+        // All valid scalars [ U+0000 .. U+D7FF ] and [ U+E000 .. U+10FFFF ].
+        private static readonly IEnumerable<Rune> s_allValidScalars = Enumerable.Range(0x0000, 0xD800).Concat(Enumerable.Range(0xE000, 0x110000 - 0xE000)).Select(value => new Rune(value));
+
+        private static readonly ReadOnlyMemory<uint> s_allScalarsAsUtf32;
+        private static readonly ReadOnlyMemory<char> s_allScalarsAsUtf16;
+        private static readonly ReadOnlyMemory<byte> s_allScalarsAsUtf8;
+        // ReSharper restore InconsistentNaming UnusedMember.Local
+
+        static CharacterEncodingTests()
+        {
+            var allScalarsAsUtf32 = new List<uint>();
+            var allScalarsAsUtf16 = new List<char>();
+            var allScalarsAsUtf8 = new List<byte>();
+
+            Span<byte> utf8 = stackalloc byte[4];
+            Span<char> utf16 = stackalloc char[2];
+
+            foreach (Rune rune in s_allValidScalars)
+            {
+                int utf8Length = ToUtf8(rune, utf8);
+                int utf16Length = ToUtf16(rune, utf16);
+
+                allScalarsAsUtf32.Add((uint)rune.Value);
+
+                for (int i = 0; i < utf16Length; i++)
+                    allScalarsAsUtf16.Add(utf16[i]);
+
+                for (int i = 0; i < utf8Length; i++)
+                    allScalarsAsUtf8.Add(utf8[i]);
+            }
+
+            s_allScalarsAsUtf32 = allScalarsAsUtf32.ToArray().AsMemory();
+            s_allScalarsAsUtf16 = allScalarsAsUtf16.ToArray().AsMemory();
+            s_allScalarsAsUtf8 = allScalarsAsUtf8.ToArray().AsMemory();
+        }
+
+        /*
+         * COMMON UTILITIES FOR UNIT TESTS
+         */
+
+        public static byte[] DecodeHex(ReadOnlySpan<char> inputHex)
+        {
+            Assert.True(Regex.IsMatch(inputHex.ToString(), "^([0-9a-fA-F]{2})*$"), "Input must be an even number of hex characters.");
+
+            return Convert.FromHexString(inputHex);
+        }
+
+        public static byte[] ToUtf8(Rune rune)
+        {
+            Span<byte> utf8 = stackalloc byte[4];
+
+            int length = ToUtf8(rune, utf8);
+            return utf8.Slice(0, length).ToArray();
+        }
+
+        private static char[] ToUtf16(Rune rune)
+        {
+            Span<char> utf16 = stackalloc char[2];
+
+            int length = ToUtf16(rune, utf16);
+            return utf16.Slice(0, length).ToArray();
+        }
+
+        // !! IMPORTANT !!
+        // Don't delete this implementation, as we use it as a reference to make sure the framework's
+        // transcoding logic is correct.
+        public static int ToUtf8(Rune rune, Span<byte> destination)
+        {
+            if (!Rune.IsValid(rune.Value)) // the message below is only formatted when this check fails
+            {
+                Assert.True(Rune.IsValid(rune.Value), $"Rune with value U+{(uint)rune.Value:X4} is not well-formed.");
+            }
+
+            Assert.True(destination.Length == 4);
+
+            destination[0] = 0;
+            destination[1] = 0;
+            destination[2] = 0;
+            destination[3] = 0;
+
+            if (rune.Value < 0x80) // 1 byte: 0xxxxxxx
+            {
+                destination[0] = (byte)rune.Value;
+                return 1;
+            }
+            else if (rune.Value < 0x0800) // 2 bytes: 110xxxxx 10xxxxxx
+            {
+                destination[0] = (byte)((rune.Value >> 6) | 0xC0);
+                destination[1] = (byte)((rune.Value & 0x3F) | 0x80);
+                return 2;
+            }
+            else if (rune.Value < 0x10000) // 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx
+            {
+                destination[0] = (byte)((rune.Value >> 12) | 0xE0);
+                destination[1] = (byte)(((rune.Value >> 6) & 0x3F) | 0x80);
+                destination[2] = (byte)((rune.Value & 0x3F) | 0x80);
+                return 3;
+            }
+            else // 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+            {
+                destination[0] = (byte)((rune.Value >> 18) | 0xF0);
+                destination[1] = (byte)(((rune.Value >> 12) & 0x3F) | 0x80);
+                destination[2] = (byte)(((rune.Value >> 6) & 0x3F) | 0x80);
+                destination[3] = (byte)((rune.Value & 0x3F) | 0x80);
+                return 4;
+            }
+        }
+
+        // !! IMPORTANT !!
+        // Don't delete this implementation, as we use it as a reference to make sure the framework's
+        // transcoding logic is correct.
+        private static int ToUtf16(Rune rune, Span<char> destination)
+        {
+            if (!Rune.IsValid(rune.Value)) // the message below is only formatted when this check fails
+            {
+                Assert.True(Rune.IsValid(rune.Value), $"Rune with value U+{(uint)rune.Value:X4} is not well-formed.");
+            }
+
+            Assert.True(destination.Length == 2);
+
+            destination[0] = '\0';
+            destination[1] = '\0';
+
+            if (rune.IsBmp)
+            {
+                destination[0] = (char)rune.Value;
+                return 1;
+            }
+            else
+            {
+                destination[0] = (char)((rune.Value >> 10) + 0xD800 - 0x40); // high surrogate; 0x40 is the 0x10000 offset shifted right by 10
+                destination[1] = (char)((rune.Value & 0x03FF) + 0xDC00); // low surrogate carries the low 10 bits
+                return 2;
+            }
+        }
+
+        [Theory]
+        [InlineData("", "")] // empty string is OK
+        [InlineData(X_UTF16, X_UTF8)]
+        [InlineData(E_ACUTE_UTF16, E_ACUTE_UTF8)]
+        [InlineData(EURO_SYMBOL_UTF16, EURO_SYMBOL_UTF8)]
+        public void Utf16ToUtf8_WithSmallValidBuffers(string utf16Input, string expectedUtf8TranscodingHex)
+        {
+            Assert.InRange(utf16Input.Length, 0, 1);
+
+            Utf16ToUtf8_String_Test_Core(
+                utf16Input: utf16Input,
+                destinationSize: expectedUtf8TranscodingHex.Length / 2,
+                expectedEncodingResult: CharacterEncodingResult.Success,
+                expectedUtf8Transcoding: DecodeHex(expectedUtf8TranscodingHex));
+        }
+
+        [Theory]
+        [InlineData('\uD800', CharacterEncodingResult.InsufficientLength)] // standalone high surrogate
+        [InlineData('\uDFFF', CharacterEncodingResult.InvalidFormat)] // standalone low surrogate
+        public void Utf16ToUtf8_WithOnlyStandaloneSurrogates(char charValue, CharacterEncodingResult expectedEncodingResult)
+        {
+            Utf16ToUtf8_String_Test_Core(
+                utf16Input: new[] { charValue },
+                destinationSize: 0,
+                expectedEncodingResult: expectedEncodingResult,
+                expectedUtf8Transcoding: Span<byte>.Empty);
+        }
+
+        [Theory]
+        [InlineData("<LOW><HIGH>", "")] // swapped surrogate pair characters
+        [InlineData("A<LOW><HIGH>", "41")] // consume standalone ASCII char, then swapped surrogate pair characters
+        [InlineData("A<HIGH>B", "41F0")] // consume standalone ASCII char, then standalone high surrogate char
+        [InlineData("A<LOW>B", "41")] // consume standalone ASCII char, then standalone low surrogate char
+        [InlineData("AB<HIGH>", "4142F0")] // consume two ASCII chars, then standalone high surrogate char
+        [InlineData("AB<LOW>", "4142")] // consume two ASCII chars, then standalone low surrogate char
+        public void Utf16ToUtf8_WithInvalidSurrogates(string utf16Input, string expectedUtf8TranscodingHex)
+        {
+            // xUnit can't handle ill-formed strings in [InlineData], so we replace here.
+
+            utf16Input = utf16Input.Replace("<HIGH>", "\uD800").Replace("<LOW>", "\uDFFF");
+
+            // These test cases are for the "fast processing" code which is the main loop of TranscodeToUtf8,
+            // so inputs should be at least 2 chars.
+
+            Assert.True(utf16Input.Length >= 2);
+
+            Utf16ToUtf8_String_Test_Core(
+                utf16Input: utf16Input,
+                destinationSize: expectedUtf8TranscodingHex.Length / 2,
+                expectedEncodingResult: CharacterEncodingResult.InvalidFormat,
+                expectedUtf8Transcoding: DecodeHex(expectedUtf8TranscodingHex));
+
+            // Now try the tests again with a larger buffer.
+            // This ensures that running out of destination space wasn't the reason we failed.
+
+            Utf16ToUtf8_String_Test_Core(
+                utf16Input: utf16Input,
+                destinationSize: (expectedUtf8TranscodingHex.Length) / 2 + 16,
+                expectedEncodingResult: CharacterEncodingResult.InvalidFormat,
+                expectedUtf8Transcoding: DecodeHex(expectedUtf8TranscodingHex));
+        }
+
+        [Theory]
+        [InlineData("80", CharacterEncodingResult.InsufficientLength, "")] // sequence cannot begin with continuation character
+        [InlineData("8182", CharacterEncodingResult.InsufficientLength, "")] // sequence cannot begin with continuation character
+        [InlineData("838485", CharacterEncodingResult.InsufficientLength, "")] // sequence cannot begin with continuation character
+        [InlineData(X_UTF8 + "80", CharacterEncodingResult.InsufficientLength, X_UTF16)] // sequence cannot begin with continuation character
+        [InlineData(X_UTF8 + "8182", CharacterEncodingResult.InsufficientLength, X_UTF16)] // sequence cannot begin with continuation character
+        [InlineData("C0", CharacterEncodingResult.InvalidFormat, "")] // [ C0 ] is always invalid
+        [InlineData("C080", CharacterEncodingResult.InsufficientLength, "")] // [ C0 ] is always invalid
+        [InlineData("C08081", CharacterEncodingResult.InsufficientLength, "")] // [ C0 ] is always invalid
+        [InlineData(X_UTF8 + "C1", CharacterEncodingResult.InvalidFormat, X_UTF16)] // [ C1 ] is always invalid
+        [InlineData(X_UTF8 + "C180", CharacterEncodingResult.InsufficientLength, X_UTF16)] // [ C1 ] is always invalid
+        [InlineData(X_UTF8 + "C27F", CharacterEncodingResult.InsufficientLength, X_UTF16)] // [ C2 ] is improperly terminated
+        [InlineData("E2827F", CharacterEncodingResult.InsufficientLength, "")] // [ E2 82 ] is improperly terminated
+        [InlineData("E09F80", CharacterEncodingResult.InsufficientLength, "")] // [ E0 9F ... ] is overlong
+        [InlineData("E0C080", CharacterEncodingResult.InsufficientLength, "")] // [ E0 ] is improperly terminated
+        [InlineData("ED7F80", CharacterEncodingResult.InsufficientLength, "")] // [ ED ] is improperly terminated
+        [InlineData("EDA080", CharacterEncodingResult.InsufficientLength, "")] // [ ED A0 ... ] is surrogate
+        public void Utf8ToUtf16_WithSmallInvalidBuffers(string utf8HexInput, CharacterEncodingResult expectedEncodingResult, string expectedUtf16Transcoding)
+        {
+            Utf8ToUtf16_String_Test_Core(
+                utf8Input: DecodeHex(utf8HexInput),
+                destinationSize: expectedUtf16Transcoding.Length,
+                expectedEncodingResult: expectedEncodingResult,
+                expectedUtf16Transcoding: expectedUtf16Transcoding);
+
+            // Now try the tests again with a larger buffer.
+            // This ensures that the sequence is seen as invalid when not hitting the destination length check.
+
+            Utf8ToUtf16_String_Test_Core(
+                utf8Input: DecodeHex(utf8HexInput),
+                destinationSize: expectedUtf16Transcoding.Length + 16,
+                expectedEncodingResult: CharacterEncodingResult.InvalidFormat,
+                expectedUtf16Transcoding: expectedUtf16Transcoding);
+        }
+
+        [Theory]
+        /* SMALL VALID BUFFERS - tests drain loop at end of method */
+        [InlineData("")] // empty string is OK
+        [InlineData("X")]
+        [InlineData("XY")]
+        [InlineData("XYZ")]
+        [InlineData(E_ACUTE_UTF16)]
+        [InlineData(X_UTF16 + E_ACUTE_UTF16)]
+        [InlineData(E_ACUTE_UTF16 + X_UTF16)]
+        [InlineData(EURO_SYMBOL_UTF16)]
+        /* LARGE VALID BUFFERS - test main loop at beginning of method */
+        [InlineData(E_ACUTE_UTF16 + "ABCD" + "0123456789:;<=>?")] // Loop unrolling at end of buffer
+        [InlineData(E_ACUTE_UTF16 + "ABCD" + "0123456789:;<=>?" + "01234567" + E_ACUTE_UTF16 + "89:;<=>?")] // Loop unrolling interrupted by non-ASCII
+        [InlineData("ABC" + E_ACUTE_UTF16 + "0123")] // 3 ASCII bytes followed by non-ASCII
+        [InlineData("AB" + E_ACUTE_UTF16 + "0123")] // 2 ASCII bytes followed by non-ASCII
+        [InlineData("A" + E_ACUTE_UTF16 + "0123")] // 1 ASCII byte followed by non-ASCII
+        [InlineData(E_ACUTE_UTF16 + E_ACUTE_UTF16 + E_ACUTE_UTF16 + E_ACUTE_UTF16)] // 4x 2-byte sequences, exercises optimization code path in 2-byte sequence processing
+        [InlineData(E_ACUTE_UTF16 + E_ACUTE_UTF16 + E_ACUTE_UTF16 + "PQ")] // 3x 2-byte sequences + 2 ASCII bytes, exercises optimization code path in 2-byte sequence processing
+        [InlineData(E_ACUTE_UTF16 + "PQ")] // single 2-byte sequence + 2 trailing ASCII bytes, exercises draining logic in 2-byte sequence processing
+        [InlineData(E_ACUTE_UTF16 + "P" + E_ACUTE_UTF16 + "0@P")] // single 2-byte sequences + 1 trailing ASCII byte + 2-byte sequence, exercises draining logic in 2-byte sequence processing
+        [InlineData(EURO_SYMBOL_UTF16 + "@")] // single 3-byte sequence + 1 trailing ASCII byte, exercises draining logic in 3-byte sequence processing
+        [InlineData(EURO_SYMBOL_UTF16 + "@P`")] // single 3-byte sequence + 3 trailing ASCII byte, exercises draining logic and "running out of data" logic in 3-byte sequence processing
+        [InlineData(EURO_SYMBOL_UTF16 + EURO_SYMBOL_UTF16 + EURO_SYMBOL_UTF16)] // 3x 3-byte sequences, exercises "stay within 3-byte loop" logic in 3-byte sequence processing
+        [InlineData(EURO_SYMBOL_UTF16 + EURO_SYMBOL_UTF16 + EURO_SYMBOL_UTF16 + EURO_SYMBOL_UTF16)] // 4x 3-byte sequences, exercises "consume multiple bytes at a time" logic in 3-byte sequence processing
+        [InlineData(EURO_SYMBOL_UTF16 + EURO_SYMBOL_UTF16 + EURO_SYMBOL_UTF16 + E_ACUTE_UTF16)] // 3x 3-byte sequences + single 2-byte sequence, exercises "consume multiple bytes at a time" logic in 3-byte sequence processing
+        [InlineData(EURO_SYMBOL_UTF16 + EURO_SYMBOL_UTF16 + E_ACUTE_UTF16 + E_ACUTE_UTF16 + E_ACUTE_UTF16 + E_ACUTE_UTF16)] // 2x 3-byte sequences + 4x 2-byte sequences, exercises "consume multiple bytes at a time" logic in 3-byte sequence processing
+        [InlineData(GRINNING_FACE_UTF16 + GRINNING_FACE_UTF16)] // 2x 4-byte sequences, exercises 4-byte sequence processing
+        [InlineData(GRINNING_FACE_UTF16 + "@AB")] // single 4-byte sequence + 3 ASCII bytes, exercises 4-byte sequence processing and draining logic
+        [InlineData(WOMAN_CARTWHEELING_MEDSKIN_UTF16)] // exercises switching between multiple sequence lengths
+        public void Utf8ToUtf16_ValidBuffers(string utf16Input)
+        {
+            // We're going to run the tests with destination buffer lengths ranging from 0 all the way
+            // to buffers large enough to hold the full output. This allows us to test logic that
+            // detects whether we're about to overrun our destination buffer and instead returns InsufficientLength.
+
+            Rune[] enumeratedScalars = utf16Input.EnumerateRunes().ToArray();
+
+            // Convert entire input to UTF-8 using our unit test reference logic.
+
+            byte[] utf8Input = enumeratedScalars.SelectMany(ToUtf8).ToArray();
+
+            // 0-length buffer test
+            Utf8ToUtf16_String_Test_Core(
+                utf8Input: utf8Input,
+                destinationSize: 0,
+                expectedEncodingResult: (utf8Input.Length == 0) ? CharacterEncodingResult.Success : CharacterEncodingResult.InsufficientLength,
+                expectedUtf16Transcoding: ReadOnlySpan<char>.Empty);
+
+            char[] concatenatedUtf16 = Array.Empty<char>();
+
+            for (int i = 0; i < enumeratedScalars.Length; i++)
+            {
+                Rune thisScalar = enumeratedScalars[i];
+
+                // if this is an astral scalar value, quickly test a buffer that's not large enough to contain the entire UTF-16 encoding
+
+                if (!thisScalar.IsBmp)
+                {
+                    Utf8ToUtf16_String_Test_Core(
+                        utf8Input: utf8Input,
+                        destinationSize: concatenatedUtf16.Length + 1,
+                        expectedEncodingResult: CharacterEncodingResult.InsufficientLength,
+                        expectedUtf16Transcoding: concatenatedUtf16);
+                }
+
+                // now provide a destination buffer large enough to hold the next full scalar encoding
+
+                concatenatedUtf16 = concatenatedUtf16.Concat(ToUtf16(thisScalar)).ToArray();
+
+                Utf8ToUtf16_String_Test_Core(
+                    utf8Input: utf8Input,
+                    destinationSize: concatenatedUtf16.Length,
+                    expectedEncodingResult: (i == enumeratedScalars.Length - 1) ? CharacterEncodingResult.Success : CharacterEncodingResult.InsufficientLength,
+                    expectedUtf16Transcoding: concatenatedUtf16);
+            }
+
+            // now throw lots of ASCII data at the beginning so that we exercise the vectorized code paths
+
+            utf16Input = new string('x', 64) + utf16Input;
+            utf8Input = utf16Input.EnumerateRunes().SelectMany(ToUtf8).ToArray();
+
+            Utf8ToUtf16_String_Test_Core(
+                utf8Input: utf8Input,
+                destinationSize: utf16Input.Length,
+                expectedEncodingResult: CharacterEncodingResult.Success,
+                expectedUtf16Transcoding: utf16Input);
+
+            // now throw some non-ASCII data at the beginning so that we *don't* exercise the vectorized code paths
+
+            utf16Input = WOMAN_CARTWHEELING_MEDSKIN_UTF16 + utf16Input[64..];
+            utf8Input = utf16Input.EnumerateRunes().SelectMany(ToUtf8).ToArray();
+
+            Utf8ToUtf16_String_Test_Core(
+                utf8Input: utf8Input,
+                destinationSize: utf16Input.Length,
+                expectedEncodingResult: CharacterEncodingResult.Success,
+                expectedUtf16Transcoding: utf16Input);
+        }
+
+        [Fact]
+        public void Utf8ToUtf16_String_AllPossibleScalarValues()
+        {
+            Utf8ToUtf16_String_Test_Core(
+                utf8Input: s_allScalarsAsUtf8.Span,
+                destinationSize: s_allScalarsAsUtf16.Length,
+                expectedEncodingResult: CharacterEncodingResult.Success,
+                expectedUtf16Transcoding: s_allScalarsAsUtf16.Span);
+        }
+
+        [Fact]
+        public void Utf16ToUtf8_String_AllPossibleScalarValues()
+        {
+            Utf16ToUtf8_String_Test_Core(
+                utf16Input: s_allScalarsAsUtf16.Span,
+                destinationSize: s_allScalarsAsUtf8.Length,
+                expectedEncodingResult: CharacterEncodingResult.Success,
+                expectedUtf8Transcoding: s_allScalarsAsUtf8.Span);
+        }
+
+        [Fact]
+        public void Utf8ToUtf32_String_AllPossibleScalarValues()
+        {
+            Utf8ToUtf32_String_Test_Core(
+                utf8Input: s_allScalarsAsUtf8.Span,
+                destinationSize: s_allScalarsAsUtf32.Length,
+                expectedEncodingResult: CharacterEncodingResult.Success,
+                expectedUtf32Transcoding: s_allScalarsAsUtf32.Span);
+        }
+
+        [Fact]
+        public void Utf32ToUtf8_String_AllPossibleScalarValues()
+        {
+            Utf32ToUtf8_String_Test_Core(
+                utf32Input: s_allScalarsAsUtf32.Span,
+                destinationSize: s_allScalarsAsUtf8.Length,
+                expectedEncodingResult: CharacterEncodingResult.Success,
+                expectedUtf8Transcoding: s_allScalarsAsUtf8.Span);
+        }
+
+        [Fact]
+        public void Utf8ToUtf16_Length_AllPossibleScalarValues()
+        {
+            Utf8ToUtf16_Length_Test_Core(
+                // Skip the first code point because it's 0
+                utf8Input: s_allScalarsAsUtf8.Span.Slice(1),
+                expectedEncodingResult: CharacterEncodingResult.Success,
+                expectedUtf16Length: s_allScalarsAsUtf16.Length - 1);
+        }
+
+        [Fact]
+        public void Utf16ToUtf8_Length_AllPossibleScalarValues()
+        {
+            Utf16ToUtf8_Length_Test_Core(
+                utf16Input: s_allScalarsAsUtf16.Span.Slice(1),
+                expectedEncodingResult: CharacterEncodingResult.Success,
+                expectedUtf8Length: s_allScalarsAsUtf8.Length - 1);
+        }
+
+        [Fact]
+        public void Utf8ToUtf32_Length_AllPossibleScalarValues()
+        {
+            Utf8ToUtf32_Length_Test_Core(
+                utf8Input: s_allScalarsAsUtf8.Span.Slice(1),
+                expectedEncodingResult: CharacterEncodingResult.Success,
+                expectedUtf32Length: s_allScalarsAsUtf32.Length - 1);
+        }
+
+        [Fact]
+        public void Utf32ToUtf8_Length_AllPossibleScalarValues()
+        {
+            Utf32ToUtf8_Length_Test_Core(
+                utf32Input: s_allScalarsAsUtf32.Span.Slice(1),
+                expectedEncodingResult: CharacterEncodingResult.Success,
+                expectedUtf8Length: s_allScalarsAsUtf8.Length - 1);
+        }
+
+        [Fact]
+        public void Utf8ToUtf16_Character_AllPossibleScalarValues()
+        {
+            Span<byte> utf8 = stackalloc byte[4];
+            Span<char> utf16 = stackalloc char[2];
+
+            foreach (Rune rune in s_allValidScalars)
+            {
+                ToUtf8(rune, utf8);
+                ToUtf16(rune, utf16);
+
+                Utf8ToUtf16_Character_Test_Core(
+                    utf8Input: utf8,
+                    expectedEncodingResult: CharacterEncodingResult.Success,
+                    expectedUtf16Transcoding: utf16);
+            }
+        }
+
+        [Fact]
+        public void Utf16ToUtf8_Character_AllPossibleScalarValues()
+        {
+            Span<byte> utf8 = stackalloc byte[4];
+            Span<char> utf16 = stackalloc char[2];
+
+            foreach (Rune rune in s_allValidScalars)
+            {
+                ToUtf8(rune, utf8);
+                ToUtf16(rune, utf16);
+
+                Utf16ToUtf8_Character_Test_Core(
+                    utf16Input: utf16,
+                    expectedEncodingResult: CharacterEncodingResult.Success,
+                    expectedUtf8Transcoding: utf8);
+            }
+        }
+
+        [Fact]
+        public void Utf8ToUtf32_Character_AllPossibleScalarValues()
+        {
+            Span<byte> utf8 = stackalloc byte[4];
+
+            foreach (Rune rune in s_allValidScalars)
+            {
+                ToUtf8(rune, utf8);
+
+                Utf8ToUtf32_Character_Test_Core(
+                    utf8Input: utf8,
+                    expectedEncodingResult: CharacterEncodingResult.Success,
+                    expectedUtf32Transcoding: (uint)rune.Value);
+            }
+        }
+
+        [Fact]
+        public void Utf32ToUtf8_Character_AllPossibleScalarValues()
+        {
+            Span<byte> utf8 = stackalloc byte[4];
+
+            foreach (Rune rune in s_allValidScalars)
+            {
+                ToUtf8(rune, utf8);
+
+                Utf32ToUtf8_Character_Test_Core(
+                    utf32Input: (uint)rune.Value,
+                    expectedEncodingResult: CharacterEncodingResult.Success,
+                    expectedUtf8Transcoding: utf8);
+            }
+        }
+
+        [Fact]
+        public void PickOutCharacterFromUtf8String_AllPossibleScalarValues()
+        {
+            byte[] expectedUtf8 = new byte[4];
+            byte[] actualUtf8 = new byte[4];
+
+            ReadOnlySpan<byte> utf8Values = s_allScalarsAsUtf8.Span.Slice(1);
+
+            foreach (Rune rune in s_allValidScalars.Skip(1))
+            {
+                ToUtf8(rune, expectedUtf8);
+
+                CharacterEncodingResult result = CharacterEncoding.PickOutCharacterFromUtf8String(actualUtf8, ref utf8Values);
+
+                Assert.Equal(CharacterEncodingResult.Success, result);
+                Assert.Equal((ReadOnlySpan<byte>)expectedUtf8, actualUtf8);
+            }
+        }
+
+        private static void Utf8ToUtf16_String_Test_Core(ReadOnlySpan<byte> utf8Input, int destinationSize, CharacterEncodingResult expectedEncodingResult, ReadOnlySpan<char> expectedUtf16Transcoding)
+        {
+            char[] destination = new char[destinationSize];
+
+            CharacterEncodingResult actualEncodingResult = CharacterEncoding.ConvertStringUtf8ToUtf16Native(destination, utf8Input, utf8Input.Length);
+
+            Assert.Equal(expectedEncodingResult, actualEncodingResult);
+            Assert.Equal(expectedUtf16Transcoding, destination.AsSpan(0, expectedUtf16Transcoding.Length));
+        }
+
+        private static void Utf16ToUtf8_String_Test_Core(ReadOnlySpan<char> utf16Input, int destinationSize, CharacterEncodingResult expectedEncodingResult, ReadOnlySpan<byte> expectedUtf8Transcoding)
+        {
+            byte[] destination = new byte[destinationSize];
+
+            CharacterEncodingResult actualEncodingResult = CharacterEncoding.ConvertStringUtf16NativeToUtf8(destination, utf16Input, utf16Input.Length);
+
+            Assert.Equal(expectedEncodingResult, actualEncodingResult);
+            Assert.Equal(expectedUtf8Transcoding, destination.AsSpan(0, expectedUtf8Transcoding.Length));
+        }
+
+        private static void Utf8ToUtf32_String_Test_Core(ReadOnlySpan<byte> utf8Input, int destinationSize, CharacterEncodingResult expectedEncodingResult, ReadOnlySpan<uint> expectedUtf32Transcoding)
+        {
+            uint[] destination = new uint[destinationSize];
+
+            CharacterEncodingResult actualEncodingResult = CharacterEncoding.ConvertStringUtf8ToUtf32(destination, utf8Input, utf8Input.Length);
+
+            Assert.Equal(expectedEncodingResult, actualEncodingResult);
+            Assert.Equal(expectedUtf32Transcoding, destination.AsSpan(0, expectedUtf32Transcoding.Length));
+        }
+
+        private static void Utf32ToUtf8_String_Test_Core(ReadOnlySpan<uint> utf32Input, int destinationSize, CharacterEncodingResult expectedEncodingResult, ReadOnlySpan<byte> expectedUtf8Transcoding)
+        {
+            byte[] destination = new byte[destinationSize];
+
+            CharacterEncodingResult actualEncodingResult = CharacterEncoding.ConvertStringUtf32ToUtf8(destination, utf32Input, utf32Input.Length);
+
+            Assert.Equal(expectedEncodingResult, actualEncodingResult);
+            Assert.Equal(expectedUtf8Transcoding, destination.AsSpan(0, expectedUtf8Transcoding.Length));
+        }
+
+        private static void Utf8ToUtf16_Length_Test_Core(ReadOnlySpan<byte> utf8Input, CharacterEncodingResult expectedEncodingResult, int expectedUtf16Length)
+        {
+            CharacterEncodingResult actualEncodingResult = CharacterEncoding.GetLengthOfConvertedStringUtf8ToUtf16Native(out int actualLength, utf8Input);
+
+            Assert.Equal(expectedEncodingResult, actualEncodingResult);
+            Assert.Equal(expectedUtf16Length, actualLength);
+        }
+
+        private static void Utf16ToUtf8_Length_Test_Core(ReadOnlySpan<char> utf16Input, CharacterEncodingResult expectedEncodingResult, int expectedUtf8Length)
+        {
+            CharacterEncodingResult actualEncodingResult = CharacterEncoding.GetLengthOfConvertedStringUtf16NativeToUtf8(out int actualLength, utf16Input);
+
+            Assert.Equal(expectedEncodingResult, actualEncodingResult);
+            Assert.Equal(expectedUtf8Length, actualLength);
+        }
+
+        private static void Utf8ToUtf32_Length_Test_Core(ReadOnlySpan<byte> utf8Input, CharacterEncodingResult expectedEncodingResult, int expectedUtf32Length)
+        {
+            CharacterEncodingResult actualEncodingResult = CharacterEncoding.GetLengthOfConvertedStringUtf8ToUtf32(out int actualLength, utf8Input);
+
+            Assert.Equal(expectedEncodingResult, actualEncodingResult);
+            Assert.Equal(expectedUtf32Length, actualLength);
+        }
+
+        private static void Utf32ToUtf8_Length_Test_Core(ReadOnlySpan<uint> utf32Input, CharacterEncodingResult expectedEncodingResult, int expectedUtf8Length)
+        {
+            CharacterEncodingResult actualEncodingResult = CharacterEncoding.GetLengthOfConvertedStringUtf32ToUtf8(out int actualLength, utf32Input);
+
+            Assert.Equal(expectedEncodingResult, actualEncodingResult);
+            Assert.Equal(expectedUtf8Length, actualLength);
+        }
+
+        private static void Utf8ToUtf16_Character_Test_Core(ReadOnlySpan<byte> utf8Input, CharacterEncodingResult expectedEncodingResult, ReadOnlySpan<char> expectedUtf16Transcoding)
+        {
+            Span<char> destination = stackalloc char[2];
+
+            CharacterEncodingResult actualEncodingResult = CharacterEncoding.ConvertCharacterUtf8ToUtf16Native(destination, utf8Input);
+
+            Assert.Equal(expectedEncodingResult, actualEncodingResult);
+            Assert.Equal(expectedUtf16Transcoding, destination.Slice(0, expectedUtf16Transcoding.Length));
+
+            for (int i = expectedUtf16Transcoding.Length; i < destination.Length; i++)
+            {
+                Assert.Equal(0, destination[i]);
+            }
+        }
+
+        private static void Utf16ToUtf8_Character_Test_Core(ReadOnlySpan<char> utf16Input, CharacterEncodingResult expectedEncodingResult, ReadOnlySpan<byte> expectedUtf8Transcoding)
+        {
+            Span<byte> destination = stackalloc byte[4];
+
+            CharacterEncodingResult actualEncodingResult = CharacterEncoding.ConvertCharacterUtf16NativeToUtf8(destination, utf16Input);
+
+            Assert.Equal(expectedEncodingResult, actualEncodingResult);
+            Assert.Equal(expectedUtf8Transcoding, destination.Slice(0, expectedUtf8Transcoding.Length));
+
+            for (int i = expectedUtf8Transcoding.Length; i < destination.Length; i++)
+            {
+                Assert.Equal(0, destination[i]);
+            }
+        }
+
+        private static void Utf8ToUtf32_Character_Test_Core(ReadOnlySpan<byte> utf8Input, CharacterEncodingResult expectedEncodingResult, uint expectedUtf32Transcoding)
+        {
+            CharacterEncodingResult actualEncodingResult = CharacterEncoding.ConvertCharacterUtf8ToUtf32(out uint destination, utf8Input);
+
+            Assert.Equal(expectedEncodingResult, actualEncodingResult);
+            Assert.Equal(expectedUtf32Transcoding, destination);
+        }
+
+        private static void Utf32ToUtf8_Character_Test_Core(uint utf32Input, CharacterEncodingResult expectedEncodingResult, ReadOnlySpan<byte> expectedUtf8Transcoding)
+        {
+            Span<byte> destination = stackalloc byte[4];
+
+            CharacterEncodingResult actualEncodingResult = CharacterEncoding.ConvertCharacterUtf32ToUtf8(destination, utf32Input);
+
+            Assert.Equal(expectedEncodingResult, actualEncodingResult);
+            Assert.Equal(expectedUtf8Transcoding, destination.Slice(0, expectedUtf8Transcoding.Length));
+
+            for (int i = expectedUtf8Transcoding.Length; i < destination.Length; i++)
+            {
+                Assert.Equal(0, destination[i]);
+            }
+        }
+    }
+}