// LibMatrix: File sourced from https://github.com/dotnet/runtime/pull/87147/files under the MIT license.

// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.

using System.Text;
using System.Text.Encodings.Web;

namespace LibMatrix.Extensions;

/// <summary>
/// A <see cref="JavaScriptEncoder"/> that escapes only the double quote, the backslash and the
/// control characters U+0000..U+001F (see RFC 8259, section 7). Every other scalar value,
/// including non-ASCII and supplementary characters, is written through unescaped.
/// </summary>
internal sealed class UnicodeJsonEncoder : JavaScriptEncoder
{
    /// <summary>Shared instance using the default options (two-character escapes, lowercase hex).</summary>
    internal static readonly UnicodeJsonEncoder Singleton = new UnicodeJsonEncoder();

    private readonly bool _preferHexEscape;
    private readonly bool _preferUppercase;

    /// <summary>
    /// Creates an encoder that uses two-character escapes where one exists and lowercase hex digits otherwise.
    /// </summary>
    public UnicodeJsonEncoder()
        : this(preferHexEscape: false, preferUppercase: false)
    {
    }

    /// <summary>Creates an encoder with explicit escaping preferences.</summary>
    /// <param name="preferHexEscape">
    /// When <see langword="true"/>, always emit the six-character <c>\uXXXX</c> form, even for
    /// characters that have a two-character escape such as <c>\n</c>.
    /// </param>
    /// <param name="preferUppercase">
    /// When <see langword="true"/>, hex digits in <c>\uXXXX</c> escapes are written as <c>A-F</c> instead of <c>a-f</c>.
    /// </param>
    public UnicodeJsonEncoder(bool preferHexEscape, bool preferUppercase)
    {
        _preferHexEscape = preferHexEscape;
        _preferUppercase = preferUppercase;
    }

    /// <inheritdoc/>
    public override int MaxOutputCharactersPerInputCharacter => 6; // "\uXXXX" for a single char ("\uXXXX\uYYYY" [12 chars] for supplementary scalar value)

    /// <summary>Returns the index of the first character that must be escaped, or -1 if there is none.</summary>
    /// <param name="text">Pointer to the UTF-16 text to scan.</param>
    /// <param name="textLength">Number of characters available at <paramref name="text"/>.</param>
    public override unsafe int FindFirstCharacterToEncode(char* text, int textLength)
    {
        for (int index = 0; index < textLength; ++index)
        {
            if (NeedsEncoding(text[index]))
            {
                return index;
            }
        }

        return -1;
    }

    /// <summary>
    /// Writes <paramref name="unicodeScalar"/> to <paramref name="buffer"/>, escaped if
    /// <see cref="WillEncode(int)"/> says so and verbatim (as UTF-16) otherwise.
    /// </summary>
    /// <param name="unicodeScalar">The scalar value to write.</param>
    /// <param name="buffer">Destination buffer.</param>
    /// <param name="bufferLength">Number of characters available at <paramref name="buffer"/>.</param>
    /// <param name="numberOfCharactersWritten">Number of characters written; 0 when the buffer is too small.</param>
    /// <returns><see langword="false"/> if <paramref name="buffer"/> is too small; otherwise <see langword="true"/>.</returns>
    public override unsafe bool TryEncodeUnicodeScalar(int unicodeScalar, char* buffer, int bufferLength, out int numberOfCharactersWritten)
    {
        if (!WillEncode(unicodeScalar))
        {
            // Pass-through: emit the scalar as one or two UTF-16 code units.
            Span<char> destination = new Span<char>(buffer, bufferLength);
            return new Rune(unicodeScalar).TryEncodeToUtf16(destination, out numberOfCharactersWritten);
        }

        // WillEncode only returns true for values that fit in a single char, so the casts below are lossless.
        char value = (char)unicodeScalar;

        if (!_preferHexEscape && HasTwoCharacterEscape(value))
        {
            if (bufferLength < 2)
            {
                numberOfCharactersWritten = 0;
                return false;
            }

            buffer[0] = '\\';
            buffer[1] = GetTwoCharacterEscapeSuffix(value);
            numberOfCharactersWritten = 2;
            return true;
        }

        if (bufferLength < 6)
        {
            numberOfCharactersWritten = 0;
            return false;
        }

        // Emit all four nibbles rather than assuming the high byte is zero, so the escape
        // stays correct even if the set of encoded characters is ever widened.
        buffer[0] = '\\';
        buffer[1] = 'u';
        buffer[2] = ToHexDigit((unicodeScalar >> 12) & 0xf, _preferUppercase);
        buffer[3] = ToHexDigit((unicodeScalar >> 8) & 0xf, _preferUppercase);
        buffer[4] = ToHexDigit((unicodeScalar >> 4) & 0xf, _preferUppercase);
        buffer[5] = ToHexDigit(unicodeScalar & 0xf, _preferUppercase);
        numberOfCharactersWritten = 6;
        return true;
    }

    /// <summary>Reports whether <paramref name="unicodeScalar"/> is escaped by this encoder.</summary>
    /// <returns>
    /// <see langword="false"/> for anything above <see cref="char.MaxValue"/>; otherwise the
    /// result of the RFC 8259 check on the single character.
    /// </returns>
    public override bool WillEncode(int unicodeScalar)
    {
        if (unicodeScalar > char.MaxValue)
        {
            return false;
        }

        return NeedsEncoding((char)unicodeScalar);
    }

    // https://datatracker.ietf.org/doc/html/rfc8259#section-7
    // Only the quotation mark, the reverse solidus and U+0000..U+001F are escaped.
    private static bool NeedsEncoding(char value)
    {
        return value is '"' or '\\' or <= '\u001f';
    }

    // RFC 8259, Section 7, "char = " BNF: true when the character has a short "\x" escape.
    private static bool HasTwoCharacterEscape(char value)
    {
        return value is '"' or '\\' or '/' or '\b' or '\f' or '\n' or '\r' or '\t';
    }

    // RFC 8259, Section 7, "char = " BNF: the character that follows the backslash in the short escape.
    // Throws ArgumentOutOfRangeException for characters that have no short escape.
    private static char GetTwoCharacterEscapeSuffix(char value)
    {
        return value switch
        {
            '"' => '"',
            '\\' => '\\',
            '/' => '/',
            '\b' => 'b',
            '\f' => 'f',
            '\n' => 'n',
            '\r' => 'r',
            '\t' => 't',
            _ => throw new ArgumentOutOfRangeException(nameof(value)),
        };
    }

    // Maps 0..15 to its hex digit; the unsigned compare rejects negative values as well as values above 0xf.
    private static char ToHexDigit(int value, bool uppercase)
    {
        if ((uint)value > 0xf)
        {
            throw new ArgumentOutOfRangeException(nameof(value));
        }

        if (value < 10)
        {
            return (char)(value + '0');
        }

        return (char)(value - 0xa + (uppercase ? 'A' : 'a'));
    }
}