// RichTextKit
// Copyright © 2019-2020 Topten Software. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may
// not use this product except in compliance with the License. You may obtain
// a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations
// under the License.

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using Topten.RichTextKit.Utils;

namespace Topten.RichTextKit.Utils
{
    /// <summary>
    /// Represents a buffer of UTF-32 encoded code point data
    /// </summary>
    /// <remarks>
    /// Every mutating member invalidates a lazily built list of the positions
    /// of code points at or above U+10000. That list backs the conversions
    /// between UTF-16 character offsets and UTF-32 code point offsets.
    /// </remarks>
    public class Utf32Buffer : Buffer<int>
    {
        /// <summary>
        /// Code point written in place of a high surrogate that is not
        /// followed by a low surrogate (U+FFFD REPLACEMENT CHARACTER).
        /// </summary>
        const int ReplacementCodePoint = 0xFFFD;

        /// <summary>
        /// Constructs a new Utf32Buffer
        /// </summary>
        public Utf32Buffer()
        {
        }

        /// <summary>
        /// Constructs a Utf32 buffer with an initial string
        /// </summary>
        /// <param name="str">The string to initialize with</param>
        public Utf32Buffer(string str)
        {
            Add(str);
        }

        /// <summary>
        /// Clears this buffer.
        /// </summary>
        public new void Clear()
        {
            _surrogatePositionsValid = false;
            base.Clear();
        }

        /// <summary>
        /// Appends utf32 data to this buffer
        /// </summary>
        /// <param name="data">The UTF32 data to be appended</param>
        /// <returns>A slice representing the added UTF-32 data.</returns>
        public new Slice<int> Add(Slice<int> data)
        {
            _surrogatePositionsValid = false;
            return base.Add(data);
        }

        /// <summary>
        /// Appends text to this buffer, converting from UTF-16 to UTF-32
        /// </summary>
        /// <param name="str">The string of text to be appended</param>
        /// <returns>A slice representing the added UTF-32 data.</returns>
        public Slice<int> Add(string str)
        {
            return Insert(Length, str);
        }

        /// <summary>
        /// Appends text to this buffer, converting from UTF-16 to UTF-32
        /// </summary>
        /// <param name="str">The span of UTF-16 text to be appended</param>
        /// <returns>A slice representing the added UTF-32 data.</returns>
        public Slice<int> Add(ReadOnlySpan<char> str)
        {
            return Insert(Length, str);
        }

        /// <summary>
        /// Inserts utf32 data into this buffer
        /// </summary>
        /// <param name="position">Position to insert the data</param>
        /// <param name="data">The UTF32 data to be inserted</param>
        /// <returns>A slice representing the added UTF-32 data.</returns>
        public new Slice<int> Insert(int position, Slice<int> data)
        {
            _surrogatePositionsValid = false;
            return base.Insert(position, data);
        }

        /// <summary>
        /// Inserts text to this buffer, converting from UTF-16 to UTF-32
        /// </summary>
        /// <param name="position">The position to insert the string</param>
        /// <param name="str">The string of text to be inserted</param>
        /// <returns>A slice representing the added UTF-32 data.</returns>
        public Slice<int> Insert(int position, string str)
        {
            return Insert(position, str.AsSpan());
        }

        /// <summary>
        /// Inserts text to this buffer, converting from UTF-16 to UTF-32
        /// </summary>
        /// <remarks>
        /// A well formed surrogate pair becomes one code point. A high
        /// surrogate that is not followed by a low surrogate becomes U+FFFD
        /// and the character after it is decoded normally. A low surrogate
        /// with no preceding high surrogate is mapped to
        /// <c>0x10000 + (ch - 0xDC00)</c>.
        /// </remarks>
        /// <param name="position">The position to insert the string</param>
        /// <param name="str">The span of UTF-16 text to be inserted</param>
        /// <returns>A slice representing the added UTF-32 data.</returns>
        public Slice<int> Insert(int position, ReadOnlySpan<char> str)
        {
            // Invalidate surrogate positions
            _surrogatePositionsValid = false;

            // For performance reasons and to save copying to intermediate arrays if we
            // used Encoding.UTF32, we do our own UTF-16 to UTF-32 decoding directly into
            // our internal code point buffer.
            // Also use pointers for performance reasons too (maybe).

            // Make room for the worst case of one code point per UTF-16 code unit.
            // Any excess is removed again below.
            Slice<int> codePointBuffer = base.Insert(position, str.Length);

            int convertedLength;
            unsafe
            {
                fixed (int* pDestBuf = codePointBuffer.Underlying)
                fixed (char* pSrcBuf = str)
                {
                    int* pDestStart = pDestBuf + codePointBuffer.Start;
                    int* pDest = pDestStart;
                    char* pSrc = pSrcBuf;
                    char* pSrcEnd = pSrcBuf + str.Length;

                    while (pSrc < pSrcEnd)
                    {
                        char ch = *pSrc++;

                        if (ch < 0xD800 || ch > 0xDFFF)
                        {
                            // Not a surrogate, the code unit is the code point
                            *pDest++ = ch;
                        }
                        else if (ch <= 0xDBFF)
                        {
                            // High surrogate: only consume the next code unit if it
                            // really is the matching low surrogate.
                            if (pSrc < pSrcEnd && *pSrc >= 0xDC00 && *pSrc <= 0xDFFF)
                            {
                                char chL = *pSrc++;

                                // The 0x10000 offset must be added, not OR'ed. The shifted
                                // high part can itself have bit 16 set (e.g. U+20000), in
                                // which case an OR would lose the carry.
                                *pDest++ = 0x10000 + (((ch - 0xD800) << 10) | (chL - 0xDC00));
                            }
                            else
                            {
                                // Unpaired high surrogate (end of text, or followed by
                                // something other than a low surrogate).
                                *pDest++ = ReplacementCodePoint;
                            }
                        }
                        else
                        {
                            // Single low surrogate?
                            // NOTE(review): this yields a code point >= 0x10000 from one
                            // UTF-16 code unit, so the offset conversions below will count
                            // it as a pair. Left unchanged - confirm the intended handling.
                            *pDest++ = 0x10000 + ch - 0xDC00;
                        }
                    }

                    // Work out the converted length
                    convertedLength = (int)(pDest - pDestStart);
                }
            }

            // If converted length was shorter due to surrogates, then remove
            // the extra space that was allocated
            if (convertedLength < str.Length)
            {
                base.Delete(position + convertedLength, str.Length - convertedLength);
            }

            // Return the encapsulating slice
            return SubSlice(position, convertedLength);
        }

        /// <summary>
        /// Delete a section of the buffer
        /// </summary>
        /// <param name="from">The position to delete from</param>
        /// <param name="length">The length of the deletion</param>
        public new void Delete(int from, int length)
        {
            _surrogatePositionsValid = false;
            base.Delete(from, length);
        }

        /// <summary>
        /// Converts an offset into this buffer to a UTF-16 offset in the originally
        /// added string.
        /// </summary>
        /// <remarks>
        /// This function assumes the text was added to the buffer as UTF-16
        /// and hasn't been modified in any way since.
        /// </remarks>
        /// <param name="utf32Offset">The UTF-32 offset to convert</param>
        /// <returns>The converted UTF-16 character offset</returns>
        public int Utf32OffsetToUtf16Offset(int utf32Offset)
        {
            // Make sure surrogate positions are valid
            BuildSurrogatePositions();

            // How many surrogate pairs were there before this utf32 offset?
            // A negative result is the bitwise complement of the insertion index,
            // which is also the number of entries less than the offset.
            int pos = _surrogatePositions.BinarySearch(utf32Offset);
            if (pos < 0)
            {
                pos = ~pos;
            }

            // Each preceding pair occupies one extra UTF-16 code unit
            return utf32Offset + pos;
        }

        /// <summary>
        /// Converts an offset in the original UTF-16 string to a code point index
        /// into this UTF-32 buffer.
        /// </summary>
        /// <param name="utf16Offset">The utf-16 character index</param>
        /// <returns>The utf-32 code point index</returns>
        public int Utf16OffsetToUtf32Offset(int utf16Offset)
        {
            // Make sure surrogate positions are valid
            BuildSurrogatePositions();

            var pos = utf16Offset;
            for (int i = 0; i < _surrogatePositions.Count; i++)
            {
                var sp = _surrogatePositions[i];

                // A pair before the running position accounts for one extra
                // UTF-16 code unit, so step back by one
                if (sp < pos)
                {
                    pos--;
                }

                // Positions are ascending, nothing further can affect the result
                if (sp > pos)
                {
                    return pos;
                }
            }

            return pos;
        }

        /// <summary>
        /// Gets the entire buffer's content as a string.
        /// </summary>
        /// <returns>The buffer content converted by <c>Utf32Utils.FromUtf32</c></returns>
        public override string ToString()
        {
            return Utf32Utils.FromUtf32(AsSlice());
        }

        /// <summary>
        /// Gets a part of the buffer as a string.
        /// </summary>
        /// <param name="start">The UTF-32 code point index of the first character to retrieve</param>
        /// <param name="length">The number of code points in the string to be retrieved</param>
        /// <returns>A string equivalent to the specified code point range.</returns>
        public string GetString(int start, int length)
        {
            return Utf32Utils.FromUtf32(SubSlice(start, length));
        }

        /// <summary>
        /// Indices of all code points in the buffer that require a surrogate
        /// pair when converted to UTF-16 (in ascending order).
        /// </summary>
        List<int> _surrogatePositions = new List<int>();

        /// <summary>
        /// True while <see cref="_surrogatePositions"/> reflects the current
        /// buffer content; reset by every mutating member of this class.
        /// </summary>
        bool _surrogatePositionsValid = false;

        /// <summary>
        /// Builds the list of indices of all characters that require surrogates
        /// when converted to utf16.
        /// </summary>
        void BuildSurrogatePositions()
        {
            // Nothing to do if the cached list is still current
            if (_surrogatePositionsValid)
            {
                return;
            }

            _surrogatePositionsValid = true;
            _surrogatePositions.Clear();

            unsafe
            {
                fixed (int* pBuf = this.Underlying)
                {
                    int* pEnd = pBuf + this.Length;
                    int* p = pBuf;
                    while (p < pEnd)
                    {
                        // Anything at or above U+10000 needs two UTF-16 code units
                        if (p[0] >= 0x10000)
                        {
                            _surrogatePositions.Add((int)(p - pBuf));
                        }
                        p++;
                    }
                }
            }
        }
    }
}