  // LineBreaker.cs
  // RichTextKit
  // Copyright © 2019-2020 Topten Software. All Rights Reserved.
  //
  // Licensed under the Apache License, Version 2.0 (the "License"); you may
  // not use this product except in compliance with the License. You may obtain
  // a copy of the License at
  //
  // http://www.apache.org/licenses/LICENSE-2.0
  //
  // Unless required by applicable law or agreed to in writing, software
  // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
  // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
  // License for the specific language governing permissions and limitations
  // under the License.
  //
  // Ported from: https://github.com/foliojs/linebreak
  17. using System.Collections.Generic;
  18. using Topten.RichTextKit.Utils;
  19. namespace Topten.RichTextKit
  20. {
  21. /// <summary>
  22. /// Implementation of the Unicode Line Break Algorithm
  23. /// </summary>
  24. internal class LineBreaker
  25. {
  /// <summary>
  /// Static constructor. It has an empty body and performs no initialization.
  /// </summary>
  static LineBreaker() { }
  /// <summary>
  /// Resets this line breaker so that it scans a new string.
  /// </summary>
  /// <remarks>
  /// The string is converted with <c>Utf32Utils.ToUtf32</c> and the result is
  /// handed to the code point overload of <c>Reset</c>.
  /// </remarks>
  /// <param name="str">The string to be broken</param>
  public void Reset(string str)
  {
      var utf32 = Utf32Utils.ToUtf32(str);
      Reset(new Slice<int>(utf32));
  }
  /// <summary>
  /// Resets this line breaker so that it scans a new sequence of code points.
  /// </summary>
  /// <param name="codePoints">The code points of the string to be broken</param>
  public void Reset(Slice<int> codePoints)
  {
      _codePoints = codePoints;

      // Rewind the scan; the first code point is read lazily by NextBreak
      _pos = 0;
      _lastPos = 0;
      _first = true;

      // Clear the per-rule state carried between pairs
      _LB8a = false;
      _LB21a = false;
      _LB30a = 0;
  }
  // The code points currently being scanned (assigned by Reset)
  Slice<int> _codePoints;

  // True until NextBreak has read the first code point after a Reset
  bool _first = true;

  // Index of the next code point that nextCharClass will read
  int _pos;

  // Position of the most recent break candidate considered by NextBreak
  int _lastPos;

  // Left-hand class of the pair currently being tested
  LineBreakClass _curClass;

  // Class of the most recently read code point (right-hand side of the pair)
  LineBreakClass _nextClass;

  // Rule LB8a: set when the most recently read class was ZWJ
  bool _LB8a;

  // Rule LB21a: set when the left-hand class of the previous table pair was HL
  bool _LB21a;

  // Rule LB30a: running count of RI classes seen on the left-hand side
  int _LB30a;
  /// <summary>
  /// Collects line breaks into a list.
  /// </summary>
  /// <param name="mandatoryOnly">
  /// When true, only the breaks produced by <c>FindMandatoryBreaks</c> are returned;
  /// otherwise <c>NextBreak</c> is called repeatedly until it reports no more breaks.
  /// </param>
  /// <returns>A collection of line break positions</returns>
  public List<LineBreak> GetBreaks(bool mandatoryOnly = false)
  {
      if (mandatoryOnly)
          return new List<LineBreak>(FindMandatoryBreaks());

      var breaks = new List<LineBreak>();
      while (NextBreak(out var found))
      {
          breaks.Add(found);
      }
      return breaks;
  }
  // Collapse classes that get no treatment of their own here:
  // AI, SA, SG and XX become AL, CJ becomes NS, anything else is returned unchanged.
  LineBreakClass mapClass(LineBreakClass c)
  {
      if (c == LineBreakClass.CJ)
          return LineBreakClass.NS;

      bool treatAsAlphabetic =
          c == LineBreakClass.AI ||
          c == LineBreakClass.SA ||
          c == LineBreakClass.SG ||
          c == LineBreakClass.XX;

      return treatAsAlphabetic ? LineBreakClass.AL : c;
  }
  // Map the class used as the initial "current" class:
  // LF and NL become BK, SP becomes WJ, anything else is returned unchanged.
  LineBreakClass mapFirst(LineBreakClass c)
  {
      if (c == LineBreakClass.LF || c == LineBreakClass.NL)
          return LineBreakClass.BK;

      if (c == LineBreakClass.SP)
          return LineBreakClass.WJ;

      return c;
  }
  // Read the code point at the current position, advance the position,
  // and return its (mapped) line break class.
  LineBreakClass nextCharClass()
  {
      // Advance first, then index with the old position (same order as `_codePoints[_pos++]`)
      int index = _pos++;
      var rawClass = UnicodeClasses.LineBreakClass(_codePoints[index]);
      return mapClass(rawClass);
  }
  // Handle the next-classes that are not handled by the pair table.
  // Returns false (no break) for SP, BK, LF, NL and CR, updating _curClass for
  // the line terminators; returns null when the pair table must decide.
  bool? getSimpleBreak()
  {
      if (_nextClass == LineBreakClass.SP)
          return false;

      if (_nextClass == LineBreakClass.CR)
      {
          _curClass = LineBreakClass.CR;
          return false;
      }

      if (_nextClass == LineBreakClass.BK ||
          _nextClass == LineBreakClass.LF ||
          _nextClass == LineBreakClass.NL)
      {
          // LF and NL are folded into BK
          _curClass = LineBreakClass.BK;
          return false;
      }

      return null;
  }
  // Look up the (current, next) class pair in the pair table and then apply the
  // LB8a, LB21a and LB30a adjustments. `lastClass` is the class that was read
  // immediately before the next one; it is only compared against SP.
  // Returns true when a break is allowed before the next code point.
  bool getPairTableBreak(LineBreakClass lastClass)
  {
      bool afterSpace = lastClass == LineBreakClass.SP;
      bool result = false;

      switch (LineBreakPairTable.table[(int)_curClass][(int)_nextClass])
      {
          case LineBreakPairTable.DI_BRK:
              // Direct break
              result = true;
              break;

          case LineBreakPairTable.IN_BRK:
              // Possible indirect break: only when a space preceded
              result = afterSpace;
              break;

          case LineBreakPairTable.CI_BRK:
              // No space before: no break, and all state (including _curClass) is left untouched
              if (!afterSpace)
                  return false;
              result = true;
              break;

          case LineBreakPairTable.CP_BRK:
              // Prohibited for combining marks; as above, state is left untouched without a space
              if (!afterSpace)
                  return false;
              break;

          case LineBreakPairTable.PR_BRK:
              break;
      }

      // Rule LB8a
      if (_LB8a)
          result = false;

      // Rule LB21a: the flag ends up set exactly when the current class is HL
      // (HY and BA are never HL, so the suppressing case also clears it)
      if (_LB21a && (_curClass == LineBreakClass.HY || _curClass == LineBreakClass.BA))
          result = false;
      _LB21a = _curClass == LineBreakClass.HL;

      // Rule LB30a
      if (_curClass != LineBreakClass.RI)
      {
          _LB30a = 0;
      }
      else if (++_LB30a == 2 && _nextClass == LineBreakClass.RI)
      {
          result = true;
          _LB30a = 0;
      }

      _curClass = _nextClass;
      return result;
  }
  /// <summary>
  /// Finds the next line break.
  /// </summary>
  /// <remarks>
  /// Each reported break is built from three values: the break position with
  /// trailing line terminators and spaces trimmed (see
  /// <c>findPriorNonWhitespace</c>), the break position itself, and a flag that
  /// is true when the break follows an explicit newline. After the last pair has
  /// been examined one final break at the end of the code points is reported.
  /// An empty sequence produces no breaks.
  /// </remarks>
  /// <param name="lineBreak">
  /// Receives the break that was found. When the method returns false it is set
  /// to <c>new LineBreak(0, 0, false)</c>.
  /// </param>
  /// <returns>True if a break was found; false when there are no more breaks.</returns>
  public bool NextBreak(out LineBreak lineBreak)
  {
      // An empty sequence has no first code point to prime the state with.
      // Report "no more breaks" instead of indexing position 0 of nothing.
      if (_codePoints.Length == 0)
      {
          lineBreak = new LineBreak(0, 0, false);
          return false;
      }

      // get the first char if we're at the beginning of the string
      if (_first)
      {
          _first = false;
          var firstClass = nextCharClass();
          _curClass = mapFirst(firstClass);
          _nextClass = firstClass;
          _LB8a = (firstClass == LineBreakClass.ZWJ);
          _LB30a = 0;
      }

      while (_pos < _codePoints.Length)
      {
          _lastPos = _pos;
          var lastClass = _nextClass;
          _nextClass = nextCharClass();

          // explicit newline (a CR directly followed by LF is not split)
          if ((_curClass == LineBreakClass.BK) || ((_curClass == LineBreakClass.CR) && (_nextClass != LineBreakClass.LF)))
          {
              _curClass = mapFirst(mapClass(_nextClass));
              lineBreak = new LineBreak(findPriorNonWhitespace(_lastPos), _lastPos, true);
              return true;
          }

          // Classes the pair table does not cover are decided first;
          // null means the pair table has to be consulted.
          bool? shouldBreak = getSimpleBreak();
          if (!shouldBreak.HasValue)
          {
              shouldBreak = getPairTableBreak(lastClass);
          }

          // Rule LB8a
          _LB8a = (_nextClass == LineBreakClass.ZWJ);

          if (shouldBreak.Value)
          {
              lineBreak = new LineBreak(findPriorNonWhitespace(_lastPos), _lastPos, false);
              return true;
          }
      }

      // Input exhausted: report the end of the code points once, flagged as
      // required when the text ended with an explicit newline.
      if (_lastPos < _codePoints.Length)
      {
          _lastPos = _codePoints.Length;
          var required = (_curClass == LineBreakClass.BK) || ((_curClass == LineBreakClass.CR) && (_nextClass != LineBreakClass.LF));
          lineBreak = new LineBreak(findPriorNonWhitespace(_codePoints.Length), _lastPos, required);
          return true;
      }

      lineBreak = new LineBreak(0, 0, false);
      return false;
  }
  /// <summary>
  /// Enumerates only the mandatory line breaks, i.e. those caused by explicit
  /// line terminators (classes BK, CR, LF and NL).
  /// </summary>
  /// <remarks>
  /// For a terminator at index <c>i</c> the break is created as
  /// <c>new LineBreak(i, i + 1, true)</c>. A CR immediately followed by LF is
  /// reported once as <c>new LineBreak(i, i + 2, true)</c> and the LF is skipped,
  /// so the pair never produces a second break. NL is included so that this
  /// method agrees with <c>NextBreak</c>, which treats NL the same as BK.
  /// </remarks>
  /// <returns>A lazily evaluated sequence of mandatory breaks, in text order.</returns>
  public IEnumerable<LineBreak> FindMandatoryBreaks()
  {
      for (int i = 0; i < _codePoints.Length; i++)
      {
          switch (UnicodeClasses.LineBreakClass(_codePoints[i]))
          {
              case LineBreakClass.CR:
                  if (i + 1 < _codePoints.Length && UnicodeClasses.LineBreakClass(_codePoints[i + 1]) == LineBreakClass.LF)
                  {
                      yield return new LineBreak(i, i + 2, true);

                      // The LF belongs to this break; step over it so it is
                      // not reported again as a break of its own.
                      i++;
                  }
                  else
                  {
                      yield return new LineBreak(i, i + 1, true);
                  }
                  break;

              case LineBreakClass.BK:
              case LineBreakClass.LF:
              case LineBreakClass.NL:
                  yield return new LineBreak(i, i + 1, true);
                  break;
          }
      }
  }
  // Starting at `from`, step backwards over one trailing line terminator
  // (BK, LF or CR, with CR + LF counted as a single terminator) and then over
  // any run of SP code points. Returns the resulting position.
  int findPriorNonWhitespace(int from)
  {
      if (from > 0)
      {
          var terminator = UnicodeClasses.LineBreakClass(_codePoints[from - 1]);
          bool isLineFeed = terminator == LineBreakClass.LF;

          if (isLineFeed || terminator == LineBreakClass.BK || terminator == LineBreakClass.CR)
          {
              from--;

              // in case of cr + lf, remove both from measure
              if (isLineFeed && from > 0 &&
                  UnicodeClasses.LineBreakClass(_codePoints[from - 1]) == LineBreakClass.CR)
              {
                  from--;
              }
          }
      }

      // Trim trailing spaces
      for (; from > 0; from--)
      {
          if (UnicodeClasses.LineBreakClass(_codePoints[from - 1]) != LineBreakClass.SP)
              break;
      }

      return from;
  }
  294. }
  295. }