RTF_Parser.cs 22 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613
  1. using System;
  2. using System.Collections.Generic;
  3. using System.Data;
  4. using System.Diagnostics;
  5. using System.Text;
  6. namespace FastReport.RichTextParser
  7. {
  8. enum ParserStatus
  9. {
  10. Collecting,
  11. Text,
  12. ControlTag,
  13. OpenBlock,
  14. CloseBlock
  15. }
  16. /// <summary>
  17. /// This class detects RTF control sequences and text.
  18. /// </summary>
  19. class RTF_Parser
  20. {
  21. enum ParserState
  22. {
  23. Neutral,
  24. Control,
  25. Number,
  26. FirstNibble,
  27. SecondNibble,
  28. SkipToNext,
  29. RichFormatExtensions,
  30. CheckHyphen,
  31. IgnoreSpaces
  32. }
  33. ParserStatus status;
  34. ParserState parser_state;
  35. StringBuilder control;
  36. StringBuilder number;
  37. StringBuilder text;
  38. bool has_value;
  39. bool list_is_active;
  40. string control_tag;
  41. string previous_tag;
  42. long control_num;
  43. string parsed_text;
  44. char delimiter;
  45. bool has_has_value;
  46. int indirection_counter;
  47. int skip_counter;
  48. int extensions_skip_counter;
  49. // 20200605
  50. internal bool override_default_color;
  51. // 20180511
  52. internal ParagraphFormat current_paragraph_format;
  53. internal RunFormat current_run_format;
  54. internal bool insideTable;
  55. #region Uniocode converters
  56. internal int current_lang;
  57. internal long font_charset;
  58. int default_lang;
  59. const int Lang_EN_US = 1033;
  60. Decoder current_unicode_decoder;
  61. Dictionary<long, Decoder> unicode_decoders = new Dictionary<long, Decoder>();
  62. Stack<int> lang_ids = new Stack<int>();
  63. Dictionary<long, long> translate_charset = new Dictionary<long, long>();
  64. List<byte> raw_chars = new List<byte>();
  65. #endregion
  66. public string Text { get { return parsed_text; } }
  67. public string Control { get { return control_tag; } }
  68. public string PreviousTag { get { return previous_tag; } }
  69. public long Number { get { return control_num; } }
  70. public char Delimiter { get { return status == ParserStatus.ControlTag ? delimiter : '\0'; } }
  71. public ParserStatus Status { get { return status; } }
  72. public bool HasValue { get { return has_has_value; } }
  73. public bool ListItem { get { return list_is_active; } set { list_is_active = value; } }
  74. public bool EndOfFile
  75. {
  76. get
  77. {
  78. if (text.Length == 0)
  79. return false;
  80. parsed_text = text.ToString();
  81. return true;
  82. }
  83. }
  84. public void ClearParsedText() { parsed_text = String.Empty; }
  85. #region Language selector and translator
  86. private void SelectUnicodeDecoder(int lcid)
  87. {
  88. if (lcid == 0)
  89. lcid = default_lang == 0 ? Lang_EN_US : default_lang;
  90. if (!unicode_decoders.ContainsKey(lcid))
  91. {
  92. System.Globalization.CultureInfo ci;
  93. try
  94. {
  95. ci = System.Globalization.CultureInfo.GetCultureInfo(lcid);
  96. }
  97. catch (Exception)
  98. {
  99. ci = System.Globalization.CultureInfo.CurrentCulture;
  100. }
  101. Encoding encoder = Encoding.GetEncoding(ci.TextInfo.ANSICodePage);
  102. current_unicode_decoder = encoder.GetDecoder();
  103. unicode_decoders.Add(lcid, current_unicode_decoder);
  104. }
  105. else
  106. {
  107. current_unicode_decoder = unicode_decoders[lcid];
  108. }
  109. current_lang = lcid;
  110. }
  111. int TranslateCharset(long charset)
  112. {
  113. switch (charset)
  114. {
  115. case 0: return 1033; // ANSI
  116. case 1: return default_lang; // default
  117. case 2: return 42; // Symbol - to fix
  118. case 77: return 10000; // Mac romant - to fix
  119. case 78: return 10001; // Mac Shift Jis - to fix
  120. case 79: return 10003; // Mac Hangul - to fix
  121. case 80: return 10008; // Mac GB2312 - to fix
  122. case 81: return 10002; // Mac Big5 - to fix
  123. //case 82: return 10002; // Johab old
  124. case 83: return 10005; // Mac Hebrew - to fix
  125. case 84: return 10004; // Mac Arabic - to fix
  126. case 85: return 10006; // Mac Greek - to fix
  127. case 86: return 10081; // Mac Turkish - to fix
  128. case 87: return 10021; // Mac Thai - to fix
  129. case 88: return 10029; // Mac East Europe - to fix
  130. case 89: return 10007; // Mac Russian - to fix
  131. case 128: return 1041; // 932; Shift JIS
  132. case 129: return 1042; // 949; Korean Hangul
  133. case 130: return default_lang; // 1361; Korean Johab
  134. case 134: return 2052; // 936; GB2312
  135. case 136: return 1028; // 950; BIG5
  136. case 161: return 1032; // 1253; Greel
  137. case 162: return 1055; // 1254; Turkish
  138. case 163: return 1066; // 1258; Vietnamese
  139. case 177: return 1037; // 1255; Hebrew
  140. case 178: return 1056; // 1256; Arabic
  141. //case 179: return 0; // Arabic Traditional (old)
  142. //case 180: return 0; // Arabic user (old)
  143. //case 181: return 0; // Hebrew user (old)
  144. case 186: return 1062; // 1257; Baltic
  145. case 204: return 1049; // 1251; Russian
  146. case 222: return 1054; // 874; Thai
  147. case 238: return 1045; // 1250; East Europe (Polland selected)
  148. case 254: return current_lang; // 437; // PC437
  149. case 255: return current_lang; // 850; // OEM
  150. }
  151. return current_lang;
  152. }
  153. internal void SelectCodepageByFontCharset(long charset)
  154. {
  155. int lcid = TranslateCharset(charset);
  156. SelectUnicodeDecoder(lcid);
  157. }
  158. private void PushLocaleDecoder()
  159. {
  160. lang_ids.Push(current_lang);
  161. }
  162. private void PopLocaleDecoder()
  163. {
  164. if (lang_ids.Count != 0)
  165. {
  166. current_lang = lang_ids.Pop();
  167. SelectUnicodeDecoder(current_lang);
  168. }
  169. else
  170. {
  171. #if DEBUG
  172. Debug.WriteLine("Rich document error: broken source document structure. Ignore error and continue");
  173. #else
  174. throw new SyntaxErrorException("Rich document structure error - broken source document");
  175. #endif
  176. }
  177. }
  178. private void CollectCharacters()
  179. {
  180. try
  181. {
  182. byte hex = byte.Parse(number.ToString(), System.Globalization.NumberStyles.HexNumber);
  183. number = new StringBuilder();
  184. raw_chars.Add(hex);
  185. }
  186. catch (Exception e)
  187. {
  188. ;
  189. }
  190. }
  191. //private void TranslateUnicode(System.Globalization.NumberStyles num_style)
  192. //{
  193. // uint unichar = uint.Parse(number.ToString(), num_style);
  194. // number.Length = 0;
  195. // byte[] conv = new byte[2];
  196. // char[] chars = new char[2];
  197. // conv[0] = (byte)unichar;
  198. // conv[1] = 0;
  199. // if (current_lang == 0)
  200. // SelectUnicodeDecoder(default_lang == 0 ? 1033 : default_lang);
  201. // current_unicode_decoder.GetChars(conv, 0, 1, chars, 0);
  202. // text.Append(chars[0]);
  203. //}
  204. #endregion
  205. private void ControlWord()
  206. {
  207. previous_tag = control_tag;
  208. control_tag = control.ToString();
  209. control_num = number.Length != 0 ? long.Parse(number.ToString()) : 0;
  210. if (control_tag == "lang")
  211. {
  212. if (font_charset == 0)
  213. SelectUnicodeDecoder((int)control_num);
  214. }
  215. if (control_tag == "deflang")
  216. {
  217. SelectUnicodeDecoder((int)control_num);
  218. default_lang = current_lang;
  219. }
  220. RestoreEncodedText();
  221. parsed_text = text.ToString();
  222. control.Length = 0;
  223. number.Length = 0;
  224. text.Length = 0;
  225. has_has_value = has_value;
  226. has_value = false;
  227. }
  228. private void RestoreEncodedText()
  229. {
  230. if (raw_chars.Count != 0)
  231. {
  232. byte[] arr = raw_chars.ToArray();
  233. char[] result = new char[raw_chars.Count];
  234. int count = current_unicode_decoder.GetChars(arr, 0, raw_chars.Count, result, 0);
  235. char[] str = new char[count];
  236. Array.Copy(result, str, count);
  237. string text = new string(str);
  238. this.text.Append(text);
  239. raw_chars.Clear();
  240. }
  241. }
  242. private void AppendCharacter(char ch)
  243. {
  244. RestoreEncodedText();
  245. this.text.Append(ch);
  246. }
  247. internal ParserStatus ParseByte(char ch)
  248. {
  249. //Console.Write(ch);
  250. status = ParserStatus.Collecting;
  251. delimiter = ch;
  252. if (ch == '{')
  253. {
  254. indirection_counter++;
  255. PushLocaleDecoder();
  256. }
  257. if (ch == '}')
  258. {
  259. PopLocaleDecoder();
  260. indirection_counter--;
  261. }
  262. bool loop;
  263. do
  264. {
  265. loop = false;
  266. switch (parser_state)
  267. {
  268. case ParserState.Neutral:
  269. switch (ch)
  270. {
  271. case '{':
  272. ControlWord();
  273. status = ParserStatus.OpenBlock;
  274. break;
  275. case '}':
  276. ControlWord();
  277. status = ParserStatus.CloseBlock;
  278. break;
  279. case '\\':
  280. parser_state = ParserState.Control;
  281. break;
  282. default:
  283. switch (ch)
  284. {
  285. case '\r':
  286. case '\n':
  287. case '\t':
  288. case '\0':
  289. break;
  290. default:
  291. AppendCharacter(ch);
  292. break;
  293. }
  294. break;
  295. }
  296. break;
  297. case ParserState.CheckHyphen:
  298. if (char.IsDigit(ch))
  299. {
  300. number.Append('-');
  301. number.Append(ch);
  302. parser_state = ParserState.Number;
  303. has_value = true;
  304. break;
  305. }
  306. // Substitute Optional HYPHEN with ZERO WIDTH SPACE
  307. AppendCharacter((char)8203);
  308. parser_state = ParserState.Neutral;
  309. status = ParseByte(ch);
  310. break;
  311. case ParserState.Control:
  312. if (char.IsLetter(ch))
  313. {
  314. control.Append(ch);
  315. }
  316. else if (ch == '-')
  317. {
  318. parser_state = ParserState.CheckHyphen;
  319. }
  320. else if (char.IsDigit(ch))
  321. {
  322. number.Append(ch);
  323. parser_state = ParserState.Number;
  324. has_value = true;
  325. }
  326. else if (ch == '\\')
  327. {
  328. if (control.Length > 0)
  329. {
  330. ControlWord();
  331. status = ParserStatus.ControlTag;
  332. }
  333. else
  334. {
  335. AppendCharacter(ch);
  336. parser_state = ParserState.Neutral;
  337. }
  338. }
  339. else if (ch == '{')
  340. {
  341. if (control.Length > 0)
  342. {
  343. ControlWord();
  344. status = ParserStatus.OpenBlock;
  345. }
  346. else
  347. {
  348. AppendCharacter(ch);
  349. status = ParserStatus.Collecting;
  350. }
  351. parser_state = ParserState.Neutral;
  352. }
  353. else if (ch == '}')
  354. {
  355. if (control.Length > 0)
  356. {
  357. ControlWord();
  358. status = ParserStatus.CloseBlock;
  359. }
  360. else
  361. {
  362. AppendCharacter(ch);
  363. status = ParserStatus.Collecting;
  364. }
  365. parser_state = ParserState.Neutral;
  366. }
  367. else if (char.IsWhiteSpace(ch))
  368. {
  369. parser_state = ParserState.IgnoreSpaces;
  370. ControlWord();
  371. status = ParserStatus.ControlTag;
  372. }
  373. else if (ch == '*')
  374. {
  375. #if false // Preivous version which ignore pictures in \* control (20210211)
  376. parser_state = ParserState.RichFormatExtensions;
  377. if (indirection_counter == 0)
  378. throw new Exception("Broken RTF format");
  379. extensions_skip_counter = indirection_counter - 1;
  380. #else
  381. parser_state = ParserState.Neutral;
  382. ControlWord();
  383. status = ParserStatus.ControlTag;
  384. #endif
  385. }
  386. else if (ch == ';')
  387. {
  388. parser_state = ParserState.Neutral;
  389. ControlWord();
  390. status = ParserStatus.ControlTag;
  391. }
  392. else if (ch == '\'')
  393. {
  394. parser_state = ParserState.FirstNibble;
  395. }
  396. else if (ch == '~')
  397. {
  398. // Non-breaking space
  399. AppendCharacter((char)0x00a0);
  400. parser_state = ParserState.Neutral;
  401. }
  402. else if (ch == '_')
  403. {
  404. // Non-breaking hyphen
  405. AppendCharacter((char)0x2011);
  406. parser_state = ParserState.Neutral;
  407. }
  408. else if (ch == '.')
  409. {
  410. AppendCharacter('.');
  411. parser_state = ParserState.Neutral;
  412. }
  413. else if (ch == ')')
  414. {
  415. AppendCharacter(')');
  416. parser_state = ParserState.Neutral;
  417. }
  418. else
  419. throw new Exception("RTF format not parsed");
  420. break;
  421. case ParserState.Number:
  422. if (char.IsDigit(ch))
  423. number.Append(ch);
  424. else
  425. {
  426. if (ch == '{')
  427. {
  428. parser_state = ParserState.Neutral;
  429. status = ParserStatus.OpenBlock;
  430. }
  431. else if (ch == '}')
  432. {
  433. parser_state = ParserState.Neutral;
  434. status = ParserStatus.CloseBlock;
  435. }
  436. else
  437. {
  438. if (this.control.ToString() == "u")
  439. {
  440. int bukva = int.Parse(number.ToString());
  441. AppendCharacter((char)bukva);
  442. number.Length = 0;
  443. if (ch != '?')
  444. {
  445. parser_state = ParserState.SkipToNext;
  446. skip_counter = 3;
  447. }
  448. else
  449. {
  450. parser_state = ParserState.Neutral;
  451. control.Length = 0;
  452. }
  453. break;
  454. }
  455. else if (ch == '\\')
  456. parser_state = ParserState.Control;
  457. else if (ch == ';' || char.IsWhiteSpace(ch))
  458. parser_state = ParserState.Neutral;
  459. status = ParserStatus.ControlTag;
  460. }
  461. ControlWord();
  462. }
  463. break;
  464. case ParserState.FirstNibble:
  465. parser_state = ParserState.SecondNibble;
  466. number.Append(ch);
  467. break;
  468. case ParserState.SecondNibble:
  469. number.Append(ch);
  470. CollectCharacters();
  471. //TranslateUnicode(System.Globalization.NumberStyles.HexNumber);
  472. parser_state = ParserState.Neutral;
  473. break;
  474. case ParserState.SkipToNext: // Ignore hexdecmal representation of the character
  475. skip_counter--;
  476. if (skip_counter == 0)
  477. {
  478. parser_state = ParserState.Neutral;
  479. control.Length = 0;
  480. }
  481. break;
  482. case ParserState.RichFormatExtensions:
  483. status = ParseExtensionByte(ch);
  484. break;
  485. case ParserState.IgnoreSpaces:
  486. if (ch == ' ')
  487. {
  488. // Debug.WriteLine("Skip space");
  489. }
  490. else
  491. {
  492. parser_state = ParserState.Neutral;
  493. loop = true;
  494. }
  495. break;
  496. }
  497. } while (loop);
  498. return status;
  499. }
  500. #if false // Debug
  501. StringBuilder dbg = new StringBuilder();
  502. private ParserStatus ParseExtensionByte(char ch)
  503. {
  504. if (extensions_skip_counter == indirection_counter)
  505. {
  506. dbg.Append("\n\n");
  507. parser_state = ParserState.Neutral;
  508. dbg.Clear();
  509. }
  510. else
  511. dbg.Append(ch);
  512. return ch == '{' ? ParserStatus.OpenBlock : ch == '}' ? ParserStatus.CloseBlock : ParserStatus.Collecting;
  513. }
  514. #else
  515. private ParserStatus ParseExtensionByte(char ch)
  516. {
  517. if (extensions_skip_counter == indirection_counter)
  518. {
  519. parser_state = ParserState.Neutral;
  520. }
  521. return ch == '{' ? ParserStatus.OpenBlock : ch == '}' ? ParserStatus.CloseBlock : ParserStatus.Collecting;
  522. }
  523. #endif
  524. internal void ResetRunFormat()
  525. {
  526. current_run_format.bold = false;
  527. current_run_format.italic = false;
  528. current_run_format.underline = false;
  529. current_run_format.font_size = 24;
  530. current_run_format.color = System.Drawing.Color.Black;
  531. current_run_format.BColor = System.Drawing.Color.White;
  532. current_run_format.FillColor = System.Drawing.Color.White;
  533. current_run_format.font_idx = 0;
  534. current_run_format.script_type = RunFormat.ScriptType.PlainText;
  535. }
  536. public void ResetParagraphFormat()
  537. {
  538. current_paragraph_format.align = ParagraphFormat.HorizontalAlign.Left;
  539. current_paragraph_format.line_spacing = 0;
  540. current_paragraph_format.space_before = 0;
  541. current_paragraph_format.space_after = 0;
  542. current_paragraph_format.left_indent = 0;
  543. current_paragraph_format.right_indent = 0;
  544. current_paragraph_format.first_line_indent = 0;
  545. current_paragraph_format.lnspcmult = ParagraphFormat.LnSpcMult.Exactly;
  546. current_paragraph_format.pnstart = 0;
  547. current_lang = default_lang;
  548. current_paragraph_format.list_id = null;
  549. current_paragraph_format.tab_positions = null;
  550. }
  551. internal RTF_Parser()
  552. {
  553. parser_state = ParserState.Neutral;
  554. control = new StringBuilder();
  555. number = new StringBuilder(12, 12);
  556. text = new StringBuilder();
  557. has_value = false;
  558. override_default_color = false;
  559. current_lang = 0;
  560. indirection_counter = 0;
  561. ResetRunFormat();
  562. ResetParagraphFormat();
  563. }
  564. static RTF_Parser()
  565. {
  566. #if CROSSPLATFORM || COREWIN
  567. Encoding.RegisterProvider(CodePagesEncodingProvider.Instance);
  568. #endif
  569. }
  570. }
  571. }