using System;
using System.Collections.Generic;
using System.Data;
using System.Diagnostics;
using System.Diagnostics.Contracts;
using System.Globalization;
using System.Text;
using static FastReport.Fonts.GlyphSubstitutionClass;

namespace FastReport.RichTextParser
{
    /// <summary>
    /// Result reported by <see cref="RTF_Parser.ParseByte(char)"/> after each consumed character.
    /// </summary>
    enum ParserStatus
    {
        Collecting,
        Text,
        ControlTag,
        OpenBlock,
        CloseBlock
    }

    /// <summary>
    /// This class detects RTF control sequences and text.
    /// </summary>
    /// <remarks>
    /// The parser is a character-driven state machine: feed the document one character at a time
    /// to <see cref="ParseByte(char)"/> and inspect <see cref="Status"/>, <see cref="Control"/>,
    /// <see cref="Number"/> and <see cref="Text"/> whenever the returned status is not
    /// <see cref="ParserStatus.Collecting"/>.
    /// </remarks>
    class RTF_Parser
    {
        enum ParserState
        {
            Neutral,
            Control,
            Number,
            FirstNibble,
            SecondNibble,
            TryCollectChars,
            WaitHexcharPrefix,
            ParsePrefix_0x81_phase0,
            ParsePrefix_0x81_phase1,
            ParsePrefix_0x81_phase2,
            ParsePrefix_0x81_phase3,
            SkipToNext,
            RichFormatExtensions,
            CheckHyphen,
            IgnoreSpaces,
            ParserError
        }

        ParserStatus status;
        ParserState parser_state;
        StringBuilder control;      // letters of the control word being collected
        StringBuilder number;       // numeric argument (or the two hex digits of a \'hh escape)
        StringBuilder text;         // plain text collected since the last control word
        bool has_value;
        bool list_is_active;
        string control_tag;
        string previous_tag;
        long control_num;
        string parsed_text;
        char delimiter;
        bool has_has_value;         // value of has_value latched by the last ControlWord() call
        int indirection_counter;    // current group ("{...}") nesting depth
        int skip_counter;
        int extensions_skip_counter;

        // 20200605
        internal bool override_default_color;

        // 20180511
        internal ParagraphFormat current_paragraph_format;
        internal RunFormat current_run_format;
        internal bool insideTable;

        // 20240226
        internal int skip_space_counter;

        #region Unicode converters
        internal int current_lang;
        internal long font_charset;
        int default_lang;
        const int Lang_EN_US = 1033;
        Decoder current_unicode_decoder;
        // Decoders are cached per LCID / code page value passed to SelectUnicodeDecoder.
        Dictionary<int, Decoder> unicode_decoders = new Dictionary<int, Decoder>();
        // One entry per open group; restored when the group closes.
        Stack<int> lang_ids = new Stack<int>();
        // NOTE(review): not used anywhere in this class; key/value types are assumed - confirm.
        Dictionary<int, int> translate_charset = new Dictionary<int, int>();
        // Bytes of consecutive \'hh escapes waiting to be decoded as one run.
        List<byte> raw_chars = new List<byte>();
        #endregion

        /// <summary>Text captured by the most recent control word / group boundary.</summary>
        public string Text { get { return parsed_text; } }

        /// <summary>Name of the most recently completed control word.</summary>
        public string Control { get { return control_tag; } }

        /// <summary>Name of the control word completed before <see cref="Control"/>.</summary>
        public string PreviousTag { get { return previous_tag; } }

        /// <summary>Numeric argument of the most recent control word (0 when it had none).</summary>
        public long Number { get { return control_num; } }

        /// <summary>
        /// Character that terminated the last control word, or '\0' when the current status
        /// is not <see cref="ParserStatus.ControlTag"/>.
        /// </summary>
        public char Delimiter { get { return status == ParserStatus.ControlTag ? delimiter : '\0'; } }

        /// <summary>Status produced by the last <see cref="ParseByte(char)"/> call.</summary>
        public ParserStatus Status { get { return status; } }

        /// <summary>True when the most recent control word carried a numeric argument.</summary>
        public bool HasValue { get { return has_has_value; } }

        public bool ListItem
        {
            get { return list_is_active; }
            set { list_is_active = value; }
        }

        /// <summary>
        /// Returns true when collected text is still buffered. As a side effect the buffered
        /// text is published through <see cref="Text"/>.
        /// </summary>
        public bool EndOfFile
        {
            get
            {
                if (text.Length == 0)
                    return false;
                parsed_text = text.ToString();
                return true;
            }
        }

        public void ClearParsedText()
        {
            parsed_text = String.Empty;
        }

        #region Language selector and translator
        /// <summary>
        /// Maps a Windows language identifier (LCID) to its ANSI code page.
        /// Values that are not listed are returned unchanged, so a value that already is a
        /// code page (e.g. from \ansicpg) passes through untouched.
        /// </summary>
        private int LCID2Codepade(int lcid)
        {
            int codepage = lcid;
            switch (lcid)
            {
                case 1054:  // Thai
                    codepage = 874;
                    break;

                case 1041:  // Japanese
                    codepage = 932;
                    break;

                case 2052:  // Chinese - PRC (GB2312)
                    codepage = 936;
                    break;

                case 1042:  // Korean
                    codepage = 949;
                    break;

                case 1028:  // Chinese - Taiwan (BIG5)
                    codepage = 950;
                    break;

                case 1048:  // Romanian - Romania
                case 1052:  // Albanian
                case 1045:  // Polish
                case 1038:  // Hungarian
                case 2074:  // Serbian - Latin
                case 1060:  // Slovenian
                case 1029:  // Czech
                case 1050:  // Croatian
                case 1051:  // Slovak
                    codepage = 1250;
                    break;

                case 1088:  // Kyrgyz - Cyrillic
                case 2092:  // Azeri - Cyrillic
                case 1049:  // Russian
                case 1087:  // Kazakh
                case 1058:  // Ukrainian
                case 1092:  // Tatar
                case 3098:  // Serbian - Cyrillic
                case 1059:  // Belarusian
                case 2115:  // Uzbek - Cyrillic
                case 1104:  // Mongolian
                case 1026:  // Bulgarian
                case 1071:  // FYRO Macedonia
                    codepage = 1251;
                    break;

                case 1078:  // Afrikaans
                case 1080:  // Faroese
                case 1053:  // Swedish - Sweden
                case 1086:  // Malay - Malaysia
                case 1089:  // Swahili
                case 15370: // Spanish - Paraguay
                case 14346: // Spanish - Uruguay
                case 11273: // English - Trinidad
                case 13321: // English - Philippines
                case 16394: // Spanish - Bolivia
                case 19466: // Spanish - Nicaragua
                case 1027:  // Catalan
                case 1030:  // Danish
                case 1031:  // German - Germany
                case 1033:  // English - USA
                case 1034:  // Spanish - Spain (Traditional)
                case 1035:  // Finnish
                case 1036:  // French - France
                case 1057:  // Indonesian
                case 1040:  // Italian - Italy
                case 1069:  // Basque
                case 1043:  // Dutch - Netherlands
                case 1044:  // Norwegian - Bokmal
                case 1046:  // Portuguese - Brazil
                case 13322: // Spanish - Chile
                case 18442: // Spanish - Honduras (0x480A)
                case 1110:  // Galician
                case 17418: // Spanish - El Salvador
                case 1039:  // Icelandic
                case 3081:  // English - Australia
                case 7178:  // Spanish - Dominican Republic
                case 7177:  // English - South Africa
                case 3079:  // German - Austria
                case 6156:  // French - Monaco
                case 6154:  // Spanish - Panama
                case 6153:  // English - Ireland
                case 4106:  // Spanish - Guatemala
                case 5130:  // Spanish - Costa Rica
                case 2070:  // Portuguese - Portugal
                case 3084:  // French - Canada
                case 5129:  // English - New Zealand
                case 12298: // Spanish - Ecuador
                case 5127:  // German - Liechtenstein
                case 4108:  // French - Switzerland
                case 4103:  // German - Luxembourg
                case 4105:  // English - Canada
                case 5132:  // French - Luxembourg
                case 8202:  // Spanish - Venezuela
                case 12297: // English - Zimbabwe
                case 10250: // Spanish - Peru
                case 10249: // English - Belize
                case 9226:  // Spanish - Colombia
                case 20490: // Spanish - Puerto Rico
                case 9225:  // English - Caribbean
                case 2110:  // Malay - Brunei
                case 2057:  // English - Great Britain
                case 2077:  // Swedish - Finland
                case 2058:  // Spanish - Mexico
                case 2060:  // French - Belgium
                case 2064:  // Italian - Switzerland
                case 2067:  // Dutch - Belgium
                case 8201:  // English - Jamaica
                case 2068:  // Norwegian - Nynorsk
                case 11274: // Spanish - Argentina
                case 2055:  // German - Switzerland
                    codepage = 1252;
                    break;

                case 1032:  // Greek
                    codepage = 1253;
                    break;

                case 1055:  // Turkish
                case 1091:  // Uzbek - Latin
                case 1068:  // Azeri - Latin
                    codepage = 1254;
                    break;

                case 1037:  // Hebrew
                    codepage = 1255;
                    break;

                case 1025:  // Arabic - Saudi Arabia
                case 11265: // Arabic - Jordan
                case 13313: // Arabic - Kuwait
                case 10241: // Arabic - Syria
                case 12289: // Arabic - Lebanon
                case 2049:  // Arabic - Iraq
                case 15361: // Arabic - Bahrain
                case 9217:  // Arabic - Yemen
                case 16385: // Arabic - Qatar
                case 1056:  // Urdu
                case 3073:  // Arabic - Egypt
                case 14337: // Arabic - United Arab Emirates
                case 7169:  // Arabic - Tunisia
                case 1065:  // Farsi - Persian
                case 6145:  // Arabic - Morocco
                case 4097:  // Arabic - Libya
                case 5121:  // Arabic - Algeria
                case 8193:  // Arabic - Oman
                    codepage = 1256;
                    break;

                case 1063:  // Lithuanian
                case 1062:  // Latvian
                case 1061:  // Estonian
                    codepage = 1257;
                    break;

                case 1066:  // Vietnamese
                    codepage = 1258;
                    break;
            }
            return codepage;
        }

        /// <summary>
        /// Makes the decoder for <paramref name="lcid"/> current, creating and caching it on
        /// first use, and records <paramref name="lcid"/> as the current language.
        /// </summary>
        /// <param name="lcid">Language identifier or code page number.</param>
        private void SelectUnicodeDecoder(int lcid)
        {
            Decoder decoder;
            if (!unicode_decoders.TryGetValue(lcid, out decoder))
            {
                int codepage = LCID2Codepade(lcid);
                Encoding encoding = Encoding.GetEncoding(codepage);
                decoder = encoding.GetDecoder();
                unicode_decoders.Add(lcid, decoder);
            }
            current_unicode_decoder = decoder;
            current_lang = lcid;
        }

        /// <summary>
        /// Translates an RTF \fcharset value into the identifier handed to
        /// <see cref="SelectUnicodeDecoder(int)"/>. The result is either an LCID or a code page;
        /// <see cref="LCID2Codepade(int)"/> resolves LCIDs and passes code pages through.
        /// Unknown charsets yield the document default language.
        /// </summary>
        int TranslateCharset(long charset)
        {
            switch (charset)
            {
                case 0: return 1033;            // ANSI
                case 1: return default_lang;    // default
                // NOTE(review): 1038 is also the Hungarian LCID, which LCID2Codepade maps to 1250 - confirm this is intended for Symbol.
                case 2: return 1038;            // Symbol
                case 77: return 10000;          // Mac Roman - to fix
                case 78: return 10001;          // Mac Shift Jis - to fix
                case 79: return 10003;          // Mac Hangul - to fix
                case 80: return 10008;          // Mac GB2312 - to fix
                case 81: return 10002;          // Mac Big5 - to fix
                //case 82: return 10002;        // Johab old
                case 83: return 10005;          // Mac Hebrew - to fix
                case 84: return 10004;          // Mac Arabic - to fix
                case 85: return 10006;          // Mac Greek - to fix
                case 86: return 10081;          // Mac Turkish - to fix
                case 87: return 10021;          // Mac Thai - to fix
                case 88: return 10029;          // Mac East Europe - to fix
                case 89: return 10007;          // Mac Russian - to fix
                case 128: return 932;           // Shift JIS
                case 129: return 949;           // Korean Hangul
                case 130: return default_lang;  // 1361; Korean Johab
                case 134: return 2052;          // 936; GB2312
                case 136: return 1028;          // 950; BIG5
                case 161: return 1253;          // Greek
                case 162: return 1055;          // Turkish
                case 163: return 1258;          // Vietnamese // 1066; //
                case 177: return 1255;          // Hebrew 1037; //
                case 178: return 1256;          // Arabic 1056; //
                //case 179: return 0;           // Arabic Traditional (old)
                //case 180: return 0;           // Arabic user (old)
                //case 181: return 0;           // Hebrew user (old)
                case 186: return 1257;          // Baltic 1062; //
                case 204: return 1251;          // Russian // 1049; //
                case 222: return 874;           // Thai 1054; //
                case 238: return 1250;          // East Europe (Poland selected) 1045; //
                case 254: return 437;           // PC437
                case 255: return 850;           // OEM
            }
            return default_lang;
        }

        /// <summary>Selects the decoder that corresponds to a font's \fcharset value.</summary>
        internal void SelectCodepageByFontCharset(long charset)
        {
            int codepage = TranslateCharset(charset);
            SelectUnicodeDecoder(codepage);
        }

        /// <summary>Remembers the current language when a group opens.</summary>
        private void PushLocaleDecoder()
        {
            lang_ids.Push(current_lang);
        }

        /// <summary>
        /// Restores the language that was active when the group being closed was opened.
        /// An unbalanced closing brace is only logged in DEBUG builds and raises
        /// <see cref="SyntaxErrorException"/> otherwise.
        /// </summary>
        private void PopLocaleDecoder()
        {
            if (lang_ids.Count != 0)
            {
                current_lang = lang_ids.Pop();
                SelectUnicodeDecoder(current_lang);
            }
            else
            {
#if DEBUG
                Debug.WriteLine("Rich document error: broken source document structure. Ignore error and continue");
#else
                throw new SyntaxErrorException("Rich document structure error - broken source document");
#endif
            }
        }
        #endregion

        /// <summary>
        /// Finalises the control word collected so far: publishes its name, numeric argument
        /// and the text gathered before it, then clears the collecting buffers.
        /// </summary>
        private void ControlWord()
        {
            previous_tag = control_tag;
            control_tag = control.ToString();
            // RTF numbers are plain ASCII digits with an optional '-'; never parse them with the UI culture.
            control_num = number.Length != 0
                ? long.Parse(number.ToString(), CultureInfo.InvariantCulture)
                : 0;

            if (control_tag == "lang" || control_tag == "ansicpg")
            {
                if (font_charset == 0)
                {
                    SelectUnicodeDecoder((int)control_num);
                    default_lang = (int)control_num;
                }
            }
#if true // 2024 Jan 25 - deflang
            if (control_tag == "deflang")
            {
                if (default_lang != 0)
                {
                    SelectUnicodeDecoder((int)control_num);
                    default_lang = current_lang;
                }
            }
#endif
            RestoreEncodedText();
            parsed_text = text.ToString();
            control.Length = 0;
            number.Length = 0;
            text.Length = 0;
            has_has_value = has_value;
            has_value = false;
        }

        /// <summary>
        /// Decodes the pending \'hh bytes with the current decoder and appends the result to
        /// the collected text. Does nothing when no bytes are pending.
        /// </summary>
        private void DecodePendingBytes()
        {
            if (raw_chars.Count == 0)
                return;

            // Hex escapes may arrive before any \ansicpg / \lang selected a decoder.
            if (current_unicode_decoder == null)
                SelectUnicodeDecoder(current_lang);

            byte[] bytes = raw_chars.ToArray();
            char[] chars = new char[bytes.Length];
            int count = current_unicode_decoder.GetChars(bytes, 0, bytes.Length, chars, 0);
            text.Append(chars, 0, count);
            raw_chars.Clear();
        }

        /// <summary>
        /// Flushes pending \'hh bytes into the text buffer and clears the number buffer
        /// (which holds the hex digits while an escape is being read).
        /// </summary>
        private void RestoreEncodedText()
        {
            DecodePendingBytes();
            number.Clear();
        }

        /// <summary>Appends one character to the text, after flushing any pending encoded bytes.</summary>
        private void AppendCharacter(char ch)
        {
            RestoreEncodedText();
            this.text.Append(ch);
        }

        /// <summary>
        /// Feeds one character of the RTF stream to the parser.
        /// </summary>
        /// <param name="ch">Next character of the document.</param>
        /// <returns>
        /// <see cref="ParserStatus.Collecting"/> while a token is still incomplete; otherwise the
        /// kind of token that has just been completed.
        /// </returns>
        internal ParserStatus ParseByte(char ch)
        {
            //Console.Write(ch);
            byte hex;
            status = ParserStatus.Collecting;
            delimiter = ch;

            // A brace that directly follows a backslash ("\{" / "\}") is literal text, not a
            // group delimiter, so it must not disturb the group depth or the language stack.
            bool escapedBrace = control.Length == 0 &&
                (parser_state == ParserState.Control || parser_state == ParserState.WaitHexcharPrefix);

            if (!escapedBrace)
            {
                if (ch == '{')
                {
                    indirection_counter++;
                    PushLocaleDecoder();
                }
                else if (ch == '}')
                {
                    // Bytes collected inside the group belong to the group's own code page:
                    // decode them before the parent's decoder is restored.
                    DecodePendingBytes();
                    PopLocaleDecoder();
                    indirection_counter--;
                }
            }

            bool loop;
            do
            {
                loop = false;
                switch (parser_state)
                {
                    case ParserState.Neutral:
                        switch (ch)
                        {
                            case '{':
                                ControlWord();
                                status = ParserStatus.OpenBlock;
                                break;
                            case '}':
                                ControlWord();
                                status = ParserStatus.CloseBlock;
                                break;
                            case '\\':
                                parser_state = ParserState.Control;
                                break;
                            case '\r':
                            case '\n':
                            case '\t':
                            case '\0':
                                // Ignored in plain text.
                                break;
                            default:
                                AppendCharacter(ch);
                                break;
                        }
                        break;

                    case ParserState.CheckHyphen:
                        if (char.IsDigit(ch))
                        {
                            // The '-' was the sign of a negative argument.
                            number.Append('-');
                            number.Append(ch);
                            parser_state = ParserState.Number;
                            has_value = true;
                            break;
                        }
                        // Substitute Optional HYPHEN with ZERO WIDTH SPACE
                        AppendCharacter((char)8203);
                        parser_state = ParserState.Neutral;
                        // Re-dispatch the same character in the Neutral state. This must not
                        // re-enter ParseByte: the brace bookkeeping above has already run for it.
                        loop = true;
                        break;

                    case ParserState.Control:
                        if (char.IsLetter(ch))
                        {
                            control.Append(ch);
                        }
                        else if (ch == '-')
                        {
                            parser_state = ParserState.CheckHyphen;
                        }
                        else if (char.IsDigit(ch))
                        {
                            number.Append(ch);
                            parser_state = ParserState.Number;
                            has_value = true;
                        }
                        else if (ch == '\\')
                        {
                            if (control.Length > 0)
                            {
                                // Another control word follows immediately; stay in Control.
                                ControlWord();
                                status = ParserStatus.ControlTag;
                            }
                            else
                            {
                                // "\\" is an escaped backslash.
                                AppendCharacter(ch);
                                parser_state = ParserState.Neutral;
                            }
                        }
                        else if (ch == '{')
                        {
                            if (control.Length > 0)
                            {
                                ControlWord();
                                status = ParserStatus.OpenBlock;
                            }
                            else
                            {
                                AppendCharacter(ch);
                                status = ParserStatus.Collecting;
                            }
                            parser_state = ParserState.Neutral;
                        }
                        else if (ch == '}')
                        {
                            if (control.Length > 0)
                            {
                                ControlWord();
                                status = ParserStatus.CloseBlock;
                            }
                            else
                            {
                                AppendCharacter(ch);
                                status = ParserStatus.Collecting;
                            }
                            parser_state = ParserState.Neutral;
                        }
                        else if (char.IsWhiteSpace(ch))
                        {
                            parser_state = ParserState.Neutral;
                            ControlWord();
                            status = ParserStatus.ControlTag;
                        }
                        else if (ch == '*')
                        {
#if false // Previous version which ignore pictures in \* control (20210211)
                            parser_state = ParserState.RichFormatExtensions;
                            if (indirection_counter == 0)
                                throw new Exception("Broken RTF format");
                            extensions_skip_counter = indirection_counter - 1;
#else
                            parser_state = ParserState.Neutral;
                            ControlWord();
                            status = ParserStatus.ControlTag;
#endif
                        }
                        else if (ch == ';')
                        {
                            parser_state = ParserState.Neutral;
                            ControlWord();
                            status = ParserStatus.ControlTag;
                        }
                        else if (ch == '\'')
                        {
                            // Start of a \'hh hexadecimal byte escape.
                            number.Clear();
                            parser_state = ParserState.FirstNibble;
                        }
                        else if (ch == '~')
                        {
                            // Non-breaking space
                            AppendCharacter((char)0x00a0);
                            parser_state = ParserState.Neutral;
                        }
                        else if (ch == '_')
                        {
                            // Non-breaking hyphen
                            AppendCharacter((char)0x2011);
                            parser_state = ParserState.Neutral;
                        }
                        else if (ch == '.')
                        {
                            AppendCharacter('.');
                            parser_state = ParserState.Neutral;
                        }
                        else if (ch == ')')
                        {
                            AppendCharacter(')');
                            parser_state = ParserState.Neutral;
                        }
                        else
                            throw new Exception("RTF format not parsed");
                        break;

                    case ParserState.Number:
                        if (char.IsDigit(ch))
                        {
                            number.Append(ch);
                        }
                        else
                        {
                            if (ch == '{')
                            {
                                parser_state = ParserState.Neutral;
                                status = ParserStatus.OpenBlock;
                            }
                            else if (ch == '}')
                            {
                                parser_state = ParserState.Neutral;
                                status = ParserStatus.CloseBlock;
                            }
                            else
                            {
                                if (this.control.ToString() == "u")
                                {
                                    int codeUnit = int.Parse(number.ToString(), CultureInfo.InvariantCulture);
                                    if (codeUnit < 0)
                                    {
                                        // RTF control words generally accept signed 16-bit numbers as arguments.
                                        // For this reason, Unicode values greater than 32767 must be expressed as negative numbers.
                                        codeUnit = unchecked((char)codeUnit);
                                        if (codeUnit == 0xf0b7)
                                            codeUnit = '\u25CF'; // black circle instead of the private-use bullet
                                    }
                                    AppendCharacter(unchecked((char)codeUnit));
                                    number.Length = 0;
                                    if (ch != '?')
                                    {
                                        // Skip the three characters that follow this delimiter.
                                        parser_state = ParserState.SkipToNext;
                                        skip_counter = 3;
                                    }
                                    else
                                    {
                                        parser_state = ParserState.Neutral;
                                        control.Length = 0;
                                    }
                                    break;
                                }
                                else if (ch == '\\')
                                    parser_state = ParserState.Control;
                                else if (ch == ';' || char.IsWhiteSpace(ch))
                                    parser_state = ParserState.Neutral;
                                status = ParserStatus.ControlTag;
                            }
                            ControlWord();
                        }
                        break;

                    case ParserState.TryCollectChars:
                        if (ch != '\\')
                        {
                            // The run of hex escapes is over: decode it and reprocess this character as text.
                            RestoreEncodedText();
                            parser_state = ParserState.Neutral;
                            loop = true;
                            continue;
                        }
                        parser_state = ParserState.WaitHexcharPrefix;
                        break;

                    case ParserState.WaitHexcharPrefix:
                        if (ch != '\'')
                        {
                            // The backslash started something other than \'hh: decode the run
                            // and reprocess this character as the start of a control word.
                            RestoreEncodedText();
                            parser_state = ParserState.Control;
                            loop = true;
                            continue;
                        }
                        parser_state = ParserState.FirstNibble;
                        number.Clear();
                        break;

                    case ParserState.FirstNibble:
                        parser_state = ParserState.SecondNibble;
                        number.Append(ch);
                        break;

                    case ParserState.SecondNibble:
                        number.Append(ch);
                        if (!byte.TryParse(number.ToString(), NumberStyles.HexNumber, CultureInfo.InvariantCulture, out hex))
                        {
                            // Malformed escape: report it and substitute a placeholder byte.
                            Console.Error.WriteLine("RTF parser: invalid hex escape \\'" + number.ToString());
                            hex = (byte)'?';
                        }
#if false // 2024/01/26
                        // This will stay disabled until we met os/platform which does not support Shift JIS encoding
                        else if (hex == 0x81)
                        {
                            parser_state = ParserState.ParsePrefix_0x81_phase0;
                            break;
                        }
#endif
                        raw_chars.Add(hex);
                        parser_state = ParserState.TryCollectChars;
                        break;

#if false // 2024/01/26
                    // This will stay disabled until we met os/platform which does not support Shift JIS encoding
                    case ParserState.ParsePrefix_0x81_phase0:
                        // if(ch == '\\')
                        parser_state = ParserState.ParsePrefix_0x81_phase1;
                        break;
                    case ParserState.ParsePrefix_0x81_phase1:
                        // if (ch == '\'')
                        parser_state = ParserState.ParsePrefix_0x81_phase2;
                        break;
                    case ParserState.ParsePrefix_0x81_phase2:
                        number.Append(ch);
                        parser_state = ParserState.ParsePrefix_0x81_phase3;
                        break;
                    case ParserState.ParsePrefix_0x81_phase3:
                        number.Append(ch);
                        hex = byte.Parse(number.ToString(), System.Globalization.NumberStyles.HexNumber);
                        switch(hex)
                        {
                            case 0x8b:
                                text.Append('°');
                                break;
                            case 0x7e:
                                text.Append('×');
                                break;
                            default:
                                throw new Exception("Unparsed ANSI sequence");
                        }
                        number.Clear();
                        parser_state = ParserState.Neutral;
                        break;
#endif

                    case ParserState.SkipToNext:
                        // Ignore hexadecimal representation of the character
                        skip_counter--;
                        if (skip_counter == 0)
                        {
                            parser_state = ParserState.Neutral;
                            control.Length = 0;
                        }
                        break;

                    case ParserState.RichFormatExtensions:
                        status = ParseExtensionByte(ch);
                        break;

                    case ParserState.IgnoreSpaces:
                        if (ch == ' ')
                        {
                            // Debug.WriteLine("Skip space");
                        }
                        else
                        {
                            parser_state = ParserState.Neutral;
                            loop = true;
                        }
                        break;
                }
            }
            while (loop);

            return status;
        }

#if false // Debug
        StringBuilder dbg = new StringBuilder();
        private ParserStatus ParseExtensionByte(char ch)
        {
            if (extensions_skip_counter == indirection_counter)
            {
                dbg.Append("\n\n");
                parser_state = ParserState.Neutral;
                dbg.Clear();
            }
            else
                dbg.Append(ch);
            return ch == '{' ? ParserStatus.OpenBlock : ch == '}' ? ParserStatus.CloseBlock : ParserStatus.Collecting;
        }
#else
        /// <summary>
        /// Consumes one character while in the RichFormatExtensions state; returns to the
        /// Neutral state once the group depth equals <c>extensions_skip_counter</c>.
        /// </summary>
        private ParserStatus ParseExtensionByte(char ch)
        {
            if (extensions_skip_counter == indirection_counter)
            {
                parser_state = ParserState.Neutral;
            }
            return ch == '{' ? ParserStatus.OpenBlock : ch == '}' ? ParserStatus.CloseBlock : ParserStatus.Collecting;
        }
#endif

        /// <summary>Resets the current run (character) format to the parser's defaults.</summary>
        internal void ResetRunFormat()
        {
            current_run_format.bold = false;
            current_run_format.italic = false;
            current_run_format.underline = false;
            current_run_format.strike = false;
            current_run_format.font_size = 24;
            current_run_format.color = System.Drawing.Color.Black;
            current_run_format.BColor = System.Drawing.Color.White;
            current_run_format.FillColor = System.Drawing.Color.White;
            current_run_format.font_idx = 0;
            current_run_format.script_type = RunFormat.ScriptType.PlainText;
        }

        /// <summary>
        /// Resets the current paragraph format to the parser's defaults and switches the
        /// current language back to the document default.
        /// </summary>
        public void ResetParagraphFormat()
        {
            current_paragraph_format.Valign = ParagraphFormat.VerticalAlign.Top; // 20210722
            current_paragraph_format.align = ParagraphFormat.HorizontalAlign.Left;
            current_paragraph_format.line_spacing = 0;
            current_paragraph_format.space_before = 0;
            current_paragraph_format.space_after = 0;
            current_paragraph_format.left_indent = 0;
            current_paragraph_format.right_indent = 0;
            current_paragraph_format.first_line_indent = 0;
            current_paragraph_format.lnspcmult = ParagraphFormat.LnSpcMult.Exactly;
            current_paragraph_format.pnstart = 0;
            current_lang = default_lang;
            current_paragraph_format.list_id = null;
            current_paragraph_format.tab_positions = null;
        }

        internal RTF_Parser()
        {
            parser_state = ParserState.Neutral;
            control = new StringBuilder();
            // Capacity and maximum capacity are both 12: appending a 13th character throws.
            number = new StringBuilder(12, 12);
            text = new StringBuilder();
            has_value = false;
            override_default_color = false;
            current_lang = 0;
            indirection_counter = 0;
            skip_space_counter = 0;
            ResetRunFormat();
            ResetParagraphFormat();
        }

        static RTF_Parser()
        {
#if CROSSPLATFORM || COREWIN
            Encoding.RegisterProvider(CodePagesEncodingProvider.Instance);
#endif
        }
    }
}