using System;
using System.Collections.Generic;
using System.Data;
using System.Diagnostics;
using System.Diagnostics.Contracts;
using System.Globalization;
using System.Text;
using static FastReport.Fonts.GlyphSubstitutionClass;
namespace FastReport.RichTextParser
{
/// <summary>
/// Result reported by <c>RTF_Parser.ParseByte</c> for every character it is fed.
/// </summary>
enum ParserStatus
{
// Default result: the character was consumed and no token was completed.
Collecting,
// Not assigned anywhere in this file.
Text,
// A control word or control symbol has just been completed.
ControlTag,
// An opening brace '{' was processed as the start of a group.
OpenBlock,
// A closing brace '}' was processed as the end of a group.
CloseBlock
}
/// <summary>
/// This class detects RTF control sequences and text.
/// </summary>
class RTF_Parser
{
// Internal states of the character-level state machine driven by ParseByte.
enum ParserState
{
// Initial state (set by the constructor): plain text, braces and backslash are dispatched here.
Neutral,
// A backslash has been seen: collecting control word letters or a control symbol.
Control,
// Collecting the digits of a control word's numeric parameter.
Number,
// Reading the first hex digit of a \'hh escape.
FirstNibble,
// Reading the second hex digit of a \'hh escape.
SecondNibble,
// A hex byte was stored; checks whether the next character is a backslash.
TryCollectChars,
// A backslash followed a hex byte; an apostrophe continues the run of hex escapes.
WaitHexcharPrefix,
// The four ParsePrefix_0x81_* states are referenced only by code disabled with "#if false" in this file.
ParsePrefix_0x81_phase0,
ParsePrefix_0x81_phase1,
ParsePrefix_0x81_phase2,
ParsePrefix_0x81_phase3,
// Skipping skip_counter characters that follow a \uN control word.
SkipToNext,
// Characters are routed to ParseExtensionByte; only entered from disabled code in this file.
RichFormatExtensions,
// A '-' was seen in Control state: either a negative parameter or the optional hyphen symbol.
CheckHyphen,
// Skips space characters; handled by ParseByte but never assigned in this file.
IgnoreSpaces,
// Not referenced anywhere in this file.
ParserError
}
// Status reported for the character most recently passed to ParseByte.
ParserStatus status;
// Current state of the character-level state machine.
ParserState parser_state;
// Letters of the control word currently being collected.
StringBuilder control;
// Characters of the numeric parameter being collected (also reused for the two hex digits of \'hh).
StringBuilder number;
// Plain text accumulated since the last completed token.
StringBuilder text;
// True once the control word being collected has received a numeric parameter.
bool has_value;
// Backing field of the ListItem property.
bool list_is_active;
// Name of the most recently completed control word.
string control_tag;
// Name of the control word completed before control_tag.
string previous_tag;
// Numeric parameter of the most recently completed control word (0 when absent).
long control_num;
// Text that had been collected when the last token was completed.
string parsed_text;
// Character most recently passed to ParseByte.
char delimiter;
// Snapshot of has_value taken when the last control word was completed (exposed as HasValue).
bool has_has_value;
// Current group nesting depth ('{' increments, '}' decrements).
int indirection_counter;
// Number of characters still to be skipped in the SkipToNext state.
int skip_counter;
// Nesting depth at which the RichFormatExtensions state ends.
int extensions_skip_counter;
// 20200605
internal bool override_default_color;
// 20180511
internal ParagraphFormat current_paragraph_format;
internal RunFormat current_run_format;
internal bool insideTable;
// 20240226
internal int skip_space_counter;
#region Unicode converters
// Identifier (LCID or code page) of the decoder currently in effect.
internal int current_lang;
internal long font_charset;
// Identifier used when a charset does not select a specific code page.
int default_lang;
const int Lang_EN_US = 1033;
// Decoder used to convert \'hh bytes into characters; null until a code page has been selected.
Decoder current_unicode_decoder;
// Cache of decoders keyed by the identifier passed to SelectUnicodeDecoder.
Dictionary<int, Decoder> unicode_decoders = new Dictionary<int, Decoder>();
// Saved current_lang values, one per open group.
Stack<int> lang_ids = new Stack<int>();
// NOTE(review): not referenced in this class; the type arguments (charset -> language id) are assumed - confirm.
Dictionary<long, int> translate_charset = new Dictionary<long, int>();
// Bytes collected from consecutive \'hh escapes, waiting to be decoded.
List<byte> raw_chars = new List<byte>();
#endregion
/// <summary>Text that had been collected when the last token was completed.</summary>
public string Text => parsed_text;

/// <summary>Name of the most recently completed control word.</summary>
public string Control => control_tag;

/// <summary>Name of the control word completed before <see cref="Control"/>.</summary>
public string PreviousTag => previous_tag;

/// <summary>Numeric parameter of the most recently completed control word (0 when absent).</summary>
public long Number => control_num;

/// <summary>
/// Character that terminated the last control word, or '\0' when the last
/// status is not <see cref="ParserStatus.ControlTag"/>.
/// </summary>
public char Delimiter => status == ParserStatus.ControlTag ? delimiter : '\0';

/// <summary>Status produced by the most recent call to ParseByte.</summary>
public ParserStatus Status => status;

/// <summary>True when the last completed control word carried a numeric parameter.</summary>
public bool HasValue => has_has_value;

/// <summary>Gets or sets the list-item flag.</summary>
public bool ListItem
{
    get { return list_is_active; }
    set { list_is_active = value; }
}

/// <summary>
/// Returns true when text is still buffered; in that case the buffered text is
/// also published through <see cref="Text"/>. Returns false when nothing is buffered.
/// </summary>
public bool EndOfFile
{
    get
    {
        bool has_pending_text = text.Length != 0;
        if (has_pending_text)
        {
            parsed_text = text.ToString();
        }
        return has_pending_text;
    }
}

/// <summary>Discards the text published through <see cref="Text"/>.</summary>
public void ClearParsedText()
{
    parsed_text = String.Empty;
}
#region Language selector and translator
/// <summary>
/// Maps a Windows language identifier (LCID) to its ANSI code page.
/// </summary>
/// <param name="lcid">Language identifier, or a value that already is a code page.</param>
/// <returns>
/// The ANSI code page for a known LCID; any other value is returned unchanged,
/// so callers may pass a code page number straight through.
/// </returns>
private int LCID2Codepade(int lcid)
{
    switch (lcid)
    {
        // Thai
        case 1054: // Thai
            return 874;

        // Japanese (Shift JIS)
        case 1041: // Japanese
            return 932;

        // Simplified Chinese (GB2312)
        case 2052: // Chinese (PRC) - also returned by TranslateCharset for charset 134
        case 4100: // Chinese (Singapore)
            return 936;

        // Korean
        case 1042: // Korean
            return 949;

        // Traditional Chinese (Big5)
        case 1028: // Chinese (Taiwan) - also returned by TranslateCharset for charset 136
        case 3076: // Chinese (Hong Kong)
        case 5124: // Chinese (Macau)
            return 950;

        // Central European
        case 1048: // Romanian
        case 1052: // Albanian
        case 1045: // Polish
        case 1038: // Hungarian
        case 2074: // Serbian (Latin)
        case 1060: // Slovenian
        case 1029: // Czech
        case 1050: // Croatian
        case 1051: // Slovak
            return 1250;

        // Cyrillic
        case 1088: // Kyrgyz (Cyrillic)
        case 2092: // Azeri (Cyrillic)
        case 1049: // Russian
        case 1087: // Kazakh
        case 1058: // Ukrainian
        case 1092: // Tatar
        case 3098: // Serbian (Cyrillic)
        case 1059: // Belarusian
        case 2115: // Uzbek (Cyrillic)
        case 1104: // Mongolian
        case 1026: // Bulgarian
        case 1071: // FYRO Macedonia
            return 1251;

        // Western European
        case 1078: // Afrikaans
        case 1080: // Faroese
        case 1053: // Swedish (Sweden)
        case 1086: // Malay (Malaysia)
        case 1089: // Swahili
        case 15370: // Spanish (Paraguay)
        case 14346: // Spanish (Uruguay)
        case 11273: // English (Trinidad)
        case 13321: // English (Philippines)
        case 16394: // Spanish (Bolivia)
        case 19466: // Spanish (Nicaragua)
        case 1027: // Catalan
        case 1030: // Danish
        case 1031: // German (Germany)
        case 1033: // English (USA)
        case 1034: // Spanish (Spain, traditional sort)
        case 1035: // Finnish
        case 1036: // French (France)
        case 1057: // Indonesian
        case 1040: // Italian (Italy)
        case 1069: // Basque
        case 1043: // Dutch (Netherlands)
        case 1044: // Norwegian (Bokmal)
        case 1046: // Portuguese (Brazil)
        case 13322: // Spanish (Chile)
        case 18442: // Spanish (Honduras) - the correct LCID (0x480A)
        case 18422: // historical typo for 18442, kept so existing behaviour does not change
        case 1110: // Galician
        case 17418: // Spanish (El Salvador)
        case 1039: // Icelandic
        case 3081: // English (Australia)
        case 7178: // Spanish (Dominican Republic)
        case 7177: // English (South Africa)
        case 3079: // German (Austria)
        case 6156: // French (Monaco)
        case 6154: // Spanish (Panama)
        case 6153: // English (Ireland)
        case 4106: // Spanish (Guatemala)
        case 5130: // Spanish (Costa Rica)
        case 2070: // Portuguese (Portugal)
        case 3084: // French (Canada)
        case 5129: // English (New Zealand)
        case 12298: // Spanish (Ecuador)
        case 5127: // German (Liechtenstein)
        case 4108: // French (Switzerland)
        case 4103: // German (Luxembourg)
        case 4105: // English (Canada)
        case 5132: // French (Luxembourg)
        case 8202: // Spanish (Venezuela)
        case 12297: // English (Zimbabwe)
        case 10250: // Spanish (Peru)
        case 10249: // English (Belize)
        case 9226: // Spanish (Colombia)
        case 20490: // Spanish (Puerto Rico)
        case 9225: // English (Caribbean)
        case 2110: // Malay (Brunei)
        case 2057: // English (Great Britain)
        case 2077: // Swedish (Finland)
        case 2058: // Spanish (Mexico)
        case 2060: // French (Belgium)
        case 2064: // Italian (Switzerland)
        case 2067: // Dutch (Belgium)
        case 8201: // English (Jamaica)
        case 2068: // Norwegian (Nynorsk)
        case 11274: // Spanish (Argentina)
        case 2055: // German (Switzerland)
            return 1252;

        // Greek
        case 1032: // Greek
            return 1253;

        // Turkish
        case 1055: // Turkish
        case 1091: // Uzbek (Latin)
        case 1068: // Azeri (Latin)
            return 1254;

        // Hebrew
        case 1037: // Hebrew
            return 1255;

        // Arabic
        case 1025: // Arabic (Saudi Arabia)
        case 11265: // Arabic (Jordan)
        case 13313: // Arabic (Kuwait)
        case 10241: // Arabic (Syria)
        case 12289: // Arabic (Lebanon)
        case 2049: // Arabic (Iraq)
        case 15361: // Arabic (Bahrain)
        case 9217: // Arabic (Yemen)
        case 16385: // Arabic (Qatar)
        case 1056: // Urdu
        case 3073: // Arabic (Egypt)
        case 14337: // Arabic (United Arab Emirates)
        case 7169: // Arabic (Tunisia)
        case 1065: // Farsi (Persian)
        case 6145: // Arabic (Morocco)
        case 4097: // Arabic (Libya)
        case 5121: // Arabic (Algeria)
        case 8193: // Arabic (Oman)
            return 1256;

        // Baltic
        case 1063: // Lithuanian
        case 1062: // Latvian
        case 1061: // Estonian
            return 1257;

        // Vietnamese
        case 1066: // Vietnamese
            return 1258;

        default:
            // Unknown identifier: assume the caller already supplied a code page.
            return lcid;
    }
}
/// <summary>
/// Makes the decoder for the given identifier current, creating and caching
/// it on first use, and records the identifier in current_lang.
/// </summary>
/// <param name="lcid">Language identifier or code page; also used as the cache key.</param>
private void SelectUnicodeDecoder(int lcid)
{
    Decoder decoder;
    if (unicode_decoders.ContainsKey(lcid))
    {
        decoder = unicode_decoders[lcid];
    }
    else
    {
        // First request for this identifier: resolve its code page and build a decoder.
        decoder = Encoding.GetEncoding(LCID2Codepade(lcid)).GetDecoder();
        unicode_decoders.Add(lcid, decoder);
    }
    current_unicode_decoder = decoder;
    current_lang = lcid;
}
/// <summary>
/// Translates a font charset number into the identifier (LCID or code page)
/// that is later handed to SelectUnicodeDecoder.
/// </summary>
/// <param name="charset">Font charset number.</param>
/// <returns>
/// The identifier for a known charset; default_lang for charset 1 (default),
/// charset 130 (Korean Johab) and every charset not listed below.
/// </returns>
int TranslateCharset(long charset)
{
    int lang = default_lang;
    switch (charset)
    {
        case 0: lang = 1033; break;    // ANSI
        case 2: lang = 1038; break;    // Symbol
        case 77: lang = 10000; break;  // Mac Roman - to fix
        case 78: lang = 10001; break;  // Mac Shift JIS - to fix
        case 79: lang = 10003; break;  // Mac Hangul - to fix
        case 80: lang = 10008; break;  // Mac GB2312 - to fix
        case 81: lang = 10002; break;  // Mac Big5 - to fix
        case 83: lang = 10005; break;  // Mac Hebrew - to fix
        case 84: lang = 10004; break;  // Mac Arabic - to fix
        case 85: lang = 10006; break;  // Mac Greek - to fix
        case 86: lang = 10081; break;  // Mac Turkish - to fix
        case 87: lang = 10021; break;  // Mac Thai - to fix
        case 88: lang = 10029; break;  // Mac East Europe - to fix
        case 89: lang = 10007; break;  // Mac Russian - to fix
        case 128: lang = 932; break;   // Shift JIS
        case 129: lang = 949; break;   // Korean Hangul
        case 134: lang = 2052; break;  // GB2312 (code page 936)
        case 136: lang = 1028; break;  // BIG5 (code page 950)
        case 161: lang = 1253; break;  // Greek
        case 162: lang = 1055; break;  // Turkish
        case 163: lang = 1258; break;  // Vietnamese
        case 177: lang = 1255; break;  // Hebrew
        case 178: lang = 1256; break;  // Arabic
        case 186: lang = 1257; break;  // Baltic
        case 204: lang = 1251; break;  // Russian
        case 222: lang = 874; break;   // Thai
        case 238: lang = 1250; break;  // East Europe
        case 254: lang = 437; break;   // PC437
        case 255: lang = 850; break;   // OEM
    }
    return lang;
}
/// <summary>
/// Selects the decoder that corresponds to a font charset number.
/// </summary>
/// <param name="charset">Font charset number, translated by TranslateCharset.</param>
internal void SelectCodepageByFontCharset(long charset)
{
    SelectUnicodeDecoder(TranslateCharset(charset));
}
// Saves the language in effect so that PopLocaleDecoder can restore it later.
private void PushLocaleDecoder() => lang_ids.Push(current_lang);
/// <summary>
/// Restores the language saved by the matching PushLocaleDecoder call and
/// re-selects its decoder.
/// </summary>
/// <remarks>
/// When nothing is saved, DEBUG builds log a message and continue; other
/// builds throw SyntaxErrorException.
/// </remarks>
private void PopLocaleDecoder()
{
    if (lang_ids.Count == 0)
    {
#if DEBUG
        Debug.WriteLine("Rich document error: broken source document structure. Ignore error and continue");
        return;
#else
        throw new SyntaxErrorException("Rich document structure error - broken source document");
#endif
    }

    current_lang = lang_ids.Pop();
    SelectUnicodeDecoder(current_lang);
}
#endregion
/// <summary>
/// Completes the token being collected: publishes the control word name, its
/// numeric parameter and the text gathered so far, then clears the collecting
/// buffers for the next token.
/// </summary>
/// <remarks>
/// "lang" and "ansicpg" select the decoder and the default language while
/// font_charset is 0; "deflang" does the same once a default language is known.
/// </remarks>
private void ControlWord()
{
    previous_tag = control_tag;
    control_tag = control.ToString();

    // RTF parameters are ASCII digits with an optional ASCII '-'. Parse them
    // culture-invariantly: cultures whose negative sign is not '-' would
    // otherwise reject negative parameters.
    control_num = number.Length != 0
        ? long.Parse(number.ToString(), NumberStyles.Integer, CultureInfo.InvariantCulture)
        : 0;

    if ((control_tag == "lang" || control_tag == "ansicpg") && font_charset == 0)
    {
        SelectUnicodeDecoder((int)control_num);
        default_lang = (int)control_num;
    }
#if true // 2024 Jan 25 - deflang
    if (control_tag == "deflang" && default_lang != 0)
    {
        SelectUnicodeDecoder((int)control_num);
        default_lang = current_lang;
    }
#endif
    // Flush pending \'hh bytes so they become part of the published text.
    RestoreEncodedText();
    parsed_text = text.ToString();

    control.Length = 0;
    number.Length = 0;
    text.Length = 0;

    // Publish whether this control word had a parameter, then reset the flag.
    has_has_value = has_value;
    has_value = false;
}
/// <summary>
/// Decodes the bytes collected from \'hh escapes with the current decoder,
/// appends the resulting characters to the text buffer, and clears the
/// number buffer.
/// </summary>
private void RestoreEncodedText()
{
    if (raw_chars.Count != 0)
    {
        if (current_unicode_decoder == null)
        {
            // No code page has been selected yet (no \ansicpg, \lang or font
            // charset was seen). Fall back to en-US instead of dereferencing null.
            SelectUnicodeDecoder(Lang_EN_US);
        }

        byte[] encoded = raw_chars.ToArray();
        char[] decoded = new char[encoded.Length];
        int count = current_unicode_decoder.GetChars(encoded, 0, encoded.Length, decoded, 0);
        text.Append(decoded, 0, count);
        raw_chars.Clear();
    }
    number.Clear();
}
/// <summary>
/// Appends one character to the text buffer, first flushing any pending
/// \'hh bytes so that the output order is preserved.
/// </summary>
/// <param name="ch">Character to append.</param>
private void AppendCharacter(char ch)
{
    // Pending encoded bytes precede this character in the document.
    RestoreEncodedText();
    text.Append(ch);
}
/// <summary>
/// Feeds one character of the RTF stream to the state machine.
/// </summary>
/// <param name="ch">Next character of the document.</param>
/// <returns>
/// <see cref="ParserStatus.Collecting"/> while a token is still being gathered;
/// otherwise the kind of token that has just been completed. The token's data
/// is available through Control, Number, HasValue and Text.
/// </returns>
/// <exception cref="Exception">
/// A backslash is followed by a character that is not a supported control symbol.
/// </exception>
/// <remarks>
/// Group braces maintain the nesting counter and the saved-language stack.
/// PopLocaleDecoder throws SyntaxErrorException in non-DEBUG builds when a
/// closing brace has no matching opening brace.
/// </remarks>
internal ParserStatus ParseByte(char ch)
{
    //Console.Write(ch);
    byte hex;
    status = ParserStatus.Collecting;
    delimiter = ch;

    // A brace that directly follows a backslash ("\{" or "\}") is literal text:
    // the Control state appends it to the text when no control word letters
    // have been collected. It must not change the group depth or the language stack.
    bool escaped_brace =
        (parser_state == ParserState.Control || parser_state == ParserState.WaitHexcharPrefix)
        && control.Length == 0;
    if (!escaped_brace)
    {
        if (ch == '{')
        {
            indirection_counter++;
            PushLocaleDecoder();
        }
        else if (ch == '}')
        {
            PopLocaleDecoder();
            indirection_counter--;
        }
    }

    bool loop;
    do
    {
        loop = false;
        switch (parser_state)
        {
            case ParserState.Neutral:
                switch (ch)
                {
                    case '{':
                        ControlWord();
                        status = ParserStatus.OpenBlock;
                        break;
                    case '}':
                        ControlWord();
                        status = ParserStatus.CloseBlock;
                        break;
                    case '\\':
                        parser_state = ParserState.Control;
                        break;
                    case '\r':
                    case '\n':
                    case '\t':
                    case '\0':
                        // Line breaks, tabs and NUL in the source are not document text.
                        break;
                    default:
                        AppendCharacter(ch);
                        break;
                }
                break;

            case ParserState.CheckHyphen:
                if (char.IsDigit(ch))
                {
                    // The '-' was the sign of a negative parameter.
                    number.Append('-');
                    number.Append(ch);
                    parser_state = ParserState.Number;
                    has_value = true;
                    break;
                }
                // Substitute Optional HYPHEN with ZERO WIDTH SPACE
                AppendCharacter((char)8203);
                parser_state = ParserState.Neutral;
                // Re-dispatch the same character in the Neutral state. Looping
                // (instead of calling ParseByte again) keeps a brace that follows
                // "\-" from being counted twice by the group bookkeeping above.
                loop = true;
                break;

            case ParserState.Control:
                if (char.IsLetter(ch))
                {
                    control.Append(ch);
                }
                else if (ch == '-')
                {
                    parser_state = ParserState.CheckHyphen;
                }
                else if (char.IsDigit(ch))
                {
                    number.Append(ch);
                    parser_state = ParserState.Number;
                    has_value = true;
                }
                else if (ch == '\\')
                {
                    if (control.Length > 0)
                    {
                        // The backslash ends this control word and starts the next one.
                        ControlWord();
                        status = ParserStatus.ControlTag;
                    }
                    else
                    {
                        // "\\" is an escaped backslash.
                        AppendCharacter(ch);
                        parser_state = ParserState.Neutral;
                    }
                }
                else if (ch == '{')
                {
                    if (control.Length > 0)
                    {
                        ControlWord();
                        status = ParserStatus.OpenBlock;
                    }
                    else
                    {
                        // "\{" is an escaped brace.
                        AppendCharacter(ch);
                        status = ParserStatus.Collecting;
                    }
                    parser_state = ParserState.Neutral;
                }
                else if (ch == '}')
                {
                    if (control.Length > 0)
                    {
                        ControlWord();
                        status = ParserStatus.CloseBlock;
                    }
                    else
                    {
                        // "\}" is an escaped brace.
                        AppendCharacter(ch);
                        status = ParserStatus.Collecting;
                    }
                    parser_state = ParserState.Neutral;
                }
                else if (char.IsWhiteSpace(ch))
                {
                    parser_state = ParserState.Neutral;
                    ControlWord();
                    status = ParserStatus.ControlTag;
                }
                else if (ch == '*')
                {
#if false // Preivous version which ignore pictures in \* control (20210211)
                    parser_state = ParserState.RichFormatExtensions;
                    if (indirection_counter == 0)
                        throw new Exception("Broken RTF format");
                    extensions_skip_counter = indirection_counter - 1;
#else
                    parser_state = ParserState.Neutral;
                    ControlWord();
                    status = ParserStatus.ControlTag;
#endif
                }
                else if (ch == ';')
                {
                    parser_state = ParserState.Neutral;
                    ControlWord();
                    status = ParserStatus.ControlTag;
                }
                else if (ch == '\'')
                {
                    // Start of a \'hh hex escape.
                    number.Clear();
                    parser_state = ParserState.FirstNibble;
                }
                else if (ch == '~')
                {
                    // Non-breaking space
                    AppendCharacter((char)0x00a0);
                    parser_state = ParserState.Neutral;
                }
                else if (ch == '_')
                {
                    // Non-breaking hyphen
                    AppendCharacter((char)0x2011);
                    parser_state = ParserState.Neutral;
                }
                else if (ch == '.')
                {
                    AppendCharacter('.');
                    parser_state = ParserState.Neutral;
                }
                else if (ch == ')')
                {
                    AppendCharacter(')');
                    parser_state = ParserState.Neutral;
                }
                else
                    throw new Exception("RTF format not parsed");
                break;

            case ParserState.Number:
                if (char.IsDigit(ch))
                {
                    number.Append(ch);
                    break;
                }
                if (ch == '{')
                {
                    parser_state = ParserState.Neutral;
                    status = ParserStatus.OpenBlock;
                }
                else if (ch == '}')
                {
                    parser_state = ParserState.Neutral;
                    status = ParserStatus.CloseBlock;
                }
                else if (this.control.ToString() == "u")
                {
                    // \uN is consumed here and never reported as a control word.
                    // Parse culture-invariantly so that negative values work in every locale.
                    int bukva = int.Parse(number.ToString(), NumberStyles.Integer, CultureInfo.InvariantCulture);
                    if (bukva < 0)
                    {
                        // RTF control words generally accept signed 16-bit numbers as arguments.
                        // For this reason, Unicode values greater than 32767 must be expressed as negative numbers.
                        bukva = unchecked((char)bukva);
                        if (bukva == 0xf0b7)
                            bukva = '●';
                    }
                    AppendCharacter(unchecked((char)bukva));
                    number.Length = 0;
                    // The parameter belonged to \u; without this reset the flag would
                    // leak into HasValue of the next control word.
                    has_value = false;
                    if (ch != '?')
                    {
                        // Skip the characters of the fallback representation.
                        parser_state = ParserState.SkipToNext;
                        skip_counter = 3;
                    }
                    else
                    {
                        parser_state = ParserState.Neutral;
                        control.Length = 0;
                    }
                    break;
                }
                else
                {
                    if (ch == '\\')
                        parser_state = ParserState.Control;
                    else if (ch == ';' || char.IsWhiteSpace(ch))
                        parser_state = ParserState.Neutral;
                    status = ParserStatus.ControlTag;
                }
                ControlWord();
                break;

            case ParserState.TryCollectChars:
                if (ch != '\\')
                {
                    // The run of hex escapes has ended: decode it and handle ch as ordinary input.
                    RestoreEncodedText();
                    parser_state = ParserState.Neutral;
                    loop = true;
                    break;
                }
                parser_state = ParserState.WaitHexcharPrefix;
                break;

            case ParserState.WaitHexcharPrefix:
                if (ch != '\'')
                {
                    // The backslash started something other than a hex escape.
                    RestoreEncodedText();
                    parser_state = ParserState.Control;
                    loop = true;
                    break;
                }
                parser_state = ParserState.FirstNibble;
                number.Clear();
                break;

            case ParserState.FirstNibble:
                parser_state = ParserState.SecondNibble;
                number.Append(ch);
                break;

            case ParserState.SecondNibble:
                hex = 0; // Just avoid warning
                try
                {
                    number.Append(ch);
                    hex = byte.Parse(number.ToString(), System.Globalization.NumberStyles.HexNumber);
#if false // 2024/01/26 // This will stay disabled until we met os/platform which does not support Shift JIS encoding
                    if (hex == 0x81)
                    {
                        parser_state = ParserState.ParsePrefix_0x81_phase0;
                        break;
                    }
#endif
                }
                catch (Exception e)
                {
                    // Best effort: report the malformed escape and substitute '?'.
                    Console.Error.WriteLine(e.ToString());
                    hex = (byte) '?';
                }
                raw_chars.Add(hex);
                parser_state = ParserState.TryCollectChars;
                break;
#if false // 2024/01/26 // This will stay disabled until we met os/platform which does not support Shift JIS encoding
            case ParserState.ParsePrefix_0x81_phase0:
                // if(ch == '\\')
                parser_state = ParserState.ParsePrefix_0x81_phase1;
                break;
            case ParserState.ParsePrefix_0x81_phase1:
                // if (ch == '\'')
                parser_state = ParserState.ParsePrefix_0x81_phase2;
                break;
            case ParserState.ParsePrefix_0x81_phase2:
                number.Append(ch);
                parser_state = ParserState.ParsePrefix_0x81_phase3;
                break;
            case ParserState.ParsePrefix_0x81_phase3:
                number.Append(ch);
                hex = byte.Parse(number.ToString(), System.Globalization.NumberStyles.HexNumber);
                switch(hex)
                {
                    case 0x8b:
                        text.Append('°');
                        break;
                    case 0x7e:
                        text.Append('×');
                        break;
                    default:
                        throw new Exception("Unparsed ANSI sequence");
                }
                number.Clear();
                parser_state = ParserState.Neutral;
                break;
#endif
            case ParserState.SkipToNext: // Ignore hexadecimal representation of the character
                skip_counter--;
                if (skip_counter == 0)
                {
                    parser_state = ParserState.Neutral;
                    control.Length = 0;
                }
                break;

            case ParserState.RichFormatExtensions:
                status = ParseExtensionByte(ch);
                break;

            case ParserState.IgnoreSpaces:
                if (ch == ' ')
                {
                    // Debug.WriteLine("Skip space");
                }
                else
                {
                    parser_state = ParserState.Neutral;
                    loop = true;
                }
                break;
        }
    } while (loop);
    return status;
}
#if false // Debug
StringBuilder dbg = new StringBuilder();
// Debug variant: additionally records the skipped characters in dbg.
private ParserStatus ParseExtensionByte(char ch)
{
    if (extensions_skip_counter == indirection_counter)
    {
        dbg.Append("\n\n");
        parser_state = ParserState.Neutral;
        dbg.Clear();
    }
    else
    {
        dbg.Append(ch);
    }
    switch (ch)
    {
        case '{': return ParserStatus.OpenBlock;
        case '}': return ParserStatus.CloseBlock;
        default: return ParserStatus.Collecting;
    }
}
#else
/// <summary>
/// Handles one character while in the RichFormatExtensions state: returns to
/// the Neutral state once the nesting depth equals extensions_skip_counter,
/// and reports braces as block boundaries.
/// </summary>
/// <param name="ch">Character being skipped.</param>
/// <returns>OpenBlock for '{', CloseBlock for '}', otherwise Collecting.</returns>
private ParserStatus ParseExtensionByte(char ch)
{
    bool extension_finished = extensions_skip_counter == indirection_counter;
    if (extension_finished)
    {
        parser_state = ParserState.Neutral;
    }

    switch (ch)
    {
        case '{':
            return ParserStatus.OpenBlock;
        case '}':
            return ParserStatus.CloseBlock;
        default:
            return ParserStatus.Collecting;
    }
}
#endif
/// <summary>
/// Resets the current run (character) format to the parser's default values.
/// </summary>
internal void ResetRunFormat()
{
// Clear all character style flags.
current_run_format.bold = false;
current_run_format.italic = false;
current_run_format.underline = false;
current_run_format.strike = false;
// NOTE(review): presumably half-points (RTF \fs units), i.e. 12 pt - confirm against RunFormat.
current_run_format.font_size = 24;
// Black text on a white background / fill.
current_run_format.color = System.Drawing.Color.Black;
current_run_format.BColor = System.Drawing.Color.White;
current_run_format.FillColor = System.Drawing.Color.White;
// Font index 0 and plain (non-script) text.
current_run_format.font_idx = 0;
current_run_format.script_type = RunFormat.ScriptType.PlainText;
}
/// <summary>
/// Resets the current paragraph format to the parser's default values and
/// sets the current language back to the default language.
/// </summary>
public void ResetParagraphFormat()
{
current_paragraph_format.Valign = ParagraphFormat.VerticalAlign.Top; // 20210722
current_paragraph_format.align = ParagraphFormat.HorizontalAlign.Left;
// Zero spacing and indents.
current_paragraph_format.line_spacing = 0;
current_paragraph_format.space_before = 0;
current_paragraph_format.space_after = 0;
current_paragraph_format.left_indent = 0;
current_paragraph_format.right_indent = 0;
current_paragraph_format.first_line_indent = 0;
current_paragraph_format.lnspcmult = ParagraphFormat.LnSpcMult.Exactly;
current_paragraph_format.pnstart = 0;
// NOTE(review): only the language id is reset here; current_unicode_decoder is not re-selected - confirm this is intended.
current_lang = default_lang;
// No list and no custom tab stops.
current_paragraph_format.list_id = null;
current_paragraph_format.tab_positions = null;
}
/// <summary>
/// Creates a parser in the Neutral state with empty buffers, zeroed counters
/// and default run and paragraph formats.
/// </summary>
internal RTF_Parser()
{
    // Collecting buffers. The number buffer is capped at 12 characters.
    text = new StringBuilder();
    control = new StringBuilder();
    number = new StringBuilder(12, 12);

    // State machine and flags.
    parser_state = ParserState.Neutral;
    has_value = false;
    override_default_color = false;

    // Counters and language.
    indirection_counter = 0;
    skip_space_counter = 0;
    current_lang = 0;

    // Formats start from their defaults.
    ResetRunFormat();
    ResetParagraphFormat();
}
// Type initializer: when built with CROSSPLATFORM or COREWIN, registers the
// code-pages encoding provider before any Encoding.GetEncoding call made by this class.
static RTF_Parser()
{
#if CROSSPLATFORM || COREWIN
Encoding.RegisterProvider(CodePagesEncodingProvider.Instance);
#endif
}
}
}