HtmlEncoding.cs 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463
  1. using System;
  2. using System.Text;
  3. #pragma warning disable
  4. namespace ExCSS.Model
  5. {
  6. internal static class HtmlEncoding
  7. {
  /// <summary>
  /// Extracts the value of the first usable <c>charset = value</c> declaration in the given text.
  /// </summary>
  /// <param name="content">The text to scan. Matching is case-insensitive.</param>
  /// <returns>
  /// The lower-cased charset value without surrounding quotes, or an empty string when the
  /// input is null or empty, no declaration is found, or a quoted value is never closed.
  /// </returns>
  internal static string Extract(string content)
  {
      const string keyword = "charset";

      if (string.IsNullOrEmpty(content))
      {
          return string.Empty;
      }

      // Invariant casing keeps the result independent of the current culture
      // (for example the Turkish dotless i).
      content = content.ToLowerInvariant();

      var searchStart = 0;

      // Iterate instead of recursing: every "charset" that is not followed by an equal sign
      // simply moves the search window forward, so hostile input cannot exhaust the stack.
      while (true)
      {
          var keywordIndex = content.IndexOf(keyword, searchStart, StringComparison.Ordinal);

          if (keywordIndex < 0)
          {
              return string.Empty;
          }

          var position = keywordIndex + keyword.Length;

          if (position >= content.Length)
          {
              return string.Empty;
          }

          // Skip space characters after the keyword, but never move past the last character
          // so that the equal-sign check below always reads a valid index.
          while (position < content.Length - 1 && content[position].IsSpaceCharacter())
          {
              position++;
          }

          if (content[position] != Specification.EqualSign)
          {
              // Not a declaration; look for the next keyword from here.
              searchStart = position;
              continue;
          }

          position++;

          // Skip space characters between the equal sign and the value.
          while (position < content.Length && content[position].IsSpaceCharacter())
          {
              position++;
          }

          if (position >= content.Length)
          {
              return string.Empty;
          }

          var first = content[position];

          if (first == Specification.DoubleQuote || first == Specification.SingleQuote)
          {
              // Quoted value: everything up to the matching quote; unterminated quotes yield nothing.
              var closing = content.IndexOf(first, position + 1);

              return closing < 0
                  ? string.Empty
                  : content.Substring(position + 1, closing - position - 1);
          }

          // Unquoted value: runs until the next space character or semicolon.
          var end = position;

          while (end < content.Length && !content[end].IsSpaceCharacter() && content[end] != ';')
          {
              end++;
          }

          return content.Substring(position, end - position);
      }
  }
  /// <summary>
  /// Determines whether <see cref="Resolve"/> can provide an encoding for the given charset label.
  /// </summary>
  /// <param name="charset">The charset label to check; may be null.</param>
  /// <returns>
  /// <c>true</c> when an encoding is available for the label; <c>false</c> when the label is
  /// null, unknown, or names an encoding that cannot be created.
  /// </returns>
  internal static bool IsSupported(string charset)
  {
      if (charset == null)
      {
          return false;
      }

      try
      {
          return Resolve(charset) != null;
      }
      catch (ArgumentException)
      {
          // Encoding.GetEncoding rejects names it has no encoding for; a yes/no question
          // should report that as "not supported" instead of throwing.
          return false;
      }
      catch (NotSupportedException)
      {
          return false;
      }
  }
  /// <summary>
  /// Maps a charset label to the corresponding <see cref="Encoding"/>.
  /// </summary>
  /// <param name="charset">The charset label; compared case-insensitively. May be null.</param>
  /// <returns>
  /// The encoding registered for the label, or <c>null</c> when the label is null or not listed.
  /// </returns>
  /// <remarks>
  /// All entries except the UTF ones are created through <see cref="Encoding.GetEncoding(string)"/>,
  /// so an exception raised by that call for an unavailable encoding propagates to the caller.
  /// </remarks>
  internal static Encoding Resolve(string charset)
  {
      if (charset == null)
      {
          return null;
      }

      // Invariant casing: a culture-sensitive ToLower() turns 'I' into a dotless i under
      // Turkish/Azeri cultures, which would make labels such as "ISO-8859-1" unmatchable.
      charset = charset.ToLowerInvariant();

      switch (charset)
      {
          case "unicode-1-1-utf-8":
          case "utf-8":
          case "utf8":
              return Encoding.UTF8;

          case "utf-16be":
              return Encoding.BigEndianUnicode;

          case "utf-16":
          case "utf-16le":
              return Encoding.Unicode;

          case "dos-874":
          case "iso-8859-11":
          case "iso8859-11":
          case "iso885911":
          case "tis-620":
          case "windows-874":
              return Encoding.GetEncoding("windows-874");

          case "cp1250":
          case "windows-1250":
          case "x-cp1250":
              return Encoding.GetEncoding("windows-1250");

          case "cp1251":
          case "windows-1251":
          case "x-cp1251":
              return Encoding.GetEncoding("windows-1251");

          case "ansi_x3.4-1968":
          case "ascii":
          case "cp1252":
          case "cp819":
          case "csisolatin1":
          case "ibm819":
          case "iso-8859-1":
          case "iso-ir-100":
          case "iso8859-1":
          case "iso88591":
          case "iso_8859-1":
          case "iso_8859-1:1987":
          case "l1":
          case "latin1":
          case "us-ascii":
          case "windows-1252":
          case "x-cp1252":
              return Encoding.GetEncoding("windows-1252");

          case "cp1253":
          case "windows-1253":
          case "x-cp1253":
              return Encoding.GetEncoding("windows-1253");

          case "cp1254":
          case "csisolatin5":
          case "iso-8859-9":
          case "iso-ir-148":
          case "iso8859-9":
          case "iso88599":
          case "iso_8859-9":
          case "iso_8859-9:1989":
          case "l5":
          case "latin5":
          case "windows-1254":
          case "x-cp1254":
              return Encoding.GetEncoding("windows-1254");

          case "cp1255":
          case "windows-1255":
          case "x-cp1255":
              return Encoding.GetEncoding("windows-1255");

          case "cp1256":
          case "windows-1256":
          case "x-cp1256":
              return Encoding.GetEncoding("windows-1256");

          case "cp1257":
          case "windows-1257":
          case "x-cp1257":
              return Encoding.GetEncoding("windows-1257");

          case "cp1258":
          case "windows-1258":
          case "x-cp1258":
              return Encoding.GetEncoding("windows-1258");

          case "csmacintosh":
          case "mac":
          case "macintosh":
          case "x-mac-roman":
              return Encoding.GetEncoding("macintosh");

          case "x-mac-cyrillic":
          case "x-mac-ukrainian":
              return Encoding.GetEncoding("x-mac-cyrillic");

          case "866":
          case "cp866":
          case "csibm866":
          case "ibm866":
              return Encoding.GetEncoding("cp866");

          case "csisolatin2":
          case "iso-8859-2":
          case "iso-ir-101":
          case "iso8859-2":
          case "iso88592":
          case "iso_8859-2":
          case "iso_8859-2:1987":
          case "l2":
          case "latin2":
              return Encoding.GetEncoding("iso-8859-2");

          case "csisolatin3":
          case "iso-8859-3":
          case "iso-ir-109":
          case "iso8859-3":
          case "iso88593":
          case "iso_8859-3":
          case "iso_8859-3:1988":
          case "l3":
          case "latin3":
              return Encoding.GetEncoding("iso-8859-3");

          case "csisolatin4":
          case "iso-8859-4":
          case "iso-ir-110":
          case "iso8859-4":
          case "iso88594":
          case "iso_8859-4":
          case "iso_8859-4:1988":
          case "l4":
          case "latin4":
              return Encoding.GetEncoding("iso-8859-4");

          case "csisolatincyrillic":
          case "cyrillic":
          case "iso-8859-5":
          case "iso-ir-144":
          case "iso8859-5":
          case "iso88595":
          case "iso_8859-5":
          case "iso_8859-5:1988":
              return Encoding.GetEncoding("iso-8859-5");

          case "arabic":
          case "asmo-708":
          case "csiso88596e":
          case "csiso88596i":
          case "csisolatinarabic":
          case "ecma-114":
          case "iso-8859-6":
          case "iso-8859-6-e":
          case "iso-8859-6-i":
          case "iso-ir-127":
          case "iso8859-6":
          case "iso88596":
          case "iso_8859-6":
          case "iso_8859-6:1987":
              return Encoding.GetEncoding("iso-8859-6");

          case "csisolatingreek":
          case "ecma-118":
          case "elot_928":
          case "greek":
          case "greek8":
          case "iso-8859-7":
          case "iso-ir-126":
          case "iso8859-7":
          case "iso88597":
          case "iso_8859-7":
          case "iso_8859-7:1987":
          case "sun_eu_greek":
              return Encoding.GetEncoding("iso-8859-7");

          case "csiso88598e":
          case "csisolatinhebrew":
          case "hebrew":
          case "iso-8859-8":
          case "iso-8859-8-e":
          case "iso-ir-138":
          case "iso8859-8":
          case "iso88598":
          case "iso_8859-8":
          case "iso_8859-8:1988":
          case "visual":
              return Encoding.GetEncoding("iso-8859-8");

          case "csiso88598i":
          case "iso-8859-8-i":
          case "logical":
              return Encoding.GetEncoding("iso-8859-8-i");

          case "iso-8859-13":
          case "iso8859-13":
          case "iso885913":
              return Encoding.GetEncoding("iso-8859-13");

          case "csisolatin9":
          case "iso-8859-15":
          case "iso8859-15":
          case "iso885915":
          case "iso_8859-15":
          case "l9":
              return Encoding.GetEncoding("iso-8859-15");

          case "cskoi8r":
          case "koi":
          case "koi8":
          case "koi8-r":
          case "koi8_r":
              return Encoding.GetEncoding("koi8-r");

          case "koi8-u":
              return Encoding.GetEncoding("koi8-u");

          case "chinese":
          case "csgb2312":
          case "csiso58gb231280":
          case "gb2312":
          case "gb_2312":
          case "gb_2312-80":
          case "gbk":
          case "iso-ir-58":
          case "x-gbk":
              return Encoding.GetEncoding("x-cp20936");

          case "hz-gb-2312":
              return Encoding.GetEncoding("hz-gb-2312");

          case "gb18030":
              return Encoding.GetEncoding("GB18030");

          case "big5":
          case "big5-hkscs":
          case "cn-big5":
          case "csbig5":
          case "x-x-big5":
              return Encoding.GetEncoding("big5");

          case "csiso2022jp":
          case "iso-2022-jp":
              return Encoding.GetEncoding("iso-2022-jp");

          case "csiso2022kr":
          case "iso-2022-kr":
              return Encoding.GetEncoding("iso-2022-kr");

          // NOTE(review): the Chinese ISO-2022 labels resolve to the Japanese ISO-2022 encoding.
          // This looks like a copy-paste slip; kept as-is until the intended target is confirmed.
          case "iso-2022-cn":
          case "iso-2022-cn-ext":
              return Encoding.GetEncoding("iso-2022-jp");

          default:
              return null;
      }
  }
  /// <summary>
  /// Suggests a default encoding for a locale identifier, based on its first two characters.
  /// </summary>
  /// <param name="local">The locale identifier, for example "ru" or "zh-CN"; may be null.</param>
  /// <returns>
  /// UTF-8 when the identifier is null or shorter than two characters; otherwise the encoding
  /// associated with the language prefix, with "windows-1252" as the general fallback.
  /// </returns>
  /// <remarks>
  /// Non-UTF results are created through <see cref="Encoding.GetEncoding(string)"/>, so an
  /// exception raised by that call for an unavailable encoding propagates to the caller.
  /// </remarks>
  internal static Encoding Suggest(string local)
  {
      if (local == null || local.Length < 2)
      {
          return Encoding.UTF8;
      }

      // Invariant casing: under Turkish/Azeri cultures ToLower() maps 'I' to a dotless i,
      // which would stop prefixes such as "VI" from matching.
      var language = local.Substring(0, 2).ToLowerInvariant();

      switch (language)
      {
          case "ar":
          case "cy":
          case "fa":
          case "hr":
          case "kk":
          case "mk":
          case "or":
          case "ro":
          case "sr":
          case "vi":
              return Encoding.UTF8;

          case "be":
              return Encoding.GetEncoding("iso-8859-5");

          case "bg":
          case "ru":
          case "uk":
              return Encoding.GetEncoding("windows-1251");

          case "cs":
          case "hu":
          case "pl":
          case "sl":
              return Encoding.GetEncoding("iso-8859-2");

          case "tr":
          case "ku":
              return Encoding.GetEncoding("windows-1254");

          case "he":
              return Encoding.GetEncoding("windows-1255");

          case "lv":
              return Encoding.GetEncoding("iso-8859-13");

          case "ja":
              // Original note mentioned Windows-31J as a candidate; UTF-8 is used instead.
              return Encoding.UTF8;

          case "ko":
              return Encoding.GetEncoding("ks_c_5601-1987");

          case "lt":
              return Encoding.GetEncoding("windows-1257");

          case "sk":
              return Encoding.GetEncoding("windows-1250");

          case "th":
              return Encoding.GetEncoding("windows-874");
      }

      // The two Chinese variants are distinguished by the full identifier, not the prefix.
      if (local.Equals("zh-CN", StringComparison.OrdinalIgnoreCase))
      {
          return Encoding.GetEncoding("GB18030");
      }

      if (local.Equals("zh-TW", StringComparison.OrdinalIgnoreCase))
      {
          return Encoding.GetEncoding("big5");
      }

      return Encoding.GetEncoding("windows-1252");
  }
  386. }
  387. }
  388. #pragma warning restore