Tokener.cs 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330
  1. #region Copyright and License
  2. //
  3. // Fizzler - CSS Selector Engine for Microsoft .NET Framework
  4. // Copyright (c) 2009 Atif Aziz, Colin Ramsay. All rights reserved.
  5. //
  6. // This library is free software; you can redistribute it and/or modify it under
  7. // the terms of the GNU Lesser General Public License as published by the Free
  8. // Software Foundation; either version 3 of the License, or (at your option)
  9. // any later version.
  10. //
  11. // This library is distributed in the hope that it will be useful, but WITHOUT
  12. // ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
  13. // FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
  14. // details.
  15. //
  16. // You should have received a copy of the GNU Lesser General Public License
  17. // along with this library; if not, write to the Free Software Foundation, Inc.,
  18. // 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  19. //
  20. #endregion
  21. #pragma warning disable
  22. namespace Fizzler
  23. {
  24. #region Imports
  25. using System;
  26. using System.Collections.Generic;
  27. using System.Diagnostics;
  28. using System.IO;
  29. using System.Text;
  30. #endregion
  31. /// <summary>
  32. /// Lexer for tokens in CSS selector grammar.
  33. /// </summary>
  34. public static class Tokener
  35. {
  /// <summary>
  /// Reads all remaining text from a reader and tokenizes it.
  /// </summary>
  /// <param name="reader">Text source; it is read to its end before tokenizing starts.</param>
  /// <returns>The token sequence produced by the string-based overload.</returns>
  /// <exception cref="ArgumentNullException"><paramref name="reader"/> is null.</exception>
  public static IEnumerable<Token> Tokenize(TextReader reader)
  {
      if (reader != null)
      {
          var text = reader.ReadToEnd();
          return Tokenize(text);
      }

      throw new ArgumentNullException("reader");
  }
  /// <summary>
  /// Lazily splits a selector string into tokens, always finishing with an
  /// end-of-input token.
  /// </summary>
  /// <param name="input">Selector text; null is treated as an empty string.</param>
  /// <returns>
  /// A deferred sequence of tokens. Because this is an iterator, scanning (and any
  /// error) happens while the sequence is being enumerated.
  /// </returns>
  /// <exception cref="FormatException">
  /// Raised during enumeration for an invalid identifier, an unexpected character,
  /// an empty hash or a malformed string.
  /// </exception>
  public static IEnumerable<Token> Tokenize(string input)
  {
      var reader = new Reader(input ?? string.Empty);

      for (var ch = reader.Read(); ch != null; ch = reader.Read())
      {
          if (ch == '-' || IsNmStart(ch))
          {
              //
              // Identifier or function: an optional leading dash must be
              // followed by a name-start character.
              //
              reader.Mark();
              if (ch == '-' && !IsNmStart(reader.Read()))
                  throw new FormatException(string.Format("Invalid identifier at position {0}.", reader.Position));

              char? tail;
              do
              {
                  tail = reader.Read();
              }
              while (IsNmChar(tail));

              if (tail == '(')
              {
                  // The opening parenthesis stays consumed and is not part of the name.
                  yield return Token.Function(reader.Marked());
              }
              else
              {
                  // Step back so the terminating character is scanned on its own.
                  yield return Token.Ident(reader.MarkedWithUnread());
              }
          }
          else if (IsDigit(ch))
          {
              //
              // Integer: a run of decimal digits.
              //
              reader.Mark();
              while (IsDigit(reader.Read())) { /* NOP */ }
              yield return Token.Integer(reader.MarkedWithUnread());
          }
          else if (IsS(ch))
          {
              //
              // Whitespace; when directly followed by , + > or ~ the whole
              // run collapses into that punctuation token.
              //
              var blanks = ParseWhiteSpace(reader);
              var following = reader.Read();

              if (following == ',')
              {
                  yield return Token.Comma();
              }
              else if (following == '+')
              {
                  yield return Token.Plus();
              }
              else if (following == '>')
              {
                  yield return Token.Greater();
              }
              else if (following == '~')
              {
                  yield return Token.Tilde();
              }
              else
              {
                  reader.Unread();
                  yield return Token.WhiteSpace(blanks);
              }
          }
          else
          {
              switch (ch)
              {
                  case '*': // * or *=
                  case '~': // ~ or ~=
                  case '|': // | or |=
                      if (reader.Read() == '=')
                      {
                          if (ch == '*')
                              yield return Token.SubstringMatch();
                          else if (ch == '|')
                              yield return Token.DashMatch();
                          else
                              yield return Token.Includes();
                      }
                      else
                      {
                          // Not a two-character operator; give the look-ahead back.
                          reader.Unread();
                          if (ch == '~')
                              yield return Token.Tilde();
                          else
                              yield return Token.Char(ch.Value);
                      }
                      break;

                  case '^': // ^=
                  case '$': // $=
                      // These characters are only valid as the first half of an operator.
                      if (reader.Read() != '=')
                          throw new FormatException(string.Format("Invalid character at position {0}.", reader.Position));
                      if (ch == '^')
                          yield return Token.PrefixMatch();
                      else
                          yield return Token.SuffixMatch();
                      break;

                  //
                  // Single-character punctuation
                  //
                  case '.':
                      yield return Token.Dot();
                      break;
                  case ':':
                      yield return Token.Colon();
                      break;
                  case ',':
                      yield return Token.Comma();
                      break;
                  case '=':
                      yield return Token.Equals();
                      break;
                  case '[':
                      yield return Token.LeftBracket();
                      break;
                  case ']':
                      yield return Token.RightBracket();
                      break;
                  case ')':
                      yield return Token.RightParenthesis();
                      break;
                  case '+':
                      yield return Token.Plus();
                      break;
                  case '>':
                      yield return Token.Greater();
                      break;
                  case '#':
                      yield return Token.Hash(ParseHash(reader));
                      break;

                  //
                  // Single- or double-quoted strings
                  //
                  case '\"':
                  case '\'':
                      yield return ParseString(reader, /* quote */ ch.Value);
                      break;

                  default:
                      throw new FormatException(string.Format("Invalid character at position {0}.", reader.Position));
              }
          }
      }

      yield return Token.Eoi();
  }
  /// <summary>
  /// Collects the run of whitespace that begins at the reader's current character.
  /// </summary>
  /// <param name="reader">Reader positioned on the first whitespace character.</param>
  /// <returns>The whitespace text; the reader is left on its last character.</returns>
  private static string ParseWhiteSpace(Reader reader)
  {
      Debug.Assert(reader != null);

      reader.Mark();

      char? next;
      do
      {
          next = reader.Read();
      }
      while (IsS(next));

      // The character that ended the run is handed back to the caller.
      return reader.MarkedWithUnread();
  }
  /// <summary>
  /// Reads the name that follows a '#' character.
  /// </summary>
  /// <param name="reader">Reader positioned on the '#' character.</param>
  /// <returns>The non-empty name, without the leading '#'.</returns>
  /// <exception cref="FormatException">No name character follows the '#'.</exception>
  private static string ParseHash(Reader reader)
  {
      Debug.Assert(reader != null);

      reader.MarkFromNext(); // the '#' itself is not part of the name

      for (var c = reader.Read(); IsNmChar(c); c = reader.Read()) { /* NOP */ }

      var name = reader.MarkedWithUnread();
      if (name.Length > 0)
          return name;

      throw new FormatException(string.Format("Invalid hash at position {0}.", reader.Position));
  }
  /// <summary>
  /// Reads a quoted string and returns it as a string token with escapes resolved.
  /// </summary>
  /// <param name="reader">Reader positioned on the opening quote.</param>
  /// <param name="quote">The quote character that opened (and must close) the string.</param>
  /// <returns>A string token holding the text between the quotes.</returns>
  /// <exception cref="FormatException">
  /// The input ends before the closing quote, or a backslash is followed by anything
  /// other than the quote character or another backslash.
  /// </exception>
  private static Token ParseString(Reader reader, char quote)
  {
      Debug.Assert(reader != null);

      //
      // TODO Support full string syntax!
      //
      // string   {string1}|{string2}
      // string1  \"([^\n\r\f\\"]|\\{nl}|{nonascii}|{escape})*\"
      // string2  \'([^\n\r\f\\']|\\{nl}|{nonascii}|{escape})*\'
      // nonascii [^\0-\177]
      // escape   {unicode}|\\[^\n\r\f0-9a-f]
      // unicode  \\[0-9a-f]{1,6}(\r\n|[ \n\r\t\f])?
      //

      var start = reader.Position; // reported in error messages
      reader.MarkFromNext();       // the opening quote is not part of the text

      StringBuilder buffer = null; // only allocated once an escape is seen

      for (;;)
      {
          var current = reader.Read();

          if (current == quote)
              break;

          if (current == null)
              throw new FormatException(string.Format("Unterminated string at position {0}.", start));

          if (current != '\\')
              continue;

          //
          // NOTE: Only escaping of quote and backslash supported!
          //
          var escaped = reader.Read();
          if (escaped != quote && escaped != '\\')
              throw new FormatException(string.Format("Invalid escape sequence at position {0} in a string at position {1}.", reader.Position, start));

          if (buffer == null)
              buffer = new StringBuilder();

          // Flush the text seen so far minus the backslash, then restart the
          // mark on the escaped character so it opens the next segment.
          buffer.Append(reader.MarkedExceptLast());
          reader.Mark();
      }

      var tail = reader.Marked();
      return Token.String(buffer == null ? tail : buffer.Append(tail).ToString());
  }
  /// <summary>
  /// Tells whether a character is a decimal digit, i.e. matches [0-9].
  /// </summary>
  /// <param name="ch">Character to test; null (end of input) is never a digit.</param>
  private static bool IsDigit(char? ch) // [0-9]
  {
      if (!ch.HasValue)
          return false;

      var c = ch.Value;
      return '0' <= c && c <= '9';
  }
  /// <summary>
  /// Tells whether a character is whitespace in the selector grammar: space,
  /// tab, carriage return, line feed or form feed.
  /// </summary>
  /// <param name="ch">Character to test; null (end of input) is never whitespace.</param>
  private static bool IsS(char? ch) // [ \t\r\n\f]
  {
      switch (ch)
      {
          case ' ':
          case '\t':
          case '\r':
          case '\n':
          case '\f':
              return true;
          default:
              return false;
      }
  }
  /// <summary>
  /// Tells whether a character may start a name in the selector grammar:
  /// an underscore, an ASCII letter, or any non-ASCII character.
  /// </summary>
  /// <param name="ch">Character to test; null (end of input) never qualifies.</param>
  /// <remarks>
  /// The grammar production is <c>[_a-z]|{nonascii}|{escape}</c>. Backslash
  /// escapes are not recognised here.
  /// </remarks>
  private static bool IsNmStart(char? ch) // [_a-z]|{nonascii}|{escape}
  {
      return ch == '_'
          || (ch >= 'a' && ch <= 'z')
          || (ch >= 'A' && ch <= 'Z')
          || ch >= '\u0080'; // {nonascii} is [^\0-\177], i.e. U+0080 and above
  }
  /// <summary>
  /// Tells whether a character may appear inside a name: a dash, a decimal
  /// digit, or anything accepted as a name-start character.
  /// </summary>
  /// <param name="ch">Character to test; null (end of input) never qualifies.</param>
  private static bool IsNmChar(char? ch) // [_a-z0-9-]|{nonascii}|{escape}
  {
      if (ch == '-')
          return true;

      if (ch >= '0' && ch <= '9')
          return true;

      return IsNmStart(ch);
  }
  /// <summary>
  /// Forward cursor over a string with one-step push-back and a movable mark
  /// for extracting the text scanned since the mark.
  /// </summary>
  private sealed class Reader
  {
      private readonly string _input;
      private int _index = -1; // -1 until the first Read; never exceeds _input.Length
      private int _start = -1; // index where the marked span begins

      public Reader(string input)
      {
          _input = input;
      }

      /// <summary>True while the cursor sits on an actual character.</summary>
      private bool Ready
      {
          get { return 0 <= _index && _index < _input.Length; }
      }

      /// <summary>The character under the cursor, or null when there is none.</summary>
      public char? Value
      {
          get
          {
              if (!Ready)
                  return null;
              return _input[_index];
          }
      }

      /// <summary>One-based position of the cursor (zero before the first read).</summary>
      public int Position
      {
          get { return _index + 1; }
      }

      /// <summary>Starts the marked span at the current character.</summary>
      public void Mark()
      {
          _start = _index;
      }

      /// <summary>Starts the marked span at the character after the current one.</summary>
      public void MarkFromNext()
      {
          _start = _index + 1;
      }

      /// <summary>Text from the mark up to, but excluding, the current character.</summary>
      public string Marked()
      {
          return Marked(0);
      }

      /// <summary>Same as <see cref="Marked()"/> but also drops the character before the current one.</summary>
      public string MarkedExceptLast()
      {
          return Marked(-1);
      }

      /// <summary>
      /// Extracts the marked span ending at the cursor shifted by <paramref name="trim"/>;
      /// an empty string is returned when the span has no characters.
      /// </summary>
      private string Marked(int trim)
      {
          var end = Math.Min(_input.Length, _index + trim);
          var length = end - _start;
          if (length <= 0)
              return string.Empty;
          return _input.Substring(_start, length);
      }

      /// <summary>
      /// Advances by one character (stopping one past the end of the input) and
      /// returns the character now under the cursor, or null at the end.
      /// </summary>
      public char? Read()
      {
          _index = Math.Min(_index + 1, _input.Length);
          return Value;
      }

      /// <summary>Moves the cursor back one step, but never before the initial position.</summary>
      public void Unread()
      {
          if (_index > -1)
              _index--;
      }

      /// <summary>Returns the marked text, then steps the cursor back by one.</summary>
      public string MarkedWithUnread()
      {
          var span = Marked();
          Unread();
          return span;
      }
  }
  283. }
  284. }
  285. #pragma warning restore